├── .cache
│   ├── ChatGPT.pkl
│   ├── InstructGPT.pkl
│   ├── retrieval-finterms.json
│   └── retrieval-finterms.pkl
├── .gitignore
├── .gitmodules
├── LICENSE
├── README.es.md
├── README.ijcai_challenge.md
├── README.md
├── README.zh.md
├── docker
│   ├── DOCKERFILE
│   └── build_and_upload.sh
├── notebooks
│   ├── Finarg-ecc-auc+Edtsum_evaluation_sample.ipynb
│   └── evaluate.ipynb
├── requirements.txt
├── scripts
│   ├── docker_run.sh
│   ├── run_evaluation.sh
│   └── run_interface.sh
├── src
│   ├── chatlm.py
│   ├── eval.py
│   ├── evaluator.py
│   ├── factscore_package
│   │   ├── .cache
│   │   │   ├── demons.json
│   │   │   ├── demons.txt
│   │   │   ├── demons_complex.json
│   │   │   ├── demons_full.txt
│   │   │   ├── demos
│   │   │   │   └── demons.json
│   │   │   ├── fin_rare_terms.jsonl
│   │   │   └── finterms.jsonl
│   │   ├── __init__.py
│   │   ├── abstain_detection.py
│   │   ├── atomic_facts.py
│   │   ├── clm.py
│   │   ├── demons.json
│   │   ├── download_data.py
│   │   ├── en_core_web_sm-3.7.1.tar.gz
│   │   ├── factscorer.py
│   │   ├── lm.py
│   │   ├── npm.py
│   │   ├── openai_lm.py
│   │   ├── retrieval.py
│   │   └── utils.py
│   ├── interface.py
│   ├── model_prompt.py
│   ├── tasks
│   │   ├── __init__.py
│   │   ├── flare.py
│   │   ├── utils.py
│   │   └── zhutils.py
│   └── utils.py
└── static
    ├── av.jpg
    ├── cr.jpg
    ├── formula.jpg
    ├── md.jpg
    └── sr.jpg

/.cache/ChatGPT.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/The-FinAI/PIXIU/da45ac467ca3a828621315881e33c68bca3bbb1f/.cache/ChatGPT.pkl
--------------------------------------------------------------------------------
/.cache/InstructGPT.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/The-FinAI/PIXIU/da45ac467ca3a828621315881e33c68bca3bbb1f/.cache/InstructGPT.pkl
--------------------------------------------------------------------------------
/.cache/retrieval-finterms.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/The-FinAI/PIXIU/da45ac467ca3a828621315881e33c68bca3bbb1f/.cache/retrieval-finterms.pkl
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | __pycache__/
3 | lm_cache/
4 | *_results
5 | /*.json
6 | .hypothesis
7 | *_private.*
8 | *_private
9 | *_debug.*
10 | *.code-workspace
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "src/financial-evaluation"]
2 | path = src/financial-evaluation
3 | url = https://github.com/chancefocus/financial-evaluation.git
4 | [submodule "src/metrics/BARTScore"]
5 | path = src/metrics/BARTScore
6 | url = https://github.com/neulab/BARTScore.git
7 | [submodule "FinMem-LLM-StockTrading"]
8 | path = FinMem-LLM-StockTrading
9 | url = https://github.com/pipiku915/FinMem-LLM-StockTrading.git
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2023 乾阜资产
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do
so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.es.md: -------------------------------------------------------------------------------- 1 |

2 | 3 |

4 |
5 |
6 | Qianqian Xie1  7 | Weiguang Han2  8 | Zhengyu Chen2  9 | Ruoyu Xiang1  10 | Xiao Zhang1  11 | Yueru He1  12 | Mengxi Xiao2  13 | Dong Li2  14 | Yongfu Dai7  15 | Duanyu Feng7  16 | Yijing Xu1  17 | Haoqiang Kang5  18 | Ziyan Kuang12  19 | Chenhan Yuan3  20 | Kailai Yang3  21 | Zheheng Luo3  22 | Tianlin Zhang3  23 | Zhiwei Liu3  24 | Guojun Xiong10  25 | Zhiyang Deng9  26 | Yuechen Jiang9  27 | Zhiyuan Yao9  28 | Haohang Li9  29 | Yangyang Yu9  30 | Gang Hu8  31 | Jiajia Huang11  32 | Xiao-Yang Liu5  33 | Alejandro Lopez-Lira4  34 | Benyou Wang6  35 | Yanzhao Lai13  36 | Hao Wang7  37 | Min Peng2*  38 | Sophia Ananiadou3  39 | Jimin Huang1 40 |
41 |
42 | 43 |
44 | 1The Fin AI  45 | 2Wuhan University  46 | 3The University of Manchester  47 | 4University of Florida  48 | 5Columbia University  49 | 6The Chinese University of Hong Kong, Shenzhen  50 | 7Sichuan University  51 | 8Yunnan University  52 | 9Stevens Institute of Technology  53 | 10Stony Brook University  54 | 11Nanjing Audit University  55 | 12Jiangxi Normal University  56 | 13Southwest Jiaotong University 57 |
58 |
59 | 60 |
61 | Wuhan University Logo  62 | Manchester University Logo  63 | University of Florida Logo  64 | Columbia University Logo  65 | HK University (shenzhen) Logo  66 | Sichuan University  67 | Yunnan University  68 | Stevens Institute of Technology  69 | Stony Brook University  70 | Nanjing Audit University  71 | Jiangxi Normal University  72 | Southwest Jiaotong University Logo  73 |
74 | ----------------- 75 | 76 | ![](https://img.shields.io/badge/pixiu-v0.1-gold) 77 | ![](https://black.readthedocs.io/en/stable/_static/license.svg) 78 | [![Discord](https://img.shields.io/discord/1146837080798933112)](https://discord.gg/HRWpUmKB) 79 | 80 | [Pixiu Paper](https://arxiv.org/abs/2306.05443) | [FinBen Leaderboard](https://huggingface.co/spaces/finosfoundation/Open-Financial-LLM-Leaderboard) 81 | 82 | **Descargo de responsabilidad** 83 | 84 | Este repositorio y su contenido se proporcionan **únicamente con fines académicos y educativos**. Ninguno de los materiales constituye asesoramiento financiero, legal o de inversión. No se ofrecen garantías, explícitas o implícitas, respecto a la precisión, integridad o utilidad del contenido. Los autores y colaboradores no son responsables de errores, omisiones o cualquier consecuencia derivada del uso de la información aquí contenida. Los usuarios deben ejercer su propio juicio y consultar a profesionales antes de tomar cualquier decisión financiera, legal o de inversión. El uso del software e información contenida en este repositorio es bajo el propio riesgo del usuario. 85 | 86 | **Al utilizar o acceder a la información de este repositorio, usted acepta indemnizar, defender y eximir de responsabilidad a los autores, colaboradores y cualquier organización o persona afiliada por cualquier reclamo o daño.** 87 | 88 | 89 | 90 | 91 | **Puntos de control:** 92 | 93 | - [FinMA v0.1 (Full 7B version)](https://huggingface.co/ChanceFocus/finma-7b-full) 94 | 95 | **Idiomas** 96 | 97 | - [Inglés](README.md) 98 | - [Español](README.es.md) 99 | 100 | **Documentos** 101 | 102 | - [PIXIU: A Comprehensive Benchmark, Instruction Dataset and Large Language Model for Finance](https://arxiv.org/abs/2306.05443) 103 | - [The FinBen: An Holistic Financial Benchmark for Large Language Models](https://arxiv.org/abs/2402.12659) 104 | - [No Language is an Island: Unifying Chinese and English in Financial Large Language Models, Instruction Data, and Benchmarks](https://arxiv.org/abs/2403.06249) 105 | - [Dólares or Dollars? Unraveling the Bilingual Prowess of Financial LLMs Between Spanish and English](https://arxiv.org/abs/2402.07405) 106 | 107 | **Evaluaciones** (más detalles en la sección FinBen; véase más abajo un ejemplo mínimo de carga de estos conjuntos de datos): 108 | 109 | - [flare (flare-es-financees)](https://huggingface.co/datasets/TheFinAI/flare-es-financees) 110 | - [flare (flare-es-tsa)](https://huggingface.co/datasets/TheFinAI/flare-es-tsa) 111 | - [flare (flare-es-fns)](https://huggingface.co/datasets/TheFinAI/flare-es-fns) 112 | - [flare (flare-es-efpa)](https://huggingface.co/datasets/TheFinAI/flare-es-efpa) 113 | - [flare (flare-es-efp)](https://huggingface.co/datasets/TheFinAI/flare-es-efp) 114 | - [flare (flare-es-multifin)](https://huggingface.co/datasets/TheFinAI/flare-es-multifin) 115 | 116 | ## Descripción general 117 | 118 | **FinBen_ES** es una iniciativa fundamental enfocada en el dominio financiero español. FinBen_ES busca reforzar el progreso, perfeccionamiento y evaluación de Modelos de Lenguaje a Gran Escala (MLGs) diseñados específicamente para contextos financieros españoles. Como un segmento vital del esfuerzo más amplio de PIXIU, FinBen_ES se erige como un testimonio del compromiso por aprovechar las capacidades de los MLGs, asegurando que los profesionales y entusiastas financieros del mundo hispanohablante tengan a su disposición herramientas lingüísticas de primera categoría.
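Como referencia, este es un esbozo mínimo (no forma parte del repositorio original) de cómo cargar uno de los conjuntos de evaluación anteriores con la biblioteca `datasets` de Hugging Face; los nombres de particiones y de campos pueden variar según el conjunto de datos, por lo que conviene verificarlos en la página de cada uno:

```python
# Esbozo ilustrativo: cargar un conjunto de evaluación de FinBen_ES.
# Requiere `pip install datasets`; los nombres de particiones y campos
# son suposiciones y pueden variar según el conjunto de datos concreto.
from datasets import load_dataset

ds = load_dataset("TheFinAI/flare-es-tsa")
print(ds)  # muestra las particiones disponibles (p. ej., test)
primera_particion = list(ds.keys())[0]
print(ds[primera_particion][0])  # inspecciona un registro de ejemplo
```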
119 | 120 | ### Características clave 121 | 122 | - **Recursos abiertos**: PIXIU proporciona abiertamente el LLM financiero, los datos de instrucción de ajuste fino y los conjuntos de datos incluidos en el conjunto de evaluación de referencia para fomentar la investigación abierta y la transparencia. 123 | - **Multitarea**: Los datos de instrucción y el conjunto de referencia en PIXIU cubren un diverso conjunto de tareas financieras, que incluyen cuatro tareas de NLP financiero y una tarea de predicción financiera. 124 | - **Multimodalidad**: Los datos de instrucción y el conjunto de referencia de PIXIU consisten en datos financieros multimodales, que incluyen datos de series de tiempo de la tarea de predicción de movimientos de acciones. Cubre varios tipos de textos financieros, que incluyen informes, artículos de noticias, tweets y presentaciones regulatorias. 125 | - **Diversidad**: A diferencia de conjuntos de referencia anteriores que se centran principalmente en tareas de NLP financiero, el conjunto de evaluación de referencia de PIXIU incluye tareas críticas de predicción financiera alineadas con escenarios del mundo real, lo que lo hace más desafiante. 126 | 127 | --- 128 | 129 | ## FinBen_ES: Conjunto de evaluación de comprensión y predicción del lenguaje financiero 130 | 131 | En esta sección, proporcionamos un análisis de rendimiento detallado de FinMA en comparación con otros modelos líderes, incluyendo ChatGPT, GPT-4, lince-zero et al. Para este análisis, hemos elegido una gama de tareas y métricas que abarcan varios aspectos del Procesamiento del Lenguaje Natural financiero y de la predicción financiera. 132 | 133 | ### Tareas 134 | 135 | | Datos | Tarea | Bruto | Tipos de Datos | Modalidades | Licencia | Artículo | 136 | | --------------------- | ------------------------------ | ------ | ----------------------------------- | ----------------- | --------------- | -------- | 137 | | MultiFin | clasificación de titulares | 230 | titulares de noticias | texto | CC BY 4.0 | [1] | 138 | | FNS | respuesta a preguntas | 50 | informes de ganancias | texto | Público | [2] | 139 | | TSA | análisis de sentimientos | 3,829 | titulares de noticias | texto | CC BY 4.0 | [3] | 140 | | Financees | análisis de sentimientos | 6,539 | titulares de noticias | texto | Público | [4] | 141 | | EFP | respuesta a preguntas | 37 | preguntas de evaluación empresarial | texto | Público | | 142 | | EFPA | respuesta a preguntas | 228 | preguntas de evaluación empresarial | texto | Público | | 143 | 144 | 1. Rasmus Jørgensen, Oliver Brandt, Mareike Hartmann, Xiang Dai, Christian Igel, and Desmond Elliott. 2023. MultiFin: A Dataset for Multilingual Financial NLP. In Findings of the Association for Computational Linguistics: EACL 2023, 894–909. Association for Computational Linguistics, Dubrovnik, Croatia. 145 | 2. [FNS 2023. FNP 2023.](http://wp.lancs.ac.uk/cfie/fns2023/). 146 | 3. Pan R, García-Díaz JA, Garcia-Sanchez F, and Valencia-García R. 2023. Evaluation of transformer models for financial targeted sentiment analysis in Spanish. In PeerJ Computer Science, 9:e1377. https://doi.org/10.7717/peerj-cs.1377. 147 | 4. CodaLab. 2023. 
[Competition](https://codalab.lisn.upsaclay.fr/competitions/10052) 148 | 149 | 150 | ### Evaluación 151 | 152 | #### Preparación 153 | ##### Instalación local 154 | ```bash 155 | git clone https://github.com/TheFinAI/PIXIU.git --recursive 156 | cd PIXIU 157 | pip install -r requirements.txt 158 | cd PIXIU/src/financial-evaluation 159 | pip install -e .[multilingual] 160 | ``` 161 | ##### Imagen de Docker 162 | ```bash 163 | sudo bash scripts/docker_run.sh 164 | ``` 165 | El comando anterior inicia un contenedor Docker; puede modificar `docker_run.sh` para adaptarlo a su entorno. Proporcionamos una imagen precompilada, que puede obtener ejecutando `sudo docker pull tothemoon/pixiu:latest`. 166 | 167 | ```bash 168 | docker run --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \ 169 | --network host \ 170 | --env https_proxy=$https_proxy \ 171 | --env http_proxy=$http_proxy \ 172 | --env all_proxy=$all_proxy \ 173 | --env HF_HOME=$hf_home \ 174 | -it [--rm] \ 175 | --name pixiu \ 176 | -v $pixiu_path:$pixiu_path \ 177 | -v $hf_home:$hf_home \ 178 | -v $ssh_pub_key:/root/.ssh/authorized_keys \ 179 | -w $workdir \ 180 | $docker_user/pixiu:$tag \ 181 | [--sshd_port 2201 --cmd "echo 'Hello, world!' && /bin/bash"] 182 | ``` 183 | Explicación de los argumentos: 184 | - `[]` significa argumentos ignorables 185 | - `HF_HOME`: directorio de caché de huggingface 186 | - `sshd_port`: puerto sshd del contenedor; puede ejecutar `ssh -i private_key -p $sshd_port root@$ip` para conectarse al contenedor, el valor predeterminado es 22001 187 | - `--rm`: elimina el contenedor al salir del contenedor (es decir, `CTRL + D`) 188 | 189 | #### Evaluación automatizada de tareas 190 | Antes de la evaluación, descargue el [punto de control BART](https://drive.google.com/u/0/uc?id=1_7JfF7KOInb7ZrxKHIigTMR4ChVET01m&export=download) en `src/metrics/BARTScore/bart_score.pth`. 191 | 192 | Para la evaluación automatizada, siga estas instrucciones: 193 | 194 | 1. Transformador Huggingface 195 | 196 | Para evaluar un modelo alojado en HuggingFace Hub (por ejemplo, finma-7b-full), use este comando: 197 | 198 | ```bash 199 | python eval.py \ 200 | --model "hf-causal-llama" \ 201 | --model_args "use_accelerate=True,pretrained=chancefocus/finma-7b-full,tokenizer=chancefocus/finma-7b-full,use_fast=False" \ 202 | --tasks "flare_ner,flare_sm_acl,flare_fpb" 203 | ``` 204 | 205 | Puede encontrar más detalles en la documentación de [lm_eval](https://github.com/EleutherAI/lm-evaluation-harness). 206 | 207 | 2. API comerciales 208 | 209 | 210 | Tenga en cuenta que para tareas como NER, la evaluación automatizada se basa en un patrón específico. Esto podría no extraer información relevante en entornos de cero disparos, dando como resultado un rendimiento relativamente más bajo en comparación con los resultados anteriores anotados manualmente. 211 | 212 | ```bash 213 | export OPENAI_API_SECRET_KEY=YOUR_KEY_HERE 214 | python eval.py \ 215 | --model gpt-4 \ 216 | --tasks flare_ner,flare_sm_acl,flare_fpb 217 | ``` 218 | 219 | --- 220 | 221 | 222 | ## Licencia 223 | 224 | PIXIU tiene licencia [MIT]. Para más detalles, consulte el archivo [LICENSE](LICENSE).
225 | 226 | ## Historial de estrellas 227 | 228 | [![Star History Chart](https://api.star-history.com/svg?repos=The-FinAI/PIXIU&type=Date)](https://star-history.com/#The-FinAI/PIXIU&Date) 229 | 230 | -------------------------------------------------------------------------------- /README.ijcai_challenge.md: -------------------------------------------------------------------------------- 1 | # IJCAI2024-challenge starter-kit 2 | 3 | We're pleased to invite you to attend the IJCAI2024-challenge, ["Financial Challenges in Large Language Models - FinLLM"](https://sites.google.com/nlg.csie.ntu.edu.tw/finnlp-agentscen/shared-task-finllm). 4 | 5 | ## Outline 6 | - [Task 1 Financial Classification Starter Kit](#task-1-financial-classification-starter-kit) 7 | - [Task 2 Financial Text Summarization Starter Kit](#task-2-financial-text-summarization-starter-kit) 8 | - [Task 3 Single Stock Trading Starter Kit](#task-3-single-stock-trading-starter-kit) 9 | - [Fine-tune](#fine-tune) 10 | - [Model Cheating Detection](#model-cheating-detection) 11 | 12 | ## Task 1 Financial Classification Starter Kit 13 | ### Introduction 14 | This task focuses on argument unit classification to test the capabilities of LLMs to identify and categorize texts as premises or claims. Participants receive a financial text and two options; following the designed prompt template, they classify the text as a claim or premise. 15 | 16 | We provide 7.75k training examples and 969 test examples for categorizing sentences as claims or premises. 17 | 18 | We use the following prompt template to ask and answer the question in this task. 19 | 20 | Instruction: [task prompt] Text: [input text] Response: [output] 21 | 22 | [input text] denotes the financial text in the prompt; [output] is the classified label (i.e., "Claim" or "Premise"). 23 | 24 | ### Performance Metrics 25 | We use two metrics, F1 and Accuracy, to evaluate classification capability. 26 | We use the F1 score as the final ranking metric. 27 | 28 | ### Evaluation 29 | You can follow the instructions in the [![Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1ogcCmhMc5lPhUamCk6512H3PJwPEaBZN?usp=sharing) to do evaluations on Task 1: financial classification. 30 | 31 | ### Dataset Example 32 | | id | query | answer | text | choices | gold | 33 | | -- | ----- | ------ | ---- | ------- | ---- | 34 | | finargeccauc0 | Analyze sentences from earnings conference calls and identify their argumentative function. Each sentence is either a premise, offering evidence or reasoning, or a claim, asserting a conclusion or viewpoint. Return only premise or claim. Text: I mean, sometimes it's not that you came up with some brilliant strategy, it's just like really good work consistently over a long period of time. Answer: | premise | I mean, sometimes it's not that you came up with some brilliant strategy, it's just like really good work consistently over a long period of time. | [ "premise", "claim" ] | 0 | 35 | | finargeccauc1 | Analyze sentences from earnings conference calls and identify their argumentative function. Each sentence is either a premise, offering evidence or reasoning, or a claim, asserting a conclusion or viewpoint. Return only premise or claim. Text: Even while in International, we're continuing to invest in a lot of areas, we continue to frontload Prime benefits for the newer geographies, we continue to launch new countries as we launch Prime in Australia recently.
Answer: | claim | Even while in International, we're continuing to invest in a lot of areas, we continue to frontload Prime benefits for the newer geographies, we continue to launch new countries as we launch Prime in Australia recently. | [ "premise", "claim" ] | 1 | 36 | 37 | 38 | 39 | ## Task 2 Financial Text Summarization Starter Kit 40 | ### Introduction 41 | This task is designed to test the capabilities of LLMs to generate coherent summaries. Participants must produce a concise summary of a given financial news text, following the designed prompt template. 42 | 43 | We provide 8k training examples and 2k test examples for abstracting financial news articles into concise summaries. 44 | 45 | We use the following prompt template to ask and answer the question in this task. 46 | 47 | Instruction: [task prompt] Context: [input context] Response: [output] 48 | 49 | [input context] denotes the multi-sentence text of a financial news article; [output] is the abstractive summary of this text. 50 | 51 | ### Performance Metrics 52 | We use ROUGE (1, 2, and L) and BERTScore to evaluate generated summaries in terms of relevance. 53 | We use the ROUGE-1 score as the final ranking metric. 54 | 55 | ### Evaluation 56 | You can follow the instructions in the [![Google Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1ogcCmhMc5lPhUamCk6512H3PJwPEaBZN?usp=sharing) to do evaluations on Task 2: financial text summarization. 57 | 58 | ### Dataset Example 59 | | id | query | answer | text | 60 | | -- | ----- | ------ | ---- | 61 | | edtsum0 | You are given a text that consists of multiple sentences. Your task is to perform abstractive summarization on this text. Use your understanding of the content to express the main ideas and crucial details in a shorter, coherent, and natural sounding text. Text: MONROE, Conn., Dec. 16, 2020 /PRNewswire/ --Elidah, maker of ELITONE, a home-use treatment for incontinence, announced it was selected out of 7500 entries from 159 countries to win a Top Ten award from the global SLINGSHOT 2020 start-up competition. Elidah was the only company from the United States awarded this distinction, and one of two start-ups in the life science category.Normally held in Singapore,this year the event was virtual and offered a record $750,000 in cash prizes by StartUpSG. One hundred companies pitched at the live event, and from those ten finalists were selected. The award winners included start-ups from all over the world including Israel, United Kingdom, Singapore, and India, among others. Continue Reading Gloria Kolb holding ELITONE Elidah "We sometimes have the mindset that successful start-ups must come from Silicon Valley," said Gloria Kolb, co-founder and CEO of Elidah, "but innovation is flourishing in the rest of the world as entrepreneurial support systems expand. I was impressed by the other finalists, advancing technologies such as biometric security, artificial intelligence, and gene editing." Although the top prize went to another start-up, Ms. Kolb, as the only female entrepreneur in the finals, was happy to see a company focused on women's health receive recognition. "Women's health should not be a taboo subject, and I hope that investors realize it presents a large market, ripe for innovation." ELITONE is the first home-health device that performs the hard-to-do pelvic floor exercises for women.
It operates externally, without the invasiveness of vaginal probes. Exercises are needed to tone the pelvic floor muscles, but they can be hard to do correctly. The wearable nature of ELITONE allows women to do other activities while getting treatment at home, saving time, cost, and risk of infection. In a time when the clinics and pelvic floor physical therapy offices have shut down, at-home over-the-counter devices like ELITONE provide much-needed access to effective FDA-cleared treatments. About ElidahElidah is a women-owned FemTech company established to develop technologies that integrate recent advances in wearable devices, biomaterials, and mobile interfaces to deliver innovative therapeutic solutions.Elidah is led by entrepreneur Gloria Kolb, Founder and CEO, an MIT and Stanford trained engineer whose previous accolades include Boston's 40 under 40 and MIT Technology Review's World Top Innovators Under 35 (TR35). To learn more visit elitone.com.Contact: Gloria Kolb[emailprotected] 810 Main St., Ste C, Monroe, CT 06468978-435-4324SOURCE Elidah Answer: | Elidah Becomes the Only US Company to Win a Top Ten Prize in a Global Start-up Competition, Advancing Recognition for Women's Health | MONROE, Conn., Dec. 16, 2020 /PRNewswire/ --Elidah, maker of ELITONE, a home-use treatment for incontinence, announced it was selected out of 7500 entries from 159 countries to win a Top Ten award from the global SLINGSHOT 2020 start-up competition. Elidah was the only company from the United States awarded this distinction, and one of two start-ups in the life science category.Normally held in Singapore,this year the event was virtual and offered a record $750,000 in cash prizes by StartUpSG. One hundred companies pitched at the live event, and from those ten finalists were selected. The award winners included start-ups from all over the world including Israel, United Kingdom, Singapore, and India, among others. Continue Reading Gloria Kolb holding ELITONE Elidah ""We sometimes have the mindset that successful start-ups must come from Silicon Valley,"" said Gloria Kolb, co-founder and CEO of Elidah, ""but innovation is flourishing in the rest of the world as entrepreneurial support systems expand. I was impressed by the other finalists, advancing technologies such as biometric security, artificial intelligence, and gene editing."" Although the top prize went to another start-up, Ms. Kolb, as the only female entrepreneur in the finals, was happy to see a company focused on women's health receive recognition. ""Women's health should not be a taboo subject, and I hope that investors realize it presents a large market, ripe for innovation."" ELITONE is the first home-health device that performs the hard-to-do pelvic floor exercises for women. It operates externally, without the invasiveness of vaginal probes. Exercises are needed to tone the pelvic floor muscles, but they can be hard to do correctly. The wearable nature of ELITONE allows women to do other activities while getting treatment at home, saving time, cost, and risk of infection. In a time when the clinics and pelvic floor physical therapy offices have shut down, at-home over-the-counter devices like ELITONE provide much-needed access to effective FDA-cleared treatments. 
About ElidahElidah is a women-owned FemTech company established to develop technologies that integrate recent advances in wearable devices, biomaterials, and mobile interfaces to deliver innovative therapeutic solutions.Elidah is led by entrepreneur Gloria Kolb, Founder and CEO, an MIT and Stanford trained engineer whose previous accolades include Boston's 40 under 40 and MIT Technology Review's World Top Innovators Under 35 (TR35). To learn more visit elitone.com.Contact: Gloria Kolb[emailprotected] 810 Main St., Ste C, Monroe, CT 06468978-435-4324SOURCE Elidah | 62 | 63 | 64 | 65 | ## Task 3 Single Stock Trading Starter Kit 66 | ### Introduction 67 | This task aims to evaluate LLMs’ ability to make sophisticated decisions in trading activities, an area currently constrained by humans' limited ability to process large volumes of data rapidly. Participants receive a combination of open-source data for stocks and an ETF. The system should output one of the three trading decisions (“buy”, “sell”, “hold”) with reasoning. 68 | 69 | We provide 291 examples to evaluate LLMs on sophisticated stock-trading decisions. 70 | 71 | We use the following prompt template to ask and answer the question in this task. 72 | 73 | Instruction: [task prompt] Context: [input context] Response: [output] 74 | 75 | [input context] denotes the financial investment information in the prompt; [output] should strictly conform to the following JSON format without any additional content: {"investment_decision": string, "summary_reason": string, "short_memory_index": number, "middle_memory_index": number, "long_memory_index": number, "reflection_memory_index": number} 76 | 77 | ### Performance Metrics 78 | We offer a comprehensive assessment of profitability, risk management, and decision-making prowess through a series of metrics: Sharpe Ratio (SR), Cumulative Return (CR), Daily Volatility (DV), Annualized Volatility (AV), and Maximum Drawdown (MD). 79 | 80 | We use the Sharpe Ratio (SR) as the final ranking metric. 81 | 82 | The formulas are as follows: 83 | ![image](static/sr.jpg) 84 | ![image](static/cr.jpg) 85 | ![image](static/av.jpg) 86 | ![image](static/md.jpg) 87 | 88 | 89 | ### Evaluation 90 | You can follow the [instructions](https://github.com/The-FinAI/PIXIU?tab=readme-ov-file#finmem-a-performance-enhanced-llm-trading-agent) to do evaluations on Task 3: single stock trading. 91 | 92 | ### Dataset Example 93 | | id | date | price | filing_k | filing_q | news | 94 | | -- | ---- | ----- | -------- | -------- | ---- | 95 | | jnj_test0 | "2020-10-09" | { "DRIV": 17.52210235595703 } | { "FORM": "null" } | { "FORM": "null" } | { ""DRIV"": [ ""The global cloud enterprise content management market is expected to reach \\$62.4 billion by 2027, driven by a CAGR of 25.6% and significant growth in the U.S. and China. The positive score for this news is 2.3659735504111268e-08. The neutral score for this news is 0.9999990463256836. The negative score for this news is 9.636863751438796e-07."", ""The global emergency lighting batteries market is expected to reach \\$2.8 billion by 2027, growing at a CAGR of 10.8% despite the COVID-19 pandemic's impact. The positive score for this news is 1.1662441465887241e-05. The neutral score for this news is 0.9995514750480652. The negative score for this news is 0.000436866597738117."", ""Despite the impact of the COVID-19 pandemic, the global market for two-wheeler spark plugs is expected to reach 86.2 million units by 2027, growing at a CAGR of 4.9%.
The positive score for this news is 1.1285221262369305e-05. The neutral score for this news is 0.9988551139831543. The negative score for this news is 0.0011336031602695584."", ""Despite pandemic setbacks, the global market for two-wheeler upside-down forks is expected to reach 701.8 thousand units by 2027, driven by growth in China and the U.S. The positive score for this news is 9.909140175068387e-08. The neutral score for this news is 0.9999970197677612. The negative score for this news is 2.81238385468896e-06."", ""The global embedded analytics market is expected to reach \\$84.6 billion by 2027, driven by a 13% CAGR, with cloud-based solutions leading the growth. The positive score for this news is 6.070506231026229e-08. The neutral score for this news is 0.9999868869781494. The negative score for this news is 1.2994331882509869e-05."", ""Despite the COVID-19 pandemic, the global battery monitoring system market is expected to reach \\$9.8 billion by 2027, with significant growth in the US and China. The positive score for this news is 4.437213263486228e-08. The neutral score for this news is 0.9999984502792358. The negative score for this news is 1.6080473415058805e-06."", ""Despite the impact of the COVID-19 pandemic, the global microwave transmission equipment market is expected to reach \\$6.7 billion by 2027, with a CAGR of 3.2%. The positive score for this news is 0.00034257289371453226. The neutral score for this news is 0.004475872032344341. The negative score for this news is 0.9951815009117126."", ""Despite the impact of the COVID-19 pandemic, the global transfer membrane market is expected to reach \\$200.3 million by 2027, with the PVDF segment leading the growth. The positive score for this news is 1.5521750640346e-07. The neutral score for this news is 0.9999940395355225. The negative score for this news is 5.781918389402563e-06."", ""Despite the impact of COVID-19, the global thermal analysis market is expected to reach \\$739.1 million by 2027, with a CAGR of 4.4%. The positive score for this news is 0.00015923684986773878. The neutral score for this news is 0.0002189901570091024. The negative score for this news is 0.9996217489242554."" ] } | 96 | 97 | 98 | 99 | ## Fine-tune 100 | We recommend using [AutoTrain-Advanced](https://github.com/huggingface/autotrain-advanced.git) to train your models. 101 | 102 | ## Model Cheating Detection 103 | To measure the risk that the test set was leaked into a model's training data (model cheating), we have developed a new metric called the Data Leakage Test (DLT), building on existing research. 104 | 105 | The DLT calculates the difference in perplexity of a large language model (LLM) on the training and test data to determine its data generation tendencies. Specifically, we separately input the training set and the test set into the LLM, and calculate the perplexity on the training set (ppl-on-train) and the perplexity on the test set (ppl-on-test). The DLT value is then computed by subtracting the ppl-on-train from the ppl-on-test. A larger difference implies that the LLM is less likely to have seen the test set during training compared to the training set and suggests a lower likelihood of the model cheating. Conversely, a smaller difference implies that the LLM is more likely to have seen the test set during training and suggests a higher likelihood of the model cheating.
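As a rough illustration (not the challenge's official implementation), the sketch below estimates ppl-on-train and ppl-on-test with a Hugging Face causal LM and takes their difference; the model name and the two text lists are placeholders you would replace with the model under inspection and the task's actual splits:

```python
# Illustrative DLT sketch only; the model name and text lists below are
# placeholders, not part of this repository's official tooling.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"  # stand-in for the LLM being checked
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()

train_texts = ["Example sentence drawn from the training split."]
test_texts = ["Example sentence drawn from the test split."]

def mean_perplexity(texts):
    """Average per-text perplexity from the causal-LM cross-entropy loss."""
    ppls = []
    for text in texts:
        enc = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)
        with torch.no_grad():
            # Passing labels=input_ids makes the model return the mean loss
            loss = model(**enc, labels=enc["input_ids"]).loss
        ppls.append(torch.exp(loss).item())
    return sum(ppls) / len(ppls)

ppl_on_train = mean_perplexity(train_texts)
ppl_on_test = mean_perplexity(test_texts)
dlt = ppl_on_test - ppl_on_train  # smaller values suggest higher cheating risk
print(f"ppl-on-train={ppl_on_train:.2f}, ppl-on-test={ppl_on_test:.2f}, DLT={dlt:.2f}")
```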
106 | 107 | In the detection process, we will calculate the DLT values for several LLMs to establish a reference baseline for model cheating and to minimize the impact of generalization on the metric. The formula is as follows: 108 | ![image](static/formula.jpg) 109 | 110 | -------------------------------------------------------------------------------- /README.zh.md: -------------------------------------------------------------------------------- 1 |

2 | 3 |

4 |
5 |
6 | Qianqian Xie1  7 | Weiguang Han2  8 | Zhengyu Chen2  9 | Ruoyu Xiang1  10 | Xiao Zhang1  11 | Yueru He1  12 | Mengxi Xiao2  13 | Dong Li2  14 | Yongfu Dai7  15 | Duanyu Feng7  16 | Yijing Xu1  17 | Haoqiang Kang5  18 | Ziyan Kuang12  19 | Chenhan Yuan3  20 | Kailai Yang3  21 | Zheheng Luo3  22 | Tianlin Zhang3  23 | Zhiwei Liu3  24 | Guojun Xiong10  25 | Zhiyang Deng9  26 | Yuechen Jiang9  27 | Zhiyuan Yao9  28 | Haohang Li9  29 | Yangyang Yu9  30 | Gang Hu8  31 | Jiajia Huang11  32 | Xiao-Yang Liu5  33 | Alejandro Lopez-Lira4  34 | Benyou Wang6  35 | Yanzhao Lai13  36 | Hao Wang7  37 | Min Peng2*  38 | Sophia Ananiadou3  39 | Jimin Huang1 40 |
41 |
42 | 43 |
44 | 1The Fin AI  45 | 2Wuhan University  46 | 3The University of Manchester  47 | 4University of Florida  48 | 5Columbia University  49 | 6The Chinese University of Hong Kong, Shenzhen  50 | 7Sichuan University  51 | 8Yunnan University  52 | 9Stevens Institute of Technology  53 | 10Stony Brook University  54 | 11Nanjing Audit University  55 | 12Jiangxi Normal University  56 | 13Southwest Jiaotong University 57 |
58 |
59 | 60 |
61 | Wuhan University Logo  62 | Manchester University Logo  63 | University of Florida Logo  64 | Columbia University Logo  65 | HK University (shenzhen) Logo  66 | Sichuan University  67 | Yunnan University  68 | Stevens Institute of Technology  69 | Stony Brook University  70 | Nanjing Audit University  71 | Jiangxi Normal University  72 | Southwest Jiaotong University Logo  73 |
74 | 75 | ----------------- 76 | 77 | ![](https://img.shields.io/badge/pixiu-v0.1-gold) 78 | ![](https://black.readthedocs.io/en/stable/_static/license.svg) 79 | [![Discord](https://img.shields.io/discord/1146837080798933112)](https://discord.gg/HRWpUmKB) 80 | 81 | [Pixiu Paper](https://arxiv.org/abs/2306.05443) | [FinBen Leaderboard](https://huggingface.co/spaces/finosfoundation/Open-Financial-LLM-Leaderboard) 82 | 83 | **免责声明** 84 | 85 | 本资料库及其内容仅用于**学术和教育目的**。所有资料均不构成金融、法律或投资建议。不对内容的准确性、完整性或实用性提供任何明示或暗示的保证。作者和撰稿人不对任何错误、遗漏或因使用本网站信息而产生的任何后果负责。用户在做出任何财务、法律或投资决定之前,应自行判断并咨询专业人士。使用本资料库所含软件和信息的风险完全由用户自行承担。 86 | 87 | **使用或访问本资源库中的信息,即表示您同意对作者、撰稿人以及任何附属组织或个人的任何及所有索赔或损害进行赔偿、为其辩护并使其免受损害。** 88 | 89 | 90 | 91 | 92 | **检查点:** 93 | 94 | - [FinMA v0.1 (Full 7B version)](https://huggingface.co/TheFinAI/finma-7b-full) 95 | 96 | **语言** 97 | 98 | - [英文](README.md) 99 | - [中文](README.zh.md) 100 | - [西班牙语](README.es.md) 101 | 102 | **论文** 103 | 104 | - [PIXIU: A Comprehensive Benchmark, Instruction Dataset and Large Language Model for Finance](https://arxiv.org/abs/2306.05443) 105 | - [The FinBen: An Holistic Financial Benchmark for Large Language Models](https://arxiv.org/abs/2402.12659) 106 | - [No Language is an Island: Unifying Chinese and English in Financial Large Language Models, Instruction Data, and Benchmarks](https://arxiv.org/abs/2403.06249) 107 | - [Dólares or Dollars? Unraveling the Bilingual Prowess of Financial LLMs Between Spanish and English](https://arxiv.org/abs/2402.07405) 108 | 109 | **评估** (更多详情,请参阅FinBen部分): 110 | 111 | - [flare (flare-zh-afqmc)](https://huggingface.co/datasets/TheFinAI/flare-zh-afqmc) 112 | 113 | - [flare (flare-zh-stocka)](https://huggingface.co/datasets/TheFinAI/flare-zh-stocka) 114 | 115 | - [flare (flare-zh-corpus)](https://huggingface.co/datasets/TheFinAI/flare-zh-corpus) 116 | 117 | - [flare (flare-zh-fineval)](https://huggingface.co/datasets/TheFinAI/flare-zh-fineval) 118 | 119 | - [flare (flare-zh-fe)](https://huggingface.co/datasets/TheFinAI/flare-zh-fe) 120 | 121 | - [flare (flare-zh-nl)](https://huggingface.co/datasets/TheFinAI/flare-zh-nl) 122 | 123 | - [flare (flare-zh-nl2)](https://huggingface.co/datasets/TheFinAI/flare-zh-nl2) 124 | 125 | - [flare (flare-zh-nsp)](https://huggingface.co/datasets/TheFinAI/flare-zh-nsp) 126 | 127 | - [flare (flare-zh-re)](https://huggingface.co/datasets/TheFinAI/flare-zh-re) 128 | 129 | - [flare (flare-zh-stockb)](https://huggingface.co/datasets/TheFinAI/flare-zh-stockb) 130 | 131 | - [flare (flare-zh-qa)](https://huggingface.co/datasets/TheFinAI/flare-zh-qa) 132 | 133 | - [flare (flare-zh-na)](https://huggingface.co/datasets/TheFinAI/flare-zh-na) 134 | 135 | - [flare (flare-zh-19ccks)](https://huggingface.co/datasets/TheFinAI/flare-zh-19ccks) 136 | 137 | - [flare (flare-zh-20ccks)](https://huggingface.co/datasets/TheFinAI/flare-zh-20ccks) 138 | 139 | - [flare (flare-zh-21ccks)](https://huggingface.co/datasets/TheFinAI/flare-zh-21ccks) 140 | 141 | - [flare (flare-zh-22ccks)](https://huggingface.co/datasets/TheFinAI/flare-zh-22ccks) 142 | 143 | - [flare (flare-zh-ner)](https://huggingface.co/datasets/TheFinAI/flare-zh-ner) 144 | 145 | - [flare (flare-zh-fpb)](https://huggingface.co/datasets/TheFinAI/flare-zh-fpb) 146 | 147 | 148 | 149 | ## 概述 150 | 151 | **FinBen_ZH** 是一项专注于中文金融领域的基石计划,旨在促进专为中文金融环境定制的大型语言模型(LLMs)的进展、完善和评估。FinBen_ZH 是 PIXIU 更大范围工作的一个重要部分,证明了我们在利用 LLMs 能力方面的承诺,确保中文世界的金融专业人士和爱好者拥有顶级的语言工具。 152 | 153 | ### 主要特征 154 | 155 | - **公开资源**: PIXIU 公开提供财务 LLM、教学调整数据和评估基准中的数据集,以鼓励公开研究和透明度。 156 | - **多任务**: PIXIU 
中的指令调整数据和基准涵盖了一系列不同的金融任务。 157 | - **多模态**: PIXIU 的指令调整数据和基准由多模态金融数据组成,包括股票走势预测任务的时间序列数据。它涵盖各种类型的金融文本,包括报告、新闻报道、推特和监管文件。 158 | - **多样性**: 与以往主要侧重于金融 NLP 任务的基准不同,PIXIU 的评估基准包括与真实世界场景相一致的关键金融预测任务,因此更具挑战性。 159 | 160 | --- 161 | 162 | ## FinBen_ZH: 金融语言理解和预测评估基准 163 | 164 | 在本节中,我们将提供 FinMA 与其他领先模型(包括 ChatGPT、GPT-4、lince-zero 等)相比的详细性能分析。为了进行分析,我们选择了一系列任务和指标,涵盖了金融自然语言处理和金融预测的各个方面。 165 | 166 | ### 任务 167 | 168 | | 数据 | 任务类型 | 原始数据 | 数据类型 | 模式 | 许可证 | 论文 | 169 | | ------------------ | --------------------------- | ------- | -------------------------------- | ----------------- | ----------------- | ----- | 170 | | AFQMC | 语义匹配 | 38,650 | 提问数据, 对话 | 文本 | Apache-2.0 | [1] | 171 | | corpus | 语义匹配 | 120,000 | 提问数据, 对话 | 文本 | Public | [2] | 172 | | stockA | 股票分类 | 14,769 | 新闻, 历史价格 | 文本, 时间序列 | Public | [3] | 173 | | Fineval | 多项选择 | 1,115 | 金融考试 | 文本 | Apache-2.0 | [4] | 174 | | NL | 新闻分类 | 7,955 | 新闻报道 | 文本 | Public | [5] | 175 | | NL2 | 新闻分类 | 7,955 | 新闻报道 | 文本 | Public | [5] | 176 | | NSP | 负面新闻判断 | 4,499 | 新闻、社交媒体文本 | 文本 | Public | [5] | 177 | | RE | 关系识别 | 14,973 | 新闻、实体对 | 文本 | Public | [5] | 178 | | FE | 情感分析 | 18,177 | 金融社交媒体文本 | 文本 | Public | [5] | 179 | | stockB | 情感分析 | 9,812 | 金融社交媒体文本 | 文本 | Apache-2.0 | [6] | 180 | | QA | 金融问答 | 22,375 | 财经新闻公告 | 文本, 表格 | Public | [5] | 181 | | NA | 文本摘要 | 32,400 | 新闻文章、公告 | 文本 | Public | [5] | 182 | | 19CCKS | 事件主体提取 | 156,834 | 新闻报道 | 文本 | CC BY-SA 4.0 | [7] | 183 | | 20CCKS | 事件主体提取 | 372,810 | 新闻报道 | 文本 | CC BY-SA 4.0 | [8] | 184 | | 21CCKS | 事件因果关系抽取 | 8,000 | 新闻报道 | 文本 | CC BY-SA 4.0 | [9] | 185 | | 22CCKS | 事件主体提取 | 109,555 | 新闻报道 | 文本 | CC BY-SA 4.0 | [10] | 186 | | NER | 命名实体识别 | 1,685 | 新闻报道 | 文本 | Public | [11] | 187 | | FPB | 情感分析 | 4,845 | 新闻 | 文本 | MIT license | [12] | 188 | | FIQASA | 情感分析 | 1,173 | 新闻头条、推文 | 文本 | MIT license | [12] | 189 | | Headlines | 新闻标题分类 | 11,412 | 新闻头条 | 文本 | MIT license | [12] | 190 | | BigData | 股票走势预测 | 7,164 | 推文、历史价格 | 文本, 时间序列 | MIT license | [12] | 191 | | ACL | 股票走势预测 | 27,053 | 推文、历史价格 | 文本, 时间序列 | MIT license | [12] | 192 | | CIKM | 股票走势预测 | 4,967 | 推文、历史价格 | 文本, 时间序列 | MIT license | [12] | 193 | | FinQA | 金融问答 | 14,900 | 收益报告 | 文本, 表格 | MIT license | [12] | 194 | | ConvFinQA | 多轮问答 | 48,364 | 收益报告 | 文本, 表格 | MIT license | [12] | 195 | 196 | 197 | 1. Xu L, Hu H, Zhang X, et al. CLUE: A Chinese language understanding evaluation benchmark[J]. arXiv preprint arXiv:2004.05986, 2020. 198 | 2. Jing Chen, Qingcai Chen, Xin Liu, Haijun Yang, Daohe Lu, and Buzhou Tang. 2018. The BQ Corpus: A Large-scale Domain-specific Chinese Corpus For Sentence Semantic Equivalence Identification. In Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, pages 4946–4951, Brussels, Belgium. Association for Computational Linguistics. 199 | 3. Jinan Zou, Haiyao Cao, Lingqiao Liu, Yuhao Lin, Ehsan Abbasnejad, and Javen Qinfeng Shi. 2022. Astock: A New Dataset and Automated Stock Trading based on Stock-specific News Analyzing Model. In Proceedings of the Fourth Workshop on Financial Technology and Natural Language Processing (FinNLP), pages 178–186, Abu Dhabi, United Arab Emirates (Hybrid). Association for Computational Linguistics. 200 | 4. Zhang L, Cai W, Liu Z, et al. FinEval: A Chinese Financial Domain Knowledge Evaluation Benchmark for Large Language Models[J]. arXiv preprint arXiv:2308.09975, 2023. 201 | 5. Lu D, Liang J, Xu Y, et al. BBT-Fin: Comprehensive Construction of Chinese Financial Domain Pre-trained Language Model, Corpus and Benchmark[J]. arXiv preprint arXiv:2302.09432, 2023. 202 | 6.
https://huggingface.co/datasets/kuroneko5943/stock11 203 | 7. https://www.biendata.xyz/competition/ccks_2019_4/ 204 | 8. https://www.biendata.xyz/competition/ccks_2020_4_1/ 205 | 9. https://www.biendata.xyz/competition/ccks_2021_task6_2/ 206 | 10. https://www.biendata.xyz/competition/ccks2022_eventext/ 207 | 11. Jia C, Shi Y, Yang Q, et al. Entity enhanced BERT pre-training for Chinese NER[C]//Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP). 2020: 6384-6396. 208 | 12. Xie Q, Han W, Zhang X, et al. PIXIU: A Large Language Model, Instruction Data and Evaluation Benchmark for Finance[J]. arXiv preprint arXiv:2306.05443, 2023. 209 | 210 | ### 评估 211 | 212 | #### 准备工作 213 | ##### 本地安装 214 | ```bash 215 | git clone https://github.com/TheFinAI/PIXIU.git --recursive 216 | cd PIXIU 217 | pip install -r requirements.txt 218 | cd PIXIU/src/financial-evaluation 219 | pip install -e .[multilingual] 220 | ``` 221 | ##### Docker 镜像 222 | ```bash 223 | sudo bash scripts/docker_run.sh 224 | ``` 225 | 以上命令会启动一个 docker 容器,你可以根据自己的环境修改 `docker_run.sh`。我们提供了预编译镜像,可运行 `sudo docker pull tothemoon/pixiu:latest` 获取。 226 | 227 | ```bash 228 | docker run --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \ 229 | --network host \ 230 | --env https_proxy=$https_proxy \ 231 | --env http_proxy=$http_proxy \ 232 | --env all_proxy=$all_proxy \ 233 | --env HF_HOME=$hf_home \ 234 | -it [--rm] \ 235 | --name pixiu \ 236 | -v $pixiu_path:$pixiu_path \ 237 | -v $hf_home:$hf_home \ 238 | -v $ssh_pub_key:/root/.ssh/authorized_keys \ 239 | -w $workdir \ 240 | $docker_user/pixiu:$tag \ 241 | [--sshd_port 2201 --cmd "echo 'Hello, world!' && /bin/bash"] 242 | ``` 243 | 参数说明: 244 | - `[]` 表示可忽略的参数 245 | - `HF_HOME`: huggingface 缓存目录 246 | - `sshd_port`: 容器的 sshd 端口,可以运行 `ssh -i private_key -p $sshd_port root@$ip` 来连接容器,默认为 22001 247 | - `--rm`: 退出容器时移除容器(即 `CTRL + D`) 248 | 249 | #### 自动化任务评估 250 | 在评估前,请下载 [BART 检查点](https://drive.google.com/u/0/uc?id=1_7JfF7KOInb7ZrxKHIigTMR4ChVET01m&export=download) 到 `src/metrics/BARTScore/bart_score.pth`。 251 | 252 | 如需进行自动评估,请按照以下说明操作: 253 | 254 | 1. Huggingface Transformer 255 | 256 | 要评估 HuggingFace Hub 上托管的模型(例如,finma-7b-full),请使用此命令: 257 | 258 | ```bash 259 | python eval.py \ 260 | --model "hf-causal-llama" \ 261 | --model_args "use_accelerate=True,pretrained=TheFinAI/finma-7b-full,tokenizer=TheFinAI/finma-7b-full,use_fast=False" \ 262 | --tasks "flare_ner,flare_sm_acl,flare_fpb" 263 | ``` 264 | 265 | 更多详情,请参阅 [lm_eval](https://github.com/EleutherAI/lm-evaluation-harness) 文档。 266 | 267 | 2.
商用接口 268 | 269 | 270 | 请注意,对于 NER 等任务,自动评估是基于特定模式进行的。这可能无法提取零样本设置中的相关信息,导致性能相对低于之前的人工标注结果。 271 | 272 | ```bash 273 | export OPENAI_API_SECRET_KEY=YOUR_KEY_HERE 274 | python eval.py \ 275 | --model gpt-4 \ 276 | --tasks flare_ner,flare_sm_acl,flare_fpb 277 | ``` 278 | 279 | --- 280 | 281 | ## 引用 282 | 283 | 如果您在项目中使用了 PIXIU,请引用我们的文章。 284 | 285 | ``` 286 | @misc{xie2023pixiu, 287 | title={PIXIU: A Large Language Model, Instruction Data and Evaluation Benchmark for Finance}, 288 | author={Qianqian Xie and Weiguang Han and Xiao Zhang and Yanzhao Lai and Min Peng and Alejandro Lopez-Lira and Jimin Huang}, 289 | year={2023}, 290 | eprint={2306.05443}, 291 | archivePrefix={arXiv}, 292 | primaryClass={cs.CL} 293 | } 294 | 295 | @misc{xie2024FinBen, 296 | title={The FinBen: An Holistic Financial Benchmark for Large Language Models}, 297 | author={Qianqian Xie and Weiguang Han and Zhengyu Chen and Ruoyu Xiang and Xiao Zhang and Yueru He and Mengxi Xiao and Dong Li and Yongfu Dai and Duanyu Feng and Yijing Xu and Haoqiang Kang and Ziyan Kuang and Chenhan Yuan and Kailai Yang and Zheheng Luo and Tianlin Zhang and Zhiwei Liu and Guojun Xiong and Zhiyang Deng and Yuechen Jiang and Zhiyuan Yao and Haohang Li and Yangyang Yu and Gang Hu and Jiajia Huang and Xiao-Yang Liu and Alejandro Lopez-Lira and Benyou Wang and Yanzhao Lai and Hao Wang and Min Peng and Sophia Ananiadou and Jimin Huang}, 298 | year={2024}, 299 | eprint={2402.12659}, 300 | archivePrefix={arXiv}, 301 | primaryClass={cs.CL} 302 | } 303 | ``` 304 | 305 | 306 | 307 | ## 许可证 308 | 309 | PIXIU 采用 [MIT] 许可。有关详细信息,请参阅 [LICENSE](LICENSE) 文件。 310 | 311 | ## 星标历史 312 | 313 | ![Star History Chart](https://api.star-history.com/svg?repos=The-FinAI/PIXIU&type=Date) 314 | 315 | -------------------------------------------------------------------------------- /docker/DOCKERFILE: -------------------------------------------------------------------------------- 1 | FROM tothemoon/llm 2 | 3 | RUN python3 -m pip install -U --no-cache-dir sqlitedict 4 | RUN python3 -m pip install -U --no-cache-dir omegaconf 5 | RUN python3 -m pip install -U --no-cache-dir pycountry 6 | RUN python3 -m pip install -U --no-cache-dir seqeval 7 | RUN python3 -m pip install -U --no-cache-dir ipywidgets 8 | RUN python3 -m pip install -U --no-cache-dir pytablewriter 9 | RUN python3 -m pip install -U --no-cache-dir git+https://github.com/Tiiiger/bert_score -------------------------------------------------------------------------------- /docker/build_and_upload.sh: -------------------------------------------------------------------------------- 1 | export https_proxy=... 2 | export http_proxy=... 3 | export all_proxy=... 4 | docker_user=... 5 | tag=$(date +%Y%m%d) 6 | 7 | docker build --network host --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy --build-arg all_proxy=$all_proxy -t pixiu -f DOCKERFILE .
8 | docker tag pixiu $docker_user/pixiu:$tag 9 | docker push $docker_user/pixiu:$tag 10 | docker tag pixiu $docker_user/pixiu:latest 11 | docker push $docker_user/pixiu:latest 12 | -------------------------------------------------------------------------------- /notebooks/evaluate.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import sys\n", 10 | "sys.path.append('/data/hanweiguang/Projects/PIXIU')" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 25, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "from src.utils import MultiClient\n", 20 | "from sklearn.metrics import confusion_matrix, matthews_corrcoef, f1_score, accuracy_score\n", 21 | "import json" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 3, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "GENERATION_CONFIG = [\n", 31 | " 0.1, # int | float (numeric value between 0 and 1) in 'Temperature' Slider component\n", 32 | " 0.75, # int | float (numeric value between 0 and 1) in 'Top p' Slider component\n", 33 | " 40, # int | float (numeric value between 0 and 100) in 'Top k' Slider component\n", 34 | " 1, # int | float (numeric value between 1 and 4) in 'Beams Number' Slider component\n", 35 | " True, # do sample\n", 36 | " 8, # int | float (numeric value between 1 and 2000) in 'Max New Tokens' Slider component\n", 37 | " 1, # int | float (numeric value between 1 and 300) in 'Min New Tokens' Slider component\n", 38 | " 1.2, # int | float (numeric value between 1.0 and 2.0) in 'Repetition Penalty' Slider component\n", 39 | "]" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 4, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "with open(\"/data/hanweiguang/Projects/PIXIU/data/cikm18/test.jsonl\") as f:\n", 49 | " data = f.readlines()\n", 50 | " data = [json.loads(val) for val in data]" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 5, 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "Loaded as API: http://127.0.0.1:17860/ ✔\n" 63 | ] 64 | } 65 | ], 66 | "source": [ 67 | "worker_addrs = [\n", 68 | " f\"http://127.0.0.1:{17860 + i}\" for i in range(1)\n", 69 | "]\n", 70 | "clients = MultiClient(worker_addrs)" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 6, 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "name": "stderr", 80 | "output_type": "stream", 81 | "text": [ 82 | "100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 200/200 [03:21<00:00, 1.01s/it]\n" 83 | ] 84 | } 85 | ], 86 | "source": [ 87 | "results = clients.predict(\n", 88 | " [\n", 89 | " [\n", 90 | " datum[\"conversations\"][0][\"value\"]\n", 91 | " ] + GENERATION_CONFIG for datum in data[:200]\n", 92 | " ]\n", 93 | ")" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 11, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "labels = [\n", 103 | " datum[\"label\"] for datum in data[:200]\n", 104 | "]" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 12, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "y_true = [1 if i == \"Rise\" else 
0 for i in labels]\n", 114 | "y_pred = [1 if i == \"Rise\" else 0 for i in results]" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 22, 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "name": "stdout", 124 | "output_type": "stream", 125 | "text": [ 126 | "MCC: -0.05380001385625025\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "# Calculate confusion matrix\n", 132 | "tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()\n", 133 | "\n", 134 | "# Calculate Matthews correlation coefficient\n", 135 | "mcc = matthews_corrcoef(y_true, y_pred)\n", 136 | "print(f'MCC: {mcc}')" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 23, 142 | "metadata": {}, 143 | "outputs": [ 144 | { 145 | "name": "stdout", 146 | "output_type": "stream", 147 | "text": [ 148 | "F1: 0.4573069852941177\n" 149 | ] 150 | } 151 | ], 152 | "source": [ 153 | "f1 = f1_score(y_true, y_pred, average='weighted')\n", 154 | "print(f'F1: {f1}')" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 27, 160 | "metadata": {}, 161 | "outputs": [ 162 | { 163 | "name": "stdout", 164 | "output_type": "stream", 165 | "text": [ 166 | "accuracy: 0.51\n" 167 | ] 168 | } 169 | ], 170 | "source": [ 171 | "accuracy = accuracy_score(y_true, y_pred)\n", 172 | "print(f'accuracy: {accuracy}')" 173 | ] 174 | } 175 | ], 176 | "metadata": { 177 | "kernelspec": { 178 | "display_name": "Python 3 (ipykernel)", 179 | "language": "python", 180 | "name": "python3" 181 | }, 182 | "orig_nbformat": 4 183 | }, 184 | "nbformat": 4, 185 | "nbformat_minor": 2 186 | } 187 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | peft 3 | gradio 4 | tqdm 5 | scikit-learn 6 | sentencepiece 7 | pandas 8 | seqeval 9 | numpy 10 | evaluate 11 | openai 12 | rank_bm25 13 | spacy 14 | sentence_transformers 15 | src/factscore_package/en_core_web_sm-3.7.1.tar.gz 16 | vllm==0.2.7 17 | -------------------------------------------------------------------------------- /scripts/docker_run.sh: -------------------------------------------------------------------------------- 1 | # 需要先安装container-toolkit 2 | # https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html 3 | 4 | export https_proxy=... 5 | export http_proxy=... 6 | export all_proxy=... 7 | 8 | pixiu_path=... 9 | docker_user=tothemoon 10 | tag="latest" 11 | hf_home=... 12 | ssh_pub_key=... 13 | workdir="$pixiu_path" 14 | chown root:root $ssh_pub_key 15 | 16 | docker run --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \ 17 | --network host \ 18 | --env https_proxy=$https_proxy \ 19 | --env http_proxy=$http_proxy \ 20 | --env all_proxy=$all_proxy \ 21 | --env HF_HOME=$hf_home \ 22 | -it --rm \ 23 | --name pixiu \ 24 | -v $pixiu_path:$pixiu_path \ 25 | -v $hf_home:$hf_home \ 26 | -v $ssh_pub_key:/root/.ssh/authorized_keys \ 27 | -w $workdir \ 28 | $docker_user/pixiu:$tag \ 29 | --sshd_port 2201 --cmd "echo 'Hello, world!' 
&& /bin/bash" -------------------------------------------------------------------------------- /scripts/run_evaluation.sh: -------------------------------------------------------------------------------- 1 | pixiu_path='/root/PIXIU' 2 | export PYTHONPATH="$pixiu_path/src:$pixiu_path/src/financial-evaluation:$pixiu_path/src/metrics/BARTScore" 3 | echo $PYTHONPATH 4 | export CUDA_VISIBLE_DEVICES="0" 5 | 6 | python src/eval.py \ 7 | --model hf-causal-vllm \ 8 | --tasks flare_en_fintern \ 9 | --model_args use_accelerate=True,pretrained=llama-2-7b-chat-hf,tokenizer=llama-2-7b-chat-hf,use_fast=False,max_gen_toks=1024,dtype=float16 \ 10 | --no_cache \ 11 | --batch_size 2 \ 12 | --model_prompt 'finma_prompt' \ 13 | --num_fewshot 0 \ 14 | --write_out 15 | -------------------------------------------------------------------------------- /scripts/run_interface.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES='0,1,2,3,4,7' 2 | export PYTHONPATH='.../PIXIU/src' 3 | 4 | model_name_or_path='...' 5 | 6 | python src/interface.py \ 7 | --model_name_or_path $model_name_or_path \ 8 | --llama 9 | -------------------------------------------------------------------------------- /src/chatlm.py: -------------------------------------------------------------------------------- 1 | import os 2 | import asyncio 3 | import numpy as np 4 | import transformers 5 | from lm_eval.base import BaseLM 6 | from lm_eval import utils 7 | from tqdm import tqdm 8 | import time 9 | 10 | BACKOFF_TIME = 0.1 11 | 12 | async def single_chat(client, **kwargs): 13 | global BACKOFF_TIME 14 | backoff_time = BACKOFF_TIME 15 | while True: 16 | try: 17 | r = await client.post(**kwargs, timeout=20) 18 | json_response = r.json() 19 | s = json_response['choices'][0]["message"]['content'] 20 | time.sleep(backoff_time) 21 | return s 22 | except Exception: 23 | import traceback 24 | 25 | traceback.print_exc() 26 | time.sleep(backoff_time * 30) 27 | BACKOFF_TIME *= 1.05 28 | 29 | 30 | async def oa_completion(**kwargs): 31 | """Query OpenAI API for completion. 
32 | 33 | Retry with back-off until they respond 34 | """ 35 | import httpx 36 | 37 | async with httpx.AsyncClient() as client: 38 | tasks = [single_chat( 39 | client=client, 40 | url=kwargs["url"], headers=kwargs["headers"], 41 | json={ 42 | "temperature": kwargs["temperature"], "max_tokens": kwargs["max_tokens"], 43 | "model": kwargs["model"], "messages": [message,], 44 | } 45 | ) for message in kwargs["messages"]] 46 | results = await asyncio.gather(*tasks) 47 | return results 48 | 49 | 50 | class ChatLM(BaseLM): 51 | REQ_CHUNK_SIZE = 20 52 | 53 | def __init__(self, model, truncate=False): 54 | """ 55 | 56 | :param model: str 57 | :param truncate: bool 58 | Truncate input if too long (if False and input is too long, throw error) 59 | """ 60 | super().__init__() 61 | 62 | import openai 63 | 64 | self.model = model 65 | self.truncate = truncate 66 | # Read from environment variable OPENAI_API_SECRET_KEY 67 | api_key = os.environ["OPENAI_API_SECRET_KEY"] 68 | self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2") 69 | self.headers = { 70 | "Content-Type": "application/json", 71 | "Authorization": f"Bearer {api_key}" 72 | } 73 | 74 | @property 75 | def eot_token_id(self): 76 | return self.tokenizer.eos_token_id 77 | 78 | @property 79 | def max_length(self): 80 | # Note: the OpenAI API supports up to 2049 tokens, with the first token being the first input token 81 | return 4096 82 | 83 | @property 84 | def max_gen_toks(self): 85 | return 10 86 | 87 | @property 88 | def batch_size(self): 89 | # Isn't used because we override _loglikelihood_tokens 90 | raise NotImplementedError() 91 | 92 | @property 93 | def device(self): 94 | # Isn't used because we override _loglikelihood_tokens 95 | raise NotImplementedError() 96 | 97 | def tok_encode(self, string: str): 98 | return self.tokenizer.encode(string, add_special_tokens=False) 99 | 100 | def tok_decode(self, tokens): 101 | return self.tokenizer.decode(tokens) 102 | 103 | def _loglikelihood_tokens(self, requests, disable_tqdm=False): 104 | raise NotImplementedError() 105 | 106 | def greedy_until(self, requests): 107 | if not requests: 108 | return [] 109 | res = [] 110 | 111 | def _collate(x): 112 | toks = self.tok_encode(x[0]) 113 | return len(toks), x[0] 114 | 115 | re_ord = utils.Reorderer(requests, _collate) 116 | 117 | def sameuntil_chunks(xs, size): 118 | ret = [] 119 | lastuntil = "" 120 | for x in xs: 121 | if len(ret) >= size: 122 | yield ret, lastuntil 123 | ret = [] 124 | lastuntil = "" 125 | ret.append(x) 126 | 127 | if ret: 128 | yield ret, lastuntil 129 | 130 | # todo: more intelligent batching for heterogeneous `until` 131 | for chunk, until in tqdm( 132 | list(sameuntil_chunks(re_ord.get_reordered(), self.REQ_CHUNK_SIZE)) 133 | ): 134 | inps = [] 135 | for context in chunk: 136 | inps.append(context[0]) 137 | 138 | responses = asyncio.run(oa_completion( 139 | url="https://api.openai.com/v1/chat/completions", 140 | headers=self.headers, 141 | model=self.model, 142 | messages=[{"role": "user", "content": inp} for inp in inps], 143 | max_tokens=self.max_gen_toks, 144 | temperature=0.0, 145 | # stop=until, 146 | )) 147 | 148 | for resp, context in zip(responses, chunk): 149 | s = resp 150 | 151 | # partial caching 152 | self.cache_hook.add_partial("greedy_until", (context, ""), s) 153 | 154 | res.append(s) 155 | 156 | return re_ord.get_original(res) 157 | 158 | def _model_call(self, inps): 159 | # Isn't used because we override _loglikelihood_tokens 160 | raise NotImplementedError() 161 | 162 | def 
_model_generate(self, context, max_length, eos_token_id): 163 | # Isn't used because we override greedy_until 164 | raise NotImplementedError() 165 | -------------------------------------------------------------------------------- /src/eval.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import logging 4 | import os 5 | import tasks 6 | 7 | from lm_eval import utils 8 | import evaluator 9 | from model_prompt import MODEL_PROMPT_MAP 10 | 11 | logging.getLogger("openai").setLevel(logging.WARNING) 12 | 13 | def parse_args(): 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument("--model", required=True) 16 | parser.add_argument("--model_args", default="") 17 | parser.add_argument("--tasks", default=None, choices=utils.MultiChoice(tasks.ALL_TASKS)) 18 | parser.add_argument("--model_prompt", default="no_prompt", choices=list(MODEL_PROMPT_MAP.keys())) 19 | parser.add_argument("--provide_description", action="store_true") 20 | parser.add_argument("--num_fewshot", type=int, default=0) 21 | parser.add_argument("--batch_size", type=str, default=None) 22 | parser.add_argument("--max_batch_size", type=int, default=None, 23 | help="Maximal batch size to try with --batch_size auto") 24 | parser.add_argument("--device", type=str, default=None) 25 | parser.add_argument("--output_path", default=None) 26 | parser.add_argument("--limit", type=float, default=None, 27 | help="Limit the number of examples per task. " 28 | "If <1, limit is a percentage of the total number of examples.") 29 | parser.add_argument("--data_sampling", type=float, default=None) 30 | parser.add_argument("--no_cache", action="store_true") 31 | parser.add_argument("--decontamination_ngrams_path", default=None) 32 | parser.add_argument("--description_dict_path", default=None) 33 | parser.add_argument("--check_integrity", action="store_true") 34 | parser.add_argument("--write_out", action="store_true", default=False) 35 | parser.add_argument("--output_base_path", type=str, default=None) 36 | 37 | return parser.parse_args() 38 | 39 | 40 | def main(): 41 | args = parse_args() 42 | 43 | assert not args.provide_description # not implemented 44 | 45 | if args.limit: 46 | print( 47 | "WARNING: --limit SHOULD ONLY BE USED FOR TESTING. REAL METRICS SHOULD NOT BE COMPUTED USING LIMIT." 
48 | ) 49 | 50 | if args.tasks is None: 51 | task_names = tasks.ALL_TASKS 52 | else: 53 | task_names = utils.pattern_match(args.tasks.split(","), tasks.ALL_TASKS) 54 | 55 | print(f"Selected Tasks: {task_names}") 56 | 57 | description_dict = {} 58 | if args.description_dict_path: 59 | with open(args.description_dict_path, "r") as f: 60 | description_dict = json.load(f) 61 | 62 | results = evaluator.simple_evaluate( 63 | model=args.model, 64 | model_args=args.model_args, 65 | tasks=task_names, 66 | num_fewshot=args.num_fewshot, 67 | batch_size=args.batch_size, 68 | max_batch_size=args.max_batch_size, 69 | device=args.device, 70 | no_cache=args.no_cache, 71 | limit=args.limit, 72 | description_dict=description_dict, 73 | decontamination_ngrams_path=args.decontamination_ngrams_path, 74 | check_integrity=args.check_integrity, 75 | write_out=args.write_out, 76 | output_base_path=args.output_base_path, 77 | model_prompt=args.model_prompt 78 | ) 79 | 80 | dumped = json.dumps(results, indent=2) 81 | print(dumped) 82 | 83 | if args.output_path: 84 | os.makedirs(os.path.dirname(args.output_path), exist_ok=True) 85 | with open(args.output_path, "w") as f: 86 | f.write(dumped) 87 | 88 | batch_sizes = ",".join(map(str, results["config"]["batch_sizes"])) 89 | print( 90 | f"{args.model} ({args.model_args}), limit: {args.limit}, provide_description: {args.provide_description}, " 91 | f"num_fewshot: {args.num_fewshot}, batch_size: {args.batch_size}{f' ({batch_sizes})' if batch_sizes else ''}" 92 | ) 93 | print(evaluator.make_table(results)) 94 | 95 | 96 | if __name__ == "__main__": 97 | main() 98 | -------------------------------------------------------------------------------- /src/evaluator.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import itertools 3 | import numpy as np 4 | import random 5 | 6 | from lm_eval.utils import positional_deprecated, run_task_tests 7 | import lm_eval.metrics 8 | import lm_eval.models 9 | import lm_eval.tasks 10 | import lm_eval.base 11 | 12 | from model_prompt import MODEL_PROMPT_MAP 13 | from chatlm import ChatLM 14 | import tasks as ta 15 | 16 | @positional_deprecated 17 | def simple_evaluate( 18 | model, 19 | model_args=None, 20 | tasks=[], 21 | num_fewshot=0, 22 | batch_size=None, 23 | max_batch_size=None, 24 | device=None, 25 | no_cache=False, 26 | limit=None, 27 | bootstrap_iters=100, 28 | description_dict=None, 29 | check_integrity=False, 30 | decontamination_ngrams_path=None, 31 | write_out=False, 32 | output_base_path=None, 33 | model_prompt=None 34 | ): 35 | """Instantiate and evaluate a model on a list of tasks. 36 | 37 | :param model: Union[str, LM] 38 | Name of model or LM object, see lm_eval.models.get_model 39 | :param model_args: Optional[str] 40 | String arguments for each model class, see LM.create_from_arg_string. 41 | Ignored if `model` argument is a LM object. 42 | :param tasks: list[Union[str, Task]] 43 | List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise. 44 | :param num_fewshot: int 45 | Number of examples in few-shot context 46 | :param batch_size: int or str, optional 47 | Batch size for model 48 | :param max_batch_size: int, optional 49 | Maximal batch size to try with automatic batch size detection 50 | :param device: str, optional 51 | PyTorch device (e.g. 
"cpu" or "cuda:0") for running models 52 | :param no_cache: bool 53 | Whether or not to cache 54 | :param limit: int or float, optional 55 | Limit the number of examples per task (only use this for testing), If <1, limit is a percentage of the total number of examples. 56 | :param bootstrap_iters: 57 | Number of iterations for bootstrap statistics 58 | :param description_dict: dict[str, str] 59 | Dictionary of custom task descriptions of the form: `task_name: description` 60 | :param check_integrity: bool 61 | Whether to run the relevant part of the test suite for the tasks 62 | :param write_out: bool 63 | If True, write details about prompts and logits to json for all tasks 64 | :param output_base_path: str, optional 65 | Directory to which detailed eval info will be written. Defaults to present working dir. 66 | :return 67 | Dictionary of results 68 | """ 69 | random.seed(1234) 70 | np.random.seed(1234) 71 | 72 | assert len(tasks) != 0, "No tasks specified" 73 | 74 | if isinstance(model, str): 75 | if model_args is None: 76 | model_args = "" 77 | if model[:3] != "gpt": 78 | lm = lm_eval.models.get_model(model).create_from_arg_string( 79 | model_args, {"batch_size": batch_size, "max_batch_size": max_batch_size, "device": device} 80 | ) 81 | else: 82 | lm = ChatLM(model) 83 | else: 84 | assert isinstance(model, lm_eval.base.LM) 85 | lm = model 86 | 87 | if not no_cache: 88 | lm = lm_eval.base.CachingLM( 89 | lm, 90 | "lm_cache/" 91 | + (model if isinstance(model, str) else model.model.config._name_or_path) 92 | + "_" 93 | + model_args.replace("=", "-").replace(",", "_").replace("/", "-") 94 | + ".db", 95 | ) 96 | 97 | task_dict = ta.get_task_dict(tasks) 98 | 99 | if check_integrity: 100 | run_task_tests(task_list=tasks) 101 | 102 | results = evaluate( 103 | lm=lm, 104 | task_dict=task_dict, 105 | num_fewshot=num_fewshot, 106 | limit=limit, 107 | bootstrap_iters=bootstrap_iters, 108 | description_dict=description_dict, 109 | decontamination_ngrams_path=decontamination_ngrams_path, 110 | write_out=write_out, 111 | output_base_path=output_base_path, 112 | model_prompt=model_prompt 113 | ) 114 | 115 | # add info about the model and few shot config 116 | results["config"] = { 117 | "model": (model if isinstance(model, str) else model.model.config._name_or_path), 118 | "model_args": model_args, 119 | "num_fewshot": num_fewshot, 120 | "batch_size": batch_size, 121 | "batch_sizes": list(lm.batch_sizes.values()) if hasattr(lm, "batch_sizes") else [], 122 | "device": device, 123 | "no_cache": no_cache, 124 | "limit": limit, 125 | "bootstrap_iters": bootstrap_iters, 126 | "description_dict": description_dict, 127 | } 128 | 129 | return results 130 | 131 | 132 | decontaminate_suffix = "_decontaminate" 133 | 134 | 135 | @positional_deprecated 136 | def evaluate( 137 | lm, 138 | task_dict, 139 | provide_description=None, 140 | num_fewshot=0, 141 | limit=None, 142 | bootstrap_iters=100000, 143 | description_dict=None, 144 | decontamination_ngrams_path=None, 145 | write_out=False, 146 | output_base_path=None, 147 | model_prompt=None 148 | ): 149 | """Instantiate and evaluate a model on a list of tasks. 150 | 151 | :param lm: obj 152 | Language Model 153 | :param task_dict: dict[str, Task] 154 | Dictionary of tasks. Tasks will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise. 
155 | :param provide_description: bool 156 | Not implemented, and this option is deprecated and will be removed in a future version in favor of a different description providing method 157 | :param num_fewshot: int 158 | Number of examples in few-shot context 159 | :param limit: int, optional 160 | Limit the number of examples per task (only use this for testing) 161 | :param bootstrap_iters: 162 | Number of iterations for bootstrap statistics 163 | :param description_dict: dict[str, str] 164 | Dictionary of custom task descriptions of the form: `task_name: description` 165 | :param write_out: bool 166 | If True, write all prompts, logits and metrics to json for offline analysis 167 | :param output_base_path: str, optional 168 | Directory to which detailed eval info will be written. Defaults to present working dir 169 | :return 170 | Dictionary of results 171 | """ 172 | # TODO: completely refactor this entire function to not be a huge mess, ideally breaking it down into smaller pieces 173 | 174 | # TODO: todo: implement proper description-providing system 175 | assert not provide_description # not implemented. 176 | if provide_description is not None: 177 | # nudge people to not specify it at all 178 | print( 179 | "WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict" 180 | ) 181 | 182 | decontaminate = decontamination_ngrams_path is not None 183 | 184 | task_dict_items = [ 185 | (name, task) 186 | for name, task in task_dict.items() 187 | if (task.has_validation_docs() or task.has_test_docs()) 188 | ] 189 | 190 | results = collections.defaultdict(dict) 191 | versions = collections.defaultdict(dict) 192 | 193 | requests = collections.defaultdict(list) 194 | turn_requests = collections.defaultdict(dict) 195 | requests_origin = collections.defaultdict(list) 196 | 197 | overlaps = collections.defaultdict(list) # {task_name: contaminated_docs} 198 | 199 | # If we ever run into issues where the eval tasks don't fit in memory and we can't afford a machine with bigger 200 | # memory, we can always modify this plumbing to support that, but I didn't want to include it just yet because 201 | # over-engineering is bad (or we could make it write the requests to disk and then read them back out again 202 | # - probably using an sqlite db because of all the moving parts we have 203 | 204 | # TODO: we need unit tests & sanity checks or something to ensure that the return of `validation_docs` is stable 205 | docs = {} 206 | write_out_info = {} 207 | 208 | docs_for_decontamination = collections.defaultdict(list) 209 | 210 | # get lists of each type of request 211 | for task_name, task in task_dict_items: 212 | versions[task_name] = task.VERSION 213 | # default to test doc, fall back to val doc if validation unavailable 214 | # TODO: the test-fallback-to-val system isn't final, we should revisit it at some point 215 | if task.has_test_docs(): 216 | task_doc_func = task.test_docs 217 | task_set = "test" # Required for caching in the decontamination 218 | elif task.has_validation_docs(): 219 | task_set = "val" # Required for caching in the decontamination 220 | task_doc_func = task.validation_docs 221 | else: 222 | raise RuntimeError("Task has neither test_docs nor validation_docs") 223 | 224 | # deterministically shuffle docs and chop off the first `limit` because sometimes docs are in some kind of order 225 | task_docs = list(task_doc_func()) 226 | rnd = random.Random() 227 | rnd.seed(42) 228 | rnd.shuffle(task_docs) 229 | print(f"Task: 
{task_name}; number of docs: {len(task_docs)}") 230 | 231 | if write_out: 232 | prompt_details = [] 233 | 234 | description = ( 235 | description_dict[task_name] 236 | if description_dict and task_name in description_dict 237 | else "" 238 | ) 239 | if limit is not None: 240 | limit = int(len(task_docs) * limit) if limit < 1.0 else int(limit) 241 | 242 | if model_prompt is None: 243 | model_prompt = 'no_prompt' 244 | 245 | for doc_id, doc in enumerate(itertools.islice(task_docs, 0, limit)): 246 | if decontaminate and task.should_decontaminate(): 247 | docs_for_decontamination[(task_name, task_set)].append( 248 | task.doc_to_decontamination_query(doc) 249 | ) 250 | 251 | docs[(task_name, doc_id)] = doc 252 | ctx = task.fewshot_context( 253 | doc=doc, num_fewshot=num_fewshot, rnd=rnd, description=description 254 | ) 255 | 256 | ctx = MODEL_PROMPT_MAP[model_prompt](ctx) 257 | 258 | reqs = task.construct_requests(doc, ctx) 259 | 260 | if write_out: 261 | prompt_details.append({"doc_id": doc_id}) 262 | 263 | # print the prompt for the first few documents 264 | if doc_id < 1: 265 | print( 266 | f"Task: {task_name}; document {doc_id}; context prompt (starting on next line):\n{ctx}\n(end of prompt on previous line)" 267 | ) 268 | print("Requests:", reqs) 269 | 270 | if not isinstance(reqs, (list, tuple)): 271 | reqs = [reqs] 272 | for i, req in enumerate(reqs): 273 | requests[req.request_type].append(req) 274 | # i: index in requests for a single task instance 275 | # doc_id: unique id that we can get back to a doc using `docs` 276 | diag_id = doc.get("dialogue_id", doc_id) 277 | turn = doc.get("turn", 0) 278 | turn_requests[(diag_id, turn)] = (task_name, doc, doc_id, req) 279 | requests_origin[req.request_type].append((i, task_name, doc, doc_id, diag_id, turn)) 280 | 281 | #print("req: " + str(req.args)) 282 | 283 | if write_out: 284 | prompt_details[-1][f"prompt_{i}"] = "".join( 285 | (map(lambda x: "".join(x), req.args)) 286 | ) 287 | 288 | #print("request:" + request[]) 289 | if write_out: 290 | write_out_info[task_name] = prompt_details 291 | 292 | # Compare all tasks/sets at once to ensure a single training set scan 293 | if decontaminate: 294 | from lm_eval.decontamination.decontaminate import get_train_overlap 295 | 296 | print("Finding train/test overlap, please wait...") 297 | overlaps = get_train_overlap( 298 | docs_for_decontamination, decontamination_ngrams_path, limit 299 | ) 300 | 301 | # all responses for each (task, doc) 302 | process_res_queue = collections.defaultdict(list) 303 | 304 | # execute each type of request 305 | for reqtype, reqs in requests.items(): 306 | # TODO: right now, this code runs multiple separate LM requests for multiple Requests differing 307 | # only in index. We could implement some kind of caching, but that would be more of a band-aid 308 | # solution. we could also implement some kind of auto-grouping here; 309 | # they should end up next to each other. 
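        # A summary of the turn-scheduled dispatch below (description only, no
        # added behavior): requests are replayed turn by turn. Turn 0 requests
        # go straight to the LM; for each later turn, task.reformulate_turn_req
        # rebuilds the request from the responses already cached in
        # turn_requests[(diag_id, t)], so dialogue tasks can condition on the
        # model's earlier answers. Docs without a "turn" field default to
        # turn 0 and are all handled in the first pass.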
310 | 311 | max_turns = max([val[-1] for val in requests_origin[reqtype]]) 312 | print("Running", reqtype, "requests") 313 | print(f"Maximum turn index: {max_turns}") 314 | task_turns = {} 315 | for cur_turn in range(max_turns+1): 316 | print(f"Running turn {cur_turn}") 317 | 318 | filtered_reqs = [] 319 | 320 | for req, (i, task_name, doc, doc_id, diag_id, turn) in zip(reqs, requests_origin[reqtype] 321 | ): 322 | if turn != cur_turn: 323 | continue 324 | task_turns[task_name] = max(turn, task_turns.get(task_name, -1)) 325 | task = task_dict[task_name] 326 | req = task.reformulate_turn_req(req, [(turn_requests.get((diag_id, t), None), t) for 327 | t in range(turn)], turn) 328 | filtered_reqs.append([req, (i, task_name, doc, doc_id, diag_id, turn)]) 329 | 330 | resps = getattr(lm, reqtype)([req[0].args for req in filtered_reqs])  # send only this turn's reformulated requests, keeping resps aligned with filtered_reqs below 331 | resps = [ 332 | x if req[0].index is None else x[req[0].index] for x, req in zip(resps, filtered_reqs 333 | ) 334 | ] 335 | 336 | for resp, req in zip(resps, filtered_reqs): 337 | i, task_name, doc, doc_id, diag_id, turn = req[1] 338 | task = task_dict[task_name] 339 | if not task.EVAL_LAST_TURN or turn == task_turns[task_name]: 340 | process_res_queue[(task_name, doc_id)].append((i, resp)) 341 | turn_requests[(diag_id, turn)] = resp 342 | 343 | if write_out: 344 | write_out_info[task_name][doc_id][f"logit_{i}"] = resp 345 | task = task_dict[task_name] 346 | if isinstance(task, lm_eval.base.MultipleChoiceTask): 347 | write_out_info[task_name][doc_id]["truth"] = doc["gold"] 348 | elif isinstance(task, lm_eval.tasks.winogrande.Winogrande): 349 | write_out_info[task_name][doc_id]["truth"] = task.answer_to_num[ 350 | doc["answer"] 351 | ] 352 | else: 353 | write_out_info[task_name][doc_id]["truth"] = task.doc_to_target(doc) 354 | vals = collections.defaultdict(list) 355 | 356 | # unpack results and sort back in order and return control to Task 357 | for (task_name, doc_id), requests in process_res_queue.items(): 358 | requests.sort(key=lambda x: x[0]) 359 | requests = [x[1] for x in requests] 360 | 361 | task = task_dict[task_name] 362 | doc = docs[(task_name, doc_id)] 363 | print("doc: "+ str(doc)) 364 | print("requests: "+ str(requests)) 365 | 366 | 367 | metrics = task.process_results(doc, requests) 368 | for metric, value in metrics.items(): 369 | vals[(task_name, metric)].append(value) 370 | 371 | if write_out: 372 | write_out_info[task_name][doc_id][metric] = str(value) 373 | 374 | # Re-use the evaluation for the decontaminated set by just ignoring the overlaps 375 | if decontaminate and task_name in overlaps: 376 | if doc_id not in overlaps[task_name]: 377 | vals[(task_name, metric + decontaminate_suffix)].append(value) 378 | 379 | # aggregate results 380 | for (task_name, metric), items in vals.items(): 381 | task = task_dict[task_name] 382 | real_metric = metric # key when looking up the metric with task.aggregation 383 | if metric.endswith(decontaminate_suffix): 384 | real_metric = metric.replace( 385 | decontaminate_suffix, "" 386 | ) # decontaminated still uses the same metric 387 | 388 | results[task_name][metric] = task.aggregation()[real_metric](items) 389 | # hotfix: bleu, chrf, ter seem to be really expensive to bootstrap 390 | # so we run them for fewer iterations.
still looking for a cleaner way to do this 391 | 392 | stderr = lm_eval.metrics.stderr_for_metric( 393 | metric=task.aggregation()[real_metric], 394 | bootstrap_iters=min(bootstrap_iters, 1000) if metric in ["bleu", "chrf", "ter"] else bootstrap_iters, 395 | ) 396 | 397 | if stderr is not None: 398 | results[task_name][metric + "_stderr"] = stderr(items) 399 | 400 | if write_out: 401 | import json 402 | import pathlib 403 | 404 | output_base_path = ( 405 | pathlib.Path(output_base_path) 406 | if output_base_path is not None 407 | else pathlib.Path(".") 408 | ) 409 | try: 410 | output_base_path.mkdir(parents=True, exist_ok=False) 411 | except FileExistsError: 412 | pass 413 | 414 | for task_name, _ in task_dict_items: 415 | with open( 416 | output_base_path.joinpath(f"{task_name}_write_out_info.json"), 417 | "w", 418 | encoding="utf8", 419 | ) as fp: 420 | json.dump(write_out_info[task_name], fp, indent=4, ensure_ascii=False) 421 | 422 | return {"results": dict(results), "versions": dict(versions)} 423 | 424 | 425 | def make_table(result_dict): 426 | """Generate table of results.""" 427 | from pytablewriter import MarkdownTableWriter, LatexTableWriter 428 | 429 | md_writer = MarkdownTableWriter() 430 | latex_writer = LatexTableWriter() 431 | md_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"] 432 | latex_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"] 433 | 434 | values = [] 435 | 436 | for k, dic in result_dict["results"].items(): 437 | version = result_dict["versions"][k] 438 | for m, v in dic.items(): 439 | if m.endswith("_stderr"): 440 | continue 441 | 442 | if m + "_stderr" in dic: 443 | se = dic[m + "_stderr"] 444 | values.append([k, version, m, "%.4f" % v, "±", "%.4f" % se]) 445 | else: 446 | values.append([k, version, m, "%.4f" % v, "", ""]) 447 | k = "" 448 | version = "" 449 | md_writer.value_matrix = values 450 | latex_writer.value_matrix = values 451 | 452 | # todo: make latex table look good 453 | # print(latex_writer.dumps()) 454 | 455 | return md_writer.dumps() 456 | -------------------------------------------------------------------------------- /src/factscore_package/.cache/demons.json: -------------------------------------------------------------------------------- 1 | {"He made his acting debut in the film The Moon is the Sun's Dream (1992), and continued to appear in small and supporting roles throughout the 1990s.": ["He made his acting debut in the film.", "He made his acting debut in The Moon is the Sun's Dream.", "The Moon is the Sun's Dream is a film.", "The Moon is the Sun's Dream was released in 1992.", "After his acting debut, he appeared in small and supporting roles.", "After his acting debut, he appeared in small and supporting roles throughout the 1990s."], "He is also a successful producer and engineer, having worked with a wide variety of artists, including Willie Nelson, Tim McGraw, and Taylor Swift.": ["He is successful.", "He is a producer.", "He is a engineer.", "He has worked with a wide variety of artists.", "Willie Nelson is an artist.", "He has worked with Willie Nelson.", "Tim McGraw is an artist.", "He has worked with Tim McGraw.", "Taylor Swift is an artist.", "He has worked with Taylor Swift."], "In 1963, Collins became one of the third group of astronauts selected by NASA and he served as the back-up Command Module Pilot for the Gemini 7 mission.": ["Collins became an astronaut.", "Collins became one of the third group of astronauts.", "Collins became one of the third group of astronauts selected.", 
"Collins became one of the third group of astronauts selected by NASA.", "Collins became one of the third group of astronauts selected by NASA in 1963.", "He served as the Command Module Pilot.", "He served as the back-up Command Module Pilot.", "He served as the Command Module Pilot for the Gemini 7 mission."], "In addition to his acting roles, Bateman has written and directed two short films and is currently in development on his feature debut.": ["Bateman has acting roles.", "Bateman has written two short films.", "Bateman has directed two short films.", "Bateman has written and directed two short films.", "Bateman is currently in development on his feature debut."], "Michael Collins (born October 31, 1930) is a retired American astronaut and test pilot who was the Command Module Pilot for the Apollo 11 mission in 1969.": ["Michael Collins was born on October 31, 1930.", "Michael Collins is retired.", "Michael Collins is an American.", "Michael Collins was an astronaut.", "Michael Collins was a test pilot.", "Michael Collins was the Command Module Pilot.", "Michael Collins was the Command Module Pilot for the Apollo 11 mission.", "Michael Collins was the Command Module Pilot for the Apollo 11 mission in 1969."], "He was an American composer, conductor, and musical director.": ["He was an American.", "He was a composer.", "He was a conductor.", "He was a musical director."], "She currently stars in the romantic comedy series, Love and Destiny, which premiered in 2019.": ["She currently stars in Love and Destiny.", "Love and Destiny is a romantic comedy series.", "Love and Destiny premiered in 2019. "], "His music has been described as a mix of traditional Mexican and Latin American styles, as well as jazz, folk, and rock.": ["His music has been described as a mix.", "His music has been described as a mix of traditional Mexican, Latin American styles, as well as jazz, folk, and rock."], "He also serves as an ambassador for the charity Leonard Cheshire Disability.": ["He serves as an ambassador.", "He serves as an ambassador for Leonard Cheshire Disability.", "Leonard Cheshire Disability is a charity."], "He began his career in Nashville in the late 1950s and has since released numerous albums, including a greatest hits collection in 1999.": ["He began his career in Nashville.", "He began his career in the late 1950s.", "He began his career in Nashville in the late 1950s.", "Since he began his career, he has released numerous albums.", "Since he began his career, he has released a greatest hits collection.", "Since he began his career, he has released a greatest hits collection in 1999."], "He has been performing since the age of 8, when he joined a band in his hometown of Guadalajara and has since gone on to record six studio albums and several singles of his own original material.": ["He has been performing since the age of 8.", "He joined a band.", "His hometown is Guadalajara.", "He joined a band in his hometown.", "He joined a band in his hometown of Guadalajara at the age of 8.", "He has gone on to record six studio albums.", "He has gone on to record six studio albums at the age of 8.", "He has gone on to record several singles of his own original material.", "He has gone on to record several singles of his own original material at the age of 8."], "She is also the former President of the Malaysian Chinese Association (MCA) from 2010 to 2013.": ["She is the former President.", "She is also the former President of the Malaysian Chinese Association (MCA)", "She is also the former 
President of the Malaysian Chinese Association (MCA) from 2010 to 2013."], "During his professional career, McCoy played for the Broncos, the San Diego Chargers, the Minnesota Vikings, and the Jacksonville Jaguars.": ["McCoy played for the Broncos.", "McCoy played for the Broncos during his professional career.", "McCoy played for the San Diego Chargers.", "McCoy played for the San Diego Chargers during his professional career.", "McCoy played for the Minnesota Vikings.", "McCoy played for the Minnesota Vikings during his professional career.", "McCoy played for the Jacksonville Jaguars.", "McCoy played for the Jacksonville Jaguars during his professional career."], "Miller has been described as the architect of Trump's controversial immigration policies, and has previously worked for Alabama Senator Jeff Sessions on immigration issues.": ["Miller has been described as the architect.", "Miller has been described as the architect of Trump's controversial immigration policies.", "Miller has previously worked for Alabama Senator Jeff Sessions.", "Miller has previously worked for Alabama Senator Jeff Sessions on immigration issues."], "Her work is often described as whimsical and dreamlike.": ["Her work is often described as whimsical.", "Her work is often described as dreamlike."], "He graduated from the United States Military Academy in 1952, and then went on to serve in the United States Air Force.": ["He graduated from the United States Military Academy.", "He graduated from the United States Military Academy in 1952.", "He went on to serve in the United States Air Force.", "He went on to serve in the United States Air Force after he graduated from the United States Military Academy."], "He is best known for his roles in the films Memories of Murder (2003), The Host (2006), (...) 
and Parasite (2019).": ["One of his best known roles is in Memories of Murder.", "Memories of Murder is a film.", "Memories of Murder was released in 2003.", "One of his best known roles is in The Host.", "The Host is a film.", "The Host was released in 2006.", "One of his best known roles is in Parasite.", "Parasite is a film.", "Parasite was released in 2019."], "Song Kang-ho was born in Gongju, South Korea in 1967.": ["Song Kang-ho was born in Gongju.", "Song Kang-ho was born in South Korea.", "Song Kang-ho was born in 1967."], "He studied theater at Chung-Ang University in Seoul.": ["He studied theater.", "He studied at Chung-Ang University.", "He studied at Chung-Ang University in Seoul."], "His breakthrough came with the leading role in the acclaimed crime-drama film Memories of Murder in 2003.": ["His breakthrough came with Memories of Murder.", "He was the leading role in Memories of Murder.", "Memories of Murder was released in 2003.", "Memories of Murder is a film.", "Memories of Murder is an acclaimed crime-drama film."], "This was followed by the monster movie The Host in 2006, which became the highest-grossing film in Korean history at the time.": ["This was followed by The Host.", "The Host is the movie.", "The Host is a monster movie.", "The Host was released in 2006.", "The Host became the highest-grossing film in Korean history at the time.", "The Host is not the highest-grossing film in Korean history anymore."]} 2 | -------------------------------------------------------------------------------- /src/factscore_package/.cache/demons.txt: -------------------------------------------------------------------------------- 1 | He made his acting debut in the film The Moon is the Sun's Dream (1992), and continued to appear in small and supporting roles throughout the 1990s. 2 | - He made his acting debut in the film. 3 | - He made his acting debut in The Moon is the Sun's Dream. 4 | - The Moon is the Sun's Dream is a film. 5 | - The Moon is the Sun's Dream was released in 1992. 6 | - After his acting debut, he appeared in small and supporting roles. 7 | - After his acting debut, he appeared in small and supporting roles throughout the 1990s. 8 | 9 | He is also a successful producer and engineer, having worked with a wide variety of artists, including Willie Nelson, Tim McGraw, and Taylor Swift. 10 | - He is successful. 11 | - He is a producer. 12 | - He is a engineer. 13 | - He has worked with a wide variety of artists. 14 | - Willie Nelson is an artist. 15 | - He has worked with Willie Nelson. 16 | - Tim McGraw is an artist. 17 | - He has worked with Tim McGraw. 18 | - Taylor Swift is an artist. 19 | - He has worked with Taylor Swift. 20 | 21 | In 1963, Collins became one of the third group of astronauts selected by NASA and he served as the back-up Command Module Pilot for the Gemini 7 mission. 22 | - Collins became an astronaut. 23 | - Collins became one of the third group of astronauts. 24 | - Collins became one of the third group of astronauts selected. 25 | - Collins became one of the third group of astronauts selected by NASA. 26 | - Collins became one of the third group of astronauts selected by NASA in 1963. 27 | - He served as the Command Module Pilot. 28 | - He served as the back-up Command Module Pilot. 29 | - He served as the Command Module Pilot for the Gemini 7 mission. 30 | 31 | In addition to his acting roles, Bateman has written and directed two short films and is currently in development on his feature debut. 32 | - Bateman has acting roles. 
33 | - Bateman has written two short films. 34 | - Bateman has directed two short films. 35 | - Bateman has written and directed two short films. 36 | - Bateman is currently in development on his feature debut. 37 | 38 | Michael Collins (born October 31, 1930) is a retired American astronaut and test pilot who was the Command Module Pilot for the Apollo 11 mission in 1969. 39 | - Michael Collins was born on October 31, 1930. 40 | - Michael Collins is retired. 41 | - Michael Collins is an American. 42 | - Michael Collins was an astronaut. 43 | - Michael Collins was a test pilot. 44 | - Michael Collins was the Command Module Pilot. 45 | - Michael Collins was the Command Module Pilot for the Apollo 11 mission. 46 | - Michael Collins was the Command Module Pilot for the Apollo 11 mission in 1969. 47 | 48 | He was an American composer, conductor, and musical director. 49 | - He was an American. 50 | - He was a composer. 51 | - He was a conductor. 52 | - He was a musical director. 53 | 54 | She currently stars in the romantic comedy series, Love and Destiny, which premiered in 2019. 55 | - She currently stars in Love and Destiny. 56 | - Love and Destiny is a romantic comedy series. 57 | - Love and Destiny premiered in 2019. 58 | 59 | His music has been described as a mix of traditional Mexican and Latin American styles, as well as jazz, folk, and rock. 60 | - His music has been described as a mix. 61 | - His music has been described as a mix of traditional Mexican, Latin American styles, as well as jazz, folk, and rock. 62 | 63 | He also serves as an ambassador for the charity Leonard Cheshire Disability. 64 | - He serves as an ambassador. 65 | - He serves as an ambassador for Leonard Cheshire Disability. 66 | - Leonard Cheshire Disability is a charity. 67 | 68 | He began his career in Nashville in the late 1950s and has since released numerous albums, including a greatest hits collection in 1999. 69 | - He began his career in Nashville. 70 | - He began his career in the late 1950s. 71 | - He began his career in Nashville in the late 1950s. 72 | - Since he began his career, he has released numerous albums. 73 | - Since he began his career, he has released a greatest hits collection. 74 | - Since he began his career, he has released a greatest hits collection in 1999. 75 | 76 | He has been performing since the age of 8, when he joined a band in his hometown of Guadalajara and has since gone on to record six studio albums and several singles of his own original material. 77 | - He has been performing since the age of 8. 78 | - He joined a band. 79 | - His hometown is Guadalajara. 80 | - He joined a band in his hometown. 81 | - He joined a band in his hometown of Guadalajara at the age of 8. 82 | - He has gone on to record six studio albums. 83 | - He has gone on to record six studio albums at the age of 8. 84 | - He has gone on to record several singles of his own original material. 85 | - He has gone on to record several singles of his own original material at the age of 8. 86 | 87 | She is also the former President of the Malaysian Chinese Association (MCA) from 2010 to 2013. 88 | - She is the former President. 89 | - She is also the former President of the Malaysian Chinese Association (MCA) 90 | - She is also the former President of the Malaysian Chinese Association (MCA) from 2010 to 2013. 91 | 92 | During his professional career, McCoy played for the Broncos, the San Diego Chargers, the Minnesota Vikings, and the Jacksonville Jaguars. 93 | - McCoy played for the Broncos. 
94 | - McCoy played for the Broncos during his professional career. 95 | - McCoy played for the San Diego Chargers. 96 | - McCoy played for the San Diego Chargers during his professional career. 97 | - McCoy played for the Minnesota Vikings. 98 | - McCoy played for the Minnesota Vikings during his professional career. 99 | - McCoy played for the Jacksonville Jaguars. 100 | - McCoy played for the Jacksonville Jaguars during his professional career. 101 | 102 | Miller has been described as the architect of Trump's controversial immigration policies, and has previously worked for Alabama Senator Jeff Sessions on immigration issues. 103 | - Miller has been described as the architect. 104 | - Miller has been described as the architect of Trump's controversial immigration policies. 105 | - Miller has previously worked for Alabama Senator Jeff Sessions. 106 | - Miller has previously worked for Alabama Senator Jeff Sessions on immigration issues. 107 | 108 | Her work is often described as whimsical and dreamlike. 109 | - Her work is often described as whimsical. 110 | - Her work is often described as dreamlike. 111 | 112 | He graduated from the United States Military Academy in 1952, and then went on to serve in the United States Air Force. 113 | - He graduated from the United States Military Academy. 114 | - He graduated from the United States Military Academy in 1952. 115 | - He went on to serve in the United States Air Force. 116 | - He went on to serve in the United States Air Force after he graduated from the United States Military Academy. 117 | 118 | He is best known for his roles in the films Memories of Murder (2003), The Host (2006), (...) and Parasite (2019). 119 | - One of his best known roles is in Memories of Murder. 120 | - Memories of Murder is a film. 121 | - Memories of Murder was released in 2003. 122 | - One of his best known roles is in The Host. 123 | - The Host is a film. 124 | - The Host was released in 2006. 125 | - One of his best known roles is in Parasite. 126 | - Parasite is a film. 127 | - Parasite was released in 2019. 128 | 129 | Song Kang-ho was born in Gongju, South Korea in 1967. 130 | - Song Kang-ho was born in Gongju. 131 | - Song Kang-ho was born in South Korea. 132 | - Song Kang-ho was born in 1967. 133 | 134 | He studied theater at Chung-Ang University in Seoul. 135 | - He studied theater. 136 | - He studied at Chung-Ang University. 137 | - He studied at Chung-Ang University in Seoul. 138 | 139 | His breakthrough came with the leading role in the acclaimed crime-drama film Memories of Murder in 2003. 140 | - His breakthrough came with Memories of Murder. 141 | - He was the leading role in Memories of Murder. 142 | - Memories of Murder was released in 2003. 143 | - Memories of Murder is a film. 144 | - Memories of Murder is an acclaimed crime-drama film. 145 | 146 | This was followed by the monster movie The Host in 2006, which became the highest-grossing film in Korean history at the time. 147 | - This was followed by The Host. 148 | - The Host is the movie. 149 | - The Host is a monster movie. 150 | - The Host was released in 2006. 151 | - The Host became the highest-grossing film in Korean history at the time. 152 | - The Host is not the highest-grossing film in Korean history anymore. 
153 | -------------------------------------------------------------------------------- /src/factscore_package/.cache/demons_complex.json: -------------------------------------------------------------------------------- 1 | {"He is also a successful producer and engineer, having worked with a wide variety of artists, including Willie Nelson, Tim McGraw, and Taylor Swift.": ["He is successful.", "He is a producer.", "He is a engineer.", "He has worked with a wide variety of artists.", "Willie Nelson is an artist.", "He has worked with Willie Nelson.", "Tim McGraw is an artist.", "He has worked with Tim McGraw.", "Taylor Swift is an artist.", "He has worked with Taylor Swift."], "Michael Collins (born October 31, 1930) is a retired American astronaut and test pilot who was the Command Module Pilot for the Apollo 11 mission in 1969.": ["Michael Collins was born on October 31, 1930.", "Michael Collins is retired.", "Michael Collins is an American.", "Michael Collins was an astronaut.", "Michael Collins was a test pilot.", "Michael Collins was the Command Module Pilot.", "Michael Collins was the Command Module Pilot for the Apollo 11 mission.", "Michael Collins was the Command Module Pilot for the Apollo 11 mission in 1969."], "He was an American composer, conductor, and musical director.": ["He was an American.", "He was a composer.", "He was a conductor.", "He was a musical director."], "In 1970, the Empire State Building in New York City was the tallest building in the United States and the world, standing at 1,250 feet tall.": ["The Empire State Building is in New York City.", "In 1970, the Empire State Building was the tallest building in the United States.", "In 1970, the Empire State Building was the tallest building in the world.", "The Empire State Building stands at 1,250 feet tall."], "The Willis Tower (formerly the Sears Tower) in Chicago was the first to do so, reaching 1,450 feet in 1973. ": ["The Willis Tower is formerly called the Sears Tower.", "The Willis Tower is in Chicago.", "The Willis Tower reached 1,450 feet in 1973."], "The current tallest building in the United States is One World Trade Center in New York City, which stands at 1,776 feet. ": ["The current tallest building in the United States is One World Trade Center.", "One World Trade Center is in New York City.", "One World Trade Center stands at 1,776 feet."], "William E. Moerner is an American physical chemist who was affiliated with the University of Sussex as a visiting professor. ": ["William E. Moerner is an American.", "William E. Moerner is an physical chemist.", "William E. Moerner was affiliated with the University of Sussex.", "William E. Moerner was affiliated with the University of Sussex as a visiting professor."], "Sir Harold Walter Kroto, an English chemist, shared the 1996 Nobel Prize in Chemistry with Robert Curl and Richard Smalley for their discovery of a new form of carbon, buckminsterfullerene, also known as buckyballs. 
": ["Sir Harold Walter Kroto is English.", "Sir Harold Walter Kroto is an chemist.", "Sir Harold Walter Kroto won the Nobel Prize in 1996.", "Sir Harold Walter Kroto won the Nobel Prize in Chemistry.", "Sir Harold Walter Kroto shared the Nobel Prize with Robert Curl and Richard Smalley.", "They won the prize for their discovery of a new form of carbon, buckminsterfullerene, also known as buckyballs."]} 2 | -------------------------------------------------------------------------------- /src/factscore_package/.cache/demons_full.txt: -------------------------------------------------------------------------------- 1 | During his professional career, McCoy played for the Broncos, the San Diego Chargers, the Minnesota Vikings, and the Jacksonville Jaguars. 2 | - McCoy played for the Broncos. 3 | - McCoy played for the Broncos during his professional career. 4 | - McCoy played for the San Diego Chargers. 5 | - McCoy played for the San Diego Chargers during his professional career. 6 | - McCoy played for the Minnesota Vikings. 7 | - McCoy played for the Minnesota Vikings during his professional career. 8 | - McCoy played for the Jacksonville Jaguars. 9 | - McCoy played for the Jacksonville Jaguars during his professional career. 10 | 11 | In addition to his acting roles, Bateman has written and directed two short films and is currently in development on his feature debut. 12 | - Bateman has acting roles. 13 | - Bateman has written two short films. 14 | - Bateman has directed two short films. 15 | - Bateman has written and directed two short films. 16 | - Bateman is currently in development on his feature debut. 17 | 18 | He is also a successful producer and engineer, having worked with a wide variety of artists, including Willie Nelson, Tim McGraw, and Taylor Swift. 19 | - He is a successful producer. 20 | - He is a successful engineer. 21 | - He has worked with a wide variety of artists. 22 | - Willie Nelson is an artist. 23 | - He has worked with Willie Nelson. 24 | - Tim McGraw is an artist. 25 | - He has worked with Tim McGraw. 26 | - Taylor Swift is an artist. 27 | - He has worked with Taylor Swift. 28 | 29 | He is the founder and Chairman of The Schneider Group, a multi-discipline consulting firm that provides strategic advice to clients in the areas of corporate strategy, operations, finance, and human resources. 30 | - He is the founder of The Schneider Group. 31 | - He is the Chairman of The Schneider Group. 32 | - The Schneider Group is a multi-discipline consulting firm. 33 | - The Schneider Group provides strategic advice to clients in the area of corporate strategy. 34 | - The Schneider Group provides strategic advice to clients. 35 | - The Schneider Group provides strategic advice to clients in the area of operations. 36 | - The Schneider Group provides strategic advice to clients in the area of finance. 37 | - The Schneider Group provides strategic advice to clients in the area of human resources. 38 | 39 | He began his career in Nashville in the late 1950s and has since released numerous albums, including a greatest hits collection in 1999. 40 | - He began his career in Nashville. 41 | - He began his career in the late 1950s. 42 | - He began his career in Nashville in the late 1950s. 43 | - Since he began his career, he has released numerous albums. 44 | - Since he began his career, he has released a greatest hits collection. 45 | - Since he began his career, he has released a greatest hits collection in 1999. 
46 | 47 | He has been producing music since the early 2000s, and has been featured on various albums and compilations, including the Grammy Award-winning album The Emancipation of Mimi, by Mariah Carey. 48 | - He has been producing music. 49 | - He has been producing music since the early 2000s. 50 | - He has been featured on various albums. 51 | - He has been featured on compilations. 52 | - The Emancipation of Mimi is by Mariah Carey. 53 | - The Emancipation of Mimi is the Grammy Award-winning album. 54 | - He has been featured on The Emancipation of Mimi. 55 | 56 | He played college football for the University of Oregon, where he was an All-Pac-12 selection and was named to the All-America team in 2016. 57 | - He played college football. 58 | - He played college football for the University of Oregon. 59 | - He was an All-Pac-12 selection. 60 | - He was an All-Pac-12 selection at the University of Oregon. 61 | - He was named to the All-America team. 62 | - He was named to the All-America team in 2016. 63 | - He was named to the All-America team in 2016 at the University of Oregon. 64 | 65 | He is also the author of the book “The Entrepreneur’s Journey: From Idea to IPO”, which chronicles his experience as an investor and entrepreneur. 66 | - “The Entrepreneur’s Journey: From Idea to IPO” is a book. 67 | - He is the author of the book “The Entrepreneur’s Journey: From Idea to IPO”. 68 | - “The Entrepreneur’s Journey: From Idea to IPO” chronicles his experience. 69 | - “The Entrepreneur’s Journey: From Idea to IPO” chronicles his experience as an investor. 70 | - “The Entrepreneur’s Journey: From Idea to IPO” chronicles his experience as an entrepreneur. 71 | 72 | He is an internationally renowned scientist who has made major contributions to the field of membrane trafficking, a fundamental process in cells that is essential for normal physiology and disease. 73 | - He is an internationally renowned scientist. 74 | - He has made major contributions to the field of membrane trafficking. 75 | - Membrane trafficking is a fundamental process in cells. 76 | - Membrane trafficking is essential for normal physiology. 77 | - Membrane trafficking is essential for normal disease. 78 | 79 | Patrick has also been a keynote speaker at numerous industry events and conferences and is a highly sought-after mentor for aspiring entrepreneurs. 80 | - Patrick has been keynote speaker. 81 | - Patrick has also been a keynote speaker at numerous industry events. 82 | - Patrick has also been a keynote speaker at numerous industry conferences. 83 | - Patrick is a highly sought-after mentor. 84 | - Patrick is a highly sought-after mentor for aspiring entrepreneurs. 85 | 86 | He began practicing law in Romney, West Virginia and was elected to the Virginia House of Delegates in 1823, where he served until 1827. 87 | - He began practicing law in Romney, West Virginia. 88 | - He was elected to the Virginia House of Delegates. 89 | - He was elected to the Virginia House of Delegates in 1823. 90 | - He served in the Virginia House of Delegates. 91 | - He served in the Virginia House of Delegates until 1827. 92 | 93 | He has been performing since the age of 8, when he joined a band in his hometown of Guadalajara and has since gone on to record six studio albums and several singles of his own original material. 94 | - He has been performing. 95 | - He has been performing since the age of 8. 96 | - He joined a band. 97 | - His hometown is Guadalajara. 98 | - He joined a band in his hometown. 
99 | - He joined a band in his hometown of Guadalajara at the age of 8. 100 | - He has gone on to record six studio albums. 101 | - He has gone on to record six studio albums at the age of 8. 102 | - He has gone on to record several singles of his own original material. 103 | - He has gone on to record several singles of his own original material at the age of 8. 104 | 105 | He started his professional career in Ghana, playing for Berekum Arsenal, Heart of Lions and Bechem United before moving to the United States in 2012. 106 | - He started his professional career in Ghana. 107 | - He played for Berekum Arsenal. 108 | - He played for Berekum Arsenal before moving to the United States. 109 | - He played for Berekum Arsenal before moving to the United States in 2021. 110 | - He played for Heart of Lions. 111 | - He played for Heart of Lions before moving to the United States. 112 | - He played for Heart of Lions before moving to the United States in 2021. 113 | - He played for Bechem United. 114 | - He played for Bechem United before moving to the United States. 115 | - He played for Bechem United before moving to the United States in 2021. 116 | 117 | His style is heavily influenced by modern classical, electronic, and ambient music, while also drawing from jazz, hip-hop, and world music. 118 | - His style is heavily influenced. 119 | - His style is heavily influenced by modern classical music. 120 | - His style is heavily influenced by modern electronic music. 121 | - His style is heavily influenced by modern ambient music. 122 | - His style draws from jazz. 123 | - His style draws from hip-hop. 124 | - His style draws from world music. 125 | 126 | He rose to prominence in 1600, when he was appointed to lead the Western Army in the Battle of Sekigahara, where he defeated the Eastern Army of Tokugawa Ieyasu, who would later become the first shogun of the Edo period. 127 | - He rose to prominence. 128 | - He rose to prominence in 1600. 129 | - He was appointed to lead the Western Army. 130 | - He was appointed to lead the Western Army in the Battle of Sekigahara. 131 | - He was appointed to lead the Western Army in the Battle of Sekigahara in 1600. 132 | - He defeated the Eastern Army of Tokugawa Ieyasu. 133 | - He defeated the Eastern Army of Tokugawa Ieyasu in the Battle of Sekigahara. 134 | - Tokugawa Ieyasu became the first shogun of the Edo period. 135 | - Tokugawa Ieyasu became the first shogun of the Edo period after 1600. 136 | 137 | He began his career at Manchester United, for whom he made his professional debut in 2011, and made 11 appearances over four seasons. 138 | - He began his career at Manchester United. 139 | - He made his professional debut in 2011. 140 | - He made his professional debut in 2011 at Manchester United. 141 | - He made 11 appearances. 142 | - He made 11 appearances over four seasons. 143 | - He made 11 appearances over four seasons at Manchester United. 
144 | -------------------------------------------------------------------------------- /src/factscore_package/.cache/demos/demons.json: -------------------------------------------------------------------------------- 1 | {"He made his acting debut in the film The Moon is the Sun's Dream (1992), and continued to appear in small and supporting roles throughout the 1990s.": ["He made his acting debut in the film.", "He made his acting debut in The Moon is the Sun's Dream.", "The Moon is the Sun's Dream is a film.", "The Moon is the Sun's Dream was released in 1992.", "After his acting debut, he appeared in small and supporting roles.", "After his acting debut, he appeared in small and supporting roles throughout the 1990s."], "He is also a successful producer and engineer, having worked with a wide variety of artists, including Willie Nelson, Tim McGraw, and Taylor Swift.": ["He is successful.", "He is a producer.", "He is a engineer.", "He has worked with a wide variety of artists.", "Willie Nelson is an artist.", "He has worked with Willie Nelson.", "Tim McGraw is an artist.", "He has worked with Tim McGraw.", "Taylor Swift is an artist.", "He has worked with Taylor Swift."], "In 1963, Collins became one of the third group of astronauts selected by NASA and he served as the back-up Command Module Pilot for the Gemini 7 mission.": ["Collins became an astronaut.", "Collins became one of the third group of astronauts.", "Collins became one of the third group of astronauts selected.", "Collins became one of the third group of astronauts selected by NASA.", "Collins became one of the third group of astronauts selected by NASA in 1963.", "He served as the Command Module Pilot.", "He served as the back-up Command Module Pilot.", "He served as the Command Module Pilot for the Gemini 7 mission."], "In addition to his acting roles, Bateman has written and directed two short films and is currently in development on his feature debut.": ["Bateman has acting roles.", "Bateman has written two short films.", "Bateman has directed two short films.", "Bateman has written and directed two short films.", "Bateman is currently in development on his feature debut."], "Michael Collins (born October 31, 1930) is a retired American astronaut and test pilot who was the Command Module Pilot for the Apollo 11 mission in 1969.": ["Michael Collins was born on October 31, 1930.", "Michael Collins is retired.", "Michael Collins is an American.", "Michael Collins was an astronaut.", "Michael Collins was a test pilot.", "Michael Collins was the Command Module Pilot.", "Michael Collins was the Command Module Pilot for the Apollo 11 mission.", "Michael Collins was the Command Module Pilot for the Apollo 11 mission in 1969."], "He was an American composer, conductor, and musical director.": ["He was an American.", "He was a composer.", "He was a conductor.", "He was a musical director."], "She currently stars in the romantic comedy series, Love and Destiny, which premiered in 2019.": ["She currently stars in Love and Destiny.", "Love and Destiny is a romantic comedy series.", "Love and Destiny premiered in 2019. 
"], "His music has been described as a mix of traditional Mexican and Latin American styles, as well as jazz, folk, and rock.": ["His music has been described as a mix.", "His music has been described as a mix of traditional Mexican, Latin American styles, as well as jazz, folk, and rock."], "He also serves as an ambassador for the charity Leonard Cheshire Disability.": ["He serves as an ambassador.", "He serves as an ambassador for Leonard Cheshire Disability.", "Leonard Cheshire Disability is a charity."], "He began his career in Nashville in the late 1950s and has since released numerous albums, including a greatest hits collection in 1999.": ["He began his career in Nashville.", "He began his career in the late 1950s.", "He began his career in Nashville in the late 1950s.", "Since he began his career, he has released numerous albums.", "Since he began his career, he has released a greatest hits collection.", "Since he began his career, he has released a greatest hits collection in 1999."], "He has been performing since the age of 8, when he joined a band in his hometown of Guadalajara and has since gone on to record six studio albums and several singles of his own original material.": ["He has been performing since the age of 8.", "He joined a band.", "His hometown is Guadalajara.", "He joined a band in his hometown.", "He joined a band in his hometown of Guadalajara at the age of 8.", "He has gone on to record six studio albums.", "He has gone on to record six studio albums at the age of 8.", "He has gone on to record several singles of his own original material.", "He has gone on to record several singles of his own original material at the age of 8."], "She is also the former President of the Malaysian Chinese Association (MCA) from 2010 to 2013.": ["She is the former President.", "She is also the former President of the Malaysian Chinese Association (MCA)", "She is also the former President of the Malaysian Chinese Association (MCA) from 2010 to 2013."], "During his professional career, McCoy played for the Broncos, the San Diego Chargers, the Minnesota Vikings, and the Jacksonville Jaguars.": ["McCoy played for the Broncos.", "McCoy played for the Broncos during his professional career.", "McCoy played for the San Diego Chargers.", "McCoy played for the San Diego Chargers during his professional career.", "McCoy played for the Minnesota Vikings.", "McCoy played for the Minnesota Vikings during his professional career.", "McCoy played for the Jacksonville Jaguars.", "McCoy played for the Jacksonville Jaguars during his professional career."], "Miller has been described as the architect of Trump's controversial immigration policies, and has previously worked for Alabama Senator Jeff Sessions on immigration issues.": ["Miller has been described as the architect.", "Miller has been described as the architect of Trump's controversial immigration policies.", "Miller has previously worked for Alabama Senator Jeff Sessions.", "Miller has previously worked for Alabama Senator Jeff Sessions on immigration issues."], "Her work is often described as whimsical and dreamlike.": ["Her work is often described as whimsical.", "Her work is often described as dreamlike."], "He graduated from the United States Military Academy in 1952, and then went on to serve in the United States Air Force.": ["He graduated from the United States Military Academy.", "He graduated from the United States Military Academy in 1952.", "He went on to serve in the United States Air Force.", "He went on to serve in the 
United States Air Force after he graduated from the United States Military Academy."], "He is best known for his roles in the films Memories of Murder (2003), The Host (2006), (...) and Parasite (2019).": ["One of his best known roles is in Memories of Murder.", "Memories of Murder is a film.", "Memories of Murder was released in 2003.", "One of his best known roles is in The Host.", "The Host is a film.", "The Host was released in 2006.", "One of his best known roles is in Parasite.", "Parasite is a film.", "Parasite was released in 2019."], "Song Kang-ho was born in Gongju, South Korea in 1967.": ["Song Kang-ho was born in Gongju.", "Song Kang-ho was born in South Korea.", "Song Kang-ho was born in 1967."], "He studied theater at Chung-Ang University in Seoul.": ["He studied theater.", "He studied at Chung-Ang University.", "He studied at Chung-Ang University in Seoul."], "His breakthrough came with the leading role in the acclaimed crime-drama film Memories of Murder in 2003.": ["His breakthrough came with Memories of Murder.", "He was the leading role in Memories of Murder.", "Memories of Murder was released in 2003.", "Memories of Murder is a film.", "Memories of Murder is an acclaimed crime-drama film."], "This was followed by the monster movie The Host in 2006, which became the highest-grossing film in Korean history at the time.": ["This was followed by The Host.", "The Host is the movie.", "The Host is a monster movie.", "The Host was released in 2006.", "The Host became the highest-grossing film in Korean history at the time.", "The Host is not the highest-grossing film in Korean history anymore."]} 2 | -------------------------------------------------------------------------------- /src/factscore_package/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-FinAI/PIXIU/da45ac467ca3a828621315881e33c68bca3bbb1f/src/factscore_package/__init__.py -------------------------------------------------------------------------------- /src/factscore_package/abstain_detection.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import re 3 | 4 | invalid_ppl_mentions = [ 5 | "I could not find any information", 6 | "The search results do not provide", 7 | "There is no information", 8 | "There are no search results", 9 | "there are no provided search results", 10 | "not provided in the search results", 11 | "is not mentioned in the provided search results", 12 | "There seems to be a mistake in the question", 13 | "Not sources found", 14 | "No sources found", 15 | "Try a more general question" 16 | ] 17 | 18 | def remove_citation(text): 19 | # text = re.sub(r'\[\d+\]', '', text) 20 | text = re.sub(r"\s*\[\d+\]\s*","", text) 21 | if text.startswith("According to , "): 22 | text = text.replace("According to , ", "According to the search results, ") 23 | return text 24 | 25 | def is_invalid_ppl(text): 26 | return np.any([text.lower().startswith(mention.lower()) for mention in invalid_ppl_mentions]) 27 | 28 | def is_invalid_paragraph_ppl(text): 29 | return len(text.strip())==0 or np.any([mention.lower() in text.lower() for mention in invalid_ppl_mentions]) 30 | 31 | def perplexity_ai_abstain_detect(generation): 32 | output = remove_citation(generation) 33 | if is_invalid_ppl(output): 34 | return True 35 | valid_paras = [] 36 | for para in output.split("\n\n"): 37 | if is_invalid_paragraph_ppl(para): 38 | break 39 | valid_paras.append(para.strip()) 40 | 41 | if 
len(valid_paras) == 0: 42 | return True 43 | else: 44 | return False 45 | 46 | def generic_abstain_detect(generation): 47 | return generation.startswith("I'm sorry") or "provide more" in generation 48 | 49 | def is_response_abstained(generation, fn_type): 50 | if fn_type == "perplexity_ai": 51 | return perplexity_ai_abstain_detect(generation) 52 | 53 | elif fn_type == "generic": 54 | return generic_abstain_detect(generation) 55 | 56 | else: 57 | return False 58 | -------------------------------------------------------------------------------- /src/factscore_package/atomic_facts.py: -------------------------------------------------------------------------------- 1 | import json 2 | import numpy as np 3 | import re 4 | import functools 5 | import string 6 | import spacy 7 | import sys 8 | import nltk 9 | import openai 10 | from rank_bm25 import BM25Okapi 11 | import os 12 | import time 13 | from nltk.tokenize import sent_tokenize 14 | 15 | from .openai_lm import OpenAIModel 16 | 17 | nltk.download("punkt") 18 | 19 | 20 | class AtomicFactGenerator(object): 21 | def __init__(self, key_path, demon_dir, gpt3_cache_file=None): 22 | self.nlp = spacy.load("en_core_web_sm") 23 | self.is_bio = True 24 | self.demon_path = os.path.join(demon_dir, "demons.json" if self.is_bio else "demons_complex.json") 25 | 26 | self.openai_lm = OpenAIModel("InstructGPT", cache_file=gpt3_cache_file, key=key_path) 27 | 28 | # get the demos from the path derived from demon_dir, not a hard-coded absolute path 29 | with open(self.demon_path, 'r') as f: 30 | self.demons = json.load(f) 31 | 32 | tokenized_corpus = [doc.split(" ") for doc in self.demons.keys()] 33 | self.bm25 = BM25Okapi(tokenized_corpus) 34 | 35 | def save_cache(self): 36 | self.openai_lm.save_cache() 37 | 38 | def run(self, generation, cost_estimate=None): 39 | """Convert the generation into a set of atomic facts. 
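When cost_estimate is None, returns (atomic_facts_pairs, para_breaks): one (sentence, facts) pair per sentence plus the indices of paragraph breaks.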
Return a total words cost if cost_estimate != None.""" 40 | assert isinstance(generation, str), "generation must be a string" 41 | paragraphs = [para.strip() for para in generation.split("\n") if len(para.strip()) > 0] 42 | return self.get_atomic_facts_from_paragraph(paragraphs, cost_estimate=cost_estimate) 43 | 44 | def get_atomic_facts_from_paragraph(self, paragraphs, cost_estimate=None): 45 | sentences = [] 46 | para_breaks = [] 47 | for para_idx, paragraph in enumerate(paragraphs): 48 | if para_idx > 0: 49 | para_breaks.append(len(sentences)) 50 | 51 | initials = detect_initials(paragraph) 52 | 53 | curr_sentences = sent_tokenize(paragraph) 54 | curr_sentences_2 = sent_tokenize(paragraph) 55 | 56 | curr_sentences = fix_sentence_splitter(curr_sentences, initials) 57 | curr_sentences_2 = fix_sentence_splitter(curr_sentences_2, initials) 58 | 59 | # checking this, just to ensure the credibility of the sentence splitter fixing algorithm 60 | assert curr_sentences == curr_sentences_2, (paragraph, curr_sentences, curr_sentences_2) 61 | 62 | sentences += curr_sentences 63 | 64 | atoms_or_estimate = self.get_init_atomic_facts_from_sentence([sent for i, sent in enumerate(sentences) if not (not self.is_bio and ( \ 65 | (i==0 and (sent.startswith("Sure") or sent.startswith("Here are"))) or \ 66 | (i==len(sentences)-1 and (sent.startswith("Please") or sent.startswith("I hope") or sent.startswith("Here are")))))], cost_estimate=cost_estimate) 67 | 68 | if cost_estimate: 69 | return atoms_or_estimate 70 | else: 71 | atoms = atoms_or_estimate 72 | 73 | atomic_facts_pairs = [] 74 | for i, sent in enumerate(sentences): 75 | if not self.is_bio and ( \ 76 | (i==0 and (sent.startswith("Sure") or sent.startswith("Here are"))) or \ 77 | (i==len(sentences)-1 and (sent.startswith("Please") or sent.startswith("I hope") or sent.startswith("Here are")))): 78 | atomic_facts_pairs.append((sent, [])) 79 | elif self.is_bio and sent.startswith("This sentence does not contain any facts"): 80 | atomic_facts_pairs.append((sent, [])) 81 | elif sent.startswith("Sure") or sent.startswith("Please") or (i==0 and sent.startswith("Here are")): 82 | atomic_facts_pairs.append((sent, [])) 83 | else: 84 | atomic_facts_pairs.append((sent, atoms[sent])) 85 | 86 | # postprocess_atomic_facts will fix minor issues from InstructGPT 87 | # it is supposed to handle the sentence splitter issue too, but since here 88 | # we fixed the sentence splitter issue already, 89 | # the new para_breaks should be identical to the original para_breaks 90 | if self.is_bio: 91 | atomic_facts_pairs, para_breaks = postprocess_atomic_facts(atomic_facts_pairs, list(para_breaks), self.nlp) 92 | 93 | return atomic_facts_pairs, para_breaks 94 | 95 | 96 | def get_init_atomic_facts_from_sentence(self, sentences, cost_estimate=None): 97 | """Get the initial atomic facts from the sentences. 
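For each sentence, the prompt is built from n fixed demonstrations plus the top-k BM25 matches from the demonstration corpus.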
Return a total words cost if cost_estimate != None.""" 98 | 99 | is_bio = self.is_bio 100 | demons = self.demons 101 | 102 | k = 1 if is_bio else 0 103 | n = 7 if is_bio else 8 104 | 105 | prompts = [] 106 | prompt_to_sent = {} 107 | atoms = {} 108 | for sentence in sentences: 109 | if sentence in atoms: 110 | continue 111 | top_matches = best_demos(sentence, self.bm25, list(demons.keys()), k) 112 | prompt = "" 113 | 114 | for i in range(n): 115 | prompt = prompt + "Please breakdown the following sentence into independent facts: {}\n".format(list(demons.keys())[i]) 116 | for fact in demons[list(demons.keys())[i]]: 117 | prompt = prompt + "- {}\n".format(fact) 118 | prompt = prompt + "\n" 119 | 120 | for match in top_matches: 121 | prompt = prompt + "Please breakdown the following sentence into independent facts: {}\n".format(match) 122 | for fact in demons[match]: 123 | prompt = prompt + "- {}\n".format(fact) 124 | prompt = prompt + "\n" 125 | prompt = prompt + "Please breakdown the following sentence into independent facts: {}\n".format(sentence) 126 | prompts.append(prompt) 127 | prompt_to_sent[prompt] = sentence 128 | 129 | if cost_estimate: 130 | total_words_estimate = 0 131 | for prompt in prompts: 132 | if cost_estimate == "consider_cache" and (prompt.strip() + "_0") in self.openai_lm.cache_dict: 133 | continue 134 | total_words_estimate += len(prompt.split()) 135 | return total_words_estimate 136 | else: 137 | for prompt in prompts: 138 | outputs = self.openai_lm.generate(prompt) 139 | #print(outputs) 140 | output, _ = outputs 141 | atoms[prompt_to_sent[prompt]] = text_to_sentences(output) 142 | 143 | for key, value in demons.items(): 144 | if key not in atoms: 145 | atoms[key] = value 146 | 147 | return atoms 148 | 149 | 150 | def best_demos(query, bm25, demons_sents, k): 151 | tokenized_query = query.split(" ") 152 | top_matches = bm25.get_top_n(tokenized_query, demons_sents, k) 153 | return top_matches 154 | 155 | 156 | # transform InstructGPT output into sentences 157 | def text_to_sentences(text): 158 | sentences = text.split("- ")[1:] 159 | sentences = [sent.strip() for sent in sentences] 160 | if len(sentences) > 0: 161 | if sentences[-1][-1] != '.': 162 | sentences[-1] = sentences[-1] + '.' 
163 | else: 164 | sentences = [] 165 | return sentences 166 | 167 | 168 | def normalize_answer(s): 169 | """Lower text and remove punctuation, articles and extra whitespace.""" 170 | def remove_articles(text): 171 | regex = re.compile(r'\b(a|an|the)\b', re.UNICODE) 172 | return re.sub(regex, ' ', text) 173 | def white_space_fix(text): 174 | return ' '.join(text.split()) 175 | def remove_punc(text): 176 | exclude = set(string.punctuation) 177 | return ''.join(ch for ch in text if ch not in exclude) 178 | def lower(text): 179 | return text.lower() 180 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 181 | 182 | MONTHS = ["January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December"] 183 | MONTHS = [m.lower() for m in MONTHS] 184 | 185 | def is_num(text): 186 | try: 187 | text = int(text) 188 | return True 189 | except Exception: 190 | return False 191 | 192 | def is_date(text): 193 | text = normalize_answer(text) 194 | for token in text.split(" "): 195 | if (not is_num(token)) and token not in MONTHS: 196 | return False 197 | return True 198 | 199 | def extract_numeric_values(text): 200 | pattern = r'\b\d+\b' # regular expression pattern for integers 201 | numeric_values = re.findall(pattern, text) # find all numeric values in the text 202 | return set(numeric_values) # deduplicate and return the values as a set of strings 203 | 204 | 205 | def detect_entities(text, nlp): 206 | doc = nlp(text) 207 | entities = set() 208 | 209 | def _add_to_entities(text): 210 | if "-" in text: 211 | for _text in text.split("-"): 212 | entities.add(_text.strip()) 213 | else: 214 | entities.add(text) 215 | 216 | 217 | for ent in doc.ents: 218 | # spacy often has errors with other types of entities 219 | if ent.label_ in ["DATE", "TIME", "PERCENT", "MONEY", "QUANTITY", "ORDINAL", "CARDINAL"]: 220 | 221 | if is_date(ent.text): 222 | _add_to_entities(ent.text) 223 | else: 224 | for token in ent.text.split(): 225 | if is_date(token): 226 | _add_to_entities(token) 227 | 228 | for new_ent in extract_numeric_values(text): 229 | if not np.any([new_ent in ent for ent in entities]): 230 | entities.add(new_ent) 231 | 232 | return entities 233 | 234 | def postprocess_atomic_facts(_atomic_facts, para_breaks, nlp): 235 | 236 | verbs = ["born.", " appointed.", " characterized.", " described.", " known.", " member.", " advocate.", "served.", "elected."] 237 | permitted_verbs = ["founding member."] 238 | 239 | atomic_facts = [] 240 | new_atomic_facts = [] 241 | new_para_breaks = [] 242 | 243 | for i, (sent, facts) in enumerate(_atomic_facts): 244 | sent = sent.strip() 245 | if len(sent.split())==1 and i not in para_breaks and i > 0: 246 | assert i not in para_breaks 247 | atomic_facts[-1][0] += " " + sent 248 | atomic_facts[-1][1] += facts 249 | else: 250 | if i in para_breaks: 251 | new_para_breaks.append(len(atomic_facts)) 252 | atomic_facts.append([sent, facts]) 253 | 254 | for i, (sent, facts) in enumerate(atomic_facts): 255 | entities = detect_entities(sent, nlp) 256 | covered_entities = set() 257 | # print (entities) 258 | new_facts = [] 259 | for i, fact in enumerate(facts): 260 | if any([fact.endswith(verb) for verb in verbs]) and not any([fact.endswith(verb) for verb in permitted_verbs]): 261 | if any([fact[:-1] in other_fact for j, other_fact in enumerate(facts) if j != i]): 262 | continue 263 | sent_entities = detect_entities(fact, nlp) 264 | covered_entities |= set([e for e in sent_entities if e in entities]) 265 | 
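# Entities that appear in a fact but not in the source sentence are mapped back
# to the sentence entity they prefix; if no such entity exists, the fact is dropped.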
new_entities = sent_entities - entities 266 | if len(new_entities) > 0: 267 | do_pass = False 268 | for new_ent in new_entities: 269 | pre_ent = None 270 | for ent in entities: 271 | if ent.startswith(new_ent): 272 | pre_ent = ent 273 | break 274 | if pre_ent is None: 275 | do_pass = True 276 | break 277 | fact = fact.replace(new_ent, pre_ent) 278 | covered_entities.add(pre_ent) 279 | if do_pass: 280 | continue 281 | if fact in new_facts: 282 | continue 283 | new_facts.append(fact) 284 | try: 285 | assert entities==covered_entities 286 | except Exception: 287 | new_facts = facts # there is a bug in spacy entity linker, so just go with the previous facts 288 | 289 | new_atomic_facts.append((sent, new_facts)) 290 | 291 | return new_atomic_facts, new_para_breaks 292 | 293 | def is_integer(s): 294 | try: 295 | s = int(s) 296 | return True 297 | except Exception: 298 | return False 299 | 300 | def detect_initials(text): 301 | pattern = r"[A-Z]\. ?[A-Z]\." 302 | match = re.findall(pattern, text) 303 | return [m for m in match] 304 | 305 | def fix_sentence_splitter(curr_sentences, initials): 306 | for initial in initials: 307 | if not np.any([initial in sent for sent in curr_sentences]): 308 | alpha1, alpha2 = [t.strip() for t in initial.split(".") if len(t.strip())>0] 309 | for i, (sent1, sent2) in enumerate(zip(curr_sentences, curr_sentences[1:])): 310 | if sent1.endswith(alpha1 + ".") and sent2.startswith(alpha2 + "."): 311 | # merge sentence i and i+1 312 | curr_sentences = curr_sentences[:i] + [curr_sentences[i] + " " + curr_sentences[i+1]] + curr_sentences[i+2:] 313 | break 314 | sentences = [] 315 | combine_with_previous = None 316 | for sent_idx, sent in enumerate(curr_sentences): 317 | if len(sent.split())<=1 and sent_idx==0: 318 | assert not combine_with_previous 319 | combine_with_previous = True 320 | sentences.append(sent) 321 | elif len(sent.split())<=1: 322 | assert sent_idx > 0 323 | sentences[-1] += " " + sent 324 | combine_with_previous = False 325 | elif sent[0].isalpha() and not sent[0].isupper() and sent_idx > 0: 326 | assert sent_idx > 0, curr_sentences 327 | sentences[-1] += " " + sent 328 | combine_with_previous = False 329 | elif combine_with_previous: 330 | assert sent_idx > 0 331 | sentences[-1] += " " + sent 332 | combine_with_previous = False 333 | else: 334 | assert not combine_with_previous 335 | sentences.append(sent) 336 | return sentences 337 | 338 | 339 | def main(): 340 | pass 341 | #generator = AtomicFactGenerator("api.key", "demos", gpt3_cache_dir=None) 342 | #atomic_facts, para_breaks = generator.run("Thierry Henry (born 17 August 1977) is a French professional football coach, pundit, and former player. He is considered one of the greatest strikers of all time, and one the greatest players of the Premier League history. He has been named Arsenal F.C's greatest ever player.\n\nHenry made his professional debut with Monaco in 1994 before signing for defending Serie A champions Juventus. However, limited playing time, coupled with disagreements with the club's hierarchy, led to him signing for Premier League club Arsenal for £11 million in 1999.") 343 | 344 | #print(atomic_facts) 345 | #print(para_breaks) 346 | 347 | if __name__ == "__main__": 348 | main() 349 | -------------------------------------------------------------------------------- /src/factscore_package/clm.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 
3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import os 8 | import math 9 | import time 10 | import json 11 | import numpy as np 12 | import torch 13 | from tqdm import tqdm 14 | from collections import defaultdict 15 | from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig, AutoConfig 16 | from torch import cuda, bfloat16 17 | import json 18 | from huggingface_hub import login 19 | from transformers import AutoModelForCausalLM 20 | from transformers import LlamaTokenizer 21 | 22 | from .utils import convert_model_to_int8_on_gpu 23 | from .lm import LM 24 | 25 | class CLM(LM): 26 | def __init__(self, model_name, model_dir, cache_file=None): 27 | self.model_name = model_name 28 | self.model_dir = model_dir 29 | if cache_file: 30 | super().__init__(cache_file) 31 | 32 | def load_model(self): 33 | model_id = 'meta-llama/Llama-2-7b-chat-hf' 34 | 35 | bnb_config = BitsAndBytesConfig( 36 | load_in_4bit=True, 37 | bnb_4bit_quant_type='nf4', 38 | bnb_4bit_use_double_quant=True, 39 | bnb_4bit_compute_dtype=bfloat16 40 | ) 41 | 42 | # begin initializing HF items; read the access token from the environment rather than hard-coding it 43 | hf_auth = os.environ.get("HF_TOKEN") 44 | model_config = AutoConfig.from_pretrained( 45 | model_id, 46 | use_auth_token=hf_auth 47 | ) 48 | 49 | model = AutoModelForCausalLM.from_pretrained( 50 | model_id, 51 | trust_remote_code=True, 52 | config=model_config, 53 | quantization_config=bnb_config, 54 | device_map='auto', 55 | use_auth_token=hf_auth 56 | ) 57 | #model_name_or_path = "TheBloke/Llama-2-70B-chat-AWQ" 58 | #model = AutoAWQForCausalLM.from_quantized(model_name_or_path, fuse_layers=True, 59 | #trust_remote_code=False, safetensors=True) 60 | # 2. 
Tie the weights 61 | #model.tie_weights() 62 | 63 | tokenizer = AutoTokenizer.from_pretrained( 64 | model_id, 65 | use_auth_token=hf_auth 66 | ) 67 | 68 | self.model = model 69 | self.tokenizer = tokenizer 70 | #self.model = AutoModelForCausalLM.from_pretrained(self.model_dir) 71 | #self.model = convert_model_to_int8_on_gpu(self.model, device='auto') 72 | #self.tokenizer = LlamaTokenizer.from_pretrained(self.model_dir) 73 | 74 | def _generate(self, prompts, max_sequence_length=2048, max_output_length=128, 75 | end_if_newline=False, end_if_second_newline=False, verbose=False): 76 | is_single = type(prompts)==str 77 | if is_single: 78 | prompts = [prompts] 79 | 80 | input_ids = self.tokenizer(prompts).input_ids 81 | if verbose: 82 | input_ids = tqdm(input_ids) 83 | 84 | generations = [] 85 | scores = [] 86 | for curr_input_ids in input_ids: 87 | if len(curr_input_ids) > max_sequence_length - max_output_length: 88 | curr_input_ids = curr_input_ids[-(max_sequence_length - max_output_length):] 89 | curr_input_ids = torch.LongTensor([curr_input_ids]) 90 | gen_outputs = self.model.generate( 91 | curr_input_ids, 92 | max_length=curr_input_ids.shape[1]+max_output_length, 93 | return_dict_in_generate=True, 94 | output_scores=True 95 | ) 96 | gen_tokens = gen_outputs["sequences"] 97 | # saving the logits for the very first token 98 | gen_scores = gen_outputs["scores"][0][0].detach().cpu().numpy() 99 | gen = self.tokenizer.decode(gen_tokens[0, curr_input_ids.shape[-1]:]) 100 | 101 | if end_if_newline: 102 | gen = gen.split("\n")[0].strip() 103 | elif end_if_second_newline: 104 | gen = "\n".join(gen.split("\n")[:2]).strip() 105 | 106 | if verbose and len(generations)==0: 107 | print ("Input:", prompts[0]) 108 | print ("Prediction:", gen) 109 | 110 | if self.model_name.startswith("llama-sni"): 111 | gen = gen.split("</s>")[0] 112 | 113 | generations.append(gen) 114 | scores.append(gen_scores) 115 | 116 | assert len(generations)==len(prompts)==len(scores) 117 | if is_single: 118 | return generations[0], scores[0] 119 | 120 | return generations, scores 121 | 122 | -------------------------------------------------------------------------------- /src/factscore_package/demons.json: -------------------------------------------------------------------------------- 1 | {"He made his acting debut in the film The Moon is the Sun's Dream (1992), and continued to appear in small and supporting roles throughout the 1990s.": ["He made his acting debut in the film.", "He made his acting debut in The Moon is the Sun's Dream.", "The Moon is the Sun's Dream is a film.", "The Moon is the Sun's Dream was released in 1992.", "After his acting debut, he appeared in small and supporting roles.", "After his acting debut, he appeared in small and supporting roles throughout the 1990s."], "He is also a successful producer and engineer, having worked with a wide variety of artists, including Willie Nelson, Tim McGraw, and Taylor Swift.": ["He is successful.", "He is a producer.", "He is a engineer.", "He has worked with a wide variety of artists.", "Willie Nelson is an artist.", "He has worked with Willie Nelson.", "Tim McGraw is an artist.", "He has worked with Tim McGraw.", "Taylor Swift is an artist.", "He has worked with Taylor Swift."], "In 1963, Collins became one of the third group of astronauts selected by NASA and he served as the back-up Command Module Pilot for the Gemini 7 mission.": ["Collins became an astronaut.", "Collins became one of the third group of astronauts.", "Collins became one of the third group of astronauts 
selected.", "Collins became one of the third group of astronauts selected by NASA.", "Collins became one of the third group of astronauts selected by NASA in 1963.", "He served as the Command Module Pilot.", "He served as the back-up Command Module Pilot.", "He served as the Command Module Pilot for the Gemini 7 mission."], "In addition to his acting roles, Bateman has written and directed two short films and is currently in development on his feature debut.": ["Bateman has acting roles.", "Bateman has written two short films.", "Bateman has directed two short films.", "Bateman has written and directed two short films.", "Bateman is currently in development on his feature debut."], "Michael Collins (born October 31, 1930) is a retired American astronaut and test pilot who was the Command Module Pilot for the Apollo 11 mission in 1969.": ["Michael Collins was born on October 31, 1930.", "Michael Collins is retired.", "Michael Collins is an American.", "Michael Collins was an astronaut.", "Michael Collins was a test pilot.", "Michael Collins was the Command Module Pilot.", "Michael Collins was the Command Module Pilot for the Apollo 11 mission.", "Michael Collins was the Command Module Pilot for the Apollo 11 mission in 1969."], "He was an American composer, conductor, and musical director.": ["He was an American.", "He was a composer.", "He was a conductor.", "He was a musical director."], "She currently stars in the romantic comedy series, Love and Destiny, which premiered in 2019.": ["She currently stars in Love and Destiny.", "Love and Destiny is a romantic comedy series.", "Love and Destiny premiered in 2019. "], "His music has been described as a mix of traditional Mexican and Latin American styles, as well as jazz, folk, and rock.": ["His music has been described as a mix.", "His music has been described as a mix of traditional Mexican, Latin American styles, as well as jazz, folk, and rock."], "He also serves as an ambassador for the charity Leonard Cheshire Disability.": ["He serves as an ambassador.", "He serves as an ambassador for Leonard Cheshire Disability.", "Leonard Cheshire Disability is a charity."], "He began his career in Nashville in the late 1950s and has since released numerous albums, including a greatest hits collection in 1999.": ["He began his career in Nashville.", "He began his career in the late 1950s.", "He began his career in Nashville in the late 1950s.", "Since he began his career, he has released numerous albums.", "Since he began his career, he has released a greatest hits collection.", "Since he began his career, he has released a greatest hits collection in 1999."], "He has been performing since the age of 8, when he joined a band in his hometown of Guadalajara and has since gone on to record six studio albums and several singles of his own original material.": ["He has been performing since the age of 8.", "He joined a band.", "His hometown is Guadalajara.", "He joined a band in his hometown.", "He joined a band in his hometown of Guadalajara at the age of 8.", "He has gone on to record six studio albums.", "He has gone on to record six studio albums at the age of 8.", "He has gone on to record several singles of his own original material.", "He has gone on to record several singles of his own original material at the age of 8."], "She is also the former President of the Malaysian Chinese Association (MCA) from 2010 to 2013.": ["She is the former President.", "She is also the former President of the Malaysian Chinese Association (MCA)", "She is also the 
former President of the Malaysian Chinese Association (MCA) from 2010 to 2013."], "During his professional career, McCoy played for the Broncos, the San Diego Chargers, the Minnesota Vikings, and the Jacksonville Jaguars.": ["McCoy played for the Broncos.", "McCoy played for the Broncos during his professional career.", "McCoy played for the San Diego Chargers.", "McCoy played for the San Diego Chargers during his professional career.", "McCoy played for the Minnesota Vikings.", "McCoy played for the Minnesota Vikings during his professional career.", "McCoy played for the Jacksonville Jaguars.", "McCoy played for the Jacksonville Jaguars during his professional career."], "Miller has been described as the architect of Trump's controversial immigration policies, and has previously worked for Alabama Senator Jeff Sessions on immigration issues.": ["Miller has been described as the architect.", "Miller has been described as the architect of Trump's controversial immigration policies.", "Miller has previously worked for Alabama Senator Jeff Sessions.", "Miller has previously worked for Alabama Senator Jeff Sessions on immigration issues."], "Her work is often described as whimsical and dreamlike.": ["Her work is often described as whimsical.", "Her work is often described as dreamlike."], "He graduated from the United States Military Academy in 1952, and then went on to serve in the United States Air Force.": ["He graduated from the United States Military Academy.", "He graduated from the United States Military Academy in 1952.", "He went on to serve in the United States Air Force.", "He went on to serve in the United States Air Force after he graduated from the United States Military Academy."], "He is best known for his roles in the films Memories of Murder (2003), The Host (2006), (...) 
and Parasite (2019).": ["One of his best known roles is in Memories of Murder.", "Memories of Murder is a film.", "Memories of Murder was released in 2003.", "One of his best known roles is in The Host.", "The Host is a film.", "The Host was released in 2006.", "One of his best known roles is in Parasite.", "Parasite is a film.", "Parasite was released in 2019."], "Song Kang-ho was born in Gongju, South Korea in 1967.": ["Song Kang-ho was born in Gongju.", "Song Kang-ho was born in South Korea.", "Song Kang-ho was born in 1967."], "He studied theater at Chung-Ang University in Seoul.": ["He studied theater.", "He studied at Chung-Ang University.", "He studied at Chung-Ang University in Seoul."], "His breakthrough came with the leading role in the acclaimed crime-drama film Memories of Murder in 2003.": ["His breakthrough came with Memories of Murder.", "He was the leading role in Memories of Murder.", "Memories of Murder was released in 2003.", "Memories of Murder is a film.", "Memories of Murder is an acclaimed crime-drama film."], "This was followed by the monster movie The Host in 2006, which became the highest-grossing film in Korean history at the time.": ["This was followed by The Host.", "The Host is the movie.", "The Host is a monster movie.", "The Host was released in 2006.", "The Host became the highest-grossing film in Korean history at the time.", "The Host is not the highest-grossing film in Korean history anymore."]} 2 | -------------------------------------------------------------------------------- /src/factscore_package/download_data.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import subprocess 4 | import torch 5 | import tqdm 6 | import transformers 7 | 8 | 9 | def download_file(_id, dest, cache_dir): 10 | if os.path.exists(dest) or os.path.exists(os.path.join(cache_dir, dest)): 11 | print ("[Already exists] Skipping", dest) 12 | print ("If you want to download the file in another location, please specify a different path") 13 | return 14 | 15 | if os.path.exists(dest.replace(".zip", "")) or os.path.exists(os.path.join(cache_dir, dest.replace(".zip", ""))): 16 | print ("[Already exists] Skipping", dest) 17 | print ("If you want to download the file in another location, please specify a different path") 18 | return 19 | 20 | if "/" in dest: 21 | dest_dir = "/".join(dest.split("/")[:-1]) 22 | if not os.path.isdir(dest_dir): 23 | os.makedirs(dest_dir) 24 | else: 25 | dest_dir = "." 26 | 27 | if _id.startswith("https://"): 28 | command = """wget -O %s %s""" % (dest, _id) 29 | else: 30 | command = """wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=%s' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\\1\\n/p')&id=%s" -O %s && rm -rf /tmp/cookies.txt""" % (_id, _id, dest) 31 | 32 | ret_code = subprocess.run([command], shell=True) 33 | if ret_code.returncode != 0: 34 | print("Download {} ... [Failed]".format(dest)) 35 | else: 36 | print("Download {} ... [Success]".format(dest)) 37 | 38 | if dest.endswith(".zip"): 39 | command = """unzip %s -d %s && rm %s""" % (dest, dest_dir, dest) 40 | 41 | ret_code = subprocess.run([command], shell=True) 42 | if ret_code.returncode != 0: 43 | print("Unzip {} ... [Failed]".format(dest)) 44 | else: 45 | print("Unzip {} ... 
[Success]".format(dest)) 46 | 47 | 48 | 49 | def smart_tokenizer_and_embedding_resize(special_tokens_dict, tokenizer, model): 50 | """Resize tokenizer and embedding. 51 | Note: This is the unoptimized version that may make your embedding size not be divisible by 64. 52 | """ 53 | num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict) 54 | model.resize_token_embeddings(len(tokenizer)) 55 | 56 | if num_new_tokens > 0: 57 | input_embeddings = model.get_input_embeddings().weight.data 58 | output_embeddings = model.get_output_embeddings().weight.data 59 | 60 | input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True) 61 | output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True) 62 | 63 | input_embeddings[-num_new_tokens:] = input_embeddings_avg 64 | output_embeddings[-num_new_tokens:] = output_embeddings_avg 65 | 66 | 67 | def recover_instruct_llama(path_raw, output_path, device="cpu", test_recovered_model=False): 68 | """Heavily adapted from https://github.com/tatsu-lab/stanford_alpaca/blob/main/weight_diff.py.""" 69 | 70 | model_raw = transformers.AutoModelForCausalLM.from_pretrained( 71 | path_raw, 72 | device_map={"": torch.device(device)}, 73 | torch_dtype=torch.float32, 74 | low_cpu_mem_usage=True, 75 | ) 76 | model_recovered = transformers.AutoModelForCausalLM.from_pretrained( 77 | "kalpeshk2011/instruct-llama-7b-wdiff", 78 | device_map={"": torch.device(device)}, 79 | torch_dtype=torch.float32, 80 | low_cpu_mem_usage=True, 81 | ) 82 | 83 | tokenizer_raw = transformers.AutoTokenizer.from_pretrained(path_raw) 84 | if tokenizer_raw.pad_token is None: 85 | smart_tokenizer_and_embedding_resize( 86 | special_tokens_dict=dict(pad_token="[PAD]"), 87 | model=model_raw, 88 | tokenizer=tokenizer_raw, 89 | ) 90 | tokenizer_recovered = transformers.AutoTokenizer.from_pretrained("kalpeshk2011/instruct-llama-7b-wdiff") 91 | 92 | state_dict_recovered = model_recovered.state_dict() 93 | state_dict_raw = model_raw.state_dict() 94 | for key in tqdm.tqdm(state_dict_recovered): 95 | state_dict_recovered[key].add_(state_dict_raw[key]) 96 | 97 | if output_path is not None: 98 | model_recovered.save_pretrained(output_path) 99 | tokenizer_recovered.save_pretrained(output_path) 100 | 101 | if test_recovered_model: 102 | input_text = ( 103 | "Below is an instruction that describes a task. 
" 104 | "Write a response that appropriately completes the request.\r\n\r\n" 105 | "### Instruction:\r\nList three technologies that make life easier.\r\n\r\n### Response:" 106 | ) 107 | inputs = tokenizer_recovered(input_text, return_tensors="pt") 108 | out = model_recovered.generate(inputs=inputs.input_ids, max_new_tokens=100) 109 | output_text = tokenizer_recovered.batch_decode(out, skip_special_tokens=True)[0] 110 | output_text = output_text[len(input_text) :] 111 | print(f"Input: {input_text}\nCompletion: {output_text}") 112 | 113 | return model_recovered, tokenizer_recovered 114 | 115 | if __name__ == '__main__': 116 | 117 | parser = argparse.ArgumentParser() 118 | parser.add_argument('--data_dir', 119 | type=str, 120 | default=".cache/factscore") 121 | parser.add_argument('--model_dir', 122 | type=str, 123 | default=".cache/factscore") 124 | parser.add_argument('--llama_7B_HF_path', 125 | type=str, 126 | default=None) 127 | 128 | args = parser.parse_args() 129 | 130 | if not os.path.exists(args.model_dir): 131 | os.makedirs(args.model_dir) 132 | 133 | if not os.path.exists(args.data_dir): 134 | os.makedirs(args.data_dir) 135 | 136 | download_file("1IseEAflk1qqV0z64eM60Fs3dTgnbgiyt", "demos.zip", args.data_dir) 137 | download_file("1enz1PxwxeMr4FRF9dtpCPXaZQCBejuVF", "data.zip", args.data_dir) 138 | download_file("1mekls6OGOKLmt7gYtHs0WGf5oTamTNat", "enwiki-20230401.db", args.data_dir) 139 | 140 | if args.llama_7B_HF_path: 141 | recover_instruct_llama(args.llama_7B_HF_path, os.path.join(args.model_dir, "inst-llama-7B")) 142 | 143 | # download the roberta_stopwords.txt file 144 | subprocess.run(["wget https://raw.githubusercontent.com/shmsw25/FActScore/main/roberta_stopwords.txt"], shell=True) 145 | 146 | # move the files to the data directory 147 | subprocess.run(["mv demos %s" % args.data_dir], shell=True) 148 | subprocess.run(["mv enwiki-20230401.db %s" % args.data_dir], shell=True) 149 | -------------------------------------------------------------------------------- /src/factscore_package/en_core_web_sm-3.7.1.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-FinAI/PIXIU/da45ac467ca3a828621315881e33c68bca3bbb1f/src/factscore_package/en_core_web_sm-3.7.1.tar.gz -------------------------------------------------------------------------------- /src/factscore_package/factscorer.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import string 3 | import json 4 | import numpy as np 5 | import os 6 | import logging 7 | 8 | from tqdm import tqdm 9 | from .abstain_detection import is_response_abstained 10 | from .atomic_facts import AtomicFactGenerator 11 | from .clm import CLM 12 | from .npm import NPM 13 | from .openai_lm import OpenAIModel 14 | from .retrieval import DocDB, Retrieval 15 | 16 | class FactScorer(object): 17 | 18 | def __init__(self, 19 | model_name="retrieval+ChatGPT", 20 | data_dir=".cache", 21 | model_dir=".cache", 22 | cache_dir=".cache", 23 | openai_key="api.key", 24 | cost_estimate="consider_cache", 25 | abstain_detection_type=None, 26 | batch_size=256): 27 | assert model_name in ["retrieval+llama", "retrieval+llama+npm", "retrieval+ChatGPT", "retrieval+GPT4", "npm", "retrieval+ChatGPT+npm"] 28 | self.model_name = model_name 29 | 30 | self.db = {} 31 | self.retrieval = {} 32 | self.npm = {} 33 | self.batch_size = batch_size # batch size for retrieval 34 | self.openai_key = openai_key 35 | self.abstain_detection_type = 
abstain_detection_type 36 | 37 | self.data_dir = data_dir 38 | self.cache_dir = cache_dir 39 | if not os.path.exists(cache_dir): 40 | os.makedirs(cache_dir) 41 | 42 | self.af_generator = None 43 | self.cost_estimate = cost_estimate 44 | 45 | if "llama" in model_name: 46 | self.lm = CLM("Llama2-7B-chat", 47 | model_dir='meta-llama/Llama-2-7b-chat-hf', 48 | cache_file=os.path.join(cache_dir, "Llama2-7B-chat.pkl")) 49 | #self.lm = CLM("inst-llama-7B", 50 | #model_dir=os.path.join(model_dir, "inst-llama-7B"), 51 | #cache_file=os.path.join(cache_dir, "inst-llama-7B.pkl")) 52 | elif "ChatGPT" in model_name: 53 | self.lm = OpenAIModel("ChatGPT", 54 | cache_file=os.path.join(cache_dir, "ChatGPT.pkl"), 55 | key=openai_key) 56 | elif "GPT4" in model_name: 57 | self.lm = OpenAIModel("GPT4", 58 | cache_file=os.path.join(cache_dir, "GPT4.pkl"), 59 | key=openai_key) 60 | else: 61 | self.lm = None 62 | 63 | def save_cache(self): 64 | if self.lm: 65 | self.lm.save_cache() 66 | if "npm" in self.model_name: 67 | for k, v in self.npm.items(): 68 | v.save_cache() 69 | for k, v in self.retrieval.items(): 70 | v.save_cache() 71 | 72 | def register_knowledge_source(self, name="enwiki-20230401", db_path=None, data_path=None): 73 | assert name not in self.retrieval, f"{name} already registered" 74 | if db_path is None: 75 | db_path = os.path.join(self.data_dir, f"{name}.db") 76 | 77 | if data_path is None: 78 | data_path = os.path.join(self.data_dir, f"{name}.jsonl") 79 | 80 | cache_path = os.path.join(self.cache_dir, f"retrieval-{name}.json") 81 | embed_cache_path = os.path.join(self.cache_dir, f"retrieval-{name}.pkl") 82 | 83 | self.db[name] = DocDB(db_path=db_path, data_path=data_path) 84 | self.retrieval[name] = Retrieval(self.db[name], cache_path, embed_cache_path, batch_size=self.batch_size) 85 | if "npm" in self.model_name: 86 | cache_path = os.path.join(self.cache_dir, f"bm25-{name}.json") 87 | embed_cache_path = os.path.join(self.cache_dir, f"bm25-{name}.pkl") 88 | self.npm[name] = NPM(Retrieval(self.db[name], cache_path, embed_cache_path, "bm25"), 89 | "npm-single", 90 | cache_file=os.path.join(self.cache_dir, f"npm-{name}.pkl")) 91 | 92 | 93 | def print_cost_estimates(self, total_words, task, model): 94 | # https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them 95 | # the number of tokens is roughly 4/3 of the number of words 96 | total_tokens = total_words * 4.0 / 3 97 | 98 | # https://openai.com/pricing 99 | # if we use davinci-003, the cost is $0.02 per 1000 tokens 100 | # if we use gpt-3.5-turbo, the cost is $0.002 per 1000 tokens 101 | if model == "davinci-003": 102 | rate = 0.02 103 | elif model == "gpt-3.5-turbo": 104 | rate = 0.002 105 | 106 | total_cost = total_tokens * rate / 1000 107 | 108 | # print the total words, tokens, and cost along with rate 109 | logging.critical("Estimated OpenAI API cost for %s ($%.3f per 1000 tokens): $%.2f for %d words and %d tokens" % (task, rate, total_cost, total_words, total_tokens)) 110 | 111 | def get_score(self, 112 | topics, 113 | generations, 114 | gamma=10, 115 | atomic_facts=None, 116 | knowledge_source=None, 117 | verbose=False): 118 | if knowledge_source is None: 119 | # use the default knowledge source 120 | knowledge_source = "enwiki-20230401" 121 | 122 | if knowledge_source not in self.retrieval: 123 | self.register_knowledge_source(knowledge_source) 124 | 125 | if type(topics)==type(generations)==str: 126 | topics = [topics] 127 | generations = [generations] 128 | else: 129 | assert type(topics)==type(generations)==list, 
"`topics` and `generations` should be lists." 130 | assert len(topics)==len(generations), "`topics` and `generations` should have the same length" 131 | 132 | if atomic_facts is not None: 133 | assert len(topics)==len(atomic_facts), "`topics` and `atomic_facts` should have the same length" 134 | else: 135 | if self.af_generator is None: 136 | self.af_generator = AtomicFactGenerator(key_path=self.openai_key, 137 | demon_dir=os.path.join(self.data_dir, "demos"), 138 | gpt3_cache_file=os.path.join(self.cache_dir, "InstructGPT.pkl")) 139 | 140 | # estimate the total cost of atomic fact generation 141 | total_words = 0 142 | for gen in generations: 143 | total_words += self.af_generator.run(gen, cost_estimate=self.cost_estimate) 144 | 145 | self.print_cost_estimates(total_words, task="atomic fact generation", model="davinci-003") 146 | 147 | if verbose: 148 | topics = tqdm(topics) 149 | 150 | atomic_facts = [] 151 | for topic, gen in zip(topics, generations): 152 | # optionally, first detect if the response is abstained 153 | response_abstained = is_response_abstained(gen, self.abstain_detection_type) 154 | if response_abstained: 155 | atomic_facts.append(None) 156 | continue 157 | # continue only when the response is not abstained 158 | curr_afs, _ = self.af_generator.run(gen) 159 | curr_afs = [fact for _, facts in curr_afs for fact in facts] 160 | if len(curr_afs)==0: 161 | atomic_facts.append(None) 162 | else: 163 | atomic_facts.append(curr_afs) 164 | if len(atomic_facts) % 10 == 0: 165 | self.af_generator.save_cache() 166 | 167 | assert len(atomic_facts)==len(topics) 168 | self.af_generator.save_cache() 169 | 170 | respond_ratio = np.mean([facts is not None for facts in atomic_facts]) 171 | 172 | if "ChatGPT" in self.model_name: 173 | # estimate the total cost of response generation 174 | total_words = 0 175 | for topic, generation, facts in zip(topics, generations, atomic_facts): 176 | if facts is not None: 177 | total_words += self._get_score(topic, generation, facts, knowledge_source, cost_estimate=self.cost_estimate) 178 | 179 | self.print_cost_estimates(total_words, task="factscore evaluation", model="gpt-3.5-turbo") 180 | 181 | if verbose: 182 | topics = tqdm(topics) 183 | 184 | scores = [] 185 | init_scores = [] 186 | decisions = [] 187 | for topic, generation, facts in zip(topics, generations, atomic_facts): 188 | if facts is None: 189 | decisions.append(None) 190 | else: 191 | decision = self._get_score(topic, generation, facts, knowledge_source) 192 | score = np.mean([d["is_supported"] for d in decision]) 193 | 194 | if gamma: 195 | init_scores.append(score) 196 | penalty = 1.0 if len(facts)>gamma else np.exp(1-gamma/len(facts)) 197 | score = penalty * score 198 | 199 | decisions.append(decision) 200 | scores.append(score) 201 | if len(scores) % 10 == 0: 202 | self.save_cache() 203 | 204 | self.save_cache() 205 | 206 | out = {"score": np.mean(scores), 207 | "respond_ratio": respond_ratio, 208 | "decisions": decisions, 209 | "num_facts_per_response": np.mean([len(d) for d in decisions if d is not None])} 210 | 211 | if gamma: 212 | out["init_score"] = np.mean(init_scores) 213 | 214 | return out 215 | 216 | def _get_score(self, topic, generation, atomic_facts, knowledge_source, cost_estimate=None): 217 | decisions = [] 218 | total_words = 0 219 | for atom in atomic_facts: 220 | atom = atom.strip() 221 | if self.lm: 222 | passages = self.retrieval[knowledge_source].get_passages(topic, atom, k=5) 223 | definition = "Answer the question about {} based on the given 
context.\n\n".format(topic) 224 | context = "" 225 | for psg_idx, psg in enumerate(reversed(passages)): 226 | context += "Title: {}\nText: {}\n\n".format(psg["title"], psg["text"].replace("", "").replace("", "")) 227 | definition += context.strip() 228 | if not definition[-1] in string.punctuation: 229 | definition += "." 230 | prompt = "{}\n\nInput: {} True or False?\nOutput:".format(definition.strip(), atom.strip()) 231 | 232 | if cost_estimate: 233 | if cost_estimate == "consider_cache" and (prompt.strip() + "_0") not in self.lm.cache_dict: 234 | total_words += len(prompt.split()) 235 | elif cost_estimate == "ignore_cache": 236 | total_words += len(prompt.split()) 237 | continue 238 | 239 | output = self.lm.generate(prompt) 240 | 241 | if type(output[1])==np.ndarray: 242 | # when logits are available 243 | logits = np.array(output[1]) 244 | assert logits.shape[0] in [32000, 32001] 245 | true_score = logits[5852] 246 | false_score = logits[7700] 247 | is_supported = true_score > false_score 248 | else: 249 | # when logits are unavailable 250 | generated_answer = output[0].lower() 251 | if "true" in generated_answer or "false" in generated_answer: 252 | if "true" in generated_answer and "false" not in generated_answer: 253 | is_supported = True 254 | elif "false" in generated_answer and "true" not in generated_answer: 255 | is_supported = False 256 | else: 257 | is_supported = generated_answer.index("true") > generated_answer.index("false") 258 | else: 259 | is_supported = all([keyword not in generated_answer.lower().translate(str.maketrans("", "", string.punctuation)).split() for keyword in ["not", "cannot", "unknown", "information"]]) 260 | 261 | else: 262 | is_supported = True 263 | 264 | if is_supported and "npm" in self.model_name: 265 | npprob = self.npm[knowledge_source].get_probabilty(topic, atom) 266 | is_supported = npprob > 0.3 267 | 268 | decisions.append({"atom": atom, "is_supported": is_supported}) 269 | 270 | if cost_estimate: 271 | return total_words 272 | else: 273 | return decisions 274 | 275 | if __name__ == '__main__': 276 | 277 | parser = argparse.ArgumentParser() 278 | parser.add_argument('--input_path', 279 | type=str, 280 | default="data/labeled/InstructGPT.jsonl") 281 | parser.add_argument('--model_name', 282 | type=str, 283 | default="retrieval+ChatGPT") 284 | parser.add_argument('--gamma', 285 | type=int, 286 | default=10, 287 | help="hyperparameter for length penalty") 288 | 289 | parser.add_argument('--openai_key', 290 | type=str, 291 | default="api.key") 292 | parser.add_argument('--data_dir', 293 | type=str, 294 | default=".cache/factscore/") 295 | parser.add_argument('--model_dir', 296 | type=str, 297 | default=".cache/factscore/") 298 | parser.add_argument('--cache_dir', 299 | type=str, 300 | default=".cache/factscore/") 301 | parser.add_argument('--knowledge_source', 302 | type=str, 303 | default=None) 304 | 305 | 306 | parser.add_argument('--cost_estimate', 307 | type=str, 308 | default="consider_cache", 309 | choices=["consider_cache", "ignore_cache"]) 310 | parser.add_argument('--abstain_detection_type', 311 | type=str, 312 | default=None, 313 | choices=["perplexity_ai", "generic", "none"]) 314 | parser.add_argument('--use_atomic_facts', 315 | action="store_true") 316 | parser.add_argument('--verbose', 317 | action="store_true", 318 | help="for printing out the progress bar") 319 | parser.add_argument('--print_rate_limit_error', 320 | action="store_true", 321 | help="for printing out rate limit error when using OpenAI keys") 322 | 
parser.add_argument('--n_samples', 323 | type=int, 324 | default=None) 325 | 326 | args = parser.parse_args() 327 | 328 | logging.basicConfig(format='%(asctime)s - %(name)s - %(message)s', 329 | datefmt='%m/%d/%Y %H:%M:%S', 330 | level=logging.ERROR if args.print_rate_limit_error else logging.CRITICAL) 331 | 332 | fs = FactScorer(model_name=args.model_name, 333 | data_dir=args.data_dir, 334 | model_dir=args.model_dir, 335 | cache_dir=args.cache_dir, 336 | openai_key=args.openai_key, 337 | cost_estimate=args.cost_estimate, 338 | abstain_detection_type=args.abstain_detection_type) 339 | 340 | tot = 0 341 | topics, generations, atomic_facts = [], [], [] 342 | with open(args.input_path) as f: 343 | for line in f: 344 | dp = json.loads(line) 345 | tot += 1 346 | if args.use_atomic_facts: 347 | assert "annotations" in dp, "You can specify `--use_atomic_facts` only when atomic facts are available in the input data already." 348 | if dp["annotations"] is None: 349 | continue 350 | topics.append(dp["topic"]) 351 | generations.append(dp["output"]) 352 | atomic_facts.append([atom["text"] for sent in dp["annotations"] for atom in sent["model-atomic-facts"]]) 353 | else: 354 | topics.append(dp["topic"]) 355 | generations.append(dp["output"]) 356 | if args.n_samples is not None and tot==args.n_samples: 357 | break 358 | out = fs.get_score(topics=topics, 359 | generations=generations, 360 | gamma=args.gamma, 361 | atomic_facts=atomic_facts if args.use_atomic_facts else None, 362 | knowledge_source=args.knowledge_source, 363 | verbose=args.verbose) 364 | logging.critical("FActScore = %.1f%%" % (100*out["score"])) 365 | if "init_score" in out: 366 | logging.critical("FActScore w/o length penalty = %.1f%%" % (100*out["init_score"])) 367 | logging.critical("Respond ratio = %.1f%%" % (100*out["respond_ratio"])) 368 | logging.critical("# Atomic facts per valid response = %.1f" % (out["num_facts_per_response"])) 369 | 370 | # Save out as a json file 371 | with open(args.input_path.replace(".jsonl", "_factscore_output.json"), 'w') as f: 372 | f.write(json.dumps(out) + "\n") 373 | 374 | -------------------------------------------------------------------------------- /src/factscore_package/lm.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import time 4 | from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig, AutoConfig 5 | from torch import cuda, bfloat16 6 | import json 7 | from huggingface_hub import login 8 | 9 | 10 | class LM(): 11 | 12 | def __init__(self, cache_file): 13 | self.cache_file = cache_file 14 | self.cache_dict = self.load_cache() 15 | self.model = None 16 | self.save_interval = 100 17 | self.add_n = 0 18 | 19 | def load_model(self): 20 | # load the model and put it as self.model 21 | raise NotImplementedError() 22 | #model_id = 'meta-llama/Llama-2-7b-chat-hf' 23 | 24 | #bnb_config = BitsAndBytesConfig( 25 | #load_in_4bit=True, 26 | #bnb_4bit_quant_type='nf4', 27 | #bnb_4bit_use_double_quant=True, 28 | #bnb_4bit_compute_dtype=bfloat16 29 | #) 30 | 31 | # begin initializing HF items; read the access token from the environment rather than hard-coding it 32 | #hf_auth = os.environ.get("HF_TOKEN") 33 | #model_config = AutoConfig.from_pretrained( 34 | # model_id, 35 | #use_auth_token=hf_auth 36 | #) 37 | 38 | #model = AutoModelForCausalLM.from_pretrained( 39 | #model_id, 40 | ##trust_remote_code=True, 41 | #config=model_config, 42 | #quantization_config=bnb_config, 43 | #device_map='auto', 44 | #use_auth_token=hf_auth 45 | 
#) 46 | #model_name_or_path = "TheBloke/Llama-2-70B-chat-AWQ" 47 | #model = AutoAWQForCausalLM.from_quantized(model_name_or_path, fuse_layers=True, 48 | #trust_remote_code=False, safetensors=True) 49 | # 2. Tie the weights 50 | #model.tie_weights() 51 | 52 | #tokenizer = AutoTokenizer.from_pretrained( 53 | #model_id, 54 | #use_auth_token=hf_auth 55 | #) 56 | # 3. Create the pipeline using the model with tied weights. 57 | #generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device_map="auto") 58 | #self.model = generator 59 | 60 | 61 | def generate(self, prompt, sample_idx=0, max_sequence_length=2048, max_output_length=128): 62 | prompt = prompt.strip() # it's important not to end with a whitespace 63 | cache_key = f"{prompt}_{sample_idx}" 64 | 65 | if cache_key in self.cache_dict: 66 | return self.cache_dict[cache_key] 67 | 68 | if self.model is None: 69 | self.load_model() 70 | 71 | if prompt.endswith(" True or False?\nAnswer:"): 72 | generated = self._generate(prompt, max_sequence_length=max_sequence_length, max_output_length=1) 73 | else: 74 | generated = self._generate(prompt, max_sequence_length=max_sequence_length, max_output_length=max_output_length) 75 | 76 | self.cache_dict[cache_key] = generated 77 | self.add_n += 1 78 | return generated 79 | 80 | """ 81 | def _generate(self, prompt, max_output_length): 82 | if self.add_n % self.save_interval == 0: 83 | self.save_cache() 84 | generate_kwargs = dict(max_new_tokens=max_output_length, do_sample=True, temperature=0.5) 85 | output = self.model(prompt, **generate_kwargs) 86 | #print(output) 87 | output = output[0]['generated_text'][len(prompt):].strip() 88 | if "\n" in output: 89 | output = output[:output.index("\n")] 90 | return output 91 | """ 92 | def save_cache(self): 93 | if self.add_n == 0: 94 | return 95 | 96 | # load the latest cache first, since if there were other processes running in parallel, cache might have been updated 97 | for k, v in self.load_cache().items(): 98 | self.cache_dict[k] = v 99 | 100 | with open(self.cache_file, "wb") as f: 101 | pickle.dump(self.cache_dict, f) 102 | 103 | def load_cache(self, allow_retry=True): 104 | if os.path.exists(self.cache_file): 105 | while True: 106 | try: 107 | with open(self.cache_file, "rb") as f: 108 | cache = pickle.load(f) 109 | break 110 | except Exception: 111 | if not allow_retry: 112 | assert False 113 | print ("Pickle Error: Retry in 5sec...") 114 | time.sleep(5) 115 | else: 116 | cache = {} 117 | return cache 118 | 119 | 120 | 121 | -------------------------------------------------------------------------------- /src/factscore_package/npm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import time 4 | from collections import defaultdict 5 | from transformers import AutoModelForMaskedLM, AutoTokenizer 6 | 7 | from .lm import LM 8 | from .retrieval import Retrieval 9 | 10 | def softmax(x): 11 | return(np.exp(x - np.max(x)) / np.exp(x - np.max(x)).sum()) 12 | 13 | class NPM(LM): 14 | 15 | def __init__(self, bm25, model_name, cache_file): 16 | assert model_name.startswith("npm") 17 | self.bm25 = bm25 18 | self.model_name = model_name 19 | self.model = None 20 | 21 | self.tokenizer = AutoTokenizer.from_pretrained("facebook/" + self.model_name) 22 | self.mask_id = self.tokenizer.mask_token_id 23 | 24 | with open("roberta_stopwords.txt", "r") as f: 25 | self.stopwords = set() 26 | for line in f: 27 | self.stopwords.add(int(line.strip())) 28 | 29 | 
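# NPM scores an atomic fact nonparametrically: each content token of the fact is
# masked in turn, and its probability is estimated from the token embeddings of
# BM25-retrieved passages (see get_probabilty below).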
super().__init__(cache_file=cache_file) 30 | 31 | def load_model(self): 32 | self.model = AutoModelForMaskedLM.from_pretrained("facebook/" + self.model_name) 33 | self.model.cuda() 34 | self.model.eval() 35 | 36 | def save_cache(self): 37 | super().save_cache() 38 | self.bm25.save_cache() 39 | 40 | def tokenize(self, texts, skip_special_tokens=False, padding=True): 41 | assert type(texts)==list 42 | all_input_ids = self.tokenizer(texts)["input_ids"] 43 | if skip_special_tokens: 44 | for i, input_ids in enumerate(all_input_ids): 45 | assert input_ids[0]==0 and input_ids[-1]==2 46 | all_input_ids[i] = input_ids[1:-1] 47 | if not padding: 48 | return all_input_ids 49 | max_length = np.max([len(_ids) for _ids in all_input_ids]) 50 | _all_input_ids = [] 51 | _all_attention_mask = [] 52 | for i, input_ids in enumerate(all_input_ids): 53 | n_valid = len(input_ids) 54 | n_masks = max_length - n_valid 55 | _all_input_ids.append(input_ids + [0 for _ in range(n_masks)]) 56 | _all_attention_mask.append([1 for _ in range(n_valid)] + [0 for _ in range(n_masks)]) 57 | return torch.LongTensor(_all_input_ids), torch.LongTensor(_all_attention_mask) 58 | 59 | def decode(self, input_ids): 60 | return self.tokenizer.decode(input_ids) 61 | 62 | def encode(self, texts, skip_special_tokens=False, gt_input_ids=None): 63 | assert type(texts)==list 64 | if self.model is None: 65 | self.load_model() 66 | if gt_input_ids is not None: 67 | assert len(texts)==len(gt_input_ids) 68 | all_input_ids, all_attention_mask = self.tokenize(texts, skip_special_tokens=skip_special_tokens) 69 | 70 | with torch.no_grad(): 71 | outputs = self.model(all_input_ids.cuda(), 72 | all_attention_mask.cuda(), 73 | output_hidden_states=True, 74 | return_dict=True) 75 | all_logits = outputs["logits"].detach().cpu().numpy() 76 | all_hidden_states = outputs["hidden_states"][-1].detach().cpu().numpy() 77 | 78 | results = [] 79 | for i, (text, input_ids, logits, hidden_states) in enumerate(zip(texts, all_input_ids, all_logits, all_hidden_states)): 80 | input_ids = input_ids.numpy().tolist() 81 | if self.mask_id in input_ids: 82 | idx = input_ids.index(self.mask_id) 83 | assert gt_input_ids is not None 84 | prob = softmax(logits[idx])[gt_input_ids[i]] 85 | results.append((prob, hidden_states[idx])) 86 | else: 87 | _input_ids = [_id for _id in input_ids if _id not in [0, 2]] 88 | _hidden_states = [h for _id, h in zip(input_ids, hidden_states) if _id not in [0, 2]] 89 | results.append((_input_ids, _hidden_states)) 90 | 91 | return results 92 | 93 | def get_probabilty(self, topic, question): 94 | passages = self.bm25.get_passages(topic, question, k=3) 95 | passages = [p["text"].strip() for p in passages] 96 | cache_key = question + "#" + "#".join(passages) 97 | 98 | if cache_key not in self.cache_dict: 99 | encoded = self.encode(passages, skip_special_tokens=True) 100 | stacked_passage_tokens, stacked_passage_vectors = [], [] 101 | for input_ids, vectors in encoded: 102 | stacked_passage_tokens += input_ids 103 | if len(vectors)>0: 104 | stacked_passage_vectors.append(vectors) 105 | stacked_passage_vectors = np.concatenate(stacked_passage_vectors, 0) 106 | 107 | question_input_ids = self.tokenize(["Fact: " + question], skip_special_tokens=False, padding=False)[0] 108 | if 2 in question_input_ids: 109 | question_input_ids = question_input_ids[:question_input_ids.index(2)] 110 | question_input_ids = question_input_ids[1:] 111 | 112 | ''' 113 | triples = [] 114 | prefix = True 115 | for i, input_id in enumerate(question_input_ids): 116 | if prefix: 117 | 
if input_id==35: # the end of the "Fact:" prefix
118 |                         prefix = False
119 |                     continue
120 |                 if input_id in [0, 2] or input_id in self.stopwords:
121 |                     continue
122 |                 new_question = self.decode(question_input_ids[:i] + [self.mask_id] + question_input_ids[i+1:])
123 |                 prob, vector = self.encode(new_question, gt_input_id=input_id)
124 |                 triples.append((prob, vector, input_id))
125 |             '''
126 |             triples = []
127 |             batch = []
128 |             gt_input_ids = []
129 |             prefix = True
130 |             for i, input_id in enumerate(question_input_ids):
131 |                 if prefix:
132 |                     if input_id==35: # the end of the "Fact:" prefix
133 |                         prefix = False
134 |                     continue
135 |                 if input_id in [0, 2] or input_id in self.stopwords:
136 |                     continue
137 |                 batch.append(self.decode(question_input_ids[:i] + [self.mask_id] + question_input_ids[i+1:]))
138 |                 gt_input_ids.append(input_id)
139 |             for (prob, vector), gt_input_id in zip(self.encode(batch, gt_input_ids=gt_input_ids), gt_input_ids):
140 |                 triples.append((prob, vector, gt_input_id))
141 |
142 |             stacked_question_vectors = np.stack([v for _, v, _ in triples], 0)
143 |             all_scores = np.exp(np.inner(stacked_question_vectors, stacked_passage_vectors) / np.sqrt(stacked_passage_vectors.shape[-1]))
144 |
145 |             probs = []
146 |             for (softmax_prob, vector, input_id), scores in zip(triples, all_scores):
147 |                 assert len(stacked_passage_tokens)==len(scores)
148 |                 if input_id not in stacked_passage_tokens:
149 |                     probs.append(0)
150 |                 else:
151 |                     aggregated_scores = defaultdict(list)
152 |                     for token, score in zip(stacked_passage_tokens, scores):
153 |                         aggregated_scores[token].append(score)
154 |                     tot = np.sum([np.sum(v) for v in aggregated_scores.values()])
155 |                     prob = np.sum(aggregated_scores[input_id]) / tot
156 |                     probs.append(prob)
157 |
158 |             self.cache_dict[cache_key] = np.mean(probs)
159 |             self.add_n += 1
160 |
161 |         return self.cache_dict[cache_key]
162 |
163 |
164 |
165 |
166 |
--------------------------------------------------------------------------------
/src/factscore_package/openai_lm.py:
--------------------------------------------------------------------------------
1 | from .lm import LM
2 | from openai import OpenAI
3 | import openai
4 | import sys
5 | import time
6 | import os
7 | import numpy as np
8 | import logging
9 |
10 | #os.environ["http_proxy"] = "http://localhost:27890"
11 | #os.environ["https_proxy"] = "http://localhost:27890"
12 |
13 | class OpenAIModel(LM):
14 |
15 |     def __init__(self, model_name, cache_file=None, key=""):
16 |         self.model_name = model_name
17 |         self.temp = 0.7
18 |         self.save_interval = 100
19 |         self.client = OpenAI(api_key=key.strip())
20 |         super().__init__(cache_file)
21 |
22 |     def load_model(self):
23 |         pass
24 |         # load api key
25 |         #key_path = self.key_path
26 |         #assert os.path.exists(key_path), f"Please place your OpenAI API Key in {key_path}."
27 |         #with open(key_path, 'r') as f:
28 |             #api_key = f.readline()
29 |         #self.client = OpenAI(api_key=api_key.strip())
30 |         #self.model = self.model_name
31 |
32 |     def _generate(self, prompt, max_sequence_length=2048, max_output_length=128):
33 |         if self.add_n % self.save_interval == 0:
34 |             self.save_cache()
35 |         # return a tuple of string (generated text) and metadata (any format)
36 |         # This generates a response for the prompt, independent of the downstream application
37 |         if self.model_name == "ChatGPT":
38 |             # Construct the prompt sent to ChatGPT
39 |             message = [{"role": "user", "content": prompt}]
40 |             # Call API
41 |             response = self.call_ChatGPT(message, temp=self.temp, max_len=max_sequence_length)
42 |             # Get the output from the response
43 |             output = response.choices[0].message.content
44 |             return output, response
45 |         elif self.model_name == "InstructGPT":
46 |             # Call API
47 |             response = self.call_GPT3(prompt, temp=self.temp)
48 |             # Get the output from the response
49 |             output = response.choices[0].text
50 |             return output, response
51 |         else:
52 |             raise NotImplementedError()
53 |
54 |     def call_ChatGPT(self, message, model_name="gpt-3.5-turbo", max_len=1024, temp=0.7, verbose=False):
55 |         # call the chat completions API until a result is returned
56 |         response = None
57 |         received = False
58 |         num_rate_errors = 0
59 |         while not received:
60 |             try:
61 |                 response = self.client.chat.completions.create(model=model_name,
62 |                                                                messages=message,
63 |                                                                max_tokens=max_len,
64 |                                                                temperature=temp)
65 |                 received = True
66 |             except Exception:
67 |                 # print(message)
68 |                 num_rate_errors += 1
69 |                 error = sys.exc_info()[0]
70 |                 if error == openai.BadRequestError:  # renamed from InvalidRequestError in openai>=1.0
71 |                     # something is wrong: e.g. prompt too long
72 |                     logging.critical(f"BadRequestError\nPrompt passed in:\n\n{message}\n\n")
73 |                     assert False
74 |
75 |                 logging.error("API error: %s (%d). Waiting %dsec" % (error, num_rate_errors, np.power(2, num_rate_errors)))
76 |                 time.sleep(np.power(2, num_rate_errors))
77 |         return response
78 |
79 |
80 |     def call_GPT3(self, prompt, model_name="text-davinci-003", max_len=512, temp=0.7, num_log_probs=0, echo=False, verbose=False):
81 |         # call the legacy completions API until a result is returned
82 |         response = None
83 |         received = False
84 |         num_rate_errors = 0
85 |         while not received:
86 |             try:
87 |                 response = self.client.completions.create(model="gpt-3.5-turbo-instruct",  # hardcoded; the model_name parameter is ignored
88 |                                                           prompt=prompt,
89 |                                                           max_tokens=max_len,
90 |                                                           temperature=temp,
91 |                                                           logprobs=num_log_probs,
92 |                                                           echo=echo)
93 |                 received = True
94 |             except Exception:
95 |                 error = sys.exc_info()[0]
96 |                 num_rate_errors += 1
97 |                 if error == openai.BadRequestError:  # renamed from InvalidRequestError in openai>=1.0
98 |                     # something is wrong: e.g. prompt too long
99 |                     logging.critical(f"BadRequestError\nPrompt passed in:\n\n{prompt}\n\n")
100 |                     assert False
101 |                 logging.error("API error: %s (%d)" % (error, num_rate_errors))
102 |                 time.sleep(np.power(2, num_rate_errors))
103 |         return response
104 |
--------------------------------------------------------------------------------
/src/factscore_package/retrieval.py:
--------------------------------------------------------------------------------
1 | import json
2 | import time
3 | import os
4 |
5 | import sqlite3
6 | import numpy as np
7 | import pickle as pkl
8 |
9 | from rank_bm25 import BM25Okapi
10 |
11 | SPECIAL_SEPARATOR = "####SPECIAL####SEPARATOR####"
12 | MAX_LENGTH = 256
13 |
14 | class DocDB(object):
15 |     """Sqlite backed document storage.
16 |     Implements get_text_from_title(title).
17 |     """
18 |
19 |     def __init__(self, db_path=None, data_path=None):
20 |         self.db_path = db_path
21 |         self.connection = sqlite3.connect(self.db_path, check_same_thread=False)
22 |
23 |         cursor = self.connection.cursor()
24 |         cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
25 |
26 |         if len(cursor.fetchall())==0:
27 |             assert data_path is not None, f"{self.db_path} is empty. Specify `data_path` in order to create a DB."
28 |             print (f"{self.db_path} is empty. Start building DB from {data_path}...")
29 |             self.build_db(self.db_path, data_path)
30 |
31 |     def __enter__(self):
32 |         return self
33 |
34 |     def __exit__(self, *args):
35 |         self.close()
36 |
37 |     def path(self):
38 |         """Return the path to the file that backs this database."""
39 |         return self.db_path
40 |
41 |     def close(self):
42 |         """Close the connection to the database."""
43 |         self.connection.close()
44 |
45 |     def build_db(self, db_path, data_path):
46 |         from transformers import RobertaTokenizer
47 |         tokenizer = RobertaTokenizer.from_pretrained("roberta-large")
48 |
49 |         titles = set()
50 |         output_lines = []
51 |         tot = 0
52 |         start_time = time.time()
53 |         c = self.connection.cursor()
54 |         c.execute("CREATE TABLE documents (title PRIMARY KEY, text);")
55 |
56 |         with open(data_path, "r") as f:
57 |             for line in f:
58 |                 dp = json.loads(line)
59 |                 title = dp["title"]
60 |                 text = dp["text"]
61 |                 if title in titles:
62 |                     continue
63 |                 titles.add(title)
64 |                 if type(text)==str:
65 |                     text = [text]
66 |                 passages = [[]]
67 |                 for sent_idx, sent in enumerate(text):
68 |                     assert len(sent.strip())>0
69 |                     tokens = tokenizer(sent)["input_ids"]
70 |                     max_length = MAX_LENGTH - len(passages[-1])
71 |                     if len(tokens) <= max_length:
72 |                         passages[-1].extend(tokens)
73 |                     else:
74 |                         passages[-1].extend(tokens[:max_length])
75 |                         offset = max_length
76 |                         while offset < len(tokens):
77 |                             passages.append(tokens[offset:offset+MAX_LENGTH])
78 |                             offset += MAX_LENGTH
79 |
80 |                 psgs = [tokenizer.decode(tokens) for tokens in passages if np.sum([t not in [0, 2] for t in tokens])>0]
81 |                 text = SPECIAL_SEPARATOR.join(psgs)
82 |                 output_lines.append((title, text))
83 |                 tot += 1
84 |
85 |                 if len(output_lines) == 1000000:
86 |                     c.executemany("INSERT INTO documents VALUES (?,?)", output_lines)
87 |                     output_lines = []
88 |                     print ("Finished saving %dM documents (%dmin)" % (tot / 1000000, (time.time()-start_time)/60))
89 |
90 |         if len(output_lines) > 0:
91 |             c.executemany("INSERT INTO documents VALUES (?,?)", output_lines)
92 |             print ("Finished saving %dM documents (%dmin)" % (tot / 1000000, (time.time()-start_time)/60))
93 |
94 |         self.connection.commit()
95 |         self.connection.close()
96 |
97 |     def get_text_from_title(self, title):
98 |         """Fetch the passages of the document whose title is `title`."""
99 |         cursor = self.connection.cursor()
100 |         cursor.execute("SELECT text FROM documents WHERE title = ?", (title,))
101 |         results = cursor.fetchall()
102 |         results = [r for r in results]
103 |         cursor.close()
104 |         assert results is not None and len(results)==1, f"`topic` in your data ({title}) is likely not a valid title in the DB."
105 |         results = [{"title": title, "text": para} for para in results[0][0].split(SPECIAL_SEPARATOR)]
106 |         assert len(results)>0, f"`topic` in your data ({title}) is likely not a valid title in the DB."
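        # `results` now holds one dict per stored passage of this title, e.g.
        # (illustrative values only): [{"title": "Arbitrage", "text": "Arbitrage is ..."}, ...]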
107 |         return results
108 |
109 | class Retrieval(object):
110 |
111 |     def __init__(self, db, cache_path, embed_cache_path,
112 |                  retrieval_type="gtr-t5-large", batch_size=None):
113 |         self.db = db
114 |         self.cache_path = cache_path
115 |         self.embed_cache_path = embed_cache_path
116 |         self.retrieval_type = retrieval_type
117 |         self.batch_size = batch_size
118 |         assert retrieval_type=="bm25" or retrieval_type.startswith("gtr-")
119 |
120 |         self.encoder = None
121 |         self.load_cache()
122 |         self.add_n = 0
123 |         self.add_n_embed = 0
124 |
125 |     def load_encoder(self):
126 |         from sentence_transformers import SentenceTransformer
127 |         encoder = SentenceTransformer("sentence-transformers/" + self.retrieval_type)
128 |         encoder = encoder.cuda()
129 |         encoder = encoder.eval()
130 |         self.encoder = encoder
131 |         assert self.batch_size is not None
132 |
133 |     def load_cache(self):
134 |         if os.path.exists(self.cache_path):
135 |             with open(self.cache_path, "r") as f:
136 |                 self.cache = json.load(f)
137 |         else:
138 |             self.cache = {}
139 |         if os.path.exists(self.embed_cache_path):
140 |             with open(self.embed_cache_path, "rb") as f:
141 |                 self.embed_cache = pkl.load(f)
142 |         else:
143 |             self.embed_cache = {}
144 |
145 |     def save_cache(self):
146 |         if self.add_n > 0:
147 |             if os.path.exists(self.cache_path):  # merge entries other processes may have written since load
148 |                 with open(self.cache_path, "r") as f:
149 |                     new_cache = json.load(f)
150 |                 self.cache.update(new_cache)
151 |
152 |             with open(self.cache_path, "w") as f:
153 |                 json.dump(self.cache, f)
154 |
155 |         if self.add_n_embed > 0:
156 |             if os.path.exists(self.embed_cache_path):
157 |                 with open(self.embed_cache_path, "rb") as f:
158 |                     new_cache = pkl.load(f)
159 |                 self.embed_cache.update(new_cache)
160 |
161 |             with open(self.embed_cache_path, "wb") as f:
162 |                 pkl.dump(self.embed_cache, f)
163 |
164 |     def get_bm25_passages(self, topic, query, passages, k):
165 |         if topic in self.embed_cache:
166 |             bm25 = self.embed_cache[topic]
167 |         else:
168 |             bm25 = BM25Okapi([psg["text"].replace("<s>", "").replace("</s>", "").split() for psg in passages])  # strip boundary tokens left from decoding
169 |             self.embed_cache[topic] = bm25
170 |             self.add_n_embed += 1
171 |         scores = bm25.get_scores(query.split())
172 |         indices = np.argsort(-scores)[:k]
173 |         return [passages[i] for i in indices]
174 |
175 |     def get_gtr_passages(self, topic, retrieval_query, passages, k):
176 |         if self.encoder is None:
177 |             self.load_encoder()
178 |         if topic in self.embed_cache:
179 |             passage_vectors = self.embed_cache[topic]
180 |         else:
181 |             inputs = [psg["title"] + " " + psg["text"].replace("<s>", "").replace("</s>", "") for psg in passages]
182 |             passage_vectors = self.encoder.encode(inputs, batch_size=self.batch_size, device=self.encoder.device)
183 |             self.embed_cache[topic] = passage_vectors
184 |             self.add_n_embed += 1
185 |         query_vectors = self.encoder.encode([retrieval_query],
186 |                                             batch_size=self.batch_size,
187 |                                             device=self.encoder.device)[0]
188 |         scores = np.inner(query_vectors, passage_vectors)
189 |         indices = np.argsort(-scores)[:k]
190 |         return [passages[i] for i in indices]
191 |
192 |     def get_passages(self, topic, question, k):
193 |         retrieval_query = topic + " " + question.strip()
194 |         cache_key = topic + "#" + retrieval_query
195 |
196 |         if cache_key not in self.cache:
197 |             passages = self.db.get_text_from_title(topic)
198 |             if self.retrieval_type=="bm25":
199 |                 self.cache[cache_key] = self.get_bm25_passages(topic, retrieval_query, passages, k)
200 |             else:
201 |                 self.cache[cache_key] = self.get_gtr_passages(topic, retrieval_query, passages, k)
202 |             assert len(self.cache[cache_key]) in [k, len(passages)]
203 |             self.add_n += 1
204 |
205 |
206 |         return self.cache[cache_key]
207 |
208 |
209 |
210 |
211 |
--------------------------------------------------------------------------------
/src/factscore_package/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | import torch
7 |
8 | def assert_all_approx_close(a, b, rtol, atol, count):
9 |
10 |     idx = torch.isclose(a.float(), b.float(), rtol, atol)
11 |     sumval = (idx==0).sum().item()
12 |     if sumval > count:
13 |         print(f'Too many values not close: assert {sumval} < {count}')
14 |         try:
15 |             torch.testing.assert_allclose(a, b, rtol, atol)  # deprecated in newer torch; torch.testing.assert_close is the replacement
16 |         except Exception as e:
17 |             print(e)
18 |
19 |
20 | def get_memory_footprint(model, return_buffers=True):
21 |     """
22 |     Get the memory footprint of a model. This will return the memory footprint of the current model in bytes.
23 |     Useful to benchmark the memory footprint of the current model and design some tests. Solution inspired from the
24 |     PyTorch discussions: https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822/2
25 |     Arguments:
26 |         return_buffers (`bool`, *optional*, defaults to `True`):
27 |             Whether to return the size of the buffer tensors in the computation of the memory footprint. Buffers
28 |             are tensors that do not require gradients and are not registered as parameters. E.g. mean and std in batch
29 |             norm layers. Please see: https://discuss.pytorch.org/t/what-pytorch-means-by-buffers/120266/2
30 |     """
31 |     mem = sum([param.nelement() * param.element_size() for param in model.parameters()])
32 |     if return_buffers:
33 |         mem_bufs = sum([buf.nelement() * buf.element_size() for buf in model.buffers()])
34 |         mem = mem + mem_bufs
35 |     return mem
36 |
37 |
38 | def ـreplace_linear_with_int8linear(model, modules_to_not_convert="lm_head"):
39 |     for name, module in model.named_children():
40 |         ـreplace_linear_with_int8linear(module, modules_to_not_convert)
41 |
42 |         if isinstance(module, torch.nn.Linear) and name != modules_to_not_convert:
43 |             model._modules[name] = QuantizedLinearInt8(linear_layer=module)
44 |     return
45 |
46 |
47 | class QuantizedLinearInt8(torch.nn.Module):
48 |     '''
49 |     A simple but effective implementation of int8 quantization for linear layers.
50 |     The weights are quantized and stored as Int8, which saves ~50% of the gpu memory.
51 |     During the forward pass, the weights are de-quantized back to fp16 to do multiplication.
52 |     Pros:
53 |         - saves ~50% of the gpu memory
54 |         - accurate quantization because only the weights are quantized, and the weights don't suffer
55 |           from the "outliers" issue mentioned in the LLM.int8 paper; only the activations do.
56 |         - high precision results because the multiplication is done in fp16
57 |         - much faster than LLM.int8
58 |     Cons:
59 |         - a bit slower because of the added computation of dequantization in each forward pass. In practice, the slowdown
60 |           is not large because in the generation application, gpu utilization is not very high.
61 | ''' 62 | def __init__(self, linear_layer): 63 | super().__init__() 64 | self.bias = linear_layer.bias 65 | 66 | weight_bit_width = 8 67 | weight = linear_layer.weight 68 | 69 | self.weight_scale = torch.nn.Parameter( 70 | (weight.abs().max(dim=-1).values / ((2 ** (weight_bit_width - 1)) - 1)).half(), 71 | ) 72 | # print(self.weight_scale.max().item(), self.weight_scale.min().item(), self.weight_scale.mean().item()) 73 | # if self.weight_scale.max().item() > 0.002: 74 | # print(self.weight_scale.max().item()) 75 | self.weight = torch.nn.Parameter( 76 | torch.round(weight.float() / self.weight_scale[:, None]).char(), 77 | requires_grad=False 78 | ) 79 | 80 | def forward(self, x): 81 | weight = self.weight.half() * self.weight_scale[:, None] 82 | return torch.nn.functional.linear(x, weight, self.bias) 83 | 84 | 85 | def convert_model_to_int8_on_gpu(model, device): 86 | """ 87 | Quantize a model to int8 and move it to GPU using a simple method. 88 | """ 89 | if 'cuda' not in device: 90 | raise ValueError(f"Target device should be a gpu. Device {device} is not supported") 91 | 92 | model.half() 93 | 94 | memory_before_quantization = get_memory_footprint(model) # without lm_head 95 | 96 | ـreplace_linear_with_int8linear(model) # replace `Linear` with `QuantizedLinearInt8` 97 | 98 | model.to(device=device) 99 | memory_after_quantization = get_memory_footprint(model) # without lm_head 100 | 101 | saving = round(100 * memory_after_quantization/memory_before_quantization) 102 | memory_before_quantization = round(memory_before_quantization / 2**30, 2) # rounding for printing 103 | memory_after_quantization = round(memory_after_quantization / 2**30, 2) # rounding for printing 104 | 105 | print(f'Quantization memory - before: {memory_before_quantization} GB, after: {memory_after_quantization} GB ({saving}% of the size before)') 106 | return model 107 | -------------------------------------------------------------------------------- /src/interface.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from functools import partial 3 | import gradio as gr 4 | import torch 5 | from peft import PeftModel 6 | from transformers import ( 7 | AutoModelForCausalLM, 8 | AutoTokenizer, 9 | GenerationConfig, 10 | LlamaTokenizer, 11 | LlamaForCausalLM 12 | ) 13 | 14 | parser = argparse.ArgumentParser() 15 | parser.add_argument("--model_name_or_path", type=str, required=True) 16 | parser.add_argument("--ckpt_path", type=str, default=None) 17 | parser.add_argument("--use_lora", action="store_true") 18 | parser.add_argument("--llama", action="store_true") 19 | parser.add_argument("--base_port", default=17860, type=int) 20 | parser.add_argument("--use_raw_prompt", action="store_true") 21 | args = parser.parse_args() 22 | 23 | 24 | def generate_prompt(input_text): 25 | if not args.use_raw_prompt: 26 | return f"Human: \n{input_text}\n\nAssistant: \n" 27 | else: 28 | return input_text 29 | 30 | 31 | def evaluate( 32 | model, 33 | tokenizer, 34 | input: str, 35 | temperature=0.1, 36 | top_p=0.75, 37 | top_k=40, 38 | num_beams=4, 39 | do_sample=False, 40 | max_new_tokens=128, 41 | min_new_tokens=1, 42 | repetition_penalty=1.2, 43 | ): 44 | prompt = generate_prompt(input) 45 | inputs = tokenizer(prompt, add_special_tokens=False, return_tensors="pt") 46 | 47 | input_ids = inputs["input_ids"].to(getattr(model, "module", model).device) 48 | generation_config = GenerationConfig( 49 | temperature=temperature, 50 | top_p=top_p, 51 | top_k=top_k, 52 | num_beams=num_beams, 
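        # the fields below pin the tokenizer's special-token ids and the length
        # bounds; with do_sample=False (the form's default), num_beams>1 yields
        # deterministic beam search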
53 |         bos_token_id=tokenizer.bos_token_id,
54 |         eos_token_id=tokenizer.eos_token_id,
55 |         pad_token_id=tokenizer.pad_token_id,
56 |         max_new_tokens=max_new_tokens,  # max_length=max_new_tokens+input_sequence
57 |         min_new_tokens=min_new_tokens,  # min_length=min_new_tokens+input_sequence
58 |         repetition_penalty=repetition_penalty,
59 |         do_sample=do_sample,
60 |     )
61 |     with torch.no_grad():
62 |         generation_output = model.generate(
63 |             input_ids=input_ids,
64 |             generation_config=generation_config,
65 |             return_dict_in_generate=True,
66 |             output_scores=False,
67 |         )
68 |     output = generation_output.sequences[0]
69 |     output = tokenizer.decode(
70 |         output,
71 |         skip_special_tokens=True
72 |     )[len(prompt):].strip()
73 |     return output
74 |
75 |
76 | if __name__ == "__main__":
77 |     load_type = torch.float16  # sometimes torch.float32 is needed instead
78 |     if args.ckpt_path is None or args.ckpt_path == '':
79 |         args.ckpt_path = args.model_name_or_path
80 |
81 |     if args.llama:
82 |         tokenizer = LlamaTokenizer.from_pretrained(args.model_name_or_path)
83 |         tokenizer.add_special_tokens(
84 |             {   # standard Llama special-token strings
85 |                 "bos_token": "<s>",
86 |                 "eos_token": "</s>",
87 |                 "unk_token": "<unk>",
88 |                 "pad_token": "<pad>",
89 |             }
90 |         )
91 |     else:
92 |         tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
93 |
94 |     print("loading model...")
95 |
96 |     if args.llama:
97 |         model = LlamaForCausalLM.from_pretrained(args.ckpt_path, torch_dtype=load_type, device_map='auto')
98 |         model.config.use_flash_attention = True
99 |         model.config.pad_token_id = 0
100 |         model.config.eos_token_id = 2
101 |     else:
102 |         model = AutoModelForCausalLM.from_pretrained(args.ckpt_path, torch_dtype=load_type, device_map='auto')
103 |
104 |     # peft model
105 |     if args.use_lora:
106 |         model = PeftModel.from_pretrained(model, args.ckpt_path, torch_dtype=load_type)
107 |
108 |     if not torch.cuda.is_available():
109 |         device = torch.device("cpu")
110 |         model.float()
111 |         model.to(device)
112 |
113 |     model.eval()
114 |
115 |     print("Model loaded successfully")
116 |     # https://gradio.app/docs/
117 |     gr.Interface(
118 |         fn=partial(evaluate, model, tokenizer),
119 |         inputs=[
120 |             gr.components.Textbox(
121 |                 lines=2, label="Input", placeholder="Welcome to the FinMA model"
122 |             ),
123 |             gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Temperature"),
124 |             gr.components.Slider(minimum=0, maximum=1, value=0.75, label="Top p"),
125 |             gr.components.Slider(
126 |                 minimum=0, maximum=100, step=1, value=40, label="Top k"
127 |             ),
128 |             gr.components.Slider(
129 |                 minimum=1, maximum=4, step=1, value=1, label="Beams Number"
130 |             ),
131 |             gr.components.Checkbox(value=False, label="Do sample"),
132 |             gr.components.Slider(
133 |                 minimum=1, maximum=2000, step=10, value=512, label="Max New Tokens"
134 |             ),
135 |             gr.components.Slider(
136 |                 minimum=1, maximum=300, step=10, value=1, label="Min New Tokens"
137 |             ),
138 |             gr.components.Slider(
139 |                 minimum=1.0,
140 |                 maximum=2.0,
141 |                 step=0.1,
142 |                 value=1.2,
143 |                 label="Repetition Penalty",
144 |             ),
145 |         ],
146 |         outputs=[
147 |             gr.components.Textbox(
148 |                 lines=25,
149 |                 label="Output",
150 |             )
151 |         ],
152 |         title="FinMA: Financial Large Language Model",
153 |     ).queue().launch(
154 |         share=True, server_name="0.0.0.0", server_port=args.base_port
155 |     )
--------------------------------------------------------------------------------
/src/model_prompt.py:
--------------------------------------------------------------------------------
1 | def no_prompt(ctx):
2 |     return ctx
3 |
4 | def finma_prompt(ctx):
5 |     return f'Human: \n{ctx}\n\nAssistant: \n'
6 |
7 | MODEL_PROMPT_MAP = {
8 |     "no_prompt": no_prompt,
9 |     "finma_prompt": finma_prompt
10 | }
--------------------------------------------------------------------------------
/src/tasks/__init__.py:
--------------------------------------------------------------------------------
1 | from pprint import pprint
2 | from typing import List, Union
3 |
4 | import json
5 | import lm_eval.base
6 |
7 | from . import flare
8 |
9 | TASK_REGISTRY = {
10 |     "flare_es_financees": flare.ESFINANCEES,
11 |     "flare_es_multifin": flare.ESMultiFin,
12 |     "flare_es_efp": flare.ESEFP,
13 |     "flare_es_efpa": flare.ESEFPA,
14 |     "flare_es_fns": flare.ESFNS,
15 |     "flare_es_tsa": flare.ESTSA,
16 |     "flare_fpb": flare.FPB,
17 |     "flare_fiqasa": flare.FIQASA,
18 |     "flare_ner": flare.NER,
19 |     "flare_finqa": flare.FinQA,
20 |     "flare_convfinqa": flare.ConvFinQA,
21 |     "flare_headlines": flare.Headlines,
22 |     "flare_finer_ord": flare.FinerOrd,
23 |     "flare_fomc": flare.FOMC,
24 |     "flare_german": flare.German,
25 |     "flare_australian": flare.Australian,
26 |     # "flare_fomc": flare.FOMC,  # duplicate key; already registered above
27 |     "flare_ectsum": flare.ECTSUM,
28 |     "flare_edtsum": flare.EDTSUM,
29 |     # "flare_finarg_ecc_auc": flare.FinargECCAUC,  # duplicate key; the FINARGECCAUC entry below takes precedence
30 |     # "flare_finarg_ecc_arc": flare.FinargECCARC,  # duplicate key; the FINARGECCARC entry below takes precedence
31 |     "flare_cd": flare.CD,
32 |     "flare_multifin_en": flare.MultiFinEN,
33 |     "flare_tsa": flare.TSA,
34 |     "flare_cfa": flare.CFA,
35 |     "flare_ma": flare.MA,
36 |     "flare_causal20_sc": flare.Causal20SC,
37 |     "flare_finarg_ecc_arc": flare.FINARGECCARC,
38 |     "flare_finarg_ecc_auc": flare.FINARGECCAUC,
39 |     "flare_mlesg": flare.MLESG,
40 |     "flare_fnxl": flare.FNXL,
41 |     "flare_fsrl": flare.FSRL,
42 |     "flare_tatqa": flare.TATQA,
43 |     "flare_finred": flare.FinRED,
44 |     "flare_cra_lendingclub": flare.lendingclub,
45 |     "flare_cra_ccf": flare.ccf,
46 |     "flare_cra_ccfraud": flare.ccfraud,
47 |     "flare_cra_polish": flare.polish,
48 |     "flare_cra_taiwan": flare.taiwan,
49 |     "flare_cra_portoseguro": flare.portoseguro,
50 |     "flare_cra_travelinsurace": flare.travelinsurace,  # sic
51 |     "flare_sm_bigdata": flare.StockMovementBigData,
52 |     "flare_sm_acl": flare.StockMovementACL,
53 |     "flare_sm_cikm": flare.StockMovementCIKM,
54 |     "flare_en_finterm": flare.FINTERM,
55 |     "flare_en_acronym": flare.ACRONYM,
56 |     **flare.SM_TASKS,
57 |     "flare_finarg_ecc_auc_test": flare.FINARGECCAUC_test,
58 |     "flare_edtsum_test": flare.EDTSUM_test,
59 | }
60 |
61 | ALL_TASKS = sorted(list(TASK_REGISTRY))
62 |
63 | _EXAMPLE_JSON_PATH = "split:key:/absolute/path/to/data.json"
64 |
65 |
66 | def add_json_task(task_name):
67 |     """Add a JSON perplexity task if the given task name matches the
68 |     JSON task specification.
69 |
70 |     See `json.JsonPerplexity`.
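    For example (hypothetical path), "json=train:text:/data/my_corpus.json"
    registers a perplexity task over the "text" key of the "train" split.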
71 |     """
72 |     if not task_name.startswith("json"):
73 |         return
74 |
75 |     def create_json_task():
76 |         splits = task_name.split("=", 1)
77 |         if len(splits) != 2 or not splits[1]:
78 |             raise ValueError(
79 |                 "json tasks need a path argument pointing to the local "
80 |                 "dataset, specified like this: json="
81 |                 + _EXAMPLE_JSON_PATH
82 |                 + ' (if there are no splits, use "train")'
83 |             )
84 |
85 |         json_path = splits[1]
86 |         if json_path == _EXAMPLE_JSON_PATH:
87 |             raise ValueError(
88 |                 "please do not copy the example path directly, but substitute "
89 |                 "it with a path to your local dataset"
90 |             )
91 |         return lambda: json.JsonPerplexity(json_path)  # note: the stdlib `json` module has no JsonPerplexity; an lm-eval json task module is expected here
92 |
93 |     TASK_REGISTRY[task_name] = create_json_task()
94 |
95 |
96 | def get_task(task_name):
97 |     try:
98 |         add_json_task(task_name)
99 |         return TASK_REGISTRY[task_name]
100 |     except KeyError:
101 |         print("Available tasks:")
102 |         pprint(TASK_REGISTRY)
103 |         raise KeyError(f"Missing task {task_name}")
104 |
105 |
106 | def get_task_name_from_object(task_object):
107 |     for name, class_ in TASK_REGISTRY.items():
108 |         if class_ is task_object:
109 |             return name
110 |
111 |     # this gives a mechanism for non-registered tasks to have a custom name anyway when reporting
112 |     return (
113 |         task_object.EVAL_HARNESS_NAME
114 |         if hasattr(task_object, "EVAL_HARNESS_NAME")
115 |         else type(task_object).__name__
116 |     )
117 |
118 |
119 | def get_task_dict(task_name_list: List[Union[str, lm_eval.base.Task]]):
120 |     task_name_dict = {
121 |         task_name: get_task(task_name)()
122 |         for task_name in task_name_list
123 |         if isinstance(task_name, str)
124 |     }
125 |     task_name_from_object_dict = {
126 |         get_task_name_from_object(task_object): task_object
127 |         for task_object in task_name_list
128 |         if not isinstance(task_object, str)
129 |     }
130 |     assert set(task_name_dict.keys()).isdisjoint(set(task_name_from_object_dict.keys()))
131 |     return {**task_name_dict, **task_name_from_object_dict}
--------------------------------------------------------------------------------
/src/tasks/utils.py:
--------------------------------------------------------------------------------
1 | def process_text(entity_string, text):
2 |     # Initialize
3 |     entity_list = [(", ".join(val.split(", ")[:-1]), val.split(", ")[-1]) for val in entity_string.split("\n")]
4 |     text_words = text.split()
5 |     labels = ['O'] * len(text_words)
6 |     # text_lower = text.lower()
7 |     text_lower = text  # lowering is disabled above, so matching stays case-sensitive
8 |
9 |     # Create a list to store the start index of each word
10 |     word_indices = [0]
11 |     for word in text_words[:-1]:
12 |         word_indices.append(word_indices[-1] + len(word) + 1)
13 |
14 |     # Iterate over the entity list
15 |     print (entity_list)  # debug output
16 |     for entity, entity_type in entity_list:
17 |         entity_words = entity.split()
18 |         entity_lower = entity
19 |
20 |         # Find start and end index of each occurrence of the entity in the text
21 |         start = 0
22 |         while True:
23 |             start = text_lower.find(entity_lower, start)
24 |             if not entity or start == -1: break  # No more occurrences
25 |             end = start + len(entity) - 1
26 |
27 |             # Find the words included in this occurrence
28 |             try:
29 |                 start_word = next(i for i, ind in enumerate(word_indices) if ind >= start)
30 |                 end_word = next(i for i, ind in enumerate(word_indices) if ind > end)
31 |
32 |                 # Label the words
33 |                 labels[start_word] = 'B-' + entity_type
34 |                 for i in range(start_word+1, end_word):
35 |                     labels[i] = 'I-' + entity_type
36 |
37 |             except Exception:
38 |                 pass
39 |             # Move to the next character after the occurrence
40 |             start = end + 1
41 |
42 |     return labels
43 |
--------------------------------------------------------------------------------
/src/tasks/zhutils.py:
--------------------------------------------------------------------------------
1 | def process_zhtext(entity_string, text):
2 |     # Initialize
3 |     name = entity_string.split(',')[0]
4 |     if len(entity_string.split(',')) > 1 and entity_string.split(',')[1]:
5 |         entity_type = entity_string.split(',')[1].strip()
6 |     else:
7 |         entity_type = 0  # fallback when no type is given; note that 'B-' + entity_type below assumes a string
8 |     formatted_name = ' '.join(list(name))
9 |     formatted_result = f"{formatted_name}, {entity_type}"
10 |
11 |     entity_list = [(", ".join(val.split(", ")[:-1]), val.split(", ")[-1]) for val in formatted_result.split("\n")]
12 |     text_words = text.split()
13 |     labels = ['O'] * len(text_words)
14 |     text_lower = text
15 |
16 |     # Create a list to store the start index of each word
17 |     word_indices = [0]
18 |     for word in text_words[:-1]:
19 |         word_indices.append(word_indices[-1] + len(word) + 1)
20 |
21 |     # Iterate over the entity list
22 |     print ("entity_list:",entity_list)  # debug output
23 |     for entity, entity_type in entity_list:
24 |         entity_words = entity.split()
25 |         entity_lower = entity
26 |         # print ("entity_lower:", entity_lower)
27 |
28 |         # Find start and end index of each occurrence of the entity in the text
29 |         start = 0
30 |         while True:
31 |             start = text_lower.find(entity_lower, start)
32 |             if not entity or start == -1: break  # No more occurrences
33 |             end = start + len(entity) - 1
34 |
35 |             # Find the words included in this occurrence
36 |             try:
37 |                 start_word = next(i for i, ind in enumerate(word_indices) if ind >= start)
38 |                 end_word = next(i for i, ind in enumerate(word_indices) if ind > end)
39 |
40 |                 # Label the words
41 |                 labels[start_word] = 'B-' + entity_type
42 |                 for i in range(start_word+1, end_word):
43 |                     labels[i] = 'I-' + entity_type
44 |
45 |             except Exception:
46 |                 pass
47 |             # Move to the next character after the occurrence
48 |             start = end + 1
49 |
50 |     return labels
51 |
--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
1 | import time
2 | from typing import Any, List
3 | import copy
4 | from gradio_client import Client
5 | from tqdm import tqdm
6 |
7 |
8 | class MultiClient(object):
9 |     def __init__(self, worker_addrs, synced_worker=False) -> None:
10 |         self.clients = [Client(addr) for addr in worker_addrs]
11 |         self.synced_worker = synced_worker
12 |
13 |     def predict(self, tasks: List[List], max_retries: int = 3) -> List[Any]:
14 |         assert len(tasks) >= 1, "No predict tasks!"
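        # Dispatch scheme: each Gradio worker starts with one task; as a job
        # finishes (or exhausts max_retries), the freed client immediately picks
        # up the next unsubmitted task, keeping the worker pool saturated.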
15 | num_tasks = len(tasks) 16 | if self.synced_worker and len(tasks) % len(self.clients) != 0: 17 | num_dummy_tasks = len(self.clients) - len(tasks) % len(self.clients) 18 | tasks.extend([copy.deepcopy(tasks[-1]) for _ in range(num_dummy_tasks)]) 19 | 20 | pbar = tqdm(total=len(tasks)) 21 | jobs = { 22 | client: (i, client.submit(*(tasks[i]), api_name="/predict")) 23 | for i, client in enumerate(self.clients) 24 | if i < len(tasks) 25 | } 26 | results = {} 27 | retries = {i: 0 for i in range(len(tasks))} 28 | 29 | while jobs: 30 | for client, (i, job) in list(jobs.items()): 31 | if job.done(): 32 | pbar.update(1) 33 | del jobs[client] 34 | try: 35 | result = job.result() 36 | results[i] = result 37 | except Exception as e: 38 | print("Job failed with error:", e) 39 | if retries[i] < max_retries: 40 | print("Retrying job...") 41 | retries[i] += 1 42 | new_job = client.submit( 43 | *tasks[i], api_name="/predict") 44 | jobs[client] = (i, new_job) 45 | continue # Skip the rest of the loop 46 | else: 47 | results[i] = None 48 | 49 | new_i = len(results) + len(jobs) 50 | if new_i < len(tasks): 51 | new_task = tasks[new_i] 52 | new_job = client.submit( 53 | *new_task, api_name="/predict") 54 | jobs[client] = (new_i, new_job) 55 | time.sleep(1) 56 | pbar.close() 57 | 58 | predicts = [results[i] for i in range(num_tasks)] 59 | 60 | return predicts 61 | -------------------------------------------------------------------------------- /static/av.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-FinAI/PIXIU/da45ac467ca3a828621315881e33c68bca3bbb1f/static/av.jpg -------------------------------------------------------------------------------- /static/cr.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-FinAI/PIXIU/da45ac467ca3a828621315881e33c68bca3bbb1f/static/cr.jpg -------------------------------------------------------------------------------- /static/formula.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-FinAI/PIXIU/da45ac467ca3a828621315881e33c68bca3bbb1f/static/formula.jpg -------------------------------------------------------------------------------- /static/md.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-FinAI/PIXIU/da45ac467ca3a828621315881e33c68bca3bbb1f/static/md.jpg -------------------------------------------------------------------------------- /static/sr.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/The-FinAI/PIXIU/da45ac467ca3a828621315881e33c68bca3bbb1f/static/sr.jpg --------------------------------------------------------------------------------
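The retrieval pieces above compose into a small pipeline: DocDB chunks a JSONL knowledge source (one object per line with `title` and `text` fields) into SQLite-backed passages, and Retrieval ranks a title's passages against a query. A minimal sketch of the flow, assuming such a JSONL file exists; the `finterms.db` path, the example topic, and the query string are illustrative, not artifacts shipped by the repo:

from src.factscore_package.retrieval import DocDB, Retrieval

# Build (or reopen) the SQLite passage store; build_db() splits every document
# into passages of at most MAX_LENGTH (256) RoBERTa tokens.
db = DocDB(db_path=".cache/finterms.db",
           data_path="src/factscore_package/.cache/finterms.jsonl")

# BM25 retrieval over the passages of a single title; ranked results and
# per-topic BM25 indices are persisted by save_cache().
retrieval = Retrieval(db,
                      cache_path=".cache/retrieval-finterms.json",
                      embed_cache_path=".cache/retrieval-finterms.pkl",
                      retrieval_type="bm25")

# Fetch the top-3 passages of the "Arbitrage" entry for a candidate fact.
passages = retrieval.get_passages(
    "Arbitrage", "Arbitrage is the simultaneous purchase and sale of the same asset.", k=3)
for psg in passages:
    print(psg["title"], "->", psg["text"][:80])

retrieval.save_cache()

Passing retrieval_type="gtr-t5-large" (with a batch_size) would instead rank passages by dense sentence-transformer similarity, as implemented in get_gtr_passages above.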