├── LICENSE ├── README.md ├── data_processing ├── make_data_sentence.ipynb ├── make_data_set_cnn.ipynb ├── make_data_set_mrc.ipynb ├── make_data_set_multi_news.ipynb ├── make_data_wikisum.ipynb └── shuffle_for_data.ipynb ├── evaluation.ipynb ├── evaluation.py ├── evaluation_sum.py ├── inference.ipynb ├── qwen ├── qwen_inference.py ├── qwen_inference_lora.py └── qwen_train.py ├── requirements.txt ├── run.sh ├── run_train.sh ├── source ├── __pycache__ │ ├── modeling_qwen2_inf.cpython-38.pyc │ ├── modeling_qwen2_mean.cpython-38.pyc │ ├── modeling_qwen2_pn.cpython-38.pyc │ ├── modeling_qwen2_pn_2.cpython-38.pyc │ └── modeling_qwen2_pn_test.cpython-38.pyc ├── inference_1108.py ├── inference_baseline.py ├── inference_mean.py ├── inference_origin.py ├── inference_pn.py ├── inference_pn_att_1106.py ├── inference_pn_att_1106_sum.py ├── inference_pn_att_1107.py ├── inference_pn_att_1107_sum.py ├── inference_upper.py ├── modeling_qwen2_.py ├── modeling_qwen2_mean.py ├── modeling_qwen2_pn.py ├── modeling_qwen2_pn_att_1106.py ├── modeling_qwen2_pn_att_1106_lmhead.py ├── modeling_qwen2_pn_att_1106_sum.py ├── modeling_qwen2_pn_att_1107.py ├── modeling_qwen2_pn_att_1107_baseline.py ├── modeling_qwen2_pn_att_1107_sum.py ├── modeling_qwen2_pn_att_1107_upper.py ├── train_mean.py ├── train_origin.py ├── train_pn.py ├── train_pn_2step.py ├── train_pn_att.py ├── train_pn_noloss.py ├── train_pn_yesloss.py └── train_upper.py ├── test.ipynb ├── tmp.ipynb └── train.ipynb /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Kyubeen Han 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # XAI_rationale-inference-LLM -------------------------------------------------------------------------------- /data_processing/make_data_sentence.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 20, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import json\n", 10 | "file_path = \"../data/1010data/train_data_1011.json\"\n", 11 | "with open(file_path, 'r', encoding='utf-8') as f:\n", 12 | " data = json.load(f)" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 21, 18 | "metadata": {}, 19 | "outputs": [ 20 | { 21 | "name": "stderr", 22 | "output_type": "stream", 23 | "text": [ 24 | "[nltk_data] Downloading package punkt to /home/rbqlsquf2/nltk_data...\n", 25 | "[nltk_data] Package punkt is already up-to-date!\n" 26 | ] 27 | } 28 | ], 29 | "source": [ 30 | "import nltk\n", 31 | "nltk.download('punkt') # NLTK에서 사용하는 토크나이저 데이터 다운로드\n", 32 | "\n", 33 | "from nltk.tokenize import sent_tokenize\n", 34 | "\n", 35 | "def split_sentences_nltk(text):\n", 36 | " sentences = sent_tokenize(text)\n", 37 | " return sentences" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 25, 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "name": "stdout", 47 | "output_type": "stream", 48 | "text": [ 49 | "Like every year, the 2012 Emmy Awards had its fair mix of jaw-dropping surprises, predictable winners and off-the-cut moments.From Julia Louis-Drefyus‘ hilarious acceptance speech to Jon Cryer’s head-scratching win for Best Comedy Actor, Celebuzz has compiled a list of the best and worst moments from Sunday’s telecast.What will everyone be talking about on Monday?Have a look at the eight top moments in our gallery, above, then share your own thoughts on the ceremony in the comments.For a full recap of the night’s big winners, click here.During the 64th Primetime Emmy Awards, host Jimmy Kimmel urged viewers to take to Twitter and type about comedian Tracy Morgan passing out on stage, instantly sparking 25,000 tweets.\n", 50 | "\"OMG.\n" 51 | ] 52 | } 53 | ], 54 | "source": [ 55 | "text = \"Like every year, the 2012 Emmy Awards had its fair mix of jaw-dropping surprises, predictable winners and off-the-cut moments.From Julia Louis-Drefyus‘ hilarious acceptance speech to Jon Cryer’s head-scratching win for Best Comedy Actor, Celebuzz has compiled a list of the best and worst moments from Sunday’s telecast.What will everyone be talking about on Monday?Have a look at the eight top moments in our gallery, above, then share your own thoughts on the ceremony in the comments.For a full recap of the night’s big winners, click here.During the 64th Primetime Emmy Awards, host Jimmy Kimmel urged viewers to take to Twitter and type about comedian Tracy Morgan passing out on stage, instantly sparking 25,000 tweets.\\\"OMG.\"\n", 56 | "sentences = split_sentences_nltk(text)\n", 57 | "print(sentences[0])\n", 58 | "print(sentences[1])" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 26, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "def preprocess_punctuation(text):\n", 68 | " # 구두점이 반복되는 경우 하나의 구두점으로 축소\n", 69 | " text = re.sub(r'([!?])\\1+', r'\\1', text)\n", 70 | " return text\n", 71 | "\n", 72 | "def 
split_sentences_nltk_with_punctuation(text):\n", 73 | " # 구두점 전처리\n", 74 | " processed_text = preprocess_punctuation(text)\n", 75 | " # NLTK의 sent_tokenize 사용\n", 76 | " sentences = sent_tokenize(processed_text)\n", 77 | " return sentences" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 27, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "name": "stderr", 87 | "output_type": "stream", 88 | "text": [ 89 | "100%|██████████| 90000/90000 [01:07<00:00, 1328.13it/s]\n" 90 | ] 91 | } 92 | ], 93 | "source": [ 94 | "from tqdm import tqdm\n", 95 | "import re\n", 96 | "for d in tqdm(data):\n", 97 | " d[\"sent\"] = split_sentences_nltk_with_punctuation(d[\"document\"])" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 33, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "data": { 107 | "text/plain": [ 108 | "['Like every year, the 2012 Emmy Awards had its fair mix of jaw-dropping surprises, predictable winners and off-the-cut moments.From Julia Louis-Drefyus‘ hilarious acceptance speech to Jon Cryer’s head-scratching win for Best Comedy Actor, Celebuzz has compiled a list of the best and worst moments from Sunday’s telecast.What will everyone be talking about on Monday?Have a look at the eight top moments in our gallery, above, then share your own thoughts on the ceremony in the comments.For a full recap of the night’s big winners, click here.During the 64th Primetime Emmy Awards, host Jimmy Kimmel urged viewers to take to Twitter and type about comedian Tracy Morgan passing out on stage, instantly sparking 25,000 tweets.\"OMG.',\n", 109 | " 'Tracy Morgan just passed out on stage at the #Emmys.',\n", 110 | " 'Turn ABC on right now.\"',\n", 111 | " '~ @jimmykimmel, telling viewers what to tweet.',\n", 112 | " '— Brian A. Hernandez (@BAHjournalist) September 24, 2012Oh my God - Tracy Morgan just passed out onstage at the Emmys - turn ABC on NOW!\"',\n", 113 | " \"— Jimmy Kimmel (@jimmykimmel) September 24, 2012Of course, Morgan didn't really pass out, but he did lie on the stage.\",\n", 114 | " 'Morgan even stayed on his back when the award for Best Writing for a Drama was announced and the Homeland winners got on stage.',\n", 115 | " 'Kimmel, who had teased about the prank all week, pulled the stunt to attract more viewers to tune into ABC.Within minutes, \"OMG Tracy Morgan\" was a worldwide trending topic on Twitter:And \"Omg Tracy Morgan\" is trending worldwide.',\n", 116 | " \"#Emmys — Brian A. Hernandez (@BAHjournalist) September 24, 2012And Twitter's TV account shared these statistics:OMG 25,000 tweets instantly thanks to OMG TRacy Morgan #Emmys — thanks @jimmykimmel !\",\n", 117 | " '!',\n", 118 | " '— Twitter TV (@twittertv) September 24, 2012Not everyone enjoyed the stunt, though, as actor Omar Epps notes:Well..',\n", 119 | " \"The consensus on my timeline is ya'll aren't feeling the Tracey Morgan stunt.. 
Point taken tweeps!\",\n", 120 | " \"#Emmys — OMAR EPPS (@omarepps) September 24, 2012Tracy Morgan #Emmys stunt like that #Survivor who lied about Granny's death for pity vote: got them far but big turn off if you fell for it.\",\n", 121 | " '— Jose m iniguez (@imJmi) September 24, 2012Photo from Twitter user @MSquareENt, GIF from Daily Dot and thumbnail from Art Streiber/NBC']" 122 | ] 123 | }, 124 | "execution_count": 33, 125 | "metadata": {}, 126 | "output_type": "execute_result" 127 | } 128 | ], 129 | "source": [ 130 | "sent_tokenize(data[0][\"document\"])" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 34, 136 | "metadata": {}, 137 | "outputs": [ 138 | { 139 | "data": { 140 | "text/plain": [ 141 | "['Like every year, the 2012 Emmy Awards had its fair mix of jaw-dropping surprises, predictable winners and off-the-cut moments.From Julia Louis-Drefyus‘ hilarious acceptance speech to Jon Cryer’s head-scratching win for Best Comedy Actor, Celebuzz has compiled a list of the best and worst moments from Sunday’s telecast.What will everyone be talking about on Monday?Have a look at the eight top moments in our gallery, above, then share your own thoughts on the ceremony in the comments.For a full recap of the night’s big winners, click here.During the 64th Primetime Emmy Awards, host Jimmy Kimmel urged viewers to take to Twitter and type about comedian Tracy Morgan passing out on stage, instantly sparking 25,000 tweets.\"OMG.',\n", 142 | " 'Tracy Morgan just passed out on stage at the #Emmys.',\n", 143 | " 'Turn ABC on right now.\"',\n", 144 | " '~ @jimmykimmel, telling viewers what to tweet.',\n", 145 | " '— Brian A. Hernandez (@BAHjournalist) September 24, 2012Oh my God - Tracy Morgan just passed out onstage at the Emmys - turn ABC on NOW!\"',\n", 146 | " \"— Jimmy Kimmel (@jimmykimmel) September 24, 2012Of course, Morgan didn't really pass out, but he did lie on the stage.\",\n", 147 | " 'Morgan even stayed on his back when the award for Best Writing for a Drama was announced and the Homeland winners got on stage.',\n", 148 | " 'Kimmel, who had teased about the prank all week, pulled the stunt to attract more viewers to tune into ABC.Within minutes, \"OMG Tracy Morgan\" was a worldwide trending topic on Twitter:And \"Omg Tracy Morgan\" is trending worldwide.',\n", 149 | " \"#Emmys — Brian A. Hernandez (@BAHjournalist) September 24, 2012And Twitter's TV account shared these statistics:OMG 25,000 tweets instantly thanks to OMG TRacy Morgan #Emmys — thanks @jimmykimmel !\",\n", 150 | " '— Twitter TV (@twittertv) September 24, 2012Not everyone enjoyed the stunt, though, as actor Omar Epps notes:Well..',\n", 151 | " \"The consensus on my timeline is ya'll aren't feeling the Tracey Morgan stunt.. 
Point taken tweeps!\",\n", 152 | " \"#Emmys — OMAR EPPS (@omarepps) September 24, 2012Tracy Morgan #Emmys stunt like that #Survivor who lied about Granny's death for pity vote: got them far but big turn off if you fell for it.\",\n", 153 | " '— Jose m iniguez (@imJmi) September 24, 2012Photo from Twitter user @MSquareENt, GIF from Daily Dot and thumbnail from Art Streiber/NBC']" 154 | ] 155 | }, 156 | "execution_count": 34, 157 | "metadata": {}, 158 | "output_type": "execute_result" 159 | } 160 | ], 161 | "source": [ 162 | "\n", 163 | "sent_tokenize(preprocess_punctuation(data[0][\"document\"]))" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 28, 169 | "metadata": {}, 170 | "outputs": [ 171 | { 172 | "data": { 173 | "text/plain": [ 174 | "['Like every year, the 2012 Emmy Awards had its fair mix of jaw-dropping surprises, predictable winners and off-the-cut moments.From Julia Louis-Drefyus‘ hilarious acceptance speech to Jon Cryer’s head-scratching win for Best Comedy Actor, Celebuzz has compiled a list of the best and worst moments from Sunday’s telecast.What will everyone be talking about on Monday?Have a look at the eight top moments in our gallery, above, then share your own thoughts on the ceremony in the comments.For a full recap of the night’s big winners, click here.During the 64th Primetime Emmy Awards, host Jimmy Kimmel urged viewers to take to Twitter and type about comedian Tracy Morgan passing out on stage, instantly sparking 25,000 tweets.\"OMG.',\n", 175 | " 'Tracy Morgan just passed out on stage at the #Emmys.',\n", 176 | " 'Turn ABC on right now.\"',\n", 177 | " '~ @jimmykimmel, telling viewers what to tweet.',\n", 178 | " '— Brian A. Hernandez (@BAHjournalist) September 24, 2012Oh my God - Tracy Morgan just passed out onstage at the Emmys - turn ABC on NOW!\"',\n", 179 | " \"— Jimmy Kimmel (@jimmykimmel) September 24, 2012Of course, Morgan didn't really pass out, but he did lie on the stage.\",\n", 180 | " 'Morgan even stayed on his back when the award for Best Writing for a Drama was announced and the Homeland winners got on stage.',\n", 181 | " 'Kimmel, who had teased about the prank all week, pulled the stunt to attract more viewers to tune into ABC.Within minutes, \"OMG Tracy Morgan\" was a worldwide trending topic on Twitter:And \"Omg Tracy Morgan\" is trending worldwide.',\n", 182 | " \"#Emmys — Brian A. Hernandez (@BAHjournalist) September 24, 2012And Twitter's TV account shared these statistics:OMG 25,000 tweets instantly thanks to OMG TRacy Morgan #Emmys — thanks @jimmykimmel !\",\n", 183 | " '— Twitter TV (@twittertv) September 24, 2012Not everyone enjoyed the stunt, though, as actor Omar Epps notes:Well..',\n", 184 | " \"The consensus on my timeline is ya'll aren't feeling the Tracey Morgan stunt.. 
Point taken tweeps!\",\n", 185 | " \"#Emmys — OMAR EPPS (@omarepps) September 24, 2012Tracy Morgan #Emmys stunt like that #Survivor who lied about Granny's death for pity vote: got them far but big turn off if you fell for it.\",\n", 186 | " '— Jose m iniguez (@imJmi) September 24, 2012Photo from Twitter user @MSquareENt, GIF from Daily Dot and thumbnail from Art Streiber/NBC']" 187 | ] 188 | }, 189 | "execution_count": 28, 190 | "metadata": {}, 191 | "output_type": "execute_result" 192 | } 193 | ], 194 | "source": [ 195 | "data[0][\"sent\"]" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 19, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "file_path = \"../data/1017data/train_sent.json\"\n", 205 | "with open(file_path, 'w', encoding='utf-8') as f:\n", 206 | " json.dump(data, f, ensure_ascii=False, indent=4)" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [] 215 | } 216 | ], 217 | "metadata": { 218 | "kernelspec": { 219 | "display_name": ".venv", 220 | "language": "python", 221 | "name": "python3" 222 | }, 223 | "language_info": { 224 | "codemirror_mode": { 225 | "name": "ipython", 226 | "version": 3 227 | }, 228 | "file_extension": ".py", 229 | "mimetype": "text/x-python", 230 | "name": "python", 231 | "nbconvert_exporter": "python", 232 | "pygments_lexer": "ipython3", 233 | "version": "3.8.10" 234 | } 235 | }, 236 | "nbformat": 4, 237 | "nbformat_minor": 2 238 | } 239 | -------------------------------------------------------------------------------- /data_processing/make_data_set_cnn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/home/rbqlsquf2/workspace/XAI_rationale-inference-LLM/.venv/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", 13 | " from .autonotebook import tqdm as notebook_tqdm\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "from datasets import load_dataset\n", 19 | "\n", 20 | "ds = load_dataset(\"abisee/cnn_dailymail\", \"3.0.0\")" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 7, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "data": { 30 | "text/plain": [ 31 | "'42c027e4ff9730fbb3de84c1af0d2c506e41c3e4'" 32 | ] 33 | }, 34 | "execution_count": 7, 35 | "metadata": {}, 36 | "output_type": "execute_result" 37 | } 38 | ], 39 | "source": [ 40 | "ds[\"train\"][0][\"id\"]" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 6, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "name": "stderr", 50 | "output_type": "stream", 51 | "text": [ 52 | "[nltk_data] Downloading package punkt_tab to\n", 53 | "[nltk_data] /home/rbqlsquf2/nltk_data...\n", 54 | "[nltk_data] Package punkt_tab is already up-to-date!\n" 55 | ] 56 | } 57 | ], 58 | "source": [ 59 | "import nltk\n", 60 | "nltk.download('punkt_tab') # Download the necessary tokenizer data\n", 61 | "from nltk.tokenize import sent_tokenize\n", 62 | "\n", 63 | "def split_into_sentences(text):\n", 64 | " sentences = sent_tokenize(text)\n", 65 | " return sentences\n" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 7, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "data": { 75 | "text/plain": [ 76 | "DatasetDict({\n", 77 | " train: Dataset({\n", 78 | " features: ['article', 'highlights', 'id'],\n", 79 | " num_rows: 287113\n", 80 | " })\n", 81 | " validation: Dataset({\n", 82 | " features: ['article', 'highlights', 'id'],\n", 83 | " num_rows: 13368\n", 84 | " })\n", 85 | " test: Dataset({\n", 86 | " features: ['article', 'highlights', 'id'],\n", 87 | " num_rows: 11490\n", 88 | " })\n", 89 | "})" 90 | ] 91 | }, 92 | "execution_count": 7, 93 | "metadata": {}, 94 | "output_type": "execute_result" 95 | } 96 | ], 97 | "source": [ 98 | "ds" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 8, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "from tqdm import tqdm\n", 108 | "\n", 109 | "def create_example(all_data):\n", 110 | " all_result = []\n", 111 | " for i, data in enumerate(tqdm(all_data)):\n", 112 | " data_id = data[\"id\"]\n", 113 | " summary = data[\"highlights\"].replace(\"\\n\", \" \")\n", 114 | " context = split_into_sentences(data[\"article\"])\n", 115 | " \n", 116 | " result = {}\n", 117 | " result[\"_id\"] = data_id\n", 118 | " result[\"question\"] = \"summary\"\n", 119 | " result[\"document\"] = data[\"article\"]\n", 120 | " result[\"sent\"] = context\n", 121 | " result[\"output\"] = summary\n", 122 | " \n", 123 | " all_result.append(result)\n", 124 | "\n", 125 | " return all_result\n", 126 | " " 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 9, 132 | "metadata": {}, 133 | "outputs": [ 134 | { 135 | "name": "stderr", 136 | "output_type": "stream", 137 | "text": [ 138 | "100%|██████████| 287113/287113 [03:00<00:00, 1591.72it/s]\n" 139 | ] 140 | } 141 | ], 142 | "source": [ 143 | "import json\n", 144 | "\n", 145 | "input_data = create_example(ds['train'])" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 10, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "with open(\"../data/1020data/cnn_train.json\", \"w\", encoding=\"utf-8\") as f:\n", 155 | " json.dump(input_data, f, ensure_ascii=False, indent=4)" 
156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 17, 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "name": "stderr", 165 | "output_type": "stream", 166 | "text": [ 167 | "100%|██████████| 13368/13368 [00:22<00:00, 582.69it/s]\n" 168 | ] 169 | } 170 | ], 171 | "source": [ 172 | "from tqdm import tqdm\n", 173 | "all_len = []\n", 174 | "all_result = []\n", 175 | "\n", 176 | "for input_data_ in tqdm(input_data):\n", 177 | " text = input_data_[\"text\"]\n", 178 | " if len(tokenizer(text)[\"input_ids\"]) <= 2048:\n", 179 | " # data[\"text\"] = data[\"text\"]\n", 180 | " all_result.append(input_data_)\n", 181 | " # all_len.append(len(tokenizer(text)[\"input_ids\"]))\n", 182 | "\n", 183 | "with open(\"../data/qwen_cnn_test_data.json\", \"w\", encoding=\"utf-8\") as f:\n", 184 | " json.dump(all_result, f, ensure_ascii=False, indent=4)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 11, 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "name": "stderr", 194 | "output_type": "stream", 195 | "text": [ 196 | "100%|██████████| 11490/11490 [00:07<00:00, 1619.94it/s]\n", 197 | "100%|██████████| 11490/11490 [00:23<00:00, 498.68it/s]" 198 | ] 199 | }, 200 | { 201 | "name": "stdout", 202 | "output_type": "stream", 203 | "text": [ 204 | "759\n" 205 | ] 206 | }, 207 | { 208 | "name": "stderr", 209 | "output_type": "stream", 210 | "text": [ 211 | "\n" 212 | ] 213 | } 214 | ], 215 | "source": [ 216 | "input_data = create_example(ds['test'])\n", 217 | "\n", 218 | "\n", 219 | "all_len = []\n", 220 | "all_result = []\n", 221 | "over_num = 0\n", 222 | "for input_data_ in tqdm(input_data):\n", 223 | " text = input_data_[\"all_text\"]\n", 224 | " count = len(tokenizer(text)[\"input_ids\"])\n", 225 | " if count <= 2048:\n", 226 | " all_result.append(input_data_)\n", 227 | " else:\n", 228 | " over_len = count - 2048\n", 229 | " input_data_['text'] = input_data_['text'][:over_len]\n", 230 | " over_num +=1\n", 231 | " # all_len.append(len(tokenizer(text)[\"input_ids\"]))\n", 232 | "print(over_num)" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 13, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "with open(\"../data/qwen_cnn_test_data.json\", \"w\", encoding=\"utf-8\") as f:\n", 242 | " json.dump(all_result, f, ensure_ascii=False, indent=4)" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": {}, 249 | "outputs": [], 250 | "source": [] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": {}, 256 | "outputs": [], 257 | "source": [] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 38, 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "for result in all_result:\n", 266 | " result[\"label\"] = \"assistant\\n\" + result[\"label\"]\n", 267 | " " 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 39, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "with open(\"data/qwen_dev_data.json\", \"w\", encoding=\"utf-8\") as f:\n", 277 | " json.dump(all_result, f, ensure_ascii=False, indent=4)" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 14, 283 | "metadata": {}, 284 | "outputs": [ 285 | { 286 | "name": "stderr", 287 | "output_type": "stream", 288 | "text": [ 289 | "7405it [00:17, 426.88it/s]\n" 290 | ] 291 | } 292 | ], 293 | "source": [ 294 | "from tqdm import tqdm\n", 295 | "all_len = []\n", 296 | "all_result = 
[]\n", 297 | "\n", 298 | "for data, input_data_ in tqdm(zip(dev_data, input_data)):\n", 299 | " text = input_data_[\"text\"]\n", 300 | " if len(tokenizer(text)[\"input_ids\"]) <= 2048:\n", 301 | " # data[\"text\"] = data[\"text\"]\n", 302 | " all_result.append(data)\n", 303 | " # all_len.append(len(tokenizer(text)[\"input_ids\"]))" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 16, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "with open(\"data/teddst_dev.json\", \"w\", encoding=\"utf-8\") as f:\n", 313 | " json.dump(input_data, f, ensure_ascii=False, indent=4)" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 7, 319 | "metadata": {}, 320 | "outputs": [ 321 | { 322 | "name": "stdout", 323 | "output_type": "stream", 324 | "text": [ 325 | "**Answer**: yes\n", 326 | "**Supporting Sentences**: [4] Scott Derrickson (born July 16, 1966) is an American director, screenwriter and producer.\n", 327 | "[17] Edward Davis Wood Jr. (October 10, 1924 – December 10, 1978) was an American filmmaker, actor, writer, producer, and director.\n", 328 | "\n" 329 | ] 330 | } 331 | ], 332 | "source": [ 333 | "print(input_data[0][\"label\"])" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 16, 339 | "metadata": {}, 340 | "outputs": [ 341 | { 342 | "name": "stdout", 343 | "output_type": "stream", 344 | "text": [ 345 | "6539\n" 346 | ] 347 | } 348 | ], 349 | "source": [ 350 | "count = len(list(filter(lambda x: x < 2048, all_len)))\n", 351 | "print(count)" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [] 360 | } 361 | ], 362 | "metadata": { 363 | "kernelspec": { 364 | "display_name": ".venv", 365 | "language": "python", 366 | "name": "python3" 367 | }, 368 | "language_info": { 369 | "codemirror_mode": { 370 | "name": "ipython", 371 | "version": 3 372 | }, 373 | "file_extension": ".py", 374 | "mimetype": "text/x-python", 375 | "name": "python", 376 | "nbconvert_exporter": "python", 377 | "pygments_lexer": "ipython3", 378 | "version": "3.8.10" 379 | } 380 | }, 381 | "nbformat": 4, 382 | "nbformat_minor": 2 383 | } -------------------------------------------------------------------------------- /data_processing/make_data_wikisum.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/home/rbqlsquf2/workspace/XAI_rationale-inference-LLM/.venv/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", 13 | " from .autonotebook import tqdm as notebook_tqdm\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "from datasets import load_dataset\n", 19 | "\n", 20 | "ds = load_dataset(\"d0rj/wikisum\")" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "data": { 30 | "text/plain": [ 31 | "DatasetDict({\n", 32 | " train: Dataset({\n", 33 | " features: ['url', 'title', 'summary', 'article', 'step_headers'],\n", 34 | " num_rows: 35775\n", 35 | " })\n", 36 | " validation: Dataset({\n", 37 | " features: ['url', 'title', 'summary', 'article', 'step_headers'],\n", 38 | " num_rows: 2000\n", 39 | " })\n", 40 | " test: Dataset({\n", 41 | " features: ['url', 'title', 'summary', 'article', 'step_headers'],\n", 42 | " num_rows: 2000\n", 43 | " })\n", 44 | "})" 45 | ] 46 | }, 47 | "execution_count": 2, 48 | "metadata": {}, 49 | "output_type": "execute_result" 50 | } 51 | ], 52 | "source": [ 53 | "ds" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 3, 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "name": "stdout", 63 | "output_type": "stream", 64 | "text": [ 65 | "Do not shuck or wash your oysters. Oysters taste best when you shuck them immediately before eating them. In addition, keeping oysters in their shells makes them easier to store and reduces the chance that they'll go bad. If your oysters came pre-shucked in a plastic container, store them in the freezer until you're ready to use them. Leave the grit and dirt on the oysters. This will keep them moist and will help to insulate the meat. Pour ice into a small bowl or other open-top container. Grab a bowl, small cooler, or similar container that you can place inside your fridge. Make sure this container has an open top or removable lid. Then, pour a layer of ice into the bottom of the container. Do not keep your oysters in a sealed or closed-top container. Doing so will suffocate them. You may need to change your ice during the refrigeration process, so do not pour any into the container if you won't be able to check your oysters regularly. Place your oysters on top of the ice bed deep side down. Just like seafood merchants, you'll be storing your oysters on ice to keep them as chilled and fresh as possible. Make sure to turn each of your oysters so that the deeper side faces down, a technique that will help them better retain their juices. Dampen a towel with cold water and place it on top of the oysters. Dip a thin, clean kitchen towel in cold water and ring out the excess liquid. Then, gently lay the towel on top of the oysters. This will keep the oysters from drying out while preventing fresh water poisoning. If you'd prefer, you can cover the oysters with damp paper towels or newspaper instead. Oysters are salt water creatures, so submerging them in fresh water will essentially poison them and lead to their death. Place your container in a refrigerator. If possible, set your refrigerator to a temperature between 35 and 40 °F (2 and 4 °C). Make sure to store your oysters above any raw meat so the juices don't drip down onto your shellfish. If possible, check on your oysters at least once a day while they're in the fridge. If the towel dries out, dampen it again. If the ice in your container melts, pour it out and replace it with new ice. Keep your oysters in the fridge for up to 2 days. For safety, remove and consume your oysters within about 2 days of initially storing them. 
Though some oysters may last for a week or longer, eating them that late puts you at greater risk of food poisoning and other unwanted ailments. If your oysters came with an expiration date, use that as your guide for maximum storage time. Freeze your oysters if you need to store them for more than 2 days. Shuck the oysters when you’re ready to eat them. Once you finish storing the oysters, run them under cool water and open their shells. Then, run a knife under the flat side of the oyster and pop the shell off. Before eating, carefully separate the oyster from the rest of the shell using a knife. Before eating an oyster, inspect it to make sure it is still good. If the shell appears to be damaged, if the oyster smells foul, or if the meat is a cloudy shade of grey, brown, black, or pink, throw the oyster away. Keep the oysters in their shells and rinse them off. Storing your oysters inside their shells will make them less likely to go bad and, in some cases, better preserve their taste. Unlike refrigerating oysters, rinsing the shells under cold water to clean them off prevents any bacteria from living on the oysters. If you don't have enough room in your freezer to keep full-shelled oysters, you can shuck them before storage. If you do so, save the internal liquor for later use. Place your oysters in a freezer-safe container. To keep your oysters safe, place them inside a moisture-resistant, freezer-safe bag. If you're storing shucked oysters, you can use a firm plastic container instead. To prevent freezer burns, leave no more than 0.5 in (1.3 cm) of head space in the container. Pour oyster liquor into the container if you’re freezing shucked oysters. To help your shucked oysters retain their juiciness, pour the liquor you removed during the shucking process into your freezer-safe container. Keep pouring until you've completely submerged the oysters inside the liquid. If you don't have enough liquor to fill the container, pour in water as well. Seal the container. If you're using a resealable bag, press any excess air out of it using your fingers. Then, seal your container right before you put it into the freezer. Unlike with refrigerated oysters, closing the container will help better preserve your shellfish during long-term storage. If you're using a solid plastic container, make sure the lid you seal it with is air-tight. Make sure to write the initial storage date on your container. Keep your oysters in the freezer for up to 3 months. When frozen properly, fresh oysters should last for between 2 and 3 months. To make sure your oysters aren't going bad, look over them regularly and remove any that have cracked shells or cloudy meat that is a pink, black, brown, or grey color. While your oysters may remain safe to eat during this time, the taste will degrade gradually. Thaw your oysters in the fridge before consuming. Carefully take your oyster container out of the freezer and place it in a clear, open part of your refrigerator. Depending on the exact temperature of your appliances, the thawing process could take up to 20 hours to complete. Thawing your oysters using this method gives them a slightly longer shelf life, meaning you don't have to use them immediately after they thaw. If you'd like, you can thaw your oysters by submerging their container in cold water. However, you'll have to consume them immediately after they thaw, otherwise they'll go bad. 
\n" 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "print(ds['train'][0]['article'])" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 4, 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "name": "stderr", 80 | "output_type": "stream", 81 | "text": [ 82 | "[nltk_data] Downloading package punkt_tab to\n", 83 | "[nltk_data] /home/rbqlsquf2/nltk_data...\n", 84 | "[nltk_data] Package punkt_tab is already up-to-date!\n" 85 | ] 86 | } 87 | ], 88 | "source": [ 89 | "import nltk\n", 90 | "nltk.download('punkt_tab') # Download the necessary tokenizer data\n", 91 | "from nltk.tokenize import sent_tokenize\n", 92 | "\n", 93 | "def split_into_sentences(text):\n", 94 | " sentences = sent_tokenize(text)\n", 95 | " return sentences" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 5, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "from tqdm import tqdm\n", 105 | "def create_example(all_data):\n", 106 | " all_result = []\n", 107 | " data_id = 0\n", 108 | " for data in tqdm(all_data):\n", 109 | " context = split_into_sentences(data[\"article\"])\n", 110 | " result = {}\n", 111 | " result[\"_id\"] = data_id\n", 112 | " result[\"question\"] = \"summary\"\n", 113 | " result[\"document\"] = data['article']\n", 114 | " result[\"sent\"] = context\n", 115 | " result[\"output\"] = data[\"summary\"]\n", 116 | " all_result.append(result)\n", 117 | " return all_result" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 9, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "name": "stderr", 127 | "output_type": "stream", 128 | "text": [ 129 | "100%|██████████| 2000/2000 [00:02<00:00, 678.30it/s]\n" 130 | ] 131 | } 132 | ], 133 | "source": [ 134 | "input_data = create_example(ds['validation'])" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 10, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "import json\n", 144 | "with open(\"../data/1017data/wikisum_dev.json\", \"w\", encoding=\"utf-8\") as f:\n", 145 | " json.dump(input_data, f, ensure_ascii=False, indent=4)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [] 154 | } 155 | ], 156 | "metadata": { 157 | "kernelspec": { 158 | "display_name": ".venv", 159 | "language": "python", 160 | "name": "python3" 161 | }, 162 | "language_info": { 163 | "codemirror_mode": { 164 | "name": "ipython", 165 | "version": 3 166 | }, 167 | "file_extension": ".py", 168 | "mimetype": "text/x-python", 169 | "name": "python", 170 | "nbconvert_exporter": "python", 171 | "pygments_lexer": "ipython3", 172 | "version": "3.8.10" 173 | } 174 | }, 175 | "nbformat": 4, 176 | "nbformat_minor": 2 177 | } 178 | -------------------------------------------------------------------------------- /evaluation.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def normalize_answer(s): 5 | """간단한 토큰화와 정규화""" 6 | s = s.lower() # 소문자 변환 7 | s = re.sub(r"\b(a|an|the)\b", " ", s) # 불필요한 관사 제거 8 | s = re.sub(r"[^a-z0-9]", " ", s) # 알파벳과 숫자 외 제거 9 | return " ".join(s.split()) # 공백 정리 10 | 11 | 12 | def exact_match_score(prediction, ground_truth): 13 | """예측 답과 실제 답 간의 EM 점수 계산""" 14 | return int(normalize_answer(prediction) == normalize_answer(ground_truth)) 15 | 16 | 17 | def f1_score_hotpot(prediction, ground_truth): 18 | """예측 답과 실제 답 간의 F1 점수 계산""" 19 | pred_tokens = normalize_answer(prediction).split() 20 | gt_tokens = 
normalize_answer(ground_truth).split() 21 | 22 | common_tokens = set(pred_tokens) & set(gt_tokens) 23 | num_common = len(common_tokens) 24 | 25 | if num_common == 0: 26 | return 0 27 | 28 | precision = num_common / len(pred_tokens) 29 | recall = num_common / len(gt_tokens) 30 | 31 | f1 = 2 * (precision * recall) / (precision + recall) 32 | return f1 33 | 34 | 35 | def evaluate_supporting_facts(gold_sp, pred_sp): 36 | """Supporting facts에 대한 EM, Precision, Recall, F1 점수를 계산하는 함수""" 37 | # 단일 정수를 리스트로 변환 38 | gold_sp = [gold_sp] if isinstance(gold_sp, int) else gold_sp 39 | pred_sp = [pred_sp] if isinstance(pred_sp, int) else pred_sp 40 | 41 | # 예측과 정답 집합으로 변환 42 | gold_set = set(gold_sp) 43 | pred_set = set(pred_sp) 44 | 45 | # True Positives 계산 46 | tp = len(gold_set & pred_set) 47 | 48 | # Precision, Recall 계산 49 | precision = tp / len(pred_set) if pred_set else 0 50 | recall = tp / len(gold_set) if gold_set else 0 51 | 52 | # F1 점수 계산 53 | f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0 54 | 55 | # Exact Match 계산 56 | em = 1 if gold_set == pred_set else 0 57 | 58 | return em, precision, recall, f1 59 | 60 | 61 | import json 62 | 63 | file_path = "data/1125data/hotpot_dev.json" 64 | with open(file_path, "r", encoding="utf-8") as f: 65 | dev_data = json.load(f) 66 | 67 | 68 | for i in range(150, 151, 2): 69 | f_name = f"result/1127_baseline/all_{i}00.json" 70 | 71 | with open(f_name, "r", encoding="utf-8") as file: 72 | test_data = json.load(file) 73 | score = [] 74 | all_em_score = [] 75 | all_precision_score = [] 76 | all_recall_score = [] 77 | all_f1_score = [] 78 | result_f1 = [] 79 | result_em = [] 80 | ignore = 0 81 | gold_sp_option = False 82 | for dev, data in zip(dev_data, test_data): 83 | assert dev["_id"] == data["_id"] 84 | predict = "" 85 | answer = ( 86 | data["answer"] 87 | .replace("**Answer:", "") 88 | .replace("<|im_start|>assistant", "") 89 | .replace("<|im_end|>", "") 90 | .strip() 91 | ) 92 | generated_text = data["generated_text"].replace("**Answer:", "").strip() 93 | if answer == "yes": 94 | if answer in generated_text.lower() and "no" not in generated_text.lower(): 95 | generated_text = "yes" 96 | else: 97 | generated_text = "" 98 | elif answer == "no": 99 | if answer in generated_text.lower() and "yes" not in generated_text.lower(): 100 | generated_text = "no" 101 | else: 102 | generated_text = "" 103 | answer = answer.strip() 104 | predict = generated_text.strip() 105 | print(answer) 106 | print(generated_text) 107 | print("==========================") 108 | result_f1.append(f1_score_hotpot(answer, predict)) 109 | result_em.append(exact_match_score(predict, answer)) 110 | ################################################ 111 | if "gold_sp" in data.keys(): 112 | gold_sp_option = True 113 | gold_sp = data["gold_sp"] 114 | # pred_sp = data["pred_sp"] 115 | pred_sp = [x for x in data["pred_sp"] if x != 0] 116 | em, precision, recall, f1 = evaluate_supporting_facts(gold_sp, pred_sp) 117 | all_em_score.append(em) 118 | all_precision_score.append(precision) 119 | all_recall_score.append(recall) 120 | all_f1_score.append(f1) 121 | 122 | for i in pred_sp: 123 | if answer == "yes" or answer == "no": 124 | ignore += 1 125 | break 126 | if predict in dev["sent"][i - 1]: 127 | score.append(dev["_id"]) 128 | # print(answer) 129 | # print(generated_text) 130 | # print(dev["sent"][i-1]) 131 | # print("================") 132 | break 133 | 134 | # F1 점수와 EM 점수 출력 135 | print(f_name) 136 | print("F1 점수: ", sum(result_f1) / len(result_f1)) 137 | 
print("EM 점수: ", sum(result_em) / len(result_em)) 138 | if gold_sp_option: 139 | # F1 점수와 EM 점수 출력 140 | print("all_em_score 점수: ", sum(all_em_score) / len(all_em_score)) 141 | print("all_f1_score 점수: ", sum(all_f1_score) / len(all_f1_score)) 142 | print("all_precision_score 점수: ", sum(all_precision_score) / len(all_precision_score)) 143 | print("all_recall_score 점수: ", sum(all_recall_score) / len(all_recall_score)) 144 | print("=================================================") 145 | print(len(result_em)) 146 | -------------------------------------------------------------------------------- /evaluation_sum.py: -------------------------------------------------------------------------------- 1 | import json 2 | from nltk.translate.bleu_score import sentence_bleu 3 | from nltk.tokenize import word_tokenize 4 | 5 | 6 | def calculate_bleu(reference, candidate): 7 | # 문장을 토큰화 8 | reference_tokens = [word_tokenize(reference.lower())] 9 | candidate_tokens = word_tokenize(candidate.lower()) 10 | 11 | # BLEU 점수 계산 (1-gram부터 4-gram까지의 누적 점수) 12 | weights = (1, 0, 0, 0) # unigram에만 가중치 부여 13 | return sentence_bleu(reference_tokens, candidate_tokens, weights=weights) 14 | 15 | for i in [1,3,7]: 16 | file_path = f"result/1106_weighted_sum/hotpot_ft_{i}000.json" 17 | with open(file_path, "r", encoding="utf-8") as file: 18 | dev_data = json.load(file) 19 | 20 | bleu_scores = [] 21 | for dev in dev_data: 22 | predict = "" 23 | answer = dev["answer"].split("**Summary:")[1].replace("\n<|im_end|>", "").strip() 24 | if "**Summary:" in dev["generated_text"]: 25 | predict = dev["generated_text"].split("**Summary:")[1].strip() 26 | else: 27 | predict = dev["generated_text"] 28 | 29 | print(answer) 30 | print("--") 31 | print(predict) 32 | print("=============") 33 | 34 | bleu_score = calculate_bleu(answer, predict) 35 | bleu_scores.append(bleu_score) 36 | 37 | # 평균 BLEU 점수 계산 38 | average_bleu = sum(bleu_scores) / len(bleu_scores) 39 | print(file_path) 40 | print(f"Average BLEU score: {average_bleu:.4f}") 41 | -------------------------------------------------------------------------------- /qwen/qwen_inference.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModelForCausalLM 2 | import torch 3 | from tqdm import tqdm 4 | import json 5 | from peft import PeftModel, PeftConfig 6 | from datasets import Dataset 7 | 8 | 9 | def create_model(base_model_path, lora_path): 10 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 11 | base_model = AutoModelForCausalLM.from_pretrained(base_model_path, device_map="auto") 12 | new_special_tokens = {"additional_special_tokens": ["<|mrc|>", "<|summary|>"]} 13 | tokenizer.add_special_tokens(new_special_tokens) 14 | base_model.resize_token_embeddings(len(tokenizer)) 15 | base_model.config.use_cache = False 16 | tokenizer.padding_side = "left" 17 | peft_model = PeftModel.from_pretrained(base_model, lora_path) 18 | return tokenizer, peft_model 19 | 20 | 21 | class InferenceInput: 22 | def __init__(self, _id, input_text, answer): 23 | self._id = _id 24 | self.input_text = input_text 25 | self.answer = answer 26 | 27 | 28 | def create_example(all_example, tokenizer): 29 | all_result = [] 30 | for example in tqdm(all_example): 31 | if example["question"] == "summary": 32 | messages = [ 33 | {"role": "system", "content": "<|MRC|>True<|SUM|>True"}, 34 | {"role": "user", "content": f"{example['document']}"}, 35 | ] 36 | else: # MRC의 경우 37 | # messages = [{"role": "system", "content": 
"<|MRC|>False<|SUM|>True"}, {"role": "user", "content": f"**Question:{example['question']}\n{example['document']}"}] 38 | messages = [ 39 | {"role": "system", "content": "<|MRC|>True<|SUM|>False"}, 40 | {"role": "user", "content": f"{example['document']}"}, 41 | ] 42 | result = {} 43 | result["input"] = tokenizer.apply_chat_template(messages, tokenize=False) 44 | result["output"] = example["output"] 45 | all_result.append(InferenceInput(_id=example["_id"], input_text=result["input"], answer=result["output"])) 46 | if len(all_result) == 20: 47 | break 48 | return all_result 49 | 50 | 51 | def create_batches(input_list, batch_size): 52 | # Split the input list into batches of size 'batch_size' 53 | for i in range(0, len(input_list), batch_size): 54 | yield input_list[i : i + batch_size] 55 | 56 | 57 | def generate_batch_answer(batches, tokenizer, model): 58 | for batch_num, batch in enumerate(tqdm(batches)): 59 | batch_texts = [item.input_text for item in batch] 60 | inputs = tokenizer( 61 | batch_texts, # Tokenized texts after applying chat template 62 | return_tensors="pt", # Return in tensor format 63 | padding=True, # Pad sequences to the same length 64 | ).to("cuda") 65 | model.to("cuda") 66 | with torch.no_grad(): 67 | outputs = model.generate( 68 | **inputs, 69 | max_new_tokens=512, 70 | ) 71 | 72 | decoded_outputs = [ 73 | tokenizer.decode(output[len(inputs[i]) :], skip_special_tokens=True) for i, output in enumerate(outputs) 74 | ] 75 | decoded_outputs_ = [tokenizer.decode(output, skip_special_tokens=True) for i, output in enumerate(outputs)] 76 | 77 | # Store the generated text back in the input objects 78 | for i, item in enumerate(batch): 79 | item.generated_text = decoded_outputs[i] 80 | item.generated_all_answer = decoded_outputs_[i] 81 | return batches 82 | 83 | 84 | def write_result(output_path): 85 | all_result = [] 86 | for batch_num, batch in enumerate(answer_batches): 87 | for item in batch: 88 | result = {} 89 | result["_id"] = item._id 90 | result["input_text"] = item.input_text 91 | if "assistant" in item.generated_text: 92 | result["generated_text"] = item.generated_text.split("assistant\n")[1] 93 | else: 94 | result["generated_text"] = item.generated_text 95 | result["answer"] = item.answer 96 | result["generated_all_answer"] = item.generated_all_answer 97 | all_result.append(result) 98 | 99 | with open(output_path, "w", encoding="utf-8") as f: 100 | json.dump(all_result, f, ensure_ascii=False, indent=4) 101 | 102 | 103 | if __name__ == "__main__": 104 | base_model_path = "Qwen/Qwen2.5-3B-Instruct" 105 | model_path = "lora_tuning_1010" 106 | tokenizer, model = create_model(base_model_path, model_path) 107 | 108 | file_path = "data/hotpot_dev.json" 109 | batch_size = 16 110 | print(batch_size) 111 | 112 | with open(file_path, "r", encoding="utf-8") as file: 113 | dev_data = json.load(file) 114 | 115 | input_data = create_example(dev_data, tokenizer) 116 | 117 | # Create batches of input items 118 | batches = list(create_batches(input_data, batch_size)) 119 | 120 | answer_batches = generate_batch_answer(batches, tokenizer, model) 121 | #### 답변작성 122 | output_path = "output/1010/hotpot_no_q_tf.json" 123 | write_result(output_path) 124 | -------------------------------------------------------------------------------- /qwen/qwen_inference_lora.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModelForCausalLM 2 | import torch 3 | from tqdm import tqdm 4 | import json 5 | from peft import 
PeftModel, PeftConfig 6 | from datasets import Dataset 7 | 8 | 9 | def create_model(base_model_path, lora_path): 10 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 11 | base_model = AutoModelForCausalLM.from_pretrained(base_model_path, device_map="auto") 12 | new_special_tokens = {"additional_special_tokens": ["<|mrc|>", "<|summary|>"]} 13 | tokenizer.add_special_tokens(new_special_tokens) 14 | base_model.resize_token_embeddings(len(tokenizer)) 15 | base_model.config.use_cache = False 16 | tokenizer.padding_side = "left" 17 | peft_model = PeftModel.from_pretrained(base_model, lora_path) 18 | return tokenizer, peft_model 19 | 20 | 21 | class InferenceInput: 22 | def __init__(self, _id, input_text, answer): 23 | self._id = _id 24 | self.input_text = input_text 25 | self.answer = answer 26 | 27 | 28 | def create_example(all_example, tokenizer): 29 | all_result = [] 30 | task_instruction = "Only fill in the **Answer to the **Question based on the **Document if <|MRC|> is True. Do not fill in the **Answer if the Question is not provided or if <|MRC|> is False. Only fill in the **Summary with a summary of the **Document if <|SUM|> is True. Do not fill in the **Summary if <|SUM|> is False." 31 | for example in tqdm(all_example): 32 | if example["question"] == "summary": 33 | messages = [ 34 | {"role": "system", "content": f"{task_instruction}\n<|MRC|>True<|SUM|>True"}, 35 | {"role": "user", "content": f"{example['document']}"}, 36 | ] 37 | else: # MRC의 경우 38 | messages = [ 39 | # { 40 | # "role": "system", 41 | # "content": f"<|MRC|>True<|SUM|>True", 42 | # }, 43 | {"role": "user", "content": f"**Question:{example['question']}\n{example['document']}"}, 44 | ] 45 | 46 | result = {} 47 | result["input"] = tokenizer.apply_chat_template(messages, tokenize=False) 48 | result["output"] = example["output"] 49 | all_result.append(InferenceInput(_id=example["_id"], input_text=result["input"], answer=result["output"])) 50 | if len(all_result) == 20: 51 | break 52 | return all_result 53 | 54 | 55 | def create_batches(input_list, batch_size): 56 | # Split the input list into batches of size 'batch_size' 57 | for i in range(0, len(input_list), batch_size): 58 | yield input_list[i : i + batch_size] 59 | 60 | 61 | def generate_batch_answer(batches, tokenizer, model): 62 | for batch_num, batch in enumerate(tqdm(batches)): 63 | batch_texts = [item.input_text for item in batch] 64 | inputs = tokenizer( 65 | batch_texts, # Tokenized texts after applying chat template 66 | return_tensors="pt", # Return in tensor format 67 | padding=True, # Pad sequences to the same length 68 | ).to("cuda") 69 | model.to("cuda") 70 | with torch.no_grad(): 71 | outputs = model.generate( 72 | **inputs, 73 | max_new_tokens=512, 74 | ) 75 | 76 | decoded_outputs = [ 77 | tokenizer.decode(output[len(inputs[i]) :], skip_special_tokens=True) for i, output in enumerate(outputs) 78 | ] 79 | decoded_outputs_ = [tokenizer.decode(output, skip_special_tokens=True) for i, output in enumerate(outputs)] 80 | 81 | # Store the generated text back in the input objects 82 | for i, item in enumerate(batch): 83 | item.generated_text = decoded_outputs[i] 84 | item.generated_all_answer = decoded_outputs_[i] 85 | return batches 86 | 87 | 88 | def write_result(output_path): 89 | all_result = [] 90 | for batch_num, batch in enumerate(answer_batches): 91 | for item in batch: 92 | result = {} 93 | result["_id"] = item._id 94 | result["input_text"] = item.input_text 95 | if "assistant" in item.generated_text: 96 | result["generated_text"] = 
item.generated_text.split("assistant\n")[1] 97 | else: 98 | result["generated_text"] = item.generated_text 99 | result["answer"] = item.answer 100 | result["generated_all_answer"] = item.generated_all_answer 101 | all_result.append(result) 102 | 103 | with open(output_path, "w", encoding="utf-8") as f: 104 | json.dump(all_result, f, ensure_ascii=False, indent=4) 105 | 106 | 107 | if __name__ == "__main__": 108 | base_model_path = "Qwen/Qwen2.5-3B-Instruct" 109 | model_path = "model/origin/checkpoint-3000" 110 | tokenizer, model = create_model(base_model_path, model_path) 111 | 112 | file_path = "data/1008data/hotpot_dev.json" 113 | batch_size = 16 114 | print(batch_size) 115 | 116 | with open(file_path, "r", encoding="utf-8") as file: 117 | dev_data = json.load(file) 118 | 119 | input_data = create_example(dev_data, tokenizer) 120 | 121 | # Create batches of input items 122 | batches = list(create_batches(input_data, batch_size)) 123 | 124 | answer_batches = generate_batch_answer(batches, tokenizer, model) 125 | #### 답변작성 126 | output_path = "result/orrigin/test_3000.json" 127 | write_result(output_path) 128 | -------------------------------------------------------------------------------- /qwen/qwen_train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from datasets import Dataset 4 | 5 | from transformers import ( 6 | AutoTokenizer, 7 | AutoModelForCausalLM, 8 | DataCollatorForSeq2Seq, 9 | TrainingArguments, 10 | Trainer, 11 | GenerationConfig, 12 | ) 13 | 14 | from peft import LoraConfig, get_peft_model 15 | from trl import SFTTrainer 16 | from torch.cuda.amp import autocast, GradScaler 17 | import wandb 18 | 19 | 20 | def create_model(model_path): 21 | tokenizer = AutoTokenizer.from_pretrained(model_path) 22 | model = AutoModelForCausalLM.from_pretrained(model_path, device_map="cuda") 23 | new_special_tokens = {"additional_special_tokens": ["<|mrc|>", "<|summary|>"]} 24 | tokenizer.add_special_tokens(new_special_tokens) 25 | model.resize_token_embeddings(len(tokenizer)) 26 | model.enable_input_require_grads() 27 | model.config.use_cache = False 28 | tokenizer.padding_side = "left" 29 | return tokenizer, model 30 | 31 | 32 | IGNORE_INDEX = -100 33 | 34 | 35 | def process_func(example, tokenizer): 36 | MAX_LENGTH = 2048 37 | input_ids, attention_mask, labels = [], [], [] 38 | mrc_value = -1 39 | sum_value = -1 40 | if example["mrc_type"] == "T": 41 | mrc_value = "True" 42 | else: 43 | mrc_value = "False" 44 | if example["sum_type"] == "T": 45 | sum_value = "True" 46 | else: 47 | sum_value = "False" 48 | 49 | example["document"] = example["document"].strip() 50 | ##############다시 51 | task_instruction = "Only fill in the **Answer to the **Question based on the **Document if <|MRC|> is True. Do not fill in the **Answer if the Question is not provided or if <|MRC|> is False. Only fill in the **Summary with a summary of the **Document if <|SUM|> is True. Do not fill in the **Summary if <|SUM|> is False." 
52 | if example["data_type"] == "answer": 53 | if example["answer_type"] == "F": 54 | if example["question"] == "no": # 질문이 없는 경우 55 | instruction = tokenizer( 56 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Document:\n{example['document']}<|im_end|>\n", 57 | add_special_tokens=False, 58 | ) 59 | else: 60 | instruction = tokenizer( 61 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Question:{example['question']}\n**Document:\n{example['document']}<|im_end|>\n", 62 | add_special_tokens=False, 63 | ) 64 | response = tokenizer( 65 | f"<|im_start|>assistant\n**Answer:\n**Summary:\n<|im_end|>\n", add_special_tokens=False 66 | ) 67 | else: # 답 해야하는 경우 질문은 무조건 있음 68 | instruction = tokenizer( 69 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Question:{example['question']}\n**Document:\n{example['document']}<|im_end|>\n", 70 | add_special_tokens=False, 71 | ) 72 | response = tokenizer( 73 | f"<|im_start|>assistant\n**Answer:{example['output']}\n**Summary:\n<|im_end|>\n", 74 | add_special_tokens=False, 75 | ) 76 | elif example["data_type"] == "summary": 77 | if example["answer_type"] == "F": # 무응답의 경우 질문이 무조건 없음 78 | instruction = tokenizer( 79 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Document:\n{example['document']}<|im_end|>\n", 80 | add_special_tokens=False, 81 | ) 82 | response = tokenizer( 83 | f"<|im_start|>assistant\n**Answer:\n**Summary:\n<|im_end|>\n", add_special_tokens=False 84 | ) 85 | else: # 답 해야하는 경우 질문 유무 86 | if example["question"] == "summary": # 질문이 없는 경우 87 | instruction = tokenizer( 88 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Document:\n{example['document']}<|im_end|>\n", 89 | add_special_tokens=False, 90 | ) 91 | else: 92 | instruction = tokenizer( 93 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Question:{example['question']}\n**Document:\n{example['document']}<|im_end|>\n", 94 | add_special_tokens=False, 95 | ) 96 | response = tokenizer( 97 | f"<|im_start|>assistant\n**Answer:\n**Summary:{example['output']}\n<|im_end|>\n", 98 | add_special_tokens=False, 99 | ) 100 | 101 | input_ids = instruction["input_ids"] + response["input_ids"] 102 | attention_mask = instruction["attention_mask"] + response["attention_mask"] 103 | labels = [IGNORE_INDEX] * len(instruction["input_ids"]) + response["input_ids"] 104 | if len(input_ids) > MAX_LENGTH: 105 | input_ids = input_ids[:MAX_LENGTH] 106 | attention_mask = attention_mask[:MAX_LENGTH] 107 | labels = labels[:MAX_LENGTH] 108 | return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels} 109 | 110 | 111 | if __name__ == "__main__": 112 | 113 | model_path = "Qwen/Qwen2.5-3B-Instruct" 114 | tokenizer, model = create_model(model_path) 115 | data_file = "data/train_data_1011.json" 116 | 117 | dataset = Dataset.from_json(data_file) 118 | 119 | processed_dataset = dataset.map(lambda example: process_func(example, tokenizer)) 120 | 121 | new_model = "qwen_lora_inst" 122 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 123 | peft_config = LoraConfig( 124 | target_modules=["q_proj", "v_proj", "k_proj", "o_proj"], 125 | lora_alpha=16, 126 | lora_dropout=0.1, 127 | r=8, 128 | bias="none", 129 | task_type="CAUSAL_LM", 130 | ) 131 | 
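    # get_peft_model wraps the base Qwen2.5 model with the LoRA adapters configured above: rank-8
    # update matrices are injected into the q/k/v/o attention projections, and only those adapter
    # weights receive gradients. The original model weights (including the token embeddings resized
    # for <|mrc|>/<|summary|>) stay frozen unless listed in modules_to_save.
    # print_trainable_parameters below reports how small the trainable fraction is.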
132 | model = get_peft_model(model, peft_config) 133 | 134 | model.print_trainable_parameters() 135 | for name, param in model.named_parameters(): 136 | print(f"Parameter: {name}, requires_grad: {param.requires_grad}") 137 | wandb.init(project="qwen llm lora") 138 | training_params = TrainingArguments( 139 | output_dir="/hdd/rbqlsquf/qwen_lora_1015", 140 | num_train_epochs=1, 141 | per_device_train_batch_size=4, 142 | gradient_accumulation_steps=2, 143 | warmup_ratio=0.1, 144 | learning_rate=1e-4, 145 | logging_steps=10, 146 | run_name="qwen lora", 147 | lr_scheduler_type="cosine", 148 | gradient_checkpointing=True, 149 | save_steps=1000, 150 | save_on_each_node=True, 151 | do_train=True, 152 | push_to_hub=False, 153 | report_to="wandb", 154 | ) 155 | trainer = Trainer( 156 | model=model, 157 | args=training_params, 158 | train_dataset=processed_dataset, 159 | data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True), 160 | ) 161 | trainer.train() 162 | trainer.save_model(new_model) 163 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==2.1.0 2 | accelerate==0.29.3 3 | aiohappyeyeballs==2.4.0 4 | aiohttp==3.10.6 5 | aiosignal==1.3.1 6 | asttokens==2.4.1 7 | async-timeout==4.0.3 8 | attrs==24.2.0 9 | backcall==0.2.0 10 | bitsandbytes==0.43.1 11 | certifi==2024.8.30 12 | charset-normalizer==3.3.2 13 | click==8.1.7 14 | comm==0.2.2 15 | contourpy==1.1.1 16 | cycler==0.12.1 17 | datasets==2.19.0 18 | debugpy==1.8.6 19 | decorator==5.1.1 20 | dill==0.3.8 21 | docker-pycreds==0.4.0 22 | docstring_parser==0.16 23 | eval_type_backport==0.2.0 24 | executing==2.1.0 25 | filelock==3.16.1 26 | fonttools==4.54.1 27 | frozenlist==1.4.1 28 | fsspec==2024.3.1 29 | gitdb==4.0.11 30 | GitPython==3.1.43 31 | huggingface-hub==0.25.1 32 | idna==3.10 33 | importlib_metadata==8.5.0 34 | importlib_resources==6.4.5 35 | inquirerpy==0.3.4 36 | ipykernel==6.29.5 37 | ipython==8.12.3 38 | jedi==0.19.1 39 | Jinja2==3.1.4 40 | joblib==1.4.2 41 | jupyter_client==8.6.3 42 | jupyter_core==5.7.2 43 | kiwisolver==1.4.7 44 | markdown-it-py==3.0.0 45 | MarkupSafe==2.1.5 46 | matplotlib==3.7.5 47 | matplotlib-inline==0.1.7 48 | mdurl==0.1.2 49 | mpmath==1.3.0 50 | multidict==6.1.0 51 | multiprocess==0.70.16 52 | nest-asyncio==1.6.0 53 | networkx==3.1 54 | nltk==3.9.1 55 | numpy==1.24.4 56 | nvidia-cublas-cu12==12.1.3.1 57 | nvidia-cuda-cupti-cu12==12.1.105 58 | nvidia-cuda-nvrtc-cu12==12.1.105 59 | nvidia-cuda-runtime-cu12==12.1.105 60 | nvidia-cudnn-cu12==8.9.2.26 61 | nvidia-cufft-cu12==11.0.2.54 62 | nvidia-curand-cu12==10.3.2.106 63 | nvidia-cusolver-cu12==11.4.5.107 64 | nvidia-cusparse-cu12==12.1.0.106 65 | nvidia-nccl-cu12==2.18.1 66 | nvidia-nvjitlink-cu12==12.6.68 67 | nvidia-nvtx-cu12==12.1.105 68 | packaging==24.1 69 | pandas==2.0.3 70 | parso==0.8.4 71 | peft==0.10.0 72 | pexpect==4.9.0 73 | pfzy==0.3.4 74 | pickleshare==0.7.5 75 | pillow==10.4.0 76 | platformdirs==4.3.6 77 | prompt_toolkit==3.0.47 78 | protobuf==5.28.2 79 | psutil==6.0.0 80 | ptyprocess==0.7.0 81 | pure_eval==0.2.3 82 | pyarrow==17.0.0 83 | pyarrow-hotfix==0.6 84 | Pygments==2.18.0 85 | pyparsing==3.1.4 86 | python-dateutil==2.9.0.post0 87 | pytz==2024.2 88 | PyYAML==6.0.2 89 | pyzmq==26.2.0 90 | regex==2024.9.11 91 | requests==2.32.3 92 | rich==13.8.1 93 | rouge_score==0.1.2 94 | safetensors==0.4.5 95 | scikit-learn==1.3.2 96 | scipy==1.10.1 97 | seaborn==0.13.2 98 | sentry-sdk==2.14.0 99 | 
setproctitle==1.3.3 100 | shtab==1.7.1 101 | six==1.16.0 102 | smmap==5.0.1 103 | stack-data==0.6.3 104 | sympy==1.13.3 105 | threadpoolctl==3.5.0 106 | tokenizers==0.19.1 107 | torch==2.1.0 108 | tornado==6.4.1 109 | tqdm==4.66.5 110 | traitlets==5.14.3 111 | transformers==4.43.3 112 | triton==2.1.0 113 | trl==0.8.6 114 | typing_extensions==4.12.2 115 | tyro==0.8.11 116 | tzdata==2024.2 117 | urllib3==2.2.3 118 | wandb==0.17.7 119 | wcwidth==0.2.13 120 | xxhash==3.5.0 121 | yarl==1.12.1 122 | zipp==3.20.2 123 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | PYTHON_PATH="/home/rbqlsquf2/workspace/XAI_rationale-inference-LLM/.venv/bin/python" 2 | 3 | BASE_DIR="/home/rbqlsquf2/workspace/XAI_rationale-inference-LLM" 4 | MODEL_DIR="model/1127_baseline" 5 | OUTPUT_DIR="$BASE_DIR/result/1127_baseline" 6 | SOURCE_DIR="$BASE_DIR/source" 7 | SCRIPT_NAME="inference_baseline.py" 8 | # for i in {54..44..-2}; do 9 | # checkpoint=$((i * 100)) 10 | # $PYTHON_PATH $SOURCE_DIR/$SCRIPT_NAME \ 11 | # --train_model_path "$MODEL_DIR/checkpoint-$checkpoint" \ 12 | # --output_dir "$OUTPUT_DIR/$checkpoint.json" 13 | # done 14 | 15 | # for i in {62..68..2}; do 16 | # checkpoint=$((i * 100)) 17 | # $PYTHON_PATH $SOURCE_DIR/$SCRIPT_NAME \ 18 | # --train_model_path "$MODEL_DIR/checkpoint-$checkpoint" \ 19 | # --output_dir "$OUTPUT_DIR/$checkpoint.json" 20 | # done 21 | 22 | 23 | for i in 150; do 24 | checkpoint=$((i * 100)) 25 | $PYTHON_PATH $SOURCE_DIR/$SCRIPT_NAME \ 26 | --train_model_path "$MODEL_DIR/checkpoint-$checkpoint" \ 27 | --output_dir "$OUTPUT_DIR/all_$checkpoint.json" 28 | done 29 | -------------------------------------------------------------------------------- /run_train.sh: -------------------------------------------------------------------------------- 1 | PYTHON_PATH="/home/rbqlsquf2/workspace/XAI_rationale-inference-LLM/.venv/bin/python" 2 | 3 | BASE_DIR="/home/rbqlsquf2/workspace/XAI_rationale-inference-LLM" 4 | MODEL_DIR="$BASE_DIR/model" 5 | OUTPUT_DIR="$BASE_DIR/result" 6 | SOURCE_DIR="$BASE_DIR/source" 7 | 8 | 9 | $PYTHON_PATH $SOURCE_DIR/train_pn_yesloss.py \ 10 | --new_model 1210_pn_yesloss \ 11 | --output_dir model/1210_pn_yesloss \ 12 | --num_train_epochs 1 \ 13 | --batch_size 2 \ 14 | --beam_size 1 \ 15 | --gradient_accumulation_steps 1 \ 16 | --wandb_run_name 1210_pn_yesloss 17 | -------------------------------------------------------------------------------- /source/__pycache__/modeling_qwen2_inf.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rbqlsquf/XAI_rationale-inference-LLM/c50b9fead5901d809b7cab9451fa84aeb38bd2df/source/__pycache__/modeling_qwen2_inf.cpython-38.pyc -------------------------------------------------------------------------------- /source/__pycache__/modeling_qwen2_mean.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rbqlsquf/XAI_rationale-inference-LLM/c50b9fead5901d809b7cab9451fa84aeb38bd2df/source/__pycache__/modeling_qwen2_mean.cpython-38.pyc -------------------------------------------------------------------------------- /source/__pycache__/modeling_qwen2_pn.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rbqlsquf/XAI_rationale-inference-LLM/c50b9fead5901d809b7cab9451fa84aeb38bd2df/source/__pycache__/modeling_qwen2_pn.cpython-38.pyc -------------------------------------------------------------------------------- /source/__pycache__/modeling_qwen2_pn_2.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rbqlsquf/XAI_rationale-inference-LLM/c50b9fead5901d809b7cab9451fa84aeb38bd2df/source/__pycache__/modeling_qwen2_pn_2.cpython-38.pyc -------------------------------------------------------------------------------- /source/__pycache__/modeling_qwen2_pn_test.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rbqlsquf/XAI_rationale-inference-LLM/c50b9fead5901d809b7cab9451fa84aeb38bd2df/source/__pycache__/modeling_qwen2_pn_test.cpython-38.pyc -------------------------------------------------------------------------------- /source/inference_1108.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig 2 | 3 | import torch 4 | from tqdm import tqdm 5 | import json 6 | from peft import PeftModel, PeftConfig 7 | from datasets import Dataset 8 | 9 | from modeling_qwen2_pn_att_1107 import Qwen2ForCausalLM_pn, BeamSearchAttentionDecoder 10 | import argparse 11 | 12 | 13 | def create_model(base_model_path, lora_path, config): 14 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 15 | trained_model = Qwen2ForCausalLM_pn.from_pretrained(lora_path, config=config, device_map="auto") 16 | gru = BeamSearchAttentionDecoder( 17 | hidden_size=config.hidden_size, num_sent=config.max_dec_len, topk=config.beam_size 18 | ) 19 | trained_model.set_gru(gru) 20 | trained_model.config.use_cache = False 21 | tokenizer.padding_side = "left" 22 | print("LORA WEIGHT LOADING") 23 | trained_model.load_pn_model(lora_path) 24 | return tokenizer, trained_model 25 | 26 | 27 | class InferenceInput: 28 | def __init__(self, _id, input_text, answer, attention_mask, sent_masks, gold_sp): 29 | self._id = _id 30 | self.input_text = input_text 31 | self.answer = answer 32 | self.attention_mask = attention_mask 33 | self.sent_masks = sent_masks 34 | self.gold_sp = gold_sp 35 | 36 | 37 | def create_example(all_example, tokenizer, data_sample): 38 | all_result = [] 39 | for example in tqdm(all_example): 40 | example["document"] = example["document"].strip() 41 | # token 된 doc 42 | token_doc = {"input_ids": [], "attention_mask": []} 43 | # document 문장 index 44 | sentence_number = 0 45 | sentence_position = [] 46 | for i, sent in enumerate(example["sent"]): 47 | # 0번 문장은 instruction으로 지정할 계획 48 | sent = sent.strip() 49 | token_sent = tokenizer(sent + " ", add_special_tokens=False) 50 | sentence_number += 1 # 1부터 시작 51 | sentence_position.extend([sentence_number] * len(token_sent["input_ids"])) 52 | token_doc["input_ids"] += token_sent["input_ids"] 53 | token_doc["attention_mask"] += token_sent["attention_mask"] 54 | token_end = tokenizer("<|im_end|>\n", add_special_tokens=False) 55 | sentence_position.extend([sentence_number] * len(token_end)) 56 | token_doc["input_ids"] += token_end["input_ids"] 57 | token_doc["attention_mask"] += token_end["attention_mask"] 58 | 59 | instruction = tokenizer( 60 | f"<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\n<|im_start|>user\n**Question:{example['question'].strip()}\n**Document:\n", 61 | add_special_tokens=False, 62 | ) 63 | response = f"<|im_start|>assistant\n**Answer:{example['output'].strip()}\n<|im_end|>\n" 64 | sentence_position = [0] * len(instruction["input_ids"]) + sentence_position 65 | input = instruction["input_ids"] + token_doc["input_ids"] 66 | attention_mask = instruction["attention_mask"] + token_doc["attention_mask"] 67 | output = response 68 | 69 | if "supporting_num" in example.keys(): 70 | gold_sp = example["supporting_num"] 71 | else: 72 | gold_sp = None 73 | assert len(input) == len(sentence_position) == len(attention_mask) 74 | 75 | all_result.append( 76 | InferenceInput( 77 | _id=example["_id"], 78 | input_text=input, 79 | answer=output, 80 | attention_mask=attention_mask, 81 | sent_masks=sentence_position, 82 | gold_sp=gold_sp, 83 | ) 84 | ) 85 | if data_sample: 86 | if len(all_result) == 100: 87 | break 88 | return all_result 89 | 90 | 91 | def create_batches(input_list, batch_size): 92 | # Split the input list into batches of size 'batch_size' 93 | for i in range(0, len(input_list), batch_size): 94 | yield input_list[i : i + batch_size] 95 | 96 | 97 | def generate_batch_answer(batches, tokenizer, model): 98 | for batch_num, batch in enumerate(tqdm(batches)): 99 | input_ids = [item.input_text for item in batch] 100 | attention_mask = [item.attention_mask for item in batch] 101 | sentence_masks = [item.sent_masks for item in batch] 102 | 103 | model.to("cuda") 104 | model.eval() 105 | input_batch = {} 106 | max_length = max(len(mask) for mask in input_ids) 107 | padded_input_ids = [[tokenizer.pad_token_id] * (max_length - len(mask)) + mask for mask in input_ids] 108 | input_batch["input_ids"] = torch.tensor(padded_input_ids).cuda() 109 | padded_attention_mask = [[0] * (max_length - len(mask)) + mask for mask in attention_mask] 110 | input_batch["attention_mask"] = torch.tensor(padded_attention_mask).cuda() 111 | padded_sentence_masks = [[0] * (max_length - len(mask)) + mask for mask in sentence_masks] 112 | input_batch["sent_masks"] = torch.tensor(padded_sentence_masks).cuda() 113 | 114 | with torch.no_grad(): 115 | model.evidence = None 116 | model.sentence_number = None 117 | outputs = model.generate( 118 | input_ids=input_batch["input_ids"], 119 | attention_mask=input_batch["attention_mask"], 120 | sent_masks=input_batch["sent_masks"], 121 | max_new_tokens=50, 122 | ) 123 | 124 | input_text = [tokenizer.decode(input_id, skip_special_tokens=True) for i, input_id in enumerate(input_ids)] 125 | decoded_outputs = [ 126 | tokenizer.decode(output[max_length:], skip_special_tokens=True) for i, output in enumerate(outputs) # slice off the padded prompt so only newly generated tokens are decoded 127 | ] 128 | decoded_outputs_ = [tokenizer.decode(output, skip_special_tokens=True) for i, output in enumerate(outputs)] 129 | 130 | # Store the generated text back in the input objects 131 | for i, item in enumerate(batch): 132 | item.input_text = input_text[i] 133 | item.generated_text = decoded_outputs[i] 134 | item.generated_all_answer = decoded_outputs_[i] 135 | if model.sentence_number != None: 136 | item.pred_sp = model.sentence_number[i] 137 | return batches 138 | 139 | 140 | def write_result(output_path, answer_batches, tokenizer): 141 | all_result = [] 142 | for batch_num, batch in enumerate(answer_batches): 143 | for item in batch: 144 | result = {} 145 | result["_id"] = item._id 146 | if "assistant\n" in item.generated_text: 147 | result["generated_text"] = item.generated_text.split("assistant\n")[1]
148 | elif "assistant" in item.generated_text: 149 | result["generated_text"] = item.generated_text.split("assistant")[1] 150 | else: 151 | result["generated_text"] = item.generated_text 152 | result["answer"] = item.answer 153 | result["generated_all_answer"] = item.generated_all_answer 154 | if item.gold_sp != None: 155 | result["gold_sp"] = item.gold_sp 156 | result["pred_sp"] = item.pred_sp.tolist() 157 | all_result.append(result) 158 | 159 | with open(output_path, "w", encoding="utf-8") as f: 160 | json.dump(all_result, f, ensure_ascii=False, indent=4) 161 | 162 | 163 | if __name__ == "__main__": 164 | ############################################################## 165 | # model param 추가할 내용 166 | ############################################################## 167 | parser = argparse.ArgumentParser(description="인자값을 전달받는 Python 스크립트") 168 | parser.add_argument("--base_model_path", type=str, default="Qwen/Qwen2.5-3B-Instruct") 169 | parser.add_argument("--train_model_path", type=str, default="model/1115_yesloss_final/checkpoint-2000") 170 | parser.add_argument("--data_file", type=str, default="data/1113data/hotpot_dev.json") 171 | parser.add_argument("--beam_size", type=int, default=1) 172 | parser.add_argument("--max_dec_len", type=int, default=3) 173 | parser.add_argument("--output_dir", type=str, default="result/1115_yesloss_final/2000.json") 174 | parser.add_argument("--batch_size", type=int, default=4) 175 | parser.add_argument("--data_sample", type=bool, default=False) 176 | 177 | args = parser.parse_args() 178 | print(args) 179 | ######################################################### 180 | # 변수들 선언 181 | ######################################################### 182 | 183 | config = AutoConfig.from_pretrained(args.base_model_path) 184 | config.beam_size = args.beam_size 185 | config.max_dec_len = args.max_dec_len 186 | 187 | tokenizer, model = create_model(args.base_model_path, args.train_model_path, config) 188 | print("batch size : ", args.batch_size) 189 | 190 | with open(args.data_file, "r", encoding="utf-8") as file: 191 | dev_data = json.load(file) 192 | 193 | input_data = create_example(dev_data, tokenizer, args.data_sample) 194 | 195 | # Create batches of input items 196 | batches = list(create_batches(input_data, args.batch_size)) 197 | 198 | answer_batches = generate_batch_answer(batches, tokenizer, model) 199 | #### 답변작성 200 | 201 | write_result(args.output_dir, answer_batches, tokenizer) 202 | -------------------------------------------------------------------------------- /source/inference_baseline.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig 2 | 3 | import torch 4 | from tqdm import tqdm 5 | import json 6 | from peft import PeftModel, PeftConfig 7 | from datasets import Dataset 8 | 9 | from modeling_qwen2_pn_att_1107_baseline import Qwen2ForCausalLM_pn, BeamSearchAttentionDecoder 10 | import argparse 11 | 12 | 13 | def create_model(base_model_path, lora_path, config): 14 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 15 | trained_model = Qwen2ForCausalLM_pn.from_pretrained(lora_path, config=config, device_map="auto") 16 | gru = BeamSearchAttentionDecoder( 17 | hidden_size=config.hidden_size, num_sent=config.max_dec_len, topk=config.beam_size 18 | ) 19 | trained_model.set_gru(gru) 20 | trained_model.config.use_cache = False 21 | tokenizer.padding_side = "left" 22 | print("LORA WEIGHT LOADING") 23 | trained_model.load_pn_model(lora_path) 
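    # At this point the returned model combines: (1) the Qwen2 backbone restored from the trained
    # checkpoint, (2) a freshly built BeamSearchAttentionDecoder whose beam width and number of
    # evidence sentences come from config.beam_size / config.max_dec_len, and (3) whatever extra
    # weights load_pn_model() restores from lora_path (presumably the pointer-network / decoder
    # parameters saved alongside the checkpoint). Left padding is used so that, during batched
    # generation, new tokens are appended directly after each prompt.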
24 | return tokenizer, trained_model 25 | 26 | 27 | class InferenceInput: 28 | def __init__(self, _id, input_text, answer, attention_mask, sent_masks, gold_sp): 29 | self._id = _id 30 | self.input_text = input_text 31 | self.answer = answer 32 | self.attention_mask = attention_mask 33 | self.sent_masks = sent_masks 34 | self.gold_sp = gold_sp 35 | 36 | 37 | def create_example(all_example, tokenizer, data_sample): 38 | all_result = [] 39 | for example in tqdm(all_example): 40 | example["document"] = example["document"].strip() 41 | # token 된 doc 42 | token_doc = {"input_ids": [], "attention_mask": []} 43 | # document 문장 index 44 | sentence_number = 0 45 | sentence_position = [] 46 | for i, sent in enumerate(example["sent"]): 47 | # 0번 문장은 instruction으로 지정할 계획 48 | sent = sent.strip() 49 | token_sent = tokenizer(sent + " ", add_special_tokens=False) 50 | sentence_number += 1 # 1부터 시작 51 | sentence_position.extend([sentence_number] * len(token_sent["input_ids"])) 52 | token_doc["input_ids"] += token_sent["input_ids"] 53 | token_doc["attention_mask"] += token_sent["attention_mask"] 54 | token_end = tokenizer("<|im_end|>\n", add_special_tokens=False) 55 | sentence_position.extend([sentence_number] * len(token_end)) 56 | token_doc["input_ids"] += token_end["input_ids"] 57 | token_doc["attention_mask"] += token_end["attention_mask"] 58 | 59 | instruction = tokenizer( 60 | f"<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n**Question:{example['question'].strip()}\n**Document:\n", 61 | add_special_tokens=False, 62 | ) 63 | response = f"<|im_start|>assistant\n**Answer:{example['output'].strip()}\n<|im_end|>\n" 64 | sentence_position = [0] * len(instruction["input_ids"]) + sentence_position 65 | input = instruction["input_ids"] + token_doc["input_ids"] 66 | attention_mask = instruction["attention_mask"] + token_doc["attention_mask"] 67 | output = response 68 | 69 | if "supporting_num" in example.keys(): 70 | gold_sp = example["supporting_num"] 71 | else: 72 | gold_sp = None 73 | assert len(input) == len(sentence_position) == len(attention_mask) 74 | 75 | all_result.append( 76 | InferenceInput( 77 | _id=example["_id"], 78 | input_text=input, 79 | answer=output, 80 | attention_mask=attention_mask, 81 | sent_masks=sentence_position, 82 | gold_sp=gold_sp, 83 | ) 84 | ) 85 | if data_sample: 86 | if len(all_result) == 100: 87 | break 88 | return all_result 89 | 90 | 91 | def create_batches(input_list, batch_size): 92 | # Split the input list into batches of size 'batch_size' 93 | for i in range(0, len(input_list), batch_size): 94 | yield input_list[i : i + batch_size] 95 | 96 | 97 | def generate_batch_answer(batches, tokenizer, model): 98 | for batch_num, batch in enumerate(tqdm(batches)): 99 | input_ids = [item.input_text for item in batch] 100 | attention_mask = [item.attention_mask for item in batch] 101 | sentence_masks = [item.sent_masks for item in batch] 102 | 103 | model.to("cuda") 104 | input_batch = {} 105 | max_length = max(len(mask) for mask in input_ids) 106 | padded_input_ids = [[tokenizer.pad_token_id] * (max_length - len(mask)) + mask for mask in input_ids] 107 | input_batch["input_ids"] = torch.tensor(padded_input_ids).cuda() 108 | padded_attention_mask = [[0] * (max_length - len(mask)) + mask for mask in attention_mask] 109 | input_batch["attention_mask"] = torch.tensor(padded_attention_mask).cuda() 110 | padded_sentence_masks = [[0] * (max_length - len(mask)) + mask for mask in sentence_masks] 111 | 
input_batch["sent_masks"] = torch.tensor(padded_sentence_masks).cuda() 112 | 113 | with torch.no_grad(): 114 | model.evidence = None 115 | model.sentence_number = None 116 | outputs = model.generate( 117 | input_ids=input_batch["input_ids"], 118 | attention_mask=input_batch["attention_mask"], 119 | sent_masks=input_batch["sent_masks"], 120 | max_new_tokens=50, 121 | ) 122 | 123 | input_text = [tokenizer.decode(input_id, skip_special_tokens=True) for i, input_id in enumerate(input_ids)] 124 | decoded_outputs = [ 125 | tokenizer.decode(output[len(input_text) :], skip_special_tokens=True) for i, output in enumerate(outputs) 126 | ] 127 | decoded_outputs_ = [tokenizer.decode(output, skip_special_tokens=True) for i, output in enumerate(outputs)] 128 | 129 | # Store the generated text back in the input objects 130 | for i, item in enumerate(batch): 131 | item.input_text = input_text 132 | item.generated_text = decoded_outputs[i] 133 | item.generated_all_answer = decoded_outputs_[i] 134 | if model.sentence_number != None: 135 | item.pred_sp = model.sentence_number[i] 136 | else: 137 | item.pred_sp = None 138 | return batches 139 | 140 | 141 | def write_result(output_path, answer_batches, tokenizer): 142 | all_result = [] 143 | for batch_num, batch in enumerate(answer_batches): 144 | for item in batch: 145 | result = {} 146 | result["_id"] = item._id 147 | if "assistant\n" in item.generated_text: 148 | result["generated_text"] = item.generated_text.split("assistant\n")[1] 149 | elif "assistant" in item.generated_text: 150 | result["generated_text"] = item.generated_text.split("assistant")[1] 151 | else: 152 | result["generated_text"] = item.generated_text 153 | result["answer"] = item.answer 154 | result["generated_all_answer"] = item.generated_all_answer 155 | if item.gold_sp != None and item.pred_sp != None: 156 | result["gold_sp"] = item.gold_sp 157 | result["pred_sp"] = item.pred_sp.tolist() 158 | all_result.append(result) 159 | 160 | with open(output_path, "w", encoding="utf-8") as f: 161 | json.dump(all_result, f, ensure_ascii=False, indent=4) 162 | 163 | 164 | if __name__ == "__main__": 165 | ############################################################## 166 | # model param 추가할 내용 167 | ############################################################## 168 | parser = argparse.ArgumentParser(description="인자값을 전달받는 Python 스크립트") 169 | parser.add_argument("--base_model_path", type=str, default="Qwen/Qwen2.5-3B-Instruct") 170 | parser.add_argument("--train_model_path", type=str, default="model/1127_baseline_no_causal/checkpoint-2600") 171 | parser.add_argument("--data_file", type=str, default="data/1113data/hotpot_dev.json") 172 | parser.add_argument("--beam_size", type=int, default=1) 173 | parser.add_argument("--max_dec_len", type=int, default=3) 174 | parser.add_argument("--output_dir", type=str, default="result/1127_baseline_no_causal/2600.json") 175 | parser.add_argument("--batch_size", type=int, default=8) 176 | parser.add_argument("--data_sample", type=bool, default=False) 177 | 178 | args = parser.parse_args() 179 | print(args) 180 | ######################################################### 181 | # 변수들 선언 182 | ######################################################### 183 | 184 | config = AutoConfig.from_pretrained(args.base_model_path) 185 | config.beam_size = args.beam_size 186 | config.max_dec_len = args.max_dec_len 187 | 188 | tokenizer, model = create_model(args.base_model_path, args.train_model_path, config) 189 | print("batch size : ", args.batch_size) 190 | 191 | with 
open(args.data_file, "r", encoding="utf-8") as file: 192 | dev_data = json.load(file) 193 | 194 | input_data = create_example(dev_data, tokenizer, args.data_sample) 195 | 196 | # Create batches of input items 197 | batches = list(create_batches(input_data, args.batch_size)) 198 | 199 | answer_batches = generate_batch_answer(batches, tokenizer, model) 200 | #### 답변작성 201 | 202 | write_result(args.output_dir, answer_batches, tokenizer) 203 | -------------------------------------------------------------------------------- /source/inference_mean.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModelForCausalLM 2 | 3 | import torch 4 | from tqdm import tqdm 5 | import json 6 | from peft import PeftModel, PeftConfig 7 | from datasets import Dataset 8 | 9 | from modeling_qwen2_mean import Qwen2ForCausalLM 10 | import argparse 11 | 12 | 13 | def create_model(base_model_path, lora_path): 14 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 15 | 16 | # AutoModelForCausalLM -> Qwen2ForCausalLM 17 | base_model = Qwen2ForCausalLM.from_pretrained(base_model_path, device_map="auto") 18 | new_special_tokens = {"additional_special_tokens": ["<|mrc|>", "<|summary|>"]} 19 | tokenizer.add_special_tokens(new_special_tokens) 20 | base_model.resize_token_embeddings(len(tokenizer)) 21 | base_model.config.use_cache = False 22 | tokenizer.padding_side = "left" 23 | peft_model = PeftModel.from_pretrained(base_model, lora_path) 24 | return tokenizer, peft_model 25 | 26 | 27 | class InferenceInput: 28 | def __init__(self, _id, input_text, answer): 29 | self._id = _id 30 | self.input_text = input_text 31 | self.answer = answer 32 | 33 | 34 | def create_example(all_example, tokenizer): 35 | all_result = [] 36 | task_instruction = "Only fill in the **Answer to the **Question based on the **Document if <|MRC|> is True. Do not fill in the **Answer if the Question is not provided or if <|MRC|> is False. Only fill in the **Summary with a summary of the **Document if <|SUM|> is True. Do not fill in the **Summary if <|SUM|> is False." 
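    # Illustrative sketch (assuming the default Qwen2.5 chat template) of the string that
    # apply_chat_template(..., tokenize=False) returns for the MRC branch below, with the
    # document text abbreviated:
    #
    #   <|im_start|>system
    #   {task_instruction}
    #   <|MRC|>True<|SUM|>False<|im_end|>
    #   <|im_start|>user
    #   **Question:...
    #   **Document:
    #   ...<|im_end|>
    #
    # add_generation_prompt is not passed, so no "<|im_start|>assistant" header is appended;
    # the model is expected to produce it itself, and write_result later splits on the
    # assistant header to recover the answer.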
37 | for example in tqdm(all_example): 38 | if example["question"] == "summary": 39 | messages = [ 40 | {"role": "system", "content": f"{task_instruction}\n<|MRC|>True<|SUM|>True"}, 41 | {"role": "user", "content": f"**Document:\n{example['document']}"}, 42 | ] 43 | else: # MRC case 44 | messages = [ 45 | { 46 | "role": "system", 47 | "content": f"{task_instruction}\n<|MRC|>True<|SUM|>False", 48 | }, 49 | {"role": "user", "content": f"**Question:{example['question']}\n**Document:\n{example['document']}"}, 50 | ] 51 | 52 | result = {} 53 | result["input"] = tokenizer.apply_chat_template(messages, tokenize=False) 54 | result["output"] = example["output"] 55 | all_result.append(InferenceInput(_id=example["_id"], input_text=result["input"], answer=result["output"])) 56 | # if len(all_result) == 100: 57 | # break 58 | return all_result 59 | 60 | 61 | def create_batches(input_list, batch_size): 62 | # Split the input list into batches of size 'batch_size' 63 | for i in range(0, len(input_list), batch_size): 64 | yield input_list[i : i + batch_size] 65 | 66 | 67 | def generate_batch_answer(batches, tokenizer, model): 68 | for batch_num, batch in enumerate(tqdm(batches)): 69 | batch_texts = [item.input_text for item in batch] 70 | inputs = tokenizer( 71 | batch_texts, # Chat-template formatted texts to be tokenized 72 | return_tensors="pt", # Return in tensor format 73 | padding=True, # Pad sequences to the same length 74 | ).to("cuda") 75 | model.to("cuda") 76 | with torch.no_grad(): 77 | model.model.evidence = None 78 | outputs = model.generate( 79 | **inputs, 80 | max_new_tokens=512, 81 | ) 82 | 83 | decoded_outputs = [ 84 | tokenizer.decode(output[len(inputs[i]) :], skip_special_tokens=True) for i, output in enumerate(outputs) 85 | ] 86 | decoded_outputs_ = [tokenizer.decode(output, skip_special_tokens=True) for i, output in enumerate(outputs)] 87 | 88 | # Store the generated text back in the input objects 89 | for i, item in enumerate(batch): 90 | item.generated_text = decoded_outputs[i] 91 | item.generated_all_answer = decoded_outputs_[i] 92 | return batches 93 | 94 | 95 | def write_result(output_path): 96 | all_result = [] 97 | for batch_num, batch in enumerate(answer_batches): 98 | for item in batch: 99 | result = {} 100 | result["_id"] = item._id 101 | result["input_text"] = item.input_text 102 | if "assistant\n" in item.generated_text: 103 | result["generated_text"] = item.generated_text.split("assistant\n")[1] 104 | else: 105 | result["generated_text"] = item.generated_text 106 | result["answer"] = item.answer 107 | result["generated_all_answer"] = item.generated_all_answer 108 | all_result.append(result) 109 | 110 | with open(output_path, "w", encoding="utf-8") as f: 111 | json.dump(all_result, f, ensure_ascii=False, indent=4) 112 | 113 | 114 | if __name__ == "__main__": 115 | parser = argparse.ArgumentParser(description="Python script that takes command-line arguments") 116 | parser.add_argument("--model_path", type=str, required=True, help="path to the trained model") 117 | parser.add_argument("--output_path", type=str, required=True, help="path to save the results") 118 | args = parser.parse_args() 119 | model_path = args.model_path 120 | output_path = args.output_path 121 | 122 | # model_path = "model/mean/checkpoint-1000" 123 | # output_path = "result/mean/hotpot_1000.json" 124 | ########################################## 125 | 126 | base_model_path = "Qwen/Qwen2.5-3B-Instruct" 127 | 128 | tokenizer, model = create_model(base_model_path, model_path) 129 | 130 | file_path = "data/1008data/hotpot_dev.json" 131 | batch_size = 16 132 |
print(batch_size) 133 | 134 | with open(file_path, "r", encoding="utf-8") as file: 135 | dev_data = json.load(file) 136 | 137 | input_data = create_example(dev_data, tokenizer) 138 | 139 | # Create batches of input items 140 | batches = list(create_batches(input_data, batch_size)) 141 | 142 | answer_batches = generate_batch_answer(batches, tokenizer, model) 143 | #### 답변작성 144 | 145 | write_result(output_path) 146 | -------------------------------------------------------------------------------- /source/inference_origin.py: -------------------------------------------------------------------------------- 1 | <<<<<<< HEAD 2 | from transformers import AutoTokenizer, AutoModelForCausalLM, Qwen2ForCausalLM 3 | ======= 4 | from transformers import AutoTokenizer, AutoModelForCausalLM, Qwen2ForCausalLM, AutoConfig 5 | >>>>>>> 11d7d8a4757072d730dfacc957f0a3763ec1975f 6 | 7 | import torch 8 | from tqdm import tqdm 9 | import json 10 | from peft import PeftModel, PeftConfig 11 | from datasets import Dataset 12 | 13 | import argparse 14 | 15 | 16 | <<<<<<< HEAD 17 | def create_model(base_model_path, lora_path): 18 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 19 | 20 | # AutoModelForCausalLM -> Qwen2ForCausalLM 21 | base_model = Qwen2ForCausalLM.from_pretrained(base_model_path, device_map="auto") 22 | new_special_tokens = {"additional_special_tokens": ["<|mrc|>", "<|summary|>"]} 23 | tokenizer.add_special_tokens(new_special_tokens) 24 | base_model.resize_token_embeddings(len(tokenizer)) 25 | ======= 26 | def create_model(base_model_path, lora_path, config): 27 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 28 | base_model = Qwen2ForCausalLM.from_pretrained(base_model_path, config=config, device_map="auto") 29 | >>>>>>> 11d7d8a4757072d730dfacc957f0a3763ec1975f 30 | base_model.config.use_cache = False 31 | tokenizer.padding_side = "left" 32 | peft_model = PeftModel.from_pretrained(base_model, lora_path) 33 | return tokenizer, peft_model 34 | 35 | 36 | class InferenceInput: 37 | <<<<<<< HEAD 38 | def __init__(self, _id, input_text, answer): 39 | self._id = _id 40 | self.input_text = input_text 41 | self.answer = answer 42 | 43 | 44 | def create_example(all_example, tokenizer): 45 | all_result = [] 46 | task_instruction = "Only fill in the **Answer to the **Question based on the **Document if <|MRC|> is True. Do not fill in the **Answer if the Question is not provided or if <|MRC|> is False. Only fill in the **Summary with a summary of the **Document if <|SUM|> is True. Do not fill in the **Summary if <|SUM|> is False." 
47 | for example in tqdm(all_example): 48 | if example["question"] == "summary": 49 | messages = [ 50 | {"role": "system", "content": f"{task_instruction}\n<|MRC|>True<|SUM|>True"}, 51 | {"role": "user", "content": f"**Document:\n{example['document']}"}, 52 | ] 53 | else: # MRC의 경우 54 | messages = [ 55 | { 56 | "role": "system", 57 | "content": f"{task_instruction}\n<|MRC|>True<|SUM|>True", 58 | }, 59 | {"role": "user", "content": f"**Question:{example['question']}\n**Document:\n{example['document']}"}, 60 | ] 61 | 62 | result = {} 63 | result["input"] = tokenizer.apply_chat_template(messages, tokenize=False) 64 | result["output"] = example["output"] 65 | all_result.append(InferenceInput(_id=example["_id"], input_text=result["input"], answer=result["output"])) 66 | if len(all_result) == 100: 67 | break 68 | ======= 69 | def __init__(self, _id, input_text, answer, attention_mask, sent_masks): 70 | self._id = _id 71 | self.input_text = input_text 72 | self.answer = answer 73 | self.attention_mask = attention_mask 74 | self.sent_masks = sent_masks 75 | 76 | 77 | def create_example(all_example, tokenizer, data_sample, mrc_value, sum_value): 78 | all_result = [] 79 | 80 | task_instruction = "Only fill in the **Answer to the **Question based on the **Document if <|MRC|> is True. Do not fill in the **Answer if the Question is not provided or if <|MRC|> is False. Only fill in the **Summary with a summary of the **Document if <|SUM|> is True. Do not fill in the **Summary if <|SUM|> is False." 81 | for example in tqdm(all_example): 82 | example["document"] = example["document"].strip() 83 | # token 된 doc 84 | token_doc = {"input_ids": [], "attention_mask": []} 85 | # document 문장 index 86 | sentence_number = 0 87 | sentence_position = [] 88 | for i, sent in enumerate(example["sent"]): 89 | # 0번 문장은 instruction으로 지정할 계획 90 | sent = sent.strip() 91 | token_sent = tokenizer(sent + " ", add_special_tokens=False) 92 | sentence_number += 1 # 1부터 시작 93 | sentence_position.extend([sentence_number] * len(token_sent["input_ids"])) 94 | token_doc["input_ids"] += token_sent["input_ids"] 95 | token_doc["attention_mask"] += token_sent["attention_mask"] 96 | token_end = tokenizer("<|im_end|>\n", add_special_tokens=False) 97 | sentence_position.extend([sentence_number] * len(token_end)) 98 | token_doc["input_ids"] += token_end["input_ids"] 99 | token_doc["attention_mask"] += token_end["attention_mask"] 100 | 101 | if example["question"] == "summary": 102 | instruction = tokenizer( 103 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Document:\n", 104 | add_special_tokens=False, 105 | ) 106 | response = tokenizer( 107 | f"<|im_start|>assistant\n**Answer:\n**Summary:{example['output'].strip()}\n<|im_end|>\n", 108 | add_special_tokens=False, 109 | ) 110 | else: # MRC의 경우 111 | instruction = tokenizer( 112 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Question:{example['question'].strip()}\n**Document:\n", 113 | add_special_tokens=False, 114 | ) 115 | response = tokenizer( 116 | f"<|im_start|>assistant\n**Answer:{example['output'].strip()}\n**Summary:\n<|im_end|>\n", 117 | add_special_tokens=False, 118 | ) 119 | 120 | sentence_position = [0] * len(instruction["input_ids"]) + sentence_position 121 | input = instruction["input_ids"] + token_doc["input_ids"] 122 | attention_mask = instruction["attention_mask"] + token_doc["attention_mask"] 123 | output = example["output"] 124 | assert len(input) 
== len(sentence_position) == len(attention_mask) 125 | 126 | all_result.append( 127 | InferenceInput( 128 | _id=example["_id"], 129 | input_text=input, 130 | answer=output, 131 | attention_mask=attention_mask, 132 | sent_masks=sentence_position, 133 | ) 134 | ) 135 | if data_sample: 136 | if len(all_result) == 1: 137 | break 138 | >>>>>>> 11d7d8a4757072d730dfacc957f0a3763ec1975f 139 | return all_result 140 | 141 | 142 | def create_batches(input_list, batch_size): 143 | # Split the input list into batches of size 'batch_size' 144 | for i in range(0, len(input_list), batch_size): 145 | yield input_list[i : i + batch_size] 146 | 147 | 148 | def generate_batch_answer(batches, tokenizer, model): 149 | for batch_num, batch in enumerate(tqdm(batches)): 150 | <<<<<<< HEAD 151 | batch_texts = [item.input_text for item in batch] 152 | inputs = tokenizer( 153 | batch_texts, # Tokenized texts after applying chat template 154 | return_tensors="pt", # Return in tensor format 155 | padding=True, # Pad sequences to the same length 156 | ).to("cuda") 157 | model.to("cuda") 158 | with torch.no_grad(): 159 | model.model.evidence = None 160 | outputs = model.generate( 161 | **inputs, 162 | max_new_tokens=512, 163 | ) 164 | 165 | decoded_outputs = [ 166 | tokenizer.decode(output[len(inputs[i]) :], skip_special_tokens=True) for i, output in enumerate(outputs) 167 | ======= 168 | input_ids = [item.input_text for item in batch] 169 | attention_mask = [item.attention_mask for item in batch] 170 | sentence_masks = [item.sent_masks for item in batch] 171 | 172 | model.to("cuda") 173 | input_batch = {} 174 | max_length = max(len(mask) for mask in input_ids) 175 | padded_input_ids = [[tokenizer.pad_token_id] * (max_length - len(mask)) + mask for mask in input_ids] 176 | input_batch["input_ids"] = torch.tensor(padded_input_ids).cuda() 177 | padded_attention_mask = [[0] * (max_length - len(mask)) + mask for mask in attention_mask] 178 | input_batch["attention_mask"] = torch.tensor(padded_attention_mask).cuda() 179 | padded_sentence_masks = [[0] * (max_length - len(mask)) + mask for mask in sentence_masks] 180 | input_batch["sent_masks"] = torch.tensor(padded_sentence_masks).cuda() 181 | 182 | with torch.no_grad(): 183 | model.evidence = None 184 | outputs = model.generate( 185 | input_ids=input_batch["input_ids"], 186 | attention_mask=input_batch["attention_mask"], 187 | max_new_tokens=200, 188 | ) 189 | 190 | input_text = [tokenizer.decode(input_id, skip_special_tokens=True) for i, input_id in enumerate(input_ids)] 191 | 192 | decoded_outputs = [ 193 | tokenizer.decode(output[len(input_text) :], skip_special_tokens=True) for i, output in enumerate(outputs) 194 | >>>>>>> 11d7d8a4757072d730dfacc957f0a3763ec1975f 195 | ] 196 | decoded_outputs_ = [tokenizer.decode(output, skip_special_tokens=True) for i, output in enumerate(outputs)] 197 | 198 | # Store the generated text back in the input objects 199 | for i, item in enumerate(batch): 200 | <<<<<<< HEAD 201 | ======= 202 | item.input_text = input_text 203 | >>>>>>> 11d7d8a4757072d730dfacc957f0a3763ec1975f 204 | item.generated_text = decoded_outputs[i] 205 | item.generated_all_answer = decoded_outputs_[i] 206 | return batches 207 | 208 | 209 | <<<<<<< HEAD 210 | def write_result(output_path): 211 | ======= 212 | def write_result(output_path, answer_batches, tokenizer): 213 | >>>>>>> 11d7d8a4757072d730dfacc957f0a3763ec1975f 214 | all_result = [] 215 | for batch_num, batch in enumerate(answer_batches): 216 | for item in batch: 217 | result = {} 218 | result["_id"] = item._id 
219 | result["input_text"] = item.input_text 220 | <<<<<<< HEAD 221 | if "assistant" in item.generated_text: 222 | result["generated_text"] = item.generated_text.split("assistant\n")[1] 223 | ======= 224 | if "assistant\n" in item.generated_text: 225 | result["generated_text"] = item.generated_text.split("assistant\n")[1] 226 | elif "assistant" in item.generated_text: 227 | result["generated_text"] = item.generated_text.split("assistant")[1] 228 | >>>>>>> 11d7d8a4757072d730dfacc957f0a3763ec1975f 229 | else: 230 | result["generated_text"] = item.generated_text 231 | result["answer"] = item.answer 232 | result["generated_all_answer"] = item.generated_all_answer 233 | all_result.append(result) 234 | 235 | with open(output_path, "w", encoding="utf-8") as f: 236 | json.dump(all_result, f, ensure_ascii=False, indent=4) 237 | 238 | 239 | if __name__ == "__main__": 240 | <<<<<<< HEAD 241 | # parser = argparse.ArgumentParser(description="인자값을 전달받는 Python 스크립트") 242 | # parser.add_argument("--model_path", type=str, required=True, help="모델 경로") 243 | # parser.add_argument("--output_path", type=str, required=True, help="결과저장 경로") 244 | # args = parser.parse_args() 245 | # model_path = args.model_path 246 | # output_path = args.output_path 247 | 248 | model_path = "model/hotpot_cnn/checkpoint-3000" 249 | output_path = "result/hotpot_cnn/hotpot_3000_tt.json" 250 | 251 | base_model_path = "Qwen/Qwen2.5-3B-Instruct" 252 | 253 | tokenizer, model = create_model(base_model_path, model_path) 254 | 255 | file_path = "data/1008data/hotpot_dev.json" 256 | batch_size = 16 257 | print(batch_size) 258 | 259 | with open(file_path, "r", encoding="utf-8") as file: 260 | dev_data = json.load(file) 261 | 262 | input_data = create_example(dev_data, tokenizer) 263 | 264 | # Create batches of input items 265 | batches = list(create_batches(input_data, batch_size)) 266 | ======= 267 | parser = argparse.ArgumentParser(description="인자값을 전달받는 Python 스크립트") 268 | parser.add_argument("--base_model_path", type=str, default="Qwen/Qwen2.5-3B-Instruct") 269 | parser.add_argument("--train_model_path", type=str, default="/hdd/rbqlsquf/hotpot_cnn/checkpoint-4000") 270 | parser.add_argument("--data_file", type=str, default="data/1029data/hotpot_dev.json") 271 | parser.add_argument("--beam_size", type=int, default=1) 272 | parser.add_argument("--max_dec_len", type=int, default=3) 273 | parser.add_argument("--output_dir", type=str, default="result/hotpot_cnn/hotpot_tf.json") 274 | parser.add_argument("--batch_size", type=int, default=8) 275 | parser.add_argument("--data_sample", type=bool, default=True) 276 | parser.add_argument("--mrc_value", type=str, default=True) 277 | parser.add_argument("--sum_value", type=str, default=False) 278 | args = parser.parse_args() 279 | print(args) 280 | ######################################################### 281 | # 변수들 선언 282 | ######################################################### 283 | config = AutoConfig.from_pretrained(args.base_model_path) 284 | config.beam_size = args.beam_size 285 | config.max_dec_len = args.max_dec_len 286 | 287 | tokenizer, model = create_model(args.base_model_path, args.train_model_path, config) 288 | print("batch size : ", args.batch_size) 289 | 290 | with open(args.data_file, "r", encoding="utf-8") as file: 291 | dev_data = json.load(file) 292 | 293 | input_data = create_example(dev_data, tokenizer, args.data_sample, args.mrc_value, args.sum_value) 294 | 295 | # Create batches of input items 296 | batches = list(create_batches(input_data, args.batch_size)) 297 | >>>>>>> 
11d7d8a4757072d730dfacc957f0a3763ec1975f 298 | 299 | answer_batches = generate_batch_answer(batches, tokenizer, model) 300 | #### 답변작성 301 | 302 | <<<<<<< HEAD 303 | write_result(output_path) 304 | ======= 305 | write_result(args.output_dir, answer_batches, tokenizer) 306 | >>>>>>> 11d7d8a4757072d730dfacc957f0a3763ec1975f 307 | -------------------------------------------------------------------------------- /source/inference_pn.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig 2 | 3 | import torch 4 | from tqdm import tqdm 5 | import json 6 | from peft import PeftModel, PeftConfig 7 | from datasets import Dataset 8 | 9 | from modeling_qwen2_pn import Qwen2ForCausalLM_pn, BeamSearchAttentionDecoder 10 | import argparse 11 | 12 | 13 | def create_model(base_model_path, lora_path, config): 14 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 15 | trained_model = Qwen2ForCausalLM_pn.from_pretrained(lora_path, config=config, device_map="auto") 16 | gru = BeamSearchAttentionDecoder(hidden_size=config.hidden_size, num_sent=config.max_dec_len, topk=config.beam_size) 17 | trained_model.set_gru(gru) 18 | trained_model.config.use_cache = False 19 | tokenizer.padding_side = "left" 20 | trained_model.load_pn_model(lora_path) 21 | return tokenizer, trained_model 22 | 23 | 24 | class InferenceInput: 25 | def __init__(self, _id, input_text, answer, attention_mask, sent_masks, gold_sp): 26 | self._id = _id 27 | self.input_text = input_text 28 | self.answer = answer 29 | self.attention_mask = attention_mask 30 | self.sent_masks = sent_masks 31 | self.gold_sp = gold_sp 32 | 33 | 34 | def create_example(all_example, tokenizer, data_sample, mrc_value, sum_value): 35 | all_result = [] 36 | 37 | task_instruction = "Only fill in the **Answer to the **Question based on the **Document if <|MRC|> is True. Do not fill in the **Answer if the Question is not provided or if <|MRC|> is False. Only fill in the **Summary with a summary of the **Document if <|SUM|> is True. Do not fill in the **Summary if <|SUM|> is False." 
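    # The loop below tags every document token with the index of the sentence it belongs to
    # (0 is reserved for the instruction, sentences count from 1), presumably so the
    # pointer-network decoder can score whole sentences. A small illustrative alignment
    # (token counts are made up):
    #
    #   instruction tokens            -> sentence_position 0, 0, ..., 0
    #   tokens of example["sent"][0]  -> 1, 1, 1, 1
    #   tokens of example["sent"][1]  -> 2, 2, 2
    #   trailing "<|im_end|>\n"       -> repeats the last sentence number
    #
    # The assert further down checks that input ids, attention mask and sentence_position all
    # have the same length.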
38 | for example in tqdm(all_example): 39 | example["document"] = example["document"].strip() 40 | # token 된 doc 41 | token_doc = {"input_ids": [], "attention_mask": []} 42 | # document 문장 index 43 | sentence_number = 0 44 | sentence_position = [] 45 | for i, sent in enumerate(example["sent"]): 46 | # 0번 문장은 instruction으로 지정할 계획 47 | sent = sent.strip() 48 | token_sent = tokenizer(sent + " ", add_special_tokens=False) 49 | sentence_number += 1 # 1부터 시작 50 | sentence_position.extend([sentence_number] * len(token_sent["input_ids"])) 51 | token_doc["input_ids"] += token_sent["input_ids"] 52 | token_doc["attention_mask"] += token_sent["attention_mask"] 53 | token_end = tokenizer("<|im_end|>\n", add_special_tokens=False) 54 | sentence_position.extend([sentence_number] * len(token_end)) 55 | token_doc["input_ids"] += token_end["input_ids"] 56 | token_doc["attention_mask"] += token_end["attention_mask"] 57 | 58 | if example["question"] == "summary": 59 | instruction = tokenizer( 60 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Document:\n", 61 | add_special_tokens=False, 62 | ) 63 | # response = tokenizer( 64 | # f"<|im_start|>assistant\n**Answer:\n**Summary:{example['output'].strip()}\n<|im_end|>\n", 65 | # add_special_tokens=False, 66 | # ) 67 | response = f"<|im_start|>assistant\n**Answer:\n**Summary:{example['output'].strip()}\n<|im_end|>\n" 68 | else: # MRC의 경우 69 | instruction = tokenizer( 70 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Question:{example['question'].strip()}\n**Document:\n", 71 | add_special_tokens=False, 72 | ) 73 | # response = tokenizer( 74 | # f"<|im_start|>assistant\n**Answer:{example['output'].strip()}\n**Summary:\n<|im_end|>\n", 75 | # add_special_tokens=False, 76 | # ) 77 | response = f"<|im_start|>assistant\n**Answer:{example['output'].strip()}\n**Summary:\n<|im_end|>\n" 78 | sentence_position = [0] * len(instruction["input_ids"]) + sentence_position 79 | input = instruction["input_ids"] + token_doc["input_ids"] 80 | attention_mask = instruction["attention_mask"] + token_doc["attention_mask"] 81 | output = response 82 | 83 | if "supporting_num" in example.keys(): 84 | gold_sp = example["supporting_num"] 85 | else: 86 | gold_sp = None 87 | assert len(input) == len(sentence_position) == len(attention_mask) 88 | 89 | all_result.append( 90 | InferenceInput( 91 | _id=example["_id"], 92 | input_text=input, 93 | answer=output, 94 | attention_mask=attention_mask, 95 | sent_masks=sentence_position, 96 | gold_sp=gold_sp, 97 | ) 98 | ) 99 | if data_sample: 100 | if len(all_result) == 100: 101 | break 102 | return all_result 103 | 104 | 105 | def create_batches(input_list, batch_size): 106 | # Split the input list into batches of size 'batch_size' 107 | for i in range(0, len(input_list), batch_size): 108 | yield input_list[i : i + batch_size] 109 | 110 | 111 | def generate_batch_answer(batches, tokenizer, model): 112 | for batch_num, batch in enumerate(tqdm(batches)): 113 | input_ids = [item.input_text for item in batch] 114 | attention_mask = [item.attention_mask for item in batch] 115 | sentence_masks = [item.sent_masks for item in batch] 116 | 117 | model.to("cuda") 118 | input_batch = {} 119 | max_length = max(len(mask) for mask in input_ids) 120 | padded_input_ids = [[tokenizer.pad_token_id] * (max_length - len(mask)) + mask for mask in input_ids] 121 | input_batch["input_ids"] = torch.tensor(padded_input_ids).cuda() 122 | 
padded_attention_mask = [[0] * (max_length - len(mask)) + mask for mask in attention_mask] 123 | input_batch["attention_mask"] = torch.tensor(padded_attention_mask).cuda() 124 | padded_sentence_masks = [[0] * (max_length - len(mask)) + mask for mask in sentence_masks] 125 | input_batch["sent_masks"] = torch.tensor(padded_sentence_masks).cuda() 126 | 127 | with torch.no_grad(): 128 | model.evidence = None 129 | model.sentence_number = None 130 | outputs = model.generate( 131 | input_ids=input_batch["input_ids"], 132 | attention_mask=input_batch["attention_mask"], 133 | sent_masks=input_batch["sent_masks"], 134 | max_new_tokens=200, 135 | ) 136 | 137 | input_text = [tokenizer.decode(input_id, skip_special_tokens=True) for i, input_id in enumerate(input_ids)] 138 | decoded_outputs = [ 139 | tokenizer.decode(output[len(input_text) :], skip_special_tokens=True) for i, output in enumerate(outputs) 140 | ] 141 | decoded_outputs_ = [tokenizer.decode(output, skip_special_tokens=True) for i, output in enumerate(outputs)] 142 | 143 | # Store the generated text back in the input objects 144 | for i, item in enumerate(batch): 145 | item.input_text = input_text 146 | item.generated_text = decoded_outputs[i] 147 | item.generated_all_answer = decoded_outputs_[i] 148 | if model.sentence_number != None: 149 | item.pred_sp = model.sentence_number[i] 150 | return batches 151 | 152 | 153 | def write_result(output_path, answer_batches, tokenizer): 154 | all_result = [] 155 | for batch_num, batch in enumerate(answer_batches): 156 | for item in batch: 157 | result = {} 158 | result["_id"] = item._id 159 | if "assistant\n" in item.generated_text: 160 | result["generated_text"] = item.generated_text.split("assistant\n")[1] 161 | elif "assistant" in item.generated_text: 162 | result["generated_text"] = item.generated_text.split("assistant")[1] 163 | else: 164 | result["generated_text"] = item.generated_text 165 | result["answer"] = item.answer 166 | result["generated_all_answer"] = item.generated_all_answer 167 | if item.gold_sp != None: 168 | result["gold_sp"] = item.gold_sp 169 | result["pred_sp"] = item.pred_sp.tolist() 170 | all_result.append(result) 171 | 172 | with open(output_path, "w", encoding="utf-8") as f: 173 | json.dump(all_result, f, ensure_ascii=False, indent=4) 174 | 175 | 176 | if __name__ == "__main__": 177 | ############################################################## 178 | # model param 추가할 내용 179 | ############################################################## 180 | parser = argparse.ArgumentParser(description="인자값을 전달받는 Python 스크립트") 181 | parser.add_argument("--base_model_path", type=str, default="Qwen/Qwen2.5-3B-Instruct") 182 | parser.add_argument("--train_model_path", type=str, default="model/1105_noloss/checkpoint-16600") 183 | parser.add_argument("--data_file", type=str, default="data/1029data/hotpot_dev_supporting.json") 184 | parser.add_argument("--beam_size", type=int, default=1) 185 | parser.add_argument("--max_dec_len", type=int, default=3) 186 | parser.add_argument("--output_dir", type=str, default="result/1105_noloss/test.json") 187 | parser.add_argument("--batch_size", type=int, default=8) 188 | parser.add_argument("--data_sample", type=bool, default=True) 189 | parser.add_argument("--mrc_value", type=str, default=True) 190 | parser.add_argument("--sum_value", type=str, default=False) 191 | args = parser.parse_args() 192 | print(args) 193 | ######################################################### 194 | # 변수들 선언 195 | ######################################################### 196 
| 197 | config = AutoConfig.from_pretrained(args.base_model_path) 198 | config.beam_size = args.beam_size 199 | config.max_dec_len = args.max_dec_len 200 | 201 | tokenizer, model = create_model(args.base_model_path, args.train_model_path, config) 202 | print("batch size : ", args.batch_size) 203 | 204 | with open(args.data_file, "r", encoding="utf-8") as file: 205 | dev_data = json.load(file) 206 | 207 | input_data = create_example(dev_data, tokenizer, args.data_sample, args.mrc_value, args.sum_value) 208 | 209 | # Create batches of input items 210 | batches = list(create_batches(input_data, args.batch_size)) 211 | 212 | answer_batches = generate_batch_answer(batches, tokenizer, model) 213 | #### 답변작성 214 | 215 | write_result(args.output_dir, answer_batches, tokenizer) -------------------------------------------------------------------------------- /source/inference_pn_att_1106.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig 2 | 3 | import torch 4 | from tqdm import tqdm 5 | import json 6 | from peft import PeftModel, PeftConfig 7 | from datasets import Dataset 8 | 9 | from modeling_qwen2_pn_att_1106 import Qwen2ForCausalLM_pn, BeamSearchAttentionDecoder 10 | import argparse 11 | 12 | 13 | def create_model(base_model_path, lora_path, config): 14 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 15 | trained_model = Qwen2ForCausalLM_pn.from_pretrained(lora_path, config=config, device_map="auto") 16 | gru = BeamSearchAttentionDecoder(hidden_size=config.hidden_size, num_sent=config.max_dec_len, topk=config.beam_size) 17 | trained_model.set_gru(gru) 18 | trained_model.config.use_cache = False 19 | tokenizer.padding_side = "left" 20 | print("LORA WEIGHT LOADING") 21 | trained_model.load_pn_model(lora_path) 22 | return tokenizer, trained_model 23 | 24 | 25 | class InferenceInput: 26 | def __init__(self, _id, input_text, answer, attention_mask, sent_masks, gold_sp): 27 | self._id = _id 28 | self.input_text = input_text 29 | self.answer = answer 30 | self.attention_mask = attention_mask 31 | self.sent_masks = sent_masks 32 | self.gold_sp = gold_sp 33 | 34 | 35 | def create_example(all_example, tokenizer, data_sample, mrc_value, sum_value): 36 | all_result = [] 37 | 38 | task_instruction = "Only fill in the **Answer to the **Question based on the **Document if <|MRC|> is True. Do not fill in the **Answer if the Question is not provided or if <|MRC|> is False. Only fill in the **Summary with a summary of the **Document if <|SUM|> is True. Do not fill in the **Summary if <|SUM|> is False." 
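    # For dev sets with supporting-fact labels this variant also tracks evidence selection:
    # gold_sp is read from example["supporting_num"] below, and generate_batch_answer later
    # stores model.sentence_number[i] as pred_sp. Shapes here are only an assumption for
    # illustration: gold_sp might look like [3, 7], while pred_sp appears to hold up to
    # max_dec_len (default 3) predicted sentence indices, e.g. [3, 7, 7].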
39 | for example in tqdm(all_example): 40 | example["document"] = example["document"].strip() 41 | # token 된 doc 42 | token_doc = {"input_ids": [], "attention_mask": []} 43 | # document 문장 index 44 | sentence_number = 0 45 | sentence_position = [] 46 | for i, sent in enumerate(example["sent"]): 47 | # 0번 문장은 instruction으로 지정할 계획 48 | sent = sent.strip() 49 | token_sent = tokenizer(sent + " ", add_special_tokens=False) 50 | sentence_number += 1 # 1부터 시작 51 | sentence_position.extend([sentence_number] * len(token_sent["input_ids"])) 52 | token_doc["input_ids"] += token_sent["input_ids"] 53 | token_doc["attention_mask"] += token_sent["attention_mask"] 54 | token_end = tokenizer("<|im_end|>\n", add_special_tokens=False) 55 | sentence_position.extend([sentence_number] * len(token_end)) 56 | token_doc["input_ids"] += token_end["input_ids"] 57 | token_doc["attention_mask"] += token_end["attention_mask"] 58 | 59 | if example["question"] == "summary": 60 | instruction = tokenizer( 61 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Document:\n", 62 | add_special_tokens=False, 63 | ) 64 | # response = tokenizer( 65 | # f"<|im_start|>assistant\n**Answer:\n**Summary:{example['output'].strip()}\n<|im_end|>\n", 66 | # add_special_tokens=False, 67 | # ) 68 | response = f"<|im_start|>assistant\n**Answer:\n**Summary:{example['output'].strip()}\n<|im_end|>\n" 69 | else: # MRC의 경우 70 | instruction = tokenizer( 71 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Question:{example['question'].strip()}\n**Document:\n", 72 | add_special_tokens=False, 73 | ) 74 | # response = tokenizer( 75 | # f"<|im_start|>assistant\n**Answer:{example['output'].strip()}\n**Summary:\n<|im_end|>\n", 76 | # add_special_tokens=False, 77 | # ) 78 | response = f"<|im_start|>assistant\n**Answer:{example['output'].strip()}\n**Summary:\n<|im_end|>\n" 79 | sentence_position = [0] * len(instruction["input_ids"]) + sentence_position 80 | input = instruction["input_ids"] + token_doc["input_ids"] 81 | attention_mask = instruction["attention_mask"] + token_doc["attention_mask"] 82 | output = response 83 | 84 | if "supporting_num" in example.keys(): 85 | gold_sp = example["supporting_num"] 86 | else: 87 | gold_sp = None 88 | assert len(input) == len(sentence_position) == len(attention_mask) 89 | 90 | all_result.append( 91 | InferenceInput( 92 | _id=example["_id"], 93 | input_text=input, 94 | answer=output, 95 | attention_mask=attention_mask, 96 | sent_masks=sentence_position, 97 | gold_sp=gold_sp, 98 | ) 99 | ) 100 | if data_sample: 101 | if len(all_result) == 30: 102 | break 103 | return all_result 104 | 105 | 106 | def create_batches(input_list, batch_size): 107 | # Split the input list into batches of size 'batch_size' 108 | for i in range(0, len(input_list), batch_size): 109 | yield input_list[i : i + batch_size] 110 | 111 | 112 | def generate_batch_answer(batches, tokenizer, model): 113 | for batch_num, batch in enumerate(tqdm(batches)): 114 | input_ids = [item.input_text for item in batch] 115 | attention_mask = [item.attention_mask for item in batch] 116 | sentence_masks = [item.sent_masks for item in batch] 117 | 118 | model.to("cuda") 119 | input_batch = {} 120 | max_length = max(len(mask) for mask in input_ids) 121 | padded_input_ids = [[tokenizer.pad_token_id] * (max_length - len(mask)) + mask for mask in input_ids] 122 | input_batch["input_ids"] = torch.tensor(padded_input_ids).cuda() 123 | 
padded_attention_mask = [[0] * (max_length - len(mask)) + mask for mask in attention_mask] 124 | input_batch["attention_mask"] = torch.tensor(padded_attention_mask).cuda() 125 | padded_sentence_masks = [[0] * (max_length - len(mask)) + mask for mask in sentence_masks] 126 | input_batch["sent_masks"] = torch.tensor(padded_sentence_masks).cuda() 127 | 128 | with torch.no_grad(): 129 | model.evidence = None 130 | model.sentence_number = None 131 | outputs = model.generate( 132 | input_ids=input_batch["input_ids"], 133 | attention_mask=input_batch["attention_mask"], 134 | sent_masks=input_batch["sent_masks"], 135 | max_new_tokens=200, 136 | ) 137 | 138 | input_text = [tokenizer.decode(input_id, skip_special_tokens=True) for i, input_id in enumerate(input_ids)] 139 | decoded_outputs = [ 140 | tokenizer.decode(output[len(input_text) :], skip_special_tokens=True) for i, output in enumerate(outputs) 141 | ] 142 | decoded_outputs_ = [tokenizer.decode(output, skip_special_tokens=True) for i, output in enumerate(outputs)] 143 | 144 | # Store the generated text back in the input objects 145 | for i, item in enumerate(batch): 146 | item.input_text = input_text 147 | item.generated_text = decoded_outputs[i] 148 | item.generated_all_answer = decoded_outputs_[i] 149 | if model.sentence_number != None: 150 | item.pred_sp = model.sentence_number[i] 151 | return batches 152 | 153 | 154 | def write_result(output_path, answer_batches, tokenizer): 155 | all_result = [] 156 | for batch_num, batch in enumerate(answer_batches): 157 | for item in batch: 158 | result = {} 159 | result["_id"] = item._id 160 | if "assistant\n" in item.generated_text: 161 | result["generated_text"] = item.generated_text.split("assistant\n")[1] 162 | elif "assistant" in item.generated_text: 163 | result["generated_text"] = item.generated_text.split("assistant")[1] 164 | else: 165 | result["generated_text"] = item.generated_text 166 | result["answer"] = item.answer 167 | result["generated_all_answer"] = item.generated_all_answer 168 | if item.gold_sp != None: 169 | result["gold_sp"] = item.gold_sp 170 | result["pred_sp"] = item.pred_sp.tolist() 171 | all_result.append(result) 172 | 173 | with open(output_path, "w", encoding="utf-8") as f: 174 | json.dump(all_result, f, ensure_ascii=False, indent=4) 175 | 176 | 177 | if __name__ == "__main__": 178 | ############################################################## 179 | # model param 추가할 내용 180 | ############################################################## 181 | parser = argparse.ArgumentParser(description="인자값을 전달받는 Python 스크립트") 182 | parser.add_argument("--base_model_path", type=str, default="Qwen/Qwen2.5-3B-Instruct") 183 | parser.add_argument("--train_model_path", type=str, default="model/1107_weighted_context/checkpoint-2200") 184 | parser.add_argument("--data_file", type=str, default="data/1029data/hotpot_dev_supporting.json") 185 | parser.add_argument("--beam_size", type=int, default=1) 186 | parser.add_argument("--max_dec_len", type=int, default=3) 187 | parser.add_argument("--output_dir", type=str, default="result/1107_weighted_context/hotpot_tt_2200.json") 188 | parser.add_argument("--batch_size", type=int, default=8) 189 | parser.add_argument("--data_sample", type=bool, default=True) 190 | parser.add_argument("--mrc_value", type=str, default=True) 191 | parser.add_argument("--sum_value", type=str, default=False) 192 | args = parser.parse_args() 193 | print(args) 194 | ######################################################### 195 | # 변수들 선언 196 | 
######################################################### 197 | 198 | config = AutoConfig.from_pretrained(args.base_model_path) 199 | config.beam_size = args.beam_size 200 | config.max_dec_len = args.max_dec_len 201 | 202 | tokenizer, model = create_model(args.base_model_path, args.train_model_path, config) 203 | print("batch size : ", args.batch_size) 204 | 205 | with open(args.data_file, "r", encoding="utf-8") as file: 206 | dev_data = json.load(file) 207 | 208 | input_data = create_example(dev_data, tokenizer, args.data_sample, args.mrc_value, args.sum_value) 209 | 210 | # Create batches of input items 211 | batches = list(create_batches(input_data, args.batch_size)) 212 | 213 | answer_batches = generate_batch_answer(batches, tokenizer, model) 214 | #### 답변작성 215 | 216 | write_result(args.output_dir, answer_batches, tokenizer) -------------------------------------------------------------------------------- /source/inference_pn_att_1106_sum.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig 2 | 3 | import torch 4 | from tqdm import tqdm 5 | import json 6 | from peft import PeftModel, PeftConfig 7 | from datasets import Dataset 8 | 9 | from modeling_qwen2_pn_att_1106_sum import Qwen2ForCausalLM_pn, BeamSearchAttentionDecoder 10 | import argparse 11 | 12 | 13 | def create_model(base_model_path, lora_path, config): 14 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 15 | trained_model = Qwen2ForCausalLM_pn.from_pretrained(lora_path, config=config, device_map="auto") 16 | gru = BeamSearchAttentionDecoder(hidden_size=config.hidden_size, num_sent=config.max_dec_len, topk=config.beam_size) 17 | trained_model.set_gru(gru) 18 | trained_model.config.use_cache = False 19 | tokenizer.padding_side = "left" 20 | print("LORA WEIGHT LOADING") 21 | trained_model.load_pn_model(lora_path) 22 | return tokenizer, trained_model 23 | 24 | 25 | class InferenceInput: 26 | def __init__(self, _id, input_text, answer, attention_mask, sent_masks, gold_sp): 27 | self._id = _id 28 | self.input_text = input_text 29 | self.answer = answer 30 | self.attention_mask = attention_mask 31 | self.sent_masks = sent_masks 32 | self.gold_sp = gold_sp 33 | 34 | 35 | def create_example(all_example, tokenizer, data_sample, mrc_value, sum_value): 36 | all_result = [] 37 | 38 | task_instruction = "Only fill in the **Answer to the **Question based on the **Document if <|MRC|> is True. Do not fill in the **Answer if the Question is not provided or if <|MRC|> is False. Only fill in the **Summary with a summary of the **Document if <|SUM|> is True. Do not fill in the **Summary if <|SUM|> is False." 
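# ---------------------------------------------------------------------------
# A minimal sketch (not one of the repository files) of the batching pattern
# used by generate_batch_answer() further down in this file: variable-length
# prompts are left-padded (tokenizer.padding_side = "left" is set in
# create_model above) so every generated continuation starts at the same
# position, and the prompt prefix is stripped per item before decoding.
# The helper names here are illustrative only.
import torch

def left_pad(sequences, pad_value):
    """Left-pad a list of token-id lists to a common length; returns a LongTensor."""
    max_len = max(len(seq) for seq in sequences)
    return torch.tensor([[pad_value] * (max_len - len(seq)) + seq for seq in sequences])

def decode_new_tokens(tokenizer, padded_input_ids, generated_ids):
    """Decode only the tokens produced after each (padded) prompt row."""
    texts = []
    for prompt_row, output_row in zip(padded_input_ids, generated_ids):
        texts.append(tokenizer.decode(output_row[len(prompt_row):], skip_special_tokens=True))
    return texts
# ---------------------------------------------------------------------------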
39 | for example in tqdm(all_example): 40 | example["document"] = example["document"].strip() 41 | # token 된 doc 42 | token_doc = {"input_ids": [], "attention_mask": []} 43 | # document 문장 index 44 | sentence_number = 0 45 | sentence_position = [] 46 | for i, sent in enumerate(example["sent"]): 47 | # 0번 문장은 instruction으로 지정할 계획 48 | sent = sent.strip() 49 | token_sent = tokenizer(sent + " ", add_special_tokens=False) 50 | sentence_number += 1 # 1부터 시작 51 | sentence_position.extend([sentence_number] * len(token_sent["input_ids"])) 52 | token_doc["input_ids"] += token_sent["input_ids"] 53 | token_doc["attention_mask"] += token_sent["attention_mask"] 54 | token_end = tokenizer("<|im_end|>\n", add_special_tokens=False) 55 | sentence_position.extend([sentence_number] * len(token_end)) 56 | token_doc["input_ids"] += token_end["input_ids"] 57 | token_doc["attention_mask"] += token_end["attention_mask"] 58 | 59 | if example["question"] == "summary": 60 | instruction = tokenizer( 61 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Document:\n", 62 | add_special_tokens=False, 63 | ) 64 | # response = tokenizer( 65 | # f"<|im_start|>assistant\n**Answer:\n**Summary:{example['output'].strip()}\n<|im_end|>\n", 66 | # add_special_tokens=False, 67 | # ) 68 | response = f"<|im_start|>assistant\n**Answer:\n**Summary:{example['output'].strip()}\n<|im_end|>\n" 69 | else: # MRC의 경우 70 | instruction = tokenizer( 71 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Question:{example['question'].strip()}\n**Document:\n", 72 | add_special_tokens=False, 73 | ) 74 | # response = tokenizer( 75 | # f"<|im_start|>assistant\n**Answer:{example['output'].strip()}\n**Summary:\n<|im_end|>\n", 76 | # add_special_tokens=False, 77 | # ) 78 | response = f"<|im_start|>assistant\n**Answer:{example['output'].strip()}\n**Summary:\n<|im_end|>\n" 79 | sentence_position = [0] * len(instruction["input_ids"]) + sentence_position 80 | input = instruction["input_ids"] + token_doc["input_ids"] 81 | attention_mask = instruction["attention_mask"] + token_doc["attention_mask"] 82 | output = response 83 | 84 | if "supporting_num" in example.keys(): 85 | gold_sp = example["supporting_num"] 86 | else: 87 | gold_sp = None 88 | assert len(input) == len(sentence_position) == len(attention_mask) 89 | 90 | all_result.append( 91 | InferenceInput( 92 | _id=example["_id"], 93 | input_text=input, 94 | answer=output, 95 | attention_mask=attention_mask, 96 | sent_masks=sentence_position, 97 | gold_sp=gold_sp, 98 | ) 99 | ) 100 | if data_sample: 101 | if len(all_result) == 30: 102 | break 103 | return all_result 104 | 105 | 106 | def create_batches(input_list, batch_size): 107 | # Split the input list into batches of size 'batch_size' 108 | for i in range(0, len(input_list), batch_size): 109 | yield input_list[i : i + batch_size] 110 | 111 | 112 | def generate_batch_answer(batches, tokenizer, model): 113 | for batch_num, batch in enumerate(tqdm(batches)): 114 | input_ids = [item.input_text for item in batch] 115 | attention_mask = [item.attention_mask for item in batch] 116 | sentence_masks = [item.sent_masks for item in batch] 117 | 118 | model.to("cuda") 119 | input_batch = {} 120 | max_length = max(len(mask) for mask in input_ids) 121 | padded_input_ids = [[tokenizer.pad_token_id] * (max_length - len(mask)) + mask for mask in input_ids] 122 | input_batch["input_ids"] = torch.tensor(padded_input_ids).cuda() 123 | 
padded_attention_mask = [[0] * (max_length - len(mask)) + mask for mask in attention_mask] 124 | input_batch["attention_mask"] = torch.tensor(padded_attention_mask).cuda() 125 | padded_sentence_masks = [[0] * (max_length - len(mask)) + mask for mask in sentence_masks] 126 | input_batch["sent_masks"] = torch.tensor(padded_sentence_masks).cuda() 127 | 128 | with torch.no_grad(): 129 | model.evidence = None 130 | model.sentence_number = None 131 | outputs = model.generate( 132 | input_ids=input_batch["input_ids"], 133 | attention_mask=input_batch["attention_mask"], 134 | sent_masks=input_batch["sent_masks"], 135 | max_new_tokens=200, 136 | ) 137 | 138 | input_text = [tokenizer.decode(input_id, skip_special_tokens=True) for i, input_id in enumerate(input_ids)] 139 | decoded_outputs = [ 140 | tokenizer.decode(output[len(input_text) :], skip_special_tokens=True) for i, output in enumerate(outputs) 141 | ] 142 | decoded_outputs_ = [tokenizer.decode(output, skip_special_tokens=True) for i, output in enumerate(outputs)] 143 | 144 | # Store the generated text back in the input objects 145 | for i, item in enumerate(batch): 146 | item.input_text = input_text 147 | item.generated_text = decoded_outputs[i] 148 | item.generated_all_answer = decoded_outputs_[i] 149 | if model.sentence_number != None: 150 | item.pred_sp = model.sentence_number[i] 151 | return batches 152 | 153 | 154 | def write_result(output_path, answer_batches, tokenizer): 155 | all_result = [] 156 | for batch_num, batch in enumerate(answer_batches): 157 | for item in batch: 158 | result = {} 159 | result["_id"] = item._id 160 | if "assistant\n" in item.generated_text: 161 | result["generated_text"] = item.generated_text.split("assistant\n")[1] 162 | elif "assistant" in item.generated_text: 163 | result["generated_text"] = item.generated_text.split("assistant")[1] 164 | else: 165 | result["generated_text"] = item.generated_text 166 | result["answer"] = item.answer 167 | result["generated_all_answer"] = item.generated_all_answer 168 | if item.gold_sp != None: 169 | result["gold_sp"] = item.gold_sp 170 | result["pred_sp"] = item.pred_sp.tolist() 171 | all_result.append(result) 172 | 173 | with open(output_path, "w", encoding="utf-8") as f: 174 | json.dump(all_result, f, ensure_ascii=False, indent=4) 175 | 176 | 177 | if __name__ == "__main__": 178 | ############################################################## 179 | # model param 추가할 내용 180 | ############################################################## 181 | parser = argparse.ArgumentParser(description="인자값을 전달받는 Python 스크립트") 182 | parser.add_argument("--base_model_path", type=str, default="Qwen/Qwen2.5-3B-Instruct") 183 | parser.add_argument("--train_model_path", type=str, default="model/1107_weighted_context/checkpoint-2200") 184 | parser.add_argument("--data_file", type=str, default="data/1029data/hotpot_dev_supporting.json") 185 | parser.add_argument("--beam_size", type=int, default=1) 186 | parser.add_argument("--max_dec_len", type=int, default=3) 187 | parser.add_argument("--output_dir", type=str, default="result/1107_weighted_context/hotpot_tt_2200.json") 188 | parser.add_argument("--batch_size", type=int, default=8) 189 | parser.add_argument("--data_sample", type=bool, default=True) 190 | parser.add_argument("--mrc_value", type=str, default=True) 191 | parser.add_argument("--sum_value", type=str, default=False) 192 | args = parser.parse_args() 193 | print(args) 194 | ######################################################### 195 | # 변수들 선언 196 | 
######################################################### 197 | 198 | config = AutoConfig.from_pretrained(args.base_model_path) 199 | config.beam_size = args.beam_size 200 | config.max_dec_len = args.max_dec_len 201 | 202 | tokenizer, model = create_model(args.base_model_path, args.train_model_path, config) 203 | print("batch size : ", args.batch_size) 204 | 205 | with open(args.data_file, "r", encoding="utf-8") as file: 206 | dev_data = json.load(file) 207 | 208 | input_data = create_example(dev_data, tokenizer, args.data_sample, args.mrc_value, args.sum_value) 209 | 210 | # Create batches of input items 211 | batches = list(create_batches(input_data, args.batch_size)) 212 | 213 | answer_batches = generate_batch_answer(batches, tokenizer, model) 214 | #### 답변작성 215 | 216 | write_result(args.output_dir, answer_batches, tokenizer) -------------------------------------------------------------------------------- /source/inference_pn_att_1107.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig 2 | 3 | import torch 4 | from tqdm import tqdm 5 | import json 6 | from peft import PeftModel, PeftConfig 7 | from datasets import Dataset 8 | 9 | from modeling_qwen2_pn_att_1107 import Qwen2ForCausalLM_pn, BeamSearchAttentionDecoder 10 | import argparse 11 | 12 | 13 | def create_model(base_model_path, lora_path, config): 14 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 15 | trained_model = Qwen2ForCausalLM_pn.from_pretrained(lora_path, config=config, device_map="auto") 16 | gru = BeamSearchAttentionDecoder( 17 | hidden_size=config.hidden_size, num_sent=config.max_dec_len, topk=config.beam_size 18 | ) 19 | trained_model.set_gru(gru) 20 | trained_model.config.use_cache = False 21 | tokenizer.padding_side = "left" 22 | print("LORA WEIGHT LOADING") 23 | trained_model.load_pn_model(lora_path) 24 | return tokenizer, trained_model 25 | 26 | 27 | class InferenceInput: 28 | def __init__(self, _id, input_text, answer, attention_mask, sent_masks, gold_sp): 29 | self._id = _id 30 | self.input_text = input_text 31 | self.answer = answer 32 | self.attention_mask = attention_mask 33 | self.sent_masks = sent_masks 34 | self.gold_sp = gold_sp 35 | 36 | 37 | def create_example(all_example, tokenizer, data_sample): 38 | all_result = [] 39 | for example in tqdm(all_example): 40 | example["document"] = example["document"].strip() 41 | # token 된 doc 42 | token_doc = {"input_ids": [], "attention_mask": []} 43 | # document 문장 index 44 | sentence_number = 0 45 | sentence_position = [] 46 | for i, sent in enumerate(example["sent"]): 47 | # 0번 문장은 instruction으로 지정할 계획 48 | sent = sent.strip() 49 | token_sent = tokenizer(sent + " ", add_special_tokens=False) 50 | sentence_number += 1 # 1부터 시작 51 | sentence_position.extend([sentence_number] * len(token_sent["input_ids"])) 52 | token_doc["input_ids"] += token_sent["input_ids"] 53 | token_doc["attention_mask"] += token_sent["attention_mask"] 54 | token_end = tokenizer("<|im_end|>\n", add_special_tokens=False) 55 | sentence_position.extend([sentence_number] * len(token_end)) 56 | token_doc["input_ids"] += token_end["input_ids"] 57 | token_doc["attention_mask"] += token_end["attention_mask"] 58 | 59 | instruction = tokenizer( 60 | f"<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\n<|im_start|>user\n**Question:{example['question'].strip()}\n**Document:\n", 61 | add_special_tokens=False, 62 | ) 63 | response = f"<|im_start|>assistant\n**Answer:{example['output'].strip()}\n<|im_end|>\n" 64 | sentence_position = [0] * len(instruction["input_ids"]) + sentence_position 65 | input = instruction["input_ids"] + token_doc["input_ids"] 66 | attention_mask = instruction["attention_mask"] + token_doc["attention_mask"] 67 | output = response 68 | 69 | if "supporting_num" in example.keys(): 70 | gold_sp = example["supporting_num"] 71 | else: 72 | gold_sp = None 73 | assert len(input) == len(sentence_position) == len(attention_mask) 74 | 75 | all_result.append( 76 | InferenceInput( 77 | _id=example["_id"], 78 | input_text=input, 79 | answer=output, 80 | attention_mask=attention_mask, 81 | sent_masks=sentence_position, 82 | gold_sp=gold_sp, 83 | ) 84 | ) 85 | if data_sample: 86 | if len(all_result) == 100: 87 | break 88 | return all_result 89 | 90 | 91 | def create_batches(input_list, batch_size): 92 | # Split the input list into batches of size 'batch_size' 93 | for i in range(0, len(input_list), batch_size): 94 | yield input_list[i : i + batch_size] 95 | 96 | 97 | def generate_batch_answer(batches, tokenizer, model): 98 | for batch_num, batch in enumerate(tqdm(batches)): 99 | input_ids = [item.input_text for item in batch] 100 | attention_mask = [item.attention_mask for item in batch] 101 | sentence_masks = [item.sent_masks for item in batch] 102 | 103 | model.to("cuda") 104 | model.eval() 105 | input_batch = {} 106 | max_length = max(len(mask) for mask in input_ids) 107 | padded_input_ids = [[tokenizer.pad_token_id] * (max_length - len(mask)) + mask for mask in input_ids] 108 | input_batch["input_ids"] = torch.tensor(padded_input_ids).cuda() 109 | padded_attention_mask = [[0] * (max_length - len(mask)) + mask for mask in attention_mask] 110 | input_batch["attention_mask"] = torch.tensor(padded_attention_mask).cuda() 111 | padded_sentence_masks = [[0] * (max_length - len(mask)) + mask for mask in sentence_masks] 112 | input_batch["sent_masks"] = torch.tensor(padded_sentence_masks).cuda() 113 | 114 | with torch.no_grad(): 115 | model.evidence = None 116 | model.sentence_number = None 117 | outputs = model.generate( 118 | input_ids=input_batch["input_ids"], 119 | attention_mask=input_batch["attention_mask"], 120 | sent_masks=input_batch["sent_masks"], 121 | max_new_tokens=50, 122 | # temperature=0.0, 123 | # do_sample=False, 124 | ) 125 | input_text = [] 126 | decoded_outputs = [] 127 | decoded_outputs_ = [] 128 | 129 | for i in range(len(input_ids)): 130 | input_text.append(tokenizer.decode(input_ids[i], skip_special_tokens=True)) 131 | trimmed_output = outputs[i][len(input_batch["input_ids"][i]) :] 132 | decoded_outputs.append(tokenizer.decode(trimmed_output, skip_special_tokens=True)) 133 | decoded_outputs_.append(tokenizer.decode(outputs[i], skip_special_tokens=True)) 134 | 135 | # Store the generated text back in the input objects 136 | for i, item in enumerate(batch): 137 | item.input_text = input_text 138 | item.generated_text = decoded_outputs[i] 139 | item.generated_all_answer = decoded_outputs_[i] 140 | if model.sentence_number != None: 141 | item.pred_sp = model.sentence_number[i] 142 | return batches 143 | 144 | 145 | def write_result(output_path, answer_batches, tokenizer): 146 | all_result = [] 147 | for batch_num, batch in enumerate(answer_batches): 148 | for item in batch: 149 | result = {} 150 | result["_id"] = item._id 151 | 
if "assistant\n" in item.generated_text: 152 | result["generated_text"] = item.generated_text.split("assistant\n")[1] 153 | elif "assistant" in item.generated_text: 154 | result["generated_text"] = item.generated_text.split("assistant")[1] 155 | else: 156 | result["generated_text"] = item.generated_text 157 | result["answer"] = item.answer 158 | result["generated_all_answer"] = item.generated_all_answer 159 | if item.gold_sp != None: 160 | result["gold_sp"] = item.gold_sp 161 | result["pred_sp"] = item.pred_sp.tolist() 162 | all_result.append(result) 163 | 164 | with open(output_path, "w", encoding="utf-8") as f: 165 | json.dump(all_result, f, ensure_ascii=False, indent=4) 166 | 167 | 168 | if __name__ == "__main__": 169 | ############################################################## 170 | # model param 추가할 내용 171 | ############################################################## 172 | parser = argparse.ArgumentParser(description="인자값을 전달받는 Python 스크립트") 173 | parser.add_argument("--base_model_path", type=str, default="Qwen/Qwen2.5-3B-Instruct") 174 | parser.add_argument("--train_model_path", type=str, default="model/1205_yesloss/checkpoint-15000") 175 | parser.add_argument("--data_file", type=str, default="data/1125data/hotpot_dev.json") 176 | parser.add_argument("--beam_size", type=int, default=1) 177 | parser.add_argument("--max_dec_len", type=int, default=3) 178 | parser.add_argument("--output_dir", type=str, default="result/1205_yesloss/15000.json") 179 | parser.add_argument("--batch_size", type=int, default=8) 180 | parser.add_argument("--data_sample", type=bool, default=True) 181 | 182 | args = parser.parse_args() 183 | print(args) 184 | ######################################################### 185 | # 변수들 선언 186 | ######################################################### 187 | 188 | config = AutoConfig.from_pretrained(args.base_model_path) 189 | config.beam_size = args.beam_size 190 | config.max_dec_len = args.max_dec_len 191 | 192 | tokenizer, model = create_model(args.base_model_path, args.train_model_path, config) 193 | print("batch size : ", args.batch_size) 194 | 195 | with open(args.data_file, "r", encoding="utf-8") as file: 196 | dev_data = json.load(file) 197 | 198 | input_data = create_example(dev_data, tokenizer, args.data_sample) 199 | 200 | # Create batches of input items 201 | batches = list(create_batches(input_data, args.batch_size)) 202 | 203 | answer_batches = generate_batch_answer(batches, tokenizer, model) 204 | #### 답변작성 205 | 206 | write_result(args.output_dir, answer_batches, tokenizer) 207 | -------------------------------------------------------------------------------- /source/inference_pn_att_1107_sum.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig 2 | 3 | import torch 4 | from tqdm import tqdm 5 | import json 6 | from peft import PeftModel, PeftConfig 7 | from datasets import Dataset 8 | 9 | from modeling_qwen2_pn_att_1107_sum import Qwen2ForCausalLM_pn, BeamSearchAttentionDecoder 10 | import argparse 11 | 12 | 13 | def create_model(base_model_path, lora_path, config): 14 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 15 | trained_model = Qwen2ForCausalLM_pn.from_pretrained(lora_path, config=config, device_map="auto") 16 | gru = BeamSearchAttentionDecoder(hidden_size=config.hidden_size, num_sent=config.max_dec_len, topk=config.beam_size) 17 | trained_model.set_gru(gru) 18 | trained_model.config.use_cache = False 19 | 
tokenizer.padding_side = "left" 20 | print("LORA WEIGHT LOADING") 21 | trained_model.load_pn_model(lora_path) 22 | return tokenizer, trained_model 23 | 24 | 25 | class InferenceInput: 26 | def __init__(self, _id, input_text, answer, attention_mask, sent_masks, gold_sp): 27 | self._id = _id 28 | self.input_text = input_text 29 | self.answer = answer 30 | self.attention_mask = attention_mask 31 | self.sent_masks = sent_masks 32 | self.gold_sp = gold_sp 33 | 34 | 35 | def create_example(all_example, tokenizer, data_sample, mrc_value, sum_value): 36 | all_result = [] 37 | 38 | task_instruction = "Only fill in the **Answer to the **Question based on the **Document if <|MRC|> is True. Do not fill in the **Answer if the Question is not provided or if <|MRC|> is False. Only fill in the **Summary with a summary of the **Document if <|SUM|> is True. Do not fill in the **Summary if <|SUM|> is False." 39 | for example in tqdm(all_example): 40 | example["document"] = example["document"].strip() 41 | # token 된 doc 42 | token_doc = {"input_ids": [], "attention_mask": []} 43 | # document 문장 index 44 | sentence_number = 0 45 | sentence_position = [] 46 | for i, sent in enumerate(example["sent"]): 47 | # 0번 문장은 instruction으로 지정할 계획 48 | sent = sent.strip() 49 | token_sent = tokenizer(sent + " ", add_special_tokens=False) 50 | sentence_number += 1 # 1부터 시작 51 | sentence_position.extend([sentence_number] * len(token_sent["input_ids"])) 52 | token_doc["input_ids"] += token_sent["input_ids"] 53 | token_doc["attention_mask"] += token_sent["attention_mask"] 54 | token_end = tokenizer("<|im_end|>\n", add_special_tokens=False) 55 | sentence_position.extend([sentence_number] * len(token_end)) 56 | token_doc["input_ids"] += token_end["input_ids"] 57 | token_doc["attention_mask"] += token_end["attention_mask"] 58 | 59 | if example["question"] == "summary": 60 | instruction = tokenizer( 61 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Document:\n", 62 | add_special_tokens=False, 63 | ) 64 | # response = tokenizer( 65 | # f"<|im_start|>assistant\n**Answer:\n**Summary:{example['output'].strip()}\n<|im_end|>\n", 66 | # add_special_tokens=False, 67 | # ) 68 | response = f"<|im_start|>assistant\n**Answer:\n**Summary:{example['output'].strip()}\n<|im_end|>\n" 69 | else: # MRC의 경우 70 | instruction = tokenizer( 71 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Question:{example['question'].strip()}\n**Document:\n", 72 | add_special_tokens=False, 73 | ) 74 | # response = tokenizer( 75 | # f"<|im_start|>assistant\n**Answer:{example['output'].strip()}\n**Summary:\n<|im_end|>\n", 76 | # add_special_tokens=False, 77 | # ) 78 | response = f"<|im_start|>assistant\n**Answer:{example['output'].strip()}\n**Summary:\n<|im_end|>\n" 79 | sentence_position = [0] * len(instruction["input_ids"]) + sentence_position 80 | input = instruction["input_ids"] + token_doc["input_ids"] 81 | attention_mask = instruction["attention_mask"] + token_doc["attention_mask"] 82 | output = response 83 | 84 | if "supporting_num" in example.keys(): 85 | gold_sp = example["supporting_num"] 86 | else: 87 | gold_sp = None 88 | assert len(input) == len(sentence_position) == len(attention_mask) 89 | 90 | all_result.append( 91 | InferenceInput( 92 | _id=example["_id"], 93 | input_text=input, 94 | answer=output, 95 | attention_mask=attention_mask, 96 | sent_masks=sentence_position, 97 | gold_sp=gold_sp, 98 | ) 99 | ) 100 | if data_sample: 
101 | if len(all_result) == 30: 102 | break 103 | return all_result 104 | 105 | 106 | def create_batches(input_list, batch_size): 107 | # Split the input list into batches of size 'batch_size' 108 | for i in range(0, len(input_list), batch_size): 109 | yield input_list[i : i + batch_size] 110 | 111 | 112 | def generate_batch_answer(batches, tokenizer, model): 113 | for batch_num, batch in enumerate(tqdm(batches)): 114 | input_ids = [item.input_text for item in batch] 115 | attention_mask = [item.attention_mask for item in batch] 116 | sentence_masks = [item.sent_masks for item in batch] 117 | 118 | model.to("cuda") 119 | input_batch = {} 120 | max_length = max(len(mask) for mask in input_ids) 121 | padded_input_ids = [[tokenizer.pad_token_id] * (max_length - len(mask)) + mask for mask in input_ids] 122 | input_batch["input_ids"] = torch.tensor(padded_input_ids).cuda() 123 | padded_attention_mask = [[0] * (max_length - len(mask)) + mask for mask in attention_mask] 124 | input_batch["attention_mask"] = torch.tensor(padded_attention_mask).cuda() 125 | padded_sentence_masks = [[0] * (max_length - len(mask)) + mask for mask in sentence_masks] 126 | input_batch["sent_masks"] = torch.tensor(padded_sentence_masks).cuda() 127 | 128 | with torch.no_grad(): 129 | model.evidence = None 130 | model.sentence_number = None 131 | outputs = model.generate( 132 | input_ids=input_batch["input_ids"], 133 | attention_mask=input_batch["attention_mask"], 134 | sent_masks=input_batch["sent_masks"], 135 | max_new_tokens=200, 136 | ) 137 | 138 | input_text = [tokenizer.decode(input_id, skip_special_tokens=True) for i, input_id in enumerate(input_ids)] 139 | decoded_outputs = [ 140 | tokenizer.decode(output[len(input_text) :], skip_special_tokens=True) for i, output in enumerate(outputs) 141 | ] 142 | decoded_outputs_ = [tokenizer.decode(output, skip_special_tokens=True) for i, output in enumerate(outputs)] 143 | 144 | # Store the generated text back in the input objects 145 | for i, item in enumerate(batch): 146 | item.input_text = input_text 147 | item.generated_text = decoded_outputs[i] 148 | item.generated_all_answer = decoded_outputs_[i] 149 | if model.sentence_number != None: 150 | item.pred_sp = model.sentence_number[i] 151 | return batches 152 | 153 | 154 | def write_result(output_path, answer_batches, tokenizer): 155 | all_result = [] 156 | for batch_num, batch in enumerate(answer_batches): 157 | for item in batch: 158 | result = {} 159 | result["_id"] = item._id 160 | if "assistant\n" in item.generated_text: 161 | result["generated_text"] = item.generated_text.split("assistant\n")[1] 162 | elif "assistant" in item.generated_text: 163 | result["generated_text"] = item.generated_text.split("assistant")[1] 164 | else: 165 | result["generated_text"] = item.generated_text 166 | result["answer"] = item.answer 167 | result["generated_all_answer"] = item.generated_all_answer 168 | if item.gold_sp != None: 169 | result["gold_sp"] = item.gold_sp 170 | result["pred_sp"] = item.pred_sp.tolist() 171 | all_result.append(result) 172 | 173 | with open(output_path, "w", encoding="utf-8") as f: 174 | json.dump(all_result, f, ensure_ascii=False, indent=4) 175 | 176 | 177 | if __name__ == "__main__": 178 | ############################################################## 179 | # model param 추가할 내용 180 | ############################################################## 181 | parser = argparse.ArgumentParser(description="인자값을 전달받는 Python 스크립트") 182 | parser.add_argument("--base_model_path", type=str, 
default="Qwen/Qwen2.5-3B-Instruct") 183 | parser.add_argument("--train_model_path", type=str, default="model/1107_weighted_context/checkpoint-2200") 184 | parser.add_argument("--data_file", type=str, default="data/1029data/hotpot_dev_supporting.json") 185 | parser.add_argument("--beam_size", type=int, default=1) 186 | parser.add_argument("--max_dec_len", type=int, default=3) 187 | parser.add_argument("--output_dir", type=str, default="result/1107_weighted_context/hotpot_tt_2200.json") 188 | parser.add_argument("--batch_size", type=int, default=8) 189 | parser.add_argument("--data_sample", type=bool, default=True) 190 | parser.add_argument("--mrc_value", type=str, default=True) 191 | parser.add_argument("--sum_value", type=str, default=False) 192 | args = parser.parse_args() 193 | print(args) 194 | ######################################################### 195 | # 변수들 선언 196 | ######################################################### 197 | 198 | config = AutoConfig.from_pretrained(args.base_model_path) 199 | config.beam_size = args.beam_size 200 | config.max_dec_len = args.max_dec_len 201 | 202 | tokenizer, model = create_model(args.base_model_path, args.train_model_path, config) 203 | print("batch size : ", args.batch_size) 204 | 205 | with open(args.data_file, "r", encoding="utf-8") as file: 206 | dev_data = json.load(file) 207 | 208 | input_data = create_example(dev_data, tokenizer, args.data_sample, args.mrc_value, args.sum_value) 209 | 210 | # Create batches of input items 211 | batches = list(create_batches(input_data, args.batch_size)) 212 | 213 | answer_batches = generate_batch_answer(batches, tokenizer, model) 214 | #### 답변작성 215 | 216 | write_result(args.output_dir, answer_batches, tokenizer) -------------------------------------------------------------------------------- /source/inference_upper.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig 2 | 3 | import torch 4 | from tqdm import tqdm 5 | import json 6 | from peft import PeftModel, PeftConfig 7 | from datasets import Dataset 8 | 9 | from modeling_qwen2_pn_att_1107_upper import Qwen2ForCausalLM_pn, BeamSearchAttentionDecoder 10 | import argparse 11 | 12 | 13 | def create_model(base_model_path, lora_path, config): 14 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 15 | trained_model = Qwen2ForCausalLM_pn.from_pretrained(lora_path, config=config, device_map="auto") 16 | gru = BeamSearchAttentionDecoder( 17 | hidden_size=config.hidden_size, num_sent=config.max_dec_len, topk=config.beam_size 18 | ) 19 | trained_model.set_gru(gru) 20 | trained_model.config.use_cache = False 21 | tokenizer.padding_side = "left" 22 | print("LORA WEIGHT LOADING") 23 | trained_model.load_pn_model(lora_path) 24 | return tokenizer, trained_model 25 | 26 | 27 | class InferenceInput: 28 | def __init__(self, _id, input_text, answer, attention_mask, sent_masks, gold_sp): 29 | self._id = _id 30 | self.input_text = input_text 31 | self.answer = answer 32 | self.attention_mask = attention_mask 33 | self.sent_masks = sent_masks 34 | self.gold_sp = gold_sp 35 | 36 | 37 | def create_example(all_example, tokenizer, data_sample): 38 | all_result = [] 39 | for example in tqdm(all_example): 40 | example["document"] = example["document"].strip() 41 | # token 된 doc 42 | token_doc = {"input_ids": [], "attention_mask": []} 43 | # document 문장 index 44 | sentence_number = 0 45 | sentence_position = [] 46 | for i, sent in enumerate(example["sent"]): 47 
| # 0번 문장은 instruction으로 지정할 계획 48 | sent = sent.strip() 49 | token_sent = tokenizer(sent + " ", add_special_tokens=False) 50 | sentence_number += 1 # 1부터 시작 51 | sentence_position.extend([sentence_number] * len(token_sent["input_ids"])) 52 | token_doc["input_ids"] += token_sent["input_ids"] 53 | token_doc["attention_mask"] += token_sent["attention_mask"] 54 | token_end = tokenizer("<|im_end|>\n", add_special_tokens=False) 55 | sentence_position.extend([sentence_number] * len(token_end)) 56 | token_doc["input_ids"] += token_end["input_ids"] 57 | token_doc["attention_mask"] += token_end["attention_mask"] 58 | 59 | instruction = tokenizer( 60 | f"<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n**Question:{example['question'].strip()}\n**Document:\n", 61 | add_special_tokens=False, 62 | ) 63 | response = f"<|im_start|>assistant\n**Answer:{example['output'].strip()}\n<|im_end|>\n" 64 | sentence_position = [0] * len(instruction["input_ids"]) + sentence_position 65 | input = instruction["input_ids"] + token_doc["input_ids"] 66 | attention_mask = instruction["attention_mask"] + token_doc["attention_mask"] 67 | output = response 68 | 69 | if "supporting_num" in example.keys(): 70 | gold_sp = example["supporting_num"] 71 | else: 72 | gold_sp = None 73 | assert len(input) == len(sentence_position) == len(attention_mask) 74 | 75 | all_result.append( 76 | InferenceInput( 77 | _id=example["_id"], 78 | input_text=input, 79 | answer=output, 80 | attention_mask=attention_mask, 81 | sent_masks=sentence_position, 82 | gold_sp=gold_sp, 83 | ) 84 | ) 85 | if data_sample: 86 | if len(all_result) == 100: 87 | break 88 | return all_result 89 | 90 | 91 | def create_batches(input_list, batch_size): 92 | # Split the input list into batches of size 'batch_size' 93 | for i in range(0, len(input_list), batch_size): 94 | yield input_list[i : i + batch_size] 95 | 96 | 97 | def generate_batch_answer(batches, tokenizer, model): 98 | for batch_num, batch in enumerate(tqdm(batches)): 99 | input_ids = [item.input_text for item in batch] 100 | attention_mask = [item.attention_mask for item in batch] 101 | sentence_masks = [item.sent_masks for item in batch] 102 | 103 | model.to("cuda") 104 | model.eval() 105 | input_batch = {} 106 | max_length = max(len(mask) for mask in input_ids) 107 | padded_input_ids = [[tokenizer.pad_token_id] * (max_length - len(mask)) + mask for mask in input_ids] 108 | input_batch["input_ids"] = torch.tensor(padded_input_ids).cuda() 109 | padded_attention_mask = [[0] * (max_length - len(mask)) + mask for mask in attention_mask] 110 | input_batch["attention_mask"] = torch.tensor(padded_attention_mask).cuda() 111 | padded_sentence_masks = [[0] * (max_length - len(mask)) + mask for mask in sentence_masks] 112 | input_batch["sent_masks"] = torch.tensor(padded_sentence_masks).cuda() 113 | 114 | with torch.no_grad(): 115 | model.evidence = None 116 | model.sentence_number = None 117 | outputs = model.generate( 118 | input_ids=input_batch["input_ids"], 119 | attention_mask=input_batch["attention_mask"], 120 | sent_masks=input_batch["sent_masks"], 121 | max_new_tokens=50, 122 | # temperature=0.0, 123 | # do_sample=False, 124 | ) 125 | input_text = [] 126 | decoded_outputs = [] 127 | decoded_outputs_ = [] 128 | 129 | for i in range(len(input_ids)): 130 | input_text.append(tokenizer.decode(input_ids[i], skip_special_tokens=True)) 131 | trimmed_output = outputs[i][len(input_batch["input_ids"][i]) :] 132 | 
decoded_outputs.append(tokenizer.decode(trimmed_output, skip_special_tokens=True)) 133 | decoded_outputs_.append(tokenizer.decode(outputs[i], skip_special_tokens=True)) 134 | 135 | # Store the generated text back in the input objects 136 | for i, item in enumerate(batch): 137 | item.input_text = input_text 138 | item.generated_text = decoded_outputs[i] 139 | item.generated_all_answer = decoded_outputs_[i] 140 | if model.sentence_number != None: 141 | item.pred_sp = model.sentence_number[i] 142 | return batches 143 | 144 | 145 | def write_result(output_path, answer_batches, tokenizer): 146 | all_result = [] 147 | for batch_num, batch in enumerate(answer_batches): 148 | for item in batch: 149 | result = {} 150 | result["_id"] = item._id 151 | if "assistant\n" in item.generated_text: 152 | result["generated_text"] = item.generated_text.split("assistant\n")[1] 153 | elif "assistant" in item.generated_text: 154 | result["generated_text"] = item.generated_text.split("assistant")[1] 155 | else: 156 | result["generated_text"] = item.generated_text 157 | result["answer"] = item.answer 158 | result["generated_all_answer"] = item.generated_all_answer 159 | if item.gold_sp != None: 160 | result["gold_sp"] = item.gold_sp 161 | result["pred_sp"] = item.pred_sp.tolist() 162 | all_result.append(result) 163 | 164 | with open(output_path, "w", encoding="utf-8") as f: 165 | json.dump(all_result, f, ensure_ascii=False, indent=4) 166 | 167 | 168 | if __name__ == "__main__": 169 | ############################################################## 170 | # model param 추가할 내용 171 | ############################################################## 172 | parser = argparse.ArgumentParser(description="인자값을 전달받는 Python 스크립트") 173 | parser.add_argument("--base_model_path", type=str, default="Qwen/Qwen2.5-3B-Instruct") 174 | parser.add_argument("--train_model_path", type=str, default="model/1126_upper/checkpoint-15000") 175 | parser.add_argument("--data_file", type=str, default="data/1125data/hotpot_dev.json") 176 | parser.add_argument("--beam_size", type=int, default=1) 177 | parser.add_argument("--max_dec_len", type=int, default=3) 178 | parser.add_argument("--output_dir", type=str, default="result/1126_upper/15000_test.json") 179 | parser.add_argument("--batch_size", type=int, default=8) 180 | parser.add_argument("--data_sample", type=bool, default=True) 181 | 182 | args = parser.parse_args() 183 | print(args) 184 | ######################################################### 185 | # 변수들 선언 186 | ######################################################### 187 | 188 | config = AutoConfig.from_pretrained(args.base_model_path) 189 | config.beam_size = args.beam_size 190 | config.max_dec_len = args.max_dec_len 191 | 192 | tokenizer, model = create_model(args.base_model_path, args.train_model_path, config) 193 | print("batch size : ", args.batch_size) 194 | 195 | with open(args.data_file, "r", encoding="utf-8") as file: 196 | dev_data = json.load(file) 197 | 198 | input_data = create_example(dev_data, tokenizer, args.data_sample) 199 | 200 | # Create batches of input items 201 | batches = list(create_batches(input_data, args.batch_size)) 202 | 203 | answer_batches = generate_batch_answer(batches, tokenizer, model) 204 | #### 답변작성 205 | 206 | write_result(args.output_dir, answer_batches, tokenizer) 207 | -------------------------------------------------------------------------------- /source/train_mean.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from datasets 
import Dataset 4 | 5 | from transformers import ( 6 | AutoTokenizer, 7 | AutoModelForCausalLM, 8 | AutoConfig, 9 | DataCollatorForSeq2Seq, 10 | TrainingArguments, 11 | Trainer, 12 | ) 13 | 14 | from peft import LoraConfig, get_peft_model 15 | import wandb 16 | from modeling_qwen2_mean import Qwen2ForCausalLM 17 | from torch.nn import functional as F 18 | import argparse 19 | 20 | class CustomDataCollatorForSeq2Seq(DataCollatorForSeq2Seq): 21 | def __call__(self, features): 22 | # sentence_masks를 제외한 features 리스트 생성 23 | features_without_masks = [{k: v for k, v in f.items() if k != "sent_masks"} for f in features] 24 | 25 | # 부모 클래스에서 features_without_masks 처리 26 | batch = super().__call__(features_without_masks) 27 | 28 | sentence_masks = [f.get("sent_masks", None) for f in features] 29 | # sentence_masks가 None이 아닌 경우 패딩 처리 30 | if sentence_masks[0] is not None: 31 | max_length = max(len(mask) for mask in sentence_masks) 32 | padded_sentence_masks = [[0] * (max_length - len(mask)) + mask for mask in sentence_masks] 33 | batch["sent_masks"] = torch.tensor(padded_sentence_masks) 34 | 35 | return batch 36 | 37 | class CustomTrainer(Trainer): 38 | def compute_loss(self, model, inputs, return_outputs=False): 39 | # input을 원하는 대로 수정 40 | model.model.evidence = None 41 | # 모델에 수정된 inputs 전달 42 | if self.label_smoother is not None and "labels" in inputs: 43 | labels = inputs.pop("labels") 44 | else: 45 | labels = None 46 | outputs = model(**inputs) 47 | # Save past state if it exists 48 | # TODO: this needs to be fixed and made cleaner later. 49 | if self.args.past_index >= 0: 50 | self._past = outputs[self.args.past_index] 51 | 52 | if labels is not None: 53 | unwrapped_model = self.accelerator.unwrap_model(model) 54 | if self._is_peft_model(unwrapped_model): 55 | model_name = unwrapped_model.base_model.model._get_name() 56 | else: 57 | model_name = unwrapped_model._get_name() 58 | # if model_name in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): 59 | loss = self.label_smoother(outputs, labels, shift_labels=True) 60 | # else: 61 | # loss = self.label_smoother(outputs, labels) 62 | else: 63 | if isinstance(outputs, dict) and "loss" not in outputs: 64 | raise ValueError( 65 | "The model did not return a loss from the inputs, only the following keys: " 66 | f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." 67 | ) 68 | # We don't use .loss here since the model may return tuples instead of ModelOutput. 
69 | loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] # path, batch , 1742(max_sent) 70 | r_loss = loss.requires_grad_(True) 71 | # r_loss = loss.clone().detach().requires_grad_(True) 72 | return (r_loss, outputs) if return_outputs else r_loss 73 | 74 | 75 | def create_model(model_path, config): 76 | tokenizer = AutoTokenizer.from_pretrained(model_path) 77 | model = Qwen2ForCausalLM.from_pretrained(model_path, config=config, device_map="cuda") 78 | model.enable_input_require_grads() 79 | model.config.use_cache = False 80 | tokenizer.padding_side = "left" 81 | return tokenizer, model 82 | 83 | 84 | IGNORE_INDEX = -100 85 | 86 | 87 | def process_func(example, tokenizer): 88 | MAX_LENGTH = 2048 89 | input_ids, attention_mask, labels = [], [], [] 90 | mrc_value = -1 91 | sum_value = -1 92 | if example["mrc_type"] == "T": 93 | mrc_value = "True" 94 | else: 95 | mrc_value = "False" 96 | if example["sum_type"] == "T": 97 | sum_value = "True" 98 | else: 99 | sum_value = "False" 100 | 101 | task_instruction = "Only fill in the **Answer to the **Question based on the **Document if <|MRC|> is True. Do not fill in the **Answer if the Question is not provided or if <|MRC|> is False. Only fill in the **Summary with a summary of the **Document if <|SUM|> is True. Do not fill in the **Summary if <|SUM|> is False." 102 | example["document"] = example["document"].strip() 103 | # token 된 doc 104 | token_doc = {"input_ids": [], "attention_mask": []} 105 | # document 문장 index 106 | sentence_number = 0 107 | sentence_position = [] 108 | for i, sent in enumerate(example["sent"]): 109 | # 0번 문장은 instruction으로 지정할 계획 110 | sent = sent.strip() 111 | token_sent = tokenizer(sent + " ", add_special_tokens=False) 112 | sentence_number += 1 # 1부터 시작 113 | sentence_position.extend([sentence_number] * len(token_sent["input_ids"])) 114 | token_doc["input_ids"] += token_sent["input_ids"] 115 | token_doc["attention_mask"] += token_sent["attention_mask"] 116 | token_end = tokenizer("<|im_end|>\n", add_special_tokens=False) 117 | sentence_position.extend([0] * len(token_end)) 118 | token_doc["input_ids"] += token_end["input_ids"] 119 | token_doc["attention_mask"] += token_end["attention_mask"] 120 | 121 | if example["data_type"] == "answer": 122 | if example["answer_type"] == "F": 123 | if example["question"] == "no": # 질문이 없는 경우 124 | instruction = tokenizer( 125 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Document:\n", 126 | add_special_tokens=False, 127 | ) 128 | else: 129 | instruction = tokenizer( 130 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Question:{example['question'].strip()}\n**Document:\n", 131 | add_special_tokens=False, 132 | ) 133 | response = tokenizer( 134 | f"<|im_start|>assistant\n**Answer:\n**Summary:\n<|im_end|>\n", add_special_tokens=False 135 | ) 136 | else: # 답 해야하는 경우 질문은 무조건 있음 137 | instruction = tokenizer( 138 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Question:{example['question'].strip()}\n**Document:\n", 139 | add_special_tokens=False, 140 | ) 141 | response = tokenizer( 142 | f"<|im_start|>assistant\n**Answer:{example['output'].strip()}\n**Summary:\n<|im_end|>\n", 143 | add_special_tokens=False, 144 | ) 145 | elif example["data_type"] == "summary": 146 | if example["answer_type"] == "F": # 무응답의 경우 질문이 무조건 없음 147 | instruction = tokenizer( 148 | 
f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Document:\n", 149 | add_special_tokens=False, 150 | ) 151 | response = tokenizer( 152 | f"<|im_start|>assistant\n**Answer:\n**Summary:\n<|im_end|>\n", add_special_tokens=False 153 | ) 154 | else: # 답 해야하는 경우 질문 유무 155 | if example["question"] == "summary": # 질문이 없는 경우 156 | instruction = tokenizer( 157 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Document:\n", 158 | add_special_tokens=False, 159 | ) 160 | else: 161 | instruction = tokenizer( 162 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Question:{example['question'].strip()}\n**Document:\n", 163 | add_special_tokens=False, 164 | ) 165 | response = tokenizer( 166 | f"<|im_start|>assistant\n**Answer:\n**Summary:{example['output'].strip()}\n<|im_end|>\n", 167 | add_special_tokens=False, 168 | ) 169 | # instruction에 대한 문장 번호 170 | sentence_position = [0] * len(instruction["input_ids"]) + sentence_position 171 | sentence_position.extend([0] * len(response["input_ids"])) 172 | input_ids = instruction["input_ids"] + token_doc["input_ids"] + response["input_ids"] 173 | attention_mask = instruction["attention_mask"] + token_doc["attention_mask"] + response["attention_mask"] 174 | labels = [IGNORE_INDEX] * len(instruction["input_ids"] + token_doc["input_ids"]) + response["input_ids"] 175 | assert len(input_ids) == len(sentence_position) == len(attention_mask) == len(labels) 176 | 177 | if len(input_ids) > MAX_LENGTH: 178 | sentence_position = sentence_position[:MAX_LENGTH] 179 | input_ids = input_ids[:MAX_LENGTH] 180 | attention_mask = attention_mask[:MAX_LENGTH] 181 | labels = labels[:MAX_LENGTH] 182 | return { 183 | "input_ids": input_ids, 184 | "attention_mask": attention_mask, 185 | "labels": labels, 186 | "sent_masks": sentence_position, 187 | } 188 | 189 | 190 | if __name__ == "__main__": 191 | 192 | ############################################################## 193 | # model param 추가할 내용 194 | ############################################################## 195 | parser = argparse.ArgumentParser(description="인자값을 전달받는 Python 스크립트") 196 | parser.add_argument("--model_path", type=str, default="Qwen/Qwen2.5-3B-Instruct") 197 | parser.add_argument("--data_file", type=str, default="data/1022data/hotpot_cnn_6k.json") 198 | parser.add_argument("--beam_size", type=int, default=1) 199 | parser.add_argument("--max_dec_len", type=int, default=1) 200 | parser.add_argument("--new_model", type=str, default="new_model") 201 | parser.add_argument("--wandb_project", type=str, default="llm pointer network") 202 | parser.add_argument("--wandb_run_name", type=str, default="1027") 203 | parser.add_argument("--output_dir", type=str, default="qwen_lora_1026") 204 | parser.add_argument("--num_train_epochs", type=int, default=1) 205 | parser.add_argument("--batch_size", type=int, default=4) 206 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1) 207 | parser.add_argument("--data_sample", type=bool, default=True) 208 | args = parser.parse_args() 209 | print(args) 210 | ######################################################### 211 | # 변수들 선언 212 | ######################################################### 213 | model_path = args.model_path 214 | 215 | config = AutoConfig.from_pretrained(model_path) 216 | config.beam_size = args.beam_size 217 | config.max_dec_len = args.max_dec_len 218 | 219 | tokenizer, 
model = create_model(model_path, config) 220 | data_file = args.data_file 221 | print("학습 데이터 : ", data_file) 222 | dataset = Dataset.from_json(data_file) 223 | if args.data_sample: 224 | dataset = dataset.select(range(100)) 225 | processed_dataset = dataset.map(lambda example: process_func(example, tokenizer)) 226 | 227 | new_model = args.new_model 228 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 229 | peft_config = LoraConfig( 230 | target_modules=["q_proj", "v_proj", "k_proj", "o_proj"], 231 | lora_alpha=16, 232 | lora_dropout=0.1, 233 | r=8, 234 | bias="none", 235 | task_type="CAUSAL_LM", 236 | ) 237 | 238 | model = get_peft_model(model, peft_config) 239 | 240 | model.print_trainable_parameters() 241 | for name, param in model.named_parameters(): 242 | if "gru" in name: 243 | param.requires_grad = True 244 | print(f"Parameter: {name}, requires_grad: {param.requires_grad}") 245 | 246 | ############################################################## 247 | # wanb 248 | ############################################################## 249 | wandb.init(project=args.wandb_project) 250 | wandb.run.name = args.wandb_run_name 251 | 252 | ############################################################## 253 | training_params = TrainingArguments( 254 | output_dir=args.output_dir, 255 | num_train_epochs=args.num_train_epochs, 256 | per_device_train_batch_size=args.batch_size, # 수정했음 257 | gradient_accumulation_steps=args.gradient_accumulation_steps, 258 | warmup_ratio=0.1, 259 | learning_rate=1e-4, 260 | logging_steps=1, 261 | lr_scheduler_type="cosine", 262 | gradient_checkpointing=True, 263 | save_steps=1000, 264 | save_on_each_node=True, 265 | do_train=True, 266 | push_to_hub=False, 267 | report_to="wandb", 268 | ) 269 | trainer = CustomTrainer( 270 | model=model, 271 | args=training_params, 272 | train_dataset=processed_dataset, 273 | data_collator=CustomDataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True), 274 | ) 275 | trainer.train() 276 | trainer.save_model(new_model) -------------------------------------------------------------------------------- /source/train_origin.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from datasets import Dataset 4 | 5 | from transformers import ( 6 | AutoTokenizer, 7 | AutoModelForCausalLM, 8 | AutoConfig, 9 | DataCollatorForSeq2Seq, 10 | TrainingArguments, 11 | Trainer, 12 | Qwen2ForCausalLM, 13 | ) 14 | 15 | from peft import LoraConfig, get_peft_model 16 | import wandb 17 | from torch.nn import functional as F 18 | import argparse 19 | 20 | 21 | class CustomDataCollatorForSeq2Seq(DataCollatorForSeq2Seq): 22 | def __call__(self, features): 23 | # sentence_masks를 제외한 features 리스트 생성 24 | features_without_masks = [{k: v for k, v in f.items() if k != "sent_masks"} for f in features] 25 | 26 | # 부모 클래스에서 features_without_masks 처리 27 | batch = super().__call__(features_without_masks) 28 | 29 | sentence_masks = [f.get("sent_masks", None) for f in features] 30 | # sentence_masks가 None이 아닌 경우 패딩 처리 31 | if sentence_masks[0] is not None: 32 | max_length = max(len(mask) for mask in sentence_masks) 33 | padded_sentence_masks = [[0] * (max_length - len(mask)) + mask for mask in sentence_masks] 34 | batch["sent_masks"] = torch.tensor(padded_sentence_masks) 35 | 36 | return batch 37 | 38 | 39 | class CustomTrainer(Trainer): 40 | def compute_loss(self, model, inputs, return_outputs=False): 41 | # input을 원하는 대로 수정 42 | model.model.evidence = None 43 | 44 | if self.label_smoother is not None and 
"labels" in inputs: 45 | labels = inputs.pop("labels") 46 | else: 47 | labels = None 48 | outputs = model(**inputs) 49 | # Save past state if it exists 50 | # TODO: this needs to be fixed and made cleaner later. 51 | if self.args.past_index >= 0: 52 | self._past = outputs[self.args.past_index] 53 | 54 | if labels is not None: 55 | unwrapped_model = self.accelerator.unwrap_model(model) 56 | if self._is_peft_model(unwrapped_model): 57 | model_name = unwrapped_model.base_model.model._get_name() 58 | else: 59 | model_name = unwrapped_model._get_name() 60 | # if model_name in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): 61 | loss = self.label_smoother(outputs, labels, shift_labels=True) 62 | # else: 63 | # loss = self.label_smoother(outputs, labels) 64 | else: 65 | if isinstance(outputs, dict) and "loss" not in outputs: 66 | raise ValueError( 67 | "The model did not return a loss from the inputs, only the following keys: " 68 | f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." 69 | ) 70 | # We don't use .loss here since the model may return tuples instead of ModelOutput. 71 | loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] # path, batch , 1742(max_sent) 72 | 73 | r_loss = loss 74 | return (r_loss, outputs) if return_outputs else r_loss 75 | 76 | 77 | def create_model(model_path, config): 78 | tokenizer = AutoTokenizer.from_pretrained(model_path) 79 | model = Qwen2ForCausalLM.from_pretrained(model_path, config=config, device_map="cuda") 80 | model.enable_input_require_grads() 81 | model.config.use_cache = False 82 | tokenizer.padding_side = "left" 83 | return tokenizer, model 84 | 85 | 86 | IGNORE_INDEX = -100 87 | 88 | 89 | def process_func(example, tokenizer): 90 | MAX_LENGTH = 2048 91 | input_ids, attention_mask, labels = [], [], [] 92 | mrc_value = -1 93 | sum_value = -1 94 | if example["mrc_type"] == "T": 95 | mrc_value = "True" 96 | else: 97 | mrc_value = "False" 98 | if example["sum_type"] == "T": 99 | sum_value = "True" 100 | else: 101 | sum_value = "False" 102 | 103 | task_instruction = "Only fill in the **Answer to the **Question based on the **Document if <|MRC|> is True. Do not fill in the **Answer if the Question is not provided or if <|MRC|> is False. Only fill in the **Summary with a summary of the **Document if <|SUM|> is True. Do not fill in the **Summary if <|SUM|> is False." 
104 | example["document"] = example["document"].strip() 105 | # token 된 doc 106 | token_doc = {"input_ids": [], "attention_mask": []} 107 | # document 문장 index 108 | sentence_number = 0 109 | sentence_position = [] 110 | for i, sent in enumerate(example["sent"]): 111 | # 0번 문장은 instruction으로 지정할 계획 112 | sent = sent.strip() 113 | token_sent = tokenizer(sent + " ", add_special_tokens=False) 114 | sentence_number += 1 # 1부터 시작 115 | sentence_position.extend([sentence_number] * len(token_sent["input_ids"])) 116 | token_doc["input_ids"] += token_sent["input_ids"] 117 | token_doc["attention_mask"] += token_sent["attention_mask"] 118 | token_end = tokenizer("<|im_end|>\n", add_special_tokens=False) 119 | sentence_position.extend([0] * len(token_end)) 120 | token_doc["input_ids"] += token_end["input_ids"] 121 | token_doc["attention_mask"] += token_end["attention_mask"] 122 | 123 | ######################################################################################################################## 124 | # 전처리 형태 바꾸기 125 | ######################################################################################################################## 126 | if example["data_type"] == "answer": 127 | if example["answer_type"] == "F": 128 | if example["question"] == "no": # 질문이 없는 경우 129 | instruction = tokenizer( 130 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Document:\n", 131 | add_special_tokens=False, 132 | ) 133 | else: 134 | instruction = tokenizer( 135 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Question:{example['question']}\n**Document:\n", 136 | add_special_tokens=False, 137 | ) 138 | response = tokenizer( 139 | f"<|im_start|>assistant\n**Answer:\n**Summary:\n<|im_end|>\n", add_special_tokens=False 140 | ) 141 | else: # 답 해야하는 경우 질문은 무조건 있음 142 | instruction = tokenizer( 143 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Question:{example['question']}\n**Document:\n", 144 | add_special_tokens=False, 145 | ) 146 | response = tokenizer( 147 | f"<|im_start|>assistant\n**Answer:{example['output'].strip()}\n**Summary:\n<|im_end|>\n", 148 | add_special_tokens=False, 149 | ) 150 | elif example["data_type"] == "summary": 151 | if example["answer_type"] == "F": # 무응답의 경우 질문이 무조건 없음 152 | instruction = tokenizer( 153 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Document:\n", 154 | add_special_tokens=False, 155 | ) 156 | response = tokenizer( 157 | f"<|im_start|>assistant\n**Answer:\n**Summary:\n<|im_end|>\n", add_special_tokens=False 158 | ) 159 | else: # 답 해야하는 경우 질문 유무 160 | if example["question"] == "summary": # 질문이 없는 경우 161 | instruction = tokenizer( 162 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Document:\n{example['document']}<|im_end|>\n", 163 | add_special_tokens=False, 164 | ) 165 | else: 166 | instruction = tokenizer( 167 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Question:{example['question']}\n**Document:\n{example['document']}<|im_end|>\n", 168 | add_special_tokens=False, 169 | ) 170 | response = tokenizer( 171 | f"<|im_start|>assistant\n**Answer:\n**Summary:{example['output'].strip()}\n<|im_end|>\n", 172 | add_special_tokens=False, 173 | ) 174 | # instruction에 대한 문장 번호 175 | sentence_position 
= [0] * len(instruction["input_ids"]) + sentence_position 176 | sentence_position.extend([0] * len(response["input_ids"])) 177 | input_ids = instruction["input_ids"] + token_doc["input_ids"] + response["input_ids"] 178 | attention_mask = instruction["attention_mask"] + token_doc["attention_mask"] + response["attention_mask"] 179 | labels = [IGNORE_INDEX] * len(instruction["input_ids"] + token_doc["input_ids"]) + response["input_ids"] 180 | assert len(input_ids) == len(sentence_position) == len(attention_mask) == len(labels) 181 | 182 | if len(input_ids) > MAX_LENGTH: 183 | sentence_position = sentence_position[:MAX_LENGTH] 184 | input_ids = input_ids[:MAX_LENGTH] 185 | attention_mask = attention_mask[:MAX_LENGTH] 186 | labels = labels[:MAX_LENGTH] 187 | return { 188 | "input_ids": input_ids, 189 | "attention_mask": attention_mask, 190 | "labels": labels, 191 | "sent_masks": sentence_position, 192 | } 193 | 194 | 195 | if __name__ == "__main__": 196 | 197 | ############################################################## 198 | # model param 추가할 내용 199 | ############################################################## 200 | parser = argparse.ArgumentParser(description="인자값을 전달받는 Python 스크립트") 201 | parser.add_argument("--model_path", type=str, default="Qwen/Qwen2.5-3B-Instruct") 202 | parser.add_argument("--data_file", type=str, default="data/train_hotpot_cnn_1022.json") 203 | parser.add_argument("--beam_size", type=int, default=1) 204 | parser.add_argument("--max_dec_len", type=int, default=3) 205 | parser.add_argument("--new_model", type=str, default="hotpot_cnn_origin") 206 | parser.add_argument("--wandb_project", type=str, default="llm pointer network") 207 | parser.add_argument("--wandb_run_name", type=str, default="hotpot_cnn_origin") 208 | parser.add_argument("--output_dir", type=str, default="/hdd/rbqlsquf/hotpot_cnn_origin") 209 | parser.add_argument("--num_train_epochs", type=int, default=1) 210 | parser.add_argument("--batch_size", type=int, default=4) 211 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1) 212 | parser.add_argument("--data_sample", type=bool, default=False) 213 | args = parser.parse_args() 214 | print(args) 215 | ######################################################### 216 | # 변수들 선언 217 | ######################################################### 218 | model_path = args.model_path 219 | 220 | config = AutoConfig.from_pretrained(model_path) 221 | config.beam_size = args.beam_size 222 | config.max_dec_len = args.max_dec_len 223 | 224 | tokenizer, model = create_model(model_path, config) 225 | data_file = args.data_file 226 | print("학습 데이터 : ", data_file) 227 | dataset = Dataset.from_json(data_file) 228 | if args.data_sample: 229 | dataset = dataset.select(range(12)) 230 | processed_dataset = dataset.map(lambda example: process_func(example, tokenizer)) 231 | 232 | new_model = args.new_model 233 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 234 | peft_config = LoraConfig( 235 | target_modules=["q_proj", "v_proj", "k_proj", "o_proj"], 236 | lora_alpha=16, 237 | lora_dropout=0.1, 238 | r=8, 239 | bias="none", 240 | task_type="CAUSAL_LM", 241 | ) 242 | 243 | model = get_peft_model(model, peft_config) 244 | 245 | model.print_trainable_parameters() 246 | 247 | ############################################################## 248 | # wanb 249 | ############################################################## 250 | wandb.init(project=args.wandb_project) 251 | wandb.run.name = args.wandb_run_name 252 | 253 | 
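    # Hypothetical invocation of this script; the output path below is a
    # placeholder, not a directory shipped with the repository:
    #
    #   python source/train_origin.py \
    #       --model_path Qwen/Qwen2.5-3B-Instruct \
    #       --data_file data/train_hotpot_cnn_1022.json \
    #       --output_dir /path/to/checkpoints \
    #       --num_train_epochs 1 --batch_size 4 --gradient_accumulation_steps 1
    #
    # Caveat: --data_sample is declared with type=bool, so argparse turns any
    # non-empty value (even "False") into True; omit the flag entirely to
    # train on the full dataset.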
############################################################## 254 | training_params = TrainingArguments( 255 | output_dir=args.output_dir, 256 | num_train_epochs=args.num_train_epochs, 257 | per_device_train_batch_size=args.batch_size, # 수정했음 258 | gradient_accumulation_steps=args.gradient_accumulation_steps, 259 | warmup_ratio=0.1, 260 | learning_rate=1e-4, 261 | logging_steps=1, 262 | lr_scheduler_type="cosine", 263 | gradient_checkpointing=True, 264 | save_steps=200, 265 | save_on_each_node=True, 266 | do_train=True, 267 | push_to_hub=False, 268 | report_to="wandb", 269 | ) 270 | trainer = CustomTrainer( 271 | model=model, 272 | args=training_params, 273 | train_dataset=processed_dataset, 274 | data_collator=CustomDataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True), 275 | ) 276 | trainer.train() 277 | trainer.save_model(new_model) -------------------------------------------------------------------------------- /source/train_pn_noloss.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from datasets import Dataset 4 | 5 | from transformers import ( 6 | AutoTokenizer, 7 | AutoModelForCausalLM, 8 | AutoConfig, 9 | DataCollatorForSeq2Seq, 10 | TrainingArguments, 11 | Trainer, 12 | ) 13 | from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union 14 | from peft import LoraConfig, get_peft_model 15 | import wandb 16 | from modeling_qwen2_pn_att_1107_upper import Qwen2ForCausalLM_pn, BeamSearchAttentionDecoder 17 | from nltk.translate.bleu_score import sentence_bleu 18 | from torch.nn import functional as F 19 | import argparse 20 | 21 | 22 | class CustomDataCollatorForSeq2Seq(DataCollatorForSeq2Seq): 23 | def __call__(self, features): 24 | # sentence_masks를 제외한 features 리스트 생성 25 | features_without_masks = [ 26 | {k: v for k, v in f.items() if k != "sent_masks" and k != "gold_sp"} for f in features 27 | ] 28 | # 부모 클래스에서 features_without_masks 처리 29 | batch = super().__call__(features_without_masks) 30 | 31 | sentence_masks = [f.get("sent_masks", None) for f in features] 32 | gold_sp = [f.get("gold_sp", None) for f in features] 33 | # sentence_masks가 None이 아닌 경우 패딩 처리 34 | if sentence_masks[0] is not None: 35 | max_length = max(len(mask) for mask in sentence_masks) 36 | padded_sentence_masks = [[0] * (max_length - len(mask)) + mask for mask in sentence_masks] 37 | batch["sent_masks"] = torch.tensor(padded_sentence_masks) 38 | if gold_sp[0] is not None: 39 | max_length = 3 40 | padded_sentence_masks = [] 41 | for sp in gold_sp: 42 | if len(sp) > max_length: 43 | sp = sp[:max_length] 44 | # Pad if shorter than max_length 45 | padded_sp = sp + [0] * (max_length - len(sp)) 46 | padded_sentence_masks.append(padded_sp) 47 | batch["gold_sp"] = torch.tensor(padded_sentence_masks) 48 | return batch 49 | 50 | 51 | class CustomTrainer(Trainer): 52 | 53 | def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = False): 54 | super().save_model(output_dir, _internal_call) 55 | self.model.model.save_pn_model(output_dir) 56 | 57 | def compute_loss(self, model, inputs, return_outputs=False): 58 | # input을 원하는 대로 수정 59 | model.model.evidence = None 60 | 61 | if self.label_smoother is not None and "labels" in inputs: 62 | labels = inputs.pop("labels") 63 | else: 64 | labels = None 65 | outputs = model(**inputs) 66 | # Save past state if it exists 67 | # TODO: this needs to be fixed and made cleaner later. 
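        # In this "noloss" variant the pointer-network outputs (attention
        # scores, evidence sentences) are fetched below only for logging; no
        # auxiliary loss is applied to them, and the value returned is simply
        # the mean LM loss of the first decoded path, loss[0, :].mean().
        # Compare train_upper.py, which additionally trains the attention
        # scores against the gold supporting sentences (gold_sp).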
68 | if self.args.past_index >= 0: 69 | self._past = outputs[self.args.past_index] 70 | 71 | if labels is not None: 72 | unwrapped_model = self.accelerator.unwrap_model(model) 73 | if self._is_peft_model(unwrapped_model): 74 | model_name = unwrapped_model.base_model.model._get_name() 75 | else: 76 | model_name = unwrapped_model._get_name() 77 | # if model_name in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): 78 | loss = self.label_smoother(outputs, labels, shift_labels=True) 79 | # else: 80 | # loss = self.label_smoother(outputs, labels) 81 | else: 82 | if isinstance(outputs, dict) and "loss" not in outputs: 83 | raise ValueError( 84 | "The model did not return a loss from the inputs, only the following keys: " 85 | f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." 86 | ) 87 | # We don't use .loss here since the model may return tuples instead of ModelOutput. 88 | loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] # path, batch , 1742(max_sent) 89 | 90 | sampled_evidence_scores = outputs.get("attention_scores") # batch*path, 2, max_sent?? 91 | mask = outputs.get("mask") # batch, dec_len, max_sent 92 | path_logits = outputs.get("path_logits") # path, batch, max_len, 151667 93 | sampled_evidence_sentence = outputs.get("evidence_sentences") 94 | logit = torch.argmax(path_logits[0], dim=-1) 95 | 96 | decoded_outputs = [ 97 | tokenizer.decode(output[inputs["labels"][i] != -100], skip_special_tokens=True) 98 | for i, output in enumerate(logit) 99 | ] 100 | ############### 101 | print(decoded_outputs) 102 | 103 | r_loss = loss[0, :].mean() 104 | print("========================================") 105 | print(self.state.global_step) 106 | print("loss:{}".format(loss)) 107 | 108 | return (r_loss, outputs) if return_outputs else r_loss 109 | 110 | 111 | def create_model(model_path, config): 112 | tokenizer = AutoTokenizer.from_pretrained(model_path) 113 | model = Qwen2ForCausalLM_pn.from_pretrained(model_path, config=config, device_map="cuda") 114 | model.enable_input_require_grads() 115 | gru = BeamSearchAttentionDecoder( 116 | hidden_size=config.hidden_size, num_sent=config.max_dec_len, topk=config.beam_size 117 | ) 118 | model.set_gru(gru) 119 | model.config.use_cache = False 120 | tokenizer.padding_side = "left" 121 | return tokenizer, model 122 | 123 | 124 | def create_model_for_debug(base_model_path, lora_path, config): 125 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 126 | trained_model = Qwen2ForCausalLM_pn.from_pretrained(lora_path, config=config, device_map="auto") 127 | gru = BeamSearchAttentionDecoder( 128 | hidden_size=config.hidden_size, num_sent=config.max_dec_len, topk=config.beam_size 129 | ) 130 | trained_model.set_gru(gru) 131 | trained_model.config.use_cache = False 132 | tokenizer.padding_side = "left" 133 | print("LORA WEIGHT LOADING") 134 | trained_model.load_pn_model(lora_path) 135 | return tokenizer, trained_model 136 | 137 | 138 | IGNORE_INDEX = -100 139 | 140 | 141 | def process_func(example, tokenizer): 142 | MAX_LENGTH = 2048 143 | input_ids, attention_mask, labels = [], [], [] 144 | example["document"] = example["document"].strip() 145 | # token 된 doc 146 | token_doc = {"input_ids": [], "attention_mask": []} 147 | # document 문장 index 148 | sentence_number = 0 149 | sentence_position = [] 150 | for i, sent in enumerate(example["sent"]): 151 | # 0번 문장은 instruction으로 지정할 계획 152 | sent = sent.strip() 153 | token_sent = tokenizer(sent + " ", add_special_tokens=False) 154 | sentence_number += 1 # 
1부터 시작 155 | sentence_position.extend([sentence_number] * len(token_sent["input_ids"])) 156 | token_doc["input_ids"] += token_sent["input_ids"] 157 | token_doc["attention_mask"] += token_sent["attention_mask"] 158 | token_end = tokenizer("<|im_end|>\n", add_special_tokens=False) 159 | sentence_position.extend([sentence_number] * len(token_end)) 160 | token_doc["input_ids"] += token_end["input_ids"] 161 | token_doc["attention_mask"] += token_end["attention_mask"] 162 | 163 | ######################################################################################################################## 164 | # 전처리 형태 바꾸기 165 | ######################################################################################################################## 166 | instruction = tokenizer( 167 | f"<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n**Question:{example['question']}\n**Document:\n", 168 | add_special_tokens=False, 169 | ) 170 | response = tokenizer( 171 | f"<|im_start|>assistant\n**Answer:{example['output'].strip()}<|im_end|>\n", add_special_tokens=False 172 | ) 173 | 174 | # instruction에 대한 문장 번호 175 | sentence_position = [0] * len(instruction["input_ids"]) + sentence_position 176 | sentence_position.extend([0] * len(response["input_ids"])) 177 | input_ids = instruction["input_ids"] + token_doc["input_ids"] + response["input_ids"] 178 | attention_mask = instruction["attention_mask"] + token_doc["attention_mask"] + response["attention_mask"] 179 | labels = [IGNORE_INDEX] * len(instruction["input_ids"] + token_doc["input_ids"]) + response["input_ids"] 180 | assert len(input_ids) == len(sentence_position) == len(attention_mask) == len(labels) 181 | 182 | if len(input_ids) > MAX_LENGTH: 183 | sentence_position = sentence_position[:MAX_LENGTH] 184 | input_ids = input_ids[:MAX_LENGTH] 185 | attention_mask = attention_mask[:MAX_LENGTH] 186 | labels = labels[:MAX_LENGTH] 187 | return { 188 | "input_ids": input_ids, 189 | "attention_mask": attention_mask, 190 | "labels": labels, 191 | "sent_masks": sentence_position, 192 | "gold_sp": example["supporting_num"], 193 | } 194 | 195 | 196 | if __name__ == "__main__": 197 | 198 | ############################################################## 199 | # model param 추가할 내용 200 | ############################################################## 201 | parser = argparse.ArgumentParser(description="인자값을 전달받는 Python 스크립트") 202 | parser.add_argument("--model_path", type=str, default="Qwen/Qwen2.5-3B-Instruct") 203 | parser.add_argument("--data_file", type=str, default="data/1125data/hotpot_train_shuffle_30k.json") 204 | parser.add_argument("--lora_path", type=str, default="model/1124_upper/checkpoint-4400") 205 | parser.add_argument("--beam_size", type=int, default=1) 206 | parser.add_argument("--max_dec_len", type=int, default=3) 207 | parser.add_argument("--new_model", type=str, default="new_mode") 208 | parser.add_argument("--wandb_project", type=str, default="llm pointer network") 209 | parser.add_argument("--wandb_run_name", type=str, default="test") 210 | parser.add_argument("--output_dir", type=str, default="qwen_lora_1026") 211 | parser.add_argument("--num_train_epochs", type=int, default=1) 212 | parser.add_argument("--batch_size", type=int, default=2) 213 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1) 214 | parser.add_argument("--data_sample", type=bool, default=False) 215 | args = parser.parse_args() 216 | print(args) 217 | 
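    # Hypothetical invocation (data, output, and run-name values below are
    # placeholders):
    #
    #   python source/train_pn_noloss.py \
    #       --data_file data/1125data/hotpot_train_shuffle_30k.json \
    #       --output_dir /path/to/checkpoints \
    #       --wandb_run_name pn_noloss_run \
    #       --batch_size 2
    #
    # --lora_path is read only by create_model_for_debug() and is not used on
    # this training path; --data_sample has the same type=bool caveat noted in
    # train_origin.py.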
######################################################### 218 | # 변수들 선언 219 | ######################################################### 220 | model_path = args.model_path 221 | 222 | config = AutoConfig.from_pretrained(model_path) 223 | config.beam_size = args.beam_size 224 | config.max_dec_len = args.max_dec_len 225 | 226 | tokenizer, model = create_model(model_path, config) 227 | data_file = args.data_file 228 | print("학습 데이터 : ", data_file) 229 | dataset = Dataset.from_json(data_file) 230 | if args.data_sample: 231 | dataset = dataset.select(range(100)) 232 | processed_dataset = dataset.map(lambda example: process_func(example, tokenizer)) 233 | 234 | new_model = args.new_model 235 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 236 | peft_config = LoraConfig( 237 | target_modules=["q_proj", "v_proj", "k_proj", "o_proj"], 238 | lora_alpha=16, 239 | lora_dropout=0.1, 240 | r=8, 241 | bias="none", 242 | task_type="CAUSAL_LM", 243 | ) 244 | 245 | model = get_peft_model(model, peft_config) 246 | 247 | model.print_trainable_parameters() 248 | for name, param in model.named_parameters(): 249 | if "gru" in name or "linear_w1" in name: 250 | param.requires_grad = True 251 | print(f"Parameter: {name}, requires_grad: {param.requires_grad}") 252 | 253 | ############################################################## 254 | # wanb 255 | ############################################################## 256 | wandb.init(project=args.wandb_project, save_code=True) 257 | wandb.run.name = args.wandb_run_name 258 | wandb.save("modeling_qwen2_pn_att_1107_upper.py") 259 | wandb.save("modeling_qwen2_.py") 260 | ############################################################## 261 | training_params = TrainingArguments( 262 | output_dir=args.output_dir, 263 | num_train_epochs=args.num_train_epochs, 264 | per_device_train_batch_size=args.batch_size, # 수정했음 265 | gradient_accumulation_steps=args.gradient_accumulation_steps, 266 | warmup_ratio=0.1, 267 | learning_rate=1e-4, 268 | logging_steps=1, 269 | lr_scheduler_type="cosine", 270 | gradient_checkpointing=True, 271 | save_steps=200, 272 | save_on_each_node=True, 273 | do_train=True, 274 | push_to_hub=False, 275 | report_to="wandb", 276 | ) 277 | trainer = CustomTrainer( 278 | model=model, 279 | args=training_params, 280 | train_dataset=processed_dataset, 281 | data_collator=CustomDataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True), 282 | ) 283 | trainer.train() 284 | trainer.save_model(new_model) 285 | -------------------------------------------------------------------------------- /source/train_upper.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from datasets import Dataset 4 | 5 | from transformers import ( 6 | AutoTokenizer, 7 | AutoModelForCausalLM, 8 | AutoConfig, 9 | DataCollatorForSeq2Seq, 10 | TrainingArguments, 11 | Trainer, 12 | ) 13 | from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union 14 | from peft import LoraConfig, get_peft_model 15 | import wandb 16 | from modeling_qwen2_pn_att_1107_upper import Qwen2ForCausalLM_pn, BeamSearchAttentionDecoder 17 | from nltk.translate.bleu_score import sentence_bleu 18 | from torch.nn import functional as F 19 | import argparse 20 | from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss 21 | 22 | 23 | class CustomDataCollatorForSeq2Seq(DataCollatorForSeq2Seq): 24 | def __call__(self, features): 25 | # sentence_masks를 제외한 features 리스트 생성 26 | features_without_masks = [ 27 | {k: v for k, v in 
f.items() if k != "sent_masks" and k != "gold_sp"} for f in features 28 | ] 29 | # 부모 클래스에서 features_without_masks 처리 30 | batch = super().__call__(features_without_masks) 31 | 32 | sentence_masks = [f.get("sent_masks", None) for f in features] 33 | gold_sp = [f.get("gold_sp", None) for f in features] 34 | # sentence_masks가 None이 아닌 경우 패딩 처리 35 | if sentence_masks[0] is not None: 36 | max_length = max(len(mask) for mask in sentence_masks) 37 | padded_sentence_masks = [[0] * (max_length - len(mask)) + mask for mask in sentence_masks] 38 | batch["sent_masks"] = torch.tensor(padded_sentence_masks) 39 | if gold_sp[0] is not None: 40 | max_length = 3 41 | padded_sentence_masks = [] 42 | for sp in gold_sp: 43 | if len(sp) > max_length: 44 | sp = sp[:max_length] 45 | # Pad if shorter than max_length 46 | padded_sp = sp + [0] * (max_length - len(sp)) 47 | padded_sentence_masks.append(padded_sp) 48 | batch["gold_sp"] = torch.tensor(padded_sentence_masks) 49 | return batch 50 | 51 | 52 | class CustomTrainer(Trainer): 53 | 54 | def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = False): 55 | super().save_model(output_dir, _internal_call) 56 | self.model.model.save_pn_model(output_dir) 57 | 58 | def compute_loss(self, model, inputs, return_outputs=False): 59 | # input을 원하는 대로 수정 60 | model.model.evidence = None 61 | 62 | if self.label_smoother is not None and "labels" in inputs: 63 | labels = inputs.pop("labels") 64 | else: 65 | labels = None 66 | outputs = model(**inputs) 67 | # Save past state if it exists 68 | # TODO: this needs to be fixed and made cleaner later. 69 | if self.args.past_index >= 0: 70 | self._past = outputs[self.args.past_index] 71 | 72 | if labels is not None: 73 | unwrapped_model = self.accelerator.unwrap_model(model) 74 | if self._is_peft_model(unwrapped_model): 75 | model_name = unwrapped_model.base_model.model._get_name() 76 | else: 77 | model_name = unwrapped_model._get_name() 78 | # if model_name in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): 79 | loss = self.label_smoother(outputs, labels, shift_labels=True) 80 | # else: 81 | # loss = self.label_smoother(outputs, labels) 82 | else: 83 | if isinstance(outputs, dict) and "loss" not in outputs: 84 | raise ValueError( 85 | "The model did not return a loss from the inputs, only the following keys: " 86 | f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." 87 | ) 88 | # We don't use .loss here since the model may return tuples instead of ModelOutput. 89 | loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] # path, batch , 1742(max_sent) 90 | 91 | sampled_evidence_scores = outputs.get("attention_scores") # batch*path, 2, max_sent?? 
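        # "Upper bound" supervision, computed below: the pointer network's
        # attention scores over document sentences are trained directly
        # against the gold supporting-sentence indices (gold_sp) with a
        # cross-entropy loss, and that term is averaged with the LM loss of
        # the first decoded path. The bare except further down falls back to
        # the LM loss alone whenever the cross-entropy computation fails
        # (e.g., a shape mismatch between the evidence tensors and gold_sp).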
92 | mask = outputs.get("mask") # batch, dec_len, max_sent 93 | path_logits = outputs.get("path_logits") # path, batch, max_len, 151667 94 | sampled_evidence_sentence = outputs.get("evidence_sentences") 95 | logit = torch.argmax(path_logits[0], dim=-1) 96 | 97 | decoded_outputs = [ 98 | tokenizer.decode(output[inputs["labels"][i] != -100], skip_special_tokens=True) 99 | for i, output in enumerate(logit) 100 | ] 101 | ############### 102 | print(decoded_outputs) 103 | loss_fct_2 = CrossEntropyLoss() 104 | try: 105 | loss_2 = loss_fct_2( 106 | sampled_evidence_scores.view(-1, sampled_evidence_scores.size(-1)), inputs["gold_sp"].view(-1) 107 | ) 108 | r_loss = (loss[0, :].mean() + loss_2) / 2 109 | print("========================================") 110 | print(self.state.global_step) 111 | print("loss:{}".format(loss)) 112 | print("loss_mean:{}".format(loss[0, :].mean())) 113 | print("loss_2:{}".format(loss_2)) 114 | print("r_loss : {}".format(r_loss)) 115 | except: 116 | r_loss = loss[0, :].mean() 117 | print("========================================") 118 | print(self.state.global_step) 119 | print("loss:{}".format(loss)) 120 | print("loss_mean:{}".format(loss[0, :].mean())) 121 | print("loss_2:nononono") 122 | print("r_loss : {}".format(r_loss)) 123 | # # Add wandb logging for the evidence losses 124 | # # Detailed wandb logging 125 | 126 | return (r_loss, outputs) if return_outputs else r_loss 127 | 128 | 129 | def create_model(model_path, config): 130 | tokenizer = AutoTokenizer.from_pretrained(model_path) 131 | model = Qwen2ForCausalLM_pn.from_pretrained(model_path, config=config, device_map="cuda") 132 | model.enable_input_require_grads() 133 | gru = BeamSearchAttentionDecoder( 134 | hidden_size=config.hidden_size, num_sent=config.max_dec_len, topk=config.beam_size 135 | ) 136 | model.set_gru(gru) 137 | model.config.use_cache = False 138 | tokenizer.padding_side = "left" 139 | return tokenizer, model 140 | 141 | 142 | def create_model_for_debug(base_model_path, lora_path, config): 143 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 144 | trained_model = Qwen2ForCausalLM_pn.from_pretrained(lora_path, config=config, device_map="auto") 145 | gru = BeamSearchAttentionDecoder( 146 | hidden_size=config.hidden_size, num_sent=config.max_dec_len, topk=config.beam_size 147 | ) 148 | trained_model.set_gru(gru) 149 | trained_model.config.use_cache = False 150 | tokenizer.padding_side = "left" 151 | print("LORA WEIGHT LOADING") 152 | trained_model.load_pn_model(lora_path) 153 | return tokenizer, trained_model 154 | 155 | 156 | IGNORE_INDEX = -100 157 | 158 | 159 | def process_func(example, tokenizer): 160 | MAX_LENGTH = 2048 161 | input_ids, attention_mask, labels = [], [], [] 162 | example["document"] = example["document"].strip() 163 | # token 된 doc 164 | token_doc = {"input_ids": [], "attention_mask": []} 165 | # document 문장 index 166 | sentence_number = 0 167 | sentence_position = [] 168 | for i, sent in enumerate(example["sent"]): 169 | # 0번 문장은 instruction으로 지정할 계획 170 | sent = sent.strip() 171 | token_sent = tokenizer(sent + " ", add_special_tokens=False) 172 | sentence_number += 1 # 1부터 시작 173 | sentence_position.extend([sentence_number] * len(token_sent["input_ids"])) 174 | token_doc["input_ids"] += token_sent["input_ids"] 175 | token_doc["attention_mask"] += token_sent["attention_mask"] 176 | token_end = tokenizer("<|im_end|>\n", add_special_tokens=False) 177 | sentence_position.extend([sentence_number] * len(token_end)) 178 | token_doc["input_ids"] += token_end["input_ids"] 
179 | token_doc["attention_mask"] += token_end["attention_mask"] 180 | 181 | ######################################################################################################################## 182 | # 전처리 형태 바꾸기 183 | ######################################################################################################################## 184 | instruction = tokenizer( 185 | f"<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n**Question:{example['question']}\n**Document:\n", 186 | add_special_tokens=False, 187 | ) 188 | response = tokenizer( 189 | f"<|im_start|>assistant\n**Answer:{example['output'].strip()}<|im_end|>\n", add_special_tokens=False 190 | ) 191 | 192 | # instruction에 대한 문장 번호 193 | sentence_position = [0] * len(instruction["input_ids"]) + sentence_position 194 | sentence_position.extend([0] * len(response["input_ids"])) 195 | input_ids = instruction["input_ids"] + token_doc["input_ids"] + response["input_ids"] 196 | attention_mask = instruction["attention_mask"] + token_doc["attention_mask"] + response["attention_mask"] 197 | labels = [IGNORE_INDEX] * len(instruction["input_ids"] + token_doc["input_ids"]) + response["input_ids"] 198 | assert len(input_ids) == len(sentence_position) == len(attention_mask) == len(labels) 199 | 200 | if len(input_ids) > MAX_LENGTH: 201 | sentence_position = sentence_position[:MAX_LENGTH] 202 | input_ids = input_ids[:MAX_LENGTH] 203 | attention_mask = attention_mask[:MAX_LENGTH] 204 | labels = labels[:MAX_LENGTH] 205 | return { 206 | "input_ids": input_ids, 207 | "attention_mask": attention_mask, 208 | "labels": labels, 209 | "sent_masks": sentence_position, 210 | "gold_sp": example["supporting_num"], 211 | } 212 | 213 | 214 | if __name__ == "__main__": 215 | 216 | ############################################################## 217 | # model param 추가할 내용 218 | ############################################################## 219 | parser = argparse.ArgumentParser(description="인자값을 전달받는 Python 스크립트") 220 | parser.add_argument("--model_path", type=str, default="Qwen/Qwen2.5-3B-Instruct") 221 | parser.add_argument("--data_file", type=str, default="data/1125data/hotpot_train_shuffle_30k.json") 222 | parser.add_argument("--lora_path", type=str, default="model/1124_upper/checkpoint-4400") 223 | parser.add_argument("--beam_size", type=int, default=1) 224 | parser.add_argument("--max_dec_len", type=int, default=3) 225 | parser.add_argument("--new_model", type=str, default="new_mode") 226 | parser.add_argument("--wandb_project", type=str, default="llm pointer network") 227 | parser.add_argument("--wandb_run_name", type=str, default="test") 228 | parser.add_argument("--output_dir", type=str, default="qwen_lora_1026") 229 | parser.add_argument("--num_train_epochs", type=int, default=1) 230 | parser.add_argument("--batch_size", type=int, default=2) 231 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1) 232 | parser.add_argument("--data_sample", type=bool, default=False) 233 | args = parser.parse_args() 234 | print(args) 235 | ######################################################### 236 | # 변수들 선언 237 | ######################################################### 238 | model_path = args.model_path 239 | 240 | config = AutoConfig.from_pretrained(model_path) 241 | config.beam_size = args.beam_size 242 | config.max_dec_len = args.max_dec_len 243 | 244 | tokenizer, model = create_model(model_path, config) 245 | # tokenizer, model = create_model_for_debug(model_path, 
args.lora_path, config) 246 | data_file = args.data_file 247 | print("학습 데이터 : ", data_file) 248 | dataset = Dataset.from_json(data_file) 249 | if args.data_sample: 250 | dataset = dataset.select(range(100)) 251 | processed_dataset = dataset.map(lambda example: process_func(example, tokenizer)) 252 | 253 | new_model = args.new_model 254 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 255 | peft_config = LoraConfig( 256 | target_modules=["q_proj", "v_proj", "k_proj", "o_proj"], 257 | lora_alpha=16, 258 | lora_dropout=0.1, 259 | r=8, 260 | bias="none", 261 | task_type="CAUSAL_LM", 262 | ) 263 | 264 | model = get_peft_model(model, peft_config) 265 | 266 | model.print_trainable_parameters() 267 | for name, param in model.named_parameters(): 268 | if "gru" in name or "linear_w1" in name: 269 | param.requires_grad = True 270 | print(f"Parameter: {name}, requires_grad: {param.requires_grad}") 271 | 272 | ############################################################## 273 | # wanb 274 | ############################################################## 275 | wandb.init(project=args.wandb_project, save_code=True) 276 | wandb.run.name = args.wandb_run_name 277 | wandb.save("modeling_qwen2_pn_att_1107_upper.py") 278 | wandb.save("modeling_qwen2_.py") 279 | ############################################################## 280 | training_params = TrainingArguments( 281 | output_dir=args.output_dir, 282 | num_train_epochs=args.num_train_epochs, 283 | per_device_train_batch_size=args.batch_size, # 수정했음 284 | gradient_accumulation_steps=args.gradient_accumulation_steps, 285 | warmup_ratio=0.1, 286 | learning_rate=1e-4, 287 | logging_steps=1, 288 | lr_scheduler_type="cosine", 289 | gradient_checkpointing=True, 290 | save_steps=200, 291 | save_on_each_node=True, 292 | do_train=True, 293 | push_to_hub=False, 294 | report_to="wandb", 295 | ) 296 | trainer = CustomTrainer( 297 | model=model, 298 | args=training_params, 299 | train_dataset=processed_dataset, 300 | data_collator=CustomDataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True), 301 | ) 302 | trainer.train() 303 | trainer.save_model(new_model) 304 | -------------------------------------------------------------------------------- /tmp.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import json\n", 10 | "file_path = \"data/1020data/hotpot_train.json\"\n", 11 | "with open(file_path, 'r', encoding='utf-8') as f:\n", 12 | " data = json.load(f)" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 6, 18 | "metadata": {}, 19 | "outputs": [ 20 | { 21 | "data": { 22 | "text/plain": [ 23 | "90447" 24 | ] 25 | }, 26 | "execution_count": 6, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "len(data)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 7, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "all_result = []\n", 42 | "for d in data[:30000]:\n", 43 | " d[\"answer_type\"] = \"T\"\n", 44 | " d[\"mrc_type\"] = \"T\"\n", 45 | " d[\"sum_type\"] = \"F\"\n", 46 | " d[\"data_type\"] = \"answer\"\n", 47 | " d[\"_id\"] = str(d[\"_id\"])\n", 48 | " all_result.append(d)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 8, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "file_path = \"data/1020data/hotpot_30k.json\"\n", 58 | "with open(file_path, 'w', 
encoding='utf-8') as f:\n", 59 | " json.dump(all_result, f, ensure_ascii=False, indent=4)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 10, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "import json\n", 69 | "file_path = \"data/1020data/cnn_train.json\"\n", 70 | "with open(file_path, 'r', encoding='utf-8') as f:\n", 71 | " data = json.load(f)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 11, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "for d in data[:30000]:\n", 81 | " d[\"answer_type\"] = \"T\"\n", 82 | " d[\"mrc_type\"] = \"F\"\n", 83 | " d[\"sum_type\"] = \"T\"\n", 84 | " d[\"data_type\"] = \"answer\"\n", 85 | " d[\"_id\"] = str(d[\"_id\"])\n", 86 | " all_result.append(d)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 12, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "import random\n", 96 | "random.seed(100)\n", 97 | "\n", 98 | "random.shuffle(all_result)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 13, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "file_path = \"data/1022data/hotpot_cnn_6k.json\"\n", 108 | "with open(file_path, 'w', encoding='utf-8') as f:\n", 109 | " json.dump(all_result, f, ensure_ascii=False, indent=4)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [] 118 | } 119 | ], 120 | "metadata": { 121 | "kernelspec": { 122 | "display_name": ".venv", 123 | "language": "python", 124 | "name": "python3" 125 | }, 126 | "language_info": { 127 | "codemirror_mode": { 128 | "name": "ipython", 129 | "version": 3 130 | }, 131 | "file_extension": ".py", 132 | "mimetype": "text/x-python", 133 | "name": "python", 134 | "nbconvert_exporter": "python", 135 | "pygments_lexer": "ipython3", 136 | "version": "3.8.10" 137 | } 138 | }, 139 | "nbformat": 4, 140 | "nbformat_minor": 2 141 | } 142 | --------------------------------------------------------------------------------