├── LICENSE ├── README.md ├── data_processing ├── make_data_sentence.ipynb ├── make_data_set_cnn.ipynb ├── make_data_set_mrc.ipynb ├── make_data_set_multi_news.ipynb ├── make_data_wikisum.ipynb └── shuffle_for_data.ipynb ├── evaluation.ipynb ├── evaluation.py ├── evaluation_sum.py ├── inference.ipynb ├── qwen ├── qwen_inference.py ├── qwen_inference_lora.py └── qwen_train.py ├── requirements.txt ├── run.sh ├── run_train.sh ├── source ├── __pycache__ │ ├── modeling_qwen2_inf.cpython-38.pyc │ ├── modeling_qwen2_mean.cpython-38.pyc │ ├── modeling_qwen2_pn.cpython-38.pyc │ ├── modeling_qwen2_pn_2.cpython-38.pyc │ └── modeling_qwen2_pn_test.cpython-38.pyc ├── inference_1108.py ├── inference_baseline.py ├── inference_mean.py ├── inference_origin.py ├── inference_pn.py ├── inference_pn_att_1106.py ├── inference_pn_att_1106_sum.py ├── inference_pn_att_1107.py ├── inference_pn_att_1107_sum.py ├── inference_upper.py ├── modeling_qwen2_.py ├── modeling_qwen2_mean.py ├── modeling_qwen2_pn.py ├── modeling_qwen2_pn_att_1106.py ├── modeling_qwen2_pn_att_1106_lmhead.py ├── modeling_qwen2_pn_att_1106_sum.py ├── modeling_qwen2_pn_att_1107.py ├── modeling_qwen2_pn_att_1107_baseline.py ├── modeling_qwen2_pn_att_1107_sum.py ├── modeling_qwen2_pn_att_1107_upper.py ├── train_mean.py ├── train_origin.py ├── train_pn.py ├── train_pn_2step.py ├── train_pn_att.py ├── train_pn_noloss.py ├── train_pn_yesloss.py └── train_upper.py ├── test.ipynb ├── tmp.ipynb └── train.ipynb /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Kyubeen Han 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # XAI_rationale-inference-LLM -------------------------------------------------------------------------------- /data_processing/make_data_sentence.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 20, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import json\n", 10 | "file_path = \"../data/1010data/train_data_1011.json\"\n", 11 | "with open(file_path, 'r', encoding='utf-8') as f:\n", 12 | " data = json.load(f)" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 21, 18 | "metadata": {}, 19 | "outputs": [ 20 | { 21 | "name": "stderr", 22 | "output_type": "stream", 23 | "text": [ 24 | "[nltk_data] Downloading package punkt to /home/rbqlsquf2/nltk_data...\n", 25 | "[nltk_data] Package punkt is already up-to-date!\n" 26 | ] 27 | } 28 | ], 29 | "source": [ 30 | "import nltk\n", 31 | "nltk.download('punkt') # NLTK에서 사용하는 토크나이저 데이터 다운로드\n", 32 | "\n", 33 | "from nltk.tokenize import sent_tokenize\n", 34 | "\n", 35 | "def split_sentences_nltk(text):\n", 36 | " sentences = sent_tokenize(text)\n", 37 | " return sentences" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 25, 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "name": "stdout", 47 | "output_type": "stream", 48 | "text": [ 49 | "Like every year, the 2012 Emmy Awards had its fair mix of jaw-dropping surprises, predictable winners and off-the-cut moments.From Julia Louis-Drefyus‘ hilarious acceptance speech to Jon Cryer’s head-scratching win for Best Comedy Actor, Celebuzz has compiled a list of the best and worst moments from Sunday’s telecast.What will everyone be talking about on Monday?Have a look at the eight top moments in our gallery, above, then share your own thoughts on the ceremony in the comments.For a full recap of the night’s big winners, click here.During the 64th Primetime Emmy Awards, host Jimmy Kimmel urged viewers to take to Twitter and type about comedian Tracy Morgan passing out on stage, instantly sparking 25,000 tweets.\n", 50 | "\"OMG.\n" 51 | ] 52 | } 53 | ], 54 | "source": [ 55 | "text = \"Like every year, the 2012 Emmy Awards had its fair mix of jaw-dropping surprises, predictable winners and off-the-cut moments.From Julia Louis-Drefyus‘ hilarious acceptance speech to Jon Cryer’s head-scratching win for Best Comedy Actor, Celebuzz has compiled a list of the best and worst moments from Sunday’s telecast.What will everyone be talking about on Monday?Have a look at the eight top moments in our gallery, above, then share your own thoughts on the ceremony in the comments.For a full recap of the night’s big winners, click here.During the 64th Primetime Emmy Awards, host Jimmy Kimmel urged viewers to take to Twitter and type about comedian Tracy Morgan passing out on stage, instantly sparking 25,000 tweets.\\\"OMG.\"\n", 56 | "sentences = split_sentences_nltk(text)\n", 57 | "print(sentences[0])\n", 58 | "print(sentences[1])" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 26, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "def preprocess_punctuation(text):\n", 68 | " # 구두점이 반복되는 경우 하나의 구두점으로 축소\n", 69 | " text = re.sub(r'([!?])\\1+', r'\\1', text)\n", 70 | " return text\n", 71 | "\n", 72 | "def 
split_sentences_nltk_with_punctuation(text):\n", 73 | " # 구두점 전처리\n", 74 | " processed_text = preprocess_punctuation(text)\n", 75 | " # NLTK의 sent_tokenize 사용\n", 76 | " sentences = sent_tokenize(processed_text)\n", 77 | " return sentences" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 27, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "name": "stderr", 87 | "output_type": "stream", 88 | "text": [ 89 | "100%|██████████| 90000/90000 [01:07<00:00, 1328.13it/s]\n" 90 | ] 91 | } 92 | ], 93 | "source": [ 94 | "from tqdm import tqdm\n", 95 | "import re\n", 96 | "for d in tqdm(data):\n", 97 | " d[\"sent\"] = split_sentences_nltk_with_punctuation(d[\"document\"])" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 33, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "data": { 107 | "text/plain": [ 108 | "['Like every year, the 2012 Emmy Awards had its fair mix of jaw-dropping surprises, predictable winners and off-the-cut moments.From Julia Louis-Drefyus‘ hilarious acceptance speech to Jon Cryer’s head-scratching win for Best Comedy Actor, Celebuzz has compiled a list of the best and worst moments from Sunday’s telecast.What will everyone be talking about on Monday?Have a look at the eight top moments in our gallery, above, then share your own thoughts on the ceremony in the comments.For a full recap of the night’s big winners, click here.During the 64th Primetime Emmy Awards, host Jimmy Kimmel urged viewers to take to Twitter and type about comedian Tracy Morgan passing out on stage, instantly sparking 25,000 tweets.\"OMG.',\n", 109 | " 'Tracy Morgan just passed out on stage at the #Emmys.',\n", 110 | " 'Turn ABC on right now.\"',\n", 111 | " '~ @jimmykimmel, telling viewers what to tweet.',\n", 112 | " '— Brian A. Hernandez (@BAHjournalist) September 24, 2012Oh my God - Tracy Morgan just passed out onstage at the Emmys - turn ABC on NOW!\"',\n", 113 | " \"— Jimmy Kimmel (@jimmykimmel) September 24, 2012Of course, Morgan didn't really pass out, but he did lie on the stage.\",\n", 114 | " 'Morgan even stayed on his back when the award for Best Writing for a Drama was announced and the Homeland winners got on stage.',\n", 115 | " 'Kimmel, who had teased about the prank all week, pulled the stunt to attract more viewers to tune into ABC.Within minutes, \"OMG Tracy Morgan\" was a worldwide trending topic on Twitter:And \"Omg Tracy Morgan\" is trending worldwide.',\n", 116 | " \"#Emmys — Brian A. Hernandez (@BAHjournalist) September 24, 2012And Twitter's TV account shared these statistics:OMG 25,000 tweets instantly thanks to OMG TRacy Morgan #Emmys — thanks @jimmykimmel !\",\n", 117 | " '!',\n", 118 | " '— Twitter TV (@twittertv) September 24, 2012Not everyone enjoyed the stunt, though, as actor Omar Epps notes:Well..',\n", 119 | " \"The consensus on my timeline is ya'll aren't feeling the Tracey Morgan stunt.. 
Point taken tweeps!\",\n", 120 | " \"#Emmys — OMAR EPPS (@omarepps) September 24, 2012Tracy Morgan #Emmys stunt like that #Survivor who lied about Granny's death for pity vote: got them far but big turn off if you fell for it.\",\n", 121 | " '— Jose m iniguez (@imJmi) September 24, 2012Photo from Twitter user @MSquareENt, GIF from Daily Dot and thumbnail from Art Streiber/NBC']" 122 | ] 123 | }, 124 | "execution_count": 33, 125 | "metadata": {}, 126 | "output_type": "execute_result" 127 | } 128 | ], 129 | "source": [ 130 | "sent_tokenize(data[0][\"document\"])" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 34, 136 | "metadata": {}, 137 | "outputs": [ 138 | { 139 | "data": { 140 | "text/plain": [ 141 | "['Like every year, the 2012 Emmy Awards had its fair mix of jaw-dropping surprises, predictable winners and off-the-cut moments.From Julia Louis-Drefyus‘ hilarious acceptance speech to Jon Cryer’s head-scratching win for Best Comedy Actor, Celebuzz has compiled a list of the best and worst moments from Sunday’s telecast.What will everyone be talking about on Monday?Have a look at the eight top moments in our gallery, above, then share your own thoughts on the ceremony in the comments.For a full recap of the night’s big winners, click here.During the 64th Primetime Emmy Awards, host Jimmy Kimmel urged viewers to take to Twitter and type about comedian Tracy Morgan passing out on stage, instantly sparking 25,000 tweets.\"OMG.',\n", 142 | " 'Tracy Morgan just passed out on stage at the #Emmys.',\n", 143 | " 'Turn ABC on right now.\"',\n", 144 | " '~ @jimmykimmel, telling viewers what to tweet.',\n", 145 | " '— Brian A. Hernandez (@BAHjournalist) September 24, 2012Oh my God - Tracy Morgan just passed out onstage at the Emmys - turn ABC on NOW!\"',\n", 146 | " \"— Jimmy Kimmel (@jimmykimmel) September 24, 2012Of course, Morgan didn't really pass out, but he did lie on the stage.\",\n", 147 | " 'Morgan even stayed on his back when the award for Best Writing for a Drama was announced and the Homeland winners got on stage.',\n", 148 | " 'Kimmel, who had teased about the prank all week, pulled the stunt to attract more viewers to tune into ABC.Within minutes, \"OMG Tracy Morgan\" was a worldwide trending topic on Twitter:And \"Omg Tracy Morgan\" is trending worldwide.',\n", 149 | " \"#Emmys — Brian A. Hernandez (@BAHjournalist) September 24, 2012And Twitter's TV account shared these statistics:OMG 25,000 tweets instantly thanks to OMG TRacy Morgan #Emmys — thanks @jimmykimmel !\",\n", 150 | " '— Twitter TV (@twittertv) September 24, 2012Not everyone enjoyed the stunt, though, as actor Omar Epps notes:Well..',\n", 151 | " \"The consensus on my timeline is ya'll aren't feeling the Tracey Morgan stunt.. 
Point taken tweeps!\",\n", 152 | " \"#Emmys — OMAR EPPS (@omarepps) September 24, 2012Tracy Morgan #Emmys stunt like that #Survivor who lied about Granny's death for pity vote: got them far but big turn off if you fell for it.\",\n", 153 | " '— Jose m iniguez (@imJmi) September 24, 2012Photo from Twitter user @MSquareENt, GIF from Daily Dot and thumbnail from Art Streiber/NBC']" 154 | ] 155 | }, 156 | "execution_count": 34, 157 | "metadata": {}, 158 | "output_type": "execute_result" 159 | } 160 | ], 161 | "source": [ 162 | "\n", 163 | "sent_tokenize(preprocess_punctuation(data[0][\"document\"]))" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 28, 169 | "metadata": {}, 170 | "outputs": [ 171 | { 172 | "data": { 173 | "text/plain": [ 174 | "['Like every year, the 2012 Emmy Awards had its fair mix of jaw-dropping surprises, predictable winners and off-the-cut moments.From Julia Louis-Drefyus‘ hilarious acceptance speech to Jon Cryer’s head-scratching win for Best Comedy Actor, Celebuzz has compiled a list of the best and worst moments from Sunday’s telecast.What will everyone be talking about on Monday?Have a look at the eight top moments in our gallery, above, then share your own thoughts on the ceremony in the comments.For a full recap of the night’s big winners, click here.During the 64th Primetime Emmy Awards, host Jimmy Kimmel urged viewers to take to Twitter and type about comedian Tracy Morgan passing out on stage, instantly sparking 25,000 tweets.\"OMG.',\n", 175 | " 'Tracy Morgan just passed out on stage at the #Emmys.',\n", 176 | " 'Turn ABC on right now.\"',\n", 177 | " '~ @jimmykimmel, telling viewers what to tweet.',\n", 178 | " '— Brian A. Hernandez (@BAHjournalist) September 24, 2012Oh my God - Tracy Morgan just passed out onstage at the Emmys - turn ABC on NOW!\"',\n", 179 | " \"— Jimmy Kimmel (@jimmykimmel) September 24, 2012Of course, Morgan didn't really pass out, but he did lie on the stage.\",\n", 180 | " 'Morgan even stayed on his back when the award for Best Writing for a Drama was announced and the Homeland winners got on stage.',\n", 181 | " 'Kimmel, who had teased about the prank all week, pulled the stunt to attract more viewers to tune into ABC.Within minutes, \"OMG Tracy Morgan\" was a worldwide trending topic on Twitter:And \"Omg Tracy Morgan\" is trending worldwide.',\n", 182 | " \"#Emmys — Brian A. Hernandez (@BAHjournalist) September 24, 2012And Twitter's TV account shared these statistics:OMG 25,000 tweets instantly thanks to OMG TRacy Morgan #Emmys — thanks @jimmykimmel !\",\n", 183 | " '— Twitter TV (@twittertv) September 24, 2012Not everyone enjoyed the stunt, though, as actor Omar Epps notes:Well..',\n", 184 | " \"The consensus on my timeline is ya'll aren't feeling the Tracey Morgan stunt.. 
Point taken tweeps!\",\n", 185 | " \"#Emmys — OMAR EPPS (@omarepps) September 24, 2012Tracy Morgan #Emmys stunt like that #Survivor who lied about Granny's death for pity vote: got them far but big turn off if you fell for it.\",\n", 186 | " '— Jose m iniguez (@imJmi) September 24, 2012Photo from Twitter user @MSquareENt, GIF from Daily Dot and thumbnail from Art Streiber/NBC']" 187 | ] 188 | }, 189 | "execution_count": 28, 190 | "metadata": {}, 191 | "output_type": "execute_result" 192 | } 193 | ], 194 | "source": [ 195 | "data[0][\"sent\"]" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 19, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "file_path = \"../data/1017data/train_sent.json\"\n", 205 | "with open(file_path, 'w', encoding='utf-8') as f:\n", 206 | " json.dump(data, f, ensure_ascii=False, indent=4)" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [] 215 | } 216 | ], 217 | "metadata": { 218 | "kernelspec": { 219 | "display_name": ".venv", 220 | "language": "python", 221 | "name": "python3" 222 | }, 223 | "language_info": { 224 | "codemirror_mode": { 225 | "name": "ipython", 226 | "version": 3 227 | }, 228 | "file_extension": ".py", 229 | "mimetype": "text/x-python", 230 | "name": "python", 231 | "nbconvert_exporter": "python", 232 | "pygments_lexer": "ipython3", 233 | "version": "3.8.10" 234 | } 235 | }, 236 | "nbformat": 4, 237 | "nbformat_minor": 2 238 | } 239 | -------------------------------------------------------------------------------- /data_processing/make_data_set_cnn.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/home/rbqlsquf2/workspace/XAI_rationale-inference-LLM/.venv/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", 13 | " from .autonotebook import tqdm as notebook_tqdm\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "from datasets import load_dataset\n", 19 | "\n", 20 | "ds = load_dataset(\"abisee/cnn_dailymail\", \"3.0.0\")" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 7, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "data": { 30 | "text/plain": [ 31 | "'42c027e4ff9730fbb3de84c1af0d2c506e41c3e4'" 32 | ] 33 | }, 34 | "execution_count": 7, 35 | "metadata": {}, 36 | "output_type": "execute_result" 37 | } 38 | ], 39 | "source": [ 40 | "ds[\"train\"][0][\"id\"]" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 6, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "name": "stderr", 50 | "output_type": "stream", 51 | "text": [ 52 | "[nltk_data] Downloading package punkt_tab to\n", 53 | "[nltk_data] /home/rbqlsquf2/nltk_data...\n", 54 | "[nltk_data] Package punkt_tab is already up-to-date!\n" 55 | ] 56 | } 57 | ], 58 | "source": [ 59 | "import nltk\n", 60 | "nltk.download('punkt_tab') # Download the necessary tokenizer data\n", 61 | "from nltk.tokenize import sent_tokenize\n", 62 | "\n", 63 | "def split_into_sentences(text):\n", 64 | " sentences = sent_tokenize(text)\n", 65 | " return sentences\n" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 7, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "data": { 75 | "text/plain": [ 76 | "DatasetDict({\n", 77 | " train: Dataset({\n", 78 | " features: ['article', 'highlights', 'id'],\n", 79 | " num_rows: 287113\n", 80 | " })\n", 81 | " validation: Dataset({\n", 82 | " features: ['article', 'highlights', 'id'],\n", 83 | " num_rows: 13368\n", 84 | " })\n", 85 | " test: Dataset({\n", 86 | " features: ['article', 'highlights', 'id'],\n", 87 | " num_rows: 11490\n", 88 | " })\n", 89 | "})" 90 | ] 91 | }, 92 | "execution_count": 7, 93 | "metadata": {}, 94 | "output_type": "execute_result" 95 | } 96 | ], 97 | "source": [ 98 | "ds" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 8, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "from tqdm import tqdm\n", 108 | "\n", 109 | "def create_example(all_data):\n", 110 | " all_result = []\n", 111 | " for i, data in enumerate(tqdm(all_data)):\n", 112 | " data_id = data[\"id\"]\n", 113 | " summary = data[\"highlights\"].replace(\"\\n\", \" \")\n", 114 | " context = split_into_sentences(data[\"article\"])\n", 115 | " \n", 116 | " result = {}\n", 117 | " result[\"_id\"] = data_id\n", 118 | " result[\"question\"] = \"summary\"\n", 119 | " result[\"document\"] = data[\"article\"]\n", 120 | " result[\"sent\"] = context\n", 121 | " result[\"output\"] = summary\n", 122 | " \n", 123 | " all_result.append(result)\n", 124 | "\n", 125 | " return all_result\n", 126 | " " 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 9, 132 | "metadata": {}, 133 | "outputs": [ 134 | { 135 | "name": "stderr", 136 | "output_type": "stream", 137 | "text": [ 138 | "100%|██████████| 287113/287113 [03:00<00:00, 1591.72it/s]\n" 139 | ] 140 | } 141 | ], 142 | "source": [ 143 | "import json\n", 144 | "\n", 145 | "input_data = create_example(ds['train'])" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 10, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "with open(\"../data/1020data/cnn_train.json\", \"w\", encoding=\"utf-8\") as f:\n", 155 | " json.dump(input_data, f, ensure_ascii=False, indent=4)" 
156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 17, 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "name": "stderr", 165 | "output_type": "stream", 166 | "text": [ 167 | "100%|██████████| 13368/13368 [00:22<00:00, 582.69it/s]\n" 168 | ] 169 | } 170 | ], 171 | "source": [ 172 | "from tqdm import tqdm\n", 173 | "all_len = []\n", 174 | "all_result = []\n", 175 | "\n", 176 | "for input_data_ in tqdm(input_data):\n", 177 | " text = input_data_[\"text\"]\n", 178 | " if len(tokenizer(text)[\"input_ids\"]) <= 2048:\n", 179 | " # data[\"text\"] = data[\"text\"]\n", 180 | " all_result.append(input_data_)\n", 181 | " # all_len.append(len(tokenizer(text)[\"input_ids\"]))\n", 182 | "\n", 183 | "with open(\"../data/qwen_cnn_test_data.json\", \"w\", encoding=\"utf-8\") as f:\n", 184 | " json.dump(all_result, f, ensure_ascii=False, indent=4)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 11, 190 | "metadata": {}, 191 | "outputs": [ 192 | { 193 | "name": "stderr", 194 | "output_type": "stream", 195 | "text": [ 196 | "100%|██████████| 11490/11490 [00:07<00:00, 1619.94it/s]\n", 197 | "100%|██████████| 11490/11490 [00:23<00:00, 498.68it/s]" 198 | ] 199 | }, 200 | { 201 | "name": "stdout", 202 | "output_type": "stream", 203 | "text": [ 204 | "759\n" 205 | ] 206 | }, 207 | { 208 | "name": "stderr", 209 | "output_type": "stream", 210 | "text": [ 211 | "\n" 212 | ] 213 | } 214 | ], 215 | "source": [ 216 | "input_data = create_example(ds['test'])\n", 217 | "\n", 218 | "\n", 219 | "all_len = []\n", 220 | "all_result = []\n", 221 | "over_num = 0\n", 222 | "for input_data_ in tqdm(input_data):\n", 223 | " text = input_data_[\"all_text\"]\n", 224 | " count = len(tokenizer(text)[\"input_ids\"])\n", 225 | " if count <= 2048:\n", 226 | " all_result.append(input_data_)\n", 227 | " else:\n", 228 | " over_len = count - 2048\n", 229 | " input_data_['text'] = input_data_['text'][:over_len]\n", 230 | " over_num +=1\n", 231 | " # all_len.append(len(tokenizer(text)[\"input_ids\"]))\n", 232 | "print(over_num)" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 13, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "with open(\"../data/qwen_cnn_test_data.json\", \"w\", encoding=\"utf-8\") as f:\n", 242 | " json.dump(all_result, f, ensure_ascii=False, indent=4)" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": {}, 249 | "outputs": [], 250 | "source": [] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": null, 255 | "metadata": {}, 256 | "outputs": [], 257 | "source": [] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 38, 262 | "metadata": {}, 263 | "outputs": [], 264 | "source": [ 265 | "for result in all_result:\n", 266 | " result[\"label\"] = \"assistant\\n\" + result[\"label\"]\n", 267 | " " 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": 39, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "with open(\"data/qwen_dev_data.json\", \"w\", encoding=\"utf-8\") as f:\n", 277 | " json.dump(all_result, f, ensure_ascii=False, indent=4)" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 14, 283 | "metadata": {}, 284 | "outputs": [ 285 | { 286 | "name": "stderr", 287 | "output_type": "stream", 288 | "text": [ 289 | "7405it [00:17, 426.88it/s]\n" 290 | ] 291 | } 292 | ], 293 | "source": [ 294 | "from tqdm import tqdm\n", 295 | "all_len = []\n", 296 | "all_result = 
[]\n", 297 | "\n", 298 | "for data, input_data_ in tqdm(zip(dev_data, input_data)):\n", 299 | " text = input_data_[\"text\"]\n", 300 | " if len(tokenizer(text)[\"input_ids\"]) <= 2048:\n", 301 | " # data[\"text\"] = data[\"text\"]\n", 302 | " all_result.append(data)\n", 303 | " # all_len.append(len(tokenizer(text)[\"input_ids\"]))" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 16, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "with open(\"data/teddst_dev.json\", \"w\", encoding=\"utf-8\") as f:\n", 313 | " json.dump(input_data, f, ensure_ascii=False, indent=4)" 314 | ] 315 | }, 316 | { 317 | "cell_type": "code", 318 | "execution_count": 7, 319 | "metadata": {}, 320 | "outputs": [ 321 | { 322 | "name": "stdout", 323 | "output_type": "stream", 324 | "text": [ 325 | "**Answer**: yes\n", 326 | "**Supporting Sentences**: [4] Scott Derrickson (born July 16, 1966) is an American director, screenwriter and producer.\n", 327 | "[17] Edward Davis Wood Jr. (October 10, 1924 – December 10, 1978) was an American filmmaker, actor, writer, producer, and director.\n", 328 | "\n" 329 | ] 330 | } 331 | ], 332 | "source": [ 333 | "print(input_data[0][\"label\"])" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 16, 339 | "metadata": {}, 340 | "outputs": [ 341 | { 342 | "name": "stdout", 343 | "output_type": "stream", 344 | "text": [ 345 | "6539\n" 346 | ] 347 | } 348 | ], 349 | "source": [ 350 | "count = len(list(filter(lambda x: x < 2048, all_len)))\n", 351 | "print(count)" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [] 360 | } 361 | ], 362 | "metadata": { 363 | "kernelspec": { 364 | "display_name": ".venv", 365 | "language": "python", 366 | "name": "python3" 367 | }, 368 | "language_info": { 369 | "codemirror_mode": { 370 | "name": "ipython", 371 | "version": 3 372 | }, 373 | "file_extension": ".py", 374 | "mimetype": "text/x-python", 375 | "name": "python", 376 | "nbconvert_exporter": "python", 377 | "pygments_lexer": "ipython3", 378 | "version": "3.8.10" 379 | } 380 | }, 381 | "nbformat": 4, 382 | "nbformat_minor": 2 383 | } -------------------------------------------------------------------------------- /data_processing/make_data_wikisum.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/home/rbqlsquf2/workspace/XAI_rationale-inference-LLM/.venv/lib/python3.8/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", 13 | " from .autonotebook import tqdm as notebook_tqdm\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "from datasets import load_dataset\n", 19 | "\n", 20 | "ds = load_dataset(\"d0rj/wikisum\")" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "data": { 30 | "text/plain": [ 31 | "DatasetDict({\n", 32 | " train: Dataset({\n", 33 | " features: ['url', 'title', 'summary', 'article', 'step_headers'],\n", 34 | " num_rows: 35775\n", 35 | " })\n", 36 | " validation: Dataset({\n", 37 | " features: ['url', 'title', 'summary', 'article', 'step_headers'],\n", 38 | " num_rows: 2000\n", 39 | " })\n", 40 | " test: Dataset({\n", 41 | " features: ['url', 'title', 'summary', 'article', 'step_headers'],\n", 42 | " num_rows: 2000\n", 43 | " })\n", 44 | "})" 45 | ] 46 | }, 47 | "execution_count": 2, 48 | "metadata": {}, 49 | "output_type": "execute_result" 50 | } 51 | ], 52 | "source": [ 53 | "ds" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 3, 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "name": "stdout", 63 | "output_type": "stream", 64 | "text": [ 65 | "Do not shuck or wash your oysters. Oysters taste best when you shuck them immediately before eating them. In addition, keeping oysters in their shells makes them easier to store and reduces the chance that they'll go bad. If your oysters came pre-shucked in a plastic container, store them in the freezer until you're ready to use them. Leave the grit and dirt on the oysters. This will keep them moist and will help to insulate the meat. Pour ice into a small bowl or other open-top container. Grab a bowl, small cooler, or similar container that you can place inside your fridge. Make sure this container has an open top or removable lid. Then, pour a layer of ice into the bottom of the container. Do not keep your oysters in a sealed or closed-top container. Doing so will suffocate them. You may need to change your ice during the refrigeration process, so do not pour any into the container if you won't be able to check your oysters regularly. Place your oysters on top of the ice bed deep side down. Just like seafood merchants, you'll be storing your oysters on ice to keep them as chilled and fresh as possible. Make sure to turn each of your oysters so that the deeper side faces down, a technique that will help them better retain their juices. Dampen a towel with cold water and place it on top of the oysters. Dip a thin, clean kitchen towel in cold water and ring out the excess liquid. Then, gently lay the towel on top of the oysters. This will keep the oysters from drying out while preventing fresh water poisoning. If you'd prefer, you can cover the oysters with damp paper towels or newspaper instead. Oysters are salt water creatures, so submerging them in fresh water will essentially poison them and lead to their death. Place your container in a refrigerator. If possible, set your refrigerator to a temperature between 35 and 40 °F (2 and 4 °C). Make sure to store your oysters above any raw meat so the juices don't drip down onto your shellfish. If possible, check on your oysters at least once a day while they're in the fridge. If the towel dries out, dampen it again. If the ice in your container melts, pour it out and replace it with new ice. Keep your oysters in the fridge for up to 2 days. For safety, remove and consume your oysters within about 2 days of initially storing them. 
Though some oysters may last for a week or longer, eating them that late puts you at greater risk of food poisoning and other unwanted ailments. If your oysters came with an expiration date, use that as your guide for maximum storage time. Freeze your oysters if you need to store them for more than 2 days. Shuck the oysters when you’re ready to eat them. Once you finish storing the oysters, run them under cool water and open their shells. Then, run a knife under the flat side of the oyster and pop the shell off. Before eating, carefully separate the oyster from the rest of the shell using a knife. Before eating an oyster, inspect it to make sure it is still good. If the shell appears to be damaged, if the oyster smells foul, or if the meat is a cloudy shade of grey, brown, black, or pink, throw the oyster away. Keep the oysters in their shells and rinse them off. Storing your oysters inside their shells will make them less likely to go bad and, in some cases, better preserve their taste. Unlike refrigerating oysters, rinsing the shells under cold water to clean them off prevents any bacteria from living on the oysters. If you don't have enough room in your freezer to keep full-shelled oysters, you can shuck them before storage. If you do so, save the internal liquor for later use. Place your oysters in a freezer-safe container. To keep your oysters safe, place them inside a moisture-resistant, freezer-safe bag. If you're storing shucked oysters, you can use a firm plastic container instead. To prevent freezer burns, leave no more than 0.5 in (1.3 cm) of head space in the container. Pour oyster liquor into the container if you’re freezing shucked oysters. To help your shucked oysters retain their juiciness, pour the liquor you removed during the shucking process into your freezer-safe container. Keep pouring until you've completely submerged the oysters inside the liquid. If you don't have enough liquor to fill the container, pour in water as well. Seal the container. If you're using a resealable bag, press any excess air out of it using your fingers. Then, seal your container right before you put it into the freezer. Unlike with refrigerated oysters, closing the container will help better preserve your shellfish during long-term storage. If you're using a solid plastic container, make sure the lid you seal it with is air-tight. Make sure to write the initial storage date on your container. Keep your oysters in the freezer for up to 3 months. When frozen properly, fresh oysters should last for between 2 and 3 months. To make sure your oysters aren't going bad, look over them regularly and remove any that have cracked shells or cloudy meat that is a pink, black, brown, or grey color. While your oysters may remain safe to eat during this time, the taste will degrade gradually. Thaw your oysters in the fridge before consuming. Carefully take your oyster container out of the freezer and place it in a clear, open part of your refrigerator. Depending on the exact temperature of your appliances, the thawing process could take up to 20 hours to complete. Thawing your oysters using this method gives them a slightly longer shelf life, meaning you don't have to use them immediately after they thaw. If you'd like, you can thaw your oysters by submerging their container in cold water. However, you'll have to consume them immediately after they thaw, otherwise they'll go bad. 
\n" 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "print(ds['train'][0]['article'])" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 4, 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "name": "stderr", 80 | "output_type": "stream", 81 | "text": [ 82 | "[nltk_data] Downloading package punkt_tab to\n", 83 | "[nltk_data] /home/rbqlsquf2/nltk_data...\n", 84 | "[nltk_data] Package punkt_tab is already up-to-date!\n" 85 | ] 86 | } 87 | ], 88 | "source": [ 89 | "import nltk\n", 90 | "nltk.download('punkt_tab') # Download the necessary tokenizer data\n", 91 | "from nltk.tokenize import sent_tokenize\n", 92 | "\n", 93 | "def split_into_sentences(text):\n", 94 | " sentences = sent_tokenize(text)\n", 95 | " return sentences" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 5, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "from tqdm import tqdm\n", 105 | "def create_example(all_data):\n", 106 | " all_result = []\n", 107 | " data_id = 0\n", 108 | " for data in tqdm(all_data):\n", 109 | " context = split_into_sentences(data[\"article\"])\n", 110 | " result = {}\n", 111 | " result[\"_id\"] = data_id\n", 112 | " result[\"question\"] = \"summary\"\n", 113 | " result[\"document\"] = data['article']\n", 114 | " result[\"sent\"] = context\n", 115 | " result[\"output\"] = data[\"summary\"]\n", 116 | " all_result.append(result)\n", 117 | " return all_result" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 9, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "name": "stderr", 127 | "output_type": "stream", 128 | "text": [ 129 | "100%|██████████| 2000/2000 [00:02<00:00, 678.30it/s]\n" 130 | ] 131 | } 132 | ], 133 | "source": [ 134 | "input_data = create_example(ds['validation'])" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 10, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "import json\n", 144 | "with open(\"../data/1017data/wikisum_dev.json\", \"w\", encoding=\"utf-8\") as f:\n", 145 | " json.dump(input_data, f, ensure_ascii=False, indent=4)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [] 154 | } 155 | ], 156 | "metadata": { 157 | "kernelspec": { 158 | "display_name": ".venv", 159 | "language": "python", 160 | "name": "python3" 161 | }, 162 | "language_info": { 163 | "codemirror_mode": { 164 | "name": "ipython", 165 | "version": 3 166 | }, 167 | "file_extension": ".py", 168 | "mimetype": "text/x-python", 169 | "name": "python", 170 | "nbconvert_exporter": "python", 171 | "pygments_lexer": "ipython3", 172 | "version": "3.8.10" 173 | } 174 | }, 175 | "nbformat": 4, 176 | "nbformat_minor": 2 177 | } 178 | -------------------------------------------------------------------------------- /evaluation.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def normalize_answer(s): 5 | """간단한 토큰화와 정규화""" 6 | s = s.lower() # 소문자 변환 7 | s = re.sub(r"\b(a|an|the)\b", " ", s) # 불필요한 관사 제거 8 | s = re.sub(r"[^a-z0-9]", " ", s) # 알파벳과 숫자 외 제거 9 | return " ".join(s.split()) # 공백 정리 10 | 11 | 12 | def exact_match_score(prediction, ground_truth): 13 | """예측 답과 실제 답 간의 EM 점수 계산""" 14 | return int(normalize_answer(prediction) == normalize_answer(ground_truth)) 15 | 16 | 17 | def f1_score_hotpot(prediction, ground_truth): 18 | """예측 답과 실제 답 간의 F1 점수 계산""" 19 | pred_tokens = normalize_answer(prediction).split() 20 | gt_tokens = 
normalize_answer(ground_truth).split() 21 | 22 | common_tokens = set(pred_tokens) & set(gt_tokens) 23 | num_common = len(common_tokens) 24 | 25 | if num_common == 0: 26 | return 0 27 | 28 | precision = num_common / len(pred_tokens) 29 | recall = num_common / len(gt_tokens) 30 | 31 | f1 = 2 * (precision * recall) / (precision + recall) 32 | return f1 33 | 34 | 35 | def evaluate_supporting_facts(gold_sp, pred_sp): 36 | """Supporting facts에 대한 EM, Precision, Recall, F1 점수를 계산하는 함수""" 37 | # 단일 정수를 리스트로 변환 38 | gold_sp = [gold_sp] if isinstance(gold_sp, int) else gold_sp 39 | pred_sp = [pred_sp] if isinstance(pred_sp, int) else pred_sp 40 | 41 | # 예측과 정답 집합으로 변환 42 | gold_set = set(gold_sp) 43 | pred_set = set(pred_sp) 44 | 45 | # True Positives 계산 46 | tp = len(gold_set & pred_set) 47 | 48 | # Precision, Recall 계산 49 | precision = tp / len(pred_set) if pred_set else 0 50 | recall = tp / len(gold_set) if gold_set else 0 51 | 52 | # F1 점수 계산 53 | f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0 54 | 55 | # Exact Match 계산 56 | em = 1 if gold_set == pred_set else 0 57 | 58 | return em, precision, recall, f1 59 | 60 | 61 | import json 62 | 63 | file_path = "data/1125data/hotpot_dev.json" 64 | with open(file_path, "r", encoding="utf-8") as f: 65 | dev_data = json.load(f) 66 | 67 | 68 | for i in range(150, 151, 2): 69 | f_name = f"result/1127_baseline/all_{i}00.json" 70 | 71 | with open(f_name, "r", encoding="utf-8") as file: 72 | test_data = json.load(file) 73 | score = [] 74 | all_em_score = [] 75 | all_precision_score = [] 76 | all_recall_score = [] 77 | all_f1_score = [] 78 | result_f1 = [] 79 | result_em = [] 80 | ignore = 0 81 | gold_sp_option = False 82 | for dev, data in zip(dev_data, test_data): 83 | assert dev["_id"] == data["_id"] 84 | predict = "" 85 | answer = ( 86 | data["answer"] 87 | .replace("**Answer:", "") 88 | .replace("<|im_start|>assistant", "") 89 | .replace("<|im_end|>", "") 90 | .strip() 91 | ) 92 | generated_text = data["generated_text"].replace("**Answer:", "").strip() 93 | if answer == "yes": 94 | if answer in generated_text.lower() and "no" not in generated_text.lower(): 95 | generated_text = "yes" 96 | else: 97 | generated_text = "" 98 | elif answer == "no": 99 | if answer in generated_text.lower() and "yes" not in generated_text.lower(): 100 | generated_text = "no" 101 | else: 102 | generated_text = "" 103 | answer = answer.strip() 104 | predict = generated_text.strip() 105 | print(answer) 106 | print(generated_text) 107 | print("==========================") 108 | result_f1.append(f1_score_hotpot(answer, predict)) 109 | result_em.append(exact_match_score(predict, answer)) 110 | ################################################ 111 | if "gold_sp" in data.keys(): 112 | gold_sp_option = True 113 | gold_sp = data["gold_sp"] 114 | # pred_sp = data["pred_sp"] 115 | pred_sp = [x for x in data["pred_sp"] if x != 0] 116 | em, precision, recall, f1 = evaluate_supporting_facts(gold_sp, pred_sp) 117 | all_em_score.append(em) 118 | all_precision_score.append(precision) 119 | all_recall_score.append(recall) 120 | all_f1_score.append(f1) 121 | 122 | for i in pred_sp: 123 | if answer == "yes" or answer == "no": 124 | ignore += 1 125 | break 126 | if predict in dev["sent"][i - 1]: 127 | score.append(dev["_id"]) 128 | # print(answer) 129 | # print(generated_text) 130 | # print(dev["sent"][i-1]) 131 | # print("================") 132 | break 133 | 134 | # F1 점수와 EM 점수 출력 135 | print(f_name) 136 | print("F1 점수: ", sum(result_f1) / len(result_f1)) 137 | 
print("EM 점수: ", sum(result_em) / len(result_em)) 138 | if gold_sp_option: 139 | # F1 점수와 EM 점수 출력 140 | print("all_em_score 점수: ", sum(all_em_score) / len(all_em_score)) 141 | print("all_f1_score 점수: ", sum(all_f1_score) / len(all_f1_score)) 142 | print("all_precision_score 점수: ", sum(all_precision_score) / len(all_precision_score)) 143 | print("all_recall_score 점수: ", sum(all_recall_score) / len(all_recall_score)) 144 | print("=================================================") 145 | print(len(result_em)) 146 | -------------------------------------------------------------------------------- /evaluation_sum.py: -------------------------------------------------------------------------------- 1 | import json 2 | from nltk.translate.bleu_score import sentence_bleu 3 | from nltk.tokenize import word_tokenize 4 | 5 | 6 | def calculate_bleu(reference, candidate): 7 | # 문장을 토큰화 8 | reference_tokens = [word_tokenize(reference.lower())] 9 | candidate_tokens = word_tokenize(candidate.lower()) 10 | 11 | # BLEU 점수 계산 (1-gram부터 4-gram까지의 누적 점수) 12 | weights = (1, 0, 0, 0) # unigram에만 가중치 부여 13 | return sentence_bleu(reference_tokens, candidate_tokens, weights=weights) 14 | 15 | for i in [1,3,7]: 16 | file_path = f"result/1106_weighted_sum/hotpot_ft_{i}000.json" 17 | with open(file_path, "r", encoding="utf-8") as file: 18 | dev_data = json.load(file) 19 | 20 | bleu_scores = [] 21 | for dev in dev_data: 22 | predict = "" 23 | answer = dev["answer"].split("**Summary:")[1].replace("\n<|im_end|>", "").strip() 24 | if "**Summary:" in dev["generated_text"]: 25 | predict = dev["generated_text"].split("**Summary:")[1].strip() 26 | else: 27 | predict = dev["generated_text"] 28 | 29 | print(answer) 30 | print("--") 31 | print(predict) 32 | print("=============") 33 | 34 | bleu_score = calculate_bleu(answer, predict) 35 | bleu_scores.append(bleu_score) 36 | 37 | # 평균 BLEU 점수 계산 38 | average_bleu = sum(bleu_scores) / len(bleu_scores) 39 | print(file_path) 40 | print(f"Average BLEU score: {average_bleu:.4f}") 41 | -------------------------------------------------------------------------------- /qwen/qwen_inference.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModelForCausalLM 2 | import torch 3 | from tqdm import tqdm 4 | import json 5 | from peft import PeftModel, PeftConfig 6 | from datasets import Dataset 7 | 8 | 9 | def create_model(base_model_path, lora_path): 10 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 11 | base_model = AutoModelForCausalLM.from_pretrained(base_model_path, device_map="auto") 12 | new_special_tokens = {"additional_special_tokens": ["<|mrc|>", "<|summary|>"]} 13 | tokenizer.add_special_tokens(new_special_tokens) 14 | base_model.resize_token_embeddings(len(tokenizer)) 15 | base_model.config.use_cache = False 16 | tokenizer.padding_side = "left" 17 | peft_model = PeftModel.from_pretrained(base_model, lora_path) 18 | return tokenizer, peft_model 19 | 20 | 21 | class InferenceInput: 22 | def __init__(self, _id, input_text, answer): 23 | self._id = _id 24 | self.input_text = input_text 25 | self.answer = answer 26 | 27 | 28 | def create_example(all_example, tokenizer): 29 | all_result = [] 30 | for example in tqdm(all_example): 31 | if example["question"] == "summary": 32 | messages = [ 33 | {"role": "system", "content": "<|MRC|>True<|SUM|>True"}, 34 | {"role": "user", "content": f"{example['document']}"}, 35 | ] 36 | else: # MRC의 경우 37 | # messages = [{"role": "system", "content": 
"<|MRC|>False<|SUM|>True"}, {"role": "user", "content": f"**Question:{example['question']}\n{example['document']}"}] 38 | messages = [ 39 | {"role": "system", "content": "<|MRC|>True<|SUM|>False"}, 40 | {"role": "user", "content": f"{example['document']}"}, 41 | ] 42 | result = {} 43 | result["input"] = tokenizer.apply_chat_template(messages, tokenize=False) 44 | result["output"] = example["output"] 45 | all_result.append(InferenceInput(_id=example["_id"], input_text=result["input"], answer=result["output"])) 46 | if len(all_result) == 20: 47 | break 48 | return all_result 49 | 50 | 51 | def create_batches(input_list, batch_size): 52 | # Split the input list into batches of size 'batch_size' 53 | for i in range(0, len(input_list), batch_size): 54 | yield input_list[i : i + batch_size] 55 | 56 | 57 | def generate_batch_answer(batches, tokenizer, model): 58 | for batch_num, batch in enumerate(tqdm(batches)): 59 | batch_texts = [item.input_text for item in batch] 60 | inputs = tokenizer( 61 | batch_texts, # Tokenized texts after applying chat template 62 | return_tensors="pt", # Return in tensor format 63 | padding=True, # Pad sequences to the same length 64 | ).to("cuda") 65 | model.to("cuda") 66 | with torch.no_grad(): 67 | outputs = model.generate( 68 | **inputs, 69 | max_new_tokens=512, 70 | ) 71 | 72 | decoded_outputs = [ 73 | tokenizer.decode(output[len(inputs[i]) :], skip_special_tokens=True) for i, output in enumerate(outputs) 74 | ] 75 | decoded_outputs_ = [tokenizer.decode(output, skip_special_tokens=True) for i, output in enumerate(outputs)] 76 | 77 | # Store the generated text back in the input objects 78 | for i, item in enumerate(batch): 79 | item.generated_text = decoded_outputs[i] 80 | item.generated_all_answer = decoded_outputs_[i] 81 | return batches 82 | 83 | 84 | def write_result(output_path): 85 | all_result = [] 86 | for batch_num, batch in enumerate(answer_batches): 87 | for item in batch: 88 | result = {} 89 | result["_id"] = item._id 90 | result["input_text"] = item.input_text 91 | if "assistant" in item.generated_text: 92 | result["generated_text"] = item.generated_text.split("assistant\n")[1] 93 | else: 94 | result["generated_text"] = item.generated_text 95 | result["answer"] = item.answer 96 | result["generated_all_answer"] = item.generated_all_answer 97 | all_result.append(result) 98 | 99 | with open(output_path, "w", encoding="utf-8") as f: 100 | json.dump(all_result, f, ensure_ascii=False, indent=4) 101 | 102 | 103 | if __name__ == "__main__": 104 | base_model_path = "Qwen/Qwen2.5-3B-Instruct" 105 | model_path = "lora_tuning_1010" 106 | tokenizer, model = create_model(base_model_path, model_path) 107 | 108 | file_path = "data/hotpot_dev.json" 109 | batch_size = 16 110 | print(batch_size) 111 | 112 | with open(file_path, "r", encoding="utf-8") as file: 113 | dev_data = json.load(file) 114 | 115 | input_data = create_example(dev_data, tokenizer) 116 | 117 | # Create batches of input items 118 | batches = list(create_batches(input_data, batch_size)) 119 | 120 | answer_batches = generate_batch_answer(batches, tokenizer, model) 121 | #### 답변작성 122 | output_path = "output/1010/hotpot_no_q_tf.json" 123 | write_result(output_path) 124 | -------------------------------------------------------------------------------- /qwen/qwen_inference_lora.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModelForCausalLM 2 | import torch 3 | from tqdm import tqdm 4 | import json 5 | from peft import 
PeftModel, PeftConfig 6 | from datasets import Dataset 7 | 8 | 9 | def create_model(base_model_path, lora_path): 10 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 11 | base_model = AutoModelForCausalLM.from_pretrained(base_model_path, device_map="auto") 12 | new_special_tokens = {"additional_special_tokens": ["<|mrc|>", "<|summary|>"]} 13 | tokenizer.add_special_tokens(new_special_tokens) 14 | base_model.resize_token_embeddings(len(tokenizer)) 15 | base_model.config.use_cache = False 16 | tokenizer.padding_side = "left" 17 | peft_model = PeftModel.from_pretrained(base_model, lora_path) 18 | return tokenizer, peft_model 19 | 20 | 21 | class InferenceInput: 22 | def __init__(self, _id, input_text, answer): 23 | self._id = _id 24 | self.input_text = input_text 25 | self.answer = answer 26 | 27 | 28 | def create_example(all_example, tokenizer): 29 | all_result = [] 30 | task_instruction = "Only fill in the **Answer to the **Question based on the **Document if <|MRC|> is True. Do not fill in the **Answer if the Question is not provided or if <|MRC|> is False. Only fill in the **Summary with a summary of the **Document if <|SUM|> is True. Do not fill in the **Summary if <|SUM|> is False." 31 | for example in tqdm(all_example): 32 | if example["question"] == "summary": 33 | messages = [ 34 | {"role": "system", "content": f"{task_instruction}\n<|MRC|>True<|SUM|>True"}, 35 | {"role": "user", "content": f"{example['document']}"}, 36 | ] 37 | else: # MRC의 경우 38 | messages = [ 39 | # { 40 | # "role": "system", 41 | # "content": f"<|MRC|>True<|SUM|>True", 42 | # }, 43 | {"role": "user", "content": f"**Question:{example['question']}\n{example['document']}"}, 44 | ] 45 | 46 | result = {} 47 | result["input"] = tokenizer.apply_chat_template(messages, tokenize=False) 48 | result["output"] = example["output"] 49 | all_result.append(InferenceInput(_id=example["_id"], input_text=result["input"], answer=result["output"])) 50 | if len(all_result) == 20: 51 | break 52 | return all_result 53 | 54 | 55 | def create_batches(input_list, batch_size): 56 | # Split the input list into batches of size 'batch_size' 57 | for i in range(0, len(input_list), batch_size): 58 | yield input_list[i : i + batch_size] 59 | 60 | 61 | def generate_batch_answer(batches, tokenizer, model): 62 | for batch_num, batch in enumerate(tqdm(batches)): 63 | batch_texts = [item.input_text for item in batch] 64 | inputs = tokenizer( 65 | batch_texts, # Tokenized texts after applying chat template 66 | return_tensors="pt", # Return in tensor format 67 | padding=True, # Pad sequences to the same length 68 | ).to("cuda") 69 | model.to("cuda") 70 | with torch.no_grad(): 71 | outputs = model.generate( 72 | **inputs, 73 | max_new_tokens=512, 74 | ) 75 | 76 | decoded_outputs = [ 77 | tokenizer.decode(output[len(inputs[i]) :], skip_special_tokens=True) for i, output in enumerate(outputs) 78 | ] 79 | decoded_outputs_ = [tokenizer.decode(output, skip_special_tokens=True) for i, output in enumerate(outputs)] 80 | 81 | # Store the generated text back in the input objects 82 | for i, item in enumerate(batch): 83 | item.generated_text = decoded_outputs[i] 84 | item.generated_all_answer = decoded_outputs_[i] 85 | return batches 86 | 87 | 88 | def write_result(output_path): 89 | all_result = [] 90 | for batch_num, batch in enumerate(answer_batches): 91 | for item in batch: 92 | result = {} 93 | result["_id"] = item._id 94 | result["input_text"] = item.input_text 95 | if "assistant" in item.generated_text: 96 | result["generated_text"] = 
item.generated_text.split("assistant\n")[1] 97 | else: 98 | result["generated_text"] = item.generated_text 99 | result["answer"] = item.answer 100 | result["generated_all_answer"] = item.generated_all_answer 101 | all_result.append(result) 102 | 103 | with open(output_path, "w", encoding="utf-8") as f: 104 | json.dump(all_result, f, ensure_ascii=False, indent=4) 105 | 106 | 107 | if __name__ == "__main__": 108 | base_model_path = "Qwen/Qwen2.5-3B-Instruct" 109 | model_path = "model/origin/checkpoint-3000" 110 | tokenizer, model = create_model(base_model_path, model_path) 111 | 112 | file_path = "data/1008data/hotpot_dev.json" 113 | batch_size = 16 114 | print(batch_size) 115 | 116 | with open(file_path, "r", encoding="utf-8") as file: 117 | dev_data = json.load(file) 118 | 119 | input_data = create_example(dev_data, tokenizer) 120 | 121 | # Create batches of input items 122 | batches = list(create_batches(input_data, batch_size)) 123 | 124 | answer_batches = generate_batch_answer(batches, tokenizer, model) 125 | #### 답변작성 126 | output_path = "result/orrigin/test_3000.json" 127 | write_result(output_path) 128 | -------------------------------------------------------------------------------- /qwen/qwen_train.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from datasets import Dataset 4 | 5 | from transformers import ( 6 | AutoTokenizer, 7 | AutoModelForCausalLM, 8 | DataCollatorForSeq2Seq, 9 | TrainingArguments, 10 | Trainer, 11 | GenerationConfig, 12 | ) 13 | 14 | from peft import LoraConfig, get_peft_model 15 | from trl import SFTTrainer 16 | from torch.cuda.amp import autocast, GradScaler 17 | import wandb 18 | 19 | 20 | def create_model(model_path): 21 | tokenizer = AutoTokenizer.from_pretrained(model_path) 22 | model = AutoModelForCausalLM.from_pretrained(model_path, device_map="cuda") 23 | new_special_tokens = {"additional_special_tokens": ["<|mrc|>", "<|summary|>"]} 24 | tokenizer.add_special_tokens(new_special_tokens) 25 | model.resize_token_embeddings(len(tokenizer)) 26 | model.enable_input_require_grads() 27 | model.config.use_cache = False 28 | tokenizer.padding_side = "left" 29 | return tokenizer, model 30 | 31 | 32 | IGNORE_INDEX = -100 33 | 34 | 35 | def process_func(example, tokenizer): 36 | MAX_LENGTH = 2048 37 | input_ids, attention_mask, labels = [], [], [] 38 | mrc_value = -1 39 | sum_value = -1 40 | if example["mrc_type"] == "T": 41 | mrc_value = "True" 42 | else: 43 | mrc_value = "False" 44 | if example["sum_type"] == "T": 45 | sum_value = "True" 46 | else: 47 | sum_value = "False" 48 | 49 | example["document"] = example["document"].strip() 50 | ##############다시 51 | task_instruction = "Only fill in the **Answer to the **Question based on the **Document if <|MRC|> is True. Do not fill in the **Answer if the Question is not provided or if <|MRC|> is False. Only fill in the **Summary with a summary of the **Document if <|SUM|> is True. Do not fill in the **Summary if <|SUM|> is False." 
52 | if example["data_type"] == "answer": 53 | if example["answer_type"] == "F": 54 | if example["question"] == "no": # 질문이 없는 경우 55 | instruction = tokenizer( 56 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Document:\n{example['document']}<|im_end|>\n", 57 | add_special_tokens=False, 58 | ) 59 | else: 60 | instruction = tokenizer( 61 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Question:{example['question']}\n**Document:\n{example['document']}<|im_end|>\n", 62 | add_special_tokens=False, 63 | ) 64 | response = tokenizer( 65 | f"<|im_start|>assistant\n**Answer:\n**Summary:\n<|im_end|>\n", add_special_tokens=False 66 | ) 67 | else: # 답 해야하는 경우 질문은 무조건 있음 68 | instruction = tokenizer( 69 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Question:{example['question']}\n**Document:\n{example['document']}<|im_end|>\n", 70 | add_special_tokens=False, 71 | ) 72 | response = tokenizer( 73 | f"<|im_start|>assistant\n**Answer:{example['output']}\n**Summary:\n<|im_end|>\n", 74 | add_special_tokens=False, 75 | ) 76 | elif example["data_type"] == "summary": 77 | if example["answer_type"] == "F": # 무응답의 경우 질문이 무조건 없음 78 | instruction = tokenizer( 79 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Document:\n{example['document']}<|im_end|>\n", 80 | add_special_tokens=False, 81 | ) 82 | response = tokenizer( 83 | f"<|im_start|>assistant\n**Answer:\n**Summary:\n<|im_end|>\n", add_special_tokens=False 84 | ) 85 | else: # 답 해야하는 경우 질문 유무 86 | if example["question"] == "summary": # 질문이 없는 경우 87 | instruction = tokenizer( 88 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Document:\n{example['document']}<|im_end|>\n", 89 | add_special_tokens=False, 90 | ) 91 | else: 92 | instruction = tokenizer( 93 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Question:{example['question']}\n**Document:\n{example['document']}<|im_end|>\n", 94 | add_special_tokens=False, 95 | ) 96 | response = tokenizer( 97 | f"<|im_start|>assistant\n**Answer:\n**Summary:{example['output']}\n<|im_end|>\n", 98 | add_special_tokens=False, 99 | ) 100 | 101 | input_ids = instruction["input_ids"] + response["input_ids"] 102 | attention_mask = instruction["attention_mask"] + response["attention_mask"] 103 | labels = [IGNORE_INDEX] * len(instruction["input_ids"]) + response["input_ids"] 104 | if len(input_ids) > MAX_LENGTH: 105 | input_ids = input_ids[:MAX_LENGTH] 106 | attention_mask = attention_mask[:MAX_LENGTH] 107 | labels = labels[:MAX_LENGTH] 108 | return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels} 109 | 110 | 111 | if __name__ == "__main__": 112 | 113 | model_path = "Qwen/Qwen2.5-3B-Instruct" 114 | tokenizer, model = create_model(model_path) 115 | data_file = "data/train_data_1011.json" 116 | 117 | dataset = Dataset.from_json(data_file) 118 | 119 | processed_dataset = dataset.map(lambda example: process_func(example, tokenizer)) 120 | 121 | new_model = "qwen_lora_inst" 122 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 123 | peft_config = LoraConfig( 124 | target_modules=["q_proj", "v_proj", "k_proj", "o_proj"], 125 | lora_alpha=16, 126 | lora_dropout=0.1, 127 | r=8, 128 | bias="none", 129 | task_type="CAUSAL_LM", 130 | ) 131 | 
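    # get_peft_model wraps the base Qwen2.5 model with the LoRA adapters configured above: rank-8
    # update matrices are injected into the q/k/v/o attention projections, and only those adapter
    # weights receive gradients. The original model weights (including the token embeddings resized
    # for <|mrc|>/<|summary|>) stay frozen unless listed in modules_to_save.
    # print_trainable_parameters below reports how small the trainable fraction is.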
132 | model = get_peft_model(model, peft_config) 133 | 134 | model.print_trainable_parameters() 135 | for name, param in model.named_parameters(): 136 | print(f"Parameter: {name}, requires_grad: {param.requires_grad}") 137 | wandb.init(project="qwen llm lora") 138 | training_params = TrainingArguments( 139 | output_dir="/hdd/rbqlsquf/qwen_lora_1015", 140 | num_train_epochs=1, 141 | per_device_train_batch_size=4, 142 | gradient_accumulation_steps=2, 143 | warmup_ratio=0.1, 144 | learning_rate=1e-4, 145 | logging_steps=10, 146 | run_name="qwen lora", 147 | lr_scheduler_type="cosine", 148 | gradient_checkpointing=True, 149 | save_steps=1000, 150 | save_on_each_node=True, 151 | do_train=True, 152 | push_to_hub=False, 153 | report_to="wandb", 154 | ) 155 | trainer = Trainer( 156 | model=model, 157 | args=training_params, 158 | train_dataset=processed_dataset, 159 | data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True), 160 | ) 161 | trainer.train() 162 | trainer.save_model(new_model) 163 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | absl-py==2.1.0 2 | accelerate==0.29.3 3 | aiohappyeyeballs==2.4.0 4 | aiohttp==3.10.6 5 | aiosignal==1.3.1 6 | asttokens==2.4.1 7 | async-timeout==4.0.3 8 | attrs==24.2.0 9 | backcall==0.2.0 10 | bitsandbytes==0.43.1 11 | certifi==2024.8.30 12 | charset-normalizer==3.3.2 13 | click==8.1.7 14 | comm==0.2.2 15 | contourpy==1.1.1 16 | cycler==0.12.1 17 | datasets==2.19.0 18 | debugpy==1.8.6 19 | decorator==5.1.1 20 | dill==0.3.8 21 | docker-pycreds==0.4.0 22 | docstring_parser==0.16 23 | eval_type_backport==0.2.0 24 | executing==2.1.0 25 | filelock==3.16.1 26 | fonttools==4.54.1 27 | frozenlist==1.4.1 28 | fsspec==2024.3.1 29 | gitdb==4.0.11 30 | GitPython==3.1.43 31 | huggingface-hub==0.25.1 32 | idna==3.10 33 | importlib_metadata==8.5.0 34 | importlib_resources==6.4.5 35 | inquirerpy==0.3.4 36 | ipykernel==6.29.5 37 | ipython==8.12.3 38 | jedi==0.19.1 39 | Jinja2==3.1.4 40 | joblib==1.4.2 41 | jupyter_client==8.6.3 42 | jupyter_core==5.7.2 43 | kiwisolver==1.4.7 44 | markdown-it-py==3.0.0 45 | MarkupSafe==2.1.5 46 | matplotlib==3.7.5 47 | matplotlib-inline==0.1.7 48 | mdurl==0.1.2 49 | mpmath==1.3.0 50 | multidict==6.1.0 51 | multiprocess==0.70.16 52 | nest-asyncio==1.6.0 53 | networkx==3.1 54 | nltk==3.9.1 55 | numpy==1.24.4 56 | nvidia-cublas-cu12==12.1.3.1 57 | nvidia-cuda-cupti-cu12==12.1.105 58 | nvidia-cuda-nvrtc-cu12==12.1.105 59 | nvidia-cuda-runtime-cu12==12.1.105 60 | nvidia-cudnn-cu12==8.9.2.26 61 | nvidia-cufft-cu12==11.0.2.54 62 | nvidia-curand-cu12==10.3.2.106 63 | nvidia-cusolver-cu12==11.4.5.107 64 | nvidia-cusparse-cu12==12.1.0.106 65 | nvidia-nccl-cu12==2.18.1 66 | nvidia-nvjitlink-cu12==12.6.68 67 | nvidia-nvtx-cu12==12.1.105 68 | packaging==24.1 69 | pandas==2.0.3 70 | parso==0.8.4 71 | peft==0.10.0 72 | pexpect==4.9.0 73 | pfzy==0.3.4 74 | pickleshare==0.7.5 75 | pillow==10.4.0 76 | platformdirs==4.3.6 77 | prompt_toolkit==3.0.47 78 | protobuf==5.28.2 79 | psutil==6.0.0 80 | ptyprocess==0.7.0 81 | pure_eval==0.2.3 82 | pyarrow==17.0.0 83 | pyarrow-hotfix==0.6 84 | Pygments==2.18.0 85 | pyparsing==3.1.4 86 | python-dateutil==2.9.0.post0 87 | pytz==2024.2 88 | PyYAML==6.0.2 89 | pyzmq==26.2.0 90 | regex==2024.9.11 91 | requests==2.32.3 92 | rich==13.8.1 93 | rouge_score==0.1.2 94 | safetensors==0.4.5 95 | scikit-learn==1.3.2 96 | scipy==1.10.1 97 | seaborn==0.13.2 98 | sentry-sdk==2.14.0 99 | 
setproctitle==1.3.3 100 | shtab==1.7.1 101 | six==1.16.0 102 | smmap==5.0.1 103 | stack-data==0.6.3 104 | sympy==1.13.3 105 | threadpoolctl==3.5.0 106 | tokenizers==0.19.1 107 | torch==2.1.0 108 | tornado==6.4.1 109 | tqdm==4.66.5 110 | traitlets==5.14.3 111 | transformers==4.43.3 112 | triton==2.1.0 113 | trl==0.8.6 114 | typing_extensions==4.12.2 115 | tyro==0.8.11 116 | tzdata==2024.2 117 | urllib3==2.2.3 118 | wandb==0.17.7 119 | wcwidth==0.2.13 120 | xxhash==3.5.0 121 | yarl==1.12.1 122 | zipp==3.20.2 123 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | PYTHON_PATH="/home/rbqlsquf2/workspace/XAI_rationale-inference-LLM/.venv/bin/python" 2 | 3 | BASE_DIR="/home/rbqlsquf2/workspace/XAI_rationale-inference-LLM" 4 | MODEL_DIR="model/1127_baseline" 5 | OUTPUT_DIR="$BASE_DIR/result/1127_baseline" 6 | SOURCE_DIR="$BASE_DIR/source" 7 | SCRIPT_NAME="inference_baseline.py" 8 | # for i in {54..44..-2}; do 9 | # checkpoint=$((i * 100)) 10 | # $PYTHON_PATH $SOURCE_DIR/$SCRIPT_NAME \ 11 | # --train_model_path "$MODEL_DIR/checkpoint-$checkpoint" \ 12 | # --output_dir "$OUTPUT_DIR/$checkpoint.json" 13 | # done 14 | 15 | # for i in {62..68..2}; do 16 | # checkpoint=$((i * 100)) 17 | # $PYTHON_PATH $SOURCE_DIR/$SCRIPT_NAME \ 18 | # --train_model_path "$MODEL_DIR/checkpoint-$checkpoint" \ 19 | # --output_dir "$OUTPUT_DIR/$checkpoint.json" 20 | # done 21 | 22 | 23 | for i in 150; do 24 | checkpoint=$((i * 100)) 25 | $PYTHON_PATH $SOURCE_DIR/$SCRIPT_NAME \ 26 | --train_model_path "$MODEL_DIR/checkpoint-$checkpoint" \ 27 | --output_dir "$OUTPUT_DIR/all_$checkpoint.json" 28 | done 29 | -------------------------------------------------------------------------------- /run_train.sh: -------------------------------------------------------------------------------- 1 | PYTHON_PATH="/home/rbqlsquf2/workspace/XAI_rationale-inference-LLM/.venv/bin/python" 2 | 3 | BASE_DIR="/home/rbqlsquf2/workspace/XAI_rationale-inference-LLM" 4 | MODEL_DIR="$BASE_DIR/model" 5 | OUTPUT_DIR="$BASE_DIR/result" 6 | SOURCE_DIR="$BASE_DIR/source" 7 | 8 | 9 | $PYTHON_PATH $SOURCE_DIR/train_pn_yesloss.py \ 10 | --new_model 1210_pn_yesloss \ 11 | --output_dir model/1210_pn_yesloss \ 12 | --num_train_epochs 1 \ 13 | --batch_size 2 \ 14 | --beam_size 1 \ 15 | --gradient_accumulation_steps 1 \ 16 | --wandb_run_name 1210_pn_yesloss 17 | -------------------------------------------------------------------------------- /source/__pycache__/modeling_qwen2_inf.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rbqlsquf/XAI_rationale-inference-LLM/c50b9fead5901d809b7cab9451fa84aeb38bd2df/source/__pycache__/modeling_qwen2_inf.cpython-38.pyc -------------------------------------------------------------------------------- /source/__pycache__/modeling_qwen2_mean.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rbqlsquf/XAI_rationale-inference-LLM/c50b9fead5901d809b7cab9451fa84aeb38bd2df/source/__pycache__/modeling_qwen2_mean.cpython-38.pyc -------------------------------------------------------------------------------- /source/__pycache__/modeling_qwen2_pn.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rbqlsquf/XAI_rationale-inference-LLM/c50b9fead5901d809b7cab9451fa84aeb38bd2df/source/__pycache__/modeling_qwen2_pn.cpython-38.pyc -------------------------------------------------------------------------------- /source/__pycache__/modeling_qwen2_pn_2.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rbqlsquf/XAI_rationale-inference-LLM/c50b9fead5901d809b7cab9451fa84aeb38bd2df/source/__pycache__/modeling_qwen2_pn_2.cpython-38.pyc -------------------------------------------------------------------------------- /source/__pycache__/modeling_qwen2_pn_test.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rbqlsquf/XAI_rationale-inference-LLM/c50b9fead5901d809b7cab9451fa84aeb38bd2df/source/__pycache__/modeling_qwen2_pn_test.cpython-38.pyc -------------------------------------------------------------------------------- /source/inference_1108.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig 2 | 3 | import torch 4 | from tqdm import tqdm 5 | import json 6 | from peft import PeftModel, PeftConfig 7 | from datasets import Dataset 8 | 9 | from modeling_qwen2_pn_att_1107 import Qwen2ForCausalLM_pn, BeamSearchAttentionDecoder 10 | import argparse 11 | 12 | 13 | def create_model(base_model_path, lora_path, config): 14 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 15 | trained_model = Qwen2ForCausalLM_pn.from_pretrained(lora_path, config=config, device_map="auto") 16 | gru = BeamSearchAttentionDecoder( 17 | hidden_size=config.hidden_size, num_sent=config.max_dec_len, topk=config.beam_size 18 | ) 19 | trained_model.set_gru(gru) 20 | trained_model.config.use_cache = False 21 | tokenizer.padding_side = "left" 22 | print("LORA WEIGHT LOADING") 23 | trained_model.load_pn_model(lora_path) 24 | return tokenizer, trained_model 25 | 26 | 27 | class InferenceInput: 28 | def __init__(self, _id, input_text, answer, attention_mask, sent_masks, gold_sp): 29 | self._id = _id 30 | self.input_text = input_text 31 | self.answer = answer 32 | self.attention_mask = attention_mask 33 | self.sent_masks = sent_masks 34 | self.gold_sp = gold_sp 35 | 36 | 37 | def create_example(all_example, tokenizer, data_sample): 38 | all_result = [] 39 | for example in tqdm(all_example): 40 | example["document"] = example["document"].strip() 41 | # token 된 doc 42 | token_doc = {"input_ids": [], "attention_mask": []} 43 | # document 문장 index 44 | sentence_number = 0 45 | sentence_position = [] 46 | for i, sent in enumerate(example["sent"]): 47 | # 0번 문장은 instruction으로 지정할 계획 48 | sent = sent.strip() 49 | token_sent = tokenizer(sent + " ", add_special_tokens=False) 50 | sentence_number += 1 # 1부터 시작 51 | sentence_position.extend([sentence_number] * len(token_sent["input_ids"])) 52 | token_doc["input_ids"] += token_sent["input_ids"] 53 | token_doc["attention_mask"] += token_sent["attention_mask"] 54 | token_end = tokenizer("<|im_end|>\n", add_special_tokens=False) 55 | sentence_position.extend([sentence_number] * len(token_end)) 56 | token_doc["input_ids"] += token_end["input_ids"] 57 | token_doc["attention_mask"] += token_end["attention_mask"] 58 | 59 | instruction = tokenizer( 60 | f"<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\n<|im_start|>user\n**Question:{example['question'].strip()}\n**Document:\n", 61 | add_special_tokens=False, 62 | ) 63 | response = f"<|im_start|>assistant\n**Answer:{example['output'].strip()}\n<|im_end|>\n" 64 | sentence_position = [0] * len(instruction["input_ids"]) + sentence_position 65 | input = instruction["input_ids"] + token_doc["input_ids"] 66 | attention_mask = instruction["attention_mask"] + token_doc["attention_mask"] 67 | output = response 68 | 69 | if "supporting_num" in example.keys(): 70 | gold_sp = example["supporting_num"] 71 | else: 72 | gold_sp = None 73 | assert len(input) == len(sentence_position) == len(attention_mask) 74 | 75 | all_result.append( 76 | InferenceInput( 77 | _id=example["_id"], 78 | input_text=input, 79 | answer=output, 80 | attention_mask=attention_mask, 81 | sent_masks=sentence_position, 82 | gold_sp=gold_sp, 83 | ) 84 | ) 85 | if data_sample: 86 | if len(all_result) == 100: 87 | break 88 | return all_result 89 | 90 | 91 | def create_batches(input_list, batch_size): 92 | # Split the input list into batches of size 'batch_size' 93 | for i in range(0, len(input_list), batch_size): 94 | yield input_list[i : i + batch_size] 95 | 96 | 97 | def generate_batch_answer(batches, tokenizer, model): 98 | for batch_num, batch in enumerate(tqdm(batches)): 99 | input_ids = [item.input_text for item in batch] 100 | attention_mask = [item.attention_mask for item in batch] 101 | sentence_masks = [item.sent_masks for item in batch] 102 | 103 | model.to("cuda") 104 | model.eval() 105 | input_batch = {} 106 | max_length = max(len(mask) for mask in input_ids) 107 | padded_input_ids = [[tokenizer.pad_token_id] * (max_length - len(mask)) + mask for mask in input_ids] 108 | input_batch["input_ids"] = torch.tensor(padded_input_ids).cuda() 109 | padded_attention_mask = [[0] * (max_length - len(mask)) + mask for mask in attention_mask] 110 | input_batch["attention_mask"] = torch.tensor(padded_attention_mask).cuda() 111 | padded_sentence_masks = [[0] * (max_length - len(mask)) + mask for mask in sentence_masks] 112 | input_batch["sent_masks"] = torch.tensor(padded_sentence_masks).cuda() 113 | 114 | with torch.no_grad(): 115 | model.evidence = None 116 | model.sentence_number = None 117 | outputs = model.generate( 118 | input_ids=input_batch["input_ids"], 119 | attention_mask=input_batch["attention_mask"], 120 | sent_masks=input_batch["sent_masks"], 121 | max_new_tokens=50, 122 | ) 123 | 124 | input_text = [tokenizer.decode(input_id, skip_special_tokens=True) for i, input_id in enumerate(input_ids)] 125 | decoded_outputs = [ 126 | tokenizer.decode(output[max_length:], skip_special_tokens=True) for i, output in enumerate(outputs) # slice off the padded prompt so only newly generated tokens are decoded 127 | ] 128 | decoded_outputs_ = [tokenizer.decode(output, skip_special_tokens=True) for i, output in enumerate(outputs)] 129 | 130 | # Store the generated text back in the input objects 131 | for i, item in enumerate(batch): 132 | item.input_text = input_text[i] 133 | item.generated_text = decoded_outputs[i] 134 | item.generated_all_answer = decoded_outputs_[i] 135 | if model.sentence_number != None: 136 | item.pred_sp = model.sentence_number[i] 137 | return batches 138 | 139 | 140 | def write_result(output_path, answer_batches, tokenizer): 141 | all_result = [] 142 | for batch_num, batch in enumerate(answer_batches): 143 | for item in batch: 144 | result = {} 145 | result["_id"] = item._id 146 | if "assistant\n" in item.generated_text: 147 | result["generated_text"] = item.generated_text.split("assistant\n")[1]
148 | elif "assistant" in item.generated_text: 149 | result["generated_text"] = item.generated_text.split("assistant")[1] 150 | else: 151 | result["generated_text"] = item.generated_text 152 | result["answer"] = item.answer 153 | result["generated_all_answer"] = item.generated_all_answer 154 | if item.gold_sp != None: 155 | result["gold_sp"] = item.gold_sp 156 | result["pred_sp"] = item.pred_sp.tolist() 157 | all_result.append(result) 158 | 159 | with open(output_path, "w", encoding="utf-8") as f: 160 | json.dump(all_result, f, ensure_ascii=False, indent=4) 161 | 162 | 163 | if __name__ == "__main__": 164 | ############################################################## 165 | # model param 추가할 내용 166 | ############################################################## 167 | parser = argparse.ArgumentParser(description="인자값을 전달받는 Python 스크립트") 168 | parser.add_argument("--base_model_path", type=str, default="Qwen/Qwen2.5-3B-Instruct") 169 | parser.add_argument("--train_model_path", type=str, default="model/1115_yesloss_final/checkpoint-2000") 170 | parser.add_argument("--data_file", type=str, default="data/1113data/hotpot_dev.json") 171 | parser.add_argument("--beam_size", type=int, default=1) 172 | parser.add_argument("--max_dec_len", type=int, default=3) 173 | parser.add_argument("--output_dir", type=str, default="result/1115_yesloss_final/2000.json") 174 | parser.add_argument("--batch_size", type=int, default=4) 175 | parser.add_argument("--data_sample", type=bool, default=False) 176 | 177 | args = parser.parse_args() 178 | print(args) 179 | ######################################################### 180 | # 변수들 선언 181 | ######################################################### 182 | 183 | config = AutoConfig.from_pretrained(args.base_model_path) 184 | config.beam_size = args.beam_size 185 | config.max_dec_len = args.max_dec_len 186 | 187 | tokenizer, model = create_model(args.base_model_path, args.train_model_path, config) 188 | print("batch size : ", args.batch_size) 189 | 190 | with open(args.data_file, "r", encoding="utf-8") as file: 191 | dev_data = json.load(file) 192 | 193 | input_data = create_example(dev_data, tokenizer, args.data_sample) 194 | 195 | # Create batches of input items 196 | batches = list(create_batches(input_data, args.batch_size)) 197 | 198 | answer_batches = generate_batch_answer(batches, tokenizer, model) 199 | #### 답변작성 200 | 201 | write_result(args.output_dir, answer_batches, tokenizer) 202 | -------------------------------------------------------------------------------- /source/inference_baseline.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig 2 | 3 | import torch 4 | from tqdm import tqdm 5 | import json 6 | from peft import PeftModel, PeftConfig 7 | from datasets import Dataset 8 | 9 | from modeling_qwen2_pn_att_1107_baseline import Qwen2ForCausalLM_pn, BeamSearchAttentionDecoder 10 | import argparse 11 | 12 | 13 | def create_model(base_model_path, lora_path, config): 14 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 15 | trained_model = Qwen2ForCausalLM_pn.from_pretrained(lora_path, config=config, device_map="auto") 16 | gru = BeamSearchAttentionDecoder( 17 | hidden_size=config.hidden_size, num_sent=config.max_dec_len, topk=config.beam_size 18 | ) 19 | trained_model.set_gru(gru) 20 | trained_model.config.use_cache = False 21 | tokenizer.padding_side = "left" 22 | print("LORA WEIGHT LOADING") 23 | trained_model.load_pn_model(lora_path) 
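    # At this point the returned model combines: (1) the Qwen2 backbone restored from the trained
    # checkpoint, (2) a freshly built BeamSearchAttentionDecoder whose beam width and number of
    # evidence sentences come from config.beam_size / config.max_dec_len, and (3) whatever extra
    # weights load_pn_model() restores from lora_path (presumably the pointer-network / decoder
    # parameters saved alongside the checkpoint). Left padding is used so that, during batched
    # generation, new tokens are appended directly after each prompt.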
24 | return tokenizer, trained_model 25 | 26 | 27 | class InferenceInput: 28 | def __init__(self, _id, input_text, answer, attention_mask, sent_masks, gold_sp): 29 | self._id = _id 30 | self.input_text = input_text 31 | self.answer = answer 32 | self.attention_mask = attention_mask 33 | self.sent_masks = sent_masks 34 | self.gold_sp = gold_sp 35 | 36 | 37 | def create_example(all_example, tokenizer, data_sample): 38 | all_result = [] 39 | for example in tqdm(all_example): 40 | example["document"] = example["document"].strip() 41 | # token 된 doc 42 | token_doc = {"input_ids": [], "attention_mask": []} 43 | # document 문장 index 44 | sentence_number = 0 45 | sentence_position = [] 46 | for i, sent in enumerate(example["sent"]): 47 | # 0번 문장은 instruction으로 지정할 계획 48 | sent = sent.strip() 49 | token_sent = tokenizer(sent + " ", add_special_tokens=False) 50 | sentence_number += 1 # 1부터 시작 51 | sentence_position.extend([sentence_number] * len(token_sent["input_ids"])) 52 | token_doc["input_ids"] += token_sent["input_ids"] 53 | token_doc["attention_mask"] += token_sent["attention_mask"] 54 | token_end = tokenizer("<|im_end|>\n", add_special_tokens=False) 55 | sentence_position.extend([sentence_number] * len(token_end)) 56 | token_doc["input_ids"] += token_end["input_ids"] 57 | token_doc["attention_mask"] += token_end["attention_mask"] 58 | 59 | instruction = tokenizer( 60 | f"<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n**Question:{example['question'].strip()}\n**Document:\n", 61 | add_special_tokens=False, 62 | ) 63 | response = f"<|im_start|>assistant\n**Answer:{example['output'].strip()}\n<|im_end|>\n" 64 | sentence_position = [0] * len(instruction["input_ids"]) + sentence_position 65 | input = instruction["input_ids"] + token_doc["input_ids"] 66 | attention_mask = instruction["attention_mask"] + token_doc["attention_mask"] 67 | output = response 68 | 69 | if "supporting_num" in example.keys(): 70 | gold_sp = example["supporting_num"] 71 | else: 72 | gold_sp = None 73 | assert len(input) == len(sentence_position) == len(attention_mask) 74 | 75 | all_result.append( 76 | InferenceInput( 77 | _id=example["_id"], 78 | input_text=input, 79 | answer=output, 80 | attention_mask=attention_mask, 81 | sent_masks=sentence_position, 82 | gold_sp=gold_sp, 83 | ) 84 | ) 85 | if data_sample: 86 | if len(all_result) == 100: 87 | break 88 | return all_result 89 | 90 | 91 | def create_batches(input_list, batch_size): 92 | # Split the input list into batches of size 'batch_size' 93 | for i in range(0, len(input_list), batch_size): 94 | yield input_list[i : i + batch_size] 95 | 96 | 97 | def generate_batch_answer(batches, tokenizer, model): 98 | for batch_num, batch in enumerate(tqdm(batches)): 99 | input_ids = [item.input_text for item in batch] 100 | attention_mask = [item.attention_mask for item in batch] 101 | sentence_masks = [item.sent_masks for item in batch] 102 | 103 | model.to("cuda") 104 | input_batch = {} 105 | max_length = max(len(mask) for mask in input_ids) 106 | padded_input_ids = [[tokenizer.pad_token_id] * (max_length - len(mask)) + mask for mask in input_ids] 107 | input_batch["input_ids"] = torch.tensor(padded_input_ids).cuda() 108 | padded_attention_mask = [[0] * (max_length - len(mask)) + mask for mask in attention_mask] 109 | input_batch["attention_mask"] = torch.tensor(padded_attention_mask).cuda() 110 | padded_sentence_masks = [[0] * (max_length - len(mask)) + mask for mask in sentence_masks] 111 | 
input_batch["sent_masks"] = torch.tensor(padded_sentence_masks).cuda() 112 | 113 | with torch.no_grad(): 114 | model.evidence = None 115 | model.sentence_number = None 116 | outputs = model.generate( 117 | input_ids=input_batch["input_ids"], 118 | attention_mask=input_batch["attention_mask"], 119 | sent_masks=input_batch["sent_masks"], 120 | max_new_tokens=50, 121 | ) 122 | 123 | input_text = [tokenizer.decode(input_id, skip_special_tokens=True) for i, input_id in enumerate(input_ids)] 124 | decoded_outputs = [ 125 | tokenizer.decode(output[len(input_text) :], skip_special_tokens=True) for i, output in enumerate(outputs) 126 | ] 127 | decoded_outputs_ = [tokenizer.decode(output, skip_special_tokens=True) for i, output in enumerate(outputs)] 128 | 129 | # Store the generated text back in the input objects 130 | for i, item in enumerate(batch): 131 | item.input_text = input_text 132 | item.generated_text = decoded_outputs[i] 133 | item.generated_all_answer = decoded_outputs_[i] 134 | if model.sentence_number != None: 135 | item.pred_sp = model.sentence_number[i] 136 | else: 137 | item.pred_sp = None 138 | return batches 139 | 140 | 141 | def write_result(output_path, answer_batches, tokenizer): 142 | all_result = [] 143 | for batch_num, batch in enumerate(answer_batches): 144 | for item in batch: 145 | result = {} 146 | result["_id"] = item._id 147 | if "assistant\n" in item.generated_text: 148 | result["generated_text"] = item.generated_text.split("assistant\n")[1] 149 | elif "assistant" in item.generated_text: 150 | result["generated_text"] = item.generated_text.split("assistant")[1] 151 | else: 152 | result["generated_text"] = item.generated_text 153 | result["answer"] = item.answer 154 | result["generated_all_answer"] = item.generated_all_answer 155 | if item.gold_sp != None and item.pred_sp != None: 156 | result["gold_sp"] = item.gold_sp 157 | result["pred_sp"] = item.pred_sp.tolist() 158 | all_result.append(result) 159 | 160 | with open(output_path, "w", encoding="utf-8") as f: 161 | json.dump(all_result, f, ensure_ascii=False, indent=4) 162 | 163 | 164 | if __name__ == "__main__": 165 | ############################################################## 166 | # model param 추가할 내용 167 | ############################################################## 168 | parser = argparse.ArgumentParser(description="인자값을 전달받는 Python 스크립트") 169 | parser.add_argument("--base_model_path", type=str, default="Qwen/Qwen2.5-3B-Instruct") 170 | parser.add_argument("--train_model_path", type=str, default="model/1127_baseline_no_causal/checkpoint-2600") 171 | parser.add_argument("--data_file", type=str, default="data/1113data/hotpot_dev.json") 172 | parser.add_argument("--beam_size", type=int, default=1) 173 | parser.add_argument("--max_dec_len", type=int, default=3) 174 | parser.add_argument("--output_dir", type=str, default="result/1127_baseline_no_causal/2600.json") 175 | parser.add_argument("--batch_size", type=int, default=8) 176 | parser.add_argument("--data_sample", type=bool, default=False) 177 | 178 | args = parser.parse_args() 179 | print(args) 180 | ######################################################### 181 | # 변수들 선언 182 | ######################################################### 183 | 184 | config = AutoConfig.from_pretrained(args.base_model_path) 185 | config.beam_size = args.beam_size 186 | config.max_dec_len = args.max_dec_len 187 | 188 | tokenizer, model = create_model(args.base_model_path, args.train_model_path, config) 189 | print("batch size : ", args.batch_size) 190 | 191 | with 
open(args.data_file, "r", encoding="utf-8") as file: 192 | dev_data = json.load(file) 193 | 194 | input_data = create_example(dev_data, tokenizer, args.data_sample) 195 | 196 | # Create batches of input items 197 | batches = list(create_batches(input_data, args.batch_size)) 198 | 199 | answer_batches = generate_batch_answer(batches, tokenizer, model) 200 | #### 답변작성 201 | 202 | write_result(args.output_dir, answer_batches, tokenizer) 203 | -------------------------------------------------------------------------------- /source/inference_mean.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModelForCausalLM 2 | 3 | import torch 4 | from tqdm import tqdm 5 | import json 6 | from peft import PeftModel, PeftConfig 7 | from datasets import Dataset 8 | 9 | from modeling_qwen2_mean import Qwen2ForCausalLM 10 | import argparse 11 | 12 | 13 | def create_model(base_model_path, lora_path): 14 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 15 | 16 | # AutoModelForCausalLM -> Qwen2ForCausalLM 17 | base_model = Qwen2ForCausalLM.from_pretrained(base_model_path, device_map="auto") 18 | new_special_tokens = {"additional_special_tokens": ["<|mrc|>", "<|summary|>"]} 19 | tokenizer.add_special_tokens(new_special_tokens) 20 | base_model.resize_token_embeddings(len(tokenizer)) 21 | base_model.config.use_cache = False 22 | tokenizer.padding_side = "left" 23 | peft_model = PeftModel.from_pretrained(base_model, lora_path) 24 | return tokenizer, peft_model 25 | 26 | 27 | class InferenceInput: 28 | def __init__(self, _id, input_text, answer): 29 | self._id = _id 30 | self.input_text = input_text 31 | self.answer = answer 32 | 33 | 34 | def create_example(all_example, tokenizer): 35 | all_result = [] 36 | task_instruction = "Only fill in the **Answer to the **Question based on the **Document if <|MRC|> is True. Do not fill in the **Answer if the Question is not provided or if <|MRC|> is False. Only fill in the **Summary with a summary of the **Document if <|SUM|> is True. Do not fill in the **Summary if <|SUM|> is False." 
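    # Illustrative sketch (assuming the default Qwen2.5 chat template) of the string that
    # apply_chat_template(..., tokenize=False) returns for the MRC branch below, with the
    # document text abbreviated:
    #
    #   <|im_start|>system
    #   {task_instruction}
    #   <|MRC|>True<|SUM|>False<|im_end|>
    #   <|im_start|>user
    #   **Question:...
    #   **Document:
    #   ...<|im_end|>
    #
    # add_generation_prompt is not passed, so no "<|im_start|>assistant" header is appended;
    # the model is expected to produce it itself, and write_result later splits on the
    # assistant header to recover the answer.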
37 | for example in tqdm(all_example): 38 | if example["question"] == "summary": 39 | messages = [ 40 | {"role": "system", "content": f"{task_instruction}\n<|MRC|>True<|SUM|>True"}, 41 | {"role": "user", "content": f"**Document:\n{example['document']}"}, 42 | ] 43 | else: # MRC case 44 | messages = [ 45 | { 46 | "role": "system", 47 | "content": f"{task_instruction}\n<|MRC|>True<|SUM|>False", 48 | }, 49 | {"role": "user", "content": f"**Question:{example['question']}\n**Document:\n{example['document']}"}, 50 | ] 51 | 52 | result = {} 53 | result["input"] = tokenizer.apply_chat_template(messages, tokenize=False) 54 | result["output"] = example["output"] 55 | all_result.append(InferenceInput(_id=example["_id"], input_text=result["input"], answer=result["output"])) 56 | # if len(all_result) == 100: 57 | # break 58 | return all_result 59 | 60 | 61 | def create_batches(input_list, batch_size): 62 | # Split the input list into batches of size 'batch_size' 63 | for i in range(0, len(input_list), batch_size): 64 | yield input_list[i : i + batch_size] 65 | 66 | 67 | def generate_batch_answer(batches, tokenizer, model): 68 | for batch_num, batch in enumerate(tqdm(batches)): 69 | batch_texts = [item.input_text for item in batch] 70 | inputs = tokenizer( 71 | batch_texts, # Chat-template formatted texts to be tokenized 72 | return_tensors="pt", # Return in tensor format 73 | padding=True, # Pad sequences to the same length 74 | ).to("cuda") 75 | model.to("cuda") 76 | with torch.no_grad(): 77 | model.model.evidence = None 78 | outputs = model.generate( 79 | **inputs, 80 | max_new_tokens=512, 81 | ) 82 | 83 | decoded_outputs = [ 84 | tokenizer.decode(output[len(inputs[i]) :], skip_special_tokens=True) for i, output in enumerate(outputs) 85 | ] 86 | decoded_outputs_ = [tokenizer.decode(output, skip_special_tokens=True) for i, output in enumerate(outputs)] 87 | 88 | # Store the generated text back in the input objects 89 | for i, item in enumerate(batch): 90 | item.generated_text = decoded_outputs[i] 91 | item.generated_all_answer = decoded_outputs_[i] 92 | return batches 93 | 94 | 95 | def write_result(output_path): 96 | all_result = [] 97 | for batch_num, batch in enumerate(answer_batches): 98 | for item in batch: 99 | result = {} 100 | result["_id"] = item._id 101 | result["input_text"] = item.input_text 102 | if "assistant\n" in item.generated_text: 103 | result["generated_text"] = item.generated_text.split("assistant\n")[1] 104 | else: 105 | result["generated_text"] = item.generated_text 106 | result["answer"] = item.answer 107 | result["generated_all_answer"] = item.generated_all_answer 108 | all_result.append(result) 109 | 110 | with open(output_path, "w", encoding="utf-8") as f: 111 | json.dump(all_result, f, ensure_ascii=False, indent=4) 112 | 113 | 114 | if __name__ == "__main__": 115 | parser = argparse.ArgumentParser(description="Python script that takes command-line arguments") 116 | parser.add_argument("--model_path", type=str, required=True, help="path to the trained model") 117 | parser.add_argument("--output_path", type=str, required=True, help="path to save the results") 118 | args = parser.parse_args() 119 | model_path = args.model_path 120 | output_path = args.output_path 121 | 122 | # model_path = "model/mean/checkpoint-1000" 123 | # output_path = "result/mean/hotpot_1000.json" 124 | ########################################## 125 | 126 | base_model_path = "Qwen/Qwen2.5-3B-Instruct" 127 | 128 | tokenizer, model = create_model(base_model_path, model_path) 129 | 130 | file_path = "data/1008data/hotpot_dev.json" 131 | batch_size = 16 132 |
print(batch_size) 133 | 134 | with open(file_path, "r", encoding="utf-8") as file: 135 | dev_data = json.load(file) 136 | 137 | input_data = create_example(dev_data, tokenizer) 138 | 139 | # Create batches of input items 140 | batches = list(create_batches(input_data, batch_size)) 141 | 142 | answer_batches = generate_batch_answer(batches, tokenizer, model) 143 | #### 답변작성 144 | 145 | write_result(output_path) 146 | -------------------------------------------------------------------------------- /source/inference_origin.py: -------------------------------------------------------------------------------- 1 | <<<<<<< HEAD 2 | from transformers import AutoTokenizer, AutoModelForCausalLM, Qwen2ForCausalLM 3 | ======= 4 | from transformers import AutoTokenizer, AutoModelForCausalLM, Qwen2ForCausalLM, AutoConfig 5 | >>>>>>> 11d7d8a4757072d730dfacc957f0a3763ec1975f 6 | 7 | import torch 8 | from tqdm import tqdm 9 | import json 10 | from peft import PeftModel, PeftConfig 11 | from datasets import Dataset 12 | 13 | import argparse 14 | 15 | 16 | <<<<<<< HEAD 17 | def create_model(base_model_path, lora_path): 18 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 19 | 20 | # AutoModelForCausalLM -> Qwen2ForCausalLM 21 | base_model = Qwen2ForCausalLM.from_pretrained(base_model_path, device_map="auto") 22 | new_special_tokens = {"additional_special_tokens": ["<|mrc|>", "<|summary|>"]} 23 | tokenizer.add_special_tokens(new_special_tokens) 24 | base_model.resize_token_embeddings(len(tokenizer)) 25 | ======= 26 | def create_model(base_model_path, lora_path, config): 27 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 28 | base_model = Qwen2ForCausalLM.from_pretrained(base_model_path, config=config, device_map="auto") 29 | >>>>>>> 11d7d8a4757072d730dfacc957f0a3763ec1975f 30 | base_model.config.use_cache = False 31 | tokenizer.padding_side = "left" 32 | peft_model = PeftModel.from_pretrained(base_model, lora_path) 33 | return tokenizer, peft_model 34 | 35 | 36 | class InferenceInput: 37 | <<<<<<< HEAD 38 | def __init__(self, _id, input_text, answer): 39 | self._id = _id 40 | self.input_text = input_text 41 | self.answer = answer 42 | 43 | 44 | def create_example(all_example, tokenizer): 45 | all_result = [] 46 | task_instruction = "Only fill in the **Answer to the **Question based on the **Document if <|MRC|> is True. Do not fill in the **Answer if the Question is not provided or if <|MRC|> is False. Only fill in the **Summary with a summary of the **Document if <|SUM|> is True. Do not fill in the **Summary if <|SUM|> is False." 
47 | for example in tqdm(all_example): 48 | if example["question"] == "summary": 49 | messages = [ 50 | {"role": "system", "content": f"{task_instruction}\n<|MRC|>True<|SUM|>True"}, 51 | {"role": "user", "content": f"**Document:\n{example['document']}"}, 52 | ] 53 | else: # MRC의 경우 54 | messages = [ 55 | { 56 | "role": "system", 57 | "content": f"{task_instruction}\n<|MRC|>True<|SUM|>True", 58 | }, 59 | {"role": "user", "content": f"**Question:{example['question']}\n**Document:\n{example['document']}"}, 60 | ] 61 | 62 | result = {} 63 | result["input"] = tokenizer.apply_chat_template(messages, tokenize=False) 64 | result["output"] = example["output"] 65 | all_result.append(InferenceInput(_id=example["_id"], input_text=result["input"], answer=result["output"])) 66 | if len(all_result) == 100: 67 | break 68 | ======= 69 | def __init__(self, _id, input_text, answer, attention_mask, sent_masks): 70 | self._id = _id 71 | self.input_text = input_text 72 | self.answer = answer 73 | self.attention_mask = attention_mask 74 | self.sent_masks = sent_masks 75 | 76 | 77 | def create_example(all_example, tokenizer, data_sample, mrc_value, sum_value): 78 | all_result = [] 79 | 80 | task_instruction = "Only fill in the **Answer to the **Question based on the **Document if <|MRC|> is True. Do not fill in the **Answer if the Question is not provided or if <|MRC|> is False. Only fill in the **Summary with a summary of the **Document if <|SUM|> is True. Do not fill in the **Summary if <|SUM|> is False." 81 | for example in tqdm(all_example): 82 | example["document"] = example["document"].strip() 83 | # token 된 doc 84 | token_doc = {"input_ids": [], "attention_mask": []} 85 | # document 문장 index 86 | sentence_number = 0 87 | sentence_position = [] 88 | for i, sent in enumerate(example["sent"]): 89 | # 0번 문장은 instruction으로 지정할 계획 90 | sent = sent.strip() 91 | token_sent = tokenizer(sent + " ", add_special_tokens=False) 92 | sentence_number += 1 # 1부터 시작 93 | sentence_position.extend([sentence_number] * len(token_sent["input_ids"])) 94 | token_doc["input_ids"] += token_sent["input_ids"] 95 | token_doc["attention_mask"] += token_sent["attention_mask"] 96 | token_end = tokenizer("<|im_end|>\n", add_special_tokens=False) 97 | sentence_position.extend([sentence_number] * len(token_end)) 98 | token_doc["input_ids"] += token_end["input_ids"] 99 | token_doc["attention_mask"] += token_end["attention_mask"] 100 | 101 | if example["question"] == "summary": 102 | instruction = tokenizer( 103 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Document:\n", 104 | add_special_tokens=False, 105 | ) 106 | response = tokenizer( 107 | f"<|im_start|>assistant\n**Answer:\n**Summary:{example['output'].strip()}\n<|im_end|>\n", 108 | add_special_tokens=False, 109 | ) 110 | else: # MRC의 경우 111 | instruction = tokenizer( 112 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Question:{example['question'].strip()}\n**Document:\n", 113 | add_special_tokens=False, 114 | ) 115 | response = tokenizer( 116 | f"<|im_start|>assistant\n**Answer:{example['output'].strip()}\n**Summary:\n<|im_end|>\n", 117 | add_special_tokens=False, 118 | ) 119 | 120 | sentence_position = [0] * len(instruction["input_ids"]) + sentence_position 121 | input = instruction["input_ids"] + token_doc["input_ids"] 122 | attention_mask = instruction["attention_mask"] + token_doc["attention_mask"] 123 | output = example["output"] 124 | assert len(input) 
== len(sentence_position) == len(attention_mask) 125 | 126 | all_result.append( 127 | InferenceInput( 128 | _id=example["_id"], 129 | input_text=input, 130 | answer=output, 131 | attention_mask=attention_mask, 132 | sent_masks=sentence_position, 133 | ) 134 | ) 135 | if data_sample: 136 | if len(all_result) == 1: 137 | break 138 | >>>>>>> 11d7d8a4757072d730dfacc957f0a3763ec1975f 139 | return all_result 140 | 141 | 142 | def create_batches(input_list, batch_size): 143 | # Split the input list into batches of size 'batch_size' 144 | for i in range(0, len(input_list), batch_size): 145 | yield input_list[i : i + batch_size] 146 | 147 | 148 | def generate_batch_answer(batches, tokenizer, model): 149 | for batch_num, batch in enumerate(tqdm(batches)): 150 | <<<<<<< HEAD 151 | batch_texts = [item.input_text for item in batch] 152 | inputs = tokenizer( 153 | batch_texts, # Tokenized texts after applying chat template 154 | return_tensors="pt", # Return in tensor format 155 | padding=True, # Pad sequences to the same length 156 | ).to("cuda") 157 | model.to("cuda") 158 | with torch.no_grad(): 159 | model.model.evidence = None 160 | outputs = model.generate( 161 | **inputs, 162 | max_new_tokens=512, 163 | ) 164 | 165 | decoded_outputs = [ 166 | tokenizer.decode(output[len(inputs[i]) :], skip_special_tokens=True) for i, output in enumerate(outputs) 167 | ======= 168 | input_ids = [item.input_text for item in batch] 169 | attention_mask = [item.attention_mask for item in batch] 170 | sentence_masks = [item.sent_masks for item in batch] 171 | 172 | model.to("cuda") 173 | input_batch = {} 174 | max_length = max(len(mask) for mask in input_ids) 175 | padded_input_ids = [[tokenizer.pad_token_id] * (max_length - len(mask)) + mask for mask in input_ids] 176 | input_batch["input_ids"] = torch.tensor(padded_input_ids).cuda() 177 | padded_attention_mask = [[0] * (max_length - len(mask)) + mask for mask in attention_mask] 178 | input_batch["attention_mask"] = torch.tensor(padded_attention_mask).cuda() 179 | padded_sentence_masks = [[0] * (max_length - len(mask)) + mask for mask in sentence_masks] 180 | input_batch["sent_masks"] = torch.tensor(padded_sentence_masks).cuda() 181 | 182 | with torch.no_grad(): 183 | model.evidence = None 184 | outputs = model.generate( 185 | input_ids=input_batch["input_ids"], 186 | attention_mask=input_batch["attention_mask"], 187 | max_new_tokens=200, 188 | ) 189 | 190 | input_text = [tokenizer.decode(input_id, skip_special_tokens=True) for i, input_id in enumerate(input_ids)] 191 | 192 | decoded_outputs = [ 193 | tokenizer.decode(output[len(input_text) :], skip_special_tokens=True) for i, output in enumerate(outputs) 194 | >>>>>>> 11d7d8a4757072d730dfacc957f0a3763ec1975f 195 | ] 196 | decoded_outputs_ = [tokenizer.decode(output, skip_special_tokens=True) for i, output in enumerate(outputs)] 197 | 198 | # Store the generated text back in the input objects 199 | for i, item in enumerate(batch): 200 | <<<<<<< HEAD 201 | ======= 202 | item.input_text = input_text 203 | >>>>>>> 11d7d8a4757072d730dfacc957f0a3763ec1975f 204 | item.generated_text = decoded_outputs[i] 205 | item.generated_all_answer = decoded_outputs_[i] 206 | return batches 207 | 208 | 209 | <<<<<<< HEAD 210 | def write_result(output_path): 211 | ======= 212 | def write_result(output_path, answer_batches, tokenizer): 213 | >>>>>>> 11d7d8a4757072d730dfacc957f0a3763ec1975f 214 | all_result = [] 215 | for batch_num, batch in enumerate(answer_batches): 216 | for item in batch: 217 | result = {} 218 | result["_id"] = item._id 
219 | result["input_text"] = item.input_text 220 | <<<<<<< HEAD 221 | if "assistant" in item.generated_text: 222 | result["generated_text"] = item.generated_text.split("assistant\n")[1] 223 | ======= 224 | if "assistant\n" in item.generated_text: 225 | result["generated_text"] = item.generated_text.split("assistant\n")[1] 226 | elif "assistant" in item.generated_text: 227 | result["generated_text"] = item.generated_text.split("assistant")[1] 228 | >>>>>>> 11d7d8a4757072d730dfacc957f0a3763ec1975f 229 | else: 230 | result["generated_text"] = item.generated_text 231 | result["answer"] = item.answer 232 | result["generated_all_answer"] = item.generated_all_answer 233 | all_result.append(result) 234 | 235 | with open(output_path, "w", encoding="utf-8") as f: 236 | json.dump(all_result, f, ensure_ascii=False, indent=4) 237 | 238 | 239 | if __name__ == "__main__": 240 | <<<<<<< HEAD 241 | # parser = argparse.ArgumentParser(description="인자값을 전달받는 Python 스크립트") 242 | # parser.add_argument("--model_path", type=str, required=True, help="모델 경로") 243 | # parser.add_argument("--output_path", type=str, required=True, help="결과저장 경로") 244 | # args = parser.parse_args() 245 | # model_path = args.model_path 246 | # output_path = args.output_path 247 | 248 | model_path = "model/hotpot_cnn/checkpoint-3000" 249 | output_path = "result/hotpot_cnn/hotpot_3000_tt.json" 250 | 251 | base_model_path = "Qwen/Qwen2.5-3B-Instruct" 252 | 253 | tokenizer, model = create_model(base_model_path, model_path) 254 | 255 | file_path = "data/1008data/hotpot_dev.json" 256 | batch_size = 16 257 | print(batch_size) 258 | 259 | with open(file_path, "r", encoding="utf-8") as file: 260 | dev_data = json.load(file) 261 | 262 | input_data = create_example(dev_data, tokenizer) 263 | 264 | # Create batches of input items 265 | batches = list(create_batches(input_data, batch_size)) 266 | ======= 267 | parser = argparse.ArgumentParser(description="인자값을 전달받는 Python 스크립트") 268 | parser.add_argument("--base_model_path", type=str, default="Qwen/Qwen2.5-3B-Instruct") 269 | parser.add_argument("--train_model_path", type=str, default="/hdd/rbqlsquf/hotpot_cnn/checkpoint-4000") 270 | parser.add_argument("--data_file", type=str, default="data/1029data/hotpot_dev.json") 271 | parser.add_argument("--beam_size", type=int, default=1) 272 | parser.add_argument("--max_dec_len", type=int, default=3) 273 | parser.add_argument("--output_dir", type=str, default="result/hotpot_cnn/hotpot_tf.json") 274 | parser.add_argument("--batch_size", type=int, default=8) 275 | parser.add_argument("--data_sample", type=bool, default=True) 276 | parser.add_argument("--mrc_value", type=str, default=True) 277 | parser.add_argument("--sum_value", type=str, default=False) 278 | args = parser.parse_args() 279 | print(args) 280 | ######################################################### 281 | # 변수들 선언 282 | ######################################################### 283 | config = AutoConfig.from_pretrained(args.base_model_path) 284 | config.beam_size = args.beam_size 285 | config.max_dec_len = args.max_dec_len 286 | 287 | tokenizer, model = create_model(args.base_model_path, args.train_model_path, config) 288 | print("batch size : ", args.batch_size) 289 | 290 | with open(args.data_file, "r", encoding="utf-8") as file: 291 | dev_data = json.load(file) 292 | 293 | input_data = create_example(dev_data, tokenizer, args.data_sample, args.mrc_value, args.sum_value) 294 | 295 | # Create batches of input items 296 | batches = list(create_batches(input_data, args.batch_size)) 297 | >>>>>>> 
11d7d8a4757072d730dfacc957f0a3763ec1975f 298 | 299 | answer_batches = generate_batch_answer(batches, tokenizer, model) 300 | #### 답변작성 301 | 302 | <<<<<<< HEAD 303 | write_result(output_path) 304 | ======= 305 | write_result(args.output_dir, answer_batches, tokenizer) 306 | >>>>>>> 11d7d8a4757072d730dfacc957f0a3763ec1975f 307 | -------------------------------------------------------------------------------- /source/inference_pn.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig 2 | 3 | import torch 4 | from tqdm import tqdm 5 | import json 6 | from peft import PeftModel, PeftConfig 7 | from datasets import Dataset 8 | 9 | from modeling_qwen2_pn import Qwen2ForCausalLM_pn, BeamSearchAttentionDecoder 10 | import argparse 11 | 12 | 13 | def create_model(base_model_path, lora_path, config): 14 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 15 | trained_model = Qwen2ForCausalLM_pn.from_pretrained(lora_path, config=config, device_map="auto") 16 | gru = BeamSearchAttentionDecoder(hidden_size=config.hidden_size, num_sent=config.max_dec_len, topk=config.beam_size) 17 | trained_model.set_gru(gru) 18 | trained_model.config.use_cache = False 19 | tokenizer.padding_side = "left" 20 | trained_model.load_pn_model(lora_path) 21 | return tokenizer, trained_model 22 | 23 | 24 | class InferenceInput: 25 | def __init__(self, _id, input_text, answer, attention_mask, sent_masks, gold_sp): 26 | self._id = _id 27 | self.input_text = input_text 28 | self.answer = answer 29 | self.attention_mask = attention_mask 30 | self.sent_masks = sent_masks 31 | self.gold_sp = gold_sp 32 | 33 | 34 | def create_example(all_example, tokenizer, data_sample, mrc_value, sum_value): 35 | all_result = [] 36 | 37 | task_instruction = "Only fill in the **Answer to the **Question based on the **Document if <|MRC|> is True. Do not fill in the **Answer if the Question is not provided or if <|MRC|> is False. Only fill in the **Summary with a summary of the **Document if <|SUM|> is True. Do not fill in the **Summary if <|SUM|> is False." 
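    # The loop below tags every document token with the index of the sentence it belongs to
    # (0 is reserved for the instruction, sentences count from 1), presumably so the
    # pointer-network decoder can score whole sentences. A small illustrative alignment
    # (token counts are made up):
    #
    #   instruction tokens            -> sentence_position 0, 0, ..., 0
    #   tokens of example["sent"][0]  -> 1, 1, 1, 1
    #   tokens of example["sent"][1]  -> 2, 2, 2
    #   trailing "<|im_end|>\n"       -> repeats the last sentence number
    #
    # The assert further down checks that input ids, attention mask and sentence_position all
    # have the same length.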
38 | for example in tqdm(all_example): 39 | example["document"] = example["document"].strip() 40 | # token 된 doc 41 | token_doc = {"input_ids": [], "attention_mask": []} 42 | # document 문장 index 43 | sentence_number = 0 44 | sentence_position = [] 45 | for i, sent in enumerate(example["sent"]): 46 | # 0번 문장은 instruction으로 지정할 계획 47 | sent = sent.strip() 48 | token_sent = tokenizer(sent + " ", add_special_tokens=False) 49 | sentence_number += 1 # 1부터 시작 50 | sentence_position.extend([sentence_number] * len(token_sent["input_ids"])) 51 | token_doc["input_ids"] += token_sent["input_ids"] 52 | token_doc["attention_mask"] += token_sent["attention_mask"] 53 | token_end = tokenizer("<|im_end|>\n", add_special_tokens=False) 54 | sentence_position.extend([sentence_number] * len(token_end)) 55 | token_doc["input_ids"] += token_end["input_ids"] 56 | token_doc["attention_mask"] += token_end["attention_mask"] 57 | 58 | if example["question"] == "summary": 59 | instruction = tokenizer( 60 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Document:\n", 61 | add_special_tokens=False, 62 | ) 63 | # response = tokenizer( 64 | # f"<|im_start|>assistant\n**Answer:\n**Summary:{example['output'].strip()}\n<|im_end|>\n", 65 | # add_special_tokens=False, 66 | # ) 67 | response = f"<|im_start|>assistant\n**Answer:\n**Summary:{example['output'].strip()}\n<|im_end|>\n" 68 | else: # MRC의 경우 69 | instruction = tokenizer( 70 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Question:{example['question'].strip()}\n**Document:\n", 71 | add_special_tokens=False, 72 | ) 73 | # response = tokenizer( 74 | # f"<|im_start|>assistant\n**Answer:{example['output'].strip()}\n**Summary:\n<|im_end|>\n", 75 | # add_special_tokens=False, 76 | # ) 77 | response = f"<|im_start|>assistant\n**Answer:{example['output'].strip()}\n**Summary:\n<|im_end|>\n" 78 | sentence_position = [0] * len(instruction["input_ids"]) + sentence_position 79 | input = instruction["input_ids"] + token_doc["input_ids"] 80 | attention_mask = instruction["attention_mask"] + token_doc["attention_mask"] 81 | output = response 82 | 83 | if "supporting_num" in example.keys(): 84 | gold_sp = example["supporting_num"] 85 | else: 86 | gold_sp = None 87 | assert len(input) == len(sentence_position) == len(attention_mask) 88 | 89 | all_result.append( 90 | InferenceInput( 91 | _id=example["_id"], 92 | input_text=input, 93 | answer=output, 94 | attention_mask=attention_mask, 95 | sent_masks=sentence_position, 96 | gold_sp=gold_sp, 97 | ) 98 | ) 99 | if data_sample: 100 | if len(all_result) == 100: 101 | break 102 | return all_result 103 | 104 | 105 | def create_batches(input_list, batch_size): 106 | # Split the input list into batches of size 'batch_size' 107 | for i in range(0, len(input_list), batch_size): 108 | yield input_list[i : i + batch_size] 109 | 110 | 111 | def generate_batch_answer(batches, tokenizer, model): 112 | for batch_num, batch in enumerate(tqdm(batches)): 113 | input_ids = [item.input_text for item in batch] 114 | attention_mask = [item.attention_mask for item in batch] 115 | sentence_masks = [item.sent_masks for item in batch] 116 | 117 | model.to("cuda") 118 | input_batch = {} 119 | max_length = max(len(mask) for mask in input_ids) 120 | padded_input_ids = [[tokenizer.pad_token_id] * (max_length - len(mask)) + mask for mask in input_ids] 121 | input_batch["input_ids"] = torch.tensor(padded_input_ids).cuda() 122 | 
padded_attention_mask = [[0] * (max_length - len(mask)) + mask for mask in attention_mask] 123 | input_batch["attention_mask"] = torch.tensor(padded_attention_mask).cuda() 124 | padded_sentence_masks = [[0] * (max_length - len(mask)) + mask for mask in sentence_masks] 125 | input_batch["sent_masks"] = torch.tensor(padded_sentence_masks).cuda() 126 | 127 | with torch.no_grad(): 128 | model.evidence = None 129 | model.sentence_number = None 130 | outputs = model.generate( 131 | input_ids=input_batch["input_ids"], 132 | attention_mask=input_batch["attention_mask"], 133 | sent_masks=input_batch["sent_masks"], 134 | max_new_tokens=200, 135 | ) 136 | 137 | input_text = [tokenizer.decode(input_id, skip_special_tokens=True) for i, input_id in enumerate(input_ids)] 138 | decoded_outputs = [ 139 | tokenizer.decode(output[len(input_text) :], skip_special_tokens=True) for i, output in enumerate(outputs) 140 | ] 141 | decoded_outputs_ = [tokenizer.decode(output, skip_special_tokens=True) for i, output in enumerate(outputs)] 142 | 143 | # Store the generated text back in the input objects 144 | for i, item in enumerate(batch): 145 | item.input_text = input_text 146 | item.generated_text = decoded_outputs[i] 147 | item.generated_all_answer = decoded_outputs_[i] 148 | if model.sentence_number != None: 149 | item.pred_sp = model.sentence_number[i] 150 | return batches 151 | 152 | 153 | def write_result(output_path, answer_batches, tokenizer): 154 | all_result = [] 155 | for batch_num, batch in enumerate(answer_batches): 156 | for item in batch: 157 | result = {} 158 | result["_id"] = item._id 159 | if "assistant\n" in item.generated_text: 160 | result["generated_text"] = item.generated_text.split("assistant\n")[1] 161 | elif "assistant" in item.generated_text: 162 | result["generated_text"] = item.generated_text.split("assistant")[1] 163 | else: 164 | result["generated_text"] = item.generated_text 165 | result["answer"] = item.answer 166 | result["generated_all_answer"] = item.generated_all_answer 167 | if item.gold_sp != None: 168 | result["gold_sp"] = item.gold_sp 169 | result["pred_sp"] = item.pred_sp.tolist() 170 | all_result.append(result) 171 | 172 | with open(output_path, "w", encoding="utf-8") as f: 173 | json.dump(all_result, f, ensure_ascii=False, indent=4) 174 | 175 | 176 | if __name__ == "__main__": 177 | ############################################################## 178 | # model param 추가할 내용 179 | ############################################################## 180 | parser = argparse.ArgumentParser(description="인자값을 전달받는 Python 스크립트") 181 | parser.add_argument("--base_model_path", type=str, default="Qwen/Qwen2.5-3B-Instruct") 182 | parser.add_argument("--train_model_path", type=str, default="model/1105_noloss/checkpoint-16600") 183 | parser.add_argument("--data_file", type=str, default="data/1029data/hotpot_dev_supporting.json") 184 | parser.add_argument("--beam_size", type=int, default=1) 185 | parser.add_argument("--max_dec_len", type=int, default=3) 186 | parser.add_argument("--output_dir", type=str, default="result/1105_noloss/test.json") 187 | parser.add_argument("--batch_size", type=int, default=8) 188 | parser.add_argument("--data_sample", type=bool, default=True) 189 | parser.add_argument("--mrc_value", type=str, default=True) 190 | parser.add_argument("--sum_value", type=str, default=False) 191 | args = parser.parse_args() 192 | print(args) 193 | ######################################################### 194 | # 변수들 선언 195 | ######################################################### 196 
| 197 | config = AutoConfig.from_pretrained(args.base_model_path) 198 | config.beam_size = args.beam_size 199 | config.max_dec_len = args.max_dec_len 200 | 201 | tokenizer, model = create_model(args.base_model_path, args.train_model_path, config) 202 | print("batch size : ", args.batch_size) 203 | 204 | with open(args.data_file, "r", encoding="utf-8") as file: 205 | dev_data = json.load(file) 206 | 207 | input_data = create_example(dev_data, tokenizer, args.data_sample, args.mrc_value, args.sum_value) 208 | 209 | # Create batches of input items 210 | batches = list(create_batches(input_data, args.batch_size)) 211 | 212 | answer_batches = generate_batch_answer(batches, tokenizer, model) 213 | #### 답변작성 214 | 215 | write_result(args.output_dir, answer_batches, tokenizer) -------------------------------------------------------------------------------- /source/inference_pn_att_1106.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig 2 | 3 | import torch 4 | from tqdm import tqdm 5 | import json 6 | from peft import PeftModel, PeftConfig 7 | from datasets import Dataset 8 | 9 | from modeling_qwen2_pn_att_1106 import Qwen2ForCausalLM_pn, BeamSearchAttentionDecoder 10 | import argparse 11 | 12 | 13 | def create_model(base_model_path, lora_path, config): 14 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 15 | trained_model = Qwen2ForCausalLM_pn.from_pretrained(lora_path, config=config, device_map="auto") 16 | gru = BeamSearchAttentionDecoder(hidden_size=config.hidden_size, num_sent=config.max_dec_len, topk=config.beam_size) 17 | trained_model.set_gru(gru) 18 | trained_model.config.use_cache = False 19 | tokenizer.padding_side = "left" 20 | print("LORA WEIGHT LOADING") 21 | trained_model.load_pn_model(lora_path) 22 | return tokenizer, trained_model 23 | 24 | 25 | class InferenceInput: 26 | def __init__(self, _id, input_text, answer, attention_mask, sent_masks, gold_sp): 27 | self._id = _id 28 | self.input_text = input_text 29 | self.answer = answer 30 | self.attention_mask = attention_mask 31 | self.sent_masks = sent_masks 32 | self.gold_sp = gold_sp 33 | 34 | 35 | def create_example(all_example, tokenizer, data_sample, mrc_value, sum_value): 36 | all_result = [] 37 | 38 | task_instruction = "Only fill in the **Answer to the **Question based on the **Document if <|MRC|> is True. Do not fill in the **Answer if the Question is not provided or if <|MRC|> is False. Only fill in the **Summary with a summary of the **Document if <|SUM|> is True. Do not fill in the **Summary if <|SUM|> is False." 
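    # For dev sets with supporting-fact labels this variant also tracks evidence selection:
    # gold_sp is read from example["supporting_num"] below, and generate_batch_answer later
    # stores model.sentence_number[i] as pred_sp. Shapes here are only an assumption for
    # illustration: gold_sp might look like [3, 7], while pred_sp appears to hold up to
    # max_dec_len (default 3) predicted sentence indices, e.g. [3, 7, 7].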
39 | for example in tqdm(all_example): 40 | example["document"] = example["document"].strip() 41 | # token 된 doc 42 | token_doc = {"input_ids": [], "attention_mask": []} 43 | # document 문장 index 44 | sentence_number = 0 45 | sentence_position = [] 46 | for i, sent in enumerate(example["sent"]): 47 | # 0번 문장은 instruction으로 지정할 계획 48 | sent = sent.strip() 49 | token_sent = tokenizer(sent + " ", add_special_tokens=False) 50 | sentence_number += 1 # 1부터 시작 51 | sentence_position.extend([sentence_number] * len(token_sent["input_ids"])) 52 | token_doc["input_ids"] += token_sent["input_ids"] 53 | token_doc["attention_mask"] += token_sent["attention_mask"] 54 | token_end = tokenizer("<|im_end|>\n", add_special_tokens=False) 55 | sentence_position.extend([sentence_number] * len(token_end)) 56 | token_doc["input_ids"] += token_end["input_ids"] 57 | token_doc["attention_mask"] += token_end["attention_mask"] 58 | 59 | if example["question"] == "summary": 60 | instruction = tokenizer( 61 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Document:\n", 62 | add_special_tokens=False, 63 | ) 64 | # response = tokenizer( 65 | # f"<|im_start|>assistant\n**Answer:\n**Summary:{example['output'].strip()}\n<|im_end|>\n", 66 | # add_special_tokens=False, 67 | # ) 68 | response = f"<|im_start|>assistant\n**Answer:\n**Summary:{example['output'].strip()}\n<|im_end|>\n" 69 | else: # MRC의 경우 70 | instruction = tokenizer( 71 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Question:{example['question'].strip()}\n**Document:\n", 72 | add_special_tokens=False, 73 | ) 74 | # response = tokenizer( 75 | # f"<|im_start|>assistant\n**Answer:{example['output'].strip()}\n**Summary:\n<|im_end|>\n", 76 | # add_special_tokens=False, 77 | # ) 78 | response = f"<|im_start|>assistant\n**Answer:{example['output'].strip()}\n**Summary:\n<|im_end|>\n" 79 | sentence_position = [0] * len(instruction["input_ids"]) + sentence_position 80 | input = instruction["input_ids"] + token_doc["input_ids"] 81 | attention_mask = instruction["attention_mask"] + token_doc["attention_mask"] 82 | output = response 83 | 84 | if "supporting_num" in example.keys(): 85 | gold_sp = example["supporting_num"] 86 | else: 87 | gold_sp = None 88 | assert len(input) == len(sentence_position) == len(attention_mask) 89 | 90 | all_result.append( 91 | InferenceInput( 92 | _id=example["_id"], 93 | input_text=input, 94 | answer=output, 95 | attention_mask=attention_mask, 96 | sent_masks=sentence_position, 97 | gold_sp=gold_sp, 98 | ) 99 | ) 100 | if data_sample: 101 | if len(all_result) == 30: 102 | break 103 | return all_result 104 | 105 | 106 | def create_batches(input_list, batch_size): 107 | # Split the input list into batches of size 'batch_size' 108 | for i in range(0, len(input_list), batch_size): 109 | yield input_list[i : i + batch_size] 110 | 111 | 112 | def generate_batch_answer(batches, tokenizer, model): 113 | for batch_num, batch in enumerate(tqdm(batches)): 114 | input_ids = [item.input_text for item in batch] 115 | attention_mask = [item.attention_mask for item in batch] 116 | sentence_masks = [item.sent_masks for item in batch] 117 | 118 | model.to("cuda") 119 | input_batch = {} 120 | max_length = max(len(mask) for mask in input_ids) 121 | padded_input_ids = [[tokenizer.pad_token_id] * (max_length - len(mask)) + mask for mask in input_ids] 122 | input_batch["input_ids"] = torch.tensor(padded_input_ids).cuda() 123 | 
padded_attention_mask = [[0] * (max_length - len(mask)) + mask for mask in attention_mask] 124 | input_batch["attention_mask"] = torch.tensor(padded_attention_mask).cuda() 125 | padded_sentence_masks = [[0] * (max_length - len(mask)) + mask for mask in sentence_masks] 126 | input_batch["sent_masks"] = torch.tensor(padded_sentence_masks).cuda() 127 | 128 | with torch.no_grad(): 129 | model.evidence = None 130 | model.sentence_number = None 131 | outputs = model.generate( 132 | input_ids=input_batch["input_ids"], 133 | attention_mask=input_batch["attention_mask"], 134 | sent_masks=input_batch["sent_masks"], 135 | max_new_tokens=200, 136 | ) 137 | 138 | input_text = [tokenizer.decode(input_id, skip_special_tokens=True) for i, input_id in enumerate(input_ids)] 139 | decoded_outputs = [ 140 | tokenizer.decode(output[len(input_text) :], skip_special_tokens=True) for i, output in enumerate(outputs) 141 | ] 142 | decoded_outputs_ = [tokenizer.decode(output, skip_special_tokens=True) for i, output in enumerate(outputs)] 143 | 144 | # Store the generated text back in the input objects 145 | for i, item in enumerate(batch): 146 | item.input_text = input_text 147 | item.generated_text = decoded_outputs[i] 148 | item.generated_all_answer = decoded_outputs_[i] 149 | if model.sentence_number != None: 150 | item.pred_sp = model.sentence_number[i] 151 | return batches 152 | 153 | 154 | def write_result(output_path, answer_batches, tokenizer): 155 | all_result = [] 156 | for batch_num, batch in enumerate(answer_batches): 157 | for item in batch: 158 | result = {} 159 | result["_id"] = item._id 160 | if "assistant\n" in item.generated_text: 161 | result["generated_text"] = item.generated_text.split("assistant\n")[1] 162 | elif "assistant" in item.generated_text: 163 | result["generated_text"] = item.generated_text.split("assistant")[1] 164 | else: 165 | result["generated_text"] = item.generated_text 166 | result["answer"] = item.answer 167 | result["generated_all_answer"] = item.generated_all_answer 168 | if item.gold_sp != None: 169 | result["gold_sp"] = item.gold_sp 170 | result["pred_sp"] = item.pred_sp.tolist() 171 | all_result.append(result) 172 | 173 | with open(output_path, "w", encoding="utf-8") as f: 174 | json.dump(all_result, f, ensure_ascii=False, indent=4) 175 | 176 | 177 | if __name__ == "__main__": 178 | ############################################################## 179 | # model param 추가할 내용 180 | ############################################################## 181 | parser = argparse.ArgumentParser(description="인자값을 전달받는 Python 스크립트") 182 | parser.add_argument("--base_model_path", type=str, default="Qwen/Qwen2.5-3B-Instruct") 183 | parser.add_argument("--train_model_path", type=str, default="model/1107_weighted_context/checkpoint-2200") 184 | parser.add_argument("--data_file", type=str, default="data/1029data/hotpot_dev_supporting.json") 185 | parser.add_argument("--beam_size", type=int, default=1) 186 | parser.add_argument("--max_dec_len", type=int, default=3) 187 | parser.add_argument("--output_dir", type=str, default="result/1107_weighted_context/hotpot_tt_2200.json") 188 | parser.add_argument("--batch_size", type=int, default=8) 189 | parser.add_argument("--data_sample", type=bool, default=True) 190 | parser.add_argument("--mrc_value", type=str, default=True) 191 | parser.add_argument("--sum_value", type=str, default=False) 192 | args = parser.parse_args() 193 | print(args) 194 | ######################################################### 195 | # 변수들 선언 196 | 
######################################################### 197 | 198 | config = AutoConfig.from_pretrained(args.base_model_path) 199 | config.beam_size = args.beam_size 200 | config.max_dec_len = args.max_dec_len 201 | 202 | tokenizer, model = create_model(args.base_model_path, args.train_model_path, config) 203 | print("batch size : ", args.batch_size) 204 | 205 | with open(args.data_file, "r", encoding="utf-8") as file: 206 | dev_data = json.load(file) 207 | 208 | input_data = create_example(dev_data, tokenizer, args.data_sample, args.mrc_value, args.sum_value) 209 | 210 | # Create batches of input items 211 | batches = list(create_batches(input_data, args.batch_size)) 212 | 213 | answer_batches = generate_batch_answer(batches, tokenizer, model) 214 | #### 답변작성 215 | 216 | write_result(args.output_dir, answer_batches, tokenizer) -------------------------------------------------------------------------------- /source/inference_pn_att_1106_sum.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig 2 | 3 | import torch 4 | from tqdm import tqdm 5 | import json 6 | from peft import PeftModel, PeftConfig 7 | from datasets import Dataset 8 | 9 | from modeling_qwen2_pn_att_1106_sum import Qwen2ForCausalLM_pn, BeamSearchAttentionDecoder 10 | import argparse 11 | 12 | 13 | def create_model(base_model_path, lora_path, config): 14 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 15 | trained_model = Qwen2ForCausalLM_pn.from_pretrained(lora_path, config=config, device_map="auto") 16 | gru = BeamSearchAttentionDecoder(hidden_size=config.hidden_size, num_sent=config.max_dec_len, topk=config.beam_size) 17 | trained_model.set_gru(gru) 18 | trained_model.config.use_cache = False 19 | tokenizer.padding_side = "left" 20 | print("LORA WEIGHT LOADING") 21 | trained_model.load_pn_model(lora_path) 22 | return tokenizer, trained_model 23 | 24 | 25 | class InferenceInput: 26 | def __init__(self, _id, input_text, answer, attention_mask, sent_masks, gold_sp): 27 | self._id = _id 28 | self.input_text = input_text 29 | self.answer = answer 30 | self.attention_mask = attention_mask 31 | self.sent_masks = sent_masks 32 | self.gold_sp = gold_sp 33 | 34 | 35 | def create_example(all_example, tokenizer, data_sample, mrc_value, sum_value): 36 | all_result = [] 37 | 38 | task_instruction = "Only fill in the **Answer to the **Question based on the **Document if <|MRC|> is True. Do not fill in the **Answer if the Question is not provided or if <|MRC|> is False. Only fill in the **Summary with a summary of the **Document if <|SUM|> is True. Do not fill in the **Summary if <|SUM|> is False." 
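# ---------------------------------------------------------------------------
# A minimal sketch (not one of the repository files) of the batching pattern
# used by generate_batch_answer() further down in this file: variable-length
# prompts are left-padded (tokenizer.padding_side = "left" is set in
# create_model above) so every generated continuation starts at the same
# position, and the prompt prefix is stripped per item before decoding.
# The helper names here are illustrative only.
import torch

def left_pad(sequences, pad_value):
    """Left-pad a list of token-id lists to a common length; returns a LongTensor."""
    max_len = max(len(seq) for seq in sequences)
    return torch.tensor([[pad_value] * (max_len - len(seq)) + seq for seq in sequences])

def decode_new_tokens(tokenizer, padded_input_ids, generated_ids):
    """Decode only the tokens produced after each (padded) prompt row."""
    texts = []
    for prompt_row, output_row in zip(padded_input_ids, generated_ids):
        texts.append(tokenizer.decode(output_row[len(prompt_row):], skip_special_tokens=True))
    return texts
# ---------------------------------------------------------------------------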
39 | for example in tqdm(all_example): 40 | example["document"] = example["document"].strip() 41 | # token 된 doc 42 | token_doc = {"input_ids": [], "attention_mask": []} 43 | # document 문장 index 44 | sentence_number = 0 45 | sentence_position = [] 46 | for i, sent in enumerate(example["sent"]): 47 | # 0번 문장은 instruction으로 지정할 계획 48 | sent = sent.strip() 49 | token_sent = tokenizer(sent + " ", add_special_tokens=False) 50 | sentence_number += 1 # 1부터 시작 51 | sentence_position.extend([sentence_number] * len(token_sent["input_ids"])) 52 | token_doc["input_ids"] += token_sent["input_ids"] 53 | token_doc["attention_mask"] += token_sent["attention_mask"] 54 | token_end = tokenizer("<|im_end|>\n", add_special_tokens=False) 55 | sentence_position.extend([sentence_number] * len(token_end)) 56 | token_doc["input_ids"] += token_end["input_ids"] 57 | token_doc["attention_mask"] += token_end["attention_mask"] 58 | 59 | if example["question"] == "summary": 60 | instruction = tokenizer( 61 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Document:\n", 62 | add_special_tokens=False, 63 | ) 64 | # response = tokenizer( 65 | # f"<|im_start|>assistant\n**Answer:\n**Summary:{example['output'].strip()}\n<|im_end|>\n", 66 | # add_special_tokens=False, 67 | # ) 68 | response = f"<|im_start|>assistant\n**Answer:\n**Summary:{example['output'].strip()}\n<|im_end|>\n" 69 | else: # MRC의 경우 70 | instruction = tokenizer( 71 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Question:{example['question'].strip()}\n**Document:\n", 72 | add_special_tokens=False, 73 | ) 74 | # response = tokenizer( 75 | # f"<|im_start|>assistant\n**Answer:{example['output'].strip()}\n**Summary:\n<|im_end|>\n", 76 | # add_special_tokens=False, 77 | # ) 78 | response = f"<|im_start|>assistant\n**Answer:{example['output'].strip()}\n**Summary:\n<|im_end|>\n" 79 | sentence_position = [0] * len(instruction["input_ids"]) + sentence_position 80 | input = instruction["input_ids"] + token_doc["input_ids"] 81 | attention_mask = instruction["attention_mask"] + token_doc["attention_mask"] 82 | output = response 83 | 84 | if "supporting_num" in example.keys(): 85 | gold_sp = example["supporting_num"] 86 | else: 87 | gold_sp = None 88 | assert len(input) == len(sentence_position) == len(attention_mask) 89 | 90 | all_result.append( 91 | InferenceInput( 92 | _id=example["_id"], 93 | input_text=input, 94 | answer=output, 95 | attention_mask=attention_mask, 96 | sent_masks=sentence_position, 97 | gold_sp=gold_sp, 98 | ) 99 | ) 100 | if data_sample: 101 | if len(all_result) == 30: 102 | break 103 | return all_result 104 | 105 | 106 | def create_batches(input_list, batch_size): 107 | # Split the input list into batches of size 'batch_size' 108 | for i in range(0, len(input_list), batch_size): 109 | yield input_list[i : i + batch_size] 110 | 111 | 112 | def generate_batch_answer(batches, tokenizer, model): 113 | for batch_num, batch in enumerate(tqdm(batches)): 114 | input_ids = [item.input_text for item in batch] 115 | attention_mask = [item.attention_mask for item in batch] 116 | sentence_masks = [item.sent_masks for item in batch] 117 | 118 | model.to("cuda") 119 | input_batch = {} 120 | max_length = max(len(mask) for mask in input_ids) 121 | padded_input_ids = [[tokenizer.pad_token_id] * (max_length - len(mask)) + mask for mask in input_ids] 122 | input_batch["input_ids"] = torch.tensor(padded_input_ids).cuda() 123 | 
padded_attention_mask = [[0] * (max_length - len(mask)) + mask for mask in attention_mask] 124 | input_batch["attention_mask"] = torch.tensor(padded_attention_mask).cuda() 125 | padded_sentence_masks = [[0] * (max_length - len(mask)) + mask for mask in sentence_masks] 126 | input_batch["sent_masks"] = torch.tensor(padded_sentence_masks).cuda() 127 | 128 | with torch.no_grad(): 129 | model.evidence = None 130 | model.sentence_number = None 131 | outputs = model.generate( 132 | input_ids=input_batch["input_ids"], 133 | attention_mask=input_batch["attention_mask"], 134 | sent_masks=input_batch["sent_masks"], 135 | max_new_tokens=200, 136 | ) 137 | 138 | input_text = [tokenizer.decode(input_id, skip_special_tokens=True) for i, input_id in enumerate(input_ids)] 139 | decoded_outputs = [ 140 | tokenizer.decode(output[len(input_text) :], skip_special_tokens=True) for i, output in enumerate(outputs) 141 | ] 142 | decoded_outputs_ = [tokenizer.decode(output, skip_special_tokens=True) for i, output in enumerate(outputs)] 143 | 144 | # Store the generated text back in the input objects 145 | for i, item in enumerate(batch): 146 | item.input_text = input_text 147 | item.generated_text = decoded_outputs[i] 148 | item.generated_all_answer = decoded_outputs_[i] 149 | if model.sentence_number != None: 150 | item.pred_sp = model.sentence_number[i] 151 | return batches 152 | 153 | 154 | def write_result(output_path, answer_batches, tokenizer): 155 | all_result = [] 156 | for batch_num, batch in enumerate(answer_batches): 157 | for item in batch: 158 | result = {} 159 | result["_id"] = item._id 160 | if "assistant\n" in item.generated_text: 161 | result["generated_text"] = item.generated_text.split("assistant\n")[1] 162 | elif "assistant" in item.generated_text: 163 | result["generated_text"] = item.generated_text.split("assistant")[1] 164 | else: 165 | result["generated_text"] = item.generated_text 166 | result["answer"] = item.answer 167 | result["generated_all_answer"] = item.generated_all_answer 168 | if item.gold_sp != None: 169 | result["gold_sp"] = item.gold_sp 170 | result["pred_sp"] = item.pred_sp.tolist() 171 | all_result.append(result) 172 | 173 | with open(output_path, "w", encoding="utf-8") as f: 174 | json.dump(all_result, f, ensure_ascii=False, indent=4) 175 | 176 | 177 | if __name__ == "__main__": 178 | ############################################################## 179 | # model param 추가할 내용 180 | ############################################################## 181 | parser = argparse.ArgumentParser(description="인자값을 전달받는 Python 스크립트") 182 | parser.add_argument("--base_model_path", type=str, default="Qwen/Qwen2.5-3B-Instruct") 183 | parser.add_argument("--train_model_path", type=str, default="model/1107_weighted_context/checkpoint-2200") 184 | parser.add_argument("--data_file", type=str, default="data/1029data/hotpot_dev_supporting.json") 185 | parser.add_argument("--beam_size", type=int, default=1) 186 | parser.add_argument("--max_dec_len", type=int, default=3) 187 | parser.add_argument("--output_dir", type=str, default="result/1107_weighted_context/hotpot_tt_2200.json") 188 | parser.add_argument("--batch_size", type=int, default=8) 189 | parser.add_argument("--data_sample", type=bool, default=True) 190 | parser.add_argument("--mrc_value", type=str, default=True) 191 | parser.add_argument("--sum_value", type=str, default=False) 192 | args = parser.parse_args() 193 | print(args) 194 | ######################################################### 195 | # 변수들 선언 196 | 
######################################################### 197 | 198 | config = AutoConfig.from_pretrained(args.base_model_path) 199 | config.beam_size = args.beam_size 200 | config.max_dec_len = args.max_dec_len 201 | 202 | tokenizer, model = create_model(args.base_model_path, args.train_model_path, config) 203 | print("batch size : ", args.batch_size) 204 | 205 | with open(args.data_file, "r", encoding="utf-8") as file: 206 | dev_data = json.load(file) 207 | 208 | input_data = create_example(dev_data, tokenizer, args.data_sample, args.mrc_value, args.sum_value) 209 | 210 | # Create batches of input items 211 | batches = list(create_batches(input_data, args.batch_size)) 212 | 213 | answer_batches = generate_batch_answer(batches, tokenizer, model) 214 | #### 답변작성 215 | 216 | write_result(args.output_dir, answer_batches, tokenizer) -------------------------------------------------------------------------------- /source/inference_pn_att_1107.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig 2 | 3 | import torch 4 | from tqdm import tqdm 5 | import json 6 | from peft import PeftModel, PeftConfig 7 | from datasets import Dataset 8 | 9 | from modeling_qwen2_pn_att_1107 import Qwen2ForCausalLM_pn, BeamSearchAttentionDecoder 10 | import argparse 11 | 12 | 13 | def create_model(base_model_path, lora_path, config): 14 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 15 | trained_model = Qwen2ForCausalLM_pn.from_pretrained(lora_path, config=config, device_map="auto") 16 | gru = BeamSearchAttentionDecoder( 17 | hidden_size=config.hidden_size, num_sent=config.max_dec_len, topk=config.beam_size 18 | ) 19 | trained_model.set_gru(gru) 20 | trained_model.config.use_cache = False 21 | tokenizer.padding_side = "left" 22 | print("LORA WEIGHT LOADING") 23 | trained_model.load_pn_model(lora_path) 24 | return tokenizer, trained_model 25 | 26 | 27 | class InferenceInput: 28 | def __init__(self, _id, input_text, answer, attention_mask, sent_masks, gold_sp): 29 | self._id = _id 30 | self.input_text = input_text 31 | self.answer = answer 32 | self.attention_mask = attention_mask 33 | self.sent_masks = sent_masks 34 | self.gold_sp = gold_sp 35 | 36 | 37 | def create_example(all_example, tokenizer, data_sample): 38 | all_result = [] 39 | for example in tqdm(all_example): 40 | example["document"] = example["document"].strip() 41 | # token 된 doc 42 | token_doc = {"input_ids": [], "attention_mask": []} 43 | # document 문장 index 44 | sentence_number = 0 45 | sentence_position = [] 46 | for i, sent in enumerate(example["sent"]): 47 | # 0번 문장은 instruction으로 지정할 계획 48 | sent = sent.strip() 49 | token_sent = tokenizer(sent + " ", add_special_tokens=False) 50 | sentence_number += 1 # 1부터 시작 51 | sentence_position.extend([sentence_number] * len(token_sent["input_ids"])) 52 | token_doc["input_ids"] += token_sent["input_ids"] 53 | token_doc["attention_mask"] += token_sent["attention_mask"] 54 | token_end = tokenizer("<|im_end|>\n", add_special_tokens=False) 55 | sentence_position.extend([sentence_number] * len(token_end)) 56 | token_doc["input_ids"] += token_end["input_ids"] 57 | token_doc["attention_mask"] += token_end["attention_mask"] 58 | 59 | instruction = tokenizer( 60 | f"<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|>\n<|im_start|>user\n**Question:{example['question'].strip()}\n**Document:\n", 61 | add_special_tokens=False, 62 | ) 63 | response = f"<|im_start|>assistant\n**Answer:{example['output'].strip()}\n<|im_end|>\n" 64 | sentence_position = [0] * len(instruction["input_ids"]) + sentence_position 65 | input = instruction["input_ids"] + token_doc["input_ids"] 66 | attention_mask = instruction["attention_mask"] + token_doc["attention_mask"] 67 | output = response 68 | 69 | if "supporting_num" in example.keys(): 70 | gold_sp = example["supporting_num"] 71 | else: 72 | gold_sp = None 73 | assert len(input) == len(sentence_position) == len(attention_mask) 74 | 75 | all_result.append( 76 | InferenceInput( 77 | _id=example["_id"], 78 | input_text=input, 79 | answer=output, 80 | attention_mask=attention_mask, 81 | sent_masks=sentence_position, 82 | gold_sp=gold_sp, 83 | ) 84 | ) 85 | if data_sample: 86 | if len(all_result) == 100: 87 | break 88 | return all_result 89 | 90 | 91 | def create_batches(input_list, batch_size): 92 | # Split the input list into batches of size 'batch_size' 93 | for i in range(0, len(input_list), batch_size): 94 | yield input_list[i : i + batch_size] 95 | 96 | 97 | def generate_batch_answer(batches, tokenizer, model): 98 | for batch_num, batch in enumerate(tqdm(batches)): 99 | input_ids = [item.input_text for item in batch] 100 | attention_mask = [item.attention_mask for item in batch] 101 | sentence_masks = [item.sent_masks for item in batch] 102 | 103 | model.to("cuda") 104 | model.eval() 105 | input_batch = {} 106 | max_length = max(len(mask) for mask in input_ids) 107 | padded_input_ids = [[tokenizer.pad_token_id] * (max_length - len(mask)) + mask for mask in input_ids] 108 | input_batch["input_ids"] = torch.tensor(padded_input_ids).cuda() 109 | padded_attention_mask = [[0] * (max_length - len(mask)) + mask for mask in attention_mask] 110 | input_batch["attention_mask"] = torch.tensor(padded_attention_mask).cuda() 111 | padded_sentence_masks = [[0] * (max_length - len(mask)) + mask for mask in sentence_masks] 112 | input_batch["sent_masks"] = torch.tensor(padded_sentence_masks).cuda() 113 | 114 | with torch.no_grad(): 115 | model.evidence = None 116 | model.sentence_number = None 117 | outputs = model.generate( 118 | input_ids=input_batch["input_ids"], 119 | attention_mask=input_batch["attention_mask"], 120 | sent_masks=input_batch["sent_masks"], 121 | max_new_tokens=50, 122 | # temperature=0.0, 123 | # do_sample=False, 124 | ) 125 | input_text = [] 126 | decoded_outputs = [] 127 | decoded_outputs_ = [] 128 | 129 | for i in range(len(input_ids)): 130 | input_text.append(tokenizer.decode(input_ids[i], skip_special_tokens=True)) 131 | trimmed_output = outputs[i][len(input_batch["input_ids"][i]) :] 132 | decoded_outputs.append(tokenizer.decode(trimmed_output, skip_special_tokens=True)) 133 | decoded_outputs_.append(tokenizer.decode(outputs[i], skip_special_tokens=True)) 134 | 135 | # Store the generated text back in the input objects 136 | for i, item in enumerate(batch): 137 | item.input_text = input_text 138 | item.generated_text = decoded_outputs[i] 139 | item.generated_all_answer = decoded_outputs_[i] 140 | if model.sentence_number != None: 141 | item.pred_sp = model.sentence_number[i] 142 | return batches 143 | 144 | 145 | def write_result(output_path, answer_batches, tokenizer): 146 | all_result = [] 147 | for batch_num, batch in enumerate(answer_batches): 148 | for item in batch: 149 | result = {} 150 | result["_id"] = item._id 151 | 
if "assistant\n" in item.generated_text: 152 | result["generated_text"] = item.generated_text.split("assistant\n")[1] 153 | elif "assistant" in item.generated_text: 154 | result["generated_text"] = item.generated_text.split("assistant")[1] 155 | else: 156 | result["generated_text"] = item.generated_text 157 | result["answer"] = item.answer 158 | result["generated_all_answer"] = item.generated_all_answer 159 | if item.gold_sp != None: 160 | result["gold_sp"] = item.gold_sp 161 | result["pred_sp"] = item.pred_sp.tolist() 162 | all_result.append(result) 163 | 164 | with open(output_path, "w", encoding="utf-8") as f: 165 | json.dump(all_result, f, ensure_ascii=False, indent=4) 166 | 167 | 168 | if __name__ == "__main__": 169 | ############################################################## 170 | # model param 추가할 내용 171 | ############################################################## 172 | parser = argparse.ArgumentParser(description="인자값을 전달받는 Python 스크립트") 173 | parser.add_argument("--base_model_path", type=str, default="Qwen/Qwen2.5-3B-Instruct") 174 | parser.add_argument("--train_model_path", type=str, default="model/1205_yesloss/checkpoint-15000") 175 | parser.add_argument("--data_file", type=str, default="data/1125data/hotpot_dev.json") 176 | parser.add_argument("--beam_size", type=int, default=1) 177 | parser.add_argument("--max_dec_len", type=int, default=3) 178 | parser.add_argument("--output_dir", type=str, default="result/1205_yesloss/15000.json") 179 | parser.add_argument("--batch_size", type=int, default=8) 180 | parser.add_argument("--data_sample", type=bool, default=True) 181 | 182 | args = parser.parse_args() 183 | print(args) 184 | ######################################################### 185 | # 변수들 선언 186 | ######################################################### 187 | 188 | config = AutoConfig.from_pretrained(args.base_model_path) 189 | config.beam_size = args.beam_size 190 | config.max_dec_len = args.max_dec_len 191 | 192 | tokenizer, model = create_model(args.base_model_path, args.train_model_path, config) 193 | print("batch size : ", args.batch_size) 194 | 195 | with open(args.data_file, "r", encoding="utf-8") as file: 196 | dev_data = json.load(file) 197 | 198 | input_data = create_example(dev_data, tokenizer, args.data_sample) 199 | 200 | # Create batches of input items 201 | batches = list(create_batches(input_data, args.batch_size)) 202 | 203 | answer_batches = generate_batch_answer(batches, tokenizer, model) 204 | #### 답변작성 205 | 206 | write_result(args.output_dir, answer_batches, tokenizer) 207 | -------------------------------------------------------------------------------- /source/inference_pn_att_1107_sum.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig 2 | 3 | import torch 4 | from tqdm import tqdm 5 | import json 6 | from peft import PeftModel, PeftConfig 7 | from datasets import Dataset 8 | 9 | from modeling_qwen2_pn_att_1107_sum import Qwen2ForCausalLM_pn, BeamSearchAttentionDecoder 10 | import argparse 11 | 12 | 13 | def create_model(base_model_path, lora_path, config): 14 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 15 | trained_model = Qwen2ForCausalLM_pn.from_pretrained(lora_path, config=config, device_map="auto") 16 | gru = BeamSearchAttentionDecoder(hidden_size=config.hidden_size, num_sent=config.max_dec_len, topk=config.beam_size) 17 | trained_model.set_gru(gru) 18 | trained_model.config.use_cache = False 19 | 
tokenizer.padding_side = "left" 20 | print("LORA WEIGHT LOADING") 21 | trained_model.load_pn_model(lora_path) 22 | return tokenizer, trained_model 23 | 24 | 25 | class InferenceInput: 26 | def __init__(self, _id, input_text, answer, attention_mask, sent_masks, gold_sp): 27 | self._id = _id 28 | self.input_text = input_text 29 | self.answer = answer 30 | self.attention_mask = attention_mask 31 | self.sent_masks = sent_masks 32 | self.gold_sp = gold_sp 33 | 34 | 35 | def create_example(all_example, tokenizer, data_sample, mrc_value, sum_value): 36 | all_result = [] 37 | 38 | task_instruction = "Only fill in the **Answer to the **Question based on the **Document if <|MRC|> is True. Do not fill in the **Answer if the Question is not provided or if <|MRC|> is False. Only fill in the **Summary with a summary of the **Document if <|SUM|> is True. Do not fill in the **Summary if <|SUM|> is False." 39 | for example in tqdm(all_example): 40 | example["document"] = example["document"].strip() 41 | # token 된 doc 42 | token_doc = {"input_ids": [], "attention_mask": []} 43 | # document 문장 index 44 | sentence_number = 0 45 | sentence_position = [] 46 | for i, sent in enumerate(example["sent"]): 47 | # 0번 문장은 instruction으로 지정할 계획 48 | sent = sent.strip() 49 | token_sent = tokenizer(sent + " ", add_special_tokens=False) 50 | sentence_number += 1 # 1부터 시작 51 | sentence_position.extend([sentence_number] * len(token_sent["input_ids"])) 52 | token_doc["input_ids"] += token_sent["input_ids"] 53 | token_doc["attention_mask"] += token_sent["attention_mask"] 54 | token_end = tokenizer("<|im_end|>\n", add_special_tokens=False) 55 | sentence_position.extend([sentence_number] * len(token_end)) 56 | token_doc["input_ids"] += token_end["input_ids"] 57 | token_doc["attention_mask"] += token_end["attention_mask"] 58 | 59 | if example["question"] == "summary": 60 | instruction = tokenizer( 61 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Document:\n", 62 | add_special_tokens=False, 63 | ) 64 | # response = tokenizer( 65 | # f"<|im_start|>assistant\n**Answer:\n**Summary:{example['output'].strip()}\n<|im_end|>\n", 66 | # add_special_tokens=False, 67 | # ) 68 | response = f"<|im_start|>assistant\n**Answer:\n**Summary:{example['output'].strip()}\n<|im_end|>\n" 69 | else: # MRC의 경우 70 | instruction = tokenizer( 71 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Question:{example['question'].strip()}\n**Document:\n", 72 | add_special_tokens=False, 73 | ) 74 | # response = tokenizer( 75 | # f"<|im_start|>assistant\n**Answer:{example['output'].strip()}\n**Summary:\n<|im_end|>\n", 76 | # add_special_tokens=False, 77 | # ) 78 | response = f"<|im_start|>assistant\n**Answer:{example['output'].strip()}\n**Summary:\n<|im_end|>\n" 79 | sentence_position = [0] * len(instruction["input_ids"]) + sentence_position 80 | input = instruction["input_ids"] + token_doc["input_ids"] 81 | attention_mask = instruction["attention_mask"] + token_doc["attention_mask"] 82 | output = response 83 | 84 | if "supporting_num" in example.keys(): 85 | gold_sp = example["supporting_num"] 86 | else: 87 | gold_sp = None 88 | assert len(input) == len(sentence_position) == len(attention_mask) 89 | 90 | all_result.append( 91 | InferenceInput( 92 | _id=example["_id"], 93 | input_text=input, 94 | answer=output, 95 | attention_mask=attention_mask, 96 | sent_masks=sentence_position, 97 | gold_sp=gold_sp, 98 | ) 99 | ) 100 | if data_sample: 
101 | if len(all_result) == 30: 102 | break 103 | return all_result 104 | 105 | 106 | def create_batches(input_list, batch_size): 107 | # Split the input list into batches of size 'batch_size' 108 | for i in range(0, len(input_list), batch_size): 109 | yield input_list[i : i + batch_size] 110 | 111 | 112 | def generate_batch_answer(batches, tokenizer, model): 113 | for batch_num, batch in enumerate(tqdm(batches)): 114 | input_ids = [item.input_text for item in batch] 115 | attention_mask = [item.attention_mask for item in batch] 116 | sentence_masks = [item.sent_masks for item in batch] 117 | 118 | model.to("cuda") 119 | input_batch = {} 120 | max_length = max(len(mask) for mask in input_ids) 121 | padded_input_ids = [[tokenizer.pad_token_id] * (max_length - len(mask)) + mask for mask in input_ids] 122 | input_batch["input_ids"] = torch.tensor(padded_input_ids).cuda() 123 | padded_attention_mask = [[0] * (max_length - len(mask)) + mask for mask in attention_mask] 124 | input_batch["attention_mask"] = torch.tensor(padded_attention_mask).cuda() 125 | padded_sentence_masks = [[0] * (max_length - len(mask)) + mask for mask in sentence_masks] 126 | input_batch["sent_masks"] = torch.tensor(padded_sentence_masks).cuda() 127 | 128 | with torch.no_grad(): 129 | model.evidence = None 130 | model.sentence_number = None 131 | outputs = model.generate( 132 | input_ids=input_batch["input_ids"], 133 | attention_mask=input_batch["attention_mask"], 134 | sent_masks=input_batch["sent_masks"], 135 | max_new_tokens=200, 136 | ) 137 | 138 | input_text = [tokenizer.decode(input_id, skip_special_tokens=True) for i, input_id in enumerate(input_ids)] 139 | decoded_outputs = [ 140 | tokenizer.decode(output[len(input_text) :], skip_special_tokens=True) for i, output in enumerate(outputs) 141 | ] 142 | decoded_outputs_ = [tokenizer.decode(output, skip_special_tokens=True) for i, output in enumerate(outputs)] 143 | 144 | # Store the generated text back in the input objects 145 | for i, item in enumerate(batch): 146 | item.input_text = input_text 147 | item.generated_text = decoded_outputs[i] 148 | item.generated_all_answer = decoded_outputs_[i] 149 | if model.sentence_number != None: 150 | item.pred_sp = model.sentence_number[i] 151 | return batches 152 | 153 | 154 | def write_result(output_path, answer_batches, tokenizer): 155 | all_result = [] 156 | for batch_num, batch in enumerate(answer_batches): 157 | for item in batch: 158 | result = {} 159 | result["_id"] = item._id 160 | if "assistant\n" in item.generated_text: 161 | result["generated_text"] = item.generated_text.split("assistant\n")[1] 162 | elif "assistant" in item.generated_text: 163 | result["generated_text"] = item.generated_text.split("assistant")[1] 164 | else: 165 | result["generated_text"] = item.generated_text 166 | result["answer"] = item.answer 167 | result["generated_all_answer"] = item.generated_all_answer 168 | if item.gold_sp != None: 169 | result["gold_sp"] = item.gold_sp 170 | result["pred_sp"] = item.pred_sp.tolist() 171 | all_result.append(result) 172 | 173 | with open(output_path, "w", encoding="utf-8") as f: 174 | json.dump(all_result, f, ensure_ascii=False, indent=4) 175 | 176 | 177 | if __name__ == "__main__": 178 | ############################################################## 179 | # model param 추가할 내용 180 | ############################################################## 181 | parser = argparse.ArgumentParser(description="인자값을 전달받는 Python 스크립트") 182 | parser.add_argument("--base_model_path", type=str, 
default="Qwen/Qwen2.5-3B-Instruct") 183 | parser.add_argument("--train_model_path", type=str, default="model/1107_weighted_context/checkpoint-2200") 184 | parser.add_argument("--data_file", type=str, default="data/1029data/hotpot_dev_supporting.json") 185 | parser.add_argument("--beam_size", type=int, default=1) 186 | parser.add_argument("--max_dec_len", type=int, default=3) 187 | parser.add_argument("--output_dir", type=str, default="result/1107_weighted_context/hotpot_tt_2200.json") 188 | parser.add_argument("--batch_size", type=int, default=8) 189 | parser.add_argument("--data_sample", type=bool, default=True) 190 | parser.add_argument("--mrc_value", type=str, default=True) 191 | parser.add_argument("--sum_value", type=str, default=False) 192 | args = parser.parse_args() 193 | print(args) 194 | ######################################################### 195 | # 변수들 선언 196 | ######################################################### 197 | 198 | config = AutoConfig.from_pretrained(args.base_model_path) 199 | config.beam_size = args.beam_size 200 | config.max_dec_len = args.max_dec_len 201 | 202 | tokenizer, model = create_model(args.base_model_path, args.train_model_path, config) 203 | print("batch size : ", args.batch_size) 204 | 205 | with open(args.data_file, "r", encoding="utf-8") as file: 206 | dev_data = json.load(file) 207 | 208 | input_data = create_example(dev_data, tokenizer, args.data_sample, args.mrc_value, args.sum_value) 209 | 210 | # Create batches of input items 211 | batches = list(create_batches(input_data, args.batch_size)) 212 | 213 | answer_batches = generate_batch_answer(batches, tokenizer, model) 214 | #### 답변작성 215 | 216 | write_result(args.output_dir, answer_batches, tokenizer) -------------------------------------------------------------------------------- /source/inference_upper.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig 2 | 3 | import torch 4 | from tqdm import tqdm 5 | import json 6 | from peft import PeftModel, PeftConfig 7 | from datasets import Dataset 8 | 9 | from modeling_qwen2_pn_att_1107_upper import Qwen2ForCausalLM_pn, BeamSearchAttentionDecoder 10 | import argparse 11 | 12 | 13 | def create_model(base_model_path, lora_path, config): 14 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 15 | trained_model = Qwen2ForCausalLM_pn.from_pretrained(lora_path, config=config, device_map="auto") 16 | gru = BeamSearchAttentionDecoder( 17 | hidden_size=config.hidden_size, num_sent=config.max_dec_len, topk=config.beam_size 18 | ) 19 | trained_model.set_gru(gru) 20 | trained_model.config.use_cache = False 21 | tokenizer.padding_side = "left" 22 | print("LORA WEIGHT LOADING") 23 | trained_model.load_pn_model(lora_path) 24 | return tokenizer, trained_model 25 | 26 | 27 | class InferenceInput: 28 | def __init__(self, _id, input_text, answer, attention_mask, sent_masks, gold_sp): 29 | self._id = _id 30 | self.input_text = input_text 31 | self.answer = answer 32 | self.attention_mask = attention_mask 33 | self.sent_masks = sent_masks 34 | self.gold_sp = gold_sp 35 | 36 | 37 | def create_example(all_example, tokenizer, data_sample): 38 | all_result = [] 39 | for example in tqdm(all_example): 40 | example["document"] = example["document"].strip() 41 | # token 된 doc 42 | token_doc = {"input_ids": [], "attention_mask": []} 43 | # document 문장 index 44 | sentence_number = 0 45 | sentence_position = [] 46 | for i, sent in enumerate(example["sent"]): 47 
| # 0번 문장은 instruction으로 지정할 계획 48 | sent = sent.strip() 49 | token_sent = tokenizer(sent + " ", add_special_tokens=False) 50 | sentence_number += 1 # 1부터 시작 51 | sentence_position.extend([sentence_number] * len(token_sent["input_ids"])) 52 | token_doc["input_ids"] += token_sent["input_ids"] 53 | token_doc["attention_mask"] += token_sent["attention_mask"] 54 | token_end = tokenizer("<|im_end|>\n", add_special_tokens=False) 55 | sentence_position.extend([sentence_number] * len(token_end)) 56 | token_doc["input_ids"] += token_end["input_ids"] 57 | token_doc["attention_mask"] += token_end["attention_mask"] 58 | 59 | instruction = tokenizer( 60 | f"<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n**Question:{example['question'].strip()}\n**Document:\n", 61 | add_special_tokens=False, 62 | ) 63 | response = f"<|im_start|>assistant\n**Answer:{example['output'].strip()}\n<|im_end|>\n" 64 | sentence_position = [0] * len(instruction["input_ids"]) + sentence_position 65 | input = instruction["input_ids"] + token_doc["input_ids"] 66 | attention_mask = instruction["attention_mask"] + token_doc["attention_mask"] 67 | output = response 68 | 69 | if "supporting_num" in example.keys(): 70 | gold_sp = example["supporting_num"] 71 | else: 72 | gold_sp = None 73 | assert len(input) == len(sentence_position) == len(attention_mask) 74 | 75 | all_result.append( 76 | InferenceInput( 77 | _id=example["_id"], 78 | input_text=input, 79 | answer=output, 80 | attention_mask=attention_mask, 81 | sent_masks=sentence_position, 82 | gold_sp=gold_sp, 83 | ) 84 | ) 85 | if data_sample: 86 | if len(all_result) == 100: 87 | break 88 | return all_result 89 | 90 | 91 | def create_batches(input_list, batch_size): 92 | # Split the input list into batches of size 'batch_size' 93 | for i in range(0, len(input_list), batch_size): 94 | yield input_list[i : i + batch_size] 95 | 96 | 97 | def generate_batch_answer(batches, tokenizer, model): 98 | for batch_num, batch in enumerate(tqdm(batches)): 99 | input_ids = [item.input_text for item in batch] 100 | attention_mask = [item.attention_mask for item in batch] 101 | sentence_masks = [item.sent_masks for item in batch] 102 | 103 | model.to("cuda") 104 | model.eval() 105 | input_batch = {} 106 | max_length = max(len(mask) for mask in input_ids) 107 | padded_input_ids = [[tokenizer.pad_token_id] * (max_length - len(mask)) + mask for mask in input_ids] 108 | input_batch["input_ids"] = torch.tensor(padded_input_ids).cuda() 109 | padded_attention_mask = [[0] * (max_length - len(mask)) + mask for mask in attention_mask] 110 | input_batch["attention_mask"] = torch.tensor(padded_attention_mask).cuda() 111 | padded_sentence_masks = [[0] * (max_length - len(mask)) + mask for mask in sentence_masks] 112 | input_batch["sent_masks"] = torch.tensor(padded_sentence_masks).cuda() 113 | 114 | with torch.no_grad(): 115 | model.evidence = None 116 | model.sentence_number = None 117 | outputs = model.generate( 118 | input_ids=input_batch["input_ids"], 119 | attention_mask=input_batch["attention_mask"], 120 | sent_masks=input_batch["sent_masks"], 121 | max_new_tokens=50, 122 | # temperature=0.0, 123 | # do_sample=False, 124 | ) 125 | input_text = [] 126 | decoded_outputs = [] 127 | decoded_outputs_ = [] 128 | 129 | for i in range(len(input_ids)): 130 | input_text.append(tokenizer.decode(input_ids[i], skip_special_tokens=True)) 131 | trimmed_output = outputs[i][len(input_batch["input_ids"][i]) :] 132 | 
decoded_outputs.append(tokenizer.decode(trimmed_output, skip_special_tokens=True)) 133 | decoded_outputs_.append(tokenizer.decode(outputs[i], skip_special_tokens=True)) 134 | 135 | # Store the generated text back in the input objects 136 | for i, item in enumerate(batch): 137 | item.input_text = input_text 138 | item.generated_text = decoded_outputs[i] 139 | item.generated_all_answer = decoded_outputs_[i] 140 | if model.sentence_number != None: 141 | item.pred_sp = model.sentence_number[i] 142 | return batches 143 | 144 | 145 | def write_result(output_path, answer_batches, tokenizer): 146 | all_result = [] 147 | for batch_num, batch in enumerate(answer_batches): 148 | for item in batch: 149 | result = {} 150 | result["_id"] = item._id 151 | if "assistant\n" in item.generated_text: 152 | result["generated_text"] = item.generated_text.split("assistant\n")[1] 153 | elif "assistant" in item.generated_text: 154 | result["generated_text"] = item.generated_text.split("assistant")[1] 155 | else: 156 | result["generated_text"] = item.generated_text 157 | result["answer"] = item.answer 158 | result["generated_all_answer"] = item.generated_all_answer 159 | if item.gold_sp != None: 160 | result["gold_sp"] = item.gold_sp 161 | result["pred_sp"] = item.pred_sp.tolist() 162 | all_result.append(result) 163 | 164 | with open(output_path, "w", encoding="utf-8") as f: 165 | json.dump(all_result, f, ensure_ascii=False, indent=4) 166 | 167 | 168 | if __name__ == "__main__": 169 | ############################################################## 170 | # model param 추가할 내용 171 | ############################################################## 172 | parser = argparse.ArgumentParser(description="인자값을 전달받는 Python 스크립트") 173 | parser.add_argument("--base_model_path", type=str, default="Qwen/Qwen2.5-3B-Instruct") 174 | parser.add_argument("--train_model_path", type=str, default="model/1126_upper/checkpoint-15000") 175 | parser.add_argument("--data_file", type=str, default="data/1125data/hotpot_dev.json") 176 | parser.add_argument("--beam_size", type=int, default=1) 177 | parser.add_argument("--max_dec_len", type=int, default=3) 178 | parser.add_argument("--output_dir", type=str, default="result/1126_upper/15000_test.json") 179 | parser.add_argument("--batch_size", type=int, default=8) 180 | parser.add_argument("--data_sample", type=bool, default=True) 181 | 182 | args = parser.parse_args() 183 | print(args) 184 | ######################################################### 185 | # 변수들 선언 186 | ######################################################### 187 | 188 | config = AutoConfig.from_pretrained(args.base_model_path) 189 | config.beam_size = args.beam_size 190 | config.max_dec_len = args.max_dec_len 191 | 192 | tokenizer, model = create_model(args.base_model_path, args.train_model_path, config) 193 | print("batch size : ", args.batch_size) 194 | 195 | with open(args.data_file, "r", encoding="utf-8") as file: 196 | dev_data = json.load(file) 197 | 198 | input_data = create_example(dev_data, tokenizer, args.data_sample) 199 | 200 | # Create batches of input items 201 | batches = list(create_batches(input_data, args.batch_size)) 202 | 203 | answer_batches = generate_batch_answer(batches, tokenizer, model) 204 | #### 답변작성 205 | 206 | write_result(args.output_dir, answer_batches, tokenizer) 207 | -------------------------------------------------------------------------------- /source/train_mean.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from datasets 
import Dataset 4 | 5 | from transformers import ( 6 | AutoTokenizer, 7 | AutoModelForCausalLM, 8 | AutoConfig, 9 | DataCollatorForSeq2Seq, 10 | TrainingArguments, 11 | Trainer, 12 | ) 13 | 14 | from peft import LoraConfig, get_peft_model 15 | import wandb 16 | from modeling_qwen2_mean import Qwen2ForCausalLM 17 | from torch.nn import functional as F 18 | import argparse 19 | 20 | class CustomDataCollatorForSeq2Seq(DataCollatorForSeq2Seq): 21 | def __call__(self, features): 22 | # sentence_masks를 제외한 features 리스트 생성 23 | features_without_masks = [{k: v for k, v in f.items() if k != "sent_masks"} for f in features] 24 | 25 | # 부모 클래스에서 features_without_masks 처리 26 | batch = super().__call__(features_without_masks) 27 | 28 | sentence_masks = [f.get("sent_masks", None) for f in features] 29 | # sentence_masks가 None이 아닌 경우 패딩 처리 30 | if sentence_masks[0] is not None: 31 | max_length = max(len(mask) for mask in sentence_masks) 32 | padded_sentence_masks = [[0] * (max_length - len(mask)) + mask for mask in sentence_masks] 33 | batch["sent_masks"] = torch.tensor(padded_sentence_masks) 34 | 35 | return batch 36 | 37 | class CustomTrainer(Trainer): 38 | def compute_loss(self, model, inputs, return_outputs=False): 39 | # input을 원하는 대로 수정 40 | model.model.evidence = None 41 | # 모델에 수정된 inputs 전달 42 | if self.label_smoother is not None and "labels" in inputs: 43 | labels = inputs.pop("labels") 44 | else: 45 | labels = None 46 | outputs = model(**inputs) 47 | # Save past state if it exists 48 | # TODO: this needs to be fixed and made cleaner later. 49 | if self.args.past_index >= 0: 50 | self._past = outputs[self.args.past_index] 51 | 52 | if labels is not None: 53 | unwrapped_model = self.accelerator.unwrap_model(model) 54 | if self._is_peft_model(unwrapped_model): 55 | model_name = unwrapped_model.base_model.model._get_name() 56 | else: 57 | model_name = unwrapped_model._get_name() 58 | # if model_name in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): 59 | loss = self.label_smoother(outputs, labels, shift_labels=True) 60 | # else: 61 | # loss = self.label_smoother(outputs, labels) 62 | else: 63 | if isinstance(outputs, dict) and "loss" not in outputs: 64 | raise ValueError( 65 | "The model did not return a loss from the inputs, only the following keys: " 66 | f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." 67 | ) 68 | # We don't use .loss here since the model may return tuples instead of ModelOutput. 
69 | loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] # path, batch , 1742(max_sent) 70 | r_loss = loss.requires_grad_(True) 71 | # r_loss = loss.clone().detach().requires_grad_(True) 72 | return (r_loss, outputs) if return_outputs else r_loss 73 | 74 | 75 | def create_model(model_path, config): 76 | tokenizer = AutoTokenizer.from_pretrained(model_path) 77 | model = Qwen2ForCausalLM.from_pretrained(model_path, config=config, device_map="cuda") 78 | model.enable_input_require_grads() 79 | model.config.use_cache = False 80 | tokenizer.padding_side = "left" 81 | return tokenizer, model 82 | 83 | 84 | IGNORE_INDEX = -100 85 | 86 | 87 | def process_func(example, tokenizer): 88 | MAX_LENGTH = 2048 89 | input_ids, attention_mask, labels = [], [], [] 90 | mrc_value = -1 91 | sum_value = -1 92 | if example["mrc_type"] == "T": 93 | mrc_value = "True" 94 | else: 95 | mrc_value = "False" 96 | if example["sum_type"] == "T": 97 | sum_value = "True" 98 | else: 99 | sum_value = "False" 100 | 101 | task_instruction = "Only fill in the **Answer to the **Question based on the **Document if <|MRC|> is True. Do not fill in the **Answer if the Question is not provided or if <|MRC|> is False. Only fill in the **Summary with a summary of the **Document if <|SUM|> is True. Do not fill in the **Summary if <|SUM|> is False." 102 | example["document"] = example["document"].strip() 103 | # token 된 doc 104 | token_doc = {"input_ids": [], "attention_mask": []} 105 | # document 문장 index 106 | sentence_number = 0 107 | sentence_position = [] 108 | for i, sent in enumerate(example["sent"]): 109 | # 0번 문장은 instruction으로 지정할 계획 110 | sent = sent.strip() 111 | token_sent = tokenizer(sent + " ", add_special_tokens=False) 112 | sentence_number += 1 # 1부터 시작 113 | sentence_position.extend([sentence_number] * len(token_sent["input_ids"])) 114 | token_doc["input_ids"] += token_sent["input_ids"] 115 | token_doc["attention_mask"] += token_sent["attention_mask"] 116 | token_end = tokenizer("<|im_end|>\n", add_special_tokens=False) 117 | sentence_position.extend([0] * len(token_end)) 118 | token_doc["input_ids"] += token_end["input_ids"] 119 | token_doc["attention_mask"] += token_end["attention_mask"] 120 | 121 | if example["data_type"] == "answer": 122 | if example["answer_type"] == "F": 123 | if example["question"] == "no": # 질문이 없는 경우 124 | instruction = tokenizer( 125 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Document:\n", 126 | add_special_tokens=False, 127 | ) 128 | else: 129 | instruction = tokenizer( 130 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Question:{example['question'].strip()}\n**Document:\n", 131 | add_special_tokens=False, 132 | ) 133 | response = tokenizer( 134 | f"<|im_start|>assistant\n**Answer:\n**Summary:\n<|im_end|>\n", add_special_tokens=False 135 | ) 136 | else: # 답 해야하는 경우 질문은 무조건 있음 137 | instruction = tokenizer( 138 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Question:{example['question'].strip()}\n**Document:\n", 139 | add_special_tokens=False, 140 | ) 141 | response = tokenizer( 142 | f"<|im_start|>assistant\n**Answer:{example['output'].strip()}\n**Summary:\n<|im_end|>\n", 143 | add_special_tokens=False, 144 | ) 145 | elif example["data_type"] == "summary": 146 | if example["answer_type"] == "F": # 무응답의 경우 질문이 무조건 없음 147 | instruction = tokenizer( 148 | 
f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Document:\n", 149 | add_special_tokens=False, 150 | ) 151 | response = tokenizer( 152 | f"<|im_start|>assistant\n**Answer:\n**Summary:\n<|im_end|>\n", add_special_tokens=False 153 | ) 154 | else: # 답 해야하는 경우 질문 유무 155 | if example["question"] == "summary": # 질문이 없는 경우 156 | instruction = tokenizer( 157 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Document:\n", 158 | add_special_tokens=False, 159 | ) 160 | else: 161 | instruction = tokenizer( 162 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Question:{example['question'].strip()}\n**Document:\n", 163 | add_special_tokens=False, 164 | ) 165 | response = tokenizer( 166 | f"<|im_start|>assistant\n**Answer:\n**Summary:{example['output'].strip()}\n<|im_end|>\n", 167 | add_special_tokens=False, 168 | ) 169 | # instruction에 대한 문장 번호 170 | sentence_position = [0] * len(instruction["input_ids"]) + sentence_position 171 | sentence_position.extend([0] * len(response["input_ids"])) 172 | input_ids = instruction["input_ids"] + token_doc["input_ids"] + response["input_ids"] 173 | attention_mask = instruction["attention_mask"] + token_doc["attention_mask"] + response["attention_mask"] 174 | labels = [IGNORE_INDEX] * len(instruction["input_ids"] + token_doc["input_ids"]) + response["input_ids"] 175 | assert len(input_ids) == len(sentence_position) == len(attention_mask) == len(labels) 176 | 177 | if len(input_ids) > MAX_LENGTH: 178 | sentence_position = sentence_position[:MAX_LENGTH] 179 | input_ids = input_ids[:MAX_LENGTH] 180 | attention_mask = attention_mask[:MAX_LENGTH] 181 | labels = labels[:MAX_LENGTH] 182 | return { 183 | "input_ids": input_ids, 184 | "attention_mask": attention_mask, 185 | "labels": labels, 186 | "sent_masks": sentence_position, 187 | } 188 | 189 | 190 | if __name__ == "__main__": 191 | 192 | ############################################################## 193 | # model param 추가할 내용 194 | ############################################################## 195 | parser = argparse.ArgumentParser(description="인자값을 전달받는 Python 스크립트") 196 | parser.add_argument("--model_path", type=str, default="Qwen/Qwen2.5-3B-Instruct") 197 | parser.add_argument("--data_file", type=str, default="data/1022data/hotpot_cnn_6k.json") 198 | parser.add_argument("--beam_size", type=int, default=1) 199 | parser.add_argument("--max_dec_len", type=int, default=1) 200 | parser.add_argument("--new_model", type=str, default="new_model") 201 | parser.add_argument("--wandb_project", type=str, default="llm pointer network") 202 | parser.add_argument("--wandb_run_name", type=str, default="1027") 203 | parser.add_argument("--output_dir", type=str, default="qwen_lora_1026") 204 | parser.add_argument("--num_train_epochs", type=int, default=1) 205 | parser.add_argument("--batch_size", type=int, default=4) 206 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1) 207 | parser.add_argument("--data_sample", type=bool, default=True) 208 | args = parser.parse_args() 209 | print(args) 210 | ######################################################### 211 | # 변수들 선언 212 | ######################################################### 213 | model_path = args.model_path 214 | 215 | config = AutoConfig.from_pretrained(model_path) 216 | config.beam_size = args.beam_size 217 | config.max_dec_len = args.max_dec_len 218 | 219 | tokenizer, 
model = create_model(model_path, config) 220 | data_file = args.data_file 221 | print("학습 데이터 : ", data_file) 222 | dataset = Dataset.from_json(data_file) 223 | if args.data_sample: 224 | dataset = dataset.select(range(100)) 225 | processed_dataset = dataset.map(lambda example: process_func(example, tokenizer)) 226 | 227 | new_model = args.new_model 228 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 229 | peft_config = LoraConfig( 230 | target_modules=["q_proj", "v_proj", "k_proj", "o_proj"], 231 | lora_alpha=16, 232 | lora_dropout=0.1, 233 | r=8, 234 | bias="none", 235 | task_type="CAUSAL_LM", 236 | ) 237 | 238 | model = get_peft_model(model, peft_config) 239 | 240 | model.print_trainable_parameters() 241 | for name, param in model.named_parameters(): 242 | if "gru" in name: 243 | param.requires_grad = True 244 | print(f"Parameter: {name}, requires_grad: {param.requires_grad}") 245 | 246 | ############################################################## 247 | # wanb 248 | ############################################################## 249 | wandb.init(project=args.wandb_project) 250 | wandb.run.name = args.wandb_run_name 251 | 252 | ############################################################## 253 | training_params = TrainingArguments( 254 | output_dir=args.output_dir, 255 | num_train_epochs=args.num_train_epochs, 256 | per_device_train_batch_size=args.batch_size, # 수정했음 257 | gradient_accumulation_steps=args.gradient_accumulation_steps, 258 | warmup_ratio=0.1, 259 | learning_rate=1e-4, 260 | logging_steps=1, 261 | lr_scheduler_type="cosine", 262 | gradient_checkpointing=True, 263 | save_steps=1000, 264 | save_on_each_node=True, 265 | do_train=True, 266 | push_to_hub=False, 267 | report_to="wandb", 268 | ) 269 | trainer = CustomTrainer( 270 | model=model, 271 | args=training_params, 272 | train_dataset=processed_dataset, 273 | data_collator=CustomDataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True), 274 | ) 275 | trainer.train() 276 | trainer.save_model(new_model) -------------------------------------------------------------------------------- /source/train_origin.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from datasets import Dataset 4 | 5 | from transformers import ( 6 | AutoTokenizer, 7 | AutoModelForCausalLM, 8 | AutoConfig, 9 | DataCollatorForSeq2Seq, 10 | TrainingArguments, 11 | Trainer, 12 | Qwen2ForCausalLM, 13 | ) 14 | 15 | from peft import LoraConfig, get_peft_model 16 | import wandb 17 | from torch.nn import functional as F 18 | import argparse 19 | 20 | 21 | class CustomDataCollatorForSeq2Seq(DataCollatorForSeq2Seq): 22 | def __call__(self, features): 23 | # sentence_masks를 제외한 features 리스트 생성 24 | features_without_masks = [{k: v for k, v in f.items() if k != "sent_masks"} for f in features] 25 | 26 | # 부모 클래스에서 features_without_masks 처리 27 | batch = super().__call__(features_without_masks) 28 | 29 | sentence_masks = [f.get("sent_masks", None) for f in features] 30 | # sentence_masks가 None이 아닌 경우 패딩 처리 31 | if sentence_masks[0] is not None: 32 | max_length = max(len(mask) for mask in sentence_masks) 33 | padded_sentence_masks = [[0] * (max_length - len(mask)) + mask for mask in sentence_masks] 34 | batch["sent_masks"] = torch.tensor(padded_sentence_masks) 35 | 36 | return batch 37 | 38 | 39 | class CustomTrainer(Trainer): 40 | def compute_loss(self, model, inputs, return_outputs=False): 41 | # input을 원하는 대로 수정 42 | model.model.evidence = None 43 | 44 | if self.label_smoother is not None and 
"labels" in inputs: 45 | labels = inputs.pop("labels") 46 | else: 47 | labels = None 48 | outputs = model(**inputs) 49 | # Save past state if it exists 50 | # TODO: this needs to be fixed and made cleaner later. 51 | if self.args.past_index >= 0: 52 | self._past = outputs[self.args.past_index] 53 | 54 | if labels is not None: 55 | unwrapped_model = self.accelerator.unwrap_model(model) 56 | if self._is_peft_model(unwrapped_model): 57 | model_name = unwrapped_model.base_model.model._get_name() 58 | else: 59 | model_name = unwrapped_model._get_name() 60 | # if model_name in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): 61 | loss = self.label_smoother(outputs, labels, shift_labels=True) 62 | # else: 63 | # loss = self.label_smoother(outputs, labels) 64 | else: 65 | if isinstance(outputs, dict) and "loss" not in outputs: 66 | raise ValueError( 67 | "The model did not return a loss from the inputs, only the following keys: " 68 | f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." 69 | ) 70 | # We don't use .loss here since the model may return tuples instead of ModelOutput. 71 | loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] # path, batch , 1742(max_sent) 72 | 73 | r_loss = loss 74 | return (r_loss, outputs) if return_outputs else r_loss 75 | 76 | 77 | def create_model(model_path, config): 78 | tokenizer = AutoTokenizer.from_pretrained(model_path) 79 | model = Qwen2ForCausalLM.from_pretrained(model_path, config=config, device_map="cuda") 80 | model.enable_input_require_grads() 81 | model.config.use_cache = False 82 | tokenizer.padding_side = "left" 83 | return tokenizer, model 84 | 85 | 86 | IGNORE_INDEX = -100 87 | 88 | 89 | def process_func(example, tokenizer): 90 | MAX_LENGTH = 2048 91 | input_ids, attention_mask, labels = [], [], [] 92 | mrc_value = -1 93 | sum_value = -1 94 | if example["mrc_type"] == "T": 95 | mrc_value = "True" 96 | else: 97 | mrc_value = "False" 98 | if example["sum_type"] == "T": 99 | sum_value = "True" 100 | else: 101 | sum_value = "False" 102 | 103 | task_instruction = "Only fill in the **Answer to the **Question based on the **Document if <|MRC|> is True. Do not fill in the **Answer if the Question is not provided or if <|MRC|> is False. Only fill in the **Summary with a summary of the **Document if <|SUM|> is True. Do not fill in the **Summary if <|SUM|> is False." 
104 | example["document"] = example["document"].strip() 105 | # token 된 doc 106 | token_doc = {"input_ids": [], "attention_mask": []} 107 | # document 문장 index 108 | sentence_number = 0 109 | sentence_position = [] 110 | for i, sent in enumerate(example["sent"]): 111 | # 0번 문장은 instruction으로 지정할 계획 112 | sent = sent.strip() 113 | token_sent = tokenizer(sent + " ", add_special_tokens=False) 114 | sentence_number += 1 # 1부터 시작 115 | sentence_position.extend([sentence_number] * len(token_sent["input_ids"])) 116 | token_doc["input_ids"] += token_sent["input_ids"] 117 | token_doc["attention_mask"] += token_sent["attention_mask"] 118 | token_end = tokenizer("<|im_end|>\n", add_special_tokens=False) 119 | sentence_position.extend([0] * len(token_end)) 120 | token_doc["input_ids"] += token_end["input_ids"] 121 | token_doc["attention_mask"] += token_end["attention_mask"] 122 | 123 | ######################################################################################################################## 124 | # 전처리 형태 바꾸기 125 | ######################################################################################################################## 126 | if example["data_type"] == "answer": 127 | if example["answer_type"] == "F": 128 | if example["question"] == "no": # 질문이 없는 경우 129 | instruction = tokenizer( 130 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Document:\n", 131 | add_special_tokens=False, 132 | ) 133 | else: 134 | instruction = tokenizer( 135 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Question:{example['question']}\n**Document:\n", 136 | add_special_tokens=False, 137 | ) 138 | response = tokenizer( 139 | f"<|im_start|>assistant\n**Answer:\n**Summary:\n<|im_end|>\n", add_special_tokens=False 140 | ) 141 | else: # 답 해야하는 경우 질문은 무조건 있음 142 | instruction = tokenizer( 143 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Question:{example['question']}\n**Document:\n", 144 | add_special_tokens=False, 145 | ) 146 | response = tokenizer( 147 | f"<|im_start|>assistant\n**Answer:{example['output'].strip()}\n**Summary:\n<|im_end|>\n", 148 | add_special_tokens=False, 149 | ) 150 | elif example["data_type"] == "summary": 151 | if example["answer_type"] == "F": # 무응답의 경우 질문이 무조건 없음 152 | instruction = tokenizer( 153 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Document:\n", 154 | add_special_tokens=False, 155 | ) 156 | response = tokenizer( 157 | f"<|im_start|>assistant\n**Answer:\n**Summary:\n<|im_end|>\n", add_special_tokens=False 158 | ) 159 | else: # 답 해야하는 경우 질문 유무 160 | if example["question"] == "summary": # 질문이 없는 경우 161 | instruction = tokenizer( 162 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Document:\n{example['document']}<|im_end|>\n", 163 | add_special_tokens=False, 164 | ) 165 | else: 166 | instruction = tokenizer( 167 | f"<|im_start|>system\n{task_instruction}\n<|MRC|>{mrc_value}<|SUM|>{sum_value}<|im_end|>\n<|im_start|>user\n**Question:{example['question']}\n**Document:\n{example['document']}<|im_end|>\n", 168 | add_special_tokens=False, 169 | ) 170 | response = tokenizer( 171 | f"<|im_start|>assistant\n**Answer:\n**Summary:{example['output'].strip()}\n<|im_end|>\n", 172 | add_special_tokens=False, 173 | ) 174 | # instruction에 대한 문장 번호 175 | sentence_position 
= [0] * len(instruction["input_ids"]) + sentence_position 176 | sentence_position.extend([0] * len(response["input_ids"])) 177 | input_ids = instruction["input_ids"] + token_doc["input_ids"] + response["input_ids"] 178 | attention_mask = instruction["attention_mask"] + token_doc["attention_mask"] + response["attention_mask"] 179 | labels = [IGNORE_INDEX] * len(instruction["input_ids"] + token_doc["input_ids"]) + response["input_ids"] 180 | assert len(input_ids) == len(sentence_position) == len(attention_mask) == len(labels) 181 | 182 | if len(input_ids) > MAX_LENGTH: 183 | sentence_position = sentence_position[:MAX_LENGTH] 184 | input_ids = input_ids[:MAX_LENGTH] 185 | attention_mask = attention_mask[:MAX_LENGTH] 186 | labels = labels[:MAX_LENGTH] 187 | return { 188 | "input_ids": input_ids, 189 | "attention_mask": attention_mask, 190 | "labels": labels, 191 | "sent_masks": sentence_position, 192 | } 193 | 194 | 195 | if __name__ == "__main__": 196 | 197 | ############################################################## 198 | # model param 추가할 내용 199 | ############################################################## 200 | parser = argparse.ArgumentParser(description="인자값을 전달받는 Python 스크립트") 201 | parser.add_argument("--model_path", type=str, default="Qwen/Qwen2.5-3B-Instruct") 202 | parser.add_argument("--data_file", type=str, default="data/train_hotpot_cnn_1022.json") 203 | parser.add_argument("--beam_size", type=int, default=1) 204 | parser.add_argument("--max_dec_len", type=int, default=3) 205 | parser.add_argument("--new_model", type=str, default="hotpot_cnn_origin") 206 | parser.add_argument("--wandb_project", type=str, default="llm pointer network") 207 | parser.add_argument("--wandb_run_name", type=str, default="hotpot_cnn_origin") 208 | parser.add_argument("--output_dir", type=str, default="/hdd/rbqlsquf/hotpot_cnn_origin") 209 | parser.add_argument("--num_train_epochs", type=int, default=1) 210 | parser.add_argument("--batch_size", type=int, default=4) 211 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1) 212 | parser.add_argument("--data_sample", type=bool, default=False) 213 | args = parser.parse_args() 214 | print(args) 215 | ######################################################### 216 | # 변수들 선언 217 | ######################################################### 218 | model_path = args.model_path 219 | 220 | config = AutoConfig.from_pretrained(model_path) 221 | config.beam_size = args.beam_size 222 | config.max_dec_len = args.max_dec_len 223 | 224 | tokenizer, model = create_model(model_path, config) 225 | data_file = args.data_file 226 | print("학습 데이터 : ", data_file) 227 | dataset = Dataset.from_json(data_file) 228 | if args.data_sample: 229 | dataset = dataset.select(range(12)) 230 | processed_dataset = dataset.map(lambda example: process_func(example, tokenizer)) 231 | 232 | new_model = args.new_model 233 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 234 | peft_config = LoraConfig( 235 | target_modules=["q_proj", "v_proj", "k_proj", "o_proj"], 236 | lora_alpha=16, 237 | lora_dropout=0.1, 238 | r=8, 239 | bias="none", 240 | task_type="CAUSAL_LM", 241 | ) 242 | 243 | model = get_peft_model(model, peft_config) 244 | 245 | model.print_trainable_parameters() 246 | 247 | ############################################################## 248 | # wanb 249 | ############################################################## 250 | wandb.init(project=args.wandb_project) 251 | wandb.run.name = args.wandb_run_name 252 | 253 | 
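    # Hypothetical invocation of this script; the output path below is a
    # placeholder, not a directory shipped with the repository:
    #
    #   python source/train_origin.py \
    #       --model_path Qwen/Qwen2.5-3B-Instruct \
    #       --data_file data/train_hotpot_cnn_1022.json \
    #       --output_dir /path/to/checkpoints \
    #       --num_train_epochs 1 --batch_size 4 --gradient_accumulation_steps 1
    #
    # Caveat: --data_sample is declared with type=bool, so argparse turns any
    # non-empty value (even "False") into True; omit the flag entirely to
    # train on the full dataset.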
############################################################## 254 | training_params = TrainingArguments( 255 | output_dir=args.output_dir, 256 | num_train_epochs=args.num_train_epochs, 257 | per_device_train_batch_size=args.batch_size, # 수정했음 258 | gradient_accumulation_steps=args.gradient_accumulation_steps, 259 | warmup_ratio=0.1, 260 | learning_rate=1e-4, 261 | logging_steps=1, 262 | lr_scheduler_type="cosine", 263 | gradient_checkpointing=True, 264 | save_steps=200, 265 | save_on_each_node=True, 266 | do_train=True, 267 | push_to_hub=False, 268 | report_to="wandb", 269 | ) 270 | trainer = CustomTrainer( 271 | model=model, 272 | args=training_params, 273 | train_dataset=processed_dataset, 274 | data_collator=CustomDataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True), 275 | ) 276 | trainer.train() 277 | trainer.save_model(new_model) -------------------------------------------------------------------------------- /source/train_pn_noloss.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from datasets import Dataset 4 | 5 | from transformers import ( 6 | AutoTokenizer, 7 | AutoModelForCausalLM, 8 | AutoConfig, 9 | DataCollatorForSeq2Seq, 10 | TrainingArguments, 11 | Trainer, 12 | ) 13 | from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union 14 | from peft import LoraConfig, get_peft_model 15 | import wandb 16 | from modeling_qwen2_pn_att_1107_upper import Qwen2ForCausalLM_pn, BeamSearchAttentionDecoder 17 | from nltk.translate.bleu_score import sentence_bleu 18 | from torch.nn import functional as F 19 | import argparse 20 | 21 | 22 | class CustomDataCollatorForSeq2Seq(DataCollatorForSeq2Seq): 23 | def __call__(self, features): 24 | # sentence_masks를 제외한 features 리스트 생성 25 | features_without_masks = [ 26 | {k: v for k, v in f.items() if k != "sent_masks" and k != "gold_sp"} for f in features 27 | ] 28 | # 부모 클래스에서 features_without_masks 처리 29 | batch = super().__call__(features_without_masks) 30 | 31 | sentence_masks = [f.get("sent_masks", None) for f in features] 32 | gold_sp = [f.get("gold_sp", None) for f in features] 33 | # sentence_masks가 None이 아닌 경우 패딩 처리 34 | if sentence_masks[0] is not None: 35 | max_length = max(len(mask) for mask in sentence_masks) 36 | padded_sentence_masks = [[0] * (max_length - len(mask)) + mask for mask in sentence_masks] 37 | batch["sent_masks"] = torch.tensor(padded_sentence_masks) 38 | if gold_sp[0] is not None: 39 | max_length = 3 40 | padded_sentence_masks = [] 41 | for sp in gold_sp: 42 | if len(sp) > max_length: 43 | sp = sp[:max_length] 44 | # Pad if shorter than max_length 45 | padded_sp = sp + [0] * (max_length - len(sp)) 46 | padded_sentence_masks.append(padded_sp) 47 | batch["gold_sp"] = torch.tensor(padded_sentence_masks) 48 | return batch 49 | 50 | 51 | class CustomTrainer(Trainer): 52 | 53 | def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = False): 54 | super().save_model(output_dir, _internal_call) 55 | self.model.model.save_pn_model(output_dir) 56 | 57 | def compute_loss(self, model, inputs, return_outputs=False): 58 | # input을 원하는 대로 수정 59 | model.model.evidence = None 60 | 61 | if self.label_smoother is not None and "labels" in inputs: 62 | labels = inputs.pop("labels") 63 | else: 64 | labels = None 65 | outputs = model(**inputs) 66 | # Save past state if it exists 67 | # TODO: this needs to be fixed and made cleaner later. 
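        # In this "noloss" variant the pointer-network outputs (attention
        # scores, evidence sentences) are fetched below only for logging; no
        # auxiliary loss is applied to them, and the value returned is simply
        # the mean LM loss of the first decoded path, loss[0, :].mean().
        # Compare train_upper.py, which additionally trains the attention
        # scores against the gold supporting sentences (gold_sp).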
68 | if self.args.past_index >= 0: 69 | self._past = outputs[self.args.past_index] 70 | 71 | if labels is not None: 72 | unwrapped_model = self.accelerator.unwrap_model(model) 73 | if self._is_peft_model(unwrapped_model): 74 | model_name = unwrapped_model.base_model.model._get_name() 75 | else: 76 | model_name = unwrapped_model._get_name() 77 | # if model_name in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): 78 | loss = self.label_smoother(outputs, labels, shift_labels=True) 79 | # else: 80 | # loss = self.label_smoother(outputs, labels) 81 | else: 82 | if isinstance(outputs, dict) and "loss" not in outputs: 83 | raise ValueError( 84 | "The model did not return a loss from the inputs, only the following keys: " 85 | f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." 86 | ) 87 | # We don't use .loss here since the model may return tuples instead of ModelOutput. 88 | loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] # path, batch , 1742(max_sent) 89 | 90 | sampled_evidence_scores = outputs.get("attention_scores") # batch*path, 2, max_sent?? 91 | mask = outputs.get("mask") # batch, dec_len, max_sent 92 | path_logits = outputs.get("path_logits") # path, batch, max_len, 151667 93 | sampled_evidence_sentence = outputs.get("evidence_sentences") 94 | logit = torch.argmax(path_logits[0], dim=-1) 95 | 96 | decoded_outputs = [ 97 | tokenizer.decode(output[inputs["labels"][i] != -100], skip_special_tokens=True) 98 | for i, output in enumerate(logit) 99 | ] 100 | ############### 101 | print(decoded_outputs) 102 | 103 | r_loss = loss[0, :].mean() 104 | print("========================================") 105 | print(self.state.global_step) 106 | print("loss:{}".format(loss)) 107 | 108 | return (r_loss, outputs) if return_outputs else r_loss 109 | 110 | 111 | def create_model(model_path, config): 112 | tokenizer = AutoTokenizer.from_pretrained(model_path) 113 | model = Qwen2ForCausalLM_pn.from_pretrained(model_path, config=config, device_map="cuda") 114 | model.enable_input_require_grads() 115 | gru = BeamSearchAttentionDecoder( 116 | hidden_size=config.hidden_size, num_sent=config.max_dec_len, topk=config.beam_size 117 | ) 118 | model.set_gru(gru) 119 | model.config.use_cache = False 120 | tokenizer.padding_side = "left" 121 | return tokenizer, model 122 | 123 | 124 | def create_model_for_debug(base_model_path, lora_path, config): 125 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 126 | trained_model = Qwen2ForCausalLM_pn.from_pretrained(lora_path, config=config, device_map="auto") 127 | gru = BeamSearchAttentionDecoder( 128 | hidden_size=config.hidden_size, num_sent=config.max_dec_len, topk=config.beam_size 129 | ) 130 | trained_model.set_gru(gru) 131 | trained_model.config.use_cache = False 132 | tokenizer.padding_side = "left" 133 | print("LORA WEIGHT LOADING") 134 | trained_model.load_pn_model(lora_path) 135 | return tokenizer, trained_model 136 | 137 | 138 | IGNORE_INDEX = -100 139 | 140 | 141 | def process_func(example, tokenizer): 142 | MAX_LENGTH = 2048 143 | input_ids, attention_mask, labels = [], [], [] 144 | example["document"] = example["document"].strip() 145 | # token 된 doc 146 | token_doc = {"input_ids": [], "attention_mask": []} 147 | # document 문장 index 148 | sentence_number = 0 149 | sentence_position = [] 150 | for i, sent in enumerate(example["sent"]): 151 | # 0번 문장은 instruction으로 지정할 계획 152 | sent = sent.strip() 153 | token_sent = tokenizer(sent + " ", add_special_tokens=False) 154 | sentence_number += 1 # 
1부터 시작 155 | sentence_position.extend([sentence_number] * len(token_sent["input_ids"])) 156 | token_doc["input_ids"] += token_sent["input_ids"] 157 | token_doc["attention_mask"] += token_sent["attention_mask"] 158 | token_end = tokenizer("<|im_end|>\n", add_special_tokens=False) 159 | sentence_position.extend([sentence_number] * len(token_end)) 160 | token_doc["input_ids"] += token_end["input_ids"] 161 | token_doc["attention_mask"] += token_end["attention_mask"] 162 | 163 | ######################################################################################################################## 164 | # 전처리 형태 바꾸기 165 | ######################################################################################################################## 166 | instruction = tokenizer( 167 | f"<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n**Question:{example['question']}\n**Document:\n", 168 | add_special_tokens=False, 169 | ) 170 | response = tokenizer( 171 | f"<|im_start|>assistant\n**Answer:{example['output'].strip()}<|im_end|>\n", add_special_tokens=False 172 | ) 173 | 174 | # instruction에 대한 문장 번호 175 | sentence_position = [0] * len(instruction["input_ids"]) + sentence_position 176 | sentence_position.extend([0] * len(response["input_ids"])) 177 | input_ids = instruction["input_ids"] + token_doc["input_ids"] + response["input_ids"] 178 | attention_mask = instruction["attention_mask"] + token_doc["attention_mask"] + response["attention_mask"] 179 | labels = [IGNORE_INDEX] * len(instruction["input_ids"] + token_doc["input_ids"]) + response["input_ids"] 180 | assert len(input_ids) == len(sentence_position) == len(attention_mask) == len(labels) 181 | 182 | if len(input_ids) > MAX_LENGTH: 183 | sentence_position = sentence_position[:MAX_LENGTH] 184 | input_ids = input_ids[:MAX_LENGTH] 185 | attention_mask = attention_mask[:MAX_LENGTH] 186 | labels = labels[:MAX_LENGTH] 187 | return { 188 | "input_ids": input_ids, 189 | "attention_mask": attention_mask, 190 | "labels": labels, 191 | "sent_masks": sentence_position, 192 | "gold_sp": example["supporting_num"], 193 | } 194 | 195 | 196 | if __name__ == "__main__": 197 | 198 | ############################################################## 199 | # model param 추가할 내용 200 | ############################################################## 201 | parser = argparse.ArgumentParser(description="인자값을 전달받는 Python 스크립트") 202 | parser.add_argument("--model_path", type=str, default="Qwen/Qwen2.5-3B-Instruct") 203 | parser.add_argument("--data_file", type=str, default="data/1125data/hotpot_train_shuffle_30k.json") 204 | parser.add_argument("--lora_path", type=str, default="model/1124_upper/checkpoint-4400") 205 | parser.add_argument("--beam_size", type=int, default=1) 206 | parser.add_argument("--max_dec_len", type=int, default=3) 207 | parser.add_argument("--new_model", type=str, default="new_mode") 208 | parser.add_argument("--wandb_project", type=str, default="llm pointer network") 209 | parser.add_argument("--wandb_run_name", type=str, default="test") 210 | parser.add_argument("--output_dir", type=str, default="qwen_lora_1026") 211 | parser.add_argument("--num_train_epochs", type=int, default=1) 212 | parser.add_argument("--batch_size", type=int, default=2) 213 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1) 214 | parser.add_argument("--data_sample", type=bool, default=False) 215 | args = parser.parse_args() 216 | print(args) 217 | 
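    # Hypothetical invocation (data, output, and run-name values below are
    # placeholders):
    #
    #   python source/train_pn_noloss.py \
    #       --data_file data/1125data/hotpot_train_shuffle_30k.json \
    #       --output_dir /path/to/checkpoints \
    #       --wandb_run_name pn_noloss_run \
    #       --batch_size 2
    #
    # --lora_path is read only by create_model_for_debug() and is not used on
    # this training path; --data_sample has the same type=bool caveat noted in
    # train_origin.py.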
######################################################### 218 | # 변수들 선언 219 | ######################################################### 220 | model_path = args.model_path 221 | 222 | config = AutoConfig.from_pretrained(model_path) 223 | config.beam_size = args.beam_size 224 | config.max_dec_len = args.max_dec_len 225 | 226 | tokenizer, model = create_model(model_path, config) 227 | data_file = args.data_file 228 | print("학습 데이터 : ", data_file) 229 | dataset = Dataset.from_json(data_file) 230 | if args.data_sample: 231 | dataset = dataset.select(range(100)) 232 | processed_dataset = dataset.map(lambda example: process_func(example, tokenizer)) 233 | 234 | new_model = args.new_model 235 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 236 | peft_config = LoraConfig( 237 | target_modules=["q_proj", "v_proj", "k_proj", "o_proj"], 238 | lora_alpha=16, 239 | lora_dropout=0.1, 240 | r=8, 241 | bias="none", 242 | task_type="CAUSAL_LM", 243 | ) 244 | 245 | model = get_peft_model(model, peft_config) 246 | 247 | model.print_trainable_parameters() 248 | for name, param in model.named_parameters(): 249 | if "gru" in name or "linear_w1" in name: 250 | param.requires_grad = True 251 | print(f"Parameter: {name}, requires_grad: {param.requires_grad}") 252 | 253 | ############################################################## 254 | # wanb 255 | ############################################################## 256 | wandb.init(project=args.wandb_project, save_code=True) 257 | wandb.run.name = args.wandb_run_name 258 | wandb.save("modeling_qwen2_pn_att_1107_upper.py") 259 | wandb.save("modeling_qwen2_.py") 260 | ############################################################## 261 | training_params = TrainingArguments( 262 | output_dir=args.output_dir, 263 | num_train_epochs=args.num_train_epochs, 264 | per_device_train_batch_size=args.batch_size, # 수정했음 265 | gradient_accumulation_steps=args.gradient_accumulation_steps, 266 | warmup_ratio=0.1, 267 | learning_rate=1e-4, 268 | logging_steps=1, 269 | lr_scheduler_type="cosine", 270 | gradient_checkpointing=True, 271 | save_steps=200, 272 | save_on_each_node=True, 273 | do_train=True, 274 | push_to_hub=False, 275 | report_to="wandb", 276 | ) 277 | trainer = CustomTrainer( 278 | model=model, 279 | args=training_params, 280 | train_dataset=processed_dataset, 281 | data_collator=CustomDataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True), 282 | ) 283 | trainer.train() 284 | trainer.save_model(new_model) 285 | -------------------------------------------------------------------------------- /source/train_upper.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from datasets import Dataset 4 | 5 | from transformers import ( 6 | AutoTokenizer, 7 | AutoModelForCausalLM, 8 | AutoConfig, 9 | DataCollatorForSeq2Seq, 10 | TrainingArguments, 11 | Trainer, 12 | ) 13 | from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union 14 | from peft import LoraConfig, get_peft_model 15 | import wandb 16 | from modeling_qwen2_pn_att_1107_upper import Qwen2ForCausalLM_pn, BeamSearchAttentionDecoder 17 | from nltk.translate.bleu_score import sentence_bleu 18 | from torch.nn import functional as F 19 | import argparse 20 | from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss 21 | 22 | 23 | class CustomDataCollatorForSeq2Seq(DataCollatorForSeq2Seq): 24 | def __call__(self, features): 25 | # sentence_masks를 제외한 features 리스트 생성 26 | features_without_masks = [ 27 | {k: v for k, v in 
f.items() if k != "sent_masks" and k != "gold_sp"} for f in features 28 | ] 29 | # 부모 클래스에서 features_without_masks 처리 30 | batch = super().__call__(features_without_masks) 31 | 32 | sentence_masks = [f.get("sent_masks", None) for f in features] 33 | gold_sp = [f.get("gold_sp", None) for f in features] 34 | # sentence_masks가 None이 아닌 경우 패딩 처리 35 | if sentence_masks[0] is not None: 36 | max_length = max(len(mask) for mask in sentence_masks) 37 | padded_sentence_masks = [[0] * (max_length - len(mask)) + mask for mask in sentence_masks] 38 | batch["sent_masks"] = torch.tensor(padded_sentence_masks) 39 | if gold_sp[0] is not None: 40 | max_length = 3 41 | padded_sentence_masks = [] 42 | for sp in gold_sp: 43 | if len(sp) > max_length: 44 | sp = sp[:max_length] 45 | # Pad if shorter than max_length 46 | padded_sp = sp + [0] * (max_length - len(sp)) 47 | padded_sentence_masks.append(padded_sp) 48 | batch["gold_sp"] = torch.tensor(padded_sentence_masks) 49 | return batch 50 | 51 | 52 | class CustomTrainer(Trainer): 53 | 54 | def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = False): 55 | super().save_model(output_dir, _internal_call) 56 | self.model.model.save_pn_model(output_dir) 57 | 58 | def compute_loss(self, model, inputs, return_outputs=False): 59 | # input을 원하는 대로 수정 60 | model.model.evidence = None 61 | 62 | if self.label_smoother is not None and "labels" in inputs: 63 | labels = inputs.pop("labels") 64 | else: 65 | labels = None 66 | outputs = model(**inputs) 67 | # Save past state if it exists 68 | # TODO: this needs to be fixed and made cleaner later. 69 | if self.args.past_index >= 0: 70 | self._past = outputs[self.args.past_index] 71 | 72 | if labels is not None: 73 | unwrapped_model = self.accelerator.unwrap_model(model) 74 | if self._is_peft_model(unwrapped_model): 75 | model_name = unwrapped_model.base_model.model._get_name() 76 | else: 77 | model_name = unwrapped_model._get_name() 78 | # if model_name in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): 79 | loss = self.label_smoother(outputs, labels, shift_labels=True) 80 | # else: 81 | # loss = self.label_smoother(outputs, labels) 82 | else: 83 | if isinstance(outputs, dict) and "loss" not in outputs: 84 | raise ValueError( 85 | "The model did not return a loss from the inputs, only the following keys: " 86 | f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." 87 | ) 88 | # We don't use .loss here since the model may return tuples instead of ModelOutput. 89 | loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] # path, batch , 1742(max_sent) 90 | 91 | sampled_evidence_scores = outputs.get("attention_scores") # batch*path, 2, max_sent?? 
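        # "Upper bound" supervision, computed below: the pointer network's
        # attention scores over document sentences are trained directly
        # against the gold supporting-sentence indices (gold_sp) with a
        # cross-entropy loss, and that term is averaged with the LM loss of
        # the first decoded path. The bare except further down falls back to
        # the LM loss alone whenever the cross-entropy computation fails
        # (e.g., a shape mismatch between the evidence tensors and gold_sp).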
92 | mask = outputs.get("mask") # batch, dec_len, max_sent 93 | path_logits = outputs.get("path_logits") # path, batch, max_len, 151667 94 | sampled_evidence_sentence = outputs.get("evidence_sentences") 95 | logit = torch.argmax(path_logits[0], dim=-1) 96 | 97 | decoded_outputs = [ 98 | tokenizer.decode(output[inputs["labels"][i] != -100], skip_special_tokens=True) 99 | for i, output in enumerate(logit) 100 | ] 101 | ############### 102 | print(decoded_outputs) 103 | loss_fct_2 = CrossEntropyLoss() 104 | try: 105 | loss_2 = loss_fct_2( 106 | sampled_evidence_scores.view(-1, sampled_evidence_scores.size(-1)), inputs["gold_sp"].view(-1) 107 | ) 108 | r_loss = (loss[0, :].mean() + loss_2) / 2 109 | print("========================================") 110 | print(self.state.global_step) 111 | print("loss:{}".format(loss)) 112 | print("loss_mean:{}".format(loss[0, :].mean())) 113 | print("loss_2:{}".format(loss_2)) 114 | print("r_loss : {}".format(r_loss)) 115 | except: 116 | r_loss = loss[0, :].mean() 117 | print("========================================") 118 | print(self.state.global_step) 119 | print("loss:{}".format(loss)) 120 | print("loss_mean:{}".format(loss[0, :].mean())) 121 | print("loss_2:nononono") 122 | print("r_loss : {}".format(r_loss)) 123 | # # Add wandb logging for the evidence losses 124 | # # Detailed wandb logging 125 | 126 | return (r_loss, outputs) if return_outputs else r_loss 127 | 128 | 129 | def create_model(model_path, config): 130 | tokenizer = AutoTokenizer.from_pretrained(model_path) 131 | model = Qwen2ForCausalLM_pn.from_pretrained(model_path, config=config, device_map="cuda") 132 | model.enable_input_require_grads() 133 | gru = BeamSearchAttentionDecoder( 134 | hidden_size=config.hidden_size, num_sent=config.max_dec_len, topk=config.beam_size 135 | ) 136 | model.set_gru(gru) 137 | model.config.use_cache = False 138 | tokenizer.padding_side = "left" 139 | return tokenizer, model 140 | 141 | 142 | def create_model_for_debug(base_model_path, lora_path, config): 143 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 144 | trained_model = Qwen2ForCausalLM_pn.from_pretrained(lora_path, config=config, device_map="auto") 145 | gru = BeamSearchAttentionDecoder( 146 | hidden_size=config.hidden_size, num_sent=config.max_dec_len, topk=config.beam_size 147 | ) 148 | trained_model.set_gru(gru) 149 | trained_model.config.use_cache = False 150 | tokenizer.padding_side = "left" 151 | print("LORA WEIGHT LOADING") 152 | trained_model.load_pn_model(lora_path) 153 | return tokenizer, trained_model 154 | 155 | 156 | IGNORE_INDEX = -100 157 | 158 | 159 | def process_func(example, tokenizer): 160 | MAX_LENGTH = 2048 161 | input_ids, attention_mask, labels = [], [], [] 162 | example["document"] = example["document"].strip() 163 | # token 된 doc 164 | token_doc = {"input_ids": [], "attention_mask": []} 165 | # document 문장 index 166 | sentence_number = 0 167 | sentence_position = [] 168 | for i, sent in enumerate(example["sent"]): 169 | # 0번 문장은 instruction으로 지정할 계획 170 | sent = sent.strip() 171 | token_sent = tokenizer(sent + " ", add_special_tokens=False) 172 | sentence_number += 1 # 1부터 시작 173 | sentence_position.extend([sentence_number] * len(token_sent["input_ids"])) 174 | token_doc["input_ids"] += token_sent["input_ids"] 175 | token_doc["attention_mask"] += token_sent["attention_mask"] 176 | token_end = tokenizer("<|im_end|>\n", add_special_tokens=False) 177 | sentence_position.extend([sentence_number] * len(token_end)) 178 | token_doc["input_ids"] += token_end["input_ids"] 
179 | token_doc["attention_mask"] += token_end["attention_mask"] 180 | 181 | ######################################################################################################################## 182 | # 전처리 형태 바꾸기 183 | ######################################################################################################################## 184 | instruction = tokenizer( 185 | f"<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n**Question:{example['question']}\n**Document:\n", 186 | add_special_tokens=False, 187 | ) 188 | response = tokenizer( 189 | f"<|im_start|>assistant\n**Answer:{example['output'].strip()}<|im_end|>\n", add_special_tokens=False 190 | ) 191 | 192 | # instruction에 대한 문장 번호 193 | sentence_position = [0] * len(instruction["input_ids"]) + sentence_position 194 | sentence_position.extend([0] * len(response["input_ids"])) 195 | input_ids = instruction["input_ids"] + token_doc["input_ids"] + response["input_ids"] 196 | attention_mask = instruction["attention_mask"] + token_doc["attention_mask"] + response["attention_mask"] 197 | labels = [IGNORE_INDEX] * len(instruction["input_ids"] + token_doc["input_ids"]) + response["input_ids"] 198 | assert len(input_ids) == len(sentence_position) == len(attention_mask) == len(labels) 199 | 200 | if len(input_ids) > MAX_LENGTH: 201 | sentence_position = sentence_position[:MAX_LENGTH] 202 | input_ids = input_ids[:MAX_LENGTH] 203 | attention_mask = attention_mask[:MAX_LENGTH] 204 | labels = labels[:MAX_LENGTH] 205 | return { 206 | "input_ids": input_ids, 207 | "attention_mask": attention_mask, 208 | "labels": labels, 209 | "sent_masks": sentence_position, 210 | "gold_sp": example["supporting_num"], 211 | } 212 | 213 | 214 | if __name__ == "__main__": 215 | 216 | ############################################################## 217 | # model param 추가할 내용 218 | ############################################################## 219 | parser = argparse.ArgumentParser(description="인자값을 전달받는 Python 스크립트") 220 | parser.add_argument("--model_path", type=str, default="Qwen/Qwen2.5-3B-Instruct") 221 | parser.add_argument("--data_file", type=str, default="data/1125data/hotpot_train_shuffle_30k.json") 222 | parser.add_argument("--lora_path", type=str, default="model/1124_upper/checkpoint-4400") 223 | parser.add_argument("--beam_size", type=int, default=1) 224 | parser.add_argument("--max_dec_len", type=int, default=3) 225 | parser.add_argument("--new_model", type=str, default="new_mode") 226 | parser.add_argument("--wandb_project", type=str, default="llm pointer network") 227 | parser.add_argument("--wandb_run_name", type=str, default="test") 228 | parser.add_argument("--output_dir", type=str, default="qwen_lora_1026") 229 | parser.add_argument("--num_train_epochs", type=int, default=1) 230 | parser.add_argument("--batch_size", type=int, default=2) 231 | parser.add_argument("--gradient_accumulation_steps", type=int, default=1) 232 | parser.add_argument("--data_sample", type=bool, default=False) 233 | args = parser.parse_args() 234 | print(args) 235 | ######################################################### 236 | # 변수들 선언 237 | ######################################################### 238 | model_path = args.model_path 239 | 240 | config = AutoConfig.from_pretrained(model_path) 241 | config.beam_size = args.beam_size 242 | config.max_dec_len = args.max_dec_len 243 | 244 | tokenizer, model = create_model(model_path, config) 245 | # tokenizer, model = create_model_for_debug(model_path, 
args.lora_path, config) 246 | data_file = args.data_file 247 | print("학습 데이터 : ", data_file) 248 | dataset = Dataset.from_json(data_file) 249 | if args.data_sample: 250 | dataset = dataset.select(range(100)) 251 | processed_dataset = dataset.map(lambda example: process_func(example, tokenizer)) 252 | 253 | new_model = args.new_model 254 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 255 | peft_config = LoraConfig( 256 | target_modules=["q_proj", "v_proj", "k_proj", "o_proj"], 257 | lora_alpha=16, 258 | lora_dropout=0.1, 259 | r=8, 260 | bias="none", 261 | task_type="CAUSAL_LM", 262 | ) 263 | 264 | model = get_peft_model(model, peft_config) 265 | 266 | model.print_trainable_parameters() 267 | for name, param in model.named_parameters(): 268 | if "gru" in name or "linear_w1" in name: 269 | param.requires_grad = True 270 | print(f"Parameter: {name}, requires_grad: {param.requires_grad}") 271 | 272 | ############################################################## 273 | # wanb 274 | ############################################################## 275 | wandb.init(project=args.wandb_project, save_code=True) 276 | wandb.run.name = args.wandb_run_name 277 | wandb.save("modeling_qwen2_pn_att_1107_upper.py") 278 | wandb.save("modeling_qwen2_.py") 279 | ############################################################## 280 | training_params = TrainingArguments( 281 | output_dir=args.output_dir, 282 | num_train_epochs=args.num_train_epochs, 283 | per_device_train_batch_size=args.batch_size, # 수정했음 284 | gradient_accumulation_steps=args.gradient_accumulation_steps, 285 | warmup_ratio=0.1, 286 | learning_rate=1e-4, 287 | logging_steps=1, 288 | lr_scheduler_type="cosine", 289 | gradient_checkpointing=True, 290 | save_steps=200, 291 | save_on_each_node=True, 292 | do_train=True, 293 | push_to_hub=False, 294 | report_to="wandb", 295 | ) 296 | trainer = CustomTrainer( 297 | model=model, 298 | args=training_params, 299 | train_dataset=processed_dataset, 300 | data_collator=CustomDataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True), 301 | ) 302 | trainer.train() 303 | trainer.save_model(new_model) 304 | -------------------------------------------------------------------------------- /tmp.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import json\n", 10 | "file_path = \"data/1020data/hotpot_train.json\"\n", 11 | "with open(file_path, 'r', encoding='utf-8') as f:\n", 12 | " data = json.load(f)" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 6, 18 | "metadata": {}, 19 | "outputs": [ 20 | { 21 | "data": { 22 | "text/plain": [ 23 | "90447" 24 | ] 25 | }, 26 | "execution_count": 6, 27 | "metadata": {}, 28 | "output_type": "execute_result" 29 | } 30 | ], 31 | "source": [ 32 | "len(data)" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 7, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "all_result = []\n", 42 | "for d in data[:30000]:\n", 43 | " d[\"answer_type\"] = \"T\"\n", 44 | " d[\"mrc_type\"] = \"T\"\n", 45 | " d[\"sum_type\"] = \"F\"\n", 46 | " d[\"data_type\"] = \"answer\"\n", 47 | " d[\"_id\"] = str(d[\"_id\"])\n", 48 | " all_result.append(d)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 8, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "file_path = \"data/1020data/hotpot_30k.json\"\n", 58 | "with open(file_path, 'w', 
encoding='utf-8') as f:\n", 59 | " json.dump(all_result, f, ensure_ascii=False, indent=4)" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 10, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "import json\n", 69 | "file_path = \"data/1020data/cnn_train.json\"\n", 70 | "with open(file_path, 'r', encoding='utf-8') as f:\n", 71 | " data = json.load(f)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 11, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "for d in data[:30000]:\n", 81 | " d[\"answer_type\"] = \"T\"\n", 82 | " d[\"mrc_type\"] = \"F\"\n", 83 | " d[\"sum_type\"] = \"T\"\n", 84 | " d[\"data_type\"] = \"answer\"\n", 85 | " d[\"_id\"] = str(d[\"_id\"])\n", 86 | " all_result.append(d)" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 12, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "import random\n", 96 | "random.seed(100)\n", 97 | "\n", 98 | "random.shuffle(all_result)" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 13, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "file_path = \"data/1022data/hotpot_cnn_6k.json\"\n", 108 | "with open(file_path, 'w', encoding='utf-8') as f:\n", 109 | " json.dump(all_result, f, ensure_ascii=False, indent=4)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [] 118 | } 119 | ], 120 | "metadata": { 121 | "kernelspec": { 122 | "display_name": ".venv", 123 | "language": "python", 124 | "name": "python3" 125 | }, 126 | "language_info": { 127 | "codemirror_mode": { 128 | "name": "ipython", 129 | "version": 3 130 | }, 131 | "file_extension": ".py", 132 | "mimetype": "text/x-python", 133 | "name": "python", 134 | "nbconvert_exporter": "python", 135 | "pygments_lexer": "ipython3", 136 | "version": "3.8.10" 137 | } 138 | }, 139 | "nbformat": 4, 140 | "nbformat_minor": 2 141 | } 142 | --------------------------------------------------------------------------------