├── llama-7b-infer.ipynb ├── locally-fine-tuning-llama-2-using-qlora.ipynb ├── single-model-rewardtrainer-lora-llm.ipynb ├── with-openai-gpt-and-langchain.ipynb └── openbook-debertav3-large-baseline-single-model.ipynb /llama-7b-infer.ipynb: -------------------------------------------------------------------------------- 1 | {"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"pygments_lexer":"ipython3","nbconvert_exporter":"python","version":"3.6.4","file_extension":".py","codemirror_mode":{"name":"ipython","version":3},"name":"python","mimetype":"text/x-python"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"import sys\nimport gc\nimport os\nimport numpy as np\nimport pandas as pd\nfrom tqdm import tqdm\nimport torch\nfrom torch import nn\nfrom transformers import LlamaTokenizer, AutoModelForCausalLM","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","execution":{"iopub.status.busy":"2023-07-29T15:58:35.950825Z","iopub.execute_input":"2023-07-29T15:58:35.951095Z","iopub.status.idle":"2023-07-29T15:59:05.620078Z","shell.execute_reply.started":"2023-07-29T15:58:35.951071Z","shell.execute_reply":"2023-07-29T15:59:05.619059Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"df = pd.read_csv(\"/kaggle/input/kaggle-llm-science-exam/test.csv\")\nsub = pd.read_csv(\"/kaggle/input/kaggle-llm-science-exam/sample_submission.csv\")\n\ndf['instruction'] = df['prompt'] + ' A: ' + df['A'] + ' B: ' + df['B'] + ' C: ' + df['C'] + ' D: ' + df['D'] + ' E: ' + df['E']\ndf","metadata":{"execution":{"iopub.status.busy":"2023-07-29T15:59:05.621932Z","iopub.execute_input":"2023-07-29T15:59:05.622527Z","iopub.status.idle":"2023-07-29T15:59:05.699501Z","shell.execute_reply.started":"2023-07-29T15:59:05.622491Z","shell.execute_reply":"2023-07-29T15:59:05.698478Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"model_name = '/kaggle/input/llama2-7b-stem'\n\ntokenizer = LlamaTokenizer.from_pretrained(model_name)\n \nmodel = AutoModelForCausalLM.from_pretrained(\n    model_name,\n    torch_dtype=torch.bfloat16,\n    device_map=\"auto\",\n    trust_remote_code=True,\n    )","metadata":{"execution":{"iopub.status.busy":"2023-07-29T15:59:05.701249Z","iopub.execute_input":"2023-07-29T15:59:05.701630Z","iopub.status.idle":"2023-07-29T16:01:37.131227Z","shell.execute_reply.started":"2023-07-29T15:59:05.701596Z","shell.execute_reply":"2023-07-29T16:01:37.130262Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"prompt = \"<<SYS>>\\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.\\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. 
If you don't know the answer to a question, please don't share false information.\\n<</SYS>>\\n\\n [INST] query [/INST] \"\n\npreds = []\nfor _, row in tqdm(df.iterrows(), total=len(df)):\n    inputs = tokenizer(prompt.replace('query', row['instruction']), return_tensors=\"pt\").to(f\"cuda:{model.device.index}\")\n    with torch.no_grad():\n        output = model.generate(input_ids=inputs[\"input_ids\"], attention_mask=inputs[\"attention_mask\"], max_length=1024, \n                                return_dict_in_generate=True, output_scores=True)\n\n    first_token_probs = output.scores[0][0] # logits over the vocabulary for the first generated token\n    option_scores = first_token_probs[[319, 350, 315, 360, 382]].float().cpu().numpy() # Llama tokenizer ids of the answer tokens A, B, C, D, E\n    pred = np.array([\"A\", \"B\", \"C\", \"D\", \"E\"])[np.argsort(option_scores)[::-1][:3]]\n    pred = ' '.join(pred)\n    preds.append(pred)\n    \nsub['prediction'] = preds","metadata":{"execution":{"iopub.status.busy":"2023-07-29T16:01:37.135721Z","iopub.execute_input":"2023-07-29T16:01:37.136258Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"sub.to_csv('submission.csv', index=False)\nsub","metadata":{"trusted":true},"execution_count":null,"outputs":[]}]} -------------------------------------------------------------------------------- /locally-fine-tuning-llama-2-using-qlora.ipynb: -------------------------------------------------------------------------------- 1 | {"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"pygments_lexer":"ipython3","nbconvert_exporter":"python","version":"3.6.4","file_extension":".py","codemirror_mode":{"name":"ipython","version":3},"name":"python","mimetype":"text/x-python"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"# Install packages","metadata":{}},{"cell_type":"code","source":"!pip install /kaggle/input/nh-llama-2-7b/accelerate-0.21.0-py3-none-any.whl\n!pip install /kaggle/input/nh-llama-2-7b/bitsandbytes-0.41.1-py3-none-any.whl\n!pip install /kaggle/input/nh-llama-2-7b/peft-0.4.0-py3-none-any.whl\n!pip install /kaggle/input/nh-llama-2-7b/trl-0.5.0-py3-none-any.whl\n!pip install /kaggle/input/nh-llama-2-7b/openapi_schema_pydantic-1.2.4-py3-none-any.whl\n!pip install /kaggle/input/nh-llama-2-7b/langsmith-0.0.22-py3-none-any.whl\n!pip install /kaggle/input/nh-llama-2-7b/langchain-0.0.264-py3-none-any.whl","metadata":{"execution":{"iopub.status.busy":"2023-08-14T12:15:48.955997Z","iopub.status.idle":"2023-08-14T12:15:48.956836Z","shell.execute_reply.started":"2023-08-14T12:15:48.956555Z","shell.execute_reply":"2023-08-14T12:15:48.956579Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Imports","metadata":{}},{"cell_type":"code","source":"import torch\nfrom transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig\nimport pandas as pd\nfrom string import Template\nfrom pathlib import Path\n\n\nimport os\n\nimport warnings\nwarnings.simplefilter(\"ignore\")\n\nfrom tqdm.notebook import tqdm\n\n# for training\nfrom peft import LoraConfig, get_peft_model\nfrom transformers import TrainingArguments\nfrom trl import SFTTrainer, DataCollatorForCompletionOnlyLM\n# for training set\nfrom datasets import load_dataset\nfrom langchain.prompts import PromptTemplate\nimport matplotlib.pyplot as plt\nimport bitsandbytes as bnb\nimport numpy as np\n\nfrom IPython.display import Markdown, 
display","metadata":{"execution":{"iopub.status.busy":"2023-08-14T12:15:48.958386Z","iopub.status.idle":"2023-08-14T12:15:48.959301Z","shell.execute_reply.started":"2023-08-14T12:15:48.959034Z","shell.execute_reply":"2023-08-14T12:15:48.95906Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# load model and tokenizer","metadata":{}},{"cell_type":"code","source":"# change model_name to the model of your choice.\n# This can be either name of the model on huggingface (requires internet) or path to the model\nmodel_name = \"/kaggle/input/llama2-7b-hf/Llama2-7b-hf\"\n\nbnb_config = BitsAndBytesConfig(\n load_in_4bit=True,\n bnb_4bit_use_double_quant=True,\n bnb_4bit_quant_type=\"nf4\",\n bnb_4bit_compute_dtyp=torch.bfloat16,\n)\n\nmodel = AutoModelForCausalLM.from_pretrained(\n model_name,\n quantization_config=bnb_config,\n trust_remote_code=True\n)\n# this should be set as False for finetuning\nmodel.config.use_cache = False\n\ntokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\ntokenizer.pad_token = tokenizer.eos_token","metadata":{"execution":{"iopub.status.busy":"2023-08-14T12:15:48.960944Z","iopub.status.idle":"2023-08-14T12:15:48.961913Z","shell.execute_reply.started":"2023-08-14T12:15:48.9616Z","shell.execute_reply":"2023-08-14T12:15:48.961627Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# prepare training data","metadata":{}},{"cell_type":"code","source":"# load training data\ntrain_dataset = load_dataset(\"csv\", data_files=\"/kaggle/input/kaggle-llm-science-exam/train.csv\")","metadata":{"execution":{"iopub.status.busy":"2023-08-14T12:15:48.973161Z","iopub.status.idle":"2023-08-14T12:15:48.974067Z","shell.execute_reply.started":"2023-08-14T12:15:48.973772Z","shell.execute_reply":"2023-08-14T12:15:48.973817Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# prepare template \ntemplate = \"\"\"Answer the following multiple choice question by giving the most appropriate response. 
Answer should be one among [A, B, C, D, E]\n\nQuestion: {prompt}\\n\nA) {a}\\n\nB) {b}\\n\nC) {c}\\n\nD) {d}\\n\nE) {e}\\n\n\n### Answer: {answer}\"\"\"\n\nprompt = PromptTemplate(template=template, input_variables=['prompt', 'a', 'b', 'c', 'd', 'e', 'answer'])","metadata":{"execution":{"iopub.status.busy":"2023-08-14T12:15:48.975665Z","iopub.status.idle":"2023-08-14T12:15:48.97657Z","shell.execute_reply.started":"2023-08-14T12:15:48.976291Z","shell.execute_reply":"2023-08-14T12:15:48.976317Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# display sample to see template\nsample = train_dataset['train'][0]\ndisplay(Markdown(prompt.format(prompt=sample['prompt'], \n              a=sample['A'], \n              b=sample['B'], \n              c=sample['C'], \n              d=sample['D'], \n              e=sample['E'], \n              answer=sample['answer'])))","metadata":{"execution":{"iopub.status.busy":"2023-08-14T12:15:48.978154Z","iopub.status.idle":"2023-08-14T12:15:48.97904Z","shell.execute_reply.started":"2023-08-14T12:15:48.978723Z","shell.execute_reply":"2023-08-14T12:15:48.97879Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"def format_text(example):\n    \"\"\" fill inputs in prompt for a sample \"\"\"\n    text = prompt.format(prompt=example['prompt'], \n              a=example['A'], \n              b=example['B'], \n              c=example['C'], \n              d=example['D'], \n              e=example['E'], \n              answer=example['answer'])\n    return {\"text\": text}","metadata":{"execution":{"iopub.status.busy":"2023-08-14T12:15:48.980458Z","iopub.status.idle":"2023-08-14T12:15:48.981262Z","shell.execute_reply.started":"2023-08-14T12:15:48.981014Z","shell.execute_reply":"2023-08-14T12:15:48.981038Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"train_dataset = train_dataset.map(format_text)","metadata":{"execution":{"iopub.status.busy":"2023-08-14T12:15:48.982665Z","iopub.status.idle":"2023-08-14T12:15:48.983499Z","shell.execute_reply.started":"2023-08-14T12:15:48.983239Z","shell.execute_reply":"2023-08-14T12:15:48.983264Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Set up training arguments","metadata":{}},{"cell_type":"code","source":"# check model structure\nmodel","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"def find_linear_layers(model):\n    \"\"\" find linear layers in given transformer model \"\"\"\n    lora_module_names = set()\n    for name, module in model.named_modules():\n        # 4 bits for qlora\n        if isinstance(module, bnb.nn.Linear4bit): \n            names = name.split('.')\n            lora_module_names.add(names[0] if len(names) == 1 else names[-1])\n\n    if 'lm_head' in lora_module_names:\n        lora_module_names.remove('lm_head')\n    print(f\"LoRA module names: {list(lora_module_names)}\")\n    return list(lora_module_names)\n\n\ntarget_modules = find_linear_layers(model)\n# for llama 2 (it needs different target modules)\nqlora_config = LoraConfig(\n    r=16, # dimension of the updated matrices\n    lora_alpha=64, # parameter for scaling\n    target_modules=target_modules, # this chooses on which layers QLoRA is applied\n    lora_dropout=0.1, # dropout probability for layers\n    bias=\"none\",\n    task_type=\"CAUSAL_LM\",\n)","metadata":{"execution":{"iopub.status.busy":"2023-08-14T12:15:48.985081Z","iopub.status.idle":"2023-08-14T12:15:48.985936Z","shell.execute_reply.started":"2023-08-14T12:15:48.985655Z","shell.execute_reply":"2023-08-14T12:15:48.985679Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# \"max_steps=1\" is just for testing 
execution\ntraining_args = TrainingArguments(\n    output_dir=\"./SFT-llama2-7b\", \n    per_device_train_batch_size=4,\n    per_device_eval_batch_size=8,\n    gradient_accumulation_steps=2,\n    learning_rate=2e-4,\n    logging_steps=20,\n    logging_strategy=\"steps\",\n    warmup_steps=2,\n#     num_train_epochs=1,\n    max_steps=1,\n    optim=\"paged_adamw_8bit\",\n    fp16=True,\n    run_name=\"baseline-llama2-sft\",\n    save_total_limit=1, # can be increased, but beware of kaggle notebook output size limit\n    report_to=\"none\"\n)","metadata":{"execution":{"iopub.status.busy":"2023-08-14T12:15:48.987413Z","iopub.status.idle":"2023-08-14T12:15:48.988246Z","shell.execute_reply.started":"2023-08-14T12:15:48.988001Z","shell.execute_reply":"2023-08-14T12:15:48.988024Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"supervised_finetuning_trainer = SFTTrainer(\n    model,\n    train_dataset=train_dataset['train'],\n    args=training_args,\n    tokenizer=tokenizer,\n    peft_config=qlora_config,\n    dataset_text_field=\"text\",\n    max_seq_length=3000,\n    data_collator=DataCollatorForCompletionOnlyLM(tokenizer=tokenizer, \n                                                  response_template=\"Answer:\")\n)","metadata":{"execution":{"iopub.status.busy":"2023-08-14T12:15:48.989648Z","iopub.status.idle":"2023-08-14T12:15:48.990491Z","shell.execute_reply.started":"2023-08-14T12:15:48.990228Z","shell.execute_reply":"2023-08-14T12:15:48.990253Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"supervised_finetuning_trainer.train()","metadata":{"execution":{"iopub.status.busy":"2023-08-14T12:15:48.992076Z","iopub.status.idle":"2023-08-14T12:15:48.99294Z","shell.execute_reply.started":"2023-08-14T12:15:48.992665Z","shell.execute_reply":"2023-08-14T12:15:48.992691Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Save model","metadata":{}},{"cell_type":"code","source":"model_to_save = supervised_finetuning_trainer.model.module if hasattr(supervised_finetuning_trainer.model, 'module') else supervised_finetuning_trainer.model\nmodel_to_save.save_pretrained(\"outputs\")","metadata":{"execution":{"iopub.status.busy":"2023-08-14T12:15:48.994388Z","iopub.status.idle":"2023-08-14T12:15:48.995253Z","shell.execute_reply.started":"2023-08-14T12:15:48.994988Z","shell.execute_reply":"2023-08-14T12:15:48.995012Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Applying lora","metadata":{}},{"cell_type":"code","source":"lora_config = LoraConfig.from_pretrained('outputs')\nmodel = get_peft_model(model, lora_config)","metadata":{"execution":{"iopub.status.busy":"2023-08-14T12:15:48.996724Z","iopub.status.idle":"2023-08-14T12:15:48.997519Z","shell.execute_reply.started":"2023-08-14T12:15:48.997272Z","shell.execute_reply":"2023-08-14T12:15:48.997296Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Create submission","metadata":{}},{"cell_type":"markdown","source":"### Prepare test set","metadata":{}},{"cell_type":"code","source":"# same prompt as before\ntemplate = \"\"\"Answer the following multiple choice question by giving the most appropriate response. 
Answer should be one among [A, B, C, D, E]\n\nQuestion: {prompt}\\n\nA) {a}\\n\nB) {b}\\n\nC) {c}\\n\nD) {d}\\n\nE) {e}\\n\n\n### Answer: {answer}\"\"\"\n\nprompt = PromptTemplate(template=template, input_variables=['prompt', 'a', 'b', 'c', 'd', 'e', 'answer'])","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# We don't have answers for test\ndef format_text_test(example):\n    text = prompt.format(prompt=example['prompt'], \n              a=example['A'], \n              b=example['B'], \n              c=example['C'], \n              d=example['D'], \n              e=example['E'], \n              answer='')\n    return {\"text\": text}\n\n\ntest_dataset = load_dataset(\"csv\", data_files=\"/kaggle/input/kaggle-llm-science-exam/test.csv\")\ntest_dataset = test_dataset.map(format_text_test)","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### Predict with fine-tuned model","metadata":{}},{"cell_type":"code","source":"from torch import nn\nclass Perplexity(nn.Module):\n    def __init__(self, reduce: bool = True):\n        super().__init__()\n        self.loss_fn = nn.CrossEntropyLoss()\n        self.reduce = reduce\n\n    def forward(self, logits, labels):\n        shift_logits = logits[..., :-1, :].contiguous()\n        shift_labels = labels[..., 1:].contiguous()\n\n        perplexity = []\n        for i in range(labels.shape[0]):\n            perplexity.append(self.loss_fn(shift_logits[i], shift_labels[i]))\n        perplexity = torch.stack(perplexity, dim=0)\n        if self.reduce:\n            perplexity = torch.mean(perplexity)\n        return perplexity \n    \nperp = Perplexity()","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"preds = []\nfor idx in tqdm(range(len(test_dataset[\"train\"])), total=len(test_dataset[\"train\"])):\n    \n    with torch.no_grad():\n        cols = [\"A\", \"B\", \"C\", \"D\", \"E\"]\n        perps = []\n        samples = []\n        for col in cols:\n            prompt = test_dataset['train'][idx]['text']\n            samples.append(prompt + col)\n        inputs = tokenizer(samples, return_tensors=\"pt\", add_special_tokens=False, padding=True, truncation=True).to(\"cuda\")\n\n        output = model(input_ids=inputs[\"input_ids\"], attention_mask=inputs[\"attention_mask\"])\n        output = output.logits\n        labels = inputs[\"input_ids\"]\n        labels.masked_fill_(~inputs[\"attention_mask\"].bool(), -100)\n        for j in range(len(cols)):\n            p = perp(output[j].unsqueeze(0), labels[j].unsqueeze(0))\n            perps.append(p.detach().cpu())\n        \n        del inputs\n        del labels\n        del output\n        del p\n\n    perps = np.array(perps)\n    predictions = [np.array(cols)[np.argsort(perps)]]\n    preds.append(predictions)","metadata":{},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"### format predictions to submission format and save","metadata":{"execution":{"iopub.status.busy":"2023-08-19T13:39:01.937215Z","iopub.execute_input":"2023-08-19T13:39:01.937617Z","iopub.status.idle":"2023-08-19T13:39:01.944443Z","shell.execute_reply.started":"2023-08-19T13:39:01.937586Z","shell.execute_reply":"2023-08-19T13:39:01.942939Z"}}},{"cell_type":"code","source":"def format_prediction(row, k=3):\n    best_k_preds = row[0][:k]\n    return ' '.join(best_k_preds)\n\ntest_df = pd.DataFrame(preds)\nformat_prediction(test_df.iloc[0, :])\ntest_df['prediction'] = test_df.apply(lambda x: format_prediction(x), axis=1)\ntest_df['id'] = test_df.index\n\nsubmission = test_df[['id', 'prediction']]\nsubmission.to_csv('submission.csv', index=False)","metadata":{},"execution_count":null,"outputs":[]}]} -------------------------------------------------------------------------------- /single-model-rewardtrainer-lora-llm.ipynb: 
-------------------------------------------------------------------------------- 1 | {"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"pygments_lexer":"ipython3","nbconvert_exporter":"python","version":"3.6.4","file_extension":".py","codemirror_mode":{"name":"ipython","version":3},"name":"python","mimetype":"text/x-python"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"In this notebook, we will be training, validating, and inferring using Deberta Large along with the Low-Rank Adaptation technique.\n\nI am experimenting with a slightly different approach called Reward Modelling, and it has been yielding decent results thus far. Essentially, it involves the following steps:\n- Creating a new dataset format consisting of rejected and chosen examples for each option, essentially utilizing the hh-rlhf format.\n- Feeding these instances into the reward training pipeline.\n- Using a sequence classification head to predict logits for each pair.\n- Sorting each answer by its probability and selecting the top 3.\n\nFor training, I utilized [Radek's awesome dataset](https://www.kaggle.com/datasets/radek1/additional-train-data-for-llm-science-exam) and created an experimental dataset using llama-2-13b for reward training [here](https://www.kaggle.com/datasets/datafan07/rlhf-data-for-llm-science-exam). I believe that with larger datasets the scores could be improved further; lora training is quite fast, so larger-scale training should not pose a significant challenge. With the lora approach, I managed to train over 30k instances in under 5 minutes using a retail graphics card.\n\nSo let's get started...","metadata":{}},{"cell_type":"code","source":"# installing offline dependencies\n\n!pip install --no-index --no-deps /kaggle/input/llm-whls/transformers-4.31.0-py3-none-any.whl\n!pip install --no-index --no-deps /kaggle/input/llm-whls/peft-0.4.0-py3-none-any.whl\n!pip install --no-index --no-deps /kaggle/input/llm-whls/datasets-2.14.3-py3-none-any.whl\n!pip install --no-index --no-deps /kaggle/input/llm-whls/trl-0.5.0-py3-none-any.whl","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# using single gpu for this case\n\nimport os\nos.environ[\"CUDA_DEVICE_ORDER\"]=\"PCI_BUS_ID\"\nos.environ[\"CUDA_VISIBLE_DEVICES\"]=\"0\"","metadata":{"execution":{"iopub.status.busy":"2023-08-10T13:25:11.950119Z","iopub.execute_input":"2023-08-10T13:25:11.951054Z","iopub.status.idle":"2023-08-10T13:25:11.961259Z","shell.execute_reply.started":"2023-08-10T13:25:11.951006Z","shell.execute_reply":"2023-08-10T13:25:11.960244Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"import torch\nimport numpy as np\nfrom dataclasses import dataclass, field\nfrom typing import Optional\n\nfrom datasets import load_dataset, Dataset\nfrom peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, PeftModel, PeftConfig, TaskType, PeftModelForSequenceClassification\nfrom tqdm.auto import tqdm\nfrom transformers import (\n    AutoModelForSequenceClassification, AutoModelForCausalLM, AutoModel,\n    AutoTokenizer,\n    BitsAndBytesConfig,\n    HfArgumentParser,\n    TrainingArguments,\n    DataCollatorWithPadding\n)\n\nimport random\nimport os\nimport gc\n\n\n\nfrom trl import RewardTrainer\n\nimport pandas as pd\npd.set_option('display.max_colwidth', 
None)\ntqdm.pandas()","metadata":{"execution":{"iopub.status.busy":"2023-08-10T13:25:12.129893Z","iopub.execute_input":"2023-08-10T13:25:12.130558Z","iopub.status.idle":"2023-08-10T13:25:18.624492Z","shell.execute_reply.started":"2023-08-10T13:25:12.130525Z","shell.execute_reply":"2023-08-10T13:25:18.623467Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"class CFG:\n    base_model = \"/kaggle/input/huggingfacedebertav3variants/deberta-v3-large\"\n    seed = 42","metadata":{"execution":{"iopub.status.busy":"2023-08-10T13:25:18.626358Z","iopub.execute_input":"2023-08-10T13:25:18.627266Z","iopub.status.idle":"2023-08-10T13:25:18.631756Z","shell.execute_reply.started":"2023-08-10T13:25:18.627229Z","shell.execute_reply":"2023-08-10T13:25:18.630731Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"def set_seed(seed: int):\n    '''Sets the seed of the entire notebook so results are the same every time we run.\n    This is for REPRODUCIBILITY.'''\n    np.random.seed(seed)\n    random_state = np.random.RandomState(seed)\n    random.seed(seed)\n    torch.manual_seed(seed)\n    torch.cuda.manual_seed(seed)\n    torch.backends.cudnn.deterministic = True\n    torch.backends.cudnn.benchmark = False\n    os.environ['PYTHONHASHSEED'] = str(seed)\n    return random_state\n\n\nrandom_state = set_seed(CFG.seed)","metadata":{"execution":{"iopub.status.busy":"2023-08-10T13:25:18.633254Z","iopub.execute_input":"2023-08-10T13:25:18.634160Z","iopub.status.idle":"2023-08-10T13:25:18.647859Z","shell.execute_reply.started":"2023-08-10T13:25:18.634125Z","shell.execute_reply":"2023-08-10T13:25:18.646844Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# loading multi option dataset\n\ntrain_df_0 = pd.read_csv('/kaggle/input/additional-train-data-for-llm-science-exam/6000_train_examples.csv')\ntrain_df_1 = pd.read_csv('/kaggle/input/additional-train-data-for-llm-science-exam/extra_train_set.csv')\ntest_df = pd.read_csv('/kaggle/input/kaggle-llm-science-exam/train.csv')\n\n\n# merge and drop empty lines\n\ntrain_df = pd.concat((train_df_0, train_df_1), axis=0)\ntrain_df.dropna(inplace=True)\ntrain_df.reset_index(drop=True, inplace=True)","metadata":{"execution":{"iopub.status.busy":"2023-08-10T13:25:18.652010Z","iopub.execute_input":"2023-08-10T13:25:18.652292Z","iopub.status.idle":"2023-08-10T13:25:18.722436Z","shell.execute_reply.started":"2023-08-10T13:25:18.652268Z","shell.execute_reply":"2023-08-10T13:25:18.721512Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"To prepare our dataset for the reward training task, we concatenate each question with its corresponding answer option, creating new pairs for each prompt. The chosen column represents the preferred answer, while the rejected column contains the incorrect answers. 
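For example, a question whose correct answer is B yields four (chosen, rejected) training pairs: (prompt + B, prompt + A), (prompt + B, prompt + C), (prompt + B, prompt + D), and (prompt + B, prompt + E). 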
This approach enables our model to compare each pair effectively.","metadata":{}},{"cell_type":"code","source":"def generate_new_dataframe(df):\n new_rows = []\n\n # Iterate through each row in the original DataFrame\n for _, row in df.iterrows():\n prompt = row['prompt']\n chosen_option = row[row['answer']] # Get the text of the chosen option based on the 'answer' column\n\n # Iterate through each option\n for option in ['A', 'B', 'C', 'D', 'E']:\n if option != row['answer']:\n rejected_option = row[option] # Get the text of the rejected option\n new_row = {'chosen': prompt + ' ' + chosen_option, 'rejected': prompt + ' ' + rejected_option}\n new_rows.append(new_row)\n\n # Create a new DataFrame from the new_rows list\n new_df = pd.DataFrame(new_rows)\n return new_df\n\n\ntrain_df = generate_new_dataframe(train_df)\ntest_df = generate_new_dataframe(test_df)","metadata":{"execution":{"iopub.status.busy":"2023-08-10T13:25:18.761696Z","iopub.execute_input":"2023-08-10T13:25:18.762039Z","iopub.status.idle":"2023-08-10T13:25:19.518086Z","shell.execute_reply.started":"2023-08-10T13:25:18.762007Z","shell.execute_reply":"2023-08-10T13:25:19.517089Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# adding extra reward data\n\ntrain_df_2 = pd.read_csv('/kaggle/input/rlhf-data-for-llm-science-exam/llm_rlhf_extra.csv')\ntrain_df = pd.concat((train_df, train_df_2), axis=0)\ntrain_df = train_df.sample(frac=1.0, random_state=CFG.seed)","metadata":{"execution":{"iopub.status.busy":"2023-08-10T13:25:19.519693Z","iopub.execute_input":"2023-08-10T13:25:19.520086Z","iopub.status.idle":"2023-08-10T13:25:19.554387Z","shell.execute_reply.started":"2023-08-10T13:25:19.520051Z","shell.execute_reply":"2023-08-10T13:25:19.553433Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# converting to dataset format\n\ntrain_dataset = Dataset.from_pandas(train_df)\ntest_dataset = Dataset.from_pandas(test_df)","metadata":{"execution":{"iopub.status.busy":"2023-08-10T13:25:19.555934Z","iopub.execute_input":"2023-08-10T13:25:19.556323Z","iopub.status.idle":"2023-08-10T13:25:19.599650Z","shell.execute_reply.started":"2023-08-10T13:25:19.556287Z","shell.execute_reply":"2023-08-10T13:25:19.598621Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# loading base model and tokenizers\n\nmodel = AutoModelForSequenceClassification.from_pretrained(\n CFG.base_model,\n num_labels=1,\n\n)\n\ntokenizer = AutoTokenizer.from_pretrained(CFG.base_model)","metadata":{"execution":{"iopub.status.busy":"2023-08-10T13:25:19.603600Z","iopub.execute_input":"2023-08-10T13:25:19.603885Z","iopub.status.idle":"2023-08-10T13:25:27.445144Z","shell.execute_reply.started":"2023-08-10T13:25:19.603859Z","shell.execute_reply":"2023-08-10T13:25:27.443782Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"Here is our lora config. 
With a relatively small rank, our model should be quite resource efficient during training.","metadata":{}},{"cell_type":"code","source":"peft_config = LoraConfig(\n    r=8, lora_alpha=4, task_type=TaskType.SEQ_CLS, lora_dropout=0.1, \n    bias=\"none\", inference_mode=False, target_modules=[\"query_proj\", \"value_proj\"]\n)","metadata":{"execution":{"iopub.status.busy":"2023-08-10T13:25:27.447063Z","iopub.execute_input":"2023-08-10T13:25:27.447743Z","iopub.status.idle":"2023-08-10T13:25:27.453380Z","shell.execute_reply.started":"2023-08-10T13:25:27.447705Z","shell.execute_reply":"2023-08-10T13:25:27.452237Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"As you can see, we're going to train only around 0.18% of the actual model parameters.","metadata":{}},{"cell_type":"code","source":"model = get_peft_model(model, peft_config)\nmodel.print_trainable_parameters()","metadata":{"execution":{"iopub.status.busy":"2023-08-10T13:25:27.455258Z","iopub.execute_input":"2023-08-10T13:25:27.455984Z","iopub.status.idle":"2023-08-10T13:25:28.167496Z","shell.execute_reply.started":"2023-08-10T13:25:27.455949Z","shell.execute_reply":"2023-08-10T13:25:28.166427Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"Here, we transform our dataset into the expected format for the RewardTrainer. Additionally, we exclude training instances where the sequence length exceeds 256 tokens (evaluation instances are capped at 2048 instead).","metadata":{}},{"cell_type":"code","source":"def preprocess_function(examples):\n    new_examples = {\n        \"input_ids_chosen\": [],\n        \"attention_mask_chosen\": [],\n        \"input_ids_rejected\": [],\n        \"attention_mask_rejected\": [],\n    }\n    for chosen, rejected in zip(examples[\"chosen\"], examples[\"rejected\"]):\n        tokenized_j = tokenizer(chosen, truncation=True)\n        tokenized_k = tokenizer(rejected, truncation=True)\n\n        new_examples[\"input_ids_chosen\"].append(tokenized_j[\"input_ids\"])\n        new_examples[\"attention_mask_chosen\"].append(tokenized_j[\"attention_mask\"])\n        new_examples[\"input_ids_rejected\"].append(tokenized_k[\"input_ids\"])\n        new_examples[\"attention_mask_rejected\"].append(tokenized_k[\"attention_mask\"])\n\n    return new_examples\n\ntrain_dataset = train_dataset.map(\n    preprocess_function,\n    batched=True,\n    num_proc=4,\n)\ntrain_dataset = train_dataset.filter(\n    lambda x: len(x[\"input_ids_chosen\"]) <= 256\n    and len(x[\"input_ids_rejected\"]) <= 256\n)\n\n\ntest_dataset = test_dataset.map(\n    preprocess_function,\n    batched=True,\n    num_proc=4,\n)\ntest_dataset = test_dataset.filter(\n    lambda x: len(x[\"input_ids_chosen\"]) <= 2048\n    and len(x[\"input_ids_rejected\"]) <= 2048\n)","metadata":{"execution":{"iopub.status.busy":"2023-08-10T13:25:28.168857Z","iopub.execute_input":"2023-08-10T13:25:28.169544Z","iopub.status.idle":"2023-08-10T13:25:44.624549Z","shell.execute_reply.started":"2023-08-10T13:25:28.169509Z","shell.execute_reply":"2023-08-10T13:25:44.623355Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"I will select specific settings, making slight alterations to accommodate Kaggle notebook limitations. 
This includes utilizing adafactor optimization and implementing gradient accumulation to conserve some memory.","metadata":{}},{"cell_type":"code","source":"training_args = TrainingArguments(\n    output_dir='op',\n    overwrite_output_dir = True,\n    warmup_ratio=0.1,\n    lr_scheduler_type='cosine',\n    per_device_train_batch_size=8,\n    per_device_eval_batch_size=8,\n    num_train_epochs=1,\n    gradient_accumulation_steps=2,\n    learning_rate=2e-4,\n    remove_unused_columns=False,\n    optim=\"adafactor\",\n    logging_steps=250,\n    eval_steps=250,\n    evaluation_strategy='steps',\n    load_best_model_at_end=True,\n    save_total_limit = 2,\n    fp16=True,\n    bf16=False,\n    weight_decay=0.01,\n    report_to=\"none\",\n)\n","metadata":{"execution":{"iopub.status.busy":"2023-08-10T13:25:44.626493Z","iopub.execute_input":"2023-08-10T13:25:44.627149Z","iopub.status.idle":"2023-08-10T13:25:44.636423Z","shell.execute_reply.started":"2023-08-10T13:25:44.627113Z","shell.execute_reply":"2023-08-10T13:25:44.635303Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"Here we set up our RewardTrainer. I am going to use the actual competition training data for evaluation and the externally generated data for training.","metadata":{}},{"cell_type":"code","source":"trainer = RewardTrainer(\n    model=model,\n    tokenizer=tokenizer,\n    args=training_args,\n    train_dataset=train_dataset,\n    eval_dataset=test_dataset,\n    peft_config=peft_config,\n    max_length=256,\n)","metadata":{"execution":{"iopub.status.busy":"2023-08-10T13:25:44.639280Z","iopub.execute_input":"2023-08-10T13:25:44.639967Z","iopub.status.idle":"2023-08-10T13:25:44.723715Z","shell.execute_reply.started":"2023-08-10T13:25:44.639928Z","shell.execute_reply":"2023-08-10T13:25:44.722663Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"model.config.use_cache = False\ntrainer.train()\ntrainer.save_model('deberta_adapter')\n\ndel model\n\ngc.collect()\ntorch.cuda.empty_cache()","metadata":{"execution":{"iopub.status.busy":"2023-08-10T13:25:44.724957Z","iopub.execute_input":"2023-08-10T13:25:44.725312Z","iopub.status.idle":"2023-08-10T13:54:07.278950Z","shell.execute_reply.started":"2023-08-10T13:25:44.725277Z","shell.execute_reply":"2023-08-10T13:54:07.277800Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# loading original training data for evaluation.\ndf = pd.read_csv('/kaggle/input/kaggle-llm-science-exam/train.csv')","metadata":{"execution":{"iopub.status.busy":"2023-08-10T13:54:07.281162Z","iopub.execute_input":"2023-08-10T13:54:07.282110Z","iopub.status.idle":"2023-08-10T13:54:07.295510Z","shell.execute_reply.started":"2023-08-10T13:54:07.282074Z","shell.execute_reply":"2023-08-10T13:54:07.294525Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"At this stage, we have a sequence classifier for each question-answer pair, which calculates the likelihood of how well each answer option fits with the given question. 
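The classifier emits a single logit per question-answer pair; since we only need a relative ranking of the five options, the raw logit can be used directly (any monotonic transform such as a sigmoid would preserve the ordering). A minimal sketch of the ranking step, with hypothetical logit values just for illustration:\n\n```python\nimport numpy as np\n\n# one hypothetical classifier logit per option A-E\nlogits = np.array([0.3, 1.2, -0.5, 0.9, 0.1])\n# sort descending and keep the three best options\ntop3 = np.array([\"A\", \"B\", \"C\", \"D\", \"E\"])[np.argsort(logits)[::-1][:3]]  # ['B', 'D', 'A']\n```\n\n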
This process is performed for every pair, and we then select the top 3 pairs with the highest likelihood scores.","metadata":{}},{"cell_type":"code","source":"def get_score(model, tokenizer, prompt, response):\n    inputs = tokenizer(prompt + ' ' + response, return_tensors=\"pt\", max_length=2048, padding='longest', truncation=True).to('cuda')\n    model.to('cuda')\n    model.eval()\n    with torch.autocast('cuda', dtype=torch.float16):\n        outputs = model(input_ids = inputs['input_ids'], attention_mask=inputs['attention_mask'])\n    logits = outputs.logits\n\n    return logits.item()\n\ndef get_top_3_winners(model, tokenizer, prompt, response_options):\n    scores = []\n    for index, response in enumerate(response_options):\n        score = get_score(model, tokenizer, prompt, response)\n        scores.append((index, score))\n\n    \n    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)\n    \n    top_3_winners = sorted_scores[:3]\n    top_3_winners = [t[0] for t in top_3_winners]\n\n    int_to_string = {\n        0: 'A',\n        1: 'B',\n        2: 'C',\n        3: 'D',\n        4: 'E'\n    }\n\n    top_3_winners = [int_to_string[val] for val in top_3_winners]\n\n    \n    return top_3_winners","metadata":{"execution":{"iopub.status.busy":"2023-08-10T13:54:07.297262Z","iopub.execute_input":"2023-08-10T13:54:07.297653Z","iopub.status.idle":"2023-08-10T13:54:07.310412Z","shell.execute_reply.started":"2023-08-10T13:54:07.297616Z","shell.execute_reply":"2023-08-10T13:54:07.309226Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"preds = []\nfor _, row in tqdm(df.iterrows()):\n    prompt = row['prompt']\n    response_options = [\n        row['A'],\n        row['B'],\n        row['C'],\n        row['D'],\n        row['E']\n    ]\n    top_3_winners = get_top_3_winners(trainer.model, tokenizer, prompt, response_options)\n    preds.append(top_3_winners)\n    \nfinal_preds = [' '.join(pred) for pred in preds]","metadata":{"execution":{"iopub.status.busy":"2023-08-10T13:54:07.312243Z","iopub.execute_input":"2023-08-10T13:54:07.314006Z","iopub.status.idle":"2023-08-10T13:56:02.391284Z","shell.execute_reply.started":"2023-08-10T13:54:07.313979Z","shell.execute_reply":"2023-08-10T13:56:02.390279Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# source https://www.kaggle.com/code/philippsinger/h2ogpt-perplexity-ranking\n\ndef precision_at_k(r, k):\n    \"\"\"Precision at k\"\"\"\n    assert k <= len(r)\n    assert k != 0\n    return sum(int(x) for x in r[:k]) / k\n\ndef MAP_at_3(predictions, true_items):\n    \"\"\"Score is mean average precision at 3\"\"\"\n    U = len(predictions)\n    map_at_3 = 0.0\n    for u in range(U):\n        user_preds = predictions[u]\n        user_true = true_items[u]\n        user_results = [1 if item == user_true else 0 for item in user_preds]\n        for k in range(min(len(user_preds), 3)):\n            map_at_3 += precision_at_k(user_results, k+1) * user_results[k]\n    return map_at_3 / U\n\n\nMAP_at_3(final_preds, df['answer'])","metadata":{"execution":{"iopub.status.busy":"2023-08-10T13:56:02.405126Z","iopub.execute_input":"2023-08-10T13:56:02.405427Z","iopub.status.idle":"2023-08-10T13:56:02.421862Z","shell.execute_reply.started":"2023-08-10T13:56:02.405397Z","shell.execute_reply":"2023-08-10T13:56:02.420734Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"The score seems promising; let's predict on the competition data and make a submission.","metadata":{}},{"cell_type":"code","source":"df = pd.read_csv('/kaggle/input/kaggle-llm-science-exam/test.csv')\npreds = []\nfor _, row in tqdm(df.iterrows()):\n    prompt = row['prompt']\n    response_options = [\n        row['A'],\n        
row['B'],\n        row['C'],\n        row['D'],\n        row['E']\n    ]\n    top_3_winners = get_top_3_winners(trainer.model, tokenizer, prompt, response_options)\n    preds.append(top_3_winners)\nfinal_preds = [' '.join(pred) for pred in preds]","metadata":{"execution":{"iopub.status.busy":"2023-08-10T13:56:02.423308Z","iopub.execute_input":"2023-08-10T13:56:02.424062Z","iopub.status.idle":"2023-08-10T13:57:56.435370Z","shell.execute_reply.started":"2023-08-10T13:56:02.424027Z","shell.execute_reply":"2023-08-10T13:57:56.434275Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"sub = pd.read_csv('/kaggle/input/kaggle-llm-science-exam/sample_submission.csv')\nsub['prediction'] = final_preds","metadata":{"execution":{"iopub.status.busy":"2023-08-10T13:57:56.669371Z","iopub.execute_input":"2023-08-10T13:57:56.669709Z","iopub.status.idle":"2023-08-10T13:57:56.679903Z","shell.execute_reply.started":"2023-08-10T13:57:56.669681Z","shell.execute_reply":"2023-08-10T13:57:56.678955Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"sub.to_csv('submission.csv', index=False)\npd.read_csv('submission.csv').head()","metadata":{"execution":{"iopub.status.busy":"2023-08-10T13:57:56.681365Z","iopub.execute_input":"2023-08-10T13:57:56.681873Z","iopub.status.idle":"2023-08-10T13:57:56.705884Z","shell.execute_reply.started":"2023-08-10T13:57:56.681838Z","shell.execute_reply":"2023-08-10T13:57:56.704924Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"I tried to come up with an alternate approach, aiming to add variety. I hope you find this baseline solution helpful, and if you do, kindly consider giving it an upvote. Thank you!","metadata":{}}]} -------------------------------------------------------------------------------- /with-openai-gpt-and-langchain.ipynb: -------------------------------------------------------------------------------- 1 | {"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"name":"python","version":"3.10.12","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"code","source":"# This Python 3 environment comes with many helpful analytics libraries installed\n# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n# For example, here's several helpful packages to load\n\nimport numpy as np # linear algebra\nimport pandas as pd # data processing, CSV file I/O (e.g. 
pd.read_csv)\n\n# Input data files are available in the read-only \"../input/\" directory\n# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n\nimport os\nfor dirname, _, filenames in os.walk('/kaggle/input'):\n    for filename in filenames:\n        print(os.path.join(dirname, filename))\n\n\n# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\" \n# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session","metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","execution":{"iopub.status.busy":"2023-09-11T13:54:31.005967Z","iopub.execute_input":"2023-09-11T13:54:31.006331Z","iopub.status.idle":"2023-09-11T13:54:31.014908Z","shell.execute_reply.started":"2023-09-11T13:54:31.006303Z","shell.execute_reply":"2023-09-11T13:54:31.013799Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# This notebook's description \nIn this notebook, OpenAI's gpt3.5-turbo model was utilized to create prompts in a very simple manner (with minimal data manipulation), organize predicted values for each record, and calculate accuracy. For those who have not created prompts before, the section **\"create prompt and request openai gpt3.5 turbo to solve problems\"** provides guidance on prompt creation, which could be helpful. \n\n**There are two points to note: \n1. To execute this notebook in its entirety, you will need to obtain an API key from OpenAI. \n2. This notebook is an attempt to use the results of gpt3.5 turbo as a personal benchmark. Even if you execute all the steps here, please be aware that it cannot be submitted.**\n\nThe gpt model generates responses in various forms, making them challenging to control. Therefore, to stabilize the variation in response generation, it is effective to include phrases in the prompts that instruct the desired response format. Additionally, I believe it's important to adjust the value of the \"temperature\" parameter, which is one of the factors influencing response generation. \n\nLastly, thank you very much for reading up to this point! If you found this helpful in any way, I would greatly appreciate it if you could upvote. \n\n**The following text has been added as an update on August 29th.** \nAdditional methods to increase the percentage of correct answers were considered. \nPlease take a look from the section entitled **\"In addition to GPT-3.5, I will attempt improvements by utilizing the Wikipedia API\"** onwards, as it yielded a somewhat higher percentage of correct answers than simply getting the answers from GPT 3.5! 
\n\n**The following text has been added as an update on September 5.** \nAdditional methods to increase the percentage of correct answers were considered.\nPlease take a look from the section entitled \"**Extract wikipedia information related to a question using the langchain library**\" onwards. Using the langchain library, I investigated ways to extract information that is highly relevant to the question; including this in the prompt and generating responses with GPT 3.5 further increased the percentage of correct responses!","metadata":{}},{"cell_type":"markdown","source":"# check data","metadata":{}},{"cell_type":"code","source":"train = pd.read_csv(\"/kaggle/input/kaggle-llm-science-exam/train.csv\")\nprint(train.shape)\ntrain.head()","metadata":{"execution":{"iopub.status.busy":"2023-09-11T13:54:34.793649Z","iopub.execute_input":"2023-09-11T13:54:34.794032Z","iopub.status.idle":"2023-09-11T13:54:34.845040Z","shell.execute_reply.started":"2023-09-11T13:54:34.794002Z","shell.execute_reply":"2023-09-11T13:54:34.844048Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"i=0\nprint(\"prompt question is: \",train.loc[i,\"prompt\"])\nprint(\"----------------------------------\")\nfor option in [\"A\",\"B\",\"C\",\"D\",\"E\"]:\n    print(\"Option \",option,\": \",train.loc[i,option])\n    print(\"----------------------------------\")","metadata":{"execution":{"iopub.status.busy":"2023-09-11T13:54:35.056472Z","iopub.execute_input":"2023-09-11T13:54:35.056829Z","iopub.status.idle":"2023-09-11T13:54:35.064290Z","shell.execute_reply.started":"2023-09-11T13:54:35.056798Z","shell.execute_reply":"2023-09-11T13:54:35.063188Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"Wow, this seems like a challenging problem indeed. It doesn't seem like something I could answer off the top of my head either.","metadata":{}},{"cell_type":"markdown","source":"# install library openai \nto request gpt3.5-turbo we need to install the openai library","metadata":{}},{"cell_type":"code","source":"!pip install openai","metadata":{"_kg_hide-output":true,"execution":{"iopub.status.busy":"2023-09-11T13:54:46.022340Z","iopub.execute_input":"2023-09-11T13:54:46.022696Z","iopub.status.idle":"2023-09-11T13:54:58.719922Z","shell.execute_reply.started":"2023-09-11T13:54:46.022669Z","shell.execute_reply":"2023-09-11T13:54:58.718636Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# set OpenAI API-key \nIn this notebook, I will be using OpenAI's API key. Once it is defined as a variable below, the API key will be referenced from the variable 'apikey.' 
To obtain an OpenAI API key, you can create an account on OpenAI's official website: https://openai.com/ and generate it from your profile page.","metadata":{}},{"cell_type":"code","source":"apikey=\"your-api-key\"","metadata":{"execution":{"iopub.status.busy":"2023-09-11T13:54:58.721988Z","iopub.execute_input":"2023-09-11T13:54:58.722334Z","iopub.status.idle":"2023-09-11T13:54:58.728181Z","shell.execute_reply.started":"2023-09-11T13:54:58.722301Z","shell.execute_reply":"2023-09-11T13:54:58.726620Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# create prompt and request openai gpt3.5 turbo to solve problems","metadata":{}},{"cell_type":"code","source":"#Creates a generic GPT calling function\ndef request_gpt(model_name,messages_list):\n    response=openai.ChatCompletion.create(\n        model=model_name,\n        messages=messages_list,\n        temperature =0\n    )\n    return response","metadata":{"execution":{"iopub.status.busy":"2023-09-11T14:00:01.294106Z","iopub.execute_input":"2023-09-11T14:00:01.294494Z","iopub.status.idle":"2023-09-11T14:00:01.300253Z","shell.execute_reply.started":"2023-09-11T14:00:01.294463Z","shell.execute_reply":"2023-09-11T14:00:01.299171Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"import os\nimport openai\nimport time\n\n# you can get an openai apikey when you create an account\nopenai.api_key = apikey\nanswer_list=[]\n\nfor i in range(train.shape[0]):\n    prompt_str=train.loc[i,\"prompt\"]+' Please select the most accurate option from the choices A to E above and answer just like \"The answer is A\" \\\\n----------------------------------\\\\n'\n    for option in [\"A\",\"B\",\"C\",\"D\",\"E\"]:\n        prompt_str=prompt_str+'Option ' + option + ' : ' + train.loc[i,option] + '\\\\n'\n    messages=[{\"role\": \"user\", \"content\": prompt_str}]\n    #Sometimes errors occur, so try&catch so that you can retry once\n    try:\n        response=request_gpt(\"gpt-3.5-turbo\",messages)\n    except:\n        response=request_gpt(\"gpt-3.5-turbo\",messages)\n    if i%25==0:\n        #print sample response as a progress check\n        print(\"id\",str(i),\" response is : \",response[\"choices\"][0][\"message\"][\"content\"])\n    answer_list.append(response[\"choices\"][0][\"message\"][\"content\"][14]) # index 14 of 'The answer is X' is the answer letter\n    #There is a per-minute request limit with OpenAI, so wait briefly (here 1 second) between requests.\n    #Reference information:https://platform.openai.com/docs/guides/rate-limits/overview\n    time.sleep(1)\nprint(\"done\")\n\ntrain[\"prediction\"]=answer_list\n#Let's take a look at the answers for the first 10 questions\nanswer_list[:10]","metadata":{"execution":{"iopub.status.busy":"2023-09-11T14:00:03.441963Z","iopub.execute_input":"2023-09-11T14:00:03.442312Z","iopub.status.idle":"2023-09-11T14:09:12.650432Z","shell.execute_reply.started":"2023-09-11T14:00:03.442287Z","shell.execute_reply":"2023-09-11T14:09:12.649284Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# check accuracy score","metadata":{}},{"cell_type":"code","source":"from sklearn.metrics import accuracy_score\nprint(\"accuracy score is : \",accuracy_score(train[\"answer\"],train[\"prediction\"]))","metadata":{"execution":{"iopub.status.busy":"2023-09-11T14:09:12.652580Z","iopub.execute_input":"2023-09-11T14:09:12.653408Z","iopub.status.idle":"2023-09-11T14:09:13.189242Z","shell.execute_reply.started":"2023-09-11T14:09:12.653371Z","shell.execute_reply":"2023-09-11T14:09:13.187985Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# 
Reflection-1 \nIt's not too bad to have about a 70 percent accuracy rate considering each question has five options. However, given that you're using gpt3.5-turbo, the performance could indeed be better. \n\n\n**The following text has been added as an update on August 29th.** \nSo, it may sound simplistic, but I considered adding additional features. \nAs for where to obtain information, I turned to Wikipedia. \nFortunately, retrieving Wikipedia information isn't overly challenging, with APIs and libraries readily available. \nHowever, to fetch Wikipedia information, specific search keywords are required for each question. \nAs these are unavailable, I've decided to have GPT-3.5 extract search keywords for me.","metadata":{}},{"cell_type":"markdown","source":"# install library wikipedia\nto access and parse data from Wikipedia, we need to install the wikipedia library","metadata":{}},{"cell_type":"code","source":"!pip install wikipedia","metadata":{"_kg_hide-output":true,"execution":{"iopub.status.busy":"2023-09-11T14:09:13.190626Z","iopub.execute_input":"2023-09-11T14:09:13.190973Z","iopub.status.idle":"2023-09-11T14:09:26.735291Z","shell.execute_reply.started":"2023-09-11T14:09:13.190944Z","shell.execute_reply":"2023-09-11T14:09:26.733703Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# I will attempt improvements by utilizing the Wikipedia API.","metadata":{}},{"cell_type":"code","source":"import wikipedia\nwiki_info=[]\nfor i in range(train.shape[0]):\n    #Generate keywords for searching on Wikipedia using the question information and GPT.\n    prompt_str=train.loc[i,\"prompt\"]\n    messages=[]\n    messages.append({\"role\": \"user\", \"content\": prompt_str})\n    messages.append({\"role\": \"user\", \"content\": 'Create one most important keyword for searching the above question on Wikipedia. Keyword only, no explanations or additional text.'})\n    \n    try:\n        response=request_gpt(\"gpt-3.5-turbo\",messages)\n    except:\n        response=request_gpt(\"gpt-3.5-turbo\",messages)\n\n    keyword = response[\"choices\"][0][\"message\"][\"content\"]\n    #Sometimes, there might be quotation marks at the beginning and end of the generated GPT text, so I will exclude these using slicing.\n    if (keyword[0]=='\"')&(keyword[-1]=='\"'):\n        keyword=keyword[1:-1]\n    \n    #utilizing the Wikipedia API\n    wikipedia.set_lang(\"en\")\n    search_response = wikipedia.search(keyword[:300])\n    info_str=\"\"\n    page_data=\"\"\n    #The wikipedia.search method retrieves a list of information available on Wikipedia through execution.\n    #However, there are occasional instances where the result is zero. 
In such cases, the addition of information is abandoned.\n    if len(search_response)==0:\n        info_str = \"\"\n    else:\n        #Sometimes, when the target page doesn't exist, I give up on adding the information.\n        try:\n            page_data = wikipedia.page(search_response[0])\n            info_str = page_data.content\n        except:\n            try:\n                if len(search_response)>1:\n                    page_data = wikipedia.page(search_response[1])\n                    info_str = page_data.content\n                else:\n                    info_str = \"\"\n            except:\n                info_str = \"\"\n    wiki_info.append(info_str)\n    \n    #Display a sample of each result.\n    if i%50==0:\n        print(\"id is : \",str(i))\n        print(\"prompt message is : \" , messages)\n        print('gpt3.5turbo answer(created keyword) : \\n',response[\"choices\"][0][\"message\"][\"content\"])\n        print('wikipedia search result : \\n',search_response)\n        print('wikipedia page content(Due to the large volume, only the first 100 characters will be displayed) : \\n',info_str[:100])\n        print()\n\n    time.sleep(1)\n\n#Add the information obtained from Wikipedia as a feature column.\ntrain[\"wiki_info\"]=wiki_info\ntrain.head()","metadata":{"execution":{"iopub.status.busy":"2023-09-11T14:13:07.127785Z","iopub.execute_input":"2023-09-11T14:13:07.128207Z","iopub.status.idle":"2023-09-11T14:28:02.935675Z","shell.execute_reply.started":"2023-09-11T14:13:07.128165Z","shell.execute_reply":"2023-09-11T14:28:02.934338Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"The amount of information that can be retrieved from wikipedia seems to vary from line to line.","metadata":{}},{"cell_type":"code","source":"import seaborn as sns\nprint(train[\"wiki_info\"].str.len().describe())\nsns.histplot(data=train[\"wiki_info\"].str.len())","metadata":{"execution":{"iopub.status.busy":"2023-09-11T14:28:02.937285Z","iopub.execute_input":"2023-09-11T14:28:02.937876Z","iopub.status.idle":"2023-09-11T14:28:03.616378Z","shell.execute_reply.started":"2023-09-11T14:28:02.937830Z","shell.execute_reply":"2023-09-11T14:28:03.615058Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# create prompt with wikipedia information to solve problems \ntry again! 
request gpt-3.5-turbo to solve the problems \nDue to the large number of characters, the 16k-context model (= \"gpt-3.5-turbo-16k\") is used.","metadata":{}},{"cell_type":"code","source":"answer_list=[]\nfor i in range(train.shape[0]):\n    prompt_str=train.loc[i,\"prompt\"]+\" Please choose the most accurate option from the choices A to E above and answer in the format 'The answer is A'.\"\n    prompt_str=prompt_str+'\\n----------------------------------\\n'\n    for option in [\"A\",\"B\",\"C\",\"D\",\"E\"]:\n        prompt_str=prompt_str+'Option ' + option + ' : ' + train.loc[i,option] + '\\n'\n    info_str=train.loc[i,\"wiki_info\"][:3000]\n    messages=[]\n    messages.append({\"role\": \"assistant\", \"content\": info_str})\n    messages.append({\"role\": \"user\", \"content\": prompt_str})\n\n    try:\n        response=request_gpt(\"gpt-3.5-turbo-16k\",messages)\n    except:\n        response=request_gpt(\"gpt-3.5-turbo-16k\",messages)\n    \n    if i%50==0:\n        #print sample response\n        print(\"id\",str(i),\" response is : \",response[\"choices\"][0][\"message\"][\"content\"])\n    \n    answer_list.append(response[\"choices\"][0][\"message\"][\"content\"][14])\n    time.sleep(1)\n    \nprint(\"done\")\ntrain[\"prediction_2\"]=answer_list\n\n#Let's take a look at the answers for the first 10 questions\ntrain.head()","metadata":{"execution":{"iopub.status.busy":"2023-09-11T14:30:10.309461Z","iopub.execute_input":"2023-09-11T14:30:10.309893Z","iopub.status.idle":"2023-09-11T14:49:58.840170Z","shell.execute_reply.started":"2023-09-11T14:30:10.309843Z","shell.execute_reply":"2023-09-11T14:49:58.838672Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# print sample prompt string\nprint(prompt_str)","metadata":{"execution":{"iopub.status.busy":"2023-09-11T14:49:58.842798Z","iopub.execute_input":"2023-09-11T14:49:58.844013Z","iopub.status.idle":"2023-09-11T14:49:58.849949Z","shell.execute_reply.started":"2023-09-11T14:49:58.843959Z","shell.execute_reply":"2023-09-11T14:49:58.848319Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# check accuracy score again","metadata":{}},{"cell_type":"code","source":"print(\"accuracy score is : \",accuracy_score(train[\"answer\"],train[\"prediction_2\"]))","metadata":{"execution":{"iopub.status.busy":"2023-09-11T14:49:58.851697Z","iopub.execute_input":"2023-09-11T14:49:58.852201Z","iopub.status.idle":"2023-09-11T14:49:58.872572Z","shell.execute_reply.started":"2023-09-11T14:49:58.852158Z","shell.execute_reply":"2023-09-11T14:49:58.870996Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"Good! It is more accurate than before (score without wikipedia information = 0.72 correct response rate)!","metadata":{}},{"cell_type":"markdown","source":"# Reflection-2 \nIn a way, as expected, the correct-answer rate was even higher when computed only over the rows to which wikipedia information could actually be added. \nWhat this perhaps tells us is that requesting an LLM such as GPT-3.5 with the relevant information attached is more accurate than sending only the question directly to the LLM. \nTherefore, it can be inferred that the correct response rate is greatly affected by how much correct information (information necessary to derive the correct answer, plus related information) is attached when making a request to an LLM model such as GPT. 
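In other words, this is the retrieval-augmented generation pattern: retrieve candidate context, keep only the most relevant parts, and attach them to the prompt before asking the model. 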
{"cell_type":"markdown","source":"**The following text has been added as an update on September 5.** \nThe Wikipedia information given in the prompt above (the version with a correct response rate of 0.79) was cut to the first 3,000 characters because, as the histogram shows, some rows contain far more text than that. \nHow can we fit more of the useful information into the prompt? \nAfter some searching, it seems that the langchain library and OpenAI's embedding API can be used to extract only the most relevant passages. I decided to do some additional work on this.","metadata":{}},{"cell_type":"markdown","source":"# install library langchain and others","metadata":{}},{"cell_type":"code","source":"!pip install langchain\n!pip install chromadb\n!pip install tiktoken","metadata":{"_kg_hide-output":true,"execution":{"iopub.status.busy":"2023-09-11T14:49:58.875067Z","iopub.execute_input":"2023-09-11T14:49:58.875466Z","iopub.status.idle":"2023-09-11T14:51:34.584780Z","shell.execute_reply.started":"2023-09-11T14:49:58.875431Z","shell.execute_reply":"2023-09-11T14:51:34.583151Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Extract wikipedia information related to a question using the langchain library","metadata":{"_kg_hide-output":true}},{"cell_type":"code","source":"from langchain.text_splitter import CharacterTextSplitter\nfrom langchain.embeddings import OpenAIEmbeddings\nfrom langchain.vectorstores import Chroma\nfrom langchain.schema.document import Document\n\ntext_splitter = CharacterTextSplitter(chunk_size=1000)\nembeddings = OpenAIEmbeddings(openai_api_key=apikey)\n\ndef choose_important_info(wiki_info_str,prompt_str):\n    #Split the article into ~1,000-character chunks, embed them, and keep the\n    #chunks most similar to the question, up to roughly 2,000 characters in total.\n    prompt_info_str=\"\"\n    docs = [Document(page_content=x) for x in text_splitter.split_text(wiki_info_str)]\n    if len(docs)>0:\n        vectorstore = Chroma.from_documents(docs, embeddings)\n        candidate = vectorstore.similarity_search(prompt_str)\n        len_cnt=0\n        each_info=[]\n        for j in range(len(candidate)):\n            len_cnt=len_cnt+len(candidate[j].page_content)\n            each_info.append(candidate[j].page_content)\n            if(len_cnt>2000):\n                break\n        prompt_info_str = '\\n'.join(list(set(each_info)))\n    else:\n        prompt_info_str=\"\"\n    return prompt_info_str","metadata":{"execution":{"iopub.status.busy":"2023-09-11T14:51:34.587440Z","iopub.execute_input":"2023-09-11T14:51:34.588034Z","iopub.status.idle":"2023-09-11T14:51:37.892108Z","shell.execute_reply.started":"2023-09-11T14:51:34.587978Z","shell.execute_reply":"2023-09-11T14:51:37.890651Z"},"trusted":true},"execution_count":null,"outputs":[]},
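{"cell_type":"markdown","source":"Before using the helper in the full loop, it is worth seeing what it returns. The cell below is a minimal sketch that runs `choose_important_info` on the first row of `train` and prints how much text survives the relevance filter.","metadata":{}},{"cell_type":"code","source":"#Sketch: run the relevance filter on one row and inspect the result.\nsample_info = choose_important_info(train.loc[0,\"wiki_info\"],train.loc[0,\"prompt\"])\nprint(len(sample_info),\"characters selected\")\nprint(sample_info[:300])","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# create prompt with Wikipedia information highly relevant to the question","metadata":{}},{"cell_type":"code","source":"answer_list=[]\nfor i in range(train.shape[0]):\n    prompt_str=train.loc[i,\"prompt\"]+\" Please choose the most accurate option from the choices A to E above and answer in the format 'The answer is A'.\"\n    prompt_str=prompt_str+'\\n----------------------------------\\n'\n    for option in [\"A\",\"B\",\"C\",\"D\",\"E\"]:\n        prompt_str=prompt_str+'Option ' + option + ' : ' + train.loc[i,option] + '\\n'\n    #Attach only the passages most relevant to this question.\n    info_str=choose_important_info(train.loc[i,\"wiki_info\"],train.loc[i,\"prompt\"])\n    messages=[]\n    messages.append({\"role\": \"assistant\", \"content\": info_str})\n    messages.append({\"role\": \"user\", \"content\": prompt_str[:2000]})\n\n    #Retry once on a transient API error.\n    try:\n        response=request_gpt(\"gpt-3.5-turbo-16k\",messages)\n    except:\n        response=request_gpt(\"gpt-3.5-turbo-16k\",messages)\n\n    if i%50==0:\n        #print 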
sample prompt and response\n        print(\"prompt message is :\",messages)\n        print(\"id\",str(i),\" response is : \",response[\"choices\"][0][\"message\"][\"content\"])\n\n    #The reply has the format 'The answer is X', so the option letter sits at index 14.\n    answer_list.append(response[\"choices\"][0][\"message\"][\"content\"][14])\n    time.sleep(1)\n\nprint(\"done\")\ntrain[\"prediction_3\"]=answer_list\n\n#Let's take a look at the answers for the first few questions\ntrain.head()","metadata":{"execution":{"iopub.status.busy":"2023-09-11T14:51:37.893429Z","iopub.execute_input":"2023-09-11T14:51:37.893767Z","iopub.status.idle":"2023-09-11T15:09:47.743198Z","shell.execute_reply.started":"2023-09-11T14:51:37.893737Z","shell.execute_reply":"2023-09-11T15:09:47.741837Z"},"_kg_hide-output":true,"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# check accuracy score again","metadata":{}},{"cell_type":"code","source":"print(\"accuracy score is : \",accuracy_score(train[\"answer\"],train[\"prediction_3\"]))","metadata":{"execution":{"iopub.status.busy":"2023-09-11T15:09:47.744728Z","iopub.execute_input":"2023-09-11T15:09:47.745093Z","iopub.status.idle":"2023-09-11T15:09:47.753251Z","shell.execute_reply.started":"2023-09-11T15:09:47.745059Z","shell.execute_reply":"2023-09-11T15:09:47.751880Z"},"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"It's right on target! I was able to further increase the percentage of correct answers!","metadata":{}},{"cell_type":"markdown","source":"# Reflection-3 \nI divided the process into three phases and checked how the percentage of correct answers changed. \nFirst, GPT generates answers using only the questions, the choices, and a short instruction sentence from the training data set. \nNext, the raw Wikipedia text (truncated to 3,000 characters) is attached and the answers are generated again. \nFinally, only the Wikipedia passages most relevant to each question are attached and the answers are generated once more. \nAs I had hoped, the correct answer rate increased from 0.72 to 0.785 to 0.84. \nI now have a clearer picture of what information is needed to raise the accuracy, but I have not yet turned this into a competition submission, so that is the next task. \nAlso, running this notebook requires an OpenAI API key; I would like to investigate whether the same process works with another LLM, preferably one that does not require an API key.","metadata":{}}
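,{"cell_type":"markdown","source":"As a wrap-up, the cell below (a minimal sketch) prints the accuracy of the two Wikipedia-augmented phases side by side; the question-only baseline of 0.72 is quoted from the earlier run rather than recomputed.","metadata":{}},{"cell_type":"code","source":"#Sketch: summarize the three phases; the phase-1 number is the one reported earlier.\nprint(\"question only (reported earlier)    : 0.72\")\nprint(\"+ raw wiki text (first 3,000 chars) :\",accuracy_score(train[\"answer\"],train[\"prediction_2\"]))\nprint(\"+ relevance-filtered wiki text      :\",accuracy_score(train[\"answer\"],train[\"prediction_3\"]))","metadata":{"trusted":true},"execution_count":null,"outputs":[]}],"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"pygments_lexer":"ipython3","nbconvert_exporter":"python","version":"3.6.4","file_extension":".py","codemirror_mode":{"name":"ipython","version":3},"name":"python","mimetype":"text/x-python"}} -------------------------------------------------------------------------------- /openbook-debertav3-large-baseline-single-model.ipynb: -------------------------------------------------------------------------------- 1 | {"metadata":{"kernelspec":{"language":"python","display_name":"Python 3","name":"python3"},"language_info":{"pygments_lexer":"ipython3","nbconvert_exporter":"python","version":"3.6.4","file_extension":".py","codemirror_mode":{"name":"ipython","version":3},"name":"python","mimetype":"text/x-python"}},"nbformat_minor":4,"nbformat":4,"cells":[{"cell_type":"markdown","source":"# OpenBook DeBERTaV3-Large Baseline\n\nHi! 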
This notebook is a merge of the following approaches:\n\n**OpenBook Approach**\n- https://www.kaggle.com/code/quangbk/open-book-llm-science-exam-reduced-ram-usage\n- https://www.kaggle.com/code/jjinho/open-book-llm-science-exam\n\n**DeBERTaV3-Large with extra data**\n- https://www.kaggle.com/code/radek1/new-dataset-deberta-v3-large-training\n\nIt leverages both the multiple-choice implementation of the HuggingFace library and the context retrieval of the openbook approach. It can be extended with billion-parameter LLMs and better retrieval methods.","metadata":{}},{"cell_type":"code","source":"# installing offline dependencies\n!pip install -U /kaggle/input/faiss-gpu-173-python310/faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl\n!cp -rf /kaggle/input/sentence-transformers-222/sentence-transformers /kaggle/working/sentence-transformers\n!pip install -U /kaggle/working/sentence-transformers\n!pip install -U /kaggle/input/blingfire-018/blingfire-0.1.8-py3-none-any.whl\n\n!pip install --no-index --no-deps /kaggle/input/llm-whls/transformers-4.31.0-py3-none-any.whl\n!pip install --no-index --no-deps /kaggle/input/llm-whls/peft-0.4.0-py3-none-any.whl\n!pip install --no-index --no-deps /kaggle/input/llm-whls/datasets-2.14.3-py3-none-any.whl\n!pip install --no-index --no-deps /kaggle/input/llm-whls/trl-0.5.0-py3-none-any.whl","metadata":{"_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","_kg_hide-input":false,"_kg_hide-output":true,"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","execution":{"iopub.execute_input":"2023-08-14T10:07:16.129442Z","iopub.status.busy":"2023-08-14T10:07:16.128308Z","iopub.status.idle":"2023-08-14T10:09:22.923467Z","shell.execute_reply":"2023-08-14T10:09:22.922291Z"},"papermill":{"duration":126.809817,"end_time":"2023-08-14T10:09:22.925969","exception":false,"start_time":"2023-08-14T10:07:16.116152","status":"completed"},"tags":[],"collapsed":true,"jupyter":{"outputs_hidden":true}},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# __future__ imports must come before any other statement in the cell\nfrom __future__ import annotations\n\nimport os\nimport gc\nimport pandas as pd\nimport numpy as np\nimport re\nfrom tqdm.auto import tqdm\nimport blingfire as bf\n\nfrom collections.abc import Iterable\n\nimport faiss\nfrom faiss import write_index, read_index\n\nfrom sentence_transformers import SentenceTransformer\n\nimport torch\nimport ctypes\nlibc = ctypes.CDLL(\"libc.so.6\")","metadata":{"execution":{"iopub.execute_input":"2023-08-14T10:09:22.955274Z","iopub.status.busy":"2023-08-14T10:09:22.954300Z","iopub.status.idle":"2023-08-14T10:09:31.472186Z","shell.execute_reply":"2023-08-14T10:09:31.471270Z"},"papermill":{"duration":8.534957,"end_time":"2023-08-14T10:09:31.474781","exception":false,"start_time":"2023-08-14T10:09:22.939824","status":"completed"},"tags":[]},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"def process_documents(documents: Iterable[str],\n                      document_ids: Iterable,\n                      split_sentences: bool = True,\n                      filter_len: int = 3,\n                      disable_progress_bar: bool = False) -> pd.DataFrame:\n    \"\"\"\n    Main helper function to process the Wikipedia documents.\n\n    :param documents: Iterable containing documents which are strings\n    :param document_ids: Iterable containing document unique identifiers\n    :param split_sentences: Flag to determine whether to further split sections into sentences\n    :param 
filter_len: Minimum character length of a sentence (otherwise filter out)\n    :param disable_progress_bar: Flag to disable tqdm progress bar\n    :return: Pandas DataFrame containing the columns `document_id`, `text`, `offset`\n    \"\"\"\n    df = sectionize_documents(documents, document_ids, disable_progress_bar)\n\n    if split_sentences:\n        df = sentencize(df.text.values,\n                        df.document_id.values,\n                        df.offset.values,\n                        filter_len,\n                        disable_progress_bar)\n    return df\n\n\ndef sectionize_documents(documents: Iterable[str],\n                         document_ids: Iterable,\n                         disable_progress_bar: bool = False) -> pd.DataFrame:\n    \"\"\"\n    Treats each document as a single section spanning the full text and\n    records the corresponding character offsets.\n\n    :param documents: Iterable containing documents which are strings\n    :param document_ids: Iterable containing document unique identifiers\n    :param disable_progress_bar: Flag to disable tqdm progress bar\n    :return: Pandas DataFrame containing the columns `document_id`, `text`, `offset`\n    \"\"\"\n    processed_documents = []\n    for document_id, document in tqdm(zip(document_ids, documents), total=len(documents), disable=disable_progress_bar):\n        row = {}\n        text, start, end = (document, 0, len(document))\n        row['document_id'] = document_id\n        row['text'] = text\n        row['offset'] = (start, end)\n\n        processed_documents.append(row)\n\n    _df = pd.DataFrame(processed_documents)\n    if _df.shape[0] > 0:\n        return _df.sort_values(['document_id', 'offset']).reset_index(drop=True)\n    else:\n        return _df\n\n\ndef sentencize(documents: Iterable[str],\n               document_ids: Iterable,\n               offsets: Iterable[tuple[int, int]],\n               filter_len: int = 3,\n               disable_progress_bar: bool = False) -> pd.DataFrame:\n    \"\"\"\n    Split a document into sentences. Can be used with `sectionize_documents`\n    to further split documents into more manageable pieces. 
Takes in offsets\n    to ensure that after splitting, the sentences can be matched to the\n    location in the original documents.\n\n    :param documents: Iterable containing documents which are strings\n    :param document_ids: Iterable containing document unique identifiers\n    :param offsets: Iterable tuple of the start and end indices\n    :param filter_len: Minimum character length of a sentence (otherwise filter out)\n    :param disable_progress_bar: Flag to disable tqdm progress bar\n    :return: Pandas DataFrame containing the columns `document_id`, `text`, `offset`\n    \"\"\"\n\n    document_sentences = []\n    for document, document_id, offset in tqdm(zip(documents, document_ids, offsets), total=len(documents), disable=disable_progress_bar):\n        try:\n            _, sentence_offsets = bf.text_to_sentences_and_offsets(document)\n            for o in sentence_offsets:\n                if o[1]-o[0] > filter_len:\n                    sentence = document[o[0]:o[1]]\n                    abs_offsets = (o[0]+offset[0], o[1]+offset[0])\n                    row = {}\n                    row['document_id'] = document_id\n                    row['text'] = sentence\n                    row['offset'] = abs_offsets\n                    document_sentences.append(row)\n        except Exception:\n            #Skip documents that blingfire fails to sentencize.\n            continue\n    return pd.DataFrame(document_sentences)","metadata":{"execution":{"iopub.execute_input":"2023-08-14T10:09:31.556440Z","iopub.status.busy":"2023-08-14T10:09:31.556164Z","iopub.status.idle":"2023-08-14T10:09:31.571938Z","shell.execute_reply":"2023-08-14T10:09:31.571107Z"},"papermill":{"duration":0.034054,"end_time":"2023-08-14T10:09:31.574046","exception":false,"start_time":"2023-08-14T10:09:31.539992","status":"completed"},"tags":[]},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"SIM_MODEL = '/kaggle/input/sentencetransformers-allminilml6v2/sentence-transformers_all-MiniLM-L6-v2'\nDEVICE = 0\nMAX_LENGTH = 384\nBATCH_SIZE = 16\n\nWIKI_PATH = \"/kaggle/input/wikipedia-20230701\"\nwiki_files = os.listdir(WIKI_PATH)","metadata":{"execution":{"iopub.execute_input":"2023-08-14T10:09:31.602444Z","iopub.status.busy":"2023-08-14T10:09:31.601698Z","iopub.status.idle":"2023-08-14T10:09:31.621434Z","shell.execute_reply":"2023-08-14T10:09:31.620631Z"},"papermill":{"duration":0.036342,"end_time":"2023-08-14T10:09:31.623595","exception":false,"start_time":"2023-08-14T10:09:31.587253","status":"completed"},"tags":[]},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Relevant Title Retrieval","metadata":{}},{"cell_type":"code","source":"trn = pd.read_csv(\"/kaggle/input/kaggle-llm-science-exam/test.csv\").drop(\"id\", axis=1)\ntrn.head()","metadata":{"execution":{"iopub.execute_input":"2023-08-14T10:09:31.650942Z","iopub.status.busy":"2023-08-14T10:09:31.650687Z","iopub.status.idle":"2023-08-14T10:09:31.691013Z","shell.execute_reply":"2023-08-14T10:09:31.689907Z"},"papermill":{"duration":0.058533,"end_time":"2023-08-14T10:09:31.695383","exception":false,"start_time":"2023-08-14T10:09:31.636850","status":"completed"},"tags":[]},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"model = SentenceTransformer(SIM_MODEL, device='cuda')\nmodel.max_seq_length = MAX_LENGTH\nmodel = model.half()","metadata":{"execution":{"iopub.execute_input":"2023-08-14T10:09:31.726783Z","iopub.status.busy":"2023-08-14T10:09:31.725295Z","iopub.status.idle":"2023-08-14T10:09:44.990463Z","shell.execute_reply":"2023-08-14T10:09:44.989410Z"},"papermill":{"duration":13.282604,"end_time":"2023-08-14T10:09:44.992949","exception":false,"start_time":"2023-08-14T10:09:31.710345","status":"completed"},"tags":[]},"execution_count":null,"outputs":[]},
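{"cell_type":"markdown","source":"The FAISS searches below rely on the fact that, with `normalize_embeddings=True`, inner-product search is equivalent to cosine similarity. The next cell is a minimal sketch (two made-up sentences, not competition data) demonstrating this with the loaded model.","metadata":{}},{"cell_type":"code","source":"#Sketch: with unit-normalized embeddings, the dot product is the cosine similarity.\ndemo = model.encode([\"What is entropy?\", \"Entropy measures disorder.\"], convert_to_tensor=True, normalize_embeddings=True)\nprint(float(demo[0] @ demo[1]))","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"sentence_index = 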
read_index(\"/kaggle/input/wikipedia-2023-07-faiss-index/wikipedia_202307.index\")","metadata":{"execution":{"iopub.execute_input":"2023-08-14T10:09:45.023326Z","iopub.status.busy":"2023-08-14T10:09:45.022984Z","iopub.status.idle":"2023-08-14T10:11:20.931589Z","shell.execute_reply":"2023-08-14T10:11:20.929626Z"},"papermill":{"duration":95.926417,"end_time":"2023-08-14T10:11:20.934445","exception":false,"start_time":"2023-08-14T10:09:45.008028","status":"completed"},"tags":[]},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"prompt_embeddings = model.encode(trn.prompt.values, batch_size=BATCH_SIZE, device=DEVICE, show_progress_bar=True, convert_to_tensor=True, normalize_embeddings=True)\nprompt_embeddings = prompt_embeddings.detach().cpu().numpy()\n_ = gc.collect()","metadata":{"execution":{"iopub.execute_input":"2023-08-14T10:11:20.983065Z","iopub.status.busy":"2023-08-14T10:11:20.982740Z","iopub.status.idle":"2023-08-14T10:11:31.845868Z","shell.execute_reply":"2023-08-14T10:11:31.844889Z"},"papermill":{"duration":10.891104,"end_time":"2023-08-14T10:11:31.848690","exception":false,"start_time":"2023-08-14T10:11:20.957586","status":"completed"},"tags":[]},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"## Get the top 3 pages that are likely to contain the topic of interest\nsearch_score, search_index = sentence_index.search(prompt_embeddings, 3)","metadata":{"execution":{"iopub.execute_input":"2023-08-14T10:11:31.924603Z","iopub.status.busy":"2023-08-14T10:11:31.924304Z","iopub.status.idle":"2023-08-14T10:11:55.244590Z","shell.execute_reply":"2023-08-14T10:11:55.243663Z"},"papermill":{"duration":23.339585,"end_time":"2023-08-14T10:11:55.247556","exception":false,"start_time":"2023-08-14T10:11:31.907971","status":"completed"},"tags":[]},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"## Save memory - delete sentence_index since it is no longer necessary\ndel sentence_index\ndel prompt_embeddings\n_ = gc.collect()\nlibc.malloc_trim(0)","metadata":{"execution":{"iopub.execute_input":"2023-08-14T10:11:55.283896Z","iopub.status.busy":"2023-08-14T10:11:55.283599Z","iopub.status.idle":"2023-08-14T10:11:56.143196Z","shell.execute_reply":"2023-08-14T10:11:56.142123Z"},"papermill":{"duration":0.877305,"end_time":"2023-08-14T10:11:56.145444","exception":false,"start_time":"2023-08-14T10:11:55.268139","status":"completed"},"tags":[]},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Getting Sentences from the Relevant Titles","metadata":{}},{"cell_type":"code","source":"df = pd.read_parquet(\"/kaggle/input/wikipedia-20230701/wiki_2023_index.parquet\",\n columns=['id', 'file'])","metadata":{"execution":{"iopub.execute_input":"2023-08-14T10:11:56.178483Z","iopub.status.busy":"2023-08-14T10:11:56.178096Z","iopub.status.idle":"2023-08-14T10:12:01.894786Z","shell.execute_reply":"2023-08-14T10:12:01.893653Z"},"papermill":{"duration":5.737408,"end_time":"2023-08-14T10:12:01.897408","exception":false,"start_time":"2023-08-14T10:11:56.160000","status":"completed"},"tags":[]},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"## Get the article and associated file location using the index\nwikipedia_file_data = []\n\nfor i, (scr, idx) in tqdm(enumerate(zip(search_score, search_index)), total=len(search_score)):\n scr_idx = idx\n _df = df.loc[scr_idx].copy()\n _df['prompt_id'] = i\n wikipedia_file_data.append(_df)\nwikipedia_file_data = pd.concat(wikipedia_file_data).reset_index(drop=True)\nwikipedia_file_data = 
wikipedia_file_data[['id', 'prompt_id', 'file']].drop_duplicates().sort_values(['file', 'id']).reset_index(drop=True)\n\n## Save memory - delete df since it is no longer necessary\ndel df\n_ = gc.collect()\nlibc.malloc_trim(0)","metadata":{"execution":{"iopub.execute_input":"2023-08-14T10:12:01.930242Z","iopub.status.busy":"2023-08-14T10:12:01.929368Z","iopub.status.idle":"2023-08-14T10:12:02.710141Z","shell.execute_reply":"2023-08-14T10:12:02.709070Z"},"papermill":{"duration":0.799872,"end_time":"2023-08-14T10:12:02.712752","exception":false,"start_time":"2023-08-14T10:12:01.912880","status":"completed"},"tags":[]},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"## Get the full text data\nwiki_text_data = []\n\nfor file in tqdm(wikipedia_file_data.file.unique(), total=len(wikipedia_file_data.file.unique())):\n _id = [str(i) for i in wikipedia_file_data[wikipedia_file_data['file']==file]['id'].tolist()]\n _df = pd.read_parquet(f\"{WIKI_PATH}/{file}\", columns=['id', 'text'])\n\n _df_temp = _df[_df['id'].isin(_id)].copy()\n del _df\n _ = gc.collect()\n libc.malloc_trim(0)\n wiki_text_data.append(_df_temp)\nwiki_text_data = pd.concat(wiki_text_data).drop_duplicates().reset_index(drop=True)\n_ = gc.collect()","metadata":{"execution":{"iopub.execute_input":"2023-08-14T10:12:02.745635Z","iopub.status.busy":"2023-08-14T10:12:02.745316Z","iopub.status.idle":"2023-08-14T10:17:06.706947Z","shell.execute_reply":"2023-08-14T10:17:06.705934Z"},"papermill":{"duration":303.981049,"end_time":"2023-08-14T10:17:06.710072","exception":false,"start_time":"2023-08-14T10:12:02.729023","status":"completed"},"tags":[]},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"## Parse documents into sentences\nprocessed_wiki_text_data = process_documents(wiki_text_data.text.values, wiki_text_data.id.values)","metadata":{"execution":{"iopub.execute_input":"2023-08-14T10:17:06.745180Z","iopub.status.busy":"2023-08-14T10:17:06.744844Z","iopub.status.idle":"2023-08-14T10:17:11.218048Z","shell.execute_reply":"2023-08-14T10:17:11.217022Z"},"papermill":{"duration":4.491281,"end_time":"2023-08-14T10:17:11.220342","exception":false,"start_time":"2023-08-14T10:17:06.729061","status":"completed"},"tags":[]},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"## Get embeddings of the wiki text data\nwiki_data_embeddings = model.encode(processed_wiki_text_data.text,\n batch_size=BATCH_SIZE,\n device=DEVICE,\n show_progress_bar=True,\n convert_to_tensor=True,\n normalize_embeddings=True)#.half()\nwiki_data_embeddings = wiki_data_embeddings.detach().cpu().numpy()","metadata":{"execution":{"iopub.execute_input":"2023-08-14T10:17:11.255088Z","iopub.status.busy":"2023-08-14T10:17:11.254041Z","iopub.status.idle":"2023-08-14T10:17:36.345952Z","shell.execute_reply":"2023-08-14T10:17:36.344929Z"},"papermill":{"duration":25.110593,"end_time":"2023-08-14T10:17:36.348422","exception":false,"start_time":"2023-08-14T10:17:11.237829","status":"completed"},"tags":[]},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"_ = 
gc.collect()","metadata":{"execution":{"iopub.execute_input":"2023-08-14T10:17:36.382111Z","iopub.status.busy":"2023-08-14T10:17:36.380661Z","iopub.status.idle":"2023-08-14T10:17:36.677809Z","shell.execute_reply":"2023-08-14T10:17:36.676678Z"},"papermill":{"duration":0.315807,"end_time":"2023-08-14T10:17:36.679867","exception":false,"start_time":"2023-08-14T10:17:36.364060","status":"completed"},"tags":[]},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"## Combine all answers\ntrn['answer_all'] = trn.apply(lambda x: \" \".join([x['A'], x['B'], x['C'], x['D'], x['E']]), axis=1)\n\n\n## Search using the prompt and answers to guide the search\ntrn['prompt_answer_stem'] = trn['prompt'] + \" \" + trn['answer_all']","metadata":{"execution":{"iopub.execute_input":"2023-08-14T10:17:36.713788Z","iopub.status.busy":"2023-08-14T10:17:36.712233Z","iopub.status.idle":"2023-08-14T10:17:36.728025Z","shell.execute_reply":"2023-08-14T10:17:36.727138Z"},"papermill":{"duration":0.034767,"end_time":"2023-08-14T10:17:36.730378","exception":false,"start_time":"2023-08-14T10:17:36.695611","status":"completed"},"tags":[]},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"question_embeddings = model.encode(trn.prompt_answer_stem.values, batch_size=BATCH_SIZE, device=DEVICE, show_progress_bar=True, convert_to_tensor=True, normalize_embeddings=True)\nquestion_embeddings = question_embeddings.detach().cpu().numpy()","metadata":{"execution":{"iopub.execute_input":"2023-08-14T10:17:36.764917Z","iopub.status.busy":"2023-08-14T10:17:36.763427Z","iopub.status.idle":"2023-08-14T10:17:37.175617Z","shell.execute_reply":"2023-08-14T10:17:37.174423Z"},"papermill":{"duration":0.431343,"end_time":"2023-08-14T10:17:37.177862","exception":false,"start_time":"2023-08-14T10:17:36.746519","status":"completed"},"tags":[]},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Extracting Matching Prompt-Sentence Pairs","metadata":{}},{"cell_type":"code","source":"## Parameter to determine how many relevant sentences to include\nNUM_SENTENCES_INCLUDE = 5\n\n## List containing just Context\ncontexts = []\n\nfor r in tqdm(trn.itertuples(), total=len(trn)):\n\n prompt_id = r.Index\n\n prompt_indices = processed_wiki_text_data[processed_wiki_text_data['document_id'].isin(wikipedia_file_data[wikipedia_file_data['prompt_id']==prompt_id]['id'].values)].index.values\n\n if prompt_indices.shape[0] > 0:\n prompt_index = faiss.index_factory(wiki_data_embeddings.shape[1], \"Flat\")\n prompt_index.add(wiki_data_embeddings[prompt_indices])\n\n context = \"\"\n \n ## Get the top matches\n ss, ii = prompt_index.search(question_embeddings, NUM_SENTENCES_INCLUDE)\n for _s, _i in zip(ss[prompt_id], ii[prompt_id]):\n context += processed_wiki_text_data.loc[prompt_indices]['text'].iloc[_i] + \" \"\n \n contexts.append(context)","metadata":{"execution":{"iopub.execute_input":"2023-08-14T10:17:37.244626Z","iopub.status.busy":"2023-08-14T10:17:37.243963Z","iopub.status.idle":"2023-08-14T10:17:38.833828Z","shell.execute_reply":"2023-08-14T10:17:38.832943Z"},"papermill":{"duration":1.609553,"end_time":"2023-08-14T10:17:38.836268","exception":false,"start_time":"2023-08-14T10:17:37.226715","status":"completed"},"tags":[]},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"trn['context'] = 
contexts","metadata":{"execution":{"iopub.execute_input":"2023-08-14T10:17:38.871435Z","iopub.status.busy":"2023-08-14T10:17:38.870861Z","iopub.status.idle":"2023-08-14T10:17:38.875890Z","shell.execute_reply":"2023-08-14T10:17:38.874711Z"},"papermill":{"duration":0.024188,"end_time":"2023-08-14T10:17:38.878394","exception":false,"start_time":"2023-08-14T10:17:38.854206","status":"completed"},"tags":[]},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"trn[[\"prompt\", \"context\", \"A\", \"B\", \"C\", \"D\", \"E\"]].to_csv(\"./test_context.csv\", index=False)","metadata":{"execution":{"iopub.execute_input":"2023-08-14T10:17:38.910233Z","iopub.status.busy":"2023-08-14T10:17:38.909938Z","iopub.status.idle":"2023-08-14T10:17:38.942436Z","shell.execute_reply":"2023-08-14T10:17:38.941611Z"},"papermill":{"duration":0.050945,"end_time":"2023-08-14T10:17:38.944423","exception":false,"start_time":"2023-08-14T10:17:38.893478","status":"completed"},"tags":[]},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":"# Inference","metadata":{"papermill":{"duration":0.015828,"end_time":"2023-08-14T10:17:39.007683","exception":false,"start_time":"2023-08-14T10:17:38.991855","status":"completed"},"tags":[]}},{"cell_type":"code","source":"model_dir = \"/kaggle/input/llm-se-debertav3-large\"","metadata":{"execution":{"iopub.execute_input":"2023-08-14T10:17:39.041941Z","iopub.status.busy":"2023-08-14T10:17:39.041059Z","iopub.status.idle":"2023-08-14T10:17:39.045484Z","shell.execute_reply":"2023-08-14T10:17:39.044554Z"},"papermill":{"duration":0.023448,"end_time":"2023-08-14T10:17:39.047454","exception":false,"start_time":"2023-08-14T10:17:39.024006","status":"completed"},"tags":[]},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"from dataclasses import dataclass\nfrom typing import Optional, Union\n\nimport torch\nimport numpy as np\nimport pandas as pd\nfrom datasets import Dataset\nfrom transformers import AutoTokenizer\nfrom transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer\nfrom transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy","metadata":{"execution":{"iopub.execute_input":"2023-08-14T10:17:39.080034Z","iopub.status.busy":"2023-08-14T10:17:39.079258Z","iopub.status.idle":"2023-08-14T10:17:39.548602Z","shell.execute_reply":"2023-08-14T10:17:39.547612Z"},"papermill":{"duration":0.488286,"end_time":"2023-08-14T10:17:39.551173","exception":false,"start_time":"2023-08-14T10:17:39.062887","status":"completed"},"tags":[]},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"test_df = pd.read_csv(\"test_context.csv\")\ntest_df.index = list(range(len(test_df)))\ntest_df.id = list(range(len(test_df)))\ntest_df[\"prompt\"] = test_df[\"context\"] + \" #### \" + test_df[\"prompt\"]","metadata":{"execution":{"iopub.execute_input":"2023-08-14T10:17:39.585546Z","iopub.status.busy":"2023-08-14T10:17:39.584824Z","iopub.status.idle":"2023-08-14T10:17:39.602541Z","shell.execute_reply":"2023-08-14T10:17:39.601426Z"},"papermill":{"duration":0.037633,"end_time":"2023-08-14T10:17:39.605345","exception":false,"start_time":"2023-08-14T10:17:39.567712","status":"completed"},"tags":[]},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"test_df['answer'] = 'A'\ntest_ds = 
Dataset.from_pandas(test_df)","metadata":{"execution":{"iopub.execute_input":"2023-08-14T10:17:39.638617Z","iopub.status.busy":"2023-08-14T10:17:39.637799Z","iopub.status.idle":"2023-08-14T10:17:39.649653Z","shell.execute_reply":"2023-08-14T10:17:39.648811Z"},"papermill":{"duration":0.030584,"end_time":"2023-08-14T10:17:39.651668","exception":false,"start_time":"2023-08-14T10:17:39.621084","status":"completed"},"tags":[]},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"tokenizer = AutoTokenizer.from_pretrained(model_dir)\nmodel = AutoModelForMultipleChoice.from_pretrained(model_dir)\nmodel.eval()","metadata":{"execution":{"iopub.execute_input":"2023-08-14T10:17:39.684625Z","iopub.status.busy":"2023-08-14T10:17:39.683835Z","iopub.status.idle":"2023-08-14T10:18:01.024594Z","shell.execute_reply":"2023-08-14T10:18:01.022741Z"},"papermill":{"duration":21.360878,"end_time":"2023-08-14T10:18:01.027859","exception":false,"start_time":"2023-08-14T10:17:39.666981","status":"completed"},"tags":[]},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# index_to_option is defined in the cell below; run that cell before calling this function\ndef predictions_to_map_output(predictions):\n    sorted_answer_indices = np.argsort(-predictions)\n    top_answer_indices = sorted_answer_indices[:,:3] # take the three highest-scoring options per question\n    top_answers = np.vectorize(index_to_option.get)(top_answer_indices)\n    return np.apply_along_axis(lambda row: ' '.join(row), 1, top_answers)","metadata":{"execution":{"iopub.execute_input":"2023-08-14T10:18:01.079858Z","iopub.status.busy":"2023-08-14T10:18:01.079031Z","iopub.status.idle":"2023-08-14T10:18:01.085479Z","shell.execute_reply":"2023-08-14T10:18:01.084471Z"},"papermill":{"duration":0.033179,"end_time":"2023-08-14T10:18:01.087489","exception":false,"start_time":"2023-08-14T10:18:01.054310","status":"completed"},"tags":[]},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# We'll create a dictionary to convert option names (A, B, C, D, E) into indices and back again\noptions = 'ABCDE'\nindices = list(range(5))\n\noption_to_index = {option: index for option, index in zip(options, indices)}\nindex_to_option = {index: option for option, index in zip(options, indices)}\n\ndef preprocess(example):\n    # The AutoModelForMultipleChoice class expects a set of question/answer pairs\n    # so we'll copy our question 5 times before tokenizing\n    first_sentence = [example['prompt']] * 5\n    second_sentence = []\n    for option in options:\n        second_sentence.append(example[option])\n    # Our tokenizer will turn our text into token IDs BERT can understand\n    tokenized_example = tokenizer(first_sentence, second_sentence, truncation=True)\n    tokenized_example['label'] = option_to_index[example['answer']]\n    return tokenized_example","metadata":{"execution":{"iopub.execute_input":"2023-08-14T10:18:01.120666Z","iopub.status.busy":"2023-08-14T10:18:01.119926Z","iopub.status.idle":"2023-08-14T10:18:01.127389Z","shell.execute_reply":"2023-08-14T10:18:01.126425Z"},"papermill":{"duration":0.026162,"end_time":"2023-08-14T10:18:01.129276","exception":false,"start_time":"2023-08-14T10:18:01.103114","status":"completed"},"tags":[]},"execution_count":null,"outputs":[]},
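{"cell_type":"markdown","source":"To see the structure the multiple-choice head consumes, the cell below is a minimal sketch that runs `preprocess` on a single row of `test_df` and prints the five candidate sequences it produces.","metadata":{}},{"cell_type":"code","source":"#Sketch: a multiple-choice model sees 5 candidate sequences (prompt + option) per question.\nsample = preprocess(test_df.iloc[0].to_dict())\nprint(len(sample['input_ids']), \"choices\")\nprint([len(ids) for ids in sample['input_ids']], \"tokens per choice\")","metadata":{"trusted":true},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"@dataclass\nclass DataCollatorForMultipleChoice:\n    tokenizer: PreTrainedTokenizerBase\n    padding: Union[bool, str, PaddingStrategy] = True\n    max_length: Optional[int] = None\n    pad_to_multiple_of: Optional[int] = None\n\n    def __call__(self, features):\n        label_name = \"label\" if 'label' in features[0].keys() else 'labels'\n        labels = 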
[feature.pop(label_name) for feature in features]\n batch_size = len(features)\n num_choices = len(features[0]['input_ids'])\n flattened_features = [\n [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features\n ]\n flattened_features = sum(flattened_features, [])\n \n batch = self.tokenizer.pad(\n flattened_features,\n padding=self.padding,\n max_length=self.max_length,\n pad_to_multiple_of=self.pad_to_multiple_of,\n return_tensors='pt',\n )\n batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}\n batch['labels'] = torch.tensor(labels, dtype=torch.int64)\n return batch","metadata":{"execution":{"iopub.execute_input":"2023-08-14T10:18:01.163853Z","iopub.status.busy":"2023-08-14T10:18:01.162905Z","iopub.status.idle":"2023-08-14T10:18:01.173494Z","shell.execute_reply":"2023-08-14T10:18:01.172483Z"},"papermill":{"duration":0.030447,"end_time":"2023-08-14T10:18:01.175589","exception":false,"start_time":"2023-08-14T10:18:01.145142","status":"completed"},"tags":[]},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"trainer = Trainer(\n model=model,\n tokenizer=tokenizer,\n data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer)\n)","metadata":{"execution":{"iopub.execute_input":"2023-08-14T10:18:01.209840Z","iopub.status.busy":"2023-08-14T10:18:01.209548Z","iopub.status.idle":"2023-08-14T10:18:01.683103Z","shell.execute_reply":"2023-08-14T10:18:01.682032Z"},"papermill":{"duration":0.493618,"end_time":"2023-08-14T10:18:01.685989","exception":false,"start_time":"2023-08-14T10:18:01.192371","status":"completed"},"tags":[]},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"tokenized_test_ds = test_ds.map(preprocess, batched=False, remove_columns=['prompt', 'A', 'B', 'C', 'D', 'E', 'answer'])","metadata":{"execution":{"iopub.execute_input":"2023-08-14T10:18:01.719768Z","iopub.status.busy":"2023-08-14T10:18:01.718895Z","iopub.status.idle":"2023-08-14T10:18:02.802181Z","shell.execute_reply":"2023-08-14T10:18:02.801216Z"},"papermill":{"duration":1.101895,"end_time":"2023-08-14T10:18:02.804298","exception":false,"start_time":"2023-08-14T10:18:01.702403","status":"completed"},"tags":[]},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"test_predictions = trainer.predict(tokenized_test_ds)","metadata":{"execution":{"iopub.execute_input":"2023-08-14T10:18:02.839409Z","iopub.status.busy":"2023-08-14T10:18:02.837848Z","iopub.status.idle":"2023-08-14T10:19:17.681195Z","shell.execute_reply":"2023-08-14T10:19:17.680061Z"},"papermill":{"duration":74.862571,"end_time":"2023-08-14T10:19:17.683318","exception":false,"start_time":"2023-08-14T10:18:02.820747","status":"completed"},"tags":[]},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"submission_df = pd.DataFrame({\"id\": np.arange(len(test_df))})\nsubmission_df['prediction'] = predictions_to_map_output(test_predictions.predictions)\n\nsubmission_df.head()","metadata":{"execution":{"iopub.execute_input":"2023-08-14T10:19:17.717465Z","iopub.status.busy":"2023-08-14T10:19:17.717133Z","iopub.status.idle":"2023-08-14T10:19:17.731009Z","shell.execute_reply":"2023-08-14T10:19:17.729803Z"},"papermill":{"duration":0.033576,"end_time":"2023-08-14T10:19:17.733491","exception":false,"start_time":"2023-08-14T10:19:17.699915","status":"completed"},"tags":[]},"execution_count":null,"outputs":[]},{"cell_type":"code","source":"# Once we write our submission file we're good to submit!\nsubmission_df.to_csv('submission.csv', 
index=False)","metadata":{"execution":{"iopub.execute_input":"2023-08-14T10:19:17.769853Z","iopub.status.busy":"2023-08-14T10:19:17.769003Z","iopub.status.idle":"2023-08-14T10:19:17.775472Z","shell.execute_reply":"2023-08-14T10:19:17.774541Z"},"papermill":{"duration":0.026091,"end_time":"2023-08-14T10:19:17.777618","exception":false,"start_time":"2023-08-14T10:19:17.751527","status":"completed"},"tags":[]},"execution_count":null,"outputs":[]}]} --------------------------------------------------------------------------------