├── README.md
└── Multilingual_Farmer's_bot.ipynb

/README.md:
--------------------------------------------------------------------------------
Multilingual Audio-based Farmer's Helper Bot with Intel's OpenVINO and Gemma

Open-sourcing large language models (LLMs) broadens global access to AI technology; the next big AI innovation could well come from someone without access to large-scale computing resources. AI application development also allows flexibility in infrastructure choice: the combination of widely accessible CPUs, scalability, and open-source LLM licensing greatly supports AI progress.

This project walks through fine-tuning Gemma and leveraging OpenVINO quantization to build a multilingual, audio-based helper bot for farmers.

Blog Link - https://usharengaraju.medium.com/multilingual-audio-based-farmers-helper-bot-with-intel-s-openvino-and-gemma-4145c4982c4a
--------------------------------------------------------------------------------

/Multilingual_Farmer's_bot.ipynb:
--------------------------------------------------------------------------------
{
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "cells": [
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "UOzUoYDp6jzc"
      },
      "outputs": [],
      "source": [
        "!pip install torch==2.0.1\n",
        "!pip install transformers\n",
        "!pip install bitsandbytes\n",
        "!pip install peft\n",
        "!pip install accelerate\n",
        "!pip install datasets\n",
        "!pip install trl\n",
        "!pip install einops\n",
        "!pip install scipy\n",
        "!pip install --upgrade openvino-nightly"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "qTU_te7m6p7R"
      },
      "outputs": [],
      "source": [
        "import os\n",
        "from huggingface_hub import login\n",
        "\n",
        "# Authenticate with a Hugging Face access token (required to download Gemma).\n",
        "login(os.environ[\"HF_TOKEN\"])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "hO2_z0lK6r__"
      },
      "outputs": [],
      "source": [
        "from datasets import load_dataset\n",
        "from trl import SFTTrainer\n",
        "from transformers import (\n",
        "    AutoModelForCausalLM,\n",
        "    AutoTokenizer,\n",
        "    TrainingArguments)\n",
        "\n",
        "# questionsv4.csv is expected to contain \"questions\" and \"answers\" columns.\n",
        "file_path = \"questionsv4.csv\"\n",
        "dataset = load_dataset(\"csv\", data_files={\"train\": file_path})\n",
        "model_name = \"google/gemma-2b\""
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "CSGC9g5l6wzy"
      },
      "outputs": [],
      "source": [
        "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
        "tokenizer.pad_token = tokenizer.eos_token\n",
        "model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)"
      ]
    },
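    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "The notebook installs `peft` but fine-tunes all of Gemma's weights. On memory-constrained machines a LoRA adapter could be trained instead; the cell below is a hypothetical sketch (not part of the original pipeline) of settings that could be passed to the `SFTTrainer` further down via `peft_config=lora_config`."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "from peft import LoraConfig\n",
        "\n",
        "# Hypothetical adapter settings -- illustrative only, tune for your hardware.\n",
        "# Gemma's attention projections are named q_proj/k_proj/v_proj/o_proj.\n",
        "lora_config = LoraConfig(\n",
        "    r=8,\n",
        "    lora_alpha=16,\n",
        "    lora_dropout=0.05,\n",
        "    target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\"],\n",
        "    task_type=\"CAUSAL_LM\",\n",
        ")"
      ]
    },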
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "u6A5YaHA61Av"
      },
      "outputs": [],
      "source": [
        "prompt = \"\"\"Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n",
        "\n",
        "### Instruction:\n",
        "{}\n",
        "\n",
        "### Input:\n",
        "{}\n",
        "\n",
        "### Response:\n",
        "{}\"\"\"\n",
        "\n",
        "EOS_TOKEN = tokenizer.eos_token\n",
        "\n",
        "def formatting_prompts_func(examples):\n",
        "    instructions = examples[\"questions\"]\n",
        "    # The CSV has no separate context column, so the input field stays blank.\n",
        "    inputs = [\" \"] * len(instructions)\n",
        "    outputs = examples[\"answers\"]\n",
        "    texts = []\n",
        "    for instruction, inp, output in zip(instructions, inputs, outputs):\n",
        "        # EOS_TOKEN must be appended, otherwise generation will go on forever!\n",
        "        texts.append(prompt.format(instruction, inp, output) + EOS_TOKEN)\n",
        "    return {\"text\": texts}\n",
        "\n",
        "dataset = dataset.map(formatting_prompts_func, batched=True)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "fe9IYYmb6115"
      },
      "outputs": [],
      "source": [
        "# Train on CPU with Intel Extension for PyTorch (IPEX), using bfloat16.\n",
        "training_arguments = TrainingArguments(\n",
        "    output_dir=\"./results\",\n",
        "    bf16=True,\n",
        "    use_ipex=True,\n",
        "    no_cuda=True,\n",
        "    fp16_full_eval=False,\n",
        ")"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "iQ4CLg6h63q8"
      },
      "outputs": [],
      "source": [
        "trainer = SFTTrainer(\n",
        "    model=model,\n",
        "    train_dataset=dataset[\"train\"],\n",
        "    dataset_text_field=\"text\",\n",
        "    max_seq_length=512,\n",
        "    tokenizer=tokenizer,\n",
        "    args=training_arguments,\n",
        "    packing=True,\n",
        ")\n",
        "\n",
        "trainer.train()"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "K0hIPebY66HD"
      },
      "outputs": [],
      "source": [
        "new_model = \"finetuned-gemma\"\n",
        "trainer.model.save_pretrained(new_model)"
      ]
    },
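    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "The inference cells below load the fine-tuned weights from the Hugging Face Hub as `tensorgirl/finetuned-gemma`, but the upload step is not shown in the original notebook. A minimal sketch (assuming write access to the target repo) would be:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Hypothetical upload step -- substitute a repo id under your own namespace.\n",
        "trainer.model.push_to_hub(\"tensorgirl/finetuned-gemma\")\n",
        "tokenizer.push_to_hub(\"tensorgirl/finetuned-gemma\")"
      ]
    },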
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "EFMhMKoL67uL"
      },
      "outputs": [],
      "source": [
        "import numpy as np\n",
        "import torch\n",
        "import transformers\n",
        "from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM, pipeline\n",
        "\n",
        "new_model = \"tensorgirl/finetuned-gemma\"\n",
        "model = AutoModelForCausalLM.from_pretrained(new_model, trust_remote_code=True)\n",
        "tokenizer = AutoTokenizer.from_pretrained(new_model, trust_remote_code=True)\n",
        "tokenizer.pad_token = tokenizer.eos_token\n",
        "\n",
        "generator = transformers.pipeline(\n",
        "    \"text-generation\",\n",
        "    model=model,\n",
        "    tokenizer=tokenizer,\n",
        "    torch_dtype=torch.bfloat16,\n",
        "    trust_remote_code=True,\n",
        "    device_map=\"auto\",\n",
        ")\n",
        "\n",
        "# NLLB-200 translates between English and the Indic languages. Kept under\n",
        "# separate names so the Gemma tokenizer above is not shadowed.\n",
        "nllb_model = AutoModelForSeq2SeqLM.from_pretrained(\"facebook/nllb-200-distilled-600M\")\n",
        "nllb_tokenizer = AutoTokenizer.from_pretrained(\"facebook/nllb-200-distilled-600M\")\n",
        "device = 0 if torch.cuda.is_available() else -1\n",
        "\n",
        "def translate(text, src_lang, tgt_lang):\n",
        "    translation_pipeline = pipeline(\"translation\", model=nllb_model, tokenizer=nllb_tokenizer, src_lang=src_lang, tgt_lang=tgt_lang, max_length=400, device=device)\n",
        "    return translation_pipeline(text)[0][\"translation_text\"]\n",
        "\n",
        "def preprocess(audio):\n",
        "    # Gradio-style audio input: a (sample_rate, samples) tuple, normalised to [-1, 1].\n",
        "    sr, y = audio\n",
        "    y = y.astype(np.float32)\n",
        "    y /= np.max(np.abs(y))\n",
        "    return {\"sampling_rate\": sr, \"raw\": y}\n",
        "\n",
        "def English(audio):\n",
        "    transcriber = pipeline(\"automatic-speech-recognition\", model=\"openai/whisper-base.en\")\n",
        "    return transcriber(preprocess(audio))[\"text\"]\n",
        "\n",
        "def Hindi(audio):\n",
        "    transcriber = pipeline(\"automatic-speech-recognition\", model=\"theainerd/Wav2Vec2-large-xlsr-hindi\")\n",
        "    text = transcriber(preprocess(audio))[\"text\"]\n",
        "    return translate(text, \"hin_Deva\", \"eng_Latn\")\n",
        "\n",
        "def Telugu(audio):\n",
        "    transcriber = pipeline(\"automatic-speech-recognition\", model=\"anuragshas/wav2vec2-large-xlsr-53-telugu\")\n",
        "    text = transcriber(preprocess(audio))[\"text\"]\n",
        "    return translate(text, \"tel_Telu\", \"eng_Latn\")\n",
        "\n",
        "def Tamil(audio):\n",
        "    transcriber = pipeline(\"automatic-speech-recognition\", model=\"Harveenchadha/vakyansh-wav2vec2-tamil-tam-250\")\n",
        "    text = transcriber(preprocess(audio))[\"text\"]\n",
        "    return translate(text, \"tam_Taml\", \"eng_Latn\")\n",
        "\n",
        "def Kannada(audio):\n",
        "    transcriber = pipeline(\"automatic-speech-recognition\", model=\"vasista22/whisper-kannada-medium\")\n",
        "    text = transcriber(preprocess(audio))[\"text\"]\n",
        "    return translate(text, \"kan_Knda\", \"eng_Latn\")\n",
        "\n",
        "def predict(audio, language):\n",
        "    # Transcribe (and, for non-English input, translate to English).\n",
        "    handlers = {\"English\": English, \"Hindi\": Hindi, \"Telugu\": Telugu, \"Tamil\": Tamil, \"Kannada\": Kannada}\n",
        "    message = handlers[language](audio)\n",
        "    print(message)\n",
        "\n",
        "    sequences = generator(\n",
        "        message,\n",
        "        max_length=200,\n",
        "        do_sample=False,  # greedy decoding\n",
        "        num_return_sequences=1,\n",
        "        eos_token_id=tokenizer.eos_token_id,\n",
        "    )\n",
        "\n",
        "    answer = \" \".join(seq[\"generated_text\"] for seq in sequences)\n",
        "    print(answer)\n",
        "\n",
        "    # Translate the English answer back into the user's language.\n",
        "    if language == \"English\":\n",
        "        return answer\n",
        "    if language == \"Hindi\":\n",
        "        return translate(answer, \"eng_Latn\", \"hin_Deva\")\n",
        "    if language == \"Telugu\":\n",
        "        return translate(answer, \"eng_Latn\", \"tel_Telu\")\n",
        "    if language == \"Tamil\":\n",
        "        return translate(answer, \"eng_Latn\", \"tam_Taml\")\n",
        "    if language == \"Kannada\":\n",
        "        return translate(answer, \"eng_Latn\", \"kan_Knda\")\n",
        "    return answer"
      ]
    },
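    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "`predict()` takes Gradio-style numpy audio plus a language name, so it plugs straight into a web UI. The original notebook stops short of this step; the cell below is a minimal sketch assuming `gradio` is installed (`pip install gradio`)."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "import gradio as gr\n",
        "\n",
        "# gr.Audio with type=\"numpy\" yields the (sample_rate, samples) tuple predict() expects.\n",
        "demo = gr.Interface(\n",
        "    fn=predict,\n",
        "    inputs=[\n",
        "        gr.Audio(sources=[\"microphone\"], type=\"numpy\"),\n",
        "        gr.Dropdown(choices=[\"English\", \"Hindi\", \"Telugu\", \"Tamil\", \"Kannada\"], label=\"Language\"),\n",
        "    ],\n",
        "    outputs=\"text\",\n",
        ")\n",
        "demo.launch()"
      ]
    },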
"\n", 303 | " if language == \"Telegu\":\n", 304 | " return translate(answer,\"eng_Latn\", \"tel_Telu\")\n", 305 | "\n", 306 | " if language == \"Tamil\":\n", 307 | " return translate(answer, \"eng_Latn\", \"tam_Taml\")\n", 308 | "\n", 309 | " if language == \"Kannada\":\n", 310 | " return translate(answer, \"eng_Latn\", \"kan_Knda\")\n", 311 | "\n", 312 | " return answer" 313 | ], 314 | "metadata": { 315 | "id": "EFMhMKoL67uL" 316 | }, 317 | "execution_count": null, 318 | "outputs": [] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "source": [ 323 | "from transformers import AutoTokenizer, pipeline\n", 324 | "from optimum.intel import OVModelForCausalLM, OVWeightQuantizationConfig\n", 325 | "\n", 326 | "model_id = \"tensorgirl/finetuned-gemma\"\n", 327 | "\n", 328 | "# Create the quantization configuration with desired quantization parameters\n", 329 | "q_config = OVWeightQuantizationConfig(bits=4, group_size=128, ratio=0.8)\n", 330 | "\n", 331 | "# Create OpenVINO configuration with optimal settings for this model\n", 332 | "ov_config = {\"PERFORMANCE_HINT\": \"LATENCY\", \"CACHE_DIR\": \"model_cache\", \"INFERENCE_PRECISION_HINT\": \"f32\"}\n", 333 | "\n", 334 | "tokenizer = AutoTokenizer.from_pretrained(model_id)\n", 335 | "model = OVModelForCausalLM.from_pretrained(\n", 336 | " model_id,\n", 337 | " export=True,\n", 338 | " quantization_config=q_config,\n", 339 | " device=device,\n", 340 | " ov_config=ov_config,\n", 341 | " )" 342 | ], 343 | "metadata": { 344 | "id": "NyzmSUJP7DnG" 345 | }, 346 | "execution_count": null, 347 | "outputs": [] 348 | } 349 | ] 350 | } --------------------------------------------------------------------------------