├── .gitignore ├── Makefile ├── README.md ├── module_01 ├── 01_introduction.pdf ├── 02_getting_started.ipynb ├── 03_explore_transformers.ipynb ├── assets │ ├── banner_notebook_1.jpg │ ├── bert_models_layout_notebook_3.jpeg │ ├── cbow_arch_notebook_1.png │ ├── encoder_decoder_notebook_3.png │ ├── img_2_notebook_1.jpg │ ├── karpathy_emoji_tokenizer.jpeg │ ├── lm_training_notebook_3.png │ ├── multihead_attention_notebook_3.png │ ├── positional_emb_notebook_3.png │ ├── skipgram_arch_notebook_1.png │ └── transformer_arch_notebook_3.png ├── sherlock_homes.txt └── solutions │ ├── 02_getting_started.ipynb │ ├── 03_explore_transformers.ipynb │ ├── norm_corpus.txt │ └── sherlock_homes.txt ├── module_02 ├── 01_llm_overview.pdf ├── 02_simple_text_generator.ipynb ├── assets │ └── beamsearch_nb_2.png └── solutions │ └── 02_simple_text_generator.ipynb ├── module_03 ├── 01_llm_training_and_scaling.ipynb ├── 02_instruction_tuning_llama_t2sql.ipynb ├── 03_RLHF_phi2.ipynb ├── assets │ ├── chinchilla.png │ ├── cost_tweet.png │ ├── instruct_gpt_rlhf.png │ ├── lora_1.png │ ├── quantization.png │ ├── scaling_laws.png │ ├── soft_prompting_1.png │ ├── soft_prompting_2.png │ └── soft_prompting_perf.png ├── solutions │ ├── 01_llm_training_and_scaling.ipynb │ └── utils.py └── utils.py ├── module_04 ├── 01_prompt_engineeering_and_langchain.ipynb ├── 02_vector_databases_hf_inference_endpoint.ipynb ├── 03_OpenSource_ClosedSource_LLMs.ipynb ├── 04_retrieval_augmented_llm_app.ipynb ├── 05_dspy_demo.ipynb ├── 06_supercharge_llm_apps.ipynb ├── app.py ├── assets │ ├── chroma_workflow.png │ ├── cot_few_shot.png │ ├── dspy_banner.png │ ├── dspy_logo.png │ ├── dspy_workflow.png │ ├── langchain_workflow.png │ ├── langfuse_dashboard.png │ ├── langfuse_traces.png │ ├── llama_setup_1.png │ ├── llama_setup_2.png │ ├── llama_setup_3.png │ ├── llama_setup_4.png │ ├── llama_setup_5.png │ ├── mteb.png │ ├── pe_banner.jpg │ ├── pe_types.jpg │ ├── prompt_hacking_reddit.png │ ├── prompt_workflow.png │ ├── rap_banner.jpeg │ ├── react_sample.png │ ├── training_is_hard.png │ └── vector_banner.jpg ├── constants.py ├── llm_material.txt ├── scraper_utils.py └── utils.py ├── module_05 └── whats_next.pdf ├── poetry.lock ├── pyproject.toml ├── requirements.txt └── workshop_introduction.pdf /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | *crswap* 11 | .DS_Store 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | cover/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | .pybuilder/ 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | # For a library or package, you might want to ignore these files since the code is 89 | # intended to run in multiple environments; otherwise, check them in: 90 | # .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # poetry 100 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 101 | # This is especially recommended for binary packages to ensure reproducibility, and is more 102 | # commonly ignored for libraries. 103 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 104 | #poetry.lock 105 | 106 | # pdm 107 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 108 | #pdm.lock 109 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 110 | # in version control. 111 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 112 | .pdm.toml 113 | .pdm-python 114 | .pdm-build/ 115 | 116 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 117 | __pypackages__/ 118 | 119 | # Celery stuff 120 | celerybeat-schedule 121 | celerybeat.pid 122 | 123 | # SageMath parsed files 124 | *.sage.py 125 | 126 | # Environments 127 | .env 128 | .venv 129 | env/ 130 | venv/ 131 | ENV/ 132 | env.bak/ 133 | venv.bak/ 134 | 135 | # Spyder project settings 136 | .spyderproject 137 | .spyproject 138 | 139 | # Rope project settings 140 | .ropeproject 141 | 142 | # mkdocs documentation 143 | /site 144 | 145 | # mypy 146 | .mypy_cache/ 147 | .dmypy.json 148 | dmypy.json 149 | 150 | # Pyre type checker 151 | .pyre/ 152 | 153 | # pytype static type analyzer 154 | .pytype/ 155 | 156 | # Cython debug symbols 157 | cython_debug/ 158 | 159 | # PyCharm 160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 162 | # and can be added to the global gitignore or merged into this file. For a more nuclear 163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
164 | #.idea/ 165 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: setup runpod_setup tests 2 | 3 | setup: 4 | @echo "Setting up the environment..." 5 | pyenv install 3.11.9 6 | pyenv virtualenv 3.11.9 datahack 7 | pyenv activate datahack 8 | poetry install 9 | 10 | runpod_setup: 11 | @echo "Setting up runpod environment..." 12 | @echo "Step 1 python dependencies..." 13 | pip install -r requirements.txt 14 | @echo "Step 2 nodes/npm dependencies..." 15 | wget -qO- https://raw.githubusercontent.com/nvm-sh/nvm/v0.40.0/install.sh | bash 16 | nvm install node 17 | npm install localtunnel 18 | @echo "Step 2 ollama dependencies..." 19 | curl -fsSL https://ollama.com/install.sh | sh 20 | export OLLAMA_MODELS=/workspace 21 | ollama pull llama3.1:8b 22 | @echo "Done!" 23 | 24 | tests: 25 | @echo "Running tests..." 26 | poetry run pytest --disable-warnings -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LLM Workshop 2024 2 | 3 | > [!IMPORTANT] 4 | > :dart: [DataHack Summit 2024](https://www.analyticsvidhya.com/datahacksummit/workshops/unleashing-llms-training-finetuning-and-evaluating) | :calendar: August 10 2024 | :round_pushpin: Bengaluru, India 5 | 6 | > Explore this comprehensive repository on LLMs, covering everything from the basics of NLP to fine-tuning and even RLHF. If you find the resources helpful, consider giving it a star ⭐ to show your support and help others discover it. 7 | --- 8 | ## Table of Contents 9 | - [Modules](#modules) 10 | - [Prerequisites](#prerequisites) 11 | - [Environment Setup](#environment-setup-instructions) 12 | 13 | --- 14 | 15 | ### Modules 16 | #### Module 1: "Foundations of Generative AI and Language Models" 17 | - [x] Overview of Generative AI and the basics of language modeling. 18 | - [x] :star: Hands-On: 19 | - [x] Getting Started: Text Representation 20 | - [x] Language Modeling Basics and Text Generation using a basic LM. 21 | 22 | #### Module 2: "Building Blocks of LLMs" 23 | - [x] Transformer Architectures: Detailed look into the Transformer architecture that powers modern LLMs. 24 | - [x] GPT Series of Models: Overview of the evolution of GPT models. 25 | - [x] Evaluation Metrics and Benchmarks: Methods to evaluate and benchmark LLM performance. 26 | - [x] :star: Hands-On: Training a mini Transformer model and experimenting with GPT-2 for text generation. 27 | 28 | #### Module 3: "Advanced LLM Techniques" 29 | - [x] Training Process and Scaling Laws: Understand how LLMs are trained and the laws governing their scaling. 30 | - [x] PEFT: Learn Parameter-Efficient Fine-Tuning methods. 31 | - [x] LoRA: Introduction to Low-Rank Adaptation. 32 | - [x] Instruction Tuning: Techniques for fine-tuning models using instructions. 33 | - [x] RLHF: Reinforcement Learning from Human Feedback and its applications. 34 | - [x] :star: Hands-On: 35 | - [x] Instruction Tuning: Text 2 SQL using LLaMA3.1 36 | - [x] RLHF Hands-on: Sentiment aligment for generating movie reviews 37 | 38 | #### Module 4: "Operationalizing LLMs" 39 | - [x] Prompt Engineering: Crafting effective prompts to get desired outputs. 40 | - [x] Prompt Hacking and Backdoors 41 | - [x] Vector Databases: Using vector databases for efficient data retrieval. 42 | - [x] RAGs: Techniques for retrieval-augmented generation. 
43 | - [x] Beyond Prompting: Understanding Frameworks such as DSPY 44 | - [x] :star: Hands-On: 45 | - [x] Implementing basic prompt engineering techniques and 46 | - [x] Building a simple RAG system. 47 | - [x] Hands-on with DSPY 48 | 49 | #### Module 5: "The Future of LLMs and Next Steps" 50 | - Next Steps: Speculative topics on future advancements. 51 | - Beyond: Future possibilities and directions for LLM research. 52 | 53 | --- 54 | 55 | ### Prerequisites 56 | - Basic hands-on experience working with Python 57 | - Basic understanding of linear algebra and machine learning 58 | - Basic understanding of Deep Neural Networks 59 | - Basic hands-on experience with PyTorch 60 | - Access to Google Colab or a similar Python environment 61 | - Access to ChatGPT or Google Bard (free access) 62 | 63 | --- 64 | 65 | ## Environment Setup Instructions 66 | 67 | > [!Important] 68 | > - Follow the steps below for a quick setup. This should work as-is for Mac/Linux based systems. 69 | > - If you already have your own way of managing dependencies, check out pyproject.toml for poetry or requirements.txt for pip-based systems 70 | > - The requirements.txt file is generated using the command ``poetry export --without-hashes --format=requirements.txt > requirements.txt`` 71 | 72 | - **We will make use of**: 73 | - ``pyenv`` for python version management 74 | - ``virtualenv`` for virtual environment management 75 | - ``poetry`` for dependency management 76 | 77 | - **Pyenv**: 78 | - ``brew install pyenv`` or ``curl https://pyenv.run | bash`` 79 | - **VirtualEnv**: 80 | - install: 81 | - ``brew install pyenv-virtualenv`` or 82 | - ``git clone https://github.com/pyenv/pyenv-virtualenv.git $(pyenv root)/plugins/pyenv-virtualenv`` 83 | - add this to your .rc file: ``eval "$(pyenv virtualenv-init -)"`` 84 | - **Poetry**: 85 | - install: 86 | - ``curl -sSL https://install.python-poetry.org | python3 -`` or 87 | - check [here](https://python-poetry.org/docs/#installing-with-the-official-installer) 88 | 89 | - **Setup**: 90 | - Local Mac/Linux: If you have `make` available, simply execute ``make setup``; otherwise: 91 | - RunPod or other similar providers: simply execute ``make runpod_setup``; otherwise: 92 | - If you are using other ways of dependency management: 93 | - Python Environment: 94 | - ``pyenv install 3.11.9`` 95 | - ``pyenv virtualenv 3.11.9 datahack`` 96 | - ``cd `` 97 | - ``pyenv activate datahack`` 98 | - ``poetry install`` <- Make sure the ``pyproject.toml`` file is available in the directory where you execute this command 99 | OR 100 | - use the `requirements.txt` file for reference. 101 | - Setup ``nvm`` / ``node`` and install ``localtunnel`` 102 | 103 | 104 | 105 | 106 | -------------------------------------------------------------------------------- /module_01/01_introduction.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_01/01_introduction.pdf -------------------------------------------------------------------------------- /module_01/02_getting_started.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "iDeUTNIJnpGh" 7 | }, 8 | "source": [ 9 | "# Getting Started : Text Representation\n", 10 | "\n", 11 | "\n", 12 | "\n", 13 | "The NLP domain wasn't always this buzzing with __attention__ and hype that we see today. 
\n", 14 | "The recent progress in this field is built on top of years of amazing work and research. Before we leap onto the current state of things, let us have a quick walk through of how we arrived here. The current NLP systems are standing tall and promising on the shoulders of very solid work from past decades\n" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Import Required Libraries\n", 22 | "\n", 23 | "\n", 24 | " \"Open\n", 25 | "" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": { 32 | "id": "UULjCk9BoIFF" 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "import torch\n", 37 | "import torchtext\n", 38 | "import os\n", 39 | "import collections\n", 40 | "import pandas as pd\n", 41 | "import numpy as np\n", 42 | "import re\n", 43 | "import torchtext \n", 44 | "torchtext.disable_torchtext_deprecation_warning()" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": { 51 | "colab": { 52 | "base_uri": "https://localhost:8080/" 53 | }, 54 | "id": "XtKof0Y2ZX_w", 55 | "outputId": "6af8d2ed-10eb-474f-d6a7-7809f544703e" 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "import nltk\n", 60 | "\n", 61 | "nltk.download('stopwords')\n", 62 | "nltk.download('punkt')" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": { 68 | "id": "621RWhAuokLt" 69 | }, 70 | "source": [ 71 | "### Get Text\n", 72 | "__The Gutenberg Project__ is an amazing project aimed at providing free access to some of the world's most amazing classical works. This makes it a wonderful source of textual data for NLP practitionars to use and improve their understanding of textual data. Ofcourse you can improve your litrary skills too \n", 73 | "\n", 74 | "For this module and workshop in general we will make use of materials available from the project. We begin by downloading the book __\"The Adventures of Sherlock Holmes by Arthur Conan Doyle\"__\n", 75 | "\n", 76 | "\n", 77 | "" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": { 84 | "colab": { 85 | "base_uri": "https://localhost:8080/" 86 | }, 87 | "id": "Lwf8jsuDoeoy", 88 | "outputId": "6f9566d7-9cb8-4eed-ec29-df72828226d9" 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "!wget -O sherlock_homes.txt http://www.gutenberg.org/files/1661/1661-0.txt" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": { 98 | "id": "RrwoWo-Yon-9" 99 | }, 100 | "source": [ 101 | "### Load Data" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": { 108 | "id": "-lKQYgNXonkD" 109 | }, 110 | "outputs": [], 111 | "source": [ 112 | "filename = \"sherlock_homes.txt\"\n", 113 | "file_text = open(filename, 'r', encoding='utf-8').read()\n", 114 | "\n", 115 | "# lower case text to reduce dimensionality\n", 116 | "file_text = file_text#TODO: Lowercase the file text\n", 117 | "\n", 118 | "# We remove first 1450 characters to remove\n", 119 | "# details related to project gutenberg\n", 120 | "raw_text = file_text [1450:]" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": { 126 | "id": "CAxaB81bsI-q" 127 | }, 128 | "source": [ 129 | "### Text Representation\n", 130 | "\n", 131 | "Feature Engineering is often known as the secret sauce to creating superior and better performing machine learning models. Just one excellent feature could be your ticket to winning a Kaggle challenge! 
The importance of feature engineering is even more important for unstructured, textual data because we need to convert free flowing text into some numeric representations which can then be understood by machine learning algorithms.\n", 132 | "\n", 133 | "Since text is mostly available in unstructured form yet very high in dimensionality (how??? :sweat: ), the ability to represent text in the most appropriate way is one of the key ingredients to work in this domain.\n", 134 | "\n", 135 | "\n", 136 | "Let us understand the current dataset at hand by checking the obvious aspects of a textual dataset" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": { 143 | "colab": { 144 | "base_uri": "https://localhost:8080/" 145 | }, 146 | "id": "TXqdpFXGpJuH", 147 | "outputId": "7c9bc0c8-f0ca-4d58-ce39-6e5666c59914" 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "# unique list of characters and total characters in the file\n", 152 | "char_vocab = sorted(set(raw_text))\n", 153 | "\n", 154 | "\n", 155 | "# summarize the loaded data\n", 156 | "n_chars = len(raw_text)\n", 157 | "n_vocab = len(char_vocab)\n", 158 | "print(\"Total Characters: \", n_chars)\n", 159 | "print(\"Total Vocab: \", n_vocab)" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": { 165 | "id": "uSlkgVJIpV_X" 166 | }, 167 | "source": [ 168 | "### Tokenize and Vectorize\n", 169 | "To leverage different algorithms we convert text into numbers that can be represented as tensors.\n", 170 | "\n", 171 | "The first step is to convert text to tokens - tokenization. If we use word-level representation, each word would be represented by its own token. We will use build-in tokenizer from torchtext module" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "import torchtext; torchtext.disable_torchtext_deprecation_warning()" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "# Deprecation notice!\n", 190 | "from torchtext.data import get_tokenizer\n", 191 | "from torchtext.vocab import Vocab" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": { 198 | "id": "sgfwH3lRpU0s" 199 | }, 200 | "outputs": [], 201 | "source": [ 202 | "tokenizer = get_tokenizer('basic_english')" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": { 209 | "colab": { 210 | "base_uri": "https://localhost:8080/" 211 | }, 212 | "id": "RRPUOkuhp168", 213 | "outputId": "2f4935ee-52bb-455d-fc2b-39365fc5793b" 214 | }, 215 | "outputs": [], 216 | "source": [ 217 | "tokens = tokenizer(raw_text[:50])\n", 218 | "print(f'\\Token list:\\n{tokens}')" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": { 224 | "id": "yxlQEcTuqHzJ" 225 | }, 226 | "source": [ 227 | "Now, to convert text to numbers, we will need to build a vocabulary of all tokens." 
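A vocabulary is simply a mapping from every unique token to an integer index. The notebook builds one with torchtext's `Vocab` in the next cells; as a library-free sketch of the same idea (the helper and variable names here are illustrative, not part of the workshop code), the mapping can be derived directly from a token counter:

```python
import collections

def build_vocab(lines, tokenizer):
    """Illustrative sketch: build token -> index and index -> token mappings from strings."""
    counter = collections.Counter()
    for line in lines:
        counter.update(tokenizer(line))
    # most frequent tokens get the smallest indices; index 0 is reserved for unknown tokens
    itos = ["<unk>"] + [tok for tok, _ in counter.most_common()]
    stoi = {tok: idx for idx, tok in enumerate(itos)}
    return stoi, itos

# usage sketch, assuming `raw_text` and `tokenizer` as defined in this notebook:
# stoi, itos = build_vocab(raw_text.split("\n"), tokenizer)
# encoded = [stoi.get(tok, 0) for tok in tokenizer("it is not a love story")]
```

Out-of-vocabulary tokens fall back to the reserved index, which is the same role the `<unk>` special token plays in the torchtext vocabulary used below.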
228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": { 234 | "id": "f2D4CNkAqFRl" 235 | }, 236 | "outputs": [], 237 | "source": [ 238 | "# word level vocab\n", 239 | "word_counter = collections.Counter()\n", 240 | "for line in raw_text.split('\\n'):\n", 241 | " word_counter.update(tokenizer(line))\n", 242 | "word_vocab = Vocab(word_counter)" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": { 249 | "colab": { 250 | "base_uri": "https://localhost:8080/" 251 | }, 252 | "id": "bmO3DBZCq7k3", 253 | "outputId": "dad353f3-9ce5-4f77-ce5b-4962cb20a95d" 254 | }, 255 | "outputs": [], 256 | "source": [ 257 | "# sample lookup at word-level\n", 258 | "#TODO: Print a few tokens with their indices" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": { 265 | "id": "XyEN3x5pqo2K" 266 | }, 267 | "outputs": [], 268 | "source": [ 269 | "# character level vocab\n", 270 | "char2idx = {u:i for i, u in enumerate(char_vocab)}\n", 271 | "idx2char = np.array(char_vocab)\n", 272 | "\n", 273 | "text_as_int = np.array([char2idx[c] for c in raw_text])" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": { 280 | "colab": { 281 | "base_uri": "https://localhost:8080/" 282 | }, 283 | "id": "ZWCe7yetq63L", 284 | "outputId": "5f2d4458-e4bb-432b-a391-9b2f685a1381" 285 | }, 286 | "outputs": [], 287 | "source": [ 288 | "# char level mapping\n", 289 | "print('{')\n", 290 | "for char,_ in zip(char2idx, range(10)):\n", 291 | " print(' {:4s}: {:3d},'.format(repr(char), char2idx[char]))\n", 292 | "print(' ...\\n}')" 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": { 298 | "id": "R2ye2VNRrhyv" 299 | }, 300 | "source": [ 301 | "### Text as Vector\n", 302 | "\n", 303 | "``torchtext`` ``vocab.stoi`` dictionary allows us to convert from a string representation into numbers (``stoi`` -> \"from string to integers).\n", 304 | "\n", 305 | "To convert the text back from a numeric representation into text, we can use the ``vocab.itos`` dictionary to perform reverse lookup:" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": { 312 | "colab": { 313 | "base_uri": "https://localhost:8080/" 314 | }, 315 | "id": "znnAoejUrjP7", 316 | "outputId": "ab987cd3-37b4-4ef5-e201-3dff00571036" 317 | }, 318 | "outputs": [], 319 | "source": [ 320 | "word_vocab_size = len(word_vocab)\n", 321 | "print(f\"Word Vocab size= {word_vocab_size}\")\n", 322 | "\n", 323 | "\n", 324 | "def encode(x):\n", 325 | " return [word_vocab[s] for s in tokenizer(x)]\n", 326 | "\n", 327 | "vec = encode(raw_text[:100])\n", 328 | "print(vec)" 329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "metadata": { 334 | "id": "oXr9EcyCsLzw" 335 | }, 336 | "source": [ 337 | "### Bag Of Words Representation\n", 338 | "\n", 339 | "Bag of Words (BoW) representation is a traditional vector representation of text for NLP tasks. 
Each word/character is linked to a vector index, vector element contains the number of occurrences of a word/character in a given document.\n" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": { 346 | "id": "R5osyuH-vmE9" 347 | }, 348 | "outputs": [], 349 | "source": [ 350 | "def to_bow(text,bow_vocab_size=word_vocab_size):\n", 351 | " res = torch.zeros(bow_vocab_size,dtype=torch.float32)\n", 352 | " for i in encode(text):\n", 353 | " if i" 512 | ] 513 | }, 514 | { 515 | "cell_type": "markdown", 516 | "metadata": { 517 | "id": "yDs_-hoJ3atN" 518 | }, 519 | "source": [ 520 | "### Skip-gram Model\n", 521 | "The Skip-gram model architecture usually tries to achieve the reverse of what the CBOW model does. It tries to predict the __`source context words`__ (surrounding words) given a __`target word`__ (the center word).\n", 522 | "\n", 523 | "Considering our simple sentence from earlier, “the quick brown fox jumps over the lazy dog”. If we used the CBOW model, we get pairs of (context_window, target_word) where if we consider a context window of size 2, we have examples like __([quick, fox], brown)__, __([the, brown], quick)__, __([the, dog], lazy)__ and so on.\n", 524 | "\n", 525 | "Now considering that the skip-gram model’s aim is to predict the context from the target word, the model typically inverts the contexts and targets, and tries to predict each context word from its target word. Hence the task becomes to predict the context [quick, fox] given target word ‘brown’ or [the, brown] given target word ‘quick’ and so on.\n", 526 | "\n", 527 | "Thus the model tries to predict the context_window words based on the target_word.\n", 528 | "\n", 529 | "" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": null, 535 | "metadata": { 536 | "id": "wlPE6usUvNdr" 537 | }, 538 | "outputs": [], 539 | "source": [ 540 | "corpus = ['The sky is blue and beautiful.',\n", 541 | " 'Love this blue and beautiful sky!',\n", 542 | " 'The quick brown fox jumps over the lazy dog.',\n", 543 | " \"A king's breakfast has sausages, ham, bacon, eggs, toast and beans\",\n", 544 | " 'I love green eggs, ham, sausages and bacon!',\n", 545 | " 'The brown fox is quick and the blue dog is lazy!',\n", 546 | " 'The sky is very blue and the sky is very beautiful today',\n", 547 | " 'The dog is lazy but the brown fox is quick!'\n", 548 | "]" 549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": null, 554 | "metadata": { 555 | "colab": { 556 | "base_uri": "https://localhost:8080/" 557 | }, 558 | "id": "HJ32JCdnZQfD", 559 | "outputId": "4d1c7293-e5ac-451f-a75c-b6c64572761f" 560 | }, 561 | "outputs": [], 562 | "source": [ 563 | "stop_words = nltk.corpus.stopwords.words('english')\n", 564 | "\n", 565 | "def normalize_document(doc):\n", 566 | " # lower case and remove special characters\\whitespaces\n", 567 | " doc = re.sub(r'[^a-zA-Z\\s]', '', doc, re.I|re.A)\n", 568 | " doc = doc.lower()\n", 569 | " doc = doc.strip()\n", 570 | " # tokenize document\n", 571 | " tokens = nltk.word_tokenize(doc)\n", 572 | " # filter stopwords out of document\n", 573 | " filtered_tokens = [token for token in tokens if token not in stop_words]\n", 574 | " # re-create document from filtered tokens\n", 575 | " doc = #TODO: Join back the list of tokens as a string. 
\n", 576 | " return doc\n", 577 | "\n", 578 | "normalize_corpus = np.vectorize(normalize_document)\n", 579 | "\n", 580 | "norm_corpus = normalize_corpus(corpus)\n", 581 | "norm_corpus" 582 | ] 583 | }, 584 | { 585 | "cell_type": "markdown", 586 | "metadata": {}, 587 | "source": [ 588 | "## Gensim Framework\n", 589 | "\n", 590 | "The ``gensim`` framework, created by Radim Řehůřek consists of a robust, efficient and scalable implementation of the __Word2Vec__ model. We will leverage the same on our sample toy corpus. In our workflow, we will tokenize our normalized corpus and then focus on the following four parameters in the Word2Vec model to build it.\n", 591 | "\n", 592 | "- vector_size: The word embedding dimensionality\n", 593 | "- window: The context window size\n", 594 | "- min_count: The minimum word count\n", 595 | "- sample: The downsample setting for frequent words\n", 596 | "- sg: Training model, 1 for skip-gram otherwise CBOW\n", 597 | "\n", 598 | "We will build a simple Word2Vec model on the corpus and visualize the embeddings." 599 | ] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "execution_count": null, 604 | "metadata": { 605 | "id": "iHITPzAY2a6b" 606 | }, 607 | "outputs": [], 608 | "source": [ 609 | "from gensim.models import word2vec" 610 | ] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "execution_count": null, 615 | "metadata": { 616 | "colab": { 617 | "base_uri": "https://localhost:8080/" 618 | }, 619 | "id": "SD2BOBlR2ZJN", 620 | "outputId": "c0c52ace-517b-404e-fa9c-d06332424b19" 621 | }, 622 | "outputs": [], 623 | "source": [ 624 | "tokenized_corpus = [tokenizer(line) for line in norm_corpus]\n", 625 | "\n", 626 | "# Set values for various parameters\n", 627 | "feature_size = 15 # Word vector dimensionality\n", 628 | "window_context = 5 # Context window size\n", 629 | "min_word_count = 1 # Minimum word count\n", 630 | "sample = 1e-3 # Downsample setting for frequent words\n", 631 | "sg = 1 # skip-gram model\n", 632 | "\n", 633 | "w2v_model = word2vec.Word2Vec(tokenized_corpus,\n", 634 | " vector_size=feature_size,\n", 635 | " window=window_context,\n", 636 | " min_count = min_word_count,\n", 637 | " sg=sg,\n", 638 | " sample=sample,\n", 639 | " epochs=5000)\n", 640 | "w2v_model" 641 | ] 642 | }, 643 | { 644 | "cell_type": "code", 645 | "execution_count": null, 646 | "metadata": { 647 | "colab": { 648 | "base_uri": "https://localhost:8080/" 649 | }, 650 | "id": "jRs624Df4I4q", 651 | "outputId": "4ae21244-7bd1-44ca-fec6-1301cb8eb0a7" 652 | }, 653 | "outputs": [], 654 | "source": [ 655 | "w2v_model.wv['sky']" 656 | ] 657 | }, 658 | { 659 | "cell_type": "code", 660 | "execution_count": null, 661 | "metadata": {}, 662 | "outputs": [], 663 | "source": [ 664 | "#TODO: Print the vector for the word India" 665 | ] 666 | }, 667 | { 668 | "cell_type": "code", 669 | "execution_count": null, 670 | "metadata": { 671 | "id": "BOqfN1C_35dt" 672 | }, 673 | "outputs": [], 674 | "source": [ 675 | "import scienceplots\n", 676 | "import matplotlib.pyplot as plt\n", 677 | "from sklearn.manifold import TSNE\n", 678 | "plt.style.use(['science','ieee','no-latex'])\n", 679 | "\n", 680 | "%matplotlib inline" 681 | ] 682 | }, 683 | { 684 | "cell_type": "code", 685 | "execution_count": null, 686 | "metadata": { 687 | "colab": { 688 | "base_uri": "https://localhost:8080/", 689 | "height": 683 690 | }, 691 | "id": "AvqivFNy3siE", 692 | "outputId": "e9d8a7e0-c0cc-4715-8ef6-088a4eee1be3" 693 | }, 694 | "outputs": [], 695 | "source": [ 696 | "# visualize embeddings\n", 697 | "words = 
w2v_model.wv.index_to_key\n", 698 | "wvs = w2v_model.wv[words]\n", 699 | "\n", 700 | "tsne = TSNE(n_components=2, random_state=42, n_iter=5000, perplexity=5)\n", 701 | "np.set_printoptions(suppress=True)\n", 702 | "T = tsne.fit_transform(wvs)\n", 703 | "labels = words\n", 704 | "\n", 705 | "plt.figure(figsize=(12, 6))\n", 706 | "plt.scatter(T[:, 0], T[:, 1],)\n", 707 | "for label, x, y in zip(labels, T[:, 0], T[:, 1]):\n", 708 | " plt.annotate(label, xy=(x+1, y+1), xytext=(0, 0), textcoords='offset points')" 709 | ] 710 | }, 711 | { 712 | "cell_type": "code", 713 | "execution_count": null, 714 | "metadata": { 715 | "colab": { 716 | "base_uri": "https://localhost:8080/" 717 | }, 718 | "id": "s8iIJ81tYBb0", 719 | "outputId": "4a9b9080-9dc9-46ab-9f40-e2715c1c5dd2" 720 | }, 721 | "outputs": [], 722 | "source": [ 723 | "w2v_model.wv.most_similar('dog', topn=10)" 724 | ] 725 | }, 726 | { 727 | "cell_type": "markdown", 728 | "metadata": { 729 | "id": "wfnq6B2K4RV2" 730 | }, 731 | "source": [ 732 | "## Similar and Improved Works \n", 733 | "- [GloVe](https://nlp.stanford.edu/pubs/glove.pdf)\n", 734 | "- [FastText](https://arxiv.org/pdf/1607.04606.pdf)\n", 735 | "- [Sent2Vec](https://arxiv.org/abs/1405.4053)\n", 736 | "- X2Vec" 737 | ] 738 | }, 739 | { 740 | "cell_type": "code", 741 | "execution_count": null, 742 | "metadata": {}, 743 | "outputs": [], 744 | "source": [ 745 | "with open(\"norm_corpus.txt\",\"w\") as f:\n", 746 | " for line in norm_corpus:\n", 747 | " f.write(line+'\\n')" 748 | ] 749 | }, 750 | { 751 | "cell_type": "code", 752 | "execution_count": null, 753 | "metadata": {}, 754 | "outputs": [], 755 | "source": [ 756 | "import fasttext\n", 757 | "fasttext_model = fasttext.train_unsupervised('norm_corpus.txt', model='skipgram',epoch=500000,minCount=1,loss='ns')" 758 | ] 759 | }, 760 | { 761 | "cell_type": "code", 762 | "execution_count": null, 763 | "metadata": {}, 764 | "outputs": [], 765 | "source": [ 766 | "fasttext_model.get_word_vector('sky')" 767 | ] 768 | }, 769 | { 770 | "cell_type": "code", 771 | "execution_count": null, 772 | "metadata": {}, 773 | "outputs": [], 774 | "source": [ 775 | "# TODO: Get Vector for India" 776 | ] 777 | }, 778 | { 779 | "cell_type": "code", 780 | "execution_count": null, 781 | "metadata": {}, 782 | "outputs": [], 783 | "source": [ 784 | "# TODO: Identify nearest neighbors for the word breakfast" 785 | ] 786 | }, 787 | { 788 | "cell_type": "markdown", 789 | "metadata": { 790 | "id": "HHRdBNxQ5Ho6" 791 | }, 792 | "source": [ 793 | "### Limitations\n", 794 | "One key limitation of traditional pretrained embedding representations such as Word2Vec is the problem of word sense and removing ambiguity by making them clear. While pretrained embeddings can capture some of the meaning of words in context, every possible meaning of a word is encoded into the same embedding. This can cause problems in downstream models, since many words such as the word 'play' have different meanings depending on the context they are used in.\n", 795 | "\n", 796 | "For example, the word 'play' in these two different sentences have quite different meaning:\n", 797 | "\n", 798 | "- I went to a **play** at the theatre.\n", 799 | "- John wants to **play** with his friends.\n", 800 | "The pretrained embeddings above represent both meanings of the word 'play' in the same embedding. To overcome this limitation, we need to build embeddings based on the language model, which is trained on a large corpus of text, and knows how words can be put together in different contexts." 
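To make the 'play' example above concrete: a contextual model produces a different vector for the same word in each sentence, which a static Word2Vec or fastText embedding cannot do. The sketch below uses the Hugging Face `transformers` library; the model choice and the helper function are illustrative and not part of the workshop code:

```python
import torch
from transformers import AutoTokenizer, AutoModel

tok = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased")

def embedding_of(word, sentence):
    """Return the contextual vector of `word` inside `sentence` (first occurrence)."""
    enc = tok(sentence, return_tensors="pt")
    with torch.no_grad():
        hidden = model(**enc).last_hidden_state[0]          # (seq_len, hidden_dim)
    word_id = tok.convert_tokens_to_ids(word)
    position = (enc["input_ids"][0] == word_id).nonzero()[0].item()
    return hidden[position]

v1 = embedding_of("play", "i went to a play at the theatre .")
v2 = embedding_of("play", "john wants to play with his friends .")
print(torch.cosine_similarity(v1, v2, dim=0))  # typically well below 1.0: the two senses differ
```

The two vectors generally show a cosine similarity noticeably below 1.0, reflecting that the model encodes the theatre sense and the activity sense of 'play' differently.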
801 | ] 802 | }, 803 | { 804 | "cell_type": "markdown", 805 | "metadata": {}, 806 | "source": [ 807 | "---\n", 808 | "\n", 809 | "## Thought Exercise\n", 810 | "\n", 811 | "- We discussed about representing text into tokens and why it is important\n", 812 | "- We discussed about different tokenization methods (character wise, word wise and more...)\n", 813 | "- But does it make any difference for the tokenizer (or even the model) in terms of any meaning of those token?\n", 814 | "\n", 815 | "Probably No. Check this experiment tweeted by [Andrej Karpathy](https://x.com/karpathy/status/1816637781659254908/photo/1)\n", 816 | "\n" 817 | ] 818 | }, 819 | { 820 | "cell_type": "markdown", 821 | "metadata": {}, 822 | "source": [] 823 | } 824 | ], 825 | "metadata": { 826 | "colab": { 827 | "provenance": [] 828 | }, 829 | "kernelspec": { 830 | "display_name": "Python 3 (ipykernel)", 831 | "language": "python", 832 | "name": "python3" 833 | }, 834 | "language_info": { 835 | "codemirror_mode": { 836 | "name": "ipython", 837 | "version": 3 838 | }, 839 | "file_extension": ".py", 840 | "mimetype": "text/x-python", 841 | "name": "python", 842 | "nbconvert_exporter": "python", 843 | "pygments_lexer": "ipython3", 844 | "version": "3.11.9" 845 | } 846 | }, 847 | "nbformat": 4, 848 | "nbformat_minor": 4 849 | } 850 | -------------------------------------------------------------------------------- /module_01/03_explore_transformers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "22ab9deb-d5cd-45d4-a4be-4748d15df4e5", 6 | "metadata": {}, 7 | "source": [ 8 | "# Exploring Transformer Architectures\n", 9 | "\n", 10 | "\n", 11 | " \"Open\n", 12 | "\n" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "id": "5cfd7bf0-01eb-42fa-86a7-0614c6948ed9", 18 | "metadata": {}, 19 | "source": [ 20 | "## The RNN Limitation\n", 21 | "The RNN layer (LSTM, or GRU, etc.) takes in a context window of a defined size as input and encodes all of it into a single vector. This bottleneck vector needs to capture a lot of information in itself before the decoding stage can use it to start generating the next token. To enhance performance of sequence to sequence tasks a typical Encoder-Decoder architecture is the go-to choice.\n", 22 | "\n", 23 | "\n", 24 | "\n", 25 | "Let us consider the case of **Machine Translation**, i.e. translation of English to Spanish (or any other language).\n", 26 | "\n", 27 | "In a typical __Encoder-Decoder__ architecture, the Encoder takes in the input text in English as input and prepares a condensed vector representation of the whole input. Typically termed as bottleneck features. The Decoder then uses these features to generate the translated text in Spanish.\n", 28 | "\n", 29 | "While this architecture and its variants worked wonders, they had issues. Issues such as inability handle longer input sequences, cases where there is not a one to one mapping between input vs output language and so on.\n", 30 | "\n", 31 | "To handle these issues, __Vasvani et. al.__ in their now famouly titled paper __Attention Is All You Need__ build up on the concepts of attention. The main highlight of this work was the Transformer architecture. Transformers were shown to present state of the art results on multiple benchmarks without using any recurrence or convolutional components." 
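The attention mechanism described in the next cells can be summarised in a few lines of code. Below is a minimal sketch of scaled dot-product attention, Attention(Q, K, V) = softmax(QK^T / sqrt(d_k)) V, as defined in the Vaswani et al. paper; the tensor shapes are illustrative and this helper is not part of the workshop code:

```python
import torch
import torch.nn.functional as F

def scaled_dot_product_attention(q, k, v):
    """Attention(Q, K, V) = softmax(Q K^T / sqrt(d_k)) V"""
    d_k = q.size(-1)
    scores = q @ k.transpose(-2, -1) / d_k ** 0.5   # (..., seq_len_q, seq_len_k)
    weights = F.softmax(scores, dim=-1)              # each query attends over all keys
    return weights @ v, weights

# toy usage: batch of 1, sequence of 5 tokens, 64-dimensional heads (illustrative sizes)
q = k = v = torch.randn(1, 5, 64)
out, attn = scaled_dot_product_attention(q, k, v)
print(out.shape, attn.shape)  # torch.Size([1, 5, 64]) torch.Size([1, 5, 5])
```

Multi-head attention, discussed below, runs several independently projected copies of this operation in parallel and concatenates their outputs.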
32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "id": "c97031ae-f2ee-47f5-8873-062bc1da55f7", 37 | "metadata": {}, 38 | "source": [ 39 | "## Transformers\n", 40 | "- The transformer architecture was presented in the seminal paper __Attention is All You Need__ by Vaswani et al. back in 2017\n", 41 | "- A transformer is a __recurrence-__ and __convolution-free__ attention-based encoder-decoder architecture\n", 42 | "- Introduced the concept of multi-head attention and positional encodings\n", 43 | "- Also revolutionalised Computer Vision domain (see ViT)\n", 44 | "\n", 45 | "\n", 46 | "" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "id": "d34a3aab-a33b-4864-a5cc-f175146af5ce", 52 | "metadata": {}, 53 | "source": [ 54 | "## Attention is All you Need ⚠️\n", 55 | "\n", 56 | "\n", 57 | "### Attention to the Rescue\n", 58 | "Attention is one of the most powerful concepts in the deep learning space that really changed the game. The core idea behind the attention mechanism is to make use of all interim hidden states of the RNN to decide which one to focus upon before it is used by the decoding stage. \n", 59 | "\n", 60 | "### Contextual Embeddings\n", 61 | "The [TagLM architecture by Peters et al. in 2017](https://arxiv.org/abs/1705.00108) was one of the first works that provided an insight into how we could combine __pre-trained word embeddings__ with a __pre-trained neural language model__ to generate __context-aware embeddings__ for downstream NLP tasks.\n", 62 | "\n", 63 | "The big breakthrough that changed the NLP landscape came in the form of __ELMo, or Embeddings from Language Models__. The ELMo architecture was presented by Peters et al. in their work titled [__Deep Contextualized Word Representations in 2018__](https://arxiv.org/abs/1802.05365). Without going into too much detail, the main highlights of the ELMo architecture were:\n", 64 | "\n", 65 | "- The model used a bi-LSTM-based language model.\n", 66 | "- Character CNNs were used to generate embeddings, in place of pre-trained word vectors, which made use of huge 4096 LSTM units but transformed into smaller 512-sized vectors using feedforward layers.\n", 67 | "- The main innovation was to make use of all the hidden bi-LSTM layers for generating input representation. Unlike previous works, where only the final LSTM layer was used to fetch the representation of the input, this work took a weighted average of all the hidden layers' hidden states. This helped the model learn contextual word embeddings where each layer contributed to things like syntax and semantics.\n", 68 | "\n", 69 | "### Self-Attention\n", 70 | "- Self-attention was proposed by Cheng et al. in their paper titled Long Short-Term Memory Networks for Machine Reading in 2016\n", 71 | "- Self-attention enables a model to learn the correlation between the current token (character or word or sentence, etc.) and its context window. In other words, it is an attention mechanism that relates different positions of a given sequence so as to generate a representation of the same sequence\n", 72 | "\n", 73 | "### Multi-head Attention\n", 74 | "- Multi-head attention extends the self-attention mechanism by performing multiple parallel self-attention operations, each focusing on different learned linear projections of the input. Multiple attention heads allow the model to capture different types of relationships and learn more fine-grained representations (eg: grammar, context, dependency, etc.)\n", 75 | "\n", 76 | "\n", 77 | "\n", 78 | "> Source: [Vasvani et. 
al.](https://arxiv.org/pdf/1706.03762.pdf)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "id": "cd448c92-cc9c-44c5-a408-d8f4e5706862", 84 | "metadata": {}, 85 | "source": [ 86 | "### Positional Encoding\n", 87 | "Positional encoding is a technique used to incorporate the position of each token in the input sequence. It provides the model with information about the token's position without relying solely on the order of tokens.\n", 88 | "This additional aspect was required because transformers do not have the natural sequential setup of RNNs. In order to provide positional context, any encoding system should ideally have the following properties:\n", 89 | "\n", 90 | "- It should output a unique encoding for each time-step (word’s position in a sentence)\n", 91 | "- Distance between any two time-steps should be consistent across sentences with different lengths.\n", 92 | "- Our model should generalize to longer sentences without any efforts. Its values should be bounded.\n", 93 | "- It must be deterministic.\n", 94 | "\n", 95 | "\n", 96 | "\n", 97 | "\n", 98 | "\n", 99 | "### References\n", 100 | "- [The Illustrated Transformer](https://jalammar.github.io/illustrated-transformer/)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "id": "3f54409c-36e1-4836-8336-a63bcd32d047", 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "import numpy as np\n", 111 | "import scienceplots\n", 112 | "from matplotlib import pyplot as plt\n", 113 | "plt.style.use(['science','no-latex'])" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "id": "bb1d2966-72aa-425c-ac73-51e7263a8bdc", 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "def positional_encoding(pos,i,d_model,is_even=True):\n", 124 | " \"\"\"\n", 125 | " Method to generate positional encoding value\n", 126 | " :param pos: position of the input\n", 127 | " :param i: i-th dimension of the embedding\n", 128 | " :param d_model: length of the embedding vector\n", 129 | " :param is_even: if the position of the input is even or odd\n", 130 | " \"\"\"\n", 131 | " input_val = pos/np.power(10000,(2*i)/d_model)\n", 132 | " if is_even:\n", 133 | " return np.sin(input_val)\n", 134 | " else:\n", 135 | " return np.cos(input_val)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "id": "c6a5c6ec-402e-4d61-af32-1f72815e448b", 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "# input parameters for visualisations\n", 146 | "pos = np.arange(0,10,0.1) #10 input words, stepping at 0.1 for smoothness only\n", 147 | "dimensions = np.arange(0,512) # dimensionality of the positional encoding (same as d_model by default)\n", 148 | "d_model = 512 # length of embedding vector" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "id": "7a8ff1df-72d2-48e2-8d28-dd8dc2dd3a1f", 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "# positional encoding for even positions\n", 159 | "even_pos_emb = [positional_encoding(pos,i,d_model) for i in dimensions] \n", 160 | "\n", 161 | "# positional encoding for off positions\n", 162 | "odd_pos_emb = #TODO: prepare positional embeddings for odd positions" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "id": "fcbea557-8d48-4280-9841-e99a1a754638", 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "show_dim = [0,16,32] # visualise only a few dimensions for 
clarity\n", 173 | "plt.figure(figsize=(15, 5))\n", 174 | "for i in dimensions:\n", 175 | " if i in show_dim:\n", 176 | " plt.plot(pos,even_pos_emb[i])\n", 177 | " plt.plot(pos,odd_pos_emb[i])\n", 178 | " plt.axvline(2,linestyle='--',c='black')\n", 179 | "plt.title(\"Positional Encodings\") \n", 180 | "plt.xlabel(\"input positions\")\n", 181 | "plt.ylabel(\"encoding value\")\n", 182 | "plt.show()" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "id": "aec21692-04c7-44f6-ba07-58d01bffe43e", 188 | "metadata": {}, 189 | "source": [ 190 | "## BERT-ology\n", 191 | "- BERT, or __[Bi-Directional Encoder Representations from Transformers](https://arxiv.org/abs/1810.04805)__, was presented by Devlin et al., a team at Google AI in 2018\n", 192 | "- Multi-task Learning: BERT also helped push the transfer-learning envelope in the NLP domain by showcasing how a pre-trained model can be fine-tuned for various tasks to provide state-of-the-art performance\n", 193 | "- BERT tweaked the usual Language Model objective to only predict next token based on past context by building context from both directions, i.e. the objective of predicting masked words along with next sentence prediction.\n", 194 | "\n", 195 | "\n", 196 | "\n", 197 | "\n", 198 | "> source [PLM Papers](https://github.com/thunlp/PLMpapers)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "id": "f8a38e20-e963-4d1a-b65c-8e09a60a267f", 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "import torch\n", 209 | "import transformers\n", 210 | "from transformers import pipeline" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "id": "fb245c9f-e147-49ef-a69c-e3defa41fae9", 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "# Let us define some configs/constants\n", 221 | "DISTILBET_BASE_UNCASED_CHECKPOINT = \"distilbert/distilbert-base-uncased\"\n", 222 | "DISTILBET_QA_CHECKPOINT = \"distilbert/distilbert-base-uncased-distilled-squad\"\n", 223 | "DISTILBET_CLASSIFICATION_CHECKPOINT = \"distilbert/distilbert-base-uncased-finetuned-sst-2-english\"" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "id": "1a88cc2e-b8fb-48f9-8537-015420b4d902", 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "if torch.cuda.is_available():\n", 234 | " DEVICE = 'cuda'\n", 235 | " Tensor = torch.cuda.FloatTensor\n", 236 | " LongTensor = torch.cuda.LongTensor\n", 237 | " DEVICE_ID = 0\n", 238 | "elif torch.backends.mps.is_available():\n", 239 | " DEVICE = 'mps'\n", 240 | " Tensor = torch.FloatTensor\n", 241 | " LongTensor = torch.LongTensor\n", 242 | " DEVICE_ID = 0\n", 243 | "else:\n", 244 | " DEVICE = 'cpu'\n", 245 | " Tensor = torch.FloatTensor\n", 246 | " LongTensor = torch.LongTensor\n", 247 | " DEVICE_ID = -1\n", 248 | "print(f\"Backend Accelerator Device={DEVICE}\")" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "id": "9f241206-25ed-4021-9457-e1bdd665a029", 254 | "metadata": {}, 255 | "source": [ 256 | "### Predicting the Masked Token\n", 257 | "This was a unique objective when BERT was originally introduced as compared to usual NLP tasks such as classification. The objective requires us to prepare a dataset where we mask a certain percentage of input tokens and train the model to learn to predict those tokens. This objective turns out to be very effective in helping the model learn the nuances of language. 
\n", 258 | "\n", 259 | "In this first task we will test the pre-trained model against this objective itself. The model outputs a bunch of things such as the predicted token, encoded index of the predicted token/word along with a score which indicates the model's confidence." 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "id": "2e3efdda-c31f-438e-a19e-4593570dc323", 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "mlm_pipeline = pipeline(\n", 270 | " 'fill-mask',\n", 271 | " model=DISTILBET_BASE_UNCASED_CHECKPOINT,\n", 272 | " device=DEVICE_ID\n", 273 | ")\n", 274 | "mlm_pipeline(\"Bangalore is the IT [MASK] of India.\")" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "id": "c540cf39-e8d9-412f-b339-49c93faf9723", 280 | "metadata": {}, 281 | "source": [ 282 | "### Question Answering\n", 283 | "This is an interesting NLP task and quite complex one as well. For this task, the model is provided input consisting of the context along with a question and it predicts the answer by selecting text from the context. The training setup for this task is a bit involved process, the following is an overview:\n", 284 | "- The training input as triplet of context, question and answer\n", 285 | "- This is transformed as combined input of the form ``[CLS]question[SEP]context[SEP]`` or ``[CLS]contex[SEP]question[SEP]`` with answer acting as the label\n", 286 | "- The model is trained to predict the start and end indices of the the corresponding answer for each input.\n", 287 | "\n", 288 | "\n", 289 | "For our current setting, we will leverage both _pretrained_ and _fine-tuned_ versions of **DistilBERT** via the _question-answering_ pipeline and understand the performance difference." 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "id": "08ae5a96-fd84-4b11-9943-5dfac261c432", 296 | "metadata": {}, 297 | "outputs": [], 298 | "source": [ 299 | "qa_ft_pipeline = pipeline(\n", 300 | " 'question-answering',\n", 301 | " model=DISTILBET_QA_CHECKPOINT,\n", 302 | " device=DEVICE_ID\n", 303 | ")\n", 304 | "qa_pt_pipeline = pipeline(\n", 305 | " 'question-answering',\n", 306 | " model=#TODO: Set the pretrained \n", 307 | " device=DEVICE_ID\n", 308 | ")" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": null, 314 | "id": "cde485cf-1c21-4f34-a2d2-4d0130854905", 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [ 318 | "# we use a snippet about BERT like models from the module itself\n", 319 | "context = \"\"\"The key contribution from this set of models is the masked language modeling objective during the pre-training phase, where some tokens in the input are masked, and the model is trained to predict them (we will cover these in the upcoming section). 
Key works in this group of architectures are BERT, RoBERTa (or optimized BERT), DistilBERT (lighter and more efficient BERT), ELECTRA and ALBERT.\n", 320 | "In this notebook we will work through the task of Question Answering where our language model will learn to answer questions based on the context provided.\"\"\"\n", 321 | "question = \"What are the key works in this set of models?\"" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "id": "fdf038b7-e8c8-425e-bbd4-026895e73d6f", 328 | "metadata": {}, 329 | "outputs": [], 330 | "source": [ 331 | "ft_qa_result= qa_ft_pipeline(\n", 332 | " question=question,\n", 333 | " context=context\n", 334 | ")\n", 335 | "\n", 336 | "pt_qa_result= qa_pt_pipeline(\n", 337 | " question=question,\n", 338 | " context=context\n", 339 | ")" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "id": "0cd01d95-491a-4c22-afac-833de30b17cd", 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "print(\"*\"*55)\n", 350 | "print(f\"Context:{context}\")\n", 351 | "print(\"*\"*55)\n", 352 | "print(f\"Question:{question}\")\n", 353 | "print(\"-\"*55)\n", 354 | "print(f\"Response from Fine-Tuned Model:\\n{ft_qa_result}\")\n", 355 | "print()\n", 356 | "print(f\"Response from Pretrained Model:\\n{pt_qa_result}\")" 357 | ] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "id": "8dc0ed16-e828-4ede-892e-04622bf82a35", 362 | "metadata": {}, 363 | "source": [ 364 | "# Generative Pretraining" 365 | ] 366 | }, 367 | { 368 | "cell_type": "markdown", 369 | "id": "afd7417f-425a-47ee-8707-7c9872cf8ecc", 370 | "metadata": {}, 371 | "source": [ 372 | "## Behold, its GPT (Generative pre-training)\n", 373 | "\n", 374 | "The first model in this series is called GPT, or Generative Pre-Training. It was released in [2018](https://openai.com/blog/language-unsupervised/), about the same time as the BERT model. The paper presents a task-agnostic architecture based on the ideas of transformers and unsupervised learning.\n", 375 | "\n", 376 | "- GPT is essentially a language model based on the __transformer-decoder__ \n", 377 | "- Introduction of large training datasets: __BookCorpus__ dataset contains over 7,000 unique, unpublished books across different genres\n", 378 | "- The GPT architecture makes use of 12 decoder blocks (as opposed to 6 in the original transformer) with 768-dimensional states and 12 self-attention heads each.\n", 379 | "\n", 380 | "\n", 381 | "### GPT-2\n", 382 | "- Radford et al. presented the GPT-2 model as part of their work titled [Language Models are Unsupervised Multi-task Learners in 2019](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)\n", 383 | "- The model achieves state-of-the-art performance in a few-shot setting\n", 384 | "- Similar to GPT, the secret sauce for GPT-2 is its dataset. The authors prepared a massive 40 GB dataset by crawling 45 million outbound links from a social networking site called Reddit.\n", 385 | "- The vocabulary was also expanded to cover 50,000 words and the context window was expanded to 1,024 tokens (as compared to 512 for GPT).\n", 386 | "\n", 387 | "\n", 388 | "### GPT-3\n", 389 | "- OpenAI published paper titled [Language Models are Few Shot Learners](https://arxiv.org/abs/2005.14165) in May 2020. 
\n", 390 | "- This paper introduces the mammoth __175 billion-parameter GPT-3 model__.\n", 391 | "- Apart from more layers and parameters, this model made use of sparse attention\n", 392 | "- Dataset again played a key role, a 300 billion-token dataset based on existing datasets like Common Crawl (filtered for better content), WebText2 (a larger version of WebText used for GPT-2), Books1 and Books2, and the Wikipedia dataset was prepared for this model" 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "id": "383312da-e4a1-42d8-93c6-8c89f2a952d2", 398 | "metadata": {}, 399 | "source": [ 400 | "## Language Modeling\n", 401 | "By far the most widely used application from the NLP world is language modeling. We use it daily on our phone keyboards, email applications and a ton of other places.\n", 402 | "\n", 403 | "In simple words, a language model takes certain text as input context to generate the next set of words as output. This is interesting because a language model tries to understand the input context, the language structure (though in a very naive way) to predict the next word(s). We use it in the form of text completion utilities on search engines, chat platforms, emails etc. all the time. Language models are a perfect real life application of NLP and showcase the power of RNNs.\n", 404 | "\n", 405 | "Language models can be developed train in different ways. The most common and widely used method is the sliding window approach. The model takes a small window of text as input and tried to predict the next word as the output. The following figure illustrates the same visually.\n", 406 | "\n", 407 | "" 408 | ] 409 | }, 410 | { 411 | "cell_type": "markdown", 412 | "id": "6bc37334-041b-475b-b281-2b6c42902b27", 413 | "metadata": {}, 414 | "source": [ 415 | "### PreTrained GPT2 for Text Generation" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "id": "f722ddfd-4fe3-4aeb-afbd-1eb3f964e110", 422 | "metadata": {}, 423 | "outputs": [], 424 | "source": [ 425 | "import torch\n", 426 | "from transformers import AutoModelForCausalLM, AutoTokenizer" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": null, 432 | "id": "ea73c89c-646f-47c7-8f36-ef4bc7b5b756", 433 | "metadata": {}, 434 | "outputs": [], 435 | "source": [ 436 | "# generative tasks are not available through MPS/Apple Silicon\n", 437 | "DEVICE = 'cpu'\n", 438 | "Tensor = torch.FloatTensor\n", 439 | "LongTensor = torch.LongTensor\n", 440 | "DEVICE_ID = -1\n", 441 | "print(f\"Backend Accelerator Device={DEVICE}\")" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "id": "f6949423-7182-4435-988c-89ed678e8f10", 448 | "metadata": {}, 449 | "outputs": [], 450 | "source": [ 451 | "tokenizer = AutoTokenizer.#TODO: get pretrained GPT2 tokenizer\n", 452 | "\n", 453 | "# add the EOS token as PAD token to avoid warnings\n", 454 | "model = AutoModelForCausalLM.from_pretrained(\"gpt2\", pad_token_id=tokenizer.eos_token_id).to(DEVICE)" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": null, 460 | "id": "0b909212-f6a3-46fd-8acc-5dc77224bead", 461 | "metadata": {}, 462 | "outputs": [], 463 | "source": [ 464 | "# encode context the generation is conditioned on\n", 465 | "model_inputs = tokenizer('The king of England is', return_tensors='pt').to(DEVICE)\n", 466 | "\n", 467 | "# generate 40 new tokens\n", 468 | "greedy_output = model.generate(**model_inputs, max_new_tokens=40)\n", 469 | "\n", 470 | 
"print(\"Output:\\n\" + 100 * '-')\n", 471 | "print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": null, 477 | "id": "48210990-c4d2-4103-91ab-184addedba15", 478 | "metadata": {}, 479 | "outputs": [], 480 | "source": [] 481 | } 482 | ], 483 | "metadata": { 484 | "kernelspec": { 485 | "display_name": "Python 3 (ipykernel)", 486 | "language": "python", 487 | "name": "python3" 488 | }, 489 | "language_info": { 490 | "codemirror_mode": { 491 | "name": "ipython", 492 | "version": 3 493 | }, 494 | "file_extension": ".py", 495 | "mimetype": "text/x-python", 496 | "name": "python", 497 | "nbconvert_exporter": "python", 498 | "pygments_lexer": "ipython3", 499 | "version": "3.11.9" 500 | } 501 | }, 502 | "nbformat": 4, 503 | "nbformat_minor": 5 504 | } 505 | -------------------------------------------------------------------------------- /module_01/assets/banner_notebook_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_01/assets/banner_notebook_1.jpg -------------------------------------------------------------------------------- /module_01/assets/bert_models_layout_notebook_3.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_01/assets/bert_models_layout_notebook_3.jpeg -------------------------------------------------------------------------------- /module_01/assets/cbow_arch_notebook_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_01/assets/cbow_arch_notebook_1.png -------------------------------------------------------------------------------- /module_01/assets/encoder_decoder_notebook_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_01/assets/encoder_decoder_notebook_3.png -------------------------------------------------------------------------------- /module_01/assets/img_2_notebook_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_01/assets/img_2_notebook_1.jpg -------------------------------------------------------------------------------- /module_01/assets/karpathy_emoji_tokenizer.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_01/assets/karpathy_emoji_tokenizer.jpeg -------------------------------------------------------------------------------- /module_01/assets/lm_training_notebook_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_01/assets/lm_training_notebook_3.png -------------------------------------------------------------------------------- /module_01/assets/multihead_attention_notebook_3.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_01/assets/multihead_attention_notebook_3.png -------------------------------------------------------------------------------- /module_01/assets/positional_emb_notebook_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_01/assets/positional_emb_notebook_3.png -------------------------------------------------------------------------------- /module_01/assets/skipgram_arch_notebook_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_01/assets/skipgram_arch_notebook_1.png -------------------------------------------------------------------------------- /module_01/assets/transformer_arch_notebook_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_01/assets/transformer_arch_notebook_3.png -------------------------------------------------------------------------------- /module_01/solutions/norm_corpus.txt: -------------------------------------------------------------------------------- 1 | sky blue beautiful 2 | love blue beautiful sky 3 | quick brown fox jumps lazy dog 4 | kings breakfast sausages ham bacon eggs toast beans 5 | love green eggs ham sausages bacon 6 | brown fox quick blue dog lazy 7 | sky blue sky beautiful today 8 | dog lazy brown fox quick 9 | -------------------------------------------------------------------------------- /module_02/01_llm_overview.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_02/01_llm_overview.pdf -------------------------------------------------------------------------------- /module_02/02_simple_text_generator.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Text Generation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "\n", 15 | " \"Open\n", 16 | "" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import torch" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "# if you are on apple silicon execute the below command before starting jupyter\n", 35 | "#export PYTORCH_ENABLE_MPS_FALLBACK=1\n", 36 | "if torch.cuda.is_available():\n", 37 | " DEVICE = 'cuda'\n", 38 | " Tensor = torch.cuda.FloatTensor\n", 39 | " LongTensor = torch.cuda.LongTensor\n", 40 | " DEVICE_ID = 0\n", 41 | "# Some Causal Modeling Ops are not available on MPS yet \n", 42 | "# elif torch.backends.mps.is_available():\n", 43 | "# DEVICE = 'mps'\n", 44 | "# Tensor = torch.FloatTensor\n", 45 | "# LongTensor = torch.LongTensor\n", 46 | "# DEVICE_ID = 0\n", 47 | "else:\n", 48 | " DEVICE = 'cpu'\n", 49 | " Tensor = torch.FloatTensor\n", 50 | " LongTensor = torch.LongTensor\n", 51 | " DEVICE_ID = -1\n", 52 | "print(f\"Backend Accelerator Device={DEVICE}\")" 53 | ] 54 | }, 
55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "import time\n", 62 | "import datetime" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "import pandas as pd\n", 72 | "import numpy as np\n", 73 | "import transformers\n", 74 | "from numpy import random\n", 75 | "from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config\n", 76 | "from transformers import get_linear_schedule_with_warmup\n", 77 | "from torch.optim import AdamW" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "from torch.utils.data import Dataset, DataLoader\n", 87 | "from torch.utils.data import random_split, RandomSampler, SequentialSampler\n", 88 | "torch.manual_seed(42)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "print(transformers.__version__)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler\n", 107 | "torch.manual_seed(42)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "# colab/gpu systems\n", 117 | "!nvidia-smi\n", 118 | "# htop or activity monitor for linux based systems" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "## Get Data\n", 126 | "We will fine-tune a pre-trained model GPT-2 model on our earlier dataset itself. But wait, what do you mean pre-trained?" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "!wget -O sherlock_homes.txt http://www.gutenberg.org/files/1661/1661-0.txt" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "filename = \"sherlock_homes.txt\"\n", 145 | "raw_text = open(filename, 'r', encoding='utf-8').read()\n", 146 | "text = raw_text [1450:100000]" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "## Foundation & Pre-trained Models\n", 154 | "\n", 155 | "**Foundation models** are the models that are trained from scratch on a large corpus of data. In the context of NLP, these models are designed to learn the fundamental patterns, structures, and representations of natural language. Foundation models are typically trained using unsupervised learning objectives, such as language modeling or autoencoding, where the model predicts the next word in a sentence or reconstructs the original sentence from a corrupted version/masked version.\n", 156 | "Models such as GPT, BERT, T5, etc are typical examples of Foundation Models\n", 157 | "\n", 158 | "\n", 159 | "Instances of foundation models that have been trained on specific downstream tasks or datasets are termed as **Pre-Trained Models**. Pretrained models leverage the knowledge learned from foundation models and are fine-tuned on task-specific data to perform well on specific NLP tasks, such as text classification, named entity recognition, machine translation, sentiment analysis, etc." 
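To make the foundation vs. pre-trained/fine-tuned distinction above concrete, here is a minimal, illustrative sketch (an editorial aside rather than one of the original notebook cells). It assumes the two Hugging Face Hub checkpoints named below are reachable: `gpt2` as the generic foundation checkpoint, and `raghavbali/gpt2_ft_sherlock_holmes` (the `MODEL_NAME` used a few cells later), which is assumed to be a GPT-2 checkpoint further fine-tuned on the Sherlock Holmes text used in this module.

```python
# Minimal sketch (assumption-laden aside, not an original notebook cell):
# contrast a foundation checkpoint with a domain fine-tuned checkpoint.
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Foundation model: GPT-2 trained on a large, generic web corpus.
foundation_model = GPT2LMHeadModel.from_pretrained("gpt2")
foundation_tok = GPT2Tokenizer.from_pretrained("gpt2")

# Assumed fine-tuned model: same architecture, weights further trained on
# Sherlock Holmes text (the MODEL_NAME checkpoint used later in this notebook).
ft_model = GPT2LMHeadModel.from_pretrained("raghavbali/gpt2_ft_sherlock_holmes")
ft_tok = GPT2Tokenizer.from_pretrained("raghavbali/gpt2_ft_sherlock_holmes")

# Same prompt, two rather different styles of continuation.
prompt = "the King of England"
for name, mdl, tok in [("foundation", foundation_model, foundation_tok),
                       ("fine-tuned", ft_model, ft_tok)]:
    ids = tok.encode(prompt, return_tensors="pt")
    out = mdl.generate(ids, max_new_tokens=30, do_sample=True, top_p=0.92,
                       pad_token_id=tok.eos_token_id)
    print(name, ":", tok.decode(out[0], skip_special_tokens=True))
```

The point of the comparison is simply that both checkpoints share the same architecture and tokenizer family; only the weights, and hence the flavour of the generated text, differ.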
160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "BOS_TOKEN = '<|sot|>'\n", 169 | "EOS_TOKEN = '<|eot|>'\n", 170 | "PAD_TOKEN = '<|pad|>'\n", 171 | "MODEL_NAME = \"raghavbali/gpt2_ft_sherlock_holmes\"" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "# first, let us get the tokenizer object\n", 181 | "tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME,\n", 182 | " bos_token=BOS_TOKEN,\n", 183 | " eos_token=EOS_TOKEN,\n", 184 | " pad_token=PAD_TOKEN\n", 185 | " )" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "## Prepare Dataset" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "class GPT2Dataset(Dataset):\n", 202 | "\n", 203 | " def __init__(self, txt_list, tokenizer, max_length=768):\n", 204 | "\n", 205 | " self.tokenizer = tokenizer\n", 206 | " self.input_ids = []\n", 207 | " self.attn_masks = []\n", 208 | "\n", 209 | " for txt in txt_list:\n", 210 | "\n", 211 | " encodings_dict = tokenizer(\n", 212 | " #TODO: Input format [beginning of sentence] input_text [end of sentence]\n", 213 | " truncation=True,\n", 214 | " max_length=max_length,\n", 215 | " padding=\"max_length\"\n", 216 | " )\n", 217 | "\n", 218 | " self.input_ids.append(torch.tensor(encodings_dict['input_ids']))\n", 219 | " self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))\n", 220 | "\n", 221 | " def __len__(self):\n", 222 | " return #TODO: return size of input_ids\n", 223 | "\n", 224 | " def __getitem__(self, idx):\n", 225 | " return self.input_ids[idx], self.attn_masks[idx]" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "# set batch size to work it out on colab\n", 235 | "BATCH_SIZE = 3" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "dataset = GPT2Dataset(text.split('\\n'),\n", 245 | " tokenizer, max_length=768)\n", 246 | "\n", 247 | "# Split into training and validation sets\n", 248 | "train_size = int(0.9 * len(dataset))\n", 249 | "val_size = len(dataset) - train_size\n", 250 | "\n", 251 | "train_dataset, val_dataset = random_split(dataset, [train_size, val_size])\n", 252 | "\n", 253 | "print('{:>5,} training samples'.format(train_size))\n", 254 | "print('{:>5,} validation samples'.format(val_size))" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "# Create the DataLoaders for our training and validation datasets.\n", 264 | "train_dataloader = DataLoader(\n", 265 | " train_dataset,\n", 266 | " sampler = RandomSampler(train_dataset),\n", 267 | " batch_size = #TODO: set batch-size\n", 268 | " )\n", 269 | "\n", 270 | "validation_dataloader = DataLoader(\n", 271 | " val_dataset,\n", 272 | " sampler = SequentialSampler(val_dataset),\n", 273 | " batch_size = #TODO: set batch-size\n", 274 | " )" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": {}, 280 | "source": [ 281 | "## Setup Model Object" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": {}, 288 | "outputs": [], 289 | 
"source": [ 290 | "# Training Params\n", 291 | "epochs = 1 #3 seems good if you train from gpt2 checkpoint\n", 292 | "learning_rate = 5e-4\n", 293 | "# to speed up learning\n", 294 | "warmup_steps = 1e2\n", 295 | "epsilon = 1e-8\n", 296 | "\n", 297 | "# generate output after N steps\n", 298 | "sample_every = 100" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "# Set Config\n", 308 | "configuration = GPT2Config.from_pretrained(MODEL_NAME,\n", 309 | " output_hidden_states=False)\n", 310 | "\n", 311 | "# instantiate the model\n", 312 | "model = GPT2LMHeadModel.from_pretrained(MODEL_NAME, config=configuration,)\n", 313 | "\n", 314 | "# NOTE: This is important to imply that we have updated BOS, EOS, etc\n", 315 | "model.resize_token_embeddings(len(tokenizer))\n", 316 | "model = model.to(DEVICE)" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "optimizer = AdamW(model.parameters(),\n", 326 | " lr = learning_rate,\n", 327 | " eps = epsilon\n", 328 | " )" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "metadata": {}, 335 | "outputs": [], 336 | "source": [ 337 | "total_steps = len(train_dataloader) * epochs\n", 338 | "\n", 339 | "# Create the learning rate scheduler.\n", 340 | "scheduler = get_linear_schedule_with_warmup(optimizer,\n", 341 | " num_warmup_steps = warmup_steps,\n", 342 | " num_training_steps = total_steps)" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [ 351 | "def format_time(elapsed):\n", 352 | " return str(datetime.timedelta(seconds=int(round((elapsed)))))" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": null, 358 | "metadata": {}, 359 | "outputs": [], 360 | "source": [ 361 | "total_t0 = time.time()\n", 362 | "training_stats = []\n", 363 | "\n", 364 | "\n", 365 | "for epoch_i in range(0, epochs):\n", 366 | "\n", 367 | " # Training\n", 368 | " print(\"*\"*25)\n", 369 | " print('>> Epoch {:} / {:} '.format(epoch_i + 1, epochs))\n", 370 | " print(\"*\"*25)\n", 371 | "\n", 372 | " t0 = time.time()\n", 373 | " total_train_loss = 0\n", 374 | "\n", 375 | " #TODO: call model's training interface\n", 376 | " for step, batch in enumerate(train_dataloader):\n", 377 | "\n", 378 | " b_input_ids = batch[0].to(DEVICE)\n", 379 | " b_labels = batch[0].to(DEVICE)\n", 380 | " b_masks = batch[1].to(DEVICE)\n", 381 | "\n", 382 | " model.zero_grad()\n", 383 | "\n", 384 | " outputs = model( b_input_ids,\n", 385 | " labels=b_labels,\n", 386 | " attention_mask = b_masks,\n", 387 | " token_type_ids=None\n", 388 | " )\n", 389 | "\n", 390 | " loss = outputs[0]\n", 391 | "\n", 392 | " batch_loss = loss.item()\n", 393 | " total_train_loss += batch_loss\n", 394 | "\n", 395 | " # Get sample every x batches.\n", 396 | " if step % sample_every == 0 and not step == 0:\n", 397 | "\n", 398 | " elapsed = format_time(time.time() - t0)\n", 399 | " print(' Batch {:>5,} of {:>5,}. Training Loss: {:>5,}. 
Time Taken: {:}.'.format(step,\n", 400 | " len(train_dataloader),\n", 401 | " batch_loss,\n", 402 | " elapsed))\n", 403 | "\n", 404 | " model.eval()\n", 405 | "\n", 406 | " sample_outputs = model.generate(\n", 407 | " do_sample=True,\n", 408 | " top_k=50,\n", 409 | " max_length = 200,\n", 410 | " top_p=0.95,\n", 411 | " num_return_sequences=1,\n", 412 | " pad_token_id=tokenizer.eos_token_id\n", 413 | " )\n", 414 | " for i, sample_output in enumerate(sample_outputs):\n", 415 | " print(\"{}: {}\".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))\n", 416 | "\n", 417 | " model.train()\n", 418 | "\n", 419 | " loss.backward()\n", 420 | " optimizer.step()\n", 421 | "\n", 422 | " scheduler.step()\n", 423 | "\n", 424 | " # Average Loss\n", 425 | " avg_train_loss = total_train_loss / len(train_dataloader)\n", 426 | "\n", 427 | " # training time\n", 428 | " training_time = format_time(time.time() - t0)\n", 429 | "\n", 430 | " print(\"Average training loss: {0:.2f}\".format(avg_train_loss))\n", 431 | " print(\"Training epoch time: {:}\".format(training_time))\n", 432 | "\n", 433 | " # Validation\n", 434 | " t0 = time.time()\n", 435 | "\n", 436 | " model.eval()\n", 437 | " total_eval_loss = 0\n", 438 | " nb_eval_steps = 0\n", 439 | "\n", 440 | " for batch in validation_dataloader:\n", 441 | "\n", 442 | " b_input_ids = batch[0].to(DEVICE)\n", 443 | " b_labels = batch[0].to(DEVICE)\n", 444 | " b_masks = batch[1].to(DEVICE)\n", 445 | "\n", 446 | " with torch.no_grad():\n", 447 | "\n", 448 | " outputs = model(b_input_ids,#TODO: pass batch's ids,\n", 449 | " attention_mask = b_masks,\n", 450 | " labels=b_labels)\n", 451 | "\n", 452 | " loss = outputs[0]\n", 453 | "\n", 454 | " batch_loss = loss.item()\n", 455 | " total_eval_loss += batch_loss\n", 456 | "\n", 457 | " avg_val_loss = total_eval_loss / len(validation_dataloader)\n", 458 | "\n", 459 | " validation_time = format_time(time.time() - t0)\n", 460 | "\n", 461 | " print(\" Validation Loss: {0:.2f}\".format(avg_val_loss))\n", 462 | " print(\" Validation time: {:}\".format(validation_time))\n", 463 | "\n", 464 | " # Record all statistics from this epoch.\n", 465 | " training_stats.append(\n", 466 | " {\n", 467 | " 'epoch': epoch_i + 1,\n", 468 | " 'train_loss': avg_train_loss,\n", 469 | " 'val_oss': avg_val_loss,\n", 470 | " 'train_ime': training_time,\n", 471 | " 'val_ime': validation_time\n", 472 | " }\n", 473 | " )\n", 474 | "\n", 475 | "print(\"Training Completed\")\n", 476 | "print(\"Total training time {:} (h:mm:ss)\".format(format_time(time.time()-total_t0)))" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": null, 482 | "metadata": {}, 483 | "outputs": [], 484 | "source": [ 485 | "df_stats = pd.DataFrame(data=training_stats)\n", 486 | "df_stats" 487 | ] 488 | }, 489 | { 490 | "cell_type": "markdown", 491 | "metadata": { 492 | "editable": true, 493 | "slideshow": { 494 | "slide_type": "" 495 | }, 496 | "tags": [] 497 | }, 498 | "source": [ 499 | "## Save the Model" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": null, 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [ 508 | "import os" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": null, 514 | "metadata": {}, 515 | "outputs": [], 516 | "source": [ 517 | "output_dir = './model_save/'\n", 518 | "\n", 519 | "if not os.path.exists(output_dir):\n", 520 | " os.makedirs(output_dir)\n", 521 | "\n", 522 | "model_to_save = model.module if hasattr(model, 'module') else model\n", 523 | 
"model_to_save.save_pretrained(output_dir)\n", 524 | "tokenizer.save_pretrained(output_dir)" 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": null, 530 | "metadata": {}, 531 | "outputs": [], 532 | "source": [ 533 | "model.eval()\n", 534 | "\n", 535 | "prompt = \"the King of England\"\n", 536 | "\n", 537 | "tokenized_inputs = torch.tensor(tokenizer.encode(BOS_TOKEN+prompt)).unsqueeze(0)\n", 538 | "tokenized_inputs = tokenized_inputs.to(DEVICE)\n", 539 | "\n", 540 | "sample_outputs = model.generate(\n", 541 | " tokenized_inputs,\n", 542 | " do_sample=True,\n", 543 | " top_k=50,\n", 544 | " max_length = len(generated) + 50,\n", 545 | " top_p=0.92,\n", 546 | " num_return_sequences=3,\n", 547 | " pad_token_id=tokenizer.eos_token_id,\n", 548 | " temperature=0.8,\n", 549 | " )\n", 550 | "\n", 551 | "for i, sample_output in enumerate(sample_outputs):\n", 552 | " print(\"{}: {}\\n\\n\".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))" 553 | ] 554 | }, 555 | { 556 | "cell_type": "code", 557 | "execution_count": null, 558 | "metadata": {}, 559 | "outputs": [], 560 | "source": [ 561 | "# compare output to foundation model\n", 562 | "pre_trainedtokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)\n", 563 | "pre_trainedmodel = GPT2LMHeadModel.from_pretrained(MODEL_NAME)" 564 | ] 565 | }, 566 | { 567 | "cell_type": "code", 568 | "execution_count": null, 569 | "metadata": {}, 570 | "outputs": [], 571 | "source": [ 572 | "input_ids = #TODO: Prepare tokenizer input, HINT: use trainedtokenizer object\n", 573 | "\n", 574 | "# Generate text\n", 575 | "output = pre_trainedmodel.generate(\n", 576 | " input_ids,\n", 577 | " bos_token_id=random.randint(1,30000),\n", 578 | " max_length=len(input_ids[0]) + 50,\n", 579 | " num_return_sequences=1,\n", 580 | " pad_token_id=tokenizer.eos_token_id,\n", 581 | " do_sample=True,\n", 582 | " top_p=0.92, # Adjust the sampling parameters as needed\n", 583 | " temperature=0.8,\n", 584 | ")" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": null, 590 | "metadata": {}, 591 | "outputs": [], 592 | "source": [ 593 | "pre_trainedtokenizer.decode(output[0], skip_special_tokens=True)" 594 | ] 595 | }, 596 | { 597 | "cell_type": "markdown", 598 | "metadata": {}, 599 | "source": [ 600 | "## Decoding Strategies\n", 601 | "\n", 602 | "The ``generate()`` utility we used above used every output prediction as input for the next time step. This method of using the highest probability prediction as output is called __Greedy Decoding__. Greeding decoding is fast and simple but is marred with issues we saw in samples we just generated.\n", 603 | "\n", 604 | "Focusing on only highest probability output narrows our model's focus to just the next step which inturn may result in inconsistent or non-dictionary terms/words.\n", 605 | "\n", 606 | "### Beam Search\n", 607 | "Beam search is the obvious next step to improve the output predictions from the model. Instead of being greedy, beam search keeps track of n paths at any given time and selects the path with overall higher probability.\n", 608 | "\n", 609 | "\n", 610 | "\n", 611 | "### Other Key Decoding Strategies:\n", 612 | "- Sampling\n", 613 | "- Top-k Sampling\n", 614 | "- Nucleus Sampling\n", 615 | "\n", 616 | "### Temperature\n", 617 | "Though sampling helps bring in required amount of randomness, it is not free from issues. Random sampling leads to gibberish and incoherence at times. To control the amount of randomness, we introduce __temperature__. 
622 | { 623 | "cell_type": "code", 624 | "execution_count": null, 625 | "metadata": {}, 626 | "outputs": [], 627 | "source": [ 628 | "prompt = \"the King of England\"\n", 629 | "\n", 630 | "generated = tokenizer.encode(BOS_TOKEN+prompt,return_tensors='pt')\n", 631 | "generated = generated.to(DEVICE)\n", 632 | "\n", 633 | "beam_output = model.generate(\n", 634 | "    generated,\n", 635 | "    max_new_tokens=40,\n", 636 | "    num_beams=5,\n", 637 | "    num_return_sequences=5,\n", 638 | "    early_stopping=True\n", 639 | ")\n", 640 | "\n", 641 | "for i in range(5):\n", 642 | "    print(f\"Beam {i} Output:\\n\" + 100 * '-')\n", 643 | "    print(tokenizer.decode(beam_output[i], skip_special_tokens=True))" 644 | ] 645 | }, 646 | { 647 | "cell_type": "markdown", 648 | "metadata": {}, 649 | "source": [ 650 | "## Limitations and What Next?\n", 651 | "- Long Range Context\n", 652 | "- Scalability\n", 653 | "- Instruction-led generation\n", 654 | "- Benchmarking\n", 655 | "- Hallucination / Dreaming\n" 656 | ] 657 | }, 658 | { 659 | "cell_type": "markdown", 660 | "metadata": {}, 661 | "source": [] 662 | } 663 | ], 664 | "metadata": { 665 | "accelerator": "GPU", 666 | "colab": { 667 | "gpuType": "T4", 668 | "provenance": [] 669 | }, 670 | "kernelspec": { 671 | "display_name": "Python 3 (ipykernel)", 672 | "language": "python", 673 | "name": "python3" 674 | }, 675 | "language_info": { 676 | "codemirror_mode": { 677 | "name": "ipython", 678 | "version": 3 679 | }, 680 | "file_extension": ".py", 681 | "mimetype": "text/x-python", 682 | "name": "python", 683 | "nbconvert_exporter": "python", 684 | "pygments_lexer": "ipython3", 685 | "version": "3.11.9" 686 | } 687 | }, 688 | "nbformat": 4, 689 | "nbformat_minor": 4 690 | } 691 | -------------------------------------------------------------------------------- /module_02/assets/beamsearch_nb_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_02/assets/beamsearch_nb_2.png -------------------------------------------------------------------------------- /module_02/solutions/02_simple_text_generator.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Text Generation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "\n", 15 | " \"Open\n", 16 | "" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import torch" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 25, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "name": "stdout", 35 | "output_type": "stream", 36 | "text": [ 37 | "Backend Accelerator Device=cpu\n" 38 | ] 39 | } 40 | ], 41 | "source": [ 42 | "# if you are on apple silicon execute the below command before starting jupyter\n", 43 | "#export PYTORCH_ENABLE_MPS_FALLBACK=1\n", 44 | "if torch.cuda.is_available():\n", 45 | " DEVICE = 'cuda'\n", 46 | " Tensor = torch.cuda.FloatTensor\n", 47 | " LongTensor = torch.cuda.LongTensor\n", 48 | " DEVICE_ID = 0\n", 49 
| "# Some Causal Modeling Ops are not available on MPS yet \n", 50 | "# elif torch.backends.mps.is_available():\n", 51 | "# DEVICE = 'mps'\n", 52 | "# Tensor = torch.FloatTensor\n", 53 | "# LongTensor = torch.LongTensor\n", 54 | "# DEVICE_ID = 0\n", 55 | "else:\n", 56 | " DEVICE = 'cpu'\n", 57 | " Tensor = torch.FloatTensor\n", 58 | " LongTensor = torch.LongTensor\n", 59 | " DEVICE_ID = -1\n", 60 | "print(f\"Backend Accelerator Device={DEVICE}\")" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 6, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "import time\n", 70 | "import datetime" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 7, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "import pandas as pd\n", 80 | "import numpy as np\n", 81 | "import transformers\n", 82 | "from numpy import random\n", 83 | "from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config\n", 84 | "from transformers import get_linear_schedule_with_warmup\n", 85 | "from torch.optim import AdamW" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 8, 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "data": { 95 | "text/plain": [ 96 | "" 97 | ] 98 | }, 99 | "execution_count": 8, 100 | "metadata": {}, 101 | "output_type": "execute_result" 102 | } 103 | ], 104 | "source": [ 105 | "from torch.utils.data import Dataset, DataLoader\n", 106 | "from torch.utils.data import random_split, RandomSampler, SequentialSampler\n", 107 | "torch.manual_seed(42)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 9, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "name": "stdout", 117 | "output_type": "stream", 118 | "text": [ 119 | "4.42.3\n" 120 | ] 121 | } 122 | ], 123 | "source": [ 124 | "print(transformers.__version__)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 10, 130 | "metadata": {}, 131 | "outputs": [ 132 | { 133 | "data": { 134 | "text/plain": [ 135 | "" 136 | ] 137 | }, 138 | "execution_count": 10, 139 | "metadata": {}, 140 | "output_type": "execute_result" 141 | } 142 | ], 143 | "source": [ 144 | "from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler\n", 145 | "torch.manual_seed(42)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 27, 151 | "metadata": {}, 152 | "outputs": [ 153 | { 154 | "name": "stdout", 155 | "output_type": "stream", 156 | "text": [ 157 | "zsh:1: command not found: nvidia-smi\n" 158 | ] 159 | } 160 | ], 161 | "source": [ 162 | "# colab/gpu systems\n", 163 | "!nvidia-smi\n", 164 | "# htop or activity monitor for linux based systems" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "## Get Data\n", 172 | "We will fine-tune a pre-trained model GPT-2 model on our earlier dataset itself. But wait, what do you mean pre-trained?" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 11, 178 | "metadata": {}, 179 | "outputs": [ 180 | { 181 | "name": "stdout", 182 | "output_type": "stream", 183 | "text": [ 184 | "--2024-07-28 00:53:52-- http://www.gutenberg.org/files/1661/1661-0.txt\n", 185 | "Resolving www.gutenberg.org (www.gutenberg.org)... 2610:28:3090:3000:0:bad:cafe:47, 152.19.134.47\n", 186 | "Connecting to www.gutenberg.org (www.gutenberg.org)|2610:28:3090:3000:0:bad:cafe:47|:80... connected.\n", 187 | "HTTP request sent, awaiting response... 
302 Found\n", 188 | "Location: https://www.gutenberg.org/files/1661/1661-0.txt [following]\n", 189 | "--2024-07-28 00:53:52-- https://www.gutenberg.org/files/1661/1661-0.txt\n", 190 | "Connecting to www.gutenberg.org (www.gutenberg.org)|2610:28:3090:3000:0:bad:cafe:47|:443... connected.\n", 191 | "HTTP request sent, awaiting response... 200 OK\n", 192 | "Length: 607504 (593K) [text/plain]\n", 193 | "Saving to: ‘sherlock_homes.txt’\n", 194 | "\n", 195 | "sherlock_homes.txt 100%[===================>] 593.27K 1.21MB/s in 0.5s \n", 196 | "\n", 197 | "2024-07-28 00:53:53 (1.21 MB/s) - ‘sherlock_homes.txt’ saved [607504/607504]\n", 198 | "\n" 199 | ] 200 | } 201 | ], 202 | "source": [ 203 | "!wget -O sherlock_homes.txt http://www.gutenberg.org/files/1661/1661-0.txt" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 12, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "filename = \"sherlock_homes.txt\"\n", 213 | "raw_text = open(filename, 'r', encoding='utf-8').read()\n", 214 | "text = raw_text [1450:100000]" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "## Foundation & Pre-trained Models\n", 222 | "\n", 223 | "**Foundation models** are the models that are trained from scratch on a large corpus of data. In the context of NLP, these models are designed to learn the fundamental patterns, structures, and representations of natural language. Foundation models are typically trained using unsupervised learning objectives, such as language modeling or autoencoding, where the model predicts the next word in a sentence or reconstructs the original sentence from a corrupted version/masked version.\n", 224 | "Models such as GPT, BERT, T5, etc are typical examples of Foundation Models\n", 225 | "\n", 226 | "\n", 227 | "Instances of foundation models that have been trained on specific downstream tasks or datasets are termed as **Pre-Trained Models**. Pretrained models leverage the knowledge learned from foundation models and are fine-tuned on task-specific data to perform well on specific NLP tasks, such as text classification, named entity recognition, machine translation, sentiment analysis, etc." 
228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 13, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "BOS_TOKEN = '<|sot|>'\n", 237 | "EOS_TOKEN = '<|eot|>'\n", 238 | "PAD_TOKEN = '<|pad|>'\n", 239 | "MODEL_NAME = \"raghavbali/gpt2_ft_sherlock_holmes\"" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 14, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "# first, let us get the tokenizer object\n", 249 | "tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME,\n", 250 | " bos_token=BOS_TOKEN,\n", 251 | " eos_token=EOS_TOKEN,\n", 252 | " pad_token=PAD_TOKEN\n", 253 | " )" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "## Prepare Dataset" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 15, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "class GPT2Dataset(Dataset):\n", 270 | "\n", 271 | " def __init__(self, txt_list, tokenizer, max_length=768):\n", 272 | "\n", 273 | " self.tokenizer = tokenizer\n", 274 | " self.input_ids = []\n", 275 | " self.attn_masks = []\n", 276 | "\n", 277 | " for txt in txt_list:\n", 278 | "\n", 279 | " encodings_dict = tokenizer(\n", 280 | " BOS_TOKEN + txt + EOS_TOKEN, #TODO\n", 281 | " truncation=True,\n", 282 | " max_length=max_length,\n", 283 | " padding=\"max_length\"\n", 284 | " )\n", 285 | "\n", 286 | " self.input_ids.append(torch.tensor(encodings_dict['input_ids']))\n", 287 | " self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))\n", 288 | "\n", 289 | " def __len__(self):\n", 290 | " return len(self.input_ids)#TODO: return size of input_ids\n", 291 | "\n", 292 | " def __getitem__(self, idx):\n", 293 | " return self.input_ids[idx], self.attn_masks[idx]" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 16, 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "# set batch size to work it out on colab\n", 303 | "BATCH_SIZE = 3" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 26, 309 | "metadata": {}, 310 | "outputs": [ 311 | { 312 | "name": "stdout", 313 | "output_type": "stream", 314 | "text": [ 315 | "1,949 training samples\n", 316 | " 217 validation samples\n" 317 | ] 318 | } 319 | ], 320 | "source": [ 321 | "dataset = GPT2Dataset(text.split('\\n'),\n", 322 | " tokenizer, max_length=768)\n", 323 | "\n", 324 | "# Split into training and validation sets\n", 325 | "train_size = int(0.9 * len(dataset))\n", 326 | "val_size = len(dataset) - train_size\n", 327 | "\n", 328 | "train_dataset, val_dataset = random_split(dataset, [train_size, val_size])\n", 329 | "\n", 330 | "print('{:>5,} training samples'.format(train_size))\n", 331 | "print('{:>5,} validation samples'.format(val_size))" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 27, 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [ 340 | "# Create the DataLoaders for our training and validation datasets.\n", 341 | "train_dataloader = DataLoader(\n", 342 | " train_dataset,\n", 343 | " sampler = RandomSampler(train_dataset),\n", 344 | " batch_size = BATCH_SIZE#TODO: set batch-size\n", 345 | " )\n", 346 | "\n", 347 | "validation_dataloader = DataLoader(\n", 348 | " val_dataset,\n", 349 | " sampler = SequentialSampler(val_dataset),\n", 350 | " batch_size = BATCH_SIZE#TODO: set batch-size\n", 351 | " )" 352 | ] 353 | }, 354 | { 355 | "cell_type": "markdown", 356 | "metadata": {}, 357 | "source": [ 358 
| "## Setup Model Object" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": 28, 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [ 367 | "# Training Params\n", 368 | "epochs = 1 #3 seems good if you train from gpt2 checkpoint\n", 369 | "learning_rate = 5e-4\n", 370 | "# to speed up learning\n", 371 | "warmup_steps = 1e2\n", 372 | "epsilon = 1e-8\n", 373 | "\n", 374 | "# generate output after N steps\n", 375 | "sample_every = 100" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": 29, 381 | "metadata": {}, 382 | "outputs": [], 383 | "source": [ 384 | "# Set Config\n", 385 | "configuration = GPT2Config.from_pretrained(MODEL_NAME,\n", 386 | " output_hidden_states=False)\n", 387 | "\n", 388 | "# instantiate the model\n", 389 | "model = GPT2LMHeadModel.from_pretrained(MODEL_NAME, config=configuration,)\n", 390 | "\n", 391 | "# NOTE: This is important to imply that we have updated BOS, EOS, etc\n", 392 | "model.resize_token_embeddings(len(tokenizer))\n", 393 | "model = model.to(DEVICE)" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": 30, 399 | "metadata": {}, 400 | "outputs": [], 401 | "source": [ 402 | "optimizer = AdamW(model.parameters(),\n", 403 | " lr = learning_rate,\n", 404 | " eps = epsilon\n", 405 | " )" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": 31, 411 | "metadata": {}, 412 | "outputs": [], 413 | "source": [ 414 | "total_steps = len(train_dataloader) * epochs\n", 415 | "\n", 416 | "# Create the learning rate scheduler.\n", 417 | "scheduler = get_linear_schedule_with_warmup(optimizer,\n", 418 | " num_warmup_steps = warmup_steps,\n", 419 | " num_training_steps = total_steps)" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": 32, 425 | "metadata": {}, 426 | "outputs": [], 427 | "source": [ 428 | "def format_time(elapsed):\n", 429 | " return str(datetime.timedelta(seconds=int(round((elapsed)))))" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": null, 435 | "metadata": {}, 436 | "outputs": [ 437 | { 438 | "name": "stdout", 439 | "output_type": "stream", 440 | "text": [ 441 | "*************************\n", 442 | ">> Epoch 1 / 1 \n", 443 | "*************************\n" 444 | ] 445 | } 446 | ], 447 | "source": [ 448 | "total_t0 = time.time()\n", 449 | "training_stats = []\n", 450 | "\n", 451 | "\n", 452 | "for epoch_i in range(0, epochs):\n", 453 | "\n", 454 | " # Training\n", 455 | " print(\"*\"*25)\n", 456 | " print('>> Epoch {:} / {:} '.format(epoch_i + 1, epochs))\n", 457 | " print(\"*\"*25)\n", 458 | "\n", 459 | " t0 = time.time()\n", 460 | " total_train_loss = 0\n", 461 | "\n", 462 | " #TODO: call model's training interface\n", 463 | " model.train()\n", 464 | " for step, batch in enumerate(train_dataloader):\n", 465 | "\n", 466 | " b_input_ids = batch[0].to(DEVICE)\n", 467 | " b_labels = batch[0].to(DEVICE)\n", 468 | " b_masks = batch[1].to(DEVICE)\n", 469 | "\n", 470 | " model.zero_grad()\n", 471 | "\n", 472 | " outputs = model( b_input_ids,\n", 473 | " labels=b_labels,\n", 474 | " attention_mask = b_masks,\n", 475 | " token_type_ids=None\n", 476 | " )\n", 477 | "\n", 478 | " loss = outputs[0]\n", 479 | "\n", 480 | " batch_loss = loss.item()\n", 481 | " total_train_loss += batch_loss\n", 482 | "\n", 483 | " # Get sample every x batches.\n", 484 | " if step % sample_every == 0 and not step == 0:\n", 485 | "\n", 486 | " elapsed = format_time(time.time() - t0)\n", 487 | " print(' Batch {:>5,} of {:>5,}. 
Training Loss: {:>5,}. Time Taken: {:}.'.format(step,\n", 488 | " len(train_dataloader),\n", 489 | " batch_loss,\n", 490 | " elapsed))\n", 491 | "\n", 492 | " model.eval()\n", 493 | "\n", 494 | " sample_outputs = model.generate(\n", 495 | " do_sample=True,\n", 496 | " top_k=50,\n", 497 | " max_length = 200,\n", 498 | " top_p=0.95,\n", 499 | " num_return_sequences=1,\n", 500 | " pad_token_id=tokenizer.eos_token_id\n", 501 | " )\n", 502 | " for i, sample_output in enumerate(sample_outputs):\n", 503 | " print(\"{}: {}\".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))\n", 504 | "\n", 505 | " model.train()\n", 506 | "\n", 507 | " loss.backward()\n", 508 | " optimizer.step()\n", 509 | "\n", 510 | " scheduler.step()\n", 511 | "\n", 512 | " # Average Loss\n", 513 | " avg_train_loss = total_train_loss / len(train_dataloader)\n", 514 | "\n", 515 | " # training time\n", 516 | " training_time = format_time(time.time() - t0)\n", 517 | "\n", 518 | " print(\"Average training loss: {0:.2f}\".format(avg_train_loss))\n", 519 | " print(\"Training epoch time: {:}\".format(training_time))\n", 520 | "\n", 521 | " # Validation\n", 522 | " t0 = time.time()\n", 523 | "\n", 524 | " model.eval()\n", 525 | " total_eval_loss = 0\n", 526 | " nb_eval_steps = 0\n", 527 | "\n", 528 | " for batch in validation_dataloader:\n", 529 | "\n", 530 | " b_input_ids = batch[0].to(DEVICE)\n", 531 | " b_labels = batch[0].to(DEVICE)\n", 532 | " b_masks = batch[1].to(DEVICE)\n", 533 | "\n", 534 | " with torch.no_grad():\n", 535 | "\n", 536 | " outputs = model(b_input_ids,#TODO: pass batch's ids,\n", 537 | " attention_mask = b_masks,\n", 538 | " labels=b_labels)\n", 539 | "\n", 540 | " loss = outputs[0]\n", 541 | "\n", 542 | " batch_loss = loss.item()\n", 543 | " total_eval_loss += batch_loss\n", 544 | "\n", 545 | " avg_val_loss = total_eval_loss / len(validation_dataloader)\n", 546 | "\n", 547 | " validation_time = format_time(time.time() - t0)\n", 548 | "\n", 549 | " print(\" Validation Loss: {0:.2f}\".format(avg_val_loss))\n", 550 | " print(\" Validation time: {:}\".format(validation_time))\n", 551 | "\n", 552 | " # Record all statistics from this epoch.\n", 553 | " training_stats.append(\n", 554 | " {\n", 555 | " 'epoch': epoch_i + 1,\n", 556 | " 'train_loss': avg_train_loss,\n", 557 | " 'val_oss': avg_val_loss,\n", 558 | " 'train_ime': training_time,\n", 559 | " 'val_ime': validation_time\n", 560 | " }\n", 561 | " )\n", 562 | "\n", 563 | "print(\"Training Completed\")\n", 564 | "print(\"Total training time {:} (h:mm:ss)\".format(format_time(time.time()-total_t0)))" 565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": null, 570 | "metadata": {}, 571 | "outputs": [], 572 | "source": [ 573 | "df_stats = pd.DataFrame(data=training_stats)\n", 574 | "df_stats" 575 | ] 576 | }, 577 | { 578 | "cell_type": "markdown", 579 | "metadata": { 580 | "editable": true, 581 | "slideshow": { 582 | "slide_type": "" 583 | }, 584 | "tags": [] 585 | }, 586 | "source": [ 587 | "## Save the Model" 588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": null, 593 | "metadata": {}, 594 | "outputs": [], 595 | "source": [ 596 | "import os" 597 | ] 598 | }, 599 | { 600 | "cell_type": "code", 601 | "execution_count": null, 602 | "metadata": {}, 603 | "outputs": [], 604 | "source": [ 605 | "output_dir = './model_save/'\n", 606 | "\n", 607 | "if not os.path.exists(output_dir):\n", 608 | " os.makedirs(output_dir)\n", 609 | "\n", 610 | "model_to_save = model.module if hasattr(model, 'module') else 
model\n", 611 | "model_to_save.save_pretrained(output_dir)\n", 612 | "tokenizer.save_pretrained(output_dir)" 613 | ] 614 | }, 615 | { 616 | "cell_type": "code", 617 | "execution_count": 51, 618 | "metadata": {}, 619 | "outputs": [ 620 | { 621 | "name": "stdout", 622 | "output_type": "stream", 623 | "text": [ 624 | "0: the King of England, and he was as good a queen, as if she had\n", 625 | "\n", 626 | "\n", 627 | "1: the King of England, with a face of a royal-fancy one, and a thick,\n", 628 | "\n", 629 | "\n", 630 | "2: the King of England.”\n", 631 | "\n", 632 | "\n" 633 | ] 634 | } 635 | ], 636 | "source": [ 637 | "model.eval()\n", 638 | "\n", 639 | "prompt = \"the King of England\"\n", 640 | "\n", 641 | "generated = torch.tensor(tokenizer.encode(BOS_TOKEN+prompt)).unsqueeze(0)\n", 642 | "generated = generated.to(DEVICE)\n", 643 | "\n", 644 | "sample_outputs = model.generate(\n", 645 | " generated,\n", 646 | " do_sample=True,\n", 647 | " top_k=50,\n", 648 | " max_length = len(generated) + 50,\n", 649 | " top_p=0.92,\n", 650 | " num_return_sequences=3,\n", 651 | " pad_token_id=tokenizer.eos_token_id,\n", 652 | " temperature=0.8,\n", 653 | " )\n", 654 | "\n", 655 | "for i, sample_output in enumerate(sample_outputs):\n", 656 | " print(\"{}: {}\\n\\n\".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))" 657 | ] 658 | }, 659 | { 660 | "cell_type": "code", 661 | "execution_count": 48, 662 | "metadata": {}, 663 | "outputs": [], 664 | "source": [ 665 | "# compare output to foundation model\n", 666 | "pre_trainedtokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)\n", 667 | "pre_trainedmodel = GPT2LMHeadModel.from_pretrained(MODEL_NAME)" 668 | ] 669 | }, 670 | { 671 | "cell_type": "code", 672 | "execution_count": 52, 673 | "metadata": {}, 674 | "outputs": [], 675 | "source": [ 676 | "input_ids = pre_trainedtokenizer.encode(prompt, return_tensors=\"pt\")\n", 677 | "\n", 678 | "# Generate text\n", 679 | "output = pre_trainedmodel.generate(\n", 680 | " input_ids,\n", 681 | " bos_token_id=random.randint(1,30000),\n", 682 | " max_length=len(input_ids[0]) + 50,\n", 683 | " num_return_sequences=1,\n", 684 | " pad_token_id=tokenizer.eos_token_id,\n", 685 | " do_sample=True,\n", 686 | " top_p=0.92, # Adjust the sampling parameters as needed\n", 687 | " temperature=0.8,\n", 688 | ")" 689 | ] 690 | }, 691 | { 692 | "cell_type": "code", 693 | "execution_count": 53, 694 | "metadata": {}, 695 | "outputs": [ 696 | { 697 | "data": { 698 | "text/plain": [ 699 | "'the King of England was a good fellow, for I was the better'" 700 | ] 701 | }, 702 | "execution_count": 53, 703 | "metadata": {}, 704 | "output_type": "execute_result" 705 | } 706 | ], 707 | "source": [ 708 | "pre_trainedtokenizer.decode(output[0], skip_special_tokens=True)" 709 | ] 710 | }, 711 | { 712 | "cell_type": "markdown", 713 | "metadata": {}, 714 | "source": [ 715 | "## Decoding Strategies\n", 716 | "\n", 717 | "The ``generate()`` utility we used above used every output prediction as input for the next time step. This method of using the highest probability prediction as output is called __Greedy Decoding__. Greeding decoding is fast and simple but is marred with issues we saw in samples we just generated.\n", 718 | "\n", 719 | "Focusing on only highest probability output narrows our model's focus to just the next step which inturn may result in inconsistent or non-dictionary terms/words.\n", 720 | "\n", 721 | "### Beam Search\n", 722 | "Beam search is the obvious next step to improve the output predictions from the model. 
Instead of being greedy, beam search keeps track of n paths at any given time and selects the path with overall higher probability.\n", 723 | "\n", 724 | "\n", 725 | "\n", 726 | "### Other Key Decoding Strategies:\n", 727 | "- Sampling\n", 728 | "- Top-k Sampling\n", 729 | "- Nucleus Sampling\n", 730 | "\n", 731 | "### Temperature\n", 732 | "Though sampling helps bring in required amount of randomness, it is not free from issues. Random sampling leads to gibberish and incoherence at times. To control the amount of randomness, we introduce __temperature__. This parameter helps increase the likelihood of high probability terms reduce the likelihood of low probability ones. This leads to sharper distributions. \n", 733 | "\n", 734 | "> High temperature leads to more randomness while lower temperature brings in predictability.\n" 735 | ] 736 | }, 737 | { 738 | "cell_type": "code", 739 | "execution_count": null, 740 | "metadata": {}, 741 | "outputs": [], 742 | "source": [ 743 | "prompt = \"the King of England\"\n", 744 | "\n", 745 | "generated = tokenizer.encode(BOS_TOKEN+prompt,return_tensors='pt')\n", 746 | "generated = generated.to(DEVICE)\n", 747 | "\n", 748 | "beam_output = model.generate(\n", 749 | " **generated,\n", 750 | " max_new_tokens=40,\n", 751 | " num_beams=5,\n", 752 | " num_return_sequences=5,\n", 753 | " early_stopping=True\n", 754 | ")" 755 | ] 756 | }, 757 | { 758 | "cell_type": "markdown", 759 | "metadata": {}, 760 | "source": [ 761 | "## Limitations and What Next?\n", 762 | "- Long Range Context\n", 763 | "- Scalability\n", 764 | "- Instruction led generation\n", 765 | "- Benchmarking\n", 766 | "- Halucination / Dreaming\n" 767 | ] 768 | }, 769 | { 770 | "cell_type": "markdown", 771 | "metadata": {}, 772 | "source": [] 773 | } 774 | ], 775 | "metadata": { 776 | "accelerator": "GPU", 777 | "colab": { 778 | "gpuType": "T4", 779 | "provenance": [] 780 | }, 781 | "kernelspec": { 782 | "display_name": "Python 3 (ipykernel)", 783 | "language": "python", 784 | "name": "python3" 785 | }, 786 | "language_info": { 787 | "codemirror_mode": { 788 | "name": "ipython", 789 | "version": 3 790 | }, 791 | "file_extension": ".py", 792 | "mimetype": "text/x-python", 793 | "name": "python", 794 | "nbconvert_exporter": "python", 795 | "pygments_lexer": "ipython3", 796 | "version": "3.11.9" 797 | } 798 | }, 799 | "nbformat": 4, 800 | "nbformat_minor": 4 801 | } 802 | -------------------------------------------------------------------------------- /module_03/assets/chinchilla.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_03/assets/chinchilla.png -------------------------------------------------------------------------------- /module_03/assets/cost_tweet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_03/assets/cost_tweet.png -------------------------------------------------------------------------------- /module_03/assets/instruct_gpt_rlhf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_03/assets/instruct_gpt_rlhf.png -------------------------------------------------------------------------------- /module_03/assets/lora_1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_03/assets/lora_1.png -------------------------------------------------------------------------------- /module_03/assets/quantization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_03/assets/quantization.png -------------------------------------------------------------------------------- /module_03/assets/scaling_laws.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_03/assets/scaling_laws.png -------------------------------------------------------------------------------- /module_03/assets/soft_prompting_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_03/assets/soft_prompting_1.png -------------------------------------------------------------------------------- /module_03/assets/soft_prompting_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_03/assets/soft_prompting_2.png -------------------------------------------------------------------------------- /module_03/assets/soft_prompting_perf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_03/assets/soft_prompting_perf.png -------------------------------------------------------------------------------- /module_03/solutions/utils.py: -------------------------------------------------------------------------------- 1 | # source: https://stackoverflow.com/a/31631711 2 | def humanbytes(B): 3 | """Return the given bytes as a human friendly KB, MB, GB, or TB string.""" 4 | UNIT = 1000 5 | B = float(B) 6 | KB = float(UNIT) 7 | MB = float(KB ** 2) # 1,048,576 8 | GB = float(KB ** 3) # 1,073,741,824 9 | TB = float(KB ** 4) # 1,099,511,627,776 10 | 11 | if B < KB: 12 | return '{0} {1}'.format(B,'Bytes' if 0 == B > 1 else 'Byte') 13 | elif KB <= B < MB: 14 | return '{0:.2f} KB'.format(B / KB) 15 | elif MB <= B < GB: 16 | return '{0:.2f} MB'.format(B / MB) 17 | elif GB <= B < TB: 18 | return '{0:.2f} GB'.format(B / GB) 19 | elif TB <= B: 20 | return '{0:.2f} TB'.format(B / TB) 21 | 22 | 23 | def memory_fit(req_memory,cpu_ram, gpu_ram): 24 | if req_memory<=cpu_ram or req_memory<=gpu_ram: 25 | return "Yes, fits either CPU or GPU" 26 | elif req_memory<= cpu_ram + gpu_ram: 27 | return "Yes, but fit needs both CPU and GPU" 28 | else: 29 | return "Nope, does not fit available memory" -------------------------------------------------------------------------------- /module_03/utils.py: -------------------------------------------------------------------------------- 1 | # source: https://stackoverflow.com/a/31631711 2 | def humanbytes(B): 3 | """Return the given bytes as a human friendly KB, MB, GB, or TB string.""" 4 | UNIT = 1000 5 | B = float(B) 6 | KB = float(UNIT) 7 | MB = float(KB ** 2) # 1,048,576 8 | GB = float(KB ** 3) # 1,073,741,824 9 | TB = float(KB ** 4) # 
1,099,511,627,776 10 | 11 | if B < KB: 12 | return '{0} {1}'.format(B,'Bytes' if 0 == B > 1 else 'Byte') 13 | elif KB <= B < MB: 14 | return '{0:.2f} KB'.format(B / KB) 15 | elif MB <= B < GB: 16 | return '{0:.2f} MB'.format(B / MB) 17 | elif GB <= B < TB: 18 | return '{0:.2f} GB'.format(B / GB) 19 | elif TB <= B: 20 | return '{0:.2f} TB'.format(B / TB) 21 | 22 | 23 | def memory_fit(req_memory,cpu_ram, gpu_ram): 24 | if req_memory<=cpu_ram or req_memory<=gpu_ram: 25 | return "Yes, fits either CPU or GPU" 26 | elif req_memory<= cpu_ram + gpu_ram: 27 | return "Yes, but fit needs both CPU and GPU" 28 | else: 29 | return "Nope, does not fit available memory" -------------------------------------------------------------------------------- /module_04/02_vector_databases_hf_inference_endpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "R9cjyKcNzU8i" 7 | }, 8 | "source": [ 9 | "## Vector Databases\n", 10 | "\n", 11 | "\n", 12 | "\n", 13 | "We started this workshop with **text representation** as one of the key components of any NLP system.\n", 14 | "As we progressed from simple Bag of Words setup to highly contextualised Transformer models, we now have rich & dense representations.\n", 15 | "The utility of such representations also increased multifold from word/sentence representations to features that can used for a number of downstream tasks.\n", 16 | "\n", 17 | "These representations, also called as vectors or embedding vectors are long series of numbers. Their retrieval and persistence requires specialised database management systems called **Vector Databases**.\n", 18 | "\n", 19 | "Vector Databases are particularly suited for handling data in the form of vectors, embeddings, or feature representations, which are commonly used in various applications like machine learning, natural language processing, computer vision, and recommendation systems.\n", 20 | "\n", 21 | "Key Features:\n", 22 | "- High-dimensional Data Support\n", 23 | "- Similarity Search\n", 24 | "- Indexing Techniques\n", 25 | "- Dimensionality Reduction\n", 26 | "\n", 27 | "There are a number of different off-the-shelf options available, such as:\n", 28 | "- [ChromaDB](https://www.trychroma.com/)\n", 29 | "- [PineCone](https://www.pinecone.io/)\n", 30 | "- [Milvus](https://milvus.io/)\n", 31 | "- [Weaviate](https://weaviate.io/)\n", 32 | "- [AeroSpike](https://aerospike.com/)\n", 33 | "- [OpenSearch](https://opensearch.org/)\n" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": { 39 | "id": "B_luN80B2Am7" 40 | }, 41 | "source": [ 42 | "## Let us Begin with Installation" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "\n", 50 | " \"Open\n", 51 | "" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 1, 57 | "metadata": { 58 | "id": "z_dNazilzRUF" 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "%%capture\n", 63 | "# install dependencies\n", 64 | "# !pip install -q chromadb\n", 65 | "# !pip install retry\n", 66 | "#!pip install -U sentence-transformers" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": { 72 | "id": "q960w1bz2Am9" 73 | }, 74 | "source": [ 75 | "## Imports" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 1, 81 | "metadata": { 82 | "id": "GV6LIKdMBy2r", 83 | "scrolled": true 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "import json\n", 88 | "import 
requests\n", 89 | "import pandas as pd\n", 90 | "from retry import retry\n", 91 | "\n", 92 | "import chromadb\n", 93 | "from chromadb.api.types import Documents, Embeddings" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": { 99 | "id": "dD7AixSz2Am_" 100 | }, 101 | "source": [ 102 | "## HuggingFace Inference EndPoints 🤗\n", 103 | "\n", 104 | "Another key offering from HuggingFace is *[Inference Endpoints](https://huggingface.co/inference-endpoints)*.\n", 105 | "These endpoints provide access to hundreds of large models hosted on HuggingFace infra for easy use.\n", 106 | "\n", 107 | "All you need is a quick [sign-up](https://huggingface.co/login) and an API Key and bingo!\n" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": { 113 | "id": "dOT-wEBZ2AnA" 114 | }, 115 | "source": [ 116 | "## Sentence Transformers\n", 117 | "\n", 118 | "This is an amazing python framework initially proposed along with the seminal paper titled [Sentence-BERT](https://www.sbert.net/).\n", 119 | "It provides clean high-level interfaces to easily use Language Models for computing text embeddings for various use-cases.\n", 120 | "\n", 121 | "In this notebook we will leverage pretrained models supported by sentence transformer rather than directly using the package.\n", 122 | "\n", 123 | "There is a [leaderboard](https://huggingface.co/spaces/mteb/leaderboard) now maintained to keep track of the state-of-the-art embedding models called the **Massive Text Embedding Benchmark (MTEB) Leaderboard**\n", 124 | "\n", 125 | "\n", 126 | "\n", 127 | "> Source : [HuggingFace](https://huggingface.co/spaces/mteb/leaderboard)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": { 133 | "id": "10G0QGLn2AnA" 134 | }, 135 | "source": [ 136 | "## MPNET Model\n", 137 | "\n", 138 | "- This model transforms sentences/paragraphs to a 768 dimensional vector space and is optimised for question-answering tasks.\n", 139 | "- The model card is available [here](https://huggingface.co/pinecone/mpnet-retriever-discourse)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 87, 145 | "metadata": { 146 | "id": "xr7GrCDXCHKM" 147 | }, 148 | "outputs": [], 149 | "source": [ 150 | "EMB_MODEL_ID = 'pinecone/mpnet-retriever-discourse'\n", 151 | "HF_TOKEN = ''\n", 152 | "EMB_API_URL = f\"https://api-inference.huggingface.co/pipeline/feature-extraction/{EMB_MODEL_ID}\"\n", 153 | "HEADERS = {\"Authorization\": f\"Bearer {HF_TOKEN}\"}" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": { 159 | "id": "DHA9LV_E2AnB" 160 | }, 161 | "source": [ 162 | "## Embeddings using 🤗 Inference Endpoint\n", 163 | "- We setup a utility function that takes a list of sentences as input and generates embeddings as response\n", 164 | "- We use the ``retry`` package to allow for sufficient time and retries for the APIs to respond" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 11, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "import chromadb.utils.embedding_functions as embedding_functions\n", 174 | "huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(\n", 175 | " api_key=HF_TOKEN,\n", 176 | " model_name=EMB_MODEL_ID\n", 177 | ")" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 12, 183 | "metadata": { 184 | "id": "MqxEofYVCqqz" 185 | }, 186 | "outputs": [], 187 | "source": [ 188 | "sample_texts = [\n", 189 | " \"Another key offering from HuggingFace is Inference Endpoints. 
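The markdown above describes a retry-wrapped utility that calls the feature-extraction endpoint directly, while the cell shown uses ChromaDB's HuggingFaceEmbeddingFunction wrapper. For reference, a minimal sketch of the direct approach could look like the following; it reuses EMB_API_URL and HEADERS from the earlier cell, and the retry settings and the wait_for_model option are illustrative choices rather than the workshop's exact implementation.

import requests
from retry import retry

# Hedged sketch: call the HF feature-extraction endpoint directly.
# EMB_API_URL and HEADERS are assumed to be the values defined above;
# tries/delay and the wait_for_model option are illustrative.
@retry(tries=3, delay=10)
def query_embeddings(texts):
    response = requests.post(
        EMB_API_URL,
        headers=HEADERS,
        json={"inputs": texts, "options": {"wait_for_model": True}},
    )
    response.raise_for_status()  # raising here lets @retry re-attempt transient failures
    return response.json()       # one embedding (a list of floats) per input text

# Hypothetical usage: query_embeddings(["some sentence", "another sentence"])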
These endpoints provide access to hundreds of large models hosted on HuggingFace infra for easy use.\",\n", 190 | " \"This is an amazing python framework initially proposed along with the seminal paper titled Sentence-BERT. It provides clean high-level interfaces to easily use Language Models for computing text embeddings for various use-cases.\"\n", 191 | " ]" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 14, 197 | "metadata": { 198 | "scrolled": true 199 | }, 200 | "outputs": [ 201 | { 202 | "data": { 203 | "text/plain": [ 204 | "[-0.5249432325363159,\n", 205 | " -0.04365385323762894,\n", 206 | " 0.5124771595001221,\n", 207 | " 0.21908265352249146,\n", 208 | " 0.4560490548610687]" 209 | ] 210 | }, 211 | "execution_count": 14, 212 | "metadata": {}, 213 | "output_type": "execute_result" 214 | } 215 | ], 216 | "source": [ 217 | "# generate embeddings\n", 218 | "sample_emb = huggingface_ef(sample_texts[0])\n", 219 | "sample_emb[0][:5]" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 15, 225 | "metadata": { 226 | "colab": { 227 | "base_uri": "https://localhost:8080/" 228 | }, 229 | "id": "WqCfZvSRS5zn", 230 | "outputId": "a56d0831-f7ea-4c2e-eb05-c9d0690c72da" 231 | }, 232 | "outputs": [ 233 | { 234 | "data": { 235 | "text/plain": [ 236 | "768" 237 | ] 238 | }, 239 | "execution_count": 15, 240 | "metadata": {}, 241 | "output_type": "execute_result" 242 | } 243 | ], 244 | "source": [ 245 | "# check embedding length\n", 246 | "len(sample_emb[0])" 247 | ] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": { 252 | "id": "nph6RzJbDsEx" 253 | }, 254 | "source": [ 255 | "## Vector Database: ChromaDB\n", 256 | "\n", 257 | "As mentioned above, there are a number of offering available. For this workshop we will make use of\n", 258 | "[ChromaDB](https://www.trychroma.com/).\n", 259 | "\n", 260 | "It is a super simple setup which is easy to use. 
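To make the similarity search idea concrete before diving into ChromaDB, here is a small illustration (not part of the workshop notebook) of what a vector database automates under the hood: scoring a query embedding against the stored document embeddings. It assumes the huggingface_ef and sample_texts objects defined above and that the embedding function returns one vector per input text; the query string is made up for the example.

import numpy as np

def cosine_similarity(a, b):
    a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))

# embed the documents and an (assumed) query, then rank by similarity
doc_embs = huggingface_ef(sample_texts)
query_emb = huggingface_ef(["What are HuggingFace Inference Endpoints?"])[0]

for text, emb in zip(sample_texts, doc_embs):
    print(f"{cosine_similarity(query_emb, emb):.3f}  {text[:60]}...")

A vector database adds indexing (for example HNSW) on top of this comparison so that retrieval scales to very large document collections.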
The following figure showcases the overall flow\n", 261 | "\n", 262 | "\n", 263 | "\n", 264 | "> Source :[chromadb](https://docs.trychroma.com/)" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": { 270 | "id": "ubFv6W-C2AnC" 271 | }, 272 | "source": [ 273 | "### Create an Instance of the Database Client" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 8, 279 | "metadata": { 280 | "id": "GVL3ByK9ZG76" 281 | }, 282 | "outputs": [], 283 | "source": [ 284 | "# in memory\n", 285 | "chroma_client = chromadb.Client()\n", 286 | "# save to disk: client = chromadb.PersistentClient(path=\"/path/to/data\")" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 9, 292 | "metadata": { 293 | "id": "vJV9I0dJDB8G" 294 | }, 295 | "outputs": [], 296 | "source": [ 297 | "def create_db_and_load_data(chroma_client,collection_name, embedding_func, documents):\n", 298 | " db = chroma_client.create_collection(name=collection_name,\n", 299 | " embedding_function=embedding_func)\n", 300 | " for i,d in enumerate(documents):\n", 301 | " db.add(\n", 302 | " documents=d,\n", 303 | " ids=str(i)\n", 304 | " )\n", 305 | " return db" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": { 311 | "id": "cFAsZHnYEi8g" 312 | }, 313 | "source": [ 314 | "## Insert Data\n", 315 | "\n", 316 | "Now that we have a utility to interact with the vector database, let us add some data to it and check how it goes" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 18, 322 | "metadata": { 323 | "id": "-h5HecZrEKTa" 324 | }, 325 | "outputs": [], 326 | "source": [ 327 | "db = create_db_and_load_data(chroma_client=chroma_client,\n", 328 | " collection_name=\"llm_workshop\",\n", 329 | " embedding_func=huggingface_ef,\n", 330 | " documents=sample_texts)" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 55, 336 | "metadata": {}, 337 | "outputs": [ 338 | { 339 | "data": { 340 | "text/plain": [ 341 | "dict_keys(['ids', 'embeddings', 'metadatas', 'documents', 'uris', 'data', 'included'])" 342 | ] 343 | }, 344 | "execution_count": 55, 345 | "metadata": {}, 346 | "output_type": "execute_result" 347 | } 348 | ], 349 | "source": [ 350 | "db.peek().keys()" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 60, 356 | "metadata": { 357 | "colab": { 358 | "base_uri": "https://localhost:8080/", 359 | "height": 112 360 | }, 361 | "id": "Sm_zGs8HEc2F", 362 | "outputId": "169b82d0-c268-4e03-b40d-ad8cda697f43" 363 | }, 364 | "outputs": [ 365 | { 366 | "data": { 367 | "text/html": [ 368 | "
\n", 369 | "\n", 382 | "\n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | "
idsembeddingsdocuments
00[-0.5249432325363159, -0.04365385323762894, 0....Another key offering from HuggingFace is Infer...
11[-0.5217247605323792, 0.5370820760726929, -0.2...This is an amazing python framework initially ...
\n", 406 | "
" 407 | ], 408 | "text/plain": [ 409 | " ids embeddings \\\n", 410 | "0 0 [-0.5249432325363159, -0.04365385323762894, 0.... \n", 411 | "1 1 [-0.5217247605323792, 0.5370820760726929, -0.2... \n", 412 | "\n", 413 | " documents \n", 414 | "0 Another key offering from HuggingFace is Infer... \n", 415 | "1 This is an amazing python framework initially ... " 416 | ] 417 | }, 418 | "execution_count": 60, 419 | "metadata": {}, 420 | "output_type": "execute_result" 421 | } 422 | ], 423 | "source": [ 424 | "results = db.peek()\n", 425 | "pd.DataFrame.from_dict({k:v for k,v in results.items() if k in['ids','documents','embeddings']})" 426 | ] 427 | }, 428 | { 429 | "cell_type": "markdown", 430 | "metadata": { 431 | "id": "uGgZzOzYElr5" 432 | }, 433 | "source": [ 434 | "## Retrieve Documents" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": 61, 440 | "metadata": { 441 | "id": "N8CeEIX3aXiE" 442 | }, 443 | "outputs": [], 444 | "source": [ 445 | "question = \"HuggingFace Key Offering\"" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": 63, 451 | "metadata": { 452 | "colab": { 453 | "base_uri": "https://localhost:8080/" 454 | }, 455 | "id": "oGe1IwAzX98K", 456 | "outputId": "2d19d4fb-2a7e-4e86-caf2-4f71123dca4d" 457 | }, 458 | "outputs": [ 459 | { 460 | "data": { 461 | "text/plain": [ 462 | "{'ids': [['0']],\n", 463 | " 'distances': [[169.98219299316406]],\n", 464 | " 'metadatas': [[None]],\n", 465 | " 'embeddings': None,\n", 466 | " 'documents': [['Another key offering from HuggingFace is Inference Endpoints. These endpoints provide access to hundreds of large models hosted on HuggingFace infra for easy use.']],\n", 467 | " 'uris': None,\n", 468 | " 'data': None,\n", 469 | " 'included': ['metadatas', 'documents', 'distances']}" 470 | ] 471 | }, 472 | "execution_count": 63, 473 | "metadata": {}, 474 | "output_type": "execute_result" 475 | } 476 | ], 477 | "source": [ 478 | "db.query(query_texts=[question], n_results=1)" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": 64, 484 | "metadata": { 485 | "id": "qFRSZWHcEfEd" 486 | }, 487 | "outputs": [], 488 | "source": [ 489 | "def get_relevant_documents(query, db):\n", 490 | " relevant_doc = db.query(query_texts=[query], n_results=1)['documents'][0][0]\n", 491 | " return relevant_doc" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": 65, 497 | "metadata": { 498 | "colab": { 499 | "base_uri": "https://localhost:8080/", 500 | "height": 35 501 | }, 502 | "id": "GfucsODPEusp", 503 | "outputId": "9d0f89c1-0b1d-4337-9719-e9b390f0e8cd" 504 | }, 505 | "outputs": [ 506 | { 507 | "data": { 508 | "text/plain": [ 509 | "'Another key offering from HuggingFace is Inference Endpoints. 
These endpoints provide access to hundreds of large models hosted on HuggingFace infra for easy use.'" 510 | ] 511 | }, 512 | "execution_count": 65, 513 | "metadata": {}, 514 | "output_type": "execute_result" 515 | } 516 | ], 517 | "source": [ 518 | "# search using embeddings\n", 519 | "get_relevant_documents(question, db)" 520 | ] 521 | }, 522 | { 523 | "cell_type": "markdown", 524 | "metadata": { 525 | "id": "3_9LRJZgMDs1" 526 | }, 527 | "source": [ 528 | "## HuggingFace Powered Question Answering Setup\n", 529 | "\n", 530 | "Similar to Embedding Endpoints, HF also provides us with capabilities to directly leverage models for tasks such as:\n", 531 | "- Text Generation\n", 532 | "- Question Answering, etc.\n", 533 | "\n", 534 | "We can leverage local setups like GPT4ALL with LangChain, OpenAI APIs or even HuggingFace transformers as well. For this exercise, we will focus on leveraging **HuggingFace Endpoints** for **QA tasks** itself.\n", 535 | "\n", 536 | "We will make use of [Roberta-Base-Squad2](https://huggingface.co/deepset/roberta-base-squad2) model." 537 | ] 538 | }, 539 | { 540 | "cell_type": "code", 541 | "execution_count": 90, 542 | "metadata": { 543 | "id": "KCFidBCIE0i_" 544 | }, 545 | "outputs": [ 546 | { 547 | "data": { 548 | "text/plain": [ 549 | "{'Authorization': 'Bearer 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'}" 550 | ] 551 | }, 552 | "execution_count": 90, 553 | "metadata": {}, 554 | "output_type": "execute_result" 555 | } 556 | ], 557 | "source": [ 558 | "QA_MODEL_ID = 'deepset/roberta-base-squad2'\n", 559 | "QA_API_URL = f\"https://api-inference.huggingface.co/models/{QA_MODEL_ID}\"\n", 560 | "HEADERS = {\"Authorization\": f\"Bearer {HF_TOKEN}\"}\n", 561 | "HEADERS" 562 | ] 563 | }, 564 | { 565 | "cell_type": "code", 566 | "execution_count": 93, 567 | "metadata": { 568 | "id": "3sOqsmAkMJVK" 569 | }, 570 | "outputs": [], 571 | "source": [ 572 | "def get_answer(question,context):\n", 573 | " payload = {\n", 574 | " \"question\": question,\n", 575 | " \"context\":context\n", 576 | " }\n", 577 | " data = json.dumps(payload)\n", 578 | " response = requests.request(\"POST\", QA_API_URL, headers=HEADERS, data=data)\n", 579 | " try:\n", 580 | " decoded_response = json.loads(response.content.decode(\"utf-8\"))\n", 581 | " return decoded_response#decoded_response['answer'], decoded_response['score'], \"\"\n", 582 | " except Exception as ex:\n", 583 | " return \"Apologies but I could not find any relevant answer\", 0.0, ex" 584 | ] 585 | }, 586 | { 587 | "cell_type": "code", 588 | "execution_count": 79, 589 | "metadata": { 590 | "colab": { 591 | "base_uri": "https://localhost:8080/" 592 | }, 593 | "id": "6IU-il-z2AnE", 594 | "outputId": "820acf52-12eb-48eb-88e9-9ab7a29f7670" 595 | }, 596 | "outputs": [ 597 | { 598 | "name": "stdout", 599 | "output_type": "stream", 600 | "text": [ 601 | "HuggingFace Key Offering\n" 602 | ] 603 | } 604 | ], 605 | "source": [ 606 | "print(question)" 607 | ] 608 | }, 609 | { 610 | "cell_type": "code", 611 | "execution_count": 80, 612 | "metadata": { 613 | "colab": { 614 | "base_uri": "https://localhost:8080/", 615 | "height": 35 616 | }, 617 | "id": "LiCiEjwvVNGI", 618 | "outputId": "fc0930d9-4d09-4aa4-9ec6-ea5c2c154f85" 619 | }, 620 | "outputs": [ 621 | { 622 | "data": { 623 | "text/plain": [ 624 | "'Another key offering from HuggingFace is Inference Endpoints. 
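Putting the two helpers together gives a tiny end-to-end question answering flow. This is a sketch rather than the notebook's code: it assumes get_relevant_documents and get_answer as defined above, and that a successful call to the QA endpoint returns a dict containing 'answer' and 'score' keys.

def ask(question, db):
    # retrieve the most relevant document and use it as context for the QA model
    context = get_relevant_documents(question, db)
    response = get_answer(question, context)
    if isinstance(response, dict):
        return response.get("answer"), response.get("score"), context
    return response  # error tuple from get_answer's exception branch

# Hypothetical usage: answer, score, source = ask("HuggingFace Key Offering", db)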
These endpoints provide access to hundreds of large models hosted on HuggingFace infra for easy use.'" 625 | ] 626 | }, 627 | "execution_count": 80, 628 | "metadata": {}, 629 | "output_type": "execute_result" 630 | } 631 | ], 632 | "source": [ 633 | "context = get_relevant_documents(question, db)\n", 634 | "context" 635 | ] 636 | }, 637 | { 638 | "cell_type": "code", 639 | "execution_count": 94, 640 | "metadata": { 641 | "colab": { 642 | "base_uri": "https://localhost:8080/" 643 | }, 644 | "id": "cKMvj0aCNs2U", 645 | "outputId": "312d878f-2de9-45e9-ed36-abfbd6f15e05" 646 | }, 647 | "outputs": [ 648 | { 649 | "data": { 650 | "text/plain": [ 651 | "('Inference Endpoints', 0.1849050521850586, '')" 652 | ] 653 | }, 654 | "execution_count": 94, 655 | "metadata": {}, 656 | "output_type": "execute_result" 657 | } 658 | ], 659 | "source": [ 660 | "get_answer(question,context)" 661 | ] 662 | }, 663 | { 664 | "cell_type": "code", 665 | "execution_count": null, 666 | "metadata": { 667 | "id": "MLUe4RmN2IMP" 668 | }, 669 | "outputs": [], 670 | "source": [] 671 | } 672 | ], 673 | "metadata": { 674 | "colab": { 675 | "provenance": [] 676 | }, 677 | "kernelspec": { 678 | "display_name": "Python 3 (ipykernel)", 679 | "language": "python", 680 | "name": "python3" 681 | }, 682 | "language_info": { 683 | "codemirror_mode": { 684 | "name": "ipython", 685 | "version": 3 686 | }, 687 | "file_extension": ".py", 688 | "mimetype": "text/x-python", 689 | "name": "python", 690 | "nbconvert_exporter": "python", 691 | "pygments_lexer": "ipython3", 692 | "version": "3.11.9" 693 | } 694 | }, 695 | "nbformat": 4, 696 | "nbformat_minor": 4 697 | } 698 | -------------------------------------------------------------------------------- /module_04/03_OpenSource_ClosedSource_LLMs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "1ae9124c-7015-488d-9791-92b9731386a9", 6 | "metadata": {}, 7 | "source": [ 8 | "# Open Source Vs Close Sourced LLMs\n", 9 | "\n", 10 | "Similar to any other piece of technology, LLMs are available in all flavours and license types. While some of the most popular offerings are closed source (OpenAI and Claud), OpenSource also has a number of options (LLaMA, Mistral and more!). \n", 11 | "\n", 12 | "# But **What does OpenSource or Closed Source mean for Language Models?**\n", 13 | "> Or you might ask, don't we all know about the transformer architecture and what powers these LLMs?\n", 14 | "\n", 15 | "The answer to such question lies in the secret sauce to setup and train such models. 
Similar to any other ML/AI/DL model, we have to go through the following steps for LLMs as well:\n", 16 | "\n", 17 | "- 📚 Collect Loads of Data (virtually whole of internet and **more!!!**)\n", 18 | "- 🧹 Clean-up the Dataset\n", 19 | "- 🤖 A very Large **Transformer**-like architecture\n", 20 | "- 💸 A **very Large GPU cluster** to train the model (probably **multiple** times) \n", 21 | "- 🧑‍💻 A whole suite of Human Evaluators (and labellers)\n", 22 | "- 💹 A proper benchmarking and evaluation setup\n", 23 | "\n", 24 | "Those are not just 6 steps, those are very hard 6 steps!\n", 25 | "\n", 26 | "\n", 27 | "\n", 28 | "> Source: [Tweet by Danial Han](https://x.com/danielhanchen/status/1765446273661075609) / @danielhanchen\n", 29 | "\n", 30 | "## Ok but still, Closed vs Open?\n", 31 | "\n", 32 | "- The secret sauce is to bring in optimizations at each and every step of this process (yes, all 6 are active areas of research!)\n", 33 | "- Open Sourced Models could be released with a focus the code, weights, datasets or evaluation details or even all of them. This presents a nice distinction where some models are open-weight models while others are completely open-sourced.\n", 34 | " - Open Weights: Mistral Nemo, Google/Gemma\n", 35 | " - Fully Open Sourced: LLaMA x, Alpaca, Stanford Alpaca (based on LLaMA)\n", 36 | "- Closed Source or Closed Weight models currently have superior performance and make use of proprietary improvements and datasets to achieve the same. " 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "id": "410adc18-b5c4-450e-8c7e-04a949ae4bc5", 42 | "metadata": {}, 43 | "source": [ 44 | "## Are Open Source Models Any Good?\n", 45 | "The closed weights/closed source models still lead the pack but the open source community is catching up. Catching up fast and square onto performance on all front. Let us explore one such easy to use setup, the **🦙 🦙LLaMA 🦙 🦙**\n", 46 | "### Let's Setup Our Own Lamma 🦙" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 23, 52 | "id": "6e204afc-020b-4f4a-9dba-810174da6d98", 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "data": { 57 | "text/html": [ 58 | "\n", 59 | "

Download the Latest From ollama.com

\n", 60 | "
\n", 61 | " \n", 62 | " \n", 63 | "
\n", 64 | "
\n", 65 | "

Pull the Latest LLaMA

\n", 66 | "
\n", 67 | " \n", 68 | "

Off We Go!

\n", 69 | " \n", 70 | "
\n", 71 | " " 72 | ], 73 | "text/plain": [ 74 | "" 75 | ] 76 | }, 77 | "execution_count": 23, 78 | "metadata": {}, 79 | "output_type": "execute_result" 80 | } 81 | ], 82 | "source": [ 83 | "from IPython.display import Image, HTML, display\n", 84 | "\n", 85 | "image_path1 = \"./assets/llama_setup_1.png\"\n", 86 | "image_path2 = \"./assets/llama_setup_2.png\"\n", 87 | "image_path3 = \"./assets/llama_setup_3.png\"\n", 88 | "image_path4 = \"./assets/llama_setup_4.png\"\n", 89 | "\n", 90 | "\n", 91 | "HTML(f\"\"\"\n", 92 | "

Download the Latest From ollama.com

\n", 93 | "
\n", 94 | " \n", 95 | " \n", 96 | "
\n", 97 | "
\n", 98 | "

Pull the Latest LLaMA

\n", 99 | "
\n", 100 | " \n", 101 | "

Off We Go!

\n", 102 | " \n", 103 | "
\n", 104 | " \"\"\")" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "id": "fbd9c507-d508-4186-82b8-fe2840336630", 110 | "metadata": {}, 111 | "source": [ 112 | "## 🖥️ Look Ma, Python Bindings!\n", 113 | "\n", 114 | "- Ensure your setup steps were completed successfully, else:\n", 115 | "- Install Ollama server using ``curl -fsSL https://ollama.com/install.sh | sh``\n", 116 | "- Change download directory : ``export OLLAMA_MODELS=/workspace``\n", 117 | "- Pull a specific model : ``ollama pull llama3.1:8b``\n", 118 | "- Start the server (assuming it is in the same environment/shell, else run export command again)\n", 119 | " - ``ollama serve``\n", 120 | " - ``ollama run llama3.1:8b``" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 29, 126 | "id": "52d27d56-831b-4296-8652-4ecedd1a3417", 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "import ollama\n", 131 | "import json" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 33, 137 | "id": "438d996e-0f0d-4a6f-b4a7-944ccea844cf", 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "# Go to your terminal and make sure the following is running: ollama run llama3.1:8b\n", 142 | "response = ollama.chat(model='llama3.1:8b', messages=[\n", 143 | " {\n", 144 | " 'role': 'user',\n", 145 | " 'content': \"What is a Llama?\",\n", 146 | " },\n", 147 | "])" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 34, 153 | "id": "424062f1-b4da-4278-9343-aaeb32cfaf12", 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "name": "stdout", 158 | "output_type": "stream", 159 | "text": [ 160 | "{\n", 161 | " \"model\": \"llama3.1\",\n", 162 | " \"created_at\": \"2024-08-03T19:09:26.075267Z\",\n", 163 | " \"message\": {\n", 164 | " \"role\": \"assistant\",\n", 165 | " \"content\": \"A llama (Lama glama) is a South American camelid, a mammal closely related to camels and alpacas. They are native to the Andean region of present-day Peru, Bolivia, Ecuador, and Chile.\\n\\nHere are some key characteristics of llamas:\\n\\n1. **Physical appearance**: Llamas have a distinctive appearance with a soft, woolly coat (which can be brown, black, white, or various shades in between), a long neck, and relatively small ears compared to their body size. Adult llamas typically grow to 5-6 feet (1.5-1.8 meters) tall at the shoulder.\\n2. **Habitat**: Llamas are adapted to high-altitude grasslands, rocky plateaus, and scrub forests in South America.\\n3. **Diet**: They are herbivores and feed on plants, such as grasses, leaves, and shrubs. Their digestive system is efficient at breaking down cellulose, which allows them to extract nutrients from plant material that would be difficult for other animals to digest.\\n4. **Social behavior**: Llamas are social creatures that live in herds. They have a hierarchical structure within these groups, with dominant individuals leading the way and younger or subordinate members following.\\n5. **Usefulness**: For centuries, llamas have been used by indigenous communities as pack animals (carrying goods over long distances), while their wool has been valued for its warmth and durability.\\n\\nIn recent years, llamas have also become popular in other parts of the world as pets, companions, or even therapy animals!\\n\\nHow's that? 
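The ollama Python bindings also support streaming, which prints tokens as they are generated instead of waiting for the full reply. A minimal sketch, assuming the local server set up above is running and the llama3.1:8b model has been pulled:

import ollama

# stream=True returns an iterator of partial responses
stream = ollama.chat(
    model='llama3.1:8b',
    messages=[{'role': 'user', 'content': 'What is a Llama?'}],
    stream=True,
)
for chunk in stream:
    print(chunk['message']['content'], end='', flush=True)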
Do you have any specific questions about llamas I can help with?\"\n", 166 | " },\n", 167 | " \"done_reason\": \"stop\",\n", 168 | " \"done\": true,\n", 169 | " \"total_duration\": 13250142666,\n", 170 | " \"load_duration\": 28561250,\n", 171 | " \"prompt_eval_count\": 16,\n", 172 | " \"prompt_eval_duration\": 230031000,\n", 173 | " \"eval_count\": 345,\n", 174 | " \"eval_duration\": 12990640000\n", 175 | "}\n" 176 | ] 177 | } 178 | ], 179 | "source": [ 180 | "print(json.dumps(response, indent = 4))" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "id": "20ee4a3f-e702-4470-b7ab-6496448f91dc", 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [] 190 | } 191 | ], 192 | "metadata": { 193 | "kernelspec": { 194 | "display_name": "Python 3 (ipykernel)", 195 | "language": "python", 196 | "name": "python3" 197 | }, 198 | "language_info": { 199 | "codemirror_mode": { 200 | "name": "ipython", 201 | "version": 3 202 | }, 203 | "file_extension": ".py", 204 | "mimetype": "text/x-python", 205 | "name": "python", 206 | "nbconvert_exporter": "python", 207 | "pygments_lexer": "ipython3", 208 | "version": "3.11.9" 209 | } 210 | }, 211 | "nbformat": 4, 212 | "nbformat_minor": 5 213 | } 214 | -------------------------------------------------------------------------------- /module_04/04_retrieval_augmented_llm_app.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "CQ7lJK0y9gJO" 7 | }, 8 | "source": [ 9 | "# Retrieval Augmented LLM App\n", 10 | "\n", 11 | "\n", 12 | "We have covered quite some ground in terms of understanding and building components for:\n", 13 | "- Text Representation\n", 14 | "- NLP Tasks\n", 15 | "- Pretrained Models and Transfer Learning\n", 16 | "- Model Fine-Tuning PEFT\n", 17 | "- SFT and LLM Landscape\n", 18 | "- Vector Databases\n", 19 | "- Libraries and Frameworks\n", 20 | "\n", 21 | "Now we will work through development of an app to showcase how we can leverage all the concepts into a fully functioning system\n", 22 | "\n", 23 | "__Note__: In order to keep things simple, we will leverage most high-level APIs available but the overall setup should be easily extensible" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": { 29 | "id": "kFRjKaov9gJR" 30 | }, 31 | "source": [ 32 | "## Why Retrieval Augmentation\n", 33 | "\n", 34 | "While theoretically LLMs are capable of having super long context windows, in real world settings this is a challenge because of:\n", 35 | "- Inability/Limitation to ensure LLM focusses on correct sub-sections of the context\n", 36 | "- High Memory requirements\n", 37 | "- High API Cost\n", 38 | "- High Latency , etc.\n", 39 | "\n", 40 | "\n", 41 | "In order to overcome such challenges, we leverage vector databases to act as intelligent retrieval systems (again powered by LLMs) to:\n", 42 | "- Provide focussed context\n", 43 | "- Reduce memory, cost and latency requirements\n", 44 | "- Unlock super-abilities to use upto-date information\n", 45 | "- Offload trivial tasks to expert systems" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": { 51 | "id": "bxi-yR1n9gJR" 52 | }, 53 | "source": [ 54 | "## Streamlit Enters the Arena\n", 55 | "\n", 56 | "[Streamlit](https://streamlit.io/) is an open-source Python library that makes it easy to create and share beautiful, custom web apps for machine learning and data science." 
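The app below imports several helpers (get_lines, load_data, create_db, get_relevant_documents, get_answer, sidebar) from the module's utils.py. Purely as a hypothetical illustration of the pattern, mirroring the create_db_and_load_data helper from the vector databases notebook, two of them could be sketched as follows; the workshop's actual implementations in module_04/utils.py may differ.

import chromadb

# Hypothetical sketches only; not the workshop's actual utils.py.
def create_db(collection_name="personal_gpt"):
    # in-memory Chroma instance using the default embedding function
    chroma_client = chromadb.Client()
    db = chroma_client.get_or_create_collection(name=collection_name)
    return chroma_client, db

def load_data(db, documents):
    # index one document per line of the uploaded file
    try:
        for i, doc in enumerate(documents):
            db.add(documents=doc, ids=str(i))
        return "Indexed successfully"
    except Exception as ex:
        return f"Indexing failed: {ex}"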
57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "\n", 64 | " \"Open\n", 65 | "" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": { 71 | "id": "fTFt0rOY9gJR" 72 | }, 73 | "source": [ 74 | "## Let us Begin with Installation" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 1, 80 | "metadata": { 81 | "id": "z_dNazilzRUF" 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "%%capture\n", 86 | "# install dependencies\n", 87 | "# !pip install -q chromadb\n", 88 | "# !pip install retry\n", 89 | "# !pip install -q streamlit \n", 90 | "# !npm install localtunnel # this is needed if you are working from colab" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 1, 96 | "metadata": { 97 | "colab": { 98 | "base_uri": "https://localhost:8080/" 99 | }, 100 | "id": "qjSlib36bJ70", 101 | "outputId": "9ccfe431-7738-459d-9d4f-b977c7fc7a86" 102 | }, 103 | "outputs": [ 104 | { 105 | "name": "stdout", 106 | "output_type": "stream", 107 | "text": [ 108 | "Overwriting app.py\n" 109 | ] 110 | } 111 | ], 112 | "source": [ 113 | "%%writefile app.py\n", 114 | "## import required components\n", 115 | "\n", 116 | "import pandas as pd\n", 117 | "from utils import (\n", 118 | " get_lines,\n", 119 | " load_data,\n", 120 | " get_relevant_documents,\n", 121 | " get_answer,\n", 122 | " create_db,\n", 123 | " sidebar,\n", 124 | ")\n", 125 | "import streamlit as st\n", 126 | "chroma_client, db = create_db()\n", 127 | "\n", 128 | "## Setup Page Header and Sidebar\n", 129 | "st.set_page_config(page_title=\"PersonalGPT\", page_icon=\"📖\", layout=\"wide\")\n", 130 | "lm_model = sidebar()\n", 131 | "st.header(f\"📖PersonalGPT\")\n", 132 | "st.markdown(f\">:zap: Responses Powered by **{lm_model}**\")\n", 133 | "\n", 134 | "if 'is_doc_uploaded' not in st.session_state:\n", 135 | " st.session_state['is_doc_uploaded'] = False\n", 136 | "\n", 137 | "\n", 138 | "## Add Uploader Component\n", 139 | "uploaded_file = st.file_uploader(\n", 140 | " \"Upload a txt file\",\n", 141 | " type=[\"txt\"],\n", 142 | " help=\"Text files with each sentence acting as a document\",\n", 143 | ")\n", 144 | "\n", 145 | "if not st.session_state['is_doc_uploaded']:\n", 146 | " ## Check if upload is complete\n", 147 | " if not uploaded_file:\n", 148 | " st.stop()\n", 149 | " \n", 150 | " ## Read uploaded file\n", 151 | " try:\n", 152 | " file_data = get_lines(uploaded_file)\n", 153 | " ## Verbose Status update\n", 154 | " st.markdown(f\"> Uploaded file has {len(file_data)} lines of text\")\n", 155 | " st.session_state['is_doc_uploaded'] = True\n", 156 | " except Exception as e:\n", 157 | " st.markdown(f\"Could not upload/read file={e}\")\n", 158 | " st.session_state['is_doc_uploaded'] = False\n", 159 | " \n", 160 | " ## Index Uploaded text file\n", 161 | " with st.spinner(\"Indexing document... 
This may take a while⏳\"):\n", 162 | " db_status_msg = load_data(db, documents=file_data)\n", 163 | " \n", 164 | " ## status update\n", 165 | " st.markdown(f\"> Database indexed {db.count()} documents\")\n", 166 | " if db.count() == 0:\n", 167 | " st.markdown(db_status_msg)\n", 168 | " st.session_state['is_doc_uploaded'] = False\n", 169 | "\n", 170 | "## Get User Input\n", 171 | "with st.form(key=\"qa_form\"):\n", 172 | " query = st.text_area(\"Enter Your Query:\",\n", 173 | " placeholder=\"Examples: \\nwhat is tf-idf?\\nwhich module covers RLHF\\nhow many moons does Jupiter have?\")\n", 174 | " submit = st.form_submit_button(\"Submit\")\n", 175 | "\n", 176 | "## Provide additional Options for citing source\n", 177 | "with st.expander(\"Advanced Options\"):\n", 178 | " show_source = st.checkbox(\"Show Source\")\n", 179 | "\n", 180 | "## Generate Output upon button click\n", 181 | "if submit:\n", 182 | " # Get relevant documents from DB\n", 183 | " context = get_relevant_documents(query, db)\n", 184 | "\n", 185 | " # get answer from LLM\n", 186 | " answer,score,error = get_answer(query,context,lm_model)\n", 187 | "\n", 188 | " # Showcase response on screen\n", 189 | " st.markdown(f\"**Answer:** _{answer}_\")\n", 190 | " st.markdown(f\"> **Relevance Score**:{score}\")\n", 191 | " st.markdown(\"---\")\n", 192 | "\n", 193 | " # Add more details if advanced option is chosen\n", 194 | " if show_source:\n", 195 | " st.markdown(\"**Source(s):**\")\n", 196 | " st.markdown(f\"- {context[:100]}...\", unsafe_allow_html=True)" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 3, 202 | "metadata": { 203 | "id": "mlGSHYN0bQSm" 204 | }, 205 | "outputs": [], 206 | "source": [ 207 | "# check the log file for localhost port\n", 208 | "# !streamlit run app.py &>logs.txt & " 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 30, 214 | "metadata": { 215 | "colab": { 216 | "base_uri": "https://localhost:8080/" 217 | }, 218 | "id": "b5dllFNabXhE", 219 | "outputId": "8dd09f2d-c36d-49f1-e25d-991b18d1574c" 220 | }, 221 | "outputs": [ 222 | { 223 | "name": "stdout", 224 | "output_type": "stream", 225 | "text": [ 226 | "\u001b[K\u001b[?25hnpx: installed 22 in 2.41s\n", 227 | "your url is: https://icy-heads-enjoy.loca.lt\n", 228 | "^C\n" 229 | ] 230 | } 231 | ], 232 | "source": [ 233 | "# ensure setup is complete and your have install nvm/node/npm and localtunnel\n", 234 | "!lt --port 8501" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [] 243 | } 244 | ], 245 | "metadata": { 246 | "colab": { 247 | "provenance": [] 248 | }, 249 | "kernelspec": { 250 | "display_name": "Python 3 (ipykernel)", 251 | "language": "python", 252 | "name": "python3" 253 | }, 254 | "language_info": { 255 | "codemirror_mode": { 256 | "name": "ipython", 257 | "version": 3 258 | }, 259 | "file_extension": ".py", 260 | "mimetype": "text/x-python", 261 | "name": "python", 262 | "nbconvert_exporter": "python", 263 | "pygments_lexer": "ipython3", 264 | "version": "3.11.9" 265 | } 266 | }, 267 | "nbformat": 4, 268 | "nbformat_minor": 4 269 | } 270 | -------------------------------------------------------------------------------- /module_04/06_supercharge_llm_apps.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "7c4a39ae-3cbf-4ee7-878c-79af5d013d56", 6 | "metadata": {}, 7 | "source": [ 8 | "# Supercharge 
LLM Apps with DSPy and Langfuse\n", 9 | "\n", 10 | "Prompt engineering, the art of crafting precise instructions for LLMs, can be a time-consuming and iterative process. Debugging and troubleshooting LLM behavior can also be complex, given the inherent \"black box\" nature of these models. Additionally, gaining insights into the performance and cost implications of LLM applications is crucial for optimization and scalability (key components for any production grade setup).\n", 11 | "\n", 12 | "## The LLM Ecosystem\n", 13 | "The ecosystem for LLMs is still in its nascent stages. To address some of these challenges, a number of innovative tools and frameworks are being developed. DSPy from Stanford University is one such unique take towards formalizing LLM-based app development. Langfuse on the other-hand has emerged as an offering to streamline and operationalize aspects of LLM app maintenance. To put it in brief: \n", 14 | "- **[DSPY](https://dspy-docs.vercel.app/)** provides a modular and composable framework for building LLM applications, abstracting away the complexities of prompt engineering and enabling developers to focus on the core logic of their applications.\n", 15 | "- **[Langfuse](https://langfuse.com/docs)** offers a comprehensive observability platform for LLM apps, providing deep insights into model performance, cost, and user interactions.\n", 16 | "\n", 17 | "By combining DSPy and Langfuse, developers can unlock the full potential of LLMs, building robust, scalable, and insightful applications that deliver exceptional user experiences." 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "id": "7a4d29a0-a37b-41fe-9c8e-b4cb858e5af4", 23 | "metadata": {}, 24 | "source": [ 25 | "### Langfuse Setup\n", 26 | "We will make use of self-hosting option for Langfuse. 
This is based on ``docker`` and ``docker compose``.\n", 27 | "Steps:\n", 28 | "- Clone the langfuse repository: ``git clone https://github.com/langfuse/langfuse.git``\n", 29 | "- From the langfuse repository: ``cd langfuse``\n", 30 | "- Start the docker containers: ``docker compose up``\n", 31 | "> The last step spins up a container for langfuse and another one for postgres, you may change settings using the ``.env`` or ``docker-compose.yml`` files" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "id": "144f3ae6-0f65-4c98-8d60-3bc2b15855b5", 37 | "metadata": {}, 38 | "source": [ 39 | "### Imports and Config" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "id": "1651dd12-e05b-4750-b02c-f64aca5d0741", 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "# !pip3 install dspy-ai=2.5.2\n", 50 | "# !pip3 install langfuse==2.51.2\n", 51 | "# pip3 install chromadb==0.5.5" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 18, 57 | "id": "38b72fd8-1510-4bc4-8c74-20165e6e9f8f", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "import os\n", 62 | "import sys\n", 63 | "import dspy\n", 64 | "from dsp.utils import deduplicate\n", 65 | "from dspy.retrieve.chromadb_rm import ChromadbRM\n", 66 | "from dsp.trackers.langfuse_tracker import LangfuseTracker\n", 67 | "\n", 68 | "import chromadb\n", 69 | "from chromadb.utils import embedding_functions\n", 70 | "\n", 71 | "from langfuse import Langfuse\n", 72 | "\n", 73 | "import random\n", 74 | "import itertools\n", 75 | "from scraper_utils import NB_Markdown_Scraper\n", 76 | "from IPython.display import display, Markdown" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 2, 82 | "id": "4657c235-76f0-4e36-a12a-37b4b1f01873", 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "config = {\n", 87 | " 'LANGFUSE_PUBLIC_KEY': 'XXXX',\n", 88 | " 'LANGFUSE_SECRET_KEY': 'XXXX',\n", 89 | " 'LANGFUSE_HOST': 'http://localhost:3000',\n", 90 | " 'OPENAI_API_KEY': 'XXXX',\n", 91 | " 'OPENAI_BASE_URL': '',\n", 92 | " 'OPENAI_PROVIDER': '',\n", 93 | " 'CHROMA_DB_PATH': './chromadb/',\n", 94 | " 'CHROMA_COLLECTION_NAME':\"supercharged_workshop_collection\",\n", 95 | " 'CHROMA_EMB_MODEL': 'all-MiniLM-L6-v2'\n", 96 | "}" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 3, 102 | "id": "4d991f5e-ed34-4c32-8ea9-8ee070f1a62b", 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "os.environ[\"LANGFUSE_PUBLIC_KEY\"] = config.get('LANGFUSE_PUBLIC_KEY')\n", 107 | "os.environ[\"LANGFUSE_SECRET_KEY\"] = config.get('LANGFUSE_SECRET_KEY')\n", 108 | "os.environ[\"LANGFUSE_HOST\"] = config.get('LANGFUSE_HOST')\n", 109 | "os.environ[\"OPENAI_API_KEY\"] = config.get('OPENAI_API_KEY')" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 4, 115 | "id": "3ff6ee91-2661-4697-8b73-a77f6747bc7e", 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "# setup Langfuse tracker\n", 120 | "langfuse_tracker = LangfuseTracker(session_id='supercharger001')" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 5, 126 | "id": "6cdcd002-9eae-4b9a-91be-e5204b0f8293", 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "# instantiate language-model for DSPY\n", 131 | "llm_model = dspy.OpenAI(\n", 132 | " api_key=config.get('OPENAI_API_KEY'),\n", 133 | " model='gpt-4o-mini'\n", 134 | ")" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "id": 
"10de99e6-b0c5-4c36-b4de-2ffdb45ee94f", 140 | "metadata": {}, 141 | "source": [ 142 | "## Prepare Dataset" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 11, 148 | "id": "8151ff1b-9660-4c1f-a42d-199ad6dd576c", 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "nb_scraper = NB_Markdown_Scraper([f'../module_0{i}' for i in range(1,5)])\n", 153 | "nb_scraper.scrape_markdowns()" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 12, 159 | "id": "ba7cdde6-8a0d-404e-a7ab-fb018fb7c1c1", 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "with open(\"./dspy_content.tsv\", \"w\") as record_file:\n", 164 | " for k,v in nb_scraper.notebook_md_dict.items():\n", 165 | " record_file.write(f\"{k}\\t{v}\\n\")" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 13, 171 | "id": "63769ba8-3eae-42ad-a248-a2675daf2a4f", 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "doc_ids = []\n", 176 | "ctr = 1\n", 177 | "for k,_ in nb_scraper.notebook_md_dict.items():\n", 178 | " doc_ids.append(f'{ctr}_{k}')\n", 179 | " ctr+= 1" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "id": "67bd03a7-e17a-46c4-af1b-ccb10c078cc8", 185 | "metadata": {}, 186 | "source": [ 187 | "### Ingest Data into ChromaDB\n", 188 | "> ensure Chroma is running in our terminal\n", 189 | "> ``$>chroma run --path ./chromadb``" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 6, 195 | "id": "3baf1c5c-5bdc-4454-981e-2584433e4538", 196 | "metadata": {}, 197 | "outputs": [ 198 | { 199 | "name": "stderr", 200 | "output_type": "stream", 201 | "text": [ 202 | "/Users/raghav.bali/.pyenv/versions/3.11.9/envs/datahack/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. 
For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n", 203 | " warnings.warn(\n" 204 | ] 205 | } 206 | ], 207 | "source": [ 208 | "chroma_emb_fn = embedding_functions.\\\n", 209 | " SentenceTransformerEmbeddingFunction(\n", 210 | " model_name=config.get(\n", 211 | " 'CHROMA_EMB_MODEL'\n", 212 | " )\n", 213 | " )\n", 214 | "client = chromadb.HttpClient()" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 11, 220 | "id": "bbdeb484-ae0d-401c-b8f0-f16d9c59b4bf", 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "# if collection exists\n", 225 | "collection = client.get_collection(config.get('CHROMA_COLLECTION_NAME'))" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 10, 231 | "id": "d5f12287-4d89-4b7a-b5e3-e04c8ba044ff", 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "collection = client.create_collection(\n", 236 | " config.get('CHROMA_COLLECTION_NAME'),\n", 237 | " embedding_function=chroma_emb_fn,\n", 238 | " metadata={\"hnsw:space\": \"cosine\"}\n", 239 | ")" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 19, 245 | "id": "93408574-7a22-45fc-82ba-d14feba27b91", 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "# Add to collection\n", 250 | "collection.add(\n", 251 | " documents=[v for _,v in nb_scraper.notebook_md_dict.items()], \n", 252 | " ids=doc_ids, # must be unique for each doc\n", 253 | ")" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "id": "728a44fe-1a49-402b-996c-3f5e518e5161", 259 | "metadata": {}, 260 | "source": [ 261 | "### Test Retrieval using ChromaDB Client" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 12, 267 | "id": "78abd266-b272-4fae-a1f4-6260e2903e91", 268 | "metadata": {}, 269 | "outputs": [ 270 | { 271 | "name": "stdout", 272 | "output_type": "stream", 273 | "text": [ 274 | "['6_module_03_03_RLHF_phi2', '10_module_04_06_supercharge_llm_apps', '2_module_01_02_getting_started']\n", 275 | "[0.6175035195275418, 0.7261012146561765, 0.8062081214907408]\n" 276 | ] 277 | } 278 | ], 279 | "source": [ 280 | "results = collection.query(\n", 281 | " query_texts=[\"RLHF\"], # Chroma will embed using the function we provided\n", 282 | " n_results=3 # how many results to return\n", 283 | ")\n", 284 | "print(results['ids'][0])\n", 285 | "print(results['distances'][0])\n", 286 | "#print([i[:100] for j in results['documents'] for i in j])" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "id": "cd096119-7012-4154-92fd-97ac8b94c7f6", 292 | "metadata": {}, 293 | "source": [ 294 | "### Setup ChromaDB as DSPy Retriever " 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 13, 300 | "id": "6cb9c5a1-1a56-49c6-a2f4-79dd4c3bc003", 301 | "metadata": {}, 302 | "outputs": [ 303 | { 304 | "data": { 305 | "text/markdown": [ 306 | "__Document__::# Quick Overview of RLFH\n", 307 | "\n", 308 | "The performance of Language Models until GPT-3 was kind of amazing as-is. ... 
\n" 309 | ], 310 | "text/plain": [ 311 | "" 312 | ] 313 | }, 314 | "metadata": {}, 315 | "output_type": "display_data" 316 | }, 317 | { 318 | "data": { 319 | "text/markdown": [ 320 | ">- __Document id__::6_module_03_03_RLHF_phi2 \n", 321 | ">- __Document score__::0.6174977412306334" 322 | ], 323 | "text/plain": [ 324 | "" 325 | ] 326 | }, 327 | "metadata": {}, 328 | "output_type": "display_data" 329 | }, 330 | { 331 | "data": { 332 | "text/markdown": [ 333 | "__Document__::... \n" 334 | ], 335 | "text/plain": [ 336 | "" 337 | ] 338 | }, 339 | "metadata": {}, 340 | "output_type": "display_data" 341 | }, 342 | { 343 | "data": { 344 | "text/markdown": [ 345 | ">- __Document id__::10_module_04_06_supercharge_llm_apps \n", 346 | ">- __Document score__::0.7260969660795557" 347 | ], 348 | "text/plain": [ 349 | "" 350 | ] 351 | }, 352 | "metadata": {}, 353 | "output_type": "display_data" 354 | }, 355 | { 356 | "data": { 357 | "text/markdown": [ 358 | "__Document__::# Getting Started : Text Representation\n", 359 | "\n", 360 | "\n", 361 | "\n", 362 | "The NLP domain ... \n" 363 | ], 364 | "text/plain": [ 365 | "" 366 | ] 367 | }, 368 | "metadata": {}, 369 | "output_type": "display_data" 370 | }, 371 | { 372 | "data": { 373 | "text/markdown": [ 374 | ">- __Document id__::2_module_01_02_getting_started \n", 375 | ">- __Document score__::0.8062083377747705" 376 | ], 377 | "text/plain": [ 378 | "" 379 | ] 380 | }, 381 | "metadata": {}, 382 | "output_type": "display_data" 383 | }, 384 | { 385 | "data": { 386 | "text/markdown": [ 387 | "__Document__::# Text Generation " 391 | ] 392 | }, 393 | "metadata": {}, 394 | "output_type": "display_data" 395 | }, 396 | { 397 | "data": { 398 | "text/markdown": [ 399 | ">- __Document id__::3_module_02_02_simple_text_generator \n", 400 | ">- __Document score__::0.8826038964887366" 401 | ], 402 | "text/plain": [ 403 | "" 404 | ] 405 | }, 406 | "metadata": {}, 407 | "output_type": "display_data" 408 | }, 409 | { 410 | "data": { 411 | "text/markdown": [ 412 | "__Document__::# DSPy: Beyond Prompting\n", 413 | "---\n", 414 | "" 418 | ] 419 | }, 420 | "metadata": {}, 421 | "output_type": "display_data" 422 | }, 423 | { 424 | "data": { 425 | "text/markdown": [ 426 | ">- __Document id__::12_module_04_05_dspy_demo \n", 427 | ">- __Document score__::0.9200280698248913" 428 | ], 429 | "text/plain": [ 430 | "" 431 | ] 432 | }, 433 | "metadata": {}, 434 | "output_type": "display_data" 435 | } 436 | ], 437 | "source": [ 438 | "retriever_model = ChromadbRM(\n", 439 | " config.get('CHROMA_COLLECTION_NAME'),\n", 440 | " config.get('CHROMA_DB_PATH'),\n", 441 | " embedding_function=chroma_emb_fn,\n", 442 | " client=client,\n", 443 | " k=5\n", 444 | ")\n", 445 | "\n", 446 | "# Test Retrieval\n", 447 | "results = retriever_model(\"RLHF\")\n", 448 | "for result in results:\n", 449 | " display(Markdown(f\"__Document__::{result.long_text[:100]}... 
\\n\"))\n", 450 | " display(Markdown(f\">- __Document id__::{result.id} \\n>- __Document score__::{result.score}\"))" 451 | ] 452 | }, 453 | { 454 | "cell_type": "markdown", 455 | "id": "f16c3643-bc0a-4f73-a32c-a846bd3e5882", 456 | "metadata": {}, 457 | "source": [ 458 | "## Prepare DSPy Program" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": 14, 464 | "id": "75724e2f-266b-4b10-9c93-c8186ede6de4", 465 | "metadata": {}, 466 | "outputs": [], 467 | "source": [ 468 | "# Set up the LM and RM\n", 469 | "dspy.settings.configure(lm=llm_model,rm=retriever_model)" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": 15, 475 | "id": "93e83804-a6d4-4a48-9924-294628eb3fa0", 476 | "metadata": {}, 477 | "outputs": [], 478 | "source": [ 479 | "class GenerateAnswer(dspy.Signature):\n", 480 | " \"\"\"Answer questions with short factoid answers.\"\"\"\n", 481 | "\n", 482 | " context = dspy.InputField(desc=\"may contain relevant facts\")\n", 483 | " question = dspy.InputField()\n", 484 | " answer = dspy.OutputField(desc=\"often less than 50 words\")" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": 16, 490 | "id": "01b93559-dc08-4b2c-a24b-7539c543ff26", 491 | "metadata": {}, 492 | "outputs": [], 493 | "source": [ 494 | "class RAG(dspy.Module):\n", 495 | " def __init__(self, num_passages=3):\n", 496 | " super().__init__()\n", 497 | "\n", 498 | " self.retrieve = dspy.Retrieve(k=num_passages)\n", 499 | " self.generate_answer = dspy.ChainOfThought(GenerateAnswer)\n", 500 | " \n", 501 | " def forward(self, question):\n", 502 | " context = self.retrieve(question).passages\n", 503 | " prediction = self.generate_answer(context=context, question=question)\n", 504 | " return dspy.Prediction(context=context, answer=prediction.answer)" 505 | ] 506 | }, 507 | { 508 | "cell_type": "markdown", 509 | "id": "2a1a3ba6-ce4f-4692-9732-6c77b7714c32", 510 | "metadata": {}, 511 | "source": [ 512 | "## Let us Answer Some Questions" 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": 69, 518 | "id": "5dd88f39-fcca-43fd-8aee-de19ddb17954", 519 | "metadata": {}, 520 | "outputs": [], 521 | "source": [ 522 | "compiled_rag = RAG()" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": 70, 528 | "id": "3f6ac3c3-e084-4a0a-8862-adc0184ed782", 529 | "metadata": {}, 530 | "outputs": [ 531 | { 532 | "data": { 533 | "text/markdown": [ 534 | "__Question__: List the models covered in module03" 535 | ], 536 | "text/plain": [ 537 | "" 538 | ] 539 | }, 540 | "metadata": {}, 541 | "output_type": "display_data" 542 | }, 543 | { 544 | "data": { 545 | "text/markdown": [ 546 | "__Predicted Answer__: _The models covered in module 03 include LLaMA 3.1, Chinchilla, and Gopher._" 547 | ], 548 | "text/plain": [ 549 | "" 550 | ] 551 | }, 552 | "metadata": {}, 553 | "output_type": "display_data" 554 | }, 555 | { 556 | "data": { 557 | "text/markdown": [ 558 | "__Retrieved Contexts (truncated):__" 559 | ], 560 | "text/plain": [ 561 | "" 562 | ] 563 | }, 564 | "metadata": {}, 565 | "output_type": "display_data" 566 | }, 567 | { 568 | "name": "stdout", 569 | "output_type": "stream", 570 | "text": [ 571 | "1. # Scaling Neural Nets and Efficient Training\n", 572 | "\n", 573 | "We have covered quite some ground in previous 2 modules and observed the steady increase in size and performance of the models. These gains come at huge c...\n", 574 | "\n", 575 | "2. 
# Prompt Engineering\n", 576 | "\n", 577 | "\n", 578 | "Prompt Engineering is this thrilling new discipline that opens the door to a world of possibilities with large language models (LLMs).\n", 579 | "\n", 580 | "As a pr...\n", 581 | "\n", 582 | "3. # Text Generation \n", 583 | " " 594 | ] 595 | }, 596 | "metadata": {}, 597 | "output_type": "display_data" 598 | }, 599 | { 600 | "data": { 601 | "text/markdown": [ 602 | "__Question__: Brief summary of module02" 603 | ], 604 | "text/plain": [ 605 | "" 606 | ] 607 | }, 608 | "metadata": {}, 609 | "output_type": "display_data" 610 | }, 611 | { 612 | "data": { 613 | "text/markdown": [ 614 | "__Predicted Answer__: _Module 02 focuses on text generation using pre-trained models like GPT-2, explaining foundation models, decoding strategies (greedy, beam search, sampling), and the impact of temperature on randomness. It also discusses limitations like long-range context and hallucination._" 615 | ], 616 | "text/plain": [ 617 | "" 618 | ] 619 | }, 620 | "metadata": {}, 621 | "output_type": "display_data" 622 | }, 623 | { 624 | "data": { 625 | "text/markdown": [ 626 | "__Retrieved Contexts (truncated):__" 627 | ], 628 | "text/plain": [ 629 | "" 630 | ] 631 | }, 632 | "metadata": {}, 633 | "output_type": "display_data" 634 | }, 635 | { 636 | "name": "stdout", 637 | "output_type": "stream", 638 | "text": [ 639 | "1. # Prompt Engineering\n", 640 | "\n", 641 | "\n", 642 | "Prompt Engineering is this thrilling new discipline that opens the door to a world of possibilities with large language models (LLMs).\n", 643 | "\n", 644 | "As a pr...\n", 645 | "\n", 646 | "2. # Text Generation \n", 647 | " " 662 | ] 663 | }, 664 | "metadata": {}, 665 | "output_type": "display_data" 666 | }, 667 | { 668 | "data": { 669 | "text/markdown": [ 670 | "__Question__: What is LLaMA?" 671 | ], 672 | "text/plain": [ 673 | "" 674 | ] 675 | }, 676 | "metadata": {}, 677 | "output_type": "display_data" 678 | }, 679 | { 680 | "data": { 681 | "text/markdown": [ 682 | "__Predicted Answer__: _LLaMA is a language model from Meta.AI, available in sizes 8B, 70B, and 405B, and it outperforms many existing LLMs on various benchmarks._" 683 | ], 684 | "text/plain": [ 685 | "" 686 | ] 687 | }, 688 | "metadata": {}, 689 | "output_type": "display_data" 690 | }, 691 | { 692 | "data": { 693 | "text/markdown": [ 694 | "__Retrieved Contexts (truncated):__" 695 | ], 696 | "text/plain": [ 697 | "" 698 | ] 699 | }, 700 | "metadata": {}, 701 | "output_type": "display_data" 702 | }, 703 | { 704 | "name": "stdout", 705 | "output_type": "stream", 706 | "text": [ 707 | "1. # Open Source Vs Close Sourced LLMs\n", 708 | "\n", 709 | "Similar to any other piece of technology, LLMs are available in all flavours and license types. While some of the most popular offerings are closed source (OpenAI ...\n", 710 | "\n", 711 | "2. # Scaling Neural Nets and Efficient Training\n", 712 | "\n", 713 | "We have covered quite some ground in previous 2 modules and observed the steady increase in size and performance of the models. These gains come at huge c...\n", 714 | "\n", 715 | "3. 
# Retrieval Augmented LLM App\n", 716 | "\n", 717 | "\n", 718 | "We have covered quite some ground in terms of understanding and building components for:\n", 719 | "- Text Representation\n", 720 | "- NLP Tasks\n", 721 | "- Pretrai...\n", 722 | "\n" 723 | ] 724 | }, 725 | { 726 | "data": { 727 | "text/markdown": [ 728 | "---" 729 | ], 730 | "text/plain": [ 731 | "" 732 | ] 733 | }, 734 | "metadata": {}, 735 | "output_type": "display_data" 736 | } 737 | ], 738 | "source": [ 739 | "my_questions = [\n", 740 | " \"List the models covered in module03\",\n", 741 | " \"Brief summary of module02\",\n", 742 | " \"What is LLaMA?\"\n", 743 | "]\n", 744 | "\n", 745 | "for question in my_questions:\n", 746 | " # Get the prediction. This contains `pred.context` and `pred.answer`.\n", 747 | " pred = compiled_rag(question)\n", 748 | " \n", 749 | " display(Markdown(f\"__Question__: {question}\"))\n", 750 | " display(Markdown(f\"__Predicted Answer__: _{pred.answer}_\"))\n", 751 | " display(Markdown(\"__Retrieved Contexts (truncated):__\"))\n", 752 | " for idx,cont in enumerate(pred.context):\n", 753 | " print(f\"{idx+1}. {cont[:200]}...\" )\n", 754 | " print()\n", 755 | " display(Markdown('---'))" 756 | ] 757 | }, 758 | { 759 | "attachments": {}, 760 | "cell_type": "markdown", 761 | "id": "5d3f76da-c5f0-49c6-8a30-3654b0e526d2", 762 | "metadata": {}, 763 | "source": [ 764 | "## Langfuse\n", 765 | "Understanding Costs\n", 766 | "\n", 767 | "\n", 768 | "\n", 769 | "---\n", 770 | "\n", 771 | "" 772 | ] 773 | }, 774 | { 775 | "cell_type": "markdown", 776 | "id": "3ec2ca40-dd85-4eaa-8f74-492e48149421", 777 | "metadata": {}, 778 | "source": [ 779 | "## Testing Langfuse Dataset using OpenLLaMA" 780 | ] 781 | }, 782 | { 783 | "cell_type": "code", 784 | "execution_count": 23, 785 | "id": "d259e844-2444-4568-887e-a656d464fa55", 786 | "metadata": {}, 787 | "outputs": [], 788 | "source": [ 789 | "langfuse =langfuse_tracker.langfuse\n", 790 | "ollama_dspy = dspy.OllamaLocal(model='llama3.1',temperature=0.5)\n", 791 | "\n", 792 | "# Set up the ollama as LM and RM\n", 793 | "dspy.settings.configure(lm=ollama_dspy,rm=retriever_model)" 794 | ] 795 | }, 796 | { 797 | "cell_type": "code", 798 | "execution_count": 24, 799 | "id": "0e41de51-1925-4b55-8ac6-5e79c530ec31", 800 | "metadata": {}, 801 | "outputs": [], 802 | "source": [ 803 | "# get annotated dataset\n", 804 | "annotated_dataset = langfuse.get_dataset(\"llm_workshop_rag\")" 805 | ] 806 | }, 807 | { 808 | "cell_type": "code", 809 | "execution_count": 25, 810 | "id": "b9b60273-b9d7-4c91-ada8-e30a3d8391d5", 811 | "metadata": {}, 812 | "outputs": [], 813 | "source": [ 814 | "# test rag using ollama\n", 815 | "ollama_rag = RAG()" 816 | ] 817 | }, 818 | { 819 | "cell_type": "code", 820 | "execution_count": 27, 821 | "id": "116ccd33-2c0b-47fe-9cc1-bfef0f8de96b", 822 | "metadata": {}, 823 | "outputs": [ 824 | { 825 | "data": { 826 | "text/markdown": [ 827 | "__Question__: Brief summary of module02" 828 | ], 829 | "text/plain": [ 830 | "" 831 | ] 832 | }, 833 | "metadata": {}, 834 | "output_type": "display_data" 835 | }, 836 | { 837 | "data": { 838 | "text/markdown": [ 839 | "__Predicted Answer (LLaMA 3.1)__: Here is a brief summary of module02:\n", 840 | "\n", 841 | "* LoRA (Low-Rank Adaptation) technique for fine-tuning large models:\n", 842 | "\t+ Freezes base model weights\n", 843 | "\t+ Decomposes weight update matrix into lower rank matrices, reducing updates by 100-1000x\n", 844 | "* qLoRA: Combines quantization and LoRA to further improve efficiency\n", 845 | "* Model 
Parameters:\n", 846 | "\t+ Model size: 405 billion parameters\n", 847 | "\t+ Training dataset: 15 trillion data points\n", 848 | "* GPU Performance and Compute Time:\n", 849 | "\t+ Compute required for training large models\n", 850 | "\t+ Cost of training large models\n", 851 | "* Scaling Laws:\n", 852 | "\t+ Insights from the paper \"Scaling Laws for Neural Language Models\"" 853 | ], 854 | "text/plain": [ 855 | "" 856 | ] 857 | }, 858 | "metadata": {}, 859 | "output_type": "display_data" 860 | }, 861 | { 862 | "data": { 863 | "text/markdown": [ 864 | ">__Annotated Answer (GPT-4o-mini)__: _Module 02 focuses on text generation using pre-trained models like GPT-2, explaining foundation models, decoding strategies (greedy, beam search, sampling), and the impact of temperature on randomness. It also discusses limitations like long-range context and hallucination._" 865 | ], 866 | "text/plain": [ 867 | "" 868 | ] 869 | }, 870 | "metadata": {}, 871 | "output_type": "display_data" 872 | }, 873 | { 874 | "data": { 875 | "text/markdown": [ 876 | "__Question__: What is LLaMA?" 877 | ], 878 | "text/plain": [ 879 | "" 880 | ] 881 | }, 882 | "metadata": {}, 883 | "output_type": "display_data" 884 | }, 885 | { 886 | "data": { 887 | "text/markdown": [ 888 | "__Predicted Answer (LLaMA 3.1)__: It seems like you're trying to follow along with a workshop on Large Language Models (LLMs) and their applications. However, the question about LLaMA was not fully answered.\n", 889 | "\n", 890 | "To provide a complete answer:\n", 891 | "\n", 892 | "Llama is a large language model developed by Meta AI. It's designed for natural language processing tasks such as text generation, translation, and more. Like other popular LLMs like BERT and RoBERTa, Llama uses self-supervised learning to learn patterns in language from vast amounts of text data.\n", 893 | "\n", 894 | "Now, let's get back to the original question: \"Fine-Tuning PEFT - SFT and LLM Landscape - Vector Databases - Libraries and Frameworks\".\n", 895 | "\n", 896 | "To answer this question:\n", 897 | "\n", 898 | "The topic seems to be" 899 | ], 900 | "text/plain": [ 901 | "" 902 | ] 903 | }, 904 | "metadata": {}, 905 | "output_type": "display_data" 906 | }, 907 | { 908 | "data": { 909 | "text/markdown": [ 910 | ">__Annotated Answer (GPT-4o-mini)__: _LLaMA is a language model from Meta.AI, available in sizes 8B, 70B, and 405B, and it outperforms many existing LLMs on various benchmarks._" 911 | ], 912 | "text/plain": [ 913 | "" 914 | ] 915 | }, 916 | "metadata": {}, 917 | "output_type": "display_data" 918 | } 919 | ], 920 | "source": [ 921 | "for item in annotated_dataset.items:\n", 922 | " question = item.input[0]['content'].split('Question: ')[-1].split('\\n')[0]\n", 923 | " answer = item.expected_output['content'].split('Answer: ')[-1]\n", 924 | " o_pred = ollama_rag(question)\n", 925 | " with item.observe(\n", 926 | " run_name='ollama_experiment',\n", 927 | " run_description='compare LLaMA3.1 RAG vs GPT4o-mini RAG ',\n", 928 | " run_metadata={\"model\": \"llama3.1\"},\n", 929 | " ) as trace_id:\n", 930 | " langfuse.score(\n", 931 | " name=\"visual-eval\",\n", 932 | " # any float value\n", 933 | " value=1.0,\n", 934 | " comment=\"LLaMA3.1 is very verbose\",\n", 935 | " )\n", 936 | " langfuse.trace(input=question,output=o_pred.answer,metadata={'model':'LLaMA3.1'})\n", 937 | " display(Markdown(f\"__Question__: {question}\"))\n", 938 | " display(Markdown(f\"__Predicted Answer (LLaMA 3.1)__: {o_pred.answer}\"))\n", 939 | " display(Markdown(f\">__Annotated 
Answer (GPT-4o-mini)__: _{answer}_\"))" 940 | ] 941 | }, 942 | { 943 | "cell_type": "code", 944 | "execution_count": null, 945 | "id": "b56d5e1a-7713-4056-b4af-e8f9363cd1a6", 946 | "metadata": {}, 947 | "outputs": [], 948 | "source": [] 949 | } 950 | ], 951 | "metadata": { 952 | "kernelspec": { 953 | "display_name": "Python 3 (ipykernel)", 954 | "language": "python", 955 | "name": "python3" 956 | }, 957 | "language_info": { 958 | "codemirror_mode": { 959 | "name": "ipython", 960 | "version": 3 961 | }, 962 | "file_extension": ".py", 963 | "mimetype": "text/x-python", 964 | "name": "python", 965 | "nbconvert_exporter": "python", 966 | "pygments_lexer": "ipython3", 967 | "version": "3.11.9" 968 | } 969 | }, 970 | "nbformat": 4, 971 | "nbformat_minor": 5 972 | } 973 | -------------------------------------------------------------------------------- /module_04/app.py: -------------------------------------------------------------------------------- 1 | ## import required components 2 | 3 | import pandas as pd 4 | from utils import ( 5 | get_lines, 6 | load_data, 7 | get_relevant_documents, 8 | get_answer, 9 | create_db, 10 | sidebar, 11 | ) 12 | import streamlit as st 13 | chroma_client, db = create_db() 14 | 15 | ## Setup Page Header and Sidebar 16 | st.set_page_config(page_title="PersonalGPT", page_icon="📖", layout="wide") 17 | lm_model = sidebar() 18 | st.header(f"📖PersonalGPT") 19 | st.markdown(f">:zap: Responses Powered by **{lm_model}**") 20 | 21 | if 'is_doc_uploaded' not in st.session_state: 22 | st.session_state['is_doc_uploaded'] = False 23 | 24 | 25 | ## Add Uploader Component 26 | uploaded_file = st.file_uploader( 27 | "Upload a txt file", 28 | type=["txt"], 29 | help="Text files with each sentence acting as a document", 30 | ) 31 | 32 | if not st.session_state['is_doc_uploaded']: 33 | ## Check if upload is complete 34 | if not uploaded_file: 35 | st.stop() 36 | 37 | ## Read uploaded file 38 | try: 39 | file_data = get_lines(uploaded_file) 40 | ## Verbose Status update 41 | st.markdown(f"> Uploaded file has {len(file_data)} lines of text") 42 | st.session_state['is_doc_uploaded'] = True 43 | except Exception as e: 44 | st.markdown(f"Could not upload/read file={e}") 45 | st.session_state['is_doc_uploaded'] = False 46 | 47 | ## Index Uploaded text file 48 | with st.spinner("Indexing document... 
This may take a while⏳"): 49 | db_status_msg = load_data(db, documents=file_data) 50 | 51 | ## status update 52 | st.markdown(f"> Database indexed {db.count()} documents") 53 | if db.count() == 0: 54 | st.markdown(db_status_msg) 55 | st.session_state['is_doc_uploaded'] = False 56 | 57 | ## Get User Input 58 | with st.form(key="qa_form"): 59 | query = st.text_area("Enter Your Query:",placeholder="Examples: \nwhat is tf-idf?\nwhich module covers RLHF\nhow many moons does Jupiter have?") 60 | submit = st.form_submit_button("Submit") 61 | 62 | ## Provide additional Options for citing source 63 | with st.expander("Advanced Options"): 64 | show_source = st.checkbox("Show Source") 65 | 66 | ## Generate Output upon button click 67 | if submit: 68 | # Get relevant documents from DB 69 | context = get_relevant_documents(query, db) 70 | 71 | # get answer from LLM 72 | answer,score,error = get_answer(query,context,lm_model) 73 | 74 | # Showcase response on screen 75 | st.markdown(f"**Answer:** _{answer}_") 76 | st.markdown(f"> **Relevance Score**:{score}") 77 | st.markdown("---") 78 | 79 | # Add more details if advanced option is chosen 80 | if show_source: 81 | st.markdown("**Source(s):**") 82 | st.markdown(f"- {context[:100]}...", unsafe_allow_html=True) 83 | -------------------------------------------------------------------------------- /module_04/assets/chroma_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/chroma_workflow.png -------------------------------------------------------------------------------- /module_04/assets/cot_few_shot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/cot_few_shot.png -------------------------------------------------------------------------------- /module_04/assets/dspy_banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/dspy_banner.png -------------------------------------------------------------------------------- /module_04/assets/dspy_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/dspy_logo.png -------------------------------------------------------------------------------- /module_04/assets/dspy_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/dspy_workflow.png -------------------------------------------------------------------------------- /module_04/assets/langchain_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/langchain_workflow.png -------------------------------------------------------------------------------- /module_04/assets/langfuse_dashboard.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/langfuse_dashboard.png -------------------------------------------------------------------------------- /module_04/assets/langfuse_traces.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/langfuse_traces.png -------------------------------------------------------------------------------- /module_04/assets/llama_setup_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/llama_setup_1.png -------------------------------------------------------------------------------- /module_04/assets/llama_setup_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/llama_setup_2.png -------------------------------------------------------------------------------- /module_04/assets/llama_setup_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/llama_setup_3.png -------------------------------------------------------------------------------- /module_04/assets/llama_setup_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/llama_setup_4.png -------------------------------------------------------------------------------- /module_04/assets/llama_setup_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/llama_setup_5.png -------------------------------------------------------------------------------- /module_04/assets/mteb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/mteb.png -------------------------------------------------------------------------------- /module_04/assets/pe_banner.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/pe_banner.jpg -------------------------------------------------------------------------------- /module_04/assets/pe_types.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/pe_types.jpg -------------------------------------------------------------------------------- /module_04/assets/prompt_hacking_reddit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/prompt_hacking_reddit.png -------------------------------------------------------------------------------- 
/module_04/assets/prompt_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/prompt_workflow.png -------------------------------------------------------------------------------- /module_04/assets/rap_banner.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/rap_banner.jpeg -------------------------------------------------------------------------------- /module_04/assets/react_sample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/react_sample.png -------------------------------------------------------------------------------- /module_04/assets/training_is_hard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/training_is_hard.png -------------------------------------------------------------------------------- /module_04/assets/vector_banner.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/vector_banner.jpg -------------------------------------------------------------------------------- /module_04/constants.py: -------------------------------------------------------------------------------- 1 | ##################### 2 | ## Set Constants 3 | ##################### 4 | HF_TOKEN = '' 5 | OPENAI_TOKEN = '' 6 | HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"} 7 | 8 | # Constants for embedding model 9 | EMB_MODEL_ID = 'pinecone/mpnet-retriever-discourse' 10 | EMB_API_URL = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{EMB_MODEL_ID}" 11 | 12 | # Constants for QA model 13 | QA_MODEL_ID = 'deepset/roberta-base-squad2' 14 | 15 | # List of Different Endpoints 16 | HF_QA_ENDPOINT = 'HF-QA' 17 | HF_LM_ENDPOINT = 'HF-LM' 18 | OPENAI_ENDPOINT = 'OPENAI-LM' 19 | LOCAL_OLLAMA_ENDPOINT = 'OLLAMA' 20 | AVAILABLE_LMs = { 21 | 'models': 22 | [ 23 | 'deepset/roberta-base-squad2', 24 | 'Intel/dynamic_tinybert', 25 | #'google/gemma-2-2b-it', # this is timing out mostly 26 | 'Local-LLAMA-3.1:8b', 27 | 'OpenAI-GPT4o-mini' 28 | ], 29 | 'endpoints': 30 | [ 31 | HF_QA_ENDPOINT, 32 | HF_QA_ENDPOINT, 33 | #HF_LM_ENDPOINT, #this is timing out mostly 34 | LOCAL_OLLAMA_ENDPOINT, 35 | OPENAI_ENDPOINT, 36 | ] 37 | } -------------------------------------------------------------------------------- /module_04/scraper_utils.py: -------------------------------------------------------------------------------- 1 | # Adapted From: https://gist.github.com/psychemedia/925e190e2afd15b050f32334ceff9ef6 2 | import os 3 | import nbformat 4 | 5 | class NB_Markdown_Scraper: 6 | 7 | def __init__(self,input_paths=None): 8 | self.notebook_md_dict = dict() 9 | self.input_paths = input_paths 10 | 11 | def nbpathwalk(self,path): 12 | ''' Walk down a directory path looking for ipynb notebook files... 
''' 13 | valid_notebook_files = [] 14 | for path, _, files in os.walk(path): 15 | if '.ipynb_checkpoints' in path or 'solutions' in path : continue 16 | for f in [i for i in files if i.endswith('.ipynb') and not i.startswith('dontcommit')]: 17 | valid_notebook_files.append(os.path.join(path, f)) 18 | return valid_notebook_files 19 | 20 | 21 | def get_cell_contents(self,nb_fn, c_md=None, cell_typ=None): 22 | ''' Extract the content of Jupyter notebook cells. ''' 23 | if cell_typ is None: cell_typ=['markdown'] 24 | if c_md is None: c_md = [] 25 | nb=nbformat.read(nb_fn,nbformat.NO_CONVERT) 26 | _c_md=[i for i in nb.cells if i['cell_type'] in cell_typ] 27 | ix=len(c_md) 28 | for c in _c_md: 29 | c.update( {"ix":str(ix)}) 30 | c.update( {"title":nb_fn}) 31 | ix = ix+1 32 | c_md = c_md + _c_md 33 | return c_md 34 | 35 | 36 | # scraper 37 | def scrape_markdowns(self): 38 | for directory in self.input_paths: 39 | directory_notebooks = self.nbpathwalk(directory) 40 | for notebook in directory_notebooks: 41 | notebook_cells = self.get_cell_contents(notebook, cell_typ=['markdown']) 42 | notebook_name = '_'.join(notebook.split('/')[1:]).split('.')[0] 43 | self.notebook_md_dict[notebook_name] = ' '.join([cell['source'] for cell in sorted(notebook_cells, 44 | key=lambda d: d['ix'])]) 45 | -------------------------------------------------------------------------------- /module_04/utils.py: -------------------------------------------------------------------------------- 1 | ##################### 2 | ## imports 3 | ##################### 4 | import pandas as pd 5 | import json 6 | import requests 7 | from retry import retry 8 | import streamlit as st 9 | import chromadb.utils.embedding_functions as embedding_functions 10 | from huggingface_hub import InferenceClient 11 | from openai import OpenAI 12 | import ollama 13 | from constants import ( 14 | HF_TOKEN, 15 | OPENAI_TOKEN, 16 | HEADERS, 17 | EMB_MODEL_ID, 18 | EMB_API_URL, 19 | QA_MODEL_ID, 20 | HF_QA_ENDPOINT, 21 | HF_LM_ENDPOINT, 22 | OPENAI_ENDPOINT, 23 | LOCAL_OLLAMA_ENDPOINT, 24 | AVAILABLE_LMs) 25 | 26 | 27 | import chromadb 28 | 29 | 30 | lm_df = pd.DataFrame.from_dict(AVAILABLE_LMs) 31 | 32 | ##################### 33 | ## Utility Functions 34 | ##################### 35 | 36 | def get_lines(uploaded_file): 37 | """ 38 | Utility to read raw text file in binary 39 | """ 40 | raw_data = [] 41 | for line in uploaded_file: 42 | raw_data.append(line.decode("utf-8") ) 43 | return raw_data 44 | 45 | def create_db(): 46 | """ 47 | Utility to instantiate vector db client and collection 48 | """ 49 | chroma_client = chromadb.Client() 50 | # huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction( 51 | # api_key=HF_TOKEN, 52 | # model_name=EMB_MODEL_ID 53 | # ) 54 | db = chroma_client.get_or_create_collection(name="nlp_llm_workshop",) 55 | #embedding_function=huggingface_ef) 56 | return chroma_client,db 57 | 58 | def load_data(db, documents): 59 | """ 60 | Utility to add/index data into vector db 61 | """ 62 | try: 63 | db.add( 64 | documents=documents, 65 | ids=[str(i) for i in range(len(documents))] 66 | ) 67 | except Exception as ex: 68 | return "Apologies but I could not ingest document", 0.0, ex 69 | 70 | def get_relevant_documents(query, db): 71 | """ 72 | Utility to retrieve relevant documents from vector DB 73 | """ 74 | try: 75 | relevant_doc = db.query(query_texts=[query], n_results=1)['documents'][0][0] 76 | return relevant_doc 77 | except Exception as ex: 78 | return "Apologies but I could not process your query", 0.0, ex 79 | 80 | def 
get_hf_qa_answer(payload,lm_model): 81 | data = json.dumps(payload) 82 | try: 83 | QA_API_URL = f"https://api-inference.huggingface.co/models/{lm_model}" 84 | response = requests.request("POST", QA_API_URL, headers=HEADERS, data=data) 85 | decoded_response = json.loads(response.content.decode("utf-8")) 86 | return decoded_response['answer'], decoded_response['score'], "" 87 | except Exception as ex: 88 | return "Apologies but I could not find any relevant answer", 0.0, ex 89 | 90 | # this is mostly timing out 91 | def get_hf_llm_answer(payload,lm_model): 92 | try: 93 | client = InferenceClient( 94 | "google/gemma-2-2b-it", 95 | token=HF_TOKEN,) 96 | 97 | content = f"Given the context, answer the question. \ncontext:{payload['context']}\nquestion:{payload['question']}" 98 | response = client.chat_completion( 99 | messages=[{"role": "user", "content": content}], 100 | max_tokens=500, 101 | stream=False, 102 | ) 103 | # non-streaming response: answer text is in choices[0].message.content; no relevance score available here 104 | return response.choices[0].message.content, 0.0, "" 105 | except Exception as ex: 106 | return "Apologies but I could not find any relevant answer", 0.0, ex 107 | 108 | def get_local_llama_answer(payload,lm_model): 109 | try: 110 | content = f"Given the context, perform the following tasks:1.Respond with a summarized answer to the question factually in few words only if the provided context contains the answer\n2.Check if your answer is really in the provided context, otherwise respond with 'Sorry I could not find the answer'.\n 3.Generate a relevance score between 0 and 1.\n4. Format the output as a json with answer and score as keys.\n5.Do not add markdown syntax only respond with json.\nBe careful and Think step by step.\ncontext:{payload['context']}\nquestion:{payload['question']}" 111 | response = ollama.chat(model='llama3.1:8b', messages=[ 112 | { 113 | 'role': 'user', 114 | 'content': content, 115 | }, 116 | ] 117 | ) 118 | json_output = json.loads(response['message']['content']) 119 | return json_output['answer'], json_output['score'], "" 120 | except Exception as ex: 121 | st.markdown(ex) 122 | return "Apologies but I could not find any relevant answer", 0.0, ex 123 | 124 | def get_opeai_answer(payload,lm_model): 125 | try: 126 | client = OpenAI( 127 | api_key=OPENAI_TOKEN, 128 | ) 129 | content = f"Given the context, perform the following tasks:1.Respond with a summarized answer to the question factually in few words only if the provided context contains the answer\n 2.Generate a relevance score.\n3. Format the output as a json with answer and score as keys. 
Do not add markdown syntax.\nThink step by step.\ncontext:{payload['context']}\nquestion:{payload['question']}" 130 | chat_completion = client.chat.completions.create( 131 | messages=[ 132 | { 133 | "role": "user", 134 | "content": content, 135 | } 136 | ], 137 | model="gpt-4o-mini", 138 | ) 139 | json_output = json.loads(chat_completion.choices[0].message.content) 140 | return json_output['answer'], json_output['score'], "" 141 | except Exception as ex: 142 | return "Apologies but I could not find any relevant answer", 0.0, ex 143 | 144 | 145 | def get_answer(question,context,lm_model): 146 | """ 147 | Utility to leverage QA model for answering question using given context 148 | and the mentioned model 149 | """ 150 | payload = { 151 | "question": question, 152 | "context":context 153 | } 154 | try: 155 | endpoint_type = lm_df[lm_df['models']==lm_model]['endpoints'].values[0] 156 | if endpoint_type == HF_QA_ENDPOINT: 157 | return get_hf_qa_answer(payload,lm_model) 158 | elif endpoint_type == HF_LM_ENDPOINT: 159 | return get_hf_llm_answer(payload,lm_model) 160 | elif endpoint_type == OPENAI_ENDPOINT: 161 | return get_opeai_answer(payload,lm_model) 162 | elif endpoint_type == LOCAL_OLLAMA_ENDPOINT: 163 | return get_local_llama_answer(payload,lm_model) 164 | else: 165 | return "This is not implemented yet", 0.0, "" 166 | except Exception as ex: 167 | return "Apologies but I could not find any relevant answer", 0.0, ex 168 | 169 | 170 | def sidebar(): 171 | """ 172 | Utility to add content to sidebar 173 | """ 174 | with st.sidebar: 175 | st.markdown( 176 | "## How to use\n" 177 | "1. Upload a txt file📄\n" 178 | "2. Ask a question about the document💬\n" 179 | ) 180 | st.markdown("---") 181 | st.markdown("## Which LM would you like to use?") 182 | option = st.selectbox( 183 | "Select a Model", 184 | lm_df['models'], 185 | label_visibility='hidden' 186 | ) 187 | 188 | st.markdown("---") 189 | st.markdown("# About") 190 | st.markdown( 191 | "📖PersonalGPT is a demo to showcase a retrieval augmented question answering system" 192 | ) 193 | st.markdown(":heart: Made by [raghav bali](https://raghavbali.github.io)") 194 | st.markdown("---") 195 | 196 | return option -------------------------------------------------------------------------------- /module_05/whats_next.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_05/whats_next.pdf -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "llm-workshop" 3 | version = "0.1.0" 4 | description = "LLM Workshop 2024 by raghavbali.github.io" 5 | authors = ["raghavbali "] 6 | readme = "README.md" 7 | package-mode = false 8 | 9 | [tool.poetry.dependencies] 10 | python = "^3.11" 11 | pandas = "^2.2.2" 12 | matplotlib = "^3.9.0" 13 | seaborn = "^0.13.2" 14 | torch = "^2.3.1" 15 | transformers = "4.44.0" 16 | torchtext = "^0.18.0" 17 | gensim = "^4.3.2" 18 | jupyterlab = "^4.2.3" 19 | jupyter = "^1.0.0" 20 | nltk = "^3.8.1" 21 | scikit-learn = "^1.5.1" 22 | scipy = "1.12" 23 | scienceplots = "^2.1.1" 24 | fasttext = "^0.9.3" 25 | datasets = "^2.20.0" 26 | accelerate = "^0.33.0" 27 | peft = "^0.12.0" 28 | trl = "^0.9.6" 29 | tensorboardx = "^2.6.2.2" 30 | gpt4all = "^2.7.0" 31 | ollama = "^0.3.1" 32 | dspy-ai = "2.4.10" 33 | langchain = "^0.2.12" 34 | langchain-community = "^0.2.11" 
35 | langchain-openai = "^0.1.20" 36 | chromadb = "^0.5.5" 37 | openai = "^1.38.0" 38 | streamlit = "^1.37.0" 39 | retry = "^0.9.2" 40 | sentence-transformers = "^3.0.1" 41 | langchainhub = "^0.1.20" 42 | watchdog = "^4.0.1" 43 | bitsandbytes = "0.43.3" 44 | 45 | 46 | [build-system] 47 | requires = ["poetry-core"] 48 | build-backend = "poetry.core.masonry.api" 49 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | 2 | pandas==2.2.2 3 | matplotlib==3.9.0 4 | seaborn==0.13.2 5 | torch==2.3.1 6 | transformers==4.44.0 7 | torchtext==0.18.0 8 | gensim==4.3.2 9 | jupyterlab==4.2.3 10 | jupyter==1.0.0 11 | nltk==3.8.1 12 | scikit-learn==1.5.1 13 | scipy==1.12 14 | scienceplots==2.1.1 15 | fasttext==0.9.3 16 | datasets==2.20.0 17 | accelerate==0.33.0 18 | peft==0.12.0 19 | trl==0.9.6 20 | tensorboardx==2.6.2.2 21 | gpt4all==2.7.0 22 | ollama==0.3.1 23 | dspy-ai==2.4.10 24 | langchain==0.2.12 25 | langchain-community==0.2.11 26 | langchain-openai==0.1.20 27 | chromadb==0.5.5 28 | openai==1.38.0 29 | streamlit==1.37.0 30 | retry==0.9.2 31 | sentence-transformers==3.0.1 32 | langchainhub==0.1.20 33 | watchdog==4.0.1 34 | bitsandbytes==0.43.3 -------------------------------------------------------------------------------- /workshop_introduction.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/workshop_introduction.pdf --------------------------------------------------------------------------------