├── .gitignore ├── Makefile ├── README.md ├── module_01 ├── 01_introduction.pdf ├── 02_getting_started.ipynb ├── 03_explore_transformers.ipynb ├── assets │ ├── banner_notebook_1.jpg │ ├── bert_models_layout_notebook_3.jpeg │ ├── cbow_arch_notebook_1.png │ ├── encoder_decoder_notebook_3.png │ ├── img_2_notebook_1.jpg │ ├── karpathy_emoji_tokenizer.jpeg │ ├── lm_training_notebook_3.png │ ├── multihead_attention_notebook_3.png │ ├── positional_emb_notebook_3.png │ ├── skipgram_arch_notebook_1.png │ └── transformer_arch_notebook_3.png ├── sherlock_homes.txt └── solutions │ ├── 02_getting_started.ipynb │ ├── 03_explore_transformers.ipynb │ ├── norm_corpus.txt │ └── sherlock_homes.txt ├── module_02 ├── 01_llm_overview.pdf ├── 02_simple_text_generator.ipynb ├── assets │ └── beamsearch_nb_2.png └── solutions │ └── 02_simple_text_generator.ipynb ├── module_03 ├── 01_llm_training_and_scaling.ipynb ├── 02_instruction_tuning_llama_t2sql.ipynb ├── 03_RLHF_phi2.ipynb ├── assets │ ├── chinchilla.png │ ├── cost_tweet.png │ ├── instruct_gpt_rlhf.png │ ├── lora_1.png │ ├── quantization.png │ ├── scaling_laws.png │ ├── soft_prompting_1.png │ ├── soft_prompting_2.png │ └── soft_prompting_perf.png ├── solutions │ ├── 01_llm_training_and_scaling.ipynb │ └── utils.py └── utils.py ├── module_04 ├── 01_prompt_engineeering_and_langchain.ipynb ├── 02_vector_databases_hf_inference_endpoint.ipynb ├── 03_OpenSource_ClosedSource_LLMs.ipynb ├── 04_retrieval_augmented_llm_app.ipynb ├── 05_dspy_demo.ipynb ├── 06_supercharge_llm_apps.ipynb ├── app.py ├── assets │ ├── chroma_workflow.png │ ├── cot_few_shot.png │ ├── dspy_banner.png │ ├── dspy_logo.png │ ├── dspy_workflow.png │ ├── langchain_workflow.png │ ├── langfuse_dashboard.png │ ├── langfuse_traces.png │ ├── llama_setup_1.png │ ├── llama_setup_2.png │ ├── llama_setup_3.png │ ├── llama_setup_4.png │ ├── llama_setup_5.png │ ├── mteb.png │ ├── pe_banner.jpg │ ├── pe_types.jpg │ ├── prompt_hacking_reddit.png │ ├── prompt_workflow.png │ ├── rap_banner.jpeg │ ├── react_sample.png │ ├── training_is_hard.png │ └── vector_banner.jpg ├── constants.py ├── llm_material.txt ├── scraper_utils.py └── utils.py ├── module_05 └── whats_next.pdf ├── poetry.lock ├── pyproject.toml ├── requirements.txt └── workshop_introduction.pdf /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | *crswap* 11 | .DS_Store 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | share/python-wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | *.py,cover 52 | .hypothesis/ 53 | .pytest_cache/ 54 | cover/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | .pybuilder/ 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | # For a library or package, you might want to ignore these files since the code is 89 | # intended to run in multiple environments; otherwise, check them in: 90 | # .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # poetry 100 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 101 | # This is especially recommended for binary packages to ensure reproducibility, and is more 102 | # commonly ignored for libraries. 103 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 104 | #poetry.lock 105 | 106 | # pdm 107 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 108 | #pdm.lock 109 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 110 | # in version control. 111 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 112 | .pdm.toml 113 | .pdm-python 114 | .pdm-build/ 115 | 116 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 117 | __pypackages__/ 118 | 119 | # Celery stuff 120 | celerybeat-schedule 121 | celerybeat.pid 122 | 123 | # SageMath parsed files 124 | *.sage.py 125 | 126 | # Environments 127 | .env 128 | .venv 129 | env/ 130 | venv/ 131 | ENV/ 132 | env.bak/ 133 | venv.bak/ 134 | 135 | # Spyder project settings 136 | .spyderproject 137 | .spyproject 138 | 139 | # Rope project settings 140 | .ropeproject 141 | 142 | # mkdocs documentation 143 | /site 144 | 145 | # mypy 146 | .mypy_cache/ 147 | .dmypy.json 148 | dmypy.json 149 | 150 | # Pyre type checker 151 | .pyre/ 152 | 153 | # pytype static type analyzer 154 | .pytype/ 155 | 156 | # Cython debug symbols 157 | cython_debug/ 158 | 159 | # PyCharm 160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 162 | # and can be added to the global gitignore or merged into this file. For a more nuclear 163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
164 | #.idea/ 165 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: setup runpod_setup tests 2 | 3 | setup: 4 | @echo "Setting up the environment..." 5 | pyenv install 3.11.9 6 | pyenv virtualenv 3.11.9 datahack 7 | pyenv activate datahack 8 | poetry install 9 | 10 | runpod_setup: 11 | @echo "Setting up runpod environment..." 12 | @echo "Step 1 python dependencies..." 13 | pip install -r requirements.txt 14 | @echo "Step 2 nodes/npm dependencies..." 15 | wget -qO- https://raw.githubusercontent.com/nvm-sh/nvm/v0.40.0/install.sh | bash 16 | nvm install node 17 | npm install localtunnel 18 | @echo "Step 2 ollama dependencies..." 19 | curl -fsSL https://ollama.com/install.sh | sh 20 | export OLLAMA_MODELS=/workspace 21 | ollama pull llama3.1:8b 22 | @echo "Done!" 23 | 24 | tests: 25 | @echo "Running tests..." 26 | poetry run pytest --disable-warnings -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LLM Workshop 2024 2 | 3 | > [!IMPORTANT] 4 | > :dart: [DataHack Summit 2024](https://www.analyticsvidhya.com/datahacksummit/workshops/unleashing-llms-training-finetuning-and-evaluating) | :calendar: August 10 2024 | :round_pushpin: Bengaluru, India 5 | 6 | > Explore this comprehensive repository on LLMs, covering everything from the basics of NLP to fine-tuning and even RLHF. If you find the resources helpful, consider giving it a star ⭐ to show your support and help others discover it. 7 | --- 8 | ## Table of Contents 9 | - [Modules](#modules) 10 | - [Prerequisites](#prerequisites) 11 | - [Environment Setup](#environment-setup-instructions) 12 | 13 | --- 14 | 15 | ### Modules 16 | #### Module 1: "Foundations of Generative AI and Language Models" 17 | - [x] Overview of Generative AI and the basics of language modeling. 18 | - [x] :star: Hands-On: 19 | - [x] Getting Started: Text Representation 20 | - [x] Language Modeling Basics and Text Generation using a basic LM. 21 | 22 | #### Module 2: "Building Blocks of LLMs" 23 | - [x] Transformer Architectures: Detailed look into the Transformer architecture that powers modern LLMs. 24 | - [x] GPT Series of Models: Overview of the evolution of GPT models. 25 | - [x] Evaluation Metrics and Benchmarks: Methods to evaluate and benchmark LLM performance. 26 | - [x] :star: Hands-On: Training a mini Transformer model and experimenting with GPT-2 for text generation. 27 | 28 | #### Module 3: "Advanced LLM Techniques" 29 | - [x] Training Process and Scaling Laws: Understand how LLMs are trained and the laws governing their scaling. 30 | - [x] PEFT: Learn Parameter-Efficient Fine-Tuning methods. 31 | - [x] LoRA: Introduction to Low-Rank Adaptation. 32 | - [x] Instruction Tuning: Techniques for fine-tuning models using instructions. 33 | - [x] RLHF: Reinforcement Learning from Human Feedback and its applications. 34 | - [x] :star: Hands-On: 35 | - [x] Instruction Tuning: Text 2 SQL using LLaMA3.1 36 | - [x] RLHF Hands-on: Sentiment aligment for generating movie reviews 37 | 38 | #### Module 4: "Operationalizing LLMs" 39 | - [x] Prompt Engineering: Crafting effective prompts to get desired outputs. 40 | - [x] Prompt Hacking and Backdoors 41 | - [x] Vector Databases: Using vector databases for efficient data retrieval. 42 | - [x] RAGs: Techniques for retrieval-augmented generation. 
43 | - [x] Beyond Prompting: Understanding Frameworks such as DSPY 44 | - [x] :star: Hands-On: 45 | - [x] Implementing basic prompt engineering techniques and 46 | - [x] Building a simple RAG system. 47 | - [x] Hands-on with DSPY 48 | 49 | #### Module 5: "The Future of LLMs and Next Steps" 50 | - Next Steps: Speculative topics on future advancements. 51 | - Beyond: Future possibilities and directions for LLM research. 52 | 53 | --- 54 | 55 | ### Prerequisites 56 | - Basic hands-on experience working with Python 57 | - Basic understanding of linear algebra and machine learning 58 | - Basic understanding of Deep Neural Networks 59 | - Basic hands-on experience with PyTorch 60 | - Access to Google Colab or a similar Python environment 61 | - Access to ChatGPT or Google Bard (free access) 62 | 63 | --- 64 | 65 | ## Environment Setup Instructions 66 | 67 | > [!Important] 68 | > - Follow the steps below for a quick setup. This should work as-is for Mac/Linux based systems. 69 | > - If you already have your own way of managing dependencies, check out pyproject.toml for poetry or requirements.txt for pip-based systems 70 | > - The requirements.txt file is generated using the command ``poetry export --without-hashes --format=requirements.txt > requirements.txt`` 71 | 72 | - **We will make use of**: 73 | - ``pyenv`` for python version management 74 | - ``virtualenv`` for virtual environment management 75 | - ``poetry`` for dependency management 76 | 77 | - **Pyenv**: 78 | - ``brew install pyenv`` or ``curl https://pyenv.run | bash`` 79 | - **VirtualEnv**: 80 | - install: 81 | - ``brew install pyenv-virtualenv`` or 82 | - ``git clone https://github.com/pyenv/pyenv-virtualenv.git $(pyenv root)/plugins/pyenv-virtualenv`` 83 | - add this to your .rc file: ``eval "$(pyenv virtualenv-init -)"`` 84 | - **Poetry**: 85 | - install: 86 | - ``curl -sSL https://install.python-poetry.org | python3 -`` or 87 | - check [here](https://python-poetry.org/docs/#installing-with-the-official-installer) 88 | 89 | - **Setup**: 90 | - Local Mac/Linux: If you have `make` available, simply execute ``make setup``; otherwise: 91 | - RunPod or other similar providers: simply execute ``make runpod_setup``; otherwise: 92 | - If you are using other ways of dependency management: 93 | - Python Environment: 94 | - ``pyenv install 3.11.9`` 95 | - ``pyenv virtualenv 3.11.9 datahack`` 96 | - ``cd `` 97 | - ``pyenv activate datahack`` 98 | - ``poetry install`` <- Make sure the ``pyproject.toml`` file is available in the directory where you execute this command 99 | OR 100 | - use the `requirements.txt` file for reference. 101 | - Setup ``nvm`` / ``node`` and install ``localtunnel`` 102 | 103 | 104 | 105 | 106 | -------------------------------------------------------------------------------- /module_01/01_introduction.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_01/01_introduction.pdf -------------------------------------------------------------------------------- /module_01/02_getting_started.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "iDeUTNIJnpGh" 7 | }, 8 | "source": [ 9 | "# Getting Started : Text Representation\n", 10 | "\n", 11 | "\n", 12 | "\n", 13 | "The NLP domain wasn't always this buzzing with __attention__ and hype that we see today. 
\n", 14 | "The recent progress in this field is built on top of years of amazing work and research. Before we leap onto the current state of things, let us have a quick walk through of how we arrived here. The current NLP systems are standing tall and promising on the shoulders of very solid work from past decades\n" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Import Required Libraries\n", 22 | "\n", 23 | "\n", 24 | " \"Open\n", 25 | "" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": { 32 | "id": "UULjCk9BoIFF" 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "import torch\n", 37 | "import torchtext\n", 38 | "import os\n", 39 | "import collections\n", 40 | "import pandas as pd\n", 41 | "import numpy as np\n", 42 | "import re\n", 43 | "import torchtext \n", 44 | "torchtext.disable_torchtext_deprecation_warning()" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": { 51 | "colab": { 52 | "base_uri": "https://localhost:8080/" 53 | }, 54 | "id": "XtKof0Y2ZX_w", 55 | "outputId": "6af8d2ed-10eb-474f-d6a7-7809f544703e" 56 | }, 57 | "outputs": [], 58 | "source": [ 59 | "import nltk\n", 60 | "\n", 61 | "nltk.download('stopwords')\n", 62 | "nltk.download('punkt')" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": { 68 | "id": "621RWhAuokLt" 69 | }, 70 | "source": [ 71 | "### Get Text\n", 72 | "__The Gutenberg Project__ is an amazing project aimed at providing free access to some of the world's most amazing classical works. This makes it a wonderful source of textual data for NLP practitionars to use and improve their understanding of textual data. Ofcourse you can improve your litrary skills too \n", 73 | "\n", 74 | "For this module and workshop in general we will make use of materials available from the project. We begin by downloading the book __\"The Adventures of Sherlock Holmes by Arthur Conan Doyle\"__\n", 75 | "\n", 76 | "\n", 77 | "" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": { 84 | "colab": { 85 | "base_uri": "https://localhost:8080/" 86 | }, 87 | "id": "Lwf8jsuDoeoy", 88 | "outputId": "6f9566d7-9cb8-4eed-ec29-df72828226d9" 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "!wget -O sherlock_homes.txt http://www.gutenberg.org/files/1661/1661-0.txt" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": { 98 | "id": "RrwoWo-Yon-9" 99 | }, 100 | "source": [ 101 | "### Load Data" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": { 108 | "id": "-lKQYgNXonkD" 109 | }, 110 | "outputs": [], 111 | "source": [ 112 | "filename = \"sherlock_homes.txt\"\n", 113 | "file_text = open(filename, 'r', encoding='utf-8').read()\n", 114 | "\n", 115 | "# lower case text to reduce dimensionality\n", 116 | "file_text = file_text#TODO: Lowercase the file text\n", 117 | "\n", 118 | "# We remove first 1450 characters to remove\n", 119 | "# details related to project gutenberg\n", 120 | "raw_text = file_text [1450:]" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": { 126 | "id": "CAxaB81bsI-q" 127 | }, 128 | "source": [ 129 | "### Text Representation\n", 130 | "\n", 131 | "Feature Engineering is often known as the secret sauce to creating superior and better performing machine learning models. Just one excellent feature could be your ticket to winning a Kaggle challenge! 
The importance of feature engineering is even more important for unstructured, textual data because we need to convert free flowing text into some numeric representations which can then be understood by machine learning algorithms.\n", 132 | "\n", 133 | "Since text is mostly available in unstructured form yet very high in dimensionality (how??? :sweat: ), the ability to represent text in the most appropriate way is one of the key ingredients to work in this domain.\n", 134 | "\n", 135 | "\n", 136 | "Let us understand the current dataset at hand by checking the obvious aspects of a textual dataset" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": { 143 | "colab": { 144 | "base_uri": "https://localhost:8080/" 145 | }, 146 | "id": "TXqdpFXGpJuH", 147 | "outputId": "7c9bc0c8-f0ca-4d58-ce39-6e5666c59914" 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "# unique list of characters and total characters in the file\n", 152 | "char_vocab = sorted(set(raw_text))\n", 153 | "\n", 154 | "\n", 155 | "# summarize the loaded data\n", 156 | "n_chars = len(raw_text)\n", 157 | "n_vocab = len(char_vocab)\n", 158 | "print(\"Total Characters: \", n_chars)\n", 159 | "print(\"Total Vocab: \", n_vocab)" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": { 165 | "id": "uSlkgVJIpV_X" 166 | }, 167 | "source": [ 168 | "### Tokenize and Vectorize\n", 169 | "To leverage different algorithms we convert text into numbers that can be represented as tensors.\n", 170 | "\n", 171 | "The first step is to convert text to tokens - tokenization. If we use word-level representation, each word would be represented by its own token. We will use build-in tokenizer from torchtext module" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "import torchtext; torchtext.disable_torchtext_deprecation_warning()" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "# Deprecation notice!\n", 190 | "from torchtext.data import get_tokenizer\n", 191 | "from torchtext.vocab import Vocab" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": { 198 | "id": "sgfwH3lRpU0s" 199 | }, 200 | "outputs": [], 201 | "source": [ 202 | "tokenizer = get_tokenizer('basic_english')" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": { 209 | "colab": { 210 | "base_uri": "https://localhost:8080/" 211 | }, 212 | "id": "RRPUOkuhp168", 213 | "outputId": "2f4935ee-52bb-455d-fc2b-39365fc5793b" 214 | }, 215 | "outputs": [], 216 | "source": [ 217 | "tokens = tokenizer(raw_text[:50])\n", 218 | "print(f'\\Token list:\\n{tokens}')" 219 | ] 220 | }, 221 | { 222 | "cell_type": "markdown", 223 | "metadata": { 224 | "id": "yxlQEcTuqHzJ" 225 | }, 226 | "source": [ 227 | "Now, to convert text to numbers, we will need to build a vocabulary of all tokens." 
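A vocabulary is simply a mapping from every unique token to an integer index. The notebook builds one with torchtext's `Vocab` in the next cells; as a library-free sketch of the same idea (the helper and variable names here are illustrative, not part of the workshop code), the mapping can be derived directly from a token counter:

```python
import collections

def build_vocab(lines, tokenizer):
    """Illustrative sketch: build token -> index and index -> token mappings from strings."""
    counter = collections.Counter()
    for line in lines:
        counter.update(tokenizer(line))
    # most frequent tokens get the smallest indices; index 0 is reserved for unknown tokens
    itos = ["<unk>"] + [tok for tok, _ in counter.most_common()]
    stoi = {tok: idx for idx, tok in enumerate(itos)}
    return stoi, itos

# usage sketch, assuming `raw_text` and `tokenizer` as defined in this notebook:
# stoi, itos = build_vocab(raw_text.split("\n"), tokenizer)
# encoded = [stoi.get(tok, 0) for tok in tokenizer("it is not a love story")]
```

Out-of-vocabulary tokens fall back to the reserved index, which is the same role the `<unk>` special token plays in the torchtext vocabulary used below.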
228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": { 234 | "id": "f2D4CNkAqFRl" 235 | }, 236 | "outputs": [], 237 | "source": [ 238 | "# word level vocab\n", 239 | "word_counter = collections.Counter()\n", 240 | "for line in raw_text.split('\\n'):\n", 241 | " word_counter.update(tokenizer(line))\n", 242 | "word_vocab = Vocab(word_counter)" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": { 249 | "colab": { 250 | "base_uri": "https://localhost:8080/" 251 | }, 252 | "id": "bmO3DBZCq7k3", 253 | "outputId": "dad353f3-9ce5-4f77-ce5b-4962cb20a95d" 254 | }, 255 | "outputs": [], 256 | "source": [ 257 | "# sample lookup at word-level\n", 258 | "#TODO: Print a few tokens with their indices" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": { 265 | "id": "XyEN3x5pqo2K" 266 | }, 267 | "outputs": [], 268 | "source": [ 269 | "# character level vocab\n", 270 | "char2idx = {u:i for i, u in enumerate(char_vocab)}\n", 271 | "idx2char = np.array(char_vocab)\n", 272 | "\n", 273 | "text_as_int = np.array([char2idx[c] for c in raw_text])" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": { 280 | "colab": { 281 | "base_uri": "https://localhost:8080/" 282 | }, 283 | "id": "ZWCe7yetq63L", 284 | "outputId": "5f2d4458-e4bb-432b-a391-9b2f685a1381" 285 | }, 286 | "outputs": [], 287 | "source": [ 288 | "# char level mapping\n", 289 | "print('{')\n", 290 | "for char,_ in zip(char2idx, range(10)):\n", 291 | " print(' {:4s}: {:3d},'.format(repr(char), char2idx[char]))\n", 292 | "print(' ...\\n}')" 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": { 298 | "id": "R2ye2VNRrhyv" 299 | }, 300 | "source": [ 301 | "### Text as Vector\n", 302 | "\n", 303 | "``torchtext`` ``vocab.stoi`` dictionary allows us to convert from a string representation into numbers (``stoi`` -> \"from string to integers).\n", 304 | "\n", 305 | "To convert the text back from a numeric representation into text, we can use the ``vocab.itos`` dictionary to perform reverse lookup:" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": { 312 | "colab": { 313 | "base_uri": "https://localhost:8080/" 314 | }, 315 | "id": "znnAoejUrjP7", 316 | "outputId": "ab987cd3-37b4-4ef5-e201-3dff00571036" 317 | }, 318 | "outputs": [], 319 | "source": [ 320 | "word_vocab_size = len(word_vocab)\n", 321 | "print(f\"Word Vocab size= {word_vocab_size}\")\n", 322 | "\n", 323 | "\n", 324 | "def encode(x):\n", 325 | " return [word_vocab[s] for s in tokenizer(x)]\n", 326 | "\n", 327 | "vec = encode(raw_text[:100])\n", 328 | "print(vec)" 329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "metadata": { 334 | "id": "oXr9EcyCsLzw" 335 | }, 336 | "source": [ 337 | "### Bag Of Words Representation\n", 338 | "\n", 339 | "Bag of Words (BoW) representation is a traditional vector representation of text for NLP tasks. 
Each word/character is linked to a vector index, vector element contains the number of occurrences of a word/character in a given document.\n" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": { 346 | "id": "R5osyuH-vmE9" 347 | }, 348 | "outputs": [], 349 | "source": [ 350 | "def to_bow(text,bow_vocab_size=word_vocab_size):\n", 351 | " res = torch.zeros(bow_vocab_size,dtype=torch.float32)\n", 352 | " for i in encode(text):\n", 353 | " if i" 512 | ] 513 | }, 514 | { 515 | "cell_type": "markdown", 516 | "metadata": { 517 | "id": "yDs_-hoJ3atN" 518 | }, 519 | "source": [ 520 | "### Skip-gram Model\n", 521 | "The Skip-gram model architecture usually tries to achieve the reverse of what the CBOW model does. It tries to predict the __`source context words`__ (surrounding words) given a __`target word`__ (the center word).\n", 522 | "\n", 523 | "Considering our simple sentence from earlier, “the quick brown fox jumps over the lazy dog”. If we used the CBOW model, we get pairs of (context_window, target_word) where if we consider a context window of size 2, we have examples like __([quick, fox], brown)__, __([the, brown], quick)__, __([the, dog], lazy)__ and so on.\n", 524 | "\n", 525 | "Now considering that the skip-gram model’s aim is to predict the context from the target word, the model typically inverts the contexts and targets, and tries to predict each context word from its target word. Hence the task becomes to predict the context [quick, fox] given target word ‘brown’ or [the, brown] given target word ‘quick’ and so on.\n", 526 | "\n", 527 | "Thus the model tries to predict the context_window words based on the target_word.\n", 528 | "\n", 529 | "" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": null, 535 | "metadata": { 536 | "id": "wlPE6usUvNdr" 537 | }, 538 | "outputs": [], 539 | "source": [ 540 | "corpus = ['The sky is blue and beautiful.',\n", 541 | " 'Love this blue and beautiful sky!',\n", 542 | " 'The quick brown fox jumps over the lazy dog.',\n", 543 | " \"A king's breakfast has sausages, ham, bacon, eggs, toast and beans\",\n", 544 | " 'I love green eggs, ham, sausages and bacon!',\n", 545 | " 'The brown fox is quick and the blue dog is lazy!',\n", 546 | " 'The sky is very blue and the sky is very beautiful today',\n", 547 | " 'The dog is lazy but the brown fox is quick!'\n", 548 | "]" 549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": null, 554 | "metadata": { 555 | "colab": { 556 | "base_uri": "https://localhost:8080/" 557 | }, 558 | "id": "HJ32JCdnZQfD", 559 | "outputId": "4d1c7293-e5ac-451f-a75c-b6c64572761f" 560 | }, 561 | "outputs": [], 562 | "source": [ 563 | "stop_words = nltk.corpus.stopwords.words('english')\n", 564 | "\n", 565 | "def normalize_document(doc):\n", 566 | " # lower case and remove special characters\\whitespaces\n", 567 | " doc = re.sub(r'[^a-zA-Z\\s]', '', doc, re.I|re.A)\n", 568 | " doc = doc.lower()\n", 569 | " doc = doc.strip()\n", 570 | " # tokenize document\n", 571 | " tokens = nltk.word_tokenize(doc)\n", 572 | " # filter stopwords out of document\n", 573 | " filtered_tokens = [token for token in tokens if token not in stop_words]\n", 574 | " # re-create document from filtered tokens\n", 575 | " doc = #TODO: Join back the list of tokens as a string. 
\n", 576 | " return doc\n", 577 | "\n", 578 | "normalize_corpus = np.vectorize(normalize_document)\n", 579 | "\n", 580 | "norm_corpus = normalize_corpus(corpus)\n", 581 | "norm_corpus" 582 | ] 583 | }, 584 | { 585 | "cell_type": "markdown", 586 | "metadata": {}, 587 | "source": [ 588 | "## Gensim Framework\n", 589 | "\n", 590 | "The ``gensim`` framework, created by Radim Řehůřek consists of a robust, efficient and scalable implementation of the __Word2Vec__ model. We will leverage the same on our sample toy corpus. In our workflow, we will tokenize our normalized corpus and then focus on the following four parameters in the Word2Vec model to build it.\n", 591 | "\n", 592 | "- vector_size: The word embedding dimensionality\n", 593 | "- window: The context window size\n", 594 | "- min_count: The minimum word count\n", 595 | "- sample: The downsample setting for frequent words\n", 596 | "- sg: Training model, 1 for skip-gram otherwise CBOW\n", 597 | "\n", 598 | "We will build a simple Word2Vec model on the corpus and visualize the embeddings." 599 | ] 600 | }, 601 | { 602 | "cell_type": "code", 603 | "execution_count": null, 604 | "metadata": { 605 | "id": "iHITPzAY2a6b" 606 | }, 607 | "outputs": [], 608 | "source": [ 609 | "from gensim.models import word2vec" 610 | ] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "execution_count": null, 615 | "metadata": { 616 | "colab": { 617 | "base_uri": "https://localhost:8080/" 618 | }, 619 | "id": "SD2BOBlR2ZJN", 620 | "outputId": "c0c52ace-517b-404e-fa9c-d06332424b19" 621 | }, 622 | "outputs": [], 623 | "source": [ 624 | "tokenized_corpus = [tokenizer(line) for line in norm_corpus]\n", 625 | "\n", 626 | "# Set values for various parameters\n", 627 | "feature_size = 15 # Word vector dimensionality\n", 628 | "window_context = 5 # Context window size\n", 629 | "min_word_count = 1 # Minimum word count\n", 630 | "sample = 1e-3 # Downsample setting for frequent words\n", 631 | "sg = 1 # skip-gram model\n", 632 | "\n", 633 | "w2v_model = word2vec.Word2Vec(tokenized_corpus,\n", 634 | " vector_size=feature_size,\n", 635 | " window=window_context,\n", 636 | " min_count = min_word_count,\n", 637 | " sg=sg,\n", 638 | " sample=sample,\n", 639 | " epochs=5000)\n", 640 | "w2v_model" 641 | ] 642 | }, 643 | { 644 | "cell_type": "code", 645 | "execution_count": null, 646 | "metadata": { 647 | "colab": { 648 | "base_uri": "https://localhost:8080/" 649 | }, 650 | "id": "jRs624Df4I4q", 651 | "outputId": "4ae21244-7bd1-44ca-fec6-1301cb8eb0a7" 652 | }, 653 | "outputs": [], 654 | "source": [ 655 | "w2v_model.wv['sky']" 656 | ] 657 | }, 658 | { 659 | "cell_type": "code", 660 | "execution_count": null, 661 | "metadata": {}, 662 | "outputs": [], 663 | "source": [ 664 | "#TODO: Print the vector for the word India" 665 | ] 666 | }, 667 | { 668 | "cell_type": "code", 669 | "execution_count": null, 670 | "metadata": { 671 | "id": "BOqfN1C_35dt" 672 | }, 673 | "outputs": [], 674 | "source": [ 675 | "import scienceplots\n", 676 | "import matplotlib.pyplot as plt\n", 677 | "from sklearn.manifold import TSNE\n", 678 | "plt.style.use(['science','ieee','no-latex'])\n", 679 | "\n", 680 | "%matplotlib inline" 681 | ] 682 | }, 683 | { 684 | "cell_type": "code", 685 | "execution_count": null, 686 | "metadata": { 687 | "colab": { 688 | "base_uri": "https://localhost:8080/", 689 | "height": 683 690 | }, 691 | "id": "AvqivFNy3siE", 692 | "outputId": "e9d8a7e0-c0cc-4715-8ef6-088a4eee1be3" 693 | }, 694 | "outputs": [], 695 | "source": [ 696 | "# visualize embeddings\n", 697 | "words = 
w2v_model.wv.index_to_key\n", 698 | "wvs = w2v_model.wv[words]\n", 699 | "\n", 700 | "tsne = TSNE(n_components=2, random_state=42, n_iter=5000, perplexity=5)\n", 701 | "np.set_printoptions(suppress=True)\n", 702 | "T = tsne.fit_transform(wvs)\n", 703 | "labels = words\n", 704 | "\n", 705 | "plt.figure(figsize=(12, 6))\n", 706 | "plt.scatter(T[:, 0], T[:, 1],)\n", 707 | "for label, x, y in zip(labels, T[:, 0], T[:, 1]):\n", 708 | " plt.annotate(label, xy=(x+1, y+1), xytext=(0, 0), textcoords='offset points')" 709 | ] 710 | }, 711 | { 712 | "cell_type": "code", 713 | "execution_count": null, 714 | "metadata": { 715 | "colab": { 716 | "base_uri": "https://localhost:8080/" 717 | }, 718 | "id": "s8iIJ81tYBb0", 719 | "outputId": "4a9b9080-9dc9-46ab-9f40-e2715c1c5dd2" 720 | }, 721 | "outputs": [], 722 | "source": [ 723 | "w2v_model.wv.most_similar('dog', topn=10)" 724 | ] 725 | }, 726 | { 727 | "cell_type": "markdown", 728 | "metadata": { 729 | "id": "wfnq6B2K4RV2" 730 | }, 731 | "source": [ 732 | "## Similar and Improved Works \n", 733 | "- [GloVe](https://nlp.stanford.edu/pubs/glove.pdf)\n", 734 | "- [FastText](https://arxiv.org/pdf/1607.04606.pdf)\n", 735 | "- [Sent2Vec](https://arxiv.org/abs/1405.4053)\n", 736 | "- X2Vec" 737 | ] 738 | }, 739 | { 740 | "cell_type": "code", 741 | "execution_count": null, 742 | "metadata": {}, 743 | "outputs": [], 744 | "source": [ 745 | "with open(\"norm_corpus.txt\",\"w\") as f:\n", 746 | " for line in norm_corpus:\n", 747 | " f.write(line+'\\n')" 748 | ] 749 | }, 750 | { 751 | "cell_type": "code", 752 | "execution_count": null, 753 | "metadata": {}, 754 | "outputs": [], 755 | "source": [ 756 | "import fasttext\n", 757 | "fasttext_model = fasttext.train_unsupervised('norm_corpus.txt', model='skipgram',epoch=500000,minCount=1,loss='ns')" 758 | ] 759 | }, 760 | { 761 | "cell_type": "code", 762 | "execution_count": null, 763 | "metadata": {}, 764 | "outputs": [], 765 | "source": [ 766 | "fasttext_model.get_word_vector('sky')" 767 | ] 768 | }, 769 | { 770 | "cell_type": "code", 771 | "execution_count": null, 772 | "metadata": {}, 773 | "outputs": [], 774 | "source": [ 775 | "# TODO: Get Vector for India" 776 | ] 777 | }, 778 | { 779 | "cell_type": "code", 780 | "execution_count": null, 781 | "metadata": {}, 782 | "outputs": [], 783 | "source": [ 784 | "# TODO: Identify nearest neighbors for the word breakfast" 785 | ] 786 | }, 787 | { 788 | "cell_type": "markdown", 789 | "metadata": { 790 | "id": "HHRdBNxQ5Ho6" 791 | }, 792 | "source": [ 793 | "### Limitations\n", 794 | "One key limitation of traditional pretrained embedding representations such as Word2Vec is the problem of word sense and removing ambiguity by making them clear. While pretrained embeddings can capture some of the meaning of words in context, every possible meaning of a word is encoded into the same embedding. This can cause problems in downstream models, since many words such as the word 'play' have different meanings depending on the context they are used in.\n", 795 | "\n", 796 | "For example, the word 'play' in these two different sentences have quite different meaning:\n", 797 | "\n", 798 | "- I went to a **play** at the theatre.\n", 799 | "- John wants to **play** with his friends.\n", 800 | "The pretrained embeddings above represent both meanings of the word 'play' in the same embedding. To overcome this limitation, we need to build embeddings based on the language model, which is trained on a large corpus of text, and knows how words can be put together in different contexts." 
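To make the 'play' example above concrete: a contextual model produces a different vector for the same word in each sentence, which a static Word2Vec or fastText embedding cannot do. The sketch below uses the Hugging Face `transformers` library; the model choice and the helper function are illustrative and not part of the workshop code:

```python
import torch
from transformers import AutoTokenizer, AutoModel

tok = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased")

def embedding_of(word, sentence):
    """Return the contextual vector of `word` inside `sentence` (first occurrence)."""
    enc = tok(sentence, return_tensors="pt")
    with torch.no_grad():
        hidden = model(**enc).last_hidden_state[0]          # (seq_len, hidden_dim)
    word_id = tok.convert_tokens_to_ids(word)
    position = (enc["input_ids"][0] == word_id).nonzero()[0].item()
    return hidden[position]

v1 = embedding_of("play", "i went to a play at the theatre .")
v2 = embedding_of("play", "john wants to play with his friends .")
print(torch.cosine_similarity(v1, v2, dim=0))  # typically well below 1.0: the two senses differ
```

The two vectors generally show a cosine similarity noticeably below 1.0, reflecting that the model encodes the theatre sense and the activity sense of 'play' differently.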
801 | ] 802 | }, 803 | { 804 | "cell_type": "markdown", 805 | "metadata": {}, 806 | "source": [ 807 | "---\n", 808 | "\n", 809 | "## Thought Exercise\n", 810 | "\n", 811 | "- We discussed about representing text into tokens and why it is important\n", 812 | "- We discussed about different tokenization methods (character wise, word wise and more...)\n", 813 | "- But does it make any difference for the tokenizer (or even the model) in terms of any meaning of those token?\n", 814 | "\n", 815 | "Probably No. Check this experiment tweeted by [Andrej Karpathy](https://x.com/karpathy/status/1816637781659254908/photo/1)\n", 816 | "\n" 817 | ] 818 | }, 819 | { 820 | "cell_type": "markdown", 821 | "metadata": {}, 822 | "source": [] 823 | } 824 | ], 825 | "metadata": { 826 | "colab": { 827 | "provenance": [] 828 | }, 829 | "kernelspec": { 830 | "display_name": "Python 3 (ipykernel)", 831 | "language": "python", 832 | "name": "python3" 833 | }, 834 | "language_info": { 835 | "codemirror_mode": { 836 | "name": "ipython", 837 | "version": 3 838 | }, 839 | "file_extension": ".py", 840 | "mimetype": "text/x-python", 841 | "name": "python", 842 | "nbconvert_exporter": "python", 843 | "pygments_lexer": "ipython3", 844 | "version": "3.11.9" 845 | } 846 | }, 847 | "nbformat": 4, 848 | "nbformat_minor": 4 849 | } 850 | -------------------------------------------------------------------------------- /module_01/03_explore_transformers.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "22ab9deb-d5cd-45d4-a4be-4748d15df4e5", 6 | "metadata": {}, 7 | "source": [ 8 | "# Exploring Transformer Architectures\n", 9 | "\n", 10 | "\n", 11 | " \"Open\n", 12 | "\n" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "id": "5cfd7bf0-01eb-42fa-86a7-0614c6948ed9", 18 | "metadata": {}, 19 | "source": [ 20 | "## The RNN Limitation\n", 21 | "The RNN layer (LSTM, or GRU, etc.) takes in a context window of a defined size as input and encodes all of it into a single vector. This bottleneck vector needs to capture a lot of information in itself before the decoding stage can use it to start generating the next token. To enhance performance of sequence to sequence tasks a typical Encoder-Decoder architecture is the go-to choice.\n", 22 | "\n", 23 | "\n", 24 | "\n", 25 | "Let us consider the case of **Machine Translation**, i.e. translation of English to Spanish (or any other language).\n", 26 | "\n", 27 | "In a typical __Encoder-Decoder__ architecture, the Encoder takes in the input text in English as input and prepares a condensed vector representation of the whole input. Typically termed as bottleneck features. The Decoder then uses these features to generate the translated text in Spanish.\n", 28 | "\n", 29 | "While this architecture and its variants worked wonders, they had issues. Issues such as inability handle longer input sequences, cases where there is not a one to one mapping between input vs output language and so on.\n", 30 | "\n", 31 | "To handle these issues, __Vasvani et. al.__ in their now famouly titled paper __Attention Is All You Need__ build up on the concepts of attention. The main highlight of this work was the Transformer architecture. Transformers were shown to present state of the art results on multiple benchmarks without using any recurrence or convolutional components." 
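The attention mechanism described in the next cells can be summarised in a few lines of code. Below is a minimal sketch of scaled dot-product attention, Attention(Q, K, V) = softmax(QK^T / sqrt(d_k)) V, as defined in the Vaswani et al. paper; the tensor shapes are illustrative and this helper is not part of the workshop code:

```python
import torch
import torch.nn.functional as F

def scaled_dot_product_attention(q, k, v):
    """Attention(Q, K, V) = softmax(Q K^T / sqrt(d_k)) V"""
    d_k = q.size(-1)
    scores = q @ k.transpose(-2, -1) / d_k ** 0.5   # (..., seq_len_q, seq_len_k)
    weights = F.softmax(scores, dim=-1)              # each query attends over all keys
    return weights @ v, weights

# toy usage: batch of 1, sequence of 5 tokens, 64-dimensional heads (illustrative sizes)
q = k = v = torch.randn(1, 5, 64)
out, attn = scaled_dot_product_attention(q, k, v)
print(out.shape, attn.shape)  # torch.Size([1, 5, 64]) torch.Size([1, 5, 5])
```

Multi-head attention, discussed below, runs several independently projected copies of this operation in parallel and concatenates their outputs.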
32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "id": "c97031ae-f2ee-47f5-8873-062bc1da55f7", 37 | "metadata": {}, 38 | "source": [ 39 | "## Transformers\n", 40 | "- The transformer architecture was presented in the seminal paper __Attention is All You Need__ by Vaswani et al. back in 2017\n", 41 | "- A transformer is a __recurrence-__ and __convolution-free__ attention-based encoder-decoder architecture\n", 42 | "- Introduced the concept of multi-head attention and positional encodings\n", 43 | "- Also revolutionalised Computer Vision domain (see ViT)\n", 44 | "\n", 45 | "\n", 46 | "" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "id": "d34a3aab-a33b-4864-a5cc-f175146af5ce", 52 | "metadata": {}, 53 | "source": [ 54 | "## Attention is All you Need ⚠️\n", 55 | "\n", 56 | "\n", 57 | "### Attention to the Rescue\n", 58 | "Attention is one of the most powerful concepts in the deep learning space that really changed the game. The core idea behind the attention mechanism is to make use of all interim hidden states of the RNN to decide which one to focus upon before it is used by the decoding stage. \n", 59 | "\n", 60 | "### Contextual Embeddings\n", 61 | "The [TagLM architecture by Peters et al. in 2017](https://arxiv.org/abs/1705.00108) was one of the first works that provided an insight into how we could combine __pre-trained word embeddings__ with a __pre-trained neural language model__ to generate __context-aware embeddings__ for downstream NLP tasks.\n", 62 | "\n", 63 | "The big breakthrough that changed the NLP landscape came in the form of __ELMo, or Embeddings from Language Models__. The ELMo architecture was presented by Peters et al. in their work titled [__Deep Contextualized Word Representations in 2018__](https://arxiv.org/abs/1802.05365). Without going into too much detail, the main highlights of the ELMo architecture were:\n", 64 | "\n", 65 | "- The model used a bi-LSTM-based language model.\n", 66 | "- Character CNNs were used to generate embeddings, in place of pre-trained word vectors, which made use of huge 4096 LSTM units but transformed into smaller 512-sized vectors using feedforward layers.\n", 67 | "- The main innovation was to make use of all the hidden bi-LSTM layers for generating input representation. Unlike previous works, where only the final LSTM layer was used to fetch the representation of the input, this work took a weighted average of all the hidden layers' hidden states. This helped the model learn contextual word embeddings where each layer contributed to things like syntax and semantics.\n", 68 | "\n", 69 | "### Self-Attention\n", 70 | "- Self-attention was proposed by Cheng et al. in their paper titled Long Short-Term Memory Networks for Machine Reading in 2016\n", 71 | "- Self-attention enables a model to learn the correlation between the current token (character or word or sentence, etc.) and its context window. In other words, it is an attention mechanism that relates different positions of a given sequence so as to generate a representation of the same sequence\n", 72 | "\n", 73 | "### Multi-head Attention\n", 74 | "- Multi-head attention extends the self-attention mechanism by performing multiple parallel self-attention operations, each focusing on different learned linear projections of the input. Multiple attention heads allow the model to capture different types of relationships and learn more fine-grained representations (eg: grammar, context, dependency, etc.)\n", 75 | "\n", 76 | "\n", 77 | "\n", 78 | "> Source: [Vasvani et. 
al.](https://arxiv.org/pdf/1706.03762.pdf)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "id": "cd448c92-cc9c-44c5-a408-d8f4e5706862", 84 | "metadata": {}, 85 | "source": [ 86 | "### Positional Encoding\n", 87 | "Positional encoding is a technique used to incorporate the position of each token in the input sequence. It provides the model with information about the token's position without relying solely on the order of tokens.\n", 88 | "This additional aspect was required because transformers do not have the natural sequential setup of RNNs. In order to provide positional context, any encoding system should ideally have the following properties:\n", 89 | "\n", 90 | "- It should output a unique encoding for each time-step (word’s position in a sentence)\n", 91 | "- Distance between any two time-steps should be consistent across sentences with different lengths.\n", 92 | "- Our model should generalize to longer sentences without any efforts. Its values should be bounded.\n", 93 | "- It must be deterministic.\n", 94 | "\n", 95 | "\n", 96 | "\n", 97 | "\n", 98 | "\n", 99 | "### References\n", 100 | "- [The Illustrated Transformer](https://jalammar.github.io/illustrated-transformer/)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": null, 106 | "id": "3f54409c-36e1-4836-8336-a63bcd32d047", 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "import numpy as np\n", 111 | "import scienceplots\n", 112 | "from matplotlib import pyplot as plt\n", 113 | "plt.style.use(['science','no-latex'])" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "id": "bb1d2966-72aa-425c-ac73-51e7263a8bdc", 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "def positional_encoding(pos,i,d_model,is_even=True):\n", 124 | " \"\"\"\n", 125 | " Method to generate positional encoding value\n", 126 | " :param pos: position of the input\n", 127 | " :param i: i-th dimension of the embedding\n", 128 | " :param d_model: length of the embedding vector\n", 129 | " :param is_even: if the position of the input is even or odd\n", 130 | " \"\"\"\n", 131 | " input_val = pos/np.power(10000,(2*i)/d_model)\n", 132 | " if is_even:\n", 133 | " return np.sin(input_val)\n", 134 | " else:\n", 135 | " return np.cos(input_val)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "id": "c6a5c6ec-402e-4d61-af32-1f72815e448b", 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "# input parameters for visualisations\n", 146 | "pos = np.arange(0,10,0.1) #10 input words, stepping at 0.1 for smoothness only\n", 147 | "dimensions = np.arange(0,512) # dimensionality of the positional encoding (same as d_model by default)\n", 148 | "d_model = 512 # length of embedding vector" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "id": "7a8ff1df-72d2-48e2-8d28-dd8dc2dd3a1f", 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "# positional encoding for even positions\n", 159 | "even_pos_emb = [positional_encoding(pos,i,d_model) for i in dimensions] \n", 160 | "\n", 161 | "# positional encoding for off positions\n", 162 | "odd_pos_emb = #TODO: prepare positional embeddings for odd positions" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "id": "fcbea557-8d48-4280-9841-e99a1a754638", 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "show_dim = [0,16,32] # visualise only a few dimensions for 
clarity\n", 173 | "plt.figure(figsize=(15, 5))\n", 174 | "for i in dimensions:\n", 175 | " if i in show_dim:\n", 176 | " plt.plot(pos,even_pos_emb[i])\n", 177 | " plt.plot(pos,odd_pos_emb[i])\n", 178 | " plt.axvline(2,linestyle='--',c='black')\n", 179 | "plt.title(\"Positional Encodings\") \n", 180 | "plt.xlabel(\"input positions\")\n", 181 | "plt.ylabel(\"encoding value\")\n", 182 | "plt.show()" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "id": "aec21692-04c7-44f6-ba07-58d01bffe43e", 188 | "metadata": {}, 189 | "source": [ 190 | "## BERT-ology\n", 191 | "- BERT, or __[Bi-Directional Encoder Representations from Transformers](https://arxiv.org/abs/1810.04805)__, was presented by Devlin et al., a team at Google AI in 2018\n", 192 | "- Multi-task Learning: BERT also helped push the transfer-learning envelope in the NLP domain by showcasing how a pre-trained model can be fine-tuned for various tasks to provide state-of-the-art performance\n", 193 | "- BERT tweaked the usual Language Model objective to only predict next token based on past context by building context from both directions, i.e. the objective of predicting masked words along with next sentence prediction.\n", 194 | "\n", 195 | "\n", 196 | "\n", 197 | "\n", 198 | "> source [PLM Papers](https://github.com/thunlp/PLMpapers)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "id": "f8a38e20-e963-4d1a-b65c-8e09a60a267f", 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "import torch\n", 209 | "import transformers\n", 210 | "from transformers import pipeline" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "id": "fb245c9f-e147-49ef-a69c-e3defa41fae9", 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "# Let us define some configs/constants\n", 221 | "DISTILBET_BASE_UNCASED_CHECKPOINT = \"distilbert/distilbert-base-uncased\"\n", 222 | "DISTILBET_QA_CHECKPOINT = \"distilbert/distilbert-base-uncased-distilled-squad\"\n", 223 | "DISTILBET_CLASSIFICATION_CHECKPOINT = \"distilbert/distilbert-base-uncased-finetuned-sst-2-english\"" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "id": "1a88cc2e-b8fb-48f9-8537-015420b4d902", 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "if torch.cuda.is_available():\n", 234 | " DEVICE = 'cuda'\n", 235 | " Tensor = torch.cuda.FloatTensor\n", 236 | " LongTensor = torch.cuda.LongTensor\n", 237 | " DEVICE_ID = 0\n", 238 | "elif torch.backends.mps.is_available():\n", 239 | " DEVICE = 'mps'\n", 240 | " Tensor = torch.FloatTensor\n", 241 | " LongTensor = torch.LongTensor\n", 242 | " DEVICE_ID = 0\n", 243 | "else:\n", 244 | " DEVICE = 'cpu'\n", 245 | " Tensor = torch.FloatTensor\n", 246 | " LongTensor = torch.LongTensor\n", 247 | " DEVICE_ID = -1\n", 248 | "print(f\"Backend Accelerator Device={DEVICE}\")" 249 | ] 250 | }, 251 | { 252 | "cell_type": "markdown", 253 | "id": "9f241206-25ed-4021-9457-e1bdd665a029", 254 | "metadata": {}, 255 | "source": [ 256 | "### Predicting the Masked Token\n", 257 | "This was a unique objective when BERT was originally introduced as compared to usual NLP tasks such as classification. The objective requires us to prepare a dataset where we mask a certain percentage of input tokens and train the model to learn to predict those tokens. This objective turns out to be very effective in helping the model learn the nuances of language. 
\n", 258 | "\n", 259 | "In this first task we will test the pre-trained model against this objective itself. The model outputs a bunch of things such as the predicted token, encoded index of the predicted token/word along with a score which indicates the model's confidence." 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": null, 265 | "id": "2e3efdda-c31f-438e-a19e-4593570dc323", 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "mlm_pipeline = pipeline(\n", 270 | " 'fill-mask',\n", 271 | " model=DISTILBET_BASE_UNCASED_CHECKPOINT,\n", 272 | " device=DEVICE_ID\n", 273 | ")\n", 274 | "mlm_pipeline(\"Bangalore is the IT [MASK] of India.\")" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "id": "c540cf39-e8d9-412f-b339-49c93faf9723", 280 | "metadata": {}, 281 | "source": [ 282 | "### Question Answering\n", 283 | "This is an interesting NLP task and quite complex one as well. For this task, the model is provided input consisting of the context along with a question and it predicts the answer by selecting text from the context. The training setup for this task is a bit involved process, the following is an overview:\n", 284 | "- The training input as triplet of context, question and answer\n", 285 | "- This is transformed as combined input of the form ``[CLS]question[SEP]context[SEP]`` or ``[CLS]contex[SEP]question[SEP]`` with answer acting as the label\n", 286 | "- The model is trained to predict the start and end indices of the the corresponding answer for each input.\n", 287 | "\n", 288 | "\n", 289 | "For our current setting, we will leverage both _pretrained_ and _fine-tuned_ versions of **DistilBERT** via the _question-answering_ pipeline and understand the performance difference." 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "id": "08ae5a96-fd84-4b11-9943-5dfac261c432", 296 | "metadata": {}, 297 | "outputs": [], 298 | "source": [ 299 | "qa_ft_pipeline = pipeline(\n", 300 | " 'question-answering',\n", 301 | " model=DISTILBET_QA_CHECKPOINT,\n", 302 | " device=DEVICE_ID\n", 303 | ")\n", 304 | "qa_pt_pipeline = pipeline(\n", 305 | " 'question-answering',\n", 306 | " model=#TODO: Set the pretrained \n", 307 | " device=DEVICE_ID\n", 308 | ")" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": null, 314 | "id": "cde485cf-1c21-4f34-a2d2-4d0130854905", 315 | "metadata": {}, 316 | "outputs": [], 317 | "source": [ 318 | "# we use a snippet about BERT like models from the module itself\n", 319 | "context = \"\"\"The key contribution from this set of models is the masked language modeling objective during the pre-training phase, where some tokens in the input are masked, and the model is trained to predict them (we will cover these in the upcoming section). 
Key works in this group of architectures are BERT, RoBERTa (or optimized BERT), DistilBERT (lighter and more efficient BERT), ELECTRA and ALBERT.\n", 320 | "In this notebook we will work through the task of Question Answering where our language model will learn to answer questions based on the context provided.\"\"\"\n", 321 | "question = \"What are the key works in this set of models?\"" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "id": "fdf038b7-e8c8-425e-bbd4-026895e73d6f", 328 | "metadata": {}, 329 | "outputs": [], 330 | "source": [ 331 | "ft_qa_result= qa_ft_pipeline(\n", 332 | " question=question,\n", 333 | " context=context\n", 334 | ")\n", 335 | "\n", 336 | "pt_qa_result= qa_pt_pipeline(\n", 337 | " question=question,\n", 338 | " context=context\n", 339 | ")" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "id": "0cd01d95-491a-4c22-afac-833de30b17cd", 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "print(\"*\"*55)\n", 350 | "print(f\"Context:{context}\")\n", 351 | "print(\"*\"*55)\n", 352 | "print(f\"Question:{question}\")\n", 353 | "print(\"-\"*55)\n", 354 | "print(f\"Response from Fine-Tuned Model:\\n{ft_qa_result}\")\n", 355 | "print()\n", 356 | "print(f\"Response from Pretrained Model:\\n{pt_qa_result}\")" 357 | ] 358 | }, 359 | { 360 | "cell_type": "markdown", 361 | "id": "8dc0ed16-e828-4ede-892e-04622bf82a35", 362 | "metadata": {}, 363 | "source": [ 364 | "# Generative Pretraining" 365 | ] 366 | }, 367 | { 368 | "cell_type": "markdown", 369 | "id": "afd7417f-425a-47ee-8707-7c9872cf8ecc", 370 | "metadata": {}, 371 | "source": [ 372 | "## Behold, its GPT (Generative pre-training)\n", 373 | "\n", 374 | "The first model in this series is called GPT, or Generative Pre-Training. It was released in [2018](https://openai.com/blog/language-unsupervised/), about the same time as the BERT model. The paper presents a task-agnostic architecture based on the ideas of transformers and unsupervised learning.\n", 375 | "\n", 376 | "- GPT is essentially a language model based on the __transformer-decoder__ \n", 377 | "- Introduction of large training datasets: __BookCorpus__ dataset contains over 7,000 unique, unpublished books across different genres\n", 378 | "- The GPT architecture makes use of 12 decoder blocks (as opposed to 6 in the original transformer) with 768-dimensional states and 12 self-attention heads each.\n", 379 | "\n", 380 | "\n", 381 | "### GPT-2\n", 382 | "- Radford et al. presented the GPT-2 model as part of their work titled [Language Models are Unsupervised Multi-task Learners in 2019](https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf)\n", 383 | "- The model achieves state-of-the-art performance in a few-shot setting\n", 384 | "- Similar to GPT, the secret sauce for GPT-2 is its dataset. The authors prepared a massive 40 GB dataset by crawling 45 million outbound links from a social networking site called Reddit.\n", 385 | "- The vocabulary was also expanded to cover 50,000 words and the context window was expanded to 1,024 tokens (as compared to 512 for GPT).\n", 386 | "\n", 387 | "\n", 388 | "### GPT-3\n", 389 | "- OpenAI published paper titled [Language Models are Few Shot Learners](https://arxiv.org/abs/2005.14165) in May 2020. 
\n", 390 | "- This paper introduces the mammoth __175 billion-parameter GPT-3 model__.\n", 391 | "- Apart from more layers and parameters, this model made use of sparse attention\n", 392 | "- Dataset again played a key role, a 300 billion-token dataset based on existing datasets like Common Crawl (filtered for better content), WebText2 (a larger version of WebText used for GPT-2), Books1 and Books2, and the Wikipedia dataset was prepared for this model" 393 | ] 394 | }, 395 | { 396 | "cell_type": "markdown", 397 | "id": "383312da-e4a1-42d8-93c6-8c89f2a952d2", 398 | "metadata": {}, 399 | "source": [ 400 | "## Language Modeling\n", 401 | "By far the most widely used application from the NLP world is language modeling. We use it daily on our phone keyboards, email applications and a ton of other places.\n", 402 | "\n", 403 | "In simple words, a language model takes certain text as input context to generate the next set of words as output. This is interesting because a language model tries to understand the input context, the language structure (though in a very naive way) to predict the next word(s). We use it in the form of text completion utilities on search engines, chat platforms, emails etc. all the time. Language models are a perfect real life application of NLP and showcase the power of RNNs.\n", 404 | "\n", 405 | "Language models can be developed train in different ways. The most common and widely used method is the sliding window approach. The model takes a small window of text as input and tried to predict the next word as the output. The following figure illustrates the same visually.\n", 406 | "\n", 407 | "" 408 | ] 409 | }, 410 | { 411 | "cell_type": "markdown", 412 | "id": "6bc37334-041b-475b-b281-2b6c42902b27", 413 | "metadata": {}, 414 | "source": [ 415 | "### PreTrained GPT2 for Text Generation" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": null, 421 | "id": "f722ddfd-4fe3-4aeb-afbd-1eb3f964e110", 422 | "metadata": {}, 423 | "outputs": [], 424 | "source": [ 425 | "import torch\n", 426 | "from transformers import AutoModelForCausalLM, AutoTokenizer" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": null, 432 | "id": "ea73c89c-646f-47c7-8f36-ef4bc7b5b756", 433 | "metadata": {}, 434 | "outputs": [], 435 | "source": [ 436 | "# generative tasks are not available through MPS/Apple Silicon\n", 437 | "DEVICE = 'cpu'\n", 438 | "Tensor = torch.FloatTensor\n", 439 | "LongTensor = torch.LongTensor\n", 440 | "DEVICE_ID = -1\n", 441 | "print(f\"Backend Accelerator Device={DEVICE}\")" 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": null, 447 | "id": "f6949423-7182-4435-988c-89ed678e8f10", 448 | "metadata": {}, 449 | "outputs": [], 450 | "source": [ 451 | "tokenizer = AutoTokenizer.#TODO: get pretrained GPT2 tokenizer\n", 452 | "\n", 453 | "# add the EOS token as PAD token to avoid warnings\n", 454 | "model = AutoModelForCausalLM.from_pretrained(\"gpt2\", pad_token_id=tokenizer.eos_token_id).to(DEVICE)" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": null, 460 | "id": "0b909212-f6a3-46fd-8acc-5dc77224bead", 461 | "metadata": {}, 462 | "outputs": [], 463 | "source": [ 464 | "# encode context the generation is conditioned on\n", 465 | "model_inputs = tokenizer('The king of England is', return_tensors='pt').to(DEVICE)\n", 466 | "\n", 467 | "# generate 40 new tokens\n", 468 | "greedy_output = model.generate(**model_inputs, max_new_tokens=40)\n", 469 | "\n", 470 | 
"print(\"Output:\\n\" + 100 * '-')\n", 471 | "print(tokenizer.decode(greedy_output[0], skip_special_tokens=True))" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": null, 477 | "id": "48210990-c4d2-4103-91ab-184addedba15", 478 | "metadata": {}, 479 | "outputs": [], 480 | "source": [] 481 | } 482 | ], 483 | "metadata": { 484 | "kernelspec": { 485 | "display_name": "Python 3 (ipykernel)", 486 | "language": "python", 487 | "name": "python3" 488 | }, 489 | "language_info": { 490 | "codemirror_mode": { 491 | "name": "ipython", 492 | "version": 3 493 | }, 494 | "file_extension": ".py", 495 | "mimetype": "text/x-python", 496 | "name": "python", 497 | "nbconvert_exporter": "python", 498 | "pygments_lexer": "ipython3", 499 | "version": "3.11.9" 500 | } 501 | }, 502 | "nbformat": 4, 503 | "nbformat_minor": 5 504 | } 505 | -------------------------------------------------------------------------------- /module_01/assets/banner_notebook_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_01/assets/banner_notebook_1.jpg -------------------------------------------------------------------------------- /module_01/assets/bert_models_layout_notebook_3.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_01/assets/bert_models_layout_notebook_3.jpeg -------------------------------------------------------------------------------- /module_01/assets/cbow_arch_notebook_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_01/assets/cbow_arch_notebook_1.png -------------------------------------------------------------------------------- /module_01/assets/encoder_decoder_notebook_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_01/assets/encoder_decoder_notebook_3.png -------------------------------------------------------------------------------- /module_01/assets/img_2_notebook_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_01/assets/img_2_notebook_1.jpg -------------------------------------------------------------------------------- /module_01/assets/karpathy_emoji_tokenizer.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_01/assets/karpathy_emoji_tokenizer.jpeg -------------------------------------------------------------------------------- /module_01/assets/lm_training_notebook_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_01/assets/lm_training_notebook_3.png -------------------------------------------------------------------------------- /module_01/assets/multihead_attention_notebook_3.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_01/assets/multihead_attention_notebook_3.png -------------------------------------------------------------------------------- /module_01/assets/positional_emb_notebook_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_01/assets/positional_emb_notebook_3.png -------------------------------------------------------------------------------- /module_01/assets/skipgram_arch_notebook_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_01/assets/skipgram_arch_notebook_1.png -------------------------------------------------------------------------------- /module_01/assets/transformer_arch_notebook_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_01/assets/transformer_arch_notebook_3.png -------------------------------------------------------------------------------- /module_01/solutions/norm_corpus.txt: -------------------------------------------------------------------------------- 1 | sky blue beautiful 2 | love blue beautiful sky 3 | quick brown fox jumps lazy dog 4 | kings breakfast sausages ham bacon eggs toast beans 5 | love green eggs ham sausages bacon 6 | brown fox quick blue dog lazy 7 | sky blue sky beautiful today 8 | dog lazy brown fox quick 9 | -------------------------------------------------------------------------------- /module_02/01_llm_overview.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_02/01_llm_overview.pdf -------------------------------------------------------------------------------- /module_02/02_simple_text_generator.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Text Generation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "\n", 15 | " \"Open\n", 16 | "" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import torch" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "# if you are on apple silicon execute the below command before starting jupyter\n", 35 | "#export PYTORCH_ENABLE_MPS_FALLBACK=1\n", 36 | "if torch.cuda.is_available():\n", 37 | " DEVICE = 'cuda'\n", 38 | " Tensor = torch.cuda.FloatTensor\n", 39 | " LongTensor = torch.cuda.LongTensor\n", 40 | " DEVICE_ID = 0\n", 41 | "# Some Causal Modeling Ops are not available on MPS yet \n", 42 | "# elif torch.backends.mps.is_available():\n", 43 | "# DEVICE = 'mps'\n", 44 | "# Tensor = torch.FloatTensor\n", 45 | "# LongTensor = torch.LongTensor\n", 46 | "# DEVICE_ID = 0\n", 47 | "else:\n", 48 | " DEVICE = 'cpu'\n", 49 | " Tensor = torch.FloatTensor\n", 50 | " LongTensor = torch.LongTensor\n", 51 | " DEVICE_ID = -1\n", 52 | "print(f\"Backend Accelerator Device={DEVICE}\")" 53 | ] 54 | }, 
55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "import time\n", 62 | "import datetime" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "import pandas as pd\n", 72 | "import numpy as np\n", 73 | "import transformers\n", 74 | "from numpy import random\n", 75 | "from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config\n", 76 | "from transformers import get_linear_schedule_with_warmup\n", 77 | "from torch.optim import AdamW" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": null, 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "from torch.utils.data import Dataset, DataLoader\n", 87 | "from torch.utils.data import random_split, RandomSampler, SequentialSampler\n", 88 | "torch.manual_seed(42)" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "print(transformers.__version__)" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler\n", 107 | "torch.manual_seed(42)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "# colab/gpu systems\n", 117 | "!nvidia-smi\n", 118 | "# htop or activity monitor for linux based systems" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "## Get Data\n", 126 | "We will fine-tune a pre-trained model GPT-2 model on our earlier dataset itself. But wait, what do you mean pre-trained?" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "!wget -O sherlock_homes.txt http://www.gutenberg.org/files/1661/1661-0.txt" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "filename = \"sherlock_homes.txt\"\n", 145 | "raw_text = open(filename, 'r', encoding='utf-8').read()\n", 146 | "text = raw_text [1450:100000]" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "## Foundation & Pre-trained Models\n", 154 | "\n", 155 | "**Foundation models** are the models that are trained from scratch on a large corpus of data. In the context of NLP, these models are designed to learn the fundamental patterns, structures, and representations of natural language. Foundation models are typically trained using unsupervised learning objectives, such as language modeling or autoencoding, where the model predicts the next word in a sentence or reconstructs the original sentence from a corrupted version/masked version.\n", 156 | "Models such as GPT, BERT, T5, etc are typical examples of Foundation Models\n", 157 | "\n", 158 | "\n", 159 | "Instances of foundation models that have been trained on specific downstream tasks or datasets are termed as **Pre-Trained Models**. Pretrained models leverage the knowledge learned from foundation models and are fine-tuned on task-specific data to perform well on specific NLP tasks, such as text classification, named entity recognition, machine translation, sentiment analysis, etc." 
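To make the foundation vs. pre-trained/fine-tuned distinction above concrete, here is a minimal, illustrative sketch (an editorial aside rather than one of the original notebook cells). It assumes the two Hugging Face Hub checkpoints named below are reachable: `gpt2` as the generic foundation checkpoint, and `raghavbali/gpt2_ft_sherlock_holmes` (the `MODEL_NAME` used a few cells later), which is assumed to be a GPT-2 checkpoint further fine-tuned on the Sherlock Holmes text used in this module.

```python
# Minimal sketch (assumption-laden aside, not an original notebook cell):
# contrast a foundation checkpoint with a domain fine-tuned checkpoint.
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Foundation model: GPT-2 trained on a large, generic web corpus.
foundation_model = GPT2LMHeadModel.from_pretrained("gpt2")
foundation_tok = GPT2Tokenizer.from_pretrained("gpt2")

# Assumed fine-tuned model: same architecture, weights further trained on
# Sherlock Holmes text (the MODEL_NAME checkpoint used later in this notebook).
ft_model = GPT2LMHeadModel.from_pretrained("raghavbali/gpt2_ft_sherlock_holmes")
ft_tok = GPT2Tokenizer.from_pretrained("raghavbali/gpt2_ft_sherlock_holmes")

# Same prompt, two rather different styles of continuation.
prompt = "the King of England"
for name, mdl, tok in [("foundation", foundation_model, foundation_tok),
                       ("fine-tuned", ft_model, ft_tok)]:
    ids = tok.encode(prompt, return_tensors="pt")
    out = mdl.generate(ids, max_new_tokens=30, do_sample=True, top_p=0.92,
                       pad_token_id=tok.eos_token_id)
    print(name, ":", tok.decode(out[0], skip_special_tokens=True))
```

The point of the comparison is simply that both checkpoints share the same architecture and tokenizer family; only the weights, and hence the flavour of the generated text, differ.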
160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "BOS_TOKEN = '<|sot|>'\n", 169 | "EOS_TOKEN = '<|eot|>'\n", 170 | "PAD_TOKEN = '<|pad|>'\n", 171 | "MODEL_NAME = \"raghavbali/gpt2_ft_sherlock_holmes\"" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "# first, let us get the tokenizer object\n", 181 | "tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME,\n", 182 | " bos_token=BOS_TOKEN,\n", 183 | " eos_token=EOS_TOKEN,\n", 184 | " pad_token=PAD_TOKEN\n", 185 | " )" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "## Prepare Dataset" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "class GPT2Dataset(Dataset):\n", 202 | "\n", 203 | " def __init__(self, txt_list, tokenizer, max_length=768):\n", 204 | "\n", 205 | " self.tokenizer = tokenizer\n", 206 | " self.input_ids = []\n", 207 | " self.attn_masks = []\n", 208 | "\n", 209 | " for txt in txt_list:\n", 210 | "\n", 211 | " encodings_dict = tokenizer(\n", 212 | " #TODO: Input format [beginning of sentence] input_text [end of sentence]\n", 213 | " truncation=True,\n", 214 | " max_length=max_length,\n", 215 | " padding=\"max_length\"\n", 216 | " )\n", 217 | "\n", 218 | " self.input_ids.append(torch.tensor(encodings_dict['input_ids']))\n", 219 | " self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))\n", 220 | "\n", 221 | " def __len__(self):\n", 222 | " return #TODO: return size of input_ids\n", 223 | "\n", 224 | " def __getitem__(self, idx):\n", 225 | " return self.input_ids[idx], self.attn_masks[idx]" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "# set batch size to work it out on colab\n", 235 | "BATCH_SIZE = 3" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "dataset = GPT2Dataset(text.split('\\n'),\n", 245 | " tokenizer, max_length=768)\n", 246 | "\n", 247 | "# Split into training and validation sets\n", 248 | "train_size = int(0.9 * len(dataset))\n", 249 | "val_size = len(dataset) - train_size\n", 250 | "\n", 251 | "train_dataset, val_dataset = random_split(dataset, [train_size, val_size])\n", 252 | "\n", 253 | "print('{:>5,} training samples'.format(train_size))\n", 254 | "print('{:>5,} validation samples'.format(val_size))" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": null, 260 | "metadata": {}, 261 | "outputs": [], 262 | "source": [ 263 | "# Create the DataLoaders for our training and validation datasets.\n", 264 | "train_dataloader = DataLoader(\n", 265 | " train_dataset,\n", 266 | " sampler = RandomSampler(train_dataset),\n", 267 | " batch_size = #TODO: set batch-size\n", 268 | " )\n", 269 | "\n", 270 | "validation_dataloader = DataLoader(\n", 271 | " val_dataset,\n", 272 | " sampler = SequentialSampler(val_dataset),\n", 273 | " batch_size = #TODO: set batch-size\n", 274 | " )" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": {}, 280 | "source": [ 281 | "## Setup Model Object" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": {}, 288 | "outputs": [], 289 | 
"source": [ 290 | "# Training Params\n", 291 | "epochs = 1 #3 seems good if you train from gpt2 checkpoint\n", 292 | "learning_rate = 5e-4\n", 293 | "# to speed up learning\n", 294 | "warmup_steps = 1e2\n", 295 | "epsilon = 1e-8\n", 296 | "\n", 297 | "# generate output after N steps\n", 298 | "sample_every = 100" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "# Set Config\n", 308 | "configuration = GPT2Config.from_pretrained(MODEL_NAME,\n", 309 | " output_hidden_states=False)\n", 310 | "\n", 311 | "# instantiate the model\n", 312 | "model = GPT2LMHeadModel.from_pretrained(MODEL_NAME, config=configuration,)\n", 313 | "\n", 314 | "# NOTE: This is important to imply that we have updated BOS, EOS, etc\n", 315 | "model.resize_token_embeddings(len(tokenizer))\n", 316 | "model = model.to(DEVICE)" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "optimizer = AdamW(model.parameters(),\n", 326 | " lr = learning_rate,\n", 327 | " eps = epsilon\n", 328 | " )" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "metadata": {}, 335 | "outputs": [], 336 | "source": [ 337 | "total_steps = len(train_dataloader) * epochs\n", 338 | "\n", 339 | "# Create the learning rate scheduler.\n", 340 | "scheduler = get_linear_schedule_with_warmup(optimizer,\n", 341 | " num_warmup_steps = warmup_steps,\n", 342 | " num_training_steps = total_steps)" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": null, 348 | "metadata": {}, 349 | "outputs": [], 350 | "source": [ 351 | "def format_time(elapsed):\n", 352 | " return str(datetime.timedelta(seconds=int(round((elapsed)))))" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": null, 358 | "metadata": {}, 359 | "outputs": [], 360 | "source": [ 361 | "total_t0 = time.time()\n", 362 | "training_stats = []\n", 363 | "\n", 364 | "\n", 365 | "for epoch_i in range(0, epochs):\n", 366 | "\n", 367 | " # Training\n", 368 | " print(\"*\"*25)\n", 369 | " print('>> Epoch {:} / {:} '.format(epoch_i + 1, epochs))\n", 370 | " print(\"*\"*25)\n", 371 | "\n", 372 | " t0 = time.time()\n", 373 | " total_train_loss = 0\n", 374 | "\n", 375 | " #TODO: call model's training interface\n", 376 | " for step, batch in enumerate(train_dataloader):\n", 377 | "\n", 378 | " b_input_ids = batch[0].to(DEVICE)\n", 379 | " b_labels = batch[0].to(DEVICE)\n", 380 | " b_masks = batch[1].to(DEVICE)\n", 381 | "\n", 382 | " model.zero_grad()\n", 383 | "\n", 384 | " outputs = model( b_input_ids,\n", 385 | " labels=b_labels,\n", 386 | " attention_mask = b_masks,\n", 387 | " token_type_ids=None\n", 388 | " )\n", 389 | "\n", 390 | " loss = outputs[0]\n", 391 | "\n", 392 | " batch_loss = loss.item()\n", 393 | " total_train_loss += batch_loss\n", 394 | "\n", 395 | " # Get sample every x batches.\n", 396 | " if step % sample_every == 0 and not step == 0:\n", 397 | "\n", 398 | " elapsed = format_time(time.time() - t0)\n", 399 | " print(' Batch {:>5,} of {:>5,}. Training Loss: {:>5,}. 
Time Taken: {:}.'.format(step,\n", 400 | " len(train_dataloader),\n", 401 | " batch_loss,\n", 402 | " elapsed))\n", 403 | "\n", 404 | " model.eval()\n", 405 | "\n", 406 | " sample_outputs = model.generate(\n", 407 | " do_sample=True,\n", 408 | " top_k=50,\n", 409 | " max_length = 200,\n", 410 | " top_p=0.95,\n", 411 | " num_return_sequences=1,\n", 412 | " pad_token_id=tokenizer.eos_token_id\n", 413 | " )\n", 414 | " for i, sample_output in enumerate(sample_outputs):\n", 415 | " print(\"{}: {}\".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))\n", 416 | "\n", 417 | " model.train()\n", 418 | "\n", 419 | " loss.backward()\n", 420 | " optimizer.step()\n", 421 | "\n", 422 | " scheduler.step()\n", 423 | "\n", 424 | " # Average Loss\n", 425 | " avg_train_loss = total_train_loss / len(train_dataloader)\n", 426 | "\n", 427 | " # training time\n", 428 | " training_time = format_time(time.time() - t0)\n", 429 | "\n", 430 | " print(\"Average training loss: {0:.2f}\".format(avg_train_loss))\n", 431 | " print(\"Training epoch time: {:}\".format(training_time))\n", 432 | "\n", 433 | " # Validation\n", 434 | " t0 = time.time()\n", 435 | "\n", 436 | " model.eval()\n", 437 | " total_eval_loss = 0\n", 438 | " nb_eval_steps = 0\n", 439 | "\n", 440 | " for batch in validation_dataloader:\n", 441 | "\n", 442 | " b_input_ids = batch[0].to(DEVICE)\n", 443 | " b_labels = batch[0].to(DEVICE)\n", 444 | " b_masks = batch[1].to(DEVICE)\n", 445 | "\n", 446 | " with torch.no_grad():\n", 447 | "\n", 448 | " outputs = model(b_input_ids,#TODO: pass batch's ids,\n", 449 | " attention_mask = b_masks,\n", 450 | " labels=b_labels)\n", 451 | "\n", 452 | " loss = outputs[0]\n", 453 | "\n", 454 | " batch_loss = loss.item()\n", 455 | " total_eval_loss += batch_loss\n", 456 | "\n", 457 | " avg_val_loss = total_eval_loss / len(validation_dataloader)\n", 458 | "\n", 459 | " validation_time = format_time(time.time() - t0)\n", 460 | "\n", 461 | " print(\" Validation Loss: {0:.2f}\".format(avg_val_loss))\n", 462 | " print(\" Validation time: {:}\".format(validation_time))\n", 463 | "\n", 464 | " # Record all statistics from this epoch.\n", 465 | " training_stats.append(\n", 466 | " {\n", 467 | " 'epoch': epoch_i + 1,\n", 468 | " 'train_loss': avg_train_loss,\n", 469 | " 'val_oss': avg_val_loss,\n", 470 | " 'train_ime': training_time,\n", 471 | " 'val_ime': validation_time\n", 472 | " }\n", 473 | " )\n", 474 | "\n", 475 | "print(\"Training Completed\")\n", 476 | "print(\"Total training time {:} (h:mm:ss)\".format(format_time(time.time()-total_t0)))" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": null, 482 | "metadata": {}, 483 | "outputs": [], 484 | "source": [ 485 | "df_stats = pd.DataFrame(data=training_stats)\n", 486 | "df_stats" 487 | ] 488 | }, 489 | { 490 | "cell_type": "markdown", 491 | "metadata": { 492 | "editable": true, 493 | "slideshow": { 494 | "slide_type": "" 495 | }, 496 | "tags": [] 497 | }, 498 | "source": [ 499 | "## Save the Model" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": null, 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [ 508 | "import os" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": null, 514 | "metadata": {}, 515 | "outputs": [], 516 | "source": [ 517 | "output_dir = './model_save/'\n", 518 | "\n", 519 | "if not os.path.exists(output_dir):\n", 520 | " os.makedirs(output_dir)\n", 521 | "\n", 522 | "model_to_save = model.module if hasattr(model, 'module') else model\n", 523 | 
"model_to_save.save_pretrained(output_dir)\n", 524 | "tokenizer.save_pretrained(output_dir)" 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": null, 530 | "metadata": {}, 531 | "outputs": [], 532 | "source": [ 533 | "model.eval()\n", 534 | "\n", 535 | "prompt = \"the King of England\"\n", 536 | "\n", 537 | "tokenized_inputs = torch.tensor(tokenizer.encode(BOS_TOKEN+prompt)).unsqueeze(0)\n", 538 | "tokenized_inputs = tokenized_inputs.to(DEVICE)\n", 539 | "\n", 540 | "sample_outputs = model.generate(\n", 541 | " tokenized_inputs,\n", 542 | " do_sample=True,\n", 543 | " top_k=50,\n", 544 | " max_length = len(generated) + 50,\n", 545 | " top_p=0.92,\n", 546 | " num_return_sequences=3,\n", 547 | " pad_token_id=tokenizer.eos_token_id,\n", 548 | " temperature=0.8,\n", 549 | " )\n", 550 | "\n", 551 | "for i, sample_output in enumerate(sample_outputs):\n", 552 | " print(\"{}: {}\\n\\n\".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))" 553 | ] 554 | }, 555 | { 556 | "cell_type": "code", 557 | "execution_count": null, 558 | "metadata": {}, 559 | "outputs": [], 560 | "source": [ 561 | "# compare output to foundation model\n", 562 | "pre_trainedtokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)\n", 563 | "pre_trainedmodel = GPT2LMHeadModel.from_pretrained(MODEL_NAME)" 564 | ] 565 | }, 566 | { 567 | "cell_type": "code", 568 | "execution_count": null, 569 | "metadata": {}, 570 | "outputs": [], 571 | "source": [ 572 | "input_ids = #TODO: Prepare tokenizer input, HINT: use trainedtokenizer object\n", 573 | "\n", 574 | "# Generate text\n", 575 | "output = pre_trainedmodel.generate(\n", 576 | " input_ids,\n", 577 | " bos_token_id=random.randint(1,30000),\n", 578 | " max_length=len(input_ids[0]) + 50,\n", 579 | " num_return_sequences=1,\n", 580 | " pad_token_id=tokenizer.eos_token_id,\n", 581 | " do_sample=True,\n", 582 | " top_p=0.92, # Adjust the sampling parameters as needed\n", 583 | " temperature=0.8,\n", 584 | ")" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": null, 590 | "metadata": {}, 591 | "outputs": [], 592 | "source": [ 593 | "pre_trainedtokenizer.decode(output[0], skip_special_tokens=True)" 594 | ] 595 | }, 596 | { 597 | "cell_type": "markdown", 598 | "metadata": {}, 599 | "source": [ 600 | "## Decoding Strategies\n", 601 | "\n", 602 | "The ``generate()`` utility we used above used every output prediction as input for the next time step. This method of using the highest probability prediction as output is called __Greedy Decoding__. Greeding decoding is fast and simple but is marred with issues we saw in samples we just generated.\n", 603 | "\n", 604 | "Focusing on only highest probability output narrows our model's focus to just the next step which inturn may result in inconsistent or non-dictionary terms/words.\n", 605 | "\n", 606 | "### Beam Search\n", 607 | "Beam search is the obvious next step to improve the output predictions from the model. Instead of being greedy, beam search keeps track of n paths at any given time and selects the path with overall higher probability.\n", 608 | "\n", 609 | "\n", 610 | "\n", 611 | "### Other Key Decoding Strategies:\n", 612 | "- Sampling\n", 613 | "- Top-k Sampling\n", 614 | "- Nucleus Sampling\n", 615 | "\n", 616 | "### Temperature\n", 617 | "Though sampling helps bring in required amount of randomness, it is not free from issues. Random sampling leads to gibberish and incoherence at times. To control the amount of randomness, we introduce __temperature__. 
622 | { 623 | "cell_type": "code", 624 | "execution_count": null, 625 | "metadata": {}, 626 | "outputs": [], 627 | "source": [ 628 | "prompt = \"the King of England\"\n", 629 | "\n", 630 | "generated = tokenizer.encode(BOS_TOKEN+prompt,return_tensors='pt')\n", 631 | "generated = generated.to(DEVICE)\n", 632 | "\n", 633 | "beam_output = model.generate(\n", 634 | "    generated,\n", 635 | "    max_new_tokens=40,\n", 636 | "    num_beams=5,\n", 637 | "    num_return_sequences=5,\n", 638 | "    early_stopping=True\n", 639 | ")\n", 640 | "\n", 641 | "for i in range(5):\n", 642 | "    print(f\"Beam {i} Output:\\n\" + 100 * '-')\n", 643 | "    print(tokenizer.decode(beam_output[i], skip_special_tokens=True))" 644 | ] 645 | }, 646 | { 647 | "cell_type": "markdown", 648 | "metadata": {}, 649 | "source": [ 650 | "## Limitations and What Next?\n", 651 | "- Long Range Context\n", 652 | "- Scalability\n", 653 | "- Instruction-led generation\n", 654 | "- Benchmarking\n", 655 | "- Hallucination / Dreaming\n" 656 | ] 657 | }, 658 | { 659 | "cell_type": "markdown", 660 | "metadata": {}, 661 | "source": [] 662 | } 663 | ], 664 | "metadata": { 665 | "accelerator": "GPU", 666 | "colab": { 667 | "gpuType": "T4", 668 | "provenance": [] 669 | }, 670 | "kernelspec": { 671 | "display_name": "Python 3 (ipykernel)", 672 | "language": "python", 673 | "name": "python3" 674 | }, 675 | "language_info": { 676 | "codemirror_mode": { 677 | "name": "ipython", 678 | "version": 3 679 | }, 680 | "file_extension": ".py", 681 | "mimetype": "text/x-python", 682 | "name": "python", 683 | "nbconvert_exporter": "python", 684 | "pygments_lexer": "ipython3", 685 | "version": "3.11.9" 686 | } 687 | }, 688 | "nbformat": 4, 689 | "nbformat_minor": 4 690 | } 691 | -------------------------------------------------------------------------------- /module_02/assets/beamsearch_nb_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_02/assets/beamsearch_nb_2.png -------------------------------------------------------------------------------- /module_02/solutions/02_simple_text_generator.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Text Generation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "\n", 15 | " \"Open\n", 16 | "" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 2, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "import torch" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 25, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "name": "stdout", 35 | "output_type": "stream", 36 | "text": [ 37 | "Backend Accelerator Device=cpu\n" 38 | ] 39 | } 40 | ], 41 | "source": [ 42 | "# if you are on apple silicon execute the below command before starting jupyter\n", 43 | "#export PYTORCH_ENABLE_MPS_FALLBACK=1\n", 44 | "if torch.cuda.is_available():\n", 45 | " DEVICE = 'cuda'\n", 46 | " Tensor = torch.cuda.FloatTensor\n", 47 | " LongTensor = torch.cuda.LongTensor\n", 48 | " DEVICE_ID = 0\n", 49 
| "# Some Causal Modeling Ops are not available on MPS yet \n", 50 | "# elif torch.backends.mps.is_available():\n", 51 | "# DEVICE = 'mps'\n", 52 | "# Tensor = torch.FloatTensor\n", 53 | "# LongTensor = torch.LongTensor\n", 54 | "# DEVICE_ID = 0\n", 55 | "else:\n", 56 | " DEVICE = 'cpu'\n", 57 | " Tensor = torch.FloatTensor\n", 58 | " LongTensor = torch.LongTensor\n", 59 | " DEVICE_ID = -1\n", 60 | "print(f\"Backend Accelerator Device={DEVICE}\")" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 6, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "import time\n", 70 | "import datetime" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 7, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "import pandas as pd\n", 80 | "import numpy as np\n", 81 | "import transformers\n", 82 | "from numpy import random\n", 83 | "from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config\n", 84 | "from transformers import get_linear_schedule_with_warmup\n", 85 | "from torch.optim import AdamW" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": 8, 91 | "metadata": {}, 92 | "outputs": [ 93 | { 94 | "data": { 95 | "text/plain": [ 96 | "" 97 | ] 98 | }, 99 | "execution_count": 8, 100 | "metadata": {}, 101 | "output_type": "execute_result" 102 | } 103 | ], 104 | "source": [ 105 | "from torch.utils.data import Dataset, DataLoader\n", 106 | "from torch.utils.data import random_split, RandomSampler, SequentialSampler\n", 107 | "torch.manual_seed(42)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 9, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "name": "stdout", 117 | "output_type": "stream", 118 | "text": [ 119 | "4.42.3\n" 120 | ] 121 | } 122 | ], 123 | "source": [ 124 | "print(transformers.__version__)" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": 10, 130 | "metadata": {}, 131 | "outputs": [ 132 | { 133 | "data": { 134 | "text/plain": [ 135 | "" 136 | ] 137 | }, 138 | "execution_count": 10, 139 | "metadata": {}, 140 | "output_type": "execute_result" 141 | } 142 | ], 143 | "source": [ 144 | "from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler\n", 145 | "torch.manual_seed(42)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 27, 151 | "metadata": {}, 152 | "outputs": [ 153 | { 154 | "name": "stdout", 155 | "output_type": "stream", 156 | "text": [ 157 | "zsh:1: command not found: nvidia-smi\n" 158 | ] 159 | } 160 | ], 161 | "source": [ 162 | "# colab/gpu systems\n", 163 | "!nvidia-smi\n", 164 | "# htop or activity monitor for linux based systems" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "## Get Data\n", 172 | "We will fine-tune a pre-trained model GPT-2 model on our earlier dataset itself. But wait, what do you mean pre-trained?" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 11, 178 | "metadata": {}, 179 | "outputs": [ 180 | { 181 | "name": "stdout", 182 | "output_type": "stream", 183 | "text": [ 184 | "--2024-07-28 00:53:52-- http://www.gutenberg.org/files/1661/1661-0.txt\n", 185 | "Resolving www.gutenberg.org (www.gutenberg.org)... 2610:28:3090:3000:0:bad:cafe:47, 152.19.134.47\n", 186 | "Connecting to www.gutenberg.org (www.gutenberg.org)|2610:28:3090:3000:0:bad:cafe:47|:80... connected.\n", 187 | "HTTP request sent, awaiting response... 
302 Found\n", 188 | "Location: https://www.gutenberg.org/files/1661/1661-0.txt [following]\n", 189 | "--2024-07-28 00:53:52-- https://www.gutenberg.org/files/1661/1661-0.txt\n", 190 | "Connecting to www.gutenberg.org (www.gutenberg.org)|2610:28:3090:3000:0:bad:cafe:47|:443... connected.\n", 191 | "HTTP request sent, awaiting response... 200 OK\n", 192 | "Length: 607504 (593K) [text/plain]\n", 193 | "Saving to: ‘sherlock_homes.txt’\n", 194 | "\n", 195 | "sherlock_homes.txt 100%[===================>] 593.27K 1.21MB/s in 0.5s \n", 196 | "\n", 197 | "2024-07-28 00:53:53 (1.21 MB/s) - ‘sherlock_homes.txt’ saved [607504/607504]\n", 198 | "\n" 199 | ] 200 | } 201 | ], 202 | "source": [ 203 | "!wget -O sherlock_homes.txt http://www.gutenberg.org/files/1661/1661-0.txt" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 12, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "filename = \"sherlock_homes.txt\"\n", 213 | "raw_text = open(filename, 'r', encoding='utf-8').read()\n", 214 | "text = raw_text [1450:100000]" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "## Foundation & Pre-trained Models\n", 222 | "\n", 223 | "**Foundation models** are the models that are trained from scratch on a large corpus of data. In the context of NLP, these models are designed to learn the fundamental patterns, structures, and representations of natural language. Foundation models are typically trained using unsupervised learning objectives, such as language modeling or autoencoding, where the model predicts the next word in a sentence or reconstructs the original sentence from a corrupted version/masked version.\n", 224 | "Models such as GPT, BERT, T5, etc are typical examples of Foundation Models\n", 225 | "\n", 226 | "\n", 227 | "Instances of foundation models that have been trained on specific downstream tasks or datasets are termed as **Pre-Trained Models**. Pretrained models leverage the knowledge learned from foundation models and are fine-tuned on task-specific data to perform well on specific NLP tasks, such as text classification, named entity recognition, machine translation, sentiment analysis, etc." 
228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 13, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "BOS_TOKEN = '<|sot|>'\n", 237 | "EOS_TOKEN = '<|eot|>'\n", 238 | "PAD_TOKEN = '<|pad|>'\n", 239 | "MODEL_NAME = \"raghavbali/gpt2_ft_sherlock_holmes\"" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 14, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "# first, let us get the tokenizer object\n", 249 | "tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME,\n", 250 | " bos_token=BOS_TOKEN,\n", 251 | " eos_token=EOS_TOKEN,\n", 252 | " pad_token=PAD_TOKEN\n", 253 | " )" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "## Prepare Dataset" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 15, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "class GPT2Dataset(Dataset):\n", 270 | "\n", 271 | " def __init__(self, txt_list, tokenizer, max_length=768):\n", 272 | "\n", 273 | " self.tokenizer = tokenizer\n", 274 | " self.input_ids = []\n", 275 | " self.attn_masks = []\n", 276 | "\n", 277 | " for txt in txt_list:\n", 278 | "\n", 279 | " encodings_dict = tokenizer(\n", 280 | " BOS_TOKEN + txt + EOS_TOKEN, #TODO\n", 281 | " truncation=True,\n", 282 | " max_length=max_length,\n", 283 | " padding=\"max_length\"\n", 284 | " )\n", 285 | "\n", 286 | " self.input_ids.append(torch.tensor(encodings_dict['input_ids']))\n", 287 | " self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))\n", 288 | "\n", 289 | " def __len__(self):\n", 290 | " return len(self.input_ids)#TODO: return size of input_ids\n", 291 | "\n", 292 | " def __getitem__(self, idx):\n", 293 | " return self.input_ids[idx], self.attn_masks[idx]" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 16, 299 | "metadata": {}, 300 | "outputs": [], 301 | "source": [ 302 | "# set batch size to work it out on colab\n", 303 | "BATCH_SIZE = 3" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 26, 309 | "metadata": {}, 310 | "outputs": [ 311 | { 312 | "name": "stdout", 313 | "output_type": "stream", 314 | "text": [ 315 | "1,949 training samples\n", 316 | " 217 validation samples\n" 317 | ] 318 | } 319 | ], 320 | "source": [ 321 | "dataset = GPT2Dataset(text.split('\\n'),\n", 322 | " tokenizer, max_length=768)\n", 323 | "\n", 324 | "# Split into training and validation sets\n", 325 | "train_size = int(0.9 * len(dataset))\n", 326 | "val_size = len(dataset) - train_size\n", 327 | "\n", 328 | "train_dataset, val_dataset = random_split(dataset, [train_size, val_size])\n", 329 | "\n", 330 | "print('{:>5,} training samples'.format(train_size))\n", 331 | "print('{:>5,} validation samples'.format(val_size))" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 27, 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [ 340 | "# Create the DataLoaders for our training and validation datasets.\n", 341 | "train_dataloader = DataLoader(\n", 342 | " train_dataset,\n", 343 | " sampler = RandomSampler(train_dataset),\n", 344 | " batch_size = BATCH_SIZE#TODO: set batch-size\n", 345 | " )\n", 346 | "\n", 347 | "validation_dataloader = DataLoader(\n", 348 | " val_dataset,\n", 349 | " sampler = SequentialSampler(val_dataset),\n", 350 | " batch_size = BATCH_SIZE#TODO: set batch-size\n", 351 | " )" 352 | ] 353 | }, 354 | { 355 | "cell_type": "markdown", 356 | "metadata": {}, 357 | "source": [ 358 
| "## Setup Model Object" 359 | ] 360 | }, 361 | { 362 | "cell_type": "code", 363 | "execution_count": 28, 364 | "metadata": {}, 365 | "outputs": [], 366 | "source": [ 367 | "# Training Params\n", 368 | "epochs = 1 #3 seems good if you train from gpt2 checkpoint\n", 369 | "learning_rate = 5e-4\n", 370 | "# to speed up learning\n", 371 | "warmup_steps = 1e2\n", 372 | "epsilon = 1e-8\n", 373 | "\n", 374 | "# generate output after N steps\n", 375 | "sample_every = 100" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": 29, 381 | "metadata": {}, 382 | "outputs": [], 383 | "source": [ 384 | "# Set Config\n", 385 | "configuration = GPT2Config.from_pretrained(MODEL_NAME,\n", 386 | " output_hidden_states=False)\n", 387 | "\n", 388 | "# instantiate the model\n", 389 | "model = GPT2LMHeadModel.from_pretrained(MODEL_NAME, config=configuration,)\n", 390 | "\n", 391 | "# NOTE: This is important to imply that we have updated BOS, EOS, etc\n", 392 | "model.resize_token_embeddings(len(tokenizer))\n", 393 | "model = model.to(DEVICE)" 394 | ] 395 | }, 396 | { 397 | "cell_type": "code", 398 | "execution_count": 30, 399 | "metadata": {}, 400 | "outputs": [], 401 | "source": [ 402 | "optimizer = AdamW(model.parameters(),\n", 403 | " lr = learning_rate,\n", 404 | " eps = epsilon\n", 405 | " )" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": 31, 411 | "metadata": {}, 412 | "outputs": [], 413 | "source": [ 414 | "total_steps = len(train_dataloader) * epochs\n", 415 | "\n", 416 | "# Create the learning rate scheduler.\n", 417 | "scheduler = get_linear_schedule_with_warmup(optimizer,\n", 418 | " num_warmup_steps = warmup_steps,\n", 419 | " num_training_steps = total_steps)" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": 32, 425 | "metadata": {}, 426 | "outputs": [], 427 | "source": [ 428 | "def format_time(elapsed):\n", 429 | " return str(datetime.timedelta(seconds=int(round((elapsed)))))" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": null, 435 | "metadata": {}, 436 | "outputs": [ 437 | { 438 | "name": "stdout", 439 | "output_type": "stream", 440 | "text": [ 441 | "*************************\n", 442 | ">> Epoch 1 / 1 \n", 443 | "*************************\n" 444 | ] 445 | } 446 | ], 447 | "source": [ 448 | "total_t0 = time.time()\n", 449 | "training_stats = []\n", 450 | "\n", 451 | "\n", 452 | "for epoch_i in range(0, epochs):\n", 453 | "\n", 454 | " # Training\n", 455 | " print(\"*\"*25)\n", 456 | " print('>> Epoch {:} / {:} '.format(epoch_i + 1, epochs))\n", 457 | " print(\"*\"*25)\n", 458 | "\n", 459 | " t0 = time.time()\n", 460 | " total_train_loss = 0\n", 461 | "\n", 462 | " #TODO: call model's training interface\n", 463 | " model.train()\n", 464 | " for step, batch in enumerate(train_dataloader):\n", 465 | "\n", 466 | " b_input_ids = batch[0].to(DEVICE)\n", 467 | " b_labels = batch[0].to(DEVICE)\n", 468 | " b_masks = batch[1].to(DEVICE)\n", 469 | "\n", 470 | " model.zero_grad()\n", 471 | "\n", 472 | " outputs = model( b_input_ids,\n", 473 | " labels=b_labels,\n", 474 | " attention_mask = b_masks,\n", 475 | " token_type_ids=None\n", 476 | " )\n", 477 | "\n", 478 | " loss = outputs[0]\n", 479 | "\n", 480 | " batch_loss = loss.item()\n", 481 | " total_train_loss += batch_loss\n", 482 | "\n", 483 | " # Get sample every x batches.\n", 484 | " if step % sample_every == 0 and not step == 0:\n", 485 | "\n", 486 | " elapsed = format_time(time.time() - t0)\n", 487 | " print(' Batch {:>5,} of {:>5,}. 
Training Loss: {:>5,}. Time Taken: {:}.'.format(step,\n", 488 | " len(train_dataloader),\n", 489 | " batch_loss,\n", 490 | " elapsed))\n", 491 | "\n", 492 | " model.eval()\n", 493 | "\n", 494 | " sample_outputs = model.generate(\n", 495 | " do_sample=True,\n", 496 | " top_k=50,\n", 497 | " max_length = 200,\n", 498 | " top_p=0.95,\n", 499 | " num_return_sequences=1,\n", 500 | " pad_token_id=tokenizer.eos_token_id\n", 501 | " )\n", 502 | " for i, sample_output in enumerate(sample_outputs):\n", 503 | " print(\"{}: {}\".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))\n", 504 | "\n", 505 | " model.train()\n", 506 | "\n", 507 | " loss.backward()\n", 508 | " optimizer.step()\n", 509 | "\n", 510 | " scheduler.step()\n", 511 | "\n", 512 | " # Average Loss\n", 513 | " avg_train_loss = total_train_loss / len(train_dataloader)\n", 514 | "\n", 515 | " # training time\n", 516 | " training_time = format_time(time.time() - t0)\n", 517 | "\n", 518 | " print(\"Average training loss: {0:.2f}\".format(avg_train_loss))\n", 519 | " print(\"Training epoch time: {:}\".format(training_time))\n", 520 | "\n", 521 | " # Validation\n", 522 | " t0 = time.time()\n", 523 | "\n", 524 | " model.eval()\n", 525 | " total_eval_loss = 0\n", 526 | " nb_eval_steps = 0\n", 527 | "\n", 528 | " for batch in validation_dataloader:\n", 529 | "\n", 530 | " b_input_ids = batch[0].to(DEVICE)\n", 531 | " b_labels = batch[0].to(DEVICE)\n", 532 | " b_masks = batch[1].to(DEVICE)\n", 533 | "\n", 534 | " with torch.no_grad():\n", 535 | "\n", 536 | " outputs = model(b_input_ids,#TODO: pass batch's ids,\n", 537 | " attention_mask = b_masks,\n", 538 | " labels=b_labels)\n", 539 | "\n", 540 | " loss = outputs[0]\n", 541 | "\n", 542 | " batch_loss = loss.item()\n", 543 | " total_eval_loss += batch_loss\n", 544 | "\n", 545 | " avg_val_loss = total_eval_loss / len(validation_dataloader)\n", 546 | "\n", 547 | " validation_time = format_time(time.time() - t0)\n", 548 | "\n", 549 | " print(\" Validation Loss: {0:.2f}\".format(avg_val_loss))\n", 550 | " print(\" Validation time: {:}\".format(validation_time))\n", 551 | "\n", 552 | " # Record all statistics from this epoch.\n", 553 | " training_stats.append(\n", 554 | " {\n", 555 | " 'epoch': epoch_i + 1,\n", 556 | " 'train_loss': avg_train_loss,\n", 557 | " 'val_oss': avg_val_loss,\n", 558 | " 'train_ime': training_time,\n", 559 | " 'val_ime': validation_time\n", 560 | " }\n", 561 | " )\n", 562 | "\n", 563 | "print(\"Training Completed\")\n", 564 | "print(\"Total training time {:} (h:mm:ss)\".format(format_time(time.time()-total_t0)))" 565 | ] 566 | }, 567 | { 568 | "cell_type": "code", 569 | "execution_count": null, 570 | "metadata": {}, 571 | "outputs": [], 572 | "source": [ 573 | "df_stats = pd.DataFrame(data=training_stats)\n", 574 | "df_stats" 575 | ] 576 | }, 577 | { 578 | "cell_type": "markdown", 579 | "metadata": { 580 | "editable": true, 581 | "slideshow": { 582 | "slide_type": "" 583 | }, 584 | "tags": [] 585 | }, 586 | "source": [ 587 | "## Save the Model" 588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": null, 593 | "metadata": {}, 594 | "outputs": [], 595 | "source": [ 596 | "import os" 597 | ] 598 | }, 599 | { 600 | "cell_type": "code", 601 | "execution_count": null, 602 | "metadata": {}, 603 | "outputs": [], 604 | "source": [ 605 | "output_dir = './model_save/'\n", 606 | "\n", 607 | "if not os.path.exists(output_dir):\n", 608 | " os.makedirs(output_dir)\n", 609 | "\n", 610 | "model_to_save = model.module if hasattr(model, 'module') else 
model\n", 611 | "model_to_save.save_pretrained(output_dir)\n", 612 | "tokenizer.save_pretrained(output_dir)" 613 | ] 614 | }, 615 | { 616 | "cell_type": "code", 617 | "execution_count": 51, 618 | "metadata": {}, 619 | "outputs": [ 620 | { 621 | "name": "stdout", 622 | "output_type": "stream", 623 | "text": [ 624 | "0: the King of England, and he was as good a queen, as if she had\n", 625 | "\n", 626 | "\n", 627 | "1: the King of England, with a face of a royal-fancy one, and a thick,\n", 628 | "\n", 629 | "\n", 630 | "2: the King of England.”\n", 631 | "\n", 632 | "\n" 633 | ] 634 | } 635 | ], 636 | "source": [ 637 | "model.eval()\n", 638 | "\n", 639 | "prompt = \"the King of England\"\n", 640 | "\n", 641 | "generated = torch.tensor(tokenizer.encode(BOS_TOKEN+prompt)).unsqueeze(0)\n", 642 | "generated = generated.to(DEVICE)\n", 643 | "\n", 644 | "sample_outputs = model.generate(\n", 645 | " generated,\n", 646 | " do_sample=True,\n", 647 | " top_k=50,\n", 648 | " max_length = len(generated) + 50,\n", 649 | " top_p=0.92,\n", 650 | " num_return_sequences=3,\n", 651 | " pad_token_id=tokenizer.eos_token_id,\n", 652 | " temperature=0.8,\n", 653 | " )\n", 654 | "\n", 655 | "for i, sample_output in enumerate(sample_outputs):\n", 656 | " print(\"{}: {}\\n\\n\".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))" 657 | ] 658 | }, 659 | { 660 | "cell_type": "code", 661 | "execution_count": 48, 662 | "metadata": {}, 663 | "outputs": [], 664 | "source": [ 665 | "# compare output to foundation model\n", 666 | "pre_trainedtokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)\n", 667 | "pre_trainedmodel = GPT2LMHeadModel.from_pretrained(MODEL_NAME)" 668 | ] 669 | }, 670 | { 671 | "cell_type": "code", 672 | "execution_count": 52, 673 | "metadata": {}, 674 | "outputs": [], 675 | "source": [ 676 | "input_ids = pre_trainedtokenizer.encode(prompt, return_tensors=\"pt\")\n", 677 | "\n", 678 | "# Generate text\n", 679 | "output = pre_trainedmodel.generate(\n", 680 | " input_ids,\n", 681 | " bos_token_id=random.randint(1,30000),\n", 682 | " max_length=len(input_ids[0]) + 50,\n", 683 | " num_return_sequences=1,\n", 684 | " pad_token_id=tokenizer.eos_token_id,\n", 685 | " do_sample=True,\n", 686 | " top_p=0.92, # Adjust the sampling parameters as needed\n", 687 | " temperature=0.8,\n", 688 | ")" 689 | ] 690 | }, 691 | { 692 | "cell_type": "code", 693 | "execution_count": 53, 694 | "metadata": {}, 695 | "outputs": [ 696 | { 697 | "data": { 698 | "text/plain": [ 699 | "'the King of England was a good fellow, for I was the better'" 700 | ] 701 | }, 702 | "execution_count": 53, 703 | "metadata": {}, 704 | "output_type": "execute_result" 705 | } 706 | ], 707 | "source": [ 708 | "pre_trainedtokenizer.decode(output[0], skip_special_tokens=True)" 709 | ] 710 | }, 711 | { 712 | "cell_type": "markdown", 713 | "metadata": {}, 714 | "source": [ 715 | "## Decoding Strategies\n", 716 | "\n", 717 | "The ``generate()`` utility we used above used every output prediction as input for the next time step. This method of using the highest probability prediction as output is called __Greedy Decoding__. Greeding decoding is fast and simple but is marred with issues we saw in samples we just generated.\n", 718 | "\n", 719 | "Focusing on only highest probability output narrows our model's focus to just the next step which inturn may result in inconsistent or non-dictionary terms/words.\n", 720 | "\n", 721 | "### Beam Search\n", 722 | "Beam search is the obvious next step to improve the output predictions from the model. 
Instead of being greedy, beam search keeps track of n paths at any given time and selects the path with overall higher probability.\n", 723 | "\n", 724 | "\n", 725 | "\n", 726 | "### Other Key Decoding Strategies:\n", 727 | "- Sampling\n", 728 | "- Top-k Sampling\n", 729 | "- Nucleus Sampling\n", 730 | "\n", 731 | "### Temperature\n", 732 | "Though sampling helps bring in required amount of randomness, it is not free from issues. Random sampling leads to gibberish and incoherence at times. To control the amount of randomness, we introduce __temperature__. This parameter helps increase the likelihood of high probability terms reduce the likelihood of low probability ones. This leads to sharper distributions. \n", 733 | "\n", 734 | "> High temperature leads to more randomness while lower temperature brings in predictability.\n" 735 | ] 736 | }, 737 | { 738 | "cell_type": "code", 739 | "execution_count": null, 740 | "metadata": {}, 741 | "outputs": [], 742 | "source": [ 743 | "prompt = \"the King of England\"\n", 744 | "\n", 745 | "generated = tokenizer.encode(BOS_TOKEN+prompt,return_tensors='pt')\n", 746 | "generated = generated.to(DEVICE)\n", 747 | "\n", 748 | "beam_output = model.generate(\n", 749 | " **generated,\n", 750 | " max_new_tokens=40,\n", 751 | " num_beams=5,\n", 752 | " num_return_sequences=5,\n", 753 | " early_stopping=True\n", 754 | ")" 755 | ] 756 | }, 757 | { 758 | "cell_type": "markdown", 759 | "metadata": {}, 760 | "source": [ 761 | "## Limitations and What Next?\n", 762 | "- Long Range Context\n", 763 | "- Scalability\n", 764 | "- Instruction led generation\n", 765 | "- Benchmarking\n", 766 | "- Halucination / Dreaming\n" 767 | ] 768 | }, 769 | { 770 | "cell_type": "markdown", 771 | "metadata": {}, 772 | "source": [] 773 | } 774 | ], 775 | "metadata": { 776 | "accelerator": "GPU", 777 | "colab": { 778 | "gpuType": "T4", 779 | "provenance": [] 780 | }, 781 | "kernelspec": { 782 | "display_name": "Python 3 (ipykernel)", 783 | "language": "python", 784 | "name": "python3" 785 | }, 786 | "language_info": { 787 | "codemirror_mode": { 788 | "name": "ipython", 789 | "version": 3 790 | }, 791 | "file_extension": ".py", 792 | "mimetype": "text/x-python", 793 | "name": "python", 794 | "nbconvert_exporter": "python", 795 | "pygments_lexer": "ipython3", 796 | "version": "3.11.9" 797 | } 798 | }, 799 | "nbformat": 4, 800 | "nbformat_minor": 4 801 | } 802 | -------------------------------------------------------------------------------- /module_03/assets/chinchilla.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_03/assets/chinchilla.png -------------------------------------------------------------------------------- /module_03/assets/cost_tweet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_03/assets/cost_tweet.png -------------------------------------------------------------------------------- /module_03/assets/instruct_gpt_rlhf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_03/assets/instruct_gpt_rlhf.png -------------------------------------------------------------------------------- /module_03/assets/lora_1.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_03/assets/lora_1.png -------------------------------------------------------------------------------- /module_03/assets/quantization.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_03/assets/quantization.png -------------------------------------------------------------------------------- /module_03/assets/scaling_laws.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_03/assets/scaling_laws.png -------------------------------------------------------------------------------- /module_03/assets/soft_prompting_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_03/assets/soft_prompting_1.png -------------------------------------------------------------------------------- /module_03/assets/soft_prompting_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_03/assets/soft_prompting_2.png -------------------------------------------------------------------------------- /module_03/assets/soft_prompting_perf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_03/assets/soft_prompting_perf.png -------------------------------------------------------------------------------- /module_03/solutions/utils.py: -------------------------------------------------------------------------------- 1 | # source: https://stackoverflow.com/a/31631711 2 | def humanbytes(B): 3 | """Return the given bytes as a human friendly KB, MB, GB, or TB string.""" 4 | UNIT = 1000 5 | B = float(B) 6 | KB = float(UNIT) 7 | MB = float(KB ** 2) # 1,048,576 8 | GB = float(KB ** 3) # 1,073,741,824 9 | TB = float(KB ** 4) # 1,099,511,627,776 10 | 11 | if B < KB: 12 | return '{0} {1}'.format(B,'Bytes' if 0 == B > 1 else 'Byte') 13 | elif KB <= B < MB: 14 | return '{0:.2f} KB'.format(B / KB) 15 | elif MB <= B < GB: 16 | return '{0:.2f} MB'.format(B / MB) 17 | elif GB <= B < TB: 18 | return '{0:.2f} GB'.format(B / GB) 19 | elif TB <= B: 20 | return '{0:.2f} TB'.format(B / TB) 21 | 22 | 23 | def memory_fit(req_memory,cpu_ram, gpu_ram): 24 | if req_memory<=cpu_ram or req_memory<=gpu_ram: 25 | return "Yes, fits either CPU or GPU" 26 | elif req_memory<= cpu_ram + gpu_ram: 27 | return "Yes, but fit needs both CPU and GPU" 28 | else: 29 | return "Nope, does not fit available memory" -------------------------------------------------------------------------------- /module_03/utils.py: -------------------------------------------------------------------------------- 1 | # source: https://stackoverflow.com/a/31631711 2 | def humanbytes(B): 3 | """Return the given bytes as a human friendly KB, MB, GB, or TB string.""" 4 | UNIT = 1000 5 | B = float(B) 6 | KB = float(UNIT) 7 | MB = float(KB ** 2) # 1,048,576 8 | GB = float(KB ** 3) # 1,073,741,824 9 | TB = float(KB ** 4) # 
1,099,511,627,776 10 | 11 | if B < KB: 12 | return '{0} {1}'.format(B,'Bytes' if 0 == B > 1 else 'Byte') 13 | elif KB <= B < MB: 14 | return '{0:.2f} KB'.format(B / KB) 15 | elif MB <= B < GB: 16 | return '{0:.2f} MB'.format(B / MB) 17 | elif GB <= B < TB: 18 | return '{0:.2f} GB'.format(B / GB) 19 | elif TB <= B: 20 | return '{0:.2f} TB'.format(B / TB) 21 | 22 | 23 | def memory_fit(req_memory,cpu_ram, gpu_ram): 24 | if req_memory<=cpu_ram or req_memory<=gpu_ram: 25 | return "Yes, fits either CPU or GPU" 26 | elif req_memory<= cpu_ram + gpu_ram: 27 | return "Yes, but fit needs both CPU and GPU" 28 | else: 29 | return "Nope, does not fit available memory" -------------------------------------------------------------------------------- /module_04/02_vector_databases_hf_inference_endpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "R9cjyKcNzU8i" 7 | }, 8 | "source": [ 9 | "## Vector Databases\n", 10 | "\n", 11 | "\n", 12 | "\n", 13 | "We started this workshop with **text representation** as one of the key components of any NLP system.\n", 14 | "As we progressed from simple Bag of Words setup to highly contextualised Transformer models, we now have rich & dense representations.\n", 15 | "The utility of such representations also increased multifold from word/sentence representations to features that can used for a number of downstream tasks.\n", 16 | "\n", 17 | "These representations, also called as vectors or embedding vectors are long series of numbers. Their retrieval and persistence requires specialised database management systems called **Vector Databases**.\n", 18 | "\n", 19 | "Vector Databases are particularly suited for handling data in the form of vectors, embeddings, or feature representations, which are commonly used in various applications like machine learning, natural language processing, computer vision, and recommendation systems.\n", 20 | "\n", 21 | "Key Features:\n", 22 | "- High-dimensional Data Support\n", 23 | "- Similarity Search\n", 24 | "- Indexing Techniques\n", 25 | "- Dimensionality Reduction\n", 26 | "\n", 27 | "There are a number of different off-the-shelf options available, such as:\n", 28 | "- [ChromaDB](https://www.trychroma.com/)\n", 29 | "- [PineCone](https://www.pinecone.io/)\n", 30 | "- [Milvus](https://milvus.io/)\n", 31 | "- [Weaviate](https://weaviate.io/)\n", 32 | "- [AeroSpike](https://aerospike.com/)\n", 33 | "- [OpenSearch](https://opensearch.org/)\n" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": { 39 | "id": "B_luN80B2Am7" 40 | }, 41 | "source": [ 42 | "## Let us Begin with Installation" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "\n", 50 | " \"Open\n", 51 | "" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 1, 57 | "metadata": { 58 | "id": "z_dNazilzRUF" 59 | }, 60 | "outputs": [], 61 | "source": [ 62 | "%%capture\n", 63 | "# install dependencies\n", 64 | "# !pip install -q chromadb\n", 65 | "# !pip install retry\n", 66 | "#!pip install -U sentence-transformers" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": { 72 | "id": "q960w1bz2Am9" 73 | }, 74 | "source": [ 75 | "## Imports" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 1, 81 | "metadata": { 82 | "id": "GV6LIKdMBy2r", 83 | "scrolled": true 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "import json\n", 88 | "import 
requests\n", 89 | "import pandas as pd\n", 90 | "from retry import retry\n", 91 | "\n", 92 | "import chromadb\n", 93 | "from chromadb.api.types import Documents, Embeddings" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": { 99 | "id": "dD7AixSz2Am_" 100 | }, 101 | "source": [ 102 | "## HuggingFace Inference EndPoints 🤗\n", 103 | "\n", 104 | "Another key offering from HuggingFace is *[Inference Endpoints](https://huggingface.co/inference-endpoints)*.\n", 105 | "These endpoints provide access to hundreds of large models hosted on HuggingFace infra for easy use.\n", 106 | "\n", 107 | "All you need is a quick [sign-up](https://huggingface.co/login) and an API Key and bingo!\n" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": { 113 | "id": "dOT-wEBZ2AnA" 114 | }, 115 | "source": [ 116 | "## Sentence Transformers\n", 117 | "\n", 118 | "This is an amazing python framework initially proposed along with the seminal paper titled [Sentence-BERT](https://www.sbert.net/).\n", 119 | "It provides clean high-level interfaces to easily use Language Models for computing text embeddings for various use-cases.\n", 120 | "\n", 121 | "In this notebook we will leverage pretrained models supported by sentence transformer rather than directly using the package.\n", 122 | "\n", 123 | "There is a [leaderboard](https://huggingface.co/spaces/mteb/leaderboard) now maintained to keep track of the state-of-the-art embedding models called the **Massive Text Embedding Benchmark (MTEB) Leaderboard**\n", 124 | "\n", 125 | "\n", 126 | "\n", 127 | "> Source : [HuggingFace](https://huggingface.co/spaces/mteb/leaderboard)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": { 133 | "id": "10G0QGLn2AnA" 134 | }, 135 | "source": [ 136 | "## MPNET Model\n", 137 | "\n", 138 | "- This model transforms sentences/paragraphs to a 768 dimensional vector space and is optimised for question-answering tasks.\n", 139 | "- The model card is available [here](https://huggingface.co/pinecone/mpnet-retriever-discourse)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 87, 145 | "metadata": { 146 | "id": "xr7GrCDXCHKM" 147 | }, 148 | "outputs": [], 149 | "source": [ 150 | "EMB_MODEL_ID = 'pinecone/mpnet-retriever-discourse'\n", 151 | "HF_TOKEN = ''\n", 152 | "EMB_API_URL = f\"https://api-inference.huggingface.co/pipeline/feature-extraction/{EMB_MODEL_ID}\"\n", 153 | "HEADERS = {\"Authorization\": f\"Bearer {HF_TOKEN}\"}" 154 | ] 155 | }, 156 | { 157 | "cell_type": "markdown", 158 | "metadata": { 159 | "id": "DHA9LV_E2AnB" 160 | }, 161 | "source": [ 162 | "## Embeddings using 🤗 Inference Endpoint\n", 163 | "- We setup a utility function that takes a list of sentences as input and generates embeddings as response\n", 164 | "- We use the ``retry`` package to allow for sufficient time and retries for the APIs to respond" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 11, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "import chromadb.utils.embedding_functions as embedding_functions\n", 174 | "huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(\n", 175 | " api_key=HF_TOKEN,\n", 176 | " model_name=EMB_MODEL_ID\n", 177 | ")" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 12, 183 | "metadata": { 184 | "id": "MqxEofYVCqqz" 185 | }, 186 | "outputs": [], 187 | "source": [ 188 | "sample_texts = [\n", 189 | " \"Another key offering from HuggingFace is Inference Endpoints. 
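The markdown above describes a retry-wrapped utility that calls the feature-extraction endpoint directly, while the cell shown uses ChromaDB's HuggingFaceEmbeddingFunction wrapper. For reference, a minimal sketch of the direct approach could look like the following; it reuses EMB_API_URL and HEADERS from the earlier cell, and the retry settings and the wait_for_model option are illustrative choices rather than the workshop's exact implementation.

import requests
from retry import retry

# Hedged sketch: call the HF feature-extraction endpoint directly.
# EMB_API_URL and HEADERS are assumed to be the values defined above;
# tries/delay and the wait_for_model option are illustrative.
@retry(tries=3, delay=10)
def query_embeddings(texts):
    response = requests.post(
        EMB_API_URL,
        headers=HEADERS,
        json={"inputs": texts, "options": {"wait_for_model": True}},
    )
    response.raise_for_status()  # raising here lets @retry re-attempt transient failures
    return response.json()       # one embedding (a list of floats) per input text

# Hypothetical usage: query_embeddings(["some sentence", "another sentence"])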
These endpoints provide access to hundreds of large models hosted on HuggingFace infra for easy use.\",\n", 190 | " \"This is an amazing python framework initially proposed along with the seminal paper titled Sentence-BERT. It provides clean high-level interfaces to easily use Language Models for computing text embeddings for various use-cases.\"\n", 191 | " ]" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 14, 197 | "metadata": { 198 | "scrolled": true 199 | }, 200 | "outputs": [ 201 | { 202 | "data": { 203 | "text/plain": [ 204 | "[-0.5249432325363159,\n", 205 | " -0.04365385323762894,\n", 206 | " 0.5124771595001221,\n", 207 | " 0.21908265352249146,\n", 208 | " 0.4560490548610687]" 209 | ] 210 | }, 211 | "execution_count": 14, 212 | "metadata": {}, 213 | "output_type": "execute_result" 214 | } 215 | ], 216 | "source": [ 217 | "# generate embeddings\n", 218 | "sample_emb = huggingface_ef(sample_texts[0])\n", 219 | "sample_emb[0][:5]" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": 15, 225 | "metadata": { 226 | "colab": { 227 | "base_uri": "https://localhost:8080/" 228 | }, 229 | "id": "WqCfZvSRS5zn", 230 | "outputId": "a56d0831-f7ea-4c2e-eb05-c9d0690c72da" 231 | }, 232 | "outputs": [ 233 | { 234 | "data": { 235 | "text/plain": [ 236 | "768" 237 | ] 238 | }, 239 | "execution_count": 15, 240 | "metadata": {}, 241 | "output_type": "execute_result" 242 | } 243 | ], 244 | "source": [ 245 | "# check embedding length\n", 246 | "len(sample_emb[0])" 247 | ] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": { 252 | "id": "nph6RzJbDsEx" 253 | }, 254 | "source": [ 255 | "## Vector Database: ChromaDB\n", 256 | "\n", 257 | "As mentioned above, there are a number of offering available. For this workshop we will make use of\n", 258 | "[ChromaDB](https://www.trychroma.com/).\n", 259 | "\n", 260 | "It is a super simple setup which is easy to use. 
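To make the similarity search idea concrete before diving into ChromaDB, here is a small illustration (not part of the workshop notebook) of what a vector database automates under the hood: scoring a query embedding against the stored document embeddings. It assumes the huggingface_ef and sample_texts objects defined above and that the embedding function returns one vector per input text; the query string is made up for the example.

import numpy as np

def cosine_similarity(a, b):
    a, b = np.asarray(a, dtype=float), np.asarray(b, dtype=float)
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))

# embed the documents and an (assumed) query, then rank by similarity
doc_embs = huggingface_ef(sample_texts)
query_emb = huggingface_ef(["What are HuggingFace Inference Endpoints?"])[0]

for text, emb in zip(sample_texts, doc_embs):
    print(f"{cosine_similarity(query_emb, emb):.3f}  {text[:60]}...")

A vector database adds indexing (for example HNSW) on top of this comparison so that retrieval scales to very large document collections.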
The following figure showcases the overall flow\n", 261 | "\n", 262 | "\n", 263 | "\n", 264 | "> Source :[chromadb](https://docs.trychroma.com/)" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": { 270 | "id": "ubFv6W-C2AnC" 271 | }, 272 | "source": [ 273 | "### Create an Instance of the Database Client" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 8, 279 | "metadata": { 280 | "id": "GVL3ByK9ZG76" 281 | }, 282 | "outputs": [], 283 | "source": [ 284 | "# in memory\n", 285 | "chroma_client = chromadb.Client()\n", 286 | "# save to disk: client = chromadb.PersistentClient(path=\"/path/to/data\")" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 9, 292 | "metadata": { 293 | "id": "vJV9I0dJDB8G" 294 | }, 295 | "outputs": [], 296 | "source": [ 297 | "def create_db_and_load_data(chroma_client,collection_name, embedding_func, documents):\n", 298 | " db = chroma_client.create_collection(name=collection_name,\n", 299 | " embedding_function=embedding_func)\n", 300 | " for i,d in enumerate(documents):\n", 301 | " db.add(\n", 302 | " documents=d,\n", 303 | " ids=str(i)\n", 304 | " )\n", 305 | " return db" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": { 311 | "id": "cFAsZHnYEi8g" 312 | }, 313 | "source": [ 314 | "## Insert Data\n", 315 | "\n", 316 | "Now that we have a utility to interact with the vector database, let us add some data to it and check how it goes" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": 18, 322 | "metadata": { 323 | "id": "-h5HecZrEKTa" 324 | }, 325 | "outputs": [], 326 | "source": [ 327 | "db = create_db_and_load_data(chroma_client=chroma_client,\n", 328 | " collection_name=\"llm_workshop\",\n", 329 | " embedding_func=huggingface_ef,\n", 330 | " documents=sample_texts)" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": 55, 336 | "metadata": {}, 337 | "outputs": [ 338 | { 339 | "data": { 340 | "text/plain": [ 341 | "dict_keys(['ids', 'embeddings', 'metadatas', 'documents', 'uris', 'data', 'included'])" 342 | ] 343 | }, 344 | "execution_count": 55, 345 | "metadata": {}, 346 | "output_type": "execute_result" 347 | } 348 | ], 349 | "source": [ 350 | "db.peek().keys()" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 60, 356 | "metadata": { 357 | "colab": { 358 | "base_uri": "https://localhost:8080/", 359 | "height": 112 360 | }, 361 | "id": "Sm_zGs8HEc2F", 362 | "outputId": "169b82d0-c268-4e03-b40d-ad8cda697f43" 363 | }, 364 | "outputs": [ 365 | { 366 | "data": { 367 | "text/html": [ 368 | "
\n", 369 | "\n", 382 | "\n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | "
idsembeddingsdocuments
00[-0.5249432325363159, -0.04365385323762894, 0....Another key offering from HuggingFace is Infer...
11[-0.5217247605323792, 0.5370820760726929, -0.2...This is an amazing python framework initially ...
\n", 406 | "
" 407 | ], 408 | "text/plain": [ 409 | " ids embeddings \\\n", 410 | "0 0 [-0.5249432325363159, -0.04365385323762894, 0.... \n", 411 | "1 1 [-0.5217247605323792, 0.5370820760726929, -0.2... \n", 412 | "\n", 413 | " documents \n", 414 | "0 Another key offering from HuggingFace is Infer... \n", 415 | "1 This is an amazing python framework initially ... " 416 | ] 417 | }, 418 | "execution_count": 60, 419 | "metadata": {}, 420 | "output_type": "execute_result" 421 | } 422 | ], 423 | "source": [ 424 | "results = db.peek()\n", 425 | "pd.DataFrame.from_dict({k:v for k,v in results.items() if k in['ids','documents','embeddings']})" 426 | ] 427 | }, 428 | { 429 | "cell_type": "markdown", 430 | "metadata": { 431 | "id": "uGgZzOzYElr5" 432 | }, 433 | "source": [ 434 | "## Retrieve Documents" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": 61, 440 | "metadata": { 441 | "id": "N8CeEIX3aXiE" 442 | }, 443 | "outputs": [], 444 | "source": [ 445 | "question = \"HuggingFace Key Offering\"" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": 63, 451 | "metadata": { 452 | "colab": { 453 | "base_uri": "https://localhost:8080/" 454 | }, 455 | "id": "oGe1IwAzX98K", 456 | "outputId": "2d19d4fb-2a7e-4e86-caf2-4f71123dca4d" 457 | }, 458 | "outputs": [ 459 | { 460 | "data": { 461 | "text/plain": [ 462 | "{'ids': [['0']],\n", 463 | " 'distances': [[169.98219299316406]],\n", 464 | " 'metadatas': [[None]],\n", 465 | " 'embeddings': None,\n", 466 | " 'documents': [['Another key offering from HuggingFace is Inference Endpoints. These endpoints provide access to hundreds of large models hosted on HuggingFace infra for easy use.']],\n", 467 | " 'uris': None,\n", 468 | " 'data': None,\n", 469 | " 'included': ['metadatas', 'documents', 'distances']}" 470 | ] 471 | }, 472 | "execution_count": 63, 473 | "metadata": {}, 474 | "output_type": "execute_result" 475 | } 476 | ], 477 | "source": [ 478 | "db.query(query_texts=[question], n_results=1)" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": 64, 484 | "metadata": { 485 | "id": "qFRSZWHcEfEd" 486 | }, 487 | "outputs": [], 488 | "source": [ 489 | "def get_relevant_documents(query, db):\n", 490 | " relevant_doc = db.query(query_texts=[query], n_results=1)['documents'][0][0]\n", 491 | " return relevant_doc" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": 65, 497 | "metadata": { 498 | "colab": { 499 | "base_uri": "https://localhost:8080/", 500 | "height": 35 501 | }, 502 | "id": "GfucsODPEusp", 503 | "outputId": "9d0f89c1-0b1d-4337-9719-e9b390f0e8cd" 504 | }, 505 | "outputs": [ 506 | { 507 | "data": { 508 | "text/plain": [ 509 | "'Another key offering from HuggingFace is Inference Endpoints. 
These endpoints provide access to hundreds of large models hosted on HuggingFace infra for easy use.'" 510 | ] 511 | }, 512 | "execution_count": 65, 513 | "metadata": {}, 514 | "output_type": "execute_result" 515 | } 516 | ], 517 | "source": [ 518 | "# search using embeddings\n", 519 | "get_relevant_documents(question, db)" 520 | ] 521 | }, 522 | { 523 | "cell_type": "markdown", 524 | "metadata": { 525 | "id": "3_9LRJZgMDs1" 526 | }, 527 | "source": [ 528 | "## HuggingFace Powered Question Answering Setup\n", 529 | "\n", 530 | "Similar to Embedding Endpoints, HF also provides us with capabilities to directly leverage models for tasks such as:\n", 531 | "- Text Generation\n", 532 | "- Question Answering, etc.\n", 533 | "\n", 534 | "We can leverage local setups like GPT4ALL with LangChain, OpenAI APIs or even HuggingFace transformers as well. For this exercise, we will focus on leveraging **HuggingFace Endpoints** for **QA tasks** itself.\n", 535 | "\n", 536 | "We will make use of [Roberta-Base-Squad2](https://huggingface.co/deepset/roberta-base-squad2) model." 537 | ] 538 | }, 539 | { 540 | "cell_type": "code", 541 | "execution_count": 90, 542 | "metadata": { 543 | "id": "KCFidBCIE0i_" 544 | }, 545 | "outputs": [ 546 | { 547 | "data": { 548 | "text/plain": [ 549 | "{'Authorization': 'Bearer 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'}" 550 | ] 551 | }, 552 | "execution_count": 90, 553 | "metadata": {}, 554 | "output_type": "execute_result" 555 | } 556 | ], 557 | "source": [ 558 | "QA_MODEL_ID = 'deepset/roberta-base-squad2'\n", 559 | "QA_API_URL = f\"https://api-inference.huggingface.co/models/{QA_MODEL_ID}\"\n", 560 | "HEADERS = {\"Authorization\": f\"Bearer {HF_TOKEN}\"}\n", 561 | "HEADERS" 562 | ] 563 | }, 564 | { 565 | "cell_type": "code", 566 | "execution_count": 93, 567 | "metadata": { 568 | "id": "3sOqsmAkMJVK" 569 | }, 570 | "outputs": [], 571 | "source": [ 572 | "def get_answer(question,context):\n", 573 | " payload = {\n", 574 | " \"question\": question,\n", 575 | " \"context\":context\n", 576 | " }\n", 577 | " data = json.dumps(payload)\n", 578 | " response = requests.request(\"POST\", QA_API_URL, headers=HEADERS, data=data)\n", 579 | " try:\n", 580 | " decoded_response = json.loads(response.content.decode(\"utf-8\"))\n", 581 | " return decoded_response#decoded_response['answer'], decoded_response['score'], \"\"\n", 582 | " except Exception as ex:\n", 583 | " return \"Apologies but I could not find any relevant answer\", 0.0, ex" 584 | ] 585 | }, 586 | { 587 | "cell_type": "code", 588 | "execution_count": 79, 589 | "metadata": { 590 | "colab": { 591 | "base_uri": "https://localhost:8080/" 592 | }, 593 | "id": "6IU-il-z2AnE", 594 | "outputId": "820acf52-12eb-48eb-88e9-9ab7a29f7670" 595 | }, 596 | "outputs": [ 597 | { 598 | "name": "stdout", 599 | "output_type": "stream", 600 | "text": [ 601 | "HuggingFace Key Offering\n" 602 | ] 603 | } 604 | ], 605 | "source": [ 606 | "print(question)" 607 | ] 608 | }, 609 | { 610 | "cell_type": "code", 611 | "execution_count": 80, 612 | "metadata": { 613 | "colab": { 614 | "base_uri": "https://localhost:8080/", 615 | "height": 35 616 | }, 617 | "id": "LiCiEjwvVNGI", 618 | "outputId": "fc0930d9-4d09-4aa4-9ec6-ea5c2c154f85" 619 | }, 620 | "outputs": [ 621 | { 622 | "data": { 623 | "text/plain": [ 624 | "'Another key offering from HuggingFace is Inference Endpoints. 
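Putting the two helpers together gives a tiny end-to-end question answering flow. This is a sketch rather than the notebook's code: it assumes get_relevant_documents and get_answer as defined above, and that a successful call to the QA endpoint returns a dict containing 'answer' and 'score' keys.

def ask(question, db):
    # retrieve the most relevant document and use it as context for the QA model
    context = get_relevant_documents(question, db)
    response = get_answer(question, context)
    if isinstance(response, dict):
        return response.get("answer"), response.get("score"), context
    return response  # error tuple from get_answer's exception branch

# Hypothetical usage: answer, score, source = ask("HuggingFace Key Offering", db)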
These endpoints provide access to hundreds of large models hosted on HuggingFace infra for easy use.'" 625 | ] 626 | }, 627 | "execution_count": 80, 628 | "metadata": {}, 629 | "output_type": "execute_result" 630 | } 631 | ], 632 | "source": [ 633 | "context = get_relevant_documents(question, db)\n", 634 | "context" 635 | ] 636 | }, 637 | { 638 | "cell_type": "code", 639 | "execution_count": 94, 640 | "metadata": { 641 | "colab": { 642 | "base_uri": "https://localhost:8080/" 643 | }, 644 | "id": "cKMvj0aCNs2U", 645 | "outputId": "312d878f-2de9-45e9-ed36-abfbd6f15e05" 646 | }, 647 | "outputs": [ 648 | { 649 | "data": { 650 | "text/plain": [ 651 | "('Inference Endpoints', 0.1849050521850586, '')" 652 | ] 653 | }, 654 | "execution_count": 94, 655 | "metadata": {}, 656 | "output_type": "execute_result" 657 | } 658 | ], 659 | "source": [ 660 | "get_answer(question,context)" 661 | ] 662 | }, 663 | { 664 | "cell_type": "code", 665 | "execution_count": null, 666 | "metadata": { 667 | "id": "MLUe4RmN2IMP" 668 | }, 669 | "outputs": [], 670 | "source": [] 671 | } 672 | ], 673 | "metadata": { 674 | "colab": { 675 | "provenance": [] 676 | }, 677 | "kernelspec": { 678 | "display_name": "Python 3 (ipykernel)", 679 | "language": "python", 680 | "name": "python3" 681 | }, 682 | "language_info": { 683 | "codemirror_mode": { 684 | "name": "ipython", 685 | "version": 3 686 | }, 687 | "file_extension": ".py", 688 | "mimetype": "text/x-python", 689 | "name": "python", 690 | "nbconvert_exporter": "python", 691 | "pygments_lexer": "ipython3", 692 | "version": "3.11.9" 693 | } 694 | }, 695 | "nbformat": 4, 696 | "nbformat_minor": 4 697 | } 698 | -------------------------------------------------------------------------------- /module_04/03_OpenSource_ClosedSource_LLMs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "1ae9124c-7015-488d-9791-92b9731386a9", 6 | "metadata": {}, 7 | "source": [ 8 | "# Open Source Vs Close Sourced LLMs\n", 9 | "\n", 10 | "Similar to any other piece of technology, LLMs are available in all flavours and license types. While some of the most popular offerings are closed source (OpenAI and Claud), OpenSource also has a number of options (LLaMA, Mistral and more!). \n", 11 | "\n", 12 | "# But **What does OpenSource or Closed Source mean for Language Models?**\n", 13 | "> Or you might ask, don't we all know about the transformer architecture and what powers these LLMs?\n", 14 | "\n", 15 | "The answer to such question lies in the secret sauce to setup and train such models. 
Similar to any other ML/AI/DL model, we have to go through the following steps for LLMs as well:\n", 16 | "\n", 17 | "- 📚 Collect Loads of Data (virtually whole of internet and **more!!!**)\n", 18 | "- 🧹 Clean-up the Dataset\n", 19 | "- 🤖 A very Large **Transformer**-like architecture\n", 20 | "- 💸 A **very Large GPU cluster** to train the model (probably **multiple** times) \n", 21 | "- 🧑‍💻 A whole suite of Human Evaluators (and labellers)\n", 22 | "- 💹 A proper benchmarking and evaluation setup\n", 23 | "\n", 24 | "Those are not just 6 steps, those are very hard 6 steps!\n", 25 | "\n", 26 | "\n", 27 | "\n", 28 | "> Source: [Tweet by Danial Han](https://x.com/danielhanchen/status/1765446273661075609) / @danielhanchen\n", 29 | "\n", 30 | "## Ok but still, Closed vs Open?\n", 31 | "\n", 32 | "- The secret sauce is to bring in optimizations at each and every step of this process (yes, all 6 are active areas of research!)\n", 33 | "- Open Sourced Models could be released with a focus the code, weights, datasets or evaluation details or even all of them. This presents a nice distinction where some models are open-weight models while others are completely open-sourced.\n", 34 | " - Open Weights: Mistral Nemo, Google/Gemma\n", 35 | " - Fully Open Sourced: LLaMA x, Alpaca, Stanford Alpaca (based on LLaMA)\n", 36 | "- Closed Source or Closed Weight models currently have superior performance and make use of proprietary improvements and datasets to achieve the same. " 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "id": "410adc18-b5c4-450e-8c7e-04a949ae4bc5", 42 | "metadata": {}, 43 | "source": [ 44 | "## Are Open Source Models Any Good?\n", 45 | "The closed weights/closed source models still lead the pack but the open source community is catching up. Catching up fast and square onto performance on all front. Let us explore one such easy to use setup, the **🦙 🦙LLaMA 🦙 🦙**\n", 46 | "### Let's Setup Our Own Lamma 🦙" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 23, 52 | "id": "6e204afc-020b-4f4a-9dba-810174da6d98", 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "data": { 57 | "text/html": [ 58 | "\n", 59 | "

Download the Latest From ollama.com

\n", 60 | "
\n", 61 | " \n", 62 | " \n", 63 | "
\n", 64 | "
\n", 65 | "

Pull the Latest LLaMA

\n", 66 | "
\n", 67 | " \n", 68 | "

Off We Go!

\n", 69 | " \n", 70 | "
\n", 71 | " " 72 | ], 73 | "text/plain": [ 74 | "" 75 | ] 76 | }, 77 | "execution_count": 23, 78 | "metadata": {}, 79 | "output_type": "execute_result" 80 | } 81 | ], 82 | "source": [ 83 | "from IPython.display import Image, HTML, display\n", 84 | "\n", 85 | "image_path1 = \"./assets/llama_setup_1.png\"\n", 86 | "image_path2 = \"./assets/llama_setup_2.png\"\n", 87 | "image_path3 = \"./assets/llama_setup_3.png\"\n", 88 | "image_path4 = \"./assets/llama_setup_4.png\"\n", 89 | "\n", 90 | "\n", 91 | "HTML(f\"\"\"\n", 92 | "

Download the Latest From ollama.com

\n", 93 | "
\n", 94 | " \n", 95 | " \n", 96 | "
\n", 97 | "
\n", 98 | "

Pull the Latest LLaMA

\n", 99 | "
\n", 100 | " \n", 101 | "

Off We Go!

\n", 102 | " \n", 103 | "
\n", 104 | " \"\"\")" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "id": "fbd9c507-d508-4186-82b8-fe2840336630", 110 | "metadata": {}, 111 | "source": [ 112 | "## 🖥️ Look Ma, Python Bindings!\n", 113 | "\n", 114 | "- Ensure your setup steps were completed successfully, else:\n", 115 | "- Install Ollama server using ``curl -fsSL https://ollama.com/install.sh | sh``\n", 116 | "- Change download directory : ``export OLLAMA_MODELS=/workspace``\n", 117 | "- Pull a specific model : ``ollama pull llama3.1:8b``\n", 118 | "- Start the server (assuming it is in the same environment/shell, else run export command again)\n", 119 | " - ``ollama serve``\n", 120 | " - ``ollama run llama3.1:8b``" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 29, 126 | "id": "52d27d56-831b-4296-8652-4ecedd1a3417", 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "import ollama\n", 131 | "import json" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 33, 137 | "id": "438d996e-0f0d-4a6f-b4a7-944ccea844cf", 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "# Go to your terminal and make sure the following is running: ollama run llama3.1:8b\n", 142 | "response = ollama.chat(model='llama3.1:8b', messages=[\n", 143 | " {\n", 144 | " 'role': 'user',\n", 145 | " 'content': \"What is a Llama?\",\n", 146 | " },\n", 147 | "])" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 34, 153 | "id": "424062f1-b4da-4278-9343-aaeb32cfaf12", 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "name": "stdout", 158 | "output_type": "stream", 159 | "text": [ 160 | "{\n", 161 | " \"model\": \"llama3.1\",\n", 162 | " \"created_at\": \"2024-08-03T19:09:26.075267Z\",\n", 163 | " \"message\": {\n", 164 | " \"role\": \"assistant\",\n", 165 | " \"content\": \"A llama (Lama glama) is a South American camelid, a mammal closely related to camels and alpacas. They are native to the Andean region of present-day Peru, Bolivia, Ecuador, and Chile.\\n\\nHere are some key characteristics of llamas:\\n\\n1. **Physical appearance**: Llamas have a distinctive appearance with a soft, woolly coat (which can be brown, black, white, or various shades in between), a long neck, and relatively small ears compared to their body size. Adult llamas typically grow to 5-6 feet (1.5-1.8 meters) tall at the shoulder.\\n2. **Habitat**: Llamas are adapted to high-altitude grasslands, rocky plateaus, and scrub forests in South America.\\n3. **Diet**: They are herbivores and feed on plants, such as grasses, leaves, and shrubs. Their digestive system is efficient at breaking down cellulose, which allows them to extract nutrients from plant material that would be difficult for other animals to digest.\\n4. **Social behavior**: Llamas are social creatures that live in herds. They have a hierarchical structure within these groups, with dominant individuals leading the way and younger or subordinate members following.\\n5. **Usefulness**: For centuries, llamas have been used by indigenous communities as pack animals (carrying goods over long distances), while their wool has been valued for its warmth and durability.\\n\\nIn recent years, llamas have also become popular in other parts of the world as pets, companions, or even therapy animals!\\n\\nHow's that? 
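The ollama Python bindings also support streaming, which prints tokens as they are generated instead of waiting for the full reply. A minimal sketch, assuming the local server set up above is running and the llama3.1:8b model has been pulled:

import ollama

# stream=True returns an iterator of partial responses
stream = ollama.chat(
    model='llama3.1:8b',
    messages=[{'role': 'user', 'content': 'What is a Llama?'}],
    stream=True,
)
for chunk in stream:
    print(chunk['message']['content'], end='', flush=True)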
Do you have any specific questions about llamas I can help with?\"\n", 166 | " },\n", 167 | " \"done_reason\": \"stop\",\n", 168 | " \"done\": true,\n", 169 | " \"total_duration\": 13250142666,\n", 170 | " \"load_duration\": 28561250,\n", 171 | " \"prompt_eval_count\": 16,\n", 172 | " \"prompt_eval_duration\": 230031000,\n", 173 | " \"eval_count\": 345,\n", 174 | " \"eval_duration\": 12990640000\n", 175 | "}\n" 176 | ] 177 | } 178 | ], 179 | "source": [ 180 | "print(json.dumps(response, indent = 4))" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "id": "20ee4a3f-e702-4470-b7ab-6496448f91dc", 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [] 190 | } 191 | ], 192 | "metadata": { 193 | "kernelspec": { 194 | "display_name": "Python 3 (ipykernel)", 195 | "language": "python", 196 | "name": "python3" 197 | }, 198 | "language_info": { 199 | "codemirror_mode": { 200 | "name": "ipython", 201 | "version": 3 202 | }, 203 | "file_extension": ".py", 204 | "mimetype": "text/x-python", 205 | "name": "python", 206 | "nbconvert_exporter": "python", 207 | "pygments_lexer": "ipython3", 208 | "version": "3.11.9" 209 | } 210 | }, 211 | "nbformat": 4, 212 | "nbformat_minor": 5 213 | } 214 | -------------------------------------------------------------------------------- /module_04/04_retrieval_augmented_llm_app.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "CQ7lJK0y9gJO" 7 | }, 8 | "source": [ 9 | "# Retrieval Augmented LLM App\n", 10 | "\n", 11 | "\n", 12 | "We have covered quite some ground in terms of understanding and building components for:\n", 13 | "- Text Representation\n", 14 | "- NLP Tasks\n", 15 | "- Pretrained Models and Transfer Learning\n", 16 | "- Model Fine-Tuning PEFT\n", 17 | "- SFT and LLM Landscape\n", 18 | "- Vector Databases\n", 19 | "- Libraries and Frameworks\n", 20 | "\n", 21 | "Now we will work through development of an app to showcase how we can leverage all the concepts into a fully functioning system\n", 22 | "\n", 23 | "__Note__: In order to keep things simple, we will leverage most high-level APIs available but the overall setup should be easily extensible" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": { 29 | "id": "kFRjKaov9gJR" 30 | }, 31 | "source": [ 32 | "## Why Retrieval Augmentation\n", 33 | "\n", 34 | "While theoretically LLMs are capable of having super long context windows, in real world settings this is a challenge because of:\n", 35 | "- Inability/Limitation to ensure LLM focusses on correct sub-sections of the context\n", 36 | "- High Memory requirements\n", 37 | "- High API Cost\n", 38 | "- High Latency , etc.\n", 39 | "\n", 40 | "\n", 41 | "In order to overcome such challenges, we leverage vector databases to act as intelligent retrieval systems (again powered by LLMs) to:\n", 42 | "- Provide focussed context\n", 43 | "- Reduce memory, cost and latency requirements\n", 44 | "- Unlock super-abilities to use upto-date information\n", 45 | "- Offload trivial tasks to expert systems" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": { 51 | "id": "bxi-yR1n9gJR" 52 | }, 53 | "source": [ 54 | "## Streamlit Enters the Arena\n", 55 | "\n", 56 | "[Streamlit](https://streamlit.io/) is an open-source Python library that makes it easy to create and share beautiful, custom web apps for machine learning and data science." 
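The app below imports several helpers (get_lines, load_data, create_db, get_relevant_documents, get_answer, sidebar) from the module's utils.py. Purely as a hypothetical illustration of the pattern, mirroring the create_db_and_load_data helper from the vector databases notebook, two of them could be sketched as follows; the workshop's actual implementations in module_04/utils.py may differ.

import chromadb

# Hypothetical sketches only; not the workshop's actual utils.py.
def create_db(collection_name="personal_gpt"):
    # in-memory Chroma instance using the default embedding function
    chroma_client = chromadb.Client()
    db = chroma_client.get_or_create_collection(name=collection_name)
    return chroma_client, db

def load_data(db, documents):
    # index one document per line of the uploaded file
    try:
        for i, doc in enumerate(documents):
            db.add(documents=doc, ids=str(i))
        return "Indexed successfully"
    except Exception as ex:
        return f"Indexing failed: {ex}"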
57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "\n", 64 | " \"Open\n", 65 | "" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": { 71 | "id": "fTFt0rOY9gJR" 72 | }, 73 | "source": [ 74 | "## Let us Begin with Installation" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 1, 80 | "metadata": { 81 | "id": "z_dNazilzRUF" 82 | }, 83 | "outputs": [], 84 | "source": [ 85 | "%%capture\n", 86 | "# install dependencies\n", 87 | "# !pip install -q chromadb\n", 88 | "# !pip install retry\n", 89 | "# !pip install -q streamlit \n", 90 | "# !npm install localtunnel # this is needed if you are working from colab" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 1, 96 | "metadata": { 97 | "colab": { 98 | "base_uri": "https://localhost:8080/" 99 | }, 100 | "id": "qjSlib36bJ70", 101 | "outputId": "9ccfe431-7738-459d-9d4f-b977c7fc7a86" 102 | }, 103 | "outputs": [ 104 | { 105 | "name": "stdout", 106 | "output_type": "stream", 107 | "text": [ 108 | "Overwriting app.py\n" 109 | ] 110 | } 111 | ], 112 | "source": [ 113 | "%%writefile app.py\n", 114 | "## import required components\n", 115 | "\n", 116 | "import pandas as pd\n", 117 | "from utils import (\n", 118 | " get_lines,\n", 119 | " load_data,\n", 120 | " get_relevant_documents,\n", 121 | " get_answer,\n", 122 | " create_db,\n", 123 | " sidebar,\n", 124 | ")\n", 125 | "import streamlit as st\n", 126 | "chroma_client, db = create_db()\n", 127 | "\n", 128 | "## Setup Page Header and Sidebar\n", 129 | "st.set_page_config(page_title=\"PersonalGPT\", page_icon=\"📖\", layout=\"wide\")\n", 130 | "lm_model = sidebar()\n", 131 | "st.header(f\"📖PersonalGPT\")\n", 132 | "st.markdown(f\">:zap: Responses Powered by **{lm_model}**\")\n", 133 | "\n", 134 | "if 'is_doc_uploaded' not in st.session_state:\n", 135 | " st.session_state['is_doc_uploaded'] = False\n", 136 | "\n", 137 | "\n", 138 | "## Add Uploader Component\n", 139 | "uploaded_file = st.file_uploader(\n", 140 | " \"Upload a txt file\",\n", 141 | " type=[\"txt\"],\n", 142 | " help=\"Text files with each sentence acting as a document\",\n", 143 | ")\n", 144 | "\n", 145 | "if not st.session_state['is_doc_uploaded']:\n", 146 | " ## Check if upload is complete\n", 147 | " if not uploaded_file:\n", 148 | " st.stop()\n", 149 | " \n", 150 | " ## Read uploaded file\n", 151 | " try:\n", 152 | " file_data = get_lines(uploaded_file)\n", 153 | " ## Verbose Status update\n", 154 | " st.markdown(f\"> Uploaded file has {len(file_data)} lines of text\")\n", 155 | " st.session_state['is_doc_uploaded'] = True\n", 156 | " except Exception as e:\n", 157 | " st.markdown(f\"Could not upload/read file={e}\")\n", 158 | " st.session_state['is_doc_uploaded'] = False\n", 159 | " \n", 160 | " ## Index Uploaded text file\n", 161 | " with st.spinner(\"Indexing document... 
This may take a while⏳\"):\n", 162 | " db_status_msg = load_data(db, documents=file_data)\n", 163 | " \n", 164 | " ## status update\n", 165 | " st.markdown(f\"> Database indexed {db.count()} documents\")\n", 166 | " if db.count() == 0:\n", 167 | " st.markdown(db_status_msg)\n", 168 | " st.session_state['is_doc_uploaded'] = False\n", 169 | "\n", 170 | "## Get User Input\n", 171 | "with st.form(key=\"qa_form\"):\n", 172 | " query = st.text_area(\"Enter Your Query:\",\n", 173 | " placeholder=\"Examples: \\nwhat is tf-idf?\\nwhich module covers RLHF\\nhow many moons does Jupiter have?\")\n", 174 | " submit = st.form_submit_button(\"Submit\")\n", 175 | "\n", 176 | "## Provide additional Options for citing source\n", 177 | "with st.expander(\"Advanced Options\"):\n", 178 | " show_source = st.checkbox(\"Show Source\")\n", 179 | "\n", 180 | "## Generate Output upon button click\n", 181 | "if submit:\n", 182 | " # Get relevant documents from DB\n", 183 | " context = get_relevant_documents(query, db)\n", 184 | "\n", 185 | " # get answer from LLM\n", 186 | " answer,score,error = get_answer(query,context,lm_model)\n", 187 | "\n", 188 | " # Showcase response on screen\n", 189 | " st.markdown(f\"**Answer:** _{answer}_\")\n", 190 | " st.markdown(f\"> **Relevance Score**:{score}\")\n", 191 | " st.markdown(\"---\")\n", 192 | "\n", 193 | " # Add more details if advanced option is chosen\n", 194 | " if show_source:\n", 195 | " st.markdown(\"**Source(s):**\")\n", 196 | " st.markdown(f\"- {context[:100]}...\", unsafe_allow_html=True)" 197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 3, 202 | "metadata": { 203 | "id": "mlGSHYN0bQSm" 204 | }, 205 | "outputs": [], 206 | "source": [ 207 | "# check the log file for localhost port\n", 208 | "# !streamlit run app.py &>logs.txt & " 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 30, 214 | "metadata": { 215 | "colab": { 216 | "base_uri": "https://localhost:8080/" 217 | }, 218 | "id": "b5dllFNabXhE", 219 | "outputId": "8dd09f2d-c36d-49f1-e25d-991b18d1574c" 220 | }, 221 | "outputs": [ 222 | { 223 | "name": "stdout", 224 | "output_type": "stream", 225 | "text": [ 226 | "\u001b[K\u001b[?25hnpx: installed 22 in 2.41s\n", 227 | "your url is: https://icy-heads-enjoy.loca.lt\n", 228 | "^C\n" 229 | ] 230 | } 231 | ], 232 | "source": [ 233 | "# ensure setup is complete and your have install nvm/node/npm and localtunnel\n", 234 | "!lt --port 8501" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [] 243 | } 244 | ], 245 | "metadata": { 246 | "colab": { 247 | "provenance": [] 248 | }, 249 | "kernelspec": { 250 | "display_name": "Python 3 (ipykernel)", 251 | "language": "python", 252 | "name": "python3" 253 | }, 254 | "language_info": { 255 | "codemirror_mode": { 256 | "name": "ipython", 257 | "version": 3 258 | }, 259 | "file_extension": ".py", 260 | "mimetype": "text/x-python", 261 | "name": "python", 262 | "nbconvert_exporter": "python", 263 | "pygments_lexer": "ipython3", 264 | "version": "3.11.9" 265 | } 266 | }, 267 | "nbformat": 4, 268 | "nbformat_minor": 4 269 | } 270 | -------------------------------------------------------------------------------- /module_04/06_supercharge_llm_apps.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "7c4a39ae-3cbf-4ee7-878c-79af5d013d56", 6 | "metadata": {}, 7 | "source": [ 8 | "# Supercharge 
LLM Apps with DSPy and Langfuse\n", 9 | "\n", 10 | "Prompt engineering, the art of crafting precise instructions for LLMs, can be a time-consuming and iterative process. Debugging and troubleshooting LLM behavior can also be complex, given the inherent \"black box\" nature of these models. Additionally, gaining insights into the performance and cost implications of LLM applications is crucial for optimization and scalability (key components for any production grade setup).\n", 11 | "\n", 12 | "## The LLM Ecosystem\n", 13 | "The ecosystem for LLMs is still in its nascent stages. To address some of these challenges, a number of innovative tools and frameworks are being developed. DSPy from Stanford University is one such unique take towards formalizing LLM-based app development. Langfuse on the other-hand has emerged as an offering to streamline and operationalize aspects of LLM app maintenance. To put it in brief: \n", 14 | "- **[DSPY](https://dspy-docs.vercel.app/)** provides a modular and composable framework for building LLM applications, abstracting away the complexities of prompt engineering and enabling developers to focus on the core logic of their applications.\n", 15 | "- **[Langfuse](https://langfuse.com/docs)** offers a comprehensive observability platform for LLM apps, providing deep insights into model performance, cost, and user interactions.\n", 16 | "\n", 17 | "By combining DSPy and Langfuse, developers can unlock the full potential of LLMs, building robust, scalable, and insightful applications that deliver exceptional user experiences." 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "id": "7a4d29a0-a37b-41fe-9c8e-b4cb858e5af4", 23 | "metadata": {}, 24 | "source": [ 25 | "### Langfuse Setup\n", 26 | "We will make use of self-hosting option for Langfuse. 
This is based on ``docker`` and ``docker compose``.\n", 27 | "Steps:\n", 28 | "- Clone the langfuse repository: ``git clone https://github.com/langfuse/langfuse.git``\n", 29 | "- From the langfuse repository: ``cd langfuse``\n", 30 | "- Start the docker containers: ``docker compose up``\n", 31 | "> The last step spins up a container for langfuse and another one for postgres, you may change settings using the ``.env`` or ``docker-compose.yml`` files" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "id": "144f3ae6-0f65-4c98-8d60-3bc2b15855b5", 37 | "metadata": {}, 38 | "source": [ 39 | "### Imports and Config" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "id": "1651dd12-e05b-4750-b02c-f64aca5d0741", 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "# !pip3 install dspy-ai=2.5.2\n", 50 | "# !pip3 install langfuse==2.51.2\n", 51 | "# pip3 install chromadb==0.5.5" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 18, 57 | "id": "38b72fd8-1510-4bc4-8c74-20165e6e9f8f", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "import os\n", 62 | "import sys\n", 63 | "import dspy\n", 64 | "from dsp.utils import deduplicate\n", 65 | "from dspy.retrieve.chromadb_rm import ChromadbRM\n", 66 | "from dsp.trackers.langfuse_tracker import LangfuseTracker\n", 67 | "\n", 68 | "import chromadb\n", 69 | "from chromadb.utils import embedding_functions\n", 70 | "\n", 71 | "from langfuse import Langfuse\n", 72 | "\n", 73 | "import random\n", 74 | "import itertools\n", 75 | "from scraper_utils import NB_Markdown_Scraper\n", 76 | "from IPython.display import display, Markdown" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 2, 82 | "id": "4657c235-76f0-4e36-a12a-37b4b1f01873", 83 | "metadata": {}, 84 | "outputs": [], 85 | "source": [ 86 | "config = {\n", 87 | " 'LANGFUSE_PUBLIC_KEY': 'XXXX',\n", 88 | " 'LANGFUSE_SECRET_KEY': 'XXXX',\n", 89 | " 'LANGFUSE_HOST': 'http://localhost:3000',\n", 90 | " 'OPENAI_API_KEY': 'XXXX',\n", 91 | " 'OPENAI_BASE_URL': '',\n", 92 | " 'OPENAI_PROVIDER': '',\n", 93 | " 'CHROMA_DB_PATH': './chromadb/',\n", 94 | " 'CHROMA_COLLECTION_NAME':\"supercharged_workshop_collection\",\n", 95 | " 'CHROMA_EMB_MODEL': 'all-MiniLM-L6-v2'\n", 96 | "}" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 3, 102 | "id": "4d991f5e-ed34-4c32-8ea9-8ee070f1a62b", 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "os.environ[\"LANGFUSE_PUBLIC_KEY\"] = config.get('LANGFUSE_PUBLIC_KEY')\n", 107 | "os.environ[\"LANGFUSE_SECRET_KEY\"] = config.get('LANGFUSE_SECRET_KEY')\n", 108 | "os.environ[\"LANGFUSE_HOST\"] = config.get('LANGFUSE_HOST')\n", 109 | "os.environ[\"OPENAI_API_KEY\"] = config.get('OPENAI_API_KEY')" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 4, 115 | "id": "3ff6ee91-2661-4697-8b73-a77f6747bc7e", 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "# setup Langfuse tracker\n", 120 | "langfuse_tracker = LangfuseTracker(session_id='supercharger001')" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 5, 126 | "id": "6cdcd002-9eae-4b9a-91be-e5204b0f8293", 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "# instantiate language-model for DSPY\n", 131 | "llm_model = dspy.OpenAI(\n", 132 | " api_key=config.get('OPENAI_API_KEY'),\n", 133 | " model='gpt-4o-mini'\n", 134 | ")" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "id": 
"10de99e6-b0c5-4c36-b4de-2ffdb45ee94f", 140 | "metadata": {}, 141 | "source": [ 142 | "## Prepare Dataset" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 11, 148 | "id": "8151ff1b-9660-4c1f-a42d-199ad6dd576c", 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "nb_scraper = NB_Markdown_Scraper([f'../module_0{i}' for i in range(1,5)])\n", 153 | "nb_scraper.scrape_markdowns()" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 12, 159 | "id": "ba7cdde6-8a0d-404e-a7ab-fb018fb7c1c1", 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "with open(\"./dspy_content.tsv\", \"w\") as record_file:\n", 164 | " for k,v in nb_scraper.notebook_md_dict.items():\n", 165 | " record_file.write(f\"{k}\\t{v}\\n\")" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 13, 171 | "id": "63769ba8-3eae-42ad-a248-a2675daf2a4f", 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "doc_ids = []\n", 176 | "ctr = 1\n", 177 | "for k,_ in nb_scraper.notebook_md_dict.items():\n", 178 | " doc_ids.append(f'{ctr}_{k}')\n", 179 | " ctr+= 1" 180 | ] 181 | }, 182 | { 183 | "cell_type": "markdown", 184 | "id": "67bd03a7-e17a-46c4-af1b-ccb10c078cc8", 185 | "metadata": {}, 186 | "source": [ 187 | "### Ingest Data into ChromaDB\n", 188 | "> ensure Chroma is running in our terminal\n", 189 | "> ``$>chroma run --path ./chromadb``" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 6, 195 | "id": "3baf1c5c-5bdc-4454-981e-2584433e4538", 196 | "metadata": {}, 197 | "outputs": [ 198 | { 199 | "name": "stderr", 200 | "output_type": "stream", 201 | "text": [ 202 | "/Users/raghav.bali/.pyenv/versions/3.11.9/envs/datahack/lib/python3.11/site-packages/transformers/tokenization_utils_base.py:1601: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted in transformers v4.45, and will be then set to `False` by default. 
For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n", 203 | " warnings.warn(\n" 204 | ] 205 | } 206 | ], 207 | "source": [ 208 | "chroma_emb_fn = embedding_functions.\\\n", 209 | " SentenceTransformerEmbeddingFunction(\n", 210 | " model_name=config.get(\n", 211 | " 'CHROMA_EMB_MODEL'\n", 212 | " )\n", 213 | " )\n", 214 | "client = chromadb.HttpClient()" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 11, 220 | "id": "bbdeb484-ae0d-401c-b8f0-f16d9c59b4bf", 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "# if collection exists\n", 225 | "collection = client.get_collection(config.get('CHROMA_COLLECTION_NAME'))" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 10, 231 | "id": "d5f12287-4d89-4b7a-b5e3-e04c8ba044ff", 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "collection = client.create_collection(\n", 236 | " config.get('CHROMA_COLLECTION_NAME'),\n", 237 | " embedding_function=chroma_emb_fn,\n", 238 | " metadata={\"hnsw:space\": \"cosine\"}\n", 239 | ")" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 19, 245 | "id": "93408574-7a22-45fc-82ba-d14feba27b91", 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "# Add to collection\n", 250 | "collection.add(\n", 251 | " documents=[v for _,v in nb_scraper.notebook_md_dict.items()], \n", 252 | " ids=doc_ids, # must be unique for each doc\n", 253 | ")" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "id": "728a44fe-1a49-402b-996c-3f5e518e5161", 259 | "metadata": {}, 260 | "source": [ 261 | "### Test Retrieval using ChromaDB Client" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 12, 267 | "id": "78abd266-b272-4fae-a1f4-6260e2903e91", 268 | "metadata": {}, 269 | "outputs": [ 270 | { 271 | "name": "stdout", 272 | "output_type": "stream", 273 | "text": [ 274 | "['6_module_03_03_RLHF_phi2', '10_module_04_06_supercharge_llm_apps', '2_module_01_02_getting_started']\n", 275 | "[0.6175035195275418, 0.7261012146561765, 0.8062081214907408]\n" 276 | ] 277 | } 278 | ], 279 | "source": [ 280 | "results = collection.query(\n", 281 | " query_texts=[\"RLHF\"], # Chroma will embed using the function we provided\n", 282 | " n_results=3 # how many results to return\n", 283 | ")\n", 284 | "print(results['ids'][0])\n", 285 | "print(results['distances'][0])\n", 286 | "#print([i[:100] for j in results['documents'] for i in j])" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "id": "cd096119-7012-4154-92fd-97ac8b94c7f6", 292 | "metadata": {}, 293 | "source": [ 294 | "### Setup ChromaDB as DSPy Retriever " 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 13, 300 | "id": "6cb9c5a1-1a56-49c6-a2f4-79dd4c3bc003", 301 | "metadata": {}, 302 | "outputs": [ 303 | { 304 | "data": { 305 | "text/markdown": [ 306 | "__Document__::# Quick Overview of RLFH\n", 307 | "\n", 308 | "The performance of Language Models until GPT-3 was kind of amazing as-is. ... 
\n" 309 | ], 310 | "text/plain": [ 311 | "" 312 | ] 313 | }, 314 | "metadata": {}, 315 | "output_type": "display_data" 316 | }, 317 | { 318 | "data": { 319 | "text/markdown": [ 320 | ">- __Document id__::6_module_03_03_RLHF_phi2 \n", 321 | ">- __Document score__::0.6174977412306334" 322 | ], 323 | "text/plain": [ 324 | "" 325 | ] 326 | }, 327 | "metadata": {}, 328 | "output_type": "display_data" 329 | }, 330 | { 331 | "data": { 332 | "text/markdown": [ 333 | "__Document__::... \n" 334 | ], 335 | "text/plain": [ 336 | "" 337 | ] 338 | }, 339 | "metadata": {}, 340 | "output_type": "display_data" 341 | }, 342 | { 343 | "data": { 344 | "text/markdown": [ 345 | ">- __Document id__::10_module_04_06_supercharge_llm_apps \n", 346 | ">- __Document score__::0.7260969660795557" 347 | ], 348 | "text/plain": [ 349 | "" 350 | ] 351 | }, 352 | "metadata": {}, 353 | "output_type": "display_data" 354 | }, 355 | { 356 | "data": { 357 | "text/markdown": [ 358 | "__Document__::# Getting Started : Text Representation\n", 359 | "\n", 360 | "\n", 361 | "\n", 362 | "The NLP domain ... \n" 363 | ], 364 | "text/plain": [ 365 | "" 366 | ] 367 | }, 368 | "metadata": {}, 369 | "output_type": "display_data" 370 | }, 371 | { 372 | "data": { 373 | "text/markdown": [ 374 | ">- __Document id__::2_module_01_02_getting_started \n", 375 | ">- __Document score__::0.8062083377747705" 376 | ], 377 | "text/plain": [ 378 | "" 379 | ] 380 | }, 381 | "metadata": {}, 382 | "output_type": "display_data" 383 | }, 384 | { 385 | "data": { 386 | "text/markdown": [ 387 | "__Document__::# Text Generation " 391 | ] 392 | }, 393 | "metadata": {}, 394 | "output_type": "display_data" 395 | }, 396 | { 397 | "data": { 398 | "text/markdown": [ 399 | ">- __Document id__::3_module_02_02_simple_text_generator \n", 400 | ">- __Document score__::0.8826038964887366" 401 | ], 402 | "text/plain": [ 403 | "" 404 | ] 405 | }, 406 | "metadata": {}, 407 | "output_type": "display_data" 408 | }, 409 | { 410 | "data": { 411 | "text/markdown": [ 412 | "__Document__::# DSPy: Beyond Prompting\n", 413 | "---\n", 414 | "" 418 | ] 419 | }, 420 | "metadata": {}, 421 | "output_type": "display_data" 422 | }, 423 | { 424 | "data": { 425 | "text/markdown": [ 426 | ">- __Document id__::12_module_04_05_dspy_demo \n", 427 | ">- __Document score__::0.9200280698248913" 428 | ], 429 | "text/plain": [ 430 | "" 431 | ] 432 | }, 433 | "metadata": {}, 434 | "output_type": "display_data" 435 | } 436 | ], 437 | "source": [ 438 | "retriever_model = ChromadbRM(\n", 439 | " config.get('CHROMA_COLLECTION_NAME'),\n", 440 | " config.get('CHROMA_DB_PATH'),\n", 441 | " embedding_function=chroma_emb_fn,\n", 442 | " client=client,\n", 443 | " k=5\n", 444 | ")\n", 445 | "\n", 446 | "# Test Retrieval\n", 447 | "results = retriever_model(\"RLHF\")\n", 448 | "for result in results:\n", 449 | " display(Markdown(f\"__Document__::{result.long_text[:100]}... 
\\n\"))\n", 450 | " display(Markdown(f\">- __Document id__::{result.id} \\n>- __Document score__::{result.score}\"))" 451 | ] 452 | }, 453 | { 454 | "cell_type": "markdown", 455 | "id": "f16c3643-bc0a-4f73-a32c-a846bd3e5882", 456 | "metadata": {}, 457 | "source": [ 458 | "## Prepare DSPy Program" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": 14, 464 | "id": "75724e2f-266b-4b10-9c93-c8186ede6de4", 465 | "metadata": {}, 466 | "outputs": [], 467 | "source": [ 468 | "# Set up the LM and RM\n", 469 | "dspy.settings.configure(lm=llm_model,rm=retriever_model)" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": 15, 475 | "id": "93e83804-a6d4-4a48-9924-294628eb3fa0", 476 | "metadata": {}, 477 | "outputs": [], 478 | "source": [ 479 | "class GenerateAnswer(dspy.Signature):\n", 480 | " \"\"\"Answer questions with short factoid answers.\"\"\"\n", 481 | "\n", 482 | " context = dspy.InputField(desc=\"may contain relevant facts\")\n", 483 | " question = dspy.InputField()\n", 484 | " answer = dspy.OutputField(desc=\"often less than 50 words\")" 485 | ] 486 | }, 487 | { 488 | "cell_type": "code", 489 | "execution_count": 16, 490 | "id": "01b93559-dc08-4b2c-a24b-7539c543ff26", 491 | "metadata": {}, 492 | "outputs": [], 493 | "source": [ 494 | "class RAG(dspy.Module):\n", 495 | " def __init__(self, num_passages=3):\n", 496 | " super().__init__()\n", 497 | "\n", 498 | " self.retrieve = dspy.Retrieve(k=num_passages)\n", 499 | " self.generate_answer = dspy.ChainOfThought(GenerateAnswer)\n", 500 | " \n", 501 | " def forward(self, question):\n", 502 | " context = self.retrieve(question).passages\n", 503 | " prediction = self.generate_answer(context=context, question=question)\n", 504 | " return dspy.Prediction(context=context, answer=prediction.answer)" 505 | ] 506 | }, 507 | { 508 | "cell_type": "markdown", 509 | "id": "2a1a3ba6-ce4f-4692-9732-6c77b7714c32", 510 | "metadata": {}, 511 | "source": [ 512 | "## Let us Answer Some Questions" 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": 69, 518 | "id": "5dd88f39-fcca-43fd-8aee-de19ddb17954", 519 | "metadata": {}, 520 | "outputs": [], 521 | "source": [ 522 | "compiled_rag = RAG()" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": 70, 528 | "id": "3f6ac3c3-e084-4a0a-8862-adc0184ed782", 529 | "metadata": {}, 530 | "outputs": [ 531 | { 532 | "data": { 533 | "text/markdown": [ 534 | "__Question__: List the models covered in module03" 535 | ], 536 | "text/plain": [ 537 | "" 538 | ] 539 | }, 540 | "metadata": {}, 541 | "output_type": "display_data" 542 | }, 543 | { 544 | "data": { 545 | "text/markdown": [ 546 | "__Predicted Answer__: _The models covered in module 03 include LLaMA 3.1, Chinchilla, and Gopher._" 547 | ], 548 | "text/plain": [ 549 | "" 550 | ] 551 | }, 552 | "metadata": {}, 553 | "output_type": "display_data" 554 | }, 555 | { 556 | "data": { 557 | "text/markdown": [ 558 | "__Retrieved Contexts (truncated):__" 559 | ], 560 | "text/plain": [ 561 | "" 562 | ] 563 | }, 564 | "metadata": {}, 565 | "output_type": "display_data" 566 | }, 567 | { 568 | "name": "stdout", 569 | "output_type": "stream", 570 | "text": [ 571 | "1. # Scaling Neural Nets and Efficient Training\n", 572 | "\n", 573 | "We have covered quite some ground in previous 2 modules and observed the steady increase in size and performance of the models. These gains come at huge c...\n", 574 | "\n", 575 | "2. 
# Prompt Engineering\n", 576 | "\n", 577 | "\n", 578 | "Prompt Engineering is this thrilling new discipline that opens the door to a world of possibilities with large language models (LLMs).\n", 579 | "\n", 580 | "As a pr...\n", 581 | "\n", 582 | "3. # Text Generation \n", 583 | " " 594 | ] 595 | }, 596 | "metadata": {}, 597 | "output_type": "display_data" 598 | }, 599 | { 600 | "data": { 601 | "text/markdown": [ 602 | "__Question__: Brief summary of module02" 603 | ], 604 | "text/plain": [ 605 | "" 606 | ] 607 | }, 608 | "metadata": {}, 609 | "output_type": "display_data" 610 | }, 611 | { 612 | "data": { 613 | "text/markdown": [ 614 | "__Predicted Answer__: _Module 02 focuses on text generation using pre-trained models like GPT-2, explaining foundation models, decoding strategies (greedy, beam search, sampling), and the impact of temperature on randomness. It also discusses limitations like long-range context and hallucination._" 615 | ], 616 | "text/plain": [ 617 | "" 618 | ] 619 | }, 620 | "metadata": {}, 621 | "output_type": "display_data" 622 | }, 623 | { 624 | "data": { 625 | "text/markdown": [ 626 | "__Retrieved Contexts (truncated):__" 627 | ], 628 | "text/plain": [ 629 | "" 630 | ] 631 | }, 632 | "metadata": {}, 633 | "output_type": "display_data" 634 | }, 635 | { 636 | "name": "stdout", 637 | "output_type": "stream", 638 | "text": [ 639 | "1. # Prompt Engineering\n", 640 | "\n", 641 | "\n", 642 | "Prompt Engineering is this thrilling new discipline that opens the door to a world of possibilities with large language models (LLMs).\n", 643 | "\n", 644 | "As a pr...\n", 645 | "\n", 646 | "2. # Text Generation \n", 647 | " " 662 | ] 663 | }, 664 | "metadata": {}, 665 | "output_type": "display_data" 666 | }, 667 | { 668 | "data": { 669 | "text/markdown": [ 670 | "__Question__: What is LLaMA?" 671 | ], 672 | "text/plain": [ 673 | "" 674 | ] 675 | }, 676 | "metadata": {}, 677 | "output_type": "display_data" 678 | }, 679 | { 680 | "data": { 681 | "text/markdown": [ 682 | "__Predicted Answer__: _LLaMA is a language model from Meta.AI, available in sizes 8B, 70B, and 405B, and it outperforms many existing LLMs on various benchmarks._" 683 | ], 684 | "text/plain": [ 685 | "" 686 | ] 687 | }, 688 | "metadata": {}, 689 | "output_type": "display_data" 690 | }, 691 | { 692 | "data": { 693 | "text/markdown": [ 694 | "__Retrieved Contexts (truncated):__" 695 | ], 696 | "text/plain": [ 697 | "" 698 | ] 699 | }, 700 | "metadata": {}, 701 | "output_type": "display_data" 702 | }, 703 | { 704 | "name": "stdout", 705 | "output_type": "stream", 706 | "text": [ 707 | "1. # Open Source Vs Close Sourced LLMs\n", 708 | "\n", 709 | "Similar to any other piece of technology, LLMs are available in all flavours and license types. While some of the most popular offerings are closed source (OpenAI ...\n", 710 | "\n", 711 | "2. # Scaling Neural Nets and Efficient Training\n", 712 | "\n", 713 | "We have covered quite some ground in previous 2 modules and observed the steady increase in size and performance of the models. These gains come at huge c...\n", 714 | "\n", 715 | "3. 
# Retrieval Augmented LLM App\n", 716 | "\n", 717 | "\n", 718 | "We have covered quite some ground in terms of understanding and building components for:\n", 719 | "- Text Representation\n", 720 | "- NLP Tasks\n", 721 | "- Pretrai...\n", 722 | "\n" 723 | ] 724 | }, 725 | { 726 | "data": { 727 | "text/markdown": [ 728 | "---" 729 | ], 730 | "text/plain": [ 731 | "" 732 | ] 733 | }, 734 | "metadata": {}, 735 | "output_type": "display_data" 736 | } 737 | ], 738 | "source": [ 739 | "my_questions = [\n", 740 | " \"List the models covered in module03\",\n", 741 | " \"Brief summary of module02\",\n", 742 | " \"What is LLaMA?\"\n", 743 | "]\n", 744 | "\n", 745 | "for question in my_questions:\n", 746 | " # Get the prediction. This contains `pred.context` and `pred.answer`.\n", 747 | " pred = compiled_rag(question)\n", 748 | " \n", 749 | " display(Markdown(f\"__Question__: {question}\"))\n", 750 | " display(Markdown(f\"__Predicted Answer__: _{pred.answer}_\"))\n", 751 | " display(Markdown(\"__Retrieved Contexts (truncated):__\"))\n", 752 | " for idx,cont in enumerate(pred.context):\n", 753 | " print(f\"{idx+1}. {cont[:200]}...\" )\n", 754 | " print()\n", 755 | " display(Markdown('---'))" 756 | ] 757 | }, 758 | { 759 | "attachments": {}, 760 | "cell_type": "markdown", 761 | "id": "5d3f76da-c5f0-49c6-8a30-3654b0e526d2", 762 | "metadata": {}, 763 | "source": [ 764 | "## Langfuse\n", 765 | "Understanding Costs\n", 766 | "\n", 767 | "\n", 768 | "\n", 769 | "---\n", 770 | "\n", 771 | "" 772 | ] 773 | }, 774 | { 775 | "cell_type": "markdown", 776 | "id": "3ec2ca40-dd85-4eaa-8f74-492e48149421", 777 | "metadata": {}, 778 | "source": [ 779 | "## Testing Langfuse Dataset using OpenLLaMA" 780 | ] 781 | }, 782 | { 783 | "cell_type": "code", 784 | "execution_count": 23, 785 | "id": "d259e844-2444-4568-887e-a656d464fa55", 786 | "metadata": {}, 787 | "outputs": [], 788 | "source": [ 789 | "langfuse =langfuse_tracker.langfuse\n", 790 | "ollama_dspy = dspy.OllamaLocal(model='llama3.1',temperature=0.5)\n", 791 | "\n", 792 | "# Set up the ollama as LM and RM\n", 793 | "dspy.settings.configure(lm=ollama_dspy,rm=retriever_model)" 794 | ] 795 | }, 796 | { 797 | "cell_type": "code", 798 | "execution_count": 24, 799 | "id": "0e41de51-1925-4b55-8ac6-5e79c530ec31", 800 | "metadata": {}, 801 | "outputs": [], 802 | "source": [ 803 | "# get annotated dataset\n", 804 | "annotated_dataset = langfuse.get_dataset(\"llm_workshop_rag\")" 805 | ] 806 | }, 807 | { 808 | "cell_type": "code", 809 | "execution_count": 25, 810 | "id": "b9b60273-b9d7-4c91-ada8-e30a3d8391d5", 811 | "metadata": {}, 812 | "outputs": [], 813 | "source": [ 814 | "# test rag using ollama\n", 815 | "ollama_rag = RAG()" 816 | ] 817 | }, 818 | { 819 | "cell_type": "code", 820 | "execution_count": 27, 821 | "id": "116ccd33-2c0b-47fe-9cc1-bfef0f8de96b", 822 | "metadata": {}, 823 | "outputs": [ 824 | { 825 | "data": { 826 | "text/markdown": [ 827 | "__Question__: Brief summary of module02" 828 | ], 829 | "text/plain": [ 830 | "" 831 | ] 832 | }, 833 | "metadata": {}, 834 | "output_type": "display_data" 835 | }, 836 | { 837 | "data": { 838 | "text/markdown": [ 839 | "__Predicted Answer (LLaMA 3.1)__: Here is a brief summary of module02:\n", 840 | "\n", 841 | "* LoRA (Low-Rank Adaptation) technique for fine-tuning large models:\n", 842 | "\t+ Freezes base model weights\n", 843 | "\t+ Decomposes weight update matrix into lower rank matrices, reducing updates by 100-1000x\n", 844 | "* qLoRA: Combines quantization and LoRA to further improve efficiency\n", 845 | "* Model 
Parameters:\n", 846 | "\t+ Model size: 405 billion parameters\n", 847 | "\t+ Training dataset: 15 trillion data points\n", 848 | "* GPU Performance and Compute Time:\n", 849 | "\t+ Compute required for training large models\n", 850 | "\t+ Cost of training large models\n", 851 | "* Scaling Laws:\n", 852 | "\t+ Insights from the paper \"Scaling Laws for Neural Language Models\"" 853 | ], 854 | "text/plain": [ 855 | "" 856 | ] 857 | }, 858 | "metadata": {}, 859 | "output_type": "display_data" 860 | }, 861 | { 862 | "data": { 863 | "text/markdown": [ 864 | ">__Annotated Answer (GPT-4o-mini)__: _Module 02 focuses on text generation using pre-trained models like GPT-2, explaining foundation models, decoding strategies (greedy, beam search, sampling), and the impact of temperature on randomness. It also discusses limitations like long-range context and hallucination._" 865 | ], 866 | "text/plain": [ 867 | "" 868 | ] 869 | }, 870 | "metadata": {}, 871 | "output_type": "display_data" 872 | }, 873 | { 874 | "data": { 875 | "text/markdown": [ 876 | "__Question__: What is LLaMA?" 877 | ], 878 | "text/plain": [ 879 | "" 880 | ] 881 | }, 882 | "metadata": {}, 883 | "output_type": "display_data" 884 | }, 885 | { 886 | "data": { 887 | "text/markdown": [ 888 | "__Predicted Answer (LLaMA 3.1)__: It seems like you're trying to follow along with a workshop on Large Language Models (LLMs) and their applications. However, the question about LLaMA was not fully answered.\n", 889 | "\n", 890 | "To provide a complete answer:\n", 891 | "\n", 892 | "Llama is a large language model developed by Meta AI. It's designed for natural language processing tasks such as text generation, translation, and more. Like other popular LLMs like BERT and RoBERTa, Llama uses self-supervised learning to learn patterns in language from vast amounts of text data.\n", 893 | "\n", 894 | "Now, let's get back to the original question: \"Fine-Tuning PEFT - SFT and LLM Landscape - Vector Databases - Libraries and Frameworks\".\n", 895 | "\n", 896 | "To answer this question:\n", 897 | "\n", 898 | "The topic seems to be" 899 | ], 900 | "text/plain": [ 901 | "" 902 | ] 903 | }, 904 | "metadata": {}, 905 | "output_type": "display_data" 906 | }, 907 | { 908 | "data": { 909 | "text/markdown": [ 910 | ">__Annotated Answer (GPT-4o-mini)__: _LLaMA is a language model from Meta.AI, available in sizes 8B, 70B, and 405B, and it outperforms many existing LLMs on various benchmarks._" 911 | ], 912 | "text/plain": [ 913 | "" 914 | ] 915 | }, 916 | "metadata": {}, 917 | "output_type": "display_data" 918 | } 919 | ], 920 | "source": [ 921 | "for item in annotated_dataset.items:\n", 922 | " question = item.input[0]['content'].split('Question: ')[-1].split('\\n')[0]\n", 923 | " answer = item.expected_output['content'].split('Answer: ')[-1]\n", 924 | " o_pred = ollama_rag(question)\n", 925 | " with item.observe(\n", 926 | " run_name='ollama_experiment',\n", 927 | " run_description='compare LLaMA3.1 RAG vs GPT4o-mini RAG ',\n", 928 | " run_metadata={\"model\": \"llama3.1\"},\n", 929 | " ) as trace_id:\n", 930 | " langfuse.score(\n", 931 | " name=\"visual-eval\",\n", 932 | " # any float value\n", 933 | " value=1.0,\n", 934 | " comment=\"LLaMA3.1 is very verbose\",\n", 935 | " )\n", 936 | " langfuse.trace(input=question,output=o_pred.answer,metadata={'model':'LLaMA3.1'})\n", 937 | " display(Markdown(f\"__Question__: {question}\"))\n", 938 | " display(Markdown(f\"__Predicted Answer (LLaMA 3.1)__: {o_pred.answer}\"))\n", 939 | " display(Markdown(f\">__Annotated 
Answer (GPT-4o-mini)__: _{answer}_\"))" 940 | ] 941 | }, 942 | { 943 | "cell_type": "code", 944 | "execution_count": null, 945 | "id": "b56d5e1a-7713-4056-b4af-e8f9363cd1a6", 946 | "metadata": {}, 947 | "outputs": [], 948 | "source": [] 949 | } 950 | ], 951 | "metadata": { 952 | "kernelspec": { 953 | "display_name": "Python 3 (ipykernel)", 954 | "language": "python", 955 | "name": "python3" 956 | }, 957 | "language_info": { 958 | "codemirror_mode": { 959 | "name": "ipython", 960 | "version": 3 961 | }, 962 | "file_extension": ".py", 963 | "mimetype": "text/x-python", 964 | "name": "python", 965 | "nbconvert_exporter": "python", 966 | "pygments_lexer": "ipython3", 967 | "version": "3.11.9" 968 | } 969 | }, 970 | "nbformat": 4, 971 | "nbformat_minor": 5 972 | } 973 | -------------------------------------------------------------------------------- /module_04/app.py: -------------------------------------------------------------------------------- 1 | ## import required components 2 | 3 | import pandas as pd 4 | from utils import ( 5 | get_lines, 6 | load_data, 7 | get_relevant_documents, 8 | get_answer, 9 | create_db, 10 | sidebar, 11 | ) 12 | import streamlit as st 13 | chroma_client, db = create_db() 14 | 15 | ## Setup Page Header and Sidebar 16 | st.set_page_config(page_title="PersonalGPT", page_icon="📖", layout="wide") 17 | lm_model = sidebar() 18 | st.header(f"📖PersonalGPT") 19 | st.markdown(f">:zap: Responses Powered by **{lm_model}**") 20 | 21 | if 'is_doc_uploaded' not in st.session_state: 22 | st.session_state['is_doc_uploaded'] = False 23 | 24 | 25 | ## Add Uploader Component 26 | uploaded_file = st.file_uploader( 27 | "Upload a txt file", 28 | type=["txt"], 29 | help="Text files with each sentence acting as a document", 30 | ) 31 | 32 | if not st.session_state['is_doc_uploaded']: 33 | ## Check if upload is complete 34 | if not uploaded_file: 35 | st.stop() 36 | 37 | ## Read uploaded file 38 | try: 39 | file_data = get_lines(uploaded_file) 40 | ## Verbose Status update 41 | st.markdown(f"> Uploaded file has {len(file_data)} lines of text") 42 | st.session_state['is_doc_uploaded'] = True 43 | except Exception as e: 44 | st.markdown(f"Could not upload/read file={e}") 45 | st.session_state['is_doc_uploaded'] = False 46 | 47 | ## Index Uploaded text file 48 | with st.spinner("Indexing document... 
This may take a while⏳"): 49 | db_status_msg = load_data(db, documents=file_data) 50 | 51 | ## status update 52 | st.markdown(f"> Database indexed {db.count()} documents") 53 | if db.count() == 0: 54 | st.markdown(db_status_msg) 55 | st.session_state['is_doc_uploaded'] = False 56 | 57 | ## Get User Input 58 | with st.form(key="qa_form"): 59 | query = st.text_area("Enter Your Query:",placeholder="Examples: \nwhat is tf-idf?\nwhich module covers RLHF\nhow many moons does Jupiter have?") 60 | submit = st.form_submit_button("Submit") 61 | 62 | ## Provide additional Options for citing source 63 | with st.expander("Advanced Options"): 64 | show_source = st.checkbox("Show Source") 65 | 66 | ## Generate Output upon button click 67 | if submit: 68 | # Get relevant documents from DB 69 | context = get_relevant_documents(query, db) 70 | 71 | # get answer from LLM 72 | answer,score,error = get_answer(query,context,lm_model) 73 | 74 | # Showcase response on screen 75 | st.markdown(f"**Answer:** _{answer}_") 76 | st.markdown(f"> **Relevance Score**:{score}") 77 | st.markdown("---") 78 | 79 | # Add more details if advanced option is chosen 80 | if show_source: 81 | st.markdown("**Source(s):**") 82 | st.markdown(f"- {context[:100]}...", unsafe_allow_html=True) 83 | -------------------------------------------------------------------------------- /module_04/assets/chroma_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/chroma_workflow.png -------------------------------------------------------------------------------- /module_04/assets/cot_few_shot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/cot_few_shot.png -------------------------------------------------------------------------------- /module_04/assets/dspy_banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/dspy_banner.png -------------------------------------------------------------------------------- /module_04/assets/dspy_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/dspy_logo.png -------------------------------------------------------------------------------- /module_04/assets/dspy_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/dspy_workflow.png -------------------------------------------------------------------------------- /module_04/assets/langchain_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/langchain_workflow.png -------------------------------------------------------------------------------- /module_04/assets/langfuse_dashboard.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/langfuse_dashboard.png -------------------------------------------------------------------------------- /module_04/assets/langfuse_traces.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/langfuse_traces.png -------------------------------------------------------------------------------- /module_04/assets/llama_setup_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/llama_setup_1.png -------------------------------------------------------------------------------- /module_04/assets/llama_setup_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/llama_setup_2.png -------------------------------------------------------------------------------- /module_04/assets/llama_setup_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/llama_setup_3.png -------------------------------------------------------------------------------- /module_04/assets/llama_setup_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/llama_setup_4.png -------------------------------------------------------------------------------- /module_04/assets/llama_setup_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/llama_setup_5.png -------------------------------------------------------------------------------- /module_04/assets/mteb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/mteb.png -------------------------------------------------------------------------------- /module_04/assets/pe_banner.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/pe_banner.jpg -------------------------------------------------------------------------------- /module_04/assets/pe_types.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/pe_types.jpg -------------------------------------------------------------------------------- /module_04/assets/prompt_hacking_reddit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/prompt_hacking_reddit.png -------------------------------------------------------------------------------- 
/module_04/assets/prompt_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/prompt_workflow.png -------------------------------------------------------------------------------- /module_04/assets/rap_banner.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/rap_banner.jpeg -------------------------------------------------------------------------------- /module_04/assets/react_sample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/react_sample.png -------------------------------------------------------------------------------- /module_04/assets/training_is_hard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/training_is_hard.png -------------------------------------------------------------------------------- /module_04/assets/vector_banner.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_04/assets/vector_banner.jpg -------------------------------------------------------------------------------- /module_04/constants.py: -------------------------------------------------------------------------------- 1 | ##################### 2 | ## Set Constants 3 | ##################### 4 | HF_TOKEN = '' 5 | OPENAI_TOKEN = '' 6 | HEADERS = {"Authorization": f"Bearer {HF_TOKEN}"} 7 | 8 | # Constants for embedding model 9 | EMB_MODEL_ID = 'pinecone/mpnet-retriever-discourse' 10 | EMB_API_URL = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{EMB_MODEL_ID}" 11 | 12 | # Constants for QA model 13 | QA_MODEL_ID = 'deepset/roberta-base-squad2' 14 | 15 | # List of Different Endpoints 16 | HF_QA_ENDPOINT = 'HF-QA' 17 | HF_LM_ENDPOINT = 'HF-LM' 18 | OPENAI_ENDPOINT = 'OPENAI-LM' 19 | LOCAL_OLLAMA_ENDPOINT = 'OLLAMA' 20 | AVAILABLE_LMs = { 21 | 'models': 22 | [ 23 | 'deepset/roberta-base-squad2', 24 | 'Intel/dynamic_tinybert', 25 | #'google/gemma-2-2b-it', # this is timing out mostly 26 | 'Local-LLAMA-3.1:8b', 27 | 'OpenAI-GPT4o-mini' 28 | ], 29 | 'endpoints': 30 | [ 31 | HF_QA_ENDPOINT, 32 | HF_QA_ENDPOINT, 33 | #HF_LM_ENDPOINT, #this is timing out mostly 34 | LOCAL_OLLAMA_ENDPOINT, 35 | OPENAI_ENDPOINT, 36 | ] 37 | } -------------------------------------------------------------------------------- /module_04/scraper_utils.py: -------------------------------------------------------------------------------- 1 | # Adapted From: https://gist.github.com/psychemedia/925e190e2afd15b050f32334ceff9ef6 2 | import os 3 | import nbformat 4 | 5 | class NB_Markdown_Scraper: 6 | 7 | def __init__(self,input_paths=None): 8 | self.notebook_md_dict = dict() 9 | self.input_paths = input_paths 10 | 11 | def nbpathwalk(self,path): 12 | ''' Walk down a directory path looking for ipynb notebook files... 
''' 13 | valid_notebook_files = [] 14 | for path, _, files in os.walk(path): 15 | if '.ipynb_checkpoints' in path or 'solutions' in path : continue 16 | for f in [i for i in files if i.endswith('.ipynb') and not i.startswith('dontcommit')]: 17 | valid_notebook_files.append(os.path.join(path, f)) 18 | return valid_notebook_files 19 | 20 | 21 | def get_cell_contents(self,nb_fn, c_md=None, cell_typ=None): 22 | ''' Extract the content of Jupyter notebook cells. ''' 23 | if cell_typ is None: cell_typ=['markdown'] 24 | if c_md is None: c_md = [] 25 | nb=nbformat.read(nb_fn,nbformat.NO_CONVERT) 26 | _c_md=[i for i in nb.cells if i['cell_type'] in cell_typ] 27 | ix=len(c_md) 28 | for c in _c_md: 29 | c.update( {"ix":str(ix)}) 30 | c.update( {"title":nb_fn}) 31 | ix = ix+1 32 | c_md = c_md + _c_md 33 | return c_md 34 | 35 | 36 | # scraper 37 | def scrape_markdowns(self): 38 | for directory in self.input_paths: 39 | directory_notebooks = self.nbpathwalk(directory) 40 | for notebook in directory_notebooks: 41 | notebook_cells = self.get_cell_contents(notebook, cell_typ=['markdown']) 42 | notebook_name = '_'.join(notebook.split('/')[1:]).split('.')[0] 43 | self.notebook_md_dict[notebook_name] = ' '.join([cell['source'] for cell in sorted(notebook_cells, 44 | key=lambda d: d['ix'])]) 45 | -------------------------------------------------------------------------------- /module_04/utils.py: -------------------------------------------------------------------------------- 1 | ##################### 2 | ## imports 3 | ##################### 4 | import pandas as pd 5 | import json 6 | import requests 7 | from retry import retry 8 | import streamlit as st 9 | import chromadb.utils.embedding_functions as embedding_functions 10 | from huggingface_hub import InferenceClient 11 | from openai import OpenAI 12 | import ollama 13 | from constants import ( 14 | HF_TOKEN, 15 | OPENAI_TOKEN, 16 | HEADERS, 17 | EMB_MODEL_ID, 18 | EMB_API_URL, 19 | QA_MODEL_ID, 20 | HF_QA_ENDPOINT, 21 | HF_LM_ENDPOINT, 22 | OPENAI_ENDPOINT, 23 | LOCAL_OLLAMA_ENDPOINT, 24 | AVAILABLE_LMs) 25 | 26 | 27 | import chromadb 28 | 29 | 30 | lm_df = pd.DataFrame.from_dict(AVAILABLE_LMs) 31 | 32 | ##################### 33 | ## Utility Functions 34 | ##################### 35 | 36 | def get_lines(uploaded_file): 37 | """ 38 | Utility to read raw text file in binary 39 | """ 40 | raw_data = [] 41 | for line in uploaded_file: 42 | raw_data.append(line.decode("utf-8") ) 43 | return raw_data 44 | 45 | def create_db(): 46 | """ 47 | Utility to instantiate vector db client and collection 48 | """ 49 | chroma_client = chromadb.Client() 50 | # huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction( 51 | # api_key=HF_TOKEN, 52 | # model_name=EMB_MODEL_ID 53 | # ) 54 | db = chroma_client.get_or_create_collection(name="nlp_llm_workshop",) 55 | #embedding_function=huggingface_ef) 56 | return chroma_client,db 57 | 58 | def load_data(db, documents): 59 | """ 60 | Utility to add/index data into vector db 61 | """ 62 | try: 63 | db.add( 64 | documents=documents, 65 | ids=[str(i) for i in range(len(documents))] 66 | ) 67 | except Exception as ex: 68 | return "Apologies but I could not ingest document", 0.0, ex 69 | 70 | def get_relevant_documents(query, db): 71 | """ 72 | Utility to retrieve relevant documents from vector DB 73 | """ 74 | try: 75 | relevant_doc = db.query(query_texts=[query], n_results=1)['documents'][0][0] 76 | return relevant_doc 77 | except Exception as ex: 78 | return "Apologies but I could not process your query", 0.0, ex 79 | 80 | def 
get_hf_qa_answer(payload,lm_model): 81 | data = json.dumps(payload) 82 | try: 83 | QA_API_URL = f"https://api-inference.huggingface.co/models/{lm_model}" 84 | response = requests.request("POST", QA_API_URL, headers=HEADERS, data=data) 85 | decoded_response = json.loads(response.content.decode("utf-8")) 86 | return decoded_response['answer'], decoded_response['score'], "" 87 | except Exception as ex: 88 | return "Apologies but I could not find any relevant answer", 0.0, ex 89 | 90 | # this is mostly timing out 91 | def get_hf_llm_answer(payload,lm_model): 92 | try: 93 | client = InferenceClient( 94 | "google/gemma-2-2b-it", 95 | token=HF_TOKEN,) 96 | 97 | content = f"Given the context, answer the question. \ncontext:{payload['context']}\nquestion:{payload['question']}" 98 | response = client.chat_completion( 99 | messages=[{"role": "user", "content": content}], 100 | max_tokens=500, 101 | stream=False, 102 | ) 103 | # non-streaming response: answer text is in choices[0].message.content; no relevance score available here 104 | return response.choices[0].message.content, 0.0, "" 105 | except Exception as ex: 106 | return "Apologies but I could not find any relevant answer", 0.0, ex 107 | 108 | def get_local_llama_answer(payload,lm_model): 109 | try: 110 | content = f"Given the context, perform the following tasks:1.Respond with a summarized answer to the question factually in few words only if the provided context contains the answer\n2.Check if your answer is really in the provided context, otherwise respond with 'Sorry I could not find the answer'.\n 3.Generate a relevance score between 0 and 1.\n4. Format the output as a json with answer and score as keys.\n5.Do not add markdown syntax only respond with json.\nBe careful and Think step by step.\ncontext:{payload['context']}\nquestion:{payload['question']}" 111 | response = ollama.chat(model='llama3.1:8b', messages=[ 112 | { 113 | 'role': 'user', 114 | 'content': content, 115 | }, 116 | ] 117 | ) 118 | json_output = json.loads(response['message']['content']) 119 | return json_output['answer'], json_output['score'], "" 120 | except Exception as ex: 121 | st.markdown(ex) 122 | return "Apologies but I could not find any relevant answer", 0.0, ex 123 | 124 | def get_opeai_answer(payload,lm_model): 125 | try: 126 | client = OpenAI( 127 | api_key=OPENAI_TOKEN, 128 | ) 129 | content = f"Given the context, perform the following tasks:1.Respond with a summarized answer to the question factually in few words only if the provided context contains the answer\n 2.Generate a relevance score.\n3. Format the output as a json with answer and score as keys. 
Do not add markdown syntax.\nThink step by step.\ncontext:{payload['context']}\nquestion:{payload['question']}" 130 | chat_completion = client.chat.completions.create( 131 | messages=[ 132 | { 133 | "role": "user", 134 | "content": content, 135 | } 136 | ], 137 | model="gpt-4o-mini", 138 | ) 139 | json_output = json.loads(chat_completion.choices[0].message.content) 140 | return json_output['answer'], json_output['score'], "" 141 | except Exception as ex: 142 | return "Apologies but I could not find any relevant answer", 0.0, ex 143 | 144 | 145 | def get_answer(question,context,lm_model): 146 | """ 147 | Utility to leverage QA model for answering question using given context 148 | and the mentioned model 149 | """ 150 | payload = { 151 | "question": question, 152 | "context":context 153 | } 154 | try: 155 | endpoint_type = lm_df[lm_df['models']==lm_model]['endpoints'].values[0] 156 | if endpoint_type == HF_QA_ENDPOINT: 157 | return get_hf_qa_answer(payload,lm_model) 158 | elif endpoint_type == HF_LM_ENDPOINT: 159 | return get_hf_llm_answer(payload,lm_model) 160 | elif endpoint_type == OPENAI_ENDPOINT: 161 | return get_opeai_answer(payload,lm_model) 162 | elif endpoint_type == LOCAL_OLLAMA_ENDPOINT: 163 | return get_local_llama_answer(payload,lm_model) 164 | else: 165 | return "This is not implemented yet", 0.0, "" 166 | except Exception as ex: 167 | return "Apologies but I could not find any relevant answer", 0.0, ex 168 | 169 | 170 | def sidebar(): 171 | """ 172 | Utility to add content to sidebar 173 | """ 174 | with st.sidebar: 175 | st.markdown( 176 | "## How to use\n" 177 | "1. Upload a txt file📄\n" 178 | "2. Ask a question about the document💬\n" 179 | ) 180 | st.markdown("---") 181 | st.markdown("## Which LM would you like to use?") 182 | option = st.selectbox( 183 | "Select a Model", 184 | lm_df['models'], 185 | label_visibility='hidden' 186 | ) 187 | 188 | st.markdown("---") 189 | st.markdown("# About") 190 | st.markdown( 191 | "📖PersonalGPT is a demo to showcase a retrieval augmented question answering system" 192 | ) 193 | st.markdown(":heart: Made by [raghav bali](https://raghavbali.github.io)") 194 | st.markdown("---") 195 | 196 | return option -------------------------------------------------------------------------------- /module_05/whats_next.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/module_05/whats_next.pdf -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "llm-workshop" 3 | version = "0.1.0" 4 | description = "LLM Workshop 2024 by raghavbali.github.io" 5 | authors = ["raghavbali "] 6 | readme = "README.md" 7 | package-mode = false 8 | 9 | [tool.poetry.dependencies] 10 | python = "^3.11" 11 | pandas = "^2.2.2" 12 | matplotlib = "^3.9.0" 13 | seaborn = "^0.13.2" 14 | torch = "^2.3.1" 15 | transformers = "4.44.0" 16 | torchtext = "^0.18.0" 17 | gensim = "^4.3.2" 18 | jupyterlab = "^4.2.3" 19 | jupyter = "^1.0.0" 20 | nltk = "^3.8.1" 21 | scikit-learn = "^1.5.1" 22 | scipy = "1.12" 23 | scienceplots = "^2.1.1" 24 | fasttext = "^0.9.3" 25 | datasets = "^2.20.0" 26 | accelerate = "^0.33.0" 27 | peft = "^0.12.0" 28 | trl = "^0.9.6" 29 | tensorboardx = "^2.6.2.2" 30 | gpt4all = "^2.7.0" 31 | ollama = "^0.3.1" 32 | dspy-ai = "2.4.10" 33 | langchain = "^0.2.12" 34 | langchain-community = "^0.2.11" 
35 | langchain-openai = "^0.1.20" 36 | chromadb = "^0.5.5" 37 | openai = "^1.38.0" 38 | streamlit = "^1.37.0" 39 | retry = "^0.9.2" 40 | sentence-transformers = "^3.0.1" 41 | langchainhub = "^0.1.20" 42 | watchdog = "^4.0.1" 43 | bitsandbytes = "0.43.3" 44 | 45 | 46 | [build-system] 47 | requires = ["poetry-core"] 48 | build-backend = "poetry.core.masonry.api" 49 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | 2 | pandas==2.2.2 3 | matplotlib==3.9.0 4 | seaborn==0.13.2 5 | torch==2.3.1 6 | transformers==4.44.0 7 | torchtext==0.18.0 8 | gensim==4.3.2 9 | jupyterlab==4.2.3 10 | jupyter==1.0.0 11 | nltk==3.8.1 12 | scikit-learn==1.5.1 13 | scipy==1.12 14 | scienceplots==2.1.1 15 | fasttext==0.9.3 16 | datasets==2.20.0 17 | accelerate==0.33.0 18 | peft==0.12.0 19 | trl==0.9.6 20 | tensorboardx==2.6.2.2 21 | gpt4all==2.7.0 22 | ollama==0.3.1 23 | dspy-ai==2.4.10 24 | langchain==0.2.12 25 | langchain-community==0.2.11 26 | langchain-openai==0.1.20 27 | chromadb==0.5.5 28 | openai==1.38.0 29 | streamlit==1.37.0 30 | retry==0.9.2 31 | sentence-transformers==3.0.1 32 | langchainhub==0.1.20 33 | watchdog==4.0.1 34 | bitsandbytes==0.43.3 -------------------------------------------------------------------------------- /workshop_introduction.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/raghavbali/llm_workshop/4b66893cc315a0f16706f98ad6548b0b3b4bc700/workshop_introduction.pdf --------------------------------------------------------------------------------