├── colab ├── image │ ├── upload.png │ ├── resources.png │ ├── gpu_enable_1.png │ └── gpu_enable_2.png ├── README.md ├── finetune.md └── getting_started.md ├── kaggle ├── image │ ├── input.png │ ├── queue.png │ ├── save.png │ ├── ticks.png │ ├── button.png │ ├── output_tab.png │ ├── outputfile.png │ ├── quicksave.png │ ├── resources.png │ ├── enable_gpu_1.png │ ├── enable_gpu_2.png │ ├── persistence.png │ ├── run_with_gpu.png │ ├── view_versions.png │ ├── avoid_download.png │ ├── upload_notebook.png │ ├── version_history.png │ ├── getting_started_2025-01-27-00-25-11.png │ ├── getting_started_2025-01-27-00-28-01.png │ └── getting_started_2025-01-27-00-29-40.png ├── README.md ├── finetune.md └── getting_started.md ├── others ├── papers │ └── UltimateGuideFromBasicsToBreakthrough │ │ ├── image │ │ ├── hft.png │ │ ├── moa.png │ │ ├── rag.png │ │ ├── dora.png │ │ ├── lora.png │ │ ├── peft.png │ │ ├── lamini-1.png │ │ ├── mistral.png │ │ ├── pipeline.png │ │ ├── setupllm.png │ │ ├── timeline.png │ │ ├── lora_dora.png │ │ ├── LLMdimension.png │ │ ├── hft_vs_lora.png │ │ ├── lora_weight.png │ │ ├── task_specific.png │ │ ├── timeline-MMD.png │ │ ├── challenge_init.png │ │ ├── table_pre_fine.png │ │ ├── compare_rag_fine.png │ │ ├── multiple_adapter.png │ │ ├── data_collect_library.png │ │ └── data_preprocess_library.png │ │ ├── README.md │ │ ├── Chapter4.ipynb │ │ ├── Chapter2.ipynb │ │ ├── Chapter11.ipynb │ │ ├── Chapter5.ipynb │ │ ├── Chapter7.ipynb │ │ ├── Chapter3.ipynb │ │ ├── Chapter1.ipynb │ │ └── Chapter6.ipynb ├── blogs │ ├── lora │ │ ├── advancd_guide_lora.md │ │ ├── image │ │ │ └── insights_100_experiments_2025-01-27-02-58-20.png │ │ └── insights_100_experiments.md │ └── efficient_training_huggingface.md └── README.md ├── scripts ├── cv │ ├── README.md │ ├── ultralytics.ipynb │ ├── tensorflow.ipynb │ └── torch.ipynb └── llm │ ├── README.md │ ├── huggingface.md │ ├── lora.ipynb │ └── huggingface.ipynb └── README.md /colab/image/upload.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/colab/image/upload.png -------------------------------------------------------------------------------- /kaggle/image/input.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/kaggle/image/input.png -------------------------------------------------------------------------------- /kaggle/image/queue.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/kaggle/image/queue.png -------------------------------------------------------------------------------- /kaggle/image/save.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/kaggle/image/save.png -------------------------------------------------------------------------------- /kaggle/image/ticks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/kaggle/image/ticks.png -------------------------------------------------------------------------------- /colab/image/resources.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/colab/image/resources.png 
-------------------------------------------------------------------------------- /kaggle/image/button.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/kaggle/image/button.png -------------------------------------------------------------------------------- /kaggle/image/output_tab.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/kaggle/image/output_tab.png -------------------------------------------------------------------------------- /kaggle/image/outputfile.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/kaggle/image/outputfile.png -------------------------------------------------------------------------------- /kaggle/image/quicksave.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/kaggle/image/quicksave.png -------------------------------------------------------------------------------- /kaggle/image/resources.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/kaggle/image/resources.png -------------------------------------------------------------------------------- /colab/image/gpu_enable_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/colab/image/gpu_enable_1.png -------------------------------------------------------------------------------- /colab/image/gpu_enable_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/colab/image/gpu_enable_2.png -------------------------------------------------------------------------------- /kaggle/image/enable_gpu_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/kaggle/image/enable_gpu_1.png -------------------------------------------------------------------------------- /kaggle/image/enable_gpu_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/kaggle/image/enable_gpu_2.png -------------------------------------------------------------------------------- /kaggle/image/persistence.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/kaggle/image/persistence.png -------------------------------------------------------------------------------- /kaggle/image/run_with_gpu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/kaggle/image/run_with_gpu.png -------------------------------------------------------------------------------- /kaggle/image/view_versions.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/kaggle/image/view_versions.png -------------------------------------------------------------------------------- 
/kaggle/image/avoid_download.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/kaggle/image/avoid_download.png -------------------------------------------------------------------------------- /kaggle/image/upload_notebook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/kaggle/image/upload_notebook.png -------------------------------------------------------------------------------- /kaggle/image/version_history.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/kaggle/image/version_history.png -------------------------------------------------------------------------------- /kaggle/image/getting_started_2025-01-27-00-25-11.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/kaggle/image/getting_started_2025-01-27-00-25-11.png -------------------------------------------------------------------------------- /kaggle/image/getting_started_2025-01-27-00-28-01.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/kaggle/image/getting_started_2025-01-27-00-28-01.png -------------------------------------------------------------------------------- /kaggle/image/getting_started_2025-01-27-00-29-40.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/kaggle/image/getting_started_2025-01-27-00-29-40.png -------------------------------------------------------------------------------- /others/papers/UltimateGuideFromBasicsToBreakthrough/image/hft.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/others/papers/UltimateGuideFromBasicsToBreakthrough/image/hft.png -------------------------------------------------------------------------------- /others/papers/UltimateGuideFromBasicsToBreakthrough/image/moa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/others/papers/UltimateGuideFromBasicsToBreakthrough/image/moa.png -------------------------------------------------------------------------------- /others/papers/UltimateGuideFromBasicsToBreakthrough/image/rag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/others/papers/UltimateGuideFromBasicsToBreakthrough/image/rag.png -------------------------------------------------------------------------------- /others/blogs/lora/advancd_guide_lora.md: -------------------------------------------------------------------------------- 1 | # Essential to Advanced Guide to training a LoRA 2 | 3 | Link: [here](https://civitai.com/articles/3105/essential-to-advanced-guide-to-training-a-lora) 4 | -------------------------------------------------------------------------------- /others/papers/UltimateGuideFromBasicsToBreakthrough/image/dora.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/others/papers/UltimateGuideFromBasicsToBreakthrough/image/dora.png -------------------------------------------------------------------------------- /others/papers/UltimateGuideFromBasicsToBreakthrough/image/lora.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/others/papers/UltimateGuideFromBasicsToBreakthrough/image/lora.png -------------------------------------------------------------------------------- /others/papers/UltimateGuideFromBasicsToBreakthrough/image/peft.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/others/papers/UltimateGuideFromBasicsToBreakthrough/image/peft.png -------------------------------------------------------------------------------- /others/blogs/efficient_training_huggingface.md: -------------------------------------------------------------------------------- 1 | # Methods and tools for efficient training on a single GPU 2 | 3 | Link: [here](https://huggingface.co/docs/transformers/main/en/perf_train_gpu_one) 4 | -------------------------------------------------------------------------------- /others/papers/UltimateGuideFromBasicsToBreakthrough/image/lamini-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/others/papers/UltimateGuideFromBasicsToBreakthrough/image/lamini-1.png -------------------------------------------------------------------------------- /others/papers/UltimateGuideFromBasicsToBreakthrough/image/mistral.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/others/papers/UltimateGuideFromBasicsToBreakthrough/image/mistral.png -------------------------------------------------------------------------------- /others/papers/UltimateGuideFromBasicsToBreakthrough/image/pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/others/papers/UltimateGuideFromBasicsToBreakthrough/image/pipeline.png -------------------------------------------------------------------------------- /others/papers/UltimateGuideFromBasicsToBreakthrough/image/setupllm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/others/papers/UltimateGuideFromBasicsToBreakthrough/image/setupllm.png -------------------------------------------------------------------------------- /others/papers/UltimateGuideFromBasicsToBreakthrough/image/timeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/others/papers/UltimateGuideFromBasicsToBreakthrough/image/timeline.png -------------------------------------------------------------------------------- /others/blogs/lora/image/insights_100_experiments_2025-01-27-02-58-20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/others/blogs/lora/image/insights_100_experiments_2025-01-27-02-58-20.png -------------------------------------------------------------------------------- 
/others/papers/UltimateGuideFromBasicsToBreakthrough/image/lora_dora.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/others/papers/UltimateGuideFromBasicsToBreakthrough/image/lora_dora.png -------------------------------------------------------------------------------- /others/papers/UltimateGuideFromBasicsToBreakthrough/image/LLMdimension.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/others/papers/UltimateGuideFromBasicsToBreakthrough/image/LLMdimension.png -------------------------------------------------------------------------------- /others/papers/UltimateGuideFromBasicsToBreakthrough/image/hft_vs_lora.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/others/papers/UltimateGuideFromBasicsToBreakthrough/image/hft_vs_lora.png -------------------------------------------------------------------------------- /others/papers/UltimateGuideFromBasicsToBreakthrough/image/lora_weight.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/others/papers/UltimateGuideFromBasicsToBreakthrough/image/lora_weight.png -------------------------------------------------------------------------------- /others/papers/UltimateGuideFromBasicsToBreakthrough/image/task_specific.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/others/papers/UltimateGuideFromBasicsToBreakthrough/image/task_specific.png -------------------------------------------------------------------------------- /others/papers/UltimateGuideFromBasicsToBreakthrough/image/timeline-MMD.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/others/papers/UltimateGuideFromBasicsToBreakthrough/image/timeline-MMD.png -------------------------------------------------------------------------------- /others/papers/UltimateGuideFromBasicsToBreakthrough/image/challenge_init.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/others/papers/UltimateGuideFromBasicsToBreakthrough/image/challenge_init.png -------------------------------------------------------------------------------- /others/papers/UltimateGuideFromBasicsToBreakthrough/image/table_pre_fine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/others/papers/UltimateGuideFromBasicsToBreakthrough/image/table_pre_fine.png -------------------------------------------------------------------------------- /others/papers/UltimateGuideFromBasicsToBreakthrough/image/compare_rag_fine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/others/papers/UltimateGuideFromBasicsToBreakthrough/image/compare_rag_fine.png -------------------------------------------------------------------------------- /others/papers/UltimateGuideFromBasicsToBreakthrough/image/multiple_adapter.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/others/papers/UltimateGuideFromBasicsToBreakthrough/image/multiple_adapter.png -------------------------------------------------------------------------------- /others/papers/UltimateGuideFromBasicsToBreakthrough/image/data_collect_library.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/others/papers/UltimateGuideFromBasicsToBreakthrough/image/data_collect_library.png -------------------------------------------------------------------------------- /others/papers/UltimateGuideFromBasicsToBreakthrough/image/data_preprocess_library.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BFCmath/FinetuneAI_Learning/HEAD/others/papers/UltimateGuideFromBasicsToBreakthrough/image/data_preprocess_library.png -------------------------------------------------------------------------------- /colab/README.md: -------------------------------------------------------------------------------- 1 | # Fine-tuning on Colab 2 | ## Introduction 3 | This guide summarizes my learnings and experiences from using Colab for AI competitions. Please note that some information may change and may not always be up-to-date. 4 | 5 | This repository is geared toward minimizing costs, so you don't need to worry about GPU expenses. 6 | 7 | ## Getting Started with Colab 8 | 9 | Please check this [Getting Started](getting_started.md) file. 10 | 11 | ## Fine-tuning on Colab 12 | 13 | Please check this [Fine-tuning](finetune.md) file. -------------------------------------------------------------------------------- /kaggle/README.md: -------------------------------------------------------------------------------- 1 | # Fine-tuning on Kaggle 2 | 3 | ## Introduction 4 | This guide is a summary of my learnings and experiences from using Kaggle for AI competitions. Please note that some information may be subject to change and may not always be up-to-date. 5 | 6 | This repository is geared toward those looking to minimize costs, so no need to worry about GPU expenses. 7 | 8 | ## Getting Started with Kaggle 9 | 10 | Please check this [Getting Started](getting_started.md) file. 11 | 12 | ## Fine-tuning on Kaggle 13 | 14 | Please check this [Fine-tuning](finetune.md) file. -------------------------------------------------------------------------------- /scripts/cv/README.md: -------------------------------------------------------------------------------- 1 | # Computer Vision Fine-Tuning 2 | 3 | ## Introduction 4 | 5 | This document summarizes my learnings and experiences with fine-tuning computer vision models. The goal is to create a concise, revisitable resource that simplifies understanding and implementation of fine-tuning techniques for computer vision tasks. 6 | 7 | Currently, the focus is on CNN-based models. However, I plan to expand this to include other architectures in the future. 
8 | 9 | ## Frameworks 10 | 11 | I primarily use two frameworks for fine-tuning CNN-based models: 12 | 13 | - **TensorFlow** 14 | - **PyTorch** 15 | 16 | Detailed step-by-step guides for fine-tuning and transfer learning using these frameworks are available in the following notebooks: 17 | 18 | - [TensorFlow Guide](tensorflow.ipynb) 19 | - [PyTorch Guide](torch.ipynb) 20 | 21 | ## YOLO 22 | 23 | You can check the [Ultralytics notebook](ultralytics.ipynb) for more information about fine-tuning YOLO models using both the Python library and the command line. 24 | 25 | --- 26 | 27 | This repository is continuously updated to include new learnings and frameworks as I progress in my understanding of CV fine-tuning. Feedback and contributions are welcome! -------------------------------------------------------------------------------- /others/README.md: -------------------------------------------------------------------------------- 1 | # LLM fine-tuning 2 | 3 | ## Introduction 4 | 5 | This document summarizes my learnings and experiences with fine-tuning Large Language Models (LLMs). The goal is to create a concise, revisitable resource that simplifies understanding and implementation of fine-tuning techniques for LLMs. 6 | 7 | Currently, I only summarize and note down fine-tuning techniques for LLMs without detailed implementations. 8 | 9 | ## Papers 10 | 11 | You can check this awesome paper/book [The Ultimate Guide to Fine-Tuning LLMs from Basics to Breakthroughs](papers/UltimateGuideFromBasicsToBreakthrough) to get more information about fine-tuning techniques. 12 | Also check [Instruction Tuning Survey](papers/InstructionTuningSurvey) to understand more about instruction tuning. 13 | 14 | ## Blogs 15 | 16 | + [X] [Insights From 100 Experiments](blogs/lora/insights_100_experiments.md) - Finetuning LLMs with LoRA and QLoRA: Insights from Hundreds of Experiments 17 | + [ ] [Advanced Guide to training a LoRA](blogs/lora/advancd_guide_lora.md) - Essential to Advanced Guide to training a LoRA 18 | + [ ] [Efficient Training on a Single GPU](blogs/efficient_training_huggingface.md) - Methods and tools for efficient training on a single GPU 19 | -------------------------------------------------------------------------------- /kaggle/finetune.md: -------------------------------------------------------------------------------- 1 | # How to effectively fine-tune small AI models using Kaggle. 2 | 3 | ## Load Data and Workflow 4 | - Kaggle has a fast data upload speed. 5 | - You should zip the data before uploading to save time. 6 | - For weights/outputs after training, I recommend saving them in the output folder (as a notebook output or as a dataset) and reusing them in the next session. This will save time and avoid re-uploading. 7 | 8 | ## Fine-tuning Strategy on Kaggle 9 | Make sure you don't spend your limited GPU resources on testing scripts (use Colab for that). 10 | ### Large Models 11 | - For large models, consider fine-tuning one epoch at a time. 12 | - This approach helps conserve resources, allows you to monitor the training process, and reduces the risk of overfitting. 13 | - Kaggle supports output workflows, so you can save the weights and optimizer state in the output folder and reuse them in the next session without re-uploading. 14 | ### Small Models 15 | - For small models, I recommend preparing a well-written fine-tuning script that can run for many epochs in one go. 16 | - You should apply early stopping and save the weights after each epoch, as in the sketch below.
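A minimal PyTorch-style sketch of that loop, assuming you already have `model`, `optimizer`, `num_epochs`, the data loaders, and your own `train_one_epoch`/`evaluate` helpers; the patience value and file names are placeholders to adjust for your setup:

```python
import torch

best_val_loss = float("inf")
patience, bad_epochs = 3, 0  # stop after 3 epochs without improvement

for epoch in range(num_epochs):
    train_one_epoch(model, train_loader, optimizer)  # your own training step
    val_loss = evaluate(model, val_loader)           # your own validation step

    # Save every epoch so progress survives a session cutoff;
    # files written to /kaggle/working show up in the notebook's Output tab.
    torch.save(
        {
            "epoch": epoch,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
            "val_loss": val_loss,
        },
        f"/kaggle/working/checkpoint_epoch_{epoch}.pt",
    )

    if val_loss < best_val_loss:
        best_val_loss, bad_epochs = val_loss, 0
        torch.save(model.state_dict(), "/kaggle/working/best_model.pt")
    else:
        bad_epochs += 1
        if bad_epochs >= patience:
            print(f"Early stopping at epoch {epoch}")
            break
```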
17 | 18 | After fine-tuning, you can download the weights and upload them to Colab for inference. (Optional) -------------------------------------------------------------------------------- /scripts/llm/README.md: -------------------------------------------------------------------------------- 1 | # LLM Fine-Tuning 2 | 3 | ## Introduction 4 | 5 | This document summarizes my learnings and experiences with fine-tuning Large Language Models (LLMs). The goal is to create a concise, revisitable resource that simplifies understanding and implementation of fine-tuning techniques for various NLP tasks. 6 | 7 | Currently, the focus is on transformer-based models like GPT and BERT. However, I plan to expand this to include other architectures and advanced techniques in the future. 8 | 9 | ## Frameworks 10 | 11 | I primarily use the frameworks below for fine-tuning LLMs: 12 | 13 | - [**Hugging Face Transformers**](huggingface.md) 14 | 15 | Detailed step-by-step guides for fine-tuning LLMs using these frameworks are available in the following notebooks: 16 | 17 | - [Hugging Face Guide](huggingface.ipynb) 18 | 19 | ## LoRA (Low-Rank Adaptation) 20 | 21 | For lightweight and efficient fine-tuning, I explore LoRA techniques. You can check the [LoRA Guide](lora.ipynb) to understand how to implement LoRA with Hugging Face models. 22 | 23 | --- 24 | 25 | This repository is continuously updated to include new learnings and frameworks as I progress in my understanding of LLM fine-tuning. Feedback and contributions are welcome! 26 | -------------------------------------------------------------------------------- /colab/finetune.md: -------------------------------------------------------------------------------- 1 | # How to effectively fine-tune AI models using Colab. 2 | 3 | ## Load Data 4 | - Uploading data to Colab takes a lot of time, so always start the upload first. 5 | - While the upload is running, you can prepare the script and environment. 6 | 7 | ## Find scripts for fine-tuning 8 | - Many fine-tuning scripts tailored for Colab are available online. 9 | - If your model is hosted on Hugging Face, you can often find Colab-compatible scripts directly on the model's page. 10 | - Numerous fine-tuning scripts can be found on platforms like Medium and GitHub. However, these scripts often require adjustments to address compatibility issues with specific Python library versions. 11 | 12 | ## Fine-tuning Strategy on Colab 13 | ### Large Models 14 | - Colab has a daily GPU usage limit (instead of a weekly one like Kaggle), so it's better to use it for debugging and testing scripts: 15 | + Check for library version collisions. 16 | + Adjust the script to run successfully in one go. 17 | + You can run it with a small dataset to estimate the time and resources needed. 18 | ### Small Models 19 | - Colab is particularly suitable for fine-tuning smaller models or running inference scripts for larger models. 20 | - You can fine-tune smaller models entirely on Colab. 21 | - However, time limits may prevent many epochs of training, so you may need to save model weights after each epoch to prevent progress loss (see the sketch below). 22 | - It is more effective to experiment with different learning rates and batch sizes for small models on Colab.
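A minimal sketch of that per-epoch checkpointing to Drive, assuming your own `model`, `optimizer`, `num_epochs`, data loader, and `train_one_epoch` helper; the Drive folder name is just an example:

```python
import os
import torch
from google.colab import drive

drive.mount("/content/drive")
save_dir = "/content/drive/My Drive/checkpoints"  # any folder in your Drive
os.makedirs(save_dir, exist_ok=True)

for epoch in range(num_epochs):
    train_one_epoch(model, train_loader, optimizer)  # your own training step
    # Writing to Drive each epoch means a disconnect only costs the current epoch.
    torch.save(
        {
            "epoch": epoch,
            "model_state_dict": model.state_dict(),
            "optimizer_state_dict": optimizer.state_dict(),
        },
        f"{save_dir}/checkpoint_epoch_{epoch}.pt",
    )
```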
-------------------------------------------------------------------------------- /others/papers/UltimateGuideFromBasicsToBreakthrough/README.md: -------------------------------------------------------------------------------- 1 | # The Ultimate Guide to Fine-Tuning LLMs from Basics to Breakthroughs: An Exhaustive Review of Technologies, Research, Best Practices, Applied Research Challenges and Opportunities 2 | 3 | **Source**: [original paper](https://arxiv.org/pdf/2408.13296v1) and many online sources 4 | 5 | **Note**: This is a summary of what I've learned and understood from the original papers. I've also included additional information collected from online sources. 6 | 7 | ## TL;DR 8 | + Chapter 1: 9 | - Type of LLMs: Unsupervised, supervised and instruction tuning. 10 | - RAG 11 | + Chapter 2: 12 |
15 | 16 | + Chapter 3: Data Preprocessing + Data Augmentation 17 | + Chapter 4: Model Architecture 18 |
21 | 22 | + Chapter 5: GPUs, hyperparameters, and optimizer and loss function for training. 23 | + Chapter 6: Fine-Tuning techniques: 24 | + Task/Domain specific fine-tuning 25 | + PEFT: Adapter, Lora, Qlora, Dora. 26 | + Half Fine-Tuning 27 | + MoE&MoA 28 | + Chapter 7: Evaluation Metrics 29 | 1. Set Up Evaluation Metrics 30 | 2. Interpret Training Loss Curve 31 | 3. Run Validation Loops 32 | 4. Monitor and Interpret Results 33 | 5. Hyperparameter Tuning and Adjustments 34 | + Chapter 11: Multimodal LLMs and their Fine-tuning -------------------------------------------------------------------------------- /colab/getting_started.md: -------------------------------------------------------------------------------- 1 | # Getting Started with Colab 2 | ## Create a new notebook 3 | 4 | + Access [Google Colab](https://colab.research.google.com/) 5 | + Sign in with your Google account 6 | + You can also upload your own `.ipynb` files 7 | + Go to **[File] > [Upload notebook]**. 8 | ![Upload notebook](image/upload.png) 9 | + Alternatively, upload an `.ipynb` file to your Drive and open it in Colab. 10 | 11 | ## Enable GPU in Colab 12 | + Go to **[Runtime] > [Change runtime type]** and select **T4 GPU**. 13 | 14 |
15 | ![View Version](image/gpu_enable_1.png) 16 | ![Version History](image/gpu_enable_2.png) 17 | 
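After switching the runtime, you can confirm that a GPU is actually attached with a quick check (either this cell or the `!nvidia-smi` command):

```python
import torch

if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))  # e.g. "Tesla T4"
else:
    print("No GPU detected - check the runtime type.")
```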
18 | 19 | ## Available Resources 20 | + **Disk Space**: 21 | + 78 GB when using a GPU. 22 | + 107 GB when using only the CPU. 23 | + **RAM**: 12.72 GB. 24 | + **GPU**: T4 (15GB memory). 25 | + **Session Duration**: Up to 12 hours but may disconnect earlier if idle. 26 | + **Daily Usage**: Generally around 12 hours of GPU usage, depending on load. 27 | + **Idle Timeout**: Disconnects after 90 minutes of inactivity. 28 | 29 | **Note**: From my experience, you can use a GPU for about 3–4 hours continuously. 30 | 31 | You can check resources in the top-right corner of the Colab notebook. For more details, click on the resource monitoring window. 32 | ![Resource view](image/resources.png) 33 | 34 | ## Working with Datasets 35 | + To use a dataset in your notebook: 36 | 1. Upload it to your Drive. 37 | 2. Mount your Drive to access the dataset: 38 | 39 | ```python 40 | from google.colab import drive 41 | drive.mount('/content/drive') 42 | ``` 43 | 44 | - Alternatively, use the Kaggle API to download datasets directly: 45 | 46 | ```bash 47 | !pip install kaggle 48 | !kaggle competitions download -c dataset_name 49 | ``` 50 | 51 | - Use the `!wget` command to download datasets directly into Colab: 52 | 53 | ```bash 54 | !wget https://www.example.com/dataset.zip 55 | ``` 56 | 57 | ## Saving and Accessing Outputs 58 | Save outputs temporarily in Colab's workspace or permanently in Google Drive: 59 | 60 | ```python 61 | model.save('/content/drive/My Drive/your_folder/model.h5') 62 | ``` 63 | -------------------------------------------------------------------------------- /scripts/llm/huggingface.md: -------------------------------------------------------------------------------- 1 | # Using HuggingFace 2 | Hugging Face is a leading AI platform and open-source community that simplifies working with state-of-the-art machine learning models. Known for its user-friendly libraries, it empowers developers and researchers to access pre-trained models, fine-tune them, and integrate them into applications with ease. 3 | 4 | ## Transformers Library 5 | + [Transformers](https://huggingface.co/transformers/) 6 | + Installation: 7 | ```bash 8 | pip install transformers 9 | ``` 10 | + Core components: 11 | + Models 12 | + Tokenizers 13 | + Trainer API 14 | + Datasets Integration 15 | ## Hugging Face Models 16 | + [Hugging Face Model Hub](https://huggingface.co/models) 17 | + Hugging Face have a vast collection of pre-trained models available for various NLP tasks. 18 | + You can filter task and access to up-to-date models. 19 | + You can download weight and configuration files for the model you need. 20 | + They can be easily loaded and utilized using the `transformers` library. 21 | Example usage: 22 | ```python 23 | from transformers import AutoModelForSequenceClassification, AutoTokenizer 24 | 25 | model_name = "bert-base-uncased" 26 | model = AutoModelForSequenceClassification.from_pretrained(model_name) 27 | tokenizer = AutoTokenizer.from_pretrained(model_name) 28 | 29 | inputs = tokenizer("I love Hugging Face!", return_tensors="pt") 30 | outputs = model(**inputs) 31 | ``` 32 | ## Tokenizers 33 | + Essential for preparing input data for transformer models. 34 | + Features: 35 | + Subword tokenization. 36 | + Batch encoding with padding and truncation. 
37 | ```python 38 | from transformers import AutoTokenizer 39 | 40 | tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased") 41 | tokens = tokenizer("Transformers are powerful!", return_tensors="pt") 42 | print(tokens) 43 | ``` 44 | ## Datasets 45 | The library seamlessly integrates with Hugging Face's Datasets library for loading and processing data. 46 | ## Trainer API 47 | Simplifies fine-tuning and evaluation of models. 48 | ```python 49 | from transformers import Trainer, TrainingArguments 50 | 51 | training_args = TrainingArguments( 52 | output_dir="./results", 53 | num_train_epochs=3, 54 | per_device_train_batch_size=8, 55 | ) 56 | trainer = Trainer( 57 | model=model, 58 | args=training_args, 59 | train_dataset=train_dataset, 60 | eval_dataset=eval_dataset, 61 | ) 62 | trainer.train() 63 | ``` 64 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # FinetuneAI_Learning 2 | 3 | How to fine-tune AI models for undergraduates and beginners (with free GPU resources). 4 | 5 | ## Introduction 6 | 7 | This repository serves as a record of my learning process and the insights I've gained while studying the fine-tuning of AI models. The aim is to create a comprehensive resource that is easy to revisit and can help others understand and implement fine-tuning techniques. 8 | Especially, this aims to help no-local-GPU users to fine-tune AI models effectively with free GPU resources. 9 | 10 | --- 11 | 12 | ## Free GPU Resources 13 | 14 | In order to train AI models effectively, GPU is essential. However, not everyone has access to a local GPU or the resources to rent one. 15 | 16 | This section provides an overview of free GPU resources available on platforms like Kaggle and Google Colab, along with tips on how to manage these resources efficiently. 17 | 18 | ### Platforms 19 | The two most popular platforms that offer free GPU resources are [Kaggle](https://www.kaggle.com/) and [Google Colab](https://colab.research.google.com/). 20 | 21 | ### Getting Started 22 | - You can check my file: [Kaggle](kaggle/getting_started.md) or learn from the original guide. 23 | - You can check my file: [Google Colab](colab/getting_started.md) or learn from the original guide. 24 | 25 | ### Effective Resource Management 26 | - Although these platforms offer free GPU resources, they come with limitations (time, VRAM, etc.) 27 | - It's important to manage these resources efficiently to avoid interruptions during training. 28 | - Please check these below files for detailed information: 29 | - [Use Kaggle for fine-tuning](kaggle/finetune.md) 30 | - [Use Colab for fine-tuning](colab/finetune.md) 31 | 32 | - **TL;DR**: 33 | - [Kaggle](kaggle): 34 | - 30 hours weekly. 35 | - Fast data/weight uploads and workflows. 36 | - 1x 16 GB VRAM GPU or 2x 15 GB VRAM GPUs. (Tesla P100 or 2x T4). 37 | - [Colab](colab): 38 | - About 3 hours daily. 39 | - Many example scripts available. 40 | - 1x 16 GB VRAM GPU. (T4) 41 | - Effective mix: 42 | - Load data on both platforms. 43 | - Find avaibale online scripts (usually written for Colab). 44 | - Stablize/Debug/Estimate time on Colab. 45 | - Convert the scripts to Kaggle. 46 | - Fine-tune models on Kaggle. 47 | - For large models: Run per epoch and use Kaggle advantage for output workflows ([MUST READ](kaggle/getting_started.md#run-all--recurrent-workflow)). 48 | - For small models: Use Colab to experiment (hyperparameter tuning, etc.) 
and fine-tune in one go on Kaggle. 49 | - Download weight from Kaggle, upload to Colab and inference (Optional). 50 | 51 | --- 52 | 53 | ## Script samples for fine-tuning 54 | 55 | Another problems besides the resource management is that there are not many scripts available for fine-tuning AI models. (You can find many scripts for inference, but not for fine-tuning from my experience.) 56 | 57 | This section provides some fine-tune scripts for some AI tasks. 58 | ### Computer vision tasks 59 | 60 | Please check the [Computer vision script](scripts/cv/README.md) folder for detailed information. 61 | 62 | **TL;DR**: I provide scripts for fine-tuning with Torch and TF, and YOLO with Ultralytics. 63 | 64 | ### LLM tasks 65 | 66 | Please check the [LLM script](scripts/llm/README.md) folder for detailed information. 67 | 68 | **TL;DR**: I provide scripts for fine-tuning with Hugging Face. 69 | 70 | --- 71 | ## Others 72 | 73 | Beside the above sections, I also provide some other useful information for fine-tuning AI models that I have learned from reading cool papers, books, and blogs. 74 | 75 | Please check the [Others folder](others/README.md) for detailed information. 76 | 77 | --- 78 | 79 | This repository will continue to grow as I learn more about fine-tuning AI models. Feel free to explore, learn, and contribute! 🚀 80 | -------------------------------------------------------------------------------- /scripts/llm/lora.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# LoRA Fine-tuning with peft" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Load libraries" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import torch\n", 24 | "from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer\n", 25 | "from datasets import load_dataset\n", 26 | "from peft import LoraConfig, get_peft_model, TaskType" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "## Load Model and Tokenizer" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "model_name = \"microsoft/deberta-v3-base\" # Example 1B param model (replace as needed)\n", 43 | "tokenizer = AutoTokenizer.from_pretrained(model_name)\n", 44 | "model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2) # num_labels should align with your dataset" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": {}, 50 | "source": [ 51 | "## Load and Prepare Dataset\n" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": null, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "dataset = load_dataset(\"glue\", \"mrpc\") # Example dataset (replace as needed)\n", 61 | "def tokenize_function(examples):\n", 62 | " return tokenizer(examples[\"sentence1\"], examples[\"sentence2\"], padding=\"max_length\", truncation=True)\n", 63 | "tokenized_datasets = dataset.map(tokenize_function, batched=True)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "## Define LoRA Configuration\n" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | 
"source": [ 79 | "peft_config = LoraConfig(\n", 80 | " task_type=TaskType.SEQ_CLS, # Specify your task type\n", 81 | " inference_mode=False,\n", 82 | " r=8, # Rank of LoRA matrices (common values: 8, 16, 32)\n", 83 | " lora_alpha=32, # Scaling factor for LoRA updates\n", 84 | " lora_dropout=0.1, # Dropout for LoRA layers\n", 85 | " target_modules=[\"query\", \"value\"] # option to choose which modules to apply LoRA, like query, key, value, dense\n", 86 | ")" 87 | ] 88 | }, 89 | { 90 | "cell_type": "markdown", 91 | "metadata": {}, 92 | "source": [ 93 | "## Peft model" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "model = get_peft_model(model, peft_config)\n", 103 | "model.print_trainable_parameters()" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "## Training arguments" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "training_args = TrainingArguments(\n", 120 | " output_dir=\"./lora_finetuned\",\n", 121 | " learning_rate=2e-5,\n", 122 | " per_device_train_batch_size=8,\n", 123 | " per_device_eval_batch_size=8,\n", 124 | " num_train_epochs=3,\n", 125 | " weight_decay=0.01,\n", 126 | " evaluation_strategy=\"epoch\",\n", 127 | " save_strategy=\"epoch\",\n", 128 | " load_best_model_at_end=True,\n", 129 | ")" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": {}, 135 | "source": [ 136 | "## Define Trainer and train" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "trainer = Trainer(\n", 146 | " model=model,\n", 147 | " args=training_args,\n", 148 | " train_dataset=tokenized_datasets[\"train\"],\n", 149 | " eval_dataset=tokenized_datasets[\"validation\"],\n", 150 | ")\n", 151 | "\n", 152 | "trainer.train()\n" 153 | ] 154 | } 155 | ], 156 | "metadata": { 157 | "kernelspec": { 158 | "display_name": "base", 159 | "language": "python", 160 | "name": "python3" 161 | }, 162 | "language_info": { 163 | "name": "python", 164 | "version": "3.11.9" 165 | } 166 | }, 167 | "nbformat": 4, 168 | "nbformat_minor": 2 169 | } 170 | -------------------------------------------------------------------------------- /others/papers/UltimateGuideFromBasicsToBreakthrough/Chapter4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "vscode": { 7 | "languageId": "plaintext" 8 | } 9 | }, 10 | "source": [ 11 | "# Chapter 4: Stage 2 - Model Initialisation" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "vscode": { 18 | "languageId": "plaintext" 19 | } 20 | }, 21 | "source": [ 22 | "## Steps Involved in Model Initialisation" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": { 28 | "vscode": { 29 | "languageId": "plaintext" 30 | } 31 | }, 32 | "source": [ 33 | "
\n", 34 | " \"\"\n", 35 | "
" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": { 41 | "vscode": { 42 | "languageId": "plaintext" 43 | } 44 | }, 45 | "source": [ 46 | "1. **Set Up the Environment**: Configure your environment, such as setting up GPU/TPU usage if available, which can significantly speed up model loading and inference.\n", 47 | "2. **Install the Dependencies**: Ensure that all necessary software and libraries are installed. This typically includes package managers like pip and frameworks like PyTorch or TensorFlow.\n", 48 | "3. **Import the Libraries**: Import the required libraries in your script or notebook. Common libraries include transformers from Hugging Face, torch for PyTorch, and other utility libraries.\n", 49 | "4. **Choose the Language Model**: Select the appropriate pre-trained language model based on your task requirements. This could be models like BERT, GPT-3, or others available on platforms like Hugging Face’s Model Hub.\n", 50 | "5. **Download the Model from the Repository**: Use the chosen framework’s functions to download the pre-trained model from an online repository. For instance, using transformers, you might use AutoModel.from_pretrained(’model_name’).\n", 51 | "6. **Load the Model in the Memory**: Load the model into memory, ready for inference or further fine-tuning. This step ensures the model weights are initialised and ready for use.\n", 52 | "7. **Execute Tasks**: Perform the desired tasks using the loaded model. This could involve making predictions, generating text, or fine-tuning the model on a new dataset." 53 | ] 54 | }, 55 | { 56 | "cell_type": "markdown", 57 | "metadata": { 58 | "vscode": { 59 | "languageId": "plaintext" 60 | } 61 | }, 62 | "source": [ 63 | "## Tools and Libraries for Model Initialisation" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": { 69 | "vscode": { 70 | "languageId": "plaintext" 71 | } 72 | }, 73 | "source": [ 74 | "1. Python Library: **HuggingFace**\n", 75 | "\n", 76 | "Description: HuggingFace is renowned for its support of numerous pre-trained large language models, ranging from Phi-3 mini to Llama-3 70B. The transformers library, part of HuggingFace, enables users to access these models via classes such as AutoModelForCausalLM. This library supports loading fine-tuned models as well as 4-bit quantised models. Additionally, the transformers library includes the ”pipeline” feature, making it easy to use pre-trained models for various tasks.\n", 77 | "\n", 78 | "2. Python Framework: **PyTorch**\n", 79 | "\n", 80 | "Description: PyTorch offers comprehensive tools and libraries for Initialising and fine-tuning large language models. It provides a flexible and efficient platform for building and deploying deep learning models. HuggingFace’s transformers library bridges the gap between PyTorch and other frameworks, enhancing its usability for state-of-the-art language models.\n", 81 | "\n", 82 | "3. Python Framework: **TensorFlow**\n", 83 | "\n", 84 | "Description: TensorFlow also provides extensive tools and libraries for Initialising and fine-tuning large language models. Similar to PyTorch, it benefits from the HuggingFace transformers library, which provides a versatile and user-friendly API and interface for working with the latest advancements in large language models." 
85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": { 90 | "vscode": { 91 | "languageId": "plaintext" 92 | } 93 | }, 94 | "source": [ 95 | "## Challenges in Model Initialisation" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": { 101 | "vscode": { 102 | "languageId": "plaintext" 103 | } 104 | }, 105 | "source": [ 106 | "
\n", 107 | " \"\"\n", 108 | "
" 109 | ] 110 | } 111 | ], 112 | "metadata": { 113 | "language_info": { 114 | "name": "python" 115 | } 116 | }, 117 | "nbformat": 4, 118 | "nbformat_minor": 2 119 | } 120 | -------------------------------------------------------------------------------- /kaggle/getting_started.md: -------------------------------------------------------------------------------- 1 | # Getting Started with Kaggle 2 | ## Setting Up Kaggle 3 | 4 | 1. **Create an Account**: Register for an account on Kaggle. 5 | 2. **Identity Verification**: 6 | - Go to **Profile > Settings**. 7 | - Complete **Phone Verification** by adding and verifying your phone number. 8 | - Complete **Identity Verification** by scanning your face. 9 | 10 | ## Creating a New Notebook 11 | 12 | - Create your first notebook by selecting **Create > New Notebook**. 13 | - Kaggle notebooks support Jupyter-style shortcuts and magic commands. 14 | - Notebooks save automatically, and you can upload your own `.ipynb` files. 15 | 16 | ![Upload notebook](image/upload_notebook.png) 17 | 18 | ## Enabling GPU in Kaggle 19 | To enable GPU in your Kaggle notebook, follow these steps: 20 | 21 | 1. **Verify Identity**: Ensure you have completed identity verification as described [above](#setting-up-kaggle). 22 | 23 | 2. **Enable GPU**: You have two options to activate the GPU in your notebook: 24 | 25 | - **Option 1**: 26 | - Open **Settings** in your notebook. 27 | - Under **Accelerator**, select **GPU**. 28 | 29 | ![Enable GPU in settings](image/enable_gpu_1.png) 30 | 31 | - **Option 2**: 32 | - Click the button in the lower right corner of your notebook interface. 33 | - In **Session Options**, go to **Accelerator** and select **GPU**. 34 | 35 |
36 | view version 37 | version history 38 |
39 | 40 | ## Available Resources 41 | 42 | - **Accelerator Options**: 43 | - 2x GPU T4 (15GB each), GPU P100 (16GB), TPU VM v3-8 44 | - 30 hours per week, reset every Saturday 45 | - **System Specs**: 46 | - Disk: HDD 58GB 47 | - RAM: 29GB 48 | - CPU: 4 cores 49 | - **Session Limit**: 12 hours per session 50 | 51 | Resource usage can be monitored in the top-right corner of the notebook. 52 | ![Resource view](image/resources.png) 53 | 54 | ## Working with Datasets 55 | 56 | - To use a dataset in your notebook, you first need to upload it to Kaggle. 57 | - Go to **Create > New Dataset** and upload your dataset. For large files, consider zipping them before uploading. 58 | - In your notebook, select **Add Input** to access your uploaded dataset. 59 | 60 | ![Add input](image/input.png) 61 | 62 | - Check **Your Work** and then **Datasets** to view your uploaded datasets. 63 | 64 | ![Select dataset](image/ticks.png) 65 | 66 | - Click the plus sign to add your dataset (ensure it has been successfully uploaded). 67 | 68 | - Once added, your dataset will be available in the `/kaggle/input` directory by default. 69 | 70 | ## Saving and Accessing Outputs 71 | 72 | - By default, any files or folders you download or create in the notebook are stored in `/kaggle/working` under **Output**. 73 | ![Output tab](image/output_tab.png) 74 | 75 | - To save your output, click **Save Version**. 76 | ![Save version](image/save.png) 77 | 78 | - Enter a version name, set the version type to **Quick Save**, and configure the save output setting to **Always save output when creating Quick Save**. 79 |
80 | ![Quick Save](image/quicksave.png) 81 | 
82 | 83 | - Wait for the save process to complete. 84 | ![Saving queue](image/queue.png) 85 | 86 | - Once completed, you can go to the **Output** tab, locate your files, and download them. 87 | ![Download output files](image/outputfile.png) 88 | 89 | - *Note*: You can directly download smaller output files from the output tab in your notebook. 90 | 91 | ## Run All + Recurrent Workflow (**MUST READ**) 92 | 93 | **Problem**: When using Colab, you need to download the output files (weights, logs, etc.) after each run, then reupload for the next run/training/inference. This can be time-consuming and inconvenient. 94 | 95 | **Solution**: Data (output) workflow in Kaggle is more efficient than Colab!!! 96 | 97 | **Details**: 98 | If you want to use the output of your notebook as the input for the same notebook, follow these steps: 99 | 100 | 1. **Save and Run All**: Ensure your notebook runs successfully by using the **Save and Run All (Commit)** option. This will execute the notebook and save the outputs in one step. Remember to turn on the GPU before running the notebook. 101 | 102 | ![a](image/getting_started_2025-01-27-00-25-11.png) 103 | 104 | 2. **Add Input**: After the notebook has run successfully, click **Add Input** and select **Your Work > Notebooks**. Choose the notebook you just ran. 105 | Here is an example of how the notebook will look after running and adding the input: 106 | 107 | ![a](image/getting_started_2025-01-27-00-28-01.png) 108 | 109 | 3. **Access Output Files**: You can access the output files directly without creating a new dataset each time. However, this method defaults to the latest version of the output files. 110 | 111 | 4. **Save as Dataset** (Alternative): 112 | You can also save the output files as a dataset by clicking **New Dataset** in the **Output** tab. This will create a reusable dataset with the saved outputs. 113 | 114 | ![](image/getting_started_2025-01-27-00-29-40.png) 115 | 116 | **NOTE**: This method is especially useful for recurrent workflows, such as training a model over multiple epochs or running multiple experiments with the same data. The time for saving outputs and notebooks is significantly reduced compared to Colab. 
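As a hedged sketch of this recurrent workflow (the notebook slug, file names, and training code below are placeholders for your own), each new version can resume from the checkpoint produced by the previous version:

```python
import os
import torch

# Output of the previous version, available after you add the notebook via **Add Input**.
prev_ckpt = "/kaggle/input/your-notebook-slug/checkpoint.pt"
# Anything written to /kaggle/working appears in this version's Output tab.
new_ckpt = "/kaggle/working/checkpoint.pt"

if os.path.exists(prev_ckpt):
    state = torch.load(prev_ckpt, map_location="cpu")
    model.load_state_dict(state["model_state_dict"])
    optimizer.load_state_dict(state["optimizer_state_dict"])
    start_epoch = state["epoch"] + 1
else:
    start_epoch = 0  # first run: start from scratch

# ... run your training loop from start_epoch ...

torch.save(
    {
        "epoch": start_epoch,  # replace with the last epoch you actually ran
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
    },
    new_ckpt,
)
```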
117 | -------------------------------------------------------------------------------- /scripts/llm/huggingface.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Using HuggingFace to finetune for LLM tasks" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Load libraries" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import torch\n", 24 | "from transformers import BertTokenizer, BertForSequenceClassification, BertModel\n", 25 | "from torch.utils.data import DataLoader, TensorDataset\n", 26 | "import torch.nn as nn" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "## Load the tokenizer and model" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "model_name = \"bert-base-uncased\"\n", 43 | "tokenizer = BertTokenizer.from_pretrained(model_name)\n", 44 | "model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2) # 2 for binary classification\n", 45 | "\n", 46 | "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", 47 | "model.to(device)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "## Tokenize " 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "inputs = tokenizer(texts, padding=True, truncation=True, return_tensors=\"pt\")\n", 64 | "labels = torch.tensor(labels)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": {}, 70 | "source": [ 71 | "## Dataset and DataLoader (PyTorch)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "dataset = TensorDataset(inputs[\"input_ids\"], inputs[\"attention_mask\"], labels)\n", 81 | "dataloader = DataLoader(dataset, batch_size=2)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "## Optimizer, loss function and scheduler" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)\n", 98 | "loss_fn = torch.nn.CrossEntropyLoss()\n", 99 | "scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "## Training loop" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [ 115 | "model.train()\n", 116 | "for epoch in range(3): # Example: 3 epochs\n", 117 | " for batch in dataloader:\n", 118 | " input_ids, attention_mask, labels = batch\n", 119 | " input_ids = input_ids.to(device)\n", 120 | " attention_mask = attention_mask.to(device)\n", 121 | " labels = labels.to(device)\n", 122 | "\n", 123 | " optimizer.zero_grad()\n", 124 | "\n", 125 | " outputs = model(input_ids, attention_mask=attention_mask, labels=labels)\n", 126 | " loss = outputs.loss\n", 127 | " # loss = loss_fn(outputs.logits, labels) # Alternative if not using the model's loss\n", 128 | "\n", 129 | " 
loss.backward()\n", 130 | " optimizer.step()\n", 131 | "\n", 132 | " print(f\"Epoch: {epoch+1}, Loss: {loss.item()}\")" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "# Special: Custom output model" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "class BertLSTMForSentimentAnalysis(nn.Module):\n", 149 | " def __init__(self, model_name, num_labels, lstm_hidden_size=256, lstm_layers=1, dropout_rate=0.1):\n", 150 | " super(BertLSTMForSentimentAnalysis, self).__init__()\n", 151 | "\n", 152 | " self.bert = BertModel.from_pretrained(model_name)\n", 153 | " self.lstm = nn.LSTM(\n", 154 | " input_size=self.bert.config.hidden_size,\n", 155 | " hidden_size=lstm_hidden_size,\n", 156 | " num_layers=lstm_layers,\n", 157 | " batch_first=True,\n", 158 | " bidirectional=False, # Set to True for bidirectional\n", 159 | " dropout=dropout_rate if lstm_layers > 1 else 0\n", 160 | " )\n", 161 | " self.dropout = nn.Dropout(dropout_rate)\n", 162 | " self.classifier = nn.Linear(lstm_hidden_size, num_labels)\n", 163 | "\n", 164 | " def forward(self, input_ids, attention_mask):\n", 165 | " # Get all hidden states from BERT\n", 166 | " outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)\n", 167 | " hidden_states = outputs.last_hidden_state # Use the last hidden state\n", 168 | "\n", 169 | " # Pass hidden states to LSTM\n", 170 | " lstm_output, (h_n, c_n) = self.lstm(hidden_states)\n", 171 | "\n", 172 | " # Use the last hidden state of the LSTM\n", 173 | " # If bidirectional, use torch.cat((h_n[-2,:,:], h_n[-1,:,:]), dim=1)\n", 174 | " lstm_final_hidden_state = h_n[-1]\n", 175 | "\n", 176 | " # Dropout and classification\n", 177 | " x = self.dropout(lstm_final_hidden_state)\n", 178 | " logits = self.classifier(x)\n", 179 | " return logits" 180 | ] 181 | } 182 | ], 183 | "metadata": { 184 | "kernelspec": { 185 | "display_name": "base", 186 | "language": "python", 187 | "name": "python3" 188 | }, 189 | "language_info": { 190 | "name": "python", 191 | "version": "3.11.9" 192 | } 193 | }, 194 | "nbformat": 4, 195 | "nbformat_minor": 2 196 | } 197 | -------------------------------------------------------------------------------- /others/papers/UltimateGuideFromBasicsToBreakthrough/Chapter2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "vscode": { 7 | "languageId": "plaintext" 8 | } 9 | }, 10 | "source": [ 11 | "# Chapter 2: Seven Stage Fine-Tuning Pipeline for LLM" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "vscode": { 18 | "languageId": "plaintext" 19 | } 20 | }, 21 | "source": [ 22 | " Fine-tuning a Large Language Model (LLM) is a comprehensive process divided into seven distinct\n", 23 | " stages, each essential for adapting the pre-trained model to specific tasks and ensuring optimal per\n", 24 | "formance. These stages encompass everything from initial dataset preparation to the final deployment\n", 25 | " and maintenance of the fine-tuned model. 
" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": { 31 | "vscode": { 32 | "languageId": "plaintext" 33 | } 34 | }, 35 | "source": [ 36 | " The seven stages include Dataset Preparation, Model Initialisation,\n", 37 | " Training Environment Setup, Fine-Tuning, Evaluation and Validation, Deployment, and Monitoring and\n", 38 | " Maintenance." 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": { 44 | "vscode": { 45 | "languageId": "plaintext" 46 | } 47 | }, 48 | "source": [ 49 | "
\n", 50 | " \"\"\n", 51 | "
" 52 | ] 53 | }, 54 | { 55 | "cell_type": "markdown", 56 | "metadata": { 57 | "vscode": { 58 | "languageId": "plaintext" 59 | } 60 | }, 61 | "source": [ 62 | "## Stage 1: Dataset Preparation" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": { 68 | "vscode": { 69 | "languageId": "plaintext" 70 | } 71 | }, 72 | "source": [ 73 | " Fine-tuning a Large Language Model (LLM) starts with adapting the pre-trained model for specific tasks\n", 74 | " by updating its parameters using a new dataset. This involves cleaning and formatting the dataset to\n", 75 | " match the target task, such as instruction tuning, sentiment analysis, or topic mapping." 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": { 81 | "vscode": { 82 | "languageId": "plaintext" 83 | } 84 | }, 85 | "source": [ 86 | " The dataset is\n", 87 | " composed of < input,output > pairs, demonstrating the desired behaviour for the model.\n", 88 | " For example, in instruction tuning, the dataset may look like" 89 | ] 90 | }, 91 | { 92 | "cell_type": "markdown", 93 | "metadata": { 94 | "vscode": { 95 | "languageId": "plaintext" 96 | } 97 | }, 98 | "source": [ 99 | "```\n", 100 | "###Human: $$\n", 101 | "###Assistant: $$\n", 102 | "```" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": { 108 | "vscode": { 109 | "languageId": "plaintext" 110 | } 111 | }, 112 | "source": [ 113 | "## Stage 2: Model Initialisation" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": { 119 | "vscode": { 120 | "languageId": "plaintext" 121 | } 122 | }, 123 | "source": [ 124 | "Model initialisation is the process of setting up the initial parameters and configurations of the LLM\n", 125 | " before training or deploying it. This step is crucial for ensuring the model performs optimally, trains\n", 126 | " efficiently, and avoids issues such as vanishing or exploding gradients." 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": { 132 | "vscode": { 133 | "languageId": "plaintext" 134 | } 135 | }, 136 | "source": [ 137 | "## Stage 3: Training Environment Setup" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": { 143 | "vscode": { 144 | "languageId": "plaintext" 145 | } 146 | }, 147 | "source": [ 148 | " Setting up the training environment for LLM fine-tuning involves configuring the necessary infrastructure\n", 149 | " to adapt a pre-existing model for specific tasks. " 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": { 155 | "vscode": { 156 | "languageId": "plaintext" 157 | } 158 | }, 159 | "source": [ 160 | "This includes selecting relevant training data, defining the\n", 161 | " model’s architecture and hyperparameters, and running training iterations to adjust the model’s weights\n", 162 | " and biases. " 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": { 168 | "vscode": { 169 | "languageId": "plaintext" 170 | } 171 | }, 172 | "source": [ 173 | "## Stage 4: Partial or Full Fine-Tuning" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": { 179 | "vscode": { 180 | "languageId": "plaintext" 181 | } 182 | }, 183 | "source": [ 184 | " This stage involves updating the parameters of the LLM using a task-specific dataset. 
Full fine-tuning up\n", 185 | "dates all parameters of the model, ensuring comprehensive adaptation to the new task.\n", 186 | "\n", 187 | " Alternatively, Half\n", 188 | " fine-tuning (HFT) or Parameter-Efficient Fine-Tuning approaches, such as using adapter\n", 189 | " layers, can be employed to partially fine-tune the model. " 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": { 195 | "vscode": { 196 | "languageId": "plaintext" 197 | } 198 | }, 199 | "source": [ 200 | "## Stage 5: Evaluation and Validation" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": { 206 | "vscode": { 207 | "languageId": "plaintext" 208 | } 209 | }, 210 | "source": [ 211 | " Evaluation and validation involve assessing the fine-tuned LLM’s performance on unseen data to ensure\n", 212 | " it generalises well and meets the desired objectives. " 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": { 218 | "vscode": { 219 | "languageId": "plaintext" 220 | } 221 | }, 222 | "source": [ 223 | " Evaluation metrics, such as cross-entropy, measure\n", 224 | " prediction errors, while validation monitors loss curves and other performance indicators to detect issues\n", 225 | " like overfitting or underfitting. " 226 | ] 227 | } 228 | ], 229 | "metadata": { 230 | "language_info": { 231 | "name": "python" 232 | } 233 | }, 234 | "nbformat": 4, 235 | "nbformat_minor": 2 236 | } 237 | -------------------------------------------------------------------------------- /scripts/cv/ultralytics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Finetune YOLO using Ultralytics" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": { 13 | "vscode": { 14 | "languageId": "plaintext" 15 | } 16 | }, 17 | "source": [ 18 | "## Installation" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "!pip install ultralytics" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": { 33 | "vscode": { 34 | "languageId": "plaintext" 35 | } 36 | }, 37 | "source": [ 38 | "## Datasets" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": { 44 | "vscode": { 45 | "languageId": "plaintext" 46 | } 47 | }, 48 | "source": [ 49 | "### Directory Structure" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": { 55 | "vscode": { 56 | "languageId": "plaintext" 57 | } 58 | }, 59 | "source": [ 60 | "```\n", 61 | "dataset/\n", 62 | "├── images/\n", 63 | "│ ├── train/\n", 64 | "│ └── val/\n", 65 | "└── labels/\n", 66 | " ├── train/\n", 67 | " └── val/\n", 68 | "```" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": { 74 | "vscode": { 75 | "languageId": "plaintext" 76 | } 77 | }, 78 | "source": [ 79 | "### Yaml file" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "```yaml\n", 87 | "path: ../dataset # dataset root dir\n", 88 | "train: images/train # train images (relative to 'path') \n", 89 | "val: images/val # val images (relative to 'path')\n", 90 | "test: # test images (optional)\n", 91 | "\n", 92 | "# Classes\n", 93 | "nc: 2 # number of classes\n", 94 | "names: ['class1', 'class2'] # class names\n", 95 | "```" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "### Label format" 103 | ] 104 | }, 105 | { 106 | 
"cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "```\n", 110 | " \n", 111 | "```" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "metadata": {}, 117 | "source": [ 118 | "+ ``: The index of the class (0 or 1 in your case).\n", 119 | "\n", 120 | "+ ``, ``, ``, ``: These are the bounding box coordinates, normalized by the image width and height (values between 0 and 1)." 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": { 126 | "vscode": { 127 | "languageId": "plaintext" 128 | } 129 | }, 130 | "source": [ 131 | "## Using Python" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "### Fine tuning" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "from ultralytics import YOLO\n", 148 | "\n", 149 | "# Load a pretrained model (recommended for training)\n", 150 | "model = YOLO('yolov8n.pt')\n", 151 | "\n", 152 | "# Train the model\n", 153 | "results = model.train(\n", 154 | " data = 'data.yaml',\n", 155 | " epochs = 100,\n", 156 | " batch = 32,\n", 157 | " imgsz = 1024,\n", 158 | " optimizer = 'Adam',\n", 159 | " cos_lr = True,\n", 160 | " dropout = 0.2,\n", 161 | " device = [0, 1]\n", 162 | ")" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "### Inference" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "# Creating best-weighted model\n", 179 | "bestModel = YOLO('/kaggle/working/runs/detect/train/weights/best.pt')\n", 180 | "# Making predictions on test set\n", 181 | "pred = bestModel.predict(\n", 182 | " source = '/kaggle/input/data/test',\n", 183 | " imgsz = 1024,\n", 184 | " conf = 0.65,\n", 185 | " iou = 0.65,\n", 186 | " save_txt = True,\n", 187 | " save_conf = True\n", 188 | ")" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "The results are saved to txts file in /kaggle/working/runs/detect/predict/labels" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "## Using command line" 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "### Fine tuning" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "```\n", 217 | "yolo task=detect mode=train model=yolov8n.pt args\n", 218 | " ↑ ↑ ↑ ↑\n", 219 | " what to do train/val pretrained arguments\n", 220 | " predict/export weights\n", 221 | "```" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "!yolo train model=yolov8n.pt data=data.yaml epochs=100 imgsz=640" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": {}, 236 | "source": [ 237 | "### Inference" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [ 246 | "!yolo mode=predict model=best.pt source=\"/kaggle/input/data/test\" conf=0.25" 247 | ] 248 | }, 249 | { 250 | "cell_type": "markdown", 251 | "metadata": {}, 252 | "source": [ 253 | "## Clone repository" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "### Prepare" 261 | ] 262 | }, 263 | { 264 | "cell_type": 
"code", 265 | "execution_count": null, 266 | "metadata": {}, 267 | "outputs": [], 268 | "source": [ 269 | "!git clone https://github.com/ultralytics/yolov5.git\n", 270 | "!cd yolov5\n", 271 | "!pip install -r requirements.txt" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "### Fine tuning" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "!python train.py --img 640 --batch 16 --epochs 30 --data data.yaml --weights yolov5s.pt --cache" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "### Inference" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "!python detect.py --weights runs/train/exp/weights/best.pt --img 640 --conf 0.25 --source /path/to/images_or_video" 304 | ] 305 | } 306 | ], 307 | "metadata": { 308 | "kernelspec": { 309 | "display_name": "base", 310 | "language": "python", 311 | "name": "python3" 312 | }, 313 | "language_info": { 314 | "codemirror_mode": { 315 | "name": "ipython", 316 | "version": 3 317 | }, 318 | "file_extension": ".py", 319 | "mimetype": "text/x-python", 320 | "name": "python", 321 | "nbconvert_exporter": "python", 322 | "pygments_lexer": "ipython3", 323 | "version": "3.11.9" 324 | } 325 | }, 326 | "nbformat": 4, 327 | "nbformat_minor": 2 328 | } 329 | -------------------------------------------------------------------------------- /scripts/cv/tensorflow.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Using Tensorflow to finetune for computer vision tasks" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": { 13 | "vscode": { 14 | "languageId": "plaintext" 15 | } 16 | }, 17 | "source": [ 18 | "## Datasets" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": { 24 | "vscode": { 25 | "languageId": "plaintext" 26 | } 27 | }, 28 | "source": [ 29 | "### Dataset Folder Structure\n" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": { 35 | "vscode": { 36 | "languageId": "plaintext" 37 | } 38 | }, 39 | "source": [ 40 | "```bash\n", 41 | "dataset/\n", 42 | "├── class_1/\n", 43 | "│ ├── image1.jpg\n", 44 | "│ ├── image2.jpg\n", 45 | "│ └── ...\n", 46 | "├── class_2/\n", 47 | "│ ├── image1.jpg\n", 48 | "│ ├── image2.jpg\n", 49 | "│ └── ...\n", 50 | "└── class_n/\n", 51 | " ├── image1.jpg\n", 52 | " ├── image2.jpg\n", 53 | " └── ...\n", 54 | "```" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": { 60 | "vscode": { 61 | "languageId": "plaintext" 62 | } 63 | }, 64 | "source": [ 65 | "### Splitting data" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "import tensorflow as tf\n", 75 | "\n", 76 | "image_size = (224, 224) \n", 77 | "batch_size = 32 \n", 78 | "validation_split = 0.2 \n", 79 | "\n", 80 | "train_ds = tf.keras.preprocessing.image_dataset_from_directory(\n", 81 | " 'dataset', # Path to the dataset folder\n", 82 | " validation_split=validation_split,\n", 83 | " subset=\"training\", # Training subset\n", 84 | " seed=123, # Set a random seed for reproducibility\n", 85 | " image_size=image_size, \n", 86 | " batch_size=batch_size \n", 87 | ")\n", 88 | "\n", 89 | "# Load 
validation data\n", 90 | "val_ds = tf.keras.preprocessing.image_dataset_from_directory(\n", 91 | " 'dataset', # Path to the dataset folder\n", 92 | " validation_split=validation_split,\n", 93 | " subset=\"validation\", # Validation subset\n", 94 | " seed=123, # Must be the same seed as above\n", 95 | " image_size=image_size, \n", 96 | " batch_size=batch_size \n", 97 | ")" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "## Pre-trained Model\n" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "from keras.applications import ResNet50\n", 114 | "\n", 115 | "# Load the base model\n", 116 | "base_model = ResNet50(\n", 117 | " weights='imagenet', # Load weights pre-trained on ImageNet\n", 118 | " include_top=False, # Exclude the fully connected layers\n", 119 | " input_shape=(224, 224, 3) # Input size for the model\n", 120 | ")" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "### Freeze model" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "base_model.trainable = False # Freeze the base model" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "base_model.trainable = True # Unfreeze the base model\n", 146 | "\n", 147 | "# Optionally, freeze some layers to fine-tune specific parts\n", 148 | "for layer in base_model.layers[:100]:\n", 149 | " layer.trainable = False" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "## Custom layers" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "from keras import layers, models\n", 166 | "\n", 167 | "\n", 168 | "# Add custom layers\n", 169 | "model = models.Sequential([\n", 170 | " base_model, # Pre-trained base\n", 171 | " layers.GlobalAveragePooling2D(), # Pooling layer to reduce dimensions\n", 172 | " layers.Dense(128, activation='relu'), # Fully connected layer\n", 173 | " layers.Dropout(0.5), # Dropout for regularization\n", 174 | " layers.Dense(10, activation='softmax') # Final layer for classification\n", 175 | "])" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": {}, 181 | "source": [ 182 | "## Training" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "model.compile(\n", 192 | " optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),\n", 193 | " loss='sparse_categorical_crossentropy',\n", 194 | " metrics=['accuracy']\n", 195 | ")\n", 196 | "history = model.fit(\n", 197 | " train_ds,\n", 198 | " validation_data=val_ds,\n", 199 | " epochs=10\n", 200 | ")\n", 201 | "model.summary() # Display the model's architecture" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "# Special: Image + metadata model" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "from keras.applications import ResNet50\n", 218 | "from keras.layers import Input, GlobalAveragePooling2D, Dense, Dropout, Concatenate\n", 219 | "from keras.models import Model\n", 
220 | "\n", 221 | "image_input = Input(shape=(224, 224, 3), name=\"image_input\")\n", 222 | "base_model = ResNet50(weights='imagenet', include_top=False, input_tensor=image_input)\n", 223 | "base_model.trainable = False \n", 224 | "\n", 225 | "image_features = base_model.output\n", 226 | "image_features = GlobalAveragePooling2D()(image_features)\n", 227 | "image_features = Dense(128, activation='relu')(image_features)\n", 228 | "image_features = Dropout(0.5)(image_features)\n", 229 | "\n", 230 | "metadata_input = Input(shape=(10,), name=\"metadata_input\") \n", 231 | "metadata_features = Dense(64, activation='relu')(metadata_input)\n", 232 | "metadata_features = Dropout(0.3)(metadata_features)\n", 233 | "metadata_features = Dense(32, activation='relu')(metadata_features)" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "combined_features = Concatenate()([image_features, metadata_features])\n", 243 | "\n", 244 | "x = Dense(128, activation='relu')(combined_features)\n", 245 | "x = Dropout(0.5)(x)\n", 246 | "output = Dense(10, activation='softmax', name=\"output_layer\")(x) \n" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": null, 252 | "metadata": {}, 253 | "outputs": [], 254 | "source": [ 255 | "# Define the multi-input model\n", 256 | "model = Model(inputs=[image_input, metadata_input], outputs=output)\n", 257 | "\n", 258 | "model.compile(\n", 259 | " optimizer='adam',\n", 260 | " loss='sparse_categorical_crossentropy',\n", 261 | " metrics=['accuracy']\n", 262 | ")\n", 263 | "\n", 264 | "model.summary()" 265 | ] 266 | } 267 | ], 268 | "metadata": { 269 | "kernelspec": { 270 | "display_name": "base", 271 | "language": "python", 272 | "name": "python3" 273 | }, 274 | "language_info": { 275 | "name": "python", 276 | "version": "3.11.9" 277 | } 278 | }, 279 | "nbformat": 4, 280 | "nbformat_minor": 2 281 | } 282 | -------------------------------------------------------------------------------- /others/papers/UltimateGuideFromBasicsToBreakthrough/Chapter11.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Chapter 11: Multimodal LLMs and their Fine-tuning" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": { 13 | "vscode": { 14 | "languageId": "plaintext" 15 | } 16 | }, 17 | "source": [ 18 | "
\n", 19 | " \"\"\n", 20 | "
" 21 | ] 22 | }, 23 | { 24 | "cell_type": "markdown", 25 | "metadata": { 26 | "vscode": { 27 | "languageId": "plaintext" 28 | } 29 | }, 30 | "source": [ 31 | "## Vision Language Model (VLMs)" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": { 37 | "vscode": { 38 | "languageId": "plaintext" 39 | } 40 | }, 41 | "source": [ 42 | " Vision language models encompass multimodal models capable of learning from both images and text\n", 43 | " inputs. \n", 44 | " \n", 45 | " They belong to the category of generative models that utilise image and text data to produce\n", 46 | " textual outputs. \n", 47 | "\n", 48 | " Certain advanced vision language models\n", 49 | " can also understand spatial attributes within images. " 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": { 55 | "vscode": { 56 | "languageId": "plaintext" 57 | } 58 | }, 59 | "source": [ 60 | "### Architecture" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": { 66 | "vscode": { 67 | "languageId": "plaintext" 68 | } 69 | }, 70 | "source": [ 71 | "Vision-language models adeptly integrate both visual and textual information, leveraging three fundamental components:\n", 72 | "\n", 73 | "+ Image Encoder: This component translates visual data (images) into a format that the model can process.\n", 74 | "+ Text Encoder: Similar to the image encoder, this component converts textual data (words and sentences) into a format the model can understand.\n", 75 | "+ Fusion Strategy: This component combines the information from both the image and text encoders, merging the two data types into a unified representation." 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": { 81 | "vscode": { 82 | "languageId": "plaintext" 83 | } 84 | }, 85 | "source": [ 86 | " These elements work collaboratively, with the model’s learning process (loss functions) specifically tai\n", 87 | "lored to the architecture and learning strategy employed. " 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": { 93 | "vscode": { 94 | "languageId": "plaintext" 95 | } 96 | }, 97 | "source": [ 98 | "### Constrative Learning" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": { 104 | "vscode": { 105 | "languageId": "plaintext" 106 | } 107 | }, 108 | "source": [ 109 | " Contrastive learning is a technique that focuses on understanding the differences between data points. It\n", 110 | " computes a similarity score between instances and aims to minimise contrastive loss, making it particu\n", 111 | "larly useful in semi-supervised learning where a limited number of labelled samples guide the optimisation\n", 112 | " process to classify unseen data points." 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": { 118 | "vscode": { 119 | "languageId": "plaintext" 120 | } 121 | }, 122 | "source": [ 123 | "**How it works**\n", 124 | "\n", 125 | "CLIP is a model that utilises contrastive learning to compute similarity between text and image embeddings through textual and visual encoders. It follows a three-step process for zero-shot predictions:\n", 126 | "\n", 127 | "+ Pre-training: Trains a text and image encoder to learn image-text pairs.\n", 128 | "+ Caption Conversion: Converts training dataset classes into captions.\n", 129 | "+ Zero-Shot Prediction: Estimates the best caption for a given input image based on learned similarities." 
130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": { 135 | "vscode": { 136 | "languageId": "plaintext" 137 | } 138 | }, 139 | "source": [ 140 | "### Fine-tuning of multimodal models" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | " LoRA and\n", 148 | " QLoRA can be utilised. " 149 | ] 150 | }, 151 | { 152 | "cell_type": "markdown", 153 | "metadata": { 154 | "vscode": { 155 | "languageId": "plaintext" 156 | } 157 | }, 158 | "source": [ 159 | "LLM-Adapters integrate various adapter\n", 160 | " modules into the pre-trained model’s architecture, enabling parameter-efficient fine-tuning for diverse\n", 161 | " tasks by updating only the adapter parameters while keeping the base model parameters fixed. " 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": { 167 | "vscode": { 168 | "languageId": "plaintext" 169 | } 170 | }, 171 | "source": [ 172 | "(IA)³,\n", 173 | " or Infused Adapters by Inhibiting and Amplifying Inner Activations, enhances performance by learn\n", 174 | "ing vectors to weight model parameters through activation multiplications, supporting robust few-shot\n", 175 | " performance and task mixing without manual adjustments." 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "metadata": { 181 | "vscode": { 182 | "languageId": "plaintext" 183 | } 184 | }, 185 | "source": [ 186 | "Dynamic adaptation techniques like DyLoRA allow for the training of low-rank adaptation blocks across different ranks, optimising\n", 187 | " the learning process by sorting the representations during training." 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": { 193 | "vscode": { 194 | "languageId": "plaintext" 195 | } 196 | }, 197 | "source": [ 198 | " LoRA-FA, a variant of LoRA, optimises the fine-tuning process by freezing the first low-rank matrix after initialisation and using it as a\n", 199 | " random projection while training the other, thereby reducing the number of parameters by half without\n", 200 | " compromising performance." 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": { 206 | "vscode": { 207 | "languageId": "plaintext" 208 | } 209 | }, 210 | "source": [ 211 | "The Efficient Attention Skipping (EAS) module introduces a novel parameter and computation\n", 212 | "efficient tuning method for MLLMs, aiming to maintain high performance while reducing parameter and\n", 213 | " computation costs for downstream tasks. " 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": { 219 | "vscode": { 220 | "languageId": "plaintext" 221 | } 222 | }, 223 | "source": [ 224 | "MemVP integrates visual prompts\n", 225 | " with the weights of Feed Forward Networks, thereby injecting visual knowledge to decrease training time\n", 226 | " and inference latency, ultimately outperforming previous PEFT methods." 
227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": { 232 | "vscode": { 233 | "languageId": "plaintext" 234 | } 235 | }, 236 | "source": [ 237 | "### Full-parameter Fine-Tuning" 238 | ] 239 | }, 240 | { 241 | "cell_type": "markdown", 242 | "metadata": { 243 | "vscode": { 244 | "languageId": "plaintext" 245 | } 246 | }, 247 | "source": [ 248 | "Methods such as those introduced by LOMO and MeZO provide alternative solutions by focusing\n", 249 | " on memory efficiency:\n", 250 | " + LOMO utilises a low-memory optimisation technique derived from Stochastic\n", 251 | " Gradient Descent (SGD), reducing memory consumption typically associated with the ADAM optimiser.\n", 252 | " \n", 253 | " + MeZO, on the other hand, offers a memory-efficient optimiser that requires only two forward passes\n", 254 | " to compute gradients, enabling comprehensive fine-tuning of large models with a memory footprint\n", 255 | " equivalent to inference" 256 | ] 257 | } 258 | ], 259 | "metadata": { 260 | "language_info": { 261 | "name": "python" 262 | } 263 | }, 264 | "nbformat": 4, 265 | "nbformat_minor": 2 266 | } 267 | -------------------------------------------------------------------------------- /others/papers/UltimateGuideFromBasicsToBreakthrough/Chapter5.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "vscode": { 7 | "languageId": "plaintext" 8 | } 9 | }, 10 | "source": [ 11 | "# Chapter 5: Stage 3: Training Setup" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "vscode": { 18 | "languageId": "plaintext" 19 | } 20 | }, 21 | "source": [ 22 | "## Steps Involved in Training Setup" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": { 28 | "vscode": { 29 | "languageId": "plaintext" 30 | } 31 | }, 32 | "source": [ 33 | "+ Setting up the training environment\n", 34 | "+ Defining the Hyper-parameters\n", 35 | "+ Initialising Optimisers and Loss Functions" 36 | ] 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "metadata": { 41 | "vscode": { 42 | "languageId": "plaintext" 43 | } 44 | }, 45 | "source": [ 46 | "## Setting up Training Environment" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": { 52 | "vscode": { 53 | "languageId": "plaintext" 54 | } 55 | }, 56 | "source": [ 57 | "When fine-tuning a large language model (LLM), the computational environment plays a crucial role in\n", 58 | " ensuring efficient training. To achieve optimal performance, it’s essential to configure the environment\n", 59 | " with high-performance hardware such as GPUs (Graphics Processing Units) or TPUs (Tensor Processing\n", 60 | " Units). " 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": { 66 | "vscode": { 67 | "languageId": "plaintext" 68 | } 69 | }, 70 | "source": [ 71 | "First, ensure that your system or cloud environment has the necessary hardware installed. For GPUs,\n", 72 | " this involves setting up CUDA1 (Compute Unified Device Architecture) and cuDNN2 (CUDA Deep Neu\n", 73 | "ral Network library) from NVIDIA, which are essential for enabling GPU acceleration.\n", 74 | "\n", 75 | " For TPU usage,\n", 76 | " you would typically set up a Google Cloud environment with TPU instances, which includes configuring\n", 77 | " the TPU runtime in your training scripts." 
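, "\n", "A quick sanity check (a sketch assuming PyTorch) that the GPU stack is visible before launching a long fine-tuning run:\n", "\n", "```python\n", "import torch\n", "\n", "print(torch.cuda.is_available())  # True if the CUDA driver and toolkit are set up correctly\n", "print(torch.cuda.device_count())  # number of visible GPUs\n", "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", "```"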
78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": { 83 | "vscode": { 84 | "languageId": "plaintext" 85 | } 86 | }, 87 | "source": [ 88 | "Additionally, use libraries like Hugging Face’s transformers to simplify the process of loading pre-trained\n", 89 | " models and tokenizers. This library is particularly well-suited for working with various LLMs and offers\n", 90 | " a user-friendly interface for model fine-tuning. Ensure that all software components, including libraries\n", 91 | " and dependencies, are compatible with your chosen framework and hardware setup." 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": { 97 | "vscode": { 98 | "languageId": "plaintext" 99 | } 100 | }, 101 | "source": [ 102 | "On the hardware side, consider the memory requirements of the model and your dataset. LLMs typically require substantial GPU memory, so opting for GPUs with higher VRAM (e.g., 16GB or more)\n", 103 | " can be beneficial. If your model is exceptionally large or if you are training with very large datasets,\n", 104 | " distributed training across multiple GPUs or TPUs might be necessary. This requires a careful setup of\n", 105 | " data parallelism or model parallelism techniques to efficiently utilise the available hardware" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": { 111 | "vscode": { 112 | "languageId": "plaintext" 113 | } 114 | }, 115 | "source": [ 116 | "## Defining Hyperparameters" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": { 122 | "vscode": { 123 | "languageId": "plaintext" 124 | } 125 | }, 126 | "source": [ 127 | "+ Learning Rate\n", 128 | "+ Batch Size\n", 129 | "+ Epochs" 130 | ] 131 | }, 132 | { 133 | "cell_type": "markdown", 134 | "metadata": { 135 | "vscode": { 136 | "languageId": "plaintext" 137 | } 138 | }, 139 | "source": [ 140 | "### Methods for Hyperparameter Tuning" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": { 146 | "vscode": { 147 | "languageId": "plaintext" 148 | } 149 | }, 150 | "source": [ 151 | " LLM hyperparameter tuning involves adjusting various hyperparameters during the training process\n", 152 | " to identify the optimal combination that yields the best output. " 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": { 158 | "vscode": { 159 | "languageId": "plaintext" 160 | } 161 | }, 162 | "source": [ 163 | "1. Random Search\n", 164 | "2. Grid Search\n", 165 | "3. Bayesian Optimisation\n", 166 | "4. 
Automated hyperparameter tuning" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": { 172 | "vscode": { 173 | "languageId": "plaintext" 174 | } 175 | }, 176 | "source": [ 177 | "## Initialising Optimisers and Loss Functions" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": { 183 | "vscode": { 184 | "languageId": "plaintext" 185 | } 186 | }, 187 | "source": [ 188 | "+ Gradient Descent\n", 189 | "+ Stochastic Gradient Descent\n", 190 | "+ Mini-batch Gradient Descent\n", 191 | "+ AdaGrad\n", 192 | "+ RMSprop\n", 193 | "+ AdaDelta\n", 194 | "+ Adam\n", 195 | "+ AdamW" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": { 201 | "vscode": { 202 | "languageId": "plaintext" 203 | } 204 | }, 205 | "source": [ 206 | "## Best Practices" 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "metadata": { 212 | "vscode": { 213 | "languageId": "plaintext" 214 | } 215 | }, 216 | "source": [ 217 | "+ Optimal Learning Rate: Use a lower learning rate, typically between 1e-4 to 2e-4, to ensure stable convergence. A learning rate schedule, such as learning rate warm-up followed by a linear decay, can also be beneficial. \n", 218 | "\n", 219 | "+ Batch Size Considerations: Opt for a batch size that balances memory constraints and training efficiency. Smaller batch sizes can help in achieving faster convergence but may require more frequent updates. Conversely, larger batch sizes can be more memory-intensive but may lead to more stable updates. \n", 220 | "\n", 221 | "+ Save Checkpoints Regularly: Regularly save model weights at various intervals across 5-8 epochs to capture optimal performance without overfitting. Implement early stopping mechanisms to halt training once the model performance starts to degrade on the validation set, thereby preventing overfitting.\n", 222 | "\n", 223 | "+ Hyperparameter Tuning: Utilise hyperparameter tuning methods like grid search, random search, and Bayesian optimisation to find the optimal set of hyperparameters. Tools such as Optuna, Hyperopt, and Ray Tune can automate this process and help in efficiently exploring the hyperparameter space.\n", 224 | "\n", 225 | "+ Data Parallelism and Model Parallelism: For large-scale training, consider using data parallelism or model parallelism techniques to distribute the training workload across multiple GPUs or TPUs. (Horovod and DeepSpeed)\n", 226 | "\n", 227 | "+ Regular Monitoring and Logging: Implement robust monitoring and logging to track training metrics, resource usage, and potential bottlenecks. Tools like TensorBoard, Weights & Biases, and MLflow can provide real-time insights into the training process, allowing for timely interventions and adjustments.\n", 228 | "\n", 229 | "+ Handling Overfitting and Underfitting: Ensure that your model generalises well by implementing techniques to handle overfitting and underfitting. regularisation techniques such as L2 regularisation, dropout, and data augmentation can help prevent overfitting. Conversely, if your model is underfitting, consider increasing the model complexity or training for more epochs.\n", 230 | "\n", 231 | "+ Use Mixed Precision Training: Mixed precision training involves using both 16-bit and 32-bit floating-point types to reduce memory usage and increase computational efficiency. \n", 232 | "\n", 233 | "+ Evaluate and Iterate: Continuously evaluate the model performance using a separate validation set and iterate on the training process based on the results. 
Regularly update your training data and retrain the model to keep it current with new data trends and patterns.\n", 234 | "\n", 235 | "+ Documentation and Reproducibility: Maintain thorough documentation of your training setup, including the hardware configuration, software environment, and hyperparameters used. Ensure reproducibility by setting random seeds and providing detailed records of the training process. " 236 | ] 237 | } 238 | ], 239 | "metadata": { 240 | "language_info": { 241 | "name": "python" 242 | } 243 | }, 244 | "nbformat": 4, 245 | "nbformat_minor": 2 246 | } 247 | -------------------------------------------------------------------------------- /others/blogs/lora/insights_100_experiments.md: -------------------------------------------------------------------------------- 1 | # Finetuning LLMs with LoRA and QLoRA: Insights from Hundreds of Experiments 2 | Link: [here](https://lightning.ai/pages/community/lora-insights/#toc1) 3 | 4 | ## Introduction: Getting the Most out of LoRA 5 | **Goal**: To provide practical insights for those interested in applying LoRA for fine-tuning LLMs. 6 | 7 | * The article aims to address questions about the value of QLoRA, whether to replace AdamW with SGD, the use of learning rate schedulers, and LoRA hyperparameter adjustments. 8 | 9 | ## Evaluation Tasks and Dataset 10 | 11 | **TruthfulQA Scoring**: 12 | 13 | * The TruthfulQA benchmark reports two scores: 14 | * **MC1 (Single-true)**: Measures simple accuracy by selecting the most likely answer from 4-5 choices. 15 | * **MC2 (Multi-true)**: Measures the normalized total probability assigned to a set of true answers given a question. 16 | * For reference, the 175B GPT-3 model has TruthfulQA MC1 and MC2 values of 0.21 and 0.33, respectively. 17 | 18 | **Arithmetic Task Examples**: 19 | 20 | * **Arithmetic 2ds**: Example: "What is 59 minus 38?". Expected answer: "21". 21 | * **Arithmetic 4ds**: Example: “What is 2762 plus 2751”. Expected answer: "5513". 22 | 23 | **Training Dataset: Alpaca Dataset**: 24 | 25 | * The Alpaca dataset was used for supervised instruction finetuning. 26 | * It consists of approximately **50k instruction-response pairs**. 27 | * The median length of the input size is **110 tokens** using the Llama 2 SentencePiece tokenizer. 28 | 29 | ![a](image/insights_100_experiments_2025-01-27-02-58-20.png) 30 | 31 | ## Code Framework 32 | 33 | **Lit-GPT Repository**: 34 | 35 | * The custom LLM finetuning code used for the experiments is based on the open-source Lit-GPT repository. 36 | * A more detailed guide can be found in the Lit-GPT tutorials section. 37 | 38 | ## Choosing a Good Base Model 39 | 40 | Selecting Llama 2 7B: 41 | 42 | * The author decided that selecting the smallest of the remaining models would provide the most room for improvement while maintaining lower hardware requirements. 43 | * Therefore, the remainder of the article focuses on the **Llama 2 7B** model. 44 | 45 | **Hardware**: All experiments were run on a single A100 GPU. 
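As a point of reference, a minimal sketch of loading the chosen base model (this uses Hugging Face `transformers` rather than the Lit-GPT code the article actually builds on; the `meta-llama/Llama-2-7b-hf` checkpoint is gated and requires an approved access token, and `device_map="auto"` needs the `accelerate` package):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

base = "meta-llama/Llama-2-7b-hf"  # gated checkpoint; access must be requested on the Hub
tokenizer = AutoTokenizer.from_pretrained(base)
model = AutoModelForCausalLM.from_pretrained(base, torch_dtype=torch.bfloat16, device_map="auto")
```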
46 | 47 | ## Evaluating the LoRA Defaults 48 | 49 | * **Default LoRA Hyperparameters**: The following default settings were used for the initial LoRA fine-tuning evaluation: 50 | * `learning_rate = 3e-4` 51 | * `batch_size = 128` 52 | * `micro_batch_size = 1` 53 | * `max_iters = 50000` 54 | * `weight_decay = 0.01` 55 | * `lora_r = 8` 56 | * `lora_alpha = 16` 57 | * `lora_dropout = 0.05` 58 | * `lora_query = True` 59 | * `lora_key = False` 60 | * `lora_value = True` 61 | * `lora_projection = False` 62 | * `lora_mlp = False` 63 | * `lora_head = False` 64 | * `warmup_steps = 100` 65 | 66 | * **Trainable Parameters**: 67 | * With these settings, **4,194,304 LoRA parameters** were trained. 68 | * The total number of trainable parameters in the base model is **6,738,415,616**. 69 | 70 | * **Training Time and Memory Usage**: 71 | * The training took approximately **1.8 hours** on a single A100 GPU. 72 | * The maximum memory usage was **21.33 GB**. 73 | 74 | * **Consistency of Results**: The experiment was repeated three times to observe the variance, and the performance was found to be very consistent and stable across runs. 75 | * **Performance Observations**: 76 | * The LoRA default model became "really bad" at arithmetic. This was attributed to the Alpaca dataset not containing many arithmetic tasks. 77 | 78 | * **Comparison with Meta's Llama 2 Chat Model**: 79 | * The performance of the 7B Llama 2 version that was instruction-finetuned by Meta using RLHF was also examined. 80 | * Meta's Llama 2 Chat model also showed worse arithmetic performance. 81 | * However, the Chat model performed much better on other benchmarks (except BLiMP), which served as a performance reference for the LoRA fine-tuning experiments. 82 | 83 | ## Memory Savings with QLoRA 84 | 85 | ### Experimental Setup 86 | 87 | * Two quantization types were tested: 4-bit Normal Float (`bnb.nf4`) and 4-bit Floating Point (`bnb.fp4`). 88 | * The results were compared to default LoRA with bfloat-16 precision. 89 | 90 | **Impact on Training Time and Memory Usage**: 91 | 92 | * **Default LoRA (with bfloat-16):** 93 | * Training time: 6685.75 seconds 94 | * Memory used: 21.33 GB 95 | * **QLoRA via `--quantize "bnb.nf4"`:** 96 | * Training time: 10059.53 seconds 97 | * Memory used: 14.18 GB 98 | * **QLoRA via `--quantize "bnb.fp4"`:** 99 | * Training time: 9334.45 seconds 100 | * Memory used: 14.19 GB 101 | * **Key Observation:** QLoRA decreased memory requirements by almost 6 GB, but increased training time by approximately 30% due to quantization and dequantization steps. 102 | 103 | **Impact on Model Performance**: 104 | 105 | * QLoRA had a small impact on model performance compared to regular LoRA. 106 | * The model improved on the arithmetic benchmarks. 107 | * The model's performance declined on the MMLU Global Facts benchmark. 108 | 109 | ## Learning Rate Schedulers and SGD 110 | 111 | **Background on AdamW**: 112 | 113 | * The AdamW optimizer was used for previous experiments as it's a common choice for LLM training. 114 | * AdamW can be memory-intensive because it tracks two additional parameters (moments *m* and *v*) for each model parameter. 115 | 116 | **Motivation for Exploring SGD**: 117 | 118 | * The author explores whether swapping AdamW with an SGD optimizer could be beneficial, particularly in terms of memory usage. 119 | * SGD optimizers require a learning rate scheduler, and a cosine annealing schedule was chosen. 
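A minimal sketch (assuming PyTorch; the placeholder parameter and the momentum value are assumptions, not settings reported in the article) of swapping AdamW for SGD with cosine annealing:

```python
import torch

params = [torch.nn.Parameter(torch.zeros(8, 8))]  # placeholder for the trainable LoRA parameters
max_iters = 50_000

# optimizer = torch.optim.AdamW(params, lr=3e-4, weight_decay=0.01)  # the default used earlier
optimizer = torch.optim.SGD(params, lr=3e-4, momentum=0.9)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=max_iters)

# inside the training loop:
#   loss.backward(); optimizer.step(); scheduler.step(); optimizer.zero_grad()
```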
120 | 121 | **Memory Savings Comparison**: 122 | 123 | * **AdamW:** 14.18 GB memory usage 124 | * **SGD:** 14.15 GB memory usage 125 | * **Observation:** Swapping AdamW with SGD resulted in only minor memory savings. This is attributed to the fact that most memory is used for matrix multiplications rather than optimizer states. 126 | * The additional memory used by AdamW was calculated to be about 16.78 megabytes for the given LoRA configuration (r=8), which does not account for the small measured difference. 127 | * **Note:** A larger difference in memory usage between AdamW and SGD is expected when using a larger LoRA rank `r` value, due to the larger number of trainable parameters. 128 | 129 | ## LoRA Hyperparameter Tuning 130 | 131 | ### Part 1: LoRA for All Layers 132 | 133 | * **Initial Setup**: By default, LoRA was only enabled for the Query and Value matrices within the multi-head self-attention blocks (matching the `lora_query = True` and `lora_value = True` defaults listed above). 134 | * **Change**: This experiment enables LoRA for all layers of the model, additionally covering the Key matrix, the projection layers, and the linear (MLP and head) layers. 135 | * **Impact:** The blog post does not include specific performance results in this part, but rather sets the stage for the following experiments which will test different values for `r` and `alpha`. The point of the change was to enable more trainable parameters, which will be investigated in the next section. 136 | 137 | ### Part 2: Increasing R 138 | 139 | * **Introduction to 'r'**: The parameter "r" determines the rank or dimension of the LoRA matrices, which directly influences the model's complexity and capacity. A higher "r" means more expressive power but can lead to overfitting, while a lower "r" can reduce overfitting at the expense of expressiveness. 140 | * **Experiment**: The author increased the value of `r` from 8 to 16, keeping LoRA enabled for all layers. 141 | * **Result**: Increasing `r` from 8 to 16 resulted in **worse performance**. The author notes that this result prompted further investigation of the alpha parameter. 142 | 143 | ### Part 3: Changing Alpha 144 | 145 | * **Introduction to 'alpha'**: The parameter "alpha" scales the contribution of the low-rank (LoRA) updates. A higher "alpha" places more emphasis on the low-rank structure, while a lower "alpha" reduces its influence, making the model rely more on the original parameters. 146 | * **Rule of Thumb**: It is common practice to set alpha to twice the size of the rank ("r") when fine-tuning LLMs. 147 | * **Experiment**: The author increased "alpha" two-fold from 16 to 32 when `r` was set to 16. 148 | * **Result**: Increasing "alpha" to 32 resulted in the **best model performance** up to that point. This improvement, however, came with an increase in the number of trainable parameters, as well as memory usage, though the increase was not substantial. 149 | * **Additional Experiments:** The author ran experiments with exceptionally large ranks (512, 1024, and 2048) which resulted in poorer outcomes and were excluded from the table. 150 | * **Importance of alpha**: Experiments with an alpha of 1 showed that a large alpha value was necessary for good performance. The author also repeated experiments with alpha values of 16 and 32, and found worse performance compared to choosing the alpha value as two-times the rank. 
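The article tunes these knobs through Lit-GPT's own flags; for readers using the Hugging Face PEFT library instead, roughly equivalent settings look like this (the target module names are typical for Llama-style models and are an assumption):

```python
from peft import LoraConfig

lora_config = LoraConfig(
    r=16,              # LoRA rank
    lora_alpha=32,     # common heuristic: alpha = 2 * r
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # assumed attention projections
    task_type="CAUSAL_LM",
)
```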
151 | 152 | ### Part 4: Very Large R 153 | 154 | * **Experiment:** The author further optimized the alpha value of the best model from the previous section (r=256), suspecting that the default setting (alpha=512) might be a bit too large. 155 | * **Finding**: Choosing a large alpha value appears to be crucial when increasing the rank, and an alpha value of two times the rank yielded the best results. 156 | * **Result:** Choosing an alpha value that exceeds the “two-fold the rank” recommendation also makes the benchmark outcomes worse. 157 | 158 | ## Conclusion 159 | 160 | This article explored the various knobs we can tune when training custom LLMs using LoRA. We found that QLoRA is a great memory-saver even though it comes at an increased runtime cost. Moreover, while learning rate schedulers can be beneficial, choosing between AdamW and SGD optimizers makes little difference. And iterating over the dataset more than once can make the results even worse. The best bang for the buck can be achieved by optimizing the LoRA settings, including the rank. Increasing the rank will result in more trainable parameters, which could lead to higher degrees of overfitting and runtime costs. However, when increasing the rank, choosing the appropriate alpha value is important. 161 | -------------------------------------------------------------------------------- /others/papers/UltimateGuideFromBasicsToBreakthrough/Chapter7.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Chapter 7: Stage 5: Evaluation and Validation" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": { 13 | "vscode": { 14 | "languageId": "plaintext" 15 | } 16 | }, 17 | "source": [ 18 | "## Steps Involved in Evaluating and Validating Fine-Tuned Model" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": { 24 | "vscode": { 25 | "languageId": "plaintext" 26 | } 27 | }, 28 | "source": [ 29 | "1. Set Up Evaluation Metrics\n", 30 | "2. Interpret Training Loss Curve\n", 31 | "3. Run Validation Loops\n", 32 | "4. Monitor and Interpret Results\n", 33 | "5. Hyperparameter Tuning and Adjustments" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": { 39 | "vscode": { 40 | "languageId": "plaintext" 41 | } 42 | }, 43 | "source": [ 44 | "## Setting Up Evaluation Metrics" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": { 50 | "vscode": { 51 | "languageId": "plaintext" 52 | } 53 | }, 54 | "source": [ 55 | "Cross-entropy is a key metric for evaluating LLMs during training or fine-tuning. Originating from\n", 56 | " information theory, it quantifies the difference between two probability distributions." 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": { 62 | "vscode": { 63 | "languageId": "plaintext" 64 | } 65 | }, 66 | "source": [ 67 | "### Importance of Cross-Entropy for LLM Training and Evaluation" 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": { 73 | "vscode": { 74 | "languageId": "plaintext" 75 | } 76 | }, 77 | "source": [ 78 | "Cross-entropy is crucial for training and fine-tuning LLMs. It serves as a loss function, guiding the model to produce high-quality predictions by minimising discrepancies between the predicted and actual data."
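, "\n", "A minimal illustration (assuming PyTorch, with toy values) of the relationship between cross-entropy and the perplexity metric discussed below:\n", "\n", "```python\n", "import torch\n", "import torch.nn.functional as F\n", "\n", "# toy next-token logits for two positions over a 5-token vocabulary\n", "logits = torch.randn(2, 5)\n", "targets = torch.tensor([1, 3])  # the tokens that actually occurred\n", "ce = F.cross_entropy(logits, targets)  # average cross-entropy loss\n", "perplexity = torch.exp(ce)  # perplexity is the exponential of cross-entropy\n", "```"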
79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": { 84 | "vscode": { 85 | "languageId": "plaintext" 86 | } 87 | }, 88 | "source": [ 89 | "### Beyond Cross-Entropy: Advanced LLM Evaluation Metrics" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": { 95 | "vscode": { 96 | "languageId": "plaintext" 97 | } 98 | }, 99 | "source": [ 100 | "+ Perplexity: Perplexity measures how well a probability distribution or model predicts a sample. In the context of LLMs, it evaluates the model’s uncertainty about the next word in a sequence. Lower perplexity indicates better performance, as the model is more confident in its predictions.\n", 101 | "\n", 102 | "+ Factuality: Factuality assesses the accuracy of the information produced by the LLM. It is particularly important for applications where misinformation could have serious consequences. Higher factuality scores correlate with higher output quality.\n", 103 | "\n", 104 | "+ LLM Uncertainty: LLM uncertainty is measured using log probability, helping to identify low-quality generations. Lower uncertainty indicates higher output quality. \n", 105 | "\n", 106 | "+ Prompt Perplexity: This metric evaluates how well the model understands the input prompt. \n", 107 | "\n", 108 | "+ Context Relevance: In retrieval-augmented generation (RAG) systems, context relevance measures how pertinent the retrieved context is to the user query. \n", 109 | "\n", 110 | "+ Completeness\n", 111 | "\n", 112 | "+ Chunk Attribution and Utilisation: These metrics evaluate how effectively the retrieved chunks of information contribute to the final response.\n", 113 | "\n", 114 | "+ Data Error Potential \n", 115 | "\n", 116 | "+ Safety Metrics" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": { 122 | "vscode": { 123 | "languageId": "plaintext" 124 | } 125 | }, 126 | "source": [ 127 | "## Understanding the Training Loss Curve" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": { 133 | "vscode": { 134 | "languageId": "plaintext" 135 | } 136 | }, 137 | "source": [ 138 | " The training loss curve plots the loss value against training epochs and is essential for monitoring model\n", 139 | " performance." 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": { 145 | "vscode": { 146 | "languageId": "plaintext" 147 | } 148 | }, 149 | "source": [ 150 | "### Interpreting Loss Curves" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": { 156 | "vscode": { 157 | "languageId": "plaintext" 158 | } 159 | }, 160 | "source": [ 161 | " An ideal training loss curve shows a rapid decrease in loss during initial stages, followed by a gradual\n", 162 | " decline and eventual plateau. Specific patterns to look for include:\n", 163 | " 1. Underfitting: High loss value that does not decrease significantly over time, suggesting the model\n", 164 | " cannot learn the data.\n", 165 | " 2. Overfitting: Decreasing training loss with increasing validation loss, indicating the model mem\n", 166 | "orises the training data.\n", 167 | " 3. Fluctuations: Significant variations may indicate a high learning rate or noisy gradients." 
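, "\n", "When the curves show pattern 2 (validation loss rising while training loss keeps falling), early stopping, covered in the next cell, is a simple guard. A minimal framework-agnostic sketch with toy values:\n", "\n", "```python\n", "val_losses = [2.1, 1.7, 1.5, 1.55, 1.6, 1.62]  # toy per-epoch validation losses\n", "patience, best, bad = 2, float(\"inf\"), 0\n", "for epoch, loss in enumerate(val_losses):\n", "    if loss < best:\n", "        best, bad = loss, 0  # improvement: reset the counter\n", "    else:\n", "        bad += 1  # no improvement this epoch\n", "        if bad >= patience:\n", "            print(f\"early stopping at epoch {epoch}\")\n", "            break\n", "```"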
168 | ] 169 | }, 170 | { 171 | "cell_type": "markdown", 172 | "metadata": { 173 | "vscode": { 174 | "languageId": "plaintext" 175 | } 176 | }, 177 | "source": [ 178 | "### Avoiding Overfitting" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": { 184 | "vscode": { 185 | "languageId": "plaintext" 186 | } 187 | }, 188 | "source": [ 189 | " Techniques to prevent overfitting include:\n", 190 | " 1. Regularisation: Adds a penalty term to the loss function to encourage smaller weights.\n", 191 | " 2. Early Stopping: Stops training when validation performance no longer improves.\n", 192 | " 3. Dropout: Randomly deactivates neurons during training to reduce sensitivity to noise.\n", 193 | " 4. Cross-Validation: Splits data into multiple subsets for training and validation to assess model\n", 194 | " generalisation.\n", 195 | " 5. Batch Normalisation: Normalises inputs to each layer during training to stabilise the learning\n", 196 | " process.\n", 197 | " 6. Larger Datasets and Batch Sizes: Reduces overfitting by increasing the amount of diverse\n", 198 | " data and batch sizes" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": { 204 | "vscode": { 205 | "languageId": "plaintext" 206 | } 207 | }, 208 | "source": [ 209 | "### Sources of Noisy Gradients" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": { 215 | "vscode": { 216 | "languageId": "plaintext" 217 | } 218 | }, 219 | "source": [ 220 | " 1. Learning Rate Scheduling: Gradually decreasing the learning rate during training can reduce\n", 221 | " the impact of noisy gradients.\n", 222 | " 2. Gradient Clipping: Setting a threshold for gradient values prevents large updates that can\n", 223 | " destabilise training." 224 | ] 225 | }, 226 | { 227 | "cell_type": "markdown", 228 | "metadata": { 229 | "vscode": { 230 | "languageId": "plaintext" 231 | } 232 | }, 233 | "source": [ 234 | "## Running Validation Loops" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": { 240 | "vscode": { 241 | "languageId": "plaintext" 242 | } 243 | }, 244 | "source": [ 245 | " 1. Split Data: Divide the dataset into training and validation sets.\n", 246 | " 2. Initialise Validation: Evaluate the model on the validation set at the end of each epoch.\n", 247 | " 3. Calculate Metrics: Compute relevant performance metrics, such as cross-entropy loss.\n", 248 | " 4. Record Results: Log validation metrics for each epoch.\n", 249 | " 5. Early Stopping: Optionally stop training if validation loss does not improve for a predefined\n", 250 | " number of epochs." 251 | ] 252 | }, 253 | { 254 | "cell_type": "markdown", 255 | "metadata": { 256 | "vscode": { 257 | "languageId": "plaintext" 258 | } 259 | }, 260 | "source": [ 261 | "## Monitoring and Interpreting Results" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": { 267 | "vscode": { 268 | "languageId": "plaintext" 269 | } 270 | }, 271 | "source": [ 272 | " 1. Consistent Improvement: Indicates good model generalisation if both training and validation\n", 273 | " metrics improve and plateau.\n", 274 | " 2. Divergence: Suggests overfitting if training metrics improve while validation metrics deteriorate.\n", 275 | " 3. 
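, "\n", "A minimal sketch (assuming PyTorch; the tiny linear model stands in for the fine-tuned LLM) of where gradient clipping fits in a training step:\n", "\n", "```python\n", "import torch\n", "import torch.nn.functional as F\n", "\n", "model = torch.nn.Linear(10, 2)  # stand-in for the fine-tuned model\n", "optimizer = torch.optim.AdamW(model.parameters(), lr=2e-4)\n", "x, y = torch.randn(4, 10), torch.randint(0, 2, (4,))\n", "\n", "loss = F.cross_entropy(model(x), y)\n", "loss.backward()\n", "torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # clip before the update\n", "optimizer.step()\n", "optimizer.zero_grad()\n", "```"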
Stability: Ensure validation metrics do not fluctuate significantly, indicating stable training" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": { 281 | "vscode": { 282 | "languageId": "plaintext" 283 | } 284 | }, 285 | "source": [ 286 | "## Hyperparameter Tuning and Other Adjustments" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": { 292 | "vscode": { 293 | "languageId": "plaintext" 294 | } 295 | }, 296 | "source": [ 297 | " 1. Learning Rate: Determines the step size for updating model weights. A good starting point is\n", 298 | " 2e-4, but this can vary.\n", 299 | " 2. Batch Size: Larger batch sizes lead to more stable updates but require more memory.\n", 300 | " 3. Number of Training Epochs: Balancing the number of epochs ensures the model learns suffi\n", 301 | "ciently without overfitting or underfitting.\n", 302 | " 4. Optimiser: Optimisers like Paged ADAM optimise memory usage, advantageous for large models" 303 | ] 304 | } 305 | ], 306 | "metadata": { 307 | "language_info": { 308 | "name": "python" 309 | } 310 | }, 311 | "nbformat": 4, 312 | "nbformat_minor": 2 313 | } 314 | -------------------------------------------------------------------------------- /others/papers/UltimateGuideFromBasicsToBreakthrough/Chapter3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "vscode": { 7 | "languageId": "plaintext" 8 | } 9 | }, 10 | "source": [ 11 | "# Chapter 3: Stage 1 - Data Preparation" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "vscode": { 18 | "languageId": "plaintext" 19 | } 20 | }, 21 | "source": [ 22 | "## Steps Involved in Data Preparation" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": { 28 | "vscode": { 29 | "languageId": "plaintext" 30 | } 31 | }, 32 | "source": [ 33 | "### Data Collection" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": { 39 | "vscode": { 40 | "languageId": "plaintext" 41 | } 42 | }, 43 | "source": [ 44 | "The first step in data preparation is to collect data from various sources. These sources can be in any\n", 45 | " format such as CSV, web pages, SQL databases, S3 storage, etc. Python provides several libraries to\n", 46 | " gather the data efficiently and accurately. " 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": { 52 | "vscode": { 53 | "languageId": "plaintext" 54 | } 55 | }, 56 | "source": [ 57 | "
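To make the paragraph above concrete, here is a small, hedged sketch that gathers records from a CSV file, a SQLite database, and a web page with common Python libraries; the paths, table name, and URL are placeholders invented for the example.

```python
import sqlite3

import pandas as pd
import requests
from bs4 import BeautifulSoup

# CSV file (path is a placeholder)
csv_df = pd.read_csv("data/raw/records.csv")

# SQL database (SQLite for simplicity; table name is hypothetical)
with sqlite3.connect("data/raw/records.db") as conn:
    sql_df = pd.read_sql("SELECT * FROM documents", conn)

# Web page scraped with requests + BeautifulSoup (URL is a placeholder)
html = requests.get("https://example.com/articles", timeout=30).text
paragraphs = [p.get_text(strip=True) for p in BeautifulSoup(html, "html.parser").find_all("p")]
web_df = pd.DataFrame({"text": paragraphs})

print(len(csv_df), len(sql_df), len(web_df))
```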
" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": { 65 | "vscode": { 66 | "languageId": "plaintext" 67 | } 68 | }, 69 | "source": [ 70 | "### Data Preprocessing and Formatting" 71 | ] 72 | }, 73 | { 74 | "cell_type": "markdown", 75 | "metadata": { 76 | "vscode": { 77 | "languageId": "plaintext" 78 | } 79 | }, 80 | "source": [ 81 | "Data preprocessing and formatting are crucial for ensuring high-quality data for fine-tuning. This step\n", 82 | " involves tasks such as cleaning the data, handling missing values, and formatting the data to match the\n", 83 | " specific requirements of the task. " 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "metadata": { 89 | "vscode": { 90 | "languageId": "plaintext" 91 | } 92 | }, 93 | "source": [ 94 | "
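As a hedged illustration of the cleaning, missing-value handling, and formatting steps described above, the sketch below uses pandas; the column names (`text`, `label`) and file paths are assumptions made for the example.

```python
import pandas as pd

df = pd.read_csv("data/raw/records.csv")  # placeholder path

# Clean: drop exact duplicates and rows with no text at all
df = df.drop_duplicates(subset="text").dropna(subset=["text"])

# Handle missing labels with an explicit "unknown" category
df["label"] = df["label"].fillna("unknown")

# Normalise whitespace so the tokenizer sees consistent input
df["text"] = df["text"].str.replace(r"\s+", " ", regex=True).str.strip()

# Format to the structure the fine-tuning task expects (prompt/response pairs here)
formatted = pd.DataFrame(
    {"prompt": "Classify the sentiment: " + df["text"], "response": df["label"]}
)
formatted.to_json("data/processed/train.jsonl", orient="records", lines=True)
```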
" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": { 102 | "vscode": { 103 | "languageId": "plaintext" 104 | } 105 | }, 106 | "source": [ 107 | "### Handling Data Imbalance" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": { 113 | "vscode": { 114 | "languageId": "plaintext" 115 | } 116 | }, 117 | "source": [ 118 | "**Over-sampling and Under-sampling**: \n", 119 | "+ Techniques like SMOTE (Synthetic Minority Over\n", 120 | "sampling Technique) generate synthetic examples to achieve balance.\n", 121 | "+ Python Library: imbalanced-learn\n", 122 | "+ Description: imbalanced-learn provides various methods to deal with imbalanced datasets, in\n", 123 | "cluding oversampling techniques like SMOTE." 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": { 129 | "vscode": { 130 | "languageId": "plaintext" 131 | } 132 | }, 133 | "source": [ 134 | "**Adjusting Loss Function**: Modify the loss function to give more weight to the minority class,\n", 135 | " setting class weights inversely proportional to the class frequencies." 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": { 141 | "vscode": { 142 | "languageId": "plaintext" 143 | } 144 | }, 145 | "source": [ 146 | "**Focal Loss**: A variant of cross-entropy loss that adds a factor to down-weight easy examples and\n", 147 | " focus training on hard negatives.\n", 148 | "+ **Python Library**: focal loss\n", 149 | "+ **Description**: The focal loss package provides robust implementations of various focal loss func\n", 150 | "tions, including BinaryFocalLoss and SparseCategoricalFocalLoss." 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": { 156 | "vscode": { 157 | "languageId": "plaintext" 158 | } 159 | }, 160 | "source": [ 161 | "**Cost-sensitive Learning**: Incorporating the cost of misclassifications directly into the learning\n", 162 | " algorithm, assigning a higher cost to misclassifying minority class samples." 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": { 168 | "vscode": { 169 | "languageId": "plaintext" 170 | } 171 | }, 172 | "source": [ 173 | "**Ensemble Methods**: Using techniques like bagging and boosting to combine multiple models\n", 174 | " and handle class imbalance.\n", 175 | "+ Python Library: sklearn.ensemble\n", 176 | "+ Description: scikit-learn provides robust implementations of various ensemble methods, including\n", 177 | " bagging and boosting." 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": { 183 | "vscode": { 184 | "languageId": "plaintext" 185 | } 186 | }, 187 | "source": [ 188 | "**StratifiedSampling**: Ensuring that each mini-batch during training contains an equal or proportional representation of each `class.\n", 189 | "+ PythonLibrary: sklearn.model selection.StratifiedShuffleSplit\n", 190 | "+ Description: scikit-learn offers tools for stratifiedsampling, ensuring balanced representation\n", 191 | " across classes" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": { 197 | "vscode": { 198 | "languageId": "plaintext" 199 | } 200 | }, 201 | "source": [ 202 | "**Data Cleaning**: Removing noisy and mislabelled data, which can disproportionately affect the minority class.\n", 203 | "+ Python Library: pandas.DataFrame.sample\n", 204 | "+ Description: pandas provides methods for sampling data from DataFrames, useful for data cleaning and preprocessing." 
205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": { 210 | "vscode": { 211 | "languageId": "plaintext" 212 | } 213 | }, 214 | "source": [ 215 | "**Using Appropriate Metrics**: Metrics like Precision-Recall AUC, F1-score, and Cohen’s Kappa are more informative than accuracy when dealing with imbalanced datasets.\n", 216 | "+ Python Library: sklearn.metrics\n", 217 | "+ Description: scikit-learn offers a comprehensive set of tools for evaluating the performance of classification models, particularly with imbalanced datasets." 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": { 223 | "vscode": { 224 | "languageId": "plaintext" 225 | } 226 | }, 227 | "source": [ 228 | "### Splitting Dataset" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": { 234 | "vscode": { 235 | "languageId": "plaintext" 236 | } 237 | }, 238 | "source": [ 239 | "Splitting the dataset for fine-tuning involves dividing it into training and validation sets, typically using an 80:20 ratio. Different techniques include:" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "metadata": { 245 | "vscode": { 246 | "languageId": "plaintext" 247 | } 248 | }, 249 | "source": [ 250 | "+ Random Sampling: Selecting a subset of data randomly to create a representative sample\n", 251 | "+ Stratified Sampling: Dividing the dataset into subgroups and sampling from each to maintain class balance.\n", 252 | "+ K-Fold Cross Validation: Splitting the dataset into K folds and performing training and validation K times.\n", 253 | "+ Leave-One-Out Cross Validation: Using a single data point as the validation set and the rest for training, repeated for each data point." 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": { 259 | "vscode": { 260 | "languageId": "plaintext" 261 | } 262 | }, 263 | "source": [ 264 | "## Existing and Potential Research Methodologies" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": { 270 | "vscode": { 271 | "languageId": "plaintext" 272 | } 273 | }, 274 | "source": [ 275 | "### Data Annotation" 276 | ] 277 | }, 278 | { 279 | "cell_type": "markdown", 280 | "metadata": { 281 | "vscode": { 282 | "languageId": "plaintext" 283 | } 284 | }, 285 | "source": [ 286 | "Data annotation involves labelling or tagging textual data with specific attributes relevant to the model’s training objectives. This process is crucial for supervised learning tasks and greatly influences the performance of the fine-tuned model. Recent research highlights various approaches to data annotation:\n" 287 | ] 288 | }, 289 | { 290 | "cell_type": "markdown", 291 | "metadata": { 292 | "vscode": { 293 | "languageId": "plaintext" 294 | } 295 | }, 296 | "source": [ 297 | "+ Human Annotation\n", 298 | "+ Semi-automatic Annotation\n", 299 | "+ Automatic Annotation" 300 | ] 301 | }, 302 | { 303 | "cell_type": "markdown", 304 | "metadata": { 305 | "vscode": { 306 | "languageId": "plaintext" 307 | } 308 | }, 309 | "source": [ 310 | "### Data Augmentation" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": { 316 | "vscode": { 317 | "languageId": "plaintext" 318 | } 319 | }, 320 | "source": [ 321 | "Data Augmentation (DA) techniques expand training datasets artificially to address data scarcity and improve model performance. 
Advanced techniques often used in NLP include:" 322 | ] 323 | }, 324 | { 325 | "cell_type": "markdown", 326 | "metadata": { 327 | "vscode": { 328 | "languageId": "plaintext" 329 | } 330 | }, 331 | "source": [ 332 | "+ Word Embeddings\n", 333 | "+ Back Translation\n", 334 | "+ Adversarial Attacks\n", 335 | "+ NLP-AUG" 336 | ] 337 | }, 338 | { 339 | "cell_type": "markdown", 340 | "metadata": { 341 | "vscode": { 342 | "languageId": "plaintext" 343 | } 344 | }, 345 | "source": [ 346 | "### Synthetic Data Generation using LLMs" 347 | ] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "metadata": { 352 | "vscode": { 353 | "languageId": "plaintext" 354 | } 355 | }, 356 | "source": [ 357 | "Large Language Models (LLMs) can generate synthetic data through innovative techniques such as:\n", 358 | "+ Prompt Engineering: Crafting specific prompts to guide LLMs like GPT-3 in generating relevant and high-quality synthetic data\n", 359 | "+ Multi-Step Generation: Employing iterative generation processes where LLMs generate initial data that is refined through subsequent steps. This method can produce high-quality synthetic data for various tasks, including summarising and bias detection." 360 | ] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "metadata": { 365 | "vscode": { 366 | "languageId": "plaintext" 367 | } 368 | }, 369 | "source": [ 370 | "## Challenges in Data Preparation for Fine-Tuning LLMs" 371 | ] 372 | }, 373 | { 374 | "cell_type": "markdown", 375 | "metadata": { 376 | "vscode": { 377 | "languageId": "plaintext" 378 | } 379 | }, 380 | "source": [ 381 | "## Available LLM Fine-Tuning Datasets" 382 | ] 383 | }, 384 | { 385 | "cell_type": "markdown", 386 | "metadata": { 387 | "vscode": { 388 | "languageId": "plaintext" 389 | } 390 | }, 391 | "source": [ 392 | "## Best Practices" 393 | ] 394 | } 395 | ], 396 | "metadata": { 397 | "language_info": { 398 | "name": "python" 399 | } 400 | }, 401 | "nbformat": 4, 402 | "nbformat_minor": 2 403 | } 404 | -------------------------------------------------------------------------------- /scripts/cv/torch.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "vscode": { 7 | "languageId": "plaintext" 8 | } 9 | }, 10 | "source": [ 11 | "# Using Pytorch to finetune for computer vision tasks" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "vscode": { 18 | "languageId": "plaintext" 19 | } 20 | }, 21 | "source": [ 22 | "## Dataset" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "### Custom dataset" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "import os\n", 39 | "from PIL import Image\n", 40 | "from torch.utils.data import Dataset\n", 41 | "\n", 42 | "class CustomImageDataset(Dataset):\n", 43 | " def __init__(self, image_dir):\n", 44 | " self.image_dir = image_dir\n", 45 | " self.image_paths = [os.path.join(image_dir, fname) for fname in os.listdir(image_dir)]\n", 46 | " \n", 47 | " def __len__(self):\n", 48 | " return len(self.image_paths)\n", 49 | " \n", 50 | " def __getitem__(self, idx):\n", 51 | " image_path = self.image_paths[idx]\n", 52 | " image = Image.open(image_path).convert(\"RGB\")\n", 53 | "\n", 54 | " label = 0 if \"class0\" in image_path else 1 \n", 55 | " \n", 56 | " return image, label" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | 
"execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "custom_dataset = CustomImageDataset(image_dir='data')" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "### Augmentation" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "from torchvision import transforms\n", 82 | "transform = transforms.Compose([\n", 83 | " transforms.Resize((224, 224)),\n", 84 | " transforms.ToTensor(),\n", 85 | " transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])\n", 86 | "])" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": {}, 93 | "outputs": [], 94 | "source": [ 95 | "import os\n", 96 | "from PIL import Image\n", 97 | "from torch.utils.data import Dataset\n", 98 | "\n", 99 | "class CustomImageDataset(Dataset):\n", 100 | " def __init__(self, image_dir, transform=None):\n", 101 | " self.image_dir = image_dir\n", 102 | " self.transform = transform\n", 103 | " self.image_paths = [os.path.join(image_dir, fname) for fname in os.listdir(image_dir)]\n", 104 | " \n", 105 | " def __len__(self):\n", 106 | " return len(self.image_paths)\n", 107 | " \n", 108 | " def __getitem__(self, idx):\n", 109 | " image_path = self.image_paths[idx]\n", 110 | " image = Image.open(image_path).convert(\"RGB\")\n", 111 | " \n", 112 | " if self.transform:\n", 113 | " image = self.transform(image)\n", 114 | " \n", 115 | " label = 0 if \"class0\" in image_path else 1 \n", 116 | " \n", 117 | " return image, label" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "custom_dataset = CustomImageDataset(image_dir='./data/custom_images', transform=transform)" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "### ImageFolder" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "```bash\n", 141 | "data/\n", 142 | "├── class_1/\n", 143 | "│ ├── img1.jpg\n", 144 | "│ ├── img2.jpg\n", 145 | "│ └── ...\n", 146 | "├── class_2/\n", 147 | "│ ├── img1.jpg\n", 148 | "│ ├── img2.jpg\n", 149 | "│ └── ...\n", 150 | "└── class_n/\n", 151 | " ├── img1.jpg\n", 152 | " ├── img2.jpg\n", 153 | " └── ...\n", 154 | "```" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "from torchvision import datasets\n", 164 | "dataset = datasets.ImageFolder(root='data', transform=transform)" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "import torch\n", 174 | "train_size = int(0.8 * len(dataset))\n", 175 | "val_size = len(dataset) - train_size\n", 176 | "train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])" 177 | ] 178 | }, 179 | { 180 | "cell_type": "markdown", 181 | "metadata": {}, 182 | "source": [ 183 | "## DataLoader" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": [], 191 | "source": [ 192 | "from torch.utils.data import DataLoader\n", 193 | "\n", 194 | "train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=4)\n", 195 | "val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, num_workers=4)" 196 | ] 
197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": { 201 | "vscode": { 202 | "languageId": "plaintext" 203 | } 204 | }, 205 | "source": [ 206 | "## Pretrained model" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "from torchvision import models\n", 216 | "\n", 217 | "model = models.resnet18(pretrained=True) # Pretrained weights on ImageNet" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": {}, 223 | "source": [ 224 | "### Freeze model" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "for param in model.parameters():\n", 234 | " param.requires_grad = False # Freeze all parameters" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "for name, param in model.named_parameters():\n", 244 | " if \"layer4\" in name: # Unfreeze the final layer block in ResNet\n", 245 | " param.requires_grad = True" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": {}, 251 | "source": [ 252 | "## Custom layers" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "import torch.nn as nn\n", 262 | "\n", 263 | "num_classes = 10 \n", 264 | "model = models.resnet18(pretrained=True)\n", 265 | "\n", 266 | "model.fc = nn.Linear(model.fc.in_features, num_classes) " 267 | ] 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "metadata": { 272 | "vscode": { 273 | "languageId": "plaintext" 274 | } 275 | }, 276 | "source": [ 277 | "## Training" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": {}, 284 | "outputs": [], 285 | "source": [ 286 | "import torch.optim as optim\n", 287 | "\n", 288 | "criterion = nn.CrossEntropyLoss()\n", 289 | "optimizer = optim.Adam(model.parameters(), lr=0.001) " 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "device = torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\")\n", 299 | "model.to(device)\n", 300 | "\n", 301 | "for epoch in range(5): \n", 302 | " model.train() \n", 303 | " running_loss = 0.0\n", 304 | " for images, labels in train_loader:\n", 305 | " images, labels = images.to(device), labels.to(device)\n", 306 | "\n", 307 | " optimizer.zero_grad() \n", 308 | " outputs = model(images) \n", 309 | " loss = criterion(outputs, labels) \n", 310 | " loss.backward() \n", 311 | " optimizer.step()\n", 312 | "\n", 313 | " running_loss += loss.item()\n", 314 | "\n", 315 | " print(f\"Epoch {epoch+1}, Loss: {running_loss/len(train_loader):.4f}\")\n" 316 | ] 317 | }, 318 | { 319 | "cell_type": "markdown", 320 | "metadata": {}, 321 | "source": [ 322 | "# Special: Image + metadata model" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": {}, 329 | "outputs": [], 330 | "source": [ 331 | "class MultiInputModel(nn.Module):\n", 332 | " def __init__(self, num_classes, metadata_input_size):\n", 333 | " super(MultiInputModel, self).__init__()\n", 334 | "\n", 335 | " # Image branch: Use pre-trained ResNet18\n", 336 | " self.image_branch = models.resnet18(pretrained=True)\n", 337 | " self.image_branch.fc = nn.Identity() # Remove the final fully 
connected layer\n", 338 | "\n", 339 | " # Metadata branch: Fully connected layers\n", 340 | " self.metadata_branch = nn.Sequential(\n", 341 | " nn.Linear(metadata_input_size, 64),\n", 342 | " nn.ReLU(),\n", 343 | " nn.Dropout(0.3),\n", 344 | " nn.Linear(64, 32),\n", 345 | " nn.ReLU()\n", 346 | " )\n", 347 | "\n", 348 | " # Combined branch\n", 349 | " self.combined_fc = nn.Sequential(\n", 350 | " nn.Linear(512 + 32, 128), # 512 from ResNet18 + 32 from metadata branch\n", 351 | " nn.ReLU(),\n", 352 | " nn.Dropout(0.5),\n", 353 | " nn.Linear(128, num_classes)\n", 354 | " )\n", 355 | "\n", 356 | " def forward(self, image, metadata):\n", 357 | " # Forward pass through the image branch\n", 358 | " image_features = self.image_branch(image)\n", 359 | "\n", 360 | " # Forward pass through the metadata branch\n", 361 | " metadata_features = self.metadata_branch(metadata)\n", 362 | "\n", 363 | " # Concatenate features from both branches\n", 364 | " combined_features = torch.cat((image_features, metadata_features), dim=1)\n", 365 | "\n", 366 | " # Forward pass through the combined branch\n", 367 | " output = self.combined_fc(combined_features)\n", 368 | " return output" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": null, 374 | "metadata": {}, 375 | "outputs": [], 376 | "source": [ 377 | "class MultiInputDataset(Dataset):\n", 378 | " def __init__(self, image_paths, metadata, labels, transform=None):\n", 379 | " self.image_paths = image_paths\n", 380 | " self.metadata = metadata\n", 381 | " self.labels = labels\n", 382 | " self.transform = transform\n", 383 | "\n", 384 | " def __len__(self):\n", 385 | " return len(self.image_paths)\n", 386 | "\n", 387 | " def __getitem__(self, idx):\n", 388 | " # Load image\n", 389 | " image = Image.open(self.image_paths[idx]).convert(\"RGB\")\n", 390 | " if self.transform:\n", 391 | " image = self.transform(image)\n", 392 | "\n", 393 | " # Load metadata\n", 394 | " metadata = self.metadata[idx]\n", 395 | "\n", 396 | " # Load label\n", 397 | " label = self.labels[idx]\n", 398 | "\n", 399 | " return image, torch.tensor(metadata, dtype=torch.float32), label" 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "execution_count": null, 405 | "metadata": {}, 406 | "outputs": [], 407 | "source": [ 408 | "for epoch in range(10): # Number of epochs\n", 409 | " model.train()\n", 410 | " running_loss = 0.0\n", 411 | "\n", 412 | " for images, metadata, labels in train_loader:\n", 413 | " images, metadata, labels = images.to(device), metadata.to(device), labels.to(device)\n", 414 | "\n", 415 | " optimizer.zero_grad() # Zero the gradients\n", 416 | " outputs = model(images, metadata) # Forward pass\n", 417 | " loss = criterion(outputs, labels) # Compute loss\n", 418 | " loss.backward() # Backward pass\n", 419 | " optimizer.step() # Update weights\n", 420 | "\n", 421 | " running_loss += loss.item()\n", 422 | "\n", 423 | " print(f\"Epoch {epoch+1}, Loss: {running_loss / len(train_loader):.4f}\")" 424 | ] 425 | } 426 | ], 427 | "metadata": { 428 | "kernelspec": { 429 | "display_name": "base", 430 | "language": "python", 431 | "name": "python3" 432 | }, 433 | "language_info": { 434 | "name": "python", 435 | "version": "3.11.9" 436 | } 437 | }, 438 | "nbformat": 4, 439 | "nbformat_minor": 2 440 | } 441 | -------------------------------------------------------------------------------- /others/papers/UltimateGuideFromBasicsToBreakthrough/Chapter1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 
3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "vscode": { 7 | "languageId": "plaintext" 8 | } 9 | }, 10 | "source": [ 11 | "# Chaper 1: Introduction" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "vscode": { 18 | "languageId": "plaintext" 19 | } 20 | }, 21 | "source": [ 22 | "## Background of Large Language Models" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": { 28 | "vscode": { 29 | "languageId": "plaintext" 30 | } 31 | }, 32 | "source": [ 33 | " Large Language Models (LLMs) represent a significant leap in computational systems capable of under\n", 34 | "standing and generating human language. " 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": { 40 | "vscode": { 41 | "languageId": "plaintext" 42 | } 43 | }, 44 | "source": [ 45 | " Notable examples, such as GPT-3 and GPT-4, leverage the self-attention mecha\n", 46 | "nism within Transformer architectures to efficiently manage sequential data and understand long-range\n", 47 | " dependencies. " 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": { 53 | "vscode": { 54 | "languageId": "plaintext" 55 | } 56 | }, 57 | "source": [ 58 | "## Historical Development and Key Milestones" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": { 64 | "vscode": { 65 | "languageId": "plaintext" 66 | } 67 | }, 68 | "source": [ 69 | " Language models are fundamental to natural language processing (NLP), leveraging mathematical tech\n", 70 | "niques to generalise linguistic rules and knowledge for tasks involving prediction and generation.\n", 71 | "\n", 72 | " Over\n", 73 | " several decades, language modelling has evolved from early statistical language models (SLMs) to to\n", 74 | "day’s advanced large language models (LLMs). " 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": { 80 | "vscode": { 81 | "languageId": "plaintext" 82 | } 83 | }, 84 | "source": [ 85 | "
(figure: timeline of language model development)
" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": { 93 | "vscode": { 94 | "languageId": "plaintext" 95 | } 96 | }, 97 | "source": [ 98 | "## Evolution from Traditional NLP Models to State-of-the-Art LLM" 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "metadata": { 104 | "vscode": { 105 | "languageId": "plaintext" 106 | } 107 | }, 108 | "source": [ 109 | "### Statistical Language Model (SLM)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": { 115 | "vscode": { 116 | "languageId": "plaintext" 117 | } 118 | }, 119 | "source": [ 120 | " Emerging in the 1990s, SLMs analyse natural language using probabilistic methods to determine the\n", 121 | " likelihood of sentences within texts. " 122 | ] 123 | }, 124 | { 125 | "cell_type": "markdown", 126 | "metadata": { 127 | "vscode": { 128 | "languageId": "plaintext" 129 | } 130 | }, 131 | "source": [ 132 | "+ Probability: SLMs assign probabilities to sequences of words or sentences.\n", 133 | "+ N-gram models: The most common type, especially for earlier SLMs. They predict the next word based on the previous n-1 words.\n", 134 | "+ Limitations: Traditional SLMs struggle with long-range dependencies and context." 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": { 140 | "vscode": { 141 | "languageId": "plaintext" 142 | } 143 | }, 144 | "source": [ 145 | "### Neural Language Model (NLM)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": { 151 | "vscode": { 152 | "languageId": "plaintext" 153 | } 154 | }, 155 | "source": [ 156 | "NLMs leverage neural networks to predict word sequences, overcoming SLM limitations. Word vectors\n", 157 | " enable computers to understand word meanings. \n", 158 | " \n", 159 | " Tools like Word2Vec represent words in a vector\n", 160 | " space where semantic relationships are reflected in vector angles. " 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": { 166 | "vscode": { 167 | "languageId": "plaintext" 168 | } 169 | }, 170 | "source": [ 171 | "The input layer concatenates word vectors,\n", 172 | " the hidden layer applies a non-linear activation function, and the output layer predicts subsequent words\n", 173 | " using the Softmax function to transform values into a probability distribution." 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": { 179 | "vscode": { 180 | "languageId": "plaintext" 181 | } 182 | }, 183 | "source": [ 184 | "### Pretrained Language Model (PLM)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": { 190 | "vscode": { 191 | "languageId": "plaintext" 192 | } 193 | }, 194 | "source": [ 195 | " PLMs are initially trained on extensive volumes of unlabelled text to understand fundamental language\n", 196 | " structures (pre-training). They are then fine-tuned on a smaller, task-specific dataset. This ”pre-training\n", 197 | " and fine-tuning” paradigm, exemplified by GPT-2 and BERT, has led to diverse and effective model\n", 198 | " architectures." 
199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": { 204 | "vscode": { 205 | "languageId": "plaintext" 206 | } 207 | }, 208 | "source": [ 209 | "### Large Language Models (LLM)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": { 215 | "vscode": { 216 | "languageId": "plaintext" 217 | } 218 | }, 219 | "source": [ 220 | " LLMs like GPT-3, GPT-4, PaLM, and LLaMA are trained on massive text corpora with tens of\n", 221 | " billions of parameters. LLMs undergo a two-stage process: initial pre-training on a vast corpus followed by alignment with human values." 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": { 227 | "vscode": { 228 | "languageId": "plaintext" 229 | } 230 | }, 231 | "source": [ 232 | "## Overview of Current Leading LLMs" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": { 238 | "vscode": { 239 | "languageId": "plaintext" 240 | } 241 | }, 242 | "source": [ 243 | "LLMs’ rapid development has spurred research into architectural innovations, training strategies, extending context lengths, fine-tuning techniques, and integrating multi-modal data. " 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": { 249 | "vscode": { 250 | "languageId": "plaintext" 251 | } 252 | }, 253 | "source": [ 254 | "
" 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "metadata": { 262 | "vscode": { 263 | "languageId": "plaintext" 264 | } 265 | }, 266 | "source": [ 267 | "## Types of LLM Fine-Tuning" 268 | ] 269 | }, 270 | { 271 | "cell_type": "markdown", 272 | "metadata": { 273 | "vscode": { 274 | "languageId": "plaintext" 275 | } 276 | }, 277 | "source": [ 278 | "### Unsupervised Fine-Tuning" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": { 284 | "vscode": { 285 | "languageId": "plaintext" 286 | } 287 | }, 288 | "source": [ 289 | " This method does not require labelled data. Instead, the LLM is exposed to a large corpus of unla\n", 290 | "belled text from the target domain, refining its understanding of language. This approach is useful for\n", 291 | " new domains like legal or medical fields but is less precise for specific tasks such as classification or\n", 292 | " summarisation." 293 | ] 294 | }, 295 | { 296 | "cell_type": "markdown", 297 | "metadata": { 298 | "vscode": { 299 | "languageId": "plaintext" 300 | } 301 | }, 302 | "source": [ 303 | "### Supervised Fine-Tuning (SFT)" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": { 309 | "vscode": { 310 | "languageId": "plaintext" 311 | } 312 | }, 313 | "source": [ 314 | "SFT involves providing the LLM with labelled data tailored to the target task.\n", 315 | "\n", 316 | " While effective, this method requires substantial labelled data, which can be costly and time-consuming\n", 317 | " to obtain." 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "metadata": { 323 | "vscode": { 324 | "languageId": "plaintext" 325 | } 326 | }, 327 | "source": [ 328 | "### Instruction Fine-Tuning via Prompt Engineering" 329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "metadata": { 334 | "vscode": { 335 | "languageId": "plaintext" 336 | } 337 | }, 338 | "source": [ 339 | "This method relies on providing the LLM with natural language instructions, useful for creating spe\n", 340 | "cialised assistants. It reduces the need for vast amounts of labelled data but depends heavily on the\n", 341 | " quality of the prompts." 342 | ] 343 | }, 344 | { 345 | "cell_type": "markdown", 346 | "metadata": { 347 | "vscode": { 348 | "languageId": "plaintext" 349 | } 350 | }, 351 | "source": [ 352 | "## Pre-training vs Fine-tuning" 353 | ] 354 | }, 355 | { 356 | "cell_type": "markdown", 357 | "metadata": { 358 | "vscode": { 359 | "languageId": "plaintext" 360 | } 361 | }, 362 | "source": [ 363 | "
" 366 | ] 367 | }, 368 | { 369 | "cell_type": "markdown", 370 | "metadata": { 371 | "vscode": { 372 | "languageId": "plaintext" 373 | } 374 | }, 375 | "source": [ 376 | "## Importance of Fine-Tuning LLM" 377 | ] 378 | }, 379 | { 380 | "cell_type": "markdown", 381 | "metadata": { 382 | "vscode": { 383 | "languageId": "plaintext" 384 | } 385 | }, 386 | "source": [ 387 | "1. *Transfer Learning*: Fine-tuning leverages the knowledge acquired during pre-training, adapting it to specific tasks with reduced computation time and resources.\n", 388 | "2. *Reduced Data Requirements*: Fine-tuning requires less labelled data, focusing on tailoring pre-trained features to the target task.\n", 389 | "3. *Improved Generalisation*: Fine-tuning enhances the model’s ability to generalise to specific tasks or domains, capturing general language features and customising them.\n", 390 | "4. *Efficient Model Deployment*: Fine-tuned models are more efficient for real-world applications, being computationally efficient and well-suited for specific tasks.\n", 391 | "5. *Adaptability to Various Tasks*: Fine-tuned LLMs can adapt to a broad range of tasks, performing well across various applications without task-specific architectures.\n", 392 | "6. *Domain-Specific Performance*: Fine-tuning allows models to excel in domain-specific tasks by adjusting to the nuances and vocabulary of the target domain.\n", 393 | "7. *Faster Convergence*: Fine-tuning usually achieves faster convergence, starting with weights that already capture general language features." 394 | ] 395 | }, 396 | { 397 | "cell_type": "markdown", 398 | "metadata": { 399 | "vscode": { 400 | "languageId": "plaintext" 401 | } 402 | }, 403 | "source": [ 404 | "## Retrieval Augmented Generation (RAG)" 405 | ] 406 | }, 407 | { 408 | "cell_type": "markdown", 409 | "metadata": { 410 | "vscode": { 411 | "languageId": "plaintext" 412 | } 413 | }, 414 | "source": [ 415 | "Apopular method to utilise your own data is by incorporating it into the prompt when querying the LLM\n", 416 | " model." 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "metadata": { 422 | "vscode": { 423 | "languageId": "plaintext" 424 | } 425 | }, 426 | "source": [ 427 | " This approach, known as Retrieval-Augmented Generation (RAG), involves retrieving relevant\n", 428 | " data and using it as additional context for the LLM. Instead of depending solely on knowledge from the\n", 429 | " training data, a RAG workflow pulls pertinent information, connecting static LLMs with real-time data\n", 430 | " retrieval. " 431 | ] 432 | }, 433 | { 434 | "cell_type": "markdown", 435 | "metadata": { 436 | "vscode": { 437 | "languageId": "plaintext" 438 | } 439 | }, 440 | "source": [ 441 | "With RAG architecture, organisations can deploy any LLM model and enhance it to return\n", 442 | " relevant results by providing a small amount of their own data" 443 | ] 444 | }, 445 | { 446 | "cell_type": "markdown", 447 | "metadata": { 448 | "vscode": { 449 | "languageId": "plaintext" 450 | } 451 | }, 452 | "source": [ 453 | "This\n", 454 | " process avoids the costs and time associated with fine-tuning or pre-training the model." 455 | ] 456 | }, 457 | { 458 | "cell_type": "markdown", 459 | "metadata": {}, 460 | "source": [ 461 | "
" 464 | ] 465 | }, 466 | { 467 | "cell_type": "markdown", 468 | "metadata": { 469 | "vscode": { 470 | "languageId": "plaintext" 471 | } 472 | }, 473 | "source": [ 474 | "### Traditional RAG Pipeline and Steps" 475 | ] 476 | }, 477 | { 478 | "cell_type": "markdown", 479 | "metadata": { 480 | "vscode": { 481 | "languageId": "plaintext" 482 | } 483 | }, 484 | "source": [ 485 | "1. *Data Indexing*: Organise data efficiently for quick retrieval. This involves processing, chunking,\n", 486 | " and storing data in a vector database using indexing strategies like search indexing, vector indexing,\n", 487 | " and hybrid indexing\n", 488 | "2. *Input Query Processing*: Refine user queries to improve compatibility with indexed data. This\n", 489 | " can include simplification or vector transformation of queries for enhanced search efficiency.\n", 490 | "3. *Searching and Ranking*: Retrieve and rank data based on relevance using search algorithms\n", 491 | " such as TF-IDF, BM25, and deep learning models like BERT to interpret the query’s intent and\n", 492 | " context.\n", 493 | "4. *Prompt Augmentation*: Incorporate relevant information from the search results into the origi\n", 494 | "nal query to provide the LLM with additional context, enhancing response accuracy and relevance.\n", 495 | "5. *Response Generation*: Usetheaugmentedprompttogenerate responses that combine the LLM’s\n", 496 | " knowledge with current, specific data, ensuring high-quality, contextually grounded answers.\n" 497 | ] 498 | }, 499 | { 500 | "cell_type": "markdown", 501 | "metadata": { 502 | "vscode": { 503 | "languageId": "plaintext" 504 | } 505 | }, 506 | "source": [ 507 | "### Benefits of Using RAG" 508 | ] 509 | }, 510 | { 511 | "cell_type": "markdown", 512 | "metadata": { 513 | "vscode": { 514 | "languageId": "plaintext" 515 | } 516 | }, 517 | "source": [ 518 | "+ *Up-to-Date and Accurate Responses*: Enhances the LLM’s responses with current external\n", 519 | " data, improving accuracy and relevance.\n", 520 | "+ *Reducing Inaccurate Responses*: Grounds the LLM’s output in relevant knowledge, reducing\n", 521 | " the risk of generating incorrect information.\n", 522 | "+ *Domain-Specific Responses*: Delivers contextually relevant responses tailored to an organisa\n", 523 | "tion’s proprietary data.\n", 524 | "+ *EfficiencyandCost-Effectiveness*: Offersacost-effective method for customising LLMs without\n", 525 | " extensive model fine-tuning." 526 | ] 527 | }, 528 | { 529 | "cell_type": "markdown", 530 | "metadata": { 531 | "vscode": { 532 | "languageId": "plaintext" 533 | } 534 | }, 535 | "source": [ 536 | "### Challenges and Considerations in Serving RAG" 537 | ] 538 | }, 539 | { 540 | "cell_type": "markdown", 541 | "metadata": { 542 | "vscode": { 543 | "languageId": "plaintext" 544 | } 545 | }, 546 | "source": [ 547 | "1. *User Experience*: Ensuring rapid response times suitable for real-time applications.\n", 548 | "2. *Cost Efficiency*: Managing the costs associated with serving millions of responses.\n", 549 | "3. *Accuracy*: Ensuring outputs are accurate to avoid misinformation.\n", 550 | "4. *Recency and Relevance*: Keeping responses and content current with the latest data.\n", 551 | "5. *Business Context Awareness*: Aligning LLM responses with specific business contexts.\n", 552 | "6. *Service Scalability*: Managing increased capacity while controlling costs.\n", 553 | "7. *Security and Governance*: Implementing protocols for data security, privacy, and governance." 
554 | ] 555 | }, 556 | { 557 | "cell_type": "markdown", 558 | "metadata": {}, 559 | "source": [ 560 | "### Considerations for Choosing Between RAG and Fine-Tuning" 561 | ] 562 | }, 563 | { 564 | "cell_type": "markdown", 565 | "metadata": { 566 | "vscode": { 567 | "languageId": "plaintext" 568 | } 569 | }, 570 | "source": [ 571 | "When considering external data access, RAG is likely a superior option for applications needing to access\n", 572 | " external data sources. Fine-tuning, on the other hand, is more suitable if you require the model to ad\n", 573 | "just its behaviour, and writing style, or incorporate domain-specific knowledge. " 574 | ] 575 | }, 576 | { 577 | "cell_type": "markdown", 578 | "metadata": { 579 | "vscode": { 580 | "languageId": "plaintext" 581 | } 582 | }, 583 | "source": [ 584 | " In terms of suppressing\n", 585 | " hallucinations and ensuring accuracy, RAG systems tend to perform better as they are less prone to gen\n", 586 | "erating incorrect information. " 587 | ] 588 | }, 589 | { 590 | "cell_type": "markdown", 591 | "metadata": { 592 | "vscode": { 593 | "languageId": "plaintext" 594 | } 595 | }, 596 | "source": [ 597 | "
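Before moving on, the RAG workflow described above (data indexing, query processing, searching and ranking, prompt augmentation, response generation) can be sketched in a few lines. The snippet below is a toy illustration that uses TF-IDF retrieval from scikit-learn and a placeholder `generate` function standing in for the LLM call; the documents and query are invented for the example.

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# 1. Data indexing: vectorise the documents (TF-IDF here; a vector database in production)
documents = [
    "Our refund policy allows returns within 30 days of purchase.",
    "Support is available Monday to Friday, 9am to 5pm.",
    "Premium subscribers get priority shipping on all orders.",
]
vectorizer = TfidfVectorizer()
doc_vectors = vectorizer.fit_transform(documents)


def generate(prompt: str) -> str:
    """Placeholder for the LLM call; swap in whichever model or API you use."""
    return "[model answer grounded in the supplied context]\n" + prompt


# 2-3. Query processing, searching and ranking
query = "How long do customers have to return an item?"
scores = cosine_similarity(vectorizer.transform([query]), doc_vectors)[0]
top_context = [documents[i] for i in scores.argsort()[::-1][:2]]

# 4-5. Prompt augmentation and response generation
augmented_prompt = "Context:\n" + "\n".join(top_context) + f"\n\nQuestion: {query}\nAnswer:"
print(generate(augmented_prompt))
```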
" 600 | ] 601 | } 602 | ], 603 | "metadata": { 604 | "language_info": { 605 | "name": "python" 606 | } 607 | }, 608 | "nbformat": 4, 609 | "nbformat_minor": 2 610 | } 611 | -------------------------------------------------------------------------------- /others/papers/UltimateGuideFromBasicsToBreakthrough/Chapter6.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "vscode": { 7 | "languageId": "plaintext" 8 | } 9 | }, 10 | "source": [ 11 | "# Chapter 6: Stage 4: Selection of Fine-Tuning Techniques and Appropriate Model Configurations" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": { 17 | "vscode": { 18 | "languageId": "plaintext" 19 | } 20 | }, 21 | "source": [ 22 | "## Steps Involved in Fine-Tuning" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": { 28 | "vscode": { 29 | "languageId": "plaintext" 30 | } 31 | }, 32 | "source": [ 33 | "1. **Initialise the Pre-Trained Tokenizer and Model**\n", 34 | "2. **Modify the Model’s Output Layer**\n", 35 | "3. **Choose an Appropriate Fine-Tuning Strategy**: Select the fine-tuning strategy that best fits the task and the model architecture. Some Options include:\n", 36 | "+ Task-Specific Fine-Tuning: For tasks such as text summarisation, code generation, classification, and question answering, adapt the model using relevant datasets.\n", 37 | "+ Domain-Specific Fine-Tuning: Tailor the model to comprehend and generate text relevant to specific domains, such as medical, financial, or legal fields.\n", 38 | "+ Parameter-Efficient Fine-Tuning (PEFT): Techniques like LoRA, QLoRA, and adapters allow for fine-tuning with reduced computational costs by updating a small subset of model parameters.\n", 39 | "+ Half Fine-Tuning (HFT): Balance between retaining pre-trained knowledge and learning new tasks by updating only half of the model’s parameters during each fine-tuning round.\n", 40 | "\n", 41 | "4. **Set Up the Training Loop**\n", 42 | "5. **Incorporate Techniques for Handling Multiple Tasks**\n", 43 | "6. **Monitor Performance on a Validation Set**\n", 44 | "7. **Optimise Model Using Advanced Techniques**: Employ techniques such as Proximal Policy Optimisation (PPO) for reinforcement learning scenarios, or Direct Preference Optimisation (DPO) for aligning model outputs with human preferences. These techniques are particularly useful in fine-tuning models for tasks requiring nuanced decision-making or human-like responses.\n", 45 | "\n", 46 | "8. **Prune and optimise the Model** (if necessary)\n", 47 | "9. **Continuous Evaluation and Iteration**" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": { 53 | "vscode": { 54 | "languageId": "plaintext" 55 | } 56 | }, 57 | "source": [ 58 | "## Fine-Tuning Strategies for LLMs" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": { 64 | "vscode": { 65 | "languageId": "plaintext" 66 | } 67 | }, 68 | "source": [ 69 | "### Task-Specific Fine-Tuning" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": { 75 | "vscode": { 76 | "languageId": "plaintext" 77 | } 78 | }, 79 | "source": [ 80 | "
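The hedged sketch below ties together steps 1, 2 and 4 from the fine-tuning steps listed above for a task-specific (text-classification) setup, using the Hugging Face Trainer; the checkpoint, dataset, and hyperparameters are examples only, not values taken from the text.

```python
from datasets import load_dataset
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          Trainer, TrainingArguments)

checkpoint = "distilbert-base-uncased"  # example base model

# Step 1: initialise the pre-trained tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Step 2: attach a fresh classification head sized for the task
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

dataset = load_dataset("imdb")  # example task-specific dataset


def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=256)


tokenized = dataset.map(tokenize, batched=True)

# Step 4: set up the training loop
args = TrainingArguments(output_dir="out", per_device_train_batch_size=16,
                         num_train_epochs=1, learning_rate=2e-5)
trainer = Trainer(model=model, args=args,
                  train_dataset=tokenized["train"].shuffle(seed=42).select(range(2000)),
                  eval_dataset=tokenized["test"].select(range(500)))
trainer.train()
```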
" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": { 88 | "vscode": { 89 | "languageId": "plaintext" 90 | } 91 | }, 92 | "source": [ 93 | "### Domain-Specific Fine-Tuning" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": { 99 | "vscode": { 100 | "languageId": "plaintext" 101 | } 102 | }, 103 | "source": [ 104 | "## Parameter-Efficient Fine-Tuning (PEFT) Techniques" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": { 110 | "vscode": { 111 | "languageId": "plaintext" 112 | } 113 | }, 114 | "source": [ 115 | "Parameter Efficient Fine Tuning (PEFT) is an impactful NLP technique that adeptly adapts pre-trained language models to various applications with remarkable efficiency. PEFT methods fine-tune only a small subset of (additional) model parameters while keeping most of the pre-trained LLM parameters frozen, thereby significantly reducing computational and storage costs. This approach mitigates the issue of catastrophic forgetting, a phenomenon where neural networks lose previously acquired knowledge and experience a significant performance decline on previously learned tasks when trained on new datasets. PEFT methods have demonstrated superior performance compared to full fine-tuning, particularly in low-data scenarios, and exhibit better generalisation to out-of-domain contexts. " 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": { 121 | "vscode": { 122 | "languageId": "plaintext" 123 | } 124 | }, 125 | "source": [ 126 | "### Adapters" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": { 132 | "vscode": { 133 | "languageId": "plaintext" 134 | } 135 | }, 136 | "source": [ 137 | "Adapter-based methods introduce additional trainable parameters after the attention and fully connected\n", 138 | " layers of a frozen pre-trained model, aiming to reduce memory usage and accelerate training. \n", 139 | " \n", 140 | " The specific approach varies depending on the adapter; it might involve adding an extra layer or representing the\n", 141 | " weight updates delta as a low-rank decomposition of the weight matrix.\n", 142 | " \n", 143 | " Regardless of the method,\n", 144 | " adapters are generally small yet achieve performance comparable to fully fine-tuned models, allowing for\n", 145 | " the training of larger models with fewer resources." 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": { 151 | "vscode": { 152 | "languageId": "plaintext" 153 | } 154 | }, 155 | "source": [ 156 | "
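As a minimal sketch of the adapter idea described above: a bottleneck adapter is a small down-projection, non-linearity, and up-projection with a residual connection, inserted after a frozen sub-layer. The dimensions below are arbitrary example values.

```python
import torch
import torch.nn as nn


class BottleneckAdapter(nn.Module):
    """Small trainable block added after a frozen sub-layer's output."""

    def __init__(self, hidden_dim=768, bottleneck_dim=32):
        super().__init__()
        self.down = nn.Linear(hidden_dim, bottleneck_dim)
        self.up = nn.Linear(bottleneck_dim, hidden_dim)

    def forward(self, hidden_states):
        # The residual connection keeps the frozen model's behaviour as the starting point
        return hidden_states + self.up(torch.relu(self.down(hidden_states)))


adapter = BottleneckAdapter()
frozen_output = torch.randn(4, 16, 768)   # e.g. output of a frozen attention block
print(adapter(frozen_output).shape)       # torch.Size([4, 16, 768])
print(sum(p.numel() for p in adapter.parameters()), "trainable adapter parameters")
```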
" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": { 164 | "vscode": { 165 | "languageId": "plaintext" 166 | } 167 | }, 168 | "source": [ 169 | "### Low-Rank Adaptation (LoRA)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "markdown", 174 | "metadata": { 175 | "vscode": { 176 | "languageId": "plaintext" 177 | } 178 | }, 179 | "source": [ 180 | " Low-Rank Adaptation (LoRA) is a technique designed for fine-tuning large language models, which\n", 181 | " modifies the fine-tuning process by freezing the original model weights and applying changes to a separate\n", 182 | " set of weights, added to the original parameters. " 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": { 188 | "vscode": { 189 | "languageId": "plaintext" 190 | } 191 | }, 192 | "source": [ 193 | " LoRA transforms the model parameters into a lower\n", 194 | "rank dimension, reducing the number of trainable parameters, speeding up the process, and lowering\n", 195 | " costs.\n", 196 | " \n", 197 | " This method is particularly useful in scenarios where multiple clients require fine-tuned models\n", 198 | " for different applications, allowing for the creation of specific weights for each use case without the\n", 199 | " need for separate models. " 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": { 205 | "vscode": { 206 | "languageId": "plaintext" 207 | } 208 | }, 209 | "source": [ 210 | "
" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": { 218 | "vscode": { 219 | "languageId": "plaintext" 220 | } 221 | }, 222 | "source": [ 223 | "
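As a minimal sketch of the LoRA approach described above, using the Hugging Face peft library: the base model's weights stay frozen while small low-rank update matrices are trained in their place. The base checkpoint, rank, and target modules are example values, not prescriptions from the text.

```python
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained("gpt2")  # example base model

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,                        # rank of the low-rank update matrices
    lora_alpha=16,              # scaling applied to the update
    lora_dropout=0.05,
    target_modules=["c_attn"],  # attention projection in GPT-2; module names are model-specific
)

model = get_peft_model(base, lora_config)
model.print_trainable_parameters()  # only the adapter weights are trainable
```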
" 226 | ] 227 | }, 228 | { 229 | "cell_type": "markdown", 230 | "metadata": { 231 | "vscode": { 232 | "languageId": "plaintext" 233 | } 234 | }, 235 | "source": [ 236 | "### QLoRA" 237 | ] 238 | }, 239 | { 240 | "cell_type": "markdown", 241 | "metadata": { 242 | "vscode": { 243 | "languageId": "plaintext" 244 | } 245 | }, 246 | "source": [ 247 | " QLoRA is an extended version of LoRA designed for greater memory efficiency in large language mod\n", 248 | "els (LLMs) by quantising weight parameters to 4-bit precision. Typically, LLM parameters are stored\n", 249 | " in a 32-bit format, but QLoRA compresses them to 4-bit, significantly reducing the memory footprint.\n", 250 | " This allows fine-tuning on less powerful hardware, including consumer GPUs. QLoRA also quantises the\n", 251 | " weights of the LoRA adapters from 8-bit to 4-bit, further decreasing memory and storage requirements. Despite the reduction in bit precision, QLoRA maintains performance levels comparable\n", 252 | " to traditional 16-bit fine-tuning" 253 | ] 254 | }, 255 | { 256 | "cell_type": "markdown", 257 | "metadata": { 258 | "vscode": { 259 | "languageId": "plaintext" 260 | } 261 | }, 262 | "source": [ 263 | "### Weight-Decomposed Low-Rank Adaptation (DoRA)" 264 | ] 265 | }, 266 | { 267 | "cell_type": "markdown", 268 | "metadata": { 269 | "vscode": { 270 | "languageId": "plaintext" 271 | } 272 | }, 273 | "source": [ 274 | "Weight-Decomposed Low-Rank Adaptation (DoRA) is a novel fine-tuning methodology designed to\n", 275 | " optimise pre-trained models by decomposing their weights into magnitude and directional components.\n", 276 | "\n", 277 | " This approach leverages the efficiency of Low-Rank Adaptation (LoRA) for directional updates, facili\n", 278 | "tating substantial parameter updates without altering the entire model architecture. \n", 279 | "\n", 280 | "DoRA addresses the computational challenges associated with traditional full fine-tuning (FT) by maintaining model\n", 281 | " simplicity and inference efficiency, while simultaneously bridging the performance gap typically observed\n", 282 | " between LoRA and FT. " 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": { 288 | "vscode": { 289 | "languageId": "plaintext" 290 | } 291 | }, 292 | "source": [ 293 | "
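Returning to the QLoRA approach described a little earlier: the hedged sketch below loads a base model with 4-bit weights via bitsandbytes and attaches LoRA adapters on top. The checkpoint is only an example (any causal language model works), and flag names can vary slightly between library versions.

```python
import torch
from peft import LoraConfig, TaskType, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # store the frozen base weights in 4-bit
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,  # computation still runs in 16-bit
)

base = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",            # example checkpoint
    quantization_config=bnb_config,
    device_map="auto",
)
base = prepare_model_for_kbit_training(base)

lora_config = LoraConfig(task_type=TaskType.CAUSAL_LM, r=16, lora_alpha=32,
                         lora_dropout=0.05, target_modules=["q_proj", "v_proj"])
model = get_peft_model(base, lora_config)
model.print_trainable_parameters()
```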
" 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "metadata": { 301 | "vscode": { 302 | "languageId": "plaintext" 303 | } 304 | }, 305 | "source": [ 306 | "**Comparison between LoRA and DoRA**" 307 | ] 308 | }, 309 | { 310 | "cell_type": "markdown", 311 | "metadata": { 312 | "vscode": { 313 | "languageId": "plaintext" 314 | } 315 | }, 316 | "source": [ 317 | "
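For completeness, recent releases of the peft library expose DoRA through the same LoraConfig shown above via a `use_dora` flag. Assuming a version with that support, switching from LoRA to DoRA is a one-argument change.

```python
from peft import LoraConfig, TaskType

# Identical to the plain LoRA configuration except for the weight-decomposed update.
# `use_dora` exists only in newer peft releases; older versions will reject the argument.
dora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=16,
    target_modules=["c_attn"],  # model-specific, as before
    use_dora=True,
)
```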
" 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": { 325 | "vscode": { 326 | "languageId": "plaintext" 327 | } 328 | }, 329 | "source": [ 330 | "### Fine-Tuning with Multiple Adapters" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": { 336 | "vscode": { 337 | "languageId": "plaintext" 338 | } 339 | }, 340 | "source": [ 341 | "The PEFT library simplifies the process of merging adapters with its add_weighted_adapter function 3, which offers three distinct methods:\n", 342 | "\n", 343 | "1. Concatenation: This straightforward method concatenates the parameters of the adapters. For instance, if two adapters each have a rank of 16, the resulting adapter will have a rank of 32. This method is highly efficient.\n", 344 | "2. Linear Combination: Although less documented, this method appears to perform a weighted sum of the adapters’ parameters.\n", 345 | "3. SVD: The default method employs singular value decomposition through torch.linalg.svd. While versatile, it is notably slower than the other methods, particularly for adapters with high ranks (greater than 100), which can take several hours." 346 | ] 347 | }, 348 | { 349 | "cell_type": "markdown", 350 | "metadata": { 351 | "vscode": { 352 | "languageId": "plaintext" 353 | } 354 | }, 355 | "source": [ 356 | "
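The `add_weighted_adapter` call described above can be sketched as follows, assuming two LoRA adapters have already been trained and saved under placeholder paths; the adapter names and weights are illustrative.

```python
from peft import PeftModel
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained("gpt2")  # example base model

# Load two previously trained LoRA adapters (paths and names are placeholders)
model = PeftModel.from_pretrained(base, "adapters/adapter_a", adapter_name="adapter_a")
model.load_adapter("adapters/adapter_b", adapter_name="adapter_b")

# Merge them into a new adapter; combination_type may be "cat", "linear" or "svd"
model.add_weighted_adapter(
    adapters=["adapter_a", "adapter_b"],
    weights=[0.7, 0.3],
    adapter_name="merged",
    combination_type="cat",
)
model.set_adapter("merged")
```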
" 359 | ] 360 | }, 361 | { 362 | "cell_type": "markdown", 363 | "metadata": { 364 | "vscode": { 365 | "languageId": "plaintext" 366 | } 367 | }, 368 | "source": [ 369 | "## Half Fine Tuning" 370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "metadata": { 375 | "vscode": { 376 | "languageId": "plaintext" 377 | } 378 | }, 379 | "source": [ 380 | "Half Fine-Tuning (HFT) is a technique designed to balance the retention of foundational knowledge\n", 381 | " with the acquisition of new skills in large language models (LLMs).\n", 382 | " \n", 383 | "HFT involves freezing half of the\n", 384 | " model’s parameters during each fine-tuning round while updating the other half, allowing the model to\n", 385 | " retain pre-trained knowledge and enhance new task performance without altering the model architecture" 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "metadata": { 391 | "vscode": { 392 | "languageId": "plaintext" 393 | } 394 | }, 395 | "source": [ 396 | "### Benefits of using Half Fine tuning" 397 | ] 398 | }, 399 | { 400 | "cell_type": "markdown", 401 | "metadata": { 402 | "vscode": { 403 | "languageId": "plaintext" 404 | } 405 | }, 406 | "source": [ 407 | "1. Recovery of Pre-Trained Knowledge\n", 408 | "\n", 409 | "2. Enhanced Performance: Research experiments shows that HFT maintains or even surpasses the performance of full fine-tuning (FFT) on downstream tasks, demonstrating its effectiveness in balancing knowledge retention with task-specific learning.\n", 410 | "\n", 411 | "3. Robustness\n", 412 | "\n", 413 | "4. Simplicity and Scalability\n", 414 | "\n", 415 | "5. Versatility" 416 | ] 417 | }, 418 | { 419 | "cell_type": "markdown", 420 | "metadata": { 421 | "vscode": { 422 | "languageId": "plaintext" 423 | } 424 | }, 425 | "source": [ 426 | "
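The core idea of HFT can be sketched in plain PyTorch: before each fine-tuning round, freeze a randomly chosen half of the parameter tensors and update only the other half. This is a simplified reading of the method, not the paper's exact selection scheme.

```python
import random

import torch.nn as nn


def apply_half_fine_tuning(model: nn.Module, seed: int) -> None:
    """Freeze a random half of the parameter tensors; leave the rest trainable."""
    params = list(model.parameters())
    random.Random(seed).shuffle(params)
    half = len(params) // 2
    for p in params[:half]:
        p.requires_grad = False
    for p in params[half:]:
        p.requires_grad = True


model = nn.Sequential(nn.Linear(128, 256), nn.ReLU(), nn.Linear(256, 10))  # stand-in for an LLM

for round_idx in range(3):  # each fine-tuning round updates a different half
    apply_half_fine_tuning(model, seed=round_idx)
    n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"round {round_idx}: {n_trainable} trainable parameters")
    # ... run the usual training loop for this round here ...
```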
" 429 | ] 430 | }, 431 | { 432 | "cell_type": "markdown", 433 | "metadata": { 434 | "vscode": { 435 | "languageId": "plaintext" 436 | } 437 | }, 438 | "source": [ 439 | "### Comparison between HFT and LoRA" 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": { 445 | "vscode": { 446 | "languageId": "plaintext" 447 | } 448 | }, 449 | "source": [ 450 | "
" 453 | ] 454 | }, 455 | { 456 | "cell_type": "markdown", 457 | "metadata": { 458 | "vscode": { 459 | "languageId": "plaintext" 460 | } 461 | }, 462 | "source": [ 463 | "## Lamini memory tuning" 464 | ] 465 | }, 466 | { 467 | "cell_type": "markdown", 468 | "metadata": { 469 | "vscode": { 470 | "languageId": "plaintext" 471 | } 472 | }, 473 | "source": [ 474 | " Foundation models often follow a training regimen similar to the Chinchilla recipe, which prescribes\n", 475 | " training for a single epoch on a massive corpus, such as training Llama 2 7B on about one trillion\n", 476 | " tokens.\n", 477 | " \n", 478 | " This approach results in substantial loss and is geared more towards enhancing generalisation\n", 479 | " and creativity where a degree of randomness in token selection is permissible. \n", 480 | " \n", 481 | " However, it falls short for\n", 482 | " tasks demanding high factual precision." 483 | ] 484 | }, 485 | { 486 | "cell_type": "markdown", 487 | "metadata": { 488 | "vscode": { 489 | "languageId": "plaintext" 490 | } 491 | }, 492 | "source": [ 493 | "In contrast, Lamini Memory Tuning delves deeper by analysing\n", 494 | " the loss of individual facts, significantly improving the accuracy of factual recall. " 495 | ] 496 | }, 497 | { 498 | "cell_type": "markdown", 499 | "metadata": { 500 | "vscode": { 501 | "languageId": "plaintext" 502 | } 503 | }, 504 | "source": [ 505 | "By augmenting a\n", 506 | " model with additional parameters specifically for memory (e.g., an 8B parameter model with an extra 2B\n", 507 | " parameters for weights), Lamini enables the model to memorise and accurately recall a significant number\n", 508 | " of facts, closely aligning performance with LLM scaling laws without compromising on generalisation" 509 | ] 510 | }, 511 | { 512 | "cell_type": "markdown", 513 | "metadata": { 514 | "vscode": { 515 | "languageId": "plaintext" 516 | } 517 | }, 518 | "source": [ 519 | "### Lamini-1- A model architecture based on Lamini" 520 | ] 521 | }, 522 | { 523 | "cell_type": "markdown", 524 | "metadata": { 525 | "vscode": { 526 | "languageId": "plaintext" 527 | } 528 | }, 529 | "source": [ 530 | " Departing from traditional transformer-based designs, the Lamini-1 model architectur employs a massive mixture of memory experts (MoME). This system features a pre-trained transformer\n", 531 | " backbone augmented by adapters that are dynamically selected from an index using cross-attention\n", 532 | " mechanisms. \n", 533 | " \n", 534 | " These adapters function similarly to experts in MoE (Mixture of Expert) architectures, and the network is\n", 535 | " trained end-to-end while freezing the backbone. This setup allows for specific facts to be stored exactly\n", 536 | " in the selected experts." 537 | ] 538 | }, 539 | { 540 | "cell_type": "markdown", 541 | "metadata": { 542 | "vscode": { 543 | "languageId": "plaintext" 544 | } 545 | }, 546 | "source": [ 547 | "
" 550 | ] 551 | }, 552 | { 553 | "cell_type": "markdown", 554 | "metadata": { 555 | "vscode": { 556 | "languageId": "plaintext" 557 | } 558 | }, 559 | "source": [ 560 | "### Systems Optimisations for Banishing Hallucinations" 561 | ] 562 | }, 563 | { 564 | "cell_type": "markdown", 565 | "metadata": { 566 | "vscode": { 567 | "languageId": "plaintext" 568 | } 569 | }, 570 | "source": [ 571 | " The MoME architecture is designed to minimise the computational demand required to memorise facts.\n", 572 | " During training, a subset of experts, such as 32 out of a million, is selected for each fact. The weights of\n", 573 | " the backbone network and the cross attention used to select the expert are frozen, and gradient descent\n", 574 | " steps are taken until the loss is sufficiently reduced to memorise the fact. " 575 | ] 576 | }, 577 | { 578 | "cell_type": "markdown", 579 | "metadata": { 580 | "vscode": { 581 | "languageId": "plaintext" 582 | } 583 | }, 584 | "source": [ 585 | "## Mixture of Experts" 586 | ] 587 | }, 588 | { 589 | "cell_type": "markdown", 590 | "metadata": { 591 | "vscode": { 592 | "languageId": "plaintext" 593 | } 594 | }, 595 | "source": [ 596 | " A mixture of experts (MoE) is an architectural design for neural networks that divides the computation\n", 597 | " of a layer or operation (e.g., linear layers, MLPs, or attention projection) into several specialised subnet\n", 598 | "works, referred to as ”experts”.\n", 599 | "\n", 600 | " Each expert independently carries out its computation, and the results\n", 601 | " are aggregated to produce the final output of the MoE layer.\n", 602 | " \n", 603 | " MoE architectures can be categorised as\n", 604 | " either dense, where every expert is engaged for each input, or sparse, where only a subset of experts is\n", 605 | " utilised for each input" 606 | ] 607 | }, 608 | { 609 | "cell_type": "markdown", 610 | "metadata": { 611 | "vscode": { 612 | "languageId": "plaintext" 613 | } 614 | }, 615 | "source": [ 616 | "### Mixtral 8x7B Architecture and Performance" 617 | ] 618 | }, 619 | { 620 | "cell_type": "markdown", 621 | "metadata": { 622 | "vscode": { 623 | "languageId": "plaintext" 624 | } 625 | }, 626 | "source": [ 627 | " Mixtral 8x7B employs a Sparse Mixture of Experts (SMoE) architecture, mirroring the\n", 628 | " structure of Mistral 7B but incorporating eight feedforward blocks (experts) in each layer.\n", 629 | " \n", 630 | " For every\n", 631 | " token at each layer, a router network selects two experts to process the current state and combine their\n", 632 | " outputs. Although each token interacts with only two experts at a time, the selected experts can vary at\n", 633 | " each timestep. Consequently, each token has access to 47 billion parameters but utilises only 13 billion\n", 634 | " active parameters during inference. " 635 | ] 636 | }, 637 | { 638 | "cell_type": "markdown", 639 | "metadata": { 640 | "vscode": { 641 | "languageId": "plaintext" 642 | } 643 | }, 644 | "source": [ 645 | "
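To illustrate the sparse routing described above (each token is processed by the top-2 of eight expert feed-forward networks and their weighted outputs are summed), here is a compact, self-contained PyTorch sketch; the dimensions are arbitrary, and real implementations add load-balancing losses and expert-capacity limits.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class SparseMoE(nn.Module):
    def __init__(self, dim=64, hidden=256, n_experts=8, top_k=2):
        super().__init__()
        self.router = nn.Linear(dim, n_experts)
        self.experts = nn.ModuleList(
            nn.Sequential(nn.Linear(dim, hidden), nn.SiLU(), nn.Linear(hidden, dim))
            for _ in range(n_experts)
        )
        self.top_k = top_k

    def forward(self, x):                    # x: (tokens, dim)
        scores = self.router(x)              # (tokens, n_experts)
        top_vals, top_idx = scores.topk(self.top_k, dim=-1)
        gates = F.softmax(top_vals, dim=-1)  # weights for the selected experts only
        out = torch.zeros_like(x)
        for slot in range(self.top_k):
            for e, expert in enumerate(self.experts):
                mask = top_idx[:, slot] == e  # tokens routed to expert e in this slot
                if mask.any():
                    out[mask] += gates[mask, slot].unsqueeze(-1) * expert(x[mask])
        return out


tokens = torch.randn(16, 64)                 # 16 token representations
print(SparseMoE()(tokens).shape)             # torch.Size([16, 64])
```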
" 648 | ] 649 | }, 650 | { 651 | "cell_type": "markdown", 652 | "metadata": { 653 | "vscode": { 654 | "languageId": "plaintext" 655 | } 656 | }, 657 | "source": [ 658 | "## Mixture of Agents" 659 | ] 660 | }, 661 | { 662 | "cell_type": "markdown", 663 | "metadata": { 664 | "vscode": { 665 | "languageId": "plaintext" 666 | } 667 | }, 668 | "source": [ 669 | "A recent study has investigated\n", 670 | " leveraging the collective expertise of multiple LLMs to develop a more capable and robust model, a\n", 671 | " method known as Mixture of Agents (MoA)" 672 | ] 673 | }, 674 | { 675 | "cell_type": "markdown", 676 | "metadata": { 677 | "vscode": { 678 | "languageId": "plaintext" 679 | } 680 | }, 681 | "source": [ 682 | "
" 685 | ] 686 | }, 687 | { 688 | "cell_type": "markdown", 689 | "metadata": { 690 | "vscode": { 691 | "languageId": "plaintext" 692 | } 693 | }, 694 | "source": [ 695 | "### Methodology" 696 | ] 697 | }, 698 | { 699 | "cell_type": "markdown", 700 | "metadata": { 701 | "vscode": { 702 | "languageId": "plaintext" 703 | } 704 | }, 705 | "source": [ 706 | "To enhance collaboration among multiple LLMs, it is essential to understand their individual strengths and classify them accordingly. The classification includes:\n", 707 | "\n", 708 | "1. Proposers: These models excel at generating valuable reference responses for other models. While they may not perform exceptionally on their own, they provide useful context and varied perspectives that improve the final output when utilised by an aggregator.\n", 709 | "2. Aggregators: These models are adept at merging responses from various models into a single high-quality result. An effective aggregator should maintain or even enhance the quality of the final response, regardless of the quality of the individual inputs." 710 | ] 711 | }, 712 | { 713 | "cell_type": "markdown", 714 | "metadata": { 715 | "vscode": { 716 | "languageId": "plaintext" 717 | } 718 | }, 719 | "source": [ 720 | "## Proximal Policy Optimisation" 721 | ] 722 | }, 723 | { 724 | "cell_type": "markdown", 725 | "metadata": { 726 | "vscode": { 727 | "languageId": "plaintext" 728 | } 729 | }, 730 | "source": [ 731 | "## Direct Preference Optimisation (DPO)" 732 | ] 733 | }, 734 | { 735 | "cell_type": "markdown", 736 | "metadata": { 737 | "vscode": { 738 | "languageId": "plaintext" 739 | } 740 | }, 741 | "source": [ 742 | "## Optimised Routing and Pruning Operations " 743 | ] 744 | } 745 | ], 746 | "metadata": { 747 | "language_info": { 748 | "name": "python" 749 | } 750 | }, 751 | "nbformat": 4, 752 | "nbformat_minor": 2 753 | } 754 | --------------------------------------------------------------------------------