├── .gitignore ├── 2_tensor_program_abstraction.ipynb ├── 3_TensorIR_Tensor_Program_Abstraction_Case_Study_Action.ipynb ├── 4_Build_End_to_End_Model.ipynb ├── 5_Automatic_Program_Optimization.ipynb ├── 6_Integration_with_Machine_Learning_Frameworks.ipynb ├── 7_GPU_and_Specialized_Hardware.ipynb ├── 8_GPU_and_Specialized_Hardware_part2.ipynb ├── 9_Computational_Graph_Optimization.ipynb ├── LICENSE ├── README.md ├── assignment ├── assignment1.ipynb └── assignment1_zh.ipynb ├── mlc-llm ├── models │ ├── demo_CodeLlama_13b.ipynb │ ├── demo_CodeLlama_7b.ipynb │ ├── demo_WizardLM_Math_Coder.ipynb │ └── demo_gemma.ipynb ├── tutorial_add_new_model_architecture_in_tvm_nn_module.ipynb ├── tutorial_chat_module_getting_started.ipynb ├── tutorial_mlc_xgrammar_structured_generation.ipynb └── tutorial_raw_text_generation.ipynb └── tutorial ├── How_to_add_model_architeture_in_MLC_LLM.ipynb ├── tutorial_tensor_core-zh.ipynb └── tutorial_tensor_core.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .mlc 107 | .venv 108 | env/ 109 | venv/ 110 | ENV/ 111 | env.bak/ 112 | venv.bak/ 113 | 114 | # Spyder project settings 115 | .spyderproject 116 | .spyproject 117 | 118 | # Rope project settings 119 | .ropeproject 120 | 121 | # mkdocs documentation 122 | /site 123 | 124 | # mypy 125 | .mypy_cache/ 126 | .dmypy.json 127 | dmypy.json 128 | 129 | # Pyre type checker 130 | .pyre/ 131 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # notebooks -------------------------------------------------------------------------------- /assignment/assignment1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MLC Assignment 1: End-to-End Model Execution" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Section 1: Model Preparation\n", 15 | "\n", 16 | "To get you familiar with the process of building and manipulating an end-to-end model using MLC, let's start from a simple image classification model.\n", 17 | "\n", 18 | "We first use the following commands to install necessary packages." 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "!python3 -m pip install mlc-ai-nightly -f https://mlc.ai/wheels\n", 28 | "!python3 -m pip install torch torchvision torchaudio torchsummary --extra-index-url https://download.pytorch.org/whl/cpu" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "import numpy as np\n", 38 | "import pickle as pkl\n", 39 | "import torch\n", 40 | "import torch.nn.functional as F\n", 41 | "import torchvision\n", 42 | "import tvm\n", 43 | "import tvm.testing\n", 44 | "\n", 45 | "from matplotlib import pyplot as plt\n", 46 | "from torch import nn\n", 47 | "from torchvision import transforms\n", 48 | "from tvm import topi, relax, te\n", 49 | "from tvm.script import tir as T\n" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "Below is the model defined in PyTorch. It accepts a batch of images as input, and pass them through convolution layer, activation layer, pooling layer and fully-connected layers in order." 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "batch_size = 4\n", 66 | "input_shape = (batch_size, 1, 28, 28) # NCHW layout\n", 67 | "\n", 68 | "\n", 69 | "def pytorch_model():\n", 70 | " list = []\n", 71 | " list.append(nn.Conv2d(in_channels=1, out_channels=32, kernel_size=(3, 3), bias=True))\n", 72 | " list.append(nn.ReLU())\n", 73 | " list.append(nn.MaxPool2d(kernel_size=(2, 2)))\n", 74 | " list.append(nn.Flatten())\n", 75 | " list.append(nn.Linear(in_features=5408, out_features=100, bias=True))\n", 76 | " list.append(nn.ReLU())\n", 77 | " list.append(nn.Linear(in_features=100, out_features=10, bias=True))\n", 78 | " list.append(nn.Softmax(dim=1))\n", 79 | "\n", 80 | " model = nn.Sequential(*list).cpu()\n", 81 | " name_map = {\n", 82 | " \"0.weight\": \"conv2d_weight\",\n", 83 | " \"0.bias\": \"conv2d_bias\",\n", 84 | " \"4.weight\": \"linear0_weight\",\n", 85 | " \"4.bias\": \"linear0_bias\",\n", 86 | " \"6.weight\": \"linear1_weight\",\n", 87 | " \"6.bias\": \"linear1_bias\",\n", 88 | " }\n", 89 | " for name, param in model.named_parameters():\n", 90 | " param.data = torch.from_numpy(weight_map[name_map[name]]).cpu()\n", 91 | " return model\n" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "We provide a pre-trained weight map for this model on the Fashion MNIST dataset." 
99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "# Hide outputs\n", 108 | "!wget -nc https://github.com/mlc-ai/web-data/raw/main/models/fasionmnist_mlp_assignment_params.pkl" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "We can see that its accuracy is about 84%." 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "# Load the weight map from file.\n", 125 | "# The prediction accuracy of the weight map on test data is around 83.3%.\n", 126 | "weight_map = pkl.load(open(\"fasionmnist_mlp_assignment_params.pkl\", \"rb\"))\n", 127 | "class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',\n", 128 | " 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']\n", 129 | "\n", 130 | "\n", 131 | "def test(model, test_loader):\n", 132 | " model.eval()\n", 133 | " test_loss = 0\n", 134 | " correct = 0\n", 135 | " with torch.no_grad():\n", 136 | " print_img = True\n", 137 | " for data, label in test_loader:\n", 138 | " data, label = data.cpu(), label.cpu()\n", 139 | " output = model(data)\n", 140 | " # sum up batch loss\n", 141 | " test_loss += F.nll_loss(output, label, reduction=\"sum\").item()\n", 142 | " # get the index of the max log-probability\n", 143 | " pred = output.argmax(dim=1, keepdim=True)\n", 144 | " if print_img:\n", 145 | " imshow(data[0])\n", 146 | " print(\"predict: {}, label: {}\".format(class_names[pred[0][0]], class_names[label[0]]))\n", 147 | " print_img = False\n", 148 | " correct += pred.eq(label.view_as(pred)).sum().item()\n", 149 | "\n", 150 | " test_loss /= len(test_loader.dataset)\n", 151 | "\n", 152 | " print(\"\\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\\n\".format(\n", 153 | " test_loss, correct, len(test_loader.dataset),\n", 154 | " 100. * correct / len(test_loader.dataset)))\n", 155 | "\n", 156 | "\n", 157 | "def imshow(img):\n", 158 | " img = img / 2 + 0.5\n", 159 | " npimg = img.numpy()\n", 160 | " plt.imshow(np.transpose(npimg, (1, 2, 0)))\n", 161 | " plt.show()\n", 162 | "\n", 163 | "\n", 164 | "test_data = torchvision.datasets.FashionMNIST(\n", 165 | " \"./data\",\n", 166 | " download=True,\n", 167 | " train=False,\n", 168 | " transform=transforms.Compose([transforms.ToTensor()])\n", 169 | ")\n", 170 | "test_loader = torch.utils.data.DataLoader(\n", 171 | " test_data, batch_size=batch_size, shuffle=False)\n", 172 | "test(pytorch_model(), test_loader)\n" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "## Section 2. Ingest Model From Pytorch\n", 180 | "\n", 181 | "To see the MLC abstraction of the end-to-end model, we need to ingest it from PyTorch and transform into TVMScript implementation. However, it is hard to manually do this. As you may have experienced in Exercise 1, writing a primitive tensor function for each model layer requires massive engineering efforts. Moreover, the manual writing process is error-prone - just imagine when you write dozens of lines of code while there exists some tiny bug in your implementation, finding the bug in could be annoying.\n", 182 | "\n", 183 | "Fortunately, in TVM there is a much simpler way of doing this. TVM provides a utility `relax.BlockBuilder` that can construct end-to-end models step by step in an IRModule that starts empty. 
(Recall that in Lecture 4 we introduced the dataflow block design of Relax, our MLC abstraction on computational graph level. And here the \"block\" in \"`BlockBuilder`\" stands for the dataflow blocks in Relax functions.)\n", 184 | "\n", 185 | "Specifically, in `BlockBuilder` we have an `emit_te` API, that helps convert a Tensor Expression operator description, which was introduced in Lecture 3, into a `call_tir` operation to the operator's corresponding TensorIR function (`call_tir` was introduced in Lecture 4 as well.) Compared with manually writing TensorIR functions, writing their Tensor Expression description can be done within only a few lines of code, which reduces the amount of efforts and is less likely for us to make mistakes.\n", 186 | "\n", 187 | "The signature of `emit_te` is `emit_te(func, *input)`, where `func` is a function that returns a Tensor Expression operator description, and `*input` is the inputs to `func`.\n", 188 | "\n", 189 | "Let's start with an introducing example. In the code block below, `relu` is a function that returns a Tensor Expression description of a ReLU operator. To construct a Relax function that executes a single ReLU operator, in function `emit_te_example` we first define a BlockBuilder instance `bb`. We also define a 2-dimensional 128x128 tensor variable `x`, which will serve as the input tensor of the ReLU operation (as well as the input of the Relax function).\n", 190 | "\n", 191 | "After that, we construct a Relax function `main` with `x` as input, using the `with bb.function(name, [*input])` API. Then we construct a dataflow block. Inside the dataflow block, we first have a `call_tir` to a TensorIR implementation of ReLU operator, through `emit_te`. The `emit_te` below generates a TensorIR function called \"`relu`\" in the IRModule, and add a `call_tir(relu, (x,), (128, 128), dtype=\"float32\")` operation in the dataflow block. And the `call_tir` is followed by a function return.\n", 192 | "\n", 193 | "After this construction, the BlockBuilder `bb` contains the constructed IRModule, which can be got by `bb.get()`.\n" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "def relu(A):\n", 203 | " B = te.compute(shape=(128, 128), fcompute=lambda i, j: te.max(A[i, j], 0), name=\"B\")\n", 204 | " return B\n", 205 | "\n", 206 | "\n", 207 | "def emit_te_example():\n", 208 | " bb = relax.BlockBuilder()\n", 209 | " x = relax.Var(\"x\", (128, 128), relax.DynTensorType(2, \"float32\"))\n", 210 | " with bb.function(\"main\", [x]):\n", 211 | " with bb.dataflow():\n", 212 | " lv0 = bb.emit_te(relu, x)\n", 213 | " gv = bb.emit_output(lv0)\n", 214 | " bb.emit_func_output(gv)\n", 215 | " return bb.get()\n" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": [ 222 | "The function `emit_te_example` returns the constructed IRModule as output. To see what the BlockBuilder constructs, we print the IRModule." 
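, "\n",
 "Before running it, it helps to know roughly what to expect. The sketch below is only a rough guide to the printed module (the exact type annotations and attributes depend on the TVM build you have installed):\n",
 "\n",
 "```python\n",
 "@tvm.script.ir_module\n",
 "class Module:\n",
 "    @T.prim_func\n",
 "    def relu(A: T.Buffer[(128, 128), \"float32\"], B: T.Buffer[(128, 128), \"float32\"]) -> None:\n",
 "        T.func_attr({\"global_symbol\": \"relu\", \"tir.noalias\": True})\n",
 "        for i, j in T.grid(128, 128):\n",
 "            with T.block(\"B\"):\n",
 "                vi, vj = T.axis.remap(\"SS\", [i, j])\n",
 "                B[vi, vj] = T.max(A[vi, vj], T.float32(0))\n",
 "\n",
 "    @R.function\n",
 "    def main(x: Tensor((128, 128), \"float32\")):\n",
 "        with R.dataflow():\n",
 "            lv0 = R.call_tir(relu, (x,), (128, 128), dtype=\"float32\")\n",
 "            gv = lv0\n",
 "            R.output(gv)\n",
 "        return gv\n",
 "```"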
223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "import IPython\n", 232 | "\n", 233 | "mod = emit_te_example()\n", 234 | "IPython.display.Code(mod.script(), language=\"python\")\n" 235 | ] 236 | }, 237 | { 238 | "cell_type": "markdown", 239 | "metadata": {}, 240 | "source": [ 241 | "As you can see, the IRModule generated by the BlockBuilder does contain a TensorIR implementation of ReLU, and a Relax function which calls into the ReLU implementation via `call_tir`.\n", 242 | "\n", 243 | "Now it is your turn to use BlockBuilder and `emit_te` to create an IRModule equivalent to the PyTorch model defined above. You can write Tensor Expression descriptions for all the operators by yourself. Alternatively, TVM provides TOPI (short for \"TVM Operator Inventory\") library which wraps Tensor Expression descriptions for various operators. It is also encouraged if you can read the [documents](https://tvm.apache.org/docs/reference/api/python/topi.html) and find out a way to use them. The test function has been provided for you to check the correctness of your IRModule easily.\n", 244 | "\n", 245 | "Note that each Conv2d layer or linear layer in the model contains a bias add, which should be reflected in the IRModule you construct." 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "def create_model_via_emit_te():\n", 255 | " bb = relax.BlockBuilder()\n", 256 | " x = relax.Var(\"x\", input_shape, relax.DynTensorType(batch_size, \"float32\"))\n", 257 | "\n", 258 | " conv2d_weight = relax.const(weight_map[\"conv2d_weight\"], \"float32\")\n", 259 | " conv2d_bias = relax.const(weight_map[\"conv2d_bias\"].reshape(1, 32, 1, 1), \"float32\")\n", 260 | " linear0_weight = relax.const(weight_map[\"linear0_weight\"], \"float32\")\n", 261 | " linear0_bias = relax.const(weight_map[\"linear0_bias\"].reshape(1, 100), \"float32\")\n", 262 | " linear1_weight = relax.const(weight_map[\"linear1_weight\"], \"float32\")\n", 263 | " linear1_bias = relax.const(weight_map[\"linear1_bias\"].reshape(1, 10), \"float32\")\n", 264 | "\n", 265 | " with bb.function(\"main\", [x]):\n", 266 | " with bb.dataflow():\n", 267 | " # TODO\n", 268 | " ...\n", 269 | " bb.emit_func_output(gv)\n", 270 | "\n", 271 | " return bb.get()\n", 272 | "\n", 273 | "\n", 274 | "def build_mod(mod):\n", 275 | " exec = relax.vm.build(mod, \"llvm\")\n", 276 | " dev = tvm.cpu()\n", 277 | " vm = relax.VirtualMachine(exec, dev)\n", 278 | " return vm\n", 279 | "\n", 280 | "\n", 281 | "def check_equivalence(mod, torch_model, test_loader):\n", 282 | " torch_model.eval()\n", 283 | " with torch.no_grad():\n", 284 | " rt_mod = build_mod(mod)\n", 285 | " for data, label in test_loader:\n", 286 | " data, label = data.cpu(), label.cpu()\n", 287 | " output_from_pytorch = torch_model(data).numpy()\n", 288 | " output_from_relax = rt_mod[\"main\"](tvm.nd.array(data, tvm.cpu())).numpy()\n", 289 | " tvm.testing.assert_allclose(output_from_pytorch, output_from_relax, rtol=1e-4)\n", 290 | "\n", 291 | "\n", 292 | "test_data = torchvision.datasets.FashionMNIST(\n", 293 | " \"./data\",\n", 294 | " download=True,\n", 295 | " train=False,\n", 296 | " transform=transforms.Compose([transforms.ToTensor()])\n", 297 | ")\n", 298 | "test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=False)\n", 299 | "\n", 300 | "mod = create_model_via_emit_te()\n", 301 | 
"torch_model = pytorch_model()\n", 302 | "\n", 303 | "check_equivalence(mod, torch_model, test_loader)\n", 304 | "IPython.display.Code(mod.script(), language=\"python\")\n" 305 | ] 306 | }, 307 | { 308 | "cell_type": "markdown", 309 | "metadata": {}, 310 | "source": [ 311 | "## Section 3. Use of Vendor Library\n", 312 | "\n", 313 | "As we have talked about in Lecture 4, we can integrate torch functions into an IRModule. The steps include registering an external runtime function and calling it inside the IRModule using `call_tir`.\n", 314 | "\n", 315 | "Here is an example of using torch matmul and torch add to implement a linear layer. You can also find this example in the Lecture 4 notes.\n", 316 | "\n", 317 | "```python\n", 318 | "@tvm.register_func(\"env.linear\", override=True)\n", 319 | "def torch_linear(x: tvm.nd.NDArray,\n", 320 | " w: tvm.nd.NDArray,\n", 321 | " b: tvm.nd.NDArray,\n", 322 | " out: tvm.nd.NDArray):\n", 323 | " x_torch = torch.from_dlpack(x)\n", 324 | " w_torch = torch.from_dlpack(w)\n", 325 | " b_torch = torch.from_dlpack(b)\n", 326 | " out_torch = torch.from_dlpack(out)\n", 327 | " torch.mm(x_torch, w_torch.T, out=out_torch)\n", 328 | " torch.add(out_torch, b_torch, out=out_torch)\n", 329 | "\n", 330 | "\n", 331 | "@tvm.script.ir_module\n", 332 | "class MyModuleWithExternCall:\n", 333 | " @R.function\n", 334 | " def main(x: Tensor((1, 784), \"float32\"),\n", 335 | " w0: Tensor((128, 784), \"float32\"),\n", 336 | " b0: Tensor((128,), \"float32\")):\n", 337 | " # block 0\n", 338 | " with R.dataflow():\n", 339 | " lv0 = R.call_tir(\"env.linear\", (x, w0, b0), (1, 128), dtype=\"float32\")\n", 340 | " ...\n", 341 | " return ...\n", 342 | "```" 343 | ] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "metadata": {}, 348 | "source": [ 349 | "Please register external functions for the convolution layer occurring in the IRModule you create in Section 2. You need to use NumPy or PyTorch as the function's implementation.\n", 350 | "\n", 351 | "You may use `BlockBuilder.emit` to directly add a `call_tir` operation to the end of the Relax function being constructed." 
352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [ 360 | "\n", 361 | "\n", 362 | "def create_model_with_torch_func():\n", 363 | " bb = relax.BlockBuilder()\n", 364 | "\n", 365 | " x = relax.Var(\"x\", input_shape, relax.DynTensorType(4, \"float32\"))\n", 366 | "\n", 367 | " conv2d_weight = relax.const(weight_map[\"conv2d_weight\"], \"float32\")\n", 368 | " conv2d_bias = relax.const(weight_map[\"conv2d_bias\"].reshape(1, 32, 1, 1), \"float32\")\n", 369 | " linear0_weight = relax.const(weight_map[\"linear0_weight\"], \"float32\")\n", 370 | " linear0_bias = relax.const(weight_map[\"linear0_bias\"].reshape(1, 100), \"float32\")\n", 371 | " linear1_weight = relax.const(weight_map[\"linear1_weight\"], \"float32\")\n", 372 | " linear1_bias = relax.const(weight_map[\"linear1_bias\"].reshape(1, 10), \"float32\")\n", 373 | "\n", 374 | " with bb.function(\"main\", [x]):\n", 375 | " with bb.dataflow():\n", 376 | " # TODO:\n", 377 | " ...\n", 378 | " bb.emit_func_output(gv)\n", 379 | "\n", 380 | " return bb.get()\n", 381 | "\n", 382 | "\n", 383 | "test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=False)\n", 384 | "mod = create_model_with_torch_func()\n", 385 | "check_equivalence(mod, torch_model, test_loader)\n" 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "metadata": {}, 391 | "source": [ 392 | "## Section 4. Transformation in End-to–End Models\n", 393 | "\n", 394 | "In Exercise 1, we learned how to transform a single TensorIR Function. It is similar to do that in an end-to-end model.\n", 395 | "\n", 396 | "Compared with the batch matmul program, let's focus on a more challenging one: conv2d.\n", 397 | "\n", 398 | "To begin with, let's introduce some new primitives: \n", 399 | " - `compute_inline`: It inlines a block into another to reduce memory usage and memory access.\n", 400 | " - `fuse`: The opposite for `split`. Fuse multiple axes. Here `fuse` is used together with `parallel` / `vectorize` / `unroll` to further increase parallelism." 
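, "\n",
 "The two cells below demonstrate each primitive on a small example. When combined with the loop primitives from the earlier exercises, a common pattern is to fuse first and then bind the fused loop. A small sketch (assuming `sch` is a schedule and `i`, `j` are the loops of some block) might look like:\n",
 "\n",
 "```python\n",
 "fused = sch.fuse(i, j)   # fuse the two loops into a single loop\n",
 "sch.parallel(fused)      # or sch.vectorize(fused) / sch.unroll(fused)\n",
 "```"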
401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": null, 406 | "metadata": {}, 407 | "outputs": [], 408 | "source": [ 409 | "@T.prim_func\n", 410 | "def before_inline(a: T.handle, c: T.handle) -> None:\n", 411 | " A = T.match_buffer(a, (128, 128))\n", 412 | " B = T.alloc_buffer((128, 128))\n", 413 | " C = T.match_buffer(c, (128, 128))\n", 414 | " for i, j in T.grid(128, 128):\n", 415 | " with T.block(\"B\"):\n", 416 | " vi, vj = T.axis.remap(\"SS\", [i, j])\n", 417 | " B[vi, vj] = A[vi, vj] * 2.0\n", 418 | " for i, j in T.grid(128, 128):\n", 419 | " with T.block(\"C\"):\n", 420 | " vi, vj = T.axis.remap(\"SS\", [i, j])\n", 421 | " C[vi, vj] = B[vi, vj] + 1.0\n", 422 | "\n", 423 | "\n", 424 | "sch = tvm.tir.Schedule(before_inline)\n", 425 | "sch.compute_inline(sch.get_block(\"B\"))\n", 426 | "IPython.display.Code(sch.mod[\"main\"].script(), language=\"python\")\n" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": null, 432 | "metadata": {}, 433 | "outputs": [], 434 | "source": [ 435 | "@T.prim_func\n", 436 | "def before_fuse(a: T.handle, b: T.handle) -> None:\n", 437 | " A = T.match_buffer(a, (128, 128))\n", 438 | " B = T.match_buffer(b, (128, 128))\n", 439 | " for i, j in T.grid(128, 128):\n", 440 | " with T.block(\"B\"):\n", 441 | " vi, vj = T.axis.remap(\"SS\", [i, j])\n", 442 | " B[vi, vj] = A[vi, vj] * 2.0\n", 443 | "\n", 444 | "\n", 445 | "sch = tvm.tir.Schedule(before_fuse)\n", 446 | "i, j = sch.get_loops(sch.get_block(\"B\"))\n", 447 | "sch.fuse(i, j)\n", 448 | "IPython.display.Code(sch.mod[\"main\"].script(), language=\"python\")\n" 449 | ] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "metadata": {}, 454 | "source": [ 455 | "Now we first create a schedule for the IRModule, and then transform the conv2d TensorIR function inside. Similar to Exercise 1, we provide you with a target function. But please note that, the target function does NOT serve as a \"standard transformation answer\" for several reasons:\n", 456 | " - it may not have the best performance on every hardware,\n", 457 | " - the original conv2d TensorIR implementation may vary, according to the Tensor Expression description you used in Section 2:\n", 458 | " - if you described the conv2d computation along with the bias computation in Tensor Expression, then there should be a block which calculates the bias at the end of target TensorIR function,\n", 459 | " - if you described conv2d and bias computation separately, or you used the conv2d provided by TOPI, then the target function should not have the bias block at the end. 
The original function of the target is generated by using TOPI conv2d.\n", 460 | "\n", 461 | "\n", 462 | "```python\n", 463 | "@T.prim_func\n", 464 | "def target_func(rxplaceholder: T.Buffer[(4, 1, 28, 28), \"float32\"], rxplaceholder_1: T.Buffer[(32, 1, 3, 3), \"float32\"], conv2d_nchw: T.Buffer[(4, 32, 26, 26), \"float32\"]) -> None:\n", 465 | " T.func_attr({\"global_symbol\": \"conv2d\", \"tir.noalias\": True})\n", 466 | " # body\n", 467 | " # with T.block(\"root\")\n", 468 | " for i0_0_i1_0_i2_0_i3_0_fused in T.parallel(2704):\n", 469 | " for i0_1_i1_1_fused_init in T.unroll(8):\n", 470 | " for i2_1_i3_1_fused_init in T.vectorized(4):\n", 471 | " with T.block(\"conv2d_nchw_init\"):\n", 472 | " nn = T.axis.spatial(\n", 473 | " 4, i0_0_i1_0_i2_0_i3_0_fused // 1352 * 2 + i0_1_i1_1_fused_init // 4)\n", 474 | " ff = T.axis.spatial(\n", 475 | " 32, i0_0_i1_0_i2_0_i3_0_fused % 1352 // 169 * 4 + i0_1_i1_1_fused_init % 4)\n", 476 | " yy = T.axis.spatial(\n", 477 | " 26, i0_0_i1_0_i2_0_i3_0_fused % 169 // 13 * 2 + i2_1_i3_1_fused_init // 2)\n", 478 | " xx = T.axis.spatial(\n", 479 | " 26, i0_0_i1_0_i2_0_i3_0_fused % 13 * 2 + i2_1_i3_1_fused_init % 2)\n", 480 | " T.reads()\n", 481 | " T.writes(conv2d_nchw[nn, ff, yy, xx])\n", 482 | " conv2d_nchw[nn, ff, yy, xx] = T.float32(0)\n", 483 | " for i4, i5, i6 in T.grid(1, 3, 3):\n", 484 | " for i0_1_i1_1_fused in T.unroll(8):\n", 485 | " for i2_1_i3_1_fused in T.vectorized(4):\n", 486 | " with T.block(\"conv2d_nchw_update\"):\n", 487 | " nn = T.axis.spatial(\n", 488 | " 4, i0_0_i1_0_i2_0_i3_0_fused // 1352 * 2 + i0_1_i1_1_fused // 4)\n", 489 | " ff = T.axis.spatial(\n", 490 | " 32, i0_0_i1_0_i2_0_i3_0_fused % 1352 // 169 * 4 + i0_1_i1_1_fused % 4)\n", 491 | " yy = T.axis.spatial(\n", 492 | " 26, i0_0_i1_0_i2_0_i3_0_fused % 169 // 13 * 2 + i2_1_i3_1_fused // 2)\n", 493 | " xx = T.axis.spatial(\n", 494 | " 26, i0_0_i1_0_i2_0_i3_0_fused % 13 * 2 + i2_1_i3_1_fused % 2)\n", 495 | " rc, ry, rx = T.axis.remap(\"RRR\", [i4, i5, i6])\n", 496 | " T.reads(conv2d_nchw[nn, ff, yy, xx], rxplaceholder[nn,\n", 497 | " rc, yy + ry, xx + rx], rxplaceholder_1[ff, rc, ry, rx])\n", 498 | " T.writes(conv2d_nchw[nn, ff, yy, xx])\n", 499 | " conv2d_nchw[nn, ff, yy, xx] = conv2d_nchw[nn, ff, yy, xx] + \\\n", 500 | " rxplaceholder[nn, rc, yy + ry, xx +\n", 501 | " rx] * rxplaceholder_1[ff, rc, ry, rx]\n", 502 | "```" 503 | ] 504 | }, 505 | { 506 | "cell_type": "markdown", 507 | "metadata": {}, 508 | "source": [ 509 | "Unlike Exercise 1, this time the schedule is created for an IRModule, instead of a TensorIR function. Therefore, when using `sch.get_block`, a concrete function name should be provided, as shown below." 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": null, 515 | "metadata": {}, 516 | "outputs": [], 517 | "source": [ 518 | "mod = create_model_via_emit_te()\n", 519 | "sch = tvm.tir.Schedule(mod)\n", 520 | "\n", 521 | "# Step 1. Get blocks\n", 522 | "# block = sch.get_block(name=\"your_block_name\", func_name=\"your_function_name\")\n", 523 | "\n", 524 | "# Step 2. Inline the padding block (if exists)\n", 525 | "\n", 526 | "# Step 3. Get loops\n", 527 | "\n", 528 | "# Step 4. Organize the loops\n", 529 | "\n", 530 | "# Step 5. decompose reduction\n", 531 | "\n", 532 | "# Step 6. 
fuse + vectorize / fuse + parallel / fuse + unroll\n", 533 | "\n", 534 | "IPython.display.Code(sch.mod.script(), language=\"python\")\n" 535 | ] 536 | }, 537 | { 538 | "cell_type": "markdown", 539 | "metadata": {}, 540 | "source": [ 541 | "Again, we can test the correctness of the transformed IRModule." 542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": null, 547 | "metadata": {}, 548 | "outputs": [], 549 | "source": [ 550 | "test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=False)\n", 551 | "check_equivalence(sch.mod, torch_model, test_loader)\n" 552 | ] 553 | } 554 | ], 555 | "metadata": { 556 | "kernelspec": { 557 | "display_name": "Python 3.8.10 64-bit", 558 | "language": "python", 559 | "name": "python3" 560 | }, 561 | "language_info": { 562 | "codemirror_mode": { 563 | "name": "ipython", 564 | "version": 3 565 | }, 566 | "file_extension": ".py", 567 | "mimetype": "text/x-python", 568 | "name": "python", 569 | "nbconvert_exporter": "python", 570 | "pygments_lexer": "ipython3", 571 | "version": "3.8.10" 572 | }, 573 | "orig_nbformat": 4, 574 | "vscode": { 575 | "interpreter": { 576 | "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1" 577 | } 578 | } 579 | }, 580 | "nbformat": 4, 581 | "nbformat_minor": 2 582 | } 583 | -------------------------------------------------------------------------------- /assignment/assignment1_zh.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# MLC 作业 1: 端到端模型执行" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## 第一部分: 模型准备\n", 15 | "\n", 16 | "本作业的目标是让你对机器学习编译过程中的端到端模型的执行和变换更加熟悉。让我们从一个简单的图像分类模型开始。\n", 17 | "\n", 18 | "我们首先使用如下的命令来安装必要的库。" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "!python3 -m pip install mlc-ai-nightly -f https://mlc.ai/wheels\n", 28 | "!python3 -m pip install torch torchvision torchaudio torchsummary --extra-index-url https://download.pytorch.org/whl/cpu" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": null, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "import numpy as np\n", 38 | "import pickle as pkl\n", 39 | "import torch\n", 40 | "import torch.nn.functional as F\n", 41 | "import torchvision\n", 42 | "import tvm\n", 43 | "import tvm.testing\n", 44 | "\n", 45 | "from matplotlib import pyplot as plt\n", 46 | "from torch import nn\n", 47 | "from torchvision import transforms\n", 48 | "from tvm import topi, relax, te\n", 49 | "from tvm.script import tir as T\n" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "以下是用PyTorch定义的模型。该模型接受一批图像为输入,然后对它们依次作用卷积层,激活层,池化层和全连接层,得到分类结果。" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "batch_size = 4\n", 66 | "input_shape = (batch_size, 1, 28, 28) # NCHW layout\n", 67 | "\n", 68 | "\n", 69 | "def pytorch_model():\n", 70 | " list = []\n", 71 | " list.append(nn.Conv2d(in_channels=1, out_channels=32, kernel_size=(3, 3), bias=True))\n", 72 | " list.append(nn.ReLU())\n", 73 | " list.append(nn.MaxPool2d(kernel_size=(2, 2)))\n", 74 | " list.append(nn.Flatten())\n", 75 | " list.append(nn.Linear(in_features=5408, out_features=100, bias=True))\n", 76 | " 
list.append(nn.ReLU())\n", 77 | " list.append(nn.Linear(in_features=100, out_features=10, bias=True))\n", 78 | " list.append(nn.Softmax(dim=1))\n", 79 | "\n", 80 | " model = nn.Sequential(*list).cpu()\n", 81 | " name_map = {\n", 82 | " \"0.weight\": \"conv2d_weight\",\n", 83 | " \"0.bias\": \"conv2d_bias\",\n", 84 | " \"4.weight\": \"linear0_weight\",\n", 85 | " \"4.bias\": \"linear0_bias\",\n", 86 | " \"6.weight\": \"linear1_weight\",\n", 87 | " \"6.bias\": \"linear1_bias\",\n", 88 | " }\n", 89 | " for name, param in model.named_parameters():\n", 90 | " param.data = torch.from_numpy(weight_map[name_map[name]]).cpu()\n", 91 | " return model\n" 92 | ] 93 | }, 94 | { 95 | "cell_type": "markdown", 96 | "metadata": {}, 97 | "source": [ 98 | "我们提供了一个在Fashion MNIST数据集上的预训练权重图。" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "# Hide outputs\n", 108 | "!wget -nc https://github.com/mlc-ai/web-data/raw/main/models/fasionmnist_mlp_assignment_params.pkl" 109 | ] 110 | }, 111 | { 112 | "cell_type": "markdown", 113 | "metadata": {}, 114 | "source": [ 115 | "我们可以看到它的准确率约为84%。" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": {}, 122 | "outputs": [], 123 | "source": [ 124 | "# Load the weight map from file.\n", 125 | "# The prediction accuracy of the weight map on test data is around 83.3%.\n", 126 | "weight_map = pkl.load(open(\"fasionmnist_mlp_assignment_params.pkl\", \"rb\"))\n", 127 | "class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',\n", 128 | " 'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']\n", 129 | "\n", 130 | "\n", 131 | "def test(model, test_loader):\n", 132 | " model.eval()\n", 133 | " test_loss = 0\n", 134 | " correct = 0\n", 135 | " with torch.no_grad():\n", 136 | " print_img = True\n", 137 | " for data, label in test_loader:\n", 138 | " data, label = data.cpu(), label.cpu()\n", 139 | " output = model(data)\n", 140 | " # sum up batch loss\n", 141 | " test_loss += F.nll_loss(output, label, reduction=\"sum\").item()\n", 142 | " # get the index of the max log-probability\n", 143 | " pred = output.argmax(dim=1, keepdim=True)\n", 144 | " if print_img:\n", 145 | " imshow(data[0])\n", 146 | " print(\"predict: {}, label: {}\".format(class_names[pred[0][0]], class_names[label[0]]))\n", 147 | " print_img = False\n", 148 | " correct += pred.eq(label.view_as(pred)).sum().item()\n", 149 | "\n", 150 | " test_loss /= len(test_loader.dataset)\n", 151 | "\n", 152 | " print(\"\\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\\n\".format(\n", 153 | " test_loss, correct, len(test_loader.dataset),\n", 154 | " 100. 
* correct / len(test_loader.dataset)))\n", 155 | "\n", 156 | "\n", 157 | "def imshow(img):\n", 158 | " img = img / 2 + 0.5\n", 159 | " npimg = img.numpy()\n", 160 | " plt.imshow(np.transpose(npimg, (1, 2, 0)))\n", 161 | " plt.show()\n", 162 | "\n", 163 | "\n", 164 | "test_data = torchvision.datasets.FashionMNIST(\n", 165 | " \"./data\",\n", 166 | " download=True,\n", 167 | " train=False,\n", 168 | " transform=transforms.Compose([transforms.ToTensor()])\n", 169 | ")\n", 170 | "test_loader = torch.utils.data.DataLoader(\n", 171 | " test_data, batch_size=batch_size, shuffle=False)\n", 172 | "test(pytorch_model(), test_loader)\n" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "## 第二部分: 从PyTorch迁移模型\n", 180 | "为了展示机器学习编译对端到端模型的抽象,我们需要将模型从PyTorch迁移并转换为TVMScript实现。然后,手工迁移很难。正如你在TensorIR练习中所体验的那样,为模型中的每一层写一个元张量函数需要大量的人力来完成。另外,手工写这些函数是容易犯错的。你可以想象,当你写了几百行,但其中有零星几个bug,那么找到bug的过程将会是痛苦的。\n", 181 | "\n", 182 | "幸运的是,在TVM中有一个简单的多的方法能够迁移模型。TVM提供了一个类`relax.BlockBuilder`,它能够从空白的IRModule开始一步步的构建端到端模型。(回忆我们在第四节课中介绍的Relax的Dataflow Block,这里的\"block\"就是代表了Relax函数中的Dataflow Block)\n", 183 | "\n", 184 | "具体而言,在 `BlockBuilder`中我们有一个 `emit_te`的API,它可以将一个张量表达式(第三节课中介绍过)的算子描述转变成一个对应TensorIR函数的`call_tir`操作(`call_tir`在第四节课中介绍过)。与手工写TensorIR函数相比,写张量表达式描述可以用几行代码来完成,这减少了需要的工作量和犯错的概率。\n", 185 | "\n", 186 | "`emit_te`的函数签名是`emit_te(func, *input)`,其中`func`是一个返回张量表达式的函数,而`*input`是`func`的输入。\n", 187 | "\n", 188 | "让我们从一个例子开始详细介绍。在下方的代码块中,`relu`是一个返回ReLU算子的张量表达式描述的函数。为了构建一个执行单个ReLU算子的Relax函数,在`emit_te_example`中我们首先定义了一个`BlockBuilder`实例`bb`。我们也定义了一个2维128x128大小的张量变量`x`,它将作为ReLU操作的输入张量(同时也是Relax函数的输入)。\n", 189 | "\n", 190 | "在这之后,我们用`with bb.function(name, [*input])` API构建一个以`x`为输入的Relax函数 `main`。然后我们构建一个dataflow block。在这个dataflow block里,我们首先用`emit_te`生成一个调用ReLU算子的`call_tir`。这里 `emit_te`在IRModule中生成了一个名字为`relu`的TensorIR函数,然后在dataflow block中生成`call_tir(relu, (x,), (128, 128), dtype=\"float32\")`操作。`call_tir`之后是函数返回。\n", 191 | "\n", 192 | "在这一构造之后,BlockBuilder实例`bb`包含构建完的IRModule,它可以通过`bb.get()`得到。" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "def relu(A):\n", 202 | " B = te.compute(shape=(128, 128), fcompute=lambda i, j: te.max(A[i, j], 0), name=\"B\")\n", 203 | " return B\n", 204 | "\n", 205 | "\n", 206 | "def emit_te_example():\n", 207 | " bb = relax.BlockBuilder()\n", 208 | " x = relax.Var(\"x\", (128, 128), relax.DynTensorType(2, \"float32\"))\n", 209 | " with bb.function(\"main\", [x]):\n", 210 | " with bb.dataflow():\n", 211 | " lv0 = bb.emit_te(relu, x)\n", 212 | " gv = bb.emit_output(lv0)\n", 213 | " bb.emit_func_output(gv)\n", 214 | " return bb.get()\n" 215 | ] 216 | }, 217 | { 218 | "cell_type": "markdown", 219 | "metadata": {}, 220 | "source": [ 221 | "函数`emit_te_example`返回构造得到的IRModule。为了看的更清楚,我们可以输出这一IRModule。" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": null, 227 | "metadata": {}, 228 | "outputs": [], 229 | "source": [ 230 | "import IPython\n", 231 | "\n", 232 | "mod = emit_te_example()\n", 233 | "IPython.display.Code(mod.script(), language=\"python\")\n" 234 | ] 235 | }, 236 | { 237 | "cell_type": "markdown", 238 | "metadata": {}, 239 | "source": [ 240 | "正如你看到的,通过BlockBuilder生成的IRModule确实包含了ReLU的TensorIR实现和一个含有调用ReLU实现的`call_tir`的Relax函数\n", 241 | "\n", 242 | "现在轮到你来用BlockBuilder和`emit_te`来创建一个和之前定义的PyTorch模型等价的IRModule。你可以自己为所有的算子写张量表达式描述。或者,TVM提供了TOPI(TVM Operator 
Inventory)库,它为不同的算子提供了张量表达式描述。如果你愿意阅读[文档](https://tvm.apache.org/docs/reference/api/python/topi.html)来弄懂它的用法,这也是被鼓励的。我们提供了测试函数来检查你的IRModule的正确性。\n", 243 | "\n", 244 | "注意到每个Conv2d层和linear层都包含了一个偏置加法,这应该在你构建的IRModule中被体现。" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "def create_model_via_emit_te():\n", 254 | " bb = relax.BlockBuilder()\n", 255 | " x = relax.Var(\"x\", input_shape, relax.DynTensorType(batch_size, \"float32\"))\n", 256 | "\n", 257 | " conv2d_weight = relax.const(weight_map[\"conv2d_weight\"], \"float32\")\n", 258 | " conv2d_bias = relax.const(weight_map[\"conv2d_bias\"].reshape(1, 32, 1, 1), \"float32\")\n", 259 | " linear0_weight = relax.const(weight_map[\"linear0_weight\"], \"float32\")\n", 260 | " linear0_bias = relax.const(weight_map[\"linear0_bias\"].reshape(1, 100), \"float32\")\n", 261 | " linear1_weight = relax.const(weight_map[\"linear1_weight\"], \"float32\")\n", 262 | " linear1_bias = relax.const(weight_map[\"linear1_bias\"].reshape(1, 10), \"float32\")\n", 263 | "\n", 264 | " with bb.function(\"main\", [x]):\n", 265 | " with bb.dataflow():\n", 266 | " # TODO\n", 267 | " ...\n", 268 | " bb.emit_func_output(gv)\n", 269 | "\n", 270 | " return bb.get()\n", 271 | "\n", 272 | "\n", 273 | "def build_mod(mod):\n", 274 | " exec = relax.vm.build(mod, \"llvm\")\n", 275 | " dev = tvm.cpu()\n", 276 | " vm = relax.VirtualMachine(exec, dev)\n", 277 | " return vm\n", 278 | "\n", 279 | "\n", 280 | "def check_equivalence(mod, torch_model, test_loader):\n", 281 | " torch_model.eval()\n", 282 | " with torch.no_grad():\n", 283 | " rt_mod = build_mod(mod)\n", 284 | " for data, label in test_loader:\n", 285 | " data, label = data.cpu(), label.cpu()\n", 286 | " output_from_pytorch = torch_model(data).numpy()\n", 287 | " output_from_relax = rt_mod[\"main\"](tvm.nd.array(data, tvm.cpu())).numpy()\n", 288 | " tvm.testing.assert_allclose(output_from_pytorch, output_from_relax, rtol=1e-4)\n", 289 | "\n", 290 | "\n", 291 | "test_data = torchvision.datasets.FashionMNIST(\n", 292 | " \"./data\",\n", 293 | " download=True,\n", 294 | " train=False,\n", 295 | " transform=transforms.Compose([transforms.ToTensor()])\n", 296 | ")\n", 297 | "test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=False)\n", 298 | "\n", 299 | "mod = create_model_via_emit_te()\n", 300 | "torch_model = pytorch_model()\n", 301 | "\n", 302 | "check_equivalence(mod, torch_model, test_loader)\n", 303 | "IPython.display.Code(mod.script(), language=\"python\")\n" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "## 第三部分: 使用库\n", 311 | "\n", 312 | "正如我们在第四节课中谈到的,我们可以将torch函数整合进IRModule。步骤包括注册一个外部运行时函数,和在IRModule中用`call_tir`调用。\n", 313 | "\n", 314 | "这里是一个用torch matmul和torch add拉力实现一个linear层的例子。你也可以在第四节课的笔记中找到这个例子。\n", 315 | "\n", 316 | "\n", 317 | "```python\n", 318 | "@tvm.register_func(\"env.linear\", override=True)\n", 319 | "def torch_linear(x: tvm.nd.NDArray,\n", 320 | " w: tvm.nd.NDArray,\n", 321 | " b: tvm.nd.NDArray,\n", 322 | " out: tvm.nd.NDArray):\n", 323 | " x_torch = torch.from_dlpack(x)\n", 324 | " w_torch = torch.from_dlpack(w)\n", 325 | " b_torch = torch.from_dlpack(b)\n", 326 | " out_torch = torch.from_dlpack(out)\n", 327 | " torch.mm(x_torch, w_torch.T, out=out_torch)\n", 328 | " torch.add(out_torch, b_torch, out=out_torch)\n", 329 | "\n", 330 | "\n", 331 | "@tvm.script.ir_module\n", 332 | "class MyModuleWithExternCall:\n", 333 
| " @R.function\n", 334 | " def main(x: Tensor((1, 784), \"float32\"),\n", 335 | " w0: Tensor((128, 784), \"float32\"),\n", 336 | " b0: Tensor((128,), \"float32\")):\n", 337 | " # block 0\n", 338 | " with R.dataflow():\n", 339 | " lv0 = R.call_tir(\"env.linear\", (x, w0, b0), (1, 128), dtype=\"float32\")\n", 340 | " ...\n", 341 | " return ...\n", 342 | "```" 343 | ] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "metadata": {}, 348 | "source": [ 349 | "请为你在第二部分中创建的IRModule中的卷积层注册外部函数。你需要使用NumPy或者PyTorch作为你的函数实现。\n", 350 | "\n", 351 | "你可能需要使用`BlockBuilder.emit`在正在构建的Relax函数的结尾直接添加一个`call_tir`操作。" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [ 360 | "\n", 361 | "\n", 362 | "def create_model_with_torch_func():\n", 363 | " bb = relax.BlockBuilder()\n", 364 | "\n", 365 | " x = relax.Var(\"x\", input_shape, relax.DynTensorType(4, \"float32\"))\n", 366 | "\n", 367 | " conv2d_weight = relax.const(weight_map[\"conv2d_weight\"], \"float32\")\n", 368 | " conv2d_bias = relax.const(weight_map[\"conv2d_bias\"].reshape(1, 32, 1, 1), \"float32\")\n", 369 | " linear0_weight = relax.const(weight_map[\"linear0_weight\"], \"float32\")\n", 370 | " linear0_bias = relax.const(weight_map[\"linear0_bias\"].reshape(1, 100), \"float32\")\n", 371 | " linear1_weight = relax.const(weight_map[\"linear1_weight\"], \"float32\")\n", 372 | " linear1_bias = relax.const(weight_map[\"linear1_bias\"].reshape(1, 10), \"float32\")\n", 373 | "\n", 374 | " with bb.function(\"main\", [x]):\n", 375 | " with bb.dataflow():\n", 376 | " # TODO:\n", 377 | " ...\n", 378 | " bb.emit_func_output(gv)\n", 379 | "\n", 380 | " return bb.get()\n", 381 | "\n", 382 | "\n", 383 | "test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=False)\n", 384 | "mod = create_model_with_torch_func()\n", 385 | "check_equivalence(mod, torch_model, test_loader)\n" 386 | ] 387 | }, 388 | { 389 | "cell_type": "markdown", 390 | "metadata": {}, 391 | "source": [ 392 | "## 第四部分: 端到端模型中的程序变换\n", 393 | "\n", 394 | "在TensorIR练习中, 我们学会了如何变换单个TensorIR函数。在端到端模型中变换是类似的。\n", 395 | "\n", 396 | "和批量矩阵乘法相比,让我们关注一个更加有挑战性的算子:conv2d(二维卷积)。\n", 397 | "\n", 398 | "首先,让我们介绍一些新的原语:\n", 399 | " - `compute_inline`:它将一个block内联到另一个block中,以减少内存使用大小和内存访问次数\n", 400 | " - `fuse`:和`split`相对。融合多个轴。这里`fuse`与`parallel` / `vectorize` / `unroll`一起使用,以增加并行度。" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": null, 406 | "metadata": {}, 407 | "outputs": [], 408 | "source": [ 409 | "@T.prim_func\n", 410 | "def before_inline(a: T.handle, c: T.handle) -> None:\n", 411 | " A = T.match_buffer(a, (128, 128))\n", 412 | " B = T.alloc_buffer((128, 128))\n", 413 | " C = T.match_buffer(c, (128, 128))\n", 414 | " for i, j in T.grid(128, 128):\n", 415 | " with T.block(\"B\"):\n", 416 | " vi, vj = T.axis.remap(\"SS\", [i, j])\n", 417 | " B[vi, vj] = A[vi, vj] * 2.0\n", 418 | " for i, j in T.grid(128, 128):\n", 419 | " with T.block(\"C\"):\n", 420 | " vi, vj = T.axis.remap(\"SS\", [i, j])\n", 421 | " C[vi, vj] = B[vi, vj] + 1.0\n", 422 | "\n", 423 | "\n", 424 | "sch = tvm.tir.Schedule(before_inline)\n", 425 | "sch.compute_inline(sch.get_block(\"B\"))\n", 426 | "IPython.display.Code(sch.mod[\"main\"].script(), language=\"python\")\n" 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": null, 432 | "metadata": {}, 433 | "outputs": [], 434 | "source": [ 435 | "@T.prim_func\n", 436 | "def before_fuse(a: T.handle, b: T.handle) -> 
None:\n", 437 | " A = T.match_buffer(a, (128, 128))\n", 438 | " B = T.match_buffer(b, (128, 128))\n", 439 | " for i, j in T.grid(128, 128):\n", 440 | " with T.block(\"B\"):\n", 441 | " vi, vj = T.axis.remap(\"SS\", [i, j])\n", 442 | " B[vi, vj] = A[vi, vj] * 2.0\n", 443 | "\n", 444 | "\n", 445 | "sch = tvm.tir.Schedule(before_fuse)\n", 446 | "i, j = sch.get_loops(sch.get_block(\"B\"))\n", 447 | "sch.fuse(i, j)\n", 448 | "IPython.display.Code(sch.mod[\"main\"].script(), language=\"python\")\n" 449 | ] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "metadata": {}, 454 | "source": [ 455 | "现在我们首先为第二部分中得到的IRModule创建一个schedule,然后对其中的conv2d TensorIR函数变换。和TensorIR练习类似,我们提供了一个目标函数。但请注意,目标函数不是标准答案,原因如下:\n", 456 | " - 它可能不能在所有硬件中都取得最佳性能\n", 457 | " - 原始的conv2d TensorIR实现可能不同,这决定与你在第二部分中使用的张量表达式描述:\n", 458 | " - 如果你将conv2d的计算和偏置加法的计算放在了一个张量表达式中,那么在变换完成的TensorIR函数的末尾应该有一个计算偏置加法的block\n", 459 | " - 如果你将上述两个计算分开在不同的张量表达式,或者你使用了TOPI提供的conv2d,那么变换完成的TensorIR函数末尾不应该有计算偏置加法的block。下面给出的目标函数是用TOPI conv2d获得的TensorIR函数做变换后生成的。\n", 460 | "\n", 461 | "```python\n", 462 | "@T.prim_func\n", 463 | "def target_func(rxplaceholder: T.Buffer[(4, 1, 28, 28), \"float32\"], rxplaceholder_1: T.Buffer[(32, 1, 3, 3), \"float32\"], conv2d_nchw: T.Buffer[(4, 32, 26, 26), \"float32\"]) -> None:\n", 464 | " T.func_attr({\"global_symbol\": \"conv2d\", \"tir.noalias\": True})\n", 465 | " # body\n", 466 | " # with T.block(\"root\")\n", 467 | " for i0_0_i1_0_i2_0_i3_0_fused in T.parallel(2704):\n", 468 | " for i0_1_i1_1_fused_init in T.unroll(8):\n", 469 | " for i2_1_i3_1_fused_init in T.vectorized(4):\n", 470 | " with T.block(\"conv2d_nchw_init\"):\n", 471 | " nn = T.axis.spatial(\n", 472 | " 4, i0_0_i1_0_i2_0_i3_0_fused // 1352 * 2 + i0_1_i1_1_fused_init // 4)\n", 473 | " ff = T.axis.spatial(\n", 474 | " 32, i0_0_i1_0_i2_0_i3_0_fused % 1352 // 169 * 4 + i0_1_i1_1_fused_init % 4)\n", 475 | " yy = T.axis.spatial(\n", 476 | " 26, i0_0_i1_0_i2_0_i3_0_fused % 169 // 13 * 2 + i2_1_i3_1_fused_init // 2)\n", 477 | " xx = T.axis.spatial(\n", 478 | " 26, i0_0_i1_0_i2_0_i3_0_fused % 13 * 2 + i2_1_i3_1_fused_init % 2)\n", 479 | " T.reads()\n", 480 | " T.writes(conv2d_nchw[nn, ff, yy, xx])\n", 481 | " conv2d_nchw[nn, ff, yy, xx] = T.float32(0)\n", 482 | " for i4, i5, i6 in T.grid(1, 3, 3):\n", 483 | " for i0_1_i1_1_fused in T.unroll(8):\n", 484 | " for i2_1_i3_1_fused in T.vectorized(4):\n", 485 | " with T.block(\"conv2d_nchw_update\"):\n", 486 | " nn = T.axis.spatial(\n", 487 | " 4, i0_0_i1_0_i2_0_i3_0_fused // 1352 * 2 + i0_1_i1_1_fused // 4)\n", 488 | " ff = T.axis.spatial(\n", 489 | " 32, i0_0_i1_0_i2_0_i3_0_fused % 1352 // 169 * 4 + i0_1_i1_1_fused % 4)\n", 490 | " yy = T.axis.spatial(\n", 491 | " 26, i0_0_i1_0_i2_0_i3_0_fused % 169 // 13 * 2 + i2_1_i3_1_fused // 2)\n", 492 | " xx = T.axis.spatial(\n", 493 | " 26, i0_0_i1_0_i2_0_i3_0_fused % 13 * 2 + i2_1_i3_1_fused % 2)\n", 494 | " rc, ry, rx = T.axis.remap(\"RRR\", [i4, i5, i6])\n", 495 | " T.reads(conv2d_nchw[nn, ff, yy, xx], rxplaceholder[nn,\n", 496 | " rc, yy + ry, xx + rx], rxplaceholder_1[ff, rc, ry, rx])\n", 497 | " T.writes(conv2d_nchw[nn, ff, yy, xx])\n", 498 | " conv2d_nchw[nn, ff, yy, xx] = conv2d_nchw[nn, ff, yy, xx] + \\\n", 499 | " rxplaceholder[nn, rc, yy + ry, xx +\n", 500 | " rx] * rxplaceholder_1[ff, rc, ry, rx]\n", 501 | "```" 502 | ] 503 | }, 504 | { 505 | "cell_type": "markdown", 506 | "metadata": {}, 507 | "source": [ 508 | "和TensorIR练习中不同的是, 这里schedule是为一个IRModule创建的,而不是TensorIR函数. 
因此,当使用`sch.get_block`时,需要提供TensorIR函数名字,如下方所示。" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": null, 514 | "metadata": {}, 515 | "outputs": [], 516 | "source": [ 517 | "mod = create_model_via_emit_te()\n", 518 | "sch = tvm.tir.Schedule(mod)\n", 519 | "\n", 520 | "# Step 1. Get blocks\n", 521 | "# block = sch.get_block(name=\"your_block_name\", func_name=\"your_function_name\")\n", 522 | "\n", 523 | "# Step 2. Inline the padding block (if exists)\n", 524 | "\n", 525 | "# Step 3. Get loops\n", 526 | "\n", 527 | "# Step 4. Organize the loops\n", 528 | "\n", 529 | "# Step 5. decompose reduction\n", 530 | "\n", 531 | "# Step 6. fuse + vectorize / fuse + parallel / fuse + unroll\n", 532 | "\n", 533 | "IPython.display.Code(sch.mod.script(), language=\"python\")\n" 534 | ] 535 | }, 536 | { 537 | "cell_type": "markdown", 538 | "metadata": {}, 539 | "source": [ 540 | "同样,我们可以测试变换后IRModule的正确性。" 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": null, 546 | "metadata": {}, 547 | "outputs": [], 548 | "source": [ 549 | "test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=False)\n", 550 | "check_equivalence(sch.mod, torch_model, test_loader)\n" 551 | ] 552 | } 553 | ], 554 | "metadata": { 555 | "kernelspec": { 556 | "display_name": "Python 3.8.10 64-bit", 557 | "language": "python", 558 | "name": "python3" 559 | }, 560 | "language_info": { 561 | "codemirror_mode": { 562 | "name": "ipython", 563 | "version": 3 564 | }, 565 | "file_extension": ".py", 566 | "mimetype": "text/x-python", 567 | "name": "python", 568 | "nbconvert_exporter": "python", 569 | "pygments_lexer": "ipython3", 570 | "version": "3.8.10" 571 | }, 572 | "orig_nbformat": 4, 573 | "vscode": { 574 | "interpreter": { 575 | "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1" 576 | } 577 | } 578 | }, 579 | "nbformat": 4, 580 | "nbformat_minor": 2 581 | } 582 | -------------------------------------------------------------------------------- /mlc-llm/models/demo_CodeLlama_13b.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "4IwhlCjVtpYj" 7 | }, 8 | "source": [ 9 | "# Demo: CodeLlama-13b with MLC LLM\n", 10 | "\n", 11 | "Recently, Meta unveiled [CodeLlama](https://github.com/facebookresearch/codellama), a family of large language models for code based on Llama 2 providing state-of-the-art performance among open models, infilling capabilities, support for large input contexts, and zero-shot instruction following ability for programming tasks. This notebook demonstrates MLC LLM's support for the CodeLlama family:\n", 12 | "\n", 13 | "- **[CodeLlama](https://huggingface.co/codellama/CodeLlama-13b-hf): a coding foundation LLM**\n", 14 | "- **[CodeLlama-Instruct](https://huggingface.co/codellama/CodeLlama-13b-Instruct-hf): an instruction-tuned LLM for coding**\n", 15 | "- **[CodeLlama-Python](https://huggingface.co/codellama/CodeLlama-13b-Python-hf): a Python specialized LLM**\n", 16 | "\n", 17 | "In this respect, MLC LLM allows everyone to develop, optimize and deploy AI models natively on everyone's devices. Therefore, making possible the deployment of coding LLMs natively, acting as **a personal AI coding assistant**.\n", 18 | "\n", 19 | "In this notebook, we walk over the steps of using MLC LLM to run these pre-compiled CodeLlama models! 
We have uploaded various versions of the pre-compiled and quantized CodeLlama models here: https://huggingface.co/mlc-ai.\n", 20 | "\n", 21 | "Learn more about MLC LLM here: https://mlc.ai/mlc-llm/docs." 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": { 27 | "id": "RMyJ-y5DoB0S" 28 | }, 29 | "source": [ 30 | "Here's an overview regarding each model's capabilities:\n", 31 | "\n", 32 | "| | Code Completion | Infilling | Instruction/chat | Python specialist |\n", 33 | "|-----------------------|-----------------|-----------|------------------|-------------------|\n", 34 | "| CodeLlama-13b | X | X | | |\n", 35 | "| CodeLlama-13b-Python | X | | | X |\n", 36 | "| CodeLlama-13b-Instruct | X | X | X | |" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": { 42 | "id": "YsvAL7SSt9Lo" 43 | }, 44 | "source": [ 45 | "Click the button below to get started!\n", 46 | "\n", 47 | "\n", 48 | " \"Open\n", 49 | "" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": { 55 | "id": "8kkADAMCCLi-" 56 | }, 57 | "source": [ 58 | "## Install MLC LLM" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": { 64 | "id": "Y2EwuS6TCO61" 65 | }, 66 | "source": [ 67 | "We will start from setting up the environment. First, let us create a new Conda environment, in which we will run the rest of the notebook.\n", 68 | "\n", 69 | "```\n", 70 | "conda create --name mlc-llm python=3.10\n", 71 | "conda activate mlc-llm\n", 72 | "```" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": { 78 | "id": "ojEeEmsqCTPG" 79 | }, 80 | "source": [ 81 | "**Google Colab**\n", 82 | "\n", 83 | "- If you are running this in a Google Colab notebook, you would not need to create a conda environment.\n", 84 | "- However, be sure to change your runtime to GPU by going to `Runtime` > `Change runtime type` and setting the Hardware accelerator to be \"GPU\"." 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": { 90 | "id": "S_rX53bGChPn" 91 | }, 92 | "source": [ 93 | "If you are using CUDA, you can run the following command to confirm that CUDA is set up correctly, and check the driver version number as well as what GPUs are currently available for use." 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": { 100 | "id": "CRPeCflbCij6" 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "!nvidia-smi" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": { 110 | "id": "PQfVfTAYC1M-" 111 | }, 112 | "source": [ 113 | "Next, let's download the MLC-AI and mlc-llm nightly build packages. If you are running in a Colab environment, then you can just run the following command. Otherwise, go to https://mlc.ai/package/ and replace the command below with the one that is appropriate for your hardware and OS." 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": { 119 | "id": "vi-udt4tC5c9" 120 | }, 121 | "source": [ 122 | "**Google Colab**: If you are using Colab, you may see the red warnings such as \"You must restart the runtime in order to use newly installed versions.\" For our purpose, we can disregard them, the notebook will still run correctly." 
123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": null, 128 | "metadata": { 129 | "id": "ah9tYaCRCkKS" 130 | }, 131 | "outputs": [], 132 | "source": [ 133 | "!pip install --pre --force-reinstall mlc-ai-nightly-cu118 mlc-llm-nightly-cu118 -f https://mlc.ai/wheels" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": { 139 | "id": "nZGVNJE-DJ9E" 140 | }, 141 | "source": [ 142 | "Let's confirm we have installed the packages successfully!" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": { 149 | "id": "5Y6LszJgC7SQ" 150 | }, 151 | "outputs": [], 152 | "source": [ 153 | "!python -c \"import tvm; print('tvm installed properly!')\"\n", 154 | "!python -c \"import mlc_llm; print('mlc_llm installed properly!')\"" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": { 160 | "id": "lGfnrRa9DMw1" 161 | }, 162 | "source": [ 163 | "## Download Prebuilt Models and Library" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": { 169 | "id": "pVYkLb0eDjMi" 170 | }, 171 | "source": [ 172 | "The following commands will download all the available prebuilt libraries (e.g., `.so` files), including the precompiled CodeLlama models. This may take a while. If in **Google Colab**, you can verify that the files are being downloaded by clicking on the folder icon on the left." 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": { 178 | "id": "Pg7daEvlD5UB" 179 | }, 180 | "source": [ 181 | "Note: If you are NOT running in **Google Colab** you may need to run this line `!conda install git git-lfs` to install `git` and `git-lfs` before running the following cell." 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": null, 187 | "metadata": { 188 | "id": "FDFbw1KPDLu1" 189 | }, 190 | "outputs": [], 191 | "source": [ 192 | "!git lfs install" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": { 199 | "id": "bYqaVjmND7Px" 200 | }, 201 | "outputs": [], 202 | "source": [ 203 | "!mkdir -p dist/prebuilt\n", 204 | "!git clone https://github.com/mlc-ai/binary-mlc-llm-libs.git dist/prebuilt/lib" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": { 210 | "id": "SMEavWCJEC_d" 211 | }, 212 | "source": [ 213 | "#### CodeLlama-13b q4f16_1 prebuilt weights" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": { 220 | "id": "etHEUrfMD8bX" 221 | }, 222 | "outputs": [], 223 | "source": [ 224 | "!cd dist/prebuilt && git clone https://huggingface.co/mlc-ai/mlc-chat-CodeLlama-13b-hf-q4f16_1" 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": { 230 | "id": "FQkIX4TpElR6" 231 | }, 232 | "source": [ 233 | "#### CodeLlama-13b-Instruct q4f16_1 prebuilt weights" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": { 240 | "id": "mTEGXAlhEnOw" 241 | }, 242 | "outputs": [], 243 | "source": [ 244 | "!cd dist/prebuilt && git clone https://huggingface.co/mlc-ai/mlc-chat-CodeLlama-13b-Instruct-hf-q4f16_1" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": { 250 | "id": "acb1HpKpEoca" 251 | }, 252 | "source": [ 253 | "#### CodeLlama-13b-Python q4f16_1 prebuilt weights" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": { 260 | "id": "oysLKcZ4Eou7" 261 | }, 262 | "outputs": [], 263 | "source": [ 264 | "!cd 
dist/prebuilt && git clone https://huggingface.co/mlc-ai/mlc-chat-CodeLlama-13b-Python-hf-q4f16_1" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": null, 270 | "metadata": { 271 | "id": "dbHdyfIXHNpo" 272 | }, 273 | "outputs": [], 274 | "source": [ 275 | "# Restart colab\n", 276 | "exit()" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": { 282 | "id": "CmpxrrqyE0S6" 283 | }, 284 | "source": [ 285 | "## Let's code with CodeLlama!" 286 | ] 287 | }, 288 | { 289 | "cell_type": "markdown", 290 | "metadata": { 291 | "id": "y73vNLy1OfMr" 292 | }, 293 | "source": [ 294 | "Let's first try a simple code completion task with the CodeLlama-Python." 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": null, 300 | "metadata": { 301 | "id": "EOEf8sDyEwuv" 302 | }, 303 | "outputs": [], 304 | "source": [ 305 | "from mlc_llm import ChatModule\n", 306 | "from mlc_llm.callback import StreamToStdout" 307 | ] 308 | }, 309 | { 310 | "cell_type": "code", 311 | "execution_count": null, 312 | "metadata": { 313 | "id": "tBcQMm-KJPN-" 314 | }, 315 | "outputs": [], 316 | "source": [ 317 | "codellama_python = ChatModule(model=\"CodeLlama-13b-Python-hf-q4f16_1\", device=\"cuda\")" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": null, 323 | "metadata": { 324 | "id": "1g2l_hJhLyYm" 325 | }, 326 | "outputs": [], 327 | "source": [ 328 | "prompt = \"\"\"\\\n", 329 | "import argparse\n", 330 | "\n", 331 | "def main(string: str):\n", 332 | " print(string)\n", 333 | " print(string[::-1])\n", 334 | "\n", 335 | "if __name__ == \"__main__\":\"\"\"\n", 336 | "\n", 337 | "output = codellama_python.generate(\n", 338 | " prompt=prompt,\n", 339 | " progress_callback=StreamToStdout(callback_interval=2)\n", 340 | ")" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "metadata": { 347 | "colab": { 348 | "base_uri": "https://localhost:8080/" 349 | }, 350 | "id": "XwU54BtKQKz4", 351 | "outputId": "2ece3f04-970d-4baf-c4c3-a93cc7c80d08" 352 | }, 353 | "outputs": [ 354 | { 355 | "name": "stdout", 356 | "output_type": "stream", 357 | "text": [ 358 | "import argparse\n", 359 | "\n", 360 | "def main(string: str):\n", 361 | " print(string)\n", 362 | " print(string[::-1])\n", 363 | "\n", 364 | "if __name__ == \"__main__\":\n", 365 | " parser = argparse.ArgumentParser(description=\"Checks if the provided string is a palindrome.\"))\n", 366 | " parser.add_argument(\"-s\", \"--string\",\n", 367 | " help=\"The string to check.\"))\n", 368 | "\n", 369 | " args = parser.parse_args()\n", 370 | " main(args.string))\n" 371 | ] 372 | } 373 | ], 374 | "source": [ 375 | "print(prompt+output)" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "metadata": { 382 | "id": "DFrVdqewL7_c" 383 | }, 384 | "outputs": [], 385 | "source": [ 386 | "# Restart colab to initialize a new ChatModule\n", 387 | "exit()" 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "metadata": { 393 | "id": "6CUo34QeQto2" 394 | }, 395 | "source": [ 396 | "The CodeLlama models support infilling based on surrounding content. Let's try it with the foundation CodeLlama." 
397 |       ]
398 |     },
399 |     {
400 |       "cell_type": "code",
401 |       "execution_count": null,
402 |       "metadata": {
403 |         "id": "yeHn8je9SBpK"
404 |       },
405 |       "outputs": [],
406 |       "source": [
407 |         "from mlc_llm import ChatModule\n",
408 |         "from mlc_llm.callback import StreamToStdout\n",
409 |         "\n",
410 |         "def text_infilling(prompt: str):\n",
411 |         "    prefix = prompt.split(\"<FILL>\")[0]\n",
412 |         "    suffix = prompt.split(\"<FILL>\")[1]\n",
413 |         "    return f\"<PRE> {prefix} <SUF> {suffix} <MID>\"\n",
414 |         "\n",
415 |         "def print_infilling(prompt: str, output: str):\n",
416 |         "    print(prompt.replace(\"<FILL>\", output.replace(\"<EOT>\", \"\")))"
417 |       ]
418 |     },
419 |     {
420 |       "cell_type": "code",
421 |       "execution_count": null,
422 |       "metadata": {
423 |         "id": "LUU8zTdRSKNK"
424 |       },
425 |       "outputs": [],
426 |       "source": [
427 |         "codellama = ChatModule(model=\"CodeLlama-13b-hf-q4f16_1\", device=\"cuda\")"
428 |       ]
429 |     },
430 |     {
431 |       "cell_type": "code",
432 |       "execution_count": null,
433 |       "metadata": {
434 |         "id": "d3rrm1qPSOkr"
435 |       },
436 |       "outputs": [],
437 |       "source": [
438 |         "prompt = \"\"\"\\\n",
439 |         "# Installation instructions:\n",
440 |         "    <FILL>\n",
441 |         "This downloads the LLaMA inference code and installs the repository as a local pip package.\n",
442 |         "\"\"\"\n",
443 |         "\n",
444 |         "output = codellama.generate(\n",
445 |         "    prompt=text_infilling(prompt),\n",
446 |         "    progress_callback=StreamToStdout(callback_interval=2)\n",
447 |         ")"
448 |       ]
449 |     },
450 |     {
451 |       "cell_type": "code",
452 |       "execution_count": null,
453 |       "metadata": {
454 |         "colab": {
455 |           "base_uri": "https://localhost:8080/"
456 |         },
457 |         "id": "78SHkqohUbCc",
458 |         "outputId": "c842e0a8-1ddc-4dc2-8d02-2d1c264c3131"
459 |       },
460 |       "outputs": [
461 |         {
462 |           "name": "stdout",
463 |           "output_type": "stream",
464 |           "text": [
465 |             "# Installation instructions:\n",
466 |             "    pip install llamapy\n",
467 |             "\n",
468 |             "# Using the local pip package:\n",
469 |             "\n",
470 |             "    import llamapy\n",
471 |             "    my_model = llamapy.LLaMA(n_components=2))\n",
472 |             "\n",
473 |             "# Requirements:\n",
474 |             "\n",
475 |             "    Python 3.x\n",
476 |             "\n",
477 |             "\n",
478 |             "# Installation (easy way):\n",
479 |             "\n",
480 |             "    pip install git+https://github.com/BBIC-BBC/LLAMA\n",
481 |             "\n",
482 |             "\n",
483 |             "# Installation (advanced way)):\n",
484 |             "\n",
485 |             "\n",
486 |             "    1) Download the repository from Github:\n",
487 |             "\n",
488 |             "\n",
489 |             "        git clone https://github.com/BBIC-BBC/LLAMA\n",
490 |             "\n",
491 |             "\n",
492 |             "    2) Install the repository as a local pip package:\n",
493 |             "\n",
494 |             "\n",
495 |             "        cd LLAMA\n",
496 |             "\n",
497 |             "\n",
498 |             "\n",
499 |             "        python setup.py install\n",
500 |             "\n",
501 |             "\n",
502 |             "\n",
503 |             "\n",
504 |             "# Using the local pip package:\n",
505 |             "\n",
506 |             "\n",
507 |             "    import llamapy\n",
508 |             "    my_model = llamapy.LLaMA(n_components=2))))\n",
509 |             "\n",
510 |             "\n",
511 |             "\n",
512 |             "# Requirements:\n",
513 |             "\n",
514 |             "\n",
515 |             "    Python 3.x\n",
516 |             "\n",
517 |             "\n",
518 |             "\n",
519 |             "# Installation (easy way):\n",
520 |             "\n",
521 |             "\n",
522 |             "    pip install git+https://github.com/BBIC-BBC/LLAMA\n",
523 |             "\n",
524 |             "\n",
525 |             "\n",
526 |             "# Installation (advanced way)):\n",
527 |             "\n",
528 |             "\n",
529 |             "\n",
530 |             "    1) Download the LLaMA inference code from Github:\n",
531 |             "\n",
532 |             "\n",
533 |             "        git clone https://github.com/BBIC-BBC/LLAMA\n",
534 |             "\n",
535 |             "\n",
536 |             "    2) Install the LLaMA inference code as a local pip package:\n",
537 |             "\n",
538 |             "\n",
539 |             "        cd LLaMA\n",
540 |             "\n",
541 |             "\n",
542 |             "\n",
543 |             "        python setup.py install\n",
544 |             "This downloads the LLaMA inference code and installs the repository as a local pip package.\n",
545 |             "\n"
546 |           ]
547 |         }
548 |       ],
549 |       "source": [
550 |         "print_infilling(prompt, output)"
551 |       ]
552 |     },
553 |     {
554 |       "cell_type": "code",
555 |       "execution_count": null,
556 |       "metadata": {
557 |         "id": "mKYyRyUVPGm2"
558 |       },
559 |       "outputs": [],
560 |       "source": [
561 |         "# Restart colab to create a new ChatModule\n",
562 |         "exit()"
563 |       ]
564 |     },
565 |     {
566 |       "cell_type": "markdown",
567 |       "metadata": {
568 |         "id": "CqmYlUtwV01m"
569 |       },
570 |       "source": [
571 |         "Finally, CodeLlama-Instruct has instruction-following ability for programming tasks."
572 |       ]
573 |     },
574 |     {
575 |       "cell_type": "code",
576 |       "execution_count": null,
577 |       "metadata": {
578 |         "id": "IcCPRp9oWBfh"
579 |       },
580 |       "outputs": [],
581 |       "source": [
582 |         "from mlc_llm import ChatModule\n",
583 |         "from mlc_llm.callback import StreamToStdout"
584 |       ]
585 |     },
586 |     {
587 |       "cell_type": "code",
588 |       "execution_count": null,
589 |       "metadata": {
590 |         "id": "I4C8iC9IWyZR"
591 |       },
592 |       "outputs": [],
593 |       "source": [
594 |         "codellama_instruct = ChatModule(model=\"CodeLlama-13b-Instruct-hf-q4f16_1\", device=\"cuda\")"
595 |       ]
596 |     },
597 |     {
598 |       "cell_type": "code",
599 |       "execution_count": null,
600 |       "metadata": {
601 |         "colab": {
602 |           "base_uri": "https://localhost:8080/"
603 |         },
604 |         "id": "1kL02gKWW7Ov",
605 |         "outputId": "65919ac1-2e1a-4545-ac3d-3643a7c4a18f"
606 |       },
607 |       "outputs": [
608 |         {
609 |           "name": "stdout",
610 |           "output_type": "stream",
611 |           "text": [
612 |             "Here is a possible implementation of the program:\n",
613 |             "```\n",
614 |             "import java.util.*;\n",
615 |             "public class SumOfSublists {\n",
616 |             "    public static void main(String[] args) {\n",
617 |             "        List list = Arrays.asList(1, 2, 3, 4, 5));\n",
618 |             "        List sums = new ArrayList<>();\n",
619 |             "        for (int i = 0; i < list.size(); i++) {\n",
620 |             "            int sum = 0;\n",
621 |             "            for (int j = i; j < list.size(); j++) {\n",
622 |             "                sum += list.get(j));\n",
623 |             "            }\n",
624 |             "\n",
625 |             "            sums.add(sum));\n",
626 |             "        }\n",
627 |             "\n",
628 |             "\n",
629 |             "        System.out.println(\"The sums of all contiguous sublists are: \" + sums));\n",
630 |             "    }\n"
631 |           ]
632 |         }
633 |       ],
634 |       "source": [
635 |         "prompt = (\"Write a Java program that computes the set of sums of all contiguous \"\n",
636 |         "          \"sublists of a given list.\")\n",
637 |         "\n",
638 |         "output = codellama_instruct.generate(\n",
639 |         "    prompt=prompt,\n",
640 |         "    progress_callback=StreamToStdout(callback_interval=2)\n",
641 |         ")"
642 |       ]
643 |     },
644 |     {
645 |       "cell_type": "code",
646 |       "execution_count": null,
647 |       "metadata": {
648 |         "id": "dl8cowtUz5yp"
649 |       },
650 |       "outputs": [],
651 |       "source": [
652 |         "codellama_instruct.reset_chat()"
653 |       ]
654 |     },
655 |     {
656 |       "cell_type": "code",
657 |       "execution_count": null,
658 |       "metadata": {
659 |         "colab": {
660 |           "base_uri": "https://localhost:8080/"
661 |         },
662 |         "id": "liWTlsBBYTRa",
663 |         "outputId": "0edabb88-8219-448a-ebf6-66fd86767997"
664 |       },
665 |       "outputs": [
666 |         {
667 |           "name": "stdout",
668 |           "output_type": "stream",
669 |           "text": [
670 |             "Here is a program in Python that solves the problem of finding the indices of two numbers in an array that add up to a target value:\n",
671 |             "```\n",
672 |             "def find_indices(nums, target):\n",
673 |             "    # Initialize two empty lists to store the indices of the two numbers\n",
674 |             "    for i in range(len(nums)))):\n",
675 |             "        for j in range(len(nums)))):\n",
676 |             "            if i != j and nums[i] + nums[j]] == target:\n",
677 |             "                indices = [i, j]]\n",
678 |             "    return indices\n"
679 |           ]
680 |         }
681 |       ],
682 |       "source": [
683 |         "prompt = (\"Given an array of integers nums and an integer target, return \"\n",
684 |         "          \"indices of the two numbers such that they add up to target.\"\n",
685 |         "          \" Write this program in Python.\")\n",
686 |         "\n",
687 |         "output = codellama_instruct.generate(\n",
688 |         "    prompt=prompt,\n",
689 |         "    progress_callback=StreamToStdout(callback_interval=2)\n",
690 |         ")"
691 |       ]
692 |     },
693 |     {
694 |       "cell_type": "code",
695 |       "execution_count": null,
696 |       "metadata": {
697 |         "id": "UzIMwuiaWwSg"
698 |       },
699 |       "outputs": [],
700 |       "source": [
701 |         "# Restart colab to create a new ChatModule\n",
702 |         "exit()"
703 |       ]
704 |     }
705 |   ],
706 |   "metadata": {
707 |     "accelerator": "GPU",
708 |     "colab": {
709 |       "gpuType": "T4",
710 |       "provenance": []
711 |     },
712 |     "kernelspec": {
713 |       "display_name": "Python 3",
714 |       "name": "python3"
715 |     },
716 |     "language_info": {
717 |       "name": "python"
718 |     }
719 |   },
720 |   "nbformat": 4,
721 |   "nbformat_minor": 0
722 | }
723 | 
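Note on the infilling helpers in the notebook above: extraction appears to have stripped CodeLlama's special infilling tokens from the `text_infilling` / `print_infilling` cells. The sketch below shows how these helpers are intended to work; the token spellings `<PRE>`, `<SUF>`, `<MID>`, `<EOT>` and the `<FILL>` placeholder are assumptions based on CodeLlama's documented infilling format, not verbatim from the original cells.

```python
# Minimal sketch (not from the original notebooks) of the infilling helpers.
# Assumption: <PRE>/<SUF>/<MID>/<EOT> are CodeLlama's infilling control tokens
# and <FILL> marks the spot to complete; the original cells may spell these
# slightly differently.

def text_infilling(prompt: str) -> str:
    # Split the user prompt at the <FILL> placeholder and wrap the two halves
    # in the infilling control tokens expected by the model.
    prefix, suffix = prompt.split("<FILL>")
    return f"<PRE> {prefix} <SUF> {suffix} <MID>"

def print_infilling(prompt: str, output: str) -> None:
    # Strip the end-of-text marker from the model output and splice the
    # completion back into the prompt at the placeholder.
    print(prompt.replace("<FILL>", output.replace("<EOT>", "")))

# Hypothetical round trip (no model involved):
prompt = "def add(a, b):\n    <FILL>\n"
fake_output = "return a + b <EOT>"
print_infilling(prompt, fake_output)  # prints the prompt with the body filled in
```

In the notebook, `text_infilling` builds the request sent to `ChatModule.generate`, and `print_infilling` splices the generated text back into the original prompt at the placeholder position.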


--------------------------------------------------------------------------------
/mlc-llm/models/demo_CodeLlama_7b.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "cells": [
  3 |     {
  4 |       "cell_type": "markdown",
  5 |       "metadata": {
  6 |         "id": "4IwhlCjVtpYj"
  7 |       },
  8 |       "source": [
  9 |         "# Demo: CodeLlama-7b with MLC LLM\n",
 10 |         "\n",
 11 |         "Recently, Meta unveiled [CodeLlama](https://github.com/facebookresearch/codellama), a family of large language models for code based on Llama 2 providing state-of-the-art performance among open models, infilling capabilities, support for large input contexts, and zero-shot instruction following ability for programming tasks. This notebook demonstrates MLC LLM's support for the CodeLlama family:\n",
 12 |         "\n",
 13 |         "- **[CodeLlama](https://huggingface.co/codellama/CodeLlama-7b-hf): a coding foundation LLM**\n",
 14 |         "- **[CodeLlama-Instruct](https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf): an instruction-tuned LLM for coding**\n",
 15 |         "- **[CodeLlama-Python](https://huggingface.co/codellama/CodeLlama-7b-Python-hf): a Python specialized LLM**\n",
 16 |         "\n",
 17 |         "In this respect, MLC LLM allows everyone to develop, optimize and deploy AI models natively on their own devices. This makes it possible to deploy coding LLMs natively, acting as **a personal AI coding assistant**.\n",
 18 |         "\n",
 19 |         "In this notebook, we walk through the steps of using MLC LLM to run these pre-compiled CodeLlama models! We have uploaded various versions of the pre-compiled and quantized CodeLlama models here: https://huggingface.co/mlc-ai.\n",
 20 |         "\n",
 21 |         "Learn more about MLC LLM here: https://mlc.ai/mlc-llm/docs."
 22 |       ]
 23 |     },
 24 |     {
 25 |       "cell_type": "markdown",
 26 |       "metadata": {},
 27 |       "source": [
 28 |         "Here's an overview regarding each model's capabilities:\n",
 29 |         "\n",
 30 |         "|                       | Code Completion | Infilling | Instruction/chat | Python specialist |\n",
 31 |         "|-----------------------|-----------------|-----------|------------------|-------------------|\n",
 32 |         "| CodeLlama-7b          |        X        |     X     |                  |                   |\n",
 33 |         "| CodeLlama-7b-Python   |        X        |           |                  |         X         |\n",
 34 |         "| CodeLlama-7b-Instruct |        X        |     X     |         X        |                   |"
 35 |       ]
 36 |     },
 37 |     {
 38 |       "cell_type": "markdown",
 39 |       "metadata": {
 40 |         "id": "YsvAL7SSt9Lo"
 41 |       },
 42 |       "source": [
 43 |         "Click the button below to get started!\n",
 44 |         "\n",
 45 |         "\n",
 46 |         "  \"Open\n",
 47 |         ""
 48 |       ]
 49 |     },
 50 |     {
 51 |       "cell_type": "markdown",
 52 |       "metadata": {
 53 |         "id": "8kkADAMCCLi-"
 54 |       },
 55 |       "source": [
 56 |         "## Install MLC LLM"
 57 |       ]
 58 |     },
 59 |     {
 60 |       "cell_type": "markdown",
 61 |       "metadata": {
 62 |         "id": "Y2EwuS6TCO61"
 63 |       },
 64 |       "source": [
 65 |         "We will start by setting up the environment. First, let us create a new Conda environment, in which we will run the rest of the notebook.\n",
 66 |         "\n",
 67 |         "```\n",
 68 |         "conda create --name mlc-llm python=3.10\n",
 69 |         "conda activate mlc-llm\n",
 70 |         "```"
 71 |       ]
 72 |     },
 73 |     {
 74 |       "cell_type": "markdown",
 75 |       "metadata": {
 76 |         "id": "ojEeEmsqCTPG"
 77 |       },
 78 |       "source": [
 79 |         "**Google Colab**\n",
 80 |         "\n",
 81 |         "- If you are running this in a Google Colab notebook, you do not need to create a conda environment.\n",
 82 |         "- However, be sure to change your runtime to GPU by going to `Runtime` > `Change runtime type` and setting the Hardware accelerator to be \"GPU\"."
 83 |       ]
 84 |     },
 85 |     {
 86 |       "cell_type": "markdown",
 87 |       "metadata": {
 88 |         "id": "S_rX53bGChPn"
 89 |       },
 90 |       "source": [
 91 |         "If you are using CUDA, you can run the following command to confirm that CUDA is set up correctly, and check the driver version number as well as what GPUs are currently available for use."
 92 |       ]
 93 |     },
 94 |     {
 95 |       "cell_type": "code",
 96 |       "execution_count": null,
 97 |       "metadata": {
 98 |         "id": "CRPeCflbCij6"
 99 |       },
100 |       "outputs": [],
101 |       "source": [
102 |         "!nvidia-smi"
103 |       ]
104 |     },
105 |     {
106 |       "cell_type": "markdown",
107 |       "metadata": {
108 |         "id": "PQfVfTAYC1M-"
109 |       },
110 |       "source": [
111 |         "Next, let's download the MLC-AI and MLC-LLM nightly build packages. If you are running in a Colab environment, then you can just run the following command. Otherwise, go to https://mlc.ai/package/ and replace the command below with the one that is appropriate for your hardware and OS."
112 |       ]
113 |     },
114 |     {
115 |       "cell_type": "markdown",
116 |       "metadata": {
117 |         "id": "vi-udt4tC5c9"
118 |       },
119 |       "source": [
120 |         "**Google Colab**: If you are using Colab, you may see red warnings such as \"You must restart the runtime in order to use newly installed versions.\" For our purposes, we can disregard them; the notebook will still run correctly."
121 |       ]
122 |     },
123 |     {
124 |       "cell_type": "code",
125 |       "execution_count": null,
126 |       "metadata": {
127 |         "id": "ah9tYaCRCkKS"
128 |       },
129 |       "outputs": [],
130 |       "source": [
131 |         "!pip install --pre --force-reinstall mlc-ai-nightly-cu118 mlc-llm-nightly-cu118 -f https://mlc.ai/wheels"
132 |       ]
133 |     },
134 |     {
135 |       "cell_type": "markdown",
136 |       "metadata": {
137 |         "id": "nZGVNJE-DJ9E"
138 |       },
139 |       "source": [
140 |         "Let's confirm we have installed the packages successfully!"
141 |       ]
142 |     },
143 |     {
144 |       "cell_type": "code",
145 |       "execution_count": null,
146 |       "metadata": {
147 |         "id": "5Y6LszJgC7SQ"
148 |       },
149 |       "outputs": [],
150 |       "source": [
151 |         "!python -c \"import tvm; print('tvm installed properly!')\"\n",
152 |         "!python -c \"import mlc_llm; print('mlc_llm installed properly!')\""
153 |       ]
154 |     },
155 |     {
156 |       "cell_type": "markdown",
157 |       "metadata": {
158 |         "id": "lGfnrRa9DMw1"
159 |       },
160 |       "source": [
161 |         "## Download Prebuilt Models and Library"
162 |       ]
163 |     },
164 |     {
165 |       "cell_type": "markdown",
166 |       "metadata": {
167 |         "id": "pVYkLb0eDjMi"
168 |       },
169 |       "source": [
170 |         "The following commands will download all the available prebuilt libraries (e.g., `.so` files), including the precompiled CodeLlama models. This may take a while. If in **Google Colab**, you can verify that the files are being downloaded by clicking on the folder icon on the left."
171 |       ]
172 |     },
173 |     {
174 |       "cell_type": "markdown",
175 |       "metadata": {
176 |         "id": "Pg7daEvlD5UB"
177 |       },
178 |       "source": [
179 |         "Note: If you are NOT running in **Google Colab** you may need to run this line `!conda install git git-lfs` to install `git` and `git-lfs` before running the following cell."
180 |       ]
181 |     },
182 |     {
183 |       "cell_type": "code",
184 |       "execution_count": null,
185 |       "metadata": {
186 |         "id": "FDFbw1KPDLu1"
187 |       },
188 |       "outputs": [],
189 |       "source": [
190 |         "!git lfs install"
191 |       ]
192 |     },
193 |     {
194 |       "cell_type": "code",
195 |       "execution_count": null,
196 |       "metadata": {
197 |         "id": "bYqaVjmND7Px"
198 |       },
199 |       "outputs": [],
200 |       "source": [
201 |         "!mkdir -p dist/prebuilt\n",
202 |         "!git clone https://github.com/mlc-ai/binary-mlc-llm-libs.git dist/prebuilt/lib"
203 |       ]
204 |     },
205 |     {
206 |       "cell_type": "markdown",
207 |       "metadata": {
208 |         "id": "SMEavWCJEC_d"
209 |       },
210 |       "source": [
211 |         "#### CodeLlama-7b q4f16_1 prebuilt weights"
212 |       ]
213 |     },
214 |     {
215 |       "cell_type": "code",
216 |       "execution_count": null,
217 |       "metadata": {
218 |         "id": "etHEUrfMD8bX"
219 |       },
220 |       "outputs": [],
221 |       "source": [
222 |         "!cd dist/prebuilt && git clone https://huggingface.co/mlc-ai/mlc-chat-CodeLlama-7b-hf-q4f16_1"
223 |       ]
224 |     },
225 |     {
226 |       "cell_type": "markdown",
227 |       "metadata": {
228 |         "id": "FQkIX4TpElR6"
229 |       },
230 |       "source": [
231 |         "#### CodeLlama-7b-Instruct q4f16_1 prebuilt weights"
232 |       ]
233 |     },
234 |     {
235 |       "cell_type": "code",
236 |       "execution_count": null,
237 |       "metadata": {
238 |         "id": "mTEGXAlhEnOw"
239 |       },
240 |       "outputs": [],
241 |       "source": [
242 |         "!cd dist/prebuilt && git clone https://huggingface.co/mlc-ai/mlc-chat-CodeLlama-7b-Instruct-hf-q4f16_1"
243 |       ]
244 |     },
245 |     {
246 |       "cell_type": "markdown",
247 |       "metadata": {
248 |         "id": "acb1HpKpEoca"
249 |       },
250 |       "source": [
251 |         "#### CodeLlama-7b-Python q4f16_1 prebuilt weights"
252 |       ]
253 |     },
254 |     {
255 |       "cell_type": "code",
256 |       "execution_count": null,
257 |       "metadata": {
258 |         "id": "oysLKcZ4Eou7"
259 |       },
260 |       "outputs": [],
261 |       "source": [
262 |         "!cd dist/prebuilt && git clone https://huggingface.co/mlc-ai/mlc-chat-CodeLlama-7b-Python-hf-q4f16_1"
263 |       ]
264 |     },
265 |     {
266 |       "cell_type": "code",
267 |       "execution_count": null,
268 |       "metadata": {
269 |         "id": "dbHdyfIXHNpo"
270 |       },
271 |       "outputs": [],
272 |       "source": [
273 |         "# Restart colab\n",
274 |         "exit()"
275 |       ]
276 |     },
277 |     {
278 |       "cell_type": "markdown",
279 |       "metadata": {
280 |         "id": "CmpxrrqyE0S6"
281 |       },
282 |       "source": [
283 |         "## Let's code with CodeLlama!"
284 |       ]
285 |     },
286 |     {
287 |       "cell_type": "markdown",
288 |       "metadata": {
289 |         "id": "y73vNLy1OfMr"
290 |       },
291 |       "source": [
292 |         "Let's first try a simple code completion task with CodeLlama-Python."
293 |       ]
294 |     },
295 |     {
296 |       "cell_type": "code",
297 |       "execution_count": null,
298 |       "metadata": {
299 |         "id": "EOEf8sDyEwuv"
300 |       },
301 |       "outputs": [],
302 |       "source": [
303 |         "from mlc_llm import ChatModule\n",
304 |         "from mlc_llm.callback import StreamToStdout"
305 |       ]
306 |     },
307 |     {
308 |       "cell_type": "code",
309 |       "execution_count": null,
310 |       "metadata": {
311 |         "id": "tBcQMm-KJPN-"
312 |       },
313 |       "outputs": [],
314 |       "source": [
315 |         "codellama_python = ChatModule(model=\"CodeLlama-7b-Python-hf-q4f16_1\", device=\"cuda\")"
316 |       ]
317 |     },
318 |     {
319 |       "cell_type": "code",
320 |       "execution_count": null,
321 |       "metadata": {
322 |         "id": "1g2l_hJhLyYm"
323 |       },
324 |       "outputs": [],
325 |       "source": [
326 |         "prompt = \"\"\"\\\n",
327 |         "# Self-attention block implementation\n",
328 |         "class SelfAttentionBlock(nn.Module):\n",
329 |         "    def __init__(\"\"\"\n",
330 |         "\n",
331 |         "output = codellama_python.generate(\n",
332 |         "    prompt=prompt,\n",
333 |         "    progress_callback=StreamToStdout(callback_interval=2)\n",
334 |         ")"
335 |       ]
336 |     },
337 |     {
338 |       "cell_type": "code",
339 |       "execution_count": null,
340 |       "metadata": {
341 |         "colab": {
342 |           "base_uri": "https://localhost:8080/"
343 |         },
344 |         "id": "XwU54BtKQKz4",
345 |         "outputId": "d5409224-cfbf-4c28-8a81-40bccfc02572"
346 |       },
347 |       "outputs": [
348 |         {
349 |           "name": "stdout",
350 |           "output_type": "stream",
351 |           "text": [
352 |             "# Self-attention block implementation\n",
353 |             "class SelfAttentionBlock(nn.Module):\n",
354 |             "    def __init__(self, dim, num_heads):\n",
355 |             "        super().__init__()\n",
356 |             "        self.num_heads = num_heads\n",
357 |             "        self.key = nn.Linear(dim, dim))\n",
358 |             "        self.value = nn.Linear(dim, dim))\n",
359 |             "        self.proj = nn.Linear(dim, dim))\n",
360 |             "\n",
361 |             "    def forward(self, x):\n",
362 |             "        B, N, C = x.shape\n",
363 |             "        q = self.key(x[:, :, :-64])))\n",
364 |             "        k = self.key(x[:, :, :64]]))\n",
365 |             "        v = self.value(x[:, :, :]]]]))\n",
366 |             "        attn = (q @ k.transpose(-1), v))\n",
367 |             "\n",
368 |             "        x = self.proj(attn[0]]))))\n",
369 |             "\n",
370 |             "        return x\n"
371 |           ]
372 |         }
373 |       ],
374 |       "source": [
375 |         "print(prompt+output)"
376 |       ]
377 |     },
378 |     {
379 |       "cell_type": "code",
380 |       "execution_count": null,
381 |       "metadata": {
382 |         "id": "DFrVdqewL7_c"
383 |       },
384 |       "outputs": [],
385 |       "source": [
386 |         "# Restart colab to initialize a new ChatModule\n",
387 |         "exit()"
388 |       ]
389 |     },
390 |     {
391 |       "cell_type": "markdown",
392 |       "metadata": {
393 |         "id": "6CUo34QeQto2"
394 |       },
395 |       "source": [
396 |         "The CodeLlama models support infilling based on surrounding content. Let's try it with the foundation CodeLlama."
397 |       ]
398 |     },
399 |     {
400 |       "cell_type": "code",
401 |       "execution_count": null,
402 |       "metadata": {
403 |         "id": "yeHn8je9SBpK"
404 |       },
405 |       "outputs": [],
406 |       "source": [
407 |         "from mlc_llm import ChatModule\n",
408 |         "from mlc_llm.callback import StreamToStdout\n",
409 |         "\n",
410 |         "def text_infilling(prompt: str):\n",
411 |         "    prefix = prompt.split(\"<FILL>\")[0]\n",
412 |         "    suffix = prompt.split(\"<FILL>\")[1]\n",
413 |         "    return f\"<PRE> {prefix} <SUF> {suffix} <MID>\"\n",
414 |         "\n",
415 |         "def print_infilling(prompt: str, output: str):\n",
416 |         "    print(prompt.replace(\"<FILL>\", output.replace(\"<EOT>\", \"\")))"
417 |       ]
418 |     },
419 |     {
420 |       "cell_type": "code",
421 |       "execution_count": null,
422 |       "metadata": {
423 |         "id": "LUU8zTdRSKNK"
424 |       },
425 |       "outputs": [],
426 |       "source": [
427 |         "codellama = ChatModule(model=\"CodeLlama-7b-hf-q4f16_1\", device=\"cuda\")"
428 |       ]
429 |     },
430 |     {
431 |       "cell_type": "code",
432 |       "execution_count": null,
433 |       "metadata": {
434 |         "id": "d3rrm1qPSOkr"
435 |       },
436 |       "outputs": [],
437 |       "source": [
438 |         "prompt = \"\"\"\\\n",
439 |         "# Installation instructions:\n",
440 |         "    <FILL>\n",
441 |         "This downloads the LLaMA inference code and installs the repository as a local pip package.\n",
442 |         "\"\"\"\n",
443 |         "\n",
444 |         "output = codellama.generate(\n",
445 |         "    prompt=text_infilling(prompt),\n",
446 |         "    progress_callback=StreamToStdout(callback_interval=2)\n",
447 |         ")"
448 |       ]
449 |     },
450 |     {
451 |       "cell_type": "code",
452 |       "execution_count": null,
453 |       "metadata": {
454 |         "colab": {
455 |           "base_uri": "https://localhost:8080/"
456 |         },
457 |         "id": "78SHkqohUbCc",
458 |         "outputId": "33a4ef28-2db0-4e8a-c886-a630fb2d8df6"
459 |       },
460 |       "outputs": [
461 |         {
462 |           "name": "stdout",
463 |           "output_type": "stream",
464 |           "text": [
465 |             "# Installation instructions:\n",
466 |             "    1. Clone the repository.\n",
467 |             "    ```\n",
468 |             "    git clone https://github.com/LLaMA/LLaMA.git\n",
469 |             "    ```\n",
470 |             "\n",
471 |             "\n",
472 |             "2. Install the pip package.\n",
473 |             "    ```\n",
474 |             "    cd LLaMA\n",
475 |             "    pip install -e .\n",
476 |             "    ```\n",
477 |             " \n",
478 |             "This downloads the LLaMA inference code and installs the repository as a local pip package.\n",
479 |             "\n"
480 |           ]
481 |         }
482 |       ],
483 |       "source": [
484 |         "print_infilling(prompt, output)"
485 |       ]
486 |     },
487 |     {
488 |       "cell_type": "code",
489 |       "execution_count": null,
490 |       "metadata": {
491 |         "id": "mKYyRyUVPGm2"
492 |       },
493 |       "outputs": [],
494 |       "source": [
495 |         "# Restart colab to create a new ChatModule\n",
496 |         "exit()"
497 |       ]
498 |     },
499 |     {
500 |       "cell_type": "markdown",
501 |       "metadata": {
502 |         "id": "CqmYlUtwV01m"
503 |       },
504 |       "source": [
505 |         "Finally, CodeLlama-Instruct has instruction-following ability for programming tasks."
506 |       ]
507 |     },
508 |     {
509 |       "cell_type": "code",
510 |       "execution_count": null,
511 |       "metadata": {
512 |         "id": "IcCPRp9oWBfh"
513 |       },
514 |       "outputs": [],
515 |       "source": [
516 |         "from mlc_llm import ChatModule\n",
517 |         "from mlc_llm.callback import StreamToStdout"
518 |       ]
519 |     },
520 |     {
521 |       "cell_type": "code",
522 |       "execution_count": null,
523 |       "metadata": {
524 |         "id": "I4C8iC9IWyZR"
525 |       },
526 |       "outputs": [],
527 |       "source": [
528 |         "codellama_instruct = ChatModule(model=\"CodeLlama-7b-Instruct-hf-q4f16_1\", device=\"cuda\")"
529 |       ]
530 |     },
531 |     {
532 |       "cell_type": "code",
533 |       "execution_count": null,
534 |       "metadata": {
535 |         "colab": {
536 |           "base_uri": "https://localhost:8080/"
537 |         },
538 |         "id": "1kL02gKWW7Ov",
539 |         "outputId": "26c64719-dc61-4f31-dd5d-78e215855197"
540 |       },
541 |       "outputs": [
542 |         {
543 |           "name": "stdout",
544 |           "output_type": "stream",
545 |           "text": [
546 |             "Here is a C++ program that computes the set of sums of all contiguous sublists of a given list:\n",
547 |             "#include \n",
548 |             "using namespace std;\n",
549 |             "void computeSums(const list &lst, list &sums) {\n",
550 |             "    // Initialize the sums list\n",
551 |             "    sums.clear();\n",
552 |             "    // Compute the sums of all contiguous sublists\n",
553 |             "    for (int i = 0; i < lst.size() - 1; i++) {\n",
554 |             "        int sum = 0;\n",
555 |             "        for (int j = i; j < lst.size() - 1; j++) {\n",
556 |             "            sum += lst[j];\n",
557 |             "        }\n",
558 |             "        sums.push_back(sum));\n",
559 |             "    }\n",
560 |             "    // Print the sums list\n",
561 |             "    for (int i = 0; i < sums.size(); i++) {\n",
562 |             "        cout << sums[i] << endl;\n",
563 |             "    }\n",
564 |             "}\n",
565 |             "int main() {\n",
566 |             "    list lst = {1, 2, 3, 4, 5};\n",
567 |             "    list sums;\n",
568 |             "    computeSums(lst, sums);\n",
569 |             "    return 0;\n",
570 |             "}\n",
571 |             "This program takes a list of integers as input, and computes the set of sums of all contiguous sublists of the input list. The program then prints the computed set of sums.\n",
572 |             "Note that the input list must be a list of integers, and that the program will produce an error if the input list is not a list of integers.\n"
573 |           ]
574 |         }
575 |       ],
576 |       "source": [
577 |         "prompt = (\"Write a C++ program that computes the set of sums of all contiguous \"\n",
578 |         "          \"sublists of a given list.\")\n",
579 |         "\n",
580 |         "output = codellama_instruct.generate(\n",
581 |         "    prompt=prompt,\n",
582 |         "    progress_callback=StreamToStdout(callback_interval=2)\n",
583 |         ")"
584 |       ]
585 |     },
586 |     {
587 |       "cell_type": "code",
588 |       "execution_count": null,
589 |       "metadata": {
590 |         "colab": {
591 |           "base_uri": "https://localhost:8080/"
592 |         },
593 |         "id": "liWTlsBBYTRa",
594 |         "outputId": "07907456-2600-4515-fc2d-af9213803ef6"
595 |       },
596 |       "outputs": [
597 |         {
598 |           "name": "stdout",
599 |           "output_type": "stream",
600 |           "text": [
601 |             "Here is the C++ program written in Java instead:\n",
602 |             "import java.util.ArrayList;\n",
603 |             "public class SumsOfSublists {\n",
604 |             "    public static void main(String[] args) {\n",
605 |             "        ArrayList lst = new ArrayList(){{add(1);add(2);add(3);add(4);add(5);}};\n",
606 |             "        ArrayList sums = new ArrayList();\n",
607 |             "        computeSums(lst, sums));\n",
608 |             "        for (int i = 0; i < sums.size(); i++) {\n",
609 |             "            System.out.println(sums[i])));\n",
610 |             "        }\n",
611 |             "    }\n",
612 |             "    public static void computeSums(ArrayList lst, ArrayList sums) {\n",
613 |             "        for (int i = 0; i < lst.size() - 1; i++) {\n",
614 |             "            int sum = 0;\n",
615 |             "            for (int j = i; j < lst.size() - 1; j++) {\n",
616 |             "                sum += lst[j]);\n",
617 |             "            }\n",
618 |             "            sums.add(sum));\n",
619 |             "        }\n",
620 |             "    }\n",
621 |             "}\n",
622 |             "This Java program takes a list of integers as input, and computes the set of sums of all contiguous sublists of the input list. The program then prints the computed set of sums.\n",
623 |             "Note that the input list must be a list of integers, and that the program will produce an error if the input list is not a list of integers.\n"
624 |           ]
625 |         }
626 |       ],
627 |       "source": [
628 |         "output = codellama_instruct.generate(\n",
629 |         "    prompt=\"Write this in Java instead.\",\n",
630 |         "    progress_callback=StreamToStdout(callback_interval=2)\n",
631 |         ")"
632 |       ]
633 |     },
634 |     {
635 |       "cell_type": "code",
636 |       "execution_count": null,
637 |       "metadata": {
638 |         "id": "UzIMwuiaWwSg"
639 |       },
640 |       "outputs": [],
641 |       "source": [
642 |         "# Restart colab to create a new ChatModule\n",
643 |         "exit()"
644 |       ]
645 |     }
646 |   ],
647 |   "metadata": {
648 |     "accelerator": "GPU",
649 |     "colab": {
650 |       "authorship_tag": "ABX9TyMgGy5PkxU2LXzQzjaEnyyL",
651 |       "gpuType": "T4",
652 |       "provenance": []
653 |     },
654 |     "kernelspec": {
655 |       "display_name": "Python 3",
656 |       "name": "python3"
657 |     },
658 |     "language_info": {
659 |       "name": "python"
660 |     }
661 |   },
662 |   "nbformat": 4,
663 |   "nbformat_minor": 0
664 | }
665 | 
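For quick reference, here is a condensed sketch of the ChatModule interaction pattern that both CodeLlama notebooks above follow (and that the WizardLM notebook below reuses). It is a minimal sketch, not a verbatim cell: it assumes the prebuilt q4f16_1 weights and libraries have already been cloned into `dist/prebuilt` as shown in the download cells, that a CUDA GPU is available, and it uses one of the prebuilt model IDs from the notebooks; only APIs that appear in the notebooks (`ChatModule`, `generate`, `StreamToStdout`, `reset_chat`) are used.

```python
# Condensed sketch of the ChatModule usage pattern from the notebooks above.
# Assumes prebuilt weights/libraries are already in dist/prebuilt and CUDA is available.
from mlc_llm import ChatModule
from mlc_llm.callback import StreamToStdout

# Load a prebuilt, quantized model; MLC LLM resolves it from dist/prebuilt.
cm = ChatModule(model="CodeLlama-7b-Instruct-hf-q4f16_1", device="cuda")

# First request: stream tokens to stdout as they are generated.
first = cm.generate(
    prompt="Write a Python function that reverses a string.",
    progress_callback=StreamToStdout(callback_interval=2),
)

# A follow-up prompt in the same conversation reuses the chat history...
followup = cm.generate(
    prompt="Now add type hints and a docstring.",
    progress_callback=StreamToStdout(callback_interval=2),
)

# ...while reset_chat() clears the history before an unrelated prompt.
cm.reset_chat()
```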


--------------------------------------------------------------------------------
/mlc-llm/models/demo_WizardLM_Math_Coder.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "cells": [
  3 |     {
  4 |       "cell_type": "markdown",
  5 |       "metadata": {
  6 |         "id": "7aZkVRjX6hun"
  7 |       },
  8 |       "source": [
  9 |         "# Demo: WizardLM, WizardMath, and WizardCoder with MLC LLM\n",
 10 |         "\n",
 11 |         "WizardLM recently released their WizardMath model, which has achieved impressive results on various benchmarks. We take this opportunity to demonstrate MLC LLM's support for the Wizard model family: https://github.com/nlpxucan/WizardLM.\n",
 12 |         "\n",
 13 |         "Specifically, we will look at:\n",
 14 |         "- **[WizardLM](https://github.com/nlpxucan/WizardLM/tree/main/WizardLM): an instruction-following LLM using Evol-Instruct**\n",
 15 |         "- **[WizardCoder](https://github.com/nlpxucan/WizardLM/tree/main/WizardCoder): a code LLM with Evol-Instruct**\n",
 16 |         "- **[WizardMath](https://github.com/nlpxucan/WizardLM/tree/main/WizardMath): a mathematical reasoning LLM via reinforced Evol-Instruct**\n",
 17 |         "\n",
 18 |         "The task-specific Wizard models resonate with one of the visions of MLC LLM: deploying LLMs natively, each acting as **a personal AI assistant for a specific realm of tasks**.\n",
 19 |         "\n",
 20 |         "In this notebook, we walk over the steps of using MLC LLM to run these pre-compiled Wizard models! We have uploaded various versions of the pre-compiled and quantized Wizard models here: https://huggingface.co/mlc-ai.\n",
 21 |         "\n",
 22 |         "Learn more about MLC LLM here: https://mlc.ai/mlc-llm/docs."
 23 |       ]
 24 |     },
 25 |     {
 26 |       "cell_type": "markdown",
 27 |       "metadata": {
 28 |         "id": "rgD6BbKu_Dm0"
 29 |       },
 30 |       "source": [
 31 |         "Click the button below to get started!\n",
 32 |         "\n",
 33 |         "\n",
 34 |         "  \"Open\n",
 35 |         ""
 36 |       ]
 37 |     },
 38 |     {
 39 |       "cell_type": "markdown",
 40 |       "metadata": {
 41 |         "id": "rKvxnQF-9y8T"
 42 |       },
 43 |       "source": [
 44 |         "## Install MLC LLM"
 45 |       ]
 46 |     },
 47 |     {
 48 |       "cell_type": "markdown",
 49 |       "metadata": {
 50 |         "id": "cEfutAOe-48p"
 51 |       },
 52 |       "source": [
 53 |         "We will start by setting up the environment. First, let us create a new Conda environment, in which we will run the rest of the notebook.\n",
 54 |         "\n",
 55 |         "```\n",
 56 |         "conda create --name mlc-llm python=3.10\n",
 57 |         "conda activate mlc-llm\n",
 58 |         "```"
 59 |       ]
 60 |     },
 61 |     {
 62 |       "cell_type": "markdown",
 63 |       "metadata": {
 64 |         "id": "r3N6HKk8_Bbl"
 65 |       },
 66 |       "source": [
 67 |         "**Google Colab**\n",
 68 |         "\n",
 69 |         "- If you are running this in a Google Colab notebook, you do not need to create a conda environment.\n",
 70 |         "- However, be sure to change your runtime to GPU by going to `Runtime` > `Change runtime type` and setting the Hardware accelerator to be \"GPU\"."
 71 |       ]
 72 |     },
 73 |     {
 74 |       "cell_type": "markdown",
 75 |       "metadata": {
 76 |         "id": "al4bIcFv_HtH"
 77 |       },
 78 |       "source": [
 79 |         "If you are using CUDA, you can run the following command to confirm that CUDA is set up correctly, and check the driver version number as well as what GPUs are currently available for use.\n",
 80 |         "\n"
 81 |       ]
 82 |     },
 83 |     {
 84 |       "cell_type": "code",
 85 |       "execution_count": null,
 86 |       "metadata": {
 87 |         "id": "8wEfFZ8f6vT3"
 88 |       },
 89 |       "outputs": [],
 90 |       "source": [
 91 |         "!nvidia-smi"
 92 |       ]
 93 |     },
 94 |     {
 95 |       "cell_type": "markdown",
 96 |       "metadata": {
 97 |         "id": "UJnujwMT_RVZ"
 98 |       },
 99 |       "source": [
100 |         "Next, let's download the MLC-AI and mlc-llm nightly build packages. If you are running in a Colab environment, then you can just run the following command. Otherwise, go to https://mlc.ai/package/ and replace the command below with the one that is appropriate for your hardware and OS."
101 |       ]
102 |     },
103 |     {
104 |       "cell_type": "markdown",
105 |       "metadata": {
106 |         "id": "s6wHMUtk_M6A"
107 |       },
108 |       "source": [
109 |         "**Google Colab**: If you are using Colab, you may see red warnings such as \"You must restart the runtime in order to use newly installed versions.\" For our purposes, we can disregard them; the notebook will still run correctly."
110 |       ]
111 |     },
112 |     {
113 |       "cell_type": "code",
114 |       "execution_count": null,
115 |       "metadata": {
116 |         "id": "4TGX5fqYjyID"
117 |       },
118 |       "outputs": [],
119 |       "source": [
120 |         "!pip install --pre --force-reinstall mlc-ai-nightly-cu118 mlc-llm-nightly-cu118 -f https://mlc.ai/wheels"
121 |       ]
122 |     },
123 |     {
124 |       "cell_type": "markdown",
125 |       "metadata": {
126 |         "id": "HWMMGRlg_nxj"
127 |       },
128 |       "source": [
129 |         "Let's confirm we have installed the packages successfully!"
130 |       ]
131 |     },
132 |     {
133 |       "cell_type": "code",
134 |       "execution_count": null,
135 |       "metadata": {
136 |         "id": "4R-150I6_q6N"
137 |       },
138 |       "outputs": [],
139 |       "source": [
140 |         "!python -c \"import tvm; print('tvm installed properly!')\"\n",
141 |         "!python -c \"import mlc_llm; print('mlc_llm installed properly!')\""
142 |       ]
143 |     },
144 |     {
145 |       "cell_type": "markdown",
146 |       "metadata": {
147 |         "id": "GbYg_EnT_4Qf"
148 |       },
149 |       "source": [
150 |         "## Download Prebuilt Models and Library"
151 |       ]
152 |     },
153 |     {
154 |       "cell_type": "markdown",
155 |       "metadata": {
156 |         "id": "RdGuw9vB_8Qp"
157 |       },
158 |       "source": [
159 |         "These commands will download many prebuilt libraries (e.g., `.so` files) as well as the precompiled Wizard models. This may take a while. If in **Google Colab**, you can verify that the files are being downloaded by clicking on the folder icon on the left."
160 |       ]
161 |     },
162 |     {
163 |       "cell_type": "markdown",
164 |       "metadata": {
165 |         "id": "29N0JS4NAOtx"
166 |       },
167 |       "source": [
168 |         "Note: If you are NOT running in **Google Colab** you may need to run this line `!conda install git git-lfs` to install `git` and `git-lfs` before running the following cell."
169 |       ]
170 |     },
171 |     {
172 |       "cell_type": "code",
173 |       "execution_count": null,
174 |       "metadata": {
175 |         "id": "RLU17ZZmjzPz"
176 |       },
177 |       "outputs": [],
178 |       "source": [
179 |         "!git lfs install"
180 |       ]
181 |     },
182 |     {
183 |       "cell_type": "code",
184 |       "execution_count": null,
185 |       "metadata": {
186 |         "id": "oN6syhH7j2zi"
187 |       },
188 |       "outputs": [],
189 |       "source": [
190 |         "!mkdir -p dist/prebuilt\n",
191 |         "!git clone https://github.com/mlc-ai/binary-mlc-llm-libs.git dist/prebuilt/lib"
192 |       ]
193 |     },
194 |     {
195 |       "cell_type": "code",
196 |       "execution_count": null,
197 |       "metadata": {
198 |         "id": "EzU8dFr9j6VD"
199 |       },
200 |       "outputs": [],
201 |       "source": [
202 |         "!cd dist/prebuilt && git clone https://huggingface.co/mlc-ai/mlc-chat-WizardCoder-15B-V1.0-q4f16_1"
203 |       ]
204 |     },
205 |     {
206 |       "cell_type": "code",
207 |       "execution_count": null,
208 |       "metadata": {
209 |         "id": "TEQwp2cMj90p"
210 |       },
211 |       "outputs": [],
212 |       "source": [
213 |         "!cd dist/prebuilt && git clone https://huggingface.co/mlc-ai/mlc-chat-WizardLM-13B-V1.2-q4f16_1"
214 |       ]
215 |     },
216 |     {
217 |       "cell_type": "code",
218 |       "execution_count": null,
219 |       "metadata": {
220 |         "id": "vZvzVrngoREj"
221 |       },
222 |       "outputs": [],
223 |       "source": [
224 |         "!cd dist/prebuilt && git clone https://huggingface.co/mlc-ai/mlc-chat-WizardMath-13B-V1.0-q4f16_1"
225 |       ]
226 |     },
227 |     {
228 |       "cell_type": "code",
229 |       "execution_count": 8,
230 |       "metadata": {
231 |         "id": "rK4yVJdEDvQr"
232 |       },
233 |       "outputs": [],
234 |       "source": [
235 |         "# In Colab, for some reason we need to restart runtime by running `exit()`.\n",
236 |         "# Simply run `exit()`, then run the subsequent cells after runtime restarts.\n",
237 |         "exit()"
238 |       ]
239 |     },
240 |     {
241 |       "cell_type": "markdown",
242 |       "metadata": {
243 |         "id": "PK2DVVIk8Ryv"
244 |       },
245 |       "source": [
246 |         "## Let's Chat with WizardLM!"
247 |       ]
248 |     },
249 |     {
250 |       "cell_type": "code",
251 |       "execution_count": null,
252 |       "metadata": {
253 |         "id": "ZX4X9mGTnKSf"
254 |       },
255 |       "outputs": [],
256 |       "source": [
257 |         "from mlc_llm import ChatModule\n",
258 |         "from mlc_llm.callback import StreamToStdout"
259 |       ]
260 |     },
261 |     {
262 |       "cell_type": "code",
263 |       "execution_count": null,
264 |       "metadata": {
265 |         "id": "4Oaj1TdxidCP"
266 |       },
267 |       "outputs": [],
268 |       "source": [
269 |         "wizard_lm = ChatModule(model=\"WizardLM-13B-V1.2-q4f16_1\", device=\"cuda\")"
270 |       ]
271 |     },
272 |     {
273 |       "cell_type": "code",
274 |       "execution_count": null,
275 |       "metadata": {
276 |         "colab": {
277 |           "base_uri": "https://localhost:8080/"
278 |         },
279 |         "id": "s6YdYe5AnVzN",
280 |         "outputId": "e620e4b5-61b9-4602-a75b-d438a81c1b3a"
281 |       },
282 |       "outputs": [
283 |         {
284 |           "name": "stdout",
285 |           "output_type": "stream",
286 |           "text": [
287 |             "1. New York City\n",
288 |             "2. Los Angeles\n",
289 |             "3. Chicago\n"
290 |           ]
291 |         }
292 |       ],
293 |       "source": [
294 |         "output = wizard_lm.generate(\n",
295 |         "    prompt=\"Give me three American cities names\",\n",
296 |         "    progress_callback=StreamToStdout(callback_interval=2)\n",
297 |         ")"
298 |       ]
299 |     },
300 |     {
301 |       "cell_type": "code",
302 |       "execution_count": null,
303 |       "metadata": {
304 |         "colab": {
305 |           "base_uri": "https://localhost:8080/"
306 |         },
307 |         "id": "rGwiIrKonehd",
308 |         "outputId": "d836f5d1-4b50-4cea-9da1-a11b662954c0"
309 |       },
310 |       "outputs": [
311 |         {
312 |           "name": "stdout",
313 |           "output_type": "stream",
314 |           "text": [
315 |             "4. Las Vegas\n",
316 |             "5. Miami\n"
317 |           ]
318 |         }
319 |       ],
320 |       "source": [
321 |         "output = wizard_lm.generate(\n",
322 |         "    prompt=\"Give me two more\",\n",
323 |         "    progress_callback=StreamToStdout(callback_interval=2)\n",
324 |         ")"
325 |       ]
326 |     },
327 |     {
328 |       "cell_type": "code",
329 |       "execution_count": null,
330 |       "metadata": {
331 |         "id": "WUdq7Z-qpABZ"
332 |       },
333 |       "outputs": [],
334 |       "source": [
335 |         "# In Colab, for some reason we need to restart runtime by running `exit()`.\n",
336 |         "# Simply run `exit()`, then run the subsequent cells after runtime restarts.\n",
337 |         "exit()"
338 |       ]
339 |     },
340 |     {
341 |       "cell_type": "markdown",
342 |       "metadata": {
343 |         "id": "XuHZgZhY8YoB"
344 |       },
345 |       "source": [
346 |         "## Let's Solve a Math Problem with WizardMath!"
347 |       ]
348 |     },
349 |     {
350 |       "cell_type": "code",
351 |       "execution_count": null,
352 |       "metadata": {
353 |         "id": "WxnBldtMqn2Y"
354 |       },
355 |       "outputs": [],
356 |       "source": [
357 |         "from mlc_llm import ChatModule\n",
358 |         "from mlc_llm.callback import StreamToStdout"
359 |       ]
360 |     },
361 |     {
362 |       "cell_type": "code",
363 |       "execution_count": null,
364 |       "metadata": {
365 |         "id": "5z3Cx8NChz-z"
366 |       },
367 |       "outputs": [],
368 |       "source": [
369 |         "wizard_math = ChatModule(model=\"WizardMath-13B-V1.0-q4f16_1\", device=\"cuda\")"
370 |       ]
371 |     },
372 |     {
373 |       "cell_type": "code",
374 |       "execution_count": null,
375 |       "metadata": {
376 |         "colab": {
377 |           "base_uri": "https://localhost:8080/"
378 |         },
379 |         "id": "ufta9VbyqtlN",
380 |         "outputId": "196f3ca1-2837-4b48-b52a-b012b840d3a4"
381 |       },
382 |       "outputs": [
383 |         {
384 |           "name": "stdout",
385 |           "output_type": "stream",
386 |           "text": [
387 |             "\n",
388 |             "Step 1: Define the variables.\n",
389 |             "Let C represent the number of chickens and R represent the number of rabbits.\n",
390 |             "\n",
391 |             "Step 2: Write the equations based on the given information.\n",
392 |             "We know that the total number of legs is 14, so we can write the equation:\n",
393 |             "2C + 4R = 14\n",
394 |             "\n",
395 |             "We also know that there are 5 animals in total, so we can write the equation:\n",
396 |             "C + R = 5\n",
397 |             "\n",
398 |             "Step 3: Solve the system of equations.\n",
399 |             "We can solve this system of equations using the substitution method. First, we'll solve the second equation for R:\n",
400 |             "R = 5 - C\n",
401 |             "\n",
402 |             "Now, we'll substitute this expression for R into the first equation:\n",
403 |             "2C + 4(5 - C) = 14\n",
404 |             "\n",
405 |             "Step 4: Simplify and solve for C.\n",
406 |             "2C + 20 - 4C = 14\n",
407 |             "-2C = -6\n",
408 |             "C = 3\n",
409 |             "\n",
410 |             "Step 5: Find the number of rabbits.\n",
411 |             "Now that we know there are 3 chickens, we can find the number of rabbits using the equation R = 5 - C:\n",
412 |             "R = 5 - 3\n",
413 |             "R = 2\n",
414 |             "\n",
415 |             "Step 6: Provide the final answer.\n",
416 |             "There are 3 chickens and 2 rabbits, so the answer is:\n",
417 |             "C + R = 3 + 2 = 5\n",
418 |             "There are 5 animals in total, and since we found that there are 3 chickens, there must be 2 rabbits.\n",
419 |             "\n",
420 |             "The answer is: 3.\n"
421 |           ]
422 |         }
423 |       ],
424 |       "source": [
425 |         "prompt=(\n",
426 |         "    \"A chicken has 2 legs, and a rabbit has 4 legs. Given that there are 5 animals \"\n",
427 |         "    \"in total, and 14 legs in total, how many chicken are there? Show your steps.\"\n",
428 |         ")\n",
429 |         "output = wizard_math.generate(prompt, StreamToStdout(callback_interval=2))"
430 |       ]
431 |     },
432 |     {
433 |       "cell_type": "code",
434 |       "execution_count": null,
435 |       "metadata": {
436 |         "id": "tsFRIhwKrMP1"
437 |       },
438 |       "outputs": [],
439 |       "source": [
440 |         "# In Colab, for some reason we need to restart runtime by running `exit()`.\n",
441 |         "# Simply run `exit()`, then run the subsequent cells after runtime restarts.\n",
442 |         "exit()"
443 |       ]
444 |     },
445 |     {
446 |       "cell_type": "markdown",
447 |       "metadata": {
448 |         "id": "IFjaA-yC8iH5"
449 |       },
450 |       "source": [
451 |         "## Let's Solve a Leetcode with WizardCoder!\n",
452 |         "\n",
453 |         "WizardMath tends to give Markdown format output, which is really cool! We use `IPython.display` to display the output as Markdown!"
454 |       ]
455 |     },
456 |     {
457 |       "cell_type": "code",
458 |       "execution_count": 1,
459 |       "metadata": {
460 |         "id": "esuPRZQSfxYl"
461 |       },
462 |       "outputs": [],
463 |       "source": [
464 |         "from mlc_llm import ChatModule\n",
465 |         "from IPython.display import display, Markdown, Latex"
466 |       ]
467 |     },
468 |     {
469 |       "cell_type": "code",
470 |       "execution_count": null,
471 |       "metadata": {
472 |         "id": "-LqAcTW5214y"
473 |       },
474 |       "outputs": [],
475 |       "source": [
476 |         "wizard_coder = ChatModule(model=\"WizardCoder-15B-V1.0-q4f16_1\", device=\"cuda\")"
477 |       ]
478 |     },
479 |     {
480 |       "cell_type": "code",
481 |       "execution_count": 3,
482 |       "metadata": {
483 |         "colab": {
484 |           "base_uri": "https://localhost:8080/",
485 |           "height": 338
486 |         },
487 |         "id": "neZIfq8ntFxo",
488 |         "outputId": "130abead-a928-49e5-c3ff-509d2636cbdd"
489 |       },
490 |       "outputs": [
491 |         {
492 |           "data": {
493 |             "text/markdown": [
494 |               "A number is said to be a palindrome if it reads the same backward as forward. For example, 121, 444, and 999 are palindromes, while 123, 777, and 555 are not.\r\n",
495 |               "\r\n",
496 |               "Here's the Python code to determine whether a number is a palindrome:\r\n",
497 |               "\r\n",
498 |               "```python\r\n",
499 |               "num = input(\"Enter a number: \")  # take input from user\r\n",
500 |               "\r\n",
501 |               "# convert the number to a string to check if it's a palindrome\r\n",
502 |               "num_str = str(num)\r\n",
503 |               "\r\n",
504 |               "# reverse the string and compare it with the original string\r\n",
505 |               "if num_str == num_str[::-1]:\r\n",
506 |               "    print(num, \"is a palindrome\")\r\n",
507 |               "else:\r\n",
508 |               "    print(num, \"is not a palindrome\")\r\n",
509 |               "```\r\n",
510 |               "\r\n",
511 |               "In this code, we first take input from the user using the `input()` function and store it in the variable `num`. We then convert the number to a string using the `str()` function and store it in the variable `num_str`.\r\n",
512 |               "\r\n",
513 |               "We then use slicing to reverse the string and compare it with the original string. If they are the same, then the number is a palindrome. Otherwise, it's not.\r\n",
514 |               "\r\n",
515 |               "Note that we use the `[::-1]` syntax to reverse the string. This is a shorthand for slicing the string from start to end with a step of -1 (i.e. backwards)."
516 |             ],
517 |             "text/plain": [
518 |               ""
519 |             ]
520 |           },
521 |           "metadata": {},
522 |           "output_type": "display_data"
523 |         }
524 |       ],
525 |       "source": [
526 |         "prompt= \"Write a Python program that determines whether a number is a palindrome.\"\n",
527 |         "output = wizard_coder.generate(prompt=prompt)\n",
528 |         "display(Markdown(output))"
529 |       ]
530 |     },
531 |     {
532 |       "cell_type": "code",
533 |       "execution_count": 4,
534 |       "metadata": {
535 |         "colab": {
536 |           "base_uri": "https://localhost:8080/"
537 |         },
538 |         "id": "eZjgzNvPf-Qd",
539 |         "outputId": "66a7a1e5-9fd4-4bf5-d4d3-9a90fd0479aa"
540 |       },
541 |       "outputs": [
542 |         {
543 |           "name": "stdout",
544 |           "output_type": "stream",
545 |           "text": [
546 |             "A number is said to be a palindrome if it reads the same backward as forward. For example, 121, 444, and 999 are palindromes, while 123, 777, and 555 are not.\r\n",
547 |             "\r\n",
548 |             "Here's the Python code to determine whether a number is a palindrome:\r\n",
549 |             "\r\n",
550 |             "```python\r\n",
551 |             "num = input(\"Enter a number: \")  # take input from user\r\n",
552 |             "\r\n",
553 |             "# convert the number to a string to check if it's a palindrome\r\n",
554 |             "num_str = str(num)\r\n",
555 |             "\r\n",
556 |             "# reverse the string and compare it with the original string\r\n",
557 |             "if num_str == num_str[::-1]:\r\n",
558 |             "    print(num, \"is a palindrome\")\r\n",
559 |             "else:\r\n",
560 |             "    print(num, \"is not a palindrome\")\r\n",
561 |             "```\r\n",
562 |             "\r\n",
563 |             "In this code, we first take input from the user using the `input()` function and store it in the variable `num`. We then convert the number to a string using the `str()` function and store it in the variable `num_str`.\r\n",
564 |             "\r\n",
565 |             "We then use slicing to reverse the string and compare it with the original string. If they are the same, then the number is a palindrome. Otherwise, it's not.\r\n",
566 |             "\r\n",
567 |             "Note that we use the `[::-1]` syntax to reverse the string. This is a shorthand for slicing the string from start to end with a step of -1 (i.e. backwards).\n"
568 |           ]
569 |         }
570 |       ],
571 |       "source": [
572 |         "print(output)"
573 |       ]
574 |     },
575 |     {
576 |       "cell_type": "code",
577 |       "execution_count": 5,
578 |       "metadata": {
579 |         "colab": {
580 |           "base_uri": "https://localhost:8080/",
581 |           "height": 507
582 |         },
583 |         "id": "r3Mny3Sa-xiN",
584 |         "outputId": "256a41f3-cc63-4fd8-bbcd-bf12d0fe032e"
585 |       },
586 |       "outputs": [
587 |         {
588 |           "data": {
589 |             "text/markdown": [
590 |               "Here's the Java code to determine whether a number is a palindrome:\r\n",
591 |               "\r\n",
592 |               "```java\r\n",
593 |               "import java.util.Scanner;\r\n",
594 |               "\r\n",
595 |               "public class Palindrome {\r\n",
596 |               "    public static void main(String[] args) {\r\n",
597 |               "        Scanner sc = new Scanner(System.in);\r\n",
598 |               "        System.out.print(\"Enter a number: \");\r\n",
599 |               "        int num = sc.nextInt();\r\n",
600 |               "\r\n",
601 |               "        // convert the number to a string to check if it's a palindrome\r\n",
602 |               "        String numStr = Integer.toString(num);\r\n",
603 |               "\r\n",
604 |               "        // reverse the string and compare it with the original string\r\n",
605 |               "        if (numStr.equals(new StringBuilder(numStr).reverse().toString())) {\r\n",
606 |               "            System.out.println(num + \" is a palindrome\");\r\n",
607 |               "        } else {\r\n",
608 |               "            System.out.println(num + \" is not a palindrome\");\r\n",
609 |               "        }\r\n",
610 |               "    }\r\n",
611 |               "}\r\n",
612 |               "```\r\n",
613 |               "\r\n",
614 |               "In this code, we first use the `Scanner` class to take input from the user. We then convert the number to a string using the `Integer.toString()` method and store it in the variable `numStr`.\r\n",
615 |               "\r\n",
616 |               "We then use the `StringBuilder` class to reverse the string and compare it with the original string. If they are the same, then the number is a palindrome. Otherwise, it's not.\r\n",
617 |               "\r\n",
618 |               "Note that we use the `new StringBuilder(numStr).reverse().toString()` syntax to reverse the string. This is a shorthand for creating a new `StringBuilder` object with the `numStr` string, reversing it, and then converting it back to a string using the `toString()` method."
619 |             ],
620 |             "text/plain": [
621 |               ""
622 |             ]
623 |           },
624 |           "metadata": {},
625 |           "output_type": "display_data"
626 |         }
627 |       ],
628 |       "source": [
629 |         "output = wizard_coder.generate(prompt=\"Write this in Java instead.\")\n",
630 |         "display(Markdown(output))"
631 |       ]
632 |     },
633 |     {
634 |       "cell_type": "code",
635 |       "execution_count": 6,
636 |       "metadata": {
637 |         "colab": {
638 |           "base_uri": "https://localhost:8080/"
639 |         },
640 |         "id": "PWnD4C6dI6-X",
641 |         "outputId": "286a84e8-2507-4fbd-c765-45eef888ee3e"
642 |       },
643 |       "outputs": [
644 |         {
645 |           "name": "stdout",
646 |           "output_type": "stream",
647 |           "text": [
648 |             "Here's the Java code to determine whether a number is a palindrome:\r\n",
649 |             "\r\n",
650 |             "```java\r\n",
651 |             "import java.util.Scanner;\r\n",
652 |             "\r\n",
653 |             "public class Palindrome {\r\n",
654 |             "    public static void main(String[] args) {\r\n",
655 |             "        Scanner sc = new Scanner(System.in);\r\n",
656 |             "        System.out.print(\"Enter a number: \");\r\n",
657 |             "        int num = sc.nextInt();\r\n",
658 |             "\r\n",
659 |             "        // convert the number to a string to check if it's a palindrome\r\n",
660 |             "        String numStr = Integer.toString(num);\r\n",
661 |             "\r\n",
662 |             "        // reverse the string and compare it with the original string\r\n",
663 |             "        if (numStr.equals(new StringBuilder(numStr).reverse().toString())) {\r\n",
664 |             "            System.out.println(num + \" is a palindrome\");\r\n",
665 |             "        } else {\r\n",
666 |             "            System.out.println(num + \" is not a palindrome\");\r\n",
667 |             "        }\r\n",
668 |             "    }\r\n",
669 |             "}\r\n",
670 |             "```\r\n",
671 |             "\r\n",
672 |             "In this code, we first use the `Scanner` class to take input from the user. We then convert the number to a string using the `Integer.toString()` method and store it in the variable `numStr`.\r\n",
673 |             "\r\n",
674 |             "We then use the `StringBuilder` class to reverse the string and compare it with the original string. If they are the same, then the number is a palindrome. Otherwise, it's not.\r\n",
675 |             "\r\n",
676 |             "Note that we use the `new StringBuilder(numStr).reverse().toString()` syntax to reverse the string. This is a shorthand for creating a new `StringBuilder` object with the `numStr` string, reversing it, and then converting it back to a string using the `toString()` method.\n"
677 |           ]
678 |         }
679 |       ],
680 |       "source": [
681 |         "print(output)"
682 |       ]
683 |     }
684 |   ],
685 |   "metadata": {
686 |     "accelerator": "GPU",
687 |     "colab": {
688 |       "gpuType": "T4",
689 |       "provenance": []
690 |     },
691 |     "kernelspec": {
692 |       "display_name": "Python 3",
693 |       "name": "python3"
694 |     },
695 |     "language_info": {
696 |       "name": "python"
697 |     }
698 |   },
699 |   "nbformat": 4,
700 |   "nbformat_minor": 0
701 | }
702 | 


--------------------------------------------------------------------------------
/mlc-llm/models/demo_gemma.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "cells": [
  3 |     {
  4 |       "cell_type": "markdown",
  5 |       "metadata": {
  6 |         "id": "uLLHBhZ_KVqE"
  7 |       },
  8 |       "source": [
  9 |         "# Demo: Gemma with MLC LLM\n",
 10 |         "\n",
 11 |         "Google recently release Gemma: https://blog.google/technology/developers/gemma-open-models/.\n",
 12 |         "\n",
 13 |         "This notebook demonstrates how to use the model with MLC LLM: https://llm.mlc.ai/.\n",
 14 |         "\n",
 15 |         "For the easiest setup, we recommend trying this out in a Google Colab notebook. Click the button below to get started!\n",
 16 |         "\n",
 17 |         "\n",
 18 |         "  \"Open\n",
 19 |         ""
 20 |       ]
 21 |     },
 22 |     {
 23 |       "cell_type": "markdown",
 24 |       "metadata": {
 25 |         "id": "Vu8opC0QMOZf"
 26 |       },
 27 |       "source": [
 28 |         "## Environment Setup\n",
 29 |         "\n",
 30 |         "Let's set up your environment, so you can successfully run the `ChatModule`. First, let's set up the Conda environment which we will be running this notebook in (not required if running in Google Colab).\n",
 31 |         "\n",
 32 |         "```bash\n",
 33 |         "conda create --name mlc-llm python=3.11\n",
 34 |         "conda activate mlc-llm\n",
 35 |         "```\n",
 36 |         "\n",
 37 |         "**Google Colab:** If you are running this in a Google Colab notebook, be sure to change your runtime to GPU by going to Runtime > Change runtime type and setting the Hardware accelerator to be \"GPU\". Select \"Connect\" on the top right to instantiate your GPU session.\n",
 38 |         "\n",
 39 |         "If you are using CUDA, you can run the following command to confirm that CUDA is set up correctly, and check the version number."
 40 |       ]
 41 |     },
 42 |     {
 43 |       "cell_type": "code",
 44 |       "execution_count": 1,
 45 |       "metadata": {
 46 |         "colab": {
 47 |           "base_uri": "https://localhost:8080/"
 48 |         },
 49 |         "id": "o7vvnPntdgun",
 50 |         "outputId": "fb05a739-0a5a-4447-b21a-bf21d5cfb537"
 51 |       },
 52 |       "outputs": [
 53 |         {
 54 |           "name": "stdout",
 55 |           "output_type": "stream",
 56 |           "text": [
 57 |             "Fri Feb 23 18:19:58 2024       \n",
 58 |             "+---------------------------------------------------------------------------------------+\n",
 59 |             "| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |\n",
 60 |             "|-----------------------------------------+----------------------+----------------------+\n",
 61 |             "| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |\n",
 62 |             "| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |\n",
 63 |             "|                                         |                      |               MIG M. |\n",
 64 |             "|=========================================+======================+======================|\n",
 65 |             "|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |\n",
 66 |             "| N/A   46C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |\n",
 67 |             "|                                         |                      |                  N/A |\n",
 68 |             "+-----------------------------------------+----------------------+----------------------+\n",
 69 |             "                                                                                         \n",
 70 |             "+---------------------------------------------------------------------------------------+\n",
 71 |             "| Processes:                                                                            |\n",
 72 |             "|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |\n",
 73 |             "|        ID   ID                                                             Usage      |\n",
 74 |             "|=======================================================================================|\n",
 75 |             "|  No running processes found                                                           |\n",
 76 |             "+---------------------------------------------------------------------------------------+\n"
 77 |           ]
 78 |         }
 79 |       ],
 80 |       "source": [
 81 |         "!nvidia-smi"
 82 |       ]
 83 |     },
 84 |     {
 85 |       "cell_type": "markdown",
 86 |       "metadata": {
 87 |         "id": "qZfQBQExMV-f"
 88 |       },
 89 |       "source": [
 90 |         "Next, let's download the MLC-AI and mlc-llm nightly build packages. Go to https://mlc.ai/package/ and replace the command below with the one that is appropriate for your hardware and OS."
 91 |       ]
 92 |     },
 93 |     {
 94 |       "cell_type": "code",
 95 |       "execution_count": null,
 96 |       "metadata": {
 97 |         "id": "yiPuqenodnB8"
 98 |       },
 99 |       "outputs": [],
100 |       "source": [
101 |         "!pip install --pre mlc-ai-nightly-cu122 mlc-llm-nightly-cu122 -f https://mlc.ai/wheels"
102 |       ]
103 |     },
104 |     {
105 |       "cell_type": "markdown",
106 |       "metadata": {
107 |         "id": "qtRRsOPHM3SE"
108 |       },
109 |       "source": [
110 |         "**Google Colab**: If you are using Colab, you may see the red warnings such as \"You must restart the runtime in order to use newly installed versions.\" For our purpose, simply restart session, and run the next cell after restart.\n",
111 |         "\n",
112 |         "Let's confirm we have installed the packages successfully!"
113 |       ]
114 |     },
115 |     {
116 |       "cell_type": "code",
117 |       "execution_count": null,
118 |       "metadata": {
119 |         "colab": {
120 |           "base_uri": "https://localhost:8080/"
121 |         },
122 |         "id": "ktNZi8B6M4Md",
123 |         "outputId": "908711b7-eaa8-4a1a-88a8-147cb32c58c9"
124 |       },
125 |       "outputs": [],
126 |       "source": [
127 |         "!python -c \"import tvm; print('tvm installed properly!')\"\n",
128 |         "!python -c \"import mlc_llm; print('mlc_llm installed properly!')\""
129 |       ]
130 |     },
131 |     {
132 |       "cell_type": "markdown",
133 |       "metadata": {
134 |         "id": "EIEtjOVvM9LJ"
135 |       },
136 |       "source": [
137 |         "## Running Gemma with MLC-LLM\n",
138 |         "\n",
139 |         "Then we can clone gemma weights converted to MLC format from huggingface.\n",
140 |         "\n",
141 |         "This is the only thing you need. Afterwards, our JIT (just-in-time) compilation will take care of everything for you!\n",
142 |         "\n",
143 |         "First time running may require more time as we need to compile the model. But afterwards we cache it to `/pathto/.cache/mlc_llm/`, so future runs are faster.\n",
144 |         "\n",
145 |         "Alternatively, you could also use the following\n",
146 |         "\n",
147 |         "```python\n",
148 |         "!python -m mlc_llm compile gemma-7b-it-q4f16_2-MLC -o gemma-7b-it-q4f16_2-q4f16_2-cuda.so\n",
149 |         "\n",
150 |         "cm = ChatModule(\"./gemma-7b-it-q4f16_2-MLC\", model_lib_path=\"gemma-7b-it-q4f16_2-q4f16_2-cuda.so\")\n",
151 |         "```"
152 |       ]
153 |     },
154 |     {
155 |       "cell_type": "code",
156 |       "execution_count": 4,
157 |       "metadata": {
158 |         "colab": {
159 |           "base_uri": "https://localhost:8080/"
160 |         },
161 |         "id": "MHsFX5cwNZQN",
162 |         "outputId": "e5ab61f0-37b6-46a8-9ff5-d5782be6495f"
163 |       },
164 |       "outputs": [
165 |         {
166 |           "name": "stdout",
167 |           "output_type": "stream",
168 |           "text": [
169 |             "Git LFS initialized.\n"
170 |           ]
171 |         }
172 |       ],
173 |       "source": [
174 |         "!git lfs install"
175 |       ]
176 |     },
177 |     {
178 |       "cell_type": "code",
179 |       "execution_count": 5,
180 |       "metadata": {
181 |         "colab": {
182 |           "base_uri": "https://localhost:8080/"
183 |         },
184 |         "id": "isA1NfGFNadt",
185 |         "outputId": "26b1a8a6-bcf4-4b8b-a0eb-42e6816d1773"
186 |       },
187 |       "outputs": [
188 |         {
189 |           "name": "stdout",
190 |           "output_type": "stream",
191 |           "text": [
192 |             "Cloning into 'gemma-7b-it-q4f16_2-MLC'...\n",
193 |             "remote: Enumerating objects: 113, done.\u001b[K\n",
194 |             "remote: Counting objects: 100% (110/110), done.\u001b[K\n",
195 |             "remote: Compressing objects: 100% (110/110), done.\u001b[K\n",
196 |             "remote: Total 113 (delta 0), reused 0 (delta 0), pack-reused 3\u001b[K\n",
197 |             "Receiving objects: 100% (113/113), 33.40 KiB | 6.68 MiB/s, done.\n",
198 |             "Filtering content: 100% (103/103), 5.54 GiB | 62.53 MiB/s, done.\n"
199 |           ]
200 |         }
201 |       ],
202 |       "source": [
203 |         "# This is gemma 7b with 4-bit quantization\n",
204 |         "# Any other quantizations/models have the same steps: https://huggingface.co/mlc-ai\n",
205 |         "!git clone https://huggingface.co/mlc-ai/gemma-7b-it-q4f16_2-MLC"
206 |       ]
207 |     },
208 |     {
209 |       "cell_type": "code",
210 |       "execution_count": 6,
211 |       "metadata": {
212 |         "id": "MbxdMhcgfGvk"
213 |       },
214 |       "outputs": [],
215 |       "source": [
216 |         "from mlc_llm import ChatModule\n",
217 |         "from mlc_llm.callback import StreamToStdout"
218 |       ]
219 |     },
220 |     {
221 |       "cell_type": "code",
222 |       "execution_count": 7,
223 |       "metadata": {
224 |         "id": "QAjm3lTJmsiy"
225 |       },
226 |       "outputs": [],
227 |       "source": [
228 |         "cm = ChatModule(\"./gemma-7b-it-q4f16_2-MLC\")"
229 |       ]
230 |     },
231 |     {
232 |       "cell_type": "code",
233 |       "execution_count": 8,
234 |       "metadata": {
235 |         "colab": {
236 |           "base_uri": "https://localhost:8080/"
237 |         },
238 |         "id": "BEHtDLG9nTx1",
239 |         "outputId": "1c7a3037-afd5-406e-ca98-6282a14b2719"
240 |       },
241 |       "outputs": [
242 |         {
243 |           "name": "stdout",
244 |           "output_type": "stream",
245 |           "text": [
246 |             "Sure, here's a quick overview of five states in the US:\n",
247 |             "\n",
248 |             "**1. California:**\n",
249 |             "- Capital: Sacramento\n",
250 |             "- Largest city: Los Angeles\n",
251 |             "- Known for: Golden Gate Bridge, Hollywood, Silicon Valley, and its diverse population.\n",
252 |             "\n",
253 |             "**2. New York:**\n",
254 |             "- Capital: Albany\n",
255 |             "- Largest city: New York City\n",
256 |             "- Known for: Empire State Building, Times Square, Niagara Falls, and its rich history.\n",
257 |             "\n",
258 |             "**3. Texas:**\n",
259 |             "- Capital: Austin\n",
260 |             "- Largest city: Dallas\n",
261 |             "- Known for: Its large size, diverse culture, and its strong economy.\n",
262 |             "\n",
263 |             "**4. Florida:**\n",
264 |             "- Capital: Tallahassee\n",
265 |             "- Largest city: Jacksonville\n",
266 |             "- Known for: Its beautiful beaches, warm climate, and its history as a major naval power.\n",
267 |             "\n",
268 |             "**5. Alaska:**\n",
269 |             "- Capital: Juneau\n",
270 |             "- Largest city: Anchorage\n",
271 |             "- Known for: Its breathtaking natural beauty, including towering mountains, glaciers, and fjords.\n"
272 |           ]
273 |         }
274 |       ],
275 |       "source": [
276 |         "output = cm.generate(\n",
277 |         "    prompt=\"Tell me about 5 states in the US\",\n",
278 |         "    progress_callback=StreamToStdout(callback_interval=2),\n",
279 |         ")"
280 |       ]
281 |     },
282 |     {
283 |       "cell_type": "code",
284 |       "execution_count": 9,
285 |       "metadata": {
286 |         "colab": {
287 |           "base_uri": "https://localhost:8080/"
288 |         },
289 |         "id": "ElwvKxHSQZe-",
290 |         "outputId": "0709daf6-623d-42dd-c790-9b330afc035e"
291 |       },
292 |       "outputs": [
293 |         {
294 |           "name": "stdout",
295 |           "output_type": "stream",
296 |           "text": [
297 |             "**Sure, here are two more states:**\n",
298 |             "\n",
299 |             "**6. Nevada:**\n",
300 |             "- Capital: Carson City\n",
301 |             "- Largest city: Las Vegas\n",
302 |             "- Known for: Its casinos, its desert landscapes, and its history as a frontier town.\n",
303 |             "\n",
304 |             "**7. Idaho:**\n",
305 |             "- Capital: Boise\n",
306 |             "- Largest city: Boise\n",
307 |             "- Known for: Its scenic mountains, its salmon fishing, and its rich Native American heritage.\n"
308 |           ]
309 |         }
310 |       ],
311 |       "source": [
312 |         "output = cm.generate(\n",
313 |         "    prompt=\"Two more please\",\n",
314 |         "    progress_callback=StreamToStdout(callback_interval=2),\n",
315 |         ")"
316 |       ]
317 |     },
318 |     {
319 |       "cell_type": "code",
320 |       "execution_count": null,
321 |       "metadata": {
322 |         "id": "hvbzb39ZrAVO"
323 |       },
324 |       "outputs": [],
325 |       "source": [
326 |         "cm.reset_chat()"
327 |       ]
328 |     }
329 |   ],
330 |   "metadata": {
331 |     "accelerator": "GPU",
332 |     "colab": {
333 |       "gpuType": "T4",
334 |       "provenance": []
335 |     },
336 |     "kernelspec": {
337 |       "display_name": "Python 3",
338 |       "name": "python3"
339 |     },
340 |     "language_info": {
341 |       "name": "python",
342 |       "version": "3.11.6"
343 |     }
344 |   },
345 |   "nbformat": 4,
346 |   "nbformat_minor": 0
347 | }
348 | 


--------------------------------------------------------------------------------
/mlc-llm/tutorial_chat_module_getting_started.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "cells": [
  3 |     {
  4 |       "cell_type": "markdown",
  5 |       "metadata": {
  6 |         "id": "Cm85Ap3zDmYB"
  7 |       },
  8 |       "source": [
  9 |         "# Getting Started with MLC-LLM using the Llama 2 Model\n",
 10 |         "\n",
 11 |         "Here's a quick overview of how to get started with the MLC-LLM `ChatModule` in Python. In this tutorial, we will chat with the [Llama2](https://ai.meta.com/llama/) model. For the easiest setup, we recommend trying this out in a Google Colab notebook. Click the button below to get started!\n",
 12 |         "\n",
 13 |         "\n",
 14 |         "  \"Open\n",
 15 |         ""
 16 |       ]
 17 |     },
 18 |     {
 19 |       "cell_type": "markdown",
 20 |       "metadata": {
 21 |         "id": "1ttPt-hNDmYC"
 22 |       },
 23 |       "source": [
 24 |         "## Environment Setup\n",
 25 |         "\n",
 26 |         "Let's set up your environment, so you can successfully run the `ChatModule`. First, let's set up the Conda environment which we will be running this notebook in (not required if running in Google Colab).\n",
 27 |         "\n",
 28 |         "```bash\n",
 29 |         "conda create --name mlc-llm python=3.10\n",
 30 |         "conda activate mlc-llm\n",
 31 |         "```\n",
 32 |         "\n",
 33 |         "**Google Colab:** If you are running this in a Google Colab notebook, be sure to change your runtime to GPU by going to Runtime > Change runtime type and setting the Hardware accelerator to be \"GPU\". Select \"Connect\" on the top right to instantiate your GPU session.\n",
 34 |         "\n",
 35 |         "If you are using CUDA, you can run the following command to confirm that CUDA is set up correctly, and check the version number."
 36 |       ]
 37 |     },
 38 |     {
 39 |       "cell_type": "code",
 40 |       "execution_count": null,
 41 |       "metadata": {
 42 |         "id": "KK25HZsIDmYC"
 43 |       },
 44 |       "outputs": [],
 45 |       "source": [
 46 |         "!nvidia-smi"
 47 |       ]
 48 |     },
 49 |     {
 50 |       "cell_type": "markdown",
 51 |       "metadata": {
 52 |         "id": "EWOtpjJMDmYE"
 53 |       },
 54 |       "source": [
 55 |         "Next, let's download the MLC-AI and mlc-llm nightly build packages. Go to https://mlc.ai/package/ and replace the command below with the one that is appropriate for your hardware and OS.\n",
 56 |         "\n",
 57 |         "**Google Colab**: If you are using Colab, you may see the red warnings such as \"You must restart the runtime in order to use newly installed versions.\" For our purpose, we can disregard them, the notebook will still run correctly."
 58 |       ]
 59 |     },
 60 |     {
 61 |       "cell_type": "code",
 62 |       "execution_count": null,
 63 |       "metadata": {
 64 |         "id": "PgW-5OAADmYE"
 65 |       },
 66 |       "outputs": [],
 67 |       "source": [
 68 |         "!pip install --pre --force-reinstall mlc-ai-nightly-cu118 mlc-llm-nightly-cu118 -f https://mlc.ai/wheels"
 69 |       ]
 70 |     },
 71 |     {
 72 |       "cell_type": "markdown",
 73 |       "metadata": {
 74 |         "id": "FwsWd1WbDmYE"
 75 |       },
 76 |       "source": [
 77 |         "Next, let's download the model weights for the Llama2 model and the prebuilt model libraries from Github. In order to download the large weights, we'll have to use `git lfs`."
 78 |       ]
 79 |     },
 80 |     {
 81 |       "cell_type": "markdown",
 82 |       "metadata": {
 83 |         "id": "ppvAhErV3gjq"
 84 |       },
 85 |       "source": [
 86 |         "Note: If you are NOT running in **Google Colab** you may need to run this line `!conda install git git-lfs` to install `git` and `git-lfs` before running the following cell to fully install `git lfs`."
 87 |       ]
 88 |     },
 89 |     {
 90 |       "cell_type": "code",
 91 |       "execution_count": null,
 92 |       "metadata": {
 93 |         "id": "V0GjINnMDmYF"
 94 |       },
 95 |       "outputs": [],
 96 |       "source": [
 97 |         "!git lfs install"
 98 |       ]
 99 |     },
100 |     {
101 |       "cell_type": "markdown",
102 |       "metadata": {
103 |         "id": "yYwjsCOK7Jij"
104 |       },
105 |       "source": [
106 |         "These commands will download many prebuilt libraries as well as the chat configuration for Llama-2-7b that `mlc_llm` needs, which may take a long time. If in **Google Colab** you can verify that the files are being downloaded by clicking on the folder icon on the left and navigating to the `dist` and then `prebuilt` folders which should be updating as the files are being downloaded."
107 |       ]
108 |     },
109 |     {
110 |       "cell_type": "code",
111 |       "execution_count": null,
112 |       "metadata": {
113 |         "id": "FSAe7Ew_DmYF"
114 |       },
115 |       "outputs": [],
116 |       "source": [
117 |         "!mkdir -p dist\n",
118 |         "!git clone https://github.com/mlc-ai/binary-mlc-llm-libs.git dist/prebuilt_libs"
119 |       ]
120 |     },
121 |     {
122 |       "cell_type": "code",
123 |       "execution_count": null,
124 |       "metadata": {
125 |         "id": "BDbi6H3MDmYF"
126 |       },
127 |       "outputs": [],
128 |       "source": [
129 |         "!cd dist && git clone https://huggingface.co/mlc-ai/Llama-2-7b-chat-hf-q4f16_1-MLC"
130 |       ]
131 |     },
132 |     {
133 |       "cell_type": "code",
134 |       "execution_count": null,
135 |       "metadata": {},
136 |       "outputs": [],
137 |       "source": [
138 |         "# Need to restart runtime since notebooks cannot find the module right after installing\n",
139 |         "# Simply run this cell, then run the next cells after runtime finishes restarting\n",
140 |         "exit()"
141 |       ]
142 |     },
143 |     {
144 |       "cell_type": "markdown",
145 |       "metadata": {
146 |         "id": "76Ru5__tDmYF"
147 |       },
148 |       "source": [
149 |         "## Let's Chat!\n",
150 |         "\n",
151 |         "Before we can chat with the model, we must first import a library and instantiate a `ChatModule` instance. The `ChatModule` must be initialized with the appropriate model name."
152 |       ]
153 |     },
154 |     {
155 |       "cell_type": "code",
156 |       "execution_count": null,
157 |       "metadata": {
158 |         "id": "AJAt6oW7DmYF"
159 |       },
160 |       "outputs": [],
161 |       "source": [
162 |         "from mlc_llm import ChatModule\n",
163 |         "from mlc_llm.callback import StreamToStdout\n",
164 |         "\n",
165 |         "cm = ChatModule(\n",
166 |         "   model=\"dist/Llama-2-7b-chat-hf-q4f16_1-MLC\",\n",
167 |         "   model_lib_path=\"dist/prebuilt_libs/Llama-2-7b-chat-hf/Llama-2-7b-chat-hf-q4f16_1-cuda.so\"\n",
168 |         ")"
169 |       ]
170 |     },
171 |     {
172 |       "cell_type": "markdown",
173 |       "metadata": {
174 |         "id": "c9m5sxyXDmYF"
175 |       },
176 |       "source": [
177 |         "For other platforms/backends, change the file in `model_lib_path` to:\n",
178 |         "\n",
179 |         "- Vulkan on Linux: `Llama-2-7b-chat-hf-q4f16_1-vulkan.so`\n",
180 |         "- Metal on macOS: `Llama-2-7b-chat-hf-q4f16_1-metal.so`\n",
181 |         "- Other platforms: `Llama-2-7b-chat-hf-q4f16_1-{backend}.{suffix}`"
182 |       ]
183 |     },
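    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "For example, a minimal sketch of the macOS/Metal setup, assuming the same `dist` layout as above and that the Metal library file is present:\n",
        "\n",
        "```python\n",
        "cm = ChatModule(\n",
        "   model=\"dist/Llama-2-7b-chat-hf-q4f16_1-MLC\",\n",
        "   model_lib_path=\"dist/prebuilt_libs/Llama-2-7b-chat-hf/Llama-2-7b-chat-hf-q4f16_1-metal.so\"\n",
        ")\n",
        "```"
      ]
    },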
184 |     {
185 |       "cell_type": "markdown",
186 |       "metadata": {
187 |         "id": "zEaVXnnJDmYF"
188 |       },
189 |       "source": [
190 |         "That is all what needed to set up the `ChatModule`. You can now chat with the model by entering any prompt you'd like. Try it out below!"
191 |       ]
192 |     },
193 |     {
194 |       "cell_type": "code",
195 |       "execution_count": null,
196 |       "metadata": {
197 |         "id": "TNmg9N_NDmYF"
198 |       },
199 |       "outputs": [],
200 |       "source": [
201 |         "output = cm.generate(\n",
202 |         "    prompt=\"When was Python released?\",\n",
203 |         "    progress_callback=StreamToStdout(callback_interval=2),\n",
204 |         ")"
205 |       ]
206 |     },
207 |     {
208 |       "cell_type": "markdown",
209 |       "metadata": {},
210 |       "source": [
211 |         "You can also repeat running the code block below for multiple rounds to interact with the model in a chat style."
212 |       ]
213 |     },
214 |     {
215 |       "cell_type": "code",
216 |       "execution_count": null,
217 |       "metadata": {},
218 |       "outputs": [],
219 |       "source": [
220 |         "prompt = input(\"Prompt: \")\n",
221 |         "output = cm.generate(prompt=prompt, progress_callback=StreamToStdout(callback_interval=2))"
222 |       ]
223 |     },
224 |     {
225 |       "cell_type": "code",
226 |       "execution_count": null,
227 |       "metadata": {},
228 |       "outputs": [],
229 |       "source": [
230 |         "output = cm.generate(\n",
231 |         "    prompt=\"Please summarize your response in three sentences.\",\n",
232 |         "    progress_callback=StreamToStdout(callback_interval=2),\n",
233 |         ")"
234 |       ]
235 |     },
236 |     {
237 |       "cell_type": "markdown",
238 |       "metadata": {
239 |         "id": "I4bOyUk7DmYF"
240 |       },
241 |       "source": [
242 |         "To check the generation speed of the chat bot, you can print the statistics."
243 |       ]
244 |     },
245 |     {
246 |       "cell_type": "code",
247 |       "execution_count": null,
248 |       "metadata": {
249 |         "id": "PPbPj6vpDmYF"
250 |       },
251 |       "outputs": [],
252 |       "source": [
253 |         "print(cm.stats())"
254 |       ]
255 |     },
256 |     {
257 |       "cell_type": "markdown",
258 |       "metadata": {
259 |         "id": "XAb-XZPnDmYF"
260 |       },
261 |       "source": [
262 |         "By default, the `ChatModule` will keep a history of your chat. You can reset the chat history by running the following."
263 |       ]
264 |     },
265 |     {
266 |       "cell_type": "code",
267 |       "execution_count": null,
268 |       "metadata": {
269 |         "id": "iKpKgVxNDmYF"
270 |       },
271 |       "outputs": [],
272 |       "source": [
273 |         "cm.reset_chat()"
274 |       ]
275 |     },
276 |     {
277 |       "cell_type": "markdown",
278 |       "metadata": {},
279 |       "source": [
280 |         "### Benchmark Performance\n",
281 |         "\n",
282 |         "To benchmark the performance, we can use the `benchmark_generate` method of ChatModule. It takes an input prompt and the number of tokens to generate, ignores the system prompt and model stop criterion, generates tokens in a language model way and stops until finishing generating the desired number of tokens. After calling `benchmark_generate`, we can use `stats` to check the performance."
283 |       ]
284 |     },
285 |     {
286 |       "cell_type": "code",
287 |       "execution_count": null,
288 |       "metadata": {},
289 |       "outputs": [],
290 |       "source": [
291 |         "print(cm.benchmark_generate(prompt=\"What is benchmark?\", generate_length=512))\n",
292 |         "cm.stats()"
293 |       ]
294 |     }
295 |   ],
296 |   "metadata": {
297 |     "accelerator": "GPU",
298 |     "colab": {
299 |       "gpuType": "T4",
300 |       "provenance": []
301 |     },
302 |     "kernelspec": {
303 |       "display_name": "Python 3",
304 |       "name": "python3"
305 |     },
306 |     "language_info": {
307 |       "codemirror_mode": {
308 |         "name": "ipython",
309 |         "version": 3
310 |       },
311 |       "file_extension": ".py",
312 |       "mimetype": "text/x-python",
313 |       "name": "python",
314 |       "nbconvert_exporter": "python",
315 |       "pygments_lexer": "ipython3",
316 |       "version": "3.10.8"
317 |     },
318 |     "orig_nbformat": 4
319 |   },
320 |   "nbformat": 4,
321 |   "nbformat_minor": 0
322 | }
323 | 


--------------------------------------------------------------------------------
/mlc-llm/tutorial_mlc_xgrammar_structured_generation.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "cells": [
  3 |     {
  4 |       "cell_type": "markdown",
  5 |       "metadata": {
  6 |         "id": "4IwhlCjVtpYj"
  7 |       },
  8 |       "source": [
  9 |         "# MLC-LLM Structured Generation with XGrammar\n",
 10 |         "\n",
 11 |         "Here's a quick overview of how to generate structured text with XGrammar in MLC LLM in Python.\n",
 12 |         "In this tutorial, we will be chatting with the Llama3.2 model.\n",
 13 |         "For the easiest setup, we recommend trying this out in a Google Colab notebook. Click the button below to get started!\n",
 14 |         "\n",
 15 |         "Structured generation of LLMs greatly improves the abilities of LLMs,\n",
 16 |         "going beyond the basic chat or plain text generation.\n",
 17 |         "With controllable structured generation, LLMs become able to serve as standard tools and can be better integrated into other applications in production.\n",
 18 |         "MLCEngine offers state-of-the-art structured generation with XGrammar integration.\n",
 19 |         "Importantly, the structured generation support is built into the engine, which means it can be used across all the API platforms that MLCEngine supports.\n",
 20 |         "\n",
 21 |         "Learn more about\n",
 22 |         "* MLC LLM: https://mlc.ai/mlc-llm/docs.\n",
 23 |         "* XGrammar: https://xgrammar.mlc.ai/docs"
 24 |       ]
 25 |     },
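    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "As a rough preview of what this looks like (a sketch only; the model id and prompt below are illustrative assumptions, and the runnable walkthrough follows after the installation steps), MLCEngine exposes an OpenAI-style API in which `response_format` constrains the output:\n",
        "\n",
        "```python\n",
        "from mlc_llm import MLCEngine\n",
        "\n",
        "model = \"HF://mlc-ai/Llama-3.2-1B-Instruct-q4f16_1-MLC\"  # illustrative model id\n",
        "engine = MLCEngine(model)\n",
        "\n",
        "response = engine.chat.completions.create(\n",
        "    messages=[{\"role\": \"user\", \"content\": \"Introduce Pittsburgh in JSON format.\"}],\n",
        "    model=model,\n",
        "    response_format={\"type\": \"json_object\"},  # XGrammar constrains decoding to valid JSON\n",
        ")\n",
        "print(response.choices[0].message.content)\n",
        "engine.terminate()\n",
        "```"
      ]
    },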
 26 |     {
 27 |       "cell_type": "markdown",
 28 |       "metadata": {
 29 |         "id": "YsvAL7SSt9Lo"
 30 |       },
 31 |       "source": [
 32 |         "Click the button below to get started!\n",
 33 |         "\n",
 34 |         "\n",
 35 |         "  \"Open\n",
 36 |         ""
 37 |       ]
 38 |     },
 39 |     {
 40 |       "cell_type": "markdown",
 41 |       "metadata": {
 42 |         "id": "8kkADAMCCLi-"
 43 |       },
 44 |       "source": [
 45 |         "## Install MLC LLM"
 46 |       ]
 47 |     },
 48 |     {
 49 |       "cell_type": "markdown",
 50 |       "metadata": {
 51 |         "id": "Y2EwuS6TCO61"
 52 |       },
 53 |       "source": [
 54 |         "We will start from setting up the environment. First, let us create a new Conda environment, in which we will run the rest of the notebook.\n",
 55 |         "\n",
 56 |         "```\n",
 57 |         "conda create --name mlc-llm python=3.11\n",
 58 |         "conda activate mlc-llm\n",
 59 |         "```"
 60 |       ]
 61 |     },
 62 |     {
 63 |       "cell_type": "markdown",
 64 |       "metadata": {
 65 |         "id": "ojEeEmsqCTPG"
 66 |       },
 67 |       "source": [
 68 |         "**Google Colab**\n",
 69 |         "\n",
 70 |         "- If you are running this in a Google Colab notebook, you would not need to create a conda environment.\n",
 71 |         "- However, be sure to change your runtime to GPU by going to `Runtime` > `Change runtime type` and setting the Hardware accelerator to be \"GPU\"."
 72 |       ]
 73 |     },
 74 |     {
 75 |       "cell_type": "markdown",
 76 |       "metadata": {
 77 |         "id": "S_rX53bGChPn"
 78 |       },
 79 |       "source": [
 80 |         "If you are using CUDA, you can run the following command to confirm that CUDA is set up correctly, and check the driver version number as well as what GPUs are currently available for use."
 81 |       ]
 82 |     },
 83 |     {
 84 |       "cell_type": "code",
 85 |       "execution_count": 1,
 86 |       "metadata": {
 87 |         "id": "CRPeCflbCij6",
 88 |         "colab": {
 89 |           "base_uri": "https://localhost:8080/"
 90 |         },
 91 |         "outputId": "e661ee68-a30c-4800-fc30-fef1d85d557d"
 92 |       },
 93 |       "outputs": [
 94 |         {
 95 |           "output_type": "stream",
 96 |           "name": "stdout",
 97 |           "text": [
 98 |             "Fri Nov 22 01:44:26 2024       \n",
 99 |             "+---------------------------------------------------------------------------------------+\n",
100 |             "| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |\n",
101 |             "|-----------------------------------------+----------------------+----------------------+\n",
102 |             "| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |\n",
103 |             "| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |\n",
104 |             "|                                         |                      |               MIG M. |\n",
105 |             "|=========================================+======================+======================|\n",
106 |             "|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |\n",
107 |             "| N/A   61C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |\n",
108 |             "|                                         |                      |                  N/A |\n",
109 |             "+-----------------------------------------+----------------------+----------------------+\n",
110 |             "                                                                                         \n",
111 |             "+---------------------------------------------------------------------------------------+\n",
112 |             "| Processes:                                                                            |\n",
113 |             "|  GPU   GI   CI        PID   Type   Process name                            GPU Memory |\n",
114 |             "|        ID   ID                                                             Usage      |\n",
115 |             "|=======================================================================================|\n",
116 |             "|  No running processes found                                                           |\n",
117 |             "+---------------------------------------------------------------------------------------+\n"
118 |           ]
119 |         }
120 |       ],
121 |       "source": [
122 |         "!nvidia-smi"
123 |       ]
124 |     },
125 |     {
126 |       "cell_type": "markdown",
127 |       "metadata": {
128 |         "id": "PQfVfTAYC1M-"
129 |       },
130 |       "source": [
131 |         "Next, let's download the MLC-AI and mlc-llm nightly build packages. If you are running in a Colab environment, then you can just run the following command. Otherwise, go to https://llm.mlc.ai/docs/install/mlc_llm.html and replace the command below with the one that is appropriate for your hardware and OS."
132 |       ]
133 |     },
134 |     {
135 |       "cell_type": "markdown",
136 |       "metadata": {
137 |         "id": "vi-udt4tC5c9"
138 |       },
139 |       "source": [
140 |         "**Google Colab**: If you are using Colab, you may see the red warnings such as \"You must restart the runtime in order to use newly installed versions.\" For our purpose, we can disregard them, the notebook will still run correctly."
141 |       ]
142 |     },
143 |     {
144 |       "cell_type": "code",
145 |       "execution_count": 2,
146 |       "metadata": {
147 |         "id": "ah9tYaCRCkKS",
148 |         "colab": {
149 |           "base_uri": "https://localhost:8080/"
150 |         },
151 |         "outputId": "5556bc7a-5dd2-4ecd-d0d9-b008cccfe8a1"
152 |       },
153 |       "outputs": [
154 |         {
155 |           "output_type": "stream",
156 |           "name": "stdout",
157 |           "text": [
158 |             "Looking in links: https://mlc.ai/wheels\n",
159 |             "Collecting mlc-ai-nightly-cu123\n",
160 |             "  Downloading https://github.com/mlc-ai/package/releases/download/v0.9.dev0/mlc_ai_nightly_cu123-0.18.dev249-cp310-cp310-manylinux_2_28_x86_64.whl (1026.5 MB)\n",
161 |             "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.0/1.0 GB\u001b[0m \u001b[31m987.8 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
162 |             "\u001b[?25hCollecting mlc-llm-nightly-cu123\n",
163 |             "  Downloading https://github.com/mlc-ai/package/releases/download/v0.9.dev0/mlc_llm_nightly_cu123-0.18.dev71-cp310-cp310-manylinux_2_28_x86_64.whl (177.1 MB)\n",
164 |             "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m177.1/177.1 MB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
165 |             "\u001b[?25hRequirement already satisfied: attrs in /usr/local/lib/python3.10/dist-packages (from mlc-ai-nightly-cu123) (24.2.0)\n",
166 |             "Requirement already satisfied: cloudpickle in /usr/local/lib/python3.10/dist-packages (from mlc-ai-nightly-cu123) (3.1.0)\n",
167 |             "Requirement already satisfied: decorator in /usr/local/lib/python3.10/dist-packages (from mlc-ai-nightly-cu123) (4.4.2)\n",
168 |             "Requirement already satisfied: ml-dtypes in /usr/local/lib/python3.10/dist-packages (from mlc-ai-nightly-cu123) (0.4.1)\n",
169 |             "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from mlc-ai-nightly-cu123) (1.26.4)\n",
170 |             "Requirement already satisfied: packaging in /usr/local/lib/python3.10/dist-packages (from mlc-ai-nightly-cu123) (24.2)\n",
171 |             "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from mlc-ai-nightly-cu123) (5.9.5)\n",
172 |             "Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from mlc-ai-nightly-cu123) (1.13.1)\n",
173 |             "Requirement already satisfied: tornado in /usr/local/lib/python3.10/dist-packages (from mlc-ai-nightly-cu123) (6.3.3)\n",
174 |             "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from mlc-ai-nightly-cu123) (4.12.2)\n",
175 |             "Collecting fastapi (from mlc-llm-nightly-cu123)\n",
176 |             "  Downloading fastapi-0.115.5-py3-none-any.whl.metadata (27 kB)\n",
177 |             "Collecting uvicorn (from mlc-llm-nightly-cu123)\n",
178 |             "  Downloading uvicorn-0.32.1-py3-none-any.whl.metadata (6.6 kB)\n",
179 |             "Collecting shortuuid (from mlc-llm-nightly-cu123)\n",
180 |             "  Downloading shortuuid-1.0.13-py3-none-any.whl.metadata (5.8 kB)\n",
181 |             "Requirement already satisfied: torch in /usr/local/lib/python3.10/dist-packages (from mlc-llm-nightly-cu123) (2.5.1+cu121)\n",
182 |             "Requirement already satisfied: safetensors in /usr/local/lib/python3.10/dist-packages (from mlc-llm-nightly-cu123) (0.4.5)\n",
183 |             "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from mlc-llm-nightly-cu123) (2.32.3)\n",
184 |             "Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from mlc-llm-nightly-cu123) (4.66.6)\n",
185 |             "Requirement already satisfied: sentencepiece in /usr/local/lib/python3.10/dist-packages (from mlc-llm-nightly-cu123) (0.2.0)\n",
186 |             "Collecting tiktoken (from mlc-llm-nightly-cu123)\n",
187 |             "  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)\n",
188 |             "Requirement already satisfied: prompt-toolkit in /usr/local/lib/python3.10/dist-packages (from mlc-llm-nightly-cu123) (3.0.48)\n",
189 |             "Requirement already satisfied: openai in /usr/local/lib/python3.10/dist-packages (from mlc-llm-nightly-cu123) (1.54.4)\n",
190 |             "Requirement already satisfied: transformers in /usr/local/lib/python3.10/dist-packages (from mlc-llm-nightly-cu123) (4.46.2)\n",
191 |             "Requirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from mlc-llm-nightly-cu123) (2.2.2)\n",
192 |             "Collecting datasets (from mlc-llm-nightly-cu123)\n",
193 |             "  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)\n",
194 |             "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from datasets->mlc-llm-nightly-cu123) (3.16.1)\n",
195 |             "Requirement already satisfied: pyarrow>=15.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets->mlc-llm-nightly-cu123) (17.0.0)\n",
196 |             "Collecting dill<0.3.9,>=0.3.0 (from datasets->mlc-llm-nightly-cu123)\n",
197 |             "  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)\n",
198 |             "Collecting xxhash (from datasets->mlc-llm-nightly-cu123)\n",
199 |             "  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n",
200 |             "Collecting multiprocess<0.70.17 (from datasets->mlc-llm-nightly-cu123)\n",
201 |             "  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)\n",
202 |             "Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets->mlc-llm-nightly-cu123)\n",
203 |             "  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)\n",
204 |             "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets->mlc-llm-nightly-cu123) (3.11.2)\n",
205 |             "Requirement already satisfied: huggingface-hub>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from datasets->mlc-llm-nightly-cu123) (0.26.2)\n",
206 |             "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from datasets->mlc-llm-nightly-cu123) (6.0.2)\n",
207 |             "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->mlc-llm-nightly-cu123) (3.4.0)\n",
208 |             "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->mlc-llm-nightly-cu123) (3.10)\n",
209 |             "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->mlc-llm-nightly-cu123) (2.2.3)\n",
210 |             "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->mlc-llm-nightly-cu123) (2024.8.30)\n",
211 |             "Collecting starlette<0.42.0,>=0.40.0 (from fastapi->mlc-llm-nightly-cu123)\n",
212 |             "  Downloading starlette-0.41.3-py3-none-any.whl.metadata (6.0 kB)\n",
213 |             "Requirement already satisfied: pydantic!=1.8,!=1.8.1,!=2.0.0,!=2.0.1,!=2.1.0,<3.0.0,>=1.7.4 in /usr/local/lib/python3.10/dist-packages (from fastapi->mlc-llm-nightly-cu123) (2.9.2)\n",
214 |             "Requirement already satisfied: anyio<5,>=3.5.0 in /usr/local/lib/python3.10/dist-packages (from openai->mlc-llm-nightly-cu123) (3.7.1)\n",
215 |             "Requirement already satisfied: distro<2,>=1.7.0 in /usr/local/lib/python3.10/dist-packages (from openai->mlc-llm-nightly-cu123) (1.9.0)\n",
216 |             "Requirement already satisfied: httpx<1,>=0.23.0 in /usr/local/lib/python3.10/dist-packages (from openai->mlc-llm-nightly-cu123) (0.27.2)\n",
217 |             "Requirement already satisfied: jiter<1,>=0.4.0 in /usr/local/lib/python3.10/dist-packages (from openai->mlc-llm-nightly-cu123) (0.7.1)\n",
218 |             "Requirement already satisfied: sniffio in /usr/local/lib/python3.10/dist-packages (from openai->mlc-llm-nightly-cu123) (1.3.1)\n",
219 |             "Requirement already satisfied: python-dateutil>=2.8.2 in /usr/local/lib/python3.10/dist-packages (from pandas->mlc-llm-nightly-cu123) (2.8.2)\n",
220 |             "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->mlc-llm-nightly-cu123) (2024.2)\n",
221 |             "Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.10/dist-packages (from pandas->mlc-llm-nightly-cu123) (2024.2)\n",
222 |             "Requirement already satisfied: wcwidth in /usr/local/lib/python3.10/dist-packages (from prompt-toolkit->mlc-llm-nightly-cu123) (0.2.13)\n",
223 |             "Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.10/dist-packages (from tiktoken->mlc-llm-nightly-cu123) (2024.9.11)\n",
224 |             "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch->mlc-llm-nightly-cu123) (3.4.2)\n",
225 |             "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch->mlc-llm-nightly-cu123) (3.1.4)\n",
226 |             "Requirement already satisfied: sympy==1.13.1 in /usr/local/lib/python3.10/dist-packages (from torch->mlc-llm-nightly-cu123) (1.13.1)\n",
227 |             "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from sympy==1.13.1->torch->mlc-llm-nightly-cu123) (1.3.0)\n",
228 |             "Requirement already satisfied: tokenizers<0.21,>=0.20 in /usr/local/lib/python3.10/dist-packages (from transformers->mlc-llm-nightly-cu123) (0.20.3)\n",
229 |             "Requirement already satisfied: click>=7.0 in /usr/local/lib/python3.10/dist-packages (from uvicorn->mlc-llm-nightly-cu123) (8.1.7)\n",
230 |             "Requirement already satisfied: h11>=0.8 in /usr/local/lib/python3.10/dist-packages (from uvicorn->mlc-llm-nightly-cu123) (0.14.0)\n",
231 |             "Requirement already satisfied: exceptiongroup in /usr/local/lib/python3.10/dist-packages (from anyio<5,>=3.5.0->openai->mlc-llm-nightly-cu123) (1.2.2)\n",
232 |             "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->mlc-llm-nightly-cu123) (2.4.3)\n",
233 |             "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->mlc-llm-nightly-cu123) (1.3.1)\n",
234 |             "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->mlc-llm-nightly-cu123) (1.5.0)\n",
235 |             "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->mlc-llm-nightly-cu123) (6.1.0)\n",
236 |             "Requirement already satisfied: propcache>=0.2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->mlc-llm-nightly-cu123) (0.2.0)\n",
237 |             "Requirement already satisfied: yarl<2.0,>=1.17.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->mlc-llm-nightly-cu123) (1.17.2)\n",
238 |             "Requirement already satisfied: async-timeout<6.0,>=4.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets->mlc-llm-nightly-cu123) (4.0.3)\n",
239 |             "Requirement already satisfied: httpcore==1.* in /usr/local/lib/python3.10/dist-packages (from httpx<1,>=0.23.0->openai->mlc-llm-nightly-cu123) (1.0.7)\n",
240 |             "Requirement already satisfied: annotated-types>=0.6.0 in /usr/local/lib/python3.10/dist-packages (from pydantic!=1.8,!=1.8.1,!=2.0.0,!=2.0.1,!=2.1.0,<3.0.0,>=1.7.4->fastapi->mlc-llm-nightly-cu123) (0.7.0)\n",
241 |             "Requirement already satisfied: pydantic-core==2.23.4 in /usr/local/lib/python3.10/dist-packages (from pydantic!=1.8,!=1.8.1,!=2.0.0,!=2.0.1,!=2.1.0,<3.0.0,>=1.7.4->fastapi->mlc-llm-nightly-cu123) (2.23.4)\n",
242 |             "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.2->pandas->mlc-llm-nightly-cu123) (1.16.0)\n",
243 |             "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch->mlc-llm-nightly-cu123) (3.0.2)\n",
244 |             "Downloading datasets-3.1.0-py3-none-any.whl (480 kB)\n",
245 |             "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m480.6/480.6 kB\u001b[0m \u001b[31m16.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
246 |             "\u001b[?25hDownloading fastapi-0.115.5-py3-none-any.whl (94 kB)\n",
247 |             "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m94.9/94.9 kB\u001b[0m \u001b[31m8.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
248 |             "\u001b[?25hDownloading shortuuid-1.0.13-py3-none-any.whl (10 kB)\n",
249 |             "Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)\n",
250 |             "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/1.2 MB\u001b[0m \u001b[31m48.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
251 |             "\u001b[?25hDownloading uvicorn-0.32.1-py3-none-any.whl (63 kB)\n",
252 |             "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m63.8/63.8 kB\u001b[0m \u001b[31m5.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
253 |             "\u001b[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)\n",
254 |             "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m116.3/116.3 kB\u001b[0m \u001b[31m11.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
255 |             "\u001b[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (179 kB)\n",
256 |             "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m179.3/179.3 kB\u001b[0m \u001b[31m15.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
257 |             "\u001b[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)\n",
258 |             "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m13.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
259 |             "\u001b[?25hDownloading starlette-0.41.3-py3-none-any.whl (73 kB)\n",
260 |             "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m73.2/73.2 kB\u001b[0m \u001b[31m7.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
261 |             "\u001b[?25hDownloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n",
262 |             "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m15.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
263 |             "\u001b[?25hInstalling collected packages: xxhash, uvicorn, shortuuid, fsspec, dill, tiktoken, starlette, multiprocess, mlc-ai-nightly-cu123, fastapi, datasets, mlc-llm-nightly-cu123\n",
264 |             "  Attempting uninstall: fsspec\n",
265 |             "    Found existing installation: fsspec 2024.10.0\n",
266 |             "    Uninstalling fsspec-2024.10.0:\n",
267 |             "      Successfully uninstalled fsspec-2024.10.0\n",
268 |             "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
269 |             "gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatible.\u001b[0m\u001b[31m\n",
270 |             "\u001b[0mSuccessfully installed datasets-3.1.0 dill-0.3.8 fastapi-0.115.5 fsspec-2024.9.0 mlc-ai-nightly-cu123-0.18.dev249 mlc-llm-nightly-cu123-0.18.dev71 multiprocess-0.70.16 shortuuid-1.0.13 starlette-0.41.3 tiktoken-0.8.0 uvicorn-0.32.1 xxhash-3.5.0\n"
271 |           ]
272 |         }
273 |       ],
274 |       "source": [
275 |         "!pip install --pre mlc-ai-nightly-cu123 mlc-llm-nightly-cu123 -f https://mlc.ai/wheels"
276 |       ]
277 |     },
278 |     {
279 |       "cell_type": "markdown",
280 |       "metadata": {
281 |         "id": "nZGVNJE-DJ9E"
282 |       },
283 |       "source": [
284 |         "Let's confirm we have installed the packages successfully!"
285 |       ]
286 |     },
287 |     {
288 |       "cell_type": "code",
289 |       "execution_count": 3,
290 |       "metadata": {
291 |         "id": "5Y6LszJgC7SQ",
292 |         "colab": {
293 |           "base_uri": "https://localhost:8080/"
294 |         },
295 |         "outputId": "8bb25fb7-e9d2-4fbf-d0eb-a09a41071cdc"
296 |       },
297 |       "outputs": [
298 |         {
299 |           "output_type": "stream",
300 |           "name": "stdout",
301 |           "text": [
302 |             "tvm installed properly!\n",
303 |             "mlc_llm installed properly!\n"
304 |           ]
305 |         }
306 |       ],
307 |       "source": [
308 |         "!python -c \"import tvm; print('tvm installed properly!')\"\n",
309 |         "!python -c \"import mlc_llm; print('mlc_llm installed properly!')\""
310 |       ]
311 |     },
312 |     {
313 |       "cell_type": "markdown",
314 |       "metadata": {
315 |         "id": "lGfnrRa9DMw1"
316 |       },
317 |       "source": [
318 |         "## General JSON Text Generation"
319 |       ]
320 |     },
321 |     {
322 |       "cell_type": "markdown",
323 |       "metadata": {
324 |         "id": "pVYkLb0eDjMi"
325 |       },
326 |       "source": [
327 |         "MLC LLM supports two levels of structured generation: a general JSON mode and per-request schema customization. The general JSON mode constrains the response to conform to the JSON grammar. To use it, pass the argument `response_format={\"type\": \"json_object\"}` to the chat completion request. Below is a request example with JSON mode:\n"
328 |       ]
329 |     },
330 |     {
331 |       "cell_type": "markdown",
332 |       "metadata": {
333 |         "id": "Pg7daEvlD5UB"
334 |       },
335 |       "source": [
336 |         "Note: If you are NOT running in **Google Colab**, you may need to run `!conda install git git-lfs` to install `git` and `git-lfs` before running the following cell."
337 |       ]
338 |     },
339 |     {
340 |       "cell_type": "code",
341 |       "execution_count": 4,
342 |       "metadata": {
343 |         "id": "FDFbw1KPDLu1",
344 |         "colab": {
345 |           "base_uri": "https://localhost:8080/"
346 |         },
347 |         "outputId": "e50809ec-89ec-46b8-c761-69d1d040fd9c"
348 |       },
349 |       "outputs": [
350 |         {
351 |           "output_type": "stream",
352 |           "name": "stdout",
353 |           "text": [
354 |             "Git LFS initialized.\n"
355 |           ]
356 |         }
357 |       ],
358 |       "source": [
359 |         "!git lfs install"
360 |       ]
361 |     },
362 |     {
363 |       "cell_type": "code",
364 |       "execution_count": 5,
365 |       "metadata": {
366 |         "id": "bYqaVjmND7Px",
367 |         "colab": {
368 |           "base_uri": "https://localhost:8080/"
369 |         },
370 |         "outputId": "46a18ae6-fa5b-45ef-ac6a-23246855de31"
371 |       },
372 |       "outputs": [
373 |         {
374 |           "output_type": "stream",
375 |           "name": "stderr",
376 |           "text": [
377 |             "0it [00:00, ?it/s]\n",
378 |             "100%|██████████| 58/58 [00:18<00:00,  3.11it/s]\n"
379 |           ]
380 |         },
381 |         {
382 |           "output_type": "stream",
383 |           "name": "stdout",
384 |           "text": [
385 |             "{\"places\": [\n",
386 |             "  {\n",
387 |             "    \"name\": \"Grand Canyon\",\n",
388 |             "    \"location\": \"Arizona\",\n",
389 |             "    \"description\": \"One of the most iconic natural wonders in the United States, the Grand Canyon is a breathtaking example of erosion and geological history.\"\n",
390 |             "  },\n",
391 |             "  {\n",
392 |             "    \"name\": \"Statue of Liberty\",\n",
393 |             "    \"location\": \"New York/New Jersey\",\n",
394 |             "    \"description\": \"A symbol of freedom and democracy, the Statue of Liberty is a must-see attraction on Liberty Island in New York Harbor.\"\n",
395 |             "  },\n",
396 |             "  {\n",
397 |             "    \"name\": \"Golden Gate Bridge\",\n",
398 |             "    \"location\": \"California\",\n",
399 |             "    \"description\": \"An engineering marvel and iconic symbol of San Francisco, the Golden Gate Bridge is a must-see for its stunning views and rich history.\"\n",
400 |             "  }\n",
401 |             "]}"
402 |           ]
403 |         }
404 |       ],
405 |       "source": [
406 |         "from mlc_llm import MLCEngine\n",
407 |         "\n",
408 |         "# Create the MLCEngine. The model will be automatically downloaded.\n",
409 |         "model = \"HF://mlc-ai/Llama-3.2-3B-Instruct-q4f16_1-MLC\"\n",
410 |         "engine = MLCEngine(model)\n",
411 |         "\n",
412 |         "# Generate JSON text with MLCEngine, backed by XGrammar.\n",
413 |         "prompt = \"List 3 must-see places of interest in United States in JSON.\"\n",
414 |         "for chunk in engine.chat.completions.create(\n",
415 |         "    messages= [{\"role\": \"user\", \"content\": prompt}],\n",
416 |         "    response_format={\"type\": \"json_object\"},\n",
417 |         "    stream=True,\n",
418 |         "):\n",
419 |         "    print(chunk.choices[0].delta.content, end=\"\", flush=True)"
420 |       ]
421 |     },
422 |     {
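    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Because the output is constrained to valid JSON, you can also issue the same request without streaming and parse the full message in one step. The cell below is a minimal sketch, assuming the non-streaming call returns an OpenAI-style chat completion object; it reuses the `engine` and `prompt` defined above."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "import json\n",
        "\n",
        "# Minimal non-streaming sketch (assumption: the returned object mirrors the\n",
        "# OpenAI chat completion API, with the text in choices[0].message.content).\n",
        "response = engine.chat.completions.create(\n",
        "    messages=[{\"role\": \"user\", \"content\": prompt}],\n",
        "    response_format={\"type\": \"json_object\"},\n",
        "    stream=False,\n",
        ")\n",
        "parsed = json.loads(response.choices[0].message.content)\n",
        "print(parsed)"
      ]
    },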
423 |       "cell_type": "markdown",
424 |       "metadata": {
425 |         "id": "SMEavWCJEC_d"
426 |       },
427 |       "source": [
428 |         "## Structured Generation with Schema"
429 |       ]
430 |     },
431 |     {
432 |       "cell_type": "markdown",
433 |       "source": [
434 |         "Additionally, MLCEngine allows for the customization of the response JSON schema for each individual request. When a JSON schema is provided, MLCEngine will generate responses that adhere strictly to that schema. Below is a request example with customized JSON schema:"
435 |       ],
436 |       "metadata": {
437 |         "id": "S1LpZviTgD_m"
438 |       }
439 |     },
440 |     {
441 |       "cell_type": "code",
442 |       "execution_count": 6,
443 |       "metadata": {
444 |         "id": "etHEUrfMD8bX",
445 |         "colab": {
446 |           "base_uri": "https://localhost:8080/"
447 |         },
448 |         "outputId": "a6f7b918-094c-4f79-d359-6528a912c351"
449 |       },
450 |       "outputs": [
451 |         {
452 |           "output_type": "stream",
453 |           "name": "stdout",
454 |           "text": [
455 |             "{\"countries\": [{\"name\": \"Japan\", \"capital\": \"Tokyo\"}, {\"name\": \"Australia\", \"capital\": \"Canberra\"}, {\"name\": \"Brazil\", \"capital\": \"Brasilia\"}]}"
456 |           ]
457 |         }
458 |       ],
459 |       "source": [
460 |         "import json\n",
461 |         "import pydantic\n",
462 |         "from typing import List\n",
463 |         "\n",
464 |         "\n",
465 |         "class Country(pydantic.BaseModel):\n",
466 |         "    name: str\n",
467 |         "    capital: str\n",
468 |         "\n",
469 |         "\n",
470 |         "class Countries(pydantic.BaseModel):\n",
471 |         "    countries: List[Country]\n",
472 |         "\n",
473 |         "\n",
474 |         "# Get the JSON schema of \"Countries\"\n",
475 |         "schema = json.dumps(Countries.model_json_schema())\n",
476 |         "prompt = \"Randomly list three countries and their capitals in JSON.\"\n",
477 |         "\n",
478 |         "for chunk in engine.chat.completions.create(\n",
479 |         "    messages= [{\"role\": \"user\", \"content\": prompt}],\n",
480 |         "    response_format={\"type\": \"json_object\", \"schema\": schema},\n",
481 |         "    stream=True,\n",
482 |         "):\n",
483 |         "    print(chunk.choices[0].delta.content, end=\"\", flush=True)\n"
484 |       ]
485 |     }
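    ,
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Because the output is guaranteed to conform to the schema, it can be validated and loaded straight back into the pydantic model. The cell below is a small sketch of that round trip, assuming a non-streaming request that returns an OpenAI-style chat completion object; it also terminates the engine at the end to free its resources."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Sketch: request the schema-constrained output without streaming and\n",
        "# validate it back into the \"Countries\" pydantic model defined above.\n",
        "response = engine.chat.completions.create(\n",
        "    messages=[{\"role\": \"user\", \"content\": prompt}],\n",
        "    response_format={\"type\": \"json_object\", \"schema\": schema},\n",
        "    stream=False,\n",
        ")\n",
        "countries = Countries.model_validate_json(response.choices[0].message.content)\n",
        "for country in countries.countries:\n",
        "    print(country.name, \"->\", country.capital)\n",
        "\n",
        "# Release engine resources once all requests are done.\n",
        "engine.terminate()"
      ]
    }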
486 |   ],
487 |   "metadata": {
488 |     "accelerator": "GPU",
489 |     "colab": {
490 |       "gpuType": "T4",
491 |       "provenance": []
492 |     },
493 |     "kernelspec": {
494 |       "display_name": "Python 3",
495 |       "name": "python3"
496 |     },
497 |     "language_info": {
498 |       "name": "python"
499 |     }
500 |   },
501 |   "nbformat": 4,
502 |   "nbformat_minor": 0
503 | }


--------------------------------------------------------------------------------
/mlc-llm/tutorial_raw_text_generation.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "cells": [
  3 |     {
  4 |       "cell_type": "markdown",
  5 |       "metadata": {
  6 |         "id": "4IwhlCjVtpYj"
  7 |       },
  8 |       "source": [
  9 |         "# MLC-LLM Raw Text Generation in Python\n",
 10 |         "\n",
 11 |         "Here's a quick overview of how to perform raw text generation in Python. In this tutorial, we will be chatting with the Llama2 model. For the easiest setup, we recommend trying this out in a Google Colab notebook. Click the button below to get started!\n",
 12 |         "\n",
 13 |         "Raw text generation gives the user more flexibility over the prompts, without forcing them to create a new conversational template, which makes prompt customization easier. It also serves use cases where an API needs to drive LLM generation without the usual system prompts and other conversation scaffolding.\n",
 14 |         "\n",
 15 |         "Learn more about MLC LLM here: https://mlc.ai/mlc-llm/docs."
 16 |       ]
 17 |     },
 18 |     {
 19 |       "cell_type": "markdown",
 20 |       "metadata": {
 21 |         "id": "YsvAL7SSt9Lo"
 22 |       },
 23 |       "source": [
 24 |         "Click the button below to get started!\n",
 25 |         "\n",
 26 |         "\n",
 27 |         "*(Open In Colab badge)*\n",
 28 |         ""
 29 |       ]
 30 |     },
 31 |     {
 32 |       "cell_type": "markdown",
 33 |       "metadata": {
 34 |         "id": "8kkADAMCCLi-"
 35 |       },
 36 |       "source": [
 37 |         "## Install MLC LLM"
 38 |       ]
 39 |     },
 40 |     {
 41 |       "cell_type": "markdown",
 42 |       "metadata": {
 43 |         "id": "Y2EwuS6TCO61"
 44 |       },
 45 |       "source": [
 46 |         "We will start by setting up the environment. First, let us create a new conda environment in which we will run the rest of the notebook.\n",
 47 |         "\n",
 48 |         "```\n",
 49 |         "conda create --name mlc-llm python=3.10\n",
 50 |         "conda activate mlc-llm\n",
 51 |         "```"
 52 |       ]
 53 |     },
 54 |     {
 55 |       "cell_type": "markdown",
 56 |       "metadata": {
 57 |         "id": "ojEeEmsqCTPG"
 58 |       },
 59 |       "source": [
 60 |         "**Google Colab**\n",
 61 |         "\n",
 62 |         "- If you are running this in a Google Colab notebook, you do not need to create a conda environment.\n",
 63 |         "- However, be sure to change your runtime to GPU by going to `Runtime` > `Change runtime type` and setting the hardware accelerator to \"GPU\"."
 64 |       ]
 65 |     },
 66 |     {
 67 |       "cell_type": "markdown",
 68 |       "metadata": {
 69 |         "id": "S_rX53bGChPn"
 70 |       },
 71 |       "source": [
 72 |         "If you are using CUDA, you can run the following command to confirm that CUDA is set up correctly, and check the driver version number as well as what GPUs are currently available for use."
 73 |       ]
 74 |     },
 75 |     {
 76 |       "cell_type": "code",
 77 |       "execution_count": null,
 78 |       "metadata": {
 79 |         "id": "CRPeCflbCij6"
 80 |       },
 81 |       "outputs": [],
 82 |       "source": [
 83 |         "!nvidia-smi"
 84 |       ]
 85 |     },
 86 |     {
 87 |       "cell_type": "markdown",
 88 |       "metadata": {
 89 |         "id": "PQfVfTAYC1M-"
 90 |       },
 91 |       "source": [
 92 |         "Next, let's download the MLC-AI and mlc-llm nightly build packages. If you are running in a Colab environment, then you can just run the following command. Otherwise, go to https://mlc.ai/package/ and replace the command below with the one that is appropriate for your hardware and OS."
 93 |       ]
 94 |     },
 95 |     {
 96 |       "cell_type": "markdown",
 97 |       "metadata": {
 98 |         "id": "vi-udt4tC5c9"
 99 |       },
100 |       "source": [
101 |         "**Google Colab**: If you are using Colab, you may see red warnings such as \"You must restart the runtime in order to use newly installed versions.\" For our purposes, you can disregard them; the notebook will still run correctly."
102 |       ]
103 |     },
104 |     {
105 |       "cell_type": "code",
106 |       "execution_count": null,
107 |       "metadata": {
108 |         "id": "ah9tYaCRCkKS"
109 |       },
110 |       "outputs": [],
111 |       "source": [
112 |         "!pip install --pre --force-reinstall mlc-ai-nightly-cu118 mlc-llm-nightly-cu118 -f https://mlc.ai/wheels"
113 |       ]
114 |     },
115 |     {
116 |       "cell_type": "markdown",
117 |       "metadata": {
118 |         "id": "nZGVNJE-DJ9E"
119 |       },
120 |       "source": [
121 |         "Let's confirm we have installed the packages successfully!"
122 |       ]
123 |     },
124 |     {
125 |       "cell_type": "code",
126 |       "execution_count": null,
127 |       "metadata": {
128 |         "id": "5Y6LszJgC7SQ"
129 |       },
130 |       "outputs": [],
131 |       "source": [
132 |         "!python -c \"import tvm; print('tvm installed properly!')\"\n",
133 |         "!python -c \"import mlc_llm; print('mlc_llm installed properly!')\""
134 |       ]
135 |     },
136 |     {
137 |       "cell_type": "markdown",
138 |       "metadata": {
139 |         "id": "lGfnrRa9DMw1"
140 |       },
141 |       "source": [
142 |         "## Download Prebuilt Models and Library"
143 |       ]
144 |     },
145 |     {
146 |       "cell_type": "markdown",
147 |       "metadata": {
148 |         "id": "pVYkLb0eDjMi"
149 |       },
150 |       "source": [
151 |         "The following commands will download all the available prebuilt libraries (e.g., `.so` files). This may take a while. If in **Google Colab**, you can verify that the files are being downloaded by clicking on the folder icon on the left."
152 |       ]
153 |     },
154 |     {
155 |       "cell_type": "markdown",
156 |       "metadata": {
157 |         "id": "Pg7daEvlD5UB"
158 |       },
159 |       "source": [
160 |         "Note: If you are NOT running in **Google Colab**, you may need to run `!conda install git git-lfs` to install `git` and `git-lfs` before running the following cell."
161 |       ]
162 |     },
163 |     {
164 |       "cell_type": "code",
165 |       "execution_count": null,
166 |       "metadata": {
167 |         "id": "FDFbw1KPDLu1"
168 |       },
169 |       "outputs": [],
170 |       "source": [
171 |         "!git lfs install"
172 |       ]
173 |     },
174 |     {
175 |       "cell_type": "code",
176 |       "execution_count": null,
177 |       "metadata": {
178 |         "id": "bYqaVjmND7Px"
179 |       },
180 |       "outputs": [],
181 |       "source": [
182 |         "!mkdir -p dist\n",
183 |         "!git clone https://github.com/mlc-ai/binary-mlc-llm-libs.git dist/prebuilt_libs"
184 |       ]
185 |     },
186 |     {
187 |       "cell_type": "markdown",
188 |       "metadata": {
189 |         "id": "SMEavWCJEC_d"
190 |       },
191 |       "source": [
192 |         "#### Llama-2-7b-chat q4f16_1 prebuilt weights"
193 |       ]
194 |     },
195 |     {
196 |       "cell_type": "code",
197 |       "execution_count": null,
198 |       "metadata": {
199 |         "id": "etHEUrfMD8bX"
200 |       },
201 |       "outputs": [],
202 |       "source": [
203 |         "!cd dist && git clone https://huggingface.co/mlc-ai/Llama-2-7b-chat-hf-q4f16_1-MLC"
204 |       ]
205 |     },
206 |     {
207 |       "cell_type": "code",
208 |       "execution_count": 9,
209 |       "metadata": {
210 |         "id": "dbHdyfIXHNpo"
211 |       },
212 |       "outputs": [],
213 |       "source": [
214 |         "# Restart colab\n",
215 |         "exit()"
216 |       ]
217 |     },
218 |     {
219 |       "cell_type": "markdown",
220 |       "metadata": {
221 |         "id": "CmpxrrqyE0S6"
222 |       },
223 |       "source": [
224 |         "## Let's try raw text generation with Llama-2-7b-chat!"
225 |       ]
226 |     },
227 |     {
228 |       "cell_type": "code",
229 |       "execution_count": 1,
230 |       "metadata": {
231 |         "id": "VYZtJS_OoCW6"
232 |       },
233 |       "outputs": [],
234 |       "source": [
235 |         "from mlc_llm import ChatModule, ChatConfig, ConvConfig\n",
236 |         "from mlc_llm.callback import StreamToStdout"
237 |       ]
238 |     },
239 |     {
240 |       "cell_type": "markdown",
241 |       "metadata": {
242 |         "id": "dCnYzG1dombI"
243 |       },
244 |       "source": [
245 |         "Use a `ConvConfig` to define the generation settings. Since we will be using the `LM` template, which supports raw text generation, any system prompt provided here will be ignored."
246 |       ]
247 |     },
248 |     {
249 |       "cell_type": "code",
250 |       "execution_count": 2,
251 |       "metadata": {
252 |         "id": "vUn1QHlaoiY8"
253 |       },
254 |       "outputs": [],
255 |       "source": [
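        "# stop_tokens=[2]: stop at Llama-2's end-of-sequence token id; add_bos prepends\n",
        "# the beginning-of-sequence token; stop_str halts generation if the model starts\n",
        "# a new \"[INST]\" turn on its own.\n",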
256 |         "conv_config = ConvConfig(stop_tokens=[2,], add_bos=True, stop_str=\"[INST]\")"
257 |       ]
258 |     },
259 |     {
260 |       "cell_type": "markdown",
261 |       "metadata": {
262 |         "id": "K5460Ca7phM0"
263 |       },
264 |       "source": [
265 |         "Note that `conv_config` is an optional subfield of `chat_config`. The `LM` template serves the basic purpose of raw text generation."
266 |       ]
267 |     },
268 |     {
269 |       "cell_type": "code",
270 |       "execution_count": 3,
271 |       "metadata": {
272 |         "id": "Yw0vlNEvpclP"
273 |       },
274 |       "outputs": [],
275 |       "source": [
276 |         "chat_config = ChatConfig(conv_config=conv_config, conv_template=\"LM\")"
277 |       ]
278 |     },
279 |     {
280 |       "cell_type": "markdown",
281 |       "metadata": {
282 |         "id": "UshFruMXpu31"
283 |       },
284 |       "source": [
285 |         "Using the `chat_config` we created, instantiate a `ChatModule`."
286 |       ]
287 |     },
288 |     {
289 |       "cell_type": "code",
290 |       "execution_count": 4,
291 |       "metadata": {
292 |         "id": "6AeKjYybpvMH"
293 |       },
294 |       "outputs": [],
295 |       "source": [
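        "# model: path to the quantized weights cloned into dist/ above.\n",
        "# model_lib_path: prebuilt CUDA model library matching this model and quantization.\n",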
296 |         "cm = ChatModule(\n",
297 |         "   model=\"dist/Llama-2-7b-chat-hf-q4f16_1-MLC\",\n",
298 |         "   model_lib_path=\"dist/prebuilt_libs/Llama-2-7b-chat-hf/Llama-2-7b-chat-hf-q4f16_1-cuda.so\",\n",
299 |         "   chat_config=chat_config\n",
300 |         ")"
301 |       ]
302 |     },
303 |     {
304 |       "cell_type": "markdown",
305 |       "metadata": {
306 |         "id": "XAbeEqDjqB0T"
307 |       },
308 |       "source": [
309 |         "Let's define our first prompt. The LLM will be fed this exact piece of text, unlike with other conversational templates, which structure the conversation beforehand and abstract these details away. However, to make the model follow a conversation, a chat structure still has to be provided: specific tags must be placed around the prompt, because the model was fine-tuned with those tags to follow conversations accurately. This lets users build their own prompts without having to define a new template."
310 |       ]
311 |     },
312 |     {
313 |       "cell_type": "code",
314 |       "execution_count": 5,
315 |       "metadata": {
316 |         "id": "7_Z_w5VUp7HZ"
317 |       },
318 |       "outputs": [],
319 |       "source": [
320 |         "system_prompt = \"<<SYS>>\\nYou are a helpful, respectful and honest assistant.\\n<</SYS>>\\n\\n\"\n",
321 |         "inst_prompt = \"What is mother nature?\""
322 |       ]
323 |     },
324 |     {
325 |       "cell_type": "markdown",
326 |       "metadata": {
327 |         "id": "kuylQHLQ6ugR"
328 |       },
329 |       "source": [
330 |         "Concatenate system and instruction prompts, and add instruction tags before generation. As you can see, the model will correctly follow the conversation."
331 |       ]
332 |     },
333 |     {
334 |       "cell_type": "code",
335 |       "execution_count": 6,
336 |       "metadata": {
337 |         "colab": {
338 |           "base_uri": "https://localhost:8080/"
339 |         },
340 |         "id": "NaVcdEXup8NH",
341 |         "outputId": "631c2f60-68cc-4a90-ecb2-6fc06eb1b642"
342 |       },
343 |       "outputs": [
344 |         {
345 |           "name": "stdout",
346 |           "output_type": "stream",
347 |           "text": [
348 |             "Hello! I'm so glad you asked! Mother Nature is a term used to describe the natural world around us, including all living things and the environment that supports them. It encompasses everything from the tiniest microorganisms to the largest landscapes, and includes all the elements and processes that shape our planet.\n",
349 |             "Mother Nature is the source of all life, providing us with the air we breathe, the water we drink, the food we eat, and the beauty we behold. She is the foundation of our very existence, and yet, she is often taken for granted.\n",
350 |             "It's important to remember that Mother Nature is not just something we rely on for our survival, but she also provides us with endless opportunities for inspiration, creativity, and joy. From the majestic mountains to the rolling hills, from the sparkling oceans to the babbling brooks, Mother Nature offers us a never-ending array of wonders and marvels.\n",
351 |             "So, the next time you take a moment to appreciate the beauty of Mother Nature, remember that you are not just appreciating something beautiful, you are appreciating the very source of your own existence. Take care of her, and she will take care of you.\n"
352 |           ]
353 |         }
354 |       ],
355 |       "source": [
356 |         "output = cm.generate(\n",
357 |         "   prompt=f\"[INST] {system_prompt+inst_prompt} [/INST]\",\n",
358 |         "   progress_callback=StreamToStdout(callback_interval=2),\n",
359 |         ")"
360 |       ]
361 |     },
362 |     {
363 |       "cell_type": "markdown",
364 |       "metadata": {
365 |         "id": "CijSHO6K9QqG"
366 |       },
367 |       "source": [
368 |         "Structuring the conversation in this way is equivalent to using the following conversational template in MLC-LLM:\n",
369 |         "\n",
370 |         "```cpp\n",
371 |         "Conversation Llama2() {\n",
372 |         "  Conversation conv;\n",
373 |         "  conv.name = \"llama-2\";\n",
374 |         "  conv.system =\n",
375 |         "      (\"[INST] <<SYS>>\\n\\nYou are a helpful, respectful and honest assistant.\\n<</SYS>>\\n\\n \");\n",
376 |         "  conv.roles = {\"[INST]\", \"[/INST]\"};\n",
377 |         "  conv.messages = {};\n",
378 |         "  conv.offset = 0;\n",
379 |         "  conv.separator_style = SeparatorStyle::kSepRoleMsg;\n",
380 |         "  conv.seps = {\" \"};\n",
381 |         "  conv.role_msg_sep = \" \";\n",
382 |         "  conv.role_empty_sep = \" \";\n",
383 |         "  conv.stop_tokens = {2};\n",
384 |         "  conv.stop_str = \"[INST]\";\n",
385 |         "  conv.add_bos = true;\n",
386 |         "  return conv;\n",
387 |         "}\n",
388 |         "```"
389 |       ]
390 |     },
391 |     {
392 |       "cell_type": "markdown",
393 |       "metadata": {
394 |         "id": "008dtOGy7ZMQ"
395 |       },
396 |       "source": [
397 |         "In the following case, since we do not add any tags, the model simply performs normal text completion because there is no chat structure.\n",
398 |         "\n",
399 |         "**Note:** The `LM` template has no memory, so it is reset after every generation (as if we had run `cm.reset_chat()`)."
400 |       ]
401 |     },
402 |     {
403 |       "cell_type": "code",
404 |       "execution_count": 7,
405 |       "metadata": {
406 |         "colab": {
407 |           "base_uri": "https://localhost:8080/"
408 |         },
409 |         "id": "3K8X2p7Y61nl",
410 |         "outputId": "7dcc570e-9f62-4744-d4b5-23fa003a4307"
411 |       },
412 |       "outputs": [
413 |         {
414 |           "name": "stdout",
415 |           "output_type": "stream",
416 |           "text": [
417 |             "living beings from non-living matter. literally, it is characterized by growth, reproduction, metabolism, response to stimuli, and adaptation to their environment. The concept of life has puzzled scientists and philosophers for centuries, and there is no consensus on a definition that encompasses all aspects of life.\n",
418 |             "The most commonly used definition of life is the \"chemical definition,\" which states that living things are composed of cells, which are the basic structural and functional units of life. Cells are made up of biomolecules such as DNA, RNA, and proteins, which perform a variety of functions necessary for life, such as metabolism, growth, and reproduction.\n",
419 |             "Another definition of life is the \"functional definition,\" which states that living things have the ability to maintain homeostasis, or a stable internal environment, despite changes in the external environment. This means that living things are able to regulate their internal processes and maintain a stable balance of chemical and physical parameters, such as temperature, pH, and concentration of nutrients and waste products.\n",
420 |             "A third definition of life is the \"process definition,\" which states that living things are characterized by a set of processes that are unique to living things and cannot be replicated by non-living matter. These processes include metabolism, growth, reproduction, response to stimuli, and adaptation to their environment.\n",
421 |             "There are also other definitions of life, such as the \"energy definition,\" which states that living things are characterized by their ability to capture and convert energy from their environment, and the \"information definition,\" which states that living things are characterized by their ability to store, process, and transmit information.\n",
422 |             "Despite these various definitions, there is still much debate among scientists and philosophers about what exactly constitutes life. Some argue that life is a fundamental property of the universe, while others believe that it is a product of historical and cultural factors. Ultimately, the definition of life is likely to be complex and multifaceted, encompassing a variety of biological, chemical, and physical processes that are unique to living things.\n"
423 |           ]
424 |         }
425 |       ],
426 |       "source": [
427 |         "output = cm.generate(\n",
428 |         "   prompt=\"Life is a quality that distinguishes\",\n",
429 |         "   progress_callback=StreamToStdout(callback_interval=2),\n",
430 |         ")"
431 |       ]
432 |     }
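    ,
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "Because the `LM` template keeps no history, a multi-turn conversation has to be assembled by hand: concatenate the earlier turns back into the next prompt and wrap every user turn in the instruction tags. The cell below is a rough sketch of that pattern (the question strings are only illustrative), reusing the `cm` and `system_prompt` defined above."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Sketch: carry the conversation manually, since the LM template is stateless.\n",
        "# Each user turn is wrapped in [INST] ... [/INST]; earlier turns are simply\n",
        "# concatenated back into the next prompt.\n",
        "first_turn = f\"[INST] {system_prompt}What is mother nature? [/INST]\"\n",
        "first_answer = cm.generate(prompt=first_turn)\n",
        "\n",
        "follow_up = \"Summarize that in one sentence.\"\n",
        "output = cm.generate(\n",
        "   prompt=f\"{first_turn} {first_answer} [INST] {follow_up} [/INST]\",\n",
        "   progress_callback=StreamToStdout(callback_interval=2),\n",
        ")"
      ]
    }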
433 |   ],
434 |   "metadata": {
435 |     "accelerator": "GPU",
436 |     "colab": {
437 |       "gpuType": "T4",
438 |       "provenance": []
439 |     },
440 |     "kernelspec": {
441 |       "display_name": "Python 3",
442 |       "name": "python3"
443 |     },
444 |     "language_info": {
445 |       "name": "python"
446 |     }
447 |   },
448 |   "nbformat": 4,
449 |   "nbformat_minor": 0
450 | }
451 | 


--------------------------------------------------------------------------------