├── AccDiffusion_Gradio_Colab.ipynb ├── Artist_Gradio_Colab.ipynb ├── AuraSRv2_Gradio_Colab.ipynb ├── DDColor_Gradio_Demo_Colab.ipynb ├── DeepSeek_VL_Gradio_Multimodal.ipynb ├── Edit Video and Create Gifs using SOTA ASR.ipynb ├── FluxDEV_with_ControlNet_Canny_and_Gradio.ipynb ├── Flux_with_Gradio.ipynb ├── GradioApp_VisualGrounding_guided_Inpainting_StableDiffusion.ipynb ├── IMAGDressing_v1.ipynb ├── Llama3_2_3B_Instruct_with_Gradio5.ipynb ├── Llama3_2_Vision_Chatbot_with_Gradio5.ipynb ├── Llama3_2_Vision_Chatbot_with_Gradio5_cp.ipynb ├── OpenAI_TTS_with_Gradio.ipynb ├── PhotoMaker_Demo.ipynb ├── README.md ├── SAM2_Gradio_Colab.ipynb ├── Using_HFHub_for_Image_Analysis.ipynb ├── Video_Editing_using_Automatic_Speech_Recognition.ipynb ├── Video_Segmentation_with_EVF_SAM2_&_Gradio.ipynb ├── Whatsapp_Image_Forwards_In_Your_Language_GradioDemo.ipynb ├── YouTube_SearchInAVideo.ipynb ├── app.py ├── images ├── gif1.gif └── gradioapp3.jpg ├── index.html ├── nougat.ipynb └── updated_smolagents_gradio_ui.py /AccDiffusion_Gradio_Colab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "machine_shape": "hm", 8 | "gpuType": "A100", 9 | "authorship_tag": "ABX9TyNpXKh6UnRNOEiFiEI1AhKQ", 10 | "include_colab_link": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | }, 16 | "language_info": { 17 | "name": "python" 18 | }, 19 | "accelerator": "GPU" 20 | }, 21 | "cells": [ 22 | { 23 | "cell_type": "markdown", 24 | "metadata": { 25 | "id": "view-in-github", 26 | "colab_type": "text" 27 | }, 28 | "source": [ 29 | "\"Open" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": { 36 | "id": "RvgHJi2R7MUD" 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "!pip install gradio -q --upgrade" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "source": [ 46 | "!pip install spaces -q" 47 | ], 48 | "metadata": { 49 | "id": "9_Li3ql97QbS" 50 | }, 51 | "execution_count": 2, 52 | "outputs": [] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "source": [ 57 | "!git clone https://huggingface.co/spaces/fffiloni/AccDiffusion" 58 | ], 59 | "metadata": { 60 | "id": "pp9P9mYL7QYc" 61 | }, 62 | "execution_count": null, 63 | "outputs": [] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "source": [ 68 | "%cd AccDiffusion" 69 | ], 70 | "metadata": { 71 | "colab": { 72 | "base_uri": "https://localhost:8080/" 73 | }, 74 | "id": "m51rYCaO7Rq5", 75 | "outputId": "9d8ed8e7-2915-4220-d23f-f2c3e15e7727" 76 | }, 77 | "execution_count": 4, 78 | "outputs": [ 79 | { 80 | "output_type": "stream", 81 | "name": "stdout", 82 | "text": [ 83 | "/content/AccDiffusion\n" 84 | ] 85 | } 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "source": [ 91 | "!pip install -r requirements.txt" 92 | ], 93 | "metadata": { 94 | "id": "H_fPjlsR7q8b" 95 | }, 96 | "execution_count": null, 97 | "outputs": [] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "source": [ 102 | "**NOTE**: Set `share=True` in `demo.launch()` within the `app.py` file manually, before moving to the next step. 
You don't need to restart the session when prompted.\n", 103 | "Also make sure that the gradio version is updated to the latest release (`pip install gradio --upgrade`)" 104 | ], 105 | "metadata": { 106 | "id": "2q7EyKsN714d" 107 | } 108 | }, 109 | { 110 | "cell_type": "code", 111 | "source": [ 112 | "!python app.py" 113 | ], 114 | "metadata": { 115 | "id": "jsU2RlPm7vO9" 116 | }, 117 | "execution_count": null, 118 | "outputs": [] 119 | } 120 | ] 121 | } -------------------------------------------------------------------------------- /Artist_Gradio_Colab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyMpL4ZzOeDk/KgNmHcaap48", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 1, 32 | "metadata": { 33 | "colab": { 34 | "base_uri": "https://localhost:8080/" 35 | }, 36 | "id": "y_kIweQlj5k_", 37 | "outputId": "5807f89a-d2d7-4d0a-91f7-98f758bd8d4a" 38 | }, 39 | "outputs": [ 40 | { 41 | "output_type": "stream", 42 | "name": "stdout", 43 | "text": [ 44 | "Cloning into 'Artist'...\n", 45 | "remote: Enumerating objects: 51, done.\u001b[K\n", 46 | "remote: Counting objects: 100% (51/51), done.\u001b[K\n", 47 | "remote: Compressing objects: 100% (46/46), done.\u001b[K\n", 48 | "remote: Total 51 (delta 8), reused 28 (delta 0), pack-reused 0\u001b[K\n", 49 | "Receiving objects: 100% (51/51), 10.22 MiB | 24.81 MiB/s, done.\n", 50 | "Resolving deltas: 100% (8/8), done.\n" 51 | ] 52 | } 53 | ], 54 | "source": [ 55 | "!git clone https://github.com/songrise/Artist.git" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "source": [ 61 | "%cd Artist" 62 | ], 63 | "metadata": { 64 | "colab": { 65 | "base_uri": "https://localhost:8080/" 66 | }, 67 | "id": "FvQBgCD0lMA5", 68 | "outputId": "1b4468cc-61f7-48ea-be7c-58dd331bd065" 69 | }, 70 | "execution_count": 7, 71 | "outputs": [ 72 | { 73 | "output_type": "stream", 74 | "name": "stdout", 75 | "text": [ 76 | "/content/Artist\n" 77 | ] 78 | } 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "source": [ 84 | "!pip install clip==0.1.0 diffusers==0.26.3 einops==0.8.0 omegaconf==2.3.0 gradio -q" 85 | ], 86 | "metadata": { 87 | "id": "1X5UI7SjkTwJ" 88 | }, 89 | "execution_count": null, 90 | "outputs": [] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "source": [ 95 | "**NOTE**: Set `share=True` in `demo.launch()` within the `injection_main.py` file manually, before moving to the next step. You don't need to restart the session when prompted. 
Also make sure that the gradio version is updated to the latest release (`pip install gradio --upgrade`)" 96 | ], 97 | "metadata": { 98 | "id": "mC5ljY6Gkrvi" 99 | } 100 | }, 101 | { 102 | "cell_type": "code", 103 | "source": [ 104 | "!python injection_main.py --mode app" 105 | ], 106 | "metadata": { 107 | "id": "LKlx4zg5kICl" 108 | }, 109 | "execution_count": null, 110 | "outputs": [] 111 | } 112 | ] 113 | } -------------------------------------------------------------------------------- /DDColor_Gradio_Demo_Colab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "machine_shape": "hm", 8 | "gpuType": "V100", 9 | "authorship_tag": "ABX9TyMVvgYrgosDq5X9UWaUxX0b", 10 | "include_colab_link": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | }, 16 | "language_info": { 17 | "name": "python" 18 | }, 19 | "accelerator": "GPU" 20 | }, 21 | "cells": [ 22 | { 23 | "cell_type": "markdown", 24 | "metadata": { 25 | "id": "view-in-github", 26 | "colab_type": "text" 27 | }, 28 | "source": [ 29 | "\"Open" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "source": [ 35 | "!git clone https://github.com/piddnad/DDColor.git" 36 | ], 37 | "metadata": { 38 | "id": "svmR1KiCyrFC" 39 | }, 40 | "execution_count": null, 41 | "outputs": [] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "source": [ 46 | "%cd DDColor" 47 | ], 48 | "metadata": { 49 | "colab": { 50 | "base_uri": "https://localhost:8080/" 51 | }, 52 | "id": "vxsER1SRzKRF", 53 | "outputId": "5cb5e427-83de-4a1f-de4e-d10126a97547" 54 | }, 55 | "execution_count": 2, 56 | "outputs": [ 57 | { 58 | "output_type": "stream", 59 | "name": "stdout", 60 | "text": [ 61 | "/content/DDColor\n" 62 | ] 63 | } 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "source": [ 69 | "!pip install -r requirements.txt" 70 | ], 71 | "metadata": { 72 | "id": "1DXDJQk1yj5V" 73 | }, 74 | "execution_count": null, 75 | "outputs": [] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "source": [ 80 | "!pip install gradio -q" 81 | ], 82 | "metadata": { 83 | "id": "VTA1TwvFk5KN" 84 | }, 85 | "execution_count": null, 86 | "outputs": [] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "source": [ 91 | "!pip install timm gradio_imageslider -q" 92 | ], 93 | "metadata": { 94 | "id": "3q2P7gxy628B" 95 | }, 96 | "execution_count": null, 97 | "outputs": [] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": { 103 | "id": "zL98iRYByFjA" 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "!pip install modelscope -q" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "source": [ 113 | "from modelscope.hub.snapshot_download import snapshot_download\n", 114 | "\n", 115 | "model_dir = snapshot_download('damo/cv_ddcolor_image-colorization', cache_dir='./modelscope')\n", 116 | "print('model assets saved to %s'%model_dir)" 117 | ], 118 | "metadata": { 119 | "colab": { 120 | "base_uri": "https://localhost:8080/" 121 | }, 122 | "id": "bGEePNw3ye84", 123 | "outputId": "e993f1e9-5975-4e66-f026-d9ec86e58b49" 124 | }, 125 | "execution_count": 6, 126 | "outputs": [ 127 | { 128 | "output_type": "stream", 129 | "name": "stderr", 130 | "text": [ 131 | "2024-01-19 03:47:46,140 - modelscope - INFO - PyTorch version 2.1.0+cu121 Found.\n", 132 | "2024-01-19 03:47:46,144 - modelscope - INFO - TensorFlow version 2.15.0 Found.\n", 133 | "2024-01-19 03:47:46,145 - modelscope - INFO - 
Loading ast index from /root/.cache/modelscope/ast_indexer\n", 134 | "2024-01-19 03:47:46,147 - modelscope - INFO - No valid ast index found from /root/.cache/modelscope/ast_indexer, generating ast index from prebuilt!\n", 135 | "2024-01-19 03:47:46,210 - modelscope - INFO - Loading done! Current index file version is 1.11.0, with md5 d1bf70dc840bbc78c6b639e013c9762e and a total number of 953 components indexed\n", 136 | "2024-01-19 03:47:50,058 - modelscope - WARNING - Model revision not specified, use revision: v1.02\n", 137 | "Downloading: 100%|██████████| 1.39k/1.39k [00:00<00:00, 6.37MB/s]\n", 138 | "Downloading: 100%|██████████| 235k/235k [00:00<00:00, 7.11MB/s]\n", 139 | "Downloading: 100%|██████████| 199k/199k [00:00<00:00, 6.00MB/s]\n", 140 | "Downloading: 100%|██████████| 94.9k/94.9k [00:00<00:00, 5.95MB/s]\n", 141 | "Downloading: 100%|██████████| 117k/117k [00:00<00:00, 4.92MB/s]\n", 142 | "Downloading: 100%|█████████▉| 870M/870M [00:13<00:00, 67.0MB/s]\n", 143 | "Downloading: 100%|██████████| 3.44k/3.44k [00:00<00:00, 13.7MB/s]" 144 | ] 145 | }, 146 | { 147 | "output_type": "stream", 148 | "name": "stdout", 149 | "text": [ 150 | "model assets saved to ./modelscope/damo/cv_ddcolor_image-colorization\n" 151 | ] 152 | }, 153 | { 154 | "output_type": "stream", 155 | "name": "stderr", 156 | "text": [ 157 | "\n" 158 | ] 159 | } 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "source": [ 165 | "import sys\n", 166 | "sys.path.append('/DDColor')\n", 167 | "\n", 168 | "import argparse\n", 169 | "import cv2\n", 170 | "import numpy as np\n", 171 | "import os\n", 172 | "from tqdm import tqdm\n", 173 | "import torch\n", 174 | "from basicsr.archs.ddcolor_arch import DDColor\n", 175 | "import torch.nn.functional as F\n", 176 | "\n", 177 | "import gradio as gr\n", 178 | "from gradio_imageslider import ImageSlider\n", 179 | "import uuid\n", 180 | "from PIL import Image\n", 181 | "\n", 182 | "model_path = 'modelscope/damo/cv_ddcolor_image-colorization/pytorch_model.pt'\n", 183 | "input_size = 512\n", 184 | "model_size = 'large'\n", 185 | "\n", 186 | "\n", 187 | "# Create Image Colorization Pipeline\n", 188 | "class ImageColorizationPipeline(object):\n", 189 | "\n", 190 | " def __init__(self, model_path, input_size=256, model_size='large'):\n", 191 | "\n", 192 | " self.input_size = input_size\n", 193 | " if torch.cuda.is_available():\n", 194 | " self.device = torch.device('cuda')\n", 195 | " else:\n", 196 | " self.device = torch.device('cpu')\n", 197 | "\n", 198 | " if model_size == 'tiny':\n", 199 | " self.encoder_name = 'convnext-t'\n", 200 | " else:\n", 201 | " self.encoder_name = 'convnext-l'\n", 202 | "\n", 203 | " self.decoder_type = \"MultiScaleColorDecoder\"\n", 204 | "\n", 205 | " if self.decoder_type == 'MultiScaleColorDecoder':\n", 206 | " self.model = DDColor(\n", 207 | " encoder_name=self.encoder_name,\n", 208 | " decoder_name='MultiScaleColorDecoder',\n", 209 | " input_size=[self.input_size, self.input_size],\n", 210 | " num_output_channels=2,\n", 211 | " last_norm='Spectral',\n", 212 | " do_normalize=False,\n", 213 | " num_queries=100,\n", 214 | " num_scales=3,\n", 215 | " dec_layers=9,\n", 216 | " ).to(self.device)\n", 217 | " else:\n", 218 | " self.model = DDColor(\n", 219 | " encoder_name=self.encoder_name,\n", 220 | " decoder_name='SingleColorDecoder',\n", 221 | " input_size=[self.input_size, self.input_size],\n", 222 | " num_output_channels=2,\n", 223 | " last_norm='Spectral',\n", 224 | " do_normalize=False,\n", 225 | " num_queries=256,\n", 226 | " 
).to(self.device)\n", 227 | "\n", 228 | " self.model.load_state_dict(\n", 229 | " torch.load(model_path, map_location=torch.device('cpu'))['params'],\n", 230 | " strict=False)\n", 231 | " self.model.eval()\n", 232 | "\n", 233 | " @torch.no_grad()\n", 234 | " def process(self, img):\n", 235 | " self.height, self.width = img.shape[:2]\n", 236 | " img = (img / 255.0).astype(np.float32)\n", 237 | " orig_l = cv2.cvtColor(img, cv2.COLOR_BGR2Lab)[:, :, :1] # (h, w, 1)\n", 238 | "\n", 239 | " # resize rgb image -> lab -> get grey -> rgb\n", 240 | " img = cv2.resize(img, (self.input_size, self.input_size))\n", 241 | " img_l = cv2.cvtColor(img, cv2.COLOR_BGR2Lab)[:, :, :1]\n", 242 | " img_gray_lab = np.concatenate((img_l, np.zeros_like(img_l), np.zeros_like(img_l)), axis=-1)\n", 243 | " img_gray_rgb = cv2.cvtColor(img_gray_lab, cv2.COLOR_LAB2RGB)\n", 244 | "\n", 245 | " tensor_gray_rgb = torch.from_numpy(img_gray_rgb.transpose((2, 0, 1))).float().unsqueeze(0).to(self.device)\n", 246 | " output_ab = self.model(tensor_gray_rgb).cpu() # (1, 2, self.height, self.width)\n", 247 | "\n", 248 | " # resize ab -> concat original l -> rgb\n", 249 | " output_ab_resize = F.interpolate(output_ab, size=(self.height, self.width))[0].float().numpy().transpose(1, 2, 0)\n", 250 | " output_lab = np.concatenate((orig_l, output_ab_resize), axis=-1)\n", 251 | " output_bgr = cv2.cvtColor(output_lab, cv2.COLOR_LAB2BGR)\n", 252 | "\n", 253 | " output_img = (output_bgr * 255.0).round().astype(np.uint8)\n", 254 | "\n", 255 | " return output_img\n", 256 | "\n", 257 | "\n", 258 | "# Initialize\n", 259 | "colorizer = ImageColorizationPipeline(model_path=model_path,\n", 260 | " input_size=input_size,\n", 261 | " model_size=model_size)\n", 262 | "\n", 263 | "\n", 264 | "# Create inference function for gradio app\n", 265 | "def colorize(img):\n", 266 | " image_out = colorizer.process(img)\n", 267 | " # Generate a unique filename using UUID\n", 268 | " unique_imgfilename = str(uuid.uuid4()) + '.png'\n", 269 | " cv2.imwrite(unique_imgfilename, image_out)\n", 270 | " return (img, unique_imgfilename)\n", 271 | "\n", 272 | "\n", 273 | "# Gradio demo using the Image-Slider custom component\n", 274 | "with gr.Blocks() as demo:\n", 275 | " with gr.Row():\n", 276 | " with gr.Column():\n", 277 | " bw_image = gr.Image(label='Black and White Input Image')\n", 278 | " btn = gr.Button('Convert using DDColor')\n", 279 | " with gr.Column():\n", 280 | " col_image_slider = ImageSlider(position=0.5,\n", 281 | " label='Colored Image with Slider-view')\n", 282 | "\n", 283 | " btn.click(colorize, bw_image, col_image_slider)\n", 284 | "demo.launch()" 285 | ], 286 | "metadata": { 287 | "id": "e3TU4y5G7QQJ" 288 | }, 289 | "execution_count": 11, 290 | "outputs": [] 291 | } 292 | ] 293 | } -------------------------------------------------------------------------------- /DeepSeek_VL_Gradio_Multimodal.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "machine_shape": "hm", 8 | "gpuType": "T4", 9 | "authorship_tag": "ABX9TyMzH9lflqv3ES+7PA10A68Y", 10 | "include_colab_link": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | }, 16 | "language_info": { 17 | "name": "python" 18 | }, 19 | "accelerator": "GPU" 20 | }, 21 | "cells": [ 22 | { 23 | "cell_type": "markdown", 24 | "metadata": { 25 | "id": "view-in-github", 26 | "colab_type": "text" 27 | }, 28 | "source": [ 29 | 
"\"Open" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "source": [ 35 | "# DeepSeek-VL with Gradio Multimodal Chatbots" 36 | ], 37 | "metadata": { 38 | "id": "BUbCHXLb68Q-" 39 | } 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "source": [ 44 | "## Install gradio" 45 | ], 46 | "metadata": { 47 | "id": "51T5zswe7C4A" 48 | } 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 1, 53 | "metadata": { 54 | "colab": { 55 | "base_uri": "https://localhost:8080/" 56 | }, 57 | "id": "JxeH3BnFv3w_", 58 | "outputId": "5724edc4-9fb2-4bf2-a8c8-e73e247abbd4" 59 | }, 60 | "outputs": [ 61 | { 62 | "output_type": "stream", 63 | "name": "stdout", 64 | "text": [ 65 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m17.0/17.0 MB\u001b[0m \u001b[31m81.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 66 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m92.1/92.1 kB\u001b[0m \u001b[31m12.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 67 | "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 68 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m310.7/310.7 kB\u001b[0m \u001b[31m32.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 69 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.6/75.6 kB\u001b[0m \u001b[31m7.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 70 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m138.5/138.5 kB\u001b[0m \u001b[31m17.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 71 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.9/7.9 MB\u001b[0m \u001b[31m94.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 72 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.6/60.6 kB\u001b[0m \u001b[31m7.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 73 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m129.9/129.9 kB\u001b[0m \u001b[31m14.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 74 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m77.8/77.8 kB\u001b[0m \u001b[31m10.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 75 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m7.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 76 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m71.5/71.5 kB\u001b[0m \u001b[31m9.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 77 | "\u001b[?25h Building wheel for ffmpy (setup.py) ... \u001b[?25l\u001b[?25hdone\n" 78 | ] 79 | } 80 | ], 81 | "source": [ 82 | "!pip install gradio -q\n", 83 | "import gradio as gr" 84 | ] 85 | }, 86 | { 87 | "cell_type": "markdown", 88 | "source": [ 89 | "## Clone DeepSeek-VL lib" 90 | ], 91 | "metadata": { 92 | "id": "hLflWXMi7GF9" 93 | } 94 | }, 95 | { 96 | "cell_type": "code", 97 | "source": [ 98 | "!git clone https://github.com/deepseek-ai/DeepSeek-VL\n", 99 | "%cd DeepSeek-VL\n", 100 | "\n", 101 | "!pip install -e ." 
102 | ], 103 | "metadata": { 104 | "id": "QvFPT0jCwID9" 105 | }, 106 | "execution_count": null, 107 | "outputs": [] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "source": [ 112 | "#no\n", 113 | "!pip install typing-extensions" 114 | ], 115 | "metadata": { 116 | "colab": { 117 | "base_uri": "https://localhost:8080/" 118 | }, 119 | "id": "r09WOj4JxWAa", 120 | "outputId": "8cdadafb-57ad-40e9-df06-a6d1e5fe8c6c" 121 | }, 122 | "execution_count": 3, 123 | "outputs": [ 124 | { 125 | "output_type": "stream", 126 | "name": "stdout", 127 | "text": [ 128 | "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (4.10.0)\n" 129 | ] 130 | } 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "source": [ 136 | "## Import libraries" 137 | ], 138 | "metadata": { 139 | "id": "9U8eTz1x7WHF" 140 | } 141 | }, 142 | { 143 | "cell_type": "code", 144 | "source": [ 145 | "import collections.abc\n", 146 | "import sys\n", 147 | "import torch\n", 148 | "from transformers import AutoModelForCausalLM\n", 149 | "\n", 150 | "# Temporarily fix the ImportError for collections.Mapping\n", 151 | "sys.modules['collections.Mapping'] = collections.abc.Mapping\n", 152 | "sys.modules['collections.MutableMapping'] = collections.abc.MutableMapping\n", 153 | "sys.modules['collections.Sequence'] = collections.abc.Sequence\n", 154 | "\n", 155 | "# Now try importing the deepseek_vl package\n", 156 | "from deepseek_vl.models import VLChatProcessor, MultiModalityCausalLM\n", 157 | "from deepseek_vl.utils.io import load_pil_images\n" 158 | ], 159 | "metadata": { 160 | "colab": { 161 | "base_uri": "https://localhost:8080/" 162 | }, 163 | "id": "dJ7btZpwyZj4", 164 | "outputId": "df200778-b7db-4f92-fb0b-01b73e72ffb4" 165 | }, 166 | "execution_count": 5, 167 | "outputs": [ 168 | { 169 | "output_type": "stream", 170 | "name": "stderr", 171 | "text": [ 172 | "/usr/local/lib/python3.10/dist-packages/torchvision/io/image.py:13: UserWarning: Failed to load image Python extension: '/usr/local/lib/python3.10/dist-packages/torchvision/image.so: undefined symbol: _ZN3c1017RegisterOperatorsD1Ev'If you don't plan on using image functionality from `torchvision.io`, you can ignore this warning. Otherwise, there might be something wrong with your environment. 
Did you have `libjpeg` or `libpng` installed before building `torchvision` from source?\n", 173 | " warn(\n" 174 | ] 175 | } 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "source": [ 181 | "## Download and initialize 1.3b VLM chat model" 182 | ], 183 | "metadata": { 184 | "id": "1Y4G2n3J73VG" 185 | } 186 | }, 187 | { 188 | "cell_type": "code", 189 | "source": [ 190 | "# specify the path to the model\n", 191 | "model_path = \"deepseek-ai/deepseek-vl-1.3b-chat\"\n", 192 | "vl_chat_processor: VLChatProcessor = VLChatProcessor.from_pretrained(model_path)\n", 193 | "tokenizer = vl_chat_processor.tokenizer\n", 194 | "\n", 195 | "vl_gpt: MultiModalityCausalLM = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True)\n", 196 | "vl_gpt = vl_gpt.to(torch.bfloat16).cuda().eval()" 197 | ], 198 | "metadata": { 199 | "colab": { 200 | "base_uri": "https://localhost:8080/" 201 | }, 202 | "id": "qr4A8QE6wBog", 203 | "outputId": "cb5eb520-ac22-46d5-c3fc-d4e68aba9ba9" 204 | }, 205 | "execution_count": 7, 206 | "outputs": [ 207 | { 208 | "output_type": "stream", 209 | "name": "stderr", 210 | "text": [ 211 | "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:88: UserWarning: \n", 212 | "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", 213 | "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", 214 | "You will be able to reuse this secret in all of your notebooks.\n", 215 | "Please note that authentication is recommended but still optional to access public models or datasets.\n", 216 | " warnings.warn(\n", 217 | "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n" 218 | ] 219 | } 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "source": [ 225 | "### Chat Prompt" 226 | ], 227 | "metadata": { 228 | "id": "mHGTZYhH8tFQ" 229 | } 230 | }, 231 | { 232 | "cell_type": "code", 233 | "source": [ 234 | "conversation = [\n", 235 | " {\n", 236 | " \"role\": \"User\",\n", 237 | " \"content\": \"Describe each stage of this image.\",\n", 238 | " \"images\": [\"/content/DeepSeek-VL/images/training_pipelines.png\"]\n", 239 | " },\n", 240 | " {\n", 241 | " \"role\": \"Assistant\",\n", 242 | " \"content\": \"\"\n", 243 | " }\n", 244 | "]" 245 | ], 246 | "metadata": { 247 | "id": "ER9by_xpwOJl" 248 | }, 249 | "execution_count": 8, 250 | "outputs": [] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "source": [ 255 | "### Define graio predict function for chat inference" 256 | ], 257 | "metadata": { 258 | "id": "cVzuUw8O89S7" 259 | } 260 | }, 261 | { 262 | "cell_type": "code", 263 | "source": [ 264 | "\n", 265 | "def deepseekvl(messages):\n", 266 | "\n", 267 | " pil_images = load_pil_images(messages)\n", 268 | " prepare_inputs = vl_chat_processor(\n", 269 | " conversations=messages,\n", 270 | " images=pil_images,\n", 271 | " force_batchify=True\n", 272 | " ).to(vl_gpt.device)\n", 273 | " # run image encoder to get the image embeddings\n", 274 | " inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)\n", 275 | " # run the model to get the response\n", 276 | " outputs = vl_gpt.language_model.generate(\n", 277 | " inputs_embeds=inputs_embeds,\n", 278 | " attention_mask=prepare_inputs.attention_mask,\n", 279 | " pad_token_id=tokenizer.eos_token_id,\n", 280 | " bos_token_id=tokenizer.bos_token_id,\n", 281 | " 
eos_token_id=tokenizer.eos_token_id,\n", 282 | " max_new_tokens=512,\n", 283 | " do_sample=False,\n", 284 | " use_cache=True)\n", 285 | "\n", 286 | " answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)\n", 287 | " return answer\n", 288 | "\n", 289 | "\n", 290 | "def bot(msg, history):\n", 291 | "\n", 292 | " messages = []\n", 293 | " history = history + [[msg, None]]\n", 294 | " history_len = len(history)\n", 295 | " image_flag=False\n", 296 | " for idx, (user_message, assistant_message) in enumerate(history):\n", 297 | "\n", 298 | " if idx == (history_len-1):\n", 299 | " break\n", 300 | " if image_flag:\n", 301 | " image_flag=False\n", 302 | " continue\n", 303 | " if isinstance(user_message, tuple):\n", 304 | " if idx == (history_len-2):\n", 305 | " messages.append({\"role\": \"user\",\n", 306 | " \"content\": f\"{msg}\",\n", 307 | " \"images\": [user_message[0]],})\n", 308 | " messages.append({\"role\": \"Assistant\", \"content\": \"\"})\n", 309 | " image_flag=True\n", 310 | " response = deepseekvl(messages)\n", 311 | " history[-1][1] = response\n", 312 | " return history, ''\n", 313 | " else:\n", 314 | " messages.append({\"role\": \"user\",\n", 315 | " \"content\": f\"{history[idx+1][0]}\",\n", 316 | " \"images\": [user_message[0]],})\n", 317 | " image_flag=True\n", 318 | " else:\n", 319 | " messages.append({\"role\": \"user\", \"content\": user_message})\n", 320 | " messages.append({\"role\": \"assistant\", \"content\": assistant_message})\n", 321 | "\n", 322 | " messages.append({\"role\": \"user\", \"content\": msg})\n", 323 | " messages.append({\"role\": \"Assistant\", \"content\": \"\"})\n", 324 | " response = deepseekvl(messages)\n", 325 | " history[-1][1] = response\n", 326 | " return history, ''\n" 327 | ], 328 | "metadata": { 329 | "id": "Q6P-J92N_CMQ" 330 | }, 331 | "execution_count": 10, 332 | "outputs": [] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "source": [ 337 | "### Run this only if you get the below error while running inference in Gradio chatbot:\n", 338 | "\n", 339 | "> RuntimeError: cutlassF: no kernel found to launch\n", 340 | "\n", 341 | "Restart the session after this" 342 | ], 343 | "metadata": { 344 | "id": "fE-bNGma-QnY" 345 | } 346 | }, 347 | { 348 | "cell_type": "code", 349 | "source": [ 350 | "!pip install torch --upgrade" 351 | ], 352 | "metadata": { 353 | "id": "PwSC98Rx46cZ" 354 | }, 355 | "execution_count": null, 356 | "outputs": [] 357 | }, 358 | { 359 | "cell_type": "markdown", 360 | "source": [ 361 | "## **Gradio layout**" 362 | ], 363 | "metadata": { 364 | "id": "N37UYyXj-ann" 365 | } 366 | }, 367 | { 368 | "cell_type": "code", 369 | "source": [ 370 | "import gradio as gr\n", 371 | "import os\n", 372 | "import time\n", 373 | "\n", 374 | "# Chatbot demo with multimodal input\n", 375 | "\n", 376 | "def print_like_dislike(x: gr.LikeData):\n", 377 | " # for now just print the messages liked or disliked by a user\n", 378 | " # you can implement a more complex workflow here as well\n", 379 | " print(x.index, x.value, x.liked)\n", 380 | "\n", 381 | "# upload file to chatbot\n", 382 | "def add_file(history, file):\n", 383 | " history = history + [[(file.name,), None]]\n", 384 | " return history\n", 385 | "\n", 386 | "\n", 387 | "with gr.Blocks(fill_height=True) as demo:\n", 388 | " gr.HTML(\"\"\"

DeepSeek-VL (1.3B Parameters)

\"\"\")\n", 389 | " chatbot = gr.Chatbot(\n", 390 | " [],\n", 391 | " elem_id=\"chatbot\",\n", 392 | " bubble_full_width=False,\n", 393 | " #avatar_images=(None, (\"/content/deepseeklogo.png\")),\n", 394 | " )\n", 395 | "\n", 396 | " with gr.Row():\n", 397 | " txt = gr.Textbox(\n", 398 | " scale=4,\n", 399 | " show_label=False,\n", 400 | " placeholder=\"Enter text and press enter, or upload an image\",\n", 401 | " container=False,\n", 402 | " )\n", 403 | " btn = gr.UploadButton(\"📁\", file_types=[\"image\", \"video\", \"audio\"])\n", 404 | "\n", 405 | " txt_msg = txt.submit(bot, [txt, chatbot], [chatbot,txt] )\n", 406 | "\n", 407 | " txt_msg.then(lambda: gr.Textbox(interactive=True), None, [txt], queue=False)\n", 408 | " file_msg = btn.upload(add_file, [chatbot, btn], [chatbot], queue=False)\n", 409 | "\n", 410 | " chatbot.like(print_like_dislike, None, None)\n", 411 | "\n", 412 | "\n", 413 | "demo.queue()\n", 414 | "demo.launch(debug=True,)\n", 415 | "\n" 416 | ], 417 | "metadata": { 418 | "colab": { 419 | "base_uri": "https://localhost:8080/", 420 | "height": 690 421 | }, 422 | "id": "aJNFRshN7vW7", 423 | "outputId": "d66eb7ed-0149-4928-e4e8-d94516a9d977" 424 | }, 425 | "execution_count": 11, 426 | "outputs": [ 427 | { 428 | "metadata": { 429 | "tags": null 430 | }, 431 | "name": "stdout", 432 | "output_type": "stream", 433 | "text": [ 434 | "Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).\n", 435 | "\n", 436 | "Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().\n", 437 | "Running on public URL: https://c07cb48b1f0ba7903b.gradio.live\n", 438 | "\n", 439 | "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)\n" 440 | ] 441 | }, 442 | { 443 | "data": { 444 | "text/html": [ 445 | "
" 446 | ], 447 | "text/plain": [ 448 | "" 449 | ] 450 | }, 451 | "metadata": {}, 452 | "output_type": "display_data" 453 | }, 454 | { 455 | "output_type": "stream", 456 | "name": "stdout", 457 | "text": [ 458 | "Keyboard interruption in main thread... closing server.\n", 459 | "Killing tunnel 127.0.0.1:7860 <> https://c07cb48b1f0ba7903b.gradio.live\n" 460 | ] 461 | }, 462 | { 463 | "output_type": "execute_result", 464 | "data": { 465 | "text/plain": [] 466 | }, 467 | "metadata": {}, 468 | "execution_count": 11 469 | } 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "source": [ 475 | "9" 476 | ], 477 | "metadata": { 478 | "id": "EJJoECBC603K" 479 | }, 480 | "execution_count": null, 481 | "outputs": [] 482 | } 483 | ] 484 | } -------------------------------------------------------------------------------- /FluxDEV_with_ControlNet_Canny_and_Gradio.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "machine_shape": "hm", 8 | "gpuType": "A100", 9 | "authorship_tag": "ABX9TyNymxXiMAZ70gTHRW6iD8Y2", 10 | "include_colab_link": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | }, 16 | "language_info": { 17 | "name": "python" 18 | }, 19 | "accelerator": "GPU" 20 | }, 21 | "cells": [ 22 | { 23 | "cell_type": "markdown", 24 | "metadata": { 25 | "id": "view-in-github", 26 | "colab_type": "text" 27 | }, 28 | "source": [ 29 | "\"Open" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": { 36 | "id": "evkv8iPXUi0n" 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "!pip install gradio -q" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "source": [ 46 | "!git clone https://huggingface.co/spaces/DamarJati/FLUX.1-DEV-Canny" 47 | ], 48 | "metadata": { 49 | "id": "3cQiiu2dVVUP" 50 | }, 51 | "execution_count": null, 52 | "outputs": [] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "source": [ 57 | "%cd FLUX.1-DEV-Canny" 58 | ], 59 | "metadata": { 60 | "colab": { 61 | "base_uri": "https://localhost:8080/" 62 | }, 63 | "id": "JBVZxgLBVgus", 64 | "outputId": "b7f576fe-9f27-475f-d80a-c568ed23d767" 65 | }, 66 | "execution_count": 3, 67 | "outputs": [ 68 | { 69 | "output_type": "stream", 70 | "name": "stdout", 71 | "text": [ 72 | "/content/FLUX.1-DEV-Canny\n" 73 | ] 74 | } 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "source": [ 80 | "#You can either install this library or comment out the Spaces code from the app.py\n", 81 | "!pip install spaces -q" 82 | ], 83 | "metadata": { 84 | "id": "ecUaFYVdWDT9" 85 | }, 86 | "execution_count": 5, 87 | "outputs": [] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "source": [ 92 | "!pip install -r requirements.txt" 93 | ], 94 | "metadata": { 95 | "id": "wD6AKmW6VlSO" 96 | }, 97 | "execution_count": null, 98 | "outputs": [] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "source": [ 103 | "**NOTE**: Set `share=True` in `demo.launch()` within the app.py file manually, before moving to the next step. You don't need to restart the session when prompted. Also make sure that the gradio version is updated to the latest release (`pip install gradio --upgrade`)" 104 | ], 105 | "metadata": { 106 | "id": "uEhY9a2oWZtp" 107 | } 108 | }, 109 | { 110 | "cell_type": "markdown", 111 | "source": [ 112 | "You'll also need to provide your Huggingface Hub token when prompted." 
113 | ], 114 | "metadata": { 115 | "id": "eYBRV1IgZNuP" 116 | } 117 | }, 118 | { 119 | "cell_type": "code", 120 | "source": [ 121 | "!python app.py" 122 | ], 123 | "metadata": { 124 | "id": "ISL4nGJnV8zc" 125 | }, 126 | "execution_count": null, 127 | "outputs": [] 128 | } 129 | ] 130 | } -------------------------------------------------------------------------------- /IMAGDressing_v1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "machine_shape": "hm", 8 | "gpuType": "A100", 9 | "authorship_tag": "ABX9TyON7FnTA/yXOnIZQlOWT0pn", 10 | "include_colab_link": true 11 | }, 12 | "kernelspec": { 13 | "name": "python3", 14 | "display_name": "Python 3" 15 | }, 16 | "language_info": { 17 | "name": "python" 18 | }, 19 | "accelerator": "GPU" 20 | }, 21 | "cells": [ 22 | { 23 | "cell_type": "markdown", 24 | "metadata": { 25 | "id": "view-in-github", 26 | "colab_type": "text" 27 | }, 28 | "source": [ 29 | "\"Open" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": { 36 | "id": "hFwbOXwPu9KJ" 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "!pip install gradio -q" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "source": [ 46 | "!pip install spaces -q" 47 | ], 48 | "metadata": { 49 | "id": "EynUR9Y9xMPz" 50 | }, 51 | "execution_count": null, 52 | "outputs": [] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "source": [ 57 | "!git clone https://huggingface.co/spaces/feishen29/IMAGDressing-v1" 58 | ], 59 | "metadata": { 60 | "id": "ST4rxYvpvTXa" 61 | }, 62 | "execution_count": null, 63 | "outputs": [] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "source": [ 68 | "%cd IMAGDressing-v1" 69 | ], 70 | "metadata": { 71 | "colab": { 72 | "base_uri": "https://localhost:8080/" 73 | }, 74 | "id": "Te_SsNK5vXMC", 75 | "outputId": "0e470bf3-a628-4378-d59e-ae64818f0a88" 76 | }, 77 | "execution_count": 4, 78 | "outputs": [ 79 | { 80 | "output_type": "stream", 81 | "name": "stdout", 82 | "text": [ 83 | "/content/IMAGDressing-v1\n" 84 | ] 85 | } 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "source": [ 91 | "!pip install -r requirements.txt" 92 | ], 93 | "metadata": { 94 | "id": "CyTB2LH4vbES" 95 | }, 96 | "execution_count": null, 97 | "outputs": [] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "source": [ 102 | "**NOTE**: Set `share=True` in `demo.launch()` within the `app.py` file manually, before moving to next step. You don't need to restart the session when prompted." 
103 | ], 104 | "metadata": { 105 | "id": "CLmHM1ZzwLGU" 106 | } 107 | }, 108 | { 109 | "cell_type": "code", 110 | "source": [ 111 | "!python app.py" 112 | ], 113 | "metadata": { 114 | "id": "G6cpBapRv3qK" 115 | }, 116 | "execution_count": null, 117 | "outputs": [] 118 | } 119 | ] 120 | } -------------------------------------------------------------------------------- /OpenAI_TTS_with_Gradio.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyPRVtEaH34+TT4gp7DgGQjP", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": { 33 | "id": "poKnX2s19gpr" 34 | }, 35 | "outputs": [], 36 | "source": [ 37 | "!pip install --upgrade gradio -q" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "source": [ 43 | "!pip install openai -q" 44 | ], 45 | "metadata": { 46 | "id": "Xo9Liwhx9syB" 47 | }, 48 | "execution_count": null, 49 | "outputs": [] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "source": [ 54 | "import gradio as gr\n", 55 | "import os\n", 56 | "import tempfile\n", 57 | "from openai import OpenAI" 58 | ], 59 | "metadata": { 60 | "id": "Ec__dI8vAxzv" 61 | }, 62 | "execution_count": 3, 63 | "outputs": [] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "source": [ 68 | "# Set an environment variable for key\n", 69 | "os.environ['OPENAI_API_KEY'] = \"sk-...\" # Enter your openai api key here\n", 70 | "\n", 71 | "client = OpenAI() # add api_key\n", 72 | "\n", 73 | "def tts(text, model, voice):\n", 74 | " response = client.audio.speech.create(\n", 75 | " model=model, #\"tts-1\",\"tts-1-hd\"\n", 76 | " voice=voice, #'alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'\n", 77 | " input=text,\n", 78 | " )\n", 79 | "\n", 80 | " # Create a temp file to save the audio\n", 81 | " with tempfile.NamedTemporaryFile(suffix=\".mp3\", delete=False) as temp_file:\n", 82 | " temp_file.write(response.content)\n", 83 | "\n", 84 | " # Get the file path of the temp file\n", 85 | " temp_file_path = temp_file.name\n", 86 | "\n", 87 | " return temp_file_path\n", 88 | "\n", 89 | "\n", 90 | "with gr.Blocks() as demo:\n", 91 | " gr.Markdown(\"#
OpenAI Text-To-Speech API with Gradio
\")\n", 92 | " with gr.Row():\n", 93 | " model = gr.Dropdown(choices=['tts-1','tts-1-hd'], label='Model', value='tts-1')\n", 94 | " voice = gr.Dropdown(choices=['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'], label='Voice Options', value='alloy')\n", 95 | "\n", 96 | " text = gr.Textbox(label=\"Input text\", placeholder=\"Input text and press the Text-To-Speech button or press Enter.\")\n", 97 | " btn = gr.Button(\"Text-To-Speech\")\n", 98 | " output_audio = gr.Audio(label=\"Speech Output\")\n", 99 | "\n", 100 | " text.submit(fn=tts, inputs=[text, model, voice], outputs=output_audio, api_name=\"tts_1\", concurrency_limit=None)\n", 101 | " btn.click(fn=tts, inputs=[text, model, voice], outputs=output_audio, api_name=\"tts_2\", concurrency_limit=None)\n", 102 | "\n", 103 | "demo.launch()" 104 | ], 105 | "metadata": { 106 | "colab": { 107 | "base_uri": "https://localhost:8080/", 108 | "height": 628 109 | }, 110 | "id": "7JSzZvjB9ver", 111 | "outputId": "a0114a75-fbe7-4052-b272-5d49fc8f0f3e" 112 | }, 113 | "execution_count": 5, 114 | "outputs": [ 115 | { 116 | "output_type": "stream", 117 | "name": "stdout", 118 | "text": [ 119 | "Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).\n", 120 | "\n", 121 | "Colab notebook detected. To show errors in colab notebook, set debug=True in launch()\n", 122 | "Running on public URL: https://492ba4182a3c570c11.gradio.live\n", 123 | "\n", 124 | "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)\n" 125 | ] 126 | }, 127 | { 128 | "output_type": "display_data", 129 | "data": { 130 | "text/plain": [ 131 | "" 132 | ], 133 | "text/html": [ 134 | "
" 135 | ] 136 | }, 137 | "metadata": {} 138 | }, 139 | { 140 | "output_type": "execute_result", 141 | "data": { 142 | "text/plain": [] 143 | }, 144 | "metadata": {}, 145 | "execution_count": 5 146 | } 147 | ] 148 | } 149 | ] 150 | } -------------------------------------------------------------------------------- /PhotoMaker_Demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "view-in-github", 7 | "colab_type": "text" 8 | }, 9 | "source": [ 10 | "\"Open" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "source": [ 16 | "!git clone https://github.com/TencentARC/PhotoMaker.git\n", 17 | "%cd PhotoMaker\n", 18 | "# Install requirements\n", 19 | "!pip install -r requirements.txt\n", 20 | "# Install photomaker\n", 21 | "!pip install git+https://github.com/TencentARC/PhotoMaker.git\n", 22 | "\n", 23 | "from photomaker import PhotoMakerStableDiffusionXLPipeline\n", 24 | "from huggingface_hub import hf_hub_download\n", 25 | "import torch\n", 26 | "import os\n", 27 | "from diffusers.utils import load_image\n", 28 | "from diffusers import EulerDiscreteScheduler\n", 29 | "from photomaker import PhotoMakerStableDiffusionXLPipeline\n", 30 | "from PIL import Image\n", 31 | "import gradio as gr\n", 32 | "\n", 33 | "photomaker_path = hf_hub_download(repo_id=\"TencentARC/PhotoMaker\", filename=\"photomaker-v1.bin\", repo_type=\"model\")\n", 34 | "base_model_path = 'SG161222/RealVisXL_V3.0'\n", 35 | "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n", 36 | "\n", 37 | "### Load base model\n", 38 | "pipe = PhotoMakerStableDiffusionXLPipeline.from_pretrained(\n", 39 | " base_model_path, # can change to any base model based on SDXL\n", 40 | " torch_dtype=torch.bfloat16,\n", 41 | " use_safetensors=True,\n", 42 | " variant=\"fp16\"\n", 43 | ").to(device)\n", 44 | "\n", 45 | "### Load PhotoMaker checkpoint\n", 46 | "pipe.load_photomaker_adapter(\n", 47 | " os.path.dirname(photomaker_path),\n", 48 | " subfolder=\"\",\n", 49 | " weight_name=os.path.basename(photomaker_path),\n", 50 | " trigger_word=\"img\" # define the trigger word\n", 51 | ")\n", 52 | "pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)" 53 | ], 54 | "metadata": { 55 | "id": "Bprrd6l026v8" 56 | }, 57 | "execution_count": null, 58 | "outputs": [] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "source": [ 63 | "def generate_image(files, prompt, negative_prompt):\n", 64 | " # load the input ID images\n", 65 | " files_list = []\n", 66 | " for f in files:\n", 67 | " img = Image.open(f.name)\n", 68 | " files_list.append(img)\n", 69 | " # Note that the trigger word `img` must follow the class word for personalization\n", 70 | " generator = torch.Generator(device=device).manual_seed(42)\n", 71 | " images = pipe(\n", 72 | " prompt=prompt,\n", 73 | " input_id_images=files_list,\n", 74 | " negative_prompt=negative_prompt,\n", 75 | " num_images_per_prompt=1,\n", 76 | " num_inference_steps=50,\n", 77 | " start_merge_step=10,\n", 78 | " generator=generator,).images[0]\n", 79 | " return [images]" 80 | ], 81 | "metadata": { 82 | "id": "vaFMd4YDUfgu" 83 | }, 84 | "execution_count": 19, 85 | "outputs": [] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "source": [ 90 | "logo = r\"\"\"
PhotoMaker logo
\"\"\"\n", 91 | "title = r\"\"\"

PhotoMaker: Customizing Realistic Human Photos via Stacked ID Embedding

\"\"\"\n", 92 | "with gr.Blocks() as demo:\n", 93 | " with gr.Row():\n", 94 | " gr.Markdown(title)\n", 95 | " gr.Markdown(logo)\n", 96 | " with gr.Row():\n", 97 | " files = gr.Files(\n", 98 | " label=\"Drag (Select) 1 or more photos of your face\",\n", 99 | " file_types=[\"image\"])\n", 100 | " gallery = gr.Gallery(label=\"Generated Images\")\n", 101 | " with gr.Row():\n", 102 | " prompt = gr.Textbox(label=\"Prompt\",\n", 103 | " info=\"Try something like 'a photo of a man/woman img', 'img' is the trigger word.\",\n", 104 | " placeholder=\"A photo of a [man/woman img]...\")\n", 105 | " negative_prompt = gr.Textbox(\n", 106 | " label=\"Negative Prompt\",\n", 107 | " placeholder=\"low quality\",\n", 108 | " value=\"nsfw, lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry\",)\n", 109 | " submit = gr.Button(\"Submit\")\n", 110 | "\n", 111 | "\n", 112 | " submit.click(generate_image, [files, prompt, negative_prompt], [gallery])\n", 113 | "\n", 114 | "demo.launch()" 115 | ], 116 | "metadata": { 117 | "colab": { 118 | "base_uri": "https://localhost:8080/", 119 | "height": 646 120 | }, 121 | "id": "4qu3zIr_QNYL", 122 | "outputId": "9405dfa2-ef80-4371-a9e4-2f8b828a785c" 123 | }, 124 | "execution_count": 21, 125 | "outputs": [ 126 | { 127 | "output_type": "stream", 128 | "name": "stdout", 129 | "text": [ 130 | "Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).\n", 131 | "\n", 132 | "Colab notebook detected. To show errors in colab notebook, set debug=True in launch()\n", 133 | "Running on public URL: https://1443074086d29b3276.gradio.live\n", 134 | "\n", 135 | "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)\n" 136 | ] 137 | }, 138 | { 139 | "output_type": "display_data", 140 | "data": { 141 | "text/plain": [ 142 | "" 143 | ], 144 | "text/html": [ 145 | "
" 146 | ] 147 | }, 148 | "metadata": {} 149 | }, 150 | { 151 | "output_type": "execute_result", 152 | "data": { 153 | "text/plain": [] 154 | }, 155 | "metadata": {}, 156 | "execution_count": 21 157 | } 158 | ] 159 | } 160 | ], 161 | "metadata": { 162 | "colab": { 163 | "provenance": [], 164 | "machine_shape": "hm", 165 | "gpuType": "V100", 166 | "authorship_tag": "ABX9TyM9Cs+Imsu6rOs34njO9WoP", 167 | "include_colab_link": true 168 | }, 169 | "kernelspec": { 170 | "display_name": "Python 3", 171 | "name": "python3" 172 | }, 173 | "language_info": { 174 | "name": "python" 175 | }, 176 | "accelerator": "GPU" 177 | }, 178 | "nbformat": 4, 179 | "nbformat_minor": 0 180 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # How To Deploy Your Machine Learning Models Online Using Huggingface & Gradio Powered Spaces 2 | 3 | 4 | ![](/images/gradioapp3.jpg) 5 | 6 | ## Inspiration 7 | I built a fun little ML powered app over the Christmas holidays, hope you have fun using it as much as I did while building it. User Research (*LOL*): The inspiration of building a tool like this comes from my mother's need of understanding *Whatsapp image forwards* which have English text written over them. I believe there are others as well who face the same struggle while trying to understand the daily fowards which are not in their regional language. Egro, added the support for 6 international languages. 8 | 9 | Spaces is a new and extremly useful tool to deploy or showcase your ML apps to the world. You can refer these videos - [Build and Deploy a Machine Learning App in 2 Minutes](https://www.youtube.com/watch?v=3bSVKNKb_PY) or [Building Machine Learning Applications Fast](https://www.youtube.com/watch?v=c7mle2yYpwQ&t=738s) released by Huggingface, to get more idea about it. Also, please refer this wonderful [blogpost](https://huggingface.co/blog/gradio-spaces) on how you can use HuggingFace Spaces and Gradio in matter of few lines of code. 10 | 11 | In this article I'll try and explain how I build this fun app and how you can build one too. Let's Go! 12 | 13 | 14 | ## Table of Content 15 | #### 1. HuggingFace introduction 16 | #### 2. Gradio introducttion 17 | #### 3. What I built 18 | #### 4. How I built it 19 | #### 5. How you can access the app 20 | #### 6. Conclusion 21 | 22 | 23 | 24 | ## Huggingface 25 | HuggingFace is a startup in the AI field, and there mission is to democratize good machine learning. Its an AI community trying to build the future in which everyone has equal opportunity and access to benfits of latest advances in AI. You can either browse their [model hub](https://huggingface.co/models) to discover, experiment and contribute to new sota models, for example, [gooogle-tapas](https://huggingface.co/google/tapas-base), [distilbert](https://huggingface.co/distilbert-base-uncased), [facebook-wav2vec2](https://huggingface.co/facebook/wav2vec2-base-960h), and so on. Or you can directly use their inference API to serve your moddels directly from HF infrastructure. The most famous artifact that HF has created so far is their Transformer library, which started as an nlp library but now has support for other modalities as well. Now, Transformer provides thousands of pretrained models to perform tasks on different modalities such as text, vision, and audio. 
26 | 
27 | 
28 | ## Gradio
29 | [Gradio](https://gradio.app/) is the fastest and easiest way to demo your ML models with a very friendly and feature-rich UI. Almost anyone can use it without a manual and with just a little intuition. You can install the Gradio library easily using pip. I used both Hugging Face and Gradio on Colab, so installation was all the more straightforward. You can deploy your ML model online using just 5-10 lines of code, depending on the complexity of your implementation. Recently, Gradio has made it possible to embed your deployed ML model in any webpage of your choice. I have done the same at the end of this article; check it out. Gradio helps you generate a public link for your deployed ML model/app, which you can then share with your friends, colleagues at work, or a potential employer or collaborator.
30 | 
31 | 
32 | ## What I Built
33 | I built a fun project over the last couple of days using HuggingFace and Gradio functionalities. This project employs image analysis, language translation, and OCR techniques. A user can select an image of their choice with some English text over it as an input. For example, an image with some motivational text written over it, like the ones we all receive in our family Whatsapp groups all the time. The user then selects one of the given 7 languages as the output language - German, Spanish, French, Turkish, Hindi, Arabic, and Irish. The app then outputs the same image as the input, but with the text now translated into the language selected by the user.
34 | 
35 | 
36 | ## How I Built It
37 | I am using pytesseract to perform OCR on the input image. Once I have the text extracted from the input image, I use the HuggingFace transformers library to load the desired translation model and tokenizer for inference. These translation models are open-sourced by the [Language Technology Research Group at the University of Helsinki](https://blogs.helsinki.fi/language-technology/), and you can access their account page and pre-trained models on HuggingFace's [website](https://huggingface.co/Helsinki-NLP). The extracted text is then translated into the selected language. For example, if you have selected German as the output language, the app will load the "Helsinki-NLP/opus-mt-en-de" translation model from the hub and translate the OCR-extracted English text to German.
38 | 
39 | Next, I am using the [Keras-OCR](https://github.com/faustomorales/keras-ocr) library to extract the coordinates of the English text in the original input image. This library is based on Keras CRNN, or the [Keras implementation of Convolutional Recurrent Neural Network for Text Recognition](https://github.com/janzd/CRNN). Once I have these coordinates, I clean the text off the image using OpenCV and Pillow with just a couple of lines of code. This cleaning is inspired by [Carlo Borella's incredible post](https://towardsdatascience.com/remove-text-from-images-using-cv2-and-keras-ocr-24e7612ae4f4).
40 | 
41 | After this, the next step is to copy the translated text onto the 'cleaned' image. The current implementation does not paste the translated text exactly in place of the original English text; however, I plan to do that and more in my next iterations.
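The sketch below shows a minimal version of the OCR and translation steps described above, using `pytesseract` and the Helsinki-NLP models via the `transformers` library (the same calls used in this repo's notebook). The helper name `ocr_and_translate`, the fixed English-to-German model choice, and the sample file name are illustrative assumptions, not the exact app code.

```python
# Minimal sketch of the OCR + translation steps (illustrative, not the full app).
import pytesseract
from PIL import Image
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Load the English->German model once, outside the inference function,
# so repeated calls don't re-initialize it.
model_name = "Helsinki-NLP/opus-mt-en-de"  # swap the suffix for another target language
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

def ocr_and_translate(image_path):
    # 1. OCR: pull the English text off the image.
    text = pytesseract.image_to_string(Image.open(image_path))
    text = " ".join(text.split())  # collapse line breaks in the OCR output
    # 2. Translate: tokenize, generate, decode.
    input_ids = tokenizer.encode(text, return_tensors="pt", padding=True)
    outputs = model.generate(input_ids)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

print(ocr_and_translate("forwarded_quote.png"))  # hypothetical input image
```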
42 | 
43 | 
44 | 
45 | ## How You Can Access It
46 | My HuggingFace - Gradio app can be accessed from my account page on their website; it is accessible to the public and is available over here - [Translate English Text to Your Regional Language In Your Forwarded Images](https://huggingface.co/spaces/ysharma/TranslateQuotesInImageForwards).
47 | A demo is provided in the form of an animation below.
48 | 
49 | ![](/images/gif1.gif)
50 | 
51 | 
52 | ## Conclusion
53 | ## Benefits
54 | [HuggingFace Spaces](https://huggingface.co/spaces) is a cool new feature where anyone can host their AI models using two awesome SDKs - [Streamlit](https://streamlit.io/) and Gradio. Spaces are a simple way to host ML demo apps directly on your HF profile page or your organization's profile. This empowers the ML community to create our own little ML project portfolios, showcase our projects at conferences, to stakeholders, or to any interested parties, and work collaboratively with other people in the ecosystem.
55 | 
56 | ## Ease of use
57 | A few points to keep in mind for a smoother experience while building a complex Gradio app like this one -
58 | 
59 | * All the required libraries should be listed in the *requirements.txt* file
60 | * In case you have some *Debian dependencies* that you would normally install with sudo apt install, make sure you list them in the *packages.txt* file
61 | * Make sure you copy all the supporting files (images/fonts) over to your *app space repo*
62 | * Comment the code that you submit in the *app.py* file appropriately
63 | * Try to load your model and tokenizers outside the inference calls made from gradio.Interface(). This helps in speeding up your app's response to users (see the short sketch further below)
64 | * This app's *app.py* code can serve as inspiration in case you want to have multiple and different types of inputs and outputs (image/text/radio box, *etc.*). It took me a while to figure out the right way.
65 | 
66 | ## Awesome Community
67 | At the end of the day, strong community support helps you learn about cool new avenues, understand hard concepts, resolve your issues, and stay motivated to improve yourself and your skills.
68 | 
69 | There are many incredible folks out there building and helping ML communities day in, day out. I would like to take a moment and specially thank a few folks for all the effort that they put in daily. A lot of effort goes into replying to call-outs for help, writing easy-to-follow blogs, inspiring others by showcasing their own ML work, and everything else they do to put themselves out there. Reach out to them on Twitter and Discord over here --
70 | 
71 | * [Abubakar Abid](https://twitter.com/abidlabs), [Ali](https://twitter.com/si3luwa), and [AK](https://twitter.com/ak92501) of Gradio labs
72 | * [Merve Noyan](https://twitter.com/mervenoyann) and [Omar](https://twitter.com/osanseviero) from HuggingFace
73 | * Everyone who is active on the [HuggingFace Discord community](http://hf.co/join/discord)
74 | 
75 | ## Future work
76 | The app is still a bit rough around the edges and I plan to improve it in future iterations; for example, right now it might not process certain screenshots well, or images in which the text is slanted a bit. I am planning to enable OCR for slanted text and for images in which text is present in multiple places. I will also be adding more languages to the mix. And lastly, I will try to insert the translated text at the same spot as in the original image, and in a similar font style and font size.
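To make the Gradio-related tips from the "Ease of use" list above concrete, here is a minimal, hedged sketch of the app wiring: mixed input types (an image plus a radio box for the target language) and a callback that does only per-request work, with any heavy models created at module level as sketched earlier. The function name, labels, and the placeholder text output (the real app returns an image) are illustrative assumptions, not the actual app.py code.

```python
# Sketch of the Gradio wiring: mixed input types, per-request work only in the callback.
import gradio as gr
import pytesseract
from PIL import Image

LANGUAGES = ["german", "spanish", "french", "turkish", "hindi", "arabic", "irish"]

def translate_forward(image_path, language):
    # OCR happens per request; the translation model itself would already be
    # loaded at module level (see the earlier sketch), not re-loaded here.
    text = pytesseract.image_to_string(Image.open(image_path))
    return f"[{language}] {text}"  # placeholder; the real app pastes text back onto the image

demo = gr.Interface(
    fn=translate_forward,
    inputs=[gr.Image(type="filepath", label="Image forward"),
            gr.Radio(LANGUAGES, label="Output language")],
    outputs=gr.Textbox(label="Translated text"),
)
demo.launch()
```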
77 | 78 | ## Lastmile AI 79 | Gradio helps in bridging the gap between developing your ML models and showcasing them to the world. In my humble opinion, this is a crucial step in two main themes of this new year - Democratizing AI and Productioninzing AI. 80 | 81 | 82 | My github repo and code can be accessed over here - [HugginFace_Gradio](https://github.com/yvrjsharma/HugginFace_Gradio/blob/main/Whatsapp_Image_Forwards_In_Your_Language_GradioDemo.ipynb). 83 | 84 | *If you enjoyed this article, please feel free to connect with me on [LinkedIn](https://www.linkedin.com/in/yuvraj-sharma-a7154628/) or [Twitter](https://twitter.com/yvrjsharma) and do share your feedback and any other ML app ideas that you would want to implement yourself, I will be happy to help as much as I can.* 85 | 86 | 87 | Image source - Photo by Michał Kubalczyk on Unsplash 88 | -------------------------------------------------------------------------------- /Video_Editing_using_Automatic_Speech_Recognition.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Video Editing using Automatic Speech Recognition", 7 | "provenance": [], 8 | "authorship_tag": "ABX9TyOu8u7ORzrK7gbCughb9MHD", 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "language_info": { 16 | "name": "python" 17 | } 18 | }, 19 | "cells": [ 20 | { 21 | "cell_type": "markdown", 22 | "metadata": { 23 | "id": "view-in-github", 24 | "colab_type": "text" 25 | }, 26 | "source": [ 27 | "\"Open" 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": null, 33 | "metadata": { 34 | "id": "1ilMbvf2WZuG" 35 | }, 36 | "outputs": [], 37 | "source": [ 38 | "" 39 | ] 40 | } 41 | ] 42 | } -------------------------------------------------------------------------------- /Whatsapp_Image_Forwards_In_Your_Language_GradioDemo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "view-in-github", 7 | "colab_type": "text" 8 | }, 9 | "source": [ 10 | "\"Open" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": { 16 | "id": "poAXVJMx97gV" 17 | }, 18 | "source": [ 19 | "# **Translating Text from Images**" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "source": [ 25 | "# Run this cell to mount your Google Drive.\n", 26 | "from google.colab import drive\n", 27 | "drive.flush_and_unmount()\n", 28 | "drive.mount('/content/gdrive/')\n" 29 | ], 30 | "metadata": { 31 | "colab": { 32 | "base_uri": "https://localhost:8080/" 33 | }, 34 | "id": "nBn2lVFlIqYr", 35 | "outputId": "4c70c35e-9619-47b9-ccfd-a1fa5f4d1652" 36 | }, 37 | "execution_count": 2, 38 | "outputs": [ 39 | { 40 | "output_type": "stream", 41 | "name": "stdout", 42 | "text": [ 43 | "Drive not mounted, so nothing to flush and unmount.\n", 44 | "Mounted at /content/gdrive/\n" 45 | ] 46 | } 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": { 53 | "id": "3mRXLtkP97ga" 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "#Installing required libraries\n", 58 | "!pip install datasets transformers[sentencepiece]\n", 59 | "!pip install accelerate\n", 60 | "!apt install git-lfs\n", 61 | "\n", 62 | "!sudo apt install tesseract-ocr\n", 63 | "!pip install pytesseract\n", 64 | "!pip install keras-ocr\n", 65 | "!pip install gradio" 66 | ] 
67 | }, 68 | { 69 | "cell_type": "code", 70 | "source": [ 71 | "#importing required libraries\n", 72 | "import pytesseract\n", 73 | "from PIL import Image, ImageFont, ImageDraw \n", 74 | "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM\n", 75 | "import matplotlib.pyplot as plt\n", 76 | "import keras_ocr\n", 77 | "import cv2\n", 78 | "import math\n", 79 | "import numpy as np\n", 80 | "import gradio as gr\n", 81 | "import numpy as np" 82 | ], 83 | "metadata": { 84 | "id": "O7jS8KEZ3HV5" 85 | }, 86 | "execution_count": 20, 87 | "outputs": [] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "source": [ 92 | "#Translated in your desired language\n", 93 | "def choose_language(language):\n", 94 | " #Support for Hindi, Spanish, French\n", 95 | " #Support for Arabic, Turish, arabic\n", 96 | " #Support for German\n", 97 | " if language == 'hindi':\n", 98 | " modelnm = \"Helsinki-NLP/opus-mt-en-hi\"\n", 99 | " elif language == 'spanish':\n", 100 | " modelnm = \"Helsinki-NLP/opus-mt-en-es\"\n", 101 | " elif language == 'german':\n", 102 | " modelnm = \"Helsinki-NLP/opus-mt-en-de\"\n", 103 | " elif language == 'french':\n", 104 | " modelnm = \"Helsinki-NLP/opus-mt-en-fr\"\n", 105 | " elif language == 'turkish':\n", 106 | " modelnm = \"Helsinki-NLP/opus-mt-en-trk\"\n", 107 | " elif language == 'arabic':\n", 108 | " modelnm = \"Helsinki-NLP/opus-mt-en-ar\"\n", 109 | " else:\n", 110 | " modelnm = \"Helsinki-NLP/opus-mt-en-ga\"\n", 111 | "\n", 112 | "\n", 113 | " tokenizer = AutoTokenizer.from_pretrained(modelnm)\n", 114 | " model = AutoModelForSeq2SeqLM.from_pretrained(modelnm)\n", 115 | "\n", 116 | " return tokenizer, model\n", 117 | "\n", 118 | "#Function to translate english text to desired language\n", 119 | "def translator(text, lang):\n", 120 | "\n", 121 | " if '\\n' in text:\n", 122 | " text_list = text.splitlines()\n", 123 | " text = ' '.join(text_list)\n", 124 | "\n", 125 | " #Huggingface transformers Magic \n", 126 | " tokenizer, model = choose_language(lang)\n", 127 | " input_ids = tokenizer.encode(text, return_tensors=\"pt\", padding=True) #Tokenizer\n", 128 | " outputs = model.generate(input_ids) #Model\n", 129 | " #Translated Text\n", 130 | " decoded_text = tokenizer.decode(outputs[0], skip_special_tokens=True) #Tokenizer\n", 131 | " return decoded_text\n" 132 | ], 133 | "metadata": { 134 | "id": "E4EgtneNElw1" 135 | }, 136 | "execution_count": 21, 137 | "outputs": [] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "source": [ 142 | "#Getting cordinates\n", 143 | "def midpoint(x1, y1, x2, y2):\n", 144 | " x_mid = int((x1 + x2)/2)\n", 145 | " y_mid = int((y1 + y2)/2)\n", 146 | " return (x_mid, y_mid)\n", 147 | "\n", 148 | "pipeline = keras_ocr.pipeline.Pipeline()\n", 149 | "\n", 150 | "#Getting cordinates for text insie image\n", 151 | "#This will help in filling up the space with colors\n", 152 | "def img_text_cords(im): #, pipeline):\n", 153 | " #read image\n", 154 | " img = keras_ocr.tools.read(im)\n", 155 | " #generate (word, box) tuples \n", 156 | " prediction_groups = pipeline.recognize([img]) \n", 157 | " mask = np.zeros(img.shape[:2], dtype=\"uint8\")\n", 158 | " for box in prediction_groups[0]:\n", 159 | " x0, y0 = box[1][0]\n", 160 | " x1, y1 = box[1][1] \n", 161 | " x2, y2 = box[1][2]\n", 162 | " x3, y3 = box[1][3] \n", 163 | " \n", 164 | " x_mid0, y_mid0 = midpoint(x1, y1, x2, y2)\n", 165 | " x_mid1, y_mi1 = midpoint(x0, y0, x3, y3)\n", 166 | " \n", 167 | " thickness = int(math.sqrt( (x2 - x1)**2 + (y2 - y1)**2 ))\n", 168 | " \n", 169 | " cv2.line(mask, (x_mid0, 
y_mid0), (x_mid1, y_mi1), 255, \n", 170 | " thickness)\n", 171 | " img = cv2.inpaint(img, mask, 7, cv2.INPAINT_NS)\n", 172 | " \n", 173 | " return img \n", 174 | "\n", 175 | "#Extracting text from image\n", 176 | "def text_extract(im):\n", 177 | " #Using pytesseract to read text\n", 178 | " ocr_text = pytesseract.image_to_string(im)\n", 179 | " return ocr_text\n", 180 | "\n", 181 | "#Formatting the text to multi lines structure\n", 182 | "#This is mainly for translated text to look and fit better on an image\n", 183 | "def format_text(language,extracted_text):\n", 184 | " \n", 185 | " translated_text = translator(extracted_text, language)\n", 186 | " \n", 187 | " word_list,i = [],0\n", 188 | " for word in translated_text.split():\n", 189 | " if i%5 != 0:\n", 190 | " word_list.append(' '+word)\n", 191 | " else:\n", 192 | " word_list.append('\\n'+word)\n", 193 | " i+=1 \n", 194 | "\n", 195 | " new_title_text = ''.join(word_list)\n", 196 | " return new_title_text\n", 197 | "\n", 198 | "\n", 199 | "def translate_image(im, language):\n", 200 | " #Extract text, translate in your language and format it \n", 201 | " extracted_text = text_extract(im)\n", 202 | " #font select -- Getting Unicode Text\n", 203 | " title_font = ImageFont.truetype('/content/gdrive/My Drive/sample_images/arial-unicode-ms.ttf',30)\n", 204 | " #text to write on image #Example in hindi - Unicode text u\"आप जीवन में मिलता हर मौका ले लो, क्योंकि कुछ चीजें केवल एक बार होती हैं. शुभ सुबह\" \n", 205 | " txt = format_text(language,extracted_text)\n", 206 | "\n", 207 | " #Editing image\n", 208 | " img_returned = img_text_cords(im) \n", 209 | " img_rgb = cv2.cvtColor(img_returned, cv2.COLOR_BGR2RGB)\n", 210 | " cv2.imwrite(\"text_free_image.jpg\",img_rgb)\n", 211 | " new_image = Image.open(\"text_free_image.jpg\")\n", 212 | "\n", 213 | " #Enable writing on image\n", 214 | " image_editable = ImageDraw.Draw(new_image)\n", 215 | " image_editable.multiline_text((10,10), txt,spacing=2, font=title_font, fill= (0, 0, 0)) #(237, 230, 211)) (0, 0, 0))\n", 216 | " return new_image\n" 217 | ], 218 | "metadata": { 219 | "colab": { 220 | "base_uri": "https://localhost:8080/" 221 | }, 222 | "id": "cmov6DmdfH_-", 223 | "outputId": "2a31006b-5029-445e-a118-8fce3846ca27" 224 | }, 225 | "execution_count": 22, 226 | "outputs": [ 227 | { 228 | "output_type": "stream", 229 | "name": "stdout", 230 | "text": [ 231 | "Looking for /root/.keras-ocr/craft_mlt_25k.h5\n", 232 | "Looking for /root/.keras-ocr/crnn_kurapan.h5\n" 233 | ] 234 | } 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "source": [ 240 | "title = \"Translate English Text to Your Regional Language In Your Forwarded Images\"\n", 241 | "description = \"This fun Gradio demo is for translating English quote in an image (usually whatsapp forwards :) ) to your local or preferred language. To use it, simply upload your image, select one of the language choices given (hindi, spanish, german, french, arabic, irish, and turkish) from radio buttons provided. You can alternately click one of the examples to load them and select the language choice along with it.\"\n", 242 | "article = \"
Image Text Translate by Yuvraj S | Github Repo |
visitor badge
\"\n", 243 | "pipeline = keras_ocr.pipeline.Pipeline()\n", 244 | "gr.Interface(\n", 245 | " translate_image, \n", 246 | " [gr.inputs.Image(type=\"filepath\", label=\"Input\"), gr.inputs.Radio(choices=['hindi','spanish','french','turkish','german','irish', 'arabic'], type=\"value\", default='hindi', label='Choose A Language')], \n", 247 | " gr.outputs.Image(type=\"pil\", label=\"Output\"), \"text\",\n", 248 | " title=title,\n", 249 | " description=description,\n", 250 | " article=article,\n", 251 | " #examples=[['bill.png','version 0.2'],['keanu.png','version 0.3'],['will.jpeg','version 0.2']],\n", 252 | " enable_queue=True\n", 253 | " ).launch(debug=True)" 254 | ], 255 | "metadata": { 256 | "id": "N-hI8F5L67jB" 257 | }, 258 | "execution_count": null, 259 | "outputs": [] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "source": [ 264 | "" 265 | ], 266 | "metadata": { 267 | "id": "E8U3HB7Xds3n" 268 | }, 269 | "execution_count": null, 270 | "outputs": [] 271 | } 272 | ], 273 | "metadata": { 274 | "colab": { 275 | "name": "Whatsapp_Image_Forwards_In_Your_Language_GradioDemo.ipynb", 276 | "provenance": [], 277 | "collapsed_sections": [], 278 | "include_colab_link": true 279 | }, 280 | "language_info": { 281 | "name": "python" 282 | }, 283 | "kernelspec": { 284 | "name": "python3", 285 | "display_name": "Python 3" 286 | }, 287 | "accelerator": "GPU" 288 | }, 289 | "nbformat": 4, 290 | "nbformat_minor": 0 291 | } -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | #Import libraries 2 | import pytesseract 3 | from PIL import Image, ImageFont, ImageDraw 4 | from transformers import AutoTokenizer, AutoModelForSeq2SeqLM 5 | import matplotlib.pyplot as plt 6 | import keras_ocr 7 | import cv2 8 | import math 9 | import numpy as np 10 | import gradio as gr 11 | import numpy as np 12 | #Support for Hindi, Spanish, French, Arabic, Turish, Gailec/Irish, and German 13 | #'hindi': 14 | tokenizerhi = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-hi") 15 | modelhi = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-hi") 16 | #'spanish': 17 | tokenizeres = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-es") 18 | modeles = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-es") 19 | #'german': 20 | tokenizerde = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de") 21 | modelde = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-de") 22 | #'french': 23 | tokenizerfr = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-fr") 24 | modelfr = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-fr") 25 | #'turkish': 26 | tokenizertrk = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-trk") 27 | modeltrk = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-trk") 28 | #'arabic': 29 | tokenizerar = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ar") 30 | modelar = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-ar") 31 | #Irish /Gaelish 32 | tokenizerga = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-ga") 33 | modelga = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-ga") 34 | #Translate in your desired language 35 | def choose_language(language): 36 | #Loading the tokenizers and trained models 37 | if language == 'hindi': 38 | tokenizer, model = tokenizerhi, modelhi 39 | elif language == 'spanish': 40 | tokenizer, model = tokenizeres, modeles 41 | elif 
language == 'german': 42 | tokenizer, model = tokenizerde, modelde 43 | elif language == 'french': 44 | tokenizer, model = tokenizerfr, modelfr 45 | elif language == 'turkish': 46 | tokenizer, model = tokenizertrk, modeltrk 47 | elif language == 'arabic': 48 | tokenizer, model = tokenizerar, modelar 49 | else: 50 | tokenizer, model = tokenizerga, modelga 51 | return tokenizer, model 52 | #Function to translate english text to desired language 53 | def translator(text, lang): 54 | if '\n' in text: 55 | text_list = text.splitlines() 56 | text = ' '.join(text_list) 57 | #Huggingface transformers Magic 58 | tokenizer, model = choose_language(lang) 59 | input_ids = tokenizer.encode(text, return_tensors="pt", padding=True) #Tokenizer 60 | outputs = model.generate(input_ids) #Model 61 | #Translated Text 62 | decoded_text = tokenizer.decode(outputs[0], skip_special_tokens=True) #Tokenizer 63 | return decoded_text 64 | #Getting cordinates 65 | def midpoint(x1, y1, x2, y2): 66 | x_mid = int((x1 + x2)/2) 67 | y_mid = int((y1 + y2)/2) 68 | return (x_mid, y_mid) 69 | pipeline = keras_ocr.pipeline.Pipeline() 70 | #Getting cordinates for text insie image 71 | #This will help in filling up the space with colors 72 | def img_text_cords(im): #, pipeline): 73 | #read image 74 | img = keras_ocr.tools.read(im) 75 | #generate (word, box) tuples 76 | prediction_groups = pipeline.recognize([img]) 77 | mask = np.zeros(img.shape[:2], dtype="uint8") 78 | for box in prediction_groups[0]: 79 | x0, y0 = box[1][0] 80 | x1, y1 = box[1][1] 81 | x2, y2 = box[1][2] 82 | x3, y3 = box[1][3] 83 | 84 | x_mid0, y_mid0 = midpoint(x1, y1, x2, y2) 85 | x_mid1, y_mi1 = midpoint(x0, y0, x3, y3) 86 | 87 | thickness = int(math.sqrt( (x2 - x1)**2 + (y2 - y1)**2 )) 88 | 89 | cv2.line(mask, (x_mid0, y_mid0), (x_mid1, y_mi1), 255, 90 | thickness) 91 | img = cv2.inpaint(img, mask, 7, cv2.INPAINT_NS) 92 | 93 | return img 94 | #Extracting text from image 95 | def text_extract(im): 96 | #Using pytesseract to read text 97 | ocr_text = pytesseract.image_to_string(im) 98 | return ocr_text 99 | #Formatting the text to multi lines structure 100 | #This is mainly for translated text to look and fit better on an image 101 | def format_text(language,extracted_text): 102 | 103 | translated_text = translator(extracted_text, language) 104 | 105 | word_list,i = [],0 106 | for word in translated_text.split(): 107 | if i%5 != 0: 108 | word_list.append(' '+word) 109 | else: 110 | word_list.append('\n'+word) 111 | i+=1 112 | new_title_text = ''.join(word_list) 113 | return new_title_text 114 | def translate_image(im, language): 115 | #Extract text, translate in your language and format it 116 | extracted_text = text_extract(im) 117 | #font select -- Getting Unicode Text 118 | title_font = ImageFont.truetype('./arial-unicode-ms.ttf',30) 119 | #text to write on image #Example in hindi - Unicode text u"आप जीवन में मिलता हर मौका ले लो, क्योंकि कुछ चीजें केवल एक बार होती हैं. शुभ सुबह" 120 | txt = format_text(language,extracted_text) 121 | #Editing image 122 | img_returned = img_text_cords(im) 123 | img_rgb = cv2.cvtColor(img_returned, cv2.COLOR_BGR2RGB) 124 | cv2.imwrite("text_free_image.jpg",img_rgb) 125 | new_image = Image.open("text_free_image.jpg") 126 | #Enable writing on image 127 | image_editable = ImageDraw.Draw(new_image) 128 | image_editable.multiline_text((10,10), txt,spacing=2, font=title_font, fill= (237, 230, 211)) # Text color e.g. 
(0, 0, 0)) blacks 129 | return new_image 130 | title = "Translate English Text to Your Regional Language In Your Forwarded Images" 131 | description = "This fun Gradio demo is for translating English quote in an image (usually whatsapp forwards :) ) to your local or preferred language. To use it, simply upload your image, select one of the language choices given (hindi, spanish, german, french, arabic, irish, and turkish) from radio buttons provided. You can alternately click one of the examples to load them and select the language choice along with it." 132 | article = "
Image Text Translate by Yuvraj S | Github Repo |
visitor badge
" 133 | pipeline = keras_ocr.pipeline.Pipeline() 134 | gr.Interface( 135 | translate_image, 136 | [gr.inputs.Image(type="filepath", label="Input"), gr.inputs.Radio(choices=['hindi','spanish','french','turkish','german','irish', 'arabic'], type="value", default='hindi', label='Choose A Language')], 137 | gr.outputs.Image(type="pil", label="Output"), 138 | title=title, 139 | description=description, 140 | article=article, 141 | examples=[['quote1.jpg','german'], ['en2.jpg','hindi'],['gm1.jpg','french'],['quotes6.jpg','spanish']], 142 | enable_queue=True 143 | ).launch(debug=True) 144 | -------------------------------------------------------------------------------- /images/gif1.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yvrjsharma/HugginFace_Gradio/720943cdd3f4f0da7915875ce074285f5f40bff3/images/gif1.gif -------------------------------------------------------------------------------- /images/gradioapp3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yvrjsharma/HugginFace_Gradio/720943cdd3f4f0da7915875ce074285f5f40bff3/images/gradioapp3.jpg -------------------------------------------------------------------------------- /index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | import gradio as gr 9 | 10 | def greet(name): 11 | return "Hello, " + name + "!" 12 | 13 | gr.Interface(greet, "textbox", "textbox").launch() 14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /nougat.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "view-in-github", 7 | "colab_type": "text" 8 | }, 9 | "source": [ 10 | "\"Open" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "source": [ 16 | "# **Nougat** : Neural Optical Understanding for Academic Documents\n", 17 | "# **A Gradio Demo**\n", 18 | "\n", 19 | "## Lukas Blecher et al. [Paper](https://arxiv.org/pdf/2308.13418.pdf), [Project](https://facebookresearch.github.io/nougat/)\n", 20 | "\n" 21 | ], 22 | "metadata": { 23 | "id": "3IGr7-SPuivC" 24 | } 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "source": [ 29 | "### Installing the required libraries" 30 | ], 31 | "metadata": { 32 | "id": "o1psC42ludfh" 33 | } 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 1, 38 | "metadata": { 39 | "colab": { 40 | "base_uri": "https://localhost:8080/" 41 | }, 42 | "id": "wFetOZtjXT4D", 43 | "outputId": "e20caad0-6539-474c-a765-eaf40044e952" 44 | }, 45 | "outputs": [ 46 | { 47 | "output_type": "stream", 48 | "name": "stdout", 49 | "text": [ 50 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m20.1/20.1 MB\u001b[0m \u001b[31m83.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 51 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m66.2/66.2 kB\u001b[0m \u001b[31m8.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 52 | "\u001b[?25h Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", 53 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m298.2/298.2 kB\u001b[0m \u001b[31m34.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 54 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.4/75.4 kB\u001b[0m \u001b[31m10.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 55 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m268.8/268.8 kB\u001b[0m \u001b[31m31.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 56 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m139.9/139.9 kB\u001b[0m \u001b[31m19.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 57 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.7/45.7 kB\u001b[0m \u001b[31m5.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 58 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m59.5/59.5 kB\u001b[0m \u001b[31m7.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 59 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m129.9/129.9 kB\u001b[0m \u001b[31m15.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 60 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m7.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 61 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.0/67.0 kB\u001b[0m \u001b[31m8.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 62 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m74.5/74.5 kB\u001b[0m \u001b[31m9.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 63 | "\u001b[?25h Building wheel for ffmpy (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n" 64 | ] 65 | } 66 | ], 67 | "source": [ 68 | "!pip install gradio --q\n", 69 | "import gradio as gr" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "source": [ 75 | "!pip install nougat-ocr" 76 | ], 77 | "metadata": { 78 | "colab": { 79 | "base_uri": "https://localhost:8080/" 80 | }, 81 | "id": "VTVyJd43TJjh", 82 | "outputId": "048a4a98-afe1-4505-dd47-6e81c89fe11c" 83 | }, 84 | "execution_count": 2, 85 | "outputs": [ 86 | { 87 | "output_type": "stream", 88 | "name": "stdout", 89 | "text": [ 90 | "Collecting nougat-ocr\n", 91 | " Downloading nougat_ocr-0.1.2-py3-none-any.whl (71 kB)\n", 92 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m71.9/71.9 kB\u001b[0m \u001b[31m881.2 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 93 | "\u001b[?25hCollecting transformers>=4.25.1 (from nougat-ocr)\n", 94 | " Downloading transformers-4.32.1-py3-none-any.whl (7.5 MB)\n", 95 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.5/7.5 MB\u001b[0m \u001b[31m45.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 96 | "\u001b[?25hCollecting timm==0.5.4 (from nougat-ocr)\n", 97 | " Downloading timm-0.5.4-py3-none-any.whl (431 kB)\n", 98 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m431.5/431.5 kB\u001b[0m \u001b[31m40.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 99 | "\u001b[?25hRequirement already satisfied: orjson in /usr/local/lib/python3.10/dist-packages (from nougat-ocr) (3.9.5)\n", 100 | "Requirement already satisfied: opencv-python-headless in /usr/local/lib/python3.10/dist-packages (from nougat-ocr) (4.8.0.76)\n", 101 | "Collecting datasets[vision] (from nougat-ocr)\n", 102 | " Downloading datasets-2.14.4-py3-none-any.whl (519 kB)\n", 103 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m519.3/519.3 kB\u001b[0m \u001b[31m40.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 104 | "\u001b[?25hCollecting pytorch-lightning>=1.8.5 (from nougat-ocr)\n", 105 | " Downloading pytorch_lightning-2.0.8-py3-none-any.whl (727 kB)\n", 106 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m727.0/727.0 kB\u001b[0m \u001b[31m19.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 107 | "\u001b[?25hRequirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (from nougat-ocr) (3.8.1)\n", 108 | "Collecting python-Levenshtein (from nougat-ocr)\n", 109 | " Downloading python_Levenshtein-0.21.1-py3-none-any.whl (9.4 kB)\n", 110 | "Collecting sentencepiece (from nougat-ocr)\n", 111 | " Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", 112 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m60.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 113 | "\u001b[?25hCollecting sconf>=0.2.3 (from nougat-ocr)\n", 114 | " Downloading sconf-0.2.5-py3-none-any.whl (8.8 kB)\n", 115 | "Requirement already satisfied: albumentations in /usr/local/lib/python3.10/dist-packages (from nougat-ocr) (1.3.1)\n", 116 | "Collecting pymupdf (from nougat-ocr)\n", 117 | " Downloading PyMuPDF-1.23.3-cp310-none-manylinux2014_x86_64.whl (4.3 MB)\n", 118 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.3/4.3 MB\u001b[0m \u001b[31m78.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 119 | "\u001b[?25hRequirement already satisfied: torch>=1.4 in /usr/local/lib/python3.10/dist-packages (from 
timm==0.5.4->nougat-ocr) (2.0.1+cu118)\n", 120 | "Requirement already satisfied: torchvision in /usr/local/lib/python3.10/dist-packages (from timm==0.5.4->nougat-ocr) (0.15.2+cu118)\n", 121 | "Requirement already satisfied: numpy>=1.17.2 in /usr/local/lib/python3.10/dist-packages (from pytorch-lightning>=1.8.5->nougat-ocr) (1.23.5)\n", 122 | "Requirement already satisfied: tqdm>=4.57.0 in /usr/local/lib/python3.10/dist-packages (from pytorch-lightning>=1.8.5->nougat-ocr) (4.66.1)\n", 123 | "Requirement already satisfied: PyYAML>=5.4 in /usr/local/lib/python3.10/dist-packages (from pytorch-lightning>=1.8.5->nougat-ocr) (6.0.1)\n", 124 | "Requirement already satisfied: fsspec[http]>2021.06.0 in /usr/local/lib/python3.10/dist-packages (from pytorch-lightning>=1.8.5->nougat-ocr) (2023.6.0)\n", 125 | "Collecting torchmetrics>=0.7.0 (from pytorch-lightning>=1.8.5->nougat-ocr)\n", 126 | " Downloading torchmetrics-1.1.1-py3-none-any.whl (763 kB)\n", 127 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m763.4/763.4 kB\u001b[0m \u001b[31m56.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 128 | "\u001b[?25hRequirement already satisfied: packaging>=17.1 in /usr/local/lib/python3.10/dist-packages (from pytorch-lightning>=1.8.5->nougat-ocr) (23.1)\n", 129 | "Requirement already satisfied: typing-extensions>=4.0.0 in /usr/local/lib/python3.10/dist-packages (from pytorch-lightning>=1.8.5->nougat-ocr) (4.7.1)\n", 130 | "Collecting lightning-utilities>=0.7.0 (from pytorch-lightning>=1.8.5->nougat-ocr)\n", 131 | " Downloading lightning_utilities-0.9.0-py3-none-any.whl (23 kB)\n", 132 | "Collecting ruamel.yaml (from sconf>=0.2.3->nougat-ocr)\n", 133 | " Downloading ruamel.yaml-0.17.32-py3-none-any.whl (112 kB)\n", 134 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m112.2/112.2 kB\u001b[0m \u001b[31m12.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 135 | "\u001b[?25hCollecting munch (from sconf>=0.2.3->nougat-ocr)\n", 136 | " Downloading munch-4.0.0-py2.py3-none-any.whl (9.9 kB)\n", 137 | "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers>=4.25.1->nougat-ocr) (3.12.2)\n", 138 | "Requirement already satisfied: huggingface-hub<1.0,>=0.15.1 in /usr/local/lib/python3.10/dist-packages (from transformers>=4.25.1->nougat-ocr) (0.16.4)\n", 139 | "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers>=4.25.1->nougat-ocr) (2023.6.3)\n", 140 | "Requirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers>=4.25.1->nougat-ocr) (2.31.0)\n", 141 | "Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers>=4.25.1->nougat-ocr)\n", 142 | " Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)\n", 143 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.8/7.8 MB\u001b[0m \u001b[31m107.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 144 | "\u001b[?25hCollecting safetensors>=0.3.1 (from transformers>=4.25.1->nougat-ocr)\n", 145 | " Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", 146 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m87.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 147 | "\u001b[?25hRequirement already satisfied: scipy>=1.1.0 in /usr/local/lib/python3.10/dist-packages (from 
albumentations->nougat-ocr) (1.10.1)\n", 148 | "Requirement already satisfied: scikit-image>=0.16.1 in /usr/local/lib/python3.10/dist-packages (from albumentations->nougat-ocr) (0.19.3)\n", 149 | "Requirement already satisfied: qudida>=0.0.4 in /usr/local/lib/python3.10/dist-packages (from albumentations->nougat-ocr) (0.0.4)\n", 150 | "Requirement already satisfied: pyarrow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets[vision]->nougat-ocr) (9.0.0)\n", 151 | "Collecting dill<0.3.8,>=0.3.0 (from datasets[vision]->nougat-ocr)\n", 152 | " Downloading dill-0.3.7-py3-none-any.whl (115 kB)\n", 153 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m16.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 154 | "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets[vision]->nougat-ocr) (1.5.3)\n", 155 | "Collecting xxhash (from datasets[vision]->nougat-ocr)\n", 156 | " Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n", 157 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m28.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 158 | "\u001b[?25hCollecting multiprocess (from datasets[vision]->nougat-ocr)\n", 159 | " Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)\n", 160 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m20.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 161 | "\u001b[?25hRequirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets[vision]->nougat-ocr) (3.8.5)\n", 162 | "Requirement already satisfied: Pillow>=6.2.1 in /usr/local/lib/python3.10/dist-packages (from datasets[vision]->nougat-ocr) (9.4.0)\n", 163 | "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk->nougat-ocr) (8.1.7)\n", 164 | "Requirement already satisfied: joblib in /usr/local/lib/python3.10/dist-packages (from nltk->nougat-ocr) (1.3.2)\n", 165 | "Collecting PyMuPDFb==1.23.3 (from pymupdf->nougat-ocr)\n", 166 | " Downloading PyMuPDFb-1.23.3-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (30.6 MB)\n", 167 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m30.6/30.6 MB\u001b[0m \u001b[31m57.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 168 | "\u001b[?25hCollecting Levenshtein==0.21.1 (from python-Levenshtein->nougat-ocr)\n", 169 | " Downloading Levenshtein-0.21.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (172 kB)\n", 170 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m172.5/172.5 kB\u001b[0m \u001b[31m24.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 171 | "\u001b[?25hCollecting rapidfuzz<4.0.0,>=2.3.0 (from Levenshtein==0.21.1->python-Levenshtein->nougat-ocr)\n", 172 | " Downloading rapidfuzz-3.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)\n", 173 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m69.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 174 | "\u001b[?25hRequirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets[vision]->nougat-ocr) (23.1.0)\n", 175 | "Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from 
aiohttp->datasets[vision]->nougat-ocr) (3.2.0)\n", 176 | "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets[vision]->nougat-ocr) (6.0.4)\n", 177 | "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets[vision]->nougat-ocr) (4.0.3)\n", 178 | "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets[vision]->nougat-ocr) (1.9.2)\n", 179 | "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets[vision]->nougat-ocr) (1.4.0)\n", 180 | "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets[vision]->nougat-ocr) (1.3.1)\n", 181 | "Requirement already satisfied: scikit-learn>=0.19.1 in /usr/local/lib/python3.10/dist-packages (from qudida>=0.0.4->albumentations->nougat-ocr) (1.2.2)\n", 182 | "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers>=4.25.1->nougat-ocr) (3.4)\n", 183 | "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers>=4.25.1->nougat-ocr) (2.0.4)\n", 184 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers>=4.25.1->nougat-ocr) (2023.7.22)\n", 185 | "Requirement already satisfied: networkx>=2.2 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.16.1->albumentations->nougat-ocr) (3.1)\n", 186 | "Requirement already satisfied: imageio>=2.4.1 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.16.1->albumentations->nougat-ocr) (2.31.1)\n", 187 | "Requirement already satisfied: tifffile>=2019.7.26 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.16.1->albumentations->nougat-ocr) (2023.8.12)\n", 188 | "Requirement already satisfied: PyWavelets>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.16.1->albumentations->nougat-ocr) (1.4.1)\n", 189 | "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.4->timm==0.5.4->nougat-ocr) (1.12)\n", 190 | "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.4->timm==0.5.4->nougat-ocr) (3.1.2)\n", 191 | "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.4->timm==0.5.4->nougat-ocr) (2.0.0)\n", 192 | "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.4->timm==0.5.4->nougat-ocr) (3.27.2)\n", 193 | "Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.4->timm==0.5.4->nougat-ocr) (16.0.6)\n", 194 | "Requirement already satisfied: python-dateutil>=2.8.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets[vision]->nougat-ocr) (2.8.2)\n", 195 | "Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.10/dist-packages (from pandas->datasets[vision]->nougat-ocr) (2023.3)\n", 196 | "Collecting ruamel.yaml.clib>=0.2.7 (from ruamel.yaml->sconf>=0.2.3->nougat-ocr)\n", 197 | " Downloading ruamel.yaml.clib-0.2.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (485 kB)\n", 198 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m485.6/485.6 kB\u001b[0m 
\u001b[31m44.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 199 | "\u001b[?25hRequirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.8.1->pandas->datasets[vision]->nougat-ocr) (1.16.0)\n", 200 | "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.19.1->qudida>=0.0.4->albumentations->nougat-ocr) (3.2.0)\n", 201 | "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.4->timm==0.5.4->nougat-ocr) (2.1.3)\n", 202 | "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.4->timm==0.5.4->nougat-ocr) (1.3.0)\n", 203 | "Installing collected packages: tokenizers, sentencepiece, safetensors, xxhash, ruamel.yaml.clib, rapidfuzz, PyMuPDFb, munch, lightning-utilities, dill, ruamel.yaml, pymupdf, multiprocess, Levenshtein, transformers, sconf, python-Levenshtein, datasets, torchmetrics, timm, pytorch-lightning, nougat-ocr\n", 204 | "Successfully installed Levenshtein-0.21.1 PyMuPDFb-1.23.3 datasets-2.14.4 dill-0.3.7 lightning-utilities-0.9.0 multiprocess-0.70.15 munch-4.0.0 nougat-ocr-0.1.2 pymupdf-1.23.3 python-Levenshtein-0.21.1 pytorch-lightning-2.0.8 rapidfuzz-3.2.0 ruamel.yaml-0.17.32 ruamel.yaml.clib-0.2.7 safetensors-0.3.3 sconf-0.2.5 sentencepiece-0.1.99 timm-0.5.4 tokenizers-0.13.3 torchmetrics-1.1.1 transformers-4.32.1 xxhash-3.3.0\n" 205 | ] 206 | } 207 | ] 208 | }, 209 | { 210 | "cell_type": "markdown", 211 | "source": [ 212 | "### Download a smaple pdf file" 213 | ], 214 | "metadata": { 215 | "id": "DHGxzqdmkbVf" 216 | } 217 | }, 218 | { 219 | "cell_type": "code", 220 | "source": [ 221 | "# Download a sample pdf file - https://arxiv.org/pdf/2308.13418.pdf (nougat paper)\n", 222 | "import requests\n", 223 | "\n", 224 | "def get_pdf(pdf_link):\n", 225 | "\n", 226 | " # Send a GET request to the PDF link\n", 227 | " response = requests.get(pdf_link)\n", 228 | "\n", 229 | " if response.status_code == 200:\n", 230 | " # Save the PDF content to a local file\n", 231 | " with open(\"nougat.pdf\", 'wb') as pdf_file:\n", 232 | " pdf_file.write(response.content)\n", 233 | " print(\"PDF downloaded successfully.\")\n", 234 | " else:\n", 235 | " print(\"Failed to download the PDF.\")\n", 236 | " return\n", 237 | "\n", 238 | "\n", 239 | "get_pdf(\"https://arxiv.org/pdf/2308.13418.pdf\")" 240 | ], 241 | "metadata": { 242 | "colab": { 243 | "base_uri": "https://localhost:8080/" 244 | }, 245 | "id": "RMsn4EE1j9Gl", 246 | "outputId": "2a085c28-d0df-4904-d4b9-5e79774d2a9f" 247 | }, 248 | "execution_count": 4, 249 | "outputs": [ 250 | { 251 | "output_type": "stream", 252 | "name": "stdout", 253 | "text": [ 254 | "PDF downloaded successfully.\n" 255 | ] 256 | } 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "source": [ 262 | "### Testing nougat OCR" 263 | ], 264 | "metadata": { 265 | "id": "ghKR79CknBcP" 266 | } 267 | }, 268 | { 269 | "cell_type": "code", 270 | "source": [ 271 | "# Testing Nougat\n", 272 | "# Use the nougat pdf file and fire up the CLI command\n", 273 | "# creating a new output directory for the mmd file\n", 274 | "! 
nougat --out /content/output pdf /content/nougat.pdf" 275 | ], 276 | "metadata": { 277 | "id": "y88ZxhOVaOXt", 278 | "colab": { 279 | "base_uri": "https://localhost:8080/" 280 | }, 281 | "outputId": "69b653ac-d8c0-42cf-d632-0a7815753989" 282 | }, 283 | "execution_count": 7, 284 | "outputs": [ 285 | { 286 | "output_type": "stream", 287 | "name": "stdout", 288 | "text": [ 289 | "downloading nougat checkpoint version 0.1.0-small to path /root/.cache/torch/hub/nougat\n", 290 | "config.json: 100% 557/557 [00:00<00:00, 2.96Mb/s]\n", 291 | "pytorch_model.bin: 100% 956M/956M [00:02<00:00, 445Mb/s]\n", 292 | "special_tokens_map.json: 100% 96.0/96.0 [00:00<00:00, 597kb/s]\n", 293 | "tokenizer.json: 100% 2.04M/2.04M [00:00<00:00, 13.4Mb/s]\n", 294 | "tokenizer_config.json: 100% 106/106 [00:00<00:00, 628kb/s]\n", 295 | "INFO:root:Output directory does not exist. Creating output directory.\n", 296 | "/usr/local/lib/python3.10/dist-packages/torch/functional.py:504: UserWarning: torch.meshgrid: in an upcoming release, it will be required to pass the indexing argument. (Triggered internally at ../aten/src/ATen/native/TensorShape.cpp:3483.)\n", 297 | " return _VF.meshgrid(tensors, **kwargs) # type: ignore[attr-defined]\n", 298 | " 0% 0/5 [00:00
Upload a PDF
',scale=1)\n", 517 | " mkd = gr.Markdown('

OR

',scale=1)\n", 518 | " mkd = gr.Markdown('

Provide a PDF link

',scale=1)\n", 519 | "\n", 520 | " with gr.Row(equal_height=True):\n", 521 | " pdf_file = gr.File(label='PDF📃', file_count='single', scale=1)\n", 522 | " pdf_link = gr.Textbox(placeholder='Enter an arxiv link here', label='PDF link🔗🌐', scale=1)\n", 523 | "\n", 524 | " with gr.Row():\n", 525 | " btn = gr.Button('Run NOUGAT🍫')\n", 526 | " clr = gr.Button('Clear🚿')\n", 527 | "\n", 528 | " output_headline = gr.Markdown(\"

PDF converted into markup language through Nougat-OCR👇:

\")\n", 529 | " parsed_output = gr.Markdown(elem_id='mkd', value='OCR Output📃🔤')\n", 530 | "\n", 531 | " btn.click(paper_read, [pdf_file, pdf_link], parsed_output )\n", 532 | " clr.click(lambda : (gr.update(value=None),\n", 533 | " gr.update(value=None),\n", 534 | " gr.update(value=None)),\n", 535 | " [],\n", 536 | " [pdf_file, pdf_link, parsed_output]\n", 537 | " )\n", 538 | "\n", 539 | " gr.Examples(\n", 540 | " [[\"/content/nougat.pdf\", \"\"], [None, \"https://arxiv.org/pdf/2308.08316.pdf\"]],\n", 541 | " inputs = [pdf_file, pdf_link],\n", 542 | " outputs = parsed_output,\n", 543 | " fn=process_example,\n", 544 | " cache_examples=True,\n", 545 | " label='Click on any examples below to get Nougat OCR results quickly:'\n", 546 | " )\n", 547 | "\n", 548 | "demo.queue()\n", 549 | "demo.launch(share=True)\n" 550 | ], 551 | "metadata": { 552 | "colab": { 553 | "base_uri": "https://localhost:8080/", 554 | "height": 750 555 | }, 556 | "id": "TnvIHr5ITJbl", 557 | "outputId": "16b2cb74-7bbe-41e7-ebf9-9e1d873587bd" 558 | }, 559 | "execution_count": 22, 560 | "outputs": [ 561 | { 562 | "output_type": "stream", 563 | "name": "stderr", 564 | "text": [ 565 | "/usr/local/lib/python3.10/dist-packages/gradio/helpers.py:223: UserWarning: Examples are being cached but not all input components have example values. This may result in an exception being thrown by your function. If you do get an error while caching examples, make sure all of your inputs have example values for all of your examples or you provide default values for those particular parameters in your function.\n", 566 | " warnings.warn(\n" 567 | ] 568 | }, 569 | { 570 | "output_type": "stream", 571 | "name": "stdout", 572 | "text": [ 573 | "Caching examples at: '/root/.cache/torch/hub/nougat/gradio_cached_examples/58'\n", 574 | "Caching example 1/2\n", 575 | "Caching example 2/2\n", 576 | "PDF downloaded successfully.\n", 577 | "Caching complete\n", 578 | "\n", 579 | "Colab notebook detected. To show errors in colab notebook, set debug=True in launch()\n", 580 | "Running on public URL: https://527b93c25e4a941714.gradio.live\n", 581 | "\n", 582 | "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)\n" 583 | ] 584 | }, 585 | { 586 | "output_type": "display_data", 587 | "data": { 588 | "text/plain": [ 589 | "" 590 | ], 591 | "text/html": [ 592 | "
" 593 | ] 594 | }, 595 | "metadata": {} 596 | }, 597 | { 598 | "output_type": "execute_result", 599 | "data": { 600 | "text/plain": [] 601 | }, 602 | "metadata": {}, 603 | "execution_count": 22 604 | } 605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "source": [ 610 | "https://arxiv.org/pdf/2309.00615.pdf" 611 | ], 612 | "metadata": { 613 | "id": "NIttIDIWxXnj" 614 | }, 615 | "execution_count": null, 616 | "outputs": [] 617 | } 618 | ], 619 | "metadata": { 620 | "accelerator": "GPU", 621 | "colab": { 622 | "provenance": [], 623 | "gpuType": "T4", 624 | "authorship_tag": "ABX9TyPFMHNaTfhUKiJYOP3V1gUh", 625 | "include_colab_link": true 626 | }, 627 | "kernelspec": { 628 | "display_name": "Python 3", 629 | "name": "python3" 630 | }, 631 | "language_info": { 632 | "name": "python" 633 | } 634 | }, 635 | "nbformat": 4, 636 | "nbformat_minor": 0 637 | } -------------------------------------------------------------------------------- /updated_smolagents_gradio_ui.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding=utf-8 3 | # Copyright 2024 The HuggingFace Inc. team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | import os 17 | import re 18 | import shutil 19 | from typing import Optional 20 | 21 | from smolagents.agent_types import AgentAudio, AgentImage, AgentText 22 | from smolagents.agents import MultiStepAgent, PlanningStep 23 | from smolagents.memory import ActionStep, FinalAnswerStep, MemoryStep 24 | from smolagents.utils import _is_package_available 25 | 26 | 27 | def get_step_footnote_content(step_log: MemoryStep, step_name: str) -> str: 28 | """Get a footnote string for a step log with duration and token information""" 29 | step_footnote = f"**{step_name}**" 30 | if hasattr(step_log, "input_token_count") and hasattr(step_log, "output_token_count"): 31 | token_str = f" | Input tokens:{step_log.input_token_count:,} | Output tokens: {step_log.output_token_count:,}" 32 | step_footnote += token_str 33 | if hasattr(step_log, "duration"): 34 | step_duration = f" | Duration: {round(float(step_log.duration), 2)}" if step_log.duration else None 35 | step_footnote += step_duration 36 | step_footnote_content = f"""{step_footnote} """ 37 | return step_footnote_content 38 | 39 | 40 | def pull_messages_from_step( 41 | step_log: MemoryStep, 42 | ): 43 | """Extract ChatMessage objects from agent steps with proper nesting""" 44 | if not _is_package_available("gradio"): 45 | raise ModuleNotFoundError( 46 | "Please install 'gradio' extra to use the GradioUI: `pip install 'smolagents[gradio]'`" 47 | ) 48 | import gradio as gr 49 | 50 | if isinstance(step_log, ActionStep): 51 | # Output the step number 52 | step_number = f"Step {step_log.step_number}" if step_log.step_number is not None else "Step" 53 | yield gr.ChatMessage(role="assistant", content=f"**{step_number}**") 54 | 55 | # First yield the thought/reasoning from the LLM 56 | if hasattr(step_log, 
"model_output") and step_log.model_output is not None: 57 | # Clean up the LLM output 58 | model_output = step_log.model_output.strip() 59 | # Remove any trailing and extra backticks, handling multiple possible formats 60 | model_output = re.sub(r"```\s*", "```", model_output) # handles ``` 61 | model_output = re.sub(r"\s*```", "```", model_output) # handles ``` 62 | model_output = re.sub(r"```\s*\n\s*", "```", model_output) # handles ```\n 63 | model_output = model_output.strip() 64 | yield gr.ChatMessage(role="assistant", content=model_output) 65 | 66 | # For tool calls, create a parent message 67 | if hasattr(step_log, "tool_calls") and step_log.tool_calls is not None: 68 | first_tool_call = step_log.tool_calls[0] 69 | used_code = first_tool_call.name == "python_interpreter" 70 | parent_id = f"call_{len(step_log.tool_calls)}" 71 | 72 | # Tool call becomes the parent message with timing info 73 | # First we will handle arguments based on type 74 | args = first_tool_call.arguments 75 | if isinstance(args, dict): 76 | content = str(args.get("answer", str(args))) 77 | else: 78 | content = str(args).strip() 79 | 80 | if used_code: 81 | # Clean up the content by removing any end code tags 82 | content = re.sub(r"```.*?\n", "", content) # Remove existing code blocks 83 | content = re.sub(r"\s*\s*", "", content) # Remove end_code tags 84 | content = content.strip() 85 | if not content.startswith("```python"): 86 | content = f"```python\n{content}\n```" 87 | 88 | parent_message_tool = gr.ChatMessage( 89 | role="assistant", 90 | content=content, 91 | metadata={ 92 | "title": f"🛠️ Used tool {first_tool_call.name}", 93 | "id": parent_id, 94 | "status": "done", 95 | }, 96 | ) 97 | yield parent_message_tool 98 | 99 | # Display execution logs if they exist 100 | if hasattr(step_log, "observations") and ( 101 | step_log.observations is not None and step_log.observations.strip() 102 | ): # Only yield execution logs if there's actual content 103 | log_content = step_log.observations.strip() 104 | if log_content: 105 | log_content = re.sub(r"^Execution logs:\s*", "", log_content) 106 | yield gr.ChatMessage( 107 | role="assistant", 108 | content=f"```bash\n{log_content}\n", 109 | metadata={"title": "📝 Execution Logs", "status": "done"}, 110 | ) 111 | 112 | # Display any errors 113 | if hasattr(step_log, "error") and step_log.error is not None: 114 | yield gr.ChatMessage( 115 | role="assistant", 116 | content=str(step_log.error), 117 | metadata={"title": "💥 Error", "status": "done"}, 118 | ) 119 | 120 | # Update parent message metadata to done status without yielding a new message 121 | if getattr(step_log, "observations_images", []): 122 | for image in step_log.observations_images: 123 | path_image = AgentImage(image).to_string() 124 | yield gr.ChatMessage( 125 | role="assistant", 126 | content={"path": path_image, "mime_type": f"image/{path_image.split('.')[-1]}"}, 127 | metadata={"title": "🖼️ Output Image", "status": "done"}, 128 | ) 129 | 130 | # Handle standalone errors but not from tool calls 131 | if hasattr(step_log, "error") and step_log.error is not None: 132 | yield gr.ChatMessage(role="assistant", content=str(step_log.error), metadata={"title": "💥 Error"}) 133 | 134 | yield gr.ChatMessage(role="assistant", content=get_step_footnote_content(step_log, step_number)) 135 | yield gr.ChatMessage(role="assistant", content="-----", metadata={"status": "done"}) 136 | 137 | elif isinstance(step_log, PlanningStep): 138 | yield gr.ChatMessage(role="assistant", content="**Planning step**") 139 | yield 
gr.ChatMessage(role="assistant", content=step_log.plan) 140 | yield gr.ChatMessage(role="assistant", content=get_step_footnote_content(step_log, "Planning step")) 141 | yield gr.ChatMessage(role="assistant", content="-----", metadata={"status": "done"}) 142 | 143 | elif isinstance(step_log, FinalAnswerStep): 144 | final_answer = step_log.final_answer 145 | if isinstance(final_answer, AgentText): 146 | yield gr.ChatMessage( 147 | role="assistant", 148 | content=f"**Final answer:**\n{final_answer.to_string()}\n", 149 | ) 150 | elif isinstance(final_answer, AgentImage): 151 | yield gr.ChatMessage( 152 | role="assistant", 153 | content={"path": final_answer.to_string(), "mime_type": "image/png"}, 154 | ) 155 | elif isinstance(final_answer, AgentAudio): 156 | yield gr.ChatMessage( 157 | role="assistant", 158 | content={"path": final_answer.to_string(), "mime_type": "audio/wav"}, 159 | ) 160 | else: 161 | yield gr.ChatMessage(role="assistant", content=f"**Final answer:** {str(final_answer)}") 162 | 163 | else: 164 | raise ValueError(f"Unsupported step type: {type(step_log)}") 165 | 166 | 167 | def stream_to_gradio( 168 | agent, 169 | task: str, 170 | task_images: list | None = None, 171 | reset_agent_memory: bool = False, 172 | additional_args: Optional[dict] = None, 173 | ): 174 | """Runs an agent with the given task and streams the messages from the agent as gradio ChatMessages.""" 175 | total_input_tokens = 0 176 | total_output_tokens = 0 177 | 178 | for step_log in agent.run( 179 | task, images=task_images, stream=True, reset=reset_agent_memory, additional_args=additional_args, 180 | ): 181 | # Track tokens if model provides them 182 | if getattr(agent.model, "last_input_token_count", None) is not None: 183 | total_input_tokens += agent.model.last_input_token_count 184 | total_output_tokens += agent.model.last_output_token_count 185 | if isinstance(step_log, (ActionStep, PlanningStep)): 186 | step_log.input_token_count = agent.model.last_input_token_count 187 | step_log.output_token_count = agent.model.last_output_token_count 188 | 189 | for message in pull_messages_from_step( 190 | step_log, 191 | ): 192 | yield message 193 | 194 | 195 | class GradioUI: 196 | """A one-line interface to launch your agent in Gradio""" 197 | 198 | def __init__(self, agent: MultiStepAgent, file_upload_folder: str | None = None): 199 | if not _is_package_available("gradio"): 200 | raise ModuleNotFoundError( 201 | "Please install 'gradio' extra to use the GradioUI: `pip install 'smolagents[gradio]'`" 202 | ) 203 | self.agent = agent 204 | self.file_upload_folder = file_upload_folder 205 | self.name = getattr(agent, "name") or "Agent interface" 206 | self.description = getattr(agent, "description", None) 207 | if self.file_upload_folder is not None: 208 | if not os.path.exists(file_upload_folder): 209 | os.mkdir(file_upload_folder) 210 | 211 | 212 | def interact_with_agent(self, messages, chatbot_history, session_state): 213 | import gradio as gr 214 | 215 | # Get the agent type from the template agent 216 | if "agent" not in session_state: 217 | session_state["agent"] = self.agent 218 | 219 | try: 220 | # Add all messages to chatbot history 221 | for message in messages: 222 | chatbot_history.append(message) 223 | yield chatbot_history 224 | 225 | # Extract text and files for the agent 226 | text_messages = [msg["content"] for msg in messages if isinstance(msg["content"], str)] 227 | file_messages = [msg["content"]["path"] for msg in messages if isinstance(msg["content"], dict)] 228 | 229 | # Combine text 
messages and file information for the agent 230 | task = " ".join(text_messages) 231 | if file_messages: 232 | task += f"\nAttached files: {', '.join(file_messages)}" 233 | 234 | for msg in stream_to_gradio(session_state["agent"], task=task, reset_agent_memory=False): 235 | chatbot_history.append(msg) 236 | yield chatbot_history 237 | 238 | yield chatbot_history 239 | except Exception as e: 240 | print(f"Error in interaction: {str(e)}") 241 | chatbot_history.append(gr.ChatMessage(role="assistant", content=f"Error: {str(e)}")) 242 | yield chatbot_history 243 | 244 | def upload_file(self, file, file_uploads_log, allowed_file_types=None): 245 | """ 246 | Handle file uploads, default allowed types are .pdf, .docx, and .txt 247 | """ 248 | import gradio as gr 249 | 250 | if file is None: 251 | return gr.Textbox(value="No file uploaded", visible=True), file_uploads_log 252 | 253 | if allowed_file_types is None: 254 | allowed_file_types = [".pdf", ".docx", ".txt"] 255 | 256 | file_ext = os.path.splitext(file.name)[1].lower() 257 | if file_ext not in allowed_file_types: 258 | return gr.Textbox("File type disallowed", visible=True), file_uploads_log 259 | 260 | # Sanitize file name 261 | original_name = os.path.basename(file.name) 262 | sanitized_name = re.sub( 263 | r"[^\w\-.]", "_", original_name 264 | ) # Replace any non-alphanumeric, non-dash, or non-dot characters with underscores 265 | 266 | # Save the uploaded file to the specified folder 267 | file_path = os.path.join(self.file_upload_folder, os.path.basename(sanitized_name)) 268 | shutil.copy(file.name, file_path) 269 | 270 | return gr.Textbox(f"File uploaded: {file_path}", visible=True), file_uploads_log + [file_path] 271 | 272 | 273 | def log_user_message(self, multimodal_input, file_uploads_log): 274 | """ 275 | Handle the multimodal input from the user 276 | multimodal_input will be a dict with 'text' and 'files' keys 277 | """ 278 | import gradio as gr 279 | 280 | # Extract text and files from multimodal input 281 | text = multimodal_input.get("text", "") 282 | files = multimodal_input.get("files", []) 283 | 284 | # Build the message 285 | messages = [] 286 | 287 | # Add file paths first 288 | for file_path in files: 289 | # File paths come directly in the files array 290 | messages.append({"role": "user", "content": {"path": file_path}}) 291 | # Add to file uploads log if not already present 292 | if file_path not in file_uploads_log: 293 | file_uploads_log.append(file_path) 294 | 295 | # Add text message if present 296 | if text: 297 | messages.append({"role": "user", "content": text}) 298 | 299 | # Add any previously uploaded files through the separate file uploader 300 | if len(file_uploads_log) > 0: 301 | text += f"\nPreviously uploaded files: {file_uploads_log}" 302 | 303 | return messages, {"text": "", "files": []}, gr.Button(interactive=False) 304 | 305 | 306 | def launch(self, share: bool = True, **kwargs): 307 | self.create_app().launch(debug=True, share=share, **kwargs) 308 | 309 | 310 | 311 | def create_app(self): 312 | import gradio as gr 313 | 314 | with gr.Blocks(theme="ocean", fill_height=True) as demo: 315 | # Add session state to store session-specific data 316 | session_state = gr.State({}) 317 | stored_messages = gr.State([]) 318 | file_uploads_log = gr.State([]) 319 | 320 | with gr.Sidebar(): 321 | gr.Markdown( 322 | f"# {self.name.replace('_', ' ').capitalize()}" 323 | "\n> This web ui allows you to interact with a `smolagents` agent that can use tools and execute steps to complete tasks." 
324 | + (f"\n\n**Agent description:**\n{self.description}" if self.description else "") 325 | ) 326 | 327 | with gr.Group(): 328 | gr.Markdown("**Your request**", container=True) 329 | # Updated MultimodalTextbox configuration 330 | text_input = gr.MultimodalTextbox( 331 | interactive=True, 332 | file_count="multiple", 333 | show_label=False, 334 | sources=["upload"], 335 | file_types=[".csv", "image"], 336 | placeholder="Enter your prompt here and press Shift+Enter or press the button" 337 | ) 338 | submit_btn = gr.Button("Submit", variant="primary") 339 | 340 | # If an upload folder is provided, enable the upload feature 341 | if self.file_upload_folder is not None: 342 | upload_file = gr.File(label="Upload a file") 343 | upload_status = gr.Textbox(label="Upload Status", interactive=False, visible=False) 344 | upload_file.change( 345 | self.upload_file, 346 | [upload_file, file_uploads_log], 347 | [upload_status, file_uploads_log], 348 | ) 349 | 350 | gr.HTML("

Powered by:

") 351 | with gr.Row(): 352 | gr.HTML("""
353 | logo 354 | huggingface/smolagents 355 |
""") 356 | 357 | # Main chat interface 358 | chatbot = gr.Chatbot( 359 | label="Agent", 360 | type="messages", 361 | avatar_images=( 362 | None, 363 | "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/smolagents/mascot_smol.png", 364 | ), 365 | resizeable=True, 366 | scale=1, 367 | ) 368 | 369 | # Updated event handlers for multimodal input 370 | text_input.submit( 371 | self.log_user_message, 372 | [text_input, file_uploads_log], 373 | [stored_messages, text_input, submit_btn], 374 | ).then(self.interact_with_agent, [stored_messages, chatbot, session_state], [chatbot]).then( 375 | lambda: ( 376 | gr.MultimodalTextbox( 377 | interactive=True, 378 | file_count="multiple", 379 | show_label=False, 380 | sources=["upload"], 381 | file_types=[".csv", "image"], 382 | placeholder="Enter your prompt here and press Shift+Enter or press the button" 383 | ), 384 | gr.Button(interactive=True), 385 | ), 386 | None, 387 | [text_input, submit_btn], 388 | ) 389 | 390 | submit_btn.click( 391 | self.log_user_message, 392 | [text_input, file_uploads_log], 393 | [stored_messages, text_input, submit_btn], 394 | ).then(self.interact_with_agent, [stored_messages, chatbot, session_state], [chatbot]).then( 395 | lambda: ( 396 | gr.MultimodalTextbox( 397 | interactive=True, 398 | file_count="multiple", 399 | show_label=False, 400 | sources=["upload"], 401 | file_types=[".csv", "image"], 402 | placeholder="Enter your prompt here and press Shift+Enter or press the button" 403 | ), 404 | gr.Button(interactive=True), 405 | ), 406 | None, 407 | [text_input, submit_btn], 408 | ) 409 | 410 | return demo 411 | 412 | 413 | __all__ = ["stream_to_gradio", "GradioUI"] 414 | --------------------------------------------------------------------------------