└── Extract_Table_Structure_from_Image_Document_.ipynb /Extract_Table_Structure_from_Image_Document_.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Extract Table Structure from Image Document .ipynb", 7 | "provenance": [], 8 | "collapsed_sections": [] 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | }, 17 | "accelerator": "GPU", 18 | "gpuClass": "standard" 19 | }, 20 | "cells": [ 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": { 25 | "colab": { 26 | "base_uri": "https://localhost:8080/" 27 | }, 28 | "id": "G_hvNK0qfAj3", 29 | "outputId": "3e154786-c894-4bd2-f613-5e70447aedc4" 30 | }, 31 | "outputs": [ 32 | { 33 | "output_type": "stream", 34 | "name": "stdout", 35 | "text": [ 36 | "Cloning into 'PaddleOCR'...\n", 37 | "remote: Enumerating objects: 39717, done.\u001b[K\n", 38 | "remote: Counting objects: 100% (161/161), done.\u001b[K\n", 39 | "remote: Compressing objects: 100% (121/121), done.\u001b[K\n", 40 | "remote: Total 39717 (delta 57), reused 82 (delta 40), pack-reused 39556\u001b[K\n", 41 | "Receiving objects: 100% (39717/39717), 320.84 MiB | 34.31 MiB/s, done.\n", 42 | "Resolving deltas: 100% (27775/27775), done.\n" 43 | ] 44 | } 45 | ], 46 | "source": [ 47 | "# Shallow-clone PaddleOCR to a fixed absolute path; skipped when the checkout already exists so the cell can be re-run from any working directory.\n", "!test -d /content/PaddleOCR || git clone --depth 1 https://github.com/PaddlePaddle/PaddleOCR.git /content/PaddleOCR" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "source": [ 53 | "# Pin paddleocr to the version recorded in this cell's captured output; %pip installs into the running kernel's environment.\n", "%pip install paddleocr==2.5.0.3\n", 54 | "%pip install paddlepaddle" 55 | ], 56 | "metadata": { 57 | "colab": { 58 | "base_uri": "https://localhost:8080/" 59 | }, 60 | "id": "rikzlTmLgsWI", 61 | "outputId": "f59efce2-1a70-4b3f-c207-9a0a2c533417" 62 | }, 63 | "execution_count": null, 64 | "outputs": [ 65 | { 66 | "output_type": "stream", 67 | "name": "stdout", 68 | "text": [ 69 | "Looking in indexes: 
https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", 70 | "Collecting paddleocr\n", 71 | " Downloading paddleocr-2.5.0.3-py3-none-any.whl (334 kB)\n", 72 | "\u001b[K |████████████████████████████████| 334 kB 8.8 MB/s \n", 73 | "\u001b[?25hRequirement already satisfied: shapely in /usr/local/lib/python3.7/dist-packages (from paddleocr) (1.8.2)\n", 74 | "Collecting pyclipper\n", 75 | " Downloading pyclipper-1.3.0.post3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (604 kB)\n", 76 | "\u001b[K |████████████████████████████████| 604 kB 65.2 MB/s \n", 77 | "\u001b[?25hRequirement already satisfied: scikit-image in /usr/local/lib/python3.7/dist-packages (from paddleocr) (0.18.3)\n", 78 | "Requirement already satisfied: lmdb in /usr/local/lib/python3.7/dist-packages (from paddleocr) (0.99)\n", 79 | "Requirement already satisfied: imgaug==0.4.0 in /usr/local/lib/python3.7/dist-packages (from paddleocr) (0.4.0)\n", 80 | "Requirement already satisfied: tqdm in /usr/local/lib/python3.7/dist-packages (from paddleocr) (4.64.0)\n", 81 | "Collecting python-Levenshtein\n", 82 | " Downloading python-Levenshtein-0.12.2.tar.gz (50 kB)\n", 83 | "\u001b[K |████████████████████████████████| 50 kB 8.1 MB/s \n", 84 | "\u001b[?25hCollecting premailer\n", 85 | " Downloading premailer-3.10.0-py2.py3-none-any.whl (19 kB)\n", 86 | "Requirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from paddleocr) (1.21.6)\n", 87 | "Collecting opencv-contrib-python==4.4.0.46\n", 88 | " Downloading opencv_contrib_python-4.4.0.46-cp37-cp37m-manylinux2014_x86_64.whl (55.7 MB)\n", 89 | "\u001b[K |████████████████████████████████| 55.7 MB 1.2 MB/s \n", 90 | "\u001b[?25hRequirement already satisfied: cython in /usr/local/lib/python3.7/dist-packages (from paddleocr) (0.29.32)\n", 91 | "Requirement already satisfied: lxml in /usr/local/lib/python3.7/dist-packages (from paddleocr) (4.9.1)\n", 92 | "Requirement already satisfied: openpyxl in 
/usr/local/lib/python3.7/dist-packages (from paddleocr) (3.0.10)\n", 93 | "Collecting visualdl\n", 94 | " Downloading visualdl-2.3.0-py3-none-any.whl (2.8 MB)\n", 95 | "\u001b[K |████████████████████████████████| 2.8 MB 32.5 MB/s \n", 96 | "\u001b[?25hCollecting attrdict\n", 97 | " Downloading attrdict-2.0.1-py2.py3-none-any.whl (9.9 kB)\n", 98 | "Requirement already satisfied: Pillow in /usr/local/lib/python3.7/dist-packages (from imgaug==0.4.0->paddleocr) (7.1.2)\n", 99 | "Requirement already satisfied: scipy in /usr/local/lib/python3.7/dist-packages (from imgaug==0.4.0->paddleocr) (1.7.3)\n", 100 | "Requirement already satisfied: matplotlib in /usr/local/lib/python3.7/dist-packages (from imgaug==0.4.0->paddleocr) (3.2.2)\n", 101 | "Requirement already satisfied: opencv-python in /usr/local/lib/python3.7/dist-packages (from imgaug==0.4.0->paddleocr) (4.6.0.66)\n", 102 | "Requirement already satisfied: imageio in /usr/local/lib/python3.7/dist-packages (from imgaug==0.4.0->paddleocr) (2.9.0)\n", 103 | "Requirement already satisfied: six in /usr/local/lib/python3.7/dist-packages (from imgaug==0.4.0->paddleocr) (1.15.0)\n", 104 | "Requirement already satisfied: tifffile>=2019.7.26 in /usr/local/lib/python3.7/dist-packages (from scikit-image->paddleocr) (2021.11.2)\n", 105 | "Requirement already satisfied: PyWavelets>=1.1.1 in /usr/local/lib/python3.7/dist-packages (from scikit-image->paddleocr) (1.3.0)\n", 106 | "Requirement already satisfied: networkx>=2.0 in /usr/local/lib/python3.7/dist-packages (from scikit-image->paddleocr) (2.6.3)\n", 107 | "Requirement already satisfied: python-dateutil>=2.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->imgaug==0.4.0->paddleocr) (2.8.2)\n", 108 | "Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.7/dist-packages (from matplotlib->imgaug==0.4.0->paddleocr) (0.11.0)\n", 109 | "Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.7/dist-packages (from 
matplotlib->imgaug==0.4.0->paddleocr) (1.4.4)\n", 110 | "Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /usr/local/lib/python3.7/dist-packages (from matplotlib->imgaug==0.4.0->paddleocr) (3.0.9)\n", 111 | "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from kiwisolver>=1.0.1->matplotlib->imgaug==0.4.0->paddleocr) (4.1.1)\n", 112 | "Requirement already satisfied: et-xmlfile in /usr/local/lib/python3.7/dist-packages (from openpyxl->paddleocr) (1.1.0)\n", 113 | "Requirement already satisfied: cachetools in /usr/local/lib/python3.7/dist-packages (from premailer->paddleocr) (4.2.4)\n", 114 | "Requirement already satisfied: requests in /usr/local/lib/python3.7/dist-packages (from premailer->paddleocr) (2.23.0)\n", 115 | "Collecting cssselect\n", 116 | " Downloading cssselect-1.1.0-py2.py3-none-any.whl (16 kB)\n", 117 | "Collecting cssutils\n", 118 | " Downloading cssutils-2.5.1-py3-none-any.whl (399 kB)\n", 119 | "\u001b[K |████████████████████████████████| 399 kB 68.7 MB/s \n", 120 | "\u001b[?25hRequirement already satisfied: importlib-metadata in /usr/local/lib/python3.7/dist-packages (from cssutils->premailer->paddleocr) (4.12.0)\n", 121 | "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.7/dist-packages (from importlib-metadata->cssutils->premailer->paddleocr) (3.8.1)\n", 122 | "Requirement already satisfied: setuptools in /usr/local/lib/python3.7/dist-packages (from python-Levenshtein->paddleocr) (57.4.0)\n", 123 | "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.7/dist-packages (from requests->premailer->paddleocr) (2.10)\n", 124 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.7/dist-packages (from requests->premailer->paddleocr) (2022.6.15)\n", 125 | "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.7/dist-packages (from requests->premailer->paddleocr) (1.24.3)\n", 126 | 
"Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.7/dist-packages (from requests->premailer->paddleocr) (3.0.4)\n", 127 | "Requirement already satisfied: pandas in /usr/local/lib/python3.7/dist-packages (from visualdl->paddleocr) (1.3.5)\n", 128 | "Collecting Flask-Babel>=1.0.0\n", 129 | " Downloading Flask_Babel-2.0.0-py3-none-any.whl (9.3 kB)\n", 130 | "Requirement already satisfied: protobuf>=3.11.0 in /usr/local/lib/python3.7/dist-packages (from visualdl->paddleocr) (3.17.3)\n", 131 | "Collecting bce-python-sdk\n", 132 | " Downloading bce_python_sdk-0.8.74-py3-none-any.whl (204 kB)\n", 133 | "\u001b[K |████████████████████████████████| 204 kB 61.5 MB/s \n", 134 | "\u001b[?25hRequirement already satisfied: flask>=1.1.1 in /usr/local/lib/python3.7/dist-packages (from visualdl->paddleocr) (1.1.4)\n", 135 | "Requirement already satisfied: Werkzeug<2.0,>=0.15 in /usr/local/lib/python3.7/dist-packages (from flask>=1.1.1->visualdl->paddleocr) (1.0.1)\n", 136 | "Requirement already satisfied: Jinja2<3.0,>=2.10.1 in /usr/local/lib/python3.7/dist-packages (from flask>=1.1.1->visualdl->paddleocr) (2.11.3)\n", 137 | "Requirement already satisfied: itsdangerous<2.0,>=0.24 in /usr/local/lib/python3.7/dist-packages (from flask>=1.1.1->visualdl->paddleocr) (1.1.0)\n", 138 | "Requirement already satisfied: click<8.0,>=5.1 in /usr/local/lib/python3.7/dist-packages (from flask>=1.1.1->visualdl->paddleocr) (7.1.2)\n", 139 | "Requirement already satisfied: pytz in /usr/local/lib/python3.7/dist-packages (from Flask-Babel>=1.0.0->visualdl->paddleocr) (2022.1)\n", 140 | "Requirement already satisfied: Babel>=2.3 in /usr/local/lib/python3.7/dist-packages (from Flask-Babel>=1.0.0->visualdl->paddleocr) (2.10.3)\n", 141 | "Requirement already satisfied: MarkupSafe>=0.23 in /usr/local/lib/python3.7/dist-packages (from Jinja2<3.0,>=2.10.1->flask>=1.1.1->visualdl->paddleocr) (2.0.1)\n", 142 | "Requirement already satisfied: future>=0.6.0 in 
/usr/local/lib/python3.7/dist-packages (from bce-python-sdk->visualdl->paddleocr) (0.16.0)\n", 143 | "Collecting pycryptodome>=3.8.0\n", 144 | " Downloading pycryptodome-3.15.0-cp35-abi3-manylinux2010_x86_64.whl (2.3 MB)\n", 145 | "\u001b[K |████████████████████████████████| 2.3 MB 49.9 MB/s \n", 146 | "\u001b[?25hBuilding wheels for collected packages: python-Levenshtein\n", 147 | " Building wheel for python-Levenshtein (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 148 | " Created wheel for python-Levenshtein: filename=python_Levenshtein-0.12.2-cp37-cp37m-linux_x86_64.whl size=149863 sha256=c6b73c0ac41c6e388f16ae019aa3b283991c85be2590fbd4cfa57a3919bd4284\n", 149 | " Stored in directory: /root/.cache/pip/wheels/05/5f/ca/7c4367734892581bb5ff896f15027a932c551080b2abd3e00d\n", 150 | "Successfully built python-Levenshtein\n", 151 | "Installing collected packages: pycryptodome, Flask-Babel, cssutils, cssselect, bce-python-sdk, visualdl, python-Levenshtein, pyclipper, premailer, opencv-contrib-python, attrdict, paddleocr\n", 152 | " Attempting uninstall: opencv-contrib-python\n", 153 | " Found existing installation: opencv-contrib-python 4.6.0.66\n", 154 | " Uninstalling opencv-contrib-python-4.6.0.66:\n", 155 | " Successfully uninstalled opencv-contrib-python-4.6.0.66\n", 156 | "Successfully installed Flask-Babel-2.0.0 attrdict-2.0.1 bce-python-sdk-0.8.74 cssselect-1.1.0 cssutils-2.5.1 opencv-contrib-python-4.4.0.46 paddleocr-2.5.0.3 premailer-3.10.0 pyclipper-1.3.0.post3 pycryptodome-3.15.0 python-Levenshtein-0.12.2 visualdl-2.3.0\n" 157 | ] 158 | } 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "source": [ 164 | "# Absolute path: a relative %cd fails with [Errno 2] when the cell is re-run after the working directory has changed.\n", "%cd /content/PaddleOCR/ppstructure\n", 165 | "\n", 166 | "# Download the inference models into ppstructure/inference (-p keeps mkdir from failing when the directory already exists)\n", 167 | "!mkdir -p inference\n", 168 | "%cd inference\n", 169 | "# Download the detection model of the ultra-lightweight table English OCR model and unzip it\n", 170 | "!wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar && tar xf 
en_ppocr_mobile_v2.0_table_det_infer.tar\n", 171 | "# Download the recognition model of the ultra-lightweight table English OCR model and unzip it\n", 172 | "!wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar && tar xf en_ppocr_mobile_v2.0_table_rec_infer.tar\n", 173 | "# Download the ultra-lightweight English table structure model and unzip it\n", 174 | "!wget https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar && tar xf en_ppocr_mobile_v2.0_table_structure_infer.tar\n", 175 | "\n", 176 | "# Download the PP-OCRv3 English detection model and unzip it (the run cell passes it as --det_model_dir)\n", 177 | "!wget https://paddleocr.bj.bcebos.com/PP-OCRv3/english/en_PP-OCRv3_det_infer.tar && tar xf en_PP-OCRv3_det_infer.tar\n", 178 | "\n" 179 | ], 180 | "metadata": { 181 | "colab": { 182 | "base_uri": "https://localhost:8080/" 183 | }, 184 | "id": "W35j7XxefMSU", 185 | "outputId": "77036ac7-5479-47dc-8ade-7837dc6894bd" 186 | }, 187 | "execution_count": null, 188 | "outputs": [ 189 | { 190 | "output_type": "stream", 191 | "name": "stdout", 192 | "text": [ 193 | "[Errno 2] No such file or directory: 'PaddleOCR/ppstructure'\n", 194 | "/content/PaddleOCR\n", 195 | "/content/PaddleOCR/inference\n", 196 | "--2022-08-13 14:06:49-- https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_det_infer.tar\n", 197 | "Resolving paddleocr.bj.bcebos.com (paddleocr.bj.bcebos.com)... 220.181.33.44, 220.181.33.43, 2409:8c04:1001:1002:0:ff:b001:368a\n", 198 | "Connecting to paddleocr.bj.bcebos.com (paddleocr.bj.bcebos.com)|220.181.33.44|:443... connected.\n", 199 | "HTTP request sent, awaiting response... 
200 OK\n", 200 | "Length: 5001216 (4.8M) [application/x-tar]\n", 201 | "Saving to: ‘en_ppocr_mobile_v2.0_table_det_infer.tar’\n", 202 | "\n", 203 | " en 16%[==> ] 826.82K 7.64KB/s eta 8m 34s ^C\n", 204 | "--2022-08-13 14:10:22-- https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_rec_infer.tar\n", 205 | "Resolving paddleocr.bj.bcebos.com (paddleocr.bj.bcebos.com)... 220.181.33.43, 220.181.33.44, 2409:8c04:1001:1002:0:ff:b001:368a\n", 206 | "Connecting to paddleocr.bj.bcebos.com (paddleocr.bj.bcebos.com)|220.181.33.43|:443... connected.\n", 207 | "HTTP request sent, awaiting response... 200 OK\n", 208 | "Length: 7220736 (6.9M) [application/x-tar]\n", 209 | "Saving to: ‘en_ppocr_mobile_v2.0_table_rec_infer.tar’\n", 210 | "\n", 211 | "able_rec_infer.tar 40%[=======> ] 2.76M 434KB/s eta 22s ^C\n", 212 | "--2022-08-13 14:10:37-- https://paddleocr.bj.bcebos.com/dygraph_v2.0/table/en_ppocr_mobile_v2.0_table_structure_infer.tar\n", 213 | "Resolving paddleocr.bj.bcebos.com (paddleocr.bj.bcebos.com)... 220.181.33.43, 220.181.33.44, 2409:8c04:1001:1002:0:ff:b001:368a\n", 214 | "Connecting to paddleocr.bj.bcebos.com (paddleocr.bj.bcebos.com)|220.181.33.43|:443... connected.\n", 215 | "HTTP request sent, awaiting response... 
200 OK\n", 216 | "Length: 19667456 (19M) [application/x-tar]\n", 217 | "Saving to: ‘en_ppocr_mobile_v2.0_table_structure_infer.tar’\n", 218 | "\n", 219 | " en_ppocr_mo 0%[ ] 26.82K 49.4KB/s ^C\n" 220 | ] 221 | } 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "source": [ 227 | "# Run table recognition on table_2.png from the ppstructure directory, so the relative inference/ and ../ppocr/ paths resolve.\n", 228 | "# The %cd target is absolute because the relative form fails with [Errno 2] once an earlier cell has changed the working directory.\n", "%cd /content/PaddleOCR/ppstructure\n", 229 | "# --use_gpu=False: the installed paddlepaddle wheel has no GPU support (it logs 'Please compile with gpu to EnableGpu()'), so request CPU inference explicitly.\n", "!python3 /content/PaddleOCR/ppstructure/table/predict_table.py --det_model_dir=inference/en_PP-OCRv3_det_infer --rec_model_dir=inference/en_ppocr_mobile_v2.0_table_rec_infer --table_model_dir=inference/en_ppocr_mobile_v2.0_table_structure_infer --image_dir=/content/PaddleOCR/ppstructure/table_2.png --rec_char_dict_path=../ppocr/utils/dict/table_dict.txt --table_char_dict_path=../ppocr/utils/dict/table_structure_dict.txt --det_limit_side_len=736 --det_limit_type=min --use_gpu=False --output ./output/table" 230 | ], 231 | "metadata": { 232 | "colab": { 233 | "base_uri": "https://localhost:8080/" 234 | }, 235 | "id": "9MRP9_cNfjmx", 236 | "outputId": "9c1d8a47-8684-4f8f-8017-676def877c47" 237 | }, 238 | "execution_count": 16, 239 | "outputs": [ 240 | { 241 | "output_type": "stream", 242 | "name": "stdout", 243 | "text": [ 244 | "[Errno 2] No such file or directory: 'PaddleOCR/ppstructure'\n", 245 | "/content/PaddleOCR/ppstructure\n", 246 | "E0813 15:39:56.498070 1551 analysis_config.cc:95] Please compile with gpu to EnableGpu()\n", 247 | "E0813 15:39:56.702649 1551 analysis_config.cc:95] Please compile with gpu to EnableGpu()\n", 248 | "E0813 15:39:56.879097 1551 analysis_config.cc:95] Please compile with gpu to EnableGpu()\n", 249 | "[2022/08/13 15:39:57] ppocr INFO: [0/1] /content/PaddleOCR/ppstructure/table_2.png\n", 250 | "[2022/08/13 15:40:00] ppocr DEBUG: dt_boxes num : 74, elapse : 0.6278271675109863\n", 251 | "[2022/08/13 15:40:03] ppocr DEBUG: rec_res num : 74, elapse : 3.4260013103485107\n", 252 | "[2022/08/13 15:40:03] ppocr INFO:
| C3tegOfy | Metho | nitialiZ3tiOf | P-RGNN | N-RGNN |
| Text | 76f0-Sb00 | pDD. 3yNet | 0.482 | 0.46g |
| f | pibl.3yNet | 0.701 | 0738 | |
| fine tuning | CoCo | 0.651 | 0.661 | |
| fine tuning | ImageNet | 0.622 | 0.629 | |
| LiSt | ZerO-SBOt | PubLayNet | 0.50g | 0.510 |
| PubL2yNet | 0.681 | 0.684 | ||
| Coco | 0.622 | 0.611 | ||
| ImageNet | 0.933 | 0.933 | ||
| Table | 7eF0-SbOt | pRbl.3VNef | 0,422 | 0.419 |
| finei | PGbL.3yNef | 0.541 | 0.596 | |
| fe) | CQCG | o.s60 | 0.58g | |
| IT3g2Net | 0.528 | 0.573 | ||
| M2CTD 3Nfage | 2fOShDL | pubL2yNOR | 0.470 | 0.465 |
| fif | pIbL3yNeR | 0.641 | 0.663 | |
| f | CQCQ | 0.611 | 0.620 | |
| ITag2Not | 0.584 | 0.632 |