├── Beautiful_Soup.ipynb ├── Introduction to Web Scraping & Text Extraction.pdf ├── Text_Extraction.ipynb ├── Web Scraping with Python, 2nd Edition.pdf ├── readme.md └── sample files ├── Schedule.png ├── merged_data (1).csv └── prescription.pdf /Introduction to Web Scraping & Text Extraction.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gurtaransingh/scraping/784a7be8778f9890a8e4812b64a0d6e383ce5f36/Introduction to Web Scraping & Text Extraction.pdf -------------------------------------------------------------------------------- /Text_Extraction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyOoSj7Pd+yUVxbIQM6sH3td", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "source": [ 32 | "#Text Extraction from any file type" 33 | ], 34 | "metadata": { 35 | "id": "cavSDr1xUs-9" 36 | } 37 | }, 38 | { 39 | "cell_type": "markdown", 40 | "source": [ 41 | "#1\n", 42 | "##Install various libraries\n", 43 | "We may use only a few of them, but they are all listed here for your future reference." 44 | ], 45 | "metadata": { 46 | "id": "t8zOuQ8_IzH7" 47 | } 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 1, 52 | "metadata": { 53 | "colab": { 54 | "base_uri": "https://localhost:8080/" 55 | }, 56 | "id": "5S8DQwSfwV06", 57 | "outputId": "9b899ed4-9350-4948-c384-13b960b6f768" 58 | }, 59 | "outputs": [ 60 | { 61 | "output_type": "stream", 62 | "name": "stdout", 63 | "text": [ 64 | "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", 65 | "Requirement already satisfied: PyPDF2 in /usr/local/lib/python3.10/dist-packages (3.0.1)\n", 66 | "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", 67 | "Requirement already satisfied: textract in /usr/local/lib/python3.10/dist-packages (1.6.5)\n", 68 | "Requirement already satisfied: argcomplete~=1.10.0 in /usr/local/lib/python3.10/dist-packages (from textract) (1.10.3)\n", 69 | "Requirement already satisfied: beautifulsoup4~=4.8.0 in /usr/local/lib/python3.10/dist-packages (from textract) (4.8.2)\n", 70 | "Requirement already satisfied: chardet==3.* in /usr/local/lib/python3.10/dist-packages (from textract) (3.0.4)\n", 71 | "Requirement already satisfied: docx2txt~=0.8 in /usr/local/lib/python3.10/dist-packages (from textract) (0.8)\n", 72 | "Requirement already satisfied: extract-msg<=0.29.* in /usr/local/lib/python3.10/dist-packages (from textract) (0.28.7)\n", 73 | "Requirement already satisfied: pdfminer.six==20191110 in /usr/local/lib/python3.10/dist-packages (from textract) (20191110)\n", 74 | "Requirement already satisfied: python-pptx~=0.6.18 in /usr/local/lib/python3.10/dist-packages (from textract) (0.6.21)\n", 75 | "Requirement already satisfied: six~=1.12.0 in /usr/local/lib/python3.10/dist-packages (from textract) (1.12.0)\n", 76 | "Requirement already satisfied: SpeechRecognition~=3.8.1 in /usr/local/lib/python3.10/dist-packages (from textract) (3.8.1)\n", 77 | 
"Requirement already satisfied: xlrd~=1.2.0 in /usr/local/lib/python3.10/dist-packages (from textract) (1.2.0)\n", 78 | "Requirement already satisfied: pycryptodome in /usr/local/lib/python3.10/dist-packages (from pdfminer.six==20191110->textract) (3.18.0)\n", 79 | "Requirement already satisfied: sortedcontainers in /usr/local/lib/python3.10/dist-packages (from pdfminer.six==20191110->textract) (2.4.0)\n", 80 | "Requirement already satisfied: soupsieve>=1.2 in /usr/local/lib/python3.10/dist-packages (from beautifulsoup4~=4.8.0->textract) (2.4.1)\n", 81 | "Requirement already satisfied: imapclient==2.1.0 in /usr/local/lib/python3.10/dist-packages (from extract-msg<=0.29.*->textract) (2.1.0)\n", 82 | "Requirement already satisfied: olefile>=0.46 in /usr/local/lib/python3.10/dist-packages (from extract-msg<=0.29.*->textract) (0.46)\n", 83 | "Requirement already satisfied: tzlocal>=2.1 in /usr/local/lib/python3.10/dist-packages (from extract-msg<=0.29.*->textract) (5.0.1)\n", 84 | "Requirement already satisfied: compressed-rtf>=1.0.6 in /usr/local/lib/python3.10/dist-packages (from extract-msg<=0.29.*->textract) (1.0.6)\n", 85 | "Requirement already satisfied: ebcdic>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from extract-msg<=0.29.*->textract) (1.1.1)\n", 86 | "Requirement already satisfied: lxml>=3.1.0 in /usr/local/lib/python3.10/dist-packages (from python-pptx~=0.6.18->textract) (4.9.2)\n", 87 | "Requirement already satisfied: Pillow>=3.3.2 in /usr/local/lib/python3.10/dist-packages (from python-pptx~=0.6.18->textract) (8.4.0)\n", 88 | "Requirement already satisfied: XlsxWriter>=0.5.7 in /usr/local/lib/python3.10/dist-packages (from python-pptx~=0.6.18->textract) (3.1.2)\n", 89 | "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", 90 | "Requirement already satisfied: pytesseract in /usr/local/lib/python3.10/dist-packages (0.3.10)\n", 91 | "Requirement already satisfied: packaging>=21.3 in /usr/local/lib/python3.10/dist-packages (from pytesseract) (23.1)\n", 92 | "Requirement already satisfied: Pillow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from pytesseract) (8.4.0)\n", 93 | "Reading package lists... Done\n", 94 | "Building dependency tree \n", 95 | "Reading state information... Done\n", 96 | "tesseract-ocr is already the newest version (4.1.1-2build2).\n", 97 | "0 upgraded, 0 newly installed, 0 to remove and 13 not upgraded.\n", 98 | "Reading package lists... Done\n", 99 | "Building dependency tree \n", 100 | "Reading state information... 
Done\n", 101 | "libtesseract-dev is already the newest version (4.1.1-2build2).\n", 102 | "0 upgraded, 0 newly installed, 0 to remove and 13 not upgraded.\n", 103 | "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", 104 | "Requirement already satisfied: pytesseract in /usr/local/lib/python3.10/dist-packages (0.3.10)\n", 105 | "Requirement already satisfied: packaging>=21.3 in /usr/local/lib/python3.10/dist-packages (from pytesseract) (23.1)\n", 106 | "Requirement already satisfied: Pillow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from pytesseract) (8.4.0)\n", 107 | "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", 108 | "Requirement already satisfied: textract in /usr/local/lib/python3.10/dist-packages (1.6.5)\n", 109 | "Requirement already satisfied: argcomplete~=1.10.0 in /usr/local/lib/python3.10/dist-packages (from textract) (1.10.3)\n", 110 | "Requirement already satisfied: beautifulsoup4~=4.8.0 in /usr/local/lib/python3.10/dist-packages (from textract) (4.8.2)\n", 111 | "Requirement already satisfied: chardet==3.* in /usr/local/lib/python3.10/dist-packages (from textract) (3.0.4)\n", 112 | "Requirement already satisfied: docx2txt~=0.8 in /usr/local/lib/python3.10/dist-packages (from textract) (0.8)\n", 113 | "Requirement already satisfied: extract-msg<=0.29.* in /usr/local/lib/python3.10/dist-packages (from textract) (0.28.7)\n", 114 | "Requirement already satisfied: pdfminer.six==20191110 in /usr/local/lib/python3.10/dist-packages (from textract) (20191110)\n", 115 | "Requirement already satisfied: python-pptx~=0.6.18 in /usr/local/lib/python3.10/dist-packages (from textract) (0.6.21)\n", 116 | "Requirement already satisfied: six~=1.12.0 in /usr/local/lib/python3.10/dist-packages (from textract) (1.12.0)\n", 117 | "Requirement already satisfied: SpeechRecognition~=3.8.1 in /usr/local/lib/python3.10/dist-packages (from textract) (3.8.1)\n", 118 | "Requirement already satisfied: xlrd~=1.2.0 in /usr/local/lib/python3.10/dist-packages (from textract) (1.2.0)\n", 119 | "Requirement already satisfied: pycryptodome in /usr/local/lib/python3.10/dist-packages (from pdfminer.six==20191110->textract) (3.18.0)\n", 120 | "Requirement already satisfied: sortedcontainers in /usr/local/lib/python3.10/dist-packages (from pdfminer.six==20191110->textract) (2.4.0)\n", 121 | "Requirement already satisfied: soupsieve>=1.2 in /usr/local/lib/python3.10/dist-packages (from beautifulsoup4~=4.8.0->textract) (2.4.1)\n", 122 | "Requirement already satisfied: imapclient==2.1.0 in /usr/local/lib/python3.10/dist-packages (from extract-msg<=0.29.*->textract) (2.1.0)\n", 123 | "Requirement already satisfied: olefile>=0.46 in /usr/local/lib/python3.10/dist-packages (from extract-msg<=0.29.*->textract) (0.46)\n", 124 | "Requirement already satisfied: tzlocal>=2.1 in /usr/local/lib/python3.10/dist-packages (from extract-msg<=0.29.*->textract) (5.0.1)\n", 125 | "Requirement already satisfied: compressed-rtf>=1.0.6 in /usr/local/lib/python3.10/dist-packages (from extract-msg<=0.29.*->textract) (1.0.6)\n", 126 | "Requirement already satisfied: ebcdic>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from extract-msg<=0.29.*->textract) (1.1.1)\n", 127 | "Requirement already satisfied: lxml>=3.1.0 in /usr/local/lib/python3.10/dist-packages (from python-pptx~=0.6.18->textract) (4.9.2)\n", 128 | "Requirement already satisfied: Pillow>=3.3.2 in /usr/local/lib/python3.10/dist-packages (from 
python-pptx~=0.6.18->textract) (8.4.0)\n", 129 | "Requirement already satisfied: XlsxWriter>=0.5.7 in /usr/local/lib/python3.10/dist-packages (from python-pptx~=0.6.18->textract) (3.1.2)\n", 130 | "Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/\n", 131 | "Requirement already satisfied: pdf2image in /usr/local/lib/python3.10/dist-packages (1.16.3)\n", 132 | "Requirement already satisfied: pillow in /usr/local/lib/python3.10/dist-packages (from pdf2image) (8.4.0)\n", 133 | "Traceback (most recent call last):\n", 134 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py\", line 3108, in _dep_map\n", 135 | " return self.__dep_map\n", 136 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py\", line 2901, in __getattr__\n", 137 | " raise AttributeError(attr)\n", 138 | "AttributeError: _DistInfoDistribution__dep_map\n", 139 | "\n", 140 | "During handling of the above exception, another exception occurred:\n", 141 | "\n", 142 | "Traceback (most recent call last):\n", 143 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_vendor/pyparsing/core.py\", line 817, in _parseNoCache\n", 144 | " loc, tokens = self.parseImpl(instring, pre_loc, doActions)\n", 145 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_vendor/pyparsing/core.py\", line 2337, in parseImpl\n", 146 | " if instring[loc] == self.firstMatchChar and instring.startswith(\n", 147 | "IndexError: string index out of range\n", 148 | "\n", 149 | "During handling of the above exception, another exception occurred:\n", 150 | "\n", 151 | "Traceback (most recent call last):\n", 152 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/base_command.py\", line 169, in exc_logging_wrapper\n", 153 | " status = run_func(*args)\n", 154 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/req_command.py\", line 242, in wrapper\n", 155 | " return func(self, options, args)\n", 156 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_internal/commands/install.py\", line 441, in run\n", 157 | " conflicts = self._determine_conflicts(to_install)\n", 158 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_internal/commands/install.py\", line 572, in _determine_conflicts\n", 159 | " return check_install_conflicts(to_install)\n", 160 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_internal/operations/check.py\", line 101, in check_install_conflicts\n", 161 | " package_set, _ = create_package_set_from_installed()\n", 162 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_internal/operations/check.py\", line 42, in create_package_set_from_installed\n", 163 | " dependencies = list(dist.iter_dependencies())\n", 164 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_internal/metadata/pkg_resources.py\", line 216, in iter_dependencies\n", 165 | " return self._dist.requires(extras)\n", 166 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py\", line 2821, in requires\n", 167 | " dm = self._dep_map\n", 168 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py\", line 3110, in _dep_map\n", 169 | " self.__dep_map = self._compute_dependencies()\n", 170 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py\", line 3120, in _compute_dependencies\n", 171 | " reqs.extend(parse_requirements(req))\n", 172 | " File 
\"/usr/local/lib/python3.10/dist-packages/pip/_vendor/pkg_resources/__init__.py\", line 3173, in __init__\n", 173 | " super(Requirement, self).__init__(requirement_string)\n", 174 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_vendor/packaging/requirements.py\", line 102, in __init__\n", 175 | " req = REQUIREMENT.parseString(requirement_string)\n", 176 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_vendor/pyparsing/core.py\", line 1131, in parse_string\n", 177 | " loc, tokens = self._parse(instring, 0)\n", 178 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_vendor/pyparsing/core.py\", line 817, in _parseNoCache\n", 179 | " loc, tokens = self.parseImpl(instring, pre_loc, doActions)\n", 180 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_vendor/pyparsing/core.py\", line 3886, in parseImpl\n", 181 | " loc, exprtokens = e._parse(instring, loc, doActions)\n", 182 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_vendor/pyparsing/core.py\", line 817, in _parseNoCache\n", 183 | " loc, tokens = self.parseImpl(instring, pre_loc, doActions)\n", 184 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_vendor/pyparsing/core.py\", line 4114, in parseImpl\n", 185 | " return e._parse(\n", 186 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_vendor/pyparsing/core.py\", line 817, in _parseNoCache\n", 187 | " loc, tokens = self.parseImpl(instring, pre_loc, doActions)\n", 188 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_vendor/pyparsing/core.py\", line 3886, in parseImpl\n", 189 | " loc, exprtokens = e._parse(instring, loc, doActions)\n", 190 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_vendor/pyparsing/core.py\", line 817, in _parseNoCache\n", 191 | " loc, tokens = self.parseImpl(instring, pre_loc, doActions)\n", 192 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_vendor/pyparsing/core.py\", line 4959, in parseImpl\n", 193 | " loc, tokens = self_expr._parse(instring, loc, doActions, callPreParse=False)\n", 194 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_vendor/pyparsing/core.py\", line 817, in _parseNoCache\n", 195 | " loc, tokens = self.parseImpl(instring, pre_loc, doActions)\n", 196 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_vendor/pyparsing/core.py\", line 3886, in parseImpl\n", 197 | " loc, exprtokens = e._parse(instring, loc, doActions)\n", 198 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_vendor/pyparsing/core.py\", line 817, in _parseNoCache\n", 199 | " loc, tokens = self.parseImpl(instring, pre_loc, doActions)\n", 200 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_vendor/pyparsing/core.py\", line 3886, in parseImpl\n", 201 | " loc, exprtokens = e._parse(instring, loc, doActions)\n", 202 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_vendor/pyparsing/core.py\", line 817, in _parseNoCache\n", 203 | " loc, tokens = self.parseImpl(instring, pre_loc, doActions)\n", 204 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_vendor/pyparsing/core.py\", line 5226, in parseImpl\n", 205 | " return super().parseImpl(instring, loc, doActions)\n", 206 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_vendor/pyparsing/core.py\", line 4375, in parseImpl\n", 207 | " return self.expr._parse(instring, loc, doActions, callPreParse=False)\n", 208 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_vendor/pyparsing/core.py\", line 817, in _parseNoCache\n", 209 | " loc, tokens = self.parseImpl(instring, pre_loc, doActions)\n", 210 | " File 
\"/usr/local/lib/python3.10/dist-packages/pip/_vendor/pyparsing/core.py\", line 3886, in parseImpl\n", 211 | " loc, exprtokens = e._parse(instring, loc, doActions)\n", 212 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_vendor/pyparsing/core.py\", line 817, in _parseNoCache\n", 213 | " loc, tokens = self.parseImpl(instring, pre_loc, doActions)\n", 214 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_vendor/pyparsing/core.py\", line 4891, in parseImpl\n", 215 | " return super().parseImpl(instring, loc, doActions)\n", 216 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_vendor/pyparsing/core.py\", line 4790, in parseImpl\n", 217 | " loc, tokens = self_expr_parse(instring, loc, doActions)\n", 218 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_vendor/pyparsing/core.py\", line 817, in _parseNoCache\n", 219 | " loc, tokens = self.parseImpl(instring, pre_loc, doActions)\n", 220 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_vendor/pyparsing/core.py\", line 3864, in parseImpl\n", 221 | " loc, resultlist = self.exprs[0]._parse(\n", 222 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_vendor/pyparsing/core.py\", line 817, in _parseNoCache\n", 223 | " loc, tokens = self.parseImpl(instring, pre_loc, doActions)\n", 224 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_vendor/pyparsing/core.py\", line 4114, in parseImpl\n", 225 | " return e._parse(\n", 226 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_vendor/pyparsing/core.py\", line 817, in _parseNoCache\n", 227 | " loc, tokens = self.parseImpl(instring, pre_loc, doActions)\n", 228 | "KeyboardInterrupt\n", 229 | "\n", 230 | "During handling of the above exception, another exception occurred:\n", 231 | "\n", 232 | "Traceback (most recent call last):\n", 233 | " File \"/usr/lib/python3.10/logging/__init__.py\", line 1732, in isEnabledFor\n", 234 | " return self._cache[level]\n", 235 | "KeyError: 50\n", 236 | "\n", 237 | "During handling of the above exception, another exception occurred:\n", 238 | "\n", 239 | "Traceback (most recent call last):\n", 240 | " File \"/usr/local/bin/pip3\", line 8, in \n", 241 | " sys.exit(main())\n", 242 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/main.py\", line 79, in main\n", 243 | " return command.main(cmd_args)\n", 244 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/base_command.py\", line 101, in main\n", 245 | " return self._main(args)\n", 246 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/base_command.py\", line 223, in _main\n", 247 | " return run(options, args)\n", 248 | " File \"/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/base_command.py\", line 206, in exc_logging_wrapper\n", 249 | " logger.critical(\"Operation cancelled by user\")\n", 250 | " File \"/usr/lib/python3.10/logging/__init__.py\", line 1523, in critical\n", 251 | " if self.isEnabledFor(CRITICAL):\n", 252 | " File \"/usr/lib/python3.10/logging/__init__.py\", line 1734, in isEnabledFor\n", 253 | " _acquireLock()\n", 254 | " File \"/usr/lib/python3.10/logging/__init__.py\", line 226, in _acquireLock\n", 255 | " _lock.acquire()\n", 256 | "KeyboardInterrupt\n", 257 | "^C\n", 258 | "^C\n" 259 | ] 260 | } 261 | ], 262 | "source": [ 263 | "!pip install PyPDF2 # Install PyPDF2 library for working with PDF files\n", 264 | "!pip install textract # Install textract library for extracting text from various file formats\n", 265 | "!pip install pytesseract # Install pytesseract library for optical character recognition 
(OCR)\n", 266 | "!apt-get install tesseract-ocr # Install Tesseract OCR engine\n", 267 | "!apt-get install libtesseract-dev # Install Tesseract development libraries\n", 268 | "!pip install pytesseract # Install pytesseract Python wrapper for Tesseract OCR\n", 269 | "!pip install textract # Install textract library for text extraction from various file types\n", 270 | "!pip install pdf2image # Install pdf2image library for converting PDFs to images\n", 271 | "!apt-get install poppler-utils # Install poppler-utils for working with PDFs and extracting text" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "source": [ 277 | "#2\n", 278 | "##Import the required libraries" 279 | ], 280 | "metadata": { 281 | "id": "lfpUYL8pJkjZ" 282 | } 283 | }, 284 | { 285 | "cell_type": "code", 286 | "source": [ 287 | "from PyPDF2 import PdfReader # Import PdfReader class from PyPDF2 library for reading PDF files\n", 288 | "import pytesseract # Import pytesseract library for OCR (text extraction from images)\n", 289 | "from PIL import Image # Import Image class from Pillow library for image manipulation\n", 290 | "import textract # Import textract library for text extraction from various file formats" 291 | ], 292 | "metadata": { 293 | "id": "GS2dzYJ7wix4" 294 | }, 295 | "execution_count": 2, 296 | "outputs": [] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "source": [ 301 | "#3\n", 302 | "##Text extraction from Pdf" 303 | ], 304 | "metadata": { 305 | "id": "xIjZ6e5FJz3x" 306 | } 307 | }, 308 | { 309 | "cell_type": "code", 310 | "source": [ 311 | "def extract_text_from_pdf(pdf_path):\n", 312 | " text = \"\" # Initialize an empty string to store the extracted text\n", 313 | " with open(pdf_path, 'rb') as file: # Open the PDF file in read-binary mode\n", 314 | " reader = PdfReader(file) # Create a PDF reader object\n", 315 | " for page in reader.pages: # Iterate through each page in the PDF\n", 316 | " text += page.extract_text() # Extract the text from the current page and append it to the 'text' variable\n", 317 | " return text # Return the extracted text\n" 318 | ], 319 | "metadata": { 320 | "id": "3S4JrVSZwlB_" 321 | }, 322 | "execution_count": 3, 323 | "outputs": [] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "source": [ 328 | "#4\n", 329 | "##Text extraction from Image" 330 | ], 331 | "metadata": { 332 | "id": "Vg_sIshwKI7o" 333 | } 334 | }, 335 | { 336 | "cell_type": "code", 337 | "source": [ 338 | "def extract_text_from_image(image_path):\n", 339 | " image = Image.open(image_path) # Open the image file using PIL's Image class\n", 340 | " text = pytesseract.image_to_string(image) # Use pytesseract to extract text from the image\n", 341 | " return text # Return the extracted text\n" 342 | ], 343 | "metadata": { 344 | "id": "n-yslFs3wr-s" 345 | }, 346 | "execution_count": 4, 347 | "outputs": [] 348 | }, 349 | { 350 | "cell_type": "markdown", 351 | "source": [ 352 | "#5\n", 353 | "##Text extraction from any file type." 
354 | ], 355 | "metadata": { 356 | "id": "80oqkG-zL0uu" 357 | } 358 | }, 359 | { 360 | "cell_type": "code", 361 | "source": [ 362 | "def extract_text_from_other(file_path):\n", 363 | " text = textract.process(file_path) # Use textract to extract text from the file\n", 364 | " return text.decode('utf-8') # Decode the extracted text from bytes to UTF-8 string\n" 365 | ], 366 | "metadata": { 367 | "id": "rrJTj2OpwuUI" 368 | }, 369 | "execution_count": 5, 370 | "outputs": [] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "source": [ 375 | "#6\n", 376 | "##Main function that redirects to one of the functions above based on the file type" 377 | ], 378 | "metadata": { 379 | "id": "oBFUg_5_MSeh" 380 | } 381 | }, 382 | { 383 | "cell_type": "code", 384 | "source": [ 385 | "def extract_text_from_file(file_path):\n", 386 | " if file_path.lower().endswith('.pdf'): # If the file is a PDF\n", 387 | " return extract_text_from_pdf(file_path) # Extract text from the PDF file\n", 388 | " elif file_path.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp')): # If the file is an image\n", 389 | " return extract_text_from_image(file_path) # Extract text from the image file\n", 390 | " else: # For other file types such as CSV, Word documents, etc.\n", 391 | " return extract_text_from_other(file_path) # Extract text using other methods" 392 | ], 393 | "metadata": { 394 | "id": "u97-LaThwwHx" 395 | }, 396 | "execution_count": 6, 397 | "outputs": [] 398 | }, 399 | { 400 | "cell_type": "markdown", 401 | "source": [ 402 | "#7\n", 403 | "##Upload a file and set its path\n", 404 | "##Send the file to the main function above\n", 405 | "##And wait for the output\n", 406 | "Try uploading random images, or open the GitHub repository https://github.com/gurtaransingh/scraping and download a file from the sample files folder." 407 | ], 408 | "metadata": { 409 | "id": "g1OYKBkgMhmP" 410 | } 411 | }, 412 | { 413 | "cell_type": "code", 414 | "source": [ 415 | "file_path = '/content/merged_data.csv'\n", 416 | "extracted_text = extract_text_from_file(file_path)\n", 417 | "print(extracted_text)" 418 | ], 419 | "metadata": { 420 | "colab": { 421 | "base_uri": "https://localhost:8080/" 422 | }, 423 | "id": "vICSlXcjwyTZ", 424 | "outputId": "f3b7f090-eda0-481f-bc2b-a45c5aa135c8" 425 | }, 426 | "execution_count": 7, 427 | "outputs": [ 428 | { 429 | "output_type": "stream", 430 | "name": "stdout", 431 | "text": [ 432 | " \n", 433 | "\n", 434 | "Schedule\n", 435 | "\n", 436 | " \n", 437 | "\n", 438 | "Time & Venue :\n", 439 | "¢ Offline - 09:00 to 13:00 | Monday to Friday | TAN Audi\n", 440 | "¢ Online - 15:00 to 18:00 | Monday to Friday | Zoom\n", 441 | "Note : (1) Be on Time (2) Bring your Laptop (3) Bring power extensions (if possible)\n", 442 | "Brief Schedule:\n", 443 | "¢ Week 1 | 05 - 09 June 2023 | Basics of Python, OOPs and Problem-Solving using Python, Pandas, Numpy, Scipy\n", 444 | "¢ Week 2 | 12 - 16 June 2023 | Machine Learning, Optimization and Statistics, Mini Projects\n", 445 | "¢ Week 3 | 19 - 23 June 2023 | Web Development - Basics (HTML, CSS, Bootstrap, Javascript, ReactJS, Express and MongoDB)\n", 446 | "¢ Week 4 | 26 - 30 June 2023 | Web Development - Advance (NextJS, Django, Hosting, ML Model Deployment, Crud)\n", 447 | "¢ Week 5 | 03 - 07 July 2023 | Deep Learning for Image Processing and Natural Language Processing (NLP)\n", 448 | "¢ Week 6 | 10 - 14 July 2023 | Time Series, Internet of Things (loT), Cloud Services, Generative Al, Valedictory, Certificate distribution\n", 449 | "\n", 450 | " \n", 451 | "\n", 452 | " \n", 453 | "\f\n" 454 | ] 455 | } 456 | ] 457 | }, 458 
| { 459 | "cell_type": "code", 460 | "source": [ 461 | "extracted_text" 462 | ], 463 | "metadata": { 464 | "colab": { 465 | "base_uri": "https://localhost:8080/", 466 | "height": 160 467 | }, 468 | "id": "dl7GrUoZw0uS", 469 | "outputId": "e1187fd2-fa7a-4cae-d876-9b4bae3bc113" 470 | }, 471 | "execution_count": 8, 472 | "outputs": [ 473 | { 474 | "output_type": "execute_result", 475 | "data": { 476 | "text/plain": [ 477 | "' \\n\\nSchedule\\n\\n \\n\\nTime & Venue :\\n¢ Offline - 09:00 to 13:00 | Monday to Friday | TAN Audi\\n¢ Online - 15:00 to 18:00 | Monday to Friday | Zoom\\nNote : (1) Be on Time (2) Bring your Laptop (3) Bring power extensions (if possible)\\nBrief Schedule:\\n¢ Week 1 | 05 - 09 June 2023 | Basics of Python, OOPs and Problem-Solving using Python, Pandas, Numpy, Scipy\\n¢ Week 2 | 12 - 16 June 2023 | Machine Learning, Optimization and Statistics, Mini Projects\\n¢ Week 3 | 19 - 23 June 2023 | Web Development - Basics (HTML, CSS, Bootstrap, Javascript, ReactJS, Express and MongoDB)\\n¢ Week 4 | 26 - 30 June 2023 | Web Development - Advance (NextJS, Django, Hosting, ML Model Deployment, Crud)\\n¢ Week 5 | 03 - 07 July 2023 | Deep Learning for Image Processing and Natural Language Processing (NLP)\\n¢ Week 6 | 10 - 14 July 2023 | Time Series, Internet of Things (loT), Cloud Services, Generative Al, Valedictory, Certificate distribution\\n\\n \\n\\n \\n\\x0c'" 478 | ], 479 | "application/vnd.google.colaboratory.intrinsic+json": { 480 | "type": "string" 481 | } 482 | }, 483 | "metadata": {}, 484 | "execution_count": 8 485 | } 486 | ] 487 | }, 488 | { 489 | "cell_type": "markdown", 490 | "source": [ 491 | "#8\n", 492 | "##Replace \\n and \\t with spaces" 493 | ], 494 | "metadata": { 495 | "id": "UCjMqPlXUQxC" 496 | } 497 | }, 498 | { 499 | "cell_type": "code", 500 | "source": [ 501 | "text = extracted_text.replace('\\n', ' ').replace('\\t', ' ')\n", 502 | "print(text)" 503 | ], 504 | "metadata": { 505 | "colab": { 506 | "base_uri": "https://localhost:8080/" 507 | }, 508 | "id": "G-UGYv23w849", 509 | "outputId": "fad900db-23fd-4588-c7fe-7b9d72e12e87" 510 | }, 511 | "execution_count": 9, 512 | "outputs": [ 513 | { 514 | "output_type": "stream", 515 | "name": "stdout", 516 | "text": [ 517 | " Schedule Time & Venue : ¢ Offline - 09:00 to 13:00 | Monday to Friday | TAN Audi ¢ Online - 15:00 to 18:00 | Monday to Friday | Zoom Note : (1) Be on Time (2) Bring your Laptop (3) Bring power extensions (if possible) Brief Schedule: ¢ Week 1 | 05 - 09 June 2023 | Basics of Python, OOPs and Problem-Solving using Python, Pandas, Numpy, Scipy ¢ Week 2 | 12 - 16 June 2023 | Machine Learning, Optimization and Statistics, Mini Projects ¢ Week 3 | 19 - 23 June 2023 | Web Development - Basics (HTML, CSS, Bootstrap, Javascript, ReactJS, Express and MongoDB) ¢ Week 4 | 26 - 30 June 2023 | Web Development - Advance (NextJS, Django, Hosting, ML Model Deployment, Crud) ¢ Week 5 | 03 - 07 July 2023 | Deep Learning for Image Processing and Natural Language Processing (NLP) ¢ Week 6 | 10 - 14 July 2023 | Time Series, Internet of Things (loT), Cloud Services, Generative Al, Valedictory, Certificate distribution \f\n" 518 | ] 519 | } 520 | ] 521 | }, 522 | { 523 | "cell_type": "markdown", 524 | "source": [ 525 | "#9 - Homework\n", 526 | "a) Search online for a dataset of images containing written text, or use the dataset given below.\n", 527 | "\n", 528 | "- https://www.kaggle.com/datasets/mehaksingal/personal-financial-dataset-for-india\n", 529 | "\n", 530 | "b) Modify the above code to upload a zip file.\n", 531 | "\n", 532 | 
"c) Unzip the file\n", 533 | "\n", 534 | "d) Add loops or any other method to read mutiple files at a time\n", 535 | "\n", 536 | "e) Output must be extracted text stored in any format (list, sets etc)" 537 | ], 538 | "metadata": { 539 | "id": "Px7tP5G0Vbur" 540 | } 541 | }, 542 | { 543 | "cell_type": "code", 544 | "source": [], 545 | "metadata": { 546 | "id": "JfG-0HCrw-g4" 547 | }, 548 | "execution_count": 9, 549 | "outputs": [] 550 | } 551 | ] 552 | } -------------------------------------------------------------------------------- /Web Scraping with Python, 2nd Edition.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gurtaransingh/scraping/784a7be8778f9890a8e4812b64a0d6e383ce5f36/Web Scraping with Python, 2nd Edition.pdf -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | **Web Scraping and Text Extraction** 2 | 3 | 1. PPT : Link 4 | 2. Beautiful Soup and Flipkart Mini Project : Link 5 | 3. Text Extraction Code and Text from Multiple Images Detection Mini Project : Link 6 | 4. Sample File : Open Sample Files Folder above and download required file type 7 | 5. Self Learning Web Scraping Book Resource : Link 8 | 9 | **Star and fork this repository for future help** 10 | 11 | _**For any queries : Link**_ 12 | 13 | **Projects/opportunities for future** 14 | 1. **Share Predictor:** Live share/stock market graph data scraping, implimenting basic machine learning models and then making a web application telling me probability to buy a particular share or not, or suggeting me the right time to buy or sell. 15 | 2. **Handwriting Recogniser:** Train your own OCR model to read handwritings in different languages like English, Hindi, Punjabi etc. Try cursive text, doctor's handwriting, any bill which has something handwritten on it. 16 | 3. **Your Own Google Lens:** Merge many ocr models to extract text, qr codes, images. Compare the scraped data together. Search the most repeated words or images on backend and show most relavent searches. 17 | 4. **Kaggle Dataset Master:** Scrape different websites, get relavent filtered information, upload datasets on kaggle like platform. Ask your friends to upvote you, add comments on dataset. If dataset is good and big upto 10k entries ask RANA SIR to host a kaggle competition with your dataset. 18 | 5. **Cold Emailing:** Write a code - just put a url, and scrape all email IDs. And automatically cold email them. 
-------------------------------------------------------------------------------- /sample files/Schedule.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gurtaransingh/scraping/784a7be8778f9890a8e4812b64a0d6e383ce5f36/sample files/Schedule.png -------------------------------------------------------------------------------- /sample files/merged_data (1).csv: -------------------------------------------------------------------------------- 1 | Name,RAM,ROM,Sold Quantity,Rating 2 | POCO M4 5G,4,64,79482,4.2 3 | APPLE iPhone 13,0,128,227342,4.7 4 | APPLE iPhone 13,0,128,227342,4.7 5 | APPLE iPhone 13,0,128,227342,4.7 6 | POCO M4 5G,4,64,79482,4.2 7 | vivo T2 5G,8,128,4978,4.4 8 | APPLE iPhone 13,0,128,227342,4.7 9 | SAMSUNG Galaxy F14 5G,6,128,17115,4.2 10 | Infinix Note 12 Pro 5G,8,128,28073,4.1 11 | Infinix Note 12 Pro 5G,8,128,28073,4.1 12 | SAMSUNG Galaxy F23 5G,6,128,195602,4.3 13 | POCO X5 Pro 5G,8,256,8229,4.3 14 | POCO M4 5G,4,64,79482,4.2 15 | realme 10 Pro 5G,8,128,19487,4.3 16 | realme 10 Pro 5G,6,128,26570,4.3 17 | realme GT 2,8,128,10142,4.3 18 | realme GT 2,8,128,10142,4.3 19 | Google Pixel 6a,6,128,47692,4.3 20 | Google Pixel 6a,6,128,47692,4.3 21 | realme 10 Pro 5G,6,128,26570,4.3 22 | realme 10 Pro 5G,8,128,19487,4.3 23 | realme 10 Pro 5G,6,128,26570,4.3 24 | realme 10 Pro 5G,8,128,19487,4.3 25 | SAMSUNG Galaxy F23 5G,6,128,195602,4.3 26 | -------------------------------------------------------------------------------- /sample files/prescription.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/gurtaransingh/scraping/784a7be8778f9890a8e4812b64a0d6e383ce5f36/sample files/prescription.pdf --------------------------------------------------------------------------------
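The homework in Text_Extraction.ipynb (upload a zip, unzip it, read multiple files, store the extracted text) stops at the description. Below is a minimal sketch of one possible approach, assuming it runs in the same Colab session so that `extract_text_from_file` from the notebook is already defined and `google.colab` is importable; the folder name `unzipped` is arbitrary:

```python
# Minimal sketch for the homework: upload a zip, unzip it, and run the
# notebook's extract_text_from_file() over every file inside.
# Assumptions: this runs in the same Colab session (so extract_text_from_file
# is already defined and google.colab is importable); "unzipped" is arbitrary.
import os
import zipfile

from google.colab import files  # Colab-only upload helper

uploaded = files.upload()              # choose a .zip from your machine
zip_name = next(iter(uploaded))        # filename of the uploaded archive

with zipfile.ZipFile(zip_name) as archive:
    archive.extractall("unzipped")     # unpack everything into ./unzipped

extracted = {}                         # filename -> extracted text (homework item e)
for root, _dirs, names in os.walk("unzipped"):
    for name in names:
        path = os.path.join(root, name)
        try:
            extracted[name] = extract_text_from_file(path)  # defined earlier in the notebook
        except Exception as err:       # don't let one unsupported file stop the loop
            extracted[name] = f"<extraction failed: {err}>"

for name, text in extracted.items():
    print(f"{name}: {len(text)} characters extracted")
```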