# Web Scraper.ipynb -- notebook cells written as a percent-format script:
# every `# %%` marker starts one cell, and IPython magics are commented out.

# %% [markdown]
# # Web Scraping From Daraz

# %%
# Using webdriver_manager to manage the Chrome Driver for Selenium.
# `%pip` (instead of `!pip`) installs into the environment of the running kernel.
# %pip install selenium
# %pip install webdriver_manager

# %%
import re
from urllib.parse import quote_plus

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# %% [markdown]
# ## Configuration

# %%
# Maps a CSS class pair found inside the rating element to the amount it adds
# to a listing's rating (presumably one entry per star icon -- the values are
# summed per listing, see `_rating_from_html`).
stars_value = {
    'c3dn4k c1dtTC' : 0,
    'c3dn4k c1Zozd' : 0.1,
    'c3dn4k cbDGcO' : 0.2,
    'c3dn4k c3fsPU' : 0.3,
    'c3dn4k c1e2gb' : 0.4,
    'c3dn4k c3An30' : 0.5,
    'c3dn4k c3DcGB' : 0.6,
    'c3dn4k c1wCjy' : 0.7,
    'c3dn4k c17YMy' : 0.8,
    'c3dn4k cF1vkb' : 0.9,
    'c3dn4k c3EEAg' : 1
}

# Column order of the result table; `_scrape_listing` returns rows in this order.
COLUMNS = ['Name', 'Daraz Mall?', 'Original Price',
           'Discounted Price', 'Rating' ,'Reviews', 'Seller Country']

# Catalog search endpoint; `{}` receives the URL-encoded search term.
SEARCH_URL = 'https://www.daraz.pk/catalog/?q={}'

# %% [markdown]
# ## Helpers

# %%
def _rating_from_html(stars_html):
    """Return the rating encoded in the inner HTML of a rating element.

    Every `c3dn4k c...` class pair found in ``stars_html`` is looked up in
    ``stars_value`` and the values are added together. Returns 0 when no
    class pair matches. Raises ``KeyError`` for a class pair that is not in
    ``stars_value`` (same as the original inline loop).
    """
    return sum(stars_value[css] for css in re.findall('(c3dn4k c.+?)"', stars_html))


def _price(label):
    """Convert a price label to ``int``.

    Drops the first three characters (presumably the currency prefix --
    confirm against the live page) and the thousands separators.
    Raises ``ValueError`` when the remainder is not a number.
    """
    return int(label[3:].replace(',', ''))


def _scrape_listing(card):
    """Extract one result row from a product card WebElement.

    Returns a list ordered like ``COLUMNS``. Name, price, country and the
    mall badge container are required: a missing element raises
    ``NoSuchElementException``. Rating, review count and the second price
    are optional and fall back to 0 when their element is absent.
    """
    name = card.find_element(By.CLASS_NAME, 'c16H9d').text
    shown_price = _price(card.find_element(By.CLASS_NAME, 'c3gUW0').text)
    country = card.find_element(By.CLASS_NAME, 'c2i43-').text

    try:
        rating = _rating_from_html(
            card.find_element(By.CLASS_NAME, 'c15YQ9').get_attribute('innerHTML'))
    except NoSuchElementException:
        rating = 0

    # A non-empty badge container marks the listing as a Daraz Mall item.
    mall_html = card.find_element(By.CLASS_NAME, 'c3vCyH').get_attribute('innerHTML')
    daraz_mall = 'Yes' if len(mall_html) > 0 else 'No'

    try:
        # [1:-1] strips the first and last character around the count.
        reviews = int(card.find_element(By.CLASS_NAME, 'c3XbGJ').text[1:-1])
    except NoSuchElementException:
        reviews = 0

    try:
        second_price = _price(card.find_element(By.CLASS_NAME, 'c1-B2V').text)
    except NoSuchElementException:
        second_price = 0

    # When the second price element exists it fills 'Original Price' and the
    # shown price becomes 'Discounted Price' (presumably the second element
    # is the pre-discount price -- confirm against the page markup).
    # Otherwise the shown price is the original and the discount is 0.
    if second_price == 0:
        original_price, discounted_price = shown_price, 0
    else:
        original_price, discounted_price = second_price, shown_price

    return [name, daraz_mall, original_price, discounted_price, rating, reviews, country]

# %% [markdown]
# ## Scrape the search results

# %%
# quote_plus turns spaces into '+' and also escapes characters such as '&' or
# '#' that would otherwise break the query string.
product = quote_plus(input('Enter Product Name : '))
daraz = SEARCH_URL.format(product)
print('URL : ',daraz,'\n')

no_of_product = int(input('Enter no.to take Top Level Averages : '))

# NOTE(review): the driver path is passed positionally, as in the original;
# newer Selenium releases expect a Service object instead -- confirm against
# the installed version.
driver = webdriver.Chrome(ChromeDriverManager().install())

records = []
try:
    driver.get(daraz)

    # First space-separated token of the element's text.
    total_sellers = driver.find_element(By.CLASS_NAME, 'c1DXz4').text.split(' ')[0]

    for card in driver.find_elements(By.CLASS_NAME, 'c2prKC'):
        print('*', end = '')
        records.append(_scrape_listing(card))
finally:
    # Always close the browser, even when a listing fails to parse.
    driver.quit()

# Built in one go from this run's records, so re-running the cell never keeps
# rows from an earlier search.
Web_Data = pd.DataFrame(records, columns=COLUMNS)

print('Web Scraping Ended :)')

# %% [markdown]
# ## Summary of the top results

# %%
# Averages are taken over the first `no_of_product` rows only.
top_rows = Web_Data.head(no_of_product)

print("Total Seller :",total_sellers)
print("Avg Price :", top_rows['Original Price'].mean())
print("Avg Reviews :", top_rows['Reviews'].mean())
print("Avg Rating :", top_rows['Rating'].mean())
print('Total Product Present in DataFrame : ', Web_Data.shape[0])
\n", 226 | " | Name | \n", 227 | "Daraz Mall? | \n", 228 | "Original Price | \n", 229 | "Discounted Price | \n", 230 | "Rating | \n", 231 | "Reviews | \n", 232 | "Seller Country | \n", 233 | "
---|---|---|---|---|---|---|---|
0 | \n", 238 | "Samsung Galaxy A32 - Display 6.4 - Multi Quad ... | \n", 239 | "Yes | \n", 240 | "44499 | \n", 241 | "0 | \n", 242 | "4.6 | \n", 243 | "41 | \n", 244 | "Pakistan | \n", 245 | "
1 | \n", 248 | "Samsung Galaxy A32 - Display 6.4 - Multi Quad ... | \n", 249 | "Yes | \n", 250 | "45999 | \n", 251 | "0 | \n", 252 | "4.9 | \n", 253 | "42 | \n", 254 | "Pakistan | \n", 255 | "
2 | \n", 258 | "Samsung Galaxy A32 || 6GB Ram 128GB Rom || 500... | \n", 259 | "No | \n", 260 | "45999 | \n", 261 | "0 | \n", 262 | "5.0 | \n", 263 | "11 | \n", 264 | "Pakistan | \n", 265 | "
3 | \n", 268 | "Samsung Galaxy A32 - 6.4\" Inch Display - 6GB R... | \n", 269 | "No | \n", 270 | "44999 | \n", 271 | "0 | \n", 272 | "4.9 | \n", 273 | "8 | \n", 274 | "Pakistan | \n", 275 | "
4 | \n", 278 | "Samsung Galaxy A32 - RAM 6GB - ROM 128 GB - Di... | \n", 279 | "No | \n", 280 | "44999 | \n", 281 | "0 | \n", 282 | "4.7 | \n", 283 | "45 | \n", 284 | "Pakistan | \n", 285 | "