# %% [markdown]
# # Web Scraping From Daraz

# %%
# Using webdriver_manager to manage the Chrome Driver for Selenium
# (shell commands from the notebook, kept as comments because they are not Python)
# !pip install selenium
# !pip install webdriver_manager

# %%
import re
from urllib.parse import quote_plus

import pandas as pd
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import NoSuchElementException

# %%
# Lookup from a CSS class string found inside the rating widget to a numeric
# value in [0, 1]. A product's rating is the sum of the values of every such
# class string found in the widget's HTML.
# NOTE(review): presumably one entry per fill level of a single star icon, and
# the class names are site-generated, so they may change - confirm against the page.
stars_value = {
    'c3dn4k c1dtTC' : 0,
    'c3dn4k c1Zozd' : 0.1,
    'c3dn4k cbDGcO' : 0.2,
    'c3dn4k c3fsPU' : 0.3,
    'c3dn4k c1e2gb' : 0.4,
    'c3dn4k c3An30' : 0.5,
    'c3dn4k c3DcGB' : 0.6,
    'c3dn4k c1wCjy' : 0.7,
    'c3dn4k c17YMy' : 0.8,
    'c3dn4k cF1vkb' : 0.9,
    'c3dn4k c3EEAg' : 1
}

# Column order of every scraped row; extract_row() must return values in this order.
WEB_DATA_COLUMNS = ['Name', 'Daraz Mall?', 'Original Price',
                    'Discounted Price', 'Rating', 'Reviews', 'Seller Country']

# Start from an empty frame so later cells can reference Web_Data even before a scrape.
Web_Data = pd.DataFrame([], columns=WEB_DATA_COLUMNS)


def parse_price(label):
    """Turn a price label into an int.

    Drops the first three characters of ``label`` and any thousands separators
    before converting.
    NOTE(review): the three dropped characters are presumably a currency
    prefix such as "Rs." - confirm against the page.

    Raises:
        ValueError: if the remaining text is not an integer.
    """
    return int(label[3:].replace(',', ''))


def extract_row(card):
    """Read one product card element and return its values in WEB_DATA_COLUMNS order.

    Name, shown price, seller country and the mall badge container are
    required: a missing element raises NoSuchElementException. Rating, review
    count and the second price element are optional and fall back to 0.
    """
    name = card.find_element_by_class_name('c16H9d').text
    shown_price = parse_price(card.find_element_by_class_name('c3gUW0').text)
    country = card.find_element_by_class_name('c2i43-').text

    # Rating: add up the value of every known star class found in the widget HTML.
    try:
        stars_html = card.find_element_by_class_name('c15YQ9').get_attribute('innerHTML')
    except NoSuchElementException:
        rating = 0
    else:
        rating = sum(stars_value[css] for css in re.findall('(c3dn4k c.+?)"', stars_html))

    # The badge container always exists; it only has inner HTML for mall products.
    mall_html = card.find_element_by_class_name('c3vCyH').get_attribute('innerHTML')
    daraz_mall = 'Yes' if len(mall_html) > 0 else 'No'

    # The review count label is wrapped in one leading and one trailing character.
    try:
        reviews = int(card.find_element_by_class_name('c3XbGJ').text[1:-1])
    except NoSuchElementException:
        reviews = 0

    try:
        second_price = parse_price(card.find_element_by_class_name('c1-B2V').text)
    except NoSuchElementException:
        second_price = 0

    if second_price == 0:
        # No second price element: the shown price is the original price.
        return [name, daraz_mall, shown_price, 0, rating, reviews, country]
    # Second price present: it is stored as the original price and the shown
    # price as the discounted one.
    return [name, daraz_mall, second_price, shown_price, rating, reviews, country]


# %%
search_term = input('Enter Product Name : ')
# quote_plus turns spaces into '+' and also escapes characters such as '&' or
# '#' that would otherwise corrupt the query string.
daraz = 'https://www.daraz.pk/catalog/?q={}'.format(quote_plus(search_term))
print('URL : ',daraz,'\n')

no_of_product = int(input('Enter no.to take Top Level Averages : '))
if no_of_product < 1:
    # Zero or negative values would make the averages slice select the wrong rows.
    raise ValueError('Number of products to average must be at least 1')

# Selenium 3 style calls, matching the selenium 3.141.0 install this notebook was run with.
driver = webdriver.Chrome(ChromeDriverManager().install())
try:
    driver.get(daraz)

    # First space-separated token of the result-count banner.
    total_sellers = driver.find_element_by_class_name('c1DXz4').text.split(' ')[0]

    rows = []
    for card in driver.find_elements_by_class_name('c2prKC'):
        print('*', end = '')  # one progress mark per product card
        rows.append(extract_row(card))
finally:
    # Always close the browser, even when scraping fails part-way.
    driver.quit()

# Build the frame in one go so a re-run never keeps rows from a previous search.
Web_Data = pd.DataFrame(rows, columns=WEB_DATA_COLUMNS)
print('Web Scraping Ended :)')

# %%
# Averages are taken over the first `no_of_product` scraped rows only.
top_rows = Web_Data.head(no_of_product)
print("Total Seller :",total_sellers)
print("Avg Price :", top_rows['Original Price'].mean())
print("Avg Reviews :", top_rows['Reviews'].mean())
print("Avg Rating :", top_rows['Rating'].mean())
print('Total Product Present in DataFrame : ', Web_Data.shape[0])
\n", 209 | "\n", 222 | "\n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | "
NameDaraz Mall?Original PriceDiscounted PriceRatingReviewsSeller Country
0Samsung Galaxy A32 - Display 6.4 - Multi Quad ...Yes4449904.641Pakistan
1Samsung Galaxy A32 - Display 6.4 - Multi Quad ...Yes4599904.942Pakistan
2Samsung Galaxy A32 || 6GB Ram 128GB Rom || 500...No4599905.011Pakistan
3Samsung Galaxy A32 - 6.4\" Inch Display - 6GB R...No4499904.98Pakistan
4Samsung Galaxy A32 - RAM 6GB - ROM 128 GB - Di...No4499904.745Pakistan
\n", 288 | "
" 289 | ], 290 | "text/plain": [ 291 | " Name Daraz Mall? \\\n", 292 | "0 Samsung Galaxy A32 - Display 6.4 - Multi Quad ... Yes \n", 293 | "1 Samsung Galaxy A32 - Display 6.4 - Multi Quad ... Yes \n", 294 | "2 Samsung Galaxy A32 || 6GB Ram 128GB Rom || 500... No \n", 295 | "3 Samsung Galaxy A32 - 6.4\" Inch Display - 6GB R... No \n", 296 | "4 Samsung Galaxy A32 - RAM 6GB - ROM 128 GB - Di... No \n", 297 | "\n", 298 | " Original Price Discounted Price Rating Reviews Seller Country \n", 299 | "0 44499 0 4.6 41 Pakistan \n", 300 | "1 45999 0 4.9 42 Pakistan \n", 301 | "2 45999 0 5.0 11 Pakistan \n", 302 | "3 44999 0 4.9 8 Pakistan \n", 303 | "4 44999 0 4.7 45 Pakistan " 304 | ] 305 | }, 306 | "execution_count": 6, 307 | "metadata": {}, 308 | "output_type": "execute_result" 309 | } 310 | ], 311 | "source": [ 312 | "Web_Data.head(5)" 313 | ] 314 | } 315 | ], 316 | "metadata": { 317 | "kernelspec": { 318 | "display_name": "Python 3 (ipykernel)", 319 | "language": "python", 320 | "name": "python3" 321 | }, 322 | "language_info": { 323 | "codemirror_mode": { 324 | "name": "ipython", 325 | "version": 3 326 | }, 327 | "file_extension": ".py", 328 | "mimetype": "text/x-python", 329 | "name": "python", 330 | "nbconvert_exporter": "python", 331 | "pygments_lexer": "ipython3", 332 | "version": "3.8.10" 333 | } 334 | }, 335 | "nbformat": 4, 336 | "nbformat_minor": 5 337 | } 338 | --------------------------------------------------------------------------------