├── WEB SCRAPING.jpg ├── Web Scraping with BeautifulSoup.ipynb ├── Web Scraping with BeautifulSoup.py ├── readme.md ├── requirement.txt ├── scrap wikipedia.png ├── scraped_data.json └── web_scraping_command_line_tool.py /WEB SCRAPING.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spider-yamet/web-scraping-with-python/52056b1890c84fbdedb8abd8914b01d949b68f54/WEB SCRAPING.jpg -------------------------------------------------------------------------------- /Web Scraping with BeautifulSoup.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "# Requirements\n", 10 | "# pip3 install requests\n", 11 | "# pip3 install bs4" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "## Basic fundamentals of web scraping" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 49, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "name": "stdout", 28 | "output_type": "stream", 29 | "text": [ 30 | "this is with html tags : Easy Python – A programming language of revolution\n", 31 | "this is without html tags: Easy Python\n", 32 | "Skip to content\n" 33 | ] 34 | } 35 | ], 36 | "source": [ 37 | "# Import these two modules: bs4 makes it easy to select HTML tags\n", 38 | "from bs4 import BeautifulSoup\n", 39 | "# The requests module fetches pages; some people use urllib, but I prefer requests because it is simpler.\n", 40 | "import requests\n", 41 | "\n", 42 | "# I use my own blog URL here; you can change it.\n", 43 | "url=\"https://getpython.wordpress.com/\"\n", 44 | "\n", 45 | "# requests.get() downloads the data from the given URL\n", 46 | "source=requests.get(url)\n", 47 | "\n", 48 | "# BeautifulSoup parses the HTML structure out of the requests response (create your soup)\n", 49 | "soup=BeautifulSoup(source.text,'html')\n", 50 | "\n", 51 | "# find() returns a single element; if there is more than one match, it always returns the first one.\n", 52 | "title=soup.find('title') # place the HTML tag you want to find in the parentheses.\n", 53 | "print(\"this is with html tags :\",title)\n", 54 | "\n", 55 | "qwery=soup.find('h1') # here I find the first h1 tag on my website using find().\n", 56 | "\n", 57 | "# use .text to extract only the text, without any HTML tags\n", 58 | "print(\"this is without html tags:\",qwery.text) \n", 59 | "\n", 60 | "\n", 61 | "links=soup.find('a') # I extracted a link using the \"a\" tag\n", 62 | "print(links)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "## Extract data from inner HTML" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 41, 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "name": "stdout", 79 | "output_type": "stream", 80 | "text": [ 81 | "#content\n" 82 | ] 83 | } 84 | ], 85 | "source": [ 86 | "# here I extracted the href attribute from the anchor tag.\n", 87 | "print(links['href']) " 88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 42, 93 | "metadata": {}, 94 | "outputs": [ 95 | { 96 | "name": "stdout", 97 | "output_type": "stream", 98 | "text": [ 99 | "['screen-reader-text', 'skip-link']\n" 100 | ] 101 | } 102 | ], 103 | "source": [ 104 | "# similarly, I got the class attribute from the anchor tag\n", 105 | "print(links['class'])" 106 | ] 107 | }, 108 | { 109 | "cell_type": 
"markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "## findall operation in Bs4" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 51, 118 | "metadata": {}, 119 | "outputs": [ 120 | { 121 | "name": "stdout", 122 | "output_type": "stream", 123 | "text": [ 124 | "total links in my website : 37\n", 125 | "\n", 126 | "Skip to content\n", 127 | "\n", 128 | "
\n", 129 | "
\n", 130 | "Search\n", 131 | "Easy Python\n", 132 | "Home\n", 133 | "Contact\n" 134 | ] 135 | } 136 | ], 137 | "source": [ 138 | "# findall function is used to fetch all tags at a single time.\n", 139 | "many_link=soup.find_all('a') # here i extracted all the anchor tags of my website\n", 140 | "total_links=len(many_link) # len function is use to calculate length of your array\n", 141 | "print(\"total links in my website :\",total_links)\n", 142 | "print()\n", 143 | "for i in many_link[:6]: # here i use slicing to fetch only first 6 links from rest of them.\n", 144 | " print(i)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 54, 150 | "metadata": {}, 151 | "outputs": [ 152 | { 153 | "name": "stdout", 154 | "output_type": "stream", 155 | "text": [ 156 | "\n", 157 | "
\n", 158 | "
\n", 159 | "\n", 160 | "href is : https://getpython.wordpress.com/\n" 161 | ] 162 | } 163 | ], 164 | "source": [ 165 | "second_link=many_link[1] #here i fetch second link which place on 1 index number in many_links.\n", 166 | "print(second_link)\n", 167 | "print()\n", 168 | "print(\"href is :\",second_link['href']) #only href link is extracted from ancor tag\n", 169 | "\n" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 59, 175 | "metadata": {}, 176 | "outputs": [ 177 | { 178 | "name": "stdout", 179 | "output_type": "stream", 180 | "text": [ 181 | "
\n", 182 | "\n", 183 | "['cover']\n", 184 | "\n", 185 | "\n", 186 | "class name of div is : cover\n" 187 | ] 188 | } 189 | ], 190 | "source": [ 191 | "# select div tag from second link\n", 192 | "nested_div=second_link.find('div')\n", 193 | "# As you can see div element extarcted , it also have inner elements\n", 194 | "print(nested_div)\n", 195 | "print()\n", 196 | "#here i extracted class element from div but it give us in the form of list\n", 197 | "z=(nested_div['class'])\n", 198 | "print(z)\n", 199 | "print(type(z))\n", 200 | "print()\n", 201 | "# \" \" .join () method use to convert list type into string type\n", 202 | "print(\"class name of div is :\",\" \".join(nested_div['class'])) " 203 | ] 204 | }, 205 | { 206 | "cell_type": "markdown", 207 | "metadata": {}, 208 | "source": [ 209 | "## scrap data from wikipedia" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 60, 215 | "metadata": {}, 216 | "outputs": [ 217 | { 218 | "name": "stdout", 219 | "output_type": "stream", 220 | "text": [ 221 | "World War II - Wikipedia\n" 222 | ] 223 | } 224 | ], 225 | "source": [ 226 | "wiki=requests.get(\"https://en.wikipedia.org/wiki/World_War_II\")\n", 227 | "soup=BeautifulSoup(wiki.text,'html')\n", 228 | "print(soup.find('title'))\n" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "### find html tags with classes" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 65, 241 | "metadata": {}, 242 | "outputs": [ 243 | { 244 | "name": "stdout", 245 | "output_type": "stream", 246 | "text": [ 247 | "Contents\n", 248 | "\n", 249 | "1 Chronology\n", 250 | "2 Background\n", 251 | "\n", 252 | "2.1 Europe\n", 253 | "2.2 Asia\n", 254 | "\n", 255 | "\n", 256 | "3 Pre-war events\n", 257 | "\n", 258 | "3.1 Italian invasion of Ethiopia (1935)\n", 259 | "3.2 Spanish Civil War (1936–1939)\n", 260 | "3.3 Japanese invasion of China (1937)\n", 261 | "3.4 Soviet–Japanese border conflicts\n", 262 | "3.5 European occupations and agreements\n", 263 | "\n", 264 | "\n", 265 | "4 Course of the war\n", 266 | "\n", 267 | "4.1 War breaks out in Europe (1939–40)\n", 268 | "4.2 Western Europe (1940–41)\n", 269 | "4.3 Mediterranean (1940–41)\n", 270 | "4.4 Axis attack on the Soviet Union (1941)\n", 271 | "4.5 War breaks out in the Pacific (1941)\n", 272 | "4.6 Axis advance stalls (1942–43)\n", 273 | "\n", 274 | "4.6.1 Pacific (1942–43)\n", 275 | "4.6.2 Eastern Front (1942–43)\n", 276 | "4.6.3 Western Europe/Atlantic and Mediterranean (1942–43)\n", 277 | "\n", 278 | "\n", 279 | "4.7 Allies gain momentum (1943–44)\n", 280 | "4.8 Allies close in (1944)\n", 281 | "4.9 Axis collapse, Allied victory (1944–45)\n", 282 | "\n", 283 | "\n", 284 | "5 Aftermath\n", 285 | "6 Impact\n", 286 | "\n", 287 | "6.1 Casualties and war crimes\n", 288 | "6.2 Genocide, concentration camps, and slave labour\n", 289 | "6.3 Occupation\n", 290 | "6.4 Home fronts and production\n", 291 | "6.5 Advances in technology and warfare\n", 292 | "\n", 293 | "\n", 294 | "7 See also\n", 295 | "8 Notes\n", 296 | "9 Citations\n", 297 | "10 References\n", 298 | "11 External links\n", 299 | "\n", 300 | "\n" 301 | ] 302 | } 303 | ], 304 | "source": [ 305 | "ww2_contents=soup.find_all(\"div\",class_='toc')\n", 306 | "for i in ww2_contents:\n", 307 | " print(i.text)" 308 | ] 309 | }, 310 | { 311 | "cell_type": "code", 312 | "execution_count": 68, 313 | "metadata": {}, 314 | "outputs": [ 315 | { 316 | "name": "stdout", 317 | "output_type": "stream", 318 | "text": [ 319 | 
"World War II(clockwise from top left)\n", 320 | "Chinese forces in the Battle of Wanjialing\n", 321 | "Australian 25-pounder guns during the First Battle of El Alamein\n", 322 | "German Stuka dive bombers on the Eastern Front in December 1943\n", 323 | "American naval force in the Lingayen Gulf\n", 324 | "Wilhelm Keitel signing the German Instrument of Surrender\n", 325 | "Soviet troops in the Battle of Stalingrad\n", 326 | "Date1 September 1939 – 2 September 1945 (1939-09-01 – 1945-09-02)(6 years and 1 day)[a]LocationEurope, Pacific, Atlantic, South-East Asia, China, Middle East, Mediterranean, North Africa, Horn of Africa, Australia, briefly North and South AmericaResult\n", 327 | "Allied victory\n", 328 | "Collapse of Nazi Germany\n", 329 | "Fall of the Japanese and Italian Empires\n", 330 | "Beginning of the Nuclear Age\n", 331 | "Dissolution of the League of Nations\n", 332 | "Creation of the United Nations\n", 333 | "Emergence of the United States and the Soviet Union as rival superpowers\n", 334 | "Beginning of the Cold War (more...)Participants\n", 335 | "Allies\n", 336 | "AxisCommanders and leaders\n", 337 | "Main Allied leaders\n", 338 | " Joseph Stalin\n", 339 | " Franklin D. Roosevelt\n", 340 | " Winston Churchill\n", 341 | " Chiang Kai-shek\n", 342 | "\n", 343 | "Main Axis leaders\n", 344 | " Adolf Hitler\n", 345 | " Hirohito\n", 346 | " Benito Mussolini\n", 347 | "Casualties and losses\n", 348 | "\n", 349 | "Military dead:\n", 350 | "Over 16,000,000\n", 351 | "Civilian dead:\n", 352 | "Over 45,000,000\n", 353 | "Total dead:\n", 354 | "Over 61,000,000\n", 355 | "(1937–1945)\n", 356 | "...further details\n", 357 | "\n", 358 | "\n", 359 | "Military dead:\n", 360 | "Over 8,000,000\n", 361 | "Civilian dead:\n", 362 | "Over 4,000,000\n", 363 | "Total dead:\n", 364 | "Over 12,000,000\n", 365 | "(1937–1945)\n", 366 | "...further details\n", 367 | "\n" 368 | ] 369 | } 370 | ], 371 | "source": [ 372 | "overview=soup.find_all('table',class_='infobox vevent')\n", 373 | "for z in overview:\n", 374 | " print(z.text)\n", 375 | " " 376 | ] 377 | } 378 | ], 379 | "metadata": { 380 | "kernelspec": { 381 | "display_name": "Python 3", 382 | "language": "python", 383 | "name": "python3" 384 | }, 385 | "language_info": { 386 | "codemirror_mode": { 387 | "name": "ipython", 388 | "version": 3 389 | }, 390 | "file_extension": ".py", 391 | "mimetype": "text/x-python", 392 | "name": "python", 393 | "nbconvert_exporter": "python", 394 | "pygments_lexer": "ipython3", 395 | "version": "3.5.2" 396 | } 397 | }, 398 | "nbformat": 4, 399 | "nbformat_minor": 2 400 | } 401 | -------------------------------------------------------------------------------- /Web Scraping with BeautifulSoup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | #Requirements 5 | #pip3 install requests 6 | #pip3 install bs4 7 | 8 | #run in the browser also what are you doing with the help of chrome driver 9 | 10 | # ## Basic fundamentals of web scraping 11 | 12 | # import these two modules bs4 for selecting HTML tags easily 13 | from bs4 import BeautifulSoup 14 | # requests module is easy to operate some people use urllib but I prefer this one because it is easy to use. 15 | import requests 16 | from selenium import webdriver 17 | 18 | # I put here my own blog url ,you can change it. 
19 | url="https://getpython.wordpress.com/" 20 | BASE_URL = "https://getpython.wordpress.com/" 21 | # requests.get() downloads the data from the given URL 22 | source=requests.get(url) 23 | 24 | # Optional Selenium helpers for browser-based scraping (they need a local chromedriver and are not used below) 25 | def get_chrome_web_driver(options): 26 | return webdriver.Chrome("./chromedriver", options=options) 27 | 28 | 29 | def get_web_driver_options(): 30 | return webdriver.ChromeOptions() 31 | 32 | 33 | def set_ignore_certificate_error(options): 34 | options.add_argument('--ignore-certificate-errors') 35 | 36 | 37 | def set_browser_as_incognito(options): 38 | options.add_argument('--incognito') 39 | 40 | # BeautifulSoup parses the HTML structure out of the requests response (create your soup) 41 | soup=BeautifulSoup(source.text,'html') 42 | 43 | # find() returns a single element; if there is more than one match, it always returns the first one. 44 | title=soup.find('title') # place the HTML tag you want to find in the parentheses. 45 | print("this is with html tags :",title) 46 | 47 | qwery=soup.find('h1') # here I find the first h1 tag on my website using find(). 48 | 49 | # use .text to extract only the text, without any HTML tags 50 | print("this is without html tags:",qwery.text) 51 | 52 | 53 | links=soup.find('a') # I extracted a link using the "a" tag 54 | print(links) 55 | 56 | 57 | # ## Extract data from inner HTML 58 | 59 | # here I extracted the href attribute from the anchor tag. 60 | print(links['href']) 61 | 62 | ## or another way: 63 | ## extract the href attribute from every anchor tag on the page 64 | for a in soup.find_all('a', href=True): 65 | print(a['href']) 66 | 67 | for i in links: # note: links is a single <a> tag, so this iterates over its children 68 | print(i.text) 69 | 70 | # similarly, I got the class attribute from the anchor tag 71 | print(links['class']) 72 | 73 | 74 | # ## findall operation in Bs4 75 | 76 | # find_all() fetches all matching tags at once. 77 | many_link=soup.find_all('a') # here I extracted all the anchor tags from my website 78 | total_links=len(many_link) # len() gives the number of links found 79 | print("total links in my website :",total_links) 80 | print() 81 | for i in many_link[:6]: # here I use slicing to fetch only the first 6 links. 82 | print(i) 83 | 84 | second_link=many_link[1] # here I fetch the second link, which sits at index 1 in many_link. 
85 | print(second_link) 86 | print() 87 | print("href is :",second_link['href']) # only the href attribute is extracted from the anchor tag 88 | 89 | 90 | # select a div tag inside the second link 91 | nested_div=second_link.find('div') 92 | # As you can see, the div element is extracted along with its inner elements 93 | print(nested_div) 94 | print() 95 | # here I extracted the class attribute from the div, but it comes back as a list 96 | z=(nested_div['class']) 97 | print(z) 98 | print(type(z)) 99 | print() 100 | # " ".join() converts the list into a string 101 | print("class name of div is :"," ".join(nested_div['class'])) 102 | 103 | 104 | # ## Scrape data from Wikipedia 105 | 106 | wiki=requests.get("https://en.wikipedia.org/wiki/World_War_II") 107 | soup=BeautifulSoup(wiki.text,'html') 108 | print(soup.find('title')) 109 | 110 | 111 | # ### Find HTML tags by class 112 | 113 | ww2_contents=soup.find_all("div",class_='toc') 114 | for i in ww2_contents: 115 | print(i.text) 116 | 117 | 118 | overview=soup.find_all('table',class_='infobox vevent') 119 | for z in overview: 120 | print(z.text) 121 | 122 | images=soup.find_all('img') 123 | 124 | images # a bare expression only displays output in a notebook; in a script use print() 125 | ## or 126 | print(images) 127 | 128 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- ![web scraping with python](https://github.com/rajat4665/web-scraping-with-python/blob/master/WEB%20SCRAPING.jpg)

## Introduction

Web scraping (also called web harvesting or web data extraction) is the practice of extracting data from websites by parsing their HTML structure. In this post I explain the basic fundamentals of web scraping with Python and demonstrate them with two libraries: requests and BeautifulSoup.

What you will learn from this post:

* Basic fundamentals of web scraping
* Extracting data from inner HTML (attributes such as `href` and `class`)
* The `find` and `find_all` operations in bs4
* Scraping data from Wikipedia (finding HTML tags by class)

Requirements:

* Python 3
* requests
* beautifulsoup4 (bs4)

### Install required dependencies

* `pip3 install requests`
* `pip3 install bs4`
* or install everything pinned in `requirement.txt` with `pip3 install -r requirement.txt` (this also covers the command-line tool)

A minimal usage example follows this section.
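Below is a minimal sketch of the workflow used throughout this repository (the notebook passes `'html'` as the parser name; `'html.parser'` used here is the explicit built-in parser and needs no extra install):

```python
# Minimal sketch: download a page, then pull out its title and every link target.
import requests
from bs4 import BeautifulSoup

url = "https://getpython.wordpress.com/"          # demo blog used in the notebook; any page works
source = requests.get(url)                        # fetch the raw HTML
soup = BeautifulSoup(source.text, "html.parser")  # parse it into a navigable tree

print(soup.find("title").text)                    # text of the first <title> tag

for link in soup.find_all("a", href=True):        # every anchor tag that carries an href attribute
    print(link["href"])
```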

### How to run this code

* Clone or download this repository and install the dependencies listed above.
* Open `Web Scraping with BeautifulSoup.ipynb` in Jupyter Notebook to step through the examples interactively, or run the script version with `python3 "Web Scraping with BeautifulSoup.py"`.
* Run `python3 web_scraping_command_line_tool.py` for the interactive command-line scraper; its results are saved to `scraped_data.json` (see the sketch just after this list).
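The JSON written by the command-line tool can be inspected directly. A small sketch (the field names follow what `web_scraping_command_line_tool.py` stores under its `scraped_data` key):

```python
# Sketch: read scraped_data.json produced by the command-line tool and summarise it.
import json

with open("scraped_data.json") as fh:
    db = json.load(fh)

for alias, record in db.get("scraped_data", {}).items():
    # each record keeps the page title, domain, scrape timestamp and the harvested links
    print(f"{alias}: {record['title']} ({record['domain']}) scraped at {record['scraped_at']}")
    print("  links found:", len(record["all_anchor_href"]))
```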

----------------------------------------------------------------------------------------

### HAPPY CODING

-------------------------------------------------------------------------------- /requirement.txt: -------------------------------------------------------------------------------- 1 | async-generator==1.10 2 | attrs==21.4.0 3 | beautifulsoup4==4.10.0 4 | beautifultable==1.0.1 5 | certifi==2021.10.8 6 | cffi==1.15.0 7 | charset-normalizer==2.0.12 8 | cryptography==36.0.1 9 | h11==0.13.0 10 | idna==3.3 11 | outcome==1.1.0 12 | pycparser==2.21 13 | pyOpenSSL==22.0.0 14 | PySocks==1.7.1 15 | requests==2.27.1 16 | selenium==4.1.2 17 | sniffio==1.2.0 18 | sortedcontainers==2.4.0 19 | soupsieve==2.3.1 20 | trio==0.20.0 21 | trio-websocket==0.9.2 22 | urllib3==1.26.8 23 | wcwidth==0.2.5 24 | wsproto==1.1.0 25 | -------------------------------------------------------------------------------- /scrap wikipedia.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/spider-yamet/web-scraping-with-python/52056b1890c84fbdedb8abd8914b01d949b68f54/scrap wikipedia.png -------------------------------------------------------------------------------- /web_scraping_command_line_tool.py: -------------------------------------------------------------------------------- 1 | # import required modules 2 | import json 3 | import requests 4 | from datetime import datetime 5 | from urllib.parse import urlparse 6 | from bs4 import BeautifulSoup 7 | from beautifultable import BeautifulTable 8 | 9 | 10 | 11 | def load_json(database_json_file="scraped_data.json"): 12 | """ 13 | Load JSON data from the scraped_data.json file if it exists; otherwise return an empty dict. 14 | """ 15 | try: 16 | with open(database_json_file, "r") as read_it: 17 | all_data_base = json.loads(read_it.read()) 18 | return all_data_base 19 | except (FileNotFoundError, json.JSONDecodeError): 20 | all_data_base = dict() 21 | return all_data_base 22 | 23 | 24 | def save_scraped_data_in_json(data, database_json_file="scraped_data.json"): 25 | """ 26 | Save the scraped data as JSON in scraped_data.json, creating the file if it does not exist. 
27 | If the file already exists, the whole database (including previously scraped entries) is written back. 28 | """ 29 | file_obj = open(database_json_file, "w") 30 | file_obj.write(json.dumps(data)) 31 | file_obj.close() 32 | 33 | 34 | def existing_scraped_data_init(json_db): 35 | """ 36 | Make sure the loaded database has a 'scraped_data' key, creating an empty dict for it if missing. 37 | """ 38 | scraped_data = json_db.get("scraped_data") 39 | if scraped_data is None: 40 | json_db['scraped_data'] = dict() 41 | 42 | return None 43 | 44 | 45 | def scraped_time_is(): 46 | """ 47 | Create a timestamp so every scraping record can be tracked. 48 | """ 49 | now = datetime.now() 50 | dt_string = now.strftime("%d/%m/%Y %H:%M:%S") 51 | return dt_string 52 | 53 | def process_url_request(website_url): 54 | """ 55 | Fetch the provided URL with the requests module and 56 | construct a BeautifulSoup object from the response for scraping. 57 | """ 58 | request_data = requests.get(website_url) 59 | if request_data.status_code == 200: 60 | soup = BeautifulSoup(request_data.text,'html') 61 | return soup 62 | return None 63 | 64 | def proccess_beautiful_soup_data(soup): 65 | return { 66 | 'title': soup.find('title').text, 67 | 'all_anchor_href': [i['href'] for i in soup.find_all('a', href=True)], 68 | 'all_anchors': [str(i) for i in soup.find_all('a')], 69 | 'all_images_data': [ str(i) for i in soup.find_all('img')], 70 | 'all_images_source_data': [ i['src'] for i in soup.find_all('img')], 71 | 'all_h1_data': [i.text for i in soup.find_all('h1')], 72 | 'all_h2_data': [i.text for i in soup.find_all('h2')], 73 | 'all_h3_data': [i.text for i in soup.find_all('h3')], 74 | 'all_p_data': [i.text for i in soup.find_all('p')] 75 | } 76 | 77 | 78 | 79 | # The menu runs in an infinite loop so the program keeps serving requests until the user chooses to exit. 80 | while True: 81 | 82 | print(""" ================ Welcome to this scraping program ============= 83 | ==>> press 1 to list previously scraped websites 84 | ==>> press 2 to scrape a single website 85 | ==>> press 3 to exit 86 | """) 87 | 88 | choice = int(input("==>> Please enter your choice :")) 89 | 90 | # load_json() fetches existing data from the json file (or starts an empty database). 91 | local_json_db = load_json() 92 | existing_scraped_data_init(local_json_db) 93 | 94 | if choice == 1: 95 | # BeautifulTable is used to present the scraped data in a readable table; 
96 | # you can read more about it here: https://beautifultable.readthedocs.io/en/latest/index.html 97 | scraped_websites_table = BeautifulTable() 98 | scraped_websites_table.columns.header = ["Sr no.", "Alias name", "Website domain", "title", "Scraped at", "Status"] 99 | scraped_websites_table.set_style(BeautifulTable.STYLE_BOX_DOUBLED) 100 | 101 | 102 | local_json_db = load_json() 103 | for count, data in enumerate(local_json_db['scraped_data']): 104 | scraped_websites_table.rows.append([count + 1, 105 | local_json_db['scraped_data'][data]['alias'], 106 | local_json_db['scraped_data'][data]['domain'], 107 | local_json_db['scraped_data'][data]['title'], 108 | local_json_db['scraped_data'][data]['scraped_at'], 109 | local_json_db['scraped_data'][data]['status']]) 110 | # all_scraped_websites = [websites['name'] for websites in local_json_db['scraped_data']] 111 | if not local_json_db['scraped_data']: 112 | print('===> No existing data found !!!') 113 | print(scraped_websites_table) 114 | 115 | elif choice == 2: 116 | print() 117 | url_for_scrap = input("===> Please enter the URL you want to scrape: ") 118 | is_accessible = process_url_request(url_for_scrap) 119 | if is_accessible: 120 | scraped_data_packet = proccess_beautiful_soup_data(is_accessible) 121 | print() 122 | print(' =====> Data scraped successfully !!!') 123 | key_for_storing_data = input("Enter an alias name for saving the scraped data: ") 124 | scraped_data_packet['url'] = url_for_scrap 125 | scraped_data_packet['name'] = key_for_storing_data 126 | scraped_data_packet['scraped_at'] = scraped_time_is() 127 | if key_for_storing_data in local_json_db['scraped_data']: 128 | key_for_storing_data = key_for_storing_data + str(scraped_time_is()) 129 | print("Provided key already exists, so data stored as: {}".format(key_for_storing_data)) 130 | scraped_data_packet['alias'] = key_for_storing_data 131 | scraped_data_packet['status'] = True 132 | scraped_data_packet['domain'] = urlparse(url_for_scrap).netloc 133 | 134 | local_json_db['scraped_data'][key_for_storing_data] = scraped_data_packet 135 | print( 136 | 'scraped data is:', local_json_db['scraped_data'][key_for_storing_data] 137 | ) 138 | save_scraped_data_in_json(local_json_db) 139 | # reload the data so the menu always works with what is on disk 140 | local_json_db = load_json() 141 | print(' =====> Data saved successfully !!!') 142 | print() 143 | elif choice == 3: 144 | print('Thank you for using !!!') 145 | break 146 | 147 | else: 148 | print("Please enter a valid choice.") --------------------------------------------------------------------------------