├── WEB SCRAPING.jpg
├── Web Scraping with BeautifulSoup.ipynb
├── Web Scraping with BeautifulSoup.py
├── readme.md
├── requirement.txt
├── scrap wikipedia.png
├── scraped_data.json
└── web_scraping_command_line_tool.py
/WEB SCRAPING.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/spider-yamet/web-scraping-with-python/52056b1890c84fbdedb8abd8914b01d949b68f54/WEB SCRAPING.jpg
--------------------------------------------------------------------------------
/Web Scraping with BeautifulSoup.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "#Requirements\n",
10 | "#pip3 install requests\n",
11 | "#pip3 install bs4"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "## Basic fundamentals of web scraping"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 49,
24 | "metadata": {},
25 | "outputs": [
26 | {
27 | "name": "stdout",
28 | "output_type": "stream",
29 | "text": [
    30 |     "this is with html tags : <title>Easy Python – A programming language of revolution</title>\n",
31 | "this is without html tags: Easy Python\n",
32 | "Skip to content\n"
33 | ]
34 | }
35 | ],
36 | "source": [
    37 |     "# Import these two modules: bs4 makes it easy to select HTML tags\n",
38 | "from bs4 import BeautifulSoup\n",
    39 |     "# The requests module fetches web pages; some people use urllib, but I prefer requests because its API is simpler.\n",
40 | "import requests\n",
41 | "\n",
    42 |     "# I use my own blog URL here; you can change it to any site you want to scrape.\n",
43 | "url=\"https://getpython.wordpress.com/\"\n",
44 | "\n",
    45 |     "# requests.get() downloads the page content from the given URL\n",
46 | "source=requests.get(url)\n",
47 | "\n",
    48 |     "# BeautifulSoup parses the HTML from the requests response (create your soup)\n",
    49 |     "soup=BeautifulSoup(source.text,'html.parser')\n",
50 | "\n",
    51 |     "# find() returns a single element; if there is more than one match, it returns the first one.\n",
    52 |     "title=soup.find('title') # pass the HTML tag name you want to find\n",
53 | "print(\"this is with html tags :\",title)\n",
54 | "\n",
    55 |     "query=soup.find('h1') # here I find the first h1 tag on my website using find()\n",
56 | "\n",
    57 |     "# use .text to extract only the text, without any HTML tags\n",
    58 |     "print(\"this is without html tags:\",query.text) \n",
59 | "\n",
60 | "\n",
    61 |     "links=soup.find('a') # I extracted the first link using the \"a\" tag\n",
62 | "print(links)"
63 | ]
64 | },
65 | {
66 | "cell_type": "markdown",
67 | "metadata": {},
68 | "source": [
    69 |     "## Extract data from inner HTML"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 41,
75 | "metadata": {},
76 | "outputs": [
77 | {
78 | "name": "stdout",
79 | "output_type": "stream",
80 | "text": [
81 | "#content\n"
82 | ]
83 | }
84 | ],
85 | "source": [
    86 |     "# here I extracted the href attribute from the anchor tag\n",
87 | "print(links['href']) "
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": 42,
93 | "metadata": {},
94 | "outputs": [
95 | {
96 | "name": "stdout",
97 | "output_type": "stream",
98 | "text": [
99 | "['screen-reader-text', 'skip-link']\n"
100 | ]
101 | }
102 | ],
103 | "source": [
    104 |     "# similarly I got the class attribute from an anchor tag\n",
105 | "print(links['class'])"
106 | ]
107 | },
108 | {
109 | "cell_type": "markdown",
110 | "metadata": {},
111 | "source": [
    112 |     "## find_all operation in bs4"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": 51,
118 | "metadata": {},
119 | "outputs": [
120 | {
121 | "name": "stdout",
122 | "output_type": "stream",
123 | "text": [
124 | "total links in my website : 37\n",
125 | "\n",
126 | "Skip to content\n",
127 | "\n",
128 | "\n",
129 | "\n",
130 | "Search\n",
131 | "Easy Python\n",
132 | "Home\n",
133 | "Contact\n"
134 | ]
135 | }
136 | ],
137 | "source": [
    138 |     "# find_all() fetches all matching tags at once\n",
    139 |     "many_link=soup.find_all('a') # here I extracted all the anchor tags on my website\n",
    140 |     "total_links=len(many_link) # len() gives the number of items in the list\n",
141 | "print(\"total links in my website :\",total_links)\n",
142 | "print()\n",
    143 |     "for i in many_link[:6]: # slicing fetches only the first 6 links\n",
144 | " print(i)"
145 | ]
146 | },
147 | {
148 | "cell_type": "code",
149 | "execution_count": 54,
150 | "metadata": {},
151 | "outputs": [
152 | {
153 | "name": "stdout",
154 | "output_type": "stream",
155 | "text": [
156 | "\n",
157 | "\n",
158 | "\n",
159 | "\n",
160 | "href is : https://getpython.wordpress.com/\n"
161 | ]
162 | }
163 | ],
164 | "source": [
    165 |     "second_link=many_link[1] # here I fetch the second link, which sits at index 1 in many_link\n",
166 | "print(second_link)\n",
167 | "print()\n",
    168 |     "print(\"href is :\",second_link['href']) # only the href is extracted from the anchor tag\n",
169 | "\n"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": 59,
175 | "metadata": {},
176 | "outputs": [
177 | {
178 | "name": "stdout",
179 | "output_type": "stream",
180 | "text": [
181 | "\n",
182 | "\n",
183 | "['cover']\n",
    184 |     "<class 'list'>\n",
185 | "\n",
186 | "class name of div is : cover\n"
187 | ]
188 | }
189 | ],
190 | "source": [
191 | "# select div tag from second link\n",
192 | "nested_div=second_link.find('div')\n",
    193 |     "# As you can see the div element is extracted; it also has inner elements\n",
194 | "print(nested_div)\n",
195 | "print()\n",
    196 |     "# here I extracted the class attribute from the div, but it is returned as a list\n",
197 | "z=(nested_div['class'])\n",
198 | "print(z)\n",
199 | "print(type(z))\n",
200 | "print()\n",
    201 |     "# \" \".join() converts the list of class names into a single string\n",
202 | "print(\"class name of div is :\",\" \".join(nested_div['class'])) "
203 | ]
204 | },
205 | {
206 | "cell_type": "markdown",
207 | "metadata": {},
208 | "source": [
    209 |     "## Scrape data from Wikipedia"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": 60,
215 | "metadata": {},
216 | "outputs": [
217 | {
218 | "name": "stdout",
219 | "output_type": "stream",
220 | "text": [
    221 |     "<title>World War II - Wikipedia</title>\n"
222 | ]
223 | }
224 | ],
225 | "source": [
226 | "wiki=requests.get(\"https://en.wikipedia.org/wiki/World_War_II\")\n",
    227 |     "soup=BeautifulSoup(wiki.text,'html.parser')\n",
228 | "print(soup.find('title'))\n"
229 | ]
230 | },
231 | {
232 | "cell_type": "markdown",
233 | "metadata": {},
234 | "source": [
235 | "### find html tags with classes"
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": 65,
241 | "metadata": {},
242 | "outputs": [
243 | {
244 | "name": "stdout",
245 | "output_type": "stream",
246 | "text": [
247 | "Contents\n",
248 | "\n",
249 | "1 Chronology\n",
250 | "2 Background\n",
251 | "\n",
252 | "2.1 Europe\n",
253 | "2.2 Asia\n",
254 | "\n",
255 | "\n",
256 | "3 Pre-war events\n",
257 | "\n",
258 | "3.1 Italian invasion of Ethiopia (1935)\n",
259 | "3.2 Spanish Civil War (1936–1939)\n",
260 | "3.3 Japanese invasion of China (1937)\n",
261 | "3.4 Soviet–Japanese border conflicts\n",
262 | "3.5 European occupations and agreements\n",
263 | "\n",
264 | "\n",
265 | "4 Course of the war\n",
266 | "\n",
267 | "4.1 War breaks out in Europe (1939–40)\n",
268 | "4.2 Western Europe (1940–41)\n",
269 | "4.3 Mediterranean (1940–41)\n",
270 | "4.4 Axis attack on the Soviet Union (1941)\n",
271 | "4.5 War breaks out in the Pacific (1941)\n",
272 | "4.6 Axis advance stalls (1942–43)\n",
273 | "\n",
274 | "4.6.1 Pacific (1942–43)\n",
275 | "4.6.2 Eastern Front (1942–43)\n",
276 | "4.6.3 Western Europe/Atlantic and Mediterranean (1942–43)\n",
277 | "\n",
278 | "\n",
279 | "4.7 Allies gain momentum (1943–44)\n",
280 | "4.8 Allies close in (1944)\n",
281 | "4.9 Axis collapse, Allied victory (1944–45)\n",
282 | "\n",
283 | "\n",
284 | "5 Aftermath\n",
285 | "6 Impact\n",
286 | "\n",
287 | "6.1 Casualties and war crimes\n",
288 | "6.2 Genocide, concentration camps, and slave labour\n",
289 | "6.3 Occupation\n",
290 | "6.4 Home fronts and production\n",
291 | "6.5 Advances in technology and warfare\n",
292 | "\n",
293 | "\n",
294 | "7 See also\n",
295 | "8 Notes\n",
296 | "9 Citations\n",
297 | "10 References\n",
298 | "11 External links\n",
299 | "\n",
300 | "\n"
301 | ]
302 | }
303 | ],
304 | "source": [
305 | "ww2_contents=soup.find_all(\"div\",class_='toc')\n",
306 | "for i in ww2_contents:\n",
307 | " print(i.text)"
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": 68,
313 | "metadata": {},
314 | "outputs": [
315 | {
316 | "name": "stdout",
317 | "output_type": "stream",
318 | "text": [
319 | "World War II(clockwise from top left)\n",
320 | "Chinese forces in the Battle of Wanjialing\n",
321 | "Australian 25-pounder guns during the First Battle of El Alamein\n",
322 | "German Stuka dive bombers on the Eastern Front in December 1943\n",
323 | "American naval force in the Lingayen Gulf\n",
324 | "Wilhelm Keitel signing the German Instrument of Surrender\n",
325 | "Soviet troops in the Battle of Stalingrad\n",
326 | "Date1 September 1939 – 2 September 1945 (1939-09-01 – 1945-09-02)(6 years and 1 day)[a]LocationEurope, Pacific, Atlantic, South-East Asia, China, Middle East, Mediterranean, North Africa, Horn of Africa, Australia, briefly North and South AmericaResult\n",
327 | "Allied victory\n",
328 | "Collapse of Nazi Germany\n",
329 | "Fall of the Japanese and Italian Empires\n",
330 | "Beginning of the Nuclear Age\n",
331 | "Dissolution of the League of Nations\n",
332 | "Creation of the United Nations\n",
333 | "Emergence of the United States and the Soviet Union as rival superpowers\n",
334 | "Beginning of the Cold War (more...)Participants\n",
335 | "Allies\n",
336 | "AxisCommanders and leaders\n",
337 | "Main Allied leaders\n",
338 | " Joseph Stalin\n",
339 | " Franklin D. Roosevelt\n",
340 | " Winston Churchill\n",
341 | " Chiang Kai-shek\n",
342 | "\n",
343 | "Main Axis leaders\n",
344 | " Adolf Hitler\n",
345 | " Hirohito\n",
346 | " Benito Mussolini\n",
347 | "Casualties and losses\n",
348 | "\n",
349 | "Military dead:\n",
350 | "Over 16,000,000\n",
351 | "Civilian dead:\n",
352 | "Over 45,000,000\n",
353 | "Total dead:\n",
354 | "Over 61,000,000\n",
355 | "(1937–1945)\n",
356 | "...further details\n",
357 | "\n",
358 | "\n",
359 | "Military dead:\n",
360 | "Over 8,000,000\n",
361 | "Civilian dead:\n",
362 | "Over 4,000,000\n",
363 | "Total dead:\n",
364 | "Over 12,000,000\n",
365 | "(1937–1945)\n",
366 | "...further details\n",
367 | "\n"
368 | ]
369 | }
370 | ],
371 | "source": [
372 | "overview=soup.find_all('table',class_='infobox vevent')\n",
373 | "for z in overview:\n",
374 | " print(z.text)\n",
375 | " "
376 | ]
377 | }
378 | ],
379 | "metadata": {
380 | "kernelspec": {
381 | "display_name": "Python 3",
382 | "language": "python",
383 | "name": "python3"
384 | },
385 | "language_info": {
386 | "codemirror_mode": {
387 | "name": "ipython",
388 | "version": 3
389 | },
390 | "file_extension": ".py",
391 | "mimetype": "text/x-python",
392 | "name": "python",
393 | "nbconvert_exporter": "python",
394 | "pygments_lexer": "ipython3",
395 | "version": "3.5.2"
396 | }
397 | },
398 | "nbformat": 4,
399 | "nbformat_minor": 2
400 | }
401 |
--------------------------------------------------------------------------------
/Web Scraping with BeautifulSoup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | #Requirements
5 | #pip3 install requests
6 | #pip3 install bs4
7 |
  8 | # Optionally, the same scraping can be watched in a real browser with the help of chromedriver (see the Selenium helpers below)
9 |
10 | # ## Basic fundamentals of web scraping
11 |
 12 | # Import these two modules: bs4 makes it easy to select HTML tags
13 | from bs4 import BeautifulSoup
 14 | # The requests module fetches web pages; some people use urllib, but I prefer requests because its API is simpler.
15 | import requests
16 | from selenium import webdriver
17 |
 18 | # I use my own blog URL here; you can change it to any site you want to scrape.
19 | url="https://getpython.wordpress.com/"
20 | BASE_URL = "https://getpython.wordpress.com/"
 21 | # requests.get() downloads the page content from the given URL
22 | source=requests.get(url)
23 |
24 |
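# NOTE: the four helper functions below are optional Selenium utilities; they are not
# called anywhere else in this script. They build a Chrome driver from a local
# ./chromedriver binary with a couple of common options, in case you want to watch
# the scraping happen in a real browser instead of using plain requests.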
25 | def get_chrome_web_driver(options):
 26 |     return webdriver.Chrome("./chromedriver", options=options)
27 |
28 |
29 | def get_web_driver_options():
30 | return webdriver.ChromeOptions()
31 |
32 |
33 | def set_ignore_certificate_error(options):
34 | options.add_argument('--ignore-certificate-errors')
35 |
36 |
37 | def set_browser_as_incognito(options):
38 | options.add_argument('--incognito')
39 |
 40 | # BeautifulSoup parses the HTML from the requests response (create your soup)
 41 | soup=BeautifulSoup(source.text,'html.parser')
42 |
 43 | # find() returns a single element; if there is more than one match, it returns the first one.
 44 | title=soup.find('title') # pass the HTML tag name you want to find
45 | print("this is with html tags :",title)
46 |
 47 | query=soup.find('h1') # here I find the first h1 tag on my website using find()
48 |
 49 | # use .text to extract only the text, without any HTML tags
 50 | print("this is without html tags:",query.text)
51 |
52 |
 53 | links=soup.find('a') # I extracted the first link using the "a" tag
54 | print(links)
55 |
56 |
 57 | # ## Extract data from inner HTML
58 |
 59 | # here I extracted the href attribute from the anchor tag
60 | print(links['href'])
61 |
 62 | ## or another way
 63 | ## extracting the href attribute from every anchor tag on the page
 64 | for a in soup.find_all('a', href=True):
 65 |     print(a['href'])
 66 | 
 67 | for i in soup.find_all('a'):  # print the visible text of every anchor tag
 68 |     print(i.text)
69 |
 70 | # similarly I got the class attribute from an anchor tag
71 | print(links['class'])
72 |
73 |
 74 | # ## find_all operation in bs4
75 |
 76 | # find_all() fetches all matching tags at once
 77 | many_link=soup.find_all('a') # here I extracted all the anchor tags on my website
 78 | total_links=len(many_link) # len() gives the number of items in the list
79 | print("total links in my website :",total_links)
80 | print()
 81 | for i in many_link[:6]: # slicing fetches only the first 6 links
82 | print(i)
83 |
 84 | second_link=many_link[1] # here I fetch the second link, which sits at index 1 in many_link
85 | print(second_link)
86 | print()
 87 | print("href is :",second_link['href']) # only the href is extracted from the anchor tag
88 |
89 |
90 | # select div tag from second link
91 | nested_div=second_link.find('div')
 92 | # As you can see the div element is extracted; it also has inner elements
93 | print(nested_div)
94 | print()
 95 | # here I extracted the class attribute from the div, but it is returned as a list
96 | z=(nested_div['class'])
97 | print(z)
98 | print(type(z))
99 | print()
100 | # " ".join() converts the list of class names into a single string
101 | print("class name of div is :"," ".join(nested_div['class']))
102 |
103 |
104 | # ## Scrape data from Wikipedia
105 |
106 | wiki=requests.get("https://en.wikipedia.org/wiki/World_War_II")
107 | soup=BeautifulSoup(wiki.text,'html.parser')
108 | print(soup.find('title'))
109 |
110 |
111 | # ### find html tags with classes
112 |
113 | ww2_contents=soup.find_all("div",class_='toc')
114 | for i in ww2_contents:
115 | print(i.text)
116 |
117 |
118 | overview=soup.find_all('table',class_='infobox vevent')
119 | for z in overview:
120 | print(z.text)
121 |
122 | images=soup.find_all('img')
123 | 
124 | # a bare `images` expression would only display the list in a notebook;
125 | # in a script you need print:
126 | print(images)
127 |
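# If you only need the image URLs, a minimal sketch (keeping only <img> tags that
# actually carry a src attribute) would be:
#
# for img in soup.find_all('img', src=True):
#     print(img['src'])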
128 |
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | 
2 |
3 | Introduction:
4 |
  5 | Web scraping (also known as web harvesting or web data extraction) is the practice of extracting data from websites using their HTML structure. In this post, I explain the basic fundamentals of web scraping with Python and demonstrate them live with two Python libraries, BeautifulSoup and requests.
6 |
7 | What you will learn from this post:
8 |
9 | - basic understanding of web scraping
10 | - how to extract data from a website using classes and HTML tags
 11 | - how to use the requests module to fetch data
 12 | - how to use BeautifulSoup
13 |
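As a quick preview, here is a minimal sketch of the approach used throughout this repo (the blog URL is simply the example used in the scripts, and the snippet assumes the page responds with normal HTML that contains a `<title>` tag):

```python
import requests
from bs4 import BeautifulSoup

# download the page and parse its HTML
source = requests.get("https://getpython.wordpress.com/")
soup = BeautifulSoup(source.text, "html.parser")

print(soup.find("title").text)              # the page title
for link in soup.find_all("a", href=True)[:5]:
    print(link["href"])                     # the first few link targets
```
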
14 | Requirements:
15 |
16 | - python3
17 | - requests
18 | - bs4
19 |
20 | Install required dependencies :
21 |
22 | - clone or download it from here
 23 | - install the dependencies from the requirement.txt file
 24 | pip install -r requirement.txt
25 |
26 |
27 |
28 | How to run this code
29 |
 30 | - there are two source code files, one with a .py extension and another with a .ipynb extension
 31 | - one can run the "Web Scraping with BeautifulSoup.py" file by running this command in a terminal: python3 "Web Scraping with BeautifulSoup.py"
 32 | - one can run the "Web Scraping with BeautifulSoup.ipynb" file in Jupyter Notebook
 33 | - one can install Jupyter Notebook with this command: pip3 install jupyter
 34 | - the CLI scraping tool is under development; only a beta version is available for now
35 |
36 | ----------------------------------------------------------------------------------------
37 | HAPPY CODING
38 |
--------------------------------------------------------------------------------
/requirement.txt:
--------------------------------------------------------------------------------
1 | async-generator==1.10
2 | attrs==21.4.0
3 | beautifulsoup4==4.10.0
4 | beautifultable==1.0.1
5 | certifi==2021.10.8
6 | cffi==1.15.0
7 | charset-normalizer==2.0.12
8 | cryptography==36.0.1
9 | h11==0.13.0
10 | idna==3.3
11 | outcome==1.1.0
12 | pycparser==2.21
13 | pyOpenSSL==22.0.0
14 | PySocks==1.7.1
15 | requests==2.27.1
16 | selenium==4.1.2
17 | sniffio==1.2.0
18 | sortedcontainers==2.4.0
19 | soupsieve==2.3.1
20 | trio==0.20.0
21 | trio-websocket==0.9.2
22 | urllib3==1.26.8
23 | wcwidth==0.2.5
24 | wsproto==1.1.0
25 |
--------------------------------------------------------------------------------
/scrap wikipedia.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/spider-yamet/web-scraping-with-python/52056b1890c84fbdedb8abd8914b01d949b68f54/scrap wikipedia.png
--------------------------------------------------------------------------------
/web_scraping_command_line_tool.py:
--------------------------------------------------------------------------------
1 | # import required modules
2 | import json
3 | import requests
4 | from datetime import datetime
5 | from urllib.parse import urlparse
6 | from bs4 import BeautifulSoup
7 | from beautifultable import BeautifulTable
8 |
9 |
10 |
11 | def load_json(database_json_file="scraped_data.json"):
12 | """
 13 |     Load JSON data from the scraped_data.json file if it exists; otherwise return an empty dict.
14 | """
15 | try:
16 | with open(database_json_file, "r") as read_it:
17 | all_data_base = json.loads(read_it.read())
18 | return all_data_base
 19 |     except (FileNotFoundError, json.JSONDecodeError):
20 | all_data_base = dict()
21 | return all_data_base
22 |
23 |
24 | def save_scraped_data_in_json(data, database_json_file="scraped_data.json"):
25 | """
 26 |     Save the scraped data in JSON format to scraped_data.json, creating the file if needed.
 27 |     If the file already exists, it is rewritten with the updated data.
28 | """
 29 |     with open(database_json_file, "w") as file_obj:
 30 |         file_obj.write(json.dumps(data))
32 |
33 |
34 | def existing_scraped_data_init(json_db):
35 | """
 36 |     Make sure the loaded JSON database has a 'scraped_data' key; create an empty dict for it if missing.
37 | """
38 | scraped_data = json_db.get("scraped_data")
39 | if scraped_data is None:
40 | json_db['scraped_data'] = dict()
41 |
42 | return None
43 |
44 |
45 | def scraped_time_is():
46 | """
 47 |     Create a timestamp string so each scrape can be tracked.
48 | """
49 | now = datetime.now()
50 | dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
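    # e.g. this might return "25/03/2024 14:30:05" (day/month/year hour:minute:second)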
51 | return dt_string
52 |
53 | def process_url_request(website_url):
54 | """
 55 |     Request the provided URL with the requests module and, if the request succeeds,
 56 |     construct a BeautifulSoup object from the response for scraping.
 57 |     """
 58 |     response = requests.get(website_url)
 59 |     if response.status_code == 200:
 60 |         soup = BeautifulSoup(response.text, 'html.parser')
 61 |         return soup
62 | return None
63 |
 64 | def process_beautiful_soup_data(soup):
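    """
    Pull the commonly useful pieces (title, anchors, images, headings and
    paragraph text) out of a BeautifulSoup object and return them as a dict.
    """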
65 | return {
66 | 'title': soup.find('title').text,
67 | 'all_anchor_href': [i['href'] for i in soup.find_all('a', href=True)],
68 | 'all_anchors': [str(i) for i in soup.find_all('a')],
69 | 'all_images_data': [ str(i) for i in soup.find_all('img')],
 70 |         'all_images_source_data': [i['src'] for i in soup.find_all('img', src=True)],
71 | 'all_h1_data': [i.text for i in soup.find_all('h1')],
72 | 'all_h2_data': [i.text for i in soup.find_all('h2')],
73 | 'all_h3_data': [i.text for i in soup.find_all('h3')],
74 | 'all_p_data': [i.text for i in soup.find_all('p')]
75 | }
76 |
77 |
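# A minimal sketch of how the two helpers above compose (hypothetical URL, not part of the menu below):
#
#   soup = process_url_request("https://example.com")
#   if soup:
#       packet = process_beautiful_soup_data(soup)
#       print(packet['title'])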
78 |
 79 | # The menu runs in an infinite loop so the program keeps going until the user chooses to exit.
80 | while True:
81 |
82 | print(""" ================ Welcome to this scraping program =============
 83 |     ==>> press 1 to view the websites scraped so far
 84 |     ==>> press 2 to scrape a single website
 85 |     ==>> press 3 to exit
86 | """)
87 |
88 | choice = int(input("==>> Please enter your choice :"))
89 |
90 | # Load json function called for fetching/creating data from json file.
91 | local_json_db = load_json()
92 | existing_scraped_data_init(local_json_db)
93 |
94 | if choice == 1:
 95 |         # I used BeautifulTable to present the scraped data in a tidy table.
96 | # you guys can read more about from this link https://beautifultable.readthedocs.io/en/latest/index.html
97 | scraped_websites_table = BeautifulTable()
 98 |         scraped_websites_table.columns.header = ["Sr no.", "Alias name", "Website domain", "Title", "Scraped at", "Status"]
99 | scraped_websites_table.set_style(BeautifulTable.STYLE_BOX_DOUBLED)
100 |
101 |
102 | local_json_db = load_json()
103 | for count, data in enumerate(local_json_db['scraped_data']):
104 | scraped_websites_table.rows.append([count + 1,
105 | local_json_db['scraped_data'][data]['alias'],
106 | local_json_db['scraped_data'][data]['domain'],
107 | local_json_db['scraped_data'][data]['title'],
108 | local_json_db['scraped_data'][data]['scraped_at'],
109 | local_json_db['scraped_data'][data]['status']])
110 | # all_scraped_websites = [websites['name'] for websites in local_json_db['scraped_data']]
111 | if not local_json_db['scraped_data']:
112 | print('===> No existing data found !!!')
113 | print(scraped_websites_table)
114 |
115 | elif choice == 2:
116 | print()
117 |         url_for_scrap = input("===> Please enter the URL you want to scrape: ")
118 |         page_soup = process_url_request(url_for_scrap)
119 |         if page_soup:
120 |             scraped_data_packet = process_beautiful_soup_data(page_soup)
121 | print()
122 | print(' =====> Data scraped successfully !!!')
123 | key_for_storing_data = input("enter alias name for saving scraped data :")
124 | scraped_data_packet['url'] = url_for_scrap
125 | scraped_data_packet['name'] = key_for_storing_data
126 | scraped_data_packet['scraped_at'] = scraped_time_is()
127 | if key_for_storing_data in local_json_db['scraped_data']:
128 | key_for_storing_data = key_for_storing_data + str(scraped_time_is())
129 |             print("Provided key already exists, so data is stored as : {}".format(key_for_storing_data))
130 | scraped_data_packet['alias'] = key_for_storing_data
131 | scraped_data_packet['status'] = True
132 | scraped_data_packet['domain'] = urlparse(url_for_scrap).netloc
133 |
134 | local_json_db['scraped_data'][key_for_storing_data] = scraped_data_packet
135 | print(
136 | 'scraped data is:', local_json_db['scraped_data'][key_for_storing_data]
137 | )
138 | save_scraped_data_in_json(local_json_db)
139 | # load data
140 | local_json_db = load_json()
141 | print(' =====> Data saved successfully !!!')
142 | print()
143 | elif choice == 3:
144 | print('Thank you for using !!!')
145 | break
146 |
147 | elif choice == 4:
148 | print('Thank you for using !!!')
149 | break
150 |
151 | else:
152 | print("enter a valid choice ")
--------------------------------------------------------------------------------