├── WEB SCRAPING.jpg
├── Web Scraping with BeautifulSoup.ipynb
├── Web Scraping with BeautifulSoup.py
├── readme.md
├── requirement.txt
├── scrap wikipedia.png
├── scraped_data.json
└── web_scraping_command_line_tool.py
/WEB SCRAPING.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/spider-yamet/web-scraping-with-python/52056b1890c84fbdedb8abd8914b01d949b68f54/WEB SCRAPING.jpg
--------------------------------------------------------------------------------
/Web Scraping with BeautifulSoup.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "#Requirements\n",
10 | "#pip3 install requests\n",
11 | "#pip3 install bs4"
12 | ]
13 | },
14 | {
15 | "cell_type": "markdown",
16 | "metadata": {},
17 | "source": [
18 | "## Basic fundamentals of web scraping"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 49,
24 | "metadata": {},
25 | "outputs": [
26 | {
27 | "name": "stdout",
28 | "output_type": "stream",
29 | "text": [
    30 |     "this is with html tags : <title>Easy Python – A programming language of revolution</title>\n",
31 | "this is without html tags: Easy Python\n",
32 | "Skip to content\n"
33 | ]
34 | }
35 | ],
36 | "source": [
    37 |     "# Import these two modules: bs4 makes it easy to select HTML tags\n",
38 | "from bs4 import BeautifulSoup\n",
    39 |     "# The requests module fetches web pages; some people use urllib, but I prefer requests because its API is simpler.\n",
40 | "import requests\n",
41 | "\n",
    42 |     "# I use my own blog URL here; you can change it to any site you want to scrape.\n",
43 | "url=\"https://getpython.wordpress.com/\"\n",
44 | "\n",
    45 |     "# requests.get() downloads the page content from the given URL\n",
46 | "source=requests.get(url)\n",
47 | "\n",
    48 |     "# BeautifulSoup parses the HTML from the requests response (create your soup)\n",
    49 |     "soup=BeautifulSoup(source.text,'html.parser')\n",
50 | "\n",
    51 |     "# find() returns a single element; if there is more than one match, it returns the first one.\n",
    52 |     "title=soup.find('title') # pass the HTML tag name you want to find\n",
53 | "print(\"this is with html tags :\",title)\n",
54 | "\n",
    55 |     "query=soup.find('h1') # here I find the first h1 tag on my website using find()\n",
56 | "\n",
    57 |     "# use .text to extract only the text, without any HTML tags\n",
    58 |     "print(\"this is without html tags:\",query.text) \n",
59 | "\n",
60 | "\n",
    61 |     "links=soup.find('a') # I extracted the first link using the \"a\" tag\n",
62 | "print(links)"
63 | ]
64 | },
65 | {
66 | "cell_type": "markdown",
67 | "metadata": {},
68 | "source": [
    69 |     "## Extract data from inner HTML"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 41,
75 | "metadata": {},
76 | "outputs": [
77 | {
78 | "name": "stdout",
79 | "output_type": "stream",
80 | "text": [
81 | "#content\n"
82 | ]
83 | }
84 | ],
85 | "source": [
    86 |     "# here I extracted the href attribute from the anchor tag\n",
87 | "print(links['href']) "
88 | ]
89 | },
90 | {
91 | "cell_type": "code",
92 | "execution_count": 42,
93 | "metadata": {},
94 | "outputs": [
95 | {
96 | "name": "stdout",
97 | "output_type": "stream",
98 | "text": [
99 | "['screen-reader-text', 'skip-link']\n"
100 | ]
101 | }
102 | ],
103 | "source": [
    104 |     "# similarly I got the class attribute from an anchor tag\n",
105 | "print(links['class'])"
106 | ]
107 | },
108 | {
109 | "cell_type": "markdown",
110 | "metadata": {},
111 | "source": [
    112 |     "## find_all operation in bs4"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": 51,
118 | "metadata": {},
119 | "outputs": [
120 | {
121 | "name": "stdout",
122 | "output_type": "stream",
123 | "text": [
124 | "total links in my website : 37\n",
125 | "\n",
126 | "Skip to content\n",
127 | "\n",
128 | "\n",
129 | "\n",
130 | "Search\n",
131 | "Easy Python\n",
132 | "Home\n",
133 | "Contact\n"
134 | ]
135 | }
136 | ],
137 | "source": [
    138 |     "# find_all() fetches all matching tags at once\n",
    139 |     "many_link=soup.find_all('a') # here I extracted all the anchor tags on my website\n",
    140 |     "total_links=len(many_link) # len() gives the number of items in the list\n",
141 | "print(\"total links in my website :\",total_links)\n",
142 | "print()\n",
    143 |     "for i in many_link[:6]: # slicing fetches only the first 6 links\n",
144 | " print(i)"
145 | ]
146 | },
147 | {
148 | "cell_type": "code",
149 | "execution_count": 54,
150 | "metadata": {},
151 | "outputs": [
152 | {
153 | "name": "stdout",
154 | "output_type": "stream",
155 | "text": [
156 | "\n",
157 | "\n",
158 | "\n",
159 | "\n",
160 | "href is : https://getpython.wordpress.com/\n"
161 | ]
162 | }
163 | ],
164 | "source": [
    165 |     "second_link=many_link[1] # here I fetch the second link, which sits at index 1 in many_link\n",
166 | "print(second_link)\n",
167 | "print()\n",
    168 |     "print(\"href is :\",second_link['href']) # only the href is extracted from the anchor tag\n",
169 | "\n"
170 | ]
171 | },
172 | {
173 | "cell_type": "code",
174 | "execution_count": 59,
175 | "metadata": {},
176 | "outputs": [
177 | {
178 | "name": "stdout",
179 | "output_type": "stream",
180 | "text": [
181 | "\n",
182 | "\n",
183 | "['cover']\n",
    184 |     "<class 'list'>\n",
185 | "\n",
186 | "class name of div is : cover\n"
187 | ]
188 | }
189 | ],
190 | "source": [
191 | "# select div tag from second link\n",
192 | "nested_div=second_link.find('div')\n",
    193 |     "# As you can see the div element is extracted; it also has inner elements\n",
194 | "print(nested_div)\n",
195 | "print()\n",
    196 |     "# here I extracted the class attribute from the div, but it is returned as a list\n",
197 | "z=(nested_div['class'])\n",
198 | "print(z)\n",
199 | "print(type(z))\n",
200 | "print()\n",
    201 |     "# \" \".join() converts the list of class names into a single string\n",
202 | "print(\"class name of div is :\",\" \".join(nested_div['class'])) "
203 | ]
204 | },
205 | {
206 | "cell_type": "markdown",
207 | "metadata": {},
208 | "source": [
    209 |     "## Scrape data from Wikipedia"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": 60,
215 | "metadata": {},
216 | "outputs": [
217 | {
218 | "name": "stdout",
219 | "output_type": "stream",
220 | "text": [
    221 |     "<title>World War II - Wikipedia</title>\n"
222 | ]
223 | }
224 | ],
225 | "source": [
226 | "wiki=requests.get(\"https://en.wikipedia.org/wiki/World_War_II\")\n",
    227 |     "soup=BeautifulSoup(wiki.text,'html.parser')\n",
228 | "print(soup.find('title'))\n"
229 | ]
230 | },
231 | {
232 | "cell_type": "markdown",
233 | "metadata": {},
234 | "source": [
235 | "### find html tags with classes"
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": 65,
241 | "metadata": {},
242 | "outputs": [
243 | {
244 | "name": "stdout",
245 | "output_type": "stream",
246 | "text": [
247 | "Contents\n",
248 | "\n",
249 | "1 Chronology\n",
250 | "2 Background\n",
251 | "\n",
252 | "2.1 Europe\n",
253 | "2.2 Asia\n",
254 | "\n",
255 | "\n",
256 | "3 Pre-war events\n",
257 | "\n",
258 | "3.1 Italian invasion of Ethiopia (1935)\n",
259 | "3.2 Spanish Civil War (1936–1939)\n",
260 | "3.3 Japanese invasion of China (1937)\n",
261 | "3.4 Soviet–Japanese border conflicts\n",
262 | "3.5 European occupations and agreements\n",
263 | "\n",
264 | "\n",
265 | "4 Course of the war\n",
266 | "\n",
267 | "4.1 War breaks out in Europe (1939–40)\n",
268 | "4.2 Western Europe (1940–41)\n",
269 | "4.3 Mediterranean (1940–41)\n",
270 | "4.4 Axis attack on the Soviet Union (1941)\n",
271 | "4.5 War breaks out in the Pacific (1941)\n",
272 | "4.6 Axis advance stalls (1942–43)\n",
273 | "\n",
274 | "4.6.1 Pacific (1942–43)\n",
275 | "4.6.2 Eastern Front (1942–43)\n",
276 | "4.6.3 Western Europe/Atlantic and Mediterranean (1942–43)\n",
277 | "\n",
278 | "\n",
279 | "4.7 Allies gain momentum (1943–44)\n",
280 | "4.8 Allies close in (1944)\n",
281 | "4.9 Axis collapse, Allied victory (1944–45)\n",
282 | "\n",
283 | "\n",
284 | "5 Aftermath\n",
285 | "6 Impact\n",
286 | "\n",
287 | "6.1 Casualties and war crimes\n",
288 | "6.2 Genocide, concentration camps, and slave labour\n",
289 | "6.3 Occupation\n",
290 | "6.4 Home fronts and production\n",
291 | "6.5 Advances in technology and warfare\n",
292 | "\n",
293 | "\n",
294 | "7 See also\n",
295 | "8 Notes\n",
296 | "9 Citations\n",
297 | "10 References\n",
298 | "11 External links\n",
299 | "\n",
300 | "\n"
301 | ]
302 | }
303 | ],
304 | "source": [
305 | "ww2_contents=soup.find_all(\"div\",class_='toc')\n",
306 | "for i in ww2_contents:\n",
307 | " print(i.text)"
308 | ]
309 | },
310 | {
311 | "cell_type": "code",
312 | "execution_count": 68,
313 | "metadata": {},
314 | "outputs": [
315 | {
316 | "name": "stdout",
317 | "output_type": "stream",
318 | "text": [
319 | "World War II(clockwise from top left)\n",
320 | "Chinese forces in the Battle of Wanjialing\n",
321 | "Australian 25-pounder guns during the First Battle of El Alamein\n",
322 | "German Stuka dive bombers on the Eastern Front in December 1943\n",
323 | "American naval force in the Lingayen Gulf\n",
324 | "Wilhelm Keitel signing the German Instrument of Surrender\n",
325 | "Soviet troops in the Battle of Stalingrad\n",
326 | "Date1 September 1939 – 2 September 1945 (1939-09-01 – 1945-09-02)(6 years and 1 day)[a]LocationEurope, Pacific, Atlantic, South-East Asia, China, Middle East, Mediterranean, North Africa, Horn of Africa, Australia, briefly North and South AmericaResult\n",
327 | "Allied victory\n",
328 | "Collapse of Nazi Germany\n",
329 | "Fall of the Japanese and Italian Empires\n",
330 | "Beginning of the Nuclear Age\n",
331 | "Dissolution of the League of Nations\n",
332 | "Creation of the United Nations\n",
333 | "Emergence of the United States and the Soviet Union as rival superpowers\n",
334 | "Beginning of the Cold War (more...)Participants\n",
335 | "Allies\n",
336 | "AxisCommanders and leaders\n",
337 | "Main Allied leaders\n",
338 | " Joseph Stalin\n",
339 | " Franklin D. Roosevelt\n",
340 | " Winston Churchill\n",
341 | " Chiang Kai-shek\n",
342 | "\n",
343 | "Main Axis leaders\n",
344 | " Adolf Hitler\n",
345 | " Hirohito\n",
346 | " Benito Mussolini\n",
347 | "Casualties and losses\n",
348 | "\n",
349 | "Military dead:\n",
350 | "Over 16,000,000\n",
351 | "Civilian dead:\n",
352 | "Over 45,000,000\n",
353 | "Total dead:\n",
354 | "Over 61,000,000\n",
355 | "(1937–1945)\n",
356 | "...further details\n",
357 | "\n",
358 | "\n",
359 | "Military dead:\n",
360 | "Over 8,000,000\n",
361 | "Civilian dead:\n",
362 | "Over 4,000,000\n",
363 | "Total dead:\n",
364 | "Over 12,000,000\n",
365 | "(1937–1945)\n",
366 | "...further details\n",
367 | "\n"
368 | ]
369 | }
370 | ],
371 | "source": [
372 | "overview=soup.find_all('table',class_='infobox vevent')\n",
373 | "for z in overview:\n",
374 | " print(z.text)\n",
375 | " "
376 | ]
377 | }
378 | ],
379 | "metadata": {
380 | "kernelspec": {
381 | "display_name": "Python 3",
382 | "language": "python",
383 | "name": "python3"
384 | },
385 | "language_info": {
386 | "codemirror_mode": {
387 | "name": "ipython",
388 | "version": 3
389 | },
390 | "file_extension": ".py",
391 | "mimetype": "text/x-python",
392 | "name": "python",
393 | "nbconvert_exporter": "python",
394 | "pygments_lexer": "ipython3",
395 | "version": "3.5.2"
396 | }
397 | },
398 | "nbformat": 4,
399 | "nbformat_minor": 2
400 | }
401 |
--------------------------------------------------------------------------------
/Web Scraping with BeautifulSoup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | #Requirements
5 | #pip3 install requests
6 | #pip3 install bs4
7 |
  8 | # Optionally, the same scraping can be watched in a real browser with the help of chromedriver (see the Selenium helpers below)
9 |
10 | # ## Basic fundamentals of web scraping
11 |
 12 | # Import these two modules: bs4 makes it easy to select HTML tags
13 | from bs4 import BeautifulSoup
 14 | # The requests module fetches web pages; some people use urllib, but I prefer requests because its API is simpler.
15 | import requests
16 | from selenium import webdriver
17 |
 18 | # I use my own blog URL here; you can change it to any site you want to scrape.
19 | url="https://getpython.wordpress.com/"
20 | BASE_URL = "https://getpython.wordpress.com/"
 21 | # requests.get() downloads the page content from the given URL
22 | source=requests.get(url)
23 |
24 |
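# NOTE: the four helper functions below are optional Selenium utilities; they are not
# called anywhere else in this script. They build a Chrome driver from a local
# ./chromedriver binary with a couple of common options, in case you want to watch
# the scraping happen in a real browser instead of using plain requests.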
25 | def get_chrome_web_driver(options):
 26 |     return webdriver.Chrome("./chromedriver", options=options)
27 |
28 |
29 | def get_web_driver_options():
30 | return webdriver.ChromeOptions()
31 |
32 |
33 | def set_ignore_certificate_error(options):
34 | options.add_argument('--ignore-certificate-errors')
35 |
36 |
37 | def set_browser_as_incognito(options):
38 | options.add_argument('--incognito')
39 |
 40 | # BeautifulSoup parses the HTML from the requests response (create your soup)
 41 | soup=BeautifulSoup(source.text,'html.parser')
42 |
 43 | # find() returns a single element; if there is more than one match, it returns the first one.
 44 | title=soup.find('title') # pass the HTML tag name you want to find
45 | print("this is with html tags :",title)
46 |
 47 | query=soup.find('h1') # here I find the first h1 tag on my website using find()
48 |
 49 | # use .text to extract only the text, without any HTML tags
 50 | print("this is without html tags:",query.text)
51 |
52 |
 53 | links=soup.find('a') # I extracted the first link using the "a" tag
54 | print(links)
55 |
56 |
 57 | # ## Extract data from inner HTML
58 |
 59 | # here I extracted the href attribute from the anchor tag
60 | print(links['href'])
61 |
 62 | ## or another way
 63 | ## extracting the href attribute from every anchor tag on the page
 64 | for a in soup.find_all('a', href=True):
 65 |     print(a['href'])
 66 | 
 67 | for i in soup.find_all('a'):  # print the visible text of every anchor tag
 68 |     print(i.text)
69 |
 70 | # similarly I got the class attribute from an anchor tag
71 | print(links['class'])
72 |
73 |
 74 | # ## find_all operation in bs4
75 |
 76 | # find_all() fetches all matching tags at once
 77 | many_link=soup.find_all('a') # here I extracted all the anchor tags on my website
 78 | total_links=len(many_link) # len() gives the number of items in the list
79 | print("total links in my website :",total_links)
80 | print()
 81 | for i in many_link[:6]: # slicing fetches only the first 6 links
82 | print(i)
83 |
 84 | second_link=many_link[1] # here I fetch the second link, which sits at index 1 in many_link
85 | print(second_link)
86 | print()
 87 | print("href is :",second_link['href']) # only the href is extracted from the anchor tag
88 |
89 |
90 | # select div tag from second link
91 | nested_div=second_link.find('div')
 92 | # As you can see the div element is extracted; it also has inner elements
93 | print(nested_div)
94 | print()
 95 | # here I extracted the class attribute from the div, but it is returned as a list
96 | z=(nested_div['class'])
97 | print(z)
98 | print(type(z))
99 | print()
100 | # " ".join() converts the list of class names into a single string
101 | print("class name of div is :"," ".join(nested_div['class']))
102 |
103 |
104 | # ## Scrape data from Wikipedia
105 |
106 | wiki=requests.get("https://en.wikipedia.org/wiki/World_War_II")
107 | soup=BeautifulSoup(wiki.text,'html.parser')
108 | print(soup.find('title'))
109 |
110 |
111 | # ### find html tags with classes
112 |
113 | ww2_contents=soup.find_all("div",class_='toc')
114 | for i in ww2_contents:
115 | print(i.text)
116 |
117 |
118 | overview=soup.find_all('table',class_='infobox vevent')
119 | for z in overview:
120 | print(z.text)
121 |
122 | images=soup.find_all('img')
123 | 
124 | # a bare `images` expression would only display the list in a notebook;
125 | # in a script you need print:
126 | print(images)
127 |
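# If you only need the image URLs, a minimal sketch (keeping only <img> tags that
# actually carry a src attribute) would be:
#
# for img in soup.find_all('img', src=True):
#     print(img['src'])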
128 |
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | 
2 |
3 | Introduction:
4 |
  5 | Web scraping (also known as web harvesting or web data extraction) is the practice of extracting data from websites using their HTML structure. In this post, I explain the basic fundamentals of web scraping with Python and demonstrate them live with two Python libraries, BeautifulSoup and requests.
6 |
7 | What you will learn from this post:
8 |
9 | - basic understanding of web scraping
10 | - how to extract data from a website using classes and HTML tags
 11 | - how to use the requests module to fetch data
 12 | - how to use BeautifulSoup
13 |
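As a quick preview, here is a minimal sketch of the approach used throughout this repo (the blog URL is simply the example used in the scripts, and the snippet assumes the page responds with normal HTML that contains a `<title>` tag):

```python
import requests
from bs4 import BeautifulSoup

# download the page and parse its HTML
source = requests.get("https://getpython.wordpress.com/")
soup = BeautifulSoup(source.text, "html.parser")

print(soup.find("title").text)              # the page title
for link in soup.find_all("a", href=True)[:5]:
    print(link["href"])                     # the first few link targets
```
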
14 | Requirements:
15 |
16 | - python3
17 | - requests
18 | - bs4
19 |
20 | Install required dependencies :
21 |
22 | - clone or download it from here
 23 | - install the dependencies from the requirement.txt file
 24 | pip install -r requirement.txt
25 |
26 |
27 |
28 | How to run this code
29 |
 30 | - there are two source code files, one with a .py extension and another with a .ipynb extension
 31 | - one can run the "Web Scraping with BeautifulSoup.py" file by running this command in a terminal: python3 "Web Scraping with BeautifulSoup.py"
 32 | - one can run the "Web Scraping with BeautifulSoup.ipynb" file in Jupyter Notebook
 33 | - one can install Jupyter Notebook with this command: pip3 install jupyter
 34 | - the CLI scraping tool is under development; only a beta version is available for now
35 |
36 | ----------------------------------------------------------------------------------------
37 | HAPPY CODING
38 |
--------------------------------------------------------------------------------
/requirement.txt:
--------------------------------------------------------------------------------
1 | async-generator==1.10
2 | attrs==21.4.0
3 | beautifulsoup4==4.10.0
4 | beautifultable==1.0.1
5 | certifi==2021.10.8
6 | cffi==1.15.0
7 | charset-normalizer==2.0.12
8 | cryptography==36.0.1
9 | h11==0.13.0
10 | idna==3.3
11 | outcome==1.1.0
12 | pycparser==2.21
13 | pyOpenSSL==22.0.0
14 | PySocks==1.7.1
15 | requests==2.27.1
16 | selenium==4.1.2
17 | sniffio==1.2.0
18 | sortedcontainers==2.4.0
19 | soupsieve==2.3.1
20 | trio==0.20.0
21 | trio-websocket==0.9.2
22 | urllib3==1.26.8
23 | wcwidth==0.2.5
24 | wsproto==1.1.0
25 |
--------------------------------------------------------------------------------
/scrap wikipedia.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/spider-yamet/web-scraping-with-python/52056b1890c84fbdedb8abd8914b01d949b68f54/scrap wikipedia.png
--------------------------------------------------------------------------------
/web_scraping_command_line_tool.py:
--------------------------------------------------------------------------------
1 | # import required modules
2 | import json
3 | import requests
4 | from datetime import datetime
5 | from urllib.parse import urlparse
6 | from bs4 import BeautifulSoup
7 | from beautifultable import BeautifulTable
8 |
9 |
10 |
11 | def load_json(database_json_file="scraped_data.json"):
12 | """
 13 |     Load JSON data from the scraped_data.json file if it exists; otherwise return an empty dict.
14 | """
15 | try:
16 | with open(database_json_file, "r") as read_it:
17 | all_data_base = json.loads(read_it.read())
18 | return all_data_base
 19 |     except (FileNotFoundError, json.JSONDecodeError):
20 | all_data_base = dict()
21 | return all_data_base
22 |
23 |
24 | def save_scraped_data_in_json(data, database_json_file="scraped_data.json"):
25 | """
 26 |     Save the scraped data in JSON format to scraped_data.json, creating the file if needed.
 27 |     If the file already exists, it is rewritten with the updated data.
28 | """
 29 |     with open(database_json_file, "w") as file_obj:
 30 |         file_obj.write(json.dumps(data))
32 |
33 |
34 | def existing_scraped_data_init(json_db):
35 | """
 36 |     Make sure the loaded JSON database has a 'scraped_data' key; create an empty dict for it if missing.
37 | """
38 | scraped_data = json_db.get("scraped_data")
39 | if scraped_data is None:
40 | json_db['scraped_data'] = dict()
41 |
42 | return None
43 |
44 |
45 | def scraped_time_is():
46 | """
 47 |     Create a timestamp string so each scrape can be tracked.
48 | """
49 | now = datetime.now()
50 | dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
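    # e.g. this might return "25/03/2024 14:30:05" (day/month/year hour:minute:second)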
51 | return dt_string
52 |
53 | def process_url_request(website_url):
54 | """
 55 |     Request the provided URL with the requests module and, if the request succeeds,
 56 |     construct a BeautifulSoup object from the response for scraping.
 57 |     """
 58 |     response = requests.get(website_url)
 59 |     if response.status_code == 200:
 60 |         soup = BeautifulSoup(response.text, 'html.parser')
 61 |         return soup
62 | return None
63 |
 64 | def process_beautiful_soup_data(soup):
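    """
    Pull the commonly useful pieces (title, anchors, images, headings and
    paragraph text) out of a BeautifulSoup object and return them as a dict.
    """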
65 | return {
66 | 'title': soup.find('title').text,
67 | 'all_anchor_href': [i['href'] for i in soup.find_all('a', href=True)],
68 | 'all_anchors': [str(i) for i in soup.find_all('a')],
69 | 'all_images_data': [ str(i) for i in soup.find_all('img')],
 70 |         'all_images_source_data': [i['src'] for i in soup.find_all('img', src=True)],
71 | 'all_h1_data': [i.text for i in soup.find_all('h1')],
72 | 'all_h2_data': [i.text for i in soup.find_all('h2')],
73 | 'all_h3_data': [i.text for i in soup.find_all('h3')],
74 | 'all_p_data': [i.text for i in soup.find_all('p')]
75 | }
76 |
77 |
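# A minimal sketch of how the two helpers above compose (hypothetical URL, not part of the menu below):
#
#   soup = process_url_request("https://example.com")
#   if soup:
#       packet = process_beautiful_soup_data(soup)
#       print(packet['title'])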
78 |
 79 | # The menu runs in an infinite loop so the program keeps going until the user chooses to exit.
80 | while True:
81 |
82 | print(""" ================ Welcome to this scraping program =============
 83 |     ==>> press 1 to view the websites scraped so far
 84 |     ==>> press 2 to scrape a single website
 85 |     ==>> press 3 to exit
86 | """)
87 |
88 | choice = int(input("==>> Please enter your choice :"))
89 |
90 | # Load json function called for fetching/creating data from json file.
91 | local_json_db = load_json()
92 | existing_scraped_data_init(local_json_db)
93 |
94 | if choice == 1:
 95 |         # I used BeautifulTable to present the scraped data in a tidy table.
96 | # you guys can read more about from this link https://beautifultable.readthedocs.io/en/latest/index.html
97 | scraped_websites_table = BeautifulTable()
 98 |         scraped_websites_table.columns.header = ["Sr no.", "Alias name", "Website domain", "Title", "Scraped at", "Status"]
99 | scraped_websites_table.set_style(BeautifulTable.STYLE_BOX_DOUBLED)
100 |
101 |
102 | local_json_db = load_json()
103 | for count, data in enumerate(local_json_db['scraped_data']):
104 | scraped_websites_table.rows.append([count + 1,
105 | local_json_db['scraped_data'][data]['alias'],
106 | local_json_db['scraped_data'][data]['domain'],
107 | local_json_db['scraped_data'][data]['title'],
108 | local_json_db['scraped_data'][data]['scraped_at'],
109 | local_json_db['scraped_data'][data]['status']])
110 | # all_scraped_websites = [websites['name'] for websites in local_json_db['scraped_data']]
111 | if not local_json_db['scraped_data']:
112 | print('===> No existing data found !!!')
113 | print(scraped_websites_table)
114 |
115 | elif choice == 2:
116 | print()
117 |         url_for_scrap = input("===> Please enter the URL you want to scrape: ")
118 |         page_soup = process_url_request(url_for_scrap)
119 |         if page_soup:
120 |             scraped_data_packet = process_beautiful_soup_data(page_soup)
121 | print()
122 | print(' =====> Data scraped successfully !!!')
123 | key_for_storing_data = input("enter alias name for saving scraped data :")
124 | scraped_data_packet['url'] = url_for_scrap
125 | scraped_data_packet['name'] = key_for_storing_data
126 | scraped_data_packet['scraped_at'] = scraped_time_is()
127 | if key_for_storing_data in local_json_db['scraped_data']:
128 | key_for_storing_data = key_for_storing_data + str(scraped_time_is())
129 |             print("Provided key already exists, so data is stored as : {}".format(key_for_storing_data))
130 | scraped_data_packet['alias'] = key_for_storing_data
131 | scraped_data_packet['status'] = True
132 | scraped_data_packet['domain'] = urlparse(url_for_scrap).netloc
133 |
134 | local_json_db['scraped_data'][key_for_storing_data] = scraped_data_packet
135 | print(
136 | 'scraped data is:', local_json_db['scraped_data'][key_for_storing_data]
137 | )
138 | save_scraped_data_in_json(local_json_db)
139 | # load data
140 | local_json_db = load_json()
141 | print(' =====> Data saved successfully !!!')
142 | print()
143 | elif choice == 3:
144 | print('Thank you for using !!!')
145 | break
146 |
147 | elif choice == 4:
148 | print('Thank you for using !!!')
149 | break
150 |
151 | else:
152 | print("enter a valid choice ")
--------------------------------------------------------------------------------