├── README.md
├── License
└── crawler.py

/README.md:
--------------------------------------------------------------------------------
# Links-Crawler
This program crawls a URL and recursively fetches and crawls all the links found on each page.
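
## Usage
Requires `requests` and `beautifulsoup4`. Run `python3 crawler.py` and enter a base URL when prompted, or use the `Crawler` class directly (a minimal sketch; the URL and output filename below are placeholders):

```python
from crawler import Crawler

crawler = Crawler(base_url="https://example.com")
crawler.crawl()                            # breadth-first crawl starting from base_url
crawler.saveData(fname="links_found.txt")  # writes one discovered link per line
```

The crawl runs until the queue of discovered links is exhausted (or until interrupted with Ctrl+C when run as a script).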

--------------------------------------------------------------------------------
/License:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 Madhav Kumar

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/crawler.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3

"""
Title: Links Crawler
author: MKNC [https://github.com/Madhav-MKNC/]
created: 12-01-2023 01:50
"""

import os
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


def printScreen():
    # Clear the terminal (Windows vs. POSIX) and print the banner.
    os.system('cls' if os.name == 'nt' else 'clear')
    print("[" + "=" * 30 + " LINKS CRAWLER " + "=" * 30 + "]\n")


class Crawler:
    def __init__(self, base_url, restricted_domain='https://'):
        # Queue of URLs still to visit, plus bookkeeping of links already seen.
        self.urls = [base_url]
        self.urls_dict = {base_url: True}  # mark the start URL as seen so it is not re-queued
        self.links_found = []
        # Only links starting with this prefix are followed.
        self.restricted_domain = restricted_domain

    def inList(self, url):
        # True if the URL has already been recorded.
        return self.urls_dict.get(url, False)

    def crawl_all_links(self, url):
        # Fetch one page and queue every new, in-scope link found on it.
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        for link in soup.find_all('a'):
            path = link.get('href')
            if not path:
                continue  # <a> tag without an href
            if not path.startswith('http'):
                path = urljoin(url, path)

            if self.inList(path):
                continue
            if '#' in path:
                continue  # skip in-page anchors
            if not path.startswith(self.restricted_domain):
                continue

            print("[*] Link found -", path)
            self.urls.append(path)
            self.links_found.append(path)
            self.urls_dict[path] = True

    def crawl(self):
        # Breadth-first crawl: pop the next URL from the queue and scan it.
        while self.urls:
            url = self.urls.pop(0)
            print("\n[+] Crawling:", url)
            try:
                self.crawl_all_links(url)
            except Exception as e:
                print(f'[!] Failed to crawl: {url}')
                print("[!] REASON:", e, '\n')

    def saveData(self, fname='links_found.txt'):
        # Dump every collected link, one per line.
        with open(fname, 'w', encoding="utf-8") as file:
            file.write("\n".join(self.links_found))
        print(f"[+] List saved in '{fname}'")


if __name__ == '__main__':
    printScreen()

    url = input('[=] Enter the base url you want to crawl: ')
    # url = "https://docs.aave.com/"
    if not url.startswith('http'):
        url = 'https://' + url

    aave = Crawler(base_url=url)
    try:
        aave.crawl()
    except KeyboardInterrupt:
        print("\n[Program Stopped]")
    aave.saveData()

    print()
    print("len(urls) =", len(aave.links_found))
    print("len(set(urls)) =", len(set(aave.links_found)))

--------------------------------------------------------------------------------