├── README.md
├── LICENSE
├── aave scraper.py
└── aave scraper v2.py


/README.md:
--------------------------------------------------------------------------------
1 | # AAVE-docs-scraper
2 | Web Scraper program that scrapes data from https://docs.aave.com/ and write to the respective files in ./data/ directory
3 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 Madhav Kumar
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/aave scraper.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | # Title:    AAVE's Docs Scraper
 4 | # Author:   MKNC
 5 | # created:  10-01-2023 19:30 IST
 6 | # version 1
 7 | 
 8 | # requests      --> for scraping html data from an url
 9 | # BeautifulSoup --> for parsing the html data in a tree like structure
10 | import requests
11 | from bs4 import BeautifulSoup
12 | from urllib.parse import urljoin
13 | 
14 | import os
15 | from platform import uname
16 | def clearScreen():
17 |     os.system('cls' if 'win' in uname().system.lower() else 'clear')
18 | 
19 | # for saving the data
20 | def saveData(fname, data):
21 |     with open(fname,'w',encoding="utf-8") as file:
22 |         file.write(str(data))
23 | 
24 | # scraping links
25 | def get_paths(domain, url):
26 |     response = requests.get(url)
27 |     soup = BeautifulSoup(response.text, 'html.parser')
28 | 
29 |     urls = [url]
30 |     for link in soup.find_all('a'):
31 |         path = link.get('href')
32 |         if path.startswith('/'):
33 |             path = urljoin(url,path)
34 |         if path.startswith(domain) and '#' not in path: 
35 |             urls.append(path)
36 | 
37 |     return list(set(urls))
38 | 
39 | # page content
40 | def fetch_data(url, content_class, dirName='data'):
41 |     try:
42 |         print("[+] Scraping",url)
43 |         response = requests.get(url)
44 |         data = BeautifulSoup(response.text, 'html.parser')
45 |         title = data.find('title').text
46 |         data = data.find_all(class_=content_class)[1:]      # skips the first element
47 |     except Exception as e:
48 |         print("[!] Failed to fetch",url)
49 |         print("[ERROR:]",e)
50 |         return
51 | 
52 |     # saving data
53 |     fileName = f"{dirName}/{title}.txt"
54 |     content = f"[{title}]\n"
55 |     content += f"[{url}]]\n"
56 |     for i in data:
57 |         content+='\n'+i.text.strip()+'\n'
58 |     saveData(fileName, content)
59 | 
60 | if __name__ == "__main__":
61 |     clearScreen()
62 |     print("AAVE's Docs Scraper\n")
63 |     print("[+] Data will saved in 'data/' in the currect directory")
64 | 
65 |     if os.path.exists('data'): 
66 |         print("[-] 'data/' directory already Exists, saving all the data here")
67 |     else:
68 |         print("[+] Creating directory 'data/'")
69 |         os.mkdir('data')
70 |     
71 |     # input
72 |     data_class = "css-175oi2r r-bnwqim"
73 |     url = "https://docs.aave.com/hub/"
74 |     domain = '/'.join(url.split('/')[0:3])+'/'
75 |     
76 |     # output
77 |     try:
78 |         for path in get_paths(domain=domain, url=url):
79 |             fetch_data(path, content_class=data_class, dirName='data')
80 |     except KeyboardInterrupt:
81 |         print("[-] KeyBoard interrupted")
82 |         print("[-] Exiting...")
83 | 
84 | 


--------------------------------------------------------------------------------
/aave scraper v2.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | # Title:    AAVE's Docs Scraper
  4 | # Author:   MKNC
  5 | # created:  10-01-2023 19:30 IST
  6 | # version 1
  7 | 
  8 | # requests      --> for scraping html data from an url
  9 | # BeautifulSoup --> for parsing the html data in a tree like structure
 10 | import requests
 11 | from bs4 import BeautifulSoup
 12 | from urllib.parse import urljoin, urlparse
 13 | 
 14 | import os
 15 | from platform import uname
 16 | def printScreen():
 17 |     os.system('cls' if 'win' in uname().system.lower() else 'clear')
 18 |     print("<"+"="*50+" AAVE's Docs Scraper "+"="*50+">\n")
 19 | 
 20 | # for unique directory names
 21 | def get_dirname(dname):
 22 |     if os.path.exists(dname):
 23 |         print(dname)
 24 |         dname += "1"
 25 |         print(dname)
 26 |         return get_dirname(dname)
 27 |     return dname
 28 | 
 29 | def makeDir(dname):
 30 |     os.mkdir(dname)
 31 |     # if os.path.exists(dname): 
 32 |     #     new_dname = get_dirname(dname)
 33 |     #     print(f"[-] '{dname}/' directory already Exists, creating and saving all the data in '{new_dname}/' directory")
 34 |     #     os.mkdir(new_dname)
 35 |     # else:
 36 |     #     print(f"[+] Creating directory '{dname}'")
 37 |     #     os.mkdir(dname)
 38 | 
 39 | # for saving the data
 40 | def saveData(fname, data):
 41 |     with open(fname,'w',encoding="utf-8") as file:
 42 |         file.write(str(data))
 43 | 
 44 | # links crawler class
 45 | class Crawler:
 46 |     def __init__(self, base_url, restricted_domain='', crawl_limit=20):
 47 |         self.urls = [base_url]
 48 |         self.links_found = []
 49 |         self.restricted_domain = restricted_domain
 50 |         self.crawl_limit = crawl_limit
 51 | 
 52 |     def fetch_all_links(self, url):
 53 |         response = requests.get(url)
 54 |         soup = BeautifulSoup(response.text, 'html.parser')
 55 |         for link in soup.find_all('a'):
 56 |             if len(self.links_found)>self.crawl_limit: return
 57 | 
 58 |             path = link.get('href')
 59 |             if not path.startswith('http'): path = urljoin(url,path)
 60 | 
 61 |             if '#' in path: continue
 62 |             if urlparse(path).hostname != self.restricted_domain and self.restricted_domain!='': continue
 63 |             if path in self.links_found: continue
 64 | 
 65 |             print("[*] Link found -",path)
 66 |             self.urls.append(path)
 67 |             self.links_found.append(path)
 68 | 
 69 |     def crawl(self):
 70 |         while self.urls:
 71 |             url = self.urls.pop(0)
 72 |             print(f"[+] Crawling: {url}\n")
 73 |             try:
 74 |                 self.fetch_all_links(url)
 75 |             except Exception as e:
 76 |                 print(f'[!] Failed to crawl: {url}')
 77 |                 print("[!] REASON:",e,'\n')
 78 | 
 79 | # page content
 80 | index = 0
 81 | def fetch_data(url, content_class, dirName='.'):
 82 |     global index
 83 |     try:
 84 |         print(f"[{index}] Scraping {url}")
 85 |         index += 1
 86 |         response = requests.get(url)
 87 |         data = BeautifulSoup(response.text, 'html.parser')
 88 |         # title = data.find('title').text     # for filename = title of page
 89 |         data = data.find_all(class_=content_class)[1:]      # skips the first element
 90 |     except Exception as e:
 91 |         print("[!] Failed to fetch",url)
 92 |         print("[ERROR]",e)
 93 |         return
 94 | 
 95 |     # title => filename
 96 |     title = urlparse(url).path[1:-1].replace('/','-')
 97 |     fileName = f"{dirName}/{title}.txt"
 98 | 
 99 |     # data -> file
100 |     content = f"[{title}]\n"
101 |     content += f"[{url}]\n"
102 |     for i in data:
103 |         content += '\n'+i.text.strip()+'\n'
104 |     saveData(fileName, content)
105 | 
106 | 
107 | def scrape_all(url, content_class, dirName, crawl_limit):
108 |     domain = urlparse(url).hostname
109 |     aave = Crawler(base_url=url, restricted_domain=domain, crawl_limit=crawl_limit)
110 |     aave.crawl()
111 |     saveData(f'{dirName}/links_found.txt',"/n".join(aave.links_found))
112 | 
113 |     for path in aave.links_found:
114 |         try:
115 |             fetch_data(path, content_class,dirName)
116 |         except Exception as e:
117 |             print("[!] Failed to fetch",path)
118 |             print("[ERROR]",e)
119 | 
120 | 
121 |     
122 | if __name__ == "__main__":
123 |     printScreen()
124 |     
125 |     root_dir = input("[=] create a directory in currect directory for saving all the data: ")
126 |     root_dir = get_dirname(root_dir)
127 |     makeDir(root_dir)
128 | 
129 |     url = "https://docs.aave.com/"
130 |     ## UPDATE THIS VALUE IF NO DATA IS FETCHED
131 |     # it might be possible that the website developers may have changed the content class or maybe just modified
132 |     data_class = "r-1xnzce8"
133 | 
134 |     crawl_limit = input("[=] Enter a crawl limit: ")
135 |     if not crawl_limit.isnumeric():
136 |         print("[!] Enter a valid number")
137 |         exit()
138 | 
139 |     try:
140 |         scrape_all(url, data_class, root_dir, int(crawl_limit))
141 |     except KeyboardInterrupt:
142 |         print("[-] KeyBoard interrupted")
143 |         print("Exiting...")
144 | 


--------------------------------------------------------------------------------