├── README.md ├── LICENSE ├── aave scraper.py └── aave scraper v2.py /README.md: -------------------------------------------------------------------------------- 1 | # AAVE-docs-scraper 2 | Web Scraper program that scrapes data from https://docs.aave.com/ and write to the respective files in ./data/ directory 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Madhav Kumar 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /aave scraper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Title: AAVE's Docs Scraper 4 | # Author: MKNC 5 | # created: 10-01-2023 19:30 IST 6 | # version 1 7 | 8 | # requests --> for scraping html data from an url 9 | # BeautifulSoup --> for parsing the html data in a tree like structure 10 | import requests 11 | from bs4 import BeautifulSoup 12 | from urllib.parse import urljoin 13 | 14 | import os 15 | from platform import uname 16 | def clearScreen(): 17 | os.system('cls' if 'win' in uname().system.lower() else 'clear') 18 | 19 | # for saving the data 20 | def saveData(fname, data): 21 | with open(fname,'w',encoding="utf-8") as file: 22 | file.write(str(data)) 23 | 24 | # scraping links 25 | def get_paths(domain, url): 26 | response = requests.get(url) 27 | soup = BeautifulSoup(response.text, 'html.parser') 28 | 29 | urls = [url] 30 | for link in soup.find_all('a'): 31 | path = link.get('href') 32 | if path.startswith('/'): 33 | path = urljoin(url,path) 34 | if path.startswith(domain) and '#' not in path: 35 | urls.append(path) 36 | 37 | return list(set(urls)) 38 | 39 | # page content 40 | def fetch_data(url, content_class, dirName='data'): 41 | try: 42 | print("[+] Scraping",url) 43 | response = requests.get(url) 44 | data = BeautifulSoup(response.text, 'html.parser') 45 | title = data.find('title').text 46 | data = data.find_all(class_=content_class)[1:] # skips the first element 47 | except Exception as e: 48 | print("[!] Failed to fetch",url) 49 | print("[ERROR:]",e) 50 | return 51 | 52 | # saving data 53 | fileName = f"{dirName}/{title}.txt" 54 | content = f"[{title}]\n" 55 | content += f"[{url}]]\n" 56 | for i in data: 57 | content+='\n'+i.text.strip()+'\n' 58 | saveData(fileName, content) 59 | 60 | if __name__ == "__main__": 61 | clearScreen() 62 | print("AAVE's Docs Scraper\n") 63 | print("[+] Data will saved in 'data/' in the currect directory") 64 | 65 | if os.path.exists('data'): 66 | print("[-] 'data/' directory already Exists, saving all the data here") 67 | else: 68 | print("[+] Creating directory 'data/'") 69 | os.mkdir('data') 70 | 71 | # input 72 | data_class = "css-175oi2r r-bnwqim" 73 | url = "https://docs.aave.com/hub/" 74 | domain = '/'.join(url.split('/')[0:3])+'/' 75 | 76 | # output 77 | try: 78 | for path in get_paths(domain=domain, url=url): 79 | fetch_data(path, content_class=data_class, dirName='data') 80 | except KeyboardInterrupt: 81 | print("[-] KeyBoard interrupted") 82 | print("[-] Exiting...") 83 | 84 | -------------------------------------------------------------------------------- /aave scraper v2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Title: AAVE's Docs Scraper 4 | # Author: MKNC 5 | # created: 10-01-2023 19:30 IST 6 | # version 1 7 | 8 | # requests --> for scraping html data from an url 9 | # BeautifulSoup --> for parsing the html data in a tree like structure 10 | import requests 11 | from bs4 import BeautifulSoup 12 | from urllib.parse import urljoin, urlparse 13 | 14 | import os 15 | from platform import uname 16 | def printScreen(): 17 | os.system('cls' if 'win' in uname().system.lower() else 'clear') 18 | print("<"+"="*50+" AAVE's Docs Scraper "+"="*50+">\n") 19 | 20 | # for unique directory names 21 | def get_dirname(dname): 22 | if os.path.exists(dname): 23 | print(dname) 24 | dname += "1" 25 | print(dname) 26 | return get_dirname(dname) 27 | return dname 28 | 29 | def makeDir(dname): 30 | os.mkdir(dname) 31 | # if os.path.exists(dname): 32 | # new_dname = get_dirname(dname) 33 | # print(f"[-] '{dname}/' directory already Exists, creating and saving all the data in '{new_dname}/' directory") 34 | # os.mkdir(new_dname) 35 | # else: 36 | # print(f"[+] Creating directory '{dname}'") 37 | # os.mkdir(dname) 38 | 39 | # for saving the data 40 | def saveData(fname, data): 41 | with open(fname,'w',encoding="utf-8") as file: 42 | file.write(str(data)) 43 | 44 | # links crawler class 45 | class Crawler: 46 | def __init__(self, base_url, restricted_domain='', crawl_limit=20): 47 | self.urls = [base_url] 48 | self.links_found = [] 49 | self.restricted_domain = restricted_domain 50 | self.crawl_limit = crawl_limit 51 | 52 | def fetch_all_links(self, url): 53 | response = requests.get(url) 54 | soup = BeautifulSoup(response.text, 'html.parser') 55 | for link in soup.find_all('a'): 56 | if len(self.links_found)>self.crawl_limit: return 57 | 58 | path = link.get('href') 59 | if not path.startswith('http'): path = urljoin(url,path) 60 | 61 | if '#' in path: continue 62 | if urlparse(path).hostname != self.restricted_domain and self.restricted_domain!='': continue 63 | if path in self.links_found: continue 64 | 65 | print("[*] Link found -",path) 66 | self.urls.append(path) 67 | self.links_found.append(path) 68 | 69 | def crawl(self): 70 | while self.urls: 71 | url = self.urls.pop(0) 72 | print(f"[+] Crawling: {url}\n") 73 | try: 74 | self.fetch_all_links(url) 75 | except Exception as e: 76 | print(f'[!] Failed to crawl: {url}') 77 | print("[!] REASON:",e,'\n') 78 | 79 | # page content 80 | index = 0 81 | def fetch_data(url, content_class, dirName='.'): 82 | global index 83 | try: 84 | print(f"[{index}] Scraping {url}") 85 | index += 1 86 | response = requests.get(url) 87 | data = BeautifulSoup(response.text, 'html.parser') 88 | # title = data.find('title').text # for filename = title of page 89 | data = data.find_all(class_=content_class)[1:] # skips the first element 90 | except Exception as e: 91 | print("[!] Failed to fetch",url) 92 | print("[ERROR]",e) 93 | return 94 | 95 | # title => filename 96 | title = urlparse(url).path[1:-1].replace('/','-') 97 | fileName = f"{dirName}/{title}.txt" 98 | 99 | # data -> file 100 | content = f"[{title}]\n" 101 | content += f"[{url}]\n" 102 | for i in data: 103 | content += '\n'+i.text.strip()+'\n' 104 | saveData(fileName, content) 105 | 106 | 107 | def scrape_all(url, content_class, dirName, crawl_limit): 108 | domain = urlparse(url).hostname 109 | aave = Crawler(base_url=url, restricted_domain=domain, crawl_limit=crawl_limit) 110 | aave.crawl() 111 | saveData(f'{dirName}/links_found.txt',"/n".join(aave.links_found)) 112 | 113 | for path in aave.links_found: 114 | try: 115 | fetch_data(path, content_class,dirName) 116 | except Exception as e: 117 | print("[!] Failed to fetch",path) 118 | print("[ERROR]",e) 119 | 120 | 121 | 122 | if __name__ == "__main__": 123 | printScreen() 124 | 125 | root_dir = input("[=] create a directory in currect directory for saving all the data: ") 126 | root_dir = get_dirname(root_dir) 127 | makeDir(root_dir) 128 | 129 | url = "https://docs.aave.com/" 130 | ## UPDATE THIS VALUE IF NO DATA IS FETCHED 131 | # it might be possible that the website developers may have changed the content class or maybe just modified 132 | data_class = "r-1xnzce8" 133 | 134 | crawl_limit = input("[=] Enter a crawl limit: ") 135 | if not crawl_limit.isnumeric(): 136 | print("[!] Enter a valid number") 137 | exit() 138 | 139 | try: 140 | scrape_all(url, data_class, root_dir, int(crawl_limit)) 141 | except KeyboardInterrupt: 142 | print("[-] KeyBoard interrupted") 143 | print("Exiting...") 144 | --------------------------------------------------------------------------------