# Repository layout (as listed in the original dump):
#   ├── .idea
#       └── .gitignore
#   └── main.py
#
# /.idea/.gitignore:
#   # Default ignored files
#   /shelf/
#   /workspace.xml
#
# /main.py:
"""Crawl a site starting at ``target_url`` and print every in-scope link found."""

import sys
from urllib.parse import urldefrag, urljoin, urlparse

import requests
from bs4 import BeautifulSoup

target_url = "https://test.com"
foundLinks = []

# Seconds to wait for a server before giving up; without a timeout a single
# unresponsive host would block the crawl indefinitely.
REQUEST_TIMEOUT = 10


def make_request(url: str) -> BeautifulSoup:
    """Fetch *url* and return its body parsed with the ``html.parser`` backend.

    Raises:
        requests.RequestException: if the request fails or times out.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    return BeautifulSoup(response.text, "html.parser")


def _in_scope(link: str) -> bool:
    """Return True when *link* has the same scheme and host as ``target_url``
    and its path starts with the path of ``target_url``."""
    target = urlparse(target_url)
    candidate = urlparse(link)
    # Compare parsed components rather than doing a substring test, so that
    # e.g. "https://test.com.evil.org/" or "?next=https://test.com" on a
    # foreign host are not mistaken for pages of the target site.
    same_origin = (candidate.scheme, candidate.netloc.lower()) == (
        target.scheme,
        target.netloc.lower(),
    )
    return same_origin and candidate.path.startswith(target.path)


def _page_links(page_url: str) -> list:
    """Return the absolute, fragment-free, in-scope links on *page_url*.

    Links are returned in document order. A page that cannot be fetched is
    reported on stderr and contributes no links.
    """
    try:
        soup = make_request(page_url)
    except requests.RequestException as exc:
        print(f"skipping {page_url}: {exc}", file=sys.stderr)
        return []

    links = []
    for anchor in soup.find_all("a"):
        href = anchor.get("href")
        if not href:
            continue
        try:
            # Resolve relative hrefs against the page they appear on and drop
            # the "#fragment", which never identifies a different document.
            absolute = urldefrag(urljoin(page_url, href)).url
            if _in_scope(absolute):
                links.append(absolute)
        except ValueError:
            # urllib raises ValueError for malformed URLs (e.g. a broken
            # IPv6 literal); such hrefs cannot be crawled, so skip them.
            continue
    return links


def crawl(url: str) -> None:
    """Crawl depth-first from *url*, printing each new in-scope link.

    Every newly discovered link is appended to the module-level
    ``foundLinks`` list and printed, in the order it is discovered. The
    traversal uses an explicit stack instead of recursion so that deep link
    chains cannot exhaust the interpreter's recursion limit.
    """
    start = urldefrag(url).url
    seen = set(foundLinks)  # O(1) membership; foundLinks keeps the order
    # Each stack entry is an iterator over the links of a page being
    # processed, which reproduces the visit order of a recursive walk.
    stack = [iter(_page_links(start))]
    while stack:
        for link in stack[-1]:
            if link in seen:
                continue
            seen.add(link)
            foundLinks.append(link)
            print(link)
            if link == start:
                # The start page is already being crawled; record the link
                # but do not fetch the page a second time.
                continue
            stack.append(iter(_page_links(link)))
            break
        else:
            # Current page has no unvisited links left.
            stack.pop()


if __name__ == "__main__":
    crawl(target_url)