├── README.md
└── crawler.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

UrlCrawler

author: saucerman

description: A full-site URL crawler written in Python. It collects every URL on a website so the site's directory structure can be analyzed.

## Notes

Multiprocessing and multithreading are not used yet, so the crawler is not fast; you can modify the source yourself.

The basic idea is shown below:

![](http://ww1.sinaimg.cn/large/005GjT4tgy1fx4i98ipkqj30ji091wev.jpg)

--------------------------------------------------------------------------------
/crawler.py:
--------------------------------------------------------------------------------

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# author: saucerman
# project: https://github.com/saucer-man/UrlCrawler

"""
description: full-site URL crawling script
"""
import re
import sys
import time

import requests

try:
    import tldextract
except ImportError:
    print('module tldextract not found\nyou can try: pip install tldextract')
    sys.exit()

# Browser-like User-Agent used for every request.
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36'}


def domain_get():
    """Ask for the URL of the website to crawl and check that it is reachable."""
    url = input("Please input the url of website:")
    if '//' not in url:
        url = 'http://' + url
    try:
        requests.head(url, headers=HEADERS)
        return url
    except requests.RequestException:
        print("your url is incorrect!!")
        return domain_get()


class spider():
    def __init__(self, domain, key, depth):
        self.domain = domain        # start URL to crawl
        self.depth = depth          # crawl depth (number of rounds)
        self.urls_all = set()       # all URLs collected so far
        self.key = key              # site host key, used to exclude external links

    def page_spider(self, url):
        """Fetch one page and return the new same-site links found on it."""
        try:
            r = requests.get(url, headers=HEADERS, timeout=2)
            r.encoding = r.apparent_encoding
            pagetext = r.text
            pagelinks = re.findall(r'(?<=href=\").*?(?=\")|(?<=href=\').*?(?=\')', pagetext)
        except requests.RequestException:
            return set()

        # 1. Drop links that point to a different host.
        url_list = set()
        for link in pagelinks:
            if self.key in link:
                url_list.add(link)

        # 2. Drop links that have already been collected.
        url_list = url_list - self.urls_all
        self.urls_all.update(url_list)
        return url_list  # returns a set

    def run(self):
        url_list = set([self.domain])   # the first round crawls the start URL only
        while self.depth >= 1:          # each round crawls every link found in the previous round
            print("Rounds remaining: %d" % self.depth)
            url_list_tmp = set()
            for url in url_list:
                url_list_tmp.update(self.page_spider(url))
            url_list = url_list_tmp
            self.depth = self.depth - 1

        with open('result.txt', 'w') as file:
            for url in self.urls_all:
                file.write(url)
                file.write('\n')


if __name__ == '__main__':
    start = time.perf_counter()
    domain = domain_get()
    print('domain:', domain)
    key_tmp = tldextract.extract(domain)
    # Used to exclude external links: any crawled URL that does not contain key is discarded.
    # 'https://www.xiaogeng.com.cn/admin?id=6' ==> 'www.xiaogeng.com.cn'
    key = '.'.join(part for part in (key_tmp.subdomain, key_tmp.domain, key_tmp.suffix) if part)
    print('key:', key)
    print('Start crawling...\n')
    spider = spider(domain=domain, key=key, depth=3)
    spider.run()
    print('Results saved to result.txt')
    print('time:', time.perf_counter() - start)

--------------------------------------------------------------------------------
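
The README notes that the script is single-threaded and invites readers to modify the source for speed. As one possible direction (not part of the original project), the sketch below shows how the fetches of a single crawl round could be spread across worker threads with `concurrent.futures.ThreadPoolExecutor`. The `crawl_round` helper and the `max_workers` value are assumptions for illustration; inside `spider.run()`, the inner `for` loop over `url_list` would be replaced by a call to it.

```python
# Hypothetical helper (not in crawler.py): runs one crawl round concurrently.
# It assumes a spider-like object that exposes page_spider(url).
from concurrent.futures import ThreadPoolExecutor


def crawl_round(crawler, url_list, max_workers=10):
    """Fetch every URL of the current round in parallel and return the new links found."""
    next_round = set()
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Same call as the sequential loop, only distributed over worker threads.
        for links in pool.map(crawler.page_spider, url_list):
            next_round.update(links)
    return next_round
```

Note that `page_spider()` also updates `self.urls_all`, so a real multithreaded version would need to guard that set with a lock or move the deduplication out of `page_spider()`; the sketch only illustrates the shape of the change.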