├── .gitignore ├── .idea ├── .name ├── dictionaries │ └── tanishindaira.xml ├── encodings.xml ├── misc.xml ├── modules.xml ├── python_collect_domain.iml ├── vcs.xml └── workspace.xml ├── README.md ├── __init__.py ├── collect.py └── logger.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .ropeproject 90 | -------------------------------------------------------------------------------- /.idea/.name: -------------------------------------------------------------------------------- 1 | python_collect_domain -------------------------------------------------------------------------------- /.idea/dictionaries/tanishindaira.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | Python 16 | 17 | 18 | 19 | 20 | Python 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 53 | 54 | 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/python_collect_domain.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 11 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.idea/workspace.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 17 | 18 | 19 | 20 | 21 | 22 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 87 | 88 | 95 | 96 | 97 | 98 | 99 | true 100 | 101 | 102 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 145 | 146 | 149 | 150 | 153 | 154 | 155 | 156 | 159 | 160 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 181 | 182 | 183 | 184 | 201 | 202 | 221 | 222 | 223 | 224 | 225 | 238 | 239 | 252 | 253 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 299 | 300 | 319 | 320 | 341 | 342 | 364 | 365 | 389 | 390 | 391 | 392 | 393 | 394 | 395 | 396 | 397 | 398 | 399 | 400 | 401 | 402 | 1482227454848 403 | 406 | 407 | 408 | 409 | 410 | 411 | 412 | 413 | 414 | 415 | 416 | 417 | 418 | 419 | 420 | 421 | 422 | 423 | 424 | 425 | 426 | 427 | 428 | 429 | 430 | 431 | 432 | 433 | 434 | 435 | 437 | 438 | 439 | 441 | 442 | 443 | 444 | 445 | 446 | 447 | 448 | 449 | 450 | 451 | 452 | 453 | 454 | 455 | 456 | 457 | 458 | 459 | 460 | 461 | 462 | 463 | 464 | 465 | 466 | 467 | 468 | 469 | 470 | 471 | 472 | 473 | 474 | 475 | 476 | 477 | 478 | 479 | 480 | 481 | 482 | 483 | 484 | 485 | 486 | 487 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # python_collect_domain 2 | python无限爬取URL,渗透必备 3 | 需要threadpool、requests、lxml模块请自行pip install 4 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sml2h3/python_collect_domain/28d2b07d3b5e5991ccac47b8da3a080e27db3abe/__init__.py -------------------------------------------------------------------------------- /collect.py: -------------------------------------------------------------------------------- 1 | #coding=utf-8 2 | #author:sml2h3 3 | #created:2016.12.20 4 | #email:sml2h3@gmail.com 5 | import requests 6 | import threadpool 7 | from lxml import etree 8 | from urlparse import * 9 | import sys 10 | from logger import Logger 11 | 12 | logger = Logger('collect.py') 13 | 14 | 15 | def get_url(url): 16 | """获取URL""" 17 | headers = {'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36'} 18 | try: 19 | page = requests.get(url, verify=False, timeout=3, headers=headers).text 20 | 21 | except requests.RequestException as e: 22 | logger.error(e) 23 | return [] 24 | 25 | try: 26 | ss = etree.HTML(page) 27 | urls = ss.xpath("//*[@href]/@href") 28 | 29 | except Exception as e: 30 | logger.error("dom化失败:{}".format(e)) 31 | return [] 32 | 33 | domain_url_list = deal_url(urls) 34 | 35 | return domain_url_list 36 | 37 | 38 | def deal_url(urls): 39 | """处理url, 获取domain""" 40 | res_list = [] 41 | 42 | if len(urls) == 0: 43 | return [] 44 | 45 | for i in urls: 46 | r = urlparse(i) 47 | domain = r.netloc 48 | domain = domain.replace(" ", '') 49 | if domain != '': 50 | if r.scheme == "http" or r.scheme == "https" or r.scheme == "ftp": 51 | u = r.scheme + "://" + r.netloc 52 | else: 53 | u = "http://" + r.netloc 54 | else: 55 | continue 56 | 57 | if u in urlArr: 58 | continue 59 | else: 60 | urlArr.append(u) 61 | res_list.append(u) 62 | 63 | return res_list 64 | 65 | 66 | def con(request, result): 67 | global allget 68 | for i in result: 69 | allget = allget + 1 70 | f.write(i+'\n') 71 | logger.info("当前已爬取"+str(allget)+"个Url:"+i) 72 | re = threadpool.makeRequests(get_url, result, con) 73 | [pool.putRequest(req) for req in re] 74 | 75 | 76 | if __name__ == '__main__': 77 | 78 | # 测试 79 | reload(sys) 80 | sys.setdefaultencoding('utf8') 81 | f2 = file('error_file.txt', 'w') 82 | sys.stderr = f2 83 | urlArr = [] 84 | allget = 0 85 | f = file("url.txt", "a+") 86 | data = get_url("http://www.baidu.com") 87 | pool = threadpool.ThreadPool(20) 88 | reqrest = threadpool.makeRequests(get_url, data, con) 89 | [pool.putRequest(req) for req in reqrest] 90 | pool.wait() 91 | f.close() 92 | -------------------------------------------------------------------------------- /logger.py: -------------------------------------------------------------------------------- 1 | # -*-coding:utf-8 -*- 2 | 3 | import logging 4 | 5 | 6 | class Logger(object): 7 | 8 | def __init__(self, name): 9 | self.logger = logging.getLogger(name) 10 | self.logger.setLevel(logging.DEBUG) 11 | 12 | # 创建一个handler,用于写入日志文件 13 | fh = logging.FileHandler('/tmp/test.log') 14 | 15 | # 再创建一个handler,用于输出到控制台 16 | ch = logging.StreamHandler() 17 | 18 | # 定义handler的输出格式formatter 19 | formatter = logging.Formatter('%(asctime)s-[ %(name)s ]-%(levelname)s: %(message)s') 20 | fh.setFormatter(formatter) 21 | ch.setFormatter(formatter) 22 | 23 | self.logger.addHandler(fh) 24 | self.logger.addHandler(ch) 25 | 26 | def error(self, msg): 27 | self.logger.error(msg) 28 | 29 | def info(self, msg): 30 | self.logger.info(msg) 31 | 32 | def warning(self, msg): 33 | self.logger.warning(msg) 34 | 35 | def debug(self, msg): 36 | self.logger.debug(msg) 37 | 38 | 39 | if __name__ == "__main__": 40 | logger = Logger("test") 41 | 42 | logger.debug('logger5 debug message') 43 | logger.info('logger5 info message') 44 | logger.warning('logger5 warning message') 45 | logger.error('logger5 error message') 46 | --------------------------------------------------------------------------------