├── .gitignore
├── .idea
├── .name
├── dictionaries
│ └── tanishindaira.xml
├── encodings.xml
├── misc.xml
├── modules.xml
├── python_collect_domain.iml
├── vcs.xml
└── workspace.xml
├── README.md
├── __init__.py
├── collect.py
└── logger.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 |
27 | # PyInstaller
28 | # Usually these files are written by a python script from a template
29 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 |
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 |
37 | # Unit test / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 |
48 | # Translations
49 | *.mo
50 | *.pot
51 |
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 |
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 |
60 | # Scrapy stuff:
61 | .scrapy
62 |
63 | # Sphinx documentation
64 | docs/_build/
65 |
66 | # PyBuilder
67 | target/
68 |
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 |
72 | # pyenv
73 | .python-version
74 |
75 | # celery beat schedule file
76 | celerybeat-schedule
77 |
78 | # dotenv
79 | .env
80 |
81 | # virtualenv
82 | venv/
83 | ENV/
84 |
85 | # Spyder project settings
86 | .spyderproject
87 |
88 | # Rope project settings
89 | .ropeproject
90 |
--------------------------------------------------------------------------------
/.idea/.name:
--------------------------------------------------------------------------------
1 | python_collect_domain
--------------------------------------------------------------------------------
/.idea/dictionaries/tanishindaira.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
--------------------------------------------------------------------------------
/.idea/encodings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 | Python
16 |
17 |
18 |
19 |
20 | Python
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
53 |
54 |
55 |
56 |
57 |
58 |
--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/.idea/python_collect_domain.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.idea/workspace.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 | true
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 |
128 |
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 |
137 |
138 |
139 |
140 |
141 |
142 |
143 |
144 |
145 |
146 |
147 |
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 |
158 |
159 |
160 |
161 |
162 |
163 |
164 |
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 |
173 |
174 |
175 |
176 |
177 |
178 |
179 |
180 |
181 |
182 |
183 |
184 |
185 |
186 |
187 |
188 |
189 |
190 |
191 |
192 |
193 |
194 |
195 |
196 |
197 |
198 |
199 |
200 |
201 |
202 |
203 |
204 |
205 |
206 |
207 |
208 |
209 |
210 |
211 |
212 |
213 |
214 |
215 |
216 |
217 |
218 |
219 |
220 |
221 |
222 |
223 |
224 |
225 |
226 |
227 |
228 |
229 |
230 |
231 |
232 |
233 |
234 |
235 |
236 |
237 |
238 |
239 |
240 |
241 |
242 |
243 |
244 |
245 |
246 |
247 |
248 |
249 |
250 |
251 |
252 |
253 |
254 |
255 |
256 |
257 |
258 |
259 |
260 |
261 |
262 |
263 |
264 |
265 |
266 |
267 |
268 |
269 |
270 |
271 |
272 |
273 |
274 |
275 |
276 |
277 |
278 |
279 |
280 |
281 |
282 |
283 |
284 |
285 |
286 |
287 |
288 |
289 |
290 |
291 |
292 |
293 |
294 |
295 |
296 |
297 |
298 |
299 |
300 |
301 |
302 |
303 |
304 |
305 |
306 |
307 |
308 |
309 |
310 |
311 |
312 |
313 |
314 |
315 |
316 |
317 |
318 |
319 |
320 |
321 |
322 |
323 |
324 |
325 |
326 |
327 |
328 |
329 |
330 |
331 |
332 |
333 |
334 |
335 |
336 |
337 |
338 |
339 |
340 |
341 |
342 |
343 |
344 |
345 |
346 |
347 |
348 |
349 |
350 |
351 |
352 |
353 |
354 |
355 |
356 |
357 |
358 |
359 |
360 |
361 |
362 |
363 |
364 |
365 |
366 |
367 |
368 |
369 |
370 |
371 |
372 |
373 |
374 |
375 |
376 |
377 |
378 |
379 |
380 |
381 |
382 |
383 |
384 |
385 |
386 |
387 |
388 |
389 |
390 |
391 |
392 |
393 |
394 |
395 |
396 |
397 |
398 |
399 |
400 |
401 |
402 | 1482227454848
403 |
404 | 1482227454848
405 |
406 |
407 |
408 |
409 |
410 |
411 |
412 |
413 |
414 |
415 |
416 |
417 |
418 |
419 |
420 |
421 |
422 |
423 |
424 |
425 |
426 |
427 |
428 |
429 |
430 |
431 |
432 |
433 |
434 |
435 |
436 |
437 |
438 |
439 |
440 |
441 |
442 |
443 |
444 |
445 |
446 |
447 |
448 |
449 |
450 |
451 |
452 |
453 |
454 |
455 |
456 |
457 |
458 |
459 |
460 |
461 |
462 |
463 |
464 |
465 |
466 |
467 |
468 |
469 |
470 |
471 |
472 |
473 |
474 |
475 |
476 |
477 |
478 |
479 |
480 |
481 |
482 |
483 |
484 |
485 |
486 |
487 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # python_collect_domain
2 | python无限爬取URL,渗透必备
3 | 需要 threadpool、requests、lxml 模块,请自行 pip install(脚本基于 Python 2 编写)。
4 |
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sml2h3/python_collect_domain/28d2b07d3b5e5991ccac47b8da3a080e27db3abe/__init__.py
--------------------------------------------------------------------------------
/collect.py:
--------------------------------------------------------------------------------
1 | #coding=utf-8
2 | #author:sml2h3
3 | #created:2016.12.20
4 | #email:sml2h3@gmail.com
5 | import requests
6 | import threadpool
7 | from lxml import etree
8 | from urlparse import *
9 | import sys
10 | from logger import Logger
11 |
12 | logger = Logger('collect.py')
13 |
14 |
def get_url(url):
    """Download ``url`` and return the not-yet-seen site roots it links to.

    The page is fetched with a desktop-browser user agent, TLS certificate
    verification disabled and a 3 second timeout. Every ``href`` attribute
    in the document is collected and handed to ``deal_url``.

    Returns an empty list when the request fails or the page cannot be
    turned into a DOM; both failures are logged rather than raised.
    """
    request_headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36',
    }

    try:
        response = requests.get(url, verify=False, timeout=3, headers=request_headers)
        html = response.text
    except requests.RequestException as err:
        logger.error(err)
        return []

    try:
        # Any element carrying an href counts, not only <a> tags.
        hrefs = etree.HTML(html).xpath("//*[@href]/@href")
    except Exception as err:
        logger.error("dom化失败:{}".format(err))
        return []

    return deal_url(hrefs)
36 |
37 |
def deal_url(urls):
    """Reduce raw ``href`` values to new, de-duplicated ``scheme://host`` roots.

    Each href is parsed; entries without a network location (relative
    paths, fragments, ``mailto:``/``javascript:`` links) are skipped. The
    scheme is kept when it is http, https or ftp and defaults to http
    otherwise (for example protocol-relative ``//host/path`` links).

    Roots already present in the module-level ``urlArr`` list are skipped;
    new ones are appended to ``urlArr`` as a side effect so later calls do
    not return them again.

    :param urls: iterable of href strings (may be empty or None).
    :return: list of newly discovered roots, in first-seen order.
    """
    res_list = []

    if not urls:
        return res_list

    for href in urls:
        try:
            parts = urlparse(href)
        except ValueError:
            # urlparse rejects e.g. unbalanced IPv6 brackets; one bad href
            # must not discard every other link found on the page.
            continue

        # Build the root from the space-stripped host so the value that is
        # validated is also the value that is stored and returned.
        domain = parts.netloc.replace(" ", '')
        if not domain:
            continue

        if parts.scheme in ("http", "https", "ftp"):
            scheme = parts.scheme
        else:
            scheme = "http"
        u = scheme + "://" + domain

        if u in urlArr:
            continue
        urlArr.append(u)
        res_list.append(u)

    return res_list
64 |
65 |
def con(request, result):
    """Thread-pool callback: record crawled roots and queue them for crawling.

    ``result`` is the list returned by ``get_url`` for ``request``. Each
    entry is counted in the global ``allget``, appended as one line to the
    open output file ``f`` and logged; afterwards every entry is submitted
    to ``pool`` as a new ``get_url`` job that reports back to this callback.
    """
    global allget

    for found in result:
        allget += 1
        f.write(found + '\n')
        logger.info("当前已爬取" + str(allget) + "个Url:" + found)

    follow_ups = threadpool.makeRequests(get_url, result, con)
    for work in follow_ups:
        pool.putRequest(work)
74 |
75 |
if __name__ == '__main__':
    # Entry point: crawl outward from a seed page, appending every newly
    # discovered site root to url.txt until the work queue drains.

    # Python 2 only: force the implicit str/unicode conversions to UTF-8.
    reload(sys)
    sys.setdefaultencoding('utf8')

    urlArr = []   # every root seen so far (read and extended by deal_url)
    allget = 0    # number of roots written so far (updated by con)

    # Redirect stderr to a file for the duration of the crawl.
    f2 = open('error_file.txt', 'w')
    sys.stderr = f2
    try:
        f = open("url.txt", "a+")
        try:
            data = get_url("http://www.baidu.com")
            pool = threadpool.ThreadPool(20)
            reqrest = threadpool.makeRequests(get_url, data, con)
            for req in reqrest:
                pool.putRequest(req)
            pool.wait()
        finally:
            # Close the output file even if the crawl is interrupted.
            f.close()
    finally:
        # Restore the real stderr before closing its replacement so that
        # late error output is not written to a closed file.
        sys.stderr = sys.__stderr__
        f2.close()
92 |
--------------------------------------------------------------------------------
/logger.py:
--------------------------------------------------------------------------------
1 | # -*-coding:utf-8 -*-
2 |
3 | import logging
4 |
5 |
class Logger(object):
    """Thin wrapper around a named ``logging`` logger.

    Messages at DEBUG level and above are written both to a log file and
    to the console stream, using the format
    ``<asctime>-[ <name> ]-<LEVEL>: <message>``.
    """

    def __init__(self, name, log_file='/tmp/test.log'):
        """Configure the logger called ``name``.

        :param name: logger name passed to ``logging.getLogger``.
        :param log_file: path of the file that receives log records;
            defaults to the previously hard-coded ``/tmp/test.log``.

        ``logging.getLogger`` returns the same object for the same name, so
        handlers are attached only if the logger has none yet. Otherwise
        each additional ``Logger(name)`` would add another pair of handlers
        and every message would be emitted multiple times. As a result,
        ``log_file`` is ignored for a logger that is already configured.
        """
        self.logger = logging.getLogger(name)
        self.logger.setLevel(logging.DEBUG)

        if self.logger.handlers:
            return

        # Shared output format for both handlers.
        formatter = logging.Formatter('%(asctime)s-[ %(name)s ]-%(levelname)s: %(message)s')

        # One handler writes to the log file, the other to the console.
        fh = logging.FileHandler(log_file)
        ch = logging.StreamHandler()
        for handler in (fh, ch):
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)

    def error(self, msg):
        """Log ``msg`` at ERROR level."""
        self.logger.error(msg)

    def info(self, msg):
        """Log ``msg`` at INFO level."""
        self.logger.info(msg)

    def warning(self, msg):
        """Log ``msg`` at WARNING level."""
        self.logger.warning(msg)

    def debug(self, msg):
        """Log ``msg`` at DEBUG level."""
        self.logger.debug(msg)
37 |
38 |
if __name__ == "__main__":
    # Manual smoke test: emit one message at each supported level.
    logger = Logger("test")

    samples = (
        (logger.debug, 'logger5 debug message'),
        (logger.info, 'logger5 info message'),
        (logger.warning, 'logger5 warning message'),
        (logger.error, 'logger5 error message'),
    )
    for emit, text in samples:
        emit(text)
46 |
--------------------------------------------------------------------------------