├── requirements.txt
├── README.md
└── sitecopy.py

/requirements.txt:
--------------------------------------------------------------------------------
1 | requests
2 | beautifulsoup4
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SiteCopy
2 | 
3 | SiteCopy is a tool that makes it easy to back up personal websites and collect material from the web.
4 | 
5 | ## Introduction
6 | 
7 | Website copying, also called website mirroring or backup, means using a tool to save everything a web page contains: not just the HTML page itself, but also the CSS, JavaScript, and static files referenced in its source, so the whole site can be browsed offline. Similar tools exist, but none of them worked well for me, so I wrote this Python script to make it easier to back up my own sites and to collect material from the web.
8 | 
9 | - Tool name: SiteCopy
10 | - Author: Threezh1
11 | - Blog: http://www.threezh1.com/
12 | 
13 | Development notes for SiteCopy (in Chinese): [How to elegantly copy every page of a website](https://xz.aliyun.com/t/6941)
14 | 
15 | Disclaimer: copying any website on the Internet requires the owner's authorization. Users are solely responsible for any action that endangers network security; the author accepts no liability.
16 | 
17 | ## Usage
18 | 
19 | Python version: 3.7
20 | 
21 | Install the dependencies: `pip3 install -r requirements.txt`
22 | 
23 | - Copy a single page
24 | 
25 | `python sitecopy.py -u "http://www.threezh1.com"`
26 | 
27 | - Copy an entire website
28 | 
29 | `python sitecopy.py -u "http://www.threezh1.com" -e`
30 | 
31 | - Copy multiple pages
32 | 
33 | `python sitecopy.py -s "site.txt"` (site.txt holds one URL per line)
34 | 
35 | - Copy multiple entire websites
36 | 
37 | `python sitecopy.py -s "site.txt" -e`
38 | 
39 | 
40 | Set the number of link-collection passes: -d (default: 200)
41 | 
42 | Set the number of threads: -t (default: 30)
43 | 
44 | Example: crawl every page of www.threezh1.com with 200 link-collection passes and 30 threads
45 | 
46 | `python sitecopy.py -u "http://www.threezh1.com" -e -d 200 -t 30`
47 | 
48 | ## Copy test
49 | 
50 | - Copying my own blog: https://threezh1.com (time taken: 2 minutes 48 seconds)
51 | 
52 | Screenshot of the run:
53 | 
54 | ![pic_11.jpg](https://s2.ax1x.com/2019/12/12/QcnOp9.jpg)
55 | 
56 | Screenshot of the output directory:
57 | 
58 | ![pic_07.jpg](https://i.loli.net/2019/12/12/MRmv4licZCb5OzD.jpg)
59 | 
60 | Screenshot of a saved page:
61 | 
62 | ![pic_06.jpg](https://i.loli.net/2019/12/12/4ydL371zCEiVJnZ.jpg)
63 | 
64 | 
65 | ## Known issues
66 | 
67 | 1. In some cases a path is replaced more than once, which breaks the saved page
68 | 2. Sites or image hosts with anti-crawling measures cannot be saved correctly
69 | 3. Network problems can keep the script from running to completion
70 | 
71 | I would be glad to discuss solutions to these problems with anyone interested. My email: makefoxm@qq.com
72 | 
--------------------------------------------------------------------------------
/sitecopy.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | # Author: Threezh1
4 | # Blog : http://www.threezh1.com/
5 | # Github: https://github.com/Threezh1
6 | 
7 | import requests, urllib, os, asyncio, functools, argparse, sys
8 | from pathlib import Path
9 | from bs4 import BeautifulSoup
10 | from requests.packages import urllib3
11 | from requests.adapters import HTTPAdapter
12 | 
13 | Welcome = """
14 | .▄▄ · ▪ ▄▄▄▄▄▄▄▄ . ▄▄· ▄▄▄· ▄· ▄▌
15 | ▐█ ▀. ██ •██ ▀▄.▀·▐█ ▌▪▪ ▐█ ▄█▐█▪██▌
16 | ▄▀▀▀█▄▐█· ▐█.▪▐▀▀▪▄██ ▄▄ ▄█▀▄ ██▀·▐█▌▐█▪
17 | ▐█▄▪▐█▐█▌ ▐█▌·▐█▄▄▌▐███▌▐█▌.▐▌▐█▪·• ▐█▀·.
18 | ▀▀▀▀ ▀▀▀ ▀▀▀ ▀▀▀ ·▀▀▀ ▀█▄▀▪.▀ ▀ • """ 19 | 20 | Information = r""" 21 | Author: Threezh1 22 | Blog: http://www.threezh1.com/ 23 | Version: 1.0""" 24 | 25 | Help = r""" 26 | Uage: README.md 27 | Stop Copy: Ctrl + C 28 | """ 29 | 30 | urllib3.disable_warnings() 31 | header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36",} 32 | 33 | def parse_args(): 34 | parser = argparse.ArgumentParser(epilog='\tExample: \r\npython ' + sys.argv[0] + " -u http://www.baidu.com") 35 | parser.add_argument("-u", "--url", help="The address where you want to get the source code") 36 | parser.add_argument("-s", "--urls",help="Download multiple urls") 37 | parser.add_argument("-d", "--depth",help="Number of loops to get links") 38 | parser.add_argument("-t", "--threads",help="Number of threads for task execution") 39 | parser.add_argument("-e", "--entire",help="Download entire website", action="store_true") 40 | return parser.parse_args() 41 | 42 | # Get the page source 43 | def ExtractContent(url): 44 | try: 45 | raw = requests.get(url, headers = header, timeout=10, allow_redirects=True, verify=False) 46 | raw = raw.content 47 | if raw != "": 48 | return raw 49 | except Exception as e: 50 | print("[error] - " + url) 51 | #print(e) 52 | return None 53 | 54 | def Md5Encrypt(text): 55 | import hashlib 56 | hl = hashlib.md5() 57 | hl.update(text.encode(encoding='utf-8')) 58 | return hl.hexdigest() 59 | 60 | def GetUrlPart(url, part = ""): 61 | from urllib.parse import urlparse 62 | # http://www.example.com/a/b/index.php?id=1#h1 63 | # domain : www.example.com 64 | # scheme : http 65 | # path : /a/b/index.php 66 | # id=1 : id=1 67 | # fragment : h1 68 | # completepath : /a/b/ 69 | # completedomain : http://www.example.com 70 | # filename : index.php 71 | # filesuffix : php 72 | 73 | if url.startswith("http") == False: 74 | if part == "path": 75 | return url[:url.rfind("/") + 1] 76 | if part == "filename": 77 | temp = url[url.rfind("/") + 1:] 78 | if temp.find("?") != -1: 79 | temp = temp[:temp.find("?")] 80 | if temp.find("#") != -1: 81 | temp = temp[:temp.find("#")] 82 | return temp 83 | else: 84 | pass 85 | try: 86 | parsed = urlparse(url) 87 | except: 88 | return "" 89 | if part == "domain": 90 | return parsed.netloc 91 | elif part == "scheme": 92 | return parsed.scheme 93 | elif part == "path": 94 | return parsed.path 95 | elif part == "query": 96 | return parsed.query 97 | elif part == "fragment": 98 | return parsed.fragment 99 | elif part == "completepath": 100 | return parsed.path[:parsed.path.rfind("/") + 1] 101 | elif part == "completedomain": 102 | return (parsed.scheme + "://" + parsed.netloc) 103 | elif part == "filename": 104 | return parsed.path[parsed.path.rfind("/") + 1:] 105 | elif part == "filesuffix": 106 | temp = parsed.path[parsed.path.rfind("/") + 1:] 107 | if temp.find(".") == -1: return "" 108 | return temp[temp.find("."):] 109 | else: 110 | return parsed 111 | 112 | def ProcessResourcePath(pages_url, source_url): 113 | """ Handle the relationship between relative paths and absolute paths, and give replacement results and save paths """ 114 | 115 | source_download_url = "" 116 | processed_source_url = "" 117 | source_save_path = "" 118 | source_url_kind = 0 119 | 120 | relative_path = "" 121 | url_path = GetUrlPart(pages_url, "completepath") 122 | for i in range(url_path.count("/") - 1): 123 | relative_path += "../" 124 | # process others 125 | if_others = False 126 | if source_url.startswith("data:image") 
== False: 127 | # process absolute and special path 128 | if_abslote_url = False 129 | if source_url.startswith("http"): 130 | source_url_kind = 1 131 | source_download_url = source_url 132 | if_abslote_url = True 133 | elif source_url.startswith("//"): 134 | source_url_kind = 2 135 | source_download_url = GetUrlPart(pages_url, "scheme") + ":" + source_url 136 | if_abslote_url = True 137 | 138 | if_special_url = False 139 | if source_url.startswith("../"): 140 | source_url_kind = 3 141 | cleared_source_url = GetUrlPart(source_url, "filename") 142 | cleared_source_path = GetUrlPart(source_url, "path").replace("../", "") 143 | temp = url_path 144 | for i in range(source_url.count("../") + 1): 145 | temp = temp[:temp.rfind("/")] 146 | absolte_url_path = temp + "/" 147 | source_download_url = GetUrlPart(pages_url, "completedomain") + absolte_url_path + cleared_source_path + cleared_source_url 148 | temp = relative_path 149 | for i in range(source_url.count("../") + 1): 150 | temp = temp[:temp.rfind("/") + 1] 151 | processed_source_url = source_url 152 | if absolte_url_path.startswith("/"):absolte_url_path = absolte_url_path[1:] 153 | source_save_path = absolte_url_path + cleared_source_path + cleared_source_url 154 | if_special_url = True 155 | elif source_url.startswith("/") and source_url.startswith("//") == False and source_url.startswith("/./") == False: 156 | source_url_kind = 4 157 | source_download_url = GetUrlPart(pages_url, "completedomain") + source_url 158 | if relative_path == "": 159 | processed_source_url = GetUrlPart(source_url, "path")[1:] + GetUrlPart(source_url, "filename") 160 | else: 161 | processed_source_url = relative_path[:-1] + GetUrlPart(source_url, "path") + GetUrlPart(source_url, "filename") 162 | source_save_path = GetUrlPart(source_url, "path")[1:] + GetUrlPart(source_url, "filename") 163 | if_special_url = True 164 | elif source_url.startswith("/./"): 165 | source_url_kind = 5 166 | source_download_url = GetUrlPart(pages_url, "completedomain") + "/" + source_url[3:] 167 | processed_source_url = relative_path + GetUrlPart(source_url, "path")[3:] + GetUrlPart(source_url, "filename") 168 | source_save_path = GetUrlPart(source_url, "path")[3:] + GetUrlPart(source_url, "filename") 169 | if_special_url = True 170 | 171 | # process relative path 172 | if if_abslote_url == True: 173 | temp_source_name = Md5Encrypt(source_url) + GetUrlPart(source_download_url, "filesuffix") 174 | processed_source_url = relative_path + "nopathsource/" + temp_source_name 175 | source_save_path = "nopathsource/" + temp_source_name 176 | elif if_special_url == True: pass 177 | elif source_url.startswith("./"): 178 | source_url_kind = 6 179 | cleared_source_url = GetUrlPart(source_url[2:], "path") + GetUrlPart(source_url, "filename") 180 | else: 181 | source_url_kind = 7 182 | cleared_source_url = GetUrlPart(source_url, "path") + GetUrlPart(source_url, "filename") 183 | 184 | if if_abslote_url == False and if_special_url == False: 185 | source_download_url = GetUrlPart(pages_url, "completedomain") + GetUrlPart(pages_url, "completepath") + cleared_source_url 186 | processed_source_url = cleared_source_url 187 | source_save_path = url_path[1:] + cleared_source_url 188 | else: 189 | source_url_kind = 0 190 | result = { 191 | "pages_url": pages_url, 192 | "source_url": source_url, 193 | "source_download_url": source_download_url, 194 | "processed_source_url": processed_source_url, 195 | "source_save_path": source_save_path, 196 | "source_url_kind": source_url_kind 197 | } 198 | return result 199 | 
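# Illustration of what the function above returns for a root-relative resource link
# (the URLs are hypothetical and chosen only to show the mapping; the returned dict
# also echoes pages_url and source_url, omitted here):
#
#     ProcessResourcePath("http://example.com/a/b/index.html", "/static/style.css")
#     # -> "source_download_url":  "http://example.com/static/style.css"   (what gets fetched)
#     #    "processed_source_url": "../../static/style.css"                (what replaces the link in the saved HTML)
#     #    "source_save_path":     "static/style.css"                      (where the file is written locally)
#     #    "source_url_kind":      4                                       (a root-relative "/..." resource)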
200 | def IfBlackName(black_name_list, text, kind=1): 201 | # 1: equal 202 | # 2: exist 203 | # 3: startswith 204 | for temp in black_name_list: 205 | if kind == 1: 206 | if text == temp: 207 | return True 208 | if kind == 2: 209 | if text.find(temp) != -1: 210 | return True 211 | if kind == 3: 212 | if text.startswith(temp): 213 | return True 214 | return False 215 | 216 | def ExtractLinks(url, lable_name, attribute_name): 217 | single_black_names = ["/", "#"] 218 | starts_black_names = ["#", "javascript:"] 219 | html_raw = ExtractContent(url) 220 | if html_raw == None: return [] 221 | html = BeautifulSoup(html_raw.decode("utf-8", "ignore"), "html.parser") 222 | lables = html.findAll({lable_name}) 223 | old_links = [] 224 | for lable in lables: 225 | lable_attribute = lable.get(attribute_name) 226 | if lable_attribute == None or lable_attribute == "": continue 227 | lable_attribute = lable_attribute.strip() 228 | if IfBlackName(single_black_names, lable_attribute): continue 229 | if IfBlackName(starts_black_names, lable_attribute, 3): continue 230 | if lable_attribute not in old_links: 231 | old_links.append(lable_attribute) 232 | return old_links 233 | 234 | def SaveFile(file_content, file_path, utf8=False): 235 | processed_path = urllib.parse.unquote(file_path) 236 | try: 237 | path = Path(GetUrlPart(processed_path, "path")) 238 | path.mkdir(parents=True, exist_ok=True) 239 | if utf8 == False: 240 | with open(processed_path, "wb") as fobject: 241 | fobject.write(file_content) 242 | else: 243 | with open(processed_path, "w", encoding="utf-8") as fobject: 244 | fobject.write(file_content) 245 | except Exception as e: 246 | print("[error] - " + file_path) 247 | #print(e) 248 | 249 | def ProcessLink(page_url, link, if_page_url = False): 250 | temp = ProcessResourcePath(page_url, link) 251 | processed_link = temp["source_download_url"] 252 | if GetUrlPart(page_url, "domain") != GetUrlPart(processed_link, "domain"): return None 253 | if if_page_url == True: 254 | processed_link = GetUrlPart(processed_link, "completedomain") + GetUrlPart(processed_link, "path") 255 | else: 256 | temp = ProcessResourcePath(page_url, link) 257 | processed_link = temp["processed_source_url"] 258 | url_filename = GetUrlPart(processed_link, "filename") 259 | url_suffix = GetUrlPart(processed_link, "filesuffix") 260 | if url_suffix == ".html": 261 | pass 262 | elif url_filename == "": 263 | processed_link += "index.html" 264 | else: 265 | processed_link += ".html" 266 | if if_page_url == False: 267 | if processed_link.startswith("/"): 268 | processed_link = processed_link[1:] 269 | return processed_link 270 | 271 | def SaveSinglePage(page_url): 272 | domain = GetUrlPart(page_url, "domain") 273 | domain_path = domain.replace(".", "_") 274 | processed_page_url = ProcessLink("http://" + domain, page_url, True) 275 | page_save_path = "website/" + domain_path + "/" + GetUrlPart(processed_page_url, "path") 276 | if os.path.exists(page_save_path) == True: 277 | print("[Info] - " + page_url + " Downloaded") 278 | return None 279 | print("[Processing] - " + page_url) 280 | links_js = ExtractLinks(page_url, "script", "src") 281 | links_css = ExtractLinks(page_url, "link", "href") 282 | links_img = ExtractLinks(page_url, "img", "src") 283 | links_a = ExtractLinks(page_url, "a", "href") 284 | links_all = links_js + links_css + links_img 285 | page_raw = ExtractContent(page_url) 286 | if page_raw == None: return None 287 | page_raw = page_raw.decode("utf-8", "ignore") 288 | processed_links = [] 289 | for link in links_all: 290 | 
        link_info = ProcessResourcePath(page_url, link.strip())
291 |         try:
292 |             page_raw = page_raw.replace(link, link_info["processed_source_url"])
293 |         except Exception as e:
294 |             print(e)
295 |             continue
296 |         source_save_path = "website/" + domain_path + "/" + link_info["source_save_path"]
297 |         source_save_path = source_save_path.replace("\\\\", "")
298 |         if os.path.exists(source_save_path) == True: continue
299 |         source_raw = ExtractContent(link_info["source_download_url"])
300 |         #print(source_save_path)
301 |         if source_raw == None: continue
302 |         SaveFile(source_raw, source_save_path)
303 |     links = []
304 |     links_copy = []
305 |     for link_a in links_a:
306 |         processed_link = ProcessLink(page_url, link_a)
307 |         if processed_link in links_copy: continue
308 |         if processed_link == None: continue
309 |         links_copy.append(processed_link)
310 |         link_temp = {
311 |             "link": link_a,
312 |             "processed_link": processed_link
313 |         }
314 |         links.append(link_temp)
315 | 
316 |     for link in links:
317 |         if link["link"] == '/': continue
318 |         page_raw = page_raw.replace(link["link"], link["processed_link"])
319 |     SaveFile(page_raw, page_save_path, True)
320 | 
321 | def CollectUrls(page_url):
322 |     filename_black_names = [":", "?", "'", '"', "<", ">", "|"]
323 |     black_suffix_str = ".tgz|.jar|.so|.docx|.py|.js|.css|.jpg|.jpeg|.png|.gif|.bmp|.pic|.tif|.txt|.doc|.hlp|.wps|.rtf|.pdf|.rar|.zip|.gz|.arj|.z|.wav|.aif|.au|.mp3|.ram|.wma|.mmf|.amr|.aac|.flac|.avi|.mpg|.mov|.swf|.int|.sys|.dll|.adt|.exe|.com|.c|.asm|.for|.lib|.lst|.msg|.obj|.pas|.wki|.bas|.map|.bak|.tmp|.dot|.bat|.cmd|.com"
324 |     black_suffix = black_suffix_str.split("|")
325 |     links_a = ExtractLinks(page_url, "a", "href")
326 |     result = []
327 |     for link in links_a:
328 |         link_info = ProcessResourcePath(page_url, link)
329 |         processed_link = link_info["source_download_url"]
330 |         if GetUrlPart(processed_link, "domain") != GetUrlPart(page_url, "domain"): continue
331 |         if IfBlackName(filename_black_names, GetUrlPart(processed_link, "path"), 2): continue
332 |         if IfBlackName(black_suffix, GetUrlPart(processed_link, "filesuffix")): continue
333 |         processed_link = GetUrlPart(processed_link, "completedomain") + GetUrlPart(processed_link, "path")
334 |         if processed_link not in result:
335 |             result.append(processed_link)
336 |     return result
337 | 
338 | async def coroutine_execution(function, param1):
339 |     """
340 |     Run the blocking target function in a worker thread via run_in_executor.
341 |     Note: the keyword arguments given to functools.partial must match the target function's signature.
342 |     """
343 |     loop = asyncio.get_event_loop()
344 |     result = await loop.run_in_executor(None, functools.partial(function, page_url=param1))
345 |     # result is the value returned by the target function
346 |     return result
347 | 
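# How these helpers fit together: coroutine_init() below splits `parameters` into
# batches of `threads` items and runs `function` on each item in a worker thread
# through coroutine_execution() above. For example, ExtractUrls() calls it as:
#
#     results = coroutine_init(CollectUrls, parameters=not_extracted_urls, threads=threads)
#     # results[i] is the list of same-domain page links CollectUrls found on not_extracted_urls[i]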
348 | def coroutine_init(function, parameters, threads):
349 |     """
350 |     Dispatch the tasks in batches of `threads` at a time.
351 |     coroutine_execution() wraps the target function; adjust its parameters there if needed.
352 |     """
353 |     times = int(len(parameters) / threads) + 1
354 |     if len(parameters) == threads or int(len(parameters) % threads) == 0: times -= 1
355 |     result = []
356 |     for num in range(times):
357 |         tasks = []
358 |         Minimum = threads * num
359 |         Maximum = threads * (num + 1)
360 |         if num == times - 1 and len(parameters) % threads != 0:
361 |             Minimum = (times - 1) * threads
362 |             Maximum = len(parameters)
363 |         if len(parameters) <= threads:
364 |             Minimum = 0
365 |             Maximum = len(parameters)
366 |         for i in range(Minimum, Maximum):
367 |             # parameters[i] is one value taken from the parameter list; adjust as needed
368 |             future = asyncio.ensure_future(coroutine_execution(function, param1=parameters[i]))
369 |             tasks.append(future)
370 |         loop = asyncio.get_event_loop()
371 |         loop.run_until_complete(asyncio.wait(tasks))
372 |         for task in tasks:
373 |             result.append(task.result())
374 |         #print("[*] The {}th thread ends".format(str(num + 1)))
375 |     return result
376 | 
377 | def ExtractUrls(main_url, depth = 200, threads = 30):
378 |     print("[Info] - Collecting URLs for the entire website, it takes a little time...")
379 |     print("Main url: {url} \nDepth: {depth}\nThreads: {threads}".format(url=main_url, depth=depth, threads=threads))
380 |     domain = GetUrlPart(main_url, "domain")
381 |     domain_path = domain.replace(".", "_")
382 |     urls = CollectUrls(main_url)
383 |     if main_url not in urls: urls.append(main_url)
384 |     collected_urls = []
385 |     urls_count = 0
386 |     for i in range(0, depth):
387 |         print("- " + str(i + 1) + "th loop traversal in progress")
388 |         copy_urls = urls[:]
389 |         if len(copy_urls) == len(collected_urls): break
390 |         not_extracted_urls = []
391 |         for url in copy_urls:
392 |             if url not in collected_urls:
393 |                 not_extracted_urls.append(url)
394 |         results = coroutine_init(CollectUrls, parameters=not_extracted_urls, threads=threads)
395 |         collected_urls.extend(not_extracted_urls)
396 |         for result in results:
397 |             for temp_url in result:
398 |                 if temp_url not in urls:
399 |                     urls.append(temp_url.strip())
400 |         print("- Collected a total of {0} URL links in this cycle".format(len(urls) - urls_count))
401 |         urls_count = len(urls)
402 |     print("[Info] - Urls collection completed")
403 |     print("[Info] - Collected a total of {0} URLs".format(str(urls_count)))
404 |     print("\n[Info] - Getting source and resources for each page...")
405 |     results = coroutine_init(SaveSinglePage, parameters=urls, threads=threads)
406 | 
407 | if __name__ == "__main__":
408 | 
409 |     print(Welcome)
410 |     print(Information)
411 |     print(Help)
412 | 
413 |     args = parse_args()
414 |     if args.urls == None:
415 |         if args.url == None:
416 |             print("Please enter a url. \nExample: python sitecopy.py -u 'http://www.threezh1.com/'")
417 |             exit()
418 |         if args.entire == True:
419 |             depth = 200
420 |             threads = 30
421 |             if args.depth != None: depth = int(args.depth)
422 |             if args.threads != None: threads = int(args.threads)
423 |             ExtractUrls(args.url, depth, threads)
424 |         elif args.entire == False:
425 |             SaveSinglePage(args.url)
426 |             print("\n[Info] - All resources have been downloaded")
427 |     else:
428 |         with open(args.urls, "r", encoding="utf-8") as fobject:
429 |             urls = fobject.read().split("\n")
430 |         for url in urls:
431 |             if args.entire == True:
432 |                 depth = 200
433 |                 threads = 30
434 |                 if args.depth != None: depth = int(args.depth)
435 |                 if args.threads != None: threads = int(args.threads)
436 |                 ExtractUrls(url, depth, threads)
437 |             elif args.entire == False:
438 |                 SaveSinglePage(url)
--------------------------------------------------------------------------------