├── requirements.txt
├── README.md
└── sitecopy.py

/requirements.txt:
--------------------------------------------------------------------------------
1 | requests
2 | beautifulsoup4
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # SiteCopy
2 | 
3 | SiteCopy is a tool that makes it easy to back up personal websites and collect material from the web.
4 | 
5 | ## Introduction
6 | 
7 | Website copying, also called website mirroring or backup, means using a tool to save everything a web page contains: not just the HTML page itself, but also the CSS, JavaScript, and static files referenced in its source, so the whole site can be browsed offline. Similar tools exist, but none of them worked well for me, so I wrote this Python script to make it easier to back up my own sites and to collect material from the web.
8 | 
9 | - Tool name: SiteCopy
10 | - Author: Threezh1
11 | - Blog: http://www.threezh1.com/
12 | 
13 | Development notes for SiteCopy (in Chinese): [How to elegantly copy every page of a website](https://xz.aliyun.com/t/6941)
14 | 
15 | Disclaimer: copying any website on the Internet requires the owner's authorization. Users are solely responsible for any action that endangers network security; the author accepts no liability.
16 | 
17 | ## Usage
18 | 
19 | Python version: 3.7
20 | 
21 | Install the dependencies: `pip3 install -r requirements.txt`
22 | 
23 | - Copy a single page
24 | 
25 | `python sitecopy.py -u "http://www.threezh1.com"`
26 | 
27 | - Copy an entire website
28 | 
29 | `python sitecopy.py -u "http://www.threezh1.com" -e`
30 | 
31 | - Copy multiple pages
32 | 
33 | `python sitecopy.py -s "site.txt"` (site.txt holds one URL per line)
34 | 
35 | - Copy multiple entire websites
36 | 
37 | `python sitecopy.py -s "site.txt" -e`
38 | 
39 | 
40 | Set the number of link-collection passes: -d (default: 200)
41 | 
42 | Set the number of threads: -t (default: 30)
43 | 
44 | Example: crawl every page of www.threezh1.com with 200 link-collection passes and 30 threads
45 | 
46 | `python sitecopy.py -u "http://www.threezh1.com" -e -d 200 -t 30`
47 | 
48 | ## Copy test
49 | 
50 | - Copying my own blog: https://threezh1.com (time taken: 2 minutes 48 seconds)
51 | 
52 | Screenshot of the run:
53 | 
54 | ![pic_11.jpg](https://s2.ax1x.com/2019/12/12/QcnOp9.jpg)
55 | 
56 | Screenshot of the output directory:
57 | 
58 | ![pic_07.jpg](https://i.loli.net/2019/12/12/MRmv4licZCb5OzD.jpg)
59 | 
60 | Screenshot of a saved page:
61 | 
62 | ![pic_06.jpg](https://i.loli.net/2019/12/12/4ydL371zCEiVJnZ.jpg)
63 | 
64 | 
65 | ## Known issues
66 | 
67 | 1. In some cases a path is replaced more than once, which breaks the saved page
68 | 2. Sites or image hosts with anti-crawling measures cannot be saved correctly
69 | 3. Network problems can keep the script from running to completion
70 | 
71 | I would be glad to discuss solutions to these problems with anyone interested. My email: makefoxm@qq.com
72 | 
--------------------------------------------------------------------------------
/sitecopy.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 | # Author: Threezh1
4 | # Blog : http://www.threezh1.com/
5 | # Github: https://github.com/Threezh1
6 | 
7 | import requests, urllib, os, asyncio, functools, argparse, sys
8 | from pathlib import Path
9 | from bs4 import BeautifulSoup
10 | from requests.packages import urllib3
11 | from requests.adapters import HTTPAdapter
12 | 
13 | Welcome = """
14 | .▄▄ · ▪ ▄▄▄▄▄▄▄▄ . ▄▄· ▄▄▄· ▄· ▄▌
15 | ▐█ ▀. ██ •██ ▀▄.▀·▐█ ▌▪▪ ▐█ ▄█▐█▪██▌
16 | ▄▀▀▀█▄▐█· ▐█.▪▐▀▀▪▄██ ▄▄ ▄█▀▄ ██▀·▐█▌▐█▪
17 | ▐█▄▪▐█▐█▌ ▐█▌·▐█▄▄▌▐███▌▐█▌.▐▌▐█▪·• ▐█▀·.
18 | ▀▀▀▀ ▀▀▀ ▀▀▀ ▀▀▀ ·▀▀▀ ▀█▄▀▪.▀ ▀ • """ 19 | 20 | Information = r""" 21 | Author: Threezh1 22 | Blog: http://www.threezh1.com/ 23 | Version: 1.0""" 24 | 25 | Help = r""" 26 | Uage: README.md 27 | Stop Copy: Ctrl + C 28 | """ 29 | 30 | urllib3.disable_warnings() 31 | header = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36",} 32 | 33 | def parse_args(): 34 | parser = argparse.ArgumentParser(epilog='\tExample: \r\npython ' + sys.argv[0] + " -u http://www.baidu.com") 35 | parser.add_argument("-u", "--url", help="The address where you want to get the source code") 36 | parser.add_argument("-s", "--urls",help="Download multiple urls") 37 | parser.add_argument("-d", "--depth",help="Number of loops to get links") 38 | parser.add_argument("-t", "--threads",help="Number of threads for task execution") 39 | parser.add_argument("-e", "--entire",help="Download entire website", action="store_true") 40 | return parser.parse_args() 41 | 42 | # Get the page source 43 | def ExtractContent(url): 44 | try: 45 | raw = requests.get(url, headers = header, timeout=10, allow_redirects=True, verify=False) 46 | raw = raw.content 47 | if raw != "": 48 | return raw 49 | except Exception as e: 50 | print("[error] - " + url) 51 | #print(e) 52 | return None 53 | 54 | def Md5Encrypt(text): 55 | import hashlib 56 | hl = hashlib.md5() 57 | hl.update(text.encode(encoding='utf-8')) 58 | return hl.hexdigest() 59 | 60 | def GetUrlPart(url, part = ""): 61 | from urllib.parse import urlparse 62 | # http://www.example.com/a/b/index.php?id=1#h1 63 | # domain : www.example.com 64 | # scheme : http 65 | # path : /a/b/index.php 66 | # id=1 : id=1 67 | # fragment : h1 68 | # completepath : /a/b/ 69 | # completedomain : http://www.example.com 70 | # filename : index.php 71 | # filesuffix : php 72 | 73 | if url.startswith("http") == False: 74 | if part == "path": 75 | return url[:url.rfind("/") + 1] 76 | if part == "filename": 77 | temp = url[url.rfind("/") + 1:] 78 | if temp.find("?") != -1: 79 | temp = temp[:temp.find("?")] 80 | if temp.find("#") != -1: 81 | temp = temp[:temp.find("#")] 82 | return temp 83 | else: 84 | pass 85 | try: 86 | parsed = urlparse(url) 87 | except: 88 | return "" 89 | if part == "domain": 90 | return parsed.netloc 91 | elif part == "scheme": 92 | return parsed.scheme 93 | elif part == "path": 94 | return parsed.path 95 | elif part == "query": 96 | return parsed.query 97 | elif part == "fragment": 98 | return parsed.fragment 99 | elif part == "completepath": 100 | return parsed.path[:parsed.path.rfind("/") + 1] 101 | elif part == "completedomain": 102 | return (parsed.scheme + "://" + parsed.netloc) 103 | elif part == "filename": 104 | return parsed.path[parsed.path.rfind("/") + 1:] 105 | elif part == "filesuffix": 106 | temp = parsed.path[parsed.path.rfind("/") + 1:] 107 | if temp.find(".") == -1: return "" 108 | return temp[temp.find("."):] 109 | else: 110 | return parsed 111 | 112 | def ProcessResourcePath(pages_url, source_url): 113 | """ Handle the relationship between relative paths and absolute paths, and give replacement results and save paths """ 114 | 115 | source_download_url = "" 116 | processed_source_url = "" 117 | source_save_path = "" 118 | source_url_kind = 0 119 | 120 | relative_path = "" 121 | url_path = GetUrlPart(pages_url, "completepath") 122 | for i in range(url_path.count("/") - 1): 123 | relative_path += "../" 124 | # process others 125 | if_others = False 126 | if source_url.startswith("data:image") 
== False: 127 | # process absolute and special path 128 | if_abslote_url = False 129 | if source_url.startswith("http"): 130 | source_url_kind = 1 131 | source_download_url = source_url 132 | if_abslote_url = True 133 | elif source_url.startswith("//"): 134 | source_url_kind = 2 135 | source_download_url = GetUrlPart(pages_url, "scheme") + ":" + source_url 136 | if_abslote_url = True 137 | 138 | if_special_url = False 139 | if source_url.startswith("../"): 140 | source_url_kind = 3 141 | cleared_source_url = GetUrlPart(source_url, "filename") 142 | cleared_source_path = GetUrlPart(source_url, "path").replace("../", "") 143 | temp = url_path 144 | for i in range(source_url.count("../") + 1): 145 | temp = temp[:temp.rfind("/")] 146 | absolte_url_path = temp + "/" 147 | source_download_url = GetUrlPart(pages_url, "completedomain") + absolte_url_path + cleared_source_path + cleared_source_url 148 | temp = relative_path 149 | for i in range(source_url.count("../") + 1): 150 | temp = temp[:temp.rfind("/") + 1] 151 | processed_source_url = source_url 152 | if absolte_url_path.startswith("/"):absolte_url_path = absolte_url_path[1:] 153 | source_save_path = absolte_url_path + cleared_source_path + cleared_source_url 154 | if_special_url = True 155 | elif source_url.startswith("/") and source_url.startswith("//") == False and source_url.startswith("/./") == False: 156 | source_url_kind = 4 157 | source_download_url = GetUrlPart(pages_url, "completedomain") + source_url 158 | if relative_path == "": 159 | processed_source_url = GetUrlPart(source_url, "path")[1:] + GetUrlPart(source_url, "filename") 160 | else: 161 | processed_source_url = relative_path[:-1] + GetUrlPart(source_url, "path") + GetUrlPart(source_url, "filename") 162 | source_save_path = GetUrlPart(source_url, "path")[1:] + GetUrlPart(source_url, "filename") 163 | if_special_url = True 164 | elif source_url.startswith("/./"): 165 | source_url_kind = 5 166 | source_download_url = GetUrlPart(pages_url, "completedomain") + "/" + source_url[3:] 167 | processed_source_url = relative_path + GetUrlPart(source_url, "path")[3:] + GetUrlPart(source_url, "filename") 168 | source_save_path = GetUrlPart(source_url, "path")[3:] + GetUrlPart(source_url, "filename") 169 | if_special_url = True 170 | 171 | # process relative path 172 | if if_abslote_url == True: 173 | temp_source_name = Md5Encrypt(source_url) + GetUrlPart(source_download_url, "filesuffix") 174 | processed_source_url = relative_path + "nopathsource/" + temp_source_name 175 | source_save_path = "nopathsource/" + temp_source_name 176 | elif if_special_url == True: pass 177 | elif source_url.startswith("./"): 178 | source_url_kind = 6 179 | cleared_source_url = GetUrlPart(source_url[2:], "path") + GetUrlPart(source_url, "filename") 180 | else: 181 | source_url_kind = 7 182 | cleared_source_url = GetUrlPart(source_url, "path") + GetUrlPart(source_url, "filename") 183 | 184 | if if_abslote_url == False and if_special_url == False: 185 | source_download_url = GetUrlPart(pages_url, "completedomain") + GetUrlPart(pages_url, "completepath") + cleared_source_url 186 | processed_source_url = cleared_source_url 187 | source_save_path = url_path[1:] + cleared_source_url 188 | else: 189 | source_url_kind = 0 190 | result = { 191 | "pages_url": pages_url, 192 | "source_url": source_url, 193 | "source_download_url": source_download_url, 194 | "processed_source_url": processed_source_url, 195 | "source_save_path": source_save_path, 196 | "source_url_kind": source_url_kind 197 | } 198 | return result 199 | 
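# Illustration of what the function above returns for a root-relative resource link
# (the URLs are hypothetical and chosen only to show the mapping; the returned dict
# also echoes pages_url and source_url, omitted here):
#
#     ProcessResourcePath("http://example.com/a/b/index.html", "/static/style.css")
#     # -> "source_download_url":  "http://example.com/static/style.css"   (what gets fetched)
#     #    "processed_source_url": "../../static/style.css"                (what replaces the link in the saved HTML)
#     #    "source_save_path":     "static/style.css"                      (where the file is written locally)
#     #    "source_url_kind":      4                                       (a root-relative "/..." resource)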
200 | def IfBlackName(black_name_list, text, kind=1): 201 | # 1: equal 202 | # 2: exist 203 | # 3: startswith 204 | for temp in black_name_list: 205 | if kind == 1: 206 | if text == temp: 207 | return True 208 | if kind == 2: 209 | if text.find(temp) != -1: 210 | return True 211 | if kind == 3: 212 | if text.startswith(temp): 213 | return True 214 | return False 215 | 216 | def ExtractLinks(url, lable_name, attribute_name): 217 | single_black_names = ["/", "#"] 218 | starts_black_names = ["#", "javascript:"] 219 | html_raw = ExtractContent(url) 220 | if html_raw == None: return [] 221 | html = BeautifulSoup(html_raw.decode("utf-8", "ignore"), "html.parser") 222 | lables = html.findAll({lable_name}) 223 | old_links = [] 224 | for lable in lables: 225 | lable_attribute = lable.get(attribute_name) 226 | if lable_attribute == None or lable_attribute == "": continue 227 | lable_attribute = lable_attribute.strip() 228 | if IfBlackName(single_black_names, lable_attribute): continue 229 | if IfBlackName(starts_black_names, lable_attribute, 3): continue 230 | if lable_attribute not in old_links: 231 | old_links.append(lable_attribute) 232 | return old_links 233 | 234 | def SaveFile(file_content, file_path, utf8=False): 235 | processed_path = urllib.parse.unquote(file_path) 236 | try: 237 | path = Path(GetUrlPart(processed_path, "path")) 238 | path.mkdir(parents=True, exist_ok=True) 239 | if utf8 == False: 240 | with open(processed_path, "wb") as fobject: 241 | fobject.write(file_content) 242 | else: 243 | with open(processed_path, "w", encoding="utf-8") as fobject: 244 | fobject.write(file_content) 245 | except Exception as e: 246 | print("[error] - " + file_path) 247 | #print(e) 248 | 249 | def ProcessLink(page_url, link, if_page_url = False): 250 | temp = ProcessResourcePath(page_url, link) 251 | processed_link = temp["source_download_url"] 252 | if GetUrlPart(page_url, "domain") != GetUrlPart(processed_link, "domain"): return None 253 | if if_page_url == True: 254 | processed_link = GetUrlPart(processed_link, "completedomain") + GetUrlPart(processed_link, "path") 255 | else: 256 | temp = ProcessResourcePath(page_url, link) 257 | processed_link = temp["processed_source_url"] 258 | url_filename = GetUrlPart(processed_link, "filename") 259 | url_suffix = GetUrlPart(processed_link, "filesuffix") 260 | if url_suffix == ".html": 261 | pass 262 | elif url_filename == "": 263 | processed_link += "index.html" 264 | else: 265 | processed_link += ".html" 266 | if if_page_url == False: 267 | if processed_link.startswith("/"): 268 | processed_link = processed_link[1:] 269 | return processed_link 270 | 271 | def SaveSinglePage(page_url): 272 | domain = GetUrlPart(page_url, "domain") 273 | domain_path = domain.replace(".", "_") 274 | processed_page_url = ProcessLink("http://" + domain, page_url, True) 275 | page_save_path = "website/" + domain_path + "/" + GetUrlPart(processed_page_url, "path") 276 | if os.path.exists(page_save_path) == True: 277 | print("[Info] - " + page_url + " Downloaded") 278 | return None 279 | print("[Processing] - " + page_url) 280 | links_js = ExtractLinks(page_url, "script", "src") 281 | links_css = ExtractLinks(page_url, "link", "href") 282 | links_img = ExtractLinks(page_url, "img", "src") 283 | links_a = ExtractLinks(page_url, "a", "href") 284 | links_all = links_js + links_css + links_img 285 | page_raw = ExtractContent(page_url) 286 | if page_raw == None: return None 287 | page_raw = page_raw.decode("utf-8", "ignore") 288 | processed_links = [] 289 | for link in links_all: 290 | 
        link_info = ProcessResourcePath(page_url, link.strip())
291 |         try:
292 |             page_raw = page_raw.replace(link, link_info["processed_source_url"])
293 |         except Exception as e:
294 |             print(e)
295 |             continue
296 |         source_save_path = "website/" + domain_path + "/" + link_info["source_save_path"]
297 |         source_save_path = source_save_path.replace("\\\\", "")
298 |         if os.path.exists(source_save_path) == True: continue
299 |         source_raw = ExtractContent(link_info["source_download_url"])
300 |         #print(source_save_path)
301 |         if source_raw == None: continue
302 |         SaveFile(source_raw, source_save_path)
303 |     links = []
304 |     links_copy = []
305 |     for link_a in links_a:
306 |         processed_link = ProcessLink(page_url, link_a)
307 |         if processed_link in links_copy: continue
308 |         if processed_link == None: continue
309 |         links_copy.append(processed_link)
310 |         link_temp = {
311 |             "link": link_a,
312 |             "processed_link": processed_link
313 |         }
314 |         links.append(link_temp)
315 | 
316 |     for link in links:
317 |         if link["link"] == '/': continue
318 |         page_raw = page_raw.replace(link["link"], link["processed_link"])
319 |     SaveFile(page_raw, page_save_path, True)
320 | 
321 | def CollectUrls(page_url):
322 |     filename_black_names = [":", "?", "'", '"', "<", ">", "|"]
323 |     black_suffix_str = ".tgz|.jar|.so|.docx|.py|.js|.css|.jpg|.jpeg|.png|.gif|.bmp|.pic|.tif|.txt|.doc|.hlp|.wps|.rtf|.pdf|.rar|.zip|.gz|.arj|.z|.wav|.aif|.au|.mp3|.ram|.wma|.mmf|.amr|.aac|.flac|.avi|.mpg|.mov|.swf|.int|.sys|.dll|.adt|.exe|.com|.c|.asm|.for|.lib|.lst|.msg|.obj|.pas|.wki|.bas|.map|.bak|.tmp|.dot|.bat|.cmd|.com"
324 |     black_suffix = black_suffix_str.split("|")
325 |     links_a = ExtractLinks(page_url, "a", "href")
326 |     result = []
327 |     for link in links_a:
328 |         link_info = ProcessResourcePath(page_url, link)
329 |         processed_link = link_info["source_download_url"]
330 |         if GetUrlPart(processed_link, "domain") != GetUrlPart(page_url, "domain"): continue
331 |         if IfBlackName(filename_black_names, GetUrlPart(processed_link, "path"), 2): continue
332 |         if IfBlackName(black_suffix, GetUrlPart(processed_link, "filesuffix")): continue
333 |         processed_link = GetUrlPart(processed_link, "completedomain") + GetUrlPart(processed_link, "path")
334 |         if processed_link not in result:
335 |             result.append(processed_link)
336 |     return result
337 | 
338 | async def coroutine_execution(function, param1):
339 |     """
340 |     Run the blocking target function in a worker thread via run_in_executor.
341 |     Note: the keyword arguments given to functools.partial must match the target function's signature.
342 |     """
343 |     loop = asyncio.get_event_loop()
344 |     result = await loop.run_in_executor(None, functools.partial(function, page_url=param1))
345 |     # result is the value returned by the target function
346 |     return result
347 | 
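# How these helpers fit together: coroutine_init() below splits `parameters` into
# batches of `threads` items and runs `function` on each item in a worker thread
# through coroutine_execution() above. For example, ExtractUrls() calls it as:
#
#     results = coroutine_init(CollectUrls, parameters=not_extracted_urls, threads=threads)
#     # results[i] is the list of same-domain page links CollectUrls found on not_extracted_urls[i]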
348 | def coroutine_init(function, parameters, threads):
349 |     """
350 |     Dispatch the tasks in batches of `threads` at a time.
351 |     coroutine_execution() wraps the target function; adjust its parameters there if needed.
352 |     """
353 |     times = int(len(parameters) / threads) + 1
354 |     if len(parameters) == threads or int(len(parameters) % threads) == 0: times -= 1
355 |     result = []
356 |     for num in range(times):
357 |         tasks = []
358 |         Minimum = threads * num
359 |         Maximum = threads * (num + 1)
360 |         if num == times - 1 and len(parameters) % threads != 0:
361 |             Minimum = (times - 1) * threads
362 |             Maximum = len(parameters)
363 |         if len(parameters) <= threads:
364 |             Minimum = 0
365 |             Maximum = len(parameters)
366 |         for i in range(Minimum, Maximum):
367 |             # parameters[i] is one value taken from the parameter list; adjust as needed
368 |             future = asyncio.ensure_future(coroutine_execution(function, param1=parameters[i]))
369 |             tasks.append(future)
370 |         loop = asyncio.get_event_loop()
371 |         loop.run_until_complete(asyncio.wait(tasks))
372 |         for task in tasks:
373 |             result.append(task.result())
374 |         #print("[*] The {}th thread ends".format(str(num + 1)))
375 |     return result
376 | 
377 | def ExtractUrls(main_url, depth = 200, threads = 30):
378 |     print("[Info] - Collecting URLs for the entire website, it takes a little time...")
379 |     print("Main url: {url} \nDepth: {depth}\nThreads: {threads}".format(url=main_url, depth=depth, threads=threads))
380 |     domain = GetUrlPart(main_url, "domain")
381 |     domain_path = domain.replace(".", "_")
382 |     urls = CollectUrls(main_url)
383 |     if main_url not in urls: urls.append(main_url)
384 |     collected_urls = []
385 |     urls_count = 0
386 |     for i in range(0, depth):
387 |         print("- " + str(i + 1) + "th loop traversal in progress")
388 |         copy_urls = urls[:]
389 |         if len(copy_urls) == len(collected_urls): break
390 |         not_extracted_urls = []
391 |         for url in copy_urls:
392 |             if url not in collected_urls:
393 |                 not_extracted_urls.append(url)
394 |         results = coroutine_init(CollectUrls, parameters=not_extracted_urls, threads=threads)
395 |         collected_urls.extend(not_extracted_urls)
396 |         for result in results:
397 |             for temp_url in result:
398 |                 if temp_url not in urls:
399 |                     urls.append(temp_url.strip())
400 |         print("- Collected a total of {0} URL links in this cycle".format(len(urls) - urls_count))
401 |         urls_count = len(urls)
402 |     print("[Info] - Urls collection completed")
403 |     print("[Info] - Collected a total of {0} URLs".format(str(urls_count)))
404 |     print("\n[Info] - Getting source and resources for each page...")
405 |     results = coroutine_init(SaveSinglePage, parameters=urls, threads=threads)
406 | 
407 | if __name__ == "__main__":
408 | 
409 |     print(Welcome)
410 |     print(Information)
411 |     print(Help)
412 | 
413 |     args = parse_args()
414 |     if args.urls == None:
415 |         if args.url == None:
416 |             print("Please enter a url. \nExample: python sitecopy.py -u 'http://www.threezh1.com/'")
417 |             exit()
418 |         if args.entire == True:
419 |             depth = 200
420 |             threads = 30
421 |             if args.depth != None: depth = int(args.depth)
422 |             if args.threads != None: threads = int(args.threads)
423 |             ExtractUrls(args.url, depth, threads)
424 |         elif args.entire == False:
425 |             SaveSinglePage(args.url)
426 |             print("\n[Info] - All resources have been downloaded")
427 |     else:
428 |         with open(args.urls, "r", encoding="utf-8") as fobject:
429 |             urls = fobject.read().split("\n")
430 |         for url in urls:
431 |             if args.entire == True:
432 |                 depth = 200
433 |                 threads = 30
434 |                 if args.depth != None: depth = int(args.depth)
435 |                 if args.threads != None: threads = int(args.threads)
436 |                 ExtractUrls(url, depth, threads)
437 |             elif args.entire == False:
438 |                 SaveSinglePage(url)
--------------------------------------------------------------------------------