├── .gitignore ├── CSDNBlogBackup.py ├── README.md └── screenshots ├── blog.png ├── download.png ├── downloaded.jpg └── index.png /.gitignore: -------------------------------------------------------------------------------- 1 | CSDN-lanbing510/ -------------------------------------------------------------------------------- /CSDNBlogBackup.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Thu Dec 03 15:06:27 2015 4 | 5 | @author: 冰蓝 6 | """ 7 | import re 8 | import os 9 | import sys 10 | import chilkat 11 | 12 | 13 | head_string=""" 14 | 15 | 16 | Evernote Export 17 | 18 | 19 | 20 | 26 | 27 | 28 | """ 29 | tail_string=""" 30 | 31 | 32 | """ 33 | 34 | iter_count=0 35 | 36 | 37 | def extractBlogLists(user_name='lanbing510',loop_times=1000): 38 | url="http://blog.csdn.net/%s/" % user_name 39 | spider=chilkat.CkSpider() 40 | spider.Initialize(url) 41 | pattern=user_name+'/article/details' 42 | file_path='URList-'+user_name+'.txt' 43 | f=open(file_path,'w') 44 | url_count=0 45 | for i in range(0,loop_times): 46 | success = spider.CrawlNext() 47 | if (success == True): 48 | url=spider.lastUrl() 49 | m=re.search(pattern,url) 50 | if not m: 51 | continue 52 | url_count+=1 53 | print url_count 54 | print url 55 | title=spider.lastHtmlTitle().split(' -')[0] 56 | title=title.replace('/',' ') #标题中有特殊符号时的处理 57 | title=title.replace('_',' ') 58 | title=title.replace(':',' ') 59 | title=title.replace('*',' ') 60 | title=title.replace('?',' ') 61 | title=title.replace('|',' ') 62 | title=title.replace('#','sharp') 63 | f.write(url+","+title+'\n') 64 | #Print The HTML META title 65 | #print(spider.lastHtmlTitle().decode('gbk')) 66 | else: 67 | #Did we get an error or are there no more URLs to crawl? 68 | if (spider.get_NumUnspidered() == 0): 69 | print "No more URLs to spider" 70 | else: 71 | print spider.lastErrorText() 72 | #Sleep 1 second before spidering the next URL. 73 | spider.SleepMs(1000) 74 | f.close() 75 | #对生产的文件进行备份 76 | open('URList-'+user_name+'-backup.txt', "w").write(open(file_path, "r").read()) 77 | 78 | 79 | def downloadBlogLists(user_name='lanbing510'): 80 | global iter_count 81 | mht = chilkat.CkMht() 82 | success = mht.UnlockComponent("Anything for 30-day trial") 83 | if (success != True): 84 | print(mht.lastErrorText()) 85 | sys.exit() 86 | 87 | file_path='URList-'+user_name+'.txt' 88 | f=open(file_path,'r') 89 | fout=open('Error.txt','w') 90 | 91 | for line in f.readlines(): 92 | m=re.search('(http.+[0-9]{7,}),(.+)',line) 93 | url=m.group(1) 94 | title=m.group(2) 95 | mht_doc = mht.getMHT(url) 96 | if (mht_doc == None ): 97 | print(mht.lastErrorText()) 98 | sys.exit() 99 | 100 | if not os.path.exists('CSDN-'+user_name): 101 | os.mkdir('CSDN-'+user_name) 102 | #Now extract the HTML and embedded objects: 103 | unpack_dir = "./CSDN-"+user_name+'/' 104 | html_filename = title+".html" 105 | parts_subdir = title 106 | success = mht.UnpackMHTString(mht_doc,unpack_dir,html_filename,parts_subdir) 107 | if (success != True): 108 | #print(mht.lastErrorText()) 109 | fout.write(line) 110 | else: 111 | print("Successfully Downloaded "+title.decode('gbk')) 112 | f.close() 113 | fout.close() 114 | if iter_count>=5: 115 | print u"Some Blogs May Not Be Downloaded Successfully, Pleace Make Sure By Checking Error.txt And Index.html." 116 | os.remove(file_path) 117 | os.rename('URList-'+user_name+'-backup.txt',file_path) 118 | if iter_count<10 and os.path.getsize('Error.txt')>0: 119 | iter_count+=1 120 | print u"进行第 "+str(iter_count)+u" 次迭代下载" 121 | os.remove(file_path) 122 | os.rename('Error.txt',file_path) 123 | downloadBlogLists(user_name) 124 | 125 | 126 | def generateIndex(user_name='lanbing510'): 127 | file_path='URList-'+user_name+'.txt' 128 | f=open(file_path,'r') 129 | fout=open('./CSDN-'+user_name+'/Index.html','w') 130 | fout.write(head_string) 131 | fout.write("""

"""+user_name+"的博客"+"""

\n""") 132 | fout.write("""
    \n""") 133 | for line in f.readlines(): 134 | m=re.search('(http.+[0-9]{7,}),(.+)',line) 135 | title=m.group(2) 136 | title=title.decode('gbk').encode('utf-8') 137 | print title 138 | fout.write("""
  1. """+title+"""
  2. \n""") 139 | fout.write("""
""") 140 | fout.write(tail_string) 141 | f.close() 142 | fout.close() 143 | 144 | 145 | if __name__=='__main__': 146 | print "Please Input The Username Of Your CSDN Blog" 147 | user_name=raw_input() 148 | print "Start Extracting Blog List..." 149 | extractBlogLists(user_name) 150 | print "Start Downloading Blog List..." 151 | downloadBlogLists(user_name) 152 | print "Start Generating Index.html..." 153 | generateIndex(user_name) 154 | print "Done" 155 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Python实现CSDN博客的完美备份 2 | 3 | ## 出发点 4 | 5 | 之所以造这个轮子无非是现有的轮子不好使,CSDN官网是推出的博客备份在系统中读不到博客数据,打开后还会闪退,其他人写的工具,要么是收费,要么只是对网页的下载,不能完整的下载网页中嵌入的图片等各种资源。 6 | 7 | 于是自己花几个小时写了这个工具,其特点是可以做到CSDN博客的完美备份,下载整个网页,包括网页中的图片,css,js等,可以做到博客的完整备份。 8 | 9 | 10 | ## 功能 11 | 12 | 1 CSDN博客的完美备份; 13 | 14 | 2 下载整个博客网页,包括图片,css,js等各种资源; 15 | 16 | 3 生成Index.html方便对本地博客的浏览; 17 | 18 | 4 完全免费,开源。 19 | 20 | 21 | ## 效果截图 22 | 23 | 利用整个脚本已经把自己的博客做了完整备份,一些截图如下: 24 | 25 | 26 | 这是下载完成后文件夹里的部分内容,所有网页对应的图片等资源都放到了同名文件夹中。 27 | 28 | 29 | 30 | 31 | 这是Inde下.html索引文件: 32 | 33 | 34 | 35 | 36 | 这是部分博客内容的展示,其对图片和公式都能非常好的支持,即便公式是用mathjax写的(因为下载了网页需要的所有资源,包括js)。 37 | 38 | 39 | 40 | 41 | 42 | ## 运行环境 43 | 44 | 1 python; 45 | 46 | 2 python的chilkat库。 47 | 48 | ## 使用说明 49 | 50 | 使用时直接双击CSDNBlogBackup.py,输入你要备份的CSDN的用户名,等待下载完成即可。 51 | 52 | ## 注 53 | 54 | 可以自己各种DIY做各种其他博客的备份。Enjoy It! 55 | 56 | -------------------------------------------------------------------------------- /screenshots/blog.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lanbing510/CSDNBlogBackup/c0646e1151c3a5259c3bf8601a3d571db6013b2b/screenshots/blog.png -------------------------------------------------------------------------------- /screenshots/download.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lanbing510/CSDNBlogBackup/c0646e1151c3a5259c3bf8601a3d571db6013b2b/screenshots/download.png -------------------------------------------------------------------------------- /screenshots/downloaded.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lanbing510/CSDNBlogBackup/c0646e1151c3a5259c3bf8601a3d571db6013b2b/screenshots/downloaded.jpg -------------------------------------------------------------------------------- /screenshots/index.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lanbing510/CSDNBlogBackup/c0646e1151c3a5259c3bf8601a3d571db6013b2b/screenshots/index.png --------------------------------------------------------------------------------