├── .gitignore
├── CSDNBlogBackup.py
├── README.md
└── screenshots
├── blog.png
├── download.png
├── downloaded.jpg
└── index.png
/.gitignore:
--------------------------------------------------------------------------------
1 | CSDN-lanbing510/
--------------------------------------------------------------------------------
/CSDNBlogBackup.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Thu Dec 03 15:06:27 2015
4 |
5 | @author: 冰蓝
6 | """
7 | import re
8 | import os
9 | import sys
10 | import chilkat
11 |
12 |
# Text written at the very top of the generated Index.html (see generateIndex).
# NOTE(review): in this view the literal holds only blank lines and the text
# "Evernote Export"; any HTML markup it originally carried may be missing --
# confirm against the upstream file before relying on it.
head_string="""


Evernote Export






"""
# Text written at the very end of the generated Index.html (see generateIndex).
tail_string="""


"""

# Number of retry passes started by downloadBlogLists; that function
# increments it through a ``global`` declaration.
iter_count=0
35 |
36 |
def extractBlogLists(user_name='lanbing510',loop_times=1000):
    """Crawl a CSDN blog and record the URL and title of every article found.

    Starting from ``http://blog.csdn.net/<user_name>/`` the Chilkat spider is
    advanced at most ``loop_times`` times.  Every crawled URL containing
    ``<user_name>/article/details`` is written as a ``url,title`` line to
    ``URList-<user_name>.txt``.  When crawling ends, the list is copied to
    ``URList-<user_name>-backup.txt``.

    Args:
        user_name: CSDN account name whose blog is crawled.
        loop_times: Upper bound on the number of ``CrawlNext`` calls.
    """
    url = "http://blog.csdn.net/%s/" % user_name
    spider = chilkat.CkSpider()
    spider.Initialize(url)
    # Plain substring test: the user name is data, not a regular expression.
    marker = user_name + '/article/details'
    file_path = 'URList-' + user_name + '.txt'
    url_count = 0

    # ``with`` guarantees the list is flushed and closed even if the crawl raises.
    with open(file_path, 'w') as f:
        for _ in range(0, loop_times):
            if spider.CrawlNext():
                url = spider.lastUrl()
                if marker in url:
                    url_count += 1
                    print(url_count)
                    print(url)
                    title = spider.lastHtmlTitle().split(' -')[0]
                    # The title is later used as a file/directory name, so
                    # replace the special symbols it may contain.
                    for symbol in '/_:*?|':
                        title = title.replace(symbol, ' ')
                    title = title.replace('#', 'sharp')
                    f.write(url + "," + title + '\n')
            else:
                # Did we get an error or are there no more URLs to crawl?
                if spider.get_NumUnspidered() == 0:
                    print("No more URLs to spider")
                    # Nothing is left in the queue, so further iterations
                    # would only repeat this message and sleep.
                    break
                print(spider.lastErrorText())
            # Sleep 1 second before spidering the next URL.
            spider.SleepMs(1000)

    # Back up the generated list, closing both handles deterministically.
    with open(file_path, 'r') as src:
        contents = src.read()
    with open('URList-' + user_name + '-backup.txt', 'w') as dst:
        dst.write(contents)
77 |
78 |
def downloadBlogLists(user_name='lanbing510'):
    """Download every article listed in ``URList-<user_name>.txt``.

    Each ``url,title`` line is fetched as MHT through Chilkat and unpacked to
    ``./CSDN-<user_name>/<title>.html`` with its embedded resources in a
    sub-directory named ``<title>``.  Lines that fail to download or unpack
    are retried in further passes, up to 10 retries.  After every pass the
    lines that are still failing are written to ``Error.txt``.

    The URL list file itself is only read, never rewritten, so it still holds
    the complete list when ``generateIndex`` runs afterwards.

    The module-level ``iter_count`` is reset on entry and holds the number of
    retry passes performed when the function returns.

    Args:
        user_name: CSDN account name; selects the list file and output folder.

    Exits the process via ``sys.exit()`` if the Chilkat MHT component cannot
    be unlocked.
    """
    global iter_count
    max_retries = 10

    mht = chilkat.CkMht()
    if not mht.UnlockComponent("Anything for 30-day trial"):
        print(mht.lastErrorText())
        sys.exit()

    file_path = 'URList-' + user_name + '.txt'
    with open(file_path, 'r') as f:
        pending = f.readlines()

    # Create the output directory once instead of checking it for every line.
    if not os.path.exists('CSDN-' + user_name):
        os.mkdir('CSDN-' + user_name)
    unpack_dir = "./CSDN-" + user_name + '/'

    iter_count = 0
    while True:
        failed = []
        for line in pending:
            m = re.search('(http.+[0-9]{7,}),(.+)', line)
            if not m:
                # Blank or malformed line: nothing to download.
                continue
            url = m.group(1)
            title = m.group(2)
            mht_doc = mht.getMHT(url)
            if mht_doc is None:
                # A failed fetch is queued for retry rather than aborting
                # the whole backup.
                print(mht.lastErrorText())
                failed.append(line)
                continue
            # Now extract the HTML and embedded objects:
            html_filename = title + ".html"
            parts_subdir = title
            if mht.UnpackMHTString(mht_doc, unpack_dir, html_filename, parts_subdir):
                print("Successfully Downloaded " + title.decode('gbk'))
            else:
                failed.append(line)

        # Keep Error.txt in sync with what is still outstanding.
        with open('Error.txt', 'w') as fout:
            fout.writelines(failed)

        if not failed or iter_count >= max_retries:
            break
        iter_count += 1
        print(u"进行第 "+str(iter_count)+u" 次迭代下载")
        pending = failed

    if failed:
        print(u"Some Blogs May Not Be Downloaded Successfully, Pleace Make Sure By Checking Error.txt And Index.html.")
124 |
125 |
def generateIndex(user_name='lanbing510'):
    """Write ``./CSDN-<user_name>/Index.html`` from ``URList-<user_name>.txt``.

    The output is ``head_string``, a heading built from the user name, one
    entry per ``url,title`` line of the list file, and ``tail_string``.
    Each title is re-encoded from GBK to UTF-8 and echoed to stdout.
    Lines that do not match the ``url,title`` pattern are skipped.

    Args:
        user_name: CSDN account name; selects the list file and output folder.
    """
    file_path = 'URList-' + user_name + '.txt'
    index_path = './CSDN-' + user_name + '/Index.html'
    # NOTE(review): the string literals written below are reproduced exactly
    # as they appear in this view; they look like HTML whose tags are missing
    # -- confirm against the upstream file.
    with open(file_path, 'r') as f:
        with open(index_path, 'w') as fout:
            fout.write(head_string)
            fout.write(""""""+user_name+"的博客"+"""
\n""")
            fout.write("""\n""")
            for line in f:
                m = re.search('(http.+[0-9]{7,}),(.+)', line)
                if not m:
                    # Blank or malformed line: no title to list.
                    continue
                title = m.group(2).decode('gbk').encode('utf-8')
                print(title)
                fout.write("""- """+title+"""
\n""")
            fout.write("""
""")
            fout.write(tail_string)
143 |
144 |
if __name__ == '__main__':
    # Ask for the account name, then run the three stages in order,
    # announcing each one before it starts.
    print("Please Input The Username Of Your CSDN Blog")
    user_name = raw_input()
    stages = (
        ("Start Extracting Blog List...", extractBlogLists),
        ("Start Downloading Blog List...", downloadBlogLists),
        ("Start Generating Index.html...", generateIndex),
    )
    for banner, stage in stages:
        print(banner)
        stage(user_name)
    print("Done")
155 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Python实现CSDN博客的完美备份
2 |
3 | ## 出发点
4 |
5 | 之所以造这个轮子,无非是因为现有的轮子不好使:CSDN官方推出的博客备份工具在系统中读不到博客数据,打开后还会闪退;其他人写的工具,要么收费,要么只是对网页的下载,不能完整地下载网页中嵌入的图片等各种资源。
6 |
7 | 于是自己花几个小时写了这个工具,其特点是可以做到CSDN博客的完美备份,下载整个网页,包括网页中的图片,css,js等,可以做到博客的完整备份。
8 |
9 |
10 | ## 功能
11 |
12 | 1 CSDN博客的完美备份;
13 |
14 | 2 下载整个博客网页,包括图片,css,js等各种资源;
15 |
16 | 3 生成Index.html方便对本地博客的浏览;
17 |
18 | 4 完全免费,开源。
19 |
20 |
21 | ## 效果截图
22 |
23 | 利用这个脚本已经把自己的博客做了完整备份,一些截图如下:
24 |
25 |
26 | 这是下载完成后文件夹里的部分内容,所有网页对应的图片等资源都放到了同名文件夹中。
27 |
28 |
29 |
30 |
31 | 这是Index.html索引文件:
32 |
33 |
34 |
35 |
36 | 这是部分博客内容的展示,其对图片和公式都能非常好的支持,即便公式是用mathjax写的(因为下载了网页需要的所有资源,包括js)。
37 |
38 |
39 |
40 |
41 |
42 | ## 运行环境
43 |
44 | 1 python;
45 |
46 | 2 python的chilkat库。
47 |
48 | ## 使用说明
49 |
50 | 使用时直接双击CSDNBlogBackup.py,输入你要备份的CSDN的用户名,等待下载完成即可。
51 |
52 | ## 注
53 |
54 | 可以自己各种DIY做各种其他博客的备份。Enjoy It!
55 |
56 |
--------------------------------------------------------------------------------
/screenshots/blog.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lanbing510/CSDNBlogBackup/c0646e1151c3a5259c3bf8601a3d571db6013b2b/screenshots/blog.png
--------------------------------------------------------------------------------
/screenshots/download.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lanbing510/CSDNBlogBackup/c0646e1151c3a5259c3bf8601a3d571db6013b2b/screenshots/download.png
--------------------------------------------------------------------------------
/screenshots/downloaded.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lanbing510/CSDNBlogBackup/c0646e1151c3a5259c3bf8601a3d571db6013b2b/screenshots/downloaded.jpg
--------------------------------------------------------------------------------
/screenshots/index.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lanbing510/CSDNBlogBackup/c0646e1151c3a5259c3bf8601a3d571db6013b2b/screenshots/index.png
--------------------------------------------------------------------------------