├── .gitignore
├── README.md
├── readLof.exe
├── readLof.py
├── readLof_photo.exe
└── readLof_photo.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # -XML-Reader-Lofter-Data-Backup
2 | XML readers for the exported XML data files of LOFTER users
3 | ## Introduction
4 |
5 | - Organizes and exports the articles and photo posts contained in a LOFTER homepage XML backup (a trimmed sketch of the shared parsing loop follows this list)
6 | - Output is encoded in UTF-8, the same encoding as the XML backup
7 | - Articles are saved as `txt` files in an `Articles` folder under the same path; each `txt` file contains the title, publish time, modify time, tags, body text, links embedded in the article, and comments
8 | - Images embedded in articles are saved in an `Images` folder under the same path
9 | - Photo posts are saved in a `Photos` folder under the same path: every image of each post, plus a `txt` file holding the post's publish time, modify time, tags, caption, embedded links, and comments
10 | - Run the `readLof.exe` executable to export articles, and `readLof_photo.exe` to export photo posts
11 |
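Both scripts share the same parsing skeleton: load the backup with `xml.dom.minidom`, walk every `PostItem` element, and dispatch on its `type`. A trimmed sketch of that loop (`blog.xml` stands in for the file chosen in the dialog):

```python
import time
import xml.dom.minidom

DOMTree = xml.dom.minidom.parse("blog.xml")
items = DOMTree.documentElement.getElementsByTagName("PostItem")

for item in items:
    # "Text" posts are articles (readLof.py); "Photo" posts are photo posts (readLof_photo.py)
    if item.getElementsByTagName("type")[0].childNodes[0].data != "Text":
        continue
    # LOFTER stores publish times as millisecond timestamps
    ms = int(item.getElementsByTagName("publishTime")[0].childNodes[0].data)
    print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(ms / 1000)))
```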
12 | ## Update Log
13 | - 2020.6.12. 23:48 Fixed comments exporting only a single entry; fixed files with duplicate titles overwriting each other (same-named files now get a running duplicate count appended to the filename; see the excerpt after this list); added a timeout limit for image download requests
14 | - 2020.6.13. 13:15 Fixed a crash when a title contains '\n'; changed the display of comment publish times from raw timestamps to a normal date-time format
15 |
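The duplicate-title fix keeps a per-title counter and appends it to later files with the same name; this is the relevant logic from `readLof.py` (`titleText` here is a placeholder for the sanitized post title):

```python
from pathlib import Path

sameTitle = {}           # title -> count of earlier files with that title
titleText = "example"    # placeholder; the script derives this from the post

# the first file with a given title keeps the plain name; later ones get _1, _2, ...
if not Path("./Articles/" + titleText + ".txt").is_file():
    sameTitle[titleText] = 0
else:
    sameTitle[titleText] += 1
    titleText += '_' + str(sameTitle[titleText])
```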
16 | ## How to Run
17 |
18 | ### Environment
19 | - Windows 7/8/XP/10
20 |
21 | ### Run
22 | - After downloading, run `readLof.exe` or `readLof_photo.exe` directly
23 | - In the file dialog that pops up, select the XML backup file to process (be sure not to pick a file in another format or a non-LOFTER backup; if you do, close the program and start over)
24 | - Wait for the console window to finish and close automatically; the export is then complete
25 |
26 |
27 | ## Attention
28 |
29 | - 2020.6.12. 23:48 addendum: most crashes are caused by article content containing special symbols such as small squares (unprintable characters); they cannot be parsed when the XML file is read, so the program errors out and exits. Delete these special symbols by hand
30 | - 2020.6.12. 23:48 addendum 2: for embedded images and photos that fail to export, a timeout limit has been added, but connection timeouts and refused requests still occur, so some images may still fail to download
31 | - Article txt files are named after the article title; untitled articles fall back to the pattern `无题\_<publish time>` ("untitled"), and images embedded in an article are named `<article title>\_<image index>`
32 | - Photo-post images are named `插图\_<publish time>\_<image index>` ("photo"), and the txt file holding the post's caption, comments, etc. is named `插图\_<publish time>`
33 | - A LOFTER backup contains only the title, publish time, modify time, tags, images, body text, embedded links, and comments; it does not include popularity ("heat") data
34 | - Some image source links expire or time out, producing timeout and 403 Forbidden errors; re-running the program retries the backup, but images that fail usually keep failing (...)
35 | - Articles export quickly; photo posts can take much longer, though a blog with fewer than 100 photo posts still downloads all of its images quickly
36 | - About comments: in the XML backup the recipient of a reply is recorded only as a string user ID, not a nickname (only the reply's author carries both a nickname and an ID), which hurts readability
37 | - Angle brackets, double quotes, asterisks, question marks, vertical bars, and forward/back slashes in article titles are illegal in Windows filenames, so they are all replaced with "-"; the same characters in the body text are left untouched
38 | - Angle brackets and other HTML characters in article bodies are stored as escape codes in the XML backup; converting these codes back to the original symbols is not implemented yet (a one-call sketch follows this list)
39 | - Embedded links are collected and written at the end of the txt file (before the comments) rather than kept at their original positions
40 |
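Restoring those escaped characters would be a single standard-library call if it is ever added; a minimal sketch (the sample string is made up):

```python
import html

# the XML backup stores '<' as '&lt;', '&' as '&amp;', and so on
print(html.unescape("a &lt;b&gt; &amp; &quot;c&quot;"))  # -> a <b> & "c"
```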
--------------------------------------------------------------------------------
/readLof.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Freezethewind/-XML-Reader-Lofter-Data-Backup/b24147c1e4fb0235347abf24fd212d89ace4a26f/readLof.exe
--------------------------------------------------------------------------------
/readLof.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 |
3 | from tkinter.filedialog import *
4 | from tkinter import *
5 | from xml.dom.minidom import parse
6 | import xml.dom.minidom
7 | import os
8 | import datetime, time
9 | import re
10 | import urllib
11 | import urllib.request
12 | import imghdr
13 | import os.path
14 | from pathlib import Path
15 | import shutil
16 |
17 |
18 | rootwin = Tk()
19 | rootwin.withdraw()
20 |
21 | default_dir = r"file_path"
22 | file_path = askopenfilename(title=u'choose xml file', initialdir=(os.path.expanduser(default_dir)))
23 |
24 | #print(file_path)
25 |
26 |
27 | DOMTree = xml.dom.minidom.parse(file_path)
28 |
29 | rootNode = DOMTree.documentElement
30 | #print(rootNode.nodeName)
31 |
32 | items = rootNode.getElementsByTagName("PostItem")
33 |
34 | i = 0
35 |
36 | sameTitle = {}
37 | scount = 0
38 |
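# download one embedded image: browser-like headers get past some hosts'
# bot filtering, and the image type is sniffed from the bytes to pick an extension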
39 | def requestImg(url, i, title, num_retries=3):
40 | header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) \
41 | AppleWebKit/537.36 (KHTML, like Gecko) \
42 | Chrome/35.0.1916.114 Safari/537.36',
43 | 'Cookie': 'AspxAutoDetectCookieSupport=1'}
44 |
45 | req = urllib.request.Request(url=url, headers=header)
46 | try:
47 | response = urllib.request.urlopen(req, timeout=20)
48 | imgfname = title + '-' + str(i)
49 | conte = response.read()
50 |         # sniff the image type from the bytes; fall back to .jpg if the type is unknown
51 |         imgtype = imghdr.what("", conte) or 'jpg'
52 |         imgfname += '.' + imgtype
53 |         with open("./Images/" + imgfname, "wb") as f:
54 |             f.write(conte)
55 |         response.close()
56 | 
57 |     except Exception as e:
58 |         print(e)
59 |         # retry a few times on timeouts and transient connection errors
60 |         if num_retries > 0:
61 |             requestImg(url, i, title, num_retries - 1)
61 |
62 |
63 | if not os.path.exists('Articles'):
64 | os.mkdir('Articles')
65 | else:
66 | shutil.rmtree('Articles')
67 | os.mkdir('Articles')
68 |
69 | if not os.path.exists('Images'):
70 | os.mkdir('Images')
71 | else:
72 | shutil.rmtree('Images')
73 | os.mkdir('Images')
74 |
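# walk every PostItem and export each "Text" (article) post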
75 | for item in items:
76 | itemType = item.getElementsByTagName("type")[0]
77 | #print(itemType.nodeName, ":", itemType.childNodes[0].data)
78 | if itemType.childNodes[0].data != "Text":
79 | continue
80 | i += 1
81 |
82 | ## get all messages
83 |
84 | title = item.getElementsByTagName("title")[0]
85 | titleText = ''
86 | if title.childNodes == []:
87 | titleText = ''
88 | else:
89 | titleText = title.childNodes[0].data #.replace('','')
90 |
91 |
92 | publishTime = item.getElementsByTagName("publishTime")[0].childNodes[0].data
93 | publishTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(float(int(publishTime)/1000)))
94 | if titleText == '':
95 | titleText = "无题_"+publishTime
96 | titleText = titleText.replace(' ', '_')
97 |
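    # replace characters that are illegal in Windows filenames with '-'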
98 | titleText = titleText.replace('/','-')
99 | titleText = titleText.replace('\\', '-')
100 | titleText = titleText.replace('|', '-')
101 | titleText = titleText.replace(':', '-')
102 | titleText = titleText.replace('"', '-')
103 | titleText = titleText.replace('*', '-')
104 | titleText = titleText.replace('?', '-')
105 | titleText = titleText.replace('<', '-')
106 | titleText = titleText.replace('>', '-')
107 | titleText = titleText.replace('', '')
108 | titleText = titleText.replace('\n', '')
109 |
110 | modifyTime = publishTime
111 | if item.getElementsByTagName("modifyTime") == []:
112 | modifyTime = publishTime
113 | else:
114 | modifyTime = item.getElementsByTagName("modifyTime")[0].childNodes[0].data
115 | modifyTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(float(int(modifyTime)/1000)))
116 | tag = []
117 | if item.getElementsByTagName("tag") == []:
118 | tag = []
119 | else:
120 | tag = item.getElementsByTagName("tag")[0]
121 | if tag.childNodes == []:
122 | tag = []
123 | else:
124 | tag = tag.childNodes[0].data.split(',')
125 | #print(tag)
126 | content = item.getElementsByTagName("content")[0].childNodes[0].data
127 |     content = content.replace('<p>', '')
128 |     content = content.replace('</p>', '')
129 |     content = content.replace('<br>', '\r\n')
130 |     content = content.replace('', '')
131 | 
132 |     linkSrc = r'href="(.*?)"'
133 |     linkSrc = re.findall(linkSrc, content)
134 | 
135 |     imgSrc = r'<img src="(.*?)"'
136 |     imgSrc = re.findall(imgSrc, content)
137 | 
142 |     # strip any remaining HTML tags from the body text
143 |     content = re.compile(r'<[^>]+>', re.S).sub('', content)
144 |
145 |
146 | #if i == 1:
147 | # print(content)
148 | #cList = item.getElementsByTagName("commentList")
149 | cList = item.getElementsByTagName("comment")
150 | # if i == 1:
151 | #print(len(cList))
152 | comments = []
153 | for comm in cList:
154 |
155 | pubid = comm.getElementsByTagName("publisherUserId")[0].childNodes[0].data
156 | pubnick = comm.getElementsByTagName("publisherNick")[0].childNodes[0].data
157 | comcon = comm.getElementsByTagName("content")[0].childNodes[0].data
158 | comtime = comm.getElementsByTagName("publishTime")[0].childNodes[0].data
159 | comtime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(float(int(comtime)/1000)))
160 | repid = comm.getElementsByTagName("replyToUserId")[0].childNodes[0].data
161 | comments.append({"pubid":pubid, "pubnick":pubnick, "comcon":comcon, "comtime":comtime, "repid":repid})
162 | #print('rua!')
163 | #if i == 1:
164 | #print(len(comments))
165 |
166 | ## save as txt and images
167 |
168 |
169 |
170 |
171 | if not Path("./Articles/"+titleText+".txt").is_file():
172 | sameTitle[titleText] = 0
173 | else:
174 | sameTitle[titleText] += 1
175 | titleText += '_'+str(sameTitle[titleText])
176 |
177 |
178 | with open("./Articles/"+titleText+".txt", "w", encoding="utf-8") as f:
179 | f.write(titleText+'\n\n')
180 | f.write("发表时间:"+publishTime+'\n')
181 | f.write("修改时间:"+modifyTime+'\n\n')
182 | f.write("Tag:")
183 | if tag != []:
184 | for t in range(0, len(tag)-1):
185 | f.write(tag[t]+', ')
186 | f.write(tag[len(tag)-1]+'\n\n\n')
187 | else:
188 | f.write('\n\n\n')
189 | f.writelines(content)
190 |
191 | f.write("\n\n插入链接:\n")
192 | for lk in linkSrc:
193 | f.write(lk+'\n')
194 |
195 | f.write('\n\n\n评论:\n\n')
196 | for comm in comments:
197 | f.write("发表人:"+comm["pubnick"]+' '+"UserId:"+comm["pubid"]+' '+"回复时间:"+comm["comtime"]+'\n')
198 | f.write("回复给:"+comm["repid"]+'\n')
199 | f.writelines(comm["comcon"]+'\n\n')
200 |
201 | for img in range(0, len(imgSrc)):
202 | requestImg(imgSrc[img], img, titleText)
203 |
204 | print(titleText+": finished.")
205 |
206 | print("Complete!")
207 |
--------------------------------------------------------------------------------
/readLof_photo.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Freezethewind/-XML-Reader-Lofter-Data-Backup/b24147c1e4fb0235347abf24fd212d89ace4a26f/readLof_photo.exe
--------------------------------------------------------------------------------
/readLof_photo.py:
--------------------------------------------------------------------------------
1 | # -*- coding: UTF-8 -*-
2 |
3 | from tkinter.filedialog import *
4 | from tkinter import *
5 | from xml.dom.minidom import parse
6 | import xml.dom.minidom
7 | import os
8 | import datetime, time
9 | import re
10 | import urllib
11 | import urllib.request
12 | import imghdr
13 | import threading
14 | import types
15 | import os.path
16 | from pathlib import Path
17 | import shutil
18 | import json
19 | 
22 |
23 | rootwin = Tk()
24 | rootwin.withdraw()
25 |
26 | default_dir = r"file_path"
27 | file_path = askopenfilename(title=u'choose xml file', initialdir=(os.path.expanduser(default_dir)))
28 |
29 | #print(file_path)
30 |
31 | DOMTree = xml.dom.minidom.parse(file_path)
32 |
33 | rootNode = DOMTree.documentElement
34 | #print(rootNode.nodeName)
35 |
36 | items = rootNode.getElementsByTagName("PostItem")
37 |
38 | i = 0
39 |
40 |
41 | header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) \
42 | AppleWebKit/537.36 (KHTML, like Gecko) \
43 | Chrome/35.0.1916.114 Safari/537.36',
44 | 'Cookie': 'AspxAutoDetectCookieSupport=1'}
45 | glock = threading.Lock()
46 | pho_list = []
47 | url_list = []
48 | title_list = []
49 |
50 | sameTitle = {}
51 | scount = 0
52 |
53 |
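# producer: drain url_list and queue each photo's URL and target filename
# onto pho_list; all shared state is guarded by glock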
54 | def get_url():
55 |
56 | while True:
57 | glock.acquire()
58 | if len(url_list) == 0:
59 | print("Get photos: finished.")
60 | glock.release()
61 |
62 | break
63 | else:
64 | u = url_list.pop()
65 | count = 0
66 | title = title_list.pop()
67 |
68 |             # keep holding glock while queueing, so consumers never observe
69 |             # url_list empty before this post's photos are on pho_list
72 | for phurl in u:
73 |
74 | picu = phurl['orign']
75 | #req = urllib.request.Request(url=picu, headers=header)
76 | #pic = urllib.request.urlopen(req, timeout=30).read()
77 |
78 | pho_list.append({'pic':picu, 'title':title+'_'+str(count)})
79 | count += 1
80 | #print(title+": get request successfully.")
81 | glock.release()
82 |
83 |
84 |
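# consumer: pop queued photos and download them until both queues are empty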
85 | def download_pho():
86 |
87 | while True:
88 | glock.acquire()
89 |         if len(pho_list) == 0:
90 |             # nothing queued: decide, still under the lock, whether producers are done
91 |             done = len(url_list) == 0
92 |             glock.release()
93 |             if done:
94 |                 print("This thread finished downloading.")
95 |                 break
96 |             continue
95 | else:
96 | pic = pho_list.pop()
97 | glock.release()
98 | imgfname = pic['title']
99 |
100 | path = "./Photos/"+imgfname
101 | requestImg(pic['pic'], path)
102 | #urllib.request.urlretrieve(pic['pic'], filename=path)
103 | print(pic['title']+": download successfully.")
104 | #with open("./Photos/"+imgfname, "wb") as f:
105 | #f.write(pic['pic'])
106 |
107 |
108 |
109 | def requestImg(url, path, num_retries=3):
110 | header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) \
111 | AppleWebKit/537.36 (KHTML, like Gecko) \
112 | Chrome/35.0.1916.114 Safari/537.36',
113 | 'Cookie': 'AspxAutoDetectCookieSupport=1'}
114 |
115 | req = urllib.request.Request(url=url, headers=header)
116 | try:
117 | response = urllib.request.urlopen(req, timeout=10)
118 | conte = response.read()
119 | imgfname = path
120 |         # sniff the image type from the bytes; fall back to .jpg if the type is unknown
121 |         imgtype = imghdr.what("", conte) or 'jpg'
122 |         imgfname += '.' + imgtype
123 |         with open(imgfname, "wb") as f:
124 |             f.write(conte)
125 |         response.close()
126 | 
127 |     except Exception as e:
128 |         print(e)
129 |         # retry a few times on timeouts and transient connection errors
130 |         if num_retries > 0:
131 |             requestImg(url, path, num_retries - 1)
131 |
132 |
133 | if not os.path.exists('Photos'):
134 | os.mkdir('Photos')
135 | else:
136 | shutil.rmtree('Photos')
137 | os.mkdir('Photos')
138 |
139 |
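# walk every PostItem and export each "Photo" post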
140 | for item in items:
141 | itemType = item.getElementsByTagName("type")[0]
142 | #print(itemType.nodeName, ":", itemType.childNodes[0].data)
143 | if itemType.childNodes[0].data != "Photo":
144 | continue
145 | i += 1
146 |
147 |
148 | ## get all messages
149 |
150 | titleText = ''
151 |
152 | publishTime = item.getElementsByTagName("publishTime")[0].childNodes[0].data
153 | publishTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(float(int(publishTime)/1000)))
154 |
155 | titleText = "插图_"+publishTime
156 | titleText = titleText.replace(' ', '_')
157 |
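    # replace characters that are illegal in Windows filenames with '-'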
158 | titleText = titleText.replace('/','-')
159 | titleText = titleText.replace('\\', '-')
160 | titleText = titleText.replace('|', '-')
161 | titleText = titleText.replace(':', '-')
162 | titleText = titleText.replace('"', '-')
163 | titleText = titleText.replace('*', '-')
164 | titleText = titleText.replace('?', '-')
165 | titleText = titleText.replace('<', '-')
166 | titleText = titleText.replace('>', '-')
167 | titleText = titleText.replace('', '')
168 | titleText = titleText.replace('\n', '')
169 |
170 | modifyTime = publishTime
171 | if item.getElementsByTagName("modifyTime") == []:
172 | modifyTime = publishTime
173 | else:
174 | modifyTime = item.getElementsByTagName("modifyTime")[0].childNodes[0].data
175 | modifyTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(float(int(modifyTime)/1000)))
176 | tag = []
177 | if item.getElementsByTagName("tag") == []:
178 | tag = []
179 | else:
180 | tag = item.getElementsByTagName("tag")[0]
181 | if tag.childNodes == []:
182 | tag = []
183 | else:
184 | tag = tag.childNodes[0].data.split(',')
185 | #print(tag)
186 | #print(type(item.getElementsByTagName("caption")[0]))
187 | #print(i)
188 | if item.getElementsByTagName("caption") == [] or item.getElementsByTagName("caption")[0].childNodes == []:
189 | content = ''
190 | else:
191 | content = item.getElementsByTagName("caption")[0].childNodes[0].data
192 | #print(content)
193 | content = content.replace('', '')
194 |     content = content.replace('<p>', '')
195 |     content = content.replace('<br>', '\r\n')
196 |     content = content.replace('', '')
197 | 
198 |     linkSrc = r'href="(.*?)"'
199 |     linkSrc = re.findall(linkSrc, content)
200 |     #iS2 = r'<img src="(.*?)"'
201 | 
205 |     # strip any remaining HTML tags from the caption
206 |     content = re.compile(r'<[^>]+>', re.S).sub('', content)
207 |
208 | photos = item.getElementsByTagName("photoLinks")[0].childNodes[0].data
209 |     photos = json.loads(photos)   # photoLinks holds a JSON array; avoids eval() on file data
210 | #if i == 1:
211 | # print(photos)
212 |
213 |
214 | title_list.append(titleText)
215 | url_list.append(photos)
216 |
217 |
218 |
219 |
220 |
221 | #if i == 1:
222 | # print(content)
223 |
224 | cList = item.getElementsByTagName("comment")
225 | comments = []
226 | for comm in cList:
227 | pubid = comm.getElementsByTagName("publisherUserId")[0].childNodes[0].data
228 | pubnick = comm.getElementsByTagName("publisherNick")[0].childNodes[0].data
229 | comcon = comm.getElementsByTagName("content")[0].childNodes[0].data
230 | comtime = comm.getElementsByTagName("publishTime")[0].childNodes[0].data
231 | comtime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(float(int(comtime)/1000)))
232 | repid = comm.getElementsByTagName("replyToUserId")[0].childNodes[0].data
233 | comments.append({"pubid":pubid, "pubnick":pubnick, "comcon":comcon, "comtime":comtime, "repid":repid})
234 |
235 | ## save as txt
236 |
237 |
238 | with open("./Photos/"+titleText+".txt", "w", encoding="utf-8") as f:
239 | f.write(titleText+'\n\n')
240 | f.write("发表时间:"+publishTime+'\n')
241 | f.write("修改时间:"+modifyTime+'\n\n')
242 | f.write("Tag:")
243 | if tag != []:
244 | for t in range(0, len(tag)-1):
245 | f.write(tag[t]+', ')
246 | f.write(tag[len(tag)-1]+'\n\n\n')
247 | else:
248 | f.write('\n\n\n')
249 | f.writelines(content)
250 | f.write("\n\n插入链接:\n")
251 | for lk in linkSrc:
252 | f.write(lk+'\n')
253 |
254 | f.write('\n\n\n评论:\n\n')
255 | for comm in comments:
256 | f.write("发表人:"+comm["pubnick"]+' '+"UserId:"+comm["pubid"]+' '+"回复时间:"+comm["comtime"]+'\n')
257 | f.write("回复给:"+comm["repid"]+'\n')
258 | f.writelines(comm["comcon"]+'\n\n')
259 |
260 |
261 |
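# start 5 producer threads to queue photo URLs and 6 consumer threads to download them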
262 | def main():
263 |
264 | for xx in range(5):
265 | product = threading.Thread(target=get_url)
266 | product.start()
267 | for xx in range(6):
268 | consumer = threading.Thread(target=download_pho)
269 | consumer.start()
270 |
271 |
272 | if __name__ == '__main__':
273 | main()
274 |
--------------------------------------------------------------------------------