├── .gitignore
├── README.md
├── readLof.exe
├── readLof.py
├── readLof_photo.exe
└── readLof_photo.py

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# -XML-Reader-Lofter-Data-Backup
XML readers for the XML backup files of LOFTER user data

## Introduction

- Extracts the articles and photo posts from an exported LOFTER homepage XML backup (a sketch of the expected `PostItem` structure follows below)
- Output is encoded as UTF-8, matching the encoding of the XML backup
- Articles are saved as `txt` files in an `Articles` folder in the same path; each `txt` file contains the title, publish time, modify time, tags, body text, links embedded in the article, and comments
- Images embedded in articles are saved to an `Images` folder in the same path
- Photo posts are saved to a `Photos` folder in the same path: every image of each post, plus a `txt` file recording the post's publish time, modify time, tags, caption, embedded links, and comments
- Run the `readLof.exe` executable to export articles, and `readLof_photo.exe` to export photo posts
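
For reference, the snippet below sketches the `PostItem` structure these scripts expect. The child tag names (`type`, `title`, `publishTime`, `tag`, `content`) are the ones `readLof.py` actually reads; the root element name and the sample values are stand-ins, not the real export format:

```python
import xml.dom.minidom

# Hypothetical sample: only the PostItem child tags are taken from the
# parsing code; the root element name and values are placeholders.
sample = """<?xml version="1.0" encoding="UTF-8"?>
<root>
  <PostItem>
    <type>Text</type>
    <title>示例标题</title>
    <publishTime>1591976880000</publishTime>
    <tag>tag1,tag2</tag>
    <content>&lt;p&gt;正文&lt;/p&gt;</content>
  </PostItem>
</root>"""

dom = xml.dom.minidom.parseString(sample)
item = dom.documentElement.getElementsByTagName("PostItem")[0]
print(item.getElementsByTagName("type")[0].childNodes[0].data)  # -> Text
```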

## Update Log
- 2020.6.12. 23:48 Fixed the bug where only one comment per post was exported; fixed files with duplicate titles overwriting each other (duplicates now get a sequence number appended to the file name); added a timeout limit for image download requests
- 2020.6.13. 13:15 Fixed a crash when a title contains '\n'; changed the display format of comment publish times from raw timestamps to a normal date format

## How to Run

### Environment
- Windows XP/7/8/10

### Run
- After downloading, run `readLof.exe` or `readLof_photo.exe` directly
- In the file dialog that pops up, select the XML backup file to process (do not pick a file in another format or a non-LOFTER backup; if you do, close the program and start over)
- Wait for the console window to finish and exit on its own; the export is then complete


## Attention

- 2020.6.12. 23:48 addendum: most crashes are caused by post content containing special characters such as small box glyphs (which cannot be typed here); these characters cannot be parsed when reading the XML file, so the program raises an error and exits, and they have to be deleted by hand
- 2020.6.12. 23:48 addendum 2: on embedded images and photos that fail to export: a timeout limit has been added, but connection timeouts and refused requests still occur, so some images may still fail to download
- Article `txt` files are named after the article title; untitled articles use the pattern "无题\_<publish time>", and images embedded in articles are named "<article title>\_<image index>"
- Photo-post images are named "插图\_<publish time>\_<image index>", and the `txt` file with each post's caption, comments, etc. is named "插图\_<publish time>"
- A LOFTER backup only contains the title, publish time, modify time, tags, images, body text, embedded links, and comments; it does not include popularity ("热度") data
- Some source image links expire or time out, so "timeout" and "403 Forbidden" errors can occur; rerunning the program sometimes helps, but an image that fails to download usually keeps failing
- Exporting articles is fast; photo posts can take considerably longer, although a blog with fewer than 100 photo posts still finishes downloading all images quickly
- About comments: in the XML backup each reply identifies the person being replied to only by a user-id string rather than a nickname (only the reply's author has both), which hurts readability
- Angle brackets, double quotes, asterisks, question marks, colons, and forward/back slashes and vertical bars in article titles are illegal in Windows file names, so they are all replaced with "-" (see the sketch below); the same characters in the body text are unaffected
- Angle brackets and other HTML characters appearing in post bodies are escaped to entities in the XML backup; converting these entities back to symbols is not implemented yet
- Embedded links are collected at the end of the `txt` file (before the comments) instead of staying at their original positions
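
For reference, a minimal sketch equivalent to the chained `replace()` calls the scripts apply to file names (the helper name is hypothetical):

```python
import re

def sanitize_title(title: str) -> str:
    # Mirrors the scripts' behavior: spaces become '_', characters that are
    # illegal in Windows file names become '-', and newlines are dropped.
    title = title.replace(' ', '_')
    title = re.sub(r'[\\/|:"*?<>]', '-', title)
    return title.replace('\n', '')
```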
--------------------------------------------------------------------------------
/readLof.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Freezethewind/-XML-Reader-Lofter-Data-Backup/b24147c1e4fb0235347abf24fd212d89ace4a26f/readLof.exe
--------------------------------------------------------------------------------
/readLof.py:
--------------------------------------------------------------------------------
# -*- coding: UTF-8 -*-

from tkinter.filedialog import *
from tkinter import *
from xml.dom.minidom import parse
import xml.dom.minidom
import os
import datetime, time
import re
import urllib
import urllib.request
import imghdr
import os.path
from pathlib import Path
import shutil


rootwin = Tk()
rootwin.withdraw()

default_dir = r"file_path"
file_path = askopenfilename(title=u'choose xml file', initialdir=(os.path.expanduser(default_dir)))

#print(file_path)


DOMTree = xml.dom.minidom.parse(file_path)

rootNode = DOMTree.documentElement
#print(rootNode.nodeName)

items = rootNode.getElementsByTagName("PostItem")

i = 0

sameTitle = {}
scount = 0

def requestImg(url, i, title, num_retries=3):
    # Download one embedded image into ./Images/ with a browser-like
    # User-Agent; retry up to num_retries times on failure.
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) \
               AppleWebKit/537.36 (KHTML, like Gecko) \
               Chrome/35.0.1916.114 Safari/537.36',
              'Cookie': 'AspxAutoDetectCookieSupport=1'}

    req = urllib.request.Request(url=url, headers=header)
    for attempt in range(num_retries):
        try:
            response = urllib.request.urlopen(req, timeout=20)
            imgfname = title + '-' + str(i)
            conte = response.read()
            imgtype = imghdr.what("", conte)
            #print(imgtype)
            if imgtype is None:
                imgtype = 'jpg'  # fallback when the image format cannot be detected
            imgfname += '.'+imgtype
            with open("./Images/"+imgfname, "wb") as f:
                f.write(conte)
            response.close()
            break
        except Exception as e:
            print(e)
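# Note: downloaded files are named "<title>-<image index>.<detected type>";
# each URL is attempted up to num_retries times with a 20-second timeout, and
# failures are printed rather than raised so one bad link cannot stop the run.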


if not os.path.exists('Articles'):
    os.mkdir('Articles')
else:
    shutil.rmtree('Articles')
    os.mkdir('Articles')

if not os.path.exists('Images'):
    os.mkdir('Images')
else:
    shutil.rmtree('Images')
    os.mkdir('Images')

for item in items:
    itemType = item.getElementsByTagName("type")[0]
    #print(itemType.nodeName, ":", itemType.childNodes[0].data)
    if itemType.childNodes[0].data != "Text":
        continue
    i += 1

    ## get all messages

    title = item.getElementsByTagName("title")[0]
    titleText = ''
    if title.childNodes == []:
        titleText = ''
    else:
        titleText = title.childNodes[0].data  #.replace('','')


    publishTime = item.getElementsByTagName("publishTime")[0].childNodes[0].data
    publishTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(float(int(publishTime)/1000)))
    if titleText == '':
        titleText = "无题_"+publishTime
    titleText = titleText.replace(' ', '_')

    # replace characters that are illegal in Windows file names
    titleText = titleText.replace('/','-')
    titleText = titleText.replace('\\', '-')
    titleText = titleText.replace('|', '-')
    titleText = titleText.replace(':', '-')
    titleText = titleText.replace('"', '-')
    titleText = titleText.replace('*', '-')
    titleText = titleText.replace('?', '-')
    titleText = titleText.replace('<', '-')
    titleText = titleText.replace('>', '-')
    titleText = titleText.replace('', '')  # no-op as written; the original stripped an invisible character that does not survive copy/paste
    titleText = titleText.replace('\n', '')

    modifyTime = publishTime
    if item.getElementsByTagName("modifyTime") == []:
        modifyTime = publishTime
    else:
        modifyTime = item.getElementsByTagName("modifyTime")[0].childNodes[0].data
        modifyTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(float(int(modifyTime)/1000)))
    tag = []
    if item.getElementsByTagName("tag") == []:
        tag = []
    else:
        tag = item.getElementsByTagName("tag")[0]
        if tag.childNodes == []:
            tag = []
        else:
            tag = tag.childNodes[0].data.split(',')
    #print(tag)
    content = item.getElementsByTagName("content")[0].childNodes[0].data
    # strip paragraph tags and turn line breaks into CRLF
    content = content.replace('<p>', '')
    content = content.replace('</p>', '')
    content = content.replace('<br>', '\r\n')
    content = content.replace('', '')  # invisible-character cleanup, as above

    linkSrc = r'href="(.*?)"'
    linkSrc = re.findall(linkSrc, content)

    imgSrc = r'<img src="(.*?)"'
    imgSrc = re.findall(imgSrc, content)

    # drop any remaining HTML tags from the body text
    content = re.compile(r'<[^>]+>', re.S).sub('', content)


    #if i == 1:
    #    print(content)
    #cList = item.getElementsByTagName("commentList")
    cList = item.getElementsByTagName("comment")
    # if i == 1:
    #print(len(cList))
    comments = []
    for comm in cList:
        pubid = comm.getElementsByTagName("publisherUserId")[0].childNodes[0].data
        pubnick = comm.getElementsByTagName("publisherNick")[0].childNodes[0].data
        comcon = comm.getElementsByTagName("content")[0].childNodes[0].data
        comtime = comm.getElementsByTagName("publishTime")[0].childNodes[0].data
        comtime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(float(int(comtime)/1000)))
        repid = comm.getElementsByTagName("replyToUserId")[0].childNodes[0].data
        comments.append({"pubid":pubid, "pubnick":pubnick, "comcon":comcon, "comtime":comtime, "repid":repid})
        #print('rua!')
    #if i == 1:
    #print(len(comments))

    ## save as txt and images

    # de-duplicate file names: repeated titles get a sequence number appended
    if not Path("./Articles/"+titleText+".txt").is_file():
        sameTitle[titleText] = 0
    else:
        sameTitle[titleText] += 1
        titleText += '_'+str(sameTitle[titleText])


    with open("./Articles/"+titleText+".txt", "w", encoding="utf-8") as f:
        f.write(titleText+'\n\n')
        f.write("发表时间:"+publishTime+'\n')
        f.write("修改时间:"+modifyTime+'\n\n')
        f.write("Tag:")
        if tag != []:
            for t in range(0, len(tag)-1):
                f.write(tag[t]+', ')
            f.write(tag[len(tag)-1]+'\n\n\n')
        else:
            f.write('\n\n\n')
        f.writelines(content)

        f.write("\n\n插入链接:\n")
        for lk in linkSrc:
            f.write(lk+'\n')

        f.write('\n\n\n评论:\n\n')
        for comm in comments:
            f.write("发表人:"+comm["pubnick"]+' '+"UserId:"+comm["pubid"]+' '+"回复时间:"+comm["comtime"]+'\n')
            f.write("回复给:"+comm["repid"]+'\n')
            f.writelines(comm["comcon"]+'\n\n')

    for img in range(0, len(imgSrc)):
        requestImg(imgSrc[img], img, titleText)

    print(titleText+": finished.")

print("Complete!")
--------------------------------------------------------------------------------
/readLof_photo.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Freezethewind/-XML-Reader-Lofter-Data-Backup/b24147c1e4fb0235347abf24fd212d89ace4a26f/readLof_photo.exe
--------------------------------------------------------------------------------
/readLof_photo.py:
--------------------------------------------------------------------------------
# -*- coding: UTF-8 -*-

from tkinter.filedialog import *
from tkinter import *
from xml.dom.minidom import parse
import xml.dom.minidom
import os
import datetime, time
import re
import urllib
import urllib.request
import imghdr
import threading
import types
import os.path
from pathlib import Path
import shutil

global false, null, true

false = null = true = ""
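# The placeholder assignments above let eval() digest the JSON-style
# photoLinks payload further below, which contains bare true/false/null
# literals. A stricter sketch, assuming the payload is valid JSON:
#   import json
#   photos = json.loads(photo_links_text)  # photo_links_text is a hypothetical name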

rootwin = Tk()
rootwin.withdraw()

default_dir = r"file_path"
file_path = askopenfilename(title=u'choose xml file', initialdir=(os.path.expanduser(default_dir)))

#print(file_path)

DOMTree = xml.dom.minidom.parse(file_path)

rootNode = DOMTree.documentElement
#print(rootNode.nodeName)

items = rootNode.getElementsByTagName("PostItem")

i = 0


header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) \
           AppleWebKit/537.36 (KHTML, like Gecko) \
           Chrome/35.0.1916.114 Safari/537.36',
          'Cookie': 'AspxAutoDetectCookieSupport=1'}
glock = threading.Lock()
pho_list = []
url_list = []
title_list = []

sameTitle = {}
scount = 0


def get_url():
    # Producer: pop one post's photo list and queue every image URL with a
    # numbered title onto pho_list.
    while True:
        glock.acquire()
        if len(url_list) == 0:
            print("Get photos: finished.")
            glock.release()
            break
        else:
            u = url_list.pop()
            count = 0
            title = title_list.pop()
            glock.release()

        glock.acquire()
        for phurl in u:
            picu = phurl['orign']  # 'orign' (sic) is the key used by the export
            #req = urllib.request.Request(url=picu, headers=header)
            #pic = urllib.request.urlopen(req, timeout=30).read()
            pho_list.append({'pic':picu, 'title':title+'_'+str(count)})
            count += 1
        #print(title+": get request successfully.")
        glock.release()


def download_pho():
    # Consumer: pop queued images from pho_list and download them.
    while True:
        glock.acquire()
        if len(pho_list) == 0:
            # check both queues while still holding the lock: stop only once
            # the producers have drained url_list as well
            done = (len(url_list) == 0)
            glock.release()
            if done:
                print("This thread finished downloading.")
                break
            continue
        else:
            pic = pho_list.pop()
            glock.release()
        imgfname = pic['title']

        path = "./Photos/"+imgfname
        requestImg(pic['pic'], path)
        #urllib.request.urlretrieve(pic['pic'], filename=path)
        print(pic['title']+": download successfully.")
        #with open("./Photos/"+imgfname, "wb") as f:
        #    f.write(pic['pic'])
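# Producer/consumer overview: main() starts several get_url() threads that
# drain url_list/title_list and queue {pic, title} jobs onto pho_list, and
# several download_pho() threads that pop those jobs and fetch the files;
# glock serializes all access to the shared lists.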

def requestImg(url, path, num_retries=3):
    # Download one photo to the given path with a browser-like User-Agent;
    # retry up to num_retries times on failure.
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) \
               AppleWebKit/537.36 (KHTML, like Gecko) \
               Chrome/35.0.1916.114 Safari/537.36',
              'Cookie': 'AspxAutoDetectCookieSupport=1'}

    req = urllib.request.Request(url=url, headers=header)
    for attempt in range(num_retries):
        try:
            response = urllib.request.urlopen(req, timeout=10)
            conte = response.read()
            imgfname = path
            imgtype = imghdr.what("", conte)
            #print(imgtype)
            if imgtype is None:
                imgtype = 'jpg'  # fallback when the image format cannot be detected
            imgfname += '.'+imgtype
            with open(imgfname, "wb") as f:
                f.write(conte)
            response.close()
            break
        except Exception as e:
            print(e)


if not os.path.exists('Photos'):
    os.mkdir('Photos')
else:
    shutil.rmtree('Photos')
    os.mkdir('Photos')


for item in items:
    itemType = item.getElementsByTagName("type")[0]
    #print(itemType.nodeName, ":", itemType.childNodes[0].data)
    if itemType.childNodes[0].data != "Photo":
        continue
    i += 1


    ## get all messages

    titleText = ''

    publishTime = item.getElementsByTagName("publishTime")[0].childNodes[0].data
    publishTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(float(int(publishTime)/1000)))
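    # Timestamps in the export are epoch milliseconds, hence int(...)/1000
    # before time.localtime(); the same conversion is applied to modifyTime
    # and to comment publish times below.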

    titleText = "插图_"+publishTime
    titleText = titleText.replace(' ', '_')

    # replace characters that are illegal in Windows file names
    titleText = titleText.replace('/','-')
    titleText = titleText.replace('\\', '-')
    titleText = titleText.replace('|', '-')
    titleText = titleText.replace(':', '-')
    titleText = titleText.replace('"', '-')
    titleText = titleText.replace('*', '-')
    titleText = titleText.replace('?', '-')
    titleText = titleText.replace('<', '-')
    titleText = titleText.replace('>', '-')
    titleText = titleText.replace('', '')  # no-op as written; the original stripped an invisible character that does not survive copy/paste
    titleText = titleText.replace('\n', '')

    modifyTime = publishTime
    if item.getElementsByTagName("modifyTime") == []:
        modifyTime = publishTime
    else:
        modifyTime = item.getElementsByTagName("modifyTime")[0].childNodes[0].data
        modifyTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(float(int(modifyTime)/1000)))
    tag = []
    if item.getElementsByTagName("tag") == []:
        tag = []
    else:
        tag = item.getElementsByTagName("tag")[0]
        if tag.childNodes == []:
            tag = []
        else:
            tag = tag.childNodes[0].data.split(',')
    #print(tag)
    #print(type(item.getElementsByTagName("caption")[0]))
    #print(i)
    if item.getElementsByTagName("caption") == [] or item.getElementsByTagName("caption")[0].childNodes == []:
        content = ''
    else:
        content = item.getElementsByTagName("caption")[0].childNodes[0].data
    #print(content)
    # strip paragraph tags and turn line breaks into CRLF
    content = content.replace('<p>', '')
    content = content.replace('</p>', '')
    content = content.replace('<br>', '\r\n')
    content = content.replace('', '')  # invisible-character cleanup, as above

    linkSrc = r'href="(.*?)"'
    linkSrc = re.findall(linkSrc, content)
    #iS2 = r'<img src="(.*?)"'
    #iS2 = re.findall(iS2, content)

    # drop any remaining HTML tags from the caption text
    content = re.compile(r'<[^>]+>', re.S).sub('', content)

    photos = item.getElementsByTagName("photoLinks")[0].childNodes[0].data
    photos = eval(photos)  # JSON-style list of dicts; see the note on the placeholder names above
    #if i == 1:
    #    print(photos)


    title_list.append(titleText)
    url_list.append(photos)


    #if i == 1:
    #    print(content)

    cList = item.getElementsByTagName("comment")
    comments = []
    for comm in cList:
        pubid = comm.getElementsByTagName("publisherUserId")[0].childNodes[0].data
        pubnick = comm.getElementsByTagName("publisherNick")[0].childNodes[0].data
        comcon = comm.getElementsByTagName("content")[0].childNodes[0].data
        comtime = comm.getElementsByTagName("publishTime")[0].childNodes[0].data
        comtime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(float(int(comtime)/1000)))
        repid = comm.getElementsByTagName("replyToUserId")[0].childNodes[0].data
        comments.append({"pubid":pubid, "pubnick":pubnick, "comcon":comcon, "comtime":comtime, "repid":repid})

    ## save as txt

    with open("./Photos/"+titleText+".txt", "w", encoding="utf-8") as f:
        f.write(titleText+'\n\n')
        f.write("发表时间:"+publishTime+'\n')
        f.write("修改时间:"+modifyTime+'\n\n')
        f.write("Tag:")
        if tag != []:
            for t in range(0, len(tag)-1):
                f.write(tag[t]+', ')
            f.write(tag[len(tag)-1]+'\n\n\n')
        else:
            f.write('\n\n\n')
        f.writelines(content)
        f.write("\n\n插入链接:\n")
        for lk in linkSrc:
            f.write(lk+'\n')

        f.write('\n\n\n评论:\n\n')
        for comm in comments:
            f.write("发表人:"+comm["pubnick"]+' '+"UserId:"+comm["pubid"]+' '+"回复时间:"+comm["comtime"]+'\n')
            f.write("回复给:"+comm["repid"]+'\n')
            f.writelines(comm["comcon"]+'\n\n')


def main():
    # five producer threads, six consumer threads
    for xx in range(5):
        product = threading.Thread(target=get_url)
        product.start()
    for xx in range(6):
        consumer = threading.Thread(target=download_pho)
        consumer.start()


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------