├── .gitignore
├── README.md
├── readLof.exe
├── readLof.py
├── readLof_photo.exe
└── readLof_photo.py

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# -XML-Reader-Lofter-Data-Backup
XML readers for the XML backup files of LOFTER user data

## Introduction

- Extracts the articles and photo posts from an exported LOFTER homepage XML backup (a sketch of the expected `PostItem` structure follows below)
- Output is encoded as UTF-8, matching the encoding of the XML backup
- Articles are saved as `txt` files in an `Articles` folder in the same path; each `txt` file contains the title, publish time, modify time, tags, body text, links embedded in the article, and comments
- Images embedded in articles are saved to an `Images` folder in the same path
- Photo posts are saved to a `Photos` folder in the same path: every image of each post, plus a `txt` file recording the post's publish time, modify time, tags, caption, embedded links, and comments
- Run the `readLof.exe` executable to export articles, and `readLof_photo.exe` to export photo posts
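
For reference, the snippet below sketches the `PostItem` structure these scripts expect. The child tag names (`type`, `title`, `publishTime`, `tag`, `content`) are the ones `readLof.py` actually reads; the root element name and the sample values are stand-ins, not the real export format:

```python
import xml.dom.minidom

# Hypothetical sample: only the PostItem child tags are taken from the
# parsing code; the root element name and values are placeholders.
sample = """<?xml version="1.0" encoding="UTF-8"?>
<root>
  <PostItem>
    <type>Text</type>
    <title>示例标题</title>
    <publishTime>1591976880000</publishTime>
    <tag>tag1,tag2</tag>
    <content>&lt;p&gt;正文&lt;/p&gt;</content>
  </PostItem>
</root>"""

dom = xml.dom.minidom.parseString(sample)
item = dom.documentElement.getElementsByTagName("PostItem")[0]
print(item.getElementsByTagName("type")[0].childNodes[0].data)  # -> Text
```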

## Update Log
- 2020.6.12. 23:48 Fixed the bug where only one comment per post was exported; fixed files with duplicate titles overwriting each other (duplicates now get a sequence number appended to the file name); added a timeout limit for image download requests
- 2020.6.13. 13:15 Fixed a crash when a title contains '\n'; changed the display format of comment publish times from raw timestamps to a normal date format

## How to Run

### Environment
- Windows XP/7/8/10

### Run
- After downloading, run `readLof.exe` or `readLof_photo.exe` directly
- In the file dialog that pops up, select the XML backup file to process (do not pick a file in another format or a non-LOFTER backup; if you do, close the program and start over)
- Wait for the console window to finish and exit on its own; the export is then complete


## Attention

- 2020.6.12. 23:48 addendum: most crashes are caused by post content containing special characters such as small box glyphs (which cannot be typed here); these characters cannot be parsed when reading the XML file, so the program raises an error and exits, and they have to be deleted by hand
- 2020.6.12. 23:48 addendum 2: on embedded images and photos that fail to export: a timeout limit has been added, but connection timeouts and refused requests still occur, so some images may still fail to download
- Article `txt` files are named after the article title; untitled articles use the pattern "无题\_<publish time>", and images embedded in articles are named "<article title>\_<image index>"
- Photo-post images are named "插图\_<publish time>\_<image index>", and the `txt` file with each post's caption, comments, etc. is named "插图\_<publish time>"
- A LOFTER backup only contains the title, publish time, modify time, tags, images, body text, embedded links, and comments; it does not include popularity ("热度") data
- Some source image links expire or time out, so "timeout" and "403 Forbidden" errors can occur; rerunning the program sometimes helps, but an image that fails to download usually keeps failing
- Exporting articles is fast; photo posts can take considerably longer, although a blog with fewer than 100 photo posts still finishes downloading all images quickly
- About comments: in the XML backup each reply identifies the person being replied to only by a user-id string rather than a nickname (only the reply's author has both), which hurts readability
- Angle brackets, double quotes, asterisks, question marks, colons, and forward/back slashes and vertical bars in article titles are illegal in Windows file names, so they are all replaced with "-" (see the sketch below); the same characters in the body text are unaffected
- Angle brackets and other HTML characters appearing in post bodies are escaped to entities in the XML backup; converting these entities back to symbols is not implemented yet
- Embedded links are collected at the end of the `txt` file (before the comments) instead of staying at their original positions
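
For reference, a minimal sketch equivalent to the chained `replace()` calls the scripts apply to file names (the helper name is hypothetical):

```python
import re

def sanitize_title(title: str) -> str:
    # Mirrors the scripts' behavior: spaces become '_', characters that are
    # illegal in Windows file names become '-', and newlines are dropped.
    title = title.replace(' ', '_')
    title = re.sub(r'[\\/|:"*?<>]', '-', title)
    return title.replace('\n', '')
```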
--------------------------------------------------------------------------------
/readLof.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Freezethewind/-XML-Reader-Lofter-Data-Backup/b24147c1e4fb0235347abf24fd212d89ace4a26f/readLof.exe
--------------------------------------------------------------------------------
/readLof.py:
--------------------------------------------------------------------------------
# -*- coding: UTF-8 -*-

from tkinter.filedialog import *
from tkinter import *
from xml.dom.minidom import parse
import xml.dom.minidom
import os
import datetime, time
import re
import urllib
import urllib.request
import imghdr
import os.path
from pathlib import Path
import shutil


rootwin = Tk()
rootwin.withdraw()

default_dir = r"file_path"
file_path = askopenfilename(title=u'choose xml file', initialdir=(os.path.expanduser(default_dir)))

#print(file_path)


DOMTree = xml.dom.minidom.parse(file_path)

rootNode = DOMTree.documentElement
#print(rootNode.nodeName)

items = rootNode.getElementsByTagName("PostItem")

i = 0

sameTitle = {}
scount = 0

def requestImg(url, i, title, num_retries=3):
    # Download one embedded image into ./Images/ with a browser-like
    # User-Agent; retry up to num_retries times on failure.
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) \
               AppleWebKit/537.36 (KHTML, like Gecko) \
               Chrome/35.0.1916.114 Safari/537.36',
              'Cookie': 'AspxAutoDetectCookieSupport=1'}

    req = urllib.request.Request(url=url, headers=header)
    for attempt in range(num_retries):
        try:
            response = urllib.request.urlopen(req, timeout=20)
            imgfname = title + '-' + str(i)
            conte = response.read()
            imgtype = imghdr.what("", conte)
            #print(imgtype)
            if imgtype is None:
                imgtype = 'jpg'  # fallback when the image format cannot be detected
            imgfname += '.'+imgtype
            with open("./Images/"+imgfname, "wb") as f:
                f.write(conte)
            response.close()
            break
        except Exception as e:
            print(e)
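# Note: downloaded files are named "<title>-<image index>.<detected type>";
# each URL is attempted up to num_retries times with a 20-second timeout, and
# failures are printed rather than raised so one bad link cannot stop the run.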


if not os.path.exists('Articles'):
    os.mkdir('Articles')
else:
    shutil.rmtree('Articles')
    os.mkdir('Articles')

if not os.path.exists('Images'):
    os.mkdir('Images')
else:
    shutil.rmtree('Images')
    os.mkdir('Images')

for item in items:
    itemType = item.getElementsByTagName("type")[0]
    #print(itemType.nodeName, ":", itemType.childNodes[0].data)
    if itemType.childNodes[0].data != "Text":
        continue
    i += 1

    ## get all messages

    title = item.getElementsByTagName("title")[0]
    titleText = ''
    if title.childNodes == []:
        titleText = ''
    else:
        titleText = title.childNodes[0].data  #.replace('','')


    publishTime = item.getElementsByTagName("publishTime")[0].childNodes[0].data
    publishTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(float(int(publishTime)/1000)))
    if titleText == '':
        titleText = "无题_"+publishTime
    titleText = titleText.replace(' ', '_')

    # replace characters that are illegal in Windows file names
    titleText = titleText.replace('/','-')
    titleText = titleText.replace('\\', '-')
    titleText = titleText.replace('|', '-')
    titleText = titleText.replace(':', '-')
    titleText = titleText.replace('"', '-')
    titleText = titleText.replace('*', '-')
    titleText = titleText.replace('?', '-')
    titleText = titleText.replace('<', '-')
    titleText = titleText.replace('>', '-')
    titleText = titleText.replace('', '')  # no-op as written; the original stripped an invisible character that does not survive copy/paste
    titleText = titleText.replace('\n', '')

    modifyTime = publishTime
    if item.getElementsByTagName("modifyTime") == []:
        modifyTime = publishTime
    else:
        modifyTime = item.getElementsByTagName("modifyTime")[0].childNodes[0].data
        modifyTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(float(int(modifyTime)/1000)))
    tag = []
    if item.getElementsByTagName("tag") == []:
        tag = []
    else:
        tag = item.getElementsByTagName("tag")[0]
        if tag.childNodes == []:
            tag = []
        else:
            tag = tag.childNodes[0].data.split(',')
    #print(tag)
    content = item.getElementsByTagName("content")[0].childNodes[0].data
    # strip paragraph tags and turn line breaks into CRLF
    content = content.replace('<p>', '')
    content = content.replace('</p>', '')
    content = content.replace('<br>', '\r\n')
    content = content.replace('', '')  # invisible-character cleanup, as above

    linkSrc = r'href="(.*?)"'
    linkSrc = re.findall(linkSrc, content)

    imgSrc = r'<img src="(.*?)"'
    imgSrc = re.findall(imgSrc, content)

    # drop any remaining HTML tags from the body text
    content = re.compile(r'<[^>]+>', re.S).sub('', content)


    #if i == 1:
    #    print(content)
    #cList = item.getElementsByTagName("commentList")
    cList = item.getElementsByTagName("comment")
    # if i == 1:
    #print(len(cList))
    comments = []
    for comm in cList:
        pubid = comm.getElementsByTagName("publisherUserId")[0].childNodes[0].data
        pubnick = comm.getElementsByTagName("publisherNick")[0].childNodes[0].data
        comcon = comm.getElementsByTagName("content")[0].childNodes[0].data
        comtime = comm.getElementsByTagName("publishTime")[0].childNodes[0].data
        comtime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(float(int(comtime)/1000)))
        repid = comm.getElementsByTagName("replyToUserId")[0].childNodes[0].data
        comments.append({"pubid":pubid, "pubnick":pubnick, "comcon":comcon, "comtime":comtime, "repid":repid})
        #print('rua!')
    #if i == 1:
    #print(len(comments))

    ## save as txt and images

    # de-duplicate file names: repeated titles get a sequence number appended
    if not Path("./Articles/"+titleText+".txt").is_file():
        sameTitle[titleText] = 0
    else:
        sameTitle[titleText] += 1
        titleText += '_'+str(sameTitle[titleText])


    with open("./Articles/"+titleText+".txt", "w", encoding="utf-8") as f:
        f.write(titleText+'\n\n')
        f.write("发表时间:"+publishTime+'\n')
        f.write("修改时间:"+modifyTime+'\n\n')
        f.write("Tag:")
        if tag != []:
            for t in range(0, len(tag)-1):
                f.write(tag[t]+', ')
            f.write(tag[len(tag)-1]+'\n\n\n')
        else:
            f.write('\n\n\n')
        f.writelines(content)

        f.write("\n\n插入链接:\n")
        for lk in linkSrc:
            f.write(lk+'\n')

        f.write('\n\n\n评论:\n\n')
        for comm in comments:
            f.write("发表人:"+comm["pubnick"]+' '+"UserId:"+comm["pubid"]+' '+"回复时间:"+comm["comtime"]+'\n')
            f.write("回复给:"+comm["repid"]+'\n')
            f.writelines(comm["comcon"]+'\n\n')

    for img in range(0, len(imgSrc)):
        requestImg(imgSrc[img], img, titleText)

    print(titleText+": finished.")

print("Complete!")
--------------------------------------------------------------------------------
/readLof_photo.exe:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Freezethewind/-XML-Reader-Lofter-Data-Backup/b24147c1e4fb0235347abf24fd212d89ace4a26f/readLof_photo.exe
--------------------------------------------------------------------------------
/readLof_photo.py:
--------------------------------------------------------------------------------
# -*- coding: UTF-8 -*-

from tkinter.filedialog import *
from tkinter import *
from xml.dom.minidom import parse
import xml.dom.minidom
import os
import datetime, time
import re
import urllib
import urllib.request
import imghdr
import threading
import types
import os.path
from pathlib import Path
import shutil

global false, null, true

false = null = true = ""
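# The placeholder assignments above let eval() digest the JSON-style
# photoLinks payload further below, which contains bare true/false/null
# literals. A stricter sketch, assuming the payload is valid JSON:
#   import json
#   photos = json.loads(photo_links_text)  # photo_links_text is a hypothetical name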

rootwin = Tk()
rootwin.withdraw()

default_dir = r"file_path"
file_path = askopenfilename(title=u'choose xml file', initialdir=(os.path.expanduser(default_dir)))

#print(file_path)

DOMTree = xml.dom.minidom.parse(file_path)

rootNode = DOMTree.documentElement
#print(rootNode.nodeName)

items = rootNode.getElementsByTagName("PostItem")

i = 0


header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) \
           AppleWebKit/537.36 (KHTML, like Gecko) \
           Chrome/35.0.1916.114 Safari/537.36',
          'Cookie': 'AspxAutoDetectCookieSupport=1'}
glock = threading.Lock()
pho_list = []
url_list = []
title_list = []

sameTitle = {}
scount = 0


def get_url():
    # Producer: pop one post's photo list and queue every image URL with a
    # numbered title onto pho_list.
    while True:
        glock.acquire()
        if len(url_list) == 0:
            print("Get photos: finished.")
            glock.release()
            break
        else:
            u = url_list.pop()
            count = 0
            title = title_list.pop()
            glock.release()

        glock.acquire()
        for phurl in u:
            picu = phurl['orign']  # 'orign' (sic) is the key used by the export
            #req = urllib.request.Request(url=picu, headers=header)
            #pic = urllib.request.urlopen(req, timeout=30).read()
            pho_list.append({'pic':picu, 'title':title+'_'+str(count)})
            count += 1
        #print(title+": get request successfully.")
        glock.release()


def download_pho():
    # Consumer: pop queued images from pho_list and download them.
    while True:
        glock.acquire()
        if len(pho_list) == 0:
            # check both queues while still holding the lock: stop only once
            # the producers have drained url_list as well
            done = (len(url_list) == 0)
            glock.release()
            if done:
                print("This thread finished downloading.")
                break
            continue
        else:
            pic = pho_list.pop()
            glock.release()
        imgfname = pic['title']

        path = "./Photos/"+imgfname
        requestImg(pic['pic'], path)
        #urllib.request.urlretrieve(pic['pic'], filename=path)
        print(pic['title']+": download successfully.")
        #with open("./Photos/"+imgfname, "wb") as f:
        #    f.write(pic['pic'])
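# Producer/consumer overview: main() starts several get_url() threads that
# drain url_list/title_list and queue {pic, title} jobs onto pho_list, and
# several download_pho() threads that pop those jobs and fetch the files;
# glock serializes all access to the shared lists.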

def requestImg(url, path, num_retries=3):
    # Download one photo to the given path with a browser-like User-Agent;
    # retry up to num_retries times on failure.
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) \
               AppleWebKit/537.36 (KHTML, like Gecko) \
               Chrome/35.0.1916.114 Safari/537.36',
              'Cookie': 'AspxAutoDetectCookieSupport=1'}

    req = urllib.request.Request(url=url, headers=header)
    for attempt in range(num_retries):
        try:
            response = urllib.request.urlopen(req, timeout=10)
            conte = response.read()
            imgfname = path
            imgtype = imghdr.what("", conte)
            #print(imgtype)
            if imgtype is None:
                imgtype = 'jpg'  # fallback when the image format cannot be detected
            imgfname += '.'+imgtype
            with open(imgfname, "wb") as f:
                f.write(conte)
            response.close()
            break
        except Exception as e:
            print(e)


if not os.path.exists('Photos'):
    os.mkdir('Photos')
else:
    shutil.rmtree('Photos')
    os.mkdir('Photos')


for item in items:
    itemType = item.getElementsByTagName("type")[0]
    #print(itemType.nodeName, ":", itemType.childNodes[0].data)
    if itemType.childNodes[0].data != "Photo":
        continue
    i += 1


    ## get all messages

    titleText = ''

    publishTime = item.getElementsByTagName("publishTime")[0].childNodes[0].data
    publishTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(float(int(publishTime)/1000)))
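    # Timestamps in the export are epoch milliseconds, hence int(...)/1000
    # before time.localtime(); the same conversion is applied to modifyTime
    # and to comment publish times below.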

    titleText = "插图_"+publishTime
    titleText = titleText.replace(' ', '_')

    # replace characters that are illegal in Windows file names
    titleText = titleText.replace('/','-')
    titleText = titleText.replace('\\', '-')
    titleText = titleText.replace('|', '-')
    titleText = titleText.replace(':', '-')
    titleText = titleText.replace('"', '-')
    titleText = titleText.replace('*', '-')
    titleText = titleText.replace('?', '-')
    titleText = titleText.replace('<', '-')
    titleText = titleText.replace('>', '-')
    titleText = titleText.replace('', '')  # no-op as written; the original stripped an invisible character that does not survive copy/paste
    titleText = titleText.replace('\n', '')

    modifyTime = publishTime
    if item.getElementsByTagName("modifyTime") == []:
        modifyTime = publishTime
    else:
        modifyTime = item.getElementsByTagName("modifyTime")[0].childNodes[0].data
        modifyTime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(float(int(modifyTime)/1000)))
    tag = []
    if item.getElementsByTagName("tag") == []:
        tag = []
    else:
        tag = item.getElementsByTagName("tag")[0]
        if tag.childNodes == []:
            tag = []
        else:
            tag = tag.childNodes[0].data.split(',')
    #print(tag)
    #print(type(item.getElementsByTagName("caption")[0]))
    #print(i)
    if item.getElementsByTagName("caption") == [] or item.getElementsByTagName("caption")[0].childNodes == []:
        content = ''
    else:
        content = item.getElementsByTagName("caption")[0].childNodes[0].data
    #print(content)
    # strip paragraph tags and turn line breaks into CRLF
    content = content.replace('<p>', '')
    content = content.replace('</p>', '')
    content = content.replace('<br>', '\r\n')
    content = content.replace('', '')  # invisible-character cleanup, as above

    linkSrc = r'href="(.*?)"'
    linkSrc = re.findall(linkSrc, content)
    #iS2 = r'<img src="(.*?)"'
    #iS2 = re.findall(iS2, content)

    # drop any remaining HTML tags from the caption text
    content = re.compile(r'<[^>]+>', re.S).sub('', content)

    photos = item.getElementsByTagName("photoLinks")[0].childNodes[0].data
    photos = eval(photos)  # JSON-style list of dicts; see the note on the placeholder names above
    #if i == 1:
    #    print(photos)


    title_list.append(titleText)
    url_list.append(photos)


    #if i == 1:
    #    print(content)

    cList = item.getElementsByTagName("comment")
    comments = []
    for comm in cList:
        pubid = comm.getElementsByTagName("publisherUserId")[0].childNodes[0].data
        pubnick = comm.getElementsByTagName("publisherNick")[0].childNodes[0].data
        comcon = comm.getElementsByTagName("content")[0].childNodes[0].data
        comtime = comm.getElementsByTagName("publishTime")[0].childNodes[0].data
        comtime = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(float(int(comtime)/1000)))
        repid = comm.getElementsByTagName("replyToUserId")[0].childNodes[0].data
        comments.append({"pubid":pubid, "pubnick":pubnick, "comcon":comcon, "comtime":comtime, "repid":repid})

    ## save as txt

    with open("./Photos/"+titleText+".txt", "w", encoding="utf-8") as f:
        f.write(titleText+'\n\n')
        f.write("发表时间:"+publishTime+'\n')
        f.write("修改时间:"+modifyTime+'\n\n')
        f.write("Tag:")
        if tag != []:
            for t in range(0, len(tag)-1):
                f.write(tag[t]+', ')
            f.write(tag[len(tag)-1]+'\n\n\n')
        else:
            f.write('\n\n\n')
        f.writelines(content)
        f.write("\n\n插入链接:\n")
        for lk in linkSrc:
            f.write(lk+'\n')

        f.write('\n\n\n评论:\n\n')
        for comm in comments:
            f.write("发表人:"+comm["pubnick"]+' '+"UserId:"+comm["pubid"]+' '+"回复时间:"+comm["comtime"]+'\n')
            f.write("回复给:"+comm["repid"]+'\n')
            f.writelines(comm["comcon"]+'\n\n')


def main():
    # five producer threads, six consumer threads
    for xx in range(5):
        product = threading.Thread(target=get_url)
        product.start()
    for xx in range(6):
        consumer = threading.Thread(target=download_pho)
        consumer.start()


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------