├── .gitignore
├── README.md
├── download_comments.py
├── download_posts.py
└── export.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | __pycache__/
 2 | *.py[cod]
 3 | .idea
 4 | comments-json
 5 | comments-xml
 6 | comments-markdown
 7 | posts-xml
 8 | posts-json
 9 | posts-markdown
10 | posts-html
11 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Export your LiveJournal blog data
 2 | 
 3 | [Livejournal provides a method to export your posts as 
 4 | XML](http://www.livejournal.com/export.bml). However 
 5 | this has to be done manually for every month of your blog. 
 6 | Also [comments are exported separately](http://www.livejournal.com/developer/exporting.bml).
 7 | I wrote this tool to make exporting more convenient.
 8 | 
 9 | You will need Python 3 to use it.
10 | 
11 | ## export.py
12 | 
13 | This script will do the exporting. You will end up with
14 | full blog contents in several formats. `posts-html` folder
15 | will contain basic HTML of posts and comments.
16 | `posts-markdown` will contain posts in Markdown format
17 | with HTML comments and metadata necessary to
18 | [generate a static blog with Pelican](http://docs.getpelican.com/).
19 | `posts-json` will contain posts with nested comments 
20 | in JSON format should you want to process them further.
21 | 
22 | This version of the script does not require you to make any
23 | modifications prior to running it. It will prompt you for
24 | the range of months you want to pull, then will ask for your
25 | LiveJournal username and password. It will use that to 
26 | acquire the required session cookies. After this, the
27 | download process will begin.
28 | 
29 | ## download_posts.py
30 | 
31 | This script will download your posts in XML into `posts-xml` 
32 | folder. Also it will create `posts-json/all.json` file with
33 | the same data in JSON format for convenient processing.
34 | 
35 | ## download_comments.py
36 | 
37 | This script will download comments from your blog as 
38 | `comments-xml/*.xml` files. Also it will create
39 | `comments-json/all.json` with all the comments data in
40 | JSON format for convenient processing.
41 | 
42 | ## Requirements
43 | 
44 | * `html2text`
45 | * `markdown`
46 | * `beautifulsoup4`
47 | * `requests`
48 | * `lxml`
49 | 
50 | ## Processing exported data separately
51 | 
52 | In the last lines of `export.py` there's a condition `if True:`.
53 | Change `True` to `False` to skip the downloading step and go
54 | directly to the processing of already downloaded data.
55 | 
56 | 


--------------------------------------------------------------------------------
/download_comments.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python3
 2 | 
 3 | import os
 4 | import json
 5 | import requests
 6 | import xml.etree.ElementTree as ET
 7 | 
 8 | def fetch_xml(params, cookies, headers):
 9 |     response = requests.get(
10 |         'https://www.livejournal.com/export_comments.bml',
11 |         params=params,
12 |         headers=headers,
13 |         cookies=cookies
14 |     )
15 | 
16 |     return response.text
17 | 
18 | 
19 | def get_users_map(xml):
20 |     users = {}
21 | 
22 |     for user in xml.iter('usermap'):
23 |         users[user.attrib['id']] = user.attrib['user']
24 | 
25 |     with open('comments-json/usermap.json', 'w', encoding='utf-8') as f:
26 |         f.write(json.dumps(users, ensure_ascii=False, indent=2))
27 | 
28 |     return users
29 | 
30 | 
31 | def get_comment_property(name, comment_xml, comment):
32 |     if name in comment_xml.attrib:
33 |         comment[name] = int(comment_xml.attrib[name])
34 | 
35 | 
36 | def get_comment_element(name, comment_xml, comment):
37 |     elements = comment_xml.findall(name)
38 |     if len(elements) > 0:
39 |         comment[name] = elements[0].text
40 | 
41 | 
42 | def get_more_comments(start_id, users, cookies, headers):
43 |     comments = []
44 |     local_max_id = -1
45 | 
46 |     xml = fetch_xml({'get': 'comment_body', 'startid': start_id}, cookies, headers)
47 |     with open('comments-xml/comment_body-{0}.xml'.format(start_id), 'w', encoding='utf-8') as f:
48 |         f.write(xml)
49 | 
50 |     for comment_xml in ET.fromstring(xml).iter('comment'):
51 |         comment = {
52 |             'jitemid': int(comment_xml.attrib['jitemid']),
53 |             'id': int(comment_xml.attrib['id']),
54 |             'children': []
55 |         }
56 |         get_comment_property('parentid', comment_xml, comment)
57 |         get_comment_property('posterid', comment_xml, comment)
58 |         get_comment_element('date', comment_xml, comment)
59 |         get_comment_element('subject', comment_xml, comment)
60 |         get_comment_element('body', comment_xml, comment)
61 | 
62 |         if 'state' in comment_xml.attrib:
63 |             comment['state'] = comment_xml.attrib['state']
64 | 
65 |         if 'posterid' in comment:
66 |             comment['author'] = users.get(str(comment['posterid']), "deleted-user")
67 | 
68 |         local_max_id = max(local_max_id, comment['id'])
69 |         comments.append(comment)
70 | 
71 |     return local_max_id, comments
72 | 
73 | def download_comments(cookies, headers):
74 |     os.makedirs('comments-xml', exist_ok=True)
75 |     os.makedirs('comments-json', exist_ok=True)
76 | 
77 |     metadata_xml = fetch_xml({'get': 'comment_meta', 'startid': 0}, cookies, headers)
78 |     with open('comments-xml/comment_meta.xml', 'w', encoding='utf-8') as f:
79 |         f.write(metadata_xml)
80 | 
81 |     metadata = ET.fromstring(metadata_xml)
82 |     users = get_users_map(metadata)
83 | 
84 |     all_comments = []
85 |     start_id = 0
86 |     max_id = int(metadata.find('maxid').text)
87 |     while start_id < max_id:
88 |         start_id, comments = get_more_comments(start_id + 1, users, cookies, headers)
89 |         all_comments.extend(comments)
90 | 
91 |     with open('comments-json/all.json', 'w', encoding='utf-8') as f:
92 |         f.write(json.dumps(all_comments, ensure_ascii=False, indent=2))
93 | 
94 |     return all_comments
95 | 
96 | 
97 | if __name__ == '__main__':
98 |     download_comments()
99 | 


--------------------------------------------------------------------------------
/download_posts.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python3
 2 | 
 3 | import json
 4 | import os
 5 | import requests
 6 | from sys import exit as sysexit
 7 | import xml.etree.ElementTree as ET
 8 | from datetime import datetime, timedelta
 9 | from dateutil.relativedelta import relativedelta
10 | 
11 | DATE_FORMAT = '%Y-%m'
12 | 
13 | try:
14 |     start_month = datetime.strptime(input("Enter start month in YYYY-MM format: "), DATE_FORMAT)
15 | except Exception as e:
16 |     print(f"\nError with start month entered. Error: {e}. Exiting...")
17 |     sysexit(1)
18 | 
19 | try:
20 |     end_month = datetime.strptime(input("Enter end month in YYYY-MM format: "), DATE_FORMAT)
21 | except Exception as e:
22 |     print(f"\nError with end month entered. Error: {e}. Exiting...")
23 |     sysexit(1)
24 | 
25 | 
26 | def fetch_month_posts(year, month, cookies, headers):
27 |     response = requests.post(
28 |         'https://www.livejournal.com/export_do.bml',
29 |         headers=headers,
30 |         cookies=cookies,
31 |         data={
32 |             'what': 'journal',
33 |             'year': year,
34 |             'month': '{0:02d}'.format(month),
35 |             'format': 'xml',
36 |             'header': 'on',
37 |             'encid': '2',
38 |             'field_itemid': 'on',
39 |             'field_eventtime': 'on',
40 |             'field_logtime': 'on',
41 |             'field_subject': 'on',
42 |             'field_event': 'on',
43 |             'field_security': 'on',
44 |             'field_allowmask': 'on',
45 |             'field_currents': 'on'
46 |         }
47 |     )
48 |     return response.text
49 | 
50 | def xml_to_json(xml):
51 |     def f(field):
52 |         return xml.find(field).text
53 | 
54 |     return {
55 |         'id': f('itemid'),
56 |         'date': f('logtime'),
57 |         'subject': f('subject') or '',
58 |         'body': f('event'),
59 |         'eventtime': f('eventtime'),
60 |         'security': f('security'),
61 |         'allowmask': f('allowmask'),
62 |         'current_music': f('current_music'),
63 |         'current_mood': f('current_mood')
64 |     }
65 | 
66 | def download_posts(cookies, headers):
67 |     os.makedirs('posts-xml', exist_ok=True)
68 |     os.makedirs('posts-json', exist_ok=True)
69 | 
70 |     xml_posts = []
71 |     month_cursor = start_month
72 | 
73 |     while month_cursor <= end_month:
74 |         year = month_cursor.year
75 |         month = month_cursor.month
76 | 
77 |         xml = fetch_month_posts(year, month, cookies, headers)
78 |         xml_posts.extend(list(ET.fromstring(xml).iter('entry')))
79 | 
80 |         with open('posts-xml/{0}-{1:02d}.xml'.format(year, month), 'w+', encoding='utf-8') as file:
81 |             file.write(xml)
82 |         
83 |         month_cursor = month_cursor + relativedelta(months=1)  
84 | 
85 |     json_posts = list(map(xml_to_json, xml_posts))
86 |     with open('posts-json/all.json', 'w', encoding='utf-8') as f:
87 |         f.write(json.dumps(json_posts, ensure_ascii=False, indent=2))
88 | 
89 |     return json_posts
90 | 
91 | if __name__ == '__main__':
92 |     download_posts()
93 | 


--------------------------------------------------------------------------------
/export.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/python3
  2 | 
  3 | import os
  4 | import json
  5 | import re 
  6 | import html2text
  7 | import requests 
  8 | from sys import exit as sysexit
  9 | from bs4 import BeautifulSoup
 10 | from getpass import getpass
 11 | from datetime import datetime
 12 | from markdown import markdown
 13 | from operator import itemgetter
 14 | from download_posts import download_posts
 15 | from download_comments import download_comments
 16 | 
 17 | def get_cookie_value(response, cName):
 18 |     try:
 19 |         header = response.headers.get('Set-Cookie')
 20 | 
 21 |         if header:
 22 |             return header.split(f'{cName}=')[1].split(';')[0]
 23 |         else:
 24 |             raise ValueError(f'Cookie {cName} not found in response.')
 25 | 
 26 |     except Exception as e:
 27 |         print(f"Error extracting required cookie: {cName}. Error: {e}. Exiting...")
 28 |         sysexit(1)
 29 | 
 30 | 
 31 | # Generic headers to prevent LiveJournal from throwing out this random solicitation
 32 | headers = {
 33 |     "Upgrade-Insecure-Requests": "1",
 34 |     "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 OPR/113.0.0.0",
 35 |     "sec-ch-ua": '"Chromium";v="127"',
 36 |     "sec-ch-ua-platform": '"Windows"',
 37 | }
 38 | 
 39 | 
 40 | # Get a "luid" cookie so it'll accept our form login.
 41 | try:
 42 |     response = requests.get("https://www.livejournal.com/", headers=headers)
 43 | except Exception as e:
 44 |     # If attempt to reach LiveJournal fails, error out.
 45 |     print(f"Could not retrieve pre-connection cookie from www.livejournal.com. Error: {e}. Exiting.")
 46 |     sysexit(1)
 47 | 
 48 | cookies = {
 49 |     'luid': get_cookie_value(response, 'luid')
 50 | }
 51 | 
 52 | # Populate dictionary for request
 53 | credentials = {
 54 |     'user': input("Enter LiveJournal Username: "),
 55 |     'password': getpass("Enter LiveJournal Password: ")
 56 | }
 57 | 
 58 | # Login with user credentials and retrieve the two cookies required for the main script functions
 59 | response = requests.post("https://www.livejournal.com/login.bml", data=credentials, cookies=cookies)
 60 | 
 61 | # If not successful, whine about it.
 62 | if response.status_code != 200:
 63 |     print("Error - Return code:", response.status_code)
 64 | 
 65 | # If successful, then get the 'Set-Cookie' key from the headers dict and parse it for the two cookies, placing them in a cookies dict
 66 | cookies = {
 67 |     'ljloggedin': get_cookie_value(response, 'ljloggedin'),
 68 |     'ljmastersession': get_cookie_value(response, 'ljmastersession')
 69 | }
 70 | 
 71 | # Credit to the Author!
 72 | headers = {
 73 |     'User-Agent': 'https://github.com/arty-name/livejournal-export; me@arty.name'
 74 | }
 75 | 
 76 | # Now that we have the cookies, notify the user that we'll grab the LJ posts and comments
 77 | print("Login successful. Downloading posts and comments.")
 78 | print("When complete, you will find post-... and comment-... folders in the current location\ncontaining the differently formated versions of your content.")
 79 | 
 80 | COMMENTS_HEADER = 'Комментарии'
 81 | 
 82 | TAG = re.compile(r'\[!\[(.*?)\]\(http:\/\/utx.ambience.ru\/img\/.*?\)\]\(.*?\)')
 83 | USER = re.compile(r'<lj user="?(.*?)"?>')
 84 | TAGLESS_NEWLINES = re.compile(r'(?<!>)\n')
 85 | NEWLINES = re.compile(r'(\s*\n){3,}')
 86 | 
 87 | SLUGS = {}
 88 | 
 89 | # TODO: lj-cut
 90 | 
 91 | 
 92 | def fix_user_links(json):
 93 |     """ replace user links with usernames """
 94 |     if 'subject' in json:
 95 |         json['subject'] = USER.sub(r'\1', json['subject'])
 96 | 
 97 |     if 'body' in json:
 98 |         json['body'] = USER.sub(r'\1', json['body'])
 99 | 
100 | 
101 | def json_to_html(json):
102 |     return """<!doctype html>
103 | <meta charset="utf-8">
104 | <title>{subject}</title>
105 | <article>
106 | <h1>{subject}</h1>
107 | {body}
108 | </article>
109 | """.format(
110 |         subject=json['subject'] or json['date'],
111 |         body=TAGLESS_NEWLINES.sub('<br>\n', json['body'])
112 |     )
113 | 
114 | 
115 | def get_slug(json):
116 |     slug = json['subject']
117 |     if not len(slug):
118 |         slug = json['id']
119 | 
120 |     if '<' in slug or '&' in slug:
121 |         slug = BeautifulSoup('<p>{0}</p>'.format(slug), features='lxml').text
122 | 
123 |     slug = re.compile(r'\W+').sub('-', slug)
124 |     slug = re.compile(r'^-|-$').sub('', slug)
125 | 
126 |     if slug in SLUGS:
127 |         slug += (len(slug) and '-' or '') + json['id']
128 | 
129 |     SLUGS[slug] = True
130 | 
131 |     return slug
132 | 
133 | 
134 | def json_to_markdown(json):
135 |     body = TAGLESS_NEWLINES.sub('<br>', json['body'])
136 | 
137 |     h = html2text.HTML2Text()
138 |     h.body_width = 0
139 |     h.unicode_snob = True
140 |     body = h.handle(body)
141 |     body = NEWLINES.sub('\n\n', body)
142 | 
143 |     # read UTX tags
144 |     tags = TAG.findall(body)
145 |     json['tags'] = len(tags) and '\ntags: {0}'.format(', '.join(tags)) or ''
146 | 
147 |     # remove UTX tags from text
148 |     json['body'] = TAG.sub('', body).strip()
149 | 
150 |     json['slug'] = get_slug(json)
151 |     json['subject'] = json['subject'] or json['date']
152 | 
153 |     return """id: {id}
154 | title: {subject}
155 | slug: {slug}
156 | date: {date}{tags}
157 | 
158 | {body}
159 | """.format(**json)
160 | 
161 | def group_comments_by_post(comments):
162 |     posts = {}
163 | 
164 |     for comment in comments:
165 |         post_id = comment['jitemid']
166 | 
167 |         if post_id not in posts:
168 |             posts[post_id] = {}
169 | 
170 |         post = posts[post_id]
171 |         post[comment['id']] = comment
172 | 
173 |     return posts
174 | 
175 | 
176 | def nest_comments(comments):
177 |     post = []
178 | 
179 |     for comment in comments.values():
180 |         fix_user_links(comment)
181 | 
182 |         if 'parentid' not in comment:
183 |             post.append(comment)
184 |         else:
185 |             comments[comment['parentid']]['children'].append(comment)
186 | 
187 |     return post
188 | 
189 | 
190 | def comment_to_li(comment):
191 |     if 'state' in comment and comment['state'] == 'D':
192 |         return ''
193 | 
194 |     html = '<h3>{0}: {1}</h3>'.format(comment.get('author', 'anonym'), comment.get('subject', ''))
195 |     html += '\n<a id="comment-{0}"></a>'.format(comment['id'])
196 | 
197 |     if 'body' in comment:
198 |         html += '\n' + markdown(TAGLESS_NEWLINES.sub('<br>\n', comment['body']))
199 | 
200 |     if len(comment['children']) > 0:
201 |         html += '\n' + comments_to_html(comment['children'])
202 | 
203 |     subject_class = 'subject' in comment and ' class=subject' or ''
204 |     return '<li{0}>{1}\n</li>'.format(subject_class, html)
205 | 
206 | 
207 | def comments_to_html(comments):
208 |     return '<ul>\n{0}\n</ul>'.format('\n'.join(map(comment_to_li, sorted(comments, key=itemgetter('id')))))
209 | 
210 | 
211 | def save_as_json(id, json_post, post_comments):
212 |     json_data = {'id': id, 'post': json_post, 'comments': post_comments}
213 |     with open('posts-json/{0}.json'.format(id), 'w', encoding='utf-8') as f:
214 |         f.write(json.dumps(json_data, ensure_ascii=False, indent=2))
215 | 
216 | 
217 | def save_as_markdown(id, subfolder, json_post, post_comments_html):
218 |     os.makedirs('posts-markdown/{0}'.format(subfolder), exist_ok=True)
219 |     with open('posts-markdown/{0}/{1}.md'.format(subfolder, id), 'w', encoding='utf-8') as f:
220 |         f.write(json_to_markdown(json_post))
221 |     if post_comments_html:
222 |         with open('comments-markdown/{0}.md'.format(json_post['slug']), 'w', encoding='utf-8') as f:
223 |             f.write(post_comments_html)
224 | 
225 | 
226 | def save_as_html(id, subfolder, json_post, post_comments_html):
227 |     os.makedirs('posts-html/{0}'.format(subfolder), exist_ok=True)
228 |     with open('posts-html/{0}/{1}.html'.format(subfolder, id), 'w', encoding='utf-8') as f:
229 |         f.writelines(json_to_html(json_post))
230 |         if post_comments_html:
231 |             f.write('\n<h2>{0}</h2>\n'.format(COMMENTS_HEADER) + post_comments_html)
232 | 
233 | 
234 | def combine(all_posts, all_comments):
235 |     os.makedirs('posts-html', exist_ok=True)
236 |     os.makedirs('posts-markdown', exist_ok=True)
237 |     os.makedirs('comments-markdown', exist_ok=True)
238 | 
239 |     posts_comments = group_comments_by_post(all_comments)
240 | 
241 |     for json_post in all_posts:
242 |         id = json_post['id']
243 |         jitemid = int(id) >> 8
244 | 
245 |         date = datetime.strptime(json_post['date'], '%Y-%m-%d %H:%M:%S')
246 |         subfolder = '{0.year}-{0.month:02d}'.format(date)
247 | 
248 |         post_comments = jitemid in posts_comments and nest_comments(posts_comments[jitemid]) or None
249 |         post_comments_html = post_comments and comments_to_html(post_comments) or ''
250 | 
251 |         fix_user_links(json_post)
252 | 
253 |         save_as_json(id, json_post, post_comments)
254 |         save_as_html(id, subfolder, json_post, post_comments_html)
255 |         save_as_markdown(id, subfolder, json_post, post_comments_html)
256 | 
257 | 
258 | if __name__ == '__main__':
259 |     if True:
260 |         all_posts = download_posts(cookies, headers)
261 |         all_comments = download_comments(cookies, headers)
262 | 
263 |     else:
264 |         with open('posts-json/all.json', 'r', encoding='utf-8') as f:
265 |             all_posts = json.load(f)
266 |         with open('comments-json/all.json', 'r', encoding='utf-8') as f:
267 |             all_comments = json.load(f)
268 | 
269 |     combine(all_posts, all_comments)
270 | 


--------------------------------------------------------------------------------