├── .gitignore ├── README.md ├── download_comments.py ├── download_posts.py └── export.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__/ 2 | *.py[cod] 3 | .idea 4 | comments-json 5 | comments-xml 6 | comments-markdown 7 | posts-xml 8 | posts-json 9 | posts-markdown 10 | posts-html 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Export your LiveJournal blog data 2 | 3 | [Livejournal provides a method to export your posts as 4 | XML](http://www.livejournal.com/export.bml). However 5 | this has to be done manually for every month of your blog. 6 | Also [comments are exported separately](http://www.livejournal.com/developer/exporting.bml). 7 | I wrote this tool to make exporting more convenient. 8 | 9 | You will need Python 3 to use it. 10 | 11 | ## export.py 12 | 13 | This script will do the exporting. You will end up with 14 | full blog contents in several formats. `posts-html` folder 15 | will contain basic HTML of posts and comments. 16 | `posts-markdown` will contain posts in Markdown format 17 | with HTML comments and metadata necessary to 18 | [generate a static blog with Pelican](http://docs.getpelican.com/). 19 | `posts-json` will contain posts with nested comments 20 | in JSON format should you want to process them further. 21 | 22 | This version of the script does not require you to make any 23 | modifications prior to running it. It will prompt you for 24 | the range of months you want to pull, then will ask for your 25 | LiveJournal username and password. It will use that to 26 | acquire the required session cookies. After this, the 27 | download process will begin. 28 | 29 | ## download_posts.py 30 | 31 | This script will download your posts in XML into `posts-xml` 32 | folder. Also it will create `posts-json/all.json` file with 33 | the same data in JSON format for convenient processing. 34 | 35 | ## download_comments.py 36 | 37 | This script will download comments from your blog as 38 | `comments-xml/*.xml` files. Also it will create 39 | `comments-json/all.json` with all the comments data in 40 | JSON format for convenient processing. 41 | 42 | ## Requirements 43 | 44 | * `html2text` 45 | * `markdown` 46 | * `beautifulsoup4` 47 | * `requests` 48 | * `lxml` 49 | 50 | ## Processing exported data separately 51 | 52 | In the last lines of `export.py` there's a condition `if True:`. 53 | Change `True` to `False` to skip the downloading step and go 54 | directly to the processing of already downloaded data. 55 | 56 | -------------------------------------------------------------------------------- /download_comments.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import os 4 | import json 5 | import requests 6 | import xml.etree.ElementTree as ET 7 | 8 | def fetch_xml(params, cookies, headers): 9 | response = requests.get( 10 | 'https://www.livejournal.com/export_comments.bml', 11 | params=params, 12 | headers=headers, 13 | cookies=cookies 14 | ) 15 | 16 | return response.text 17 | 18 | 19 | def get_users_map(xml): 20 | users = {} 21 | 22 | for user in xml.iter('usermap'): 23 | users[user.attrib['id']] = user.attrib['user'] 24 | 25 | with open('comments-json/usermap.json', 'w', encoding='utf-8') as f: 26 | f.write(json.dumps(users, ensure_ascii=False, indent=2)) 27 | 28 | return users 29 | 30 | 31 | def get_comment_property(name, comment_xml, comment): 32 | if name in comment_xml.attrib: 33 | comment[name] = int(comment_xml.attrib[name]) 34 | 35 | 36 | def get_comment_element(name, comment_xml, comment): 37 | elements = comment_xml.findall(name) 38 | if len(elements) > 0: 39 | comment[name] = elements[0].text 40 | 41 | 42 | def get_more_comments(start_id, users, cookies, headers): 43 | comments = [] 44 | local_max_id = -1 45 | 46 | xml = fetch_xml({'get': 'comment_body', 'startid': start_id}, cookies, headers) 47 | with open('comments-xml/comment_body-{0}.xml'.format(start_id), 'w', encoding='utf-8') as f: 48 | f.write(xml) 49 | 50 | for comment_xml in ET.fromstring(xml).iter('comment'): 51 | comment = { 52 | 'jitemid': int(comment_xml.attrib['jitemid']), 53 | 'id': int(comment_xml.attrib['id']), 54 | 'children': [] 55 | } 56 | get_comment_property('parentid', comment_xml, comment) 57 | get_comment_property('posterid', comment_xml, comment) 58 | get_comment_element('date', comment_xml, comment) 59 | get_comment_element('subject', comment_xml, comment) 60 | get_comment_element('body', comment_xml, comment) 61 | 62 | if 'state' in comment_xml.attrib: 63 | comment['state'] = comment_xml.attrib['state'] 64 | 65 | if 'posterid' in comment: 66 | comment['author'] = users.get(str(comment['posterid']), "deleted-user") 67 | 68 | local_max_id = max(local_max_id, comment['id']) 69 | comments.append(comment) 70 | 71 | return local_max_id, comments 72 | 73 | def download_comments(cookies, headers): 74 | os.makedirs('comments-xml', exist_ok=True) 75 | os.makedirs('comments-json', exist_ok=True) 76 | 77 | metadata_xml = fetch_xml({'get': 'comment_meta', 'startid': 0}, cookies, headers) 78 | with open('comments-xml/comment_meta.xml', 'w', encoding='utf-8') as f: 79 | f.write(metadata_xml) 80 | 81 | metadata = ET.fromstring(metadata_xml) 82 | users = get_users_map(metadata) 83 | 84 | all_comments = [] 85 | start_id = 0 86 | max_id = int(metadata.find('maxid').text) 87 | while start_id < max_id: 88 | start_id, comments = get_more_comments(start_id + 1, users, cookies, headers) 89 | all_comments.extend(comments) 90 | 91 | with open('comments-json/all.json', 'w', encoding='utf-8') as f: 92 | f.write(json.dumps(all_comments, ensure_ascii=False, indent=2)) 93 | 94 | return all_comments 95 | 96 | 97 | if __name__ == '__main__': 98 | download_comments() 99 | -------------------------------------------------------------------------------- /download_posts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import json 4 | import os 5 | import requests 6 | from sys import exit as sysexit 7 | import xml.etree.ElementTree as ET 8 | from datetime import datetime, timedelta 9 | from dateutil.relativedelta import relativedelta 10 | 11 | DATE_FORMAT = '%Y-%m' 12 | 13 | try: 14 | start_month = datetime.strptime(input("Enter start month in YYYY-MM format: "), DATE_FORMAT) 15 | except Exception as e: 16 | print(f"\nError with start month entered. Error: {e}. Exiting...") 17 | sysexit(1) 18 | 19 | try: 20 | end_month = datetime.strptime(input("Enter end month in YYYY-MM format: "), DATE_FORMAT) 21 | except Exception as e: 22 | print(f"\nError with end month entered. Error: {e}. Exiting...") 23 | sysexit(1) 24 | 25 | 26 | def fetch_month_posts(year, month, cookies, headers): 27 | response = requests.post( 28 | 'https://www.livejournal.com/export_do.bml', 29 | headers=headers, 30 | cookies=cookies, 31 | data={ 32 | 'what': 'journal', 33 | 'year': year, 34 | 'month': '{0:02d}'.format(month), 35 | 'format': 'xml', 36 | 'header': 'on', 37 | 'encid': '2', 38 | 'field_itemid': 'on', 39 | 'field_eventtime': 'on', 40 | 'field_logtime': 'on', 41 | 'field_subject': 'on', 42 | 'field_event': 'on', 43 | 'field_security': 'on', 44 | 'field_allowmask': 'on', 45 | 'field_currents': 'on' 46 | } 47 | ) 48 | return response.text 49 | 50 | def xml_to_json(xml): 51 | def f(field): 52 | return xml.find(field).text 53 | 54 | return { 55 | 'id': f('itemid'), 56 | 'date': f('logtime'), 57 | 'subject': f('subject') or '', 58 | 'body': f('event'), 59 | 'eventtime': f('eventtime'), 60 | 'security': f('security'), 61 | 'allowmask': f('allowmask'), 62 | 'current_music': f('current_music'), 63 | 'current_mood': f('current_mood') 64 | } 65 | 66 | def download_posts(cookies, headers): 67 | os.makedirs('posts-xml', exist_ok=True) 68 | os.makedirs('posts-json', exist_ok=True) 69 | 70 | xml_posts = [] 71 | month_cursor = start_month 72 | 73 | while month_cursor <= end_month: 74 | year = month_cursor.year 75 | month = month_cursor.month 76 | 77 | xml = fetch_month_posts(year, month, cookies, headers) 78 | xml_posts.extend(list(ET.fromstring(xml).iter('entry'))) 79 | 80 | with open('posts-xml/{0}-{1:02d}.xml'.format(year, month), 'w+', encoding='utf-8') as file: 81 | file.write(xml) 82 | 83 | month_cursor = month_cursor + relativedelta(months=1) 84 | 85 | json_posts = list(map(xml_to_json, xml_posts)) 86 | with open('posts-json/all.json', 'w', encoding='utf-8') as f: 87 | f.write(json.dumps(json_posts, ensure_ascii=False, indent=2)) 88 | 89 | return json_posts 90 | 91 | if __name__ == '__main__': 92 | download_posts() 93 | -------------------------------------------------------------------------------- /export.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python3 2 | 3 | import os 4 | import json 5 | import re 6 | import html2text 7 | import requests 8 | from sys import exit as sysexit 9 | from bs4 import BeautifulSoup 10 | from getpass import getpass 11 | from datetime import datetime 12 | from markdown import markdown 13 | from operator import itemgetter 14 | from download_posts import download_posts 15 | from download_comments import download_comments 16 | 17 | def get_cookie_value(response, cName): 18 | try: 19 | header = response.headers.get('Set-Cookie') 20 | 21 | if header: 22 | return header.split(f'{cName}=')[1].split(';')[0] 23 | else: 24 | raise ValueError(f'Cookie {cName} not found in response.') 25 | 26 | except Exception as e: 27 | print(f"Error extracting required cookie: {cName}. Error: {e}. Exiting...") 28 | sysexit(1) 29 | 30 | 31 | # Generic headers to prevent LiveJournal from throwing out this random solicitation 32 | headers = { 33 | "Upgrade-Insecure-Requests": "1", 34 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 OPR/113.0.0.0", 35 | "sec-ch-ua": '"Chromium";v="127"', 36 | "sec-ch-ua-platform": '"Windows"', 37 | } 38 | 39 | 40 | # Get a "luid" cookie so it'll accept our form login. 41 | try: 42 | response = requests.get("https://www.livejournal.com/", headers=headers) 43 | except Exception as e: 44 | # If attempt to reach LiveJournal fails, error out. 45 | print(f"Could not retrieve pre-connection cookie from www.livejournal.com. Error: {e}. Exiting.") 46 | sysexit(1) 47 | 48 | cookies = { 49 | 'luid': get_cookie_value(response, 'luid') 50 | } 51 | 52 | # Populate dictionary for request 53 | credentials = { 54 | 'user': input("Enter LiveJournal Username: "), 55 | 'password': getpass("Enter LiveJournal Password: ") 56 | } 57 | 58 | # Login with user credentials and retrieve the two cookies required for the main script functions 59 | response = requests.post("https://www.livejournal.com/login.bml", data=credentials, cookies=cookies) 60 | 61 | # If not successful, whine about it. 62 | if response.status_code != 200: 63 | print("Error - Return code:", response.status_code) 64 | 65 | # If successful, then get the 'Set-Cookie' key from the headers dict and parse it for the two cookies, placing them in a cookies dict 66 | cookies = { 67 | 'ljloggedin': get_cookie_value(response, 'ljloggedin'), 68 | 'ljmastersession': get_cookie_value(response, 'ljmastersession') 69 | } 70 | 71 | # Credit to the Author! 72 | headers = { 73 | 'User-Agent': 'https://github.com/arty-name/livejournal-export; me@arty.name' 74 | } 75 | 76 | # Now that we have the cookies, notify the user that we'll grab the LJ posts and comments 77 | print("Login successful. Downloading posts and comments.") 78 | print("When complete, you will find post-... and comment-... folders in the current location\ncontaining the differently formated versions of your content.") 79 | 80 | COMMENTS_HEADER = 'Комментарии' 81 | 82 | TAG = re.compile(r'\[!\[(.*?)\]\(http:\/\/utx.ambience.ru\/img\/.*?\)\]\(.*?\)') 83 | USER = re.compile(r'') 84 | TAGLESS_NEWLINES = re.compile(r'(?)\n') 85 | NEWLINES = re.compile(r'(\s*\n){3,}') 86 | 87 | SLUGS = {} 88 | 89 | # TODO: lj-cut 90 | 91 | 92 | def fix_user_links(json): 93 | """ replace user links with usernames """ 94 | if 'subject' in json: 95 | json['subject'] = USER.sub(r'\1', json['subject']) 96 | 97 | if 'body' in json: 98 | json['body'] = USER.sub(r'\1', json['body']) 99 | 100 | 101 | def json_to_html(json): 102 | return """ 103 | 104 | {subject} 105 |
106 |

{subject}

107 | {body} 108 |
109 | """.format( 110 | subject=json['subject'] or json['date'], 111 | body=TAGLESS_NEWLINES.sub('
\n', json['body']) 112 | ) 113 | 114 | 115 | def get_slug(json): 116 | slug = json['subject'] 117 | if not len(slug): 118 | slug = json['id'] 119 | 120 | if '<' in slug or '&' in slug: 121 | slug = BeautifulSoup('

{0}

'.format(slug), features='lxml').text 122 | 123 | slug = re.compile(r'\W+').sub('-', slug) 124 | slug = re.compile(r'^-|-$').sub('', slug) 125 | 126 | if slug in SLUGS: 127 | slug += (len(slug) and '-' or '') + json['id'] 128 | 129 | SLUGS[slug] = True 130 | 131 | return slug 132 | 133 | 134 | def json_to_markdown(json): 135 | body = TAGLESS_NEWLINES.sub('
', json['body']) 136 | 137 | h = html2text.HTML2Text() 138 | h.body_width = 0 139 | h.unicode_snob = True 140 | body = h.handle(body) 141 | body = NEWLINES.sub('\n\n', body) 142 | 143 | # read UTX tags 144 | tags = TAG.findall(body) 145 | json['tags'] = len(tags) and '\ntags: {0}'.format(', '.join(tags)) or '' 146 | 147 | # remove UTX tags from text 148 | json['body'] = TAG.sub('', body).strip() 149 | 150 | json['slug'] = get_slug(json) 151 | json['subject'] = json['subject'] or json['date'] 152 | 153 | return """id: {id} 154 | title: {subject} 155 | slug: {slug} 156 | date: {date}{tags} 157 | 158 | {body} 159 | """.format(**json) 160 | 161 | def group_comments_by_post(comments): 162 | posts = {} 163 | 164 | for comment in comments: 165 | post_id = comment['jitemid'] 166 | 167 | if post_id not in posts: 168 | posts[post_id] = {} 169 | 170 | post = posts[post_id] 171 | post[comment['id']] = comment 172 | 173 | return posts 174 | 175 | 176 | def nest_comments(comments): 177 | post = [] 178 | 179 | for comment in comments.values(): 180 | fix_user_links(comment) 181 | 182 | if 'parentid' not in comment: 183 | post.append(comment) 184 | else: 185 | comments[comment['parentid']]['children'].append(comment) 186 | 187 | return post 188 | 189 | 190 | def comment_to_li(comment): 191 | if 'state' in comment and comment['state'] == 'D': 192 | return '' 193 | 194 | html = '

{0}: {1}

'.format(comment.get('author', 'anonym'), comment.get('subject', '')) 195 | html += '\n'.format(comment['id']) 196 | 197 | if 'body' in comment: 198 | html += '\n' + markdown(TAGLESS_NEWLINES.sub('
\n', comment['body'])) 199 | 200 | if len(comment['children']) > 0: 201 | html += '\n' + comments_to_html(comment['children']) 202 | 203 | subject_class = 'subject' in comment and ' class=subject' or '' 204 | return '{1}\n'.format(subject_class, html) 205 | 206 | 207 | def comments_to_html(comments): 208 | return ''.format('\n'.join(map(comment_to_li, sorted(comments, key=itemgetter('id'))))) 209 | 210 | 211 | def save_as_json(id, json_post, post_comments): 212 | json_data = {'id': id, 'post': json_post, 'comments': post_comments} 213 | with open('posts-json/{0}.json'.format(id), 'w', encoding='utf-8') as f: 214 | f.write(json.dumps(json_data, ensure_ascii=False, indent=2)) 215 | 216 | 217 | def save_as_markdown(id, subfolder, json_post, post_comments_html): 218 | os.makedirs('posts-markdown/{0}'.format(subfolder), exist_ok=True) 219 | with open('posts-markdown/{0}/{1}.md'.format(subfolder, id), 'w', encoding='utf-8') as f: 220 | f.write(json_to_markdown(json_post)) 221 | if post_comments_html: 222 | with open('comments-markdown/{0}.md'.format(json_post['slug']), 'w', encoding='utf-8') as f: 223 | f.write(post_comments_html) 224 | 225 | 226 | def save_as_html(id, subfolder, json_post, post_comments_html): 227 | os.makedirs('posts-html/{0}'.format(subfolder), exist_ok=True) 228 | with open('posts-html/{0}/{1}.html'.format(subfolder, id), 'w', encoding='utf-8') as f: 229 | f.writelines(json_to_html(json_post)) 230 | if post_comments_html: 231 | f.write('\n

{0}

\n'.format(COMMENTS_HEADER) + post_comments_html) 232 | 233 | 234 | def combine(all_posts, all_comments): 235 | os.makedirs('posts-html', exist_ok=True) 236 | os.makedirs('posts-markdown', exist_ok=True) 237 | os.makedirs('comments-markdown', exist_ok=True) 238 | 239 | posts_comments = group_comments_by_post(all_comments) 240 | 241 | for json_post in all_posts: 242 | id = json_post['id'] 243 | jitemid = int(id) >> 8 244 | 245 | date = datetime.strptime(json_post['date'], '%Y-%m-%d %H:%M:%S') 246 | subfolder = '{0.year}-{0.month:02d}'.format(date) 247 | 248 | post_comments = jitemid in posts_comments and nest_comments(posts_comments[jitemid]) or None 249 | post_comments_html = post_comments and comments_to_html(post_comments) or '' 250 | 251 | fix_user_links(json_post) 252 | 253 | save_as_json(id, json_post, post_comments) 254 | save_as_html(id, subfolder, json_post, post_comments_html) 255 | save_as_markdown(id, subfolder, json_post, post_comments_html) 256 | 257 | 258 | if __name__ == '__main__': 259 | if True: 260 | all_posts = download_posts(cookies, headers) 261 | all_comments = download_comments(cookies, headers) 262 | 263 | else: 264 | with open('posts-json/all.json', 'r', encoding='utf-8') as f: 265 | all_posts = json.load(f) 266 | with open('comments-json/all.json', 'r', encoding='utf-8') as f: 267 | all_comments = json.load(f) 268 | 269 | combine(all_posts, all_comments) 270 | --------------------------------------------------------------------------------