├── .editorconfig ├── package.json ├── .gitignore ├── README.md └── index.js /.editorconfig: -------------------------------------------------------------------------------- 1 | # https://editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | charset = utf-8 7 | end_of_line = lf 8 | trim_trailing_whitespace = true 9 | 10 | [*.js] 11 | indent_size = 4 12 | indent_style = tabs 13 | insert_final_newline = true 14 | 15 | [*.md] 16 | trim_trailing_whitespace = false 17 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "blogger-to-md", 3 | "version": "1.0.0", 4 | "description": "Blogger backup to markdown", 5 | "main": "index.js", 6 | "repository": { 7 | "type": "git", 8 | "url": "git+https://github.com/palaniraja/blog2md.git" 9 | }, 10 | "scripts": { 11 | "test": "echo \"Error: no test specified\" && exit 1" 12 | }, 13 | "author": "palaniraja", 14 | "license": "ISC", 15 | "dependencies": { 16 | "moment": "^2.22.2", 17 | "sanitize-filename": "^1.6.3", 18 | "turndown": "^5.0.1", 19 | "xml2js": "^0.4.19" 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *blog*.xml 2 | *wordpress*.xml 3 | out 4 | 5 | 6 | 7 | # https://github.com/github/gitignore/blob/master/Node.gitignore 8 | 9 | # Logs 10 | logs 11 | *.log 12 | npm-debug.log* 13 | yarn-debug.log* 14 | yarn-error.log* 15 | 16 | # Runtime data 17 | pids 18 | *.pid 19 | *.seed 20 | *.pid.lock 21 | 22 | # Directory for instrumented libs generated by jscoverage/JSCover 23 | lib-cov 24 | 25 | # Coverage directory used by tools like istanbul 26 | coverage 27 | 28 | # nyc test coverage 29 | .nyc_output 30 | 31 | # Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) 32 | .grunt 33 | 34 | # Bower dependency 
directory (https://bower.io/) 35 | bower_components 36 | 37 | # node-waf configuration 38 | .lock-wscript 39 | 40 | # Compiled binary addons (https://nodejs.org/api/addons.html) 41 | build/Release 42 | 43 | # Dependency directories 44 | node_modules/ 45 | jspm_packages/ 46 | 47 | # TypeScript v1 declaration files 48 | typings/ 49 | 50 | # Optional npm cache directory 51 | .npm 52 | 53 | # Optional eslint cache 54 | .eslintcache 55 | 56 | # Optional REPL history 57 | .node_repl_history 58 | 59 | # Output of 'npm pack' 60 | *.tgz 61 | 62 | # Yarn Integrity file 63 | .yarn-integrity 64 | 65 | # dotenv environment variables file 66 | .env 67 | 68 | # parcel-bundler cache (https://parceljs.org/) 69 | .cache 70 | 71 | # next.js build output 72 | .next 73 | 74 | # nuxt.js build output 75 | .nuxt 76 | 77 | # vuepress build output 78 | .vuepress/dist 79 | 80 | # Serverless directories 81 | .serverless -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Blogger to Markdown 2 | 3 | Convert Blogger & WordPress backup blog posts to Hugo-compatible markdown documents 4 | 5 | 6 | Usage: `node index.js b|w <BACKUP_XML> <OUTPUT_DIR> [m|s] [paragraph-fix]` 7 | 8 | For Blogger imports, blog posts and comments (as a separate file, `<post-name>-comments.md`) will be created in the "`out`" directory 9 | 10 | ``` 11 | node index.js b your-blogger-backup-export.xml out 12 | ``` 13 | 14 | For WordPress imports, blog posts and comments (as a separate file, `<post-name>-comments.md`) will be created in the "`out`" directory 15 | 16 | ``` 17 | node index.js w your-wordpress-backup-export.xml out 18 | ``` 19 | 20 | If you want the comments to be merged into the post file itself, you can use the flag `m` at the end. 
Defaults to `s` for a separate comments file 21 | 22 | ``` 23 | node index.js w your-wordpress-backup-export.xml out m 24 | ``` 25 | 26 | If converting from WordPress, and you have posts that do not contain HTML, you can use a `paragraph-fix` flag at the end. 27 | 28 | ``` 29 | node index.js w your-wordpress-backup-export.xml out m paragraph-fix 30 | ``` 31 | 32 | ## Installation (usual node project) 33 | 34 | * Download or clone this project 35 | * `cd` to the project directory 36 | * Run `npm install` to install dependencies 37 | * Run `node index.js b|w <BACKUP_XML> <OUTPUT_DIR>` 38 | 39 | ## Notes to self 40 | 41 | Script to convert posts from Blogger to Markdown. 42 | 43 | - [x] Read XML 44 | - [x] Parse Entries (Posts and comments) (with xpath?) 45 | - [x] Parse Title, Link, Created, Updated, Content, Link 46 | - [ ] List Post & Respective comment counts 47 | - [x] Content to MD - pandoc? 48 | - [ ] Parse Images, Files, Videos linked to the posts 49 | - [x] Create output dir 50 | - [ ] List items that are not downloaded (or can't be) along with their .md file for the user to proceed 51 | 52 | 53 | ## Reasons 54 | 55 | * Wrote this to consolidate and convert my blogs under one roof. 56 | * Plain simple workflow with `hugo` 57 | * The idea was to download associated assets (images/files) linked to each post. Gave up, because it was time consuming and I would still need to validate the converted markdown against its assets anyway, and I don't see the benefit. 58 | * Initial assumption was to parse with `xpath` but I found `xml2js` was easier 59 | * Also thought `pandoc` was overkill and `turndown.js` was successful, though I had to wrap empty `text` to `md` instead of `html`. 60 | * I want to retain comments. Believe it or not, there were some **good** comments. 61 | * Was sick and spent around ~12 hrs over 5 days in coding and testing with my blog contents over ~150 posts. And also, I find parsing _oddly satisfying_ when it results in success. 
// NOTE(review): in the repository dump this block was extracted from, the same
// physical line also carried the tail of README.md ("`¯\_(ツ)_/¯`") and the
// "/index.js:" file separator. Everything below is index.js itself.
'use strict';

/**
 * blog2md - convert a Blogger or WordPress XML backup into markdown files
 * with a YAML front-matter header (title, date, draft, tags).
 *
 * Usage: blog2md b|w <BACKUP XML> <OUTPUT DIR> [m|s] [paragraph-fix]
 *
 *   b              parse a Blogger (Blogspot) backup
 *   w              parse a WordPress backup
 *   m | s          merge comments into the post file (m) or write them to a
 *                  separate "<post>-comments.md" file (s, the default)
 *   paragraph-fix  WordPress only: wrap blank-line separated text in <p> tags
 *                  when the post content contains no <p> tag
 *
 * NOTE(review): the HTML literals in this file ('<p>', '<div>', '<hr />') were
 * missing from the reviewed copy (stripped by an extraction step) and have been
 * reconstructed from the surrounding comments and code - confirm against the
 * canonical file.
 */

const fs = require('fs');
const os = require('os');
const path = require('path');
const xml2js = require('xml2js');
const sanitize = require('sanitize-filename');
const TurndownService = require('turndown');
const moment = require('moment');

const tds = new TurndownService({ codeBlockStyle: 'fenced', fence: '```' });

// Render every <pre> element as a fenced code block.
tds.addRule('wppreblock', {
    filter: ['pre'],
    replacement: function (content) {
        return '```\n' + content + '\n```';
    }
});

const option = process.argv[2];
const inputFile = process.argv[3];
const outputDir = process.argv[4];

// 'm' merges comments into the post file; anything else means a separate file.
const mergeComments = (process.argv[5] === 'm') ? 'm' : 's';
/** Apply a fix to WordPress posts to convert newlines to paragraphs. */
const applyParagraphFix = process.argv.indexOf('paragraph-fix') >= 0;

main();

/**
 * Validates the command line, prepares the output directory and dispatches to
 * the Blogger or WordPress importer. Sets a non-zero exit code on usage errors
 * (a top-level `return 1` does not affect the process exit status).
 */
function main() {
    if (process.argv.length < 5) {
        console.log('Usage: blog2md [b|w] <BACKUP XML> <OUTPUT DIR> [m|s] [paragraph-fix]');
        console.log(`\t b for parsing Blogger(Blogspot) backup`);
        console.log(`\t w for parsing WordPress backup`);
        process.exitCode = 1;
        return;
    }

    // Reject an unknown mode before touching the file system.
    const mode = option.toLowerCase();
    if (mode !== 'b' && mode !== 'w') {
        console.log('Only b (Blogger) and w (WordPress) are valid options');
        process.exitCode = 1;
        return;
    }

    if (fs.existsSync(outputDir)) {
        console.log(`WARNING: Given output directory "${outputDir}" already exists. Files will be overwritten.`);
    } else {
        fs.mkdirSync(outputDir);
    }

    if (mergeComments === 'm') {
        console.log(`INFO: Comments requested to be merged along with posts. (m)`);
    } else {
        console.log(`INFO: Comments requested to be a separate .md file(s - default)`);
    }

    if (mode === 'b') {
        bloggerImport(inputFile, outputDir);
    } else {
        wordpressImport(inputFile, outputDir);
    }
}

/**
 * Returns the first element of an xml2js child array, or `fallback` when the
 * element is absent or the value is not a non-empty array.
 */
function firstOf(values, fallback = '') {
    return (Array.isArray(values) && values.length) ? values[0] : fallback;
}

/**
 * Returns the text of an xml2js node: the node itself when it is a string, its
 * `_` property when the element carried attributes, '' otherwise.
 */
function textOf(node) {
    if (typeof node === 'string') {
        return node;
    }
    if (node && typeof node._ === 'string') {
        return node._;
    }
    return '';
}

/** Wraps a value in single quotes for YAML, doubling embedded single quotes. */
function yamlQuote(value) {
    return `'${String(value).replace(/'/g, "''")}'`;
}

/**
 * Reads and parses a backup file, then hands the parsed document to `onParsed`.
 * Read and parse failures are reported and turn the exit code non-zero instead
 * of being passed on as undefined data.
 *
 * @param {string} backupXmlFile path of the XML backup
 * @param {function(object): void} onParsed called with the xml2js result
 */
function parseBackup(backupXmlFile, onParsed) {
    fs.readFile(backupXmlFile, function (readErr, data) {
        if (readErr) {
            console.log(`Error reading xml file (${backupXmlFile})\n${readErr.message}`);
            process.exitCode = 1;
            return;
        }

        const parser = new xml2js.Parser();
        parser.parseString(data, function (parseErr, result) {
            if (parseErr) {
                console.log(`Error parsing xml file (${backupXmlFile})\n${parseErr.message || JSON.stringify(parseErr)}`);
                process.exitCode = 1;
                return;
            }
            onParsed(result);
        });
    });
}

/**
 * Converts every non-private, non-"inherit" item of a WordPress export into a
 * markdown file in `outputDir`, plus its approved comments.
 *
 * @param {string} backupXmlFile path of the WordPress XML export
 * @param {string} outputDir directory the markdown files are written to
 */
function wordpressImport(backupXmlFile, outputDir) {
    parseBackup(backupXmlFile, function (result) {
        const channel = result && result.rss && firstOf(result.rss.channel, null);
        if (!channel) {
            console.log(`Error: no rss channel found in (${backupXmlFile})`);
            process.exitCode = 1;
            return;
        }

        const items = channel.item || [];
        console.log(`Total Post count: ${items.length}`);

        const posts = items.filter(function (post) {
            const status = (post['wp:status'] || []).join('');
            return status !== 'private' && status !== 'inherit';
        });
        console.log(`Post count: ${posts.length}`);

        posts.forEach(function (post) {
            writeWordpressPost(post, outputDir);
        });
    });
}

/**
 * Derives the output base name of a WordPress item from its slug
 * (wp:post_name), falling back to wp:post_id when the slug sanitizes to ''.
 */
function wordpressFileName(post) {
    const slug = textOf(firstOf(post['wp:post_name']));
    let decoded = slug;
    try {
        decoded = decodeURI(slug);
    } catch (err) {
        // decodeURI throws URIError on malformed escapes; keep the raw slug.
    }
    return sanitize(decoded) || String(firstOf(post['wp:post_id']));
}

/** Writes one WordPress item and its approved comments. */
function writeWordpressPost(post, outputDir) {
    const title = textOf(firstOf(post.title)).trim();
    const draft = textOf(firstOf(post['wp:status'])) === 'draft';
    const published = firstOf(post.pubDate);
    const baseName = wordpressFileName(post);
    const rawComments = post['wp:comment'] || [];

    console.log(`\n\n\n\ntitle: '${title}'`);
    console.log(`published: '${published}'`);
    if (post['wp:comment']) {
        console.log(`comments: '${rawComments.length}'`);
    }

    // Tags are quoted the same way as the title so an apostrophe cannot break the YAML.
    const tags = (post.category || []).map(textOf).filter(Boolean);
    const tagString = tags.length ? `tags: [${tags.map(yamlQuote).join(', ')}]\n` : '';

    const fileHeader = `---\ntitle: ${yamlQuote(title)}\ndate: ${published}\ndraft: ${draft}\n${tagString}---\n`;

    // Built up-front so the comments file has a header even when the post has no content.
    const pmap = {
        postName: `${outputDir}/${baseName}.md`,
        fname: `${outputDir}/${baseName}-comments.md`,
        header: `${fileHeader}\n`,
        comments: []
    };
    console.log(`fname: '${pmap.postName}'`);

    if (post['content:encoded']) {
        let postContent = post['content:encoded'].toString();
        if (applyParagraphFix && !/<p>/i.test(postContent)) {
            postContent = '<p>' + postContent.replace(/(\r?\n){2}/g, '</p>\n\n<p>') + '</p>';
        }
        // Wrapped in a <div> to resolve an error if plain text is returned.
        const markdown = tds.turndown('<div>' + postContent + '</div>');
        writeToFile(pmap.postName, `${fileHeader}\n${markdown}`);
    }

    rawComments.forEach(function (comment) {
        // The value is a string; "0" (pending) is truthy, so compare explicitly.
        if (String(firstOf(comment['wp:comment_approved'])).trim() !== '1') {
            return;
        }

        const rawContent = comment['wp:comment_content'];
        pmap.comments.push({
            title: '',
            published: firstOf(comment['wp:comment_date']),
            content: rawContent ? tds.turndown('<div>' + textOf(firstOf(rawContent)) + '</div>') : '',
            author: {
                name: textOf(firstOf(comment['wp:comment_author'])),
                email: textOf(firstOf(comment['wp:comment_author_email'])),
                url: textOf(firstOf(comment['wp:comment_author_url']))
            }
        });
    });

    // Re-uses the Blogger comment writer, which expects a map of posts.
    if (pmap.comments.length) {
        writeComments({ '0': pmap });
    }
}

/**
 * Turns arbitrary text into a lower-case, hyphen-separated file name.
 *
 * @param {string} text
 * @returns {string}
 */
function getFileName(text) {
    var newFileName = sanitize(text) // first remove any dodgy characters
        .replace(/[\.']/g, '') // then remove some known characters
        .replace(/[^a-z0-9]/gi, '-') // then turn anything that isn't a number or letter into a hyphen
        .replace(/[\-]{2,}/g, '-') // then turn multiple hyphens into a single one
        .toLowerCase(); // finally make it all lower case
    return newFileName;
}

/**
 * Converts the posts of a Blogger export into markdown files in `outputDir`
 * and attaches each comment entry to the post it replies to.
 *
 * @param {string} backupXmlFile path of the Blogger XML export
 * @param {string} outputDir directory the markdown files are written to
 */
function bloggerImport(backupXmlFile, outputDir) {
    parseBackup(backupXmlFile, function (result) {
        if (result && result.feed && result.feed.entry) {
            const entries = result.feed.entry;
            console.log(`Total no. of entries found : ${entries.length}`);

            const isPostEntry = function (entry) {
                return textOf(firstOf(entry.id)).indexOf('.post-') !== -1;
            };
            // Posts and comments share the ".post-" id form; comments carry thr:in-reply-to.
            const posts = entries.filter(function (entry) {
                return isPostEntry(entry) && !entry['thr:in-reply-to'];
            });
            const comments = entries.filter(function (entry) {
                return isPostEntry(entry) && entry['thr:in-reply-to'];
            });

            console.log(`Content-posts ${posts.length}`);
            console.log(`Content-Comments ${comments.length}`);

            // No prototype, so ids read from the XML can never hit inherited keys.
            const postMaps = Object.create(null);
            const usedNames = new Set();

            posts.forEach(function (entry) {
                const postMap = writeBloggerPost(entry, outputDir, usedNames);
                postMaps[postMap.pid] = postMap;
            });

            comments.forEach(function (entry) {
                const postId = bloggerParentId(entry);
                const parent = postMaps[postId];
                if (!parent) {
                    console.log(`WARNING: skipping comment for unknown post id "${postId}"`);
                    return;
                }
                parent.comments.push(parseBloggerComment(entry));
            });

            writeComments(postMaps);
        }
        console.log('Done');
    });
}

/**
 * Returns the id of the post a Blogger comment replies to: the last path
 * segment of thr:in-reply-to's `source`, or - when `source` is absent - the
 * part after the last '-' of `ref` (the same rule used for post ids).
 */
function bloggerParentId(entry) {
    const attrs = firstOf(entry['thr:in-reply-to'], {}).$ || {};
    if (attrs.source) {
        return path.basename(attrs.source);
    }
    return String(attrs.ref || '').split('-').pop();
}

/**
 * Writes one Blogger post and returns its bookkeeping record
 * ({pid, postName, fname, header, comments}) for the comment pass.
 *
 * @param {object} entry xml2js node of the post entry
 * @param {string} outputDir directory the markdown file is written to
 * @param {Set<string>} usedNames base names already taken in this run
 */
function writeBloggerPost(entry, outputDir, usedNames) {
    const pid = textOf(firstOf(entry.id)).split('-').pop();
    const title = textOf(firstOf(entry.title));
    const published = firstOf(entry.published);

    const control = firstOf(entry['app:control'], null);
    const draft = (control && firstOf(control['app:draft']) === 'yes') ? 'true' : 'false';

    console.log(`title: "${title}"`);
    console.log(`date: ${published}`);
    console.log(`draft: ${draft}`);

    // Untitled posts, and titles without any a-z/0-9 character, would all map
    // to the same name; duplicate titles would overwrite each other.
    const slug = getFileName(title);
    let baseName = /[a-z0-9]/.test(slug) ? path.basename(slug) : `post-${pid}`;
    if (usedNames.has(baseName)) {
        baseName = `${baseName}-${pid}`;
    }
    usedNames.add(baseName);

    const altLink = (entry.link || []).find(function (link) {
        return link.$ && link.$.rel === 'alternate' && link.$.type === 'text/html';
    });
    const url = (altLink && altLink.$.href) || '';

    const postMap = {
        pid: pid,
        postName: `${outputDir}/${baseName}.md`,
        fname: `${outputDir}/${baseName}-comments.md`,
        comments: []
    };
    console.log(postMap.postName);

    // Local to this post so an empty post never inherits the previous body.
    const html = textOf(firstOf(entry.content));
    const markdown = html ? tds.turndown(html) : '';

    const categories = entry.category || [];
    const tags = categories
        .map(function (category) {
            return category.$ && category.$.term;
        })
        .filter(function (term) {
            // Terms under http://schemas.google are filtered out, not treated as labels.
            return term && term.indexOf('http://schemas.google') === -1;
        });
    console.log(`No of category: ${categories.length}`);
    console.log(`tags: \n${tags.map(a => '- ' + a).join('\n')}\n`);

    const tagString = tags.length
        ? `tags: \n${tags.map(a => '- ' + yamlQuote(a)).join('\n')}\n`
        : '';

    console.dir(postMap);
    console.log("\n\n\n\n\n");

    // Keep only the path of the original post URL; omit the key when there is none.
    const alias = url.replace(/^.*\/\/[^\/]+/, '');
    const urlLine = alias ? `url: ${alias}\n` : '';

    const fileHeader = `---\ntitle: ${yamlQuote(title)}\ndate: ${published}\ndraft: ${draft}\n${urlLine}${tagString}---\n`;
    postMap.header = fileHeader;

    writeToFile(postMap.postName, `${fileHeader}\n${markdown}`);
    return postMap;
}

/** Builds the comment record ({published, title, content, author}) of a Blogger comment entry. */
function parseBloggerComment(entry) {
    const title = textOf(firstOf(entry.title));
    const content = textOf(firstOf(entry.content));
    const author = firstOf(entry.author, {});

    return {
        published: firstOf(entry.published),
        title: title ? tds.turndown(title) : '',
        content: content ? tds.turndown(content) : '',
        author: {
            name: textOf(firstOf(author.name)),
            email: textOf(firstOf(author.email)),
            url: textOf(firstOf(author.uri))
        }
    };
}

/**
 * Formats a comment timestamp as e.g. "Jan 5, 2018". Strings that are not
 * ISO 8601 are returned unchanged; a missing value yields ''.
 */
function formatCommentDate(published) {
    if (!published) {
        return '';
    }
    // parseZone keeps the offset written in the backup, so the day cannot
    // shift with the time zone of the machine running the conversion.
    const parsed = moment.parseZone(published, moment.ISO_8601);
    return parsed.isValid() ? parsed.format('MMM D, YYYY') : String(published);
}

/**
 * Writes the comments of every post in `postMaps`: appended to the post file
 * when comments are merged ('m'), otherwise to the post's "-comments.md" file
 * preceded by the post header.
 *
 * @param {Object<string, {postName: string, fname: string, header: string, comments: Array}>} postMaps
 */
function writeComments(postMaps) {
    if (mergeComments === 'm') {
        console.log('DEBUG: merge comments requested');
    } else {
        console.log('DEBUG: separate comments requested (defaulted)');
    }

    Object.keys(postMaps).forEach(function (postId) {
        const post = postMaps[postId];
        const comments = post.comments;
        console.log(`post id: ${postId} has ${comments.length} comments`);

        if (!comments.length) {
            return;
        }

        const ccontent = comments.map(function (comment) {
            const readableDate = formatCommentDate(comment.published);
            return `#### ${comment.title}\n[${comment.author.name}](${comment.author.url} "${comment.author.email}") - ${readableDate}\n\n${comment.content}\n<hr />\n`;
        }).join('');

        if (mergeComments === 'm') {
            writeToFile(post.postName, `\n---\n### Comments:\n${ccontent}`, true);
        } else {
            writeToFile(post.fname, `${post.header}\n${ccontent}`);
        }
    });
}

/**
 * Synchronously writes (or appends) `content` to `filename`. Failures are
 * logged, not thrown, so one bad file does not abort the whole conversion;
 * the process exit code is set to 1 so the failure is still visible.
 *
 * @param {string} filename target path
 * @param {string} content text to write
 * @param {boolean} [append=false] append instead of overwriting
 */
function writeToFile(filename, content, append = false) {
    const doing = append ? 'append' : 'write';
    const done = append ? 'appended' : 'written';
    const failing = append ? 'appending' : 'writing';

    console.log(`DEBUG: going to ${doing} to ${filename}`);
    try {
        if (append) {
            fs.appendFileSync(filename, content);
        } else {
            fs.writeFileSync(filename, content);
        }
        console.log(`Successfully ${done} to ${filename}`);
    } catch (err) {
        console.log(`Error while ${failing} to ${filename} - ${JSON.stringify(err)}`);
        console.dir(err);
        process.exitCode = 1;
    }
}