├── .gitignore ├── index.js ├── src ├── errors.js ├── model │ ├── profile.js │ └── post.js ├── image-downloader.js ├── converter.js └── importer.js ├── .eslintrc.json ├── package.json ├── LICENSE ├── scripts └── import-external.js └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | .vscode 3 | temp -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | importer: require('./src/importer'), 3 | errors: require('./src/errors') 4 | }; 5 | -------------------------------------------------------------------------------- /src/errors.js: -------------------------------------------------------------------------------- 1 | 2 | const Errors = { 3 | InputFileNotFound: { message: 'Input file not found' }, 4 | InvalidZip: { message: 'Error unzipping file' }, 5 | EmptyInput: { message: 'Nothing found to import' } 6 | }; 7 | 8 | module.exports = Errors; -------------------------------------------------------------------------------- /.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "env": { 3 | "node" : true, 4 | "es6": true 5 | }, 6 | "extends": "eslint:recommended", 7 | "parserOptions": { 8 | "ecmaVersion": 2018 9 | }, 10 | "rules": { 11 | "no-console": "warn", 12 | "indent": [ 13 | "error", 14 | 4 15 | ], 16 | "linebreak-style": [ 17 | "error", 18 | "unix" 19 | ], 20 | "quotes": [ 21 | "error", 22 | "single" 23 | ], 24 | "semi": [ 25 | "error", 26 | "always" 27 | ], 28 | "no-unused-vars": 0 29 | } 30 | } -------------------------------------------------------------------------------- /src/model/profile.js: -------------------------------------------------------------------------------- 1 | const cheerio = require('cheerio'); 2 | 3 | class Profile { 4 | constructor(content) { 5 | this.content = content; 6 | this.$ = cheerio.load(content); 7 | 8 | const props = this.$('li') 9 | .map((i, elem) => this.$(elem).text()) 10 | .get() 11 | .map(propStr => propStr.split(':')) 12 | .reduce((acc, val) => { 13 | acc[val[0]] = val.slice(1).join(':').trim(); 14 | return acc; 15 | }, {}); 16 | 17 | this.name = props['Display name']; 18 | this.email = props['Email address']; 19 | this.twitterUsername = props['Twitter']; 20 | } 21 | 22 | get avatarUrl() { 23 | return this.$('img[class*=u-photo]').attr('src'); 24 | } 25 | } 26 | 27 | module.exports = Profile; -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@stackbit/stackbit-medium-importer", 3 | "version": "0.2.0", 4 | "description": "Stackbit Medium Importer", 5 | "main": "index.js", 6 | "scripts": {}, 7 | "author": "Stackbit", 8 | "dependencies": { 9 | "axios": "^0.18.0", 10 | "chalk": "^2.4.2", 11 | "cheerio": "^1.0.0-rc.2", 12 | "gray-matter": "^4.0.2", 13 | "js-yaml": "^3.12.2", 14 | "lodash": "^4.17.11", 15 | "p-all": "^2.0.0", 16 | "sanitize-html": "^1.20.0", 17 | "turndown": "^5.0.3", 18 | "unzipper": "^0.9.11", 19 | "url-parse": "^1.4.4", 20 | "yargs": "^13.2.2" 21 | }, 22 | "repository": { 23 | "url": "https://github.com/stackbithq/stackbit-medium-importer.git" 24 | }, 25 | "bugs": { 26 | "url": "https://github.com/stackbithq/gatsby-plugin-menus/issues" 27 | }, 28 | "bin": { 29 | "medium-importer": "./scripts/import-external.js" 30 | }, 31 | "license": "MIT", 32 | "devDependencies": { 33 | "eslint": "^5.16.0" 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2019 Stackbit 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/model/post.js: -------------------------------------------------------------------------------- 1 | 2 | 3 | const cheerio = require('cheerio'); 4 | const urlParse = require('url-parse'); 5 | 6 | class Post { 7 | constructor(content) { 8 | this.content = content; 9 | this.$ = cheerio.load(content); 10 | } 11 | 12 | get title() { 13 | return this.$('title').text().trim(); 14 | } 15 | 16 | get excerpt() { 17 | return this.$('section[data-field=subtitle]').text().trim() || 18 | this.subtitle; 19 | } 20 | 21 | get subtitle() { 22 | return this.$('h4[class*="graf--subtitle"]').text().trim(); 23 | } 24 | 25 | get publishDate() { 26 | const publishTime = this.$('time[class*="dt-published"]').attr('datetime'); 27 | if (publishTime) { 28 | return new Date(publishTime); 29 | } 30 | return null; 31 | } 32 | 33 | get imageUrls() { 34 | return this.$('img') 35 | .map((i, img) => this.$(img).attr('src')) 36 | .get() 37 | .filter(url => urlParse(url).hostname.includes('medium.com')); 38 | } 39 | 40 | get splashImageUrl() { 41 | return null;//this.$('div[class*=sectionLayout--fullWidth] img').attr('src'); 42 | } 43 | 44 | get bodyHtml() { 45 | return this.$('div[class*=section-content]') 46 | .map((i, section) => this.$(section).html()) 47 | .get() 48 | .join('
'); 49 | } 50 | } 51 | 52 | module.exports = Post; -------------------------------------------------------------------------------- /src/image-downloader.js: -------------------------------------------------------------------------------- 1 | const _ = require('lodash'); 2 | const fs = require('fs'); 3 | const path = require('path'); 4 | const pAll = require('p-all'); 5 | const axios = require('axios'); 6 | const chalk = require('chalk'); 7 | 8 | function downloadImage(url, outputFilePath) { 9 | console.log(chalk.gray('downloading image: ' + url)); 10 | 11 | return axios({ 12 | method: 'get', 13 | url: url, 14 | responseType: 'stream', 15 | timeout: 60 * 1000 16 | }).then(function (response) { 17 | response.data.pipe(fs.createWriteStream(outputFilePath)); 18 | return new Promise((resolve, reject) => { 19 | response.data.on('end', () => { 20 | resolve({ 21 | url: url, 22 | filename: path.parse(outputFilePath).base 23 | }); 24 | }); 25 | response.data.on('error', reject); 26 | }); 27 | }).catch((err) => { 28 | // failure to download image is just a warning 29 | console.warn(err); 30 | }); 31 | } 32 | 33 | function downloadAllImages(urls, outputDir) { 34 | console.log(chalk.gray('downloading images to ' + outputDir)); 35 | 36 | const imageDownloads = urls.map((url) => { 37 | let filename = url.substring(url.lastIndexOf('/') + 1); 38 | const ext = path.extname(filename); 39 | if (_.isEmpty(ext) || ext === '.') { 40 | filename += '.jpg'; 41 | } 42 | return () => downloadImage(url, path.join(outputDir, filename)); 43 | }); 44 | return pAll(imageDownloads, { concurrency: 1 }); 45 | } 46 | 47 | module.exports = downloadAllImages; -------------------------------------------------------------------------------- /scripts/import-external.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | const fs = require('fs'); 4 | const path = require('path'); 5 | const yargs = require('yargs'); 6 | const importer = require('../src/importer'); 7 | const chalk = require('chalk'); 8 | 9 | 10 | let argv = yargs 11 | .usage('Usage: $0 --input-file= --output-dir= --concurrency= --download-images= --import-drafts=') 12 | .example( 13 | '$0 -i medium-export.zip -o output/', 14 | 'Convert export located in "medium-export.zip" into Stackbit readable format saved to "output/"' 15 | ) 16 | .alias('input-file', 'i') 17 | .alias('output-dir', 'o') 18 | .alias('concurrency', 'c') 19 | .alias('download-images', 'd') 20 | .alias('import-drafts', 'r') 21 | .describe('input-file', 'medium export zip file') 22 | .describe('output-dir', 'target folder for the converted files') 23 | .describe('concurrency', 'number of posts to process in parallel') 24 | .describe('download-images', 'should images be downloaded') 25 | .describe('import-drafts', 'should drafts be imported') 26 | .default('concurrency', 1) 27 | .default('download-images', true) 28 | .default('import-drafts', true) 29 | .demandOption(['input-file', 'output-dir']) 30 | .wrap(null) 31 | .argv; 32 | 33 | const inputFilename = argv['inputFile']; 34 | const outputDir = argv['outputDir']; 35 | const concurrency = argv['concurrency']; 36 | const shouldDownloadImages = argv['downloadImages'] !== 'false'; 37 | const shouldImportDrafts = argv['importDrafts'] && argv['importDrafts'] !== 'false'; 38 | 39 | const contentDir = path.join(outputDir, 'content'); 40 | const postsDir = path.join(contentDir, 'posts'); 41 | 42 | const staticDir = path.join(outputDir, 'static'); 43 | const imagesDir = path.join(staticDir, 'images'); 44 | 45 | const dataDir = path.join(outputDir, 'data'); 46 | const dataFile = path.join(dataDir, 'data.json'); 47 | 48 | const originalDir = path.join(outputDir, 'original'); 49 | 50 | [outputDir, dataDir, staticDir, contentDir].forEach(dir => { 51 | if (!fs.existsSync(dir)) { 52 | fs.mkdirSync(dir); 53 | } 54 | }); 55 | 56 | importer.doImport(inputFilename, postsDir, imagesDir, dataFile, originalDir, shouldDownloadImages, shouldImportDrafts, concurrency).then(() => { 57 | console.log(chalk.green.bold('Done importing')); 58 | }).catch(error => { 59 | console.error(chalk.red(error.message)); 60 | }); 61 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # stackbit-medium-importer 2 | 3 | A small library to convert [medium export](https://medium.com/me/export) zip files to static-site-generator (SSG) friendly Markdown files. 4 | 5 | # Demo 6 | 7 | The Medium importer can be seen in action on the [Stackbit website](https://www.stackbit.com/medium). 8 | 9 | # Install 10 | 11 | ``` 12 | npm install @stackbit/stackbit-medium-importer 13 | ``` 14 | 15 | # How to use 16 | 17 | The module comes with a utility to invoke the importer directly. 18 | 19 | ``` 20 | $ medium-importer 21 | Usage: medium-importer --input-file= --output-dir= --concurrency= --download-images= --import-drafts= 22 | 23 | Options: 24 | --help Show help [boolean] 25 | --version Show version number [boolean] 26 | --input-file, -i medium export zip file [required] 27 | --output-dir, -o target folder for the converted files [required] 28 | --concurrency, -c number of posts to process in parallel [default: 1] 29 | --download-images, -d should images be downloaded [default: true] 30 | --import-drafts, -r should drafts be imported [default: true] 31 | 32 | Examples: 33 | medium-importer -i medium-export.zip -o output/ 34 | ``` 35 | 36 | # Importer output 37 | 38 | The output directory is populated with the imported data, following a structure that makes it straightforward to later use with an SSG. For certain SSG's (like Hugo) it's possible to point the utility directly at your project. 39 | 40 | The output directories can be configured when working with the library directly. 41 | 42 | ### `content/posts` 43 | 44 | Markdown pages representing imported Medium posts. 45 | 46 | ```yaml 47 | --- 48 | template: post 49 | title: My Blog Post 50 | date: 2018-10-08T15:01:02.452Z 51 | subtitle: >- 52 | This post is a posty post, and this is its subtitle. 53 | excerpt: >- 54 | An excerpt of the post 55 | thumb_img_path: >- 56 | images/My-Blog-Post/image.jpeg 57 | content_img_path: >- 58 | images/My-Blog-Post/splash.jpeg 59 | --- 60 | 61 | # My Post 62 | 63 | Markdown content goes **HERE** 64 | ``` 65 | 66 | ### `static/images` 67 | 68 | * Per-post images organized in directories with the post's slug. 69 | * Global images extracted from the Medium export: `avatar.png` if available. 70 | 71 | 72 | ### `data/data.json` 73 | 74 | Extraced information in `JSON` format: 75 | 76 | ```json 77 | { 78 | "author": { 79 | "name": "My Name", 80 | "email": "email@example.com", 81 | "avatar": "images/avatar.png" 82 | }, 83 | "social": { 84 | "twitter": { 85 | "username": "@myuser", 86 | "url": "https://twitter.com/@myuser" 87 | } 88 | } 89 | ``` 90 | 91 | -------------------------------------------------------------------------------- /src/converter.js: -------------------------------------------------------------------------------- 1 | 2 | 3 | const sanitizeHtml = require('sanitize-html'); 4 | const TurndownService = require('turndown'); 5 | 6 | function processHtml(html, slug, imageMap) { 7 | return sanitizeHtml(html, { 8 | allowedTags: sanitizeHtml.defaults.allowedTags.concat(['figcaption', 'img', 'iframe', 'script', 'hr', 'blockquote', 'pre']), 9 | allowedAttributes: { 10 | iframe: ['*'], 11 | img: ['*'], 12 | a: ['*'], 13 | div: ['class'], 14 | h3: ['class'], 15 | script: ['*'], 16 | blockquote: ['class'] 17 | }, 18 | transformTags: { 19 | 'img': (tagName, attribs) => { 20 | const newAttribs = Object.assign(attribs, { 21 | src: imageMap.has(attribs.src) 22 | ? `/images/${slug}/${imageMap.get(attribs.src)}` 23 | : attribs.src 24 | }); 25 | return { 26 | tagName: tagName, 27 | attribs: newAttribs 28 | }; 29 | }, 30 | 'blockquote': (tagName, attribs) => { 31 | let result = { 32 | tagName: tagName, 33 | attribs: attribs, 34 | }; 35 | if (attribs.class === 'twitter-tweet') { 36 | // placeholder needed to get rendered properly 37 | result.text = 'tweet'; 38 | } 39 | return result; 40 | }, 41 | }, 42 | exclusiveFilter: function(frame) { 43 | return (frame.tag === 'h3' && 44 | frame.attribs['class'].includes('graf--title')) || 45 | (frame.tag === 'h4' && 46 | frame.attribs['class'].includes('graf--subtitle')); 47 | } 48 | }); 49 | } 50 | 51 | function convertToMarkdown(html) { 52 | 53 | // workaround issue with multiline emphasis 54 | let bodyHtml = html.replace(/
<\/em>/gmi, '
'); 55 | 56 | // workaround issue with pre blocks 57 | bodyHtml = bodyHtml.replace(/
/gmi, '
');
58 |     bodyHtml = bodyHtml.replace(/<\/pre>/gmi, '
'); 59 | 60 | // unicode spaces are causing issues with whitespace detection 61 | bodyHtml = bodyHtml.replace(/[\u2000-\u200D]+/gmi, ' '); 62 | 63 | const turndownService = new TurndownService({ 64 | emDelimiter: '*' 65 | }); 66 | turndownService.keep(['iframe', 'figcaption', 'script']); 67 | turndownService.addRule('twitter-tweet', { 68 | filter: (node) => { 69 | return node.nodeName === 'BLOCKQUOTE' && 70 | node.getAttribute('class') === 'twitter-tweet'; 71 | }, 72 | replacement: (innerHTML, node) => node.outerHTML 73 | }); 74 | turndownService.addRule('pre', { 75 | filter: ['pre'], 76 | replacement: (content) => { 77 | return '\n ' + content.split('\n').join('\n '); 78 | } 79 | }); 80 | 81 | const bodyMarkdown = turndownService.turndown(bodyHtml); 82 | 83 | return bodyMarkdown; 84 | } 85 | 86 | function convert(html, slug, imageMap) { 87 | const processedHtml = processHtml(html, slug, imageMap); 88 | 89 | const markdown = convertToMarkdown(processedHtml); 90 | 91 | return markdown; 92 | } 93 | 94 | module.exports = convert; -------------------------------------------------------------------------------- /src/importer.js: -------------------------------------------------------------------------------- 1 | const _ = require('lodash'); 2 | const fs = require('fs'); 3 | const path = require('path'); 4 | const pAll = require('p-all'); 5 | const unzipper = require('unzipper'); 6 | const matter = require('gray-matter'); 7 | const chalk = require('chalk'); 8 | 9 | const Errors = require('./errors'); 10 | const Post = require('./model/post'); 11 | const Profile = require('./model/profile'); 12 | const downloadImages = require('./image-downloader'); 13 | const convertHtml = require('./converter'); 14 | 15 | function unzipInput(inputFilePath, outputDir) { 16 | return fs.createReadStream(inputFilePath) 17 | .pipe(unzipper.Extract({ path: outputDir })) 18 | .promise() 19 | .catch((err) => { 20 | console.error(err); 21 | throw Errors.InvalidZip; 22 | }); 23 | } 24 | 25 | function processPost(postFilePath, outputDir, assetsDir, shouldDownloadImages, shouldImportDrafts) { 26 | console.log(chalk.bold('processing post ' + postFilePath)); 27 | 28 | const post = new Post(fs.readFileSync(postFilePath)); 29 | 30 | const fileName = path.parse(postFilePath).name; 31 | if (!shouldImportDrafts && fileName.startsWith('draft_')) { 32 | console.log('skipping draft: ' + fileName); 33 | return Promise.resolve(); 34 | } 35 | 36 | const slug = fileName 37 | .substring(0, fileName.lastIndexOf('-')) 38 | .substring(fileName.indexOf('_') + 1) 39 | .replace(/-+$/, '') 40 | .substring(0, 200); 41 | const imagesDir = path.join(assetsDir, slug); 42 | 43 | if (!fs.existsSync(imagesDir)) { 44 | fs.mkdirSync(imagesDir); 45 | } 46 | 47 | const imageUrls = post.imageUrls; 48 | 49 | const downloadPromise = shouldDownloadImages 50 | ? downloadImages(imageUrls, imagesDir) 51 | : Promise.resolve([]); 52 | 53 | return downloadPromise.then((results) => { 54 | 55 | const images = results.filter(Boolean); 56 | const imageMap = new Map(images.map(item => [item.url, item.filename])); 57 | 58 | const thumbPath = _.isEmpty(images) 59 | ? (_.isEmpty(imageUrls) ? null : imageUrls[0]) 60 | : `images/${slug}/${images[0].filename}`; 61 | 62 | const bodyMarkdown = convertHtml(post.bodyHtml, slug, imageMap); 63 | 64 | const data = { 65 | template: 'post', 66 | title: post.title, 67 | date: post.publishDate || new Date() 68 | }; 69 | 70 | if (post.subtitle) { 71 | data.subtitle = post.subtitle; 72 | } 73 | if (post.excerpt) { 74 | data.excerpt = post.excerpt; 75 | } 76 | if (thumbPath) { 77 | data.thumb_img_path = thumbPath; 78 | } 79 | if (post.splashImageUrl) { 80 | data.content_img_path = imageMap.has(post.splashImageUrl) 81 | ? `images/${slug}/${imageMap.get(post.splashImageUrl)}` 82 | : post.splashImageUrl; 83 | } 84 | 85 | const outputFilename = path.join(outputDir, `${slug}.md`); 86 | const yamlContent = matter.stringify(bodyMarkdown, data); 87 | fs.writeFileSync(outputFilename, yamlContent); 88 | 89 | console.log('done with ' + outputFilename); 90 | }); 91 | } 92 | 93 | function processPosts(inputDir, outputDir, assetsDir, concurrency, shouldDownloadImages, shouldImportDrafts) { 94 | if (!fs.existsSync(inputDir)) { 95 | throw Errors.EmptyInput; 96 | } 97 | if (!fs.existsSync(outputDir)) { 98 | fs.mkdirSync(outputDir); 99 | } 100 | if (!fs.existsSync(assetsDir)) { 101 | fs.mkdirSync(assetsDir); 102 | } 103 | 104 | const postFiles = fs.readdirSync(inputDir); 105 | 106 | if (_.isEmpty(postFiles)) { 107 | throw Errors.EmptyInput; 108 | } 109 | 110 | return pAll(postFiles.map((filename) => { 111 | return () => processPost(path.join(inputDir, filename), outputDir, assetsDir, shouldDownloadImages, shouldImportDrafts); 112 | }), { concurrency: concurrency }); 113 | } 114 | 115 | function processProfile(inputFile, dataFile, assetsDir) { 116 | console.log(chalk.bold('processing profile ' + inputFile)); 117 | 118 | if (!fs.existsSync(inputFile)) { 119 | console.log('no profile file'); 120 | return Promise.resolve(); 121 | } 122 | 123 | const profile = new Profile(fs.readFileSync(inputFile)); 124 | 125 | let data = { 126 | author: {}, 127 | social: {} 128 | }; 129 | 130 | if (profile.name) { 131 | data.author.name = profile.name; 132 | } 133 | if (profile.email) { 134 | data.author.email = profile.email; 135 | } 136 | if (profile.twitterUsername) { 137 | data.social.twitter = { 138 | username: profile.twitterUsername, 139 | url: `https://twitter.com/${profile.twitterUsername}` 140 | }; 141 | } 142 | 143 | const avatarUrl = profile.avatarUrl; 144 | if (avatarUrl) { 145 | return downloadImages([avatarUrl], assetsDir).then((results) => { 146 | const filename = _.get(results, '[0].filename'); 147 | if (filename) { 148 | fs.renameSync( 149 | path.join(assetsDir, filename), 150 | path.join(assetsDir, 'avatar.png') 151 | ); 152 | data.author.avatar = 'images/avatar.png'; 153 | fs.writeFileSync(dataFile, JSON.stringify(data)); 154 | } 155 | }); 156 | } 157 | 158 | fs.writeFileSync(dataFile, JSON.stringify(data)); 159 | 160 | return Promise.resolve(); 161 | } 162 | 163 | function doImport(inputFilePath, postsDir, imagesDir, dataFile, originalDir, shouldDownloadImages=true, shouldImportDrafts=false, concurrency=1) { 164 | if (!fs.existsSync(inputFilePath)) { 165 | return Promise.reject(Errors.InputFileNotFound); 166 | } 167 | 168 | return unzipInput(inputFilePath, originalDir).then(() => { 169 | return processPosts( 170 | path.join(originalDir, 'posts'), 171 | postsDir, 172 | imagesDir, 173 | concurrency, 174 | shouldDownloadImages, 175 | shouldImportDrafts 176 | ); 177 | }).then(() => { 178 | if (dataFile) { 179 | return processProfile( 180 | path.join(originalDir, 'profile/profile.html'), 181 | dataFile, 182 | imagesDir 183 | ); 184 | } 185 | }); 186 | } 187 | 188 | module.exports = { 189 | doImport 190 | }; --------------------------------------------------------------------------------