// resourceManager.js — downloads remote media files referenced by a thread
// and rewrites their URLs so the generated HTML can point at local copies.
const fetch = require('node-fetch');
const fs = require('fs-extra');
const path = require('path');

// Last path segment of a URL, e.g. 'https://x.com/img/a.jpg' -> 'a.jpg'.
const resourceBaseName = url => path.basename(new URL(url).pathname);

// Returns a mapper that rewrites a remote URL to a local URL rooted at `urlPath`.
// Uses path.posix.join (not path.join): the result is embedded in HTML `src`
// attributes, so it must always use forward slashes, even on Windows.
const rewriteUrl = urlPath => url => path.posix.join(urlPath, resourceBaseName(url));

// Fetch `url` and write the response body to `destPath`
// (fs-extra's outputFile creates missing parent directories).
// Rejects on HTTP error statuses instead of silently saving an error page.
function download(url, destPath) {
  return fetch(url)
    .then(response => {
      if (!response.ok) {
        throw new Error(`Failed to download ${url}: ${response.status} ${response.statusText}`);
      }
      return response.buffer();
    })
    .then(buffer => fs.outputFile(destPath, buffer));
}

// Download every URL in `urls` into the `localPath` directory, in parallel.
// Resolves when all downloads finished; rejects on the first failure.
function downloadResources(urls, localPath) {
  return Promise.all(
    urls.map(url => download(url, path.join(localPath, resourceBaseName(url))))
  );
}

if (typeof exports === 'object' && typeof module !== 'undefined') {
  exports.downloadResources = downloadResources;
  exports.rewriteUrl = rewriteUrl;
}
// parser.js — turns a Thread Reader App document into plain per-tweet data.

// Run a CSS selector against a node and hand back the matches as a real array.
const queryDOM = element => selector => Array.from(element.querySelectorAll(selector));

// Decoration nodes that must not survive in the extracted tweet HTML.
const IGNORE_SELECTOR = ['.row', '.entity-image', '.entity-video', '.tw-permalink'].join(',');

// Pull the original image URL out of an image container element.
const getImageData = container => {
  const [img] = queryDOM(container)('img');
  return { url: img.getAttribute('data-src') };
};

// Pull poster + source list out of a video container element.
const getVideoData = container => {
  const query = queryDOM(container);
  const [video] = query('video');
  return {
    poster: video.getAttribute('poster'),
    sources: query('source').map(node => ({
      src: node.getAttribute('src'),
      type: node.getAttribute('type'),
    })),
  };
};

// Extract text HTML, images and videos from one tweet node.
// /!\ Mutates the node: media and permalink children are removed,
// and preview metadata is stripped from links.
function parseTweet(tweet) {
  const query = queryDOM(tweet);

  const images = query('.entity-image').map(getImageData);
  const videos = query('.entity-video').map(getVideoData);

  for (const node of query(IGNORE_SELECTOR)) {
    node.remove();
  }
  for (const link of query('a.entity-url')) {
    link.removeAttribute('data-preview');
    link.removeAttribute('class');
  }

  return { tweetHTML: tweet.innerHTML, images, videos };
}

// All tweets of the thread, in page order.
function getTweets(threadReaderDoc) {
  return queryDOM(threadReaderDoc)('.t-main .content-tweet').map(parseTweet);
}

if (typeof exports === 'object' && typeof module !== 'undefined') {
  exports.getTweets = getTweets;
}
downloadResources } = require('./resourceManager'); 7 | 8 | const [,, threadReaderUrl, rsrcDirPath, rsrcUrlPath] = process.argv; 9 | 10 | function processDocument(document, { extractResources = false, resourcesUrlPath = '.' }) { 11 | const resources = []; 12 | const urlTransformer = extractResources ? rewriteUrl(resourcesUrlPath) : x => x; 13 | const html = getTweets(document) 14 | .map(({ tweetHTML, images, videos }) => { 15 | const imagesHTML = images.map(img => { 16 | if (extractResources) { 17 | resources.push(img.url); 18 | } 19 | return imageToHtml(urlTransformer)(img); 20 | }); 21 | const videosHTML = videos.map(video => { 22 | if (extractResources) { 23 | resources.push(video.poster); 24 | video.sources.forEach(source => resources.push(source.src)); 25 | } 26 | return videoToHtml(urlTransformer)(video); 27 | }); 28 | return { tweetHTML: emojis.strip(tweetHTML), imagesHTML, videosHTML }; 29 | }) 30 | .map(({ tweetHTML, imagesHTML, videosHTML }) => { 31 | const tags = [ `
${tweetHTML}
` ]; 32 | if (imagesHTML.length > 0) { tags.push(`` divs, and images and videos will be wrapped in a `