├── .gitignore ├── package.json ├── resourceManager.js ├── parser.js ├── cli.js └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "thread-reader-reader", 3 | "version": "0.1.0", 4 | "description": "Parse Twitter thread already simplified by third-party Thread Reader app, and produce dead simple HTML.", 5 | "main": "cli.js", 6 | "bin": { 7 | "thread-reader-reader": "./cli.js" 8 | }, 9 | "dependencies": { 10 | "fs-extra": "^9.0.0", 11 | "jsdom": "^16.2.2", 12 | "node-emoji": "^1.10.0", 13 | "node-fetch": "^2.6.0" 14 | }, 15 | "devDependencies": {}, 16 | "scripts": { 17 | "test": "echo \"Error: no test specified\" && exit 1" 18 | }, 19 | "author": "Benjamin Becquet (https://bbecquet.net)", 20 | "license": "WTFPL" 21 | } 22 | -------------------------------------------------------------------------------- /resourceManager.js: -------------------------------------------------------------------------------- 1 | const fetch = require('node-fetch'); 2 | const fs = require('fs-extra'); 3 | const path = require('path'); 4 | 5 | const resourceBaseName = url => path.basename(new URL(url).pathname); 6 | const rewriteUrl = urlPath => url => path.join(urlPath, resourceBaseName(url)); 7 | 8 | function download(url, destPath) { 9 | return fetch(url) 10 | .then(response => response.buffer()) 11 | .then(buffer => fs.outputFile(destPath, buffer)); 12 | } 13 | 14 | function downloadResources(urls, localPath) { 15 | return Promise.all(urls.map(async url => 16 | download(url, path.join(localPath, resourceBaseName(url))) 17 | )); 18 | } 19 | 20 | if (typeof exports === 'object' && typeof module !== 'undefined') { 21 | exports.downloadResources = downloadResources; 22 | exports.rewriteUrl = rewriteUrl; 23 | } 
-------------------------------------------------------------------------------- /parser.js: -------------------------------------------------------------------------------- 1 | const queryDOM = element => selector => Array.from(element.querySelectorAll(selector)); 2 | 3 | const getImageData = img => ({ 4 | url: queryDOM(img)('img')[0].getAttribute('data-src') 5 | }); 6 | 7 | const getVideoData = video => { 8 | const sources = queryDOM(video)('source'); 9 | return { 10 | poster: queryDOM(video)('video')[0].getAttribute('poster'), 11 | sources: sources.map(source => ({ 12 | src: source.getAttribute('src'), 13 | type: source.getAttribute('type'), 14 | })) 15 | } 16 | }; 17 | 18 | function parseTweet(tweet) { 19 | const $tweet = queryDOM(tweet); 20 | 21 | const images = $tweet('.entity-image').map(getImageData); 22 | const videos = $tweet('.entity-video').map(getVideoData); 23 | 24 | // Clean stuff. /!\ Modify the original node. 25 | const ignoreSelector = ['.row', '.entity-image', '.entity-video', '.tw-permalink'].join(','); 26 | $tweet(ignoreSelector).forEach(child => child.remove()); 27 | $tweet('a.entity-url').forEach(child => { 28 | child.removeAttribute('data-preview'); 29 | child.removeAttribute('class'); 30 | }); 31 | const tweetHTML = tweet.innerHTML; 32 | 33 | return { tweetHTML, images, videos }; 34 | } 35 | 36 | function getTweets(threadReaderDoc) { 37 | return queryDOM(threadReaderDoc)('.t-main .content-tweet').map(parseTweet); 38 | } 39 | 40 | if (typeof exports === 'object' && typeof module !== 'undefined') { 41 | exports.getTweets = getTweets; 42 | } 43 | -------------------------------------------------------------------------------- /cli.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | const fetch = require('node-fetch'); 3 | const emojis = require('node-emoji'); 4 | const getTweets = require('./parser').getTweets; 5 | const JSDOM = require('jsdom').JSDOM; 6 | const { rewriteUrl, 
downloadResources } = require('./resourceManager'); 7 | 8 | const [,, threadReaderUrl, rsrcDirPath, rsrcUrlPath] = process.argv; 9 | 10 | function processDocument(document, { extractResources = false, resourcesUrlPath = '.' }) { 11 | const resources = []; 12 | const urlTransformer = extractResources ? rewriteUrl(resourcesUrlPath) : x => x; 13 | const html = getTweets(document) 14 | .map(({ tweetHTML, images, videos }) => { 15 | const imagesHTML = images.map(img => { 16 | if (extractResources) { 17 | resources.push(img.url); 18 | } 19 | return imageToHtml(urlTransformer)(img); 20 | }); 21 | const videosHTML = videos.map(video => { 22 | if (extractResources) { 23 | resources.push(video.poster); 24 | video.sources.forEach(source => resources.push(source.src)); 25 | } 26 | return videoToHtml(urlTransformer)(video); 27 | }); 28 | return { tweetHTML: emojis.strip(tweetHTML), imagesHTML, videosHTML }; 29 | }) 30 | .map(({ tweetHTML, imagesHTML, videosHTML }) => { 31 | const tags = [ `

${tweetHTML}

` ]; 32 | if (imagesHTML.length > 0) { tags.push(`
${imagesHTML.join('')}
`); } 33 | if (videosHTML.length > 0) { tags.push(`
${videosHTML.join('')}
`); } 34 | return tags; 35 | }) 36 | .reduce((acc, tweetParts) => acc.concat(tweetParts), []) // alternative for missing .flat 37 | .join(''); 38 | 39 | return { html, resources }; 40 | } 41 | 42 | const imageToHtml = urlTransformer => ({ url }) => 43 | ` 44 | 45 | `; 46 | 47 | const videoToHtml = urlTransformer => ({ poster, sources }) => { 48 | const types = sources.map(({ src, type }) => ``); 49 | return ``; 53 | } 54 | 55 | fetch(threadReaderUrl) 56 | .then(response => response.text()) 57 | .then(html => new JSDOM(html).window.document) 58 | .then(doc => processDocument(doc, { 59 | extractResources: !!rsrcDirPath, 60 | resourcesUrlPath: rsrcUrlPath || rsrcDirPath 61 | })) 62 | .then(async ({ html, resources }) => { 63 | await downloadResources(resources, rsrcDirPath); 64 | return html; 65 | }) 66 | .then(console.log); 67 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Thread-reader-reader 2 | 3 | Helps converting Twitter threads to stand-alone articles by extracting them to simple HTML. Relies on the [Thread reader](https://threadreaderapp.com) third-party app. 4 | 5 | ## Installation 6 | 7 | 1. Make sure you have Node and NPM installed. 8 | 2. Install the command globally 9 | ``` 10 | npm i -g thread-reader-reader 11 | ``` 12 | 13 | ## Usage 14 | 15 | 1. Submit the first tweet of a Twitter thread to [Thread reader](https://threadreaderapp.com) 16 | 2. Get the resulting URL (Should look like https://threadreaderapp.com/thread/1241364682084093953.html) 17 | 3. Run the CLI program with the following command: 18 | 19 | ``` 20 | thread-reader-reader 21 | ``` 22 | 4. The result will be written to the standard output. 23 | 24 | Tweets will be output in `

<p>` tags, and images and videos will be wrapped in a `<figure>` tag following the paragraph of the tweet they appear in. 25 | 26 | ### Downloading resources 27 | 28 | By default, original image and video urls will be kept, meaning links will still use the resources stored on Twitter servers. 29 | 30 | Instead, you can download these resources locally by adding a path to a local directory as second parameter. 31 | 32 | ``` 33 | thread-reader-reader <thread_reader_url> [directory_to_store_files] 34 | ``` 35 | 36 | In that case, the urls to images and videos will be rewritten with the same path as relative url, replacing Twitter urls. 37 | 38 | You can change this relative url by specifying a third parameter. 39 | 40 | ``` 41 | thread-reader-reader <thread_reader_url> [directory_to_store_files] [relative_url_path] 42 | ``` 43 | 44 | That way, it's easy to adapt to how your destination website stores content. 45 | 46 | ### Full example 47 | 48 | Command: 49 | 50 | ``` 51 | thread-reader-reader https://threadreaderapp.com/thread/1241364682084093953.html aquatint-files /public/images/aquatint-files > aquatint-article.html 52 | ``` 53 | 54 | Result: 55 | 56 | - The extracted thread HTML will be written to the `aquatint-article.html` file. 57 | - In this HTML, urls to images and videos will use `/public/images/aquatint-files` as prefix. 58 | - Image and video files will be downloaded and stored in the `aquatint-files` directory, ready to be uploaded on your server, in the directory corresponding to `/public/images/aquatint-files`. 59 | 60 | 61 | ## As a lib 62 | 63 | You can also use the `parser.js` file as a separate lib. 
64 | 65 | The `getTweets` function takes a DOM element as input (JSDom or real DOM will work), basically the document of a Thread Reader page and will return an array of objects: 66 | 67 | ```js 68 | { 69 | tweetHTML, // inner markup of the tweet text, including links 70 | images, // array of { url } objects 71 | videos // array of { poster, sources: [{ type, src }] } objects 72 | } 73 | ``` --------------------------------------------------------------------------------