├── constants.js
├── README.md
├── package.json
├── .gitignore
├── server.js
├── scraper.js
└── helpers.js

--------------------------------------------------------------------------------
/constants.js:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Node-Twitter-Scraper
Scraping Twitter using Puppeteer and Node.

This is a work in progress.

Before running, enter the address and password of the Gmail account used to send download links into `helpers.js`.

# Running the package
Run

```bash
npm install
```

to install all dependencies. Then make sure a MongoDB server is running at `mongodb://localhost:27017` and run `npm start`. The server listens on port 3000.
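Once the server is up, kick off a scrape with a POST to `/scrape`. A sample request (the search term, dates, and email are illustrative; `chunk` is either a number of equal chunks or one of `day`/`week`/`month`/`year`):

```bash
curl -X POST http://localhost:3000/scrape \
  -H "Content-Type: application/json" \
  -d '{"term": "pizza", "startDate": "2018-01-01", "endDate": "2018-02-01", "chunk": "week", "email": "you@example.com"}'
```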
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
{
  "name": "node-twitter-scraper",
  "version": "1.0.0",
  "description": "Scraping Twitter with headless Chrome and Puppeteer",
  "main": "server.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1",
    "start": "node server.js"
  },
  "repository": {
    "type": "git",
    "url": "git+https://github.com/dansalerno712/Node-Twitter-Scraper.git"
  },
  "author": "Dan Salerno",
  "license": "ISC",
  "bugs": {
    "url": "https://github.com/dansalerno712/Node-Twitter-Scraper/issues"
  },
  "homepage": "https://github.com/dansalerno712/Node-Twitter-Scraper#readme",
  "dependencies": {
    "chunk-date-range": "^0.1.0",
    "csv-writer": "^1.0.0",
    "dateformat": "^3.0.3",
    "express": "^4.16.3",
    "mongodb": "^3.1.13",
    "mysql2": "^1.5.3",
    "nodemailer": "^6.4.16",
    "puppeteer": "^1.18.1",
    "sequelize": "^6.0.0"
  }
}

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
#data directory
/data

#csv files
*.csv

#jshint config
.jshintrc

# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*

# Runtime data
pids
*.pid
*.seed
*.pid.lock

# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov

# Coverage directory used by tools like istanbul
coverage

# nyc test coverage
.nyc_output

# Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
.grunt

# Bower dependency directory (https://bower.io/)
bower_components

# node-waf configuration
.lock-wscript

# Compiled binary addons (http://nodejs.org/api/addons.html)
build/Release

# Dependency directories
node_modules/
jspm_packages/

# Typescript v1 declaration files
typings/

# Optional npm cache directory
.npm

# Optional eslint cache
.eslintcache

# Optional REPL history
.node_repl_history

# Output of 'npm pack'
*.tgz

# Yarn Integrity file
.yarn-integrity

# dotenv environment variables file
.env

--------------------------------------------------------------------------------
/server.js:
--------------------------------------------------------------------------------
const express = require('express');
const app = express();
const scraper = require("./scraper");
const toCSV = require("./helpers").toCSV;
const sendEmail = require("./helpers").sendEmail;
const MongoClient = require("mongodb").MongoClient;
const assert = require("assert");

// express >= 4.16 ships its own body parsers, so the standalone
// body-parser package (which isn't in package.json) isn't needed
app.use(express.urlencoded({
    extended: true
}));
app.use(express.json());

var mongoURL = 'mongodb://localhost:27017';

app.get('/', (req, res) => res.send("Henlo World"));

app.post('/scrape', async (req, res) => {
    let term = req.body.term;
    let startDate = req.body.startDate;
    let endDate = req.body.endDate;
    let chunk = req.body.chunk;
    let email = req.body.email;

    // respond immediately; the download link is emailed once scraping finishes
    res.status(200).send("Starting Scraping");

    let ret = await scraper.run(term, startDate, endDate, chunk);

    MongoClient.connect(mongoURL, { useNewUrlParser: true }, (err, client) => {
        assert.equal(null, err);

        var db = client.db('tweetFiles');

        // record for this scrape job; its _id names the csv file
        let file = {
            term: term,
            startDate: startDate,
            endDate: endDate,
        };
        db.collection('files').insertOne(file, (err, response) => {
            if (err) {
                throw err;
            } else {
                let path = "./files/" + response.ops[0]._id + ".csv";
                toCSV(ret, path);
                let link = "http://localhost:3000/download?id=" + response.ops[0]._id;
                sendEmail(email, link);
            }
            // close only after the insert has finished
            client.close();
        });
    });
});

app.get("/download", (req, res) => {
    let id = req.query.id;
    let path = __dirname + "/files/" + id + ".csv";
    res.download(path);
});

app.listen(3000, () => console.log("Listening on 3000"));
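The emailed link points at the `/download` route; fetching it saves the CSV. A sketch (the ObjectId here is illustrative):

```bash
# -O -J saves the file under the name in the Content-Disposition header
curl -O -J "http://localhost:3000/download?id=5b9f8c2e4f1a2b0012345678"
```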
--------------------------------------------------------------------------------
/scraper.js:
--------------------------------------------------------------------------------
const puppeteer = require('puppeteer');
const helpers = require('./helpers');
const toCSV = helpers.toCSV;
const splitDateRange = helpers.splitDateRange;
const autoScroll = helpers.autoScroll;

/*
 * Function that scrapes all the tweets for a Twitter advanced search, splitting
 * the date range into chunks and running one search per chunk
 * @input query: The search query
 * @input startDate: Starting date in the format "YYYY-MM-DD"
 * @input endDate: Ending date in the format "YYYY-MM-DD"
 * @input chunks: Either a number of equal chunks or a string day|week|month|year
 *
 * @return: An array of Tweet objects that contain tweet text, id, timestamp, likes, retweets
 */
async function run(query, startDate, endDate, chunks) {
    // hold results to output to csv
    let ret = [];

    // make sure we encode the query correctly for URLs
    let encodedQuery = encodeURI(query);

    // chunk the dates
    let dateChunks = splitDateRange(startDate, endDate, chunks);

    // hold the urls to parse
    let urls = [];
    for (var i = 0; i < dateChunks.length; i += 1) {
        // put the search parameters into the search url
        urls.push(`https://twitter.com/search?l=&q=${encodedQuery}%20since%3A${dateChunks[i].start}%20until%3A${dateChunks[i].end}&src=typd&lang=en`);
    }

    // launch a headless browser
    const browser = await puppeteer.launch({
        headless: true
    });

    for (let i = 0; i < urls.length; i += 1) {
        let page = await browser.newPage();

        console.log("Starting scraping on " + urls[i]);
        // goto the twitter search page
        await page.goto(urls[i]);

        // set viewport for the autoscroll function
        await page.setViewport({
            width: 1200,
            height: 800
        });

        // scroll until twitter is done lazy loading
        await autoScroll(page);

        // scrape the tweets
        const tweets = await page.evaluate(function() {
            // constant selector for the actual tweets on the screen
            const TWEET_SELECTOR = '.js-stream-tweet';

            // grab the DOM elements for the tweets
            let elements = Array.from(document.querySelectorAll(TWEET_SELECTOR));

            // create an array to return
            let ret = [];

            // get the info from within the tweet DOM elements
            for (var i = 0; i < elements.length; i += 1) {
                // object to store data
                let tweet = {};

                // get text of tweet
                const TWEET_TEXT_SELECTOR = ".tweet-text";
                tweet.text = elements[i].querySelector(TWEET_TEXT_SELECTOR).textContent;

                // get timestamp
                const TWEET_TIMESTAMP_SELECTOR = '.tweet-timestamp';
                tweet.timestamp = elements[i].querySelector(TWEET_TIMESTAMP_SELECTOR).getAttribute('title');

                // get tweet id
                const TWEET_ID_SELECTOR = 'data-tweet-id';
                tweet.id = elements[i].getAttribute(TWEET_ID_SELECTOR);

                // get likes/retweets
                const ACTIONS_SELECTOR = ".ProfileTweet-actionCountForPresentation";
                let actions = elements[i].querySelectorAll(ACTIONS_SELECTOR);

                // in Twitter's markup, retweets are the 2nd action count and
                // likes are the 4th; an empty count means zero
                tweet.retweets = actions[1] && actions[1].innerHTML ? actions[1].innerHTML : 0;
                tweet.likes = actions[3] && actions[3].innerHTML ? actions[3].innerHTML : 0;

                // add tweet data to return array
                ret.push(tweet);
            }
            return ret;
        });

        // collect this chunk's tweets
        ret.push(tweets);

        // close the page
        await page.close();
    }

    // exit the browser
    await browser.close();

    // collapse into one array and return
    return [].concat.apply([], ret);
}

module.exports.run = run;
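A minimal usage sketch of the exported `run` function outside the server (the query and dates are illustrative):

```js
const scraper = require('./scraper');

// split January 2018 into weekly searches and log how many tweets came back
scraper.run('pizza', '2018-01-01', '2018-02-01', 'week')
    .then(tweets => console.log(`Scraped ${tweets.length} tweets`))
    .catch(err => console.error(err));
```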
"Timestamp" 52 | }, { 53 | id: "id", 54 | title: "ID" 55 | }, { 56 | id: "retweets", 57 | title: "Retweets" 58 | }, { 59 | id: "likes", 60 | title: "Likes" 61 | }] 62 | }); 63 | 64 | // output to csv 65 | csvWriter.writeRecords(tweets) 66 | .then(() => { 67 | console.log("Done writing to csv"); 68 | }); 69 | }, 70 | /* 71 | * Function to split the Start/End Date into either chunks or by Date/Week/Month/Year 72 | * @input startDate: A string in the format YYYY/MM/DD 73 | * @input endDate: A string in the format YYYY/MM/DD 74 | * @input chunks: Either a number that specifies how many equal chunks the user wants to 75 | * split the date range into or a String day|week|month|year that splits the date range that way 76 | * 77 | * @return: An array of {startDate, endDate} objects where start and end date are in the format 78 | * of a YYYY/MM/DD string 79 | */ 80 | "splitDateRange": function(startDate, endDate, chunks) { 81 | let start = new Date(startDate); 82 | let end = new Date(endDate); 83 | let ret = chunk(start, end, chunks); 84 | return ret.map(function(dateRange) { 85 | return { 86 | 'start': dateformat(dateRange.start, "yyyy-mm-dd"), 87 | 'end': dateformat(dateRange.end, "yyyy-mm-dd") 88 | }; 89 | }); 90 | }, 91 | /* 92 | * Function to scroll on a page until all lazy loading has been done 93 | * @input page: the page you want to scroll on 94 | */ 95 | "autoScroll": function(page) { 96 | // evaluate some javascript 97 | return page.evaluate(function() { 98 | return new Promise(function(resolve, reject) { 99 | let totalHeight = 0; 100 | 101 | //distance per scroll 102 | let distance = 1000; 103 | let timer = setInterval(function() { 104 | //get current height 105 | let scrollHeight = document.body.scrollHeight; 106 | 107 | //scroll and increment 108 | window.scrollBy(0, distance); 109 | totalHeight += distance; 110 | 111 | //if we didnt scroll, lazy loading must be done, so return 112 | if (totalHeight >= scrollHeight) { 113 | clearInterval(timer); 114 | resolve(); 115 | } 116 | //how long to wait between scrolls 117 | }, 1000); 118 | }); 119 | }); 120 | } 121 | }; --------------------------------------------------------------------------------