├── constants.js
├── README.md
├── package.json
├── .gitignore
├── server.js
├── scraper.js
└── helpers.js

--------------------------------------------------------------------------------
/constants.js:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Node-Twitter-Scraper
Scraping Twitter using Puppeteer and Node.

This is a work in progress.

Before running, enter the address and password of the Gmail account used to send download links into `helpers.js`.

# Running the package
Run

```bash
npm install
```

to install all dependencies. Then make sure a MongoDB server is running at `mongodb://localhost:27017` and run `npm start`. The server listens on port 3000.
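Once the server is up, kick off a scrape with a POST to `/scrape`. A sample request (the search term, dates, and email are illustrative; `chunk` is either a number of equal chunks or one of `day`/`week`/`month`/`year`):

```bash
curl -X POST http://localhost:3000/scrape \
  -H "Content-Type: application/json" \
  -d '{"term": "pizza", "startDate": "2018-01-01", "endDate": "2018-02-01", "chunk": "week", "email": "you@example.com"}'
```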
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
{
  "name": "node-twitter-scraper",
  "version": "1.0.0",
  "description": "Scraping Twitter with headless Chrome and Puppeteer",
  "main": "server.js",
  "scripts": {
    "test": "echo \"Error: no test specified\" && exit 1",
    "start": "node server.js"
  },
  "repository": {
    "type": "git",
    "url": "git+https://github.com/dansalerno712/Node-Twitter-Scraper.git"
  },
  "author": "Dan Salerno",
  "license": "ISC",
  "bugs": {
    "url": "https://github.com/dansalerno712/Node-Twitter-Scraper/issues"
  },
  "homepage": "https://github.com/dansalerno712/Node-Twitter-Scraper#readme",
  "dependencies": {
    "chunk-date-range": "^0.1.0",
    "csv-writer": "^1.0.0",
    "dateformat": "^3.0.3",
    "express": "^4.16.3",
    "mongodb": "^3.1.13",
    "mysql2": "^1.5.3",
    "nodemailer": "^6.4.16",
    "puppeteer": "^1.18.1",
    "sequelize": "^6.0.0"
  }
}

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
#data directory
/data

#csv files
*.csv

#jshint config
.jshintrc

# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*

# Runtime data
pids
*.pid
*.seed
*.pid.lock

# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov

# Coverage directory used by tools like istanbul
coverage

# nyc test coverage
.nyc_output

# Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
.grunt

# Bower dependency directory (https://bower.io/)
bower_components

# node-waf configuration
.lock-wscript

# Compiled binary addons (http://nodejs.org/api/addons.html)
build/Release

# Dependency directories
node_modules/
jspm_packages/

# Typescript v1 declaration files
typings/

# Optional npm cache directory
.npm

# Optional eslint cache
.eslintcache

# Optional REPL history
.node_repl_history

# Output of 'npm pack'
*.tgz

# Yarn Integrity file
.yarn-integrity

# dotenv environment variables file
.env

--------------------------------------------------------------------------------
/server.js:
--------------------------------------------------------------------------------
const express = require('express');
const app = express();
const scraper = require("./scraper");
const toCSV = require("./helpers").toCSV;
const sendEmail = require("./helpers").sendEmail;
const MongoClient = require("mongodb").MongoClient;
const assert = require("assert");

// express >= 4.16 ships its own body parsers, so the standalone
// body-parser package (which isn't in package.json) isn't needed
app.use(express.urlencoded({
    extended: true
}));
app.use(express.json());

var mongoURL = 'mongodb://localhost:27017';

app.get('/', (req, res) => res.send("Henlo World"));

app.post('/scrape', async (req, res) => {
    let term = req.body.term;
    let startDate = req.body.startDate;
    let endDate = req.body.endDate;
    let chunk = req.body.chunk;
    let email = req.body.email;

    // respond immediately; the download link is emailed once scraping finishes
    res.status(200).send("Starting Scraping");

    let ret = await scraper.run(term, startDate, endDate, chunk);

    MongoClient.connect(mongoURL, { useNewUrlParser: true }, (err, client) => {
        assert.equal(null, err);

        var db = client.db('tweetFiles');

        // record for this scrape job; its _id names the csv file
        let file = {
            term: term,
            startDate: startDate,
            endDate: endDate,
        };
        db.collection('files').insertOne(file, (err, response) => {
            if (err) {
                throw err;
            } else {
                let path = "./files/" + response.ops[0]._id + ".csv";
                toCSV(ret, path);
                let link = "http://localhost:3000/download?id=" + response.ops[0]._id;
                sendEmail(email, link);
            }
            // close only after the insert has finished
            client.close();
        });
    });
});

app.get("/download", (req, res) => {
    let id = req.query.id;
    let path = __dirname + "/files/" + id + ".csv";
    res.download(path);
});

app.listen(3000, () => console.log("Listening on 3000"));
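The emailed link points at the `/download` route; fetching it saves the CSV. A sketch (the ObjectId here is illustrative):

```bash
# -O -J saves the file under the name in the Content-Disposition header
curl -O -J "http://localhost:3000/download?id=5b9f8c2e4f1a2b0012345678"
```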
--------------------------------------------------------------------------------
/scraper.js:
--------------------------------------------------------------------------------
const puppeteer = require('puppeteer');
const helpers = require('./helpers');
const toCSV = helpers.toCSV;
const splitDateRange = helpers.splitDateRange;
const autoScroll = helpers.autoScroll;

/*
 * Function that scrapes all the tweets for a Twitter advanced search, splitting
 * the date range into chunks and running one search per chunk
 * @input query: The search query
 * @input startDate: Starting date in the format "YYYY-MM-DD"
 * @input endDate: Ending date in the format "YYYY-MM-DD"
 * @input chunks: Either a number of equal chunks or a string day|week|month|year
 *
 * @return: An array of Tweet objects that contain tweet text, id, timestamp, likes, retweets
 */
async function run(query, startDate, endDate, chunks) {
    // hold results to output to csv
    let ret = [];

    // make sure we encode the query correctly for URLs
    let encodedQuery = encodeURI(query);

    // chunk the dates
    let dateChunks = splitDateRange(startDate, endDate, chunks);

    // hold the urls to parse
    let urls = [];
    for (var i = 0; i < dateChunks.length; i += 1) {
        // put the search parameters into the search url
        urls.push(`https://twitter.com/search?l=&q=${encodedQuery}%20since%3A${dateChunks[i].start}%20until%3A${dateChunks[i].end}&src=typd&lang=en`);
    }

    // launch a headless browser
    const browser = await puppeteer.launch({
        headless: true
    });

    for (let i = 0; i < urls.length; i += 1) {
        let page = await browser.newPage();

        console.log("Starting scraping on " + urls[i]);
        // goto the twitter search page
        await page.goto(urls[i]);

        // set viewport for the autoscroll function
        await page.setViewport({
            width: 1200,
            height: 800
        });

        // scroll until twitter is done lazy loading
        await autoScroll(page);

        // scrape the tweets
        const tweets = await page.evaluate(function() {
            // constant selector for the actual tweets on the screen
            const TWEET_SELECTOR = '.js-stream-tweet';

            // grab the DOM elements for the tweets
            let elements = Array.from(document.querySelectorAll(TWEET_SELECTOR));

            // create an array to return
            let ret = [];

            // get the info from within the tweet DOM elements
            for (var i = 0; i < elements.length; i += 1) {
                // object to store data
                let tweet = {};

                // get text of tweet
                const TWEET_TEXT_SELECTOR = ".tweet-text";
                tweet.text = elements[i].querySelector(TWEET_TEXT_SELECTOR).textContent;

                // get timestamp
                const TWEET_TIMESTAMP_SELECTOR = '.tweet-timestamp';
                tweet.timestamp = elements[i].querySelector(TWEET_TIMESTAMP_SELECTOR).getAttribute('title');

                // get tweet id
                const TWEET_ID_SELECTOR = 'data-tweet-id';
                tweet.id = elements[i].getAttribute(TWEET_ID_SELECTOR);

                // get likes/retweets
                const ACTIONS_SELECTOR = ".ProfileTweet-actionCountForPresentation";
                let actions = elements[i].querySelectorAll(ACTIONS_SELECTOR);

                // in Twitter's markup, retweets are the 2nd action count and
                // likes are the 4th; an empty count means zero
                tweet.retweets = actions[1] && actions[1].innerHTML ? actions[1].innerHTML : 0;
                tweet.likes = actions[3] && actions[3].innerHTML ? actions[3].innerHTML : 0;

                // add tweet data to return array
                ret.push(tweet);
            }
            return ret;
        });

        // collect this chunk's tweets
        ret.push(tweets);

        // close the page
        await page.close();
    }

    // exit the browser
    await browser.close();

    // collapse into one array and return
    return [].concat.apply([], ret);
}

module.exports.run = run;
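A minimal usage sketch of the exported `run` function outside the server (the query and dates are illustrative):

```js
const scraper = require('./scraper');

// split January 2018 into weekly searches and log how many tweets came back
scraper.run('pizza', '2018-01-01', '2018-02-01', 'week')
    .then(tweets => console.log(`Scraped ${tweets.length} tweets`))
    .catch(err => console.error(err));
```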
"Timestamp" 52 | }, { 53 | id: "id", 54 | title: "ID" 55 | }, { 56 | id: "retweets", 57 | title: "Retweets" 58 | }, { 59 | id: "likes", 60 | title: "Likes" 61 | }] 62 | }); 63 | 64 | // output to csv 65 | csvWriter.writeRecords(tweets) 66 | .then(() => { 67 | console.log("Done writing to csv"); 68 | }); 69 | }, 70 | /* 71 | * Function to split the Start/End Date into either chunks or by Date/Week/Month/Year 72 | * @input startDate: A string in the format YYYY/MM/DD 73 | * @input endDate: A string in the format YYYY/MM/DD 74 | * @input chunks: Either a number that specifies how many equal chunks the user wants to 75 | * split the date range into or a String day|week|month|year that splits the date range that way 76 | * 77 | * @return: An array of {startDate, endDate} objects where start and end date are in the format 78 | * of a YYYY/MM/DD string 79 | */ 80 | "splitDateRange": function(startDate, endDate, chunks) { 81 | let start = new Date(startDate); 82 | let end = new Date(endDate); 83 | let ret = chunk(start, end, chunks); 84 | return ret.map(function(dateRange) { 85 | return { 86 | 'start': dateformat(dateRange.start, "yyyy-mm-dd"), 87 | 'end': dateformat(dateRange.end, "yyyy-mm-dd") 88 | }; 89 | }); 90 | }, 91 | /* 92 | * Function to scroll on a page until all lazy loading has been done 93 | * @input page: the page you want to scroll on 94 | */ 95 | "autoScroll": function(page) { 96 | // evaluate some javascript 97 | return page.evaluate(function() { 98 | return new Promise(function(resolve, reject) { 99 | let totalHeight = 0; 100 | 101 | //distance per scroll 102 | let distance = 1000; 103 | let timer = setInterval(function() { 104 | //get current height 105 | let scrollHeight = document.body.scrollHeight; 106 | 107 | //scroll and increment 108 | window.scrollBy(0, distance); 109 | totalHeight += distance; 110 | 111 | //if we didnt scroll, lazy loading must be done, so return 112 | if (totalHeight >= scrollHeight) { 113 | clearInterval(timer); 114 | resolve(); 115 | } 116 | //how long to wait between scrolls 117 | }, 1000); 118 | }); 119 | }); 120 | } 121 | }; --------------------------------------------------------------------------------