├── requirements.txt
├── .gitignore
├── lib
    ├── browser.js
    ├── messages_html_parse.py
    └── get_channel_messages.js
├── package.json
└── README.md


/requirements.txt:
--------------------------------------------------------------------------------
1 | beautifulsoup4
2 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Node modules
 2 | node_modules/
 3 | 
 4 | # debug images
 5 | debug/*.png
 6 | 
 7 | # data folders
 8 | data/*
 9 | output/*
10 | 
11 | .DS_Store
12 | package-lock.json
13 | 
14 | # env file
15 | .env
16 | 
17 | # venv
18 | venv/
19 | 


--------------------------------------------------------------------------------
/lib/browser.js:
--------------------------------------------------------------------------------
 1 | const puppeteer = require('puppeteer');
 2 | 
 3 | async function startBrowser(){
 4 | 	let browser;
 5 | 	try {
 6 | 	    console.log("Opening the browser......");
 7 | 	    browser = await puppeteer.launch({
 8 | 	        headless: false,
 9 | 	        args: [
10 | 				// '--start-fullscreen', 
11 | 			'--disable-setuid-sandbox'
12 | 			],
13 | 	        'ignoreHTTPSErrors': true
14 | 	    });
15 | 	} catch (err) {
16 | 	    console.log("Could not create a browser instance => : ", err);
17 | 	}
18 | 	return browser;
19 | }
20 | 
21 | module.exports = {
22 | 	startBrowser
23 | };


--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "francis",
 3 |   "version": "24.2.17",
 4 |   "description": "scrape data from slack",
 5 |   "main": "",
 6 |   "scripts": {
 7 |     "test": "echo \"Error: no test specified\" && exit 1"
 8 |   },
 9 |   "repository": {
10 |     "type": "git",
11 |     "url": "git+https://github.com/francisbrero/slack-information-scraper.git"
12 |   },
13 |   "keywords": [
14 |     "slack",
15 |     "scraper",
16 |     "puppeteer",
17 |     "browser automation"
18 |   ],
19 |   "author": "francis@madkudu.com",
20 |   "license": "ISC",
21 |   "bugs": {
22 |     "url": "https://github.com/francisbrero/slack-information-scraper/issues"
23 |   },
24 |   "homepage": "https://github.com/francisbrero/slack-information-scraper#readme",
25 |   "engines": {
26 |     "node": "21.2.0",
27 |     "npm": "10.2.3"
28 |   },
29 |   "dependencies": {
30 |     "puppeteer": "22.0.0",
31 |     "dotenv": "16.3.1"
32 |   }
33 | }
34 | 


--------------------------------------------------------------------------------
/lib/messages_html_parse.py:
--------------------------------------------------------------------------------
 1 | from bs4 import BeautifulSoup
 2 | 
 3 | # Create the output csv file and create headers
 4 | f = open('./output/slack_scrape.csv', 'w+')
 5 | f.write('"name", "message", "date"\n')
 6 | 
 7 | # Open file and parse
 8 | with open('./data/C0284GBS76G_20240217.html', 'rb') as file:
 9 |     soup = BeautifulSoup(file,"html.parser")
10 | 
11 | cnt = 0
12 | err = 0
13 | 
14 | # Find all member DOM elements
15 | elements = soup.find_all("div", class_="c-message_kit__gutter__right".split())
16 | 
17 | # Process them
18 | for member in elements:
19 | 	# Get the name of the user
20 | 	name = member.find("span", class_="c-message__sender c-message_kit__sender")
21 | 	
22 | 	# Get the date of the message
23 | 	date_a = member.find_all("a")[0]
24 | 	# get aria-label attribute as the date
25 | 	date = date_a['aria-label']
26 | 
27 | 	# Get the message
28 | 	message_div = member.find("div", class_="c-message_kit__blocks c-message_kit__blocks--rich_text")
29 | 	# print(message_div)
30 | 	# get all the text in the message and join it
31 | 	message = ""
32 | 	for text in message_div.strings:
33 | 		message += text
34 | 	
35 | 	# Return results
36 | 	try:
37 | 		f.write('"' + name.string + '", "' + message + '", "' + date + '"\n')
38 | 		cnt =+1
39 | 	except:
40 | 		print("error with user " + name.string)
41 | 		err =+1
42 | res = "inserted " + str(cnt) + " successfully records into the file, there were " + str(err) + " errors"
43 | print(res)


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # slack-information-scraper
 2 | 
 3 | Get messages from a slack channel and parse them into a csv file
 4 | 
 5 | ## Initialization
 6 | 
 7 | ### install dependencies for the project
 8 | 
 9 | ```
10 | npm install
11 | ```
12 | 
13 | ## Usage
14 | 
15 | Configure the `.env` file with your Slack team name and ID, and your Slack email and password.
16 | It should look something like this:
17 | 
18 | ```
19 | SLACK_INSTANCE_NAME=
20 | SLACK_INSTANCE_ID=
21 | SLACK_USERNAME=
22 | SLACK_PASSWORD=
23 | ```
24 | 
25 | ## Scraping
26 | 
27 | ### Get messages from channel
28 | 
29 | Specify the channel you want to parse
30 | 
31 | ``` bash
32 | node ./lib/get_channel_messages.js channel_id
33 | ```
34 | 
35 | ex: `node ./lib/get_channel_messages.js C0284GBS76G`
36 | 
37 | ### Debug
38 | 
39 | The scraper saves a few screenshots in `./debug` so you can check that everything went smoothly.
40 | 
41 | ### Output
42 | 
43 | The scraper outputs a raw HTML file in `./data`, named with today's date, which contains all the goodness you're looking for.
44 | 
45 | ## Step 3 => parse the html to get a clean csv
46 | 
47 | ### start a virtual environment
48 | 
49 | ```bash
50 | python3 -m venv venv
51 | source venv/bin/activate
52 | ```
53 | 
54 | ### install the requirements
55 | 
56 | ```bash
57 | pip install -r requirements.txt
58 | ```
59 | 
60 | ### run the parser
61 | 
62 | ```bash
63 | python lib/messages_html_parse.py
64 | ```
65 | 
66 | This outputs a text-qualified CSV file, `./output/slack_scrape.csv`.
67 | 
68 | Enjoy!
69 | 
70 | ### Deactivate the virtual environment
71 | 
72 | ```bash
73 | deactivate
74 | ```
75 | 
76 | ## Comments/feedback
77 | 
78 | are welcome!
79 | 


--------------------------------------------------------------------------------
/lib/get_channel_messages.js:
--------------------------------------------------------------------------------
  1 | const puppeteer = require('puppeteer');
  2 | const fs = require('fs');
  3 | const browserObject = require('./browser');
  4 | 
  5 | require('dotenv').config();
  6 | var slack_instance_name = process.env.SLACK_INSTANCE_NAME;
  7 | var slack_instance_id = process.env.SLACK_INSTANCE_ID;
  8 | var email = process.env.SLACK_USERNAME;
  9 | var pwd = process.env.SLACK_PASSWORD;
 10 | 
 11 | var url = 'https://'+slack_instance_name+'.slack.com/sign_in_with_password'
 12 | 
 13 | var channel = process.argv[2];
 14 | 
 15 | var main = (async () => {  
 16 |   let browserInstance = browserObject.startBrowser();
 17 |   let browser;
 18 |   try{
 19 |       browser = await browserInstance;
 20 |       console.log('slack instance name: ' + slack_instance_name);
 21 |   
 22 |       let page = await browser.newPage();
 23 |       // close the pop-up asking us to open Slack app
 24 |       page.on('dialog', async dialog => {
 25 |         await page.screenshot({ path: './debug/page_pop_up_dialog.png' });
 26 |         console.log(dialog.message());
 27 |         await dialog.dismiss();
 28 |       });
 29 |       page.on('alert', async alert => {
 30 |         await page.screenshot({ path: './debug/page_pop_up_alert.png' });
 31 |         console.log(alert.message());
 32 |         await alert.dismiss();
 33 |       });
 34 |       page.on('confirm', async confirm => {
 35 |         await page.screenshot({ path: './debug/page_pop_up_confirm.png' });
 36 |         console.log(confirm.message());
 37 |         await confirm.dismiss();
 38 |       });
 39 |       page.setDefaultNavigationTimeout(60000); // Increase to 60 seconds
 40 |       console.log('we are headed to: ' + url);
 41 |       await page.goto(url);
 42 |       // make sure we're ready to log in
 43 |       await page.waitForSelector('input[id="email"]')
 44 |       await page.screenshot({ path: './debug/login_page.png' });
 45 |       // input values
 46 |       await page.waitForSelector('input[id="email"]');
 47 |       await page.type('input[id="email"]', email);
 48 |       await page.screenshot({ path: './debug/login_page_email.png' });
 49 |       await page.waitForSelector('input[id="password"]');
 50 |       await page.type('input[id="password"]', pwd);
 51 |       await page.screenshot({ path: './debug/login_page_pwd.png' });
 52 |       await page.click('button[id="signin_btn"]');
 53 |       console.log('signing in, waiting for navigation');
 54 |       await page.waitForNavigation();
 55 |       console.log('boom, we are in!');
 56 | 
 57 |       // head to the intro channel
 58 |       url = 'https://app.slack.com/client/'+slack_instance_id+'/'+channel+'/';
 59 |       console.log('we are headed to: ' + url);
 60 |       await page.goto(url);
 61 |       console.log('we are in the channel, waiting for navigation');
 62 |       await page.waitForNavigation();
 63 | 
 64 |       // check that we made it to the channel
 65 |       if (page.url().indexOf('login') > 0) {
 66 |         console.log('we are not getting past the login page');
 67 |         await page.screenshot({ path: './debug/not_thru_login.png' });
 68 |         await browser.close();
 69 |         return;
 70 |       };
 71 |       console.log('we are in the channel, waiting for the content to load');
 72 | 
 73 |       // wait for all data to be loaded and ensure we don't timeout
 74 |       await page.waitForSelector('div[data-qa="slack_kit_scrollbar"]', { timeout: 6000000 });
 75 |       console.log('we are in the channel, we see the scrollbar');
 76 |       await page.screenshot({ path: './debug/channel.png' });
 77 | 
 78 |       // scroll to the top of the page
 79 |       console.log('scrolling to the top of the page');
 80 |       var top = false;
 81 |       var elHandleArray;
 82 |       var elHandleArray_previous;
 83 |       const senders = [];
 84 |       const messages = [];
 85 |       i = 0;
 86 |       // scroll until we see the element span[data-qa="inline_channel_entity__name"]
 87 |       while (top == false && i < 25) {
 88 |         await page.focus('div[class="c-virtual_list__item"]');
 89 |         await page.keyboard.press('PageUp'); // alternative ArrowUp
 90 |         // get all the messages
 91 |         elHandleArray = await page.$$('div[class="c-message_kit__gutter__right"]')
 92 |         // check that we have some new messages to parse, otherwise skip
 93 |         if (elHandleArray != elHandleArray_previous || i == 0) {
 94 |           elHandleArray.forEach(async el => {
 95 |             const sender = await el.evaluate(el => el.getElementsByClassName('c-message__sender c-message_kit__sender')[0].textContent);
 96 |             senders.push(sender);
 97 |             const message = await el.evaluate(el => el.getElementsByClassName('c-message_kit__blocks c-message_kit__blocks--rich_text')[0].textContent);
 98 |             messages.push(message);
 99 |           })
100 |           // write the html to a file with today's date
101 |           var html = await page.content();
102 |           filename = './data/channel_'+ i.toString() + '.html';
103 |           await page.screenshot({ path: './debug/channel_'+ i.toString() + '.png' });
104 |           
105 | 
106 |           // fs.writeFile(filename, html, (err) => {
107 |           //   if (err)
108 |           //     console.log(err);
109 |           //   else {
110 |           //     console.log("File written successfully\n");
111 |           //   }
112 |           // });
113 |         }
114 | 
115 |         i = i + 1;
116 |         elHandleArray_previous = elHandleArray;
117 |         // we define the top of the channel as the place where we see the title of the channel
118 |         if (await page.$('span[data-qa="inline_channel_entity__name"]')) {
119 |           console.log('we are at the top');
120 |           top = true;
121 |         }
122 |       }
123 |       await page.screenshot({ path: './debug/channel_top.png' });
124 |       
125 |       // write all the senders and messages to a csv file where row i would be sender i and message i
126 |       var data = [];
127 |       for (i = 0; i < senders.length; i++) {
128 |         data.push([senders[i], messages[i]]);
129 |       }
130 |       
131 |       // write the data to a csv file, add double quotes around the message to ensure that commas in the message do not break the csv
132 |       filename = './data/'+channel+'.csv';
133 |       var csvContent = "sender,message\r\n";
134 |       data.forEach(function(rowArray){
135 |         let row = rowArray[0] + ',' + '"' + rowArray[1] + '"';
136 |         csvContent += row + "\r\n";
137 |       });
138 |       // write the csv to a file, 
139 |       fs.writeFile
140 |       (filename, csvContent, (err) => {
141 |         if (err)
142 |           console.log(err);
143 |         else {
144 |           console.log("File written successfully\n");
145 |         }
146 |       });
147 | 
148 | 
149 |     }
150 |     catch(err){
151 |         console.log(err);
152 |     };
153 | 
154 |     // close the browser
155 |     await browser.close();
156 |     console.log('I am done')
157 |  });
158 | 
159 | main()
160 |   .catch((e) => console.log('err: ' + e));
161 | 


--------------------------------------------------------------------------------