├── requirements.txt ├── .gitignore ├── lib ├── browser.js ├── messages_html_parse.py └── get_channel_messages.js ├── package.json └── README.md /requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Node modules 2 | node_modules/ 3 | 4 | # debug images 5 | debug/*.png 6 | 7 | # data folders 8 | data/* 9 | output/* 10 | 11 | .DS_store 12 | package-lock.json 13 | 14 | # env file 15 | .env 16 | 17 | # venv 18 | venv/ 19 | -------------------------------------------------------------------------------- /lib/browser.js: -------------------------------------------------------------------------------- 1 | const puppeteer = require('puppeteer'); 2 | 3 | async function startBrowser(){ 4 | let browser; 5 | try { 6 | console.log("Opening the browser......"); 7 | browser = await puppeteer.launch({ 8 | headless: false, 9 | args: [ 10 | // '--start-fullscreen', 11 | '--disable-setuid-sandbox' 12 | ], 13 | 'ignoreHTTPSErrors': true 14 | }); 15 | } catch (err) { 16 | console.log("Could not create a browser instance => : ", err); 17 | } 18 | return browser; 19 | } 20 | 21 | module.exports = { 22 | startBrowser 23 | }; -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "francis", 3 | "version": "24.2.17", 4 | "description": "scrape data from slack", 5 | "main": "", 6 | "scripts": { 7 | "test": "echo \"Error: no test specified\" && exit 1" 8 | }, 9 | "repository": { 10 | "type": "git", 11 | "url": "git+https://github.com/francisbrero/slack-information-scraper.git" 12 | }, 13 | "keywords": [ 14 | "slack", 15 | "scraper", 16 | "puppeteer", 17 | "browser automation" 18 | ], 19 | "author": 
"francis@madkudu.com", 20 | "license": "ISC", 21 | "bugs": { 22 | "url": "https://github.com/francisbrero/slack-information-scraper/issues" 23 | }, 24 | "homepage": "https://github.com/francisbrero/slack-information-scraper#readme", 25 | "engines": { 26 | "node": "21.2.0", 27 | "npm": "10.2.3" 28 | }, 29 | "dependencies": { 30 | "puppeteer": "22.0.0", 31 | "dotenv": "16.3.1" 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /lib/messages_html_parse.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | 3 | # Create the output csv file and create headers 4 | f = open('./output/slack_scrape.csv', 'w+') 5 | f.write('"name", "message", "date"\n') 6 | 7 | # Open file and parse 8 | with open('./data/C0284GBS76G_20240217.html', 'rb') as file: 9 | soup = BeautifulSoup(file,"html.parser") 10 | 11 | cnt = 0 12 | err = 0 13 | 14 | # Find all member DOM elements 15 | elements = soup.find_all("div", class_="c-message_kit__gutter__right".split()) 16 | 17 | # Process them 18 | for member in elements: 19 | # Get the name of the user 20 | name = member.find("span", class_="c-message__sender c-message_kit__sender") 21 | 22 | # Get the date of the message 23 | date_a = member.find_all("a")[0] 24 | # get aria-label attribute as the date 25 | date = date_a['aria-label'] 26 | 27 | # Get the message 28 | message_div = member.find("div", class_="c-message_kit__blocks c-message_kit__blocks--rich_text") 29 | # print(message_div) 30 | # get all the text in the message and join it 31 | message = "" 32 | for text in message_div.strings: 33 | message += text 34 | 35 | # Return results 36 | try: 37 | f.write('"' + name.string + '", "' + message + '", "' + date + '"\n') 38 | cnt =+1 39 | except: 40 | print("error with user " + name.string) 41 | err =+1 42 | res = "inserted " + str(cnt) + " successfully records into the file, there were " + str(err) + " errors" 43 | print(res) 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # slack-information-scraper 2 | 3 | Get messages from a Slack channel and parse them into a CSV file. 4 | 5 | ## Initialization 6 | 7 | ### Install dependencies for the project 8 | 9 | ``` 10 | npm install 11 | ``` 12 | 13 | ## Usage 14 | 15 | Configure the `.env` file with your Slack instance name, instance ID, email and password. 16 | It should look something like this: 17 | 18 | ``` 19 | SLACK_INSTANCE_NAME= 20 | SLACK_INSTANCE_ID= 21 | SLACK_USERNAME= 22 | SLACK_PASSWORD= 23 | ``` 24 | 25 | ## Scraping 26 | 27 | ### Get messages from channel 28 | 29 | Specify the ID of the channel you want to scrape: 30 | 31 | ``` bash 32 | node ./lib/get_channel_messages.js channel_id 33 | ``` 34 | 35 | ex: `node ./lib/get_channel_messages.js C0284GBS76G` 36 | 37 | ### Debug 38 | 39 | The scraper saves a few screenshots in `./debug` so you can check that everything went smoothly. 40 | 41 | ### Output 42 | 43 | The scraper outputs a raw HTML file in `./data`, named with today's date, which contains all the goodness you're looking for. 44 | 45 | ## Parsing the HTML into a clean CSV 46 | 47 | ### Start a virtual environment 48 | 49 | ```bash 50 | python3 -m venv venv 51 | source venv/bin/activate 52 | ``` 53 | 54 | ### Install the requirements 55 | 56 | ```bash 57 | pip install -r requirements.txt 58 | ``` 59 | 60 | ### Run the parser 61 | 62 | ```bash 63 | python lib/messages_html_parse.py 64 | ``` 65 | 66 | This outputs a text-qualified CSV file, `./output/slack_scrape.csv`. 67 | 68 | Enjoy! 69 | 70 | ### Deactivate the virtual environment 71 | 72 | ```bash 73 | deactivate 74 | ``` 75 | 76 | ## Comments/feedback 77 | 78 | Comments and feedback are welcome! 
// lib/get_channel_messages.js
//
// Signs in to a Slack workspace with Puppeteer, opens one channel, pages up
// through the message list and writes the sender/message pairs it saw to
// ./data/<channel_id>.csv. A screenshot of each step is saved in ./debug.
//
// Usage: node ./lib/get_channel_messages.js <channel_id> [max_page_ups]

const puppeteer = require('puppeteer');
const fs = require('fs');
const browserObject = require('./browser');

require('dotenv').config();
const slack_instance_name = process.env.SLACK_INSTANCE_NAME;
const slack_instance_id = process.env.SLACK_INSTANCE_ID;
const email = process.env.SLACK_USERNAME;
const pwd = process.env.SLACK_PASSWORD;

const channel = process.argv[2];

const DEBUG_DIR = './debug';
const DATA_DIR = './data';
const MESSAGE_SELECTOR = 'div[class="c-message_kit__gutter__right"]';

// Upper bound on PageUp presses; an optional second CLI argument overrides it.
const DEFAULT_MAX_PAGE_UPS = 25;
const requestedPageUps = Number.parseInt(process.argv[3], 10);
const maxPageUps = requestedPageUps > 0 ? requestedPageUps : DEFAULT_MAX_PAGE_UPS;

/**
 * List the required settings that are empty or undefined.
 * @returns {string[]} names of the missing environment variables.
 */
function findMissingSettings() {
    const required = {
        SLACK_INSTANCE_NAME: slack_instance_name,
        SLACK_INSTANCE_ID: slack_instance_id,
        SLACK_USERNAME: email,
        SLACK_PASSWORD: pwd,
    };
    return Object.keys(required).filter((key) => !required[key]);
}

/**
 * Quote one CSV field, doubling embedded double quotes so that commas,
 * quotes and line breaks inside the value cannot break the row.
 * @param {*} value - field value; null/undefined become an empty field.
 * @returns {string} the quoted field.
 */
function toCsvField(value) {
    return '"' + String(value ?? '').replaceAll('"', '""') + '"';
}

/**
 * Build the CSV text: a `sender,message` header plus one CRLF-terminated
 * line per row.
 * @param {{sender: string, message: string}[]} rows
 * @returns {string}
 */
function buildCsv(rows) {
    const lines = rows.map(({ sender, message }) => toCsvField(sender) + ',' + toCsvField(message));
    return ['sender,message', ...lines].join('\r\n') + '\r\n';
}

/**
 * Read every message currently present in the page in one round trip.
 * Missing parts come back as empty strings instead of throwing.
 * @param {object} page - Puppeteer page showing the channel.
 * @returns {Promise<{sender: string, message: string, stamp: string}[]>}
 */
function readVisibleMessages(page) {
    // The callback runs inside the browser, so it must be self-contained.
    return page.$$eval(MESSAGE_SELECTOR, (nodes) => nodes.map((node) => {
        const textOf = (className) => {
            const match = node.getElementsByClassName(className)[0];
            return match ? match.textContent : '';
        };
        const firstLink = node.querySelector('a');
        return {
            sender: textOf('c-message__sender c-message_kit__sender'),
            message: textOf('c-message_kit__blocks c-message_kit__blocks--rich_text'),
            // The first link's aria-label is used only to tell apart
            // otherwise identical messages.
            stamp: (firstLink && firstLink.getAttribute('aria-label')) || '',
        };
    }));
}

/**
 * Sign in with the credentials from the environment.
 * @param {object} page - Puppeteer page.
 */
async function signIn(page) {
    const signInUrl = 'https://' + slack_instance_name + '.slack.com/sign_in_with_password';
    console.log('we are headed to: ' + signInUrl);
    await page.goto(signInUrl);

    // make sure we're ready to log in
    await page.waitForSelector('input[id="email"]');
    await page.screenshot({ path: DEBUG_DIR + '/login_page.png' });

    await page.type('input[id="email"]', email);
    await page.screenshot({ path: DEBUG_DIR + '/login_page_email.png' });
    await page.waitForSelector('input[id="password"]');
    await page.type('input[id="password"]', pwd);
    await page.screenshot({ path: DEBUG_DIR + '/login_page_pwd.png' });

    console.log('signing in, waiting for navigation');
    // Start waiting before clicking, otherwise a fast navigation can finish
    // before the wait is registered and the wait then times out.
    await Promise.all([
        page.waitForNavigation(),
        page.click('button[id="signin_btn"]'),
    ]);
    console.log('boom, we are in!');
}

/**
 * Press PageUp until the channel title is visible or the page-up limit is
 * reached, collecting each distinct message once.
 * @param {object} page - Puppeteer page showing the channel.
 * @returns {Promise<{sender: string, message: string}[]>} rows in the order
 *   they were first seen.
 */
async function collectMessages(page) {
    const rows = [];
    const seen = new Set();
    let reachedTop = false;

    for (let step = 0; step < maxPageUps && !reachedTop; step += 1) {
        await page.focus('div[class="c-virtual_list__item"]');
        await page.keyboard.press('PageUp'); // alternative ArrowUp

        // Consecutive views overlap, so skip messages already recorded.
        const visible = await readVisibleMessages(page);
        for (const { sender, message, stamp } of visible) {
            if (sender === '' && message === '') {
                continue;
            }
            const key = JSON.stringify([stamp, sender, message]);
            if (!seen.has(key)) {
                seen.add(key);
                rows.push({ sender, message });
            }
        }
        await page.screenshot({ path: DEBUG_DIR + '/channel_' + step.toString() + '.png' });

        // we define the top of the channel as the place where we see the title of the channel
        if (await page.$('span[data-qa="inline_channel_entity__name"]')) {
            console.log('we are at the top');
            reachedTop = true;
        }
    }
    return rows;
}

/**
 * Entry point: validate the configuration, scrape the channel and write the
 * CSV. Errors during scraping are logged; the browser is always closed.
 */
const main = async () => {
    const missing = findMissingSettings();
    if (!channel || missing.length > 0) {
        if (!channel) {
            console.log('usage: node ./lib/get_channel_messages.js <channel_id> [max_page_ups]');
        }
        if (missing.length > 0) {
            console.log('missing settings in .env: ' + missing.join(', '));
        }
        process.exitCode = 1;
        return;
    }

    // Both folders are git-ignored, so they may not exist on a fresh clone.
    fs.mkdirSync(DEBUG_DIR, { recursive: true });
    fs.mkdirSync(DATA_DIR, { recursive: true });

    let browser;
    try {
        browser = await browserObject.startBrowser();
        if (!browser) {
            // startBrowser already logged the reason for the failure
            return;
        }
        console.log('slack instance name: ' + slack_instance_name);

        const page = await browser.newPage();
        // close the pop-up asking us to open Slack app; Puppeteer reports
        // alert/confirm/prompt boxes through the single 'dialog' event
        page.on('dialog', async (dialog) => {
            try {
                await page.screenshot({ path: DEBUG_DIR + '/page_pop_up_dialog.png' });
                console.log(dialog.message());
                await dialog.dismiss();
            } catch (dialogErr) {
                console.log(dialogErr);
            }
        });
        page.setDefaultNavigationTimeout(60000); // Increase to 60 seconds

        await signIn(page);

        // head to the channel
        const channelUrl = 'https://app.slack.com/client/' + slack_instance_id + '/' + channel + '/';
        console.log('we are headed to: ' + channelUrl);
        await page.goto(channelUrl);
        console.log('we are in the channel, waiting for navigation');
        // NOTE(review): goto() already waits for a navigation; this extra wait
        // throws after 60 seconds unless a further one follows. Kept as is.
        await page.waitForNavigation();

        // check that we made it to the channel
        if (page.url().includes('login')) {
            console.log('we are not getting past the login page');
            await page.screenshot({ path: DEBUG_DIR + '/not_thru_login.png' });
            return;
        }
        console.log('we are in the channel, waiting for the content to load');

        // wait for all data to be loaded and ensure we don't timeout
        await page.waitForSelector('div[data-qa="slack_kit_scrollbar"]', { timeout: 6000000 });
        console.log('we are in the channel, we see the scrollbar');
        await page.screenshot({ path: DEBUG_DIR + '/channel.png' });

        console.log('scrolling to the top of the page');
        const rows = await collectMessages(page);
        await page.screenshot({ path: DEBUG_DIR + '/channel_top.png' });

        // Awaited so that a failed write is reported by the catch below.
        const filename = DATA_DIR + '/' + channel + '.csv';
        await fs.promises.writeFile(filename, buildCsv(rows));
        console.log("File written successfully\n");
    } catch (err) {
        console.log(err);
    } finally {
        // close the browser on every path where it was actually started
        if (browser) {
            await browser.close();
        }
    }
    console.log('I am done');
};

main()
    .catch((e) => console.log('err: ' + e));