├── output ├── users │ └── .gitkeep └── channels │ └── .gitkeep ├── .gitignore ├── json_utils.py ├── get_users.py ├── README.md ├── get_channels_info.py ├── slackscrape.py ├── get_channels_messages.py └── count_emojis.py /output/users/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /output/channels/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | env.json 3 | *.pyc 4 | output/channels/* 5 | !output/channels/.gitkeep 6 | output/users/* 7 | !output/users/.gitkeep 8 | -------------------------------------------------------------------------------- /json_utils.py: -------------------------------------------------------------------------------- 1 | import io 2 | import json 3 | from operator import itemgetter 4 | from datetime import datetime 5 | import time 6 | 7 | def load_json(path): 8 | with io.open(path, encoding='utf-8') as f: 9 | return json.loads(f.read()) 10 | 11 | def dump_json(path, data): 12 | with open(path, mode='w') as f: 13 | json.dump(data, f, indent = 2) 14 | -------------------------------------------------------------------------------- /get_users.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from json_utils import load_json, dump_json 3 | from slackclient import SlackClient 4 | import operator 5 | import os 6 | import argparse 7 | 8 | def ensure_dir(directory): 9 | if not os.path.exists(directory): 10 | os.makedirs(directory) 11 | 12 | return directory 13 | 14 | config = load_json('./env.json') 15 | 16 | if __name__ == '__main__': 17 | ap = argparse.ArgumentParser() 18 | ap.add_argument('-u', '--update', help = 'update channels', action="store_true") 19 | args = vars(ap.parse_args()) 20 | 21 | slack_args = { 22 | 'presence': 1 23 | } 24 | 25 | sc = SlackClient(config['token']) 26 | response = sc.api_call('users.list', **slack_args) 27 | users = response['members'] 28 | 29 | for user in users: 30 | user_name = user['name'] 31 | memb_path = ensure_dir('./output/users/members') 32 | user_path = '{}/{}.json'.format(memb_path, user_name) 33 | 34 | try: 35 | old_json = load_json(user_path) 36 | if not args['update']: 37 | print('Aready have user {}, skipping...'.format(user_name)) 38 | continue 39 | except Exception as e: 40 | old_json = {} 41 | print('No existing messages, starting from scratch...') 42 | 43 | print('ADDING ', user_name) 44 | 45 | dump_json(user_path, user) 46 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # slackscrape 2 | 3 | ## Usage 4 | 5 | Add your [slack token](https://api.slack.com/docs/oauth-test-tokens) credentials to `env.json` in project root 6 | 7 | eg: 8 | 9 | ```js 10 | { 11 | "token": "xxxxxx-XXXXXXXX-XXXXXXXX-xxxxxxxx" 12 | } 13 | ``` 14 | 15 | ## slackscrape 16 | 17 | `python slackscrape.py -c [channel id]` 18 | 19 | eg: `python slackscrape.py -c C193MSB9J 20 | 21 | will write channel messages to `general.json` in `output/channels/general/messages/` 22 | 23 | ## get channels messages 24 | 25 | `python get\_channels\_messages.py -u [optional update existing] -a [optional include archived]` 26 | 27 | eg: `python get\_channels\_messages.py -u 28 | 29 | Will get all channels messages and update with any new messages it finds and write to 30 | `output/channels//messages/.json` 31 | 32 | ## get channels info 33 | 34 | `python get\_channels\_info.py -u [optional update existing]` 35 | 36 | eg: `python get\_channels\_info.py -u 37 | 38 | Will get all channels metadata and write to 39 | `output/channels//info/.json` 40 | 41 | ## get users 42 | 43 | `python get\_users.py -u [optional update existing]` 44 | 45 | eg: `python get\_users.py -u 46 | 47 | Will get all users and write to `output/users/members/.json` 48 | 49 | ## count emojis 50 | 51 | `python count_emojis.py` 52 | 53 | Extracts all messages from all chanenls and does a team-wide breakdown of the top 50 most popular emojis & reactions. 54 | 55 | ## Dependencies 56 | 57 | ```bash 58 | pip install slackclient 59 | ``` 60 | -------------------------------------------------------------------------------- /get_channels_info.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from json_utils import load_json, dump_json 3 | from slackscrape import scrape_slack 4 | from slackclient import SlackClient 5 | import operator 6 | import argparse 7 | import os 8 | 9 | config = load_json('./env.json') 10 | 11 | def ensure_dir(directory): 12 | if not os.path.exists(directory): 13 | os.makedirs(directory) 14 | 15 | return directory 16 | 17 | if __name__ == '__main__': 18 | ap = argparse.ArgumentParser() 19 | ap.add_argument('-u', '--update', help = 'update channels', action="store_true") 20 | args = vars(ap.parse_args()) 21 | 22 | channel_args = { 23 | 'exclude_archived': 0, 24 | } 25 | 26 | sc = SlackClient(config['token']) 27 | response = sc.api_call('channels.list', **channel_args) 28 | channels = response['channels'] 29 | 30 | for idx, channel in enumerate(channels): 31 | chan_name = channel['name'] 32 | print('{} | {} - {} MEMBERS'.format(idx, chan_name, channel['num_members'])) 33 | 34 | chan_path = ensure_dir('./output/channels/{}'.format(chan_name)) 35 | info_path = ensure_dir('./output/channels/{}/info'.format(chan_name)) 36 | 37 | try: 38 | old_json = load_json('{}/{}.json'.format(info_path, chan_name)) 39 | if not args['update']: 40 | print('Already have channel {}, skipping ...'.format(chan_name)) 41 | continue 42 | except Exception as e: 43 | print('No existing channel {} info, fetching ...'.format(chan_name)) 44 | 45 | slack_args = { 46 | 'channel': channel['id'], 47 | } 48 | 49 | channel_info = sc.api_call('channels.info', **slack_args) 50 | try: 51 | dump_json('{}/{}.json'.format(info_path, chan_name), channel_info) 52 | except Exception as e: 53 | print('ERROR DUMPING {}'.format(chan_name)) 54 | print(e) 55 | -------------------------------------------------------------------------------- /slackscrape.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from json_utils import load_json, dump_json 3 | from slackclient import SlackClient 4 | import argparse 5 | 6 | def get_messages(sc, slack_args, messages, filter_func): 7 | history = sc.api_call("channels.history", **slack_args) 8 | last_ts = history['messages'][-1]['ts'] if history['has_more'] else False 9 | filtered = list(filter(filter_func, history['messages'])) 10 | all_messages = messages + filtered 11 | print('Fetched {} messages. {} Total now.'.format(len(filtered), len(all_messages))) 12 | 13 | return { 14 | 'messages': all_messages, 15 | 'last_ts': last_ts, 16 | } 17 | 18 | def scrape_slack(token, slack_args, filter_func = lambda x: x): 19 | sc = SlackClient(token) 20 | results = get_messages(sc, slack_args, [], filter_func) 21 | 22 | while results['last_ts']: 23 | slack_args['latest'] = results['last_ts'] 24 | results = get_messages(sc, slack_args, results['messages'], filter_func) 25 | 26 | print('Done fetching messages. Found {} in total.'.format(len(results['messages']))) 27 | return results['messages'] 28 | 29 | if __name__ == '__main__': 30 | config = load_json('./env.json') 31 | 32 | ap = argparse.ArgumentParser() 33 | ap.add_argument('-c', '--channel', help = 'channel id to scrape') 34 | ap.add_argument('-o', '--output', help = 'file to save out') 35 | args = vars(ap.parse_args()) 36 | channel = args['channel'] 37 | output = args['output'] 38 | 39 | try: 40 | old_json = load_json(output) 41 | except Exception as e: 42 | old_json = [] 43 | print('No existing messages, starting from scratch...') 44 | 45 | slack_args = { 46 | 'channel': config['channel_id'], 47 | 'oldest': old_json[0]['ts'] if len(old_json) else '', 48 | } 49 | 50 | new_messages = scrape_slack(config['token'], slack_args) 51 | 52 | if len(new_messages): 53 | all_messages = new_messages + old_json 54 | dump_json(output, all_messages) 55 | -------------------------------------------------------------------------------- /get_channels_messages.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from json_utils import load_json, dump_json 3 | from slackscrape import scrape_slack 4 | from slackclient import SlackClient 5 | import argparse 6 | import operator 7 | import os 8 | 9 | config = load_json('./env.json') 10 | 11 | def ensure_dir(directory): 12 | if not os.path.exists(directory): 13 | os.makedirs(directory) 14 | 15 | return directory 16 | 17 | if __name__ == '__main__': 18 | ap = argparse.ArgumentParser() 19 | ap.add_argument('-a', '--archived', help = 'include archived channels', action="store_true") 20 | ap.add_argument('-u', '--update', help = 'update channels', action="store_true") 21 | args = vars(ap.parse_args()) 22 | 23 | slack_args = { 24 | 'exclude_archived': 0 if args['archived'] else 1, 25 | } 26 | 27 | sc = SlackClient(config['token']) 28 | response = sc.api_call('channels.list', **slack_args) 29 | channels = response['channels'] 30 | 31 | sorted_channels = sorted(channels, key=lambda x: x['num_members'], reverse=True) 32 | 33 | for idx, channel in enumerate(sorted_channels): 34 | chan_name = channel['name'] 35 | print('{} | {} - {} MEMBERS'.format(idx, chan_name, channel['num_members'])) 36 | chan_path = ensure_dir('./output/channels/{}'.format(chan_name)) 37 | msg_path = ensure_dir('./output/channels/{}/messages'.format(chan_name)) 38 | output = './output/channels/{}/messages/{}.json'.format(chan_name, chan_name) 39 | 40 | try: 41 | old_json = load_json(output) 42 | if not args['update']: 43 | print('Aready have messages, skipping...') 44 | continue 45 | except Exception as e: 46 | old_json = [] 47 | print('No existing messages, starting from scratch...') 48 | 49 | slack_args = { 50 | 'channel': channel['id'], 51 | 'oldest': old_json[0]['ts'] if len(old_json) else '', 52 | } 53 | 54 | new_messages = scrape_slack(config['token'], slack_args) 55 | 56 | if len(new_messages): 57 | all_messages = new_messages + old_json 58 | dump_json(output, all_messages) 59 | -------------------------------------------------------------------------------- /count_emojis.py: -------------------------------------------------------------------------------- 1 | import re 2 | from json_utils import load_json 3 | import operator 4 | import os 5 | import pprint 6 | from sets import Set 7 | from itertools import islice 8 | 9 | def take(n, iterable): 10 | return list(islice(iterable, n)) 11 | 12 | allowed_chars = Set('0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_') 13 | 14 | master_count = {} 15 | 16 | def add_hit(channel, hit_type, hit): 17 | if not master_count.get(channel): 18 | master_count[channel] = { 19 | 'emojis': {}, 20 | 'emojis_reactions': {}, 21 | } 22 | 23 | if not master_count.get(channel).get(hit_type).get(hit): 24 | master_count[channel][hit_type][hit] = 0 25 | 26 | master_count[channel][hit_type][hit] += 1 27 | 28 | 29 | def filter_emojis(text): 30 | return Set(text.replace(':', '')).issubset(allowed_chars) and len(text) > 2 31 | 32 | for idx, dump in enumerate(os.listdir('./output/channels')): 33 | try: 34 | messages = load_json('./output/channels/{}/messages/{}.json'.format(dump, dump)) 35 | print('ANALYZING {}'.format(dump)) 36 | except Exception as e: 37 | print('ERROR GETTING MESSSAGES IN {}'.format(dump)) 38 | print(e) 39 | continue 40 | 41 | for message in messages: 42 | emojis = [] 43 | emojis_reactions = [] 44 | 45 | if message.get('text'): 46 | text_emojis = re.findall('\:.*?\:', message['text']) 47 | text_emojis = list(filter(lambda x: filter_emojis(x), text_emojis)) 48 | if len(text_emojis): 49 | text_emojis = list(map(lambda x: x.replace(':', ''), text_emojis)) 50 | # print('TEXT EMOJIS =>') 51 | # print(text_emojis) 52 | emojis += text_emojis 53 | 54 | if message.get('reactions'): 55 | for reaction in message.get('reactions'): 56 | emojis_reactions.append(reaction['name']) 57 | 58 | # print('REACTION EMOJIS =>') 59 | # print(emojis_reactions) 60 | 61 | for emoji in emojis: 62 | add_hit(dump.replace('.json', ''), 'emojis', emoji) 63 | 64 | for reaction in emojis_reactions: 65 | add_hit(dump.replace('.json', ''), 'emojis_reactions', reaction) 66 | 67 | 68 | master_emojis_count = { 69 | 'emojis': {}, 70 | 'emojis_reactions': {}, 71 | } 72 | 73 | def add_emoji_hit(hit_type, hit, total): 74 | if not master_emojis_count.get(hit_type).get(hit): 75 | master_emojis_count[hit_type][hit] = 0 76 | 77 | master_emojis_count[hit_type][hit] += total 78 | 79 | for channel in master_count: 80 | for emoji in master_count[channel]['emojis']: 81 | add_emoji_hit('emojis', emoji, master_count[channel]['emojis'][emoji]) 82 | for reaction in master_count[channel]['emojis_reactions']: 83 | add_emoji_hit('emojis_reactions', reaction, master_count[channel]['emojis_reactions'][reaction]) 84 | 85 | 86 | sorted_emojis = sorted(master_emojis_count['emojis'].items(), key=operator.itemgetter(1), reverse=True) 87 | sorted_reactions = sorted(master_emojis_count['emojis_reactions'].items(), key=operator.itemgetter(1), reverse=True) 88 | 89 | pp = pprint.PrettyPrinter(indent=2) 90 | 91 | print('TOP EMOJIS =>') 92 | pp.pprint(sorted_emojis[:50]) 93 | print('') 94 | print('TOP REACTIONS =>') 95 | pp.pprint(sorted_reactions[:50]) 96 | --------------------------------------------------------------------------------