├── .gitattributes ├── run_logger.sh ├── .gitignore ├── cronjob.sh ├── cronjob_clean.sh ├── make_website.sh ├── README.md ├── ipynb_filter.py ├── logger.py └── index.ipynb /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb filter=ipynb_filter 2 | -------------------------------------------------------------------------------- /run_logger.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python logger.py >> error.log 2>&1 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | .DS_Store 3 | .ipynb_checkpoints/ 4 | __pycache__/ 5 | *.log -------------------------------------------------------------------------------- /cronjob.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd /home/cluster_logger/cluster-logger-master 3 | ./run_logger.sh 4 | ./make_website.sh 5 | -------------------------------------------------------------------------------- /cronjob_clean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd /home/cluster_logger/cluster-logger-master 3 | python logger.py --clean_db >> error.log 2>&1 4 | -------------------------------------------------------------------------------- /make_website.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | jupyter nbconvert --to html --execute --ExecutePreprocessor.timeout=3000 --log-level WARN index.ipynb >> error.log 2>&1 3 | rsync -ravz index.html hpc05@tnw-tn1.tudelft.net: 4 | rsync -ravz database.p hpc05@tnw-tn1.tudelft.net: 5 | rm -f index.html 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Add this to `crontab -e` and make sure there is a newline at the top of the file. 2 | ``` 3 | */15 * * * * $HOME/Work/cluster_log/cronjob.sh 4 | 30 23 * * * $HOME/Work/cluster_log/cronjob_clean.sh 5 | ``` 6 | 7 | Also make sure that `${HOME}/Work/cluster_log/cronjob.sh` runs without issues before adding it to cron. 8 | --------------------------------------------------------------------------------
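The README above already advises verifying that `cronjob.sh` runs cleanly before adding it to cron. Below is a minimal sketch of such a one-off dry run (an illustration, not part of the repository), assuming the repository is checked out at the path hard-coded in `cronjob.sh`:

```bash
# One-off manual test of the logging/publishing chain (path as assumed in cronjob.sh).
cd /home/cluster_logger/cluster-logger-master
./run_logger.sh    # appends a qstat snapshot to database.p; errors go to error.log
./make_website.sh  # re-executes index.ipynb and uploads index.html and database.p
tail error.log     # should show no Python or nbconvert errors
```

Both scripts append their errors to `error.log`, so inspecting it after the run is the quickest way to spot problems before the 15-minute cron cycle takes over.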
/ipynb_filter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # `ipynb_filter.py`: 4 | # This is a git filter that strips out the outputs and 5 | # metadata of a Jupyter notebook using `nbconvert`. 6 | # Execute the following line in order to activate this filter: 7 | # python ipynb_filter.py 8 | # 9 | # The following line should be in `.gitattributes`: 10 | # *.ipynb filter=ipynb_filter 11 | 12 | from nbconvert.preprocessors import Preprocessor 13 | 14 | 15 | class RemoveMetadata(Preprocessor): 16 | def preprocess(self, nb, resources): 17 | nb.metadata = {"language_info": {"name":"python", 18 | "pygments_lexer": "ipython3"}} 19 | return nb, resources 20 | 21 | 22 | if __name__ == '__main__': 23 | # The filter is being activated: register it in the local git config. 24 | import os 25 | git_cmd = 'git config filter.ipynb_filter.clean "jupyter nbconvert --to notebook --config ipynb_filter.py --stdin --stdout"' 26 | os.system(git_cmd) 27 | else: 28 | # The script is being used as an nbconvert config file. 29 | c.Exporter.preprocessors = [RemoveMetadata] 30 | c.ClearOutputPreprocessor.enabled = True 31 | c.ClearOutputPreprocessor.remove_metadata_fields = [ 32 | "deletable", "editable", "collapsed", "scrolled"] 33 | --------------------------------------------------------------------------------
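The snippet below is a small illustration (not part of the repository) of activating and checking the filter above in a fresh clone; it only uses the `nbconvert` invocation already stored in `git_cmd` plus standard `git` commands:

```bash
# Register the clean filter in this clone's .git/config.
python ipynb_filter.py
# Confirm that the filter command was stored.
git config --get filter.ipynb_filter.clean
# Run the clean step by hand; the printed notebook JSON should have
# empty outputs and only the minimal metadata kept by RemoveMetadata.
jupyter nbconvert --to notebook --config ipynb_filter.py --stdin --stdout < index.ipynb | head -n 40
```

Because this is a `clean` filter, outputs are stripped only from what gets committed; the working copy of `index.ipynb` keeps its outputs, which is what `make_website.sh` executes and publishes.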
/logger.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | from collections import defaultdict 5 | from datetime import datetime, timedelta 6 | import gzip 7 | import os 8 | import pickle 9 | 10 | import hpc05 11 | from pytz import timezone 12 | 13 | 14 | tz = timezone('Europe/Amsterdam') # timezone in the Netherlands 15 | tz_offset = tz.utcoffset(datetime.now()).seconds // 3600 16 | now = datetime.now(tz) 17 | 18 | 19 | def get_qstat(): 20 | ssh = hpc05.ssh_utils.setup_ssh() 21 | stdin, stdout, stderr = ssh.exec_command('qstat -ea') 22 | out = stdout.readlines(), stderr.readlines() 23 | lines = out[0][5:] 24 | return lines 25 | 26 | 27 | def get_total_cores(): 28 | ssh = hpc05.ssh_utils.setup_ssh() 29 | stdin, stdout, stderr = ssh.exec_command('LOCALnodeload.pl') 30 | out, err = stdout.readlines(), stderr.readlines() 31 | lines = out[2:] 32 | return sum(int(line.split()[1]) for line in lines) 33 | 34 | 35 | def print_current_usage(): 36 | lines = get_qstat() 37 | processes = [process_line(line) for line in lines] 38 | processes = [p for p in processes if p is not None] # Filter out `None`s 39 | 40 | stat = defaultdict(int) 41 | for p in processes: 42 | stat[p['Username']] += p['num_cores'] 43 | 44 | class bcolors: 45 | HEADER = '\033[95m' 46 | OKBLUE = '\033[94m' 47 | OKGREEN = '\033[92m' 48 | WARNING = '\033[93m' 49 | FAIL = '\033[91m' 50 | ENDC = '\033[0m' 51 | BOLD = '\033[1m' 52 | UNDERLINE = '\033[4m' 53 | 54 | total_in_use = sum(v for v in stat.values()) 55 | 56 | total_cores = get_total_cores() 57 | 58 | free_cores = total_cores - total_in_use 59 | print(bcolors.WARNING + 'Total cores in use: {}, free: {}\n'.format( 60 | total_in_use, free_cores) + bcolors.ENDC) 61 | for user, num_cores in sorted(stat.items(), key=lambda x: -x[1]): 62 | print(bcolors.OKGREEN + '{} uses {} cores'.format(user, num_cores) + bcolors.ENDC) 63 | 64 | 65 | def parse_line(line): 66 | cols = ['Job ID', 'Username', 'Queue', 'Jobname', 'SessID', 'NDS', 'TSK', 67 | 'Required Memory', 'Required Time', 'S', 'Elapsed Time'] 68 | process_dict = {key: val for key, val in zip(cols, line.split())} 69 | return process_dict 70 | 71 | 72 | def get_num_processors(process): 73 | try: 74 | num_cores = int(process['TSK']) 75 | except ValueError: 76 | num_cores = 1 77 | return num_cores 78 | 79 | 80 | def set_elapsed_time(process): 81 | try: 82 | h, m, s = process['Elapsed Time'].split(':') 83 | if s == '': 84 | # Sometimes the Elapsed Time is reported as '13079:00:' 85 | s = 0 86 | total_seconds = 3600 * int(h) + 60 * int(m) + int(s) 87 | except Exception: 88 | total_seconds = 0 89 | return total_seconds 90 | 91 | 92 | def filter_dict(process): 93 | to_save = ['Job ID', 'Jobname', 'SessID', 'Username', 94 | 'current_time', 'num_cores', 'cpu_time'] 95 | filtered_process = {k: process[k] for k in to_save} 96 | return filtered_process 97 | 98 | 99 | def process_line(line): 100 | process = parse_line(line) 101 | if process['S'] == 'R': 102 | process['current_time'] = datetime.timestamp(now) 103 | process['num_cores'] = get_num_processors(process) 104 | process['cpu_time'] = set_elapsed_time(process) 105 | return filter_dict(process) 106 | 107 | 108 | def save_processes(processes, fname, append=True): 109 | mode = 'ab' if append else 'wb' 110 | with gzip.open(fname, mode) as pfile: 111 | for p in processes: 112 | pickle.dump(p, pfile) 113 | 114 | 115 | def load_processes(fname): 116 | processes = [] 117 | with gzip.open(fname, 'rb') as f: 118 | while True: 119 | try: 120 | process = pickle.load(f) 121 | except EOFError: 122 | break 123 | except pickle.UnpicklingError: 124 | # If an UnpicklingError happens, overwrite the database with the entries read so far. 125 | save_processes(processes, fname, append=False) 126 | break 127 | processes.append(process) 128 | return processes 129 | 130 | 131 | def older_than(process, days=30): 132 | time = date_from_process(process) 133 | return time < now - timedelta(days=days) 134 | 135 | 136 | def date_from_process(process): 137 | return datetime.fromtimestamp(process['current_time'], tz) 138 | 139 | 140 | def clean_database(database_fname, days=60): 141 | processes = load_processes(database_fname) 142 | to_archive = defaultdict(list) 143 | keep = [] 144 | for process in processes: 145 | date = date_from_process(process) 146 | if older_than(process, days=days): 147 | key = date.strftime("%Y-%m") 148 | to_archive[key].append(process) 149 | else: 150 | keep.append(process) 151 | 152 | for fname, processes in to_archive.items(): 153 | os.makedirs('archive', exist_ok=True) 154 | fname = 'archive/' + fname + '.p' 155 | append = os.path.isfile(fname) 156 | save_processes(processes, fname, append) 157 | 158 | save_processes(keep, database_fname, append=False) 159 | 160 | 161 | if __name__ == "__main__": 162 | parser = argparse.ArgumentParser(description='Log qstat or clean its database.') 163 | parser.add_argument('-f', '--fname', type=str, default='database.p') 164 | parser.add_argument('-c', '--clean_db', action='store_true') 165 | args = parser.parse_args() 166 | 167 | database_fname = args.fname 168 | 169 | if args.clean_db: 170 | clean_database(database_fname) 171 | 172 | else: 173 | lines = get_qstat() 174 | 175 | processes = [process_line(line) for line in lines] 176 | processes = [p for p in processes if p is not None] # Filter out `None`s 177 | 178 | save_processes(processes, database_fname, append=True) 179 | -------------------------------------------------------------------------------- /index.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Measuring and logging the CPU usage at the `hpc05`" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Takes a measurement **every 15 minutes** and then updates this website.\n", 15 | "\n", 16 | "Found a mistake or want to know something? 
Ask/e-mail Bas at [basnijholt@gmail.com](mailto:basnijholt@gmail.com) or see the complete code on [GitHub](https://github.com/basnijholt/cluster-logger).\n", 17 | "\n", 18 | "You can also find this `ipynb` [here](https://github.com/basnijholt/cluster-logger/blob/master/index.ipynb) and the data of the last 60 days [here](https://hpc05.quantumtinkerer.tudelft.nl/database.p).\n", 19 | "\n", 20 | "_You can see the code of this Jupyter Notebook by clicking on this button:_" 21 | ] 22 | }, 23 | { 24 | "cell_type": "raw", 25 | "metadata": {}, 26 | "source": [ 27 | "\n", 44 | "
\n", 45 | "\n", 46 | "" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "from logger import print_current_usage, load_processes, now, tz_offset\n", 58 | "import socket\n", 59 | "print('Last time this script ran is at {}, on {}'.format(now, socket.gethostname()))" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "# Current usage at the `hpc05`" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "print_current_usage()" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "import pandas as pd\n", 85 | "import matplotlib.pyplot as plt\n", 86 | "%matplotlib inline\n", 87 | "\n", 88 | "today = str(now.date())\n", 89 | "month = now.strftime(\"%B\")\n", 90 | "\n", 91 | "processes = load_processes('database.p')\n", 92 | "df = pd.DataFrame(processes)\n", 93 | "df.index = pd.to_datetime(df.current_time, unit='s')\n", 94 | "df.index = df.index.tz_localize('UTC').tz_convert('Europe/Amsterdam')\n", 95 | "\n", 96 | "gb = df.groupby('Job ID', as_index=False)\n", 97 | "df['cpu_time'] = gb['cpu_time'].transform(lambda x: x-x.min())\n", 98 | "df['reserved_time'] = gb['current_time'].transform(lambda x: x-x.min())\n", 99 | "df['reserved_cpu_time'] = df['reserved_time'] * df['num_cores']\n", 100 | "df['activity'] = df['cpu_time'] / df['reserved_cpu_time'] * 100\n", 101 | "lasts = gb.last()\n", 102 | "\n", 103 | "def get_user_df(lasts, only_today=False):\n", 104 | " lasts = lasts.copy()\n", 105 | " if only_today:\n", 106 | " # select only today\n", 107 | " lasts.index = pd.to_datetime(lasts.pop('current_time'), unit='s')\n", 108 | " lasts.index = lasts.index.tz_localize('UTC').tz_convert('Europe/Amsterdam')\n", 109 | " lasts = lasts.loc[today]\n", 110 | "\n", 111 | " by_user = lasts.groupby('Username')\n", 112 | " reserved_days = by_user.reserved_cpu_time.sum() / 86400\n", 113 | " cpu_days = by_user.cpu_time.sum() / 86400\n", 114 | " idle_days = reserved_days - cpu_days\n", 115 | " activity = cpu_days * 100 / reserved_days\n", 116 | " cols = ['CPU time (days)', 'Reserved time (days)',\n", 117 | " 'IDLE time (days)', 'Activity (%)']\n", 118 | " user_df = pd.DataFrame([cpu_days, reserved_days, idle_days, activity], \n", 119 | " index=cols).T\n", 120 | " return user_df" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "# Data of the last 60 days\n" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "user_df = get_user_df(lasts)\n", 137 | "user_df.sort_values('IDLE time (days)', ascending=False)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "ax = user_df.sort_values('Activity (%)').plot.bar(y=['Reserved time (days)', 'CPU time (days)'])\n", 147 | "ax.set_ylabel('CPU time in days')\n", 148 | "ax.set_title('CPU time used per user for the last 60 days');" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "ax = df.groupby(df.index.weekday_name, sort=False).cpu_time.sum().divide(86400 * 7 * 365).plot.bar()\n", 158 | "ax.set_xlabel('Weekday')\n", 159 | 
"ax.set_ylabel('CPU time in years')\n", 160 | "ax.set_title('CPU time per weekday in the last 60 days');" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "ax = df.groupby(df.index.hour + tz_offset, sort=False).cpu_time.sum().divide(86400 * 24).plot.bar()\n", 170 | "ax.set_ylabel('CPU time in days')\n", 171 | "ax.set_xlabel('Hour of the day')\n", 172 | "ax.set_title('CPU time per hour in the last 60 days');" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "# Only today" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "user_df_today = get_user_df(lasts, only_today=True)\n", 189 | "user_df_today.sort_values('IDLE time (days)', ascending=False)" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "ax = user_df_today.sort_values('Activity (%)').plot.bar(y=['Reserved time (days)', 'CPU time (days)'])\n", 199 | "ax.set_ylabel('CPU time in days')\n", 200 | "today = str(now.utcnow().date())\n", 201 | "ax.set_title('CPU time per user today ({})'.format(today));" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "# Ideas?\n", 209 | "* Showing usage per department\n", 210 | "* Average number of cores used per day" 211 | ] 212 | } 213 | ], 214 | "metadata": { 215 | "language_info": { 216 | "name": "python", 217 | "pygments_lexer": "ipython3" 218 | } 219 | }, 220 | "nbformat": 4, 221 | "nbformat_minor": 1 222 | } 223 | --------------------------------------------------------------------------------