├── .gitattributes ├── run_logger.sh ├── .gitignore ├── cronjob.sh ├── cronjob_clean.sh ├── make_website.sh ├── README.md ├── ipynb_filter.py ├── logger.py └── index.ipynb /.gitattributes: -------------------------------------------------------------------------------- 1 | *.ipynb filter=ipynb_filter 2 | -------------------------------------------------------------------------------- /run_logger.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python logger.py >> error.log 2>&1 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | .DS_Store 3 | .ipynb_checkpoints/ 4 | __pycache__/ 5 | *.log -------------------------------------------------------------------------------- /cronjob.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd /home/cluster_logger/cluster-logger-master 3 | ./run_logger.sh 4 | ./make_website.sh 5 | -------------------------------------------------------------------------------- /cronjob_clean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | cd /home/cluster_logger/cluster-logger-master 3 | python logger.py --clean_db >> error.log 2>&1 4 | -------------------------------------------------------------------------------- /make_website.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | jupyter nbconvert --to html --execute --ExecutePreprocessor.timeout=3000 --log-level WARN index.ipynb >> error.log 2>&1 3 | rsync -ravz index.html hpc05@tnw-tn1.tudelft.net: 4 | rsync -ravz database.p hpc05@tnw-tn1.tudelft.net: 5 | rm -f index.html 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Add this to `crontab -e` and make sure there is a newline at the top of the file. 2 | ``` 3 | */15 * * * * $HOME/Work/cluster_log/cronjob.sh 4 | 30 23 * * * $HOME/Work/cluster_log/cronjob_clean.sh 5 | ``` 6 | 7 | Also make sure that `${HOME}/Work/cluster_log/cronjob.sh` runs without issues before adding it to cron. 8 | --------------------------------------------------------------------------------
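The README above already advises verifying that `cronjob.sh` runs cleanly before adding it to cron. Below is a minimal sketch of such a one-off dry run (an illustration, not part of the repository), assuming the repository is checked out at the path hard-coded in `cronjob.sh`:

```bash
# One-off manual test of the logging/publishing chain (path as assumed in cronjob.sh).
cd /home/cluster_logger/cluster-logger-master
./run_logger.sh    # appends a qstat snapshot to database.p; errors go to error.log
./make_website.sh  # re-executes index.ipynb and uploads index.html and database.p
tail error.log     # should show no Python or nbconvert errors
```

Both scripts append their errors to `error.log`, so inspecting it after the run is the quickest way to spot problems before the 15-minute cron cycle takes over.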
/ipynb_filter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # `ipynb_filter.py`: 4 | # This is a git filter that strips out the outputs and 5 | # metadata of a Jupyter notebook using `nbconvert`. 6 | # Execute the following line in order to activate this filter: 7 | # python ipynb_filter.py 8 | # 9 | # The following line should be in `.gitattributes`: 10 | # *.ipynb filter=ipynb_filter 11 | 12 | from nbconvert.preprocessors import Preprocessor 13 | 14 | 15 | class RemoveMetadata(Preprocessor): 16 | def preprocess(self, nb, resources): 17 | nb.metadata = {"language_info": {"name":"python", 18 | "pygments_lexer": "ipython3"}} 19 | return nb, resources 20 | 21 | 22 | if __name__ == '__main__': 23 | # The filter is being activated: register it in the local git config. 24 | import os 25 | git_cmd = 'git config filter.ipynb_filter.clean "jupyter nbconvert --to notebook --config ipynb_filter.py --stdin --stdout"' 26 | os.system(git_cmd) 27 | else: 28 | # The script is being used as an nbconvert config file. 29 | c.Exporter.preprocessors = [RemoveMetadata] 30 | c.ClearOutputPreprocessor.enabled = True 31 | c.ClearOutputPreprocessor.remove_metadata_fields = [ 32 | "deletable", "editable", "collapsed", "scrolled"] 33 | --------------------------------------------------------------------------------
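The snippet below is a small illustration (not part of the repository) of activating and checking the filter above in a fresh clone; it only uses the `nbconvert` invocation already stored in `git_cmd` plus standard `git` commands:

```bash
# Register the clean filter in this clone's .git/config.
python ipynb_filter.py
# Confirm that the filter command was stored.
git config --get filter.ipynb_filter.clean
# Run the clean step by hand; the printed notebook JSON should have
# empty outputs and only the minimal metadata kept by RemoveMetadata.
jupyter nbconvert --to notebook --config ipynb_filter.py --stdin --stdout < index.ipynb | head -n 40
```

Because this is a `clean` filter, outputs are stripped only from what gets committed; the working copy of `index.ipynb` keeps its outputs, which is what `make_website.sh` executes and publishes.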
/logger.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | from collections import defaultdict 5 | from datetime import datetime, timedelta 6 | import gzip 7 | import os 8 | import pickle 9 | 10 | import hpc05 11 | from pytz import timezone 12 | 13 | 14 | tz = timezone('Europe/Amsterdam') # timezone in the Netherlands 15 | tz_offset = tz.utcoffset(datetime.now()).seconds // 3600 16 | now = datetime.now(tz) 17 | 18 | 19 | def get_qstat(): 20 | ssh = hpc05.ssh_utils.setup_ssh() 21 | stdin, stdout, stderr = ssh.exec_command('qstat -ea') 22 | out = stdout.readlines(), stderr.readlines() 23 | lines = out[0][5:] 24 | return lines 25 | 26 | 27 | def get_total_cores(): 28 | ssh = hpc05.ssh_utils.setup_ssh() 29 | stdin, stdout, stderr = ssh.exec_command('LOCALnodeload.pl') 30 | out, err = stdout.readlines(), stderr.readlines() 31 | lines = out[2:] 32 | return sum(int(line.split()[1]) for line in lines) 33 | 34 | 35 | def print_current_usage(): 36 | lines = get_qstat() 37 | processes = [process_line(line) for line in lines] 38 | processes = [p for p in processes if p is not None] # Filter out `None`s 39 | 40 | stat = defaultdict(int) 41 | for p in processes: 42 | stat[p['Username']] += p['num_cores'] 43 | 44 | class bcolors: 45 | HEADER = '\033[95m' 46 | OKBLUE = '\033[94m' 47 | OKGREEN = '\033[92m' 48 | WARNING = '\033[93m' 49 | FAIL = '\033[91m' 50 | ENDC = '\033[0m' 51 | BOLD = '\033[1m' 52 | UNDERLINE = '\033[4m' 53 | 54 | total_in_use = sum(v for v in stat.values()) 55 | 56 | total_cores = get_total_cores() 57 | 58 | free_cores = total_cores - total_in_use 59 | print(bcolors.WARNING + 'Total cores in use: {}, free: {}\n'.format( 60 | total_in_use, free_cores) + bcolors.ENDC) 61 | for user, num_cores in sorted(stat.items(), key=lambda x: -x[1]): 62 | print(bcolors.OKGREEN + '{} uses {} cores'.format(user, num_cores) + bcolors.ENDC) 63 | 64 | 65 | def parse_line(line): 66 | cols = ['Job ID', 'Username', 'Queue', 'Jobname', 'SessID', 'NDS', 'TSK', 67 | 'Required Memory', 'Required Time', 'S', 'Elapsed Time'] 68 | process_dict = {key: val for key, val in zip(cols, line.split())} 69 | return process_dict 70 | 71 | 72 | def get_num_processors(process): 73 | try: 74 | num_cores = int(process['TSK']) 75 | except ValueError: 76 | num_cores = 1 77 | return num_cores 78 | 79 | 80 | def set_elapsed_time(process): 81 | try: 82 | h, m, s = process['Elapsed Time'].split(':') 83 | if s == '': 84 | # Sometimes the Elapsed Time is reported as '13079:00:' 85 | s = 0 86 | total_seconds = 3600 * int(h) + 60 * int(m) + int(s) 87 | except Exception: 88 | total_seconds = 0 89 | return total_seconds 90 | 91 | 92 | def filter_dict(process): 93 | to_save = ['Job ID', 'Jobname', 'SessID', 'Username', 94 | 'current_time', 'num_cores', 'cpu_time'] 95 | filtered_process = {k: process[k] for k in to_save} 96 | return filtered_process 97 | 98 | 99 | def process_line(line): 100 | process = parse_line(line) 101 | if process['S'] == 'R': 102 | process['current_time'] = datetime.timestamp(now) 103 | process['num_cores'] = get_num_processors(process) 104 | process['cpu_time'] = set_elapsed_time(process) 105 | return filter_dict(process) 106 | 107 | 108 | def save_processes(processes, fname, append=True): 109 | mode = 'ab' if append else 'wb' 110 | with gzip.open(fname, mode) as pfile: 111 | for p in processes: 112 | pickle.dump(p, pfile) 113 | 114 | 115 | def load_processes(fname): 116 | processes = [] 117 | with gzip.open(fname, 'rb') as f: 118 | while True: 119 | try: 120 | process = pickle.load(f) 121 | except EOFError: 122 | break 123 | except pickle.UnpicklingError: 124 | # If an UnpicklingError happens, overwrite the database with the entries read so far. 125 | save_processes(processes, fname, append=False) 126 | break 127 | processes.append(process) 128 | return processes 129 | 130 | 131 | def older_than(process, days=30): 132 | time = date_from_process(process) 133 | return time < now - timedelta(days=days) 134 | 135 | 136 | def date_from_process(process): 137 | return datetime.fromtimestamp(process['current_time'], tz) 138 | 139 | 140 | def clean_database(database_fname, days=60): 141 | processes = load_processes(database_fname) 142 | to_archive = defaultdict(list) 143 | keep = [] 144 | for process in processes: 145 | date = date_from_process(process) 146 | if older_than(process, days=days): 147 | key = date.strftime("%Y-%m") 148 | to_archive[key].append(process) 149 | else: 150 | keep.append(process) 151 | 152 | for fname, processes in to_archive.items(): 153 | os.makedirs('archive', exist_ok=True) 154 | fname = 'archive/' + fname + '.p' 155 | append = os.path.isfile(fname) 156 | save_processes(processes, fname, append) 157 | 158 | save_processes(keep, database_fname, append=False) 159 | 160 | 161 | if __name__ == "__main__": 162 | parser = argparse.ArgumentParser(description='Log qstat or clean its database.') 163 | parser.add_argument('-f', '--fname', type=str, default='database.p') 164 | parser.add_argument('-c', '--clean_db', action='store_true') 165 | args = parser.parse_args() 166 | 167 | database_fname = args.fname 168 | 169 | if args.clean_db: 170 | clean_database(database_fname) 171 | 172 | else: 173 | lines = get_qstat() 174 | 175 | processes = [process_line(line) for line in lines] 176 | processes = [p for p in processes if p is not None] # Filter out `None`s 177 | 178 | save_processes(processes, database_fname, append=True) 179 | -------------------------------------------------------------------------------- /index.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Measuring and logging the CPU usage at the `hpc05`" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "Takes a measurement **every 15 minutes** and then updates this website.\n", 15 | "\n", 16 | "Found a mistake or want to know something? 
Ask/e-mail Bas at [basnijholt@gmail.com](mailto:basnijholt@gmail.com) or see the complete code on [GitHub](https://github.com/basnijholt/cluster-logger).\n", 17 | "\n", 18 | "You can also find this `ipynb` [here](https://github.com/basnijholt/cluster-logger/blob/master/index.ipynb) and the data of the last 60 days [here](https://hpc05.quantumtinkerer.tudelft.nl/database.p).\n", 19 | "\n", 20 | "_You can see the code of this Jupyter Notebook by clicking on this button:_" 21 | ] 22 | }, 23 | { 24 | "cell_type": "raw", 25 | "metadata": {}, 26 | "source": [ 27 | "\n", 44 | "
\n", 45 | "\n", 46 | "" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "from logger import print_current_usage, load_processes, now, tz_offset\n", 58 | "import socket\n", 59 | "print('Last time this script ran is at {}, on {}'.format(now, socket.gethostname()))" 60 | ] 61 | }, 62 | { 63 | "cell_type": "markdown", 64 | "metadata": {}, 65 | "source": [ 66 | "# Current usage at the `hpc05`" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "print_current_usage()" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "import pandas as pd\n", 85 | "import matplotlib.pyplot as plt\n", 86 | "%matplotlib inline\n", 87 | "\n", 88 | "today = str(now.date())\n", 89 | "month = now.strftime(\"%B\")\n", 90 | "\n", 91 | "processes = load_processes('database.p')\n", 92 | "df = pd.DataFrame(processes)\n", 93 | "df.index = pd.to_datetime(df.current_time, unit='s')\n", 94 | "df.index = df.index.tz_localize('UTC').tz_convert('Europe/Amsterdam')\n", 95 | "\n", 96 | "gb = df.groupby('Job ID', as_index=False)\n", 97 | "df['cpu_time'] = gb['cpu_time'].transform(lambda x: x-x.min())\n", 98 | "df['reserved_time'] = gb['current_time'].transform(lambda x: x-x.min())\n", 99 | "df['reserved_cpu_time'] = df['reserved_time'] * df['num_cores']\n", 100 | "df['activity'] = df['cpu_time'] / df['reserved_cpu_time'] * 100\n", 101 | "lasts = gb.last()\n", 102 | "\n", 103 | "def get_user_df(lasts, only_today=False):\n", 104 | " lasts = lasts.copy()\n", 105 | " if only_today:\n", 106 | " # select only today\n", 107 | " lasts.index = pd.to_datetime(lasts.pop('current_time'), unit='s')\n", 108 | " lasts.index = lasts.index.tz_localize('UTC').tz_convert('Europe/Amsterdam')\n", 109 | " lasts = lasts.loc[today]\n", 110 | "\n", 111 | " by_user = lasts.groupby('Username')\n", 112 | " reserved_days = by_user.reserved_cpu_time.sum() / 86400\n", 113 | " cpu_days = by_user.cpu_time.sum() / 86400\n", 114 | " idle_days = reserved_days - cpu_days\n", 115 | " activity = cpu_days * 100 / reserved_days\n", 116 | " cols = ['CPU time (days)', 'Reserved time (days)',\n", 117 | " 'IDLE time (days)', 'Activity (%)']\n", 118 | " user_df = pd.DataFrame([cpu_days, reserved_days, idle_days, activity], \n", 119 | " index=cols).T\n", 120 | " return user_df" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "# Data of the last 60 days\n" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "user_df = get_user_df(lasts)\n", 137 | "user_df.sort_values('IDLE time (days)', ascending=False)" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "ax = user_df.sort_values('Activity (%)').plot.bar(y=['Reserved time (days)', 'CPU time (days)'])\n", 147 | "ax.set_ylabel('CPU time in days')\n", 148 | "ax.set_title('CPU time used per user for the last 60 days');" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": null, 154 | "metadata": {}, 155 | "outputs": [], 156 | "source": [ 157 | "ax = df.groupby(df.index.weekday_name, sort=False).cpu_time.sum().divide(86400 * 7 * 365).plot.bar()\n", 158 | "ax.set_xlabel('Weekday')\n", 159 | 
"ax.set_ylabel('CPU time in years')\n", 160 | "ax.set_title('CPU time per weekday in the last 60 days');" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": null, 166 | "metadata": {}, 167 | "outputs": [], 168 | "source": [ 169 | "ax = df.groupby(df.index.hour + tz_offset, sort=False).cpu_time.sum().divide(86400 * 24).plot.bar()\n", 170 | "ax.set_ylabel('CPU time in days')\n", 171 | "ax.set_xlabel('Hour of the day')\n", 172 | "ax.set_title('CPU time per hour in the last 60 days');" 173 | ] 174 | }, 175 | { 176 | "cell_type": "markdown", 177 | "metadata": {}, 178 | "source": [ 179 | "# Only today" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "user_df_today = get_user_df(lasts, only_today=True)\n", 189 | "user_df_today.sort_values('IDLE time (days)', ascending=False)" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "ax = user_df_today.sort_values('Activity (%)').plot.bar(y=['Reserved time (days)', 'CPU time (days)'])\n", 199 | "ax.set_ylabel('CPU time in days')\n", 200 | "today = str(now.utcnow().date())\n", 201 | "ax.set_title('CPU time per user today ({})'.format(today));" 202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "# Ideas?\n", 209 | "* Showing usage per department\n", 210 | "* Average number of cores used per day" 211 | ] 212 | } 213 | ], 214 | "metadata": { 215 | "language_info": { 216 | "name": "python", 217 | "pygments_lexer": "ipython3" 218 | } 219 | }, 220 | "nbformat": 4, 221 | "nbformat_minor": 1 222 | } 223 | --------------------------------------------------------------------------------