├── requirements.txt ├── example_files ├── Screenshot HPC 2024-08-20.png ├── example_sacctOutput_raw.txt ├── example_sacctOutput_raw_asDF.tsv └── example_output_workloadManager.tsv ├── data ├── fixed_parameters.yaml └── cluster_info.yaml ├── myCarbonFootprint.sh ├── backend ├── helpers.py ├── __init__.py └── slurm_extract.py ├── frontend ├── __init__.py ├── helpers.py ├── templates │ ├── _user.html │ ├── report_blank.html │ └── styles.css ├── terminal_output.py └── dashboard_output.py ├── .gitignore ├── __init__.py └── README.md /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.24 2 | pandas==2.0 3 | PyYAML==6.0 4 | jinja2==3.1 5 | plotly==5.18 -------------------------------------------------------------------------------- /example_files/Screenshot HPC 2024-08-20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/GreenAlgorithms/GreenAlgorithms4HPC/HEAD/example_files/Screenshot HPC 2024-08-20.png -------------------------------------------------------------------------------- /data/fixed_parameters.yaml: -------------------------------------------------------------------------------- 1 | 2 | ## ~~~ DO NOT EDIT ~~~ 3 | ## 4 | ## These are fixed values, from the Green Algorithms app 5 | ## Hello World 6 | 7 | --- 8 | power_memory_perGB: 0.3725 # W/GB 9 | tree_month: 917 #gCO2e 10 | passengerCar_EU_perkm: 175 #gCO2e/km 11 | passengerCar_US_perkm: 251 #gCO2e/km 12 | flight_NY_SF: 570000 #gCO2e 13 | flight_PAR_LON: 50000 #gCO2e 14 | flight_NYC_MEL: 2310000 #gCO2e 15 | electricity_cost: 0.34 # GBP/kWh (source?) -------------------------------------------------------------------------------- /example_files/example_sacctOutput_raw.txt: -------------------------------------------------------------------------------- 1 | JobID|JobName|Submit|Elapsed|Partition|NNodes|NCPUS|TotalCPU|CPUTime|ReqMem|MaxRSS|WorkDir|State|Account|AllocTRES 2 | 556141|myJobName|2022-02-11T19:11:21|04:00:25|myPartition|1|1|00:00:00|04:00:25|6760Mc||/job/path|TIMEOUT|myAccount|billing=1,cpu=1,mem=6760M,node=1 3 | 552375|myJobName|2022-02-12T13:55:33|03:00:30|myPartition|1|32|00:00:00|4-00:16:00|250Gn||/job/path|TIMEOUT|myAccount|billing=32,cpu=32,gres/gpu=1,mem=250G,node=1 4 | -------------------------------------------------------------------------------- /example_files/example_sacctOutput_raw_asDF.tsv: -------------------------------------------------------------------------------- 1 | JobID JobName Submit Elapsed Partition NNodes NCPUS TotalCPU CPUTime ReqMem MaxRSS WorkDir State Account AllocTRES 2 | 556141 myJobName 2022-02-12T13:55:33 03:00:30 myPartition 1 32 00:00:00 4-00:16:00 250Gn /job/path TIMEOUT myAccount billing=32,cpu=32,gres/gpu=1,mem=250G,node=1 3 | 552375 myJobName 2022-02-12T14:04:01 00:39:16 myPartition 1 32 00:00:00 20:56:32 250Gn /job/path COMPLETED myAccount billing=32,cpu=32,gres/gpu=1,mem=250G,node=1 4 | -------------------------------------------------------------------------------- /example_files/example_output_workloadManager.tsv: -------------------------------------------------------------------------------- 1 | single_jobID TotalCPUtime_ CPUwallclocktime_ WallclockTimeX ReqMemX UsedMem_ NCPUS_ NGPUS_ NNodes_ PartitionX JobName_ SubmitDatetimeX WorkingDir_ StateX Account_ UsedMem2_ PartitionTypeX TotalCPUtime2useX TotalGPUtime2useX CoreHoursChargedX NeededMemX memOverallocationFactorX parentJobID 2 | 1 27879 0 days 00:00:00.508000 0 days 03:15:45 0 days 00:13:03 
102.6 0.003016 15 0 1 myPartition myName 2022-09-14 18:21:21 /job/path 0 myAccount 0.003016 CPU 0 days 00:00:00.508000 0 days 00:00:00 3.2625 6.0 17.099999999999998 2787379 3 | 2 27060 0 days 00:00:12.499000 0 days 11:12:30 0 days 00:44:50 102.6 0.347312 15 0 1 myPartition myName 2022-09-14 18:38:58 /job/path 0 myAccount 0.347312 CPU 0 days 00:00:12.499000 0 days 00:00:00 11.208333333333334 6.0 17.099999999999998 2788060 4 | -------------------------------------------------------------------------------- /myCarbonFootprint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ## ~~~ TO BE EDITED TO BE TAILORED TO THE CLUSTER ~~~ 4 | ## 5 | ## You only need to edit the module loading line (l.13), make sure you are loading python 3.7 or greater. 6 | ## 7 | 8 | # store the cwd in case we need to filter on it 9 | userCWD="$(pwd)" 10 | 11 | # Cd into the directory where the GA files are located 12 | parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P ) 13 | cd "$parent_path" 14 | 15 | # Test if the virtualenv GA_env already exists, and if not, creates it. Download python 3.8 or higher for better results. 16 | if [ ! -f GA_env/bin/activate ]; then 17 | echo "Need to create virtualenv" 18 | /usr/bin/python3.8 -m venv GA_env # this line needs updating to load python on your server 19 | source GA_env/bin/activate 20 | pip3 install -r requirements.txt 21 | else 22 | echo "Virtualenv: OK" 23 | source GA_env/bin/activate 24 | fi 25 | 26 | # Test if the python version is at least 3.8 27 | version_major=$(python -c 'import sys; print(sys.version_info[0])') 28 | version_minor=$(python -c 'import sys; print(sys.version_info[1])') 29 | if (( $version_major < 3 )); then 30 | echo "The command python needs to refer to python 3" 31 | exit 1 32 | fi 33 | 34 | if (( $version_minor < 8 )); then 35 | echo "The command python needs to refer to python3.8 or higher." 36 | exit 1 37 | fi 38 | echo "Python versions: OK" 39 | 40 | 41 | # Run the python code and pass on the arguments 42 | #userCWD="/home/ll582/ with space" # DEBUGONLY 43 | python __init__.py "$@" --userCWD "$userCWD" 44 | -------------------------------------------------------------------------------- /data/cluster_info.yaml: -------------------------------------------------------------------------------- 1 | ## 2 | ## ~~~ TO BE EDITED TO BE TAILORED TO THE CLUSTER ~~~ 3 | ## Fill in the values for your cluster: all the variables in <> need to be changed 4 | ## 5 | --- 6 | institution: "" # [str] 7 | cluster_name: "" # [str] 8 | granularity_memory_request: <6> # [number] in GB representing the smallest memory unit users can reserve 9 | partitions: # a list of the different partitions on the cluster 10 | : # name of the partition 11 | type: # [CPU or GPU] 12 | model: "" # [str] the model of the processing core on this partition. Not actually used by the code but useful for reference for others. 13 | TDP: <10> # [number] TDP of the processor, in W, per core 14 | : # name of the partition 15 | type: # [CPU or GPU] 16 | model: "" # [str] the model of the processing core on this partition. Not actually used by the code but useful for reference for others. 17 | TDP: <250> # [number] For GPUs, the TDP is for the entire GPU 18 | # For GPU partitions, we also need info about the CPUs available for support. 19 | model_CPU: "" # [str] Not actually used by the code but useful for reference for others. 
20 | TDP_CPU: <10> # [number] TDP of the processor, in W, per core 21 | # You can keep adding partitions to this 22 | PUE: <1.67> # [number > 1] Power Usage Effectiveness of the facility 23 | CI: <467> # [number] average carbon intensity of the geographic location, in gCO2e/kWh 24 | energy_cost: 25 | cost: <0.34> # [number] in currency/kWh 26 | currency: "<£>" # [str] 27 | # 28 | # Below are optional parameters if the html output is used. 29 | # HTML tags can be used 30 | # 31 | texts_intro: 32 | CPU: "XX - XX W/core (see here for models)" # For example 33 | GPU: "NVIDIA A100 (300 W) and NVIDIA Tesla P100 (250 W)" # For example 34 | # 35 | # Below are optional parameters to accommodate some clusters. Do not remove but can be ignored. 36 | # 37 | default_unit_RSS: 'K' 38 | -------------------------------------------------------------------------------- /backend/helpers.py: -------------------------------------------------------------------------------- 1 | 2 | import datetime 3 | import sys 4 | import random 5 | import pandas as pd 6 | import numpy as np 7 | 8 | def check_empty_results(df, args): 9 | """ 10 | This is to check whether any jobs have been run on the period, and stop the script if not. 11 | :param df: [pd.DataFrame] Usage logs 12 | :param args: 13 | """ 14 | if len(df) == 0: 15 | if args.filterWD is not None: 16 | addThat = f' from this directory ({args.filterWD})' 17 | else: 18 | addThat = '' 19 | if args.filterJobIDs != 'all': 20 | addThat += ' and with these jobIDs' 21 | if args.filterAccount is not None: 22 | addThat += ' charged under this account' 23 | 24 | print(f''' 25 | 26 | You haven't run any jobs on that period (from {args.startDay} to {args.endDay}){addThat}. 27 | 28 | ''') 29 | sys.exit() 30 | 31 | def simulate_mock_jobs(): # DEBUGONLY 32 | df_list = [] 33 | n_jobs = random.randint(500,800) 34 | foo = { 35 | 'WallclockTimeX':[datetime.timedelta(minutes=random.randint(50,700)) for _ in range(n_jobs)], 36 | 'ReqMemX':np.random.randint(4,130, size=n_jobs)*1., 37 | 'PartitionX':['icelake']*n_jobs, 38 | 'SubmitDatetimeX':[datetime.datetime(day=1,month=5,year=2023) + datetime.timedelta(days=random.randint(1,60)) for _ in range(n_jobs)], 39 | 'StateX':np.random.choice([1,0], p=[.8,.2], size=n_jobs), 40 | 'UIDX':['11111']*n_jobs, 41 | 'UserX':['foo']*n_jobs, 42 | 'PartitionTypeX':['CPU']*n_jobs, 43 | 'TotalCPUtime2useX':[datetime.timedelta(minutes=random.randint(50,5000)) for _ in range(n_jobs)], 44 | 'TotalGPUtime2useX':[datetime.timedelta(seconds=0)]*n_jobs, 45 | } 46 | 47 | foo_df = pd.DataFrame(foo) 48 | foo_df['CPUhoursChargedX'] = foo_df.TotalCPUtime2useX / np.timedelta64(1, 'h') 49 | foo_df['GPUhoursChargedX'] = 0. 
50 | foo_df['NeededMemX'] = foo_df.ReqMemX * np.random.random(n_jobs) 51 | foo_df['memOverallocationFactorX'] = foo_df.ReqMemX / foo_df.NeededMemX 52 | 53 | df_list.append(foo_df) 54 | return pd.concat(df_list) -------------------------------------------------------------------------------- /frontend/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | import yaml 3 | import os 4 | 5 | from frontend.terminal_output import generate_terminal_view 6 | from frontend.dashboard_output import dashboard_html 7 | 8 | def main_frontend(dict_stats, args): 9 | ### Load cluster specific info 10 | with open(os.path.join(args.path_infrastucture_info, 'cluster_info.yaml'), "r") as stream: 11 | try: 12 | cluster_info = yaml.safe_load(stream) 13 | except yaml.YAMLError as exc: 14 | print(exc) 15 | 16 | if args.output == 'terminal': 17 | print("Generating terminal view... ", end="") 18 | terminal_view = generate_terminal_view(dict_stats, args, cluster_info) 19 | print("Done\n") 20 | print(terminal_view) 21 | elif args.output == 'html': 22 | print("Generating html... ", end="") 23 | dashboard = dashboard_html( 24 | dict_stats=dict_stats, 25 | args=args, 26 | cluster_info=cluster_info, 27 | ) 28 | report_path = dashboard.generate() 29 | print(f"done: {report_path}") 30 | 31 | else: 32 | raise ValueError("Wrong output format") 33 | 34 | 35 | if __name__ == "__main__": 36 | 37 | #### This is used for testing only #### 38 | 39 | from collections import namedtuple 40 | from backend import main_backend 41 | 42 | argStruct = namedtuple('argStruct', 43 | 'startDay endDay use_mock_agg_data user output useCustomLogs customSuccessStates filterWD filterJobIDs filterAccount reportBug reportBugHere path_infrastucture_info') 44 | args = argStruct( 45 | startDay='2022-01-01', 46 | endDay='2023-06-30', 47 | use_mock_agg_data=True, 48 | user='ll582', 49 | output='html', 50 | useCustomLogs=None, 51 | customSuccessStates='', 52 | filterWD=None, 53 | filterJobIDs='all', 54 | filterAccount=None, 55 | reportBug=False, 56 | reportBugHere=False, 57 | path_infrastucture_info="clustersData/CSD3", 58 | ) 59 | with open(os.path.join(args.path_infrastucture_info, 'cluster_info.yaml'), "r") as stream: 60 | try: 61 | cluster_info = yaml.safe_load(stream) 62 | except yaml.YAMLError as exc: 63 | print(exc) 64 | 65 | extracted_data = main_backend(args) 66 | 67 | # generate_dashboard_html(dict_stats=extracted_data, args=args, cluster_info=cluster_info, dict_deptGroupsUsers=dict_deptGroupsUsers, dict_users=dict_users) 68 | 69 | main_frontend(dict_stats=extracted_data,args=args) -------------------------------------------------------------------------------- /frontend/helpers.py: -------------------------------------------------------------------------------- 1 | def formatText_footprint(footprint_g, use_html=False): 2 | ''' 3 | Format the text to display the carbon footprint 4 | :param footprint_g: [float] carbon footprint, in gCO2e 5 | :return: [str] the text to display 6 | ''' 7 | if use_html: 8 | co2e = "CO<sub>2</sub>e"  # subscripted variant for the html report 9 | else: 10 | co2e = "CO2e" 11 | if footprint_g < 1e3: 12 | text_footprint = f"{footprint_g:,.0f} g{co2e}" 13 | elif footprint_g < 1e6: 14 | text_footprint = f"{footprint_g / 1e3:,.0f} kg{co2e}" 15 | else: 16 | text_footprint = f"{footprint_g / 1e6:,.0f} T{co2e}"  # tonnes: 1 T = 1e6 g 17 | return text_footprint 18 | 19 | def formatText_treemonths(tm_float, splitMonthsYear=True): 20 | ''' 21 | Format the text to display the tree months 22 | :param tm_float: [float] tree-months 23 | :return: [str] the text to
display 24 | ''' 25 | tm = int(tm_float) 26 | ty = int(tm / 12) 27 | if tm < 1: 28 | text_trees = f"{tm_float:.3f} tree-months" 29 | elif tm == 1: 30 | text_trees = f"{tm_float:.1f} tree-month" 31 | elif tm < 6: 32 | text_trees = f"{tm_float:.1f} tree-months" 33 | elif tm <= 24: 34 | text_trees = f"{tm} tree-months" 35 | elif tm < 120: 36 | if splitMonthsYear: 37 | text_trees = f"{ty} tree-years and {tm - ty * 12} tree-months" 38 | else: 39 | text_trees = f"{ty} tree-years" 40 | else: 41 | text_trees = f"{tm_float/12:.1f} tree-years" 42 | return text_trees 43 | 44 | def formatText_flying(dict_stats, output_format='single_str'): 45 | """ 46 | Format the text to display about flying 47 | :param dict_stats: 48 | :param output_format: 49 | :return: [str] or [(float,str)] text to display 50 | """ 51 | if output_format not in ['single_str', 'dict']: 52 | raise ValueError() 53 | 54 | if dict_stats['flying_NY_SF'] < 0.5: 55 | value = round(dict_stats['flying_PAR_LON'], 2) 56 | if output_format == 'single_str': 57 | output_flying = f"{value:,} flights between Paris and London" 58 | else: 59 | output_flying = {'number': value, 'trip': 'Paris - London'} 60 | elif dict_stats['flying_NYC_MEL'] < 0.5: 61 | value = round(dict_stats['flying_NY_SF'], 2) 62 | if output_format == 'single_str': 63 | output_flying = f"{value:,} flights between New York and San Francisco" 64 | else: 65 | output_flying = {'number': value, 'trip': 'New York - San Francisco'} 66 | else: 67 | value = round(dict_stats['flying_NYC_MEL'], 2) 68 | if output_format == 'single_str': 69 | output_flying = f"{value:,} flights between New York and Melbourne" 70 | else: 71 | output_flying = {'number': value, 'trip': 'New York - Melbourne'} 72 | return output_flying -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Project specific 2 | .idea/ 3 | clustersData/ 4 | testData/ 5 | error_logs_archived/ 6 | support_files/ 7 | frontend/templates/plotly* 8 | outputs/* 9 | example_outputs/ 10 | 11 | # Byte-compiled / optimized / DLL files 12 | __pycache__/ 13 | *.py[cod] 14 | *$py.class 15 | 16 | # C extensions 17 | *.so 18 | 19 | # Distribution / packaging 20 | .Python 21 | build/ 22 | develop-eggs/ 23 | dist/ 24 | downloads/ 25 | eggs/ 26 | .eggs/ 27 | lib/ 28 | lib64/ 29 | parts/ 30 | sdist/ 31 | var/ 32 | wheels/ 33 | share/python-wheels/ 34 | *.egg-info/ 35 | .installed.cfg 36 | *.egg 37 | MANIFEST 38 | 39 | # PyInstaller 40 | # Usually these files are written by a python script from a template 41 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
42 | *.manifest 43 | *.spec 44 | 45 | # Installer logs 46 | pip-log.txt 47 | pip-delete-this-directory.txt 48 | 49 | # Unit test / coverage reports 50 | htmlcov/ 51 | .tox/ 52 | .nox/ 53 | .coverage 54 | .coverage.* 55 | .cache 56 | nosetests.xml 57 | coverage.xml 58 | *.cover 59 | *.py,cover 60 | .hypothesis/ 61 | .pytest_cache/ 62 | cover/ 63 | 64 | # Translations 65 | *.mo 66 | *.pot 67 | 68 | # Django stuff: 69 | *.log 70 | local_settings.py 71 | db.sqlite3 72 | db.sqlite3-journal 73 | 74 | # Flask stuff: 75 | instance/ 76 | .webassets-cache 77 | 78 | # Scrapy stuff: 79 | .scrapy 80 | 81 | # Sphinx documentation 82 | docs/_build/ 83 | 84 | # PyBuilder 85 | .pybuilder/ 86 | target/ 87 | 88 | # Jupyter Notebook 89 | .ipynb_checkpoints 90 | 91 | # IPython 92 | profile_default/ 93 | ipython_config.py 94 | 95 | # pyenv 96 | # For a library or package, you might want to ignore these files since the code is 97 | # intended to run in multiple environments; otherwise, check them in: 98 | # .python-version 99 | 100 | # pipenv 101 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 102 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 103 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 104 | # install all needed dependencies. 105 | #Pipfile.lock 106 | 107 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 108 | __pypackages__/ 109 | 110 | # Celery stuff 111 | celerybeat-schedule 112 | celerybeat.pid 113 | 114 | # SageMath parsed files 115 | *.sage.py 116 | 117 | # Environments 118 | .env 119 | .venv 120 | env/ 121 | venv/ 122 | ENV/ 123 | env.bak/ 124 | venv.bak/ 125 | 126 | # Spyder project settings 127 | .spyderproject 128 | .spyproject 129 | 130 | # Rope project settings 131 | .ropeproject 132 | 133 | # mkdocs documentation 134 | /site 135 | 136 | # mypy 137 | .mypy_cache/ 138 | .dmypy.json 139 | dmypy.json 140 | 141 | # Pyre type checker 142 | .pyre/ 143 | 144 | # pytype static type analyzer 145 | .pytype/ 146 | 147 | # Cython debug symbols 148 | cython_debug/ -------------------------------------------------------------------------------- /frontend/templates/_user.html: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 |
5 |

User's personal report: {{ user.userID }}

6 | 7 |

8 | Find out your carbon footprint from {{ startDay }} to {{ endDay }}. 9 |

10 |
11 | 12 |
13 |
14 |
15 | 16 | CPU time 17 | {{ usersActivity[user.userID].cpuTime }} 18 |
19 |
20 | 21 | Carbon footprint 22 | {{ usersActivity[user.userID].carbonFootprint }} 23 |
24 |
25 | 26 | {{ usersActivity[user.userID].flying.trip }} 27 | {{ usersActivity[user.userID].flying.number }} 28 |
29 |
30 | 31 | Carbon sequestration 32 | {{ usersActivity[user.userID].trees }} 33 |
34 |
35 |
36 | 37 |
38 |
39 | {% include 'plotly_thisuserDailyCarbonFootprint.html' %} 40 |
41 |
42 | {% include 'plotly_thisuserDailyNjobs.html' %} 43 |
44 |
45 | {% include 'plotly_thisuserDailyCpuTime.html' %} 46 |
47 |
48 | {% include 'plotly_thisuserDailyMemoryRequested.html' %} 49 |
50 | 51 |
52 |
Failed jobs
53 |

54 | Because any resource spent on a job is wasted if the job fails, it 55 | is important to test scripts and pipelines on small datasets. 56 | The chart below shows the daily success rate of {{ usersActivity[user.userID].n_jobs }} 57 | jobs that completed in the period. 58 | 59 | Failed jobs represent {{ usersActivity[user.userID].carbonFootprint_failedJobs }} and 60 | a cost of {{ usersActivity[user.userID].cost_failedJobs }}. 61 | They are responsible for {{ usersActivity[user.userID].carbonFootprint_failedJobs_share }} of the overall 62 | carbon footprint. 63 |

64 | {% include 'plotly_thisuserSuccessRate.html' %} 65 | 66 | {% include 'plotly_thisuserDailySuccessRate.html' %} 67 |
68 | 69 |
70 |
Memory efficiency
71 | 72 |

73 | Memory can be a significant source of waste, because the power draw from memory mainly depends 74 | on the memory available, not on the actual memory used. The chart below shows the distribution 75 | of the memory efficiency collected from {{ usersActivity[user.userID].n_jobs }} jobs 76 | between {{ startDay }} and {{ endDay }} (the closer to 100% the better). 77 |

78 | 79 | {% include 'plotly_thisuserMemoryEfficiency.html' %} 80 | 81 |

82 | Using the memory efficiency, we can estimate how much memory was needed to run a job. 83 | If all jobs above had been submitted with only the memory they needed (rounded up), 84 | you would have emitted {{ usersActivity[user.userID].carbonFootprint_wasted_memoryOverallocation }} less 85 | and saved {{ usersActivity[user.userID].cost_wasted_memoryOverallocation }}. 86 |

87 |
88 | 89 | 90 |
91 |
-------------------------------------------------------------------------------- /frontend/terminal_output.py: -------------------------------------------------------------------------------- 1 | 2 | import math 3 | from frontend.helpers import formatText_footprint, formatText_treemonths, formatText_flying 4 | import pandas as pd 5 | import os 6 | 7 | 8 | def formatText_driving(dist): 9 | """ 10 | Format the text to display the driving distance 11 | :param dist: [float] driving distance, in km 12 | :return: [str] text to display 13 | """ 14 | if dist < 10: 15 | text_driving = f"driving {dist:,.2f} km" 16 | else: 17 | text_driving = f"driving {dist:,.0f} km" 18 | return text_driving 19 | 20 | def generate_terminal_view(dict_stats_all, args, cluster_info): 21 | 22 | user_here = dict_stats_all['user'] 23 | dict_stats = dict_stats_all['userActivity'][user_here] 24 | text_nUsers = f"- user: {user_here} -" 25 | 26 | ## various variables 27 | clusterName = cluster_info['cluster_name'] 28 | 29 | ## energy 30 | dcOverheads = dict_stats['energy'] - dict_stats['energy_CPUs'] - dict_stats['energy_GPUs'] - dict_stats['energy_memory'] 31 | 32 | ## Carbon footprint 33 | text_footprint = formatText_footprint(dict_stats['carbonFootprint']) 34 | text_footprint_failedJobs = formatText_footprint(dict_stats['carbonFootprint_failedJobs']) 35 | text_footprint_wasted_memoryOverallocation = formatText_footprint(dict_stats['carbonFootprint']-dict_stats['carbonFootprint_memoryNeededOnly']) 36 | 37 | ## Context 38 | text_trees = formatText_treemonths(dict_stats['treeMonths']) 39 | text_trees_failedJobs = formatText_treemonths(dict_stats['treeMonths_failedJobs']) 40 | text_trees_wasted_memoryOverallocation = formatText_treemonths(dict_stats['treeMonths']-dict_stats['treeMonths_memoryNeededOnly']) 41 | text_driving = formatText_driving(dict_stats['driving']) 42 | text_flying = formatText_flying(dict_stats) 43 | 44 | ### Text filterCWD 45 | if args.filterWD is None: 46 | text_filterCWD = '' 47 | else: 48 | text_filterCWD = f"\n (NB: The only jobs considered here are those launched from {args.filterWD})\n" 49 | 50 | ### Text filterJobIDs 51 | if args.filterJobIDs == 'all': 52 | text_filterJobIDs = '' 53 | else: 54 | text_filterJobIDs = f"\n (NB: The only jobs considered here are those with job IDs: {args.filterJobIDs})\n" 55 | 56 | ### Text filter Account 57 | if args.filterAccount is None: 58 | text_filterAccount = '' 59 | else: 60 | text_filterAccount = f"\n (NB: The only jobs considered here are those charged under {args.filterAccount})\n" 61 | 62 | ### To get the title length right 63 | title_row1 = f"Carbon footprint on {clusterName}" 64 | title_row2 = text_nUsers 65 | title_row3 = f"({args.startDay} / {args.endDay})" 66 | max_length = max([len(title_row1), len(title_row2), len(title_row3)]) 67 | 68 | title_row1_full = f"# {' '*math.floor((max_length-len(title_row1))/2)}{title_row1}{' '*math.ceil((max_length-len(title_row1))/2)} #" 69 | title_row2_full = f"# {' '*math.floor((max_length-len(title_row2))/2)}{title_row2}{' '*math.ceil((max_length-len(title_row2))/2)} #" 70 | title_row3_full = f"# {' '*math.floor((max_length-len(title_row3))/2)}{title_row3}{' '*math.ceil((max_length-len(title_row3))/2)} #" 71 | 72 | title = f''' 73 | {'#'*(max_length+6)} 74 | #{' '*(max_length+4)}# 75 | {title_row1_full} 76 | {title_row2_full} 77 | {title_row3_full} 78 | #{' '*(max_length+4)}# 79 | {'#'*(max_length+6)} 80 | ''' 81 | 82 | return f''' 83 | {title} 84 | 85 | {'-' * (len(text_footprint) + 6)} 86 | | {text_footprint} | 87 | 
{'-' * (len(text_footprint) + 6)} 88 | 89 | ...This is equivalent to: 90 | - {text_trees} 91 | - {text_driving} 92 | - {text_flying} 93 | 94 | ...{dict_stats['failure_rate']:.1%} of the jobs failed, these represent a waste of {text_footprint_failedJobs} ({text_trees_failedJobs}). 95 | ...On average, the jobs request at least {dict_stats['memoryOverallocationFactor']:,.1f} times the memory needed. By only requesting the memory needed, {text_footprint_wasted_memoryOverallocation} ({text_trees_wasted_memoryOverallocation}) could have been saved. 96 | {text_filterCWD}{text_filterJobIDs}{text_filterAccount} 97 | Energy used: {dict_stats['energy']:,.2f} kWh 98 | - CPUs: {dict_stats['energy_CPUs']:,.2f} kWh ({dict_stats['energy_CPUs'] / dict_stats['energy']:.2%}) 99 | - GPUs: {dict_stats['energy_GPUs']:,.2f} kWh ({dict_stats['energy_GPUs'] / dict_stats['energy']:.2%}) 100 | - Memory: {dict_stats['energy_memory']:,.2f} kWh ({dict_stats['energy_memory'] / dict_stats['energy']:.2%}) 101 | - Data centre overheads: {dcOverheads:,.2f} kWh ({dcOverheads / dict_stats['energy']:.2%}) 102 | Carbon intensity used for the calculations: {cluster_info['CI']:,} gCO2e/kWh 103 | 104 | Summary of usage: 105 | - First/last job recorded on that period: {str(dict_stats['first_job_period'].date())}/{str(dict_stats['last_job_period'].date())} 106 | - Number of jobs: {dict_stats['n_jobs']:,} ({dict_stats['n_success']:,} completed) 107 | - Core hours used/charged: {dict_stats['CPUhoursCharged']:,.1f} (CPU), {dict_stats['GPUhoursCharged']:,.1f} (GPU), {dict_stats['CPUhoursCharged']+dict_stats['GPUhoursCharged']:,.1f} (total). 108 | - Total usage time (i.e. when cores were performing computations): 109 | - CPU: {str(dict_stats['cpuTime'])} ({dict_stats['cpuTime'].total_seconds()/3600:,.0f} hours) 110 | - GPU: {str(dict_stats['gpuTime'])} ({dict_stats['gpuTime'].total_seconds()/3600:,.0f} hours) 111 | - Total wallclock time: {str(dict_stats['wallclockTime'])} 112 | - Total memory requested: {dict_stats['memoryRequested']:,.0f} GB 113 | 114 | Limitations to keep in mind: 115 | - The workload manager doesn't alway log the exact CPU usage time, and when this information is missing, we assume that all cores are used at 100%. 116 | - For now, we assume that for GPU jobs, the GPUs are used at 100% (as the information needed for more accurate measurement is not available) 117 | (this may lead to slightly overestimated carbon footprints, although the order of magnitude is likely to be correct) 118 | - Conversely, the wasted energy due to memory overallocation may be largely underestimated, as the information needed is not always logged. 119 | 120 | Any bugs, questions, suggestions? Post on GitHub (GreenAlgorithms/GreenAlgorithms4HPC) or email LL582@medschl.cam.ac.uk 121 | {'-' * 80} 122 | Calculated using the Green Algorithms framework: www.green-algorithms.org 123 | Please cite https://onlinelibrary.wiley.com/doi/10.1002/advs.202100707 124 | ''' 125 | 126 | -------------------------------------------------------------------------------- /frontend/templates/report_blank.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Green Algorithms dashboard 6 | 7 | 8 | 9 | 10 | 11 | 12 |
13 |
14 | 15 | 22 |

23 | Last updated: {{ last_updated }} 24 |

25 |
26 | 27 |
28 |
29 |
30 | 39 |
40 |
41 | 42 | 43 |
44 |
45 |
46 |

47 | This is an early version, please report any bug you find! 48 |

49 |
50 | 51 |

52 | Computing is a major contributor to energy consumption, and thus is one of the main sources of 53 | the carbon emission of our research. 54 | In the context of the global climate crisis, it is imperative that individuals and organizations 55 | find ways to assess then reduce the carbon footprint of their work. 56 |

57 | 58 |

59 | This page aims to represent the carbon footprint that we are, collectively and individually, 60 | responsible for at {{ institution }}. 61 | SLURM jobs submitted to the {{ cluster_name }} High Performance Cluster are logged automatically 62 | (including information such as resource requested, run time, memory efficiency, etc.), 63 | and the corresponding carbon footprint was calculated using the framework proposed 64 | by Green Algorithms and the following assumptions: 65 |

66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 |
CPU{{ texts_intro.CPU }}
GPU{{ texts_intro.GPU }}
Memory power0.3725 W/GB
Power usage effectiveness{{ PUE }}
Carbon intensity{{ CI }} gCO2e/kWh
Energy cost{{ energy_cost_perkWh.currency }}{{ energy_cost_perkWh.cost }}/kWh
95 | 96 |
97 |

98 | We built this tool in the hope to raise awareness of computing usage, 99 | highlight resources waste, and foster good computing practices. 100 | This is intended to be a lightweight carbon footprint calculator, not a cluster monitoring system. 101 |

102 |
103 |
104 | 105 | {% if include_user_context %} 106 | {% include "_user.html" %} 107 | {% endif %} 108 | 109 |
110 |

Credits

111 |

112 | This dashboard is the combination of a template developed at EMBL-EBI by Matthias Blum and Alex Bateman, 113 | and the Green Algorithms project led by Loïc Lannelongue and Michael Inouye. 114 | The carbon footprint calculations are described on the Green Algorithms project's website. 115 |

116 |
117 | 118 |
119 |

Contact

120 |

121 | If you want to report a bug or a user assigned to the wrong team, 122 | request a feature, or just give some general feedback, you can email LL582@medschl.cam.ac.uk. 123 |

124 |
125 | 126 |
127 |

FAQ

128 |

129 | How is the information on SLURM jobs collected? 130 | Logs are pulled using the `sacct` command. It's all powered by the GA4HPC methods, 131 | you can check it out there.

133 | 134 |

135 | Where can I ask more questions? 136 | On the GitHub here or by email (see above).

138 | 139 | 140 |
141 | 142 |
143 | 144 |
145 | 146 |
147 | 148 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import datetime 4 | import os 5 | 6 | from backend import main_backend 7 | from frontend import main_frontend 8 | 9 | def create_arguments(): 10 | """ 11 | Command line arguments for the tool. 12 | :return: argparse object 13 | """ 14 | parser = argparse.ArgumentParser(description=f'Calculate your carbon footprint on the server.') 15 | 16 | default_endDay = datetime.date.today().strftime("%Y-%m-%d") # today 17 | default_startDay = f"{datetime.date.today().year}-01-01" # start of the year 18 | 19 | ## Timeframe 20 | parser.add_argument('-S', '--startDay', type=str, 21 | help=f'The first day to take into account, as YYYY-MM-DD (default: {default_startDay})', 22 | default=default_startDay) 23 | parser.add_argument('-E', '--endDay', type=str, 24 | help='The last day to take into account, as YYYY-MM-DD (default: today)', 25 | default=default_endDay) 26 | 27 | ## How to display the report 28 | parser.add_argument('-o', '--output', type=str, 29 | help="How to display the results, one of 'terminal' or 'html' (default: terminal)", 30 | default='terminal') 31 | parser.add_argument('--outputDir', type=str, 32 | help="Export path for the output (default: under `output/`). Only used with `--output html`.", 33 | default='outputs') 34 | 35 | ## Filter out jobs 36 | parser.add_argument('--filterCWD', action='store_true', 37 | help='Only report on jobs launched from the current location.') 38 | parser.add_argument('--userCWD', type=str, help=argparse.SUPPRESS) 39 | parser.add_argument('--filterJobIDs', type=str, 40 | help='Comma separated list of Job IDs you want to filter on. (default: "all")', 41 | default='all') 42 | parser.add_argument('--filterAccount', type=str, 43 | help='Only consider jobs charged under this account') 44 | parser.add_argument('--customSuccessStates', type=str, default='', 45 | help="Comma-separated list of job states. By default, only jobs that exit with status CD or \ 46 | COMPLETED are considered successful (PENDING, RUNNING and REQUEUD are ignored). \ 47 | Jobs with states listed here will be considered successful as well (best to list both \ 48 | 2-letter and full-length codes. Full list of job states: \ 49 | https://slurm.schedmd.com/squeue.html#SECTION_JOB-STATE-CODES") 50 | 51 | ## Reporting bugs 52 | group1 = parser.add_mutually_exclusive_group() 53 | group1.add_argument('--reportBug', action='store_true', 54 | help='In case of a bug, this flag exports the jobs logs so that you/we can investigate further. ' 55 | 'The debug file will be stored in the shared folder where this tool is located (under /outputs), ' 56 | 'to export it to your home folder, user `--reportBugHere`. ' 57 | 'Note that this will write out some basic information about your jobs, such as runtime, ' 58 | 'number of cores and memory usage.' 59 | ) 60 | group1.add_argument('--reportBugHere', action='store_true', 61 | help='Similar to --reportBug, but exports the output to your home folder.') 62 | group2 = parser.add_mutually_exclusive_group() 63 | group2.add_argument('--useCustomLogs', type=str, default='', 64 | help='This bypasses the workload manager, and enables you to input a custom log file of your jobs. \ 65 | This is mostly meant for debugging, but can be useful in some situations. 
' 66 | 'An example of the expected file can be found at `example_files/example_sacctOutput_raw.txt`.') 67 | # Arguments for debugging only (not visible to users) 68 | # To ue arbitrary folder for the infrastructure information 69 | parser.add_argument('--useOtherInfrastuctureInfo', type=str, default='', help=argparse.SUPPRESS) 70 | # Uses mock aggregated usage data, for offline debugging 71 | group2.add_argument('--use_mock_agg_data', action='store_true', help=argparse.SUPPRESS) 72 | 73 | args = parser.parse_args() 74 | return args 75 | 76 | class validate_args(): 77 | """ 78 | Class used to validate all the arguments provided. 79 | """ 80 | # TODO add validation 81 | # TODO test these 82 | 83 | def _validate_dates(self, args): 84 | """ 85 | Validates that `startDay` and `endDay` are in the right format and in the right order. 86 | """ 87 | for x in [args.startDay, args.endDay]: 88 | try: 89 | datetime.datetime.strptime(x, '%Y-%m-%d') 90 | except ValueError: 91 | raise ValueError(f"Incorrect date format, should be YYYY-MM-DD but is: {x}") 92 | 93 | foo = datetime.datetime.strptime(args.startDay, '%Y-%m-%d') 94 | bar = datetime.datetime.strptime(args.endDay, '%Y-%m-%d') 95 | if foo > bar: 96 | raise ValueError(f"Start date ({args.startDay}) is after the end date ({args.endDay}).") 97 | 98 | def _validate_output(self, args): 99 | """ 100 | Validates that --output is one of the accepted options. 101 | """ 102 | list_options = ['terminal', 'html'] 103 | if args.output not in list_options: 104 | raise ValueError(f"output argument invalid. Is {args.output} but should be one of {list_options}") 105 | 106 | 107 | def all(self, args): 108 | self._validate_dates(args) 109 | self._validate_output(args) 110 | 111 | if __name__ == "__main__": 112 | # print("Working dir0: ", os.getcwd()) # DEBUGONLY 113 | 114 | args = create_arguments() 115 | 116 | ## Decide which infrastructure info to use 117 | if args.useOtherInfrastuctureInfo != '': 118 | args.path_infrastucture_info = args.useOtherInfrastuctureInfo 119 | print(f"Overriding infrastructure info with: {args.path_infrastucture_info}") 120 | else: 121 | args.path_infrastucture_info = 'data' 122 | 123 | ## Organise the unique output directory (used for output report and logs export for debugging) 124 | ## creating a uniquely named subdirectory in whatever 125 | # Decide if an output directory is needed at all 126 | if (args.output in ['html']) | args.reportBug | args.reportBugHere: 127 | timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M-%S%f') 128 | args.outputDir2use = { 129 | 'timestamp': timestamp, 130 | 'path': os.path.join(args.outputDir, f"outputs_{timestamp}") 131 | } 132 | 133 | # Create directory 134 | os.makedirs(args.outputDir2use["path"]) 135 | 136 | else: 137 | # no output is created 138 | args.outputDir2use = None 139 | 140 | ### Set the WD to filter on, if needed 141 | if args.filterCWD: 142 | args.filterWD = args.userCWD 143 | print("\nNB: --filterCWD doesn't work with symbolic links (yet!)\n") 144 | else: 145 | args.filterWD = None 146 | 147 | ### Validate input 148 | validate_args().all(args) 149 | 150 | ### Run backend to get data 151 | extracted_data = main_backend(args) 152 | 153 | main_frontend(extracted_data, args) -------------------------------------------------------------------------------- /frontend/templates/styles.css: -------------------------------------------------------------------------------- 1 | @import 
url('https://fonts.googleapis.com/css2?family=IBM+Plex+Sans:ital,wght@0,400;0,700;1,400;1,700&display=swap'); 2 | 3 | @media only screen and (min-width: 993px) { 4 | .container { 5 | width: 85%; 6 | } 7 | } 8 | 9 | @media only screen and (min-width: 1201px) { 10 | html { 11 | font-size:16px; 12 | } 13 | } 14 | 15 | /* Override Materialize */ 16 | html, 17 | button, 18 | input, 19 | optgroup, 20 | select, 21 | textarea { 22 | font-family: "IBM Plex Sans", sans-serif !important; 23 | } 24 | .btn, .tabs .tab { text-transform: inherit; } 25 | .btn.fluid { width: 100%; } 26 | td, th { 27 | padding: .5em .75em; 28 | } 29 | .section { 30 | padding: 2rem 0 0; 31 | } 32 | .section > h4 { 33 | margin-top: 0; 34 | } 35 | .input-field .right.circle { 36 | float: left !important; 37 | } 38 | img.circle { 39 | border: 1px solid #d4d4d5; 40 | } 41 | /*table#teams-table, table#users-table {*/ 42 | /* font-size: .9rem;*/ 43 | /*}*/ 44 | .card-panel { 45 | padding: 1.25rem; 46 | } 47 | .card-panel > .card-title { 48 | font-size: 1.25rem; 49 | margin-bottom: .5rem; 50 | } 51 | .card-panel > .card-title + p { 52 | margin-top: 0; 53 | } 54 | .card-panel.info { 55 | background-color: #e1f5fe; 56 | border-left: 0.25rem solid #03a9f4; 57 | } 58 | .card-panel.warning { 59 | background-color: #fff8e1; 60 | border-left: 0.25rem solid #ffc107; 61 | } 62 | .card-panel.alert { 63 | background-color: #ffebee; 64 | border-left: 0.25rem solid #f44336; 65 | } 66 | .card-panel > :first-child { 67 | margin-top: 0; 68 | } 69 | .card-panel > :last-child { 70 | margin-bottom: 0; 71 | } 72 | .tabs .tab a { 73 | color: inherit; 74 | } 75 | .tabs .tab a:hover, .tabs .tab a.active { 76 | color: inherit; 77 | } 78 | .tabs .indicator { 79 | height: 4px; 80 | background-color: #18974c; 81 | } 82 | .tabs .tab a:focus, .tabs .tab a:focus.active { 83 | /*background-color: rgba(0,123,83,0.2);*/ 84 | background-color: transparent; 85 | } 86 | input:not(.browser-default).invalid ~ .helper-text[data-error] > *, 87 | input:not(.browser-default).valid ~ .helper-text[data-success] > * { 88 | /* hide nested elements */ 89 | display: none; 90 | } 91 | .modal-content > form:last-child { 92 | margin-bottom: 0; 93 | } 94 | table + h6 { 95 | margin-top: 2rem; 96 | } 97 | pre { 98 | background-color: #f6f8fa; 99 | border: 1px solid rgba(30, 30, 30, 0.1); 100 | border-radius: 3px; 101 | color: #24292F; 102 | font-size: .85em; 103 | padding: 8px 16px; 104 | } 105 | 106 | #loader { 107 | position: fixed; 108 | left: 0; 109 | top: 0; 110 | width: 100%; 111 | height: 100%; 112 | background: #eee; 113 | padding: 5rem 0; 114 | color: #333; 115 | z-index: 99999; 116 | } 117 | input:not(.browser-default):focus:not([readonly]):not(.invalid) { 118 | border-bottom: 1px solid #3489ca !important; 119 | box-shadow: 0 1px 0 0 #3489ca !important; 120 | } 121 | input:not(.browser-default):focus:not([readonly]):not(.invalid) + label { 122 | color: #3489ca !important; 123 | } 124 | .custom.blue { background-color: #3489ca !important; } 125 | 126 | [type="checkbox"].custom.blue.filled-in:checked + span:not(.lever)::after { 127 | border: 2px solid #3489ca !important; 128 | background-color: #3489ca !important; 129 | } 130 | 131 | ::placeholder { 132 | color: rgb(90, 95, 95); 133 | opacity: 0.5; 134 | } 135 | 136 | /* Header */ 137 | header { 138 | border-bottom: 1px solid rgba(0,0,0,.14); 139 | } 140 | #top-nav { 141 | background-color: inherit; 142 | box-shadow: inherit; 143 | color: #18974c; 144 | height: 150px; 145 | display: flex; 146 | flex-direction: row; 147 | 
align-items: center; 148 | margin-bottom: 20px; 149 | } 150 | #title { 151 | display: flex; 152 | flex-direction: column; 153 | } 154 | #title h1 { 155 | margin: 1rem 0 0; 156 | width: 100%; 157 | text-align: center; 158 | } 159 | #title h2 { 160 | margin: 1rem 0 0; 161 | width: 100%; 162 | text-align: center; 163 | font-size: 3rem; 164 | } 165 | #top-nav i { 166 | font-size: 4.2rem; 167 | width: 10%; 168 | margin: 1rem; 169 | } 170 | header p { 171 | color: rgba(0, 0, 0, .5); 172 | margin: 0 0 1rem; 173 | text-align: right; 174 | } 175 | 176 | /* Intro/Abstract */ 177 | #intro > p:first-child { 178 | font-weight: bold; 179 | font-size: 110%; 180 | } 181 | 182 | /* Autocomplete highlight */ 183 | .dropdown-content li > span { 184 | color: #444; 185 | } 186 | .autocomplete-content li .highlight { 187 | color: #26a69a; 188 | } 189 | 190 | #remove-user { 191 | width: 100%; 192 | height: 48px; 193 | } 194 | 195 | table + .pagination { 196 | margin-top: 1rem; 197 | text-align: center; 198 | } 199 | .pagination li { 200 | /* Override Materialize */ 201 | vertical-align: auto; 202 | height: auto; 203 | margin: .25rem; 204 | } 205 | .pagination li.active { 206 | background-color: #485fc7; 207 | border-color: #485fc7; 208 | color: #fff; 209 | } 210 | .pagination li a { 211 | color: #363636; 212 | font-size: 1rem; 213 | height: auto; 214 | line-height: normal; 215 | min-width: 2.5em; 216 | padding: .5rem; 217 | user-select: none; 218 | } 219 | .pagination li:not(.ellipsis) { 220 | border: 1px solid #dbdbdb; 221 | border-radius: .375em; 222 | } 223 | 224 | .pagination li.ellipsis { 225 | pointer-events: none; 226 | } 227 | 228 | thead th.sortable { 229 | position: relative; 230 | padding-right: 20px; 231 | } 232 | thead th.sortable:hover { 233 | cursor: pointer; 234 | } 235 | thead th.sortable:hover::after { 236 | opacity: .25; 237 | } 238 | thead th.sortable::after { 239 | position: absolute; 240 | display: inline-block; 241 | opacity: .15; 242 | right: 10px; 243 | font-size: .8em; 244 | content: "▲"; 245 | } 246 | 247 | thead th.sortable.asc::after { 248 | content: "▲"; 249 | opacity: 1; 250 | } 251 | 252 | thead th.sortable.desc::after { 253 | content: "▼"; 254 | opacity: 1; 255 | } 256 | .table-search { 257 | float: right; 258 | width: 350px; 259 | margin-bottom: 0; 260 | } 261 | .table-search > input[type="text"]:not(.browser-default) { 262 | border: 1px solid #9e9e9e; 263 | margin: 0; 264 | padding-left: 20px; 265 | } 266 | .table-search > input[type="text"]:focus:not(.browser-default) { 267 | border: 1px solid #3489ca !important; 268 | box-shadow: none !important; 269 | } 270 | 271 | .table-of-contents a { 272 | border-color: #4caf50 !important; 273 | } 274 | .table-of-contents a.active { 275 | font-weight: bold; 276 | } 277 | .table-of-contents a ~ ul li { 278 | line-height: 1; 279 | /*padding: 0;*/ 280 | } 281 | .table-of-contents a ~ ul li a { 282 | font-size: .85em; 283 | color: rgba(117, 117, 117, 0.75); 284 | line-height: 1; 285 | height: auto; 286 | } 287 | .table-of-contents a:not(.active) ~ ul { 288 | /*display: none;*/ 289 | } 290 | 291 | .highcharts-tooltip table > tbody > tr { 292 | border: none !important; 293 | } 294 | 295 | .highcharts-tooltip table > tbody > tr > td { 296 | padding: .25em .5em; 297 | } 298 | #user-info { 299 | display: flex; 300 | align-items: center; 301 | margin-bottom: 1rem; 302 | /*height: 100px;*/ 303 | } 304 | #user-info > img { 305 | height: 100px; 306 | width: 100px; 307 | } 308 | #user-info > .content { 309 | padding-left: 1.5rem; 310 | } 311 | 
#user-info > .content > h6 { 312 | /*display: inline-block;*/ 313 | font-weight: 700; 314 | margin: 0; 315 | } 316 | #user-info > .content > .block { 317 | margin: .25em 0 .25em; 318 | color: rgba(0,0,0,.6); 319 | } 320 | #faq .question { 321 | font-weight: bold; 322 | display: block; 323 | } 324 | .stats-summary .col i { 325 | display: block; 326 | font-size: 3rem; 327 | margin-bottom: .5rem; 328 | } 329 | .stats-summary .col span:not([data-stat]) { 330 | font-size: 1.15rem; 331 | } 332 | .stats-summary .col span[data-stat] { 333 | display: block; 334 | font-size: 1.5rem; 335 | font-weight: 700; 336 | } 337 | #contact-email, #contact-slack { 338 | white-space: nowrap; 339 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GA4HPC: Green Algorithms for High Performance Computing 2 | 3 | > :point_right: There are many different flavours of SLURM setups, so no doubt you'll find some bugs... 4 | please let us know what you find so that we can make it work for more people! 5 | 6 | The aim of this code is to implement the Green Algorithms framework 7 | (more [here](https://onlinelibrary.wiley.com/doi/abs/10.1002/advs.202100707) 8 | and on [www.green-algorithms.org](www.green-algorithms.org)) 9 | directly on HPC clusters powered by SLURM (although it could work for other workload managers, see below). 10 | 11 | As a user, it pulls your usage statistics from the workload manager's logs and then it estimates your carbon footprint based on this usage. 12 | It reports a range of statistics such as energy usage, carbon footprints, compute use, memory efficiency, impact of failed jobs etc. 13 | 14 | The default output is in the terminal (example below), but we have now added the option of a richer html output (example coming soon). 15 | 16 | https://github.com/GreenAlgorithms/GreenAlgorithms4HPC/blob/main/example_files/Screenshot%20HPC%202024-08-20.png 17 | 18 | ![example file](https://github.com/GreenAlgorithms/GreenAlgorithms4HPC/blob/main/example_files/Screenshot%20HPC%202024-08-20.png) 19 | 20 | ## Quick start 21 | 22 | The tool only needs to be installed once, preferably in a shared drive so that all users can access it without installing 23 | it for themselves. 24 | 25 | :warning: Even if it's in a shared drive, each user will only be able to see their own usage. 26 | However, if the HTML output is used without a custom output directory, the report will also be located on the shared drive 27 | (more on this below). 28 | 29 | ### If GA4HPC is not installed yet 30 | 31 | Then it's on you to install it: see the installation guide below. 32 | 33 | ### If GA4HPC is already installed 34 | 35 | Then you can run it straight away to find out your own carbon footprint. 36 | Assuming it's installed in `shared_directory`, all you have to do is run the command below on the SLURM cluster to obtain the carbon footprint between two dates. 37 | ```bash 38 | shared_directory/myCarbonFootprint.sh --startDay 2024-01-10 --endDay 2024-08-15 39 | ``` 40 | 41 | You can customise the output with a number of options (full list below), but the main ones are: 42 | - `-S --startDay` and `-E --endDay`: formatted as YYYY-MM-DD to restrict the logs considered. 43 | - `-o --output`: `-o terminal` to have the terminal output (default) or `-o html` for the html report. 44 | In case of the html report, a subdirectory will be created for it.
45 | By default, it's under `GreenAlgorithms4HPC/outputs/`, but this can be changed. 46 | - `--outputDir` to provide a path where to export any output. 47 | 48 | ### Limitations to keep in mind 49 | 50 | - The workload manager doesn't alway log the exact CPU usage time, and when this information is missing, we assume that all cores are used at 100%. 51 | - For now, we assume that GPUs are used at 100% (as the information needed for more accurate measurement is not available) 52 | (this may lead to slightly overestimated carbon footprints, although the order of magnitude is likely to be correct) 53 | - Conversely, the wasted energy due to memory overallocation may be largely underestimated, as the information needed is not always logged. 54 | 55 | 56 | ## Full list of options 57 | 58 | ``` 59 | usage: __init__.py [-h] [-S STARTDAY] [-E ENDDAY] [-o OUTPUT] [--outputDir OUTPUTDIR] [--filterCWD] [--filterJobIDs FILTERJOBIDS] [--filterAccount FILTERACCOUNT] [--customSuccessStates CUSTOMSUCCESSSTATES] 60 | [--reportBug | --reportBugHere] [--useCustomLogs USECUSTOMLOGS] 61 | 62 | Calculate your carbon footprint on the server. 63 | 64 | optional arguments: 65 | -h, --help show this help message and exit 66 | -S STARTDAY, --startDay STARTDAY 67 | The first day to take into account, as YYYY-MM-DD (default: 2024-01-01) 68 | -E ENDDAY, --endDay ENDDAY 69 | The last day to take into account, as YYYY-MM-DD (default: today) 70 | -o OUTPUT, --output OUTPUT 71 | How to display the results, one of 'terminal' or 'html' (default: terminal) 72 | --outputDir OUTPUTDIR 73 | Export path for the output (default: under `outputs/`). Only used with `--output html` and `--reportBug`. 74 | --filterCWD Only report on jobs launched from the current location. 75 | --filterJobIDs FILTERJOBIDS 76 | Comma separated list of Job IDs you want to filter on. (default: "all") 77 | --filterAccount FILTERACCOUNT 78 | Only consider jobs charged under this account 79 | --customSuccessStates CUSTOMSUCCESSSTATES 80 | Comma-separated list of job states. By default, only jobs that exit with status CD or COMPLETED are considered successful (PENDING, RUNNING and REQUEUD are ignored). Jobs with states listed here will 81 | be considered successful as well (best to list both 2-letter and full-length codes. Full list of job states: https://slurm.schedmd.com/squeue.html#SECTION_JOB-STATE-CODES 82 | --reportBug In case of a bug, this flag exports the jobs logs so that you/we can investigate further. The debug file will be stored in the shared folder where this tool is located (under /outputs), to export it to 83 | your home folder, user `--reportBugHere`. Note that this will write out some basic information about your jobs, such as runtime, number of cores and memory usage. 84 | --reportBugHere Similar to --reportBug, but exports the output to your home folder. 85 | --useCustomLogs USECUSTOMLOGS 86 | This bypasses the workload manager, and enables you to input a custom log file of your jobs. This is mostly meant for debugging, but can be useful in some situations. An example of the expected file 87 | can be found at `example_files/example_sacctOutput_raw.txt`. 88 | ``` 89 | 90 | ## Installation guide 91 | 92 | :point_right: Only needs to be installed once on a cluster, check first that someone else hasn't installed it yet! 93 | 94 | ### Requirements 95 | - Python 3.8+ (can probably be adjusted to older versions of python fairly easily). 96 | 97 | ### Step-by-step 98 | 99 | 1. 
Clone this repository in a shared directory on your cluster: 100 | ```bash 101 | $ cd shared_directory 102 | $ git clone https://github.com/Llannelongue/GreenAlgorithms4HPC.git 103 | ``` 104 | 105 | 2. Edit `myCarbonFootprint.sh` so that the virtual environment is created with Python 3.8 or later (the line to change is marked "this line needs updating" in the script). 106 | The default line is: 107 | ```bash 108 | /usr/bin/python3.8 -m venv GA_env 109 | ``` 110 | But it may be something else on your server, for example: 111 | ```bash 112 | module load python/3.11.7 113 | python -m venv GA_env 114 | ``` 115 | 116 | 3. Make the bash script executable: 117 | ```bash 118 | $ chmod +x shared_directory/GreenAlgorithms4HPC/myCarbonFootprint.sh 119 | ``` 120 | 121 | 4. Edit `cluster_info.yaml` to plug in the values corresponding to the hardware specs of your cluster 122 | (this is the tricky step). You can ask your HPC team, and 123 | you can find a lot of useful values on the Green Algorithms GitHub: https://github.com/GreenAlgorithms/green-algorithms-tool/tree/master/data 124 | 125 | 5. Run the script a first time. It will check that the correct version of Python is used 126 | and will create the virtualenv with the required packages, based on `requirements.txt`: 127 | ```shell script 128 | $ shared_directory/GreenAlgorithms4HPC/myCarbonFootprint.sh 129 | ``` 130 | 131 | ### How to update the software once installed 132 | 133 | _More elegant solutions welcome! [Discussion here](https://github.com/Llannelongue/GreenAlgorithms4HPC/issues/1)._ 134 | 135 | ⚠️ Make sure you have saved your custom version of `cluster_info.yaml` 136 | and your edit to `myCarbonFootprint.sh` for loading Python 3.8+, as local changes will be overwritten. 137 | 138 | - `git reset --hard` to remove local changes to files (hence the need for a backup!) 139 | - `git pull` 140 | - Update `cluster_info.yaml` and `myCarbonFootprint.sh` as described above. 141 | - `chmod +x myCarbonFootprint.sh` to make it executable again 142 | - Test `myCarbonFootprint.sh` 143 | 144 | ## FAQ 145 | 146 | ### Can it work with other workload managers? 147 | 148 | Yes it can, but we have only written the code for SLURM so far. 149 | What you can do is adapt [`slurm_extract.py`](backend/slurm_extract.py) for your own workload manager. 150 | 151 | In a nutshell, you just need to create a variable `self.df_agg_X` similar to the example file [here](example_files/example_output_workloadManager.tsv) 152 | (only the columns with a name ending in X in the code are needed); a hedged sketch of what this could look like is included at the end of this README. 153 | 154 | ### How to debug errors 155 | There are some examples of intermediary files in [example_files/](example_files/). 156 | 157 | For the workload manager part of the code: 158 | - [The raw output](example_files/example_sacctOutput_raw.txt) ([here](example_files/example_sacctOutput_raw_asDF.tsv) as a table) from the `sacct` SLURM command (this is the command pulling all the logs from SLURM), i.e. `WM.logs_raw`, the output of `WM.pull_logs()`. 159 | - [The cleaned output of the workload manager step](example_files/example_output_workloadManager.tsv), i.e. `WM.df_agg`, the output of `WM.clean_logs_df()`. Only the columns with a name ending with X are needed (the other ones are being used by the workload manager script). NB: the `pd.DataFrame` has been converted to a csv to be included here.
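### What could an adaptation for another workload manager look like?

The sketch below is an editorial illustration, not part of the repository: it only shows the general shape of the `df_agg`-style table described in the FAQ above, for a hypothetical non-SLURM scheduler. The X-suffixed column names are taken from `example_output_workloadManager.tsv` and from `simulate_mock_jobs()` in `backend/helpers.py`; the authoritative list is whatever `slurm_extract.py` actually produces, so check that file before relying on this. The input format (`raw_jobs` dictionaries with keys such as `walltime_s` or `req_mem_GB`) is invented for the example.

```python
import datetime
import numpy as np
import pandas as pd

def build_df_agg(raw_jobs):
    """Turn a list of per-job dicts (parsed from your scheduler's accounting
    logs) into a DataFrame with the X-suffixed columns used downstream."""
    rows = []
    for job in raw_jobs:
        wallclock = datetime.timedelta(seconds=job["walltime_s"])
        rows.append({
            "SubmitDatetimeX": job["submit_datetime"],
            "WallclockTimeX": wallclock,
            "ReqMemX": job["req_mem_GB"],                      # memory requested, in GB
            "NeededMemX": job.get("used_mem_GB", job["req_mem_GB"]),
            "PartitionX": job["queue"],
            "PartitionTypeX": "GPU" if job["n_gpus"] > 0 else "CPU",
            "StateX": 1 if job["exit_status"] == 0 else 0,     # 1 = successful job
            # If per-core usage isn't logged, assume 100% usage (as for SLURM)
            "TotalCPUtime2useX": wallclock * job["n_cpus"],
            "TotalGPUtime2useX": wallclock * job["n_gpus"],
        })
    df_agg = pd.DataFrame(rows)
    df_agg["CPUhoursChargedX"] = df_agg.TotalCPUtime2useX / np.timedelta64(1, "h")
    df_agg["GPUhoursChargedX"] = df_agg.TotalGPUtime2useX / np.timedelta64(1, "h")
    df_agg["memOverallocationFactorX"] = df_agg.ReqMemX / df_agg.NeededMemX
    return df_agg

# Minimal usage example with one made-up job record:
jobs = [{"submit_datetime": datetime.datetime(2024, 5, 1, 9, 30), "walltime_s": 7200,
         "req_mem_GB": 16.0, "used_mem_GB": 4.0, "queue": "compute",
         "n_cpus": 8, "n_gpus": 0, "exit_status": 0}]
print(build_df_agg(jobs))
```

The rest of the pipeline (aggregation, carbon calculations, terminal/html report) only reads these X-suffixed columns, so in principle no other part of the code should need changing.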
160 | -------------------------------------------------------------------------------- /frontend/dashboard_output.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | from jinja2 import Environment, FileSystemLoader 4 | # from jinja2 import select_autoescape, DebugUndefined, StrictUndefined, Undefined 5 | import datetime 6 | from pprint import pprint 7 | import pandas as pd 8 | import numpy as np 9 | import plotly.express as px 10 | 11 | from frontend.helpers import formatText_footprint, formatText_treemonths, formatText_flying 12 | 13 | # class SilentUndefined(Undefined): # DEBUGONLY 14 | # def _fail_with_undefined_error(self, *args, **kwargs): 15 | # return '!MISSING!' 16 | 17 | def formatText_timedelta_short(dt): 18 | dt_sec = dt.total_seconds() 19 | hour = 3600 20 | day = 24*hour 21 | year = 365*day 22 | if dt_sec >= year: 23 | return f"{dt_sec / year:.1f} year{'' if int(dt_sec/year)==1 else 's'}" 24 | elif dt_sec > 2*day: 25 | return f"{dt_sec / day:.1f} days" 26 | elif dt_sec >= hour: 27 | return f"{dt_sec / hour:.1f} hour{'' if int(dt_sec/hour)==1 else 's'}" 28 | else: 29 | return f"{dt_sec:.2f} seconds" 30 | 31 | def formatText_cost(cost, cluster_info): 32 | return f"{cluster_info['energy_cost']['currency']}{cost:,.0f}" 33 | 34 | def get_summary_texts(dict_in, cluster_info): 35 | output = { 36 | 'cpuTime': formatText_timedelta_short(dict_in['cpuTime']), 37 | 'gpuTime': formatText_timedelta_short(dict_in['gpuTime']), 38 | 'carbonFootprint': formatText_footprint(dict_in['carbonFootprint'], use_html=True), 39 | 'carbonFootprint_failedJobs': formatText_footprint(dict_in['carbonFootprint_failedJobs'], use_html=True), 40 | 'carbonFootprint_failedJobs_share': f"{dict_in['carbonFootprint_failedJobs']/dict_in['carbonFootprint']:.2%}", 41 | 'carbonFootprint_wasted_memoryOverallocation': formatText_footprint(dict_in['carbonFootprint']-dict_in['carbonFootprint_memoryNeededOnly'], use_html=True), 42 | 'share_carbonFootprint': f"{dict_in['share_carbonFootprint']:.2%}", 43 | 'trees': formatText_treemonths(dict_in['treeMonths'], splitMonthsYear=False), 44 | 'flying': formatText_flying(dict_in, output_format='dict'), 45 | 'cost': formatText_cost(dict_in['cost'], cluster_info=cluster_info), 46 | 'cost_failedJobs': formatText_cost(dict_in['cost_failedJobs'], cluster_info=cluster_info), 47 | 'cost_wasted_memoryOverallocation': formatText_cost(dict_in['cost']-dict_in['cost_memoryNeededOnly'], cluster_info=cluster_info), 48 | 'n_jobs': f"{dict_in['n_jobs']:,}" 49 | } 50 | 51 | for key in dict_in: 52 | if key not in output: 53 | # print(f"adding {key}") 54 | output[key] = dict_in[key] 55 | 56 | return output 57 | 58 | class dashboard_html: 59 | def __init__(self, dict_stats, args, cluster_info): 60 | self.dict_stats = dict_stats 61 | self.args = args 62 | self.cluster_info = cluster_info 63 | 64 | self.context = { 65 | 'last_updated': datetime.datetime.now().strftime("%A %d %b %Y, %H:%M"), 66 | 'startDay': args.startDay, 67 | 'endDay': args.endDay, 68 | 'institution': cluster_info['institution'], 69 | 'cluster_name': cluster_info['cluster_name'], 70 | 'PUE': cluster_info['PUE'], 71 | 'CI': cluster_info['CI'], 72 | 'energy_cost_perkWh': cluster_info['energy_cost'], 73 | 'texts_intro': cluster_info['texts_intro'], 74 | } 75 | 76 | self.template_plotly = "plotly_white" 77 | self.custom_colours = { 78 | 'area': '#a6cee3' 79 | } 80 | self.height_plotly = 350 81 | 82 | self.user_here = dict_stats['user'] 83 | 84 | self.outputDir = 
args.outputDir2use['path'] 85 | self.plotsDir = os.path.join(self.outputDir, 'plots') 86 | os.makedirs(self.plotsDir) 87 | 88 | def _user_context(self): 89 | #################################### 90 | # User-specific part of the report # 91 | #################################### 92 | 93 | self.context['user'] = {'userID': self.user_here} 94 | 95 | self.context['usersActivity'] = { 96 | self.user_here: get_summary_texts( 97 | self.dict_stats['userActivity'][self.user_here], 98 | cluster_info=self.cluster_info 99 | ) 100 | } 101 | 102 | ### User's overall metrics 103 | 104 | df_userDaily_here = self.dict_stats['userDaily'] 105 | 106 | # Daily carbon footprint 107 | fig_userDailyCarbonFootprint = px.area( 108 | df_userDaily_here, x='SubmitDate', y="carbonFootprint", 109 | labels=dict(SubmitDate='', carbonFootprint='Carbon footprint (gCO2e)'), 110 | title="Daily carbon footprint", 111 | template=self.template_plotly, 112 | color_discrete_sequence=[self.custom_colours['area']] 113 | ) 114 | fig_userDailyCarbonFootprint.update_layout(height=self.height_plotly) 115 | fig_userDailyCarbonFootprint.write_html( 116 | os.path.join(self.plotsDir, "plotly_thisuserDailyCarbonFootprint.html"), 117 | include_plotlyjs='cdn' 118 | ) 119 | 120 | # Daily number of jobs 121 | fig_userDailyNjobs = px.area( 122 | df_userDaily_here, x='SubmitDate', y="n_jobs", 123 | labels=dict(SubmitDate='', n_jobs='Number of jobs started'), 124 | title="Number of jobs started", 125 | template=self.template_plotly, 126 | color_discrete_sequence=[self.custom_colours['area']] 127 | ) 128 | fig_userDailyNjobs.update_layout(height=self.height_plotly) 129 | fig_userDailyNjobs.write_html( 130 | os.path.join(self.plotsDir, "plotly_thisuserDailyNjobs.html"), 131 | include_plotlyjs='cdn' 132 | ) 133 | 134 | # Daily CPU time 135 | fig_userDailyCpuTime = px.area( 136 | df_userDaily_here, x='SubmitDate', y="CPUhoursCharged", 137 | labels=dict(SubmitDate='', CPUhoursCharged='CPU core-hours'), 138 | title="CPU core hours", 139 | template=self.template_plotly, 140 | color_discrete_sequence=[self.custom_colours['area']] 141 | ) 142 | fig_userDailyCpuTime.update_layout(height=self.height_plotly) 143 | fig_userDailyCpuTime.write_html( 144 | os.path.join(self.plotsDir, "plotly_thisuserDailyCpuTime.html"), 145 | include_plotlyjs='cdn' 146 | ) 147 | 148 | # Daily Memory requested 149 | fig_userDailyCpuTime = px.area( 150 | df_userDaily_here, x='SubmitDate', y="memoryRequested", 151 | labels=dict(SubmitDate='', memoryRequested='Memory requested (GB)'), 152 | title="Memory requested", 153 | template=self.template_plotly, 154 | color_discrete_sequence=[self.custom_colours['area']] 155 | ) 156 | fig_userDailyCpuTime.update_layout(height=self.height_plotly) 157 | fig_userDailyCpuTime.write_html( 158 | os.path.join(self.plotsDir, "plotly_thisuserDailyMemoryRequested.html"), 159 | include_plotlyjs='cdn' 160 | ) 161 | 162 | # Total success rate 163 | n_success = self.dict_stats['userActivity'][self.user_here]['n_success'] 164 | n_failure = self.dict_stats['userActivity'][self.user_here]['n_jobs'] - self.dict_stats['userActivity'][self.user_here]['n_success'] 165 | foo = pd.DataFrame({ 166 | 'Status': ['Success', 'Failure'], 167 | 'Number of jobs': [n_success, n_failure] 168 | }) 169 | fig_userSuccessRate = px.pie( 170 | foo, values='Number of jobs', names='Status', color='Status', 171 | color_discrete_map={'Success':"#A9DFBF", 'Failure': "#F5B7B1"}, 172 | template=self.template_plotly, 173 | hole=.6, 174 | ) 175 | 
fig_userSuccessRate.update_layout(height=self.height_plotly) 176 | fig_userSuccessRate.write_html( 177 | os.path.join(self.plotsDir, "plotly_thisuserSuccessRate.html"), 178 | include_plotlyjs='cdn' 179 | ) 180 | 181 | # Daily success rate 182 | fig_userDailySuccessRate = px.area( 183 | pd.melt(df_userDaily_here, id_vars='SubmitDate', value_vars=['failure_rate', 'success_rate']), 184 | x='SubmitDate', y="value", color='variable', 185 | color_discrete_map={'failure_rate': "#F5B7B1", 'success_rate': "#A9DFBF"}, 186 | labels=dict(SubmitDate='', value='% of failed jobs (in red)', variable=""), 187 | # title="", 188 | template=self.template_plotly 189 | ) 190 | fig_userDailySuccessRate.update_layout(height=self.height_plotly, showlegend=False) 191 | fig_userDailySuccessRate.write_html( 192 | os.path.join(self.plotsDir, "plotly_thisuserDailySuccessRate.html"), 193 | include_plotlyjs='cdn' 194 | ) 195 | 196 | # Memory efficiency 197 | fig_userMemoryEfficiency = px.histogram( 198 | np.reciprocal(self.dict_stats['memoryOverallocationFactors'][self.user_here]) * 100, 199 | labels=dict(value="Memory efficiency (%)"), 200 | template=self.template_plotly, 201 | color_discrete_sequence=[self.custom_colours['area']] 202 | ) 203 | fig_userMemoryEfficiency.update_layout( 204 | bargap=0.2, 205 | yaxis_title="Number of jobs", 206 | showlegend=False, 207 | height=self.height_plotly 208 | ) 209 | fig_userMemoryEfficiency.write_html( 210 | os.path.join(self.plotsDir, "plotly_thisuserMemoryEfficiency.html"), 211 | include_plotlyjs='cdn' 212 | ) 213 | 214 | def generate(self): 215 | 216 | self.context['include_user_context'] = True 217 | 218 | self._user_context() 219 | 220 | environment = Environment( 221 | loader=FileSystemLoader(['frontend/templates/', self.plotsDir]), 222 | # autoescape=select_autoescape(), 223 | # undefined=SilentUndefined # StrictUndefined is mostly for testing, SilenUndefined to ignore missing ones 224 | ) 225 | 226 | j2_template = environment.get_template('report_blank.html') 227 | j2_rendered = j2_template.render(self.context) 228 | 229 | ## Export 230 | # print(os.getcwd()) 231 | report_path = os.path.join(self.outputDir, f"report_{self.user_here}.html") 232 | with open(report_path, 'w') as file: 233 | file.write(j2_rendered) 234 | # Also copy across the styles.css 235 | shutil.copy("frontend/templates/styles.css", self.outputDir) 236 | 237 | return report_path 238 | 239 | # FIXME the pdf export doesn't really work...sticking with html for now 240 | # Follows guidelines from https://doc.courtbouillon.org/weasyprint/stable/first_steps.html#command-line 241 | # from weasyprint import HTML, CSS 242 | # 243 | # css = CSS(string=''' @page {size: 53.34cm 167.86 cm;} ''') 244 | # HTML("outputs/report_rendered.html").write_pdf("outputs/report_rendered.pdf", stylesheets=[css]) -------------------------------------------------------------------------------- /backend/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import yaml 4 | import pandas as pd 5 | import numpy as np 6 | 7 | from backend.helpers import check_empty_results, simulate_mock_jobs 8 | from backend.slurm_extract import WorkloadManager 9 | 10 | # print("Working dir1: ", os.getcwd()) # DEBUGONLY 11 | 12 | class GA_tools(): 13 | 14 | def __init__(self, cluster_info, fParams): 15 | self.cluster_info = cluster_info 16 | self.fParams = fParams 17 | 18 | def calculate_energies(self, row): 19 | ''' 20 | Calculate the energy usaged based on the job's paramaters 21 | :param 
row: [pd.Series] one row of usage statistics, corresponding to one job 22 | :return: [pd.Series] the same statistics with the energies added 23 | ''' 24 | ### CPU and GPU 25 | partition_info = self.cluster_info['partitions'][row.PartitionX] 26 | if row.PartitionTypeX == 'CPU': 27 | TDP2use4CPU = partition_info['TDP'] 28 | TDP2use4GPU = 0 29 | else: 30 | TDP2use4CPU = partition_info['TDP_CPU'] 31 | TDP2use4GPU = partition_info['TDP'] 32 | 33 | row['energy_CPUs'] = row.TotalCPUtime2useX.total_seconds() / 3600 * TDP2use4CPU / 1000 # in kWh 34 | 35 | row['energy_GPUs'] = row.TotalGPUtime2useX.total_seconds() / 3600 * TDP2use4GPU / 1000 # in kWh 36 | 37 | ### memory 38 | for suffix, memory2use in zip(['','_memoryNeededOnly'], [row.ReqMemX,row.NeededMemX]): 39 | row[f'energy_memory{suffix}'] = row.WallclockTimeX.total_seconds()/3600 * memory2use * self.fParams['power_memory_perGB'] /1000 # in kWh 40 | row[f'energy{suffix}'] = (row.energy_CPUs + row.energy_GPUs + row[f'energy_memory{suffix}']) * self.cluster_info['PUE'] # in kWh 41 | 42 | return row 43 | 44 | def calculate_carbonFootprint(self, df, col_energy): 45 | return df[col_energy] * self.cluster_info['CI'] 46 | 47 | 48 | def extract_data(args, cluster_info): 49 | 50 | if args.use_mock_agg_data: # DEBUGONLY 51 | 52 | if args.reportBug | args.reportBugHere: 53 | print("\n(!) --reportBug and --reportBugHere are ignored when --useCustomLogs is present\n") 54 | 55 | # df2 = simulate_mock_jobs() 56 | # df2.to_pickle("testData/df_agg_X_mockMultiUsers_1.pkl") 57 | 58 | # foo = 'testData/df_agg_test_3.pkl' 59 | foo = 'testData/df_agg_X_1.pkl' 60 | print(f"Overriding df_agg with `{foo}`") 61 | return pd.read_pickle(foo) 62 | 63 | 64 | ### Pull usage statistics from the workload manager 65 | WM = WorkloadManager(args, cluster_info) 66 | WM.pull_logs() 67 | 68 | ### Log the output for debugging 69 | if args.reportBug | args.reportBugHere: 70 | if args.reportBug: 71 | # Create an error_logs subfolder in the output dir 72 | errorLogsDir = os.path.join(args.outputDir2use['path'], 'error_logs') 73 | os.makedirs(errorLogsDir) 74 | log_path = os.path.join(errorLogsDir, f'sacctOutput.csv') 75 | else: 76 | # i.e. 
args.reportBugHere is True 77 | log_path = f"{args.userCWD}/sacctOutput_{args.outputDir2use['timestamp']}.csv" 78 | 79 | with open(log_path, 'wb') as f: 80 | f.write(WM.logs_raw) 81 | print(f"\nSLURM statistics logged for debuging: {log_path}\n") 82 | 83 | ### Turn usage logs into DataFrame 84 | WM.convert2dataframe() 85 | check_empty_results(WM.logs_df, args) 86 | 87 | # And clean 88 | WM.clean_logs_df() 89 | # Check if there are any jobs during the period from this directory and with these jobIDs 90 | check_empty_results(WM.df_agg, args) 91 | 92 | # Check that there is only one user's data 93 | if len(set(WM.df_agg_X.UserX)) > 1: 94 | raise ValueError(f"More than one user's logs was included: {set(WM.df_agg_X.UserX)}") 95 | 96 | # WM.df_agg_X.to_pickle("testData/df_agg_X_1.pkl") # DEBUGONLY used to test different steps offline 97 | 98 | return WM.df_agg_X 99 | 100 | def enrich_data(df, fParams, GA): 101 | 102 | ### energy 103 | df = df.apply(GA.calculate_energies, axis=1) 104 | 105 | df['energy_failedJobs'] = np.where(df.StateX == 0, df.energy, 0) 106 | 107 | ### carbon footprint 108 | for suffix in ['', '_memoryNeededOnly', '_failedJobs']: 109 | df[f'carbonFootprint{suffix}'] = GA.calculate_carbonFootprint(df, f'energy{suffix}') 110 | # Context metrics (part 1) 111 | df[f'treeMonths{suffix}'] = df[f'carbonFootprint{suffix}'] / fParams['tree_month'] 112 | df[f'cost{suffix}'] = df[f'energy{suffix}'] * fParams['electricity_cost'] # TODO use realtime electricity costs 113 | 114 | ### Context metrics (part 2) 115 | df['driving'] = df.carbonFootprint / fParams['passengerCar_EU_perkm'] 116 | df['flying_NY_SF'] = df.carbonFootprint / fParams['flight_NY_SF'] 117 | df['flying_PAR_LON'] = df.carbonFootprint / fParams['flight_PAR_LON'] 118 | df['flying_NYC_MEL'] = df.carbonFootprint / fParams['flight_NYC_MEL'] 119 | 120 | return df 121 | 122 | def summarise_data(df, args): 123 | agg_functions_from_raw = { 124 | 'n_jobs': ('UserX', 'count'), 125 | 'first_job_period': ('SubmitDatetimeX', 'min'), 126 | 'last_job_period': ('SubmitDatetimeX', 'max'), 127 | 'energy': ('energy', 'sum'), 128 | 'energy_CPUs': ('energy_CPUs', 'sum'), 129 | 'energy_GPUs': ('energy_GPUs', 'sum'), 130 | 'energy_memory': ('energy_memory', 'sum'), 131 | 'carbonFootprint': ('carbonFootprint', 'sum'), 132 | 'carbonFootprint_memoryNeededOnly': ('carbonFootprint_memoryNeededOnly', 'sum'), 133 | 'carbonFootprint_failedJobs': ('carbonFootprint_failedJobs', 'sum'), 134 | 'cpuTime': ('TotalCPUtime2useX', 'sum'), 135 | 'gpuTime': ('TotalGPUtime2useX', 'sum'), 136 | 'wallclockTime': ('WallclockTimeX', 'sum'), 137 | 'CPUhoursCharged': ('CPUhoursChargedX', 'sum'), 138 | 'GPUhoursCharged': ('GPUhoursChargedX', 'sum'), 139 | 'memoryRequested': ('ReqMemX', 'sum'), 140 | 'memoryOverallocationFactor': ('memOverallocationFactorX', 'mean'), 141 | 'n_success': ('StateX', 'sum'), 142 | 'treeMonths': ('treeMonths', 'sum'), 143 | 'treeMonths_memoryNeededOnly': ('treeMonths_memoryNeededOnly', 'sum'), 144 | 'treeMonths_failedJobs': ('treeMonths_failedJobs', 'sum'), 145 | 'driving': ('driving', 'sum'), 146 | 'flying_NY_SF': ('flying_NY_SF', 'sum'), 147 | 'flying_PAR_LON': ('flying_PAR_LON', 'sum'), 148 | 'flying_NYC_MEL': ('flying_NYC_MEL', 'sum'), 149 | 'cost': ('cost', 'sum'), 150 | 'cost_failedJobs': ('cost_failedJobs', 'sum'), 151 | 'cost_memoryNeededOnly': ('cost_memoryNeededOnly', 'sum'), 152 | } 153 | 154 | # This is to aggregate already aggregated dataset (so names are a bit different) 155 | agg_functions_further = agg_functions_from_raw.copy() 
156 | agg_functions_further['n_jobs'] = ('n_jobs', 'sum') 157 | agg_functions_further['first_job_period'] = ('first_job_period', 'min') 158 | agg_functions_further['last_job_period'] = ('last_job_period', 'max') 159 | agg_functions_further['cpuTime'] = ('cpuTime', 'sum') 160 | agg_functions_further['gpuTime'] = ('gpuTime', 'sum') 161 | agg_functions_further['wallclockTime'] = ('wallclockTime', 'sum') 162 | agg_functions_further['CPUhoursCharged'] = ('CPUhoursCharged', 'sum') 163 | agg_functions_further['GPUhoursCharged'] = ('GPUhoursCharged', 'sum') 164 | agg_functions_further['memoryRequested'] = ('memoryRequested', 'sum') 165 | agg_functions_further['memoryOverallocationFactor'] = ('memoryOverallocationFactor', 'mean') # NB: not strictly correct to do a mean of mean, but ok 166 | agg_functions_further['n_success'] = ('n_success', 'sum') 167 | 168 | def agg_jobs(data, agg_names=None): 169 | """ 170 | 171 | :param data: 172 | :param agg_names: if None, then the whole dataset is aggregated 173 | :return: 174 | """ 175 | agg_names2 = agg_names if agg_names else lambda _:True 176 | if 'UserX' in data.columns: 177 | timeseries = data.groupby(agg_names2).agg(**agg_functions_from_raw) 178 | else: 179 | timeseries = data.groupby(agg_names2).agg(**agg_functions_further) 180 | 181 | timeseries.reset_index(inplace=True, drop=(agg_names is None)) 182 | timeseries['success_rate'] = timeseries.n_success / timeseries.n_jobs 183 | timeseries['failure_rate'] = 1 - timeseries.success_rate 184 | timeseries['share_carbonFootprint'] = timeseries.carbonFootprint / timeseries.carbonFootprint.sum() 185 | 186 | return timeseries 187 | 188 | df['SubmitDate'] = df.SubmitDatetimeX.dt.date # TODO do it with real start time rather than submit day 189 | 190 | df_userdaily = agg_jobs(df, ['SubmitDate']) 191 | df_overallStats = agg_jobs(df_userdaily) 192 | dict_overallStats = df_overallStats.iloc[0, :].to_dict() 193 | userID = df.UserX[0] 194 | 195 | output = { 196 | "userDaily": df_userdaily, 197 | 'userActivity': {userID: dict_overallStats}, 198 | "user": userID 199 | } 200 | 201 | # Some job-level statistics to plot distributions 202 | memoryOverallocationFactors = df.groupby('UserX')['memOverallocationFactorX'].apply(list).to_dict() 203 | memoryOverallocationFactors['overall'] = df.memOverallocationFactorX.to_numpy() 204 | output['memoryOverallocationFactors'] = memoryOverallocationFactors 205 | 206 | return output 207 | 208 | 209 | def main_backend(args): 210 | ''' 211 | 212 | :param args: 213 | :return: 214 | ''' 215 | ### Load cluster specific info 216 | with open(os.path.join(args.path_infrastucture_info, 'cluster_info.yaml'), "r") as stream: 217 | try: 218 | cluster_info = yaml.safe_load(stream) 219 | except yaml.YAMLError as exc: 220 | print(exc) 221 | 222 | ### Load fixed parameters 223 | with open("data/fixed_parameters.yaml", "r") as stream: 224 | try: 225 | fParams = yaml.safe_load(stream) 226 | except yaml.YAMLError as exc: 227 | print(exc) 228 | 229 | GA = GA_tools(cluster_info, fParams) 230 | 231 | df = extract_data(args, cluster_info=cluster_info) 232 | df2 = enrich_data(df, fParams=fParams, GA=GA) 233 | summary_stats = summarise_data(df2, args=args) 234 | 235 | return summary_stats 236 | 237 | if __name__ == "__main__": 238 | 239 | #### This is used for testing only #### 240 | 241 | from collections import namedtuple 242 | argStruct = namedtuple('argStruct', 243 | 'startDay endDay use_mock_agg_data useCustomLogs customSuccessStates filterWD filterJobIDs filterAccount reportBug reportBugHere 
path_infrastucture_info') 244 | args = argStruct( 245 | startDay='2022-01-01', 246 | endDay='2023-06-30', 247 | useCustomLogs=None, 248 | use_mock_agg_data=True, 249 | customSuccessStates='', 250 | filterWD=None, 251 | filterJobIDs='all', 252 | filterAccount=None, 253 | reportBug=False, 254 | reportBugHere=False, 255 | path_infrastucture_info="clustersData/CSD3", 256 | ) 257 | 258 | main_backend(args) 259 | 260 | 261 | 262 | -------------------------------------------------------------------------------- /backend/slurm_extract.py: -------------------------------------------------------------------------------- 1 | 2 | import subprocess 3 | 4 | import pandas as pd 5 | from io import BytesIO 6 | import datetime 7 | import os 8 | import numpy as np 9 | 10 | 11 | class Helpers_WM(): 12 | 13 | def __init__(self, cluster_info): 14 | self.cluster_info = cluster_info 15 | 16 | def convert_to_GB(self, memory, unit): 17 | """ 18 | Converts data quantity into GB. 19 | :param memory: [float] quantity to convert 20 | :param unit: [str] unit of `memory`, has to be one of ['M', 'G', 'K'] 21 | :return: [float] memory in GB. 22 | """ 23 | assert unit in ['M', 'G', 'K'] 24 | if unit == 'M': 25 | memory /= 1e3 26 | elif unit == 'K': 27 | memory /= 1e6 28 | return memory 29 | 30 | def calc_ReqMem(self, x): 31 | """ 32 | Calculates the total memory required when submitting the job. 33 | :param x: [pd.Series] one row of sacct output. 34 | :return: [float] total required memory, in GB. 35 | """ 36 | mem_raw, n_nodes, n_cores = x['ReqMem'], x['NNodes'], x['NCPUS'] 37 | 38 | if pd.isnull(mem_raw): 39 | unit = 'G' 40 | memory = 0 41 | elif mem_raw[-1] == 'n': 42 | unit = mem_raw[-2] 43 | memory = float(mem_raw[:-2]) * n_nodes 44 | elif mem_raw[-1] == 'c': 45 | unit = mem_raw[-2] 46 | memory = float(mem_raw[:-2]) * n_cores 47 | elif mem_raw[-1] in ['M', 'G', 'K']: 48 | unit = mem_raw[-1] 49 | memory = float(mem_raw[:-1]) 50 | else: 51 | raise ValueError(f"Can't parse memory value: {mem_raw}. Please raise issue on GitHub.") 52 | 53 | return self.convert_to_GB(memory, unit) 54 | 55 | def clean_RSS(self, x): 56 | """ 57 | Cleans the RSS value in sacct output. 58 | :param x: [NaN or str] the RSS value, either NaN or of the form '2745K' 59 | (optionally, just a number, we then use default_unit_RSS from cluster_info.yaml as unit). 60 | :return: [float] RSS value, in GB. 61 | """ 62 | if pd.isnull(x.MaxRSS): 63 | # NB if no info on MaxRSS, we assume all memory was used 64 | memory = -1 65 | elif x.MaxRSS == '0': 66 | memory = 0 67 | else: 68 | assert type(x.MaxRSS) == str 69 | # Special case for the situation where MaxRSS is of the form '154264' without a unit. 70 | if x.MaxRSS[-1].isalpha(): 71 | memory = self.convert_to_GB(float(x.MaxRSS[:-1]), x.MaxRSS[-1]) 72 | else: 73 | assert 'default_unit_RSS' in self.cluster_info, "Some values of MaxRSS don't have a unit. Please specify a default_unit_RSS in cluster_info.yaml" 74 | memory = self.convert_to_GB(float(x.MaxRSS), self.cluster_info['default_unit_RSS']) 75 | 76 | return memory 77 | 78 | def cleam_UsedMem(self, x): 79 | """ 80 | Cleans the UsedMemory column 81 | :param x: 82 | :return: [float] 83 | """ 84 | # NB when MaxRSS didn't store any values, we assume that "memory used = memory requested" 85 | return x.ReqMemX if x.UsedMem_ == -1 else x.UsedMem_ 86 | 87 | def clean_partition(self, x): 88 | """ 89 | Cleans the partition field, by replacing NaNs with empty string and selecting just one partition per job. 
90 | :param x: [str] partition or comma-seperated list of partitions 91 | :return: [str] one partition or empty string 92 | """ 93 | if pd.isnull(x.Partition): 94 | return '' 95 | 96 | L_partitions = x.Partition.split(',') 97 | if (x.WallclockTimeX.total_seconds() > 0) & (len(L_partitions) > 1): 98 | # Multiple partitions logged is only an issue for jobs that never started, 99 | # for the others, only the used partition is logged 100 | print(f"\n-!- WARNING: Multiple partitions logged on a job than ran: {x.JobID} - {x.Partition} (using the first one)\n") 101 | return L_partitions[0] 102 | 103 | def set_partitionType(self, x): 104 | assert x in self.cluster_info['partitions'], f"\n-!- Unknown partition: {x} -!-\n" 105 | return self.cluster_info['partitions'][x]['type'] 106 | 107 | def parse_timedelta(self, x): 108 | """ 109 | Parse a string representing a duration into a `datetime.timedelta` object. 110 | :param x: [str] Duration, as '[DD-HH:MM:]SS[.MS]' 111 | :return: [datetime.timedelta] Timedelta object 112 | """ 113 | # Parse number of days 114 | day_split = x.split('-') 115 | if len(day_split) == 2: 116 | n_days = int(day_split[0]) 117 | HHMMSSms = day_split[1] 118 | else: 119 | n_days = 0 120 | HHMMSSms = x 121 | 122 | # Parse ms 123 | ms_split = HHMMSSms.split('.') 124 | if len(ms_split) == 2: 125 | n_ms = int(ms_split[1]) 126 | HHMMSS = ms_split[0] 127 | else: 128 | n_ms = 0 129 | HHMMSS = HHMMSSms 130 | 131 | # Parse HH,MM,SS 132 | last_split = HHMMSS.split(':') 133 | if len(last_split) == 3: 134 | to_add = [] 135 | elif len(last_split) == 2: 136 | to_add = ['00'] 137 | elif len(last_split) == 1: 138 | to_add = ['00', '00'] 139 | else: 140 | raise ValueError(f"Can't parse {x}") 141 | n_h, n_m, n_s = list(map(int, to_add + last_split)) 142 | 143 | return datetime.timedelta( 144 | days=n_days, hours=n_h, minutes=n_m, seconds=n_s, milliseconds=n_ms 145 | ) 146 | 147 | def calc_realMemNeeded(self, x, granularity_memory_request): 148 | """ 149 | Calculate the minimum memory needed. 150 | This is calculated as the smallest multiple of `granularity_memory_request` that is greater than maxRSS. 151 | :param x: [pd.Series] one row of sacct output. 152 | :param granularity_memory_request: [float or int] level of granularity available when requesting memory on this cluster 153 | :return: [float] minimum memory needed, in GB. 154 | """ 155 | foo = (int(x.UsedMem2_ / granularity_memory_request) + 1) * granularity_memory_request 156 | return foo if x.ReqMemX < x.UsedMem2_ else min(x.ReqMemX, foo) 157 | 158 | def calc_memory_overallocation(self, x): 159 | # This is in case ReqMem is wrong or too low 160 | return 1. if x.ReqMemX < x.NeededMemX else x.ReqMemX / x.NeededMemX 161 | 162 | def calc_CPUusage2use(self, x): 163 | if x.TotalCPUtime_.total_seconds() == 0: 164 | # This is when the workload manager actually didn't store real usage 165 | # NB: when TotalCPU=0, we assume usage factor = 100% for all CPU cores 166 | return x.CPUwallclocktime_ 167 | 168 | assert x.TotalCPUtime_ <= x.CPUwallclocktime_ 169 | return x.TotalCPUtime_ 170 | 171 | def calc_GPUusage2use(self, x): 172 | if x.PartitionTypeX != 'GPU': 173 | return datetime.timedelta(0) 174 | if x.WallclockTimeX.total_seconds() > 0: 175 | assert x.NGPUS_ != 0 176 | return x.WallclockTimeX * x.NGPUS_ # NB assuming usage factor of 100% for GPUs 177 | 178 | def calc_coreHoursCharged(self, x): 179 | ''' 180 | Split CPU and GPU core hours charged, depending on the partition. 
181 | :param x: 182 | :return: [(float, float)] 183 | ''' 184 | if x.PartitionTypeX == 'CPU': 185 | return x.CPUwallclocktime_ / np.timedelta64(1, 'h'), 0. 186 | else: 187 | return 0., x.WallclockTimeX * x.NGPUS_ / np.timedelta64(1, 'h') 188 | 189 | def clean_State(self, x, customSuccessStates_list): 190 | """ 191 | Standardise the job's state, coding with {-1,0,1} 192 | :param x: [str] "State" field from sacct output 193 | :return: [int] in [-1,0,1] 194 | """ 195 | # Codes are found here: https://slurm.schedmd.com/squeue.html#SECTION_JOB-STATE-CODES 196 | # self.args.customSuccessStates = 'TO,TIMEOUT' 197 | success_codes = ['CD', 'COMPLETED'] 198 | running_codes = ['PD', 'PENDING', 'R', 'RUNNING', 'RQ', 'REQUEUED'] 199 | if x in success_codes: 200 | codeState = 1 201 | elif x in customSuccessStates_list: 202 | # we allocate a lower value here so that when aggregating by jobID, the whole job keeps the flag 203 | # Otherwise a "cancelled" job could take over with StateX=0 for example 204 | codeState = -1 205 | else: 206 | codeState = 0 207 | 208 | if x in running_codes: 209 | # running jobs are the lowest to be removed all the time 210 | # (if one of the subprocess is still running, the job gets ignored regardless of --customSuccessStates 211 | codeState = -2 212 | 213 | return codeState 214 | 215 | def get_parent_jobID(self, x): 216 | """ 217 | Get the parent job ID in case of array jobs 218 | :param x: [str] JobID of the form 123456789_0 (with or without '_0') 219 | :return: [str] Parent ID 123456789 220 | """ 221 | foo = x.split('_') 222 | assert len(foo) <= 2, f"Can't parse the job ID: {x}" 223 | return foo[0] 224 | 225 | 226 | class WorkloadManager(Helpers_WM): 227 | 228 | def __init__(self, args, cluster_info): 229 | """ 230 | Methods related to the Workload manager 231 | :param args: [Namespace] input from the user 232 | :param cluster_info: [dict] information about this specific cluster. 233 | """ 234 | super().__init__(cluster_info=cluster_info) 235 | self.args = args 236 | 237 | self.logs_df = None 238 | self.df_agg_0 = None 239 | self.df_agg = None 240 | self.df_agg_X = None 241 | 242 | def pull_logs(self): 243 | """ 244 | Run the command line to pull usage from the workload manager. 245 | More: https://slurm.schedmd.com/sacct.html 246 | """ 247 | if self.args.useCustomLogs == '': 248 | bash_com = [ 249 | "sacct", 250 | "--starttime", 251 | self.args.startDay, # format YYYY-MM-DD 252 | "--endtime", 253 | self.args.endDay, # format YYYY-MM-DD 254 | "--format", 255 | "UID,User,JobID,JobName,Submit,Elapsed,Partition,NNodes,NCPUS,TotalCPU,CPUTime,ReqMem,MaxRSS,WorkDir,State,Account,AllocTres", 256 | "-P" 257 | ] 258 | 259 | # logs = subprocess.run(bash_com, capture_output=True) # this line is the new way, but doesn't work with python 3.6 or earlier. line below is the legacy way. 
https://stackoverflow.com/questions/4760215/running-shell-command-and-capturing-the-output 260 | logs = subprocess.run(bash_com, stdout=subprocess.PIPE) 261 | self.logs_raw = logs.stdout 262 | else: 263 | foo = "Overriding logs_raw with: " 264 | foundIt = False 265 | for sacctFileLocation in ['', 'testData', 'error_logs']: 266 | if not foundIt: 267 | try: 268 | with open(os.path.join(sacctFileLocation, self.args.useCustomLogs), 'rb') as f: 269 | self.logs_raw = f.read() 270 | foo += f"{sacctFileLocation}/{self.args.useCustomLogs}" 271 | foundIt = True 272 | except: 273 | pass 274 | if not foundIt: 275 | raise FileNotFoundError(f"Couldn't find {self.args.useCustomLogs} \n " 276 | f"It should be either be in the testData/ or error_logs/ subdirectories, or the full path should be provided by --useCustomLogs.") 277 | print(foo) 278 | 279 | def convert2dataframe(self): 280 | """ 281 | Convert raw logs output into a pandas dataframe. 282 | """ 283 | logs_df = pd.read_csv(BytesIO(self.logs_raw), sep="|", dtype='str') 284 | for x in ['NNodes', 'NCPUS']: 285 | logs_df[x] = logs_df[x].astype('int64') 286 | 287 | self.logs_df = logs_df 288 | 289 | def clean_logs_df(self): 290 | """ 291 | Clean the different fields of the usage logs. 292 | NB: the name of the columns ending with X need to be conserved, as they are used by the main script. 293 | """ 294 | # self.logs_df_raw = self.logs_df.copy() # DEBUGONLY Save a copy of uncleaned raw for debugging mainly 295 | 296 | ### Calculate real memory usage 297 | self.logs_df['ReqMemX'] = self.logs_df.apply(self.calc_ReqMem, axis=1) 298 | 299 | ### Clean MaxRSS 300 | self.logs_df['UsedMem_'] = self.logs_df.apply(self.clean_RSS, axis=1) 301 | 302 | ### Parse wallclock time 303 | self.logs_df['WallclockTimeX'] = self.logs_df['Elapsed'].apply(self.parse_timedelta) 304 | 305 | ### Parse total CPU time 306 | # This is the total CPU used time, accross all cores. 307 | # But it is not reliably logged 308 | self.logs_df['TotalCPUtime_'] = self.logs_df['TotalCPU'].apply(self.parse_timedelta) 309 | 310 | ### Parse core-wallclock time 311 | # This is the maximum time cores could use, if used at 100% (Elapsed time * CPU count) 312 | if 'CPUTime' in self.logs_df.columns: 313 | self.logs_df['CPUwallclocktime_'] = self.logs_df['CPUTime'].apply(self.parse_timedelta) 314 | else: 315 | print('Using old logs, "CPUTime" information not available.') # TODO: remove this after a while 316 | self.logs_df['CPUwallclocktime_'] = self.logs_df.WallclockTimeX * self.logs_df.NCPUS 317 | 318 | ### Number of GPUs 319 | # TODO double check that it includes multiple GPUs correctly 320 | if 'AllocTRES' in self.logs_df.columns: 321 | self.logs_df['NGPUS_'] = self.logs_df.AllocTRES.str.extract(r'((?<=gres\/gpu=)\d+)', expand=False).fillna( 322 | 0).astype('int64') 323 | else: 324 | print('Using old logs, "AllocTRES" information not available.') # TODO: remove this after a while 325 | self.logs_df['NGPUS_'] = 0 326 | 327 | ### Clean partition 328 | # Make sure it's either a partition name, or a comma-separated list of partitions 329 | self.logs_df['PartitionX'] = self.logs_df.apply(self.clean_partition, axis=1) 330 | 331 | ### Parse submit datetime 332 | self.logs_df['SubmitDatetimeX'] = self.logs_df.Submit.apply( 333 | lambda x: datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%S")) 334 | 335 | ### Number of CPUs 336 | # e.g. 
here there is no cleaning necessary, so I just standardise the column name 337 | self.logs_df['NCPUS_'] = self.logs_df.NCPUS 338 | 339 | ### Number of nodes 340 | self.logs_df['NNodes_'] = self.logs_df.NNodes 341 | 342 | ### Job name 343 | self.logs_df['JobName_'] = self.logs_df.JobName 344 | 345 | ### Working directory 346 | self.logs_df['WorkingDir_'] = self.logs_df.WorkDir 347 | 348 | ### Username and UID 349 | self.logs_df['UIDX'] = self.logs_df.UID 350 | self.logs_df['UserX'] = self.logs_df.User 351 | 352 | ### State 353 | customSuccessStates_list = self.args.customSuccessStates.split(',') 354 | self.logs_df['StateX'] = self.logs_df.State.apply(self.clean_State, 355 | customSuccessStates_list=customSuccessStates_list) 356 | 357 | ### Pull jobID 358 | self.logs_df['single_jobID'] = self.logs_df.JobID.apply(lambda x: x.split('.')[0]) 359 | 360 | ### Account 361 | if 'Account' in self.logs_df.columns: 362 | self.logs_df['Account_'] = self.logs_df.Account 363 | else: 364 | print('Using old logs, "Account" information not available.') # TODO: remove this after a while 365 | self.logs_df['Account_'] = '' 366 | 367 | ### Aggregate per jobID 368 | self.df_agg_0 = self.logs_df.groupby('single_jobID').agg({ 369 | 'TotalCPUtime_': 'max', 370 | 'CPUwallclocktime_': 'max', 371 | 'WallclockTimeX': 'max', 372 | 'ReqMemX': 'max', 373 | 'UsedMem_': 'max', 374 | 'NCPUS_': 'max', 375 | 'NGPUS_': 'max', 376 | 'NNodes_': 'max', 377 | 'PartitionX': lambda x: ''.join(x), 378 | 'JobName_': 'first', 379 | 'SubmitDatetimeX': 'min', 380 | 'WorkingDir_': 'first', 381 | 'StateX': 'min', 382 | 'Account_': 'first', 383 | 'UIDX': 'first', 384 | 'UserX': 'first', 385 | }) 386 | 387 | ### Remove jobs that are still running or currently queued 388 | self.df_agg = self.df_agg_0.loc[self.df_agg_0.StateX != -2] 389 | 390 | ### Turn StateX==-2 into 1 391 | self.df_agg.loc[self.df_agg.StateX == -1, 'StateX'] = 1 392 | 393 | ### Replace UsedMem_=-1 with memory requested (for when MaxRSS=NaN) 394 | self.df_agg['UsedMem2_'] = self.df_agg.apply(self.cleam_UsedMem, axis=1) 395 | 396 | ### Label as CPU or GPU partition 397 | self.df_agg['PartitionTypeX'] = self.df_agg.PartitionX.apply(self.set_partitionType) 398 | 399 | # Just used to clean up with old logs: 400 | if 'AllocTRES' not in self.logs_df.columns: 401 | self.df_agg.loc[self.df_agg.PartitionTypeX == 'GPU', 'NGPUS_'] = 1 # TODO remove after a while 402 | 403 | # Sanity check (no GPU logged for CPU partitions and vice versa) 404 | assert (self.df_agg.loc[self.df_agg.PartitionTypeX == 'CPU'].NGPUS_ == 0).all() 405 | foo = self.df_agg.loc[(self.df_agg.PartitionTypeX == 'GPU') & (self.df_agg.NGPUS_ == 0)] 406 | assert (foo.WallclockTimeX.dt.total_seconds() == 0).all() # Cancelled GPU jobs won't have any GPUs allocated if they didn't start 407 | 408 | ## Check that there is no missing UID/User 409 | if self.df_agg.UIDX.isnull().sum() > 0: 410 | print(f"(!) WARNING: {self.df_agg.UIDX.isnull().sum()} jobs have missing UIDs") 411 | if self.df_agg.UserX.isnull().sum() > 0: 412 | print(f"(!) 
WARNING: {self.df_agg.UserX.isnull().sum()} jobs have missing Usernames") 413 | 414 | ### add the usage time to use for calculations 415 | self.df_agg['TotalCPUtime2useX'] = self.df_agg.apply(self.calc_CPUusage2use, axis=1) 416 | self.df_agg['TotalGPUtime2useX'] = self.df_agg.apply(self.calc_GPUusage2use, axis=1) 417 | 418 | ### Calculate core-hours charged 419 | self.df_agg[['CPUhoursChargedX', 'GPUhoursChargedX']] = self.df_agg.apply(self.calc_coreHoursCharged, axis=1, result_type='expand') 420 | 421 | ### Calculate real memory need 422 | self.df_agg['NeededMemX'] = self.df_agg.apply( 423 | self.calc_realMemNeeded, 424 | granularity_memory_request=self.cluster_info['granularity_memory_request'], 425 | axis=1) 426 | 427 | ### Add memory waste information 428 | self.df_agg['memOverallocationFactorX'] = self.df_agg.apply(self.calc_memory_overallocation, axis=1) 429 | 430 | # foo = self.df_agg[['TotalCPUtime_', 'CPUwallclocktime_', 'WallclockTimeX', 'NCPUS_', 'CoreHoursChargedCPUX', 431 | # 'CoreHoursChargedGPUX', 'TotalCPUtime2useX', 'TotalGPUtime2useX']] # DEBUGONLY 432 | 433 | ### Filter on working directory 434 | if self.args.filterWD is not None: 435 | # FIXME: Doesn't work with symbolic links 436 | self.df_agg = self.df_agg.loc[self.df_agg.WorkingDir_ == self.args.filterWD] 437 | # print(f'Filtered out {len(self.df_agg)-len(self.df_agg):,} rows (filterCWD={self.args.filterWD})') # DEBUGONLY 438 | 439 | ### Filter on Job ID 440 | self.df_agg.reset_index(inplace=True) 441 | self.df_agg['parentJobID'] = self.df_agg.single_jobID.apply(self.get_parent_jobID) 442 | 443 | if self.args.filterJobIDs != 'all': 444 | list_jobs2keep = self.args.filterJobIDs.split(',') 445 | self.df_agg = self.df_agg.loc[self.df_agg.parentJobID.isin(list_jobs2keep)] 446 | 447 | ### Filter on Account 448 | if self.args.filterAccount is not None: 449 | self.df_agg = self.df_agg.loc[self.df_agg.Account_ == self.args.filterAccount] 450 | 451 | self.df_agg_X = self.df_agg[[x for x in self.df_agg.columns if x[-1] == 'X']] --------------------------------------------------------------------------------