├── requirements.txt
├── example_files
    ├── Screenshot HPC 2024-08-20.png
    ├── example_sacctOutput_raw.txt
    ├── example_sacctOutput_raw_asDF.tsv
    └── example_output_workloadManager.tsv
├── data
    ├── fixed_parameters.yaml
    └── cluster_info.yaml
├── myCarbonFootprint.sh
├── backend
    ├── helpers.py
    ├── __init__.py
    └── slurm_extract.py
├── frontend
    ├── __init__.py
    ├── helpers.py
    ├── templates
    │   ├── _user.html
    │   ├── report_blank.html
    │   └── styles.css
    ├── terminal_output.py
    └── dashboard_output.py
├── .gitignore
├── __init__.py
└── README.md


/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy==1.24
2 | pandas==2.0
3 | PyYAML==6.0
4 | jinja2==3.1
5 | plotly==5.18


--------------------------------------------------------------------------------
/example_files/Screenshot HPC 2024-08-20.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/GreenAlgorithms/GreenAlgorithms4HPC/HEAD/example_files/Screenshot HPC 2024-08-20.png


--------------------------------------------------------------------------------
/data/fixed_parameters.yaml:
--------------------------------------------------------------------------------
 1 | 
 2 | ## ~~~ DO NOT EDIT ~~~
 3 | ##
 4 | ## These are fixed values, from the Green Algorithms app
 5 | ## Hello World
 6 | 
 7 | ---
 8 | power_memory_perGB: 0.3725 # W/GB
 9 | tree_month: 917 #gCO2e
10 | passengerCar_EU_perkm: 175 #gCO2e/km
11 | passengerCar_US_perkm: 251 #gCO2e/km
12 | flight_NY_SF: 570000 #gCO2e
13 | flight_PAR_LON: 50000 #gCO2e
14 | flight_NYC_MEL: 2310000 #gCO2e
15 | electricity_cost: 0.34 # GBP/kWh (source?)


--------------------------------------------------------------------------------
/example_files/example_sacctOutput_raw.txt:
--------------------------------------------------------------------------------
1 | JobID|JobName|Submit|Elapsed|Partition|NNodes|NCPUS|TotalCPU|CPUTime|ReqMem|MaxRSS|WorkDir|State|Account|AllocTRES
2 | 556141|myJobName|2022-02-11T19:11:21|04:00:25|myPartition|1|1|00:00:00|04:00:25|6760Mc||/job/path|TIMEOUT|myAccount|billing=1,cpu=1,mem=6760M,node=1
3 | 552375|myJobName|2022-02-12T13:55:33|03:00:30|myPartition|1|32|00:00:00|4-00:16:00|250Gn||/job/path|TIMEOUT|myAccount|billing=32,cpu=32,gres/gpu=1,mem=250G,node=1
4 | 


--------------------------------------------------------------------------------
/example_files/example_sacctOutput_raw_asDF.tsv:
--------------------------------------------------------------------------------
1 | JobID	JobName	Submit	Elapsed	Partition	NNodes	NCPUS	TotalCPU	CPUTime	ReqMem	MaxRSS	WorkDir	State	Account	AllocTRES
2 | 556141	myJobName	2022-02-12T13:55:33	03:00:30	myPartition	1	32	00:00:00	4-00:16:00	250Gn		/job/path	TIMEOUT	myAccount	billing=32,cpu=32,gres/gpu=1,mem=250G,node=1
3 | 552375	myJobName	2022-02-12T14:04:01	00:39:16	myPartition	1	32	00:00:00	20:56:32	250Gn		/job/path	COMPLETED	myAccount	billing=32,cpu=32,gres/gpu=1,mem=250G,node=1
4 | 


--------------------------------------------------------------------------------
/example_files/example_output_workloadManager.tsv:
--------------------------------------------------------------------------------
1 | 	single_jobID	TotalCPUtime_	CPUwallclocktime_	WallclockTimeX	ReqMemX	UsedMem_	NCPUS_	NGPUS_	NNodes_	PartitionX	JobName_	SubmitDatetimeX	WorkingDir_	StateX	Account_	UsedMem2_	PartitionTypeX	TotalCPUtime2useX	TotalGPUtime2useX	CoreHoursChargedX	NeededMemX	memOverallocationFactorX	parentJobID
2 | 1	27879	0 days 00:00:00.508000	0 days 03:15:45	0 days 00:13:03	102.6	0.003016	15	0	1	myPartition	myName	2022-09-14 18:21:21	/job/path	0	myAccount	0.003016	CPU	0 days 00:00:00.508000	0 days 00:00:00	3.2625	6.0	17.099999999999998	2787379
3 | 2	27060	0 days 00:00:12.499000	0 days 11:12:30	0 days 00:44:50	102.6	0.347312	15	0	1	myPartition	myName	2022-09-14 18:38:58	/job/path	0	myAccount	0.347312	CPU	0 days 00:00:12.499000	0 days 00:00:00	11.208333333333334	6.0	17.099999999999998	2788060
4 | 


--------------------------------------------------------------------------------
/myCarbonFootprint.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | ## ~~~ TO BE EDITED TO BE TAILORED TO THE CLUSTER ~~~
 4 | ##
 5 | ## You only need to edit the module loading line (l.13), make sure you are loading python 3.7 or greater.
 6 | ##
 7 | 
 8 | # store the cwd in case we need to filter on it
 9 | userCWD="$(pwd)"
10 | 
11 | # Cd into the directory where the GA files are located
12 | parent_path=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
13 | cd "$parent_path"
14 | 
15 | # Test if the virtualenv GA_env already exists, and if not, creates it. Download python 3.8 or higher for better results.
16 | if [ ! -f GA_env/bin/activate ]; then
17 |   echo "Need to create virtualenv"
18 |   /usr/bin/python3.8 -m venv GA_env # this line needs updating to load python on your server
19 |   source GA_env/bin/activate
20 |   pip3 install -r requirements.txt
21 | else
22 |   echo "Virtualenv: OK"
23 |   source GA_env/bin/activate
24 | fi
25 | 
26 | # Test if the python version is at least 3.8
27 | version_major=$(python -c 'import sys; print(sys.version_info[0])')
28 | version_minor=$(python -c 'import sys; print(sys.version_info[1])')
29 | if (( $version_major < 3 )); then
30 |   echo "The command python needs to refer to python 3"
31 |   exit 1
32 | fi
33 | 
34 | if (( $version_minor < 8 )); then
35 |   echo "The command python needs to refer to python3.8 or higher."
36 |   exit 1
37 | fi
38 | echo "Python versions: OK"
39 | 
40 | 
41 | # Run the python code and pass on the arguments
42 | #userCWD="/home/ll582/ with space" # DEBUGONLY
43 | python __init__.py "$@" --userCWD "$userCWD"
44 | 


--------------------------------------------------------------------------------
/data/cluster_info.yaml:
--------------------------------------------------------------------------------
 1 | ##
 2 | ## ~~~ TO BE EDITED TO BE TAILORED TO THE CLUSTER ~~~
 3 | ## Fill in the values for your cluster: all the variables in <> need to be changed
 4 | ##
 5 | ---
 6 | institution: "<My institution>" # [str]
 7 | cluster_name: "<My cluster name>" # [str]
 8 | granularity_memory_request: <6> # [number] in GB representing the smallest memory unit users can reserve
 9 | partitions: # a list of the different partitions on the cluster
10 |   <partition name_1>: # name of the partition
11 |     type: <CPU> # [CPU or GPU]
12 |     model: "<Intel XXX>" # [str] the model of the processing core on this partition. Not actually used by the code but useful for reference for others.
13 |     TDP: <10> # [number] TDP of the processor, in W, per core
14 |   <partition name_2>: # name of the partition
15 |     type: <GPU> # [CPU or GPU]
16 |     model: "<NVIDIA XXX>" # [str] the model of the processing core on this partition. Not actually used by the code but useful for reference for others.
17 |     TDP: <250> # [number] For GPUs, the TDP is for the entire GPU
18 |     # For GPU partitions, we also need info about the CPUs available for support.
19 |     model_CPU: "<Intel XXX>" # [str] Not actually used by the code but useful for reference for others.
20 |     TDP_CPU: <10> # [number] TDP of the processor, in W, per core
21 |   # You can keep adding partitions to this
22 | PUE: <1.67> # [number > 1] Power Usage Effectiveness of the facility
23 | CI: <467> # [number] average carbon intensity of the geographic location, in gCO2e/kWh
24 | energy_cost:
25 |   cost: <0.34> # [number] in currency/kWh
26 |   currency: "<£>" # [str]
27 | #
28 | # Below are optional parameters if the html output is used.
29 | # HTML tags can be used
30 | #
31 | texts_intro:
32 |   CPU: "XX - XX W/core (see <a>here</a> for models)" # For example
33 |   GPU: "NVIDIA A100 (300 W) and NVIDIA Tesla P100 (250 W)" # For example
34 | #
35 | # Below are optional parameters to accommodate some clusters. Do not remove but can be ignored.
36 | #
37 | default_unit_RSS: 'K'
38 | 


--------------------------------------------------------------------------------
/backend/helpers.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import datetime
 3 | import sys
 4 | import random
 5 | import pandas as pd
 6 | import numpy as np
 7 | 
 8 | def check_empty_results(df, args):
 9 |     """
10 |     This is to check whether any jobs have been run on the period, and stop the script if not.
11 |     :param df: [pd.DataFrame] Usage logs
12 |     :param args:
13 |     """
14 |     if len(df) == 0:
15 |         if args.filterWD is not None:
16 |             addThat = f' from this directory ({args.filterWD})'
17 |         else:
18 |             addThat = ''
19 |         if args.filterJobIDs != 'all':
20 |             addThat += ' and with these jobIDs'
21 |         if args.filterAccount is not None:
22 |             addThat += ' charged under this account'
23 | 
24 |         print(f'''
25 | 
26 |     You haven't run any jobs on that period (from {args.startDay} to {args.endDay}){addThat}.
27 | 
28 |         ''')
29 |         sys.exit()
30 | 
31 | def simulate_mock_jobs(): # DEBUGONLY
32 |     df_list = []
33 |     n_jobs = random.randint(500,800)
34 |     foo = {
35 |         'WallclockTimeX':[datetime.timedelta(minutes=random.randint(50,700)) for _ in range(n_jobs)],
36 |         'ReqMemX':np.random.randint(4,130, size=n_jobs)*1.,
37 |         'PartitionX':['icelake']*n_jobs,
38 |         'SubmitDatetimeX':[datetime.datetime(day=1,month=5,year=2023) + datetime.timedelta(days=random.randint(1,60)) for _ in range(n_jobs)],
39 |         'StateX':np.random.choice([1,0], p=[.8,.2], size=n_jobs),
40 |         'UIDX':['11111']*n_jobs,
41 |         'UserX':['foo']*n_jobs,
42 |         'PartitionTypeX':['CPU']*n_jobs,
43 |         'TotalCPUtime2useX':[datetime.timedelta(minutes=random.randint(50,5000)) for _ in range(n_jobs)],
44 |         'TotalGPUtime2useX':[datetime.timedelta(seconds=0)]*n_jobs,
45 |     }
46 | 
47 |     foo_df = pd.DataFrame(foo)
48 |     foo_df['CPUhoursChargedX'] = foo_df.TotalCPUtime2useX / np.timedelta64(1, 'h')
49 |     foo_df['GPUhoursChargedX'] = 0.
50 |     foo_df['NeededMemX'] = foo_df.ReqMemX * np.random.random(n_jobs)
51 |     foo_df['memOverallocationFactorX'] = foo_df.ReqMemX / foo_df.NeededMemX
52 | 
53 |     df_list.append(foo_df)
54 |     return pd.concat(df_list)


--------------------------------------------------------------------------------
/frontend/__init__.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import yaml
 3 | import os
 4 | 
 5 | from frontend.terminal_output import generate_terminal_view
 6 | from frontend.dashboard_output import dashboard_html
 7 | 
 8 | def main_frontend(dict_stats, args):
 9 |     ### Load cluster specific info
10 |     with open(os.path.join(args.path_infrastucture_info, 'cluster_info.yaml'), "r") as stream:
11 |         try:
12 |             cluster_info = yaml.safe_load(stream)
13 |         except yaml.YAMLError as exc:
14 |             print(exc)
15 | 
16 |     if args.output == 'terminal':
17 |         print("Generating terminal view... ", end="")
18 |         terminal_view = generate_terminal_view(dict_stats, args, cluster_info)
19 |         print("Done\n")
20 |         print(terminal_view)
21 |     elif args.output == 'html':
22 |         print("Generating html... ", end="")
23 |         dashboard = dashboard_html(
24 |             dict_stats=dict_stats,
25 |             args=args,
26 |             cluster_info=cluster_info,
27 |         )
28 |         report_path = dashboard.generate()
29 |         print(f"done: {report_path}")
30 | 
31 |     else:
32 |         raise ValueError("Wrong output format")
33 | 
34 | 
35 | if __name__ == "__main__":
36 | 
37 |     #### This is used for testing only ####
38 | 
39 |     from collections import namedtuple
40 |     from backend import main_backend
41 | 
42 |     argStruct = namedtuple('argStruct',
43 |                            'startDay endDay use_mock_agg_data user output useCustomLogs customSuccessStates filterWD filterJobIDs filterAccount reportBug reportBugHere path_infrastucture_info')
44 |     args = argStruct(
45 |         startDay='2022-01-01',
46 |         endDay='2023-06-30',
47 |         use_mock_agg_data=True,
48 |         user='ll582',
49 |         output='html',
50 |         useCustomLogs=None,
51 |         customSuccessStates='',
52 |         filterWD=None,
53 |         filterJobIDs='all',
54 |         filterAccount=None,
55 |         reportBug=False,
56 |         reportBugHere=False,
57 |         path_infrastucture_info="clustersData/CSD3",
58 |     )
59 |     with open(os.path.join(args.path_infrastucture_info, 'cluster_info.yaml'), "r") as stream:
60 |         try:
61 |             cluster_info = yaml.safe_load(stream)
62 |         except yaml.YAMLError as exc:
63 |             print(exc)
64 | 
65 |     extracted_data = main_backend(args)
66 | 
67 |     # generate_dashboard_html(dict_stats=extracted_data, args=args, cluster_info=cluster_info, dict_deptGroupsUsers=dict_deptGroupsUsers, dict_users=dict_users)
68 | 
69 |     main_frontend(dict_stats=extracted_data,args=args)


--------------------------------------------------------------------------------
/frontend/helpers.py:
--------------------------------------------------------------------------------
 1 | def formatText_footprint(footprint_g, use_html=False):
 2 |     '''
 3 |     Format the text to display the carbon footprint
 4 |     :param footprint_g: [float] carbon footprint, in gCO2e
 5 |     :return: [str] the text to display
 6 |     '''
 7 |     if use_html:
 8 |         co2e = "CO<sub>2</sub>e"
 9 |     else:
10 |         co2e = "CO2e"
11 |     if footprint_g < 1e3:
12 |         text_footprint = f"{footprint_g:,.0f} g{co2e}"
13 |     elif footprint_g < 1e6:
14 |         text_footprint = f"{footprint_g / 1e3:,.0f} kg{co2e}"
15 |     else:
16 |         text_footprint = f"{footprint_g / 1e3:,.0f} T{co2e}"
17 |     return text_footprint
18 | 
19 | def formatText_treemonths(tm_float, splitMonthsYear=True):
20 |     '''
21 |     Format the text to display the tree months
22 |     :param tm_float: [float] tree-months
23 |     :return: [str] the text to display
24 |     '''
25 |     tm = int(tm_float)
26 |     ty = int(tm / 12)
27 |     if tm < 1:
28 |         text_trees = f"{tm_float:.3f} tree-months"
29 |     elif tm == 1:
30 |         text_trees = f"{tm_float:.1f} tree-month"
31 |     elif tm < 6:
32 |         text_trees = f"{tm_float:.1f} tree-months"
33 |     elif tm <= 24:
34 |         text_trees = f"{tm} tree-months"
35 |     elif tm < 120:
36 |         if splitMonthsYear:
37 |             text_trees = f"{ty} tree-years and {tm - ty * 12} tree-months"
38 |         else:
39 |             text_trees = f"{ty} tree-years"
40 |     else:
41 |         text_trees = f"{tm_float/12:.1f} tree-years"
42 |     return text_trees
43 | 
44 | def formatText_flying(dict_stats, output_format='single_str'):
45 |     """
46 |     Format the text to display about flying
47 |     :param dict_stats:
48 |     :param output_format:
49 |     :return: [str] or [(float,str)] text to display
50 |     """
51 |     if output_format not in ['single_str', 'dict']:
52 |         raise ValueError()
53 | 
54 |     if dict_stats['flying_NY_SF'] < 0.5:
55 |         value = round(dict_stats['flying_PAR_LON'], 2)
56 |         if output_format == 'single_str':
57 |             output_flying = f"{value:,} flights between Paris and London"
58 |         else:
59 |             output_flying = {'number': value, 'trip': 'Paris - London'}
60 |     elif dict_stats['flying_NYC_MEL'] < 0.5:
61 |         value = round(dict_stats['flying_NY_SF'], 2)
62 |         if output_format == 'single_str':
63 |             output_flying = f"{value:,} flights between New York and San Francisco"
64 |         else:
65 |             output_flying = {'number': value, 'trip': 'New York - San Francisco'}
66 |     else:
67 |         value = round(dict_stats['flying_NYC_MEL'], 2)
68 |         if output_format == 'single_str':
69 |             output_flying = f"{value:,} flights between New York and Melbourne"
70 |         else:
71 |             output_flying = {'number': value, 'trip': 'New York - Melbourne'}
72 |     return output_flying


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Project specific
  2 | .idea/
  3 | clustersData/
  4 | testData/
  5 | error_logs_archived/
  6 | support_files/
  7 | frontend/templates/plotly*
  8 | outputs/*
  9 | example_outputs/
 10 | 
 11 | # Byte-compiled / optimized / DLL files
 12 | __pycache__/
 13 | *.py[cod]
 14 | *$py.class
 15 | 
 16 | # C extensions
 17 | *.so
 18 | 
 19 | # Distribution / packaging
 20 | .Python
 21 | build/
 22 | develop-eggs/
 23 | dist/
 24 | downloads/
 25 | eggs/
 26 | .eggs/
 27 | lib/
 28 | lib64/
 29 | parts/
 30 | sdist/
 31 | var/
 32 | wheels/
 33 | share/python-wheels/
 34 | *.egg-info/
 35 | .installed.cfg
 36 | *.egg
 37 | MANIFEST
 38 | 
 39 | # PyInstaller
 40 | #  Usually these files are written by a python script from a template
 41 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 42 | *.manifest
 43 | *.spec
 44 | 
 45 | # Installer logs
 46 | pip-log.txt
 47 | pip-delete-this-directory.txt
 48 | 
 49 | # Unit test / coverage reports
 50 | htmlcov/
 51 | .tox/
 52 | .nox/
 53 | .coverage
 54 | .coverage.*
 55 | .cache
 56 | nosetests.xml
 57 | coverage.xml
 58 | *.cover
 59 | *.py,cover
 60 | .hypothesis/
 61 | .pytest_cache/
 62 | cover/
 63 | 
 64 | # Translations
 65 | *.mo
 66 | *.pot
 67 | 
 68 | # Django stuff:
 69 | *.log
 70 | local_settings.py
 71 | db.sqlite3
 72 | db.sqlite3-journal
 73 | 
 74 | # Flask stuff:
 75 | instance/
 76 | .webassets-cache
 77 | 
 78 | # Scrapy stuff:
 79 | .scrapy
 80 | 
 81 | # Sphinx documentation
 82 | docs/_build/
 83 | 
 84 | # PyBuilder
 85 | .pybuilder/
 86 | target/
 87 | 
 88 | # Jupyter Notebook
 89 | .ipynb_checkpoints
 90 | 
 91 | # IPython
 92 | profile_default/
 93 | ipython_config.py
 94 | 
 95 | # pyenv
 96 | #   For a library or package, you might want to ignore these files since the code is
 97 | #   intended to run in multiple environments; otherwise, check them in:
 98 | # .python-version
 99 | 
100 | # pipenv
101 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
102 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
103 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
104 | #   install all needed dependencies.
105 | #Pipfile.lock
106 | 
107 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
108 | __pypackages__/
109 | 
110 | # Celery stuff
111 | celerybeat-schedule
112 | celerybeat.pid
113 | 
114 | # SageMath parsed files
115 | *.sage.py
116 | 
117 | # Environments
118 | .env
119 | .venv
120 | env/
121 | venv/
122 | ENV/
123 | env.bak/
124 | venv.bak/
125 | 
126 | # Spyder project settings
127 | .spyderproject
128 | .spyproject
129 | 
130 | # Rope project settings
131 | .ropeproject
132 | 
133 | # mkdocs documentation
134 | /site
135 | 
136 | # mypy
137 | .mypy_cache/
138 | .dmypy.json
139 | dmypy.json
140 | 
141 | # Pyre type checker
142 | .pyre/
143 | 
144 | # pytype static type analyzer
145 | .pytype/
146 | 
147 | # Cython debug symbols
148 | cython_debug/


--------------------------------------------------------------------------------
/frontend/templates/_user.html:
--------------------------------------------------------------------------------
 1 | <!--TODO create similar pages for department and group granularity -->
 2 | 
 3 | <div id="user" class="section scrollspy">
 4 |      <div>
 5 |         <h3>User's personal report: {{ user.userID }}</h3>
 6 | 
 7 |         <p>
 8 |             Find out your carbon footprint from {{ startDay }} to {{ endDay }}.
 9 |         </p>
10 |     </div>
11 | 
12 |     <div id="summary_user" class="card-panel stats-summary">
13 |         <div class="row center-align">
14 |             <div class="col s3">
15 |                 <i class="fa-solid fa-microchip"></i>
16 |                 <span>CPU time</span>
17 |                 <span data-stat="cpu" >{{ usersActivity[user.userID].cpuTime }}</span>
18 |             </div>
19 |             <div class="col s3">
20 |                 <i class="fa-solid fa-smog"></i>
21 |                 <span>Carbon footprint</span>
22 |                 <span data-stat="co2e" >{{ usersActivity[user.userID].carbonFootprint }}</span>
23 |             </div>
24 |             <div class="col s3">
25 |                 <i class="fa-solid fa-plane"></i>
26 |                 <span>{{ usersActivity[user.userID].flying.trip }}</span>
27 |                 <span data-stat="flight" >{{ usersActivity[user.userID].flying.number }}</span>
28 |             </div>
29 |             <div class="col s3">
30 |                 <i class="fa-solid fa-tree"></i>
31 |                 <span>Carbon sequestration</span>
32 |                 <span data-stat="tree" >{{ usersActivity[user.userID].trees }}</span>
33 |             </div>
34 |         </div>
35 |     </div>
36 | 
37 |     <div>
38 |         <div>
39 |             {% include 'plotly_thisuserDailyCarbonFootprint.html' %}
40 |         </div>
41 |         <div>
42 |             {% include 'plotly_thisuserDailyNjobs.html' %}
43 |         </div>
44 |         <div>
45 |             {% include 'plotly_thisuserDailyCpuTime.html' %}
46 |         </div>
47 |         <div>
48 |             {% include 'plotly_thisuserDailyMemoryRequested.html' %}
49 |         </div>
50 | 
51 |         <div>
52 |             <h5>Failed jobs</h5>
53 |             <p>
54 |                 Because any resource spent on a job is wasted if the job fails, it
55 |                 is important to test scripts and pipelines on small datasets.
56 |                 The chart below shows the daily success rate of {{ usersActivity[user.userID].n_jobs }}
57 |                 jobs that completed in the period.
58 | 
59 |                 Failed jobs represent {{ usersActivity[user.userID].carbonFootprint_failedJobs }} and
60 |                 a cost of {{ usersActivity[user.userID].cost_failedJobs }}.
61 |                 They are responsible for {{ usersActivity[user.userID].carbonFootprint_failedJobs_share }} of the overall
62 |                 carbon footprint.
63 |             </p>
64 |             {% include 'plotly_thisuserSuccessRate.html' %}
65 | <!--                        TODO put text and pie chart side-by-side-->
66 |             {% include 'plotly_thisuserDailySuccessRate.html' %}
67 |         </div>
68 | 
69 |         <div>
70 |             <h5>Memory efficiency</h5>
71 | 
72 |             <p>
73 |                 Memory can be a significant source of waste, because the power draw from memory mainly depends
74 |                 on the memory available, not on the actual memory used. The chart below shows the distribution
75 |                 of the memory efficiency collected from {{ usersActivity[user.userID].n_jobs }} jobs
76 |                 between {{ startDay }} and {{ endDay }} (the closer to 100% the better).
77 |             </p>
78 | 
79 |             {% include 'plotly_thisuserMemoryEfficiency.html' %}
80 | 
81 |             <p>
82 |                 Using the memory efficiency, we can estimate how much memory was needed to run a job.
83 |                 If all jobs above had been submitted with only the memory they needed (rounded up),
84 |                 you would have emitted {{ usersActivity[user.userID].carbonFootprint_wasted_memoryOverallocation }} less
85 |                 and saved {{ usersActivity[user.userID].cost_wasted_memoryOverallocation }}.
86 |             </p>
87 |         </div>
88 | 
89 | 
90 |     </div>
91 | </div>


--------------------------------------------------------------------------------
/frontend/terminal_output.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import math
  3 | from frontend.helpers import formatText_footprint, formatText_treemonths, formatText_flying
  4 | import pandas as pd
  5 | import os
  6 | 
  7 | 
  8 | def formatText_driving(dist):
  9 |     """
 10 |     Format the text to display the driving distance
 11 |     :param dist: [float] driving distance, in km
 12 |     :return: [str] text to display
 13 |     """
 14 |     if dist < 10:
 15 |         text_driving = f"driving {dist:,.2f} km"
 16 |     else:
 17 |         text_driving = f"driving {dist:,.0f} km"
 18 |     return text_driving
 19 | 
 20 | def generate_terminal_view(dict_stats_all, args, cluster_info):
 21 | 
 22 |     user_here = dict_stats_all['user']
 23 |     dict_stats = dict_stats_all['userActivity'][user_here]
 24 |     text_nUsers = f"- user: {user_here} -"
 25 | 
 26 |     ## various variables
 27 |     clusterName = cluster_info['cluster_name']
 28 | 
 29 |     ## energy
 30 |     dcOverheads = dict_stats['energy'] - dict_stats['energy_CPUs'] - dict_stats['energy_GPUs'] - dict_stats['energy_memory']
 31 | 
 32 |     ## Carbon footprint
 33 |     text_footprint = formatText_footprint(dict_stats['carbonFootprint'])
 34 |     text_footprint_failedJobs = formatText_footprint(dict_stats['carbonFootprint_failedJobs'])
 35 |     text_footprint_wasted_memoryOverallocation = formatText_footprint(dict_stats['carbonFootprint']-dict_stats['carbonFootprint_memoryNeededOnly'])
 36 | 
 37 |     ## Context
 38 |     text_trees = formatText_treemonths(dict_stats['treeMonths'])
 39 |     text_trees_failedJobs = formatText_treemonths(dict_stats['treeMonths_failedJobs'])
 40 |     text_trees_wasted_memoryOverallocation = formatText_treemonths(dict_stats['treeMonths']-dict_stats['treeMonths_memoryNeededOnly'])
 41 |     text_driving = formatText_driving(dict_stats['driving'])
 42 |     text_flying = formatText_flying(dict_stats)
 43 | 
 44 |     ### Text filterCWD
 45 |     if args.filterWD is None:
 46 |         text_filterCWD = ''
 47 |     else:
 48 |         text_filterCWD = f"\n        (NB: The only jobs considered here are those launched from {args.filterWD})\n"
 49 | 
 50 |     ### Text filterJobIDs
 51 |     if args.filterJobIDs == 'all':
 52 |         text_filterJobIDs = ''
 53 |     else:
 54 |         text_filterJobIDs = f"\n        (NB: The only jobs considered here are those with job IDs: {args.filterJobIDs})\n"
 55 | 
 56 |     ### Text filter Account
 57 |     if args.filterAccount is None:
 58 |         text_filterAccount = ''
 59 |     else:
 60 |         text_filterAccount = f"\n        (NB: The only jobs considered here are those charged under {args.filterAccount})\n"
 61 | 
 62 |     ### To get the title length right
 63 |     title_row1 = f"Carbon footprint on {clusterName}"
 64 |     title_row2 = text_nUsers
 65 |     title_row3 = f"({args.startDay} / {args.endDay})"
 66 |     max_length = max([len(title_row1), len(title_row2), len(title_row3)])
 67 | 
 68 |     title_row1_full = f"#  {' '*math.floor((max_length-len(title_row1))/2)}{title_row1}{' '*math.ceil((max_length-len(title_row1))/2)}  #"
 69 |     title_row2_full = f"#  {' '*math.floor((max_length-len(title_row2))/2)}{title_row2}{' '*math.ceil((max_length-len(title_row2))/2)}  #"
 70 |     title_row3_full = f"#  {' '*math.floor((max_length-len(title_row3))/2)}{title_row3}{' '*math.ceil((max_length-len(title_row3))/2)}  #"
 71 | 
 72 |     title = f'''
 73 |         {'#'*(max_length+6)}
 74 |         #{' '*(max_length+4)}#
 75 |         {title_row1_full}
 76 |         {title_row2_full}
 77 |         {title_row3_full}
 78 |         #{' '*(max_length+4)}#
 79 |         {'#'*(max_length+6)}
 80 |     '''
 81 | 
 82 |     return f'''
 83 |       {title}
 84 |       
 85 |               {'-' * (len(text_footprint) + 6)}
 86 |              |   {text_footprint}   |
 87 |               {'-' * (len(text_footprint) + 6)}
 88 |               
 89 |     ...This is equivalent to:
 90 |          - {text_trees}
 91 |          - {text_driving}
 92 |          - {text_flying}
 93 |          
 94 |     ...{dict_stats['failure_rate']:.1%} of the jobs failed, these represent a waste of {text_footprint_failedJobs} ({text_trees_failedJobs}).
 95 |     ...On average, the jobs request at least {dict_stats['memoryOverallocationFactor']:,.1f} times the memory needed. By only requesting the memory needed, {text_footprint_wasted_memoryOverallocation} ({text_trees_wasted_memoryOverallocation}) could have been saved.
 96 |     {text_filterCWD}{text_filterJobIDs}{text_filterAccount}
 97 |     Energy used: {dict_stats['energy']:,.2f} kWh
 98 |          - CPUs: {dict_stats['energy_CPUs']:,.2f} kWh ({dict_stats['energy_CPUs'] / dict_stats['energy']:.2%})
 99 |          - GPUs: {dict_stats['energy_GPUs']:,.2f} kWh ({dict_stats['energy_GPUs'] / dict_stats['energy']:.2%})
100 |          - Memory: {dict_stats['energy_memory']:,.2f} kWh ({dict_stats['energy_memory'] / dict_stats['energy']:.2%})
101 |          - Data centre overheads: {dcOverheads:,.2f} kWh ({dcOverheads / dict_stats['energy']:.2%})
102 |     Carbon intensity used for the calculations: {cluster_info['CI']:,} gCO2e/kWh
103 |     
104 |     Summary of usage:
105 |     - First/last job recorded on that period: {str(dict_stats['first_job_period'].date())}/{str(dict_stats['last_job_period'].date())}
106 |     - Number of jobs: {dict_stats['n_jobs']:,} ({dict_stats['n_success']:,} completed)
107 |     - Core hours used/charged: {dict_stats['CPUhoursCharged']:,.1f} (CPU), {dict_stats['GPUhoursCharged']:,.1f} (GPU), {dict_stats['CPUhoursCharged']+dict_stats['GPUhoursCharged']:,.1f} (total).
108 |     - Total usage time (i.e. when cores were performing computations):
109 |         - CPU: {str(dict_stats['cpuTime'])} ({dict_stats['cpuTime'].total_seconds()/3600:,.0f} hours)
110 |         - GPU: {str(dict_stats['gpuTime'])} ({dict_stats['gpuTime'].total_seconds()/3600:,.0f} hours)
111 |     - Total wallclock time: {str(dict_stats['wallclockTime'])}
112 |     - Total memory requested: {dict_stats['memoryRequested']:,.0f} GB
113 |     
114 |     Limitations to keep in mind:
115 |          - The workload manager doesn't alway log the exact CPU usage time, and when this information is missing, we assume that all cores are used at 100%.
116 |          - For now, we assume that for GPU jobs, the GPUs are used at 100% (as the information needed for more accurate measurement is not available)
117 |          (this may lead to slightly overestimated carbon footprints, although the order of magnitude is likely to be correct)
118 |          - Conversely, the wasted energy due to memory overallocation may be largely underestimated, as the information needed is not always logged.
119 | 
120 |     Any bugs, questions, suggestions? Post on GitHub (GreenAlgorithms/GreenAlgorithms4HPC) or email LL582@medschl.cam.ac.uk
121 |     {'-' * 80}
122 |     Calculated using the Green Algorithms framework: www.green-algorithms.org
123 |     Please cite https://onlinelibrary.wiley.com/doi/10.1002/advs.202100707 
124 |     '''
125 | 
126 | 


--------------------------------------------------------------------------------
/frontend/templates/report_blank.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html lang="en">
  3 | <head>
  4 |     <meta charset="UTF-8">
  5 |     <title>Green Algorithms dashboard</title>
  6 |     <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@fortawesome/fontawesome-free@6.3.0/css/all.min.css">
  7 |     <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/@materializecss/materialize@1.1.0/dist/css/materialize.min.css">
  8 |     <link rel="stylesheet" href="styles.css">
  9 | <!--    <link rel="icon" type="image/x-icon" href="https://www.ebi.ac.uk/favicon.ico">-->
 10 | </head>
 11 | <body>
 12 |     <section class="container">
 13 |         <header>
 14 |             <!-- TODO make the page responsive -->
 15 |             <nav id="top-nav">
 16 |                 <i class="fa-solid fa-seedling"></i>
 17 |                 <div id="title">
 18 |                     <h1>Green Algorithms dashboard</h1>
 19 |                     <h2>{{ institution }}</h2>
 20 |                 </div>
 21 |             </nav>
 22 |             <p>
 23 |                 Last updated: {{ last_updated }}
 24 |             </p>
 25 |         </header>
 26 | 
 27 |         <div class="row">
 28 |             <div class="col l2">
 29 |                 <div id="toc-wrapper">
 30 |                     <ul class="section table-of-contents">
 31 |                         <li><a href="#intro">Introduction</a></li>
 32 |                         {% if include_user_context %}
 33 |                             <li><a href="#user">User's data ({{ user.userID }})</a></li>
 34 |                         {% endif %}
 35 |                         <li><a href="#credits">Credits</a></li>
 36 |                         <li><a href="#contact">Contact</a></li>
 37 |                         <li><a href="#faq">FAQ</a></li>
 38 |                     </ul>
 39 |                 </div>
 40 |             </div>
 41 | 
 42 | <!--        starting column on the right -->
 43 |             <div class="col l10">
 44 |                 <div id="intro" class="section scrollspy">
 45 |                     <div class="card-panel warning">
 46 |                         <p>
 47 |                             This is an early version, please report any bug you find!
 48 |                         </p>
 49 |                     </div>
 50 | 
 51 |                     <p>
 52 |                         Computing is a major contributor to energy consumption, and thus is one of the main sources of
 53 |                         the carbon emission of our research.
 54 |                         In the context of the global climate crisis, it is imperative that individuals and organizations
 55 |                         find ways to assess then reduce the carbon footprint of their work.
 56 |                     </p>
 57 | 
 58 |                     <p>
 59 |                         This page aims to represent the carbon footprint that we are, collectively and individually,
 60 |                         responsible for at {{ institution }}.
 61 |                         SLURM jobs submitted to the {{ cluster_name }} High Performance Cluster are logged automatically
 62 |                         (including information such as resource requested, run time, memory efficiency, etc.),
 63 |                         and the corresponding carbon footprint  was calculated using the framework proposed
 64 |                         by <a href="https://green-algorithms.org/">Green Algorithms</a> and the following assumptions:
 65 |                     </p>
 66 | 
 67 |                     <table>
 68 |                         <tbody>
 69 |                             <tr>
 70 |                                 <th>CPU</th>
 71 |                                 <td>{{ texts_intro.CPU }}</td>
 72 |                             </tr>
 73 |                             <tr>
 74 |                                 <th>GPU</th>
 75 |                                 <td>{{ texts_intro.GPU }}</td>
 76 |                             </tr>
 77 |                             <tr>
 78 |                                 <th>Memory power</th>
 79 |                                 <td>0.3725 W/GB</td>
 80 |                             </tr>
 81 |                             <tr>
 82 |                                 <th>Power usage effectiveness</th>
 83 |                                 <td>{{ PUE }}</td>
 84 |                             </tr>
 85 |                             <tr>
 86 |                                 <th>Carbon intensity</th>
 87 |                                 <td>{{ CI }} gCO<sub>2</sub>e/kWh</td>
 88 |                             </tr>
 89 |                             <tr>
 90 |                                 <th>Energy cost</th>
 91 |                                 <td>{{ energy_cost_perkWh.currency }}{{ energy_cost_perkWh.cost }}/kWh</td>
 92 |                             </tr>
 93 |                         </tbody>
 94 |                     </table>
 95 | 
 96 |                     <div class="card-panel info">
 97 |                         <p>
 98 |                             We built this tool in the hope to raise awareness of computing usage,
 99 |                             highlight resources waste, and foster good computing practices.
100 |                             This is intended to be a lightweight carbon footprint calculator, not a cluster monitoring system.
101 |                         </p>
102 |                     </div>
103 |                 </div>
104 | 
105 |                 {% if include_user_context %}
106 |                     {% include "_user.html" %}
107 |                 {% endif %}
108 | 
109 |                 <div id="credits" class="section scrollspy">
110 |                     <h4>Credits</h4>
111 |                     <p>
112 |                         This dashboard is the combination of a template developed at EMBL-EBI by Matthias Blum and Alex Bateman,
113 |                         and the Green Algorithms project led by Loïc Lannelongue and Michael Inouye.
114 |                         The carbon footprint calculations are described on <a href="https://www.green-algorithms.org/">the Green Algorithms project's website</a>.
115 |                     </p>
116 |                 </div>
117 | 
118 |                 <div id="contact" class="section scrollspy">
119 |                     <h4>Contact</h4>
120 |                     <p>
121 |                         If you want to report a bug or a user assigned to the wrong team,
122 |                         request a feature, or just give some general feedback, you can email LL582@medschl.cam.ac.uk.
123 |                     </p>
124 |                 </div>
125 | 
126 |                 <div id="faq" class="section scrollspy">
127 |                     <h4>FAQ</h4>
128 |                     <p>
129 |                         <span class="question">How is the information on SLRUM jobs collected?</span>
130 |                         Logs are pulled using the `sacct` command. It's all powered by the GA4HPC methods,
131 |                         you can check it out <a href="https://www.green-algorithms.org/GA4HPC/">there</a>.
132 |                     </p>
133 | 
134 |                     <p>
135 |                         <span class="question">Where can I ask more questions?</span>
136 |                         On the GitHub <a href="https://github.com/GreenAlgorithms/GreenAlgorithms4HPC/issues">here</a> or by email (see above)..
137 |                     </p>
138 | 
139 | 
140 |                 </div>
141 | 
142 |             </div>
143 | 
144 |         </div>
145 | 
146 |     </section>
147 | </body>
148 | </html>


--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import argparse
  3 | import datetime
  4 | import os
  5 | 
  6 | from backend import main_backend
  7 | from frontend import main_frontend
  8 | 
  9 | def create_arguments():
 10 |     """
 11 |     Command line arguments for the tool.
 12 |     :return: argparse object
 13 |     """
 14 |     parser = argparse.ArgumentParser(description=f'Calculate your carbon footprint on the server.')
 15 | 
 16 |     default_endDay = datetime.date.today().strftime("%Y-%m-%d")  # today
 17 |     default_startDay = f"{datetime.date.today().year}-01-01"  # start of the year
 18 | 
 19 |     ## Timeframe
 20 |     parser.add_argument('-S', '--startDay', type=str,
 21 |                         help=f'The first day to take into account, as YYYY-MM-DD (default: {default_startDay})',
 22 |                         default=default_startDay)
 23 |     parser.add_argument('-E', '--endDay', type=str,
 24 |                         help='The last day to take into account, as YYYY-MM-DD (default: today)',
 25 |                         default=default_endDay)
 26 | 
 27 |     ## How to display the report
 28 |     parser.add_argument('-o', '--output', type=str,
 29 |                         help="How to display the results, one of 'terminal' or 'html' (default: terminal)",
 30 |                         default='terminal')
 31 |     parser.add_argument('--outputDir', type=str,
 32 |                         help="Export path for the output (default: under `output/`). Only used with `--output html`.",
 33 |                         default='outputs')
 34 | 
 35 |     ## Filter out jobs
 36 |     parser.add_argument('--filterCWD', action='store_true',
 37 |                         help='Only report on jobs launched from the current location.')
 38 |     parser.add_argument('--userCWD', type=str, help=argparse.SUPPRESS)
 39 |     parser.add_argument('--filterJobIDs', type=str,
 40 |                         help='Comma separated list of Job IDs you want to filter on. (default: "all")',
 41 |                         default='all')
 42 |     parser.add_argument('--filterAccount', type=str,
 43 |                         help='Only consider jobs charged under this account')
 44 |     parser.add_argument('--customSuccessStates', type=str, default='',
 45 |                         help="Comma-separated list of job states. By default, only jobs that exit with status CD or \
 46 |                                  COMPLETED are considered successful (PENDING, RUNNING and REQUEUD are ignored). \
 47 |                                  Jobs with states listed here will be considered successful as well (best to list both \
 48 |                                  2-letter and full-length codes. Full list of job states: \
 49 |                                  https://slurm.schedmd.com/squeue.html#SECTION_JOB-STATE-CODES")
 50 | 
 51 |     ## Reporting bugs
 52 |     group1 = parser.add_mutually_exclusive_group()
 53 |     group1.add_argument('--reportBug', action='store_true',
 54 |                         help='In case of a bug, this flag exports the jobs logs so that you/we can investigate further. '
 55 |                              'The debug file will be stored in the shared folder where this tool is located (under /outputs), '
 56 |                              'to export it to your home folder, user `--reportBugHere`. '
 57 |                              'Note that this will write out some basic information about your jobs, such as runtime, '
 58 |                              'number of cores and memory usage.'
 59 |                         )
 60 |     group1.add_argument('--reportBugHere', action='store_true',
 61 |                         help='Similar to --reportBug, but exports the output to your home folder.')
 62 |     group2 = parser.add_mutually_exclusive_group()
 63 |     group2.add_argument('--useCustomLogs', type=str, default='',
 64 |                         help='This bypasses the workload manager, and enables you to input a custom log file of your jobs. \
 65 |                                  This is mostly meant for debugging, but can be useful in some situations. '
 66 |                              'An example of the expected file can be found at `example_files/example_sacctOutput_raw.txt`.')
 67 |     # Arguments for debugging only (not visible to users)
 68 |     # To ue arbitrary folder for the infrastructure information
 69 |     parser.add_argument('--useOtherInfrastuctureInfo', type=str, default='', help=argparse.SUPPRESS)
 70 |     # Uses mock aggregated usage data, for offline debugging
 71 |     group2.add_argument('--use_mock_agg_data', action='store_true', help=argparse.SUPPRESS)
 72 | 
 73 |     args = parser.parse_args()
 74 |     return args
 75 | 
 76 | class validate_args():
 77 |     """
 78 |     Class used to validate all the arguments provided.
 79 |     """
 80 |     # TODO add validation
 81 |     # TODO test these
 82 | 
 83 |     def _validate_dates(self, args):
 84 |         """
 85 |         Validates that `startDay` and `endDay` are in the right format and in the right order.
 86 |         """
 87 |         for x in [args.startDay, args.endDay]:
 88 |             try:
 89 |                 datetime.datetime.strptime(x, '%Y-%m-%d')
 90 |             except ValueError:
 91 |                 raise ValueError(f"Incorrect date format, should be YYYY-MM-DD but is: {x}")
 92 | 
 93 |         foo = datetime.datetime.strptime(args.startDay, '%Y-%m-%d')
 94 |         bar = datetime.datetime.strptime(args.endDay, '%Y-%m-%d')
 95 |         if foo > bar:
 96 |             raise ValueError(f"Start date ({args.startDay}) is after the end date ({args.endDay}).")
 97 | 
 98 |     def _validate_output(self, args):
 99 |         """
100 |         Validates that --output is one of the accepted options.
101 |         """
102 |         list_options = ['terminal', 'html']
103 |         if args.output not in list_options:
104 |             raise ValueError(f"output argument invalid. Is {args.output} but should be one of {list_options}")
105 | 
106 | 
107 |     def all(self, args):
108 |         self._validate_dates(args)
109 |         self._validate_output(args)
110 | 
111 | if __name__ == "__main__":
112 |     # print("Working dir0: ", os.getcwd()) # DEBUGONLY
113 | 
114 |     args = create_arguments()
115 | 
116 |     ## Decide which infrastructure info to use
117 |     if args.useOtherInfrastuctureInfo != '':
118 |         args.path_infrastucture_info = args.useOtherInfrastuctureInfo
119 |         print(f"Overriding infrastructure info with: {args.path_infrastucture_info}")
120 |     else:
121 |         args.path_infrastucture_info = 'data'
122 | 
123 |     ## Organise the unique output directory (used for output report and logs export for debugging)
124 |     ## creating a uniquely named subdirectory in whatever
125 |     # Decide if an output directory is needed at all
126 |     if (args.output in ['html']) | args.reportBug | args.reportBugHere:
127 |         timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M-%S%f')
128 |         args.outputDir2use = {
129 |             'timestamp': timestamp,
130 |             'path': os.path.join(args.outputDir, f"outputs_{timestamp}")
131 |         }
132 | 
133 |         # Create directory
134 |         os.makedirs(args.outputDir2use["path"])
135 | 
136 |     else:
137 |         # no output is created
138 |         args.outputDir2use = None
139 | 
140 |     ### Set the WD to filter on, if needed
141 |     if args.filterCWD:
142 |         args.filterWD = args.userCWD
143 |         print("\nNB: --filterCWD doesn't work with symbolic links (yet!)\n")
144 |     else:
145 |         args.filterWD = None
146 | 
147 |     ### Validate input
148 |     validate_args().all(args)
149 | 
150 |     ### Run backend to get data
151 |     extracted_data = main_backend(args)
152 | 
153 |     main_frontend(extracted_data, args)


--------------------------------------------------------------------------------
/frontend/templates/styles.css:
--------------------------------------------------------------------------------
  1 | @import url('https://fonts.googleapis.com/css2?family=IBM+Plex+Sans:ital,wght@0,400;0,700;1,400;1,700&display=swap');
  2 | 
  3 | @media only screen and (min-width: 993px) {
  4 |     .container {
  5 |         width: 85%;
  6 |     }
  7 | }
  8 | 
  9 | @media only screen and (min-width: 1201px) {
 10 |     html {
 11 |         font-size:16px;
 12 |     }
 13 | }
 14 | 
 15 | /* Override Materialize */
 16 | html,
 17 | button,
 18 | input,
 19 | optgroup,
 20 | select,
 21 | textarea {
 22 |     font-family: "IBM Plex Sans", sans-serif !important;
 23 | }
 24 | .btn, .tabs .tab { text-transform: inherit; }
 25 | .btn.fluid { width: 100%; }
 26 | td, th {
 27 |     padding: .5em .75em;
 28 | }
 29 | .section {
 30 |     padding: 2rem 0 0;
 31 | }
 32 | .section > h4 {
 33 |     margin-top: 0;
 34 | }
 35 | .input-field .right.circle {
 36 |     float: left !important;
 37 | }
 38 | img.circle {
 39 |     border: 1px solid #d4d4d5;
 40 | }
 41 | /*table#teams-table, table#users-table {*/
 42 | /*    font-size: .9rem;*/
 43 | /*}*/
 44 | .card-panel {
 45 |     padding: 1.25rem;
 46 | }
 47 | .card-panel > .card-title {
 48 |     font-size: 1.25rem;
 49 |     margin-bottom: .5rem;
 50 | }
 51 | .card-panel > .card-title + p {
 52 |     margin-top: 0;
 53 | }
 54 | .card-panel.info {
 55 |     background-color: #e1f5fe;
 56 |     border-left: 0.25rem solid #03a9f4;
 57 | }
 58 | .card-panel.warning {
 59 |     background-color: #fff8e1;
 60 |     border-left: 0.25rem solid #ffc107;
 61 | }
 62 | .card-panel.alert {
 63 |     background-color: #ffebee;
 64 |     border-left: 0.25rem solid #f44336;
 65 | }
 66 | .card-panel > :first-child {
 67 |     margin-top: 0;
 68 | }
 69 | .card-panel > :last-child {
 70 |     margin-bottom: 0;
 71 | }
 72 | .tabs .tab a {
 73 |     color: inherit;
 74 | }
 75 | .tabs .tab a:hover, .tabs .tab a.active {
 76 |     color: inherit;
 77 | }
 78 | .tabs .indicator {
 79 |     height: 4px;
 80 |     background-color: #18974c;
 81 | }
 82 | .tabs .tab a:focus, .tabs .tab a:focus.active {
 83 |   /*background-color: rgba(0,123,83,0.2);*/
 84 |     background-color: transparent;
 85 | }
 86 | input:not(.browser-default).invalid ~ .helper-text[data-error] > *,
 87 | input:not(.browser-default).valid ~ .helper-text[data-success] > * {
 88 |     /* hide nested elements */
 89 |     display: none;
 90 | }
 91 | .modal-content > form:last-child {
 92 |     margin-bottom: 0;
 93 | }
 94 | table + h6 {
 95 |     margin-top: 2rem;
 96 | }
 97 | pre {
 98 |     background-color: #f6f8fa;
 99 |     border: 1px solid rgba(30, 30, 30, 0.1);
100 |     border-radius: 3px;
101 |     color: #24292F;
102 |     font-size: .85em;
103 |     padding: 8px 16px;
104 | }
105 | 
106 | #loader {
107 |     position: fixed;
108 |     left: 0;
109 |     top: 0;
110 |     width: 100%;
111 |     height: 100%;
112 |     background: #eee;
113 |     padding: 5rem 0;
114 |     color: #333;
115 |     z-index: 99999;
116 | }
117 | input:not(.browser-default):focus:not([readonly]):not(.invalid) {
118 |     border-bottom: 1px solid #3489ca !important;
119 |     box-shadow: 0 1px 0 0 #3489ca !important;
120 | }
121 | input:not(.browser-default):focus:not([readonly]):not(.invalid) + label {
122 |     color: #3489ca !important;
123 | }
124 | .custom.blue { background-color: #3489ca !important; }
125 | 
126 | [type="checkbox"].custom.blue.filled-in:checked + span:not(.lever)::after {
127 |     border: 2px solid #3489ca !important;
128 |     background-color: #3489ca !important;
129 | }
130 | 
131 | ::placeholder {
132 |     color: rgb(90, 95, 95);
133 |     opacity: 0.5;
134 | }
135 | 
136 | /* Header */
137 | header {
138 |     border-bottom: 1px solid rgba(0,0,0,.14);
139 | }
140 | #top-nav {
141 |     background-color: inherit;
142 |     box-shadow: inherit;
143 |     color: #18974c;
144 |     height: 150px;
145 |     display: flex;
146 |     flex-direction: row;
147 |     align-items: center;
148 |     margin-bottom: 20px;
149 | }
150 | #title {
151 |     display: flex;
152 |     flex-direction: column;
153 | }
154 | #title h1 {
155 |     margin: 1rem 0 0;
156 |     width: 100%;
157 |     text-align: center;
158 | }
159 | #title h2 {
160 |     margin: 1rem 0 0;
161 |     width: 100%;
162 |     text-align: center;
163 |     font-size: 3rem;
164 | }
165 | #top-nav i {
166 |     font-size: 4.2rem;
167 |     width: 10%;
168 |     margin: 1rem;
169 | }
170 | header p {
171 |     color: rgba(0, 0, 0, .5);
172 |     margin: 0 0 1rem;
173 |     text-align: right;
174 | }
175 | 
176 | /* Intro/Abstract */
177 | #intro > p:first-child {
178 |     font-weight: bold;
179 |     font-size: 110%;
180 | }
181 | 
182 | /* Autocomplete highlight */
183 | .dropdown-content li > span {
184 |     color: #444;
185 | }
186 | .autocomplete-content li .highlight {
187 |     color: #26a69a;
188 | }
189 | 
190 | #remove-user {
191 |     width: 100%;
192 |     height: 48px;
193 | }
194 | 
195 | table + .pagination {
196 |     margin-top: 1rem;
197 |     text-align: center;
198 | }
199 | .pagination li {
200 |     /* Override Materialize */
201 |     vertical-align: auto;
202 |     height: auto;
203 |     margin: .25rem;
204 | }
205 | .pagination li.active {
206 |     background-color: #485fc7;
207 |     border-color: #485fc7;
208 |     color: #fff;
209 | }
210 | .pagination li a {
211 |     color: #363636;
212 |     font-size: 1rem;
213 |     height: auto;
214 |     line-height: normal;
215 |     min-width: 2.5em;
216 |     padding: .5rem;
217 |     user-select: none;
218 | }
219 | .pagination li:not(.ellipsis) {
220 |     border: 1px solid #dbdbdb;
221 |     border-radius: .375em;
222 | }
223 | 
224 | .pagination li.ellipsis {
225 |     pointer-events: none;
226 | }
227 | 
228 | thead th.sortable {
229 |     position: relative;
230 |     padding-right: 20px;
231 | }
232 | thead th.sortable:hover {
233 |     cursor: pointer;
234 | }
235 | thead th.sortable:hover::after {
236 |     opacity: .25;
237 | }
238 | thead th.sortable::after {
239 |     position: absolute;
240 |     display: inline-block;
241 |     opacity: .15;
242 |     right: 10px;
243 |     font-size: .8em;
244 |     content: "▲";
245 | }
246 | 
247 | thead th.sortable.asc::after {
248 |     content: "▲";
249 |     opacity: 1;
250 | }
251 | 
252 | thead th.sortable.desc::after {
253 |     content: "▼";
254 |     opacity: 1;
255 | }
256 | .table-search {
257 |     float: right;
258 |     width: 350px;
259 |     margin-bottom: 0;
260 | }
261 | .table-search > input[type="text"]:not(.browser-default) {
262 |     border: 1px solid #9e9e9e;
263 |     margin: 0;
264 |     padding-left: 20px;
265 | }
266 | .table-search > input[type="text"]:focus:not(.browser-default) {
267 |     border: 1px solid #3489ca !important;
268 |     box-shadow: none !important;
269 | }
270 | 
271 | .table-of-contents a {
272 |     border-color: #4caf50 !important;
273 | }
274 | .table-of-contents a.active {
275 |     font-weight: bold;
276 | }
277 | .table-of-contents a ~ ul li {
278 |     line-height: 1;
279 |     /*padding: 0;*/
280 | }
281 | .table-of-contents a ~ ul li a {
282 |     font-size: .85em;
283 |     color: rgba(117, 117, 117, 0.75);
284 |     line-height: 1;
285 |     height: auto;
286 | }
287 | .table-of-contents a:not(.active) ~ ul {
288 |     /*display: none;*/
289 | }
290 | 
291 | .highcharts-tooltip table > tbody > tr {
292 |     border: none !important;
293 | }
294 | 
295 | .highcharts-tooltip table > tbody > tr > td {
296 |     padding: .25em .5em;
297 | }
298 | #user-info {
299 |     display: flex;
300 |     align-items: center;
301 |     margin-bottom: 1rem;
302 |     /*height: 100px;*/
303 | }
304 | #user-info > img {
305 |     height: 100px;
306 |     width: 100px;
307 | }
308 | #user-info > .content {
309 |     padding-left: 1.5rem;
310 | }
311 | #user-info > .content > h6 {
312 |     /*display: inline-block;*/
313 |     font-weight: 700;
314 |     margin: 0;
315 | }
316 | #user-info > .content > .block {
317 |     margin: .25em 0 .25em;
318 |     color: rgba(0,0,0,.6);
319 | }
320 | #faq .question {
321 |     font-weight: bold;
322 |     display: block;
323 | }
324 | .stats-summary .col i {
325 |     display: block;
326 |     font-size: 3rem;
327 |     margin-bottom: .5rem;
328 | }
329 | .stats-summary .col span:not([data-stat]) {
330 |     font-size: 1.15rem;
331 | }
332 | .stats-summary .col span[data-stat] {
333 |     display: block;
334 |     font-size: 1.5rem;
335 |     font-weight: 700;
336 | }
337 | #contact-email, #contact-slack {
338 |     white-space: nowrap;
339 | }


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # GA4HPC: Green Algorithms for High Performance Computing
  2 | 
  3 | > :point_right: There are many different flabours of SLURM setups, so no doubt you'll find some bugs... 
  4 | please let us know what you find so that we can make it work for more people! 
  5 | 
  6 | The aim of this code is to implement the Green Algorithms framework 
  7 | (more [here](https://onlinelibrary.wiley.com/doi/abs/10.1002/advs.202100707) 
  8 | and on [www.green-algorithms.org](www.green-algorithms.org)) 
  9 | directly on HPC clusters powered by SLURM (although it could work for other workload managers, see below).
 10 | 
 11 | As a user, it pulls your usage statistics from the workload manager's logs and then it estimate your carbon footprint based on this usage.
 12 | It reports a range of statistics such as energy usage, carbon footprints, compute use, memory efficiency, impact of failed jobs etc.
 13 | 
 14 | The default output is in the terminal (example below), but we have now added the option of a richer html output (example coming soon).
 15 | 
 16 | https://github.com/GreenAlgorithms/GreenAlgorithms4HPC/blob/main/example_files/Screenshot%20HPC%202024-08-20.png
 17 | 
 18 | ![example file](https://github.com/GreenAlgorithms/GreenAlgorithms4HPC/blob/main/example_files/Screenshot%20HPC%202024-08-20.png)
 19 | 
 20 | ## Quick start
 21 | 
 22 | The tool only needs to be installed once, preferably in a shared drive so that all users can access it without installing 
 23 | it for themselves.
 24 | 
 25 | :warning: Even if it's in a shared drive, each user will only be able to see their own usage. 
 26 | However, if the HTML output is used without a custom output directory, the report will be also located on the shared drive
 27 | (more on this below).
 28 | 
 29 | ### If GA4HPC is not installed yet
 30 | 
 31 | Then it's on you to install it: see below for installation guide
 32 | 
 33 | ### If GA4HPC is already installed
 34 | 
 35 | Then you can run it straight away to find out your own carbon footprint. 
 36 | Assuming it's installed in `shared_directory`, all you have to do is to run the command below on the SLURM cluster to obtain the carbon footprint between two dates. 
 37 | ```bash
 38 | shared_directory/myCarbonFootprint.sh --startDay 2024-01-10 --endDay 2024-08-15
 39 | ```
 40 | 
 41 | You can customise the output with a number of options (full list below), but the main ones are:
 42 | - `-S --startDay` and `-E --endDay`: formatted at YYY-MM-DD to restrict the logs considered. 
 43 | - `-o --output`: `-o terminal` to have the terminal output (default) or `-o html` for the html report. 
 44 | In case of the html report, a subdirectory will be created for it. 
 45 | By default, it's under `GreenAlgorithms4HPC/outputs/`, but this can be changed. 
 46 | - `--outputDir` to provide a path where to export any output. 
 47 | 
 48 | ### Limitations to keep in mind
 49 | 
 50 |  - The workload manager doesn't alway log the exact CPU usage time, and when this information is missing, we assume that all cores are used at 100%.
 51 |  - For now, we assume that GPUs are used at 100% (as the information needed for more accurate measurement is not available)
 52 |  (this may lead to slightly overestimated carbon footprints, although the order of magnitude is likely to be correct)
 53 |  - Conversely, the wasted energy due to memory overallocation may be largely underestimated, as the information needed is not always logged.
 54 | 
 55 | 
 56 | ## Full list of options
 57 | 
 58 | ```
 59 | usage: __init__.py [-h] [-S STARTDAY] [-E ENDDAY] [-o OUTPUT] [--outputDir OUTPUTDIR] [--filterCWD] [--filterJobIDs FILTERJOBIDS] [--filterAccount FILTERACCOUNT] [--customSuccessStates CUSTOMSUCCESSSTATES]
 60 |                    [--reportBug | --reportBugHere] [--useCustomLogs USECUSTOMLOGS]
 61 | 
 62 | Calculate your carbon footprint on the server.
 63 | 
 64 | optional arguments:
 65 |   -h, --help            show this help message and exit
 66 |   -S STARTDAY, --startDay STARTDAY
 67 |                         The first day to take into account, as YYYY-MM-DD (default: 2024-01-01)
 68 |   -E ENDDAY, --endDay ENDDAY
 69 |                         The last day to take into account, as YYYY-MM-DD (default: today)
 70 |   -o OUTPUT, --output OUTPUT
 71 |                         How to display the results, one of 'terminal' or 'html' (default: terminal)
 72 |   --outputDir OUTPUTDIR
 73 |                         Export path for the output (default: under `outputs/`). Only used with `--output html` and `--reportBug`.
 74 |   --filterCWD           Only report on jobs launched from the current location.
 75 |   --filterJobIDs FILTERJOBIDS
 76 |                         Comma separated list of Job IDs you want to filter on. (default: "all")
 77 |   --filterAccount FILTERACCOUNT
 78 |                         Only consider jobs charged under this account
 79 |   --customSuccessStates CUSTOMSUCCESSSTATES
 80 |                         Comma-separated list of job states. By default, only jobs that exit with status CD or COMPLETED are considered successful (PENDING, RUNNING and REQUEUD are ignored). Jobs with states listed here will
 81 |                         be considered successful as well (best to list both 2-letter and full-length codes. Full list of job states: https://slurm.schedmd.com/squeue.html#SECTION_JOB-STATE-CODES
 82 |   --reportBug           In case of a bug, this flag exports the jobs logs so that you/we can investigate further. The debug file will be stored in the shared folder where this tool is located (under /outputs), to export it to
 83 |                         your home folder, user `--reportBugHere`. Note that this will write out some basic information about your jobs, such as runtime, number of cores and memory usage.
 84 |   --reportBugHere       Similar to --reportBug, but exports the output to your home folder.
 85 |   --useCustomLogs USECUSTOMLOGS
 86 |                         This bypasses the workload manager, and enables you to input a custom log file of your jobs. This is mostly meant for debugging, but can be useful in some situations. An example of the expected file
 87 |                         can be found at `example_files/example_sacctOutput_raw.txt`.
 88 | ```
 89 | 
 90 | ## Installation guide
 91 | 
 92 | :point_right: Only needs to be installed once on a cluster, check first that someone else hasn't installed it yet!
 93 | 
 94 | ### Requirements
 95 | - Python 3.8+ (can probably be adjusted to older versions of python fairly easily).
 96 | 
 97 | ### Step-by-step
 98 | 
 99 | 1. Clone this repository in a shared directory on your cluster:
100 |     ```bash
101 |     $ cd shared_directory 
102 |     $ git clone https://github.com/Llannelongue/GreenAlgorithms4HPC.git
103 |     ```
104 | 
105 | 2. Edit `myCarbonFootprint.sh` line 20 to create the virtual environment with Python 3.8 or later. 
106 | The default line is:
107 |     ```bash
108 |     /usr/bin/python3.8 -m venv GA_env
109 |     ```
110 |     But it may be something else on your server, for example:
111 |     ```bash
112 |     module load python/3.11.7
113 |     python -m venv GA_env
114 |     ```
115 | 
116 | 3. Make the bash script executable: 
117 |     ```bash
118 |     $ chmod +x shared_directory/GreenAlgorithms4HPC/myCarbonFootprint.sh
119 |     ```
120 | 
121 | 4. Edit `cluster_info.yaml` to plug in the values corresponding to the hardware specs of your cluster
122 |    (this is the tricky step). You can ask your HPC team and 
123 |    you can find a lot of useful values on the Green Algorithms GitHub: https://github.com/GreenAlgorithms/green-algorithms-tool/tree/master/data
124 | 
125 | 5. Run the script a first time. It will check that the correct version of python is used 
126 | and will create the virtualenv with the required packages, based on `requirements.txt`:
127 | ```shell script
128 | $ shared_directory/GreenAlgorithms4HPC/myCarbonFootprint.sh
129 | ```
130 | 
131 | ### How to update the software once installed
132 | 
133 | _More elegant solutions welcome! [Discussion here](https://github.com/Llannelongue/GreenAlgorithms4HPC/issues/1)._
134 | 
135 | ⚠️ Make sure you have saved your custom version of `cluster_info.yaml` 
136 | and the way to load python3.8 the first time.
137 | 
138 | - `git reset --hard` To remove local changes to files (hence the need for a backup!)
139 | - `git pull`
140 | - Update `cluster_info.yaml` and `myCarbonFootprint.sh` as described above.
141 | - `chmod +x myCarbonFootprint.sh` to make it executable again
142 | - Test `myCarbonFootprint.sh` 
143 | 
144 | ## FAQ
145 | 
146 | ### Can it work other other workload managers?
147 | 
148 | Yes it can, but we have only written the code for SLURM so far. 
149 | What you can do is to adapts [`slurm_extract.py`](backend/slurm_extract.py) for your own workload manager.
150 | 
151 | In a nutshell, you just need to create a variable `self.df_agg_X` similar to the example file [here](example_files/example_output_workloadManager.tsv)
152 | (only the columns with a name ending in X in the code are needed).
153 | 
154 | ### How to debug errors
155 | There are some example of intermediary files in [example_files/](example_files/).
156 | 
157 | For the workload manager part of the code:
158 | - [The raw output](example_files/example_sacctOutput_raw.txt) ([here](example_files/example_sacctOutput_raw_asDF.tsv) as a table) from the `sacct` SLURM command (this is the command pulling all the logs from SLURM), i.e. `WM.logs_raw`, the output of `WM.pull_logs()`.
159 | - [The cleaned output of the workload manager step](example_files/example_output_workloadManager.tsv), i.e. `WM.df_agg`, the output of `WM.clean_logs_df()`. Only the columns with a name ending with X are needed (the other ones are being used by the workload manager script). NB: the `pd.DataFrame` has been converted to a csv to be included here.
160 | 


--------------------------------------------------------------------------------
/frontend/dashboard_output.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import shutil
  3 | from jinja2 import Environment, FileSystemLoader
  4 | # from jinja2 import select_autoescape, DebugUndefined, StrictUndefined, Undefined
  5 | import datetime
  6 | from pprint import pprint
  7 | import pandas as pd
  8 | import numpy as np
  9 | import plotly.express as px
 10 | 
 11 | from frontend.helpers import formatText_footprint, formatText_treemonths, formatText_flying
 12 | 
 13 | # class SilentUndefined(Undefined): # DEBUGONLY
 14 | #     def _fail_with_undefined_error(self, *args, **kwargs):
 15 | #         return '!MISSING!'
 16 | 
 17 | def formatText_timedelta_short(dt):
 18 |     dt_sec = dt.total_seconds()
 19 |     hour = 3600
 20 |     day = 24*hour
 21 |     year = 365*day
 22 |     if dt_sec >= year:
 23 |         return f"{dt_sec / year:.1f} year{'' if int(dt_sec/year)==1 else 's'}"
 24 |     elif dt_sec > 2*day:
 25 |         return f"{dt_sec / day:.1f} days"
 26 |     elif dt_sec >= hour:
 27 |         return f"{dt_sec / hour:.1f} hour{'' if int(dt_sec/hour)==1 else 's'}"
 28 |     else:
 29 |         return f"{dt_sec:.2f} seconds"
 30 | 
 31 | def formatText_cost(cost, cluster_info):
 32 |     return f"{cluster_info['energy_cost']['currency']}{cost:,.0f}"
 33 | 
 34 | def get_summary_texts(dict_in, cluster_info):
 35 |     output = {
 36 |         'cpuTime': formatText_timedelta_short(dict_in['cpuTime']),
 37 |         'gpuTime': formatText_timedelta_short(dict_in['gpuTime']),
 38 |         'carbonFootprint': formatText_footprint(dict_in['carbonFootprint'], use_html=True),
 39 |         'carbonFootprint_failedJobs': formatText_footprint(dict_in['carbonFootprint_failedJobs'], use_html=True),
 40 |         'carbonFootprint_failedJobs_share': f"{dict_in['carbonFootprint_failedJobs']/dict_in['carbonFootprint']:.2%}",
 41 |         'carbonFootprint_wasted_memoryOverallocation': formatText_footprint(dict_in['carbonFootprint']-dict_in['carbonFootprint_memoryNeededOnly'], use_html=True),
 42 |         'share_carbonFootprint': f"{dict_in['share_carbonFootprint']:.2%}",
 43 |         'trees': formatText_treemonths(dict_in['treeMonths'], splitMonthsYear=False),
 44 |         'flying': formatText_flying(dict_in, output_format='dict'),
 45 |         'cost': formatText_cost(dict_in['cost'], cluster_info=cluster_info),
 46 |         'cost_failedJobs': formatText_cost(dict_in['cost_failedJobs'], cluster_info=cluster_info),
 47 |         'cost_wasted_memoryOverallocation': formatText_cost(dict_in['cost']-dict_in['cost_memoryNeededOnly'], cluster_info=cluster_info),
 48 |         'n_jobs': f"{dict_in['n_jobs']:,}"
 49 |     }
 50 | 
 51 |     for key in dict_in:
 52 |         if key not in output:
 53 |             # print(f"adding {key}")
 54 |             output[key] = dict_in[key]
 55 | 
 56 |     return output
 57 | 
 58 | class dashboard_html:
 59 |     def __init__(self, dict_stats, args, cluster_info):
 60 |         self.dict_stats = dict_stats
 61 |         self.args = args
 62 |         self.cluster_info = cluster_info
 63 | 
 64 |         self.context = {
 65 |             'last_updated': datetime.datetime.now().strftime("%A %d %b %Y, %H:%M"),
 66 |             'startDay': args.startDay,
 67 |             'endDay': args.endDay,
 68 |             'institution': cluster_info['institution'],
 69 |             'cluster_name': cluster_info['cluster_name'],
 70 |             'PUE': cluster_info['PUE'],
 71 |             'CI': cluster_info['CI'],
 72 |             'energy_cost_perkWh': cluster_info['energy_cost'],
 73 |             'texts_intro': cluster_info['texts_intro'],
 74 |         }
 75 | 
 76 |         self.template_plotly = "plotly_white"
 77 |         self.custom_colours = {
 78 |             'area': '#a6cee3'
 79 |         }
 80 |         self.height_plotly = 350
 81 | 
 82 |         self.user_here = dict_stats['user']
 83 | 
 84 |         self.outputDir = args.outputDir2use['path']
 85 |         self.plotsDir = os.path.join(self.outputDir, 'plots')
 86 |         os.makedirs(self.plotsDir)
 87 | 
 88 |     def _user_context(self):
 89 |         ####################################
 90 |         # User-specific part of the report #
 91 |         ####################################
 92 | 
 93 |         self.context['user'] = {'userID': self.user_here}
 94 | 
 95 |         self.context['usersActivity'] = {
 96 |             self.user_here: get_summary_texts(
 97 |                 self.dict_stats['userActivity'][self.user_here],
 98 |                 cluster_info=self.cluster_info
 99 |             )
100 |         }
101 | 
102 |         ### User's overall metrics
103 | 
104 |         df_userDaily_here = self.dict_stats['userDaily']
105 | 
106 |         # Daily carbon footprint
107 |         fig_userDailyCarbonFootprint = px.area(
108 |             df_userDaily_here, x='SubmitDate', y="carbonFootprint",
109 |             labels=dict(SubmitDate='', carbonFootprint='Carbon footprint (gCO2e)'),
110 |             title="Daily carbon footprint",
111 |             template=self.template_plotly,
112 |             color_discrete_sequence=[self.custom_colours['area']]
113 |         )
114 |         fig_userDailyCarbonFootprint.update_layout(height=self.height_plotly)
115 |         fig_userDailyCarbonFootprint.write_html(
116 |             os.path.join(self.plotsDir, "plotly_thisuserDailyCarbonFootprint.html"),
117 |             include_plotlyjs='cdn'
118 |         )
119 | 
120 |         # Daily number of jobs
121 |         fig_userDailyNjobs = px.area(
122 |             df_userDaily_here, x='SubmitDate', y="n_jobs",
123 |             labels=dict(SubmitDate='', n_jobs='Number of jobs started'),
124 |             title="Number of jobs started",
125 |             template=self.template_plotly,
126 |             color_discrete_sequence=[self.custom_colours['area']]
127 |         )
128 |         fig_userDailyNjobs.update_layout(height=self.height_plotly)
129 |         fig_userDailyNjobs.write_html(
130 |             os.path.join(self.plotsDir, "plotly_thisuserDailyNjobs.html"),
131 |             include_plotlyjs='cdn'
132 |         )
133 | 
134 |         # Daily CPU time
135 |         fig_userDailyCpuTime = px.area(
136 |             df_userDaily_here, x='SubmitDate', y="CPUhoursCharged",
137 |             labels=dict(SubmitDate='', CPUhoursCharged='CPU core-hours'),
138 |             title="CPU core hours",
139 |             template=self.template_plotly,
140 |             color_discrete_sequence=[self.custom_colours['area']]
141 |         )
142 |         fig_userDailyCpuTime.update_layout(height=self.height_plotly)
143 |         fig_userDailyCpuTime.write_html(
144 |             os.path.join(self.plotsDir, "plotly_thisuserDailyCpuTime.html"),
145 |             include_plotlyjs='cdn'
146 |         )
147 | 
148 |         # Daily Memory requested
149 |         fig_userDailyCpuTime = px.area(
150 |             df_userDaily_here, x='SubmitDate', y="memoryRequested",
151 |             labels=dict(SubmitDate='', memoryRequested='Memory requested (GB)'),
152 |             title="Memory requested",
153 |             template=self.template_plotly,
154 |             color_discrete_sequence=[self.custom_colours['area']]
155 |         )
156 |         fig_userDailyCpuTime.update_layout(height=self.height_plotly)
157 |         fig_userDailyCpuTime.write_html(
158 |             os.path.join(self.plotsDir, "plotly_thisuserDailyMemoryRequested.html"),
159 |             include_plotlyjs='cdn'
160 |         )
161 | 
162 |         # Total success rate
163 |         n_success = self.dict_stats['userActivity'][self.user_here]['n_success']
164 |         n_failure = self.dict_stats['userActivity'][self.user_here]['n_jobs'] - self.dict_stats['userActivity'][self.user_here]['n_success']
165 |         foo = pd.DataFrame({
166 |             'Status': ['Success', 'Failure'],
167 |             'Number of jobs': [n_success, n_failure]
168 |         })
169 |         fig_userSuccessRate = px.pie(
170 |             foo, values='Number of jobs', names='Status', color='Status',
171 |             color_discrete_map={'Success':"#A9DFBF", 'Failure': "#F5B7B1"},
172 |             template=self.template_plotly,
173 |             hole=.6,
174 |         )
175 |         fig_userSuccessRate.update_layout(height=self.height_plotly)
176 |         fig_userSuccessRate.write_html(
177 |             os.path.join(self.plotsDir, "plotly_thisuserSuccessRate.html"),
178 |             include_plotlyjs='cdn'
179 |         )
180 | 
181 |         # Daily success rate
182 |         fig_userDailySuccessRate = px.area(
183 |             pd.melt(df_userDaily_here, id_vars='SubmitDate', value_vars=['failure_rate', 'success_rate']),
184 |             x='SubmitDate', y="value", color='variable',
185 |             color_discrete_map={'failure_rate': "#F5B7B1", 'success_rate': "#A9DFBF"},
186 |             labels=dict(SubmitDate='', value='% of failed jobs (in red)', variable=""),
187 |             # title="",
188 |             template=self.template_plotly
189 |         )
190 |         fig_userDailySuccessRate.update_layout(height=self.height_plotly, showlegend=False)
191 |         fig_userDailySuccessRate.write_html(
192 |             os.path.join(self.plotsDir, "plotly_thisuserDailySuccessRate.html"),
193 |             include_plotlyjs='cdn'
194 |         )
195 | 
196 |         # Memory efficiency
197 |         fig_userMemoryEfficiency = px.histogram(
198 |             np.reciprocal(self.dict_stats['memoryOverallocationFactors'][self.user_here]) * 100,
199 |             labels=dict(value="Memory efficiency (%)"),
200 |             template=self.template_plotly,
201 |             color_discrete_sequence=[self.custom_colours['area']]
202 |         )
203 |         fig_userMemoryEfficiency.update_layout(
204 |             bargap=0.2,
205 |             yaxis_title="Number of jobs",
206 |             showlegend=False,
207 |             height=self.height_plotly
208 |         )
209 |         fig_userMemoryEfficiency.write_html(
210 |             os.path.join(self.plotsDir, "plotly_thisuserMemoryEfficiency.html"),
211 |             include_plotlyjs='cdn'
212 |         )
213 | 
214 |     def generate(self):
215 | 
216 |         self.context['include_user_context'] = True
217 | 
218 |         self._user_context()
219 | 
220 |         environment = Environment(
221 |             loader=FileSystemLoader(['frontend/templates/', self.plotsDir]),
222 |             # autoescape=select_autoescape(),
223 |             # undefined=SilentUndefined  # StrictUndefined is mostly for testing, SilenUndefined to ignore missing ones
224 |         )
225 | 
226 |         j2_template = environment.get_template('report_blank.html')
227 |         j2_rendered = j2_template.render(self.context)
228 | 
229 |         ## Export
230 |         # print(os.getcwd())
231 |         report_path = os.path.join(self.outputDir, f"report_{self.user_here}.html")
232 |         with open(report_path, 'w') as file:
233 |             file.write(j2_rendered)
234 |         # Also copy across the styles.css
235 |         shutil.copy("frontend/templates/styles.css", self.outputDir)
236 | 
237 |         return report_path
238 | 
239 |         # FIXME the pdf export doesn't really work...sticking with html for now
240 |         # Follows guidelines from https://doc.courtbouillon.org/weasyprint/stable/first_steps.html#command-line
241 |         # from weasyprint import HTML, CSS
242 |         #
243 |         # css = CSS(string=''' @page {size: 53.34cm 167.86 cm;} ''')
244 |         # HTML("outputs/report_rendered.html").write_pdf("outputs/report_rendered.pdf", stylesheets=[css])


--------------------------------------------------------------------------------
/backend/__init__.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import os
  3 | import yaml
  4 | import pandas as pd
  5 | import numpy as np
  6 | 
  7 | from backend.helpers import check_empty_results, simulate_mock_jobs
  8 | from backend.slurm_extract import WorkloadManager
  9 | 
 10 | # print("Working dir1: ", os.getcwd()) # DEBUGONLY
 11 | 
 12 | class GA_tools():
 13 | 
 14 |     def __init__(self, cluster_info, fParams):
 15 |         self.cluster_info = cluster_info
 16 |         self.fParams = fParams
 17 | 
 18 |     def calculate_energies(self, row):
 19 |         '''
 20 |         Calculate the energy usaged based on the job's paramaters
 21 |         :param row: [pd.Series] one row of usage statistics, corresponding to one job
 22 |         :return: [pd.Series] the same statistics with the energies added
 23 |         '''
 24 |         ### CPU and GPU
 25 |         partition_info = self.cluster_info['partitions'][row.PartitionX]
 26 |         if row.PartitionTypeX == 'CPU':
 27 |             TDP2use4CPU = partition_info['TDP']
 28 |             TDP2use4GPU = 0
 29 |         else:
 30 |             TDP2use4CPU = partition_info['TDP_CPU']
 31 |             TDP2use4GPU = partition_info['TDP']
 32 | 
 33 |         row['energy_CPUs'] = row.TotalCPUtime2useX.total_seconds() / 3600 * TDP2use4CPU / 1000  # in kWh
 34 | 
 35 |         row['energy_GPUs'] = row.TotalGPUtime2useX.total_seconds() / 3600 * TDP2use4GPU / 1000  # in kWh
 36 | 
 37 |         ### memory
 38 |         for suffix, memory2use in zip(['','_memoryNeededOnly'], [row.ReqMemX,row.NeededMemX]):
 39 |             row[f'energy_memory{suffix}'] = row.WallclockTimeX.total_seconds()/3600 * memory2use * self.fParams['power_memory_perGB'] /1000 # in kWh
 40 |             row[f'energy{suffix}'] = (row.energy_CPUs +  row.energy_GPUs + row[f'energy_memory{suffix}']) * self.cluster_info['PUE'] # in kWh
 41 | 
 42 |         return row
 43 | 
 44 |     def calculate_carbonFootprint(self, df, col_energy):
 45 |         return df[col_energy] * self.cluster_info['CI']
 46 | 
 47 | 
 48 | def extract_data(args, cluster_info):
 49 | 
 50 |     if args.use_mock_agg_data: # DEBUGONLY
 51 | 
 52 |         if args.reportBug | args.reportBugHere:
 53 |             print("\n(!) --reportBug and --reportBugHere are ignored when --useCustomLogs is present\n")
 54 | 
 55 |         # df2 = simulate_mock_jobs()
 56 |         # df2.to_pickle("testData/df_agg_X_mockMultiUsers_1.pkl")
 57 | 
 58 |         # foo = 'testData/df_agg_test_3.pkl'
 59 |         foo = 'testData/df_agg_X_1.pkl'
 60 |         print(f"Overriding df_agg with `{foo}`")
 61 |         return pd.read_pickle(foo)
 62 | 
 63 | 
 64 |     ### Pull usage statistics from the workload manager
 65 |     WM = WorkloadManager(args, cluster_info)
 66 |     WM.pull_logs()
 67 | 
 68 |     ### Log the output for debugging
 69 |     if args.reportBug | args.reportBugHere:
 70 |         if args.reportBug:
 71 |             # Create an error_logs subfolder in the output dir
 72 |             errorLogsDir = os.path.join(args.outputDir2use['path'], 'error_logs')
 73 |             os.makedirs(errorLogsDir)
 74 |             log_path = os.path.join(errorLogsDir, f'sacctOutput.csv')
 75 |         else:
 76 |             # i.e. args.reportBugHere is True
 77 |             log_path = f"{args.userCWD}/sacctOutput_{args.outputDir2use['timestamp']}.csv"
 78 | 
 79 |         with open(log_path, 'wb') as f:
 80 |             f.write(WM.logs_raw)
 81 |         print(f"\nSLURM statistics logged for debuging: {log_path}\n")
 82 | 
 83 |     ### Turn usage logs into DataFrame
 84 |     WM.convert2dataframe()
 85 |     check_empty_results(WM.logs_df, args)
 86 | 
 87 |     # And clean
 88 |     WM.clean_logs_df()
 89 |     # Check if there are any jobs during the period from this directory and with these jobIDs
 90 |     check_empty_results(WM.df_agg, args)
 91 | 
 92 |     # Check that there is only one user's data
 93 |     if len(set(WM.df_agg_X.UserX)) > 1:
 94 |         raise ValueError(f"More than one user's logs was included: {set(WM.df_agg_X.UserX)}")
 95 | 
 96 |     # WM.df_agg_X.to_pickle("testData/df_agg_X_1.pkl") # DEBUGONLY used to test different steps offline
 97 | 
 98 |     return WM.df_agg_X
 99 | 
100 | def enrich_data(df, fParams, GA):
101 | 
102 |     ### energy
103 |     df = df.apply(GA.calculate_energies, axis=1)
104 | 
105 |     df['energy_failedJobs'] = np.where(df.StateX == 0, df.energy, 0)
106 | 
107 |     ### carbon footprint
108 |     for suffix in ['', '_memoryNeededOnly', '_failedJobs']:
109 |         df[f'carbonFootprint{suffix}'] = GA.calculate_carbonFootprint(df, f'energy{suffix}')
110 |         # Context metrics (part 1)
111 |         df[f'treeMonths{suffix}'] = df[f'carbonFootprint{suffix}'] / fParams['tree_month']
112 |         df[f'cost{suffix}'] = df[f'energy{suffix}'] * fParams['electricity_cost'] # TODO use realtime electricity costs
113 | 
114 |     ### Context metrics (part 2)
115 |     df['driving'] = df.carbonFootprint / fParams['passengerCar_EU_perkm']
116 |     df['flying_NY_SF'] = df.carbonFootprint / fParams['flight_NY_SF']
117 |     df['flying_PAR_LON'] = df.carbonFootprint / fParams['flight_PAR_LON']
118 |     df['flying_NYC_MEL'] = df.carbonFootprint / fParams['flight_NYC_MEL']
119 | 
120 |     return df
121 | 
122 | def summarise_data(df, args):
123 |     agg_functions_from_raw = {
124 |         'n_jobs': ('UserX', 'count'),
125 |         'first_job_period': ('SubmitDatetimeX', 'min'),
126 |         'last_job_period': ('SubmitDatetimeX', 'max'),
127 |         'energy': ('energy', 'sum'),
128 |         'energy_CPUs': ('energy_CPUs', 'sum'),
129 |         'energy_GPUs': ('energy_GPUs', 'sum'),
130 |         'energy_memory': ('energy_memory', 'sum'),
131 |         'carbonFootprint': ('carbonFootprint', 'sum'),
132 |         'carbonFootprint_memoryNeededOnly': ('carbonFootprint_memoryNeededOnly', 'sum'),
133 |         'carbonFootprint_failedJobs': ('carbonFootprint_failedJobs', 'sum'),
134 |         'cpuTime': ('TotalCPUtime2useX', 'sum'),
135 |         'gpuTime': ('TotalGPUtime2useX', 'sum'),
136 |         'wallclockTime': ('WallclockTimeX', 'sum'),
137 |         'CPUhoursCharged': ('CPUhoursChargedX', 'sum'),
138 |         'GPUhoursCharged': ('GPUhoursChargedX', 'sum'),
139 |         'memoryRequested': ('ReqMemX', 'sum'),
140 |         'memoryOverallocationFactor': ('memOverallocationFactorX', 'mean'),
141 |         'n_success': ('StateX', 'sum'),
142 |         'treeMonths': ('treeMonths', 'sum'),
143 |         'treeMonths_memoryNeededOnly': ('treeMonths_memoryNeededOnly', 'sum'),
144 |         'treeMonths_failedJobs': ('treeMonths_failedJobs', 'sum'),
145 |         'driving': ('driving', 'sum'),
146 |         'flying_NY_SF': ('flying_NY_SF', 'sum'),
147 |         'flying_PAR_LON': ('flying_PAR_LON', 'sum'),
148 |         'flying_NYC_MEL': ('flying_NYC_MEL', 'sum'),
149 |         'cost': ('cost', 'sum'),
150 |         'cost_failedJobs': ('cost_failedJobs', 'sum'),
151 |         'cost_memoryNeededOnly': ('cost_memoryNeededOnly', 'sum'),
152 |     }
153 | 
154 |     # This is to aggregate already aggregated dataset (so names are a bit different)
155 |     agg_functions_further = agg_functions_from_raw.copy()
156 |     agg_functions_further['n_jobs'] = ('n_jobs', 'sum')
157 |     agg_functions_further['first_job_period'] = ('first_job_period', 'min')
158 |     agg_functions_further['last_job_period'] = ('last_job_period', 'max')
159 |     agg_functions_further['cpuTime'] = ('cpuTime', 'sum')
160 |     agg_functions_further['gpuTime'] = ('gpuTime', 'sum')
161 |     agg_functions_further['wallclockTime'] = ('wallclockTime', 'sum')
162 |     agg_functions_further['CPUhoursCharged'] = ('CPUhoursCharged', 'sum')
163 |     agg_functions_further['GPUhoursCharged'] = ('GPUhoursCharged', 'sum')
164 |     agg_functions_further['memoryRequested'] = ('memoryRequested', 'sum')
165 |     agg_functions_further['memoryOverallocationFactor'] = ('memoryOverallocationFactor', 'mean') # NB: not strictly correct to do a mean of mean, but ok
166 |     agg_functions_further['n_success'] = ('n_success', 'sum')
167 | 
168 |     def agg_jobs(data, agg_names=None):
169 |         """
170 | 
171 |         :param data:
172 |         :param agg_names: if None, then the whole dataset is aggregated
173 |         :return:
174 |         """
175 |         agg_names2 = agg_names if agg_names else lambda _:True
176 |         if 'UserX' in data.columns:
177 |             timeseries = data.groupby(agg_names2).agg(**agg_functions_from_raw)
178 |         else:
179 |             timeseries = data.groupby(agg_names2).agg(**agg_functions_further)
180 | 
181 |         timeseries.reset_index(inplace=True, drop=(agg_names is None))
182 |         timeseries['success_rate'] = timeseries.n_success / timeseries.n_jobs
183 |         timeseries['failure_rate'] = 1 - timeseries.success_rate
184 |         timeseries['share_carbonFootprint'] = timeseries.carbonFootprint / timeseries.carbonFootprint.sum()
185 | 
186 |         return timeseries
187 | 
188 |     df['SubmitDate'] = df.SubmitDatetimeX.dt.date  # TODO do it with real start time rather than submit day
189 | 
190 |     df_userdaily = agg_jobs(df, ['SubmitDate'])
191 |     df_overallStats = agg_jobs(df_userdaily)
192 |     dict_overallStats = df_overallStats.iloc[0, :].to_dict()
193 |     userID = df.UserX[0]
194 | 
195 |     output = {
196 |         "userDaily": df_userdaily,
197 |         'userActivity': {userID: dict_overallStats},
198 |         "user": userID
199 |     }
200 | 
201 |     # Some job-level statistics to plot distributions
202 |     memoryOverallocationFactors = df.groupby('UserX')['memOverallocationFactorX'].apply(list).to_dict()
203 |     memoryOverallocationFactors['overall'] = df.memOverallocationFactorX.to_numpy()
204 |     output['memoryOverallocationFactors'] = memoryOverallocationFactors
205 | 
206 |     return output
207 | 
208 | 
209 | def main_backend(args):
210 |     '''
211 | 
212 |     :param args:
213 |     :return:
214 |     '''
215 |     ### Load cluster specific info
216 |     with open(os.path.join(args.path_infrastucture_info, 'cluster_info.yaml'), "r") as stream:
217 |         try:
218 |             cluster_info = yaml.safe_load(stream)
219 |         except yaml.YAMLError as exc:
220 |             print(exc)
221 | 
222 |     ### Load fixed parameters
223 |     with open("data/fixed_parameters.yaml", "r") as stream:
224 |         try:
225 |             fParams = yaml.safe_load(stream)
226 |         except yaml.YAMLError as exc:
227 |             print(exc)
228 | 
229 |     GA = GA_tools(cluster_info, fParams)
230 | 
231 |     df = extract_data(args, cluster_info=cluster_info)
232 |     df2 = enrich_data(df, fParams=fParams, GA=GA)
233 |     summary_stats = summarise_data(df2, args=args)
234 | 
235 |     return summary_stats
236 | 
237 | if __name__ == "__main__":
238 | 
239 |     #### This is used for testing only ####
240 | 
241 |     from collections import namedtuple
242 |     argStruct = namedtuple('argStruct',
243 |                            'startDay endDay use_mock_agg_data useCustomLogs customSuccessStates filterWD filterJobIDs filterAccount reportBug reportBugHere path_infrastucture_info')
244 |     args = argStruct(
245 |         startDay='2022-01-01',
246 |         endDay='2023-06-30',
247 |         useCustomLogs=None,
248 |         use_mock_agg_data=True,
249 |         customSuccessStates='',
250 |         filterWD=None,
251 |         filterJobIDs='all',
252 |         filterAccount=None,
253 |         reportBug=False,
254 |         reportBugHere=False,
255 |         path_infrastucture_info="clustersData/CSD3",
256 |     )
257 | 
258 |     main_backend(args)
259 | 
260 | 
261 | 
262 | 


--------------------------------------------------------------------------------
/backend/slurm_extract.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import subprocess
  3 | 
  4 | import pandas as pd
  5 | from io import BytesIO
  6 | import datetime
  7 | import os
  8 | import numpy as np
  9 | 
 10 | 
 11 | class Helpers_WM():
 12 | 
 13 |     def __init__(self, cluster_info):
 14 |         self.cluster_info = cluster_info
 15 | 
 16 |     def convert_to_GB(self, memory, unit):
 17 |         """
 18 |         Converts data quantity into GB.
 19 |         :param memory: [float] quantity to convert
 20 |         :param unit: [str] unit of `memory`, has to be one of ['M', 'G', 'K']
 21 |         :return: [float] memory in GB.
 22 |         """
 23 |         assert unit in ['M', 'G', 'K']
 24 |         if unit == 'M':
 25 |             memory /= 1e3
 26 |         elif unit == 'K':
 27 |             memory /= 1e6
 28 |         return memory
 29 | 
 30 |     def calc_ReqMem(self, x):
 31 |         """
 32 |         Calculates the total memory required when submitting the job.
 33 |         :param x: [pd.Series] one row of sacct output.
 34 |         :return: [float] total required memory, in GB.
 35 |         """
 36 |         mem_raw, n_nodes, n_cores = x['ReqMem'], x['NNodes'], x['NCPUS']
 37 | 
 38 |         if pd.isnull(mem_raw):
 39 |             unit = 'G'
 40 |             memory = 0
 41 |         elif mem_raw[-1] == 'n':
 42 |             unit = mem_raw[-2]
 43 |             memory = float(mem_raw[:-2]) * n_nodes
 44 |         elif mem_raw[-1] == 'c':
 45 |             unit = mem_raw[-2]
 46 |             memory = float(mem_raw[:-2]) * n_cores
 47 |         elif mem_raw[-1] in ['M', 'G', 'K']:
 48 |             unit = mem_raw[-1]
 49 |             memory = float(mem_raw[:-1])
 50 |         else:
 51 |             raise ValueError(f"Can't parse memory value: {mem_raw}. Please raise issue on GitHub.")
 52 | 
 53 |         return self.convert_to_GB(memory, unit)
 54 | 
 55 |     def clean_RSS(self, x):
 56 |         """
 57 |         Cleans the RSS value in sacct output.
 58 |         :param x: [NaN or str] the RSS value, either NaN or of the form '2745K'
 59 |         (optionally, just a number, we then use default_unit_RSS from cluster_info.yaml as unit).
 60 |         :return: [float] RSS value, in GB.
 61 |         """
 62 |         if pd.isnull(x.MaxRSS):
 63 |             # NB if no info on MaxRSS, we assume all memory was used
 64 |             memory = -1
 65 |         elif x.MaxRSS == '0':
 66 |             memory = 0
 67 |         else:
 68 |             assert type(x.MaxRSS) == str
 69 |             # Special case for the situation where MaxRSS is of the form '154264' without a unit.
 70 |             if x.MaxRSS[-1].isalpha():
 71 |                 memory = self.convert_to_GB(float(x.MaxRSS[:-1]), x.MaxRSS[-1])
 72 |             else:
 73 |                 assert 'default_unit_RSS' in self.cluster_info, "Some values of MaxRSS don't have a unit. Please specify a default_unit_RSS in cluster_info.yaml"
 74 |                 memory = self.convert_to_GB(float(x.MaxRSS), self.cluster_info['default_unit_RSS'])
 75 | 
 76 |         return memory
 77 | 
 78 |     def cleam_UsedMem(self, x):
 79 |         """
 80 |         Cleans the UsedMemory column
 81 |         :param x:
 82 |         :return: [float]
 83 |         """
 84 |         # NB when MaxRSS didn't store any values, we assume that "memory used = memory requested"
 85 |         return x.ReqMemX if x.UsedMem_ == -1 else x.UsedMem_
 86 | 
 87 |     def clean_partition(self, x):
 88 |         """
 89 |         Cleans the partition field, by replacing NaNs with empty string and selecting just one partition per job.
 90 |         :param x: [str] partition or comma-seperated list of partitions
 91 |         :return: [str] one partition or empty string
 92 |         """
 93 |         if pd.isnull(x.Partition):
 94 |             return ''
 95 | 
 96 |         L_partitions = x.Partition.split(',')
 97 |         if (x.WallclockTimeX.total_seconds() > 0) & (len(L_partitions) > 1):
 98 |             # Multiple partitions logged is only an issue for jobs that never started,
 99 |             # for the others, only the used partition is logged
100 |             print(f"\n-!- WARNING: Multiple partitions logged on a job than ran: {x.JobID} - {x.Partition} (using the first one)\n")
101 |         return L_partitions[0]
102 | 
103 |     def set_partitionType(self, x):
104 |         assert x in self.cluster_info['partitions'], f"\n-!- Unknown partition: {x} -!-\n"
105 |         return self.cluster_info['partitions'][x]['type']
106 | 
107 |     def parse_timedelta(self, x):
108 |         """
109 |         Parse a string representing a duration into a `datetime.timedelta` object.
110 |         :param x: [str] Duration, as '[DD-HH:MM:]SS[.MS]'
111 |         :return: [datetime.timedelta] Timedelta object
112 |         """
113 |         # Parse number of days
114 |         day_split = x.split('-')
115 |         if len(day_split) == 2:
116 |             n_days = int(day_split[0])
117 |             HHMMSSms = day_split[1]
118 |         else:
119 |             n_days = 0
120 |             HHMMSSms = x
121 | 
122 |         # Parse ms
123 |         ms_split = HHMMSSms.split('.')
124 |         if len(ms_split) == 2:
125 |             n_ms = int(ms_split[1])
126 |             HHMMSS = ms_split[0]
127 |         else:
128 |             n_ms = 0
129 |             HHMMSS = HHMMSSms
130 | 
131 |         # Parse HH,MM,SS
132 |         last_split = HHMMSS.split(':')
133 |         if len(last_split) == 3:
134 |             to_add = []
135 |         elif len(last_split) == 2:
136 |             to_add = ['00']
137 |         elif len(last_split) == 1:
138 |             to_add = ['00', '00']
139 |         else:
140 |             raise ValueError(f"Can't parse {x}")
141 |         n_h, n_m, n_s = list(map(int, to_add + last_split))
142 | 
143 |         return datetime.timedelta(
144 |             days=n_days, hours=n_h, minutes=n_m, seconds=n_s, milliseconds=n_ms
145 |         )
146 | 
147 |     def calc_realMemNeeded(self, x, granularity_memory_request):
148 |         """
149 |         Calculate the minimum memory needed.
150 |         This is calculated as the smallest multiple of `granularity_memory_request` that is greater than maxRSS.
151 |         :param x: [pd.Series] one row of sacct output.
152 |         :param  granularity_memory_request: [float or int] level of granularity available when requesting memory on this cluster
153 |         :return: [float] minimum memory needed, in GB.
154 |         """
155 |         foo = (int(x.UsedMem2_ / granularity_memory_request) + 1) * granularity_memory_request
156 |         return foo if x.ReqMemX < x.UsedMem2_ else min(x.ReqMemX, foo)
157 | 
158 |     def calc_memory_overallocation(self, x):
159 |         # This is in case ReqMem is wrong or too low
160 |         return 1. if x.ReqMemX < x.NeededMemX else x.ReqMemX / x.NeededMemX
161 | 
162 |     def calc_CPUusage2use(self, x):
163 |         if x.TotalCPUtime_.total_seconds() == 0:
164 |             # This is when the workload manager actually didn't store real usage
165 |             # NB: when TotalCPU=0, we assume usage factor = 100% for all CPU cores
166 |             return x.CPUwallclocktime_
167 | 
168 |         assert x.TotalCPUtime_ <= x.CPUwallclocktime_
169 |         return x.TotalCPUtime_
170 | 
171 |     def calc_GPUusage2use(self, x):
172 |         if x.PartitionTypeX != 'GPU':
173 |             return datetime.timedelta(0)
174 |         if x.WallclockTimeX.total_seconds() > 0:
175 |             assert x.NGPUS_ != 0
176 |         return x.WallclockTimeX * x.NGPUS_  # NB assuming usage factor of 100% for GPUs
177 | 
178 |     def calc_coreHoursCharged(self, x):
179 |         '''
180 |         Split CPU and GPU core hours charged, depending on the partition.
181 |         :param x:
182 |         :return: [(float, float)]
183 |         '''
184 |         if x.PartitionTypeX == 'CPU':
185 |             return x.CPUwallclocktime_ / np.timedelta64(1, 'h'), 0.
186 |         else:
187 |             return 0., x.WallclockTimeX * x.NGPUS_ / np.timedelta64(1, 'h')
188 | 
189 |     def clean_State(self, x, customSuccessStates_list):
190 |         """
191 |         Standardise the job's state, coding with {-1,0,1}
192 |         :param x: [str] "State" field from sacct output
193 |         :return: [int] in [-1,0,1]
194 |         """
195 |         # Codes are found here: https://slurm.schedmd.com/squeue.html#SECTION_JOB-STATE-CODES
196 |         # self.args.customSuccessStates = 'TO,TIMEOUT'
197 |         success_codes = ['CD', 'COMPLETED']
198 |         running_codes = ['PD', 'PENDING', 'R', 'RUNNING', 'RQ', 'REQUEUED']
199 |         if x in success_codes:
200 |             codeState = 1
201 |         elif x in customSuccessStates_list:
202 |             # we allocate a lower value here so that when aggregating by jobID, the whole job keeps the flag
203 |             # Otherwise a "cancelled" job could take over with StateX=0 for example
204 |             codeState = -1
205 |         else:
206 |             codeState = 0
207 | 
208 |         if x in running_codes:
209 |             # running jobs are the lowest to be removed all the time
210 |             # (if one of the subprocess is still running, the job gets ignored regardless of --customSuccessStates
211 |             codeState = -2
212 | 
213 |         return codeState
214 | 
215 |     def get_parent_jobID(self, x):
216 |         """
217 |         Get the parent job ID in case of array jobs
218 |         :param x: [str] JobID of the form 123456789_0 (with or without '_0')
219 |         :return: [str] Parent ID 123456789
220 |         """
221 |         foo = x.split('_')
222 |         assert len(foo) <= 2, f"Can't parse the job ID: {x}"
223 |         return foo[0]
224 | 
225 | 
226 | class WorkloadManager(Helpers_WM):
227 | 
228 |     def __init__(self, args, cluster_info):
229 |         """
230 |         Methods related to the Workload manager
231 |         :param args: [Namespace] input from the user
232 |         :param cluster_info: [dict] information about this specific cluster.
233 |         """
234 |         super().__init__(cluster_info=cluster_info)
235 |         self.args = args
236 | 
237 |         self.logs_df = None
238 |         self.df_agg_0 = None
239 |         self.df_agg = None
240 |         self.df_agg_X = None
241 | 
242 |     def pull_logs(self):
243 |         """
244 |         Run the command line to pull usage from the workload manager.
245 |         More: https://slurm.schedmd.com/sacct.html
246 |         """
247 |         if self.args.useCustomLogs == '':
248 |             bash_com = [
249 |                 "sacct",
250 |                 "--starttime",
251 |                 self.args.startDay,  # format YYYY-MM-DD
252 |                 "--endtime",
253 |                 self.args.endDay,  # format YYYY-MM-DD
254 |                 "--format",
255 |                 "UID,User,JobID,JobName,Submit,Elapsed,Partition,NNodes,NCPUS,TotalCPU,CPUTime,ReqMem,MaxRSS,WorkDir,State,Account,AllocTres",
256 |                 "-P"
257 |             ]
258 | 
259 |             # logs = subprocess.run(bash_com, capture_output=True) # this line is the new way, but doesn't work with python 3.6 or earlier. line below is the legacy way. https://stackoverflow.com/questions/4760215/running-shell-command-and-capturing-the-output
260 |             logs = subprocess.run(bash_com, stdout=subprocess.PIPE)
261 |             self.logs_raw = logs.stdout
262 |         else:
263 |             foo = "Overriding logs_raw with: "
264 |             foundIt = False
265 |             for sacctFileLocation in ['', 'testData', 'error_logs']:
266 |                 if not foundIt:
267 |                     try:
268 |                         with open(os.path.join(sacctFileLocation, self.args.useCustomLogs), 'rb') as f:
269 |                             self.logs_raw = f.read()
270 |                         foo += f"{sacctFileLocation}/{self.args.useCustomLogs}"
271 |                         foundIt = True
272 |                     except:
273 |                         pass
274 |             if not foundIt:
275 |                 raise FileNotFoundError(f"Couldn't find {self.args.useCustomLogs} \n "
276 |                                         f"It should be either be in the testData/ or error_logs/ subdirectories, or the full path should be provided by --useCustomLogs.")
277 |             print(foo)
278 | 
279 |     def convert2dataframe(self):
280 |         """
281 |         Convert raw logs output into a pandas dataframe.
282 |         """
283 |         logs_df = pd.read_csv(BytesIO(self.logs_raw), sep="|", dtype='str')
284 |         for x in ['NNodes', 'NCPUS']:
285 |             logs_df[x] = logs_df[x].astype('int64')
286 | 
287 |         self.logs_df = logs_df
288 | 
289 |     def clean_logs_df(self):
290 |         """
291 |         Clean the different fields of the usage logs.
292 |         NB: the name of the columns ending with X need to be conserved, as they are used by the main script.
293 |         """
294 |         # self.logs_df_raw = self.logs_df.copy() # DEBUGONLY Save a copy of uncleaned raw for debugging mainly
295 | 
296 |         ### Calculate real memory usage
297 |         self.logs_df['ReqMemX'] = self.logs_df.apply(self.calc_ReqMem, axis=1)
298 | 
299 |         ### Clean MaxRSS
300 |         self.logs_df['UsedMem_'] = self.logs_df.apply(self.clean_RSS, axis=1)
301 | 
302 |         ### Parse wallclock time
303 |         self.logs_df['WallclockTimeX'] = self.logs_df['Elapsed'].apply(self.parse_timedelta)
304 | 
305 |         ### Parse total CPU time
306 |         # This is the total CPU used time, accross all cores.
307 |         # But it is not reliably logged
308 |         self.logs_df['TotalCPUtime_'] = self.logs_df['TotalCPU'].apply(self.parse_timedelta)
309 | 
310 |         ### Parse core-wallclock time
311 |         # This is the maximum time cores could use, if used at 100% (Elapsed time * CPU count)
312 |         if 'CPUTime' in self.logs_df.columns:
313 |             self.logs_df['CPUwallclocktime_'] = self.logs_df['CPUTime'].apply(self.parse_timedelta)
314 |         else:
315 |             print('Using old logs, "CPUTime" information not available.')  # TODO: remove this after a while
316 |             self.logs_df['CPUwallclocktime_'] = self.logs_df.WallclockTimeX * self.logs_df.NCPUS
317 | 
318 |         ### Number of GPUs
319 |         # TODO double check that it includes multiple GPUs correctly
320 |         if 'AllocTRES' in self.logs_df.columns:
321 |             self.logs_df['NGPUS_'] = self.logs_df.AllocTRES.str.extract(r'((?<=gres\/gpu=)\d+)', expand=False).fillna(
322 |                 0).astype('int64')
323 |         else:
324 |             print('Using old logs, "AllocTRES" information not available.')  # TODO: remove this after a while
325 |             self.logs_df['NGPUS_'] = 0
326 | 
327 |         ### Clean partition
328 |         # Make sure it's either a partition name, or a comma-separated list of partitions
329 |         self.logs_df['PartitionX'] = self.logs_df.apply(self.clean_partition, axis=1)
330 | 
331 |         ### Parse submit datetime
332 |         self.logs_df['SubmitDatetimeX'] = self.logs_df.Submit.apply(
333 |             lambda x: datetime.datetime.strptime(x, "%Y-%m-%dT%H:%M:%S"))
334 | 
335 |         ### Number of CPUs
336 |         # e.g. here there is no cleaning necessary, so I just standardise the column name
337 |         self.logs_df['NCPUS_'] = self.logs_df.NCPUS
338 | 
339 |         ### Number of nodes
340 |         self.logs_df['NNodes_'] = self.logs_df.NNodes
341 | 
342 |         ### Job name
343 |         self.logs_df['JobName_'] = self.logs_df.JobName
344 | 
345 |         ### Working directory
346 |         self.logs_df['WorkingDir_'] = self.logs_df.WorkDir
347 | 
348 |         ### Username and UID
349 |         self.logs_df['UIDX'] = self.logs_df.UID
350 |         self.logs_df['UserX'] = self.logs_df.User
351 | 
352 |         ### State
353 |         customSuccessStates_list = self.args.customSuccessStates.split(',')
354 |         self.logs_df['StateX'] = self.logs_df.State.apply(self.clean_State,
355 |                                                           customSuccessStates_list=customSuccessStates_list)
356 | 
357 |         ### Pull jobID
358 |         self.logs_df['single_jobID'] = self.logs_df.JobID.apply(lambda x: x.split('.')[0])
359 | 
360 |         ### Account
361 |         if 'Account' in self.logs_df.columns:
362 |             self.logs_df['Account_'] = self.logs_df.Account
363 |         else:
364 |             print('Using old logs, "Account" information not available.')  # TODO: remove this after a while
365 |             self.logs_df['Account_'] = ''
366 | 
367 |         ### Aggregate per jobID
368 |         self.df_agg_0 = self.logs_df.groupby('single_jobID').agg({
369 |             'TotalCPUtime_': 'max',
370 |             'CPUwallclocktime_': 'max',
371 |             'WallclockTimeX': 'max',
372 |             'ReqMemX': 'max',
373 |             'UsedMem_': 'max',
374 |             'NCPUS_': 'max',
375 |             'NGPUS_': 'max',
376 |             'NNodes_': 'max',
377 |             'PartitionX': lambda x: ''.join(x),
378 |             'JobName_': 'first',
379 |             'SubmitDatetimeX': 'min',
380 |             'WorkingDir_': 'first',
381 |             'StateX': 'min',
382 |             'Account_': 'first',
383 |             'UIDX': 'first',
384 |             'UserX': 'first',
385 |         })
386 | 
387 |         ### Remove jobs that are still running or currently queued
388 |         self.df_agg = self.df_agg_0.loc[self.df_agg_0.StateX != -2]
389 | 
390 |         ### Turn StateX==-2 into 1
391 |         self.df_agg.loc[self.df_agg.StateX == -1, 'StateX'] = 1
392 | 
393 |         ### Replace UsedMem_=-1 with memory requested (for when MaxRSS=NaN)
394 |         self.df_agg['UsedMem2_'] = self.df_agg.apply(self.cleam_UsedMem, axis=1)
395 | 
396 |         ### Label as CPU or GPU partition
397 |         self.df_agg['PartitionTypeX'] = self.df_agg.PartitionX.apply(self.set_partitionType)
398 | 
399 |         # Just used to clean up with old logs:
400 |         if 'AllocTRES' not in self.logs_df.columns:
401 |             self.df_agg.loc[self.df_agg.PartitionTypeX == 'GPU', 'NGPUS_'] = 1  # TODO remove after a while
402 | 
403 |         # Sanity check (no GPU logged for CPU partitions and vice versa)
404 |         assert (self.df_agg.loc[self.df_agg.PartitionTypeX == 'CPU'].NGPUS_ == 0).all()
405 |         foo = self.df_agg.loc[(self.df_agg.PartitionTypeX == 'GPU') & (self.df_agg.NGPUS_ == 0)]
406 |         assert (foo.WallclockTimeX.dt.total_seconds() == 0).all()  # Cancelled GPU jobs won't have any GPUs allocated if they didn't start
407 | 
408 |         ## Check that there is no missing UID/User
409 |         if self.df_agg.UIDX.isnull().sum() > 0:
410 |             print(f"(!) WARNING: {self.df_agg.UIDX.isnull().sum()} jobs have missing UIDs")
411 |         if self.df_agg.UserX.isnull().sum() > 0:
412 |             print(f"(!) WARNING: {self.df_agg.UserX.isnull().sum()} jobs have missing Usernames")
413 | 
414 |         ### add the usage time to use for calculations
415 |         self.df_agg['TotalCPUtime2useX'] = self.df_agg.apply(self.calc_CPUusage2use, axis=1)
416 |         self.df_agg['TotalGPUtime2useX'] = self.df_agg.apply(self.calc_GPUusage2use, axis=1)
417 | 
418 |         ### Calculate core-hours charged
419 |         self.df_agg[['CPUhoursChargedX', 'GPUhoursChargedX']] = self.df_agg.apply(self.calc_coreHoursCharged, axis=1, result_type='expand')
420 | 
421 |         ### Calculate real memory need
422 |         self.df_agg['NeededMemX'] = self.df_agg.apply(
423 |             self.calc_realMemNeeded,
424 |             granularity_memory_request=self.cluster_info['granularity_memory_request'],
425 |             axis=1)
426 | 
427 |         ### Add memory waste information
428 |         self.df_agg['memOverallocationFactorX'] = self.df_agg.apply(self.calc_memory_overallocation, axis=1)
429 | 
430 |         # foo = self.df_agg[['TotalCPUtime_', 'CPUwallclocktime_', 'WallclockTimeX', 'NCPUS_', 'CoreHoursChargedCPUX',
431 |         #                    'CoreHoursChargedGPUX', 'TotalCPUtime2useX', 'TotalGPUtime2useX']] # DEBUGONLY
432 | 
433 |         ### Filter on working directory
434 |         if self.args.filterWD is not None:
435 |             # FIXME: Doesn't work with symbolic links
436 |             self.df_agg = self.df_agg.loc[self.df_agg.WorkingDir_ == self.args.filterWD]
437 |             # print(f'Filtered out {len(self.df_agg)-len(self.df_agg):,} rows (filterCWD={self.args.filterWD})') # DEBUGONLY
438 | 
439 |         ### Filter on Job ID
440 |         self.df_agg.reset_index(inplace=True)
441 |         self.df_agg['parentJobID'] = self.df_agg.single_jobID.apply(self.get_parent_jobID)
442 | 
443 |         if self.args.filterJobIDs != 'all':
444 |             list_jobs2keep = self.args.filterJobIDs.split(',')
445 |             self.df_agg = self.df_agg.loc[self.df_agg.parentJobID.isin(list_jobs2keep)]
446 | 
447 |         ### Filter on Account
448 |         if self.args.filterAccount is not None:
449 |             self.df_agg = self.df_agg.loc[self.df_agg.Account_ == self.args.filterAccount]
450 | 
451 |         self.df_agg_X = self.df_agg[[x for x in self.df_agg.columns if x[-1] == 'X']]


--------------------------------------------------------------------------------