├── .circleci └── config.yml ├── .gitignore ├── .pylintrc ├── AUTHORS ├── CHANGELOG.md ├── LICENSE ├── README.md ├── config ├── ccr-files │ └── supremm_update_ccr ├── config.json ├── prometheus │ └── mapping.json └── templates │ ├── hotproc │ └── hotproc.conf │ ├── pmlogger │ ├── control │ └── pmlogger-supremm.config │ └── slurm │ ├── slurm-epilog │ └── slurm-prolog ├── pytest.ini ├── setup.cfg ├── setup.py ├── src └── supremm │ ├── .gitignore │ ├── Job.py │ ├── TimeseriesPatterns.py │ ├── __init__.py │ ├── account.py │ ├── accounting.py │ ├── assets │ ├── modw_pcp.sql │ ├── modw_supremm.sql │ └── mongo_setup.js │ ├── batch_acct.py │ ├── config.py │ ├── datadumper.py │ ├── datasource │ ├── __init__.py │ ├── datasource.py │ ├── factory.py │ ├── pcp │ │ ├── __init__.py │ │ ├── indexarchives.py │ │ ├── pcparchive.py │ │ ├── pcpcinterface │ │ │ ├── __init__.py │ │ │ ├── c_pcp.pxd │ │ │ └── pcpcinterface.pyx │ │ ├── pcpdatasource.py │ │ └── pcpsummarize.py │ └── prometheus │ │ ├── __init__.py │ │ ├── promdatasource.py │ │ ├── prominterface.py │ │ ├── prommapping.py │ │ └── promsummarize.py │ ├── errors.py │ ├── gen_pmlogger_control.py │ ├── ingest_jobscripts.py │ ├── lariat.py │ ├── linuxhelpers.py │ ├── migrations │ └── 1.0-1.1 │ │ └── modw_supremm.sql │ ├── outputter.py │ ├── plugin.py │ ├── plugins │ ├── ArmPowerUsageTimeseries.py │ ├── Block.py │ ├── BlockTimeseries.py │ ├── Catastrophe.py │ ├── CgroupMemTimeseries.py │ ├── CgroupMemory.py │ ├── CpuCategories.py │ ├── CpuPerfCounters.py │ ├── CpuUsage.py │ ├── CpuUserTimeseries.py │ ├── Gpfs.py │ ├── GpfsTimeseries.py │ ├── GpuPower.py │ ├── GpuUsage.py │ ├── GpuUsageTimeseries.py │ ├── InfiniBand.py │ ├── InfiniBandTimeseries.py │ ├── IpmiPower.py │ ├── Lnet.py │ ├── LoadAvg.py │ ├── Lustre.py │ ├── LustreTimeseries.py │ ├── MemBwTimeseries.py │ ├── MemUsageTimeseries.py │ ├── MemoryUsage.py │ ├── Network.py │ ├── Nfs.py │ ├── NfsTimeseries.py │ ├── NodeMemoryUsage.py │ ├── PowerUsageTimeseries.py │ ├── SimdInsTimeseries.py │ ├── SveTimeseries.py │ ├── TaccCatastrophe.py │ ├── TaccPerfCounters.py │ ├── TaccUncoreCounters.py │ ├── TimeseriesPatternsGpfs.py │ ├── TotalMemUsageTimeseries.py │ ├── UncoreCounters.py │ └── __init__.py │ ├── preprocessors │ ├── HardwareInventory.py │ ├── PerfEvent.py │ ├── Proc.py │ ├── ProcPrometheus.py │ └── __init__.py │ ├── proc_common.py │ ├── processhelpers.py │ ├── rangechange.py │ ├── scripthelpers.py │ ├── statistics.py │ ├── subsample.py │ ├── summarize.py │ ├── summarize_jobs.py │ ├── summarize_mpi.py │ ├── supremm_setup.py │ ├── supremm_testharness.py │ ├── supremm_update │ ├── supremm_upgrade.py │ ├── supremmconf.py │ ├── xdmodaccount.py │ └── xdmodstylesetupmenu.py └── tests ├── __init__.py ├── ci ├── Dockerfile ├── build.sh ├── setup.sh ├── srv │ ├── prom_cluster.txt │ ├── prometheus │ │ ├── Dockerfile │ │ ├── promdata.tar.gz │ │ ├── prometheus.yml │ │ └── web.yml │ └── services.yml └── test │ ├── Dockerfile │ └── bootstrap.sh ├── component ├── data │ ├── perfevent.0 │ ├── perfevent.index │ └── perfevent.meta └── runtests.sh ├── integration_tests ├── 5894431-1622570028 │ ├── cpn-d14-02.0 │ ├── cpn-d14-02.index │ └── cpn-d14-02.meta ├── __init__.py ├── integration_plugin_api.py ├── integration_test.bash ├── mock_preprocessor.py ├── pcp_logs_extracted │ ├── 20161229.00.10.0 │ ├── 20161229.00.10.index │ ├── 20161229.00.10.meta │ ├── job-972366-begin-20161229.23.06.00.0 │ ├── job-972366-begin-20161229.23.06.00.index │ ├── job-972366-begin-20161229.23.06.00.meta │ ├── job-972366-end-20161230.00.06.00.0 
│ ├── job-972366-end-20161230.00.06.00.index │ └── job-972366-end-20161230.00.06.00.meta ├── supremm_setup_expect.py └── throwing_plugin.py ├── testPcpArchiveProcessor.py ├── testgetoptions.py ├── testrangechange.py └── testsummarize.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.1 2 | jobs: 3 | build: 4 | docker: 5 | - image: rockylinux:8 6 | steps: 7 | - checkout 8 | - run: 9 | name: Install System Dependencies 10 | command: ./tests/ci/setup.sh build 11 | - run: 12 | name: Build Software Package 13 | command: ./tests/ci/build.sh 14 | - persist_to_workspace: 15 | root: . 16 | paths: 17 | - dist/* 18 | 19 | test: 20 | parameters: 21 | test-mode: 22 | type: string 23 | install-type: 24 | type: string 25 | docker: 26 | - image: tools-ext-01.ccr.xdmod.org/xdmod-job_performance-10.5.0:rockylinux8-0.1 27 | environment: 28 | TERM: xterm 29 | TERMINFO: /bin/bash 30 | COMPOSER_ALLOW_SUPERUSER: 1 31 | XDMOD_REALMS: 'jobs,storage,cloud' 32 | XDMOD_IS_CORE: yes 33 | XDMOD_INSTALL_DIR: /xdmod 34 | XDMOD_TEST_MODE: << parameters.test-mode >> 35 | SUPREMM_INSTALL_TYPE: << parameters.install-type >> 36 | steps: 37 | - checkout 38 | - run: 39 | name: Create Test Result Directories 40 | command: | 41 | mkdir -p shippable/testresults 42 | mkdir -p shippable/codecoverage 43 | - attach_workspace: 44 | at: . 45 | - run: 46 | name: Install Docker Compose 47 | command: | 48 | dnf install -y dnf-utils 49 | dnf config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo 50 | dnf install -y docker-ce docker-ce-cli docker-compose-plugin 51 | 52 | - setup_remote_docker 53 | - run: 54 | name: Build services 55 | command: docker compose -f ./tests/ci/srv/services.yml build 56 | - run: 57 | name: Start services 58 | command: docker compose -f ./tests/ci/srv/services.yml up -d 59 | - run: 60 | name: Run Bootstrap 61 | command: ./tests/ci/test/bootstrap.sh $SUPREMM_INSTALL_TYPE 62 | - run: 63 | name: Run Integration Tests 64 | command: ./tests/integration_tests/integration_test.bash 65 | - run: 66 | name: Run Component Tests 67 | command: ./tests/component/runtests.sh 68 | - run: 69 | name: Summarize Jobs 70 | command: summarize_jobs.py -h > /dev/null 71 | - run: 72 | name: Index Archives 73 | command: indexarchives.py -h > /dev/null 74 | - run: 75 | name: Ingest Jobs 76 | command: ingest_jobscripts.py -d 77 | - run: 78 | name: Pylint 79 | command: pylint-3 --errors-only supremm 80 | - run: 81 | name: Pytest 82 | command: pytest-3 --junitxml=shippable/testresults/testreport.xml --cov=supremm --cov-report xml:shippable/codecoverage/coverage.xml 83 | - run: 84 | name: Remove Currently Installed SUPREMM 85 | command: dnf remove -y supremm 86 | - store_test_results: 87 | path: shippable/testresults 88 | - store_artifacts: 89 | path: shippable/codecoverage 90 | - store_artifacts: 91 | path: /var/log/xdmod 92 | 93 | workflows: 94 | full-build: 95 | jobs: 96 | - build 97 | - test: 98 | matrix: 99 | parameters: 100 | test-mode: ["fresh_install", "upgrade"] 101 | install-type: ["rpm", "wheel", "src"] 102 | requires: 103 | - build 104 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | dist/ 3 | *.egg-info/ 4 | *.pyc 5 | *.so 6 | pypmlogextract.c 7 | pcpcinterface.c 8 | 9 | .idea/ 10 | .vscode/ 11 | .cache/ 12 | 13 | # ci or testing files 14 | .coverage 15 | shippable/ 16 | 
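
A note on the CircleCI workflow above: the full-build workflow fans the test job out over every combination of test-mode ("fresh_install", "upgrade") and install-type ("rpm", "wheel", "src"), so a single pipeline run produces six test jobs, each receiving its combination through the XDMOD_TEST_MODE and SUPREMM_INSTALL_TYPE environment variables. The short Python sketch below is not part of the repository; it simply enumerates the same matrix and prints equivalent local bootstrap invocations (the bootstrap path and variable names are taken from the config above, everything else is illustrative):

#!/usr/bin/env python3
"""Hypothetical helper: enumerate the CircleCI test matrix for local runs."""
import itertools

TEST_MODES = ["fresh_install", "upgrade"]    # test-mode values from .circleci/config.yml
INSTALL_TYPES = ["rpm", "wheel", "src"]      # install-type values from .circleci/config.yml

for test_mode, install_type in itertools.product(TEST_MODES, INSTALL_TYPES):
    # Each CircleCI matrix job exports these variables and then runs the bootstrap step
    print(f"XDMOD_TEST_MODE={test_mode} SUPREMM_INSTALL_TYPE={install_type} "
          f"./tests/ci/test/bootstrap.sh {install_type}")
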
-------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | SUPReMM Summarization 2 | ===================== 3 | 4 | The code for the SUPReMM summarization package was originally written by and includes contributions from: 5 | 6 | - Bill Barth 7 | - Andrew E. Bruno 8 | - Richard T. Evans 9 | - John L. Hammond 10 | - Martins Innus 11 | - Kyle Markus 12 | - Jeffrey T. Palmer 13 | - Joseph P. White 14 | - Thomas Yearke 15 | 16 | Contributors 17 | 18 | - Trey Dockendorf 19 | - Ian DesJardin 20 | - Alex Kofke 21 | -------------------------------------------------------------------------------- /config/ccr-files/supremm_update_ccr: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | #------------------------------------------------------------------------- 4 | # Configurable settigns 5 | 6 | PREFIX=/data/xdtas 7 | LOCKFILE=/var/tmp/supremm_summary.lock 8 | INSTALLPATH=$PREFIX/opt 9 | 10 | GITHUBKEY=$HOME/.ssh/id_rsa_github 11 | 12 | THREADS=`nproc --ignore=2` 13 | 14 | ulimit -n4096 15 | 16 | #------------------------------------------------------------------------- 17 | # Main script 18 | 19 | reportfail() 20 | { 21 | echo "Not running due to another process holding the lock" 22 | exit 1 23 | } 24 | 25 | ( 26 | flock -n 9 || reportfail 27 | 28 | set -e 29 | 30 | #------------------------------------------------------------------------- 31 | # Pull the latest from git and install package from source 32 | 33 | ssh-agent sh -c "ssh-add $GITHUBKEY 2> /dev/null && cd $PREFIX/ccr-pcp && git pull -q" > /dev/null 34 | 35 | cd $PREFIX/ccr-pcp/scripts 36 | python setup.py install --prefix=$INSTALLPATH > /dev/null 37 | 38 | #------------------------------------------------------------------------- 39 | # Run index and ingest 40 | 41 | PATH=$INSTALLPATH/bin:$PATH 42 | PYTHONPATH=$INSTALLPATH/lib64/python2.7/site-packages 43 | 44 | export PYTHONPATH PATH 45 | 46 | if [ "$1" != "process" ]; then 47 | indexarchives.py 48 | account.py 49 | summarize_jobs.py -t $THREADS -q 50 | else 51 | summarize_jobs.py -t $THREADS -d 52 | fi 53 | 54 | ) 9>${LOCKFILE} 55 | 56 | -------------------------------------------------------------------------------- /config/config.json: -------------------------------------------------------------------------------- 1 | { 2 | // Configuration settings for job data input 3 | // The database configuration below assumes everything on the same host and 4 | // XDMoD is installed on the host too. 5 | // 6 | // It is not necessary for all databases to run locally. A more complex 7 | // configuration could have multiple hosts and different credentials for 8 | // each DB. 9 | "xdmodroot": "/etc/xdmod", 10 | "datawarehouse": { 11 | "include": "xdmod://datawarehouse" 12 | }, 13 | // Configuration setting for summary document output 14 | "outputdatabase": { 15 | "db_engine": "mongodb", 16 | "uri": "mongodb://localhost:27017/supremm", 17 | //"uri": "mongodb://username:password@localhost/supremm", 18 | //"uri": "mongodb://mongodb-server1:27017,mongodb-server2:27017/supremm?replicaSet=foo", 19 | //"uri": "mongodb://username:password@mongodb-server1:27017,mongodb-server2:27017/supremm?replicaSet=foo", 20 | "dbname": "supremm" 21 | }, 22 | "summary": { 23 | // The archive out directory should be writable by the process that runs 24 | // the summaries. 
25 | "archive_out_dir": "/dev/shm/supremm_test", 26 | // The following substitutions are defined for the job archive subdirectory: 27 | // %r means the resource name 28 | // %j the local job id 29 | // the rest is sent to strftime with the end time of the job 30 | // Common examples: 31 | // %Y-%m-%d/%r/%j includes the date/resource/jobid in the path 32 | "subdir_out_format": "%r/%j" 33 | }, 34 | "resources": { 35 | // Edit the below to match your cluster name and data locations 36 | "my_cluster_name": { 37 | "enabled": true, 38 | "resource_id": 1, 39 | "batch_system": "XDMoD", 40 | "hostname_mode": "hostname", 41 | "pcp_log_dir": "/data/pcp-logs/my_cluster_name", 42 | "script_dir": "/data/jobscripts/my_cluster_name", 43 | 44 | // fast_index uses an alternative method of indexing job-level pcp archives which can significantly speed 45 | // up the indexarchives.py script. The tradeoff is that the indexed archive end time is not found and the 46 | // start time is slightly less accurate. For normal summarization usage this doesn't matter, but set this 47 | // to false if you need that data for other purposes. 48 | "fast_index": true 49 | 50 | // When using fast_index mode, if the timezone of the resource where the pcp logs were collected is 51 | // different than the timezone of the computer running the indexing, the timezone of the resource 52 | // must be specified here. 53 | //,"timezone": "America/New_York" 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /config/prometheus/mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "common": { 3 | "params": ["host"], 4 | "defaults": {"environment": "prod"} 5 | }, 6 | "metrics": { 7 | "cgroup.memory.usage": { 8 | "name": "cgroup_memory_used_bytes", 9 | "params": ["cgroup"], 10 | "groupby": "cgroup" 11 | }, 12 | "cgroup.memory.limit": { 13 | "name": "cgroup_memory_total_bytes", 14 | "params": ["cgroup"], 15 | "groupby": "cgroup" 16 | }, 17 | "disk.dev.read": { 18 | "name": "node_disk_reads_completed_total", 19 | "groupby": "device" 20 | }, 21 | "disk.dev.read_bytes": { 22 | "name": "node_disk_read_bytes_total", 23 | "scaling": "0.0009765625", 24 | "groupby": "device" 25 | }, 26 | "disk.dev.write": { 27 | "name": "node_disk_writes_completed_total", 28 | "groupby": "device" 29 | }, 30 | "disk.dev.write_bytes": { 31 | "name": "node_disk_written_bytes_total", 32 | "scaling": "0.0009765625", 33 | "groupby": "device" 34 | }, 35 | "infiniband.port.switch.in.bytes": { 36 | "name": "node_infiniband_port_data_received_bytes_total", 37 | "groupby": "port", 38 | "out_fmt": ["{}:{}", "device", "port"] 39 | }, 40 | "infiniband.port.switch.in.packets": { 41 | "name": "node_infiniband_port_packets_received_total", 42 | "groupby": "port", 43 | "out_fmt": ["{}:{}", "device", "port"] 44 | }, 45 | "infiniband.port.switch.out.bytes": { 46 | "name": "node_infiniband_port_data_transmitted_bytes_total", 47 | "groupby": "port", 48 | "out_fmt": ["{}:{}", "device", "port"] 49 | }, 50 | "infiniband.port.switch.out.packets": { 51 | "name": "node_infiniband_port_packets_transmitted_total", 52 | "groupby": "port", 53 | "out_fmt": ["{}:{}", "device", "port"] 54 | }, 55 | "ipmi.dcmi.power": { 56 | "name": "ipmi_dcmi_power_consumption_watts", 57 | "groupby": "host" 58 | }, 59 | "kernel.all.load": { 60 | "name": "node_load1", 61 | "groupby": "host" 62 | }, 63 | "kernel.percpu.cpu.user": { 64 | "name": "node_cpu_seconds_total", 65 | "defaults": {"mode" : "user"}, 66 | 
"scaling": "1000", 67 | "groupby": "cpu", 68 | "out_fmt": ["cpu{}", "cpu"] 69 | }, 70 | "kernel.percpu.cpu.idle": { 71 | "name": "node_cpu_seconds_total", 72 | "defaults": {"mode" : "idle"}, 73 | "scaling": "1000", 74 | "groupby": "cpu", 75 | "out_fmt": ["cpu{}", "cpu"] 76 | }, 77 | "kernel.percpu.cpu.nice": { 78 | "name": "node_cpu_seconds_total", 79 | "defaults": {"mode" : "nice"}, 80 | "scaling": "1000", 81 | "groupby": "cpu", 82 | "out_fmt": ["cpu{}", "cpu"] 83 | }, 84 | "kernel.percpu.cpu.sys": { 85 | "name": "node_cpu_seconds_total", 86 | "defaults": {"mode" : "system"}, 87 | "scaling": "1000", 88 | "groupby": "cpu", 89 | "out_fmt": ["cpu{}", "cpu"] 90 | }, 91 | "kernel.percpu.cpu.wait.total": { 92 | "name": "node_cpu_seconds_total", 93 | "defaults": {"mode" : "iowait"}, 94 | "scaling": "1000", 95 | "groupby": "cpu", 96 | "out_fmt": ["cpu{}", "cpu"] 97 | }, 98 | "kernel.percpu.cpu.irq.hard": { 99 | "name": "node_cpu_seconds_total", 100 | "defaults": {"mode" : "irq"}, 101 | "scaling": "1000", 102 | "groupby": "cpu", 103 | "out_fmt": ["cpu{}", "cpu"] 104 | }, 105 | "kernel.percpu.cpu.irq.soft": { 106 | "name": "node_cpu_seconds_total", 107 | "defaults": {"mode" : "softirq"}, 108 | "scaling": "1000", 109 | "groupby": "cpu", 110 | "out_fmt": ["cpu{}", "cpu"] 111 | }, 112 | "mem.numa.util.filePages": { 113 | "name": "node_memory_numa_FilePages", 114 | "groupby": "node" 115 | }, 116 | "mem.numa.util.slab": { 117 | "name": "node_memory_numa_Slab", 118 | "groupby": "node" 119 | }, 120 | "mem.numa.util.used": { 121 | "name": "node_memory_numa_MemUsed", 122 | "groupby": "node" 123 | }, 124 | "mem.freemem": { 125 | "name": "node_memory_MemFree_bytes", 126 | "scaling": "0.0009765625", 127 | "groupby": "host" 128 | }, 129 | "mem.physmem": { 130 | "name": "node_memory_MemTotal_bytes", 131 | "scaling": "0.0009765625", 132 | "groupby": "host" 133 | }, 134 | "network.interface.in.bytes": { 135 | "name": "node_network_receive_bytes_total", 136 | "groupby": "device" 137 | }, 138 | "network.interface.out.bytes": { 139 | "name": "node_network_transmit_bytes_total", 140 | "groupby": "device" 141 | }, 142 | "nvidia.gpuactive": { 143 | "name": "DCGM_FI_DEV_GPU_UTIL", 144 | "groupby": "gpu", 145 | "out_fmt": ["gpu{}", "gpu"] 146 | }, 147 | "nvidia.memused": { 148 | "name": "DCGM_FI_DEV_FB_USED", 149 | "groupby": "gpu", 150 | "out_fmt": ["gpu{}", "gpu"] 151 | }, 152 | "nvidia.powerused": { 153 | "name": "DCGM_FI_DEV_POWER_USAGE", 154 | "scaling": "1000", 155 | "groupby": "gpu", 156 | "out_fmt": ["gpu{}", "gpu"] 157 | }, 158 | "prom:cgroup_cpu_info": { 159 | "name": "cgroup_cpu_info", 160 | "params": ["cgroup"], 161 | "groupby": "cpus" 162 | }, 163 | "prom:cgroup_process_exec_count": { 164 | "name": "cgroup_process_exec_count", 165 | "params": ["cgroup"], 166 | "groupby": "exec" 167 | } 168 | } 169 | } 170 | -------------------------------------------------------------------------------- /config/templates/hotproc/hotproc.conf: -------------------------------------------------------------------------------- 1 | #pmdahotproc 2 | Version 1.0 3 | 4 | ( (uname != "root") && (uname != "rpc") && (uname != "rpcuser") && (uname != "dbus") && (uname != "avahi") && (uname != "munge") && (uname != "ntp") && (uname != "nagios") && (uname != "postfix") && (uname != "pcp") && (uname != "libstoragemgmt") && (uname != "chrony") && (uname != "polkitd") ) || cpuburn > 0.1 5 | -------------------------------------------------------------------------------- /config/templates/pmlogger/control: 
-------------------------------------------------------------------------------- 1 | # 2 | # PCP archive logging configuration/control 3 | # 4 | # This file is used by various of the PCP archive logging administrative 5 | # tools to perform maintenance on the pmlogger instances running on 6 | # the local host. 7 | # 8 | # This file contains one line per host to be logged, fields are 9 | # Host name of host to be logged 10 | # P(rimary) is this the primary logger? y or n 11 | # S(ocks) should this logger be launched with pmsocks? y or n 12 | # Directory full pathname to directory where archive logs are 13 | # to be maintained ... note all scripts "cd" to here as 14 | # a first step 15 | # Args optional additional arguments to pmlogger and/or pmnewlog 16 | # 17 | 18 | # === VARIABLE ASSIGNMENTS === 19 | # 20 | # DO NOT REMOVE OR EDIT THE FOLLOWING LINE 21 | $version=1.1 22 | 23 | # if pmsocks is being used, edit the IP address for $SOCKS_SERVER 24 | #$SOCKS_SERVER=123.456.789.123 25 | 26 | # for remote loggers running over a WAN with potentially long delays 27 | $PMCD_CONNECT_TIMEOUT=150 28 | $PMCD_REQUEST_TIMEOUT=120 29 | 30 | # === LOGGER CONTROL SPECIFICATIONS === 31 | # 32 | #Host P? S? directory args 33 | 34 | # local primary logger 35 | # 36 | # (LOCALHOSTNAME is expanded to local: in the first column, 37 | # and to `hostname` in the fourth (directory) column.) 38 | # 39 | LOCALHOSTNAME y n "PCP_LOG_DIR/pmlogger/$(date +%Y)/$(date +%m)/LOCALHOSTNAME/$(date +%Y)-$(date +%m)-$(date +%d)" -r -c /etc/pcp/pmlogger/pmlogger-supremm.config 40 | 41 | # Note: if multiple pmloggers for the same host (e.g. both primary and 42 | # non-primary loggers are active), then they MUST use different 43 | # directories 44 | 45 | # local non-primary logger 46 | #LOCALHOSTNAME n n PCP_LOG_DIR/pmlogger/mysummary -r -T24h10m -c config.Summary 47 | 48 | # remote host 49 | #remote n n PCP_LOG_DIR/pmlogger/remote -r -T24h10m -c config.remote 50 | 51 | # thru the firewall via socks 52 | #distant n y PCP_LOG_DIR/pmlogger/distant -r -T24h10m -c config.distant 53 | -------------------------------------------------------------------------------- /config/templates/slurm/slurm-epilog: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use strict; 4 | use Sys::Hostname; 5 | use Date::Simple; 6 | use POSIX qw(strftime); 7 | 8 | my $jobid = $ENV{SLURM_JOB_ID}; 9 | my $today = Date::Simple->new; 10 | 11 | my $logyear = $today->format("%Y"); 12 | my $logmonth = $today->format("%m"); 13 | my $logday = $today->format("%d"); 14 | 15 | my $jobdatelong = strftime "%Y%m%d.%H.%M.%S", localtime; 16 | my $fullhost = hostname(); 17 | 18 | # PCP End of job logging 19 | 20 | my $logdir = "//supremm/pmlogger/$logyear/$logmonth/$fullhost/$logyear-$logmonth-$logday"; 21 | system("env PMLOGGER_EPILOG=yes pmlogger -U pcp -c /etc/pcp/pmlogger/pmlogger-supremm.config -s 1 -l /tmp/job-$jobid-end-$jobdatelong.log $logdir/job-$jobid-end-$jobdatelong &> /dev/null"); 22 | 23 | exit(0); 24 | -------------------------------------------------------------------------------- /config/templates/slurm/slurm-prolog: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use strict; 4 | use Sys::Hostname; 5 | use Date::Simple; 6 | use POSIX qw(strftime setsid); 7 | 8 | my $today = Date::Simple->new; 9 | 10 | my $logyear = $today->format("%Y"); 11 | my $logmonth = $today->format("%m"); 12 | my $logday = $today->format("%d"); 13 | 14 | my 
$jobdatelong = strftime "%Y%m%d.%H.%M.%S", localtime; 15 | my $host = ( split('\.', hostname()) )[0]; 16 | my $jobid = $ENV{SLURM_JOB_ID}; 17 | 18 | # PCP job Start Logging 19 | 20 | my $fullhost = hostname(); 21 | 22 | # This logs every 10 seconds for a total of 30 seconds after the job start time 23 | # Slurm kills child processes of the prolog so we need to start our own process group to keep this running for 30 seconds after prolog is done 24 | 25 | $SIG{CHLD} = 'IGNORE'; # Configure to autoreap zombies 26 | exit(0) unless defined ( my $child = fork ); # fork, or just exit if failed 27 | exit(0) if $child; # Main prolog exits with success 28 | 29 | # Below this is now the child 30 | setsid(); # Become session leader 31 | open( STDIN, "< /dev/null" ); # Detach STDIN from shell 32 | open( STDOUT, "> /dev/null" ); # Detach STDOUT from shell 33 | open( STDERR, ">&STDOUT" ); # Detach STDERR from shell 34 | chdir '/tmp'; # Change working directory 35 | umask(0); # Reset umask 36 | 37 | my $logdir = "//supremm/pmlogger/$logyear/$logmonth/$fullhost/$logyear-$logmonth-$logday"; 38 | 39 | # The prolog config runs every 10 seconds, we exit after 4 samples 40 | system("env PMLOGGER_PROLOG=yes pmlogger -U pcp -c /etc/pcp/pmlogger/pmlogger-supremm.config -s 4 -l /tmp/job-$jobid-begin-$jobdatelong.log $logdir/job-$jobid-begin-$jobdatelong &> /dev/null"); 41 | 42 | exit(0) 43 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | python_files = test*.py 3 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_rpm] 2 | release = 1%%{?dist} 3 | build_requires = python36-devel, pcp-libs-devel >= 5.3, pcp-libs-devel < 5.4 4 | requires = python3, python3-pymongo, python3-numpy, python3-scipy, python3-PyMySQL, python3-pcp >= 5.3, python3-pcp < 5.4, pcp-libs >= 5.3, pcp-libs < 5.4, python3-Cython, python3-pytz, python3-requests 5 | install_script = .rpm_install_script.txt 6 | 7 | [bdist_wheel] 8 | python-tag = py36 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ setup script for SUPReMM job summarization utilities """ 3 | import sys 4 | import os 5 | from setuptools import setup, find_packages, Extension 6 | import numpy 7 | 8 | from Cython.Build import cythonize 9 | 10 | # For rpm-based builds we want the configuration files to 11 | # go in the standard location.
Also need to rewrite the file list so that 12 | # the config filesa are listed as %config(noreplace) 13 | IS_RPM_BUILD = False 14 | if 'bdist_rpm' in sys.argv or 'RPM_BUILD_ROOT' in os.environ: 15 | IS_RPM_BUILD = True 16 | confpath = '/etc/supremm' 17 | with open('.rpm_install_script.txt', 'w') as fp: 18 | fp.write('%s %s install -O1 --root=$RPM_BUILD_ROOT --record=INSTALLED_FILES\n' % (sys.executable, os.path.basename(sys.argv[0]))) 19 | fp.write('sed -i \'s#^\\(%s\\)#%%config(noreplace) \\1#\' INSTALLED_FILES\n' % (confpath, )) 20 | else: 21 | confpath = 'etc/supremm' 22 | 23 | 24 | setup( 25 | name='supremm', 26 | version='2.0.0', 27 | description='SUPReMM Job Summarization Utilities', 28 | long_description='Utilities for generating job-level summary data from host level PCP archives.\nAlso includes template configuration files for running PCP on an HPC system.', 29 | license='LGPLv3', 30 | author='Joseph P White', 31 | author_email='jpwhite4@buffalo.edu', 32 | url='https://github.com/ubccr/supremm', 33 | 34 | zip_safe=False, 35 | package_dir={'': 'src'}, 36 | packages=find_packages(where='src'), 37 | package_data={ 38 | 'supremm': ['assets/modw_supremm.sql', 'assets/mongo_setup.js', '*.pxd', '*.pyx'], 39 | 'supremm.datasource.pcp.pcpcinterface': ['*.pxd', '*.pyx'] 40 | }, 41 | data_files=[ 42 | (confpath, ['config/config.json', 'config/prometheus/mapping.json']), 43 | ('share/supremm/templates/slurm', ['config/templates/slurm/slurm-epilog', 'config/templates/slurm/slurm-prolog']), 44 | ('share/supremm/templates/hotproc', ['config/templates/hotproc/hotproc.conf']), 45 | ('share/supremm/templates/pmlogger', ['config/templates/pmlogger/control', 'config/templates/pmlogger/pmlogger-supremm.config']) 46 | ], 47 | scripts=[ 48 | 'src/supremm/supremm_update' 49 | ], 50 | entry_points={ 51 | 'console_scripts': [ 52 | 'gen-pmlogger-control.py = supremm.gen_pmlogger_control:main', 53 | 'summarize_jobs.py = supremm.summarize_jobs:main', 54 | 'summarize_mpi.py = supremm.summarize_mpi:main', 55 | 'indexarchives.py = supremm.datasource.pcp.indexarchives:runindexing', 56 | 'account.py = supremm.account:runingest', 57 | 'supremmconf.py = supremm.supremmconf:main', 58 | 'supremm-setup = supremm.supremm_setup:main', 59 | 'supremm-upgrade = supremm.supremm_upgrade:main', 60 | 'ingest_jobscripts.py = supremm.ingest_jobscripts:main' 61 | 62 | ] 63 | }, 64 | install_requires=[ 65 | 'numpy', 66 | 'PyMySQL', 67 | 'pcp', 68 | 'Cython', 69 | 'scipy', 70 | 'pymongo', 71 | 'pytz', 72 | 'requests' 73 | ], 74 | ext_modules=cythonize([ 75 | Extension("supremm.datasource.pcp.pcpcinterface.pcpcinterface", ["src/supremm/datasource/pcp/pcpcinterface/pcpcinterface.pyx"], libraries=["pcp"], include_dirs=[numpy.get_include()]) 76 | ]) 77 | ) 78 | 79 | if IS_RPM_BUILD: 80 | os.unlink('.rpm_install_script.txt') 81 | -------------------------------------------------------------------------------- /src/supremm/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /src/supremm/__init__.py: -------------------------------------------------------------------------------- 1 | """ The SUPReMM module contains software that generates job-level summaries from PCP archives """ 2 | -------------------------------------------------------------------------------- /src/supremm/accounting.py: -------------------------------------------------------------------------------- 1 | """ definition of the accounting 
API and implementations of some base classes that 2 | include common functions """ 3 | 4 | from abc import ABCMeta, abstractmethod 5 | 6 | class Accounting(object, metaclass=ABCMeta): 7 | """ abstract base class describing the job accounting interface """ 8 | 9 | PROCESS_VERSION = 1 10 | 11 | def __init__(self, resource_id, config): 12 | self._resource_id = resource_id 13 | self._config = config 14 | 15 | @abstractmethod 16 | def getbylocaljobid(self, localjobid): 17 | """ Yields one or more Jobs that match the localjobid """ 18 | pass 19 | 20 | @abstractmethod 21 | def getbytimerange(self, start, end, onlynew): 22 | """ Search for all jobs based on the time interval. Matches based on the end 23 | timestamp of the job """ 24 | pass 25 | 26 | @abstractmethod 27 | def get(self, start, end): 28 | """ Yields all unprocessed jobs. Optionally specify a time interval to process""" 29 | pass 30 | 31 | @abstractmethod 32 | def markasdone(self, job, success, elapsedtime): 33 | """ log a job as being processed (either successfully or not) """ 34 | pass 35 | 36 | class ArchiveCache(object, metaclass=ABCMeta): 37 | """ abstract base class describing the job archive cache interface """ 38 | 39 | def __init__(self, config): 40 | self._config = config 41 | 42 | @abstractmethod 43 | def insert(self, resource_id, hostname, filename, start, end, jobid): 44 | """ insert a record into the cache """ 45 | pass 46 | 47 | @abstractmethod 48 | def insert_from_files(self, paths_file, joblevel_file, nodelevel_file): 49 | pass 50 | 51 | @abstractmethod 52 | def postinsert(self): 53 | """ Must be called after insert. """ 54 | pass 55 | -------------------------------------------------------------------------------- /src/supremm/assets/modw_pcp.sql: -------------------------------------------------------------------------------- 1 | -- MySQL dump 10.13 Distrib 5.5.41, for debian-linux-gnu (x86_64) 2 | -- 3 | -- Host: localhost Database: modw_pcp 4 | -- ------------------------------------------------------ 5 | -- Server version 5.5.41-0ubuntu0.12.04.1-log 6 | 7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 10 | /*!40101 SET NAMES utf8 */; 11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; 12 | /*!40103 SET TIME_ZONE='+00:00' */; 13 | /*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */; 14 | /*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */; 15 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */; 16 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 17 | 18 | -- 19 | -- Current Database: `modw_pcp` 20 | -- 21 | 22 | CREATE DATABASE /*!32312 IF NOT EXISTS*/ `modw_pcp` /*!40100 DEFAULT CHARACTER SET utf8 COLLATE utf8_unicode_ci */; 23 | 24 | USE `modw_pcp`; 25 | 26 | -- 27 | -- Table structure for table `archive` 28 | -- 29 | 30 | DROP TABLE IF EXISTS `archive`; 31 | /*!40101 SET @saved_cs_client = @@character_set_client */; 32 | /*!40101 SET character_set_client = utf8 */; 33 | CREATE TABLE `archive` ( 34 | `id` int(11) NOT NULL AUTO_INCREMENT, 35 | `hostid` int(11) NOT NULL, 36 | `filename` varchar(255) COLLATE utf8_unicode_ci NOT NULL, 37 | `start_time_ts` double NOT NULL, 38 | `end_time_ts` double NOT NULL, 39 | `jobid` varchar(45) DEFAULT NULL, 40 | PRIMARY KEY (`id`), 41 | UNIQUE KEY `unique` (`filename`), 42 | KEY `fk_archive_1_idx` (`hostid`), 43 | CONSTRAINT `fk_archive_1` 
FOREIGN KEY (`hostid`) REFERENCES `hosts` (`id`) ON DELETE NO ACTION ON UPDATE NO ACTION 44 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; 45 | /*!40101 SET character_set_client = @saved_cs_client */; 46 | 47 | -- 48 | -- Table structure for table `hosts` 49 | -- 50 | 51 | DROP TABLE IF EXISTS `hosts`; 52 | /*!40101 SET @saved_cs_client = @@character_set_client */; 53 | /*!40101 SET character_set_client = utf8 */; 54 | CREATE TABLE `hosts` ( 55 | `id` int(11) NOT NULL AUTO_INCREMENT, 56 | `hostname` varchar(255) COLLATE utf8_unicode_ci NOT NULL, 57 | PRIMARY KEY (`id`), 58 | UNIQUE KEY `UNIQUE` (`hostname`) 59 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; 60 | /*!40101 SET character_set_client = @saved_cs_client */; 61 | 62 | -- 63 | -- Table structure for table `job` 64 | -- 65 | 66 | DROP TABLE IF EXISTS `job`; 67 | /*!40101 SET @saved_cs_client = @@character_set_client */; 68 | /*!40101 SET character_set_client = utf8 */; 69 | CREATE TABLE `job` ( 70 | `id` int(11) NOT NULL AUTO_INCREMENT, 71 | `resource_id` int(11) NOT NULL, 72 | `local_job_id` int(11) NOT NULL, 73 | `start_time_ts` int(11) NOT NULL, 74 | `end_time_ts` int(11) NOT NULL, 75 | `record` blob, 76 | PRIMARY KEY (`id`), 77 | UNIQUE KEY `UNIQUE` (`resource_id`,`local_job_id`,`end_time_ts`) 78 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; 79 | /*!40101 SET character_set_client = @saved_cs_client */; 80 | 81 | -- 82 | -- Table structure for table `jobhosts` 83 | -- 84 | 85 | DROP TABLE IF EXISTS `jobhosts`; 86 | /*!40101 SET @saved_cs_client = @@character_set_client */; 87 | /*!40101 SET character_set_client = utf8 */; 88 | CREATE TABLE `jobhosts` ( 89 | `jobid` int(11) NOT NULL, 90 | `hostid` int(11) NOT NULL, 91 | UNIQUE KEY `UNIQUE` (`jobid`,`hostid`), 92 | KEY `fk_jobhosts_2_idx` (`hostid`), 93 | CONSTRAINT `fk_jobhosts_1` FOREIGN KEY (`jobid`) REFERENCES `job` (`id`) ON DELETE CASCADE ON UPDATE CASCADE, 94 | CONSTRAINT `fk_jobhosts_2` FOREIGN KEY (`hostid`) REFERENCES `hosts` (`id`) ON DELETE CASCADE ON UPDATE CASCADE 95 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; 96 | /*!40101 SET character_set_client = @saved_cs_client */; 97 | 98 | -- 99 | -- Table structure for table `process` 100 | -- 101 | 102 | DROP TABLE IF EXISTS `process`; 103 | /*!40101 SET @saved_cs_client = @@character_set_client */; 104 | /*!40101 SET character_set_client = utf8 */; 105 | CREATE TABLE `process` ( 106 | `jobid` int(11) NOT NULL, 107 | `process_version` int(11) NOT NULL DEFAULT '0', 108 | `process_timestamp` timestamp NULL DEFAULT NULL, 109 | `process_time` double DEFAULT NULL, 110 | `ingest_version` int(11) NOT NULL, 111 | `ingest_timestamp` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, 112 | PRIMARY KEY (`jobid`), 113 | KEY `proc` (`process_version`), 114 | CONSTRAINT `fk_process_1` FOREIGN KEY (`jobid`) REFERENCES `job` (`id`) ON DELETE CASCADE ON UPDATE CASCADE 115 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; 116 | /*!40101 SET character_set_client = @saved_cs_client */; 117 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; 118 | 119 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; 120 | /*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */; 121 | /*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */; 122 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 123 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 124 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 125 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; 126 | 
127 | -- Dump completed on 2015-05-19 11:00:21 128 | -------------------------------------------------------------------------------- /src/supremm/assets/modw_supremm.sql: -------------------------------------------------------------------------------- 1 | -- MySQL dump 10.13 Distrib 5.5.41, for debian-linux-gnu (x86_64) 2 | -- 3 | -- Host: localhost Database: modw_pcp 4 | -- ------------------------------------------------------ 5 | -- Server version 5.5.41-0ubuntu0.12.04.1-log 6 | 7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 10 | /*!40101 SET NAMES utf8 */; 11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; 12 | /*!40103 SET TIME_ZONE='+00:00' */; 13 | /*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */; 14 | /*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */; 15 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */; 16 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 17 | 18 | -- 19 | -- Current Database: `modw_pcp` 20 | -- 21 | 22 | CREATE DATABASE /*!32312 IF NOT EXISTS*/ `modw_supremm` /*!40100 DEFAULT CHARACTER SET utf8 COLLATE utf8_unicode_ci */; 23 | 24 | USE `modw_supremm`; 25 | 26 | -- 27 | -- Table structure for table `archive_paths` 28 | -- 29 | 30 | DROP TABLE IF EXISTS `archive_paths`; 31 | /*!40101 SET @saved_cs_client = @@character_set_client */; 32 | /*!40101 SET character_set_client = utf8 */; 33 | CREATE TABLE `archive_paths` ( 34 | `id` int(11) NOT NULL AUTO_INCREMENT, 35 | `filename` varchar(255) COLLATE utf8_unicode_ci NOT NULL, 36 | PRIMARY KEY (`id`), 37 | UNIQUE KEY `filename` (`filename`) 38 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; 39 | /*!40101 SET character_set_client = @saved_cs_client */; 40 | 41 | -- 42 | -- Table structure for table `archives_joblevel` 43 | -- 44 | 45 | DROP TABLE IF EXISTS `archives_joblevel`; 46 | /*!40101 SET @saved_cs_client = @@character_set_client */; 47 | /*!40101 SET character_set_client = utf8 */; 48 | CREATE TABLE `archives_joblevel` ( 49 | `archive_id` int(11) NOT NULL, 50 | `host_id` int(11) NOT NULL, 51 | `local_jobid` int(11) NOT NULL DEFAULT '-1', 52 | `local_job_array_index` int(11) NOT NULL DEFAULT '-1', 53 | `local_job_id_raw` int(11) NOT NULL, 54 | `start_time_ts` int(11) NOT NULL, 55 | `end_time_ts` int(11) NOT NULL, 56 | PRIMARY KEY (`archive_id`), 57 | KEY `hostjobs` (`host_id`,`local_job_id_raw`) 58 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; 59 | /*!40101 SET character_set_client = @saved_cs_client */; 60 | 61 | -- 62 | -- Table structure for table `archives_nodelevel` 63 | -- 64 | 65 | DROP TABLE IF EXISTS `archives_nodelevel`; 66 | /*!40101 SET @saved_cs_client = @@character_set_client */; 67 | /*!40101 SET character_set_client = utf8 */; 68 | CREATE TABLE `archives_nodelevel` ( 69 | `archive_id` int(11) NOT NULL, 70 | `host_id` int(11) NOT NULL, 71 | `start_time_ts` int(11) NOT NULL, 72 | `end_time_ts` int(11) NOT NULL, 73 | PRIMARY KEY (`archive_id`), 74 | KEY `hosttimes` (`host_id`,`start_time_ts`,`end_time_ts`) 75 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; 76 | /*!40101 SET character_set_client = @saved_cs_client */; 77 | 78 | -- 79 | -- Table structure for table `process` 80 | -- 81 | 82 | DROP TABLE IF EXISTS `process`; 83 | /*!40101 SET @saved_cs_client = @@character_set_client */; 84 | /*!40101 SET 
character_set_client = utf8 */; 85 | CREATE TABLE `process` ( 86 | `jobid` int(11) NOT NULL, 87 | `process_version` int(11) NOT NULL DEFAULT '0', 88 | `process_timestamp` timestamp NULL DEFAULT NULL, 89 | `process_time` double DEFAULT '0', 90 | PRIMARY KEY (`jobid`), 91 | KEY `proc` (`process_version`) 92 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; 93 | /*!40101 SET character_set_client = @saved_cs_client */; 94 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; 95 | 96 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; 97 | /*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */; 98 | /*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */; 99 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 100 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 101 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 102 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; 103 | 104 | -- Dump completed on 2015-05-19 11:00:21 105 | -------------------------------------------------------------------------------- /src/supremm/config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Configuration data management """ 3 | import os 4 | import json 5 | import configparser 6 | import re 7 | import glob 8 | import pkg_resources 9 | import logging 10 | 11 | 12 | def iscomment(line): 13 | """ check is line is a c++ style comment """ 14 | if re.search(r"^\s*//", line): 15 | return True 16 | return False 17 | 18 | class Config(object): 19 | """ Configuration data management 20 | The configuration file format is similar to json except lines that begin "//" 21 | are treated as comments and are ignored. Also the string \n[:space:]// is not permitted 22 | anywhere in json key or value. 
23 | """ 24 | 25 | def __init__(self, confpath=None): 26 | 27 | if confpath == None: 28 | confpath = self.autodetectconfpath() 29 | 30 | if confpath is None or os.path.isdir(confpath) == False: 31 | raise Exception("Missing configuration path %s" % confpath) 32 | 33 | conffile = os.path.join(confpath, "config.json") 34 | logging.debug("Using config file %s", conffile) 35 | with open(conffile, "r") as conffp: 36 | confdata = "" 37 | for line in conffp: 38 | if not iscomment(line): 39 | confdata += line 40 | try: 41 | self._config = json.loads(confdata) 42 | except ValueError as exc: 43 | raise Exception("Syntax error in %s.\n%s" % (conffile, str(exc))) 44 | 45 | self._xdmodconfig = None 46 | 47 | def __str__(self): 48 | return json.dumps(self._config, indent=4) 49 | 50 | @staticmethod 51 | def autodetectconfpath(filename="config.json"): 52 | """ search known paths for the configuration directory 53 | List of paths support the three typical install locations 54 | 1) Environment variable SUPREMM_CONFIG_DIR 55 | 2) source install with pip 56 | 3) rpm based install 57 | 4) source install with python setup.py install 58 | @returns Directory name or None if no suitable directory found 59 | """ 60 | searchpaths = [ 61 | os.getenv('SUPREMM_CONFIG_DIR', os.path.dirname(os.path.abspath(__file__)) + "/../../../../etc/supremm"), 62 | "/etc/supremm", 63 | pkg_resources.resource_filename(pkg_resources.Requirement.parse("supremm"), "etc/supremm") 64 | ] 65 | 66 | for path in searchpaths: 67 | if os.path.exists(os.path.join(path, filename)): 68 | return os.path.abspath(path) 69 | 70 | return None 71 | 72 | def getsection(self, sectionname): 73 | """ return the dict for a given section """ 74 | 75 | if "include" in self._config[sectionname]: 76 | self._config[sectionname] = self.process_include(sectionname, self._config[sectionname]['include']) 77 | 78 | return self._config[sectionname] 79 | 80 | def parsexdmod(self): 81 | """ locate and parse the XDMoD portal settings file """ 82 | self._xdmodconfig = configparser.RawConfigParser() 83 | 84 | xdmodconfs = glob.glob(os.path.join(self._config['xdmodroot'], "portal_settings.d/*.ini")) 85 | xdmodconfs.sort() 86 | xdmodconfs.insert(0, os.path.join(self._config['xdmodroot'], "portal_settings.ini")) 87 | xdmodconfs.reverse() 88 | 89 | nread = self._xdmodconfig.read(xdmodconfs) 90 | if len(nread) == 0: 91 | raise Exception("Unable to read XDMoD configuration file. 
Locations scanned: %s", xdmodconfs) 92 | 93 | @staticmethod 94 | def strtonative(value): 95 | v = value.strip("\"") 96 | try: 97 | return int(v) 98 | except ValueError: 99 | return v 100 | 101 | def process_include(self, sectionname, url): 102 | """ process an include directive (only xdmod parsing is supported) """ 103 | if url.startswith("xdmod://"): 104 | if self._xdmodconfig == None: 105 | self.parsexdmod() 106 | 107 | xdmodsection = url[8:] 108 | if not self._xdmodconfig.has_section(xdmodsection): 109 | raise Exception("Unable to locate include data for %s", url) 110 | 111 | result = {} 112 | for k, v in self._xdmodconfig.items(xdmodsection): 113 | result[k] = self.strtonative(v) 114 | 115 | return result 116 | else: 117 | raise Exception("Unsupported include url %s in section %s", url, sectionname) 118 | 119 | def resourceconfigs(self): 120 | """ Iterator over enabled resources """ 121 | for resname, resdata in self._config['resources'].items(): 122 | if "enabled" in resdata and resdata['enabled'] == False: 123 | continue 124 | resdata['name'] = resname 125 | yield (resname, resdata) 126 | 127 | def test(): 128 | """ test """ 129 | conf = Config() 130 | print(conf.getsection("datawarehouse")) 131 | # for r, d in c.resourceconfigs(): 132 | # print r, d 133 | 134 | if __name__ == "__main__": 135 | test() 136 | -------------------------------------------------------------------------------- /src/supremm/datasource/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubccr/supremm/732b8c696e4a8926b7d29317c9ec95f16d6f87e3/src/supremm/datasource/__init__.py -------------------------------------------------------------------------------- /src/supremm/datasource/datasource.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from abc import ABC, abstractmethod 3 | 4 | from supremm.errors import ProcessingError 5 | from supremm.proc_common import instantiatePlugins 6 | 7 | class Datasource(ABC): 8 | """ Definition of the Datasource API """ 9 | 10 | def __init__(self, preprocs, plugins): 11 | self._allpreprocs = preprocs 12 | self._allplugins = plugins 13 | 14 | @property 15 | def allpreprocs(self): 16 | return self._allpreprocs 17 | 18 | @allpreprocs.setter 19 | def allpreprocs(self, preprocs): 20 | self._allpreprocs = preprocs 21 | 22 | @property 23 | def allplugins(self): 24 | return self._allplugins 25 | 26 | @allplugins.setter 27 | def allplugins(self, plugins): 28 | self._allplugins = plugins 29 | 30 | @abstractmethod 31 | def presummarize(self, job, config, resconf, opts): 32 | 33 | jobmeta = JobMeta() 34 | 35 | # Filter jobs by options 36 | if job.nodecount > 1 and opts['min_parallel_duration'] != None and job.walltime < opts['min_parallel_duration']: 37 | jobmeta.result = 1 38 | jobmeta.mdata["skipped_parallel_too_short"] = True 39 | jobmeta.error = ProcessingError.PARALLEL_TOO_SHORT 40 | # Was "skipped" 41 | jobmeta.missingnodes = job.nodecount 42 | logging.info("Skipping %s, skipped_parallel_too_short", job.job_id) 43 | elif opts['min_duration'] != None and job.walltime < opts['min_duration']: 44 | jobmeta.result = 1 45 | jobmeta.mdata["skipped_too_short"] = True 46 | jobmeta.error = ProcessingError.TIME_TOO_SHORT 47 | jobmeta.missingnodes = job.nodecount 48 | logging.info("Skipping %s, skipped_too_short", job.job_id) 49 | elif job.nodecount < 1: 50 | jobmeta.result = 1 51 | jobmeta.mdata["skipped_invalid_nodecount"] = True 52 | jobmeta.error = 
ProcessingError.INVALID_NODECOUNT 53 | jobmeta.missingnodes = job.nodecount 54 | logging.info("Skipping %s, skipped_invalid_nodecount", job.job_id) 55 | elif opts['max_nodes'] > 0 and job.nodecount > opts['max_nodes']: 56 | jobmeta.result = 1 57 | jobmeta.mdata["skipped_job_too_big"] = True 58 | jobmeta.error = ProcessingError.JOB_TOO_BIG 59 | jobmeta.missingnodes = job.nodecount 60 | logging.info("Skipping %s, skipped_job_too_big", job.job_id) 61 | elif opts['max_nodetime'] != None and (job.nodecount * job.walltime) > opts['max_nodetime']: 62 | jobmeta.result = 1 63 | jobmeta.mdata["skipped_job_nodehours"] = True 64 | jobmeta.error = ProcessingError.JOB_TOO_MANY_NODEHOURS 65 | jobmeta.missingnodes = job.nodecount 66 | logging.info("Skipping %s, skipped_job_too_big (node time)", job.job_id) 67 | elif opts['max_duration'] > 0 and job.walltime >= opts['max_duration']: 68 | jobmeta.result = 1 69 | jobmeta.mdata["skipped_too_long"] = True 70 | jobmeta.error = ProcessingError.TIME_TOO_LONG 71 | jobmeta.missingnodes = job.nodecount 72 | logging.info("Skipping %s, skipped_too_long", job.job_id) 73 | 74 | return jobmeta 75 | 76 | @abstractmethod 77 | def summarizejob(self, job, jobmeta, config, opts): 78 | # All datasources instantiate plugins/preprocs 79 | preprocessors = instantiatePlugins(self.allpreprocs, job) 80 | analytics = instantiatePlugins(self.allplugins, job) 81 | return preprocessors, analytics 82 | 83 | @abstractmethod 84 | def cleanup(self, job, opts): 85 | pass 86 | 87 | 88 | class JobMeta(): 89 | """ Container class for a job's metadata """ 90 | 91 | def __init__(self): 92 | self.mdata = {} 93 | self.result = 0 94 | self.error = None 95 | self.missingnodes = 0 96 | 97 | @property 98 | def mdata(self): 99 | return self._mdata 100 | 101 | @mdata.setter 102 | def mdata(self, md): 103 | self._mdata = md 104 | 105 | @property 106 | def result(self): 107 | return self._result 108 | 109 | @result.setter 110 | def result(self, r): 111 | self._result = r 112 | 113 | @property 114 | def error(self): 115 | return self._error 116 | 117 | @error.setter 118 | def error(self, e): 119 | self._error = e 120 | 121 | @property 122 | def missingnodes(self): 123 | return self._missingnodes 124 | 125 | @missingnodes.setter 126 | def missingnodes(self, mn): 127 | self._missingnodes = mn 128 | -------------------------------------------------------------------------------- /src/supremm/datasource/factory.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from supremm.datasource.pcp.pcpdatasource import PCPDatasource 4 | from supremm.datasource.prometheus.promdatasource import PromDatasource 5 | 6 | 7 | class DatasourceFactory(): 8 | """ Datasource class helper """ 9 | 10 | def __init__(self, preprocs, plugins, resconf): 11 | 12 | if resconf["datasource"] == "pcp": 13 | self._datasource = PCPDatasource(preprocs, plugins) 14 | elif resconf["datasource"] == "prometheus": 15 | self._datasource = PromDatasource(preprocs, plugins, resconf) 16 | else: 17 | logging.error("Invalid datasource in configuration: %s", resconf["datasource"]) 18 | 19 | def presummarize(self, job, config, resconf, opts): 20 | return self._datasource.presummarize(job, config, resconf, opts) 21 | 22 | def summarizejob(self, job, jobmeta, config, opts): 23 | return self._datasource.summarizejob(job, jobmeta, config, opts) 24 | 25 | def cleanup(self, job, opts): 26 | return self._datasource.cleanup(job, opts) 27 | 
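
A note on the datasource factory above: DatasourceFactory is the seam between the summarization driver and the two concrete datasources. The following sketch is hypothetical driver code, not a file in the repository; the preprocs and plugins lists would normally come from the plugin loader in proc_common, and job, config, resconf and opts stand in for the Job, Config, resource-configuration and command-line option objects built elsewhere in the pipeline. Only the factory calls themselves mirror the API defined above:

# Hypothetical driver sketch: only the DatasourceFactory calls mirror the API above;
# job, config, resconf, preprocs, plugins and opts are placeholders for objects
# normally produced by the accounting layer, Config and the plugin loader.
from supremm.datasource.factory import DatasourceFactory

def process_one_job(job, config, resconf, preprocs, plugins, opts):
    # resconf["datasource"] selects PCPDatasource ("pcp") or PromDatasource ("prometheus")
    datasource = DatasourceFactory(preprocs, plugins, resconf)

    # Generic and datasource-specific pre-checks; returns a JobMeta recording any skip reason
    jobmeta = datasource.presummarize(job, config, resconf, opts)
    if jobmeta is None:
        return None   # e.g. extract-only mode, or no Prometheus connection

    # Run the preprocessors and plugins; returns the summary object, its metadata,
    # a success flag and (possibly) a ProcessingError code
    summary, mdata, success, error = datasource.summarizejob(job, jobmeta, config, opts)

    # Release any per-job scratch data (extracted PCP archives); a no-op for Prometheus
    datasource.cleanup(job, opts)

    return summary, mdata, success, error
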
-------------------------------------------------------------------------------- /src/supremm/datasource/pcp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubccr/supremm/732b8c696e4a8926b7d29317c9ec95f16d6f87e3/src/supremm/datasource/pcp/__init__.py -------------------------------------------------------------------------------- /src/supremm/datasource/pcp/pcpcinterface/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubccr/supremm/732b8c696e4a8926b7d29317c9ec95f16d6f87e3/src/supremm/datasource/pcp/pcpcinterface/__init__.py -------------------------------------------------------------------------------- /src/supremm/datasource/pcp/pcpcinterface/c_pcp.pxd: -------------------------------------------------------------------------------- 1 | from pcp import pmapi # Python bindings 2 | 3 | cdef extern from "sys/time.h": 4 | ctypedef struct timeval: 5 | pass 6 | 7 | cdef extern from "pcp/pmapi.h": 8 | # Errors 9 | int PM_ERR_GENERIC = "PM_ERR_GENERIC" 10 | int PM_ERR_PMID = "PM_ERR_PMID" 11 | int PM_ERR_INDOM = "PM_ERR_INDOM" 12 | int PM_ERR_INST = "PM_ERR_INST" 13 | int PM_ERR_PMID_LOG = "PM_ERR_PMID_LOG" 14 | int PM_ERR_INDOM_LOG = "PM_ERR_INDOM_LOG" 15 | int PM_ERR_INST_LOG = "PM_ERR_INST_LOG" 16 | int PM_ERR_NAME = "PM_ERR_NAME" 17 | int PM_ERR_SIGN = "PM_ERR_SIGN" 18 | enum: PM_ERR_VALUE 19 | 20 | # pmDesc.type -- data type of metric values 21 | int PM_TYPE_NOSUPPORT = "PM_TYPE_NOSUPPORT" 22 | int PM_TYPE_32 = "PM_TYPE_32" 23 | int PM_TYPE_U32 = "PM_TYPE_U32" 24 | int PM_TYPE_64 = "PM_TYPE_64" 25 | int PM_TYPE_U64 = "PM_TYPE_U64" 26 | int PM_TYPE_FLOAT = "PM_TYPE_FLOAT" 27 | int PM_TYPE_DOUBLE = "PM_TYPE_DOUBLE" 28 | int PM_TYPE_STRING = "PM_TYPE_STRING" 29 | int PM_TYPE_AGGREGATE = "PM_TYPE_AGGREGATE" 30 | int PM_TYPE_AGGREGATE_STATIC = "PM_TYPE_AGGREGATE_STATIC" 31 | int PM_TYPE_EVENT = "PM_TYPE_EVENT" 32 | int PM_TYPE_HIGHRES_EVENT = "PM_TYPE_HIGHRES_EVENT" 33 | int PM_TYPE_UNKNOWN = "PM_TYPE_UNKNOWN" 34 | 35 | ctypedef struct pmUnits: 36 | pass 37 | ctypedef unsigned int pmID 38 | ctypedef unsigned int pmInDom 39 | ctypedef struct pmValueBlock: 40 | pass 41 | ctypedef union myvalue: 42 | pmValueBlock* pval 43 | int lval 44 | ctypedef struct pmValue: # Can't declare anonymous union 45 | int inst 46 | myvalue value 47 | ctypedef struct pmDesc: 48 | pmID pmid 49 | int type 50 | pmInDom indom 51 | int sem 52 | pmUnits units 53 | ctypedef struct pmValueSet: 54 | pmID pmid 55 | int numval 56 | int valfmt 57 | pmValue vlist[1] 58 | ctypedef struct pmResult: 59 | timeval timestamp 60 | int numpmid 61 | pmValueSet *vset[1] 62 | ctypedef union pmAtomValue: 63 | # TODO use types instead of simple long etc. 
64 | char* cp 65 | long l 66 | unsigned long ul 67 | long long ll 68 | unsigned long long ull 69 | float f 70 | double d 71 | 72 | pmInDom PM_INDOM_NULL 73 | 74 | int pmLookupName(int, char **, pmID *) 75 | int pmLookupDesc(pmID, pmDesc *) 76 | int pmLookupInDom(pmInDom, const char *) 77 | int pmLookupInDomArchive(pmInDom, const char *) 78 | int pmNameInDom(pmInDom, int, char **) 79 | int pmNameInDomArchive(pmInDom, int, char **) 80 | int pmUseContext(int) 81 | int pmGetInDom(pmInDom, int **, char ***) 82 | int pmGetInDomArchive(pmInDom, int **, char ***) 83 | int pmExtractValue(int, const pmValue *, int, pmAtomValue *, int) 84 | char *pmErrStr(int) 85 | -------------------------------------------------------------------------------- /src/supremm/datasource/pcp/pcpdatasource.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import time 4 | import logging 5 | import datetime 6 | 7 | from supremm.datasource.datasource import Datasource 8 | from supremm.datasource.pcp.pcparchive import extract_and_merge_logs 9 | from supremm.datasource.pcp.pcpsummarize import PCPSummarize 10 | from supremm.errors import ProcessingError 11 | 12 | class PCPDatasource(Datasource): 13 | """ Instance of a PCP datasource class """ 14 | 15 | def __init__(self, preprocs, plugins): 16 | super().__init__(preprocs, plugins) 17 | 18 | def presummarize(self, job, conf, resconf, opts): 19 | jobmeta = super().presummarize(job, conf, resconf, opts) 20 | 21 | # Error with general presummarize, don't try datasource specific checks 22 | if jobmeta.result != 0 and jobmeta.error != None: 23 | return jobmeta 24 | else: 25 | mergestart = time.time() 26 | if not job.has_any_archives(): 27 | jobmeta.result = 1 28 | jobmeta.mdata["skipped_noarchives"] = True 29 | jobmeta.error = ProcessingError.NO_ARCHIVES 30 | jobmeta.missingnodes = job.nodecount 31 | logging.info("Skipping %s, skipped_noarchives", job.job_id) 32 | elif not job.has_enough_raw_archives(): 33 | jobmeta.result = 1 34 | jobmeta.mdata["skipped_rawarchives"] = True 35 | jobmeta.error = ProcessingError.RAW_ARCHIVES 36 | jobmeta.missingnodes = job.nodecount 37 | logging.info("Skipping %s, skipped_rawarchives", job.job_id) 38 | else: 39 | jobmeta.result = extract_and_merge_logs(job, conf, resconf, opts) 40 | jobmeta.missingnodes = -1.0 * jobmeta.result 41 | 42 | mergeend = time.time() 43 | jobmeta.mdata["mergetime"] = mergeend - mergestart 44 | 45 | if opts['extractonly']: 46 | if jobmeta.result == 0: 47 | return None 48 | else: 49 | logging.error("Failure extracting logs for job %s", job.job_id) 50 | return None 51 | 52 | return jobmeta 53 | 54 | def summarizejob(self, job, jobmeta, conf, opts): 55 | preprocessors, analytics = super().summarizejob(job, jobmeta, conf, opts) 56 | 57 | s = PCPSummarize(preprocessors, analytics, job, conf, opts["fail_fast"]) 58 | 59 | enough_nodes = False 60 | 61 | if 0 == jobmeta.result or (job.nodecount !=0 and (jobmeta.missingnodes / job.nodecount < 0.05)): 62 | enough_nodes = True 63 | logging.info("Success for %s files in %s (%s/%s)", job.job_id, job.jobdir, jobmeta.missingnodes, job.nodecount) 64 | s.process() 65 | elif jobmeta.error == None and job.nodecount != 0 and (jobmeta.missingnodes / job.nodecount >= 0.5): 66 | # Don't overwrite existing error 67 | # Don't have enough node data to even try summarization 68 | jobmeta.mdata["skipped_pmlogextract_error"] = True 69 | logging.info("Skipping %s, skipped_pmlogextract_error", job.job_id) 70 | jobmeta.error = 
ProcessingError.PMLOGEXTRACT_ERROR 71 | 72 | if opts['tag'] != None: 73 | jobmeta.mdata['tag'] = opts['tag'] 74 | 75 | if jobmeta.missingnodes > 0: 76 | jobmeta.mdata['missingnodes'] = jobmeta.missingnodes 77 | 78 | success = s.good_enough() 79 | 80 | if not success and enough_nodes: 81 | # We get here if the pmlogextract step gave us enough nodes but summarization didn't succeed for enough nodes 82 | # All other "known" errors should already be handled above. 83 | jobmeta.mdata["skipped_summarization_error"] = True 84 | logging.info("Skipping %s, skipped_summarization_error", job.job_id) 85 | jobmeta.error = ProcessingError.SUMMARIZATION_ERROR 86 | 87 | force_success = False 88 | if not success: 89 | force_timeout = opts['force_timeout'] 90 | if (datetime.datetime.now() - job.end_datetime) > datetime.timedelta(seconds=force_timeout): 91 | force_success = True 92 | 93 | return s, jobmeta.mdata, success or force_success, jobmeta.error 94 | 95 | def cleanup(self, opts, job): 96 | if opts['dodelete'] and job.jobdir is not None and os.path.exists(job.jobdir): 97 | # Clean up 98 | shutil.rmtree(job.jobdir, ignore_errors=True) 99 | -------------------------------------------------------------------------------- /src/supremm/datasource/prometheus/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubccr/supremm/732b8c696e4a8926b7d29317c9ec95f16d6f87e3/src/supremm/datasource/prometheus/__init__.py -------------------------------------------------------------------------------- /src/supremm/datasource/prometheus/promdatasource.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | import re 4 | 5 | from supremm.datasource.datasource import Datasource 6 | from supremm.datasource.prometheus.prommapping import MappingManager 7 | from supremm.datasource.prometheus.prominterface import PromClient 8 | from supremm.datasource.prometheus.promsummarize import PromSummarize 9 | from supremm.errors import ProcessingError 10 | 11 | 12 | class PromDatasource(Datasource): 13 | """ Instance of a Prometheus datasource class """ 14 | 15 | def __init__(self, preprocs, plugins, resconf): 16 | super().__init__(preprocs, plugins) 17 | 18 | self._client = PromClient(resconf) 19 | self._mapping = MappingManager(self.client) 20 | 21 | @property 22 | def client(self): 23 | return self._client 24 | 25 | @client.setter 26 | def client(self, c): 27 | self._client = c 28 | 29 | @property 30 | def mapping(self): 31 | return self._mapping 32 | 33 | @mapping.setter 34 | def mapping(self, m): 35 | self._mapping = m 36 | 37 | def presummarize(self, job, conf, resconf, opts): 38 | jobmeta = super().presummarize(job, conf, resconf, opts) 39 | 40 | # Initialize client and test connection 41 | if not self.client and not self.mapping: 42 | self.client = PromClient(resconf) 43 | if not self.client.connection: 44 | jobmeta.result = 1 45 | jobmeta.mdata["skipped_no_prom_connection"] = True 46 | jobmeta.error = ProcessingError.PROMETHEUS_CONNECTION 47 | logging.info("Skipping %s, skipped_no_prom_connection", job.job_id) 48 | jobmeta.missingnodes = job.nodecount 49 | return 50 | self.mapping = MappingManager(self.client) 51 | 52 | return jobmeta 53 | 54 | def summarizejob(self, job, jobmeta, config, opts): 55 | # Instantiate preproc, plugins 56 | preprocessors, analytics = super().summarizejob(job, jobmeta, config, opts) 57 | 58 | s = PromSummarize(preprocessors, analytics, job, config, 
self.mapping, opts["fail_fast"]) 59 | 60 | enough_nodes = False 61 | 62 | # missingnodes will always == nodecount if there is a Prometheus error 63 | if 0 == jobmeta.result or (job.nodecount !=0 and (jobmeta.missingnodes / job.nodecount < 0.05)): 64 | enough_nodes = True 65 | logging.info("Success for prometheus presummarize checks, job %s (%s/%s)", job.job_id, jobmeta.missingnodes, job.nodecount) 66 | s.process() 67 | elif jobmeta.error == None and job.nodecount != 0 and (jobmeta.missingnodes / job.nodecount >= 0.5): 68 | # Don't overwrite existing error 69 | # Don't have enough node data to even try summarization 70 | jobmeta.mdata["skipped_prom_error"] = True 71 | logging.info("Skipping %s, skipped_prom_error", job.job_id) 72 | jobmeta.error = ProcessingError.PROMETHEUS_CONNECTION 73 | 74 | if opts['tag'] != None: 75 | jobmeta.mdata['tag'] = opts['tag'] 76 | 77 | if jobmeta.missingnodes > 0: 78 | jobmeta.mdata['missingnodes'] = jobmeta.missingnodes 79 | 80 | success = s.good_enough() 81 | 82 | if not success and enough_nodes: 83 | # All other "known" errors should already be handled above. 84 | jobmeta.mdata["skipped_summarization_error"] = True 85 | logging.info("Skipping %s, skipped_summarization_error", job.job_id) 86 | jobmeta.error = ProcessingError.SUMMARIZATION_ERROR 87 | 88 | force_success = False 89 | if not success: 90 | force_timeout = opts['force_timeout'] 91 | if (datetime.datetime.now() - job.end_datetime) > datetime.timedelta(seconds=force_timeout): 92 | force_success = True 93 | 94 | return s, jobmeta.mdata, success or force_success, jobmeta.error 95 | 96 | def cleanup(self, opts, job): 97 | # Nothing to be done for Prometheus 98 | pass 99 | 100 | def parse_scrape_interval(interval): 101 | # function to parse scrape interval string 102 | # "30s" -> 30, "1m" -> 60, "1m30s" -> 90, etc 103 | times = re.split('(\d+[smhd])', interval) 104 | 105 | scrape_interval = 0 106 | for time in times: 107 | t = re.findall('\d+|\D+', time) 108 | try: 109 | result = int(t[0]) 110 | except ValueError: 111 | logging.error("Could not parse configured scrape interval: (%s)", interval) 112 | return None 113 | modifier = t[-1] 114 | if modifier == 's': 115 | scrape_interval += result 116 | elif modifier == 'm': 117 | scrape_interval += (result * 60) 118 | elif modifier == 'h': 119 | scrape_interval += (result * (60 * 60)) 120 | 121 | return scrape_interval 122 | -------------------------------------------------------------------------------- /src/supremm/errors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ processing error class is defined so that common errors can be assigned short codes """ 3 | 4 | class ProcessingError(object): 5 | """ Container class for processing errors """ 6 | RAW_COUNTER_UNAVAILABLE = 1 7 | JOB_TOO_SHORT = 2 8 | INSUFFICIENT_DATA = 3 9 | INSUFFICIENT_HOSTDATA = 4 10 | CPUSET_UNKNOWN = 5 11 | PMDA_RESTARTED_DURING_JOB = 6 12 | INDOMS_CHANGED_DURING_JOB = 7 13 | PMLOGEXTRACT_ERROR = 8 14 | PARALLEL_TOO_SHORT = 9 15 | INVALID_NODECOUNT = 10 16 | JOB_TOO_BIG = 11 17 | TIME_TOO_SHORT = 12 18 | TIME_TOO_LONG = 13 19 | UNKNOWN_CANNOT_PROCESS = 14 20 | NO_ARCHIVES = 15 21 | SUMMARIZATION_ERROR = 16 22 | RAW_ARCHIVES = 17 23 | JOB_TOO_MANY_NODEHOURS = 18 24 | MAX_ERROR = 19 25 | PROMETHEUS_CONNECTION = 20 26 | 27 | def __init__(self, err_id): 28 | self._id = err_id 29 | 30 | def __str__(self): 31 | names = { 32 | ProcessingError.RAW_COUNTER_UNAVAILABLE: "Required raw metrics not available.", 33 | 
ProcessingError.JOB_TOO_SHORT: "The job was too short.", 34 | ProcessingError.INSUFFICIENT_DATA: "There were too few datapoints.", 35 | ProcessingError.INSUFFICIENT_HOSTDATA: "Not all of the hosts had raw metrics available", 36 | ProcessingError.CPUSET_UNKNOWN: "The cpuset that was assigned to the job is unavailable", 37 | ProcessingError.PMDA_RESTARTED_DURING_JOB: "The PMDA restarted during the job", 38 | ProcessingError.INDOMS_CHANGED_DURING_JOB: "The instance domains for required metrics changed during the job", 39 | ProcessingError.PMLOGEXTRACT_ERROR: "Generic failure in the pmlogextract step", 40 | ProcessingError.PARALLEL_TOO_SHORT: "Parallel job ran for too short of a time", 41 | ProcessingError.INVALID_NODECOUNT: "Fewer than 1 node reported for this job", 42 | ProcessingError.JOB_TOO_BIG: "Processing skipped due to large node count in job", 43 | ProcessingError.TIME_TOO_SHORT: "Job ran for too short of a time to provide enough performance data", 44 | ProcessingError.TIME_TOO_LONG: "Job consumed an impossible amount of walltime", 45 | ProcessingError.UNKNOWN_CANNOT_PROCESS: "Job cannot be summarized for unknown reason", 46 | ProcessingError.NO_ARCHIVES: "None of the nodes in the job have pcp archives", 47 | ProcessingError.SUMMARIZATION_ERROR: "There were enough archives to try summarization, but too few archives were successfully processed", 48 | ProcessingError.RAW_ARCHIVES: "Not enough raw archives to try pmlogextract", 49 | ProcessingError.JOB_TOO_MANY_NODEHOURS: "Total job node hours exceeded threshold", 50 | ProcessingError.PROMETHEUS_CONNECTION: "An error occurred with the Prometheus server during summarization" 51 | } 52 | return names[self._id] 53 | 54 | @staticmethod 55 | def doc(): 56 | """ Returns a dict containing the documentation for all supported errors """ 57 | docs = {} 58 | for i in range(1, ProcessingError.MAX_ERROR): 59 | docs[i] = str(ProcessingError(i)) 60 | 61 | return docs 62 | 63 | def get(self): 64 | """ get """ 65 | return self._id 66 | 67 | class NotApplicableError(Exception): 68 | """ Used by plugins to indicate that their analysis is not avaiable for 69 | the HPC job. For example, if a plugin implements a resource-manager-specific 70 | analysis and the job was not run on the supported resource manager. """ 71 | pass 72 | 73 | if __name__ == "__main__": 74 | print(ProcessingError.doc()) 75 | -------------------------------------------------------------------------------- /src/supremm/gen_pmlogger_control.py: -------------------------------------------------------------------------------- 1 | """ 2 | Script to generate remote host portion of pmlogger control file. 3 | 4 | Usage: cat [hostlist] | python gen-pmlogger-control.py 5 | 6 | Author: Andrew E. 
Bruno 7 | """ 8 | import fileinput 9 | 10 | pcp_archive_dir = '/data/pcp-logs' 11 | pmlogger_config = 'pmlogger-config.ubccr' 12 | 13 | def main(): 14 | for host in fileinput.input(): 15 | host = host.rstrip() 16 | print("%s n n %s/%s -c ./%s" % (host,pcp_archive_dir,host,pmlogger_config)) 17 | 18 | if __name__ == '__main__': 19 | main() -------------------------------------------------------------------------------- /src/supremm/lariat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Utilities for processing Lariat data """ 3 | import datetime 4 | import os 5 | import json 6 | import logging 7 | 8 | class LariatManager(object): 9 | """ find and cache the lariat data for a job """ 10 | def __init__(self, lariatpath): 11 | self.lariatpath = lariatpath 12 | self.lariatdata = dict() 13 | self.filesprocessed = [] 14 | self.errors = dict() 15 | 16 | def find(self, jobid, jobstarttime, jobendtime): 17 | """ returns a dict containing the lariat data for a job """ 18 | 19 | if jobid in self.lariatdata: 20 | print("Lariat cache size is ", len(self.lariatdata)) 21 | return self.lariatdata.pop(jobid) 22 | 23 | for days in (0, -1, 1): 24 | searchday = datetime.datetime.utcfromtimestamp(jobendtime) + datetime.timedelta(days) 25 | lfilename = os.path.join(self.lariatpath, searchday.strftime('%Y'), searchday.strftime('%m'), searchday.strftime('lariatData-sgeT-%Y-%m-%d.json')) 26 | self.loadlariat(lfilename) 27 | if jobid in self.lariatdata: 28 | return self.lariatdata[jobid] 29 | 30 | for days in (0, -1, 1): 31 | searchday = datetime.datetime.utcfromtimestamp(jobstarttime) + datetime.timedelta(days) 32 | lfilename = os.path.join(self.lariatpath, searchday.strftime('%Y'), searchday.strftime('%m'), searchday.strftime('lariatData-sgeT-%Y-%m-%d.json')) 33 | self.loadlariat(lfilename) 34 | 35 | if jobid in self.lariatdata: 36 | return self.lariatdata[jobid] 37 | 38 | return None 39 | 40 | @staticmethod 41 | def removeDotKey(obj): 42 | """ replace . with - in the keys for the json object """ 43 | for key in list(obj.keys()): 44 | new_key = key.replace(".", "-") 45 | if new_key != key: 46 | obj[new_key] = obj[key] 47 | del obj[key] 48 | return obj 49 | 50 | def loadlariat(self, filename): 51 | """ load and store the contents of lariat output file "filename" """ 52 | 53 | if filename in self.filesprocessed: 54 | # No need to reparse file. If the job data was in the file, then this search 55 | # function would not have been called. 56 | return 57 | 58 | try: 59 | with open(filename, "rb") as fp: 60 | 61 | # Unfortunately, the lariat data is not in valid json 62 | # This workaround converts the illegal \' into valid quotes 63 | content = fp.read().replace("\\'", "'") 64 | lariatJson = json.loads(content, object_hook=LariatManager.removeDotKey) 65 | 66 | for k, v in lariatJson.items(): 67 | if k not in self.lariatdata: 68 | self.lariatdata[k] = v[0] 69 | else: 70 | # Have already got a record for this job. Keep the record 71 | # that has longer recorded runtime since this is probably 72 | # the endofjob record. 73 | if 'runtime' in v[0] and 'runtime' in self.lariatdata[k] and self.lariatdata[k]['runtime'] < v[0]['runtime']: 74 | self.lariatdata[k] = v[0] 75 | 76 | self.filesprocessed.append(filename) 77 | 78 | except Exception as e: 79 | logging.error("Error processing lariat file %s. 
Error was %s.", filename, str(e)) 80 | 81 | -------------------------------------------------------------------------------- /src/supremm/linuxhelpers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Helper functions that can process data that is generated on 3 | resources that use a Linux kernel or Linux based OS.""" 4 | 5 | 6 | def parsecpusallowed(cpusallowed): 7 | """ cpusallowed parser converts the human-readable cpuset string to 8 | a list of cpu indexes 9 | """ 10 | 11 | cpulist = set() 12 | items = cpusallowed.split(",") 13 | for item in items: 14 | try: 15 | cpulist.add(int(item)) 16 | except ValueError as e: 17 | try: 18 | cpurange = [int(x) for x in item.split("-")] 19 | if len(cpurange) != 2: 20 | raise ValueError("Unable to parse cpusallowed \"" + cpusallowed + "\"") 21 | cpulist |= set(range(cpurange[0], cpurange[1] + 1)) 22 | except ValueError as e: 23 | raise ValueError("Unable to parse cpusallowed \"" + cpusallowed + "\"") 24 | 25 | return cpulist 26 | 27 | 28 | if __name__ == "__main__": 29 | print(parsecpusallowed("0-7")) 30 | print(parsecpusallowed("1")) 31 | print(parsecpusallowed("1,2")) 32 | print(parsecpusallowed("1,2,4-6,15")) 33 | print(parsecpusallowed("1,6-7")) 34 | print(parsecpusallowed("6-7,9")) 35 | -------------------------------------------------------------------------------- /src/supremm/migrations/1.0-1.1/modw_supremm.sql: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env mysql 2 | 3 | use modw_supremm; 4 | 5 | CREATE TABLE `archive_paths` ( 6 | `id` int(11) NOT NULL AUTO_INCREMENT, 7 | `filename` varchar(255) COLLATE utf8_unicode_ci NOT NULL, 8 | PRIMARY KEY (`id`), 9 | UNIQUE KEY (`filename`) 10 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; 11 | 12 | CREATE TABLE `archives_nodelevel` ( 13 | `archive_id` int(11) NOT NULL, 14 | `host_id` int(11) NOT NULL, 15 | `start_time_ts` int(11) NOT NULL, 16 | `end_time_ts` int(11) NOT NULL, 17 | PRIMARY KEY (`archive_id`), 18 | KEY `hosttimes` (`host_id` ASC, `start_time_ts` ASC, `end_time_ts` ASC) 19 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; 20 | 21 | CREATE TABLE `archives_joblevel` ( 22 | `archive_id` int(11) NOT NULL, 23 | `host_id` int(11) NOT NULL, 24 | `local_jobid` int(11) NOT NULL DEFAULT '-1', 25 | `local_job_array_index` int(11) NOT NULL DEFAULT '-1', 26 | `local_job_id_raw` int(11) NOT NULL, 27 | `start_time_ts` int(11) NOT NULL, 28 | `end_time_ts` int(11) NOT NULL, 29 | PRIMARY KEY (`archive_id`), 30 | KEY `hostjobs` (`host_id` ASC, `local_job_id_raw` ASC) 31 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; 32 | 33 | 34 | INSERT INTO `archive_paths` SELECT id, filename FROM archive; 35 | 36 | INSERT INTO `archives_nodelevel` SELECT id, hostid, FLOOR(start_time_ts), CEILING(end_time_ts) FROM `archive` WHERE jobid IS NULL; 37 | 38 | INSERT INTO `archives_joblevel` 39 | SELECT 40 | id, hostid, - 1, - 1, CAST(`jobid` AS SIGNED), FLOOR(start_time_ts), CEILING(end_time_ts) 41 | FROM 42 | `archive` 43 | WHERE 44 | jobid IS NOT NULL AND jobid RLIKE '^[0-9]+$'; 45 | 46 | INSERT INTO `archives_joblevel` 47 | SELECT 48 | id, hostid, SUBSTRING_INDEX(jobid, '[', 1), TRIM( TRAILING ']' FROM SUBSTRING_INDEX(jobid, '[', -1)), -1, FLOOR(start_time_ts), CEILING(end_time_ts) 49 | FROM 50 | `archive` 51 | WHERE 52 | jobid IS NOT NULL AND jobid RLIKE '^[0-9]+\[[0-9]+\]$'; 53 | 54 | INSERT INTO `archives_joblevel` 55 | SELECT 56 | id, hostid, 
SUBSTRING_INDEX(jobid, '_', 1), SUBSTRING_INDEX(jobid, '_', -1), -1, FLOOR(start_time_ts), CEILING(end_time_ts) 57 | FROM 58 | `archive` 59 | WHERE 60 | jobid IS NOT NULL AND jobid RLIKE '^[0-9]+_[0-9]+$'; 61 | -------------------------------------------------------------------------------- /src/supremm/plugins/ArmPowerUsageTimeseries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Timeseries generator module """ 3 | 4 | from collections import Counter 5 | import numpy 6 | 7 | from supremm.plugin import Plugin 8 | from supremm.subsample import TimeseriesAccumulator 9 | from supremm.errors import ProcessingError 10 | 11 | class ArmPowerUsageTimeseries(Plugin): 12 | """ Generate the Power usage as a timeseries data """ 13 | 14 | name = property(lambda x: "corepower") 15 | mode = property(lambda x: "timeseries") 16 | requiredMetrics = property(lambda x: ["perfevent.hwcounters.arm_a64fx__EA_CORE.value", "perfevent.hwcounters.arm_a64fx__EA_L2.value", 17 | "perfevent.hwcounters.arm_a64fx__EA_MEMORY.value"]) 18 | optionalMetrics = property(lambda x: []) 19 | derivedMetrics = property(lambda x: []) 20 | 21 | def __init__(self, job): 22 | super(ArmPowerUsageTimeseries, self).__init__(job) 23 | self._data = TimeseriesAccumulator(job.nodecount, self._job.walltime) 24 | self._error = None 25 | self._hostdata = {} 26 | 27 | @staticmethod 28 | def computetimepoint(data): 29 | """ Get the power usage from the data """ 30 | if data[0][0] < numpy.finfo(numpy.float64).eps: 31 | return None 32 | 33 | return data[0][0] 34 | 35 | def process(self, nodemeta, timestamp, data, description): 36 | 37 | if self._job.getdata('perf')['active'] != True: 38 | self._error = ProcessingError.RAW_COUNTER_UNAVAILABLE 39 | return False 40 | 41 | if len(data[0]) == 0: 42 | # Skip data point with no data 43 | return True 44 | 45 | if nodemeta.nodeindex not in self._hostdata: 46 | self._hostdata[nodemeta.nodeindex] = numpy.empty((TimeseriesAccumulator.MAX_DATAPOINTS, 3)) 47 | 48 | cpucount = numpy.sum(data[0]) 49 | l2count = data[1][0] + data[1][12] + data[1][24] + data[1][36] 50 | memcount = data[2][0] + data[2][12] + data[2][24] + data[2][36] 51 | 52 | energy = (8.04 * cpucount) + (32.8 * l2count) + (271.0 * memcount) 53 | 54 | insertat = self._data.adddata(nodemeta.nodeindex, timestamp, energy) 55 | 56 | if insertat is not None: 57 | self._hostdata[nodemeta.nodeindex][insertat] = numpy.array([cpucount, l2count, memcount]) 58 | 59 | return True 60 | 61 | def results(self): 62 | 63 | if self._error: 64 | return {"error": self._error} 65 | 66 | if len(self._hostdata) != self._job.nodecount: 67 | return {"error": ProcessingError.INSUFFICIENT_HOSTDATA} 68 | 69 | values = self._data.get() 70 | 71 | if len(values[0, :, 0]) < 2: 72 | return {"error": ProcessingError.JOB_TOO_SHORT} 73 | 74 | rates = numpy.diff(values[:, :, 1]) / numpy.diff(values[:, :, 0]) / 1.0e9 75 | 76 | if len(self._hostdata) > 64: 77 | 78 | # Compute min, max & median data and only save the host data 79 | # for these hosts 80 | 81 | sortarr = numpy.argsort(rates.T, axis=1) 82 | 83 | retdata = { 84 | "min": self.collatedata(sortarr[:, 0], rates), 85 | "max": self.collatedata(sortarr[:, -1], rates), 86 | "med": self.collatedata(sortarr[:, sortarr.shape[1] // 2], rates), 87 | "times": values[0, 1:, 0].tolist(), 88 | "hosts": {} 89 | } 90 | 91 | uniqhosts = Counter(sortarr[:, 0]) 92 | uniqhosts.update(sortarr[:, -1]) 93 | uniqhosts.update(sortarr[:, sortarr.shape[1] // 2]) 94 | includelist = 
uniqhosts.keys() 95 | else: 96 | # Save data for all hosts 97 | retdata = { 98 | "times": values[0, 1:, 0].tolist(), 99 | "hosts": {} 100 | } 101 | includelist = self._hostdata.keys() 102 | 103 | scaling = { 104 | '0': 8.04e-9, 105 | '1': 32.8e-9, 106 | '2': 271.e-9 107 | } 108 | 109 | for hostidx in includelist: 110 | retdata['hosts'][str(hostidx)] = {} 111 | retdata['hosts'][str(hostidx)]['all'] = rates[hostidx, :].tolist() 112 | retdata['hosts'][str(hostidx)]['dev'] = {} 113 | for devid in ['0', '1', '2']: 114 | dpnts = len(values[hostidx, :, 0]) 115 | retdata['hosts'][str(hostidx)]['dev'][devid] = (scaling[devid] * numpy.diff(self._hostdata[hostidx][:dpnts, numpy.int(devid)]) / numpy.diff(values[hostidx, :, 0])).tolist() 116 | 117 | retdata['hosts'][str(hostidx)]['names'] = {'0': 'cpu', '1': 'l2', '2': 'mem'} 118 | 119 | return retdata 120 | 121 | @staticmethod 122 | def collatedata(args, rates): 123 | """ build output data """ 124 | result = [] 125 | for timepoint, hostidx in enumerate(args): 126 | try: 127 | result.append([rates[hostidx, timepoint], int(hostidx)]) 128 | except IndexError: 129 | pass 130 | 131 | return result 132 | -------------------------------------------------------------------------------- /src/supremm/plugins/Block.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from supremm.plugin import DeviceBasedPlugin 4 | 5 | class Block(DeviceBasedPlugin): 6 | """ This plugin processes lots of metric that are all interested in the difference over the process """ 7 | 8 | name = property(lambda x: "block") 9 | requiredMetrics = property(lambda x: [ 10 | "disk.dev.read", 11 | "disk.dev.read_bytes", 12 | "disk.dev.write", 13 | "disk.dev.write_bytes" 14 | ]) 15 | optionalMetrics = property(lambda x: []) 16 | derivedMetrics = property(lambda x: []) 17 | 18 | 19 | -------------------------------------------------------------------------------- /src/supremm/plugins/BlockTimeseries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Timeseries generator module """ 3 | 4 | from supremm.plugin import RateConvertingTimeseriesPlugin 5 | import numpy 6 | 7 | class BlockTimeseries(RateConvertingTimeseriesPlugin): 8 | """ Generate timeseries summary for block device usage data """ 9 | 10 | name = property(lambda x: "block") 11 | requiredMetrics = property(lambda x: ["disk.dev.read_bytes", 12 | "disk.dev.write_bytes"]) 13 | optionalMetrics = property(lambda x: []) 14 | derivedMetrics = property(lambda x: []) 15 | 16 | def __init__(self, job): 17 | super(BlockTimeseries, self).__init__(job) 18 | 19 | def computetimepoint(self, data): 20 | return numpy.sum(numpy.array(data)) / 1048576.0 21 | -------------------------------------------------------------------------------- /src/supremm/plugins/Catastrophe.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from supremm.plugin import Plugin 4 | from supremm.errors import ProcessingError 5 | import numpy 6 | 7 | class Catastrophe(Plugin): 8 | """ Catastrophe analytic. Algorithm originally developed by Bill Barth et al. 
for the 9 | tacc_stats project """ 10 | 11 | name = property(lambda x: "catastrophe") 12 | mode = property(lambda x: "all") 13 | requiredMetrics = property(lambda x: [["perfevent.hwcounters.MEM_LOAD_RETIRED_L1D_HIT.value"], 14 | ["perfevent.hwcounters.L1D_REPLACEMENT.value"], 15 | ["perfevent.hwcounters.L1D_REPL.value"], 16 | ["perfevent.hwcounters.DATA_CACHE_MISSES_DC_MISS_STREAMING_STORE.value"]]) 17 | optionalMetrics = property(lambda x: []) 18 | derivedMetrics = property(lambda x: []) 19 | 20 | def __init__(self, job): 21 | super(Catastrophe, self).__init__(job) 22 | self._data = {} 23 | self._error = None 24 | 25 | def process(self, nodemeta, timestamp, data, description): 26 | 27 | if self._job.getdata('perf')['active'] != True: 28 | self._error = ProcessingError.RAW_COUNTER_UNAVAILABLE 29 | return False 30 | 31 | if len(data[0]) == 0: 32 | # Ignore datapoints where no data stored 33 | return True 34 | 35 | if nodemeta.nodename not in self._data: 36 | self._data[nodemeta.nodename] = {"x": [], "t": []} 37 | 38 | info = self._data[nodemeta.nodename] 39 | info['x'].append(1.0 * numpy.sum(data[0])) 40 | info['t'].append(timestamp) 41 | 42 | if len(info['x']) > 1: 43 | if numpy.any(info['x'][-1] - info['x'][-2] < 0.0): 44 | self._error = ProcessingError.PMDA_RESTARTED_DURING_JOB 45 | return False 46 | 47 | return True 48 | 49 | def results(self): 50 | 51 | if self._error: 52 | return {"error": self._error} 53 | 54 | if len(self._data) == 0: 55 | return {"error": ProcessingError.RAW_COUNTER_UNAVAILABLE} 56 | 57 | vals = None 58 | 59 | for _, data in self._data.items(): 60 | 61 | if data['x'][-1] - data['x'][0] == 0.0: 62 | return {"error": ProcessingError.RAW_COUNTER_UNAVAILABLE} 63 | 64 | start = 2 65 | end = len(data['x'])-2 66 | 67 | for i in range(start+1, end-1): 68 | 69 | a = (data['x'][i] - data['x'][start]) / (data['t'][i] - data['t'][start]) 70 | b = (data['x'][end] - data['x'][i]) / (data['t'][end] - data['t'][i]) 71 | vals = b/a if vals == None else min(vals, b/a) 72 | 73 | if vals == None: 74 | return {"error": ProcessingError.JOB_TOO_SHORT} 75 | 76 | return {"value": vals} 77 | -------------------------------------------------------------------------------- /src/supremm/plugins/CgroupMemTimeseries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Timeseries generator module """ 3 | 4 | from supremm.plugin import Plugin 5 | from supremm.subsample import TimeseriesAccumulator 6 | from supremm.errors import ProcessingError, NotApplicableError 7 | import numpy 8 | from collections import Counter 9 | import re 10 | 11 | class CgroupMemTimeseries(Plugin): 12 | """ Generate timeseries summary for memory usage viewed from CGroup 13 | This code is SLURM-specific because of the SLURM cgroup naming convention. 
14 | """ 15 | 16 | name = property(lambda x: "process_mem_usage") 17 | mode = property(lambda x: "timeseries") 18 | requiredMetrics = property(lambda x: ["cgroup.memory.usage"]) 19 | optionalMetrics = property(lambda x: []) 20 | derivedMetrics = property(lambda x: []) 21 | 22 | def __init__(self, job): 23 | super(CgroupMemTimeseries, self).__init__(job) 24 | self._data = TimeseriesAccumulator(job.nodecount, self._job.walltime) 25 | self._hostdata = {} 26 | self._hostcounts = {} 27 | if job.acct['resource_manager'] == 'pbs': 28 | self._expectedcgroup = "/torque/{0}".format(job.job_id) 29 | elif job.acct['resource_manager'] == 'slurm': 30 | self._expectedcgroup = "/slurm/uid_{0}/job_{1}".format(job.acct['uid'], job.job_id) 31 | else: 32 | raise NotApplicableError 33 | 34 | def process(self, nodemeta, timestamp, data, description): 35 | 36 | hostidx = nodemeta.nodeindex 37 | 38 | if len(data[0]) == 0: 39 | # Skip data point with no data 40 | return True 41 | 42 | if nodemeta.nodeindex not in self._hostdata: 43 | self._hostdata[hostidx] = numpy.empty((TimeseriesAccumulator.MAX_DATAPOINTS, 1)) 44 | self._hostcounts[hostidx] = {'missing': 0, 'present': 0} 45 | 46 | try: 47 | dataidx = None 48 | for idx, desc in enumerate(description[0][1]): 49 | if re.match(r"^" + re.escape(self._expectedcgroup) + r"($|\.)", desc): 50 | dataidx = idx 51 | break 52 | # No cgroup info at this datapoint 53 | if dataidx is None: 54 | return True 55 | nodemem_gb = data[0][dataidx] / 1073741824.0 56 | self._hostcounts[hostidx]['present'] += 1 57 | except ValueError: 58 | self._hostcounts[hostidx]['missing'] += 1 59 | # No cgroup info at this datapoint 60 | return True 61 | 62 | insertat = self._data.adddata(hostidx, timestamp, nodemem_gb) 63 | if insertat != None: 64 | self._hostdata[hostidx][insertat] = nodemem_gb 65 | 66 | return True 67 | 68 | def results(self): 69 | 70 | if len(self._hostdata) != self._job.nodecount: 71 | return {'error': ProcessingError.RAW_COUNTER_UNAVAILABLE} 72 | 73 | for hcount in self._hostcounts.values(): 74 | if hcount['missing'] > hcount['present']: 75 | return {'error': ProcessingError.CPUSET_UNKNOWN} 76 | 77 | values = self._data.get() 78 | 79 | if len(self._hostdata) > 64: 80 | 81 | # Compute min, max & median data and only save the host data 82 | # for these hosts 83 | 84 | memdata = values[:, :, 1] 85 | sortarr = numpy.argsort(memdata.T, axis=1) 86 | 87 | retdata = { 88 | "min": self.collatedata(sortarr[:, 0], memdata), 89 | "max": self.collatedata(sortarr[:, -1], memdata), 90 | "med": self.collatedata(sortarr[:, sortarr.shape[1] // 2], memdata), 91 | "times": values[0, :, 0].tolist(), 92 | "hosts": {} 93 | } 94 | 95 | uniqhosts = Counter(sortarr[:, 0]) 96 | uniqhosts.update(sortarr[:, -1]) 97 | uniqhosts.update(sortarr[:, sortarr.shape[1] // 2]) 98 | includelist = list(uniqhosts.keys()) 99 | else: 100 | # Save data for all hosts 101 | retdata = { 102 | "times": values[0, :, 0].tolist(), 103 | "hosts": {} 104 | } 105 | includelist = list(self._hostdata.keys()) 106 | 107 | 108 | for hostidx in includelist: 109 | retdata['hosts'][str(hostidx)] = {} 110 | retdata['hosts'][str(hostidx)]['all'] = values[hostidx, :, 1].tolist() 111 | 112 | return retdata 113 | 114 | @staticmethod 115 | def collatedata(args, rates): 116 | """ build output data """ 117 | result = [] 118 | for timepoint, hostidx in enumerate(args): 119 | try: 120 | result.append([rates[hostidx, timepoint], int(hostidx)]) 121 | except IndexError: 122 | pass 123 | 124 | return result 125 | 
-------------------------------------------------------------------------------- /src/supremm/plugins/CgroupMemory.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Memory usage plugin """ 3 | 4 | import re 5 | from supremm.plugin import Plugin 6 | from supremm.statistics import RollingStats, calculate_stats 7 | from supremm.errors import ProcessingError, NotApplicableError 8 | 9 | class CgroupMemory(Plugin): 10 | """ Cgroup memory statistics for the job """ 11 | 12 | name = property(lambda x: "process_memory") 13 | mode = property(lambda x: "all") 14 | requiredMetrics = property(lambda x: ["cgroup.memory.usage", "cgroup.memory.limit"]) 15 | 16 | optionalMetrics = property(lambda x: []) 17 | derivedMetrics = property(lambda x: []) 18 | 19 | def __init__(self, job): 20 | super(CgroupMemory, self).__init__(job) 21 | self._data = {} 22 | self._hostcounts = {} 23 | if job.acct['resource_manager'] == 'pbs': 24 | self._expectedcgroup = "/torque/{0}".format(job.job_id) 25 | elif job.acct['resource_manager'] == 'slurm': 26 | self._expectedcgroup = "/slurm/uid_{0}/job_{1}".format(job.acct['uid'], job.job_id) 27 | else: 28 | raise NotApplicableError 29 | 30 | def process(self, nodemeta, timestamp, data, description): 31 | """ CGroup Memory statistics are the aritmetic mean of all values except the 32 | first. Rather than storing all of the meory measurements for 33 | the job, we use the RollingStats() class to keep track of the mean 34 | values. 35 | """ 36 | 37 | if len(data[0]) == 0: 38 | return True 39 | 40 | if nodemeta.nodeindex not in self._data: 41 | self._data[nodemeta.nodeindex] = [RollingStats() for i in range(len(self.requiredMetrics) + 1)] 42 | self._hostcounts[nodemeta.nodeindex] = {"present": 0, "missing": 0} 43 | # First data point for the node is ignored 44 | return True 45 | 46 | try: 47 | dataidx = None 48 | for idx, desc in enumerate(description[0][1]): 49 | if re.match(r"^" + re.escape(self._expectedcgroup) + r"($|\.)", desc): 50 | dataidx = idx 51 | break 52 | # No cgroup info at this datapoint 53 | if dataidx is None: 54 | return True 55 | for i in range(len(self.requiredMetrics)): 56 | if len(data[i]) < dataidx: 57 | # Skip timesteps with incomplete information 58 | raise ValueError 59 | 60 | self._hostcounts[nodemeta.nodeindex]["present"] += 1 61 | except ValueError: 62 | self._hostcounts[nodemeta.nodeindex]["missing"] += 1 63 | # No cgroup info at this datapoint 64 | return True 65 | 66 | hdata = self._data[nodemeta.nodeindex] 67 | 68 | for i in range(len(self.requiredMetrics)): 69 | hdata[i].append(data[i][dataidx]) 70 | 71 | if data[1][dataidx] > 0.0: 72 | hdata[2].append(1.0 * data[0][dataidx] / data[1][dataidx]) 73 | else: 74 | hdata[2].append(0.0) 75 | 76 | return True 77 | 78 | def results(self): 79 | 80 | if len(self._data) != self._job.nodecount: 81 | return {"error": ProcessingError.INSUFFICIENT_HOSTDATA} 82 | 83 | for hoststat in self._hostcounts.values(): 84 | if hoststat['missing'] > hoststat['present']: 85 | return {"error": ProcessingError.CPUSET_UNKNOWN} 86 | 87 | stats = {"usage": {"avg": [], "max": []}, "limit": [], "usageratio": {"avg": [], "max": []}} 88 | 89 | datapoints = 0 90 | 91 | for memdata in self._data.values(): 92 | if memdata[0].count() > 0: 93 | datapoints += 1 94 | stats["usage"]["avg"].append(memdata[0].mean()) 95 | stats["usage"]["max"].append(memdata[0].max) 96 | stats["limit"].append(memdata[1].max) 97 | stats["usageratio"]["avg"].append(memdata[2].mean()) 98 | 
stats["usageratio"]["max"].append(memdata[2].max) 99 | 100 | if datapoints == 0: 101 | return {"error": ProcessingError.INSUFFICIENT_DATA} 102 | 103 | result = {"usage": {}, "usageratio": {}} 104 | result["usage"]["avg"] = calculate_stats(stats["usage"]["avg"]) 105 | result["usage"]["max"] = calculate_stats(stats["usage"]["max"]) 106 | result["limit"] = calculate_stats(stats["limit"]) 107 | result["usageratio"]["avg"] = calculate_stats(stats["usageratio"]["avg"]) 108 | result["usageratio"]["max"] = calculate_stats(stats["usageratio"]["max"]) 109 | 110 | return result 111 | -------------------------------------------------------------------------------- /src/supremm/plugins/CpuCategories.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ CPU categorization plugin """ 3 | 4 | from collections import OrderedDict 5 | import numpy as np 6 | 7 | from supremm.plugin import Plugin 8 | from supremm.errors import ProcessingError 9 | 10 | class CpuCategories(Plugin): 11 | """ Categorize a job based on its CPU utilization """ 12 | 13 | name = property(lambda x: "cpucategories") 14 | mode = property(lambda x: "all") 15 | requiredMetrics = property(lambda x: [[ 16 | "kernel.percpu.cpu.user", 17 | "kernel.percpu.cpu.nice", 18 | "kernel.percpu.cpu.sys", 19 | "kernel.percpu.cpu.idle", 20 | "kernel.percpu.cpu.wait.total", 21 | "kernel.percpu.cpu.intr", 22 | "kernel.percpu.cpu.irq.soft", 23 | "kernel.percpu.cpu.irq.hard" 24 | ]]) 25 | optionalMetrics = property(lambda x: []) 26 | derivedMetrics = property(lambda x: []) 27 | 28 | GOOD_THRESHOLD = 0.5 29 | PINNED_THRESHOLD = 0.9 30 | LOW_THRESHOLD = 0.1 31 | DELTA_THRESHOLD = 0.5 32 | MIN_DELTAS = 5 33 | MAX_DIFFERENCE = 0.1 34 | MIN_HIGH_SIZE = 1 35 | MIN_HIGH_VALUE = 0.5 36 | 37 | def __init__(self, job): 38 | super(CpuCategories, self).__init__(job) 39 | self._timeabove = {} 40 | self._timebelow = {} 41 | self._deltas = {} 42 | self._last = {} 43 | self._maxcores = {} 44 | 45 | def process(self, nodemeta, timestamp, data, description): 46 | length = len(data[0]) 47 | node = nodemeta.nodename 48 | proc = self._job.getdata('proc') 49 | 50 | # Initialize dicts to handle multiple nodes and cores 51 | if node not in self._last: 52 | self._timeabove[node] = {} 53 | self._timebelow[node] = {} 54 | self._deltas[node] = {} 55 | self._maxcores[node] = 0 56 | 57 | if proc is None or 'cpusallowed' not in proc or node not in proc['cpusallowed'] or 'error' in proc['cpusallowed'][node]: 58 | for i in range(length): 59 | self._timeabove[node][i] = 0 60 | self._timebelow[node][i] = 0 61 | self._deltas[node][i] = [] 62 | else: 63 | for i in proc['cpusallowed'][node]: 64 | self._timeabove[node][i] = 0 65 | self._timebelow[node][i] = 0 66 | self._deltas[node][i] = [] 67 | timeabove = [x for x in self._timeabove[node].keys()] 68 | self._last[node] = np.array(data)[:, timeabove] 69 | return True 70 | 71 | timeabove = [x for x in self._timeabove[node].keys()] 72 | nodedata = np.array(data)[:, timeabove] 73 | difference = nodedata - self._last[node] 74 | total = np.sum(difference, 0) 75 | self._last[node] = nodedata 76 | 77 | currentdeltas = difference[0] / total 78 | 79 | if length != 0: 80 | counter = 0 81 | for i in self._timeabove[node]: 82 | self._deltas[node][i].append(currentdeltas[counter]) 83 | if currentdeltas[counter] > self.DELTA_THRESHOLD: 84 | self._timeabove[node][i] += total[counter] 85 | else: 86 | self._timebelow[node][i] += total[counter] 87 | counter += 1 88 | 89 | totalusage = 
np.sum(currentdeltas) 90 | if not np.isnan(totalusage) and int(round(totalusage)) > self._maxcores[node]: 91 | self._maxcores[node] = int(round(totalusage)) 92 | return True 93 | 94 | def results(self): 95 | duty_cycles = OrderedDict() 96 | for node in self._timeabove: 97 | if len(list(self._deltas[node].values())[0]) < self.MIN_DELTAS: 98 | return {"error": ProcessingError.INSUFFICIENT_DATA} 99 | 100 | duty_cycles[node] = OrderedDict() 101 | for i in self._timeabove[node]: 102 | total_time = self._timeabove[node][i] + self._timebelow[node][i] 103 | ratio = self._timeabove[node][i] / total_time 104 | duty_cycles[node]["cpu{}".format(i)] = ratio 105 | 106 | # Categorize the job's performance 107 | duty_list = np.array([value for node in duty_cycles.values() for value in node.values()]) 108 | 109 | if not any(value < self.GOOD_THRESHOLD for value in duty_list): 110 | category = "GOOD" 111 | elif not any(value >= self.LOW_THRESHOLD for value in duty_list): 112 | category = "LOW" 113 | else: 114 | high = np.sort(duty_list[duty_list >= self.LOW_THRESHOLD]) 115 | if high.size > self.MIN_HIGH_SIZE: 116 | if high[-1] - high[0] < self.MAX_DIFFERENCE: 117 | category = "PINNED" 118 | else: 119 | category = "UNPINNED" 120 | else: 121 | if high[0] >= self.MIN_HIGH_VALUE: 122 | category = "PINNED" 123 | else: 124 | category = "UNPINNED" 125 | 126 | return {"dutycycles": duty_cycles, "category": category, "maxcores": sum(self._maxcores.values())} 127 | -------------------------------------------------------------------------------- /src/supremm/plugins/CpuUserTimeseries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Timeseries generator module """ 3 | 4 | from supremm.plugin import Plugin 5 | from supremm.subsample import TimeseriesAccumulator 6 | from supremm.errors import ProcessingError 7 | import numpy 8 | from collections import Counter 9 | 10 | class CpuUserTimeseries(Plugin): 11 | """ Generate the CPU usage as a timeseries data """ 12 | 13 | name = property(lambda x: "cpuuser") 14 | mode = property(lambda x: "timeseries") 15 | requiredMetrics = property(lambda x: ["kernel.percpu.cpu.user"]) 16 | optionalMetrics = property(lambda x: []) 17 | derivedMetrics = property(lambda x: []) 18 | 19 | def __init__(self, job): 20 | super(CpuUserTimeseries, self).__init__(job) 21 | self._data = TimeseriesAccumulator(job.nodecount, self._job.walltime) 22 | self._hostdata = {} 23 | self._hostdevnames = {} 24 | self._cpusallowed = None 25 | 26 | def initcpus(self): 27 | if self._job.getdata('proc'): 28 | self._cpusallowed = self._job.getdata('proc')['cpusallowed'] 29 | else: 30 | self._cpusallowed = {} 31 | 32 | def process(self, nodemeta, timestamp, data, description): 33 | 34 | if self._cpusallowed == None: 35 | self.initcpus() 36 | 37 | if len(data[0]) == 0: 38 | # Skip datapoints that have no values 39 | return True 40 | 41 | if nodemeta.nodename in self._cpusallowed and 'error' not in self._cpusallowed[nodemeta.nodename]: 42 | cpudata = data[0][self._cpusallowed[nodemeta.nodename]] 43 | else: 44 | cpudata = data[0] 45 | 46 | hostidx = nodemeta.nodeindex 47 | 48 | if nodemeta.nodeindex not in self._hostdata: 49 | self._hostdata[hostidx] = numpy.empty((TimeseriesAccumulator.MAX_DATAPOINTS, len(cpudata))) 50 | if nodemeta.nodename in self._cpusallowed and 'error' not in self._cpusallowed[nodemeta.nodename]: 51 | self._hostdevnames[hostidx] = {} 52 | for i, cpuidx in enumerate(self._cpusallowed[nodemeta.nodename]): 53 | 
self._hostdevnames[hostidx][str(i)] = description[0][1][cpuidx] 54 | else: 55 | self._hostdevnames[hostidx] = dict((str(k), v) for k, v in zip(description[0][0], description[0][1])) 56 | 57 | insertat = self._data.adddata(hostidx, timestamp, numpy.mean(cpudata)/10.0) 58 | if insertat != None: 59 | self._hostdata[hostidx][insertat] = cpudata / 10.0 60 | 61 | return True 62 | 63 | def results(self): 64 | 65 | values = self._data.get() 66 | 67 | if len(values[0, :, 0]) < 3: 68 | return {"error": ProcessingError.JOB_TOO_SHORT} 69 | 70 | rates = numpy.diff(values[:, :, 1]) / numpy.diff(values[:, :, 0]) 71 | 72 | if len(self._hostdata) > 64: 73 | 74 | # Compute min, max & median data and only save the host data 75 | # for these hosts 76 | 77 | sortarr = numpy.argsort(rates.T, axis=1) 78 | 79 | retdata = { 80 | "min": self.collatedata(sortarr[:, 0], rates), 81 | "max": self.collatedata(sortarr[:, -1], rates), 82 | "med": self.collatedata(sortarr[:, sortarr.shape[1] // 2], rates), 83 | "times": values[0, 1:, 0].tolist(), 84 | "hosts": {} 85 | } 86 | 87 | uniqhosts = Counter(sortarr[:, 0]) 88 | uniqhosts.update(sortarr[:, -1]) 89 | uniqhosts.update(sortarr[:, sortarr.shape[1] // 2]) 90 | includelist = list(uniqhosts.keys()) 91 | else: 92 | # Save data for all hosts 93 | retdata = { 94 | "times": values[0, 1:, 0].tolist(), 95 | "hosts": {} 96 | } 97 | includelist = list(self._hostdata.keys()) 98 | 99 | 100 | for hostidx in includelist: 101 | retdata['hosts'][str(hostidx)] = {} 102 | retdata['hosts'][str(hostidx)]['all'] = rates[hostidx, :].tolist() 103 | retdata['hosts'][str(hostidx)]['dev'] = {} 104 | 105 | for devid in self._hostdevnames[hostidx].keys(): 106 | dpnts = len(values[hostidx, :, 0]) 107 | retdata['hosts'][str(hostidx)]['dev'][devid] = (numpy.diff(self._hostdata[hostidx][:dpnts, numpy.int(devid)]) / numpy.diff(values[hostidx, :, 0])).tolist() 108 | 109 | retdata['hosts'][str(hostidx)]['names'] = self._hostdevnames[hostidx] 110 | 111 | return retdata 112 | 113 | @staticmethod 114 | def collatedata(args, rates): 115 | """ build output data """ 116 | result = [] 117 | for timepoint, hostidx in enumerate(args): 118 | try: 119 | result.append([rates[hostidx, timepoint], int(hostidx)]) 120 | except IndexError: 121 | pass 122 | 123 | return result 124 | -------------------------------------------------------------------------------- /src/supremm/plugins/Gpfs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from supremm.plugin import DeviceBasedPlugin 4 | 5 | class Gpfs(DeviceBasedPlugin): 6 | """ This plugin processes lots of metric that are all interested in the difference over the process """ 7 | 8 | name = property(lambda x: "gpfs") 9 | requiredMetrics = property(lambda x: [ 10 | "gpfs.fsios.read_bytes", 11 | "gpfs.fsios.write_bytes", 12 | "gpfs.fsios.reads", 13 | "gpfs.fsios.writes" 14 | ]) 15 | optionalMetrics = property(lambda x: []) 16 | derivedMetrics = property(lambda x: []) 17 | 18 | 19 | -------------------------------------------------------------------------------- /src/supremm/plugins/GpfsTimeseries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Timeseries generator module """ 3 | 4 | from supremm.plugin import RateConvertingTimeseriesPlugin 5 | import numpy 6 | 7 | class GpfsTimeseries(RateConvertingTimeseriesPlugin): 8 | """ Generate the GPFS usage as a timeseries data """ 9 | 10 | name = property(lambda x: "lnet") 11 | requiredMetrics 
= property(lambda x: ["gpfs.fsios.read_bytes", "gpfs.fsios.write_bytes"]) 12 | optionalMetrics = property(lambda x: []) 13 | derivedMetrics = property(lambda x: []) 14 | 15 | def __init__(self, job): 16 | super(GpfsTimeseries, self).__init__(job) 17 | 18 | def computetimepoint(self, data): 19 | return (numpy.sum(data[0]) + numpy.sum(data[1])) / 1048576.0 20 | -------------------------------------------------------------------------------- /src/supremm/plugins/GpuPower.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Energy usage plugin """ 3 | 4 | from supremm.plugin import Plugin 5 | from supremm.statistics import RollingStats, calculate_stats, Integrator 6 | from supremm.errors import ProcessingError 7 | 8 | class GpuPower(Plugin): 9 | """ Compute the power statistics for a job """ 10 | 11 | name = property(lambda x: "gpupower") 12 | mode = property(lambda x: "all") 13 | requiredMetrics = property(lambda x: ["nvidia.powerused"]) 14 | optionalMetrics = property(lambda x: []) 15 | derivedMetrics = property(lambda x: []) 16 | 17 | def __init__(self, job): 18 | super(GpuPower, self).__init__(job) 19 | self._data = {} 20 | 21 | def process(self, nodemeta, timestamp, data, description): 22 | """ Power measurements are similar to the memory measurements the first and last data points 23 | are ignored and the statistics are computed over all of the other measurements. 24 | """ 25 | 26 | if not data or not description: 27 | # nvidia pmda can be running, but no cards present 28 | return False 29 | 30 | if nodemeta.nodeindex not in self._data: 31 | self._data[nodemeta.nodeindex] = { 32 | 'power': RollingStats(), 33 | 'energy': Integrator(timestamp), 34 | 'names': [x for x in description[0][1]] 35 | } 36 | return True 37 | 38 | hdata = self._data[nodemeta.nodeindex] 39 | 40 | power_watts = data[0] / 1000.0 41 | 42 | hdata['power'].append(power_watts) 43 | hdata['energy'].add(timestamp, power_watts) 44 | 45 | return True 46 | 47 | def results(self): 48 | 49 | result = {} 50 | for data in self._data.values(): 51 | 52 | if data['power'].count() < 1: 53 | continue 54 | 55 | for i, devicename in enumerate(data['names']): 56 | if devicename not in result: 57 | result[devicename] = {"meanpower": [], "maxpower": [], "energy": []} 58 | 59 | result[devicename]["meanpower"].append(data['power'].mean()[i]) 60 | result[devicename]["maxpower"].append(data['power'].max[i]) 61 | result[devicename]["energy"].append(data['energy'].total[i]) 62 | 63 | if not result: 64 | return {"error": ProcessingError.INSUFFICIENT_DATA} 65 | 66 | output = {} 67 | for device, data in result.items(): 68 | output[device] = { 69 | "power": { 70 | "mean": calculate_stats(data['meanpower']), 71 | "max": calculate_stats(data['maxpower']) 72 | }, 73 | "energy": calculate_stats(data['energy']) 74 | } 75 | output[device]['energy']['total'] = output[device]['energy']['avg'] * output[device]['energy']['cnt'] 76 | 77 | return output 78 | -------------------------------------------------------------------------------- /src/supremm/plugins/GpuUsage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ GPU statistics """ 3 | 4 | from supremm.plugin import Plugin 5 | from supremm.statistics import RollingStats, calculate_stats 6 | 7 | class GpuUsage(Plugin): 8 | """ Compute the overall gpu usage for a job """ 9 | 10 | name = property(lambda x: "gpu") 11 | mode = property(lambda x: "all") 12 | requiredMetrics = 
property(lambda x: ["nvidia.gpuactive", "nvidia.memused"]) 13 | optionalMetrics = property(lambda x: ["nvidia.memactive"]) 14 | derivedMetrics = property(lambda x: []) 15 | 16 | def __init__(self, job): 17 | super(GpuUsage, self).__init__(job) 18 | self._data = {} 19 | self.statnames = None 20 | 21 | def process(self, nodemeta, timestamp, data, description): 22 | 23 | if len(description) == 0 or len(data[0]) == 0: 24 | # nvidia pmda can be running, but no cards present 25 | return False 26 | 27 | if nodemeta.nodename not in self._data: 28 | if self.statnames == None: 29 | self.statnames = ['gpuactive', 'memused'] 30 | if len(data) == 3: 31 | self.statnames.append('memactive') 32 | 33 | self._data[nodemeta.nodename] = {} 34 | for statname in self.statnames: 35 | self._data[nodemeta.nodename][statname] = RollingStats() 36 | 37 | self._data[nodemeta.nodename]['names'] = [x for x in description[0][1]] 38 | 39 | for idx, statname in enumerate(self.statnames): 40 | self._data[nodemeta.nodename][statname].append(1.0 * data[idx]) 41 | 42 | return True 43 | 44 | def results(self): 45 | 46 | result = {} 47 | for data in self._data.values(): 48 | for i, devicename in enumerate(data['names']): 49 | if devicename not in result: 50 | result[devicename] = {} 51 | for statname in self.statnames: 52 | result[devicename][statname] = [] 53 | result[devicename][statname + "max"] = [] 54 | for statname in self.statnames: 55 | result[devicename][statname].append(data[statname].mean()[i]) 56 | result[devicename][statname + "max"].append(data[statname].max[i]) 57 | 58 | output = {} 59 | for device, data in result.items(): 60 | output[device] = {} 61 | for statname, datalist in data.items(): 62 | output[device][statname] = calculate_stats(datalist) 63 | 64 | if len(output) == 0: 65 | output['error'] = "no data" 66 | 67 | return output 68 | -------------------------------------------------------------------------------- /src/supremm/plugins/GpuUsageTimeseries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Timeseries generator module """ 3 | 4 | from supremm.plugin import Plugin 5 | from supremm.subsample import TimeseriesAccumulator 6 | import numpy 7 | from collections import Counter 8 | 9 | class GpuUsageTimeseries(Plugin): 10 | """ Generate the CPU usage as a timeseries data """ 11 | 12 | name = property(lambda x: "gpu_usage") 13 | mode = property(lambda x: "timeseries") 14 | requiredMetrics = property(lambda x: ["nvidia.gpuactive"]) 15 | optionalMetrics = property(lambda x: []) 16 | derivedMetrics = property(lambda x: []) 17 | 18 | def __init__(self, job): 19 | super(GpuUsageTimeseries, self).__init__(job) 20 | self._data = TimeseriesAccumulator(job.nodecount, self._job.walltime) 21 | self._hostdata = {} 22 | self._hostdevnames = {} 23 | 24 | def process(self, nodemeta, timestamp, data, description): 25 | 26 | hostidx = nodemeta.nodeindex 27 | 28 | if len(data[0]) == 0: 29 | # Skip data point with no data 30 | return True 31 | 32 | if nodemeta.nodeindex not in self._hostdata: 33 | self._hostdata[hostidx] = numpy.empty((TimeseriesAccumulator.MAX_DATAPOINTS, len(data[0]))) 34 | self._hostdevnames[hostidx] = dict((str(k), str(v)) for k, v in zip(description[0][0], description[0][1])) 35 | 36 | avg_usage = numpy.mean(data[0]) 37 | insertat = self._data.adddata(hostidx, timestamp, avg_usage) 38 | if insertat != None: 39 | self._hostdata[hostidx][insertat] = data[0] 40 | 41 | return True 42 | 43 | def results(self): 44 | 45 | values = 
self._data.get() 46 | 47 | if len(self._hostdata) > 64: 48 | 49 | # Compute min, max & median data and only save the host data 50 | # for these hosts 51 | 52 | memdata = values[:, :, 1] 53 | sortarr = numpy.argsort(memdata.T, axis=1) 54 | 55 | retdata = { 56 | "min": self.collatedata(sortarr[:, 0], memdata), 57 | "max": self.collatedata(sortarr[:, -1], memdata), 58 | "med": self.collatedata(sortarr[:, sortarr.shape[1] // 2], memdata), 59 | "times": values[0, :, 0].tolist(), 60 | "hosts": {} 61 | } 62 | 63 | uniqhosts = Counter(sortarr[:, 0]) 64 | uniqhosts.update(sortarr[:, -1]) 65 | uniqhosts.update(sortarr[:, sortarr.shape[1] // 2]) 66 | includelist = list(uniqhosts.keys()) 67 | else: 68 | # Save data for all hosts 69 | retdata = { 70 | "times": values[0, :, 0].tolist(), 71 | "hosts": {} 72 | } 73 | includelist = list(self._hostdata.keys()) 74 | 75 | 76 | for hostidx in includelist: 77 | retdata['hosts'][str(hostidx)] = {} 78 | retdata['hosts'][str(hostidx)]['all'] = values[hostidx, :, 1].tolist() 79 | retdata['hosts'][str(hostidx)]['dev'] = {} 80 | 81 | for devid in self._hostdevnames[hostidx].keys(): 82 | dpnts = len(values[hostidx, :, 0]) 83 | retdata['hosts'][str(hostidx)]['dev'][devid] = self._hostdata[hostidx][:dpnts, int(devid)].tolist() 84 | 85 | retdata['hosts'][str(hostidx)]['names'] = self._hostdevnames[hostidx] 86 | 87 | return retdata 88 | 89 | @staticmethod 90 | def collatedata(args, rates): 91 | """ build output data """ 92 | result = [] 93 | for timepoint, hostidx in enumerate(args): 94 | try: 95 | result.append([rates[hostidx, timepoint], int(hostidx)]) 96 | except IndexError: 97 | pass 98 | 99 | return result 100 | -------------------------------------------------------------------------------- /src/supremm/plugins/InfiniBand.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from supremm.plugin import DeviceBasedPlugin 4 | 5 | class InfiniBand(DeviceBasedPlugin): 6 | """ This plugin processes lots of metric that are all interested in the difference over the process """ 7 | 8 | name = property(lambda x: "infiniband") 9 | requiredMetrics = property(lambda x: [ 10 | "infiniband.port.switch.in.bytes", 11 | "infiniband.port.switch.in.packets", 12 | "infiniband.port.switch.out.bytes", 13 | "infiniband.port.switch.out.packets" 14 | ]) 15 | optionalMetrics = property(lambda x: []) 16 | derivedMetrics = property(lambda x: []) 17 | -------------------------------------------------------------------------------- /src/supremm/plugins/InfiniBandTimeseries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Timeseries generator module """ 3 | 4 | from supremm.plugin import RateConvertingTimeseriesPlugin 5 | import numpy 6 | 7 | class InfiniBandTimeseries(RateConvertingTimeseriesPlugin): 8 | """ Generate the infiniband usage as a timeseries data """ 9 | 10 | name = property(lambda x: "ib_lnet") 11 | mode = property(lambda x: "timeseries") 12 | requiredMetrics = property(lambda x: ["infiniband.port.switch.in.bytes", "infiniband.port.switch.out.bytes"]) 13 | optionalMetrics = property(lambda x: []) 14 | derivedMetrics = property(lambda x: []) 15 | 16 | def __init__(self, job): 17 | super(InfiniBandTimeseries, self).__init__(job) 18 | 19 | def computetimepoint(self, data): 20 | return (numpy.sum(data[0]) + numpy.sum(data[1])) / 1048576.0 21 | -------------------------------------------------------------------------------- 
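BlockTimeseries, GpfsTimeseries and the InfiniBandTimeseries plugin above all follow the same pattern: declare the cumulative byte counters in requiredMetrics and have computetimepoint() return their sum divided by 1048576.0 (bytes to MiB); the RateConvertingTimeseriesPlugin base class then, as its name suggests, differences successive timepoints to produce a rate. The sketch below shows that minimal shape; the ExampleBytesTimeseries class and the example.dev.* metric names are placeholders, not real PCP metrics or part of the repository.

import numpy

from supremm.plugin import RateConvertingTimeseriesPlugin

class ExampleBytesTimeseries(RateConvertingTimeseriesPlugin):
    """ Hypothetical sketch: combined read+write traffic for a device, in MiB """

    name = property(lambda x: "example_bytes")
    requiredMetrics = property(lambda x: ["example.dev.read_bytes", "example.dev.write_bytes"])
    optionalMetrics = property(lambda x: [])
    derivedMetrics = property(lambda x: [])

    def __init__(self, job):
        super(ExampleBytesTimeseries, self).__init__(job)

    def computetimepoint(self, data):
        # data[i] holds the per-device values of requiredMetrics[i] at one timestamp;
        # summing the counters and scaling by 1/1048576 reports cumulative MiB, which the
        # base class converts into a MiB/s rate between consecutive timepoints.
        return (numpy.sum(data[0]) + numpy.sum(data[1])) / 1048576.0

Of the byte-counter plugins shown here, only InfiniBandTimeseries sets the mode property explicitly; the others appear to rely on a default supplied by the base class.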
/src/supremm/plugins/IpmiPower.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Energy usage plugin """ 3 | 4 | import numpy 5 | 6 | from supremm.plugin import Plugin 7 | from supremm.statistics import RollingStats, calculate_stats, Integrator 8 | from supremm.errors import ProcessingError 9 | 10 | class IpmiPower(Plugin): 11 | """ Compute the power statistics for a job """ 12 | 13 | name = property(lambda x: "ipmi") 14 | mode = property(lambda x: "all") 15 | requiredMetrics = property(lambda x: ["ipmi.dcmi.power"]) 16 | optionalMetrics = property(lambda x: []) 17 | derivedMetrics = property(lambda x: []) 18 | 19 | def __init__(self, job): 20 | super(IpmiPower, self).__init__(job) 21 | self._data = {} 22 | 23 | def process(self, nodemeta, timestamp, data, description): 24 | """ Power measurements are similar to the memory measurements the first and last data points 25 | are ignored and the statistics are computed over all of the other measurements. 26 | """ 27 | 28 | if not data or data[0].size == 0: 29 | return True 30 | 31 | if data[0][0] < numpy.finfo(numpy.float64).eps: 32 | # Some IPMI stacks return a zero value if they don't support power metrics 33 | return False 34 | 35 | if nodemeta.nodeindex not in self._data: 36 | self._data[nodemeta.nodeindex] = { 37 | 'power': RollingStats(), 38 | 'energy': Integrator(timestamp) 39 | } 40 | return True 41 | 42 | hdata = self._data[nodemeta.nodeindex] 43 | 44 | hdata['power'].append(data[0][0]) 45 | hdata['energy'].add(timestamp, data[0][0]) 46 | 47 | return True 48 | 49 | def results(self): 50 | 51 | meanpower = [] 52 | maxpower = [] 53 | 54 | energy = [] 55 | time_covered = 0 56 | 57 | for pdata in self._data.values(): 58 | if pdata['power'].count() > 0: 59 | meanpower.append(pdata['power'].mean()) 60 | maxpower.append(pdata['power'].max) 61 | energy.append(pdata['energy'].total) 62 | time_covered += pdata['energy'].elapsed 63 | 64 | total_energy = numpy.sum(energy) 65 | 66 | if total_energy < numpy.finfo(numpy.float64).eps: 67 | return {"error": ProcessingError.RAW_COUNTER_UNAVAILABLE} 68 | 69 | if time_covered < 0.9 * self._job.nodecount * self._job.walltime: 70 | return {"error": ProcessingError.INSUFFICIENT_DATA} 71 | 72 | if not meanpower: 73 | return {"error": ProcessingError.INSUFFICIENT_DATA} 74 | 75 | energy_stats = calculate_stats(energy) 76 | energy_stats['total'] = total_energy 77 | 78 | return { 79 | "power": { 80 | "mean": calculate_stats(meanpower), 81 | "max": calculate_stats(maxpower) 82 | }, 83 | "energy": energy_stats 84 | } 85 | -------------------------------------------------------------------------------- /src/supremm/plugins/Lnet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Lnet statistics """ 3 | 4 | from supremm.plugin import Plugin 5 | from supremm.statistics import calculate_stats 6 | import numpy 7 | 8 | class Lnet(Plugin): 9 | """ Compute the overall lnet usage for a job """ 10 | 11 | name = property(lambda x: "lnet") 12 | mode = property(lambda x: "firstlast") 13 | requiredMetrics = property(lambda x: ["lustre.lnet.drop_length", "lustre.lnet.recv_length", "lustre.lnet.send_length", "lustre.lnet.drop_count", "lustre.lnet.recv_count", "lustre.lnet.send_count"]) 14 | optionalMetrics = property(lambda x: []) 15 | derivedMetrics = property(lambda x: []) 16 | 17 | def __init__(self, job): 18 | super(Lnet, self).__init__(job) 19 | self._first = {} 20 | self._data = 
numpy.empty((job.nodecount, len(self.requiredMetrics))) 21 | self._hostidx = 0 22 | 23 | def process(self, nodemeta, timestamp, data, description): 24 | 25 | vals = numpy.array(data)[:, 0] 26 | 27 | if nodemeta.nodename not in self._first: 28 | self._first[nodemeta.nodename] = vals 29 | return True 30 | 31 | self._data[self._hostidx, :] = vals - self._first[nodemeta.nodename] 32 | self._hostidx += 1 33 | 34 | return True 35 | 36 | def results(self): 37 | 38 | output = {} 39 | 40 | for i, nicename in enumerate(['drop', 'recv', 'send', 'drop_count', 'recv_count', 'send_count']): 41 | output[nicename] = calculate_stats(self._data[:self._hostidx, i]) 42 | 43 | return output 44 | -------------------------------------------------------------------------------- /src/supremm/plugins/LoadAvg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Load Average plugin """ 3 | 4 | from supremm.plugin import Plugin 5 | from supremm.statistics import RollingStats, calculate_stats 6 | from supremm.errors import ProcessingError 7 | 8 | class LoadAvg(Plugin): 9 | """ Process the load average metrics """ 10 | 11 | name = property(lambda x: "load1") 12 | mode = property(lambda x: "all") 13 | requiredMetrics = property(lambda x: ["kernel.all.load"]) 14 | optionalMetrics = property(lambda x: []) 15 | derivedMetrics = property(lambda x: []) 16 | 17 | def __init__(self, job): 18 | super(LoadAvg, self).__init__(job) 19 | self._data = {} 20 | 21 | def process(self, nodemeta, timestamp, data, description): 22 | """ Computes the mean and max values of the load average for each node 23 | optionally normalizes this data to be per core (if the core count is available) 24 | """ 25 | 26 | if data[0].size < 1: 27 | return True 28 | 29 | if nodemeta.nodename not in self._data: 30 | self._data[nodemeta.nodename] = RollingStats() 31 | return True 32 | 33 | self._data[nodemeta.nodename].append(data[0][0]) 34 | 35 | return True 36 | 37 | def results(self): 38 | 39 | meanval = [] 40 | maxval = [] 41 | meanvalpercore = [] 42 | maxvalpercore = [] 43 | 44 | hinv = self._job.getdata('hinv') 45 | 46 | for nodename, loaddata in self._data.items(): 47 | if loaddata.count() > 0: 48 | meanval.append(loaddata.mean()) 49 | maxval.append(loaddata.max) 50 | 51 | if hinv != None and nodename in hinv: 52 | meanvalpercore.append(loaddata.mean() / hinv[nodename]['cores']) 53 | maxvalpercore.append(loaddata.max / hinv[nodename]['cores']) 54 | 55 | if len(meanval) == 0: 56 | return {"error": ProcessingError.INSUFFICIENT_DATA} 57 | 58 | results = { 59 | "mean": calculate_stats(meanval), 60 | "max": calculate_stats(maxval) 61 | } 62 | 63 | if len(meanvalpercore) > 0: 64 | results['meanpercore'] = calculate_stats(meanvalpercore) 65 | results['maxpercore'] = calculate_stats(maxvalpercore) 66 | 67 | return results 68 | 69 | -------------------------------------------------------------------------------- /src/supremm/plugins/Lustre.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from supremm.plugin import DeviceBasedPlugin 4 | 5 | class Lustre(DeviceBasedPlugin): 6 | """ This plugin processes lots of metric that are all interested in the difference over the process """ 7 | 8 | name = property(lambda x: "lustre") 9 | requiredMetrics = property(lambda x: [ 10 | "lustre.llite.read_bytes.total", 11 | "lustre.llite.write_bytes.total" 12 | ]) 13 | optionalMetrics = property(lambda x: []) 14 | derivedMetrics = property(lambda 
x: []) 15 | 16 | 17 | -------------------------------------------------------------------------------- /src/supremm/plugins/LustreTimeseries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Timeseries generator module """ 3 | 4 | from supremm.plugin import RateConvertingTimeseriesPlugin 5 | import numpy 6 | 7 | class LustreTimeseries(RateConvertingTimeseriesPlugin): 8 | """ Generate the Lustre usage as a timeseries data """ 9 | 10 | name = property(lambda x: "lnet") 11 | requiredMetrics = property(lambda x: ["lustre.llite.read_bytes.total", "lustre.llite.write_bytes.total"]) 12 | optionalMetrics = property(lambda x: []) 13 | derivedMetrics = property(lambda x: []) 14 | 15 | def __init__(self, job): 16 | super(LustreTimeseries, self).__init__(job) 17 | 18 | def computetimepoint(self, data): 19 | return (numpy.sum(data[0]) + numpy.sum(data[1])) / 1048576.0 20 | -------------------------------------------------------------------------------- /src/supremm/plugins/MemUsageTimeseries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Timeseries generator module """ 3 | 4 | from supremm.plugin import Plugin 5 | from supremm.subsample import TimeseriesAccumulator 6 | import numpy 7 | from collections import Counter 8 | 9 | class MemUsageTimeseries(Plugin): 10 | """ Generate the CPU usage as a timeseries data """ 11 | 12 | name = property(lambda x: "memused_minus_diskcache") 13 | mode = property(lambda x: "timeseries") 14 | requiredMetrics = property(lambda x: ["mem.numa.util.used", "mem.numa.util.filePages", "mem.numa.util.slab"]) 15 | optionalMetrics = property(lambda x: []) 16 | derivedMetrics = property(lambda x: []) 17 | 18 | def __init__(self, job): 19 | super(MemUsageTimeseries, self).__init__(job) 20 | self._data = TimeseriesAccumulator(job.nodecount, self._job.walltime) 21 | self._hostdata = {} 22 | self._hostdevnames = {} 23 | 24 | def process(self, nodemeta, timestamp, data, description): 25 | 26 | hostidx = nodemeta.nodeindex 27 | 28 | if len(data[0]) == 0: 29 | # Skip data point with no data 30 | return True 31 | 32 | if nodemeta.nodeindex not in self._hostdata: 33 | self._hostdata[hostidx] = numpy.empty((TimeseriesAccumulator.MAX_DATAPOINTS, len(data[0]))) 34 | self._hostdevnames[hostidx] = dict((str(k), "numa " + v) for k, v in zip(description[0][0], description[0][1])) 35 | 36 | nodemem_kb = numpy.sum(data[0]) - numpy.sum(data[1]) - numpy.sum(data[2]) 37 | insertat = self._data.adddata(hostidx, timestamp, nodemem_kb / 1048576.0) 38 | if insertat != None: 39 | self._hostdata[hostidx][insertat] = (data[0] - data[1] - data[2]) / 1048576.0 40 | 41 | return True 42 | 43 | def results(self): 44 | 45 | values = self._data.get() 46 | 47 | if len(self._hostdata) > 64: 48 | 49 | # Compute min, max & median data and only save the host data 50 | # for these hosts 51 | 52 | memdata = values[:, :, 1] 53 | sortarr = numpy.argsort(memdata.T, axis=1) 54 | 55 | retdata = { 56 | "min": self.collatedata(sortarr[:, 0], memdata), 57 | "max": self.collatedata(sortarr[:, -1], memdata), 58 | "med": self.collatedata(sortarr[:, sortarr.shape[1] // 2], memdata), 59 | "times": values[0, :, 0].tolist(), 60 | "hosts": {} 61 | } 62 | 63 | uniqhosts = Counter(sortarr[:, 0]) 64 | uniqhosts.update(sortarr[:, -1]) 65 | uniqhosts.update(sortarr[:, sortarr.shape[1] // 2]) 66 | includelist = list(uniqhosts.keys()) 67 | else: 68 | # Save data for all hosts 69 | retdata = { 70 | 
"times": values[0, :, 0].tolist(), 71 | "hosts": {} 72 | } 73 | includelist = list(self._hostdata.keys()) 74 | 75 | 76 | for hostidx in includelist: 77 | retdata['hosts'][str(hostidx)] = {} 78 | retdata['hosts'][str(hostidx)]['all'] = values[hostidx, :, 1].tolist() 79 | retdata['hosts'][str(hostidx)]['dev'] = {} 80 | 81 | for devid in self._hostdevnames[hostidx].keys(): 82 | dpnts = len(values[hostidx, :, 0]) 83 | retdata['hosts'][str(hostidx)]['dev'][devid] = self._hostdata[hostidx][:dpnts, numpy.int(devid)].tolist() 84 | 85 | retdata['hosts'][str(hostidx)]['names'] = self._hostdevnames[hostidx] 86 | 87 | return retdata 88 | 89 | @staticmethod 90 | def collatedata(args, rates): 91 | """ build output data """ 92 | result = [] 93 | for timepoint, hostidx in enumerate(args): 94 | try: 95 | result.append([rates[hostidx, timepoint], int(hostidx)]) 96 | except IndexError: 97 | pass 98 | 99 | return result 100 | -------------------------------------------------------------------------------- /src/supremm/plugins/MemoryUsage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Memory usage plugin """ 3 | 4 | from supremm.plugin import Plugin 5 | from supremm.statistics import RollingStats, calculate_stats 6 | from supremm.errors import ProcessingError 7 | 8 | class MemoryUsage(Plugin): 9 | """ Compute the overall memory usage for a job """ 10 | 11 | name = property(lambda x: "memory") 12 | mode = property(lambda x: "all") 13 | requiredMetrics = property(lambda x: ["mem.numa.util.used", "mem.numa.util.filePages", "mem.numa.util.slab", "kernel.percpu.cpu.user"]) 14 | optionalMetrics = property(lambda x: []) 15 | derivedMetrics = property(lambda x: []) 16 | 17 | def __init__(self, job): 18 | super(MemoryUsage, self).__init__(job) 19 | self._data = {} 20 | self._hostcpucounts = {} 21 | 22 | def process(self, nodemeta, timestamp, data, description): 23 | """ Memory statistics are the aritmetic mean of all values except the 24 | first and last rather than storing all of the meory measurements for 25 | the job, we use the RollingStats() class to keep track of the mean 26 | values. Since we don't know which data point is the last one, we update 27 | the RollingStats with the value from the previous timestep at each timestep. 
28 | """ 29 | 30 | if nodemeta.nodeindex not in self._data: 31 | self._data[nodemeta.nodeindex] = {'usedval': None, 32 | 'used': RollingStats(), 33 | 'usedminusval': None, 34 | 'usedminus': RollingStats()} 35 | return True 36 | 37 | if nodemeta.nodeindex not in self._hostcpucounts and data[3].size > 0: 38 | self._hostcpucounts[nodemeta.nodeindex] = data[3].size 39 | 40 | hdata = self._data[nodemeta.nodeindex] 41 | 42 | if hdata['usedval'] != None: 43 | hdata['used'].append(hdata['usedval']) 44 | hdata['usedminus'].append(hdata['usedminusval']) 45 | 46 | hdata['usedval'] = sum(data[0]) 47 | hdata['usedminusval'] = (sum(data[0]) - sum(data[1]) - sum(data[2])) 48 | 49 | return True 50 | 51 | def results(self): 52 | 53 | memused = [] 54 | memusedminus = [] 55 | 56 | for hostidx, memdata in self._data.items(): 57 | if hostidx not in self._hostcpucounts: 58 | return {"error": ProcessingError.INSUFFICIENT_HOSTDATA} 59 | if memdata['used'].count() > 0: 60 | memused.append(memdata['used'].mean() / self._hostcpucounts[hostidx]) 61 | if memdata['usedminus'].count() > 0: 62 | memusedminus.append(memdata['usedminus'].mean() / self._hostcpucounts[hostidx]) 63 | 64 | if len(memused) == 0: 65 | return {"error": ProcessingError.INSUFFICIENT_DATA} 66 | 67 | return {"used": calculate_stats(memused), "used_minus_cache": calculate_stats(memusedminus)} 68 | -------------------------------------------------------------------------------- /src/supremm/plugins/Network.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from supremm.plugin import DeviceBasedPlugin 4 | 5 | class Network(DeviceBasedPlugin): 6 | """ This plugin processes lots of metric that are all interested in the difference over the process """ 7 | 8 | name = property(lambda x: "network") 9 | requiredMetrics = property(lambda x: [ 10 | "network.interface.in.bytes", 11 | "network.interface.out.bytes", 12 | ]) 13 | optionalMetrics = property(lambda x: [ 14 | "network.interface.in.packets" 15 | "network.interface.out.packets" 16 | ]) 17 | derivedMetrics = property(lambda x: []) 18 | 19 | 20 | -------------------------------------------------------------------------------- /src/supremm/plugins/Nfs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from supremm.plugin import DeviceBasedPlugin 4 | 5 | class Nfs(DeviceBasedPlugin): 6 | """ Generate usage statistics for NFS clients """ 7 | 8 | name = property(lambda x: "nfs") 9 | requiredMetrics = property(lambda x: [ 10 | "nfsclient.bytes.read.normal", 11 | "nfsclient.bytes.read.direct", 12 | "nfsclient.bytes.read.server", 13 | "nfsclient.bytes.write.normal", 14 | "nfsclient.bytes.write.direct", 15 | "nfsclient.bytes.write.server" 16 | ]) 17 | optionalMetrics = property(lambda x: []) 18 | derivedMetrics = property(lambda x: []) 19 | 20 | 21 | -------------------------------------------------------------------------------- /src/supremm/plugins/NfsTimeseries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Timeseries generator module """ 3 | 4 | from supremm.plugin import RateConvertingTimeseriesPlugin 5 | import numpy 6 | 7 | class NfsTimeseries(RateConvertingTimeseriesPlugin): 8 | """ Generate timeseries summary for NFS usage data """ 9 | 10 | name = property(lambda x: "nfs") 11 | requiredMetrics = property(lambda x: ["nfsclient.bytes.read.normal", 12 | "nfsclient.bytes.read.direct", 13 | 
"nfsclient.bytes.read.server", 14 | "nfsclient.bytes.write.normal", 15 | "nfsclient.bytes.write.direct", 16 | "nfsclient.bytes.write.server"]) 17 | optionalMetrics = property(lambda x: []) 18 | derivedMetrics = property(lambda x: []) 19 | 20 | def __init__(self, job): 21 | super(NfsTimeseries, self).__init__(job) 22 | 23 | def computetimepoint(self, data): 24 | try: 25 | return numpy.sum(numpy.array(data)) / 1048576.0 26 | except ValueError: 27 | # NFS mount points can dissapear / appear during the job 28 | # skip points that are inconsistent with the first point 29 | return None 30 | -------------------------------------------------------------------------------- /src/supremm/plugins/NodeMemoryUsage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Memory usage plugin """ 3 | 4 | from supremm.plugin import Plugin 5 | from supremm.statistics import RollingStats, calculate_stats 6 | from supremm.errors import ProcessingError 7 | 8 | class NodeMemoryUsage(Plugin): 9 | """ Compute the overall memory usage for a job """ 10 | 11 | name = property(lambda x: "nodememory") 12 | mode = property(lambda x: "all") 13 | requiredMetrics = property(lambda x: [["mem.freemem", "mem.physmem"], ["mem.util.free", "hinv.physmem", "mem.util.cached"]]) 14 | optionalMetrics = property(lambda x: []) 15 | derivedMetrics = property(lambda x: []) 16 | 17 | def __init__(self, job): 18 | super(NodeMemoryUsage, self).__init__(job) 19 | self._data = {} 20 | 21 | def process(self, nodemeta, timestamp, data, description): 22 | """ Memory statistics are the aritmetic mean of all values except the 23 | first and last rather than storing all of the memory measurements for 24 | the job, we use the RollingStats() class to keep track of the mean 25 | values. Since we don't know which data point is the last one, we update 26 | the RollingStats with the value from the previous timestep at each timestep. 
27 | """ 28 | 29 | if nodemeta.nodeindex not in self._data: 30 | self._data[nodemeta.nodeindex] = {'freeval': None, 31 | 'free': RollingStats(), 32 | 'cached': None, 33 | 'physmem': None} 34 | return True 35 | 36 | hdata = self._data[nodemeta.nodeindex] 37 | 38 | if hdata['freeval'] != None: 39 | hdata['free'].append(hdata['freeval']) 40 | 41 | if len(data[0]) > 0: 42 | hdata['freeval'] = data[0][0] 43 | 44 | if hdata['physmem'] == None and len(data[1]) > 0: 45 | hdata['physmem'] = data[1][0] 46 | if len(data) == 3: 47 | hdata['physmem'] *= 1024.0 48 | 49 | if len(data) == 3: 50 | if hdata['cached'] == None: 51 | hdata['cached'] = RollingStats() 52 | 53 | hdata['cached'].append(data[0][0] + data[2][0]) 54 | 55 | return True 56 | 57 | def results(self): 58 | 59 | memused = [] 60 | memusedminus = [] 61 | maxmemused = [] 62 | maxmemusedminus = [] 63 | memfree = [] 64 | maxmemfree = [] 65 | physmem = [] 66 | 67 | for hostidx, memdata in self._data.items(): 68 | if memdata['free'].count() > 0: 69 | memfree.append(memdata['free'].mean()) 70 | maxmemfree.append(memdata['free'].max) 71 | 72 | if memdata['physmem'] != None: 73 | memused.append(memdata['physmem'] - memdata['free'].mean()) 74 | maxmemused.append(memdata['physmem'] - memdata['free'].min) 75 | physmem.append(memdata['physmem']) 76 | 77 | if memdata['cached'] != None: 78 | memusedminus.append(memdata['physmem'] - memdata['cached'].mean()) 79 | maxmemusedminus.append(memdata['physmem'] - memdata['cached'].min) 80 | 81 | if len(memused) == 0: 82 | return {"error": ProcessingError.INSUFFICIENT_DATA} 83 | 84 | result = {"used": calculate_stats(memused), 85 | "maxused": calculate_stats(maxmemused), 86 | "free": calculate_stats(memfree), 87 | "physmem": calculate_stats(physmem), 88 | "maxfree": calculate_stats(maxmemfree)} 89 | 90 | if len(memusedminus) > 0: 91 | result['used_minus_cache'] = calculate_stats(memusedminus) 92 | result['maxused_minus_cache'] = calculate_stats(maxmemusedminus) 93 | 94 | return result 95 | -------------------------------------------------------------------------------- /src/supremm/plugins/PowerUsageTimeseries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Timeseries generator module """ 3 | 4 | from collections import Counter 5 | import numpy 6 | 7 | from supremm.plugin import Plugin 8 | from supremm.subsample import TimeseriesAccumulator 9 | from supremm.errors import ProcessingError 10 | 11 | class PowerUsageTimeseries(Plugin): 12 | """ Generate the Power usage as a timeseries data """ 13 | 14 | name = property(lambda x: "power") 15 | mode = property(lambda x: "timeseries") 16 | requiredMetrics = property(lambda x: ["ipmi.dcmi.power"]) 17 | optionalMetrics = property(lambda x: []) 18 | derivedMetrics = property(lambda x: []) 19 | 20 | def __init__(self, job): 21 | super(PowerUsageTimeseries, self).__init__(job) 22 | self._data = TimeseriesAccumulator(job.nodecount, self._job.walltime) 23 | self._hostdata = {} 24 | 25 | @staticmethod 26 | def computetimepoint(data): 27 | """ Get the power usage from the data """ 28 | if data[0][0] < numpy.finfo(numpy.float64).eps: 29 | return None 30 | 31 | return data[0][0] 32 | 33 | def process(self, nodemeta, timestamp, data, description): 34 | 35 | if not data[0]: 36 | # Skip data point with no data 37 | return True 38 | 39 | if nodemeta.nodeindex not in self._hostdata: 40 | self._hostdata[nodemeta.nodeindex] = 1 41 | 42 | datum = self.computetimepoint(data) 43 | if datum != None: 44 | 
self._data.adddata(nodemeta.nodeindex, timestamp, datum) 45 | 46 | return True 47 | 48 | def results(self): 49 | 50 | if len(self._hostdata) != self._job.nodecount: 51 | return {"error": ProcessingError.INSUFFICIENT_HOSTDATA} 52 | 53 | values = self._data.get() 54 | 55 | if len(values[0, :, 0]) < 3: 56 | return {"error": ProcessingError.JOB_TOO_SHORT} 57 | 58 | power = values[:, :, 1] 59 | 60 | if len(self._hostdata) > 64: 61 | 62 | # Compute min, max & median data and only save the host data 63 | # for these hosts 64 | 65 | sortarr = numpy.argsort(power.T, axis=1) 66 | 67 | retdata = { 68 | "min": self.collatedata(sortarr[:, 0], power), 69 | "max": self.collatedata(sortarr[:, -1], power), 70 | "med": self.collatedata(sortarr[:, sortarr.shape[1] // 2], power), 71 | "times": values[0, :, 0].tolist(), 72 | "hosts": {} 73 | } 74 | 75 | uniqhosts = Counter(sortarr[:, 0]) 76 | uniqhosts.update(sortarr[:, -1]) 77 | uniqhosts.update(sortarr[:, sortarr.shape[1] // 2]) 78 | includelist = list(uniqhosts.keys()) 79 | else: 80 | # Save data for all hosts 81 | retdata = { 82 | "times": values[0, :, 0].tolist(), 83 | "hosts": {} 84 | } 85 | includelist = list(self._hostdata.keys()) 86 | 87 | 88 | for hostidx in includelist: 89 | retdata['hosts'][str(hostidx)] = {} 90 | retdata['hosts'][str(hostidx)]['all'] = power[hostidx, :].tolist() 91 | 92 | return retdata 93 | 94 | @staticmethod 95 | def collatedata(args, rates): 96 | """ build output data """ 97 | result = [] 98 | for timepoint, hostidx in enumerate(args): 99 | try: 100 | result.append([rates[hostidx, timepoint], int(hostidx)]) 101 | except IndexError: 102 | pass 103 | 104 | return result 105 | -------------------------------------------------------------------------------- /src/supremm/plugins/SimdInsTimeseries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Timeseries generator module """ 3 | 4 | from supremm.plugin import Plugin 5 | from supremm.subsample import TimeseriesAccumulator 6 | from supremm.errors import ProcessingError 7 | import numpy 8 | from collections import Counter 9 | 10 | SNB_METRICS = ["perfevent.hwcounters.SIMD_FP_256_PACKED_DOUBLE.value", 11 | "perfevent.hwcounters.FP_COMP_OPS_EXE_SSE_SCALAR_DOUBLE.value", 12 | "perfevent.hwcounters.FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE.value", 13 | "perfevent.hwcounters.SIMD_FP_256_PACKED_DOUBLE.value", 14 | "perfevent.hwcounters.FP_COMP_OPS_EXE_X87.value"] 15 | 16 | NHM_METRICS = ["perfevent.hwcounters.FP_COMP_OPS_EXE_SSE_FP.value"] 17 | 18 | INTERLAGOS_METRICS = ["perfevent.hwcounters.RETIRED_SSE_OPS_ALL.value"] 19 | 20 | class SimdInsTimeseries(Plugin): 21 | """ Generate the CPU usage as a timeseries data """ 22 | 23 | name = property(lambda x: "simdins") 24 | mode = property(lambda x: "timeseries") 25 | requiredMetrics = property(lambda x: [SNB_METRICS, NHM_METRICS, INTERLAGOS_METRICS]) 26 | optionalMetrics = property(lambda x: []) 27 | derivedMetrics = property(lambda x: []) 28 | 29 | def __init__(self, job): 30 | super(SimdInsTimeseries, self).__init__(job) 31 | self._data = TimeseriesAccumulator(job.nodecount, self._job.walltime) 32 | self._hostdata = {} 33 | self._hostdevnames = {} 34 | self._error = None 35 | 36 | def process(self, nodemeta, timestamp, data, description): 37 | 38 | if self._job.getdata('perf')['active'] != True: 39 | self._error = ProcessingError.RAW_COUNTER_UNAVAILABLE 40 | return False 41 | 42 | if len(data[0]) == 0: 43 | # Ignore timesteps where data was not available 44 | return True 45 | 
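# The block below allocates a per-host buffer of MAX_DATAPOINTS rows the first
# time a host is seen, then collapses the per-core counters into a FLOP estimate
# for the timestep: the single NHM/Interlagos counter is used as-is, while the
# SNB counters are combined as the weighted sum 4.0*data[0] + 2.0*data[1] +
# data[2] + data[3]. The node-wide sum is recorded in the subsampling
# TimeseriesAccumulator and the per-core values are kept in self._hostdata; a
# negative delta between consecutively stored samples is treated as a PMDA
# restart and processing is aborted for this plugin.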
46 | hostidx = nodemeta.nodeindex 47 | 48 | if nodemeta.nodeindex not in self._hostdata: 49 | self._hostdata[hostidx] = numpy.empty((TimeseriesAccumulator.MAX_DATAPOINTS, len(data[0]))) 50 | self._hostdevnames[hostidx] = dict((str(k), v) for k, v in zip(description[0][0], description[0][1])) 51 | 52 | if len(data) == len(NHM_METRICS): # Note that INTERLAGOS is covered here too 53 | flops = numpy.array(data[0]) 54 | else: 55 | flops = 4.0 * data[0] + 2.0 * data[1] + data[2] + data[3] 56 | 57 | insertat = self._data.adddata(hostidx, timestamp, numpy.sum(flops)) 58 | if insertat != None: 59 | self._hostdata[hostidx][insertat] = flops 60 | 61 | if insertat > 1: 62 | if numpy.any(flops - self._hostdata[hostidx][insertat-1] < 0.0): 63 | self._error = ProcessingError.PMDA_RESTARTED_DURING_JOB 64 | return False 65 | 66 | return True 67 | 68 | def results(self): 69 | 70 | if self._error != None: 71 | return {"error": self._error} 72 | 73 | values = self._data.get() 74 | 75 | if len(values[0, :, 0]) < 3: 76 | return {"error": ProcessingError.JOB_TOO_SHORT} 77 | 78 | rates = numpy.diff(values[:, :, 1]) / numpy.diff(values[:, :, 0]) 79 | 80 | if len(self._hostdata) > 64: 81 | 82 | # Compute min, max & median data and only save the host data 83 | # for these hosts 84 | 85 | sortarr = numpy.argsort(rates.T, axis=1) 86 | 87 | retdata = { 88 | "min": self.collatedata(sortarr[:, 0], rates), 89 | "max": self.collatedata(sortarr[:, -1], rates), 90 | "med": self.collatedata(sortarr[:, sortarr.shape[1] // 2], rates), 91 | "times": values[0, 1:, 0].tolist(), 92 | "hosts": {} 93 | } 94 | 95 | uniqhosts = Counter(sortarr[:, 0]) 96 | uniqhosts.update(sortarr[:, -1]) 97 | uniqhosts.update(sortarr[:, sortarr.shape[1] // 2]) 98 | includelist = list(uniqhosts.keys()) 99 | else: 100 | # Save data for all hosts 101 | retdata = { 102 | "times": values[0, 1:, 0].tolist(), 103 | "hosts": {} 104 | } 105 | includelist = list(self._hostdata.keys()) 106 | 107 | 108 | for hostidx in includelist: 109 | retdata['hosts'][str(hostidx)] = {} 110 | retdata['hosts'][str(hostidx)]['all'] = rates[hostidx, :].tolist() 111 | retdata['hosts'][str(hostidx)]['dev'] = {} 112 | 113 | for devid in self._hostdevnames[hostidx].keys(): 114 | dpnts = len(values[hostidx, :, 0]) 115 | retdata['hosts'][str(hostidx)]['dev'][devid] = (numpy.diff(self._hostdata[hostidx][:dpnts, numpy.int(devid)]) / numpy.diff(values[hostidx, :, 0])).tolist() 116 | 117 | retdata['hosts'][str(hostidx)]['names'] = self._hostdevnames[hostidx] 118 | 119 | return retdata 120 | 121 | @staticmethod 122 | def collatedata(args, rates): 123 | """ build output data """ 124 | result = [] 125 | for timepoint, hostidx in enumerate(args): 126 | try: 127 | result.append([rates[hostidx, timepoint], int(hostidx)]) 128 | except IndexError: 129 | pass 130 | 131 | return result 132 | -------------------------------------------------------------------------------- /src/supremm/plugins/SveTimeseries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Timeseries generator module """ 3 | 4 | from supremm.plugin import Plugin 5 | from supremm.subsample import TimeseriesAccumulator 6 | from supremm.errors import ProcessingError 7 | import numpy 8 | from collections import Counter 9 | 10 | SVE_METRICS = ["perfevent.hwcounters.arm_a64fx__SVE_INST_RETIRED.value"] 11 | 12 | class SveTimeseries(Plugin): 13 | """ Generate the CPU usage as a timeseries data """ 14 | 15 | name = property(lambda x: "sveins") 16 | mode = property(lambda x: 
"timeseries") 17 | requiredMetrics = property(lambda x: [SVE_METRICS]) 18 | optionalMetrics = property(lambda x: []) 19 | derivedMetrics = property(lambda x: []) 20 | 21 | def __init__(self, job): 22 | super(SveTimeseries, self).__init__(job) 23 | self._data = TimeseriesAccumulator(job.nodecount, self._job.walltime) 24 | self._hostdata = {} 25 | self._hostdevnames = {} 26 | self._error = None 27 | 28 | def process(self, nodemeta, timestamp, data, description): 29 | 30 | if self._job.getdata('perf')['active'] != True: 31 | self._error = ProcessingError.RAW_COUNTER_UNAVAILABLE 32 | return False 33 | 34 | if len(data[0]) == 0: 35 | # Ignore timesteps where data was not available 36 | return True 37 | 38 | hostidx = nodemeta.nodeindex 39 | 40 | if nodemeta.nodeindex not in self._hostdata: 41 | self._hostdata[hostidx] = numpy.empty((TimeseriesAccumulator.MAX_DATAPOINTS, len(data[0]))) 42 | self._hostdevnames[hostidx] = dict((str(k), v) for k, v in zip(description[0][0], description[0][1])) 43 | 44 | if len(data) == len(SVE_METRICS): # Note that INTERLAGOS is covered here too 45 | flops = numpy.array(data[0]) 46 | else: 47 | flops = 4.0 * data[0] + 2.0 * data[1] + data[2] + data[3] 48 | 49 | insertat = self._data.adddata(hostidx, timestamp, numpy.sum(flops)) 50 | if insertat != None: 51 | self._hostdata[hostidx][insertat] = flops 52 | 53 | if insertat > 1: 54 | if numpy.any(flops - self._hostdata[hostidx][insertat-1] < 0.0): 55 | self._error = ProcessingError.PMDA_RESTARTED_DURING_JOB 56 | return False 57 | 58 | return True 59 | 60 | def results(self): 61 | 62 | if self._error != None: 63 | return {"error": self._error} 64 | 65 | values = self._data.get() 66 | 67 | if len(values[0, :, 0]) < 3: 68 | return {"error": ProcessingError.JOB_TOO_SHORT} 69 | 70 | rates = numpy.diff(values[:, :, 1]) / numpy.diff(values[:, :, 0]) 71 | 72 | if len(self._hostdata) > 64: 73 | 74 | # Compute min, max & median data and only save the host data 75 | # for these hosts 76 | 77 | sortarr = numpy.argsort(rates.T, axis=1) 78 | 79 | retdata = { 80 | "min": self.collatedata(sortarr[:, 0], rates), 81 | "max": self.collatedata(sortarr[:, -1], rates), 82 | "med": self.collatedata(sortarr[:, sortarr.shape[1] // 2], rates), 83 | "times": values[0, 1:, 0].tolist(), 84 | "hosts": {} 85 | } 86 | 87 | uniqhosts = Counter(sortarr[:, 0]) 88 | uniqhosts.update(sortarr[:, -1]) 89 | uniqhosts.update(sortarr[:, sortarr.shape[1] // 2]) 90 | includelist = uniqhosts.keys() 91 | else: 92 | # Save data for all hosts 93 | retdata = { 94 | "times": values[0, 1:, 0].tolist(), 95 | "hosts": {} 96 | } 97 | includelist = self._hostdata.keys() 98 | 99 | 100 | for hostidx in includelist: 101 | retdata['hosts'][str(hostidx)] = {} 102 | retdata['hosts'][str(hostidx)]['all'] = rates[hostidx, :].tolist() 103 | retdata['hosts'][str(hostidx)]['dev'] = {} 104 | 105 | for devid in self._hostdevnames[hostidx].iterkeys(): 106 | dpnts = len(values[hostidx, :, 0]) 107 | retdata['hosts'][str(hostidx)]['dev'][devid] = (numpy.diff(self._hostdata[hostidx][:dpnts, numpy.int(devid)]) / numpy.diff(values[hostidx, :, 0])).tolist() 108 | 109 | retdata['hosts'][str(hostidx)]['names'] = self._hostdevnames[hostidx] 110 | 111 | return retdata 112 | 113 | @staticmethod 114 | def collatedata(args, rates): 115 | """ build output data """ 116 | result = [] 117 | for timepoint, hostidx in enumerate(args): 118 | try: 119 | result.append([rates[hostidx, timepoint], int(hostidx)]) 120 | except IndexError: 121 | pass 122 | 123 | return result 124 | 
-------------------------------------------------------------------------------- /src/supremm/plugins/TaccCatastrophe.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from supremm.plugin import Plugin 4 | from supremm.errors import ProcessingError 5 | from supremm.subsample import RangeConverter 6 | import numpy 7 | 8 | class TaccCatastrophe(Plugin): 9 | """ Catastrophe analytic. Algorithm originally developed by Bill Barth et al. for the 10 | tacc_stats project """ 11 | 12 | name = property(lambda x: "catastrophe") 13 | mode = property(lambda x: "all") 14 | requiredMetrics = property(lambda x: [ ["taccstats_perfevent.hwcounters.MEM_LOAD_RETIRED_L1D_HIT.value"], ["taccstats_perfevent.hwcounters.L1D_REPLACEMENT.value"] ]) 15 | optionalMetrics = property(lambda x: []) 16 | derivedMetrics = property(lambda x: []) 17 | 18 | def __init__(self, job): 19 | super(TaccCatastrophe, self).__init__(job) 20 | self._data = {} 21 | self._values = {} 22 | 23 | def process(self, nodemeta, timestamp, data, description): 24 | 25 | if nodemeta.nodename not in self._data: 26 | self._data[nodemeta.nodename] = { "x": [], "t": [] } 27 | self._values[nodemeta.nodename] = RangeConverter(48, False) 28 | 29 | info = self._data[nodemeta.nodename] 30 | value = self._values[nodemeta.nodename].append(data) 31 | 32 | info['x'].append(1.0 * numpy.sum(value)) 33 | info['t'].append(timestamp) 34 | 35 | return True 36 | 37 | def results(self): 38 | 39 | if len(self._data) == 0: 40 | return {"error": ProcessingError.RAW_COUNTER_UNAVAILABLE} 41 | 42 | vals = None 43 | 44 | for host, data in self._data.items(): 45 | x = data['x'] 46 | t = data['t'] 47 | 48 | start = 2 49 | end = len(data['x'])-2 50 | 51 | for i in range(start+1, end-1): 52 | 53 | a = (data['x'][i] - data['x'][start]) / (data['t'][i] - data['t'][start]) 54 | b = (data['x'][end] - data['x'][i]) / (data['t'][end] - data['t'][i]) 55 | vals = b/a if vals == None else min(vals, b/a) 56 | 57 | if vals == None: 58 | return {"error": ProcessingError.JOB_TOO_SHORT} 59 | 60 | return {"value": vals} 61 | -------------------------------------------------------------------------------- /src/supremm/plugins/TaccPerfCounters.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ CPU performance counter plugin """ 3 | 4 | from supremm.plugin import Plugin 5 | from supremm.statistics import calculate_stats 6 | from supremm.errors import ProcessingError 7 | import numpy 8 | 9 | SNB_METRICS = ["taccstats_perfevent.hwcounters.UNHALTED_REFERENCE_CYCLES.value", 10 | "taccstats_perfevent.hwcounters.INSTRUCTION_RETIRED.value", 11 | "taccstats_perfevent.hwcounters.L1D_REPLACEMENT.value", 12 | "taccstats_perfevent.hwcounters.SIMD_FP_256_PACKED_DOUBLE.value", 13 | "taccstats_perfevent.hwcounters.FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE.value", 14 | "taccstats_perfevent.hwcounters.FP_COMP_OPS_EXE_SSE_SCALAR_DOUBLE.value"] 15 | 16 | NHM_METRICS = ["taccstats_perfevent.hwcounters.UNHALTED_REFERENCE_CYCLES.value", 17 | "taccstats_perfevent.hwcounters.INSTRUCTIONS_RETIRED.value", 18 | "taccstats_perfevent.hwcounters.MEM_LOAD_RETIRED_L1D_HIT.value", 19 | "taccstats_perfevent.hwcounters.FP_COMP_OPS_EXE_SSE_FP.value"] 20 | 21 | class TaccPerfCounters(Plugin): 22 | """ Compute various performance counter derived metrics """ 23 | name = property(lambda x: "cpuperf") 24 | mode = property(lambda x: "all") 25 | requiredMetrics = property(lambda x: [SNB_METRICS, 
NHM_METRICS]) 26 | optionalMetrics = property(lambda x: []) 27 | derivedMetrics = property(lambda x: []) 28 | 29 | def __init__(self, job): 30 | super(TaccPerfCounters, self).__init__(job) 31 | self._last = {} 32 | self._data = {} 33 | self._totalcores = 0 34 | self._error = None 35 | 36 | def process(self, nodemeta, timestamp, data, description): 37 | 38 | ndata = numpy.array(data) 39 | 40 | if nodemeta.nodename not in self._last: 41 | self._last[nodemeta.nodename] = ndata 42 | return True 43 | 44 | if ndata.shape == self._last[nodemeta.nodename].shape: 45 | if nodemeta.nodename not in self._data: 46 | # Only populate data for a host when we have at least 2 datapoints 47 | self._data[nodemeta.nodename] = numpy.zeros(ndata.shape) 48 | self._totalcores += data[0].size 49 | 50 | self._data[nodemeta.nodename] += (ndata - self._last[nodemeta.nodename]) % (2**48) 51 | self._last[nodemeta.nodename] = ndata 52 | else: 53 | # Perf counters changed during the job 54 | self._error = ProcessingError.RAW_COUNTER_UNAVAILABLE 55 | return False 56 | 57 | return True 58 | 59 | def results(self): 60 | 61 | if self._error != None: 62 | return {"error": self._error} 63 | 64 | nhosts = len(self._data) 65 | 66 | if nhosts < 1: 67 | return {"error": ProcessingError.INSUFFICIENT_HOSTDATA} 68 | 69 | flops = numpy.zeros(self._totalcores) 70 | cpiref = numpy.zeros(self._totalcores) 71 | cpldref = numpy.zeros(self._totalcores) 72 | 73 | coreindex = 0 74 | for data in self._data.values(): 75 | if len(data) == len(NHM_METRICS): 76 | flops[coreindex:coreindex+len(data[0])] = 1.0 * data[3] 77 | cpiref[coreindex:coreindex+len(data[0])] = 1.0 * data[0] / data[1] 78 | cpldref[coreindex:coreindex+len(data[0])] = 1.0 * data[0] / data[2] 79 | coreindex += len(data[0]) 80 | elif len(data) == len(SNB_METRICS): 81 | flops[coreindex:coreindex+len(data[0])] = 4.0 * data[3] + 2.0 * data[4] + 1.0 * data[5] 82 | cpiref[coreindex:coreindex+len(data[0])] = 1.0 * data[0] / data[1] 83 | cpldref[coreindex:coreindex+len(data[0])] = 1.0 * data[0] / data[2] 84 | coreindex += len(data[0]) 85 | else: 86 | return {"error": ProcessingError.INSUFFICIENT_DATA} 87 | 88 | results = {"flops": calculate_stats(flops), "cpiref": calculate_stats(cpiref), "cpldref": calculate_stats(cpldref)} 89 | return results 90 | -------------------------------------------------------------------------------- /src/supremm/plugins/TaccUncoreCounters.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Socket level performance counter plugin """ 3 | 4 | from supremm.plugin import Plugin 5 | from supremm.statistics import calculate_stats 6 | from supremm.errors import ProcessingError 7 | import numpy 8 | 9 | TACC_NHM_METRICS = ["taccstats_perfevent.hwcounters.UNC_LLC_MISS_READ.value", 10 | "taccstats_perfevent.hwcounters.UNC_LLC_MISS_WRITE.value"] 11 | 12 | class TaccUncoreCounters(Plugin): 13 | """ Compute various uncore performance counter derived metrics """ 14 | 15 | name = property(lambda x: "uncperf") 16 | mode = property(lambda x: "all") 17 | requiredMetrics = property(lambda x: TACC_NHM_METRICS) 18 | optionalMetrics = property(lambda x: []) 19 | derivedMetrics = property(lambda x: []) 20 | 21 | def __init__(self, job): 22 | super(TaccUncoreCounters, self).__init__(job) 23 | self._last = {} 24 | self._data = {} 25 | self._error = None 26 | 27 | def process(self, nodemeta, timestamp, data, description): 28 | ndata = numpy.array(data) 29 | 30 | if nodemeta.nodename not in self._last: 31 | 
self._last[nodemeta.nodename] = ndata 32 | self._data[nodemeta.nodename] = 0.0 33 | return True 34 | 35 | if ndata.shape == self._last[nodemeta.nodename].shape: 36 | self._data[nodemeta.nodename] += numpy.sum((ndata - self._last[nodemeta.nodename]) % 2**48) 37 | self._last[nodemeta.nodename] = ndata 38 | else: 39 | # Perf counters changed during the job 40 | self._error = ProcessingError.RAW_COUNTER_UNAVAILABLE 41 | return False 42 | 43 | return True 44 | 45 | def results(self): 46 | 47 | if self._error != None: 48 | return {"error": self._error} 49 | 50 | nhosts = len(self._data) 51 | 52 | if nhosts < 1: 53 | return {"error": ProcessingError.INSUFFICIENT_DATA} 54 | 55 | membw = numpy.zeros(nhosts) 56 | for hostindex, data in enumerate(self._data.values()): 57 | membw[hostindex] = data * 64.0 58 | 59 | results = {"membw": calculate_stats(membw)} 60 | return results 61 | -------------------------------------------------------------------------------- /src/supremm/plugins/TimeseriesPatternsGpfs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from supremm.TimeseriesPatterns import TimeseriesPatterns 3 | 4 | 5 | class TimeseriesPatternsGpfs(TimeseriesPatterns): 6 | requiredMetrics = property(lambda self: ["gpfs.fsios.read_bytes", "gpfs.fsios.write_bytes"]) 7 | name = property(lambda self: "timeseries_patterns_gpfs") 8 | 9 | def __init__(self, job): 10 | super(TimeseriesPatternsGpfs, self).__init__(job) 11 | -------------------------------------------------------------------------------- /src/supremm/plugins/TotalMemUsageTimeseries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Timeseries generator module """ 3 | 4 | from supremm.plugin import Plugin 5 | from supremm.subsample import TimeseriesAccumulator 6 | import numpy 7 | from collections import Counter 8 | 9 | class TotalMemUsageTimeseries(Plugin): 10 | """ Generate the CPU usage as a timeseries data """ 11 | 12 | name = property(lambda x: "memused") 13 | mode = property(lambda x: "timeseries") 14 | requiredMetrics = property(lambda x: ["mem.numa.util.used"]) 15 | optionalMetrics = property(lambda x: []) 16 | derivedMetrics = property(lambda x: []) 17 | 18 | def __init__(self, job): 19 | super(TotalMemUsageTimeseries, self).__init__(job) 20 | self._data = TimeseriesAccumulator(job.nodecount, self._job.walltime) 21 | self._hostdata = {} 22 | self._hostdevnames = {} 23 | 24 | def process(self, nodemeta, timestamp, data, description): 25 | 26 | hostidx = nodemeta.nodeindex 27 | 28 | if len(data[0]) == 0: 29 | # Skip data point with no data 30 | return True 31 | 32 | if nodemeta.nodeindex not in self._hostdata: 33 | self._hostdata[hostidx] = numpy.empty((TimeseriesAccumulator.MAX_DATAPOINTS, len(data[0]))) 34 | self._hostdevnames[hostidx] = dict((str(k), "numa " + v) for k, v in zip(description[0][0], description[0][1])) 35 | 36 | nodemem_gb = numpy.sum(data[0]) / 1048576.0 37 | insertat = self._data.adddata(hostidx, timestamp, nodemem_gb) 38 | if insertat != None: 39 | self._hostdata[hostidx][insertat] = data[0] / 1048576.0 40 | 41 | return True 42 | 43 | def results(self): 44 | 45 | values = self._data.get() 46 | 47 | if len(self._hostdata) > 64: 48 | 49 | # Compute min, max & median data and only save the host data 50 | # for these hosts 51 | 52 | memdata = values[:, :, 1] 53 | sortarr = numpy.argsort(memdata.T, axis=1) 54 | 55 | retdata = { 56 | "min": self.collatedata(sortarr[:, 0], memdata), 57 | 
"max": self.collatedata(sortarr[:, -1], memdata), 58 | "med": self.collatedata(sortarr[:, sortarr.shape[1] // 2], memdata), 59 | "times": values[0, :, 0].tolist(), 60 | "hosts": {} 61 | } 62 | 63 | uniqhosts = Counter(sortarr[:, 0]) 64 | uniqhosts.update(sortarr[:, -1]) 65 | uniqhosts.update(sortarr[:, sortarr.shape[1] // 2]) 66 | includelist = list(uniqhosts.keys()) 67 | else: 68 | # Save data for all hosts 69 | retdata = { 70 | "times": values[0, :, 0].tolist(), 71 | "hosts": {} 72 | } 73 | includelist = list(self._hostdata.keys()) 74 | 75 | 76 | for hostidx in includelist: 77 | retdata['hosts'][str(hostidx)] = {} 78 | retdata['hosts'][str(hostidx)]['all'] = values[hostidx, :, 1].tolist() 79 | retdata['hosts'][str(hostidx)]['dev'] = {} 80 | 81 | for devid in self._hostdevnames[hostidx].keys(): 82 | dpnts = len(values[hostidx, :, 0]) 83 | retdata['hosts'][str(hostidx)]['dev'][devid] = self._hostdata[hostidx][:dpnts, numpy.int(devid)].tolist() 84 | 85 | retdata['hosts'][str(hostidx)]['names'] = self._hostdevnames[hostidx] 86 | 87 | return retdata 88 | 89 | @staticmethod 90 | def collatedata(args, rates): 91 | """ build output data """ 92 | result = [] 93 | for timepoint, hostidx in enumerate(args): 94 | try: 95 | result.append([rates[hostidx, timepoint], int(hostidx)]) 96 | except IndexError: 97 | pass 98 | 99 | return result 100 | -------------------------------------------------------------------------------- /src/supremm/plugins/UncoreCounters.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Socket level performance counter plugin """ 3 | 4 | from supremm.plugin import Plugin 5 | from supremm.statistics import calculate_stats 6 | from supremm.errors import ProcessingError 7 | import numpy 8 | 9 | SNB_METRICS = ["perfevent.hwcounters.snbep_unc_imc0__UNC_M_CAS_COUNT_RD.value", 10 | "perfevent.hwcounters.snbep_unc_imc0__UNC_M_CAS_COUNT_WR.value", 11 | "perfevent.hwcounters.snbep_unc_imc1__UNC_M_CAS_COUNT_RD.value", 12 | "perfevent.hwcounters.snbep_unc_imc1__UNC_M_CAS_COUNT_WR.value", 13 | "perfevent.hwcounters.snbep_unc_imc2__UNC_M_CAS_COUNT_RD.value", 14 | "perfevent.hwcounters.snbep_unc_imc2__UNC_M_CAS_COUNT_WR.value", 15 | "perfevent.hwcounters.snbep_unc_imc3__UNC_M_CAS_COUNT_RD.value", 16 | "perfevent.hwcounters.snbep_unc_imc3__UNC_M_CAS_COUNT_WR.value"] 17 | 18 | IVB_METRICS = ["perfevent.hwcounters.ivbep_unc_imc0__UNC_M_CAS_COUNT_RD.value", 19 | "perfevent.hwcounters.ivbep_unc_imc0__UNC_M_CAS_COUNT_WR.value", 20 | "perfevent.hwcounters.ivbep_unc_imc1__UNC_M_CAS_COUNT_RD.value", 21 | "perfevent.hwcounters.ivbep_unc_imc1__UNC_M_CAS_COUNT_WR.value", 22 | "perfevent.hwcounters.ivbep_unc_imc2__UNC_M_CAS_COUNT_RD.value", 23 | "perfevent.hwcounters.ivbep_unc_imc2__UNC_M_CAS_COUNT_WR.value", 24 | "perfevent.hwcounters.ivbep_unc_imc3__UNC_M_CAS_COUNT_RD.value", 25 | "perfevent.hwcounters.ivbep_unc_imc3__UNC_M_CAS_COUNT_WR.value"] 26 | 27 | NHM_METRICS = ["perfevent.hwcounters.UNC_LLC_MISS_READ.value", 28 | "perfevent.hwcounters.UNC_LLC_MISS_WRITE.value"] 29 | 30 | INTERLAGOS_METRICS = ["perfevent.hwcounters.L3_CACHE_MISSES_ALL.value"] 31 | 32 | class UncoreCounters(Plugin): 33 | """ Compute various uncore performance counter derived metrics """ 34 | 35 | name = property(lambda x: "uncperf") 36 | mode = property(lambda x: "firstlast") 37 | requiredMetrics = property(lambda x: [SNB_METRICS, IVB_METRICS, NHM_METRICS, INTERLAGOS_METRICS]) 38 | optionalMetrics = property(lambda x: []) 39 | derivedMetrics = property(lambda x: 
[]) 40 | 41 | def __init__(self, job): 42 | super(UncoreCounters, self).__init__(job) 43 | self._first = {} 44 | self._data = {} 45 | self._error = None 46 | 47 | def process(self, nodemeta, timestamp, data, description): 48 | 49 | if self._job.getdata('perf')['active'] != True: 50 | self._error = ProcessingError.RAW_COUNTER_UNAVAILABLE 51 | return False 52 | 53 | ndata = numpy.array(data) 54 | 55 | if nodemeta.nodename not in self._first: 56 | self._first[nodemeta.nodename] = ndata 57 | return True 58 | 59 | if ndata.shape == self._first[nodemeta.nodename].shape: 60 | self._data[nodemeta.nodename] = numpy.sum(ndata - self._first[nodemeta.nodename]) 61 | if numpy.any(numpy.fabs(self._data[nodemeta.nodename]) != self._data[nodemeta.nodename]): 62 | self._error = ProcessingError.PMDA_RESTARTED_DURING_JOB 63 | return False 64 | else: 65 | # Perf counters changed during the job 66 | self._error = ProcessingError.RAW_COUNTER_UNAVAILABLE 67 | return False 68 | 69 | return True 70 | 71 | def results(self): 72 | 73 | if self._error != None: 74 | return {"error": self._error} 75 | 76 | nhosts = len(self._data) 77 | 78 | if nhosts < 1: 79 | return {"error": ProcessingError.INSUFFICIENT_DATA} 80 | 81 | membw = numpy.zeros(nhosts) 82 | for hostindex, data in enumerate(self._data.values()): 83 | membw[hostindex] = data * 64.0 84 | 85 | results = {"membw": calculate_stats(membw)} 86 | return results 87 | -------------------------------------------------------------------------------- /src/supremm/plugins/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubccr/supremm/732b8c696e4a8926b7d29317c9ec95f16d6f87e3/src/supremm/plugins/__init__.py -------------------------------------------------------------------------------- /src/supremm/preprocessors/HardwareInventory.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ hardware inventory pre-processor """ 3 | 4 | from supremm.plugin import PreProcessor 5 | from supremm.statistics import calculate_stats 6 | 7 | class HardwareInventory(PreProcessor): 8 | """ Parse and analyse hardware inventory information. Currently 9 | grabs the number of CPU cores for each host. 
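The per-host core counts are attached to the job under the key 'hinv' via the job's adddata() method, where plugins such as LoadAvg use them to report per-core load statistics.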
10 | """ 11 | 12 | name = property(lambda x: "hinv") 13 | mode = property(lambda x: "timeseries") 14 | requiredMetrics = property(lambda x: [["kernel.percpu.cpu.user"], ["hinv.ncpu"]]) 15 | optionalMetrics = property(lambda x: []) 16 | derivedMetrics = property(lambda x: []) 17 | 18 | def __init__(self, job): 19 | super(HardwareInventory, self).__init__(job) 20 | self.hostname = None 21 | self.corecount = None 22 | self.data = {} 23 | self.cores = [] 24 | 25 | def hoststart(self, hostname): 26 | self.hostname = hostname 27 | 28 | def process(self, timestamp, data, description): 29 | 30 | if len(data) == 1 and data[0][:, 0].size > 0: 31 | if data[0][0, 1] == -1: 32 | self.corecount = data[0][0, 0] 33 | else: 34 | self.corecount = data[0][:, 0].size 35 | # Have sufficient information, therefore return False to prevent 36 | # any further callbacks 37 | return False 38 | 39 | return True 40 | 41 | def hostend(self): 42 | if self.corecount != None: 43 | self.data[self.hostname] = {'cores': self.corecount} 44 | self.cores.append(self.corecount) 45 | 46 | self.corecount = None 47 | self.hostname = None 48 | 49 | self._job.adddata(self.name, self.data) 50 | 51 | def results(self): 52 | return {"cores": calculate_stats(self.cores)} 53 | 54 | -------------------------------------------------------------------------------- /src/supremm/preprocessors/PerfEvent.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ performance counters pre-processor """ 3 | 4 | from supremm.plugin import PreProcessor 5 | 6 | class PerfEvent(PreProcessor): 7 | """ The hardware performance counters are only valid if they were 8 | active and counting for the whole job. This preproc checks the active 9 | flag at all timepoints and the result is avaiable to all the plugins that 10 | use hardware counters. 11 | """ 12 | 13 | name = property(lambda x: "perf") 14 | mode = property(lambda x: "timeseries") 15 | requiredMetrics = property(lambda x: ["perfevent.active"]) 16 | optionalMetrics = property(lambda x: []) 17 | derivedMetrics = property(lambda x: []) 18 | 19 | def __init__(self, job): 20 | super(PerfEvent, self).__init__(job) 21 | self.perfactive = None 22 | 23 | def hoststart(self, hostname): 24 | pass 25 | 26 | def process(self, timestamp, data, description): 27 | 28 | if self.perfactive == False: 29 | return False 30 | 31 | if len(data) == 1 and data[0].shape == (1, 2) and data[0][:, 0].size > 0: 32 | self.perfactive = data[0][0, 0] != 0 33 | return self.perfactive 34 | 35 | return True 36 | 37 | def hostend(self): 38 | self._job.adddata(self.name, {"active": self.perfactive}) 39 | 40 | def results(self): 41 | return None 42 | 43 | -------------------------------------------------------------------------------- /src/supremm/preprocessors/ProcPrometheus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Proc information pre-processor """ 3 | 4 | import re 5 | 6 | from supremm.preprocessors.Proc import Proc 7 | from supremm.linuxhelpers import parsecpusallowed 8 | 9 | 10 | class ProcPrometheus(Proc): 11 | """ Parse and analyse the proc information for a job. Supports parsing the cgroup information 12 | from SLRUM and PBS/Torque (if available). 
13 | """ 14 | 15 | requiredMetrics = property(lambda x: ["prom:cgroup_cpu_info", 16 | "prom:cgroup_process_exec_count"]) 17 | 18 | optionalMetrics = property(lambda x: []) 19 | derivedMetrics = property(lambda x: []) 20 | 21 | def __init__(self, job): 22 | super(ProcPrometheus, self).__init__(job) 23 | 24 | def process(self, timestamp, data, description): 25 | """ Override Proc process() method """ 26 | # Set self.cgroupcpuset here using parsecpusallowed 27 | # The cgroupcpuset is returned as part of the description query 28 | if self.cpusallowed is None: 29 | allcores = set() 30 | try: 31 | for cpuset in description[0].values(): 32 | allcores |= parsecpusallowed(cpuset) 33 | if len(allcores) > 0: 34 | self.cpusallowed = allcores 35 | except ValueError: 36 | # Empty cpuset info seen in the wild - should get populated at 37 | # next timestep 38 | pass 39 | 40 | # All processes from the exporter are constrained 41 | for procname in description[1].values(): 42 | self.output['procDump']['constrained'][procname] += 1 43 | 44 | return True 45 | -------------------------------------------------------------------------------- /src/supremm/preprocessors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubccr/supremm/732b8c696e4a8926b7d29317c9ec95f16d6f87e3/src/supremm/preprocessors/__init__.py -------------------------------------------------------------------------------- /src/supremm/processhelpers.py: -------------------------------------------------------------------------------- 1 | """ various deprecated helper functions """ 2 | import os 3 | 4 | 5 | def get_utc_environ(): 6 | """ 7 | Creates a copy of this process' environment variables with the timezone 8 | variable set to UTC and returns it. 9 | 10 | Returns: 11 | A copy of os.environ with "TZ" set to "UTC". 12 | """ 13 | utc_environ = os.environ.copy() 14 | utc_environ["TZ"] = "UTC" 15 | return utc_environ 16 | 17 | 18 | def log_pipe(pipe, logging_function, template="%s"): 19 | """ 20 | Logs each non-empty line from a pipe (or other file-like object) 21 | using the given logging function. This will block until the end of 22 | the pipe is reached. 23 | 24 | Args: 25 | pipe: The pipe to read from. 26 | logging_function: The logging function to use. 27 | template: (Optional) A template string to place each line from pipe 28 | inside. 29 | """ 30 | if (not pipe) or (not logging_function): 31 | return 32 | 33 | for line in pipe: 34 | stripped_line = line.rstrip() 35 | if stripped_line: 36 | logging_function(template % stripped_line) 37 | 38 | 39 | def exists_ok_makedirs(path): 40 | """ 41 | A wrapper for os.makedirs that does not throw an exception 42 | if the given path points to an existing directory. 43 | 44 | Args: 45 | path: The path to the directory to create. 46 | Throws: 47 | EnvironmentError: Thrown if the directory could not be created. 
48 | """ 49 | 50 | try: 51 | os.makedirs(path) 52 | except EnvironmentError: 53 | if not os.path.isdir(path): 54 | raise 55 | -------------------------------------------------------------------------------- /src/supremm/rangechange.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import numpy 3 | 4 | class DataCache(object): 5 | """ Helper class that remembers the last value that it was passed """ 6 | def __init__(self): 7 | self.mdata = None 8 | self.timestamp = None 9 | self.data = None 10 | self.description = None 11 | 12 | def name(self): 13 | """ returns the name """ 14 | return 'datacache' 15 | 16 | def process(self, mdata, timestamp, data, description): 17 | """ process call """ 18 | self.mdata = mdata 19 | self.timestamp = timestamp 20 | self.data = data 21 | self.description = description 22 | 23 | def docallback(self, analytic): 24 | """ call the analytic with the paramerters from the most recent call to 25 | process (if any) """ 26 | if self.timestamp != None: 27 | return analytic.process(self.mdata, self.timestamp, self.data, self.description) 28 | else: 29 | return True 30 | 31 | class RangeChange(object): 32 | """ Convert counters that have < 64 bits to 64 bits """ 33 | def __init__(self, configobj): 34 | try: 35 | self.config = configobj.getsection('normalization') 36 | except KeyError: 37 | self.config = [] 38 | 39 | self._passthrough = False 40 | self.accumulator = [] 41 | self.last = [] 42 | self.needsfixup = [] 43 | 44 | def set_fetched_metrics(self, metriclist): 45 | """ sets the list of metrics that will be passed to the normalise_data function 46 | This resets the internal state of the object """ 47 | 48 | self.accumulator = [None] * len(metriclist) 49 | self.last = [None] * len(metriclist) 50 | self.needsfixup = [] 51 | self._passthrough = True 52 | 53 | for metric in metriclist: 54 | if metric in self.config: 55 | self.needsfixup.append(self.config[metric]) 56 | self._passthrough = False 57 | else: 58 | self.needsfixup.append(None) 59 | 60 | @property 61 | def passthrough(self): 62 | """ Returns whether the range changer will not modify data """ 63 | return self._passthrough 64 | 65 | def normalise_data(self, timestamp, data): 66 | """ Convert the data if needed """ 67 | 68 | if self._passthrough: 69 | return 70 | 71 | i = 0 72 | for datum in data: 73 | 74 | if self.needsfixup[i] is None: 75 | i += 1 76 | continue 77 | 78 | if len(datum) == 0: 79 | # Ignore entries with no data - this typically occurs when the 80 | # plugin requests multiple metrics and the metrics do not all appear 81 | # at every timestep 82 | i += 1 83 | continue 84 | 85 | if self.accumulator[i] is None: 86 | self.accumulator[i] = numpy.array(datum) 87 | self.last[i] = numpy.array(datum) 88 | else: 89 | self.accumulator[i] += (datum - self.last[i]) % numpy.uint64(1 << self.needsfixup[i]['range']) 90 | numpy.copyto(self.last[i], datum) 91 | numpy.copyto(datum, self.accumulator[i]) 92 | 93 | i += 1 94 | 95 | 96 | -------------------------------------------------------------------------------- /src/supremm/scripthelpers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin env python 2 | """ common functions used in the command line scripts """ 3 | 4 | import re 5 | import datetime 6 | import pymysql 7 | import pymysql.cursors 8 | import sys 9 | import logging 10 | 11 | def parsetime(strtime): 12 | """ Try to be flexible in the time formats supported: 13 | 1) unixtimestamp prefixed with @ 
14 | 2) year-month-day zero-padded 15 | 3) year-month-day hour:minute:second zero padded optional T between date and time 16 | 4) locale specific format 17 | """ 18 | m = re.search(r"^@(\d*)$", strtime) 19 | if m: 20 | return datetime.datetime.fromtimestamp(int(m.group(1))) 21 | if re.search(r"^\d{4}-\d{2}-\d{2}$", strtime): 22 | return datetime.datetime.strptime(strtime, "%Y-%m-%d") 23 | m = re.search(r"^(\d{4}-\d{2}-\d{2}).(\d{2}:\d{2}:\d{2})$", strtime) 24 | if m: 25 | return datetime.datetime.strptime(m.group(1) + " " + m.group(2), "%Y-%m-%d %H:%M:%S") 26 | 27 | return datetime.datetime.strptime(strtime, "%c") 28 | 29 | def getdbconnection(configsection, as_dict=False, defaultargs={}): 30 | """ Helper function that gets a database connection object from a config dictionary """ 31 | 32 | dbengine = configsection['dbengine'] if 'dbengine' in configsection else 'MySQLDB' 33 | 34 | if dbengine == 'MySQLDB': 35 | 36 | dbargs = defaultargs.copy() 37 | # Convert the external configuration names to python PEP-249 config names 38 | translate = {"host": "host", 39 | "defaultsfile": "read_default_file", 40 | "user": "user", 41 | "pass": "passwd", 42 | "port": "port"} 43 | 44 | for confval, myval in translate.items(): 45 | if confval in configsection: 46 | dbargs[myval] = configsection[confval] 47 | 48 | if as_dict: 49 | dbargs['cursorclass'] = pymysql.cursors.DictCursor 50 | 51 | dbargs['local_infile'] = 1 52 | 53 | return pymysql.connect(**dbargs) 54 | else: 55 | raise Exception("Unsupported database engine %s" % (dbengine)) 56 | 57 | def setuplogger(consolelevel, filename=None, filelevel=None): 58 | """ setup the python root logger to log to the console with defined log 59 | level. Optionally also log to file with the provided level """ 60 | 61 | if filelevel == None: 62 | filelevel = consolelevel 63 | 64 | if sys.version.startswith("2.7"): 65 | logging.captureWarnings(True) 66 | 67 | rootlogger = logging.getLogger() 68 | rootlogger.setLevel(min(consolelevel, filelevel)) 69 | 70 | formatter = logging.Formatter('%(asctime)s.%(msecs)03d [%(levelname)s] %(message)s', datefmt='%Y-%m-%dT%H:%M:%S') 71 | 72 | if filename != None: 73 | filehandler = logging.FileHandler(filename) 74 | filehandler.setLevel(filelevel) 75 | filehandler.setFormatter(formatter) 76 | rootlogger.addHandler(filehandler) 77 | 78 | consolehandler = logging.StreamHandler() 79 | consolehandler.setLevel(consolelevel) 80 | consolehandler.setFormatter(formatter) 81 | rootlogger.addHandler(consolehandler) 82 | 83 | -------------------------------------------------------------------------------- /src/supremm/statistics.py: -------------------------------------------------------------------------------- 1 | """ Various utilities for calculating statistics """ 2 | import math 3 | import numpy 4 | import scipy.stats 5 | 6 | 7 | class Integrator(object): 8 | """ Helper class to itegrate data """ 9 | def __init__(self, x): 10 | self._x0 = x 11 | self._total = numpy.zeros_like(x) 12 | self._elapsed = 0.0 13 | 14 | def add(self, x, y): 15 | """ Add data to the accumulator """ 16 | delta_x = x - self._x0 17 | self._x0 = x 18 | 19 | self._total = y * delta_x + self._total 20 | self._elapsed += delta_x 21 | 22 | @property 23 | def total(self): 24 | """ get the total value """ 25 | return self._total 26 | 27 | @property 28 | def elapsed(self): 29 | """ Sum of segments """ 30 | return self._elapsed 31 | 32 | def calculate_stats(v): 33 | res = {} 34 | 35 | if len(v) == 1: 36 | return {'avg': float(v[0]), 'cnt': 1} 37 | 38 | if len(v) > 0: 39 | (v_n, 
(v_min, v_max), v_avg, v_var, v_skew, v_kurt) = scipy.stats.describe(v) 40 | 41 | if v_min == v_max: 42 | return {'avg': float(v[0]), 'cnt': len(v)} 43 | 44 | res['max'] = float(v_max) 45 | res['avg'] = v_avg 46 | res['krt'] = v_kurt 47 | res['min'] = float(v_min) 48 | res['skw'] = v_skew 49 | res['cnt'] = len(v) 50 | if res['min'] == res['max']: 51 | res['med'] = res['min'] 52 | res['std'] = 0.0 53 | else: 54 | res['med'] = float(numpy.median(v, axis=0)) 55 | if len(v) > 2: 56 | res['std'] = scipy.stats.tstd(v) 57 | 58 | if v_avg > 0: 59 | res['cov'] = math.sqrt(v_var) / v_avg 60 | 61 | return res 62 | 63 | 64 | class RollingStats(object): 65 | """ Uses Welford's method [1] to compute the mean and stddev of 66 | a series for data without storing all datapoints. 67 | 68 | Data should be added to the class instance using the append() 69 | function and the summary statistics can be accessed using get() 70 | 71 | [1] B. P. Welford (1962) Note on a Method for Calculating 72 | Corrected Sums of Squares and Products, Technometrics, 73 | 4:3, 419-420, DOI: 10.1080/00401706.1962.10490022 74 | """ 75 | def __init__(self): 76 | self._count = 0 77 | 78 | def append(self, x): 79 | """ append a datum. """ 80 | self._count += 1 81 | 82 | if self._count == 1: 83 | self.m = x 84 | self.last_m = x 85 | self.last_s = 0.0 86 | self.min = x 87 | self.max = x 88 | else: 89 | self.m = self.last_m + (x - self.last_m) / self._count 90 | self.s = self.last_s + (x - self.last_m) * (x - self.m) 91 | 92 | self.last_m = self.m 93 | self.last_s = self.s 94 | 95 | self.min = numpy.minimum(self.min, x) 96 | self.max = numpy.maximum(self.max, x) 97 | 98 | def get(self): 99 | """ return a dict with the various statistics """ 100 | return {'avg': self.mean(), 'min': self.min, 'max': self.max, 'cnt': self._count, 'std': math.sqrt(self.variance())} 101 | 102 | def mean(self): 103 | """ return the mean """ 104 | if self._count > 0: 105 | return self.m 106 | return 0.0 107 | 108 | def count(self): 109 | """ returns the number of data points that have been processed """ 110 | return self._count 111 | 112 | def variance(self): 113 | """ Return the variance of the data """ 114 | if self._count > 1: 115 | return self.s / (self._count - 1) 116 | return 0.0 117 | 118 | def __str__(self): 119 | return str(self.get()) 120 | 121 | def test(): 122 | """ test """ 123 | indata = [0.1, 0.2, 0.3, 0.4, 0.4, 0.5, 0.1, 0.4] 124 | 125 | stats = RollingStats() 126 | for i in indata: 127 | stats.append(i) 128 | 129 | print(stats.get()) 130 | print(calculate_stats(indata)) 131 | 132 | if __name__ == "__main__": 133 | test() 134 | 135 | -------------------------------------------------------------------------------- /src/supremm/subsample.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Timeseries subsampling module """ 3 | import numpy 4 | 5 | 6 | class TimeseriesAccumulator(object): 7 | """ Stores a subset of time-value pairs for a dataseries """ 8 | MAX_DATAPOINTS = 100 9 | LEAD_IN_DATAPOINTS = 10 10 | 11 | def __init__(self, nhosts, totaltime): 12 | self._totaltime = totaltime 13 | self._samplewindow = None 14 | self._leadout = None 15 | self._data = numpy.empty((nhosts, TimeseriesAccumulator.MAX_DATAPOINTS, 2)) 16 | self._count = numpy.zeros(nhosts, dtype=int) 17 | 18 | def adddata(self, hostidx, timestamp, value): 19 | """ Add a datapoint to the collection. 
20 | The sampling algorithm is as follows: the first LEAD_IN_DATAPOINTS points are 21 | always added. Then the sample interval is computed, and one datapoint 22 | per interval is collected. Near the end of the job, all points are 23 | collected again (based on the amount of time taken to collect the first LEAD_IN_DATAPOINTS points). 24 | 25 | The sampling algorithm could be changed to try to capture more fine 26 | detail by changing the sample interval in response to the rate of 27 | change of the value (longer sample interval when there is little 28 | change, shorter when change is occurring). But this is left as an 29 | exercise for the reader. 30 | """ 31 | if self._count[hostidx] <= TimeseriesAccumulator.LEAD_IN_DATAPOINTS: 32 | idx = self._append(hostidx, timestamp, value) 33 | return idx 34 | 35 | if self._samplewindow is None: 36 | # compute the sample window based on the first host to get past the lead-in 37 | leadin = self._data[hostidx, TimeseriesAccumulator.LEAD_IN_DATAPOINTS, 0] - self._data[hostidx, 0, 0] 38 | self._samplewindow = (self._totaltime - (2.0 * leadin)) / (TimeseriesAccumulator.MAX_DATAPOINTS - 2 * TimeseriesAccumulator.LEAD_IN_DATAPOINTS) 39 | self._leadout = self._data[hostidx, 0, 0] + self._totaltime - leadin 40 | 41 | if ((timestamp > self._leadout) or (timestamp > self._data[hostidx, self._count[hostidx] - 1, 0] + self._samplewindow)) and self._count[hostidx] < TimeseriesAccumulator.MAX_DATAPOINTS: 42 | idx = self._append(hostidx, timestamp, value) 43 | return idx 44 | 45 | return None 46 | 47 | def _append(self, hostidx, timestamp, value): 48 | """ Add this data to the store """ 49 | insertidx = self._count[hostidx] 50 | self._data[hostidx, insertidx, 0] = timestamp 51 | self._data[hostidx, insertidx, 1] = value 52 | self._count[hostidx] += 1 53 | return insertidx 54 | 55 | def gethost(self, hostidx): 56 | """ return the data series """ 57 | return self._data[hostidx, :self._count[hostidx], :] 58 | 59 | def get(self): 60 | """ TODO numpy interp """ 61 | return self._data[:, :numpy.min(self._count), :] 62 | 63 | def __str__(self): 64 | return str(self._data[:, :numpy.min(self._count), :]) 65 | 66 | 67 | class RangeConverter(object): 68 | """ 69 | Convert data from limited width to 64-bit width. Optionally raise an exception if 70 | the counters spin too fast. 71 | """ 72 | 73 | def __init__(self, precision, checkoverflow=False): 74 | self._range = pow(2.0, precision) 75 | self._last = None 76 | self._accumulator = None 77 | self._checkoverflow = checkoverflow 78 | 79 | def append(self, indata): 80 | """ add updated data and return stored value """ 81 | value = numpy.array(indata) 82 | 83 | if self._last is not None: 84 | delta = (value - self._last) % self._range 85 | 86 | if self._checkoverflow: 87 | if delta > (self._range / 2.0): 88 | raise Exception("Counter overflow") 89 | self._accumulator += delta 90 | else: 91 | self._accumulator = numpy.zeros(value.shape) 92 | 93 | self._last = value 94 | 95 | return self._accumulator 96 | 97 | def get(self): 98 | """ get current stored value """ 99 | return self._accumulator 100 | -------------------------------------------------------------------------------- /src/supremm/summarize.py: -------------------------------------------------------------------------------- 1 | """ Definition of the summarize API """ 2 | from abc import ABC, abstractmethod 3 | 4 | VERSION = "1.0.6" 5 | TIMESERIES_VERSION = 4 6 | 7 | 8 | class Summarize(ABC): 9 | """ Abstract base class describing the job summarization interface.
10 | """ 11 | 12 | def __init__(self, preprocessors, analytics, job, config, fail_fast=False): 13 | self.preprocs = preprocessors 14 | self.alltimestamps = [x for x in analytics if x.mode in ("all", "timeseries")] 15 | self.firstlast = [x for x in analytics if x.mode == "firstlast"] 16 | self.errors = {} 17 | self.job = job 18 | self.fail_fast = fail_fast 19 | 20 | self.version = VERSION 21 | self.timeseries_version = TIMESERIES_VERSION 22 | 23 | @abstractmethod 24 | def get(self): 25 | """ Return a dict with the summary information """ 26 | pass 27 | 28 | def adderror(self, category, errormsg): 29 | """ All errors reported with this function show up in the job summary """ 30 | if category not in self.errors: 31 | self.errors[category] = set() 32 | if isinstance(errormsg, list): 33 | self.errors[category].update(set(errormsg)) 34 | else: 35 | self.errors[category].add(errormsg) 36 | 37 | @abstractmethod 38 | def process(self): 39 | """ Main entry point. All of a job's nodes are processed """ 40 | pass 41 | 42 | @abstractmethod 43 | def complete(self): 44 | """ A job is complete if data exist for all assigned nodes and they have 45 | been processed successfully 46 | """ 47 | pass 48 | 49 | @abstractmethod 50 | def good_enough(self): 51 | """ A job is good_enough if archives for 95% of nodes have 52 | been processed successfully 53 | """ 54 | pass 55 | -------------------------------------------------------------------------------- /src/supremm/supremm_update: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | #------------------------------------------------------------------------- 4 | # Configurable settings 5 | 6 | LOCKFILE=/var/tmp/supremm_summary.lock 7 | 8 | THREADS=`nproc --ignore=2` 9 | 10 | ulimit -n4096 11 | 12 | #------------------------------------------------------------------------- 13 | # Main script 14 | 15 | reportfail() 16 | { 17 | echo "Not running due to another process holding the lock" 18 | exit 1 19 | } 20 | 21 | ( 22 | flock -n 9 || reportfail 23 | 24 | set -e 25 | 26 | #------------------------------------------------------------------------- 27 | # Run index and ingest 28 | 29 | if [ "$1" != "process" ]; then 30 | indexarchives.py -t $THREADS -q 31 | summarize_jobs.py -t $THREADS -q 32 | else 33 | summarize_jobs.py -t $THREADS -d 34 | fi 35 | 36 | ) 9>${LOCKFILE} 37 | 38 | -------------------------------------------------------------------------------- /src/supremm/supremm_upgrade.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ supremm-upgrade script used to alter database or config files to the latest 3 | schema versions """ 4 | 5 | import argparse 6 | import signal 7 | import sys 8 | 9 | def signalHandler(sig, _): 10 | """ clean exit on an INT signal """ 11 | if sig == signal.SIGINT: 12 | sys.exit(0) 13 | 14 | def main(): 15 | """ main entry point """ 16 | parser = argparse.ArgumentParser(description='Upgrade the SUPReMM database and config files') 17 | parser.add_argument('-v', '--verbose', action='store_true', help='Output info level logging') 18 | parser.add_argument('-d', '--debug', action='store_true', help='Output debug level logging') 19 | parser.add_argument('-q', '--quiet', action='store_true', help='Output warning level logging') 20 | 21 | opts = parser.parse_args() 22 | 23 | signal.signal(signal.SIGINT, signalHandler) 24 | 25 | # Nothing to do for a 1.1 to 1.2 upgrade.
26 | 27 | if __name__ == "__main__": 28 | main() 29 | -------------------------------------------------------------------------------- /src/supremm/supremmconf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ helper utiility to print out config info """ 3 | 4 | import sys 5 | import os 6 | import json 7 | import logging 8 | from getopt import getopt 9 | 10 | from supremm.config import Config 11 | from supremm.scripthelpers import setuplogger 12 | 13 | def usage(): 14 | """ print usage """ 15 | print("usage: {0} [OPTS]".format(os.path.basename(__file__))) 16 | print(" -d --debug set log level to debug") 17 | print(" -c --config specify the path to the configuration file") 18 | print(" -s --section SECTION output the configuration data from the specified section") 19 | print(" -i --item ITEM output the configuration data for the specified item") 20 | print(" -h --help print this help message") 21 | 22 | def getoptions(): 23 | """ process comandline options """ 24 | 25 | retdata = {"log" : logging.ERROR, 26 | "config" : None, 27 | "section": None, 28 | "item" : None} 29 | 30 | opts, _ = getopt(sys.argv[1:], "dc:s:i:h", ["debug", "config=", "section=", "item=", "help"]) 31 | 32 | for opt in opts: 33 | if opt[0] in ("-d", "--debug"): 34 | retdata['log'] = logging.DEBUG 35 | if opt[0] in ("-c", "--config"): 36 | retdata['config'] = opt[1] 37 | if opt[0] in ("-s", "--section"): 38 | retdata['section'] = opt[1] 39 | if opt[0] in ("-i", "--item"): 40 | retdata['item'] = opt[1] 41 | if opt[0] in ("-h", "--help"): 42 | usage() 43 | sys.exit(0) 44 | 45 | if 'section' in retdata: 46 | return retdata 47 | 48 | usage() 49 | sys.exit(1) 50 | 51 | def main(): 52 | """ print out config data according to cmdline args """ 53 | opts = getoptions() 54 | 55 | setuplogger(opts['log']) 56 | 57 | if opts['config']: 58 | logging.debug("Using specified path: {}".format(opts['config'])) 59 | else: 60 | logging.debug("Automatically detecting configuration path.") 61 | 62 | try: 63 | conf = Config(opts['config']) 64 | except: 65 | logging.error("Configuration could not be found.") 66 | sys.exit(1) 67 | 68 | if not opts['section']: 69 | print(conf) 70 | sys.exit(0) 71 | 72 | try: 73 | section = conf.getsection(opts['section']) 74 | except KeyError: 75 | logging.error("Section '{}' not defined in configuration file.".format(opts['section'])) 76 | sys.exit(1) 77 | 78 | if opts['item']: 79 | try: 80 | item = section[opts['item']] 81 | except KeyError: 82 | logging.error("Item '{}' not defined in section '{}'.".format(opts['item'], opts['section'])) 83 | sys.exit(1) 84 | 85 | if isinstance(item, dict): 86 | item = json.dumps(item, indent=4) 87 | 88 | print(item) 89 | 90 | else: 91 | print(json.dumps(section, indent=4)) 92 | 93 | if __name__ == "__main__": 94 | main() 95 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubccr/supremm/732b8c696e4a8926b7d29317c9ec95f16d6f87e3/tests/__init__.py -------------------------------------------------------------------------------- /tests/ci/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM rockylinux:8 2 | VOLUME /tmp/supremm 3 | COPY setup.sh /sbin/setup.sh 4 | COPY build.sh /sbin/build.sh 5 | RUN setup.sh 6 | WORKDIR /tmp/supremm 7 | entrypoint ["build.sh"] 8 | 
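A hypothetical usage sketch (not a file from the repository): the supremmconf.py utility shown earlier resolves a configuration section and prints it, and the same Config.getsection() lookup can feed scripthelpers.getdbconnection() directly. The section name "datawarehouse" and the helper function name below are assumptions for illustration only.

#!/usr/bin/env python3
# Illustrative sketch only: combine supremm.config.Config with
# supremm.scripthelpers.getdbconnection(), mirroring what supremmconf.py does.
# The section name "datawarehouse" is an assumption, not a documented section.
import json

from supremm.config import Config
from supremm.scripthelpers import getdbconnection


def show_section_and_connect(configpath=None, sectionname="datawarehouse"):
    """ Print a configuration section and return a DB connection built from it """
    conf = Config(configpath)  # passing None mirrors supremmconf.py's auto-detection path
    section = conf.getsection(sectionname)
    print(json.dumps(section, indent=4))
    # getdbconnection() translates the config keys (host, user, pass, port, ...)
    # into PEP-249 connection arguments and returns a pymysql connection
    return getdbconnection(section, as_dict=True)


if __name__ == "__main__":
    show_section_and_connect()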
-------------------------------------------------------------------------------- /tests/ci/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | 4 | declare -a builds=("rpm" "wheel" "src") 5 | for BUILD in "${builds[@]}"; 6 | do 7 | case $BUILD in 8 | "rpm") 9 | python3 setup.py bdist_rpm 10 | ;; 11 | 12 | "wheel") 13 | python3 setup.py bdist_wheel 14 | ;; 15 | 16 | "src") 17 | tar --exclude={'*.rpm','*.whl'} -czf /tmp/supremm.tar.gz . 18 | mv /tmp/supremm.tar.gz dist 19 | ;; 20 | 21 | *) 22 | ;; 23 | esac 24 | done 25 | -------------------------------------------------------------------------------- /tests/ci/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | dnf install -y epel-release 4 | 5 | # enable powertools repo for Cython 6 | sed -i 's/enabled=0/enabled=1/' /etc/yum.repos.d/Rocky-PowerTools.repo 7 | 8 | SETUP=$1 9 | case $SETUP in 10 | "build") 11 | dnf install -y \ 12 | gcc \ 13 | pcp-devel \ 14 | rpm-build 15 | 16 | # Install development dependencies 17 | dnf install -y \ 18 | python3-numpy \ 19 | python3-scipy \ 20 | python36-devel \ 21 | python3-Cython \ 22 | python3-pymongo \ 23 | python3-PyMySQL \ 24 | python3-pcp \ 25 | python3-requests \ 26 | python3-wheel 27 | ;; 28 | "test") 29 | # Install dependencies 30 | dnf install -y \ 31 | python3-numpy \ 32 | python3-scipy \ 33 | python36-devel \ 34 | python3-Cython \ 35 | python3-pymongo \ 36 | python3-PyMySQL \ 37 | python3-pytest \ 38 | python3-pytest-cov \ 39 | python3-mock \ 40 | python3-pexpect \ 41 | python3-pylint \ 42 | python3-pcp \ 43 | python3-pytz \ 44 | python3-requests \ 45 | pcp-devel \ 46 | ;; 47 | esac 48 | 49 | -------------------------------------------------------------------------------- /tests/ci/srv/prom_cluster.txt: -------------------------------------------------------------------------------- 1 | 123456|123456|robertson|prom|prom|mms|xdmod|1|supremm|2|2023-06-02T00:05:00|2023-06-02T00:00:00|2023-06-02T00:05:00|2023-06-02T04:05:00|0-04:05:00|0:0|COMPLETED|1|4|4|1000M|billing=1,cpu=4,mem=1000M,node=1|billing=1,cpu=4,mem=1000M,node=1|1-00:00:00|cpn-a21-01|mockjob1 2 | 789012|789012|robertson|prom|prom|mms|xdmod|1|supremm|2|2023-06-02T00:05:00|2023-06-02T00:05:00|2023-06-02T00:05:00|2023-06-02T04:05:00|0-04:05:00|0:0|COMPLETED|1|8|8|1000M|billing=8,cpu=8,mem=1000M,node=1|billing=8,cpu=8,mem=1000M,node=8|1-00:00:00|cpn-b22-04|mockjob2 3 | 345678|345678|robertson|prom|prom|mms|xdmod|1|supremm|2|2023-06-02T00:05:00|2023-06-02T00:05:00|2023-06-02T00:05:00|2023-06-02T04:05:00|0-04:05:00|0:0|COMPLETED|1|16|16|1000M|billing=16,cpu=16,mem=1000M,node=1|billing=16,cpu=16,mem=1000M,node=8|1-00:00:00|cpn-c23-04|mockjob3 4 | 901234|901234|robertson|prom|prom|mms|xdmod|1|supremm|2|2023-06-02T00:05:00|2023-06-02T00:05:00|2023-06-02T00:05:00|2023-06-02T04:05:00|0-04:05:00|0:0|COMPLETED|3|4|4|1000M|billing=4,cpu=4,mem=1000M,node=3|billing=4,cpu=4,mem=1000M,node=3|1-00:00:00|cpn-a21-01,cpn-b22-04,cpn-c23-08|mockjob4 5 | -------------------------------------------------------------------------------- /tests/ci/srv/prometheus/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM prom/prometheus:latest 2 | RUN touch /prometheus/queries.active 3 | COPY ./prometheus.yml /etc/prometheus/prometheus.yml 4 | ADD ./promdata.tar.gz /prometheus 5 | -------------------------------------------------------------------------------- 
/tests/ci/srv/prometheus/promdata.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubccr/supremm/732b8c696e4a8926b7d29317c9ec95f16d6f87e3/tests/ci/srv/prometheus/promdata.tar.gz -------------------------------------------------------------------------------- /tests/ci/srv/prometheus/prometheus.yml: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: "30s" 3 | scrape_timeout: "15s" 4 | -------------------------------------------------------------------------------- /tests/ci/srv/prometheus/web.yml: -------------------------------------------------------------------------------- 1 | # TODO use this for basic auth 2 | -------------------------------------------------------------------------------- /tests/ci/srv/services.yml: -------------------------------------------------------------------------------- 1 | version: "3.9" 2 | 3 | services: 4 | prometheus: 5 | build: 6 | context: ./prometheus 7 | hostname: prometheus 8 | container_name: prometheus 9 | volumes: 10 | - type: volume 11 | source: tsdb 12 | target: /var/lib/prometheus 13 | command: 14 | - '--storage.tsdb.path=/prometheus' 15 | - '--config.file=/etc/prometheus/prometheus.yml' 16 | - '--web.config.file=/etc/prometheus/web.yml' 17 | network_mode: "host" 18 | 19 | mongod: 20 | image: 21 | mongo:6.0 22 | environment: 23 | - MONGO_INITDB_ROOT_USERNAME=supremm 24 | - MONGO_INITDB_ROOT_PASSWORD=supremm-test123 25 | volumes: 26 | - type: volume 27 | source: mongodb 28 | target: /data/db 29 | network_mode: "host" 30 | 31 | volumes: 32 | tsdb: 33 | mongodb: 34 | -------------------------------------------------------------------------------- /tests/ci/test/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM rockylinux:8 2 | VOLUME /tmp/supremm 3 | WORKDIR /tmp/supremm 4 | RUN dnf install -y epel-release && \ 5 | sed -i 's/enabled=0/enabled=1/' /etc/yum.repos.d/Rocky-PowerTools.repo 6 | COPY bootstrap.sh /usr/bin/ 7 | COPY supremm_setup_expect.py /usr/bin/ 8 | COPY entrypoint.sh /usr/local/bin/ 9 | RUN chmod u+x /usr/bin/supremm_setup_expect.py 10 | RUN chmod u+x /usr/local/bin/entrypoint.sh 11 | ENTRYPOINT ["entrypoint.sh"] 12 | -------------------------------------------------------------------------------- /tests/ci/test/bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euxo pipefail 3 | shopt -s extglob 4 | 5 | tests/ci/setup.sh test 6 | 7 | INSTALL_TYPE=$1 8 | case $INSTALL_TYPE in 9 | "rpm") 10 | dnf install -y dist/supremm-+([0-9.])*.x86_64.rpm 11 | ;; 12 | "wheel") 13 | pip3 install dist/supremm-+([0-9.])*.whl 14 | ;; 15 | "src") 16 | tar -xzf dist/supremm.tar.gz -C /tmp 17 | python3 /tmp/setup.py install 18 | ;; 19 | esac 20 | 21 | ~/bin/services start 22 | mongod -f /etc/mongod.conf --auth 23 | 24 | mkdir -p /data/{phillips,pozidriv,frearson,mortorq,robertson}/{pcp-logs,jobscripts} 25 | mkdir -p "/data/mortorq/pcp-logs/hostname/2016/12/30" 26 | 27 | # Run setup script 28 | python3 tests/integration_tests/supremm_setup_expect.py 29 | 30 | # Copy node-level archives 31 | cp tests/integration_tests/pcp_logs_extracted/* /data/mortorq/pcp-logs/hostname/2016/12/30 32 | 33 | # Create files containing 'job scripts' for 'start' jobs 34 | jspath=/data/phillips/jobscripts/20170101 35 | mkdir $jspath 36 | for jobid in 197155 197182 197186 197199 1234234[21] 123424[] 37 | do 38 | echo 
"Job script for job $jobid" > $jspath/$jobid.savescript 39 | done 40 | 41 | # Create job scripts for a submit jobs 42 | jspath=/data/robertson/jobscripts/20161212 43 | mkdir $jspath 44 | for jobid in 6066098 45 | do 46 | echo "Job script for job $jobid" > $jspath/$jobid.savescript 47 | done 48 | 49 | # Create job script for end jobs 50 | jspath=/data/pozidriv/jobscripts/20161230 51 | mkdir $jspath 52 | for jobid in 983936 53 | do 54 | echo "Job script for job $jobid" > $jspath/$jobid.savescript 55 | done 56 | -------------------------------------------------------------------------------- /tests/component/data/perfevent.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubccr/supremm/732b8c696e4a8926b7d29317c9ec95f16d6f87e3/tests/component/data/perfevent.0 -------------------------------------------------------------------------------- /tests/component/data/perfevent.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubccr/supremm/732b8c696e4a8926b7d29317c9ec95f16d6f87e3/tests/component/data/perfevent.index -------------------------------------------------------------------------------- /tests/component/data/perfevent.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubccr/supremm/732b8c696e4a8926b7d29317c9ec95f16d6f87e3/tests/component/data/perfevent.meta -------------------------------------------------------------------------------- /tests/component/runtests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 4 | 5 | outputfile=`mktemp` 6 | python3 $DIR/../../src/supremm/supremm_testharness.py -c $DIR/../../config $DIR/data/perfevent > $outputfile 7 | 8 | # Check that there are data in the output for all the following 9 | jq -e .cpuperf.cpiref.avg < $outputfile 10 | jq -e .cpuperf.flops.avg < $outputfile 11 | jq -e .cpuperf.cpldref.avg < $outputfile 12 | jq -e .uncperf.membw.avg < $outputfile 13 | jq -e .timeseries.membw < $outputfile 14 | jq -e .timeseries.simdins < $outputfile 15 | jq -e .summarization.datasource < $outputfile 16 | 17 | rm -f $outputfile 18 | -------------------------------------------------------------------------------- /tests/integration_tests/5894431-1622570028/cpn-d14-02.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubccr/supremm/732b8c696e4a8926b7d29317c9ec95f16d6f87e3/tests/integration_tests/5894431-1622570028/cpn-d14-02.0 -------------------------------------------------------------------------------- /tests/integration_tests/5894431-1622570028/cpn-d14-02.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubccr/supremm/732b8c696e4a8926b7d29317c9ec95f16d6f87e3/tests/integration_tests/5894431-1622570028/cpn-d14-02.index -------------------------------------------------------------------------------- /tests/integration_tests/5894431-1622570028/cpn-d14-02.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubccr/supremm/732b8c696e4a8926b7d29317c9ec95f16d6f87e3/tests/integration_tests/5894431-1622570028/cpn-d14-02.meta -------------------------------------------------------------------------------- 
/tests/integration_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubccr/supremm/732b8c696e4a8926b7d29317c9ec95f16d6f87e3/tests/integration_tests/__init__.py -------------------------------------------------------------------------------- /tests/integration_tests/integration_plugin_api.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from mock import patch 3 | import pytest 4 | 5 | from supremm import summarize_jobs 6 | from tests.integration_tests.mock_preprocessor import MockPreprocessor 7 | from tests.integration_tests.throwing_plugin import InitThrowingPlugin, ProcessThrowingPlugin, ResultsThrowingPlugin 8 | 9 | 10 | @pytest.mark.parametrize("threads", [1, 3]) 11 | def test_plugin_api(threads): 12 | test_args = "summarize_jobs.py -d -r 2 -j 972366 --fail-fast --threads {}".format(threads).split() 13 | preprocs = [MockPreprocessor] 14 | plugins = [] 15 | # this was very non-obvious to me but since summarize_jobs does "from supremm.plugin import loadpreprocs" 16 | # you have to patch loadpreprocs as if it was in the summarize_jobs module 17 | with patch.object(sys, "argv", test_args), patch("supremm.summarize_jobs.loadpreprocessors", return_value=preprocs), patch("supremm.summarize_jobs.loadplugins", return_value=plugins): 18 | summarize_jobs.main() 19 | 20 | 21 | @pytest.mark.parametrize("threads", [1, 3]) 22 | def test_exception_init(threads): 23 | test_args = "summarize_jobs.py -d -r 2 -j 972366 --threads {}".format(threads).split() 24 | plugins = [InitThrowingPlugin] 25 | preprocs = [] 26 | with patch.object(sys, "argv", test_args), patch("supremm.summarize_jobs.loadpreprocessors", return_value=preprocs), patch("supremm.summarize_jobs.loadplugins", return_value=plugins): 27 | summarize_jobs.main() 28 | 29 | 30 | @pytest.mark.parametrize("threads", [1, 3]) 31 | def test_exception_process(threads): 32 | test_args = "summarize_jobs.py -d -r 2 -j 972366 --threads {}".format(threads).split() 33 | plugins = [ProcessThrowingPlugin] 34 | preprocs = [] 35 | with patch.object(sys, "argv", test_args), patch("supremm.summarize_jobs.loadpreprocessors", return_value=preprocs), patch("supremm.summarize_jobs.loadplugins", return_value=plugins): 36 | summarize_jobs.main() 37 | 38 | 39 | @pytest.mark.parametrize("threads", [1, 3]) 40 | def test_exception_results(threads): 41 | test_args = "summarize_jobs.py -d -r 2 -j 972366 --threads {}".format(threads).split() 42 | plugins = [ResultsThrowingPlugin] 43 | preprocs = [] 44 | with patch.object(sys, "argv", test_args), patch("supremm.summarize_jobs.loadpreprocessors", return_value=preprocs), patch("supremm.summarize_jobs.loadplugins", return_value=plugins): 45 | summarize_jobs.main() 46 | -------------------------------------------------------------------------------- /tests/integration_tests/integration_test.bash: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euxo pipefail 4 | 5 | indexarchives.py -da 6 | summarize_jobs.py -d -r 2 -j 972366 --fail-fast 7 | 8 | # DISABLED until XDMoD is ported to Centos 8 9 | #aggregate_supremm.sh 10 | # 11 | #count=$(mysql -ss -u root < 1: 38 | p.expect('Enable SUPReMM summarization for this resource?') 39 | if i > 5: 40 | p.sendline("n") 41 | continue 42 | p.sendline("y") 43 | if i != 0: 44 | p.expect("Data collector backend \(pcp or prometheus\)") 45 | if i <= 4: 46 | config_pcp(p) 47 | elif i == 5: 48 | 
config_prometheus(p) 49 | p.expect("Source of accounting data") 50 | p.sendline() 51 | p.expect("node name unique identifier") 52 | p.sendline() 53 | p.expect("Directory containing job launch scripts") 54 | p.sendline() 55 | p.expect("Job launch script timestamp lookup mode \('submit', 'start' or 'none'\)") 56 | p.sendline(scriptsettings[i-1]) 57 | else: 58 | break 59 | 60 | p.expect("Press ENTER to continue") 61 | p.sendline() 62 | 63 | p.expect("Select an option") 64 | p.sendline("d") 65 | p.expect("Enter path to configuration files") 66 | p.sendline() 67 | p.expect("DB hostname") 68 | p.sendline() 69 | p.expect("DB port") 70 | p.sendline() 71 | p.expect("DB Admin Username") 72 | p.sendline() 73 | p.expect("DB Admin Password") 74 | p.sendline() 75 | p.expect("Do you wish to proceed") 76 | p.sendline("y") 77 | p.expect("Press ENTER to continue") 78 | p.sendline() 79 | 80 | p.expect("Select an option") 81 | p.sendline("m") 82 | p.expect("Enter path to configuration files") 83 | p.sendline() 84 | p.expect("URI") 85 | p.sendline("mongodb://localhost/supremm") 86 | p.expect("Do you wish to proceed") 87 | p.sendline("y") 88 | p.expect("Press ENTER to continue") 89 | p.sendline() 90 | 91 | p.expect("Select an option") 92 | p.sendline("q") 93 | 94 | if __name__ == '__main__': 95 | main() 96 | -------------------------------------------------------------------------------- /tests/integration_tests/throwing_plugin.py: -------------------------------------------------------------------------------- 1 | from supremm.plugin import Plugin 2 | 3 | 4 | class InitThrowingPlugin(Plugin): 5 | name = property(lambda self: "init_throwing_plugin") 6 | mode = property(lambda self: "timeseries") 7 | requiredMetrics = property(lambda self: ["hinv.ncpu", "gpfs.fsios.read_bytes"]) 8 | optionalMetrics = property(lambda self: []) 9 | derivedMetrics = property(lambda self: []) 10 | 11 | def __init__(self, job): 12 | super(InitThrowingPlugin, self).__init__(job) 13 | raise Exception("Exception in __init__") 14 | 15 | def process(self, nodemeta, timestamp, data, description): 16 | pass 17 | 18 | def results(self): 19 | pass 20 | 21 | 22 | class ProcessThrowingPlugin(Plugin): 23 | name = property(lambda self: "process_throwing_plugin") 24 | mode = property(lambda self: "timeseries") 25 | requiredMetrics = property(lambda self: ["hinv.ncpu", "gpfs.fsios.read_bytes"]) 26 | optionalMetrics = property(lambda self: []) 27 | derivedMetrics = property(lambda self: []) 28 | 29 | def __init__(self, job): 30 | super(ProcessThrowingPlugin, self).__init__(job) 31 | 32 | def process(self, nodemeta, timestamp, data, description): 33 | raise Exception("Exception in process") 34 | 35 | def results(self): 36 | pass 37 | 38 | 39 | class ResultsThrowingPlugin(Plugin): 40 | name = property(lambda self: "results_throwing_plugin") 41 | mode = property(lambda self: "timeseries") 42 | requiredMetrics = property(lambda self: ["hinv.ncpu", "gpfs.fsios.read_bytes"]) 43 | optionalMetrics = property(lambda self: []) 44 | derivedMetrics = property(lambda self: []) 45 | 46 | def __init__(self, job): 47 | super(ResultsThrowingPlugin, self).__init__(job) 48 | 49 | def process(self, nodemeta, timestamp, data, description): 50 | return False 51 | 52 | def results(self): 53 | raise Exception("Exception in results") 54 | -------------------------------------------------------------------------------- /tests/testPcpArchiveProcessor.py: -------------------------------------------------------------------------------- 1 | """" tests for the pcp archive 
processor """ 2 | import unittest 3 | from supremm.datasource.pcp.indexarchives import PcpArchiveProcessor 4 | 5 | class TestPcpArchiveProcessor(unittest.TestCase): 6 | """ Tests for the pcp filename string parser code """ 7 | 8 | def setUp(self): 9 | """ setUp """ 10 | self.inst = PcpArchiveProcessor({'hostname_mode': 'hostname'}) 11 | 12 | def test_archivestringmatching(self): 13 | """ test timestamp parsing """ 14 | 15 | testCases = { 16 | 'jo.log.ex.e-end-20180614.09.48.29.index': None, 17 | 'job-2671016.index': None, 18 | 'job-2679009[431].index': None, 19 | 'job-123423-end-20181004.04.05.41.index': 1538625941.0, 20 | 'job-123423-begin-20181004.04.05.41.index': 1538625941.0, 21 | 'job-123423-postbegin-20181004.04.05.41.index': 1538625941.0, 22 | 'job-123423[234]-end-20181004.04.05.41.index': 1538625941.0, 23 | 'job-123423[]-end-20181004.04.05.41.index': 1538625941.0, 24 | 'job-123423[234].server.net-end-20181004.04.05.41.index': 1538625941.0, 25 | 'job-123423[234].server.net-postbegin-20181004.04.05.41.index': 1538625941.0, 26 | 'job-123423[234].server.net-begin-20181004.04.05.41.index': 1538625941.0, 27 | 'job-123423.server.net-end-20181004.04.05.41.index': 1538625941.0 28 | } 29 | 30 | for archiveName, expected in testCases.items(): 31 | assert self.inst.get_archive_data_fast('/some/path/to/data/' + archiveName) == expected 32 | 33 | def test_jobidparser(self): 34 | """ test jobid parsing """ 35 | 36 | testCases = { 37 | 'jo.log.ex.e-end-20180614.09.48.29.index': None, 38 | '20180729.04.36.index': None, 39 | 'job-2671016.index': (-1, -1, 2671016), 40 | 'job-2673760.index': (-1, -1, 2673760), 41 | 'job-2671022.login.example.edu-end-20180830.02.54.25.index': (-1, -1, 2671022), 42 | 'job-2673760.login.example.edu-end-20180830.02.40.28.index': (-1, -1, 2673760), 43 | 'job-2673760.login.example.edu-end-20180830.02.50.16.index': (-1, -1, 2673760), 44 | 'job-1450543.login.example.edu-postbegin-20180830.00.00.00.index': (-1, -1, 1450543), 45 | 'job-1450554.login.example.edu-postbegin-20180830.00.00.00.index': (-1, -1, 1450554), 46 | 'job-2676199[18].index': (2676199, 18, -1), 47 | 'job-2679009[431].index': (2679009, 431, -1), 48 | 'job-1451551[326].hd-20180614.13.26.33.index': (1451551, 326, -1), 49 | 'job-2676200[18].login.example.edu-end-20180830.02.45.38.index': (2676200, 18, -1), 50 | 'job-2676200[18].login.example.edu-end-20180830.02.46.54.index': (2676200, 18, -1), 51 | 'job-2679009[431].login.example.edu-end-20180904.18.38.02.index': (2679009, 431, -1), 52 | 'job-2679136[520].login.example.edu-postbegin-20180614.00.00.00.index': (2679136, 520, -1), 53 | 'job-2679136[523].login.example.edu-postbegin-20180614.00.00.00.index': (2679136, 523, -1), 54 | 'job-1450512[4].login.example.edu-postbegin-20180614.00.00.00.index': (1450512, 4, -1), 55 | 'job-123423-end-20181004.04.05.41.index': (-1, -1, 123423), 56 | 'job-123423[234]-end-20181004.04.05.41.index': (123423, 234, -1), 57 | 'job-123423[]-end-20181004.04.05.41.index': (-1, -1, 123423), 58 | 'job-end-20181004.04.05.41.index': None, 59 | 'job-123423[234].server.net-end-20181004.04.05.41.index': (123423, 234, -1), 60 | 'job-123423.server.net-end-20181004.04.05.41.index': (-1, -1, 123423) 61 | } 62 | 63 | for archiveName, expected in testCases.items(): 64 | assert self.inst.parsejobid(archiveName) == expected 65 | -------------------------------------------------------------------------------- /tests/testrangechange.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy 3 | 
from supremm.rangechange import RangeChange 4 | 5 | class MockConfig(object): 6 | def __init__(self, settings): 7 | self.settings = settings 8 | 9 | def getsection(self, sectionname): 10 | return dict(self.settings[sectionname]) 11 | 12 | 13 | class TestRangeChange(unittest.TestCase): 14 | 15 | def test_normalization(self): 16 | 17 | config = MockConfig({"normalization": {"perfevent.hwcounters.CPU_CLK_UNHALTED.value": {"range": 48}}}) 18 | 19 | r = RangeChange(config) 20 | 21 | r.set_fetched_metrics(["perfevent.hwcounters.CPU_CLK_UNHALTED.value", "something.else", "perfevent.hwcounters.RETIRED_INSTRUCTIONS.value"]) 22 | 23 | self.assertFalse(r.passthrough) 24 | 25 | data = [] 26 | val = numpy.power([2,2,2], 48) - numpy.array([1,2,3]) 27 | data.append(val) 28 | val = val - numpy.array([3,3,3]) 29 | data.append(val) 30 | val = val - numpy.array([3,3,3]) 31 | data.append(val) 32 | 33 | r.normalise_data(1.000, data) 34 | 35 | self.assertTrue( numpy.all(data[0] == numpy.power([2,2,2], 48) - numpy.array([1,2,3]) )) 36 | self.assertTrue( numpy.all(data[1] == numpy.power([2,2,2], 48) - numpy.array([4,5,6]) )) 37 | self.assertTrue( numpy.all(data[2] == numpy.power([2,2,2], 48) - numpy.array([7,8,9]) )) 38 | 39 | d2 = [] 40 | d2.append( (data[0] + numpy.array([10,10,10])) % numpy.power(2,48)) 41 | d2.append(numpy.array([40,50,60])) 42 | d2.append(numpy.array([70,80,90])) 43 | 44 | r.normalise_data(2.000, d2) 45 | 46 | delta = d2[0] - data[0] 47 | 48 | self.assertTrue( numpy.all(delta == numpy.array([10,10,10]))) 49 | self.assertTrue( numpy.all(d2[1] == numpy.array([40,50,60]))) 50 | self.assertTrue( numpy.all(d2[2] == numpy.array([70,80,90]))) 51 | 52 | 53 | def test_passthrough(self): 54 | 55 | config = MockConfig({"normalization": {"perfevent.hwcounters.CPU_CLK_UNHALTED.value": {"range": 48}}}) 56 | 57 | r = RangeChange(config) 58 | 59 | r.set_fetched_metrics(["kernel.percpu.cpu.user", "kernel.percpu.cpu.system"]) 60 | self.assertTrue(r.passthrough) 61 | 62 | data = [numpy.array([234,23423,234,23423,23423]), numpy.array([856,5698,789,127,90780])] 63 | 64 | r.normalise_data(1.000, data) 65 | 66 | self.assertTrue(numpy.all(data[0] == numpy.array([234,23423,234,23423,23423]))) 67 | self.assertTrue(numpy.all(data[1] == numpy.array([856,5698,789,127,90780]))) 68 | 69 | def test_missingconfig(self): 70 | 71 | config = MockConfig({}) 72 | r = RangeChange(config) 73 | 74 | r.set_fetched_metrics(["kernel.percpu.cpu.user", "kernel.percpu.cpu.system"]) 75 | 76 | data = [numpy.array([234,23423,234,23423,23423]), numpy.array([856,5698,789,127,90780])] 77 | 78 | r.normalise_data(1.000, data) 79 | 80 | self.assertTrue(numpy.all(data[0] == numpy.array([234,23423,234,23423,23423]))) 81 | self.assertTrue(numpy.all(data[1] == numpy.array([856,5698,789,127,90780]))) 82 | 83 | if __name__ == '__main__': 84 | unittest.main() 85 | --------------------------------------------------------------------------------
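The throwing plugins and tests/integration_tests/integration_plugin_api.py above exercise the plugin interface by patching the loadplugins() and loadpreprocessors() return values. Below is a minimal sketch of a well-behaved plugin with the same layout; it is illustrative only and not part of the repository, and the per-node bookkeeping, the nodemeta attribute lookup, and the shape of the results() dictionary are assumptions.

from supremm.plugin import Plugin


class DatapointCountPlugin(Plugin):
    """ Illustrative plugin that counts the datapoints delivered to process() per node """
    name = property(lambda self: "datapoint_count_plugin")
    mode = property(lambda self: "all")
    requiredMetrics = property(lambda self: ["hinv.ncpu"])
    optionalMetrics = property(lambda self: [])
    derivedMetrics = property(lambda self: [])

    def __init__(self, job):
        super(DatapointCountPlugin, self).__init__(job)
        self._counts = {}

    def process(self, nodemeta, timestamp, data, description):
        # the attribute used to identify the node is an assumption for this sketch
        nodeid = getattr(nodemeta, "nodename", id(nodemeta))
        self._counts[nodeid] = self._counts.get(nodeid, 0) + 1
        # ResultsThrowingPlugin above returns False from process(); True is assumed
        # here to mean "keep delivering data for this node"
        return True

    def results(self):
        return {"datapoints": self._counts}

As with the throwing plugins, such a class could be supplied through the patched loadplugins() return value so that summarize_jobs.main() runs it against the test archives.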