├── .circleci └── config.yml ├── .gitignore ├── .pylintrc ├── AUTHORS ├── CHANGELOG.md ├── LICENSE ├── README.md ├── config ├── ccr-files │ └── supremm_update_ccr ├── config.json ├── prometheus │ └── mapping.json └── templates │ ├── hotproc │ └── hotproc.conf │ ├── pmlogger │ ├── control │ └── pmlogger-supremm.config │ └── slurm │ ├── slurm-epilog │ └── slurm-prolog ├── pytest.ini ├── setup.cfg ├── setup.py ├── src └── supremm │ ├── .gitignore │ ├── Job.py │ ├── TimeseriesPatterns.py │ ├── __init__.py │ ├── account.py │ ├── accounting.py │ ├── assets │ ├── modw_pcp.sql │ ├── modw_supremm.sql │ └── mongo_setup.js │ ├── batch_acct.py │ ├── config.py │ ├── datadumper.py │ ├── datasource │ ├── __init__.py │ ├── datasource.py │ ├── factory.py │ ├── pcp │ │ ├── __init__.py │ │ ├── indexarchives.py │ │ ├── pcparchive.py │ │ ├── pcpcinterface │ │ │ ├── __init__.py │ │ │ ├── c_pcp.pxd │ │ │ └── pcpcinterface.pyx │ │ ├── pcpdatasource.py │ │ └── pcpsummarize.py │ └── prometheus │ │ ├── __init__.py │ │ ├── promdatasource.py │ │ ├── prominterface.py │ │ ├── prommapping.py │ │ └── promsummarize.py │ ├── errors.py │ ├── gen_pmlogger_control.py │ ├── ingest_jobscripts.py │ ├── lariat.py │ ├── linuxhelpers.py │ ├── migrations │ └── 1.0-1.1 │ │ └── modw_supremm.sql │ ├── outputter.py │ ├── plugin.py │ ├── plugins │ ├── ArmPowerUsageTimeseries.py │ ├── Block.py │ ├── BlockTimeseries.py │ ├── Catastrophe.py │ ├── CgroupMemTimeseries.py │ ├── CgroupMemory.py │ ├── CpuCategories.py │ ├── CpuPerfCounters.py │ ├── CpuUsage.py │ ├── CpuUserTimeseries.py │ ├── Gpfs.py │ ├── GpfsTimeseries.py │ ├── GpuPower.py │ ├── GpuUsage.py │ ├── GpuUsageTimeseries.py │ ├── InfiniBand.py │ ├── InfiniBandTimeseries.py │ ├── IpmiPower.py │ ├── Lnet.py │ ├── LoadAvg.py │ ├── Lustre.py │ ├── LustreTimeseries.py │ ├── MemBwTimeseries.py │ ├── MemUsageTimeseries.py │ ├── MemoryUsage.py │ ├── Network.py │ ├── Nfs.py │ ├── NfsTimeseries.py │ ├── NodeMemoryUsage.py │ ├── PowerUsageTimeseries.py │ ├── SimdInsTimeseries.py │ ├── SveTimeseries.py │ ├── TaccCatastrophe.py │ ├── TaccPerfCounters.py │ ├── TaccUncoreCounters.py │ ├── TimeseriesPatternsGpfs.py │ ├── TotalMemUsageTimeseries.py │ ├── UncoreCounters.py │ └── __init__.py │ ├── preprocessors │ ├── HardwareInventory.py │ ├── PerfEvent.py │ ├── Proc.py │ ├── ProcPrometheus.py │ └── __init__.py │ ├── proc_common.py │ ├── processhelpers.py │ ├── rangechange.py │ ├── scripthelpers.py │ ├── statistics.py │ ├── subsample.py │ ├── summarize.py │ ├── summarize_jobs.py │ ├── summarize_mpi.py │ ├── supremm_setup.py │ ├── supremm_testharness.py │ ├── supremm_update │ ├── supremm_upgrade.py │ ├── supremmconf.py │ ├── xdmodaccount.py │ └── xdmodstylesetupmenu.py └── tests ├── __init__.py ├── ci ├── Dockerfile ├── build.sh ├── setup.sh ├── srv │ ├── prom_cluster.txt │ ├── prometheus │ │ ├── Dockerfile │ │ ├── promdata.tar.gz │ │ ├── prometheus.yml │ │ └── web.yml │ └── services.yml └── test │ ├── Dockerfile │ └── bootstrap.sh ├── component ├── data │ ├── perfevent.0 │ ├── perfevent.index │ └── perfevent.meta └── runtests.sh ├── integration_tests ├── 5894431-1622570028 │ ├── cpn-d14-02.0 │ ├── cpn-d14-02.index │ └── cpn-d14-02.meta ├── __init__.py ├── integration_plugin_api.py ├── integration_test.bash ├── mock_preprocessor.py ├── pcp_logs_extracted │ ├── 20161229.00.10.0 │ ├── 20161229.00.10.index │ ├── 20161229.00.10.meta │ ├── job-972366-begin-20161229.23.06.00.0 │ ├── job-972366-begin-20161229.23.06.00.index │ ├── job-972366-begin-20161229.23.06.00.meta │ ├── job-972366-end-20161230.00.06.00.0 
│ ├── job-972366-end-20161230.00.06.00.index │ └── job-972366-end-20161230.00.06.00.meta ├── supremm_setup_expect.py └── throwing_plugin.py ├── testPcpArchiveProcessor.py ├── testgetoptions.py ├── testrangechange.py └── testsummarize.py /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.1 2 | jobs: 3 | build: 4 | docker: 5 | - image: rockylinux:8 6 | steps: 7 | - checkout 8 | - run: 9 | name: Install System Dependencies 10 | command: ./tests/ci/setup.sh build 11 | - run: 12 | name: Build Software Package 13 | command: ./tests/ci/build.sh 14 | - persist_to_workspace: 15 | root: . 16 | paths: 17 | - dist/* 18 | 19 | test: 20 | parameters: 21 | test-mode: 22 | type: string 23 | install-type: 24 | type: string 25 | docker: 26 | - image: tools-ext-01.ccr.xdmod.org/xdmod-job_performance-10.5.0:rockylinux8-0.1 27 | environment: 28 | TERM: xterm 29 | TERMINFO: /bin/bash 30 | COMPOSER_ALLOW_SUPERUSER: 1 31 | XDMOD_REALMS: 'jobs,storage,cloud' 32 | XDMOD_IS_CORE: yes 33 | XDMOD_INSTALL_DIR: /xdmod 34 | XDMOD_TEST_MODE: << parameters.test-mode >> 35 | SUPREMM_INSTALL_TYPE: << parameters.install-type >> 36 | steps: 37 | - checkout 38 | - run: 39 | name: Create Test Result Directories 40 | command: | 41 | mkdir -p shippable/testresults 42 | mkdir -p shippable/codecoverage 43 | - attach_workspace: 44 | at: . 45 | - run: 46 | name: Install Docker Compose 47 | command: | 48 | dnf install -y dnf-utils 49 | dnf config-manager --add-repo https://download.docker.com/linux/centos/docker-ce.repo 50 | dnf install -y docker-ce docker-ce-cli docker-compose-plugin 51 | 52 | - setup_remote_docker 53 | - run: 54 | name: Build services 55 | command: docker compose -f ./tests/ci/srv/services.yml build 56 | - run: 57 | name: Start services 58 | command: docker compose -f ./tests/ci/srv/services.yml up -d 59 | - run: 60 | name: Run Bootstrap 61 | command: ./tests/ci/test/bootstrap.sh $SUPREMM_INSTALL_TYPE 62 | - run: 63 | name: Run Integration Tests 64 | command: ./tests/integration_tests/integration_test.bash 65 | - run: 66 | name: Run Component Tests 67 | command: ./tests/component/runtests.sh 68 | - run: 69 | name: Summarize Jobs 70 | command: summarize_jobs.py -h > /dev/null 71 | - run: 72 | name: Index Archives 73 | command: indexarchives.py -h > /dev/null 74 | - run: 75 | name: Ingest Jobs 76 | command: ingest_jobscripts.py -d 77 | - run: 78 | name: Pylint 79 | command: pylint-3 --errors-only supremm 80 | - run: 81 | name: Pytest 82 | command: pytest-3 --junitxml=shippable/testresults/testreport.xml --cov=supremm --cov-report xml:shippable/codecoverage/coverage.xml 83 | - run: 84 | name: Remove Currently Installed SUPREMM 85 | command: dnf remove -y supremm 86 | - store_test_results: 87 | path: shippable/testresults 88 | - store_artifacts: 89 | path: shippable/codecoverage 90 | - store_artifacts: 91 | path: /var/log/xdmod 92 | 93 | workflows: 94 | full-build: 95 | jobs: 96 | - build 97 | - test: 98 | matrix: 99 | parameters: 100 | test-mode: ["fresh_install", "upgrade"] 101 | install-type: ["rpm", "wheel", "src"] 102 | requires: 103 | - build 104 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | dist/ 3 | *.egg-info/ 4 | *.pyc 5 | *.so 6 | pypmlogextract.c 7 | pcpcinterface.c 8 | 9 | .idea/ 10 | .vscode/ 11 | .cache/ 12 | 13 | # ci or testing files 14 | .coverage 15 | shippable/ 16 | 
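
A note on the CircleCI workflow above: the full-build workflow fans the test job out over every combination of test-mode ("fresh_install", "upgrade") and install-type ("rpm", "wheel", "src"), so a single pipeline run produces six test jobs, each receiving its combination through the XDMOD_TEST_MODE and SUPREMM_INSTALL_TYPE environment variables. The short Python sketch below is not part of the repository; it simply enumerates the same matrix and prints equivalent local bootstrap invocations (the bootstrap path and variable names are taken from the config above, everything else is illustrative):

#!/usr/bin/env python3
"""Hypothetical helper: enumerate the CircleCI test matrix for local runs."""
import itertools

TEST_MODES = ["fresh_install", "upgrade"]    # test-mode values from .circleci/config.yml
INSTALL_TYPES = ["rpm", "wheel", "src"]      # install-type values from .circleci/config.yml

for test_mode, install_type in itertools.product(TEST_MODES, INSTALL_TYPES):
    # Each CircleCI matrix job exports these variables and then runs the bootstrap step
    print(f"XDMOD_TEST_MODE={test_mode} SUPREMM_INSTALL_TYPE={install_type} "
          f"./tests/ci/test/bootstrap.sh {install_type}")
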
-------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | SUPReMM Summarization 2 | ===================== 3 | 4 | The code for the SUPReMM summarization package was originally written by and includes contributions from: 5 | 6 | - Bill Barth 7 | - Andrew E. Bruno 8 | - Richard T. Evans 9 | - John L. Hammond 10 | - Martins Innus 11 | - Kyle Markus 12 | - Jeffrey T. Palmer 13 | - Joseph P. White 14 | - Thomas Yearke 15 | 16 | Contributors 17 | 18 | - Trey Dockendorf 19 | - Ian DesJardin 20 | - Alex Kofke 21 | -------------------------------------------------------------------------------- /config/ccr-files/supremm_update_ccr: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | #------------------------------------------------------------------------- 4 | # Configurable settigns 5 | 6 | PREFIX=/data/xdtas 7 | LOCKFILE=/var/tmp/supremm_summary.lock 8 | INSTALLPATH=$PREFIX/opt 9 | 10 | GITHUBKEY=$HOME/.ssh/id_rsa_github 11 | 12 | THREADS=`nproc --ignore=2` 13 | 14 | ulimit -n4096 15 | 16 | #------------------------------------------------------------------------- 17 | # Main script 18 | 19 | reportfail() 20 | { 21 | echo "Not running due to another process holding the lock" 22 | exit 1 23 | } 24 | 25 | ( 26 | flock -n 9 || reportfail 27 | 28 | set -e 29 | 30 | #------------------------------------------------------------------------- 31 | # Pull the latest from git and install package from source 32 | 33 | ssh-agent sh -c "ssh-add $GITHUBKEY 2> /dev/null && cd $PREFIX/ccr-pcp && git pull -q" > /dev/null 34 | 35 | cd $PREFIX/ccr-pcp/scripts 36 | python setup.py install --prefix=$INSTALLPATH > /dev/null 37 | 38 | #------------------------------------------------------------------------- 39 | # Run index and ingest 40 | 41 | PATH=$INSTALLPATH/bin:$PATH 42 | PYTHONPATH=$INSTALLPATH/lib64/python2.7/site-packages 43 | 44 | export PYTHONPATH PATH 45 | 46 | if [ "$1" != "process" ]; then 47 | indexarchives.py 48 | account.py 49 | summarize_jobs.py -t $THREADS -q 50 | else 51 | summarize_jobs.py -t $THREADS -d 52 | fi 53 | 54 | ) 9>${LOCKFILE} 55 | 56 | -------------------------------------------------------------------------------- /config/config.json: -------------------------------------------------------------------------------- 1 | { 2 | // Configuration settings for job data input 3 | // The database configuration below assumes everything on the same host and 4 | // XDMoD is installed on the host too. 5 | // 6 | // It is not necessary for all databases to run locally. A more complex 7 | // configuration could have multiple hosts and different credentials for 8 | // each DB. 9 | "xdmodroot": "/etc/xdmod", 10 | "datawarehouse": { 11 | "include": "xdmod://datawarehouse" 12 | }, 13 | // Configuration setting for summary document output 14 | "outputdatabase": { 15 | "db_engine": "mongodb", 16 | "uri": "mongodb://localhost:27017/supremm", 17 | //"uri": "mongodb://username:password@localhost/supremm", 18 | //"uri": "mongodb://mongodb-server1:27017,mongodb-server2:27017/supremm?replicaSet=foo", 19 | //"uri": "mongodb://username:password@mongodb-server1:27017,mongodb-server2:27017/supremm?replicaSet=foo", 20 | "dbname": "supremm" 21 | }, 22 | "summary": { 23 | // The archive out directory should be writable by the process that runs 24 | // the summaries. 
25 | "archive_out_dir": "/dev/shm/supremm_test", 26 | // The following substitutions are defined for the job archive subdirectory: 27 | // %r means the resource name 28 | // %j the local job id 29 | // the rest is sent to strftime with the end time of the job 30 | // Common examples: 31 | // %Y-%m-%d/%r/%j includes the date/resource/jobid in the path 32 | "subdir_out_format": "%r/%j" 33 | }, 34 | "resources": { 35 | // Edit the below to match your cluster name and data locations 36 | "my_cluster_name": { 37 | "enabled": true, 38 | "resource_id": 1, 39 | "batch_system": "XDMoD", 40 | "hostname_mode": "hostname", 41 | "pcp_log_dir": "/data/pcp-logs/my_cluster_name", 42 | "script_dir": "/data/jobscripts/my_cluster_name", 43 | 44 | // fast_index uses an alternative method of indexing job-level pcp archives which can significantly speed 45 | // up the indexarchives.py script. The tradeoff is that the indexed archive end time is not found and the 46 | // start time is slightly less accurate. For normal summarization usage this doesn't matter, but set this 47 | // to false if you need that data for other purposes. 48 | "fast_index": true 49 | 50 | // When using fast_index mode, if the timezone of the resource where the pcp logs were collected is 51 | // different than the timezone of the computer running the indexing, the timezone of the resource 52 | // must be specified here. 53 | //,"timezone": "America/New_York" 54 | } 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /config/prometheus/mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "common": { 3 | "params": ["host"], 4 | "defaults": {"environment": "prod"} 5 | }, 6 | "metrics": { 7 | "cgroup.memory.usage": { 8 | "name": "cgroup_memory_used_bytes", 9 | "params": ["cgroup"], 10 | "groupby": "cgroup" 11 | }, 12 | "cgroup.memory.limit": { 13 | "name": "cgroup_memory_total_bytes", 14 | "params": ["cgroup"], 15 | "groupby": "cgroup" 16 | }, 17 | "disk.dev.read": { 18 | "name": "node_disk_reads_completed_total", 19 | "groupby": "device" 20 | }, 21 | "disk.dev.read_bytes": { 22 | "name": "node_disk_read_bytes_total", 23 | "scaling": "0.0009765625", 24 | "groupby": "device" 25 | }, 26 | "disk.dev.write": { 27 | "name": "node_disk_writes_completed_total", 28 | "groupby": "device" 29 | }, 30 | "disk.dev.write_bytes": { 31 | "name": "node_disk_written_bytes_total", 32 | "scaling": "0.0009765625", 33 | "groupby": "device" 34 | }, 35 | "infiniband.port.switch.in.bytes": { 36 | "name": "node_infiniband_port_data_received_bytes_total", 37 | "groupby": "port", 38 | "out_fmt": ["{}:{}", "device", "port"] 39 | }, 40 | "infiniband.port.switch.in.packets": { 41 | "name": "node_infiniband_port_packets_received_total", 42 | "groupby": "port", 43 | "out_fmt": ["{}:{}", "device", "port"] 44 | }, 45 | "infiniband.port.switch.out.bytes": { 46 | "name": "node_infiniband_port_data_transmitted_bytes_total", 47 | "groupby": "port", 48 | "out_fmt": ["{}:{}", "device", "port"] 49 | }, 50 | "infiniband.port.switch.out.packets": { 51 | "name": "node_infiniband_port_packets_transmitted_total", 52 | "groupby": "port", 53 | "out_fmt": ["{}:{}", "device", "port"] 54 | }, 55 | "ipmi.dcmi.power": { 56 | "name": "ipmi_dcmi_power_consumption_watts", 57 | "groupby": "host" 58 | }, 59 | "kernel.all.load": { 60 | "name": "node_load1", 61 | "groupby": "host" 62 | }, 63 | "kernel.percpu.cpu.user": { 64 | "name": "node_cpu_seconds_total", 65 | "defaults": {"mode" : "user"}, 66 | 
"scaling": "1000", 67 | "groupby": "cpu", 68 | "out_fmt": ["cpu{}", "cpu"] 69 | }, 70 | "kernel.percpu.cpu.idle": { 71 | "name": "node_cpu_seconds_total", 72 | "defaults": {"mode" : "idle"}, 73 | "scaling": "1000", 74 | "groupby": "cpu", 75 | "out_fmt": ["cpu{}", "cpu"] 76 | }, 77 | "kernel.percpu.cpu.nice": { 78 | "name": "node_cpu_seconds_total", 79 | "defaults": {"mode" : "nice"}, 80 | "scaling": "1000", 81 | "groupby": "cpu", 82 | "out_fmt": ["cpu{}", "cpu"] 83 | }, 84 | "kernel.percpu.cpu.sys": { 85 | "name": "node_cpu_seconds_total", 86 | "defaults": {"mode" : "system"}, 87 | "scaling": "1000", 88 | "groupby": "cpu", 89 | "out_fmt": ["cpu{}", "cpu"] 90 | }, 91 | "kernel.percpu.cpu.wait.total": { 92 | "name": "node_cpu_seconds_total", 93 | "defaults": {"mode" : "iowait"}, 94 | "scaling": "1000", 95 | "groupby": "cpu", 96 | "out_fmt": ["cpu{}", "cpu"] 97 | }, 98 | "kernel.percpu.cpu.irq.hard": { 99 | "name": "node_cpu_seconds_total", 100 | "defaults": {"mode" : "irq"}, 101 | "scaling": "1000", 102 | "groupby": "cpu", 103 | "out_fmt": ["cpu{}", "cpu"] 104 | }, 105 | "kernel.percpu.cpu.irq.soft": { 106 | "name": "node_cpu_seconds_total", 107 | "defaults": {"mode" : "softirq"}, 108 | "scaling": "1000", 109 | "groupby": "cpu", 110 | "out_fmt": ["cpu{}", "cpu"] 111 | }, 112 | "mem.numa.util.filePages": { 113 | "name": "node_memory_numa_FilePages", 114 | "groupby": "node" 115 | }, 116 | "mem.numa.util.slab": { 117 | "name": "node_memory_numa_Slab", 118 | "groupby": "node" 119 | }, 120 | "mem.numa.util.used": { 121 | "name": "node_memory_numa_MemUsed", 122 | "groupby": "node" 123 | }, 124 | "mem.freemem": { 125 | "name": "node_memory_MemFree_bytes", 126 | "scaling": "0.0009765625", 127 | "groupby": "host" 128 | }, 129 | "mem.physmem": { 130 | "name": "node_memory_MemTotal_bytes", 131 | "scaling": "0.0009765625", 132 | "groupby": "host" 133 | }, 134 | "network.interface.in.bytes": { 135 | "name": "node_network_receive_bytes_total", 136 | "groupby": "device" 137 | }, 138 | "network.interface.out.bytes": { 139 | "name": "node_network_transmit_bytes_total", 140 | "groupby": "device" 141 | }, 142 | "nvidia.gpuactive": { 143 | "name": "DCGM_FI_DEV_GPU_UTIL", 144 | "groupby": "gpu", 145 | "out_fmt": ["gpu{}", "gpu"] 146 | }, 147 | "nvidia.memused": { 148 | "name": "DCGM_FI_DEV_FB_USED", 149 | "groupby": "gpu", 150 | "out_fmt": ["gpu{}", "gpu"] 151 | }, 152 | "nvidia.powerused": { 153 | "name": "DCGM_FI_DEV_POWER_USAGE", 154 | "scaling": "1000", 155 | "groupby": "gpu", 156 | "out_fmt": ["gpu{}", "gpu"] 157 | }, 158 | "prom:cgroup_cpu_info": { 159 | "name": "cgroup_cpu_info", 160 | "params": ["cgroup"], 161 | "groupby": "cpus" 162 | }, 163 | "prom:cgroup_process_exec_count": { 164 | "name": "cgroup_process_exec_count", 165 | "params": ["cgroup"], 166 | "groupby": "exec" 167 | } 168 | } 169 | } 170 | -------------------------------------------------------------------------------- /config/templates/hotproc/hotproc.conf: -------------------------------------------------------------------------------- 1 | #pmdahotproc 2 | Version 1.0 3 | 4 | ( (uname != "root") && (uname != "rpc") && (uname != "rpcuser") && (uname != "dbus") && (uname != "avahi") && (uname != "munge") && (uname != "ntp") && (uname != "nagios") && (uname != "postfix") && (uname != "pcp") && (uname != "libstoragemgmt") && (uname != "chrony") && (uname != "polkitd") ) || cpuburn > 0.1 5 | -------------------------------------------------------------------------------- /config/templates/pmlogger/control: 
-------------------------------------------------------------------------------- 1 | # 2 | # PCP archive logging configuration/control 3 | # 4 | # This file is used by various of the PCP archive logging administrative 5 | # tools to perform maintenance on the pmlogger instances running on 6 | # the local host. 7 | # 8 | # This file contains one line per host to be logged, fields are 9 | # Host name of host to be logged 10 | # P(rimary) is this the primary logger? y or n 11 | # S(ocks) should this logger be launched with pmsocks? y or n 12 | # Directory full pathname to directory where archive logs are 13 | # to be maintained ... note all scripts "cd" to here as 14 | # a first step 15 | # Args optional additional arguments to pmlogger and/or pmnewlog 16 | # 17 | 18 | # === VARIABLE ASSIGNMENTS === 19 | # 20 | # DO NOT REMOVE OR EDIT THE FOLLOWING LINE 21 | $version=1.1 22 | 23 | # if pmsocks is being used, edit the IP address for $SOCKS_SERVER 24 | #$SOCKS_SERVER=123.456.789.123 25 | 26 | # for remote loggers running over a WAN with potentially long delays 27 | $PMCD_CONNECT_TIMEOUT=150 28 | $PMCD_REQUEST_TIMEOUT=120 29 | 30 | # === LOGGER CONTROL SPECIFICATIONS === 31 | # 32 | #Host P? S? directory args 33 | 34 | # local primary logger 35 | # 36 | # (LOCALHOSTNAME is expanded to local: in the first column, 37 | # and to `hostname` in the fourth (directory) column.) 38 | # 39 | LOCALHOSTNAME y n "PCP_LOG_DIR/pmlogger/$(date +%Y)/$(date +%m)/LOCALHOSTNAME/$(date +%Y)-$(date +%m)-$(date +%d)" -r -c /etc/pcp/pmlogger/pmlogger-supremm.config 40 | 41 | # Note: if multiple pmloggers for the same host (e.g. both primary and 42 | # non-primary loggers are active), then they MUST use different 43 | # directories 44 | 45 | # local non-primary logger 46 | #LOCALHOSTNAME n n PCP_LOG_DIR/pmlogger/mysummary -r -T24h10m -c config.Summary 47 | 48 | # remote host 49 | #remote n n PCP_LOG_DIR/pmlogger/remote -r -T24h10m -c config.remote 50 | 51 | # thru the firewall via socks 52 | #distant n y PCP_LOG_DIR/pmlogger/distant -r -T24h10m -c config.distant 53 | -------------------------------------------------------------------------------- /config/templates/slurm/slurm-epilog: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use strict; 4 | use Sys::Hostname; 5 | use Date::Simple; 6 | use POSIX qw(strftime); 7 | 8 | my $jobid = $ENV{SLURM_JOB_ID}; 9 | my $today = Date::Simple->new; 10 | 11 | my $logyear = $today->format("%Y"); 12 | my $logmonth = $today->format("%m"); 13 | my $logday = $today->format("%d"); 14 | 15 | my $jobdatelong = strftime "%Y%m%d.%H.%M.%S", localtime; 16 | my $fullhost = hostname(); 17 | 18 | # PCP End of job logging 19 | 20 | my $logdir = "//supremm/pmlogger/$logyear/$logmonth/$fullhost/$logyear-$logmonth-$logday"; 21 | system("env PMLOGGER_EPILOG=yes pmlogger -U pcp -c /etc/pcp/pmlogger/pmlogger-supremm.config -s 1 -l /tmp/job-$jobid-end-$jobdatelong.log $logdir/job-$jobid-end-$jobdatelong &> /dev/null"); 22 | 23 | exit(0); 24 | -------------------------------------------------------------------------------- /config/templates/slurm/slurm-prolog: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use strict; 4 | use Sys::Hostname; 5 | use Date::Simple; 6 | use POSIX qw(strftime setsid); 7 | 8 | my $today = Date::Simple->new; 9 | 10 | my $logyear = $today->format("%Y"); 11 | my $logmonth = $today->format("%m"); 12 | my $logday = $today->format("%d"); 13 | 14 | my 
$jobdatelong = strftime "%Y%m%d.%H.%M.%S", localtime; 15 | my $host = ( split('\.', hostname()) )[0]; 16 | my $jobid = $ENV{SLURM_JOB_ID}; 17 | 18 | # PCP job Start Logging 19 | 20 | my $fullhost = hostname(); 21 | 22 | # This logs every 10 seconds for a total of 30 seconds after the job start time 23 | # Slurm kills child processes of the prolog so we need to start our own process group to keep this running for 30 seconds after prolog is done 24 | 25 | $SIG{CHLD} = 'IGNORE'; # Configure to autoreap zombies 26 | exit(0) unless defined ( my $child = fork ); # fork, or just exit if failed 27 | exit(0) if $child; # Main prolog exits with success 28 | 29 | # Below this is now the child 30 | setsid(); # Become session leader 31 | open( STDIN, "< /dev/null" ); # Detach STDIN from shell 32 | open( STDOUT, "> /dev/null" ); # Detach STDOUT from shell 33 | open( STDERR, ">&STDOUT" ); # Detach STDERR from shell 34 | chdir '/tmp'; # Change working directory 35 | umask(0); # Reset umask 36 | 37 | my $logdir = "//supremm/pmlogger/$logyear/$logmonth/$fullhost/$logyear-$logmonth-$logday"; 38 | 39 | # The prolog config runs every 10 seconds, we exit after 4 samples 40 | system("env PMLOGGER_PROLOG=yes pmlogger -U pcp -c /etc/pcp/pmlogger/pmlogger-supremm.config -s 4 -l /tmp/job-$jobid-begin-$jobdatelong.log $logdir/job-$jobid-begin-$jobdatelong &> /dev/null"); 41 | 42 | exit(0) 43 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | python_files = test*.py 3 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_rpm] 2 | release = 1%%{?dist} 3 | build_requires = python36-devel, pcp-libs-devel >= 5.3, pcp-libs-devel < 5.4 4 | requires = python3, python3-pymongo, python3-numpy, python3-scipy, python3-PyMySQL, python3-pcp >= 5.3, python3-pcp < 5.4, pcp-libs >= 5.3, pcp-libs < 5.4, python3-Cython, python3-pytz, python3-requests 5 | install_script = .rpm_install_script.txt 6 | 7 | [bdist_wheel] 8 | python-tag = py36 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ setup script for SUPReMM job summarization utilities """ 3 | import sys 4 | import os 5 | from setuptools import setup, find_packages, Extension 6 | import numpy 7 | 8 | from Cython.Build import cythonize 9 | 10 | # For rpm-based builds we want the configuration files to 11 | # go in the standard location.
Also need to rewrite the file list so that 12 | # the config filesa are listed as %config(noreplace) 13 | IS_RPM_BUILD = False 14 | if 'bdist_rpm' in sys.argv or 'RPM_BUILD_ROOT' in os.environ: 15 | IS_RPM_BUILD = True 16 | confpath = '/etc/supremm' 17 | with open('.rpm_install_script.txt', 'w') as fp: 18 | fp.write('%s %s install -O1 --root=$RPM_BUILD_ROOT --record=INSTALLED_FILES\n' % (sys.executable, os.path.basename(sys.argv[0]))) 19 | fp.write('sed -i \'s#^\\(%s\\)#%%config(noreplace) \\1#\' INSTALLED_FILES\n' % (confpath, )) 20 | else: 21 | confpath = 'etc/supremm' 22 | 23 | 24 | setup( 25 | name='supremm', 26 | version='2.0.0', 27 | description='SUPReMM Job Summarization Utilities', 28 | long_description='Utilities for generating job-level summary data from host level PCP archives.\nAlso includes template configuration files for running PCP on an HPC system.', 29 | license='LGPLv3', 30 | author='Joseph P White', 31 | author_email='jpwhite4@buffalo.edu', 32 | url='https://github.com/ubccr/supremm', 33 | 34 | zip_safe=False, 35 | package_dir={'': 'src'}, 36 | packages=find_packages(where='src'), 37 | package_data={ 38 | 'supremm': ['assets/modw_supremm.sql', 'assets/mongo_setup.js', '*.pxd', '*.pyx'], 39 | 'supremm.datasource.pcp.pcpcinterface': ['*.pxd', '*.pyx'] 40 | }, 41 | data_files=[ 42 | (confpath, ['config/config.json', 'config/prometheus/mapping.json']), 43 | ('share/supremm/templates/slurm', ['config/templates/slurm/slurm-epilog', 'config/templates/slurm/slurm-prolog']), 44 | ('share/supremm/templates/hotproc', ['config/templates/hotproc/hotproc.conf']), 45 | ('share/supremm/templates/pmlogger', ['config/templates/pmlogger/control', 'config/templates/pmlogger/pmlogger-supremm.config']) 46 | ], 47 | scripts=[ 48 | 'src/supremm/supremm_update' 49 | ], 50 | entry_points={ 51 | 'console_scripts': [ 52 | 'gen-pmlogger-control.py = supremm.gen_pmlogger_control:main', 53 | 'summarize_jobs.py = supremm.summarize_jobs:main', 54 | 'summarize_mpi.py = supremm.summarize_mpi:main', 55 | 'indexarchives.py = supremm.datasource.pcp.indexarchives:runindexing', 56 | 'account.py = supremm.account:runingest', 57 | 'supremmconf.py = supremm.supremmconf:main', 58 | 'supremm-setup = supremm.supremm_setup:main', 59 | 'supremm-upgrade = supremm.supremm_upgrade:main', 60 | 'ingest_jobscripts.py = supremm.ingest_jobscripts:main' 61 | 62 | ] 63 | }, 64 | install_requires=[ 65 | 'numpy', 66 | 'PyMySQL', 67 | 'pcp', 68 | 'Cython', 69 | 'scipy', 70 | 'pymongo', 71 | 'pytz', 72 | 'requests' 73 | ], 74 | ext_modules=cythonize([ 75 | Extension("supremm.datasource.pcp.pcpcinterface.pcpcinterface", ["src/supremm/datasource/pcp/pcpcinterface/pcpcinterface.pyx"], libraries=["pcp"], include_dirs=[numpy.get_include()]) 76 | ]) 77 | ) 78 | 79 | if IS_RPM_BUILD: 80 | os.unlink('.rpm_install_script.txt') 81 | -------------------------------------------------------------------------------- /src/supremm/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | -------------------------------------------------------------------------------- /src/supremm/__init__.py: -------------------------------------------------------------------------------- 1 | """ The SUPReMM module contains software that generates job-level summaries from PCP archives """ 2 | -------------------------------------------------------------------------------- /src/supremm/accounting.py: -------------------------------------------------------------------------------- 1 | """ definition of the accounting 
API and implementations of some base classes that 2 | include common functions """ 3 | 4 | from abc import ABCMeta, abstractmethod 5 | 6 | class Accounting(object, metaclass=ABCMeta): 7 | """ abstract base class describing the job accounting interface """ 8 | 9 | PROCESS_VERSION = 1 10 | 11 | def __init__(self, resource_id, config): 12 | self._resource_id = resource_id 13 | self._config = config 14 | 15 | @abstractmethod 16 | def getbylocaljobid(self, localjobid): 17 | """ Yields one or more Jobs that match the localjobid """ 18 | pass 19 | 20 | @abstractmethod 21 | def getbytimerange(self, start, end, onlynew): 22 | """ Search for all jobs based on the time interval. Matches based on the end 23 | timestamp of the job """ 24 | pass 25 | 26 | @abstractmethod 27 | def get(self, start, end): 28 | """ Yields all unprocessed jobs. Optionally specify a time interval to process""" 29 | pass 30 | 31 | @abstractmethod 32 | def markasdone(self, job, success, elapsedtime): 33 | """ log a job as being processed (either successfully or not) """ 34 | pass 35 | 36 | class ArchiveCache(object, metaclass=ABCMeta): 37 | """ abstract base class describing the job archive cache interface """ 38 | 39 | def __init__(self, config): 40 | self._config = config 41 | 42 | @abstractmethod 43 | def insert(self, resource_id, hostname, filename, start, end, jobid): 44 | """ insert a record into the cache """ 45 | pass 46 | 47 | @abstractmethod 48 | def insert_from_files(self, paths_file, joblevel_file, nodelevel_file): 49 | pass 50 | 51 | @abstractmethod 52 | def postinsert(self): 53 | """ Must be called after insert. """ 54 | pass 55 | -------------------------------------------------------------------------------- /src/supremm/assets/modw_pcp.sql: -------------------------------------------------------------------------------- 1 | -- MySQL dump 10.13 Distrib 5.5.41, for debian-linux-gnu (x86_64) 2 | -- 3 | -- Host: localhost Database: modw_pcp 4 | -- ------------------------------------------------------ 5 | -- Server version 5.5.41-0ubuntu0.12.04.1-log 6 | 7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 10 | /*!40101 SET NAMES utf8 */; 11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; 12 | /*!40103 SET TIME_ZONE='+00:00' */; 13 | /*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */; 14 | /*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */; 15 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */; 16 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 17 | 18 | -- 19 | -- Current Database: `modw_pcp` 20 | -- 21 | 22 | CREATE DATABASE /*!32312 IF NOT EXISTS*/ `modw_pcp` /*!40100 DEFAULT CHARACTER SET utf8 COLLATE utf8_unicode_ci */; 23 | 24 | USE `modw_pcp`; 25 | 26 | -- 27 | -- Table structure for table `archive` 28 | -- 29 | 30 | DROP TABLE IF EXISTS `archive`; 31 | /*!40101 SET @saved_cs_client = @@character_set_client */; 32 | /*!40101 SET character_set_client = utf8 */; 33 | CREATE TABLE `archive` ( 34 | `id` int(11) NOT NULL AUTO_INCREMENT, 35 | `hostid` int(11) NOT NULL, 36 | `filename` varchar(255) COLLATE utf8_unicode_ci NOT NULL, 37 | `start_time_ts` double NOT NULL, 38 | `end_time_ts` double NOT NULL, 39 | `jobid` varchar(45) DEFAULT NULL, 40 | PRIMARY KEY (`id`), 41 | UNIQUE KEY `unique` (`filename`), 42 | KEY `fk_archive_1_idx` (`hostid`), 43 | CONSTRAINT `fk_archive_1` 
FOREIGN KEY (`hostid`) REFERENCES `hosts` (`id`) ON DELETE NO ACTION ON UPDATE NO ACTION 44 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; 45 | /*!40101 SET character_set_client = @saved_cs_client */; 46 | 47 | -- 48 | -- Table structure for table `hosts` 49 | -- 50 | 51 | DROP TABLE IF EXISTS `hosts`; 52 | /*!40101 SET @saved_cs_client = @@character_set_client */; 53 | /*!40101 SET character_set_client = utf8 */; 54 | CREATE TABLE `hosts` ( 55 | `id` int(11) NOT NULL AUTO_INCREMENT, 56 | `hostname` varchar(255) COLLATE utf8_unicode_ci NOT NULL, 57 | PRIMARY KEY (`id`), 58 | UNIQUE KEY `UNIQUE` (`hostname`) 59 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; 60 | /*!40101 SET character_set_client = @saved_cs_client */; 61 | 62 | -- 63 | -- Table structure for table `job` 64 | -- 65 | 66 | DROP TABLE IF EXISTS `job`; 67 | /*!40101 SET @saved_cs_client = @@character_set_client */; 68 | /*!40101 SET character_set_client = utf8 */; 69 | CREATE TABLE `job` ( 70 | `id` int(11) NOT NULL AUTO_INCREMENT, 71 | `resource_id` int(11) NOT NULL, 72 | `local_job_id` int(11) NOT NULL, 73 | `start_time_ts` int(11) NOT NULL, 74 | `end_time_ts` int(11) NOT NULL, 75 | `record` blob, 76 | PRIMARY KEY (`id`), 77 | UNIQUE KEY `UNIQUE` (`resource_id`,`local_job_id`,`end_time_ts`) 78 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; 79 | /*!40101 SET character_set_client = @saved_cs_client */; 80 | 81 | -- 82 | -- Table structure for table `jobhosts` 83 | -- 84 | 85 | DROP TABLE IF EXISTS `jobhosts`; 86 | /*!40101 SET @saved_cs_client = @@character_set_client */; 87 | /*!40101 SET character_set_client = utf8 */; 88 | CREATE TABLE `jobhosts` ( 89 | `jobid` int(11) NOT NULL, 90 | `hostid` int(11) NOT NULL, 91 | UNIQUE KEY `UNIQUE` (`jobid`,`hostid`), 92 | KEY `fk_jobhosts_2_idx` (`hostid`), 93 | CONSTRAINT `fk_jobhosts_1` FOREIGN KEY (`jobid`) REFERENCES `job` (`id`) ON DELETE CASCADE ON UPDATE CASCADE, 94 | CONSTRAINT `fk_jobhosts_2` FOREIGN KEY (`hostid`) REFERENCES `hosts` (`id`) ON DELETE CASCADE ON UPDATE CASCADE 95 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; 96 | /*!40101 SET character_set_client = @saved_cs_client */; 97 | 98 | -- 99 | -- Table structure for table `process` 100 | -- 101 | 102 | DROP TABLE IF EXISTS `process`; 103 | /*!40101 SET @saved_cs_client = @@character_set_client */; 104 | /*!40101 SET character_set_client = utf8 */; 105 | CREATE TABLE `process` ( 106 | `jobid` int(11) NOT NULL, 107 | `process_version` int(11) NOT NULL DEFAULT '0', 108 | `process_timestamp` timestamp NULL DEFAULT NULL, 109 | `process_time` double DEFAULT NULL, 110 | `ingest_version` int(11) NOT NULL, 111 | `ingest_timestamp` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, 112 | PRIMARY KEY (`jobid`), 113 | KEY `proc` (`process_version`), 114 | CONSTRAINT `fk_process_1` FOREIGN KEY (`jobid`) REFERENCES `job` (`id`) ON DELETE CASCADE ON UPDATE CASCADE 115 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; 116 | /*!40101 SET character_set_client = @saved_cs_client */; 117 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; 118 | 119 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; 120 | /*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */; 121 | /*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */; 122 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 123 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 124 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 125 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; 126 | 
127 | -- Dump completed on 2015-05-19 11:00:21 128 | -------------------------------------------------------------------------------- /src/supremm/assets/modw_supremm.sql: -------------------------------------------------------------------------------- 1 | -- MySQL dump 10.13 Distrib 5.5.41, for debian-linux-gnu (x86_64) 2 | -- 3 | -- Host: localhost Database: modw_pcp 4 | -- ------------------------------------------------------ 5 | -- Server version 5.5.41-0ubuntu0.12.04.1-log 6 | 7 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; 8 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; 9 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; 10 | /*!40101 SET NAMES utf8 */; 11 | /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; 12 | /*!40103 SET TIME_ZONE='+00:00' */; 13 | /*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */; 14 | /*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */; 15 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */; 16 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; 17 | 18 | -- 19 | -- Current Database: `modw_pcp` 20 | -- 21 | 22 | CREATE DATABASE /*!32312 IF NOT EXISTS*/ `modw_supremm` /*!40100 DEFAULT CHARACTER SET utf8 COLLATE utf8_unicode_ci */; 23 | 24 | USE `modw_supremm`; 25 | 26 | -- 27 | -- Table structure for table `archive_paths` 28 | -- 29 | 30 | DROP TABLE IF EXISTS `archive_paths`; 31 | /*!40101 SET @saved_cs_client = @@character_set_client */; 32 | /*!40101 SET character_set_client = utf8 */; 33 | CREATE TABLE `archive_paths` ( 34 | `id` int(11) NOT NULL AUTO_INCREMENT, 35 | `filename` varchar(255) COLLATE utf8_unicode_ci NOT NULL, 36 | PRIMARY KEY (`id`), 37 | UNIQUE KEY `filename` (`filename`) 38 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; 39 | /*!40101 SET character_set_client = @saved_cs_client */; 40 | 41 | -- 42 | -- Table structure for table `archives_joblevel` 43 | -- 44 | 45 | DROP TABLE IF EXISTS `archives_joblevel`; 46 | /*!40101 SET @saved_cs_client = @@character_set_client */; 47 | /*!40101 SET character_set_client = utf8 */; 48 | CREATE TABLE `archives_joblevel` ( 49 | `archive_id` int(11) NOT NULL, 50 | `host_id` int(11) NOT NULL, 51 | `local_jobid` int(11) NOT NULL DEFAULT '-1', 52 | `local_job_array_index` int(11) NOT NULL DEFAULT '-1', 53 | `local_job_id_raw` int(11) NOT NULL, 54 | `start_time_ts` int(11) NOT NULL, 55 | `end_time_ts` int(11) NOT NULL, 56 | PRIMARY KEY (`archive_id`), 57 | KEY `hostjobs` (`host_id`,`local_job_id_raw`) 58 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; 59 | /*!40101 SET character_set_client = @saved_cs_client */; 60 | 61 | -- 62 | -- Table structure for table `archives_nodelevel` 63 | -- 64 | 65 | DROP TABLE IF EXISTS `archives_nodelevel`; 66 | /*!40101 SET @saved_cs_client = @@character_set_client */; 67 | /*!40101 SET character_set_client = utf8 */; 68 | CREATE TABLE `archives_nodelevel` ( 69 | `archive_id` int(11) NOT NULL, 70 | `host_id` int(11) NOT NULL, 71 | `start_time_ts` int(11) NOT NULL, 72 | `end_time_ts` int(11) NOT NULL, 73 | PRIMARY KEY (`archive_id`), 74 | KEY `hosttimes` (`host_id`,`start_time_ts`,`end_time_ts`) 75 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; 76 | /*!40101 SET character_set_client = @saved_cs_client */; 77 | 78 | -- 79 | -- Table structure for table `process` 80 | -- 81 | 82 | DROP TABLE IF EXISTS `process`; 83 | /*!40101 SET @saved_cs_client = @@character_set_client */; 84 | /*!40101 SET 
character_set_client = utf8 */; 85 | CREATE TABLE `process` ( 86 | `jobid` int(11) NOT NULL, 87 | `process_version` int(11) NOT NULL DEFAULT '0', 88 | `process_timestamp` timestamp NULL DEFAULT NULL, 89 | `process_time` double DEFAULT '0', 90 | PRIMARY KEY (`jobid`), 91 | KEY `proc` (`process_version`) 92 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; 93 | /*!40101 SET character_set_client = @saved_cs_client */; 94 | /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; 95 | 96 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; 97 | /*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */; 98 | /*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */; 99 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; 100 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; 101 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; 102 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; 103 | 104 | -- Dump completed on 2015-05-19 11:00:21 105 | -------------------------------------------------------------------------------- /src/supremm/config.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Configuration data management """ 3 | import os 4 | import json 5 | import configparser 6 | import re 7 | import glob 8 | import pkg_resources 9 | import logging 10 | 11 | 12 | def iscomment(line): 13 | """ check is line is a c++ style comment """ 14 | if re.search(r"^\s*//", line): 15 | return True 16 | return False 17 | 18 | class Config(object): 19 | """ Configuration data management 20 | The configuration file format is similar to json except lines that begin "//" 21 | are treated as comments and are ignored. Also the string \n[:space:]// is not permitted 22 | anywhere in json key or value. 
23 | """ 24 | 25 | def __init__(self, confpath=None): 26 | 27 | if confpath == None: 28 | confpath = self.autodetectconfpath() 29 | 30 | if confpath is None or os.path.isdir(confpath) == False: 31 | raise Exception("Missing configuration path %s" % confpath) 32 | 33 | conffile = os.path.join(confpath, "config.json") 34 | logging.debug("Using config file %s", conffile) 35 | with open(conffile, "r") as conffp: 36 | confdata = "" 37 | for line in conffp: 38 | if not iscomment(line): 39 | confdata += line 40 | try: 41 | self._config = json.loads(confdata) 42 | except ValueError as exc: 43 | raise Exception("Syntax error in %s.\n%s" % (conffile, str(exc))) 44 | 45 | self._xdmodconfig = None 46 | 47 | def __str__(self): 48 | return json.dumps(self._config, indent=4) 49 | 50 | @staticmethod 51 | def autodetectconfpath(filename="config.json"): 52 | """ search known paths for the configuration directory 53 | List of paths support the three typical install locations 54 | 1) Environment variable SUPREMM_CONFIG_DIR 55 | 2) source install with pip 56 | 3) rpm based install 57 | 4) source install with python setup.py install 58 | @returns Directory name or None if no suitable directory found 59 | """ 60 | searchpaths = [ 61 | os.getenv('SUPREMM_CONFIG_DIR', os.path.dirname(os.path.abspath(__file__)) + "/../../../../etc/supremm"), 62 | "/etc/supremm", 63 | pkg_resources.resource_filename(pkg_resources.Requirement.parse("supremm"), "etc/supremm") 64 | ] 65 | 66 | for path in searchpaths: 67 | if os.path.exists(os.path.join(path, filename)): 68 | return os.path.abspath(path) 69 | 70 | return None 71 | 72 | def getsection(self, sectionname): 73 | """ return the dict for a given section """ 74 | 75 | if "include" in self._config[sectionname]: 76 | self._config[sectionname] = self.process_include(sectionname, self._config[sectionname]['include']) 77 | 78 | return self._config[sectionname] 79 | 80 | def parsexdmod(self): 81 | """ locate and parse the XDMoD portal settings file """ 82 | self._xdmodconfig = configparser.RawConfigParser() 83 | 84 | xdmodconfs = glob.glob(os.path.join(self._config['xdmodroot'], "portal_settings.d/*.ini")) 85 | xdmodconfs.sort() 86 | xdmodconfs.insert(0, os.path.join(self._config['xdmodroot'], "portal_settings.ini")) 87 | xdmodconfs.reverse() 88 | 89 | nread = self._xdmodconfig.read(xdmodconfs) 90 | if len(nread) == 0: 91 | raise Exception("Unable to read XDMoD configuration file. 
Locations scanned: %s", xdmodconfs) 92 | 93 | @staticmethod 94 | def strtonative(value): 95 | v = value.strip("\"") 96 | try: 97 | return int(v) 98 | except ValueError: 99 | return v 100 | 101 | def process_include(self, sectionname, url): 102 | """ process an include directive (only xdmod parsing is supported) """ 103 | if url.startswith("xdmod://"): 104 | if self._xdmodconfig == None: 105 | self.parsexdmod() 106 | 107 | xdmodsection = url[8:] 108 | if not self._xdmodconfig.has_section(xdmodsection): 109 | raise Exception("Unable to locate include data for %s", url) 110 | 111 | result = {} 112 | for k, v in self._xdmodconfig.items(xdmodsection): 113 | result[k] = self.strtonative(v) 114 | 115 | return result 116 | else: 117 | raise Exception("Unsupported include url %s in section %s", url, sectionname) 118 | 119 | def resourceconfigs(self): 120 | """ Iterator over enabled resources """ 121 | for resname, resdata in self._config['resources'].items(): 122 | if "enabled" in resdata and resdata['enabled'] == False: 123 | continue 124 | resdata['name'] = resname 125 | yield (resname, resdata) 126 | 127 | def test(): 128 | """ test """ 129 | conf = Config() 130 | print(conf.getsection("datawarehouse")) 131 | # for r, d in c.resourceconfigs(): 132 | # print r, d 133 | 134 | if __name__ == "__main__": 135 | test() 136 | -------------------------------------------------------------------------------- /src/supremm/datasource/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubccr/supremm/732b8c696e4a8926b7d29317c9ec95f16d6f87e3/src/supremm/datasource/__init__.py -------------------------------------------------------------------------------- /src/supremm/datasource/datasource.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from abc import ABC, abstractmethod 3 | 4 | from supremm.errors import ProcessingError 5 | from supremm.proc_common import instantiatePlugins 6 | 7 | class Datasource(ABC): 8 | """ Definition of the Datasource API """ 9 | 10 | def __init__(self, preprocs, plugins): 11 | self._allpreprocs = preprocs 12 | self._allplugins = plugins 13 | 14 | @property 15 | def allpreprocs(self): 16 | return self._allpreprocs 17 | 18 | @allpreprocs.setter 19 | def allpreprocs(self, preprocs): 20 | self._allpreprocs = preprocs 21 | 22 | @property 23 | def allplugins(self): 24 | return self._allplugins 25 | 26 | @allplugins.setter 27 | def allplugins(self, plugins): 28 | self._allplugins = plugins 29 | 30 | @abstractmethod 31 | def presummarize(self, job, config, resconf, opts): 32 | 33 | jobmeta = JobMeta() 34 | 35 | # Filter jobs by options 36 | if job.nodecount > 1 and opts['min_parallel_duration'] != None and job.walltime < opts['min_parallel_duration']: 37 | jobmeta.result = 1 38 | jobmeta.mdata["skipped_parallel_too_short"] = True 39 | jobmeta.error = ProcessingError.PARALLEL_TOO_SHORT 40 | # Was "skipped" 41 | jobmeta.missingnodes = job.nodecount 42 | logging.info("Skipping %s, skipped_parallel_too_short", job.job_id) 43 | elif opts['min_duration'] != None and job.walltime < opts['min_duration']: 44 | jobmeta.result = 1 45 | jobmeta.mdata["skipped_too_short"] = True 46 | jobmeta.error = ProcessingError.TIME_TOO_SHORT 47 | jobmeta.missingnodes = job.nodecount 48 | logging.info("Skipping %s, skipped_too_short", job.job_id) 49 | elif job.nodecount < 1: 50 | jobmeta.result = 1 51 | jobmeta.mdata["skipped_invalid_nodecount"] = True 52 | jobmeta.error = 
ProcessingError.INVALID_NODECOUNT 53 | jobmeta.missingnodes = job.nodecount 54 | logging.info("Skipping %s, skipped_invalid_nodecount", job.job_id) 55 | elif opts['max_nodes'] > 0 and job.nodecount > opts['max_nodes']: 56 | jobmeta.result = 1 57 | jobmeta.mdata["skipped_job_too_big"] = True 58 | jobmeta.error = ProcessingError.JOB_TOO_BIG 59 | jobmeta.missingnodes = job.nodecount 60 | logging.info("Skipping %s, skipped_job_too_big", job.job_id) 61 | elif opts['max_nodetime'] != None and (job.nodecount * job.walltime) > opts['max_nodetime']: 62 | jobmeta.result = 1 63 | jobmeta.mdata["skipped_job_nodehours"] = True 64 | jobmeta.error = ProcessingError.JOB_TOO_MANY_NODEHOURS 65 | jobmeta.missingnodes = job.nodecount 66 | logging.info("Skipping %s, skipped_job_too_big (node time)", job.job_id) 67 | elif opts['max_duration'] > 0 and job.walltime >= opts['max_duration']: 68 | jobmeta.result = 1 69 | jobmeta.mdata["skipped_too_long"] = True 70 | jobmeta.error = ProcessingError.TIME_TOO_LONG 71 | jobmeta.missingnodes = job.nodecount 72 | logging.info("Skipping %s, skipped_too_long", job.job_id) 73 | 74 | return jobmeta 75 | 76 | @abstractmethod 77 | def summarizejob(self, job, jobmeta, config, opts): 78 | # All datasources instantiate plugins/preprocs 79 | preprocessors = instantiatePlugins(self.allpreprocs, job) 80 | analytics = instantiatePlugins(self.allplugins, job) 81 | return preprocessors, analytics 82 | 83 | @abstractmethod 84 | def cleanup(self, job, opts): 85 | pass 86 | 87 | 88 | class JobMeta(): 89 | """ Container class for a job's metadata """ 90 | 91 | def __init__(self): 92 | self.mdata = {} 93 | self.result = 0 94 | self.error = None 95 | self.missingnodes = 0 96 | 97 | @property 98 | def mdata(self): 99 | return self._mdata 100 | 101 | @mdata.setter 102 | def mdata(self, md): 103 | self._mdata = md 104 | 105 | @property 106 | def result(self): 107 | return self._result 108 | 109 | @result.setter 110 | def result(self, r): 111 | self._result = r 112 | 113 | @property 114 | def error(self): 115 | return self._error 116 | 117 | @error.setter 118 | def error(self, e): 119 | self._error = e 120 | 121 | @property 122 | def missingnodes(self): 123 | return self._missingnodes 124 | 125 | @missingnodes.setter 126 | def missingnodes(self, mn): 127 | self._missingnodes = mn 128 | -------------------------------------------------------------------------------- /src/supremm/datasource/factory.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from supremm.datasource.pcp.pcpdatasource import PCPDatasource 4 | from supremm.datasource.prometheus.promdatasource import PromDatasource 5 | 6 | 7 | class DatasourceFactory(): 8 | """ Datasource class helper """ 9 | 10 | def __init__(self, preprocs, plugins, resconf): 11 | 12 | if resconf["datasource"] == "pcp": 13 | self._datasource = PCPDatasource(preprocs, plugins) 14 | elif resconf["datasource"] == "prometheus": 15 | self._datasource = PromDatasource(preprocs, plugins, resconf) 16 | else: 17 | logging.error("Invalid datasource in configuration: %s", resconf["datasource"]) 18 | 19 | def presummarize(self, job, config, resconf, opts): 20 | return self._datasource.presummarize(job, config, resconf, opts) 21 | 22 | def summarizejob(self, job, jobmeta, config, opts): 23 | return self._datasource.summarizejob(job, jobmeta, config, opts) 24 | 25 | def cleanup(self, job, opts): 26 | return self._datasource.cleanup(job, opts) 27 | 
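
A note on the datasource factory above: DatasourceFactory is the seam between the summarization driver and the two concrete datasources. The following sketch is hypothetical driver code, not a file in the repository; the preprocs and plugins lists would normally come from the plugin loader in proc_common, and job, config, resconf and opts stand in for the Job, Config, resource-configuration and command-line option objects built elsewhere in the pipeline. Only the factory calls themselves mirror the API defined above:

# Hypothetical driver sketch: only the DatasourceFactory calls mirror the API above;
# job, config, resconf, preprocs, plugins and opts are placeholders for objects
# normally produced by the accounting layer, Config and the plugin loader.
from supremm.datasource.factory import DatasourceFactory

def process_one_job(job, config, resconf, preprocs, plugins, opts):
    # resconf["datasource"] selects PCPDatasource ("pcp") or PromDatasource ("prometheus")
    datasource = DatasourceFactory(preprocs, plugins, resconf)

    # Generic and datasource-specific pre-checks; returns a JobMeta recording any skip reason
    jobmeta = datasource.presummarize(job, config, resconf, opts)
    if jobmeta is None:
        return None   # e.g. extract-only mode, or no Prometheus connection

    # Run the preprocessors and plugins; returns the summary object, its metadata,
    # a success flag and (possibly) a ProcessingError code
    summary, mdata, success, error = datasource.summarizejob(job, jobmeta, config, opts)

    # Release any per-job scratch data (extracted PCP archives); a no-op for Prometheus
    datasource.cleanup(job, opts)

    return summary, mdata, success, error
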
-------------------------------------------------------------------------------- /src/supremm/datasource/pcp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubccr/supremm/732b8c696e4a8926b7d29317c9ec95f16d6f87e3/src/supremm/datasource/pcp/__init__.py -------------------------------------------------------------------------------- /src/supremm/datasource/pcp/pcpcinterface/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubccr/supremm/732b8c696e4a8926b7d29317c9ec95f16d6f87e3/src/supremm/datasource/pcp/pcpcinterface/__init__.py -------------------------------------------------------------------------------- /src/supremm/datasource/pcp/pcpcinterface/c_pcp.pxd: -------------------------------------------------------------------------------- 1 | from pcp import pmapi # Python bindings 2 | 3 | cdef extern from "sys/time.h": 4 | ctypedef struct timeval: 5 | pass 6 | 7 | cdef extern from "pcp/pmapi.h": 8 | # Errors 9 | int PM_ERR_GENERIC = "PM_ERR_GENERIC" 10 | int PM_ERR_PMID = "PM_ERR_PMID" 11 | int PM_ERR_INDOM = "PM_ERR_INDOM" 12 | int PM_ERR_INST = "PM_ERR_INST" 13 | int PM_ERR_PMID_LOG = "PM_ERR_PMID_LOG" 14 | int PM_ERR_INDOM_LOG = "PM_ERR_INDOM_LOG" 15 | int PM_ERR_INST_LOG = "PM_ERR_INST_LOG" 16 | int PM_ERR_NAME = "PM_ERR_NAME" 17 | int PM_ERR_SIGN = "PM_ERR_SIGN" 18 | enum: PM_ERR_VALUE 19 | 20 | # pmDesc.type -- data type of metric values 21 | int PM_TYPE_NOSUPPORT = "PM_TYPE_NOSUPPORT" 22 | int PM_TYPE_32 = "PM_TYPE_32" 23 | int PM_TYPE_U32 = "PM_TYPE_U32" 24 | int PM_TYPE_64 = "PM_TYPE_64" 25 | int PM_TYPE_U64 = "PM_TYPE_U64" 26 | int PM_TYPE_FLOAT = "PM_TYPE_FLOAT" 27 | int PM_TYPE_DOUBLE = "PM_TYPE_DOUBLE" 28 | int PM_TYPE_STRING = "PM_TYPE_STRING" 29 | int PM_TYPE_AGGREGATE = "PM_TYPE_AGGREGATE" 30 | int PM_TYPE_AGGREGATE_STATIC = "PM_TYPE_AGGREGATE_STATIC" 31 | int PM_TYPE_EVENT = "PM_TYPE_EVENT" 32 | int PM_TYPE_HIGHRES_EVENT = "PM_TYPE_HIGHRES_EVENT" 33 | int PM_TYPE_UNKNOWN = "PM_TYPE_UNKNOWN" 34 | 35 | ctypedef struct pmUnits: 36 | pass 37 | ctypedef unsigned int pmID 38 | ctypedef unsigned int pmInDom 39 | ctypedef struct pmValueBlock: 40 | pass 41 | ctypedef union myvalue: 42 | pmValueBlock* pval 43 | int lval 44 | ctypedef struct pmValue: # Can't declare anonymous union 45 | int inst 46 | myvalue value 47 | ctypedef struct pmDesc: 48 | pmID pmid 49 | int type 50 | pmInDom indom 51 | int sem 52 | pmUnits units 53 | ctypedef struct pmValueSet: 54 | pmID pmid 55 | int numval 56 | int valfmt 57 | pmValue vlist[1] 58 | ctypedef struct pmResult: 59 | timeval timestamp 60 | int numpmid 61 | pmValueSet *vset[1] 62 | ctypedef union pmAtomValue: 63 | # TODO use types instead of simple long etc. 
64 | char* cp 65 | long l 66 | unsigned long ul 67 | long long ll 68 | unsigned long long ull 69 | float f 70 | double d 71 | 72 | pmInDom PM_INDOM_NULL 73 | 74 | int pmLookupName(int, char **, pmID *) 75 | int pmLookupDesc(pmID, pmDesc *) 76 | int pmLookupInDom(pmInDom, const char *) 77 | int pmLookupInDomArchive(pmInDom, const char *) 78 | int pmNameInDom(pmInDom, int, char **) 79 | int pmNameInDomArchive(pmInDom, int, char **) 80 | int pmUseContext(int) 81 | int pmGetInDom(pmInDom, int **, char ***) 82 | int pmGetInDomArchive(pmInDom, int **, char ***) 83 | int pmExtractValue(int, const pmValue *, int, pmAtomValue *, int) 84 | char *pmErrStr(int) 85 | -------------------------------------------------------------------------------- /src/supremm/datasource/pcp/pcpdatasource.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import time 4 | import logging 5 | import datetime 6 | 7 | from supremm.datasource.datasource import Datasource 8 | from supremm.datasource.pcp.pcparchive import extract_and_merge_logs 9 | from supremm.datasource.pcp.pcpsummarize import PCPSummarize 10 | from supremm.errors import ProcessingError 11 | 12 | class PCPDatasource(Datasource): 13 | """ Instance of a PCP datasource class """ 14 | 15 | def __init__(self, preprocs, plugins): 16 | super().__init__(preprocs, plugins) 17 | 18 | def presummarize(self, job, conf, resconf, opts): 19 | jobmeta = super().presummarize(job, conf, resconf, opts) 20 | 21 | # Error with general presummarize, don't try datasource specific checks 22 | if jobmeta.result != 0 and jobmeta.error != None: 23 | return jobmeta 24 | else: 25 | mergestart = time.time() 26 | if not job.has_any_archives(): 27 | jobmeta.result = 1 28 | jobmeta.mdata["skipped_noarchives"] = True 29 | jobmeta.error = ProcessingError.NO_ARCHIVES 30 | jobmeta.missingnodes = job.nodecount 31 | logging.info("Skipping %s, skipped_noarchives", job.job_id) 32 | elif not job.has_enough_raw_archives(): 33 | jobmeta.result = 1 34 | jobmeta.mdata["skipped_rawarchives"] = True 35 | jobmeta.error = ProcessingError.RAW_ARCHIVES 36 | jobmeta.missingnodes = job.nodecount 37 | logging.info("Skipping %s, skipped_rawarchives", job.job_id) 38 | else: 39 | jobmeta.result = extract_and_merge_logs(job, conf, resconf, opts) 40 | jobmeta.missingnodes = -1.0 * jobmeta.result 41 | 42 | mergeend = time.time() 43 | jobmeta.mdata["mergetime"] = mergeend - mergestart 44 | 45 | if opts['extractonly']: 46 | if jobmeta.result == 0: 47 | return None 48 | else: 49 | logging.error("Failure extracting logs for job %s", job.job_id) 50 | return None 51 | 52 | return jobmeta 53 | 54 | def summarizejob(self, job, jobmeta, conf, opts): 55 | preprocessors, analytics = super().summarizejob(job, jobmeta, conf, opts) 56 | 57 | s = PCPSummarize(preprocessors, analytics, job, conf, opts["fail_fast"]) 58 | 59 | enough_nodes = False 60 | 61 | if 0 == jobmeta.result or (job.nodecount !=0 and (jobmeta.missingnodes / job.nodecount < 0.05)): 62 | enough_nodes = True 63 | logging.info("Success for %s files in %s (%s/%s)", job.job_id, job.jobdir, jobmeta.missingnodes, job.nodecount) 64 | s.process() 65 | elif jobmeta.error == None and job.nodecount != 0 and (jobmeta.missingnodes / job.nodecount >= 0.5): 66 | # Don't overwrite existing error 67 | # Don't have enough node data to even try summarization 68 | jobmeta.mdata["skipped_pmlogextract_error"] = True 69 | logging.info("Skipping %s, skipped_pmlogextract_error", job.job_id) 70 | jobmeta.error = 
ProcessingError.PMLOGEXTRACT_ERROR 71 | 72 | if opts['tag'] != None: 73 | jobmeta.mdata['tag'] = opts['tag'] 74 | 75 | if jobmeta.missingnodes > 0: 76 | jobmeta.mdata['missingnodes'] = jobmeta.missingnodes 77 | 78 | success = s.good_enough() 79 | 80 | if not success and enough_nodes: 81 | # We get here if the pmlogextract step gave us enough nodes but summarization didn't succeed for enough nodes 82 | # All other "known" errors should already be handled above. 83 | jobmeta.mdata["skipped_summarization_error"] = True 84 | logging.info("Skipping %s, skipped_summarization_error", job.job_id) 85 | jobmeta.error = ProcessingError.SUMMARIZATION_ERROR 86 | 87 | force_success = False 88 | if not success: 89 | force_timeout = opts['force_timeout'] 90 | if (datetime.datetime.now() - job.end_datetime) > datetime.timedelta(seconds=force_timeout): 91 | force_success = True 92 | 93 | return s, jobmeta.mdata, success or force_success, jobmeta.error 94 | 95 | def cleanup(self, opts, job): 96 | if opts['dodelete'] and job.jobdir is not None and os.path.exists(job.jobdir): 97 | # Clean up 98 | shutil.rmtree(job.jobdir, ignore_errors=True) 99 | -------------------------------------------------------------------------------- /src/supremm/datasource/prometheus/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubccr/supremm/732b8c696e4a8926b7d29317c9ec95f16d6f87e3/src/supremm/datasource/prometheus/__init__.py -------------------------------------------------------------------------------- /src/supremm/datasource/prometheus/promdatasource.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | import re 4 | 5 | from supremm.datasource.datasource import Datasource 6 | from supremm.datasource.prometheus.prommapping import MappingManager 7 | from supremm.datasource.prometheus.prominterface import PromClient 8 | from supremm.datasource.prometheus.promsummarize import PromSummarize 9 | from supremm.errors import ProcessingError 10 | 11 | 12 | class PromDatasource(Datasource): 13 | """ Instance of a Prometheus datasource class """ 14 | 15 | def __init__(self, preprocs, plugins, resconf): 16 | super().__init__(preprocs, plugins) 17 | 18 | self._client = PromClient(resconf) 19 | self._mapping = MappingManager(self.client) 20 | 21 | @property 22 | def client(self): 23 | return self._client 24 | 25 | @client.setter 26 | def client(self, c): 27 | self._client = c 28 | 29 | @property 30 | def mapping(self): 31 | return self._mapping 32 | 33 | @mapping.setter 34 | def mapping(self, m): 35 | self._mapping = m 36 | 37 | def presummarize(self, job, conf, resconf, opts): 38 | jobmeta = super().presummarize(job, conf, resconf, opts) 39 | 40 | # Initialize client and test connection 41 | if not self.client and not self.mapping: 42 | self.client = PromClient(resconf) 43 | if not self.client.connection: 44 | jobmeta.result = 1 45 | jobmeta.mdata["skipped_no_prom_connection"] = True 46 | jobmeta.error = ProcessingError.PROMETHEUS_CONNECTION 47 | logging.info("Skipping %s, skipped_no_prom_connection", job.job_id) 48 | jobmeta.missingnodes = job.nodecount 49 | return 50 | self.mapping = MappingManager(self.client) 51 | 52 | return jobmeta 53 | 54 | def summarizejob(self, job, jobmeta, config, opts): 55 | # Instantiate preproc, plugins 56 | preprocessors, analytics = super().summarizejob(job, jobmeta, config, opts) 57 | 58 | s = PromSummarize(preprocessors, analytics, job, config, 
self.mapping, opts["fail_fast"]) 59 | 60 | enough_nodes = False 61 | 62 | # missingnodes will always == nodecount if there is a Prometheus error 63 | if 0 == jobmeta.result or (job.nodecount !=0 and (jobmeta.missingnodes / job.nodecount < 0.05)): 64 | enough_nodes = True 65 | logging.info("Success for prometheus presummarize checks, job %s (%s/%s)", job.job_id, jobmeta.missingnodes, job.nodecount) 66 | s.process() 67 | elif jobmeta.error == None and job.nodecount != 0 and (jobmeta.missingnodes / job.nodecount >= 0.5): 68 | # Don't overwrite existing error 69 | # Don't have enough node data to even try summarization 70 | jobmeta.mdata["skipped_prom_error"] = True 71 | logging.info("Skipping %s, skipped_prom_error", job.job_id) 72 | jobmeta.error = ProcessingError.PROMETHEUS_CONNECTION 73 | 74 | if opts['tag'] != None: 75 | jobmeta.mdata['tag'] = opts['tag'] 76 | 77 | if jobmeta.missingnodes > 0: 78 | jobmeta.mdata['missingnodes'] = jobmeta.missingnodes 79 | 80 | success = s.good_enough() 81 | 82 | if not success and enough_nodes: 83 | # All other "known" errors should already be handled above. 84 | jobmeta.mdata["skipped_summarization_error"] = True 85 | logging.info("Skipping %s, skipped_summarization_error", job.job_id) 86 | jobmeta.error = ProcessingError.SUMMARIZATION_ERROR 87 | 88 | force_success = False 89 | if not success: 90 | force_timeout = opts['force_timeout'] 91 | if (datetime.datetime.now() - job.end_datetime) > datetime.timedelta(seconds=force_timeout): 92 | force_success = True 93 | 94 | return s, jobmeta.mdata, success or force_success, jobmeta.error 95 | 96 | def cleanup(self, opts, job): 97 | # Nothing to be done for Prometheus 98 | pass 99 | 100 | def parse_scrape_interval(interval): 101 | # function to parse scrape interval string 102 | # "30s" -> 30, "1m" -> 60, "1m30s" -> 90, etc 103 | times = re.split('(\d+[smhd])', interval) 104 | 105 | scrape_interval = 0 106 | for time in times: 107 | t = re.findall('\d+|\D+', time) 108 | try: 109 | result = int(t[0]) 110 | except ValueError: 111 | logging.error("Could not parse configured scrape interval: (%s)", interval) 112 | return None 113 | modifier = t[-1] 114 | if modifier == 's': 115 | scrape_interval += result 116 | elif modifier == 'm': 117 | scrape_interval += (result * 60) 118 | elif modifier == 'h': 119 | scrape_interval += (result * (60 * 60)) 120 | 121 | return scrape_interval 122 | -------------------------------------------------------------------------------- /src/supremm/errors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ processing error class is defined so that common errors can be assigned short codes """ 3 | 4 | class ProcessingError(object): 5 | """ Container class for processing errors """ 6 | RAW_COUNTER_UNAVAILABLE = 1 7 | JOB_TOO_SHORT = 2 8 | INSUFFICIENT_DATA = 3 9 | INSUFFICIENT_HOSTDATA = 4 10 | CPUSET_UNKNOWN = 5 11 | PMDA_RESTARTED_DURING_JOB = 6 12 | INDOMS_CHANGED_DURING_JOB = 7 13 | PMLOGEXTRACT_ERROR = 8 14 | PARALLEL_TOO_SHORT = 9 15 | INVALID_NODECOUNT = 10 16 | JOB_TOO_BIG = 11 17 | TIME_TOO_SHORT = 12 18 | TIME_TOO_LONG = 13 19 | UNKNOWN_CANNOT_PROCESS = 14 20 | NO_ARCHIVES = 15 21 | SUMMARIZATION_ERROR = 16 22 | RAW_ARCHIVES = 17 23 | JOB_TOO_MANY_NODEHOURS = 18 24 | MAX_ERROR = 19 25 | PROMETHEUS_CONNECTION = 20 26 | 27 | def __init__(self, err_id): 28 | self._id = err_id 29 | 30 | def __str__(self): 31 | names = { 32 | ProcessingError.RAW_COUNTER_UNAVAILABLE: "Required raw metrics not available.", 33 | 
ProcessingError.JOB_TOO_SHORT: "The job was too short.", 34 | ProcessingError.INSUFFICIENT_DATA: "There were too few datapoints.", 35 | ProcessingError.INSUFFICIENT_HOSTDATA: "Not all of the hosts had raw metrics available", 36 | ProcessingError.CPUSET_UNKNOWN: "The cpuset that was assigned to the job is unavailable", 37 | ProcessingError.PMDA_RESTARTED_DURING_JOB: "The PMDA restarted during the job", 38 | ProcessingError.INDOMS_CHANGED_DURING_JOB: "The instance domains for required metrics changed during the job", 39 | ProcessingError.PMLOGEXTRACT_ERROR: "Generic failure in the pmlogextract step", 40 | ProcessingError.PARALLEL_TOO_SHORT: "Parallel job ran for too short of a time", 41 | ProcessingError.INVALID_NODECOUNT: "Fewer than 1 node reported for this job", 42 | ProcessingError.JOB_TOO_BIG: "Processing skipped due to large node count in job", 43 | ProcessingError.TIME_TOO_SHORT: "Job ran for too short of a time to provide enough performance data", 44 | ProcessingError.TIME_TOO_LONG: "Job consumed an impossible amount of walltime", 45 | ProcessingError.UNKNOWN_CANNOT_PROCESS: "Job cannot be summarized for unknown reason", 46 | ProcessingError.NO_ARCHIVES: "None of the nodes in the job have pcp archives", 47 | ProcessingError.SUMMARIZATION_ERROR: "There were enough archives to try summarization, but too few archives were successfully processed", 48 | ProcessingError.RAW_ARCHIVES: "Not enough raw archives to try pmlogextract", 49 | ProcessingError.JOB_TOO_MANY_NODEHOURS: "Total job node hours exceeded threshold", 50 | ProcessingError.PROMETHEUS_CONNECTION: "An error occurred with the Prometheus server during summarization" 51 | } 52 | return names[self._id] 53 | 54 | @staticmethod 55 | def doc(): 56 | """ Returns a dict containing the documentation for all supported errors """ 57 | docs = {} 58 | for i in range(1, ProcessingError.MAX_ERROR): 59 | docs[i] = str(ProcessingError(i)) 60 | 61 | return docs 62 | 63 | def get(self): 64 | """ get """ 65 | return self._id 66 | 67 | class NotApplicableError(Exception): 68 | """ Used by plugins to indicate that their analysis is not avaiable for 69 | the HPC job. For example, if a plugin implements a resource-manager-specific 70 | analysis and the job was not run on the supported resource manager. """ 71 | pass 72 | 73 | if __name__ == "__main__": 74 | print(ProcessingError.doc()) 75 | -------------------------------------------------------------------------------- /src/supremm/gen_pmlogger_control.py: -------------------------------------------------------------------------------- 1 | """ 2 | Script to generate remote host portion of pmlogger control file. 3 | 4 | Usage: cat [hostlist] | python gen-pmlogger-control.py 5 | 6 | Author: Andrew E. 
Bruno 7 | """ 8 | import fileinput 9 | 10 | pcp_archive_dir = '/data/pcp-logs' 11 | pmlogger_config = 'pmlogger-config.ubccr' 12 | 13 | def main(): 14 | for host in fileinput.input(): 15 | host = host.rstrip() 16 | print("%s n n %s/%s -c ./%s" % (host,pcp_archive_dir,host,pmlogger_config)) 17 | 18 | if __name__ == '__main__': 19 | main() -------------------------------------------------------------------------------- /src/supremm/lariat.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Utilities for processing Lariat data """ 3 | import datetime 4 | import os 5 | import json 6 | import logging 7 | 8 | class LariatManager(object): 9 | """ find and cache the lariat data for a job """ 10 | def __init__(self, lariatpath): 11 | self.lariatpath = lariatpath 12 | self.lariatdata = dict() 13 | self.filesprocessed = [] 14 | self.errors = dict() 15 | 16 | def find(self, jobid, jobstarttime, jobendtime): 17 | """ returns a dict containing the lariat data for a job """ 18 | 19 | if jobid in self.lariatdata: 20 | print("Lariat cache size is ", len(self.lariatdata)) 21 | return self.lariatdata.pop(jobid) 22 | 23 | for days in (0, -1, 1): 24 | searchday = datetime.datetime.utcfromtimestamp(jobendtime) + datetime.timedelta(days) 25 | lfilename = os.path.join(self.lariatpath, searchday.strftime('%Y'), searchday.strftime('%m'), searchday.strftime('lariatData-sgeT-%Y-%m-%d.json')) 26 | self.loadlariat(lfilename) 27 | if jobid in self.lariatdata: 28 | return self.lariatdata[jobid] 29 | 30 | for days in (0, -1, 1): 31 | searchday = datetime.datetime.utcfromtimestamp(jobstarttime) + datetime.timedelta(days) 32 | lfilename = os.path.join(self.lariatpath, searchday.strftime('%Y'), searchday.strftime('%m'), searchday.strftime('lariatData-sgeT-%Y-%m-%d.json')) 33 | self.loadlariat(lfilename) 34 | 35 | if jobid in self.lariatdata: 36 | return self.lariatdata[jobid] 37 | 38 | return None 39 | 40 | @staticmethod 41 | def removeDotKey(obj): 42 | """ replace . with - in the keys for the json object """ 43 | for key in list(obj.keys()): 44 | new_key = key.replace(".", "-") 45 | if new_key != key: 46 | obj[new_key] = obj[key] 47 | del obj[key] 48 | return obj 49 | 50 | def loadlariat(self, filename): 51 | """ load and store the contents of lariat output file "filename" """ 52 | 53 | if filename in self.filesprocessed: 54 | # No need to reparse file. If the job data was in the file, then this search 55 | # function would not have been called. 56 | return 57 | 58 | try: 59 | with open(filename, "rb") as fp: 60 | 61 | # Unfortunately, the lariat data is not in valid json 62 | # This workaround converts the illegal \' into valid quotes 63 | content = fp.read().replace("\\'", "'") 64 | lariatJson = json.loads(content, object_hook=LariatManager.removeDotKey) 65 | 66 | for k, v in lariatJson.items(): 67 | if k not in self.lariatdata: 68 | self.lariatdata[k] = v[0] 69 | else: 70 | # Have already got a record for this job. Keep the record 71 | # that has longer recorded runtime since this is probably 72 | # the endofjob record. 73 | if 'runtime' in v[0] and 'runtime' in self.lariatdata[k] and self.lariatdata[k]['runtime'] < v[0]['runtime']: 74 | self.lariatdata[k] = v[0] 75 | 76 | self.filesprocessed.append(filename) 77 | 78 | except Exception as e: 79 | logging.error("Error processing lariat file %s. 
Error was %s.", filename, str(e)) 80 | 81 | -------------------------------------------------------------------------------- /src/supremm/linuxhelpers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Helper functions that can process data that is generated on 3 | resources that use a Linux kernel or Linux based OS.""" 4 | 5 | 6 | def parsecpusallowed(cpusallowed): 7 | """ cpusallowed parser converts the human-readable cpuset string to 8 | a list of cpu indexes 9 | """ 10 | 11 | cpulist = set() 12 | items = cpusallowed.split(",") 13 | for item in items: 14 | try: 15 | cpulist.add(int(item)) 16 | except ValueError as e: 17 | try: 18 | cpurange = [int(x) for x in item.split("-")] 19 | if len(cpurange) != 2: 20 | raise ValueError("Unable to parse cpusallowed \"" + cpusallowed + "\"") 21 | cpulist |= set(range(cpurange[0], cpurange[1] + 1)) 22 | except ValueError as e: 23 | raise ValueError("Unable to parse cpusallowed \"" + cpusallowed + "\"") 24 | 25 | return cpulist 26 | 27 | 28 | if __name__ == "__main__": 29 | print(parsecpusallowed("0-7")) 30 | print(parsecpusallowed("1")) 31 | print(parsecpusallowed("1,2")) 32 | print(parsecpusallowed("1,2,4-6,15")) 33 | print(parsecpusallowed("1,6-7")) 34 | print(parsecpusallowed("6-7,9")) 35 | -------------------------------------------------------------------------------- /src/supremm/migrations/1.0-1.1/modw_supremm.sql: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env mysql 2 | 3 | use modw_supremm; 4 | 5 | CREATE TABLE `archive_paths` ( 6 | `id` int(11) NOT NULL AUTO_INCREMENT, 7 | `filename` varchar(255) COLLATE utf8_unicode_ci NOT NULL, 8 | PRIMARY KEY (`id`), 9 | UNIQUE KEY (`filename`) 10 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; 11 | 12 | CREATE TABLE `archives_nodelevel` ( 13 | `archive_id` int(11) NOT NULL, 14 | `host_id` int(11) NOT NULL, 15 | `start_time_ts` int(11) NOT NULL, 16 | `end_time_ts` int(11) NOT NULL, 17 | PRIMARY KEY (`archive_id`), 18 | KEY `hosttimes` (`host_id` ASC, `start_time_ts` ASC, `end_time_ts` ASC) 19 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; 20 | 21 | CREATE TABLE `archives_joblevel` ( 22 | `archive_id` int(11) NOT NULL, 23 | `host_id` int(11) NOT NULL, 24 | `local_jobid` int(11) NOT NULL DEFAULT '-1', 25 | `local_job_array_index` int(11) NOT NULL DEFAULT '-1', 26 | `local_job_id_raw` int(11) NOT NULL, 27 | `start_time_ts` int(11) NOT NULL, 28 | `end_time_ts` int(11) NOT NULL, 29 | PRIMARY KEY (`archive_id`), 30 | KEY `hostjobs` (`host_id` ASC, `local_job_id_raw` ASC) 31 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci; 32 | 33 | 34 | INSERT INTO `archive_paths` SELECT id, filename FROM archive; 35 | 36 | INSERT INTO `archives_nodelevel` SELECT id, hostid, FLOOR(start_time_ts), CEILING(end_time_ts) FROM `archive` WHERE jobid IS NULL; 37 | 38 | INSERT INTO `archives_joblevel` 39 | SELECT 40 | id, hostid, - 1, - 1, CAST(`jobid` AS SIGNED), FLOOR(start_time_ts), CEILING(end_time_ts) 41 | FROM 42 | `archive` 43 | WHERE 44 | jobid IS NOT NULL AND jobid RLIKE '^[0-9]+$'; 45 | 46 | INSERT INTO `archives_joblevel` 47 | SELECT 48 | id, hostid, SUBSTRING_INDEX(jobid, '[', 1), TRIM( TRAILING ']' FROM SUBSTRING_INDEX(jobid, '[', -1)), -1, FLOOR(start_time_ts), CEILING(end_time_ts) 49 | FROM 50 | `archive` 51 | WHERE 52 | jobid IS NOT NULL AND jobid RLIKE '^[0-9]+\[[0-9]+\]$'; 53 | 54 | INSERT INTO `archives_joblevel` 55 | SELECT 56 | id, hostid, 
SUBSTRING_INDEX(jobid, '_', 1), SUBSTRING_INDEX(jobid, '_', -1), -1, FLOOR(start_time_ts), CEILING(end_time_ts) 57 | FROM 58 | `archive` 59 | WHERE 60 | jobid IS NOT NULL AND jobid RLIKE '^[0-9]+_[0-9]+$'; 61 | -------------------------------------------------------------------------------- /src/supremm/plugins/ArmPowerUsageTimeseries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Timeseries generator module """ 3 | 4 | from collections import Counter 5 | import numpy 6 | 7 | from supremm.plugin import Plugin 8 | from supremm.subsample import TimeseriesAccumulator 9 | from supremm.errors import ProcessingError 10 | 11 | class ArmPowerUsageTimeseries(Plugin): 12 | """ Generate the Power usage as a timeseries data """ 13 | 14 | name = property(lambda x: "corepower") 15 | mode = property(lambda x: "timeseries") 16 | requiredMetrics = property(lambda x: ["perfevent.hwcounters.arm_a64fx__EA_CORE.value", "perfevent.hwcounters.arm_a64fx__EA_L2.value", 17 | "perfevent.hwcounters.arm_a64fx__EA_MEMORY.value"]) 18 | optionalMetrics = property(lambda x: []) 19 | derivedMetrics = property(lambda x: []) 20 | 21 | def __init__(self, job): 22 | super(ArmPowerUsageTimeseries, self).__init__(job) 23 | self._data = TimeseriesAccumulator(job.nodecount, self._job.walltime) 24 | self._error = None 25 | self._hostdata = {} 26 | 27 | @staticmethod 28 | def computetimepoint(data): 29 | """ Get the power usage from the data """ 30 | if data[0][0] < numpy.finfo(numpy.float64).eps: 31 | return None 32 | 33 | return data[0][0] 34 | 35 | def process(self, nodemeta, timestamp, data, description): 36 | 37 | if self._job.getdata('perf')['active'] != True: 38 | self._error = ProcessingError.RAW_COUNTER_UNAVAILABLE 39 | return False 40 | 41 | if len(data[0]) == 0: 42 | # Skip data point with no data 43 | return True 44 | 45 | if nodemeta.nodeindex not in self._hostdata: 46 | self._hostdata[nodemeta.nodeindex] = numpy.empty((TimeseriesAccumulator.MAX_DATAPOINTS, 3)) 47 | 48 | cpucount = numpy.sum(data[0]) 49 | l2count = data[1][0] + data[1][12] + data[1][24] + data[1][36] 50 | memcount = data[2][0] + data[2][12] + data[2][24] + data[2][36] 51 | 52 | energy = (8.04 * cpucount) + (32.8 * l2count) + (271.0 * memcount) 53 | 54 | insertat = self._data.adddata(nodemeta.nodeindex, timestamp, energy) 55 | 56 | if insertat is not None: 57 | self._hostdata[nodemeta.nodeindex][insertat] = numpy.array([cpucount, l2count, memcount]) 58 | 59 | return True 60 | 61 | def results(self): 62 | 63 | if self._error: 64 | return {"error": self._error} 65 | 66 | if len(self._hostdata) != self._job.nodecount: 67 | return {"error": ProcessingError.INSUFFICIENT_HOSTDATA} 68 | 69 | values = self._data.get() 70 | 71 | if len(values[0, :, 0]) < 2: 72 | return {"error": ProcessingError.JOB_TOO_SHORT} 73 | 74 | rates = numpy.diff(values[:, :, 1]) / numpy.diff(values[:, :, 0]) / 1.0e9 75 | 76 | if len(self._hostdata) > 64: 77 | 78 | # Compute min, max & median data and only save the host data 79 | # for these hosts 80 | 81 | sortarr = numpy.argsort(rates.T, axis=1) 82 | 83 | retdata = { 84 | "min": self.collatedata(sortarr[:, 0], rates), 85 | "max": self.collatedata(sortarr[:, -1], rates), 86 | "med": self.collatedata(sortarr[:, sortarr.shape[1] // 2], rates), 87 | "times": values[0, 1:, 0].tolist(), 88 | "hosts": {} 89 | } 90 | 91 | uniqhosts = Counter(sortarr[:, 0]) 92 | uniqhosts.update(sortarr[:, -1]) 93 | uniqhosts.update(sortarr[:, sortarr.shape[1] // 2]) 94 | includelist = 
uniqhosts.keys() 95 | else: 96 | # Save data for all hosts 97 | retdata = { 98 | "times": values[0, 1:, 0].tolist(), 99 | "hosts": {} 100 | } 101 | includelist = self._hostdata.keys() 102 | 103 | scaling = { 104 | '0': 8.04e-9, 105 | '1': 32.8e-9, 106 | '2': 271.e-9 107 | } 108 | 109 | for hostidx in includelist: 110 | retdata['hosts'][str(hostidx)] = {} 111 | retdata['hosts'][str(hostidx)]['all'] = rates[hostidx, :].tolist() 112 | retdata['hosts'][str(hostidx)]['dev'] = {} 113 | for devid in ['0', '1', '2']: 114 | dpnts = len(values[hostidx, :, 0]) 115 | retdata['hosts'][str(hostidx)]['dev'][devid] = (scaling[devid] * numpy.diff(self._hostdata[hostidx][:dpnts, numpy.int(devid)]) / numpy.diff(values[hostidx, :, 0])).tolist() 116 | 117 | retdata['hosts'][str(hostidx)]['names'] = {'0': 'cpu', '1': 'l2', '2': 'mem'} 118 | 119 | return retdata 120 | 121 | @staticmethod 122 | def collatedata(args, rates): 123 | """ build output data """ 124 | result = [] 125 | for timepoint, hostidx in enumerate(args): 126 | try: 127 | result.append([rates[hostidx, timepoint], int(hostidx)]) 128 | except IndexError: 129 | pass 130 | 131 | return result 132 | -------------------------------------------------------------------------------- /src/supremm/plugins/Block.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from supremm.plugin import DeviceBasedPlugin 4 | 5 | class Block(DeviceBasedPlugin): 6 | """ This plugin processes lots of metric that are all interested in the difference over the process """ 7 | 8 | name = property(lambda x: "block") 9 | requiredMetrics = property(lambda x: [ 10 | "disk.dev.read", 11 | "disk.dev.read_bytes", 12 | "disk.dev.write", 13 | "disk.dev.write_bytes" 14 | ]) 15 | optionalMetrics = property(lambda x: []) 16 | derivedMetrics = property(lambda x: []) 17 | 18 | 19 | -------------------------------------------------------------------------------- /src/supremm/plugins/BlockTimeseries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Timeseries generator module """ 3 | 4 | from supremm.plugin import RateConvertingTimeseriesPlugin 5 | import numpy 6 | 7 | class BlockTimeseries(RateConvertingTimeseriesPlugin): 8 | """ Generate timeseries summary for block device usage data """ 9 | 10 | name = property(lambda x: "block") 11 | requiredMetrics = property(lambda x: ["disk.dev.read_bytes", 12 | "disk.dev.write_bytes"]) 13 | optionalMetrics = property(lambda x: []) 14 | derivedMetrics = property(lambda x: []) 15 | 16 | def __init__(self, job): 17 | super(BlockTimeseries, self).__init__(job) 18 | 19 | def computetimepoint(self, data): 20 | return numpy.sum(numpy.array(data)) / 1048576.0 21 | -------------------------------------------------------------------------------- /src/supremm/plugins/Catastrophe.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from supremm.plugin import Plugin 4 | from supremm.errors import ProcessingError 5 | import numpy 6 | 7 | class Catastrophe(Plugin): 8 | """ Catastrophe analytic. Algorithm originally developed by Bill Barth et al. 
for the 9 | tacc_stats project """ 10 | 11 | name = property(lambda x: "catastrophe") 12 | mode = property(lambda x: "all") 13 | requiredMetrics = property(lambda x: [["perfevent.hwcounters.MEM_LOAD_RETIRED_L1D_HIT.value"], 14 | ["perfevent.hwcounters.L1D_REPLACEMENT.value"], 15 | ["perfevent.hwcounters.L1D_REPL.value"], 16 | ["perfevent.hwcounters.DATA_CACHE_MISSES_DC_MISS_STREAMING_STORE.value"]]) 17 | optionalMetrics = property(lambda x: []) 18 | derivedMetrics = property(lambda x: []) 19 | 20 | def __init__(self, job): 21 | super(Catastrophe, self).__init__(job) 22 | self._data = {} 23 | self._error = None 24 | 25 | def process(self, nodemeta, timestamp, data, description): 26 | 27 | if self._job.getdata('perf')['active'] != True: 28 | self._error = ProcessingError.RAW_COUNTER_UNAVAILABLE 29 | return False 30 | 31 | if len(data[0]) == 0: 32 | # Ignore datapoints where no data stored 33 | return True 34 | 35 | if nodemeta.nodename not in self._data: 36 | self._data[nodemeta.nodename] = {"x": [], "t": []} 37 | 38 | info = self._data[nodemeta.nodename] 39 | info['x'].append(1.0 * numpy.sum(data[0])) 40 | info['t'].append(timestamp) 41 | 42 | if len(info['x']) > 1: 43 | if numpy.any(info['x'][-1] - info['x'][-2] < 0.0): 44 | self._error = ProcessingError.PMDA_RESTARTED_DURING_JOB 45 | return False 46 | 47 | return True 48 | 49 | def results(self): 50 | 51 | if self._error: 52 | return {"error": self._error} 53 | 54 | if len(self._data) == 0: 55 | return {"error": ProcessingError.RAW_COUNTER_UNAVAILABLE} 56 | 57 | vals = None 58 | 59 | for _, data in self._data.items(): 60 | 61 | if data['x'][-1] - data['x'][0] == 0.0: 62 | return {"error": ProcessingError.RAW_COUNTER_UNAVAILABLE} 63 | 64 | start = 2 65 | end = len(data['x'])-2 66 | 67 | for i in range(start+1, end-1): 68 | 69 | a = (data['x'][i] - data['x'][start]) / (data['t'][i] - data['t'][start]) 70 | b = (data['x'][end] - data['x'][i]) / (data['t'][end] - data['t'][i]) 71 | vals = b/a if vals == None else min(vals, b/a) 72 | 73 | if vals == None: 74 | return {"error": ProcessingError.JOB_TOO_SHORT} 75 | 76 | return {"value": vals} 77 | -------------------------------------------------------------------------------- /src/supremm/plugins/CgroupMemTimeseries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Timeseries generator module """ 3 | 4 | from supremm.plugin import Plugin 5 | from supremm.subsample import TimeseriesAccumulator 6 | from supremm.errors import ProcessingError, NotApplicableError 7 | import numpy 8 | from collections import Counter 9 | import re 10 | 11 | class CgroupMemTimeseries(Plugin): 12 | """ Generate timeseries summary for memory usage viewed from CGroup 13 | This code is SLURM-specific because of the SLURM cgroup naming convention. 
14 | """ 15 | 16 | name = property(lambda x: "process_mem_usage") 17 | mode = property(lambda x: "timeseries") 18 | requiredMetrics = property(lambda x: ["cgroup.memory.usage"]) 19 | optionalMetrics = property(lambda x: []) 20 | derivedMetrics = property(lambda x: []) 21 | 22 | def __init__(self, job): 23 | super(CgroupMemTimeseries, self).__init__(job) 24 | self._data = TimeseriesAccumulator(job.nodecount, self._job.walltime) 25 | self._hostdata = {} 26 | self._hostcounts = {} 27 | if job.acct['resource_manager'] == 'pbs': 28 | self._expectedcgroup = "/torque/{0}".format(job.job_id) 29 | elif job.acct['resource_manager'] == 'slurm': 30 | self._expectedcgroup = "/slurm/uid_{0}/job_{1}".format(job.acct['uid'], job.job_id) 31 | else: 32 | raise NotApplicableError 33 | 34 | def process(self, nodemeta, timestamp, data, description): 35 | 36 | hostidx = nodemeta.nodeindex 37 | 38 | if len(data[0]) == 0: 39 | # Skip data point with no data 40 | return True 41 | 42 | if nodemeta.nodeindex not in self._hostdata: 43 | self._hostdata[hostidx] = numpy.empty((TimeseriesAccumulator.MAX_DATAPOINTS, 1)) 44 | self._hostcounts[hostidx] = {'missing': 0, 'present': 0} 45 | 46 | try: 47 | dataidx = None 48 | for idx, desc in enumerate(description[0][1]): 49 | if re.match(r"^" + re.escape(self._expectedcgroup) + r"($|\.)", desc): 50 | dataidx = idx 51 | break 52 | # No cgroup info at this datapoint 53 | if dataidx is None: 54 | return True 55 | nodemem_gb = data[0][dataidx] / 1073741824.0 56 | self._hostcounts[hostidx]['present'] += 1 57 | except ValueError: 58 | self._hostcounts[hostidx]['missing'] += 1 59 | # No cgroup info at this datapoint 60 | return True 61 | 62 | insertat = self._data.adddata(hostidx, timestamp, nodemem_gb) 63 | if insertat != None: 64 | self._hostdata[hostidx][insertat] = nodemem_gb 65 | 66 | return True 67 | 68 | def results(self): 69 | 70 | if len(self._hostdata) != self._job.nodecount: 71 | return {'error': ProcessingError.RAW_COUNTER_UNAVAILABLE} 72 | 73 | for hcount in self._hostcounts.values(): 74 | if hcount['missing'] > hcount['present']: 75 | return {'error': ProcessingError.CPUSET_UNKNOWN} 76 | 77 | values = self._data.get() 78 | 79 | if len(self._hostdata) > 64: 80 | 81 | # Compute min, max & median data and only save the host data 82 | # for these hosts 83 | 84 | memdata = values[:, :, 1] 85 | sortarr = numpy.argsort(memdata.T, axis=1) 86 | 87 | retdata = { 88 | "min": self.collatedata(sortarr[:, 0], memdata), 89 | "max": self.collatedata(sortarr[:, -1], memdata), 90 | "med": self.collatedata(sortarr[:, sortarr.shape[1] // 2], memdata), 91 | "times": values[0, :, 0].tolist(), 92 | "hosts": {} 93 | } 94 | 95 | uniqhosts = Counter(sortarr[:, 0]) 96 | uniqhosts.update(sortarr[:, -1]) 97 | uniqhosts.update(sortarr[:, sortarr.shape[1] // 2]) 98 | includelist = list(uniqhosts.keys()) 99 | else: 100 | # Save data for all hosts 101 | retdata = { 102 | "times": values[0, :, 0].tolist(), 103 | "hosts": {} 104 | } 105 | includelist = list(self._hostdata.keys()) 106 | 107 | 108 | for hostidx in includelist: 109 | retdata['hosts'][str(hostidx)] = {} 110 | retdata['hosts'][str(hostidx)]['all'] = values[hostidx, :, 1].tolist() 111 | 112 | return retdata 113 | 114 | @staticmethod 115 | def collatedata(args, rates): 116 | """ build output data """ 117 | result = [] 118 | for timepoint, hostidx in enumerate(args): 119 | try: 120 | result.append([rates[hostidx, timepoint], int(hostidx)]) 121 | except IndexError: 122 | pass 123 | 124 | return result 125 | 
-------------------------------------------------------------------------------- /src/supremm/plugins/CgroupMemory.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Memory usage plugin """ 3 | 4 | import re 5 | from supremm.plugin import Plugin 6 | from supremm.statistics import RollingStats, calculate_stats 7 | from supremm.errors import ProcessingError, NotApplicableError 8 | 9 | class CgroupMemory(Plugin): 10 | """ Cgroup memory statistics for the job """ 11 | 12 | name = property(lambda x: "process_memory") 13 | mode = property(lambda x: "all") 14 | requiredMetrics = property(lambda x: ["cgroup.memory.usage", "cgroup.memory.limit"]) 15 | 16 | optionalMetrics = property(lambda x: []) 17 | derivedMetrics = property(lambda x: []) 18 | 19 | def __init__(self, job): 20 | super(CgroupMemory, self).__init__(job) 21 | self._data = {} 22 | self._hostcounts = {} 23 | if job.acct['resource_manager'] == 'pbs': 24 | self._expectedcgroup = "/torque/{0}".format(job.job_id) 25 | elif job.acct['resource_manager'] == 'slurm': 26 | self._expectedcgroup = "/slurm/uid_{0}/job_{1}".format(job.acct['uid'], job.job_id) 27 | else: 28 | raise NotApplicableError 29 | 30 | def process(self, nodemeta, timestamp, data, description): 31 | """ CGroup Memory statistics are the aritmetic mean of all values except the 32 | first. Rather than storing all of the meory measurements for 33 | the job, we use the RollingStats() class to keep track of the mean 34 | values. 35 | """ 36 | 37 | if len(data[0]) == 0: 38 | return True 39 | 40 | if nodemeta.nodeindex not in self._data: 41 | self._data[nodemeta.nodeindex] = [RollingStats() for i in range(len(self.requiredMetrics) + 1)] 42 | self._hostcounts[nodemeta.nodeindex] = {"present": 0, "missing": 0} 43 | # First data point for the node is ignored 44 | return True 45 | 46 | try: 47 | dataidx = None 48 | for idx, desc in enumerate(description[0][1]): 49 | if re.match(r"^" + re.escape(self._expectedcgroup) + r"($|\.)", desc): 50 | dataidx = idx 51 | break 52 | # No cgroup info at this datapoint 53 | if dataidx is None: 54 | return True 55 | for i in range(len(self.requiredMetrics)): 56 | if len(data[i]) < dataidx: 57 | # Skip timesteps with incomplete information 58 | raise ValueError 59 | 60 | self._hostcounts[nodemeta.nodeindex]["present"] += 1 61 | except ValueError: 62 | self._hostcounts[nodemeta.nodeindex]["missing"] += 1 63 | # No cgroup info at this datapoint 64 | return True 65 | 66 | hdata = self._data[nodemeta.nodeindex] 67 | 68 | for i in range(len(self.requiredMetrics)): 69 | hdata[i].append(data[i][dataidx]) 70 | 71 | if data[1][dataidx] > 0.0: 72 | hdata[2].append(1.0 * data[0][dataidx] / data[1][dataidx]) 73 | else: 74 | hdata[2].append(0.0) 75 | 76 | return True 77 | 78 | def results(self): 79 | 80 | if len(self._data) != self._job.nodecount: 81 | return {"error": ProcessingError.INSUFFICIENT_HOSTDATA} 82 | 83 | for hoststat in self._hostcounts.values(): 84 | if hoststat['missing'] > hoststat['present']: 85 | return {"error": ProcessingError.CPUSET_UNKNOWN} 86 | 87 | stats = {"usage": {"avg": [], "max": []}, "limit": [], "usageratio": {"avg": [], "max": []}} 88 | 89 | datapoints = 0 90 | 91 | for memdata in self._data.values(): 92 | if memdata[0].count() > 0: 93 | datapoints += 1 94 | stats["usage"]["avg"].append(memdata[0].mean()) 95 | stats["usage"]["max"].append(memdata[0].max) 96 | stats["limit"].append(memdata[1].max) 97 | stats["usageratio"]["avg"].append(memdata[2].mean()) 98 | 
stats["usageratio"]["max"].append(memdata[2].max) 99 | 100 | if datapoints == 0: 101 | return {"error": ProcessingError.INSUFFICIENT_DATA} 102 | 103 | result = {"usage": {}, "usageratio": {}} 104 | result["usage"]["avg"] = calculate_stats(stats["usage"]["avg"]) 105 | result["usage"]["max"] = calculate_stats(stats["usage"]["max"]) 106 | result["limit"] = calculate_stats(stats["limit"]) 107 | result["usageratio"]["avg"] = calculate_stats(stats["usageratio"]["avg"]) 108 | result["usageratio"]["max"] = calculate_stats(stats["usageratio"]["max"]) 109 | 110 | return result 111 | -------------------------------------------------------------------------------- /src/supremm/plugins/CpuCategories.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ CPU categorization plugin """ 3 | 4 | from collections import OrderedDict 5 | import numpy as np 6 | 7 | from supremm.plugin import Plugin 8 | from supremm.errors import ProcessingError 9 | 10 | class CpuCategories(Plugin): 11 | """ Categorize a job based on its CPU utilization """ 12 | 13 | name = property(lambda x: "cpucategories") 14 | mode = property(lambda x: "all") 15 | requiredMetrics = property(lambda x: [[ 16 | "kernel.percpu.cpu.user", 17 | "kernel.percpu.cpu.nice", 18 | "kernel.percpu.cpu.sys", 19 | "kernel.percpu.cpu.idle", 20 | "kernel.percpu.cpu.wait.total", 21 | "kernel.percpu.cpu.intr", 22 | "kernel.percpu.cpu.irq.soft", 23 | "kernel.percpu.cpu.irq.hard" 24 | ]]) 25 | optionalMetrics = property(lambda x: []) 26 | derivedMetrics = property(lambda x: []) 27 | 28 | GOOD_THRESHOLD = 0.5 29 | PINNED_THRESHOLD = 0.9 30 | LOW_THRESHOLD = 0.1 31 | DELTA_THRESHOLD = 0.5 32 | MIN_DELTAS = 5 33 | MAX_DIFFERENCE = 0.1 34 | MIN_HIGH_SIZE = 1 35 | MIN_HIGH_VALUE = 0.5 36 | 37 | def __init__(self, job): 38 | super(CpuCategories, self).__init__(job) 39 | self._timeabove = {} 40 | self._timebelow = {} 41 | self._deltas = {} 42 | self._last = {} 43 | self._maxcores = {} 44 | 45 | def process(self, nodemeta, timestamp, data, description): 46 | length = len(data[0]) 47 | node = nodemeta.nodename 48 | proc = self._job.getdata('proc') 49 | 50 | # Initialize dicts to handle multiple nodes and cores 51 | if node not in self._last: 52 | self._timeabove[node] = {} 53 | self._timebelow[node] = {} 54 | self._deltas[node] = {} 55 | self._maxcores[node] = 0 56 | 57 | if proc is None or 'cpusallowed' not in proc or node not in proc['cpusallowed'] or 'error' in proc['cpusallowed'][node]: 58 | for i in range(length): 59 | self._timeabove[node][i] = 0 60 | self._timebelow[node][i] = 0 61 | self._deltas[node][i] = [] 62 | else: 63 | for i in proc['cpusallowed'][node]: 64 | self._timeabove[node][i] = 0 65 | self._timebelow[node][i] = 0 66 | self._deltas[node][i] = [] 67 | timeabove = [x for x in self._timeabove[node].keys()] 68 | self._last[node] = np.array(data)[:, timeabove] 69 | return True 70 | 71 | timeabove = [x for x in self._timeabove[node].keys()] 72 | nodedata = np.array(data)[:, timeabove] 73 | difference = nodedata - self._last[node] 74 | total = np.sum(difference, 0) 75 | self._last[node] = nodedata 76 | 77 | currentdeltas = difference[0] / total 78 | 79 | if length != 0: 80 | counter = 0 81 | for i in self._timeabove[node]: 82 | self._deltas[node][i].append(currentdeltas[counter]) 83 | if currentdeltas[counter] > self.DELTA_THRESHOLD: 84 | self._timeabove[node][i] += total[counter] 85 | else: 86 | self._timebelow[node][i] += total[counter] 87 | counter += 1 88 | 89 | totalusage = 
np.sum(currentdeltas) 90 | if not np.isnan(totalusage) and int(round(totalusage)) > self._maxcores[node]: 91 | self._maxcores[node] = int(round(totalusage)) 92 | return True 93 | 94 | def results(self): 95 | duty_cycles = OrderedDict() 96 | for node in self._timeabove: 97 | if len(list(self._deltas[node].values())[0]) < self.MIN_DELTAS: 98 | return {"error": ProcessingError.INSUFFICIENT_DATA} 99 | 100 | duty_cycles[node] = OrderedDict() 101 | for i in self._timeabove[node]: 102 | total_time = self._timeabove[node][i] + self._timebelow[node][i] 103 | ratio = self._timeabove[node][i] / total_time 104 | duty_cycles[node]["cpu{}".format(i)] = ratio 105 | 106 | # Categorize the job's performance 107 | duty_list = np.array([value for node in duty_cycles.values() for value in node.values()]) 108 | 109 | if not any(value < self.GOOD_THRESHOLD for value in duty_list): 110 | category = "GOOD" 111 | elif not any(value >= self.LOW_THRESHOLD for value in duty_list): 112 | category = "LOW" 113 | else: 114 | high = np.sort(duty_list[duty_list >= self.LOW_THRESHOLD]) 115 | if high.size > self.MIN_HIGH_SIZE: 116 | if high[-1] - high[0] < self.MAX_DIFFERENCE: 117 | category = "PINNED" 118 | else: 119 | category = "UNPINNED" 120 | else: 121 | if high[0] >= self.MIN_HIGH_VALUE: 122 | category = "PINNED" 123 | else: 124 | category = "UNPINNED" 125 | 126 | return {"dutycycles": duty_cycles, "category": category, "maxcores": sum(self._maxcores.values())} 127 | -------------------------------------------------------------------------------- /src/supremm/plugins/CpuUserTimeseries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Timeseries generator module """ 3 | 4 | from supremm.plugin import Plugin 5 | from supremm.subsample import TimeseriesAccumulator 6 | from supremm.errors import ProcessingError 7 | import numpy 8 | from collections import Counter 9 | 10 | class CpuUserTimeseries(Plugin): 11 | """ Generate the CPU usage as a timeseries data """ 12 | 13 | name = property(lambda x: "cpuuser") 14 | mode = property(lambda x: "timeseries") 15 | requiredMetrics = property(lambda x: ["kernel.percpu.cpu.user"]) 16 | optionalMetrics = property(lambda x: []) 17 | derivedMetrics = property(lambda x: []) 18 | 19 | def __init__(self, job): 20 | super(CpuUserTimeseries, self).__init__(job) 21 | self._data = TimeseriesAccumulator(job.nodecount, self._job.walltime) 22 | self._hostdata = {} 23 | self._hostdevnames = {} 24 | self._cpusallowed = None 25 | 26 | def initcpus(self): 27 | if self._job.getdata('proc'): 28 | self._cpusallowed = self._job.getdata('proc')['cpusallowed'] 29 | else: 30 | self._cpusallowed = {} 31 | 32 | def process(self, nodemeta, timestamp, data, description): 33 | 34 | if self._cpusallowed == None: 35 | self.initcpus() 36 | 37 | if len(data[0]) == 0: 38 | # Skip datapoints that have no values 39 | return True 40 | 41 | if nodemeta.nodename in self._cpusallowed and 'error' not in self._cpusallowed[nodemeta.nodename]: 42 | cpudata = data[0][self._cpusallowed[nodemeta.nodename]] 43 | else: 44 | cpudata = data[0] 45 | 46 | hostidx = nodemeta.nodeindex 47 | 48 | if nodemeta.nodeindex not in self._hostdata: 49 | self._hostdata[hostidx] = numpy.empty((TimeseriesAccumulator.MAX_DATAPOINTS, len(cpudata))) 50 | if nodemeta.nodename in self._cpusallowed and 'error' not in self._cpusallowed[nodemeta.nodename]: 51 | self._hostdevnames[hostidx] = {} 52 | for i, cpuidx in enumerate(self._cpusallowed[nodemeta.nodename]): 53 | 
self._hostdevnames[hostidx][str(i)] = description[0][1][cpuidx] 54 | else: 55 | self._hostdevnames[hostidx] = dict((str(k), v) for k, v in zip(description[0][0], description[0][1])) 56 | 57 | insertat = self._data.adddata(hostidx, timestamp, numpy.mean(cpudata)/10.0) 58 | if insertat != None: 59 | self._hostdata[hostidx][insertat] = cpudata / 10.0 60 | 61 | return True 62 | 63 | def results(self): 64 | 65 | values = self._data.get() 66 | 67 | if len(values[0, :, 0]) < 3: 68 | return {"error": ProcessingError.JOB_TOO_SHORT} 69 | 70 | rates = numpy.diff(values[:, :, 1]) / numpy.diff(values[:, :, 0]) 71 | 72 | if len(self._hostdata) > 64: 73 | 74 | # Compute min, max & median data and only save the host data 75 | # for these hosts 76 | 77 | sortarr = numpy.argsort(rates.T, axis=1) 78 | 79 | retdata = { 80 | "min": self.collatedata(sortarr[:, 0], rates), 81 | "max": self.collatedata(sortarr[:, -1], rates), 82 | "med": self.collatedata(sortarr[:, sortarr.shape[1] // 2], rates), 83 | "times": values[0, 1:, 0].tolist(), 84 | "hosts": {} 85 | } 86 | 87 | uniqhosts = Counter(sortarr[:, 0]) 88 | uniqhosts.update(sortarr[:, -1]) 89 | uniqhosts.update(sortarr[:, sortarr.shape[1] // 2]) 90 | includelist = list(uniqhosts.keys()) 91 | else: 92 | # Save data for all hosts 93 | retdata = { 94 | "times": values[0, 1:, 0].tolist(), 95 | "hosts": {} 96 | } 97 | includelist = list(self._hostdata.keys()) 98 | 99 | 100 | for hostidx in includelist: 101 | retdata['hosts'][str(hostidx)] = {} 102 | retdata['hosts'][str(hostidx)]['all'] = rates[hostidx, :].tolist() 103 | retdata['hosts'][str(hostidx)]['dev'] = {} 104 | 105 | for devid in self._hostdevnames[hostidx].keys(): 106 | dpnts = len(values[hostidx, :, 0]) 107 | retdata['hosts'][str(hostidx)]['dev'][devid] = (numpy.diff(self._hostdata[hostidx][:dpnts, numpy.int(devid)]) / numpy.diff(values[hostidx, :, 0])).tolist() 108 | 109 | retdata['hosts'][str(hostidx)]['names'] = self._hostdevnames[hostidx] 110 | 111 | return retdata 112 | 113 | @staticmethod 114 | def collatedata(args, rates): 115 | """ build output data """ 116 | result = [] 117 | for timepoint, hostidx in enumerate(args): 118 | try: 119 | result.append([rates[hostidx, timepoint], int(hostidx)]) 120 | except IndexError: 121 | pass 122 | 123 | return result 124 | -------------------------------------------------------------------------------- /src/supremm/plugins/Gpfs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from supremm.plugin import DeviceBasedPlugin 4 | 5 | class Gpfs(DeviceBasedPlugin): 6 | """ This plugin processes lots of metric that are all interested in the difference over the process """ 7 | 8 | name = property(lambda x: "gpfs") 9 | requiredMetrics = property(lambda x: [ 10 | "gpfs.fsios.read_bytes", 11 | "gpfs.fsios.write_bytes", 12 | "gpfs.fsios.reads", 13 | "gpfs.fsios.writes" 14 | ]) 15 | optionalMetrics = property(lambda x: []) 16 | derivedMetrics = property(lambda x: []) 17 | 18 | 19 | -------------------------------------------------------------------------------- /src/supremm/plugins/GpfsTimeseries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Timeseries generator module """ 3 | 4 | from supremm.plugin import RateConvertingTimeseriesPlugin 5 | import numpy 6 | 7 | class GpfsTimeseries(RateConvertingTimeseriesPlugin): 8 | """ Generate the GPFS usage as a timeseries data """ 9 | 10 | name = property(lambda x: "lnet") 11 | requiredMetrics 
= property(lambda x: ["gpfs.fsios.read_bytes", "gpfs.fsios.write_bytes"]) 12 | optionalMetrics = property(lambda x: []) 13 | derivedMetrics = property(lambda x: []) 14 | 15 | def __init__(self, job): 16 | super(GpfsTimeseries, self).__init__(job) 17 | 18 | def computetimepoint(self, data): 19 | return (numpy.sum(data[0]) + numpy.sum(data[1])) / 1048576.0 20 | -------------------------------------------------------------------------------- /src/supremm/plugins/GpuPower.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Energy usage plugin """ 3 | 4 | from supremm.plugin import Plugin 5 | from supremm.statistics import RollingStats, calculate_stats, Integrator 6 | from supremm.errors import ProcessingError 7 | 8 | class GpuPower(Plugin): 9 | """ Compute the power statistics for a job """ 10 | 11 | name = property(lambda x: "gpupower") 12 | mode = property(lambda x: "all") 13 | requiredMetrics = property(lambda x: ["nvidia.powerused"]) 14 | optionalMetrics = property(lambda x: []) 15 | derivedMetrics = property(lambda x: []) 16 | 17 | def __init__(self, job): 18 | super(GpuPower, self).__init__(job) 19 | self._data = {} 20 | 21 | def process(self, nodemeta, timestamp, data, description): 22 | """ Power measurements are similar to the memory measurements the first and last data points 23 | are ignored and the statistics are computed over all of the other measurements. 24 | """ 25 | 26 | if not data or not description: 27 | # nvidia pmda can be running, but no cards present 28 | return False 29 | 30 | if nodemeta.nodeindex not in self._data: 31 | self._data[nodemeta.nodeindex] = { 32 | 'power': RollingStats(), 33 | 'energy': Integrator(timestamp), 34 | 'names': [x for x in description[0][1]] 35 | } 36 | return True 37 | 38 | hdata = self._data[nodemeta.nodeindex] 39 | 40 | power_watts = data[0] / 1000.0 41 | 42 | hdata['power'].append(power_watts) 43 | hdata['energy'].add(timestamp, power_watts) 44 | 45 | return True 46 | 47 | def results(self): 48 | 49 | result = {} 50 | for data in self._data.values(): 51 | 52 | if data['power'].count() < 1: 53 | continue 54 | 55 | for i, devicename in enumerate(data['names']): 56 | if devicename not in result: 57 | result[devicename] = {"meanpower": [], "maxpower": [], "energy": []} 58 | 59 | result[devicename]["meanpower"].append(data['power'].mean()[i]) 60 | result[devicename]["maxpower"].append(data['power'].max[i]) 61 | result[devicename]["energy"].append(data['energy'].total[i]) 62 | 63 | if not result: 64 | return {"error": ProcessingError.INSUFFICIENT_DATA} 65 | 66 | output = {} 67 | for device, data in result.items(): 68 | output[device] = { 69 | "power": { 70 | "mean": calculate_stats(data['meanpower']), 71 | "max": calculate_stats(data['maxpower']) 72 | }, 73 | "energy": calculate_stats(data['energy']) 74 | } 75 | output[device]['energy']['total'] = output[device]['energy']['avg'] * output[device]['energy']['cnt'] 76 | 77 | return output 78 | -------------------------------------------------------------------------------- /src/supremm/plugins/GpuUsage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ GPU statistics """ 3 | 4 | from supremm.plugin import Plugin 5 | from supremm.statistics import RollingStats, calculate_stats 6 | 7 | class GpuUsage(Plugin): 8 | """ Compute the overall gpu usage for a job """ 9 | 10 | name = property(lambda x: "gpu") 11 | mode = property(lambda x: "all") 12 | requiredMetrics = 
property(lambda x: ["nvidia.gpuactive", "nvidia.memused"]) 13 | optionalMetrics = property(lambda x: ["nvidia.memactive"]) 14 | derivedMetrics = property(lambda x: []) 15 | 16 | def __init__(self, job): 17 | super(GpuUsage, self).__init__(job) 18 | self._data = {} 19 | self.statnames = None 20 | 21 | def process(self, nodemeta, timestamp, data, description): 22 | 23 | if len(description) == 0 or len(data[0]) == 0: 24 | # nvidia pmda can be running, but no cards present 25 | return False 26 | 27 | if nodemeta.nodename not in self._data: 28 | if self.statnames == None: 29 | self.statnames = ['gpuactive', 'memused'] 30 | if len(data) == 3: 31 | self.statnames.append('memactive') 32 | 33 | self._data[nodemeta.nodename] = {} 34 | for statname in self.statnames: 35 | self._data[nodemeta.nodename][statname] = RollingStats() 36 | 37 | self._data[nodemeta.nodename]['names'] = [x for x in description[0][1]] 38 | 39 | for idx, statname in enumerate(self.statnames): 40 | self._data[nodemeta.nodename][statname].append(1.0 * data[idx]) 41 | 42 | return True 43 | 44 | def results(self): 45 | 46 | result = {} 47 | for data in self._data.values(): 48 | for i, devicename in enumerate(data['names']): 49 | if devicename not in result: 50 | result[devicename] = {} 51 | for statname in self.statnames: 52 | result[devicename][statname] = [] 53 | result[devicename][statname + "max"] = [] 54 | for statname in self.statnames: 55 | result[devicename][statname].append(data[statname].mean()[i]) 56 | result[devicename][statname + "max"].append(data[statname].max[i]) 57 | 58 | output = {} 59 | for device, data in result.items(): 60 | output[device] = {} 61 | for statname, datalist in data.items(): 62 | output[device][statname] = calculate_stats(datalist) 63 | 64 | if len(output) == 0: 65 | output['error'] = "no data" 66 | 67 | return output 68 | -------------------------------------------------------------------------------- /src/supremm/plugins/GpuUsageTimeseries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Timeseries generator module """ 3 | 4 | from supremm.plugin import Plugin 5 | from supremm.subsample import TimeseriesAccumulator 6 | import numpy 7 | from collections import Counter 8 | 9 | class GpuUsageTimeseries(Plugin): 10 | """ Generate the CPU usage as a timeseries data """ 11 | 12 | name = property(lambda x: "gpu_usage") 13 | mode = property(lambda x: "timeseries") 14 | requiredMetrics = property(lambda x: ["nvidia.gpuactive"]) 15 | optionalMetrics = property(lambda x: []) 16 | derivedMetrics = property(lambda x: []) 17 | 18 | def __init__(self, job): 19 | super(GpuUsageTimeseries, self).__init__(job) 20 | self._data = TimeseriesAccumulator(job.nodecount, self._job.walltime) 21 | self._hostdata = {} 22 | self._hostdevnames = {} 23 | 24 | def process(self, nodemeta, timestamp, data, description): 25 | 26 | hostidx = nodemeta.nodeindex 27 | 28 | if len(data[0]) == 0: 29 | # Skip data point with no data 30 | return True 31 | 32 | if nodemeta.nodeindex not in self._hostdata: 33 | self._hostdata[hostidx] = numpy.empty((TimeseriesAccumulator.MAX_DATAPOINTS, len(data[0]))) 34 | self._hostdevnames[hostidx] = dict((str(k), str(v)) for k, v in zip(description[0][0], description[0][1])) 35 | 36 | avg_usage = numpy.mean(data[0]) 37 | insertat = self._data.adddata(hostidx, timestamp, avg_usage) 38 | if insertat != None: 39 | self._hostdata[hostidx][insertat] = data[0] 40 | 41 | return True 42 | 43 | def results(self): 44 | 45 | values = 
self._data.get() 46 | 47 | if len(self._hostdata) > 64: 48 | 49 | # Compute min, max & median data and only save the host data 50 | # for these hosts 51 | 52 | memdata = values[:, :, 1] 53 | sortarr = numpy.argsort(memdata.T, axis=1) 54 | 55 | retdata = { 56 | "min": self.collatedata(sortarr[:, 0], memdata), 57 | "max": self.collatedata(sortarr[:, -1], memdata), 58 | "med": self.collatedata(sortarr[:, sortarr.shape[1] // 2], memdata), 59 | "times": values[0, :, 0].tolist(), 60 | "hosts": {} 61 | } 62 | 63 | uniqhosts = Counter(sortarr[:, 0]) 64 | uniqhosts.update(sortarr[:, -1]) 65 | uniqhosts.update(sortarr[:, sortarr.shape[1] // 2]) 66 | includelist = list(uniqhosts.keys()) 67 | else: 68 | # Save data for all hosts 69 | retdata = { 70 | "times": values[0, :, 0].tolist(), 71 | "hosts": {} 72 | } 73 | includelist = list(self._hostdata.keys()) 74 | 75 | 76 | for hostidx in includelist: 77 | retdata['hosts'][str(hostidx)] = {} 78 | retdata['hosts'][str(hostidx)]['all'] = values[hostidx, :, 1].tolist() 79 | retdata['hosts'][str(hostidx)]['dev'] = {} 80 | 81 | for devid in self._hostdevnames[hostidx].keys(): 82 | dpnts = len(values[hostidx, :, 0]) 83 | retdata['hosts'][str(hostidx)]['dev'][devid] = self._hostdata[hostidx][:dpnts, int(devid)].tolist() 84 | 85 | retdata['hosts'][str(hostidx)]['names'] = self._hostdevnames[hostidx] 86 | 87 | return retdata 88 | 89 | @staticmethod 90 | def collatedata(args, rates): 91 | """ build output data """ 92 | result = [] 93 | for timepoint, hostidx in enumerate(args): 94 | try: 95 | result.append([rates[hostidx, timepoint], int(hostidx)]) 96 | except IndexError: 97 | pass 98 | 99 | return result 100 | -------------------------------------------------------------------------------- /src/supremm/plugins/InfiniBand.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from supremm.plugin import DeviceBasedPlugin 4 | 5 | class InfiniBand(DeviceBasedPlugin): 6 | """ This plugin processes lots of metric that are all interested in the difference over the process """ 7 | 8 | name = property(lambda x: "infiniband") 9 | requiredMetrics = property(lambda x: [ 10 | "infiniband.port.switch.in.bytes", 11 | "infiniband.port.switch.in.packets", 12 | "infiniband.port.switch.out.bytes", 13 | "infiniband.port.switch.out.packets" 14 | ]) 15 | optionalMetrics = property(lambda x: []) 16 | derivedMetrics = property(lambda x: []) 17 | -------------------------------------------------------------------------------- /src/supremm/plugins/InfiniBandTimeseries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Timeseries generator module """ 3 | 4 | from supremm.plugin import RateConvertingTimeseriesPlugin 5 | import numpy 6 | 7 | class InfiniBandTimeseries(RateConvertingTimeseriesPlugin): 8 | """ Generate the infiniband usage as a timeseries data """ 9 | 10 | name = property(lambda x: "ib_lnet") 11 | mode = property(lambda x: "timeseries") 12 | requiredMetrics = property(lambda x: ["infiniband.port.switch.in.bytes", "infiniband.port.switch.out.bytes"]) 13 | optionalMetrics = property(lambda x: []) 14 | derivedMetrics = property(lambda x: []) 15 | 16 | def __init__(self, job): 17 | super(InfiniBandTimeseries, self).__init__(job) 18 | 19 | def computetimepoint(self, data): 20 | return (numpy.sum(data[0]) + numpy.sum(data[1])) / 1048576.0 21 | -------------------------------------------------------------------------------- 
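BlockTimeseries, GpfsTimeseries and the InfiniBandTimeseries plugin above all follow the same pattern: declare the cumulative byte counters in requiredMetrics and have computetimepoint() return their sum divided by 1048576.0 (bytes to MiB); the RateConvertingTimeseriesPlugin base class then, as its name suggests, differences successive timepoints to produce a rate. The sketch below shows that minimal shape; the ExampleBytesTimeseries class and the example.dev.* metric names are placeholders, not real PCP metrics or part of the repository.

import numpy

from supremm.plugin import RateConvertingTimeseriesPlugin

class ExampleBytesTimeseries(RateConvertingTimeseriesPlugin):
    """ Hypothetical sketch: combined read+write traffic for a device, in MiB """

    name = property(lambda x: "example_bytes")
    requiredMetrics = property(lambda x: ["example.dev.read_bytes", "example.dev.write_bytes"])
    optionalMetrics = property(lambda x: [])
    derivedMetrics = property(lambda x: [])

    def __init__(self, job):
        super(ExampleBytesTimeseries, self).__init__(job)

    def computetimepoint(self, data):
        # data[i] holds the per-device values of requiredMetrics[i] at one timestamp;
        # summing the counters and scaling by 1/1048576 reports cumulative MiB, which the
        # base class converts into a MiB/s rate between consecutive timepoints.
        return (numpy.sum(data[0]) + numpy.sum(data[1])) / 1048576.0

Of the byte-counter plugins shown here, only InfiniBandTimeseries sets the mode property explicitly; the others appear to rely on a default supplied by the base class.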
/src/supremm/plugins/IpmiPower.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Energy usage plugin """ 3 | 4 | import numpy 5 | 6 | from supremm.plugin import Plugin 7 | from supremm.statistics import RollingStats, calculate_stats, Integrator 8 | from supremm.errors import ProcessingError 9 | 10 | class IpmiPower(Plugin): 11 | """ Compute the power statistics for a job """ 12 | 13 | name = property(lambda x: "ipmi") 14 | mode = property(lambda x: "all") 15 | requiredMetrics = property(lambda x: ["ipmi.dcmi.power"]) 16 | optionalMetrics = property(lambda x: []) 17 | derivedMetrics = property(lambda x: []) 18 | 19 | def __init__(self, job): 20 | super(IpmiPower, self).__init__(job) 21 | self._data = {} 22 | 23 | def process(self, nodemeta, timestamp, data, description): 24 | """ Power measurements are similar to the memory measurements the first and last data points 25 | are ignored and the statistics are computed over all of the other measurements. 26 | """ 27 | 28 | if not data or data[0].size == 0: 29 | return True 30 | 31 | if data[0][0] < numpy.finfo(numpy.float64).eps: 32 | # Some IPMI stacks return a zero value if they don't support power metrics 33 | return False 34 | 35 | if nodemeta.nodeindex not in self._data: 36 | self._data[nodemeta.nodeindex] = { 37 | 'power': RollingStats(), 38 | 'energy': Integrator(timestamp) 39 | } 40 | return True 41 | 42 | hdata = self._data[nodemeta.nodeindex] 43 | 44 | hdata['power'].append(data[0][0]) 45 | hdata['energy'].add(timestamp, data[0][0]) 46 | 47 | return True 48 | 49 | def results(self): 50 | 51 | meanpower = [] 52 | maxpower = [] 53 | 54 | energy = [] 55 | time_covered = 0 56 | 57 | for pdata in self._data.values(): 58 | if pdata['power'].count() > 0: 59 | meanpower.append(pdata['power'].mean()) 60 | maxpower.append(pdata['power'].max) 61 | energy.append(pdata['energy'].total) 62 | time_covered += pdata['energy'].elapsed 63 | 64 | total_energy = numpy.sum(energy) 65 | 66 | if total_energy < numpy.finfo(numpy.float64).eps: 67 | return {"error": ProcessingError.RAW_COUNTER_UNAVAILABLE} 68 | 69 | if time_covered < 0.9 * self._job.nodecount * self._job.walltime: 70 | return {"error": ProcessingError.INSUFFICIENT_DATA} 71 | 72 | if not meanpower: 73 | return {"error": ProcessingError.INSUFFICIENT_DATA} 74 | 75 | energy_stats = calculate_stats(energy) 76 | energy_stats['total'] = total_energy 77 | 78 | return { 79 | "power": { 80 | "mean": calculate_stats(meanpower), 81 | "max": calculate_stats(maxpower) 82 | }, 83 | "energy": energy_stats 84 | } 85 | -------------------------------------------------------------------------------- /src/supremm/plugins/Lnet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Lnet statistics """ 3 | 4 | from supremm.plugin import Plugin 5 | from supremm.statistics import calculate_stats 6 | import numpy 7 | 8 | class Lnet(Plugin): 9 | """ Compute the overall lnet usage for a job """ 10 | 11 | name = property(lambda x: "lnet") 12 | mode = property(lambda x: "firstlast") 13 | requiredMetrics = property(lambda x: ["lustre.lnet.drop_length", "lustre.lnet.recv_length", "lustre.lnet.send_length", "lustre.lnet.drop_count", "lustre.lnet.recv_count", "lustre.lnet.send_count"]) 14 | optionalMetrics = property(lambda x: []) 15 | derivedMetrics = property(lambda x: []) 16 | 17 | def __init__(self, job): 18 | super(Lnet, self).__init__(job) 19 | self._first = {} 20 | self._data = 
numpy.empty((job.nodecount, len(self.requiredMetrics))) 21 | self._hostidx = 0 22 | 23 | def process(self, nodemeta, timestamp, data, description): 24 | 25 | vals = numpy.array(data)[:, 0] 26 | 27 | if nodemeta.nodename not in self._first: 28 | self._first[nodemeta.nodename] = vals 29 | return True 30 | 31 | self._data[self._hostidx, :] = vals - self._first[nodemeta.nodename] 32 | self._hostidx += 1 33 | 34 | return True 35 | 36 | def results(self): 37 | 38 | output = {} 39 | 40 | for i, nicename in enumerate(['drop', 'recv', 'send', 'drop_count', 'recv_count', 'send_count']): 41 | output[nicename] = calculate_stats(self._data[:self._hostidx, i]) 42 | 43 | return output 44 | -------------------------------------------------------------------------------- /src/supremm/plugins/LoadAvg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Load Average plugin """ 3 | 4 | from supremm.plugin import Plugin 5 | from supremm.statistics import RollingStats, calculate_stats 6 | from supremm.errors import ProcessingError 7 | 8 | class LoadAvg(Plugin): 9 | """ Process the load average metrics """ 10 | 11 | name = property(lambda x: "load1") 12 | mode = property(lambda x: "all") 13 | requiredMetrics = property(lambda x: ["kernel.all.load"]) 14 | optionalMetrics = property(lambda x: []) 15 | derivedMetrics = property(lambda x: []) 16 | 17 | def __init__(self, job): 18 | super(LoadAvg, self).__init__(job) 19 | self._data = {} 20 | 21 | def process(self, nodemeta, timestamp, data, description): 22 | """ Computes the mean and max values of the load average for each node 23 | optionally normalizes this data to be per core (if the core count is available) 24 | """ 25 | 26 | if data[0].size < 1: 27 | return True 28 | 29 | if nodemeta.nodename not in self._data: 30 | self._data[nodemeta.nodename] = RollingStats() 31 | return True 32 | 33 | self._data[nodemeta.nodename].append(data[0][0]) 34 | 35 | return True 36 | 37 | def results(self): 38 | 39 | meanval = [] 40 | maxval = [] 41 | meanvalpercore = [] 42 | maxvalpercore = [] 43 | 44 | hinv = self._job.getdata('hinv') 45 | 46 | for nodename, loaddata in self._data.items(): 47 | if loaddata.count() > 0: 48 | meanval.append(loaddata.mean()) 49 | maxval.append(loaddata.max) 50 | 51 | if hinv != None and nodename in hinv: 52 | meanvalpercore.append(loaddata.mean() / hinv[nodename]['cores']) 53 | maxvalpercore.append(loaddata.max / hinv[nodename]['cores']) 54 | 55 | if len(meanval) == 0: 56 | return {"error": ProcessingError.INSUFFICIENT_DATA} 57 | 58 | results = { 59 | "mean": calculate_stats(meanval), 60 | "max": calculate_stats(maxval) 61 | } 62 | 63 | if len(meanvalpercore) > 0: 64 | results['meanpercore'] = calculate_stats(meanvalpercore) 65 | results['maxpercore'] = calculate_stats(maxvalpercore) 66 | 67 | return results 68 | 69 | -------------------------------------------------------------------------------- /src/supremm/plugins/Lustre.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from supremm.plugin import DeviceBasedPlugin 4 | 5 | class Lustre(DeviceBasedPlugin): 6 | """ This plugin processes lots of metric that are all interested in the difference over the process """ 7 | 8 | name = property(lambda x: "lustre") 9 | requiredMetrics = property(lambda x: [ 10 | "lustre.llite.read_bytes.total", 11 | "lustre.llite.write_bytes.total" 12 | ]) 13 | optionalMetrics = property(lambda x: []) 14 | derivedMetrics = property(lambda 
x: []) 15 | 16 | 17 | -------------------------------------------------------------------------------- /src/supremm/plugins/LustreTimeseries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Timeseries generator module """ 3 | 4 | from supremm.plugin import RateConvertingTimeseriesPlugin 5 | import numpy 6 | 7 | class LustreTimeseries(RateConvertingTimeseriesPlugin): 8 | """ Generate the Lustre usage as a timeseries data """ 9 | 10 | name = property(lambda x: "lnet") 11 | requiredMetrics = property(lambda x: ["lustre.llite.read_bytes.total", "lustre.llite.write_bytes.total"]) 12 | optionalMetrics = property(lambda x: []) 13 | derivedMetrics = property(lambda x: []) 14 | 15 | def __init__(self, job): 16 | super(LustreTimeseries, self).__init__(job) 17 | 18 | def computetimepoint(self, data): 19 | return (numpy.sum(data[0]) + numpy.sum(data[1])) / 1048576.0 20 | -------------------------------------------------------------------------------- /src/supremm/plugins/MemUsageTimeseries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Timeseries generator module """ 3 | 4 | from supremm.plugin import Plugin 5 | from supremm.subsample import TimeseriesAccumulator 6 | import numpy 7 | from collections import Counter 8 | 9 | class MemUsageTimeseries(Plugin): 10 | """ Generate the CPU usage as a timeseries data """ 11 | 12 | name = property(lambda x: "memused_minus_diskcache") 13 | mode = property(lambda x: "timeseries") 14 | requiredMetrics = property(lambda x: ["mem.numa.util.used", "mem.numa.util.filePages", "mem.numa.util.slab"]) 15 | optionalMetrics = property(lambda x: []) 16 | derivedMetrics = property(lambda x: []) 17 | 18 | def __init__(self, job): 19 | super(MemUsageTimeseries, self).__init__(job) 20 | self._data = TimeseriesAccumulator(job.nodecount, self._job.walltime) 21 | self._hostdata = {} 22 | self._hostdevnames = {} 23 | 24 | def process(self, nodemeta, timestamp, data, description): 25 | 26 | hostidx = nodemeta.nodeindex 27 | 28 | if len(data[0]) == 0: 29 | # Skip data point with no data 30 | return True 31 | 32 | if nodemeta.nodeindex not in self._hostdata: 33 | self._hostdata[hostidx] = numpy.empty((TimeseriesAccumulator.MAX_DATAPOINTS, len(data[0]))) 34 | self._hostdevnames[hostidx] = dict((str(k), "numa " + v) for k, v in zip(description[0][0], description[0][1])) 35 | 36 | nodemem_kb = numpy.sum(data[0]) - numpy.sum(data[1]) - numpy.sum(data[2]) 37 | insertat = self._data.adddata(hostidx, timestamp, nodemem_kb / 1048576.0) 38 | if insertat != None: 39 | self._hostdata[hostidx][insertat] = (data[0] - data[1] - data[2]) / 1048576.0 40 | 41 | return True 42 | 43 | def results(self): 44 | 45 | values = self._data.get() 46 | 47 | if len(self._hostdata) > 64: 48 | 49 | # Compute min, max & median data and only save the host data 50 | # for these hosts 51 | 52 | memdata = values[:, :, 1] 53 | sortarr = numpy.argsort(memdata.T, axis=1) 54 | 55 | retdata = { 56 | "min": self.collatedata(sortarr[:, 0], memdata), 57 | "max": self.collatedata(sortarr[:, -1], memdata), 58 | "med": self.collatedata(sortarr[:, sortarr.shape[1] // 2], memdata), 59 | "times": values[0, :, 0].tolist(), 60 | "hosts": {} 61 | } 62 | 63 | uniqhosts = Counter(sortarr[:, 0]) 64 | uniqhosts.update(sortarr[:, -1]) 65 | uniqhosts.update(sortarr[:, sortarr.shape[1] // 2]) 66 | includelist = list(uniqhosts.keys()) 67 | else: 68 | # Save data for all hosts 69 | retdata = { 70 | 
"times": values[0, :, 0].tolist(), 71 | "hosts": {} 72 | } 73 | includelist = list(self._hostdata.keys()) 74 | 75 | 76 | for hostidx in includelist: 77 | retdata['hosts'][str(hostidx)] = {} 78 | retdata['hosts'][str(hostidx)]['all'] = values[hostidx, :, 1].tolist() 79 | retdata['hosts'][str(hostidx)]['dev'] = {} 80 | 81 | for devid in self._hostdevnames[hostidx].keys(): 82 | dpnts = len(values[hostidx, :, 0]) 83 | retdata['hosts'][str(hostidx)]['dev'][devid] = self._hostdata[hostidx][:dpnts, numpy.int(devid)].tolist() 84 | 85 | retdata['hosts'][str(hostidx)]['names'] = self._hostdevnames[hostidx] 86 | 87 | return retdata 88 | 89 | @staticmethod 90 | def collatedata(args, rates): 91 | """ build output data """ 92 | result = [] 93 | for timepoint, hostidx in enumerate(args): 94 | try: 95 | result.append([rates[hostidx, timepoint], int(hostidx)]) 96 | except IndexError: 97 | pass 98 | 99 | return result 100 | -------------------------------------------------------------------------------- /src/supremm/plugins/MemoryUsage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Memory usage plugin """ 3 | 4 | from supremm.plugin import Plugin 5 | from supremm.statistics import RollingStats, calculate_stats 6 | from supremm.errors import ProcessingError 7 | 8 | class MemoryUsage(Plugin): 9 | """ Compute the overall memory usage for a job """ 10 | 11 | name = property(lambda x: "memory") 12 | mode = property(lambda x: "all") 13 | requiredMetrics = property(lambda x: ["mem.numa.util.used", "mem.numa.util.filePages", "mem.numa.util.slab", "kernel.percpu.cpu.user"]) 14 | optionalMetrics = property(lambda x: []) 15 | derivedMetrics = property(lambda x: []) 16 | 17 | def __init__(self, job): 18 | super(MemoryUsage, self).__init__(job) 19 | self._data = {} 20 | self._hostcpucounts = {} 21 | 22 | def process(self, nodemeta, timestamp, data, description): 23 | """ Memory statistics are the aritmetic mean of all values except the 24 | first and last rather than storing all of the meory measurements for 25 | the job, we use the RollingStats() class to keep track of the mean 26 | values. Since we don't know which data point is the last one, we update 27 | the RollingStats with the value from the previous timestep at each timestep. 
28 | """ 29 | 30 | if nodemeta.nodeindex not in self._data: 31 | self._data[nodemeta.nodeindex] = {'usedval': None, 32 | 'used': RollingStats(), 33 | 'usedminusval': None, 34 | 'usedminus': RollingStats()} 35 | return True 36 | 37 | if nodemeta.nodeindex not in self._hostcpucounts and data[3].size > 0: 38 | self._hostcpucounts[nodemeta.nodeindex] = data[3].size 39 | 40 | hdata = self._data[nodemeta.nodeindex] 41 | 42 | if hdata['usedval'] != None: 43 | hdata['used'].append(hdata['usedval']) 44 | hdata['usedminus'].append(hdata['usedminusval']) 45 | 46 | hdata['usedval'] = sum(data[0]) 47 | hdata['usedminusval'] = (sum(data[0]) - sum(data[1]) - sum(data[2])) 48 | 49 | return True 50 | 51 | def results(self): 52 | 53 | memused = [] 54 | memusedminus = [] 55 | 56 | for hostidx, memdata in self._data.items(): 57 | if hostidx not in self._hostcpucounts: 58 | return {"error": ProcessingError.INSUFFICIENT_HOSTDATA} 59 | if memdata['used'].count() > 0: 60 | memused.append(memdata['used'].mean() / self._hostcpucounts[hostidx]) 61 | if memdata['usedminus'].count() > 0: 62 | memusedminus.append(memdata['usedminus'].mean() / self._hostcpucounts[hostidx]) 63 | 64 | if len(memused) == 0: 65 | return {"error": ProcessingError.INSUFFICIENT_DATA} 66 | 67 | return {"used": calculate_stats(memused), "used_minus_cache": calculate_stats(memusedminus)} 68 | -------------------------------------------------------------------------------- /src/supremm/plugins/Network.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from supremm.plugin import DeviceBasedPlugin 4 | 5 | class Network(DeviceBasedPlugin): 6 | """ This plugin processes lots of metric that are all interested in the difference over the process """ 7 | 8 | name = property(lambda x: "network") 9 | requiredMetrics = property(lambda x: [ 10 | "network.interface.in.bytes", 11 | "network.interface.out.bytes", 12 | ]) 13 | optionalMetrics = property(lambda x: [ 14 | "network.interface.in.packets" 15 | "network.interface.out.packets" 16 | ]) 17 | derivedMetrics = property(lambda x: []) 18 | 19 | 20 | -------------------------------------------------------------------------------- /src/supremm/plugins/Nfs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from supremm.plugin import DeviceBasedPlugin 4 | 5 | class Nfs(DeviceBasedPlugin): 6 | """ Generate usage statistics for NFS clients """ 7 | 8 | name = property(lambda x: "nfs") 9 | requiredMetrics = property(lambda x: [ 10 | "nfsclient.bytes.read.normal", 11 | "nfsclient.bytes.read.direct", 12 | "nfsclient.bytes.read.server", 13 | "nfsclient.bytes.write.normal", 14 | "nfsclient.bytes.write.direct", 15 | "nfsclient.bytes.write.server" 16 | ]) 17 | optionalMetrics = property(lambda x: []) 18 | derivedMetrics = property(lambda x: []) 19 | 20 | 21 | -------------------------------------------------------------------------------- /src/supremm/plugins/NfsTimeseries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Timeseries generator module """ 3 | 4 | from supremm.plugin import RateConvertingTimeseriesPlugin 5 | import numpy 6 | 7 | class NfsTimeseries(RateConvertingTimeseriesPlugin): 8 | """ Generate timeseries summary for NFS usage data """ 9 | 10 | name = property(lambda x: "nfs") 11 | requiredMetrics = property(lambda x: ["nfsclient.bytes.read.normal", 12 | "nfsclient.bytes.read.direct", 13 | 
"nfsclient.bytes.read.server", 14 | "nfsclient.bytes.write.normal", 15 | "nfsclient.bytes.write.direct", 16 | "nfsclient.bytes.write.server"]) 17 | optionalMetrics = property(lambda x: []) 18 | derivedMetrics = property(lambda x: []) 19 | 20 | def __init__(self, job): 21 | super(NfsTimeseries, self).__init__(job) 22 | 23 | def computetimepoint(self, data): 24 | try: 25 | return numpy.sum(numpy.array(data)) / 1048576.0 26 | except ValueError: 27 | # NFS mount points can dissapear / appear during the job 28 | # skip points that are inconsistent with the first point 29 | return None 30 | -------------------------------------------------------------------------------- /src/supremm/plugins/NodeMemoryUsage.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Memory usage plugin """ 3 | 4 | from supremm.plugin import Plugin 5 | from supremm.statistics import RollingStats, calculate_stats 6 | from supremm.errors import ProcessingError 7 | 8 | class NodeMemoryUsage(Plugin): 9 | """ Compute the overall memory usage for a job """ 10 | 11 | name = property(lambda x: "nodememory") 12 | mode = property(lambda x: "all") 13 | requiredMetrics = property(lambda x: [["mem.freemem", "mem.physmem"], ["mem.util.free", "hinv.physmem", "mem.util.cached"]]) 14 | optionalMetrics = property(lambda x: []) 15 | derivedMetrics = property(lambda x: []) 16 | 17 | def __init__(self, job): 18 | super(NodeMemoryUsage, self).__init__(job) 19 | self._data = {} 20 | 21 | def process(self, nodemeta, timestamp, data, description): 22 | """ Memory statistics are the aritmetic mean of all values except the 23 | first and last rather than storing all of the memory measurements for 24 | the job, we use the RollingStats() class to keep track of the mean 25 | values. Since we don't know which data point is the last one, we update 26 | the RollingStats with the value from the previous timestep at each timestep. 
27 | """ 28 | 29 | if nodemeta.nodeindex not in self._data: 30 | self._data[nodemeta.nodeindex] = {'freeval': None, 31 | 'free': RollingStats(), 32 | 'cached': None, 33 | 'physmem': None} 34 | return True 35 | 36 | hdata = self._data[nodemeta.nodeindex] 37 | 38 | if hdata['freeval'] != None: 39 | hdata['free'].append(hdata['freeval']) 40 | 41 | if len(data[0]) > 0: 42 | hdata['freeval'] = data[0][0] 43 | 44 | if hdata['physmem'] == None and len(data[1]) > 0: 45 | hdata['physmem'] = data[1][0] 46 | if len(data) == 3: 47 | hdata['physmem'] *= 1024.0 48 | 49 | if len(data) == 3: 50 | if hdata['cached'] == None: 51 | hdata['cached'] = RollingStats() 52 | 53 | hdata['cached'].append(data[0][0] + data[2][0]) 54 | 55 | return True 56 | 57 | def results(self): 58 | 59 | memused = [] 60 | memusedminus = [] 61 | maxmemused = [] 62 | maxmemusedminus = [] 63 | memfree = [] 64 | maxmemfree = [] 65 | physmem = [] 66 | 67 | for hostidx, memdata in self._data.items(): 68 | if memdata['free'].count() > 0: 69 | memfree.append(memdata['free'].mean()) 70 | maxmemfree.append(memdata['free'].max) 71 | 72 | if memdata['physmem'] != None: 73 | memused.append(memdata['physmem'] - memdata['free'].mean()) 74 | maxmemused.append(memdata['physmem'] - memdata['free'].min) 75 | physmem.append(memdata['physmem']) 76 | 77 | if memdata['cached'] != None: 78 | memusedminus.append(memdata['physmem'] - memdata['cached'].mean()) 79 | maxmemusedminus.append(memdata['physmem'] - memdata['cached'].min) 80 | 81 | if len(memused) == 0: 82 | return {"error": ProcessingError.INSUFFICIENT_DATA} 83 | 84 | result = {"used": calculate_stats(memused), 85 | "maxused": calculate_stats(maxmemused), 86 | "free": calculate_stats(memfree), 87 | "physmem": calculate_stats(physmem), 88 | "maxfree": calculate_stats(maxmemfree)} 89 | 90 | if len(memusedminus) > 0: 91 | result['used_minus_cache'] = calculate_stats(memusedminus) 92 | result['maxused_minus_cache'] = calculate_stats(maxmemusedminus) 93 | 94 | return result 95 | -------------------------------------------------------------------------------- /src/supremm/plugins/PowerUsageTimeseries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Timeseries generator module """ 3 | 4 | from collections import Counter 5 | import numpy 6 | 7 | from supremm.plugin import Plugin 8 | from supremm.subsample import TimeseriesAccumulator 9 | from supremm.errors import ProcessingError 10 | 11 | class PowerUsageTimeseries(Plugin): 12 | """ Generate the Power usage as a timeseries data """ 13 | 14 | name = property(lambda x: "power") 15 | mode = property(lambda x: "timeseries") 16 | requiredMetrics = property(lambda x: ["ipmi.dcmi.power"]) 17 | optionalMetrics = property(lambda x: []) 18 | derivedMetrics = property(lambda x: []) 19 | 20 | def __init__(self, job): 21 | super(PowerUsageTimeseries, self).__init__(job) 22 | self._data = TimeseriesAccumulator(job.nodecount, self._job.walltime) 23 | self._hostdata = {} 24 | 25 | @staticmethod 26 | def computetimepoint(data): 27 | """ Get the power usage from the data """ 28 | if data[0][0] < numpy.finfo(numpy.float64).eps: 29 | return None 30 | 31 | return data[0][0] 32 | 33 | def process(self, nodemeta, timestamp, data, description): 34 | 35 | if not data[0]: 36 | # Skip data point with no data 37 | return True 38 | 39 | if nodemeta.nodeindex not in self._hostdata: 40 | self._hostdata[nodemeta.nodeindex] = 1 41 | 42 | datum = self.computetimepoint(data) 43 | if datum != None: 44 | 
self._data.adddata(nodemeta.nodeindex, timestamp, datum) 45 | 46 | return True 47 | 48 | def results(self): 49 | 50 | if len(self._hostdata) != self._job.nodecount: 51 | return {"error": ProcessingError.INSUFFICIENT_HOSTDATA} 52 | 53 | values = self._data.get() 54 | 55 | if len(values[0, :, 0]) < 3: 56 | return {"error": ProcessingError.JOB_TOO_SHORT} 57 | 58 | power = values[:, :, 1] 59 | 60 | if len(self._hostdata) > 64: 61 | 62 | # Compute min, max & median data and only save the host data 63 | # for these hosts 64 | 65 | sortarr = numpy.argsort(power.T, axis=1) 66 | 67 | retdata = { 68 | "min": self.collatedata(sortarr[:, 0], power), 69 | "max": self.collatedata(sortarr[:, -1], power), 70 | "med": self.collatedata(sortarr[:, sortarr.shape[1] // 2], power), 71 | "times": values[0, :, 0].tolist(), 72 | "hosts": {} 73 | } 74 | 75 | uniqhosts = Counter(sortarr[:, 0]) 76 | uniqhosts.update(sortarr[:, -1]) 77 | uniqhosts.update(sortarr[:, sortarr.shape[1] // 2]) 78 | includelist = list(uniqhosts.keys()) 79 | else: 80 | # Save data for all hosts 81 | retdata = { 82 | "times": values[0, :, 0].tolist(), 83 | "hosts": {} 84 | } 85 | includelist = list(self._hostdata.keys()) 86 | 87 | 88 | for hostidx in includelist: 89 | retdata['hosts'][str(hostidx)] = {} 90 | retdata['hosts'][str(hostidx)]['all'] = power[hostidx, :].tolist() 91 | 92 | return retdata 93 | 94 | @staticmethod 95 | def collatedata(args, rates): 96 | """ build output data """ 97 | result = [] 98 | for timepoint, hostidx in enumerate(args): 99 | try: 100 | result.append([rates[hostidx, timepoint], int(hostidx)]) 101 | except IndexError: 102 | pass 103 | 104 | return result 105 | -------------------------------------------------------------------------------- /src/supremm/plugins/SimdInsTimeseries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Timeseries generator module """ 3 | 4 | from supremm.plugin import Plugin 5 | from supremm.subsample import TimeseriesAccumulator 6 | from supremm.errors import ProcessingError 7 | import numpy 8 | from collections import Counter 9 | 10 | SNB_METRICS = ["perfevent.hwcounters.SIMD_FP_256_PACKED_DOUBLE.value", 11 | "perfevent.hwcounters.FP_COMP_OPS_EXE_SSE_SCALAR_DOUBLE.value", 12 | "perfevent.hwcounters.FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE.value", 13 | "perfevent.hwcounters.SIMD_FP_256_PACKED_DOUBLE.value", 14 | "perfevent.hwcounters.FP_COMP_OPS_EXE_X87.value"] 15 | 16 | NHM_METRICS = ["perfevent.hwcounters.FP_COMP_OPS_EXE_SSE_FP.value"] 17 | 18 | INTERLAGOS_METRICS = ["perfevent.hwcounters.RETIRED_SSE_OPS_ALL.value"] 19 | 20 | class SimdInsTimeseries(Plugin): 21 | """ Generate the CPU usage as a timeseries data """ 22 | 23 | name = property(lambda x: "simdins") 24 | mode = property(lambda x: "timeseries") 25 | requiredMetrics = property(lambda x: [SNB_METRICS, NHM_METRICS, INTERLAGOS_METRICS]) 26 | optionalMetrics = property(lambda x: []) 27 | derivedMetrics = property(lambda x: []) 28 | 29 | def __init__(self, job): 30 | super(SimdInsTimeseries, self).__init__(job) 31 | self._data = TimeseriesAccumulator(job.nodecount, self._job.walltime) 32 | self._hostdata = {} 33 | self._hostdevnames = {} 34 | self._error = None 35 | 36 | def process(self, nodemeta, timestamp, data, description): 37 | 38 | if self._job.getdata('perf')['active'] != True: 39 | self._error = ProcessingError.RAW_COUNTER_UNAVAILABLE 40 | return False 41 | 42 | if len(data[0]) == 0: 43 | # Ignore timesteps where data was not available 44 | return True 45 | 
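# The block below allocates a per-host buffer of MAX_DATAPOINTS rows the first
# time a host is seen, then collapses the per-core counters into a FLOP estimate
# for the timestep: the single NHM/Interlagos counter is used as-is, while the
# SNB counters are combined as the weighted sum 4.0*data[0] + 2.0*data[1] +
# data[2] + data[3]. The node-wide sum is recorded in the subsampling
# TimeseriesAccumulator and the per-core values are kept in self._hostdata; a
# negative delta between consecutively stored samples is treated as a PMDA
# restart and processing is aborted for this plugin.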
46 | hostidx = nodemeta.nodeindex 47 | 48 | if nodemeta.nodeindex not in self._hostdata: 49 | self._hostdata[hostidx] = numpy.empty((TimeseriesAccumulator.MAX_DATAPOINTS, len(data[0]))) 50 | self._hostdevnames[hostidx] = dict((str(k), v) for k, v in zip(description[0][0], description[0][1])) 51 | 52 | if len(data) == len(NHM_METRICS): # Note that INTERLAGOS is covered here too 53 | flops = numpy.array(data[0]) 54 | else: 55 | flops = 4.0 * data[0] + 2.0 * data[1] + data[2] + data[3] 56 | 57 | insertat = self._data.adddata(hostidx, timestamp, numpy.sum(flops)) 58 | if insertat != None: 59 | self._hostdata[hostidx][insertat] = flops 60 | 61 | if insertat > 1: 62 | if numpy.any(flops - self._hostdata[hostidx][insertat-1] < 0.0): 63 | self._error = ProcessingError.PMDA_RESTARTED_DURING_JOB 64 | return False 65 | 66 | return True 67 | 68 | def results(self): 69 | 70 | if self._error != None: 71 | return {"error": self._error} 72 | 73 | values = self._data.get() 74 | 75 | if len(values[0, :, 0]) < 3: 76 | return {"error": ProcessingError.JOB_TOO_SHORT} 77 | 78 | rates = numpy.diff(values[:, :, 1]) / numpy.diff(values[:, :, 0]) 79 | 80 | if len(self._hostdata) > 64: 81 | 82 | # Compute min, max & median data and only save the host data 83 | # for these hosts 84 | 85 | sortarr = numpy.argsort(rates.T, axis=1) 86 | 87 | retdata = { 88 | "min": self.collatedata(sortarr[:, 0], rates), 89 | "max": self.collatedata(sortarr[:, -1], rates), 90 | "med": self.collatedata(sortarr[:, sortarr.shape[1] // 2], rates), 91 | "times": values[0, 1:, 0].tolist(), 92 | "hosts": {} 93 | } 94 | 95 | uniqhosts = Counter(sortarr[:, 0]) 96 | uniqhosts.update(sortarr[:, -1]) 97 | uniqhosts.update(sortarr[:, sortarr.shape[1] // 2]) 98 | includelist = list(uniqhosts.keys()) 99 | else: 100 | # Save data for all hosts 101 | retdata = { 102 | "times": values[0, 1:, 0].tolist(), 103 | "hosts": {} 104 | } 105 | includelist = list(self._hostdata.keys()) 106 | 107 | 108 | for hostidx in includelist: 109 | retdata['hosts'][str(hostidx)] = {} 110 | retdata['hosts'][str(hostidx)]['all'] = rates[hostidx, :].tolist() 111 | retdata['hosts'][str(hostidx)]['dev'] = {} 112 | 113 | for devid in self._hostdevnames[hostidx].keys(): 114 | dpnts = len(values[hostidx, :, 0]) 115 | retdata['hosts'][str(hostidx)]['dev'][devid] = (numpy.diff(self._hostdata[hostidx][:dpnts, numpy.int(devid)]) / numpy.diff(values[hostidx, :, 0])).tolist() 116 | 117 | retdata['hosts'][str(hostidx)]['names'] = self._hostdevnames[hostidx] 118 | 119 | return retdata 120 | 121 | @staticmethod 122 | def collatedata(args, rates): 123 | """ build output data """ 124 | result = [] 125 | for timepoint, hostidx in enumerate(args): 126 | try: 127 | result.append([rates[hostidx, timepoint], int(hostidx)]) 128 | except IndexError: 129 | pass 130 | 131 | return result 132 | -------------------------------------------------------------------------------- /src/supremm/plugins/SveTimeseries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Timeseries generator module """ 3 | 4 | from supremm.plugin import Plugin 5 | from supremm.subsample import TimeseriesAccumulator 6 | from supremm.errors import ProcessingError 7 | import numpy 8 | from collections import Counter 9 | 10 | SVE_METRICS = ["perfevent.hwcounters.arm_a64fx__SVE_INST_RETIRED.value"] 11 | 12 | class SveTimeseries(Plugin): 13 | """ Generate the CPU usage as a timeseries data """ 14 | 15 | name = property(lambda x: "sveins") 16 | mode = property(lambda x: 
"timeseries") 17 | requiredMetrics = property(lambda x: [SVE_METRICS]) 18 | optionalMetrics = property(lambda x: []) 19 | derivedMetrics = property(lambda x: []) 20 | 21 | def __init__(self, job): 22 | super(SveTimeseries, self).__init__(job) 23 | self._data = TimeseriesAccumulator(job.nodecount, self._job.walltime) 24 | self._hostdata = {} 25 | self._hostdevnames = {} 26 | self._error = None 27 | 28 | def process(self, nodemeta, timestamp, data, description): 29 | 30 | if self._job.getdata('perf')['active'] != True: 31 | self._error = ProcessingError.RAW_COUNTER_UNAVAILABLE 32 | return False 33 | 34 | if len(data[0]) == 0: 35 | # Ignore timesteps where data was not available 36 | return True 37 | 38 | hostidx = nodemeta.nodeindex 39 | 40 | if nodemeta.nodeindex not in self._hostdata: 41 | self._hostdata[hostidx] = numpy.empty((TimeseriesAccumulator.MAX_DATAPOINTS, len(data[0]))) 42 | self._hostdevnames[hostidx] = dict((str(k), v) for k, v in zip(description[0][0], description[0][1])) 43 | 44 | if len(data) == len(SVE_METRICS): # Note that INTERLAGOS is covered here too 45 | flops = numpy.array(data[0]) 46 | else: 47 | flops = 4.0 * data[0] + 2.0 * data[1] + data[2] + data[3] 48 | 49 | insertat = self._data.adddata(hostidx, timestamp, numpy.sum(flops)) 50 | if insertat != None: 51 | self._hostdata[hostidx][insertat] = flops 52 | 53 | if insertat > 1: 54 | if numpy.any(flops - self._hostdata[hostidx][insertat-1] < 0.0): 55 | self._error = ProcessingError.PMDA_RESTARTED_DURING_JOB 56 | return False 57 | 58 | return True 59 | 60 | def results(self): 61 | 62 | if self._error != None: 63 | return {"error": self._error} 64 | 65 | values = self._data.get() 66 | 67 | if len(values[0, :, 0]) < 3: 68 | return {"error": ProcessingError.JOB_TOO_SHORT} 69 | 70 | rates = numpy.diff(values[:, :, 1]) / numpy.diff(values[:, :, 0]) 71 | 72 | if len(self._hostdata) > 64: 73 | 74 | # Compute min, max & median data and only save the host data 75 | # for these hosts 76 | 77 | sortarr = numpy.argsort(rates.T, axis=1) 78 | 79 | retdata = { 80 | "min": self.collatedata(sortarr[:, 0], rates), 81 | "max": self.collatedata(sortarr[:, -1], rates), 82 | "med": self.collatedata(sortarr[:, sortarr.shape[1] // 2], rates), 83 | "times": values[0, 1:, 0].tolist(), 84 | "hosts": {} 85 | } 86 | 87 | uniqhosts = Counter(sortarr[:, 0]) 88 | uniqhosts.update(sortarr[:, -1]) 89 | uniqhosts.update(sortarr[:, sortarr.shape[1] // 2]) 90 | includelist = uniqhosts.keys() 91 | else: 92 | # Save data for all hosts 93 | retdata = { 94 | "times": values[0, 1:, 0].tolist(), 95 | "hosts": {} 96 | } 97 | includelist = self._hostdata.keys() 98 | 99 | 100 | for hostidx in includelist: 101 | retdata['hosts'][str(hostidx)] = {} 102 | retdata['hosts'][str(hostidx)]['all'] = rates[hostidx, :].tolist() 103 | retdata['hosts'][str(hostidx)]['dev'] = {} 104 | 105 | for devid in self._hostdevnames[hostidx].iterkeys(): 106 | dpnts = len(values[hostidx, :, 0]) 107 | retdata['hosts'][str(hostidx)]['dev'][devid] = (numpy.diff(self._hostdata[hostidx][:dpnts, numpy.int(devid)]) / numpy.diff(values[hostidx, :, 0])).tolist() 108 | 109 | retdata['hosts'][str(hostidx)]['names'] = self._hostdevnames[hostidx] 110 | 111 | return retdata 112 | 113 | @staticmethod 114 | def collatedata(args, rates): 115 | """ build output data """ 116 | result = [] 117 | for timepoint, hostidx in enumerate(args): 118 | try: 119 | result.append([rates[hostidx, timepoint], int(hostidx)]) 120 | except IndexError: 121 | pass 122 | 123 | return result 124 | 
-------------------------------------------------------------------------------- /src/supremm/plugins/TaccCatastrophe.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from supremm.plugin import Plugin 4 | from supremm.errors import ProcessingError 5 | from supremm.subsample import RangeConverter 6 | import numpy 7 | 8 | class TaccCatastrophe(Plugin): 9 | """ Catastrophe analytic. Algorithm originally developed by Bill Barth et al. for the 10 | tacc_stats project """ 11 | 12 | name = property(lambda x: "catastrophe") 13 | mode = property(lambda x: "all") 14 | requiredMetrics = property(lambda x: [ ["taccstats_perfevent.hwcounters.MEM_LOAD_RETIRED_L1D_HIT.value"], ["taccstats_perfevent.hwcounters.L1D_REPLACEMENT.value"] ]) 15 | optionalMetrics = property(lambda x: []) 16 | derivedMetrics = property(lambda x: []) 17 | 18 | def __init__(self, job): 19 | super(TaccCatastrophe, self).__init__(job) 20 | self._data = {} 21 | self._values = {} 22 | 23 | def process(self, nodemeta, timestamp, data, description): 24 | 25 | if nodemeta.nodename not in self._data: 26 | self._data[nodemeta.nodename] = { "x": [], "t": [] } 27 | self._values[nodemeta.nodename] = RangeConverter(48, False) 28 | 29 | info = self._data[nodemeta.nodename] 30 | value = self._values[nodemeta.nodename].append(data) 31 | 32 | info['x'].append(1.0 * numpy.sum(value)) 33 | info['t'].append(timestamp) 34 | 35 | return True 36 | 37 | def results(self): 38 | 39 | if len(self._data) == 0: 40 | return {"error": ProcessingError.RAW_COUNTER_UNAVAILABLE} 41 | 42 | vals = None 43 | 44 | for host, data in self._data.items(): 45 | x = data['x'] 46 | t = data['t'] 47 | 48 | start = 2 49 | end = len(data['x'])-2 50 | 51 | for i in range(start+1, end-1): 52 | 53 | a = (data['x'][i] - data['x'][start]) / (data['t'][i] - data['t'][start]) 54 | b = (data['x'][end] - data['x'][i]) / (data['t'][end] - data['t'][i]) 55 | vals = b/a if vals == None else min(vals, b/a) 56 | 57 | if vals == None: 58 | return {"error": ProcessingError.JOB_TOO_SHORT} 59 | 60 | return {"value": vals} 61 | -------------------------------------------------------------------------------- /src/supremm/plugins/TaccPerfCounters.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ CPU performance counter plugin """ 3 | 4 | from supremm.plugin import Plugin 5 | from supremm.statistics import calculate_stats 6 | from supremm.errors import ProcessingError 7 | import numpy 8 | 9 | SNB_METRICS = ["taccstats_perfevent.hwcounters.UNHALTED_REFERENCE_CYCLES.value", 10 | "taccstats_perfevent.hwcounters.INSTRUCTION_RETIRED.value", 11 | "taccstats_perfevent.hwcounters.L1D_REPLACEMENT.value", 12 | "taccstats_perfevent.hwcounters.SIMD_FP_256_PACKED_DOUBLE.value", 13 | "taccstats_perfevent.hwcounters.FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE.value", 14 | "taccstats_perfevent.hwcounters.FP_COMP_OPS_EXE_SSE_SCALAR_DOUBLE.value"] 15 | 16 | NHM_METRICS = ["taccstats_perfevent.hwcounters.UNHALTED_REFERENCE_CYCLES.value", 17 | "taccstats_perfevent.hwcounters.INSTRUCTIONS_RETIRED.value", 18 | "taccstats_perfevent.hwcounters.MEM_LOAD_RETIRED_L1D_HIT.value", 19 | "taccstats_perfevent.hwcounters.FP_COMP_OPS_EXE_SSE_FP.value"] 20 | 21 | class TaccPerfCounters(Plugin): 22 | """ Compute various performance counter derived metrics """ 23 | name = property(lambda x: "cpuperf") 24 | mode = property(lambda x: "all") 25 | requiredMetrics = property(lambda x: [SNB_METRICS, 
NHM_METRICS]) 26 | optionalMetrics = property(lambda x: []) 27 | derivedMetrics = property(lambda x: []) 28 | 29 | def __init__(self, job): 30 | super(TaccPerfCounters, self).__init__(job) 31 | self._last = {} 32 | self._data = {} 33 | self._totalcores = 0 34 | self._error = None 35 | 36 | def process(self, nodemeta, timestamp, data, description): 37 | 38 | ndata = numpy.array(data) 39 | 40 | if nodemeta.nodename not in self._last: 41 | self._last[nodemeta.nodename] = ndata 42 | return True 43 | 44 | if ndata.shape == self._last[nodemeta.nodename].shape: 45 | if nodemeta.nodename not in self._data: 46 | # Only populate data for a host when we have at least 2 datapoints 47 | self._data[nodemeta.nodename] = numpy.zeros(ndata.shape) 48 | self._totalcores += data[0].size 49 | 50 | self._data[nodemeta.nodename] += (ndata - self._last[nodemeta.nodename]) % (2**48) 51 | self._last[nodemeta.nodename] = ndata 52 | else: 53 | # Perf counters changed during the job 54 | self._error = ProcessingError.RAW_COUNTER_UNAVAILABLE 55 | return False 56 | 57 | return True 58 | 59 | def results(self): 60 | 61 | if self._error != None: 62 | return {"error": self._error} 63 | 64 | nhosts = len(self._data) 65 | 66 | if nhosts < 1: 67 | return {"error": ProcessingError.INSUFFICIENT_HOSTDATA} 68 | 69 | flops = numpy.zeros(self._totalcores) 70 | cpiref = numpy.zeros(self._totalcores) 71 | cpldref = numpy.zeros(self._totalcores) 72 | 73 | coreindex = 0 74 | for data in self._data.values(): 75 | if len(data) == len(NHM_METRICS): 76 | flops[coreindex:coreindex+len(data[0])] = 1.0 * data[3] 77 | cpiref[coreindex:coreindex+len(data[0])] = 1.0 * data[0] / data[1] 78 | cpldref[coreindex:coreindex+len(data[0])] = 1.0 * data[0] / data[2] 79 | coreindex += len(data[0]) 80 | elif len(data) == len(SNB_METRICS): 81 | flops[coreindex:coreindex+len(data[0])] = 4.0 * data[3] + 2.0 * data[4] + 1.0 * data[5] 82 | cpiref[coreindex:coreindex+len(data[0])] = 1.0 * data[0] / data[1] 83 | cpldref[coreindex:coreindex+len(data[0])] = 1.0 * data[0] / data[2] 84 | coreindex += len(data[0]) 85 | else: 86 | return {"error": ProcessingError.INSUFFICIENT_DATA} 87 | 88 | results = {"flops": calculate_stats(flops), "cpiref": calculate_stats(cpiref), "cpldref": calculate_stats(cpldref)} 89 | return results 90 | -------------------------------------------------------------------------------- /src/supremm/plugins/TaccUncoreCounters.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Socket level performance counter plugin """ 3 | 4 | from supremm.plugin import Plugin 5 | from supremm.statistics import calculate_stats 6 | from supremm.errors import ProcessingError 7 | import numpy 8 | 9 | TACC_NHM_METRICS = ["taccstats_perfevent.hwcounters.UNC_LLC_MISS_READ.value", 10 | "taccstats_perfevent.hwcounters.UNC_LLC_MISS_WRITE.value"] 11 | 12 | class TaccUncoreCounters(Plugin): 13 | """ Compute various uncore performance counter derived metrics """ 14 | 15 | name = property(lambda x: "uncperf") 16 | mode = property(lambda x: "all") 17 | requiredMetrics = property(lambda x: TACC_NHM_METRICS) 18 | optionalMetrics = property(lambda x: []) 19 | derivedMetrics = property(lambda x: []) 20 | 21 | def __init__(self, job): 22 | super(TaccUncoreCounters, self).__init__(job) 23 | self._last = {} 24 | self._data = {} 25 | self._error = None 26 | 27 | def process(self, nodemeta, timestamp, data, description): 28 | ndata = numpy.array(data) 29 | 30 | if nodemeta.nodename not in self._last: 31 | 
self._last[nodemeta.nodename] = ndata 32 | self._data[nodemeta.nodename] = 0.0 33 | return True 34 | 35 | if ndata.shape == self._last[nodemeta.nodename].shape: 36 | self._data[nodemeta.nodename] += numpy.sum((ndata - self._last[nodemeta.nodename]) % 2**48) 37 | self._last[nodemeta.nodename] = ndata 38 | else: 39 | # Perf counters changed during the job 40 | self._error = ProcessingError.RAW_COUNTER_UNAVAILABLE 41 | return False 42 | 43 | return True 44 | 45 | def results(self): 46 | 47 | if self._error != None: 48 | return {"error": self._error} 49 | 50 | nhosts = len(self._data) 51 | 52 | if nhosts < 1: 53 | return {"error": ProcessingError.INSUFFICIENT_DATA} 54 | 55 | membw = numpy.zeros(nhosts) 56 | for hostindex, data in enumerate(self._data.values()): 57 | membw[hostindex] = data * 64.0 58 | 59 | results = {"membw": calculate_stats(membw)} 60 | return results 61 | -------------------------------------------------------------------------------- /src/supremm/plugins/TimeseriesPatternsGpfs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from supremm.TimeseriesPatterns import TimeseriesPatterns 3 | 4 | 5 | class TimeseriesPatternsGpfs(TimeseriesPatterns): 6 | requiredMetrics = property(lambda self: ["gpfs.fsios.read_bytes", "gpfs.fsios.write_bytes"]) 7 | name = property(lambda self: "timeseries_patterns_gpfs") 8 | 9 | def __init__(self, job): 10 | super(TimeseriesPatternsGpfs, self).__init__(job) 11 | -------------------------------------------------------------------------------- /src/supremm/plugins/TotalMemUsageTimeseries.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Timeseries generator module """ 3 | 4 | from supremm.plugin import Plugin 5 | from supremm.subsample import TimeseriesAccumulator 6 | import numpy 7 | from collections import Counter 8 | 9 | class TotalMemUsageTimeseries(Plugin): 10 | """ Generate the CPU usage as a timeseries data """ 11 | 12 | name = property(lambda x: "memused") 13 | mode = property(lambda x: "timeseries") 14 | requiredMetrics = property(lambda x: ["mem.numa.util.used"]) 15 | optionalMetrics = property(lambda x: []) 16 | derivedMetrics = property(lambda x: []) 17 | 18 | def __init__(self, job): 19 | super(TotalMemUsageTimeseries, self).__init__(job) 20 | self._data = TimeseriesAccumulator(job.nodecount, self._job.walltime) 21 | self._hostdata = {} 22 | self._hostdevnames = {} 23 | 24 | def process(self, nodemeta, timestamp, data, description): 25 | 26 | hostidx = nodemeta.nodeindex 27 | 28 | if len(data[0]) == 0: 29 | # Skip data point with no data 30 | return True 31 | 32 | if nodemeta.nodeindex not in self._hostdata: 33 | self._hostdata[hostidx] = numpy.empty((TimeseriesAccumulator.MAX_DATAPOINTS, len(data[0]))) 34 | self._hostdevnames[hostidx] = dict((str(k), "numa " + v) for k, v in zip(description[0][0], description[0][1])) 35 | 36 | nodemem_gb = numpy.sum(data[0]) / 1048576.0 37 | insertat = self._data.adddata(hostidx, timestamp, nodemem_gb) 38 | if insertat != None: 39 | self._hostdata[hostidx][insertat] = data[0] / 1048576.0 40 | 41 | return True 42 | 43 | def results(self): 44 | 45 | values = self._data.get() 46 | 47 | if len(self._hostdata) > 64: 48 | 49 | # Compute min, max & median data and only save the host data 50 | # for these hosts 51 | 52 | memdata = values[:, :, 1] 53 | sortarr = numpy.argsort(memdata.T, axis=1) 54 | 55 | retdata = { 56 | "min": self.collatedata(sortarr[:, 0], memdata), 57 | 
"max": self.collatedata(sortarr[:, -1], memdata), 58 | "med": self.collatedata(sortarr[:, sortarr.shape[1] // 2], memdata), 59 | "times": values[0, :, 0].tolist(), 60 | "hosts": {} 61 | } 62 | 63 | uniqhosts = Counter(sortarr[:, 0]) 64 | uniqhosts.update(sortarr[:, -1]) 65 | uniqhosts.update(sortarr[:, sortarr.shape[1] // 2]) 66 | includelist = list(uniqhosts.keys()) 67 | else: 68 | # Save data for all hosts 69 | retdata = { 70 | "times": values[0, :, 0].tolist(), 71 | "hosts": {} 72 | } 73 | includelist = list(self._hostdata.keys()) 74 | 75 | 76 | for hostidx in includelist: 77 | retdata['hosts'][str(hostidx)] = {} 78 | retdata['hosts'][str(hostidx)]['all'] = values[hostidx, :, 1].tolist() 79 | retdata['hosts'][str(hostidx)]['dev'] = {} 80 | 81 | for devid in self._hostdevnames[hostidx].keys(): 82 | dpnts = len(values[hostidx, :, 0]) 83 | retdata['hosts'][str(hostidx)]['dev'][devid] = self._hostdata[hostidx][:dpnts, numpy.int(devid)].tolist() 84 | 85 | retdata['hosts'][str(hostidx)]['names'] = self._hostdevnames[hostidx] 86 | 87 | return retdata 88 | 89 | @staticmethod 90 | def collatedata(args, rates): 91 | """ build output data """ 92 | result = [] 93 | for timepoint, hostidx in enumerate(args): 94 | try: 95 | result.append([rates[hostidx, timepoint], int(hostidx)]) 96 | except IndexError: 97 | pass 98 | 99 | return result 100 | -------------------------------------------------------------------------------- /src/supremm/plugins/UncoreCounters.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Socket level performance counter plugin """ 3 | 4 | from supremm.plugin import Plugin 5 | from supremm.statistics import calculate_stats 6 | from supremm.errors import ProcessingError 7 | import numpy 8 | 9 | SNB_METRICS = ["perfevent.hwcounters.snbep_unc_imc0__UNC_M_CAS_COUNT_RD.value", 10 | "perfevent.hwcounters.snbep_unc_imc0__UNC_M_CAS_COUNT_WR.value", 11 | "perfevent.hwcounters.snbep_unc_imc1__UNC_M_CAS_COUNT_RD.value", 12 | "perfevent.hwcounters.snbep_unc_imc1__UNC_M_CAS_COUNT_WR.value", 13 | "perfevent.hwcounters.snbep_unc_imc2__UNC_M_CAS_COUNT_RD.value", 14 | "perfevent.hwcounters.snbep_unc_imc2__UNC_M_CAS_COUNT_WR.value", 15 | "perfevent.hwcounters.snbep_unc_imc3__UNC_M_CAS_COUNT_RD.value", 16 | "perfevent.hwcounters.snbep_unc_imc3__UNC_M_CAS_COUNT_WR.value"] 17 | 18 | IVB_METRICS = ["perfevent.hwcounters.ivbep_unc_imc0__UNC_M_CAS_COUNT_RD.value", 19 | "perfevent.hwcounters.ivbep_unc_imc0__UNC_M_CAS_COUNT_WR.value", 20 | "perfevent.hwcounters.ivbep_unc_imc1__UNC_M_CAS_COUNT_RD.value", 21 | "perfevent.hwcounters.ivbep_unc_imc1__UNC_M_CAS_COUNT_WR.value", 22 | "perfevent.hwcounters.ivbep_unc_imc2__UNC_M_CAS_COUNT_RD.value", 23 | "perfevent.hwcounters.ivbep_unc_imc2__UNC_M_CAS_COUNT_WR.value", 24 | "perfevent.hwcounters.ivbep_unc_imc3__UNC_M_CAS_COUNT_RD.value", 25 | "perfevent.hwcounters.ivbep_unc_imc3__UNC_M_CAS_COUNT_WR.value"] 26 | 27 | NHM_METRICS = ["perfevent.hwcounters.UNC_LLC_MISS_READ.value", 28 | "perfevent.hwcounters.UNC_LLC_MISS_WRITE.value"] 29 | 30 | INTERLAGOS_METRICS = ["perfevent.hwcounters.L3_CACHE_MISSES_ALL.value"] 31 | 32 | class UncoreCounters(Plugin): 33 | """ Compute various uncore performance counter derived metrics """ 34 | 35 | name = property(lambda x: "uncperf") 36 | mode = property(lambda x: "firstlast") 37 | requiredMetrics = property(lambda x: [SNB_METRICS, IVB_METRICS, NHM_METRICS, INTERLAGOS_METRICS]) 38 | optionalMetrics = property(lambda x: []) 39 | derivedMetrics = property(lambda x: 
[]) 40 | 41 | def __init__(self, job): 42 | super(UncoreCounters, self).__init__(job) 43 | self._first = {} 44 | self._data = {} 45 | self._error = None 46 | 47 | def process(self, nodemeta, timestamp, data, description): 48 | 49 | if self._job.getdata('perf')['active'] != True: 50 | self._error = ProcessingError.RAW_COUNTER_UNAVAILABLE 51 | return False 52 | 53 | ndata = numpy.array(data) 54 | 55 | if nodemeta.nodename not in self._first: 56 | self._first[nodemeta.nodename] = ndata 57 | return True 58 | 59 | if ndata.shape == self._first[nodemeta.nodename].shape: 60 | self._data[nodemeta.nodename] = numpy.sum(ndata - self._first[nodemeta.nodename]) 61 | if numpy.any(numpy.fabs(self._data[nodemeta.nodename]) != self._data[nodemeta.nodename]): 62 | self._error = ProcessingError.PMDA_RESTARTED_DURING_JOB 63 | return False 64 | else: 65 | # Perf counters changed during the job 66 | self._error = ProcessingError.RAW_COUNTER_UNAVAILABLE 67 | return False 68 | 69 | return True 70 | 71 | def results(self): 72 | 73 | if self._error != None: 74 | return {"error": self._error} 75 | 76 | nhosts = len(self._data) 77 | 78 | if nhosts < 1: 79 | return {"error": ProcessingError.INSUFFICIENT_DATA} 80 | 81 | membw = numpy.zeros(nhosts) 82 | for hostindex, data in enumerate(self._data.values()): 83 | membw[hostindex] = data * 64.0 84 | 85 | results = {"membw": calculate_stats(membw)} 86 | return results 87 | -------------------------------------------------------------------------------- /src/supremm/plugins/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubccr/supremm/732b8c696e4a8926b7d29317c9ec95f16d6f87e3/src/supremm/plugins/__init__.py -------------------------------------------------------------------------------- /src/supremm/preprocessors/HardwareInventory.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ hardware inventory pre-processor """ 3 | 4 | from supremm.plugin import PreProcessor 5 | from supremm.statistics import calculate_stats 6 | 7 | class HardwareInventory(PreProcessor): 8 | """ Parse and analyse hardware inventory information. Currently 9 | grabs the number of CPU cores for each host. 
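The per-host core counts are attached to the job under the key 'hinv' via the job's adddata() method, where plugins such as LoadAvg use them to report per-core load statistics.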
10 | """ 11 | 12 | name = property(lambda x: "hinv") 13 | mode = property(lambda x: "timeseries") 14 | requiredMetrics = property(lambda x: [["kernel.percpu.cpu.user"], ["hinv.ncpu"]]) 15 | optionalMetrics = property(lambda x: []) 16 | derivedMetrics = property(lambda x: []) 17 | 18 | def __init__(self, job): 19 | super(HardwareInventory, self).__init__(job) 20 | self.hostname = None 21 | self.corecount = None 22 | self.data = {} 23 | self.cores = [] 24 | 25 | def hoststart(self, hostname): 26 | self.hostname = hostname 27 | 28 | def process(self, timestamp, data, description): 29 | 30 | if len(data) == 1 and data[0][:, 0].size > 0: 31 | if data[0][0, 1] == -1: 32 | self.corecount = data[0][0, 0] 33 | else: 34 | self.corecount = data[0][:, 0].size 35 | # Have sufficient information, therefore return False to prevent 36 | # any further callbacks 37 | return False 38 | 39 | return True 40 | 41 | def hostend(self): 42 | if self.corecount != None: 43 | self.data[self.hostname] = {'cores': self.corecount} 44 | self.cores.append(self.corecount) 45 | 46 | self.corecount = None 47 | self.hostname = None 48 | 49 | self._job.adddata(self.name, self.data) 50 | 51 | def results(self): 52 | return {"cores": calculate_stats(self.cores)} 53 | 54 | -------------------------------------------------------------------------------- /src/supremm/preprocessors/PerfEvent.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ performance counters pre-processor """ 3 | 4 | from supremm.plugin import PreProcessor 5 | 6 | class PerfEvent(PreProcessor): 7 | """ The hardware performance counters are only valid if they were 8 | active and counting for the whole job. This preproc checks the active 9 | flag at all timepoints and the result is avaiable to all the plugins that 10 | use hardware counters. 11 | """ 12 | 13 | name = property(lambda x: "perf") 14 | mode = property(lambda x: "timeseries") 15 | requiredMetrics = property(lambda x: ["perfevent.active"]) 16 | optionalMetrics = property(lambda x: []) 17 | derivedMetrics = property(lambda x: []) 18 | 19 | def __init__(self, job): 20 | super(PerfEvent, self).__init__(job) 21 | self.perfactive = None 22 | 23 | def hoststart(self, hostname): 24 | pass 25 | 26 | def process(self, timestamp, data, description): 27 | 28 | if self.perfactive == False: 29 | return False 30 | 31 | if len(data) == 1 and data[0].shape == (1, 2) and data[0][:, 0].size > 0: 32 | self.perfactive = data[0][0, 0] != 0 33 | return self.perfactive 34 | 35 | return True 36 | 37 | def hostend(self): 38 | self._job.adddata(self.name, {"active": self.perfactive}) 39 | 40 | def results(self): 41 | return None 42 | 43 | -------------------------------------------------------------------------------- /src/supremm/preprocessors/ProcPrometheus.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Proc information pre-processor """ 3 | 4 | import re 5 | 6 | from supremm.preprocessors.Proc import Proc 7 | from supremm.linuxhelpers import parsecpusallowed 8 | 9 | 10 | class ProcPrometheus(Proc): 11 | """ Parse and analyse the proc information for a job. Supports parsing the cgroup information 12 | from SLRUM and PBS/Torque (if available). 
13 | """ 14 | 15 | requiredMetrics = property(lambda x: ["prom:cgroup_cpu_info", 16 | "prom:cgroup_process_exec_count"]) 17 | 18 | optionalMetrics = property(lambda x: []) 19 | derivedMetrics = property(lambda x: []) 20 | 21 | def __init__(self, job): 22 | super(ProcPrometheus, self).__init__(job) 23 | 24 | def process(self, timestamp, data, description): 25 | """ Override Proc process() method """ 26 | # Set self.cgroupcpuset here using parsecpusallowed 27 | # The cgroupcpuset is returned as part of the description query 28 | if self.cpusallowed is None: 29 | allcores = set() 30 | try: 31 | for cpuset in description[0].values(): 32 | allcores |= parsecpusallowed(cpuset) 33 | if len(allcores) > 0: 34 | self.cpusallowed = allcores 35 | except ValueError: 36 | # Empty cpuset info seen in the wild - should get populated at 37 | # next timestep 38 | pass 39 | 40 | # All processes from the exporter are constrained 41 | for procname in description[1].values(): 42 | self.output['procDump']['constrained'][procname] += 1 43 | 44 | return True 45 | -------------------------------------------------------------------------------- /src/supremm/preprocessors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubccr/supremm/732b8c696e4a8926b7d29317c9ec95f16d6f87e3/src/supremm/preprocessors/__init__.py -------------------------------------------------------------------------------- /src/supremm/processhelpers.py: -------------------------------------------------------------------------------- 1 | """ various deprecated helper functions """ 2 | import os 3 | 4 | 5 | def get_utc_environ(): 6 | """ 7 | Creates a copy of this process' environment variables with the timezone 8 | variable set to UTC and returns it. 9 | 10 | Returns: 11 | A copy of os.environ with "TZ" set to "UTC". 12 | """ 13 | utc_environ = os.environ.copy() 14 | utc_environ["TZ"] = "UTC" 15 | return utc_environ 16 | 17 | 18 | def log_pipe(pipe, logging_function, template="%s"): 19 | """ 20 | Logs each non-empty line from a pipe (or other file-like object) 21 | using the given logging function. This will block until the end of 22 | the pipe is reached. 23 | 24 | Args: 25 | pipe: The pipe to read from. 26 | logging_function: The logging function to use. 27 | template: (Optional) A template string to place each line from pipe 28 | inside. 29 | """ 30 | if (not pipe) or (not logging_function): 31 | return 32 | 33 | for line in pipe: 34 | stripped_line = line.rstrip() 35 | if stripped_line: 36 | logging_function(template % stripped_line) 37 | 38 | 39 | def exists_ok_makedirs(path): 40 | """ 41 | A wrapper for os.makedirs that does not throw an exception 42 | if the given path points to an existing directory. 43 | 44 | Args: 45 | path: The path to the directory to create. 46 | Throws: 47 | EnvironmentError: Thrown if the directory could not be created. 
48 | """ 49 | 50 | try: 51 | os.makedirs(path) 52 | except EnvironmentError: 53 | if not os.path.isdir(path): 54 | raise 55 | -------------------------------------------------------------------------------- /src/supremm/rangechange.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import numpy 3 | 4 | class DataCache(object): 5 | """ Helper class that remembers the last value that it was passed """ 6 | def __init__(self): 7 | self.mdata = None 8 | self.timestamp = None 9 | self.data = None 10 | self.description = None 11 | 12 | def name(self): 13 | """ returns the name """ 14 | return 'datacache' 15 | 16 | def process(self, mdata, timestamp, data, description): 17 | """ process call """ 18 | self.mdata = mdata 19 | self.timestamp = timestamp 20 | self.data = data 21 | self.description = description 22 | 23 | def docallback(self, analytic): 24 | """ call the analytic with the paramerters from the most recent call to 25 | process (if any) """ 26 | if self.timestamp != None: 27 | return analytic.process(self.mdata, self.timestamp, self.data, self.description) 28 | else: 29 | return True 30 | 31 | class RangeChange(object): 32 | """ Convert counters that have < 64 bits to 64 bits """ 33 | def __init__(self, configobj): 34 | try: 35 | self.config = configobj.getsection('normalization') 36 | except KeyError: 37 | self.config = [] 38 | 39 | self._passthrough = False 40 | self.accumulator = [] 41 | self.last = [] 42 | self.needsfixup = [] 43 | 44 | def set_fetched_metrics(self, metriclist): 45 | """ sets the list of metrics that will be passed to the normalise_data function 46 | This resets the internal state of the object """ 47 | 48 | self.accumulator = [None] * len(metriclist) 49 | self.last = [None] * len(metriclist) 50 | self.needsfixup = [] 51 | self._passthrough = True 52 | 53 | for metric in metriclist: 54 | if metric in self.config: 55 | self.needsfixup.append(self.config[metric]) 56 | self._passthrough = False 57 | else: 58 | self.needsfixup.append(None) 59 | 60 | @property 61 | def passthrough(self): 62 | """ Returns whether the range changer will not modify data """ 63 | return self._passthrough 64 | 65 | def normalise_data(self, timestamp, data): 66 | """ Convert the data if needed """ 67 | 68 | if self._passthrough: 69 | return 70 | 71 | i = 0 72 | for datum in data: 73 | 74 | if self.needsfixup[i] is None: 75 | i += 1 76 | continue 77 | 78 | if len(datum) == 0: 79 | # Ignore entries with no data - this typically occurs when the 80 | # plugin requests multiple metrics and the metrics do not all appear 81 | # at every timestep 82 | i += 1 83 | continue 84 | 85 | if self.accumulator[i] is None: 86 | self.accumulator[i] = numpy.array(datum) 87 | self.last[i] = numpy.array(datum) 88 | else: 89 | self.accumulator[i] += (datum - self.last[i]) % numpy.uint64(1 << self.needsfixup[i]['range']) 90 | numpy.copyto(self.last[i], datum) 91 | numpy.copyto(datum, self.accumulator[i]) 92 | 93 | i += 1 94 | 95 | 96 | -------------------------------------------------------------------------------- /src/supremm/scripthelpers.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin env python 2 | """ common functions used in the command line scripts """ 3 | 4 | import re 5 | import datetime 6 | import pymysql 7 | import pymysql.cursors 8 | import sys 9 | import logging 10 | 11 | def parsetime(strtime): 12 | """ Try to be flexible in the time formats supported: 13 | 1) unixtimestamp prefixed with @ 
14 | 2) year-month-day zero-padded 15 | 3) year-month-day hour:minute:second zero padded optional T between date and time 16 | 4) locale specific format 17 | """ 18 | m = re.search(r"^@(\d*)$", strtime) 19 | if m: 20 | return datetime.datetime.fromtimestamp(int(m.group(1))) 21 | if re.search(r"^\d{4}-\d{2}-\d{2}$", strtime): 22 | return datetime.datetime.strptime(strtime, "%Y-%m-%d") 23 | m = re.search(r"^(\d{4}-\d{2}-\d{2}).(\d{2}:\d{2}:\d{2})$", strtime) 24 | if m: 25 | return datetime.datetime.strptime(m.group(1) + " " + m.group(2), "%Y-%m-%d %H:%M:%S") 26 | 27 | return datetime.datetime.strptime(strtime, "%c") 28 | 29 | def getdbconnection(configsection, as_dict=False, defaultargs={}): 30 | """ Helper function that gets a database connection object from a config dictionary """ 31 | 32 | dbengine = configsection['dbengine'] if 'dbengine' in configsection else 'MySQLDB' 33 | 34 | if dbengine == 'MySQLDB': 35 | 36 | dbargs = defaultargs.copy() 37 | # Convert the external configuration names to python PEP-249 config names 38 | translate = {"host": "host", 39 | "defaultsfile": "read_default_file", 40 | "user": "user", 41 | "pass": "passwd", 42 | "port": "port"} 43 | 44 | for confval, myval in translate.items(): 45 | if confval in configsection: 46 | dbargs[myval] = configsection[confval] 47 | 48 | if as_dict: 49 | dbargs['cursorclass'] = pymysql.cursors.DictCursor 50 | 51 | dbargs['local_infile'] = 1 52 | 53 | return pymysql.connect(**dbargs) 54 | else: 55 | raise Exception("Unsupported database engine %s" % (dbengine)) 56 | 57 | def setuplogger(consolelevel, filename=None, filelevel=None): 58 | """ setup the python root logger to log to the console with defined log 59 | level. Optionally also log to file with the provided level """ 60 | 61 | if filelevel == None: 62 | filelevel = consolelevel 63 | 64 | if sys.version.startswith("2.7"): 65 | logging.captureWarnings(True) 66 | 67 | rootlogger = logging.getLogger() 68 | rootlogger.setLevel(min(consolelevel, filelevel)) 69 | 70 | formatter = logging.Formatter('%(asctime)s.%(msecs)03d [%(levelname)s] %(message)s', datefmt='%Y-%m-%dT%H:%M:%S') 71 | 72 | if filename != None: 73 | filehandler = logging.FileHandler(filename) 74 | filehandler.setLevel(filelevel) 75 | filehandler.setFormatter(formatter) 76 | rootlogger.addHandler(filehandler) 77 | 78 | consolehandler = logging.StreamHandler() 79 | consolehandler.setLevel(consolelevel) 80 | consolehandler.setFormatter(formatter) 81 | rootlogger.addHandler(consolehandler) 82 | 83 | -------------------------------------------------------------------------------- /src/supremm/statistics.py: -------------------------------------------------------------------------------- 1 | """ Various utilities for calculating statistics """ 2 | import math 3 | import numpy 4 | import scipy.stats 5 | 6 | 7 | class Integrator(object): 8 | """ Helper class to itegrate data """ 9 | def __init__(self, x): 10 | self._x0 = x 11 | self._total = numpy.zeros_like(x) 12 | self._elapsed = 0.0 13 | 14 | def add(self, x, y): 15 | """ Add data to the accumulator """ 16 | delta_x = x - self._x0 17 | self._x0 = x 18 | 19 | self._total = y * delta_x + self._total 20 | self._elapsed += delta_x 21 | 22 | @property 23 | def total(self): 24 | """ get the total value """ 25 | return self._total 26 | 27 | @property 28 | def elapsed(self): 29 | """ Sum of segments """ 30 | return self._elapsed 31 | 32 | def calculate_stats(v): 33 | res = {} 34 | 35 | if len(v) == 1: 36 | return {'avg': float(v[0]), 'cnt': 1} 37 | 38 | if len(v) > 0: 39 | (v_n, 
(v_min, v_max), v_avg, v_var, v_skew, v_kurt) = scipy.stats.describe(v) 40 | 41 | if v_min == v_max: 42 | return {'avg': float(v[0]), 'cnt': len(v)} 43 | 44 | res['max'] = float(v_max) 45 | res['avg'] = v_avg 46 | res['krt'] = v_kurt 47 | res['min'] = float(v_min) 48 | res['skw'] = v_skew 49 | res['cnt'] = len(v) 50 | if res['min'] == res['max']: 51 | res['med'] = res['min'] 52 | res['std'] = 0.0 53 | else: 54 | res['med'] = float(numpy.median(v, axis=0)) 55 | if len(v) > 2: 56 | res['std'] = scipy.stats.tstd(v) 57 | 58 | if v_avg > 0: 59 | res['cov'] = math.sqrt(v_var) / v_avg 60 | 61 | return res 62 | 63 | 64 | class RollingStats(object): 65 | """ Uses Welford's method [1] to compute the mean and stddev of 66 | a series for data without storing all datapoints. 67 | 68 | Data should be added to the class instance using the append() 69 | function and the summary statistics can be accessed using get() 70 | 71 | [1] B. P. Welford (1962) Note on a Method for Calculating 72 | Corrected Sums of Squares and Products, Technometrics, 73 | 4:3, 419-420, DOI: 10.1080/00401706.1962.10490022 74 | """ 75 | def __init__(self): 76 | self._count = 0 77 | 78 | def append(self, x): 79 | """ append a datum. """ 80 | self._count += 1 81 | 82 | if self._count == 1: 83 | self.m = x 84 | self.last_m = x 85 | self.last_s = 0.0 86 | self.min = x 87 | self.max = x 88 | else: 89 | self.m = self.last_m + (x - self.last_m) / self._count 90 | self.s = self.last_s + (x - self.last_m) * (x - self.m) 91 | 92 | self.last_m = self.m 93 | self.last_s = self.s 94 | 95 | self.min = numpy.minimum(self.min, x) 96 | self.max = numpy.maximum(self.max, x) 97 | 98 | def get(self): 99 | """ return a dict with the various statistics """ 100 | return {'avg': self.mean(), 'min': self.min, 'max': self.max, 'cnt': self._count, 'std': math.sqrt(self.variance())} 101 | 102 | def mean(self): 103 | """ return the mean """ 104 | if self._count > 0: 105 | return self.m 106 | return 0.0 107 | 108 | def count(self): 109 | """ returns the number of data points that have been processed """ 110 | return self._count 111 | 112 | def variance(self): 113 | """ Return the variance of the data """ 114 | if self._count > 1: 115 | return self.s / (self._count - 1) 116 | return 0.0 117 | 118 | def __str__(self): 119 | return str(self.get()) 120 | 121 | def test(): 122 | """ test """ 123 | indata = [0.1, 0.2, 0.3, 0.4, 0.4, 0.5, 0.1, 0.4] 124 | 125 | stats = RollingStats() 126 | for i in indata: 127 | stats.append(i) 128 | 129 | print(stats.get()) 130 | print(calculate_stats(indata)) 131 | 132 | if __name__ == "__main__": 133 | test() 134 | 135 | -------------------------------------------------------------------------------- /src/supremm/subsample.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ Timeseries subsampling module """ 3 | import numpy 4 | 5 | 6 | class TimeseriesAccumulator(object): 7 | """ Stores a subset of time-value pairs for a dataseries """ 8 | MAX_DATAPOINTS = 100 9 | LEAD_IN_DATAPOINTS = 10 10 | 11 | def __init__(self, nhosts, totaltime): 12 | self._totaltime = totaltime 13 | self._samplewindow = None 14 | self._leadout = None 15 | self._data = numpy.empty((nhosts, TimeseriesAccumulator.MAX_DATAPOINTS, 2)) 16 | self._count = numpy.zeros(nhosts, dtype=int) 17 | 18 | def adddata(self, hostidx, timestamp, value): 19 | """ Add a datapoint to the collection. 
20 | The sampling algorithm is as follows: the first LEAD_IN_DATAPOINTS points are 21 | always added. Then the sample interval is computed, and one datapoint 22 | per interval is collected. Near the end of the job, all points are 23 | collected again (based on the amount of time taken to collect the first LEAD_IN_DATAPOINTS points). 24 | 25 | The sampling algorithm could be changed to try to capture more fine 26 | detail by changing the sample interval in response to the rate of 27 | change of the value (longer sample interval when there is little 28 | change, shorter when change is occurring). But this is left as an 29 | exercise for the reader. 30 | """ 31 | if self._count[hostidx] <= TimeseriesAccumulator.LEAD_IN_DATAPOINTS: 32 | idx = self._append(hostidx, timestamp, value) 33 | return idx 34 | 35 | if self._samplewindow is None: 36 | # compute the sample window based on the first host to get past the lead-in 37 | leadin = self._data[hostidx, TimeseriesAccumulator.LEAD_IN_DATAPOINTS, 0] - self._data[hostidx, 0, 0] 38 | self._samplewindow = (self._totaltime - (2.0 * leadin)) / (TimeseriesAccumulator.MAX_DATAPOINTS - 2 * TimeseriesAccumulator.LEAD_IN_DATAPOINTS) 39 | self._leadout = self._data[hostidx, 0, 0] + self._totaltime - leadin 40 | 41 | if ((timestamp > self._leadout) or (timestamp > self._data[hostidx, self._count[hostidx] - 1, 0] + self._samplewindow)) and self._count[hostidx] < TimeseriesAccumulator.MAX_DATAPOINTS: 42 | idx = self._append(hostidx, timestamp, value) 43 | return idx 44 | 45 | return None 46 | 47 | def _append(self, hostidx, timestamp, value): 48 | """ Add this data to the store """ 49 | insertidx = self._count[hostidx] 50 | self._data[hostidx, insertidx, 0] = timestamp 51 | self._data[hostidx, insertidx, 1] = value 52 | self._count[hostidx] += 1 53 | return insertidx 54 | 55 | def gethost(self, hostidx): 56 | """ return the data series """ 57 | return self._data[hostidx, :self._count[hostidx], :] 58 | 59 | def get(self): 60 | """ TODO numpy interp """ 61 | return self._data[:, :numpy.min(self._count), :] 62 | 63 | def __str__(self): 64 | return str(self._data[:, :numpy.min(self._count), :]) 65 | 66 | 67 | class RangeConverter(object): 68 | """ 69 | Convert data from limited width to 64-bit width. Optionally raise an exception if 70 | the counters spin too fast. 71 | """ 72 | 73 | def __init__(self, precision, checkoverflow=False): 74 | self._range = pow(2.0, precision) 75 | self._last = None 76 | self._accumulator = None 77 | self._checkoverflow = checkoverflow 78 | 79 | def append(self, indata): 80 | """ add updated data and return stored value """ 81 | value = numpy.array(indata) 82 | 83 | if self._last is not None: 84 | delta = (value - self._last) % self._range 85 | 86 | if self._checkoverflow: 87 | if delta > (self._range / 2.0): 88 | raise Exception("Counter overflow") 89 | self._accumulator += delta 90 | else: 91 | self._accumulator = numpy.zeros(value.shape) 92 | 93 | self._last = value 94 | 95 | return self._accumulator 96 | 97 | def get(self): 98 | """ get current stored value """ 99 | return self._accumulator 100 | -------------------------------------------------------------------------------- /src/supremm/summarize.py: -------------------------------------------------------------------------------- 1 | """ Definition of the summarize API """ 2 | from abc import ABC, abstractmethod 3 | 4 | VERSION = "1.0.6" 5 | TIMESERIES_VERSION = 4 6 | 7 | 8 | class Summarize(ABC): 9 | """ Abstract base class describing the job summarization interface.
10 | """ 11 | 12 | def __init__(self, preprocessors, analytics, job, config, fail_fast=False): 13 | self.preprocs = preprocessors 14 | self.alltimestamps = [x for x in analytics if x.mode in ("all", "timeseries")] 15 | self.firstlast = [x for x in analytics if x.mode == "firstlast"] 16 | self.errors = {} 17 | self.job = job 18 | self.fail_fast = fail_fast 19 | 20 | self.version = VERSION 21 | self.timeseries_version = TIMESERIES_VERSION 22 | 23 | @abstractmethod 24 | def get(self): 25 | """ Return a dict with the summary information """ 26 | pass 27 | 28 | def adderror(self, category, errormsg): 29 | """ All errors reported with this function show up in the job summary """ 30 | if category not in self.errors: 31 | self.errors[category] = set() 32 | if isinstance(errormsg, list): 33 | self.errors[category].update(set(errormsg)) 34 | else: 35 | self.errors[category].add(errormsg) 36 | 37 | @abstractmethod 38 | def process(self): 39 | """ Main entry point. All of a job's nodes are processed """ 40 | pass 41 | 42 | @abstractmethod 43 | def complete(self): 44 | """ A job is complete if data exist for all assigned nodes and they have 45 | been processed successfully 46 | """ 47 | pass 48 | 49 | @abstractmethod 50 | def good_enough(self): 51 | """ A job is good_enough if archives for 95% of nodes have 52 | been processed successfully 53 | """ 54 | pass 55 | -------------------------------------------------------------------------------- /src/supremm/supremm_update: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | #------------------------------------------------------------------------- 4 | # Configurable settings 5 | 6 | LOCKFILE=/var/tmp/supremm_summary.lock 7 | 8 | THREADS=`nproc --ignore=2` 9 | 10 | ulimit -n4096 11 | 12 | #------------------------------------------------------------------------- 13 | # Main script 14 | 15 | reportfail() 16 | { 17 | echo "Not running due to another process holding the lock" 18 | exit 1 19 | } 20 | 21 | ( 22 | flock -n 9 || reportfail 23 | 24 | set -e 25 | 26 | #------------------------------------------------------------------------- 27 | # Run index and ingest 28 | 29 | if [ "$1" != "process" ]; then 30 | indexarchives.py -t $THREADS -q 31 | summarize_jobs.py -t $THREADS -q 32 | else 33 | summarize_jobs.py -t $THREADS -d 34 | fi 35 | 36 | ) 9>${LOCKFILE} 37 | 38 | -------------------------------------------------------------------------------- /src/supremm/supremm_upgrade.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ supremm-upgrade script used to alter database or config files to the latest 3 | schema versions """ 4 | 5 | import argparse 6 | import signal 7 | import sys 8 | 9 | def signalHandler(sig, _): 10 | """ clean exit on an INT signal """ 11 | if sig == signal.SIGINT: 12 | sys.exit(0) 13 | 14 | def main(): 15 | """ main entry point """ 16 | parser = argparse.ArgumentParser(description='Upgrade the SUPReMM database and config files') 17 | parser.add_argument('-v', '--verbose', action='store_true', help='Output info level logging') 18 | parser.add_argument('-d', '--debug', action='store_true', help='Output debug level logging') 19 | parser.add_argument('-q', '--quiet', action='store_true', help='Output warning level logging') 20 | 21 | opts = parser.parse_args() 22 | 23 | signal.signal(signal.SIGINT, signalHandler) 24 | 25 | # Nothing to do for a 1.1 to 1.2 upgrade.
26 | 27 | if __name__ == "__main__": 28 | main() 29 | -------------------------------------------------------------------------------- /src/supremm/supremmconf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ helper utiility to print out config info """ 3 | 4 | import sys 5 | import os 6 | import json 7 | import logging 8 | from getopt import getopt 9 | 10 | from supremm.config import Config 11 | from supremm.scripthelpers import setuplogger 12 | 13 | def usage(): 14 | """ print usage """ 15 | print("usage: {0} [OPTS]".format(os.path.basename(__file__))) 16 | print(" -d --debug set log level to debug") 17 | print(" -c --config specify the path to the configuration file") 18 | print(" -s --section SECTION output the configuration data from the specified section") 19 | print(" -i --item ITEM output the configuration data for the specified item") 20 | print(" -h --help print this help message") 21 | 22 | def getoptions(): 23 | """ process comandline options """ 24 | 25 | retdata = {"log" : logging.ERROR, 26 | "config" : None, 27 | "section": None, 28 | "item" : None} 29 | 30 | opts, _ = getopt(sys.argv[1:], "dc:s:i:h", ["debug", "config=", "section=", "item=", "help"]) 31 | 32 | for opt in opts: 33 | if opt[0] in ("-d", "--debug"): 34 | retdata['log'] = logging.DEBUG 35 | if opt[0] in ("-c", "--config"): 36 | retdata['config'] = opt[1] 37 | if opt[0] in ("-s", "--section"): 38 | retdata['section'] = opt[1] 39 | if opt[0] in ("-i", "--item"): 40 | retdata['item'] = opt[1] 41 | if opt[0] in ("-h", "--help"): 42 | usage() 43 | sys.exit(0) 44 | 45 | if 'section' in retdata: 46 | return retdata 47 | 48 | usage() 49 | sys.exit(1) 50 | 51 | def main(): 52 | """ print out config data according to cmdline args """ 53 | opts = getoptions() 54 | 55 | setuplogger(opts['log']) 56 | 57 | if opts['config']: 58 | logging.debug("Using specified path: {}".format(opts['config'])) 59 | else: 60 | logging.debug("Automatically detecting configuration path.") 61 | 62 | try: 63 | conf = Config(opts['config']) 64 | except: 65 | logging.error("Configuration could not be found.") 66 | sys.exit(1) 67 | 68 | if not opts['section']: 69 | print(conf) 70 | sys.exit(0) 71 | 72 | try: 73 | section = conf.getsection(opts['section']) 74 | except KeyError: 75 | logging.error("Section '{}' not defined in configuration file.".format(opts['section'])) 76 | sys.exit(1) 77 | 78 | if opts['item']: 79 | try: 80 | item = section[opts['item']] 81 | except KeyError: 82 | logging.error("Item '{}' not defined in section '{}'.".format(opts['item'], opts['section'])) 83 | sys.exit(1) 84 | 85 | if isinstance(item, dict): 86 | item = json.dumps(item, indent=4) 87 | 88 | print(item) 89 | 90 | else: 91 | print(json.dumps(section, indent=4)) 92 | 93 | if __name__ == "__main__": 94 | main() 95 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubccr/supremm/732b8c696e4a8926b7d29317c9ec95f16d6f87e3/tests/__init__.py -------------------------------------------------------------------------------- /tests/ci/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM rockylinux:8 2 | VOLUME /tmp/supremm 3 | COPY setup.sh /sbin/setup.sh 4 | COPY build.sh /sbin/build.sh 5 | RUN setup.sh 6 | WORKDIR /tmp/supremm 7 | entrypoint ["build.sh"] 8 | 
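A hypothetical usage sketch (not a file from the repository): the supremmconf.py utility shown earlier resolves a configuration section and prints it, and the same Config.getsection() lookup can feed scripthelpers.getdbconnection() directly. The section name "datawarehouse" and the helper function name below are assumptions for illustration only.

#!/usr/bin/env python3
# Illustrative sketch only: combine supremm.config.Config with
# supremm.scripthelpers.getdbconnection(), mirroring what supremmconf.py does.
# The section name "datawarehouse" is an assumption, not a documented section.
import json

from supremm.config import Config
from supremm.scripthelpers import getdbconnection


def show_section_and_connect(configpath=None, sectionname="datawarehouse"):
    """ Print a configuration section and return a DB connection built from it """
    conf = Config(configpath)  # passing None mirrors supremmconf.py's auto-detection path
    section = conf.getsection(sectionname)
    print(json.dumps(section, indent=4))
    # getdbconnection() translates the config keys (host, user, pass, port, ...)
    # into PEP-249 connection arguments and returns a pymysql connection
    return getdbconnection(section, as_dict=True)


if __name__ == "__main__":
    show_section_and_connect()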
-------------------------------------------------------------------------------- /tests/ci/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | 4 | declare -a builds=("rpm" "wheel" "src") 5 | for BUILD in "${builds[@]}"; 6 | do 7 | case $BUILD in 8 | "rpm") 9 | python3 setup.py bdist_rpm 10 | ;; 11 | 12 | "wheel") 13 | python3 setup.py bdist_wheel 14 | ;; 15 | 16 | "src") 17 | tar --exclude={'*.rpm','*.whl'} -czf /tmp/supremm.tar.gz . 18 | mv /tmp/supremm.tar.gz dist 19 | ;; 20 | 21 | *) 22 | ;; 23 | esac 24 | done 25 | -------------------------------------------------------------------------------- /tests/ci/setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | dnf install -y epel-release 4 | 5 | # enable powertools repo for Cython 6 | sed -i 's/enabled=0/enabled=1/' /etc/yum.repos.d/Rocky-PowerTools.repo 7 | 8 | SETUP=$1 9 | case $SETUP in 10 | "build") 11 | dnf install -y \ 12 | gcc \ 13 | pcp-devel \ 14 | rpm-build 15 | 16 | # Install development dependencies 17 | dnf install -y \ 18 | python3-numpy \ 19 | python3-scipy \ 20 | python36-devel \ 21 | python3-Cython \ 22 | python3-pymongo \ 23 | python3-PyMySQL \ 24 | python3-pcp \ 25 | python3-requests \ 26 | python3-wheel 27 | ;; 28 | "test") 29 | # Install dependencies 30 | dnf install -y \ 31 | python3-numpy \ 32 | python3-scipy \ 33 | python36-devel \ 34 | python3-Cython \ 35 | python3-pymongo \ 36 | python3-PyMySQL \ 37 | python3-pytest \ 38 | python3-pytest-cov \ 39 | python3-mock \ 40 | python3-pexpect \ 41 | python3-pylint \ 42 | python3-pcp \ 43 | python3-pytz \ 44 | python3-requests \ 45 | pcp-devel \ 46 | ;; 47 | esac 48 | 49 | -------------------------------------------------------------------------------- /tests/ci/srv/prom_cluster.txt: -------------------------------------------------------------------------------- 1 | 123456|123456|robertson|prom|prom|mms|xdmod|1|supremm|2|2023-06-02T00:05:00|2023-06-02T00:00:00|2023-06-02T00:05:00|2023-06-02T04:05:00|0-04:05:00|0:0|COMPLETED|1|4|4|1000M|billing=1,cpu=4,mem=1000M,node=1|billing=1,cpu=4,mem=1000M,node=1|1-00:00:00|cpn-a21-01|mockjob1 2 | 789012|789012|robertson|prom|prom|mms|xdmod|1|supremm|2|2023-06-02T00:05:00|2023-06-02T00:05:00|2023-06-02T00:05:00|2023-06-02T04:05:00|0-04:05:00|0:0|COMPLETED|1|8|8|1000M|billing=8,cpu=8,mem=1000M,node=1|billing=8,cpu=8,mem=1000M,node=8|1-00:00:00|cpn-b22-04|mockjob2 3 | 345678|345678|robertson|prom|prom|mms|xdmod|1|supremm|2|2023-06-02T00:05:00|2023-06-02T00:05:00|2023-06-02T00:05:00|2023-06-02T04:05:00|0-04:05:00|0:0|COMPLETED|1|16|16|1000M|billing=16,cpu=16,mem=1000M,node=1|billing=16,cpu=16,mem=1000M,node=8|1-00:00:00|cpn-c23-04|mockjob3 4 | 901234|901234|robertson|prom|prom|mms|xdmod|1|supremm|2|2023-06-02T00:05:00|2023-06-02T00:05:00|2023-06-02T00:05:00|2023-06-02T04:05:00|0-04:05:00|0:0|COMPLETED|3|4|4|1000M|billing=4,cpu=4,mem=1000M,node=3|billing=4,cpu=4,mem=1000M,node=3|1-00:00:00|cpn-a21-01,cpn-b22-04,cpn-c23-08|mockjob4 5 | -------------------------------------------------------------------------------- /tests/ci/srv/prometheus/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM prom/prometheus:latest 2 | RUN touch /prometheus/queries.active 3 | COPY ./prometheus.yml /etc/prometheus/prometheus.yml 4 | ADD ./promdata.tar.gz /prometheus 5 | -------------------------------------------------------------------------------- 
/tests/ci/srv/prometheus/promdata.tar.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubccr/supremm/732b8c696e4a8926b7d29317c9ec95f16d6f87e3/tests/ci/srv/prometheus/promdata.tar.gz -------------------------------------------------------------------------------- /tests/ci/srv/prometheus/prometheus.yml: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: "30s" 3 | scrape_timeout: "15s" 4 | -------------------------------------------------------------------------------- /tests/ci/srv/prometheus/web.yml: -------------------------------------------------------------------------------- 1 | # TODO use this for basic auth 2 | -------------------------------------------------------------------------------- /tests/ci/srv/services.yml: -------------------------------------------------------------------------------- 1 | version: "3.9" 2 | 3 | services: 4 | prometheus: 5 | build: 6 | context: ./prometheus 7 | hostname: prometheus 8 | container_name: prometheus 9 | volumes: 10 | - type: volume 11 | source: tsdb 12 | target: /var/lib/prometheus 13 | command: 14 | - '--storage.tsdb.path=/prometheus' 15 | - '--config.file=/etc/prometheus/prometheus.yml' 16 | - '--web.config.file=/etc/prometheus/web.yml' 17 | network_mode: "host" 18 | 19 | mongod: 20 | image: 21 | mongo:6.0 22 | environment: 23 | - MONGO_INITDB_ROOT_USERNAME=supremm 24 | - MONGO_INITDB_ROOT_PASSWORD=supremm-test123 25 | volumes: 26 | - type: volume 27 | source: mongodb 28 | target: /data/db 29 | network_mode: "host" 30 | 31 | volumes: 32 | tsdb: 33 | mongodb: 34 | -------------------------------------------------------------------------------- /tests/ci/test/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM rockylinux:8 2 | VOLUME /tmp/supremm 3 | WORKDIR /tmp/supremm 4 | RUN dnf install -y epel-release && \ 5 | sed -i 's/enabled=0/enabled=1/' /etc/yum.repos.d/Rocky-PowerTools.repo 6 | COPY bootstrap.sh /usr/bin/ 7 | COPY supremm_setup_expect.py /usr/bin/ 8 | COPY entrypoint.sh /usr/local/bin/ 9 | RUN chmod u+x /usr/bin/supremm_setup_expect.py 10 | RUN chmod u+x /usr/local/bin/entrypoint.sh 11 | ENTRYPOINT ["entrypoint.sh"] 12 | -------------------------------------------------------------------------------- /tests/ci/test/bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -euxo pipefail 3 | shopt -s extglob 4 | 5 | tests/ci/setup.sh test 6 | 7 | INSTALL_TYPE=$1 8 | case $INSTALL_TYPE in 9 | "rpm") 10 | dnf install -y dist/supremm-+([0-9.])*.x86_64.rpm 11 | ;; 12 | "wheel") 13 | pip3 install dist/supremm-+([0-9.])*.whl 14 | ;; 15 | "src") 16 | tar -xzf dist/supremm.tar.gz -C /tmp 17 | python3 /tmp/setup.py install 18 | ;; 19 | esac 20 | 21 | ~/bin/services start 22 | mongod -f /etc/mongod.conf --auth 23 | 24 | mkdir -p /data/{phillips,pozidriv,frearson,mortorq,robertson}/{pcp-logs,jobscripts} 25 | mkdir -p "/data/mortorq/pcp-logs/hostname/2016/12/30" 26 | 27 | # Run setup script 28 | python3 tests/integration_tests/supremm_setup_expect.py 29 | 30 | # Copy node-level archives 31 | cp tests/integration_tests/pcp_logs_extracted/* /data/mortorq/pcp-logs/hostname/2016/12/30 32 | 33 | # Create files containing 'job scripts' for 'start' jobs 34 | jspath=/data/phillips/jobscripts/20170101 35 | mkdir $jspath 36 | for jobid in 197155 197182 197186 197199 1234234[21] 123424[] 37 | do 38 | echo 
"Job script for job $jobid" > $jspath/$jobid.savescript 39 | done 40 | 41 | # Create job scripts for a submit jobs 42 | jspath=/data/robertson/jobscripts/20161212 43 | mkdir $jspath 44 | for jobid in 6066098 45 | do 46 | echo "Job script for job $jobid" > $jspath/$jobid.savescript 47 | done 48 | 49 | # Create job script for end jobs 50 | jspath=/data/pozidriv/jobscripts/20161230 51 | mkdir $jspath 52 | for jobid in 983936 53 | do 54 | echo "Job script for job $jobid" > $jspath/$jobid.savescript 55 | done 56 | -------------------------------------------------------------------------------- /tests/component/data/perfevent.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubccr/supremm/732b8c696e4a8926b7d29317c9ec95f16d6f87e3/tests/component/data/perfevent.0 -------------------------------------------------------------------------------- /tests/component/data/perfevent.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubccr/supremm/732b8c696e4a8926b7d29317c9ec95f16d6f87e3/tests/component/data/perfevent.index -------------------------------------------------------------------------------- /tests/component/data/perfevent.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubccr/supremm/732b8c696e4a8926b7d29317c9ec95f16d6f87e3/tests/component/data/perfevent.meta -------------------------------------------------------------------------------- /tests/component/runtests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euxo pipefail 3 | DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" 4 | 5 | outputfile=`mktemp` 6 | python3 $DIR/../../src/supremm/supremm_testharness.py -c $DIR/../../config $DIR/data/perfevent > $outputfile 7 | 8 | # Check that there are data in the output for all the following 9 | jq -e .cpuperf.cpiref.avg < $outputfile 10 | jq -e .cpuperf.flops.avg < $outputfile 11 | jq -e .cpuperf.cpldref.avg < $outputfile 12 | jq -e .uncperf.membw.avg < $outputfile 13 | jq -e .timeseries.membw < $outputfile 14 | jq -e .timeseries.simdins < $outputfile 15 | jq -e .summarization.datasource < $outputfile 16 | 17 | rm -f $outputfile 18 | -------------------------------------------------------------------------------- /tests/integration_tests/5894431-1622570028/cpn-d14-02.0: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubccr/supremm/732b8c696e4a8926b7d29317c9ec95f16d6f87e3/tests/integration_tests/5894431-1622570028/cpn-d14-02.0 -------------------------------------------------------------------------------- /tests/integration_tests/5894431-1622570028/cpn-d14-02.index: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubccr/supremm/732b8c696e4a8926b7d29317c9ec95f16d6f87e3/tests/integration_tests/5894431-1622570028/cpn-d14-02.index -------------------------------------------------------------------------------- /tests/integration_tests/5894431-1622570028/cpn-d14-02.meta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubccr/supremm/732b8c696e4a8926b7d29317c9ec95f16d6f87e3/tests/integration_tests/5894431-1622570028/cpn-d14-02.meta -------------------------------------------------------------------------------- 
/tests/integration_tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ubccr/supremm/732b8c696e4a8926b7d29317c9ec95f16d6f87e3/tests/integration_tests/__init__.py -------------------------------------------------------------------------------- /tests/integration_tests/integration_plugin_api.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from mock import patch 3 | import pytest 4 | 5 | from supremm import summarize_jobs 6 | from tests.integration_tests.mock_preprocessor import MockPreprocessor 7 | from tests.integration_tests.throwing_plugin import InitThrowingPlugin, ProcessThrowingPlugin, ResultsThrowingPlugin 8 | 9 | 10 | @pytest.mark.parametrize("threads", [1, 3]) 11 | def test_plugin_api(threads): 12 | test_args = "summarize_jobs.py -d -r 2 -j 972366 --fail-fast --threads {}".format(threads).split() 13 | preprocs = [MockPreprocessor] 14 | plugins = [] 15 | # this was very non-obvious to me but since summarize_jobs does "from supremm.plugin import loadpreprocs" 16 | # you have to patch loadpreprocs as if it was in the summarize_jobs module 17 | with patch.object(sys, "argv", test_args), patch("supremm.summarize_jobs.loadpreprocessors", return_value=preprocs), patch("supremm.summarize_jobs.loadplugins", return_value=plugins): 18 | summarize_jobs.main() 19 | 20 | 21 | @pytest.mark.parametrize("threads", [1, 3]) 22 | def test_exception_init(threads): 23 | test_args = "summarize_jobs.py -d -r 2 -j 972366 --threads {}".format(threads).split() 24 | plugins = [InitThrowingPlugin] 25 | preprocs = [] 26 | with patch.object(sys, "argv", test_args), patch("supremm.summarize_jobs.loadpreprocessors", return_value=preprocs), patch("supremm.summarize_jobs.loadplugins", return_value=plugins): 27 | summarize_jobs.main() 28 | 29 | 30 | @pytest.mark.parametrize("threads", [1, 3]) 31 | def test_exception_process(threads): 32 | test_args = "summarize_jobs.py -d -r 2 -j 972366 --threads {}".format(threads).split() 33 | plugins = [ProcessThrowingPlugin] 34 | preprocs = [] 35 | with patch.object(sys, "argv", test_args), patch("supremm.summarize_jobs.loadpreprocessors", return_value=preprocs), patch("supremm.summarize_jobs.loadplugins", return_value=plugins): 36 | summarize_jobs.main() 37 | 38 | 39 | @pytest.mark.parametrize("threads", [1, 3]) 40 | def test_exception_results(threads): 41 | test_args = "summarize_jobs.py -d -r 2 -j 972366 --threads {}".format(threads).split() 42 | plugins = [ResultsThrowingPlugin] 43 | preprocs = [] 44 | with patch.object(sys, "argv", test_args), patch("supremm.summarize_jobs.loadpreprocessors", return_value=preprocs), patch("supremm.summarize_jobs.loadplugins", return_value=plugins): 45 | summarize_jobs.main() 46 | -------------------------------------------------------------------------------- /tests/integration_tests/integration_test.bash: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -euxo pipefail 4 | 5 | indexarchives.py -da 6 | summarize_jobs.py -d -r 2 -j 972366 --fail-fast 7 | 8 | # DISABLED until XDMoD is ported to Centos 8 9 | #aggregate_supremm.sh 10 | # 11 | #count=$(mysql -ss -u root < 1: 38 | p.expect('Enable SUPReMM summarization for this resource?') 39 | if i > 5: 40 | p.sendline("n") 41 | continue 42 | p.sendline("y") 43 | if i != 0: 44 | p.expect("Data collector backend \(pcp or prometheus\)") 45 | if i <= 4: 46 | config_pcp(p) 47 | elif i == 5: 48 | 
config_prometheus(p) 49 | p.expect("Source of accounting data") 50 | p.sendline() 51 | p.expect("node name unique identifier") 52 | p.sendline() 53 | p.expect("Directory containing job launch scripts") 54 | p.sendline() 55 | p.expect("Job launch script timestamp lookup mode \('submit', 'start' or 'none'\)") 56 | p.sendline(scriptsettings[i-1]) 57 | else: 58 | break 59 | 60 | p.expect("Press ENTER to continue") 61 | p.sendline() 62 | 63 | p.expect("Select an option") 64 | p.sendline("d") 65 | p.expect("Enter path to configuration files") 66 | p.sendline() 67 | p.expect("DB hostname") 68 | p.sendline() 69 | p.expect("DB port") 70 | p.sendline() 71 | p.expect("DB Admin Username") 72 | p.sendline() 73 | p.expect("DB Admin Password") 74 | p.sendline() 75 | p.expect("Do you wish to proceed") 76 | p.sendline("y") 77 | p.expect("Press ENTER to continue") 78 | p.sendline() 79 | 80 | p.expect("Select an option") 81 | p.sendline("m") 82 | p.expect("Enter path to configuration files") 83 | p.sendline() 84 | p.expect("URI") 85 | p.sendline("mongodb://localhost/supremm") 86 | p.expect("Do you wish to proceed") 87 | p.sendline("y") 88 | p.expect("Press ENTER to continue") 89 | p.sendline() 90 | 91 | p.expect("Select an option") 92 | p.sendline("q") 93 | 94 | if __name__ == '__main__': 95 | main() 96 | -------------------------------------------------------------------------------- /tests/integration_tests/throwing_plugin.py: -------------------------------------------------------------------------------- 1 | from supremm.plugin import Plugin 2 | 3 | 4 | class InitThrowingPlugin(Plugin): 5 | name = property(lambda self: "init_throwing_plugin") 6 | mode = property(lambda self: "timeseries") 7 | requiredMetrics = property(lambda self: ["hinv.ncpu", "gpfs.fsios.read_bytes"]) 8 | optionalMetrics = property(lambda self: []) 9 | derivedMetrics = property(lambda self: []) 10 | 11 | def __init__(self, job): 12 | super(InitThrowingPlugin, self).__init__(job) 13 | raise Exception("Exception in __init__") 14 | 15 | def process(self, nodemeta, timestamp, data, description): 16 | pass 17 | 18 | def results(self): 19 | pass 20 | 21 | 22 | class ProcessThrowingPlugin(Plugin): 23 | name = property(lambda self: "process_throwing_plugin") 24 | mode = property(lambda self: "timeseries") 25 | requiredMetrics = property(lambda self: ["hinv.ncpu", "gpfs.fsios.read_bytes"]) 26 | optionalMetrics = property(lambda self: []) 27 | derivedMetrics = property(lambda self: []) 28 | 29 | def __init__(self, job): 30 | super(ProcessThrowingPlugin, self).__init__(job) 31 | 32 | def process(self, nodemeta, timestamp, data, description): 33 | raise Exception("Exception in process") 34 | 35 | def results(self): 36 | pass 37 | 38 | 39 | class ResultsThrowingPlugin(Plugin): 40 | name = property(lambda self: "results_throwing_plugin") 41 | mode = property(lambda self: "timeseries") 42 | requiredMetrics = property(lambda self: ["hinv.ncpu", "gpfs.fsios.read_bytes"]) 43 | optionalMetrics = property(lambda self: []) 44 | derivedMetrics = property(lambda self: []) 45 | 46 | def __init__(self, job): 47 | super(ResultsThrowingPlugin, self).__init__(job) 48 | 49 | def process(self, nodemeta, timestamp, data, description): 50 | return False 51 | 52 | def results(self): 53 | raise Exception("Exception in results") 54 | -------------------------------------------------------------------------------- /tests/testPcpArchiveProcessor.py: -------------------------------------------------------------------------------- 1 | """" tests for the pcp archive 
processor """ 2 | import unittest 3 | from supremm.datasource.pcp.indexarchives import PcpArchiveProcessor 4 | 5 | class TestPcpArchiveProcessor(unittest.TestCase): 6 | """ Tests for the pcp filename string parser code """ 7 | 8 | def setUp(self): 9 | """ setUp """ 10 | self.inst = PcpArchiveProcessor({'hostname_mode': 'hostname'}) 11 | 12 | def test_archivestringmatching(self): 13 | """ test timestamp parsing """ 14 | 15 | testCases = { 16 | 'jo.log.ex.e-end-20180614.09.48.29.index': None, 17 | 'job-2671016.index': None, 18 | 'job-2679009[431].index': None, 19 | 'job-123423-end-20181004.04.05.41.index': 1538625941.0, 20 | 'job-123423-begin-20181004.04.05.41.index': 1538625941.0, 21 | 'job-123423-postbegin-20181004.04.05.41.index': 1538625941.0, 22 | 'job-123423[234]-end-20181004.04.05.41.index': 1538625941.0, 23 | 'job-123423[]-end-20181004.04.05.41.index': 1538625941.0, 24 | 'job-123423[234].server.net-end-20181004.04.05.41.index': 1538625941.0, 25 | 'job-123423[234].server.net-postbegin-20181004.04.05.41.index': 1538625941.0, 26 | 'job-123423[234].server.net-begin-20181004.04.05.41.index': 1538625941.0, 27 | 'job-123423.server.net-end-20181004.04.05.41.index': 1538625941.0 28 | } 29 | 30 | for archiveName, expected in testCases.items(): 31 | assert self.inst.get_archive_data_fast('/some/path/to/data/' + archiveName) == expected 32 | 33 | def test_jobidparser(self): 34 | """ test jobid parsing """ 35 | 36 | testCases = { 37 | 'jo.log.ex.e-end-20180614.09.48.29.index': None, 38 | '20180729.04.36.index': None, 39 | 'job-2671016.index': (-1, -1, 2671016), 40 | 'job-2673760.index': (-1, -1, 2673760), 41 | 'job-2671022.login.example.edu-end-20180830.02.54.25.index': (-1, -1, 2671022), 42 | 'job-2673760.login.example.edu-end-20180830.02.40.28.index': (-1, -1, 2673760), 43 | 'job-2673760.login.example.edu-end-20180830.02.50.16.index': (-1, -1, 2673760), 44 | 'job-1450543.login.example.edu-postbegin-20180830.00.00.00.index': (-1, -1, 1450543), 45 | 'job-1450554.login.example.edu-postbegin-20180830.00.00.00.index': (-1, -1, 1450554), 46 | 'job-2676199[18].index': (2676199, 18, -1), 47 | 'job-2679009[431].index': (2679009, 431, -1), 48 | 'job-1451551[326].hd-20180614.13.26.33.index': (1451551, 326, -1), 49 | 'job-2676200[18].login.example.edu-end-20180830.02.45.38.index': (2676200, 18, -1), 50 | 'job-2676200[18].login.example.edu-end-20180830.02.46.54.index': (2676200, 18, -1), 51 | 'job-2679009[431].login.example.edu-end-20180904.18.38.02.index': (2679009, 431, -1), 52 | 'job-2679136[520].login.example.edu-postbegin-20180614.00.00.00.index': (2679136, 520, -1), 53 | 'job-2679136[523].login.example.edu-postbegin-20180614.00.00.00.index': (2679136, 523, -1), 54 | 'job-1450512[4].login.example.edu-postbegin-20180614.00.00.00.index': (1450512, 4, -1), 55 | 'job-123423-end-20181004.04.05.41.index': (-1, -1, 123423), 56 | 'job-123423[234]-end-20181004.04.05.41.index': (123423, 234, -1), 57 | 'job-123423[]-end-20181004.04.05.41.index': (-1, -1, 123423), 58 | 'job-end-20181004.04.05.41.index': None, 59 | 'job-123423[234].server.net-end-20181004.04.05.41.index': (123423, 234, -1), 60 | 'job-123423.server.net-end-20181004.04.05.41.index': (-1, -1, 123423) 61 | } 62 | 63 | for archiveName, expected in testCases.items(): 64 | assert self.inst.parsejobid(archiveName) == expected 65 | -------------------------------------------------------------------------------- /tests/testrangechange.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy 3 | 
from supremm.rangechange import RangeChange 4 | 5 | class MockConfig(object): 6 | def __init__(self, settings): 7 | self.settings = settings 8 | 9 | def getsection(self, sectionname): 10 | return dict(self.settings[sectionname]) 11 | 12 | 13 | class TestRangeChange(unittest.TestCase): 14 | 15 | def test_normalization(self): 16 | 17 | config = MockConfig({"normalization": {"perfevent.hwcounters.CPU_CLK_UNHALTED.value": {"range": 48}}}) 18 | 19 | r = RangeChange(config) 20 | 21 | r.set_fetched_metrics(["perfevent.hwcounters.CPU_CLK_UNHALTED.value", "something.else", "perfevent.hwcounters.RETIRED_INSTRUCTIONS.value"]) 22 | 23 | self.assertFalse(r.passthrough) 24 | 25 | data = [] 26 | val = numpy.power([2,2,2], 48) - numpy.array([1,2,3]) 27 | data.append(val) 28 | val = val - numpy.array([3,3,3]) 29 | data.append(val) 30 | val = val - numpy.array([3,3,3]) 31 | data.append(val) 32 | 33 | r.normalise_data(1.000, data) 34 | 35 | self.assertTrue( numpy.all(data[0] == numpy.power([2,2,2], 48) - numpy.array([1,2,3]) )) 36 | self.assertTrue( numpy.all(data[1] == numpy.power([2,2,2], 48) - numpy.array([4,5,6]) )) 37 | self.assertTrue( numpy.all(data[2] == numpy.power([2,2,2], 48) - numpy.array([7,8,9]) )) 38 | 39 | d2 = [] 40 | d2.append( (data[0] + numpy.array([10,10,10])) % numpy.power(2,48)) 41 | d2.append(numpy.array([40,50,60])) 42 | d2.append(numpy.array([70,80,90])) 43 | 44 | r.normalise_data(2.000, d2) 45 | 46 | delta = d2[0] - data[0] 47 | 48 | self.assertTrue( numpy.all(delta == numpy.array([10,10,10]))) 49 | self.assertTrue( numpy.all(d2[1] == numpy.array([40,50,60]))) 50 | self.assertTrue( numpy.all(d2[2] == numpy.array([70,80,90]))) 51 | 52 | 53 | def test_passthrough(self): 54 | 55 | config = MockConfig({"normalization": {"perfevent.hwcounters.CPU_CLK_UNHALTED.value": {"range": 48}}}) 56 | 57 | r = RangeChange(config) 58 | 59 | r.set_fetched_metrics(["kernel.percpu.cpu.user", "kernel.percpu.cpu.system"]) 60 | self.assertTrue(r.passthrough) 61 | 62 | data = [numpy.array([234,23423,234,23423,23423]), numpy.array([856,5698,789,127,90780])] 63 | 64 | r.normalise_data(1.000, data) 65 | 66 | self.assertTrue(numpy.all(data[0] == numpy.array([234,23423,234,23423,23423]))) 67 | self.assertTrue(numpy.all(data[1] == numpy.array([856,5698,789,127,90780]))) 68 | 69 | def test_missingconfig(self): 70 | 71 | config = MockConfig({}) 72 | r = RangeChange(config) 73 | 74 | r.set_fetched_metrics(["kernel.percpu.cpu.user", "kernel.percpu.cpu.system"]) 75 | 76 | data = [numpy.array([234,23423,234,23423,23423]), numpy.array([856,5698,789,127,90780])] 77 | 78 | r.normalise_data(1.000, data) 79 | 80 | self.assertTrue(numpy.all(data[0] == numpy.array([234,23423,234,23423,23423]))) 81 | self.assertTrue(numpy.all(data[1] == numpy.array([856,5698,789,127,90780]))) 82 | 83 | if __name__ == '__main__': 84 | unittest.main() 85 | --------------------------------------------------------------------------------
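The throwing plugins and tests/integration_tests/integration_plugin_api.py above exercise the plugin interface by patching the loadplugins() and loadpreprocessors() return values. Below is a minimal sketch of a well-behaved plugin with the same layout; it is illustrative only and not part of the repository, and the per-node bookkeeping, the nodemeta attribute lookup, and the shape of the results() dictionary are assumptions.

from supremm.plugin import Plugin


class DatapointCountPlugin(Plugin):
    """ Illustrative plugin that counts the datapoints delivered to process() per node """
    name = property(lambda self: "datapoint_count_plugin")
    mode = property(lambda self: "all")
    requiredMetrics = property(lambda self: ["hinv.ncpu"])
    optionalMetrics = property(lambda self: [])
    derivedMetrics = property(lambda self: [])

    def __init__(self, job):
        super(DatapointCountPlugin, self).__init__(job)
        self._counts = {}

    def process(self, nodemeta, timestamp, data, description):
        # the attribute used to identify the node is an assumption for this sketch
        nodeid = getattr(nodemeta, "nodename", id(nodemeta))
        self._counts[nodeid] = self._counts.get(nodeid, 0) + 1
        # ResultsThrowingPlugin above returns False from process(); True is assumed
        # here to mean "keep delivering data for this node"
        return True

    def results(self):
        return {"datapoints": self._counts}

As with the throwing plugins, such a class could be supplied through the patched loadplugins() return value so that summarize_jobs.main() runs it against the test archives.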