├── .github └── workflows │ ├── codeql.yml │ └── pylint.yml ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── collector.py ├── collectors ├── __init__.py ├── certificate_collector.py ├── firmware_collector.py ├── health_collector.py └── performance_collector.py ├── config.yml ├── dockerbuild.ps1 ├── handler.py ├── main.py ├── redfish-exporter.sh ├── renovate.json └── requirements.txt /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL Advanced" 13 | 14 | on: 15 | push: 16 | branches: [ "main" ] 17 | pull_request: 18 | branches: [ "main" ] 19 | schedule: 20 | - cron: '43 2 * * 1' 21 | 22 | jobs: 23 | analyze: 24 | name: Analyze (${{ matrix.language }}) 25 | # Runner size impacts CodeQL analysis time. To learn more, please see: 26 | # - https://gh.io/recommended-hardware-resources-for-running-codeql 27 | # - https://gh.io/supported-runners-and-hardware-resources 28 | # - https://gh.io/using-larger-runners (GitHub.com only) 29 | # Consider using larger runners or machines with greater resources for possible analysis time improvements. 30 | runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }} 31 | permissions: 32 | # required for all workflows 33 | security-events: write 34 | 35 | # required to fetch internal or private CodeQL packs 36 | packages: read 37 | 38 | # only required for workflows in private repositories 39 | actions: read 40 | contents: read 41 | 42 | strategy: 43 | fail-fast: false 44 | matrix: 45 | include: 46 | - language: python 47 | build-mode: none 48 | # CodeQL supports the following values keywords for 'language': 'actions', 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift' 49 | # Use `c-cpp` to analyze code written in C, C++ or both 50 | # Use 'java-kotlin' to analyze code written in Java, Kotlin or both 51 | # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both 52 | # To learn more about changing the languages that are analyzed or customizing the build mode for your analysis, 53 | # see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning. 54 | # If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how 55 | # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages 56 | steps: 57 | - name: Checkout repository 58 | uses: actions/checkout@v4 59 | 60 | # Add any setup steps before running the `github/codeql-action/init` action. 61 | # This includes steps like installing compilers or runtimes (`actions/setup-node` 62 | # or others). This is typically only required for manual builds. 63 | # - name: Setup runtime (example) 64 | # uses: actions/setup-example@v1 65 | 66 | # Initializes the CodeQL tools for scanning. 67 | - name: Initialize CodeQL 68 | uses: github/codeql-action/init@v3 69 | with: 70 | languages: ${{ matrix.language }} 71 | build-mode: ${{ matrix.build-mode }} 72 | # If you wish to specify custom queries, you can do so here or in a config file. 73 | # By default, queries listed here will override any specified in a config file. 74 | # Prefix the list here with "+" to use these queries and those in the config file. 75 | 76 | # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs 77 | # queries: security-extended,security-and-quality 78 | 79 | # If the analyze step fails for one of the languages you are analyzing with 80 | # "We were unable to automatically build your code", modify the matrix above 81 | # to set the build mode to "manual" for that language. Then modify this step 82 | # to build your code. 83 | # ℹ️ Command-line programs to run using the OS shell. 84 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun 85 | - if: matrix.build-mode == 'manual' 86 | shell: bash 87 | run: | 88 | echo 'If you are using a "manual" build mode for one or more of the' \ 89 | 'languages you are analyzing, replace this with the commands to build' \ 90 | 'your code, for example:' 91 | echo ' make bootstrap' 92 | echo ' make release' 93 | exit 1 94 | 95 | - name: Perform CodeQL Analysis 96 | uses: github/codeql-action/analyze@v3 97 | with: 98 | category: "/language:${{matrix.language}}" 99 | -------------------------------------------------------------------------------- /.github/workflows/pylint.yml: -------------------------------------------------------------------------------- 1 | name: Pylint 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | strategy: 9 | matrix: 10 | python-version: ["3.11"] 11 | steps: 12 | - uses: actions/checkout@v4 13 | - name: Set up Python ${{ matrix.python-version }} 14 | uses: actions/setup-python@v5 15 | with: 16 | python-version: ${{ matrix.python-version }} 17 | - name: Install dependencies 18 | run: | 19 | python -m pip install --upgrade pip 20 | pip install pylint 21 | pip install -r requirements.txt 22 | pip install ruff 23 | - name: Analysing the code with pylint 24 | run: | 25 | ruff check . 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .vscode/* 3 | env_* 4 | hosts_*.txt 5 | *.ps1 6 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM keppel.eu-de-1.cloud.sap/ccloud-dockerhub-mirror/library/ubuntu:latest 2 | 3 | RUN export DEBIAN_FRONTEND=noninteractive \ 4 | && apt-get update \ 5 | && apt-get upgrade -y \ 6 | && apt-get install -y python3 \ 7 | && apt-get install -y python3-pip \ 8 | && apt-get install -y curl \ 9 | && apt-get autoremove -y \ 10 | && apt-get clean -y \ 11 | && rm -rf /var/lib/apt/lists/* 12 | 13 | ARG FOLDERNAME=redfish_exporter 14 | 15 | RUN mkdir /${FOLDERNAME} 16 | RUN mkdir /${FOLDERNAME}/collectors 17 | 18 | WORKDIR /${FOLDERNAME} 19 | 20 | RUN pip3 install --break-system-packages --upgrade pip --ignore-install 21 | COPY requirements.txt /${FOLDERNAME} 22 | RUN pip3 install --break-system-packages --no-cache-dir -r requirements.txt 23 | 24 | COPY *.py /${FOLDERNAME}/ 25 | COPY collectors/ /${FOLDERNAME}/collectors/ 26 | COPY config.yml /${FOLDERNAME}/ 27 | 28 | RUN curl -ks 'https://aia.pki.co.sap.com/aia/SAPNetCA_G2.crt' -o '/usr/lib/ssl/certs/SAPNetCA_G2.crt' 29 | RUN curl -ks 'https://cacerts.digicert.com/DigiCertGlobalRootG2.crt.pem' -o '/usr/lib/ssl/certs/DigiCertGlobalRootCA.crt' 30 | RUN /usr/sbin/update-ca-certificates 31 | 32 | LABEL source_repository="https://github.com/sapcc/redfish-exporter" 33 | LABEL maintainer="Bernd Kuespert " 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Redfish-Exporter 2 | 3 | This is a Prometheus Exporter for extracting metrics from a server using the Redfish API. 4 | The hostname of the server has to be passed as **target parameter** in the http call. 5 | 6 | It has been tested with the following server models: 7 | 8 | Cisco UCS C480M5, working properly since BMC FW 4.1(1d) 9 | Cisco UCS C240M4 10 | Cisco UCS C240M5 11 | Cisco UCS C220M4 12 | Cisco UCS C220M5 13 | 14 | Cisco BMC FW below 4.x has its flaws regarding redfish API. Hence, I recommend updating at least to 4.0(1c). 15 | 16 | Dell PowerEdge R640 17 | Dell PowerEdge R730 18 | Dell PowerEdge R740 19 | Dell PowerEdge R640 20 | Dell PowerEdge R840 21 | 22 | Lenovo ThinkSystem SR950 23 | 24 | HPE DL360 Gen10 25 | HPE DL560 Gen10 26 | 27 | ## Example Call 28 | 29 | If you are logged into the POD running the exporter, you can retrieve metrics from a server, make an HTTP call to the exporter with the `target` and `job` parameters. The exporter supports the following endpoints: 30 | 31 | ### `/health` 32 | Retrieves health-related metrics (e.g., system status, memory errors, power state). 33 | 34 | ```bash 35 | curl "http://localhost:9220/health?target=server1.example.com&job=redfish-myjob" 36 | ``` 37 | 38 | ### `/firmware` 39 | Retrieves firmware version information for the server components. 40 | 41 | ```bash 42 | curl "http://localhost:9220/firmware?target=server1.example.com&job=redfish-myjob" 43 | ``` 44 | 45 | ### `/performance` 46 | Retrieves performance-related metrics (e.g., power consumption, temperature). 47 | 48 | ```bash 49 | curl "http://localhost:9220/performance?target=server1.example.com&job=redfish-myjob" 50 | ``` 51 | 52 | **Notes**: 53 | - Replace `server1.example.com` with the hostname or IP address of your Redfish server. 54 | - Replace `redfish-myjob` with the name of your job (used to map credentials). 55 | - The exporter listens on port 9220 by default. 56 | 57 | ## Prerequisites and Installation 58 | 59 | The exporter was written for Python 3.6 or newer. To install all modules needed you have to run the following command: 60 | 61 | ```bash 62 | pip3 install --no-cache-dir -r requirements.txt 63 | ``` 64 | 65 | Alternatively, you can use the provided Dockerfile to build and run the exporter in a container. 66 | 67 | ## Running with Docker 68 | 69 | To run the exporter in a Docker container, use the following command: 70 | 71 | ```bash 72 | docker run -d -p 9220:9220 your-path/redfish-exporter:v0.1.0 73 | ``` 74 | 75 | **Notes**: 76 | - Mount a custom `config.yaml` file if needed (see below). 77 | 78 | ## Parameters 79 | 80 | `-l ` - all output is written to a logfile. 81 | 82 | `-d` - switches on debugging mode 83 | 84 | `-c ` - you can specify the path to the config file, default is config.yml. 85 | 86 | ## The config.yml file 87 | 88 | * The **listen_port** is providing the port on which the exporter is waiting to receive calls. It is overwritten by the environment variable **LISTEN_PORT**. 89 | 90 | * The credentials for login to the switches can either be added to the config.yaml file or passed via environment variables. The environment variables are taking precedence over the entries in config.yaml file. 91 | 92 | The mapping of job names to environment variables follows a schema: `REDFISH_JOB1_USERNAME` and `REDFISH_JOB1_PASSWORD` would be the variables for example of the first job called `redfish/job1`. 93 | A slash gets replaced by underscore and everything gets converted to uppercase. 94 | 95 | * The **timeout** parameter specifies the amount of time to wait for an answer from the server. Again this can alos be provided via TIMEOUT environment variable. 96 | 97 | * The **job** parameter specifies the Prometheus job that will be passed as label if no job was handed over during the API call. 98 | 99 | ### Example of a config file 100 | 101 | ```yaml 102 | listen_port: 9200 103 | username: 104 | password: 105 | timeout: 40 106 | job: 'redfish-myjob' 107 | ``` 108 | 109 | ## Exported Metrics 110 | 111 | All metrics returned by the redfish exporter are gauge metrics. 112 | 113 | ### redfish_up 114 | 115 | Indicating if the redfish API was giving useful data back (== 1) or not (== 0). 116 | 117 | ### redfish_health 118 | 119 | Show the health information of the hardware parts like processor, memory, storage controllers, disks, fans, power and chassis if available. 120 | 121 | ### redfish_memory_correctable 122 | 123 | ### redfish_memory_uncorrectable 124 | 125 | Showing the count of errors per dimm. 126 | 127 | Cisco servers do not seem to provide this kind of information via redfish. Dell PowerEdge servers only with certain DIMM manufacturers (Samsung not, Micron Technology and Hynix Semiconductor do). 128 | 129 | ### redfish_powerstate 130 | 131 | Showing the powerstate of the server 132 | 133 | ### redfish_response_duration_seconds 134 | 135 | The duration of the first response of the server to a call to /redfish/v1 136 | 137 | ### redfish_up 138 | 139 | Metric indicating if there was a valid redfish response while calling /redfish/v1 140 | 141 | ### redfish_scrape_duration_seconds 142 | 143 | Total duration of scarping all data from the server 144 | 145 | ### redfish_firmware 146 | 147 | A collection of firmware version data stored in the labels. The value is always 1. 148 | -------------------------------------------------------------------------------- /collector.py: -------------------------------------------------------------------------------- 1 | """Prometheus Exporter for collecting baremetal server Redfish metrics.""" 2 | import logging 3 | import os 4 | import time 5 | import sys 6 | import re 7 | import requests 8 | from prometheus_client.core import GaugeMetricFamily 9 | from collectors.performance_collector import PerformanceCollector 10 | from collectors.firmware_collector import FirmwareCollector 11 | from collectors.health_collector import HealthCollector 12 | from collectors.certificate_collector import CertificateCollector 13 | 14 | class RedfishMetricsCollector: 15 | """Class for collecting Redfish metrics.""" 16 | def __enter__(self): 17 | return self 18 | 19 | def __init__(self, config, target, host, usr, pwd, metrics_type): 20 | self.target = target 21 | self.host = host 22 | 23 | self._username = usr 24 | self._password = pwd 25 | 26 | self.metrics_type = metrics_type 27 | 28 | self._timeout = int(os.getenv("TIMEOUT", config.get('timeout', 10))) 29 | self.labels = {"host": self.host} 30 | self._redfish_up = 0 31 | self._response_time = 0 32 | self._last_http_code = 0 33 | self.powerstate = 0 34 | 35 | self.urls = { 36 | "Systems": "", 37 | "SessionService": "", 38 | "Memory": "", 39 | "ManagedBy": "", 40 | "Processors": "", 41 | "Storage": "", 42 | "Chassis": "", 43 | "Power": "", 44 | "Thermal": "", 45 | "PowerSubsystem": "", 46 | "ThermalSubsystem": "", 47 | "NetworkInterfaces": "", 48 | } 49 | 50 | self.server_health = 0 51 | 52 | self.manufacturer = "" 53 | self.model = "" 54 | self.serial = "" 55 | self.status = { 56 | "ok": 0, 57 | "operable": 0, 58 | "enabled": 0, 59 | "good": 0, 60 | "critical": 1, 61 | "error": 1, 62 | "warning": 2, 63 | "absent": 0 64 | } 65 | self._start_time = time.time() 66 | 67 | self._session_url = "" 68 | self._auth_token = "" 69 | self._basic_auth = False 70 | self._session = "" 71 | self.redfish_version = "not available" 72 | 73 | def get_session(self): 74 | """Get the url for the server info and messure the response time""" 75 | logging.info("Target %s: Connecting to server %s", self.target, self.host) 76 | start_time = time.time() 77 | server_response = self.connect_server("/redfish/v1", noauth=True) 78 | 79 | self._response_time = round(time.time() - start_time, 2) 80 | logging.info("Target %s: Response time: %s seconds.", self.target, self._response_time) 81 | 82 | if not server_response: 83 | logging.warning("Target %s: No data received from server %s!", self.target, self.host) 84 | return 85 | 86 | logging.debug("Target %s: data received from server %s.", self.target, self.host) 87 | 88 | if "RedfishVersion" in server_response: 89 | self.redfish_version = server_response['RedfishVersion'] 90 | 91 | for key in ["Systems", "SessionService"]: 92 | if key in server_response: 93 | self.urls[key] = server_response[key]['@odata.id'] 94 | else: 95 | logging.warning( 96 | "Target %s: No %s URL found on server %s!", 97 | self.target, 98 | key, 99 | self.host 100 | ) 101 | return 102 | 103 | session_service = self.connect_server( 104 | self.urls['SessionService'], 105 | basic_auth=True 106 | ) 107 | 108 | if self._last_http_code != 200: 109 | logging.warning( 110 | "Target %s: Failed to get a session from server %s!", 111 | self.target, 112 | self.host 113 | ) 114 | self._basic_auth = True 115 | return 116 | 117 | sessions_url = f"https://{self.target}{session_service['Sessions']['@odata.id']}" 118 | session_data = {"UserName": self._username, "Password": self._password} 119 | self._session.auth = None 120 | result = "" 121 | 122 | # Try to get a session 123 | try: 124 | result = self._session.post( 125 | sessions_url, json=session_data, verify=False, timeout=self._timeout 126 | ) 127 | result.raise_for_status() 128 | 129 | except requests.exceptions.ConnectionError: 130 | logging.warning( 131 | "Target %s: Failed to get an auth token from server %s. Retrying ...", 132 | self.target, self.host 133 | ) 134 | try: 135 | result = self._session.post( 136 | sessions_url, json=session_data, verify=False, timeout=self._timeout 137 | ) 138 | result.raise_for_status() 139 | 140 | except requests.exceptions.ConnectionError as e: 141 | logging.error( 142 | "Target %s: Error getting an auth token from server %s: %s", 143 | self.target, self.host, e 144 | ) 145 | self._basic_auth = True 146 | 147 | except requests.exceptions.HTTPError as err: 148 | logging.warning( 149 | "Target %s: No session received from server %s: %s", 150 | self.target, self.host, err 151 | ) 152 | logging.warning("Target %s: Switching to basic authentication.", 153 | self.target 154 | ) 155 | self._basic_auth = True 156 | 157 | except requests.exceptions.ReadTimeout as err: 158 | logging.warning( 159 | "Target %s: No session received from server %s: %s", 160 | self.target, self.host, err 161 | ) 162 | logging.warning("Target %s: Switching to basic authentication.", 163 | self.target 164 | ) 165 | self._basic_auth = True 166 | 167 | if result: 168 | if result.status_code in [200, 201]: 169 | self._auth_token = result.headers['X-Auth-Token'] 170 | self._session_url = result.json()['@odata.id'] 171 | logging.info("Target %s: Got an auth token from server %s!", self.target, self.host) 172 | self._redfish_up = 1 173 | 174 | def connect_server(self, command, noauth=False, basic_auth=False): 175 | """Connect to the server and get the data.""" 176 | logging.captureWarnings(True) 177 | 178 | req = "" 179 | req_text = "" 180 | server_response = "" 181 | self._last_http_code = 200 182 | request_duration = 0 183 | request_start = time.time() 184 | 185 | url = f"https://{self.target}{command}" 186 | 187 | # check if we already established a session with the server 188 | if not self._session: 189 | self._session = requests.Session() 190 | else: 191 | logging.debug("Target %s: Using existing session.", self.target) 192 | 193 | self._session.verify = False 194 | self._session.headers.update({"charset": "utf-8"}) 195 | self._session.headers.update({"content-type": "application/json"}) 196 | 197 | if noauth: 198 | logging.debug("Target %s: Using no auth", self.target) 199 | elif basic_auth or self._basic_auth: 200 | self._session.auth = (self._username, self._password) 201 | logging.debug("Target %s: Using basic auth with user %s", self.target, self._username) 202 | else: 203 | logging.debug("Target %s: Using auth token", self.target) 204 | self._session.auth = None 205 | self._session.headers.update({"X-Auth-Token": self._auth_token}) 206 | 207 | logging.debug("Target %s: Using URL %s", self.target, url) 208 | try: 209 | req = self._session.get(url, stream=True, timeout=self._timeout) 210 | req.raise_for_status() 211 | 212 | except requests.exceptions.HTTPError as err: 213 | self._last_http_code = err.response.status_code 214 | if err.response.status_code in [401,403]: 215 | logging.error( 216 | "Target %s: Authorization Error: " 217 | "Wrong job provided or user/password set wrong on server %s: %s", 218 | self.target, self.host, err 219 | ) 220 | else: 221 | logging.error("Target %s: HTTP Error on server %s: %s", self.target, self.host, err) 222 | 223 | except requests.exceptions.ConnectTimeout: 224 | logging.error("Target %s: Timeout while connecting to %s", self.target, self.host) 225 | self._last_http_code = 408 226 | 227 | except requests.exceptions.ReadTimeout: 228 | logging.error("Target %s: Timeout while reading data from %s", self.target, self.host) 229 | self._last_http_code = 408 230 | 231 | except requests.exceptions.ConnectionError as err: 232 | logging.error("Target %s: Unable to connect to %s: %s", self.target, self.host, err) 233 | self._last_http_code = 444 234 | except requests.exceptions.RequestException: 235 | logging.error("Target %s: Unexpected error: %s", self.target, sys.exc_info()[0]) 236 | self._last_http_code = 500 237 | 238 | if req != "": 239 | self._last_http_code = req.status_code 240 | try: 241 | req_text = req.json() 242 | 243 | except requests.JSONDecodeError: 244 | logging.debug("Target %s: No json data received.", self.target) 245 | 246 | # req will evaluate to True if the status code was between 200 and 400 247 | # and False otherwise. 248 | if req: 249 | server_response = req_text 250 | 251 | # if the request fails the server might give a hint in the ExtendedInfo field 252 | else: 253 | if req_text: 254 | logging.debug( 255 | "Target %s: %s: %s", 256 | self.target, 257 | req_text['error']['code'], 258 | req_text['error']['message'] 259 | ) 260 | 261 | if "@Message.ExtendedInfo" in req_text['error']: 262 | 263 | if isinstance(req_text['error']['@Message.ExtendedInfo'], list): 264 | if "Message" in req_text['error']['@Message.ExtendedInfo'][0]: 265 | logging.debug( 266 | "Target %s: %s", 267 | self.target, 268 | req_text['error']['@Message.ExtendedInfo'][0]['Message'] 269 | ) 270 | 271 | elif isinstance(req_text['error']['@Message.ExtendedInfo'], dict): 272 | 273 | if "Message" in req_text['error']['@Message.ExtendedInfo']: 274 | logging.debug( 275 | "Target %s: %s", 276 | self.target, 277 | req_text['error']['@Message.ExtendedInfo']['Message'] 278 | ) 279 | else: 280 | pass 281 | 282 | request_duration = round(time.time() - request_start, 2) 283 | logging.debug("Target %s: Request duration: %s", self.target, request_duration) 284 | return server_response 285 | 286 | def get_base_labels(self): 287 | """Get the basic labels for the metrics.""" 288 | systems = self.connect_server(self.urls['Systems']) 289 | 290 | if not systems: 291 | return 292 | 293 | power_states = {"off": 0, "on": 1} 294 | # Get the server info for the labels 295 | server_info = {} 296 | for member in systems['Members']: 297 | self._systems_url = member['@odata.id'] 298 | info = self.connect_server(self._systems_url) 299 | if info: 300 | server_info.update(info) 301 | 302 | if not server_info: 303 | return 304 | self.manufacturer = server_info.get('Manufacturer') 305 | self.model = server_info.get('Model') 306 | if not self.manufacturer or not self.model: 307 | logging.error("Target %s: No manufacturer or model found on server %s!", self.target, self.host) 308 | return 309 | self.powerstate = power_states[server_info['PowerState'].lower()] 310 | # Dell has the Serial# in the SKU field, others in the SerialNumber field. 311 | if "SKU" in server_info and re.match(r'^[Dd]ell.*', server_info['Manufacturer']): 312 | self.serial = server_info['SKU'] 313 | else: 314 | self.serial = server_info['SerialNumber'] 315 | 316 | self.labels.update( 317 | { 318 | "host": self.host, 319 | "server_manufacturer": self.manufacturer, 320 | "server_model": self.model, 321 | "server_serial": self.serial 322 | } 323 | ) 324 | 325 | self.server_health = self.status[server_info['Status']['Health'].lower()] 326 | 327 | # get the links of the parts for later 328 | for url in self.urls: 329 | if url in server_info: 330 | self.urls[url] = server_info[url]['@odata.id'] 331 | 332 | # standard is a list but there are exceptions 333 | if isinstance(server_info['Links']['Chassis'][0], str): 334 | self.urls['Chassis'] = server_info['Links']['Chassis'][0] 335 | self.urls['ManagedBy'] = server_info['Links']['ManagedBy'][0] 336 | else: 337 | self.urls['Chassis'] = server_info['Links']['Chassis'][0]['@odata.id'] 338 | self.urls['ManagedBy'] = server_info['Links']['ManagedBy'][0]['@odata.id'] 339 | 340 | self.get_chassis_urls() 341 | 342 | def get_chassis_urls(self): 343 | """Get the urls for the chassis parts.""" 344 | chassis_data = self.connect_server(self.urls['Chassis']) 345 | if not chassis_data: 346 | return None 347 | 348 | urls = ['PowerSubsystem', 'Power', 'ThermalSubsystem', 'Thermal'] 349 | 350 | for url in urls: 351 | if url in chassis_data: 352 | self.urls[url] = chassis_data[url]['@odata.id'] 353 | 354 | return chassis_data 355 | 356 | def collect(self): 357 | """Collect the metrics.""" 358 | if self.metrics_type == 'health': 359 | up_metrics = GaugeMetricFamily( 360 | "redfish_up", 361 | "Redfish Server Monitoring availability", 362 | labels = self.labels, 363 | ) 364 | up_metrics.add_sample( 365 | "redfish_up", 366 | value = self._redfish_up, 367 | labels = self.labels 368 | ) 369 | yield up_metrics 370 | 371 | version_metrics = GaugeMetricFamily( 372 | "redfish_version", 373 | "Redfish Server Monitoring redfish version", 374 | labels = self.labels, 375 | ) 376 | version_labels = {'version': self.redfish_version} 377 | version_labels.update(self.labels) 378 | version_metrics.add_sample( 379 | "redfish_version", 380 | value = 1, 381 | labels = version_labels 382 | ) 383 | yield version_metrics 384 | 385 | response_metrics = GaugeMetricFamily( 386 | "redfish_response_duration_seconds", 387 | "Redfish Server Monitoring response time", 388 | labels = self.labels, 389 | ) 390 | response_metrics.add_sample( 391 | "redfish_response_duration_seconds", 392 | value = self._response_time, 393 | labels = self.labels, 394 | ) 395 | yield response_metrics 396 | 397 | if self._redfish_up == 0: 398 | return 399 | 400 | self.get_base_labels() 401 | 402 | if self.metrics_type == 'health': 403 | 404 | cert_metrics = CertificateCollector(self.host, self.target, self.labels) 405 | cert_metrics.collect() 406 | 407 | yield cert_metrics.cert_metrics_isvalid 408 | yield cert_metrics.cert_metrics_valid_hostname 409 | yield cert_metrics.cert_metrics_valid_days 410 | yield cert_metrics.cert_metrics_selfsigned 411 | 412 | powerstate_metrics = GaugeMetricFamily( 413 | "redfish_powerstate", 414 | "Redfish Server Monitoring Power State Data", 415 | labels = self.labels, 416 | ) 417 | powerstate_metrics.add_sample( 418 | "redfish_powerstate", value = self.powerstate, labels = self.labels 419 | ) 420 | yield powerstate_metrics 421 | 422 | metrics = HealthCollector(self) 423 | metrics.collect() 424 | 425 | yield metrics.mem_metrics_correctable 426 | yield metrics.mem_metrics_uncorrectable 427 | yield metrics.health_metrics 428 | 429 | # Get the firmware information 430 | if self.metrics_type == 'firmware': 431 | metrics = FirmwareCollector(self) 432 | metrics.collect() 433 | 434 | yield metrics.fw_metrics 435 | 436 | # Get the performance information 437 | if self.metrics_type == 'performance': 438 | metrics = PerformanceCollector(self) 439 | metrics.collect() 440 | 441 | yield metrics.power_metrics 442 | yield metrics.temperature_metrics 443 | 444 | # Finish with calculating the scrape duration 445 | duration = round(time.time() - self._start_time, 2) 446 | logging.info( 447 | "Target %s: %s scrape duration: %s seconds", 448 | self.target, self.metrics_type, duration 449 | ) 450 | 451 | scrape_metrics = GaugeMetricFamily( 452 | f"redfish_{self.metrics_type}_scrape_duration_seconds", 453 | f"Redfish Server Monitoring redfish {self.metrics_type} scrabe duration in seconds", 454 | labels = self.labels, 455 | ) 456 | 457 | scrape_metrics.add_sample( 458 | f"redfish_{self.metrics_type}_scrape_duration_seconds", 459 | value = duration, 460 | labels = self.labels, 461 | ) 462 | yield scrape_metrics 463 | 464 | def __exit__(self, exc_type, exc_val, exc_tb): 465 | logging.debug("Target %s: Deleting Redfish session with server %s", self.target, self.host) 466 | 467 | response = None 468 | 469 | if self._auth_token: 470 | session_url = f"https://{self.target}{self._session_url}" 471 | headers = {"x-auth-token": self._auth_token} 472 | 473 | logging.debug("Target %s: Using URL %s", self.target, session_url) 474 | 475 | try: 476 | response = requests.delete( 477 | session_url, verify=False, timeout=self._timeout, headers=headers 478 | ) 479 | response.close() 480 | 481 | except requests.exceptions.RequestException as e: 482 | logging.error( 483 | "Target %s: Error deleting session with server %s: %s", 484 | self.target, self.host, e 485 | ) 486 | 487 | if response: 488 | logging.info("Target %s: Redfish Session deleted successfully.", self.target) 489 | else: 490 | logging.warning( 491 | "Target %s: Failed to delete session with server %s", 492 | self.target, 493 | self.host 494 | ) 495 | logging.warning("Target %s: Token: %s", self.target, self._auth_token) 496 | 497 | else: 498 | logging.debug( 499 | "Target %s: No Redfish session existing with server %s", 500 | self.target, 501 | self.host 502 | ) 503 | 504 | if self._session: 505 | logging.info("Target %s: Closing requests session.", self.target) 506 | self._session.close() 507 | -------------------------------------------------------------------------------- /collectors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sapcc/redfish-exporter/a88f10dc1ac11af4fc2241436830e72a737e9dd2/collectors/__init__.py -------------------------------------------------------------------------------- /collectors/certificate_collector.py: -------------------------------------------------------------------------------- 1 | """Collects certificate information from the Redfish API.""" 2 | 3 | import logging 4 | import ssl 5 | import datetime 6 | import OpenSSL 7 | from prometheus_client.core import GaugeMetricFamily 8 | 9 | class CertificateCollector: 10 | """Collects certificate information from the Redfish API.""" 11 | 12 | def __init__(self, host, target, labels): 13 | self.host = host 14 | self.target = target 15 | self.timeout = 10 16 | self.labels = labels 17 | self.port = 443 18 | 19 | self.cert_metrics_isvalid = GaugeMetricFamily( 20 | name = "redfish_certificate_isvalid", 21 | documentation = "Redfish Server Monitoring certificate is valid", 22 | labels = self.labels, 23 | ) 24 | 25 | self.cert_metrics_valid_hostname = GaugeMetricFamily( 26 | name = "redfish_certificate_valid_hostname", 27 | documentation = "Redfish Server Monitoring certificate has valid hostname", 28 | labels = self.labels, 29 | ) 30 | 31 | self.cert_metrics_valid_days = GaugeMetricFamily( 32 | name = "redfish_certificate_valid_days", 33 | documentation = "Redfish Server Monitoring certificate valid for days", 34 | labels = self.labels, 35 | ) 36 | 37 | self.cert_metrics_selfsigned = GaugeMetricFamily( 38 | name = "redfish_certificate_selfsigned", 39 | documentation = "Redfish Server Monitoring certificate is self-signed", 40 | labels = self.labels, 41 | ) 42 | 43 | def collect(self): 44 | '''Collect Certificate data''' 45 | logging.info("Target %s: Collecting certificate data ...", self.target) 46 | 47 | cert = None 48 | x509 = None 49 | cert_days_left = 0 50 | cert_valid = 0 51 | cert_has_right_hostname = 0 52 | cert_selfsigned = 0 53 | current_labels = { 54 | "issuer": "n/a", 55 | "subject": "n/a", 56 | "not_after": "n/a", 57 | } 58 | 59 | try: 60 | cert = ssl.get_server_certificate((self.host, self.port)) 61 | x509 = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_PEM, cert) 62 | 63 | except OpenSSL.SSL.Error as e: 64 | logging.debug("Target %s: Certificate Validation Error!", self.target) 65 | logging.debug("Target %s: %s", self.target, e) 66 | 67 | if cert and x509: 68 | subject = [ 69 | value.decode('utf-8') for name, value in x509.get_subject().get_components() 70 | if name.decode('utf-8') == 'CN' 71 | ][0] 72 | issuer = [ 73 | value.decode('utf-8') for name, value in x509.get_issuer().get_components() 74 | if name.decode('utf-8') == 'CN' 75 | ][0] 76 | 77 | not_after_str = x509.get_notAfter().decode('utf-8') 78 | 79 | cert_expiry_date = datetime.datetime.strptime( 80 | not_after_str, '%Y%m%d%H%M%S%fZ' 81 | ) if not_after_str else datetime.datetime.now() 82 | 83 | cert_days_left = (cert_expiry_date - datetime.datetime.now()).days 84 | 85 | current_labels.update( 86 | { 87 | "issuer": issuer, 88 | "subject": subject, 89 | "not_after": cert_expiry_date.strftime("%Y-%m-%d %H:%M:%S"), 90 | } 91 | ) 92 | 93 | if issuer == subject: 94 | logging.warning( 95 | "Target %s: Certificate is self-signed. Issuer: %s, Subject: %s", 96 | self.target, issuer, subject 97 | ) 98 | cert_selfsigned = 1 99 | else: 100 | logging.info( 101 | "Target %s: Certificate not self-signed. Issuer: %s, Subject: %s", 102 | self.target, issuer, subject 103 | ) 104 | 105 | if subject == self.host: 106 | logging.info("Target %s: Certificate has right hostname.", self.target) 107 | cert_has_right_hostname = 1 108 | else: 109 | logging.warning( 110 | "Target %s: Certificate has wrong hostname. Hostname: %s, Subject: %s", 111 | self.target, self.host, subject 112 | ) 113 | 114 | if cert_days_left > 0: 115 | logging.info( 116 | "Target %s: Certificate still valid. Days left: %d", 117 | self.target, cert_days_left 118 | ) 119 | if cert_has_right_hostname: 120 | cert_valid = 1 121 | else: 122 | logging.warning( 123 | "Target %s: Certificate not valid. Days left: %d", 124 | self.target, cert_days_left 125 | ) 126 | 127 | current_labels.update(self.labels) 128 | 129 | self.cert_metrics_isvalid.add_sample( 130 | "redfish_certificate_isvalid", 131 | value = cert_valid, 132 | labels = current_labels, 133 | ) 134 | 135 | self.cert_metrics_valid_hostname.add_sample( 136 | "redfish_certificate_valid_hostname", 137 | value = cert_has_right_hostname, 138 | labels = current_labels, 139 | ) 140 | 141 | self.cert_metrics_valid_days.add_sample( 142 | "redfish_certificate_valid_days", 143 | value = cert_days_left, 144 | labels = current_labels, 145 | ) 146 | 147 | self.cert_metrics_selfsigned.add_sample( 148 | "redfish_certificate_selfsigned", 149 | value = cert_selfsigned, 150 | labels = current_labels, 151 | ) 152 | -------------------------------------------------------------------------------- /collectors/firmware_collector.py: -------------------------------------------------------------------------------- 1 | """ 2 | The collector for firmware information is implemented in the FirmwareCollector class. 3 | The collect method retrieves the firmware information from the Redfish API and adds it to the firmware metrics. 4 | The collect method is called by the collect method of the RedfishMetricsCollector class. 5 | The __enter__ and __exit__ methods are used to manage the lifecycle of the FirmwareCollector class. 6 | """ 7 | 8 | import logging 9 | from re import search 10 | 11 | from prometheus_client.core import GaugeMetricFamily 12 | 13 | class FirmwareCollector: 14 | """ 15 | Collects firmware information from the Redfish API. 16 | """ 17 | 18 | def __enter__(self): 19 | return self 20 | 21 | def __init__(self, redfish_metrics_collector): 22 | 23 | self.col = redfish_metrics_collector 24 | 25 | self.fw_metrics = GaugeMetricFamily( 26 | "redfish_firmware", 27 | "Redfish Server Monitoring Firmware Data", 28 | labels=self.col.labels, 29 | ) 30 | 31 | def collect(self): 32 | """ 33 | Collects firmware information from the Redfish API. 34 | """ 35 | 36 | logging.info("Target %s: Get the firmware information.", self.col.target) 37 | 38 | fw_collection = self.col.connect_server( 39 | "/redfish/v1/UpdateService/FirmwareInventory" 40 | ) 41 | if not fw_collection: 42 | logging.warning("Target %s: Cannot get Firmware data!", self.col.target) 43 | return 44 | 45 | for fw_member in fw_collection['Members']: 46 | fw_member_url = fw_member['@odata.id'] 47 | # only look at entries on a Dell server if the device is marked as installed 48 | if (search(".*Dell.*", self.col.manufacturer) and ("Installed" in fw_member_url)) or not search(".*Dell.*", self.col.manufacturer): 49 | fw_item = self.col.connect_server(fw_member_url) 50 | if not fw_item: 51 | continue 52 | 53 | item_name = fw_item['Name'].split(",", 1)[0] 54 | current_labels = {"item_name": item_name} 55 | 56 | if self.col.manufacturer == 'Lenovo': 57 | # Lenovo has always Firmware: in front of the names, let's remove it 58 | item_name = fw_item['Name'].replace('Firmware:','') 59 | current_labels.update({"item_name": item_name}) 60 | # we need an additional label to distinguish the metrics because 61 | # the device ID is not in the name in case of Lenovo 62 | if "Id" in fw_item: 63 | current_labels.update({"item_id": fw_item['Id']}) 64 | 65 | if "Manufacturer" in fw_item: 66 | current_labels.update({"item_manufacturer": fw_item['Manufacturer']}) 67 | 68 | if "Version" in fw_item: 69 | version = fw_item['Version'] 70 | if version != "N/A" and version is not None: 71 | current_labels.update({"version": version}) 72 | current_labels.update(self.col.labels) 73 | self.fw_metrics.add_sample( 74 | "redfish_firmware", 75 | value=1, 76 | labels=current_labels 77 | ) 78 | 79 | def __exit__(self, exc_type, exc_val, exc_tb): 80 | if exc_tb is not None: 81 | logging.exception( 82 | "Target %s: An exception occured in {exc_tb.f_code.co_filename}:{exc_tb.tb_lineno}", 83 | self.col.target 84 | ) 85 | -------------------------------------------------------------------------------- /collectors/health_collector.py: -------------------------------------------------------------------------------- 1 | """Collects health information from the Redfish API.""" 2 | import logging 3 | import math 4 | 5 | from prometheus_client.core import GaugeMetricFamily 6 | 7 | class HealthCollector(): 8 | """Collects health information from the Redfish API.""" 9 | def __enter__(self): 10 | return self 11 | 12 | def __init__(self, redfish_metrics_collector): 13 | self.col = redfish_metrics_collector 14 | 15 | self.health_metrics = GaugeMetricFamily( 16 | "redfish_health", 17 | "Redfish Server Monitoring Health Data", 18 | labels=self.col.labels, 19 | ) 20 | self.mem_metrics_correctable = GaugeMetricFamily( 21 | "redfish_memory_correctable", 22 | "Redfish Server Monitoring Memory Data for correctable errors", 23 | labels=self.col.labels, 24 | ) 25 | self.mem_metrics_uncorrectable = GaugeMetricFamily( 26 | "redfish_memory_uncorrectable", 27 | "Redfish Server Monitoring Memory Data for uncorrectable errors", 28 | labels=self.col.labels, 29 | ) 30 | 31 | def get_processors_health(self): 32 | """Get the Processor data from the Redfish API.""" 33 | logging.debug("Target %s: Get the CPU health data.", self.col.target) 34 | processor_collection = self.col.connect_server(self.col.urls["Processors"]) 35 | 36 | if not processor_collection: 37 | return 38 | for processor in processor_collection["Members"]: 39 | processor_data = self.col.connect_server(processor["@odata.id"]) 40 | if not processor_data: 41 | continue 42 | 43 | proc_status = self.extract_health_status( 44 | processor_data, "Processor", processor_data.get("Socket", "unknown") 45 | ) 46 | current_labels = { 47 | "device_type": "processor", 48 | "device_name": processor_data.get("Socket", "unknown"), 49 | "device_manufacturer": processor_data.get("Manufacturer", "unknown"), 50 | "cpu_type": processor_data.get("ProcessorType", "unknown"), 51 | "cpu_model": processor_data.get("Model", "unknown"), 52 | "cpu_cores": str(processor_data.get("TotalCores", "unknown")), 53 | "cpu_threads": str(processor_data.get("TotalThreads", "unknown")), 54 | } 55 | current_labels.update(self.col.labels) 56 | 57 | self.add_metric_sample( 58 | "redfish_health", 59 | {"Health": proc_status}, 60 | "Health", 61 | current_labels 62 | ) 63 | 64 | def get_storage_health(self): 65 | """Get the Storage data from the Redfish API.""" 66 | logging.debug("Target %s: Get the storage health data.", self.col.target) 67 | storage_collection = self.col.connect_server(self.col.urls["Storage"]) 68 | 69 | if not storage_collection: 70 | return 71 | 72 | for controller in storage_collection["Members"]: 73 | controller_data = self.col.connect_server(controller["@odata.id"]) 74 | if not controller_data: 75 | continue 76 | 77 | controller_details = self.get_controller_details(controller_data) 78 | controller_name = self.get_controller_name(controller_details, controller_data) 79 | controller_status = self.extract_health_status( 80 | controller_details, "Controller", controller_name 81 | ) 82 | 83 | current_labels = self.get_controller_labels(controller_details, controller_name) 84 | self.add_metric_sample( 85 | "redfish_health", 86 | {"Health": controller_status}, 87 | "Health", 88 | current_labels 89 | ) 90 | 91 | for disk in controller_data["Drives"]: 92 | disk_data = self.col.connect_server(disk["@odata.id"]) 93 | if not disk_data: 94 | continue 95 | 96 | disk_status = self.extract_health_status( 97 | disk_data, 98 | "Disk", 99 | disk_data.get("Name", "unknown") 100 | ) 101 | current_labels = self.get_disk_labels(disk_data) 102 | self.add_metric_sample( 103 | "redfish_health", 104 | {"Health": disk_status}, 105 | "Health", 106 | current_labels 107 | ) 108 | 109 | def get_controller_details(self, controller_data): 110 | """Get controller details from controller data.""" 111 | if controller_data.get("StorageControllers"): 112 | if isinstance(controller_data["StorageControllers"], list): 113 | return controller_data["StorageControllers"][0] 114 | return list(controller_data["StorageControllers"].values())[0] 115 | return controller_data 116 | 117 | def get_controller_name(self, controller_details, controller_data): 118 | """Get controller name from controller details or data.""" 119 | return controller_details.get("Name") or controller_data.get("Name", "unknown") 120 | 121 | def extract_health_status(self, data, device_type, device_name): 122 | """Extract health status from data.""" 123 | if "Status" not in data: 124 | return math.nan 125 | 126 | status = data["Status"] 127 | if isinstance(status, str): 128 | return self.col.status[status.lower()] 129 | 130 | status = {k.lower(): v for k, v in status.items()} 131 | state = status.get("state") 132 | if state is None or state.lower() == "absent": 133 | logging.debug( 134 | "Target %s: Host %s, Model %s, %s %s: absent.", 135 | self.col.target, 136 | self.col.host, 137 | self.col.model, 138 | device_type, 139 | device_name 140 | ) 141 | return math.nan 142 | 143 | health = status.get("health", "") 144 | if not health: 145 | logging.warning( 146 | "Target %s: No %s health data provided for %s!", 147 | self.col.target, 148 | device_type, 149 | device_name 150 | ) 151 | return math.nan 152 | 153 | return self.col.status[health.lower()] 154 | 155 | def get_controller_labels(self, controller_details, controller_name): 156 | """Generate labels for Controller.""" 157 | labels = { 158 | "device_type": "storage", 159 | "device_name": controller_name, 160 | "device_manufacturer": controller_details.get("Manufacturer", "unknown"), 161 | "controller_model": controller_details.get("Model", "unknown"), 162 | } 163 | labels.update(self.col.labels) 164 | return labels 165 | 166 | def get_disk_labels(self, disk_data): 167 | """Generate labels for Disk.""" 168 | disk_attributes = { 169 | "Name": "device_name", 170 | "MediaType": "disk_type", 171 | "Manufacturer": "device_manufacturer", 172 | "Model": "disk_model", 173 | "CapacityBytes": "disk_capacity", 174 | "Protocol": "disk_protocol", 175 | } 176 | labels = {"device_type": "disk"} 177 | for disk_attribute, label_name in disk_attributes.items(): 178 | if disk_attribute in disk_data: 179 | labels[label_name] = str(disk_data[disk_attribute]) 180 | labels.update(self.col.labels) 181 | return labels 182 | 183 | def get_chassis_health(self): 184 | """Get the Chassis data from the Redfish API.""" 185 | logging.debug("Target %s: Get the Chassis health data.", self.col.target) 186 | chassis_data = self.col.connect_server(self.col.urls["Chassis"]) 187 | if not chassis_data: 188 | return 189 | 190 | current_labels = { 191 | "device_type": "chassis", 192 | "device_name": chassis_data["Name"] 193 | } 194 | current_labels.update(self.col.labels) 195 | chassis_health = self.extract_health_status(chassis_data, "Chassis", chassis_data["Name"]) 196 | self.add_metric_sample( 197 | "redfish_health", 198 | {"Health": chassis_health}, 199 | "Health", 200 | current_labels 201 | ) 202 | 203 | def get_power_health(self): 204 | """Get the Power data from the Redfish API.""" 205 | logging.debug("Target %s: Get the PDU health data.", self.col.target) 206 | power_data = self.col.connect_server(self.col.urls["Power"]) 207 | if not power_data: 208 | return 209 | 210 | for psu in power_data["PowerSupplies"]: 211 | psu_name = psu["Name"] if "Name" in psu and psu["Name"] is not None else "unknown" 212 | psu_model = psu["Model"] if "Model" in psu and psu["Model"] is not None else "unknown" 213 | 214 | current_labels = { 215 | "device_type": "powersupply", 216 | "device_name": psu_name, 217 | "device_model": psu_model 218 | } 219 | current_labels.update(self.col.labels) 220 | psu_health = self.extract_health_status(psu, "PSU", psu_name) 221 | self.add_metric_sample( 222 | "redfish_health", 223 | {"Health": psu_health}, 224 | "Health", 225 | current_labels 226 | ) 227 | 228 | def get_thermal_health(self): 229 | """Get the Thermal data from the Redfish API.""" 230 | logging.debug("Target %s: Get the thermal health data.", self.col.target) 231 | thermal_data = self.col.connect_server(self.col.urls["Thermal"]) 232 | if not thermal_data: 233 | return 234 | 235 | for fan in thermal_data["Fans"]: 236 | fan_name = fan.get("Name", "unknown") 237 | current_labels = { 238 | "device_type": "fan", 239 | "device_name": fan_name 240 | } 241 | current_labels.update(self.col.labels) 242 | fan_health = self.extract_health_status(fan, "Fan", fan_name) 243 | self.add_metric_sample( 244 | "redfish_health", 245 | {"Health": fan_health}, 246 | "Health", 247 | current_labels 248 | ) 249 | 250 | def get_memory_health(self): 251 | """Get the Memory data from the Redfish API.""" 252 | logging.debug("Target %s: Get the Memory data.", self.col.target) 253 | 254 | memory_collection = self.col.connect_server(self.col.urls["Memory"]) 255 | if not memory_collection: 256 | return 257 | 258 | for dimm_url in memory_collection["Members"]: 259 | dimm_info = self.col.connect_server(dimm_url["@odata.id"]) 260 | if not dimm_info: 261 | continue 262 | 263 | dimm_health = self.extract_health_status( 264 | dimm_info, 265 | "Dimm", 266 | dimm_info.get("Name", "unknown") 267 | ) 268 | if dimm_health is math.nan: 269 | logging.debug( 270 | "Target %s: Host %s, Model %s, Dimm %s: No health data found.", 271 | self.col.target, 272 | self.col.host, 273 | self.col.model, 274 | dimm_info['Name'] 275 | ) 276 | continue 277 | 278 | current_labels = self.get_dimm_labels(dimm_info) 279 | self.add_metric_sample( 280 | "redfish_health", 281 | {"Health": dimm_health}, 282 | "Health", 283 | current_labels 284 | ) 285 | 286 | if "Metrics" in dimm_info: 287 | self.process_dimm_metrics(dimm_info, current_labels) 288 | 289 | def get_dimm_labels(self, dimm_info): 290 | """Generate labels for DIMM.""" 291 | labels = { 292 | "device_type": "memory", 293 | "device_name": dimm_info["Name"], 294 | "dimm_capacity": str(dimm_info["CapacityMiB"]), 295 | "dimm_speed": str(dimm_info.get("OperatingSpeedMhz", "unknown")), 296 | "dimm_type": dimm_info["MemoryDeviceType"], 297 | "device_manufacturer": dimm_info.get("Manufacturer", "N/A") 298 | } 299 | 300 | if "Oem" in dimm_info and "Hpe" in dimm_info["Oem"]: 301 | labels["device_manufacturer"] = dimm_info["Oem"]["Hpe"].get("VendorName", "unknown") 302 | 303 | labels.update(self.col.labels) 304 | return labels 305 | 306 | def process_dimm_metrics(self, dimm_info, current_labels): 307 | """Process DIMM metrics.""" 308 | dimm_metrics = self.col.connect_server(dimm_info["Metrics"]["@odata.id"]) 309 | if not dimm_metrics: 310 | return 311 | 312 | health_data = dimm_metrics.get("HealthData", {}).get("AlarmTrips", {}) 313 | self.add_metric_sample( 314 | "redfish_memory_correctable", 315 | health_data, 316 | "CorrectableECCError", 317 | current_labels 318 | ) 319 | 320 | self.add_metric_sample( 321 | "redfish_memory_uncorrectable", 322 | health_data, 323 | "UncorrectableECCError", 324 | current_labels 325 | ) 326 | 327 | def add_metric_sample(self, metric_name, data, key, labels): 328 | """Add a sample to the specified metric.""" 329 | try: 330 | value = int(data[key]) if data.get(key) is not None else math.nan 331 | except (ValueError, TypeError): 332 | value = math.nan 333 | 334 | if math.isnan(value): 335 | logging.debug( 336 | "Target %s: Host %s, Model %s, Name %s: No %s Metrics found.", 337 | self.col.target, 338 | self.col.host, 339 | self.col.model, 340 | labels["device_name"], 341 | key 342 | ) 343 | else: 344 | if metric_name == "redfish_health": 345 | metric_family = self.health_metrics 346 | else: 347 | metric_family = getattr(self, f"mem_metrics_{metric_name.split('_')[-1]}") 348 | metric_family.add_sample(metric_name, value=value, labels=labels) 349 | 350 | def collect_health_data(self, url_key): 351 | """Helper method to collect health data.""" 352 | health_function_name = f"get_{url_key.lower()}_health" 353 | health_function = getattr(self, health_function_name, None) 354 | if health_function and self.col.urls[url_key]: 355 | health_function() 356 | else: 357 | warning_message = f"No {url_key} URL provided! Cannot get {url_key} data!" 358 | logging.warning("Target %s: %s", self.col.target, warning_message) 359 | 360 | def collect(self): 361 | """Collect the health data.""" 362 | logging.info("Target %s: Collecting health data ...", self.col.target) 363 | 364 | current_labels = {"device_type": "system", "device_name": "summary"} 365 | current_labels.update(self.col.labels) 366 | self.add_metric_sample( 367 | "redfish_health", 368 | {"Health": self.col.server_health}, 369 | "Health", 370 | current_labels 371 | ) 372 | 373 | for url_key in ["Processors", "Storage", "Chassis", "Power", "Thermal", "Memory"]: 374 | self.collect_health_data(url_key) 375 | 376 | def __exit__(self, exc_type, exc_val, exc_tb): 377 | if exc_tb is not None: 378 | logging.exception( 379 | "Target %s: An exception occured in %s:%s", 380 | self.col.target, 381 | exc_tb.tb_frame.f_code.co_filename, 382 | exc_tb.tb_lineno 383 | ) 384 | -------------------------------------------------------------------------------- /collectors/performance_collector.py: -------------------------------------------------------------------------------- 1 | """Collects performance, thermal and power information from the Redfish API like.""" 2 | import logging 3 | import math 4 | from prometheus_client.core import GaugeMetricFamily 5 | 6 | class PerformanceCollector: 7 | """Collects performance information from the Redfish API.""" 8 | def __enter__(self): 9 | return self 10 | 11 | def __init__(self, redfish_metrics_collector): 12 | 13 | self.col = redfish_metrics_collector 14 | 15 | self.performance_metrics = GaugeMetricFamily( 16 | "redfish_performance", 17 | "Redfish Server Monitoring Performance Data", 18 | labels=self.col.labels, 19 | ) 20 | self.power_metrics = GaugeMetricFamily( 21 | "redfish_power", 22 | "Redfish Server Monitoring Power Data", 23 | labels=self.col.labels, 24 | ) 25 | self.temperature_metrics = GaugeMetricFamily( 26 | "redfish_temperature", 27 | "Redfish Server Monitoring Temperature Data", 28 | labels=self.col.labels, 29 | unit="Celsius" 30 | ) 31 | 32 | def get_power_metrics(self): 33 | """Get the Power data from the Redfish API.""" 34 | logging.info("Target %s: Get the PDU Power data.", self.col.target) 35 | no_psu_metrics = True 36 | 37 | if self.col.urls['PowerSubsystem']: 38 | no_psu_metrics = self.get_power_subsystem_metrics() 39 | 40 | # fall back to deprecated URL 41 | if self.col.urls['Power'] and no_psu_metrics: 42 | self.get_old_power_metrics() 43 | 44 | if no_psu_metrics: 45 | logging.warning( 46 | "Target %s, Host %s, Model %s: No power url found.", 47 | self.col.target, 48 | self.col.host, 49 | self.col.model 50 | ) 51 | 52 | def get_power_subsystem_metrics(self): 53 | '''Get the PowerSubsystem data from the Redfish API.''' 54 | no_psu_metrics = True 55 | power_supplies_url = None 56 | 57 | logging.debug("Target %s:Checking PowerSubsystem ...", self.col.target) 58 | power_subsystem = self.col.connect_server(self.col.urls['PowerSubsystem']) 59 | metrics = ['CapacityWatts', 'Allocation'] 60 | 61 | for metric in metrics: 62 | if metric not in power_subsystem: 63 | continue 64 | 65 | if isinstance(power_subsystem[metric], dict): 66 | for submetric in power_subsystem[metric]: 67 | current_labels = {'type': submetric} 68 | current_labels.update(self.col.labels) 69 | power_metric_value = ( 70 | math.nan 71 | if power_subsystem[metric][submetric] is None 72 | else power_subsystem[metric][submetric] 73 | ) 74 | self.power_metrics.add_sample( 75 | "redfish_power", 76 | value=power_metric_value, 77 | labels=current_labels 78 | ) 79 | else: 80 | current_labels = {'type': metric} 81 | current_labels.update(self.col.labels) 82 | power_metric_value = ( 83 | math.nan 84 | if power_subsystem[metric] is None 85 | else power_subsystem[metric] 86 | ) 87 | self.power_metrics.add_sample( 88 | "redfish_power", 89 | value=power_metric_value, 90 | labels=current_labels 91 | ) 92 | 93 | power_supplies_url = power_subsystem.get('PowerSupplies', {}).get('@odata.id') 94 | 95 | if not power_supplies_url: 96 | logging.warning( 97 | "Target %s, Host %s, Model %s: No power supplies url found.", 98 | self.col.target, 99 | self.col.host, 100 | self.col.model 101 | ) 102 | return no_psu_metrics 103 | 104 | power_supplies = self.col.connect_server(power_supplies_url) 105 | 106 | if 'Members' in power_supplies: 107 | power_supplies = power_supplies['Members'] 108 | 109 | for power_supply in power_supplies: 110 | no_psu_metrics = self.get_power_supply_metrics(power_supply) 111 | 112 | return no_psu_metrics 113 | 114 | def get_power_supply_metrics(self, power_supply): 115 | """Get power supply metrics and update labels.""" 116 | fields = ["Name", "Manufacturer", "Model"] 117 | metrics = ["PowerInputWatts", "PowerOutputWatts", "PowerCapacityWatts", "InputPowerWatts", "OutputPowerWatts"] 118 | no_psu_metrics = True 119 | 120 | 121 | power_supply_labels = {} 122 | power_supply_data = self.col.connect_server(power_supply['@odata.id']) 123 | 124 | if 'Metrics' not in power_supply_data: 125 | logging.warning( 126 | "Target %s, Host %s, Model %s: No power supply metrics url found for %s.", 127 | self.col.target, 128 | self.col.host, 129 | self.col.model, 130 | power_supply_data.get('Name', 'unknown') 131 | ) 132 | return no_psu_metrics 133 | 134 | for field in fields: 135 | power_supply_labels.update({field: power_supply_data.get(field, 'unknown')}) 136 | 137 | power_supply_labels.update(self.col.labels) 138 | 139 | power_supply_metrics_url = power_supply_data['Metrics']['@odata.id'] 140 | power_supply_metrics = self.col.connect_server(power_supply_metrics_url) 141 | 142 | no_psu_metrics = False 143 | for metric in metrics: 144 | current_labels = {'type': metric} 145 | current_labels.update(power_supply_labels) 146 | if metric not in power_supply_metrics: 147 | continue 148 | 149 | power_metric_value = ( 150 | math.nan 151 | if power_supply_metrics[metric]['Reading'] is None 152 | else power_supply_metrics[metric]['Reading'] 153 | ) 154 | self.power_metrics.add_sample( 155 | "redfish_power", value=power_metric_value, labels=current_labels 156 | ) 157 | 158 | return no_psu_metrics 159 | 160 | 161 | def get_old_power_metrics(self): 162 | """Get the Power data from the Redfish API.""" 163 | logging.debug("Target %s: Fallback to deprecated Power URL.", self.col.target) 164 | 165 | no_psu_metrics = True 166 | 167 | power_data = self.col.connect_server(self.col.urls['Power']) 168 | if not power_data: 169 | return no_psu_metrics 170 | 171 | metrics = [ 172 | 'PowerOutputWatts', 173 | 'EfficiencyPercent', 174 | 'PowerInputWatts', 175 | 'LineInputVoltage' 176 | ] 177 | 178 | for psu in power_data['PowerSupplies']: 179 | psu_name = ( 180 | 'unknown' 181 | if psu.get('Name', 'unknown') is None 182 | else psu.get('Name', 'unknown') 183 | ) 184 | psu_model = ( 185 | 'unknown' 186 | if psu.get('Model', 'unknown') is None 187 | else psu.get('Model', 'unknown') 188 | ) 189 | 190 | for metric in metrics: 191 | if metric not in psu: 192 | continue 193 | 194 | no_psu_metrics = False 195 | power_metric_value = ( 196 | math.nan 197 | if psu[metric] is None 198 | else psu[metric] 199 | ) 200 | 201 | current_labels = { 202 | 'device_name': psu_name, 203 | 'device_model': psu_model, 204 | 'type': metric 205 | } 206 | current_labels.update(self.col.labels) 207 | self.power_metrics.add_sample( 208 | "redfish_power", 209 | value=power_metric_value, 210 | labels=current_labels 211 | ) 212 | 213 | return no_psu_metrics 214 | 215 | def get_temp_metrics(self): 216 | """Get the Thermal data from the Redfish API.""" 217 | logging.info("Target %s: Get the Thermal data.", self.col.target) 218 | 219 | if self.col.urls['ThermalSubsystem']: 220 | thermal_subsystem = self.col.connect_server(self.col.urls['ThermalSubsystem']) 221 | thermal_metrics_url = thermal_subsystem['ThermalMetrics']['@odata.id'] 222 | result = self.col.connect_server(thermal_metrics_url) 223 | thermal_metrics = result.get('TemperatureSummaryCelsius', {}) 224 | 225 | for metric in thermal_metrics: 226 | current_labels = {'type': metric} 227 | current_labels.update(self.col.labels) 228 | thermal_metric_value = ( 229 | math.nan 230 | if thermal_metrics[metric]['Reading'] is None 231 | else thermal_metrics[metric]['Reading'] 232 | ) 233 | self.temperature_metrics.add_sample( 234 | "redfish_temperature", value=thermal_metric_value, labels=current_labels 235 | ) 236 | 237 | def collect(self): 238 | """Collects performance information from the Redfish API.""" 239 | logging.info("Target %s: Collecting performance data ...",self.col.target) 240 | self.get_power_metrics() 241 | self.get_temp_metrics() 242 | 243 | def __exit__(self, exc_type, exc_val, exc_tb): 244 | if exc_tb is not None: 245 | logging.exception( 246 | "Target %s: An exception occured in %s:%s", 247 | self.col.target, 248 | exc_tb.tb_frame.f_code.co_filename, 249 | exc_tb.tb_lineno 250 | ) 251 | -------------------------------------------------------------------------------- /config.yml: -------------------------------------------------------------------------------- 1 | listen_port: 9220 2 | timeout: 10 3 | job: 'redfish-myjob' 4 | username: admin 5 | password: admin 6 | -------------------------------------------------------------------------------- /dockerbuild.ps1: -------------------------------------------------------------------------------- 1 | $image = "redfish-exporter" 2 | 3 | $version = get-date -Format yyyyMMddHHmmss 4 | 5 | docker login keppel.eu-de-1.cloud.sap 6 | docker build . -t keppel.eu-de-1.cloud.sap/ccloud/${image}:$version 7 | docker image tag keppel.eu-de-1.cloud.sap/ccloud/${image}:$version keppel.eu-de-1.cloud.sap/ccloud/${image}:latest 8 | docker push keppel.eu-de-1.cloud.sap/ccloud/${image} --all-tags 9 | -------------------------------------------------------------------------------- /handler.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains the handler classes for the Falcon web server. 3 | """ 4 | 5 | import logging 6 | import socket 7 | import re 8 | import os 9 | import traceback 10 | import falcon 11 | 12 | from prometheus_client.exposition import CONTENT_TYPE_LATEST 13 | from prometheus_client.exposition import generate_latest 14 | 15 | from collector import RedfishMetricsCollector 16 | 17 | # pylint: disable=no-member 18 | 19 | class WelcomePage: 20 | """ 21 | Create the Welcome page for the API. 22 | """ 23 | 24 | def on_get(self, req, resp): 25 | """ 26 | Define the GET method for the API. 27 | """ 28 | 29 | resp.status = falcon.HTTP_200 30 | resp.content_type = 'text/html' 31 | resp.text = """ 32 |

Redfish Exporter

33 |

Prometheus Exporter for redfish API based servers monitoring

34 |
    35 |
  • Health Metrics: Use /health to retrieve health-related metrics, such as system status, memory errors, and power state.
  • 36 |
  • Firmware Metrics: Use /firmware to retrieve firmware version information for the server components.
  • 37 |
  • Performance Metrics: Use /performance to retrieve performance-related metrics like power consumption and temperature data.
  • 38 |
39 | """ 40 | 41 | class MetricsHandler: 42 | """ 43 | Metrics Handler for the Falcon API. 44 | """ 45 | 46 | def __init__(self, config, metrics_type): 47 | self._config = config 48 | self.metrics_type = metrics_type 49 | 50 | def on_get(self, req, resp): 51 | """ 52 | Define the GET method for the API. 53 | """ 54 | target = req.get_param("target") 55 | if not target: 56 | logging.error("No target parameter provided!") 57 | raise falcon.HTTPMissingParam("target") 58 | 59 | job = req.get_param("job") 60 | if not job: 61 | logging.error("Target %s: No job provided!", target) 62 | raise falcon.HTTPMissingParam("job") 63 | 64 | logging.debug("Received Target %s with Job %s", target, job) 65 | 66 | ip_re = re.compile( 67 | r"^(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}" 68 | r"([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])$" 69 | ) 70 | 71 | resp.set_header("Content-Type", CONTENT_TYPE_LATEST) 72 | 73 | host = None 74 | if ip_re.match(target): 75 | logging.debug("Target %s: Target is an IP Address.", target) 76 | try: 77 | host = socket.gethostbyaddr(target)[0] 78 | except socket.herror as err: 79 | logging.warning("Target %s: Reverse DNS lookup failed: %s. Using IP address as host.", target, err) 80 | host = target 81 | else: 82 | logging.debug("Target %s: Target is a hostname.", target) 83 | host = target 84 | try: 85 | target = socket.gethostbyname(host) 86 | except socket.gaierror as err: 87 | msg = f"Target {target}: DNS lookup failed: {err}" 88 | logging.error(msg) 89 | raise falcon.HTTPInvalidParam(msg, "target") 90 | 91 | usr_env_var = job.replace("-", "_").upper() + "_USERNAME" 92 | pwd_env_var = job.replace("-", "_").upper() + "_PASSWORD" 93 | usr = os.getenv(usr_env_var, self._config.get("username")) 94 | pwd = os.getenv(pwd_env_var, self._config.get("password")) 95 | 96 | if not usr or not pwd: 97 | msg = ( 98 | f"Target {target}: " 99 | "Unknown job provided or " 100 | f"no user/password found in environment and config file: {job}" 101 | ) 102 | logging.error(msg) 103 | raise falcon.HTTPInvalidParam(msg, "job") 104 | 105 | logging.debug("Target %s: Using user %s", target, usr) 106 | 107 | with RedfishMetricsCollector( 108 | self._config, 109 | target = target, 110 | host = host, 111 | usr = usr, 112 | pwd = pwd, 113 | metrics_type = self.metrics_type 114 | ) as registry: 115 | 116 | # open a session with the remote board 117 | registry.get_session() 118 | 119 | try: 120 | # collect the actual metrics 121 | resp.text = generate_latest(registry) 122 | resp.status = falcon.HTTP_200 123 | 124 | except Exception: 125 | message = f"Exception: {traceback.format_exc()}" 126 | logging.error("Target %s: %s", target, message) 127 | raise falcon.HTTPBadRequest(description=message) 128 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | """ 2 | Redfish Prometheus Exporter 3 | """ 4 | import argparse 5 | import logging 6 | import os 7 | import warnings 8 | import sys 9 | 10 | from wsgiref.simple_server import make_server, WSGIServer, WSGIRequestHandler 11 | from socketserver import ThreadingMixIn 12 | import yaml 13 | 14 | import falcon 15 | 16 | from handler import MetricsHandler 17 | from handler import WelcomePage 18 | 19 | class _SilentHandler(WSGIRequestHandler): 20 | """WSGI handler that does not log requests.""" 21 | 22 | def log_message(self, format, *args): # pylint: disable=redefined-builtin 23 | """Log nothing.""" 24 | 25 | 26 | class ThreadingWSGIServer(ThreadingMixIn, WSGIServer): 27 | """Thread per request HTTP server.""" 28 | 29 | def falcon_app(config): 30 | """ 31 | Start the Falcon API 32 | """ 33 | port = int(os.getenv("LISTEN_PORT", config.get("listen_port", 9200))) 34 | addr = "0.0.0.0" 35 | logging.info("Starting Redfish Prometheus Server ...") 36 | 37 | api = falcon.API() 38 | api.add_route("/health", MetricsHandler(config, metrics_type='health')) 39 | api.add_route("/firmware", MetricsHandler(config, metrics_type='firmware')) 40 | api.add_route("/performance", MetricsHandler(config, metrics_type='performance')) 41 | api.add_route("/", WelcomePage()) 42 | 43 | with make_server(addr, port, api, ThreadingWSGIServer, handler_class=_SilentHandler) as httpd: 44 | httpd.daemon = True # pylint: disable=attribute-defined-outside-init 45 | logging.info("Listening on Port %s", port) 46 | try: 47 | httpd.serve_forever() 48 | except (KeyboardInterrupt, SystemExit): 49 | logging.info("Stopping Redfish Prometheus Server") 50 | 51 | def enable_logging(filename, debug): 52 | """enable logging""" 53 | logger = logging.getLogger() 54 | 55 | formatter = logging.Formatter( 56 | '%(asctime)-15s %(process)d %(filename)24s:%(lineno)-3d %(levelname)-7s %(message)s' 57 | ) 58 | 59 | if debug: 60 | logger.setLevel("DEBUG") 61 | else: 62 | logger.setLevel("INFO") 63 | 64 | sh = logging.StreamHandler() 65 | sh.setFormatter(formatter) 66 | logger.addHandler(sh) 67 | 68 | if filename: 69 | try: 70 | fh = logging.FileHandler(filename, mode='w') 71 | except FileNotFoundError as e: 72 | logging.error("Could not open logfile %s: %s", filename, e) 73 | sys.exit(1) 74 | 75 | fh.setFormatter(formatter) 76 | logger.addHandler(fh) 77 | 78 | def get_args(): 79 | """ 80 | Get the command line arguments 81 | """ 82 | parser = argparse.ArgumentParser() 83 | parser.add_argument( 84 | "-c", 85 | "--config", 86 | help="Specify config yaml file", 87 | metavar="FILE", 88 | required=False, 89 | default="config.yml" 90 | ) 91 | parser.add_argument( 92 | "-l", 93 | "--logging", 94 | help="Log all messages to a file", 95 | metavar="FILE", 96 | required=False 97 | ) 98 | parser.add_argument( 99 | "-d", "--debug", 100 | help="Debugging mode", 101 | action="store_true", 102 | required=False 103 | ) 104 | 105 | return parser.parse_args() 106 | 107 | 108 | if __name__ == "__main__": 109 | 110 | call_args = get_args() 111 | 112 | warnings.filterwarnings("ignore") 113 | 114 | enable_logging(call_args.logging, call_args.debug) 115 | 116 | # get the config 117 | 118 | if call_args.config: 119 | try: 120 | with open(call_args.config, "r", encoding="utf8") as config_file: 121 | configuration = yaml.load(config_file.read(), Loader=yaml.FullLoader) 122 | except FileNotFoundError as err: 123 | print(f"Config File not found: {err}") 124 | sys.exit(1) 125 | 126 | falcon_app(configuration) 127 | -------------------------------------------------------------------------------- /redfish-exporter.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export DEBIAN_FRONTEND=noninteractive 4 | apt-get update 5 | apt-get install -y python3 6 | apt-get install -y python3-pip 7 | 8 | pip3 install --no-cache-dir -r requirements.txt 9 | 10 | PYTHONPATH=. python3 main.py "$@" 11 | 12 | -------------------------------------------------------------------------------- /renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": [ 3 | "config:recommended" 4 | ] 5 | } 6 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | prometheus-client 3 | falcon 4 | argparse 5 | pyyaml 6 | pyOpenSSL --------------------------------------------------------------------------------