├── .github
    └── dependabot.yml
├── .gitignore
├── .gitmodules
├── .pylintrc
├── CONTRIBUTING.md
├── Dockerfile.cloud-energy
├── Dockerfile.cloud-energy-asciicharts
├── LICENSE
├── README.md
├── auto_detect.py
├── data
    ├── spec_data.csv
    ├── spec_data_cleaned.csv
    └── spec_data_cleaned_unmelted.csv
├── demo-reporter
    ├── cpu-utilization.c
    ├── cpu-utilization_mac.c
    ├── static-binary-linux-amd64
    └── static-binary-linux-arm64
├── hyperparameter_tuning.py
├── img
    ├── fujitsu_TX1330_SPEC.png
    ├── fujitsu_TX1330_measured.png
    └── hp_synergy_480_Gen10_Plus.png
├── interact_validation.py
├── ols.py
├── requirements-dev.txt
├── requirements-docker.txt
├── requirements.txt
├── scripts
    ├── create_data_csv.py
    ├── data_cleaning.py
    └── include
    │   └── helper_functions.py
└── xgb.py


/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | # To get started with Dependabot version updates, you'll need to specify which
 2 | # package ecosystems to update and where the package manifests are located.
 3 | # Please see the documentation for all configuration options:
 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
 5 | 
 6 | version: 2
 7 | updates:
 8 |   - package-ecosystem: "pip" # See documentation for possible values
 9 |     directory: "/" # Location of package manifests
10 |     schedule:
11 |       interval: "daily"
12 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | venv
2 | .DS_Store
3 | __pycache__
4 | sftp-config.json
5 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "data/raw"]
2 | 	path = data/raw
3 | 	url = https://github.com/green-coding-solutions/spec-power-raw-data
4 | 


--------------------------------------------------------------------------------
/.pylintrc:
--------------------------------------------------------------------------------
 1 | [FORMAT]
 2 | 
 3 | # Maximum number of characters on a single line.
 4 | max-line-length=120
 5 | 
 6 | 
 7 | [MESSAGES CONTROL]
 8 | 
 9 | disable=missing-function-docstring,
10 |     missing-module-docstring,
11 |     missing-class-docstring,
12 |     too-few-public-methods,
13 |     duplicate-code,
14 |     too-many-nested-blocks,
15 |     line-too-long,
16 |     too-many-boolean-expressions,
17 |     too-many-nested-blocks,
18 |     line-too-long,
19 |     protected-access,
20 |     too-many-lines,
21 |     multiple-statements,
22 |     pointless-string-statement,
23 |     too-many-locals,
24 |     too-many-public-methods,
25 |     too-many-branches,
26 |     too-many-statements,
27 |     too-many-arguments,
28 |     too-many-return-statements,
29 |     too-many-instance-attributes,
30 |     invalid-name,
31 |     wrong-import-position,
32 |     wrong-import-order,
33 |     ungrouped-imports,
34 |     fixme
35 | 
36 | 
37 | 
38 | [MASTER]
39 | ignore=env
40 | ignore-patterns=^env.*
41 | ignore-paths=^env.*$


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
  1 | # Contributing
  2 | 
  3 | We are super happy if you are interested in contributing to the Green Metrics Tool.
  4 | 
  5 | All contributions should be done via Pull Requests.
  6 | 
  7 | Please see our: [Contribution Guidelines](https://docs.green-coding.org/docs/contributing/green-metrics-tool-contribution/)
  8 | 
  9 | ## Python conventions
 10 | 
 11 | We adhere to standard ([PEP8][pep8]) python conventions for the most part.
 12 | Here we address a few grey areas and stylistic choices.
 13 | 
 14 | - quotes
 15 |   - use `'` for fixed constant strings
 16 |   - use `"` for modifiable and format strings
 17 |   - break this rule if needed to avoid escaping
 18 |     - e.g., `re.compile(r'say "hi"')`
 19 | 
 20 | - multiline component imports should be in parens, one component per line,
 21 |   alphabetical
 22 | 
 23 |     ```python
 24 | 	from foo import (
 25 | 		bar,
 26 | 		bazzle,
 27 | 		otherstuff,
 28 | 	)
 29 | 	```
 30 | 
 31 | - try to put class-specific constants inside class
 32 |   - e.g., `Membership.MEMBER_TYPE_PATIENT`
 33 | 
 34 | - same for class-specific exceptions
 35 |   - e.g., `MetricDefinition.MetricMissingData`
 36 | 
 37 | - use `foo.pk is None` instead of `foo._state.adding` to test for insertion in
 38 |   pre-save hooks
 39 |   - this is not totally obvious in public convention, but we decided we don't
 40 |     like accessing vars that start with `_` if we can help it
 41 |   - avoid use of `# pylint: disable=protected-access`
 42 | 
 43 | - add tests for any use of variables which start with `_`
 44 |   - whenever we access a variable like django's `model._meta`, we should make
 45 |     sure we write a test case which covers the particular use case so that we
 46 |     are informed of any behavior changes
 47 | 
 48 | ### Lint
 49 | 
 50 | We use `pylint` to check for clean code, with a few minor variations:
 51 | 
 52 | - Because of the way we handle a lot of imports we need to modify the `sys.path` to include some folders. This
 53 |   can not be understood by the linter so most of the import checks will not work and need disabling.
 54 | 
 55 | - We have also disabled most of the requirements for docstrings as they often tend to state the obvious.
 56 | 
 57 | - pylint has quite good code duplication finding capabilities which trigger a lot of our code which actually has a lot
 58 |   of duplicate code as for example the provider.py just sets a few things and then relies on the BaseProvider to
 59 |   do most of the heavy lifting. We could refactor this but this would lead to very unreadable code in the greater
 60 |   context of things.
 61 | 
 62 | To check if your code works you can call `pylint -j0 **.py` in the project directory.
 63 | 
 64 | We recommend that you set a pre-commit hook to lint your code every time you commit. This can be done by adding
 65 | ```
 66 | git diff --diff-filter=d --cached --name-only | grep -E -i '.py$' | xargs -r pylint -j0
 67 | ```
 68 | to a file named `./.git/hooks/pre-commit` and making it executable `chmod +x ./.git/hooks/pre-commit`
 69 | 
 70 | Because of the automatic checks, it is tempting to use the `# pylint: disable` directive.  Some
 71 | guidelines on when to use or avoid this directive follow:
 72 | 
 73 | - instead of disabling `unused-argument`, prefix the arg (or kwarg) with `_`
 74 | 
 75 | - if you _do_ add disable directives, be aware of where you place them.  They can unintentially
 76 |   effect more code than you indend.  The order of precidence should be:
 77 | 
 78 |   1. inline (only affects the current line)
 79 |   1. wrap small bits of code in `# pylint: disable` ... `# pylint: enable` blocks
 80 |   1. scope to method calls by placing within the method
 81 |   1. scope to class defs by placing within the class definition
 82 |   1. last resort: scope to module
 83 | 
 84 | ## Git Branching Strategy
 85 | 
 86 | Branch naming conventions:
 87 | 
 88 | - Use `<desc>-<issue number>` for feature/topic branches
 89 | - Use `wip-` as a prefix for branches are not stable and should not be branched off
 90 | - Prefer hyphens to underscores and lowercase branch names
 91 |   - `csv-reports-142` not `CSV_Reports_142`
 92 | 
 93 | Merging with dev:
 94 | 
 95 | 1. File an issue if one does not exist already
 96 | 2. Create a branch of the form `<desc>-<issue number>`
 97 | 3. Make sure your branch is up to date, then push to GitHub
 98 | 4. Create a pull request against `dev`
 99 | 5. Assign reviewers
100 | 6. Add commits to address feedback
101 | 7. Squash commits, as appropriate
102 | 8. Merge with dev
103 | 9. Delete your branch on GitHub
104 | 
105 | An example of this flow would be:
106 | 
107 | 1. Alice files issue #456 to build a CSV Reporting module
108 | 2. She creates the branch `csv-reports-456` with her commits
109 | 3. She opens a pull request against `dev` with her branch
110 | 4. She assigns Bob to review it
111 | 5. Bob leaves feedback for Alice to address
112 | 6. Alice pushes additional commits to address the feedback
113 | 7. Bob gives it a thumbsup and Alice squashes some commits
114 | 8. Alice merges the pull request with dev
115 | 9. Alice deletes branch `csv-reports-456`
116 | 
117 | In general, do not push directly to a branch unless you are the owner or have
118 | previously discussed it with the branch owner.
119 | 
120 | To update someone else's branch:
121 | 
122 | 1. Create `<branch name>-pr-<author>`
123 | 2. Add commits and create a PR against the original branch
124 | 3. Add the branch owner as a reviewer
125 | 4. Address feedback from the owner
126 | 5. The owner merges the branch when satisfied
127 | 6. Delete your branch
128 | 
129 | An example of this flow would be:
130 | 
131 | 1. Given Alice's branch `csv-reports-456`, Bob creates `csv-reports-456-pr-bob`
132 | 2. Bob adds commits and opens a PR against Alice's branch
133 | 3. Bob adds Alice as a reviewer
134 | 4. Alice leaves feedback for Bob, which Bob addresses
135 | 5. Alice merges Bob's pull request
136 | 6. Bob deletes branch `csv-reports-456-pr-bob`
137 | 
138 | ### Git Commit Messages
139 | 
140 | - Use the present tense in the subject
141 |   - "Add feature" not "Added feature"
142 | - Use the imperative mood in the subject
143 |   - "Move cursor to..." not "Moves cursor to..."
144 | - Leave a blank line between the subject and the body
145 | - Limit the subject to 50 characters or less
146 | - Limit the body to 72 columns or less (configure your editor to enforce this)
147 | - Reference issues and pull requests liberally in the description
148 | - When only changing documentation, include `[ci skip]` in the commit description
149 | - Include syntax to [automatically close][auto-close] issues
150 | - Squash commits to the extent that it improves clarity
151 |   - but don't combine patches for two issues in one commit
152 | 
153 | Rationale behind this is [here][tpope], [here][blog1], [here][blog2],
154 | [here][blog3], and [here][blog4]. For a contrarian view, see [here][holman].
155 | 
156 | Optional - consider starting the commit subject with an applicable emoji:
157 | 
158 | | emoji               | shorthand             | usage                       |
159 | | -----               | ---------             | -----                       |
160 | | :art:               | `:art:`               | Cosmetic/style changes      |
161 | | :racehorse:         | `:racehorse:`         | Performance                 |
162 | | :recycle:           | `:recycle:`           | Refactoring                 |
163 | | :sparkles:          | `:sparkles:`          | New feature                 |
164 | | :books:             | `:books:`             | Documentation               |
165 | | :card_index:        | `:card_index:`        | Metadata/fixtures           |
166 | | :wrench:            | `:wrench:`            | Tooling                     |
167 | | :floppy_disk:       | `:floppy_disk:`       | Data migration              |
168 | | :wastebasket:       | `:wastebasket:`       | Remove code/files           |
169 | | :bug:               | `:bug:`               | Bug fix                     |
170 | | :fire:              | `:fire:`              | Hotfix                      |
171 | | :poop:              | `:poop:`              | Deprecation                 |
172 | | :green_heart:       | `:green_heart:`       | Fixing the CI build         |
173 | | :white_check_mark:  | `:white_check_mark:`  | Adding tests                |
174 | | :lock:              | `:lock:`              | Dealing with security       |
175 | | :shirt:             | `:shirt:`             | Removing linter warnings    |
176 | | :arrow_up:          | `:arrow_up:`          | Upgrading dependencies      |
177 | | :arrow_down:        | `:arrow_down:`        | Downgrading dependencies    |
178 | | :penguin:           | `:penguin:`           | Fixing something on Linux   |
179 | | :apple:             | `:apple:`             | Fixing something on macOS   |
180 | | :checkered_flag:    | `:checkered_flag:`    | Fixing something on Windows |
181 | | :non-potable_water: | `:non-potable_water:` | Fix memory leaks            |
182 | 
183 | [auto-close]: https://help.github.com/articles/closing-issues-via-commit-messages/
184 | [tpope]: https://tbaggery.com/2008/04/19/a-note-about-git-commit-messages.html
185 | [blog1]: https://who-t.blogspot.co.at/2009/12/on-commit-messages.html
186 | [blog2]: https://chris.beams.io/posts/git-commit/
187 | [blog3]: https://robots.thoughtbot.com/5-useful-tips-for-a-better-commit-message
188 | [blog4]: https://git-scm.com/book/en/v2/Distributed-Git-Contributing-to-a-Project
189 | [holman]: https://zachholman.com/posts/git-commit-history/
190 | [pep8]: https://www.python.org/dev/peps/pep-0008/
191 | 


--------------------------------------------------------------------------------
/Dockerfile.cloud-energy:
--------------------------------------------------------------------------------
 1 | # Stage 1: Build the binary
 2 | FROM python:3.12.3-slim-bullseye as builder
 3 | 
 4 | # Install build tools
 5 | RUN apt-get update && apt-get install -y wget build-essential
 6 | 
 7 | # Set the working directory
 8 | WORKDIR /app
 9 | 
10 | # Copy the source code
11 | COPY demo-reporter/cpu-utilization.c demo-reporter/cpu-utilization.c
12 | RUN gcc -o cpu-utilization demo-reporter/cpu-utilization.c
13 | 
14 | # we need to use debian image as many dependencies do not have compiled binaries in alpine and no GCC is installed
15 | FROM python:3.12.3-slim-bullseye
16 | 
17 | RUN pip install --upgrade pip
18 | RUN useradd -d /home/worker worker
19 | WORKDIR /home/worker
20 | RUN chown -R worker:worker /home/worker
21 | 
22 | USER worker
23 | 
24 | # Copy the compiled binary from the builder stage
25 | COPY --from=builder --chown=worker:worker /app/cpu-utilization .
26 | 
27 | COPY --chown=worker:worker requirements-docker.txt requirements.txt
28 | RUN --mount=type=cache,target=/home/worker/.cache/pip pip install --user -r requirements.txt
29 | 
30 | ENV PATH="/home/worker/.local/bin:${PATH}"
31 | 
32 | COPY --chown=worker:worker data/spec_data_cleaned.csv data/spec_data_cleaned.csv
33 | COPY --chown=worker:worker auto_detect.py auto_detect.py
34 | COPY --chown=worker:worker xgb.py xgb.py
35 | 
36 | CMD ["python"]


--------------------------------------------------------------------------------
/Dockerfile.cloud-energy-asciicharts:
--------------------------------------------------------------------------------
 1 | # Stage 1: Install go dependency
 2 | FROM python:3.12.3-slim-bullseye as builder
 3 | 
 4 | RUN apt-get update && apt-get install -y wget
 5 | 
 6 | # Set the working directory
 7 | WORKDIR /app
 8 | 
 9 | ARG TARGETARCH
10 | RUN if [ "$TARGETARCH" = "arm64" ]; then \
11 |         wget https://go.dev/dl/go1.21.4.linux-arm64.tar.gz; \
12 |         tar -C /usr/local -xzf go1.21.4.linux-arm64.tar.gz; \
13 |     else \
14 |         wget https://go.dev/dl/go1.21.4.linux-amd64.tar.gz; \
15 |         tar -C /usr/local -xzf go1.21.4.linux-amd64.tar.gz; \
16 |     fi
17 | 
18 | RUN /usr/local/go/bin/go install github.com/guptarohit/asciigraph/cmd/asciigraph@latest
19 | 
20 | # we need to use debian image as many dependencies do not have compiled binaries in alpine and no GCC is installed
21 | FROM greencoding/cloud-energy:latest
22 | 
23 | WORKDIR /home/worker
24 | 
25 | COPY --from=builder /usr/local/go /usr/local/go
26 | 
27 | USER worker
28 | 
29 | COPY --from=builder --chown=worker:worker /root/go /home/worker/go
30 | 
31 | CMD ["python"]


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                     GNU AFFERO GENERAL PUBLIC LICENSE
  2 |                        Version 3, 19 November 2007
  3 | 
  4 |  Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
  5 |  Everyone is permitted to copy and distribute verbatim copies
  6 |  of this license document, but changing it is not allowed.
  7 | 
  8 |                             Preamble
  9 | 
 10 |   The GNU Affero General Public License is a free, copyleft license for
 11 | software and other kinds of works, specifically designed to ensure
 12 | cooperation with the community in the case of network server software.
 13 | 
 14 |   The licenses for most software and other practical works are designed
 15 | to take away your freedom to share and change the works.  By contrast,
 16 | our General Public Licenses are intended to guarantee your freedom to
 17 | share and change all versions of a program--to make sure it remains free
 18 | software for all its users.
 19 | 
 20 |   When we speak of free software, we are referring to freedom, not
 21 | price.  Our General Public Licenses are designed to make sure that you
 22 | have the freedom to distribute copies of free software (and charge for
 23 | them if you wish), that you receive source code or can get it if you
 24 | want it, that you can change the software or use pieces of it in new
 25 | free programs, and that you know you can do these things.
 26 | 
 27 |   Developers that use our General Public Licenses protect your rights
 28 | with two steps: (1) assert copyright on the software, and (2) offer
 29 | you this License which gives you legal permission to copy, distribute
 30 | and/or modify the software.
 31 | 
 32 |   A secondary benefit of defending all users' freedom is that
 33 | improvements made in alternate versions of the program, if they
 34 | receive widespread use, become available for other developers to
 35 | incorporate.  Many developers of free software are heartened and
 36 | encouraged by the resulting cooperation.  However, in the case of
 37 | software used on network servers, this result may fail to come about.
 38 | The GNU General Public License permits making a modified version and
 39 | letting the public access it on a server without ever releasing its
 40 | source code to the public.
 41 | 
 42 |   The GNU Affero General Public License is designed specifically to
 43 | ensure that, in such cases, the modified source code becomes available
 44 | to the community.  It requires the operator of a network server to
 45 | provide the source code of the modified version running there to the
 46 | users of that server.  Therefore, public use of a modified version, on
 47 | a publicly accessible server, gives the public access to the source
 48 | code of the modified version.
 49 | 
 50 |   An older license, called the Affero General Public License and
 51 | published by Affero, was designed to accomplish similar goals.  This is
 52 | a different license, not a version of the Affero GPL, but Affero has
 53 | released a new version of the Affero GPL which permits relicensing under
 54 | this license.
 55 | 
 56 |   The precise terms and conditions for copying, distribution and
 57 | modification follow.
 58 | 
 59 |                        TERMS AND CONDITIONS
 60 | 
 61 |   0. Definitions.
 62 | 
 63 |   "This License" refers to version 3 of the GNU Affero General Public License.
 64 | 
 65 |   "Copyright" also means copyright-like laws that apply to other kinds of
 66 | works, such as semiconductor masks.
 67 | 
 68 |   "The Program" refers to any copyrightable work licensed under this
 69 | License.  Each licensee is addressed as "you".  "Licensees" and
 70 | "recipients" may be individuals or organizations.
 71 | 
 72 |   To "modify" a work means to copy from or adapt all or part of the work
 73 | in a fashion requiring copyright permission, other than the making of an
 74 | exact copy.  The resulting work is called a "modified version" of the
 75 | earlier work or a work "based on" the earlier work.
 76 | 
 77 |   A "covered work" means either the unmodified Program or a work based
 78 | on the Program.
 79 | 
 80 |   To "propagate" a work means to do anything with it that, without
 81 | permission, would make you directly or secondarily liable for
 82 | infringement under applicable copyright law, except executing it on a
 83 | computer or modifying a private copy.  Propagation includes copying,
 84 | distribution (with or without modification), making available to the
 85 | public, and in some countries other activities as well.
 86 | 
 87 |   To "convey" a work means any kind of propagation that enables other
 88 | parties to make or receive copies.  Mere interaction with a user through
 89 | a computer network, with no transfer of a copy, is not conveying.
 90 | 
 91 |   An interactive user interface displays "Appropriate Legal Notices"
 92 | to the extent that it includes a convenient and prominently visible
 93 | feature that (1) displays an appropriate copyright notice, and (2)
 94 | tells the user that there is no warranty for the work (except to the
 95 | extent that warranties are provided), that licensees may convey the
 96 | work under this License, and how to view a copy of this License.  If
 97 | the interface presents a list of user commands or options, such as a
 98 | menu, a prominent item in the list meets this criterion.
 99 | 
100 |   1. Source Code.
101 | 
102 |   The "source code" for a work means the preferred form of the work
103 | for making modifications to it.  "Object code" means any non-source
104 | form of a work.
105 | 
106 |   A "Standard Interface" means an interface that either is an official
107 | standard defined by a recognized standards body, or, in the case of
108 | interfaces specified for a particular programming language, one that
109 | is widely used among developers working in that language.
110 | 
111 |   The "System Libraries" of an executable work include anything, other
112 | than the work as a whole, that (a) is included in the normal form of
113 | packaging a Major Component, but which is not part of that Major
114 | Component, and (b) serves only to enable use of the work with that
115 | Major Component, or to implement a Standard Interface for which an
116 | implementation is available to the public in source code form.  A
117 | "Major Component", in this context, means a major essential component
118 | (kernel, window system, and so on) of the specific operating system
119 | (if any) on which the executable work runs, or a compiler used to
120 | produce the work, or an object code interpreter used to run it.
121 | 
122 |   The "Corresponding Source" for a work in object code form means all
123 | the source code needed to generate, install, and (for an executable
124 | work) run the object code and to modify the work, including scripts to
125 | control those activities.  However, it does not include the work's
126 | System Libraries, or general-purpose tools or generally available free
127 | programs which are used unmodified in performing those activities but
128 | which are not part of the work.  For example, Corresponding Source
129 | includes interface definition files associated with source files for
130 | the work, and the source code for shared libraries and dynamically
131 | linked subprograms that the work is specifically designed to require,
132 | such as by intimate data communication or control flow between those
133 | subprograms and other parts of the work.
134 | 
135 |   The Corresponding Source need not include anything that users
136 | can regenerate automatically from other parts of the Corresponding
137 | Source.
138 | 
139 |   The Corresponding Source for a work in source code form is that
140 | same work.
141 | 
142 |   2. Basic Permissions.
143 | 
144 |   All rights granted under this License are granted for the term of
145 | copyright on the Program, and are irrevocable provided the stated
146 | conditions are met.  This License explicitly affirms your unlimited
147 | permission to run the unmodified Program.  The output from running a
148 | covered work is covered by this License only if the output, given its
149 | content, constitutes a covered work.  This License acknowledges your
150 | rights of fair use or other equivalent, as provided by copyright law.
151 | 
152 |   You may make, run and propagate covered works that you do not
153 | convey, without conditions so long as your license otherwise remains
154 | in force.  You may convey covered works to others for the sole purpose
155 | of having them make modifications exclusively for you, or provide you
156 | with facilities for running those works, provided that you comply with
157 | the terms of this License in conveying all material for which you do
158 | not control copyright.  Those thus making or running the covered works
159 | for you must do so exclusively on your behalf, under your direction
160 | and control, on terms that prohibit them from making any copies of
161 | your copyrighted material outside their relationship with you.
162 | 
163 |   Conveying under any other circumstances is permitted solely under
164 | the conditions stated below.  Sublicensing is not allowed; section 10
165 | makes it unnecessary.
166 | 
167 |   3. Protecting Users' Legal Rights From Anti-Circumvention Law.
168 | 
169 |   No covered work shall be deemed part of an effective technological
170 | measure under any applicable law fulfilling obligations under article
171 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or
172 | similar laws prohibiting or restricting circumvention of such
173 | measures.
174 | 
175 |   When you convey a covered work, you waive any legal power to forbid
176 | circumvention of technological measures to the extent such circumvention
177 | is effected by exercising rights under this License with respect to
178 | the covered work, and you disclaim any intention to limit operation or
179 | modification of the work as a means of enforcing, against the work's
180 | users, your or third parties' legal rights to forbid circumvention of
181 | technological measures.
182 | 
183 |   4. Conveying Verbatim Copies.
184 | 
185 |   You may convey verbatim copies of the Program's source code as you
186 | receive it, in any medium, provided that you conspicuously and
187 | appropriately publish on each copy an appropriate copyright notice;
188 | keep intact all notices stating that this License and any
189 | non-permissive terms added in accord with section 7 apply to the code;
190 | keep intact all notices of the absence of any warranty; and give all
191 | recipients a copy of this License along with the Program.
192 | 
193 |   You may charge any price or no price for each copy that you convey,
194 | and you may offer support or warranty protection for a fee.
195 | 
196 |   5. Conveying Modified Source Versions.
197 | 
198 |   You may convey a work based on the Program, or the modifications to
199 | produce it from the Program, in the form of source code under the
200 | terms of section 4, provided that you also meet all of these conditions:
201 | 
202 |     a) The work must carry prominent notices stating that you modified
203 |     it, and giving a relevant date.
204 | 
205 |     b) The work must carry prominent notices stating that it is
206 |     released under this License and any conditions added under section
207 |     7.  This requirement modifies the requirement in section 4 to
208 |     "keep intact all notices".
209 | 
210 |     c) You must license the entire work, as a whole, under this
211 |     License to anyone who comes into possession of a copy.  This
212 |     License will therefore apply, along with any applicable section 7
213 |     additional terms, to the whole of the work, and all its parts,
214 |     regardless of how they are packaged.  This License gives no
215 |     permission to license the work in any other way, but it does not
216 |     invalidate such permission if you have separately received it.
217 | 
218 |     d) If the work has interactive user interfaces, each must display
219 |     Appropriate Legal Notices; however, if the Program has interactive
220 |     interfaces that do not display Appropriate Legal Notices, your
221 |     work need not make them do so.
222 | 
223 |   A compilation of a covered work with other separate and independent
224 | works, which are not by their nature extensions of the covered work,
225 | and which are not combined with it such as to form a larger program,
226 | in or on a volume of a storage or distribution medium, is called an
227 | "aggregate" if the compilation and its resulting copyright are not
228 | used to limit the access or legal rights of the compilation's users
229 | beyond what the individual works permit.  Inclusion of a covered work
230 | in an aggregate does not cause this License to apply to the other
231 | parts of the aggregate.
232 | 
233 |   6. Conveying Non-Source Forms.
234 | 
235 |   You may convey a covered work in object code form under the terms
236 | of sections 4 and 5, provided that you also convey the
237 | machine-readable Corresponding Source under the terms of this License,
238 | in one of these ways:
239 | 
240 |     a) Convey the object code in, or embodied in, a physical product
241 |     (including a physical distribution medium), accompanied by the
242 |     Corresponding Source fixed on a durable physical medium
243 |     customarily used for software interchange.
244 | 
245 |     b) Convey the object code in, or embodied in, a physical product
246 |     (including a physical distribution medium), accompanied by a
247 |     written offer, valid for at least three years and valid for as
248 |     long as you offer spare parts or customer support for that product
249 |     model, to give anyone who possesses the object code either (1) a
250 |     copy of the Corresponding Source for all the software in the
251 |     product that is covered by this License, on a durable physical
252 |     medium customarily used for software interchange, for a price no
253 |     more than your reasonable cost of physically performing this
254 |     conveying of source, or (2) access to copy the
255 |     Corresponding Source from a network server at no charge.
256 | 
257 |     c) Convey individual copies of the object code with a copy of the
258 |     written offer to provide the Corresponding Source.  This
259 |     alternative is allowed only occasionally and noncommercially, and
260 |     only if you received the object code with such an offer, in accord
261 |     with subsection 6b.
262 | 
263 |     d) Convey the object code by offering access from a designated
264 |     place (gratis or for a charge), and offer equivalent access to the
265 |     Corresponding Source in the same way through the same place at no
266 |     further charge.  You need not require recipients to copy the
267 |     Corresponding Source along with the object code.  If the place to
268 |     copy the object code is a network server, the Corresponding Source
269 |     may be on a different server (operated by you or a third party)
270 |     that supports equivalent copying facilities, provided you maintain
271 |     clear directions next to the object code saying where to find the
272 |     Corresponding Source.  Regardless of what server hosts the
273 |     Corresponding Source, you remain obligated to ensure that it is
274 |     available for as long as needed to satisfy these requirements.
275 | 
276 |     e) Convey the object code using peer-to-peer transmission, provided
277 |     you inform other peers where the object code and Corresponding
278 |     Source of the work are being offered to the general public at no
279 |     charge under subsection 6d.
280 | 
281 |   A separable portion of the object code, whose source code is excluded
282 | from the Corresponding Source as a System Library, need not be
283 | included in conveying the object code work.
284 | 
285 |   A "User Product" is either (1) a "consumer product", which means any
286 | tangible personal property which is normally used for personal, family,
287 | or household purposes, or (2) anything designed or sold for incorporation
288 | into a dwelling.  In determining whether a product is a consumer product,
289 | doubtful cases shall be resolved in favor of coverage.  For a particular
290 | product received by a particular user, "normally used" refers to a
291 | typical or common use of that class of product, regardless of the status
292 | of the particular user or of the way in which the particular user
293 | actually uses, or expects or is expected to use, the product.  A product
294 | is a consumer product regardless of whether the product has substantial
295 | commercial, industrial or non-consumer uses, unless such uses represent
296 | the only significant mode of use of the product.
297 | 
298 |   "Installation Information" for a User Product means any methods,
299 | procedures, authorization keys, or other information required to install
300 | and execute modified versions of a covered work in that User Product from
301 | a modified version of its Corresponding Source.  The information must
302 | suffice to ensure that the continued functioning of the modified object
303 | code is in no case prevented or interfered with solely because
304 | modification has been made.
305 | 
306 |   If you convey an object code work under this section in, or with, or
307 | specifically for use in, a User Product, and the conveying occurs as
308 | part of a transaction in which the right of possession and use of the
309 | User Product is transferred to the recipient in perpetuity or for a
310 | fixed term (regardless of how the transaction is characterized), the
311 | Corresponding Source conveyed under this section must be accompanied
312 | by the Installation Information.  But this requirement does not apply
313 | if neither you nor any third party retains the ability to install
314 | modified object code on the User Product (for example, the work has
315 | been installed in ROM).
316 | 
317 |   The requirement to provide Installation Information does not include a
318 | requirement to continue to provide support service, warranty, or updates
319 | for a work that has been modified or installed by the recipient, or for
320 | the User Product in which it has been modified or installed.  Access to a
321 | network may be denied when the modification itself materially and
322 | adversely affects the operation of the network or violates the rules and
323 | protocols for communication across the network.
324 | 
325 |   Corresponding Source conveyed, and Installation Information provided,
326 | in accord with this section must be in a format that is publicly
327 | documented (and with an implementation available to the public in
328 | source code form), and must require no special password or key for
329 | unpacking, reading or copying.
330 | 
331 |   7. Additional Terms.
332 | 
333 |   "Additional permissions" are terms that supplement the terms of this
334 | License by making exceptions from one or more of its conditions.
335 | Additional permissions that are applicable to the entire Program shall
336 | be treated as though they were included in this License, to the extent
337 | that they are valid under applicable law.  If additional permissions
338 | apply only to part of the Program, that part may be used separately
339 | under those permissions, but the entire Program remains governed by
340 | this License without regard to the additional permissions.
341 | 
342 |   When you convey a copy of a covered work, you may at your option
343 | remove any additional permissions from that copy, or from any part of
344 | it.  (Additional permissions may be written to require their own
345 | removal in certain cases when you modify the work.)  You may place
346 | additional permissions on material, added by you to a covered work,
347 | for which you have or can give appropriate copyright permission.
348 | 
349 |   Notwithstanding any other provision of this License, for material you
350 | add to a covered work, you may (if authorized by the copyright holders of
351 | that material) supplement the terms of this License with terms:
352 | 
353 |     a) Disclaiming warranty or limiting liability differently from the
354 |     terms of sections 15 and 16 of this License; or
355 | 
356 |     b) Requiring preservation of specified reasonable legal notices or
357 |     author attributions in that material or in the Appropriate Legal
358 |     Notices displayed by works containing it; or
359 | 
360 |     c) Prohibiting misrepresentation of the origin of that material, or
361 |     requiring that modified versions of such material be marked in
362 |     reasonable ways as different from the original version; or
363 | 
364 |     d) Limiting the use for publicity purposes of names of licensors or
365 |     authors of the material; or
366 | 
367 |     e) Declining to grant rights under trademark law for use of some
368 |     trade names, trademarks, or service marks; or
369 | 
370 |     f) Requiring indemnification of licensors and authors of that
371 |     material by anyone who conveys the material (or modified versions of
372 |     it) with contractual assumptions of liability to the recipient, for
373 |     any liability that these contractual assumptions directly impose on
374 |     those licensors and authors.
375 | 
376 |   All other non-permissive additional terms are considered "further
377 | restrictions" within the meaning of section 10.  If the Program as you
378 | received it, or any part of it, contains a notice stating that it is
379 | governed by this License along with a term that is a further
380 | restriction, you may remove that term.  If a license document contains
381 | a further restriction but permits relicensing or conveying under this
382 | License, you may add to a covered work material governed by the terms
383 | of that license document, provided that the further restriction does
384 | not survive such relicensing or conveying.
385 | 
386 |   If you add terms to a covered work in accord with this section, you
387 | must place, in the relevant source files, a statement of the
388 | additional terms that apply to those files, or a notice indicating
389 | where to find the applicable terms.
390 | 
391 |   Additional terms, permissive or non-permissive, may be stated in the
392 | form of a separately written license, or stated as exceptions;
393 | the above requirements apply either way.
394 | 
395 |   8. Termination.
396 | 
397 |   You may not propagate or modify a covered work except as expressly
398 | provided under this License.  Any attempt otherwise to propagate or
399 | modify it is void, and will automatically terminate your rights under
400 | this License (including any patent licenses granted under the third
401 | paragraph of section 11).
402 | 
403 |   However, if you cease all violation of this License, then your
404 | license from a particular copyright holder is reinstated (a)
405 | provisionally, unless and until the copyright holder explicitly and
406 | finally terminates your license, and (b) permanently, if the copyright
407 | holder fails to notify you of the violation by some reasonable means
408 | prior to 60 days after the cessation.
409 | 
410 |   Moreover, your license from a particular copyright holder is
411 | reinstated permanently if the copyright holder notifies you of the
412 | violation by some reasonable means, this is the first time you have
413 | received notice of violation of this License (for any work) from that
414 | copyright holder, and you cure the violation prior to 30 days after
415 | your receipt of the notice.
416 | 
417 |   Termination of your rights under this section does not terminate the
418 | licenses of parties who have received copies or rights from you under
419 | this License.  If your rights have been terminated and not permanently
420 | reinstated, you do not qualify to receive new licenses for the same
421 | material under section 10.
422 | 
423 |   9. Acceptance Not Required for Having Copies.
424 | 
425 |   You are not required to accept this License in order to receive or
426 | run a copy of the Program.  Ancillary propagation of a covered work
427 | occurring solely as a consequence of using peer-to-peer transmission
428 | to receive a copy likewise does not require acceptance.  However,
429 | nothing other than this License grants you permission to propagate or
430 | modify any covered work.  These actions infringe copyright if you do
431 | not accept this License.  Therefore, by modifying or propagating a
432 | covered work, you indicate your acceptance of this License to do so.
433 | 
434 |   10. Automatic Licensing of Downstream Recipients.
435 | 
436 |   Each time you convey a covered work, the recipient automatically
437 | receives a license from the original licensors, to run, modify and
438 | propagate that work, subject to this License.  You are not responsible
439 | for enforcing compliance by third parties with this License.
440 | 
441 |   An "entity transaction" is a transaction transferring control of an
442 | organization, or substantially all assets of one, or subdividing an
443 | organization, or merging organizations.  If propagation of a covered
444 | work results from an entity transaction, each party to that
445 | transaction who receives a copy of the work also receives whatever
446 | licenses to the work the party's predecessor in interest had or could
447 | give under the previous paragraph, plus a right to possession of the
448 | Corresponding Source of the work from the predecessor in interest, if
449 | the predecessor has it or can get it with reasonable efforts.
450 | 
451 |   You may not impose any further restrictions on the exercise of the
452 | rights granted or affirmed under this License.  For example, you may
453 | not impose a license fee, royalty, or other charge for exercise of
454 | rights granted under this License, and you may not initiate litigation
455 | (including a cross-claim or counterclaim in a lawsuit) alleging that
456 | any patent claim is infringed by making, using, selling, offering for
457 | sale, or importing the Program or any portion of it.
458 | 
459 |   11. Patents.
460 | 
461 |   A "contributor" is a copyright holder who authorizes use under this
462 | License of the Program or a work on which the Program is based.  The
463 | work thus licensed is called the contributor's "contributor version".
464 | 
465 |   A contributor's "essential patent claims" are all patent claims
466 | owned or controlled by the contributor, whether already acquired or
467 | hereafter acquired, that would be infringed by some manner, permitted
468 | by this License, of making, using, or selling its contributor version,
469 | but do not include claims that would be infringed only as a
470 | consequence of further modification of the contributor version.  For
471 | purposes of this definition, "control" includes the right to grant
472 | patent sublicenses in a manner consistent with the requirements of
473 | this License.
474 | 
475 |   Each contributor grants you a non-exclusive, worldwide, royalty-free
476 | patent license under the contributor's essential patent claims, to
477 | make, use, sell, offer for sale, import and otherwise run, modify and
478 | propagate the contents of its contributor version.
479 | 
480 |   In the following three paragraphs, a "patent license" is any express
481 | agreement or commitment, however denominated, not to enforce a patent
482 | (such as an express permission to practice a patent or covenant not to
483 | sue for patent infringement).  To "grant" such a patent license to a
484 | party means to make such an agreement or commitment not to enforce a
485 | patent against the party.
486 | 
487 |   If you convey a covered work, knowingly relying on a patent license,
488 | and the Corresponding Source of the work is not available for anyone
489 | to copy, free of charge and under the terms of this License, through a
490 | publicly available network server or other readily accessible means,
491 | then you must either (1) cause the Corresponding Source to be so
492 | available, or (2) arrange to deprive yourself of the benefit of the
493 | patent license for this particular work, or (3) arrange, in a manner
494 | consistent with the requirements of this License, to extend the patent
495 | license to downstream recipients.  "Knowingly relying" means you have
496 | actual knowledge that, but for the patent license, your conveying the
497 | covered work in a country, or your recipient's use of the covered work
498 | in a country, would infringe one or more identifiable patents in that
499 | country that you have reason to believe are valid.
500 | 
501 |   If, pursuant to or in connection with a single transaction or
502 | arrangement, you convey, or propagate by procuring conveyance of, a
503 | covered work, and grant a patent license to some of the parties
504 | receiving the covered work authorizing them to use, propagate, modify
505 | or convey a specific copy of the covered work, then the patent license
506 | you grant is automatically extended to all recipients of the covered
507 | work and works based on it.
508 | 
509 |   A patent license is "discriminatory" if it does not include within
510 | the scope of its coverage, prohibits the exercise of, or is
511 | conditioned on the non-exercise of one or more of the rights that are
512 | specifically granted under this License.  You may not convey a covered
513 | work if you are a party to an arrangement with a third party that is
514 | in the business of distributing software, under which you make payment
515 | to the third party based on the extent of your activity of conveying
516 | the work, and under which the third party grants, to any of the
517 | parties who would receive the covered work from you, a discriminatory
518 | patent license (a) in connection with copies of the covered work
519 | conveyed by you (or copies made from those copies), or (b) primarily
520 | for and in connection with specific products or compilations that
521 | contain the covered work, unless you entered into that arrangement,
522 | or that patent license was granted, prior to 28 March 2007.
523 | 
524 |   Nothing in this License shall be construed as excluding or limiting
525 | any implied license or other defenses to infringement that may
526 | otherwise be available to you under applicable patent law.
527 | 
528 |   12. No Surrender of Others' Freedom.
529 | 
530 |   If conditions are imposed on you (whether by court order, agreement or
531 | otherwise) that contradict the conditions of this License, they do not
532 | excuse you from the conditions of this License.  If you cannot convey a
533 | covered work so as to satisfy simultaneously your obligations under this
534 | License and any other pertinent obligations, then as a consequence you may
535 | not convey it at all.  For example, if you agree to terms that obligate you
536 | to collect a royalty for further conveying from those to whom you convey
537 | the Program, the only way you could satisfy both those terms and this
538 | License would be to refrain entirely from conveying the Program.
539 | 
540 |   13. Remote Network Interaction; Use with the GNU General Public License.
541 | 
542 |   Notwithstanding any other provision of this License, if you modify the
543 | Program, your modified version must prominently offer all users
544 | interacting with it remotely through a computer network (if your version
545 | supports such interaction) an opportunity to receive the Corresponding
546 | Source of your version by providing access to the Corresponding Source
547 | from a network server at no charge, through some standard or customary
548 | means of facilitating copying of software.  This Corresponding Source
549 | shall include the Corresponding Source for any work covered by version 3
550 | of the GNU General Public License that is incorporated pursuant to the
551 | following paragraph.
552 | 
553 |   Notwithstanding any other provision of this License, you have
554 | permission to link or combine any covered work with a work licensed
555 | under version 3 of the GNU General Public License into a single
556 | combined work, and to convey the resulting work.  The terms of this
557 | License will continue to apply to the part which is the covered work,
558 | but the work with which it is combined will remain governed by version
559 | 3 of the GNU General Public License.
560 | 
561 |   14. Revised Versions of this License.
562 | 
563 |   The Free Software Foundation may publish revised and/or new versions of
564 | the GNU Affero General Public License from time to time.  Such new versions
565 | will be similar in spirit to the present version, but may differ in detail to
566 | address new problems or concerns.
567 | 
568 |   Each version is given a distinguishing version number.  If the
569 | Program specifies that a certain numbered version of the GNU Affero General
570 | Public License "or any later version" applies to it, you have the
571 | option of following the terms and conditions either of that numbered
572 | version or of any later version published by the Free Software
573 | Foundation.  If the Program does not specify a version number of the
574 | GNU Affero General Public License, you may choose any version ever published
575 | by the Free Software Foundation.
576 | 
577 |   If the Program specifies that a proxy can decide which future
578 | versions of the GNU Affero General Public License can be used, that proxy's
579 | public statement of acceptance of a version permanently authorizes you
580 | to choose that version for the Program.
581 | 
582 |   Later license versions may give you additional or different
583 | permissions.  However, no additional obligations are imposed on any
584 | author or copyright holder as a result of your choosing to follow a
585 | later version.
586 | 
587 |   15. Disclaimer of Warranty.
588 | 
589 |   THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
590 | APPLICABLE LAW.  EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
591 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
592 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
593 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
594 | PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
595 | IS WITH YOU.  SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
596 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
597 | 
598 |   16. Limitation of Liability.
599 | 
600 |   IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
601 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
602 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
603 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
604 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
605 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
606 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
607 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
608 | SUCH DAMAGES.
609 | 
610 |   17. Interpretation of Sections 15 and 16.
611 | 
612 |   If the disclaimer of warranty and limitation of liability provided
613 | above cannot be given local legal effect according to their terms,
614 | reviewing courts shall apply local law that most closely approximates
615 | an absolute waiver of all civil liability in connection with the
616 | Program, unless a warranty or assumption of liability accompanies a
617 | copy of the Program in return for a fee.
618 | 
619 |                      END OF TERMS AND CONDITIONS
620 | 
621 |             How to Apply These Terms to Your New Programs
622 | 
623 |   If you develop a new program, and you want it to be of the greatest
624 | possible use to the public, the best way to achieve this is to make it
625 | free software which everyone can redistribute and change under these terms.
626 | 
627 |   To do so, attach the following notices to the program.  It is safest
628 | to attach them to the start of each source file to most effectively
629 | state the exclusion of warranty; and each file should have at least
630 | the "copyright" line and a pointer to where the full notice is found.
631 | 
632 |     <one line to give the program's name and a brief idea of what it does.>
633 |     Copyright (C) <year>  <name of author>
634 | 
635 |     This program is free software: you can redistribute it and/or modify
636 |     it under the terms of the GNU Affero General Public License as published
637 |     by the Free Software Foundation, either version 3 of the License, or
638 |     (at your option) any later version.
639 | 
640 |     This program is distributed in the hope that it will be useful,
641 |     but WITHOUT ANY WARRANTY; without even the implied warranty of
642 |     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
643 |     GNU Affero General Public License for more details.
644 | 
645 |     You should have received a copy of the GNU Affero General Public License
646 |     along with this program.  If not, see <http://www.gnu.org/licenses/>.
647 | 
648 | Also add information on how to contact you by electronic and paper mail.
649 | 
650 |   If your software can interact with users remotely through a computer
651 | network, you should also make sure that it provides a way for users to
652 | get its source.  For example, if your program is a web application, its
653 | interface could display a "Source" link that leads users to an archive
654 | of the code.  There are many ways you could offer source, and different
655 | solutions will be better for different programs; see section 13 for the
656 | specific requirements.
657 | 
658 |   You should also get your employer (if you work as a programmer) or school,
659 | if any, to sign a "copyright disclaimer" for the program, if necessary.
660 | For more information on this, and how to apply and follow the GNU AGPL, see
661 | <http://www.gnu.org/licenses/>.
662 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Overview
  2 | 
  3 | This repository containes the needed data to train a Linear Model (OLS) / XGBoost for the [SPECPower
  4 | data set](https://www.spec.org/power_ssj2008/).
  5 | 
  6 | The models are built with dynamic variables designed to work 
  7 | in different cloud environments where some information may not be available.
  8 | 
  9 | Its use is the estimation of the current power draw of the whole machine in 
 10 | Watts.
 11 | 
 12 | Currently the model supports following variables:
 13 | - CPU Utilization `[float [0-100]]`
 14 |     + The utilization of all your assigned threads cumulative and normalized to 0-100
 15 | - CPU Chips `[integer [1,)]`
 16 |     + The CPU chips installed on the mainboard. Most machines have either 1 or 2.
 17 |     + If you do not know this value rather leave it off.
 18 | - CPU Threads `[integer [1,)]`
 19 |     + The total amount of CPU threads over all installed chips.
 20 |     + Example: The CPU has 10 physical cores with two threads each and two chips installed you enter 10 * 2 * 2 = 40.
 21 |     + Please note that if you are restricted to use only a subset of the threads, like it is typical \
 22 |       in virtualized or containerized environments you still enter the full capacity of the CPU. The ratio assigned \
 23 |       to you is handled by the parameter `vHost Ratio`
 24 | - CPU Cores `[integer [1,)]`
 25 |     + Threads and cores do not have to be equal. When Hyperthreading is active the amount of threads
 26 |     + is typically greater than the amount of cores.
 27 |     + If you do not know how many phyiscal cores you really have rather do not supply this argument
 28 | - CPU Frequency `[integer [1,)]`
 29 |     + The base frequency of the processor in MHz.
 30 |     + This value is only used in the XGBoost variant of the model
 31 | - Architecture `[str]`
 32 |     * For example: "haswell"
 33 | - CPU Make `[str]`
 34 |     * either "intel" or "amd"
 35 | - Release year `[int]`
 36 |     + ex. 2011
 37 | - RAM `[integer (0,]]`
 38 |     * in Gigabytes
 39 | - TDP `[integer (0,]]`
 40 |     + In Watts
 41 |     + The thermal design power of the CPU in your system. This value you typically find only on the data sheet online.
 42 | - vHost Ratio `[float (0,1])`
 43 |     + The vHost ratio on the system you are on. If you are on a bare metal machine this is 1
 44 |     + If you are a guest and have e.g. 24 of the 96 Threads than the ratio would be 0.25
 45 |     + Currently the model cannot account for non-balanced CPU and memory ratios.
 46 | 
 47 | Only the CPU Utilization parameter is mandatory. All other paramters are optional. \
 48 | vHost ratio is assumed to be 1 if not given.
 49 | 
 50 | You are free to supply only the utilization or as many additional parameters that
 51 | the model supports. The model will then be retrained on the new configuration on the spot.
 52 | 
 53 | Typically the model gets more accurate the more parameters you can supply. Please see the *Assumptions & Limitations* part at the end to get an idea how accurate the model will be in different circumstances.
 54 | 
 55 | # Background
 56 | 
 57 | Typically in the cloud, especially when virtualized, it is not posssible to 
 58 | access any energy metrics either from the [ILO](https://en.wikipedia.org/wiki/HP_Integrated_Lights-Out) / [IDRAC](https://en.wikipedia.org/wiki/Dell_DRAC) 
 59 | controllers or from [RAPL](https://en.wikipedia.org/wiki/Perf_(Linux)#RAPL).
 60 | 
 61 | Therfore power draw must be estimated.
 62 | 
 63 | Many approaches like this have been made so far:
 64 | - https://www.cloudcarbonfootprint.org/
 65 | - https://greenpixie.com/
 66 | - https://medium.com/teads-engineering/evaluating-the-carbon-footprint-of-a-software-platform-hosted-in-the-cloud-e716e14e060c#3bf5
 67 | 
 68 | Cloud Carbon Footprint and Teads operate on Billing data and are too coarse 
 69 | for a fast paced development that pushes changing code on a daily basis.
 70 | 
 71 | Teads could theoretically solve this, but is strictily limited to AWS EC2. Also
 72 | it provides no interface out of the box to inline monitor the emissions.
 73 | 
 74 | Therefore we created a model out of the SPECPower dataset that also can be used
 75 | in real-time.
 76 | 
 77 | # Discovery of the parameters
 78 | 
 79 | At least utilization is needed as an input parameter.
 80 | 
 81 | You need some small script that streams the CPU utilization as pure float numbers
 82 | line by line.
 83 | 
 84 | The solution we are using is a modified version of our [CPU Utilization reporter
 85 | from the Green Metrics Tool](https://github.com/green-coding-solutions/green-metrics-tool/tree/main/metric_providers/cpu/utilization/procfs/system).
 86 | 
 87 | This one is tailored to read from the procfs. You might need something different in your case ...
 88 | 
 89 | ## Hyperthreading
 90 | 
 91 | HT can be easily checked if the core-id is similar to the processor id.
 92 | 
 93 | Last Core-ID should be processor_id+1
 94 | If Last core ID is > processor_id+2  then HT is enabled
 95 | 
 96 | Alternatively looking at `lscpu` might reveal some infos.
 97 | 
 98 | ## SVM / VT-X / VT-D / AMD-V ...
 99 | The presence of virtualization can be checked by looking at:
100 | 
101 | `/dev/kvm`
102 | 
103 | If that directory is present this is a strong indicator, that virtualization is enabled.
104 | 
105 | One can also install cpu-checker and then run 
106 | `sudo apt install kvm-ok -y && sudo kvm-ok`
107 | 
108 | This will tell with more checks if virtualization is on. even on AMD machines.
109 | 
110 | However in a vHost this might not work at all, as the directory is generally hidden.
111 | 
112 | Here it must be checked if a virtualization is already running through:
113 | `sudo apt install virt-what -y && sudo virt-what`
114 | 
115 | Also `lscpu` might provide some insights by having these lines:
116 | 
117 | ```
118 | Virtualization features:
119 |   Hypervisor vendor:     KVM
120 |   Virtualization type:   full
121 | ```  
122 | 
123 | ## Hardware prefetchers
124 | 
125 | There are actually many to disable:
126 | The above mentioned processors support 4 types of h/w prefetchers for prefetching data. There are 2 prefetchers associated with L1-data cache (also known as DCU DCU prefetcher, DCU IP prefetcher) and 2 prefetchers associated with L2 cache (L2 hardware prefetcher, L2 adjacent cache line prefetcher).
127 | 
128 | There is a Model Specific Register (MSR) on every core with address of 0x1A4 that can be used to control these 4 prefetchers. Bits 0-3 in this register can be used to either enable or disable these prefetchers. Other bits of this MSR are reserved.
129 | 
130 | However it seems that for some processors this setting is only available in the BIOS
131 | as it is not necessary disclosed info by Intel how to disable it.
132 | For servers it seems quite standard to do be an available feature apparently ...
133 | 
134 | https://stackoverflow.com/questions/54753423/correctly-disable-hardware-prefetching-with-msr-in-skylake
135 | https://stackoverflow.com/questions/55967873/how-can-i-verify-that-my-hardware-prefetcher-is-disabled
136 | https://stackoverflow.com/questions/784041/how-do-i-programmatically-disable-hardware-prefetching
137 | https://stackoverflow.com/questions/19435788/unable-to-disable-hardware-prefetcher-in-core-i7
138 | https://stackoverflow.com/questions/784041/how-do-i-programmatically-disable-hardware-prefetching
139 | 
140 | 
141 | ## Other variables
142 | Other variables to be discovered like CPU Make etc. can be found in these locations typically:
143 | 
144 | - `/proc/stat`
145 | - `/proc/memory`
146 | - `/proc/cpuinfo`
147 | - `/sys/devices/virtual/dmi`
148 | - `dmidecode`
149 | - `lspci`
150 | - `lshw`
151 | - `/var/log/dmesg`
152 | 
153 | Informations like the vHost-Ratio you can sometimes see in `/proc/stat`, but this
154 | info is usually given in the machine selector of your cloud provider.
155 | 
156 | If you cannot find out specific parameters the best thing is: Write an email to your cloud provider and ask :)
157 | 
158 | # Model Details / EDA
159 | 
160 | - Model uses SPECPower raw data
161 |     + Current copy is stored in `./data/raw`
162 |     + We only process the html data. It contains the same info as the text
163 |     + Look into `./scripts/create_data_csv.py`
164 |     + Unprocessed version is then in `./data/spec_data.csv`
165 | - CPU microarchitecture and TDP data is coming from
166 |     + David Mytton [Cloud carbon coefficients](https://github.com/cloud-carbon-footprint/cloud-carbon-coefficients) (only AMD Epyc info)
167 |     + Wikipedia (very! thorough source)
168 | - Data is cleaned. Look into `./scripts/data_cleaning.py`
169 |     + Cleaned and enriched version is then in `./data/spec_data_cleaned.csv`
170 | 
171 | The EDA is currently only on Kaggle, where you can see how we selected the subset of the 
172 | available variables and their interaction in our [Kaggle notebook](https://www.kaggle.com/code/arne3000/spec-power-eda-pass-2)
173 | 
174 | In order to create some columns we inspected the `SUT_BIOS` and `SUT_Notes` fields
175 | and created some feature columns dervied from them. Here is a quick summary:
176 | 
177 | - *BIOS_P_States_Enabled*
178 |     + P-states are a power feature. P-State = 1 is the base frequency
179 |     + Setting P-states to off will set P-State to max non-turbo (aka 1) (https://www.thomas-krenn.com/en/wiki/Disable_CPU_Power_Saving_Management_in_BIOS)
180 |     + All P-States greater than 1 are power efficient states: https://www.thomas-krenn.com/en/wiki/Processor_P-states_and_C-states
181 | 
182 | - *BIOS_Memory_Setting_Changed*
183 |     + When we found infos like "DDR Frequency set to 1066 MHz" we considered this memory tuning
184 | 
185 | - *BIOS_HT_Enabled*
186 |     + We found Hyperthreading mostly not mentioned, but when than turned on. Which should be the default anyway.
187 |     
188 | - *BIOS_VT_Enabled*
189 |     + Virtualization was sometimes disabled, which is also very often the default
190 |     + However we believe it is almost always on in cloud environments, as it is for instance a prerequiste for KVM (EC2 hypervisor)
191 |     + Includes SVM from AMD
192 |    
193 | - *BIOS_Turbo_Boost_Enabled*
194 |     + Turbo Boost was very often turned off, which is a clear sign of tuning
195 |     + Turbo Boost is almost always on by default
196 |    
197 | - *BIOS_C_States_Enabled*
198 |     + C-States are a power saving feature. If they are fixed to a certain state this could well be considered tuning, as this is non default and very untypical for the cloud
199 |    
200 | - *BIOS_Prefetchers_Enabled*
201 |     + Prefetchers like DCU Prefetcher, Adjacent Cache Line Prefetch, MLC Spatial Prefetcher etc. are almost always on by default
202 |     + Most systems however have these disabled.
203 |     + We do not know the typical state in the cloud here.
204 | 
205 | ## Unclear data in SUT_BIOS / SUT_Notes
206 | 
207 | Some info we thought might be related to energy, but we could not make sense of them.
208 | If you can, please share and create and create a Pull Request:
209 | 
210 | - The cores were mostly fixed to a JVM instance: *Each JVM instance was affinitized two logical processors on a single socket.*
211 |     + We do not know if this optimizing for the benchmark or a SPECPower requirement.
212 |     + Therefore not processed further
213 | 
214 | - We found however settings with TurboBoost on and then the *Maximum Processor State: 100%.* was set. 
215 |     + We are not exactly sure what that means, but it could indicate that TurboBoost although enabled could never be executed ...
216 | 
217 | - We found settings like *SATA Controller = Disabled*
218 |     + This setting was mostly set cause the machines were running on PCIe / M2 disks
219 | 
220 | - *Set "Uncore Frequency Override = Power balanced" in BIOS.* or *Power Option: Power Saver* or *"Power Mode: Balanced"*
221 |     + Unsure what does translates to really since "power balanced" has no defined meaning and changes for every vendor.
222 |     + Balanced might for instance include TurboBoost On for one vendor and Off for another
223 | 
224 | - *DEMT -enabled.*
225 |     + Dynamic energy management
226 |     + Ignored cause we do not know how this really affects energy consumption
227 | 
228 | - *Memory Data Scrambling: Disable* / *Set "Memory Patrol Scrub = Disabled"*
229 |     + Ignored cause we do not know how this really affects energy consumption
230 | 
231 | - *EIST* is sometimes enabled and sometimes not. Although it can be a power saving feature it alone says nothing about power itself.
232 |     + We believe this column holds no information on its own
233 | 
234 | - ASPM Support - Power saving for PCIe
235 |     + Ignored cause we do not know how this really affects energy consumption
236 | 
237 | -  'USB Front Port Disabled.',
238 |     + Ignored cause we do not know how this really affects energy consumption
239 |     + Also we believe this is cloud standard
240 | 
241 | - *CPU Power Management set to DAPC*
242 |     + Dell only feature for energy. Did not look into further
243 | 
244 | - *EfficiencyModeEn = Enabled*
245 |     + Too few entries with feature
246 | 
247 | - *SGX enabled / disabled* 
248 |     + is also very curious ... unclear what the cloud setting is
249 | 
250 | 
251 | # Interpolation for output
252 | 
253 | Like all tree based models our XGBoost model can only predict what it has seen so 
254 | far.
255 | 
256 | Since the original data from SPECPower only has information for every 10% of 
257 | utilization the model will by default for instance give the same value for 6%
258 | as well as for 7%.
259 | 
260 | To combat this behaviour we interpolate between the points where the model actually
261 | reports new data, which is:
262 | - 0-5
263 | - 5-15
264 | - 15-25
265 | - 25-35
266 | - 35-45
267 | - 45-55
268 | - 55-65
269 | - 65-75
270 | - 75-85
271 | - 85-95
272 | - 95-100
273 | 
274 | The data is just interpolated linearly. The interpolation is done directly 
275 | when the `xgb.py` script is starting and thus all possible infered values for 
276 | utilization (0.00 - 100.00) are stored in a dict.
277 | This makes the model extremely performant at the cost of a minimal memory cost.
278 | 
279 | # Results
280 | 
281 | We have first compared the model against a machine from SPECPower that we 
282 | did not include in the model training: [Hewlett Packard Enterprise Synergy 480 Gen10 Plus Compute Module](https://www.spec.org/power_ssj2008/results/res2022q1/power_ssj2008-20211207-01142.html)
283 | 
284 | This machine is comprised of 10 identical nodes, therefore the power values
285 | have to be divided by 10 to get the approximate value that would have resulted 
286 | if only one node was tested individually.
287 | 
288 | An individual node has the following characteristics as model parameters:
289 | - --cpu-freq 2300
290 | - --tdp 270
291 | - --ram 256
292 | - --cpu-threads 160
293 | - --cpu-chips 2
294 | 
295 | ![hp_synergy_480_Gen10_Plus.png](/img/hp_synergy_480_Gen10_Plus.png)
296 | 
297 | 
298 | This is the comparison chart:
299 | 
300 | Secondly we have bought a machine from the SPECPower dataset: [FUJITSU Server PRIMERGY RX1330 M3](https://www.spec.org/power_ssj2008/results/res2017q2/power_ssj2008-20170315-00744.html)
301 | 
302 | The machine has the following characteristics as model parameters:
303 | - --cpu-freq 3500
304 | - --tdp 24
305 | - --ram 16
306 | - --cpu-threads 8
307 | - --cpu-chips 1
308 | 
309 | This is the comparison chart for the SPEC data vs our modelling:
310 | ![fujitsu_TX1330_SPEC.png](/img/fujitsu_TX1330_SPEC.png)
311 | 
312 | 
313 | This is the comparison chart where we compare the standard BIOS setup against the *tuning* settings from SPECPower:
314 | ![fujitsu_TX1330_measured.png](/img/fujitsu_TX1330_measured.png)
315 | 
316 | ## Summary
317 | - We can see that the SDIA model in its current form cannot account for the idle state of the machine and thus always underestimates here
318 | - The SDIA model underestimates 1-chip machines and greatly over-estimates 2-chip machines
319 |     + Taken into account that for 2-chip machines we only have SPECPower data at the moment and no real world data
320 | - The linear model is good for parameter exploration, but delivers badly fitted results
321 | - The XGBoost model is able to estimate a real world 1-chip machine and an out of sample 2-chip machine from SPECPower very nicely.
322 |     + However it tends to under-estimate
323 | - We see suprisingly no efficiency gain from applying the SPECPower BIOS settings but rather a smoothing of the curve. The reason to that is currently unknown.
324 | 
325 | # Installation
326 | 
327 | Tested on python-3.10 but should work on older versions.
328 | 
329 | ```
330 | python3 -m venv venv
331 | source venv/bin/activate
332 | pip3 install -r requirements.txt
333 | ```
334 | 
335 | ## Re-build training data
336 | If you want to rebuild the training data (`spec_data*.csv`) then you have to include 
337 | the git submodule with the raw data.
338 | 
339 | ```bash
340 | git submodule update --init
341 | ```
342 | 
343 | # Use
344 | You must call the python file `ols.py` or `xgb.py`. 
345 | This file is designed to accept streaming inputs.
346 | 
347 | A typical call with a streaming binary that reports CPU Utilization could look like
348 | so: 
349 | ```
350 | $ ./static-binary | python3 ols.py --tdp 240 
351 | 191.939294374113
352 | 169.99632303510703
353 | 191.939294374113
354 | 191.939294374113
355 | 191.939294374113
356 | 191.939294374113
357 | 194.37740205685841
358 | 191.939294374113
359 | 169.99632303510703
360 | 191.939294374113
361 | ....
362 | ```
363 | 
364 | Since all possible outputs are infered directly into a dict the model is highly
365 | performant to use in inline reporting scenarios.
366 | 
367 | # Demo reporter
368 | 
369 | If you want to use the demo reporter to read the CPU utilization there is a C reporter
370 | in the `demo-reporter` directory.
371 | 
372 | Compile it with `gcc cpu-utilization.c`
373 | 
374 | Then run it with `./a.out`
375 | 
376 | Or feed it directly to the model with: `./a.out | python3 model.py --tdp ....`
377 | 
378 | 
379 | ## Comparison with Interact DC variable selection
380 | 
381 | Run the `interact_validation.py` to see a K-Folds comparison of our variable 
382 | selection against the one from Interact DC.
383 | 
384 | Without Hyperparameter Tuning when comparing the available variables in the cloud
385 | they are about the same.
386 | 
387 | # Assumptions & Limitations
388 | - The model was trained on the SPECpower dataset which almost exclusively includes compute focussed machines. This means it will not be accurate for memory-heavy machines like database servers or ML machines that tend to use GPUs/TPUs or even ASICS
389 | - The main input variable for the model is CPU utilization. This metric is only reliable if the system frequencies do not change much. See our in depth article about [usefulness of CPU Utilization as a metric](https://www.green-coding.berlin/case-studies/cpu-utilization-usefulness/)
390 | - SPECPower machines tend to be rather tuned and do not necessarily represent the reality of current datacenter configurations. So you are likely to get a too small value than a too high value. This was also detailed in the analysis earlier in the README, where we talk about the turned off features.
391 | - If you are in a shared resource system like a Virtual Machine the model will assume a linear fraction of the load. This is debateable and might need improvement. See the discussion here: https://github.com/green-coding-solutions/spec-power-model/issues/4
392 | 
393 | 
394 | # TODO
395 | 
396 | - vhost operating point
397 | - ~~validation of EC2 machines and the data from Teads.~~
398 | - ~~Performance optimizations for inline processing to get below 2% of utilization for 100ms intervals~~
399 | - Re-evaluating more machines from the SPECPower database in our lab and better understand what the BIOS settings really impact in regards to the server energy
400 | - Research what values in the cloud are typically set for the BIOS settings that SPECPower lists and if they can be configured in the cloud
401 | - Introspecting our models to understand which parameter in which setting will give the most energy gain when set on the machine so that developers can optimize these parameters
402 | 
403 | ## Credits
404 | 
405 | A similar model has been developed in academia from [Interact DC](https://interactdc.com/) and the 
406 | paper can be downloaded on [their official resources site](https://interactdc.com/static/images/documents/Elsevier_Journal.pdf).
407 | 
408 | Our model was initially developed idependently but we have taken some inspiration 
409 | from the paper to tune the model afterwards.
410 | 
411 | A big thank you to [Rich Kenny](https://twitter.com/bigkatrich) from Interact DC to providing some insights to
412 | parameters and possible pitfalls during our model development.
413 | 


--------------------------------------------------------------------------------
/auto_detect.py:
--------------------------------------------------------------------------------
  1 | # pylint: disable=redefined-outer-name,invalid-name
  2 | 
  3 | import subprocess
  4 | import re
  5 | import logging
  6 | import math
  7 | 
  8 | def get_cpu_info(logger):
  9 | 
 10 |     data = {
 11 |         'freq' : None,
 12 |         'threads': None,
 13 |         'cores': None,
 14 |         'tdp': None,
 15 |         'mem': None,
 16 |         'make': None,
 17 |         'chips': None
 18 |     }
 19 | 
 20 |     try:
 21 |         file_path = '/sys/class/powercap/intel-rapl/intel-rapl:0/name'
 22 |         with open(file_path, 'r', encoding='UTF-8') as file:
 23 |             domain_name = file.read().strip()
 24 |             if domain_name != 'package-0':
 25 |                 raise RuntimeError(f"Domain /sys/class/powercap/intel-rapl/intel-rapl:0/name was not package-0, but {domain_name}")
 26 | 
 27 |         file_path = '/sys/class/powercap/intel-rapl/intel-rapl:0/constraint_0_name'
 28 |         with open(file_path, 'r', encoding='UTF-8') as file:
 29 |             constraint_name = file.read().strip()
 30 |             if constraint_name != 'long_term':
 31 |                 raise RuntimeError(f"Constraint /sys/class/powercap/intel-rapl/intel-rapl:0/constraint_0_name was not long_term, but {constraint_name}")
 32 | 
 33 |         file_path = '/sys/class/powercap/intel-rapl/intel-rapl:0/constraint_0_max_power_uw'
 34 |         with open(file_path, 'r', encoding='UTF-8') as file:
 35 |             tdp = file.read()
 36 |             data['tdp'] = int(tdp) / 1_000_000
 37 | 
 38 |         logger.info('Found TDP: %d W', data['tdp'])
 39 |     #pylint: disable=broad-except
 40 |     except Exception as err:
 41 |         logger.info('Exception: %s', err)
 42 |         logger.info('Could not read RAPL powercapping info from /sys/class/powercap/intel-rapl')
 43 | 
 44 |     try:
 45 |         file_paths = {
 46 |             1: '/sys/class/powercap/intel-rapl/intel-rapl:0/name',
 47 |             2: '/sys/class/powercap/intel-rapl/intel-rapl:1/name',
 48 |             3: '/sys/class/powercap/intel-rapl/intel-rapl:2/name',
 49 |             4: '/sys/class/powercap/intel-rapl/intel-rapl:3/name',
 50 |             5: '/sys/class/powercap/intel-rapl/intel-rapl:4/name',
 51 |             6: '/sys/class/powercap/intel-rapl/intel-rapl:5/name',
 52 |         }
 53 |         for chips, file_path in file_paths.items():
 54 |             with open(file_path, 'r', encoding='UTF-8') as file:
 55 |                 domain_name = file.read().strip()
 56 |                 if domain_name != f"package-{chips-1}":
 57 |                     raise RuntimeError(f"Domain {file_path} was not package-{chips-1}, but {domain_name}")
 58 |                 logger.info('Found Sockets: %d', chips)
 59 |                 data['chips'] = chips
 60 |     #pylint: disable=broad-except
 61 |     except Exception as err:
 62 |         logger.info('Exception: %s', err)
 63 |         logger.info('Could not find (additional) chips info under file path. Most likely reached final chip. continuing ...')
 64 | 
 65 | 
 66 |     try:
 67 |         cpuinfo = subprocess.check_output('lscpu', encoding='UTF-8')
 68 |         match = re.search(r'On-line CPU\(s\) list:\s*(0-)?(\d+)', cpuinfo)
 69 |         if match:
 70 |             data['threads'] = int(match.group(2))+1 # +1 because 0 indexed
 71 |             logger.info('Found Threads: %d', data['threads'])
 72 |         else:
 73 |             logger.info('Could not find Threads. Using default None')
 74 | 
 75 |         # this will overwrite info we have from RAPL socket discovery, as we
 76 |         # deem lscpu more relieable
 77 |         match = re.search(r'Socket\(s\):\s*(\d+)', cpuinfo)
 78 |         if match:
 79 |             data['chips'] = int(match.group(1))
 80 |             logger.info('Found Sockets: %d (will take precedence if not 0)', data['chips'])
 81 |         else:
 82 |             logger.info('Could not find Chips/Sockets via lscpu')
 83 | 
 84 |         if data['chips']:
 85 |             match = re.search(r'Core\(s\) per socket:\s*(\d+)', cpuinfo)
 86 |             if match:
 87 |                 cores_per_socket = int(match.group(1))
 88 |                 data['cores'] = cores_per_socket * data['chips']
 89 |                 logger.info('Found cores: %d ', data['cores'])
 90 |             else:
 91 |                 logger.info('Could not find Cores. Using default None')
 92 | 
 93 |         match = re.search(r'Model name:.*@\s*([\d.]+)\s*GHz', cpuinfo)
 94 |         if match:
 95 |             data['freq'] = int(float(match.group(1))*1000)
 96 |             logger.info('Found Frequency: %s', data['freq'])
 97 |         else:
 98 |             logger.info('Could not find Frequency. Using default None')
 99 | 
100 |         match = re.search(r'Model name:.*Intel\(R\)', cpuinfo)
101 |         if match:
102 |             data['make'] = 'intel'
103 |             logger.info('Found Make: %s', data['make'])
104 | 
105 |         match = re.search(r'Model name:.*AMD ', cpuinfo)
106 |         if match:
107 |             data['make'] = 'amd'
108 |             logger.info('Found Make: %s', data['make'])
109 | 
110 | 
111 |         # we currently do not match for architecture, as this info is provided nowhere
112 | 
113 |         # we also currently do not matc for make, as this info can result in ARM which is currently not supported and
114 |         # would rather lead to confusion
115 |     #pylint: disable=broad-except
116 |     except Exception as err:
117 |         logger.info('Exception: %s', err)
118 |         logger.info('Could not check for CPU info.')
119 | 
120 | 
121 |     """ This code is problematic, as the CPU freq is changing rapidly sometimes and making the resulting XGBoost
122 |     values fluctuate a lot.
123 |     """
124 | 
125 | 
126 |     # if not data['freq']:
127 |     #     try:
128 |     #         cpuinfo_proc = subprocess.check_output(['cat', '/proc/cpuinfo'], encoding='UTF-8', stderr=subprocess.DEVNULL)
129 |     #         match = re.findall(r'cpu MHz\s*:\s*([\d.]+)', cpuinfo_proc)
130 |     #         if match:
131 |     #             data['freq'] = round(max(map(float, match)))
132 |     #             logger.info('Found assumend Frequency: %d', data['freq'])
133 |     #         else:
134 |     #             logger.info('Could not find Frequency. Using default None')
135 |     #     #pylint: disable=broad-except
136 |     #     except Exception as err:
137 |     #         logger.info('Exception: %s', err)
138 |     #         logger.info('/proc/cpuinfo not accesible on system. Could not check for Base Frequency info. Setting value to None.')
139 | 
140 | 
141 | 
142 |     try:
143 |         meminfo = subprocess.check_output(['cat', '/proc/meminfo'], encoding='UTF-8', stderr=subprocess.DEVNULL)
144 |         match = re.search(r'MemTotal:\s*(\d+) kB', meminfo)
145 |         if match:
146 |             data['mem'] = math.ceil(int(match.group(1)) / 1024 / 1024)
147 |             logger.info('Found Memory: %d GB', data['mem'])
148 |         else:
149 |             logger.info('Could not find Memory. Using default None')
150 |     #pylint: disable=broad-except
151 |     except Exception as err:
152 |         logger.info('Exception: %s', err)
153 |         logger.info('/proc/meminfo not accesible on system. Could not check for Memory info. Defaulting to None.')
154 | 
155 |     return data
156 | 
157 | if __name__ == "__main__":
158 |     logger = logging.getLogger(__name__)
159 |     logger.addHandler(logging.StreamHandler())
160 |     logger.setLevel(logging.INFO)
161 | 
162 |     print(get_cpu_info(logger))
163 | 


--------------------------------------------------------------------------------
/demo-reporter/cpu-utilization.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <errno.h>
  4 | #include <unistd.h>
  5 | #include <sys/time.h>
  6 | #include <time.h>
  7 | 
  8 | typedef struct procfs_time_t { // struct is a specification and this static makes no sense here
  9 |     unsigned long user_time;
 10 |     unsigned long nice_time;
 11 |     unsigned long system_time;
 12 |     unsigned long wait_time;
 13 |     unsigned long iowait_time;
 14 |     unsigned long irq_time;
 15 |     unsigned long softirq_time;
 16 |     unsigned long steal_time;
 17 |     unsigned long guest_time;
 18 |     unsigned long compute_time; // custom attr by us not in standard /proc/stat format
 19 |     unsigned long idle_time; // custom attr by us not in standard /proc/stat format
 20 | } procfs_time_t;
 21 | 
 22 | 
 23 | // All variables are made static, because we believe that this will
 24 | // keep them local in scope to the file and not make them persist in state
 25 | // between Threads.
 26 | // TODO: If this code ever gets multi-threaded please review this assumption to
 27 | // not pollute another threads state
 28 | static long int user_hz;
 29 | static unsigned int msleep_time=1000;
 30 | 
 31 | static void read_cpu_proc(procfs_time_t* procfs_time_struct) {
 32 | 
 33 |     FILE* fd = NULL;
 34 | 
 35 |     fd = fopen("/proc/stat", "r");
 36 |     if ( fd == NULL) {
 37 |         fprintf(stderr, "Error - file %s failed to open: errno: %d\n", "/proc/stat/", errno);
 38 |         exit(1);
 39 |     }
 40 | 
 41 |     fscanf(fd, "cpu %ld %ld %ld %ld %ld %ld %ld %ld %ld", &procfs_time_struct->user_time, &procfs_time_struct->nice_time, &procfs_time_struct->system_time, &procfs_time_struct->wait_time, &procfs_time_struct->iowait_time, &procfs_time_struct->irq_time, &procfs_time_struct->softirq_time, &procfs_time_struct->steal_time, &procfs_time_struct->guest_time);
 42 | 
 43 |     // debug
 44 |     // printf("Read: cpu %ld %ld %ld %ld %ld %ld %ld %ld %ld\n", procfs_time_struct->user_time, procfs_time_struct->nice_time, procfs_time_struct->system_time, procfs_time_struct->idle_time, procfs_time_struct->iowait_time, procfs_time_struct->irq_time, procfs_time_struct->softirq_time, procfs_time_struct->steal_time, procfs_time_struct->guest_time);
 45 | 
 46 |     fclose(fd);
 47 | 
 48 |     // after this multiplication we are on microseconds
 49 |     // integer division is deliberately, cause we don't loose precision as *1000000 is done before
 50 | 
 51 |     procfs_time_struct->idle_time = procfs_time_struct->wait_time + procfs_time_struct->iowait_time + procfs_time_struct->irq_time + procfs_time_struct->softirq_time;
 52 |     // in /proc/stat nice time is NOT included in the user time! (it is in cgroups however though)
 53 |     procfs_time_struct->compute_time = procfs_time_struct->user_time + procfs_time_struct->system_time + procfs_time_struct->nice_time;
 54 | 
 55 | }
 56 | 
 57 | 
 58 | static int output_stats(int show_diff_time) {
 59 |     long int idle_reading, compute_time_reading;
 60 |     procfs_time_t main_cpu_reading_before;
 61 |     procfs_time_t main_cpu_reading_after;
 62 |     struct timeval start, end;
 63 | 
 64 |     gettimeofday(&start, NULL);
 65 |     read_cpu_proc(&main_cpu_reading_before);
 66 | 
 67 |     usleep(msleep_time * 1000);
 68 | 
 69 |     gettimeofday(&end, NULL);
 70 |     read_cpu_proc(&main_cpu_reading_after);
 71 | 
 72 |     // Calculate actual sleep duration in seconds (as double)
 73 |     double slept_time = (end.tv_sec - start.tv_sec) + (end.tv_usec - start.tv_usec) / 1.0e6;
 74 | 
 75 |     idle_reading = main_cpu_reading_after.idle_time - main_cpu_reading_before.idle_time;
 76 |     compute_time_reading = main_cpu_reading_after.compute_time - main_cpu_reading_before.compute_time;
 77 | 
 78 |     double output = 100.0 * (double)compute_time_reading / (compute_time_reading + idle_reading);
 79 |     if (output < 0.0) output = 0.0;
 80 |     if (output > 100.0) output = 100.0;
 81 | 
 82 |     if (show_diff_time == 1) {
 83 |         printf("%.6f %.2f\n", slept_time, output);  // Print sleep time and utilization
 84 |     } else {
 85 |         printf("%.2f\n", output);  // Print  utilization only
 86 |     }
 87 |     return 1;
 88 | }
 89 | 
 90 | 
 91 | int main(int argc, char **argv) {
 92 | 
 93 |     int c;
 94 |     int show_diff_time = 0;
 95 | 
 96 |     setvbuf(stdout, NULL, _IONBF, 0);
 97 |     user_hz = sysconf(_SC_CLK_TCK);
 98 | 
 99 |     while ((c = getopt (argc, argv, "i:hx")) != -1) {
100 |         switch (c) {
101 |         case 'h':
102 |             printf("Usage: %s [-i msleep_time] [-h]\n\n",argv[0]);
103 |             printf("\t-h      : displays this help\n");
104 |             printf("\t-i      : specifies the milliseconds sleep time that will be slept between measurements\n\n");
105 |             printf("\t-x      : show diff time before utilization\n\n");
106 | 
107 |             struct timespec res;
108 |             double resolution;
109 | 
110 |             printf("\tEnvironment variables:\n");
111 |             printf("\tUserHZ\t\t%ld\n", user_hz);
112 |             clock_getres(CLOCK_REALTIME, &res);
113 |             resolution = res.tv_sec + (((double)res.tv_nsec)/1.0e9);
114 |             printf("\tSystemHZ\t%ld\n", (unsigned long)(1/resolution + 0.5));
115 |             printf("\tCLOCKS_PER_SEC\t%ld\n", CLOCKS_PER_SEC);
116 |             exit(0);
117 |         case 'i':
118 |             msleep_time = atoi(optarg);
119 |             break;
120 |         case 'x':
121 |             show_diff_time = 1;
122 |             break;
123 |         default:
124 |             fprintf(stderr,"Unknown option %c\n",c);
125 |             exit(-1);
126 |         }
127 |     }
128 | 
129 |     while(1) {
130 |         output_stats(show_diff_time);
131 |     }
132 | 
133 |     return 0;
134 | }


--------------------------------------------------------------------------------
/demo-reporter/cpu-utilization_mac.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <sys/types.h>
  4 | #include <sys/sysctl.h>
  5 | #include <sys/time.h>
  6 | #include <mach/mach.h>
  7 | #include <mach/mach_host.h>
  8 | #include <unistd.h>
  9 | 
 10 | void loop_utilization(unsigned int msleep_time) {
 11 |     processor_info_array_t cpuInfo = NULL, prevCpuInfo = NULL;
 12 |     mach_msg_type_number_t numCpuInfo, numPrevCpuInfo;
 13 | 
 14 |     while(1){
 15 |         natural_t numCPUsU = 0U;
 16 |         kern_return_t err = host_processor_info(mach_host_self(), PROCESSOR_CPU_LOAD_INFO, &numCPUsU, &cpuInfo, &numCpuInfo);
 17 | 
 18 |         if (err == KERN_SUCCESS) {
 19 | 
 20 |             float ut_total = 0U;
 21 |             struct timeval now;
 22 | 
 23 |             for (unsigned i = 0; i < numCPUsU; ++i) {
 24 |                 float inUse, total;
 25 |                 if (prevCpuInfo) {
 26 |                     inUse = ((cpuInfo[(CPU_STATE_MAX * i) + CPU_STATE_USER] - prevCpuInfo[(CPU_STATE_MAX * i) + CPU_STATE_USER]) +
 27 |                             (cpuInfo[(CPU_STATE_MAX * i) + CPU_STATE_SYSTEM] - prevCpuInfo[(CPU_STATE_MAX * i) + CPU_STATE_SYSTEM]) +
 28 |                             (cpuInfo[(CPU_STATE_MAX * i) + CPU_STATE_NICE] - prevCpuInfo[(CPU_STATE_MAX * i) + CPU_STATE_NICE]));
 29 |                     total = inUse + (cpuInfo[(CPU_STATE_MAX * i) + CPU_STATE_IDLE] - prevCpuInfo[(CPU_STATE_MAX * i) + CPU_STATE_IDLE]);
 30 |                 } else {
 31 |                     inUse = cpuInfo[(CPU_STATE_MAX * i) + CPU_STATE_USER] + cpuInfo[(CPU_STATE_MAX * i) + CPU_STATE_SYSTEM] + cpuInfo[(CPU_STATE_MAX * i) + CPU_STATE_NICE];
 32 |                     total = inUse + cpuInfo[(CPU_STATE_MAX * i) + CPU_STATE_IDLE];
 33 |                 }
 34 |                 ut_total = ut_total + (inUse / total);
 35 |             }
 36 | 
 37 |             gettimeofday(&now, NULL);
 38 |             printf("%ld%06i %i\n", now.tv_sec, now.tv_usec, (int)(ut_total * 100 / numCPUsU));
 39 | 
 40 |             if (prevCpuInfo) {
 41 |                 size_t prevCpuInfoSize = sizeof(integer_t) * numPrevCpuInfo;
 42 |                 vm_deallocate(mach_task_self(), (vm_address_t)prevCpuInfo, prevCpuInfoSize);
 43 |             }
 44 | 
 45 |             prevCpuInfo = cpuInfo;
 46 |             numPrevCpuInfo = numCpuInfo;
 47 | 
 48 |             cpuInfo = NULL;
 49 |             numCpuInfo = 0U;
 50 |         } else {
 51 |             fprintf(stderr, "Error: %s\n", mach_error_string(err));
 52 |         }
 53 | 
 54 |     usleep(msleep_time*1000);
 55 |     }
 56 | }
 57 | 
 58 | 
 59 | static int check_system() {
 60 |     processor_info_array_t cpuInfo = NULL;
 61 |     mach_msg_type_number_t numCpuInfo;
 62 |     natural_t numCPUsU = 0U;
 63 | 
 64 |     kern_return_t err = host_processor_info(mach_host_self(), PROCESSOR_CPU_LOAD_INFO, &numCPUsU, &cpuInfo, &numCpuInfo);
 65 | 
 66 |     if (err == KERN_SUCCESS) {
 67 |         if (numCPUsU > 0){
 68 |             return 0;
 69 |         }else{
 70 |             fprintf(stderr, "The call was successful but the data is wrong.");
 71 |             return 1;
 72 |         }
 73 |     }else{
 74 |         fprintf(stderr, "There was an error getting CPU info: %s\n", mach_error_string(err));
 75 |         return err;
 76 |     }
 77 | }
 78 | 
 79 | int main(int argc, char **argv) {
 80 | 
 81 |     int c;
 82 |     unsigned int msleep_time=1000;
 83 | 
 84 |     setvbuf(stdout, NULL, _IONBF, 0);
 85 | 
 86 |     while ((c = getopt (argc, argv, "i:hc")) != -1) {
 87 |         switch (c) {
 88 |         case 'h':
 89 |             printf("Usage: %s [-i msleep_time] [-h]\n\n",argv[0]);
 90 |             printf("\t-h      : displays this help\n");
 91 |             printf("\t-i      : specifies the milliseconds sleep time that will be slept between measurements\n");
 92 |             printf("\t-c      : check system and exit\n\n");
 93 |             exit(0);
 94 |         case 'i':
 95 |             msleep_time = atoi(optarg);
 96 |             if (msleep_time < 50){
 97 |                 fprintf(stderr,"A value of %i is to small. Results will include 0s as the kernel does not update as fast.\n",msleep_time);
 98 |             }
 99 |             break;
100 |         case 'c':
101 |             exit(check_system());
102 |         default:
103 |             fprintf(stderr,"Unknown option %c\n",c);
104 |             exit(-1);
105 |         }
106 |     }
107 | 
108 |     loop_utilization(msleep_time);
109 | 
110 |     return 0;
111 | }
112 | 


--------------------------------------------------------------------------------
/demo-reporter/static-binary-linux-amd64:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/green-coding-solutions/cloud-energy/98a138553a5c1f5648202acacd8426c27b8d3291/demo-reporter/static-binary-linux-amd64


--------------------------------------------------------------------------------
/demo-reporter/static-binary-linux-arm64:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/green-coding-solutions/cloud-energy/98a138553a5c1f5648202acacd8426c27b8d3291/demo-reporter/static-binary-linux-arm64


--------------------------------------------------------------------------------
/hyperparameter_tuning.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pandas as pd
 3 | from xgboost import XGBRegressor
 4 | from sklearn.metrics import mean_absolute_error
 5 | from sklearn.model_selection import train_test_split
 6 | from sklearn.metrics import mean_squared_error
 7 | import optuna
 8 | 
 9 | 
10 | def objective(trial):
11 | 
12 |     params = {
13 |         "tree_method":"exact",
14 |         'max_depth': trial.suggest_int('max_depth', 3, 10),
15 |         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.5),
16 |         'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
17 |         'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
18 |         'random_state': trial.suggest_int('random_state', 1, 1000)
19 |     }
20 | 
21 | 
22 |     inner_model = XGBRegressor(
23 |         **params,
24 |         n_jobs=-1,
25 |         early_stopping_rounds=10 #should be around 10% of amount of trials
26 |     )
27 | 
28 |     inner_model.fit(X_train, y_train,         eval_set=[(X_valid, y_valid)]        ,
29 | 
30 |               verbose=0
31 |    )
32 | 
33 |     y_hat = inner_model.predict(X_valid)
34 | 
35 |     # Note: This is only meaningful for Regressors. Use a different metric for Classifiers!
36 |     return mean_squared_error(y_valid, y_hat, squared=False)
37 | 
38 | 
39 | df = pd.read_csv(f"{os.path.dirname(os.path.abspath(__file__))}/data/spec_data_cleaned.csv")
40 | 
41 | X = df[df.CPUChips == 2] # Re-run script with a tuning for every amount of CPUChips
42 | y = X["power"]
43 | X = X.drop(columns=["power"])
44 | 
45 | Z = pd.DataFrame.from_dict({
46 |     'HW_CPUFreq' : [],
47 |     'CPUCores': [],
48 |     'CPUThreads': [],
49 |     'TDP': [],
50 |     'Hardware_Availability_Year': [],
51 |     'HW_MemAmountGB': [],
52 |     'Architecture': [],
53 |     'CPUMake': [],
54 |     'utilization': []
55 | })
56 | 
57 | X = X[Z.columns]
58 | 
59 | 
60 | X = pd.get_dummies(X, columns=["CPUMake", "Architecture"])
61 | 
62 | X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2)
63 | study = optuna.create_study(direction='minimize', study_name='regression')
64 | study.optimize(objective, n_trials=100) # Love to do 100, but this leads to an underflow error ... unclear why
65 | print('Number of finished trials:', len(study.trials))
66 | print('Best trial:', study.best_trial.params)
67 | model = XGBRegressor(**study.best_trial.params, early_stopping_rounds=4)
68 | 
69 | model.fit(X_train,y_train,eval_set=[(X_valid, y_valid)],verbose=False)
70 | y_pred_default = model.predict(X_valid)
71 | print("Mean Absolute Error:" , mean_absolute_error(y_pred_default,y_valid))
72 | print("Mean Squared Error:" , mean_squared_error(y_valid, y_pred_default, squared=False))
73 | 
74 | 
75 | print("\n### BASE")
76 | model = XGBRegressor(random_state=study.best_trial.params['random_state'])
77 | model.fit(X_train,y_train,eval_set=[(X_valid, y_valid)],verbose=False)
78 | y_pred_default = model.predict(X_valid)
79 | 
80 | print("Mean Absolute Error:" , mean_absolute_error(y_pred_default,y_valid))
81 | print("Mean Squared Error:" , mean_squared_error(y_valid, y_pred_default, squared=False))
82 | 


--------------------------------------------------------------------------------
/img/fujitsu_TX1330_SPEC.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/green-coding-solutions/cloud-energy/98a138553a5c1f5648202acacd8426c27b8d3291/img/fujitsu_TX1330_SPEC.png


--------------------------------------------------------------------------------
/img/fujitsu_TX1330_measured.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/green-coding-solutions/cloud-energy/98a138553a5c1f5648202acacd8426c27b8d3291/img/fujitsu_TX1330_measured.png


--------------------------------------------------------------------------------
/img/hp_synergy_480_Gen10_Plus.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/green-coding-solutions/cloud-energy/98a138553a5c1f5648202acacd8426c27b8d3291/img/hp_synergy_480_Gen10_Plus.png


--------------------------------------------------------------------------------
/interact_validation.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pandas as pd
 3 | from xgboost import XGBRegressor
 4 | from sklearn.model_selection import RepeatedKFold
 5 | from sklearn.model_selection import cross_val_score
 6 | 
 7 | df = pd.read_csv(f"{os.path.dirname(os.path.abspath(__file__))}/data/spec_data_cleaned.csv")
 8 | 
 9 | df_new = df.copy()
10 | df_new = df_new[df_new.CPUChips == 2] # Fit a model for every amount of CPUChips
11 | 
12 | X = df_new[[
13 |     'HW_MemAmountGB',
14 |     'TDP',
15 |     'utilization',
16 |     'CPUCores',
17 |     'CPUThreads',
18 |     'HW_CPUFreq',
19 |     'Hardware_Availability_Year',
20 |     'HW_FormFactor',
21 |     'HW_Vendor'
22 | ]]
23 | X = pd.get_dummies(X, columns=['HW_FormFactor', 'HW_Vendor'])
24 | y = df_new.power
25 | model = XGBRegressor()
26 | kfold = RepeatedKFold()
27 | kf_cv_scores = cross_val_score(model, X, y, cv=kfold, scoring='neg_mean_absolute_error')
28 | # pylint: disable=consider-using-f-string
29 | print(f"[Interact DC Original (untuned)] K-fold CV score range: \
30 |     {kf_cv_scores.min():.2f} < {kf_cv_scores.mean():.2f} < {kf_cv_scores.max():.2f}"
31 | )
32 | 
33 | X = df_new[[
34 |     'HW_MemAmountGB',
35 |     'TDP',
36 |     'utilization',
37 |     'CPUCores',
38 |     'CPUThreads',
39 |     'HW_CPUFreq',
40 |     'Hardware_Availability_Year'
41 | ]]
42 | y = df_new.power
43 | model = XGBRegressor()
44 | kfold = RepeatedKFold()
45 | kf_cv_scores = cross_val_score(model, X, y, cv=kfold, scoring='neg_mean_absolute_error')
46 | # pylint: disable=consider-using-f-string
47 | print(f"[Interact DC cloud available variables (untuned)] K-fold CV score range: \
48 |         {kf_cv_scores.min():.2f} < {kf_cv_scores.mean():.2f} < {kf_cv_scores.max():.2f}"
49 | )
50 | 
51 | 
52 | 
53 | X = df_new[[
54 |     'HW_MemAmountGB',
55 |     'TDP',
56 |     'utilization',
57 |     'CPUCores',
58 |     'CPUThreads',
59 |     'HW_CPUFreq',
60 |     'Hardware_Availability_Year',
61 |     'Architecture',
62 |     'CPUMake'
63 | ]]
64 | X = pd.get_dummies(X, columns=['Architecture', 'CPUMake'])
65 | y = df_new.power
66 | model = XGBRegressor()
67 | kfold = RepeatedKFold()
68 | kf_cv_scores = cross_val_score(model, X, y, cv=kfold, scoring='neg_mean_absolute_error')
69 | # pylint: disable=consider-using-f-string
70 | print(f"[Our variable selection (untuned)] K-fold CV score range: \
71 |         {kf_cv_scores.min():.2f} < {kf_cv_scores.mean():.2f} < {kf_cv_scores.max():.2f}"
72 | )
73 | 
74 | ## Expected output from 07.12.2022 with the pre-interpolated data
75 | ## [Interact DC Original (untuned)] K-fold CV score range: -4.70 < -4.54 < -4.33
76 | ## [Interact DC cloud available variables (untuned)] K-fold CV score range: -8.00 < -7.88 < -7.80
77 | ## [Our variable selection (untuned)] K-fold CV score range: -8.13 < -8.02 < -7.93
78 | 


--------------------------------------------------------------------------------
/ols.py:
--------------------------------------------------------------------------------
 1 | #pylint: disable=invalid-name
 2 | 
 3 | import sys
 4 | import statsmodels.formula.api as smf
 5 | import pandas as pd
 6 | 
 7 | def train_model(cpu_chips, ram, tdp, cpu_threads):
 8 | 
 9 |     df = pd.read_csv('./data/spec_data_cleaned.csv')
10 | 
11 |     formula = 'power ~ utilization'
12 | 
13 |     if cpu_threads is not None:
14 |         formula = f"{formula} + CPUThreads"
15 | 
16 |     if cpu_chips is not None:
17 |         formula = f"{formula}*C(CPUChips)"
18 | 
19 |     if ram is not None:
20 |         formula = f"{formula} + HW_MemAmountGB"
21 | 
22 |     if tdp is not None:
23 |         formula = f"{formula} + TDP"
24 | 
25 |     return smf.ols(formula=formula, data=df).fit()
26 | 
27 | if __name__ == '__main__':
28 |     import argparse
29 | 
30 |     parser = argparse.ArgumentParser()
31 | 
32 |     parser.add_argument('--cpu-chips', type=float, help='Number of CPUChips')
33 |     parser.add_argument('--cpu-threads', type=float, help='Number of CPU Threads')
34 |     parser.add_argument('--cpu-freq',
35 |         type=float,
36 |         help='CPU frequency. (Not used. Only for compatibility with XGBoost model)'
37 |     )
38 |     parser.add_argument('--tdp', type=float, help='TDP of the CPU')
39 |     parser.add_argument('--ram', type=float, help='Amount of RAM for the bare metal system')
40 |     parser.add_argument('--vhost-ratio',
41 |         type=float,
42 |         help='Virtualization ratio of the system. Input numbers between (0,1].',
43 |         default=1.0
44 |     )
45 |     parser.add_argument('--silent',
46 |         action='store_true',
47 |         help='Will suppress all debug output. Typically used in production.'
48 |     )
49 | 
50 |     args = parser.parse_args()
51 | 
52 |     model = train_model(args.cpu_chips, args.ram, args.tdp, args.cpu_threads)
53 |     my_data = pd.DataFrame.from_dict({
54 |         'utilization': 0,
55 |         'CPUChips': [args.cpu_chips],
56 |         'CPUThreads': [args.cpu_threads],
57 |         'HW_MemAmountGB': [args.ram],
58 |         'TDP' : [args.tdp]
59 |     })
60 | 
61 |     # Drop all arguments that were not supplied
62 |     my_data = my_data.dropna(axis=1)
63 | 
64 |     if not args.silent:
65 |         print('Sending following dataframe to model', my_data)
66 |         print('vHost ratio is set to ', args.vhost_ratio)
67 | 
68 | 
69 |     for line in sys.stdin:
70 |         my_data['utilization'] = float(line.strip())
71 |         print(model.predict(my_data)[0] * args.vhost_ratio)
72 | 


--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | optuna==4.3.0
2 | 


--------------------------------------------------------------------------------
/requirements-docker.txt:
--------------------------------------------------------------------------------
1 | numpy==2.2.4
2 | pandas==2.2.3
3 | xgboost==2.1.4
4 | pyarrow==20.0.0
5 | scikit-learn==1.6.1


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | numpy==2.2.4
 2 | pandas==2.2.3
 3 | # Use xgboost-cpu on Linux or Windows x86_64
 4 | xgboost-cpu==2.1.4; sys_platform == "linux" and platform_machine == "x86_64"
 5 | xgboost-cpu==2.1.4; sys_platform == "win32" and platform_machine == "x86_64"
 6 | 
 7 | # Use xgboost on all other platforms (e.g., macOS Intel or ARM, or non-x86_64)
 8 | xgboost==2.1.4; sys_platform == "darwin"
 9 | xgboost==2.1.4; platform_machine != "x86_64"
10 | psutil==7.0.0
11 | pyarrow==20.0.0
12 | scikit-learn==1.6.1


--------------------------------------------------------------------------------
/scripts/create_data_csv.py:
--------------------------------------------------------------------------------
  1 | import csv, re, os
  2 | 
  3 | header = [
  4 | 
  5 | 'Test_Sponsor', 'SPEC_License', 'Test_Method', 'Tested_By', 'Test_Location',
  6 | 'Test_Date', 'Hardware_Availability', 'Software_Availability', 'Publication',
  7 | 'System_Source', 'System_Designation', 'Power_Provisioning',
  8 | 
  9 | '100_ActualLoad', '100_ssj_ops', '100_AvgPower', '100_PerfPowerRatio',
 10 | '90_ActualLoad', '90_ssj_ops', '90_AvgPower', '90_PerfPowerRatio',
 11 | '80_ActualLoad', '80_ssj_ops', '80_AvgPower', '80_PerfPowerRatio',
 12 | '70_ActualLoad', '70_ssj_ops', '70_AvgPower', '70_PerfPowerRatio',
 13 | '60_ActualLoad', '60_ssj_ops', '60_AvgPower', '60_PerfPowerRatio',
 14 | '50_ActualLoad', '50_ssj_ops', '50_AvgPower', '50_PerfPowerRatio',
 15 | '40_ActualLoad', '40_ssj_ops', '40_AvgPower', '40_PerfPowerRatio',
 16 | '30_ActualLoad', '30_ssj_ops', '30_AvgPower', '30_PerfPowerRatio',
 17 | '20_ActualLoad', '20_ssj_ops', '20_AvgPower', '20_PerfPowerRatio',
 18 | '10_ActualLoad', '10_ssj_ops', '10_AvgPower', '10_PerfPowerRatio', 'ActiveIdle',
 19 | 
 20 | 'HW_Vendor', 'HW_Model', 'HW_FormFactor', 'HW_CPUName',
 21 | 'HW_CPUChars', 'HW_CPUFreq', 'HW_CPUsEnabled','HW_HardwareThreads', 
 22 | 'HW_CPUsOrderable', 'HW_PrimaryCache', 'HW_SecondaryCache','HW_TertiaryCache',
 23 | 'HW_OtherCache', 'HW_MemAmountGB','HW_DIMMNumAndSize','HW_MemDetails', 
 24 | 'HW_PSUQuantAndRating', 'HW_PSUDetails','HW_DiskDrive','HW_DiskController',
 25 | 'HW_NICSNumAndType', 'HW_NICSFirm/OS/Conn','HW_NetSpeedMbit','HW_Keyboard','HW_Mouse',
 26 | 'HW_Monitor', 'HW_OpticalDrive', 'HW_Other',
 27 | 
 28 | 'SW_PowerManagement', 'SW_OS', 'SW_OSVersion',
 29 | 'SW_Filesystem', 'SW_JVMVendor', 'SW_JVMVersion', 'SW_JVMCLIOpts', 
 30 | 'SW_JVMAffinity', 'SW_JVMInstances', 'SW_JVMInitialHeapMB', 'SW_JVMMaxHeapMB', 
 31 | 'SW_JVMAddressBits', 'SW_BootFirmwareVersion', 'SW_MgmtFirmwareVersion', 
 32 | 'SW_WorkloadVersion', 'SW_DirectorLocation', 'SW_Others', 
 33 | 
 34 | 'SUT_BIOS', 'SUT_Firmware', 'SUT_Notes',
 35 | ]
 36 | 
 37 | rows = []
 38 | rowcount=-1
 39 | 
 40 | for f in os.scandir('../data/raw/spec-power/'):
 41 |     if f.is_file():
 42 |         rowcount+=1
 43 |         rows.append([])
 44 |         o = open(f,'r')
 45 |         text = o.read()
 46 |         o.close()
 47 | 
 48 |         ## Get Test info
 49 |         m = re.search(
 50 |             'Test Sponsor:</a></td>$\s*.*>(.*)</td>$'                      # 1
 51 |             '\s*.*SPEC License #:</a></td>$\s*.*>(.*)</td>$'               # 2
 52 |             '\s*.*Test Method:</a></td>$\s*.*>(.*)</td>$\s*</tr>$\s*<tr>$' # 3
 53 |             '\s*.*Tested By:</a></td>$\s*.*>(.*)</td>$'                    # 4
 54 |             '\s*.*Test Location:</a></td>$\s*.*>(.*)</td>$'                # 5
 55 |             '\s*.*Test Date:</a></td>$\s*.*>(.*)</td>$\s*</tr>$\s*<tr>$'   # 6
 56 |             '\s*.*Hardware Availability:</a></td>$\s*.*>(.*)</td>$'        # 7
 57 |             '\s*.*Software Availability:</a></td>$\s*.*>(.*)</td>$'        # 8
 58 |             '\s*.*Publication:</a></td>$\s*.*>(.*)</td>$\s*</tr>$\s*<tr>$' # 9
 59 |             '\s*.*System Source:</a></td>$\s*.*>(.*)</td>$'                # 10
 60 |             '\s*.*System Designation:</a></td>$\s*.*>(.*)</td>$'           # 11
 61 |             '\s*.*Power Provisioning:</a></td>$\s*.*>(.*)</td>$'           # 12
 62 |             ,text , re.M)
 63 | 
 64 |         if m:
 65 |             for x in range(1,13):
 66 |                 rows[rowcount].append(m.group(x))
 67 | 
 68 | 
 69 |         ## Get Power Chart
 70 |         for x in range(100, 0, -10):
 71 |             m = re.search(f'<td>{x}%</td>$'
 72 |                 '\s*<td>(.*)%</td>$'
 73 |                 '\s*<td>(.*)</td>$'
 74 |                 '\s*<td>(.*)</td>$'
 75 |                 '\s*<td>(.*)</td>$'
 76 |                 , text, re.M)
 77 |             if m:
 78 |                 ssj_ops_cln = re.sub(',', "", m.group(2))
 79 |                 avg_pwr_cln = re.sub(',', "", m.group(3))
 80 |                 perf_pwr_ratio_cln = re.sub(',', "", m.group(4))
 81 |                 rows[rowcount].extend([m.group(1), ssj_ops_cln, avg_pwr_cln
 82 |                     , perf_pwr_ratio_cln])
 83 |                 #print(f"Actual Load: {m.group(1)} --- ssj_ops: {m.group(2)} --- avg.power: {m.group(3)} --- perf.power.ratio: {m.group(4)}\n")
 84 |         
 85 |         ## Get Idle Power
 86 |         m = re.search('Active Idle.*$'
 87 |             '\s*<td>.*</td>$'
 88 |             '\s*<td>(.*)</td>$'
 89 |             , text, re.M)
 90 |         if m: rows[rowcount].append(m.group(1))
 91 | 
 92 |         ## Get Hardware Info
 93 |         m = re.search('Hardware Vendor:</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$'   # 1 
 94 |             '\s*.*Model:</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$'                  # 2   
 95 |             '\s*.*Form Factor:</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$'            # 3
 96 |             '\s*.*CPU Name:</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$'               # 4
 97 |             '\s*.*CPU Characteristics:</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$'    # 5
 98 |             '\s*.*CPU Frequency \(MHz\):</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$'  # 6
 99 |             '\s*.*CPU\(s\) Enabled:</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$'       # 7
100 |             '\s*.*Hardware Threads:</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$'       # 8
101 |             '\s*.*CPU\(s\) Orderable:</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$'     # 9
102 |             '\s*.*Primary Cache:</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$'          # 10
103 |             '\s*.*Secondary Cache:</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$'        # 11
104 |             '\s*.*Tertiary Cache:</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$'         # 12
105 |             '\s*.*Other Cache:</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$'            # 13
106 |             '\s*.*Memory Amount \(GB\):</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$'   # 14 
107 |             '\s*.*# and size of DIMM:</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$'     # 15
108 |             '\s*.*Memory Details:</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$'         # 16
109 |             '\s*.*Power Supply Quantity and Rating \(W\):</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$' #17
110 |             '\s*.*Power Supply Details:</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$'   # 18
111 |             '\s*.*Disk Drive:</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$'             # 19
112 |             '\s*.*Disk Controller:</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$'        # 20
113 |             '\s*.*# and type of Network Interface Cards \(NICs\) Installed:</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$' # 21
114 |             '\s*.*NICs Enabled in Firmware / OS / Connected:</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$' # 22
115 |             '\s*.*Network Speed \(Mbit\):</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$' # 23
116 |             '\s*.*Keyboard:</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$'               # 24
117 |             '\s*.*Mouse:</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$'                  # 25
118 |             '\s*.*Monitor:</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$'                # 26
119 |             '\s*.*Optical Drives:</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$'         # 27
120 |             '\s*.*Other Hardware:</a></td>$\s*.*>(.*)</td>'                          # 28
121 |             ,text , re.M)
122 | 
123 |         if m: #print(m.group(28))
124 |             for x in range(1,29):
125 |                 rows[rowcount].append(m.group(x))
126 | 
127 |         ## Get Software Info
128 |         m = re.search('Power Management:</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$'  # 1
129 |             '\s*.*Operating System \(OS\):</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$' # 2   
130 |             '\s*.*OS Version:</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$'             # 3   
131 |             '\s*.*Filesystem:</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$'             # 4   
132 |             '\s*.*JVM Vendor:</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$'             # 5   
133 |             '\s*.*JVM Version:</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$'            # 6   
134 |             '\s*.*JVM Command-line Options:</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$' # 7   
135 |             '\s*.*JVM Affinity:</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$'           # 8
136 |             '\s*.*JVM Instances:</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$'          # 9
137 |             '\s*.*JVM Initial Heap \(MB\):</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$' # 10
138 |             '\s*.*JVM Maximum Heap \(MB\):</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$' # 11
139 |             '\s*.*JVM Address Bits:</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$'       # 12
140 |             '\s*.*Boot Firmware Version:</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$'  # 13
141 |             '\s*.*Management Firmware Version:</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$' # 14
142 |             '\s*.*Workload Version:</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$'       # 15
143 |             '\s*.*Director Location:</a></td>$\s*.*>(.*)</td>\s*</tr>$\s*<tr>$'      # 16
144 |             '\s*.*Other Software:</a></td>$\s*.*>(.*)</td>'                          # 17
145 |             ,text , re.M)
146 | 
147 |         if m: #print(m.group(17))
148 |             for x in range(1, 18):
149 |                 rows[rowcount].append(m.group(x))
150 | 
151 |         ## Get SUT Notes and BIOS / Firmware
152 |         m = re.search(
153 |             'Boot Firmware Settings</a></div>$\s*<div class=\'freeForm\'>\s*([\w\W]*?)\s*</div>$'              # 1
154 |             '[\w\W]*?Management Firmware Settings</a></div>$\s*<div class=\'freeForm\'>\s*([\w\W]*?)\s*</div>$'# 2
155 |             '[\w\W]*?System Under Test Notes</a></div>$\s*<div class=\'freeForm\'>\s*([\w\W]*?)\s*</div>$'     # 3
156 |             ,text , re.M)
157 | 
158 |         if m:
159 |             for x in range(1,4):
160 |                 ### Elements are lists inside. We remove HTML tags and separate by ;;;
161 |                 group = m.group(x).replace('<li>', ';;;', case=False) \
162 |                                   .replace('</li>', '', case=False) \
163 |                                   .replace('<ul>', '', case=False) \
164 |                                   .replace('</ul>', '', case=False) \
165 |                                   .strip()
166 |                 rows[rowcount].append(group)
167 | 
168 | 
169 | #print(rows)
170 | with open('../data/spec_data.csv', 'w', encoding='UTF8', newline='\n') as f:
171 |     writer = csv.writer(f, delimiter='|')
172 |     writer.writerow(header)
173 |     writer.writerows(rows)
174 | 


--------------------------------------------------------------------------------
/scripts/data_cleaning.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import glob
  3 | import pandas as pd
  4 | import include.helper_functions as helper
  5 | 
  6 | """
  7 |     This file reads the generated CSV file into a data frame
  8 |     and applies some cleaning and feature engineering to feed the data into
  9 |     a linear model.
 10 | 
 11 |     If the SPECPower data ever gets updated please walk this file manually
 12 |     from top to bottom.
 13 | 
 14 |     In the file some asserts are given, but also manual checks should be run
 15 | """
 16 | 
 17 | 
 18 | def remove_unneeded_columns(df_original):
 19 |     df = df_original.copy()
 20 |     # Just for simplicity, remove columns that we currently do not need
 21 |     df = df.drop(
 22 |         ['100_ActualLoad', '100_ssj_ops', "100_PerfPowerRatio",
 23 |            '90_ActualLoad', '90_ssj_ops',  '90_PerfPowerRatio',
 24 |            '80_ActualLoad', '80_ssj_ops',  '80_PerfPowerRatio',
 25 |            '70_ActualLoad', '70_ssj_ops',  '70_PerfPowerRatio',
 26 |            '60_ActualLoad', '60_ssj_ops',  '60_PerfPowerRatio',
 27 |            '50_ActualLoad', '50_ssj_ops',  '50_PerfPowerRatio',
 28 |            '40_ActualLoad', '40_ssj_ops',  '40_PerfPowerRatio',
 29 |            '30_ActualLoad', '30_ssj_ops',  '30_PerfPowerRatio',
 30 |            '20_ActualLoad', '20_ssj_ops',  '20_PerfPowerRatio',
 31 |            '10_ActualLoad', '10_ssj_ops',  '10_PerfPowerRatio'], axis=1)
 32 |     helper.columns_diff(df, df_original)
 33 |     return df
 34 | 
 35 | def split_hardware_availabilty(df_original):
 36 |     df = df_original.copy()
 37 | 
 38 |     availability = df["Hardware_Availability"].str.split("-", expand=True)
 39 |     df["Hardware_Availability_Month"] = availability[0]
 40 |     df["Hardware_Availability_Year"]  = availability[1]
 41 |     df["Hardware_Availability_Year"] =  df["Hardware_Availability_Year"].astype(int)
 42 | 
 43 |     return df
 44 | 
 45 | def melt_power_and_load(df_original):
 46 |     df = df_original.copy()
 47 | 
 48 |     melt_columns = ['100_AvgPower', '90_AvgPower', '80_AvgPower', '70_AvgPower',
 49 |            '60_AvgPower', '50_AvgPower', '40_AvgPower', '30_AvgPower',
 50 |            '20_AvgPower', '10_AvgPower', 'ActiveIdle']
 51 |     remaining_columns = df.columns[~df.columns.isin(melt_columns)]
 52 | 
 53 |     df = df.melt(
 54 |         id_vars=remaining_columns,
 55 |         value_vars=melt_columns,
 56 |         var_name='utilization',
 57 |         value_name="power",
 58 |     )
 59 | 
 60 |     helper.columns_diff(df, df_original)
 61 |     return df
 62 | 
 63 | def clean_power_and_load(df_original):
 64 |     df = df_original.copy()
 65 |     df.utilization = df.utilization.str.replace('ActiveIdle', '0')
 66 |     df.utilization = df.utilization.str.replace('100_AvgPower', '100')
 67 |     df.utilization = df.utilization.str.replace('90_AvgPower', '90')
 68 |     df.utilization = df.utilization.str.replace('90_AvgPower', '90')
 69 |     df.utilization = df.utilization.str.replace('80_AvgPower', '80')
 70 |     df.utilization = df.utilization.str.replace('70_AvgPower', '70')
 71 |     df.utilization = df.utilization.str.replace('60_AvgPower', '60')
 72 |     df.utilization = df.utilization.str.replace('50_AvgPower', '50')
 73 |     df.utilization = df.utilization.str.replace('40_AvgPower', '40')
 74 |     df.utilization = df.utilization.str.replace('30_AvgPower', '30')
 75 |     df.utilization = df.utilization.str.replace('20_AvgPower', '20')
 76 |     df.utilization = df.utilization.str.replace('10_AvgPower', '10')
 77 | 
 78 |     df.utilization = df.utilization.astype(int)
 79 |     helper.same_column_diff(df, df, 'utilization')
 80 | 
 81 | 
 82 |     return df
 83 | 
 84 | 
 85 | def create_cpu_make(df_original):
 86 |     df = df_original.copy()
 87 |     df["CPUMake"] = None
 88 | 
 89 |     assert df.loc[(df['HW_CPUName'].str.contains("Intel", case=False)) & (df['HW_CPUName'].str.contains("AMD", case=False)), "CPUMake"].empty # Intel and AMD never in one column
 90 | 
 91 |     df.loc[df['HW_CPUName'].str.contains("Intel"), "CPUMake"] = "intel"
 92 |     df.loc[df['HW_CPUName'].str.contains("AMD"), "CPUMake"] = "amd"
 93 | 
 94 | 
 95 |     ## How many do we have left?
 96 |     df[df.CPUMake.isna()].HW_CPUName
 97 | 
 98 |     ## Currently we only see the Xeon L5420 @Dan: Was not specific
 99 |     df.loc[df.HW_CPUName== "Xeon L5420", "CPUMake"] = "intel"
100 | 
101 |     helper.new_column_diff(df, 'HW_CPUName', 'CPUMake')
102 | 
103 | 
104 |     # All the makes should now either be intel or AMD
105 |     assert list(df['CPUMake'].unique()) == ["intel", "amd"], "CPUMake contained vendors other than AMD / Intel"
106 | 
107 |     return df
108 | 
109 | def create_cpu_name(df_original):
110 |     df = df_original.copy()
111 | 
112 |     df['CPUName'] = df['HW_CPUName']
113 | 
114 | 
115 |     ## Now remove the vendor from the column and generate a new
116 |     df['CPUName'] = df['CPUName'].str.replace(r'Intel\s*', "", regex=True, flags=re.IGNORECASE)
117 |     df['CPUName'] = df['CPUName'].str.replace(r'AMD\s*', "", regex=True, flags=re.IGNORECASE)
118 | 
119 | 
120 |     df['CPUName'] = df['CPUName'].str.replace(r"\(\s*Intel\s*Turbo\s*Boost\s*Technology\s*up\s*to\s*\d+\.\d*\s*GHz\s*\)", "", regex=True, flags=re.IGNORECASE)
121 |     df['CPUName'] = df['CPUName'].str.replace(r"\(\s*Turbo\s*Boost\s*Technology\s*up\s*to\s*\d+\.\d*\s*GHz\s*\)", "", regex=True, flags=re.IGNORECASE)
122 |     df['CPUName'] = df['CPUName'].str.replace(r"\(\s*Turbo\s*CORE\s*Technology\s*up\s*to\s*\d+\.\d*\s*GHz\s*\)", "", regex=True, flags=re.IGNORECASE)
123 |     df['CPUName'] = df['CPUName'].str.replace(r"\(\s*\d+\.\d*\s*GHz\s*\)", "", regex=True,flags=re.IGNORECASE) # remove only frequency
124 |     df['CPUName'] = df['CPUName'].str.replace(r"\(\s*r\s*\)", "", regex=True,flags=re.IGNORECASE) # remove (r)
125 |     df['CPUName'] = df['CPUName'].str.replace(r"Processor\s*", "", regex=True,flags=re.IGNORECASE) # remove Processor
126 |     df['CPUName'] = df['CPUName'].str.replace(r"@?\s*\d+\.\d*\s*GHz\s*", "", regex=True,flags=re.IGNORECASE) # remove @ 2.3 GHz
127 |     df['CPUName'] = df['CPUName'].str.replace(r"CPU\s*", "", regex=True,flags=re.IGNORECASE) # remove CPU
128 |     df['CPUName'] = df['CPUName'].str.replace(r"\w+-Core\s*", "", regex=True,flags=re.IGNORECASE) # remove Quad-Core etc.
129 |     df['CPUName'] = df['CPUName'].str.replace(r",\s*", "", regex=True,flags=re.IGNORECASE) # remove ,
130 | 
131 |     # Unique cases
132 |     df['CPUName'] = df['CPUName'].str.replace("Dell SKU [338-BNCG]", "", regex=False) # remove special case Dell SKU [338-BNCG]
133 |     df.loc[df.CPUName == 'X3350', 'CPUName'] = 'xeonx3350'
134 |     df.loc[df.CPUName == 'E3-1260L v5', 'CPUName'] = 'xeone3-1260lv5'
135 |     df.loc[df.CPUName == 'Xeon', 'CPUName'] = 'xeon-undefined' # move to XeonUNDEFINED so the model will later have no false-positive match for "Xeon"
136 | 
137 |     df['CPUName'] = df['CPUName'].str.replace(r"\s*", "", regex=True) # normalize
138 |     df.CPUName = df.CPUName.str.lower() # normalize
139 | 
140 |     assert df[~df.CPUName.str.match(r"opteron|xeon|epyc|pentium|corei3|corei5|corei7")].CPUName.empty, "Unknown processors in CPUName apart from Opteron|Xeon|EPYC|Pentium|Corei3|Corei5|Corei7"
141 | 
142 |     assert df[df['CPUName'].str.contains('(', regex=False)].empty, "Still brackets () in CPUName"
143 | 
144 |     # validate what we have as uniques
145 |     helper.visual_check(df.CPUName.unique(), "All names ok?") #DEBUG
146 | 
147 |     helper.new_column_diff(df, 'HW_CPUName', 'CPUName')
148 | 
149 |     return df
150 | 
151 | def create_turbo_boost(df_original):
152 |     df = df_original.copy()
153 | 
154 |     ## look at all the brackets and try to spot pattern.
155 |     df.HW_CPUName.str.match(".*\(\s*Turbo.*")
156 | 
157 |     ## how many? A: 41
158 |     df.loc[df.HW_CPUName.str.match(".*\(\s*Turbo.*"), 'HW_CPUName'].count()
159 | 
160 |     # Create TurboBoost column
161 |     df["TurboBoostGHz"] = None
162 | 
163 |     # Check HW_CPUName column / count
164 |     df.loc[df['HW_CPUName'].str.match(".*\(.*Turbo.*\)"), 'HW_CPUName'].count()
165 |     df.loc[df['HW_CPUName'].str.match(".*\(.*up to (.*)GHz.*"), 'HW_CPUName'].count()
166 | 
167 |     # Fill column from CPU Name data
168 |     df['TurboBoostGHz'] = df['HW_CPUName'].str.extract(".*\(.*up to (\d+\.\d+)\s*GHz.*")
169 |     df[df['TurboBoostGHz'].notna()] # How many do we have
170 | 
171 |     helper.new_column_diff(df, 'HW_CPUName', 'TurboBoostGHz')
172 | 
173 |     return df
174 | 
175 | # X cores, X chips, X cores/chip
176 | def make_cpu_cores(df_original):
177 |     df = df_original.copy()
178 |     df['CPUCores'] = df.HW_CPUsEnabled.str.extract("^\s*(\d+)\s*cores?\s*,.*", flags=re.IGNORECASE).astype(int)
179 |     assert df[df.CPUCores.isna()].empty , "CPUCores contains NA"
180 |     helper.new_column_diff(df, 'CPUCores', 'HW_CPUsEnabled')
181 |     # recover original column
182 |     df.HW_CPUsEnabled = df_original.HW_CPUsEnabled
183 |     return df
184 | 
185 | def make_cpu_chips(df_original):
186 |     df = df_original.copy()
187 |     df['CPUChips'] = df.HW_CPUsEnabled.str.extract(".*,\s*(\d)+\s*chips?,.*", flags=re.IGNORECASE).astype(int)
188 |     assert df[df.CPUChips.isna()].empty, "CPUChips contains NA"
189 |     helper.new_column_diff(df, 'CPUChips', 'HW_CPUsEnabled')
190 |     df.HW_CPUsEnabled = df_original.HW_CPUsEnabled
191 |     return df
192 | 
193 | 
194 | # 8 (2 / core)
195 | # first number relevant
196 | # second number can be figured out from this / total cores, so shouldn't be needed
197 | # write code to double check that first, similar to FREQ comparision code
198 | 
199 | def make_hardware_threads(df_original):
200 |     df = df_original.copy()
201 |     df.HW_HardwareThreads.unique() # DEBUG
202 |     df['CPUThreads'] = df.HW_HardwareThreads.str.extract("^\s*(\d+)\s*\(.*").astype(int)
203 |     thread_per_core = df['CPUThreads']  / df['CPUCores']
204 |     thread_per_core_extract = df.HW_HardwareThreads.str.extract("\((\d+)\s*\/\s*core\)", expand=False, flags=re.IGNORECASE).dropna().astype(int)
205 |     assert thread_per_core.eq(thread_per_core_extract).all(), "Threads per core was not equal to comparison via calculation"
206 | 
207 |     return df
208 | 
209 | # can be split into 2 easily
210 | #   2 x 700W
211 | #   1 x 1100
212 | # NumOfPSU, PSURating
213 | def split_psu(df_original):
214 |     df = df_original.copy()
215 | 
216 |     df.HW_PSUQuantAndRating.unique()
217 | 
218 |     df['NumOfPSU'] = df.HW_PSUQuantAndRating.str.extract("^\s*(\d+)\s*x.*")
219 |     df['NumOfPSU'] = df['NumOfPSU'].astype('Int64')
220 |     df['PSURating'] = df.HW_PSUQuantAndRating.str.extract(".*x\s*(\d+).*").astype('Int64')
221 |     df['PSURating'] = df['PSURating'].astype('Int64')
222 | 
223 |     ## Assert that all is good here
224 |     # well since some of them are actually empty, not sure what to actually check for
225 | 
226 |     # 6 (66 expanded) rows are empty is allowed with the specified hashes ...
227 |     # but somehow the hash column is gone and I cannot reference it ... why?
228 |     #assert df.drop(df.hash)[df.NumOfPSU.isna()].empty
229 |     #assert df[df.PSURating.isna()].empty
230 | 
231 |     assert df.NumOfPSU.isna().sum() == 6, "PSU was not 6"
232 |     return df
233 | 
234 | 
235 | ## Column with cpu family
236 | # do by known names - Xeon, Opteron, etc.
237 | ## Xeon , Opteron , EPYC ,
238 | # do unique, figure it out from there
239 | def make_cpu_family(df_original):
240 |     df = df_original.copy()
241 |     df['CPUFamily'] = None
242 |     pat = r"xeon|opteron|epyc|i3|i5|pentium"
243 | 
244 |     assert df.loc[~df['CPUName'].str.contains(pat, regex=True)].CPUName.empty, "Unknown family found"
245 | 
246 | 
247 |     possible_families = [
248 |         df.loc[df['CPUName'].str.contains("xeon"), "CPUName"],
249 |         df.loc[df['CPUName'].str.contains("opteron"), "CPUName"],
250 |         df.loc[df['CPUName'].str.contains("epyc"), "CPUName"],
251 |         df.loc[df['CPUName'].str.contains("i3"), "CPUName"],
252 |         df.loc[df['CPUName'].str.contains("i5"), "CPUName"],
253 |         df.loc[df['CPUName'].str.contains("pentium"), "CPUName"]
254 |     ]
255 | 
256 |     for i, possible_family in enumerate(possible_families):
257 |         for j in range(i+1, len(possible_families)):
258 |             # print(f"Checking {possible_family.iloc[0]} and {possible_families[j].iloc[0]}")
259 |             assert possible_family.index.intersection(possible_families[j].index).empty, f"Possible families had overlap - between {possible_family.iloc[0]} and {possible_families[j].iloc[0]}"
260 | 
261 | 
262 |     ## CPUFamily fill
263 |     df.loc[df['CPUName'].str.contains("xeon"), "CPUFamily"] = "xeon"
264 |     df.loc[df['CPUName'].str.contains("opteron"), "CPUFamily"] = "opteron"
265 |     df.loc[df['CPUName'].str.contains("epyc"), "CPUFamily"] = "epyc"
266 |     df.loc[df['CPUName'].str.contains("i3"), "CPUFamily"] = "core-i3"
267 |     df.loc[df['CPUName'].str.contains("i5"), "CPUFamily"] = "core-i5"
268 |     df.loc[df['CPUName'].str.contains("pentium"), "CPUFamily"] = "pentium"
269 | 
270 |     ## Assert CPU Family
271 | 
272 |     assert df.CPUFamily.isna().sum() == 0, "CPUFamily contained NA"
273 |     return df
274 | 
275 | 
276 | def make_l2_cache(df_original):
277 |     df = df_original.copy()
278 |     ## L2 Cache
279 |     df['L2CacheKB'] = None
280 |     mb_pat_l2='^\s*(?P<L2Cache>\d+)\s*mb' # remember flags=re.IGNORECASE)
281 |     kb_pat_l2='^\s*(?P<L2Cache>\d+)\s*kb' # remember flags=re.IGNORECASE)
282 |     mb_l2 = df.HW_SecondaryCache.str.extract(mb_pat_l2, flags=re.IGNORECASE).astype('float') * 1000
283 |     kb_l2 = df.HW_SecondaryCache.str.extract(kb_pat_l2, flags=re.IGNORECASE).astype('float')
284 |     df['L2CacheKB'] = mb_l2
285 |     df['L2CacheKB'] = df['L2CacheKB'].fillna(kb_l2['L2Cache'])
286 |     assert df['L2CacheKB'].isna().any() == False, "L2Cache contained empties"
287 |     ## TODO: Line 103 also seems stupidly big
288 | 
289 |     return df
290 | 
291 | def make_l3_cache(df_original):
292 |     df = df_original.copy()
293 |     ## L3 Cache, as #
294 |     df['L3CacheKB'] = None
295 |     mb_pat_l3='^\s*(?P<L3Cache>\d+)\s*mb' # remember case=False
296 |     kb_pat_l3='^\s*(?P<L3Cache>\d+)\s*kb' # remember case=False
297 |     mb_l3 = df.HW_TertiaryCache.str.extract(mb_pat_l3, flags=re.IGNORECASE).astype('float') * 1000
298 |     kb_l3 = df.HW_TertiaryCache.str.extract(kb_pat_l3, flags=re.IGNORECASE).astype('float')
299 |     df['L3CacheKB'] = mb_l3
300 |     df['L3CacheKB'] = df['L3CacheKB'].fillna(kb_l3['L3Cache'])
301 | 
302 |     # There is a typo in the L3 cache size. It is kb not MB, but only for these allowed processors
303 |     list(df[df['L3CacheKB'] > 39423000.0].CPUName.unique()) == ['XeonPlatinum8176', 'XeonPlatinum8280', 'XeonPlatinum8276L']
304 | 
305 |     df.loc[df['L3CacheKB'] > 39422000.0, 'L3CacheKB']
306 |     df.loc[df['L3CacheKB'] > 39423000.0, 'L3CacheKB'] = 39424.0
307 |     assert df['L3CacheKB'].isna().sum() == 85, f"L3Cache contained more than 85 empties: {df['L3CacheKB'].isna().sum()}"
308 | 
309 |     return df
310 | 
311 | 
312 | # This function is not used anymore
313 | # Problem being that only 50% of the architecture was matched! Rest was still NA
314 | def make_architecture_old(df_original):
315 |     df = df_original.copy()
316 | 
317 |     ## Import David Mytton's arch data into a dataframe
318 |     files = glob.glob("./../data/cpu_arch/*.csv")
319 |     pat='(intel|amd)-([^/]*).csv$'
320 |     arch = pd.DataFrame(columns=['architecture', 'value'])
321 |     for f in files:
322 |         header = re.search(pat,f).group(2)
323 |         f
324 |         csv = pd.read_csv(f, sep="|",header=None, names=[header])
325 |         csv = csv.melt(id_vars=None, value_vars=[header], var_name="architecture")
326 |         arch = arch.append(csv, ignore_index=True)
327 | 
328 |     arch
329 | 
330 |     df['Architecture'] = None
331 |     clean_names = df.CPUName.str.replace(r"xeon|opteron|core", "", regex=True)
332 | 
333 |     assert arch.value.nunique() == arch.shape[0], "Duplicate entries where in architecture lookup file"
334 | 
335 | 
336 |     arch.value = arch.value.str.lower().str.replace(r"\s*", "", regex=True) # normalize
337 | 
338 |     for i, clean_name in clean_names.iteritems():
339 |         found_architecture = arch[arch.value == clean_name].architecture
340 |         assert len(found_architecture) < 2
341 |         if not found_architecture.empty:
342 |             df.loc[i,'Architecture'] = found_architecture.iloc[0]
343 | 
344 | 
345 |     # opteron is known
346 |     df.loc[df['CPUName'].str.contains('opteron'), 'Architecture'] = 'opteron'
347 | 
348 | 
349 |     assert df[df['Architecture'].isna()].CPUName.unique().shape[0] == 59, "More than 59 unique unknown architectures found!"
350 | 
351 |     return df
352 | 
353 | # This function was based on the Intel HTML files. But it has still over 50% missing ...
354 | def make_tdp_old(df_original):
355 |     df = df_original.copy()
356 | 
357 |     ## Import David Mytton's arch data into a dataframe
358 |     amd = pd.read_csv("../data/cpu_spec_sheets/amd.csv")
359 | 
360 |     df["TDP"] = None
361 | 
362 |     clean_amd_names = df[df.CPUMake == "amd"].CPUName.str.replace(r"xeon|opteron|core", "", regex=True) # normalize
363 |     amd['models_clean'] = amd.Model.str.replace("™", "").str.replace("AMD ", "").str.replace(r"\s*","", regex=True).str.lower() # normalize
364 | 
365 |     # amd['models_clean'].value_counts() # array is NOT unique. but the non-unique are currently no problem
366 |     # we assert for that in the loop
367 | 
368 |     for i, clean_amd_name in clean_amd_names.iteritems(): # be vary not to use enumerate() as you will not get the real index, but a re-keyed list
369 |         matching_processors = amd[amd.models_clean == clean_amd_name]
370 |         assert matching_processors.shape[0] < 2, f"Found more than one processor to match with TDP for {clean_amd_name}"
371 |         if not matching_processors.empty:
372 |             df.loc[i, "TDP"] = matching_processors["Default TDP"].str.replace('W', "").iloc[0]
373 | 
374 | 
375 |     df.loc[df.TDP == "155/170", "TDP"] = 170 # Correct for AMD unclear spec to upper bound
376 | 
377 | 
378 |     intel_files = glob.glob("./../data/cpu_spec_sheets/*.html")
379 |     intel_tdps = pd.DataFrame(columns=['Processor Number', 'TDP'])
380 | 
381 | 
382 |     for f in intel_files:
383 |         tables = pd.read_html(f)
384 |         assert len(tables) == 1, f"More than one table ({len(tables)}) in Intel ARK download file: {f}"
385 | 
386 |         table = tables[0]
387 |         tdp_columns = table[table.iloc[:,0] == 'TDP'].index.values
388 |         processor_number_columns = table[table.iloc[:,0] == 'Processor Number'].index.values
389 |         assert len(tdp_columns) == 1, f"More than one column ({len(tdp_columns)}) for TDP in Intel ARK download file: {f}"
390 |         assert len(processor_number_columns) == 1, f"More than one column ({len(processor_number_columns)}) for Processor Number in Intel ARK download file: {f}"
391 | 
392 |         # now it is ok to transpose
393 |         tp = table.transpose()
394 |         tp.columns = tp.iloc[0]
395 |         tp = tp.drop(tp.index[0], axis=0)
396 | 
397 | 
398 |         # manually looked up on Intel.Ark.com
399 |         tp.loc[tp['Processor Number'] == 'W-11855M', 'TDP'] = 45
400 |         tp.loc[tp['Processor Number'] == 'W-11865MRE', 'TDP'] = 45
401 |         tp.loc[tp['Processor Number'] == 'W-11555MRE', 'TDP'] = 45
402 |         tp.loc[tp['Processor Number'] == 'W-11155MRE', 'TDP'] = 45
403 |         tp.loc[tp['Processor Number'] == 'W-11955M', 'TDP'] = 45
404 |         tp.loc[tp['Processor Number'] == 'W-11855M', 'TDP'] = 45
405 |         tp.loc[tp['Processor Number'] == 'W-11865MRE', 'TDP'] = 45
406 | 
407 | 
408 |         assert tp.TDP.isna().sum() == 0, f"TDP was not null for following models: {tp.loc[tp.TDP.isna(),'Processor Number']}"
409 | 
410 | 
411 |         assert (tp.loc[:, ["Processor Number", "TDP"]].groupby("Processor Number").nunique() == 1).all().all(), "Found conflicting info for processor type and TDP"
412 | 
413 |         print(tp.loc[:, ['Processor Number', 'TDP']].shape)
414 |         intel_tdps = intel_tdps.append(tp.loc[:, ['Processor Number', 'TDP']], ignore_index=True)
415 | 
416 |     intel_tdps['Processor Number'] = intel_tdps['Processor Number'].str.replace(r"\s*", "", regex=True).str.lower()
417 |     intel_tdps['TDP'] = intel_tdps['TDP'].str.replace("W", "").str.replace(r"\s*", "", regex=True)
418 | 
419 | 
420 |     intel_tdps['Processor Number'].value_counts() # NOT unique. But we have no TDP Overlap. So we make it unique
421 |     intel_tdps = intel_tdps.drop_duplicates(subset=["Processor Number"])
422 | 
423 | 
424 |     clean_intel_names = df[df.CPUMake == "intel"].CPUName.str.replace(r"xeon|opteron|core", "", regex=True) # normalize
425 | 
426 |     for i, clean_intel_name in clean_intel_names.iteritems(): # be vary not to use enumerate() as you will not get the real index, but a re-keyed list
427 |         matching_processors = intel_tdps[intel_tdps['Processor Number'] == clean_intel_name]
428 |         if not matching_processors.empty:
429 |             df.loc[i, "TDP"] = matching_processors.TDP.iloc[0]
430 | 
431 |     df[df.TDP.isna()]
432 | 
433 |     return df
434 | 
435 | def make_tdp_and_architecture(df_original):
436 |     df = df_original.copy()
437 | 
438 |     cpus = pd.DataFrame(columns=["ModelNumber", "TDP", "Architecture"])
439 | 
440 |     urls = {
441 |         "opteron" : "https://en.wikipedia.org/wiki/List_of_AMD_Opteron_processors",
442 |         "core" : "https://en.wikipedia.org/wiki/List_of_Intel_Xeon_processors_(Core-based)",
443 |         "nehalem" : "https://en.wikipedia.org/wiki/List_of_Intel_Xeon_processors_(Nehalem-based)",
444 |         "sandybridge" : "https://en.wikipedia.org/wiki/List_of_Intel_Xeon_processors_(Sandy_Bridge-based)",
445 |         "ivybridge" : "https://en.wikipedia.org/wiki/List_of_Intel_Xeon_processors_(Ivy_Bridge-based)",
446 |         "haswell" : "https://en.wikipedia.org/wiki/List_of_Intel_Xeon_processors_(Haswell-based)",
447 |         "broadwell" : "https://en.wikipedia.org/wiki/List_of_Intel_Xeon_processors_(Broadwell-based)",
448 |         "skylake" :"https://en.wikipedia.org/wiki/List_of_Intel_Xeon_processors_(Skylake-based)",
449 |         "kabylabe" : "https://en.wikipedia.org/wiki/List_of_Intel_Xeon_processors_(Kaby_Lake-based)",
450 |         "coffeelake" : "https://en.wikipedia.org/wiki/List_of_Intel_Xeon_processors_(Coffee_Lake-based)",
451 |         "cascadelake" : "https://en.wikipedia.org/wiki/List_of_Intel_Xeon_processors_(Cascade_Lake-based)",
452 |         "cometlake" : "https://en.wikipedia.org/wiki/List_of_Intel_Xeon_processors_(Comet_Lake-based)",
453 |         "icelake" : "https://en.wikipedia.org/wiki/List_of_Intel_Xeon_processors_(Ice_Lake-based)",
454 |         "rocketlake" : "https://en.wikipedia.org/wiki/List_of_Intel_Xeon_processors_(Rocket_Lake-based)",
455 |         "tigerlake" : "https://en.wikipedia.org/wiki/Tiger_Lake#List_of_Tiger_Lake_CPUs",
456 |         "epyc" : "https://en.wikipedia.org/wiki/Epyc",
457 |         "cooperlake" : "https://en.wikipedia.org/wiki/Cooper_Lake_(microprocessor)#List_of_Cooper_Lake_processors",
458 |         "netburst" : "https://en.wikipedia.org/wiki/List_of_Intel_Xeon_processors_(NetBurst-based)",
459 |     }
460 | 
461 |     for architecture in urls.keys():
462 |         print(architecture)
463 |         tables = pd.read_html(urls[architecture])
464 | 
465 |         for table in tables:
466 | 
467 |             # normalize different header names
468 |             table = table.rename({'Model number': 'ModelNumber', 'TDP(W)': 'TDP'}, axis=1)
469 |             table = table.rename({'Modelnumber': 'ModelNumber', 'TDP (W)': 'TDP'}, axis=1)
470 |             table = table.rename({'Model': 'ModelNumber'}, axis=1)
471 |             table = table.rename({'Modelnumber[7]': 'ModelNumber'}, axis=1)
472 | 
473 |             table["Architecture"] = architecture # Add static column
474 | 
475 |             if 'ModelNumber' in table.columns:
476 |                 table.columns = table.columns.get_level_values(0)
477 |                 # print("Found Table", table.iloc[0], table.columns)
478 |                 cpus = pd.concat([table.loc[:, ["ModelNumber", "TDP", "Architecture"]], cpus], ignore_index=True, axis=0, join="outer")
479 | 
480 |     # Remove header columns where ModelNumber = TDP
481 |     cpus = cpus.drop(cpus[cpus.ModelNumber == cpus.TDP].index)
482 | 
483 |     cpus_save = cpus.copy()
484 |     cpus = cpus_save.copy()
485 | 
486 | 
487 | 
488 |     cpus[cpus.TDP.astype(str).str.contains(",")] # DEBUG
489 |     cpus[cpus.ModelNumber.astype(str).str.contains(r"\[",regex=True)].to_dict() # DEBUG
490 | 
491 | 
492 |     # Unique replacements without pattern
493 |     cpus.TDP = cpus.TDP.str.replace("^80,20W$", "80", regex=True)
494 |     cpus.TDP = cpus.TDP.str.replace("^\s*80\s*,\s*120\s*W$", "120", regex=True)
495 |     cpus.TDP = cpus.TDP.str.replace("^\s*150\s*,\s*120\s*W$", "150", regex=True)
496 |     cpus.TDP = cpus.TDP.str.replace("^92.6 68$", "92.6", regex=True)
497 | 
498 |     # Replacements with spotted pattern
499 |     cpus.TDP = cpus.TDP.str.replace("^\s*\d+\s*/(\d+)\s*W\s*$", r"\1",regex=True)
500 |     cpus.TDP = cpus.TDP.str.replace("^\s*\d+\s*W\s*(\d+)\s*W\s*$", r"\1",regex=True)
501 |     cpus.TDP = cpus.TDP.str.replace("^\s*\d+\s*-\s*(\d+)\s*W\s*$", r"\1",regex=True)
502 |     cpus.TDP = cpus.TDP.str.replace("^\s*\d+\s*–\s*(\d+)\s*\s*$", r"\1",regex=True) # Note this is a UTF-8 hyphen!
503 | 
504 | 
505 |     #replace weird bracketing, but only for 24 knowns
506 |     assert cpus[cpus.ModelNumber.astype(str).str.contains(r"\[",regex=True)].shape[0] == 24, "More than 24 brackets in ModelNumber. Please manually verify that new ones are also OK"
507 |     cpus.ModelNumber = cpus.ModelNumber.str.replace("\[\d*\]", "",regex=True)
508 | 
509 | 
510 |     # normalizing
511 |     cpus.ModelNumber = cpus.ModelNumber.str.replace("\s*", "",regex=True).str.lower()
512 |     cpus.TDP = cpus.TDP.str.replace("\s*W\s*", "",regex=True, flags=re.IGNORECASE)
513 | 
514 |     unknown_cpus_selector = cpus.TDP.astype(str).str.contains("?", regex=False)
515 |     assert cpus.loc[unknown_cpus_selector].shape[0] == 2, f"More than two unknown processors with ? in TDP found: {cpus.loc[unknown_cpus_selector]}"
516 |     cpus = cpus.drop(cpus[unknown_cpus_selector].index, axis=0)
517 | 
518 | 
519 |     assert cpus.ModelNumber.isna().sum() == 0, "ModelNumber contained more than 0 NAs!"
520 |     assert cpus.TDP.isna().sum() == 10, f"TDPcontained more than 10 NAs: {cpus[cpus.TDP.isna()]}" # 10 NA is ok
521 | 
522 |     # For these 10 the TDP we just don't have ...
523 |     cpus = cpus.dropna()
524 | 
525 | 
526 |     cpus.TDP = cpus.TDP.astype(float)
527 | 
528 | 
529 |     # Fix the known processors where we have confilicting values by setting to the highes
530 |     # TDP value we know
531 |     cpus["TDP"] = cpus.groupby(["ModelNumber"]).transform(max).TDP
532 |     # now remove the duplicates
533 |     cpus = cpus.drop_duplicates(subset=["ModelNumber"])
534 | 
535 |     clean_names = df.CPUName.str.replace(r"opteron", "", regex=True)
536 |     for i, clean_name in clean_names.iteritems():
537 |         if cpus[cpus.ModelNumber == clean_name].empty:
538 |             print("No match for", clean_name)
539 |             continue
540 |         found = cpus[cpus.ModelNumber == clean_name]
541 |         assert len(found) < 2, f"Found multiple architectures: {cpus[cpus.ModelNumber == clean_name]}"
542 | 
543 |         df.loc[i,'TDP'] = found.iloc[0].TDP # Insert TDP in any case
544 | 
545 |         if df.loc[i,'Architecture'] is not None: # Check before overwriting architecture
546 |             if df.loc[i,'Architecture'] in ['epyc-gen3', 'epyc-gen1', 'epyc-gen2']: continue # allow these, since they provide more info
547 |             assert df.loc[i,'Architecture'] == found.iloc[0].Architecture, f"Previous architecture was {df.loc[i,'Architecture']}. New found is: {found.iloc[0].Architecture}"
548 |         else:
549 |             df.loc[i,'Architecture'] = found.iloc[0].Architecture
550 | 
551 | 
552 |     return df
553 | 
554 | 
555 | """ This function is just a helper function which is used during analysing
556 | the SUT_BIOS and SUT_Notes columns.
557 | 
558 | The functionality to create new columns from that knowledge has been coded
559 | into def make_feature_columns()
560 | 
561 | This function should only be used when new data to SPECPower is added
562 | 
563 | """
564 | def helper_for_bios_and_notes(df):
565 |     import numpy as np
566 |     df = pd.read_csv("./../data/spec_data_cleaned_unmelted.csv")
567 | 
568 | 
569 |     mylist = df.SUT_BIOS.tolist()
570 |     mylist += df.SUT_Notes.tolist()
571 |     items = []
572 |     for item in mylist:
573 |         if item is np.NaN: continue
574 |         items.extend(item.split(";;;"))
575 |     item_series = pd.Series(items).value_counts()
576 |     pd.set_option('display.max_rows', None)
577 | 
578 |     item_series.index.nunique() # 1243 items ... this is too much too inspect
579 | 
580 | 
581 |     # manual visual checking beforhand
582 |     # We limit down to the ones with more than 5 occurences and manually try to find interesting ones
583 |     item_series[item_series.values > 5].index.tolist()
584 | 
585 |     # Back from manual checking.
586 |     # We found very many interesting settings that deserve the creation of
587 |     # a dummy variable.
588 | 
589 | 
590 |     #'Memory Frequency set to DDR3-1333MHz.',
591 |     #'Set "DDR Performance = Power balanced" in BIOS.',
592 |     #'DRAM Controller Power Down: Enabled',
593 |     #'DRAM Power Down Enable = Enabled',
594 |     #'Memory clock speed = 1333 MHz',
595 |     #'SMEE = Disabled', -> Secure Memory Encryptoion Enable
596 |     item_series[item_series.index.str.match(r".*(memory|ddr|dram|smee)", case=False)].shape
597 | 
598 | 
599 |     #'Intel HT Technology -enabled (default).',
600 |     #'Hyper Threading Technology: Enabled',
601 |     #'HT Technology: enabled',
602 |     a = item_series[item_series.index.str.match(r".*(Intel Hyper|Intel HT|Hyper.*Thread|HT Tech)[^,.<;]*enabled", case=False)]
603 |     b = item_series[item_series.index.str.match(r".*(Intel Hyper|Intel HT|Hyper.*Thread|HT Tech)[^,.<;]*disabled", case=False)]
604 |     c = item_series[item_series.index.str.match(r".*(Intel Hyper|Intel HT|Hyper.*Thread|HT Tech)", case=False)] #unknown status
605 |     c_new = set(c.index)
606 |     assert c_new == (set(a.index).union(set(b.index))), f"Set contained differences: {c_new.difference(set(a.index).union(set(b.index)))}"
607 | 
608 | 
609 | 
610 |     #'Intel Virtualization (Intel VT): Disabled',
611 |     #'SVM disabled' -> AMD equivalent
612 |     #'VT-x: Disabled'
613 |     a = item_series[item_series.index.str.match(r".*(AMD-V|SVM|VT|VT-x|Virtualization)[\s:=]*(Technology)?\s*enable", case=False)]
614 |     b = item_series[item_series.index.str.match(r".*(AMD-V|SVM|VT|VT-x|Virtualization)[\s:=]*(Technology)?\s*disable", case=False)]
615 |     c = item_series[item_series.index.str.match(r".*(AMD-V|SVM|VT|VT-x|Virtualization)", case=False)]
616 |     c_new = set(c.index)
617 |     c_new.remove("Memory Patrol Scrubbing: DisabledVirtualization Options") # This one is known bogus
618 | 
619 |     assert c_new == (set(a.index).union(set(b.index))), f"Set contained differences: {c_new.difference(set(a.index).union(set(b.index)))}"
620 | 
621 | 
622 |     'Intel Turbo Boost enabled<C1E enabled<C States enabled',
623 |     'Set &quot;Intel Turbo Boost Technology = Disabled&quot; in BIOS.',
624 |     'Set "Turbo = Disabled" in BIOS.',
625 |     'Turbo Boost was disabled in BIOS.',
626 |     'AMD Fmax Boost Limit Control: Manual',
627 |     'Core Performance Boost = Disabled',
628 |     'Turbo mode Disabled.',
629 |     'AMD Core Performance Boost: Disabled',
630 | 
631 | 
632 |     a = item_series[item_series.index.str.match(r".*(Turbo\s*Boost)[^,.<;]*enabled", case=False)]
633 | 
634 |     b = item_series[item_series.index.str.match(r".*(Turbo\s*Boost)[^,.<;]*(disabled|disable$)", case=False)]
635 |     b2 = item_series[item_series.index.str.match(r".*(Manual Boost FMax set to 2200)", case=False)]
636 |     b3 = item_series[item_series.index.str.match(r".*disable.*boost", case=False)]
637 |     b4 = item_series[item_series.index.str.match(r".*boost.*limit", case=False)]
638 |     b5 = item_series[item_series.index.str.match(r".*boost fmax set to manual", case=False)]
639 |     b6 = item_series[item_series.index.str.match(r"Core Performance Boost: Disable</UL>", case=False)]
640 |     b7 = item_series[item_series.index.str.match(r"Set &quot;Intel Turbo Boost Technology = Disable&quot; in BIOS.", case=False)]
641 |     b8 = item_series[item_series.index.str.match(r".*(Turbo Mode)[^,.<;]*disabled", case=False)]
642 |     b9 = item_series[item_series.index.str.match(r".*Disable(d)? \"?Turbo", case=False)]
643 |     b10 = item_series[item_series.index.str.match(r"&quot;Turbo Mode&quot; was disabled in BIOS.", case=False)]
644 |     b11 = item_series[item_series.index.str.match(r"Turned off &quot;TURBO mode&quot; in BIOS.", case=False)]
645 | 
646 |     b = b.append(b2).append(b3).append(b4).append(b5).append(b6).append(b7).append(b8).append(b9).append(b10).append(b11)
647 |     c = item_series[item_series.index.str.match(r".*(Turbo\s*Boost|Turbo Mode)", case=False)] #unknown status
648 |     c_new = set(c.index)
649 |     c_new.discard("Turbo Boost Technology ") # This one is known bogus
650 | 
651 |     assert c_new == (set(a.index).union(set(b.index))), f"Set contained differences: {c_new.difference(set(a.index).union(set(b.index)))}"
652 | 
653 | 
654 |     'Package C State: No Limit',
655 |     'C-State Efficiency Mode: Enabled',
656 |     'Data Fabric C-State Enable: Force Enabled',
657 |     'C States enabled',
658 |     'Enhanced Halt State'
659 |     'C\d+'
660 |     'Package C State limit set to No Limit.'
661 |     a = item_series[item_series.index.str.match(r".*(C.State)[^,.<;]*(enabled|enable[\".\s]*|no limit|auto|c3|c6)", case=False)]
662 |     a2 = item_series[item_series.index.str.match(r"Enable \"*CPU C-states\"*", case=False)]
663 |     a = a.append(a2)
664 | 
665 |     b = item_series[item_series.index.str.match(r".*(C.State)[^,.<;]*disabled", case=False)]
666 | 
667 |     c = item_series[item_series.index.str.match(r".*(C6)", case=False)] #unknown status
668 |     c_new = set(c.index)
669 | 
670 |     assert c_new == (set(a.index).union(set(b.index))), f"Set contained differences: {c_new.difference(set(a.index).union(set(b.index)))}"
671 | 
672 | 
673 |     # P-states are a power feature. P-State = 1 is the base frequency
674 |     # Setting P-states to off will set P-State to max non-turbo (aka 1) (https://www.thomas-krenn.com/en/wiki/Disable_CPU_Power_Saving_Management_in_BIOS)
675 |     # All P-States greater than 1 are power efficient states: https://www.thomas-krenn.com/en/wiki/Processor_P-states_and_C-states
676 |     "SOC P-states: P3"
677 |     item_series[item_series.index.str.match(r".*(P.State)[^,.<;]*(P2|P3)", case=False)]
678 | 
679 | 
680 | 
681 | 
682 |      #"Hardware Prefetcher = Disabled"
683 |      #Adjacent Cache Line Prefetch
684 |      #MLC Spatial Prefetcher
685 |      #Spatial prefetch
686 |      #MLC Streamer
687 |      #Data Reuse Prefetcher
688 |      #Adjacent Cache Line Prefetch
689 |      #DCU Prefetcher
690 |      #L1 Stream HW Prefetcher
691 |      #Adjacent Sector Prefetch = Disable
692 | 
693 |     a = item_series[item_series.index.str.match(r".*prefetch(er)?s?( was)?( set to)?[\"'=\s:-]*disable", case=False)]
694 |     b = item_series[item_series.index.str.match(r".*prefetch(er)?s?( was)?( set to)?[\"'=\s:-]*enable", case=False)]
695 | 
696 | 
697 |     c = item_series[item_series.index.str.match(r".*prefetch", case=False)]
698 |     c_new = set(c.index)
699 |     c_new.remove('Disable L1 Cache Stream Prefetchers')
700 |     c_new.remove('HW Prefetcher')
701 |     c_new.remove('DCU Stream Prefetcher')
702 |     c_new.remove('Disable L2 Cache Stream Prefetchers')
703 |     c_new.remove('Disable Processor Data Prefetch')
704 |     c_new.remove('Adjacent Sector Prefetch')
705 |     c_new.remove('Disabled Adjacent Cache Line Prefetch in BIOS.')
706 |     c_new.remove('LLC Prefetcher''Disabled DCU Streamer Prefetcher in BIOS.')
707 |     c_new.remove('XPT Prefetcher')
708 |     c_new.remove('Disabled Hardware Prefetcher in BIOS.')
709 |     c_new.remove('DCU IP Prefetcher')
710 |     c_new.remove('Sub-NUMA Clustering: EnabledProcessor Prefetcher Options')
711 |     c_new.remove('Disable "Processor Data Prefetch"')
712 |     c_new.remove('LLC Prefetcher')
713 |     c_new.remove('Disabled DCU Streamer Prefetcher in BIOS.')
714 | 
715 |     assert c_new == (set(a.index).union(set(b.index))), f"Set contained differences: {c_new.difference(set(a.index).union(set(b.index)))}"
716 | 
717 | 
718 | def make_bios_features(df_original):
719 | 
720 |     df = df_original.copy()
721 | 
722 |     df.loc[df.SUT_BIOS.isna(), "SUT_BIOS"] = "--" # Move none to empty string so we can make operations on them
723 |     df.loc[df.SUT_Notes.isna(), "SUT_Notes"] = "--" # Move none to empty string so we can make operations on them
724 | 
725 | 
726 |     df["BIOS_Memory_Setting_Changed"] = None
727 |     df.loc[df.SUT_Notes.str.match(r".*(dram|memory|ddr)\s*frequency", case=False), "BIOS_Memory_Setting_Changed"] = True
728 | 
729 |     df.loc[df.SUT_BIOS.str.match(r".*(memory|ddr|dram|smee)", case=False), "BIOS_Memory_Setting_Changed"] = True
730 |     # There is no category for "False" since no infos are give like "memory timings default". So we resort to None as fallback
731 |     df["BIOS_Memory_Setting_Changed"].count()
732 | 
733 | 
734 |     df["BIOS_HT_Enabled"] = None
735 |     df.loc[df.SUT_Notes.str.match(r".*(Intel Hyper|Intel HT|Hyper.*Thread|HT Tech)[^,.<;]*enabled", case=False), "BIOS_HT_Enabled"] = True
736 |     df.loc[df.SUT_Notes.str.match(r".*(Intel Hyper|Intel HT|Hyper.*Thread|HT Tech)[^,.<;]*disabled", case=False), "BIOS_HT_Enabled"] = False
737 | 
738 |     df.loc[df.SUT_BIOS.str.match(r".*(Intel Hyper|Intel HT|Hyper.*Thread|HT Tech)[^,.<;]*enabled", case=False), "BIOS_HT_Enabled"] = True
739 |     df.loc[df.SUT_BIOS.str.match(r".*(Intel Hyper|Intel HT|Hyper.*Thread|HT Tech)[^,.<;]*disabled", case=False), "BIOS_HT_Enabled"] = False
740 |     df["BIOS_HT_Enabled"].count()
741 | 
742 | 
743 |     df["BIOS_VT_Enabled"] = None
744 |     df.loc[df.SUT_Notes.str.match(r".*(AMD-V|SVM|VT|VT-x|Virtualization)[\s:=]*(Technology)?\s*enable", case=False), "BIOS_VT_Enabled"] = True
745 |     df.loc[df.SUT_Notes.str.match(r".*(AMD-V|SVM|VT|VT-x|Virtualization)[\s:=]*(Technology)?\s*disable", case=False), "BIOS_VT_Enabled"] = False
746 | 
747 |     df.loc[df.SUT_BIOS.str.match(r".*(AMD-V|SVM|VT|VT-x|Virtualization)[\s:=]*(Technology)?\s*enable", case=False), "BIOS_VT_Enabled"] = True
748 |     df.loc[df.SUT_BIOS.str.match(r".*(AMD-V|SVM|VT|VT-x|Virtualization)[\s:=]*(Technology)?\s*disable", case=False), "BIOS_VT_Enabled"] = False
749 |     df["BIOS_VT_Enabled"].count()
750 | 
751 | 
752 |     df["BIOS_Turbo_Boost_Enabled"] = None
753 |     df.loc[df.SUT_Notes.str.match(r".*(Turbo\s*Boost)[^,.<;]*enabled", case=False), "BIOS_Turbo_Boost_Enabled"] = True
754 |     df.loc[df.SUT_Notes.str.match(r".*(Turbo\s*Boost)[^,.<;]*(disabled|disable$)", case=False), "BIOS_Turbo_Boost_Enabled"] = False
755 |     df.loc[df.SUT_Notes.str.match(r".*(Manual Boost FMax set to 2200)", case=False), "BIOS_Turbo_Boost_Enabled"] = False
756 |     df.loc[df.SUT_Notes.str.match(r".*disable.*boost", case=False), "BIOS_Turbo_Boost_Enabled"] = False
757 |     df.loc[df.SUT_Notes.str.match(r".*boost.*limit", case=False), "BIOS_Turbo_Boost_Enabled"] = False
758 |     df.loc[df.SUT_Notes.str.match(r".*boost fmax set to manual", case=False), "BIOS_Turbo_Boost_Enabled"] = False
759 |     df.loc[df.SUT_Notes.str.match(r"Core Performance Boost: Disable</UL>", case=False), "BIOS_Turbo_Boost_Enabled"] = False
760 |     df.loc[df.SUT_Notes.str.match(r"Set &quot;Intel Turbo Boost Technology = Disable&quot; in BIOS.", case=False), "BIOS_Turbo_Boost_Enabled"] = False
761 |     df.loc[df.SUT_Notes.str.match(r".*(Turbo Mode)[^,.<;]*disabled", case=False), "BIOS_Turbo_Boost_Enabled"] = False
762 |     df.loc[df.SUT_Notes.str.match(r".*Disable(d)? \"?Turbo", case=False), "BIOS_Turbo_Boost_Enabled"] = False
763 |     df.loc[df.SUT_Notes.str.match(r"&quot;Turbo Mode&quot; was disabled in BIOS.", case=False), "BIOS_Turbo_Boost_Enabled"] = False
764 |     df.loc[df.SUT_Notes.str.match(r"Turned off &quot;TURBO mode&quot; in BIOS.", case=False), "BIOS_Turbo_Boost_Enabled"] = False
765 | 
766 |     df.loc[df.SUT_BIOS.str.match(r".*(Turbo\s*Boost)[^,.<;]*enabled", case=False), "BIOS_Turbo_Boost_Enabled"] = True
767 |     df.loc[df.SUT_BIOS.str.match(r".*(Turbo\s*Boost)[^,.<;]*(disabled|disable$)", case=False), "BIOS_Turbo_Boost_Enabled"] = False
768 |     df.loc[df.SUT_BIOS.str.match(r".*(Manual Boost FMax set to 2200)", case=False), "BIOS_Turbo_Boost_Enabled"] = False
769 |     df.loc[df.SUT_BIOS.str.match(r".*disable.*boost", case=False), "BIOS_Turbo_Boost_Enabled"] = False
770 |     df.loc[df.SUT_BIOS.str.match(r".*boost.*limit", case=False), "BIOS_Turbo_Boost_Enabled"] = False
771 |     df.loc[df.SUT_BIOS.str.match(r".*boost fmax set to manual", case=False), "BIOS_Turbo_Boost_Enabled"] = False
772 |     df.loc[df.SUT_BIOS.str.match(r"Core Performance Boost: Disable</UL>", case=False), "BIOS_Turbo_Boost_Enabled"] = False
773 |     df.loc[df.SUT_BIOS.str.match(r"Set &quot;Intel Turbo Boost Technology = Disable&quot; in BIOS.", case=False), "BIOS_Turbo_Boost_Enabled"] = False
774 |     df.loc[df.SUT_BIOS.str.match(r".*(Turbo Mode)[^,.<;]*disabled", case=False), "BIOS_Turbo_Boost_Enabled"] = False
775 |     df.loc[df.SUT_BIOS.str.match(r".*Disable(d)? \"?Turbo", case=False), "BIOS_Turbo_Boost_Enabled"] = False
776 |     df.loc[df.SUT_BIOS.str.match(r"&quot;Turbo Mode&quot; was disabled in BIOS.", case=False), "BIOS_Turbo_Boost_Enabled"] = False
777 |     df.loc[df.SUT_BIOS.str.match(r"Turned off &quot;TURBO mode&quot; in BIOS.", case=False), "BIOS_Turbo_Boost_Enabled"] = False
778 |     df["BIOS_Turbo_Boost_Enabled"].count()
779 | 
780 | 
781 |     df["BIOS_C_States_Enabled"] = None
782 |     df.loc[df.SUT_Notes.str.match(r".*(C.State)[^,.<;]*(enabled|enable[\".\s]*|no limit|auto|c3|c6)", case=False), "BIOS_C_States_Enabled"] = True
783 |     df.loc[df.SUT_Notes.str.match(r"Enable \"*CPU C-states\"*", case=False), "BIOS_C_States_Enabled"] = True
784 |     df.loc[df.SUT_Notes.str.match(r".*(C.State)[^,.<;]*disabled", case=False), "BIOS_C_States_Enabled"] = False
785 | 
786 |     df.loc[df.SUT_BIOS.str.match(r".*(C.State)[^,.<;]*(enabled|enable[\".\s]*|no limit|auto|c3|c6)", case=False), "BIOS_C_States_Enabled"] = True
787 |     df.loc[df.SUT_BIOS.str.match(r"Enable \"*CPU C-states\"*", case=False), "BIOS_C_States_Enabled"] = True
788 |     df.loc[df.SUT_BIOS.str.match(r".*(C.State)[^,.<;]*disabled", case=False), "BIOS_C_States_Enabled"] = False
789 |     df["BIOS_C_States_Enabled"].count()
790 | 
791 | 
792 |     df["BIOS_P_States_Enabled"] = None
793 |     df.loc[df.SUT_Notes.str.match(r".*(P.State)[^,.<;]*(P2|P3)", case=False), "BIOS_P_States_Enabled"] = True
794 |     df.loc[df.SUT_BIOS.str.match(r".*(P.State)[^,.<;]*(P2|P3)", case=False), "BIOS_P_States_Enabled"] = True
795 |     # There is no category for "False" since no infos are give like "P-States default". So we resort to None as fallback as we do not know the status
796 |     # Most likey it is on, since this is the default. But also the CPU might not support it ...
797 |     df["BIOS_P_States_Enabled"].count()
798 | 
799 | 
800 |     df["BIOS_Prefetchers_Enabled"] = None
801 |     df.loc[df.SUT_Notes.str.match(r".*prefetch(er)?s?( was)?( set to)?[\"'=\s:-]*enable", case=False), "BIOS_Prefetchers_Enabled"] = True
802 |     df.loc[df.SUT_Notes.str.match(r".*prefetch(er)?s?( was)?( set to)?[\"'=\s:-]*disable", case=False), "BIOS_Prefetchers_Enabled"] = False
803 |     df.loc[df.SUT_Notes.str.match(r".*Disable L1 Cache Stream Prefetchers", case=False), "BIOS_Prefetchers_Enabled"] = False
804 |     df.loc[df.SUT_Notes.str.match(r".*Disable L2 Cache Stream Prefetchers", case=False), "BIOS_Prefetchers_Enabled"] = False
805 |     df.loc[df.SUT_Notes.str.match(r".*Disable Processor Data Prefetch", case=False), "BIOS_Prefetchers_Enabled"] = False
806 |     df.loc[df.SUT_Notes.str.match(r".*Disabled Adjacent Cache Line Prefetch in BIOS.", case=False), "BIOS_Prefetchers_Enabled"] = False
807 |     df.loc[df.SUT_Notes.str.match(r".*Disabled DCU Streamer Prefetcher in BIOS.", case=False), "BIOS_Prefetchers_Enabled"] = False
808 |     df.loc[df.SUT_Notes.str.match(r".*Disabled Hardware Prefetcher in BIOS.", case=False), "BIOS_Prefetchers_Enabled"] = False
809 |     df.loc[df.SUT_Notes.str.match(r".*Disable \"Processor Data Prefetch\"", case=False), "BIOS_Prefetchers_Enabled"] = False
810 |     df.loc[df.SUT_Notes.str.match(r".*Disabled DCU Streamer Prefetcher in BIOS.", case=False), "BIOS_Prefetchers_Enabled"] = False
811 | 
812 |     df.loc[df.SUT_BIOS.str.match(r".*prefetch(er)?s?( was)?( set to)?[\"'=\s:-]*enable", case=False), "BIOS_Prefetchers_Enabled"] = True
813 |     df.loc[df.SUT_BIOS.str.match(r".*prefetch(er)?s?( was)?( set to)?[\"'=\s:-]*disable", case=False), "BIOS_Prefetchers_Enabled"] = False
814 |     df.loc[df.SUT_BIOS.str.match(r".*Disable L1 Cache Stream Prefetchers", case=False), "BIOS_Prefetchers_Enabled"] = False
815 |     df.loc[df.SUT_BIOS.str.match(r".*Disable L2 Cache Stream Prefetchers", case=False), "BIOS_Prefetchers_Enabled"] = False
816 |     df.loc[df.SUT_BIOS.str.match(r".*Disable Processor Data Prefetch", case=False), "BIOS_Prefetchers_Enabled"] = False
817 |     df.loc[df.SUT_BIOS.str.match(r".*Disabled Adjacent Cache Line Prefetch in BIOS.", case=False), "BIOS_Prefetchers_Enabled"] = False
818 |     df.loc[df.SUT_BIOS.str.match(r".*Disabled DCU Streamer Prefetcher in BIOS.", case=False), "BIOS_Prefetchers_Enabled"] = False
819 |     df.loc[df.SUT_BIOS.str.match(r".*Disabled Hardware Prefetcher in BIOS.", case=False), "BIOS_Prefetchers_Enabled"] = False
820 |     df.loc[df.SUT_BIOS.str.match(r".*Disable \"Processor Data Prefetch\"", case=False), "BIOS_Prefetchers_Enabled"] = False
821 |     df.loc[df.SUT_BIOS.str.match(r".*Disabled DCU Streamer Prefetcher in BIOS.", case=False), "BIOS_Prefetchers_Enabled"] = False
822 |     df["BIOS_Prefetchers_Enabled"].count()
823 | 
824 |     return df
825 | 
826 | def main():
827 |     pd.set_option("display.max_rows", 100)
828 |     pd.set_option("display.max_columns", 20)
829 |     pd.set_option('display.max_colwidth', None)
830 | 
831 |     df = pd.read_csv("./../data/spec_data.csv", sep="|", index_col=False, na_values=["None"])
832 | 
833 |     # Hashing cause we want to identify columns later on based on initial uniqueness
834 |     df['hash'] = pd.util.hash_pandas_object(df)
835 |     #assert(df.hash.nunique() == df.shape[0]) # no duplicate hashes
836 | 
837 |     ## Cleaning
838 | 
839 |     helper.visual_check(df.dtypes.to_dict(), "Are all data types ok?")
840 | 
841 |     df = remove_unneeded_columns(df)
842 | 
843 |     df = split_hardware_availabilty(df)
844 | 
845 |     df = create_cpu_make(df)
846 | 
847 |     df = create_cpu_name(df)
848 | 
849 |     df = create_turbo_boost(df)
850 | 
851 |     df = make_cpu_cores(df)
852 |     df = make_cpu_chips(df)
853 | 
854 |     df = make_hardware_threads(df)
855 | 
856 |     df = split_psu(df)
857 | 
858 |     df = make_cpu_family(df)
859 | 
860 |     df = make_l2_cache(df)
861 | 
862 |     df = make_l3_cache(df)
863 | 
864 |     df = make_architecture_old(df)
865 | 
866 |     df = make_tdp_and_architecture(df)
867 | 
868 |     df = make_bios_features(df)
869 | 
870 |     df["AvgPower"] = df.loc[:,['100_AvgPower', '90_AvgPower', '80_AvgPower', '70_AvgPower',
871 |            '60_AvgPower', '50_AvgPower', '40_AvgPower', '30_AvgPower',
872 |            '20_AvgPower', '10_AvgPower', 'ActiveIdle']].mean(axis=1)
873 | 
874 |     df.to_csv("./../data/spec_data_cleaned_unmelted.csv")
875 | 
876 | 
877 |     df = df.drop("AvgPower", axis=1)
878 | 
879 |     df = melt_power_and_load(df) # spread columns to rows
880 |     df = clean_power_and_load(df) # move 100_AvgPower => 100 as int
881 | 
882 |     df.to_csv("./../data/spec_data_cleaned.csv")
883 | 
884 | 
885 |     '''
886 |     ## Now do the same, but with HW_CPUChars column
887 |     x = df.loc[df['HW_CPUChars'].str.match(".*\(.*Boost.*\)"), 'HW_CPUChars']
888 |     y = df.loc[df['HW_CPUChars'].str.match(".*\(.*up to (.*)GHz.*"), 'HW_CPUChars']
889 |     ## What's different?
890 |     pd.concat([x,y]).drop_duplicates(keep=False)
891 |     #A: (Max Boost Clock up to 3.5 GHz)
892 |     #df.iloc[445]
893 |     # I googled that CPU, its the same thing as Turbo Boost
894 | 
895 | 
896 |     ### Merge (actually fillna()) the two TurboBoostGHz columns
897 |     turbo_from_chars = df['HW_CPUChars'].str.extract(r'.*\(.*up to (?P<TurboBoostGHz>.*)GHz.*')
898 |     turbo_from_chars.count()      # 91
899 |     df['TurboBoostGHz'].count()   # 41
900 |     df['TurboBoostGHz'] = df['TurboBoostGHz'].fillna(turbo_from_chars['TurboBoostGHz'])
901 |     df['TurboBoostGHz'].count()  # 132, looks good considering duplicates
902 | 
903 |     # Strip TurboBoost from CPU Chars
904 |     df['HW_CPUChars'] = df['HW_CPUChars'].str.replace("\(.*up to (.*)GHz.*\)", "", regex=True)
905 | 
906 |     ## Make sure type is correct
907 |     df["TurboBoostGHz"] = df["TurboBoostGHz"].astype(float)
908 |     df.dtypes.to_dict()
909 | 
910 |     # HW_CPUName
911 |     # @ X.X GHz -> this should be able to be safely removed, as the info already
912 |     # exists in the HW_CPUFreq column
913 | 
914 |     df.loc[df['HW_CPUName'].str.match(".*\d+\.\d*\s*GHz.*", case=False), 'HW_CPUName']
915 |     df.loc[df['HW_CPUName'].str.match(".*\d+\.\d*\s*GHz.*", case=False), 'HW_CPUName'].count()
916 |     df.loc[df['HW_CPUName'].str.match(".*\d+\.\d*\s*GHz.*", case=False), 'HW_CPUName'].unique()
917 | 
918 | 
919 |     ## Compare the freq extract vs the freq in column, make sure they're ==
920 |     freq_from_name = df.HW_CPUName.str.extract(".*(\d+\.\d*)\s*GHz.*", expand = False, flags=re.IGNORECASE).dropna()
921 |     freq_from_column = df.loc[freq_from_name.index]
922 |     freq_from_name = freq_from_name.astype('float') * 1000
923 |     freq_from_column.HW_CPUFreq = freq_from_column.HW_CPUFreq.astype('float')
924 |     ### What's a simple way to see everywhere this fails?
925 |     #x = freq_from_column.loc[~freq_from_name.eq(freq_from_column.HW_CPUFreq)]
926 |     #df.loc[x.index, x.hash] == df.hash
927 |     #assert(freq_from_name.eq(freq_from_column.HW_CPUFreq).all()
928 | 
929 |     #z = freq_from_name.eq(freq_from_column) ### Line 484 is false
930 |     # 484 in freq_from_name is 2933
931 |     # 484 in freq_from_column is 2930
932 |     # oh its a rounding error. hmm. what do here. could just drop. not best long
933 |     # thinking though.
934 |     # TODO: for now this is fine, it means we can safely remove these values,
935 |     # but I want to turn this assert back on
936 |     # clean line 484 manually, then use the assert :-)
937 |     # hash the row
938 | 
939 | 
940 | 
941 |     # TODO: HW_Vendor
942 |     # just basic cleaning, standardize how companies are written
943 |     # etc.
944 |     vendors = df.loc[df['HW_Vendor'].str.contains('Hewlett'), 'HW_Vendor'].unique()
945 |     #df.HW_Vendor = df.HW_Vendor.str.replace('HEPYCEPYCewlett\s?-?\s?Packard\s?(Enterprise|Company)', '', regex=True, case=False)
946 |     '''
947 | 
948 | 
949 | 
950 | 
951 | 
952 | 
953 |     # test code
954 |     # a_archs_t = pd.DataFrame({'XeonPeon':['CoolProcessorWon 1234', 'CoolProcessorToo 4321', None, None],'OptimusPrime':[None, None, 'CoolProcessorTree', 'CoolProcessorFore'],})
955 |     # a_df_t = pd.DataFrame({'CPU': ['CoolProcessorWon 1234', 'Totally Not In DataSet','CoolProcessorWon', 'CoolProcessorFore', 'CoolProcessor']})
956 |     # a_t = a_df_t.CPU.isin(a_archs_t.XeonPeon)
957 | 
958 |     ## Disk Drive, size + type (SSD or HDD)
959 | 
960 | 
961 | 
962 | 
963 | 
964 | 
965 | if __name__ == "__main__":
966 |     main()
967 | 


--------------------------------------------------------------------------------
/scripts/include/helper_functions.py:
--------------------------------------------------------------------------------
1 | import sysfrom pprint import PrettyPrinterimport pandas as pddef visual_check(result, question):    PrettyPrinter(indent=4).pprint(result)    print(question, "Then press enter ...")    sys.stdin.readline()def columns_diff(df, df_temp):    print("\nThe following new columns have beed added: ", set(df_temp.columns).difference(set(df.columns)))    print("\nThe following columns have been removed: ", set(df.columns).difference(set(df_temp.columns)))def new_column_diff(df, column, column_new, aggregate=True):    print("\nThe following rows have been altered in place:" )    changes = {}    for key, value in df[column].iteritems():        replace = (str(df.loc[key, column_new]).replace(str(value), ''))        if replace != '':            changes[key] = f"{value} => {replace}"        PrettyPrinter(indent=4).pprint(pd.Series(changes).value_counts().to_dict())def same_column_diff(df, df_temp, column, aggregate=True):    print("\nNew columns have the following individual changes:" )    changes = {}    for key, value in df[column].iteritems():        replace = ( str(df_temp.loc[key,column]).replace(str(value), ''))        if replace != '':            changes[key] = f"{value} => {replace}"        PrettyPrinter(indent=4).pprint(pd.Series(changes).value_counts().to_dict())


--------------------------------------------------------------------------------
/xgb.py:
--------------------------------------------------------------------------------
  1 | # pylint: disable=redefined-outer-name,invalid-name
  2 | 
  3 | import sys
  4 | import os
  5 | import time
  6 | import logging
  7 | import platform
  8 | import pandas as pd
  9 | import numpy as np
 10 | import warnings
 11 | from xgboost import XGBRegressor
 12 | 
 13 | logger = logging.getLogger(__name__)
 14 | logger.addHandler(logging.StreamHandler())
 15 | logger.setLevel(logging.INFO)
 16 | 
 17 | def train_model(cpu_chips, Z):
 18 | 
 19 |     df = pd.read_csv(f"{os.path.dirname(os.path.abspath(__file__))}/data/spec_data_cleaned.csv")
 20 | 
 21 |     X = df.copy()
 22 |     X = pd.get_dummies(X, columns=['CPUMake', 'Architecture'])
 23 | 
 24 |     if cpu_chips:
 25 |         logger.info('Training data will be restricted to the following amount of chips: %d', cpu_chips)
 26 | 
 27 |         X = X[X.CPUChips == cpu_chips] # Fit a model for every amount of CPUChips
 28 | 
 29 |     if X.empty:
 30 |         raise RuntimeError(f"The training data does not contain any servers with a chips amount ({cpu_chips}). Please select a different amount.")
 31 | 
 32 |     y = X.power
 33 | 
 34 |     X = X[Z.columns] # only select the supplied columns from the command line
 35 | 
 36 |     logger.info('Model will be trained on the following columns and restrictions: \n%s', Z)
 37 | 
 38 | #    params = {
 39 | #      'max_depth': 10,
 40 | #      'learning_rate': 0.3037182109676833,
 41 | #      'n_estimators': 792,
 42 | #      'min_child_weight': 1,
 43 | #      'random_state': 762
 44 | #    }
 45 |     params = {} # we see no strong improvements with hyperparamters tuned by optune
 46 | 
 47 |     model = XGBRegressor(**params)
 48 |     model.fit(X,y)
 49 | 
 50 |     return model
 51 | 
 52 | def infer_predictions(model, Z):
 53 | 
 54 |     predictions = {}
 55 | 
 56 |     for i in range(0,110,5):
 57 |         Z['utilization'] = i
 58 |         predictions[float(i)] = model.predict(Z)[0]
 59 |     return predictions
 60 | 
 61 | def interpolate_helper(predictions, lower, upper, step=501):
 62 | 
 63 |     diff = int(upper-lower)
 64 |     diff_value = predictions[upper] - predictions[lower]
 65 | 
 66 |     for i in np.linspace(0, diff, step):
 67 |         predictions[round(lower+i, 2)] = predictions[lower]+((diff_value/diff)*i)
 68 | 
 69 |     return predictions
 70 | 
 71 | def interpolate_predictions(predictions):
 72 |     predictions = interpolate_helper(predictions, 0.0, 5.0, 501)
 73 |     predictions = interpolate_helper(predictions, 5.0, 15.0, 1001)
 74 |     predictions = interpolate_helper(predictions, 15.0, 25.0, 1001)
 75 |     predictions = interpolate_helper(predictions, 25.0, 35.0, 1001)
 76 |     predictions = interpolate_helper(predictions, 35.0, 45.0, 1001)
 77 |     predictions = interpolate_helper(predictions, 45.0, 55.0, 1001)
 78 |     predictions = interpolate_helper(predictions, 55.0, 65.0, 1001)
 79 |     predictions = interpolate_helper(predictions, 65.0, 75.0, 1001)
 80 |     predictions = interpolate_helper(predictions, 75.0, 85.0, 1001)
 81 |     predictions = interpolate_helper(predictions, 85.0, 95.0, 1001)
 82 |     # Question: between 95 and 100 is no difference. How do we extrapolate?
 83 |     predictions = interpolate_helper(predictions, 95.0, 100.0, 501)
 84 | 
 85 |     return predictions
 86 | 
 87 | def set_silent():
 88 |     # sadly some libs have future warnings we need to suppress for
 89 |     # silent mode to work in bash scripts
 90 |     warnings.simplefilter(action='ignore', category=FutureWarning)
 91 |     logger.setLevel(logging.WARNING)
 92 | 
 93 | if __name__ == '__main__':
 94 | 
 95 |     import argparse
 96 | 
 97 |     parser = argparse.ArgumentParser()
 98 | 
 99 |     parser.add_argument('--cpu-chips', type=int, help='Number of CPU chips')
100 |     parser.add_argument('--cpu-freq', type=int, help='CPU frequency')
101 |     parser.add_argument('--cpu-threads', type=int, help='Number of CPU threads')
102 |     parser.add_argument('--cpu-cores', type=int, help='Number of CPU cores')
103 |     parser.add_argument('--release-year', type=int, help='Release year of the CPU')
104 |     parser.add_argument('--tdp', type=int, help='TDP of the CPU')
105 |     parser.add_argument('--ram', type=int, help='Amount of DRAM for the bare metal system')
106 |     parser.add_argument('--architecture', type=str, help='The architecture of the CPU. lowercase. ex.: haswell')
107 |     parser.add_argument('--cpu-make', type=str, help='The make of the CPU (intel or amd)')
108 |     parser.add_argument('--auto', action='store_true', help='Force auto detect. Will overwrite supplied arguments')
109 | 
110 |     parser.add_argument('--vhost-ratio',
111 |         type=float,
112 |         help='Virtualization ratio of the system. Input numbers between (0,1].'
113 | 
114 |     )
115 |     parser.add_argument('--silent',
116 |         action='store_true',
117 |         help='Will suppress all debug output. Typically used in production.'
118 |     )
119 | 
120 |     parser.add_argument('--energy',
121 |         action='store_true',
122 |         help='Switches to energy mode. The output will be in Joules instead of Watts. \
123 |         This is achieved by multiplying the interval between inputs with the estimated wattage'
124 |     )
125 | 
126 |     parser.add_argument('--autoinput', action='store_true', help='Will get the CPU utilization through psutil.')
127 |     parser.add_argument('--interval', type=float, help='Interval in seconds if autoinput is used.', default=1.0)
128 |     parser.add_argument('--dump', action='store_true', help='Dump all predicitions to STDOUT.')
129 |     parser.add_argument('--dump-hashmap', action='store_true', help='Dump all predicitions to STDOUT as bash hashmap.')
130 | 
131 |     args = parser.parse_args()
132 | 
133 |     if args.silent:
134 |         set_silent()
135 | 
136 |     args_dict = args.__dict__.copy()
137 |     del args_dict['silent']
138 |     del args_dict['auto']
139 |     del args_dict['energy']
140 | 
141 |     # did the user supply any of the auto detectable arguments?
142 |     if not any(args_dict.values()) or args.auto:
143 |         logger.info('No arguments where supplied, or auto mode was forced. Running auto detect on the sytem.')
144 | 
145 |         import auto_detect
146 | 
147 |         data = auto_detect.get_cpu_info(logger)
148 | 
149 |         logger.info('The following data was auto detected: %s', data)
150 | 
151 |         # only overwrite not already supplied values
152 |         args.cpu_freq = args.cpu_freq or data['freq']
153 |         args.cpu_threads = args.cpu_threads or data['threads']
154 |         args.cpu_cores = args.cpu_cores or data['cores']
155 |         args.tdp = args.tdp or data['tdp']
156 |         args.ram = args.ram or data['mem']
157 |         args.cpu_make = args.cpu_make or data['make']
158 |         args.cpu_chips = args.cpu_chips or data['chips']
159 | 
160 |     # set default. We do this here and not in argparse, so we can check if anything was supplied at all
161 |     if not args.vhost_ratio:
162 |         args.vhost_ratio = 1.0
163 | 
164 |     if platform.system() == 'Darwin' and args.autoinput and args.interval < 0.5:
165 |         print('''
166 |                 Under MacOS the internal values are updated every 0.5 seconds by the kernel if you use the host_statistics call.
167 |                 There is another way to get the cpu utilization by using the host_processor_info call.
168 |                 Psutils uses host_statistics so intervals under 0.5 are not sensible. We have opened a discussion here:
169 |                 https://github.com/giampaolo/psutil/issues/2368
170 |                 If you want a higher resolution you can use the cpu_utilization_mac.c file in the demo-reporter folder.
171 |               ''')
172 |         sys.exit(1)
173 | 
174 |     Z = pd.DataFrame.from_dict({
175 |         'HW_CPUFreq' : [args.cpu_freq],
176 |         'CPUThreads': [args.cpu_threads],
177 |         'CPUCores': [args.cpu_cores],
178 |         'TDP': [args.tdp],
179 |         'Hardware_Availability_Year': [args.release_year],
180 |         'HW_MemAmountGB': [args.ram],
181 |         'Architecture': [args.architecture],
182 |         'CPUMake': [args.cpu_make],
183 |         'utilization': [0.0]
184 |     })
185 | 
186 |     Z = pd.get_dummies(Z, columns=['CPUMake', 'Architecture'])
187 | 
188 |     Z = Z.dropna(axis=1)
189 | 
190 | 
191 |     logger.info('vHost ratio is set to %s', args.vhost_ratio)
192 | 
193 |     trained_model = train_model(args.cpu_chips, Z)
194 | 
195 |     logger.info('Infering all predictions to dictionary')
196 | 
197 |     inferred_predictions = infer_predictions(trained_model, Z)
198 |     interpolated_predictions = interpolate_predictions(inferred_predictions)
199 | 
200 |     input_source = sys.stdin
201 |     if args.autoinput:
202 |         import psutil
203 |         def cpu_utilization():
204 |             while True:
205 |                 cpu_util = psutil.cpu_percent(args.interval)
206 |                 yield str(cpu_util)
207 | 
208 |         input_source = cpu_utilization()
209 | 
210 | 
211 |     if args.dump:
212 |         for key, val in interpolated_predictions.items():
213 |             print(key, ':', val*args.vhost_ratio, flush=True)
214 |         sys.exit(0)
215 | 
216 |     if args.dump_hashmap:
217 |         print('#!/usr/bin/env bash')
218 |         print('set -eu')
219 |         print('declare -A cloud_energy_hashmap')
220 |         for key, val in interpolated_predictions.items():
221 |             print(f'cloud_energy_hashmap[{key:.2f}]={val*args.vhost_ratio}', flush=True)
222 |         sys.exit(0)
223 | 
224 |     current_time = time.time_ns()
225 |     for line in input_source:
226 |         utilization = float(line.strip())
227 |         if utilization < 0 or utilization > 100:
228 |             raise ValueError("Utilization can not be over 100%. If you have multiple CPU cores please divide by cpu count.")
229 | 
230 |         if args.energy:
231 |             print(interpolated_predictions[utilization] * args.vhost_ratio * \
232 |                 (time.time_ns() - current_time) / 1_000_000_000, flush=True)
233 |             current_time = time.time_ns()
234 |         else:
235 |             print(interpolated_predictions[utilization] * args.vhost_ratio, flush=True)
236 | 


--------------------------------------------------------------------------------