├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.yaml │ ├── document.yaml │ └── feature_request.yaml ├── dependabot.yml ├── pull_request_template.md └── workflows │ ├── build-push.yml │ ├── collect-data-self-hosted.yml │ ├── collect-train.yml │ ├── commit-msg.yml │ ├── integration-test.yml │ ├── lint.yml │ ├── pr.yml │ ├── push-to-main.yml │ ├── release.yml │ ├── tekton-test.yml │ ├── train-model.yml │ ├── train.yml │ └── unit-test.yml ├── .gitignore ├── .vscode └── settings.json ├── .yamllint.yaml ├── LICENSE ├── Makefile ├── README.md ├── VERSION ├── cmd └── main.py ├── contributing.md ├── dockerfiles ├── Dockerfile ├── Dockerfile.base ├── Dockerfile.dockerignore ├── Dockerfile.test ├── Dockerfile.test-nobase ├── Dockerfile.test-nobase.dockerignore └── Dockerfile.test.dockerignore ├── docs └── developer │ ├── README.md │ ├── estimate │ ├── classes.plantuml │ ├── classes.svg │ ├── packages.plantuml │ └── packages.svg │ ├── server │ ├── classes.plantuml │ ├── classes.svg │ ├── packages.plantuml │ └── packages.svg │ └── train │ ├── classes.plantuml │ ├── classes.svg │ ├── packages.plantuml │ ├── packages.svg │ └── trainer │ ├── classes.plantuml │ ├── classes.svg │ ├── packages.plantuml │ └── packages.svg ├── fig ├── comm_diagram.png ├── model-server-components-simplified.png ├── tekton-complete-train.png ├── tekton-kepler-default.png └── tekton-single-train.png ├── hack ├── aws_helper.sh ├── k8s_helper.sh └── utils.bash ├── manifests ├── base │ ├── estimate-only │ │ └── kustomization.yaml │ ├── estimate-with-server │ │ └── kustomization.yaml │ ├── kustomization.yaml │ ├── openshift │ │ ├── estimate-only │ │ │ └── kustomization.yaml │ │ ├── estimate-with-server │ │ │ └── kustomization.yaml │ │ ├── scc.yaml │ │ └── serve-only │ │ │ └── kustomization.yaml │ ├── patch │ │ ├── patch-estimator-sidecar.yaml │ │ ├── patch-model-server.yaml │ │ ├── patch-openshift.yaml │ │ └── patch-server-only.yaml │ └── serve-only │ │ └── kustomization.yaml ├── compose │ ├── dev │ │ ├── compose.yaml │ │ ├── grafana │ │ │ └── dashboards │ │ │ │ └── dev │ │ │ │ └── dashboard.json │ │ ├── kepler │ │ │ ├── common │ │ │ │ └── var │ │ │ │ │ └── lib │ │ │ │ │ └── kepler │ │ │ │ │ └── data │ │ │ │ │ ├── cpus.yaml │ │ │ │ │ └── model_weight │ │ │ │ │ ├── acpi_AbsPowerModel.json │ │ │ │ │ ├── acpi_DynPowerModel.json │ │ │ │ │ ├── intel_rapl_AbsPowerModel.json │ │ │ │ │ └── intel_rapl_DynPowerModel.json │ │ │ ├── metal │ │ │ │ └── etc │ │ │ │ │ └── kepler │ │ │ │ │ └── kepler.config │ │ │ │ │ ├── ENABLE_PROCESS_METRICS │ │ │ │ │ ├── EXPOSE_ESTIMATED_IDLE_POWER_METRICS │ │ │ │ │ └── EXPOSE_VM_METRICS │ │ │ └── models │ │ │ │ └── etc │ │ │ │ └── kepler │ │ │ │ └── kepler.config │ │ │ │ ├── ENABLE_PROCESS_METRICS │ │ │ │ ├── EXPOSE_ESTIMATED_IDLE_POWER_METRICS │ │ │ │ ├── MODEL_CONFIG │ │ │ │ ├── MODEL_SERVER_ENABLE │ │ │ │ └── MODEL_SERVER_URL │ │ ├── overrides.yaml │ │ └── prometheus │ │ │ └── scrape-configs │ │ │ └── dev.yaml │ └── monitoring │ │ ├── compose.yaml │ │ ├── grafana │ │ ├── Dockerfile │ │ ├── dashboards.yml │ │ └── datasource.yml │ │ └── prometheus │ │ ├── Dockerfile │ │ ├── prometheus.yml │ │ └── rules │ │ └── kepler.rule ├── kepler │ ├── kustomization.yaml │ └── patch │ │ └── patch-ci.yaml ├── offline-trainer │ ├── kustomization.yaml │ └── offline-trainer.yaml ├── server │ ├── base │ │ └── kustomization.yaml │ ├── kustomization.yaml │ ├── kustomizeconfig.yaml │ ├── online-train │ │ ├── kustomization.yaml │ │ └── patch-trainer.yaml │ ├── openshift │ │ ├── online-train │ │ │ ├── kustomization.yaml │ │ │ └── 
patch-trainer.yaml │ │ ├── patch-openshift.yaml │ │ └── serve-only │ │ │ └── kustomization.yaml │ └── server.yaml ├── set.sh └── test │ ├── file-server.yaml │ ├── model-request-client.yaml │ ├── patch-estimator-sidecar.yaml │ └── power-request-client.yaml ├── model_training ├── README.md ├── cmd_instruction.md ├── deployment │ ├── kepler.yaml │ ├── prom-kepler-rbac.yaml │ └── prom-np.yaml ├── s3 │ ├── Dockerfile │ ├── LICENSE.txt │ ├── README.md │ ├── pyproject.toml │ ├── src │ │ └── s3 │ │ │ ├── __about__.py │ │ │ ├── __init__.py │ │ │ ├── loader.py │ │ │ ├── pusher.py │ │ │ └── util.py │ └── tests │ │ └── __init__.py ├── script.sh └── tekton │ ├── README.md │ ├── examples │ ├── complete-pipelinerun.yaml │ ├── single-train │ │ ├── abs-power.yaml │ │ ├── aws-push.yaml │ │ ├── default.yaml │ │ ├── dyn-power.yaml │ │ └── ibmcloud-push.yaml │ ├── test-collect.yaml │ └── test-retrain.yaml │ ├── pipelines │ ├── collect.yaml │ ├── complete-retrain.yaml │ ├── complete-train.yaml │ ├── single-retrain.yaml │ └── single-train.yaml │ ├── pvc │ └── hostpath.yaml │ └── tasks │ ├── extract-task.yaml │ ├── isolate-task.yaml │ ├── original-pipeline-task.yaml │ ├── s3 │ ├── aws-s3-load.yaml │ ├── aws-s3-push.yaml │ ├── ibmcloud-s3-load.yaml │ └── ibmcloud-s3-push.yaml │ ├── stressng-task.yaml │ └── train-task.yaml ├── pyproject.toml ├── src └── kepler_model │ ├── __about__.py │ ├── __init__.py │ ├── abs-train-pipelinerun.yaml │ ├── cmd │ ├── README.md │ ├── __init__.py │ ├── cmd_plot.py │ ├── cmd_util.py │ └── main.py │ ├── estimate │ ├── __init__.py │ ├── archived_model.py │ ├── estimator.py │ ├── model │ │ ├── __init__.py │ │ ├── curvefit_model.py │ │ ├── estimate_common.py │ │ ├── keras_model.py │ │ ├── model.py │ │ ├── scikit_model.py │ │ └── xgboost_model.py │ └── model_server_connector.py │ ├── server │ └── model_server.py │ ├── train │ ├── __init__.py │ ├── ec2_pipeline.py │ ├── exporter │ │ ├── __init__.py │ │ ├── exporter.py │ │ ├── validator.py │ │ └── writer.py │ ├── extractor │ │ ├── __init__.py │ │ ├── extractor.py │ │ ├── preprocess.py │ │ └── smooth_extractor.py │ ├── isolator │ │ ├── __init__.py │ │ ├── isolator.py │ │ └── train_isolator.py │ ├── offline_trainer.py │ ├── online_trainer.py │ ├── pipeline.py │ ├── profiler │ │ ├── __init__.py │ │ ├── generate_scaler.py │ │ ├── node_type_index.py │ │ └── profiler.py │ ├── prom │ │ ├── __init__.py │ │ └── prom_query.py │ ├── specpower_pipeline.py │ └── trainer │ │ ├── ExponentialRegressionTrainer │ │ ├── __init__.py │ │ └── main.py │ │ ├── GradientBoostingRegressorTrainer │ │ ├── __init__.py │ │ └── main.py │ │ ├── KNeighborsRegressorTrainer │ │ ├── __init__.py │ │ └── main.py │ │ ├── LinearRegressionTrainer │ │ ├── __init__.py │ │ └── main.py │ │ ├── LogarithmicRegressionTrainer │ │ ├── __init__.py │ │ └── main.py │ │ ├── LogisticRegressionTrainer │ │ ├── __init__.py │ │ └── main.py │ │ ├── PolynomialRegressionTrainer │ │ ├── __init__.py │ │ └── main.py │ │ ├── SGDRegressorTrainer │ │ ├── __init__.py │ │ └── main.py │ │ ├── SVRRegressorTrainer │ │ ├── __init__.py │ │ └── main.py │ │ ├── XGBoostTrainer │ │ ├── __init__.py │ │ └── main.py │ │ ├── XgboostFitTrainer │ │ ├── __init__.py │ │ └── main.py │ │ ├── __init__.py │ │ ├── curvefit.py │ │ ├── scikit.py │ │ └── xgboost_interface.py │ └── util │ ├── __init__.py │ ├── config.py │ ├── extract_types.py │ ├── format.py │ ├── loader.py │ ├── prom_types.py │ ├── saver.py │ ├── similarity.py │ └── train_types.py └── tests ├── README.md ├── __init__.py ├── client_load_tester.py ├── common_plot.py ├── 
data ├── machine │   └── spec.json ├── node_type_index.json └── prom_output │   ├── idle.json │   └── prom_response.json ├── e2e_test.sh ├── estimator_model_request_test.py ├── estimator_model_test.py ├── estimator_power_request_test.py ├── extractor_test.py ├── http_server.py ├── isolator_test.py ├── minimal_trainer.py ├── model_select_test.py ├── model_server_test.py ├── model_tester.py ├── offline_trainer_test.py ├── pipeline_test.py ├── prom_test.py ├── trainer_test.py ├── weight_model_request_test.py └── xgboost_test.py /.github/ISSUE_TEMPLATE/document.yaml: -------------------------------------------------------------------------------- 1 | name: Documentation Issue 2 | description: Provide supporting details for a documentation issue 3 | labels: kind/documentation 4 | body: 5 | - type: textarea 6 | id: document 7 | attributes: 8 | label: Which document would you like to address? 9 | description: Include the link to the document if applicable 10 | validations: 11 | required: true 12 | 13 | - type: textarea 14 | id: documentFixDetail 15 | attributes: 16 | label: What is the issue? 17 | validations: 18 | required: true 19 | 20 | - type: textarea 21 | id: documentFixSuggestion 22 | attributes: 23 | label: How do you suggest this is fixed? 24 | validations: 25 | required: false 26 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.yaml: -------------------------------------------------------------------------------- 1 | name: Enhancement Tracking Issue 2 | description: Provide supporting details for a feature in development 3 | labels: kind/feature 4 | body: 5 | - type: textarea 6 | id: feature 7 | attributes: 8 | label: What would you like to be added? 9 | description: | 10 | Feature requests are unlikely to make progress as issues. Please consider engaging with SIGs on slack and mailing lists, instead. 11 | A proposal that works through the design along with the implications of the change can be opened as a KEP. 12 | See https://git.k8s.io/enhancements/keps#kubernetes-enhancement-proposals-keps 13 | validations: 14 | required: true 15 | 16 | - type: textarea 17 | id: rationale 18 | attributes: 19 | label: Why is this needed? 20 | validations: 21 | required: true 22 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: pip 4 | directory: / 5 | schedule: 6 | day: monday 7 | interval: weekly 8 | groups: 9 | github-actions: 10 | patterns: 11 | - "*" 12 | - package-ecosystem: github-actions 13 | directory: / 14 | schedule: 15 | day: monday 16 | interval: weekly 17 | groups: 18 | github-actions: 19 | patterns: 20 | - "*" 21 | - package-ecosystem: docker 22 | directory: / 23 | schedule: 24 | day: monday 25 | interval: weekly 26 | groups: 27 | github-actions: 28 | patterns: 29 | - "*" 30 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | # Checklist for PR Author 2 | 3 | --- 4 | 5 | In addition to approval, the author must confirm the following checklist: 6 | 7 | - [ ] Run the following command to format your code: 8 | 9 | ```bash 10 | make fmt 11 | ``` 12 | 13 | - [ ] Create issues for unresolved comments and link them to this PR. 
Use one of the following labels: 14 | - `must-fix`: The logic appears incorrect and must be addressed. 15 | - `minor`: Typos, minor issues, or potential refactoring for better readability. 16 | - `nit`: Trivial issues like extra spaces, commas, etc. 17 | -------------------------------------------------------------------------------- /.github/workflows/collect-train.yml: -------------------------------------------------------------------------------- 1 | # manually run on collect needed 2 | on: # yamllint disable-line rule:truthy 3 | workflow_dispatch: 4 | 5 | jobs: 6 | collect-data: 7 | uses: ./.github/workflows/collect-data-self-hosted.yml 8 | strategy: 9 | matrix: 10 | instance_type: [i3.metal] 11 | max-parallel: 1 12 | with: 13 | instance_type: ${{ matrix.instance_type }} 14 | ami_id: ami-0e4d0bb9670ea8db0 15 | github_repo: ${{ github.repository }} 16 | model_server_image: ${{ vars.IMAGE_REPO }}/kepler_model_server:latest 17 | secrets: 18 | self_hosted_github_token: ${{ secrets.GH_SELF_HOSTED_RUNNER_TOKEN }} 19 | aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} 20 | aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 21 | security_group_id: ${{ secrets.AWS_SECURITY_GROUP_ID }} 22 | aws_region: ${{ secrets.AWS_REGION }} 23 | 24 | train-model: 25 | needs: [collect-data] 26 | strategy: 27 | matrix: 28 | instance_type: [i3.metal] 29 | uses: ./.github/workflows/train-model.yml 30 | with: 31 | pipeline_name: std_v0.7.11 32 | instance_type: ${{ matrix.instance_type }} 33 | ami_id: ami-0e4d0bb9670ea8db0 34 | github_repo: ${{ github.repository }} 35 | model_server_image: ${{ vars.IMAGE_REPO }}/kepler_model_server:latest 36 | trainers: LogisticRegressionTrainer,ExponentialRegressionTrainer,SGDRegressorTrainer,GradientBoostingRegressorTrainer,XgboostFitTrainer 37 | secrets: 38 | self_hosted_github_token: ${{ secrets.GH_SELF_HOSTED_RUNNER_TOKEN }} 39 | aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} 40 | aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 41 | aws_region: ${{ secrets.AWS_REGION }} 42 | -------------------------------------------------------------------------------- /.github/workflows/commit-msg.yml: -------------------------------------------------------------------------------- 1 | name: Commit message check 2 | 3 | on: # yamllint disable-line rule:truthy 4 | pull_request: 5 | 6 | jobs: 7 | check-commit-message: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Checkout code 11 | uses: actions/checkout@v4 12 | 13 | - name: Check commit message 14 | uses: webiny/action-conventional-commits@v1.3.0 15 | with: 16 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 17 | -------------------------------------------------------------------------------- /.github/workflows/integration-test.yml: -------------------------------------------------------------------------------- 1 | name: Integration Test 2 | on: # yamllint disable-line rule:truthy 3 | workflow_call: 4 | inputs: 5 | base_change: 6 | description: Change flag on base image 7 | required: true 8 | type: string 9 | docker_secret: 10 | description: Secret check 11 | required: true 12 | type: string 13 | image_repo: 14 | description: The image repo to use 15 | required: true 16 | type: string 17 | image_tag: 18 | description: The image tag to use 19 | required: true 20 | type: string 21 | kepler_tag: 22 | description: Kepler image tag 23 | required: true 24 | type: string 25 | additional_opts: 26 | description: additional deployment opts 27 | required: true 28 | type: string 29 | 30 | env: 31 | BASE_IMAGE: ${{ 
inputs.image_repo }}/kepler_model_server_base:${{ inputs.image_tag }} 32 | IMAGE: localhost:5001/kepler_model_server:devel 33 | KEPLER_IMAGE: quay.io/sustainable_computing_io/kepler:${{ inputs.kepler_tag }} 34 | DEFAULT_MODEL_SERVER_BASE_IMAGE: quay.io/sustainable_computing_io/kepler_model_server_base:latest 35 | 36 | jobs: 37 | run-integration: 38 | runs-on: ubuntu-20.04 39 | steps: 40 | - name: use Kepler action to deploy cluster 41 | uses: sustainable-computing-io/kepler-action@v0.0.9 42 | with: 43 | runningBranch: kind 44 | cluster_provider: kind 45 | - name: load kepler image 46 | run: | 47 | docker pull ${{ env.KEPLER_IMAGE }} 48 | kind load docker-image ${{ env.KEPLER_IMAGE }} 49 | - name: checkout 50 | uses: actions/checkout@v4 51 | - name: set up QEMU 52 | uses: docker/setup-qemu-action@v3 53 | - name: set up Docker Buildx 54 | uses: docker/setup-buildx-action@v3 55 | - name: Replace value in Dockerfile if base changes 56 | if: ${{ (inputs.base_change == 'true') && (inputs.docker_secret == 'true') }} 57 | run: | 58 | sed -i "s|${{ env.DEFAULT_MODEL_SERVER_BASE_IMAGE }}|${{ env.BASE_IMAGE }}|" dockerfiles/Dockerfile 59 | - name: Replace value in Dockerfile.test if base changes 60 | if: ${{ (inputs.base_change == 'true') && (inputs.docker_secret == 'true') }} 61 | run: | 62 | sed -i "s|${{ env.DEFAULT_MODEL_SERVER_BASE_IMAGE }}|${{ env.BASE_IMAGE }}|" dockerfiles/Dockerfile.test 63 | - name: build Kepler model server and test image and push to local registry 64 | run: make build build-test push push-test 65 | - name: set up Kustomize 66 | run: | 67 | curl -o install_kustomize.sh https://raw.githubusercontent.com/kubernetes-sigs/kustomize/master/hack/install_kustomize.sh 68 | chmod +x install_kustomize.sh 69 | ./install_kustomize.sh 5.3.0 70 | chmod +x kustomize 71 | mv kustomize /usr/local/bin/ 72 | - name: test deploying with only estimator 73 | run: | 74 | make deploy 75 | ./tests/e2e_test.sh --estimator ${{ inputs.additional_opts }} 76 | make cleanup 77 | env: 78 | OPTS: ESTIMATOR 79 | KEPLER_IMAGE_VERSION: ${{ inputs.kepler_tag }} 80 | - name: test deploying with only server 81 | run: | 82 | make deploy 83 | ./tests/e2e_test.sh --server ${{ inputs.additional_opts }} 84 | make cleanup 85 | env: 86 | OPTS: SERVER 87 | KEPLER_IMAGE_VERSION: ${{ inputs.kepler_tag }} 88 | - name: test deploying with estimator and model server 89 | run: | 90 | make deploy 91 | ./tests/e2e_test.sh --estimator --server ${{ inputs.additional_opts }} 92 | make cleanup 93 | env: 94 | OPTS: ESTIMATOR SERVER 95 | KEPLER_IMAGE_VERSION: ${{ inputs.kepler_tag }} 96 | 97 | - name: upload artifacts on failure 98 | if: ${{ failure() }} 99 | uses: actions/upload-artifact@v4 100 | with: 101 | name: integration-test-artifacts 102 | path: tmp/e2e 103 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Run linters and formatters 2 | 3 | on: # yamllint disable-line rule:truthy 4 | pull_request: 5 | 6 | jobs: 7 | markdown-lint: 8 | runs-on: ubuntu-latest 9 | steps: 10 | # checkout soruce code 11 | - name: Checkout code 12 | uses: actions/checkout@v4 13 | 14 | # setup Python environment 15 | - name: Set up Python 16 | uses: actions/setup-python@v5 17 | with: 18 | python-version: "3.10" 19 | 20 | # install hatch 21 | - name: Install hatch 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install hatch 25 | 26 | # scan for markdown linting errors 27 | - name: Run 
pymarkdownlnt on markdown files 28 | shell: bash 29 | run: | 30 | make lint 31 | 32 | # run hatch fmt 33 | - name: Run formatter using hatch 34 | shell: bash 35 | run: | 36 | make fmt 37 | git diff --exit-code 38 | -------------------------------------------------------------------------------- /.github/workflows/push-to-main.yml: -------------------------------------------------------------------------------- 1 | on: # yamllint disable-line rule:truthy 2 | push: 3 | branches: 4 | - main 5 | 6 | env: 7 | TAG: latest 8 | 9 | jobs: 10 | check-branch: 11 | runs-on: ubuntu-latest 12 | 13 | outputs: 14 | tag: ${{ steps.image-tag.outputs.tag }} 15 | 16 | steps: 17 | - uses: actions/checkout@v4 18 | - name: Find Image Tag 19 | id: image-tag 20 | env: 21 | BRANCH: ${{ github.ref_name }} 22 | COMMIT: ${{ github.sha }} 23 | run: | 24 | if [ "${{ github.event_name }}" == 'pull_request' ]; then 25 | echo "tag=pr-${{ github.event.number }}" >> "$GITHUB_OUTPUT" 26 | else 27 | if [ "$BRANCH" == "main" ]; then 28 | echo "tag=${{ env.TAG }}" >> "$GITHUB_OUTPUT" 29 | else 30 | echo "tag=$COMMIT" >> "$GITHUB_OUTPUT" 31 | fi 32 | fi 33 | 34 | check-change: 35 | runs-on: ubuntu-latest 36 | 37 | outputs: 38 | base: ${{ steps.filter.outputs.base }} 39 | modeling: ${{ steps.filter.outputs.modeling }} 40 | s3: ${{ steps.filter.outputs.s3 }} 41 | 42 | steps: 43 | - uses: actions/checkout@v4 44 | - uses: dorny/paths-filter@v3 45 | id: filter 46 | with: 47 | filters: | 48 | base: 49 | - 'pyproject.toml' 50 | - 'dockerfiles/Dockerfile.base' 51 | - '.github/workflows/build-push.yml' 52 | modeling: 53 | - 'src/**' 54 | - 'model_training/**' 55 | - 'hack/**' 56 | - '.github/workflows/train-model.yml' 57 | s3: 58 | - 'model_training/s3/**' 59 | 60 | build-push: 61 | needs: [check-change, check-branch] 62 | uses: ./.github/workflows/build-push.yml 63 | with: 64 | base_change: ${{ needs.check-change.outputs.base }} 65 | s3_change: ${{ needs.check-change.outputs.s3 }} 66 | image_repo: ${{ vars.IMAGE_REPO }} 67 | image_tag: ${{ needs.check-branch.outputs.tag }} 68 | push: true 69 | secrets: 70 | docker_username: ${{ secrets.BOT_NAME }} 71 | docker_password: ${{ secrets.BOT_TOKEN }} 72 | 73 | train-model: 74 | needs: [check-change, check-branch, build-push] 75 | if: ${{ needs.check-change.outputs.modeling == 'true' }} 76 | strategy: 77 | matrix: 78 | instance_type: [i3.metal] 79 | uses: ./.github/workflows/train-model.yml 80 | with: 81 | pipeline_name: std_v0.7.11 82 | instance_type: ${{ matrix.instance_type }} 83 | ami_id: ami-0e4d0bb9670ea8db0 84 | github_repo: ${{ github.repository }} 85 | model_server_image: ${{ vars.IMAGE_REPO }}/kepler_model_server:${{ needs.check-branch.outputs.tag }} 86 | trainers: LogisticRegressionTrainer,ExponentialRegressionTrainer,SGDRegressorTrainer,GradientBoostingRegressorTrainer,XgboostFitTrainer 87 | secrets: 88 | self_hosted_github_token: ${{ secrets.GH_SELF_HOSTED_RUNNER_TOKEN }} 89 | aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} 90 | aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 91 | aws_region: ${{ secrets.AWS_REGION }} 92 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | on: # yamllint disable-line rule:truthy 3 | workflow_dispatch: 4 | inputs: 5 | tag: 6 | description: Tag name, e.g. 
0.7.11 7 | default: "" 8 | required: true 9 | 10 | jobs: 11 | build: 12 | name: Upload Release Asset 13 | permissions: 14 | contents: write 15 | runs-on: ubuntu-latest 16 | steps: 17 | - name: Checkout code 18 | uses: actions/checkout@v4 19 | 20 | - name: Login to Quay.io 21 | uses: docker/login-action@v3 22 | with: 23 | registry: ${{ vars.IMAGE_REGISTRY }} 24 | username: ${{ secrets.BOT_NAME }} 25 | password: ${{ secrets.BOT_TOKEN }} 26 | 27 | - name: Git set user 28 | shell: bash 29 | run: | 30 | git config user.name "$USERNAME" 31 | git config user.email "$USERENAME-bot@users.noreply.github.com" 32 | env: 33 | USERNAME: ${{ github.actor }} 34 | 35 | - name: Update the VERSION 36 | run: | 37 | echo "$VERSION" > VERSION 38 | env: 39 | VERSION: ${{ github.event.inputs.tag }} 40 | 41 | - name: Build model-server-base 42 | run: | 43 | make build-base 44 | env: 45 | IMAGE_REGISTRY: ${{ vars.IMAGE_REGISTRY }} 46 | 47 | - name: Push model-server-base 48 | run: | 49 | make push-base 50 | env: 51 | IMAGE_REGISTRY: ${{ vars.IMAGE_REGISTRY }} 52 | 53 | - name: Update base in model-server dockerfile 54 | run: | 55 | sed -i "s/model_server_base:.*/model_server_base:v$VERSION/g" ./dockerfiles/Dockerfile 56 | env: 57 | VERSION: ${{ github.event.inputs.tag }} 58 | 59 | - name: Build model-server 60 | run: | 61 | make build 62 | env: 63 | IMAGE_REGISTRY: ${{ vars.IMAGE_REGISTRY }} 64 | 65 | - name: Create tag 66 | run: | 67 | git add VERSION ./dockerfiles/Dockerfile 68 | git commit -m "ci: update VERSION to $VERSION" 69 | git tag -a "v$VERSION" -m "$VERSION" 70 | git show --stat 71 | env: 72 | VERSION: ${{ github.event.inputs.tag }} 73 | 74 | - name: Push Images 75 | run: | 76 | make push 77 | env: 78 | IMAGE_REGISTRY: ${{ vars.IMAGE_REGISTRY }} 79 | 80 | - name: Push Release tag 81 | run: | 82 | git push --follow-tags 83 | 84 | - name: Create Release 85 | id: create_release 86 | uses: actions/create-release@v1 87 | env: 88 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 89 | with: 90 | tag_name: v${{ github.event.inputs.tag }} 91 | release_name: v${{ github.event.inputs.tag }}-release 92 | draft: false 93 | prerelease: false 94 | 95 | create-release-branch: 96 | name: Create Release Branch 97 | permissions: 98 | contents: write 99 | needs: build 100 | runs-on: ubuntu-latest 101 | steps: 102 | - name: Create release branch 103 | uses: peterjgrainger/action-create-branch@v3.0.0 104 | env: 105 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 106 | with: 107 | branch: v${{ github.event.inputs.tag }}-release 108 | sha: ${{ github.event.pull_request.head.sha }} 109 | -------------------------------------------------------------------------------- /.github/workflows/train.yml: -------------------------------------------------------------------------------- 1 | # manually run on retrain needed 2 | name: Retrain All Machines 3 | on: # yamllint disable-line rule:truthy 4 | workflow_dispatch: 5 | 6 | jobs: 7 | 8 | check-change: 9 | runs-on: ubuntu-latest 10 | 11 | outputs: 12 | modeling: ${{ steps.filter.outputs.modeling }} 13 | 14 | steps: 15 | - uses: actions/checkout@v4 16 | - uses: dorny/paths-filter@v3 17 | id: filter 18 | with: 19 | filters: | 20 | modeling: 21 | - 'src/**' 22 | - 'model_training/**' 23 | - 'hack/**' 24 | - '.github/workflows/train-model.yml' 25 | 26 | train-model: 27 | needs: [check-change] 28 | if: ${{ needs.check-change.outputs.modeling == 'true' }} 29 | strategy: 30 | matrix: 31 | instance_type: [i3.metal] 32 | uses: ./.github/workflows/train-model.yml 33 | with: 34 | pipeline_name: std_v0.7.11 
35 | instance_type: ${{ matrix.instance_type }} 36 | ami_id: ami-0e4d0bb9670ea8db0 37 | github_repo: ${{ github.repository }} 38 | model_server_image: ${{ vars.IMAGE_REPO }}/kepler_model_server:latest 39 | trainers: LogisticRegressionTrainer,ExponentialRegressionTrainer,SGDRegressorTrainer,GradientBoostingRegressorTrainer,XgboostFitTrainer 40 | secrets: 41 | self_hosted_github_token: ${{ secrets.GH_SELF_HOSTED_RUNNER_TOKEN }} 42 | aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} 43 | aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 44 | aws_region: ${{ secrets.AWS_REGION }} 45 | -------------------------------------------------------------------------------- /.github/workflows/unit-test.yml: -------------------------------------------------------------------------------- 1 | name: Unit Test 2 | 3 | on: # yamllint disable-line rule:truthy 4 | workflow_call: 5 | secrets: 6 | docker_username: 7 | description: Docker username 8 | required: false 9 | docker_password: 10 | description: Docker password 11 | required: false 12 | inputs: 13 | base_change: 14 | description: Change flag on base image 15 | required: true 16 | type: string 17 | 18 | jobs: 19 | unit-test: 20 | runs-on: ubuntu-latest 21 | steps: 22 | - uses: actions/checkout@v4 23 | - name: Set up Docker 24 | uses: docker/setup-buildx-action@v3 25 | - name: Build test with base image 26 | if: ${{ inputs.base_change != 'true' }} 27 | run: make build-test 28 | - name: Build test without base image 29 | if: ${{ inputs.base_change == 'true' }} 30 | run: make build-test-nobase 31 | - name: Test pipeline # need to run first to build the models 32 | run: make test-pipeline 33 | - name: Test model server 34 | run: make test-model-server 35 | timeout-minutes: 5 36 | - name: Test estimator 37 | run: make test-estimator 38 | timeout-minutes: 5 39 | - name: Test offline trainer 40 | run: make test-offline-trainer 41 | - name: Test model server select 42 | run: make test-model-server-select 43 | timeout-minutes: 5 44 | - name: Test model server select via estimator 45 | run: make test-model-server-estimator-select 46 | timeout-minutes: 5 47 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # models 2 | server/train/local/ 3 | server/models 4 | */*/download 5 | 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | pip-wheel-metadata/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .nox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | *.py,cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | db.sqlite3 67 | db.sqlite3-journal 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 100 | __pypackages__/ 101 | 102 | # Celery stuff 103 | celerybeat-schedule 104 | celerybeat.pid 105 | 106 | # SageMath parsed files 107 | *.sage.py 108 | 109 | # Environments 110 | .env 111 | .venv 112 | env/ 113 | venv/ 114 | ENV/ 115 | env.bak/ 116 | venv.bak/ 117 | 118 | # Spyder project settings 119 | .spyderproject 120 | .spyproject 121 | 122 | # Rope project settings 123 | .ropeproject 124 | 125 | # mkdocs documentation 126 | /site 127 | 128 | # mypy 129 | .mypy_cache/ 130 | .dmypy.json 131 | dmypy.json 132 | 133 | # Pyre type checker 134 | .pyre/ 135 | 136 | tests/download/* 137 | .DS_Store 138 | */.DS_Store 139 | */*/.DS_Store 140 | */*/*/.DS_Store 141 | 142 | /src/kepler_model/models/ 143 | /tests/models/ 144 | /src/resource/ 145 | tests/data/extractor_output 146 | tests/data/isolator_output 147 | tests/data/offline_trainer_output 148 | tests/data/plot_output 149 | model_training/*data* 150 | model_training/tekton/secret 151 | local-dev-cluster 152 | tmp 153 | tests/db-models 154 | db-models 155 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.analysis.extraPaths": [ 3 | "./src/util" 4 | ] 5 | } -------------------------------------------------------------------------------- /.yamllint.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | extends: default 3 | rules: 4 | line-length: disable 5 | document-start: disable 6 | comments: 7 | min-spaces-from-content: 1 8 | quoted-strings: 9 | required: only-when-needed 10 | extra-required: 11 | - ^.*:\s.*$ 12 | - ^.*:$ 13 | quote-type: double 14 | ignore: 15 | - model_training/deployment/cpe-operator.yaml 16 | - tmp/ 17 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 0.7.12 2 | -------------------------------------------------------------------------------- /cmd/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # -*- coding: utf-8 -*- 4 | import re 5 | import sys 6 | 7 | from kepler_model.cmd.main 
import run 8 | 9 | if __name__ == "__main__": 10 | sys.argv[0] = re.sub(r"(-script\.pyw|\.exe)?$", "", sys.argv[0]) 11 | sys.exit(run()) 12 | -------------------------------------------------------------------------------- /contributing.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | [Get started with Kepler Model Server.](https://sustainable-computing.io/kepler_model_server/get_started/) 4 | 5 | - The main source code is in the [src directory](./src/). 6 | 7 | ## PR Hands-on 8 | 9 | - Create a related [issue](https://github.com/sustainable-computing-io/kepler-model-server/issues) with your name assigned first (if one does not already exist). 10 | 11 | - Set the required secrets and environment variables for local repository tests if needed. Check the table below. 12 | 13 | | Objective | Required Secret | Required Environment | 14 | | --------- | --------------- | -------------------- | 15 | | Push to private repo | BOT_NAME, BOT_TOKEN | IMAGE_REPO | 16 | | Change on base image | BOT_NAME, BOT_TOKEN | IMAGE_REPO | 17 | | Save data/models to AWS COS | AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_REGION | | 18 | 19 | ## Improve components in training pipelines 20 | 21 | Learn more about the [Training Pipeline](https://sustainable-computing.io/kepler_model_server/pipeline/) 22 | 23 | ### Introduce new feature group 24 | 25 | - Define a new feature group name in `FeatureGroup` and update the metric list map `FeatureGroups` in [train types](./src/util/train_types.py) 26 | 27 | ### Introduce new energy sources 28 | 29 | - Define a new energy source in the `PowerSourceMap` map in [train types](./src/util/train_types.py) 30 | 31 | ### Improve preprocessing method 32 | 33 | - [extractor](./src/train/extractor/): converts numerically aggregated metrics to per-second values 34 | - [isolator](./src/train/isolator/): isolates background (idle) power from the collected power 35 | 36 | ### Introduce new learning method 37 | 38 | - [trainer](./src/train/trainer/): applies a learning method to build a model from the extracted and isolated data 39 | 40 | ## Model training 41 | 42 | Learn more about [model training](./model_training/) 43 | 44 | ### Introduce new benchmarks 45 | 46 | ### Tekton 47 | 48 | Create a workload `Task` and provide an example `Pipeline` to run. 49 | 50 | ### Add new trained models 51 | 52 | TBD 53 | 54 | ## Source improvement 55 | 56 | Any improvement in `src` and `cmd`. 57 | 58 | ## Test and CI improvement 59 | 60 | Any improvement in `tests`, `dockerfiles`, `manifests`, and `.github/workflows` 61 | 62 | ## Documentation 63 | 64 | Detailed documentation should be posted to the [kepler-doc](https://github.com/sustainable-computing-io/kepler-doc) repository. 65 | -------------------------------------------------------------------------------- /dockerfiles/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM quay.io/sustainable_computing_io/kepler_model_server_base:v0.7.12 2 | 3 | WORKDIR /kepler_model 4 | ENV PYTHONPATH=/kepler_model 5 | 6 | COPY pyproject.toml . 7 | COPY README.md . 8 | COPY cmd/ cmd/ 9 | COPY src/ src/ 10 | 11 | RUN pip install --no-cache-dir . 
12 | 13 | # port for Model Server 14 | EXPOSE 8100 15 | # port for Online Trainer (TODO: reserved for event-based online training) 16 | EXPOSE 8101 17 | # port for Offline Trainer 18 | EXPOSE 8102 19 | 20 | ENTRYPOINT ["bash", "-c"] 21 | CMD ["kepler-model"] 22 | -------------------------------------------------------------------------------- /dockerfiles/Dockerfile.base: -------------------------------------------------------------------------------- 1 | FROM python:3.10-slim 2 | # 3 | # NOTE: This file contains all tools and dependencies needed for 4 | # setting up the development and testing environment 5 | 6 | # Prevents Python from writing pyc files. 7 | ENV PYTHONDONTWRITEBYTECODE=1 8 | 9 | # Keeps Python from buffering stdout and stderr to avoid situations where 10 | # the application crashes without emitting any logs due to buffering. 11 | ENV PYTHONUNBUFFERED=1 12 | 13 | RUN pip install --no-cache-dir --upgrade pip && \ 14 | python -m pip install --no-cache-dir hatch && \ 15 | pip cache purge 16 | 17 | WORKDIR /kepler_model 18 | ENV PYTHONPATH=/kepler_model 19 | 20 | COPY pyproject.toml . 21 | 22 | # NOTE: README.md and __about__.py are referenced in pyproject.toml 23 | # so they are copied into the image for pip install to succeed 24 | COPY README.md . 25 | 26 | RUN mkdir -p src/kepler_model 27 | COPY src/kepler_model/__init__.py src/kepler_model/ 28 | COPY src/kepler_model/__about__.py src/kepler_model/ 29 | 30 | RUN pip install --no-cache-dir . && \ 31 | pip cache purge 32 | -------------------------------------------------------------------------------- /dockerfiles/Dockerfile.dockerignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | src/resource/ 3 | src/kepler_model/models/ 4 | tests/models/ 5 | -------------------------------------------------------------------------------- /dockerfiles/Dockerfile.test: -------------------------------------------------------------------------------- 1 | FROM quay.io/sustainable_computing_io/kepler_model_server_base:latest 2 | 3 | # Prevents Python from writing pyc files. 4 | ENV PYTHONDONTWRITEBYTECODE=1 5 | 6 | # Keeps Python from buffering stdout and stderr to avoid situations where 7 | # the application crashes without emitting any logs due to buffering. 8 | ENV PYTHONUNBUFFERED=1 9 | 10 | 11 | WORKDIR /kepler_model 12 | ENV PYTHONPATH=/kepler_model 13 | 14 | COPY pyproject.toml . 15 | COPY README.md . 16 | COPY cmd/ cmd/ 17 | COPY src/ src/ 18 | COPY tests/ tests/ 19 | 20 | RUN pip install --no-cache-dir . && \ 21 | pip cache purge 22 | 23 | RUN mkdir -p /mnt/models 24 | 25 | # port for Model Server 26 | EXPOSE 8100 27 | # port for Online Trainer (TODO: reserved for event-based online training) 28 | EXPOSE 8101 29 | # port for Offline Trainer 30 | EXPOSE 8102 31 | 32 | CMD ["model-server"] 33 | -------------------------------------------------------------------------------- /dockerfiles/Dockerfile.test-nobase: -------------------------------------------------------------------------------- 1 | FROM python:3.10-slim 2 | 3 | # NOTE: This file contains all tools and dependencies needed for 4 | # setting up the development and testing environment 5 | 6 | # Prevents Python from writing pyc files. 7 | ENV PYTHONDONTWRITEBYTECODE=1 8 | 9 | # Keeps Python from buffering stdout and stderr to avoid situations where 10 | # the application crashes without emitting any logs due to buffering. 
11 | ENV PYTHONUNBUFFERED=1 12 | 13 | RUN python -m pip install --no-cache-dir hatch && \ 14 | pip cache purge 15 | 16 | WORKDIR /kepler_model 17 | ENV PYTHONPATH=/kepler_model 18 | 19 | 20 | COPY pyproject.toml . 21 | 22 | # NOTE: README.md and src/../__about__.py are referenced in pyproject.toml 23 | # so that they are copied into the image for pip install to succeed 24 | COPY README.md . 25 | COPY cmd/ cmd/ 26 | COPY src/ src/ 27 | COPY tests/ tests/ 28 | 29 | RUN pip install --no-cache-dir . && \ 30 | pip cache purge 31 | 32 | RUN hatch env create 33 | 34 | 35 | RUN mkdir -p /mnt/models 36 | # port for Model Server 37 | EXPOSE 8100 38 | # port for Online Trainer (TODO: reserved for event-based online training) 39 | EXPOSE 8101 40 | # port for Offline Trainer 41 | EXPOSE 8102 42 | 43 | CMD ["model-server"] 44 | -------------------------------------------------------------------------------- /dockerfiles/Dockerfile.test-nobase.dockerignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | src/resource/ 3 | src/kepler_model/models/ 4 | tests/models/ 5 | -------------------------------------------------------------------------------- /dockerfiles/Dockerfile.test.dockerignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | src/resource/ 3 | src/kepler_model/models/ 4 | tests/models/ 5 | -------------------------------------------------------------------------------- /docs/developer/README.md: -------------------------------------------------------------------------------- 1 | # Developer Guide 2 | 3 | - Temporarily add `__init__.py` to all directories 4 | 5 | ```bash 6 | find ./src -type d -exec touch {}/__init__.py \; 7 | ``` 8 | 9 | - Generate `classes.plantuml` and `packages.plantuml` using the following commands 10 | 11 | ```bash 12 | pyreverse --colorized --output plantuml --module-names y --show-stdlib --show-associated 2 --show-ancestors 1 --verbose -d umls/server/ --source-roots ./src/ ./src/server/ 13 | pyreverse --colorized --output plantuml --module-names y --show-stdlib --show-associated 2 --show-ancestors 1 --verbose -d umls/estimate/ --source-roots ./src/ ./src/estimate/ 14 | pyreverse --colorized --output plantuml --module-names y --show-stdlib --show-associated 2 --show-ancestors 1 --verbose -d umls/train/ --source-roots ./src/ ./src/train/ 15 | pyreverse --colorized --output plantuml --module-names y --show-stdlib --show-associated 2 --show-ancestors 1 --verbose -d umls/train/trainer/ --source-roots ./src/ ./src/train/trainer/ 16 | ``` 17 | 18 | - Use [plantuml](https://plantuml.com/download) to convert plantuml files to `svg` files The NeoVim plugin `neovim-soil` was used to generate the svg files from the plantuml files -------------------------------------------------------------------------------- /docs/developer/estimate/packages.plantuml: -------------------------------------------------------------------------------- 1 | @startuml packages 2 | set namespaceSeparator none 3 | package "estimate" as estimate #77AADD { 4 | } 5 | package "estimate.archived_model" as estimate.archived_model #77AADD { 6 | } 7 | package "estimate.estimator" as estimate.estimator #77AADD { 8 | } 9 | package "estimate.model" as estimate.model #99DDFF { 10 | } 11 | package "estimate.model.curvefit_model" as estimate.model.curvefit_model #99DDFF { 12 | } 13 | package "estimate.model.estimate_common" as estimate.model.estimate_common #99DDFF { 14 | } 15 | package "estimate.model.keras_model" as 
estimate.model.keras_model #99DDFF { 16 | } 17 | package "estimate.model.model" as estimate.model.model #99DDFF { 18 | } 19 | package "estimate.model.scikit_model" as estimate.model.scikit_model #99DDFF { 20 | } 21 | package "estimate.model.xgboost_model" as estimate.model.xgboost_model #99DDFF { 22 | } 23 | package "estimate.model_server_connector" as estimate.model_server_connector #77AADD { 24 | } 25 | estimate --> estimate.model 26 | estimate.archived_model --> estimate.model_server_connector 27 | estimate.estimator --> estimate.archived_model 28 | estimate.estimator --> estimate.model 29 | estimate.estimator --> estimate.model_server_connector 30 | estimate.model.curvefit_model --> estimate.model.estimate_common 31 | estimate.model.keras_model --> estimate.model.estimate_common 32 | estimate.model.model --> estimate.model.curvefit_model 33 | estimate.model.model --> estimate.model.scikit_model 34 | estimate.model.model --> estimate.model.xgboost_model 35 | estimate.model.scikit_model --> estimate.model.estimate_common 36 | estimate.model.xgboost_model --> estimate.model.estimate_common 37 | @enduml 38 | -------------------------------------------------------------------------------- /docs/developer/server/classes.plantuml: -------------------------------------------------------------------------------- 1 | @startuml classes 2 | set namespaceSeparator none 3 | class "server.model_server.ModelRequest" as server.model_server.ModelRequest #77AADD { 4 | filter : str 5 | metrics 6 | node_type : int 7 | output_type 8 | source : str 9 | trainer_name : str 10 | weight : bool 11 | } 12 | @enduml 13 | -------------------------------------------------------------------------------- /docs/developer/server/classes.svg: -------------------------------------------------------------------------------- 1 | server.model_server.ModelRequestfilter : strmetricsnode_type : intoutput_typesource : strtrainer_name : strweight : bool -------------------------------------------------------------------------------- /docs/developer/server/packages.plantuml: -------------------------------------------------------------------------------- 1 | @startuml packages 2 | set namespaceSeparator none 3 | package "server" as server #77AADD { 4 | } 5 | package "server.model_server" as server.model_server #77AADD { 6 | } 7 | @enduml 8 | -------------------------------------------------------------------------------- /docs/developer/server/packages.svg: -------------------------------------------------------------------------------- 1 | serverserver.model_server -------------------------------------------------------------------------------- /docs/developer/train/trainer/packages.plantuml: -------------------------------------------------------------------------------- 1 | @startuml packages 2 | set namespaceSeparator none 3 | package "train.trainer" as train.trainer #77AADD { 4 | } 5 | package "train.trainer.ExponentialRegressionTrainer" as train.trainer.ExponentialRegressionTrainer #77AADD { 6 | } 7 | package "train.trainer.ExponentialRegressionTrainer.main" as train.trainer.ExponentialRegressionTrainer.main #77AADD { 8 | } 9 | package "train.trainer.GradientBoostingRegressorTrainer" as train.trainer.GradientBoostingRegressorTrainer #77AADD { 10 | } 11 | package "train.trainer.GradientBoostingRegressorTrainer.main" as train.trainer.GradientBoostingRegressorTrainer.main #77AADD { 12 | } 13 | package "train.trainer.KNeighborsRegressorTrainer" as train.trainer.KNeighborsRegressorTrainer #77AADD { 14 | } 15 | package 
"train.trainer.KNeighborsRegressorTrainer.main" as train.trainer.KNeighborsRegressorTrainer.main #77AADD { 16 | } 17 | package "train.trainer.LinearRegressionTrainer" as train.trainer.LinearRegressionTrainer #77AADD { 18 | } 19 | package "train.trainer.LinearRegressionTrainer.main" as train.trainer.LinearRegressionTrainer.main #77AADD { 20 | } 21 | package "train.trainer.LogarithmicRegressionTrainer" as train.trainer.LogarithmicRegressionTrainer #77AADD { 22 | } 23 | package "train.trainer.LogarithmicRegressionTrainer.main" as train.trainer.LogarithmicRegressionTrainer.main #77AADD { 24 | } 25 | package "train.trainer.LogisticRegressionTrainer" as train.trainer.LogisticRegressionTrainer #77AADD { 26 | } 27 | package "train.trainer.LogisticRegressionTrainer.main" as train.trainer.LogisticRegressionTrainer.main #77AADD { 28 | } 29 | package "train.trainer.PolynomialRegressionTrainer" as train.trainer.PolynomialRegressionTrainer #77AADD { 30 | } 31 | package "train.trainer.PolynomialRegressionTrainer.main" as train.trainer.PolynomialRegressionTrainer.main #77AADD { 32 | } 33 | package "train.trainer.SGDRegressorTrainer" as train.trainer.SGDRegressorTrainer #77AADD { 34 | } 35 | package "train.trainer.SGDRegressorTrainer.main" as train.trainer.SGDRegressorTrainer.main #77AADD { 36 | } 37 | package "train.trainer.SVRRegressorTrainer" as train.trainer.SVRRegressorTrainer #77AADD { 38 | } 39 | package "train.trainer.SVRRegressorTrainer.main" as train.trainer.SVRRegressorTrainer.main #77AADD { 40 | } 41 | package "train.trainer.XGBoostTrainer" as train.trainer.XGBoostTrainer #77AADD { 42 | } 43 | package "train.trainer.XGBoostTrainer.main" as train.trainer.XGBoostTrainer.main #77AADD { 44 | } 45 | package "train.trainer.XgboostFitTrainer" as train.trainer.XgboostFitTrainer #77AADD { 46 | } 47 | package "train.trainer.XgboostFitTrainer.main" as train.trainer.XgboostFitTrainer.main #77AADD { 48 | } 49 | package "train.trainer.curvefit" as train.trainer.curvefit #77AADD { 50 | } 51 | package "train.trainer.scikit" as train.trainer.scikit #77AADD { 52 | } 53 | package "train.trainer.xgboost_interface" as train.trainer.xgboost_interface #77AADD { 54 | } 55 | train.trainer.XgboostFitTrainer.main --> train.trainer.xgboost_interface 56 | @enduml 57 | -------------------------------------------------------------------------------- /fig/comm_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/fig/comm_diagram.png -------------------------------------------------------------------------------- /fig/model-server-components-simplified.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/fig/model-server-components-simplified.png -------------------------------------------------------------------------------- /fig/tekton-complete-train.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/fig/tekton-complete-train.png -------------------------------------------------------------------------------- /fig/tekton-kepler-default.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/fig/tekton-kepler-default.png -------------------------------------------------------------------------------- /fig/tekton-single-train.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/fig/tekton-single-train.png -------------------------------------------------------------------------------- /hack/k8s_helper.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # This file is part of the Kepler project 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # Copyright 2023 The Kepler Contributors 18 | # 19 | 20 | set -e 21 | 22 | rollout_ns_status() { 23 | local resources 24 | resources=$(kubectl get deployments,statefulsets,daemonsets -n=$1 -o name) 25 | for res in $resources; do 26 | kubectl rollout status $res --namespace $1 --timeout=10m || die "failed to check status of ${res} inside namespace ${1}" 27 | done 28 | } 29 | 30 | _get_value() { 31 | res=$1 32 | namespace=$2 33 | location=$3 34 | kubectl get $res -n $namespace -ojson|jq -r $location 35 | } 36 | 37 | _get_succeed_condition() { 38 | resource=$1 39 | name=$2 40 | namespace=$3 41 | if [ "$(kubectl get $resource $name -n $namespace -ojson|jq '.status.conditions | length')" == 0 ]; then 42 | echo Unknown 43 | else 44 | location='.status.conditions|map(select(.type="Succeeded"))[0].status' 45 | _get_value $resource/$name $namespace $location 46 | fi 47 | } 48 | 49 | _log_completed_pod() { 50 | local resources 51 | name=$1 52 | namespace=$2 53 | location=".status.phase" 54 | resources=$(kubectl get pods -n=$namespace -o name) 55 | for res in $resources; do 56 | if [ "$res" == "pod/${name}-run-stressng-pod" ]; then 57 | # get parameters and estimation time 58 | kubectl logs $res -n $namespace|head 59 | fi 60 | echo $res 61 | if [ "$res" == "pod/${name}-presteps-pod" ]; then 62 | # get parameters and estimation time 63 | kubectl logs $res -n $namespace -c step-collect-idle|tail 64 | else 65 | kubectl logs $res -n $namespace|tail 66 | fi 67 | done 68 | } 69 | 70 | wait_for_pipelinerun() { 71 | resource=pipelinerun 72 | name=$1 73 | namespace=default 74 | 75 | if kubectl get taskruns|grep ${name}-run-stressng; then 76 | value=$(_get_succeed_condition $resource $name $namespace) 77 | while [ "$value" == "Unknown" ] ; 78 | do 79 | echo "Wait for pipeline $name to run workload" 80 | kubectl get pods 81 | value=$(_get_succeed_condition $resource $name $namespace) 82 | if kubectl get pod/${name}-run-stressng-pod |grep Running ; then 83 | estimate_time_line=$(kubectl logs pod/${name}-run-stressng-pod -c step-run-stressng -n $namespace|grep "Estimation Time (s):") 84 | estimate_time=$(echo ${estimate_time_line}|awk '{print $4}') 85 | echo "${estimate_time_line}, 
sleep" 86 | sleep ${estimate_time} 87 | break 88 | fi 89 | sleep 60 90 | done 91 | fi 92 | 93 | value=$(_get_succeed_condition $resource $name $namespace) 94 | while [ "$value" == "Unknown" ] ; 95 | do 96 | echo "Wait for pipeline $name to be succeeded" 97 | kubectl get pods 98 | sleep 60 99 | value=$(_get_succeed_condition $resource $name $namespace) 100 | done 101 | 102 | kubectl get taskrun 103 | _log_completed_pod $name $namespace 104 | if [ "$value" == "False" ]; then 105 | exit 1 106 | fi 107 | } 108 | 109 | "$@" 110 | -------------------------------------------------------------------------------- /hack/utils.bash: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2024. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | is_fn() { 18 | [[ $(type -t "$1") == "function" ]] 19 | return $? 20 | } 21 | 22 | header() { 23 | local title=" 🔆🔆🔆 $* 🔆🔆🔆 " 24 | 25 | local len=40 26 | if [[ ${#title} -gt $len ]]; then 27 | len=${#title} 28 | fi 29 | 30 | echo -e "\n\n \033[1m${title}\033[0m" 31 | echo -n "━━━━━" 32 | printf '━%.0s' $(seq "$len") 33 | echo "━━━━━━━" 34 | 35 | } 36 | 37 | info() { 38 | echo -e " 🔔 $*" >&2 39 | } 40 | 41 | err() { 42 | echo -e " 😱 $*" >&2 43 | } 44 | 45 | warn() { 46 | echo -e "  $*" >&2 47 | } 48 | 49 | ok() { 50 | echo -e " ✅ $*" >&2 51 | } 52 | 53 | skip() { 54 | echo -e " 🙈 SKIP: $*" >&2 55 | } 56 | 57 | fail() { 58 | echo -e " ❌ FAIL: $*" >&2 59 | } 60 | 61 | info_run() { 62 | echo -e "  $*\n" >&2 63 | } 64 | 65 | run() { 66 | echo -e " ❯ $*\n" >&2 67 | "$@" 68 | } 69 | 70 | die() { 71 | echo -e "\n ✋ $* " 72 | echo -e "──────────────────── ⛔️⛔️⛔️ ────────────────────────\n" 73 | exit 1 74 | } 75 | 76 | line() { 77 | local len="$1" 78 | local style="${2:-thin}" 79 | shift 80 | 81 | local ch='─' 82 | [[ "$style" == 'heavy' ]] && ch="━" 83 | 84 | printf "$ch%.0s" $(seq "$len") >&2 85 | echo 86 | } 87 | -------------------------------------------------------------------------------- /manifests/base/estimate-only/kustomization.yaml: -------------------------------------------------------------------------------- 1 | namespace: kepler 2 | 3 | apiVersion: kustomize.config.k8s.io/v1beta1 4 | kind: Kustomization 5 | images: 6 | - name: kepler_model_server 7 | newName: localhost:5001/kepler_model_server 8 | newTag: devel 9 | 10 | patchesStrategicMerge: 11 | - ./patch/patch-estimator-sidecar.yaml 12 | 13 | resources: 14 | - ../kepler 15 | -------------------------------------------------------------------------------- /manifests/base/estimate-with-server/kustomization.yaml: -------------------------------------------------------------------------------- 1 | namespace: kepler 2 | 3 | apiVersion: kustomize.config.k8s.io/v1beta1 4 | kind: Kustomization 5 | images: 6 | - name: kepler_model_server 7 | newName: localhost:5001/kepler_model_server 8 | newTag: devel 9 | 10 | patchesStrategicMerge: 11 | - ./patch/patch-estimator-sidecar.yaml 12 | - 
./patch/patch-model-server.yaml 13 | 14 | resources: 15 | - ../kepler 16 | - ../server 17 | -------------------------------------------------------------------------------- /manifests/base/kustomization.yaml: -------------------------------------------------------------------------------- 1 | namespace: kepler 2 | 3 | apiVersion: kustomize.config.k8s.io/v1beta1 4 | kind: Kustomization 5 | images: 6 | - name: kepler_model_server 7 | newName: quay.io/sustainable_computing_io/kepler_model_server 8 | newTag: latest 9 | 10 | patchesStrategicMerge: 11 | - ./patch/patch-estimator-sidecar.yaml 12 | - ./patch/patch-model-server.yaml 13 | 14 | resources: 15 | - ../kepler 16 | - ../server 17 | -------------------------------------------------------------------------------- /manifests/base/openshift/estimate-only/kustomization.yaml: -------------------------------------------------------------------------------- 1 | namespace: kepler 2 | 3 | patchesStrategicMerge: 4 | - ./patch/patch-estimator-sidecar.yaml 5 | - ./patch/patch-openshift.yaml 6 | 7 | resources: 8 | - ../kepler 9 | - ./openshift/scc.yaml 10 | -------------------------------------------------------------------------------- /manifests/base/openshift/estimate-with-server/kustomization.yaml: -------------------------------------------------------------------------------- 1 | namespace: kepler 2 | 3 | patchesStrategicMerge: 4 | - ./patch/patch-estimator-sidecar.yaml 5 | - ./patch/patch-model-server.yaml 6 | - ./patch/patch-openshift.yaml 7 | 8 | resources: 9 | - ../kepler 10 | - ../server 11 | - ./openshift/scc.yaml 12 | -------------------------------------------------------------------------------- /manifests/base/openshift/scc.yaml: -------------------------------------------------------------------------------- 1 | # scc for the Kepler 2 | kind: SecurityContextConstraints 3 | apiVersion: security.openshift.io/v1 4 | metadata: 5 | name: kepler-scc 6 | # To allow running privilegedContainers 7 | allowPrivilegedContainer: true 8 | allowHostDirVolumePlugin: true 9 | allowHostNetwork: false 10 | allowHostPorts: false 11 | allowHostIPC: false 12 | allowHostPID: true 13 | readOnlyRootFilesystem: true 14 | defaultAddCapabilities: 15 | - SYS_ADMIN 16 | runAsUser: 17 | type: RunAsAny 18 | seLinuxContext: 19 | type: RunAsAny 20 | fsGroup: 21 | type: RunAsAny 22 | volumes: 23 | - configMap 24 | - projected 25 | - emptyDir 26 | - hostPath 27 | - secret 28 | users: 29 | - kepler 30 | - system:serviceaccount:kepler:kepler-sa 31 | -------------------------------------------------------------------------------- /manifests/base/openshift/serve-only/kustomization.yaml: -------------------------------------------------------------------------------- 1 | namespace: kepler 2 | 3 | patchesStrategicMerge: 4 | - ./patch/patch-model-server.yaml 5 | - ./patch/patch-openshift.yaml 6 | 7 | resources: 8 | - ../kepler 9 | - ../server 10 | - ./openshift/scc.yaml 11 | -------------------------------------------------------------------------------- /manifests/base/patch/patch-estimator-sidecar.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: kepler-cfm 5 | namespace: kepler 6 | data: 7 | MODEL_CONFIG: | 8 | NODE_COMPONENTS_ESTIMATOR=true 9 | NODE_COMPONENTS_INIT_URL=https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-db/main/models/v0.7/ec2-0.7.11/rapl-sysfs/AbsPower/BPFOnly/SGDRegressorTrainer_0.zip 10 | NODE_TOTAL_ESTIMATOR=true 11 | 
NODE_TOTAL_INIT_URL=https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-db/main/models/v0.7/specpower-0.7.11/acpi/AbsPower/BPFOnly/SGDRegressorTrainer_0.zip 12 | --- 13 | apiVersion: apps/v1 14 | kind: DaemonSet 15 | metadata: 16 | name: kepler-exporter 17 | namespace: kepler 18 | spec: 19 | template: 20 | spec: 21 | containers: 22 | # kepler: wait for estimator socket 23 | - command: 24 | - /bin/sh 25 | - -c 26 | args: 27 | - until [ -e /tmp/estimator.sock ]; do sleep 1; done && /usr/bin/kepler -v=$(KEPLER_LOG_LEVEL) 28 | volumeMounts: 29 | - mountPath: /tmp 30 | name: tmp 31 | name: kepler-exporter 32 | # estimator container 33 | - image: kepler_model_server 34 | imagePullPolicy: IfNotPresent 35 | args: [estimator] 36 | name: estimator 37 | volumeMounts: 38 | - name: cfm 39 | mountPath: /etc/kepler/kepler.config 40 | readOnly: true 41 | - mountPath: /tmp 42 | name: tmp 43 | - mountPath: /mnt 44 | name: mnt 45 | volumes: 46 | - emptyDir: {} 47 | name: tmp 48 | - emptyDir: {} 49 | name: mnt 50 | -------------------------------------------------------------------------------- /manifests/base/patch/patch-model-server.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: kepler-cfm 5 | namespace: kepler 6 | data: 7 | MODEL_SERVER_ENABLE: "true" 8 | MODEL_SERVER_ENDPOINT: http://kepler-model-server.$(MODEL_SERVER_NAMESPACE).svc.cluster.local:$(MODEL_SERVER_PORT)/model 9 | MODEL_SERVER_PORT: | 10 | $(MODEL_SERVER_PORT) 11 | MODEL_SERVER_URL: http://kepler-model-server.$(MODEL_SERVER_NAMESPACE).svc.cluster.local:$(MODEL_SERVER_PORT) 12 | MODEL_SERVER_MODEL_REQ_PATH: /model 13 | MODEL_SERVER_MODEL_LIST_PATH: /best-models 14 | -------------------------------------------------------------------------------- /manifests/base/patch/patch-openshift.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | annotations: 5 | openshift.io/description: Kepler exporter 6 | openshift.io/display-name: "" 7 | name: kepler 8 | --- 9 | apiVersion: apps/v1 10 | kind: DaemonSet 11 | metadata: 12 | name: kepler-exporter 13 | namespace: kepler 14 | spec: 15 | template: 16 | spec: 17 | containers: 18 | - name: kepler-exporter 19 | volumeMounts: 20 | - name: kernel-src 21 | mountPath: /usr/src/kernels 22 | securityContext: 23 | privileged: true 24 | volumes: 25 | - name: kernel-src 26 | hostPath: 27 | path: /usr/src/kernels 28 | type: Directory 29 | -------------------------------------------------------------------------------- /manifests/base/patch/patch-server-only.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: kepler-cfm 5 | namespace: kepler 6 | data: 7 | MODEL_CONFIG: | 8 | NODE_COMPONENTS_TRAINER=SGDRegressorTrainer 9 | NODE_TOTAL_TRAINER=SGDRegressorTrainer 10 | -------------------------------------------------------------------------------- /manifests/base/serve-only/kustomization.yaml: -------------------------------------------------------------------------------- 1 | namespace: kepler 2 | 3 | apiVersion: kustomize.config.k8s.io/v1beta1 4 | kind: Kustomization 5 | images: 6 | - name: kepler_model_server 7 | newName: localhost:5001/kepler-model-server 8 | newTag: devel 9 | 10 | patchesStrategicMerge: 11 | - ./patch/patch-model-server.yaml 12 | - ./patch/patch-server-only.yaml 13 | resources: 14 | - ../kepler 
15 | - ../server 16 | -------------------------------------------------------------------------------- /manifests/compose/dev/kepler/common/var/lib/kepler/data/cpus.yaml: -------------------------------------------------------------------------------- 1 | ########## 2 | # CPUS - used to lookup uarch and channels by family, model, and stepping 3 | # The model and stepping fields will be interpreted as regular expressions 4 | # An empty stepping field means 'any' stepping 5 | 6 | ########## 7 | # Intel Core CPUs 8 | ########## 9 | # Haswell 10 | - core: HSW 11 | uarch: Haswell 12 | family: 6 13 | model: (50|69|70) 14 | stepping: 15 | 16 | # Broadwell 17 | - core: BDW 18 | uarch: Broadwell 19 | family: 6 20 | model: (61|71) 21 | stepping: 22 | 23 | # Skylake 24 | - core: SKL 25 | uarch: Skylake 26 | family: 6 27 | model: (78|94) 28 | stepping: 29 | 30 | # Kabylake 31 | - core: KBL 32 | uarch: Kaby Lake 33 | family: 6 34 | model: (142|158) 35 | stepping: 9 36 | 37 | # Coffelake 38 | - core: CFL 39 | uarch: Coffee Lake 40 | family: 6 41 | model: (142|158) 42 | stepping: (10|11|12|13) 43 | 44 | # Rocket Lake 45 | - core: RKL 46 | uarch: Cypress Cove 47 | family: 6 48 | model: 167 49 | stepping: 50 | 51 | # Tiger Lake 52 | - core: TGL 53 | uarch: Willow Cove 54 | family: 6 55 | model: (140|141) 56 | stepping: 57 | 58 | # Alder Lake 59 | - core: ADL 60 | uarch: Golden Cove 61 | family: 6 62 | model: (151|154) 63 | stepping: 64 | 65 | # Raptor Lake 66 | - core: RTL 67 | uarch: Raptor Cove 68 | family: 6 69 | model: 183 70 | stepping: 71 | 72 | ########## 73 | # Intel Xeon CPUs 74 | ########## 75 | # Haswell 76 | - core: HSX 77 | uarch: Haswell 78 | family: 6 79 | model: 63 80 | stepping: 81 | 82 | # Broadwell 83 | - core: BDX 84 | uarch: Broadwell 85 | family: 6 86 | model: (79|86) 87 | stepping: 88 | 89 | # Skylake 90 | - core: SKX 91 | uarch: Skylake 92 | family: 6 93 | model: 85 94 | stepping: (0|1|2|3|4) 95 | 96 | # Cascadelake 97 | - core: CLX 98 | uarch: Cascade Lake 99 | family: 6 100 | model: 85 101 | stepping: (5|6|7) 102 | 103 | # Cooperlake 104 | - core: CPX 105 | uarch: Cooper Lake 106 | family: 6 107 | model: 85 108 | stepping: 11 109 | 110 | # Icelake 111 | - core: ICX 112 | uarch: Sunny Cove 113 | family: 6 114 | model: (106|108) 115 | stepping: 116 | 117 | # Sapphire Rapids 118 | - core: SPR 119 | uarch: Sapphire Rapids 120 | family: 6 121 | model: 143 122 | stepping: 123 | 124 | # Emerald Rapids 125 | - core: EMR 126 | uarch: Emerald Rapids 127 | family: 6 128 | model: 207 129 | stepping: 130 | 131 | # Granite Rapids 132 | - core: GNR 133 | uarch: Granite Rapids 134 | family: 6 135 | model: 173 136 | stepping: 137 | 138 | # Sierra Forest 139 | - core: SRF 140 | uarch: Sierra Forest 141 | family: 6 142 | model: 175 143 | stepping: 144 | 145 | ########## 146 | # AMD CPUs 147 | ########## 148 | # Naples 149 | - core: Naples 150 | uarch: Zen 151 | family: 23 152 | model: 1 153 | stepping: 154 | 155 | # Rome 156 | - core: Rome 157 | uarch: Zen 2 158 | family: 23 159 | model: 49 160 | stepping: 161 | 162 | # Milan 163 | - core: Milan 164 | uarch: Zen 3 165 | family: 25 166 | model: 1 167 | stepping: 168 | 169 | # Genoa 170 | - core: Genoa 171 | uarch: Zen 4 172 | family: 25 173 | model: 17 174 | stepping: 175 | 176 | # Siena 177 | - core: Siena 178 | uarch: Zen 4c 179 | family: 25 180 | model: 160 181 | stepping: 182 | 183 | ########## 184 | # ARM CPUs 185 | ######### 186 | # AWS Graviton 2 187 | - core: Ares 188 | uarch: neoverse_n1 189 | family: 190 | model: 1 191 | stepping: r3p1 192 
| 193 | # AWS Graviton 3 194 | - core: Zeus 195 | uarch: neoverse_v1 196 | family: 197 | model: 1 198 | stepping: r1p1 199 | -------------------------------------------------------------------------------- /manifests/compose/dev/kepler/common/var/lib/kepler/data/model_weight/acpi_AbsPowerModel.json: -------------------------------------------------------------------------------- 1 | {"platform": {"All_Weights": {"Bias_Weight": 220.9079278650894, "Categorical_Variables": {}, "Numerical_Variables": {"bpf_cpu_time_ms": {"scale": 5911.969193263386, "mean": 0, "variance": 0, "weight": 29.028228361462897}}}}} 2 | -------------------------------------------------------------------------------- /manifests/compose/dev/kepler/common/var/lib/kepler/data/model_weight/acpi_DynPowerModel.json: -------------------------------------------------------------------------------- 1 | {"platform": {"All_Weights": {"Bias_Weight": 49.56491877218095, "Categorical_Variables": {}, "Numerical_Variables": {"bpf_cpu_time_ms": {"scale": 5911.969193263386, "mean": 0, "variance": 0, "weight": 28.501356366108837}}}}} 2 | -------------------------------------------------------------------------------- /manifests/compose/dev/kepler/common/var/lib/kepler/data/model_weight/intel_rapl_AbsPowerModel.json: -------------------------------------------------------------------------------- 1 | {"package": {"All_Weights": {"Bias_Weight": 69.91739430907396, "Categorical_Variables": {}, "Numerical_Variables": {"bpf_cpu_time_ms": {"scale": 5911.969193263386, "mean": 0, "variance": 0, "weight": 22.16772409328642}}}}, "core": {"All_Weights": {"Bias_Weight": 0.0, "Categorical_Variables": {}, "Numerical_Variables": {"bpf_cpu_time_ms": {"scale": 5911.969193263386, "mean": 0, "variance": 0, "weight": 0.0}}}}, "uncore": {"All_Weights": {"Bias_Weight": 0.0, "Categorical_Variables": {}, "Numerical_Variables": {"bpf_cpu_time_ms": {"scale": 5911.969193263386, "mean": 0, "variance": 0, "weight": 0.0}}}}, "dram": {"All_Weights": {"Bias_Weight": 47.142633336743344, "Categorical_Variables": {}, "Numerical_Variables": {"bpf_cpu_time_ms": {"scale": 5911.969193263386, "mean": 0, "variance": 0, "weight": 3.57348245077466}}}}} 2 | -------------------------------------------------------------------------------- /manifests/compose/dev/kepler/common/var/lib/kepler/data/model_weight/intel_rapl_DynPowerModel.json: -------------------------------------------------------------------------------- 1 | {"package": {"All_Weights": {"Bias_Weight": 38.856412561925055, "Categorical_Variables": {}, "Numerical_Variables": {"bpf_cpu_time_ms": {"scale": 5911.969193263386, "mean": 0, "variance": 0, "weight": 22.258830113477515}}}}, "core": {"All_Weights": {"Bias_Weight": 0.0, "Categorical_Variables": {}, "Numerical_Variables": {"bpf_cpu_time_ms": {"scale": 5911.969193263386, "mean": 0, "variance": 0, "weight": 0.0}}}}, "uncore": {"All_Weights": {"Bias_Weight": 0.0, "Categorical_Variables": {}, "Numerical_Variables": {"bpf_cpu_time_ms": {"scale": 5911.969193263386, "mean": 0, "variance": 0, "weight": 0.0}}}}, "dram": {"All_Weights": {"Bias_Weight": 9.080889901856153, "Categorical_Variables": {}, "Numerical_Variables": {"bpf_cpu_time_ms": {"scale": 5911.969193263386, "mean": 0, "variance": 0, "weight": 3.0358946796490924}}}}} 2 | -------------------------------------------------------------------------------- /manifests/compose/dev/kepler/metal/etc/kepler/kepler.config/ENABLE_PROCESS_METRICS: -------------------------------------------------------------------------------- 1 | 
true 2 | -------------------------------------------------------------------------------- /manifests/compose/dev/kepler/metal/etc/kepler/kepler.config/EXPOSE_ESTIMATED_IDLE_POWER_METRICS: -------------------------------------------------------------------------------- 1 | false 2 | -------------------------------------------------------------------------------- /manifests/compose/dev/kepler/metal/etc/kepler/kepler.config/EXPOSE_VM_METRICS: -------------------------------------------------------------------------------- 1 | true 2 | -------------------------------------------------------------------------------- /manifests/compose/dev/kepler/models/etc/kepler/kepler.config/ENABLE_PROCESS_METRICS: -------------------------------------------------------------------------------- 1 | true 2 | -------------------------------------------------------------------------------- /manifests/compose/dev/kepler/models/etc/kepler/kepler.config/EXPOSE_ESTIMATED_IDLE_POWER_METRICS: -------------------------------------------------------------------------------- 1 | false 2 | -------------------------------------------------------------------------------- /manifests/compose/dev/kepler/models/etc/kepler/kepler.config/MODEL_CONFIG: -------------------------------------------------------------------------------- 1 | NODE_TOTAL_ESTIMATOR=true 2 | NODE_TOTAL_INIT_URL=https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-db/main/models/v0.7/specpower-0.7.11/acpi/AbsPower/BPFOnly/GradientBoostingRegressorTrainer_0.zip 3 | NODE_COMPONENTS_ESTIMATOR=true 4 | NODE_COMPONENTS_INIT_URL=https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-db/main/models/v0.7/ec2-0.7.11/rapl-sysfs/AbsPower/BPFOnly/GradientBoostingRegressorTrainer_0.zip 5 | -------------------------------------------------------------------------------- /manifests/compose/dev/kepler/models/etc/kepler/kepler.config/MODEL_SERVER_ENABLE: -------------------------------------------------------------------------------- 1 | false 2 | -------------------------------------------------------------------------------- /manifests/compose/dev/kepler/models/etc/kepler/kepler.config/MODEL_SERVER_URL: -------------------------------------------------------------------------------- 1 | http://model-server:8100 2 | -------------------------------------------------------------------------------- /manifests/compose/dev/overrides.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | prometheus: 3 | networks: 4 | - kepler-models-network 5 | - kepler-metal-network 6 | - model-server-network 7 | 8 | volumes: 9 | - type: bind 10 | source: ../dev/prometheus/scrape-configs/dev.yaml 11 | target: /etc/prometheus/scrape-configs/dev.yaml 12 | 13 | grafana: 14 | environment: 15 | GF_DASHBOARDS_DEFAULT_HOME_DASHBOARD_PATH: /var/lib/grafana/dashboards/dev/dashboard.json 16 | volumes: 17 | - type: bind 18 | source: ../dev/grafana/dashboards/dev/ 19 | target: /var/lib/grafana/dashboards/dev 20 | -------------------------------------------------------------------------------- /manifests/compose/dev/prometheus/scrape-configs/dev.yaml: -------------------------------------------------------------------------------- 1 | scrape_configs: 2 | - job_name: models 3 | static_configs: 4 | - targets: [kepler-models:9100] 5 | 6 | - job_name: metal 7 | static_configs: 8 | - targets: [kepler-metal:9100] 9 | -------------------------------------------------------------------------------- 
/manifests/compose/monitoring/compose.yaml: -------------------------------------------------------------------------------- 1 | name: monitoring 2 | 3 | services: 4 | prometheus: 5 | build: 6 | context: ./prometheus 7 | ports: 8 | - 19090:9090 9 | volumes: 10 | - prom-data:/prometheus 11 | - type: bind 12 | source: ./prometheus/prometheus.yml 13 | target: /etc/prometheus/prometheus.yml 14 | networks: 15 | - monitoring 16 | 17 | healthcheck: 18 | test: wget -q --spider http://localhost:9090/ -O /dev/null || exit 1 19 | interval: ${HEALTHCHECK_INTERVAL:-50s} 20 | timeout: ${HEALTHCHECK_TIMEOUT:-30s} 21 | retries: ${HEALTHCHECK_RETRIES:-3} 22 | start_period: ${HEALTHCHECK_START_PERIOD:-1m} 23 | 24 | grafana: 25 | build: 26 | context: ./grafana 27 | environment: 28 | GF_AUTH_ANONYMOUS_ENABLED: "true" 29 | GF_SECURITY_ADMIN_PASSWORD: admin 30 | GF_AUTH_ANONYMOUS_ORG_ROLE: Admin 31 | 32 | user: "1000" # NOTE: change this to your `id -u` 33 | depends_on: 34 | - prometheus 35 | ports: 36 | - 13000:3000 37 | networks: 38 | - monitoring 39 | 40 | healthcheck: 41 | test: curl -f http://localhost:3000/ || exit 1 42 | interval: ${HEALTHCHECK_INTERVAL:-50s} 43 | timeout: ${HEALTHCHECK_TIMEOUT:-30s} 44 | retries: ${HEALTHCHECK_RETRIES:-3} 45 | start_period: ${HEALTHCHECK_START_PERIOD:-1m} 46 | 47 | volumes: 48 | # volume for holding prometheus (ts)db 49 | prom-data: 50 | 51 | networks: 52 | monitoring: 53 | -------------------------------------------------------------------------------- /manifests/compose/monitoring/grafana/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM quay.io/ceph/grafana:10.4.2 2 | 3 | COPY /datasource.yml /etc/grafana/provisioning/datasources/ 4 | COPY /dashboards.yml /etc/grafana/provisioning/dashboards/ 5 | -------------------------------------------------------------------------------- /manifests/compose/monitoring/grafana/dashboards.yml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | providers: 4 | # an unique provider name. Required 5 | - name: kepler 6 | # Org id. Default to 1 7 | orgId: 1 8 | # name of the dashboard folder. 9 | folder: kepler 10 | # provider type. Default to 'file' 11 | type: file 12 | # disable dashboard deletion 13 | disableDeletion: true 14 | # allow updating provisioned dashboards from the UI 15 | allowUiUpdates: true 16 | options: 17 | # path to dashboard files on disk. Required when using the 'file' type 18 | path: /var/lib/grafana/dashboards 19 | # use folder names from filesystem to create folders in Grafana 20 | foldersFromFilesStructure: true 21 | -------------------------------------------------------------------------------- /manifests/compose/monitoring/grafana/datasource.yml: -------------------------------------------------------------------------------- 1 | # config file version 2 | apiVersion: 1 3 | 4 | datasources: 5 | # name of the datasource. Required 6 | - name: kepler-prometheus 7 | # datasource type. Required 8 | type: prometheus 9 | # access mode. direct or proxy. Required 10 | access: proxy 11 | # org id. will default to orgId 1 if not specified 12 | orgId: 1 13 | # url 14 | url: http://prometheus:9090 15 | isDefault: true 16 | version: 1 17 | # allow users to edit datasources from the UI. 
18 | editable: true 19 | -------------------------------------------------------------------------------- /manifests/compose/monitoring/prometheus/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM quay.io/prometheus/prometheus:main 2 | 3 | COPY /prometheus.yml /etc/prometheus/prometheus.yml 4 | 5 | CMD [\ 6 | "--config.file=/etc/prometheus/prometheus.yml",\ 7 | "--storage.tsdb.path=/prometheus", \ 8 | "--web.enable-admin-api" \ 9 | ] 10 | -------------------------------------------------------------------------------- /manifests/compose/monitoring/prometheus/prometheus.yml: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: 5s # Set the scrape interval to every 5 seconds. Default is every 1 minute. 3 | evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute. 4 | # scrape_timeout is set to the global default (10s). 5 | 6 | # Attach these labels to any time series or alerts when communicating with 7 | # external systems (federation, remote storage, Alertmanager). 8 | external_labels: 9 | monitor: kepler 10 | 11 | # A scrape configuration containing exactly one endpoint to scrape: 12 | # Here it's Prometheus itself. 13 | scrape_configs: 14 | # The job name is added as a label `job=` to any timeseries scraped from this config. 15 | - job_name: prometheus 16 | # metrics_path defaults to '/metrics' 17 | # scheme defaults to 'http'. 18 | static_configs: 19 | - targets: [localhost:9090] 20 | 21 | # Load rules once and periodically evaluate them according to 22 | # the global 'evaluation_interval'. 23 | rule_files: 24 | - /etc/prometheus/rules/*.yaml 25 | - /etc/prometheus/rules/*.yml 26 | 27 | # additional scrape configs 28 | scrape_config_files: 29 | - /etc/prometheus/scrape-configs/*.yaml 30 | - /etc/prometheus/scrape-configs/*.yml 31 | 32 | # NOTE: e.g. 
to add more jobs to scrape a 33 | # VM with IP 192.168.122.78 on port 8888, 34 | # - job_name: 'vm' 35 | # static_configs: 36 | # - targets: ['192.168.122.100:8888'] 37 | -------------------------------------------------------------------------------- /manifests/compose/monitoring/prometheus/rules/kepler.rule: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/manifests/compose/monitoring/prometheus/rules/kepler.rule -------------------------------------------------------------------------------- /manifests/kepler/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | resources: 4 | - github.com/sustainable-computing-io/kepler/manifests/k8s/config/base 5 | 6 | patchesStrategicMerge: 7 | - ./patch/patch-ci.yaml 8 | images: 9 | - name: kepler 10 | newName: quay.io/sustainable_computing_io/kepler 11 | newTag: release-0.7.11 12 | -------------------------------------------------------------------------------- /manifests/kepler/patch/patch-ci.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: kepler-cfm 5 | namespace: system 6 | data: 7 | KEPLER_LOG_LEVEL: 4 8 | --- 9 | apiVersion: apps/v1 10 | kind: DaemonSet 11 | metadata: 12 | name: kepler-exporter 13 | namespace: system 14 | spec: 15 | template: 16 | spec: 17 | containers: 18 | - name: kepler-exporter 19 | imagePullPolicy: IfNotPresent 20 | image: kepler:latest 21 | -------------------------------------------------------------------------------- /manifests/offline-trainer/kustomization.yaml: -------------------------------------------------------------------------------- 1 | namespace: kepler 2 | 3 | resources: 4 | - offline-trainer.yaml 5 | 6 | images: 7 | - name: kepler_model_server 8 | newName: quay.io/sustainable_computing_io/kepler_model_server 9 | newTag: latest 10 | -------------------------------------------------------------------------------- /manifests/offline-trainer/offline-trainer.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: kepler-offline-trainer 5 | namespace: system 6 | labels: 7 | app.kubernetes.io/component: model-server 8 | app.kubernetes.io/name: kepler-model-server 9 | spec: 10 | replicas: 1 11 | selector: 12 | matchLabels: 13 | app.kubernetes.io/component: offline-trainer 14 | app.kubernetes.io/name: kepler-offline-trainer 15 | template: 16 | metadata: 17 | labels: 18 | app.kubernetes.io/component: offline-trainer 19 | app.kubernetes.io/name: kepler-offline-trainer 20 | spec: 21 | volumes: 22 | - name: cfm 23 | configMap: 24 | name: kepler-model-server-cfm 25 | - emptyDir: {} 26 | name: mnt 27 | containers: 28 | - name: offline-trainer 29 | image: kepler_model_server 30 | imagePullPolicy: Always 31 | ports: 32 | - containerPort: 8102 33 | name: http 34 | volumeMounts: 35 | - name: cfm 36 | mountPath: /etc/kepler/kepler.config 37 | readOnly: true 38 | - name: mnt 39 | mountPath: /mnt 40 | readOnly: false 41 | args: [offline-trainer] 42 | --- 43 | kind: Service 44 | apiVersion: v1 45 | metadata: 46 | name: kepler-offline-trainer 47 | namespace: system 48 | labels: 49 | app.kubernetes.io/component: offline-trainer 50 | app.kubernetes.io/name: 
kepler-offline-trainer 51 | spec: 52 | clusterIP: None 53 | selector: 54 | app.kubernetes.io/component: offline-trainer 55 | app.kubernetes.io/name: kepler-offline-trainer 56 | ports: 57 | - name: http 58 | port: 8102 59 | targetPort: http 60 | -------------------------------------------------------------------------------- /manifests/server/base/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - server.yaml 3 | 4 | apiVersion: kustomize.config.k8s.io/v1beta1 5 | kind: Kustomization 6 | vars: 7 | - name: MODEL_SERVER_NAMESPACE 8 | objref: 9 | kind: Deployment 10 | group: apps 11 | version: v1 12 | name: kepler-model-server 13 | fieldref: 14 | fieldpath: metadata.namespace 15 | - name: MODEL_SERVER_PORT 16 | objref: 17 | kind: Deployment 18 | group: apps 19 | version: v1 20 | name: kepler-model-server 21 | fieldref: 22 | fieldpath: spec.template.spec.containers[0].ports[0].containerPort 23 | 24 | configurations: 25 | - kustomizeconfig.yaml 26 | -------------------------------------------------------------------------------- /manifests/server/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - server.yaml 3 | 4 | apiVersion: kustomize.config.k8s.io/v1beta1 5 | kind: Kustomization 6 | vars: 7 | - fieldref: 8 | fieldPath: metadata.namespace 9 | name: MODEL_SERVER_NAMESPACE 10 | objref: 11 | group: apps 12 | kind: Deployment 13 | name: kepler-model-server 14 | version: v1 15 | - fieldref: 16 | fieldPath: spec.template.spec.containers[0].ports[0].containerPort 17 | name: MODEL_SERVER_PORT 18 | objref: 19 | group: apps 20 | kind: Deployment 21 | name: kepler-model-server 22 | version: v1 23 | 24 | configurations: 25 | - kustomizeconfig.yaml 26 | images: 27 | - name: kepler_model_server 28 | newName: quay.io/sustainable_computing_io/kepler_model_server 29 | newTag: latest 30 | -------------------------------------------------------------------------------- /manifests/server/kustomizeconfig.yaml: -------------------------------------------------------------------------------- 1 | varReference: 2 | - kind: ConfigMap 3 | group: "" 4 | version: v1 5 | name: kepler-cfm 6 | path: data/MODEL_SERVER_ENDPOINT 7 | - kind: ConfigMap 8 | group: "" 9 | version: v1 10 | name: kepler-cfm 11 | path: data/MODEL_SERVER_URL 12 | - kind: ConfigMap 13 | group: "" 14 | version: v1 15 | name: kepler-cfm 16 | path: data/MODEL_SERVER_PORT 17 | -------------------------------------------------------------------------------- /manifests/server/online-train/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - server.yaml 3 | 4 | patchesStrategicMerge: 5 | - ./online-train/patch-trainer.yaml 6 | 7 | apiVersion: kustomize.config.k8s.io/v1beta1 8 | kind: Kustomization 9 | vars: 10 | - name: MODEL_SERVER_NAMESPACE 11 | objref: 12 | kind: Deployment 13 | group: apps 14 | version: v1 15 | name: kepler-model-server 16 | fieldref: 17 | fieldpath: metadata.namespace 18 | - name: MODEL_SERVER_PORT 19 | objref: 20 | kind: Deployment 21 | group: apps 22 | version: v1 23 | name: kepler-model-server 24 | fieldref: 25 | fieldpath: spec.template.spec.containers[0].ports[0].containerPort 26 | 27 | configurations: 28 | - kustomizeconfig.yaml 29 | -------------------------------------------------------------------------------- /manifests/server/online-train/patch-trainer.yaml: 
-------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: kepler-model-server-cfm 5 | namespace: kepler 6 | data: 7 | PROM_SERVER: http://prometheus-k8s.monitoring.svc.cluster.local:9090 8 | PROM_QUERY_INTERVAL: 20 9 | PROM_QUERY_STEP: 3 10 | PROM_SSL_DISABLE: true 11 | --- 12 | apiVersion: apps/v1 13 | kind: Deployment 14 | metadata: 15 | name: kepler-model-server 16 | namespace: kepler 17 | spec: 18 | template: 19 | spec: 20 | containers: 21 | - name: server-api 22 | - name: online-trainer 23 | image: kepler_model_server 24 | imagePullPolicy: IfNotPresent 25 | volumeMounts: 26 | - name: cfm 27 | mountPath: /etc/kepler/kepler.config 28 | readOnly: true 29 | - name: mnt 30 | mountPath: /mnt 31 | readOnly: false 32 | args: [online-trainer] 33 | -------------------------------------------------------------------------------- /manifests/server/openshift/online-train/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - server.yaml 3 | 4 | patchesStrategicMerge: 5 | - ./openshift/patch-openshift.yaml 6 | - ./online-train/patch-trainer.yaml 7 | - ./openshift/online-train/patch-trainer.yaml 8 | 9 | apiVersion: kustomize.config.k8s.io/v1beta1 10 | kind: Kustomization 11 | vars: 12 | - name: MODEL_SERVER_NAMESPACE 13 | objref: 14 | kind: Deployment 15 | group: apps 16 | version: v1 17 | name: kepler-model-server 18 | fieldref: 19 | fieldpath: metadata.namespace 20 | - name: MODEL_SERVER_PORT 21 | objref: 22 | kind: Deployment 23 | group: apps 24 | version: v1 25 | name: kepler-model-server 26 | fieldref: 27 | fieldpath: spec.template.spec.containers[0].ports[0].containerPort 28 | 29 | configurations: 30 | - kustomizeconfig.yaml 31 | -------------------------------------------------------------------------------- /manifests/server/openshift/online-train/patch-trainer.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: kepler-model-server-cfm 5 | namespace: system 6 | data: 7 | PROM_SERVER: http://prometheus-operated.openshift-monitoring.svc.cluster.local:9090 8 | PROM_QUERY_INTERVAL: 20 9 | PROM_QUERY_STEP: 3 10 | PROM_SSL_DISABLE: true 11 | --- 12 | apiVersion: apps/v1 13 | kind: Deployment 14 | metadata: 15 | name: kepler-model-server 16 | namespace: system 17 | spec: 18 | template: 19 | spec: 20 | containers: 21 | - name: server-api 22 | - name: online-trainer 23 | securityContext: 24 | privileged: true 25 | -------------------------------------------------------------------------------- /manifests/server/openshift/patch-openshift.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: kepler-model-server 5 | namespace: system 6 | spec: 7 | template: 8 | spec: 9 | serviceAccountName: kepler-sa 10 | containers: 11 | - name: server-api 12 | securityContext: 13 | privileged: true 14 | -------------------------------------------------------------------------------- /manifests/server/openshift/serve-only/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - server.yaml 3 | 4 | patchesStrategicMerge: 5 | - ./openshift/patch-openshift.yaml 6 | 7 | apiVersion: kustomize.config.k8s.io/v1beta1 8 | kind: Kustomization 9 | vars: 10 | - name: MODEL_SERVER_NAMESPACE 11 | objref: 12 | kind: 
Deployment 13 | group: apps 14 | version: v1 15 | name: kepler-model-server 16 | fieldref: 17 | fieldpath: metadata.namespace 18 | - name: MODEL_SERVER_PORT 19 | objref: 20 | kind: Deployment 21 | group: apps 22 | version: v1 23 | name: kepler-model-server 24 | fieldref: 25 | fieldpath: spec.template.spec.containers[0].ports[0].containerPort 26 | 27 | configurations: 28 | - kustomizeconfig.yaml 29 | -------------------------------------------------------------------------------- /manifests/server/server.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: kepler-model-server-cfm 5 | namespace: system 6 | --- 7 | apiVersion: apps/v1 8 | kind: Deployment 9 | metadata: 10 | name: kepler-model-server 11 | namespace: system 12 | labels: 13 | app.kubernetes.io/component: model-server 14 | app.kubernetes.io/name: kepler-model-server 15 | spec: 16 | replicas: 1 17 | selector: 18 | matchLabels: 19 | app.kubernetes.io/component: model-server 20 | app.kubernetes.io/name: kepler-model-server 21 | template: 22 | metadata: 23 | labels: 24 | app.kubernetes.io/component: model-server 25 | app.kubernetes.io/name: kepler-model-server 26 | spec: 27 | volumes: 28 | - name: cfm 29 | configMap: 30 | name: kepler-model-server-cfm 31 | - emptyDir: {} 32 | name: mnt 33 | containers: 34 | - name: server-api 35 | image: kepler_model_server 36 | imagePullPolicy: IfNotPresent 37 | ports: 38 | - containerPort: 8100 39 | name: http 40 | volumeMounts: 41 | - name: cfm 42 | mountPath: /etc/kepler/kepler.config 43 | readOnly: true 44 | - name: mnt 45 | mountPath: /mnt 46 | readOnly: false 47 | args: [model-server] 48 | --- 49 | kind: Service 50 | apiVersion: v1 51 | metadata: 52 | name: kepler-model-server 53 | namespace: system 54 | labels: 55 | app.kubernetes.io/component: model-server 56 | app.kubernetes.io/name: kepler-model-server 57 | spec: 58 | clusterIP: None 59 | selector: 60 | app.kubernetes.io/component: model-server 61 | app.kubernetes.io/name: kepler-model-server 62 | ports: 63 | - name: http 64 | port: 8100 65 | targetPort: http 66 | -------------------------------------------------------------------------------- /manifests/set.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # This file is part of the Kepler project 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | # Copyright 2022 The Kepler Contributors 18 | # 19 | 20 | # set options 21 | # for example: ./set.sh "ESTIMATOR SERVER" 22 | unset $SERVER 23 | unset $ONLINE_TRAINER 24 | unset $ESTIMATOR 25 | unset $OPENSHIFT_DEPLOY 26 | 27 | DEPLOY_OPTIONS=$1 28 | for opt in ${DEPLOY_OPTIONS}; do export $opt=true; done; 29 | 30 | echo DEPLOY_OPTIONS=${DEPLOY_OPTIONS} 31 | 32 | version=$(kubectl version| grep 'Client Version' | sed 's/.*v//g' | cut -b -4) 33 | if [ 1 -eq "$(echo "${version} < 1.21" | bc)" ] 34 | then 35 | echo "You need to update your kubectl version to 1.21+ to support kustomize" 36 | exit 1 37 | fi 38 | 39 | echo "Preparing manifests..." 40 | 41 | if [ ! -z ${SERVER} ]; then 42 | echo "deploy model server" 43 | if [ ! -z ${ESTIMATOR} ]; then 44 | echo "add estimator-sidecar" 45 | # OPTS="ESTIMATOR SERVER" --> base 46 | cp ./manifests/base/estimate-with-server/kustomization.yaml ./manifests/base/kustomization.yaml 47 | if [ ! -z ${OPENSHIFT_DEPLOY} ]; then 48 | echo "patch openshift deployment for exporter (estimator-with-server)" 49 | # OPTS="ESTIMATOR SERVER OPENSHIFT_DEPLOY" --> base 50 | cp ./manifests/base/openshift/estimate-with-server/kustomization.yaml ./manifests/base/kustomization.yaml 51 | fi 52 | else 53 | # OPTS="SERVER" --> base 54 | cp ./manifests/base/serve-only/kustomization.yaml ./manifests/base/kustomization.yaml 55 | if [ ! -z ${OPENSHIFT_DEPLOY} ]; then 56 | echo "patch openshift deployment for exporter (serve-only)" 57 | # OPTS="SERVER OPENSHIFT_DEPLOY" --> base 58 | cp ./manifests/base/openshift/serve-only/kustomization.yaml ./manifests/base/kustomization.yaml 59 | fi 60 | fi 61 | 62 | if [ ! -z ${ONLINE_TRAINER} ]; then 63 | echo "add online trainer" 64 | # OPTS="... SERVER ONLINE_TRAINER" --> server 65 | cp ./manifests/server/online-train/kustomization.yaml ./manifests/server/kustomization.yaml 66 | if [ ! -z ${OPENSHIFT_DEPLOY} ]; then 67 | echo "patch openshift deployment for server (with online trainer)" 68 | # OPTS="... SERVER ONLINE_TRAINER OPENSHIFT_DEPLOY" --> server 69 | cp ./manifests/server/openshift/online-train/kustomization.yaml ./manifests/server/kustomization.yaml 70 | fi 71 | else 72 | # OPTS="... SERVER" --> server 73 | cp ./manifests/server/base/kustomization.yaml ./manifests/server/kustomization.yaml 74 | if [ ! -z ${OPENSHIFT_DEPLOY} ]; then 75 | echo "patch openshift deployment for server" 76 | # OPTS="... SERVER OPENSHIFT_DEPLOY" --> server 77 | cp ./manifests/server/openshift/serve-only/kustomization.yaml ./manifests/server/kustomization.yaml 78 | fi 79 | fi 80 | elif [ ! -z ${ESTIMATOR} ]; then 81 | echo "add estimator-sidecar" 82 | # OPTS="ESTIMATOR" --> base 83 | cp ./manifests/base/estimate-only/kustomization.yaml ./manifests/base/kustomization.yaml 84 | if [ ! 
-z ${OPENSHIFT_DEPLOY} ]; then 85 | echo "patch openshift deployment for exporter (estimator-only)" 86 | # OPTS="ESTIMATOR OPENSHIFT_DEPLOY" --> base 87 | cp ./manifests/base/openshift/estimate-only/kustomization.yaml ./manifests/base/kustomization.yaml 88 | fi 89 | fi 90 | 91 | for opt in ${DEPLOY_OPTIONS}; do unset $opt; done; 92 | 93 | echo "Done $0" -------------------------------------------------------------------------------- /manifests/test/file-server.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: model-db 5 | namespace: kepler 6 | labels: 7 | app.kubernetes.io/component: model-db 8 | spec: 9 | containers: 10 | - name: file-server 11 | image: localhost:5001/kepler_model_server:devel-test 12 | imagePullPolicy: IfNotPresent 13 | args: [python3, tests/http_server.py] 14 | ports: 15 | - containerPort: 8110 16 | name: http 17 | volumeMounts: 18 | - name: mnt 19 | mountPath: /mnt 20 | initContainers: 21 | - name: trainer 22 | image: localhost:5001/kepler_model_server:devel-test 23 | imagePullPolicy: IfNotPresent 24 | args: [python3, tests/minimal_trainer.py] 25 | volumeMounts: 26 | - name: mnt 27 | mountPath: /mnt 28 | # Add other init container configurations here 29 | volumes: 30 | - name: mnt 31 | emptyDir: {} 32 | --- 33 | kind: Service 34 | apiVersion: v1 35 | metadata: 36 | name: model-db 37 | namespace: kepler 38 | labels: 39 | app.kubernetes.io/component: model-db 40 | spec: 41 | clusterIP: None 42 | selector: 43 | app.kubernetes.io/component: model-db 44 | ports: 45 | - name: http 46 | port: 8110 47 | targetPort: http 48 | -------------------------------------------------------------------------------- /manifests/test/model-request-client.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: kepler-exporter 5 | namespace: kepler 6 | spec: 7 | template: 8 | spec: 9 | containers: 10 | - name: kepler-exporter 11 | image: localhost:5001/kepler_model_server:devel-test 12 | imagePullPolicy: IfNotPresent 13 | command: [/bin/bash, -c] 14 | args: [python3 tests/weight_model_request_test.py && echo Done && sleep infinity] 15 | volumeMounts: 16 | - name: cfm 17 | mountPath: /etc/kepler/kepler.config 18 | readOnly: true 19 | - mountPath: /tmp 20 | name: tmp 21 | volumes: 22 | - emptyDir: {} 23 | name: tmp 24 | -------------------------------------------------------------------------------- /manifests/test/patch-estimator-sidecar.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | MODEL_CONFIG: | 3 | NODE_COMPONENTS_ESTIMATOR=true 4 | NODE_COMPONENTS_INIT_URL=http://model-db.kepler.svc.cluster.local:8110/std_v0.7.11/rapl-sysfs/AbsPower/BPFOnly/GradientBoostingRegressorTrainer_0.zip 5 | NODE_TOTAL_ESTIMATOR=true 6 | NODE_TOTAL_INIT_URL=http://model-db.kepler.svc.cluster.local:8110/std_v0.7.11/acpi/AbsPower/BPFOnly/GradientBoostingRegressorTrainer_0.zip 7 | -------------------------------------------------------------------------------- /manifests/test/power-request-client.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: kepler-exporter 5 | namespace: kepler 6 | spec: 7 | template: 8 | spec: 9 | containers: 10 | - name: kepler-exporter 11 | image: localhost:5001/kepler_model_server:devel-test 12 | imagePullPolicy: IfNotPresent 13 | command: 
[/bin/bash, -c] 14 | args: ["until [ -e /tmp/estimator.sock ]; do sleep 1; done && python3 -u tests/estimator_power_request_test.py && echo Done && sleep infinity"] 15 | volumeMounts: 16 | - name: cfm 17 | mountPath: /etc/kepler/kepler.config 18 | readOnly: true 19 | - mountPath: /tmp 20 | name: tmp 21 | - name: estimator 22 | volumes: 23 | - emptyDir: {} 24 | name: tmp 25 | -------------------------------------------------------------------------------- /model_training/README.md: -------------------------------------------------------------------------------- 1 | # Contribute to power profiling and model training 2 | 3 | 4 | 5 | - [Contribute to power profiling and model training](#contribute-to-power-profiling-and-model-training) 6 | - [Requirements](#requirements) 7 | - [Pre-step](#pre-step) 8 | - [Setup](#setup) 9 | - [Prepare cluster](#prepare-cluster) 10 | - [From scratch (no target kubernetes cluster)](#from-scratch-no-target-kubernetes-cluster) 11 | - [For managed cluster](#for-managed-cluster) 12 | - [Run benchmark and collect metrics](#run-benchmark-and-collect-metrics) 13 | - [With manual execution](#with-manual-execution) 14 | - [Clean up](#clean-up) 15 | 16 | 17 | 18 | ## Requirements 19 | 20 | - git > 2.22 21 | - hatch 22 | - kubectl 23 | - yq, jq 24 | - power meter if available 25 | 26 | ## Pre-step 27 | 28 | - Fork and clone this repository and move to the `model_training` folder 29 | 30 | ```bash 31 | git clone 32 | cd model_training 33 | ``` 34 | 35 | ## Setup 36 | 37 | ### Prepare cluster 38 | 39 | ### From scratch (no target kubernetes cluster) 40 | 41 | > Note: ports 9090 and 5101 must not be in use; they are used to port-forward Prometheus and the kind registry, respectively. 42 | 43 | ```bash 44 | ./script.sh prepare_cluster 45 | ``` 46 | 47 | The script will: 48 | 49 | - create a kind cluster `kind-for-training` with a registry at port `5101`. 50 | - deploy Prometheus. 51 | - deploy Prometheus RBAC and a node port on port `30090` of the kind node, which is forwarded to port `9090` on the host. 52 | - deploy a service monitor for Kepler and reload the Prometheus server configuration. 53 | 54 | ### For managed cluster 55 | 56 | Please confirm the following requirements: 57 | 58 | - Kepler installation 59 | - Prometheus installation 60 | - Kepler metrics are exported to the Prometheus server 61 | - Prometheus server is available at `http://localhost:9090`. Otherwise, set the `PROM_SERVER` environment variable (see the example below).
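For example (a minimal sketch; the endpoint below is a placeholder, not a value taken from this repository), point the training tooling at your own Prometheus instance before collecting metrics:

```bash
# Placeholder endpoint - replace with the address of your Prometheus server.
export PROM_SERVER=http://my-prometheus.example.com:9090
```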
62 | 63 | ### Run benchmark and collect metrics 64 | 65 | - [Tekton Pipeline Instruction](./tekton/README.md) 66 | 67 | ### With manual execution 68 | 69 | In addition to the above approach, you can manually run your own benchmarks, then collect, train, and export the models by the entrypoint 70 | 71 | [Manual Metric Collection and Training with Entrypoint](./cmd_instruction.md) 72 | 73 | ## Clean up 74 | 75 | For kind-for-training cluster: 76 | 77 | ```bash 78 | ./script.sh cleanup 79 | ``` 80 | -------------------------------------------------------------------------------- /model_training/deployment/prom-kepler-rbac.yaml: -------------------------------------------------------------------------------- 1 | kind: Role 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | metadata: 4 | name: prometheus-k8s 5 | namespace: kepler 6 | labels: 7 | app.kubernetes.io/component: prometheus 8 | app.kubernetes.io/instance: k8s 9 | app.kubernetes.io/name: prometheus 10 | rules: 11 | - verbs: 12 | - get 13 | - list 14 | - watch 15 | apiGroups: 16 | - "" # yamllint disable-line rule:quoted-strings 17 | resources: 18 | - services 19 | - endpoints 20 | - pods 21 | - verbs: 22 | - get 23 | - list 24 | - watch 25 | apiGroups: 26 | - extensions 27 | resources: 28 | - ingresses 29 | - verbs: 30 | - get 31 | - list 32 | - watch 33 | apiGroups: 34 | - networking.k8s.io 35 | resources: 36 | - ingresses 37 | --- 38 | kind: RoleBinding 39 | apiVersion: rbac.authorization.k8s.io/v1 40 | metadata: 41 | name: prometheus-k8s 42 | namespace: kepler 43 | labels: 44 | app.kubernetes.io/component: prometheus 45 | app.kubernetes.io/instance: k8s 46 | app.kubernetes.io/name: prometheus 47 | subjects: 48 | - kind: ServiceAccount 49 | name: prometheus-k8s 50 | namespace: monitoring 51 | roleRef: 52 | apiGroup: rbac.authorization.k8s.io 53 | kind: Role 54 | name: prometheus-k8s 55 | -------------------------------------------------------------------------------- /model_training/deployment/prom-np.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | labels: 5 | app.kubernetes.io/component: prometheus 6 | app.kubernetes.io/instance: k8s 7 | app.kubernetes.io/name: prometheus 8 | app.kubernetes.io/part-of: kube-prometheus 9 | name: prometheus-k8s-np 10 | namespace: monitoring 11 | spec: 12 | ports: 13 | - name: web 14 | port: 9090 15 | protocol: TCP 16 | targetPort: web 17 | nodePort: 30090 18 | selector: 19 | app.kubernetes.io/component: prometheus 20 | app.kubernetes.io/instance: k8s 21 | app.kubernetes.io/name: prometheus 22 | app.kubernetes.io/part-of: kube-prometheus 23 | type: NodePort 24 | -------------------------------------------------------------------------------- /model_training/s3/Dockerfile: -------------------------------------------------------------------------------- 1 | # NOTE: Dockerfile for generating quay.io/kepler_model_server/s3 images 2 | 3 | FROM python:3.10-slim 4 | 5 | WORKDIR /usr/local 6 | 7 | COPY . /usr/local 8 | RUN pip install --no-cache-dir . && \ 9 | pip cache purge 10 | -------------------------------------------------------------------------------- /model_training/s3/README.md: -------------------------------------------------------------------------------- 1 | # S3-Pusher 2 | 3 | A simple script and Dockerfile to push model_training/data folder to s3 bucket. 
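As a rough illustration of how the pusher can be invoked (a sketch based on the CLI flags defined in `pusher.py` and `util.py`; every credential, bucket, and file name below is a placeholder rather than a documented default):

```bash
# Illustrative only - all values are placeholders, not defaults from this repository.
s3-pusher aws \
  --aws-access-key-id "$AWS_ACCESS_KEY_ID" \
  --aws-secret-access-key "$AWS_SECRET_ACCESS_KEY" \
  --region-name us-east-1 \
  --bucket-name my-kepler-models \
  --mnt-path /mnt \
  --machine-id my-machine \
  --query-data kepler_query \
  --idle-data idle
```

The `ibmcloud` subcommand works the same way, taking `--api-key`, `--service-instance-id`, and `--service-endpoint` in place of the AWS credentials.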
4 | -------------------------------------------------------------------------------- /model_training/s3/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "s3" 7 | dynamic = ["version"] 8 | description = '' 9 | readme = "README.md" 10 | requires-python = ">=3.10" 11 | license = "Apache-2.0" 12 | keywords = [] 13 | authors = [ 14 | { name = "Sunyanan Choochotkaew", email = "sunyanan.choochotkaew1@ibm.com" }, 15 | ] 16 | classifiers = [ 17 | "Programming Language :: Python", 18 | "Programming Language :: Python :: 3.10", 19 | "Programming Language :: Python :: Implementation :: CPython", 20 | "Programming Language :: Python :: Implementation :: PyPy", 21 | ] 22 | dependencies = [ 23 | "boto3", 24 | "ibm-cos-sdk", 25 | ] 26 | 27 | [project.urls] 28 | Documentation = "https://github.com/sustainable-computing-io/kepler-model-server#readme" 29 | Issues = "https://github.com/sustainable-computing-io/kepler-model-server/issues" 30 | Source = "https://github.com/sustainable-computing-io/kepler-model-server" 31 | 32 | [project.scripts] 33 | s3-loader = "s3.loader:run" 34 | s3-pusher = "s3.pusher:run" 35 | 36 | [tool.hatch.version] 37 | path = "src/s3/__about__.py" 38 | 39 | [tool.hatch.envs.default] 40 | python = "3.10" 41 | 42 | [tool.hatch.envs.types] 43 | extra-dependencies = [ 44 | "mypy>=1.0.0", 45 | ] 46 | [tool.hatch.envs.types.scripts] 47 | check = "mypy --install-types --non-interactive {args:src/s3 tests}" 48 | 49 | [tool.coverage.run] 50 | source_pkgs = ["s3", "tests"] 51 | branch = true 52 | parallel = true 53 | omit = [ 54 | "src/s3/__about__.py", 55 | ] 56 | 57 | [tool.coverage.paths] 58 | s3 = ["src/s3", "*/s3/src/s3"] 59 | tests = ["tests", "*/s3/tests"] 60 | 61 | [tool.coverage.report] 62 | exclude_lines = [ 63 | "no cov", 64 | "if __name__ == .__main__.:", 65 | "if TYPE_CHECKING:", 66 | ] 67 | -------------------------------------------------------------------------------- /model_training/s3/src/s3/__about__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2024-present Sunil Thaha 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | __version__ = "0.7.11" 5 | -------------------------------------------------------------------------------- /model_training/s3/src/s3/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2024-present Sunil Thaha 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /model_training/s3/src/s3/loader.py: -------------------------------------------------------------------------------- 1 | ## get client 2 | # client = new__client(args) 3 | ## upload all files in mnt path 4 | # _upload(client, mnt_path) 5 | import argparse 6 | import os 7 | 8 | from . 
import util 9 | 10 | model_dir = "models" 11 | data_dir = "data" 12 | machine_spec_dir = "machine_spec" 13 | 14 | 15 | def aws_list_keys(client, bucket_name, prefix): 16 | response = client.list_objects_v2(Bucket=bucket_name, Prefix=prefix) 17 | return [obj["Key"] for obj in response.get("Contents", [])] 18 | 19 | 20 | def ibmcloud_list_keys(client, bucket_name, prefix): 21 | bucket_obj = client.Bucket(bucket_name) 22 | data_response = bucket_obj.objects.filter(Prefix=prefix) 23 | return [obj.key for obj in data_response] 24 | 25 | 26 | def get_bucket_file_map(client, bucket_name, machine_id, mnt_path, pipeline_name, list_func): 27 | bucket_file_map = dict() 28 | top_key_path = "" 29 | if machine_id is not None and machine_id != "": 30 | top_key_path = "/" + machine_id 31 | # add data key map 32 | data_path = os.path.join(mnt_path, data_dir) 33 | datapath_prefix = top_key_path + "/data" 34 | keys = list_func(client, bucket_name, datapath_prefix) 35 | for key in keys: 36 | filepath = key.replace(datapath_prefix, data_path) 37 | bucket_file_map[key] = filepath 38 | # add model key map 39 | model_path = os.path.join(mnt_path, model_dir, pipeline_name) 40 | model_predix = "/models/" + pipeline_name 41 | keys = list_func(client, bucket_name, model_predix) 42 | for key in keys: 43 | filepath = key.replace(model_predix, model_path) 44 | bucket_file_map[key] = filepath 45 | return bucket_file_map 46 | 47 | 48 | def aws_download(client, bucket_name, machine_id, mnt_path, pipeline_name): 49 | print("AWS Download") 50 | bucket_file_map = get_bucket_file_map(client, bucket_name, machine_id=machine_id, mnt_path=mnt_path, pipeline_name=pipeline_name, list_func=aws_list_keys) 51 | for key, filepath in bucket_file_map.items(): 52 | print(key, filepath) 53 | dir = os.path.dirname(filepath) 54 | if not os.path.exists(dir): 55 | os.makedirs(dir) 56 | client.download_file(bucket_name, key, filepath) 57 | 58 | 59 | def ibm_download(client, bucket_name, machine_id, mnt_path, pipeline_name): 60 | print("IBM Download") 61 | bucket_file_map = get_bucket_file_map( 62 | client, bucket_name, machine_id=machine_id, mnt_path=mnt_path, pipeline_name=pipeline_name, list_func=ibmcloud_list_keys 63 | ) 64 | for key, filepath in bucket_file_map.items(): 65 | print(key, filepath) 66 | dir = os.path.dirname(filepath) 67 | if not os.path.exists(dir): 68 | os.makedirs(dir) 69 | client.Bucket(bucket_name).download_file(key, filepath) 70 | 71 | 72 | def add_common_args(subparser): 73 | subparser.add_argument("--bucket-name", help="Bucket name", required=True) 74 | subparser.add_argument("--mnt-path", help="Mount path", required=True) 75 | subparser.add_argument("--pipeline-name", help="Pipeline name") 76 | subparser.add_argument("--machine-id", help="Machine ID") 77 | 78 | 79 | def run(): 80 | parser = argparse.ArgumentParser(description="S3 Pusher") 81 | args = util.get_command(parser, add_common_args, ibm_download, aws_download) 82 | if hasattr(args, "new_client_func") and hasattr(args, "func"): 83 | client = args.new_client_func(args) 84 | args.func(client, args.bucket_name, args.machine_id, args.mnt_path, args.pipeline_name) 85 | else: 86 | parser.print_help() 87 | 88 | 89 | if __name__ == "__main__": 90 | run() 91 | -------------------------------------------------------------------------------- /model_training/s3/src/s3/pusher.py: -------------------------------------------------------------------------------- 1 | ## get client 2 | # client = new__client(args) 3 | ## upload all files in mnt path 4 | # _upload(client, 
mnt_path) 5 | import argparse 6 | import os 7 | 8 | from . import util 9 | 10 | model_dir = "models" 11 | data_dir = "data" 12 | machine_spec_dir = "machine_spec" 13 | 14 | 15 | def get_bucket_file_map(machine_id, mnt_path, query_data, idle_data): 16 | model_path = os.path.join(mnt_path, model_dir) 17 | bucket_file_map = dict() 18 | top_key_path = "" 19 | if machine_id is not None and machine_id != "": 20 | top_key_path = "/" + machine_id 21 | if os.path.exists(model_path): 22 | for root, _, files in os.walk(model_path): 23 | for file in files: 24 | filepath = os.path.join(root, file) 25 | key = filepath.replace(model_path, "/models") 26 | bucket_file_map[key] = filepath 27 | data_path = os.path.join(mnt_path, data_dir) 28 | for data_filename in [query_data, idle_data]: 29 | if data_filename is not None: 30 | filepath = os.path.join(data_path, data_filename + ".json") 31 | if os.path.exists(filepath): 32 | key = filepath.replace(data_path, top_key_path + "/data") 33 | bucket_file_map[key] = filepath 34 | filepath = os.path.join(data_path, machine_spec_dir, machine_id + ".json") 35 | if os.path.exists(filepath): 36 | key = filepath.replace(data_path, top_key_path + "/data") 37 | bucket_file_map[key] = filepath 38 | return bucket_file_map 39 | 40 | 41 | def aws_upload(client, bucket_name, machine_id, mnt_path, query_data, idle_data): 42 | print("AWS Upload") 43 | bucket_file_map = get_bucket_file_map(machine_id=machine_id, mnt_path=mnt_path, query_data=query_data, idle_data=idle_data) 44 | for key, filepath in bucket_file_map.items(): 45 | print(key, filepath) 46 | client.upload_file(filepath, bucket_name, key) 47 | 48 | 49 | def ibm_upload(client, bucket_name, machine_id, mnt_path, query_data, idle_data): 50 | print("IBM Upload") 51 | bucket_file_map = get_bucket_file_map(machine_id=machine_id, mnt_path=mnt_path, query_data=query_data, idle_data=idle_data) 52 | for key, filepath in bucket_file_map.items(): 53 | print(key, filepath) 54 | client.Object(bucket_name, key).upload_file(filepath) 55 | 56 | 57 | def add_common_args(subparser): 58 | subparser.add_argument("--bucket-name", help="Bucket name", required=True) 59 | subparser.add_argument("--mnt-path", help="Mount path", required=True) 60 | subparser.add_argument("--query-data", help="Query data filename") 61 | subparser.add_argument("--idle-data", help="Idle data filename") 62 | subparser.add_argument("--machine-id", help="Machine ID") 63 | 64 | 65 | def run(): 66 | parser = argparse.ArgumentParser(description="S3 Pusher") 67 | args = util.get_command(parser, add_common_args, ibm_upload, aws_upload) 68 | if hasattr(args, "new_client_func") and hasattr(args, "func"): 69 | client = args.new_client_func(args) 70 | args.func(client, args.bucket_name, args.machine_id, args.mnt_path, args.query_data, args.idle_data) 71 | else: 72 | parser.print_help() 73 | 74 | 75 | if __name__ == "__main__": 76 | run() 77 | -------------------------------------------------------------------------------- /model_training/s3/src/s3/util.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import s3.__about__ as about 4 | 5 | 6 | def new_ibm_client(args): 7 | import ibm_boto3 8 | from ibm_botocore.client import Config 9 | 10 | cos = ibm_boto3.resource( 11 | "s3", 12 | ibm_api_key_id=args.api_key, 13 | ibm_service_instance_id=args.service_instance_id, 14 | config=Config(signature_version="oauth"), 15 | endpoint_url=args.service_endpoint, 16 | ) 17 | return cos 18 | 19 | 20 | def new_aws_client(args): 21 
| import boto3 as aws_boto3 22 | 23 | s3 = aws_boto3.client("s3", aws_access_key_id=args.aws_access_key_id, aws_secret_access_key=args.aws_secret_access_key, region_name=args.region_name) 24 | return s3 25 | 26 | 27 | def get_command(parser: argparse.ArgumentParser, add_common_args, ibm_func, aws_func): 28 | parser.add_argument("--version", action="version", version=about.__version__) 29 | 30 | subparsers = parser.add_subparsers(title="S3 provider", dest="provider") 31 | ibm_parser = subparsers.add_parser("ibmcloud", help="IBM Cloud") 32 | ibm_parser.add_argument("--api-key", type=str, help="API key", required=True) 33 | ibm_parser.add_argument("--service-instance-id", type=str, help="Service instance ID", required=True) 34 | ibm_parser.add_argument("--service-endpoint", type=str, help="Service endpoint", required=True) 35 | add_common_args(ibm_parser) 36 | ibm_parser.set_defaults(new_client_func=new_ibm_client, func=ibm_func) 37 | 38 | aws_parser = subparsers.add_parser("aws", help="AWS") 39 | aws_parser.add_argument("--aws-access-key-id", type=str, help="Access key ID", required=True) 40 | aws_parser.add_argument("--aws-secret-access-key", type=str, help="Secret key", required=True) 41 | aws_parser.add_argument("--region-name", type=str, help="Region name", required=True) 42 | add_common_args(aws_parser) 43 | aws_parser.set_defaults(new_client_func=new_aws_client, func=aws_func) 44 | 45 | args = parser.parse_args() 46 | 47 | return args 48 | -------------------------------------------------------------------------------- /model_training/s3/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2024-present Sunil Thaha 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /model_training/tekton/examples/complete-pipelinerun.yaml: -------------------------------------------------------------------------------- 1 | # example-complete-train-pipeline 2 | # running pipelines with all default value to train AbsPower/DynPower for all energysource and featuregroup 3 | apiVersion: tekton.dev/v1 4 | kind: PipelineRun 5 | metadata: 6 | name: example-complete-train-pipeline 7 | spec: 8 | timeouts: 9 | pipeline: 6h 10 | tasks: 5h50m 11 | workspaces: 12 | - name: mnt 13 | persistentVolumeClaim: 14 | claimName: task-pvc 15 | params: 16 | - name: PIPELINE_NAME 17 | value: CompleteTrainPipelineExample 18 | # the below parameters are for short test run 19 | - name: STRESS_ARGS 20 | value: 21 | - cpu;none;none 22 | - name: STRESS_TIMEOUT 23 | value: 20 24 | - name: STRESS_BREAK_INTERVAL 25 | value: 1 26 | - name: IDLE_COLLECT_INTERVAL 27 | value: 100 28 | - name: CPU_FREQUENCY_ENABLED 29 | value: false 30 | pipelineRef: 31 | name: complete-train-pipeline 32 | -------------------------------------------------------------------------------- /model_training/tekton/examples/single-train/abs-power.yaml: -------------------------------------------------------------------------------- 1 | # example-abs-train-pipeline: 2 | # running pipelines with all default value to train AbsPower model (rapl-sysfs, BPFOnly) 3 | apiVersion: tekton.dev/v1 4 | kind: PipelineRun 5 | metadata: 6 | name: example-abs-train-pipeline 7 | spec: 8 | timeouts: 9 | pipeline: 6h 10 | tasks: 5h50m 11 | workspaces: 12 | - name: mnt 13 | persistentVolumeClaim: 14 | claimName: task-pvc 15 | params: 16 | - name: PIPELINE_NAME 17 | value: AbsPowerTrainPipelineExample 18 | - name: OUTPUT_TYPE 19 | 
value: AbsPower 20 | # the below parameters are for short test run 21 | - name: STRESS_ARGS 22 | value: 23 | - cpu;none;none 24 | - name: STRESS_TIMEOUT 25 | value: 20 26 | - name: STRESS_BREAK_INTERVAL 27 | value: 1 28 | - name: IDLE_COLLECT_INTERVAL 29 | value: 100 30 | - name: CPU_FREQUENCY_ENABLED 31 | value: false 32 | pipelineRef: 33 | name: single-train-pipeline 34 | -------------------------------------------------------------------------------- /model_training/tekton/examples/single-train/aws-push.yaml: -------------------------------------------------------------------------------- 1 | # test-pipeline-aws 2 | # short run of pipelines to test e2e from collect to train with AWS COS 3 | apiVersion: tekton.dev/v1 4 | kind: PipelineRun 5 | metadata: 6 | name: test-pipeline-aws 7 | spec: 8 | timeouts: 9 | pipeline: 6h 10 | tasks: 5h50m 11 | workspaces: 12 | - name: mnt 13 | persistentVolumeClaim: 14 | claimName: task-pvc 15 | params: 16 | - name: PIPELINE_NAME 17 | value: AbsPowerTrainPipelineExample 18 | - name: OUTPUT_TYPE 19 | value: AbsPower 20 | - name: MACHINE_ID 21 | value: test 22 | - name: COS_PROVIDER 23 | value: aws 24 | - name: COS_SECRET_NAME 25 | value: aws-cos-secret 26 | # the below parameters are for short test run 27 | - name: STRESS_ARGS 28 | value: 29 | - cpu;none;none 30 | - name: STRESS_TIMEOUT 31 | value: 20 32 | - name: STRESS_BREAK_INTERVAL 33 | value: 1 34 | - name: IDLE_COLLECT_INTERVAL 35 | value: 100 36 | - name: CPU_FREQUENCY_ENABLED 37 | value: false 38 | pipelineRef: 39 | name: single-train-pipeline 40 | -------------------------------------------------------------------------------- /model_training/tekton/examples/single-train/default.yaml: -------------------------------------------------------------------------------- 1 | # kepler-default 2 | # running pipelines with all default value to train AbsPower model (rapl-sysfs, BPFOnly) with COS 3 | apiVersion: tekton.dev/v1 4 | kind: PipelineRun 5 | metadata: 6 | name: default 7 | spec: 8 | timeouts: 9 | pipeline: 6h 10 | tasks: 5h50m 11 | workspaces: 12 | - name: mnt 13 | persistentVolumeClaim: 14 | claimName: task-pvc 15 | params: 16 | - name: PIPELINE_NAME 17 | value: AbsPowerTrainPipelineExample 18 | - name: OUTPUT_TYPE 19 | value: AbsPower 20 | # Uncomment the following lines for IBM Cloud COS 21 | # - name: COS_PROVIDER 22 | # value: ibmcloud 23 | # - name: COS_SECRET_NAME 24 | # value: ibm-cos-secret 25 | # Uncomment the following lines for AWS COS 26 | # - name: COS_PROVIDER 27 | # value: aws 28 | # - name: COS_SECRET_NAME 29 | # value: aws-cos-secret 30 | pipelineRef: 31 | name: single-train-pipeline 32 | -------------------------------------------------------------------------------- /model_training/tekton/examples/single-train/dyn-power.yaml: -------------------------------------------------------------------------------- 1 | # example-dyn-train-pipeline: 2 | # running pipelines with all default value to train DynPower model (rapl-sysfs, BPFOnly) 3 | apiVersion: tekton.dev/v1 4 | kind: PipelineRun 5 | metadata: 6 | name: example-dyn-train-pipeline 7 | spec: 8 | timeouts: 9 | pipeline: 6h 10 | tasks: 5h50m 11 | workspaces: 12 | - name: mnt 13 | persistentVolumeClaim: 14 | claimName: task-pvc 15 | params: 16 | - name: PIPELINE_NAME 17 | value: DynPowerTrainPipelineExample 18 | - name: OUTPUT_TYPE 19 | value: DynPower 20 | # the below parameters are for short test run 21 | - name: STRESS_ARGS 22 | value: 23 | - cpu;none;none 24 | - name: STRESS_TIMEOUT 25 | value: 20 26 | - name: 
STRESS_BREAK_INTERVAL 27 | value: 1 28 | - name: IDLE_COLLECT_INTERVAL 29 | value: 100 30 | - name: CPU_FREQUENCY_ENABLED 31 | value: false 32 | pipelineRef: 33 | name: single-train-pipeline 34 | -------------------------------------------------------------------------------- /model_training/tekton/examples/single-train/ibmcloud-push.yaml: -------------------------------------------------------------------------------- 1 | # test-pipeline-ibmcloud 2 | # short run of pipelines to test e2e from collect to train with IBMCloud COS 3 | apiVersion: tekton.dev/v1 4 | kind: PipelineRun 5 | metadata: 6 | name: test-pipeline-ibmcloud 7 | spec: 8 | timeouts: 9 | pipeline: 6h 10 | tasks: 5h50m 11 | workspaces: 12 | - name: mnt 13 | persistentVolumeClaim: 14 | claimName: task-pvc 15 | params: 16 | - name: PIPELINE_NAME 17 | value: AbsPowerTrainPipelineExample 18 | - name: OUTPUT_TYPE 19 | value: AbsPower 20 | - name: MACHINE_ID 21 | value: test 22 | - name: COS_PROVIDER 23 | value: ibmcloud 24 | - name: COS_SECRET_NAME 25 | value: ibm-cos-secret 26 | # the below parameters are for short test run 27 | - name: STRESS_ARGS 28 | value: 29 | - cpu;none;none 30 | - name: STRESS_TIMEOUT 31 | value: 20 32 | - name: STRESS_BREAK_INTERVAL 33 | value: 1 34 | - name: IDLE_COLLECT_INTERVAL 35 | value: 100 36 | - name: CPU_FREQUENCY_ENABLED 37 | value: false 38 | pipelineRef: 39 | name: single-train-pipeline 40 | -------------------------------------------------------------------------------- /model_training/tekton/examples/test-collect.yaml: -------------------------------------------------------------------------------- 1 | # test-collect 2 | # short run of pipelines to test collecting data 3 | apiVersion: tekton.dev/v1 4 | kind: PipelineRun 5 | metadata: 6 | name: test-collect 7 | spec: 8 | timeouts: 9 | pipeline: 6h 10 | tasks: 5h50m 11 | workspaces: 12 | - name: mnt 13 | persistentVolumeClaim: 14 | claimName: task-pvc 15 | params: 16 | - name: MACHINE_ID 17 | value: test 18 | - name: STRESS_ARGS 19 | value: 20 | - cpu;none;none 21 | - name: STRESS_TIMEOUT 22 | value: 20 23 | - name: STRESS_BREAK_INTERVAL 24 | value: 1 25 | - name: IDLE_COLLECT_INTERVAL 26 | value: 100 27 | - name: CPU_FREQUENCY_ENABLED 28 | value: false 29 | pipelineRef: 30 | name: collect-data-pipeline 31 | -------------------------------------------------------------------------------- /model_training/tekton/examples/test-retrain.yaml: -------------------------------------------------------------------------------- 1 | # example-abs-train-pipeline: 2 | # running pipelines with all default value to train AbsPower model (rapl-sysfs, BPFOnly) 3 | apiVersion: tekton.dev/v1 4 | kind: PipelineRun 5 | metadata: 6 | name: test-retrain-ibmcloud 7 | spec: 8 | timeouts: 9 | pipeline: 6h 10 | tasks: 5h50m 11 | workspaces: 12 | - name: mnt 13 | persistentVolumeClaim: 14 | claimName: task-pvc 15 | params: 16 | - name: PIPELINE_NAME 17 | value: AbsPowerTrainPipelineExample 18 | - name: OUTPUT_TYPE 19 | value: AbsPower 20 | pipelineRef: 21 | name: single-retrain-pipeline 22 | -------------------------------------------------------------------------------- /model_training/tekton/pvc/hostpath.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolume 3 | metadata: 4 | name: task-pv-volume 5 | labels: 6 | type: local 7 | spec: 8 | storageClassName: manual 9 | capacity: 10 | storage: 5Gi 11 | accessModes: 12 | - ReadWriteMany 13 | hostPath: 14 | path: /mnt 15 | --- 16 | apiVersion: v1 17 | 
kind: PersistentVolumeClaim 18 | metadata: 19 | name: task-pvc 20 | namespace: default 21 | spec: 22 | storageClassName: manual 23 | volumeName: task-pv-volume 24 | accessModes: 25 | - ReadWriteMany 26 | resources: 27 | requests: 28 | storage: 3Gi 29 | -------------------------------------------------------------------------------- /model_training/tekton/tasks/extract-task.yaml: -------------------------------------------------------------------------------- 1 | ###################################### 2 | ## 3 | ## extract-from-metric: 4 | ## 5 | ## load kepler_query.json and extract data to extracted_data.csv 6 | ## 7 | ###################################### 8 | apiVersion: tekton.dev/v1 9 | kind: Task 10 | metadata: 11 | name: extract-from-metric 12 | spec: 13 | params: 14 | - name: MODEL_SERVER_IMAGE 15 | description: Specify model server image 16 | default: quay.io/sustainable_computing_io/kepler_model_server:latest 17 | - name: PIPELINE_NAME 18 | description: Specify pipeline name (output prefix/folder) 19 | default: default 20 | - name: OUTPUT_TYPE 21 | description: Specify target output type (check https://sustainable-computing.io/kepler_model_server/pipeline/#power-isolation) 22 | - name: ENERGY_SOURCE 23 | description: Specify target energy source (check https://sustainable-computing.io/kepler_model_server/pipeline/#energy-source) 24 | default: rapl-sysfs 25 | - name: FEATURE_GROUP 26 | description: Specify target feature group (check https://sustainable-computing.io/kepler_model_server/pipeline/#feature-group) 27 | default: BPFOnly 28 | - name: EXTRACTOR 29 | description: Specify extractor class (default or smooth) 30 | default: default 31 | - name: THIRDPARTY_METRICS 32 | description: Specify list of third party metric to export (required only for ThirdParty feature group) 33 | default: "" 34 | workspaces: 35 | - name: mnt 36 | optional: true 37 | steps: 38 | - name: extract 39 | image: $(params.MODEL_SERVER_IMAGE) 40 | command: [kepler-model] 41 | args: 42 | - extract 43 | - --data-path=$(workspaces.mnt.path)/data 44 | - --input=kepler_query 45 | - --output=$(params.PIPELINE_NAME)_$(params.ENERGY_SOURCE)_$(params.FEATURE_GROUP)_data 46 | - --extractor=$(params.EXTRACTOR) 47 | - --feature-group=$(params.FEATURE_GROUP) 48 | - --energy-source=$(params.ENERGY_SOURCE) 49 | - --output-type=$(params.OUTPUT_TYPE) 50 | - --thirdparty-metrics="$(params.THIRDPARTY_METRICS)" 51 | -------------------------------------------------------------------------------- /model_training/tekton/tasks/isolate-task.yaml: -------------------------------------------------------------------------------- 1 | ###################################### 2 | ## 3 | ## isolate-from-metric: 4 | ## 5 | ## load kepler_query.json and isolate data to isolated_data.csv 6 | ## 7 | ###################################### 8 | apiVersion: tekton.dev/v1 9 | kind: Task 10 | metadata: 11 | name: isolate-from-metric 12 | spec: 13 | params: 14 | - name: MODEL_SERVER_IMAGE 15 | description: Specify model server image 16 | default: quay.io/sustainable_computing_io/kepler_model_server:latest 17 | - name: PIPELINE_NAME 18 | description: Specify pipeline name (output prefix/folder) 19 | default: default 20 | - name: ENERGY_SOURCE 21 | description: Specify target energy source (check https://sustainable-computing.io/kepler_model_server/pipeline/#energy-source) 22 | default: rapl-sysfs 23 | - name: FEATURE_GROUP 24 | description: Specify target feature group (check 
https://sustainable-computing.io/kepler_model_server/pipeline/#feature-group) 25 | default: BPFOnly 26 | - name: EXTRACTOR 27 | description: Specify extractor class (default or smooth) 28 | default: default 29 | - name: ISOLATOR 30 | description: Specify isolator class (none, min, profile, or trainer (if ABS_PIPELINE_NAME is set) 31 | default: min 32 | - name: THIRDPARTY_METRICS 33 | description: Specify list of third party metric to export (required only for ThirdParty feature group) 34 | default: "" 35 | - name: TARGET_HINTS 36 | description: Specify target process keywords to keep in DynPower model training 37 | default: stress 38 | - name: BG_HINTS 39 | description: Specify background process keywords to remove from DynPower model training 40 | default: "" 41 | - name: ABS_PIPELINE_NAME 42 | description: Specify pipeline name to be used for initializing trainer isolator 43 | default: "" 44 | workspaces: 45 | - name: mnt 46 | optional: true 47 | steps: 48 | - name: isolate 49 | image: $(params.MODEL_SERVER_IMAGE) 50 | command: [kepler-model] 51 | args: 52 | - isolate 53 | - --data-path=$(workspaces.mnt.path)/data 54 | - --input=kepler_query 55 | - --output=$(params.PIPELINE_NAME)_$(params.ENERGY_SOURCE)_$(params.FEATURE_GROUP)_data 56 | - --pipeline-name=$(params.PIPELINE_NAME) 57 | - --extractor=$(params.EXTRACTOR) 58 | - --isolator=$(params.ISOLATOR) 59 | - --feature-group=$(params.FEATURE_GROUP) 60 | - --energy-source=$(params.ENERGY_SOURCE) 61 | - --output-type=DynPower 62 | - --thirdparty-metrics="$(params.THIRDPARTY_METRICS)" 63 | - --abs-pipeline-name=$(params.ABS_PIPELINE_NAME) 64 | - --profile=idle 65 | - --target-hints="$(params.TARGET_HINTS)" 66 | - --bg-hints="$(params.BG_HINTS)" 67 | -------------------------------------------------------------------------------- /model_training/tekton/tasks/original-pipeline-task.yaml: -------------------------------------------------------------------------------- 1 | ###################################### 2 | ## 3 | ## train-pipeline: 4 | ## 5 | ## load kepler_query.json and run training pipeline 6 | ## 7 | ###################################### 8 | apiVersion: tekton.dev/v1 9 | kind: Task 10 | metadata: 11 | name: original-pipeline-task 12 | spec: 13 | params: 14 | - name: MODEL_SERVER_IMAGE 15 | description: Specify model server image 16 | default: quay.io/sustainable_computing_io/kepler_model_server:latest 17 | - name: PIPELINE_NAME 18 | description: Specify output pipeline name 19 | default: default 20 | - name: EXTRACTOR 21 | description: Specify extractor class (default or smooth) 22 | default: default 23 | - name: ISOLATOR 24 | description: Specify isolator class (none, min, profile, or trainer (if ABS_PIPELINE_NAME is set) 25 | default: min 26 | - name: ABS_TRAINERS 27 | description: Specify a list of trainers for training AbsPower models 28 | default: default 29 | - name: DYN_TRAINERS 30 | description: Specify a list of trainers for training DynPower models 31 | default: default 32 | - name: ENERGY_SOURCE 33 | description: Specify target energy source (check https://sustainable-computing.io/kepler_model_server/pipeline/#energy-source) 34 | default: acpi,rapl-sysfs 35 | - name: TARGET_HINTS 36 | description: Specify target process keywords to keep in DynPower model training 37 | default: stress 38 | - name: BG_HINTS 39 | description: Specify background process keywords to remove from DynPower model training 40 | default: "" 41 | - name: THIRDPARTY_METRICS 42 | description: Specify list of third party metric to export (required 
only for ThirdParty feature group) 43 | default: "" 44 | - name: MACHINE_ID 45 | description: Specify machine id to identify node_type 46 | workspaces: 47 | - name: mnt 48 | optional: true 49 | steps: 50 | - name: pipeline-train 51 | image: $(params.MODEL_SERVER_IMAGE) 52 | command: [kepler-model] 53 | env: 54 | - name: MODEL_PATH 55 | value: $(workspaces.mnt.path)/models 56 | args: 57 | - train 58 | - --data-path=$(workspaces.mnt.path)/data 59 | - --input=kepler_query 60 | - --pipeline-name=$(params.PIPELINE_NAME) 61 | - --extractor=$(params.EXTRACTOR) 62 | - --isolator=$(params.ISOLATOR) 63 | - --profile=idle 64 | - --target-hints="$(params.TARGET_HINTS)" 65 | - --bg-hints="$(params.BG_HINTS)" 66 | - --abs-trainers=$(params.ABS_TRAINERS) 67 | - --dyn-trainers=$(params.DYN_TRAINERS) 68 | - --energy-source=$(params.ENERGY_SOURCE) 69 | - --thirdparty-metrics="$(params.THIRDPARTY_METRICS)" 70 | - --id=$(params.MACHINE_ID) 71 | -------------------------------------------------------------------------------- /model_training/tekton/tasks/s3/aws-s3-load.yaml: -------------------------------------------------------------------------------- 1 | ###################################### 2 | ## 3 | ## s3-push task for AWS 4 | ## 5 | ###################################### 6 | apiVersion: tekton.dev/v1 7 | kind: Task 8 | metadata: 9 | name: aws-s3-load 10 | spec: 11 | params: 12 | - name: COS_SECRET_NAME 13 | description: Specify cos secret name 14 | default: "" 15 | - name: MACHINE_ID 16 | description: Specify machine id to group model result in bucket 17 | default: "" 18 | - name: PIPELINE_NAME 19 | description: Specify pipeline name (output prefix/folder) 20 | default: default 21 | workspaces: 22 | - name: mnt 23 | optional: true 24 | steps: 25 | - name: load 26 | image: quay.io/sustainable_computing_io/kepler_model_server/s3:latest 27 | env: 28 | - name: ACCESS_KEY_ID 29 | valueFrom: 30 | secretKeyRef: 31 | name: $(params.COS_SECRET_NAME) 32 | key: accessKeyID 33 | - name: ACCESS_SECRET 34 | valueFrom: 35 | secretKeyRef: 36 | name: $(params.COS_SECRET_NAME) 37 | key: accessSecret 38 | - name: REGION_NAME 39 | valueFrom: 40 | secretKeyRef: 41 | name: $(params.COS_SECRET_NAME) 42 | key: regionName 43 | - name: BUCKET_NAME 44 | valueFrom: 45 | secretKeyRef: 46 | name: $(params.COS_SECRET_NAME) 47 | key: bucketName 48 | command: [s3-loader] 49 | args: 50 | - aws 51 | - --aws-access-key-id=$(ACCESS_KEY_ID) 52 | - --aws-secret-access-key=$(ACCESS_SECRET) 53 | - --region-name=$(REGION_NAME) 54 | - --bucket-name=$(BUCKET_NAME) 55 | - --mnt-path=$(workspaces.mnt.path) 56 | - --pipeline-name=$(params.PIPELINE_NAME) 57 | - --machine-id=$(params.MACHINE_ID) 58 | -------------------------------------------------------------------------------- /model_training/tekton/tasks/s3/aws-s3-push.yaml: -------------------------------------------------------------------------------- 1 | ###################################### 2 | ## 3 | ## s3-push task for AWS 4 | ## 5 | ###################################### 6 | apiVersion: tekton.dev/v1 7 | kind: Task 8 | metadata: 9 | name: aws-s3-push 10 | spec: 11 | params: 12 | - name: COS_SECRET_NAME 13 | description: Specify cos secret name 14 | default: "" 15 | - name: MACHINE_ID 16 | description: Specify machine id to group model result in bucket 17 | default: "" 18 | workspaces: 19 | - name: mnt 20 | optional: true 21 | steps: 22 | - name: push 23 | image: quay.io/sustainable_computing_io/kepler_model_server/s3:latest 24 | env: 25 | - name: ACCESS_KEY_ID 26 | valueFrom: 27 | 
secretKeyRef: 28 | name: $(params.COS_SECRET_NAME) 29 | key: accessKeyID 30 | - name: ACCESS_SECRET 31 | valueFrom: 32 | secretKeyRef: 33 | name: $(params.COS_SECRET_NAME) 34 | key: accessSecret 35 | - name: REGION_NAME 36 | valueFrom: 37 | secretKeyRef: 38 | name: $(params.COS_SECRET_NAME) 39 | key: regionName 40 | - name: BUCKET_NAME 41 | valueFrom: 42 | secretKeyRef: 43 | name: $(params.COS_SECRET_NAME) 44 | key: bucketName 45 | command: [s3-pusher] 46 | args: 47 | - aws 48 | - --aws-access-key-id=$(ACCESS_KEY_ID) 49 | - --aws-secret-access-key=$(ACCESS_SECRET) 50 | - --region-name=$(REGION_NAME) 51 | - --bucket-name=$(BUCKET_NAME) 52 | - --mnt-path=$(workspaces.mnt.path) 53 | - --query-data=kepler_query 54 | - --idle-data=idle 55 | - --machine-id=$(params.MACHINE_ID) 56 | -------------------------------------------------------------------------------- /model_training/tekton/tasks/s3/ibmcloud-s3-load.yaml: -------------------------------------------------------------------------------- 1 | ###################################### 2 | ## 3 | ## s3-push task for IBM Cloud 4 | ## 5 | ###################################### 6 | apiVersion: tekton.dev/v1 7 | kind: Task 8 | metadata: 9 | name: ibmcloud-s3-load 10 | spec: 11 | params: 12 | - name: COS_SECRET_NAME 13 | description: Specify cos secret name 14 | default: "" 15 | - name: MACHINE_ID 16 | description: Specify machine id to group model result in bucket 17 | default: "" 18 | - name: PIPELINE_NAME 19 | description: Specify pipeline name (output prefix/folder) 20 | default: default 21 | workspaces: 22 | - name: mnt 23 | optional: true 24 | steps: 25 | - name: load 26 | image: quay.io/sustainable_computing_io/kepler_model_server/s3:latest 27 | env: 28 | - name: SERVICE_ENDPOINT 29 | valueFrom: 30 | secretKeyRef: 31 | name: $(params.COS_SECRET_NAME) 32 | key: serviceEndpoint 33 | - name: API_KEY 34 | valueFrom: 35 | secretKeyRef: 36 | name: $(params.COS_SECRET_NAME) 37 | key: apiKey 38 | - name: SERVICE_INSTANCE_ID 39 | valueFrom: 40 | secretKeyRef: 41 | name: $(params.COS_SECRET_NAME) 42 | key: serviceInstanceID 43 | - name: BUCKET_NAME 44 | valueFrom: 45 | secretKeyRef: 46 | name: $(params.COS_SECRET_NAME) 47 | key: bucketName 48 | command: [s3-loader] 49 | args: 50 | - ibmcloud 51 | - --service-endpoint=$(SERVICE_ENDPOINT) 52 | - --api-key=$(API_KEY) 53 | - --service-instance-id=$(SERVICE_INSTANCE_ID) 54 | - --bucket-name=$(BUCKET_NAME) 55 | - --mnt-path=$(workspaces.mnt.path) 56 | - --pipeline-name=$(params.PIPELINE_NAME) 57 | - --machine-id=$(params.MACHINE_ID) 58 | -------------------------------------------------------------------------------- /model_training/tekton/tasks/s3/ibmcloud-s3-push.yaml: -------------------------------------------------------------------------------- 1 | ###################################### 2 | ## 3 | ## s3-push task for IBM Cloud 4 | ## 5 | ###################################### 6 | apiVersion: tekton.dev/v1 7 | kind: Task 8 | metadata: 9 | name: ibmcloud-s3-push 10 | spec: 11 | params: 12 | - name: COS_SECRET_NAME 13 | description: Specify cos secret name 14 | default: "" 15 | - name: MACHINE_ID 16 | description: Specify machine id to group model result in bucket 17 | default: "" 18 | workspaces: 19 | - name: mnt 20 | optional: true 21 | steps: 22 | - name: push 23 | image: quay.io/sustainable_computing_io/kepler_model_server/s3:latest 24 | env: 25 | - name: SERVICE_ENDPOINT 26 | valueFrom: 27 | secretKeyRef: 28 | name: $(params.COS_SECRET_NAME) 29 | key: serviceEndpoint 30 | - name: API_KEY 31 | 
valueFrom: 32 | secretKeyRef: 33 | name: $(params.COS_SECRET_NAME) 34 | key: apiKey 35 | - name: SERVICE_INSTANCE_ID 36 | valueFrom: 37 | secretKeyRef: 38 | name: $(params.COS_SECRET_NAME) 39 | key: serviceInstanceID 40 | - name: BUCKET_NAME 41 | valueFrom: 42 | secretKeyRef: 43 | name: $(params.COS_SECRET_NAME) 44 | key: bucketName 45 | command: [s3-pusher] 46 | args: 47 | - ibmcloud 48 | - --service-endpoint=$(SERVICE_ENDPOINT) 49 | - --api-key=$(API_KEY) 50 | - --service-instance-id=$(SERVICE_INSTANCE_ID) 51 | - --bucket-name=$(BUCKET_NAME) 52 | - --mnt-path=$(workspaces.mnt.path) 53 | - --query-data=kepler_query 54 | - --idle-data=idle 55 | - --machine-id=$(params.MACHINE_ID) 56 | -------------------------------------------------------------------------------- /model_training/tekton/tasks/train-task.yaml: -------------------------------------------------------------------------------- 1 | ###################################### 2 | ## 3 | ## train-model: 4 | ## 5 | ## train model from extracted data/isolated data 6 | ## 7 | ###################################### 8 | apiVersion: tekton.dev/v1 9 | kind: Task 10 | metadata: 11 | name: train-model 12 | spec: 13 | params: 14 | - name: MODEL_SERVER_IMAGE 15 | description: Specify model server image 16 | default: quay.io/sustainable_computing_io/kepler_model_server:latest 17 | - name: INPUT_DATA 18 | description: Specify input data file name (extracted_data or isolated_data) 19 | - name: PIPELINE_NAME 20 | description: Specify pipeline name (output prefix/folder) 21 | default: default 22 | - name: OUTPUT_TYPE 23 | description: Specify target output type (check https://sustainable-computing.io/kepler_model_server/pipeline/#power-isolation) 24 | default: AbsPower 25 | - name: ENERGY_SOURCE 26 | description: Specify target energy source (check https://sustainable-computing.io/kepler_model_server/pipeline/#energy-source) 27 | default: rapl-sysfs 28 | - name: FEATURE_GROUP 29 | description: Specify target feature group (check https://sustainable-computing.io/kepler_model_server/pipeline/#feature-group) 30 | default: BPFOnly 31 | - name: TRAINERS 32 | description: Specify trainer names (use comma(,) as delimiter) 33 | default: XgboostFitTrainer 34 | - name: THIRDPARTY_METRICS 35 | description: Specify list of third party metric to export (required only for ThirdParty feature group) 36 | default: "" 37 | - name: MACHINE_ID 38 | description: Specify machine id to identify node_type 39 | default: "" 40 | workspaces: 41 | - name: mnt 42 | optional: true 43 | steps: 44 | - name: train-from-data 45 | image: $(params.MODEL_SERVER_IMAGE) 46 | command: [kepler-model] 47 | env: 48 | - name: MODEL_PATH 49 | value: $(workspaces.mnt.path)/models 50 | args: 51 | - train_from_data 52 | - --data-path=$(workspaces.mnt.path)/data 53 | - --input=$(params.INPUT_DATA) 54 | - --pipeline-name=$(params.PIPELINE_NAME) 55 | - --feature-group=$(params.FEATURE_GROUP) 56 | - --energy-source=$(params.ENERGY_SOURCE) 57 | - --output-type=$(params.OUTPUT_TYPE) 58 | - --trainers=$(params.TRAINERS) 59 | - --thirdparty-metrics="$(params.THIRDPARTY_METRICS)" 60 | - --id=$(params.MACHINE_ID) 61 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "kepler_model" 7 | dynamic = ["version"] 8 | description = "kepler model server for serving kepler 
models" 9 | readme = "README.md" 10 | requires-python = ">= 3.10" 11 | license = "Apache-2.0" 12 | keywords = [ 13 | "kepler", "models", 14 | "model-server", "estimator" 15 | ] 16 | 17 | authors = [ 18 | { name = "Sunyanan Choochotkaew", email = "sunyanan.choochotkaew1@ibm.com" }, 19 | { name = "Sunil Thaha", email = "sthaha@redhat.com" }, 20 | ] 21 | 22 | classifiers = [ 23 | "Programming Language :: Python", 24 | "Programming Language :: Python :: 3", 25 | "Programming Language :: Python :: 3.10", 26 | ] 27 | dependencies = [ 28 | "flask==3.0.3", 29 | "joblib==1.4.2", 30 | "numpy==2.1.2", 31 | "pandas==2.2.3", 32 | "prometheus-api-client==0.5.5", 33 | "prometheus-client==0.21.0", 34 | "protobuf==5.28.2", 35 | "psutil==6.1.0", 36 | "py-cpuinfo==9.0.0", 37 | "pyudev==0.24.3", 38 | "pyyaml_env_tag==0.1", 39 | "scikit-learn==1.5.2", 40 | "scipy==1.14.1", 41 | "seaborn==0.13.2", 42 | "Werkzeug==3.0.4", 43 | "xgboost==2.1.2", 44 | "boto3==1.35.43", 45 | "pymarkdownlnt==0.9.22", 46 | "yamllint==1.35.1", 47 | "requests-file==2.1.0", 48 | ] 49 | 50 | [project.scripts] 51 | model-server = "kepler_model.server.model_server:run" 52 | estimator = "kepler_model.estimate.estimator:run" 53 | kepler-model = "kepler_model.cmd.main:run" 54 | offline-trainer = "kepler_model.train.offline_trainer:run" 55 | online-trainer = "kepler_model.train.online_trainer:run" 56 | 57 | [project.urls] 58 | Documentation = "https://github.com/sustainable-computing-io/kepler-model-server#readme" 59 | Issues = "https://github.com/sustainable-computing-io/kepler-model-server/issues" 60 | Source = "https://github.com/sustainable-computing-io/kepler-model-server" 61 | 62 | [tool.hatch.version] 63 | path = "src/kepler_model/__about__.py" 64 | 65 | [tool.hatch.envs.default] 66 | python = "3.10" 67 | extra-dependencies = [ 68 | "coverage[toml]>=6.5", 69 | "ipdb", 70 | "ipython", 71 | "pytest", 72 | ] 73 | 74 | [tool.hatch.envs.default.scripts] 75 | test = "pytest {args:tests}" 76 | test-cov = "coverage run -m pytest {args:tests}" 77 | cov-report = [ 78 | "- coverage combine", 79 | "coverage report", 80 | ] 81 | cov = [ 82 | "test-cov", 83 | "cov-report", 84 | ] 85 | 86 | [tool.hatch.envs.lab] 87 | extra-dependencies = [ 88 | "jupyterlab", 89 | "notebook", 90 | "voila", 91 | "ipywidgets", 92 | # vim please 93 | "jupyterlab-vim", 94 | 95 | "beautifulsoup4", 96 | # read parquet files 97 | # "pyarrow", 98 | 99 | # graphing 100 | "matplotlib", 101 | "graphviz", 102 | ] 103 | 104 | [tool.hatch.envs.lab.scripts] 105 | note = "jupyter lab --NotebookApp.token='' --allow-root" 106 | 107 | [tool.hatch.envs.types] 108 | extra-dependencies = [ 109 | "mypy>=1.0.0", 110 | ] 111 | [tool.hatch.envs.types.scripts] 112 | check = "mypy --install-types --non-interactive {args:src/kepler_model_server tests}" 113 | 114 | [tool.coverage.run] 115 | source_pkgs = ["kepler_model", "tests"] 116 | branch = true 117 | parallel = true 118 | omit = [ 119 | "src/kepler_model/__about__.py", 120 | ] 121 | 122 | [tool.coverage.paths] 123 | kepler_model = ["src/kepler_model", "*/kepler_model/src/kepler_model"] 124 | tests = ["tests", "*/kepler_model/tests"] 125 | 126 | [tool.coverage.report] 127 | exclude_lines = [ 128 | "no cov", 129 | "if __name__ == .__main__.:", 130 | "if TYPE_CHECKING:", 131 | ] 132 | 133 | [tool.ruff] 134 | line-length = 160 135 | 136 | [tool.pytest.ini_options] 137 | markers = [ 138 | "focus", # used in development to mark focused tests 139 | ] 140 | 141 | [tool.pymarkdown] 142 | plugins.md013.enabled = false 143 | 
-------------------------------------------------------------------------------- /src/kepler_model/__about__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2024-present 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | __version__ = "0.7.11" 5 | -------------------------------------------------------------------------------- /src/kepler_model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/src/kepler_model/__init__.py -------------------------------------------------------------------------------- /src/kepler_model/abs-train-pipelinerun.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: tekton.dev/v1 2 | kind: PipelineRun 3 | metadata: 4 | name: example-abs-train-pipeline 5 | spec: 6 | timeouts: 7 | pipeline: 6h 8 | tasks: 5h50m 9 | workspaces: 10 | - name: mnt 11 | persistentVolumeClaim: 12 | claimName: task-pvc 13 | params: 14 | - name: PIPELINE_NAME 15 | value: AbsPowerTrainPipelineExample 16 | - name: OUTPUT_TYPE 17 | value: AbsPower 18 | pipelineRef: 19 | name: single-train-pipeline 20 | -------------------------------------------------------------------------------- /src/kepler_model/cmd/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/src/kepler_model/cmd/__init__.py -------------------------------------------------------------------------------- /src/kepler_model/estimate/__init__.py: -------------------------------------------------------------------------------- 1 | from .model.estimate_common import compute_error 2 | from .model.model import ( 3 | default_idle_predicted_col_func, 4 | default_predicted_col_func, 5 | get_background_containers, 6 | get_dynamic_power_colname, 7 | get_label_power_colname, 8 | get_predicted_background_power_colname, 9 | get_predicted_dynamic_background_power_colname, 10 | get_predicted_dynamic_power_colname, 11 | get_predicted_power_colname, 12 | get_reconstructed_power_colname, 13 | load_model, 14 | ) 15 | 16 | __all__ = [ 17 | "compute_error", 18 | "load_model", 19 | "get_background_containers", 20 | "default_predicted_col_func", 21 | "get_predicted_power_colname", 22 | "get_predicted_background_power_colname", 23 | "get_dynamic_power_colname", 24 | "get_predicted_dynamic_power_colname", 25 | "get_predicted_dynamic_background_power_colname", 26 | "get_label_power_colname", 27 | "get_reconstructed_power_colname", 28 | "default_idle_predicted_col_func", 29 | ] 30 | -------------------------------------------------------------------------------- /src/kepler_model/estimate/archived_model.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import requests 4 | from requests_file import FileAdapter 5 | 6 | from kepler_model.estimate.model_server_connector import unpack 7 | from kepler_model.util.config import get_init_model_url 8 | from kepler_model.util.loader import load_metadata 9 | from kepler_model.util.train_types import ModelOutputType 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | failed_list = [] 14 | 15 | FILTER_ITEM_DELIMIT = ";" 16 | VALUE_DELIMIT = ":" 17 | ARRAY_DELIMIT = "," 18 | 19 | 20 | def parse_filters(filter): 21 | 
filter_list = filter.split(FILTER_ITEM_DELIMIT) 22 | filters = dict() 23 | for filter_item in filter_list: 24 | splits = filter_item.split(VALUE_DELIMIT) 25 | if len(splits) != 2: 26 | continue 27 | key = splits[0] 28 | if key == "features": 29 | value = splits[1].split(ARRAY_DELIMIT) 30 | else: 31 | value = splits[1] 32 | filters[key] = value 33 | return filters 34 | 35 | 36 | def valid_metrics(metrics, features): 37 | for feature in features: 38 | if feature not in metrics: 39 | return False 40 | return True 41 | 42 | 43 | def is_valid_model(metrics, metadata, filters): 44 | if not valid_metrics(metrics, metadata["features"]): 45 | return False 46 | 47 | for attrb, val in filters.items(): 48 | if not hasattr(metadata, attrb) or getattr(metadata, attrb) is None: 49 | logger.warning(f"{metadata['model_name']} has no {attrb}") 50 | return False 51 | 52 | cmp_val = getattr(metadata, attrb) 53 | val = float(val) 54 | if attrb == "abs_max_corr": # higher is better 55 | valid = cmp_val >= val 56 | else: # lower is better 57 | valid = cmp_val <= val 58 | if not valid: 59 | return False 60 | 61 | return True 62 | 63 | 64 | def reset_failed_list(): 65 | global failed_list 66 | failed_list = [] 67 | 68 | 69 | def get_achived_model(power_request): 70 | global failed_list 71 | output_type_name = power_request.output_type 72 | if output_type_name in failed_list: 73 | return None 74 | output_type = ModelOutputType[power_request.output_type] 75 | url = get_init_model_url(power_request.energy_source, output_type_name) 76 | if url == "": 77 | logger.warning(f"no URL set for {output_type_name}, {power_request.energy_source}") 78 | return None 79 | logger.info(f"try getting archieved model from URL: {url} for {output_type_name}") 80 | 81 | s = requests.Session() 82 | s.mount("file://", FileAdapter()) 83 | response = s.get(url) 84 | logger.debug(f"response: {response}") 85 | 86 | if response.status_code != 200: 87 | return None 88 | 89 | output_path = unpack(power_request.energy_source, output_type, response, replace=False) 90 | if output_path is not None: 91 | metadata = load_metadata(output_path) 92 | filters = parse_filters(power_request.filter) 93 | try: 94 | if not is_valid_model(power_request.metrics, metadata, filters): 95 | failed_list += [output_type_name] 96 | return None 97 | except Exception as e: 98 | logger.warning(f"cannot validate the archived model: {e}") 99 | return None 100 | 101 | return output_path 102 | -------------------------------------------------------------------------------- /src/kepler_model/estimate/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/src/kepler_model/estimate/model/__init__.py -------------------------------------------------------------------------------- /src/kepler_model/estimate/model/curvefit_model.py: -------------------------------------------------------------------------------- 1 | import collections.abc 2 | 3 | from kepler_model.estimate.model.estimate_common import ( 4 | is_component_model, 5 | load_model_by_json, 6 | load_model_by_pickle, 7 | transform_and_predict, 8 | ) 9 | from kepler_model.util import ModelOutputType 10 | from kepler_model.util.train_types import get_valid_feature_groups, main_feature 11 | 12 | 13 | class CurveFitModelEstimator: 14 | def __init__(self, model_path, model_name, output_type, model_file, features, fe_files, component_init=False, feature_group=None): 
15 | self.name = model_name 16 | self.features = features 17 | if feature_group is None: 18 | self.feauture_group = get_valid_feature_groups(features)[0] 19 | else: 20 | self.feauture_group = feature_group 21 | self.output_type = ModelOutputType[output_type] 22 | 23 | self.comp_type = not component_init and is_component_model(model_file) 24 | if self.comp_type: 25 | self.models = dict() 26 | model_info = load_model_by_json(model_path, model_file) 27 | for comp, model_metadata in model_info.items(): 28 | model = CurveFitModelEstimator( 29 | model_path, 30 | self.name, 31 | self.output_type.name, 32 | model_metadata["model_file"], 33 | model_metadata["features"], 34 | model_metadata["fe_files"], 35 | component_init=True, 36 | ) 37 | feature_index = main_feature(self.feauture_group.name, comp) 38 | if model.model is not None: 39 | model.model.set_feature_index(feature_index) 40 | self.models[comp] = model 41 | else: 42 | self.model = load_model_by_pickle(model_path, model_file) 43 | self.fe_list = [] 44 | for fe_filename in fe_files: 45 | self.fe_list += [load_model_by_pickle(model_path, fe_filename)] 46 | 47 | def get_power(self, request): 48 | if self.comp_type: 49 | results = dict() 50 | for comp, model in self.models.items(): 51 | y, msg = transform_and_predict(model, request) 52 | if msg != "": 53 | return [], msg 54 | if not isinstance(y, collections.abc.Sequence): 55 | y = [y] 56 | results[comp] = y 57 | return results, msg 58 | else: 59 | return transform_and_predict(self, request) 60 | -------------------------------------------------------------------------------- /src/kepler_model/estimate/model/estimate_common.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import cpuinfo 4 | import numpy as np 5 | from sklearn.metrics import mean_absolute_error, mean_squared_error 6 | 7 | from kepler_model.util.loader import load_json, load_pkl 8 | 9 | keras_enabled = True 10 | cpu_info = cpuinfo.get_cpu_info() 11 | 12 | # if 'flags' in cpu_info and 'avx' in cpu_info['flags']: 13 | # import keras 14 | # from keras import backend as K 15 | # else: 16 | # print("AVX instructions are not available.") 17 | # keras_enabled = False 18 | 19 | 20 | def is_component_model(model_file): 21 | return ".json" in model_file 22 | 23 | 24 | def transform_and_predict(model, datapoint): 25 | msg = "" 26 | try: 27 | x_values = datapoint[model.features].values 28 | for fe in model.fe_list: 29 | if fe is None: 30 | continue 31 | x_values = fe.transform(x_values) 32 | y = model.model.predict(x_values).squeeze() 33 | y[y < 0] = 0 34 | y = y.tolist() 35 | except Exception as e: 36 | msg = f"{e}\n" 37 | y = [] 38 | return y, msg 39 | 40 | 41 | def load_model_by_pickle(model_path, model_filename): 42 | return load_pkl(model_path, model_filename) 43 | 44 | 45 | def coeff_determination(y_true, y_pred): 46 | if not keras_enabled: 47 | return None 48 | SS_res = K.sum(K.square(y_true - y_pred)) 49 | SS_tot = K.sum(K.square(y_true - K.mean(y_true))) 50 | return 1 - SS_res / (SS_tot + K.epsilon()) 51 | 52 | 53 | def load_model_by_keras(model_path, model_filename): 54 | model_file = os.path.join(model_path, model_filename) 55 | try: 56 | model = keras.models.load_model(model_file, custom_objects={"coeff_determination": coeff_determination}) 57 | except Exception as e: 58 | print(e) 59 | return None 60 | return model 61 | 62 | 63 | def load_model_by_json(model_path, model_filename): 64 | return load_json(model_path, model_filename) 65 | 66 | 67 | # return mae, mse, mape 68 
| def compute_error(predicted_power, actual_powers): 69 | mse = mean_squared_error(actual_powers, predicted_power) 70 | mae = mean_absolute_error(actual_powers, predicted_power) 71 | actual_power_values = list(actual_powers) 72 | predicted_power_values = list(predicted_power) 73 | if len(actual_powers) == 0: 74 | mape = -1 75 | else: 76 | non_zero_predicted_powers = np.array([predicted_power_values[i] for i in range(len(predicted_power_values)) if actual_power_values[i] > 0]) 77 | if len(non_zero_predicted_powers) == 0: 78 | mape = -1 79 | else: 80 | non_zero_y_test = np.array([y for y in actual_powers if y > 0]) 81 | absolute_percentage_errors = np.abs((non_zero_y_test - non_zero_predicted_powers) / non_zero_y_test) * 100 82 | mape = np.mean(absolute_percentage_errors) 83 | return mae, mse, mape 84 | -------------------------------------------------------------------------------- /src/kepler_model/estimate/model/keras_model.py: -------------------------------------------------------------------------------- 1 | from kepler_model.estimate.model.estimate_common import ( 2 | is_component_model, 3 | load_model_by_json, 4 | load_model_by_keras, 5 | load_model_by_pickle, 6 | transform_and_predict, 7 | ) 8 | from kepler_model.estimate.model_server_connector import ModelOutputType 9 | 10 | 11 | class KerasModelEstimator: 12 | def __init__(self, model_path, model_name, output_type, model_file, features, fe_files, component_init=False): 13 | self.name = model_name 14 | self.features = features 15 | self.output_type = ModelOutputType[output_type] 16 | self.comp_type = not component_init and is_component_model(self.output_type) 17 | if self.comp_type: 18 | self.models = dict() 19 | model_info = load_model_by_json(model_path, model_file) 20 | for comp, model_metadata in model_info.items(): 21 | model = KerasModelEstimator( 22 | model_path, 23 | self.name, 24 | self.output_type.name, 25 | model_metadata["model_file"], 26 | model_metadata["features"], 27 | model_metadata["fe_files"], 28 | component_init=True, 29 | ) 30 | self.models[comp] = model 31 | else: 32 | self.model = load_model_by_keras(model_path, model_file) 33 | self.fe_list = [] 34 | for fe_filename in fe_files: 35 | self.fe_list += [load_model_by_pickle(model_path, fe_filename)] 36 | 37 | def get_power(self, request): 38 | if self.comp_type: 39 | results = dict() 40 | for comp, model in self.models.items(): 41 | y, msg = transform_and_predict(model, request) 42 | if msg != "": 43 | return [], msg 44 | results[comp] = y 45 | return results, msg 46 | else: 47 | return transform_and_predict(self, request) 48 | -------------------------------------------------------------------------------- /src/kepler_model/estimate/model/scikit_model.py: -------------------------------------------------------------------------------- 1 | import collections.abc 2 | 3 | from kepler_model.estimate.model.estimate_common import ( 4 | is_component_model, 5 | load_model_by_json, 6 | load_model_by_pickle, 7 | transform_and_predict, 8 | ) 9 | from kepler_model.util import ModelOutputType 10 | 11 | 12 | class ScikitModelEstimator: 13 | def __init__(self, model_path, model_name, output_type, model_file, features, fe_files, component_init=False): 14 | self.name = model_name 15 | self.features = features 16 | self.output_type = ModelOutputType[output_type] 17 | 18 | self.comp_type = not component_init and is_component_model(model_file) 19 | if self.comp_type: 20 | self.models = dict() 21 | model_info = load_model_by_json(model_path, model_file) 22 | for comp, 
model_metadata in model_info.items(): 23 | model = ScikitModelEstimator( 24 | model_path, 25 | self.name, 26 | self.output_type.name, 27 | model_metadata["model_file"], 28 | model_metadata["features"], 29 | model_metadata["fe_files"], 30 | component_init=True, 31 | ) 32 | self.models[comp] = model 33 | else: 34 | self.model = load_model_by_pickle(model_path, model_file) 35 | self.fe_list = [] 36 | for fe_filename in fe_files: 37 | self.fe_list += [load_model_by_pickle(model_path, fe_filename)] 38 | 39 | def get_power(self, request): 40 | if self.comp_type: 41 | results = dict() 42 | for comp, model in self.models.items(): 43 | y, msg = transform_and_predict(model, request) 44 | if msg != "": 45 | return [], msg 46 | if not isinstance(y, collections.abc.Sequence): 47 | y = [y] 48 | results[comp] = y 49 | return results, msg 50 | else: 51 | return transform_and_predict(self, request) 52 | -------------------------------------------------------------------------------- /src/kepler_model/estimate/model/xgboost_model.py: -------------------------------------------------------------------------------- 1 | import collections.abc 2 | import os 3 | 4 | import xgboost as xgb 5 | 6 | from kepler_model.estimate.model.estimate_common import ( 7 | is_component_model, 8 | load_model_by_json, 9 | load_model_by_pickle, 10 | transform_and_predict, 11 | ) 12 | from kepler_model.util import ModelOutputType 13 | 14 | 15 | class XgboostModelEstimator: 16 | def __init__(self, model_path, model_name, output_type, model_file, features, fe_files, component_init=False): 17 | self.name = model_name 18 | self.features = features 19 | self.output_type = ModelOutputType[output_type] 20 | 21 | self.comp_type = not component_init and is_component_model(model_file) 22 | if self.comp_type: 23 | self.models = dict() 24 | model_info = load_model_by_json(model_path, model_file) 25 | for comp, model_metadata in model_info.items(): 26 | model = XgboostModelEstimator( 27 | model_path, 28 | self.name, 29 | self.output_type.name, 30 | model_metadata["model_file"], 31 | model_metadata["features"], 32 | model_metadata["fe_files"], 33 | component_init=True, 34 | ) 35 | self.models[comp] = model 36 | else: 37 | filepath = os.path.join(model_path, model_file) 38 | self.model = xgb.XGBRegressor(n_estimators=1000, learning_rate=0.1) 39 | self.model.load_model(filepath) 40 | self.fe_list = [] 41 | for fe_filename in fe_files: 42 | self.fe_list += [load_model_by_pickle(model_path, fe_filename)] 43 | 44 | def get_power(self, request): 45 | if self.comp_type: 46 | results = dict() 47 | for comp, model in self.models.items(): 48 | y, msg = transform_and_predict(model, request) 49 | if msg != "": 50 | return [], msg 51 | if not isinstance(y, collections.abc.Sequence): 52 | y = [y] 53 | results[comp] = y 54 | return results, msg 55 | else: 56 | return transform_and_predict(self, request) 57 | -------------------------------------------------------------------------------- /src/kepler_model/estimate/model_server_connector.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import json 3 | import os 4 | import shutil 5 | 6 | import requests 7 | 8 | from kepler_model.server.model_server import ModelListParam 9 | from kepler_model.util.config import ( 10 | download_path, 11 | get_model_server_list_endpoint, 12 | get_model_server_req_endpoint, 13 | is_model_server_enabled, 14 | ) 15 | from kepler_model.util.loader import get_download_output_path 16 | from kepler_model.util.train_types import 
ModelOutputType 17 | 18 | 19 | def make_model_request(power_request, machine_spec=None): 20 | model_request = { 21 | "metrics": power_request.metrics + power_request.system_features, 22 | "output_type": power_request.output_type, 23 | "source": power_request.energy_source, 24 | "filter": power_request.filter, 25 | "trainer_name": power_request.trainer_name, 26 | } 27 | if machine_spec is not None: 28 | model_request["machine_spec"] = machine_spec 29 | return model_request 30 | 31 | 32 | TMP_FILE = "tmp.zip" 33 | 34 | 35 | def unpack(energy_source, output_type, response, replace=True): 36 | output_path = get_download_output_path(download_path, energy_source, output_type) 37 | tmp_filepath = os.path.join(download_path, TMP_FILE) 38 | if os.path.exists(output_path): 39 | if not replace: 40 | if os.path.exists(tmp_filepath): 41 | # delete downloaded file 42 | os.remove(tmp_filepath) 43 | return output_path 44 | # delete existing model 45 | shutil.rmtree(output_path) 46 | with codecs.open(tmp_filepath, "wb") as f: 47 | f.write(response.content) 48 | shutil.unpack_archive(tmp_filepath, output_path) 49 | os.remove(tmp_filepath) 50 | return output_path 51 | 52 | 53 | def make_request(power_request, machine_spec): 54 | if not is_model_server_enabled(): 55 | return None 56 | model_request = make_model_request(power_request, machine_spec) 57 | output_type = ModelOutputType[power_request.output_type] 58 | try: 59 | response = requests.post(get_model_server_req_endpoint(), json=model_request) 60 | except Exception as err: 61 | print(f"cannot make request to {get_model_server_req_endpoint()}: {err}") 62 | return None 63 | if response.status_code != 200: 64 | return None 65 | return unpack(power_request.energy_source, output_type, response) 66 | 67 | 68 | def list_all_models(energy_source=None, output_type=None, feature_group=None, node_type=None, filter=None): 69 | if not is_model_server_enabled(): 70 | return dict() 71 | try: 72 | endpoint = get_model_server_list_endpoint() 73 | params = {} 74 | if energy_source: 75 | params[ModelListParam.EnergySource.value] = energy_source 76 | if output_type: 77 | params[ModelListParam.OutputType.value] = output_type 78 | if feature_group: 79 | params[ModelListParam.FeatureGroup.value] = feature_group 80 | if node_type: 81 | params[ModelListParam.NodeType.value] = node_type 82 | if filter: 83 | params[ModelListParam.Filter.value] = filter 84 | 85 | response = requests.get(endpoint, params=params) 86 | except Exception as err: 87 | print(f"cannot list model: {err}") 88 | return dict() 89 | if response.status_code != 200: 90 | return dict() 91 | model_names = json.loads(response.content.decode("utf-8")) 92 | return model_names 93 | -------------------------------------------------------------------------------- /src/kepler_model/train/__init__.py: -------------------------------------------------------------------------------- 1 | # comonly used within train module 2 | 3 | from .extractor.extractor import DefaultExtractor 4 | from .extractor.smooth_extractor import SmoothExtractor 5 | from .isolator.isolator import MinIdleIsolator, NoneIsolator, ProfileBackgroundIsolator 6 | from .isolator.train_isolator import TrainIsolator 7 | from .pipeline import NewPipeline, load_class 8 | from .profiler.node_type_index import NodeTypeIndexCollection, NodeTypeSpec 9 | from .profiler.profiler import Profiler, generate_profiles 10 | 11 | DefaultProfiler = Profiler(extractor=DefaultExtractor()) 12 | 13 | __all__ = [ 14 | "DefaultExtractor", 15 | "SmoothExtractor", 16 | "Profiler", 
17 | "generate_profiles", 18 | "NodeTypeIndexCollection", 19 | "NodeTypeSpec", 20 | "MinIdleIsolator", 21 | "ProfileBackgroundIsolator", 22 | "NoneIsolator", 23 | "TrainIsolator", 24 | "NewPipeline", 25 | "load_class", 26 | ] 27 | -------------------------------------------------------------------------------- /src/kepler_model/train/exporter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/src/kepler_model/train/exporter/__init__.py -------------------------------------------------------------------------------- /src/kepler_model/train/exporter/exporter.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from kepler_model.train.exporter.validator import BestModelCollection, get_validated_export_items 4 | from kepler_model.train.exporter.writer import ( 5 | append_version_readme, 6 | generate_pipeline_page, 7 | generate_pipeline_readme, 8 | generate_report_results, 9 | get_workload_content, 10 | ) 11 | from kepler_model.util.config import ERROR_KEY 12 | from kepler_model.util.format import time_to_str 13 | from kepler_model.util.loader import get_export_path, get_version_path, load_metadata, load_node_type_index 14 | from kepler_model.util.saver import save_node_type_index, save_pipeline_metadata 15 | 16 | repo_url = "https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-db/main/models" 17 | 18 | 19 | def export(data_path, pipeline_path, db_path, publisher, collect_date, inputs): 20 | # load pipeline metadata 21 | pipeline_metadata = load_metadata(pipeline_path) 22 | if pipeline_metadata is None: 23 | print("no pipeline metadata") 24 | return 25 | # add publish information to pipeline metadata 26 | pipeline_metadata["publisher"] = publisher 27 | pipeline_metadata["collect_time"] = time_to_str(collect_date) 28 | pipeline_metadata["export_time"] = time_to_str(datetime.datetime.utcnow()) 29 | 30 | node_type_index_json = load_node_type_index(pipeline_path) 31 | if node_type_index_json is None: 32 | print("no node type index") 33 | return 34 | node_types = node_type_index_json.keys() 35 | best_model_collections = dict() 36 | for node_type in node_types: 37 | best_model_collections[int(node_type)] = BestModelCollection(ERROR_KEY) 38 | 39 | # get path 40 | pipeline_name = pipeline_metadata["name"] 41 | local_export_path = get_export_path(db_path, pipeline_name) 42 | local_version_path = get_version_path(db_path) 43 | remote_version_path = get_version_path(repo_url, assure=False) 44 | 45 | # get validated export items (models) 46 | export_items, valid_metadata_df = get_validated_export_items(pipeline_path, pipeline_name) 47 | # save pipeline metadata 48 | for energy_source, ot_metadata_df in valid_metadata_df.items(): 49 | for model_type, metadata_df in ot_metadata_df.items(): 50 | metadata_df = metadata_df.sort_values(by=["feature_group", ERROR_KEY]) 51 | save_pipeline_metadata(local_export_path, pipeline_metadata, energy_source, model_type, metadata_df) 52 | # save node_type_index.json 53 | save_node_type_index(local_export_path, node_type_index_json) 54 | 55 | for export_item in export_items: 56 | # export models 57 | export_item.export(local_version_path) 58 | # update best model 59 | best_model_collections[export_item.node_type].compare_new_item(export_item) 60 | 61 | # generate pipeline page 62 | workload_content = get_workload_content(data_path, 
inputs) 63 | generate_pipeline_page(local_version_path, pipeline_metadata, workload_content) 64 | # generate error report page 65 | generate_report_results(local_export_path, best_model_collections, node_type_index_json, remote_version_path) 66 | # generate validation result page 67 | generate_pipeline_readme(pipeline_name, local_export_path, node_type_index_json, best_model_collections) 68 | # add new pipeline item to version path 69 | append_version_readme(local_version_path, pipeline_metadata) 70 | 71 | return local_export_path 72 | -------------------------------------------------------------------------------- /src/kepler_model/train/extractor/__init__.py: -------------------------------------------------------------------------------- 1 | from .extractor import DefaultExtractor 2 | 3 | __all__ = ["DefaultExtractor"] 4 | -------------------------------------------------------------------------------- /src/kepler_model/train/extractor/preprocess.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from kepler_model.estimate.model.model import get_label_power_colname 4 | from kepler_model.util.extract_types import col_to_component 5 | from kepler_model.util.prom_types import TIMESTAMP_COL 6 | from kepler_model.util.train_types import PowerSourceMap 7 | 8 | 9 | def drop_zero_column(data, cols): 10 | sum_col = "sum_val" 11 | data[sum_col] = data[cols].sum(axis=1) 12 | data = data.drop(data[data[sum_col] == 0].index) 13 | data = data.drop(columns=[sum_col]) 14 | return data 15 | 16 | 17 | def remove_outlier(df, workload_features, threshold=1): 18 | # Calculate the Z-score for each column 19 | z_scores = np.abs((df[workload_features] - df[workload_features].mean()) / df[workload_features].std()) 20 | # Remove rows with outliers 21 | df_no_outliers = df[(z_scores < threshold).all(axis=1)] 22 | return df_no_outliers 23 | 24 | 25 | def time_filter(data, min_time, max_time): 26 | _data = data.reset_index() 27 | start_time = _data[TIMESTAMP_COL].min() 28 | _data = _data[(_data[TIMESTAMP_COL] >= start_time + min_time) & (_data[TIMESTAMP_COL] <= start_time + max_time)] 29 | return _data 30 | 31 | 32 | def get_extracted_power_labels(extracted_data, energy_components, label_cols): 33 | # mean over the same value across container-level 34 | extracted_power_labels = extracted_data[[TIMESTAMP_COL] + label_cols].groupby([TIMESTAMP_COL]).mean().sort_index() 35 | for energy_component in energy_components: 36 | target_cols = [col for col in label_cols if col_to_component(col) == energy_component] 37 | component_label_col = get_label_power_colname(energy_component) 38 | extracted_power_labels[component_label_col] = extracted_power_labels[target_cols].sum(axis=1) 39 | return extracted_power_labels 40 | 41 | 42 | def find_correlations(energy_source, feature_power_data, power_columns, workload_features): 43 | power_data = feature_power_data.reset_index().groupby([TIMESTAMP_COL])[power_columns].mean() 44 | feature_data = feature_power_data.reset_index().groupby([TIMESTAMP_COL])[workload_features].sum() 45 | energy_components = PowerSourceMap[energy_source] 46 | target_cols = [col for col in power_columns if col_to_component(col) == energy_components[0]] 47 | process_power_data = power_data.copy() 48 | # mean over the same value across container-level 49 | process_power_over_ts = process_power_data[target_cols].reset_index().groupby([TIMESTAMP_COL]).sum() 50 | process_power_data[energy_source] = process_power_over_ts.sum(axis=1) 51 | # sum usage all 
container-level 52 | join_data = feature_data.join(process_power_data[energy_source]).dropna() 53 | corr = join_data.corr()[[energy_source]] 54 | return corr.drop(index=energy_source) 55 | -------------------------------------------------------------------------------- /src/kepler_model/train/extractor/smooth_extractor.py: -------------------------------------------------------------------------------- 1 | from kepler_model.util.train_types import SYSTEM_FEATURES, FeatureGroup, FeatureGroups 2 | 3 | from .extractor import DefaultExtractor, find_correlations 4 | 5 | 6 | class SmoothExtractor(DefaultExtractor): 7 | def __init__(self, smooth_window=30): 8 | self.smooth_window = smooth_window 9 | 10 | def get_name(self): 11 | return "smooth" 12 | 13 | # implement extract function 14 | def extract(self, query_results, energy_components, feature_group, energy_source, node_level, aggr=True, use_vm_metrics=False): 15 | feature_power_data, power_columns, _, features = super().extract( 16 | query_results, energy_components, feature_group, energy_source, node_level, aggr, use_vm_metrics=use_vm_metrics 17 | ) 18 | 19 | features = FeatureGroups[FeatureGroup[feature_group]] 20 | smoothed_data = feature_power_data.copy() 21 | workload_features = [feature for feature in features if feature not in SYSTEM_FEATURES] 22 | 23 | for col in list(workload_features) + list(power_columns): 24 | smoothed_data[col] = feature_power_data[col].rolling(window=self.smooth_window).mean() 25 | smoothed_data = smoothed_data.dropna() 26 | 27 | corr = find_correlations(energy_source, feature_power_data, power_columns, workload_features) 28 | 29 | return smoothed_data, power_columns, corr, features 30 | -------------------------------------------------------------------------------- /src/kepler_model/train/isolator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/src/kepler_model/train/isolator/__init__.py -------------------------------------------------------------------------------- /src/kepler_model/train/online_trainer.py: -------------------------------------------------------------------------------- 1 | # TODO: test 2 | import time 3 | 4 | from kepler_model.train.extractor import DefaultExtractor 5 | from kepler_model.train.isolator.isolator import MinIdleIsolator, ProfileBackgroundIsolator 6 | from kepler_model.train.pipeline import NewPipeline 7 | from kepler_model.train.profiler.profiler import load_all_profiles 8 | from kepler_model.train.prom.prom_query import PrometheusClient 9 | from kepler_model.util.config import get_config 10 | from kepler_model.util.loader import default_train_output_pipeline 11 | from kepler_model.util.prom_types import PROM_QUERY_INTERVAL, get_valid_feature_group_from_queries 12 | from kepler_model.util.train_types import FeatureGroups, PowerSourceMap 13 | 14 | SAMPLING_INTERVAL = get_config("SAMPLING_INTERVAL", PROM_QUERY_INTERVAL) 15 | 16 | 17 | default_trainers = ["GradientBoostingRegressorTrainer"] 18 | abs_trainer_names = default_trainers + [] 19 | dyn_trainer_names = default_trainers + [] 20 | 21 | 22 | def initial_pipelines(): 23 | target_energy_sources = PowerSourceMap.keys() 24 | valid_feature_groups = FeatureGroups.keys() 25 | profiles = load_all_profiles() 26 | profile_pipeline = NewPipeline( 27 | default_train_output_pipeline, 28 | abs_trainer_names, 29 | dyn_trainer_names, 30 | extractor=DefaultExtractor(), 31 | 
isolator=ProfileBackgroundIsolator(profiles), 32 | target_energy_sources=target_energy_sources, 33 | valid_feature_groups=valid_feature_groups, 34 | ) 35 | non_profile_pipeline = NewPipeline( 36 | default_train_output_pipeline, 37 | abs_trainer_names, 38 | dyn_trainer_names, 39 | extractor=DefaultExtractor(), 40 | isolator=MinIdleIsolator(), 41 | target_energy_sources=target_energy_sources, 42 | valid_feature_groups=valid_feature_groups, 43 | ) 44 | return profile_pipeline, non_profile_pipeline 45 | 46 | 47 | def run(): 48 | profile_pipeline, non_profile_pipeline = initial_pipelines() 49 | prom_client = PrometheusClient() 50 | while True: 51 | prom_client.query() 52 | query_results = prom_client.snapshot_query_result() 53 | valid_feature_groups = get_valid_feature_group_from_queries(query_results.keys()) 54 | for energy_source, energy_components in PowerSourceMap.items(): 55 | for feature_group in valid_feature_groups: 56 | success, _, _ = profile_pipeline.process(query_results, energy_components, energy_source, feature_group=feature_group) 57 | if not success: 58 | # failed to process with profile, try non_profile pipeline 59 | success, _, _ = non_profile_pipeline.process(query_results, energy_components, energy_source, feature_group=feature_group) 60 | if success: 61 | non_profile_pipeline.save_metadata() 62 | else: 63 | profile_pipeline.save_metadata() 64 | time.sleep(SAMPLING_INTERVAL) 65 | 66 | 67 | if __name__ == "__main__": 68 | run() 69 | -------------------------------------------------------------------------------- /src/kepler_model/train/profiler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/src/kepler_model/train/profiler/__init__.py -------------------------------------------------------------------------------- /src/kepler_model/train/profiler/generate_scaler.py: -------------------------------------------------------------------------------- 1 | ############################################################ 2 | ## 3 | ## generate_scaler 4 | ## generate a scaler for each node type from prom query 5 | ## 6 | ## ./python generate_scaler.py query_output_folder 7 | ## e.g., ./python generate_scaler.py ../tests/data/prom_output 8 | ## 9 | ## input must be a query output of loaded state 10 | ## 11 | ############################################################ 12 | 13 | # WARN: is this file used ? 
14 | 15 | import os 16 | import pickle 17 | 18 | import pandas as pd 19 | from sklearn.preprocessing import MaxAbsScaler 20 | 21 | from kepler_model.train import DefaultExtractor 22 | from kepler_model.util.prom_types import TIMESTAMP_COL, node_info_column 23 | from kepler_model.util.train_types import SYSTEM_FEATURES, FeatureGroup, FeatureGroups 24 | 25 | # WARN: unable to find this anymore 26 | # from profile_background import profile_path 27 | 28 | 29 | # HACK: 30 | extractor = DefaultExtractor() 31 | profile_path = "profile/path" 32 | max_scaler_top_path = os.path.join(profile_path, "..", "max_scaler") 33 | 34 | if not os.path.exists(max_scaler_top_path): 35 | os.mkdir(max_scaler_top_path) 36 | 37 | 38 | def read_query_results(query_path): 39 | results = dict() 40 | metric_filenames = [metric_filename for metric_filename in os.listdir(query_path)] 41 | for metric_filename in metric_filenames: 42 | metric = metric_filename.replace(".csv", "") 43 | filepath = os.path.join(query_path, metric_filename) 44 | results[metric] = pd.read_csv(filepath) 45 | return results 46 | 47 | 48 | def save_scaler(scaler, node_type, feature_group, scaler_top_path): 49 | node_type_path = os.path.join(scaler_top_path, str(node_type)) 50 | if not os.path.exists(node_type_path): 51 | os.mkdir(node_type_path) 52 | filename = os.path.join(node_type_path, feature_group + ".pkl") 53 | with open(filename, "wb") as f: 54 | pickle.dump(scaler, f) 55 | 56 | 57 | def process(query_results): 58 | node_info_data = extractor.get_system_category(query_results) 59 | if node_info_data is None: 60 | print("No Node Info") 61 | return None 62 | node_types = pd.unique(node_info_data[node_info_column]) 63 | for node_type in node_types: 64 | for feature_group in FeatureGroups: 65 | feature_group_name = feature_group.name 66 | features = FeatureGroups[FeatureGroup[feature_group_name]] 67 | workload_features = [feature for feature in features if feature not in SYSTEM_FEATURES] 68 | system_features = [feature for feature in features if feature in SYSTEM_FEATURES] 69 | feature_data = extractor.get_workload_feature_data(query_results, workload_features) 70 | if feature_data is None: 71 | print("cannot process ", feature_group_name) 72 | continue 73 | workload_feature_data = feature_data.groupby([TIMESTAMP_COL]).sum()[workload_features] 74 | if len(system_features) > 0: 75 | system_feature_data = extractor.get_system_feature_data(query_results, system_features) 76 | feature_data = workload_feature_data.join(system_feature_data).sort_index().dropna() 77 | else: 78 | feature_data = workload_feature_data 79 | 80 | feature_data = feature_data.join(node_info_data) 81 | node_types = pd.unique(feature_data[node_info_column]) 82 | # filter and extract features 83 | x_values = feature_data[feature_data[node_info_column] == node_type][features].values 84 | max_scaler = MaxAbsScaler() 85 | max_scaler.fit(x_values) 86 | save_scaler(max_scaler, node_type, feature_group_name, max_scaler_top_path) 87 | -------------------------------------------------------------------------------- /src/kepler_model/train/prom/__init__.py: -------------------------------------------------------------------------------- 1 | from .prom_query import PrometheusClient 2 | 3 | __all__ = ["PrometheusClient"] 4 | -------------------------------------------------------------------------------- /src/kepler_model/train/prom/prom_query.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from prometheus_api_client import 
PrometheusConnect 4 | 5 | from kepler_model.util.prom_types import ( 6 | PROM_QUERY_INTERVAL, 7 | PROM_QUERY_STEP, 8 | PROM_SERVER, 9 | PROM_SSL_DISABLE, 10 | generate_dataframe_from_response, 11 | metric_prefix, 12 | ) 13 | 14 | UTC_OFFSET_TIMEDELTA = datetime.datetime.utcnow() - datetime.datetime.now() 15 | 16 | 17 | def _range_queries(prom, metric_list, start, end, step, params=None): 18 | response = dict() 19 | for metric in metric_list: 20 | response[metric] = prom.custom_query_range(metric, start, end, step, params) 21 | return response 22 | 23 | 24 | class PrometheusClient: 25 | def __init__(self): 26 | self.prom = PrometheusConnect(url=PROM_SERVER, disable_ssl=PROM_SSL_DISABLE) 27 | self.interval = PROM_QUERY_INTERVAL 28 | self.step = PROM_QUERY_STEP 29 | self.latest_query_result = dict() 30 | 31 | def query(self): 32 | available_metrics = self.prom.all_metrics() 33 | queries = [m for m in available_metrics if metric_prefix in m] 34 | end = datetime.datetime.now() 35 | start = end - datetime.timedelta(seconds=self.interval) 36 | self.latest_query_result = dict() 37 | response_dict = _range_queries(self.prom, queries, start, end, self.step, None) 38 | for query_metric, prom_response in response_dict.items(): 39 | self.latest_query_result[query_metric] = generate_dataframe_from_response(query_metric, prom_response) 40 | return response_dict 41 | 42 | def snapshot_query_result(self): 43 | return {metric: data for metric, data in self.latest_query_result.items() if len(data) > 0} 44 | -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/ExponentialRegressionTrainer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/src/kepler_model/train/trainer/ExponentialRegressionTrainer/__init__.py -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/ExponentialRegressionTrainer/main.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import numpy as np 4 | 5 | from kepler_model.train.trainer.curvefit import CurveFitModel, CurveFitTrainer 6 | 7 | 8 | def p0_func(x, y): 9 | a = (y.max() - y.min()) // math.e # scale value 10 | b = 1 # start from linear 11 | c = y.min() - a # initial offset 12 | return [a, b, c] 13 | 14 | 15 | def expo_func(x, a, b, c): 16 | y = a * np.exp(b * x) + c 17 | return y 18 | 19 | 20 | class ExponentialRegressionTrainer(CurveFitTrainer): 21 | def __init__(self, energy_components, feature_group, energy_source, node_level, pipeline_name): 22 | super(ExponentialRegressionTrainer, self).__init__(energy_components, feature_group, energy_source, node_level, pipeline_name=pipeline_name) 23 | self.fe_files = [] 24 | 25 | def init_model(self): 26 | return CurveFitModel(expo_func, p0_func=p0_func) 27 | -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/GradientBoostingRegressorTrainer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/src/kepler_model/train/trainer/GradientBoostingRegressorTrainer/__init__.py -------------------------------------------------------------------------------- 
/src/kepler_model/train/trainer/GradientBoostingRegressorTrainer/main.py: -------------------------------------------------------------------------------- 1 | from sklearn.ensemble import GradientBoostingRegressor 2 | 3 | from kepler_model.train.trainer.scikit import ScikitTrainer 4 | 5 | model_class = "scikit" 6 | 7 | 8 | class GradientBoostingRegressorTrainer(ScikitTrainer): 9 | def __init__(self, energy_components, feature_group, energy_source, node_level, pipeline_name): 10 | super(GradientBoostingRegressorTrainer, self).__init__(energy_components, feature_group, energy_source, node_level, pipeline_name=pipeline_name) 11 | self.fe_files = [] 12 | 13 | def init_model(self): 14 | return GradientBoostingRegressor(n_estimators=100, max_depth=3, learning_rate=0.1) 15 | -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/KNeighborsRegressorTrainer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/src/kepler_model/train/trainer/KNeighborsRegressorTrainer/__init__.py -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/KNeighborsRegressorTrainer/main.py: -------------------------------------------------------------------------------- 1 | from sklearn.neighbors import KNeighborsRegressor 2 | 3 | from kepler_model.train.trainer.scikit import ScikitTrainer 4 | 5 | model_class = "scikit" 6 | 7 | 8 | class KNeighborsRegressorTrainer(ScikitTrainer): 9 | def __init__(self, energy_components, feature_group, energy_source, node_level, pipeline_name): 10 | super(KNeighborsRegressorTrainer, self).__init__(energy_components, feature_group, energy_source, node_level, pipeline_name=pipeline_name) 11 | self.fe_files = [] 12 | 13 | def init_model(self): 14 | return KNeighborsRegressor(n_neighbors=6) 15 | -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/LinearRegressionTrainer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/src/kepler_model/train/trainer/LinearRegressionTrainer/__init__.py -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/LinearRegressionTrainer/main.py: -------------------------------------------------------------------------------- 1 | from sklearn.linear_model import LinearRegression 2 | 3 | from kepler_model.train.trainer.scikit import ScikitTrainer 4 | 5 | model_class = "scikit" 6 | 7 | 8 | class LinearRegressionTrainer(ScikitTrainer): 9 | def __init__(self, energy_components, feature_group, energy_source, node_level, pipeline_name): 10 | super(LinearRegressionTrainer, self).__init__(energy_components, feature_group, energy_source, node_level, pipeline_name=pipeline_name) 11 | self.fe_files = [] 12 | 13 | def init_model(self): 14 | return LinearRegression(positive=True) 15 | -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/LogarithmicRegressionTrainer/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/src/kepler_model/train/trainer/LogarithmicRegressionTrainer/__init__.py -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/LogarithmicRegressionTrainer/main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from kepler_model.train.trainer.curvefit import CurveFitModel, CurveFitTrainer 4 | 5 | 6 | def p0_func(x, y): 7 | a = y.max() - y.min() 8 | b = 1 9 | c = y.min() 10 | return [a, b, c] 11 | 12 | 13 | def log_func(x, a, b, c): 14 | y = a * np.log(b * x + 1) + c 15 | return y 16 | 17 | 18 | class LogarithmicRegressionTrainer(CurveFitTrainer): 19 | def __init__(self, energy_components, feature_group, energy_source, node_level, pipeline_name): 20 | super(LogarithmicRegressionTrainer, self).__init__(energy_components, feature_group, energy_source, node_level, pipeline_name=pipeline_name) 21 | self.fe_files = [] 22 | 23 | def init_model(self): 24 | return CurveFitModel(log_func, p0_func=p0_func) 25 | -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/LogisticRegressionTrainer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/src/kepler_model/train/trainer/LogisticRegressionTrainer/__init__.py -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/LogisticRegressionTrainer/main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from kepler_model.train.trainer.curvefit import CurveFitModel, CurveFitTrainer 4 | 5 | 6 | def p0_func(x, y): 7 | A = y.max() - y.min() # value range 8 | x0 = 0.5 # sigmoid mid point (as normalized value is in 0 to 1, start mid point = 0.5) 9 | k = A // np.std(y) # growth rate (larger std, lower growth) 10 | off = y.min() # initial offset 11 | return [A, x0, k, off] 12 | 13 | 14 | def logi_func(x, A, x0, k, off): 15 | return A / (1 + np.exp(-k * (x - x0))) + off 16 | 17 | 18 | class LogisticRegressionTrainer(CurveFitTrainer): 19 | def __init__(self, energy_components, feature_group, energy_source, node_level, pipeline_name): 20 | super(LogisticRegressionTrainer, self).__init__(energy_components, feature_group, energy_source, node_level, pipeline_name=pipeline_name) 21 | self.fe_files = [] 22 | 23 | def init_model(self): 24 | return CurveFitModel(logi_func, p0_func=p0_func) 25 | -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/PolynomialRegressionTrainer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/src/kepler_model/train/trainer/PolynomialRegressionTrainer/__init__.py -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/PolynomialRegressionTrainer/main.py: -------------------------------------------------------------------------------- 1 | from sklearn.linear_model import LinearRegression 2 | from sklearn.preprocessing import PolynomialFeatures 3 | 4 | from kepler_model.train.trainer.scikit import 
ScikitTrainer 5 | 6 | poly_scaler_filename = "poly_scaler.pkl" 7 | 8 | 9 | class PolynomialRegressionTrainer(ScikitTrainer): 10 | def __init__(self, energy_components, feature_group, energy_source, node_level, pipeline_name): 11 | super(PolynomialRegressionTrainer, self).__init__(energy_components, feature_group, energy_source, node_level, pipeline_name=pipeline_name) 12 | self.poly_scaler = PolynomialFeatures(degree=2) 13 | self.fe_files = [poly_scaler_filename] 14 | self.fe = [PolynomialFeatures(degree=2)] 15 | 16 | def init_model(self): 17 | return LinearRegression() 18 | -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/SGDRegressorTrainer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/src/kepler_model/train/trainer/SGDRegressorTrainer/__init__.py -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/SGDRegressorTrainer/main.py: -------------------------------------------------------------------------------- 1 | from sklearn.linear_model import SGDRegressor 2 | 3 | from kepler_model.train.trainer.scikit import ScikitTrainer 4 | 5 | 6 | class SGDRegressorTrainer(ScikitTrainer): 7 | def __init__(self, energy_components, feature_group, energy_source, node_level, pipeline_name): 8 | super(SGDRegressorTrainer, self).__init__(energy_components, feature_group, energy_source, node_level, pipeline_name=pipeline_name) 9 | self.fe_files = [] 10 | 11 | def init_model(self): 12 | return SGDRegressor(max_iter=1000) 13 | -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/SVRRegressorTrainer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/src/kepler_model/train/trainer/SVRRegressorTrainer/__init__.py -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/SVRRegressorTrainer/main.py: -------------------------------------------------------------------------------- 1 | from sklearn.svm import SVR 2 | 3 | from kepler_model.train.trainer.scikit import ScikitTrainer 4 | 5 | common_node_type = 1 6 | 7 | 8 | class SVRRegressorTrainer(ScikitTrainer): 9 | def __init__(self, energy_components, feature_group, energy_source, node_level, pipeline_name): 10 | super(SVRRegressorTrainer, self).__init__(energy_components, feature_group, energy_source, node_level, pipeline_name=pipeline_name) 11 | self.fe_files = [] 12 | 13 | def init_model(self): 14 | return SVR(C=1.0, epsilon=0.2) 15 | -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/XGBoostTrainer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/src/kepler_model/train/trainer/XGBoostTrainer/__init__.py -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/XgboostFitTrainer/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/src/kepler_model/train/trainer/XgboostFitTrainer/__init__.py -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/XgboostFitTrainer/main.py: -------------------------------------------------------------------------------- 1 | from kepler_model.train.trainer.xgboost_interface import XgboostTrainer 2 | 3 | 4 | class XgboostFitTrainer(XgboostTrainer): 5 | def __init__(self, energy_components, feature_group, energy_source, node_level, pipeline_name): 6 | super(XgboostFitTrainer, self).__init__(energy_components, feature_group, energy_source, node_level, pipeline_name=pipeline_name) 7 | self.fe_files = [] 8 | 9 | def _train(self, node_type, component, X_values, y_values): 10 | model = self.node_models[node_type][component] 11 | if model.__sklearn_is_fitted__(): 12 | self.node_models[node_type][component].fit(X_values, y_values, xgb_model=model) 13 | else: 14 | self.node_models[node_type][component].fit(X_values, y_values) 15 | -------------------------------------------------------------------------------- /src/kepler_model/util/__init__.py: -------------------------------------------------------------------------------- 1 | # commonly-used definitions 2 | from .config import get_config, model_toppath 3 | from .loader import ( 4 | class_to_json, 5 | default_train_output_pipeline, 6 | list_model_names, 7 | load_csv, 8 | load_json, 9 | load_metadata, 10 | load_pkl, 11 | load_remote_pkl, 12 | load_scaler, 13 | load_weight, 14 | version, 15 | ) 16 | from .prom_types import get_valid_feature_group_from_queries 17 | from .saver import assure_path, save_csv, save_json, save_metadata, save_pkl, save_scaler, save_weight 18 | from .train_types import ( 19 | BPF_FEATURES, 20 | COUNTER_FEAUTRES, 21 | IRQ_FEATURES, 22 | SYSTEM_FEATURES, 23 | WORKLOAD_FEATURES, 24 | FeatureGroup, 25 | FeatureGroups, 26 | ModelOutputType, 27 | PowerSourceMap, 28 | get_feature_group, 29 | ) 30 | 31 | __all__ = [ 32 | "load_json", 33 | "load_csv", 34 | "load_pkl", 35 | "load_metadata", 36 | "load_scaler", 37 | "load_weight", 38 | "load_remote_pkl", 39 | "list_model_names", 40 | "default_train_output_pipeline", 41 | "class_to_json", 42 | "version", 43 | "assure_path", 44 | "save_csv", 45 | "save_json", 46 | "save_pkl", 47 | "save_metadata", 48 | "save_scaler", 49 | "save_weight", 50 | "get_config", 51 | "model_toppath", 52 | "SYSTEM_FEATURES", 53 | "COUNTER_FEAUTRES", 54 | "BPF_FEATURES", 55 | "IRQ_FEATURES", 56 | "WORKLOAD_FEATURES", 57 | "PowerSourceMap", 58 | "FeatureGroup", 59 | "FeatureGroups", 60 | "ModelOutputType", 61 | "get_feature_group", 62 | "get_valid_feature_group_from_queries", 63 | ] 64 | -------------------------------------------------------------------------------- /src/kepler_model/util/extract_types.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .prom_types import TIMESTAMP_COL, pkg_id_column 4 | from .train_types import PowerSourceMap 5 | 6 | container_id_colname = "id" 7 | all_container_key = "all containers" 8 | accelerator_type_colname = "type" 9 | 10 | node_level_index = [TIMESTAMP_COL] 11 | pkg_level_index = [TIMESTAMP_COL, pkg_id_column] 12 | container_level_index = [TIMESTAMP_COL, container_id_colname] 13 | 14 | 15 | def component_to_col(component, unit_col=None, unit_val=None): 16 | power_colname = f"{component}_power" 17 | if unit_col is None: 18 | 
return power_colname 19 | return f"{unit_col}_{unit_val}_{power_colname}" 20 | 21 | 22 | def col_to_component(component_col): 23 | splits = component_col.split("_") 24 | component = splits[-2:][0] 25 | if component == "dynamic" or component == "background": 26 | return splits[-3:][0] 27 | return component 28 | 29 | 30 | def col_to_unit_val(component_col): 31 | return component_col.split("_")[-3:][0] 32 | 33 | 34 | def ratio_to_col(unit_val): 35 | return f"packge_ratio_{unit_val}" 36 | 37 | 38 | def get_unit_vals(power_columns): 39 | return np.unique([col_to_unit_val(col) for col in power_columns if "package" in col]) 40 | 41 | 42 | def get_num_of_unit(energy_source, label_cols): 43 | energy_components = PowerSourceMap[energy_source] 44 | num_of_unit = len(label_cols) / len(energy_components) 45 | return num_of_unit 46 | 47 | 48 | def get_expected_power_columns(energy_components, num_of_unit=1): 49 | # TODO: if ratio applied, 50 | # return [component_to_col(component, "package", unit_val) for component in energy_components for unit_val in range(0,num_of_unit)] 51 | return [component_to_col(component) for component in energy_components] 52 | -------------------------------------------------------------------------------- /src/kepler_model/util/format.py: -------------------------------------------------------------------------------- 1 | def print_bounded_multiline_message(input_lines, maxlength=200): 2 | lines = [] 3 | for line in input_lines: 4 | i = 0 5 | while len(line) > maxlength: 6 | lines += [line[0:maxlength]] 7 | line = line[maxlength:] 8 | i = maxlength 9 | if len(line) > 0: 10 | lines += [line] 11 | 12 | max_line_length = max(len(line) for line in lines) 13 | border = "#" * (max_line_length + 4) 14 | print(border) 15 | 16 | for line in lines: 17 | formatted_line = f"# {line.ljust(max_line_length)} #" 18 | print(formatted_line) 19 | 20 | print(border) 21 | 22 | 23 | from datetime import datetime 24 | 25 | 26 | def time_to_str(time): 27 | if isinstance(time, datetime): 28 | return time.strftime("%Y-%m-%d %H:%M:%S") 29 | return time 30 | -------------------------------------------------------------------------------- /src/kepler_model/util/saver.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | import joblib 5 | 6 | METADATA_FILENAME = "metadata" 7 | SCALER_FILENAME = "scaler" 8 | WEIGHT_FILENAME = "weight" 9 | TRAIN_ARGS_FILENAME = "train_arguments" 10 | NODE_TYPE_INDEX_FILENAME = "node_type_index" 11 | 12 | MACHINE_SPEC_PATH = "machine_spec" 13 | 14 | 15 | def _pipeline_model_metadata_filename(energy_source, model_type): 16 | return f"{energy_source}_{model_type}_model_metadata" 17 | 18 | 19 | def _power_curve_filename(energy_source, model_type): 20 | return f"{energy_source}_{model_type}_power_curve" 21 | 22 | 23 | def assure_path(path): 24 | if path == "": 25 | return "" 26 | if not os.path.exists(path): 27 | os.makedirs(path, exist_ok=True) 28 | return path 29 | 30 | 31 | def save_json(path, name, data): 32 | if name.endswith(".json") is False: 33 | name = name + ".json" 34 | 35 | assure_path(path) 36 | filename = os.path.join(path, name) 37 | with open(filename, "w") as f: 38 | json.dump(data, f) 39 | return name 40 | 41 | 42 | def save_pkl(path, name, data): 43 | if ".pkl" not in name: 44 | name = name + ".pkl" 45 | assure_path(path) 46 | filename = os.path.join(path, name) 47 | joblib.dump(data, filename) 48 | return name 49 | 50 | 51 | def save_csv(path, name, data): 52 | if ".csv" not in name: 53 |
name = name + ".csv" 54 | assure_path(path) 55 | filename = os.path.join(path, name) 56 | data.to_csv(filename) 57 | return name 58 | 59 | 60 | def save_machine_spec(data_path, machine_id, spec): 61 | machine_spec_path = os.path.join(data_path, MACHINE_SPEC_PATH) 62 | assure_path(machine_spec_path) 63 | save_json(machine_spec_path, machine_id, spec.get_json()) 64 | 65 | 66 | def save_node_type_index(pipeline_path, node_type_index): 67 | return save_json(pipeline_path, NODE_TYPE_INDEX_FILENAME, node_type_index) 68 | 69 | 70 | def save_metadata(model_path, metadata): 71 | return save_json(model_path, METADATA_FILENAME, metadata) 72 | 73 | 74 | def save_train_args(pipeline_path, args): 75 | return save_json(pipeline_path, TRAIN_ARGS_FILENAME, args) 76 | 77 | 78 | def save_scaler(model_path, scaler): 79 | return save_pkl(model_path, SCALER_FILENAME, scaler) 80 | 81 | 82 | def save_weight(model_path, weight): 83 | return save_json(model_path, WEIGHT_FILENAME, weight) 84 | 85 | 86 | def save_pipeline_metadata(pipeline_path, pipeline_metadata, energy_source, model_type, metadata_df): 87 | save_metadata(pipeline_path, pipeline_metadata) 88 | pipeline_model_metadata_filename = _pipeline_model_metadata_filename(energy_source, model_type) 89 | return save_csv(pipeline_path, pipeline_model_metadata_filename, metadata_df) 90 | 91 | 92 | def save_profile(profile_path, source, profile): 93 | profile_filename = os.path.join(profile_path, source + ".json") 94 | with open(profile_filename, "w") as f: 95 | json.dump(profile, f) 96 | -------------------------------------------------------------------------------- /src/kepler_model/util/similarity.py: -------------------------------------------------------------------------------- 1 | from .train_types import NodeAttribute 2 | 3 | # simplified weights 4 | # TODO: experimental support for deciding the weight 5 | similarity_reference = { 6 | NodeAttribute.PROCESSOR: 5, 7 | NodeAttribute.CORES: 1, 8 | NodeAttribute.CHIPS: 1, 9 | NodeAttribute.MEMORY: 0.5, 10 | NodeAttribute.FREQ: 0.5, 11 | } 12 | 13 | similarity_total_weight = sum(similarity_reference.values()) 14 | 15 | 16 | def get_similarity_weight(attr): 17 | return similarity_reference[attr] / similarity_total_weight 18 | 19 | 20 | def compute_jaccard_similarity(str1: str, str2: str) -> float: 21 | if str1.lower() == str2.lower(): # including the case of both are empty 22 | return 1 23 | if len(str1) == 0 or len(str2) == 0: 24 | return 0 25 | set1 = set(str1.lower()) # Convert to lowercase for case-insensitive comparison 26 | set2 = set(str2.lower()) 27 | 28 | intersection = len(set1.intersection(set2)) 29 | union = len(set1.union(set2)) 30 | 31 | similarity = intersection / union * 0.5 32 | return similarity 33 | 34 | 35 | def compute_similarity(base: float, cmp: float) -> float: 36 | base = float(base) 37 | cmp = float(cmp) 38 | diff_ratio = 0 39 | if base > 0 or cmp > 0: 40 | diff_ratio = abs(cmp - base) / ((base + cmp) / 2) 41 | if diff_ratio >= 1: 42 | return 0 43 | else: 44 | return 1 - diff_ratio 45 | 46 | 47 | def compute_looseness(similarity): 48 | return 1 - similarity 49 | 50 | 51 | # get_candidate_score returns certainty 52 | def get_candidate_score(candidate_uncertain_attribute_freq, candidate_uncertain_attribute_total): 53 | candidate_score = dict() 54 | for attr, candidates in candidate_uncertain_attribute_freq.items(): 55 | total = candidate_uncertain_attribute_total[attr] 56 | if total == 0: 57 | # no uncertainty 58 | continue 59 | for candidate in candidates: 60 | candidate_index = 
candidate[0] 61 | candidate_freq = candidate[1] 62 | if candidate_index not in candidate_score: 63 | candidate_score[candidate_index] = 0 64 | candidate_score[candidate_index] += float(candidate_freq) / total 65 | return candidate_score 66 | 67 | 68 | def find_best_candidate(candidate_score): 69 | max_score = 0 70 | best_candidate_index = -1 71 | for index, score in candidate_score.items(): 72 | if score > max_score: 73 | best_candidate_index = index 74 | max_score = score 75 | return best_candidate_index, max_score 76 | 77 | 78 | def compute_uncertainty(max_score, num_of_none): 79 | if num_of_none == 0: 80 | return 0 # covered 81 | uncertainty = 1 - max_score / num_of_none 82 | return uncertainty 83 | 84 | 85 | def get_num_of_none(in_spec): 86 | num_of_none = 0 87 | for attr in NodeAttribute: 88 | if in_spec.attrs[attr] is None: 89 | num_of_none += 1 90 | return num_of_none 91 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/tests/__init__.py -------------------------------------------------------------------------------- /tests/client_load_tester.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from estimator import SERVE_SOCKET 4 | from estimator_model_test import generate_request, model_names 5 | from estimator_power_request_test import Client 6 | 7 | loads = range(10, 11, 10) 8 | duration = 120 9 | 10 | if __name__ == "__main__": 11 | client = Client(SERVE_SOCKET) 12 | for model_name in model_names: 13 | for load in loads: 14 | request_json = generate_request(model_name, load) 15 | start_time = time.time() 16 | client.make_request(request_json) 17 | elapsed_time = time.time() - start_time 18 | output = f"{model_name},{load},{elapsed_time}" 19 | print(output) 20 | time.sleep(1) 21 | -------------------------------------------------------------------------------- /tests/data/machine/spec.json: -------------------------------------------------------------------------------- 1 | {"processor": "intel_xeon_platinum_8259cl", "cores": 96, "chips": 2, "memory": 377, "frequency": 3500} 2 | -------------------------------------------------------------------------------- /tests/data/node_type_index.json: -------------------------------------------------------------------------------- 1 | {"0": {"attrs": {"processor": "intel_xeon_platinum_8259cl", "cores": 96, "chips": 2, "memory": 377, "frequency": 3500}, "members": ["m5.metal-ami-0e4d0bb9670ea8db0"]}, "1": {"attrs": {"processor": "intel_xeon_e5_2686v4", "cores": 72, "chips": 2, "memory": 503, "frequency": 3000}, "members": ["i3.metal-ami-0e4d0bb9670ea8db0"]}, "2": {"attrs": {"processor": "intel_xeon_platinum_8275cl", "cores": 96, "chips": 2, "memory": 188, "frequency": 3900}, "members": ["c5.metal-ami-0e4d0bb9670ea8db0"]}, "3": {"attrs": {"processor": "intel_xeon_platinum_8259cl", "cores": 96, "chips": 2, "memory": 755, "frequency": 3500}, "members": ["r5.metal-ami-0e4d0bb9670ea8db0"]}, "4": {"attrs": {"processor": "intel_xeon_platinum_8252c", "cores": 48, "chips": 2, "memory": 188, "frequency": 4500}, "members": ["m5zn.metal-ami-0e4d0bb9670ea8db0"]}, "5": {"attrs": {"processor": "intel_xeon_platinum_8488c", "cores": 96, "chips": 1, "memory": 377, "frequency": 3800}, "members": ["m7i.metal-24xl-ami-0e4d0bb9670ea8db0"]}} 2 | 
-------------------------------------------------------------------------------- /tests/estimator_power_request_test.py: -------------------------------------------------------------------------------- 1 | import json 2 | import socket 3 | 4 | from kepler_model.util.config import SERVE_SOCKET 5 | from kepler_model.util.train_types import ( 6 | CATEGORICAL_LABEL_TO_VOCAB, 7 | SYSTEM_FEATURES, 8 | WORKLOAD_FEATURES, 9 | ModelOutputType, 10 | ) 11 | from tests.extractor_test import test_energy_source 12 | 13 | trainer_names = ["SGDRegressorTrainer"] 14 | test_energy_sources = ["acpi", "rapl-sysfs"] 15 | 16 | 17 | def generate_request( 18 | train_name, n=1, metrics=WORKLOAD_FEATURES, system_features=SYSTEM_FEATURES, output_type=ModelOutputType.DynPower.name, energy_source=test_energy_source 19 | ): 20 | request_json = dict() 21 | if train_name is not None: 22 | request_json["trainer_name"] = train_name 23 | request_json["metrics"] = metrics 24 | request_json["system_features"] = system_features 25 | request_json["system_values"] = [] 26 | for m in system_features: 27 | request_json["system_values"] += [CATEGORICAL_LABEL_TO_VOCAB[m][0]] 28 | request_json["values"] = [[1.0] * len(metrics)] * n 29 | request_json["output_type"] = output_type 30 | request_json["source"] = energy_source 31 | return request_json 32 | 33 | 34 | def process(client, energy_source): 35 | request_json = generate_request(trainer_names[0], 2, output_type="AbsPower", energy_source=energy_source) 36 | res = client.make_request(request_json) 37 | res_json = json.loads(res) 38 | print(res_json) 39 | assert res_json["msg"] == "", "response error: {}".format(res_json["msg"]) 40 | assert len(res_json["powers"]) > 0, "zero powers" 41 | 42 | 43 | class Client: 44 | def __init__(self, socket_path): 45 | self.socket_path = socket_path 46 | 47 | def make_request(self, request_json): 48 | s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) 49 | s.connect(self.socket_path) 50 | data = json.dumps(request_json) 51 | print(data) 52 | s.send(data.encode()) 53 | data = b"" 54 | while True: 55 | shunk = s.recv(1024).strip() 56 | data += shunk 57 | if shunk is None or len(shunk.decode()) == 0 or shunk.decode()[-1] == "}": 58 | break 59 | decoded_data = data.decode() 60 | s.close() 61 | return decoded_data 62 | 63 | 64 | def test_estimator_power_request(): 65 | client = Client(SERVE_SOCKET) 66 | for energy_source in test_energy_sources: 67 | process(client, energy_source) 68 | 69 | 70 | if __name__ == "__main__": 71 | test_estimator_power_request() 72 | -------------------------------------------------------------------------------- /tests/http_server.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | import http.server 3 | import os 4 | import socketserver 5 | import threading 6 | 7 | from kepler_model.util.config import model_toppath 8 | 9 | 10 | def cleanup_task(server): 11 | print("Shutdown server...") 12 | server.shutdown() 13 | 14 | 15 | def get_server(file_server_port): 16 | Handler = http.server.SimpleHTTPRequestHandler 17 | httpd = socketserver.TCPServer(("", file_server_port), Handler) 18 | 19 | # Register the cleanup task to be executed on program exit 20 | atexit.register(cleanup_task, httpd) 21 | 22 | print("Http File Serve Serving at Port", file_server_port, " for ", model_toppath) 23 | return httpd 24 | 25 | 26 | def http_file_server(file_server_port): 27 | try: 28 | httpd = get_server(file_server_port) 29 | # Start the server in a separate thread 30 | server_thread = 
threading.Thread(target=httpd.serve_forever) 31 | server_thread.daemon = True 32 | server_thread.start() 33 | except Exception as err: 34 | print(f"File server is running: {err}") 35 | 36 | 37 | def run(): 38 | os.chdir(model_toppath) 39 | httpd = get_server(8110) 40 | httpd.serve_forever() 41 | 42 | 43 | if __name__ == "__main__": 44 | run() 45 | -------------------------------------------------------------------------------- /tests/minimal_trainer.py: -------------------------------------------------------------------------------- 1 | from pipeline_test import process 2 | 3 | from kepler_model.util import FeatureGroup 4 | 5 | trainer_names = ["GradientBoostingRegressorTrainer", "SGDRegressorTrainer", "XgboostFitTrainer"] 6 | valid_feature_groups = [FeatureGroup.BPFOnly] 7 | 8 | if __name__ == "__main__": 9 | process( 10 | target_energy_sources=["acpi", "rapl-sysfs"], 11 | abs_trainer_names=trainer_names, 12 | dyn_trainer_names=trainer_names, 13 | valid_feature_groups=valid_feature_groups, 14 | ) 15 | -------------------------------------------------------------------------------- /tests/pipeline_test.py: -------------------------------------------------------------------------------- 1 | from kepler_model.train import NewPipeline, NodeTypeSpec 2 | from kepler_model.util import PowerSourceMap, get_valid_feature_group_from_queries 3 | from kepler_model.util.loader import default_node_type, default_train_output_pipeline 4 | from tests.extractor_test import test_energy_source, test_extractors 5 | from tests.isolator_test import test_isolators 6 | from tests.prom_test import get_query_results, prom_output_filename, prom_output_path 7 | from tests.trainer_test import assert_train, test_trainer_names 8 | 9 | # fake spec value 10 | spec_values = {"processor": "test", "cores": 1, "chips": 1, "memory": -1, "frequency": -1} 11 | spec = NodeTypeSpec(**spec_values) 12 | 13 | test_energy_sources = ["acpi", "rapl-sysfs"] 14 | 15 | 16 | def assert_pipeline(pipeline, query_results, feature_group, energy_source, energy_components): 17 | success, abs_data, dyn_data = pipeline.process( 18 | query_results, energy_components, energy_source, feature_group=feature_group.name, replace_node_type=default_node_type 19 | ) 20 | assert success, f"failed to process pipeline {pipeline.name}" 21 | for trainer in pipeline.trainers: 22 | if trainer.feature_group == feature_group and trainer.energy_source == energy_source: 23 | if trainer.node_level: 24 | assert_train(trainer, abs_data, energy_components) 25 | else: 26 | assert_train(trainer, dyn_data, energy_components) 27 | 28 | 29 | def process( 30 | save_pipeline_name=default_train_output_pipeline, 31 | prom_save_path=prom_output_path, 32 | prom_save_name=prom_output_filename, 33 | abs_trainer_names=test_trainer_names, 34 | dyn_trainer_names=test_trainer_names, 35 | extractors=test_extractors, 36 | isolators=test_isolators, 37 | target_energy_sources=[test_energy_source], 38 | valid_feature_groups=None, 39 | ): 40 | query_results = get_query_results(save_path=prom_save_path, save_name=prom_save_name) 41 | if valid_feature_groups is None: 42 | valid_feature_groups = get_valid_feature_group_from_queries(query_results.keys()) 43 | for extractor in extractors: 44 | for isolator in isolators: 45 | pipeline = NewPipeline( 46 | save_pipeline_name, 47 | abs_trainer_names, 48 | dyn_trainer_names, 49 | extractor=extractor, 50 | isolator=isolator, 51 | target_energy_sources=target_energy_sources, 52 | valid_feature_groups=valid_feature_groups, 53 | ) 54 | global spec 55 | 
pipeline.node_collection.index_train_machine("test", spec) 56 | for energy_source in target_energy_sources: 57 | energy_components = PowerSourceMap[energy_source] 58 | for feature_group in valid_feature_groups: 59 | assert_pipeline(pipeline, query_results, feature_group, energy_source, energy_components) 60 | # save metadata 61 | pipeline.save_metadata() 62 | # save node collection 63 | pipeline.node_collection.save() 64 | # save pipeline 65 | pipeline.archive_pipeline() 66 | 67 | 68 | def test_process(): 69 | process(target_energy_sources=test_energy_sources) 70 | -------------------------------------------------------------------------------- /tests/prom_test.py: -------------------------------------------------------------------------------- 1 | # prom_test.py 2 | # - prom_client.query 3 | # - prom_client.snapshot_query_result 4 | # 5 | # save response to prom_output_path/prom_output_filename.json 6 | # 7 | # To use output: 8 | # from prom_test import get_prom_output 9 | # response = get_prom_response() 10 | # or 11 | # query_result = get_query_results() 12 | 13 | import os 14 | 15 | from kepler_model.train.prom import PrometheusClient 16 | from kepler_model.util import load_json, save_json 17 | from kepler_model.util.prom_types import prom_responses_to_results 18 | 19 | prom_output_path = os.path.join(os.path.dirname(__file__), "data", "prom_output") 20 | prom_output_filename = "prom_response" 21 | 22 | 23 | def get_prom_response(save_path=prom_output_path, save_name=prom_output_filename): 24 | return load_json(save_path, save_name) 25 | 26 | 27 | def get_query_results(save_path=prom_output_path, save_name=prom_output_filename): 28 | response = get_prom_response(save_path=save_path, save_name=save_name) 29 | return prom_responses_to_results(response) 30 | 31 | 32 | def process(save_path=prom_output_path, save_name=prom_output_filename, server=None, interval=None, step=None): 33 | if server is not None: 34 | os.environ["PROM_SERVER"] = server 35 | if interval is not None: 36 | os.environ["PROM_QUERY_INTERVAL"] = interval 37 | if step is not None: 38 | os.environ["PROM_QUERY_STEP"] = step 39 | prom_client = PrometheusClient() 40 | response_dict = prom_client.query() 41 | results = prom_client.snapshot_query_result() 42 | print("Available metrics: ", results.keys()) 43 | # print query data in csv 44 | for metric, data in results.items(): 45 | print(metric) 46 | print(data.head()) 47 | save_json(save_path, save_name, response_dict) 48 | 49 | 50 | def test_prom_process(): 51 | process() 52 | -------------------------------------------------------------------------------- /tests/weight_model_request_test.py: -------------------------------------------------------------------------------- 1 | ######################### 2 | # weight_mode_request.py 3 | # 4 | # This file covers the following cases. 
5 | # - getting weight from model server based on available features 6 | # 7 | ######################### 8 | 9 | import json 10 | import os 11 | import sys 12 | import time 13 | 14 | import requests 15 | 16 | from kepler_model.estimate.model_server_connector import list_all_models 17 | from kepler_model.util.config import download_path, get_model_server_req_endpoint 18 | from kepler_model.util.loader import get_download_output_path 19 | from kepler_model.util.train_types import FeatureGroup, FeatureGroups, ModelOutputType 20 | from tests.estimator_power_request_test import generate_request 21 | from tests.extractor_test import test_energy_source 22 | 23 | os.environ["MODEL_SERVER_URL"] = "http://localhost:8100" 24 | 25 | weight_available_trainers = ["SGDRegressorTrainer"] 26 | 27 | if __name__ == "__main__": 28 | # test getting model from server 29 | os.environ["MODEL_SERVER_ENABLE"] = "true" 30 | energy_source = test_energy_source 31 | 32 | available_models = list_all_models(energy_source=energy_source) 33 | while len(available_models) == 0: 34 | time.sleep(1) 35 | print("wait for kepler model server response") 36 | available_models = list_all_models(energy_source=energy_source) 37 | 38 | for output_type_name, valid_fgs in available_models.items(): 39 | output_type = ModelOutputType[output_type_name] 40 | output_path = get_download_output_path(download_path, energy_source, output_type) 41 | for fg_name, best_model in valid_fgs.items(): 42 | for trainer in weight_available_trainers: 43 | print("feature group: ", fg_name) 44 | metrics = FeatureGroups[FeatureGroup[fg_name]] 45 | request_json = generate_request(trainer, n=10, metrics=metrics, output_type=output_type_name) 46 | request_json["metrics"] += request_json["system_features"] 47 | request_json["weight"] = "true" 48 | del request_json["system_features"] 49 | del request_json["values"] 50 | del request_json["system_values"] 51 | try: 52 | response = requests.post(get_model_server_req_endpoint(), json=request_json) 53 | except Exception as err: 54 | print(f"cannot get response from model server: {err}") 55 | sys.exit(1) 56 | assert response.status_code == 200, f"response {request_json} not OK" 57 | loaded_weight = json.loads(response.content) 58 | print(loaded_weight) 59 | -------------------------------------------------------------------------------- /tests/xgboost_test.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from kepler_model.train import DefaultExtractor 5 | from kepler_model.train.profiler.profiler import response_to_result 6 | from kepler_model.train.trainer.XGBoostTrainer.main import XGBoostRegressionStandalonePipeline 7 | from kepler_model.util.train_types import FeatureGroup, XGBoostRegressionTrainType 8 | 9 | energy_components = ["package", "core", "uncore", "dram"] 10 | feature_group = FeatureGroup.BPFIRQ.name 11 | energy_source = "rapl-sysfs" 12 | 13 | prom_response_file = os.path.join(os.path.dirname(__file__), "data", "prom_output", "prom_response.json") 14 | 15 | 16 | def read_sample_query_results(): 17 | with open(prom_response_file) as f: 18 | response = json.load(f) 19 | return response_to_result(response) 20 | 21 | 22 | if __name__ == "__main__": 23 | # Note that extractor mutates the query results 24 | query_results = read_sample_query_results() 25 | assert len(query_results) > 0, "cannot read_sample_query_results" 26 | instance = DefaultExtractor() 27 | extracted_data, power_columns, _, _ = instance.extract(query_results, 
energy_components, feature_group, energy_source, node_level=True) 28 | xgb_container_level_pipeline_kfold = XGBoostRegressionStandalonePipeline( 29 | XGBoostRegressionTrainType.KFoldCrossValidation, "test_models/XGBoost/", node_level=False 30 | ) 31 | xgb_node_pipeline_kfold = XGBoostRegressionStandalonePipeline(XGBoostRegressionTrainType.KFoldCrossValidation, "test_models/XGBoost/", node_level=True) 32 | xgb_container_level_pipeline_tts = XGBoostRegressionStandalonePipeline( 33 | XGBoostRegressionTrainType.TrainTestSplitFit, "test_models/XGBoost/", node_level=False 34 | ) 35 | xgb_node_pipeline_tts = XGBoostRegressionStandalonePipeline(XGBoostRegressionTrainType.TrainTestSplitFit, "test_models/XGBoost/", node_level=True) 36 | xgb_node_pipeline_kfold.train(None, query_results) 37 | xgb_container_level_pipeline_tts.train(None, query_results) 38 | xgb_node_pipeline_tts.train(None, query_results) 39 | xgb_container_level_pipeline_kfold.train(None, query_results) 40 | --------------------------------------------------------------------------------