├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.yaml │ ├── document.yaml │ └── feature_request.yaml ├── dependabot.yml ├── pull_request_template.md └── workflows │ ├── build-push.yml │ ├── collect-data-self-hosted.yml │ ├── collect-train.yml │ ├── commit-msg.yml │ ├── integration-test.yml │ ├── lint.yml │ ├── pr.yml │ ├── push-to-main.yml │ ├── release.yml │ ├── tekton-test.yml │ ├── train-model.yml │ ├── train.yml │ └── unit-test.yml ├── .gitignore ├── .vscode └── settings.json ├── .yamllint.yaml ├── LICENSE ├── Makefile ├── README.md ├── VERSION ├── cmd └── main.py ├── contributing.md ├── dockerfiles ├── Dockerfile ├── Dockerfile.base ├── Dockerfile.dockerignore ├── Dockerfile.test ├── Dockerfile.test-nobase ├── Dockerfile.test-nobase.dockerignore └── Dockerfile.test.dockerignore ├── docs └── developer │ ├── README.md │ ├── estimate │ ├── classes.plantuml │ ├── classes.svg │ ├── packages.plantuml │ └── packages.svg │ ├── server │ ├── classes.plantuml │ ├── classes.svg │ ├── packages.plantuml │ └── packages.svg │ └── train │ ├── classes.plantuml │ ├── classes.svg │ ├── packages.plantuml │ ├── packages.svg │ └── trainer │ ├── classes.plantuml │ ├── classes.svg │ ├── packages.plantuml │ └── packages.svg ├── fig ├── comm_diagram.png ├── model-server-components-simplified.png ├── tekton-complete-train.png ├── tekton-kepler-default.png └── tekton-single-train.png ├── hack ├── aws_helper.sh ├── k8s_helper.sh └── utils.bash ├── manifests ├── base │ ├── estimate-only │ │ └── kustomization.yaml │ ├── estimate-with-server │ │ └── kustomization.yaml │ ├── kustomization.yaml │ ├── openshift │ │ ├── estimate-only │ │ │ └── kustomization.yaml │ │ ├── estimate-with-server │ │ │ └── kustomization.yaml │ │ ├── scc.yaml │ │ └── serve-only │ │ │ └── kustomization.yaml │ ├── patch │ │ ├── patch-estimator-sidecar.yaml │ │ ├── patch-model-server.yaml │ │ ├── patch-openshift.yaml │ │ └── patch-server-only.yaml │ └── serve-only │ │ └── kustomization.yaml ├── compose │ ├── dev │ │ ├── compose.yaml │ │ ├── grafana │ │ │ └── dashboards │ │ │ │ └── dev │ │ │ │ └── dashboard.json │ │ ├── kepler │ │ │ ├── common │ │ │ │ └── var │ │ │ │ │ └── lib │ │ │ │ │ └── kepler │ │ │ │ │ └── data │ │ │ │ │ ├── cpus.yaml │ │ │ │ │ └── model_weight │ │ │ │ │ ├── acpi_AbsPowerModel.json │ │ │ │ │ ├── acpi_DynPowerModel.json │ │ │ │ │ ├── intel_rapl_AbsPowerModel.json │ │ │ │ │ └── intel_rapl_DynPowerModel.json │ │ │ ├── metal │ │ │ │ └── etc │ │ │ │ │ └── kepler │ │ │ │ │ └── kepler.config │ │ │ │ │ ├── ENABLE_PROCESS_METRICS │ │ │ │ │ ├── EXPOSE_ESTIMATED_IDLE_POWER_METRICS │ │ │ │ │ └── EXPOSE_VM_METRICS │ │ │ └── models │ │ │ │ └── etc │ │ │ │ └── kepler │ │ │ │ └── kepler.config │ │ │ │ ├── ENABLE_PROCESS_METRICS │ │ │ │ ├── EXPOSE_ESTIMATED_IDLE_POWER_METRICS │ │ │ │ ├── MODEL_CONFIG │ │ │ │ ├── MODEL_SERVER_ENABLE │ │ │ │ └── MODEL_SERVER_URL │ │ ├── overrides.yaml │ │ └── prometheus │ │ │ └── scrape-configs │ │ │ └── dev.yaml │ └── monitoring │ │ ├── compose.yaml │ │ ├── grafana │ │ ├── Dockerfile │ │ ├── dashboards.yml │ │ └── datasource.yml │ │ └── prometheus │ │ ├── Dockerfile │ │ ├── prometheus.yml │ │ └── rules │ │ └── kepler.rule ├── kepler │ ├── kustomization.yaml │ └── patch │ │ └── patch-ci.yaml ├── offline-trainer │ ├── kustomization.yaml │ └── offline-trainer.yaml ├── server │ ├── base │ │ └── kustomization.yaml │ ├── kustomization.yaml │ ├── kustomizeconfig.yaml │ ├── online-train │ │ ├── kustomization.yaml │ │ └── patch-trainer.yaml │ ├── openshift │ │ ├── online-train │ │ │ ├── kustomization.yaml │ │ │ └── 
patch-trainer.yaml │ │ ├── patch-openshift.yaml │ │ └── serve-only │ │ │ └── kustomization.yaml │ └── server.yaml ├── set.sh └── test │ ├── file-server.yaml │ ├── model-request-client.yaml │ ├── patch-estimator-sidecar.yaml │ └── power-request-client.yaml ├── model_training ├── README.md ├── cmd_instruction.md ├── deployment │ ├── kepler.yaml │ ├── prom-kepler-rbac.yaml │ └── prom-np.yaml ├── s3 │ ├── Dockerfile │ ├── LICENSE.txt │ ├── README.md │ ├── pyproject.toml │ ├── src │ │ └── s3 │ │ │ ├── __about__.py │ │ │ ├── __init__.py │ │ │ ├── loader.py │ │ │ ├── pusher.py │ │ │ └── util.py │ └── tests │ │ └── __init__.py ├── script.sh └── tekton │ ├── README.md │ ├── examples │ ├── complete-pipelinerun.yaml │ ├── single-train │ │ ├── abs-power.yaml │ │ ├── aws-push.yaml │ │ ├── default.yaml │ │ ├── dyn-power.yaml │ │ └── ibmcloud-push.yaml │ ├── test-collect.yaml │ └── test-retrain.yaml │ ├── pipelines │ ├── collect.yaml │ ├── complete-retrain.yaml │ ├── complete-train.yaml │ ├── single-retrain.yaml │ └── single-train.yaml │ ├── pvc │ └── hostpath.yaml │ └── tasks │ ├── extract-task.yaml │ ├── isolate-task.yaml │ ├── original-pipeline-task.yaml │ ├── s3 │ ├── aws-s3-load.yaml │ ├── aws-s3-push.yaml │ ├── ibmcloud-s3-load.yaml │ └── ibmcloud-s3-push.yaml │ ├── stressng-task.yaml │ └── train-task.yaml ├── pyproject.toml ├── src └── kepler_model │ ├── __about__.py │ ├── __init__.py │ ├── abs-train-pipelinerun.yaml │ ├── cmd │ ├── README.md │ ├── __init__.py │ ├── cmd_plot.py │ ├── cmd_util.py │ └── main.py │ ├── estimate │ ├── __init__.py │ ├── archived_model.py │ ├── estimator.py │ ├── model │ │ ├── __init__.py │ │ ├── curvefit_model.py │ │ ├── estimate_common.py │ │ ├── keras_model.py │ │ ├── model.py │ │ ├── scikit_model.py │ │ └── xgboost_model.py │ └── model_server_connector.py │ ├── server │ └── model_server.py │ ├── train │ ├── __init__.py │ ├── ec2_pipeline.py │ ├── exporter │ │ ├── __init__.py │ │ ├── exporter.py │ │ ├── validator.py │ │ └── writer.py │ ├── extractor │ │ ├── __init__.py │ │ ├── extractor.py │ │ ├── preprocess.py │ │ └── smooth_extractor.py │ ├── isolator │ │ ├── __init__.py │ │ ├── isolator.py │ │ └── train_isolator.py │ ├── offline_trainer.py │ ├── online_trainer.py │ ├── pipeline.py │ ├── profiler │ │ ├── __init__.py │ │ ├── generate_scaler.py │ │ ├── node_type_index.py │ │ └── profiler.py │ ├── prom │ │ ├── __init__.py │ │ └── prom_query.py │ ├── specpower_pipeline.py │ └── trainer │ │ ├── ExponentialRegressionTrainer │ │ ├── __init__.py │ │ └── main.py │ │ ├── GradientBoostingRegressorTrainer │ │ ├── __init__.py │ │ └── main.py │ │ ├── KNeighborsRegressorTrainer │ │ ├── __init__.py │ │ └── main.py │ │ ├── LinearRegressionTrainer │ │ ├── __init__.py │ │ └── main.py │ │ ├── LogarithmicRegressionTrainer │ │ ├── __init__.py │ │ └── main.py │ │ ├── LogisticRegressionTrainer │ │ ├── __init__.py │ │ └── main.py │ │ ├── PolynomialRegressionTrainer │ │ ├── __init__.py │ │ └── main.py │ │ ├── SGDRegressorTrainer │ │ ├── __init__.py │ │ └── main.py │ │ ├── SVRRegressorTrainer │ │ ├── __init__.py │ │ └── main.py │ │ ├── XGBoostTrainer │ │ ├── __init__.py │ │ └── main.py │ │ ├── XgboostFitTrainer │ │ ├── __init__.py │ │ └── main.py │ │ ├── __init__.py │ │ ├── curvefit.py │ │ ├── scikit.py │ │ └── xgboost_interface.py │ └── util │ ├── __init__.py │ ├── config.py │ ├── extract_types.py │ ├── format.py │ ├── loader.py │ ├── prom_types.py │ ├── saver.py │ ├── similarity.py │ └── train_types.py └── tests ├── README.md ├── __init__.py ├── client_load_tester.py ├── common_plot.py ├── 
data ├── machine │   └── spec.json ├── node_type_index.json └── prom_output │   ├── idle.json │   └── prom_response.json ├── e2e_test.sh ├── estimator_model_request_test.py ├── estimator_model_test.py ├── estimator_power_request_test.py ├── extractor_test.py ├── http_server.py ├── isolator_test.py ├── minimal_trainer.py ├── model_select_test.py ├── model_server_test.py ├── model_tester.py ├── offline_trainer_test.py ├── pipeline_test.py ├── prom_test.py ├── trainer_test.py ├── weight_model_request_test.py └── xgboost_test.py /.github/ISSUE_TEMPLATE/document.yaml: -------------------------------------------------------------------------------- 1 | name: Documentation Issue 2 | description: Provide supporting details for a documentation issue 3 | labels: kind/documentation 4 | body: 5 | - type: textarea 6 | id: document 7 | attributes: 8 | label: Which document would you like to address? 9 | description: Include the link to the document if applicable 10 | validations: 11 | required: true 12 | 13 | - type: textarea 14 | id: documentFixDetail 15 | attributes: 16 | label: What is the issue? 17 | validations: 18 | required: true 19 | 20 | - type: textarea 21 | id: documentFixSuggestion 22 | attributes: 23 | label: How do you suggest this is fixed? 24 | validations: 25 | required: false 26 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.yaml: -------------------------------------------------------------------------------- 1 | name: Enhancement Tracking Issue 2 | description: Provide supporting details for a feature in development 3 | labels: kind/feature 4 | body: 5 | - type: textarea 6 | id: feature 7 | attributes: 8 | label: What would you like to be added? 9 | description: | 10 | Feature requests are unlikely to make progress as issues. Please consider engaging with SIGs on slack and mailing lists, instead. 11 | A proposal that works through the design along with the implications of the change can be opened as a KEP. 12 | See https://git.k8s.io/enhancements/keps#kubernetes-enhancement-proposals-keps 13 | validations: 14 | required: true 15 | 16 | - type: textarea 17 | id: rationale 18 | attributes: 19 | label: Why is this needed? 20 | validations: 21 | required: true 22 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: pip 4 | directory: / 5 | schedule: 6 | day: monday 7 | interval: weekly 8 | groups: 9 | github-actions: 10 | patterns: 11 | - "*" 12 | - package-ecosystem: github-actions 13 | directory: / 14 | schedule: 15 | day: monday 16 | interval: weekly 17 | groups: 18 | github-actions: 19 | patterns: 20 | - "*" 21 | - package-ecosystem: docker 22 | directory: / 23 | schedule: 24 | day: monday 25 | interval: weekly 26 | groups: 27 | github-actions: 28 | patterns: 29 | - "*" 30 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | # Checklist for PR Author 2 | 3 | --- 4 | 5 | In addition to approval, the author must confirm the following checklist: 6 | 7 | - [ ] Run the following command to format your code: 8 | 9 | ```bash 10 | make fmt 11 | ``` 12 | 13 | - [ ] Create issues for unresolved comments and link them to this PR. 
Use one of the following labels: 14 | - `must-fix`: The logic appears incorrect and must be addressed. 15 | - `minor`: Typos, minor issues, or potential refactoring for better readability. 16 | - `nit`: Trivial issues like extra spaces, commas, etc. 17 | -------------------------------------------------------------------------------- /.github/workflows/collect-train.yml: -------------------------------------------------------------------------------- 1 | # manually run on collect needed 2 | on: # yamllint disable-line rule:truthy 3 | workflow_dispatch: 4 | 5 | jobs: 6 | collect-data: 7 | uses: ./.github/workflows/collect-data-self-hosted.yml 8 | strategy: 9 | matrix: 10 | instance_type: [i3.metal] 11 | max-parallel: 1 12 | with: 13 | instance_type: ${{ matrix.instance_type }} 14 | ami_id: ami-0e4d0bb9670ea8db0 15 | github_repo: ${{ github.repository }} 16 | model_server_image: ${{ vars.IMAGE_REPO }}/kepler_model_server:latest 17 | secrets: 18 | self_hosted_github_token: ${{ secrets.GH_SELF_HOSTED_RUNNER_TOKEN }} 19 | aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} 20 | aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 21 | security_group_id: ${{ secrets.AWS_SECURITY_GROUP_ID }} 22 | aws_region: ${{ secrets.AWS_REGION }} 23 | 24 | train-model: 25 | needs: [collect-data] 26 | strategy: 27 | matrix: 28 | instance_type: [i3.metal] 29 | uses: ./.github/workflows/train-model.yml 30 | with: 31 | pipeline_name: std_v0.7.11 32 | instance_type: ${{ matrix.instance_type }} 33 | ami_id: ami-0e4d0bb9670ea8db0 34 | github_repo: ${{ github.repository }} 35 | model_server_image: ${{ vars.IMAGE_REPO }}/kepler_model_server:latest 36 | trainers: LogisticRegressionTrainer,ExponentialRegressionTrainer,SGDRegressorTrainer,GradientBoostingRegressorTrainer,XgboostFitTrainer 37 | secrets: 38 | self_hosted_github_token: ${{ secrets.GH_SELF_HOSTED_RUNNER_TOKEN }} 39 | aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} 40 | aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 41 | aws_region: ${{ secrets.AWS_REGION }} 42 | -------------------------------------------------------------------------------- /.github/workflows/commit-msg.yml: -------------------------------------------------------------------------------- 1 | name: Commit message check 2 | 3 | on: # yamllint disable-line rule:truthy 4 | pull_request: 5 | 6 | jobs: 7 | check-commit-message: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Checkout code 11 | uses: actions/checkout@v4 12 | 13 | - name: Check commit message 14 | uses: webiny/action-conventional-commits@v1.3.0 15 | with: 16 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 17 | -------------------------------------------------------------------------------- /.github/workflows/integration-test.yml: -------------------------------------------------------------------------------- 1 | name: Integration Test 2 | on: # yamllint disable-line rule:truthy 3 | workflow_call: 4 | inputs: 5 | base_change: 6 | description: Change flag on base image 7 | required: true 8 | type: string 9 | docker_secret: 10 | description: Secret check 11 | required: true 12 | type: string 13 | image_repo: 14 | description: The image repo to use 15 | required: true 16 | type: string 17 | image_tag: 18 | description: The image tag to use 19 | required: true 20 | type: string 21 | kepler_tag: 22 | description: Kepler image tag 23 | required: true 24 | type: string 25 | additional_opts: 26 | description: additional deployment opts 27 | required: true 28 | type: string 29 | 30 | env: 31 | BASE_IMAGE: ${{ 
inputs.image_repo }}/kepler_model_server_base:${{ inputs.image_tag }} 32 | IMAGE: localhost:5001/kepler_model_server:devel 33 | KEPLER_IMAGE: quay.io/sustainable_computing_io/kepler:${{ inputs.kepler_tag }} 34 | DEFAULT_MODEL_SERVER_BASE_IMAGE: quay.io/sustainable_computing_io/kepler_model_server_base:latest 35 | 36 | jobs: 37 | run-integration: 38 | runs-on: ubuntu-20.04 39 | steps: 40 | - name: use Kepler action to deploy cluster 41 | uses: sustainable-computing-io/kepler-action@v0.0.9 42 | with: 43 | runningBranch: kind 44 | cluster_provider: kind 45 | - name: load kepler image 46 | run: | 47 | docker pull ${{ env.KEPLER_IMAGE }} 48 | kind load docker-image ${{ env.KEPLER_IMAGE }} 49 | - name: checkout 50 | uses: actions/checkout@v4 51 | - name: set up QEMU 52 | uses: docker/setup-qemu-action@v3 53 | - name: set up Docker Buildx 54 | uses: docker/setup-buildx-action@v3 55 | - name: Replace value in Dockerfile if base changes 56 | if: ${{ (inputs.base_change == 'true') && (inputs.docker_secret == 'true') }} 57 | run: | 58 | sed -i "s|${{ env.DEFAULT_MODEL_SERVER_BASE_IMAGE }}|${{ env.BASE_IMAGE }}|" dockerfiles/Dockerfile 59 | - name: Replace value in Dockerfile.test if base changes 60 | if: ${{ (inputs.base_change == 'true') && (inputs.docker_secret == 'true') }} 61 | run: | 62 | sed -i "s|${{ env.DEFAULT_MODEL_SERVER_BASE_IMAGE }}|${{ env.BASE_IMAGE }}|" dockerfiles/Dockerfile.test 63 | - name: build Kepler model server and test image and push to local registry 64 | run: make build build-test push push-test 65 | - name: set up Kustomize 66 | run: | 67 | curl -o install_kustomize.sh https://raw.githubusercontent.com/kubernetes-sigs/kustomize/master/hack/install_kustomize.sh 68 | chmod +x install_kustomize.sh 69 | ./install_kustomize.sh 5.3.0 70 | chmod +x kustomize 71 | mv kustomize /usr/local/bin/ 72 | - name: test deploying with only estimator 73 | run: | 74 | make deploy 75 | ./tests/e2e_test.sh --estimator ${{ inputs.additional_opts }} 76 | make cleanup 77 | env: 78 | OPTS: ESTIMATOR 79 | KEPLER_IMAGE_VERSION: ${{ inputs.kepler_tag }} 80 | - name: test deploying with only server 81 | run: | 82 | make deploy 83 | ./tests/e2e_test.sh --server ${{ inputs.additional_opts }} 84 | make cleanup 85 | env: 86 | OPTS: SERVER 87 | KEPLER_IMAGE_VERSION: ${{ inputs.kepler_tag }} 88 | - name: test deploying with estimator and model server 89 | run: | 90 | make deploy 91 | ./tests/e2e_test.sh --estimator --server ${{ inputs.additional_opts }} 92 | make cleanup 93 | env: 94 | OPTS: ESTIMATOR SERVER 95 | KEPLER_IMAGE_VERSION: ${{ inputs.kepler_tag }} 96 | 97 | - name: upload artifacts on failure 98 | if: ${{ failure() }} 99 | uses: actions/upload-artifact@v4 100 | with: 101 | name: integration-test-artifacts 102 | path: tmp/e2e 103 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Run linters and formatters 2 | 3 | on: # yamllint disable-line rule:truthy 4 | pull_request: 5 | 6 | jobs: 7 | markdown-lint: 8 | runs-on: ubuntu-latest 9 | steps: 10 | # checkout soruce code 11 | - name: Checkout code 12 | uses: actions/checkout@v4 13 | 14 | # setup Python environment 15 | - name: Set up Python 16 | uses: actions/setup-python@v5 17 | with: 18 | python-version: "3.10" 19 | 20 | # install hatch 21 | - name: Install hatch 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install hatch 25 | 26 | # scan for markdown linting errors 27 | - name: Run 
pymarkdownlnt on markdown files 28 | shell: bash 29 | run: | 30 | make lint 31 | 32 | # run hatch fmt 33 | - name: Run formatter using hatch 34 | shell: bash 35 | run: | 36 | make fmt 37 | git diff --exit-code 38 | -------------------------------------------------------------------------------- /.github/workflows/push-to-main.yml: -------------------------------------------------------------------------------- 1 | on: # yamllint disable-line rule:truthy 2 | push: 3 | branches: 4 | - main 5 | 6 | env: 7 | TAG: latest 8 | 9 | jobs: 10 | check-branch: 11 | runs-on: ubuntu-latest 12 | 13 | outputs: 14 | tag: ${{ steps.image-tag.outputs.tag }} 15 | 16 | steps: 17 | - uses: actions/checkout@v4 18 | - name: Find Image Tag 19 | id: image-tag 20 | env: 21 | BRANCH: ${{ github.ref_name }} 22 | COMMIT: ${{ github.sha }} 23 | run: | 24 | if [ "${{ github.event_name }}" == 'pull_request' ]; then 25 | echo "tag=pr-${{ github.event.number }}" >> "$GITHUB_OUTPUT" 26 | else 27 | if [ "$BRANCH" == "main" ]; then 28 | echo "tag=${{ env.TAG }}" >> "$GITHUB_OUTPUT" 29 | else 30 | echo "tag=$COMMIT" >> "$GITHUB_OUTPUT" 31 | fi 32 | fi 33 | 34 | check-change: 35 | runs-on: ubuntu-latest 36 | 37 | outputs: 38 | base: ${{ steps.filter.outputs.base }} 39 | modeling: ${{ steps.filter.outputs.modeling }} 40 | s3: ${{ steps.filter.outputs.s3 }} 41 | 42 | steps: 43 | - uses: actions/checkout@v4 44 | - uses: dorny/paths-filter@v3 45 | id: filter 46 | with: 47 | filters: | 48 | base: 49 | - 'pyproject.toml' 50 | - 'dockerfiles/Dockerfile.base' 51 | - '.github/workflows/build-push.yml' 52 | modeling: 53 | - 'src/**' 54 | - 'model_training/**' 55 | - 'hack/**' 56 | - '.github/workflows/train-model.yml' 57 | s3: 58 | - 'model_training/s3/**' 59 | 60 | build-push: 61 | needs: [check-change, check-branch] 62 | uses: ./.github/workflows/build-push.yml 63 | with: 64 | base_change: ${{ needs.check-change.outputs.base }} 65 | s3_change: ${{ needs.check-change.outputs.s3 }} 66 | image_repo: ${{ vars.IMAGE_REPO }} 67 | image_tag: ${{ needs.check-branch.outputs.tag }} 68 | push: true 69 | secrets: 70 | docker_username: ${{ secrets.BOT_NAME }} 71 | docker_password: ${{ secrets.BOT_TOKEN }} 72 | 73 | train-model: 74 | needs: [check-change, check-branch, build-push] 75 | if: ${{ needs.check-change.outputs.modeling == 'true' }} 76 | strategy: 77 | matrix: 78 | instance_type: [i3.metal] 79 | uses: ./.github/workflows/train-model.yml 80 | with: 81 | pipeline_name: std_v0.7.11 82 | instance_type: ${{ matrix.instance_type }} 83 | ami_id: ami-0e4d0bb9670ea8db0 84 | github_repo: ${{ github.repository }} 85 | model_server_image: ${{ vars.IMAGE_REPO }}/kepler_model_server:${{ needs.check-branch.outputs.tag }} 86 | trainers: LogisticRegressionTrainer,ExponentialRegressionTrainer,SGDRegressorTrainer,GradientBoostingRegressorTrainer,XgboostFitTrainer 87 | secrets: 88 | self_hosted_github_token: ${{ secrets.GH_SELF_HOSTED_RUNNER_TOKEN }} 89 | aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} 90 | aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 91 | aws_region: ${{ secrets.AWS_REGION }} 92 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | on: # yamllint disable-line rule:truthy 3 | workflow_dispatch: 4 | inputs: 5 | tag: 6 | description: Tag name, e.g. 
0.7.11 7 | default: "" 8 | required: true 9 | 10 | jobs: 11 | build: 12 | name: Upload Release Asset 13 | permissions: 14 | contents: write 15 | runs-on: ubuntu-latest 16 | steps: 17 | - name: Checkout code 18 | uses: actions/checkout@v4 19 | 20 | - name: Login to Quay.io 21 | uses: docker/login-action@v3 22 | with: 23 | registry: ${{ vars.IMAGE_REGISTRY }} 24 | username: ${{ secrets.BOT_NAME }} 25 | password: ${{ secrets.BOT_TOKEN }} 26 | 27 | - name: Git set user 28 | shell: bash 29 | run: | 30 | git config user.name "$USERNAME" 31 | git config user.email "$USERENAME-bot@users.noreply.github.com" 32 | env: 33 | USERNAME: ${{ github.actor }} 34 | 35 | - name: Update the VERSION 36 | run: | 37 | echo "$VERSION" > VERSION 38 | env: 39 | VERSION: ${{ github.event.inputs.tag }} 40 | 41 | - name: Build model-server-base 42 | run: | 43 | make build-base 44 | env: 45 | IMAGE_REGISTRY: ${{ vars.IMAGE_REGISTRY }} 46 | 47 | - name: Push model-server-base 48 | run: | 49 | make push-base 50 | env: 51 | IMAGE_REGISTRY: ${{ vars.IMAGE_REGISTRY }} 52 | 53 | - name: Update base in model-server dockerfile 54 | run: | 55 | sed -i "s/model_server_base:.*/model_server_base:v$VERSION/g" ./dockerfiles/Dockerfile 56 | env: 57 | VERSION: ${{ github.event.inputs.tag }} 58 | 59 | - name: Build model-server 60 | run: | 61 | make build 62 | env: 63 | IMAGE_REGISTRY: ${{ vars.IMAGE_REGISTRY }} 64 | 65 | - name: Create tag 66 | run: | 67 | git add VERSION ./dockerfiles/Dockerfile 68 | git commit -m "ci: update VERSION to $VERSION" 69 | git tag -a "v$VERSION" -m "$VERSION" 70 | git show --stat 71 | env: 72 | VERSION: ${{ github.event.inputs.tag }} 73 | 74 | - name: Push Images 75 | run: | 76 | make push 77 | env: 78 | IMAGE_REGISTRY: ${{ vars.IMAGE_REGISTRY }} 79 | 80 | - name: Push Release tag 81 | run: | 82 | git push --follow-tags 83 | 84 | - name: Create Release 85 | id: create_release 86 | uses: actions/create-release@v1 87 | env: 88 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 89 | with: 90 | tag_name: v${{ github.event.inputs.tag }} 91 | release_name: v${{ github.event.inputs.tag }}-release 92 | draft: false 93 | prerelease: false 94 | 95 | create-release-branch: 96 | name: Create Release Branch 97 | permissions: 98 | contents: write 99 | needs: build 100 | runs-on: ubuntu-latest 101 | steps: 102 | - name: Create release branch 103 | uses: peterjgrainger/action-create-branch@v3.0.0 104 | env: 105 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 106 | with: 107 | branch: v${{ github.event.inputs.tag }}-release 108 | sha: ${{ github.event.pull_request.head.sha }} 109 | -------------------------------------------------------------------------------- /.github/workflows/train.yml: -------------------------------------------------------------------------------- 1 | # manually run on retrain needed 2 | name: Retrain All Machines 3 | on: # yamllint disable-line rule:truthy 4 | workflow_dispatch: 5 | 6 | jobs: 7 | 8 | check-change: 9 | runs-on: ubuntu-latest 10 | 11 | outputs: 12 | modeling: ${{ steps.filter.outputs.modeling }} 13 | 14 | steps: 15 | - uses: actions/checkout@v4 16 | - uses: dorny/paths-filter@v3 17 | id: filter 18 | with: 19 | filters: | 20 | modeling: 21 | - 'src/**' 22 | - 'model_training/**' 23 | - 'hack/**' 24 | - '.github/workflows/train-model.yml' 25 | 26 | train-model: 27 | needs: [check-change] 28 | if: ${{ needs.check-change.outputs.modeling == 'true' }} 29 | strategy: 30 | matrix: 31 | instance_type: [i3.metal] 32 | uses: ./.github/workflows/train-model.yml 33 | with: 34 | pipeline_name: std_v0.7.11 
35 | instance_type: ${{ matrix.instance_type }} 36 | ami_id: ami-0e4d0bb9670ea8db0 37 | github_repo: ${{ github.repository }} 38 | model_server_image: ${{ vars.IMAGE_REPO }}/kepler_model_server:latest 39 | trainers: LogisticRegressionTrainer,ExponentialRegressionTrainer,SGDRegressorTrainer,GradientBoostingRegressorTrainer,XgboostFitTrainer 40 | secrets: 41 | self_hosted_github_token: ${{ secrets.GH_SELF_HOSTED_RUNNER_TOKEN }} 42 | aws_access_key_id: ${{ secrets.AWS_ACCESS_KEY_ID }} 43 | aws_secret_access_key: ${{ secrets.AWS_SECRET_ACCESS_KEY }} 44 | aws_region: ${{ secrets.AWS_REGION }} 45 | -------------------------------------------------------------------------------- /.github/workflows/unit-test.yml: -------------------------------------------------------------------------------- 1 | name: Unit Test 2 | 3 | on: # yamllint disable-line rule:truthy 4 | workflow_call: 5 | secrets: 6 | docker_username: 7 | description: Docker username 8 | required: false 9 | docker_password: 10 | description: Docker password 11 | required: false 12 | inputs: 13 | base_change: 14 | description: Change flag on base image 15 | required: true 16 | type: string 17 | 18 | jobs: 19 | unit-test: 20 | runs-on: ubuntu-latest 21 | steps: 22 | - uses: actions/checkout@v4 23 | - name: Set up Docker 24 | uses: docker/setup-buildx-action@v3 25 | - name: Build test with base image 26 | if: ${{ inputs.base_change != 'true' }} 27 | run: make build-test 28 | - name: Build test without base image 29 | if: ${{ inputs.base_change == 'true' }} 30 | run: make build-test-nobase 31 | - name: Test pipeline # need to run first to build the models 32 | run: make test-pipeline 33 | - name: Test model server 34 | run: make test-model-server 35 | timeout-minutes: 5 36 | - name: Test estimator 37 | run: make test-estimator 38 | timeout-minutes: 5 39 | - name: Test offline trainer 40 | run: make test-offline-trainer 41 | - name: Test model server select 42 | run: make test-model-server-select 43 | timeout-minutes: 5 44 | - name: Test model server select via estimator 45 | run: make test-model-server-estimator-select 46 | timeout-minutes: 5 47 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # models 2 | server/train/local/ 3 | server/models 4 | */*/download 5 | 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # C extensions 12 | *.so 13 | 14 | # Distribution / packaging 15 | .Python 16 | build/ 17 | develop-eggs/ 18 | dist/ 19 | downloads/ 20 | eggs/ 21 | .eggs/ 22 | lib/ 23 | lib64/ 24 | parts/ 25 | sdist/ 26 | var/ 27 | wheels/ 28 | pip-wheel-metadata/ 29 | share/python-wheels/ 30 | *.egg-info/ 31 | .installed.cfg 32 | *.egg 33 | MANIFEST 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .nox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | *.py,cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # Django stuff: 64 | *.log 65 | local_settings.py 66 | db.sqlite3 67 | db.sqlite3-journal 68 | 69 | # Flask stuff: 70 | instance/ 71 | .webassets-cache 72 | 73 | # Scrapy stuff: 74 | .scrapy 75 | 76 | # Sphinx documentation 77 | docs/_build/ 78 | 79 | # PyBuilder 80 | target/ 81 | 82 | # Jupyter Notebook 83 | .ipynb_checkpoints 84 | 85 | # IPython 86 | profile_default/ 87 | ipython_config.py 88 | 89 | # pyenv 90 | .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 100 | __pypackages__/ 101 | 102 | # Celery stuff 103 | celerybeat-schedule 104 | celerybeat.pid 105 | 106 | # SageMath parsed files 107 | *.sage.py 108 | 109 | # Environments 110 | .env 111 | .venv 112 | env/ 113 | venv/ 114 | ENV/ 115 | env.bak/ 116 | venv.bak/ 117 | 118 | # Spyder project settings 119 | .spyderproject 120 | .spyproject 121 | 122 | # Rope project settings 123 | .ropeproject 124 | 125 | # mkdocs documentation 126 | /site 127 | 128 | # mypy 129 | .mypy_cache/ 130 | .dmypy.json 131 | dmypy.json 132 | 133 | # Pyre type checker 134 | .pyre/ 135 | 136 | tests/download/* 137 | .DS_Store 138 | */.DS_Store 139 | */*/.DS_Store 140 | */*/*/.DS_Store 141 | 142 | /src/kepler_model/models/ 143 | /tests/models/ 144 | /src/resource/ 145 | tests/data/extractor_output 146 | tests/data/isolator_output 147 | tests/data/offline_trainer_output 148 | tests/data/plot_output 149 | model_training/*data* 150 | model_training/tekton/secret 151 | local-dev-cluster 152 | tmp 153 | tests/db-models 154 | db-models 155 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.analysis.extraPaths": [ 3 | "./src/util" 4 | ] 5 | } -------------------------------------------------------------------------------- /.yamllint.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | extends: default 3 | rules: 4 | line-length: disable 5 | document-start: disable 6 | comments: 7 | min-spaces-from-content: 1 8 | quoted-strings: 9 | required: only-when-needed 10 | extra-required: 11 | - ^.*:\s.*$ 12 | - ^.*:$ 13 | quote-type: double 14 | ignore: 15 | - model_training/deployment/cpe-operator.yaml 16 | - tmp/ 17 | -------------------------------------------------------------------------------- /VERSION: -------------------------------------------------------------------------------- 1 | 0.7.12 2 | -------------------------------------------------------------------------------- /cmd/main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # -*- coding: utf-8 -*- 4 | import re 5 | import sys 6 | 7 | from kepler_model.cmd.main 
import run 8 | 9 | if __name__ == "__main__": 10 | sys.argv[0] = re.sub(r"(-script\.pyw|\.exe)?$", "", sys.argv[0]) 11 | sys.exit(run()) 12 | -------------------------------------------------------------------------------- /contributing.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | [Get started with Kepler Model Server.](https://sustainable-computing.io/kepler_model_server/get_started/) 4 | 5 | - The main source code is in the [src directory](./src/). 6 | 7 | ## PR Hands-on 8 | 9 | - Create a related [issue](https://github.com/sustainable-computing-io/kepler-model-server/issues) with your name assigned first (if one does not already exist). 10 | 11 | - Set the required secrets and environment variables for local repository tests if needed. Check the table below. 12 | 13 | | Objective | Required Secret | Required Environment | 14 | | --------- | --------------- | -------------------- | 15 | | Push to private repo | BOT_NAME, BOT_TOKEN | IMAGE_REPO | 16 | | Change on base image | BOT_NAME, BOT_TOKEN | IMAGE_REPO | 17 | | Save data/models to AWS COS | AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_REGION | | 18 | 19 | ## Improve components in training pipelines 20 | 21 | Learn more about the [Training Pipeline](https://sustainable-computing.io/kepler_model_server/pipeline/) 22 | 23 | ### Introduce new feature group 24 | 25 | - Define a new feature group name in `FeatureGroup` and update the metric list map `FeatureGroups` in [train types](./src/util/train_types.py) 26 | 27 | ### Introduce new energy sources 28 | 29 | - Define a new energy source in the `PowerSourceMap` map in [train types](./src/util/train_types.py) 30 | 31 | ### Improve preprocessing method 32 | 33 | - [extractor](./src/train/extractor/): converts numerically aggregated metrics to per-second values 34 | - [isolator](./src/train/isolator/): isolates background (idle) power from the collected power 35 | 36 | ### Introduce new learning method 37 | 38 | - [trainer](./src/train/trainer/): applies a learning method to build a model from the extracted and isolated data 39 | 40 | ## Model training 41 | 42 | Learn more about [model training](./model_training/) 43 | 44 | ### Introduce new benchmarks 45 | 46 | ### Tekton 47 | 48 | Create a workload `Task` and provide an example `Pipeline` to run. 49 | 50 | ### Add new trained models 51 | 52 | TBD 53 | 54 | ## Source improvement 55 | 56 | Any improvement in `src` and `cmd`. 57 | 58 | ## Test and CI improvement 59 | 60 | Any improvement in `tests`, `dockerfiles`, `manifests`, and `.github/workflows` 61 | 62 | ## Documentation 63 | 64 | Detailed documentation should be posted to the [kepler-doc](https://github.com/sustainable-computing-io/kepler-doc) repository. 65 | -------------------------------------------------------------------------------- /dockerfiles/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM quay.io/sustainable_computing_io/kepler_model_server_base:v0.7.12 2 | 3 | WORKDIR /kepler_model 4 | ENV PYTHONPATH=/kepler_model 5 | 6 | COPY pyproject.toml . 7 | COPY README.md . 8 | COPY cmd/ cmd/ 9 | COPY src/ src/ 10 | 11 | RUN pip install --no-cache-dir . 
12 | 13 | # port for Model Server 14 | EXPOSE 8100 15 | # port for Online Trainer (TODO: reserved for event-based online training) 16 | EXPOSE 8101 17 | # port for Offline Trainer 18 | EXPOSE 8102 19 | 20 | ENTRYPOINT ["bash", "-c"] 21 | CMD ["kepler-model"] 22 | -------------------------------------------------------------------------------- /dockerfiles/Dockerfile.base: -------------------------------------------------------------------------------- 1 | FROM python:3.10-slim 2 | # 3 | # NOTE: This file contains all tools and dependencies needed for 4 | # setting up the development and testing environment 5 | 6 | # Prevents Python from writing pyc files. 7 | ENV PYTHONDONTWRITEBYTECODE=1 8 | 9 | # Keeps Python from buffering stdout and stderr to avoid situations where 10 | # the application crashes without emitting any logs due to buffering. 11 | ENV PYTHONUNBUFFERED=1 12 | 13 | RUN pip install --no-cache-dir --upgrade pip && \ 14 | python -m pip install --no-cache-dir hatch && \ 15 | pip cache purge 16 | 17 | WORKDIR /kepler_model 18 | ENV PYTHONPATH=/kepler_model 19 | 20 | COPY pyproject.toml . 21 | 22 | # NOTE: README.md and __about__.py are referenced in pyproject.toml 23 | # so they are copied into the image for pip install to succeed 24 | COPY README.md . 25 | 26 | RUN mkdir -p src/kepler_model 27 | COPY src/kepler_model/__init__.py src/kepler_model/ 28 | COPY src/kepler_model/__about__.py src/kepler_model/ 29 | 30 | RUN pip install --no-cache-dir . && \ 31 | pip cache purge 32 | -------------------------------------------------------------------------------- /dockerfiles/Dockerfile.dockerignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | src/resource/ 3 | src/kepler_model/models/ 4 | tests/models/ 5 | -------------------------------------------------------------------------------- /dockerfiles/Dockerfile.test: -------------------------------------------------------------------------------- 1 | FROM quay.io/sustainable_computing_io/kepler_model_server_base:latest 2 | 3 | # Prevents Python from writing pyc files. 4 | ENV PYTHONDONTWRITEBYTECODE=1 5 | 6 | # Keeps Python from buffering stdout and stderr to avoid situations where 7 | # the application crashes without emitting any logs due to buffering. 8 | ENV PYTHONUNBUFFERED=1 9 | 10 | 11 | WORKDIR /kepler_model 12 | ENV PYTHONPATH=/kepler_model 13 | 14 | COPY pyproject.toml . 15 | COPY README.md . 16 | COPY cmd/ cmd/ 17 | COPY src/ src/ 18 | COPY tests/ tests/ 19 | 20 | RUN pip install --no-cache-dir . && \ 21 | pip cache purge 22 | 23 | RUN mkdir -p /mnt/models 24 | 25 | # port for Model Server 26 | EXPOSE 8100 27 | # port for Online Trainer (TODO: reserved for event-based online training) 28 | EXPOSE 8101 29 | # port for Offline Trainer 30 | EXPOSE 8102 31 | 32 | CMD ["model-server"] 33 | -------------------------------------------------------------------------------- /dockerfiles/Dockerfile.test-nobase: -------------------------------------------------------------------------------- 1 | FROM python:3.10-slim 2 | 3 | # NOTE: This file contains all tools and dependencies needed for 4 | # setting up the development and testing environment 5 | 6 | # Prevents Python from writing pyc files. 7 | ENV PYTHONDONTWRITEBYTECODE=1 8 | 9 | # Keeps Python from buffering stdout and stderr to avoid situations where 10 | # the application crashes without emitting any logs due to buffering. 
11 | ENV PYTHONUNBUFFERED=1 12 | 13 | RUN python -m pip install --no-cache-dir hatch && \ 14 | pip cache purge 15 | 16 | WORKDIR /kepler_model 17 | ENV PYTHONPATH=/kepler_model 18 | 19 | 20 | COPY pyproject.toml . 21 | 22 | # NOTE: README.md and src/../__about__.py are referenced in pyproject.toml 23 | # so that they are copied into the image for pip install to succeed 24 | COPY README.md . 25 | COPY cmd/ cmd/ 26 | COPY src/ src/ 27 | COPY tests/ tests/ 28 | 29 | RUN pip install --no-cache-dir . && \ 30 | pip cache purge 31 | 32 | RUN hatch env create 33 | 34 | 35 | RUN mkdir -p /mnt/models 36 | # port for Model Server 37 | EXPOSE 8100 38 | # port for Online Trainer (TODO: reserved for event-based online training) 39 | EXPOSE 8101 40 | # port for Offline Trainer 41 | EXPOSE 8102 42 | 43 | CMD ["model-server"] 44 | -------------------------------------------------------------------------------- /dockerfiles/Dockerfile.test-nobase.dockerignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | src/resource/ 3 | src/kepler_model/models/ 4 | tests/models/ 5 | -------------------------------------------------------------------------------- /dockerfiles/Dockerfile.test.dockerignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | src/resource/ 3 | src/kepler_model/models/ 4 | tests/models/ 5 | -------------------------------------------------------------------------------- /docs/developer/README.md: -------------------------------------------------------------------------------- 1 | # Developer Guide 2 | 3 | - Temporarily add `__init__.py` to all directories 4 | 5 | ```bash 6 | find ./src -type d -exec touch {}/__init__.py \; 7 | ``` 8 | 9 | - Generate `classes.plantuml` and `packages.plantuml` using the following commands 10 | 11 | ```bash 12 | pyreverse --colorized --output plantuml --module-names y --show-stdlib --show-associated 2 --show-ancestors 1 --verbose -d umls/server/ --source-roots ./src/ ./src/server/ 13 | pyreverse --colorized --output plantuml --module-names y --show-stdlib --show-associated 2 --show-ancestors 1 --verbose -d umls/estimate/ --source-roots ./src/ ./src/estimate/ 14 | pyreverse --colorized --output plantuml --module-names y --show-stdlib --show-associated 2 --show-ancestors 1 --verbose -d umls/train/ --source-roots ./src/ ./src/train/ 15 | pyreverse --colorized --output plantuml --module-names y --show-stdlib --show-associated 2 --show-ancestors 1 --verbose -d umls/train/trainer/ --source-roots ./src/ ./src/train/trainer/ 16 | ``` 17 | 18 | - Use [plantuml](https://plantuml.com/download) to convert plantuml files to `svg` files The NeoVim plugin `neovim-soil` was used to generate the svg files from the plantuml files -------------------------------------------------------------------------------- /docs/developer/estimate/packages.plantuml: -------------------------------------------------------------------------------- 1 | @startuml packages 2 | set namespaceSeparator none 3 | package "estimate" as estimate #77AADD { 4 | } 5 | package "estimate.archived_model" as estimate.archived_model #77AADD { 6 | } 7 | package "estimate.estimator" as estimate.estimator #77AADD { 8 | } 9 | package "estimate.model" as estimate.model #99DDFF { 10 | } 11 | package "estimate.model.curvefit_model" as estimate.model.curvefit_model #99DDFF { 12 | } 13 | package "estimate.model.estimate_common" as estimate.model.estimate_common #99DDFF { 14 | } 15 | package "estimate.model.keras_model" as 
estimate.model.keras_model #99DDFF { 16 | } 17 | package "estimate.model.model" as estimate.model.model #99DDFF { 18 | } 19 | package "estimate.model.scikit_model" as estimate.model.scikit_model #99DDFF { 20 | } 21 | package "estimate.model.xgboost_model" as estimate.model.xgboost_model #99DDFF { 22 | } 23 | package "estimate.model_server_connector" as estimate.model_server_connector #77AADD { 24 | } 25 | estimate --> estimate.model 26 | estimate.archived_model --> estimate.model_server_connector 27 | estimate.estimator --> estimate.archived_model 28 | estimate.estimator --> estimate.model 29 | estimate.estimator --> estimate.model_server_connector 30 | estimate.model.curvefit_model --> estimate.model.estimate_common 31 | estimate.model.keras_model --> estimate.model.estimate_common 32 | estimate.model.model --> estimate.model.curvefit_model 33 | estimate.model.model --> estimate.model.scikit_model 34 | estimate.model.model --> estimate.model.xgboost_model 35 | estimate.model.scikit_model --> estimate.model.estimate_common 36 | estimate.model.xgboost_model --> estimate.model.estimate_common 37 | @enduml 38 | -------------------------------------------------------------------------------- /docs/developer/server/classes.plantuml: -------------------------------------------------------------------------------- 1 | @startuml classes 2 | set namespaceSeparator none 3 | class "server.model_server.ModelRequest" as server.model_server.ModelRequest #77AADD { 4 | filter : str 5 | metrics 6 | node_type : int 7 | output_type 8 | source : str 9 | trainer_name : str 10 | weight : bool 11 | } 12 | @enduml 13 | -------------------------------------------------------------------------------- /docs/developer/server/classes.svg: -------------------------------------------------------------------------------- 1 | server.model_server.ModelRequestfilter : strmetricsnode_type : intoutput_typesource : strtrainer_name : strweight : bool -------------------------------------------------------------------------------- /docs/developer/server/packages.plantuml: -------------------------------------------------------------------------------- 1 | @startuml packages 2 | set namespaceSeparator none 3 | package "server" as server #77AADD { 4 | } 5 | package "server.model_server" as server.model_server #77AADD { 6 | } 7 | @enduml 8 | -------------------------------------------------------------------------------- /docs/developer/server/packages.svg: -------------------------------------------------------------------------------- 1 | serverserver.model_server -------------------------------------------------------------------------------- /docs/developer/train/trainer/packages.plantuml: -------------------------------------------------------------------------------- 1 | @startuml packages 2 | set namespaceSeparator none 3 | package "train.trainer" as train.trainer #77AADD { 4 | } 5 | package "train.trainer.ExponentialRegressionTrainer" as train.trainer.ExponentialRegressionTrainer #77AADD { 6 | } 7 | package "train.trainer.ExponentialRegressionTrainer.main" as train.trainer.ExponentialRegressionTrainer.main #77AADD { 8 | } 9 | package "train.trainer.GradientBoostingRegressorTrainer" as train.trainer.GradientBoostingRegressorTrainer #77AADD { 10 | } 11 | package "train.trainer.GradientBoostingRegressorTrainer.main" as train.trainer.GradientBoostingRegressorTrainer.main #77AADD { 12 | } 13 | package "train.trainer.KNeighborsRegressorTrainer" as train.trainer.KNeighborsRegressorTrainer #77AADD { 14 | } 15 | package 
"train.trainer.KNeighborsRegressorTrainer.main" as train.trainer.KNeighborsRegressorTrainer.main #77AADD { 16 | } 17 | package "train.trainer.LinearRegressionTrainer" as train.trainer.LinearRegressionTrainer #77AADD { 18 | } 19 | package "train.trainer.LinearRegressionTrainer.main" as train.trainer.LinearRegressionTrainer.main #77AADD { 20 | } 21 | package "train.trainer.LogarithmicRegressionTrainer" as train.trainer.LogarithmicRegressionTrainer #77AADD { 22 | } 23 | package "train.trainer.LogarithmicRegressionTrainer.main" as train.trainer.LogarithmicRegressionTrainer.main #77AADD { 24 | } 25 | package "train.trainer.LogisticRegressionTrainer" as train.trainer.LogisticRegressionTrainer #77AADD { 26 | } 27 | package "train.trainer.LogisticRegressionTrainer.main" as train.trainer.LogisticRegressionTrainer.main #77AADD { 28 | } 29 | package "train.trainer.PolynomialRegressionTrainer" as train.trainer.PolynomialRegressionTrainer #77AADD { 30 | } 31 | package "train.trainer.PolynomialRegressionTrainer.main" as train.trainer.PolynomialRegressionTrainer.main #77AADD { 32 | } 33 | package "train.trainer.SGDRegressorTrainer" as train.trainer.SGDRegressorTrainer #77AADD { 34 | } 35 | package "train.trainer.SGDRegressorTrainer.main" as train.trainer.SGDRegressorTrainer.main #77AADD { 36 | } 37 | package "train.trainer.SVRRegressorTrainer" as train.trainer.SVRRegressorTrainer #77AADD { 38 | } 39 | package "train.trainer.SVRRegressorTrainer.main" as train.trainer.SVRRegressorTrainer.main #77AADD { 40 | } 41 | package "train.trainer.XGBoostTrainer" as train.trainer.XGBoostTrainer #77AADD { 42 | } 43 | package "train.trainer.XGBoostTrainer.main" as train.trainer.XGBoostTrainer.main #77AADD { 44 | } 45 | package "train.trainer.XgboostFitTrainer" as train.trainer.XgboostFitTrainer #77AADD { 46 | } 47 | package "train.trainer.XgboostFitTrainer.main" as train.trainer.XgboostFitTrainer.main #77AADD { 48 | } 49 | package "train.trainer.curvefit" as train.trainer.curvefit #77AADD { 50 | } 51 | package "train.trainer.scikit" as train.trainer.scikit #77AADD { 52 | } 53 | package "train.trainer.xgboost_interface" as train.trainer.xgboost_interface #77AADD { 54 | } 55 | train.trainer.XgboostFitTrainer.main --> train.trainer.xgboost_interface 56 | @enduml 57 | -------------------------------------------------------------------------------- /fig/comm_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/fig/comm_diagram.png -------------------------------------------------------------------------------- /fig/model-server-components-simplified.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/fig/model-server-components-simplified.png -------------------------------------------------------------------------------- /fig/tekton-complete-train.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/fig/tekton-complete-train.png -------------------------------------------------------------------------------- /fig/tekton-kepler-default.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/fig/tekton-kepler-default.png -------------------------------------------------------------------------------- /fig/tekton-single-train.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/fig/tekton-single-train.png -------------------------------------------------------------------------------- /hack/k8s_helper.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # This file is part of the Kepler project 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | # Copyright 2023 The Kepler Contributors 18 | # 19 | 20 | set -e 21 | 22 | rollout_ns_status() { 23 | local resources 24 | resources=$(kubectl get deployments,statefulsets,daemonsets -n=$1 -o name) 25 | for res in $resources; do 26 | kubectl rollout status $res --namespace $1 --timeout=10m || die "failed to check status of ${res} inside namespace ${1}" 27 | done 28 | } 29 | 30 | _get_value() { 31 | res=$1 32 | namespace=$2 33 | location=$3 34 | kubectl get $res -n $namespace -ojson|jq -r $location 35 | } 36 | 37 | _get_succeed_condition() { 38 | resource=$1 39 | name=$2 40 | namespace=$3 41 | if [ "$(kubectl get $resource $name -n $namespace -ojson|jq '.status.conditions | length')" == 0 ]; then 42 | echo Unknown 43 | else 44 | location='.status.conditions|map(select(.type="Succeeded"))[0].status' 45 | _get_value $resource/$name $namespace $location 46 | fi 47 | } 48 | 49 | _log_completed_pod() { 50 | local resources 51 | name=$1 52 | namespace=$2 53 | location=".status.phase" 54 | resources=$(kubectl get pods -n=$namespace -o name) 55 | for res in $resources; do 56 | if [ "$res" == "pod/${name}-run-stressng-pod" ]; then 57 | # get parameters and estimation time 58 | kubectl logs $res -n $namespace|head 59 | fi 60 | echo $res 61 | if [ "$res" == "pod/${name}-presteps-pod" ]; then 62 | # get parameters and estimation time 63 | kubectl logs $res -n $namespace -c step-collect-idle|tail 64 | else 65 | kubectl logs $res -n $namespace|tail 66 | fi 67 | done 68 | } 69 | 70 | wait_for_pipelinerun() { 71 | resource=pipelinerun 72 | name=$1 73 | namespace=default 74 | 75 | if kubectl get taskruns|grep ${name}-run-stressng; then 76 | value=$(_get_succeed_condition $resource $name $namespace) 77 | while [ "$value" == "Unknown" ] ; 78 | do 79 | echo "Wait for pipeline $name to run workload" 80 | kubectl get pods 81 | value=$(_get_succeed_condition $resource $name $namespace) 82 | if kubectl get pod/${name}-run-stressng-pod |grep Running ; then 83 | estimate_time_line=$(kubectl logs pod/${name}-run-stressng-pod -c step-run-stressng -n $namespace|grep "Estimation Time (s):") 84 | estimate_time=$(echo ${estimate_time_line}|awk '{print $4}') 85 | echo "${estimate_time_line}, 
sleep" 86 | sleep ${estimate_time} 87 | break 88 | fi 89 | sleep 60 90 | done 91 | fi 92 | 93 | value=$(_get_succeed_condition $resource $name $namespace) 94 | while [ "$value" == "Unknown" ] ; 95 | do 96 | echo "Wait for pipeline $name to be succeeded" 97 | kubectl get pods 98 | sleep 60 99 | value=$(_get_succeed_condition $resource $name $namespace) 100 | done 101 | 102 | kubectl get taskrun 103 | _log_completed_pod $name $namespace 104 | if [ "$value" == "False" ]; then 105 | exit 1 106 | fi 107 | } 108 | 109 | "$@" 110 | -------------------------------------------------------------------------------- /hack/utils.bash: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2024. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | is_fn() { 18 | [[ $(type -t "$1") == "function" ]] 19 | return $? 20 | } 21 | 22 | header() { 23 | local title=" 🔆🔆🔆 $* 🔆🔆🔆 " 24 | 25 | local len=40 26 | if [[ ${#title} -gt $len ]]; then 27 | len=${#title} 28 | fi 29 | 30 | echo -e "\n\n \033[1m${title}\033[0m" 31 | echo -n "━━━━━" 32 | printf '━%.0s' $(seq "$len") 33 | echo "━━━━━━━" 34 | 35 | } 36 | 37 | info() { 38 | echo -e " 🔔 $*" >&2 39 | } 40 | 41 | err() { 42 | echo -e " 😱 $*" >&2 43 | } 44 | 45 | warn() { 46 | echo -e "  $*" >&2 47 | } 48 | 49 | ok() { 50 | echo -e " ✅ $*" >&2 51 | } 52 | 53 | skip() { 54 | echo -e " 🙈 SKIP: $*" >&2 55 | } 56 | 57 | fail() { 58 | echo -e " ❌ FAIL: $*" >&2 59 | } 60 | 61 | info_run() { 62 | echo -e "  $*\n" >&2 63 | } 64 | 65 | run() { 66 | echo -e " ❯ $*\n" >&2 67 | "$@" 68 | } 69 | 70 | die() { 71 | echo -e "\n ✋ $* " 72 | echo -e "──────────────────── ⛔️⛔️⛔️ ────────────────────────\n" 73 | exit 1 74 | } 75 | 76 | line() { 77 | local len="$1" 78 | local style="${2:-thin}" 79 | shift 80 | 81 | local ch='─' 82 | [[ "$style" == 'heavy' ]] && ch="━" 83 | 84 | printf "$ch%.0s" $(seq "$len") >&2 85 | echo 86 | } 87 | -------------------------------------------------------------------------------- /manifests/base/estimate-only/kustomization.yaml: -------------------------------------------------------------------------------- 1 | namespace: kepler 2 | 3 | apiVersion: kustomize.config.k8s.io/v1beta1 4 | kind: Kustomization 5 | images: 6 | - name: kepler_model_server 7 | newName: localhost:5001/kepler_model_server 8 | newTag: devel 9 | 10 | patchesStrategicMerge: 11 | - ./patch/patch-estimator-sidecar.yaml 12 | 13 | resources: 14 | - ../kepler 15 | -------------------------------------------------------------------------------- /manifests/base/estimate-with-server/kustomization.yaml: -------------------------------------------------------------------------------- 1 | namespace: kepler 2 | 3 | apiVersion: kustomize.config.k8s.io/v1beta1 4 | kind: Kustomization 5 | images: 6 | - name: kepler_model_server 7 | newName: localhost:5001/kepler_model_server 8 | newTag: devel 9 | 10 | patchesStrategicMerge: 11 | - ./patch/patch-estimator-sidecar.yaml 12 | - 
./patch/patch-model-server.yaml 13 | 14 | resources: 15 | - ../kepler 16 | - ../server 17 | -------------------------------------------------------------------------------- /manifests/base/kustomization.yaml: -------------------------------------------------------------------------------- 1 | namespace: kepler 2 | 3 | apiVersion: kustomize.config.k8s.io/v1beta1 4 | kind: Kustomization 5 | images: 6 | - name: kepler_model_server 7 | newName: quay.io/sustainable_computing_io/kepler_model_server 8 | newTag: latest 9 | 10 | patchesStrategicMerge: 11 | - ./patch/patch-estimator-sidecar.yaml 12 | - ./patch/patch-model-server.yaml 13 | 14 | resources: 15 | - ../kepler 16 | - ../server 17 | -------------------------------------------------------------------------------- /manifests/base/openshift/estimate-only/kustomization.yaml: -------------------------------------------------------------------------------- 1 | namespace: kepler 2 | 3 | patchesStrategicMerge: 4 | - ./patch/patch-estimator-sidecar.yaml 5 | - ./patch/patch-openshift.yaml 6 | 7 | resources: 8 | - ../kepler 9 | - ./openshift/scc.yaml 10 | -------------------------------------------------------------------------------- /manifests/base/openshift/estimate-with-server/kustomization.yaml: -------------------------------------------------------------------------------- 1 | namespace: kepler 2 | 3 | patchesStrategicMerge: 4 | - ./patch/patch-estimator-sidecar.yaml 5 | - ./patch/patch-model-server.yaml 6 | - ./patch/patch-openshift.yaml 7 | 8 | resources: 9 | - ../kepler 10 | - ../server 11 | - ./openshift/scc.yaml 12 | -------------------------------------------------------------------------------- /manifests/base/openshift/scc.yaml: -------------------------------------------------------------------------------- 1 | # scc for the Kepler 2 | kind: SecurityContextConstraints 3 | apiVersion: security.openshift.io/v1 4 | metadata: 5 | name: kepler-scc 6 | # To allow running privilegedContainers 7 | allowPrivilegedContainer: true 8 | allowHostDirVolumePlugin: true 9 | allowHostNetwork: false 10 | allowHostPorts: false 11 | allowHostIPC: false 12 | allowHostPID: true 13 | readOnlyRootFilesystem: true 14 | defaultAddCapabilities: 15 | - SYS_ADMIN 16 | runAsUser: 17 | type: RunAsAny 18 | seLinuxContext: 19 | type: RunAsAny 20 | fsGroup: 21 | type: RunAsAny 22 | volumes: 23 | - configMap 24 | - projected 25 | - emptyDir 26 | - hostPath 27 | - secret 28 | users: 29 | - kepler 30 | - system:serviceaccount:kepler:kepler-sa 31 | -------------------------------------------------------------------------------- /manifests/base/openshift/serve-only/kustomization.yaml: -------------------------------------------------------------------------------- 1 | namespace: kepler 2 | 3 | patchesStrategicMerge: 4 | - ./patch/patch-model-server.yaml 5 | - ./patch/patch-openshift.yaml 6 | 7 | resources: 8 | - ../kepler 9 | - ../server 10 | - ./openshift/scc.yaml 11 | -------------------------------------------------------------------------------- /manifests/base/patch/patch-estimator-sidecar.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: kepler-cfm 5 | namespace: kepler 6 | data: 7 | MODEL_CONFIG: | 8 | NODE_COMPONENTS_ESTIMATOR=true 9 | NODE_COMPONENTS_INIT_URL=https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-db/main/models/v0.7/ec2-0.7.11/rapl-sysfs/AbsPower/BPFOnly/SGDRegressorTrainer_0.zip 10 | NODE_TOTAL_ESTIMATOR=true 11 | 
NODE_TOTAL_INIT_URL=https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-db/main/models/v0.7/specpower-0.7.11/acpi/AbsPower/BPFOnly/SGDRegressorTrainer_0.zip 12 | --- 13 | apiVersion: apps/v1 14 | kind: DaemonSet 15 | metadata: 16 | name: kepler-exporter 17 | namespace: kepler 18 | spec: 19 | template: 20 | spec: 21 | containers: 22 | # kepler: wait for estimator socket 23 | - command: 24 | - /bin/sh 25 | - -c 26 | args: 27 | - until [ -e /tmp/estimator.sock ]; do sleep 1; done && /usr/bin/kepler -v=$(KEPLER_LOG_LEVEL) 28 | volumeMounts: 29 | - mountPath: /tmp 30 | name: tmp 31 | name: kepler-exporter 32 | # estimator container 33 | - image: kepler_model_server 34 | imagePullPolicy: IfNotPresent 35 | args: [estimator] 36 | name: estimator 37 | volumeMounts: 38 | - name: cfm 39 | mountPath: /etc/kepler/kepler.config 40 | readOnly: true 41 | - mountPath: /tmp 42 | name: tmp 43 | - mountPath: /mnt 44 | name: mnt 45 | volumes: 46 | - emptyDir: {} 47 | name: tmp 48 | - emptyDir: {} 49 | name: mnt 50 | -------------------------------------------------------------------------------- /manifests/base/patch/patch-model-server.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: kepler-cfm 5 | namespace: kepler 6 | data: 7 | MODEL_SERVER_ENABLE: "true" 8 | MODEL_SERVER_ENDPOINT: http://kepler-model-server.$(MODEL_SERVER_NAMESPACE).svc.cluster.local:$(MODEL_SERVER_PORT)/model 9 | MODEL_SERVER_PORT: | 10 | $(MODEL_SERVER_PORT) 11 | MODEL_SERVER_URL: http://kepler-model-server.$(MODEL_SERVER_NAMESPACE).svc.cluster.local:$(MODEL_SERVER_PORT) 12 | MODEL_SERVER_MODEL_REQ_PATH: /model 13 | MODEL_SERVER_MODEL_LIST_PATH: /best-models 14 | -------------------------------------------------------------------------------- /manifests/base/patch/patch-openshift.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | annotations: 5 | openshift.io/description: Kepler exporter 6 | openshift.io/display-name: "" 7 | name: kepler 8 | --- 9 | apiVersion: apps/v1 10 | kind: DaemonSet 11 | metadata: 12 | name: kepler-exporter 13 | namespace: kepler 14 | spec: 15 | template: 16 | spec: 17 | containers: 18 | - name: kepler-exporter 19 | volumeMounts: 20 | - name: kernel-src 21 | mountPath: /usr/src/kernels 22 | securityContext: 23 | privileged: true 24 | volumes: 25 | - name: kernel-src 26 | hostPath: 27 | path: /usr/src/kernels 28 | type: Directory 29 | -------------------------------------------------------------------------------- /manifests/base/patch/patch-server-only.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: kepler-cfm 5 | namespace: kepler 6 | data: 7 | MODEL_CONFIG: | 8 | NODE_COMPONENTS_TRAINER=SGDRegressorTrainer 9 | NODE_TOTAL_TRAINER=SGDRegressorTrainer 10 | -------------------------------------------------------------------------------- /manifests/base/serve-only/kustomization.yaml: -------------------------------------------------------------------------------- 1 | namespace: kepler 2 | 3 | apiVersion: kustomize.config.k8s.io/v1beta1 4 | kind: Kustomization 5 | images: 6 | - name: kepler_model_server 7 | newName: localhost:5001/kepler-model-server 8 | newTag: devel 9 | 10 | patchesStrategicMerge: 11 | - ./patch/patch-model-server.yaml 12 | - ./patch/patch-server-only.yaml 13 | resources: 14 | - ../kepler 
15 | - ../server 16 | -------------------------------------------------------------------------------- /manifests/compose/dev/kepler/common/var/lib/kepler/data/cpus.yaml: -------------------------------------------------------------------------------- 1 | ########## 2 | # CPUS - used to lookup uarch and channels by family, model, and stepping 3 | # The model and stepping fields will be interpreted as regular expressions 4 | # An empty stepping field means 'any' stepping 5 | 6 | ########## 7 | # Intel Core CPUs 8 | ########## 9 | # Haswell 10 | - core: HSW 11 | uarch: Haswell 12 | family: 6 13 | model: (50|69|70) 14 | stepping: 15 | 16 | # Broadwell 17 | - core: BDW 18 | uarch: Broadwell 19 | family: 6 20 | model: (61|71) 21 | stepping: 22 | 23 | # Skylake 24 | - core: SKL 25 | uarch: Skylake 26 | family: 6 27 | model: (78|94) 28 | stepping: 29 | 30 | # Kabylake 31 | - core: KBL 32 | uarch: Kaby Lake 33 | family: 6 34 | model: (142|158) 35 | stepping: 9 36 | 37 | # Coffelake 38 | - core: CFL 39 | uarch: Coffee Lake 40 | family: 6 41 | model: (142|158) 42 | stepping: (10|11|12|13) 43 | 44 | # Rocket Lake 45 | - core: RKL 46 | uarch: Cypress Cove 47 | family: 6 48 | model: 167 49 | stepping: 50 | 51 | # Tiger Lake 52 | - core: TGL 53 | uarch: Willow Cove 54 | family: 6 55 | model: (140|141) 56 | stepping: 57 | 58 | # Alder Lake 59 | - core: ADL 60 | uarch: Golden Cove 61 | family: 6 62 | model: (151|154) 63 | stepping: 64 | 65 | # Raptor Lake 66 | - core: RTL 67 | uarch: Raptor Cove 68 | family: 6 69 | model: 183 70 | stepping: 71 | 72 | ########## 73 | # Intel Xeon CPUs 74 | ########## 75 | # Haswell 76 | - core: HSX 77 | uarch: Haswell 78 | family: 6 79 | model: 63 80 | stepping: 81 | 82 | # Broadwell 83 | - core: BDX 84 | uarch: Broadwell 85 | family: 6 86 | model: (79|86) 87 | stepping: 88 | 89 | # Skylake 90 | - core: SKX 91 | uarch: Skylake 92 | family: 6 93 | model: 85 94 | stepping: (0|1|2|3|4) 95 | 96 | # Cascadelake 97 | - core: CLX 98 | uarch: Cascade Lake 99 | family: 6 100 | model: 85 101 | stepping: (5|6|7) 102 | 103 | # Cooperlake 104 | - core: CPX 105 | uarch: Cooper Lake 106 | family: 6 107 | model: 85 108 | stepping: 11 109 | 110 | # Icelake 111 | - core: ICX 112 | uarch: Sunny Cove 113 | family: 6 114 | model: (106|108) 115 | stepping: 116 | 117 | # Sapphire Rapids 118 | - core: SPR 119 | uarch: Sapphire Rapids 120 | family: 6 121 | model: 143 122 | stepping: 123 | 124 | # Emerald Rapids 125 | - core: EMR 126 | uarch: Emerald Rapids 127 | family: 6 128 | model: 207 129 | stepping: 130 | 131 | # Granite Rapids 132 | - core: GNR 133 | uarch: Granite Rapids 134 | family: 6 135 | model: 173 136 | stepping: 137 | 138 | # Sierra Forest 139 | - core: SRF 140 | uarch: Sierra Forest 141 | family: 6 142 | model: 175 143 | stepping: 144 | 145 | ########## 146 | # AMD CPUs 147 | ########## 148 | # Naples 149 | - core: Naples 150 | uarch: Zen 151 | family: 23 152 | model: 1 153 | stepping: 154 | 155 | # Rome 156 | - core: Rome 157 | uarch: Zen 2 158 | family: 23 159 | model: 49 160 | stepping: 161 | 162 | # Milan 163 | - core: Milan 164 | uarch: Zen 3 165 | family: 25 166 | model: 1 167 | stepping: 168 | 169 | # Genoa 170 | - core: Genoa 171 | uarch: Zen 4 172 | family: 25 173 | model: 17 174 | stepping: 175 | 176 | # Siena 177 | - core: Siena 178 | uarch: Zen 4c 179 | family: 25 180 | model: 160 181 | stepping: 182 | 183 | ########## 184 | # ARM CPUs 185 | ######### 186 | # AWS Graviton 2 187 | - core: Ares 188 | uarch: neoverse_n1 189 | family: 190 | model: 1 191 | stepping: r3p1 192 
| 193 | # AWS Graviton 3 194 | - core: Zeus 195 | uarch: neoverse_v1 196 | family: 197 | model: 1 198 | stepping: r1p1 199 | -------------------------------------------------------------------------------- /manifests/compose/dev/kepler/common/var/lib/kepler/data/model_weight/acpi_AbsPowerModel.json: -------------------------------------------------------------------------------- 1 | {"platform": {"All_Weights": {"Bias_Weight": 220.9079278650894, "Categorical_Variables": {}, "Numerical_Variables": {"bpf_cpu_time_ms": {"scale": 5911.969193263386, "mean": 0, "variance": 0, "weight": 29.028228361462897}}}}} 2 | -------------------------------------------------------------------------------- /manifests/compose/dev/kepler/common/var/lib/kepler/data/model_weight/acpi_DynPowerModel.json: -------------------------------------------------------------------------------- 1 | {"platform": {"All_Weights": {"Bias_Weight": 49.56491877218095, "Categorical_Variables": {}, "Numerical_Variables": {"bpf_cpu_time_ms": {"scale": 5911.969193263386, "mean": 0, "variance": 0, "weight": 28.501356366108837}}}}} 2 | -------------------------------------------------------------------------------- /manifests/compose/dev/kepler/common/var/lib/kepler/data/model_weight/intel_rapl_AbsPowerModel.json: -------------------------------------------------------------------------------- 1 | {"package": {"All_Weights": {"Bias_Weight": 69.91739430907396, "Categorical_Variables": {}, "Numerical_Variables": {"bpf_cpu_time_ms": {"scale": 5911.969193263386, "mean": 0, "variance": 0, "weight": 22.16772409328642}}}}, "core": {"All_Weights": {"Bias_Weight": 0.0, "Categorical_Variables": {}, "Numerical_Variables": {"bpf_cpu_time_ms": {"scale": 5911.969193263386, "mean": 0, "variance": 0, "weight": 0.0}}}}, "uncore": {"All_Weights": {"Bias_Weight": 0.0, "Categorical_Variables": {}, "Numerical_Variables": {"bpf_cpu_time_ms": {"scale": 5911.969193263386, "mean": 0, "variance": 0, "weight": 0.0}}}}, "dram": {"All_Weights": {"Bias_Weight": 47.142633336743344, "Categorical_Variables": {}, "Numerical_Variables": {"bpf_cpu_time_ms": {"scale": 5911.969193263386, "mean": 0, "variance": 0, "weight": 3.57348245077466}}}}} 2 | -------------------------------------------------------------------------------- /manifests/compose/dev/kepler/common/var/lib/kepler/data/model_weight/intel_rapl_DynPowerModel.json: -------------------------------------------------------------------------------- 1 | {"package": {"All_Weights": {"Bias_Weight": 38.856412561925055, "Categorical_Variables": {}, "Numerical_Variables": {"bpf_cpu_time_ms": {"scale": 5911.969193263386, "mean": 0, "variance": 0, "weight": 22.258830113477515}}}}, "core": {"All_Weights": {"Bias_Weight": 0.0, "Categorical_Variables": {}, "Numerical_Variables": {"bpf_cpu_time_ms": {"scale": 5911.969193263386, "mean": 0, "variance": 0, "weight": 0.0}}}}, "uncore": {"All_Weights": {"Bias_Weight": 0.0, "Categorical_Variables": {}, "Numerical_Variables": {"bpf_cpu_time_ms": {"scale": 5911.969193263386, "mean": 0, "variance": 0, "weight": 0.0}}}}, "dram": {"All_Weights": {"Bias_Weight": 9.080889901856153, "Categorical_Variables": {}, "Numerical_Variables": {"bpf_cpu_time_ms": {"scale": 5911.969193263386, "mean": 0, "variance": 0, "weight": 3.0358946796490924}}}}} 2 | -------------------------------------------------------------------------------- /manifests/compose/dev/kepler/metal/etc/kepler/kepler.config/ENABLE_PROCESS_METRICS: -------------------------------------------------------------------------------- 1 | 
true 2 | -------------------------------------------------------------------------------- /manifests/compose/dev/kepler/metal/etc/kepler/kepler.config/EXPOSE_ESTIMATED_IDLE_POWER_METRICS: -------------------------------------------------------------------------------- 1 | false 2 | -------------------------------------------------------------------------------- /manifests/compose/dev/kepler/metal/etc/kepler/kepler.config/EXPOSE_VM_METRICS: -------------------------------------------------------------------------------- 1 | true 2 | -------------------------------------------------------------------------------- /manifests/compose/dev/kepler/models/etc/kepler/kepler.config/ENABLE_PROCESS_METRICS: -------------------------------------------------------------------------------- 1 | true 2 | -------------------------------------------------------------------------------- /manifests/compose/dev/kepler/models/etc/kepler/kepler.config/EXPOSE_ESTIMATED_IDLE_POWER_METRICS: -------------------------------------------------------------------------------- 1 | false 2 | -------------------------------------------------------------------------------- /manifests/compose/dev/kepler/models/etc/kepler/kepler.config/MODEL_CONFIG: -------------------------------------------------------------------------------- 1 | NODE_TOTAL_ESTIMATOR=true 2 | NODE_TOTAL_INIT_URL=https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-db/main/models/v0.7/specpower-0.7.11/acpi/AbsPower/BPFOnly/GradientBoostingRegressorTrainer_0.zip 3 | NODE_COMPONENTS_ESTIMATOR=true 4 | NODE_COMPONENTS_INIT_URL=https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-db/main/models/v0.7/ec2-0.7.11/rapl-sysfs/AbsPower/BPFOnly/GradientBoostingRegressorTrainer_0.zip 5 | -------------------------------------------------------------------------------- /manifests/compose/dev/kepler/models/etc/kepler/kepler.config/MODEL_SERVER_ENABLE: -------------------------------------------------------------------------------- 1 | false 2 | -------------------------------------------------------------------------------- /manifests/compose/dev/kepler/models/etc/kepler/kepler.config/MODEL_SERVER_URL: -------------------------------------------------------------------------------- 1 | http://model-server:8100 2 | -------------------------------------------------------------------------------- /manifests/compose/dev/overrides.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | prometheus: 3 | networks: 4 | - kepler-models-network 5 | - kepler-metal-network 6 | - model-server-network 7 | 8 | volumes: 9 | - type: bind 10 | source: ../dev/prometheus/scrape-configs/dev.yaml 11 | target: /etc/prometheus/scrape-configs/dev.yaml 12 | 13 | grafana: 14 | environment: 15 | GF_DASHBOARDS_DEFAULT_HOME_DASHBOARD_PATH: /var/lib/grafana/dashboards/dev/dashboard.json 16 | volumes: 17 | - type: bind 18 | source: ../dev/grafana/dashboards/dev/ 19 | target: /var/lib/grafana/dashboards/dev 20 | -------------------------------------------------------------------------------- /manifests/compose/dev/prometheus/scrape-configs/dev.yaml: -------------------------------------------------------------------------------- 1 | scrape_configs: 2 | - job_name: models 3 | static_configs: 4 | - targets: [kepler-models:9100] 5 | 6 | - job_name: metal 7 | static_configs: 8 | - targets: [kepler-metal:9100] 9 | -------------------------------------------------------------------------------- 
/manifests/compose/monitoring/compose.yaml: -------------------------------------------------------------------------------- 1 | name: monitoring 2 | 3 | services: 4 | prometheus: 5 | build: 6 | context: ./prometheus 7 | ports: 8 | - 19090:9090 9 | volumes: 10 | - prom-data:/prometheus 11 | - type: bind 12 | source: ./prometheus/prometheus.yml 13 | target: /etc/prometheus/prometheus.yml 14 | networks: 15 | - monitoring 16 | 17 | healthcheck: 18 | test: wget -q --spider http://localhost:9090/ -O /dev/null || exit 1 19 | interval: ${HEALTHCHECK_INTERVAL:-50s} 20 | timeout: ${HEALTHCHECK_TIMEOUT:-30s} 21 | retries: ${HEALTHCHECK_RETRIES:-3} 22 | start_period: ${HEALTHCHECK_START_PERIOD:-1m} 23 | 24 | grafana: 25 | build: 26 | context: ./grafana 27 | environment: 28 | GF_AUTH_ANONYMOUS_ENABLED: "true" 29 | GF_SECURITY_ADMIN_PASSWORD: admin 30 | GF_AUTH_ANONYMOUS_ORG_ROLE: Admin 31 | 32 | user: "1000" # NOTE: change this to your `id -u` 33 | depends_on: 34 | - prometheus 35 | ports: 36 | - 13000:3000 37 | networks: 38 | - monitoring 39 | 40 | healthcheck: 41 | test: curl -f http://localhost:3000/ || exit 1 42 | interval: ${HEALTHCHECK_INTERVAL:-50s} 43 | timeout: ${HEALTHCHECK_TIMEOUT:-30s} 44 | retries: ${HEALTHCHECK_RETRIES:-3} 45 | start_period: ${HEALTHCHECK_START_PERIOD:-1m} 46 | 47 | volumes: 48 | # volume for holding prometheus (ts)db 49 | prom-data: 50 | 51 | networks: 52 | monitoring: 53 | -------------------------------------------------------------------------------- /manifests/compose/monitoring/grafana/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM quay.io/ceph/grafana:10.4.2 2 | 3 | COPY /datasource.yml /etc/grafana/provisioning/datasources/ 4 | COPY /dashboards.yml /etc/grafana/provisioning/dashboards/ 5 | -------------------------------------------------------------------------------- /manifests/compose/monitoring/grafana/dashboards.yml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | providers: 4 | # an unique provider name. Required 5 | - name: kepler 6 | # Org id. Default to 1 7 | orgId: 1 8 | # name of the dashboard folder. 9 | folder: kepler 10 | # provider type. Default to 'file' 11 | type: file 12 | # disable dashboard deletion 13 | disableDeletion: true 14 | # allow updating provisioned dashboards from the UI 15 | allowUiUpdates: true 16 | options: 17 | # path to dashboard files on disk. Required when using the 'file' type 18 | path: /var/lib/grafana/dashboards 19 | # use folder names from filesystem to create folders in Grafana 20 | foldersFromFilesStructure: true 21 | -------------------------------------------------------------------------------- /manifests/compose/monitoring/grafana/datasource.yml: -------------------------------------------------------------------------------- 1 | # config file version 2 | apiVersion: 1 3 | 4 | datasources: 5 | # name of the datasource. Required 6 | - name: kepler-prometheus 7 | # datasource type. Required 8 | type: prometheus 9 | # access mode. direct or proxy. Required 10 | access: proxy 11 | # org id. will default to orgId 1 if not specified 12 | orgId: 1 13 | # url 14 | url: http://prometheus:9090 15 | isDefault: true 16 | version: 1 17 | # allow users to edit datasources from the UI. 
18 | editable: true 19 | -------------------------------------------------------------------------------- /manifests/compose/monitoring/prometheus/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM quay.io/prometheus/prometheus:main 2 | 3 | COPY /prometheus.yml /etc/prometheus/prometheus.yml 4 | 5 | CMD [\ 6 | "--config.file=/etc/prometheus/prometheus.yml",\ 7 | "--storage.tsdb.path=/prometheus", \ 8 | "--web.enable-admin-api" \ 9 | ] 10 | -------------------------------------------------------------------------------- /manifests/compose/monitoring/prometheus/prometheus.yml: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: 5s # Set the scrape interval to every 5 seconds. Default is every 1 minute. 3 | evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute. 4 | # scrape_timeout is set to the global default (10s). 5 | 6 | # Attach these labels to any time series or alerts when communicating with 7 | # external systems (federation, remote storage, Alertmanager). 8 | external_labels: 9 | monitor: kepler 10 | 11 | # A scrape configuration containing exactly one endpoint to scrape: 12 | # Here it's Prometheus itself. 13 | scrape_configs: 14 | # The job name is added as a label `job=` to any timeseries scraped from this config. 15 | - job_name: prometheus 16 | # metrics_path defaults to '/metrics' 17 | # scheme defaults to 'http'. 18 | static_configs: 19 | - targets: [localhost:9090] 20 | 21 | # Load rules once and periodically evaluate them according to 22 | # the global 'evaluation_interval'. 23 | rule_files: 24 | - /etc/prometheus/rules/*.yaml 25 | - /etc/prometheus/rules/*.yml 26 | 27 | # additional scrape configs 28 | scrape_config_files: 29 | - /etc/prometheus/scrape-configs/*.yaml 30 | - /etc/prometheus/scrape-configs/*.yml 31 | 32 | # NOTE: e.g. 
to add more jobs to scrape a 33 | # VM with IP 192.168.122.78 on port 8888, 34 | # - job_name: 'vm' 35 | # static_configs: 36 | # - targets: ['192.168.122.100:8888'] 37 | -------------------------------------------------------------------------------- /manifests/compose/monitoring/prometheus/rules/kepler.rule: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/manifests/compose/monitoring/prometheus/rules/kepler.rule -------------------------------------------------------------------------------- /manifests/kepler/kustomization.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: kustomize.config.k8s.io/v1beta1 2 | kind: Kustomization 3 | resources: 4 | - github.com/sustainable-computing-io/kepler/manifests/k8s/config/base 5 | 6 | patchesStrategicMerge: 7 | - ./patch/patch-ci.yaml 8 | images: 9 | - name: kepler 10 | newName: quay.io/sustainable_computing_io/kepler 11 | newTag: release-0.7.11 12 | -------------------------------------------------------------------------------- /manifests/kepler/patch/patch-ci.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: kepler-cfm 5 | namespace: system 6 | data: 7 | KEPLER_LOG_LEVEL: 4 8 | --- 9 | apiVersion: apps/v1 10 | kind: DaemonSet 11 | metadata: 12 | name: kepler-exporter 13 | namespace: system 14 | spec: 15 | template: 16 | spec: 17 | containers: 18 | - name: kepler-exporter 19 | imagePullPolicy: IfNotPresent 20 | image: kepler:latest 21 | -------------------------------------------------------------------------------- /manifests/offline-trainer/kustomization.yaml: -------------------------------------------------------------------------------- 1 | namespace: kepler 2 | 3 | resources: 4 | - offline-trainer.yaml 5 | 6 | images: 7 | - name: kepler_model_server 8 | newName: quay.io/sustainable_computing_io/kepler_model_server 9 | newTag: latest 10 | -------------------------------------------------------------------------------- /manifests/offline-trainer/offline-trainer.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: kepler-offline-trainer 5 | namespace: system 6 | labels: 7 | app.kubernetes.io/component: model-server 8 | app.kubernetes.io/name: kepler-model-server 9 | spec: 10 | replicas: 1 11 | selector: 12 | matchLabels: 13 | app.kubernetes.io/component: offline-trainer 14 | app.kubernetes.io/name: kepler-offline-trainer 15 | template: 16 | metadata: 17 | labels: 18 | app.kubernetes.io/component: offline-trainer 19 | app.kubernetes.io/name: kepler-offline-trainer 20 | spec: 21 | volumes: 22 | - name: cfm 23 | configMap: 24 | name: kepler-model-server-cfm 25 | - emptyDir: {} 26 | name: mnt 27 | containers: 28 | - name: offline-trainer 29 | image: kepler_model_server 30 | imagePullPolicy: Always 31 | ports: 32 | - containerPort: 8102 33 | name: http 34 | volumeMounts: 35 | - name: cfm 36 | mountPath: /etc/kepler/kepler.config 37 | readOnly: true 38 | - name: mnt 39 | mountPath: /mnt 40 | readOnly: false 41 | args: [offline-trainer] 42 | --- 43 | kind: Service 44 | apiVersion: v1 45 | metadata: 46 | name: kepler-offline-trainer 47 | namespace: system 48 | labels: 49 | app.kubernetes.io/component: offline-trainer 50 | app.kubernetes.io/name: 
kepler-offline-trainer 51 | spec: 52 | clusterIP: None 53 | selector: 54 | app.kubernetes.io/component: offline-trainer 55 | app.kubernetes.io/name: kepler-offline-trainer 56 | ports: 57 | - name: http 58 | port: 8102 59 | targetPort: http 60 | -------------------------------------------------------------------------------- /manifests/server/base/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - server.yaml 3 | 4 | apiVersion: kustomize.config.k8s.io/v1beta1 5 | kind: Kustomization 6 | vars: 7 | - name: MODEL_SERVER_NAMESPACE 8 | objref: 9 | kind: Deployment 10 | group: apps 11 | version: v1 12 | name: kepler-model-server 13 | fieldref: 14 | fieldpath: metadata.namespace 15 | - name: MODEL_SERVER_PORT 16 | objref: 17 | kind: Deployment 18 | group: apps 19 | version: v1 20 | name: kepler-model-server 21 | fieldref: 22 | fieldpath: spec.template.spec.containers[0].ports[0].containerPort 23 | 24 | configurations: 25 | - kustomizeconfig.yaml 26 | -------------------------------------------------------------------------------- /manifests/server/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - server.yaml 3 | 4 | apiVersion: kustomize.config.k8s.io/v1beta1 5 | kind: Kustomization 6 | vars: 7 | - fieldref: 8 | fieldPath: metadata.namespace 9 | name: MODEL_SERVER_NAMESPACE 10 | objref: 11 | group: apps 12 | kind: Deployment 13 | name: kepler-model-server 14 | version: v1 15 | - fieldref: 16 | fieldPath: spec.template.spec.containers[0].ports[0].containerPort 17 | name: MODEL_SERVER_PORT 18 | objref: 19 | group: apps 20 | kind: Deployment 21 | name: kepler-model-server 22 | version: v1 23 | 24 | configurations: 25 | - kustomizeconfig.yaml 26 | images: 27 | - name: kepler_model_server 28 | newName: quay.io/sustainable_computing_io/kepler_model_server 29 | newTag: latest 30 | -------------------------------------------------------------------------------- /manifests/server/kustomizeconfig.yaml: -------------------------------------------------------------------------------- 1 | varReference: 2 | - kind: ConfigMap 3 | group: "" 4 | version: v1 5 | name: kepler-cfm 6 | path: data/MODEL_SERVER_ENDPOINT 7 | - kind: ConfigMap 8 | group: "" 9 | version: v1 10 | name: kepler-cfm 11 | path: data/MODEL_SERVER_URL 12 | - kind: ConfigMap 13 | group: "" 14 | version: v1 15 | name: kepler-cfm 16 | path: data/MODEL_SERVER_PORT 17 | -------------------------------------------------------------------------------- /manifests/server/online-train/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - server.yaml 3 | 4 | patchesStrategicMerge: 5 | - ./online-train/patch-trainer.yaml 6 | 7 | apiVersion: kustomize.config.k8s.io/v1beta1 8 | kind: Kustomization 9 | vars: 10 | - name: MODEL_SERVER_NAMESPACE 11 | objref: 12 | kind: Deployment 13 | group: apps 14 | version: v1 15 | name: kepler-model-server 16 | fieldref: 17 | fieldpath: metadata.namespace 18 | - name: MODEL_SERVER_PORT 19 | objref: 20 | kind: Deployment 21 | group: apps 22 | version: v1 23 | name: kepler-model-server 24 | fieldref: 25 | fieldpath: spec.template.spec.containers[0].ports[0].containerPort 26 | 27 | configurations: 28 | - kustomizeconfig.yaml 29 | -------------------------------------------------------------------------------- /manifests/server/online-train/patch-trainer.yaml: 
-------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: kepler-model-server-cfm 5 | namespace: kepler 6 | data: 7 | PROM_SERVER: http://prometheus-k8s.monitoring.svc.cluster.local:9090 8 | PROM_QUERY_INTERVAL: 20 9 | PROM_QUERY_STEP: 3 10 | PROM_SSL_DISABLE: true 11 | --- 12 | apiVersion: apps/v1 13 | kind: Deployment 14 | metadata: 15 | name: kepler-model-server 16 | namespace: kepler 17 | spec: 18 | template: 19 | spec: 20 | containers: 21 | - name: server-api 22 | - name: online-trainer 23 | image: kepler_model_server 24 | imagePullPolicy: IfNotPresent 25 | volumeMounts: 26 | - name: cfm 27 | mountPath: /etc/kepler/kepler.config 28 | readOnly: true 29 | - name: mnt 30 | mountPath: /mnt 31 | readOnly: false 32 | args: [online-trainer] 33 | -------------------------------------------------------------------------------- /manifests/server/openshift/online-train/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - server.yaml 3 | 4 | patchesStrategicMerge: 5 | - ./openshift/patch-openshift.yaml 6 | - ./online-train/patch-trainer.yaml 7 | - ./openshift/online-train/patch-trainer.yaml 8 | 9 | apiVersion: kustomize.config.k8s.io/v1beta1 10 | kind: Kustomization 11 | vars: 12 | - name: MODEL_SERVER_NAMESPACE 13 | objref: 14 | kind: Deployment 15 | group: apps 16 | version: v1 17 | name: kepler-model-server 18 | fieldref: 19 | fieldpath: metadata.namespace 20 | - name: MODEL_SERVER_PORT 21 | objref: 22 | kind: Deployment 23 | group: apps 24 | version: v1 25 | name: kepler-model-server 26 | fieldref: 27 | fieldpath: spec.template.spec.containers[0].ports[0].containerPort 28 | 29 | configurations: 30 | - kustomizeconfig.yaml 31 | -------------------------------------------------------------------------------- /manifests/server/openshift/online-train/patch-trainer.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: kepler-model-server-cfm 5 | namespace: system 6 | data: 7 | PROM_SERVER: http://prometheus-operated.openshift-monitoring.svc.cluster.local:9090 8 | PROM_QUERY_INTERVAL: 20 9 | PROM_QUERY_STEP: 3 10 | PROM_SSL_DISABLE: true 11 | --- 12 | apiVersion: apps/v1 13 | kind: Deployment 14 | metadata: 15 | name: kepler-model-server 16 | namespace: system 17 | spec: 18 | template: 19 | spec: 20 | containers: 21 | - name: server-api 22 | - name: online-trainer 23 | securityContext: 24 | privileged: true 25 | -------------------------------------------------------------------------------- /manifests/server/openshift/patch-openshift.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: Deployment 3 | metadata: 4 | name: kepler-model-server 5 | namespace: system 6 | spec: 7 | template: 8 | spec: 9 | serviceAccountName: kepler-sa 10 | containers: 11 | - name: server-api 12 | securityContext: 13 | privileged: true 14 | -------------------------------------------------------------------------------- /manifests/server/openshift/serve-only/kustomization.yaml: -------------------------------------------------------------------------------- 1 | resources: 2 | - server.yaml 3 | 4 | patchesStrategicMerge: 5 | - ./openshift/patch-openshift.yaml 6 | 7 | apiVersion: kustomize.config.k8s.io/v1beta1 8 | kind: Kustomization 9 | vars: 10 | - name: MODEL_SERVER_NAMESPACE 11 | objref: 12 | kind: 
Deployment 13 | group: apps 14 | version: v1 15 | name: kepler-model-server 16 | fieldref: 17 | fieldpath: metadata.namespace 18 | - name: MODEL_SERVER_PORT 19 | objref: 20 | kind: Deployment 21 | group: apps 22 | version: v1 23 | name: kepler-model-server 24 | fieldref: 25 | fieldpath: spec.template.spec.containers[0].ports[0].containerPort 26 | 27 | configurations: 28 | - kustomizeconfig.yaml 29 | -------------------------------------------------------------------------------- /manifests/server/server.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: kepler-model-server-cfm 5 | namespace: system 6 | --- 7 | apiVersion: apps/v1 8 | kind: Deployment 9 | metadata: 10 | name: kepler-model-server 11 | namespace: system 12 | labels: 13 | app.kubernetes.io/component: model-server 14 | app.kubernetes.io/name: kepler-model-server 15 | spec: 16 | replicas: 1 17 | selector: 18 | matchLabels: 19 | app.kubernetes.io/component: model-server 20 | app.kubernetes.io/name: kepler-model-server 21 | template: 22 | metadata: 23 | labels: 24 | app.kubernetes.io/component: model-server 25 | app.kubernetes.io/name: kepler-model-server 26 | spec: 27 | volumes: 28 | - name: cfm 29 | configMap: 30 | name: kepler-model-server-cfm 31 | - emptyDir: {} 32 | name: mnt 33 | containers: 34 | - name: server-api 35 | image: kepler_model_server 36 | imagePullPolicy: IfNotPresent 37 | ports: 38 | - containerPort: 8100 39 | name: http 40 | volumeMounts: 41 | - name: cfm 42 | mountPath: /etc/kepler/kepler.config 43 | readOnly: true 44 | - name: mnt 45 | mountPath: /mnt 46 | readOnly: false 47 | args: [model-server] 48 | --- 49 | kind: Service 50 | apiVersion: v1 51 | metadata: 52 | name: kepler-model-server 53 | namespace: system 54 | labels: 55 | app.kubernetes.io/component: model-server 56 | app.kubernetes.io/name: kepler-model-server 57 | spec: 58 | clusterIP: None 59 | selector: 60 | app.kubernetes.io/component: model-server 61 | app.kubernetes.io/name: kepler-model-server 62 | ports: 63 | - name: http 64 | port: 8100 65 | targetPort: http 66 | -------------------------------------------------------------------------------- /manifests/set.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # 3 | # This file is part of the Kepler project 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | # Copyright 2022 The Kepler Contributors 18 | # 19 | 20 | # set options 21 | # for example: ./set.sh "ESTIMATOR SERVER" 22 | unset $SERVER 23 | unset $ONLINE_TRAINER 24 | unset $ESTIMATOR 25 | unset $OPENSHIFT_DEPLOY 26 | 27 | DEPLOY_OPTIONS=$1 28 | for opt in ${DEPLOY_OPTIONS}; do export $opt=true; done; 29 | 30 | echo DEPLOY_OPTIONS=${DEPLOY_OPTIONS} 31 | 32 | version=$(kubectl version| grep 'Client Version' | sed 's/.*v//g' | cut -b -4) 33 | if [ 1 -eq "$(echo "${version} < 1.21" | bc)" ] 34 | then 35 | echo "You need to update your kubectl version to 1.21+ to support kustomize" 36 | exit 1 37 | fi 38 | 39 | echo "Preparing manifests..." 40 | 41 | if [ ! -z ${SERVER} ]; then 42 | echo "deploy model server" 43 | if [ ! -z ${ESTIMATOR} ]; then 44 | echo "add estimator-sidecar" 45 | # OPTS="ESTIMATOR SERVER" --> base 46 | cp ./manifests/base/estimate-with-server/kustomization.yaml ./manifests/base/kustomization.yaml 47 | if [ ! -z ${OPENSHIFT_DEPLOY} ]; then 48 | echo "patch openshift deployment for exporter (estimator-with-server)" 49 | # OPTS="ESTIMATOR SERVER OPENSHIFT_DEPLOY" --> base 50 | cp ./manifests/base/openshift/estimate-with-server/kustomization.yaml ./manifests/base/kustomization.yaml 51 | fi 52 | else 53 | # OPTS="SERVER" --> base 54 | cp ./manifests/base/serve-only/kustomization.yaml ./manifests/base/kustomization.yaml 55 | if [ ! -z ${OPENSHIFT_DEPLOY} ]; then 56 | echo "patch openshift deployment for exporter (serve-only)" 57 | # OPTS="SERVER OPENSHIFT_DEPLOY" --> base 58 | cp ./manifests/base/openshift/serve-only/kustomization.yaml ./manifests/base/kustomization.yaml 59 | fi 60 | fi 61 | 62 | if [ ! -z ${ONLINE_TRAINER} ]; then 63 | echo "add online trainer" 64 | # OPTS="... SERVER ONLINE_TRAINER" --> server 65 | cp ./manifests/server/online-train/kustomization.yaml ./manifests/server/kustomization.yaml 66 | if [ ! -z ${OPENSHIFT_DEPLOY} ]; then 67 | echo "patch openshift deployment for server (with online trainer)" 68 | # OPTS="... SERVER ONLINE_TRAINER OPENSHIFT_DEPLOY" --> server 69 | cp ./manifests/server/openshift/online-train/kustomization.yaml ./manifests/server/kustomization.yaml 70 | fi 71 | else 72 | # OPTS="... SERVER" --> server 73 | cp ./manifests/server/base/kustomization.yaml ./manifests/server/kustomization.yaml 74 | if [ ! -z ${OPENSHIFT_DEPLOY} ]; then 75 | echo "patch openshift deployment for server" 76 | # OPTS="... SERVER OPENSHIFT_DEPLOY" --> server 77 | cp ./manifests/server/openshift/serve-only/kustomization.yaml ./manifests/server/kustomization.yaml 78 | fi 79 | fi 80 | elif [ ! -z ${ESTIMATOR} ]; then 81 | echo "add estimator-sidecar" 82 | # OPTS="ESTIMATOR" --> base 83 | cp ./manifests/base/estimate-only/kustomization.yaml ./manifests/base/kustomization.yaml 84 | if [ ! 
-z ${OPENSHIFT_DEPLOY} ]; then 85 | echo "patch openshift deployment for exporter (estimator-only)" 86 | # OPTS="ESTIMATOR OPENSHIFT_DEPLOY" --> base 87 | cp ./manifests/base/openshift/estimate-only/kustomization.yaml ./manifests/base/kustomization.yaml 88 | fi 89 | fi 90 | 91 | for opt in ${DEPLOY_OPTIONS}; do unset $opt; done; 92 | 93 | echo "Done $0" -------------------------------------------------------------------------------- /manifests/test/file-server.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: model-db 5 | namespace: kepler 6 | labels: 7 | app.kubernetes.io/component: model-db 8 | spec: 9 | containers: 10 | - name: file-server 11 | image: localhost:5001/kepler_model_server:devel-test 12 | imagePullPolicy: IfNotPresent 13 | args: [python3, tests/http_server.py] 14 | ports: 15 | - containerPort: 8110 16 | name: http 17 | volumeMounts: 18 | - name: mnt 19 | mountPath: /mnt 20 | initContainers: 21 | - name: trainer 22 | image: localhost:5001/kepler_model_server:devel-test 23 | imagePullPolicy: IfNotPresent 24 | args: [python3, tests/minimal_trainer.py] 25 | volumeMounts: 26 | - name: mnt 27 | mountPath: /mnt 28 | # Add other init container configurations here 29 | volumes: 30 | - name: mnt 31 | emptyDir: {} 32 | --- 33 | kind: Service 34 | apiVersion: v1 35 | metadata: 36 | name: model-db 37 | namespace: kepler 38 | labels: 39 | app.kubernetes.io/component: model-db 40 | spec: 41 | clusterIP: None 42 | selector: 43 | app.kubernetes.io/component: model-db 44 | ports: 45 | - name: http 46 | port: 8110 47 | targetPort: http 48 | -------------------------------------------------------------------------------- /manifests/test/model-request-client.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: kepler-exporter 5 | namespace: kepler 6 | spec: 7 | template: 8 | spec: 9 | containers: 10 | - name: kepler-exporter 11 | image: localhost:5001/kepler_model_server:devel-test 12 | imagePullPolicy: IfNotPresent 13 | command: [/bin/bash, -c] 14 | args: [python3 tests/weight_model_request_test.py && echo Done && sleep infinity] 15 | volumeMounts: 16 | - name: cfm 17 | mountPath: /etc/kepler/kepler.config 18 | readOnly: true 19 | - mountPath: /tmp 20 | name: tmp 21 | volumes: 22 | - emptyDir: {} 23 | name: tmp 24 | -------------------------------------------------------------------------------- /manifests/test/patch-estimator-sidecar.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | MODEL_CONFIG: | 3 | NODE_COMPONENTS_ESTIMATOR=true 4 | NODE_COMPONENTS_INIT_URL=http://model-db.kepler.svc.cluster.local:8110/std_v0.7.11/rapl-sysfs/AbsPower/BPFOnly/GradientBoostingRegressorTrainer_0.zip 5 | NODE_TOTAL_ESTIMATOR=true 6 | NODE_TOTAL_INIT_URL=http://model-db.kepler.svc.cluster.local:8110/std_v0.7.11/acpi/AbsPower/BPFOnly/GradientBoostingRegressorTrainer_0.zip 7 | -------------------------------------------------------------------------------- /manifests/test/power-request-client.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: apps/v1 2 | kind: DaemonSet 3 | metadata: 4 | name: kepler-exporter 5 | namespace: kepler 6 | spec: 7 | template: 8 | spec: 9 | containers: 10 | - name: kepler-exporter 11 | image: localhost:5001/kepler_model_server:devel-test 12 | imagePullPolicy: IfNotPresent 13 | command: 
[/bin/bash, -c] 14 | args: ["until [ -e /tmp/estimator.sock ]; do sleep 1; done && python3 -u tests/estimator_power_request_test.py && echo Done && sleep infinity"] 15 | volumeMounts: 16 | - name: cfm 17 | mountPath: /etc/kepler/kepler.config 18 | readOnly: true 19 | - mountPath: /tmp 20 | name: tmp 21 | - name: estimator 22 | volumes: 23 | - emptyDir: {} 24 | name: tmp 25 | -------------------------------------------------------------------------------- /model_training/README.md: -------------------------------------------------------------------------------- 1 | # Contribute to power profiling and model training 2 | 3 | 4 | 5 | - [Contribute to power profiling and model training](#contribute-to-power-profiling-and-model-training) 6 | - [Requirements](#requirements) 7 | - [Pre-step](#pre-step) 8 | - [Setup](#setup) 9 | - [Prepare cluster](#prepare-cluster) 10 | - [From scratch (no target kubernetes cluster)](#from-scratch-no-target-kubernetes-cluster) 11 | - [For managed cluster](#for-managed-cluster) 12 | - [Run benchmark and collect metrics](#run-benchmark-and-collect-metrics) 13 | - [With manual execution](#with-manual-execution) 14 | - [Clean up](#clean-up) 15 | 16 | 17 | 18 | ## Requirements 19 | 20 | - git > 2.22 21 | - hatch 22 | - kubectl 23 | - yq, jq 24 | - power meter if available 25 | 26 | ## Pre-step 27 | 28 | - Fork and clone this repository and move to the `model_training` folder 29 | 30 | ```bash 31 | git clone 32 | cd model_training 33 | ``` 34 | 35 | ## Setup 36 | 37 | ### Prepare cluster 38 | 39 | ### From scratch (no target kubernetes cluster) 40 | 41 | > Note: ports 9090 and 5101 must not be in use; they are used to port-forward Prometheus and the kind registry, respectively. 42 | 43 | ```bash 44 | ./script.sh prepare_cluster 45 | ``` 46 | 47 | The script will: 48 | 49 | - create a kind cluster `kind-for-training` with a registry at port `5101`. 50 | - deploy Prometheus. 51 | - deploy Prometheus RBAC and a node port on port `30090` of the kind node, which is forwarded to port `9090` on the host. 52 | - deploy a service monitor for Kepler and reload the Prometheus server configuration. 53 | 54 | ### For managed cluster 55 | 56 | Please confirm the following requirements: 57 | 58 | - Kepler installation 59 | - Prometheus installation 60 | - Kepler metrics are exported to the Prometheus server 61 | - Prometheus server is available at `http://localhost:9090`. Otherwise, set the `PROM_SERVER` environment variable (see the example below).
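For example (a minimal sketch; the endpoint below is a placeholder, not a value taken from this repository), point the training tooling at your own Prometheus instance before collecting metrics:

```bash
# Placeholder endpoint - replace with the address of your Prometheus server.
export PROM_SERVER=http://my-prometheus.example.com:9090
```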
62 | 63 | ### Run benchmark and collect metrics 64 | 65 | - [Tekton Pipeline Instruction](./tekton/README.md) 66 | 67 | ### With manual execution 68 | 69 | In addition to the above approach, you can manually run your own benchmarks, then collect, train, and export the models by the entrypoint 70 | 71 | [Manual Metric Collection and Training with Entrypoint](./cmd_instruction.md) 72 | 73 | ## Clean up 74 | 75 | For kind-for-training cluster: 76 | 77 | ```bash 78 | ./script.sh cleanup 79 | ``` 80 | -------------------------------------------------------------------------------- /model_training/deployment/prom-kepler-rbac.yaml: -------------------------------------------------------------------------------- 1 | kind: Role 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | metadata: 4 | name: prometheus-k8s 5 | namespace: kepler 6 | labels: 7 | app.kubernetes.io/component: prometheus 8 | app.kubernetes.io/instance: k8s 9 | app.kubernetes.io/name: prometheus 10 | rules: 11 | - verbs: 12 | - get 13 | - list 14 | - watch 15 | apiGroups: 16 | - "" # yamllint disable-line rule:quoted-strings 17 | resources: 18 | - services 19 | - endpoints 20 | - pods 21 | - verbs: 22 | - get 23 | - list 24 | - watch 25 | apiGroups: 26 | - extensions 27 | resources: 28 | - ingresses 29 | - verbs: 30 | - get 31 | - list 32 | - watch 33 | apiGroups: 34 | - networking.k8s.io 35 | resources: 36 | - ingresses 37 | --- 38 | kind: RoleBinding 39 | apiVersion: rbac.authorization.k8s.io/v1 40 | metadata: 41 | name: prometheus-k8s 42 | namespace: kepler 43 | labels: 44 | app.kubernetes.io/component: prometheus 45 | app.kubernetes.io/instance: k8s 46 | app.kubernetes.io/name: prometheus 47 | subjects: 48 | - kind: ServiceAccount 49 | name: prometheus-k8s 50 | namespace: monitoring 51 | roleRef: 52 | apiGroup: rbac.authorization.k8s.io 53 | kind: Role 54 | name: prometheus-k8s 55 | -------------------------------------------------------------------------------- /model_training/deployment/prom-np.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | labels: 5 | app.kubernetes.io/component: prometheus 6 | app.kubernetes.io/instance: k8s 7 | app.kubernetes.io/name: prometheus 8 | app.kubernetes.io/part-of: kube-prometheus 9 | name: prometheus-k8s-np 10 | namespace: monitoring 11 | spec: 12 | ports: 13 | - name: web 14 | port: 9090 15 | protocol: TCP 16 | targetPort: web 17 | nodePort: 30090 18 | selector: 19 | app.kubernetes.io/component: prometheus 20 | app.kubernetes.io/instance: k8s 21 | app.kubernetes.io/name: prometheus 22 | app.kubernetes.io/part-of: kube-prometheus 23 | type: NodePort 24 | -------------------------------------------------------------------------------- /model_training/s3/Dockerfile: -------------------------------------------------------------------------------- 1 | # NOTE: Dockerfile for generating quay.io/kepler_model_server/s3 images 2 | 3 | FROM python:3.10-slim 4 | 5 | WORKDIR /usr/local 6 | 7 | COPY . /usr/local 8 | RUN pip install --no-cache-dir . && \ 9 | pip cache purge 10 | -------------------------------------------------------------------------------- /model_training/s3/README.md: -------------------------------------------------------------------------------- 1 | # S3-Pusher 2 | 3 | A simple script and Dockerfile to push model_training/data folder to s3 bucket. 
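As a rough illustration of how the pusher can be invoked (a sketch based on the CLI flags defined in `pusher.py` and `util.py`; every credential, bucket, and file name below is a placeholder rather than a documented default):

```bash
# Illustrative only - all values are placeholders, not defaults from this repository.
s3-pusher aws \
  --aws-access-key-id "$AWS_ACCESS_KEY_ID" \
  --aws-secret-access-key "$AWS_SECRET_ACCESS_KEY" \
  --region-name us-east-1 \
  --bucket-name my-kepler-models \
  --mnt-path /mnt \
  --machine-id my-machine \
  --query-data kepler_query \
  --idle-data idle
```

The `ibmcloud` subcommand works the same way, taking `--api-key`, `--service-instance-id`, and `--service-endpoint` in place of the AWS credentials.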
4 | -------------------------------------------------------------------------------- /model_training/s3/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "s3" 7 | dynamic = ["version"] 8 | description = '' 9 | readme = "README.md" 10 | requires-python = ">=3.10" 11 | license = "Apache-2.0" 12 | keywords = [] 13 | authors = [ 14 | { name = "Sunyanan Choochotkaew", email = "sunyanan.choochotkaew1@ibm.com" }, 15 | ] 16 | classifiers = [ 17 | "Programming Language :: Python", 18 | "Programming Language :: Python :: 3.10", 19 | "Programming Language :: Python :: Implementation :: CPython", 20 | "Programming Language :: Python :: Implementation :: PyPy", 21 | ] 22 | dependencies = [ 23 | "boto3", 24 | "ibm-cos-sdk", 25 | ] 26 | 27 | [project.urls] 28 | Documentation = "https://github.com/sustainable-computing-io/kepler-model-server#readme" 29 | Issues = "https://github.com/sustainable-computing-io/kepler-model-server/issues" 30 | Source = "https://github.com/sustainable-computing-io/kepler-model-server" 31 | 32 | [project.scripts] 33 | s3-loader = "s3.loader:run" 34 | s3-pusher = "s3.pusher:run" 35 | 36 | [tool.hatch.version] 37 | path = "src/s3/__about__.py" 38 | 39 | [tool.hatch.envs.default] 40 | python = "3.10" 41 | 42 | [tool.hatch.envs.types] 43 | extra-dependencies = [ 44 | "mypy>=1.0.0", 45 | ] 46 | [tool.hatch.envs.types.scripts] 47 | check = "mypy --install-types --non-interactive {args:src/s3 tests}" 48 | 49 | [tool.coverage.run] 50 | source_pkgs = ["s3", "tests"] 51 | branch = true 52 | parallel = true 53 | omit = [ 54 | "src/s3/__about__.py", 55 | ] 56 | 57 | [tool.coverage.paths] 58 | s3 = ["src/s3", "*/s3/src/s3"] 59 | tests = ["tests", "*/s3/tests"] 60 | 61 | [tool.coverage.report] 62 | exclude_lines = [ 63 | "no cov", 64 | "if __name__ == .__main__.:", 65 | "if TYPE_CHECKING:", 66 | ] 67 | -------------------------------------------------------------------------------- /model_training/s3/src/s3/__about__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2024-present Sunil Thaha 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | __version__ = "0.7.11" 5 | -------------------------------------------------------------------------------- /model_training/s3/src/s3/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2024-present Sunil Thaha 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /model_training/s3/src/s3/loader.py: -------------------------------------------------------------------------------- 1 | ## get client 2 | # client = new__client(args) 3 | ## upload all files in mnt path 4 | # _upload(client, mnt_path) 5 | import argparse 6 | import os 7 | 8 | from . 
import util 9 | 10 | model_dir = "models" 11 | data_dir = "data" 12 | machine_spec_dir = "machine_spec" 13 | 14 | 15 | def aws_list_keys(client, bucket_name, prefix): 16 | response = client.list_objects_v2(Bucket=bucket_name, Prefix=prefix) 17 | return [obj["Key"] for obj in response.get("Contents", [])] 18 | 19 | 20 | def ibmcloud_list_keys(client, bucket_name, prefix): 21 | bucket_obj = client.Bucket(bucket_name) 22 | data_response = bucket_obj.objects.filter(Prefix=prefix) 23 | return [obj.key for obj in data_response] 24 | 25 | 26 | def get_bucket_file_map(client, bucket_name, machine_id, mnt_path, pipeline_name, list_func): 27 | bucket_file_map = dict() 28 | top_key_path = "" 29 | if machine_id is not None and machine_id != "": 30 | top_key_path = "/" + machine_id 31 | # add data key map 32 | data_path = os.path.join(mnt_path, data_dir) 33 | datapath_prefix = top_key_path + "/data" 34 | keys = list_func(client, bucket_name, datapath_prefix) 35 | for key in keys: 36 | filepath = key.replace(datapath_prefix, data_path) 37 | bucket_file_map[key] = filepath 38 | # add model key map 39 | model_path = os.path.join(mnt_path, model_dir, pipeline_name) 40 | model_predix = "/models/" + pipeline_name 41 | keys = list_func(client, bucket_name, model_predix) 42 | for key in keys: 43 | filepath = key.replace(model_predix, model_path) 44 | bucket_file_map[key] = filepath 45 | return bucket_file_map 46 | 47 | 48 | def aws_download(client, bucket_name, machine_id, mnt_path, pipeline_name): 49 | print("AWS Download") 50 | bucket_file_map = get_bucket_file_map(client, bucket_name, machine_id=machine_id, mnt_path=mnt_path, pipeline_name=pipeline_name, list_func=aws_list_keys) 51 | for key, filepath in bucket_file_map.items(): 52 | print(key, filepath) 53 | dir = os.path.dirname(filepath) 54 | if not os.path.exists(dir): 55 | os.makedirs(dir) 56 | client.download_file(bucket_name, key, filepath) 57 | 58 | 59 | def ibm_download(client, bucket_name, machine_id, mnt_path, pipeline_name): 60 | print("IBM Download") 61 | bucket_file_map = get_bucket_file_map( 62 | client, bucket_name, machine_id=machine_id, mnt_path=mnt_path, pipeline_name=pipeline_name, list_func=ibmcloud_list_keys 63 | ) 64 | for key, filepath in bucket_file_map.items(): 65 | print(key, filepath) 66 | dir = os.path.dirname(filepath) 67 | if not os.path.exists(dir): 68 | os.makedirs(dir) 69 | client.Bucket(bucket_name).download_file(key, filepath) 70 | 71 | 72 | def add_common_args(subparser): 73 | subparser.add_argument("--bucket-name", help="Bucket name", required=True) 74 | subparser.add_argument("--mnt-path", help="Mount path", required=True) 75 | subparser.add_argument("--pipeline-name", help="Pipeline name") 76 | subparser.add_argument("--machine-id", help="Machine ID") 77 | 78 | 79 | def run(): 80 | parser = argparse.ArgumentParser(description="S3 Pusher") 81 | args = util.get_command(parser, add_common_args, ibm_download, aws_download) 82 | if hasattr(args, "new_client_func") and hasattr(args, "func"): 83 | client = args.new_client_func(args) 84 | args.func(client, args.bucket_name, args.machine_id, args.mnt_path, args.pipeline_name) 85 | else: 86 | parser.print_help() 87 | 88 | 89 | if __name__ == "__main__": 90 | run() 91 | -------------------------------------------------------------------------------- /model_training/s3/src/s3/pusher.py: -------------------------------------------------------------------------------- 1 | ## get client 2 | # client = new__client(args) 3 | ## upload all files in mnt path 4 | # _upload(client, 
mnt_path) 5 | import argparse 6 | import os 7 | 8 | from . import util 9 | 10 | model_dir = "models" 11 | data_dir = "data" 12 | machine_spec_dir = "machine_spec" 13 | 14 | 15 | def get_bucket_file_map(machine_id, mnt_path, query_data, idle_data): 16 | model_path = os.path.join(mnt_path, model_dir) 17 | bucket_file_map = dict() 18 | top_key_path = "" 19 | if machine_id is not None and machine_id != "": 20 | top_key_path = "/" + machine_id 21 | if os.path.exists(model_path): 22 | for root, _, files in os.walk(model_path): 23 | for file in files: 24 | filepath = os.path.join(root, file) 25 | key = filepath.replace(model_path, "/models") 26 | bucket_file_map[key] = filepath 27 | data_path = os.path.join(mnt_path, data_dir) 28 | for data_filename in [query_data, idle_data]: 29 | if data_filename is not None: 30 | filepath = os.path.join(data_path, data_filename + ".json") 31 | if os.path.exists(filepath): 32 | key = filepath.replace(data_path, top_key_path + "/data") 33 | bucket_file_map[key] = filepath 34 | filepath = os.path.join(data_path, machine_spec_dir, machine_id + ".json") 35 | if os.path.exists(filepath): 36 | key = filepath.replace(data_path, top_key_path + "/data") 37 | bucket_file_map[key] = filepath 38 | return bucket_file_map 39 | 40 | 41 | def aws_upload(client, bucket_name, machine_id, mnt_path, query_data, idle_data): 42 | print("AWS Upload") 43 | bucket_file_map = get_bucket_file_map(machine_id=machine_id, mnt_path=mnt_path, query_data=query_data, idle_data=idle_data) 44 | for key, filepath in bucket_file_map.items(): 45 | print(key, filepath) 46 | client.upload_file(filepath, bucket_name, key) 47 | 48 | 49 | def ibm_upload(client, bucket_name, machine_id, mnt_path, query_data, idle_data): 50 | print("IBM Upload") 51 | bucket_file_map = get_bucket_file_map(machine_id=machine_id, mnt_path=mnt_path, query_data=query_data, idle_data=idle_data) 52 | for key, filepath in bucket_file_map.items(): 53 | print(key, filepath) 54 | client.Object(bucket_name, key).upload_file(filepath) 55 | 56 | 57 | def add_common_args(subparser): 58 | subparser.add_argument("--bucket-name", help="Bucket name", required=True) 59 | subparser.add_argument("--mnt-path", help="Mount path", required=True) 60 | subparser.add_argument("--query-data", help="Query data filename") 61 | subparser.add_argument("--idle-data", help="Idle data filename") 62 | subparser.add_argument("--machine-id", help="Machine ID") 63 | 64 | 65 | def run(): 66 | parser = argparse.ArgumentParser(description="S3 Pusher") 67 | args = util.get_command(parser, add_common_args, ibm_upload, aws_upload) 68 | if hasattr(args, "new_client_func") and hasattr(args, "func"): 69 | client = args.new_client_func(args) 70 | args.func(client, args.bucket_name, args.machine_id, args.mnt_path, args.query_data, args.idle_data) 71 | else: 72 | parser.print_help() 73 | 74 | 75 | if __name__ == "__main__": 76 | run() 77 | -------------------------------------------------------------------------------- /model_training/s3/src/s3/util.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import s3.__about__ as about 4 | 5 | 6 | def new_ibm_client(args): 7 | import ibm_boto3 8 | from ibm_botocore.client import Config 9 | 10 | cos = ibm_boto3.resource( 11 | "s3", 12 | ibm_api_key_id=args.api_key, 13 | ibm_service_instance_id=args.service_instance_id, 14 | config=Config(signature_version="oauth"), 15 | endpoint_url=args.service_endpoint, 16 | ) 17 | return cos 18 | 19 | 20 | def new_aws_client(args): 21 
| import boto3 as aws_boto3 22 | 23 | s3 = aws_boto3.client("s3", aws_access_key_id=args.aws_access_key_id, aws_secret_access_key=args.aws_secret_access_key, region_name=args.region_name) 24 | return s3 25 | 26 | 27 | def get_command(parser: argparse.ArgumentParser, add_common_args, ibm_func, aws_func): 28 | parser.add_argument("--version", action="version", version=about.__version__) 29 | 30 | subparsers = parser.add_subparsers(title="S3 provider", dest="provider") 31 | ibm_parser = subparsers.add_parser("ibmcloud", help="IBM Cloud") 32 | ibm_parser.add_argument("--api-key", type=str, help="API key", required=True) 33 | ibm_parser.add_argument("--service-instance-id", type=str, help="Service instance ID", required=True) 34 | ibm_parser.add_argument("--service-endpoint", type=str, help="Service endpoint", required=True) 35 | add_common_args(ibm_parser) 36 | ibm_parser.set_defaults(new_client_func=new_ibm_client, func=ibm_func) 37 | 38 | aws_parser = subparsers.add_parser("aws", help="AWS") 39 | aws_parser.add_argument("--aws-access-key-id", type=str, help="Access key ID", required=True) 40 | aws_parser.add_argument("--aws-secret-access-key", type=str, help="Secret key", required=True) 41 | aws_parser.add_argument("--region-name", type=str, help="Region name", required=True) 42 | add_common_args(aws_parser) 43 | aws_parser.set_defaults(new_client_func=new_aws_client, func=aws_func) 44 | 45 | args = parser.parse_args() 46 | 47 | return args 48 | -------------------------------------------------------------------------------- /model_training/s3/tests/__init__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2024-present Sunil Thaha 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | -------------------------------------------------------------------------------- /model_training/tekton/examples/complete-pipelinerun.yaml: -------------------------------------------------------------------------------- 1 | # example-complete-train-pipeline 2 | # running pipelines with all default value to train AbsPower/DynPower for all energysource and featuregroup 3 | apiVersion: tekton.dev/v1 4 | kind: PipelineRun 5 | metadata: 6 | name: example-complete-train-pipeline 7 | spec: 8 | timeouts: 9 | pipeline: 6h 10 | tasks: 5h50m 11 | workspaces: 12 | - name: mnt 13 | persistentVolumeClaim: 14 | claimName: task-pvc 15 | params: 16 | - name: PIPELINE_NAME 17 | value: CompleteTrainPipelineExample 18 | # the below parameters are for short test run 19 | - name: STRESS_ARGS 20 | value: 21 | - cpu;none;none 22 | - name: STRESS_TIMEOUT 23 | value: 20 24 | - name: STRESS_BREAK_INTERVAL 25 | value: 1 26 | - name: IDLE_COLLECT_INTERVAL 27 | value: 100 28 | - name: CPU_FREQUENCY_ENABLED 29 | value: false 30 | pipelineRef: 31 | name: complete-train-pipeline 32 | -------------------------------------------------------------------------------- /model_training/tekton/examples/single-train/abs-power.yaml: -------------------------------------------------------------------------------- 1 | # example-abs-train-pipeline: 2 | # running pipelines with all default value to train AbsPower model (rapl-sysfs, BPFOnly) 3 | apiVersion: tekton.dev/v1 4 | kind: PipelineRun 5 | metadata: 6 | name: example-abs-train-pipeline 7 | spec: 8 | timeouts: 9 | pipeline: 6h 10 | tasks: 5h50m 11 | workspaces: 12 | - name: mnt 13 | persistentVolumeClaim: 14 | claimName: task-pvc 15 | params: 16 | - name: PIPELINE_NAME 17 | value: AbsPowerTrainPipelineExample 18 | - name: OUTPUT_TYPE 19 | 
value: AbsPower 20 | # the below parameters are for short test run 21 | - name: STRESS_ARGS 22 | value: 23 | - cpu;none;none 24 | - name: STRESS_TIMEOUT 25 | value: 20 26 | - name: STRESS_BREAK_INTERVAL 27 | value: 1 28 | - name: IDLE_COLLECT_INTERVAL 29 | value: 100 30 | - name: CPU_FREQUENCY_ENABLED 31 | value: false 32 | pipelineRef: 33 | name: single-train-pipeline 34 | -------------------------------------------------------------------------------- /model_training/tekton/examples/single-train/aws-push.yaml: -------------------------------------------------------------------------------- 1 | # test-pipeline-aws 2 | # short run of pipelines to test e2e from collect to train with AWS COS 3 | apiVersion: tekton.dev/v1 4 | kind: PipelineRun 5 | metadata: 6 | name: test-pipeline-aws 7 | spec: 8 | timeouts: 9 | pipeline: 6h 10 | tasks: 5h50m 11 | workspaces: 12 | - name: mnt 13 | persistentVolumeClaim: 14 | claimName: task-pvc 15 | params: 16 | - name: PIPELINE_NAME 17 | value: AbsPowerTrainPipelineExample 18 | - name: OUTPUT_TYPE 19 | value: AbsPower 20 | - name: MACHINE_ID 21 | value: test 22 | - name: COS_PROVIDER 23 | value: aws 24 | - name: COS_SECRET_NAME 25 | value: aws-cos-secret 26 | # the below parameters are for short test run 27 | - name: STRESS_ARGS 28 | value: 29 | - cpu;none;none 30 | - name: STRESS_TIMEOUT 31 | value: 20 32 | - name: STRESS_BREAK_INTERVAL 33 | value: 1 34 | - name: IDLE_COLLECT_INTERVAL 35 | value: 100 36 | - name: CPU_FREQUENCY_ENABLED 37 | value: false 38 | pipelineRef: 39 | name: single-train-pipeline 40 | -------------------------------------------------------------------------------- /model_training/tekton/examples/single-train/default.yaml: -------------------------------------------------------------------------------- 1 | # kepler-default 2 | # running pipelines with all default value to train AbsPower model (rapl-sysfs, BPFOnly) with COS 3 | apiVersion: tekton.dev/v1 4 | kind: PipelineRun 5 | metadata: 6 | name: default 7 | spec: 8 | timeouts: 9 | pipeline: 6h 10 | tasks: 5h50m 11 | workspaces: 12 | - name: mnt 13 | persistentVolumeClaim: 14 | claimName: task-pvc 15 | params: 16 | - name: PIPELINE_NAME 17 | value: AbsPowerTrainPipelineExample 18 | - name: OUTPUT_TYPE 19 | value: AbsPower 20 | # Uncomment the following lines for IBM Cloud COS 21 | # - name: COS_PROVIDER 22 | # value: ibmcloud 23 | # - name: COS_SECRET_NAME 24 | # value: ibm-cos-secret 25 | # Uncomment the following lines for AWS COS 26 | # - name: COS_PROVIDER 27 | # value: aws 28 | # - name: COS_SECRET_NAME 29 | # value: aws-cos-secret 30 | pipelineRef: 31 | name: single-train-pipeline 32 | -------------------------------------------------------------------------------- /model_training/tekton/examples/single-train/dyn-power.yaml: -------------------------------------------------------------------------------- 1 | # example-dyn-train-pipeline: 2 | # running pipelines with all default value to train DynPower model (rapl-sysfs, BPFOnly) 3 | apiVersion: tekton.dev/v1 4 | kind: PipelineRun 5 | metadata: 6 | name: example-dyn-train-pipeline 7 | spec: 8 | timeouts: 9 | pipeline: 6h 10 | tasks: 5h50m 11 | workspaces: 12 | - name: mnt 13 | persistentVolumeClaim: 14 | claimName: task-pvc 15 | params: 16 | - name: PIPELINE_NAME 17 | value: DynPowerTrainPipelineExample 18 | - name: OUTPUT_TYPE 19 | value: DynPower 20 | # the below parameters are for short test run 21 | - name: STRESS_ARGS 22 | value: 23 | - cpu;none;none 24 | - name: STRESS_TIMEOUT 25 | value: 20 26 | - name: 
STRESS_BREAK_INTERVAL 27 | value: 1 28 | - name: IDLE_COLLECT_INTERVAL 29 | value: 100 30 | - name: CPU_FREQUENCY_ENABLED 31 | value: false 32 | pipelineRef: 33 | name: single-train-pipeline 34 | -------------------------------------------------------------------------------- /model_training/tekton/examples/single-train/ibmcloud-push.yaml: -------------------------------------------------------------------------------- 1 | # test-pipeline-ibmcloud 2 | # short run of pipelines to test e2e from collect to train with IBMCloud COS 3 | apiVersion: tekton.dev/v1 4 | kind: PipelineRun 5 | metadata: 6 | name: test-pipeline-ibmcloud 7 | spec: 8 | timeouts: 9 | pipeline: 6h 10 | tasks: 5h50m 11 | workspaces: 12 | - name: mnt 13 | persistentVolumeClaim: 14 | claimName: task-pvc 15 | params: 16 | - name: PIPELINE_NAME 17 | value: AbsPowerTrainPipelineExample 18 | - name: OUTPUT_TYPE 19 | value: AbsPower 20 | - name: MACHINE_ID 21 | value: test 22 | - name: COS_PROVIDER 23 | value: ibmcloud 24 | - name: COS_SECRET_NAME 25 | value: ibm-cos-secret 26 | # the below parameters are for short test run 27 | - name: STRESS_ARGS 28 | value: 29 | - cpu;none;none 30 | - name: STRESS_TIMEOUT 31 | value: 20 32 | - name: STRESS_BREAK_INTERVAL 33 | value: 1 34 | - name: IDLE_COLLECT_INTERVAL 35 | value: 100 36 | - name: CPU_FREQUENCY_ENABLED 37 | value: false 38 | pipelineRef: 39 | name: single-train-pipeline 40 | -------------------------------------------------------------------------------- /model_training/tekton/examples/test-collect.yaml: -------------------------------------------------------------------------------- 1 | # test-collect 2 | # short run of pipelines to test collecting data 3 | apiVersion: tekton.dev/v1 4 | kind: PipelineRun 5 | metadata: 6 | name: test-collect 7 | spec: 8 | timeouts: 9 | pipeline: 6h 10 | tasks: 5h50m 11 | workspaces: 12 | - name: mnt 13 | persistentVolumeClaim: 14 | claimName: task-pvc 15 | params: 16 | - name: MACHINE_ID 17 | value: test 18 | - name: STRESS_ARGS 19 | value: 20 | - cpu;none;none 21 | - name: STRESS_TIMEOUT 22 | value: 20 23 | - name: STRESS_BREAK_INTERVAL 24 | value: 1 25 | - name: IDLE_COLLECT_INTERVAL 26 | value: 100 27 | - name: CPU_FREQUENCY_ENABLED 28 | value: false 29 | pipelineRef: 30 | name: collect-data-pipeline 31 | -------------------------------------------------------------------------------- /model_training/tekton/examples/test-retrain.yaml: -------------------------------------------------------------------------------- 1 | # example-abs-train-pipeline: 2 | # running pipelines with all default value to train AbsPower model (rapl-sysfs, BPFOnly) 3 | apiVersion: tekton.dev/v1 4 | kind: PipelineRun 5 | metadata: 6 | name: test-retrain-ibmcloud 7 | spec: 8 | timeouts: 9 | pipeline: 6h 10 | tasks: 5h50m 11 | workspaces: 12 | - name: mnt 13 | persistentVolumeClaim: 14 | claimName: task-pvc 15 | params: 16 | - name: PIPELINE_NAME 17 | value: AbsPowerTrainPipelineExample 18 | - name: OUTPUT_TYPE 19 | value: AbsPower 20 | pipelineRef: 21 | name: single-retrain-pipeline 22 | -------------------------------------------------------------------------------- /model_training/tekton/pvc/hostpath.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: PersistentVolume 3 | metadata: 4 | name: task-pv-volume 5 | labels: 6 | type: local 7 | spec: 8 | storageClassName: manual 9 | capacity: 10 | storage: 5Gi 11 | accessModes: 12 | - ReadWriteMany 13 | hostPath: 14 | path: /mnt 15 | --- 16 | apiVersion: v1 17 | 
kind: PersistentVolumeClaim 18 | metadata: 19 | name: task-pvc 20 | namespace: default 21 | spec: 22 | storageClassName: manual 23 | volumeName: task-pv-volume 24 | accessModes: 25 | - ReadWriteMany 26 | resources: 27 | requests: 28 | storage: 3Gi 29 | -------------------------------------------------------------------------------- /model_training/tekton/tasks/extract-task.yaml: -------------------------------------------------------------------------------- 1 | ###################################### 2 | ## 3 | ## extract-from-metric: 4 | ## 5 | ## load kepler_query.json and extract data to extracted_data.csv 6 | ## 7 | ###################################### 8 | apiVersion: tekton.dev/v1 9 | kind: Task 10 | metadata: 11 | name: extract-from-metric 12 | spec: 13 | params: 14 | - name: MODEL_SERVER_IMAGE 15 | description: Specify model server image 16 | default: quay.io/sustainable_computing_io/kepler_model_server:latest 17 | - name: PIPELINE_NAME 18 | description: Specify pipeline name (output prefix/folder) 19 | default: default 20 | - name: OUTPUT_TYPE 21 | description: Specify target output type (check https://sustainable-computing.io/kepler_model_server/pipeline/#power-isolation) 22 | - name: ENERGY_SOURCE 23 | description: Specify target energy source (check https://sustainable-computing.io/kepler_model_server/pipeline/#energy-source) 24 | default: rapl-sysfs 25 | - name: FEATURE_GROUP 26 | description: Specify target feature group (check https://sustainable-computing.io/kepler_model_server/pipeline/#feature-group) 27 | default: BPFOnly 28 | - name: EXTRACTOR 29 | description: Specify extractor class (default or smooth) 30 | default: default 31 | - name: THIRDPARTY_METRICS 32 | description: Specify list of third party metric to export (required only for ThirdParty feature group) 33 | default: "" 34 | workspaces: 35 | - name: mnt 36 | optional: true 37 | steps: 38 | - name: extract 39 | image: $(params.MODEL_SERVER_IMAGE) 40 | command: [kepler-model] 41 | args: 42 | - extract 43 | - --data-path=$(workspaces.mnt.path)/data 44 | - --input=kepler_query 45 | - --output=$(params.PIPELINE_NAME)_$(params.ENERGY_SOURCE)_$(params.FEATURE_GROUP)_data 46 | - --extractor=$(params.EXTRACTOR) 47 | - --feature-group=$(params.FEATURE_GROUP) 48 | - --energy-source=$(params.ENERGY_SOURCE) 49 | - --output-type=$(params.OUTPUT_TYPE) 50 | - --thirdparty-metrics="$(params.THIRDPARTY_METRICS)" 51 | -------------------------------------------------------------------------------- /model_training/tekton/tasks/isolate-task.yaml: -------------------------------------------------------------------------------- 1 | ###################################### 2 | ## 3 | ## isolate-from-metric: 4 | ## 5 | ## load kepler_query.json and isolate data to isolated_data.csv 6 | ## 7 | ###################################### 8 | apiVersion: tekton.dev/v1 9 | kind: Task 10 | metadata: 11 | name: isolate-from-metric 12 | spec: 13 | params: 14 | - name: MODEL_SERVER_IMAGE 15 | description: Specify model server image 16 | default: quay.io/sustainable_computing_io/kepler_model_server:latest 17 | - name: PIPELINE_NAME 18 | description: Specify pipeline name (output prefix/folder) 19 | default: default 20 | - name: ENERGY_SOURCE 21 | description: Specify target energy source (check https://sustainable-computing.io/kepler_model_server/pipeline/#energy-source) 22 | default: rapl-sysfs 23 | - name: FEATURE_GROUP 24 | description: Specify target feature group (check 
https://sustainable-computing.io/kepler_model_server/pipeline/#feature-group) 25 | default: BPFOnly 26 | - name: EXTRACTOR 27 | description: Specify extractor class (default or smooth) 28 | default: default 29 | - name: ISOLATOR 30 | description: Specify isolator class (none, min, profile, or trainer (if ABS_PIPELINE_NAME is set) 31 | default: min 32 | - name: THIRDPARTY_METRICS 33 | description: Specify list of third party metric to export (required only for ThirdParty feature group) 34 | default: "" 35 | - name: TARGET_HINTS 36 | description: Specify target process keywords to keep in DynPower model training 37 | default: stress 38 | - name: BG_HINTS 39 | description: Specify background process keywords to remove from DynPower model training 40 | default: "" 41 | - name: ABS_PIPELINE_NAME 42 | description: Specify pipeline name to be used for initializing trainer isolator 43 | default: "" 44 | workspaces: 45 | - name: mnt 46 | optional: true 47 | steps: 48 | - name: isolate 49 | image: $(params.MODEL_SERVER_IMAGE) 50 | command: [kepler-model] 51 | args: 52 | - isolate 53 | - --data-path=$(workspaces.mnt.path)/data 54 | - --input=kepler_query 55 | - --output=$(params.PIPELINE_NAME)_$(params.ENERGY_SOURCE)_$(params.FEATURE_GROUP)_data 56 | - --pipeline-name=$(params.PIPELINE_NAME) 57 | - --extractor=$(params.EXTRACTOR) 58 | - --isolator=$(params.ISOLATOR) 59 | - --feature-group=$(params.FEATURE_GROUP) 60 | - --energy-source=$(params.ENERGY_SOURCE) 61 | - --output-type=DynPower 62 | - --thirdparty-metrics="$(params.THIRDPARTY_METRICS)" 63 | - --abs-pipeline-name=$(params.ABS_PIPELINE_NAME) 64 | - --profile=idle 65 | - --target-hints="$(params.TARGET_HINTS)" 66 | - --bg-hints="$(params.BG_HINTS)" 67 | -------------------------------------------------------------------------------- /model_training/tekton/tasks/original-pipeline-task.yaml: -------------------------------------------------------------------------------- 1 | ###################################### 2 | ## 3 | ## train-pipeline: 4 | ## 5 | ## load kepler_query.json and run training pipeline 6 | ## 7 | ###################################### 8 | apiVersion: tekton.dev/v1 9 | kind: Task 10 | metadata: 11 | name: original-pipeline-task 12 | spec: 13 | params: 14 | - name: MODEL_SERVER_IMAGE 15 | description: Specify model server image 16 | default: quay.io/sustainable_computing_io/kepler_model_server:latest 17 | - name: PIPELINE_NAME 18 | description: Specify output pipeline name 19 | default: default 20 | - name: EXTRACTOR 21 | description: Specify extractor class (default or smooth) 22 | default: default 23 | - name: ISOLATOR 24 | description: Specify isolator class (none, min, profile, or trainer (if ABS_PIPELINE_NAME is set) 25 | default: min 26 | - name: ABS_TRAINERS 27 | description: Specify a list of trainers for training AbsPower models 28 | default: default 29 | - name: DYN_TRAINERS 30 | description: Specify a list of trainers for training DynPower models 31 | default: default 32 | - name: ENERGY_SOURCE 33 | description: Specify target energy source (check https://sustainable-computing.io/kepler_model_server/pipeline/#energy-source) 34 | default: acpi,rapl-sysfs 35 | - name: TARGET_HINTS 36 | description: Specify target process keywords to keep in DynPower model training 37 | default: stress 38 | - name: BG_HINTS 39 | description: Specify background process keywords to remove from DynPower model training 40 | default: "" 41 | - name: THIRDPARTY_METRICS 42 | description: Specify list of third party metric to export (required 
only for ThirdParty feature group) 43 | default: "" 44 | - name: MACHINE_ID 45 | description: Specify machine id to identify node_type 46 | workspaces: 47 | - name: mnt 48 | optional: true 49 | steps: 50 | - name: pipeline-train 51 | image: $(params.MODEL_SERVER_IMAGE) 52 | command: [kepler-model] 53 | env: 54 | - name: MODEL_PATH 55 | value: $(workspaces.mnt.path)/models 56 | args: 57 | - train 58 | - --data-path=$(workspaces.mnt.path)/data 59 | - --input=kepler_query 60 | - --pipeline-name=$(params.PIPELINE_NAME) 61 | - --extractor=$(params.EXTRACTOR) 62 | - --isolator=$(params.ISOLATOR) 63 | - --profile=idle 64 | - --target-hints="$(params.TARGET_HINTS)" 65 | - --bg-hints="$(params.BG_HINTS)" 66 | - --abs-trainers=$(params.ABS_TRAINERS) 67 | - --dyn-trainers=$(params.DYN_TRAINERS) 68 | - --energy-source=$(params.ENERGY_SOURCE) 69 | - --thirdparty-metrics="$(params.THIRDPARTY_METRICS)" 70 | - --id=$(params.MACHINE_ID) 71 | -------------------------------------------------------------------------------- /model_training/tekton/tasks/s3/aws-s3-load.yaml: -------------------------------------------------------------------------------- 1 | ###################################### 2 | ## 3 | ## s3-push task for AWS 4 | ## 5 | ###################################### 6 | apiVersion: tekton.dev/v1 7 | kind: Task 8 | metadata: 9 | name: aws-s3-load 10 | spec: 11 | params: 12 | - name: COS_SECRET_NAME 13 | description: Specify cos secret name 14 | default: "" 15 | - name: MACHINE_ID 16 | description: Specify machine id to group model result in bucket 17 | default: "" 18 | - name: PIPELINE_NAME 19 | description: Specify pipeline name (output prefix/folder) 20 | default: default 21 | workspaces: 22 | - name: mnt 23 | optional: true 24 | steps: 25 | - name: load 26 | image: quay.io/sustainable_computing_io/kepler_model_server/s3:latest 27 | env: 28 | - name: ACCESS_KEY_ID 29 | valueFrom: 30 | secretKeyRef: 31 | name: $(params.COS_SECRET_NAME) 32 | key: accessKeyID 33 | - name: ACCESS_SECRET 34 | valueFrom: 35 | secretKeyRef: 36 | name: $(params.COS_SECRET_NAME) 37 | key: accessSecret 38 | - name: REGION_NAME 39 | valueFrom: 40 | secretKeyRef: 41 | name: $(params.COS_SECRET_NAME) 42 | key: regionName 43 | - name: BUCKET_NAME 44 | valueFrom: 45 | secretKeyRef: 46 | name: $(params.COS_SECRET_NAME) 47 | key: bucketName 48 | command: [s3-loader] 49 | args: 50 | - aws 51 | - --aws-access-key-id=$(ACCESS_KEY_ID) 52 | - --aws-secret-access-key=$(ACCESS_SECRET) 53 | - --region-name=$(REGION_NAME) 54 | - --bucket-name=$(BUCKET_NAME) 55 | - --mnt-path=$(workspaces.mnt.path) 56 | - --pipeline-name=$(params.PIPELINE_NAME) 57 | - --machine-id=$(params.MACHINE_ID) 58 | -------------------------------------------------------------------------------- /model_training/tekton/tasks/s3/aws-s3-push.yaml: -------------------------------------------------------------------------------- 1 | ###################################### 2 | ## 3 | ## s3-push task for AWS 4 | ## 5 | ###################################### 6 | apiVersion: tekton.dev/v1 7 | kind: Task 8 | metadata: 9 | name: aws-s3-push 10 | spec: 11 | params: 12 | - name: COS_SECRET_NAME 13 | description: Specify cos secret name 14 | default: "" 15 | - name: MACHINE_ID 16 | description: Specify machine id to group model result in bucket 17 | default: "" 18 | workspaces: 19 | - name: mnt 20 | optional: true 21 | steps: 22 | - name: push 23 | image: quay.io/sustainable_computing_io/kepler_model_server/s3:latest 24 | env: 25 | - name: ACCESS_KEY_ID 26 | valueFrom: 27 | 
secretKeyRef: 28 | name: $(params.COS_SECRET_NAME) 29 | key: accessKeyID 30 | - name: ACCESS_SECRET 31 | valueFrom: 32 | secretKeyRef: 33 | name: $(params.COS_SECRET_NAME) 34 | key: accessSecret 35 | - name: REGION_NAME 36 | valueFrom: 37 | secretKeyRef: 38 | name: $(params.COS_SECRET_NAME) 39 | key: regionName 40 | - name: BUCKET_NAME 41 | valueFrom: 42 | secretKeyRef: 43 | name: $(params.COS_SECRET_NAME) 44 | key: bucketName 45 | command: [s3-pusher] 46 | args: 47 | - aws 48 | - --aws-access-key-id=$(ACCESS_KEY_ID) 49 | - --aws-secret-access-key=$(ACCESS_SECRET) 50 | - --region-name=$(REGION_NAME) 51 | - --bucket-name=$(BUCKET_NAME) 52 | - --mnt-path=$(workspaces.mnt.path) 53 | - --query-data=kepler_query 54 | - --idle-data=idle 55 | - --machine-id=$(params.MACHINE_ID) 56 | -------------------------------------------------------------------------------- /model_training/tekton/tasks/s3/ibmcloud-s3-load.yaml: -------------------------------------------------------------------------------- 1 | ###################################### 2 | ## 3 | ## s3-push task for IBM Cloud 4 | ## 5 | ###################################### 6 | apiVersion: tekton.dev/v1 7 | kind: Task 8 | metadata: 9 | name: ibmcloud-s3-load 10 | spec: 11 | params: 12 | - name: COS_SECRET_NAME 13 | description: Specify cos secret name 14 | default: "" 15 | - name: MACHINE_ID 16 | description: Specify machine id to group model result in bucket 17 | default: "" 18 | - name: PIPELINE_NAME 19 | description: Specify pipeline name (output prefix/folder) 20 | default: default 21 | workspaces: 22 | - name: mnt 23 | optional: true 24 | steps: 25 | - name: load 26 | image: quay.io/sustainable_computing_io/kepler_model_server/s3:latest 27 | env: 28 | - name: SERVICE_ENDPOINT 29 | valueFrom: 30 | secretKeyRef: 31 | name: $(params.COS_SECRET_NAME) 32 | key: serviceEndpoint 33 | - name: API_KEY 34 | valueFrom: 35 | secretKeyRef: 36 | name: $(params.COS_SECRET_NAME) 37 | key: apiKey 38 | - name: SERVICE_INSTANCE_ID 39 | valueFrom: 40 | secretKeyRef: 41 | name: $(params.COS_SECRET_NAME) 42 | key: serviceInstanceID 43 | - name: BUCKET_NAME 44 | valueFrom: 45 | secretKeyRef: 46 | name: $(params.COS_SECRET_NAME) 47 | key: bucketName 48 | command: [s3-loader] 49 | args: 50 | - ibmcloud 51 | - --service-endpoint=$(SERVICE_ENDPOINT) 52 | - --api-key=$(API_KEY) 53 | - --service-instance-id=$(SERVICE_INSTANCE_ID) 54 | - --bucket-name=$(BUCKET_NAME) 55 | - --mnt-path=$(workspaces.mnt.path) 56 | - --pipeline-name=$(params.PIPELINE_NAME) 57 | - --machine-id=$(params.MACHINE_ID) 58 | -------------------------------------------------------------------------------- /model_training/tekton/tasks/s3/ibmcloud-s3-push.yaml: -------------------------------------------------------------------------------- 1 | ###################################### 2 | ## 3 | ## s3-push task for IBM Cloud 4 | ## 5 | ###################################### 6 | apiVersion: tekton.dev/v1 7 | kind: Task 8 | metadata: 9 | name: ibmcloud-s3-push 10 | spec: 11 | params: 12 | - name: COS_SECRET_NAME 13 | description: Specify cos secret name 14 | default: "" 15 | - name: MACHINE_ID 16 | description: Specify machine id to group model result in bucket 17 | default: "" 18 | workspaces: 19 | - name: mnt 20 | optional: true 21 | steps: 22 | - name: push 23 | image: quay.io/sustainable_computing_io/kepler_model_server/s3:latest 24 | env: 25 | - name: SERVICE_ENDPOINT 26 | valueFrom: 27 | secretKeyRef: 28 | name: $(params.COS_SECRET_NAME) 29 | key: serviceEndpoint 30 | - name: API_KEY 31 | 
valueFrom: 32 | secretKeyRef: 33 | name: $(params.COS_SECRET_NAME) 34 | key: apiKey 35 | - name: SERVICE_INSTANCE_ID 36 | valueFrom: 37 | secretKeyRef: 38 | name: $(params.COS_SECRET_NAME) 39 | key: serviceInstanceID 40 | - name: BUCKET_NAME 41 | valueFrom: 42 | secretKeyRef: 43 | name: $(params.COS_SECRET_NAME) 44 | key: bucketName 45 | command: [s3-pusher] 46 | args: 47 | - ibmcloud 48 | - --service-endpoint=$(SERVICE_ENDPOINT) 49 | - --api-key=$(API_KEY) 50 | - --service-instance-id=$(SERVICE_INSTANCE_ID) 51 | - --bucket-name=$(BUCKET_NAME) 52 | - --mnt-path=$(workspaces.mnt.path) 53 | - --query-data=kepler_query 54 | - --idle-data=idle 55 | - --machine-id=$(params.MACHINE_ID) 56 | -------------------------------------------------------------------------------- /model_training/tekton/tasks/train-task.yaml: -------------------------------------------------------------------------------- 1 | ###################################### 2 | ## 3 | ## train-model: 4 | ## 5 | ## train model from extracted data/isolated data 6 | ## 7 | ###################################### 8 | apiVersion: tekton.dev/v1 9 | kind: Task 10 | metadata: 11 | name: train-model 12 | spec: 13 | params: 14 | - name: MODEL_SERVER_IMAGE 15 | description: Specify model server image 16 | default: quay.io/sustainable_computing_io/kepler_model_server:latest 17 | - name: INPUT_DATA 18 | description: Specify input data file name (extracted_data or isolated_data) 19 | - name: PIPELINE_NAME 20 | description: Specify pipeline name (output prefix/folder) 21 | default: default 22 | - name: OUTPUT_TYPE 23 | description: Specify target output type (check https://sustainable-computing.io/kepler_model_server/pipeline/#power-isolation) 24 | default: AbsPower 25 | - name: ENERGY_SOURCE 26 | description: Specify target energy source (check https://sustainable-computing.io/kepler_model_server/pipeline/#energy-source) 27 | default: rapl-sysfs 28 | - name: FEATURE_GROUP 29 | description: Specify target feature group (check https://sustainable-computing.io/kepler_model_server/pipeline/#feature-group) 30 | default: BPFOnly 31 | - name: TRAINERS 32 | description: Specify trainer names (use comma(,) as delimiter) 33 | default: XgboostFitTrainer 34 | - name: THIRDPARTY_METRICS 35 | description: Specify list of third party metric to export (required only for ThirdParty feature group) 36 | default: "" 37 | - name: MACHINE_ID 38 | description: Specify machine id to identify node_type 39 | default: "" 40 | workspaces: 41 | - name: mnt 42 | optional: true 43 | steps: 44 | - name: train-from-data 45 | image: $(params.MODEL_SERVER_IMAGE) 46 | command: [kepler-model] 47 | env: 48 | - name: MODEL_PATH 49 | value: $(workspaces.mnt.path)/models 50 | args: 51 | - train_from_data 52 | - --data-path=$(workspaces.mnt.path)/data 53 | - --input=$(params.INPUT_DATA) 54 | - --pipeline-name=$(params.PIPELINE_NAME) 55 | - --feature-group=$(params.FEATURE_GROUP) 56 | - --energy-source=$(params.ENERGY_SOURCE) 57 | - --output-type=$(params.OUTPUT_TYPE) 58 | - --trainers=$(params.TRAINERS) 59 | - --thirdparty-metrics="$(params.THIRDPARTY_METRICS)" 60 | - --id=$(params.MACHINE_ID) 61 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["hatchling"] 3 | build-backend = "hatchling.build" 4 | 5 | [project] 6 | name = "kepler_model" 7 | dynamic = ["version"] 8 | description = "kepler model server for serving kepler 
models" 9 | readme = "README.md" 10 | requires-python = ">= 3.10" 11 | license = "Apache-2.0" 12 | keywords = [ 13 | "kepler", "models", 14 | "model-server", "estimator" 15 | ] 16 | 17 | authors = [ 18 | { name = "Sunyanan Choochotkaew", email = "sunyanan.choochotkaew1@ibm.com" }, 19 | { name = "Sunil Thaha", email = "sthaha@redhat.com" }, 20 | ] 21 | 22 | classifiers = [ 23 | "Programming Language :: Python", 24 | "Programming Language :: Python :: 3", 25 | "Programming Language :: Python :: 3.10", 26 | ] 27 | dependencies = [ 28 | "flask==3.0.3", 29 | "joblib==1.4.2", 30 | "numpy==2.1.2", 31 | "pandas==2.2.3", 32 | "prometheus-api-client==0.5.5", 33 | "prometheus-client==0.21.0", 34 | "protobuf==5.28.2", 35 | "psutil==6.1.0", 36 | "py-cpuinfo==9.0.0", 37 | "pyudev==0.24.3", 38 | "pyyaml_env_tag==0.1", 39 | "scikit-learn==1.5.2", 40 | "scipy==1.14.1", 41 | "seaborn==0.13.2", 42 | "Werkzeug==3.0.4", 43 | "xgboost==2.1.2", 44 | "boto3==1.35.43", 45 | "pymarkdownlnt==0.9.22", 46 | "yamllint==1.35.1", 47 | "requests-file==2.1.0", 48 | ] 49 | 50 | [project.scripts] 51 | model-server = "kepler_model.server.model_server:run" 52 | estimator = "kepler_model.estimate.estimator:run" 53 | kepler-model = "kepler_model.cmd.main:run" 54 | offline-trainer = "kepler_model.train.offline_trainer:run" 55 | online-trainer = "kepler_model.train.online_trainer:run" 56 | 57 | [project.urls] 58 | Documentation = "https://github.com/sustainable-computing-io/kepler-model-server#readme" 59 | Issues = "https://github.com/sustainable-computing-io/kepler-model-server/issues" 60 | Source = "https://github.com/sustainable-computing-io/kepler-model-server" 61 | 62 | [tool.hatch.version] 63 | path = "src/kepler_model/__about__.py" 64 | 65 | [tool.hatch.envs.default] 66 | python = "3.10" 67 | extra-dependencies = [ 68 | "coverage[toml]>=6.5", 69 | "ipdb", 70 | "ipython", 71 | "pytest", 72 | ] 73 | 74 | [tool.hatch.envs.default.scripts] 75 | test = "pytest {args:tests}" 76 | test-cov = "coverage run -m pytest {args:tests}" 77 | cov-report = [ 78 | "- coverage combine", 79 | "coverage report", 80 | ] 81 | cov = [ 82 | "test-cov", 83 | "cov-report", 84 | ] 85 | 86 | [tool.hatch.envs.lab] 87 | extra-dependencies = [ 88 | "jupyterlab", 89 | "notebook", 90 | "voila", 91 | "ipywidgets", 92 | # vim please 93 | "jupyterlab-vim", 94 | 95 | "beautifulsoup4", 96 | # read parquet files 97 | # "pyarrow", 98 | 99 | # graphing 100 | "matplotlib", 101 | "graphviz", 102 | ] 103 | 104 | [tool.hatch.envs.lab.scripts] 105 | note = "jupyter lab --NotebookApp.token='' --allow-root" 106 | 107 | [tool.hatch.envs.types] 108 | extra-dependencies = [ 109 | "mypy>=1.0.0", 110 | ] 111 | [tool.hatch.envs.types.scripts] 112 | check = "mypy --install-types --non-interactive {args:src/kepler_model_server tests}" 113 | 114 | [tool.coverage.run] 115 | source_pkgs = ["kepler_model", "tests"] 116 | branch = true 117 | parallel = true 118 | omit = [ 119 | "src/kepler_model/__about__.py", 120 | ] 121 | 122 | [tool.coverage.paths] 123 | kepler_model = ["src/kepler_model", "*/kepler_model/src/kepler_model"] 124 | tests = ["tests", "*/kepler_model/tests"] 125 | 126 | [tool.coverage.report] 127 | exclude_lines = [ 128 | "no cov", 129 | "if __name__ == .__main__.:", 130 | "if TYPE_CHECKING:", 131 | ] 132 | 133 | [tool.ruff] 134 | line-length = 160 135 | 136 | [tool.pytest.ini_options] 137 | markers = [ 138 | "focus", # used in development to mark focused tests 139 | ] 140 | 141 | [tool.pymarkdown] 142 | plugins.md013.enabled = false 143 | 
-------------------------------------------------------------------------------- /src/kepler_model/__about__.py: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2024-present 2 | # 3 | # SPDX-License-Identifier: Apache-2.0 4 | __version__ = "0.7.11" 5 | -------------------------------------------------------------------------------- /src/kepler_model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/src/kepler_model/__init__.py -------------------------------------------------------------------------------- /src/kepler_model/abs-train-pipelinerun.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: tekton.dev/v1 2 | kind: PipelineRun 3 | metadata: 4 | name: example-abs-train-pipeline 5 | spec: 6 | timeouts: 7 | pipeline: 6h 8 | tasks: 5h50m 9 | workspaces: 10 | - name: mnt 11 | persistentVolumeClaim: 12 | claimName: task-pvc 13 | params: 14 | - name: PIPELINE_NAME 15 | value: AbsPowerTrainPipelineExample 16 | - name: OUTPUT_TYPE 17 | value: AbsPower 18 | pipelineRef: 19 | name: single-train-pipeline 20 | -------------------------------------------------------------------------------- /src/kepler_model/cmd/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/src/kepler_model/cmd/__init__.py -------------------------------------------------------------------------------- /src/kepler_model/estimate/__init__.py: -------------------------------------------------------------------------------- 1 | from .model.estimate_common import compute_error 2 | from .model.model import ( 3 | default_idle_predicted_col_func, 4 | default_predicted_col_func, 5 | get_background_containers, 6 | get_dynamic_power_colname, 7 | get_label_power_colname, 8 | get_predicted_background_power_colname, 9 | get_predicted_dynamic_background_power_colname, 10 | get_predicted_dynamic_power_colname, 11 | get_predicted_power_colname, 12 | get_reconstructed_power_colname, 13 | load_model, 14 | ) 15 | 16 | __all__ = [ 17 | "compute_error", 18 | "load_model", 19 | "get_background_containers", 20 | "default_predicted_col_func", 21 | "get_predicted_power_colname", 22 | "get_predicted_background_power_colname", 23 | "get_dynamic_power_colname", 24 | "get_predicted_dynamic_power_colname", 25 | "get_predicted_dynamic_background_power_colname", 26 | "get_label_power_colname", 27 | "get_reconstructed_power_colname", 28 | "default_idle_predicted_col_func", 29 | ] 30 | -------------------------------------------------------------------------------- /src/kepler_model/estimate/archived_model.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | import requests 4 | from requests_file import FileAdapter 5 | 6 | from kepler_model.estimate.model_server_connector import unpack 7 | from kepler_model.util.config import get_init_model_url 8 | from kepler_model.util.loader import load_metadata 9 | from kepler_model.util.train_types import ModelOutputType 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | failed_list = [] 14 | 15 | FILTER_ITEM_DELIMIT = ";" 16 | VALUE_DELIMIT = ":" 17 | ARRAY_DELIMIT = "," 18 | 19 | 20 | def parse_filters(filter): 21 | 
filter_list = filter.split(FILTER_ITEM_DELIMIT) 22 | filters = dict() 23 | for filter_item in filter_list: 24 | splits = filter_item.split(VALUE_DELIMIT) 25 | if len(splits) != 2: 26 | continue 27 | key = splits[0] 28 | if key == "features": 29 | value = splits[1].split(ARRAY_DELIMIT) 30 | else: 31 | value = splits[1] 32 | filters[key] = value 33 | return filters 34 | 35 | 36 | def valid_metrics(metrics, features): 37 | for feature in features: 38 | if feature not in metrics: 39 | return False 40 | return True 41 | 42 | 43 | def is_valid_model(metrics, metadata, filters): 44 | if not valid_metrics(metrics, metadata["features"]): 45 | return False 46 | 47 | for attrb, val in filters.items(): 48 | if not hasattr(metadata, attrb) or getattr(metadata, attrb) is None: 49 | logger.warning(f"{metadata['model_name']} has no {attrb}") 50 | return False 51 | 52 | cmp_val = getattr(metadata, attrb) 53 | val = float(val) 54 | if attrb == "abs_max_corr": # higher is better 55 | valid = cmp_val >= val 56 | else: # lower is better 57 | valid = cmp_val <= val 58 | if not valid: 59 | return False 60 | 61 | return True 62 | 63 | 64 | def reset_failed_list(): 65 | global failed_list 66 | failed_list = [] 67 | 68 | 69 | def get_achived_model(power_request): 70 | global failed_list 71 | output_type_name = power_request.output_type 72 | if output_type_name in failed_list: 73 | return None 74 | output_type = ModelOutputType[power_request.output_type] 75 | url = get_init_model_url(power_request.energy_source, output_type_name) 76 | if url == "": 77 | logger.warning(f"no URL set for {output_type_name}, {power_request.energy_source}") 78 | return None 79 | logger.info(f"try getting archieved model from URL: {url} for {output_type_name}") 80 | 81 | s = requests.Session() 82 | s.mount("file://", FileAdapter()) 83 | response = s.get(url) 84 | logger.debug(f"response: {response}") 85 | 86 | if response.status_code != 200: 87 | return None 88 | 89 | output_path = unpack(power_request.energy_source, output_type, response, replace=False) 90 | if output_path is not None: 91 | metadata = load_metadata(output_path) 92 | filters = parse_filters(power_request.filter) 93 | try: 94 | if not is_valid_model(power_request.metrics, metadata, filters): 95 | failed_list += [output_type_name] 96 | return None 97 | except Exception as e: 98 | logger.warning(f"cannot validate the archived model: {e}") 99 | return None 100 | 101 | return output_path 102 | -------------------------------------------------------------------------------- /src/kepler_model/estimate/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/src/kepler_model/estimate/model/__init__.py -------------------------------------------------------------------------------- /src/kepler_model/estimate/model/curvefit_model.py: -------------------------------------------------------------------------------- 1 | import collections.abc 2 | 3 | from kepler_model.estimate.model.estimate_common import ( 4 | is_component_model, 5 | load_model_by_json, 6 | load_model_by_pickle, 7 | transform_and_predict, 8 | ) 9 | from kepler_model.util import ModelOutputType 10 | from kepler_model.util.train_types import get_valid_feature_groups, main_feature 11 | 12 | 13 | class CurveFitModelEstimator: 14 | def __init__(self, model_path, model_name, output_type, model_file, features, fe_files, component_init=False, feature_group=None): 
15 | self.name = model_name 16 | self.features = features 17 | if feature_group is None: 18 | self.feauture_group = get_valid_feature_groups(features)[0] 19 | else: 20 | self.feauture_group = feature_group 21 | self.output_type = ModelOutputType[output_type] 22 | 23 | self.comp_type = not component_init and is_component_model(model_file) 24 | if self.comp_type: 25 | self.models = dict() 26 | model_info = load_model_by_json(model_path, model_file) 27 | for comp, model_metadata in model_info.items(): 28 | model = CurveFitModelEstimator( 29 | model_path, 30 | self.name, 31 | self.output_type.name, 32 | model_metadata["model_file"], 33 | model_metadata["features"], 34 | model_metadata["fe_files"], 35 | component_init=True, 36 | ) 37 | feature_index = main_feature(self.feauture_group.name, comp) 38 | if model.model is not None: 39 | model.model.set_feature_index(feature_index) 40 | self.models[comp] = model 41 | else: 42 | self.model = load_model_by_pickle(model_path, model_file) 43 | self.fe_list = [] 44 | for fe_filename in fe_files: 45 | self.fe_list += [load_model_by_pickle(model_path, fe_filename)] 46 | 47 | def get_power(self, request): 48 | if self.comp_type: 49 | results = dict() 50 | for comp, model in self.models.items(): 51 | y, msg = transform_and_predict(model, request) 52 | if msg != "": 53 | return [], msg 54 | if not isinstance(y, collections.abc.Sequence): 55 | y = [y] 56 | results[comp] = y 57 | return results, msg 58 | else: 59 | return transform_and_predict(self, request) 60 | -------------------------------------------------------------------------------- /src/kepler_model/estimate/model/estimate_common.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import cpuinfo 4 | import numpy as np 5 | from sklearn.metrics import mean_absolute_error, mean_squared_error 6 | 7 | from kepler_model.util.loader import load_json, load_pkl 8 | 9 | keras_enabled = True 10 | cpu_info = cpuinfo.get_cpu_info() 11 | 12 | # if 'flags' in cpu_info and 'avx' in cpu_info['flags']: 13 | # import keras 14 | # from keras import backend as K 15 | # else: 16 | # print("AVX instructions are not available.") 17 | # keras_enabled = False 18 | 19 | 20 | def is_component_model(model_file): 21 | return ".json" in model_file 22 | 23 | 24 | def transform_and_predict(model, datapoint): 25 | msg = "" 26 | try: 27 | x_values = datapoint[model.features].values 28 | for fe in model.fe_list: 29 | if fe is None: 30 | continue 31 | x_values = fe.transform(x_values) 32 | y = model.model.predict(x_values).squeeze() 33 | y[y < 0] = 0 34 | y = y.tolist() 35 | except Exception as e: 36 | msg = f"{e}\n" 37 | y = [] 38 | return y, msg 39 | 40 | 41 | def load_model_by_pickle(model_path, model_filename): 42 | return load_pkl(model_path, model_filename) 43 | 44 | 45 | def coeff_determination(y_true, y_pred): 46 | if not keras_enabled: 47 | return None 48 | SS_res = K.sum(K.square(y_true - y_pred)) 49 | SS_tot = K.sum(K.square(y_true - K.mean(y_true))) 50 | return 1 - SS_res / (SS_tot + K.epsilon()) 51 | 52 | 53 | def load_model_by_keras(model_path, model_filename): 54 | model_file = os.path.join(model_path, model_filename) 55 | try: 56 | model = keras.models.load_model(model_file, custom_objects={"coeff_determination": coeff_determination}) 57 | except Exception as e: 58 | print(e) 59 | return None 60 | return model 61 | 62 | 63 | def load_model_by_json(model_path, model_filename): 64 | return load_json(model_path, model_filename) 65 | 66 | 67 | # return mae, mse, mape 68 
| def compute_error(predicted_power, actual_powers): 69 | mse = mean_squared_error(actual_powers, predicted_power) 70 | mae = mean_absolute_error(actual_powers, predicted_power) 71 | actual_power_values = list(actual_powers) 72 | predicted_power_values = list(predicted_power) 73 | if len(actual_powers) == 0: 74 | mape = -1 75 | else: 76 | non_zero_predicted_powers = np.array([predicted_power_values[i] for i in range(len(predicted_power_values)) if actual_power_values[i] > 0]) 77 | if len(non_zero_predicted_powers) == 0: 78 | mape = -1 79 | else: 80 | non_zero_y_test = np.array([y for y in actual_powers if y > 0]) 81 | absolute_percentage_errors = np.abs((non_zero_y_test - non_zero_predicted_powers) / non_zero_y_test) * 100 82 | mape = np.mean(absolute_percentage_errors) 83 | return mae, mse, mape 84 | -------------------------------------------------------------------------------- /src/kepler_model/estimate/model/keras_model.py: -------------------------------------------------------------------------------- 1 | from kepler_model.estimate.model.estimate_common import ( 2 | is_component_model, 3 | load_model_by_json, 4 | load_model_by_keras, 5 | load_model_by_pickle, 6 | transform_and_predict, 7 | ) 8 | from kepler_model.estimate.model_server_connector import ModelOutputType 9 | 10 | 11 | class KerasModelEstimator: 12 | def __init__(self, model_path, model_name, output_type, model_file, features, fe_files, component_init=False): 13 | self.name = model_name 14 | self.features = features 15 | self.output_type = ModelOutputType[output_type] 16 | self.comp_type = not component_init and is_component_model(self.output_type) 17 | if self.comp_type: 18 | self.models = dict() 19 | model_info = load_model_by_json(model_path, model_file) 20 | for comp, model_metadata in model_info.items(): 21 | model = KerasModelEstimator( 22 | model_path, 23 | self.name, 24 | self.output_type.name, 25 | model_metadata["model_file"], 26 | model_metadata["features"], 27 | model_metadata["fe_files"], 28 | component_init=True, 29 | ) 30 | self.models[comp] = model 31 | else: 32 | self.model = load_model_by_keras(model_path, model_file) 33 | self.fe_list = [] 34 | for fe_filename in fe_files: 35 | self.fe_list += [load_model_by_pickle(model_path, fe_filename)] 36 | 37 | def get_power(self, request): 38 | if self.comp_type: 39 | results = dict() 40 | for comp, model in self.models.items(): 41 | y, msg = transform_and_predict(model, request) 42 | if msg != "": 43 | return [], msg 44 | results[comp] = y 45 | return results, msg 46 | else: 47 | return transform_and_predict(self, request) 48 | -------------------------------------------------------------------------------- /src/kepler_model/estimate/model/scikit_model.py: -------------------------------------------------------------------------------- 1 | import collections.abc 2 | 3 | from kepler_model.estimate.model.estimate_common import ( 4 | is_component_model, 5 | load_model_by_json, 6 | load_model_by_pickle, 7 | transform_and_predict, 8 | ) 9 | from kepler_model.util import ModelOutputType 10 | 11 | 12 | class ScikitModelEstimator: 13 | def __init__(self, model_path, model_name, output_type, model_file, features, fe_files, component_init=False): 14 | self.name = model_name 15 | self.features = features 16 | self.output_type = ModelOutputType[output_type] 17 | 18 | self.comp_type = not component_init and is_component_model(model_file) 19 | if self.comp_type: 20 | self.models = dict() 21 | model_info = load_model_by_json(model_path, model_file) 22 | for comp, 
model_metadata in model_info.items(): 23 | model = ScikitModelEstimator( 24 | model_path, 25 | self.name, 26 | self.output_type.name, 27 | model_metadata["model_file"], 28 | model_metadata["features"], 29 | model_metadata["fe_files"], 30 | component_init=True, 31 | ) 32 | self.models[comp] = model 33 | else: 34 | self.model = load_model_by_pickle(model_path, model_file) 35 | self.fe_list = [] 36 | for fe_filename in fe_files: 37 | self.fe_list += [load_model_by_pickle(model_path, fe_filename)] 38 | 39 | def get_power(self, request): 40 | if self.comp_type: 41 | results = dict() 42 | for comp, model in self.models.items(): 43 | y, msg = transform_and_predict(model, request) 44 | if msg != "": 45 | return [], msg 46 | if not isinstance(y, collections.abc.Sequence): 47 | y = [y] 48 | results[comp] = y 49 | return results, msg 50 | else: 51 | return transform_and_predict(self, request) 52 | -------------------------------------------------------------------------------- /src/kepler_model/estimate/model/xgboost_model.py: -------------------------------------------------------------------------------- 1 | import collections.abc 2 | import os 3 | 4 | import xgboost as xgb 5 | 6 | from kepler_model.estimate.model.estimate_common import ( 7 | is_component_model, 8 | load_model_by_json, 9 | load_model_by_pickle, 10 | transform_and_predict, 11 | ) 12 | from kepler_model.util import ModelOutputType 13 | 14 | 15 | class XgboostModelEstimator: 16 | def __init__(self, model_path, model_name, output_type, model_file, features, fe_files, component_init=False): 17 | self.name = model_name 18 | self.features = features 19 | self.output_type = ModelOutputType[output_type] 20 | 21 | self.comp_type = not component_init and is_component_model(model_file) 22 | if self.comp_type: 23 | self.models = dict() 24 | model_info = load_model_by_json(model_path, model_file) 25 | for comp, model_metadata in model_info.items(): 26 | model = XgboostModelEstimator( 27 | model_path, 28 | self.name, 29 | self.output_type.name, 30 | model_metadata["model_file"], 31 | model_metadata["features"], 32 | model_metadata["fe_files"], 33 | component_init=True, 34 | ) 35 | self.models[comp] = model 36 | else: 37 | filepath = os.path.join(model_path, model_file) 38 | self.model = xgb.XGBRegressor(n_estimators=1000, learning_rate=0.1) 39 | self.model.load_model(filepath) 40 | self.fe_list = [] 41 | for fe_filename in fe_files: 42 | self.fe_list += [load_model_by_pickle(model_path, fe_filename)] 43 | 44 | def get_power(self, request): 45 | if self.comp_type: 46 | results = dict() 47 | for comp, model in self.models.items(): 48 | y, msg = transform_and_predict(model, request) 49 | if msg != "": 50 | return [], msg 51 | if not isinstance(y, collections.abc.Sequence): 52 | y = [y] 53 | results[comp] = y 54 | return results, msg 55 | else: 56 | return transform_and_predict(self, request) 57 | -------------------------------------------------------------------------------- /src/kepler_model/estimate/model_server_connector.py: -------------------------------------------------------------------------------- 1 | import codecs 2 | import json 3 | import os 4 | import shutil 5 | 6 | import requests 7 | 8 | from kepler_model.server.model_server import ModelListParam 9 | from kepler_model.util.config import ( 10 | download_path, 11 | get_model_server_list_endpoint, 12 | get_model_server_req_endpoint, 13 | is_model_server_enabled, 14 | ) 15 | from kepler_model.util.loader import get_download_output_path 16 | from kepler_model.util.train_types import 
ModelOutputType 17 | 18 | 19 | def make_model_request(power_request, machine_spec=None): 20 | model_request = { 21 | "metrics": power_request.metrics + power_request.system_features, 22 | "output_type": power_request.output_type, 23 | "source": power_request.energy_source, 24 | "filter": power_request.filter, 25 | "trainer_name": power_request.trainer_name, 26 | } 27 | if machine_spec is not None: 28 | model_request["machine_spec"] = machine_spec 29 | return model_request 30 | 31 | 32 | TMP_FILE = "tmp.zip" 33 | 34 | 35 | def unpack(energy_source, output_type, response, replace=True): 36 | output_path = get_download_output_path(download_path, energy_source, output_type) 37 | tmp_filepath = os.path.join(download_path, TMP_FILE) 38 | if os.path.exists(output_path): 39 | if not replace: 40 | if os.path.exists(tmp_filepath): 41 | # delete downloaded file 42 | os.remove(tmp_filepath) 43 | return output_path 44 | # delete existing model 45 | shutil.rmtree(output_path) 46 | with codecs.open(tmp_filepath, "wb") as f: 47 | f.write(response.content) 48 | shutil.unpack_archive(tmp_filepath, output_path) 49 | os.remove(tmp_filepath) 50 | return output_path 51 | 52 | 53 | def make_request(power_request, machine_spec): 54 | if not is_model_server_enabled(): 55 | return None 56 | model_request = make_model_request(power_request, machine_spec) 57 | output_type = ModelOutputType[power_request.output_type] 58 | try: 59 | response = requests.post(get_model_server_req_endpoint(), json=model_request) 60 | except Exception as err: 61 | print(f"cannot make request to {get_model_server_req_endpoint()}: {err}") 62 | return None 63 | if response.status_code != 200: 64 | return None 65 | return unpack(power_request.energy_source, output_type, response) 66 | 67 | 68 | def list_all_models(energy_source=None, output_type=None, feature_group=None, node_type=None, filter=None): 69 | if not is_model_server_enabled(): 70 | return dict() 71 | try: 72 | endpoint = get_model_server_list_endpoint() 73 | params = {} 74 | if energy_source: 75 | params[ModelListParam.EnergySource.value] = energy_source 76 | if output_type: 77 | params[ModelListParam.OutputType.value] = output_type 78 | if feature_group: 79 | params[ModelListParam.FeatureGroup.value] = feature_group 80 | if node_type: 81 | params[ModelListParam.NodeType.value] = node_type 82 | if filter: 83 | params[ModelListParam.Filter.value] = filter 84 | 85 | response = requests.get(endpoint, params=params) 86 | except Exception as err: 87 | print(f"cannot list model: {err}") 88 | return dict() 89 | if response.status_code != 200: 90 | return dict() 91 | model_names = json.loads(response.content.decode("utf-8")) 92 | return model_names 93 | -------------------------------------------------------------------------------- /src/kepler_model/train/__init__.py: -------------------------------------------------------------------------------- 1 | # comonly used within train module 2 | 3 | from .extractor.extractor import DefaultExtractor 4 | from .extractor.smooth_extractor import SmoothExtractor 5 | from .isolator.isolator import MinIdleIsolator, NoneIsolator, ProfileBackgroundIsolator 6 | from .isolator.train_isolator import TrainIsolator 7 | from .pipeline import NewPipeline, load_class 8 | from .profiler.node_type_index import NodeTypeIndexCollection, NodeTypeSpec 9 | from .profiler.profiler import Profiler, generate_profiles 10 | 11 | DefaultProfiler = Profiler(extractor=DefaultExtractor()) 12 | 13 | __all__ = [ 14 | "DefaultExtractor", 15 | "SmoothExtractor", 16 | "Profiler", 
17 | "generate_profiles", 18 | "NodeTypeIndexCollection", 19 | "NodeTypeSpec", 20 | "MinIdleIsolator", 21 | "ProfileBackgroundIsolator", 22 | "NoneIsolator", 23 | "TrainIsolator", 24 | "NewPipeline", 25 | "load_class", 26 | ] 27 | -------------------------------------------------------------------------------- /src/kepler_model/train/exporter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/src/kepler_model/train/exporter/__init__.py -------------------------------------------------------------------------------- /src/kepler_model/train/exporter/exporter.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from kepler_model.train.exporter.validator import BestModelCollection, get_validated_export_items 4 | from kepler_model.train.exporter.writer import ( 5 | append_version_readme, 6 | generate_pipeline_page, 7 | generate_pipeline_readme, 8 | generate_report_results, 9 | get_workload_content, 10 | ) 11 | from kepler_model.util.config import ERROR_KEY 12 | from kepler_model.util.format import time_to_str 13 | from kepler_model.util.loader import get_export_path, get_version_path, load_metadata, load_node_type_index 14 | from kepler_model.util.saver import save_node_type_index, save_pipeline_metadata 15 | 16 | repo_url = "https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-db/main/models" 17 | 18 | 19 | def export(data_path, pipeline_path, db_path, publisher, collect_date, inputs): 20 | # load pipeline metadata 21 | pipeline_metadata = load_metadata(pipeline_path) 22 | if pipeline_metadata is None: 23 | print("no pipeline metadata") 24 | return 25 | # add publish information to pipeline metadata 26 | pipeline_metadata["publisher"] = publisher 27 | pipeline_metadata["collect_time"] = time_to_str(collect_date) 28 | pipeline_metadata["export_time"] = time_to_str(datetime.datetime.utcnow()) 29 | 30 | node_type_index_json = load_node_type_index(pipeline_path) 31 | if node_type_index_json is None: 32 | print("no node type index") 33 | return 34 | node_types = node_type_index_json.keys() 35 | best_model_collections = dict() 36 | for node_type in node_types: 37 | best_model_collections[int(node_type)] = BestModelCollection(ERROR_KEY) 38 | 39 | # get path 40 | pipeline_name = pipeline_metadata["name"] 41 | local_export_path = get_export_path(db_path, pipeline_name) 42 | local_version_path = get_version_path(db_path) 43 | remote_version_path = get_version_path(repo_url, assure=False) 44 | 45 | # get validated export items (models) 46 | export_items, valid_metadata_df = get_validated_export_items(pipeline_path, pipeline_name) 47 | # save pipeline metadata 48 | for energy_source, ot_metadata_df in valid_metadata_df.items(): 49 | for model_type, metadata_df in ot_metadata_df.items(): 50 | metadata_df = metadata_df.sort_values(by=["feature_group", ERROR_KEY]) 51 | save_pipeline_metadata(local_export_path, pipeline_metadata, energy_source, model_type, metadata_df) 52 | # save node_type_index.json 53 | save_node_type_index(local_export_path, node_type_index_json) 54 | 55 | for export_item in export_items: 56 | # export models 57 | export_item.export(local_version_path) 58 | # update best model 59 | best_model_collections[export_item.node_type].compare_new_item(export_item) 60 | 61 | # generate pipeline page 62 | workload_content = get_workload_content(data_path, 
inputs) 63 | generate_pipeline_page(local_version_path, pipeline_metadata, workload_content) 64 | # generate error report page 65 | generate_report_results(local_export_path, best_model_collections, node_type_index_json, remote_version_path) 66 | # generate validation result page 67 | generate_pipeline_readme(pipeline_name, local_export_path, node_type_index_json, best_model_collections) 68 | # add new pipeline item to version path 69 | append_version_readme(local_version_path, pipeline_metadata) 70 | 71 | return local_export_path 72 | -------------------------------------------------------------------------------- /src/kepler_model/train/extractor/__init__.py: -------------------------------------------------------------------------------- 1 | from .extractor import DefaultExtractor 2 | 3 | __all__ = ["DefaultExtractor"] 4 | -------------------------------------------------------------------------------- /src/kepler_model/train/extractor/preprocess.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from kepler_model.estimate.model.model import get_label_power_colname 4 | from kepler_model.util.extract_types import col_to_component 5 | from kepler_model.util.prom_types import TIMESTAMP_COL 6 | from kepler_model.util.train_types import PowerSourceMap 7 | 8 | 9 | def drop_zero_column(data, cols): 10 | sum_col = "sum_val" 11 | data[sum_col] = data[cols].sum(axis=1) 12 | data = data.drop(data[data[sum_col] == 0].index) 13 | data = data.drop(columns=[sum_col]) 14 | return data 15 | 16 | 17 | def remove_outlier(df, workload_features, threshold=1): 18 | # Calculate the Z-score for each column 19 | z_scores = np.abs((df[workload_features] - df[workload_features].mean()) / df[workload_features].std()) 20 | # Remove rows with outliers 21 | df_no_outliers = df[(z_scores < threshold).all(axis=1)] 22 | return df_no_outliers 23 | 24 | 25 | def time_filter(data, min_time, max_time): 26 | _data = data.reset_index() 27 | start_time = _data[TIMESTAMP_COL].min() 28 | _data = _data[(_data[TIMESTAMP_COL] >= start_time + min_time) & (_data[TIMESTAMP_COL] <= start_time + max_time)] 29 | return _data 30 | 31 | 32 | def get_extracted_power_labels(extracted_data, energy_components, label_cols): 33 | # mean over the same value across container-level 34 | extracted_power_labels = extracted_data[[TIMESTAMP_COL] + label_cols].groupby([TIMESTAMP_COL]).mean().sort_index() 35 | for energy_component in energy_components: 36 | target_cols = [col for col in label_cols if col_to_component(col) == energy_component] 37 | component_label_col = get_label_power_colname(energy_component) 38 | extracted_power_labels[component_label_col] = extracted_power_labels[target_cols].sum(axis=1) 39 | return extracted_power_labels 40 | 41 | 42 | def find_correlations(energy_source, feature_power_data, power_columns, workload_features): 43 | power_data = feature_power_data.reset_index().groupby([TIMESTAMP_COL])[power_columns].mean() 44 | feature_data = feature_power_data.reset_index().groupby([TIMESTAMP_COL])[workload_features].sum() 45 | energy_components = PowerSourceMap[energy_source] 46 | target_cols = [col for col in power_columns if col_to_component(col) == energy_components[0]] 47 | process_power_data = power_data.copy() 48 | # mean over the same value across container-level 49 | process_power_over_ts = process_power_data[target_cols].reset_index().groupby([TIMESTAMP_COL]).sum() 50 | process_power_data[energy_source] = process_power_over_ts.sum(axis=1) 51 | # sum usage all 
container-level 52 | join_data = feature_data.join(process_power_data[energy_source]).dropna() 53 | corr = join_data.corr()[[energy_source]] 54 | return corr.drop(index=energy_source) 55 | -------------------------------------------------------------------------------- /src/kepler_model/train/extractor/smooth_extractor.py: -------------------------------------------------------------------------------- 1 | from kepler_model.util.train_types import SYSTEM_FEATURES, FeatureGroup, FeatureGroups 2 | 3 | from .extractor import DefaultExtractor, find_correlations 4 | 5 | 6 | class SmoothExtractor(DefaultExtractor): 7 | def __init__(self, smooth_window=30): 8 | self.smooth_window = smooth_window 9 | 10 | def get_name(self): 11 | return "smooth" 12 | 13 | # implement extract function 14 | def extract(self, query_results, energy_components, feature_group, energy_source, node_level, aggr=True, use_vm_metrics=False): 15 | feature_power_data, power_columns, _, features = super().extract( 16 | query_results, energy_components, feature_group, energy_source, node_level, aggr, use_vm_metrics=use_vm_metrics 17 | ) 18 | 19 | features = FeatureGroups[FeatureGroup[feature_group]] 20 | smoothed_data = feature_power_data.copy() 21 | workload_features = [feature for feature in features if feature not in SYSTEM_FEATURES] 22 | 23 | for col in list(workload_features) + list(power_columns): 24 | smoothed_data[col] = feature_power_data[col].rolling(window=self.smooth_window).mean() 25 | smoothed_data = smoothed_data.dropna() 26 | 27 | corr = find_correlations(energy_source, feature_power_data, power_columns, workload_features) 28 | 29 | return smoothed_data, power_columns, corr, features 30 | -------------------------------------------------------------------------------- /src/kepler_model/train/isolator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/src/kepler_model/train/isolator/__init__.py -------------------------------------------------------------------------------- /src/kepler_model/train/online_trainer.py: -------------------------------------------------------------------------------- 1 | # TODO: test 2 | import time 3 | 4 | from kepler_model.train.extractor import DefaultExtractor 5 | from kepler_model.train.isolator.isolator import MinIdleIsolator, ProfileBackgroundIsolator 6 | from kepler_model.train.pipeline import NewPipeline 7 | from kepler_model.train.profiler.profiler import load_all_profiles 8 | from kepler_model.train.prom.prom_query import PrometheusClient 9 | from kepler_model.util.config import get_config 10 | from kepler_model.util.loader import default_train_output_pipeline 11 | from kepler_model.util.prom_types import PROM_QUERY_INTERVAL, get_valid_feature_group_from_queries 12 | from kepler_model.util.train_types import FeatureGroups, PowerSourceMap 13 | 14 | SAMPLING_INTERVAL = get_config("SAMPLING_INTERVAL", PROM_QUERY_INTERVAL) 15 | 16 | 17 | default_trainers = ["GradientBoostingRegressorTrainer"] 18 | abs_trainer_names = default_trainers + [] 19 | dyn_trainer_names = default_trainers + [] 20 | 21 | 22 | def initial_pipelines(): 23 | target_energy_sources = PowerSourceMap.keys() 24 | valid_feature_groups = FeatureGroups.keys() 25 | profiles = load_all_profiles() 26 | profile_pipeline = NewPipeline( 27 | default_train_output_pipeline, 28 | abs_trainer_names, 29 | dyn_trainer_names, 30 | extractor=DefaultExtractor(), 31 | 
isolator=ProfileBackgroundIsolator(profiles), 32 | target_energy_sources=target_energy_sources, 33 | valid_feature_groups=valid_feature_groups, 34 | ) 35 | non_profile_pipeline = NewPipeline( 36 | default_train_output_pipeline, 37 | abs_trainer_names, 38 | dyn_trainer_names, 39 | extractor=DefaultExtractor(), 40 | isolator=MinIdleIsolator(), 41 | target_energy_sources=target_energy_sources, 42 | valid_feature_groups=valid_feature_groups, 43 | ) 44 | return profile_pipeline, non_profile_pipeline 45 | 46 | 47 | def run(): 48 | profile_pipeline, non_profile_pipeline = initial_pipelines() 49 | prom_client = PrometheusClient() 50 | while True: 51 | prom_client.query() 52 | query_results = prom_client.snapshot_query_result() 53 | valid_feature_groups = get_valid_feature_group_from_queries(query_results.keys()) 54 | for energy_source, energy_components in PowerSourceMap.items(): 55 | for feature_group in valid_feature_groups: 56 | success, _, _ = profile_pipeline.process(query_results, energy_components, energy_source, feature_group=feature_group) 57 | if not success: 58 | # failed to process with profile, try non_profile pipeline 59 | success, _, _ = non_profile_pipeline.process(query_results, energy_components, energy_source, feature_group=feature_group) 60 | if success: 61 | non_profile_pipeline.save_metadata() 62 | else: 63 | profile_pipeline.save_metadata() 64 | time.sleep(SAMPLING_INTERVAL) 65 | 66 | 67 | if __name__ == "__main__": 68 | run() 69 | -------------------------------------------------------------------------------- /src/kepler_model/train/profiler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/src/kepler_model/train/profiler/__init__.py -------------------------------------------------------------------------------- /src/kepler_model/train/profiler/generate_scaler.py: -------------------------------------------------------------------------------- 1 | ############################################################ 2 | ## 3 | ## generate_scaler 4 | ## generate a scaler for each node type from prom query 5 | ## 6 | ## ./python generate_scaler.py query_output_folder 7 | ## e.g., ./python generate_scaler.py ../tests/data/prom_output 8 | ## 9 | ## input must be a query output of loaded state 10 | ## 11 | ############################################################ 12 | 13 | # WARN: is this file used ? 
14 | 15 | import os 16 | import pickle 17 | 18 | import pandas as pd 19 | from sklearn.preprocessing import MaxAbsScaler 20 | 21 | from kepler_model.train import DefaultExtractor 22 | from kepler_model.util.prom_types import TIMESTAMP_COL, node_info_column 23 | from kepler_model.util.train_types import SYSTEM_FEATURES, FeatureGroup, FeatureGroups 24 | 25 | # WARN: unable to find this anymore 26 | # from profile_background import profile_path 27 | 28 | 29 | # HACK: 30 | extractor = DefaultExtractor() 31 | profile_path = "profile/path" 32 | max_scaler_top_path = os.path.join(profile_path, "..", "max_scaler") 33 | 34 | if not os.path.exists(max_scaler_top_path): 35 | os.mkdir(max_scaler_top_path) 36 | 37 | 38 | def read_query_results(query_path): 39 | results = dict() 40 | metric_filenames = [metric_filename for metric_filename in os.listdir(query_path)] 41 | for metric_filename in metric_filenames: 42 | metric = metric_filename.replace(".csv", "") 43 | filepath = os.path.join(query_path, metric_filename) 44 | results[metric] = pd.read_csv(filepath) 45 | return results 46 | 47 | 48 | def save_scaler(scaler, node_type, feature_group, scaler_top_path): 49 | node_type_path = os.path.join(scaler_top_path, str(node_type)) 50 | if not os.path.exists(node_type_path): 51 | os.mkdir(node_type_path) 52 | filename = os.path.join(node_type_path, feature_group + ".pkl") 53 | with open(filename, "wb") as f: 54 | pickle.dump(scaler, f) 55 | 56 | 57 | def process(query_results): 58 | node_info_data = extractor.get_system_category(query_results) 59 | if node_info_data is None: 60 | print("No Node Info") 61 | return None 62 | node_types = pd.unique(node_info_data[node_info_column]) 63 | for node_type in node_types: 64 | for feature_group in FeatureGroups: 65 | feature_group_name = feature_group.name 66 | features = FeatureGroups[FeatureGroup[feature_group_name]] 67 | workload_features = [feature for feature in features if feature not in SYSTEM_FEATURES] 68 | system_features = [feature for feature in features if feature in SYSTEM_FEATURES] 69 | feature_data = extractor.get_workload_feature_data(query_results, workload_features) 70 | if feature_data is None: 71 | print("cannot process ", feature_group_name) 72 | continue 73 | workload_feature_data = feature_data.groupby([TIMESTAMP_COL]).sum()[workload_features] 74 | if len(system_features) > 0: 75 | system_feature_data = extractor.get_system_feature_data(query_results, system_features) 76 | feature_data = workload_feature_data.join(system_feature_data).sort_index().dropna() 77 | else: 78 | feature_data = workload_feature_data 79 | 80 | feature_data = feature_data.join(node_info_data) 81 | node_types = pd.unique(feature_data[node_info_column]) 82 | # filter and extract features 83 | x_values = feature_data[feature_data[node_info_column] == node_type][features].values 84 | max_scaler = MaxAbsScaler() 85 | max_scaler.fit(x_values) 86 | save_scaler(max_scaler, node_type, feature_group_name, max_scaler_top_path) 87 | -------------------------------------------------------------------------------- /src/kepler_model/train/prom/__init__.py: -------------------------------------------------------------------------------- 1 | from .prom_query import PrometheusClient 2 | 3 | __all__ = ["PrometheusClient"] 4 | -------------------------------------------------------------------------------- /src/kepler_model/train/prom/prom_query.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | from prometheus_api_client import 
PrometheusConnect 4 | 5 | from kepler_model.util.prom_types import ( 6 | PROM_QUERY_INTERVAL, 7 | PROM_QUERY_STEP, 8 | PROM_SERVER, 9 | PROM_SSL_DISABLE, 10 | generate_dataframe_from_response, 11 | metric_prefix, 12 | ) 13 | 14 | UTC_OFFSET_TIMEDELTA = datetime.datetime.utcnow() - datetime.datetime.now() 15 | 16 | 17 | def _range_queries(prom, metric_list, start, end, step, params=None): 18 | response = dict() 19 | for metric in metric_list: 20 | response[metric] = prom.custom_query_range(metric, start, end, step, params) 21 | return response 22 | 23 | 24 | class PrometheusClient: 25 | def __init__(self): 26 | self.prom = PrometheusConnect(url=PROM_SERVER, disable_ssl=PROM_SSL_DISABLE) 27 | self.interval = PROM_QUERY_INTERVAL 28 | self.step = PROM_QUERY_STEP 29 | self.latest_query_result = dict() 30 | 31 | def query(self): 32 | available_metrics = self.prom.all_metrics() 33 | queries = [m for m in available_metrics if metric_prefix in m] 34 | end = datetime.datetime.now() 35 | start = end - datetime.timedelta(seconds=self.interval) 36 | self.latest_query_result = dict() 37 | response_dict = _range_queries(self.prom, queries, start, end, self.step, None) 38 | for query_metric, prom_response in response_dict.items(): 39 | self.latest_query_result[query_metric] = generate_dataframe_from_response(query_metric, prom_response) 40 | return response_dict 41 | 42 | def snapshot_query_result(self): 43 | return {metric: data for metric, data in self.latest_query_result.items() if len(data) > 0} 44 | -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/ExponentialRegressionTrainer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/src/kepler_model/train/trainer/ExponentialRegressionTrainer/__init__.py -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/ExponentialRegressionTrainer/main.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import numpy as np 4 | 5 | from kepler_model.train.trainer.curvefit import CurveFitModel, CurveFitTrainer 6 | 7 | 8 | def p0_func(x, y): 9 | a = (y.max() - y.min()) // math.e # scale value 10 | b = 1 # start from linear 11 | c = y.min() - a # initial offset 12 | return [a, b, c] 13 | 14 | 15 | def expo_func(x, a, b, c): 16 | y = a * np.exp(b * x) + c 17 | return y 18 | 19 | 20 | class ExponentialRegressionTrainer(CurveFitTrainer): 21 | def __init__(self, energy_components, feature_group, energy_source, node_level, pipeline_name): 22 | super(ExponentialRegressionTrainer, self).__init__(energy_components, feature_group, energy_source, node_level, pipeline_name=pipeline_name) 23 | self.fe_files = [] 24 | 25 | def init_model(self): 26 | return CurveFitModel(expo_func, p0_func=p0_func) 27 | -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/GradientBoostingRegressorTrainer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/src/kepler_model/train/trainer/GradientBoostingRegressorTrainer/__init__.py -------------------------------------------------------------------------------- 
/src/kepler_model/train/trainer/GradientBoostingRegressorTrainer/main.py: -------------------------------------------------------------------------------- 1 | from sklearn.ensemble import GradientBoostingRegressor 2 | 3 | from kepler_model.train.trainer.scikit import ScikitTrainer 4 | 5 | model_class = "scikit" 6 | 7 | 8 | class GradientBoostingRegressorTrainer(ScikitTrainer): 9 | def __init__(self, energy_components, feature_group, energy_source, node_level, pipeline_name): 10 | super(GradientBoostingRegressorTrainer, self).__init__(energy_components, feature_group, energy_source, node_level, pipeline_name=pipeline_name) 11 | self.fe_files = [] 12 | 13 | def init_model(self): 14 | return GradientBoostingRegressor(n_estimators=100, max_depth=3, learning_rate=0.1) 15 | -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/KNeighborsRegressorTrainer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/src/kepler_model/train/trainer/KNeighborsRegressorTrainer/__init__.py -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/KNeighborsRegressorTrainer/main.py: -------------------------------------------------------------------------------- 1 | from sklearn.neighbors import KNeighborsRegressor 2 | 3 | from kepler_model.train.trainer.scikit import ScikitTrainer 4 | 5 | model_class = "scikit" 6 | 7 | 8 | class KNeighborsRegressorTrainer(ScikitTrainer): 9 | def __init__(self, energy_components, feature_group, energy_source, node_level, pipeline_name): 10 | super(KNeighborsRegressorTrainer, self).__init__(energy_components, feature_group, energy_source, node_level, pipeline_name=pipeline_name) 11 | self.fe_files = [] 12 | 13 | def init_model(self): 14 | return KNeighborsRegressor(n_neighbors=6) 15 | -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/LinearRegressionTrainer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/src/kepler_model/train/trainer/LinearRegressionTrainer/__init__.py -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/LinearRegressionTrainer/main.py: -------------------------------------------------------------------------------- 1 | from sklearn.linear_model import LinearRegression 2 | 3 | from kepler_model.train.trainer.scikit import ScikitTrainer 4 | 5 | model_class = "scikit" 6 | 7 | 8 | class LinearRegressionTrainer(ScikitTrainer): 9 | def __init__(self, energy_components, feature_group, energy_source, node_level, pipeline_name): 10 | super(LinearRegressionTrainer, self).__init__(energy_components, feature_group, energy_source, node_level, pipeline_name=pipeline_name) 11 | self.fe_files = [] 12 | 13 | def init_model(self): 14 | return LinearRegression(positive=True) 15 | -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/LogarithmicRegressionTrainer/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/src/kepler_model/train/trainer/LogarithmicRegressionTrainer/__init__.py -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/LogarithmicRegressionTrainer/main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from kepler_model.train.trainer.curvefit import CurveFitModel, CurveFitTrainer 4 | 5 | 6 | def p0_func(x, y): 7 | a = y.max() - y.min() 8 | b = 1 9 | c = y.min() 10 | return [a, b, c] 11 | 12 | 13 | def log_func(x, a, b, c): 14 | y = a * np.log(b * x + 1) + c 15 | return y 16 | 17 | 18 | class LogarithmicRegressionTrainer(CurveFitTrainer): 19 | def __init__(self, energy_components, feature_group, energy_source, node_level, pipeline_name): 20 | super(LogarithmicRegressionTrainer, self).__init__(energy_components, feature_group, energy_source, node_level, pipeline_name=pipeline_name) 21 | self.fe_files = [] 22 | 23 | def init_model(self): 24 | return CurveFitModel(log_func, p0_func=p0_func) 25 | -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/LogisticRegressionTrainer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/src/kepler_model/train/trainer/LogisticRegressionTrainer/__init__.py -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/LogisticRegressionTrainer/main.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from kepler_model.train.trainer.curvefit import CurveFitModel, CurveFitTrainer 4 | 5 | 6 | def p0_func(x, y): 7 | A = y.max() - y.min() # value range 8 | x0 = 0.5 # sigmoid mid point (as normalized value is in 0 to 1, start mid point = 0.5) 9 | k = A // np.std(y) # growth rate (larger std, lower growth) 10 | off = y.min() # initial offset 11 | return [A, x0, k, off] 12 | 13 | 14 | def logi_func(x, A, x0, k, off): 15 | return A / (1 + np.exp(-k * (x - x0))) + off 16 | 17 | 18 | class LogisticRegressionTrainer(CurveFitTrainer): 19 | def __init__(self, energy_components, feature_group, energy_source, node_level, pipeline_name): 20 | super(LogisticRegressionTrainer, self).__init__(energy_components, feature_group, energy_source, node_level, pipeline_name=pipeline_name) 21 | self.fe_files = [] 22 | 23 | def init_model(self): 24 | return CurveFitModel(logi_func, p0_func=p0_func) 25 | -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/PolynomialRegressionTrainer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/src/kepler_model/train/trainer/PolynomialRegressionTrainer/__init__.py -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/PolynomialRegressionTrainer/main.py: -------------------------------------------------------------------------------- 1 | from sklearn.linear_model import LinearRegression 2 | from sklearn.preprocessing import PolynomialFeatures 3 | 4 | from kepler_model.train.trainer.scikit import 
ScikitTrainer 5 | 6 | poly_scaler_filename = "poly_scaler.pkl" 7 | 8 | 9 | class PolynomialRegressionTrainer(ScikitTrainer): 10 | def __init__(self, energy_components, feature_group, energy_source, node_level, pipeline_name): 11 | super(PolynomialRegressionTrainer, self).__init__(energy_components, feature_group, energy_source, node_level, pipeline_name=pipeline_name) 12 | self.poly_scaler = PolynomialFeatures(degree=2) 13 | self.fe_files = [poly_scaler_filename] 14 | self.fe = [PolynomialFeatures(degree=2)] 15 | 16 | def init_model(self): 17 | return LinearRegression() 18 | -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/SGDRegressorTrainer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/src/kepler_model/train/trainer/SGDRegressorTrainer/__init__.py -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/SGDRegressorTrainer/main.py: -------------------------------------------------------------------------------- 1 | from sklearn.linear_model import SGDRegressor 2 | 3 | from kepler_model.train.trainer.scikit import ScikitTrainer 4 | 5 | 6 | class SGDRegressorTrainer(ScikitTrainer): 7 | def __init__(self, energy_components, feature_group, energy_source, node_level, pipeline_name): 8 | super(SGDRegressorTrainer, self).__init__(energy_components, feature_group, energy_source, node_level, pipeline_name=pipeline_name) 9 | self.fe_files = [] 10 | 11 | def init_model(self): 12 | return SGDRegressor(max_iter=1000) 13 | -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/SVRRegressorTrainer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/src/kepler_model/train/trainer/SVRRegressorTrainer/__init__.py -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/SVRRegressorTrainer/main.py: -------------------------------------------------------------------------------- 1 | from sklearn.svm import SVR 2 | 3 | from kepler_model.train.trainer.scikit import ScikitTrainer 4 | 5 | common_node_type = 1 6 | 7 | 8 | class SVRRegressorTrainer(ScikitTrainer): 9 | def __init__(self, energy_components, feature_group, energy_source, node_level, pipeline_name): 10 | super(SVRRegressorTrainer, self).__init__(energy_components, feature_group, energy_source, node_level, pipeline_name=pipeline_name) 11 | self.fe_files = [] 12 | 13 | def init_model(self): 14 | return SVR(C=1.0, epsilon=0.2) 15 | -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/XGBoostTrainer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/src/kepler_model/train/trainer/XGBoostTrainer/__init__.py -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/XgboostFitTrainer/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/src/kepler_model/train/trainer/XgboostFitTrainer/__init__.py -------------------------------------------------------------------------------- /src/kepler_model/train/trainer/XgboostFitTrainer/main.py: -------------------------------------------------------------------------------- 1 | from kepler_model.train.trainer.xgboost_interface import XgboostTrainer 2 | 3 | 4 | class XgboostFitTrainer(XgboostTrainer): 5 | def __init__(self, energy_components, feature_group, energy_source, node_level, pipeline_name): 6 | super(XgboostFitTrainer, self).__init__(energy_components, feature_group, energy_source, node_level, pipeline_name=pipeline_name) 7 | self.fe_files = [] 8 | 9 | def _train(self, node_type, component, X_values, y_values): 10 | model = self.node_models[node_type][component] 11 | if model.__sklearn_is_fitted__(): 12 | self.node_models[node_type][component].fit(X_values, y_values, xgb_model=model) 13 | else: 14 | self.node_models[node_type][component].fit(X_values, y_values) 15 | -------------------------------------------------------------------------------- /src/kepler_model/util/__init__.py: -------------------------------------------------------------------------------- 1 | # commonly-used definitions 2 | from .config import get_config, model_toppath 3 | from .loader import ( 4 | class_to_json, 5 | default_train_output_pipeline, 6 | list_model_names, 7 | load_csv, 8 | load_json, 9 | load_metadata, 10 | load_pkl, 11 | load_remote_pkl, 12 | load_scaler, 13 | load_weight, 14 | version, 15 | ) 16 | from .prom_types import get_valid_feature_group_from_queries 17 | from .saver import assure_path, save_csv, save_json, save_metadata, save_pkl, save_scaler, save_weight 18 | from .train_types import ( 19 | BPF_FEATURES, 20 | COUNTER_FEAUTRES, 21 | IRQ_FEATURES, 22 | SYSTEM_FEATURES, 23 | WORKLOAD_FEATURES, 24 | FeatureGroup, 25 | FeatureGroups, 26 | ModelOutputType, 27 | PowerSourceMap, 28 | get_feature_group, 29 | ) 30 | 31 | __all__ = [ 32 | "load_json", 33 | "load_csv", 34 | "load_pkl", 35 | "load_metadata", 36 | "load_scaler", 37 | "load_weight", 38 | "load_remote_pkl", 39 | "list_model_names", 40 | "default_train_output_pipeline", 41 | "class_to_json", 42 | "version", 43 | "assure_path", 44 | "save_csv", 45 | "save_json", 46 | "save_pkl", 47 | "save_metadata", 48 | "save_scaler", 49 | "save_weight", 50 | "get_config", 51 | "model_toppath", 52 | "SYSTEM_FEATURES", 53 | "COUNTER_FEAUTRES", 54 | "BPF_FEATURES", 55 | "IRQ_FEATURES", 56 | "WORKLOAD_FEATURES", 57 | "PowerSourceMap", 58 | "FeatureGroup", 59 | "FeatureGroups", 60 | "ModelOutputType", 61 | "get_feature_group", 62 | "get_valid_feature_group_from_queries", 63 | ] 64 | -------------------------------------------------------------------------------- /src/kepler_model/util/extract_types.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .prom_types import TIMESTAMP_COL, pkg_id_column 4 | from .train_types import PowerSourceMap 5 | 6 | container_id_colname = "id" 7 | all_container_key = "all containers" 8 | accelerator_type_colname = "type" 9 | 10 | node_level_index = [TIMESTAMP_COL] 11 | pkg_level_index = [TIMESTAMP_COL, pkg_id_column] 12 | container_level_index = [TIMESTAMP_COL, container_id_colname] 13 | 14 | 15 | def component_to_col(component, unit_col=None, unit_val=None): 16 | power_colname = f"{component}_power" 17 | if unit_col is None: 18 | 
return power_colname 19 | return f"{unit_col}_{unit_val}_{power_colname}" 20 | 21 | 22 | def col_to_component(component_col): 23 | splits = component_col.split("_") 24 | component = splits[-2:][0] 25 | if component == "dynamic" or component == "background": 26 | return splits[-3:][0] 27 | return component 28 | 29 | 30 | def col_to_unit_val(component_col): 31 | return component_col.split("_")[-3:][0] 32 | 33 | 34 | def ratio_to_col(unit_val): 35 | return f"packge_ratio_{unit_val}" 36 | 37 | 38 | def get_unit_vals(power_columns): 39 | return np.unique([col_to_unit_val(col) for col in power_columns if "package" in col]) 40 | 41 | 42 | def get_num_of_unit(energy_source, label_cols): 43 | energy_components = PowerSourceMap[energy_source] 44 | num_of_unit = len(label_cols) / len(energy_components) 45 | return num_of_unit 46 | 47 | 48 | def get_expected_power_columns(energy_components, num_of_unit=1): 49 | # TODO: if ratio applied, 50 | # return [component_to_col(component, "package", unit_val) for component in energy_components for unit_val in range(0,num_of_unit)] 51 | return [component_to_col(component) for component in energy_components] 52 | -------------------------------------------------------------------------------- /src/kepler_model/util/format.py: -------------------------------------------------------------------------------- 1 | def print_bounded_multiline_message(input_lines, maxlength=200): 2 | lines = [] 3 | for line in input_lines: 4 | i = 0 5 | while len(line) > maxlength: 6 | lines += [line[0:maxlength]] 7 | line = line[maxlength:] 8 | i = maxlength 9 | if len(line) > 0: 10 | lines += [line] 11 | 12 | max_line_length = max(len(line) for line in lines) 13 | border = "#" * (max_line_length + 4) 14 | print(border) 15 | 16 | for line in lines: 17 | formatted_line = f"# {line.ljust(max_line_length)} #" 18 | print(formatted_line) 19 | 20 | print(border) 21 | 22 | 23 | from datetime import datetime 24 | 25 | 26 | def time_to_str(time): 27 | if isinstance(time, datetime): 28 | return time.strftime("%Y-%m-%d %H:%M:%S") 29 | return time 30 | -------------------------------------------------------------------------------- /src/kepler_model/util/saver.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | import joblib 5 | 6 | METADATA_FILENAME = "metadata" 7 | SCALER_FILENAME = "scaler" 8 | WEIGHT_FILENAME = "weight" 9 | TRAIN_ARGS_FILENAME = "train_arguments" 10 | NODE_TYPE_INDEX_FILENAME = "node_type_index" 11 | 12 | MACHINE_SPEC_PATH = "machine_spec" 13 | 14 | 15 | def _pipeline_model_metadata_filename(energy_source, model_type): 16 | return f"{energy_source}_{model_type}_model_metadata" 17 | 18 | 19 | def _power_curve_filename(energy_source, model_type): 20 | return f"{energy_source}_{model_type}_power_curve" 21 | 22 | 23 | def assure_path(path): 24 | if path == "": 25 | return "" 26 | if not os.path.exists(path): 27 | os.makedirs(path, exist_ok=True) 28 | return path 29 | 30 | 31 | def save_json(path, name, data): 32 | if name.endswith(".json") is False: 33 | name = name + ".json" 34 | 35 | assure_path(path) 36 | filename = os.path.join(path, name) 37 | with open(filename, "w") as f: 38 | json.dump(data, f) 39 | return name 40 | 41 | 42 | def save_pkl(path, name, data): 43 | if ".pkl" not in name: 44 | name = name + ".pkl" 45 | assure_path(path) 46 | filename = os.path.join(path, name) 47 | joblib.dump(data, filename) 48 | return name 49 | 50 | 51 | def save_csv(path, name, data): 52 | if ".csv" not in name: 53 |
name = name + ".csv" 54 | assure_path(path) 55 | filename = os.path.join(path, name) 56 | data.to_csv(filename) 57 | return name 58 | 59 | 60 | def save_machine_spec(data_path, machine_id, spec): 61 | machine_spec_path = os.path.join(data_path, MACHINE_SPEC_PATH) 62 | assure_path(machine_spec_path) 63 | save_json(machine_spec_path, machine_id, spec.get_json()) 64 | 65 | 66 | def save_node_type_index(pipeline_path, node_type_index): 67 | return save_json(pipeline_path, NODE_TYPE_INDEX_FILENAME, node_type_index) 68 | 69 | 70 | def save_metadata(model_path, metadata): 71 | return save_json(model_path, METADATA_FILENAME, metadata) 72 | 73 | 74 | def save_train_args(pipeline_path, args): 75 | return save_json(pipeline_path, TRAIN_ARGS_FILENAME, args) 76 | 77 | 78 | def save_scaler(model_path, scaler): 79 | return save_pkl(model_path, SCALER_FILENAME, scaler) 80 | 81 | 82 | def save_weight(model_path, weight): 83 | return save_json(model_path, WEIGHT_FILENAME, weight) 84 | 85 | 86 | def save_pipeline_metadata(pipeline_path, pipeline_metadata, energy_source, model_type, metadata_df): 87 | save_metadata(pipeline_path, pipeline_metadata) 88 | pipeline_model_metadata_filename = _pipeline_model_metadata_filename(energy_source, model_type) 89 | return save_csv(pipeline_path, pipeline_model_metadata_filename, metadata_df) 90 | 91 | 92 | def save_profile(profile_path, source, profile): 93 | profile_filename = os.path.join(profile_path, source + ".json") 94 | with open(profile_filename, "w") as f: 95 | json.dump(profile, f) 96 | -------------------------------------------------------------------------------- /src/kepler_model/util/similarity.py: -------------------------------------------------------------------------------- 1 | from .train_types import NodeAttribute 2 | 3 | # simplified weights 4 | # TODO: experimental support for deciding the weight 5 | similarity_reference = { 6 | NodeAttribute.PROCESSOR: 5, 7 | NodeAttribute.CORES: 1, 8 | NodeAttribute.CHIPS: 1, 9 | NodeAttribute.MEMORY: 0.5, 10 | NodeAttribute.FREQ: 0.5, 11 | } 12 | 13 | similarity_total_weight = sum(similarity_reference.values()) 14 | 15 | 16 | def get_similarity_weight(attr): 17 | return similarity_reference[attr] / similarity_total_weight 18 | 19 | 20 | def compute_jaccard_similarity(str1: str, str2: str) -> float: 21 | if str1.lower() == str2.lower(): # including the case of both are empty 22 | return 1 23 | if len(str1) == 0 or len(str2) == 0: 24 | return 0 25 | set1 = set(str1.lower()) # Convert to lowercase for case-insensitive comparison 26 | set2 = set(str2.lower()) 27 | 28 | intersection = len(set1.intersection(set2)) 29 | union = len(set1.union(set2)) 30 | 31 | similarity = intersection / union * 0.5 32 | return similarity 33 | 34 | 35 | def compute_similarity(base: float, cmp: float) -> float: 36 | base = float(base) 37 | cmp = float(cmp) 38 | diff_ratio = 0 39 | if base > 0 or cmp > 0: 40 | diff_ratio = abs(cmp - base) / ((base + cmp) / 2) 41 | if diff_ratio >= 1: 42 | return 0 43 | else: 44 | return 1 - diff_ratio 45 | 46 | 47 | def compute_looseness(similarity): 48 | return 1 - similarity 49 | 50 | 51 | # get_candidate_score returns certainty 52 | def get_candidate_score(candidate_uncertain_attribute_freq, candidate_uncertain_attribute_total): 53 | candidate_score = dict() 54 | for attr, candidates in candidate_uncertain_attribute_freq.items(): 55 | total = candidate_uncertain_attribute_total[attr] 56 | if total == 0: 57 | # no uncertainty 58 | continue 59 | for candidate in candidates: 60 | candidate_index = 
candidate[0] 61 | candidate_freq = candidate[1] 62 | if candidate_index not in candidate_score: 63 | candidate_score[candidate_index] = 0 64 | candidate_score[candidate_index] += float(candidate_freq) / total 65 | return candidate_score 66 | 67 | 68 | def find_best_candidate(candidate_score): 69 | max_score = 0 70 | best_candidate_index = -1 71 | for index, score in candidate_score.items(): 72 | if score > max_score: 73 | best_candidate_index = index 74 | max_score = score 75 | return best_candidate_index, max_score 76 | 77 | 78 | def compute_uncertainty(max_score, num_of_none): 79 | if num_of_none == 0: 80 | return 0 # covered 81 | uncertainty = 1 - max_score / num_of_none 82 | return uncertainty 83 | 84 | 85 | def get_num_of_none(in_spec): 86 | num_of_none = 0 87 | for attr in NodeAttribute: 88 | if in_spec.attrs[attr] is None: 89 | num_of_none += 1 90 | return num_of_none 91 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sustainable-computing-io/kepler-model-server/bd4c15cc1d66c8d5fa72e08c80041429ccc41dce/tests/__init__.py -------------------------------------------------------------------------------- /tests/client_load_tester.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from estimator import SERVE_SOCKET 4 | from estimator_model_test import generate_request, model_names 5 | from estimator_power_request_test import Client 6 | 7 | loads = range(10, 11, 10) 8 | duration = 120 9 | 10 | if __name__ == "__main__": 11 | client = Client(SERVE_SOCKET) 12 | for model_name in model_names: 13 | for load in loads: 14 | request_json = generate_request(model_name, load) 15 | start_time = time.time() 16 | client.make_request(request_json) 17 | elapsed_time = time.time() - start_time 18 | output = f"{model_name},{load},{elapsed_time}" 19 | print(output) 20 | time.sleep(1) 21 | -------------------------------------------------------------------------------- /tests/data/machine/spec.json: -------------------------------------------------------------------------------- 1 | {"processor": "intel_xeon_platinum_8259cl", "cores": 96, "chips": 2, "memory": 377, "frequency": 3500} 2 | -------------------------------------------------------------------------------- /tests/data/node_type_index.json: -------------------------------------------------------------------------------- 1 | {"0": {"attrs": {"processor": "intel_xeon_platinum_8259cl", "cores": 96, "chips": 2, "memory": 377, "frequency": 3500}, "members": ["m5.metal-ami-0e4d0bb9670ea8db0"]}, "1": {"attrs": {"processor": "intel_xeon_e5_2686v4", "cores": 72, "chips": 2, "memory": 503, "frequency": 3000}, "members": ["i3.metal-ami-0e4d0bb9670ea8db0"]}, "2": {"attrs": {"processor": "intel_xeon_platinum_8275cl", "cores": 96, "chips": 2, "memory": 188, "frequency": 3900}, "members": ["c5.metal-ami-0e4d0bb9670ea8db0"]}, "3": {"attrs": {"processor": "intel_xeon_platinum_8259cl", "cores": 96, "chips": 2, "memory": 755, "frequency": 3500}, "members": ["r5.metal-ami-0e4d0bb9670ea8db0"]}, "4": {"attrs": {"processor": "intel_xeon_platinum_8252c", "cores": 48, "chips": 2, "memory": 188, "frequency": 4500}, "members": ["m5zn.metal-ami-0e4d0bb9670ea8db0"]}, "5": {"attrs": {"processor": "intel_xeon_platinum_8488c", "cores": 96, "chips": 1, "memory": 377, "frequency": 3800}, "members": ["m7i.metal-24xl-ami-0e4d0bb9670ea8db0"]}} 2 | 
-------------------------------------------------------------------------------- /tests/estimator_power_request_test.py: -------------------------------------------------------------------------------- 1 | import json 2 | import socket 3 | 4 | from kepler_model.util.config import SERVE_SOCKET 5 | from kepler_model.util.train_types import ( 6 | CATEGORICAL_LABEL_TO_VOCAB, 7 | SYSTEM_FEATURES, 8 | WORKLOAD_FEATURES, 9 | ModelOutputType, 10 | ) 11 | from tests.extractor_test import test_energy_source 12 | 13 | trainer_names = ["SGDRegressorTrainer"] 14 | test_energy_sources = ["acpi", "rapl-sysfs"] 15 | 16 | 17 | def generate_request( 18 | train_name, n=1, metrics=WORKLOAD_FEATURES, system_features=SYSTEM_FEATURES, output_type=ModelOutputType.DynPower.name, energy_source=test_energy_source 19 | ): 20 | request_json = dict() 21 | if train_name is not None: 22 | request_json["trainer_name"] = train_name 23 | request_json["metrics"] = metrics 24 | request_json["system_features"] = system_features 25 | request_json["system_values"] = [] 26 | for m in system_features: 27 | request_json["system_values"] += [CATEGORICAL_LABEL_TO_VOCAB[m][0]] 28 | request_json["values"] = [[1.0] * len(metrics)] * n 29 | request_json["output_type"] = output_type 30 | request_json["source"] = energy_source 31 | return request_json 32 | 33 | 34 | def process(client, energy_source): 35 | request_json = generate_request(trainer_names[0], 2, output_type="AbsPower", energy_source=energy_source) 36 | res = client.make_request(request_json) 37 | res_json = json.loads(res) 38 | print(res_json) 39 | assert res_json["msg"] == "", "response error: {}".format(res_json["msg"]) 40 | assert len(res_json["powers"]) > 0, "zero powers" 41 | 42 | 43 | class Client: 44 | def __init__(self, socket_path): 45 | self.socket_path = socket_path 46 | 47 | def make_request(self, request_json): 48 | s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) 49 | s.connect(self.socket_path) 50 | data = json.dumps(request_json) 51 | print(data) 52 | s.send(data.encode()) 53 | data = b"" 54 | while True: 55 | shunk = s.recv(1024).strip() 56 | data += shunk 57 | if shunk is None or len(shunk.decode()) == 0 or shunk.decode()[-1] == "}": 58 | break 59 | decoded_data = data.decode() 60 | s.close() 61 | return decoded_data 62 | 63 | 64 | def test_estimator_power_request(): 65 | client = Client(SERVE_SOCKET) 66 | for energy_source in test_energy_sources: 67 | process(client, energy_source) 68 | 69 | 70 | if __name__ == "__main__": 71 | test_estimator_power_request() 72 | -------------------------------------------------------------------------------- /tests/http_server.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | import http.server 3 | import os 4 | import socketserver 5 | import threading 6 | 7 | from kepler_model.util.config import model_toppath 8 | 9 | 10 | def cleanup_task(server): 11 | print("Shutdown server...") 12 | server.shutdown() 13 | 14 | 15 | def get_server(file_server_port): 16 | Handler = http.server.SimpleHTTPRequestHandler 17 | httpd = socketserver.TCPServer(("", file_server_port), Handler) 18 | 19 | # Register the cleanup task to be executed on program exit 20 | atexit.register(cleanup_task, httpd) 21 | 22 | print("Http File Serve Serving at Port", file_server_port, " for ", model_toppath) 23 | return httpd 24 | 25 | 26 | def http_file_server(file_server_port): 27 | try: 28 | httpd = get_server(file_server_port) 29 | # Start the server in a separate thread 30 | server_thread = 
threading.Thread(target=httpd.serve_forever) 31 | server_thread.daemon = True 32 | server_thread.start() 33 | except Exception as err: 34 | print(f"File server is running: {err}") 35 | 36 | 37 | def run(): 38 | os.chdir(model_toppath) 39 | httpd = get_server(8110) 40 | httpd.serve_forever() 41 | 42 | 43 | if __name__ == "__main__": 44 | run() 45 | -------------------------------------------------------------------------------- /tests/minimal_trainer.py: -------------------------------------------------------------------------------- 1 | from pipeline_test import process 2 | 3 | from kepler_model.util import FeatureGroup 4 | 5 | trainer_names = ["GradientBoostingRegressorTrainer", "SGDRegressorTrainer", "XgboostFitTrainer"] 6 | valid_feature_groups = [FeatureGroup.BPFOnly] 7 | 8 | if __name__ == "__main__": 9 | process( 10 | target_energy_sources=["acpi", "rapl-sysfs"], 11 | abs_trainer_names=trainer_names, 12 | dyn_trainer_names=trainer_names, 13 | valid_feature_groups=valid_feature_groups, 14 | ) 15 | -------------------------------------------------------------------------------- /tests/pipeline_test.py: -------------------------------------------------------------------------------- 1 | from kepler_model.train import NewPipeline, NodeTypeSpec 2 | from kepler_model.util import PowerSourceMap, get_valid_feature_group_from_queries 3 | from kepler_model.util.loader import default_node_type, default_train_output_pipeline 4 | from tests.extractor_test import test_energy_source, test_extractors 5 | from tests.isolator_test import test_isolators 6 | from tests.prom_test import get_query_results, prom_output_filename, prom_output_path 7 | from tests.trainer_test import assert_train, test_trainer_names 8 | 9 | # fake spec value 10 | spec_values = {"processor": "test", "cores": 1, "chips": 1, "memory": -1, "frequency": -1} 11 | spec = NodeTypeSpec(**spec_values) 12 | 13 | test_energy_sources = ["acpi", "rapl-sysfs"] 14 | 15 | 16 | def assert_pipeline(pipeline, query_results, feature_group, energy_source, energy_components): 17 | success, abs_data, dyn_data = pipeline.process( 18 | query_results, energy_components, energy_source, feature_group=feature_group.name, replace_node_type=default_node_type 19 | ) 20 | assert success, f"failed to process pipeline {pipeline.name}" 21 | for trainer in pipeline.trainers: 22 | if trainer.feature_group == feature_group and trainer.energy_source == energy_source: 23 | if trainer.node_level: 24 | assert_train(trainer, abs_data, energy_components) 25 | else: 26 | assert_train(trainer, dyn_data, energy_components) 27 | 28 | 29 | def process( 30 | save_pipeline_name=default_train_output_pipeline, 31 | prom_save_path=prom_output_path, 32 | prom_save_name=prom_output_filename, 33 | abs_trainer_names=test_trainer_names, 34 | dyn_trainer_names=test_trainer_names, 35 | extractors=test_extractors, 36 | isolators=test_isolators, 37 | target_energy_sources=[test_energy_source], 38 | valid_feature_groups=None, 39 | ): 40 | query_results = get_query_results(save_path=prom_save_path, save_name=prom_save_name) 41 | if valid_feature_groups is None: 42 | valid_feature_groups = get_valid_feature_group_from_queries(query_results.keys()) 43 | for extractor in extractors: 44 | for isolator in isolators: 45 | pipeline = NewPipeline( 46 | save_pipeline_name, 47 | abs_trainer_names, 48 | dyn_trainer_names, 49 | extractor=extractor, 50 | isolator=isolator, 51 | target_energy_sources=target_energy_sources, 52 | valid_feature_groups=valid_feature_groups, 53 | ) 54 | global spec 55 | 
pipeline.node_collection.index_train_machine("test", spec) 56 | for energy_source in target_energy_sources: 57 | energy_components = PowerSourceMap[energy_source] 58 | for feature_group in valid_feature_groups: 59 | assert_pipeline(pipeline, query_results, feature_group, energy_source, energy_components) 60 | # save metadata 61 | pipeline.save_metadata() 62 | # save node collection 63 | pipeline.node_collection.save() 64 | # save pipeline 65 | pipeline.archive_pipeline() 66 | 67 | 68 | def test_process(): 69 | process(target_energy_sources=test_energy_sources) 70 | -------------------------------------------------------------------------------- /tests/prom_test.py: -------------------------------------------------------------------------------- 1 | # prom_test.py 2 | # - prom_client.query 3 | # - prom_client.snapshot_query_result 4 | # 5 | # save response to prom_output_path/prom_output_filename.json 6 | # 7 | # To use output: 8 | # from prom_test import get_prom_output 9 | # response = get_prom_response() 10 | # or 11 | # query_result = get_query_results() 12 | 13 | import os 14 | 15 | from kepler_model.train.prom import PrometheusClient 16 | from kepler_model.util import load_json, save_json 17 | from kepler_model.util.prom_types import prom_responses_to_results 18 | 19 | prom_output_path = os.path.join(os.path.dirname(__file__), "data", "prom_output") 20 | prom_output_filename = "prom_response" 21 | 22 | 23 | def get_prom_response(save_path=prom_output_path, save_name=prom_output_filename): 24 | return load_json(save_path, save_name) 25 | 26 | 27 | def get_query_results(save_path=prom_output_path, save_name=prom_output_filename): 28 | response = get_prom_response(save_path=save_path, save_name=save_name) 29 | return prom_responses_to_results(response) 30 | 31 | 32 | def process(save_path=prom_output_path, save_name=prom_output_filename, server=None, interval=None, step=None): 33 | if server is not None: 34 | os.environ["PROM_SERVER"] = server 35 | if interval is not None: 36 | os.environ["PROM_QUERY_INTERVAL"] = interval 37 | if step is not None: 38 | os.environ["PROM_QUERY_STEP"] = step 39 | prom_client = PrometheusClient() 40 | response_dict = prom_client.query() 41 | results = prom_client.snapshot_query_result() 42 | print("Available metrics: ", results.keys()) 43 | # print query data in csv 44 | for metric, data in results.items(): 45 | print(metric) 46 | print(data.head()) 47 | save_json(save_path, save_name, response_dict) 48 | 49 | 50 | def test_prom_process(): 51 | process() 52 | -------------------------------------------------------------------------------- /tests/weight_model_request_test.py: -------------------------------------------------------------------------------- 1 | ######################### 2 | # weight_mode_request.py 3 | # 4 | # This file covers the following cases. 
5 | # - getting weight from model server based on available features 6 | # 7 | ######################### 8 | 9 | import json 10 | import os 11 | import sys 12 | import time 13 | 14 | import requests 15 | 16 | from kepler_model.estimate.model_server_connector import list_all_models 17 | from kepler_model.util.config import download_path, get_model_server_req_endpoint 18 | from kepler_model.util.loader import get_download_output_path 19 | from kepler_model.util.train_types import FeatureGroup, FeatureGroups, ModelOutputType 20 | from tests.estimator_power_request_test import generate_request 21 | from tests.extractor_test import test_energy_source 22 | 23 | os.environ["MODEL_SERVER_URL"] = "http://localhost:8100" 24 | 25 | weight_available_trainers = ["SGDRegressorTrainer"] 26 | 27 | if __name__ == "__main__": 28 | # test getting model from server 29 | os.environ["MODEL_SERVER_ENABLE"] = "true" 30 | energy_source = test_energy_source 31 | 32 | available_models = list_all_models(energy_source=energy_source) 33 | while len(available_models) == 0: 34 | time.sleep(1) 35 | print("wait for kepler model server response") 36 | available_models = list_all_models(energy_source=energy_source) 37 | 38 | for output_type_name, valid_fgs in available_models.items(): 39 | output_type = ModelOutputType[output_type_name] 40 | output_path = get_download_output_path(download_path, energy_source, output_type) 41 | for fg_name, best_model in valid_fgs.items(): 42 | for trainer in weight_available_trainers: 43 | print("feature group: ", fg_name) 44 | metrics = FeatureGroups[FeatureGroup[fg_name]] 45 | request_json = generate_request(trainer, n=10, metrics=metrics, output_type=output_type_name) 46 | request_json["metrics"] += request_json["system_features"] 47 | request_json["weight"] = "true" 48 | del request_json["system_features"] 49 | del request_json["values"] 50 | del request_json["system_values"] 51 | try: 52 | response = requests.post(get_model_server_req_endpoint(), json=request_json) 53 | except Exception as err: 54 | print(f"cannot get response from model server: {err}") 55 | sys.exit(1) 56 | assert response.status_code == 200, f"response {request_json} not OK" 57 | loaded_weight = json.loads(response.content) 58 | print(loaded_weight) 59 | -------------------------------------------------------------------------------- /tests/xgboost_test.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | 4 | from kepler_model.train import DefaultExtractor 5 | from kepler_model.train.profiler.profiler import response_to_result 6 | from kepler_model.train.trainer.XGBoostTrainer.main import XGBoostRegressionStandalonePipeline 7 | from kepler_model.util.train_types import FeatureGroup, XGBoostRegressionTrainType 8 | 9 | energy_components = ["package", "core", "uncore", "dram"] 10 | feature_group = FeatureGroup.BPFIRQ.name 11 | energy_source = "rapl-sysfs" 12 | 13 | prom_response_file = os.path.join(os.path.dirname(__file__), "data", "prom_output", "prom_response.json") 14 | 15 | 16 | def read_sample_query_results(): 17 | with open(prom_response_file) as f: 18 | response = json.load(f) 19 | return response_to_result(response) 20 | 21 | 22 | if __name__ == "__main__": 23 | # Note that extractor mutates the query results 24 | query_results = read_sample_query_results() 25 | assert len(query_results) > 0, "cannot read_sample_query_results" 26 | instance = DefaultExtractor() 27 | extracted_data, power_columns, _, _ = instance.extract(query_results, 
energy_components, feature_group, energy_source, node_level=True) 28 | xgb_container_level_pipeline_kfold = XGBoostRegressionStandalonePipeline( 29 | XGBoostRegressionTrainType.KFoldCrossValidation, "test_models/XGBoost/", node_level=False 30 | ) 31 | xgb_node_pipeline_kfold = XGBoostRegressionStandalonePipeline(XGBoostRegressionTrainType.KFoldCrossValidation, "test_models/XGBoost/", node_level=True) 32 | xgb_container_level_pipeline_tts = XGBoostRegressionStandalonePipeline( 33 | XGBoostRegressionTrainType.TrainTestSplitFit, "test_models/XGBoost/", node_level=False 34 | ) 35 | xgb_node_pipeline_tts = XGBoostRegressionStandalonePipeline(XGBoostRegressionTrainType.TrainTestSplitFit, "test_models/XGBoost/", node_level=True) 36 | xgb_node_pipeline_kfold.train(None, query_results) 37 | xgb_container_level_pipeline_tts.train(None, query_results) 38 | xgb_node_pipeline_tts.train(None, query_results) 39 | xgb_container_level_pipeline_kfold.train(None, query_results) 40 | --------------------------------------------------------------------------------