├── .github ├── mlc_config.json ├── workflows │ ├── shellcheck.yml │ ├── docs.yml │ ├── codeql-analysis.yml │ └── python.yml └── dependabot.yml ├── AWS ├── requirements.txt ├── setup.cfg ├── README.md └── aws_cspm_benchmark.py ├── Azure ├── setup.cfg ├── requirements.txt ├── README.md └── azure_cspm_benchmark.py ├── GCP ├── setup.cfg ├── requirements.txt ├── README.md └── gcp_cspm_benchmark.py ├── setup.cfg ├── LICENSE ├── DEVELOPMENT.md ├── .gitignore ├── README.md ├── CODE_OF_CONDUCT.md └── benchmark.sh /.github/mlc_config.json: -------------------------------------------------------------------------------- 1 | { 2 | } 3 | -------------------------------------------------------------------------------- /AWS/requirements.txt: -------------------------------------------------------------------------------- 1 | tabulate 2 | boto3 3 | botocore 4 | -------------------------------------------------------------------------------- /Azure/setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | max-complexity = 10 4 | 5 | [pylint.MASTER] 6 | disable=C0301,C0116,C0115 -------------------------------------------------------------------------------- /AWS/setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | max-complexity = 10 4 | 5 | [pylint.MASTER] 6 | disable=C0301,C0116,C0115,C0103 -------------------------------------------------------------------------------- /GCP/setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | max-complexity = 10 4 | 5 | [pylint.MASTER] 6 | disable=C0301,C0116,C0115,C0114 -------------------------------------------------------------------------------- /GCP/requirements.txt: -------------------------------------------------------------------------------- 1 | google-cloud-compute 2 | google-cloud-run 3 | 
google-cloud-resource-manager 4 | google-api-python-client 5 | oauth2client 6 | tabulate 7 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | # General flake8, pylint settings 2 | [flake8] 3 | max-line-length = 120 4 | max-complexity = 10 5 | 6 | [pylint.MASTER] 7 | disable=C0301,C0116,C0115,C0114 8 | -------------------------------------------------------------------------------- /.github/workflows/shellcheck.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: [ main ] 4 | pull_request: 5 | branches: [ main ] 6 | 7 | jobs: 8 | bash: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v5 12 | - name: Run ShellCheck 13 | uses: ludeeus/action-shellcheck@master 14 | with: 15 | format: tty 16 | scandir: './' 17 | -------------------------------------------------------------------------------- /Azure/requirements.txt: -------------------------------------------------------------------------------- 1 | # 2 | # This file is autogenerated by pip-compile 3 | # To update, run: 4 | # 5 | # pip-compile requirements.in 6 | # 7 | azure-identity 8 | azure-mgmt-resource 9 | azure-mgmt-containerservice 10 | azure-mgmt-compute 11 | azure-mgmt-containerinstance 12 | msrestazure 13 | pyjwt>=2.4.0 # not directly required, pinned by Snyk to avoid a vulnerability 14 | tabulate -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | --- 2 | version: 2 3 | updates: 4 | - package-ecosystem: pip 5 | directory: "/AWS" 6 | schedule: 7 | interval: weekly 8 | open-pull-requests-limit: 10 9 | - package-ecosystem: pip 10 | directory: "/Azure" 11 | schedule: 12 | interval: weekly 13 | open-pull-requests-limit: 10 14 | - package-ecosystem: github-actions 15 | 
directory: "/" 16 | schedule: 17 | interval: monthly 18 | open-pull-requests-limit: 10 19 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | --- 2 | name: Docs 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | paths: 9 | - '**.md' 10 | jobs: 11 | markdown-link-check: 12 | name: Broken Links 13 | runs-on: ubuntu-latest 14 | steps: 15 | - name: Checkout 16 | uses: actions/checkout@v5 17 | with: 18 | submodules: recursive 19 | - name: Run link check 20 | uses: gaurav-nelson/github-action-markdown-link-check@v1 21 | with: 22 | use-quiet-mode: 'no' 23 | use-verbose-mode: 'yes' 24 | check-modified-files-only: 'yes' 25 | config-file: '.github/mlc_config.json' 26 | base-branch: main 27 | -------------------------------------------------------------------------------- /Azure/README.md: -------------------------------------------------------------------------------- 1 | # Cloud-Benchmark - Azure 2 | 3 | This script is a read-only utility that counts cloud resources in your Azure account. 4 | No changes will be made to your account. No data will be sent anywhere and will remain in your cloud shell environment. 5 | 6 | ## How to use 7 | 8 | ### Initialize execution environment 9 | 10 | - Log-in with azure. 
Use an account that has read access to all your Azure tenants/subscriptions 11 | - Navigate to [Azure Cloud Shell](https://shell.azure.com) and choose the bash option 12 | 13 | ### Run the script 14 | 15 | ```shell 16 | curl https://raw.githubusercontent.com/CrowdStrike/cloud-resource-estimator/main/benchmark.sh | bash 17 | ``` 18 | 19 | ### Collect the findings 20 | 21 | ```shell 22 | cat ./cloud-benchmark/*benchmark.csv 23 | ``` 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | This is free and unencumbered software released into the public domain. 2 | 3 | Anyone is free to copy, modify, publish, use, compile, sell, or 4 | distribute this software, either in source code form or as a compiled 5 | binary, for any purpose, commercial or non-commercial, and by any 6 | means. 7 | 8 | In jurisdictions that recognize copyright laws, the author or authors 9 | of this software dedicate any and all copyright interest in the 10 | software to the public domain. We make this dedication for the benefit 11 | of the public at large and to the detriment of our heirs and 12 | successors. We intend this dedication to be an overt act of 13 | relinquishment in perpetuity of all present and future rights to this 14 | software under copyright law. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR 20 | OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, 21 | ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR 22 | OTHER DEALINGS IN THE SOFTWARE. 
23 | 24 | For more information, please refer to <https://unlicense.org> 25 | -------------------------------------------------------------------------------- /.github/workflows/codeql-analysis.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ main ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ main ] 20 | schedule: 21 | - cron: '30 7 * * 5' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ 'python' ] 36 | 37 | steps: 38 | - name: Checkout repository 39 | uses: actions/checkout@v5 40 | 41 | # Initializes the CodeQL tools for scanning. 42 | - name: Initialize CodeQL 43 | uses: github/codeql-action/init@v4 44 | with: 45 | languages: ${{ matrix.language }} 46 | # If you wish to specify custom queries, you can do so here or in a config file. 47 | # By default, queries listed here will override any specified in a config file. 48 | # Prefix the list here with "+" to use these queries and those in the config file. 49 | # queries: ./path/to/local/query, your-org/your-repo/queries@main 50 | 51 | # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). 
52 | # If this step fails, then you should remove it and run the build manually (see below) 53 | - name: Autobuild 54 | uses: github/codeql-action/autobuild@v4 55 | 56 | # ℹ️ Command-line programs to run using the OS shell. 57 | # 📚 https://git.io/JvXDl 58 | 59 | # ✏️ If the Autobuild fails above, remove it and uncomment the following three lines 60 | # and modify them (or add more) to build your code if your project 61 | # uses a compiled language 62 | 63 | #- run: | 64 | # make bootstrap 65 | # make release 66 | 67 | - name: Perform CodeQL Analysis 68 | uses: github/codeql-action/analyze@v4 69 | -------------------------------------------------------------------------------- /.github/workflows/python.yml: -------------------------------------------------------------------------------- 1 | name: Python Lint 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | aws: 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | python-version: ['3.x'] 15 | 16 | steps: 17 | - uses: actions/checkout@v5 18 | - name: Set up Python ${{ matrix.python-version }} 19 | uses: actions/setup-python@v6 20 | with: 21 | python-version: ${{ matrix.python-version }} 22 | - name: Install dependencies 23 | run: | 24 | cd AWS 25 | python -m pip install -r requirements.txt 26 | pip install flake8 pylint bandit 27 | - name: Lint with flake8 28 | run: | 29 | cd AWS 30 | flake8 aws_cspm_benchmark.py 31 | - name: Lint with pylint 32 | run: | 33 | cd AWS 34 | pylint aws_cspm_benchmark.py 35 | - name: Lint with bandit 36 | run: | 37 | cd AWS 38 | bandit -l -i -r . 
39 | 40 | azure: 41 | runs-on: ubuntu-latest 42 | strategy: 43 | matrix: 44 | python-version: ['3.x'] 45 | steps: 46 | - uses: actions/checkout@v5 47 | - name: Set up Python ${{ matrix.python-version }} 48 | uses: actions/setup-python@v6 49 | with: 50 | python-version: ${{ matrix.python-version }} 51 | - name: Install dependencies 52 | run: | 53 | cd Azure 54 | python -m pip install -r requirements.txt 55 | pip install flake8 pylint bandit 56 | - name: Lint with flake8 57 | run: | 58 | cd Azure 59 | flake8 azure_cspm_benchmark.py 60 | - name: Lint with pylint 61 | run: | 62 | cd Azure 63 | pylint azure_cspm_benchmark.py 64 | - name: Lint with bandit 65 | run: | 66 | cd Azure 67 | bandit -l -i -r . 68 | 69 | gcp: 70 | runs-on: ubuntu-latest 71 | strategy: 72 | matrix: 73 | python-version: ['3.x'] 74 | steps: 75 | - uses: actions/checkout@v5 76 | - name: Set up Python ${{ matrix.python-version }} 77 | uses: actions/setup-python@v6 78 | with: 79 | python-version: ${{ matrix.python-version }} 80 | - name: Install dependencies 81 | run: | 82 | cd GCP 83 | python -m pip install -r requirements.txt 84 | pip install flake8 pylint bandit 85 | - name: Lint with flake8 86 | run: | 87 | cd GCP 88 | flake8 gcp_cspm_benchmark.py 89 | - name: Lint with pylint 90 | run: | 91 | cd GCP 92 | pylint gcp_cspm_benchmark.py 93 | - name: Lint with bandit 94 | run: | 95 | cd GCP 96 | bandit -l -i -r . 97 | -------------------------------------------------------------------------------- /DEVELOPMENT.md: -------------------------------------------------------------------------------- 1 | # Developer Guide 2 | 3 | This guide is intended to provide an overview of the CrowdStrike CWP / Horizon Benchmark Utilities project and explain how to contribute to the development of the benchmark scripts for AWS, Azure, and GCP. 4 | 5 | ## Project Overview 6 | 7 | The project aims to provide a set of scripts for auditing cloud resources across AWS, Azure, and GCP. 
The main `benchmark.sh` script handles argument parsing, checking for Python3 and pip installations, and running the appropriate benchmarking script for each supported cloud provider. The benchmarking scripts themselves are written in Python, and the main script downloads the necessary files and installs Python dependencies before running them. 8 | 9 | ## Directory Structure 10 | 11 | The project is structured as follows: 12 | 13 | ```terminal 14 | . 15 | ├── AWS 16 | │ ├── README.md 17 | │ ├── requirements.txt 18 | │ └── aws_cspm_benchmark.py 19 | ├── Azure 20 | │ ├── README.md 21 | │ ├── requirements.txt 22 | │ └── azure_cspm_benchmark.py 23 | ├── GCP 24 | │ ├── README.md 25 | │ ├── requirements.txt 26 | │ └── gcp_cspm_benchmark.py 27 | └── benchmark.sh 28 | ``` 29 | 30 | Each cloud provider has its own directory, containing a README file, a requirements.txt file for Python dependencies, and the corresponding benchmark script. 31 | 32 | ## Contributing to the Benchmark Scripts 33 | 34 | To contribute to the development of the benchmark scripts, follow these steps: 35 | 36 | 1. **Fork the repository**: Create a fork of the main repository on your GitHub account. 37 | 38 | 2. **Clone your fork**: Clone your fork of the repository to your local machine. 39 | 40 | 3. **Set up a virtual environment**: It's a good practice to set up a virtual environment for your development work. You can do this by running: 41 | 42 | ```shell 43 | python3 -m venv ./cloud-benchmark-dev 44 | source ./cloud-benchmark-dev/bin/activate 45 | ``` 46 | 47 | 4. **Install Python dependencies**: Install the necessary Python dependencies for the cloud provider you're working on: 48 | 49 | ```shell 50 | pip3 install -r path/to/provider/requirements.txt 51 | ``` 52 | 53 | 5. **Modify the benchmark script**: Make changes to the appropriate benchmark script (e.g., `aws_cspm_benchmark.py`, `azure_cspm_benchmark.py`, or `gcp_cspm_benchmark.py`) according to your contribution. 54 | 55 | 6. 
**Test your changes**: Run the modified benchmark script to test your changes and ensure they work as expected. 56 | 57 | 7. **Commit and push your changes**: Commit your changes to your fork and push them to your remote GitHub repository. 58 | 59 | 8. **Create a pull request**: Open a pull request to merge your changes into the main repository. 60 | 61 | ## Coding Guidelines 62 | 63 | When contributing to the benchmark scripts, keep these coding guidelines in mind: 64 | 65 | - Follow the [PEP 8 style guide](https://www.python.org/dev/peps/pep-0008/) for Python code. 66 | - Use meaningful variable and function names. 67 | - Include docstrings for functions and classes to explain their purpose and usage. 68 | - Keep functions small and focused on a single task. 69 | 70 | By following these guidelines and the contribution steps outlined above, you can help improve this project and make it more useful for everyone. 71 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in the cloud-benchmark directory 2 | cloud-benchmark/* 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | pip-wheel-metadata/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 98 | __pypackages__/ 99 | 100 | # Celery stuff 101 | celerybeat-schedule 102 | celerybeat.pid 103 | 104 | # SageMath parsed files 105 | *.sage.py 106 | 107 | # Environments 108 | .env 109 | .venv 110 | env/ 111 | venv/ 112 | ENV/ 113 | env.bak/ 114 | venv.bak/ 115 | 116 | # Spyder project settings 117 | .spyderproject 118 | .spyproject 119 | 120 | # Rope project settings 121 | .ropeproject 122 | 123 | # mkdocs documentation 124 | /site 125 | 126 | # mypy 127 | .mypy_cache/ 128 | .dmypy.json 129 | dmypy.json 130 | 131 | # Pyre type checker 132 | .pyre/ 133 | 134 | # Visual Studio Code 135 | # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore 136 | # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore 137 | # and can be added to the global gitignore or merged into this file. However, if you prefer, 138 | # you could uncomment the following to ignore the entire vscode folder 139 | .vscode/ 140 | 141 | # Ruff stuff: 142 | .ruff_cache/ 143 | 144 | # PyPI configuration file 145 | .pypirc 146 | 147 | # AI Assistants 148 | # Various AI coding assistants create local cache, settings, and conversation history 149 | # These contain user-specific data and should not be committed to version control 150 | .cursorignore 151 | .cursorindexingignore 152 | .claude/ 153 | CLAUDE.md 154 | .anthropic/ 155 | .openai/ 156 | .codeium/ 157 | .tabnine/ 158 | .github-copilot/ 159 | .roo/ 160 | .aider/ 161 | .aider* 162 | .clinerules/ 163 | memory-bank/ 164 | -------------------------------------------------------------------------------- /GCP/README.md: -------------------------------------------------------------------------------- 1 | # Cloud-Benchmark - GCP 2 | 3 | This script is a read-only utility that counts cloud resources in your GCP account. It will autodiscover all GCP projects. 4 | 5 | No changes will be made to your account. 
No data will be sent anywhere and will remain in your cloud shell environment. 6 | 7 | ## How to use 8 | 9 | ### Initialize execution environment 10 | 11 | [![Open GCP Cloud Shell](https://gstatic.com/cloudssh/images/open-btn.svg)](https://shell.cloud.google.com/cloudshell/editor?cloudshell_git_repo=https%3A%2F%2Fgithub.com%2FCrowdStrike%2Fcloud-resource-estimator) 12 | 13 | ### Run the script 14 | 15 | ```shell 16 | ./benchmark.sh 17 | ``` 18 | 19 | ## Project Filtering 20 | 21 | The GCP script automatically excludes projects with `sys-*` prefixes by default. This improves performance because Google Apps Script commonly creates projects with this naming pattern that don't contain billable compute resources relevant for CSPM benchmarking. 22 | 23 | If you don't use Google Apps Script, or have legitimate projects that start with `sys-*` that should be included in the scan, you can enable scanning of these projects. 24 | 25 | ### Filtering Environment Variables 26 | 27 | **sys-* Projects:** 28 | 29 | ```bash 30 | # Include projects starting with sys-* (default: false) 31 | export GCP_ENABLE_SYS_PROJECTS=true 32 | ./benchmark.sh gcp 33 | ``` 34 | 35 | **Pattern-Based Filtering:** 36 | 37 | ```bash 38 | # Only scan projects matching these patterns (allowlist) 39 | export GCP_INCLUDE_PATTERNS="prod-*,production-*" 40 | ./benchmark.sh gcp 41 | 42 | # Skip projects matching these patterns (denylist) 43 | export GCP_EXCLUDE_PATTERNS="dev-*,test-*,*-sandbox,tmp-*" 44 | ./benchmark.sh gcp 45 | 46 | # Combine multiple filters 47 | export GCP_EXCLUDE_PATTERNS="dev-*,test-*" 48 | export GCP_ENABLE_SYS_PROJECTS=false 49 | ./benchmark.sh gcp 50 | ``` 51 | 52 | ### Common Filtering Examples 53 | 54 | **Production Only:** 55 | 56 | ```bash 57 | export GCP_INCLUDE_PATTERNS="prod-*,production-*,*-prod" 58 | ``` 59 | 60 | **Skip Development/Test:** 61 | 62 | ```bash 63 | export GCP_EXCLUDE_PATTERNS="dev-*,test-*,staging-*,*-dev,*-test,*-stage" 64 | ``` 65 | 66 | **Skip 
Personal/Temporary Projects:** 67 | 68 | ```bash 69 | export GCP_EXCLUDE_PATTERNS="tmp-*,temp-*,*-tmp,poc-*,experiment-*" 70 | ``` 71 | 72 | ## Performance Configuration 73 | 74 | The GCP script supports parallel processing and configurable performance settings for faster execution: 75 | 76 | ### Performance Environment Variables 77 | 78 | **Threading & Batching:** 79 | 80 | ```bash 81 | # Number of concurrent project processors (default: 3, recommended: 3-5) 82 | export GCP_THREADS=3 83 | 84 | # Projects per batch (default: 20) 85 | export GCP_BATCH_SIZE=20 86 | 87 | # Delay between batches in seconds (default: 10) 88 | export GCP_BATCH_DELAY=10 89 | 90 | # Delay between API calls in seconds (default: 0.05) 91 | export GCP_API_DELAY=0.05 92 | ``` 93 | 94 | ### Performance Examples 95 | 96 | **Fast Processing (for smaller organizations):** 97 | 98 | ```bash 99 | export GCP_THREADS=5 100 | export GCP_API_DELAY=0.01 101 | export GCP_BATCH_DELAY=5 102 | ./benchmark.sh gcp 103 | ``` 104 | 105 | **Rate-Limited Processing (for large organizations):** 106 | 107 | ```bash 108 | export GCP_THREADS=3 109 | export GCP_API_DELAY=0.1 110 | export GCP_BATCH_DELAY=15 111 | ./benchmark.sh gcp 112 | ``` 113 | 114 | **Maximum Performance (use with caution):** 115 | 116 | ```bash 117 | export GCP_THREADS=5 118 | export GCP_API_DELAY=0 119 | export GCP_BATCH_DELAY=0 120 | ./benchmark.sh gcp 121 | ``` 122 | 123 | > [!WARNING] 124 | > Setting `GCP_API_DELAY=0` and `GCP_BATCH_DELAY=0` may trigger Google Cloud's rate limiting mechanisms, especially in organizations with many projects. If you encounter rate limit errors, increase these values. Start with conservative settings and adjust based on your organization's API quota limits. 125 | 126 | ### Rate Limiting Considerations 127 | 128 | Google Cloud APIs have quotas and rate limits that vary by service and organization. 
When scanning large numbers of projects: 129 | 130 | - **Start Conservative**: Use default settings first (`GCP_THREADS=3`, `GCP_API_DELAY=0.05`) 131 | - **Monitor for Errors**: Watch for "quota exceeded" or "rate limit exceeded" errors in the output 132 | - **Adjust Gradually**: If you encounter rate limits, increase `GCP_API_DELAY` and `GCP_BATCH_DELAY` values 133 | - **Organization Size Matters**: Larger organizations should use more conservative settings 134 | 135 | **Recommended Settings by Organization Size:** 136 | 137 | - **Small** (< 50 projects): Default settings work well 138 | - **Medium** (50-200 projects): `GCP_THREADS=3`, `GCP_API_DELAY=0.1` 139 | - **Large** (200+ projects): `GCP_THREADS=3`, `GCP_API_DELAY=0.15`, `GCP_BATCH_DELAY=15` 140 | 141 | ### Collect the findings 142 | 143 | ```shell 144 | cat ./cloud-benchmark/*benchmark.csv 145 | ``` 146 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![CrowdStrike Logo (Light)](https://raw.githubusercontent.com/CrowdStrike/.github/main/assets/cs-logo-light-mode.png#gh-light-mode-only) 2 | ![CrowdStrike Logo (Dark)](https://raw.githubusercontent.com/CrowdStrike/.github/main/assets/cs-logo-dark-mode.png#gh-dark-mode-only) 3 | 4 | # CrowdStrike Cloud Resource Estimator 5 | 6 | This multi-cloud resource auditing utility helps organizations calculate the size of their cloud deployments across AWS, Azure, and Google Cloud Platform. It's designed for **CrowdStrike CWP/Horizon licensing calculations** and cloud security posture management (CSPM) benchmarking. 
7 | 8 | ## What This Tool Does 9 | 10 | The Cloud Resource Estimator performs **read-only** scanning of your cloud infrastructure to count: 11 | 12 | - Virtual machines and compute instances 13 | - Container services (ECS, AKS, GKE) 14 | - Serverless functions and managed services 15 | - Other billable resources relevant for CSPM licensing 16 | 17 | **No changes are made to your cloud environment** - this is strictly an auditing and counting tool. 18 | 19 | ## Running an audit 20 | 21 | The `benchmark.sh` entrypoint script helps you to perform sizing calculations for your cloud resources. It detects the cloud provider (AWS, Azure, or GCP) and downloads the necessary scripts to perform the calculation. You can also pass one or more cloud providers as arguments. 22 | 23 | ## Configuration 24 | 25 | Each cloud provider supports various environment variables for performance tuning and filtering. Below are the key AWS configuration options (for complete configuration details, see the provider-specific README files): 26 | 27 | ### AWS Configuration 28 | 29 | | Variable | Default | Description | 30 | | :--- | :--- | :--- | 31 | | `AWS_ASSUME_ROLE_NAME` | `OrganizationAccountAccessRole` | IAM role name for cross-account access | 32 | | `AWS_REGIONS` | All regions | Comma-separated list of regions to scan | 33 | | `AWS_THREADS` | `5` | Number of concurrent account threads | 34 | | `AWS_BATCH_SIZE` | `20` | Accounts processed per batch | 35 | | `AWS_BATCH_DELAY` | `30` | Seconds to wait between batches | 36 | | `AWS_API_DELAY` | `0.1` | Seconds to wait between API calls | 37 | | `AWS_MAX_RETRIES` | `5` | Maximum retry attempts for failed operations | 38 | | `AWS_OPERATION_TIMEOUT` | `300` | Timeout for individual operations (seconds) | 39 | | `AWS_RESUME_FILE` | `aws_benchmark_progress.json` | Progress tracking file | 40 | | `AWS_SKIP_ACCOUNTS` | None | Comma-separated list of account IDs to skip | 41 | | `AWS_DRY_RUN` | `false` | Set to `true` to simulate without API calls 
| 42 | 43 | To use, please export variables in your environment prior to running the script: 44 | 45 | ```shell 46 | export AWS_ASSUME_ROLE_NAME="Example-Role-Name" 47 | ``` 48 | 49 | ### Azure and GCP Configuration 50 | 51 | Azure and GCP also support performance tuning and filtering options. For complete configuration details: 52 | 53 | - **Azure**: See [Azure README](Azure/README.md) for subscription filtering and performance settings 54 | - **GCP**: See [GCP README](GCP/README.md) for project filtering (including sys-* project handling) and threading options 55 | 56 | **Note**: see [AWS Readme](AWS/README.md) for detailed configuration options. 57 | 58 | ## Usage 59 | 60 | ```shell 61 | ./benchmark.sh [aws|azure|gcp]... 62 | ``` 63 | 64 | Below are two different ways to execute the script. 65 | 66 | ### In Cloud Shell 67 | 68 | To execute the script in your environment using Cloud Shell, follow the appropriate guide based on your cloud provider: 69 | 70 | - [AWS](AWS/README.md) 71 | - [Azure](Azure/README.md) 72 | - [GCP](GCP/README.md) 73 | 74 | ### In your Local Environment 75 | 76 | For those who prefer to run the script locally, or would like to run the script against more than one cloud provider at a time, follow the instructions below: 77 | 78 | #### Requirements 79 | 80 | - Python 3 81 | - pip 82 | - curl 83 | - Appropriate cloud provider CLI ([AWS](https://aws.amazon.com/cli/), [Azure](https://learn.microsoft.com/en-us/cli/azure/install-azure-cli), [GCP](https://cloud.google.com/sdk/docs/install)) 84 | 85 | #### Steps 86 | 87 | 1. Download the script: 88 | 89 | ```shell 90 | curl -O https://raw.githubusercontent.com/CrowdStrike/cloud-resource-estimator/main/benchmark.sh 91 | ``` 92 | 93 | 1. Set execution permissions: 94 | 95 | ```shell 96 | chmod +x benchmark.sh 97 | ``` 98 | 99 | 1. 
Example: Run the script against AWS and Azure: 100 | 101 | ```shell 102 | ./benchmark.sh aws azure 103 | ``` 104 | 105 | --- 106 | 107 | **Alternatively, you can run the script directly from the URL:** 108 | 109 | - Run the script against AWS and Azure: 110 | 111 | ```shell 112 | curl https://raw.githubusercontent.com/CrowdStrike/cloud-resource-estimator/main/benchmark.sh | bash -s -- aws azure 113 | ``` 114 | 115 | - Run the script and let it determine the available cloud providers: 116 | 117 | ```shell 118 | curl https://raw.githubusercontent.com/CrowdStrike/cloud-resource-estimator/main/benchmark.sh | bash 119 | ``` 120 | 121 | ## Development 122 | 123 | Please review our [Developer Guide](DEVELOPMENT.md) for more information on how to contribute to this project. 124 | 125 | ## License 126 | 127 | These scripts are provided to the community, for free, under the Unlicense license. As such, these scripts 128 | carry no formal support, express or implied. 129 | 130 | ## Questions? 131 | 132 | Please review our [Code of Conduct](CODE_OF_CONDUCT.md) and then submit an issue or pull request. 133 | We will address the issue as quickly as possible. 134 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Cloud Benchmark Community Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 
14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 
55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | CrowdStrike. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 
99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. -------------------------------------------------------------------------------- /AWS/README.md: -------------------------------------------------------------------------------- 1 | # Cloud-Benchmark - AWS 2 | 3 | This script is a read-only utility that counts cloud resources in your AWS account. If you run this in your organization account, it will discover resources in all accounts in your organization. 4 | 5 | No changes will be made to your account. No data will be sent anywhere; all results remain in your cloud shell environment.
6 | 7 | ## 🔧 How it works 8 | This script can run against an individual AWS account or all child accounts in an AWS Organization. When running the script in CloudShell, it will establish the session using the AWS Identity currently signed in. When running the script in your local environment, it will establish the session based on your AWS CLI configuration. Please see [Local Environment Instructions](../README.md) for more details. If your AWS Identity is in the AWS Organization Management account, the script will use the default role `OrganizationAccountAccessRole` (or a custom role, if provided) to switch into each child account. If your AWS Identity is not in an AWS Organization Management account, the script will only process resources in that single account. Upon completion, a CSV report is generated with the findings. 9 | 10 | ### Reported Resources 11 | Reported Resources will include a count of each of the following resource types per AWS Region: 12 | 13 | | Resource | Description | 14 | | :--- | :--- | 15 | | Terminated VMs | Terminated EC2 Instances | 16 | | Running VMs | Running EC2 Instances | 17 | | Terminated Kubernetes Nodes | Terminated EKS Nodes | 18 | | Running Kubernetes Nodes | Running EKS Nodes | 19 | | Active EKS Fargate Profiles | Active EKS Fargate Profiles for each EKS Cluster. Excludes any existing Falcon Profiles (e.g., `fp-falcon*`) | 20 | | ECS Service Fargate Tasks | DesiredCount of tasks in Active ECS Services.
Excludes standalone tasks or tasks that are scheduled outside of Services | 21 | 22 | ## ▶️ Usage 23 | 24 | ### Initialize execution environment 25 | 26 | Open AWS Cloud Shell ([overview](https://aws.amazon.com/cloudshell/), [documentation](https://docs.aws.amazon.com/cloudshell/latest/userguide/welcome.html)) using one of the direct links: 27 | 28 | | Region | Link | 29 | | :--- | :--- | 30 | | us-east-1 | **[Virginia, United States](https://us-east-1.console.aws.amazon.com/cloudshell/home?region=us-east-1)** | 31 | | us-east-2 | **[Ohio, United States](https://us-east-2.console.aws.amazon.com/cloudshell/home?region=us-east-2)** | 32 | | us-west-2 | **[Oregon, United States](https://us-west-2.console.aws.amazon.com/cloudshell/home?region=us-west-2)** | 33 | | eu-west-1 | **[Ireland](https://eu-west-1.console.aws.amazon.com/cloudshell/home?region=eu-west-1)** | 34 | | ap-northeast-1 | **[Tokyo, Japan](https://ap-northeast-1.console.aws.amazon.com/cloudshell/home?region=ap-northeast-1)** | 35 | 36 | ### Export Environment Variables 37 | 38 | ```shell 39 | export AWS_ASSUME_ROLE_NAME="Example-Role-Name" 40 | ``` 41 | 42 | ### Execute Script 43 | 44 | ```shell 45 | curl https://raw.githubusercontent.com/CrowdStrike/cloud-resource-estimator/main/benchmark.sh | bash 46 | ``` 47 | 48 | ### Collect the findings 49 | 50 | ```shell 51 | cat ./cloud-benchmark/*benchmark*.csv 52 | ``` 53 | 54 | ## ⚙️ Features & Configuration 55 | 56 | ### Complete Environment Variables 57 | 58 | | Variable | Default | Description | 59 | | :--- | :--- | :--- | 60 | | `AWS_ASSUME_ROLE_NAME` | `OrganizationAccountAccessRole` | IAM role name for cross-account access | 61 | | `AWS_REGIONS` | All regions | Comma-separated list of regions to scan | 62 | | `AWS_THREADS` | `5` | Number of concurrent account threads | 63 | | `AWS_BATCH_SIZE` | `20` | Accounts processed per batch | 64 | | `AWS_BATCH_DELAY` | `30` | Seconds to wait between batches | 65 | | `AWS_API_DELAY` | `0.1` | Seconds to wait 
between API calls | 66 | | `AWS_MAX_RETRIES` | `5` | Maximum retry attempts for failed operations | 67 | | `AWS_OPERATION_TIMEOUT` | `300` | Timeout for individual operations (seconds) | 68 | | `AWS_RESUME_FILE` | `aws_benchmark_progress.json` | Progress tracking file | 69 | | `AWS_SKIP_ACCOUNTS` | None | Comma-separated list of account IDs to skip | 70 | | `AWS_DRY_RUN` | `false` | Set to `true` to simulate without API calls | 71 | 72 | ### Configuration Recommendations per Organization Size 73 | 74 | #### Standard Processing for Small Organizations (< 50 accounts) 75 | ```shell 76 | # Default settings work well - no configuration needed 77 | curl https://raw.githubusercontent.com/CrowdStrike/cloud-resource-estimator/main/benchmark.sh | bash 78 | ``` 79 | 80 | #### Fast Processing for Small Organizations (< 50 accounts) 81 | ```shell 82 | export AWS_THREADS=8 83 | export AWS_BATCH_SIZE=50 84 | export AWS_BATCH_DELAY=15 85 | export AWS_API_DELAY=0.05 86 | curl https://raw.githubusercontent.com/CrowdStrike/cloud-resource-estimator/main/benchmark.sh | bash 87 | ``` 88 | 89 | #### Medium Organizations (50-200 accounts) 90 | ```shell 91 | export AWS_THREADS=4 92 | export AWS_BATCH_SIZE=15 93 | export AWS_BATCH_DELAY=30 94 | curl https://raw.githubusercontent.com/CrowdStrike/cloud-resource-estimator/main/benchmark.sh | bash 95 | ``` 96 | 97 | #### Large Organizations (200+ accounts) 98 | ```shell 99 | export AWS_THREADS=2 100 | export AWS_BATCH_SIZE=10 101 | export AWS_BATCH_DELAY=60 102 | export AWS_API_DELAY=0.2 103 | curl https://raw.githubusercontent.com/CrowdStrike/cloud-resource-estimator/main/benchmark.sh | bash 104 | ``` 105 | 106 | ### Resume Interrupted Runs 107 | 108 | If the script times out or is interrupted, it automatically saves progress and can be resumed: 109 | 110 | ```shell 111 | # The script will automatically resume from where it left off 112 | curl https://raw.githubusercontent.com/CrowdStrike/cloud-resource-estimator/main/benchmark.sh | bash
113 | ``` 114 | 115 | The script will display progress and automatically skip completed accounts. 116 | 117 | ### Other Usage Options 118 | 119 | #### Scan specific regions only 120 | ```shell 121 | export AWS_REGIONS="us-east-1,us-west-2,eu-west-1" 122 | ``` 123 | 124 | #### Skip Problematic Accounts 125 | ```shell 126 | python aws_cspm_benchmark.py \ 127 | --skip-accounts "123456789012,234567890123,345678901234" 128 | ``` 129 | 130 | #### Dry Run to Preview Processing 131 | ```shell 132 | python aws_cspm_benchmark.py --dry-run 133 | ``` 134 | -------------------------------------------------------------------------------- /Azure/azure_cspm_benchmark.py: -------------------------------------------------------------------------------- 1 | """ 2 | azure-cspm-benchmark.py 3 | 4 | Assists with provisioning calculations by retrieving a count 5 | of all billable resources attached to an Azure subscription. 6 | """ 7 | 8 | import csv 9 | import logging 10 | 11 | from functools import cached_property, lru_cache 12 | from azure.identity import AzureCliCredential 13 | from azure.mgmt.resource import ResourceManagementClient, SubscriptionClient 14 | from azure.mgmt.containerservice import ContainerServiceClient 15 | from azure.mgmt.compute import ComputeManagementClient 16 | from azure.mgmt.containerinstance import ContainerInstanceManagementClient 17 | import msrestazure.tools 18 | from tabulate import tabulate 19 | 20 | headers = { 21 | 'tenant_id': 'Azure Tenant ID', 22 | 'subscription_id': 'Azure Subscription ID', 23 | 'aks_nodes': 'Kubernetes Nodes', 24 | 'vms': 'Virtual Machines', 25 | 'aci_containers': 'Container Instances' 26 | } 27 | 28 | 29 | class AzureHandle: 30 | def __init__(self): 31 | # Acquire a credential object using CLI-based authentication. 
32 | self.creds = AzureCliCredential() 33 | 34 | @cached_property 35 | def subscriptions(self): 36 | return list(self.subscription_client.subscriptions.list()) 37 | 38 | @property 39 | def tenants(self): 40 | return list(self.subscription_client.tenants.list()) 41 | 42 | def aci_resources(self, subscription_id): 43 | client = self.resource_client(subscription_id) 44 | return client.resources.list(filter="resourceType eq 'microsoft.containerinstance/containergroups'") 45 | 46 | def aks_resources(self, subscription_id): 47 | client = self.resource_client(subscription_id) 48 | return client.resources.list(filter="resourceType eq 'microsoft.containerservice/managedclusters'") 49 | 50 | def vmss_resources(self, subscription_id): 51 | client = self.resource_client(subscription_id) 52 | return client.resources.list(filter="resourceType eq 'Microsoft.Compute/virtualMachineScaleSets'") 53 | 54 | def vms_resources(self, subscription_id): 55 | client = self.resource_client(subscription_id) 56 | return client.resources.list(filter="resourceType eq 'Microsoft.Compute/virtualMachines'") 57 | 58 | def managed_clusters(self, subscription_id): 59 | return self.container_client(subscription_id).managed_clusters.list() 60 | 61 | def rhos_clusters(self, subscription_id): 62 | return self.container_client(subscription_id).open_shift_managed_clusters.list() 63 | 64 | def container_vmss(self, aks_resource): 65 | parsed_id = msrestazure.tools.parse_resource_id(aks_resource.id) 66 | client = self.container_client(parsed_id['subscription']) 67 | return client.agent_pools.list(resource_group_name=parsed_id['resource_group'], 68 | resource_name=parsed_id['resource_name']) 69 | 70 | def container_aci(self, aci_resource): 71 | parsed_id = msrestazure.tools.parse_resource_id(aci_resource.id) 72 | client = self.container_instance_client(parsed_id['subscription']) 73 | return client.container_groups.get(resource_group_name=parsed_id['resource_group'], 74 | 
container_group_name=parsed_id['resource_name']).containers 75 | 76 | def vms_inside_vmss(self, vmss_resource): 77 | parsed_id = msrestazure.tools.parse_resource_id(vmss_resource.id) 78 | client = ComputeManagementClient(self.creds, parsed_id['subscription']) 79 | return client.virtual_machine_scale_set_vms.list(resource_group_name=parsed_id['resource_group'], 80 | virtual_machine_scale_set_name=vmss_resource.name) 81 | 82 | @lru_cache 83 | def container_client(self, subscription_id): 84 | return ContainerServiceClient(self.creds, subscription_id) 85 | 86 | @lru_cache 87 | def container_instance_client(self, subscription_id): 88 | return ContainerInstanceManagementClient(self.creds, subscription_id) 89 | 90 | @lru_cache 91 | def resource_client(self, subscription_id): 92 | return ResourceManagementClient(self.creds, subscription_id) 93 | 94 | @cached_property 95 | def subscription_client(self): 96 | return SubscriptionClient(self.creds) 97 | 98 | 99 | LOG_LEVEL = logging.INFO 100 | LOG_LEVEL = logging.DEBUG 101 | log = logging.getLogger('azure') 102 | log.setLevel(LOG_LEVEL) 103 | ch = logging.StreamHandler() 104 | ch.setLevel(LOG_LEVEL) 105 | formatter = logging.Formatter('%(asctime)s %(levelname)-8s %(message)s', '%Y-%m-%d %H:%M:%S') 106 | ch.setFormatter(formatter) 107 | log.addHandler(ch) 108 | 109 | for mod in ['azure.identity._internal.decorators', 'azure.core.pipeline.policies.http_logging_policy']: 110 | logging.getLogger(mod).setLevel(logging.WARNING) 111 | 112 | 113 | data = [] 114 | totals = {'tenant_id': 'totals', 'subscription_id': 'totals', 'aks_nodes': 0, 'vms': 0, 'aci_containers': 0} 115 | az = AzureHandle() 116 | 117 | log.info("You have access to %d subscription(s) within %s tenant(s)", len(az.subscriptions), len(az.tenants)) 118 | for subscription in az.subscriptions: 119 | row = {'tenant_id': subscription.tenant_id, 'subscription_id': subscription.subscription_id, 120 | 'aks_nodes': 0, 'vms': 0, 'aci_containers': 0} 121 | log.info("Processing 
Azure subscription: %s (id=%s)", subscription.display_name, subscription.subscription_id) 122 | 123 | vmss_list = list(az.vmss_resources(subscription.subscription_id)) 124 | 125 | # (1) Process AKS 126 | for aks in az.aks_resources(subscription.subscription_id): 127 | for node_pool in az.container_vmss(aks): 128 | log.info("Identified node pool: '%s' within AKS: '%s' with %d node(s)", 129 | node_pool.name, aks.name, node_pool.count) 130 | row['aks_nodes'] += node_pool.count 131 | 132 | # (2) Process VMSS 133 | for vmss in az.vmss_resources(subscription.subscription_id): 134 | if vmss.tags is not None and 'aks-managed-createOperationID' in vmss.tags: 135 | # AKS resources already accounted for above 136 | continue 137 | 138 | vm_count = sum(1 for vm in az.vms_inside_vmss(vmss)) 139 | log.info("Identified %d vm resource(s) inside Scale Set: '%s'", vm_count, vmss.name) 140 | row['vms'] += vm_count 141 | 142 | # # (3) Process ACI 143 | for aci in az.aci_resources(subscription.subscription_id): 144 | container_count = sum(1 for container in az.container_aci(aci)) 145 | log.info("Identified %d container resource(s) inside Container Group: '%s'", container_count, aci.name) 146 | row['aci_containers'] += container_count 147 | 148 | # (4) Process VMs 149 | vm_count = sum((1 for vm in az.vms_resources(subscription.subscription_id))) 150 | log.info('Identified %d vm resource(s) outside of Scale Sets', vm_count) 151 | row['vms'] += vm_count 152 | data.append(row) 153 | 154 | totals['vms'] += row['vms'] 155 | totals['aks_nodes'] += row['aks_nodes'] 156 | totals['aci_containers'] += row['aci_containers'] 157 | 158 | data.append(totals) 159 | 160 | # Output our results 161 | print(tabulate(data, headers=headers, tablefmt="grid")) 162 | 163 | with open('az-benchmark.csv', 'w', newline='', encoding='utf-8') as csv_file: 164 | csv_writer = csv.DictWriter(csv_file, fieldnames=headers.keys()) 165 | csv_writer.writeheader() 166 | csv_writer.writerows(data) 167 | 168 | log.info("CSV 
summary has been exported to ./az-benchmark.csv file") 169 | -------------------------------------------------------------------------------- /benchmark.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Universal cloud provider provisioning calculator 3 | # Based on the cloud provider, downloads the necessary scripts 4 | # to perform a sizing calculation. 5 | 6 | base_url=https://raw.githubusercontent.com/CrowdStrike/Cloud-Benchmark/main 7 | 8 | # Usage message 9 | usage() { 10 | echo """ 11 | Usage: $0 [OPTIONS] [aws|azure|gcp]... 12 | 13 | The script recognizes the following environment variables: 14 | 15 | AWS: 16 | - AWS_ASSUME_ROLE_NAME: The name of the AWS role to assume (optional) 17 | - AWS_REGIONS: The name of the AWS Region to target or a comma-delimited list of AWS Regions to target (optional) 18 | - AWS_THREADS: Number of worker threads for parallel processing (default: 5) 19 | - AWS_BATCH_SIZE: Number of accounts to process per batch (default: 20) 20 | - AWS_BATCH_DELAY: Delay in seconds between batches (default: 30) 21 | - AWS_API_DELAY: Delay in seconds between API calls (default: 0.1) 22 | - AWS_MAX_RETRIES: Maximum retry attempts for failed operations (default: 5) 23 | - AWS_OPERATION_TIMEOUT: Timeout in seconds for individual operations (default: 300) 24 | - AWS_RESUME_FILE: File to store/resume progress (default: aws_benchmark_progress.json) 25 | - AWS_SKIP_ACCOUNTS: Comma-separated list of account IDs to skip 26 | - AWS_DRY_RUN: Set to 'true' to show what would be processed without making API calls 27 | 28 | Example for large organizations (200+ accounts): 29 | export AWS_THREADS=3 30 | export AWS_BATCH_SIZE=12 31 | export AWS_BATCH_DELAY=45 32 | export AWS_API_DELAY=0.15 33 | """ 34 | } 35 | 36 | # Check if the system has Python3 and pip installed 37 | check_python3() { 38 | if ! type python3 >/dev/null 2>&1; then 39 | echo "Python3 not found. Please install Python3 and try again." 
40 | exit 1 41 | fi 42 | if ! type pip3 >/dev/null 2>&1; then 43 | echo "Pip not found. Please install pip and try again." 44 | exit 1 45 | fi 46 | } 47 | 48 | # Ensures the provided cloud provider arg is valid 49 | is_valid_cloud() { 50 | local cloud="$1" 51 | local lower_cloud 52 | lower_cloud=$(echo "$cloud" | tr '[:upper:]' '[:lower:]') 53 | 54 | case "$lower_cloud" in 55 | aws) 56 | echo "AWS" 57 | return 0 58 | ;; 59 | azure) 60 | echo "Azure" 61 | return 0 62 | ;; 63 | gcp) 64 | echo "GCP" 65 | return 0 66 | ;; 67 | *) 68 | return 1 69 | ;; 70 | esac 71 | } 72 | 73 | # Calls the python script for the specified cloud provider with the 74 | # appropriate arguments 75 | call_benchmark_script() { 76 | local cloud="$1" 77 | local file="$2" 78 | local args=() 79 | 80 | case "$cloud" in 81 | AWS) 82 | [[ -n $AWS_ASSUME_ROLE_NAME ]] && args+=("-r" "$AWS_ASSUME_ROLE_NAME") 83 | [[ -n $AWS_REGIONS ]] && args+=("-R" "$AWS_REGIONS") 84 | [[ -n $AWS_THREADS ]] && args+=("--threads" "$AWS_THREADS") 85 | [[ -n $AWS_BATCH_SIZE ]] && args+=("--batch-size" "$AWS_BATCH_SIZE") 86 | [[ -n $AWS_BATCH_DELAY ]] && args+=("--batch-delay" "$AWS_BATCH_DELAY") 87 | [[ -n $AWS_API_DELAY ]] && args+=("--api-delay" "$AWS_API_DELAY") 88 | [[ -n $AWS_MAX_RETRIES ]] && args+=("--max-retries" "$AWS_MAX_RETRIES") 89 | [[ -n $AWS_OPERATION_TIMEOUT ]] && args+=("--operation-timeout" "$AWS_OPERATION_TIMEOUT") 90 | [[ -n $AWS_RESUME_FILE ]] && args+=("--resume-file" "$AWS_RESUME_FILE") 91 | [[ -n $AWS_SKIP_ACCOUNTS ]] && args+=("--skip-accounts" "$AWS_SKIP_ACCOUNTS") 92 | [[ -n $AWS_DRY_RUN ]] && [[ $AWS_DRY_RUN == "true" ]] && args+=("--dry-run") 93 | ;; 94 | Azure) 95 | ;; 96 | GCP) 97 | ;; 98 | *) 99 | echo "Invalid cloud provider specified: $cloud" 100 | usage 101 | exit 1 102 | ;; 103 | esac 104 | 105 | python3 "${file}" "${args[@]}" 106 | } 107 | 108 | audit() { 109 | CLOUD="$1" 110 | echo "Working in cloud: ${CLOUD}" 111 | cloud=$(echo "$CLOUD" | tr '[:upper:]' '[:lower:]') 112 | 113 | case 
"$CLOUD" in 114 | AWS) 115 | # Use local AWS script if available 116 | if [ -f "../AWS/aws_cspm_benchmark.py" ]; then 117 | echo "Using local AWS CSPM benchmark script..." 118 | file="../AWS/aws_cspm_benchmark.py" 119 | 120 | # Install requirements from local AWS directory 121 | if [ -f "../AWS/requirements.txt" ]; then 122 | python3 -m pip install --disable-pip-version-check -qq -r "../AWS/requirements.txt" 123 | else 124 | echo "AWS requirements.txt not found locally, downloading from remote" 125 | curl -s -o requirements.txt "${base_url}/${CLOUD}/requirements.txt" 126 | python3 -m pip install --disable-pip-version-check -qq -r requirements.txt 127 | fi 128 | else 129 | echo "Local AWS script not found, downloading from remote" 130 | curl -s -o requirements.txt "${base_url}/${CLOUD}/requirements.txt" 131 | echo "Installing python dependencies for communicating with ${CLOUD} into (~/cloud-benchmark)" 132 | python3 -m pip install --disable-pip-version-check -qq -r requirements.txt 133 | file="${cloud}_cspm_benchmark.py" 134 | curl -s -o "${file}" "${base_url}/${CLOUD}/${file}" 135 | fi 136 | ;; 137 | Azure|GCP) 138 | # Use remote scripts for Azure and GCP (unchanged behavior) 139 | curl -s -o requirements.txt "${base_url}/${CLOUD}/requirements.txt" 140 | echo "Installing python dependencies for communicating with ${CLOUD} into (~/cloud-benchmark)" 141 | python3 -m pip install --disable-pip-version-check -qq -r requirements.txt 142 | file="${cloud}_cspm_benchmark.py" 143 | curl -s -o "${file}" "${base_url}/${CLOUD}/${file}" 144 | ;; 145 | *) 146 | echo "Unsupported cloud provider: $CLOUD" 147 | exit 1 148 | ;; 149 | esac 150 | 151 | call_benchmark_script "$CLOUD" "${file}" 152 | } 153 | 154 | check_python3 155 | 156 | python3 -m venv ./cloud-benchmark 157 | pushd ./cloud-benchmark >/dev/null || exit 158 | # shellcheck source=/dev/null 159 | source ./bin/activate 160 | 161 | # MAIN ROUTINE 162 | found_provider=false 163 | 164 | # If arguments are provided, audit the 
specified providers 165 | for arg in "$@"; do 166 | result=$(is_valid_cloud "$arg") 167 | # shellcheck disable=SC2181 168 | if [ $? -eq 0 ]; then 169 | audit "$result" 170 | found_provider=true 171 | else 172 | echo "Invalid cloud provider specified: $arg" 173 | # Exit only if found_provider is false. This means that if the user 174 | # specifies a valid cloud provider, but also an invalid one, we will 175 | # still run the audit for the valid provider. 176 | if [ "$found_provider" = false ]; then 177 | usage 178 | popd >/dev/null && exit 1 179 | fi 180 | fi 181 | done 182 | 183 | # If no arguments provided, auto-detect the available cloud providers 184 | if [ $# -eq 0 ]; then 185 | echo "Determining cloud provider..." 186 | if type aws >/dev/null 2>&1; then 187 | audit "AWS" 188 | found_provider=true 189 | fi 190 | if type az >/dev/null 2>&1; then 191 | audit "Azure" 192 | found_provider=true 193 | fi 194 | 195 | if type gcloud >/dev/null 2>&1; then 196 | audit "GCP" 197 | found_provider=true 198 | fi 199 | fi 200 | 201 | if [ "$found_provider" = false ]; then 202 | echo "No supported cloud provider found." 203 | usage 204 | popd >/dev/null && exit 1 205 | fi 206 | 207 | popd >/dev/null || exit 208 | deactivate 209 | 210 | echo "Type following command to export cloud counts:" 211 | echo "cat ./cloud-benchmark/*benchmark.csv" 212 | 213 | # END 214 | # 215 | # -''--. 216 | # _`> `\.-'< 217 | # _.' _ '._ 218 | # .' _.=' '=._ '. 219 | # >_ / /_\ /_\ \ _< - jgs 220 | # / ( \o/\\o/ ) \ 221 | # >._\ .-,_)-. /_.< 222 | # /__/ \__\ 223 | # '---' E=mc^2 224 | # 225 | # 226 | -------------------------------------------------------------------------------- /GCP/gcp_cspm_benchmark.py: -------------------------------------------------------------------------------- 1 | """ 2 | gcp-cspm-benchmark.py 3 | 4 | Assists with provisioning calculations by retrieving a count 5 | of all billable resources attached to a GCP project. 
6 | """ 7 | 8 | import csv 9 | import fnmatch 10 | import logging 11 | import os 12 | import ssl 13 | import time 14 | from concurrent.futures import ThreadPoolExecutor, as_completed 15 | from functools import lru_cache 16 | from threading import Lock, local 17 | from typing import List, Dict, Any 18 | from tabulate import tabulate 19 | import google.api_core.exceptions 20 | from google.cloud.resourcemanager import ProjectsClient 21 | from google.cloud.resourcemanager_v3.types import Project 22 | from google.cloud import compute 23 | from googleapiclient import discovery 24 | from googleapiclient.errors import HttpError 25 | import requests.exceptions 26 | import urllib3.exceptions 27 | 28 | # Suppress gRPC and absl logs 29 | os.environ["GRPC_VERBOSITY"] = "ERROR" 30 | os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" 31 | 32 | # Configuration for logging 33 | LOG_LEVEL = logging.DEBUG 34 | log = logging.getLogger("gcp") 35 | log.setLevel(LOG_LEVEL) 36 | ch = logging.StreamHandler() 37 | ch.setLevel(LOG_LEVEL) 38 | formatter = logging.Formatter("%(asctime)s %(levelname)-8s %(message)s", "%Y-%m-%d %H:%M:%S") 39 | ch.setFormatter(formatter) 40 | log.addHandler(ch) 41 | 42 | # Performance configuration 43 | API_DELAY = float(os.environ.get('GCP_API_DELAY', '0.05')) 44 | THREADS = int(os.environ.get('GCP_THREADS', '3')) 45 | BATCH_SIZE = int(os.environ.get('GCP_BATCH_SIZE', '20')) 46 | BATCH_DELAY = float(os.environ.get('GCP_BATCH_DELAY', '10')) 47 | 48 | # Thread-safe locks for shared data 49 | data_lock = Lock() 50 | totals_lock = Lock() 51 | service_calls_lock = Lock() 52 | 53 | # Thread-local storage for GCP clients 54 | thread_local_data = local() 55 | 56 | 57 | def get_thread_local_gcp(): 58 | """Get or create thread-local GCP client instance.""" 59 | if not hasattr(thread_local_data, 'gcp'): 60 | thread_local_data.gcp = GCP() 61 | return thread_local_data.gcp 62 | 63 | 64 | def api_delay(): 65 | """Add configurable delay between API calls for rate limiting.""" 66 | if 
API_DELAY > 0: 67 | time.sleep(API_DELAY) 68 | 69 | 70 | class GCP: 71 | def __init__(self): 72 | """Initialize GCP client with thread-local instances.""" 73 | self._instances_client = None 74 | self._container_client = None 75 | self._run_client = None 76 | 77 | def projects(self) -> List[Project]: 78 | return ProjectsClient().search_projects() 79 | 80 | def list_instances(self, project_id: str): 81 | api_delay() 82 | request = compute.AggregatedListInstancesRequest(max_results=50, project=project_id) 83 | return self.instances_client.aggregated_list(request=request) 84 | 85 | def clusters(self, project_id: str) -> List[Dict[str, Any]]: 86 | api_delay() 87 | endpoint = self.container_client.projects().zones().clusters() # pylint: disable=no-member 88 | request = endpoint.list(projectId=project_id, zone="-") 89 | response = request.execute() 90 | return response.get("clusters", []) 91 | 92 | @lru_cache(maxsize=128) 93 | def get_cached_clusters(self, project_id: str) -> List[Dict[str, Any]]: 94 | """Cache cluster data to prevent duplicate API calls per project.""" 95 | return self.clusters(project_id) 96 | 97 | def list_cloud_run_services(self, project_id: str) -> List[Dict[str, Any]]: 98 | api_delay() 99 | parent = f"projects/{project_id}/locations/-" 100 | request = self.run_client.projects().locations().services().list(parent=parent) # pylint: disable=no-member 101 | response = request.execute() 102 | return response.get("items", []) 103 | 104 | def list_cloud_run_jobs(self, project_id: str) -> List[Dict[str, Any]]: 105 | api_delay() 106 | parent = f"namespaces/{project_id}" 107 | request = self.run_client.namespaces().jobs().list(parent=parent) # pylint: disable=no-member 108 | response = request.execute() 109 | return response.get("items", []) 110 | 111 | @property 112 | def instances_client(self) -> compute.InstancesClient: 113 | """Thread-safe instances client creation.""" 114 | if self._instances_client is None: 115 | self._instances_client = 
compute.InstancesClient() 116 | return self._instances_client 117 | 118 | @property 119 | def container_client(self): 120 | """Thread-safe Container API client creation.""" 121 | if self._container_client is None: 122 | self._container_client = discovery.build("container", "v1") 123 | return self._container_client 124 | 125 | @property 126 | def run_client(self): 127 | """Thread-safe Cloud Run API client creation.""" 128 | if self._run_client is None: 129 | self._run_client = discovery.build("run", "v1") 130 | return self._run_client 131 | 132 | @classmethod 133 | def is_vm_kubenode(cls, instance: compute.Instance) -> bool: 134 | if any(k.key == "kubeconfig" for k in instance.metadata.items): 135 | return True 136 | 137 | if instance.labels: 138 | gke_indicators = ["goog-gke-node", "gke-cluster", "k8s-", "kubernetes"] 139 | for key, _ in instance.labels.items(): 140 | if any(indicator in key.lower() for indicator in gke_indicators): 141 | return True 142 | 143 | if instance.name and "gke-" in instance.name: 144 | return True 145 | 146 | return False 147 | 148 | @classmethod 149 | def is_vm_running(cls, instance: compute.Instance) -> bool: 150 | return instance.status != "TERMINATED" 151 | 152 | @classmethod 153 | def is_cluster_autopilot(cls, cluster: Dict[str, Any]) -> bool: 154 | return cluster.get("autopilot", {}).get("enabled", False) 155 | 156 | @classmethod 157 | def get_autopilot_active_nodes(cls, cluster: Dict[str, Any]) -> int: 158 | return cluster.get("currentNodeCount", 0) 159 | 160 | 161 | def process_gcp_project(gcp_project: Project) -> Dict[str, Any]: 162 | if gcp_project.state == Project.State.DELETE_REQUESTED: 163 | log.info("Skipping GCP project %s (project pending deletion)", gcp_project.display_name) 164 | return {} 165 | 166 | result = { 167 | "project_id": gcp_project.project_id, 168 | "kubenodes_running": 0, 169 | "kubenodes_terminated": 0, 170 | "vms_running": 0, 171 | "vms_terminated": 0, 172 | "autopilot_clusters": 0, 173 | 
"autopilot_nodes": 0, 174 | "cloud_run_services": 0, 175 | "cloud_run_jobs": 0, 176 | } 177 | 178 | log.info("Processing GCP project: %s", gcp_project.display_name) 179 | 180 | fail_safe(count_instances, gcp_project, result, gcp_project) 181 | fail_safe(count_autopilot_clusters, gcp_project, result, gcp_project) 182 | fail_safe(count_cloud_run_resources, gcp_project, result, gcp_project) 183 | fail_safe(validate_and_adjust_kube_counts, gcp_project, result, gcp_project) 184 | 185 | return result 186 | 187 | 188 | def fail_safe(count_func, *args) -> None: 189 | # Extract project from args for error handling 190 | project = args[-1] if args else None 191 | # Remove project from args passed to count_func (it's already in the first args) 192 | func_args = args[:-1] if len(args) > 1 else args 193 | 194 | try: 195 | count_func(*func_args) 196 | except google.api_core.exceptions.Forbidden as exc: 197 | if "Compute Engine API has not been used" in str(exc) and project: 198 | log_warning("compute.googleapis.com", project.display_name) 199 | # Safely extract error message 200 | error_message = str(exc) 201 | if hasattr(exc, 'errors') and exc.errors and len(exc.errors) > 0: 202 | error_message = exc.errors[0].get("message", str(exc)) 203 | add_message(project.project_id, error_message) 204 | else: 205 | log.error("Unexpected error for project: %s: %s", 206 | project.display_name if project else "Unknown", exc) 207 | except HttpError as exc: 208 | if exc.status_code == 403 and ("SERVICE_DISABLED" in str(exc) or "BILLING_DISABLED" in str(exc)) and project: 209 | service_name = get_service_disabled_name(exc) 210 | error_type = "BILLING_DISABLED" if "BILLING_DISABLED" in str(exc) else "SERVICE_DISABLED" 211 | log_warning(service_name, project.display_name, error_type) 212 | add_message(project.project_id, getattr(exc, 'reason', str(exc))) 213 | else: 214 | log.error("Unexpected error for project: %s: %s", 215 | project.display_name if project else "Unknown", exc) 216 | except 
def log_warning(api: str, project_name: str, error_type: str = "SERVICE_DISABLED") -> None:
    """Log a warning that an API could not be queried for a project.

    Args:
        api: Service identifier, e.g. ``compute.googleapis.com``. Identifiers
            without a friendly name are printed as-is.
        project_name: Display name of the affected project.
        error_type: ``"BILLING_DISABLED"`` selects the billing-specific
            message; any other value produces the generic message.
    """
    api_names = {
        "compute.googleapis.com": "Compute Engine",
        "container.googleapis.com": "Kubernetes Engine",
        "run.googleapis.com": "Cloud Run (Services & Jobs)",
    }
    # Callers pass whatever service name was extracted from the API error,
    # which can be the "unknown" placeholder or a service not listed above.
    # Indexing the dict directly raised KeyError from inside an exception
    # handler; fall back to the raw identifier instead.
    api_label = api_names.get(api, api)

    if error_type == "BILLING_DISABLED":
        message = f"Billing not enabled for {api_label} API on project: {project_name}. Enable billing to access this API."
    else:
        message = f"Unable to process {api_label} API for project: {project_name}."

    log.warning(message)
def validate_and_adjust_kube_counts(gcp_project: Project, result: Dict[str, Any]) -> None:
    """Raise the running kube-node count to the GKE-reported total when instance scanning found fewer.

    The check is skipped when an error already recorded for the project
    mentions the container API or a disabled service. Any exception raised
    while validating is swallowed and logged at debug level.

    Args:
        gcp_project: Project whose counts are being validated.
        result: Row for the project; ``result["kubenodes_running"]`` is
            overwritten in place when the GKE API reports more nodes.
    """
    try:
        # Read the recorded API failures under the same lock add_message() writes with.
        with service_calls_lock:
            recorded_errors = service_disabled_calls.get(gcp_project.project_id, [])

        lowered_errors = (err.lower() for err in recorded_errors)
        container_blocked = any(
            "container" in text or "service_disabled" in text
            for text in lowered_errors
        )
        if container_blocked:
            message = (
                f"Skipping validation for project {gcp_project.project_id} due to container API access issues"
            )
            log.debug(message)
            return

        gcp = get_thread_local_gcp()
        # Only standard (non-Autopilot) clusters contribute to the comparison.
        standard_node_count = sum(
            cluster.get("currentNodeCount", 0)
            for cluster in gcp.get_cached_clusters(gcp_project.project_id)
            if not GCP.is_cluster_autopilot(cluster)
        )

        detected_nodes = result["kubenodes_running"]
        if standard_node_count <= detected_nodes:
            return

        discrepancy = standard_node_count - detected_nodes
        message = (
            f"Project {gcp_project.project_id}: GKE API reports {standard_node_count} nodes, "
            f"but only {detected_nodes} were detected via instance metadata. "
            f"Adjusting count to {standard_node_count} (added {discrepancy} nodes)"
        )
        log.warning(message)
        result["kubenodes_running"] = standard_node_count

    except Exception:  # pylint: disable=broad-except
        # Deliberately quiet: the underlying API problem has most likely been logged already.
        message = f"Skipping node count validation for project {gcp_project.project_id} due to API access issues"
        log.debug(message)
358 | 359 | This combined function eliminates duplicate API calls and error messages 360 | since both services and jobs use the same underlying Cloud Run API. 361 | If the API is disabled, both counts will be set to 0. 362 | """ 363 | try: 364 | gcp = get_thread_local_gcp() 365 | # Try services first 366 | services = gcp.list_cloud_run_services(gcp_project.project_id) 367 | result["cloud_run_services"] = len(services) 368 | 369 | # Only try jobs if services succeeded (same API, so if one works, both should) 370 | jobs = gcp.list_cloud_run_jobs(gcp_project.project_id) 371 | result["cloud_run_jobs"] = len(jobs) 372 | 373 | except Exception: 374 | # If Cloud Run API is disabled, both services and jobs are unavailable 375 | result["cloud_run_services"] = 0 376 | result["cloud_run_jobs"] = 0 377 | raise # Re-raise for fail_safe() error handling and logging 378 | 379 | 380 | def should_skip_project(project: Project) -> bool: 381 | """ 382 | Determine if a project should be skipped during scanning based on filtering rules. 383 | 384 | Filtering rules (in order of precedence): 385 | 1. GCP projects starting with (sys-*) are skipped by default unless GCP_ENABLE_SYS_PROJECTS=true 386 | 2. Include patterns (allowlist) - if set, only matching projects are processed 387 | 3. Exclude patterns (denylist) - matching projects are skipped 388 | 389 | Returns True if the project should be skipped. 390 | """ 391 | project_id = project.project_id 392 | 393 | # 1. sys-* projects (default skip due to Apps Script) 394 | enable_sys_projects = ( 395 | os.environ.get('GCP_ENABLE_SYS_PROJECTS', 'false').lower() == 'true' 396 | ) 397 | if project_id.startswith('sys-') and not enable_sys_projects: 398 | log.info("Skipping sys-* project: %s", project_id) 399 | return True 400 | 401 | # 2. 
def matches_any_pattern(project_id: str, patterns: List[str]) -> bool:
    """
    Report whether project_id matches at least one glob-style pattern.

    Both the project ID and each pattern are lower-cased before comparison,
    so matching is case-insensitive.

    Args:
        project_id: The GCP project ID to check
        patterns: List of glob patterns (e.g., ['dev-*', 'test-*', '*-sandbox'])

    Returns:
        True if project_id matches any pattern, False otherwise
    """
    candidate = project_id.lower()
    for glob in patterns:
        if fnmatch.fnmatch(candidate, glob.lower()):
            return True
    return False
def update_totals_threadsafe(rows: List[Dict], stats: Dict[str, int]) -> None:
    """Append rows to the global result list and fold their counters into the global totals.

    Args:
        rows: Per-project result rows; each must contain every numeric key of ``totals``.
        stats: Accepted for interface compatibility; not read by this function.
    """
    with data_lock:
        data.extend(rows)

    with totals_lock:
        # Every totals column except the "project_id" label is a running sum.
        summed_columns = [column for column in totals if column != "project_id"]
        for entry in rows:
            for column in summed_columns:
                totals[column] += entry[column]
"vms_terminated": "VMs (Terminated)", 498 | "autopilot_clusters": "Autopilot Clusters", 499 | "autopilot_nodes": "Autopilot Nodes (Running)", 500 | "cloud_run_services": "Cloud Run Services", 501 | "cloud_run_jobs": "Cloud Run Jobs", 502 | } 503 | totals = { 504 | "project_id": "totals", 505 | "kubenodes_running": 0, 506 | "kubenodes_terminated": 0, 507 | "vms_running": 0, 508 | "vms_terminated": 0, 509 | "autopilot_clusters": 0, 510 | "autopilot_nodes": 0, 511 | "cloud_run_services": 0, 512 | "cloud_run_jobs": 0, 513 | } 514 | 515 | main_gcp = GCP() 516 | 517 | projects = list(main_gcp.projects()) 518 | if not projects: 519 | log.error("No GCP projects found") 520 | exit(1) # pylint: disable=consider-using-sys-exit 521 | 522 | # Track filtering statistics for summary 523 | total_projects = 0 524 | skipped_projects = 0 525 | processed_projects = 0 526 | 527 | log.info("Starting GCP project scan with parallel processing enabled") 528 | log.info("Environment variables:") 529 | log.info(" GCP_ENABLE_SYS_PROJECTS: %s", os.environ.get('GCP_ENABLE_SYS_PROJECTS', 'false')) 530 | log.info(" GCP_INCLUDE_PATTERNS: %s", os.environ.get('GCP_INCLUDE_PATTERNS', '(not set)')) 531 | log.info(" GCP_EXCLUDE_PATTERNS: %s", os.environ.get('GCP_EXCLUDE_PATTERNS', '(not set)')) 532 | log.info(" GCP_THREADS: %d", THREADS) 533 | log.info(" GCP_BATCH_SIZE: %d", BATCH_SIZE) 534 | log.info(" GCP_API_DELAY: %.3fs", API_DELAY) 535 | 536 | # Apply filtering to projects before batching for efficiency 537 | filtered_projects = [] 538 | total_discovered_projects = len(projects) 539 | 540 | log.info("Applying project filters...") 541 | for project in projects: 542 | if should_skip_project(project): 543 | continue 544 | filtered_projects.append(project) 545 | 546 | log.info("Project filtering complete:") 547 | log.info(" Total projects discovered: %d", total_discovered_projects) 548 | log.info(" Projects matching filters: %d", len(filtered_projects)) 549 | log.info(" Projects to be skipped: %d", 
total_discovered_projects - len(filtered_projects)) 550 | 551 | if not filtered_projects: 552 | log.error("No projects match the current filtering criteria") 553 | log.error("Consider adjusting your filter settings:") 554 | log.error(" GCP_INCLUDE_PATTERNS: %s", os.environ.get('GCP_INCLUDE_PATTERNS', '(not set)')) 555 | log.error(" GCP_EXCLUDE_PATTERNS: %s", os.environ.get('GCP_EXCLUDE_PATTERNS', '(not set)')) 556 | log.error(" GCP_ENABLE_SYS_PROJECTS: %s", os.environ.get('GCP_ENABLE_SYS_PROJECTS', 'false')) 557 | exit(1) # pylint: disable=consider-using-sys-exit 558 | 559 | # Process filtered projects in batches with parallel execution 560 | total_projects = len(filtered_projects) 561 | processed_projects = 0 562 | post_filter_skipped = 0 # Projects skipped after filtering (e.g., DELETE_REQUESTED) 563 | 564 | # Split filtered projects into batches 565 | batches = [filtered_projects[i:i + BATCH_SIZE] for i in range(0, len(filtered_projects), BATCH_SIZE)] 566 | total_batches = len(batches) 567 | 568 | log.info("Processing %d projects in %d batches", total_projects, total_batches) 569 | 570 | for batch_num, batch in enumerate(batches, 1): 571 | # Process this batch 572 | batch_stats = process_project_batch(batch, batch_num, total_batches) 573 | 574 | # Update global statistics and data 575 | update_totals_threadsafe(batch_stats['rows'], batch_stats) 576 | processed_projects += batch_stats['processed_count'] 577 | post_filter_skipped += batch_stats['skipped_count'] 578 | 579 | # Log batch completion 580 | log.info("Batch %d/%d complete: %d processed, %d skipped", 581 | batch_num, total_batches, 582 | batch_stats['processed_count'], 583 | batch_stats['skipped_count']) 584 | 585 | # Add delay between batches (except for the last batch) 586 | if batch_num < total_batches and BATCH_DELAY > 0: 587 | log.debug("Waiting %.1fs before next batch...", BATCH_DELAY) 588 | time.sleep(BATCH_DELAY) 589 | 590 | # Log final processing summary 591 | skipped_by_filters = 
total_discovered_projects - total_projects 592 | processing_errors = max(0, total_projects - processed_projects - post_filter_skipped) 593 | 594 | log.info("Processing summary:") 595 | log.info(" Total projects discovered: %d", total_discovered_projects) 596 | log.info(" Projects matching filters: %d", total_projects) 597 | log.info(" Projects skipped by filters: %d", skipped_by_filters) 598 | log.info(" Projects successfully processed: %d", processed_projects) 599 | log.info(" Projects skipped during processing: %d", post_filter_skipped) 600 | log.info(" Projects with processing errors: %d", processing_errors) 601 | 602 | data.append(totals) 603 | 604 | # Output our results 605 | print(tabulate(data, headers=headers, tablefmt="grid", maxheadercolwidths=[10, 15, 15, 10, 15, 15, 15, 15, 12])) 606 | 607 | with open("gcp-benchmark.csv", "w", newline="", encoding="utf-8") as csv_file: 608 | csv_writer = csv.DictWriter(csv_file, fieldnames=headers.keys()) 609 | csv_writer.writeheader() 610 | csv_writer.writerows(data) 611 | 612 | log.info("CSV file saved to: ./gcp-benchmark.csv") 613 | 614 | if service_disabled_calls: 615 | MSG = ( 616 | "Some API service calls were disabled in certain projects, preventing data processing. " 617 | "These APIs might be intentionally disabled in your environment. " 618 | "Details have been captured and saved to: ./gcp-exceptions.txt for your review." 
619 | ) 620 | log.warning(MSG) 621 | 622 | with open("gcp-exceptions.txt", "w", encoding="utf-8") as f: 623 | for project, messages in service_disabled_calls.items(): 624 | f.write(f"Project ID: {project}\n") 625 | for msg in set(messages): 626 | f.write(f"- {msg}\n") 627 | f.write("\n") 628 | -------------------------------------------------------------------------------- /AWS/aws_cspm_benchmark.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=C0301,C0302,E0401,W1203,W0718 2 | # flake8: noqa: E501 3 | """ 4 | aws-cspm-benchmark.py 5 | 6 | Assists with provisioning calculations by retrieving a count of 7 | all billable resources attached to an AWS account. 8 | """ 9 | 10 | import argparse 11 | import csv 12 | import concurrent.futures 13 | import threading 14 | import time 15 | import json 16 | import os 17 | import random 18 | import signal 19 | import logging 20 | from datetime import datetime, timezone 21 | from typing import Dict, List, Optional, Any, Union 22 | import boto3 23 | import botocore 24 | from botocore.config import Config 25 | 26 | 27 | # Global data structures 28 | data: List[Dict[str, Any]] = [] 29 | headers = { 30 | "account_id": "AWS Account ID", 31 | "region": "Region", 32 | "vms_terminated": "Terminated VMs", 33 | "vms_running": "Running VMs", 34 | "kubenodes_terminated": "Terminated Kubernetes Nodes", 35 | "kubenodes_running": "Running Kubernetes Nodes", 36 | "fargate_profiles": "Active EKS Fargate Profiles", 37 | "fargate_tasks": "ECS Service Fargate Tasks", 38 | } 39 | totals: Dict[str, Union[str, int]] = { 40 | "region": "TOTAL", 41 | "account_id": "TOTAL", 42 | "vms_terminated": 0, 43 | "vms_running": 0, 44 | "kubenodes_terminated": 0, 45 | "kubenodes_running": 0, 46 | "fargate_profiles": 0, 47 | "fargate_tasks": 0, 48 | } 49 | 50 | # Thread-safe data structures 51 | data_lock = threading.Lock() 52 | totals_lock = threading.Lock() 53 | progress_lock = threading.Lock() 54 | 
class ErrorCollector:
    """Thread-safe error collection system to defer error output until after progress completes on each account"""

    # Retry messages are verbose, so display_errors() shows at most this many.
    _RETRY_DISPLAY_LIMIT = 3

    def __init__(self):
        self.errors = []
        self.lock = threading.Lock()

    def add_error(self, error_msg, context=None):
        """Record an error message with an optional context dict and the current timestamp"""
        with self.lock:
            error_entry = {
                "message": error_msg,
                "context": context or {},
                "timestamp": time.time(),
            }
            self.errors.append(error_entry)

    def add_retry_message(self, operation_name, attempt, max_retries, delay, error):  # pylint: disable=R0913,R0917
        """Record that operation_name will be retried; attempt is zero-based"""
        msg = f"Retry {attempt + 1}/{max_retries} for {operation_name} in {delay:.2f}s: {error}"
        self.add_error(msg, {"type": "retry", "operation": operation_name})

    def add_timeout_error(self, operation, region=None, account=None):
        """Record a timeout; region and account are added to the context only when given"""
        msg = f"Timeout processing {operation}"
        if region:
            msg += f" in {region}"
        context = {"type": "timeout", "operation": operation}
        if region:
            context["region"] = region
        if account:
            context["account"] = account
        self.add_error(msg, context)

    def add_processing_error(self, operation, region, error, account=None):
        """Record a processing error; account is added to the context only when given"""
        msg = f"Error processing {operation} in {region}: {error}"
        context = {"type": "processing", "operation": operation, "region": region}
        if account:
            context["account"] = account
        self.add_error(msg, context)

    def get_errors(self):
        """Return a shallow copy of the collected errors"""
        with self.lock:
            return self.errors.copy()

    def clear_errors(self):
        """Clear all collected errors"""
        with self.lock:
            self.errors.clear()

    def has_errors(self):
        """Check if there are any errors collected"""
        with self.lock:
            return len(self.errors) > 0

    @staticmethod
    def _print_group(title, entries, limit, overflow_label):
        """Print one category: header, up to `limit` messages, then a count of the hidden remainder"""
        if not entries:
            return
        print(f"\n{title} ({len(entries)}):")
        for entry in entries[:limit]:
            print(f" • {entry['message']}")
        if len(entries) > limit:
            print(f" ... and {len(entries) - limit} more {overflow_label}")

    def display_errors(self, max_errors=10):
        """Print the collected errors grouped by type.

        Categories are printed in the order timeout, processing, retry, other.
        Each category shows at most max_errors messages, except retry messages,
        which are capped at three. Nothing is printed when no errors exist.
        """
        errors = self.get_errors()
        if not errors:
            return

        print(f"\n⚠️ Collected {len(errors)} error(s) during processing:")

        # Single pass: anything whose context type is not a known category lands in "other".
        known_types = ("timeout", "processing", "retry")
        grouped = {"timeout": [], "processing": [], "retry": [], "other": []}
        for entry in errors:
            kind = entry.get("context", {}).get("type")
            grouped[kind if kind in known_types else "other"].append(entry)

        self._print_group("🕐 Timeout Errors", grouped["timeout"], max_errors, "timeout errors")
        self._print_group("🔧 Processing Errors", grouped["processing"], max_errors, "processing errors")
        self._print_group("🔄 Retry Messages", grouped["retry"], self._RETRY_DISPLAY_LIMIT, "retry messages")
        self._print_group("❓ Other Errors", grouped["other"], max_errors, "errors")

        print()  # Add blank line after error display
type=float, 228 | default=0.1, 229 | help="Delay in seconds between API calls (default: 0.1).", 230 | ) 231 | parser.add_argument( 232 | "--max-retries", 233 | type=int, 234 | default=5, 235 | help="Maximum retry attempts for failed operations (default: 5).", 236 | ) 237 | parser.add_argument( 238 | "--operation-timeout", 239 | type=int, 240 | default=300, 241 | help="Timeout in seconds for individual operations (default: 300).", 242 | ) 243 | parser.add_argument( 244 | "--resume-file", 245 | default="aws_benchmark_progress.json", 246 | help="File to store/resume progress (default: aws_benchmark_progress.json).", 247 | ) 248 | parser.add_argument( 249 | "--skip-accounts", help="Comma-separated list of account IDs to skip." 250 | ) 251 | parser.add_argument( 252 | "--dry-run", 253 | action="store_true", 254 | help="Show what would be processed without making API calls.", 255 | ) 256 | 257 | args = parser.parse_args() # pylint: disable=W0621 258 | 259 | # Input validation 260 | if args.threads < 1 or args.threads > 20: 261 | parser.error("Threads must be between 1 and 20") 262 | 263 | if args.batch_size < 1 or args.batch_size > 100: 264 | parser.error("Batch size must be between 1 and 100") 265 | 266 | if args.batch_delay < 0 or args.batch_delay > 3600: 267 | parser.error("Batch delay must be between 0 and 3600 seconds") 268 | 269 | if args.api_delay < 0 or args.api_delay > 10: 270 | parser.error("API delay must be between 0 and 10 seconds") 271 | 272 | if args.max_retries < 0 or args.max_retries > 20: 273 | parser.error("Max retries must be between 0 and 20") 274 | 275 | if args.operation_timeout < 30 or args.operation_timeout > 3600: 276 | parser.error("Operation timeout must be between 30 and 3600 seconds") 277 | 278 | if not args.role_name.strip(): 279 | parser.error("Role name cannot be empty") 280 | 281 | # Validate regions format if provided 282 | if args.regions: 283 | regions = [r.strip() for r in args.regions.split(",")] 284 | for region in regions: 285 | 
def setup_signal_handlers() -> None:
    """Install SIGINT and SIGTERM handlers that turn the signal into KeyboardInterrupt.

    Raises (from the installed handler, when a signal arrives):
        KeyboardInterrupt: always, with the message "Shutdown signal received".
    """

    def signal_handler(signum: int, frame: Any = None) -> None:  # pylint: disable=unused-argument
        # Python invokes handlers as handler(signum, frame). The previous
        # one-parameter handler raised TypeError on every signal instead of
        # starting the shutdown.
        #
        # The module-level `logger` starts out as None, so use the logging
        # registry directly; logging.getLogger(__name__) is the same object
        # setup_logging() returns.
        logging.getLogger(__name__).info(
            f"Received signal {signum}, initiating graceful shutdown..."
        )
        # The KeyboardInterrupt is expected to be handled by the caller for cleanup.
        raise KeyboardInterrupt("Shutdown signal received")

    signal.signal(signal.SIGINT, signal_handler)
    signal.signal(signal.SIGTERM, signal_handler)
delay *= 0.5 + random.random() * 0.5 # nosec B311 342 | return delay 343 | 344 | @staticmethod 345 | def should_retry(exception, attempt, max_retries): 346 | """Determine if an exception should be retried""" 347 | if attempt >= max_retries: 348 | return False 349 | 350 | if isinstance( 351 | exception, 352 | ( 353 | botocore.exceptions.ClientError, 354 | botocore.exceptions.ReadTimeoutError, 355 | botocore.exceptions.ConnectTimeoutError, 356 | botocore.exceptions.EndpointConnectionError, 357 | ), 358 | ): 359 | if hasattr(exception, "response"): 360 | error_code = exception.response.get("Error", {}).get("Code", "") 361 | # Retry on throttling and temporary errors 362 | return error_code in [ 363 | "Throttling", 364 | "ThrottledException", 365 | "TooManyRequestsException", 366 | "RequestLimitExceeded", 367 | "ServiceUnavailable", 368 | "InternalError", 369 | ] 370 | return True 371 | return False 372 | 373 | def retry_with_backoff(self, func, max_retries=5, operation_name="operation"): 374 | """Execute function with exponential backoff retry""" 375 | for attempt in range(max_retries + 1): 376 | try: 377 | return func() 378 | except Exception as e: 379 | if not self.should_retry(e, attempt, max_retries): 380 | error_msg = ( 381 | f"Failed {operation_name} after {attempt + 1} attempts: {e}" 382 | ) 383 | if self.error_collector: 384 | self.error_collector.add_error( 385 | error_msg, 386 | {"type": "final_failure", "operation": operation_name}, 387 | ) 388 | else: 389 | print(error_msg) 390 | raise 391 | 392 | if attempt < max_retries: 393 | delay = self.exponential_backoff(attempt) 394 | if self.error_collector: 395 | self.error_collector.add_retry_message( 396 | operation_name, attempt, max_retries, delay, e 397 | ) 398 | else: 399 | print( 400 | f"Retry {attempt + 1}/{max_retries} for {operation_name} in {delay:.2f}s: {e}" 401 | ) 402 | time.sleep(delay) 403 | 404 | raise Exception(f"Max retries exceeded for {operation_name}") # pylint: disable=W0719 405 | 406 | 407 
class ProgressTracker:
    """Handles progress tracking and resumption.

    State lives in the module-level ``progress_state`` dict and is guarded by
    the module-level ``progress_lock``; this class persists it to a JSON file.
    """

    def __init__(self, progress_file):
        self.progress_file = progress_file
        self.load_progress()

    @staticmethod
    def _report(message: str, error: bool = True) -> None:
        """Send message to the module logger when one is configured, otherwise print it."""
        if logger:
            if error:
                logger.error(message)
            else:
                logger.info(message)
        else:
            print(message)

    def load_progress(self) -> None:
        """Load progress from file if it exists.

        A missing file is not an error. An unreadable or malformed file is
        reported and leaves ``progress_state`` untouched.
        """
        if not os.path.exists(self.progress_file):
            return

        try:
            with open(self.progress_file, "r", encoding="utf-8") as f:
                saved_progress = json.load(f)

            if not isinstance(saved_progress, dict):
                raise TypeError(
                    f"expected a JSON object, got {type(saved_progress).__name__}"
                )

            # Validate everything before touching the shared state, so a bad
            # file cannot leave progress_state half-updated (e.g. with a list
            # or null where a set is expected).
            merged = {**progress_state, **saved_progress}
            completed = set(merged.get("completed_accounts", []))
            failed = set(merged.get("failed_accounts", []))
        except (json.JSONDecodeError, KeyError, TypeError, ValueError) as e:
            # TypeError/ValueError cover valid JSON of the wrong shape
            # (non-object top level, null or non-iterable account lists).
            self._report(f"Invalid progress file format: {e}")
            return
        except OSError as e:
            self._report(f"Could not read progress file: {e}")
            return

        progress_state.update(saved_progress)
        progress_state["completed_accounts"] = completed
        progress_state["failed_accounts"] = failed
        self._report(
            f"Resumed from progress file: {len(progress_state['completed_accounts'])} accounts completed",
            error=False,
        )

    def save_progress(self) -> None:
        """Save current progress to file.

        Must not be called while holding ``progress_lock``: the lock is
        acquired here and is not reentrant.
        """
        try:
            # Snapshot and write under the lock so that concurrent callers
            # cannot interleave writes to the same file.
            with progress_lock:
                save_data = progress_state.copy()
                # Sets are not JSON-serializable; store them as lists.
                save_data["completed_accounts"] = list(save_data["completed_accounts"])
                save_data["failed_accounts"] = list(save_data["failed_accounts"])
                save_data["last_updated"] = datetime.now(timezone.utc).isoformat()

                with open(self.progress_file, "w", encoding="utf-8") as f:
                    json.dump(save_data, f, indent=2)
        except OSError as e:
            self._report(f"Could not save progress to {self.progress_file}: {e}")
        except (TypeError, ValueError) as e:
            self._report(f"Could not serialize progress data: {e}")

    def mark_completed(self, account_id):
        """Mark an account as completed (clearing any earlier failure) and persist."""
        with progress_lock:
            progress_state["completed_accounts"].add(account_id)
            progress_state["failed_accounts"].discard(account_id)
        # Outside the lock: save_progress() acquires it itself.
        self.save_progress()

    def mark_failed(self, account_id):
        """Mark an account as failed and persist."""
        with progress_lock:
            progress_state["failed_accounts"].add(account_id)
        # Outside the lock: save_progress() acquires it itself.
        self.save_progress()

    def is_completed(self, account_id):
        """Check if account is already completed"""
        return account_id in progress_state["completed_accounts"]

    def should_skip(self, account_id):
        """Check if account should be skipped (currently: only when already completed)"""
        return self.is_completed(account_id)
accounts 523 | """ 524 | try: 525 | 526 | def get_accounts() -> List[Dict[str, Any]]: 527 | client = boto3.client( 528 | "organizations", 529 | config=Config( 530 | retries={"max_attempts": args.max_retries, "mode": "adaptive"} 531 | ), 532 | ) 533 | response = client.list_accounts() 534 | accounts = response["Accounts"] 535 | next_token = response.get("NextToken", None) 536 | 537 | while next_token: 538 | self.rate_limiter.wait() 539 | response = client.list_accounts(NextToken=next_token) 540 | accounts += response["Accounts"] 541 | next_token = response.get("NextToken", None) 542 | 543 | return accounts 544 | 545 | accounts = self.retry_handler.retry_with_backoff( 546 | get_accounts, args.max_retries, "list_accounts" 547 | ) 548 | 549 | # Filter active accounts 550 | active_accounts = [a for a in accounts if a["Status"] == "ACTIVE"] 551 | 552 | # Apply skip list if provided 553 | if args.skip_accounts: 554 | skip_list = [acc.strip() for acc in args.skip_accounts.split(",")] 555 | active_accounts = [ 556 | a for a in active_accounts if a["Id"] not in skip_list 557 | ] 558 | 559 | # Return lazy handles that defer session creation to avoid n+ STS calls upfront 560 | return [self.create_lazy_handle(a) for a in active_accounts] 561 | 562 | except botocore.exceptions.ClientError as e: 563 | error_code = e.response.get("Error", {}).get("Code", "") 564 | if error_code == "AccessDeniedException": 565 | msg = "Cannot autodiscover adjacent accounts: cannot list accounts within the AWS organization" 566 | if logger: 567 | logger.warning(msg) 568 | else: 569 | print(msg) 570 | return [ 571 | AWSHandle( 572 | rate_limiter=self.rate_limiter, retry_handler=self.retry_handler 573 | ) 574 | ] 575 | if error_code == "AWSOrganizationsNotInUseException": 576 | msg = "This account is not a member of an AWS Organization" 577 | if logger: 578 | logger.info(msg) 579 | else: 580 | print(msg) 581 | return [ 582 | AWSHandle( 583 | rate_limiter=self.rate_limiter, 
def aws_handle(self, account: Dict[str, Any]) -> Optional["AWSHandle"]:
    """Create an AWSHandle with a ready session for the given account.

    The master account reuses the master session; any other account gets a
    session from ``new_session``.

    Args:
        account: Account dictionary from AWS Organizations API

    Returns:
        AWSHandle object or None if session creation failed
    """
    account_id = account["Id"]
    if account_id == self.master_account_id:
        session = self.master_session
    else:
        session = self.new_session(account_id)
        if not session:
            return None

    # AWSHandle's constructor rejects a non-master account_id unless the
    # master session and role name are supplied, even when a session is
    # already provided, so pass the full context along.
    return AWSHandle(
        aws_session=session,
        account_id=account_id,
        rate_limiter=self.rate_limiter,
        retry_handler=self.retry_handler,
        master_session=self.master_session,
        master_account_id=self.master_account_id,
        role_name=args.role_name,
    )
class AWSHandle:
    """Per-account accessor for the AWS resources counted by the benchmark.

    A handle either wraps a ready boto3 session (``aws_session``) or creates
    one lazily on first use: the master/default session for the master
    account, or an assumed-role session for any other account.
    """

    # Instance tags that identify an EC2 instance as an EKS worker node.
    EKS_TAGS = [
        "eks:cluster-name",
        "alpha.eksctl.io/nodegroup-type",
        "aws:eks:cluster-name",
        "eks:nodegroup-name",
    ]

    def __init__(
        self,
        aws_session=None,
        account_id=None,
        rate_limiter=None,
        retry_handler=None,
        master_session=None,
        master_account_id=None,
        role_name=None,
    ):
        """Store session parameters; no AWS call is made here.

        Args:
            aws_session: Ready boto3 session, or None to create one lazily.
            account_id: Target account ID, or None to resolve it via STS.
            rate_limiter: Limiter whose ``wait()`` precedes each API call.
            retry_handler: Handler providing ``retry_with_backoff``.
            master_session: Session used to assume the cross-account role.
            master_account_id: ID of the account owning ``master_session``.
            role_name: Role to assume in non-master accounts.

        Raises:
            ValueError: A session must be created lazily for a non-master
                account but ``master_session`` or ``role_name`` is missing.
        """
        config = Config(
            retries={"max_attempts": args.max_retries, "mode": "adaptive"},
            max_pool_connections=50,
            read_timeout=args.operation_timeout,
            connect_timeout=30,
        )

        # Store lazy session creation parameters
        self._aws_session = aws_session
        self._master_session = master_session
        self._master_account_id = master_account_id
        self._role_name = role_name
        self.acc_id = account_id
        self.config = config
        self.rate_limiter = rate_limiter or RateLimiter()
        self.retry_handler = retry_handler or RetryHandler()
        self._session_created = False
        self._session_lock = threading.Lock()  # Thread safety for lazy session creation

        # Assume-role parameters are only needed when the session still has
        # to be created; a handle built around a ready session is valid.
        needs_assume_role = (
            aws_session is None and account_id and account_id != master_account_id
        )
        if needs_assume_role and not all([master_session, role_name]):
            raise ValueError(
                f"Cross-account session requires master_session and role_name for account {account_id}"
            )

    @property
    def aws_session(self):
        """Return the boto3 session, creating it on first access.

        Raises:
            RuntimeError: No session is available or assume-role failed.
        """
        if self._aws_session is None:
            with self._session_lock:
                # Re-check under the lock: another thread may have created
                # the session while this one was waiting.
                if self._aws_session is None:
                    try:
                        if self.acc_id and self.acc_id != self._master_account_id:
                            self._aws_session = self._create_cross_account_session()
                        else:
                            self._aws_session = (
                                self._master_session or boto3.session.Session()
                            )
                        self._session_created = True
                    except Exception as e:
                        # Nothing is cached on failure, so a later access retries.
                        msg = f"Session creation failed for {self.acc_id}: {e}"
                        if logger:
                            logger.error(msg)
                        else:
                            print(msg)
                        raise

        # Don't silently fall back to a default session.
        if self._aws_session is None:
            raise RuntimeError(
                f"No valid AWS session available for account {self.acc_id}"
            )

        return self._aws_session

    def _create_cross_account_session(self):
        """Create a cross-account session using STS assume role.

        Raises:
            ValueError: The master session or role name is missing.
            RuntimeError: The assume-role call failed; the original exception
                is chained as the cause.
        """
        if not self._master_session or not self._role_name:
            raise ValueError(
                f"Cannot create session for account {self.acc_id}: missing master session or role name"
            )

        try:
            master_sts = self._master_session.client("sts", config=self.config)

            def assume_role():
                return master_sts.assume_role(
                    RoleArn=f"arn:aws:iam::{self.acc_id}:role/{self._role_name}",
                    RoleSessionName=f"cspm-benchmark-{self.acc_id}",
                )

            credentials = self.retry_handler.retry_with_backoff(
                assume_role, args.max_retries, f"lazy_assume_role_{self.acc_id}"
            )["Credentials"]

            return boto3.session.Session(
                aws_access_key_id=credentials["AccessKeyId"],
                aws_secret_access_key=credentials["SecretAccessKey"],
                aws_session_token=credentials["SessionToken"],
                region_name="us-east-1",
            )
        except botocore.exceptions.ClientError as e:
            error_code = e.response.get("Error", {}).get("Code", "")
            error_message = e.response.get("Error", {}).get("Message", str(e))

            # Provide specific error messages for common issues
            if error_code == "AccessDenied":
                raise RuntimeError(
                    f"Access denied assuming role {self._role_name} in account {self.acc_id}. "
                    f"Check if role exists and trusts the master account."
                ) from e
            if error_code == "NoSuchEntity":
                raise RuntimeError(
                    f"Role {self._role_name} not found in account {self.acc_id}. "
                    f"Ensure the role exists and is spelled correctly."
                ) from e
            if error_code in ["InvalidUserID.NotFound", "ValidationError"]:
                raise RuntimeError(
                    f"Account {self.acc_id} may be invalid, suspended, or not in organization."
                ) from e
            raise RuntimeError(
                f"STS assume role failed for account {self.acc_id}: {error_code} - {error_message}"
            ) from e
        except Exception as e:
            raise RuntimeError(
                f"Unexpected error creating session for account {self.acc_id}: {type(e).__name__}: {e}"
            ) from e

    def _collect_pages(self, operation, result_key, token_key, **params):
        """Call a paginated API until exhausted and return all items.

        Args:
            operation: Bound client method to call.
            result_key: Response key holding the page's list of items.
            token_key: Name of the continuation token. It is the same in the
                request and the response, but its casing differs between
                services ("NextToken" for EC2, "nextToken" for EKS and ECS).
            **params: Parameters sent with every page request.
        """
        items = []
        token = None
        while True:
            self.rate_limiter.wait()
            if token:
                response = operation(**params, **{token_key: token})
            else:
                response = operation(**params)
            items.extend(response[result_key])
            token = response.get(token_key)
            if not token:
                return items

    @property
    def regions(self):
        """Region names returned by EC2 ``describe_regions`` for this account."""

        def get_regions():
            self.rate_limiter.wait()
            response = self.ec2.describe_regions()
            return [region["RegionName"] for region in response["Regions"]]

        return self.retry_handler.retry_with_backoff(
            get_regions, args.max_retries, f"describe_regions_{self.account_id}"
        )

    def ec2_instances(self, aws_region):
        """Return all EC2 reservations in ``aws_region`` across all pages."""
        client = self.aws_session.client("ec2", aws_region, config=self.config)

        def get_instances():
            return self._collect_pages(
                client.describe_instances,
                "Reservations",
                "NextToken",
                MaxResults=1000,
            )

        return self.retry_handler.retry_with_backoff(
            get_instances,
            args.max_retries,
            f"ec2_instances_{aws_region}_{self.account_id}",
        )

    @property
    def ec2(self):
        """A new EC2 client on the session's default region."""
        return self.aws_session.client("ec2", config=self.config)

    @classmethod
    def is_vm_kubenode(cls, vm):
        """Return True when the instance carries any of the EKS tags."""
        return any(tag["Key"] in cls.EKS_TAGS for tag in vm.get("Tags", []))

    @classmethod
    def is_vm_running(cls, vm):
        """Return True for every instance state except ``stopped``."""
        return vm["State"]["Name"] != "stopped"

    @property
    def account_id(self):
        """The account ID, resolved once through STS when not given up front."""
        if self.acc_id is None:
            sts = self.aws_session.client("sts", config=self.config)
            self.acc_id = sts.get_caller_identity()["Account"]
        return self.acc_id

    def fargate_profiles(self, aws_region):
        """Count ACTIVE EKS Fargate profiles in ``aws_region``.

        Profiles whose name contains ``fp-falcon`` are not counted.
        """
        client = self.aws_session.client("eks", aws_region, config=self.config)

        def get_profiles():
            clusters = self._collect_pages(
                client.list_clusters, "clusters", "nextToken", maxResults=100
            )

            profiles_count = 0
            for cluster in clusters:
                profile_names = self._collect_pages(
                    client.list_fargate_profiles,
                    "fargateProfileNames",
                    "nextToken",
                    clusterName=cluster,
                    maxResults=100,
                )
                for profile_name in profile_names:
                    if "fp-falcon" in profile_name:
                        continue
                    self.rate_limiter.wait()
                    response = client.describe_fargate_profile(
                        clusterName=cluster, fargateProfileName=profile_name
                    )
                    if response["fargateProfile"]["status"] == "ACTIVE":
                        profiles_count += 1

            return profiles_count

        return self.retry_handler.retry_with_backoff(
            get_profiles,
            args.max_retries,
            f"fargate_profiles_{aws_region}_{self.account_id}",
        )

    def fargate_tasks(self, aws_region):
        """Sum ``desiredCount`` over ACTIVE ECS Fargate services in ``aws_region``."""
        client = self.aws_session.client("ecs", aws_region, config=self.config)

        def get_tasks():
            cluster_arns = self._collect_pages(
                client.list_clusters, "clusterArns", "nextToken", maxResults=100
            )

            tasks_count = 0
            for cluster_arn in cluster_arns:
                service_arns = self._collect_pages(
                    client.list_services,
                    "serviceArns",
                    "nextToken",
                    cluster=cluster_arn,
                    maxResults=100,
                    launchType="FARGATE",
                )
                for service_arn in service_arns:
                    self.rate_limiter.wait()
                    response = client.describe_services(
                        cluster=cluster_arn, services=[service_arn]
                    )
                    for service in response["services"]:
                        # Exact match: a substring test would also accept
                        # "INACTIVE".
                        if service["status"] == "ACTIVE":
                            tasks_count += service["desiredCount"]

            return tasks_count

        return self.retry_handler.retry_with_backoff(
            get_tasks, args.max_retries, f"fargate_tasks_{aws_region}_{self.account_id}"
        )
def process_fargate_profiles(aws_handle, region_name, error_collector=None):
    """Process Fargate profiles for a specific region with timeout.

    Args:
        aws_handle: Handle whose ``fargate_profiles(region_name)`` is called.
        region_name: Region to inspect.
        error_collector: Optional collector that receives a timeout record;
            without one the timeout is printed.

    Returns:
        The profile count, or 0 when the call exceeds
        ``args.operation_timeout`` seconds. Exceptions raised by the call
        itself propagate to the caller.
    """
    # The executor is deliberately not used as a context manager: leaving a
    # ``with`` block waits for the worker thread, which would keep the caller
    # blocked past the timeout.
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    future = executor.submit(aws_handle.fargate_profiles, region_name)
    try:
        return future.result(timeout=args.operation_timeout)
    except concurrent.futures.TimeoutError:
        if error_collector:
            error_collector.add_timeout_error(
                "Fargate profiles", region_name, aws_handle.account_id
            )
        else:
            print(f"Timeout processing Fargate profiles in {region_name}")
        # A call that is already running cannot be cancelled; it finishes in
        # the background and its result is discarded.
        future.cancel()
        return 0
    finally:
        executor.shutdown(wait=False)
def process_region(aws_handle, region_name, error_collector=None, max_workers=2):  # pylint: disable=R0912
    """Count EC2 and Fargate resources of one region and record the result.

    The three resource scans run on a small thread pool. A failing scan is
    reported (to ``error_collector`` when given, otherwise printed) and its
    counters stay at zero. The finished row is appended to the global
    ``data`` list and added into the global ``totals``.
    """
    counter_keys = (
        "vms_terminated",
        "vms_running",
        "kubenodes_terminated",
        "kubenodes_running",
        "fargate_profiles",
        "fargate_tasks",
    )
    row = {"account_id": aws_handle.account_id, "region": region_name}
    row.update(dict.fromkeys(counter_keys, 0))

    def store_profiles(count):
        row["fargate_profiles"] = count

    def store_tasks(count):
        row["fargate_tasks"] = count

    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
            # All scans are submitted up front, then collected in this order.
            pending = [
                (
                    "EC2",
                    pool.submit(
                        process_ec2_instances, aws_handle, region_name, error_collector
                    ),
                    row.update,
                ),
                (
                    "Fargate profiles",
                    pool.submit(
                        process_fargate_profiles, aws_handle, region_name, error_collector
                    ),
                    store_profiles,
                ),
                (
                    "Fargate tasks",
                    pool.submit(
                        process_fargate_tasks, aws_handle, region_name, error_collector
                    ),
                    store_tasks,
                ),
            ]

            for label, future, store in pending:
                try:
                    store(future.result(timeout=args.operation_timeout))
                except Exception as exc:
                    if error_collector:
                        error_collector.add_processing_error(
                            label, region_name, exc, aws_handle.account_id
                        )
                    else:
                        print(f"Error processing {label} in {region_name}: {exc}")

    except Exception as exc:
        if error_collector:
            error_collector.add_processing_error(
                "region", region_name, exc, aws_handle.account_id
            )
        else:
            print(f"Error processing region {region_name}: {exc}")

    # Thread-safe updates to global data structures
    with data_lock:
        data.append(row)

    with totals_lock:
        for key in counter_keys:
            totals[key] += row[key]
executor.submit( 1161 | process_region, aws_handle, region_name, error_collector 1162 | ) 1163 | for region_name in regions_to_process 1164 | ] 1165 | 1166 | # Wait for all regions to complete with timeout 1167 | future_errors = [] 1168 | 1169 | for future in concurrent.futures.as_completed( 1170 | region_futures, timeout=args.operation_timeout * len(regions_to_process) 1171 | ): 1172 | try: 1173 | result = future.result() # pylint: disable=W0612 1174 | except Exception as e: 1175 | future_errors.append(str(e)) 1176 | 1177 | # Simple completion message with error handling 1178 | with console_lock: 1179 | error_count = len(error_collector.get_errors()) + len(future_errors) 1180 | if error_count > 0: 1181 | print(f"\n✓ {account_id} - completed with {error_count} error(s)") 1182 | 1183 | # Display collected errors 1184 | if error_collector.has_errors(): 1185 | error_collector.display_errors() 1186 | 1187 | # Print any future execution errors that weren't collected 1188 | if future_errors: 1189 | print("\n Additional execution errors:") 1190 | for error in future_errors[:3]: 1191 | print(f" ⚠️ {error}") 1192 | if len(future_errors) > 3: 1193 | print( 1194 | f" ... 
def process_accounts_in_batches(accounts, regions_to_process, progress_tracker):
    """Work through the accounts in fixed-size batches.

    Each batch is recorded in ``progress_state`` and saved before it starts.
    In dry-run mode the accounts are only listed. Otherwise every account of
    the batch is handed to ``process_account`` on a thread pool, and the
    function pauses ``args.batch_delay`` seconds between batches.
    """
    account_total = len(accounts)
    per_batch = args.batch_size

    progress_state["total_accounts"] = account_total
    progress_state["start_time"] = datetime.now(timezone.utc).isoformat()

    print(f"Processing {account_total} accounts in batches of {per_batch}")

    for batch_num, offset in enumerate(range(0, account_total, per_batch), start=1):
        batch = accounts[offset : offset + per_batch]
        batch_count = (account_total + per_batch - 1) // per_batch

        progress_state["current_batch"] = batch_num
        progress_tracker.save_progress()

        print(
            f"\n--- Processing Batch {batch_num}/{batch_count} ({len(batch)} accounts) ---"
        )

        if args.dry_run:
            for aws_handle in batch:
                print(f"Would process account: {aws_handle.account_id}")
            continue

        # Run the accounts of this batch concurrently and wait for all of them.
        with concurrent.futures.ThreadPoolExecutor(max_workers=args.threads) as pool:
            submitted = []
            for aws_handle in batch:
                submitted.append(
                    pool.submit(
                        process_account, aws_handle, regions_to_process, progress_tracker
                    )
                )

            for done in concurrent.futures.as_completed(submitted):
                try:
                    done.result()
                except Exception as exc:
                    print(f"Error in batch processing: {exc}")

        # No pause is needed once the final batch has been processed.
        more_batches_follow = offset + per_batch < account_total
        if more_batches_follow:
            print(f"Waiting {args.batch_delay} seconds before next batch...")
            time.sleep(args.batch_delay)

    print(f"\nCompleted processing all {account_total} accounts")
def main() -> None:  # pylint: disable=R0915,R0914,R0912
    """Run the benchmark end to end.

    Parses arguments, enumerates the organization's accounts, skips accounts
    already completed in a previous run, processes the rest in batches and
    writes the collected rows to a timestamped CSV file. On interruption or a
    fatal error the progress is saved and resume guidance is printed.
    """
    global args, logger  # pylint: disable=W0603

    # Initialize global configuration first
    args = parse_args()  # pylint: disable=W0621
    logger = setup_logging()
    setup_signal_handlers()

    start_time = time.time()

    logger.info("Starting AWS CSPM benchmark")
    logger.info("Configuration:")
    logger.info(f" - Threads: {args.threads}")
    logger.info(f" - Batch size: {args.batch_size}")
    logger.info(f" - Batch delay: {args.batch_delay}s")
    logger.info(f" - API delay: {args.api_delay}s")
    logger.info(f" - Max retries: {args.max_retries}")
    logger.info(f" - Operation timeout: {args.operation_timeout}s")
    logger.info(f" - Dry run: {args.dry_run}")

    # The rate is derived as 1 / api_delay, so a zero or negative delay
    # cannot be turned into a call rate.
    if args.api_delay <= 0:
        logger.error(f"API delay must be greater than 0 (got {args.api_delay})")
        return

    # Initialize components with global error collector
    global_error_collector = ErrorCollector()
    rate_limiter = RateLimiter(calls_per_second=1.0 / args.api_delay)
    retry_handler = RetryHandler(error_collector=global_error_collector)
    progress_tracker = ProgressTracker(args.resume_file)

    # Get all AWS accounts
    try:
        accounts = AWSOrgAccess(rate_limiter, retry_handler).accounts()
        print(f"Found {len(accounts)} accounts to process")

        if not accounts:
            print("No accounts found to process")
            return

        # Filter out already completed accounts
        if not args.dry_run:
            pending_accounts = [
                acc
                for acc in accounts
                if not progress_tracker.should_skip(acc.account_id)
            ]
            print(
                f"Accounts pending: {len(pending_accounts)} (skipping {len(accounts) - len(pending_accounts)} completed)"
            )
            accounts = pending_accounts

        if not accounts:
            print("All accounts already completed!")
            return

        # Determine regions to process and display in configuration
        if args.regions:
            regions_to_process = [x.strip() for x in args.regions.split(",")]
            regions_display = ", ".join(regions_to_process)
            print(f"\n📍 Regions to process: {regions_display}")
        else:
            # No explicit list: use the regions visible to the first account.
            regions_to_process = accounts[0].regions
            regions_display = ", ".join(regions_to_process[:10])
            if len(regions_to_process) > 10:
                regions_display += f" ... (+{len(regions_to_process) - 10} more)"
            print(
                f"\n📍 Processing all {len(regions_to_process)} regions: {regions_display}"
            )

        # Process accounts in batches
        process_accounts_in_batches(accounts, regions_to_process, progress_tracker)

    except KeyboardInterrupt:
        print("\nProcessing interrupted by user")
        progress_tracker.save_progress()
        print_resume_guidance(progress_tracker, args)
        return
    except Exception as e:
        print(f"Fatal error: {e}")
        progress_tracker.save_progress()
        print_resume_guidance(progress_tracker, args)
        return

    if not args.dry_run:
        # Add totals row
        data.append(totals)

    end_time = time.time()
    processing_time = end_time - start_time

    print(f"\nProcessing completed in {processing_time:.2f} seconds")
    print(f"Completed accounts: {len(progress_state['completed_accounts'])}")
    print(f"Failed accounts: {len(progress_state['failed_accounts'])}")

    # Output results
    if data:
        # Save to CSV with timestamp
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        csv_filename = f"aws-benchmark-{timestamp}.csv"

        with open(csv_filename, "w", newline="", encoding="utf-8") as csv_file:
            csv_writer = csv.DictWriter(csv_file, fieldnames=headers.keys())
            csv_writer.writeheader()
            csv_writer.writerows(data)

        print(f"\nCSV file stored in: ./cloud-benchmark/{csv_filename}")

    # Clean up progress file on successful completion. A dry run processes
    # nothing, so it must not discard the resume state of an earlier
    # interrupted run.
    if not args.dry_run and len(progress_state["failed_accounts"]) == 0:
        try:
            os.remove(args.resume_file)
            logger.info("Progress file cleaned up")
        except OSError as e:
            logger.warning(
                f"Could not remove progress file {args.resume_file}: {e}"
            )
        except Exception as e:
            logger.error(f"Unexpected error removing progress file: {e}")
# Script entry point: run the benchmark only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()