├── .github
│   ├── CODEOWNERS
│   ├── ISSUE_TEMPLATE
│   │   ├── 01_question.md
│   │   ├── 02_bug.md
│   │   ├── 03_feature.md
│   │   └── config.yml
│   ├── PULL_REQUEST_TEMPLATE.md
│   ├── dependabot.yml
│   └── workflows
│       ├── lint.yml
│       ├── publish-pypi.yaml
│       └── tests.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .pylintrc
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── SECURITY.md
├── mypy.ini
├── pyproject.toml
├── src
│   └── pgtracer
│       ├── __init__.py
│       ├── ebpf
│       │   ├── __init__.py
│       │   ├── code
│       │   │   ├── block_rq.c
│       │   │   ├── data.h
│       │   │   ├── ebpf_maps.h
│       │   │   ├── gucset.c
│       │   │   ├── memusage.c
│       │   │   ├── perf.c
│       │   │   ├── plan.c
│       │   │   ├── program.c
│       │   │   ├── stack.h
│       │   │   └── utils.h
│       │   ├── collector
│       │   │   ├── __init__.py
│       │   │   ├── c_defs.py
│       │   │   ├── guc.py
│       │   │   ├── querytracer.py
│       │   │   └── utils.py
│       │   ├── dwarf.py
│       │   ├── eh_frame_hdr.py
│       │   └── unwind.py
│       ├── model
│       │   ├── __init__.py
│       │   ├── memory.py
│       │   ├── plan.py
│       │   └── query.py
│       ├── scripts
│       │   ├── pgtrace_gucs.py
│       │   └── pgtrace_queries.py
│       └── utils.py
└── tests
    ├── conftest.py
    ├── scripts
    │   └── setup_fedora_container.sh
    ├── test_bins
    │   ├── Makefile
    │   ├── test.elf
    │   ├── test.elf.c
    │   ├── test_stack.main
    │   └── test_stack.main.c
    ├── test_dwarf.py
    ├── test_guctracer.py
    ├── test_querytracer.py
    └── test_stack_unwinding.py

/.github/CODEOWNERS:
--------------------------------------------------------------------------------
1 | * @aiven/aiven-open-source
2 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/01_question.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: ❓ Ask a question
3 | about: Got stuck or missing something from the docs? Ask away!
4 | ---
5 |
6 | # What can we help you with?
7 |
8 |
9 |
10 | # Where would you expect to find this information?
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/02_bug.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: 🐜 Report a bug
3 | about: Spotted a problem? Let us know
4 | ---
5 |
6 | # What happened?
7 |
8 |
9 |
10 | # What did you expect to happen?
11 |
12 |
13 |
14 | # What else do we need to know?
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/03_feature.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: 💡 Feature suggestion
3 | about: What would make this even better?
4 | ---
5 |
6 | # What is currently missing?
7 |
8 |
9 |
10 | # How could this be improved?
11 |
12 |
13 |
14 | # Is this a feature you would work on yourself?
15 |
16 | * [ ] I plan to open a pull request for this feature
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: true
2 | contact_links:
3 |   - name: Aiven Security Bug Bounty
4 |     url: https://hackerone.com/aiven_ltd
5 |     about: Our bug bounty program.
6 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | 2 | # About this change - What it does 3 | 4 | 5 | 6 | 7 | Resolves: #xxxxx 8 | 9 | # Why this way 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "pip" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "weekly" 12 | -------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | branches: 4 | - main 5 | tags: 6 | - '**' 7 | pull_request: 8 | 9 | jobs: 10 | 11 | lint: 12 | runs-on: ubuntu-22.04 13 | strategy: 14 | matrix: 15 | # only use one version for the lint step 16 | python-version: [3.9] 17 | 18 | steps: 19 | 20 | - id: checkout 21 | uses: actions/checkout@v2 22 | with: 23 | # Do not persist the token during execution of this job. 24 | persist-credentials: false 25 | 26 | - id: dependencies 27 | run: | 28 | # Must be installed via the system 29 | sudo apt install python3-bpfcc python3-pip 30 | pip install -U pip toml 31 | pip install '.[lint]' 32 | 33 | - id: pylint 34 | run: pylint --rcfile .pylintrc src/ || pylint-exit $? -efail 35 | 36 | - id: mypy 37 | run: python -m mypy --strict src/ --python-version 3.8 38 | 39 | - id: validate-style 40 | run: | 41 | isort --recursive src/ 42 | black src/ 43 | if [ $(git diff --name-only --diff-filter=ACMR | wc -l ) != 0 ]; then 44 | echo "Reformatting failed! Please run make fmt on your commits and resubmit!" 1>&2; 45 | git diff; 46 | exit 1; 47 | fi 48 | -------------------------------------------------------------------------------- /.github/workflows/publish-pypi.yaml: -------------------------------------------------------------------------------- 1 | # Based on https://packaging.python.org/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/ 2 | 3 | name: Publish to PyPI 4 | on: 5 | push: 6 | tags: 7 | - 'releases/**' 8 | 9 | jobs: 10 | build-n-publish: 11 | name: Build and publish 12 | runs-on: ubuntu-latest 13 | 14 | steps: 15 | - uses: actions/checkout@v3 16 | with: 17 | fetch-depth: 0 18 | 19 | - name: Set up Python 20 | uses: actions/setup-python@v4 21 | with: 22 | python-version: "3.8" 23 | 24 | - name: Install pypa/build 25 | run: >- 26 | python -m pip install build --user 27 | - name: Build a binary wheel and a source tarball 28 | run: >- 29 | python -m 30 | build 31 | --sdist 32 | --wheel 33 | --outdir dist/ 34 | . 
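      # The sdist and wheel built above land in dist/, which is the default
      # directory the publish step below uploads from.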
35 | - name: Publish distribution to PyPI 36 | if: startsWith(github.ref, 'refs/tags') 37 | uses: pypa/gh-action-pypi-publish@release/v1 38 | with: 39 | password: ${{ secrets.PYPI_API_TOKEN }} 40 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | 2 | on: 3 | push: 4 | branches: 5 | - main 6 | tags: 7 | - '**' 8 | pull_request: 9 | 10 | jobs: 11 | 12 | tests: 13 | runs-on: ubuntu-22.04 14 | # We don't fail on dev versions, as those are snapshots 15 | continue-on-error: ${{ matrix.experimental }} 16 | strategy: 17 | matrix: 18 | postgresql_version: [11, 12, 13, 14, ] 19 | experimental: [false] 20 | repo: ["pgdg"] 21 | pytest_args: ["-m 'not slow'"] 22 | include: 23 | # Define the current dev version to be experimental 24 | - postgresql_version: 16 25 | experimental: true 26 | repo: "pgdg-snapshot" 27 | pytest_args: "-m 'not slow'" 28 | # For latest stable version, include "slow" tests 29 | - postgresql_version: 15 30 | experimental: false 31 | repo: "pgdg" 32 | pytest_args: "" 33 | env: 34 | PGVERSION: ${{ matrix.postgresql_version }} 35 | DISTRO: ubuntu 36 | steps: 37 | 38 | - id: checkout 39 | uses: actions/checkout@v2 40 | with: 41 | # Do not persist the token during execution of this job. 42 | persist-credentials: false 43 | 44 | - id: dependencies 45 | run: | 46 | # Must be installed via the system 47 | sudo apt update 48 | sudo apt install curl ca-certificates gnupg 49 | sudo apt install python3-bpfcc python3-pip libunwind-dev linux-headers-$(uname -r) 50 | curl https://www.postgresql.org/media/keys/ACCC4CF8.asc | gpg --dearmor | sudo tee /etc/apt/trusted.gpg.d/apt.postgresql.org.gpg >/dev/null 51 | sudo sh -c 'echo "deb http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-${{ matrix.repo }} main ${{ matrix.postgresql_version }}" > /etc/apt/sources.list.d/pgdg.list' 52 | sudo apt update 53 | # Install postgresql-common so that update alternatives doesn't fail 54 | sudo apt install postgresql-common postgresql-client-common 55 | sudo apt install postgresql-${{matrix.postgresql_version}} postgresql-${{matrix.postgresql_version}}-dbgsym 56 | sudo pip install -U pip toml 57 | # Install requirements from pyproject.toml 58 | sudo pip install -e '.[test]' 59 | 60 | - id: tests 61 | run: | 62 | sudo pytest --postgresql-exec /usr/lib/postgresql/${{matrix.postgresql_version}}/bin/pg_ctl --cov src/ --cov-report=xml ${{matrix.pytest_args}} 63 | 64 | - name: Upload coverage reports to Codecov 65 | uses: codecov/codecov-action@v3 66 | with: 67 | env_vars: PGVERSION 68 | fail_ci_if_error: true 69 | files: ./coverage.xml 70 | verbose: true 71 | name: codecov-umbrella 72 | 73 | tests_fedora_container: 74 | runs-on: ubuntu-22.04 75 | env: 76 | PGVERSION: 13 77 | DISTRO: fedora 78 | steps: 79 | - id: checkout 80 | uses: actions/checkout@v2 81 | with: 82 | persist-credentials: false 83 | - id: dependencies 84 | run: | 85 | sudo apt update 86 | sudo apt install dnf systemd-container 87 | sudo apt install postgresql-client 88 | sudo ./tests/scripts/setup_fedora_container.sh 89 | sudo apt install curl ca-certificates gnupg 90 | sudo apt install python3-bpfcc python3-pip libunwind-dev linux-headers-$(uname -r) 91 | # Also install it in the host, for the tests running outside the 92 | # container 93 | sudo pip install -U pip toml 94 | sudo pip install -e '.[test]' 95 | 96 | - id: fedora_tests 97 | run: | 98 | sudo pytest --postgresql-host 172.16.0.1 
--container fedora --cov src/ --cov-report=xml -m "not slow" 99 | 100 | - name: Upload coverage reports to Codecov 101 | uses: codecov/codecov-action@v3 102 | with: 103 | env_vars: PGVERSION, DISTRO 104 | fail_ci_if_error: true 105 | files: ./coverage.xml 106 | verbose: true 107 | name: codecov 108 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.egg-info 3 | build 4 | *.tmp 5 | .coverage 6 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pycqa/isort 3 | rev: 5.12.0 4 | hooks: 5 | - id: isort 6 | name: isort (python) 7 | - repo: https://github.com/psf/black 8 | rev: 23.1.0 9 | hooks: 10 | - id: black 11 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MESSAGES CONTROL] 2 | disable=too-few-public-methods 3 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 
45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | opensource@aiven.io. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 
123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 129 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Welcome! 2 | 3 | Contributions are very welcome on pgtracer. When contributing please keep this in mind: 4 | 5 | - Open an issue to discuss new bigger features. 6 | - Write code consistent with the project style and make sure the tests are passing. 7 | - Stay in touch with us if we have follow up questions or requests for further changes. 8 | 9 | # Development 10 | 11 | ## Local Environment 12 | 13 | 14 | ## Tests 15 | 16 | 17 | ## Static checking and Linting 18 | 19 | 20 | ## Manual testing 21 | 22 | 23 | ### Configuration 24 | 25 | 26 | # Opening a PR 27 | 28 | - Commit messages should describe the changes, not the filenames. Win our admiration by following 29 | the [excellent advice from Chris Beams](https://chris.beams.io/posts/git-commit/) when composing 30 | commit messages. 31 | - Choose a meaningful title for your pull request. 32 | - The pull request description should focus on what changed and why. 33 | - Check that the tests pass (and add test coverage for your changes if appropriate). 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | PGTracer 2 | Copyright (C) 2022 Aiven 3 | 4 | This program is free software: you can redistribute it and/or modify 5 | it under the terms of the GNU General Public License as published by 6 | the Free Software Foundation, either version 3 of the License, or 7 | (at your option) any later version. 8 | 9 | This program is distributed in the hope that it will be useful, 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 | GNU General Public License for more details. 13 | 14 | You should have received a copy of the GNU General Public License 15 | along with this program. If not, see . 16 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include src/pgtracer/ebpf/code/*.c 2 | include src/pgtracer/ebpf/code/*.h 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | PGTracer 2 | ======== 3 | 4 | PGTracer is a collection of tools to trace queries, execution plans and more in 5 | PostgreSQL®, using eBPF. 6 | 7 | Overview 8 | ======== 9 | 10 | PGTracer offers a way to instrument PostgreSQL, using the Linux eBPF facility. 11 | As it does advanced memory access, it needs the PostgreSQL debug symbols to 12 | resolve symbols and offsets in structs. 
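
For instance, the struct member offsets pgtracer needs can be recovered from
the DWARF information with pyelftools. The snippet below is a minimal,
standalone sketch of that idea (it is not pgtracer's actual `dwarf.py`
implementation, and the postgres binary path is only an example):

```
from elftools.elf.elffile import ELFFile

def member_offset(binary, struct_name, member_name):
    """Return the byte offset of a struct member, or None if not found."""
    with open(binary, "rb") as stream:
        elf = ELFFile(stream)
        if not elf.has_dwarf_info():
            return None
        for cu in elf.get_dwarf_info().iter_CUs():
            for die in cu.iter_DIEs():
                if die.tag != "DW_TAG_structure_type":
                    continue
                name = die.attributes.get("DW_AT_name")
                if name is None or name.value != struct_name.encode():
                    continue
                for member in die.iter_children():
                    if member.tag != "DW_TAG_member":
                        continue
                    mname = member.attributes.get("DW_AT_name")
                    if mname is not None and mname.value == member_name.encode():
                        # Usually a plain constant offset; some producers emit
                        # a location expression instead, ignored in this sketch.
                        return member.attributes["DW_AT_data_member_location"].value
    return None

# For example, on a Debian-style layout:
# member_offset("/usr/lib/postgresql/15/bin/postgres", "QueryDesc", "sourceText")
```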
13 |
14 | Features
15 | ============
16 |
17 | * Attach to a running PostgreSQL backend, and dump every executed query along
18 |   with its search path
19 | * Optionally turn on instrumentation (just like EXPLAIN ANALYZE does) to gather
20 |   more information
21 |
22 | Planned features:
23 | * Gather information about individual execution nodes to print query plans
24 | * Gather system information and link it to individual nodes (think syscalls,
25 |   IO, memory allocation...)
26 | * Build a TUI to explore the data
27 | * Allow following a transaction
28 |
29 |
30 | Install
31 | ============
32 |
33 | You will need a running PostgreSQL install, and its debug symbols.
34 |
35 | For pgtracer itself you will need:
36 | - libunwind installed on the system
37 | - the [BPF Compiler Collection](https://github.com/iovisor/bcc/blob/master/INSTALL.md)
38 | - several python packages as dependencies:
39 |   - `pypsutil`
40 |   - `pyelftools`
41 |
42 | Support will vary depending on your Linux distribution, kernel version, and
43 | library versions, as well as how PostgreSQL was compiled.
44 |
45 | Please file a bug if it doesn't work as expected.
46 |
47 | Ubuntu
48 | ------------
49 |
50 | To install the debug symbols, install the `postgresql-<version>-dbgsym` package. You may have to enable additional repositories though.
51 |
52 | To run pgtracer you will need some Python packages, as well as packages only available from the distribution repositories:
53 |
54 | ```
55 | apt install python3-bpfcc python3-pip libunwind-dev
56 | ```
57 |
58 | Then upgrade pip using pip:
59 |
60 | ```
61 | pip install pip --upgrade
62 | ```
63 |
64 | And you are now ready to install the pgtracer package itself:
65 |
66 | ```
67 | git clone https://github.com/aiven/pgtracer.git
68 | cd pgtracer
69 | pip install .
70 | ```
71 |
72 |
73 | Fedora
74 | ---------
75 |
76 | To install the debugging symbols:
77 |
78 | ```
79 | yum install dnf-utils
80 | debuginfo-install postgresql-server
81 | ```
82 |
83 | For the dependencies:
84 |
85 | ```
86 | yum install python3-bcc libunwind python3-pip libunwind-devel
87 | ```
88 |
89 | Then install pgtracer itself:
90 |
91 | ```
92 | git clone https://github.com/aiven/pgtracer.git
93 | cd pgtracer
94 | pip install pip --upgrade
95 | pip install .
96 | ```
97 |
98 |
99 |
100 | Arch Linux
101 | ------------
102 |
103 | To install PostgreSQL debug symbols, as root:
104 |
105 | ```
106 | pacman -S debuginfod
107 | export DEBUGINFOD_URLS="https://debuginfod.archlinux.org/"
108 | debuginfod-find debuginfo /usr/bin/postgres
109 | ```
110 |
111 | To install the required packages:
112 |
113 | ```
114 | pacman -S python-bcc libunwind python-pip
115 | ```
116 |
117 | Then install the pgtracer package itself:
118 |
119 | ```
120 | git clone https://github.com/aiven/pgtracer.git
121 | cd pgtracer
122 | pip install .
123 | ```
124 |
125 |
126 | Usage
127 | =============
128 |
129 | Two scripts come with pgtracer, `pgtrace_queries` and `pgtrace_gucs`; `pgtrace_queries` is described below.
130 | Since pgtracer uses eBPF, it needs to be run as root.
131 |
132 | ```
133 | usage: pgtrace_queries [-h] [--instrument [{TIMER,BUFFERS,ROWS,WAL,ALL} ...]] [--nodes-collection] pid
134 |
135 | Dump a running backend execution plan
136 |
137 | positional arguments:
138 |   pid                   PID to connect to
139 |
140 | options:
141 |   -h, --help            show this help message and exit
142 |   --instrument [{TIMER,BUFFERS,ROWS,WAL,ALL} ...], -I [{TIMER,BUFFERS,ROWS,WAL,ALL} ...]
143 |                         Instrument flags to set. (warning: writes into backends memory!)
144 |   --nodes-collection, -n
145 |                         Collect information about individual execution nodes
146 | ```
147 |
148 |
149 |
150 | Depending on the way the PostgreSQL binary has been compiled, you may need a
151 | more recent pyelftools version than the one packaged with your distribution:
152 | DWARF5 support is quite recent and continuously improving.
153 |
154 |
155 |
156 |
157 |
158 | License
159 | =======
160 | pgtracer is licensed under the PostgreSQL license. Full license text is available in the [LICENSE](LICENSE) file.
161 |
162 | Please note that the project explicitly does not require a CLA (Contributor License Agreement) from its contributors.
163 |
164 | Contact
165 | ============
166 | Bug reports and patches are very welcome; please post them as GitHub issues and pull requests at https://github.com/aiven/pgtracer .
167 | To report any possible vulnerabilities or other serious issues please see our [security](SECURITY.md) policy.
168 |
169 | Trademarks
170 | ==========
171 |
172 | The terms Postgres and PostgreSQL are registered trademarks of the PostgreSQL Community Association of Canada.
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 | # Security Policy
2 |
3 | ## Supported Versions
4 |
5 | We release patches for security vulnerabilities. Which versions are eligible
6 | to receive such patches depends on the CVSS v3.0 Rating:
7 |
8 | | CVSS v3.0 | Supported Versions  |
9 | | --------- | ------------------- |
10 | | 4.0-10.0  | Most recent release |
11 |
12 | ## Reporting a Vulnerability
13 |
14 | Please report (suspected) security vulnerabilities to our **[bug bounty
15 | program](https://hackerone.com/aiven_ltd)**. You will receive a response from
16 | us within 2 working days. If the issue is confirmed, we will release a patch as
17 | soon as possible depending on impact and complexity.
18 |
19 | ## Qualifying Vulnerabilities
20 |
21 | Any reproducible vulnerability that has a severe effect on the security or
22 | privacy of our users is likely to be in scope for the program.
23 |
24 | We generally **aren't** interested in the following issues:
25 | * Social engineering (e.g. phishing, vishing, smishing) attacks
26 | * Brute force, DoS, text injection
27 | * Missing best practices such as HTTP security headers (CSP, X-XSS, etc.),
28 |   email (SPF/DKIM/DMARC records), SSL/TLS configuration.
29 | * Software version disclosure / Banner identification issues / Descriptive
30 |   error messages or headers (e.g. stack traces, application or server errors).
31 | * Clickjacking on pages with no sensitive actions
32 | * Theoretical vulnerabilities where you can't demonstrate a significant
33 |   security impact with a proof of concept.
34 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | python_version = 3.7 3 | 4 | [mypy-elftools.*] 5 | ignore_missing_imports = True 6 | 7 | [mypy-psutil.*] 8 | ignore_missing_imports = True 9 | 10 | [mypy-bcc.*] 11 | ignore_missing_imports = True 12 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "pgtracer" 3 | description = "Tracing tools for PostgreSQL" 4 | version = "0.1.0" 5 | authors = [ 6 | { name = "Ronan Dunklau", email = "ronan.dunklau@aiven.com" } 7 | ] 8 | dependencies = [ 9 | "pyelftools", 10 | "pypsutil" 11 | ] 12 | readme = "README.md" 13 | requires-python = ">=3.7" 14 | classifiers = [ 15 | "Programming Language :: Python :: 3", 16 | "License :: OSI Approved :: PostgreSQL License", 17 | "Operating System :: POSIX :: Linux", 18 | ] 19 | 20 | 21 | [project.scripts] 22 | pgtrace_queries = "pgtracer.scripts.pgtrace_queries:main" 23 | pgtrace_gucs = "pgtracer.scripts.pgtrace_gucs:main" 24 | 25 | [project.optional-dependencies] 26 | lint = [ 27 | 'black', 28 | 'isort', 29 | 'mypy', 30 | 'pylint', 31 | 'pylint-exit', 32 | ] 33 | 34 | test = [ 35 | 'psycopg', 36 | 'pytest', 37 | 'pytest-coverage', 38 | 'pytest-postgresql', 39 | 'flaky' 40 | ] 41 | 42 | [tool.isort] 43 | profile = "black" 44 | -------------------------------------------------------------------------------- /src/pgtracer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Aiven-Open/pgtracer/5faf30a695f7b45711e987df8a82c904a36e581f/src/pgtracer/__init__.py -------------------------------------------------------------------------------- /src/pgtracer/ebpf/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Aiven-Open/pgtracer/5faf30a695f7b45711e987df8a82c904a36e581f/src/pgtracer/ebpf/__init__.py -------------------------------------------------------------------------------- /src/pgtracer/ebpf/code/block_rq.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "data.h" 3 | #include "utils.h" 4 | 5 | struct io_req_data_t { 6 | event_base event_base; 7 | char rwbs[8]; 8 | u64 bytes; 9 | }; 10 | 11 | 12 | TRACEPOINT_PROBE(block, block_rq_issue) 13 | { 14 | struct io_req_data_t *event; 15 | ##CHECK_POSTMASTER## 16 | /* We need to filter on pid ourselves inside syscalls. 
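 * Unlike uprobes, which are attached to a single binary, kernel tracepoints
 * fire for every process on the system.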
*/ 17 | #ifdef PID 18 | if (bpf_get_current_pid_tgid() >> 32 != PID) 19 | return 1; 20 | #endif 21 | 22 | event = event_ring.ringbuf_reserve(sizeof(struct io_req_data_t)); 23 | if (!event) 24 | return 1; 25 | 26 | fill_event_base(&(event->event_base), EventTypeKBlockRqIssue); 27 | event->bytes = args->nr_sector << 9; 28 | if (event->bytes == 0) { 29 | event->bytes = args->bytes; 30 | } 31 | bpf_probe_read(&event->rwbs, sizeof(event->rwbs), args->rwbs); 32 | event_ring.ringbuf_submit(event, 0); 33 | return 0; 34 | } 35 | -------------------------------------------------------------------------------- /src/pgtracer/ebpf/code/data.h: -------------------------------------------------------------------------------- 1 | #ifndef DATA_H 2 | #define DATA_H 3 | #include "stack.h" 4 | 5 | typedef struct event_base { 6 | short event_type; 7 | int pid; 8 | } event_base; 9 | 10 | typedef struct Id128 { 11 | u64 u1; 12 | u64 u2; 13 | } Id128; 14 | 15 | struct portal_data_t { 16 | event_base event_base; 17 | Id128 portal_key; 18 | u64 queryAddr; 19 | u64 query_id; 20 | double startup_cost; 21 | double total_cost; 22 | double plan_rows; 23 | char query[MAX_QUERY_LENGTH]; // Dynamically injected using defines 24 | char instrument[STRUCT_SIZE_Instrumentation]; // Dynamically injected using defines 25 | char search_path[MAX_SEARCHPATH_LENGTH]; 26 | }; 27 | 28 | struct plan_data_t { 29 | u64 plan_addr; 30 | int plan_tag; 31 | double startup_cost; 32 | double total_cost; 33 | double plan_rows; 34 | int plan_width; 35 | bool parallel_aware; 36 | }; 37 | 38 | struct planstate_data_t { 39 | event_base event_base; 40 | Id128 portal_key; 41 | u64 planstate_addr; 42 | int planstate_tag; 43 | u64 lefttree; 44 | u64 righttree; 45 | struct plan_data_t plan_data; 46 | char instrument[STRUCT_SIZE_Instrumentation]; // Dynamically injected using defines 47 | struct stack_data_t stack_capture; 48 | }; 49 | 50 | #endif 51 | -------------------------------------------------------------------------------- /src/pgtracer/ebpf/code/ebpf_maps.h: -------------------------------------------------------------------------------- 1 | #ifndef EBPF_MAPS_H 2 | #define EBPF_MAPS_H 3 | /* Main ringbuf for communicating events to user space. */ 4 | BPF_RINGBUF_OUTPUT(event_ring, EVENTRING_PAGE_SIZE); 5 | 6 | #endif 7 | -------------------------------------------------------------------------------- /src/pgtracer/ebpf/code/gucset.c: -------------------------------------------------------------------------------- 1 | #include "ebpf_maps.h" 2 | 3 | struct guc_request_t { 4 | u64 guc_location; 5 | int guc_size; 6 | char payload[GUC_MAX_LENGTH]; 7 | }; 8 | 9 | struct guc_response_t { 10 | short event_type; 11 | u64 guc_location; 12 | bool status; 13 | }; 14 | 15 | BPF_QUEUE(gucs_to_set, struct guc_request_t, 128); 16 | 17 | 18 | /* This will be attached at various points in the program flow, 19 | * to override GUCs as seen fit. 
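 * bpf_probe_write_user() can only write to the current task's memory, so the
 * write must happen from a probe executed by the target backend itself:
 * userland only queues requests in gucs_to_set and waits for the responses.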
20 |  * */
21 | int process_guc_uprobe(struct pt_regs *ctx)
22 | {
23 |     struct guc_request_t guc_request;
24 |     struct guc_response_t *guc_response;
25 |     int i = 0;
26 |     int size = 0;
27 |     int ret;
28 |     while (i < 20)
29 |     {
30 |         guc_response = event_ring.ringbuf_reserve(sizeof(struct guc_response_t));
31 |         if (!guc_response)
32 |             return 1;
33 |         guc_response->event_type = EventTypeGUCResponse;
34 |
35 |         /* If there is no request to process, bail out */
36 |         if (gucs_to_set.pop(&guc_request) < 0)
37 |         {
38 |             event_ring.ringbuf_discard(guc_response, 0);
39 |             return 1;
40 |         }
41 |         guc_response->guc_location = guc_request.guc_location;
42 |         size = guc_request.guc_size;
43 |         clamp_umax(size, GUC_MAX_LENGTH);
44 |         ret = -1;
45 |         if (size > 0 && guc_request.guc_size <= GUC_MAX_LENGTH)
46 |             ret = bpf_probe_write_user((void *) guc_request.guc_location, &(guc_request.payload), size);
47 |         guc_response->status = (ret >= 0);
48 |         event_ring.ringbuf_submit(guc_response, 0);
49 |         i++;
50 |     }
51 |     return 0;
52 | }
--------------------------------------------------------------------------------
/src/pgtracer/ebpf/code/memusage.c:
--------------------------------------------------------------------------------
1 | #include "ebpf_maps.h"
2 | #include "stack.h"
3 | #include "linux/sched.h"
4 | #include "utils.h"
5 | #include "data.h"
6 |
7 | #define offsetof(type, member) __builtin_offsetof (type, member)
8 |
9 |
10 | struct memory_account_t {
11 |     event_base event_base;
12 |     long long size;
13 |     short kind;
14 | };
15 |
16 |
17 | static inline int send_memory_account(long long size, short kind)
18 | {
19 |     struct memory_account_t *account = event_ring.ringbuf_reserve(sizeof(struct memory_account_t));
20 |     if (!account)
21 |         return 1;
22 |     fill_event_base(&(account->event_base), EventTypeMemoryAccount);
23 |     account->size = size;
24 |     account->kind = kind;
25 |     event_ring.ringbuf_submit(account, 0);
26 |     return 0;
27 | }
28 |
29 | /*
30 |  * sbrk moves are instrumented through glibc's convenient USDT tracepoints.
31 |  */
32 | int sbrk_more(struct pt_regs *ctx)
33 | {
34 |     ##CHECK_POSTMASTER##
35 |     size_t size;
36 |     bpf_usdt_readarg(2, ctx, &size);
37 |     return send_memory_account(size, MemoryAllocTypeSbrk);
38 | }
39 |
40 | int sbrk_less(struct pt_regs *ctx)
41 | {
42 |     ##CHECK_POSTMASTER##
43 |     size_t size;
44 |     bpf_usdt_readarg(2, ctx, &size);
45 |     return send_memory_account(-size, MemoryAllocTypeSbrk);
46 | }
47 |
48 | /*
49 |  * glibc doesn't offer tracepoints for mmap, so instrument the functions directly.
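 * The uprobes below read the mapping length from the second function
 * argument (PT_REGS_PARM2) on entry.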
50 |  */
51 |
52 | int mmap_enter(struct pt_regs *ctx)
53 | {
54 |     ##CHECK_POSTMASTER##
55 |     size_t size = PT_REGS_PARM2(ctx);
56 |     return send_memory_account(size, MemoryAllocTypeMmap);
57 | }
58 |
59 | int munmap_enter(struct pt_regs *ctx)
60 | {
61 |     ##CHECK_POSTMASTER##
62 |     size_t size = PT_REGS_PARM2(ctx);
63 |     return send_memory_account(-size, MemoryAllocTypeMmap);
64 | }
--------------------------------------------------------------------------------
/src/pgtracer/ebpf/code/perf.c:
--------------------------------------------------------------------------------
1 | #include "ebpf_maps.h"
2 | #include "stack.h"
3 | #include "uapi/linux/bpf_perf_event.h"
4 | #include "utils.h"
5 | #include "data.h"
6 |
7 | struct memory_request_t {
8 |     short event_type;
9 |     Id128 request_id;
10 |     int path_size;
11 |     u64 size;
12 |     u64 memory_path[MEMORY_PATH_SIZE];
13 | };
14 |
15 | struct memory_response_t {
16 |     event_base event_base;
17 |     Id128 request_id;
18 |     char payload[MEMORY_REQUEST_MAXSIZE];
19 | };
20 |
21 | /*
22 |  * We embed the whole portal_data_t
23 |  */
24 | struct stack_sample_t {
25 |     struct portal_data_t portal_data;
26 |     struct stack_data_t stack_data;
27 | };
28 |
29 | # define QUERY_DISCOVERY_KEY 1
30 | # define NODE_DISCOVERY_KEY 2
31 | BPF_HASH(discovery_enabled, int, bool, 2);
32 |
33 | BPF_QUEUE(memory_requests, struct memory_request_t, 1024);
34 | /* Define one queue per process */
35 | #if LIBBCC_VERSION_GEQ(0, 21, 0)
36 | BPF_HASH_OF_MAPS(pid_queues, int, "memory_requests", 1024);
37 | #else
38 | BPF_HASH_OF_MAPS(pid_queues, "memory_requests", 1024);
39 | #endif
40 |
41 | /*
42 |  * This code runs on a perf event, at a configured sampling frequency.
43 |  * What we want is to be able to read specific memory locations whenever the
44 |  * perf event is triggered.
45 |  *
46 |  * Userland code pushes memory locations to read onto the memory_requests
47 |  * queues, and the responses are sent back through the same event ring buffer
48 |  * used everywhere.
49 |  */
50 | int perf_event(struct bpf_perf_event_data *ctx)
51 | {
52 |     ##CHECK_POSTMASTER##
53 |     struct memory_request_t request;
54 |     struct memory_response_t *response;
55 |     u64 size;
56 |     u64 memory_location;
57 |     int pid = (bpf_get_current_pid_tgid() >> 32);
58 |     int i = 0;
59 |     int j;
60 |     void * queue;
61 | #ifdef ENABLE_QUERY_DISCOVERY
62 |     int key = QUERY_DISCOVERY_KEY;
63 |     bool *need_discovery;
64 |     need_discovery = discovery_enabled.lookup(&key);
65 |     bool need_query = (need_discovery && *need_discovery);
66 |     key = NODE_DISCOVERY_KEY;
67 |     need_discovery = discovery_enabled.lookup(&key);
68 |     bool need_node = (need_discovery && *need_discovery);
69 |     if (need_query || need_node)
70 |     {
71 |         void *activeportal = 0;
72 |         bpf_probe_read_user(&activeportal,
73 |                             sizeof(void*),
74 |                             (void *) GlobalVariablesActivePortal);
75 |         /* Only proceed if we have a current query. */
76 |         if(activeportal != 0)
77 |         {
78 |             struct stack_sample_t *stack_sample = event_ring.ringbuf_reserve(sizeof(struct stack_sample_t));
79 |
80 |             /*
81 |              * If we can't allocate for the stack sample, we keep going to the memory request code.
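             * The reservation fails when the ring buffer is full; dropping an
             * occasional sample is preferable to blocking the perf handler.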
82 | */ 83 | if (stack_sample) 84 | { 85 | fill_event_base(&(stack_sample->portal_data.event_base), EventTypeStackSample); 86 | if (need_query) 87 | { 88 | void *queryDesc = 0; 89 | bpf_probe_read_user(&queryDesc, sizeof(void *), 90 | OffsetFrom(activeportal, PortalData, queryDesc)); 91 | fill_portal_data(queryDesc, &stack_sample->portal_data); 92 | stack_sample->portal_data.portal_key = get_portal_key(activeportal); 93 | } 94 | if (need_node) 95 | { 96 | capture_stack(&(ctx->regs), &(stack_sample->stack_data), MAX_STACK_READ); 97 | } 98 | event_ring.ringbuf_submit(stack_sample, 0); 99 | } 100 | } 101 | } 102 | #endif 103 | queue = pid_queues.lookup(&pid); 104 | if (!queue) 105 | return 0; 106 | while (i < 5) 107 | { 108 | 109 | /* No more requests to process. */ 110 | if (bpf_map_pop_elem(queue, &request) < 0) 111 | { 112 | return 0; 113 | } 114 | 115 | size = request.size; 116 | /* We treat those specially, as we have the opportunity to gather a bunch of 117 | * data at the same time. 118 | */ 119 | if (request.event_type == EventTypeMemoryNodeData) 120 | { 121 | struct planstate_data_t *response = event_ring.ringbuf_reserve(sizeof(struct planstate_data_t)); 122 | if (!response) 123 | return 1; 124 | fill_event_base(&(response->event_base), EventTypeMemoryNodeData); 125 | record_node((void *) request.memory_path[0], response, NULL, false); 126 | event_ring.ringbuf_submit(response, 0); 127 | i++; 128 | continue; 129 | } 130 | response = event_ring.ringbuf_reserve(sizeof(struct memory_response_t)); 131 | if (!response) 132 | return 1; 133 | 134 | fill_event_base(&(response->event_base), request.event_type); 135 | if (size >= MEMORY_REQUEST_MAXSIZE) 136 | size = MEMORY_REQUEST_MAXSIZE; 137 | /* 138 | * request.path_size can't be greater than MEMORY_PATH_SIZE, 139 | * but the eBPF verifier doesn't know this. 140 | */ 141 | memory_location = 0; 142 | j = 0; 143 | /* Chase pointers as needed */ 144 | while(j < request.path_size - 1 && j < MEMORY_PATH_SIZE) 145 | { 146 | if (memory_location != 0) 147 | { 148 | if(bpf_probe_read_user(&memory_location, sizeof(u64), 149 | (void *) memory_location)) 150 | { 151 | /* We failed to read here, so bail out. 
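                 * One of the chased pointers may be stale or unmapped in the
                 * target process.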
*/ 152 | event_ring.ringbuf_discard(response, 0); 153 | return 0; 154 | } 155 | } 156 | memory_location = request.memory_path[j] + memory_location; 157 | j++; 158 | } 159 | if (bpf_probe_read_user(&response->payload, size, (void *) memory_location)) 160 | { 161 | event_ring.ringbuf_discard(response, 0); 162 | } else { 163 | response->request_id = request.request_id; 164 | event_ring.ringbuf_submit(response, 0); 165 | } 166 | i++; 167 | } 168 | return 0; 169 | } 170 | -------------------------------------------------------------------------------- /src/pgtracer/ebpf/code/plan.c: -------------------------------------------------------------------------------- 1 | #include "data.h" 2 | #include "utils.h" 3 | #include "stack.h" 4 | 5 | int execprocnodefirst_enter(struct pt_regs *ctx); 6 | int execendnode_enter(struct pt_regs *ctx); 7 | 8 | /* 9 | * On each first execution of a node, send the node information to the client 10 | * side 11 | */ 12 | int execprocnodefirst_enter(struct pt_regs *ctx) 13 | { 14 | ##CHECK_POSTMASTER## 15 | struct planstate_data_t *node; 16 | node = event_ring.ringbuf_reserve(sizeof(struct planstate_data_t)); 17 | if (!node) 18 | return 0; 19 | fill_event_base(&(node->event_base), EventTypeExecProcNodeFirst); 20 | record_node((void *) PT_REGS_PARM1(ctx), node, ctx, true); 21 | event_ring.ringbuf_submit(node, 0); 22 | return 0; 23 | } 24 | 25 | /* 26 | * On each node teardown, send the node information to the client side (again) 27 | */ 28 | int execendnode_enter(struct pt_regs *ctx) 29 | { 30 | ##CHECK_POSTMASTER## 31 | struct planstate_data_t *node; 32 | node = event_ring.ringbuf_reserve(sizeof(struct planstate_data_t)); 33 | if (!node) 34 | return 0; 35 | fill_event_base(&(node->event_base), EventTypeExecEndNode); 36 | record_node((void *) PT_REGS_PARM1(ctx), node, ctx, true); 37 | event_ring.ringbuf_submit(node, 0); 38 | return 0; 39 | } 40 | 41 | -------------------------------------------------------------------------------- /src/pgtracer/ebpf/code/program.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "ebpf_maps.h" 3 | #include "data.h" 4 | #include "utils.h" 5 | 6 | static int override_instrument_options(void * querydesc); 7 | 8 | int executorstart_enter(struct pt_regs *ctx) 9 | { 10 | ##CHECK_POSTMASTER## 11 | void *queryDesc = (void *) PT_REGS_PARM1(ctx); 12 | #ifdef USER_INSTRUMENT_FLAGS 13 | override_instrument_options(queryDesc); 14 | #endif 15 | return 0; 16 | } 17 | 18 | int executorrun_enter(struct pt_regs *ctx) 19 | { 20 | u64 ppid; 21 | ##CHECK_POSTMASTER## 22 | void *queryDesc = (void *) PT_REGS_PARM1(ctx); 23 | void *sourceText; 24 | void *portaladdr; 25 | void *search_path; 26 | void *plan; 27 | 28 | struct portal_data_t *event; 29 | bpf_probe_read_user(&portaladdr, 30 | sizeof(void*), 31 | (void *) GlobalVariablesActivePortal); 32 | Id128 key = get_portal_key(portaladdr); 33 | event = event_ring.ringbuf_reserve(sizeof(struct portal_data_t)); 34 | if (!event) 35 | return 1; 36 | fill_event_base(&(event->event_base), EventTypeExecutorRun); 37 | event->portal_key = key; 38 | fill_portal_data(queryDesc, event); 39 | bpf_probe_read_user(&search_path, sizeof(void *), (void *) GlobalVariablesnamespace_search_path); 40 | bpf_probe_read_user_str(&event->search_path, MAX_SEARCHPATH_LENGTH, 41 | search_path); 42 | event_ring.ringbuf_submit(event, 0); 43 | return 0; 44 | } 45 | 46 | int executorfinish_enter(struct pt_regs *ctx) 47 | { 48 | ##CHECK_POSTMASTER## 49 | void *portal; 50 | void 
*queryDesc = (void *) PT_REGS_PARM1(ctx); 51 | struct portal_data_t *event; 52 | Id128 key; 53 | bpf_probe_read_user(&portal, 54 | sizeof(void*), 55 | (void *) GlobalVariablesActivePortal); 56 | 57 | key = get_portal_key((void*) portal); 58 | event = event_ring.ringbuf_reserve(sizeof(struct portal_data_t)); 59 | if (!event) 60 | return 1; 61 | init_portal_data(event); 62 | fill_portal_data(queryDesc, event); 63 | fill_event_base(&(event->event_base), EventTypeExecutorFinish); 64 | event->portal_key = key; 65 | event_ring.ringbuf_submit(event, 0); 66 | return 0; 67 | } 68 | 69 | int portaldrop_return(struct pt_regs *ctx) 70 | { 71 | ##CHECK_POSTMASTER## 72 | struct portal_data_t *event; 73 | Id128 key = {0}; 74 | event = event_ring.ringbuf_reserve(sizeof(struct portal_data_t)); 75 | if (!event) 76 | return 1; 77 | init_portal_data(event); 78 | fill_event_base(&(event->event_base), EventTypeDropPortalReturn); 79 | event->portal_key = key; 80 | event_ring.ringbuf_submit(event, 0); 81 | return 0; 82 | } 83 | 84 | int portaldrop_enter(struct pt_regs *ctx) 85 | { 86 | ##CHECK_POSTMASTER## 87 | void *portal = (void *) PT_REGS_PARM1(ctx); 88 | Id128 key = get_portal_key(portal); 89 | struct portal_data_t *event; 90 | void *queryDesc; 91 | event = event_ring.ringbuf_reserve(sizeof(struct portal_data_t)); 92 | if (!event) 93 | return 1; 94 | init_portal_data(event); 95 | bpf_probe_read_user(&queryDesc, sizeof(void *), 96 | OffsetFrom(portal, PortalData, queryDesc)); 97 | fill_portal_data(queryDesc, event); 98 | fill_event_base(&(event->event_base), EventTypeDropPortalEnter); 99 | event->portal_key = key; 100 | event_ring.ringbuf_submit(event, 0); 101 | return 0; 102 | } 103 | 104 | /* When instrumenting a whole cluster, we also trace new processes. 105 | * Additionally, specific collectors can embed code in here. 
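 * The sched_process_fork tracepoint below reports children of the postmaster
 * as soon as they are spawned, without any polling.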
106 | */ 107 | #ifdef POSTMASTER_PID 108 | TRACEPOINT_PROBE(sched, sched_process_fork) 109 | { 110 | u32 pid = args->parent_pid; 111 | if (args->parent_pid != POSTMASTER_PID) 112 | return 0; 113 | struct event_base *event; 114 | event = event_ring.ringbuf_reserve(sizeof (struct event_base)); 115 | if (!event) 116 | return 1; 117 | event->pid = args->child_pid; 118 | event->event_type = EventTypeProcessFork; 119 | event_ring.ringbuf_submit(event, 0); 120 | return 0; 121 | } 122 | #endif 123 | 124 | TRACEPOINT_PROBE(sched, sched_process_exit) 125 | { 126 | ##CHECK_POSTMASTER## 127 | #ifdef PID 128 | if (bpf_get_current_pid_tgid() >> 32 != PID) 129 | return 1; 130 | #endif 131 | struct event_base *event; 132 | event = event_ring.ringbuf_reserve(sizeof (struct event_base)); 133 | if (!event) 134 | return 1; 135 | fill_event_base(event, EventTypeProcessExit); 136 | event_ring.ringbuf_submit(event, 0); 137 | return 0; 138 | } 139 | 140 | 141 | #ifdef USER_INSTRUMENT_FLAGS 142 | static int override_instrument_options(void * querydesc) 143 | { 144 | void * options_addr = OffsetFrom(querydesc, QueryDesc, instrument_options); 145 | int instr_options; 146 | bpf_probe_read_user(&instr_options, 147 | sizeof(int), 148 | options_addr); 149 | instr_options |= USER_INSTRUMENT_FLAGS; 150 | return bpf_probe_write_user(options_addr, &instr_options, sizeof(int)); 151 | } 152 | #endif 153 | 154 | #ifdef CAPTURE_PLANS 155 | #include "plan.h" 156 | #endif 157 | -------------------------------------------------------------------------------- /src/pgtracer/ebpf/code/stack.h: -------------------------------------------------------------------------------- 1 | #ifndef STACK_H 2 | #define STACK_H 3 | #include 4 | 5 | struct stack_data_t { 6 | u64 rax; 7 | u64 rdx; 8 | u64 rcx; 9 | u64 rbx; 10 | u64 rsi; 11 | u64 rdi; 12 | u64 rbp; 13 | u64 rsp; 14 | u64 r8; 15 | u64 r9; 16 | u64 r10; 17 | u64 r11; 18 | u64 r12; 19 | u64 r13; 20 | u64 r14; 21 | u64 r15; 22 | u64 rip; 23 | u64 size; 24 | u64 start_addr; 25 | char stack[MAX_STACK_READ]; // Dynamically injected using defines 26 | }; 27 | 28 | /* 29 | * Capture the current stack and register values. 
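 * The saved registers and raw stack bytes are later consumed in userland by
 * the DWARF unwinder (see unwind.py) to rebuild the backend's call stack.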
30 |  */
31 | static inline int capture_stack(struct pt_regs *ctx, struct stack_data_t *stack_data, u64 max_read)
32 | {
33 |     int ret = 0;
34 |     stack_data->rax = ctx->ax;
35 |     stack_data->rdx = ctx->dx;
36 |     stack_data->rcx = ctx->cx;
37 |     stack_data->rbx = ctx->bx;
38 |     stack_data->rsi = ctx->si;
39 |     stack_data->rdi = ctx->di;
40 |     stack_data->rbp = ctx->bp;
41 |     stack_data->rsp = ctx->sp;
42 |     stack_data->r8 = ctx->r8;
43 |     stack_data->r9 = ctx->r9;
44 |     stack_data->r10 = ctx->r10;
45 |     stack_data->r11 = ctx->r11;
46 |     stack_data->r12 = ctx->r12;
47 |     stack_data->r13 = ctx->r13;
48 |     stack_data->r14 = ctx->r14;
49 |     stack_data->r15 = ctx->r15;
50 |     stack_data->rip = ctx->ip;
51 |     stack_data->start_addr = stack_data->rsp;
52 |     stack_data->size = (STACK_TOP_ADDR - stack_data->rsp);
53 |     if (stack_data->size > max_read)
54 |         stack_data->size = max_read;
55 |     ret = bpf_probe_read_user(&stack_data->stack,
56 |                               stack_data->size,
57 |                               (void *) (stack_data->rsp));
58 |     if (ret != 0)
59 |     {
60 |         stack_data->size = 0;
61 |     }
62 |     return ret;
63 | }
64 |
65 | #endif
--------------------------------------------------------------------------------
/src/pgtracer/ebpf/code/utils.h:
--------------------------------------------------------------------------------
1 | #ifndef UTILS_H
2 | #define UTILS_H
3 | #define EPOCH_OFFSET ((POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY)
4 |
5 | #define Offset(structname, member) (STRUCT_ ## structname ## _OFFSET_ ## member)
6 | #define OffsetFrom(pointer, structname, member) ((void *) (pointer + Offset(structname, member)))
7 |
8 | /* Reuse code from libbcc for version matching */
9 | #define __LIBBCC_VERSION(a, b, c) (((a) << 16) + ((b) << 8) + ((c) > 255 ? 255 : (c)))
10 | #define LIBBCC_VERSION_CODE __LIBBCC_VERSION(LIBBCC_MAJOR_VERSION, LIBBCC_MINOR_VERSION, LIBBCC_PATCH_VERSION)
11 | #define LIBBCC_VERSION_GEQ(a,b,c) LIBBCC_VERSION_CODE >= __LIBBCC_VERSION(a, b, c)
12 |
13 |
14 | #include "data.h"
15 |
16 | /* Clamp a value to a max value, and make the eBPF verifier happy.
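 * Doing the comparison in inline asm keeps the bound visible to the
 * verifier, where a plain C comparison may be reordered or optimized away.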
 */
17 | #define clamp_umax(VAR, UMAX) \
18 |     asm volatile ( \
19 |         "if %0 <= %[max] goto +1\n" \
20 |         "%0 = %[max]\n" \
21 |         : "+r"(VAR) \
22 |         : [max]"i"(UMAX) \
23 |     )
24 |
25 | static u64 pgts_to_unixts(u64 pgts)
26 | {
27 |     ulong secs = (ulong) pgts / 1000000;
28 |     uint microsecs = (uint) pgts % 1000000;
29 |     return (secs + EPOCH_OFFSET) * 1000000 + microsecs;
30 | }
31 |
32 |
33 | // Handle code related to the portal information capture
34 | static inline Id128 get_portal_key(void * portal)
35 | {
36 |     Id128 ret;
37 |     u64 creation_time;
38 |     __builtin_memset(&ret, 0, sizeof(ret));
39 |     ret.u1 = bpf_get_current_pid_tgid();
40 |     bpf_probe_read_user(&creation_time,
41 |                         sizeof(u64),
42 |                         OffsetFrom(portal, PortalData, creation_time));
43 |     ret.u2 = pgts_to_unixts(creation_time);
44 |     return ret;
45 | }
46 |
47 | static inline void fill_event_base(event_base* event, short event_type)
48 | {
49 |     event->event_type = event_type;
50 |     event->pid = (bpf_get_current_pid_tgid() >> 32);
51 | }
52 |
53 | static inline void fill_portal_data(void * queryDesc, struct portal_data_t* event)
54 | {
55 |     void *sourceText;
56 |     void *planstate;
57 |     void *instrument;
58 |     void *plannedStmt;
59 |     void *plan;
60 |     int ret;
61 |     event->queryAddr = (u64) queryDesc;
62 |     bpf_probe_read_user(&sourceText,
63 |                         sizeof(void *),
64 |                         OffsetFrom(queryDesc, QueryDesc, sourceText));
65 |     bpf_probe_read_user_str(&event->query,
66 |                             MAX_QUERY_LENGTH,
67 |                             (void *) sourceText);
68 |     ret = bpf_probe_read_user(&plannedStmt,
69 |                               sizeof(void *),
70 |                               OffsetFrom(queryDesc, QueryDesc, plannedstmt));
71 |     if (plannedStmt && ret == 0)
72 |     {
73 |         bpf_probe_read_user(&event->query_id,
74 |                             sizeof(u64),
75 |                             OffsetFrom(plannedStmt, PlannedStmt, queryId));
76 |     }
77 |     ret = bpf_probe_read_user(&planstate,
78 |                               sizeof(void *),
79 |                               OffsetFrom(queryDesc, QueryDesc, planstate));
80 |     if (planstate && ret == 0)
81 |     {
82 |         ret = bpf_probe_read_user(&plan, sizeof(void *),
83 |                                   OffsetFrom(planstate, PlanState, plan));
84 |         if (plan && ret == 0)
85 |         {
86 |             bpf_probe_read_user(&event->startup_cost,
87 |                                 sizeof(double),
88 |                                 OffsetFrom(plan, Plan, startup_cost));
89 |             bpf_probe_read_user(&event->total_cost,
90 |                                 sizeof(double),
91 |                                 OffsetFrom(plan, Plan, total_cost));
92 |             bpf_probe_read_user(&event->plan_rows,
93 |                                 sizeof(double),
94 |                                 OffsetFrom(plan, Plan, plan_rows));
95 |         }
96 |         ret = bpf_probe_read_user(&instrument,
97 |                                   sizeof(void *),
98 |                                   OffsetFrom(planstate, PlanState, instrument));
99 |         if (instrument && ret == 0)
100 |         {
101 |             bpf_probe_read_user(&event->instrument,
102 |                                 STRUCT_SIZE_Instrumentation,
103 |                                 instrument);
104 |         }
105 |     }
106 | }
107 |
108 | static inline void init_portal_data(struct portal_data_t* event)
109 | {
110 |     event->query[0] = 0;
111 |     event->instrument[0] = 0;
112 |     event->search_path[0] = 0;
113 | }
114 |
115 | /*
116 |  * Record information about a PlanStateNode
117 |  */
118 | static inline void record_node(void * nodeaddr, struct planstate_data_t *node,
119 |                                struct pt_regs *ctx, bool need_capture_stack)
120 | {
121 |     void *portal;
122 |     void *instrument;
123 |     void *planaddr;
124 |     bpf_probe_read_user(&portal,
125 |                         sizeof(void*),
126 |                         (void *) GlobalVariablesActivePortal);
127 |     node->portal_key = get_portal_key(portal);
128 |     node->planstate_addr = (u64) nodeaddr;
129 |     if (need_capture_stack)
130 |         capture_stack(ctx, &node->stack_capture, MAX_STACK_READ);
131 |
132 |     /* Read the associated Plan node, and its estimates */
133 |     bpf_probe_read_user(&planaddr,
134 |                         sizeof(void *),
135 |
                        OffsetFrom(nodeaddr, PlanState, plan));
136 |     node->plan_data.plan_addr = (u64) planaddr;
137 |     bpf_probe_read_user(&node->plan_data.plan_tag,
138 |                         sizeof(int),
139 |                         OffsetFrom(planaddr, Plan, type));
140 |
141 |     bpf_probe_read_user(&node->plan_data.startup_cost,
142 |                         sizeof(double),
143 |                         OffsetFrom(planaddr, Plan, startup_cost));
144 |     bpf_probe_read_user(&node->plan_data.total_cost,
145 |                         sizeof(double),
146 |                         OffsetFrom(planaddr, Plan, total_cost));
147 |     bpf_probe_read_user(&node->plan_data.plan_rows,
148 |                         sizeof(double),
149 |                         OffsetFrom(planaddr, Plan, plan_rows));
150 |     bpf_probe_read_user(&node->plan_data.plan_width,
151 |                         sizeof(int),
152 |                         OffsetFrom(planaddr, Plan, plan_width));
153 |     bpf_probe_read_user(&node->plan_data.parallel_aware,
154 |                         sizeof(bool),
155 |                         OffsetFrom(planaddr, Plan, parallel_aware));
156 |     /* Read the PlanState node data */
157 |     bpf_probe_read_user(&node->planstate_tag,
158 |                         sizeof(int),
159 |                         OffsetFrom(nodeaddr, PlanState, type));
160 |     bpf_probe_read_user(&node->lefttree,
161 |                         sizeof(void *),
162 |                         OffsetFrom(nodeaddr, PlanState, lefttree));
163 |     bpf_probe_read_user(&node->righttree,
164 |                         sizeof(void *),
165 |                         OffsetFrom(nodeaddr, PlanState, righttree));
166 |     bpf_probe_read_user(&instrument,
167 |                         sizeof(void *),
168 |                         OffsetFrom(nodeaddr, PlanState, instrument));
169 |     if (instrument)
170 |         bpf_probe_read_user(&node->instrument,
171 |                             STRUCT_SIZE_Instrumentation,
172 |                             instrument);
173 | }
174 | #endif
--------------------------------------------------------------------------------
/src/pgtracer/ebpf/collector/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Workhorse for pgtracer.
3 |
4 | The BPFCollector works by combining two things:
5 | - an eBPF program loaded into the kernel, which is built on the fly
6 | - DWARF information extracted from the executable (or a separate debug
7 |   symbols file).
8 | """
9 | from __future__ import annotations
10 |
11 | import ctypes as ct
12 | import os
13 | from dataclasses import dataclass
14 | from enum import IntEnum
15 | from pathlib import Path
16 | from threading import Lock, Thread
17 | from time import sleep
18 | from typing import Any, Callable, Dict, List, Optional, Tuple, Type, TypeVar, Union
19 |
20 | from bcc import BPF, USDT, PerfSWConfig, PerfType
21 | from bcc import __version__ as bcc_version
22 | from bcc import lib as bcclib
23 | from pypsutil import Process
24 |
25 | from ...model import MemoryAllocType, Query
26 | from ..dwarf import DWARFPointer, ProcessMetadata, Struct, get_size
27 | from ..unwind import stack_data_t
28 | from .c_defs import *
29 | from .utils import CODE_BASE_PATH, defines_dict_to_c, intenum_to_c, load_c_file
30 |
31 | BCC_VERSION_TUPLE = tuple(int(part) for part in bcc_version.split("."))
32 |
33 |
34 | class InvalidStateException(Exception):
35 |     """
36 |     Invalid State of a BPFCollector Exception.
37 |
38 |     This Exception occurs when an operation is performed on a BPFCollector
39 |     which is not in the prerequisite state.
40 |     """
41 |
42 |
43 | # pylint: disable=invalid-name
44 | class EventHandler:
45 |     """
46 |     Base class for handling events.
47 |
48 |     The handle_event method dispatches to handle_{EventType} methods if they
49 |     exist. This acts mostly as a namespace to not pollute the BPFCollector
50 |     class itself.
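    For example, an event tagged EventTypeProcessFork is routed to the
    handle_ProcessFork method below.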
51 | """ 52 | 53 | def __init__(self) -> None: 54 | pass 55 | 56 | def handle_event(self, bpf_collector: BPFCollector, event: ct._CData) -> int: 57 | """ 58 | Handle an event from EBPF ringbuffer. 59 | Every event should be tagged with a short int as the first member to 60 | handle it's type. It is then dispatched to the appropriate method, 61 | which will be able to make sense of the actual struct. 62 | """ 63 | # All events should be tagged with the event's type 64 | event_stub = ct.cast(event, ct.POINTER(event_base)).contents 65 | event_type_name = EventType(event_stub.event_type).name 66 | pid = event_stub.pid 67 | method_name = f"handle_{event_type_name}" 68 | method: Callable[[BPFCollector, ct._CData, int], int] = getattr( 69 | self, method_name 70 | ) 71 | return method(bpf_collector, event, pid) 72 | 73 | # pylint: disable=unused-argument 74 | def handle_ProcessExit( 75 | self, bpf_collector: BPFCollector, event: ct._CData, pid: int 76 | ) -> int: 77 | """ 78 | Handle ProcessExit event. 79 | """ 80 | return bpf_collector.cleanup_process(pid) 81 | 82 | # pylint: disable=unused-argument 83 | def handle_ProcessFork( 84 | self, bpf_collector: BPFCollector, event: ct._CData, pid: int 85 | ) -> int: 86 | """ 87 | Handle ProcessEnter event. 88 | """ 89 | return bpf_collector.setup_process(pid) 90 | 91 | 92 | @dataclass 93 | class CollectorOptions: 94 | """ 95 | Base class for BPFCollector Options. 96 | """ 97 | 98 | enable_perf_events: bool = True 99 | sample_freq: int = 1200 100 | 101 | 102 | T = TypeVar("T", bound="BPFCollector") 103 | 104 | 105 | class BPFCollector: 106 | """ 107 | Workhorse for pgtracer. 108 | 109 | This class allows the user to load an EBPF program dynamically generated 110 | using supplied options and extracted metadata about the Postgres 111 | executable. 
112 | """ 113 | 114 | options_cls: Type[CollectorOptions] = CollectorOptions 115 | event_handler_cls: Type[EventHandler] = EventHandler 116 | 117 | ExecEndFuncs = [ 118 | "ExecEndAgg", 119 | "ExecEndAppend", 120 | "ExecEndBitmapAnd", 121 | "ExecEndBitmapHeapScan", 122 | "ExecEndBitmapIndexScan", 123 | "ExecEndBitmapOr", 124 | "ExecEndCteScan", 125 | "ExecEndCustomScan", 126 | "ExecEndForeignScan", 127 | "ExecEndFunctionScan", 128 | "ExecEndGather", 129 | "ExecEndGatherMerge", 130 | "ExecEndGroup", 131 | "ExecEndHash", 132 | "ExecEndHashJoin", 133 | "ExecEndIncrementalSort", 134 | "ExecEndIndexOnlyScan", 135 | "ExecEndIndexScan", 136 | "ExecEndLimit", 137 | "ExecEndLockRows", 138 | "ExecEndMaterial", 139 | "ExecEndMemoize", 140 | "ExecEndMergeAppend", 141 | "ExecEndMergeJoin", 142 | "ExecEndModifyTable", 143 | "ExecEndNamedTuplestoreScan", 144 | "ExecEndNode", 145 | "ExecEndNestLoop", 146 | "ExecEndProjectSet", 147 | "ExecEndRecursiveUnion", 148 | "ExecEndResult", 149 | "ExecEndSampleScan", 150 | "ExecEndSeqScan", 151 | "ExecEndSetOp", 152 | "ExecEndSort", 153 | "ExecEndSubqueryScan", 154 | "ExecEndTableFuncScan", 155 | "ExecEndTidRangeScan", 156 | "ExecEndTidScan", 157 | "ExecEndUnique", 158 | "ExecEndValuesScan", 159 | "ExecEndWindowAgg", 160 | "ExecEndWorkTableScan", 161 | ] 162 | 163 | def __init__( 164 | self, 165 | metadata: ProcessMetadata, 166 | options: Optional[CollectorOptions] = None, 167 | include_children: bool = False, 168 | ): 169 | if options is None: 170 | options = self.options_cls() 171 | self.options = options 172 | self.include_children = include_children 173 | self.anon_map_fds: Dict[int, int] = {} 174 | self.ppid: Optional[int] 175 | if include_children: 176 | self.pid = -1 177 | self.ppid = metadata.pid 178 | else: 179 | self.pid = metadata.pid 180 | self.ppid = None 181 | 182 | self.metadata = metadata 183 | self.program = str(self.metadata.program).encode("utf8") 184 | # Old bcc version don't support global usdt probes, so disable 185 | # memory tracking in that case 186 | if self.include_children is False or BCC_VERSION_TUPLE >= (0, 19, 0): 187 | self.usdt_ctx = USDT(metadata.pid) 188 | self.enable_usdt_probes(self.usdt_ctx) 189 | else: 190 | self.usdt_ctx = None 191 | self.bpf = self.prepare_bpf() 192 | self.setup_bpf_state() 193 | self.event_handler: EventHandler = self.event_handler_cls() 194 | self.update_struct_defs() 195 | self.is_running = False 196 | self.background_thread: Optional[Thread] = None 197 | self.lock = Lock() 198 | self.sample_freq = options.sample_freq 199 | self.backend_type: Optional[IntEnum] = None 200 | 201 | @classmethod 202 | def from_pid( 203 | cls: Type[T], pid: int, options: CollectorOptions = CollectorOptions() 204 | ) -> T: 205 | """ 206 | Build a BPFCollector from a pid. 207 | """ 208 | # FIXME: make this configurable 209 | cache_dir = Path("~/.cache").expanduser() / "pgtracer" 210 | process = Process(pid=pid) 211 | # Check if we are given the postmaster pid, or a backend. 212 | # If our parent is itself a postgres process, then we are instrumenting the whole backend. 213 | pprocess = process.parent() 214 | include_children = bool( 215 | pprocess and pprocess.name() not in ("postgres", "postmaster") 216 | ) 217 | processmetadata = ProcessMetadata(process, cache_dir=cache_dir) 218 | return cls(processmetadata, options, include_children=include_children) 219 | 220 | def update_struct_defs(self) -> None: 221 | """ 222 | Update the ctypes struct definitions from the DWARF metadata. 
223 | 224 | Some C structs used in EBPF must match what is defined by Postgres: 225 | so we build the class dynamically after the DWARF file has been loaded. 226 | """ 227 | global instrument_type # pylint: disable=global-statement 228 | instrument_type = ct.c_byte * self.metadata.structs.Instrumentation.size 229 | # Update global struct definitions with actual sizes 230 | portal_data.update_fields( 231 | { 232 | "query": ct.c_char * MAX_QUERY_LENGTH, 233 | "instrument": instrument_type, 234 | "search_path": ct.c_char * MAX_SEARCHPATH_LENGTH, 235 | } 236 | ) 237 | planstate_data.update_fields({"instrument": instrument_type}) 238 | stack_sample.update_fields({"portal_data": portal_data}) 239 | 240 | @property 241 | def constant_defines(self) -> Dict[str, int]: 242 | """ 243 | Returns a list of constants to add to the ebpf program as #define 244 | directives. 245 | """ 246 | constants = { 247 | "STACK_TOP_ADDR": self.metadata.stack_top, 248 | # TODO: find a way to extract those ? 249 | "POSTGRES_EPOCH_JDATE": 2451545, 250 | "UNIX_EPOCH_JDATE": 2440588, 251 | "SECS_PER_DAY": 86400, 252 | # TODO: make those configurable ? 253 | "MAX_QUERY_NUMBER": 10, 254 | "MAX_QUERY_LENGTH": MAX_QUERY_LENGTH, 255 | "MAX_STACK_READ": 4096, 256 | "MAX_SEARCHPATH_LENGTH": MAX_SEARCHPATH_LENGTH, 257 | "EVENTRING_PAGE_SIZE": 131072, 258 | "MEMORY_REQUEST_MAXSIZE": MEMORY_REQUEST_MAXSIZE, 259 | "MEMORY_PATH_SIZE": MEMORY_PATH_SIZE, 260 | "LIBBCC_MAJOR_VERSION": BCC_VERSION_TUPLE[0], 261 | "LIBBCC_MINOR_VERSION": BCC_VERSION_TUPLE[1], 262 | "LIBBCC_PATCH_VERSION": BCC_VERSION_TUPLE[2], 263 | } 264 | if self.ppid is not None: 265 | constants["POSTMASTER_PID"] = self.ppid 266 | else: 267 | constants["PID"] = self.pid 268 | return constants 269 | 270 | @property 271 | def struct_offsets_defines(self) -> Dict[str, int]: 272 | """ 273 | Build C-Code for the eBPF code to easily access named members in 274 | structs. 275 | 276 | We read the offset in a struct for known members, so that the eBPF code 277 | can read those members from the Postgres struct. 278 | 279 | This is necessary because we can't include Postgres headers in the eBPF 280 | code. 281 | """ 282 | # Returns a normalized way of DEFINING struct offsets 283 | s = self.metadata.structs 284 | 285 | return { 286 | f"STRUCT_{struct}_OFFSET_{member}": getattr(s, struct) 287 | .field_definition(member) 288 | .offset 289 | for struct, member in ( 290 | ("Node", "type"), 291 | ("Plan", "type"), 292 | ("Plan", "startup_cost"), 293 | ("Plan", "total_cost"), 294 | ("Plan", "plan_rows"), 295 | ("Plan", "plan_width"), 296 | ("Plan", "parallel_aware"), 297 | ("PlannedStmt", "queryId"), 298 | ("PlanState", "instrument"), 299 | ("PlanState", "plan"), 300 | ("PlanState", "type"), 301 | ("PlanState", "lefttree"), 302 | ("PlanState", "righttree"), 303 | ("PortalData", "creation_time"), 304 | ("PortalData", "queryDesc"), 305 | ("QueryDesc", "instrument_options"), 306 | ("QueryDesc", "planstate"), 307 | ("QueryDesc", "sourceText"), 308 | ("QueryDesc", "plannedstmt"), 309 | ) 310 | } 311 | 312 | def make_global_variables_enum(self) -> Type[IntEnum]: 313 | """ 314 | Create an IntEnum mapping global variables names to their address in 315 | the program. 
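
# A small illustration of what the dictionaries built above turn into once
# rendered by the generators from collector/utils.py; the offset value is
# made up for the example.
from enum import IntEnum

from pgtracer.ebpf.collector.utils import defines_dict_to_c, intenum_to_c

print(defines_dict_to_c({"STRUCT_Plan_OFFSET_startup_cost": 16}))
# -> #define STRUCT_Plan_OFFSET_startup_cost 16


class Side(IntEnum):
    Left = 1
    Right = 2


print(intenum_to_c(Side))
# -> enum Side {
#    SideLeft = 1,
#    SideRight = 2
#    };
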
316 | """ 317 | mapping = {} 318 | 319 | for key in ("ActivePortal", "namespace_search_path"): 320 | mapping[key] = self.metadata.global_variable(key) 321 | # Mypy complains about dynamic enums 322 | globalenum = IntEnum("GlobalVariables", mapping) # type: ignore 323 | 324 | return globalenum 325 | 326 | def make_struct_sizes_dict(self) -> Dict[str, int]: 327 | """ 328 | Create a dictionary mapping struct name to their bytesize. 329 | 330 | Once again, this is because we can't include Postgres header and call 331 | "sizeof". 332 | """ 333 | mapping = {} 334 | 335 | for key in ("Instrumentation",): 336 | mapping[f"STRUCT_SIZE_{key}"] = getattr(self.metadata.structs, key).size 337 | 338 | return mapping 339 | 340 | def _attach_uprobe(self, function_name: str, ebpf_function: str) -> None: 341 | """ 342 | Helper to attach a uprobe executing `ebpf_function` at every 343 | `function_name` location. 344 | """ 345 | for addr in self.metadata.function_addresses(function_name): 346 | self.bpf.attach_uprobe( 347 | name=self.program, 348 | fn_name=ebpf_function.encode("utf8"), 349 | addr=addr, 350 | pid=self.pid, 351 | ) 352 | 353 | def _attach_uretprobe(self, function_name: str, ebpf_function: str) -> None: 354 | """ 355 | Helper to attach a uretprobe executing `ebpf_function` at every 356 | `function_name` location. 357 | """ 358 | # TODO: make sure multiple addresses work too 359 | for addr in self.metadata.function_addresses(function_name): 360 | self.bpf.attach_uretprobe( 361 | name=self.program, 362 | fn_name=ebpf_function.encode("utf8"), 363 | addr=addr, 364 | pid=self.pid, 365 | ) 366 | 367 | def background_polling(self, refresh_rate: int) -> None: 368 | """ 369 | Run the polling in the background. 370 | """ 371 | while self.is_running: 372 | self.bpf.ring_buffer_poll(refresh_rate) 373 | sleep(refresh_rate / 1000.0) 374 | 375 | def attach_probes(self) -> None: 376 | """ 377 | Attach the required probes for this collector. 378 | """ 379 | if self.options.enable_perf_events: 380 | self.bpf.attach_perf_event( 381 | ev_type=PerfType.SOFTWARE, 382 | ev_config=PerfSWConfig.CPU_CLOCK, 383 | fn_name=b"perf_event", 384 | pid=self.pid, 385 | sample_freq=self.sample_freq, 386 | ) 387 | 388 | def enable_usdt_probes(self, usdt: USDT) -> None: 389 | """ 390 | Enable USDT probes. 391 | """ 392 | 393 | def start(self) -> None: 394 | """ 395 | Starts the bpf collector. 396 | """ 397 | 398 | if self.is_running: 399 | raise InvalidStateException("BPF Collector is already running") 400 | print("Starting eBPF collector...") 401 | self.bpf[b"event_ring"].open_ring_buffer(self._handle_event) 402 | self.attach_probes() 403 | self.is_running = True 404 | self.background_thread = Thread(target=self.background_polling, args=(100,)) 405 | self.background_thread.start() 406 | print("eBPF collector started") 407 | 408 | def stop(self) -> None: 409 | """ 410 | Stop polling the collector. 411 | """ 412 | self.is_running = False 413 | if self.background_thread: 414 | self.background_thread.join() 415 | self.background_thread = None 416 | for ( 417 | pid, 418 | fd, 419 | ) in self.anon_map_fds.copy().items(): # pylint: disable=invalid-name 420 | os.close(fd) 421 | try: 422 | del self.bpf[b"pid_queues"][ct.c_int(pid)] 423 | except KeyError: 424 | pass 425 | self.anon_map_fds.clear() 426 | self.bpf.cleanup() 427 | 428 | # pylint: disable=unused-argument 429 | def _handle_event(self, cpu: int, data: ct._CData, size: int) -> int: 430 | """ 431 | Callback for the ring_buffer_poll. 
We actually dispatch this to the
432 |         `EventHandler`.
433 |         """
434 |         # Returning a negative value aborts polling
435 |         if not self.is_running:
436 |             return -1
437 |         return self.event_handler.handle_event(self, data)
438 | 
439 |     def _optional_code(self) -> str:
440 |         """
441 |         Load additional code, depending on options or the specific
442 |         Collector type.
443 |         """
444 |         buf = ""
445 |         if self.options.enable_perf_events:
446 |             buf += load_c_file("perf.c")
447 |         return buf
448 | 
449 |     def build_memory_request(
450 |         self,
451 |         event_type: EventType,
452 |         request_id: Id128,
453 |         base_addr: int,
454 |         base_type: Type[Union[ct._CData, Struct, DWARFPointer]],
455 |         path: List[str],
456 |     ) -> memory_request:
457 |         """
458 |         Build a memory request from a request_id, a base_addr, a known base_type living
459 |         at this addr, and a path describing which fields to follow to the final memory location.
460 | 
461 |         The field definitions are extracted from the debug symbols.
462 |         """
463 |         memory_path = (ct.c_ulonglong * MEMORY_PATH_SIZE)()
464 |         # We have the base address, the path, and finally an offset 0 to read the memory itself.
465 |         mempath_length = len(path) + 1
466 |         assert mempath_length <= MEMORY_PATH_SIZE
467 |         memory_path[0] = base_addr
468 |         current_type = base_type
469 |         current_idx = 0
470 |         for part in path:
471 |             # If we follow a pointer, add a new item to the underlying path.
472 |             # Otherwise, just add to the previous type.
473 |             if issubclass(current_type, DWARFPointer):
474 |                 current_type = current_type.pointed_type
475 |                 current_idx += 1
476 |                 memory_path[current_idx] = 0
477 |             if issubclass(current_type, Struct):
478 |                 attr = current_type.field_definition(part)
479 |                 if attr is None:
480 |                     raise AttributeError(f"Type {current_type} has no field {part}")
481 |                 current_type = attr.member_type
482 |                 memory_path[current_idx] += attr.offset
483 |             else:
484 |                 raise AttributeError(
485 |                     f"Cannot dereference field {part} from type {current_type}"
486 |                 )
487 |         # For convenience, support the last field as a pointer.
488 |         if issubclass(current_type, DWARFPointer) or current_type == ct.c_char_p:
489 |             memory_path[current_idx + 1] = 0
490 |             mempath_length += 1
491 |         size = get_size(current_type, dereference=True)
492 | 
493 |         return memory_request(
494 |             event_type=event_type,
495 |             request_id=request_id,
496 |             path_size=mempath_length,
497 |             size=size,
498 |             memory_path=memory_path,
499 |         )
500 | 
501 |     def send_memory_request(self, pid: int, request: memory_request) -> None:
502 |         """
503 |         Sends a memory request to the ebpf program.
504 |         """
505 |         ret = -1
506 |         if pid in self.anon_map_fds:
507 |             map_fd = self.anon_map_fds[pid]
508 |             ret = bcclib.bpf_update_elem(ct.c_int(map_fd), 0, ct.byref(request), 0)
509 |         if ret < 0:
510 |             raise ValueError("Something went wrong while sending a memory request")
511 | 
512 |     def preprocess_code(self, buf: str) -> str:
513 |         """
514 |         Preprocess code for things macros are not allowed to do with BCC.
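
# Worked example of the path walk above, mirroring the real call made in
# querytracer.py; `collector`, `pid`, `portal_key` and `query_addr` are
# stand-ins for values a handler receives with an event, and the offsets in
# the comments are invented for illustration.
def watch_query_instrumentation(collector, pid, portal_key, query_addr):
    request = collector.build_memory_request(
        EventType.MemoryResponseQueryInstr,
        portal_key,   # echoed back so the response can be matched
        query_addr,   # address of the QueryDesc in the backend
        collector.metadata.structs.QueryDesc,
        ["planstate", "instrument"],
    )
    # With QueryDesc.planstate at offset 0x28 and PlanState.instrument at
    # 0x58 (made-up offsets), memory_path becomes [query_addr + 0x28, 0x58, 0x0]:
    # read a pointer at base+0x28, add 0x58, then follow the final pointer and
    # copy sizeof(Instrumentation) bytes back.
    collector.send_memory_request(pid, request)
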
515 | """ 516 | if self.include_children: 517 | buf = buf.replace( 518 | "##CHECK_POSTMASTER##", 519 | """{ 520 | u64 ppid; 521 | struct task_struct* task_p = (struct task_struct*)bpf_get_current_task(); 522 | struct task_struct* parent_task_p = task_p->real_parent; 523 | ppid = parent_task_p->tgid; 524 | if (ppid != POSTMASTER_PID) 525 | return 0; 526 | };""", 527 | ) 528 | else: 529 | buf = buf.replace("##CHECK_POSTMASTER##", "") 530 | return buf 531 | 532 | def prepare_bpf(self) -> BPF: 533 | """ 534 | Generate the eBPF program, both from static code and dynamically 535 | generated defines and enums. 536 | """ 537 | buf = defines_dict_to_c(self.constant_defines) 538 | buf += defines_dict_to_c(self.struct_offsets_defines) 539 | buf += defines_dict_to_c(self.make_struct_sizes_dict()) 540 | buf += intenum_to_c(EventType) 541 | buf += intenum_to_c(MemoryAllocType) 542 | buf += intenum_to_c(self.make_global_variables_enum()) 543 | buf += load_c_file("program.c") 544 | buf += self._optional_code() 545 | # Ok, now workaround some limitations of the macro system with bcc and implement our own. 546 | buf = self.preprocess_code(buf) 547 | # Add the code directory as include dir 548 | cflags = [f"-I{CODE_BASE_PATH}"] 549 | # Suppress some common warnings depending on bcc / kernel combinations 550 | cflags.append("-Wno-macro-redefined") 551 | cflags.append("-Wno-ignored-attributes") 552 | # Only enable global memory probe if bcc version is recent enough 553 | kwargs: Dict[str, Any] = {} 554 | if self.include_children and BCC_VERSION_TUPLE >= (0, 19, 0): 555 | kwargs["attach_usdt_ignore_pid"] = True 556 | kwargs["usdt_contexts"] = [self.usdt_ctx] 557 | bpf = BPF(text=buf.encode("utf8"), cflags=cflags, debug=0, **kwargs) 558 | return bpf 559 | 560 | def setup_bpf_state(self) -> None: 561 | """ 562 | Setup the initial BPF State 563 | """ 564 | if self.pid > 0: 565 | self.setup_process(self.pid) 566 | 567 | def setup_process(self, pid: int) -> int: 568 | """ 569 | Callback when a new process is created. 570 | """ 571 | if self.options.enable_perf_events: 572 | new_map = bcclib.bcc_create_map( 573 | BPF_MAP_TYPE_QUEUE, None, 0, ct.sizeof(memory_request), 1024, 0 574 | ) 575 | self.bpf[b"pid_queues"][ct.c_int(pid)] = ct.c_int(new_map) 576 | self.anon_map_fds[pid] = new_map 577 | return 0 578 | 579 | def cleanup_process(self, pid: int) -> int: 580 | """ 581 | Callback when a process exits. 582 | """ 583 | # If we instrument a single pid, exit 584 | if self.pid == pid: 585 | print(f"Process {pid} is terminating, stopping collection") 586 | self.is_running = False 587 | else: 588 | try: 589 | if pid in self.anon_map_fds: 590 | try: 591 | del self.bpf[b"pid_queues"][ct.c_int(pid)] 592 | except KeyError: 593 | pass 594 | os.close(self.anon_map_fds[pid]) 595 | del self.anon_map_fds[pid] 596 | except KeyError: 597 | return 0 598 | return 0 599 | -------------------------------------------------------------------------------- /src/pgtracer/ebpf/collector/c_defs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Datastructure definitions used in the C ebpf code. 3 | """ 4 | from __future__ import annotations 5 | 6 | import ctypes as ct 7 | from enum import IntEnum 8 | from typing import Dict, List, Tuple, Type 9 | 10 | from ..unwind import stack_data_t 11 | 12 | BPF_MAP_TYPE_QUEUE = 22 13 | 14 | 15 | class Id128(ct.Structure): 16 | """ 17 | Structure containing two u64, to be used as either a single 8-bytes int or a two 8-bytes tuple. 
18 | """ 19 | 20 | _fields_ = [("u1", ct.c_ulonglong), ("u2", ct.c_ulonglong)] 21 | 22 | @classmethod 23 | def from_int(cls, intvalue: int) -> Id128: 24 | """ 25 | Create an Id128 from a single integer. 26 | """ 27 | return cls(intvalue, 0) 28 | 29 | def as_int(self) -> int: 30 | """ 31 | Interpret an Id128 as a single integer. 32 | """ 33 | val: int = self.u1 34 | return val 35 | 36 | @classmethod 37 | def from_tuple(cls, inttuple: Tuple[int, int]) -> Id128: 38 | """ 39 | Create an Id128 from a two-ints tuple. 40 | """ 41 | return cls(*inttuple) 42 | 43 | def as_tuple(self) -> Tuple[int, int]: 44 | """ 45 | Interpret an Id128 as a two-int tuple. 46 | """ 47 | return (self.u1, self.u2) 48 | 49 | 50 | # pylint: disable=invalid-name 51 | class EventType(IntEnum): 52 | """ 53 | EventTypes generated by the EBPF code. 54 | """ 55 | 56 | ExecutorRun = 1 57 | ExecutorFinish = 2 58 | DropPortalEnter = 3 59 | DropPortalReturn = 4 60 | ExecProcNodeFirst = 5 61 | ExecEndNode = 6 62 | KBlockRqIssue = 7 63 | StackSample = 8 64 | MemoryResponseQueryInstr = 9 65 | MemoryResponseNodeInstr = 10 66 | MemoryNodeData = 11 67 | GUCResponse = 12 68 | MemoryAccount = 13 69 | ProcessFork = 14 70 | ProcessExit = 15 71 | 72 | 73 | instrument_type = ct.c_byte * 0 74 | 75 | 76 | class StubStructure(ct.Structure): 77 | """ 78 | StubStructure definition, which actual fields must be updated at runtime. 79 | """ 80 | 81 | _protofields: List[Tuple[str, Type[ct._CData]]] = [] 82 | 83 | @classmethod 84 | def update_fields(cls, fields: Dict[str, Type[ct._CData]]) -> None: 85 | """ 86 | Update the structure fields. 87 | """ 88 | if hasattr(cls, "_fields_"): 89 | # We are not allowed to update it. But if all updated values are 90 | # the same as the first update, we don't care. 91 | fields_dict = dict(cls._fields_) # type: ignore 92 | for key, value in fields.items(): 93 | if fields_dict[key] != value: 94 | raise ValueError("Cannot update a struct more than once.") 95 | return 96 | fields_dict = dict(cls._protofields) 97 | fields_dict.update(fields) 98 | cls._fields_ = list(fields_dict.items()) 99 | 100 | 101 | MAX_QUERY_LENGTH = 2048 102 | MAX_SEARCHPATH_LENGTH = 1024 103 | 104 | 105 | class event_base(ct.Structure): 106 | """ 107 | Common fields for all events. 108 | """ 109 | 110 | _fields_ = [("event_type", ct.c_short), ("pid", ct.c_int)] 111 | 112 | 113 | class portal_data(StubStructure): 114 | """ 115 | Represents the portal_data associated to a portal. 116 | """ 117 | 118 | _protofields = [ 119 | ("event", event_base), 120 | ("portal_key", Id128), 121 | ("query_addr", ct.c_ulonglong), 122 | ("query_id", ct.c_ulonglong), 123 | ("startup_cost", ct.c_double), 124 | ("total_cost", ct.c_double), 125 | ("plan_rows", ct.c_double), 126 | ("query", ct.c_char * MAX_QUERY_LENGTH), 127 | ("instrument", instrument_type), 128 | ("search_path", ct.c_char * MAX_SEARCHPATH_LENGTH), 129 | ] 130 | 131 | 132 | class io_req_data(ct.Structure): 133 | """ 134 | Represents the io_req_data coming from instrumenting the kernel. 135 | """ 136 | 137 | _fields_ = [ 138 | ("event", event_base), 139 | ("rwbs", ct.c_char * 8), 140 | ("bytes", ct.c_ulonglong), 141 | ] 142 | 143 | 144 | class plan_data(ct.Structure): 145 | """ 146 | Represents the data associated with a PlanNode. 
147 | """ 148 | 149 | _fields_ = [ 150 | ("plan_addr", ct.c_ulonglong), 151 | ("plan_tag", ct.c_int), 152 | ("startup_cost", ct.c_double), 153 | ("total_cost", ct.c_double), 154 | ("plan_rows", ct.c_double), 155 | ("plan_width", ct.c_int), 156 | ("parallel_aware", ct.c_bool), 157 | ] 158 | 159 | 160 | class planstate_data(StubStructure): 161 | """ 162 | Represents the data associated to a PlanState node. 163 | """ 164 | 165 | _protofields = [ 166 | ("event", event_base), 167 | ("portal_key", Id128), 168 | ("planstate_addr", ct.c_ulonglong), 169 | ("planstate_tag", ct.c_int), 170 | ("lefttree", ct.c_ulonglong), 171 | ("righttree", ct.c_ulonglong), 172 | ("plan_data", plan_data), 173 | ("instrument", instrument_type), 174 | ("stack_capture", stack_data_t), 175 | ] 176 | 177 | 178 | MEMORY_REQUEST_MAXSIZE = 131072 179 | MEMORY_PATH_SIZE = 5 180 | 181 | 182 | class memory_request(ct.Structure): 183 | """ 184 | Represents a memory request, to be processed in the perf event handler. 185 | """ 186 | 187 | _fields_ = [ 188 | ("event_type", ct.c_short), 189 | ("request_id", Id128), 190 | ("path_size", ct.c_int), 191 | ("size", ct.c_ulonglong), 192 | ("memory_path", ct.c_ulonglong * MEMORY_PATH_SIZE), 193 | ] 194 | 195 | 196 | class memory_response(ct.Structure): 197 | """ 198 | Represents a memory response, sent back from the perf event handler. 199 | """ 200 | 201 | _fields_ = [ 202 | ("event", event_base), 203 | ("request_id", Id128), 204 | ("payload", ct.c_char * MEMORY_REQUEST_MAXSIZE), 205 | ] 206 | 207 | @property 208 | def payload_addr(self) -> int: 209 | """ 210 | Returns the address of the payload field: useful to parse it into it's 211 | own struct. 212 | """ 213 | return ct.addressof(self) + memory_response.payload.offset 214 | 215 | 216 | class stack_sample(StubStructure): 217 | """ 218 | Represents a stack sample, sent back from the perf event handler. 219 | """ 220 | 221 | _protofields = [("portal_data", portal_data), ("stack_data", stack_data_t)] 222 | -------------------------------------------------------------------------------- /src/pgtracer/ebpf/collector/guc.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module defines the collector for getting / setting GUC. 3 | """ 4 | from __future__ import annotations 5 | 6 | import ctypes as ct 7 | import struct 8 | from dataclasses import dataclass 9 | from typing import Any, BinaryIO, Dict, Optional, Tuple, Type 10 | 11 | from elftools.elf.elffile import ELFFile 12 | 13 | from ...utils import readcstr 14 | from ..dwarf import ProcessMetadata, Struct 15 | from . import BPFCollector, CollectorOptions, EventHandler 16 | from .c_defs import event_base 17 | from .utils import load_c_file 18 | 19 | GUC_MAX_LENGTH = 128 20 | 21 | 22 | # pylint: disable=invalid-name 23 | class guc_request(ct.Structure): 24 | """ 25 | A request to set a guc. 26 | """ 27 | 28 | _fields_ = [ 29 | ("guc_location", ct.c_ulonglong), 30 | ("guc_size", ct.c_int), 31 | ("payload", ct.c_byte * GUC_MAX_LENGTH), 32 | ] 33 | 34 | 35 | # pylint: disable=invalid-name 36 | class guc_response(ct.Structure): 37 | """ 38 | A response to a guc_request. 39 | """ 40 | 41 | _fields_ = [ 42 | ("event", event_base), 43 | ("guc_location", ct.c_ulonglong), 44 | ("status", ct.c_bool), 45 | ] 46 | 47 | 48 | class GUCTracerOptions(CollectorOptions): 49 | """ 50 | Dataclass for GUCTracerBPFCollector options. 
51 | """ 52 | 53 | sample_freq: int = 3000 54 | guc_to_watch: Dict[str, str] = {} 55 | 56 | 57 | class GUCTracerEventHandler(EventHandler): 58 | """ 59 | EventHandler for the GUCTracerBPFCollector. 60 | """ 61 | 62 | def __init__(self) -> None: 63 | super().__init__() 64 | self.pending_names_req: Dict[int, GUCDefinition] = {} 65 | 66 | # pylint: disable=invalid-name 67 | def handle_GUCResponse( 68 | self, bpf_collector: GUCTracerBPFCollector, event: ct._CData, pid: int 69 | ) -> int: 70 | """ 71 | Handle GUCResponse messages. 72 | """ 73 | event = ct.cast(event, ct.POINTER(guc_response)).contents 74 | guc_def, value = bpf_collector.pending_guc_sets.pop(event.guc_location) 75 | if event.status: 76 | print( 77 | f"GUC {guc_def.guc_name}@{event.guc_location} has been successfully set to {value}" 78 | ) 79 | else: 80 | print( 81 | f"GUC {guc_def.guc_name}@{event.guc_location} has failed to be set to {value}" 82 | ) 83 | return 0 84 | 85 | 86 | @dataclass 87 | class GUCDefinition: 88 | """ 89 | A GUC definition, extracted from the binary. 90 | """ 91 | 92 | guc_type: str 93 | guc_name: str 94 | guc_location: int 95 | 96 | 97 | class GUCTracerBPFCollector(BPFCollector): 98 | """ 99 | BPF Collector tracing GUCs and potentially modifying them. 100 | """ 101 | 102 | options_cls = GUCTracerOptions 103 | event_handler_cls = GUCTracerEventHandler 104 | 105 | GUC_TABLE_TYPE_TO_VARIABLE = { 106 | "config_bool": "ConfigureNamesBool", 107 | "config_int": "ConfigureNamesInt", 108 | "config_real": "ConfigureNamesReal", 109 | "config_string": "ConfigureNamesString", 110 | "config_enum": "ConfigureNamesEnum", 111 | } 112 | 113 | def __init__( 114 | self, 115 | metadata: ProcessMetadata, 116 | options: Optional[CollectorOptions] = None, 117 | include_children: bool = False, 118 | ): 119 | if include_children: 120 | raise NotImplementedError( 121 | "GUC Tracer does not support attaching to the whole cluster." 122 | ) 123 | self.options: CollectorOptions 124 | self.guc_defs: Dict[str, GUCDefinition] = {} 125 | self.pending_guc_sets: Dict[int, Tuple[GUCDefinition, Any]] = {} 126 | # We must not rely on the debug symbol elffile, but instead the one 127 | # from the executable itself 128 | with ELFFile.load_from_path(metadata.program) as elf: 129 | reladyn = elf.get_section_by_name(".rela.dyn") 130 | self.relocations: Dict[int, int] = { 131 | reloc["r_offset"]: reloc["r_addend"] 132 | for reloc in reladyn.iter_relocations() 133 | } 134 | self.ready = False 135 | super().__init__(metadata, options) 136 | 137 | def _relocate_addr(self, addr: int) -> int: 138 | """ 139 | Relocate an address from the .rela.dyn section information. 140 | """ 141 | if addr in self.relocations: 142 | return self.relocations[addr] 143 | return 0 144 | 145 | def _load_one_gucdef( 146 | self, addr: int, gucdef_type: Type[Struct], binfile: BinaryIO 147 | ) -> Optional[GUCDefinition]: 148 | """ 149 | Load one GUC definition from the binary 150 | """ 151 | # First lookup it's name. 
We could just use the base address 152 | # since it's the first member but better make it correct 153 | gen_definition = gucdef_type.field_definition("gen") 154 | if gen_definition is None: 155 | raise ValueError( 156 | f"Could not find member gen in struct {gucdef_type.__name__}" 157 | ) 158 | name_definition = gen_definition.member_type.field_definition("name") # type: ignore 159 | if name_definition is None: 160 | raise ValueError( 161 | f"Could not find member name in struct {gen_definition.member_type.__name__}" 162 | ) 163 | name_pointer_addr = addr + gen_definition.offset + name_definition.offset 164 | # Now lookup the relocation information for that address 165 | reloced_addr = self._relocate_addr(name_pointer_addr) 166 | if reloced_addr == 0: 167 | return None 168 | # Now we can read the data from the binary 169 | binfile.seek(reloced_addr) 170 | guc_bname = readcstr(binfile) 171 | guc_name = guc_bname.decode("utf8") 172 | # Now relocate the GUC global variable address 173 | variable_definition = gucdef_type.field_definition("variable") 174 | if variable_definition is None: 175 | raise ValueError( 176 | f"Could not find member variable in struct {gucdef_type.__name__}" 177 | ) 178 | 179 | variable_pointer_addr = addr + variable_definition.offset 180 | reloced_addr = self._relocate_addr(variable_pointer_addr) 181 | return GUCDefinition( 182 | guc_name=guc_name, 183 | guc_type=gucdef_type.__name__.replace("config_", ""), 184 | guc_location=reloced_addr + self.metadata.base_addr, 185 | ) 186 | 187 | def _load_guc_defs_from_binary(self) -> None: 188 | """ 189 | Load GUC definitions from the binary executable. 190 | """ 191 | with open(self.metadata.program, "rb") as programbin: 192 | for typname, variable_name in self.GUC_TABLE_TYPE_TO_VARIABLE.items(): 193 | deftype = getattr(self.metadata.structs, typname) 194 | typsize = deftype.size 195 | variable_addr = self.metadata.global_variable(variable_name) 196 | if variable_addr is None: 197 | raise ValueError( 198 | f"Could not locate global variable {variable_name}" 199 | ) 200 | addr = variable_addr - self.metadata.base_addr 201 | 202 | # Now iterate over the entries. 203 | while True: 204 | guc = self._load_one_gucdef(addr, deftype, programbin) 205 | if guc is None: 206 | break 207 | self.guc_defs[guc.guc_name] = guc 208 | addr += typsize 209 | 210 | def set_guc(self, guc_name: str, guc_value: str) -> None: 211 | """ 212 | Send a request to set a GUC to a specific value. 
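
# Usage sketch (the pid is hypothetical); set_guc only knows how to pack
# int GUCs for now, as enforced in the method body below.
from pgtracer.ebpf.collector.guc import GUCTracerBPFCollector

tracer = GUCTracerBPFCollector.from_pid(12345)
tracer.start()
tracer.set_guc("vacuum_cost_limit", "400")  # packed with struct.pack("i", ...)
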
213 | """ 214 | guc_def = self.guc_defs[guc_name] 215 | guc_c_value: Optional[bytes] = None 216 | if guc_def.guc_type != "int": 217 | raise NotImplementedError("We only support ints for now.") 218 | guc_c_value = struct.pack("i", int(guc_value)) 219 | guc_ct_value: ct._CData = ct.create_string_buffer(guc_c_value, GUC_MAX_LENGTH) 220 | guc_ct_value = ct.cast( 221 | guc_ct_value, ct.POINTER(ct.c_byte * GUC_MAX_LENGTH) 222 | ).contents 223 | guc_req = guc_request( 224 | ct.c_ulonglong(guc_def.guc_location), guc_size=4, payload=guc_ct_value 225 | ) 226 | self.pending_guc_sets[guc_def.guc_location] = guc_def, guc_value 227 | self.bpf[b"gucs_to_set"].push(guc_req) 228 | 229 | def setup_bpf_state(self) -> None: 230 | super().setup_bpf_state() 231 | # Build a mapping of GUC names to variables addresses 232 | self._load_guc_defs_from_binary() 233 | 234 | @property 235 | def constant_defines(self) -> Dict[str, int]: 236 | constants = super().constant_defines 237 | constants["GUC_MAX_LENGTH"] = GUC_MAX_LENGTH 238 | return constants 239 | 240 | def attach_probes(self) -> None: 241 | super().attach_probes() 242 | # Attach at various not-too-intrusive points. 243 | self._attach_uretprobe("BeginCommand", "process_guc_uprobe") 244 | self._attach_uretprobe("printtup", "process_guc_uprobe") 245 | 246 | self._attach_uretprobe("launcher_determine_sleep", "process_guc_uprobe") 247 | self._attach_uretprobe("vacuum_delay_point", "process_guc_uprobe") 248 | 249 | def _optional_code(self) -> str: 250 | buf = super()._optional_code() 251 | buf += load_c_file("gucset.c") 252 | return buf 253 | -------------------------------------------------------------------------------- /src/pgtracer/ebpf/collector/querytracer.py: -------------------------------------------------------------------------------- 1 | """ 2 | BPF Collector tracing queries. 3 | """ 4 | from __future__ import annotations 5 | 6 | import ctypes as ct 7 | from dataclasses import dataclass, field 8 | from enum import IntEnum 9 | from typing import Dict, List, Optional, Tuple 10 | 11 | from bcc import USDT 12 | 13 | from pgtracer.ebpf.dwarf import ProcessMetadata 14 | from pgtracer.model.plan import PlanState 15 | from pgtracer.model.query import Query 16 | 17 | from ...model import PlanState, Query, memory_account 18 | from . import BPFCollector, CollectorOptions, EventHandler, EventType 19 | from .c_defs import ( 20 | Id128, 21 | io_req_data, 22 | memory_response, 23 | planstate_data, 24 | portal_data, 25 | stack_sample, 26 | ) 27 | from .utils import load_c_file 28 | 29 | 30 | class InstrumentationFlags(IntEnum): 31 | """ 32 | Instrumentation flags. 33 | 34 | Mimic the InstrumentOption enum from PG. 35 | We define it statically here as it can be used from options. 36 | """ 37 | 38 | TIMER = 1 << 0 39 | BUFFERS = 1 << 1 40 | ROWS = 1 << 2 41 | WAL = 1 << 3 42 | ALL = 0x7FFFFFFF # INT32 Max 43 | 44 | 45 | @dataclass 46 | class QueryTracerOptions(CollectorOptions): 47 | """ 48 | Dataclass for QueryTracerBPFCollector options. 49 | """ 50 | 51 | instrument_flags: int = 0 52 | enable_nodes_collection: bool = False 53 | enable_query_discovery: bool = True 54 | 55 | 56 | @dataclass 57 | class PerProcessInfo: 58 | """ 59 | Store information about the queries processed by a backend. 
60 | """ 61 | 62 | pid: int 63 | last_portal_key: Optional[Tuple[int, int]] = None 64 | query_history: List[Query] = field(default_factory=list) 65 | query_cache: Dict[Tuple[int, int], Query] = field(default_factory=dict) 66 | current_executor: Optional[Tuple[int, int]] = None 67 | current_query: Optional[Query] = None 68 | 69 | 70 | # pylint: disable=invalid-name 71 | class QueryTracerEventHandler(EventHandler): 72 | """ 73 | EventHandler for QueryTracer. 74 | """ 75 | 76 | def __init__(self) -> None: 77 | self.per_process_info: Dict[int, PerProcessInfo] = {} 78 | self.next_request_id = 0 79 | self.process_history: List[PerProcessInfo] = [] 80 | 81 | def get_process_info(self, pid: int) -> PerProcessInfo: 82 | """ 83 | Returns the process info for a given PID, creating it if needed. 84 | """ 85 | if pid not in self.per_process_info: 86 | self.per_process_info[pid] = PerProcessInfo(pid) 87 | return self.per_process_info[pid] 88 | 89 | def _process_portal_data( 90 | self, bpf_collector: BPFCollector, event: portal_data, pid: int 91 | ) -> int: 92 | """ 93 | Process the portal data. This is used both when a query starts, and when we see 94 | the first live query during query discovery. 95 | """ 96 | key = event.portal_key.as_tuple() 97 | process_info = self.get_process_info(pid) 98 | process_info.current_executor = event.portal_key.as_tuple() 99 | 100 | if key not in process_info.query_cache: 101 | process_info.query_cache[key] = Query.from_event( 102 | bpf_collector.metadata, event 103 | ) 104 | else: 105 | process_info.query_cache[key].update(bpf_collector.metadata, event) 106 | process_info.current_query = process_info.query_cache[key] 107 | # If perf events are enabled, start watching the query instrumentation. 108 | if bpf_collector.options.enable_perf_events: 109 | structs = bpf_collector.metadata.structs 110 | request = bpf_collector.build_memory_request( 111 | EventType.MemoryResponseQueryInstr, 112 | event.portal_key, 113 | event.query_addr, 114 | structs.QueryDesc, 115 | ["planstate", "instrument"], 116 | ) 117 | bpf_collector.send_memory_request(pid, request) 118 | return 0 119 | 120 | def handle_ExecutorRun( 121 | self, bpf_collector: BPFCollector, event: ct._CData, pid: int 122 | ) -> int: 123 | """ 124 | Handle ExecutorRun event. This event is produced by an uprobe on 125 | standard_ExecutorRun. See executorstart_enter in program.c. 126 | 127 | We record the fact that a query started, extracting relevant metadata 128 | already present at the query start. 129 | """ 130 | if bpf_collector.options.enable_perf_events: 131 | bpf_collector.bpf[b"discovery_enabled"][ct.c_int(1)] = ct.c_bool(False) 132 | bpf_collector.bpf[b"discovery_enabled"][ct.c_int(2)] = ct.c_bool(False) 133 | event = ct.cast(event, ct.POINTER(portal_data)).contents 134 | return self._process_portal_data(bpf_collector, event, pid) 135 | 136 | # pylint: disable=unused-argument 137 | def handle_ExecutorFinish( 138 | self, bpf_collector: BPFCollector, event: ct._CData, pid: int 139 | ) -> int: 140 | """ 141 | Handle ExecutorFinish event. 
142 | """ 143 | event = ct.cast(event, ct.POINTER(portal_data)).contents 144 | key = event.portal_key.as_tuple() 145 | process_info = self.get_process_info(pid) 146 | if process_info.current_executor: 147 | process_info.current_executor = None 148 | process_info.current_query = None 149 | if key in process_info.query_cache: 150 | process_info.query_cache[event.portal_key.as_tuple()].update( 151 | bpf_collector.metadata, event 152 | ) 153 | return 0 154 | 155 | # pylint: disable=unused-argument 156 | def handle_DropPortalEnter( 157 | self, bpf_collector: BPFCollector, event: ct._CData, pid: int 158 | ) -> int: 159 | """ 160 | Handle DropPortalEnter event. This event is produced by a uprobe on 161 | DropPortal. See protaldrop_enter in program.c. 162 | 163 | PortalDrop is called whenever a query is finished: once the last row 164 | has been read in the case of a single query, or when the cursor is 165 | closed in the case of a cursor. 166 | 167 | Since PortalDrop is responsbile for cleaning up the portal, we record 168 | the instrumentation and other data about the query here, and remember 169 | it's identifier. Only once we return from DropPortal will we actually 170 | clean up the query from our current cache, and append it to history. 171 | """ 172 | event = ct.cast(event, ct.POINTER(portal_data)).contents 173 | process_info = self.get_process_info(pid) 174 | process_info.last_portal_key = event.portal_key.as_tuple() 175 | if process_info.last_portal_key in process_info.query_cache: 176 | process_info.query_cache[process_info.last_portal_key].update( 177 | bpf_collector.metadata, event 178 | ) 179 | return 0 180 | 181 | # pylint: disable=unused-argument 182 | def handle_DropPortalReturn( 183 | self, bpf_collector: BPFCollector, event: ct._CData, pid: int 184 | ) -> int: 185 | """ 186 | Handle DropPortalReturn event. This event is produced by an uretprobe on 187 | DropPortal. See protaldrop_return in program.c. 188 | 189 | We remove the query from the internal cache and append it to history. 190 | """ 191 | event = ct.cast(event, ct.POINTER(portal_data)).contents 192 | process_info = self.get_process_info(pid) 193 | if process_info.last_portal_key is not None: 194 | if process_info.last_portal_key in process_info.query_cache: 195 | query = process_info.query_cache[process_info.last_portal_key] 196 | process_info.query_history.append(query) 197 | del process_info.query_cache[process_info.last_portal_key] 198 | process_info.last_portal_key = None 199 | process_info.current_executor = None 200 | process_info.current_query = None 201 | return 0 202 | 203 | def handle_ExecProcNodeFirst( 204 | self, bpf_collector: BPFCollector, event: ct._CData, pid: int 205 | ) -> int: 206 | """ 207 | Handle ExecProcNodeFirst event. This event is produced by a uprobe on 208 | ExecProcNodeFirst. 209 | 210 | The goal here is to build a plan tree for the query. 211 | """ 212 | event = ct.cast(event, ct.POINTER(planstate_data)).contents 213 | process_info = self.get_process_info(pid) 214 | query = process_info.query_cache.get(event.portal_key.as_tuple()) 215 | if query is None: 216 | # We don't know this query: maybe it started running before us ? 
217 | return 0 218 | query.add_node_from_event(bpf_collector.metadata, event) 219 | if bpf_collector.options.enable_perf_events: 220 | request = bpf_collector.build_memory_request( 221 | EventType.MemoryResponseNodeInstr, 222 | Id128.from_int(event.planstate_addr), 223 | event.planstate_addr, 224 | bpf_collector.metadata.structs.PlanState, 225 | ["instrument"], 226 | ) 227 | bpf_collector.send_memory_request(pid, request) 228 | return 0 229 | 230 | def handle_ExecEndNode( 231 | self, bpf_collector: BPFCollector, event: ct._CData, pid: int 232 | ) -> int: 233 | """ 234 | Handle ExecEndNode event. This event is produced by a uprobe on 235 | ExecEndNode's implementations. 236 | 237 | Once the executor node is destroyed, we want to collect it's 238 | instrumentation data if any. 239 | """ 240 | event = ct.cast(event, ct.POINTER(planstate_data)).contents 241 | process_info = self.get_process_info(pid) 242 | if process_info.last_portal_key is None: 243 | return 0 244 | query = process_info.query_cache.get(process_info.last_portal_key) 245 | if query is None: 246 | return 0 247 | node = query.nodes.get(event.planstate_addr) 248 | if node is None: 249 | return 0 250 | instrument_addr = ct.addressof(event.instrument) 251 | instrument = bpf_collector.metadata.structs.Instrumentation(instrument_addr) 252 | instrument.nloops = ct.c_double(instrument.nloops.value + 1) # type: ignore 253 | node.instrument = instrument 254 | return 0 255 | 256 | def handle_KBlockRqIssue( 257 | self, bpf_collector: BPFCollector, event: ct._CData, pid: int 258 | ) -> int: 259 | """ 260 | Handle KBlockRqIssue event. This event is produced by a kernel 261 | tracepoint on block_rq_issue. 262 | 263 | This serves to keep a count of block IO performed by a device, which 264 | can be useful to compute "real" cache hit ratio. 265 | """ 266 | event = ct.cast(event, ct.POINTER(io_req_data)).contents 267 | process_info = self.get_process_info(pid) 268 | # We try to attach it to a specific query. 269 | # If we don't have one, don't bother 270 | if not process_info.current_executor: 271 | return 0 272 | query = process_info.query_cache.get(process_info.current_executor) 273 | if query is None: 274 | return 0 275 | if b"R" in event.rwbs: 276 | query.io_counters["R"] += event.bytes 277 | elif b"W" in event.rwbs: 278 | query.io_counters["W"] += event.bytes 279 | return 0 280 | 281 | def handle_MemoryResponseQueryInstr( 282 | self, bpf_collector: BPFCollector, event: ct._CData, pid: int 283 | ) -> int: 284 | """ 285 | Handle MemoryResponseQueryInstr 286 | 287 | We lookup the request_id, and update the given counters if needed. 288 | """ 289 | ev = ct.cast(event, ct.POINTER(memory_response)).contents 290 | 291 | process_info = self.get_process_info(pid) 292 | if not process_info.current_executor: 293 | return 0 294 | # We have a memory response for the whole query 295 | query = process_info.query_cache.get(ev.request_id.as_tuple(), None) 296 | if query: 297 | instr = bpf_collector.metadata.structs.Instrumentation(ev.payload_addr) 298 | query.instrument = instr 299 | # Load all fields from the underlying memory. 
300 | instr.as_dict(include_all=True) 301 | # Re-send the same request for continuous monitoring 302 | request = bpf_collector.build_memory_request( 303 | EventType.MemoryResponseQueryInstr, 304 | ev.request_id, 305 | query.addr, 306 | bpf_collector.metadata.structs.QueryDesc, 307 | ["planstate", "instrument"], 308 | ) 309 | 310 | bpf_collector.send_memory_request(pid, request) 311 | return 0 312 | 313 | def handle_MemoryResponseNodeInstr( 314 | self, bpf_collector: BPFCollector, event: ct._CData, pid: int 315 | ) -> int: 316 | """ 317 | Handle MemoryResponseNodeInstr produced as a response to some memory_request. 318 | """ 319 | process_info = self.get_process_info(pid) 320 | if not process_info.current_executor: 321 | return 0 322 | query = process_info.query_cache.get(process_info.current_executor, None) 323 | ev = ct.cast(event, ct.POINTER(memory_response)).contents 324 | nodeid = ev.request_id.as_int() 325 | # We have a memory response for an individual node 326 | if query is not None and nodeid is not None: 327 | node = query.nodes.get(nodeid) 328 | if node is not None: 329 | instr = bpf_collector.metadata.structs.Instrumentation(ev.payload_addr) 330 | node.instrument = instr 331 | # Re-send the same request for continuous monitoring 332 | request = bpf_collector.build_memory_request( 333 | EventType.MemoryResponseNodeInstr, 334 | Id128.from_int(nodeid), 335 | nodeid, 336 | bpf_collector.metadata.structs.PlanState, 337 | ["instrument"], 338 | ) 339 | bpf_collector.send_memory_request(pid, request) 340 | return 0 341 | 342 | def handle_MemoryNodeData( 343 | self, bpf_collector: BPFCollector, event: ct._CData, pid: int 344 | ) -> int: 345 | """ 346 | Handle MemoryNodeData produced as a response for a memory_request. 347 | """ 348 | process_info = self.get_process_info(pid) 349 | if not process_info.current_executor: 350 | return 0 351 | ev = ct.cast(event, ct.POINTER(planstate_data)).contents 352 | query = process_info.query_cache.get(process_info.current_executor, None) 353 | if query is not None: 354 | node = query.add_node_from_event(bpf_collector.metadata, ev) 355 | if ev.lefttree and ev.lefttree not in query.nodes: 356 | leftchild = PlanState(ev.lefttree) 357 | leftchild.parent_node = node 358 | query.nodes[ev.lefttree] = leftchild 359 | node.children[leftchild] = None 360 | self._gather_node_info(bpf_collector, ev.lefttree, pid) 361 | if ev.righttree and ev.righttree not in query.nodes: 362 | rightchild = PlanState(ev.righttree) 363 | rightchild.parent_node = node 364 | query.nodes[ev.righttree] = rightchild 365 | node.children[rightchild] = None 366 | self._gather_node_info(bpf_collector, ev.righttree, pid) 367 | return 0 368 | 369 | def _gather_node_info( 370 | self, bpf_collector: BPFCollector, nodeaddr: int, pid: int 371 | ) -> None: 372 | """ 373 | Send memory requests to gather information about a specific node. 374 | """ 375 | req = bpf_collector.build_memory_request( 376 | EventType.MemoryNodeData, 377 | Id128.from_int(nodeaddr), 378 | nodeaddr, 379 | bpf_collector.metadata.structs.PlanState, 380 | [], 381 | ) 382 | bpf_collector.send_memory_request(pid, req) 383 | 384 | def handle_StackSample( 385 | self, bpf_collector: BPFCollector, event: ct._CData, pid: int 386 | ) -> int: 387 | """ 388 | Handle StackSample events produced during perf sampling. 
389 | """ 390 | ev = ct.cast(event, ct.POINTER(stack_sample)).contents 391 | process_info = self.get_process_info(pid) 392 | _, creation_time = ev.portal_data.portal_key.as_tuple() 393 | if creation_time: 394 | self._process_portal_data(bpf_collector, ev.portal_data, pid) 395 | bpf_collector.bpf[b"discovery_enabled"][ct.c_int(1)] = ct.c_bool(False) 396 | if process_info.current_query: 397 | # Now add the nodes from the stacktrace 398 | process_info.current_query.add_nodes_from_stack( 399 | bpf_collector.metadata, ev.stack_data 400 | ) 401 | # And add memory_requests to gather their information. 402 | for node in process_info.current_query.nodes.values(): 403 | if node.is_stub and node.addr: 404 | self._gather_node_info(bpf_collector, node.addr, pid) 405 | return 0 406 | 407 | def handle_MemoryAccount( 408 | self, bpf_collector: BPFCollector, event: ct._CData, pid: int 409 | ) -> int: 410 | """ 411 | Handle MemoryAccount events produced by malloc instrumentation. 412 | """ 413 | ev = ct.cast(event, ct.POINTER(memory_account)).contents 414 | process_info = self.get_process_info(pid) 415 | if process_info.current_query: 416 | process_info.current_query.memallocs.update(ev) 417 | return 0 418 | 419 | 420 | class QueryTracerBPFCollector(BPFCollector): 421 | """ 422 | BPF Collector tracing queries and optionally individual nodes. 423 | """ 424 | 425 | options_cls = QueryTracerOptions 426 | event_handler_cls = QueryTracerEventHandler 427 | 428 | def __init__( 429 | self, 430 | metadata: ProcessMetadata, 431 | options: Optional[QueryTracerOptions] = None, 432 | include_children: bool = False, 433 | ): 434 | self.options: QueryTracerOptions 435 | self.event_handler: QueryTracerEventHandler 436 | super().__init__(metadata, options, include_children) 437 | 438 | def attach_probes(self) -> None: 439 | super().attach_probes() 440 | self._attach_uprobe("PortalDrop", "portaldrop_enter") 441 | self._attach_uretprobe("PortalDrop", "portaldrop_return") 442 | self._attach_uprobe("standard_ExecutorStart", "executorstart_enter") 443 | self._attach_uprobe("standard_ExecutorRun", "executorrun_enter") 444 | self._attach_uprobe("ExecutorFinish", "executorfinish_enter") 445 | self._attach_uprobe("mmap", "mmap_enter") 446 | self.bpf.attach_uprobe( 447 | name=b"c", sym=b"mmap", fn_name=b"mmap_enter", pid=self.pid 448 | ) 449 | self.bpf.attach_uprobe( 450 | name=b"c", sym=b"munmap", fn_name=b"munmap_enter", pid=self.pid 451 | ) 452 | if self.options.enable_nodes_collection: 453 | self._attach_uprobe("ExecProcNodeFirst", "execprocnodefirst_enter") 454 | for func in self.ExecEndFuncs: 455 | self._attach_uprobe(func, "execendnode_enter") 456 | 457 | def enable_usdt_probes(self, usdt: USDT) -> None: 458 | usdt.enable_probe(probe="libc:memory_sbrk_less", fn_name="sbrk_less") 459 | usdt.enable_probe(probe="libc:memory_sbrk_more", fn_name="sbrk_more") 460 | 461 | @property 462 | def constant_defines(self) -> Dict[str, int]: 463 | constants = super().constant_defines 464 | # USER_INSTRUMENT_FLAGS is defined only if the user wants to 465 | # inconditonally turn on instrumentation. 
466 | if self.options.instrument_flags: 467 | constants["USER_INSTRUMENT_FLAGS"] = self.options.instrument_flags 468 | if self.options.enable_query_discovery: 469 | if not self.ppid: 470 | constants["ENABLE_QUERY_DISCOVERY"] = True 471 | return constants 472 | 473 | def _optional_code(self) -> str: 474 | buf = super()._optional_code() 475 | if self.options.enable_nodes_collection: 476 | buf += load_c_file("plan.c") 477 | buf += load_c_file("block_rq.c") 478 | buf += load_c_file("memusage.c") 479 | return buf 480 | 481 | def setup_bpf_state(self) -> None: 482 | # FIXME: get rid of those magic numbers. 483 | super().setup_bpf_state() 484 | if self.options.enable_perf_events: 485 | self.bpf[b"discovery_enabled"][ct.c_int(1)] = ct.c_bool( 486 | self.options.enable_query_discovery 487 | ) 488 | self.bpf[b"discovery_enabled"][ct.c_int(2)] = ct.c_bool( 489 | self.options.enable_query_discovery 490 | ) 491 | 492 | def cleanup_process(self, pid: int) -> int: 493 | if pid in self.event_handler.per_process_info: 494 | self.event_handler.process_history.append( 495 | self.event_handler.per_process_info.pop(pid) 496 | ) 497 | return super().cleanup_process(pid) 498 | -------------------------------------------------------------------------------- /src/pgtracer/ebpf/collector/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Various utilities for collector implementations. 3 | """ 4 | from enum import IntEnum 5 | from pathlib import Path 6 | from typing import Any, Dict, Type 7 | 8 | 9 | def intenum_to_c(intenum: Type[IntEnum]) -> str: 10 | """ 11 | Generate C code defining an enum corresponding to a Python IntEnum. 12 | """ 13 | buf = f"enum {intenum.__name__} {{\n" 14 | members = [] 15 | 16 | for member in intenum: 17 | members.append(f"{intenum.__name__}{member.name} = {member.value}") 18 | buf += ",\n".join(members) 19 | buf += "\n};\n" 20 | 21 | return buf 22 | 23 | 24 | def defines_dict_to_c(defines_dict: Dict[str, Any]) -> str: 25 | """ 26 | Generate a string of C #define directives from a mapping. 27 | """ 28 | return ( 29 | "\n".join(f"#define {key} {value}" for key, value in defines_dict.items()) 30 | + "\n" 31 | ) 32 | 33 | 34 | CODE_BASE_PATH = Path(__file__).parent.parent / "code" 35 | 36 | 37 | def load_c_file(filename: str) -> str: 38 | """ 39 | Loads a C file from the package code directory. 40 | """ 41 | filepath = CODE_BASE_PATH / filename 42 | with filepath.open() as cfile: 43 | return cfile.read() 44 | -------------------------------------------------------------------------------- /src/pgtracer/ebpf/eh_frame_hdr.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains code for parsing an .eh_frame_hdr section. 
3 | """ 4 | from __future__ import annotations 5 | 6 | import struct 7 | from enum import IntEnum 8 | from typing import TYPE_CHECKING, Any, Iterable, Optional, Tuple, no_type_check 9 | 10 | from elftools.dwarf.callframe import CallFrameInfo 11 | from elftools.dwarf.enums import DW_EH_encoding_flags 12 | from elftools.elf.elffile import ELFFile 13 | 14 | if TYPE_CHECKING: 15 | from elftools.dwarf.callframe import CFIEntry 16 | from elftools.elf.sections import Section 17 | 18 | DW_EH_Encoding = IntEnum("DW_EH_Encoding", DW_EH_encoding_flags) # type: ignore 19 | 20 | 21 | class EhFrameHdr: 22 | """ 23 | Parsed .eh_frame_hdr section 24 | """ 25 | 26 | def __init__(self, section: Section, elffile: ELFFile): 27 | self.elffile = elffile 28 | self.section = section 29 | self.offset = self.section.global_offset 30 | self.eh_frame_hdr_start = self.section.stream.tell() 31 | # First read the fixed header 32 | ( 33 | self.version, 34 | self.eh_frame_ptr_enc, 35 | self.fde_count_enc, 36 | self.table_enc, 37 | ) = self._unpack_from("<4B", offset=0) 38 | self.frame_ptr: int = self.read_value(self.eh_frame_ptr_enc) # type: ignore 39 | self.fde_count: int = self.read_value(self.fde_count_enc) # type: ignore 40 | self.table_start = self.section.stream.tell() 41 | self.dwarf_info = elffile.get_dwarf_info() 42 | self.cfi = CallFrameInfo( 43 | stream=self.dwarf_info.eh_frame_sec.stream, 44 | size=self.dwarf_info.eh_frame_sec.size, 45 | address=self.dwarf_info.eh_frame_sec.address, 46 | base_structs=self.dwarf_info.structs, 47 | for_eh_frame=True, 48 | ) 49 | 50 | @no_type_check 51 | def read_value( 52 | self, 53 | encoding: int, 54 | offset: Optional[int] = None, 55 | relative: bool = True, 56 | program_counter: int = 0, 57 | ) -> int: 58 | """ 59 | Read a value with the given encoding at the specific offset. 60 | Relative indicate wether the offset is relative to the start of the 61 | section or absolute in the ELFFile. 62 | program_counter is the current program counter used for DW_EH_PE_pcrel calculations. 
63 | """ 64 | value_enc = encoding & 0x0F 65 | relative_enc = encoding & 0x70 66 | if value_enc == DW_EH_Encoding.DW_EH_PE_absptr: 67 | result = self._unpack_from("@B", offset=offset, relative=relative) 68 | elif value_enc == DW_EH_Encoding.DW_EH_PE_udata2: 69 | result = self._unpack_from("@H", offset=offset, relative=relative) 70 | elif value_enc == DW_EH_Encoding.DW_EH_PE_sdata2: 71 | result = self._unpack_from("@h", offset=offset, relative=relative) 72 | elif value_enc == DW_EH_Encoding.DW_EH_PE_udata4: 73 | result = self._unpack_from("@I", offset=offset, relative=relative) 74 | elif value_enc == DW_EH_Encoding.DW_EH_PE_sdata4: 75 | result = self._unpack_from("@i", offset=offset, relative=relative) 76 | elif value_enc == DW_EH_Encoding.DW_EH_PE_udata8: 77 | result = self._unpack_from("@Q", offset=offset, relative=relative) 78 | elif value_enc == DW_EH_Encoding.DW_EH_PE_sdata8: 79 | result = self._unpack_from("@q", offset=offset, relative=relative) 80 | else: 81 | raise ValueError(f"Unknown value encoding: {value_enc}") 82 | 83 | result = result[0] 84 | 85 | if relative_enc == DW_EH_Encoding.DW_EH_PE_absptr: 86 | pass 87 | elif relative_enc == DW_EH_Encoding.DW_EH_PE_pcrel: 88 | result += program_counter 89 | elif relative_enc == DW_EH_Encoding.DW_EH_PE_datarel: 90 | result += self.offset 91 | else: 92 | raise ValueError(f"Pointer encoding {relative_enc} not supported") 93 | return result 94 | 95 | @no_type_check 96 | def get_table_entry_size(self) -> int: 97 | """ 98 | Returns the size of a table entry. 99 | """ 100 | enc = self.table_enc & 0x0F 101 | if enc in (DW_EH_Encoding.DW_EH_PE_udata2, DW_EH_Encoding.DW_EH_PE_sdata2): 102 | return 4 103 | if enc in (DW_EH_Encoding.DW_EH_PE_udata4, DW_EH_Encoding.DW_EH_PE_sdata4): 104 | return 8 105 | if enc in (DW_EH_Encoding.DW_EH_PE_udata8, DW_EH_Encoding.DW_EH_PE_sdata8): 106 | return 16 107 | if enc == DW_EH_Encoding.DW_EH_PE_omit: 108 | return 0 109 | raise ValueError(f"Invalid table encoding: {enc}") 110 | 111 | def _read_section( 112 | self, size: int, offset: Optional[int], relative: bool = False 113 | ) -> Any: 114 | """ 115 | Read `size` bytes from the underlying stream at the given `offset`. 116 | relative indicates whether the given offset is relative to the 117 | .eh_frame_hdr section start, or absolute in the ELFFile. 118 | """ 119 | stream = self.section.stream 120 | if offset is not None: 121 | if relative: 122 | offset = offset + self.offset 123 | stream.seek(offset) 124 | return stream.read(size) 125 | 126 | def _unpack_from( 127 | self, fmt: str, offset: Optional[int] = None, relative: bool = False 128 | ) -> Tuple[int, ...]: 129 | """ 130 | Unpack a value read at offset according to format. 131 | """ 132 | size = struct.calcsize(fmt) 133 | buffer = self._read_section(size, offset, relative) 134 | return struct.unpack_from(fmt, buffer) 135 | 136 | def read_entry(self, offset: Optional[int] = None) -> Tuple[int, int]: 137 | """ 138 | Read a table entry at the given offset. .eh_frame_hdr table entries are 139 | couples of location / offset of the corresponding FDE. 140 | """ 141 | loc_val: int = self.read_value(self.table_enc, offset, relative=False) 142 | offset_val: int = self.read_value(self.table_enc) 143 | return (loc_val, offset_val) 144 | 145 | def iter_entries(self) -> Iterable[Tuple[int, int]]: 146 | """ 147 | Iter over .eh_frame_hdr table entries. 
148 | """ 149 | self.section.stream.seek(self.table_start) 150 | for _ in range(0, self.fde_count): 151 | yield self.read_entry() 152 | 153 | def find_fde(self, addrkey: int) -> Optional[CFIEntry]: 154 | """ 155 | Find an antry by doing a binary search. 156 | """ 157 | minidx = 0 158 | maxidx = self.fde_count 159 | size = self.get_table_entry_size() 160 | while True: 161 | idx = minidx + (maxidx - minidx) // 2 162 | offset = self.table_start + idx * size 163 | (addr, loc) = self.read_entry(offset=offset) 164 | # We found the looked up key, now we need to find the right tag 165 | if addrkey == addr or (minidx == idx and addrkey > addr): 166 | fde = self.cfi._parse_entry_at( 167 | loc - self.cfi.address 168 | ) # pylint: disable=protected-access 169 | if addrkey < fde.header.initial_location + fde.header.address_range: 170 | return fde 171 | # If the key is not in range, then we don't have an entry. 172 | return None 173 | if addrkey < addr: 174 | if maxidx == idx: 175 | return None 176 | maxidx = idx 177 | elif addrkey > addr: 178 | minidx = idx 179 | 180 | @classmethod 181 | def load_eh_frame_hdr(cls, elf_file: ELFFile) -> Optional[EhFrameHdr]: 182 | """ 183 | Load an EHFrameHDR from an ELFFile. 184 | """ 185 | eh_frame_hdr = elf_file.get_section_by_name(".eh_frame_hdr") 186 | if eh_frame_hdr is None: 187 | return None 188 | 189 | # pylint: disable=protected-access 190 | eh_frame_hdr = elf_file._read_dwarf_section( 191 | eh_frame_hdr, relocate_dwarf_sections=True 192 | ) 193 | eh_frame_hdr_data = EhFrameHdr(eh_frame_hdr, elf_file) 194 | return eh_frame_hdr_data 195 | -------------------------------------------------------------------------------- /src/pgtracer/ebpf/unwind.py: -------------------------------------------------------------------------------- 1 | # pylint: disable=invalid-name 2 | """ 3 | This module provides access to libunwind through ctypes. 4 | """ 5 | from __future__ import annotations 6 | 7 | import ctypes as ct 8 | import ctypes.util 9 | import platform 10 | import re 11 | from functools import cached_property 12 | from pathlib import Path 13 | from typing import TYPE_CHECKING, Any, Generator, List, Optional, Tuple, Type, TypeVar 14 | 15 | from elftools.dwarf.callframe import CFARule, CFIEntry 16 | from elftools.dwarf.die import DIE, AttributeValue 17 | from elftools.dwarf.dwarf_expr import DWARFExprOp, DWARFExprParser 18 | from elftools.dwarf.locationlists import BaseAddressEntry, LocationEntry, LocationExpr 19 | 20 | from .dwarf import MappedRegion, ProcessMetadata, die_name 21 | 22 | if TYPE_CHECKING: 23 | try: 24 | from typing import TypeAlias # type: ignore 25 | except ImportError: 26 | from typing_extensions import TypeAlias 27 | CFuncPtr: TypeAlias = ct._FuncPointer # pylint: disable=protected-access 28 | Pointer: TypeAlias = ct.pointer 29 | SimpleCData = ct._SimpleCData[Any] # pylint: disable=protected-access 30 | else: 31 | # Make pylint happy 32 | CFuncPtr = object() 33 | Pointer = List 34 | SimpleCData = Any 35 | 36 | 37 | CT = TypeVar("CT", bound=SimpleCData) 38 | 39 | ARCH = platform.machine() 40 | 41 | 42 | def find_libunwind_version() -> Tuple[int, int]: 43 | """ 44 | Returns the libunwind version. 45 | We try to extract this from the headers. 46 | 47 | TODO: maybe we should call cc to get the actual include dirs ? 
48 | """ 49 | include_dir_candidates = [ 50 | Path("/usr/include/"), 51 | Path(f"/usr/include/{ARCH}-linux-gnu/"), 52 | ] 53 | major_re = re.compile(r"#define UNW_VERSION_MAJOR\s+(\d+)") 54 | minor_re = re.compile(r"#define UNW_VERSION_MINOR\s+(\d+)") 55 | header_filename = Path("libunwind-common.h") 56 | major_version = None 57 | minor_version = None 58 | found = False 59 | for candidate in include_dir_candidates: 60 | include_file = candidate / header_filename 61 | if include_file.exists(): 62 | with include_file.open() as f: 63 | for line in f: 64 | match = major_re.match(line) 65 | if match: 66 | found = True 67 | major_version = int(match.group(1)) 68 | continue 69 | match = minor_re.match(line) 70 | if match: 71 | found = True 72 | minor_version = int(match.group(1)) 73 | if found: 74 | break 75 | if major_version is None or minor_version is None: 76 | raise ValueError("Could not identify libunwind version !") 77 | return (major_version, minor_version) 78 | 79 | 80 | LIBUNWIND_VERSION = find_libunwind_version() 81 | 82 | UNW_PREFIX = f"_U{ARCH}_" 83 | libname = ctypes.util.find_library(f"unwind-{ARCH}") 84 | if libname is None: 85 | raise ImportError(f"Cannot load libunwind-{ARCH}") 86 | libunwind = ct.cdll.LoadLibrary(libname) 87 | if ARCH == "x86_64": 88 | UNW_TDEP_CURSOR_LEN = 127 89 | unw_word_t = ct.c_ulonglong 90 | UNW_WORD_T_FORMAT = " CFuncPtr: 180 | """ 181 | Returns the CPointer function of that name. Depending on the architecture, 182 | the function names are not the same. 183 | """ 184 | return getattr(libunwind, f"{UNW_PREFIX}{funcname}") 185 | 186 | 187 | class unw_dyn_remote_table_info_t(ct.Structure): 188 | """ 189 | Mapping of unw_dyn_remote_table_info_t type. 190 | """ 191 | 192 | _fields_ = [ 193 | ("name_ptr", unw_word_t), 194 | ("segbase", unw_word_t), 195 | ("table_len", unw_word_t), 196 | ("table_data", unw_word_t), 197 | ] 198 | 199 | 200 | # We have to define the fields after the class, as it is a self-referencing 201 | # type. 202 | class unw_dyn_info_t(ct.Structure): 203 | """ 204 | Mapping of unw_dyn_info_t type. 205 | """ 206 | 207 | 208 | # Libunwind does not preserve perfect ABI compatibility. 209 | load_offset_field = [] 210 | if LIBUNWIND_VERSION >= (1, 6): 211 | load_offset_field = [("load_offset", unw_word_t)] 212 | 213 | unw_dyn_info_t._fields_ = [ # pylint: disable=protected-access 214 | ("next", ct.POINTER(unw_dyn_info_t)), 215 | ("prev", ct.POINTER(unw_dyn_info_t)), 216 | ("start_ip", unw_word_t), 217 | ("end_ip", unw_word_t), 218 | ("gp", unw_word_t), 219 | ("format", ct.c_int32), 220 | ("pad", ct.c_int32), 221 | *load_offset_field, 222 | ("rti", unw_dyn_remote_table_info_t) # Supposed to be an union, but we will 223 | # only ever use this one. 
224 | ] 225 | 226 | unw_regnum_t = ct.c_int 227 | unw_fpreg_t = unw_tdep_fpreg_t 228 | # Opaque type 229 | unw_addr_space_t = ct.c_void_p 230 | 231 | # Definition of function types 232 | FIND_PROC_INFO_FUNCTYPE = ct.CFUNCTYPE( 233 | ct.c_int, # Return value 234 | unw_addr_space_t, 235 | unw_word_t, 236 | ct.POINTER(unw_proc_info_t), 237 | ct.c_int, 238 | ct.c_void_p, 239 | ) 240 | PUT_UNWIND_INFO_FUNCTYPE = ct.CFUNCTYPE( 241 | None, unw_addr_space_t, ct.POINTER(unw_proc_info_t), ct.c_void_p 242 | ) 243 | GET_DYN_INFO_LIST_ADDR_FUNCTYPE = ct.CFUNCTYPE( 244 | ct.c_int, unw_addr_space_t, ct.POINTER(unw_word_t), ct.c_void_p 245 | ) 246 | ACCESS_MEM_FUNCTYPE = ct.CFUNCTYPE( 247 | ct.c_int, 248 | unw_addr_space_t, 249 | unw_word_t, 250 | ct.POINTER(unw_word_t), 251 | ct.c_int, 252 | ct.c_void_p, 253 | ) 254 | ACCESS_REG_FUNCTYPE = ct.CFUNCTYPE( 255 | ct.c_int, 256 | unw_addr_space_t, 257 | unw_regnum_t, 258 | ct.POINTER(unw_word_t), 259 | ct.c_int, 260 | ct.c_void_p, 261 | ) 262 | ACCESS_FPREG_FUNCTYPE = ct.CFUNCTYPE( 263 | ct.c_int, 264 | unw_addr_space_t, 265 | unw_regnum_t, 266 | ct.POINTER(unw_fpreg_t), 267 | ct.c_int, 268 | ct.c_void_p, 269 | ) 270 | GET_PROC_NAME_FUNCTYPE = ct.CFUNCTYPE( 271 | ct.c_int, 272 | unw_addr_space_t, 273 | unw_word_t, 274 | ct.c_char_p, 275 | ct.c_size_t, 276 | ct.POINTER(unw_word_t), 277 | ct.c_void_p, 278 | ) 279 | 280 | create_addr_space = unw_func("create_addr_space") 281 | create_addr_space.restype = ct.c_void_p 282 | create_addr_space.argtypes = [ct.c_void_p, ct.c_int] 283 | 284 | init_remote = unw_func("init_remote") 285 | init_remote.restype = ct.c_int 286 | init_remote.argtypes = [ct.c_void_p, ct.c_void_p, ct.c_int] 287 | 288 | 289 | dwarf_search_unwind_table = unw_func("dwarf_search_unwind_table") 290 | dwarf_search_unwind_table.restype = ct.c_int 291 | dwarf_search_unwind_table.argtypes = [ 292 | unw_addr_space_t, 293 | unw_word_t, 294 | ct.POINTER(unw_dyn_info_t), 295 | ct.POINTER(unw_proc_info_t), 296 | ct.c_int, 297 | ct.c_void_p, 298 | ] 299 | 300 | 301 | class unw_cursor_t(ct.Structure): 302 | """ 303 | Mapping of unw_cursor_t type. 304 | """ 305 | 306 | _fields_ = [("opaque", unw_word_t * UNW_TDEP_CURSOR_LEN)] 307 | 308 | 309 | step = unw_func("step") 310 | step.restype = ct.c_int 311 | step.argtypes = [ct.POINTER(unw_cursor_t)] 312 | 313 | get_reg = unw_func("get_reg") 314 | get_reg.restype = ct.c_int 315 | get_reg.argtypes = [ct.POINTER(unw_cursor_t), unw_regnum_t, ct.POINTER(unw_word_t)] 316 | 317 | 318 | class unw_accesors(ct.Structure): 319 | """ 320 | Mapping of unw_accessors type. 321 | """ 322 | 323 | _fields_ = [ 324 | ("find_proc_info", FIND_PROC_INFO_FUNCTYPE), 325 | ("put_unwind_info", PUT_UNWIND_INFO_FUNCTYPE), 326 | ("get_dyn_info_list_addr", GET_DYN_INFO_LIST_ADDR_FUNCTYPE), 327 | ("access_mem", ACCESS_MEM_FUNCTYPE), 328 | ("access_reg", ACCESS_REG_FUNCTYPE), 329 | ("access_fpreg", ACCESS_FPREG_FUNCTYPE), 330 | ("resume", ct.c_void_p), # Unused 331 | ("get_proc_name", GET_PROC_NAME_FUNCTYPE), 332 | ] 333 | 334 | 335 | class Frame: 336 | """ 337 | A stack frame. 
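Each Frame stores a private copy of the libunwind cursor, so register
values can still be fetched for this frame after the unwinder has
stepped on to outer frames.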
338 | """ 339 | 340 | def __init__( 341 | self, 342 | stack: ct._CData, 343 | ip: int, 344 | die: DIE, 345 | start_addr: int, 346 | processmetadata: ProcessMetadata, 347 | cursor: unw_cursor_t, 348 | prev_frame: Optional[Frame] = None, 349 | next_frame: Optional[Frame] = None, 350 | ): 351 | self.stack = stack 352 | self.ip = ip 353 | self.die = die 354 | 355 | self.start_addr = start_addr 356 | self.processmetadata = processmetadata 357 | # We don't keep the cursor itself, we make a copy instead. 358 | self.cursor = unw_cursor_t() 359 | ct.pointer(self.cursor)[0] = cursor 360 | self.prev_frame = prev_frame 361 | self.next_frame = next_frame 362 | 363 | @cached_property 364 | def fde(self) -> Optional[CFIEntry]: 365 | """ 366 | Returns the FDE associated with this call frame. 367 | """ 368 | region = self.region 369 | if region is None: 370 | return None 371 | v_ip = self.ip - region.start 372 | if region.eh_frame_hdr is None: 373 | return None 374 | fde = region.eh_frame_hdr.find_fde(v_ip) 375 | return fde 376 | 377 | @cached_property 378 | def _expr_parser(self) -> DWARFExprParser: 379 | """ 380 | DWARF Expr parser. 381 | """ 382 | return DWARFExprParser(self.processmetadata.dwarf_info.structs) 383 | 384 | @cached_property 385 | def cfa_rule(self) -> Optional[CFARule]: 386 | """ 387 | Returns the CFA rule associated with this call frame. 388 | """ 389 | if self.fde is None: 390 | return None 391 | for row in reversed(self.fde.get_decoded().table): 392 | if row["pc"] < self.ip - self.region.start: 393 | return row["cfa"] 394 | return None 395 | 396 | @cached_property 397 | def cfa(self) -> Optional[int]: 398 | """ 399 | Compute the CFA for this call frame. 400 | """ 401 | if self.cfa_rule is None: 402 | return None 403 | cfa_reg_value = unw_word_t(0) 404 | get_reg(self.cursor, self.cfa_rule.reg, ct.byref(cfa_reg_value)) 405 | return cfa_reg_value.value + self.cfa_rule.offset - self.start_addr # type: ignore 406 | 407 | @cached_property 408 | def region(self) -> MappedRegion: 409 | """ 410 | Return the MappedRegion correspoding to this Frame's IP. 411 | """ 412 | region = self.processmetadata.map_for_addr(self.ip) 413 | if region is None: 414 | raise ValueError("This frame could not be associated to a region.") 415 | return region 416 | 417 | @cached_property 418 | def function_name(self) -> Optional[str]: 419 | """ 420 | Returns the function name associated to this frame's DIE 421 | """ 422 | if self.die is None: 423 | return None 424 | return die_name(self.die) 425 | 426 | def _get_parsed_expr_for_attribute(self, argnum: int) -> List[DWARFExprOp]: 427 | """ 428 | Returns a list of parsed DwarfEXPROp for the attribute corresponding to the 429 | argnum'th argument. 430 | """ 431 | curargnum = 0 432 | if self.die is None: 433 | return [] 434 | for subdie in self.die.iter_children(): 435 | if subdie.tag == "DW_TAG_formal_parameter": 436 | curargnum += 1 437 | if curargnum == argnum: 438 | locattr = subdie.attributes["DW_AT_location"] 439 | return self._get_parsed_exprs_from_loc(subdie, locattr) 440 | return [] 441 | 442 | def _get_parsed_exprs_from_loc( 443 | self, die: DIE, locattr: AttributeValue 444 | ) -> List[DWARFExprOp]: 445 | """ 446 | Returns a list of parsed DWARFExprOp for a given attribute. 
447 | """ 448 | expr = None 449 | loc = self.processmetadata.location_parser.parse_from_attribute( 450 | locattr, die.cu.header.version, die 451 | ) 452 | if isinstance(loc, LocationExpr): 453 | expr = loc.loc_expr 454 | else: 455 | base_address = die.cu.get_top_DIE().attributes["DW_AT_low_pc"].value 456 | expr = None 457 | for entry in loc: 458 | if isinstance(entry, BaseAddressEntry): 459 | base_address = entry.base_address 460 | elif isinstance(entry, LocationEntry): 461 | start = entry.begin_offset + base_address 462 | end = entry.end_offset + base_address 463 | if start <= (self.ip - self.region.start) <= end: 464 | expr = entry.loc_expr 465 | break 466 | else: 467 | raise NotImplementedError( 468 | f"Location entries of type {type(entry)} are not supported" 469 | ) 470 | if expr is None: 471 | raise ValueError("Could not find LocationExpr in attr {locattr}") 472 | parsed_exprs: List[DWARFExprOp] = self._expr_parser.parse_expr(expr) 473 | return parsed_exprs 474 | 475 | def fetch_arg(self, argnum: int, ctype: Type[CT]) -> CT: 476 | """ 477 | Fetch the argument number argnum, interpreting it as a ctype. 478 | """ 479 | # We have all the registers set up correctly, fetch things directly. 480 | rv: CT 481 | if self.cfa is None: 482 | # Fetch the argument directly from the register 483 | argreg = unw_word_t(0) 484 | ARGNUM_TO_REGNUM = {1: 5, 2: 4, 3: 1, 4: 2, 5: 8} 485 | get_reg(self.cursor, ARGNUM_TO_REGNUM[argnum], ct.byref(argreg)) 486 | return ctype(argreg.value) 487 | expr = self._get_parsed_expr_for_attribute(argnum) 488 | dwarf_stack: List[CT] = [] 489 | for op in expr: 490 | rv = self.eval_expr(op, ctype, dwarf_stack) 491 | return rv 492 | 493 | def _read_arg_from_stack(self, offset: int, ctype: Type[CT]) -> CT: 494 | """ 495 | Read an argument of givent type at the given offset from the stack. 496 | """ 497 | assert 0 <= offset < len(self.stack) # type: ignore 498 | return ctype.from_buffer(bytearray(self.stack)[offset:]) 499 | 500 | def eval_expr( 501 | self, expr: DWARFExprOp, ctype: Type[CT], dwarf_stack: List[CT] 502 | ) -> CT: 503 | """ 504 | Eval simple expressions. 505 | """ 506 | # It's a register 507 | if self.die is None: 508 | raise ValueError("No DIE could be found for frame {self}") 509 | if expr.op_name == "DW_OP_fbreg": 510 | # If we are an inlined subroutine, lookup the parent frame base. 511 | die = self.die 512 | while die.tag == "DW_TAG_inlined_subroutine": 513 | if self.next_frame is None: 514 | raise ValueError("Cannot find parent frame of inlined subroutine") 515 | die = self.next_frame.die 516 | frameexpr = self.processmetadata.location_parser.parse_from_attribute( 517 | die.attributes["DW_AT_frame_base"], 518 | self.die.cu.header.version, 519 | self.die, 520 | ) 521 | parsed_expr = self._expr_parser.parse_expr(frameexpr.loc_expr) 522 | for item in parsed_expr: 523 | base_value = self.eval_expr(item, ct.c_int, dwarf_stack) # type: ignore 524 | offset = base_value.value + expr.args[0] 525 | return self._read_arg_from_stack(offset, ctype) 526 | if expr.op_name == "DW_OP_call_frame_cfa": 527 | return ctype(self.cfa) 528 | if expr.op_name == "DW_OP_entry_value": 529 | # We evaluate the expression in the calling frame. 
530 | for op in expr.args[0]:
531 | if self.next_frame is None:
532 | raise ValueError(
533 | "Cannot find parent frame for evaluation of an entry value"
534 | )
535 | rv = self.next_frame.eval_expr(op, ctype, dwarf_stack)
536 | dwarf_stack.append(rv)
537 | return ctype(0)
538 | if expr.op_name == "DW_OP_stack_value":
539 | return dwarf_stack[-1]
540 | if expr.op_name.startswith("DW_OP_reg"):
541 | # DW_OP_reg0 is 0x50; the DW_OP_reg0..reg31 opcodes encode the register number directly.
542 | regnum = expr.op - 0x50
543 | val = unw_word_t(0)
544 | get_reg(self.cursor, regnum, ct.byref(val))
545 | return ctype(val.value)
546 | raise NotImplementedError(f"Unsupported expr type: {expr.op_name}")
547 |
548 |
549 | class UnwindAddressSpace:
550 | """
551 | A virtual address space for use by libunwind.
552 | """
553 |
554 | def __init__(self, capture: stack_data_t, processmetadata: ProcessMetadata):
555 | self.capture = capture
556 | self.registers: List[ct.c_ulonglong] = [
557 | ct.c_ulonglong(getattr(self.capture, name)) for name in REG_NAMES
558 | ]
559 | self.processmetadata = processmetadata
560 | self.accessors = unw_accesors(
561 | find_proc_info=FIND_PROC_INFO_FUNCTYPE(self.find_proc_info),
562 | put_unwind_info=PUT_UNWIND_INFO_FUNCTYPE(self.put_unwind_info),
563 | get_dyn_info_list_addr=GET_DYN_INFO_LIST_ADDR_FUNCTYPE(
564 | self.get_dyn_info_list_addr
565 | ),
566 | access_mem=ACCESS_MEM_FUNCTYPE(self.access_mem),
567 | access_reg=ACCESS_REG_FUNCTYPE(self.access_reg),
568 | access_fpreg=ACCESS_FPREG_FUNCTYPE(self.access_fpreg),
569 | get_proc_name=GET_PROC_NAME_FUNCTYPE(self.get_proc_name),
570 | )
571 |
572 | # 0 takes the default byteorder
573 | self.unw_addr_space = create_addr_space(ct.byref(self.accessors), 0)
574 | if not self.unw_addr_space:
575 | raise RuntimeError("Something bad happened in create_addr_space")
576 | self.unw_cursor = unw_cursor_t()
577 | retval = init_remote(
578 | ct.byref(self.unw_cursor), self.unw_addr_space, 0
579 | ) # Don't use the opaque pointer for now
580 | if retval != 0:
581 | raise RuntimeError("Something bad happened in init_remote")
582 |
583 | def find_proc_info(
584 | self,
585 | addr_space: unw_addr_space_t,
586 | ip: int,
587 | pip: Pointer[unw_proc_info_t],
588 | need_unwind_info: ct.c_int,
589 | arg: ct.c_void_p,
590 | ) -> int:
591 | # pylint: disable=unused-argument,too-many-arguments
592 | """
593 | Implementation of libunwind find_proc_info callback.
594 | """
595 | # Find the mapped ELF file covering this ip.
596 | mmap = self.processmetadata.map_for_addr(ip)
597 |
598 | if mmap is None or mmap.eh_frame_hdr is None:
599 | return -UNW_ESTOPUNWIND
600 | pip[0] = unw_proc_info_t()
601 | dynamic_info = unw_dyn_info_t(
602 | start_ip=mmap.start,
603 | end_ip=mmap.end,
604 | format=UNW_INFO_FORMAT_REMOTE_TABLE,
605 | )
606 | dynamic_info.rti.name_ptr = 0
607 | # We only consider one specific binary. The virtual address space will
608 | # then consist of the actual stack and we will consider that the
609 | # eh_frame_hdr and everything else is located after that.
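# For libunwind's remote-table format (as we understand it): segbase is the
# remote address of .eh_frame_hdr itself, table_data points at its binary
# search table, and table_len is counted in unw_word_t units rather than bytes.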
609 | dynamic_info.rti.segbase = mmap.start + mmap.eh_frame_hdr.offset
610 | dynamic_info.rti.table_data = (
611 | mmap.start + mmap.eh_frame_hdr.table_start + mmap.eh_frame_hdr.offset
612 | )
613 | dynamic_info.rti.table_len = (mmap.eh_frame_hdr.fde_count * 8) // ct.sizeof(
614 | unw_word_t
615 | )
616 | ret: int = dwarf_search_unwind_table(
617 | addr_space, ip, ct.byref(dynamic_info), pip, need_unwind_info, None
618 | )
619 | return ret
620 |
621 | def put_unwind_info(
622 | self,
623 | addr_space: unw_addr_space_t,
624 | pip: Pointer[unw_proc_info_t],
625 | arg: ct.c_void_p,
626 | ) -> None:
627 | """
628 | Implementation of libunwind put_unwind_info callback.
629 | """
630 | # pylint: disable=unused-argument
631 | return
632 |
633 | def get_dyn_info_list_addr(
634 | self,
635 | addr_space: unw_addr_space_t,
636 | dilap: Pointer[unw_word_t],
637 | arg: ct.c_void_p,
638 | ) -> int:
639 | """
640 | Implementation of libunwind get_dyn_info_list_addr callback.
641 | """
642 | # pylint: disable=unused-argument
643 | return -UNW_ENOINFO
644 |
645 | def access_mem(
646 | self,
647 | addr_space: unw_addr_space_t,
648 | addr: int,
649 | valp: Pointer[unw_word_t],
650 | write: int,
651 | arg: ct.c_void_p,
652 | ) -> int:
653 | """
654 | Implementation of libunwind access_mem callback.
655 | """
656 | # pylint: disable=unused-argument,too-many-arguments
657 | # We only support either file-mapped addresses, or addresses
658 | # referring to the stack.
659 | region = self.processmetadata.map_for_addr(addr)
660 | if region is None:
661 | return -UNW_EINVAL
662 | if region.path == "[stack]":
663 | stack_idx = addr - self.capture.start_addr
664 | if stack_idx >= self.capture.size:
665 | return -UNW_EINVAL
666 | if write == 0:
667 | valp[0] = unw_word_t.from_buffer(
668 | bytearray(self.capture.stack[stack_idx : stack_idx + 8])
669 | )
670 | else:
671 | self.capture.stack[stack_idx] = valp.contents
672 | return 0
673 |
674 | # It's from the ELFFile itself.
675 | if region.real_path:
676 | if write == 0:
677 | with region.real_path.open("rb") as f:
678 | f.seek(addr - region.start)
679 | valp[0] = unw_word_t.from_buffer(
680 | bytearray(f.read(ct.sizeof(unw_word_t)))
681 | )
682 | return 0
683 | return -UNW_EINVAL
684 |
685 | # It's from anywhere else: return EINVAL
686 | return -UNW_EINVAL
687 |
688 | def access_reg(
689 | self,
690 | addr_space: unw_addr_space_t,
691 | regnum: int,
692 | valp: Pointer[unw_word_t],
693 | write: int,
694 | arg: ct.c_void_p,
695 | ) -> int:
696 | """
697 | Implementation of libunwind access_reg callback.
698 | """
699 | # pylint: disable=unused-argument,too-many-arguments
700 | if write == 0:
701 | valp[0] = unw_word_t(self.registers[regnum].value)
702 | else:
703 | self.registers[regnum] = valp.contents
704 | return 0
705 |
706 | def access_fpreg(
707 | self,
708 | addr_space: unw_addr_space_t,
709 | regnum: unw_regnum_t,
710 | fpvalp: Pointer[unw_fpreg_t],
711 | write: ct.c_int,
712 | arg: ct.c_void_p,
713 | ) -> int:
714 | """
715 | Implementation of libunwind access_fpreg callback.
716 | """
717 | # pylint: disable=unused-argument,too-many-arguments
718 | return -UNW_EINVAL
719 |
720 | def get_proc_name(
721 | self,
722 | addr_space: unw_addr_space_t,
723 | addr: unw_word_t,
724 | bufp: ct.c_char_p,
725 | buf_len: ct.c_size_t,
726 | offp: Pointer[unw_word_t],
727 | arg: ct.c_void_p,
728 | ) -> int:
729 | """
730 | Implementation of libunwind get_proc_name callback.
731 | """ 732 | # pylint: disable=unused-argument,too-many-arguments 733 | return -UNW_EINVAL 734 | 735 | def ip(self) -> int: 736 | """ 737 | Return the instruction pointer from the unwind cursor. 738 | """ 739 | ip = unw_word_t(0) 740 | get_reg(self.unw_cursor, UNW_REG_IP, ct.byref(ip)) 741 | return ip.value 742 | 743 | def dies_for_ip(self) -> Tuple[DIE, ...]: 744 | """ 745 | Return a tuple of DIEs for a given ip. 746 | """ 747 | ip = self.ip() 748 | region = self.processmetadata.map_for_addr(ip) 749 | if region is None: 750 | return (None,) 751 | if region.path == str(self.processmetadata.program_raw): 752 | dies = self.processmetadata.get_die_and_inlined_subdies_for_addr( 753 | ip - region.start 754 | ) 755 | if dies is not None: 756 | return dies 757 | return (None,) 758 | 759 | def frames(self) -> Generator[Frame, None, None]: 760 | """ 761 | Returns the list of frames for this stack. 762 | """ 763 | cur = ct.byref(self.unw_cursor) 764 | prev_frame = None 765 | while True: 766 | # Extract the IP 767 | ip = self.ip() 768 | for die in self.dies_for_ip(): 769 | # The cursor is copied by the frame, no need to 770 | # worry about it 771 | cur_frame = Frame( 772 | self.capture.stack, 773 | ip, 774 | die, 775 | self.capture.start_addr, 776 | self.processmetadata, 777 | self.unw_cursor, 778 | prev_frame=prev_frame, 779 | ) 780 | if prev_frame is not None: 781 | prev_frame.next_frame = cur_frame 782 | yield prev_frame 783 | prev_frame = cur_frame 784 | if step(cur) <= 0: 785 | break 786 | if prev_frame is not None: 787 | yield prev_frame 788 | -------------------------------------------------------------------------------- /src/pgtracer/model/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Models definitions for execution concepts we extract information about. 3 | """ 4 | from .memory import MemoryAllocations, MemoryAllocType, memory_account 5 | from .plan import PlanState 6 | from .query import Query 7 | 8 | __all__ = [ 9 | "Query", 10 | "PlanState", 11 | "memory_account", 12 | "MemoryAllocations", 13 | "MemoryAllocType", 14 | ] 15 | -------------------------------------------------------------------------------- /src/pgtracer/model/memory.py: -------------------------------------------------------------------------------- 1 | """ 2 | Classes storing information about memory allocations. 3 | """ 4 | 5 | import ctypes as ct 6 | from dataclasses import dataclass 7 | from enum import IntEnum 8 | 9 | 10 | # pylint: disable=invalid-name 11 | class MemoryAllocType(IntEnum): 12 | """ 13 | MemoryAllocation types. 14 | """ 15 | 16 | Sbrk = 1 17 | Mmap = 2 18 | 19 | 20 | class memory_account(ct.Structure): 21 | """ 22 | Represents the data associated to a memory allocation or deallocation. 23 | """ 24 | 25 | _fields_ = [ 26 | ("event_type", ct.c_short), 27 | ("size", ct.c_longlong), 28 | ("kind", ct.c_short), 29 | ] 30 | 31 | 32 | @dataclass 33 | class MemoryAllocations: 34 | """ 35 | Memory allocation counters. 36 | """ 37 | 38 | mmap_alloc: int = 0 39 | mmap_free: int = 0 40 | sbrk_alloc: int = 0 41 | sbrk_free: int = 0 42 | 43 | current_running_mmap: int = 0 44 | current_running_sbrk: int = 0 45 | 46 | current_mem_peak: int = 0 47 | 48 | @property 49 | def mmap_total(self) -> int: 50 | """ 51 | Compute the resulting mmaped total. 52 | """ 53 | return self.mmap_alloc - self.mmap_free 54 | 55 | @property 56 | def sbrk_total(self) -> int: 57 | """ 58 | Compute the resulting sbrk total. 
59 | """ 60 | return self.sbrk_alloc - self.sbrk_free 61 | 62 | @property 63 | def total_malloc(self) -> int: 64 | """ 65 | Compute the total memory diff. 66 | """ 67 | return self.mmap_total + self.sbrk_total 68 | 69 | def update(self, memory_account_event: memory_account) -> None: 70 | """ 71 | Update the current totals. 72 | """ 73 | if memory_account_event.kind == MemoryAllocType.Sbrk: 74 | self.current_running_sbrk += memory_account_event.size 75 | if memory_account_event.size > 0: 76 | self.sbrk_alloc += memory_account_event.size 77 | else: 78 | self.sbrk_free += -memory_account_event.size 79 | elif memory_account_event.kind == MemoryAllocType.Mmap: 80 | self.current_running_mmap += memory_account_event.size 81 | if memory_account_event.size > 0: 82 | self.mmap_alloc += memory_account_event.size 83 | else: 84 | self.mmap_free += -memory_account_event.size 85 | self.current_mem_peak = max( 86 | self.current_mem_peak, self.current_running_sbrk + self.current_running_mmap 87 | ) 88 | -------------------------------------------------------------------------------- /src/pgtracer/model/plan.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module contains definitions for representing PostgreSQL plans. 3 | """ 4 | from __future__ import annotations 5 | 6 | import ctypes as ct 7 | from typing import TYPE_CHECKING, Dict, Optional 8 | 9 | from ..ebpf.collector.c_defs import plan_data, planstate_data 10 | from ..ebpf.dwarf import ProcessMetadata, Struct 11 | from ..utils import timespec_to_float 12 | 13 | if TYPE_CHECKING: 14 | from enum import IntEnum 15 | 16 | 17 | def explain_dict_to_str(parts: Dict[str, str]) -> str: 18 | """ 19 | Format a dict in the commonly used key=value format. 20 | """ 21 | return " ".join(f"{key}={value}" for key, value in parts.items()) 22 | 23 | 24 | class PlanState: 25 | """ 26 | Information collected from a PostgreSQL PlanState Node. 27 | """ 28 | 29 | def __init__(self, addr: Optional[int]): 30 | self.addr = addr 31 | self.tag: Optional[IntEnum] = None 32 | self.instrument: Optional[Struct] = None 33 | self.parent_node: Optional[PlanState] = None 34 | self.plan_data: Optional[plan_data] = None 35 | self.is_stub = True 36 | # We're using a Dict as poor man's OrderedSet 37 | self.children: Dict[PlanState, None] = {} 38 | 39 | def update(self, metadata: ProcessMetadata, event: planstate_data) -> None: 40 | """ 41 | Update a Planstate from an event planstate_data. 42 | """ 43 | instrument_addr = ct.addressof(event.instrument) 44 | tag = metadata.enums.NodeTag(event.plan_data.plan_tag) # type: ignore 45 | self.tag = tag 46 | self.instrument = metadata.structs.Instrumentation(instrument_addr) 47 | self.plan_data = plan_data() 48 | ct.pointer(self.plan_data)[0] = event.plan_data 49 | 50 | @property 51 | def title(self) -> str: 52 | """ 53 | Return the node's title. 54 | """ 55 | if self.tag is None: 56 | return "???" 
57 | prefix = ""
58 | if self.plan_data and self.plan_data.parallel_aware:
59 | prefix = "Parallel "
60 | buf = f"{prefix}{self.tag.name[2:]}"
61 | # TODO: add additional information here
62 | return buf
63 |
64 | @property
65 | def cost(self) -> str:
66 | """
67 | Returns the "cost" section formatted similarly to PostgreSQL explain.
68 | """
69 | if self.plan_data is None:
70 | parts = {"cost": "?..?", "rows": "?", "width": "?"}
71 | else:
72 | parts = {
73 | "cost": f"{self.plan_data.startup_cost:.2f}..{self.plan_data.total_cost:.2f}",
74 | "rows": f"{int(self.plan_data.plan_rows)}",
75 | "width": f"{int(self.plan_data.plan_width)}",
76 | }
77 | return f"({explain_dict_to_str(parts)})"
78 |
79 | @property
80 | def actual(self) -> str:
81 | """
82 | Returns the "actual" section formatted similarly to PostgreSQL explain.
83 | """
84 | if self.instrument is None:
85 | parts = {"time": "?..?", "rows": "?", "loops": "?"}
86 | else:
87 | total = timespec_to_float(self.instrument.counter)
88 | parts = {
89 | "time": f"{(self.instrument.firsttuple.value * 1000):0.3f}..{(total * 1000):0.3f}",
90 | "rows": f"{int(self.instrument.tuplecount.value)}",
91 | "loops": f"{int(self.instrument.nloops.value)}",
92 | }
93 | return f"(actual {explain_dict_to_str(parts)})"
94 |
95 | @property
96 | def buffers(self) -> str:
97 | """
98 | Returns the "buffers" section formatted similarly to PostgreSQL
99 | explain.
100 | """
101 | if self.instrument is None:
102 | return ""
103 | bufusage_dict = self.instrument.bufusage.as_dict(include_all=True)
104 | parts = {}
105 | for key, value in bufusage_dict.items():
106 | if isinstance(value, (ct.c_long,)) and value.value != 0:
107 | parts[key] = str(value.value)
108 | if not parts:
109 | return ""
110 | return f"Buffers: {explain_dict_to_str(parts)}"
111 |
112 | def explain(self, indent_level: int = 0) -> str:
113 | """
114 | Format the plan represented by this node similarly to PostgreSQL
115 | explain.
116 | """
117 | if indent_level == 0:
118 | prefix = ""
119 | else:
120 | prefix = "\t" * indent_level + "-> "
121 | buf = f"{prefix}{self.title} {self.cost} {self.actual}"
122 | buffer_line = self.buffers
123 | if buffer_line:
124 | buf += "\n" + "\t" * (indent_level + 1) + buffer_line
125 | for child in self.children:
126 | buf += "\n"
127 | buf += child.explain(indent_level + 1)
128 | return buf
129 |
--------------------------------------------------------------------------------
/src/pgtracer/model/query.py:
--------------------------------------------------------------------------------
1 | """
2 | This module contains definitions for representing PostgreSQL queries.
3 | """ 4 | from __future__ import annotations 5 | 6 | import ctypes as ct 7 | from collections import defaultdict 8 | from datetime import datetime, timedelta 9 | from typing import TYPE_CHECKING, Any, Dict, Optional 10 | 11 | from ..ebpf.unwind import UnwindAddressSpace, stack_data_t 12 | from ..utils import timespec_to_timedelta 13 | from .memory import MemoryAllocations 14 | from .plan import PlanState 15 | 16 | if TYPE_CHECKING: 17 | from ..ebpf.collector import planstate_data, portal_data 18 | from ..ebpf.dwarf import ProcessMetadata 19 | 20 | 21 | FUNCTION_ARGS_MAPPING = { 22 | "ExecProcNodeFirst": 1, 23 | "ExecProcNodeInstr": 1, 24 | "ExecProcNode": 1, 25 | "ExecAgg": 1, 26 | "ExecAppend": 1, 27 | "ExecBitmapAnd": 1, 28 | "ExecBitmapHeapScan": 1, 29 | "ExecBitmapIndexScan": 1, 30 | "ExecBitmapOr": 1, 31 | "ExecCteScan": 1, 32 | "ExecCustomScan": 1, 33 | "ExecForeignScan": 1, 34 | "ExecFunctionScan": 1, 35 | "ExecGather": 1, 36 | "ExecGatherMerge": 1, 37 | "ExecGroup": 1, 38 | "ExecHash": 1, 39 | "ExecHashJoin": 1, 40 | "ExecIncrementalSort": 1, 41 | "ExecIndexOnlyScan": 1, 42 | "ExecIndexScan": 1, 43 | "ExecLimit": 1, 44 | "ExecLockRows": 1, 45 | "ExecMaterial": 1, 46 | "ExecMemoize": 1, 47 | "ExecMergeAppend": 1, 48 | "ExecMergeJoin": 1, 49 | "ExecModifyTable": 1, 50 | "ExecNamedTuplestoreScan": 1, 51 | "ExecNestLoop": 1, 52 | "ExecProjectSet": 1, 53 | "ExecRecursiveUnion": 1, 54 | "ExecResult": 1, 55 | "ExecSampleScan": 1, 56 | "ExecSeqScan": 1, 57 | "ExecSetOp": 1, 58 | "ExecSort": 1, 59 | "ExecSubqueryScan": 1, 60 | "ExecTableFuncScan": 1, 61 | "ExecTidRangeScan": 1, 62 | "ExecTidScan": 1, 63 | "ExecUnique": 1, 64 | "ExecValuesScan": 1, 65 | "ExecWindowAgg": 1, 66 | "ExecWorkTableScan": 1, 67 | "MultiExecHash": 1, 68 | "MultiExecBitmapIndexScan": 1, 69 | "MultiExecBitmapAnd": 1, 70 | "MultiExecBitmapOr": 1, 71 | } 72 | 73 | 74 | class Query: 75 | """ 76 | A PostgreSQL Query. 77 | """ 78 | 79 | def __init__( 80 | self, 81 | *, 82 | addr: int, 83 | query_id: int, 84 | startup_cost: float, 85 | total_cost: float, 86 | plan_rows: float, 87 | startts: Optional[float] = None, 88 | text: Optional[str] = None, 89 | # Instrumentation is a dynamically generated class, no way to check it 90 | instrument: Any = None, 91 | search_path: Optional[str] = None, 92 | ): 93 | self.addr = addr 94 | self.query_id = query_id 95 | self.startup_cost = startup_cost 96 | self.total_cost = total_cost 97 | self.plan_rows = plan_rows 98 | self.startts = startts 99 | self.text = text 100 | self.instrument = instrument 101 | self.search_path = search_path 102 | self.nodes: Dict[int, PlanState] = {} 103 | self.io_counters: Dict[str, int] = defaultdict(lambda: 0) 104 | self.memallocs: MemoryAllocations = MemoryAllocations() 105 | 106 | @property 107 | def root_node(self) -> PlanState: 108 | """ 109 | Returns the plan's root node. 110 | """ 111 | root_candidates = [ 112 | node for node in self.nodes.values() if node.parent_node is None 113 | ] 114 | if len(root_candidates) == 0: 115 | raise ValueError("Invalid plan, we have no root node when we expect 1") 116 | if len(root_candidates) > 1: 117 | # In that case, we need to build a "fake" parent node. 118 | root_node = PlanState(None) 119 | root_node.children = {c: None for c in root_candidates} 120 | else: 121 | root_node = root_candidates[0] 122 | return root_node 123 | 124 | @classmethod 125 | def from_event(cls, metadata: ProcessMetadata, event: portal_data) -> Query: 126 | """ 127 | Build a query from portal_data event generated by eBPF. 
128 | """ 129 | instrument_addr = ct.addressof(event.instrument) 130 | instrument = metadata.structs.Instrumentation(instrument_addr) 131 | search_path = None 132 | if event.search_path: 133 | search_path = event.search_path.decode("utf8") 134 | _, creation_time = event.portal_key.as_tuple() 135 | return cls( 136 | addr=event.query_addr, 137 | query_id=event.query_id, 138 | startup_cost=event.startup_cost, 139 | total_cost=event.total_cost, 140 | plan_rows=event.plan_rows, 141 | startts=creation_time, 142 | text=event.query.decode("utf8"), 143 | instrument=instrument, 144 | search_path=search_path, 145 | ) 146 | 147 | def update(self, metadata: ProcessMetadata, event: portal_data) -> None: 148 | """ 149 | Update the query from an eBPF portal_data event. 150 | """ 151 | instrument_addr = ct.addressof(event.instrument) 152 | instrument = metadata.structs.Instrumentation(instrument_addr) 153 | if instrument.running: 154 | self.instrument = instrument 155 | _, creation_time = event.portal_key.as_tuple() 156 | self.startts = creation_time or self.startts 157 | self.text = event.query.decode("utf-8") or self.text 158 | search_path = event.search_path.decode("utf8") 159 | self.search_path = search_path or self.search_path 160 | 161 | @property 162 | def start_datetime(self) -> Optional[datetime]: 163 | """ 164 | Returns the creation timestamp of the portal associated to this query. 165 | """ 166 | if self.startts is None: 167 | return None 168 | return datetime.fromtimestamp(self.startts / 1000000) 169 | 170 | @property 171 | def runtime(self) -> Optional[timedelta]: 172 | """ 173 | Returns the query's top-node total runtime. 174 | """ 175 | if self.instrument and self.instrument.need_timer.value: 176 | return timespec_to_timedelta(self.instrument.counter) 177 | return None 178 | 179 | @property 180 | def shared_buffers_hitratio(self) -> Optional[float]: 181 | """ 182 | Returns the hit ratio from the shared buffers. 183 | """ 184 | if self.instrument is None: 185 | return None 186 | bufusage = self.instrument.bufusage 187 | total_blks = bufusage.shared_blks_hit.value + bufusage.shared_blks_read.value 188 | # If we didn't read any block, hit ratio is None 189 | if total_blks == 0: 190 | return None 191 | return float(bufusage.shared_blks_hit.value / total_blks * 100) 192 | 193 | @property 194 | def syscache_hitratio(self) -> Optional[float]: 195 | """ 196 | Returns the system's hit ratio. 197 | """ 198 | if self.instrument is None: 199 | return None 200 | bufusage = self.instrument.bufusage 201 | # FIXME: don't assume a fixed block size, either pass it as an option 202 | # or query the actual value from the DB 203 | blksize = 8192 204 | total_blks = ( 205 | bufusage.shared_blks_read.value 206 | + bufusage.local_blks_read.value 207 | + bufusage.temp_blks_read.value 208 | ) 209 | total_bytes = total_blks * blksize 210 | if total_bytes == 0: 211 | return None 212 | bytes_hit = total_bytes - self.io_counters["R"] 213 | return float(bytes_hit / total_bytes * 100) 214 | 215 | def add_nodes_from_stack( 216 | self, 217 | metadata: ProcessMetadata, 218 | stack: stack_data_t, 219 | start_at: int = 0, 220 | base_node: Optional[PlanState] = None, 221 | ) -> None: 222 | """ 223 | Process a capture stack to add node stubs to this query. 
224 | """ 225 | addr_space = UnwindAddressSpace(stack, metadata) 226 | nodes = self.nodes 227 | cur_node = base_node 228 | for idx, frame in enumerate(addr_space.frames()): 229 | if idx < start_at: 230 | continue 231 | if frame.function_name in FUNCTION_ARGS_MAPPING: 232 | argnum = FUNCTION_ARGS_MAPPING[frame.function_name] 233 | parent_addr = frame.fetch_arg(argnum, ct.c_ulonglong).value 234 | if cur_node and parent_addr == cur_node.addr: 235 | continue 236 | parent_node = nodes.get(parent_addr) 237 | if parent_node is None: 238 | parent_node = PlanState(parent_addr) 239 | nodes[parent_addr] = parent_node 240 | if cur_node: 241 | cur_node.parent_node = parent_node 242 | parent_node.children[cur_node] = None 243 | # The parent_node is already not a stub, meaning its ancestors 244 | # have been resolved. Stop walking the frame here 245 | if not parent_node.is_stub: 246 | break 247 | cur_node = parent_node 248 | 249 | def add_node_from_event( 250 | self, metadata: ProcessMetadata, event: planstate_data 251 | ) -> PlanState: 252 | """ 253 | Add a node from planstate_data event to this query plantree. 254 | We walk the stack up to understand where the nodes are located relative 255 | to each other. 256 | """ 257 | nodes = self.nodes 258 | addr = event.planstate_addr 259 | planstate = nodes.get(addr) 260 | if planstate is None: 261 | planstate = PlanState(addr) 262 | nodes[addr] = planstate 263 | planstate.update(metadata, event) 264 | if not planstate.is_stub: 265 | return planstate 266 | self.add_nodes_from_stack( 267 | metadata, event.stack_capture, start_at=1, base_node=planstate 268 | ) 269 | planstate.is_stub = False 270 | return planstate 271 | -------------------------------------------------------------------------------- /src/pgtracer/scripts/pgtrace_gucs.py: -------------------------------------------------------------------------------- 1 | """ 2 | This simple script reads and writes GUCs in a running PostgreSQL backend 3 | """ 4 | import argparse 5 | 6 | from pgtracer.ebpf.collector.guc import GUCTracerBPFCollector, GUCTracerOptions 7 | 8 | 9 | def main() -> None: 10 | """ 11 | Entry point for the pgtrace_gucs script. 12 | """ 13 | parser = argparse.ArgumentParser( 14 | description="Run and / or write GUCs from a running PostgreSQL backend." 15 | ) 16 | parser.add_argument("pid", type=int, help="PID to connect to") 17 | 18 | parser.add_argument( 19 | "--set-guc", 20 | metavar="GUC=VALUE", 21 | dest="set_gucs", 22 | nargs="+", 23 | default=[], 24 | help="Set a number of GUCs in the running backend", 25 | ) 26 | 27 | args = parser.parse_args() 28 | pid = args.pid 29 | 30 | # Parse the set-guc option. 
31 | set_gucs = {}
32 | for keyvalue in args.set_gucs:
33 | # Split on the first "=" only, so values themselves may contain "=".
34 | key, value = keyvalue.split("=", 1)
35 | set_gucs[key] = value
36 | options = GUCTracerOptions()
37 |
38 | collector = GUCTracerBPFCollector.from_pid(pid, options)
39 | collector.start()
40 | print(f"Backend is of type {collector.backend_type}")
41 | seen = set()
42 | for gucname, gucvalue in set_gucs.items():
43 | collector.set_guc(gucname, gucvalue)
44 | while collector.is_running:
45 | with collector.lock:
46 | for guc in collector.guc_defs.values():
47 | if guc.guc_name is not None:
48 | seen.add(guc.guc_name)
49 | collector.stop()
50 |
51 |
52 | if __name__ == "__main__":
53 | main()
54 |
--------------------------------------------------------------------------------
/src/pgtracer/scripts/pgtrace_queries.py:
--------------------------------------------------------------------------------
1 | """
2 | This simple script traces queries executed by a Postgres backend.
3 | """
4 |
5 | import argparse
6 | import sys
7 | import time
8 | from collections import defaultdict
9 | from datetime import timedelta
10 | from typing import Any, Dict, Optional
11 |
12 | from pgtracer.ebpf.collector.querytracer import (
13 | InstrumentationFlags,
14 | QueryTracerBPFCollector,
15 | QueryTracerOptions,
16 | )
17 | from pgtracer.ebpf.dwarf import Struct
18 | from pgtracer.model.query import Query
19 | from pgtracer.utils import timespec_to_timedelta
20 |
21 |
22 | def dump_dict(somedict: Dict[str, Any], indent: int = 0) -> str:
23 | """
24 | Dump a dictionary as an indented string of key / value pairs.
25 | """
26 | parts = []
27 | tabs = "\t" * indent
28 | for key, value in somedict.items():
29 | if isinstance(value, Struct):
30 | # Special case for timespec
31 | if value.__class__.__name__ == "timespec":
32 | try:
33 | value = timespec_to_timedelta(value)
34 | except OverflowError:
35 | # Ignore overflowing timespecs
36 | continue
37 | else:
38 | value = value.as_dict(include_all=True)
39 | if isinstance(value, dict):
40 | part = "\n" + dump_dict(value, indent + 1)
41 | else:
42 | if hasattr(value, "value"):
43 | part = value.value
44 | else:
45 | part = value
46 | parts.append(f"{tabs}{key}: {part}")
47 | return "\n".join(parts)
48 |
49 |
50 | def print_query(query: Query, options: QueryTracerOptions) -> None:
51 | """
52 | Print a query according to which collector options have been set.
53 | """ 54 | parts = [] 55 | start = "" 56 | if query.start_datetime is not None: 57 | start = query.start_datetime.isoformat() 58 | parts.append(f"{start} {query.text}") 59 | mapping = {} 60 | mapping["search_path"] = query.search_path 61 | mapping["query_id"] = str(query.query_id) or "" 62 | mapping["startup_cost"] = str(query.startup_cost) 63 | mapping["total_cost"] = str(query.total_cost) 64 | mapping["plan_rows"] = str(query.plan_rows) 65 | mapping["peak_mem_alloc"] = str(query.memallocs.current_mem_peak) 66 | if query.instrument.need_timer: 67 | mapping["runtime"] = str(query.runtime) 68 | if options.instrument_flags & InstrumentationFlags.BUFFERS: 69 | mapping["written_bytes_to_disk"] = str(query.io_counters["W"]) 70 | if query.shared_buffers_hitratio is not None: 71 | mapping["shared_buffers_hitratio"] = f"{query.shared_buffers_hitratio:0.2f}" 72 | else: 73 | mapping["shared_buffers_hitratio"] = None 74 | if query.syscache_hitratio is not None: 75 | mapping["syscache_hitratio"] = f"{query.syscache_hitratio:0.2f}" 76 | else: 77 | mapping["syscache_hitratio"] = None 78 | if query.instrument: 79 | mapping["buffer_usage"] = query.instrument.bufusage 80 | if options.instrument_flags & InstrumentationFlags.WAL and query.instrument: 81 | mapping["wal_usage"] = query.instrument.walusage 82 | print(query.text) 83 | print(dump_dict(mapping, 1)) 84 | if options.enable_nodes_collection: 85 | print(query.root_node.explain()) 86 | 87 | 88 | LINE_UP = "\033[1A" 89 | LINE_CLEAR = "\x1b[2K" 90 | 91 | 92 | def print_running_query( 93 | query: Query, print_plan: bool, first_time: bool, clear_line: int = 0 94 | ) -> int: 95 | """ 96 | Print the currently running query. 97 | """ 98 | nb_lines = 0 99 | if first_time: 100 | print("Currently running:") 101 | print(query.text) 102 | if not print_plan: 103 | print("Tuples produced / tuple expected") 104 | print("") 105 | for _ in range(clear_line): 106 | print(LINE_UP, end=LINE_CLEAR) 107 | if print_plan and query.root_node: 108 | plan = query.root_node.explain() 109 | nb_lines = len(plan.split("\n")) 110 | print(plan) 111 | else: 112 | print(f"{int(query.instrument.tuplecount.value)} / {int(query.plan_rows)}") 113 | return nb_lines 114 | 115 | 116 | def main() -> None: 117 | """ 118 | Entry point for the pgtrace_queries script. 119 | """ 120 | parser = argparse.ArgumentParser( 121 | description="Dump a running backend execution plan" 122 | ) 123 | parser.add_argument("pid", type=int, help="PID to connect to") 124 | parser.add_argument( 125 | "--instrument", 126 | "-I", 127 | type=str, 128 | default=None, 129 | nargs="*", 130 | choices=[flag.name for flag in InstrumentationFlags], 131 | action="extend", 132 | help="""Instrument flags to set. 
(warning: writes into backends 133 | memory!)""", 134 | ) 135 | parser.add_argument( 136 | "--nodes-collection", 137 | "-n", 138 | default=False, 139 | action="store_true", 140 | help="""Collect information about individual execution nodes""", 141 | ) 142 | 143 | args = parser.parse_args() 144 | pid = args.pid 145 | instrument_flags = 0 146 | if args.instrument: 147 | for flag in args.instrument: 148 | instrument_flags |= InstrumentationFlags[flag] 149 | options = QueryTracerOptions( 150 | instrument_flags=instrument_flags, 151 | enable_nodes_collection=args.nodes_collection, 152 | enable_perf_events=instrument_flags != 0, 153 | ) 154 | collector = QueryTracerBPFCollector.from_pid(pid, options) 155 | collector.start() 156 | total_queries = 0 157 | last_running_query: Dict[int, Optional[Query]] = defaultdict(lambda: None) 158 | lines_to_clear = 0 159 | while collector.is_running: 160 | try: 161 | time.sleep(1) 162 | for ( 163 | pid, 164 | process_info, 165 | ) in collector.event_handler.per_process_info.copy().items(): 166 | if not process_info.query_history and process_info.current_query: 167 | first_time = ( 168 | last_running_query[pid] is not process_info.current_query 169 | ) 170 | if first_time: 171 | lines_to_clear = 0 172 | lines_to_clear = print_running_query( 173 | process_info.current_query, 174 | options.enable_nodes_collection, 175 | first_time, 176 | lines_to_clear, 177 | ) 178 | last_running_query[pid] = process_info.current_query 179 | continue 180 | last_running_query[pid] = None 181 | for query in process_info.query_history: 182 | print_query(query, options) 183 | total_queries += len(process_info.query_history) 184 | process_info.query_history = [] 185 | except KeyboardInterrupt: 186 | break 187 | collector.stop() 188 | total_processes = len(collector.event_handler.process_history) + len( 189 | collector.event_handler.per_process_info 190 | ) 191 | print(f"Processed {total_queries} queries among {total_processes} processes") 192 | 193 | 194 | if __name__ == "__main__": 195 | main() 196 | -------------------------------------------------------------------------------- /src/pgtracer/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Miscellaneous utility functions. 3 | """ 4 | 5 | import functools 6 | import itertools 7 | import re 8 | import subprocess 9 | from datetime import timedelta 10 | from typing import TYPE_CHECKING, BinaryIO, Optional, Union 11 | 12 | from pypsutil import Process 13 | 14 | from pgtracer.ebpf.dwarf import Struct 15 | 16 | if TYPE_CHECKING: 17 | from ctypes import _CData 18 | else: 19 | _CData = object 20 | 21 | 22 | def timespec_to_timedelta(timespec: Union[_CData, Struct]) -> timedelta: 23 | """ 24 | Convert a timespec_t or instr_time struct to a timedelta. 25 | """ 26 | # Can't really compare it to a proper class, so test on the class name 27 | if timespec.__class__.__name__ == "timespec": 28 | return timedelta( 29 | seconds=timespec.tv_sec.value, # type: ignore 30 | microseconds=timespec.tv_nsec.value / 1000, # type: ignore 31 | ) 32 | if timespec.__class__.__name__ == "instr_time": 33 | return timedelta(seconds=timespec.ticks.value / 1000000000) # type: ignore 34 | raise ValueError("Expecting a timespec or instr_time struct") 35 | 36 | 37 | def timespec_to_float(timespec: _CData) -> float: 38 | """ 39 | Convert a timespec_t or instr_time struct to a float representing the number of seconds. 
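For example, a timespec of (tv_sec=1, tv_nsec=500000000) converts to 1.5.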
40 | """ 41 | if timespec.__class__.__name__ == "timespec": 42 | return float(timespec.tv_sec.value + timespec.tv_nsec.value / 1000000000) # type: ignore 43 | if timespec.__class__.__name__ == "instr_time": 44 | return float(timespec.ticks.value / 1000000000) # type: ignore 45 | raise ValueError("Expecting a timespec or instr_time struct") 46 | 47 | 48 | NSPID_PARSING_RE = re.compile(rb"^NSpid:\s+((?:(?:\d+)\s*)+)") 49 | 50 | 51 | def resolve_container_pid(container: str, container_pid: int) -> Optional[int]: 52 | """ 53 | Resolve container_pid from the systemd-nspawn container `container` 54 | to a host pid. 55 | """ 56 | # FIXME: this probably does not handle nested namespaces. 57 | completed_process = subprocess.run( 58 | ["machinectl", "show", container, "-p", "Leader"], 59 | capture_output=True, 60 | check=True, 61 | ) 62 | container_leader_pid = int(completed_process.stdout.split(b"=")[1]) 63 | # Now iterate over all child processes from this container. 64 | leader_process = Process(container_leader_pid) 65 | for child in leader_process.children(recursive=True): 66 | with open(f"/proc/{child.pid}/status", "rb") as statf: 67 | for line in statf: 68 | nspid_match = NSPID_PARSING_RE.match(line) 69 | if nspid_match: 70 | ns_pids = list(map(int, nspid_match.group(1).strip().split(b"\t"))) 71 | if ns_pids[-1] == container_pid: 72 | return ns_pids[0] 73 | return None 74 | 75 | 76 | def readcstr(filelike: BinaryIO) -> bytes: 77 | """ 78 | Read a NULL terminated C-string from a BinaryIO 79 | Courtesy of https://stackoverflow.com/a/32775270 80 | """ 81 | toeof = iter(functools.partial(filelike.read, 1), b"") 82 | return b"".join(itertools.takewhile(b"\0".__ne__, toeof)) 83 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pytest fixtures. 3 | """ 4 | 5 | import os 6 | import re 7 | import subprocess 8 | from pathlib import Path 9 | from pwd import getpwnam 10 | from tempfile import TemporaryDirectory 11 | from typing import Iterator 12 | 13 | import port_for 14 | import psycopg 15 | import pytest 16 | from pytest import FixtureRequest 17 | from pytest_postgresql.config import get_config 18 | from pytest_postgresql.executor import PostgreSQLExecutor 19 | from pytest_postgresql.executor_noop import NoopExecutor 20 | 21 | from pgtracer.ebpf.collector import CollectorOptions 22 | from pgtracer.ebpf.collector.guc import GUCTracerBPFCollector 23 | from pgtracer.ebpf.collector.querytracer import ( 24 | InstrumentationFlags, 25 | QueryTracerBPFCollector, 26 | ) 27 | from pgtracer.utils import resolve_container_pid 28 | 29 | 30 | def pytest_addoption(parser): 31 | """ 32 | Add the required options to pytest. 33 | """ 34 | parser.addoption( 35 | "--container", 36 | help="Set this if the backend we are testing against is " 37 | "running inside a container.", 38 | ) 39 | 40 | 41 | def pytest_configure(config): 42 | """ 43 | Add used markers. 44 | """ 45 | config.addinivalue_line( 46 | "markers", "slow: mark test as being 'slow', allowing to skip it" 47 | ) 48 | 49 | 50 | @pytest.fixture(scope="session") 51 | def nonroot_postgres(request: FixtureRequest) -> Iterator[PostgreSQLExecutor]: 52 | """ 53 | Returns a PostgreSQLExecutor to a newly created instance, running as the 54 | postgres user. 55 | 56 | FIXME: make the unix user used to run the instance configurable. 
57 | """ 58 | 59 | config = get_config(request) 60 | 61 | # If we have a host, use that instead of creating a new instance. 62 | if request.config.getoption("postgresql_host"): 63 | postgresql_executor = NoopExecutor( 64 | config.get("host"), 5432, "postgres", {}, "postgres" 65 | ) 66 | postgresql_executor.unixsocketdir = None 67 | yield postgresql_executor 68 | return 69 | 70 | postgresql_ctl = config["exec"] 71 | 72 | if not os.path.exists(postgresql_ctl): 73 | pg_bindir = subprocess.check_output( 74 | ["pg_config", "--bindir"], universal_newlines=True 75 | ).strip() 76 | postgresql_ctl = os.path.join(pg_bindir, "pg_ctl") 77 | 78 | pg_passwd = getpwnam("postgres") 79 | 80 | with TemporaryDirectory() as tempdir_str: 81 | tmpdir = Path(tempdir_str) 82 | os.chown(tmpdir, pg_passwd.pw_uid, pg_passwd.pw_gid) 83 | pg_port = port_for.select_random() 84 | datadir = tmpdir / f"data-{pg_port}" 85 | unix_socket_dir = tmpdir / "unix-socket" 86 | postgresql_executor = PostgreSQLExecutor( 87 | executable=postgresql_ctl, 88 | shell=True, 89 | port=pg_port, 90 | host="localhost", 91 | unixsocketdir=str(unix_socket_dir), 92 | logfile=str(tmpdir / "pg_log"), 93 | dbname="postgres", 94 | startparams="", 95 | datadir=str(datadir), 96 | ) 97 | postgresql_executor.VERSION_RE = re.compile( 98 | ".* (?P\\d+((\\.\\d+)|beta\\d|rc\\d|dev))" 99 | ) 100 | pid = os.fork() 101 | if pid == 0: 102 | try: 103 | os.setuid(pg_passwd.pw_uid) 104 | os.chdir(str(tmpdir)) 105 | datadir.mkdir() 106 | unix_socket_dir.mkdir() 107 | postgresql_executor.start() 108 | postgresql_executor.wait_for_postgres() 109 | except Exception as exc: # pylint: disable=broad-except 110 | print(exc) 111 | os._exit(1) # pylint: disable=protected-access 112 | finally: 113 | os._exit(0) # pylint: disable=protected-access 114 | else: 115 | pid, return_code = os.waitpid(pid, 0) 116 | if return_code != 0: 117 | raise Exception("Could not start postgresql") 118 | try: 119 | yield postgresql_executor 120 | finally: 121 | pid = os.fork() 122 | if pid == 0: 123 | try: 124 | os.setuid(pg_passwd.pw_uid) 125 | postgresql_executor.stop() 126 | finally: 127 | os._exit(0) # pylint: disable=protected-access 128 | os.waitpid(pid, 0) 129 | 130 | 131 | @pytest.fixture 132 | def connection(nonroot_postgres): # pylint: disable=redefined-outer-name 133 | """ 134 | Returns a connection to the temporary postgresql instance. 135 | """ 136 | conn = psycopg.connect( 137 | port=nonroot_postgres.port, 138 | host=nonroot_postgres.unixsocketdir or nonroot_postgres.host, 139 | user=nonroot_postgres.user, 140 | ) 141 | yield conn 142 | conn.close() 143 | 144 | 145 | def make_collector( 146 | cls, connection, config, **kwargs 147 | ): # pylint: disable=redefined-outer-name 148 | """ 149 | Create a collector from a connection. 150 | """ 151 | backend_pid = connection.info.backend_pid 152 | if config.getoption("container"): 153 | # If we have a container, look into it to translate the backend_pid 154 | # to the host namespace. 
155 | backend_pid = resolve_container_pid(config.getoption("container"), backend_pid) 156 | options = cls.options_cls(**kwargs) 157 | collector = cls.from_pid(pid=backend_pid, options=options) 158 | collector.start() 159 | return collector 160 | 161 | 162 | @pytest.fixture 163 | def querytracer_factory(connection, request): 164 | def factory_func(**kwargs): 165 | kwargs.setdefault("enable_nodes_collection", True) 166 | return make_collector( 167 | QueryTracerBPFCollector, connection, request.config, **kwargs 168 | ) 169 | 170 | return factory_func 171 | 172 | 173 | @pytest.fixture 174 | def querytracer( 175 | request: FixtureRequest, connection 176 | ): # pylint: disable=redefined-outer-name 177 | """ 178 | Returns a bpfcollector associated to the current connection. 179 | """ 180 | collector = make_collector( 181 | QueryTracerBPFCollector, 182 | connection, 183 | request.config, 184 | enable_nodes_collection=True, 185 | ) 186 | yield collector 187 | collector.stop() 188 | 189 | 190 | @pytest.fixture 191 | def querytracer_instrumented( 192 | request: FixtureRequest, connection 193 | ): # pylint: disable=redefined-outer-name 194 | """ 195 | Returns a bpfcollector with instrumentation turned on. 196 | """ 197 | collector = make_collector( 198 | QueryTracerBPFCollector, 199 | connection, 200 | request.config, 201 | instrument_flags=InstrumentationFlags.ALL, 202 | enable_perf_events=True, 203 | enable_query_discovery=True, 204 | enable_nodes_collection=True, 205 | ) 206 | yield collector 207 | collector.stop() 208 | 209 | 210 | @pytest.fixture 211 | def guctracer(request: FixtureRequest, connection): 212 | """ 213 | Fixture returning an instance of a GUCTracer. 214 | """ 215 | collector = make_collector(GUCTracerBPFCollector, connection, request.config) 216 | yield collector 217 | collector.stop() 218 | -------------------------------------------------------------------------------- /tests/scripts/setup_fedora_container.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | # Create a directory for the container 5 | mkdir ~/fedora 6 | mkdir -p /var/lib/machines/fedora 7 | mount -o bind ~/fedora /var/lib/machines/fedora 8 | mkdir -p /etc/distro.repos.d 9 | # Configure yum repos for fedora 10 | cat << EOF > /etc/distro.repos.d/fedora.repo 11 | [fedora] 12 | name=Fedora \$releasever – \$basearch 13 | failovermethod=priority 14 | baseurl=http://download.fedoraproject.org/pub/fedora/linux/releases/\$releasever/Everything/\$basearch/os 15 | metalink=https://mirrors.fedoraproject.org/metalink?repo=fedora-\$releasever&arch=\$basearch 16 | enabled=1 17 | gpgcheck=1 18 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-fedora-\$releasever-\$basearch 19 | metadata_expire=1 20 | skip_if_unavailable=False 21 | EOF 22 | 23 | # Install the fedora key for f36 24 | # TODO: generalize it 25 | mkdir -p /etc/pki/rpm-gpg/ 26 | wget https://getfedora.org/static/fedora.gpg -O /etc/pki/rpm-gpg/RPM-GPG-KEY-fedora-36-x86_64 27 | 28 | # Install the required packages in the container 29 | dnf -y --releasever=36 --best \ 30 | --refresh \ 31 | --setopt=install_weak_deps=False \ 32 | --installroot=/var/lib/machines/fedora/ \ 33 | install \ 34 | dhcp-client dnf fedora-release glibc glibc-langpack-en glibc-langpack-de \ 35 | iputils less ncurses passwd systemd \ 36 | systemd-networkd systemd-resolved util-linux vim-default-editor \ 37 | postgresql-server dnf-utils dnf-plugins-core \ 38 | python-bcc python-pip libunwind 39 | 40 | rm 
/var/lib/machines/fedora/etc/resolv.conf
41 | cp /etc/resolv.conf /var/lib/machines/fedora/etc/resolv.conf
42 |
43 | systemd-nspawn -D /var/lib/machines/fedora/ /usr/bin/dnf --best -y --releasever=36 install postgresql-server
44 | systemd-nspawn -D /var/lib/machines/fedora/ /usr/bin/dnf -y --releasever=36 debuginfo-install postgresql-server
45 |
46 | # Set a dummy password for the root user
47 | systemd-nspawn --console=pipe -D /var/lib/machines/fedora/ passwd root --stdin << EOF
48 | fedora
49 | EOF
50 |
51 | systemctl start systemd-nspawn@fedora
52 | sleep 2
53 | systemd-run --machine fedora --pipe --wait /usr/bin/postgresql-setup --initdb
54 | systemd-run --machine fedora --pipe --wait /usr/bin/sed "s/#listen_addresses = 'localhost'/listen_addresses = '*'/" /var/lib/pgsql/data/postgresql.conf -i
55 | systemd-run --machine fedora --pipe --wait /usr/bin/bash -c 'echo "host all all 0.0.0.0/0 trust" > /var/lib/pgsql/data/pg_hba.conf'
56 | systemd-run --machine fedora --pipe --wait /usr/bin/systemctl enable postgresql --now
57 |
58 |
59 | systemd-run --machine fedora --pipe --wait /usr/sbin/ip link set up host0
60 | systemd-run --machine fedora --pipe --wait /usr/sbin/ip addr add 172.16.0.1/30 dev host0
61 | systemd-run --machine fedora --pipe --wait /usr/sbin/ip route add default dev host0
62 |
63 | # Ok, now we need to assign a static IP address
64 | ip link set up ve-fedora
65 | ip route add 172.16.0.0/30 dev ve-fedora
66 | ip addr add 172.16.0.2/30 dev ve-fedora
67 |
--------------------------------------------------------------------------------
/tests/test_bins/Makefile:
--------------------------------------------------------------------------------
1 | %.elf: %.elf.c
2 | gcc -Wl,--build-id -gdwarf-5 -O0 -c $*.elf.c -o $@
3 |
4 | %.main: %.main.c
5 | gcc -Wl,--build-id -gdwarf-5 -O0 $*.main.c -o $@
6 |
7 | all: test.elf test_stack.main
8 |
--------------------------------------------------------------------------------
/tests/test_bins/test.elf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Aiven-Open/pgtracer/5faf30a695f7b45711e987df8a82c904a36e581f/tests/test_bins/test.elf
--------------------------------------------------------------------------------
/tests/test_bins/test.elf.c:
--------------------------------------------------------------------------------
1 | typedef struct StructA {
2 | int a_int;
3 | float a_float;
4 | char* a_charp;
5 | } StructA;
6 |
7 | typedef struct StructB {
8 | StructA b_structa;
9 | StructA* b_structap;
10 | struct StructB* b_structbp;
11 | } StructB;
12 |
13 | StructA GLOBAL_STRUCT_A = {1, 1.0, "TEST"};
14 |
15 | StructB GLOBAL_STRUCT_B = {0};
16 |
--------------------------------------------------------------------------------
/tests/test_bins/test_stack.main:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Aiven-Open/pgtracer/5faf30a695f7b45711e987df8a82c904a36e581f/tests/test_bins/test_stack.main
--------------------------------------------------------------------------------
/tests/test_bins/test_stack.main.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 |
3 | int func_1(int a, int b)
4 | {
5 | int c = a + b;
6 | return c;
7 | }
8 |
9 | int func_2(int a, int b)
10 | {
11 | return func_1(a + 1, b + 2);
12 | }
13 |
14 | int main(int argc, char** argv)
15 | {
16 | /*
17 | * Block until the testing program sends something on stdin.
--------------------------------------------------------------------------------
/tests/test_bins/test_stack.main:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Aiven-Open/pgtracer/5faf30a695f7b45711e987df8a82c904a36e581f/tests/test_bins/test_stack.main
--------------------------------------------------------------------------------
/tests/test_bins/test_stack.main.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | 
3 | int func_1(int a, int b)
4 | {
5 |     int c = a + b;
6 |     return c;
7 | }
8 | 
9 | int func_2(int a, int b)
10 | {
11 |     return func_1(a + 1, b + 2);
12 | }
13 | 
14 | int main(int argc, char** argv)
15 | {
16 |     /*
17 |      * Block until the testing program sends something on stdin.
18 |      * This allows the testing program to read our /proc/<pid>/maps first.
19 |      */
20 |     getchar();
21 |     return func_2(10, 20);
22 | }
23 | 
--------------------------------------------------------------------------------
/tests/test_dwarf.py:
--------------------------------------------------------------------------------
1 | """
2 | This module tests some utilities from the dwarf module.
3 | """
4 | 
5 | import ctypes as ct
6 | import os
7 | from pathlib import Path
8 | from unittest import TestCase
9 | from unittest.mock import patch
10 | 
11 | from pgtracer.ebpf.dwarf import (
12 |     DWARFPointer,
13 |     ProcessMetadata,
14 |     Struct,
15 |     StructMemberDefinition,
16 | )
17 | from pgtracer.ebpf.eh_frame_hdr import EhFrameHdr
18 | 
19 | TEST_BINARY = Path(__file__).parent / "test_bins" / "test.elf"
20 | TEST_EXEC_BINARY = Path(__file__).parent / "test_bins" / "test_stack.main"
21 | 
22 | 
23 | class MockProcess:
24 |     """
25 |     Mock a pypsutil.Process.
26 |     """
27 | 
28 |     def __init__(self, binary):
29 |         self.binary = binary
30 | 
31 |     def exe(self):
32 |         """
33 |         Returns a constant binary string.
34 |         """
35 |         return self.binary
36 | 
37 |     @property
38 |     def pid(self):
39 |         """
40 |         Returns our own pid. We only need an existing pid...
41 |         """
42 |         return os.getpid()
43 | 
44 | 
45 | class TestProcessMetadata(TestCase):
46 |     """
47 |     Test the dwarf helpers in ProcessMetadata.
48 |     """
49 | 
50 |     @patch("pgtracer.ebpf.dwarf.get_mapped_regions", lambda process, root: [])
51 |     def setUp(self):
52 |         self.process_meta = ProcessMetadata(MockProcess(TEST_BINARY))
53 |         self.exec_process_meta = ProcessMetadata(MockProcess(TEST_EXEC_BINARY))
54 | 
55 |     def test_struct(self):
56 |         """
57 |         Test the struct parsing helper.
58 |         """
59 |         structs = self.process_meta.structs
60 | 
61 |         StructA = structs.StructA  # pylint: disable=invalid-name
62 |         self.assertTrue(issubclass(StructA, Struct))
63 |         self.assertEqual(StructA.size, 16)
64 | 
65 |         a_int = StructA.field_definition("a_int")
66 |         self.assertIsInstance(a_int, StructMemberDefinition)
67 |         self.assertEqual(a_int.offset, 0)
68 |         self.assertEqual(a_int.member_type, ct.c_int)
69 | 
70 |         a_float = StructA.field_definition("a_float")
71 |         self.assertEqual(a_float.offset, 4)
72 |         self.assertEqual(a_float.member_type, ct.c_float)
73 | 
74 |         a_charp = StructA.field_definition("a_charp")
75 |         self.assertEqual(a_charp.offset, 8)
76 |         self.assertTrue(issubclass(a_charp.member_type, ct._Pointer))
77 |         self.assertEqual(a_charp.member_type._type_, ct.c_byte)
78 | 
79 |         StructB = structs.StructB  # pylint: disable=invalid-name
80 | 
81 |         b_structa = StructB.field_definition("b_structa")
82 |         self.assertEqual(b_structa.offset, 0)
83 |         self.assertEqual(b_structa.member_type, StructA)
84 | 
85 |         b_structap = StructB.field_definition("b_structap")
86 |         self.assertEqual(b_structap.offset, StructA.size)
87 |         self.assertTrue(issubclass(b_structap.member_type, DWARFPointer))
88 |         self.assertEqual(b_structap.member_type.pointed_type, StructA)
89 | 
90 |         b_structbp = StructB.field_definition("b_structbp")
91 |         self.assertEqual(b_structbp.offset, StructA.size + 8)
92 |         self.assertTrue(issubclass(b_structbp.member_type, DWARFPointer))
93 |         self.assertEqual(b_structbp.member_type.pointed_type, StructB)
94 | 
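    # Background for the next test: the .eh_frame_hdr section holds a table of
    # (initial_location, fde_address) pairs, sorted by address, which unwinders
    # binary-search to locate the FDE covering a given PC. Addresses that fall
    # outside every FDE's range (0, or far past the last function) should
    # therefore yield no match.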
95 |     def test_eh_frame_hdr(self):
96 |         """
97 |         Test the eh_frame_hdr parser.
98 |         """
99 |         eh_frame_hdr = EhFrameHdr.load_eh_frame_hdr(self.exec_process_meta.elffile)
100 |         all_entries = list(eh_frame_hdr.iter_entries())
101 |         assert len(all_entries) == 5
102 |         assert eh_frame_hdr.fde_count == 5
103 |         assert eh_frame_hdr.find_fde(0) is None
104 |         assert eh_frame_hdr.find_fde(0xFFFFFFFFF) is None
105 |         assert eh_frame_hdr.find_fde(4412).header.initial_location == 4409
106 | 
107 |     def test_die_contains_addr(self):
108 |         dw = self.exec_process_meta.dwarf_info
109 |         all_cus = list(dw.iter_CUs())
110 |         # The CU at index 3 has a DW_AT_ranges attribute
111 |         cu = all_cus[3]
112 |         die = cu.get_top_DIE()
113 |         assert self.exec_process_meta.die_contains_addr(die, 4096)
114 |         assert self.exec_process_meta.die_contains_addr(die, 4100)
115 |         assert not self.exec_process_meta.die_contains_addr(die, 4095)
116 |         assert not self.exec_process_meta.die_contains_addr(die, 4118)
117 | 
--------------------------------------------------------------------------------
/tests/test_guctracer.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 | from time import sleep
3 | from unittest.mock import patch
4 | 
5 | from pgtracer.ebpf.collector.guc import GUCTracerEventHandler
6 | 
7 | 
8 | def test_setting_one_guc(guctracer, connection):
9 |     """
10 |     Test setting a GUC in a running backend.
11 |     """
12 |     guc_has_been_set = False
13 |     original_method = GUCTracerEventHandler.handle_GUCResponse
14 | 
15 |     def observe_guc_response(event_handler, collector, event, pid):
16 |         nonlocal guc_has_been_set
17 |         guc_has_been_set = True
18 |         return original_method(event_handler, collector, event, pid)
19 | 
20 |     with patch(
21 |         "pgtracer.ebpf.collector.guc.GUCTracerEventHandler.handle_GUCResponse",
22 |         observe_guc_response,
23 |     ):
24 |         # Set work_mem to 64kB
25 |         guctracer.set_guc("work_mem", 64)
26 |         start = datetime.now()
27 |         while not guc_has_been_set and (datetime.now() - start) < timedelta(seconds=20):
28 |             # Generate some activity to trigger the probe
29 |             with connection.execute("SELECT 1") as cur:
30 |                 pass
31 |             sleep(0.1)
32 |     with connection.execute("show work_mem") as cur:
33 |         result = cur.fetchall()
34 |         val = result[0][0]
35 |         # Depending on the version, it can come back as str or bytes
36 |         if isinstance(val, bytes):
37 |             val = val.decode("utf8")
38 |         assert val == "64kB"
39 | 
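For context before the query-tracer tests: the GUC collector surface exercised here (from_pid, start, set_guc, stop) also composes outside pytest. A minimal sketch, illustrative only, assuming GUCTracerBPFCollector is importable from the same module as above and that default options suffice:

from pgtracer.ebpf.collector.guc import GUCTracerBPFCollector


def attach_and_set_guc(backend_pid, name, value):
    """Attach to a backend and queue a GUC override (illustrative only)."""
    options = GUCTracerBPFCollector.options_cls()  # defaults, as in conftest.py
    collector = GUCTracerBPFCollector.from_pid(pid=backend_pid, options=options)
    collector.start()
    # The override is only applied once the eBPF probe next fires in the
    # target backend, so keep the collector attached until that happens
    # (test_setting_one_guc above polls for up to 20 seconds).
    collector.set_guc(name, value)
    return collector  # the caller must eventually call collector.stop()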
--------------------------------------------------------------------------------
/tests/test_querytracer.py:
--------------------------------------------------------------------------------
1 | """
2 | This module acts as a general health check for the eBPF collector.
3 | """
4 | import re
5 | from collections import defaultdict
6 | from contextlib import ExitStack
7 | from datetime import timedelta
8 | from threading import Thread
9 | from time import sleep
10 | from unittest.mock import patch
11 | 
12 | import pytest
13 | from flaky import flaky
14 | 
15 | from pgtracer.ebpf.collector.querytracer import (
16 |     InstrumentationFlags,
17 |     QueryTracerEventHandler,
18 | )
19 | from pgtracer.utils import timespec_to_timedelta as tstimedelta
20 | 
21 | 
22 | def wait_for_collector(collector):
23 |     """
24 |     Wait for the collector to have at least one query.
25 |     """
26 |     tries = 0
27 |     process_info = collector.event_handler.per_process_info[collector.pid]
28 |     while len(process_info.query_history) == 0 and tries < 1000:
29 |         tries += 1
30 |         sleep(0.05)
31 | 
32 | 
33 | def test_basic_ebpf_collector(querytracer, connection):
34 |     """
35 |     Test that the most basic functionality of the eBPF collector works.
36 |     """
37 |     # Now try running a query, and see if we can get it back
38 |     with connection.execute("SELECT now()") as cur:
39 |         querystart = cur.fetchall()[0][0].replace(microsecond=0, tzinfo=None)
40 |     wait_for_collector(querytracer)
41 |     assert len(querytracer.event_handler.per_process_info) == 1
42 |     process_info = querytracer.event_handler.per_process_info[querytracer.pid]
43 |     assert len(process_info.query_history) == 1
44 |     query = process_info.query_history[0]
45 |     assert query.text == "SELECT now()"
46 |     assert query.search_path == '"$user", public'
47 |     assert query.start_datetime.replace(microsecond=0) == querystart
48 |     assert query.runtime is None
49 |     assert query.instrument.need_timer.value is False
50 |     assert query.instrument.need_bufusage.value is False
51 |     assert query.shared_buffers_hitratio is None
52 |     assert query.syscache_hitratio is None
53 | 
54 | 
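# Note: need_timer and need_bufusage read the fields of the backend's own
# Instrumentation struct (PostgreSQL's instrument.h). They are False above
# because the plain querytracer fixture requests no instrumentation, while the
# querytracer_instrumented fixture used below passes InstrumentationFlags.ALL
# and is expected to flip both on.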
55 | def test_instrumentation(querytracer_instrumented, connection):
56 |     """
57 |     Test that turning instrumentation on works as expected.
58 |     """
59 |     connection.execute("SET track_io_timing = on")
60 |     # We want to have at least a few system reads, so do what is necessary...
61 |     with open("/proc/sys/vm/drop_caches", "wb") as procf:
62 |         procf.write(b"1")
63 | 
64 |     with connection.execute("SELECT * FROM pg_attribute") as cur:
65 |         cur.fetchall()
66 |     wait_for_collector(querytracer_instrumented)
67 |     assert len(querytracer_instrumented.event_handler.per_process_info) == 1
68 |     process_info = querytracer_instrumented.event_handler.per_process_info[
69 |         querytracer_instrumented.pid
70 |     ]
71 | 
72 |     assert len(process_info.query_history) == 1
73 |     query = process_info.query_history[0]
74 |     assert query.instrument.need_timer.value is True
75 |     assert query.instrument.need_bufusage.value is True
76 |     assert query.runtime > timedelta(0)
77 |     assert query.instrument.bufusage.shared_blks_hit.value > 0
78 |     assert query.instrument.bufusage.shared_blks_read.value >= 0
79 |     assert query.instrument.bufusage.temp_blks_read.value == 0
80 |     assert query.instrument.bufusage.temp_blks_written.value == 0
81 |     if connection.info.server_version >= 150000:
82 |         assert tstimedelta(query.instrument.bufusage.temp_blk_read_time) == timedelta(0)
83 |         assert tstimedelta(query.instrument.bufusage.temp_blk_write_time) == timedelta(
84 |             0
85 |         )
86 |     # We can't make any assumptions about the hit ratios, so just ensure they
87 |     # have some valid values.
88 |     assert 0 <= query.shared_buffers_hitratio < 100
89 |     # The syscache_hitratio can be negative, when we actually end up reading
90 |     # more blocks than what is accounted for by instrumentation.
91 |     assert query.syscache_hitratio <= 100
92 | 
93 |     # Check that we don't crash without any instrumentation whatsoever
94 |     query.instrument = None
95 |     assert query.shared_buffers_hitratio is None
96 |     assert query.syscache_hitratio is None
97 | 
98 |     # Generate some temp files for fun
99 |     process_info.query_history = []
100 |     connection.execute("SET work_mem = '64kB'")
101 |     with connection.execute("SELECT * FROM generate_series(1, 10000) as t"):
102 |         pass
103 |     wait_for_collector(querytracer_instrumented)
104 |     query = process_info.query_history[0]
105 |     assert query.text == "SELECT * FROM generate_series(1, 10000) as t"
106 |     assert query.instrument.bufusage.temp_blks_read.value > 0
107 |     assert query.instrument.bufusage.temp_blks_written.value > 0
108 |     if connection.info.server_version >= 150000:
109 |         assert tstimedelta(query.instrument.bufusage.temp_blk_read_time) > timedelta(0)
110 |         assert tstimedelta(query.instrument.bufusage.temp_blk_write_time) > timedelta(0)
111 | 
112 |     # Now run the same query with a big enough work_mem to trigger some memory allocations
113 |     connection.execute("SET work_mem = '32MB'")
114 |     process_info.query_history = []
115 |     with connection.execute("SELECT * FROM generate_series(1, 10000) as t") as cur:
116 |         pass
117 |     wait_for_collector(querytracer_instrumented)
118 |     query = process_info.query_history[0]
119 |     # The repartition between sbrk / mmap, and whether we move sbrk back to its
120 |     # initial value, depends on the state of malloc and its configuration. So the
121 |     # best thing we can test is that "something" happened.
122 |     assert query.memallocs.current_mem_peak > 0
123 |     # We can't assert anything meaningful about total_malloc, but we can at least exercise the code
124 |     assert query.memallocs.total_malloc is not None
125 | 
126 | 
127 | def test_plans(querytracer_instrumented, connection):
128 |     """
129 |     Test that we are able to build a plan tree.
130 |     """
131 |     with connection.execute(
132 |         "SELECT * FROM (SELECT * FROM pg_class ORDER BY reltype LIMIT 10) t"
133 |     ) as cur:
134 |         cur.fetchall()
135 |     wait_for_collector(querytracer_instrumented)
136 |     process_info = querytracer_instrumented.event_handler.per_process_info[
137 |         querytracer_instrumented.pid
138 |     ]
139 |     query = process_info.query_history[0]
140 |     root_node = query.root_node
141 |     NodeTag = querytracer_instrumented.metadata.enums.NodeTag
142 |     assert root_node.tag == NodeTag.T_Limit
143 |     assert len(root_node.children) == 1
144 |     assert root_node.parent_node is None
145 |     assert root_node.instrument.tuplecount.value == 10
146 | 
147 |     sort_node = list(root_node.children)[0]
148 |     assert sort_node.tag == NodeTag.T_Sort
149 |     assert len(sort_node.children) == 1
150 |     assert sort_node.parent_node == root_node
151 |     # FIXME: investigate why we can't fetch this value on Ubuntu's PG11.
152 |     if connection.info.server_version >= 120000:
153 |         assert sort_node.instrument.tuplecount.value == 10
154 | 
155 |     seqscan_node = list(sort_node.children)[0]
156 |     assert seqscan_node.tag == NodeTag.T_SeqScan
157 |     assert len(seqscan_node.children) == 0
158 |     assert seqscan_node.parent_node == sort_node
159 | 
160 | 
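The tag / children / parent_node attributes asserted above are enough to walk a collected plan generically. A minimal sketch of such a walker (an illustrative helper, not part of the test suite; the attribute names are the ones used in these tests):

def print_plan(node, depth=0):
    """Print a collected plan tree, one indented line per node."""
    line = "  " * depth + str(node.tag)
    if node.instrument is not None:
        line += f" (tuples={node.instrument.tuplecount.value})"
    print(line)
    for child in node.children:
        print_plan(child, depth + 1)

# For the query used in test_plans, print_plan(query.root_node) would print
# T_Limit, then T_Sort and T_SeqScan at increasing indentation.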
161 | def test_explain(querytracer, connection):
162 |     """
163 |     Test that we can render a collected plan, EXPLAIN-style.
164 |     """
165 |     # We have some trouble with collecting instrumentation for PG < 12
166 |     if connection.info.server_version < 120000:
167 |         return
168 |     cost_snippet = r"\d+\.\d+\.\.\d+\.\d+"
169 |     wanted_plan = rf"""Limit \(cost={cost_snippet} rows=10 width=\d+\) \(actual time=0.000...0.000 rows=0 loops=1\)
170 | \t-> Sort \(cost={cost_snippet} rows=\d+ width=\d+\) \(actual time=0.000...0.000 rows=0 loops=1\)
171 | \t\t-> SeqScan \(cost={cost_snippet} rows=\d+ width=\d+\) \(actual time=0.000...0.000 rows=0 loops=1\)"""
172 | 
173 |     with connection.execute(
174 |         "SELECT * FROM (SELECT * FROM pg_class ORDER BY reltype LIMIT 10) t"
175 |     ) as cur:
176 |         cur.fetchall()
177 |     wait_for_collector(querytracer)
178 |     assert len(querytracer.event_handler.per_process_info) == 1
179 |     process_info = querytracer.event_handler.per_process_info[querytracer.pid]
180 |     query = process_info.query_history[0]
181 |     root_node = query.root_node
182 |     assert re.match(wanted_plan, root_node.explain())
183 | 
184 | 
185 | def background_query(connection, query):
186 |     def execute_query():
187 |         with connection.execute(query) as cur:
188 |             cur.fetchall()
189 | 
190 |     newthread = Thread(target=execute_query)
191 |     newthread.start()
192 |     return newthread
193 | 
194 | 
195 | @pytest.mark.slow
196 | def test_long_query(querytracer_instrumented, connection):
197 |     events = defaultdict(int)
198 | 
199 |     def event_handler_observer(method_name):
200 |         original_method = getattr(QueryTracerEventHandler, method_name)
201 | 
202 |         def observe_event_handler(event_handler, bpf_collector, event, pid):
203 |             events[method_name] += 1
204 |             return original_method(event_handler, bpf_collector, event, pid)
205 | 
206 |         return observe_event_handler
207 | 
208 |     with ExitStack() as stack:
209 |         for meth_name in (
210 |             "handle_MemoryResponseNodeInstr",
211 |             "handle_MemoryResponseQueryInstr",
212 |         ):
213 |             stack.enter_context(
214 |                 patch(
215 |                     f"pgtracer.ebpf.collector.querytracer.QueryTracerEventHandler.{meth_name}",
216 |                     event_handler_observer(meth_name),
217 |                 )
218 |             )
219 |         with connection.execute(
220 |             """SELECT count(*) FROM (
221 |             SELECT pg_sleep(0.01)
222 |             FROM pg_class
223 |             JOIN pg_attribute ON pg_class.oid = attrelid
224 |             ) as s """
225 |         ) as cur:
226 |             cur.fetchall()
227 |         wait_for_collector(querytracer_instrumented)
228 |         assert events["handle_MemoryResponseQueryInstr"] > 0
229 |         assert events["handle_MemoryResponseNodeInstr"] > 0
230 | 
231 | 
232 | @pytest.mark.slow
233 | @flaky(max_runs=5)
234 | def test_query_discovery(querytracer_factory, connection):
235 |     """
236 |     Test that information is gathered during a query.
237 |     """
238 |     events = defaultdict(int)
239 | 
240 |     def event_handler_observer(method_name):
241 |         original_method = getattr(QueryTracerEventHandler, method_name)
242 | 
243 |         def observe_event_handler(event_handler, bpf_collector, event, pid):
244 |             events[method_name] += 1
245 |             return original_method(event_handler, bpf_collector, event, pid)
246 | 
247 |         return observe_event_handler
248 | 
249 |     with ExitStack() as stack:
250 |         for meth_name in ("handle_StackSample", "handle_MemoryNodeData"):
251 |             stack.enter_context(
252 |                 patch(
253 |                     f"pgtracer.ebpf.collector.querytracer.QueryTracerEventHandler.{meth_name}",
254 |                     event_handler_observer(meth_name),
255 |                 )
256 |             )
257 |         thread = background_query(
258 |             connection,
259 |             """SELECT count(*) FROM (
260 |             SELECT pg_sleep(0.01)
261 |             FROM pg_class
262 |             JOIN pg_attribute ON pg_class.oid = attrelid
263 |             ) as s """,
264 |         )
265 |         # Now set up the collector while the query is already running.
266 |         collector = None
267 |         try:
268 |             collector = querytracer_factory(
269 |                 instrument_flags=InstrumentationFlags.ALL,
270 |                 enable_perf_events=True,
271 |                 enable_query_discovery=True,
272 |                 enable_nodes_collection=True,
273 |                 sample_freq=1200,
274 |             )
275 |             # And wait for the query to finish
276 |             thread.join()
277 |             # Wait a few seconds more to make sure the collector has gathered all info
278 |             sleep(3)
279 |         finally:
280 |             if collector is not None:
281 |                 collector.stop()
282 |         assert events["handle_StackSample"] > 0
283 |         assert events["handle_MemoryNodeData"] > 0
284 | 
--------------------------------------------------------------------------------
/tests/test_stack_unwinding.py:
--------------------------------------------------------------------------------
1 | """
2 | This module tests the frame unwinding code.
3 | """
4 | import ctypes as ct
5 | import subprocess
6 | from pathlib import Path
7 | from unittest import TestCase
8 | 
9 | from bcc import BPF
10 | from bcc.libbcc import lib as libbcc
11 | from pypsutil import Process
12 | 
13 | from pgtracer.ebpf.collector import CODE_BASE_PATH
14 | from pgtracer.ebpf.dwarf import ProcessMetadata, die_name
15 | from pgtracer.ebpf.unwind import MAX_STACK_READ, UnwindAddressSpace, stack_data_t
16 | 
17 | TEST_EBPF_PROGRAM = """
18 | /*
19 |  * Fill in placeholders for generated defines
20 |  */
21 | #define EVENTRING_PAGE_SIZE 1024
22 | #include "ebpf_maps.h"
23 | #include "stack.h"
24 | 
25 | int capture_stack_enter(struct pt_regs *ctx)
26 | {
27 |     struct stack_data_t* stack_data = event_ring.ringbuf_reserve(sizeof(struct
28 |     stack_data_t));
29 |     int i = 0, ret = 0;
30 |     u64 maxread = MAX_STACK_READ;
31 |     if (!stack_data)
32 |         return -1;
33 |     while(stack_data && i < 10)
34 |     {
35 |         ret = capture_stack(ctx, stack_data, maxread);
36 |         i++;
37 |         maxread = maxread / 2;
38 |     }
39 |     event_ring.ringbuf_submit(stack_data, 0);
40 | }
41 | """
42 | 
43 | 
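# The eBPF program above reserves a single ring-buffer slot, then overwrites
# it by calling capture_stack() up to ten times with a halving read budget
# (MAX_STACK_READ, MAX_STACK_READ / 2, ...), before submitting the final
# sample: capture_stack() gets exercised under several size limits while each
# probe hit still produces exactly one event.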
44 | class TestStackUnwinding(TestCase):
45 |     def setUp(self):
46 |         self.captured_data = []
47 | 
48 |     def tearDown(self):
49 |         for k in list(self.ebpf.uprobe_fds):
50 |             self.ebpf.detach_uprobe_event(k)
51 | 
52 |     def _capture_data(self, cpu, data, size):
53 |         content = stack_data_t()
54 |         ct.pointer(content)[0] = ct.cast(data, ct.POINTER(stack_data_t)).contents
55 |         self.captured_data.append(content)
56 | 
57 |     def test_simple_call_stack(self):
58 |         # Load an eBPF program which will capture stacks.
59 |         binpath = Path(__file__).parent / "test_bins" / "test_stack.main"
60 | 
61 |         # Run the program.
62 |         program = subprocess.Popen([binpath], stdin=subprocess.PIPE)
63 |         # Now get the stack base address for the program.
64 |         pm = ProcessMetadata(Process(program.pid))
65 |         bpf_prog = f"#define STACK_TOP_ADDR {pm.stack_top}\n"
66 |         bpf_prog += f"#define MAX_STACK_READ {MAX_STACK_READ}\n"
67 |         bpf_prog += TEST_EBPF_PROGRAM
68 | 
69 |         self.ebpf = BPF(
70 |             text=bpf_prog.encode("utf8"),
71 |             cflags=[f"-I{CODE_BASE_PATH}"],
72 |         )
73 |         self.ebpf.attach_uprobe(
74 |             name=str(binpath).encode("utf8"),
75 |             fn_name=b"capture_stack_enter",
76 |             sym=b"func_1",
77 |         )
78 |         self.ebpf.attach_uprobe(
79 |             name=str(binpath).encode("utf8"),
80 |             fn_name=b"capture_stack_enter",
81 |             sym=b"func_2",
82 |         )
83 |         self.ebpf[b"event_ring"].open_ring_buffer(self._capture_data)
84 |         # Ok, now everything is ready for the program to actually run.
85 |         program.communicate(input=b"C")
86 |         # The program has now run to completion: poll the ring buffer and
87 |         # check the captured stacks.
88 |         self.ebpf.ring_buffer_poll()
89 |         assert len(self.captured_data) == 2
90 | 
91 |         # First stack, captured on entering func_2, innermost frame first:
92 |         #   func_2
93 |         #   main
94 |         #   (???) libc
95 |         address_space = UnwindAddressSpace(self.captured_data[0], pm)
96 |         frames = list(address_space.frames())
97 |         assert len(frames) == 3
98 |         assert frames[0].region.path == str(binpath)
99 |         assert die_name(frames[0].die) == "func_2"
100 |         assert frames[1].region.path == str(binpath)
101 |         assert die_name(frames[1].die) == "main"
102 |         libname = Path(frames[2].region.path)
103 |         # Remove all suffixes
104 |         while libname.suffix != ".so":
105 |             libname = libname.with_suffix("")
106 |         assert libname.name == "libc.so"
107 |         assert frames[2].die is None
108 | 
109 |         # Second stack, captured on entering func_1, innermost frame first:
110 |         #   func_1
111 |         #   func_2
112 |         #   main
113 |         #   (???) libc
114 |         address_space = UnwindAddressSpace(self.captured_data[1], pm)
115 |         frames = list(address_space.frames())
116 |         assert len(frames) == 4
117 |         assert frames[0].region.path == str(binpath)
118 |         assert die_name(frames[0].die) == "func_1"
119 |         assert frames[1].region.path == str(binpath)
120 |         assert die_name(frames[1].die) == "func_2"
121 |         assert frames[2].region.path == str(binpath)
122 |         assert die_name(frames[2].die) == "main"
123 |         libname = Path(frames[3].region.path)
124 |         # Remove all suffixes
125 |         while libname.suffix != ".so":
126 |             libname = libname.with_suffix("")
127 |         assert libname.name == "libc.so"
128 |         assert frames[3].die is None
129 | 
130 |         # Check the argument values
131 |         assert frames[0].fetch_arg(1, ct.c_int).value == 11
132 |         assert frames[0].fetch_arg(2, ct.c_int).value == 22
133 |         assert frames[1].fetch_arg(1, ct.c_int).value == 10
134 |         assert frames[1].fetch_arg(2, ct.c_int).value == 20
135 | 
--------------------------------------------------------------------------------
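Finally, a small illustrative helper, not part of the repository, showing how the unwinding API exercised above could be put to use to pretty-print any captured sample:

from pgtracer.ebpf.dwarf import die_name
from pgtracer.ebpf.unwind import UnwindAddressSpace


def print_stack(stack_data, process_metadata):
    """Resolve and print each frame of a captured stack sample."""
    address_space = UnwindAddressSpace(stack_data, process_metadata)
    for depth, frame in enumerate(address_space.frames()):
        # frame.die is None for regions without usable DWARF info (e.g. libc)
        name = die_name(frame.die) if frame.die is not None else "???"
        print(f"#{depth} {name} in {frame.region.path}")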