├── .github
│   ├── CODEOWNERS
│   ├── ISSUE_TEMPLATE
│   │   ├── 01_question.md
│   │   ├── 02_bug.md
│   │   ├── 03_feature.md
│   │   └── config.yml
│   ├── PULL_REQUEST_TEMPLATE.md
│   ├── dependabot.yml
│   └── workflows
│       ├── lint.yml
│       ├── publish-pypi.yaml
│       └── tests.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .pylintrc
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── SECURITY.md
├── mypy.ini
├── pyproject.toml
├── src
│   └── pgtracer
│       ├── __init__.py
│       ├── ebpf
│       │   ├── __init__.py
│       │   ├── code
│       │   │   ├── block_rq.c
│       │   │   ├── data.h
│       │   │   ├── ebpf_maps.h
│       │   │   ├── gucset.c
│       │   │   ├── memusage.c
│       │   │   ├── perf.c
│       │   │   ├── plan.c
│       │   │   ├── program.c
│       │   │   ├── stack.h
│       │   │   └── utils.h
│       │   ├── collector
│       │   │   ├── __init__.py
│       │   │   ├── c_defs.py
│       │   │   ├── guc.py
│       │   │   ├── querytracer.py
│       │   │   └── utils.py
│       │   ├── dwarf.py
│       │   ├── eh_frame_hdr.py
│       │   └── unwind.py
│       ├── model
│       │   ├── __init__.py
│       │   ├── memory.py
│       │   ├── plan.py
│       │   └── query.py
│       ├── scripts
│       │   ├── pgtrace_gucs.py
│       │   └── pgtrace_queries.py
│       └── utils.py
└── tests
    ├── conftest.py
    ├── scripts
    │   └── setup_fedora_container.sh
    ├── test_bins
    │   ├── Makefile
    │   ├── test.elf
    │   ├── test.elf.c
    │   ├── test_stack.main
    │   └── test_stack.main.c
    ├── test_dwarf.py
    ├── test_guctracer.py
    ├── test_querytracer.py
    └── test_stack_unwinding.py
/.github/CODEOWNERS:
--------------------------------------------------------------------------------
1 | * @aiven/aiven-open-source
2 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/01_question.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: ❓ Ask a question
3 | about: Got stuck or missing something from the docs? Ask away!
4 | ---
5 |
6 | # What can we help you with?
7 |
8 |
9 |
10 | # Where would you expect to find this information?
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/02_bug.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: 🐜 Report a bug
3 | about: Spotted a problem? Let us know
4 | ---
5 |
6 | # What happened?
7 |
8 |
9 |
10 | # What did you expect to happen?
11 |
12 |
13 |
14 | # What else do we need to know?
15 |
16 |
17 |
18 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/03_feature.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: 💡 Feature suggestion
3 | about: What would make this even better?
4 | ---
5 |
6 | # What is currently missing?
7 |
8 |
9 |
10 | # How could this be improved?
11 |
12 |
13 |
14 | # Is this a feature you would work on yourself?
15 |
16 | * [ ] I plan to open a pull request for this feature
17 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: true
2 | contact_links:
3 | - name: Aiven Security Bug Bounty
4 | url: https://hackerone.com/aiven_ltd
5 | about: Our bug bounty program.
6 |
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 |
2 | # About this change - What it does
3 |
4 |
5 |
6 |
7 | Resolves: #xxxxx
8 |
9 | # Why this way
10 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | # To get started with Dependabot version updates, you'll need to specify which
2 | # package ecosystems to update and where the package manifests are located.
3 | # Please see the documentation for all configuration options:
4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
5 |
6 | version: 2
7 | updates:
8 | - package-ecosystem: "pip" # See documentation for possible values
9 | directory: "/" # Location of package manifests
10 | schedule:
11 | interval: "weekly"
12 |
--------------------------------------------------------------------------------
/.github/workflows/lint.yml:
--------------------------------------------------------------------------------
1 | on:
2 | push:
3 | branches:
4 | - main
5 | tags:
6 | - '**'
7 | pull_request:
8 |
9 | jobs:
10 |
11 | lint:
12 | runs-on: ubuntu-22.04
13 | strategy:
14 | matrix:
15 | # only use one version for the lint step
16 | python-version: [3.9]
17 |
18 | steps:
19 |
20 | - id: checkout
21 | uses: actions/checkout@v2
22 | with:
23 | # Do not persist the token during execution of this job.
24 | persist-credentials: false
25 |
26 | - id: dependencies
27 | run: |
28 | # Must be installed via the system
29 | sudo apt install python3-bpfcc python3-pip
30 | pip install -U pip toml
31 | pip install '.[lint]'
32 |
33 | - id: pylint
34 | run: pylint --rcfile .pylintrc src/ || pylint-exit $? -efail
35 |
36 | - id: mypy
37 | run: python -m mypy --strict src/ --python-version 3.8
38 |
39 | - id: validate-style
40 | run: |
41 | isort src/
42 | black src/
43 | if [ $(git diff --name-only --diff-filter=ACMR | wc -l ) != 0 ]; then
44 | echo "Reformatting failed! Please run make fmt on your commits and resubmit!" 1>&2;
45 | git diff;
46 | exit 1;
47 | fi
48 |
--------------------------------------------------------------------------------
/.github/workflows/publish-pypi.yaml:
--------------------------------------------------------------------------------
1 | # Based on https://packaging.python.org/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/
2 |
3 | name: Publish to PyPI
4 | on:
5 | push:
6 | tags:
7 | - 'releases/**'
8 |
9 | jobs:
10 | build-n-publish:
11 | name: Build and publish
12 | runs-on: ubuntu-latest
13 |
14 | steps:
15 | - uses: actions/checkout@v3
16 | with:
17 | fetch-depth: 0
18 |
19 | - name: Set up Python
20 | uses: actions/setup-python@v4
21 | with:
22 | python-version: "3.8"
23 |
24 | - name: Install pypa/build
25 | run: >-
26 | python -m pip install build --user
27 | - name: Build a binary wheel and a source tarball
28 | run: >-
29 | python -m
30 | build
31 | --sdist
32 | --wheel
33 | --outdir dist/
34 | .
35 | - name: Publish distribution to PyPI
36 | if: startsWith(github.ref, 'refs/tags')
37 | uses: pypa/gh-action-pypi-publish@release/v1
38 | with:
39 | password: ${{ secrets.PYPI_API_TOKEN }}
40 |
--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
1 |
2 | on:
3 | push:
4 | branches:
5 | - main
6 | tags:
7 | - '**'
8 | pull_request:
9 |
10 | jobs:
11 |
12 | tests:
13 | runs-on: ubuntu-22.04
14 | # We don't fail on dev versions, as those are snapshots
15 | continue-on-error: ${{ matrix.experimental }}
16 | strategy:
17 | matrix:
18 | postgresql_version: [11, 12, 13, 14]
19 | experimental: [false]
20 | repo: ["pgdg"]
21 | pytest_args: ["-m 'not slow'"]
22 | include:
23 | # Define the current dev version to be experimental
24 | - postgresql_version: 16
25 | experimental: true
26 | repo: "pgdg-snapshot"
27 | pytest_args: "-m 'not slow'"
28 | # For latest stable version, include "slow" tests
29 | - postgresql_version: 15
30 | experimental: false
31 | repo: "pgdg"
32 | pytest_args: ""
33 | env:
34 | PGVERSION: ${{ matrix.postgresql_version }}
35 | DISTRO: ubuntu
36 | steps:
37 |
38 | - id: checkout
39 | uses: actions/checkout@v2
40 | with:
41 | # Do not persist the token during execution of this job.
42 | persist-credentials: false
43 |
44 | - id: dependencies
45 | run: |
46 | # Must be installed via the system
47 | sudo apt update
48 | sudo apt install curl ca-certificates gnupg
49 | sudo apt install python3-bpfcc python3-pip libunwind-dev linux-headers-$(uname -r)
50 | curl https://www.postgresql.org/media/keys/ACCC4CF8.asc | gpg --dearmor | sudo tee /etc/apt/trusted.gpg.d/apt.postgresql.org.gpg >/dev/null
51 | sudo sh -c 'echo "deb http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-${{ matrix.repo }} main ${{ matrix.postgresql_version }}" > /etc/apt/sources.list.d/pgdg.list'
52 | sudo apt update
53 | # Install postgresql-common so that update-alternatives doesn't fail
54 | sudo apt install postgresql-common postgresql-client-common
55 | sudo apt install postgresql-${{matrix.postgresql_version}} postgresql-${{matrix.postgresql_version}}-dbgsym
56 | sudo pip install -U pip toml
57 | # Install requirements from pyproject.toml
58 | sudo pip install -e '.[test]'
59 |
60 | - id: tests
61 | run: |
62 | sudo pytest --postgresql-exec /usr/lib/postgresql/${{matrix.postgresql_version}}/bin/pg_ctl --cov src/ --cov-report=xml ${{matrix.pytest_args}}
63 |
64 | - name: Upload coverage reports to Codecov
65 | uses: codecov/codecov-action@v3
66 | with:
67 | env_vars: PGVERSION
68 | fail_ci_if_error: true
69 | files: ./coverage.xml
70 | verbose: true
71 | name: codecov-umbrella
72 |
73 | tests_fedora_container:
74 | runs-on: ubuntu-22.04
75 | env:
76 | PGVERSION: 13
77 | DISTRO: fedora
78 | steps:
79 | - id: checkout
80 | uses: actions/checkout@v2
81 | with:
82 | persist-credentials: false
83 | - id: dependencies
84 | run: |
85 | sudo apt update
86 | sudo apt install dnf systemd-container
87 | sudo apt install postgresql-client
88 | sudo ./tests/scripts/setup_fedora_container.sh
89 | sudo apt install curl ca-certificates gnupg
90 | sudo apt install python3-bpfcc python3-pip libunwind-dev linux-headers-$(uname -r)
91 | # Also install the Python dependencies on the host, for the parts of
92 | # the test suite that run outside the container
93 | sudo pip install -U pip toml
94 | sudo pip install -e '.[test]'
95 |
96 | - id: fedora_tests
97 | run: |
98 | sudo pytest --postgresql-host 172.16.0.1 --container fedora --cov src/ --cov-report=xml -m "not slow"
99 |
100 | - name: Upload coverage reports to Codecov
101 | uses: codecov/codecov-action@v3
102 | with:
103 | env_vars: PGVERSION, DISTRO
104 | fail_ci_if_error: true
105 | files: ./coverage.xml
106 | verbose: true
107 | name: codecov
108 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *.egg-info
3 | build
4 | *.tmp
5 | .coverage
6 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/pycqa/isort
3 | rev: 5.12.0
4 | hooks:
5 | - id: isort
6 | name: isort (python)
7 | - repo: https://github.com/psf/black
8 | rev: 23.1.0
9 | hooks:
10 | - id: black
11 |
--------------------------------------------------------------------------------
/.pylintrc:
--------------------------------------------------------------------------------
1 | [MESSAGES CONTROL]
2 | disable=too-few-public-methods
3 |
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Contributor Covenant Code of Conduct
2 |
3 | ## Our Pledge
4 |
5 | We as members, contributors, and leaders pledge to make participation in our
6 | community a harassment-free experience for everyone, regardless of age, body
7 | size, visible or invisible disability, ethnicity, sex characteristics, gender
8 | identity and expression, level of experience, education, socio-economic status,
9 | nationality, personal appearance, race, religion, or sexual identity
10 | and orientation.
11 |
12 | We pledge to act and interact in ways that contribute to an open, welcoming,
13 | diverse, inclusive, and healthy community.
14 |
15 | ## Our Standards
16 |
17 | Examples of behavior that contributes to a positive environment for our
18 | community include:
19 |
20 | * Demonstrating empathy and kindness toward other people
21 | * Being respectful of differing opinions, viewpoints, and experiences
22 | * Giving and gracefully accepting constructive feedback
23 | * Accepting responsibility and apologizing to those affected by our mistakes,
24 | and learning from the experience
25 | * Focusing on what is best not just for us as individuals, but for the
26 | overall community
27 |
28 | Examples of unacceptable behavior include:
29 |
30 | * The use of sexualized language or imagery, and sexual attention or
31 | advances of any kind
32 | * Trolling, insulting or derogatory comments, and personal or political attacks
33 | * Public or private harassment
34 | * Publishing others' private information, such as a physical or email
35 | address, without their explicit permission
36 | * Other conduct which could reasonably be considered inappropriate in a
37 | professional setting
38 |
39 | ## Enforcement Responsibilities
40 |
41 | Community leaders are responsible for clarifying and enforcing our standards of
42 | acceptable behavior and will take appropriate and fair corrective action in
43 | response to any behavior that they deem inappropriate, threatening, offensive,
44 | or harmful.
45 |
46 | Community leaders have the right and responsibility to remove, edit, or reject
47 | comments, commits, code, wiki edits, issues, and other contributions that are
48 | not aligned to this Code of Conduct, and will communicate reasons for moderation
49 | decisions when appropriate.
50 |
51 | ## Scope
52 |
53 | This Code of Conduct applies within all community spaces, and also applies when
54 | an individual is officially representing the community in public spaces.
55 | Examples of representing our community include using an official e-mail address,
56 | posting via an official social media account, or acting as an appointed
57 | representative at an online or offline event.
58 |
59 | ## Enforcement
60 |
61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
62 | reported to the community leaders responsible for enforcement at
63 | opensource@aiven.io.
64 | All complaints will be reviewed and investigated promptly and fairly.
65 |
66 | All community leaders are obligated to respect the privacy and security of the
67 | reporter of any incident.
68 |
69 | ## Enforcement Guidelines
70 |
71 | Community leaders will follow these Community Impact Guidelines in determining
72 | the consequences for any action they deem in violation of this Code of Conduct:
73 |
74 | ### 1. Correction
75 |
76 | **Community Impact**: Use of inappropriate language or other behavior deemed
77 | unprofessional or unwelcome in the community.
78 |
79 | **Consequence**: A private, written warning from community leaders, providing
80 | clarity around the nature of the violation and an explanation of why the
81 | behavior was inappropriate. A public apology may be requested.
82 |
83 | ### 2. Warning
84 |
85 | **Community Impact**: A violation through a single incident or series
86 | of actions.
87 |
88 | **Consequence**: A warning with consequences for continued behavior. No
89 | interaction with the people involved, including unsolicited interaction with
90 | those enforcing the Code of Conduct, for a specified period of time. This
91 | includes avoiding interactions in community spaces as well as external channels
92 | like social media. Violating these terms may lead to a temporary or
93 | permanent ban.
94 |
95 | ### 3. Temporary Ban
96 |
97 | **Community Impact**: A serious violation of community standards, including
98 | sustained inappropriate behavior.
99 |
100 | **Consequence**: A temporary ban from any sort of interaction or public
101 | communication with the community for a specified period of time. No public or
102 | private interaction with the people involved, including unsolicited interaction
103 | with those enforcing the Code of Conduct, is allowed during this period.
104 | Violating these terms may lead to a permanent ban.
105 |
106 | ### 4. Permanent Ban
107 |
108 | **Community Impact**: Demonstrating a pattern of violation of community
109 | standards, including sustained inappropriate behavior, harassment of an
110 | individual, or aggression toward or disparagement of classes of individuals.
111 |
112 | **Consequence**: A permanent ban from any sort of public interaction within
113 | the community.
114 |
115 | ## Attribution
116 |
117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage],
118 | version 2.0, available at
119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
120 |
121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct
122 | enforcement ladder](https://github.com/mozilla/diversity).
123 |
124 | [homepage]: https://www.contributor-covenant.org
125 |
126 | For answers to common questions about this code of conduct, see the FAQ at
127 | https://www.contributor-covenant.org/faq. Translations are available at
128 | https://www.contributor-covenant.org/translations.
129 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Welcome!
2 |
3 | Contributions to pgtracer are very welcome. When contributing, please keep this in mind:
4 |
5 | - Open an issue to discuss bigger new features.
6 | - Write code consistent with the project style and make sure the tests are passing.
7 | - Stay in touch with us in case we have follow-up questions or requests for further changes.
8 |
9 | # Development
10 |
11 | ## Local Environment
12 |
13 |
14 | ## Tests
15 |
16 |
17 | ## Static checking and Linting
18 |
19 |
20 | ## Manual testing
21 |
22 |
23 | ### Configuration
24 |
25 |
26 | # Opening a PR
27 |
28 | - Commit messages should describe the changes, not the filenames. Win our admiration by following
29 | the [excellent advice from Chris Beams](https://chris.beams.io/posts/git-commit/) when composing
30 | commit messages.
31 | - Choose a meaningful title for your pull request.
32 | - The pull request description should focus on what changed and why.
33 | - Check that the tests pass (and add test coverage for your changes if appropriate).
34 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | PGTracer
2 | Copyright (C) 2022 Aiven
3 |
4 | This program is free software: you can redistribute it and/or modify
5 | it under the terms of the GNU General Public License as published by
6 | the Free Software Foundation, either version 3 of the License, or
7 | (at your option) any later version.
8 |
9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 | GNU General Public License for more details.
13 |
14 | You should have received a copy of the GNU General Public License
15 | along with this program. If not, see <https://www.gnu.org/licenses/>.
16 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include src/pgtracer/ebpf/code/*.c
2 | include src/pgtracer/ebpf/code/*.h
3 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | PGTracer
2 | ========
3 |
4 | PGTracer is a collection of tools to trace queries, execution plans and more in
5 | PostgreSQL®, using eBPF.
6 |
7 | Overview
8 | ========
9 |
10 | PGTracer offers a way to instrument PostgreSQL, using the Linux eBPF facility.
11 | Because it performs advanced memory access, it needs the PostgreSQL debug
12 | symbols to resolve symbols and offsets in structs.
13 |
14 | Features
15 | ============
16 |
17 | * Attach to a running PostgreSQL backend, and dump every executed query along
18 | with its search path
19 | * Optionally turn on instrumentation (just like EXPLAIN ANALYZE does) to gather
20 | more information
21 |
22 | Planned features:
23 | * Gather information about individual execution nodes to print query plans
24 | * Gather system information and link it to individual nodes (think syscalls,
25 | IO, memory allocation...)
26 | * Build a TUI to explore the data
27 | * Allow following a transaction
28 |
29 |
30 | Install
31 | ============
32 |
33 | You will need a running PostgreSQL install, and its debug symbols.
34 |
35 | For pgtracer itself you will need:
36 | - libunwind installed on the system
37 | - the [BPF Compiler Collection](https://github.com/iovisor/bcc/blob/master/INSTALL.md)
38 | - several python packages as dependencies:
39 | - `psutil`
40 | - `pyelftools`
41 |
42 | Support will vary depending on your Linux distribution, kernel version, and
43 | library versions, as well as how PostgreSQL was compiled.
44 |
45 | Please file a bug if it doesn't work as expected.
46 |
47 | Ubuntu
48 | ------------
49 |
50 | To install the debug symbols, install the `postgresql-<version>-dbgsym` package (e.g. `postgresql-15-dbgsym`). You may have to enable additional repositories for this.
51 |
52 | To run pgtracer you will need some Python packages, as well as packages only available from the system repositories.
53 |
54 | ```
55 | apt install python3-bpfcc python3-pip libunwind-dev
56 | ```
57 |
58 | Then upgrade pip itself:
59 |
60 | ```
61 | pip install pip --upgrade
62 | ```
63 |
64 | And you are now ready to install the pgtracer package itself:
65 |
66 | ```
67 | git clone https://github.com/aiven/pgtracer.git
68 | cd pgtracer
69 | pip install .
70 | ```
71 |
72 |
73 | Fedora
74 | ---------
75 |
76 | To install the debugging symbols:
77 |
78 | ```
79 | yum install dnf-utils
80 | debuginfo-install postgresql-server
81 | ```
82 |
83 | For the dependencies:
84 |
85 | ```
86 | yum install python3-bcc libunwind python3-pip libunwind-devel
87 | ```
88 |
89 | Then install pgtracer itself:
90 |
91 | ```
92 | git clone https://github.com/aiven/pgtracer.git
93 | cd pgtracer
94 | pip install pip --upgrade
95 | pip install .
96 | ```
97 |
98 |
99 |
100 | Arch Linux
101 | ------------
102 |
103 | To install PostgreSQL debug symbols, as root:
104 |
105 | ```
106 | pacman -S debuginfod
107 | export DEBUGINFOD_URLS="https://debuginfod.archlinux.org/"
108 | debuginfod-find debuginfo /usr/bin/postgres
109 | ```
110 |
111 | To install the required packages:
112 |
113 | ```
114 | pacman -S python-bcc libunwind python-pip
115 | ```
116 |
117 | Then install the pgtracer package itself:
118 |
119 | ```
120 | git clone https://github.com/aiven/pgtracer.git
121 | cd pgtracer
122 | pip install .
123 | ```
124 |
125 |
126 | Usage
127 | =============
128 |
129 | Two scripts come with pgtracer, `pgtrace_queries` and `pgtrace_gucs`; the example below covers `pgtrace_queries`.
130 | Since pgtracer uses eBPF, it needs to be run as root.
131 |
132 | ```
133 | usage: pgtrace_queries [-h] [--instrument [{TIMER,BUFFERS,ROWS,WAL,ALL} ...]] [--nodes-collection] pid
134 |
135 | Dump a running backend execution plan
136 |
137 | positional arguments:
138 | pid PID to connect to
139 |
140 | options:
141 | -h, --help show this help message and exit
142 | --instrument [{TIMER,BUFFERS,ROWS,WAL,ALL} ...], -I [{TIMER,BUFFERS,ROWS,WAL,ALL} ...]
143 | Instrument flags to set. (warning: writes into backends memory!)
144 | --nodes-collection, -n
145 | Collect information about individual execution nodes
146 | ```
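
For example, to attach to the backend with PID 12345 (a placeholder; pick a real
backend PID, e.g. from `pg_stat_activity`) and enable every instrumentation flag:

```
sudo pgtrace_queries 12345 -I ALL
```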
147 |
148 |
149 |
150 | Depending on the way the PostgreSQL binary has been compiled, you may need a
151 | more recent pyelftools version than the one packaged by your distribution:
152 | DWARF5 support is quite recent and continuously improving.
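
If query tracing fails with DWARF parsing errors, upgrading pyelftools is a
reasonable first step:

```
pip install --upgrade pyelftools
```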
153 |
154 |
155 |
156 |
157 |
158 | License
159 | =======
160 | pgtracer is licensed under the PostgreSQL license. Full license text is available in the [LICENSE](LICENSE) file.
161 |
162 | Please note that the project explicitly does not require a CLA (Contributor License Agreement) from its contributors.
163 |
164 | Contact
165 | ============
166 | Bug reports and patches are very welcome, please post them as GitHub issues and pull requests at https://github.com/aiven/pgtracer .
167 | To report any possible vulnerabilities or other serious issues please see our [security](SECURITY.md) policy.
168 |
169 | Trademarks
170 | ==========
171 |
172 | The terms Postgres and PostgreSQL are registered trademarks of the PostgreSQL Community Association of Canada.
173 |
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 | # Security Policy
2 |
3 | ## Supported Versions
4 |
5 | We release patches for security vulnerabilities. Which versions are eligible
6 | to receive such patches depends on the CVSS v3.0 Rating:
7 |
8 | | CVSS v3.0 | Supported Versions |
9 | | --------- | ----------------------------------------- |
10 | | 4.0-10.0 | Most recent release |
11 |
12 | ## Reporting a Vulnerability
13 |
14 | Please report (suspected) security vulnerabilities to our **[bug bounty
15 | program](https://hackerone.com/aiven_ltd)**. You will receive a response from
16 | us within 2 working days. If the issue is confirmed, we will release a patch as
17 | soon as possible depending on impact and complexity.
18 |
19 | ## Qualifying Vulnerabilities
20 |
21 | Any reproducible vulnerability that has a severe effect on the security or
22 | privacy of our users is likely to be in scope for the program.
23 |
24 | We generally **aren't** interested in the following issues:
25 | * Social engineering (e.g. phishing, vishing, smishing) attacks
26 | * Brute force, DoS, text injection
27 | * Missing best practices such as HTTP security headers (CSP, X-XSS, etc.),
28 | email (SPF/DKIM/DMARC records), SSL/TLS configuration.
29 | * Software version disclosure / Banner identification issues / Descriptive
30 | error messages or headers (e.g. stack traces, application or server errors).
31 | * Clickjacking on pages with no sensitive actions
32 | * Theoretical vulnerabilities where you can't demonstrate a significant
33 | security impact with a proof of concept.
34 |
--------------------------------------------------------------------------------
/mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | python_version = 3.7
3 |
4 | [mypy-elftools.*]
5 | ignore_missing_imports = True
6 |
7 | [mypy-psutil.*]
8 | ignore_missing_imports = True
9 |
10 | [mypy-bcc.*]
11 | ignore_missing_imports = True
12 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "pgtracer"
3 | description = "Tracing tools for PostgreSQL"
4 | version = "0.1.0"
5 | authors = [
6 | { name = "Ronan Dunklau", email = "ronan.dunklau@aiven.com" }
7 | ]
8 | dependencies = [
9 | "pyelftools",
10 | "pypsutil"
11 | ]
12 | readme = "README.md"
13 | requires-python = ">=3.7"
14 | classifiers = [
15 | "Programming Language :: Python :: 3",
16 | "License :: OSI Approved :: PostgreSQL License",
17 | "Operating System :: POSIX :: Linux",
18 | ]
19 |
20 |
21 | [project.scripts]
22 | pgtrace_queries = "pgtracer.scripts.pgtrace_queries:main"
23 | pgtrace_gucs = "pgtracer.scripts.pgtrace_gucs:main"
24 |
25 | [project.optional-dependencies]
26 | lint = [
27 | 'black',
28 | 'isort',
29 | 'mypy',
30 | 'pylint',
31 | 'pylint-exit',
32 | ]
33 |
34 | test = [
35 | 'psycopg',
36 | 'pytest',
37 | 'pytest-coverage',
38 | 'pytest-postgresql',
39 | 'flaky'
40 | ]
41 |
42 | [tool.isort]
43 | profile = "black"
44 |
--------------------------------------------------------------------------------
/src/pgtracer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Aiven-Open/pgtracer/5faf30a695f7b45711e987df8a82c904a36e581f/src/pgtracer/__init__.py
--------------------------------------------------------------------------------
/src/pgtracer/ebpf/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Aiven-Open/pgtracer/5faf30a695f7b45711e987df8a82c904a36e581f/src/pgtracer/ebpf/__init__.py
--------------------------------------------------------------------------------
/src/pgtracer/ebpf/code/block_rq.c:
--------------------------------------------------------------------------------
1 | #include <linux/blkdev.h>
2 | #include "data.h"
3 | #include "utils.h"
4 |
5 | struct io_req_data_t {
6 | event_base event_base;
7 | char rwbs[8];
8 | u64 bytes;
9 | };
10 |
11 |
12 | TRACEPOINT_PROBE(block, block_rq_issue)
13 | {
14 | struct io_req_data_t *event;
15 | ##CHECK_POSTMASTER##
16 | /* We need to filter on pid ourselves inside syscalls. */
17 | #ifdef PID
18 | if (bpf_get_current_pid_tgid() >> 32 != PID)
19 | return 1;
20 | #endif
21 |
22 | event = event_ring.ringbuf_reserve(sizeof(struct io_req_data_t));
23 | if (!event)
24 | return 1;
25 |
26 | fill_event_base(&(event->event_base), EventTypeKBlockRqIssue);
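/* nr_sector counts 512-byte sectors; shift left by 9 to convert to bytes. */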
27 | event->bytes = args->nr_sector << 9;
28 | if (event->bytes == 0) {
29 | event->bytes = args->bytes;
30 | }
31 | bpf_probe_read(&event->rwbs, sizeof(event->rwbs), args->rwbs);
32 | event_ring.ringbuf_submit(event, 0);
33 | return 0;
34 | }
35 |
--------------------------------------------------------------------------------
/src/pgtracer/ebpf/code/data.h:
--------------------------------------------------------------------------------
1 | #ifndef DATA_H
2 | #define DATA_H
3 | #include "stack.h"
4 |
5 | typedef struct event_base {
6 | short event_type;
7 | int pid;
8 | } event_base;
9 |
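/*
 * 128-bit identifier. get_portal_key() (utils.h) fills u1 with the backend's
 * pid/tgid and u2 with the portal's creation timestamp, uniquely keying a
 * portal across events.
 */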
10 | typedef struct Id128 {
11 | u64 u1;
12 | u64 u2;
13 | } Id128;
14 |
15 | struct portal_data_t {
16 | event_base event_base;
17 | Id128 portal_key;
18 | u64 queryAddr;
19 | u64 query_id;
20 | double startup_cost;
21 | double total_cost;
22 | double plan_rows;
23 | char query[MAX_QUERY_LENGTH]; // Dynamically injected using defines
24 | char instrument[STRUCT_SIZE_Instrumentation]; // Dynamically injected using defines
25 | char search_path[MAX_SEARCHPATH_LENGTH];
26 | };
27 |
28 | struct plan_data_t {
29 | u64 plan_addr;
30 | int plan_tag;
31 | double startup_cost;
32 | double total_cost;
33 | double plan_rows;
34 | int plan_width;
35 | bool parallel_aware;
36 | };
37 |
38 | struct planstate_data_t {
39 | event_base event_base;
40 | Id128 portal_key;
41 | u64 planstate_addr;
42 | int planstate_tag;
43 | u64 lefttree;
44 | u64 righttree;
45 | struct plan_data_t plan_data;
46 | char instrument[STRUCT_SIZE_Instrumentation]; // Dynamically injected using defines
47 | struct stack_data_t stack_capture;
48 | };
49 |
50 | #endif
51 |
--------------------------------------------------------------------------------
/src/pgtracer/ebpf/code/ebpf_maps.h:
--------------------------------------------------------------------------------
1 | #ifndef EBPF_MAPS_H
2 | #define EBPF_MAPS_H
3 | /* Main ringbuf for communicating events to user space. */
4 | BPF_RINGBUF_OUTPUT(event_ring, EVENTRING_PAGE_SIZE);
5 |
6 | #endif
7 |
--------------------------------------------------------------------------------
/src/pgtracer/ebpf/code/gucset.c:
--------------------------------------------------------------------------------
1 | #include "ebpf_maps.h"
2 |
3 | struct guc_request_t {
4 | u64 guc_location;
5 | int guc_size;
6 | char payload[GUC_MAX_LENGTH];
7 | };
8 |
9 | struct guc_response_t {
10 | short event_type;
11 | u64 guc_location;
12 | bool status;
13 | };
14 |
15 | BPF_QUEUE(gucs_to_set, struct guc_request_t, 128);
16 |
17 |
18 | /* This will be attached at various points in the program flow,
19 | * to override GUCs as seen fit.
20 | * */
21 | int process_guc_uprobe(struct pt_regs *ctx)
22 | {
23 | struct guc_request_t guc_request;
24 | struct guc_response_t *guc_response;
25 | int i = 0;
26 | int size = 0;
27 | int ret;
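/* Bounded loop: the eBPF verifier requires a constant bound, so process at
 * most 20 queued GUC requests per probe hit. */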
28 | while (i < 20)
29 | {
30 | guc_response = event_ring.ringbuf_reserve(sizeof(struct guc_response_t));
31 | if (!guc_response)
32 | return 1;
33 | guc_response->event_type = EventTypeGUCResponse;
34 |
35 | /* If there is no request to process, bail out */
36 | if (gucs_to_set.pop(&guc_request) < 0)
37 | {
38 | event_ring.ringbuf_discard(guc_response, 0);
39 | return 1;
40 | }
41 | guc_response->guc_location = guc_request.guc_location;
42 | size = guc_request.guc_size;
43 | clamp_umax(size, GUC_MAX_LENGTH);
44 | ret = -1;
45 | if (size > 0 && guc_request.guc_size <= GUC_MAX_LENGTH)
46 | ret = bpf_probe_write_user((void *) guc_request.guc_location, &(guc_request.payload), size);
47 | guc_response->status = (ret >= 0);
48 | event_ring.ringbuf_submit(guc_response, 0);
49 | i++;
50 | }
51 | return 0;
52 | }
53 |
--------------------------------------------------------------------------------
/src/pgtracer/ebpf/code/memusage.c:
--------------------------------------------------------------------------------
1 | #include "ebpf_maps.h"
2 | #include "stack.h"
3 | #include "linux/sched.h"
4 | #include "utils.h"
5 | #include "data.h"
6 |
7 | #define offsetof(type, member) __builtin_offsetof (type, member)
8 |
9 |
10 | struct memory_account_t {
11 | event_base event_base;
12 | long long size;
13 | short kind;
14 | };
15 |
16 |
17 | static inline int send_memory_account(long long size, short kind)
18 | {
19 | struct memory_account_t *account = event_ring.ringbuf_reserve(sizeof(struct memory_account_t));
20 | if (!account)
21 | return 1;
22 | fill_event_base(&(account->event_base), EventTypeMemoryAccount);
23 | account->size = size;
24 | account->kind = kind;
25 | event_ring.ringbuf_submit(account, 0);
26 | return 0;
27 | }
28 |
29 | /*
30 | * sbrk moves are instrumented through glibc's malloc USDT probes.
31 | */
32 | int sbrk_more(struct pt_regs *ctx)
33 | {
34 | ##CHECK_POSTMASTER##
35 | size_t size;
36 | bpf_usdt_readarg(2, ctx, &size);
37 | return send_memory_account(size, MemoryAllocTypeSbrk);
38 | }
39 |
40 | int sbrk_less(struct pt_regs *ctx)
41 | {
42 | ##CHECK_POSTMASTER##
43 | size_t size;
44 | bpf_usdt_readarg(2, ctx, &size);
45 | return send_memory_account(-size, MemoryAllocTypeSbrk);
46 | }
47 |
48 | /*
49 | * glibc doesn't offer tracepoints for mmap, so instrument the functions directly.
50 | */
51 |
52 | int mmap_enter(struct pt_regs *ctx)
53 | {
54 | ##CHECK_POSTMASTER##
55 | size_t size = PT_REGS_PARM2(ctx);
56 | return send_memory_account(size, MemoryAllocTypeMmap);
57 | }
58 |
59 | int munmap_enter(struct pt_regs *ctx)
60 | {
61 | ##CHECK_POSTMASTER##
62 | size_t size = PT_REGS_PARM2(ctx);
63 | return send_memory_account(-size, MemoryAllocTypeMmap);
64 | }
65 |
--------------------------------------------------------------------------------
/src/pgtracer/ebpf/code/perf.c:
--------------------------------------------------------------------------------
1 | #include "ebpf_maps.h"
2 | #include "stack.h"
3 | #include "uapi/linux/bpf_perf_event.h"
4 | #include "utils.h"
5 | #include "data.h"
6 |
7 | struct memory_request_t {
8 | short event_type;
9 | Id128 request_id;
10 | int path_size;
11 | u64 size;
12 | u64 memory_path[MEMORY_PATH_SIZE];
13 | };
14 |
15 | struct memory_response_t {
16 | event_base event_base;
17 | Id128 request_id;
18 | char payload[MEMORY_REQUEST_MAXSIZE];
19 | };
20 |
21 | /*
22 | * We embed the whole portal_data_t
23 | */
24 | struct stack_sample_t {
25 | struct portal_data_t portal_data;
26 | struct stack_data_t stack_data;
27 | };
28 |
29 | # define QUERY_DISCOVERY_KEY 1
30 | # define NODE_DISCOVERY_KEY 2
31 | BPF_HASH(discovery_enabled, int, bool, 2);
32 |
33 | BPF_QUEUE(memory_requests, struct memory_request_t, 1024);
34 | /* Define one queue per process */
35 | #if LIBBCC_VERSION_GEQ(0, 21, 0)
36 | BPF_HASH_OF_MAPS(pid_queues, int, "memory_requests", 1024);
37 | #else
38 | BPF_HASH_OF_MAPS(pid_queues, "memory_requests", 1024);
39 | #endif
40 |
41 | /*
42 | * This code is run on perf event, with a specific frequency.
43 | * What we want is to be able to read specific memory locations whenever the perf
44 | * event is triggered.
45 | *
46 | * Userland code pushes memory locations to read to the memory_requests queues,
47 | * and sends the responses back through the same event_ringbuffer used
48 | * everywhere.
49 | */
50 | int perf_event(struct bpf_perf_event_data *ctx)
51 | {
52 | ##CHECK_POSTMASTER##
53 | struct memory_request_t request;
54 | struct memory_response_t *response;
55 | u64 size;
56 | u64 memory_location;
57 | int pid = (bpf_get_current_pid_tgid() >> 32);
58 | int i = 0;
59 | int j;
60 | void * queue;
61 | #ifdef ENABLE_QUERY_DISCOVERY
62 | int key = QUERY_DISCOVERY_KEY;
63 | bool *need_discovery;
64 | need_discovery = discovery_enabled.lookup(&key);
65 | bool need_query = (need_discovery && *need_discovery);
66 | key = NODE_DISCOVERY_KEY;
67 | need_discovery = discovery_enabled.lookup(&key);
68 | bool need_node = (need_discovery && *need_discovery);
69 | if (need_query || need_node)
70 | {
71 | void *activeportal = 0;
72 | bpf_probe_read_user(&activeportal,
73 | sizeof(void*),
74 | (void *) GlobalVariablesActivePortal);
75 | /* Only proceed if we have a current query. */
76 | if(activeportal != 0)
77 | {
78 | struct stack_sample_t *stack_sample = event_ring.ringbuf_reserve(sizeof(struct stack_sample_t));
79 |
80 | /*
81 | * If we can't allocate for the stack sample, we keep going to the memory request code.
82 | */
83 | if (stack_sample)
84 | {
85 | fill_event_base(&(stack_sample->portal_data.event_base), EventTypeStackSample);
86 | if (need_query)
87 | {
88 | void *queryDesc = 0;
89 | bpf_probe_read_user(&queryDesc, sizeof(void *),
90 | OffsetFrom(activeportal, PortalData, queryDesc));
91 | fill_portal_data(queryDesc, &stack_sample->portal_data);
92 | stack_sample->portal_data.portal_key = get_portal_key(activeportal);
93 | }
94 | if (need_node)
95 | {
96 | capture_stack(&(ctx->regs), &(stack_sample->stack_data), MAX_STACK_READ);
97 | }
98 | event_ring.ringbuf_submit(stack_sample, 0);
99 | }
100 | }
101 | }
102 | #endif
103 | queue = pid_queues.lookup(&pid);
104 | if (!queue)
105 | return 0;
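/* Bounded loop for the verifier: serve at most 5 memory requests per perf
 * event; remaining requests stay queued for the next one. */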
106 | while (i < 5)
107 | {
108 |
109 | /* No more requests to process. */
110 | if (bpf_map_pop_elem(queue, &request) < 0)
111 | {
112 | return 0;
113 | }
114 |
115 | size = request.size;
116 | /* We treat those specially, as we have the opportunity to gather a bunch of
117 | * data at the same time.
118 | */
119 | if (request.event_type == EventTypeMemoryNodeData)
120 | {
121 | struct planstate_data_t *response = event_ring.ringbuf_reserve(sizeof(struct planstate_data_t));
122 | if (!response)
123 | return 1;
124 | fill_event_base(&(response->event_base), EventTypeMemoryNodeData);
125 | record_node((void *) request.memory_path[0], response, NULL, false);
126 | event_ring.ringbuf_submit(response, 0);
127 | i++;
128 | continue;
129 | }
130 | response = event_ring.ringbuf_reserve(sizeof(struct memory_response_t));
131 | if (!response)
132 | return 1;
133 |
134 | fill_event_base(&(response->event_base), request.event_type);
135 | if (size >= MEMORY_REQUEST_MAXSIZE)
136 | size = MEMORY_REQUEST_MAXSIZE;
137 | /*
138 | * request.path_size can't be greater than MEMORY_PATH_SIZE,
139 | * but the eBPF verifier doesn't know this.
140 | */
141 | memory_location = 0;
142 | j = 0;
143 | /* Chase pointers as needed */
144 | while(j < request.path_size - 1 && j < MEMORY_PATH_SIZE)
145 | {
146 | if (memory_location != 0)
147 | {
148 | if(bpf_probe_read_user(&memory_location, sizeof(u64),
149 | (void *) memory_location))
150 | {
151 | /* We failed to read here, so bail out. */
152 | event_ring.ringbuf_discard(response, 0);
153 | return 0;
154 | }
155 | }
156 | memory_location = request.memory_path[j] + memory_location;
157 | j++;
158 | }
159 | if (bpf_probe_read_user(&response->payload, size, (void *) memory_location))
160 | {
161 | event_ring.ringbuf_discard(response, 0);
162 | } else {
163 | response->request_id = request.request_id;
164 | event_ring.ringbuf_submit(response, 0);
165 | }
166 | i++;
167 | }
168 | return 0;
169 | }
170 |
--------------------------------------------------------------------------------
/src/pgtracer/ebpf/code/plan.c:
--------------------------------------------------------------------------------
1 | #include "data.h"
2 | #include "utils.h"
3 | #include "stack.h"
4 |
5 | int execprocnodefirst_enter(struct pt_regs *ctx);
6 | int execendnode_enter(struct pt_regs *ctx);
7 |
8 | /*
9 | * On each first execution of a node, send the node information to the client
10 | * side
11 | */
12 | int execprocnodefirst_enter(struct pt_regs *ctx)
13 | {
14 | ##CHECK_POSTMASTER##
15 | struct planstate_data_t *node;
16 | node = event_ring.ringbuf_reserve(sizeof(struct planstate_data_t));
17 | if (!node)
18 | return 0;
19 | fill_event_base(&(node->event_base), EventTypeExecProcNodeFirst);
20 | record_node((void *) PT_REGS_PARM1(ctx), node, ctx, true);
21 | event_ring.ringbuf_submit(node, 0);
22 | return 0;
23 | }
24 |
25 | /*
26 | * On each node teardown, send the node information to the client side (again)
27 | */
28 | int execendnode_enter(struct pt_regs *ctx)
29 | {
30 | ##CHECK_POSTMASTER##
31 | struct planstate_data_t *node;
32 | node = event_ring.ringbuf_reserve(sizeof(struct planstate_data_t));
33 | if (!node)
34 | return 0;
35 | fill_event_base(&(node->event_base), EventTypeExecEndNode);
36 | record_node((void *) PT_REGS_PARM1(ctx), node, ctx, true);
37 | event_ring.ringbuf_submit(node, 0);
38 | return 0;
39 | }
40 |
41 |
--------------------------------------------------------------------------------
/src/pgtracer/ebpf/code/program.c:
--------------------------------------------------------------------------------
1 | #include <linux/ptrace.h>
2 | #include "ebpf_maps.h"
3 | #include "data.h"
4 | #include "utils.h"
5 |
6 | static int override_instrument_options(void * querydesc);
7 |
8 | int executorstart_enter(struct pt_regs *ctx)
9 | {
10 | ##CHECK_POSTMASTER##
11 | void *queryDesc = (void *) PT_REGS_PARM1(ctx);
12 | #ifdef USER_INSTRUMENT_FLAGS
13 | override_instrument_options(queryDesc);
14 | #endif
15 | return 0;
16 | }
17 |
18 | int executorrun_enter(struct pt_regs *ctx)
19 | {
20 | u64 ppid;
21 | ##CHECK_POSTMASTER##
22 | void *queryDesc = (void *) PT_REGS_PARM1(ctx);
23 | void *sourceText;
24 | void *portaladdr;
25 | void *search_path;
26 | void *plan;
27 |
28 | struct portal_data_t *event;
29 | bpf_probe_read_user(&portaladdr,
30 | sizeof(void*),
31 | (void *) GlobalVariablesActivePortal);
32 | Id128 key = get_portal_key(portaladdr);
33 | event = event_ring.ringbuf_reserve(sizeof(struct portal_data_t));
34 | if (!event)
35 | return 1;
36 | fill_event_base(&(event->event_base), EventTypeExecutorRun);
37 | event->portal_key = key;
38 | fill_portal_data(queryDesc, event);
39 | bpf_probe_read_user(&search_path, sizeof(void *), (void *) GlobalVariablesnamespace_search_path);
40 | bpf_probe_read_user_str(&event->search_path, MAX_SEARCHPATH_LENGTH,
41 | search_path);
42 | event_ring.ringbuf_submit(event, 0);
43 | return 0;
44 | }
45 |
46 | int executorfinish_enter(struct pt_regs *ctx)
47 | {
48 | ##CHECK_POSTMASTER##
49 | void *portal;
50 | void *queryDesc = (void *) PT_REGS_PARM1(ctx);
51 | struct portal_data_t *event;
52 | Id128 key;
53 | bpf_probe_read_user(&portal,
54 | sizeof(void*),
55 | (void *) GlobalVariablesActivePortal);
56 |
57 | key = get_portal_key((void*) portal);
58 | event = event_ring.ringbuf_reserve(sizeof(struct portal_data_t));
59 | if (!event)
60 | return 1;
61 | init_portal_data(event);
62 | fill_portal_data(queryDesc, event);
63 | fill_event_base(&(event->event_base), EventTypeExecutorFinish);
64 | event->portal_key = key;
65 | event_ring.ringbuf_submit(event, 0);
66 | return 0;
67 | }
68 |
69 | int portaldrop_return(struct pt_regs *ctx)
70 | {
71 | ##CHECK_POSTMASTER##
72 | struct portal_data_t *event;
73 | Id128 key = {0};
74 | event = event_ring.ringbuf_reserve(sizeof(struct portal_data_t));
75 | if (!event)
76 | return 1;
77 | init_portal_data(event);
78 | fill_event_base(&(event->event_base), EventTypeDropPortalReturn);
79 | event->portal_key = key;
80 | event_ring.ringbuf_submit(event, 0);
81 | return 0;
82 | }
83 |
84 | int portaldrop_enter(struct pt_regs *ctx)
85 | {
86 | ##CHECK_POSTMASTER##
87 | void *portal = (void *) PT_REGS_PARM1(ctx);
88 | Id128 key = get_portal_key(portal);
89 | struct portal_data_t *event;
90 | void *queryDesc;
91 | event = event_ring.ringbuf_reserve(sizeof(struct portal_data_t));
92 | if (!event)
93 | return 1;
94 | init_portal_data(event);
95 | bpf_probe_read_user(&queryDesc, sizeof(void *),
96 | OffsetFrom(portal, PortalData, queryDesc));
97 | fill_portal_data(queryDesc, event);
98 | fill_event_base(&(event->event_base), EventTypeDropPortalEnter);
99 | event->portal_key = key;
100 | event_ring.ringbuf_submit(event, 0);
101 | return 0;
102 | }
103 |
104 | /* When instrumenting a whole cluster, we also trace new processes.
105 | * Additionally, specific collectors can embed code in here.
106 | */
107 | #ifdef POSTMASTER_PID
108 | TRACEPOINT_PROBE(sched, sched_process_fork)
109 | {
110 | u32 pid = args->parent_pid;
111 | if (args->parent_pid != POSTMASTER_PID)
112 | return 0;
113 | struct event_base *event;
114 | event = event_ring.ringbuf_reserve(sizeof (struct event_base));
115 | if (!event)
116 | return 1;
117 | event->pid = args->child_pid;
118 | event->event_type = EventTypeProcessFork;
119 | event_ring.ringbuf_submit(event, 0);
120 | return 0;
121 | }
122 | #endif
123 |
124 | TRACEPOINT_PROBE(sched, sched_process_exit)
125 | {
126 | ##CHECK_POSTMASTER##
127 | #ifdef PID
128 | if (bpf_get_current_pid_tgid() >> 32 != PID)
129 | return 1;
130 | #endif
131 | struct event_base *event;
132 | event = event_ring.ringbuf_reserve(sizeof (struct event_base));
133 | if (!event)
134 | return 1;
135 | fill_event_base(event, EventTypeProcessExit);
136 | event_ring.ringbuf_submit(event, 0);
137 | return 0;
138 | }
139 |
140 |
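/*
 * OR the user-requested instrumentation flags into
 * queryDesc->instrument_options, the same flags EXPLAIN ANALYZE would set.
 * Note that this writes into the backend's memory.
 */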
141 | #ifdef USER_INSTRUMENT_FLAGS
142 | static int override_instrument_options(void * querydesc)
143 | {
144 | void * options_addr = OffsetFrom(querydesc, QueryDesc, instrument_options);
145 | int instr_options;
146 | bpf_probe_read_user(&instr_options,
147 | sizeof(int),
148 | options_addr);
149 | instr_options |= USER_INSTRUMENT_FLAGS;
150 | return bpf_probe_write_user(options_addr, &instr_options, sizeof(int));
151 | }
152 | #endif
153 |
154 | #ifdef CAPTURE_PLANS
155 | #include "plan.h"
156 | #endif
157 |
--------------------------------------------------------------------------------
/src/pgtracer/ebpf/code/stack.h:
--------------------------------------------------------------------------------
1 | #ifndef STACK_H
2 | #define STACK_H
3 | #include <linux/ptrace.h>
4 |
5 | struct stack_data_t {
6 | u64 rax;
7 | u64 rdx;
8 | u64 rcx;
9 | u64 rbx;
10 | u64 rsi;
11 | u64 rdi;
12 | u64 rbp;
13 | u64 rsp;
14 | u64 r8;
15 | u64 r9;
16 | u64 r10;
17 | u64 r11;
18 | u64 r12;
19 | u64 r13;
20 | u64 r14;
21 | u64 r15;
22 | u64 rip;
23 | u64 size;
24 | u64 start_addr;
25 | char stack[MAX_STACK_READ]; // Dynamically injected using defines
26 | };
27 |
28 | /*
29 | * Capture the current stack and register values.
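 * The saved registers and raw stack bytes are later consumed by userland
 * (unwind.py) to unwind the backend's stack.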
30 | */
31 | static inline int capture_stack(struct pt_regs *ctx, struct stack_data_t *stack_data, u64 max_read)
32 | {
33 | int ret = 0;
34 | stack_data->rax = ctx->ax;
35 | stack_data->rdx = ctx->dx;
36 | stack_data->rcx = ctx->cx;
37 | stack_data->rbx = ctx->bx;
38 | stack_data->rsi = ctx->si;
39 | stack_data->rdi = ctx->di;
40 | stack_data->rbp = ctx->bp;
41 | stack_data->rsp = ctx->sp;
42 | stack_data->r8 = ctx->r8;
43 | stack_data->r9 = ctx->r9;
44 | stack_data->r10 = ctx->r10;
45 | stack_data->r11 = ctx->r11;
46 | stack_data->r12 = ctx->r12;
47 | stack_data->r13 = ctx->r13;
48 | stack_data->r14 = ctx->r14;
49 | stack_data->r15 = ctx->r15;
50 | stack_data->rip = ctx->ip;
51 | stack_data->start_addr = stack_data->rsp;
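/* STACK_TOP_ADDR (the top address of the stack) and MAX_STACK_READ are
 * injected as defines by the collector. */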
52 | stack_data->size = (STACK_TOP_ADDR - stack_data->rsp);
53 | if (stack_data->size > max_read)
54 | stack_data->size = max_read;
55 | ret = bpf_probe_read_user(&stack_data->stack,
56 | stack_data->size,
57 | (void *) (stack_data->rsp));
58 | if (ret != 0)
59 | {
60 | stack_data->size = 0;
61 | }
62 | return ret;
63 | }
64 |
65 | #endif
66 |
--------------------------------------------------------------------------------
/src/pgtracer/ebpf/code/utils.h:
--------------------------------------------------------------------------------
1 | #ifndef UTILS_H
2 | #define UTILS_H
3 | #define EPOCH_OFFSET ((POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY)
4 |
5 | #define Offset(structname, member) (STRUCT_ ## structname ## _OFFSET_ ## member)
6 | #define OffsetFrom(pointer, structname, member) ((void *) (pointer + Offset(structname, member)))
7 |
8 | /* Reuse code from libbcc for version matching */
9 | #define __LIBBCC_VERSION(a, b, c) (((a) << 16) + ((b) << 8) + ((c) > 255 ? 255 : (c)))
10 | #define LIBBCC_VERSION_CODE __LIBBCC_VERSION(LIBBCC_MAJOR_VERSION, LIBBCC_MINOR_VERSION, LIBBCC_PATCH_VERSION)
11 | #define LIBBCC_VERSION_GEQ(a,b,c) LIBBCC_VERSION_CODE >= __LIBBCC_VERSION(a, b, c)
12 |
13 |
14 | #include "data.h"
15 |
16 | /* Clamp a value to a max value, and make the eBPF verifier happy. */
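/* The inline asm makes the bound explicit to the verifier, so the clamped
 * value can then be used as a size for probe reads and writes (see gucset.c). */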
17 | #define clamp_umax(VAR, UMAX) \
18 | asm volatile ( \
19 | "if %0 <= %[max] goto +1\n" \
20 | "%0 = %[max]\n" \
21 | : "+r"(VAR) \
22 | : [max]"i"(UMAX) \
23 | )
24 |
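/*
 * Convert a PostgreSQL timestamp (microseconds since 2000-01-01) to
 * microseconds since the Unix epoch, by shifting it by EPOCH_OFFSET seconds.
 */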
25 | static u64 pgts_to_unixts(u64 pgts)
26 | {
27 | ulong secs = (ulong) pgts / 1000000;
28 | uint microsecs = (uint) pgts % 1000000;
29 | return (secs + EPOCH_OFFSET) * 1000000 + microsecs;
30 | }
31 |
32 |
33 | // Handle code related to the portal information capture
34 | static inline Id128 get_portal_key(void * portal)
35 | {
36 | Id128 ret;
37 | u64 creation_time;
38 | __builtin_memset(&ret, 0, sizeof(ret));
39 | ret.u1 = bpf_get_current_pid_tgid();
40 | bpf_probe_read_user(&creation_time,
41 | sizeof(u64),
42 | OffsetFrom(portal, PortalData, creation_time));
43 | ret.u2 = pgts_to_unixts(creation_time);
44 | return ret;
45 | }
46 |
47 | static inline void fill_event_base(event_base* event, short event_type)
48 | {
49 | event->event_type = event_type;
50 | event->pid = (bpf_get_current_pid_tgid() >> 32);
51 | }
52 |
53 | static inline void fill_portal_data(void * queryDesc, struct portal_data_t* event)
54 | {
55 | void *sourceText;
56 | void *planstate;
57 | void *instrument;
58 | void *plannedStmt;
59 | void *plan;
60 | int ret;
61 | event->queryAddr = (u64) queryDesc;
62 | bpf_probe_read_user(&sourceText,
63 | sizeof(void *),
64 | OffsetFrom(queryDesc, QueryDesc, sourceText));
65 | bpf_probe_read_user_str(&event->query,
66 | MAX_QUERY_LENGTH,
67 | (void *) sourceText);
68 | ret = bpf_probe_read_user(&plannedStmt,
69 | sizeof(void *),
70 | OffsetFrom(queryDesc, QueryDesc, plannedstmt));
71 | if (plannedStmt && ret == 0)
72 | {
73 | bpf_probe_read_user(&event->query_id,
74 | sizeof(u64),
75 | OffsetFrom(plannedStmt, PlannedStmt, queryId));
76 | }
77 | ret = bpf_probe_read_user(&planstate,
78 | sizeof(void *),
79 | OffsetFrom(queryDesc, QueryDesc, planstate));
80 | if (planstate && ret == 0)
81 | {
82 | ret = bpf_probe_read_user(&plan, sizeof(void *),
83 | OffsetFrom(planstate, PlanState, plan));
84 | if (plan && ret == 0)
85 | {
86 | bpf_probe_read_user(&event->startup_cost,
87 | sizeof(double),
88 | OffsetFrom(plan, Plan, startup_cost));
89 | bpf_probe_read_user(&event->total_cost,
90 | sizeof(double),
91 | OffsetFrom(plan, Plan, total_cost));
92 | bpf_probe_read_user(&event->plan_rows,
93 | sizeof(double),
94 | OffsetFrom(plan, Plan, plan_rows));
95 | }
96 | ret = bpf_probe_read_user(&instrument,
97 | sizeof(void *),
98 | OffsetFrom(planstate, PlanState, instrument));
99 | if (instrument && ret == 0)
100 | {
101 | bpf_probe_read_user(&event->instrument,
102 | STRUCT_SIZE_Instrumentation,
103 | instrument);
104 | }
105 | }
106 | }
107 |
108 | static inline void init_portal_data(struct portal_data_t* event)
109 | {
110 | event->query[0] = 0;
111 | event->instrument[0] = 0;
112 | event->search_path[0] = 0;
113 | }
114 |
115 | /*
116 | * Record information about a PlanStateNode
117 | */
118 | static inline void record_node(void * nodeaddr, struct planstate_data_t *node,
119 | struct pt_regs *ctx, bool need_capture_stack)
120 | {
121 | void *portal;
122 | void *instrument;
123 | void *planaddr;
124 | bpf_probe_read_user(&portal,
125 | sizeof(void*),
126 | (void *) GlobalVariablesActivePortal);
127 | node->portal_key = get_portal_key(portal);
128 | node->planstate_addr = (u64) nodeaddr;
129 | if (need_capture_stack)
130 | capture_stack(ctx, &node->stack_capture, MAX_STACK_READ);
131 |
132 | /* Read the associated Plan node, and its estimates */
133 | bpf_probe_read_user(&planaddr,
134 | sizeof(void *),
135 | OffsetFrom(nodeaddr, PlanState, plan));
136 | node->plan_data.plan_addr = (u64) planaddr;
137 | bpf_probe_read_user(&node->plan_data.plan_tag,
138 | sizeof(int),
139 | OffsetFrom(planaddr, Plan, type));
140 |
141 | bpf_probe_read_user(&node->plan_data.startup_cost,
142 | sizeof(double),
143 | OffsetFrom(planaddr, Plan, startup_cost));
144 | bpf_probe_read_user(&node->plan_data.total_cost,
145 | sizeof(double),
146 | OffsetFrom(planaddr, Plan, total_cost));
147 | bpf_probe_read_user(&node->plan_data.plan_rows,
148 | sizeof(double),
149 | OffsetFrom(planaddr, Plan, plan_rows));
150 | bpf_probe_read_user(&node->plan_data.plan_width,
151 | sizeof(int),
152 | OffsetFrom(planaddr, Plan, plan_width));
153 | bpf_probe_read_user(&node->plan_data.parallel_aware,
154 | sizeof(bool),
155 | OffsetFrom(planaddr, Plan, parallel_aware));
156 | /* Read the PlanState node data */
157 | bpf_probe_read_user(&node->planstate_tag,
158 | sizeof(int),
159 | OffsetFrom(nodeaddr, PlanState, type));
160 | bpf_probe_read_user(&node->lefttree,
161 | sizeof(void *),
162 | OffsetFrom(nodeaddr, PlanState, lefttree));
163 | bpf_probe_read_user(&node->righttree,
164 | sizeof(void *),
165 | OffsetFrom(nodeaddr, PlanState, righttree));
166 | bpf_probe_read_user(&instrument,
167 | sizeof(void *),
168 | OffsetFrom(nodeaddr, PlanState, instrument));
169 | if (instrument)
170 | bpf_probe_read_user(&node->instrument,
171 | STRUCT_SIZE_Instrumentation,
172 | instrument);
173 | }
174 | #endif
175 |
--------------------------------------------------------------------------------
/src/pgtracer/ebpf/collector/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Workhorse for pgtracer.
3 |
4 | The BPFCollector works by combining two things:
5 | - an ebpf program loaded in to the kernel, which is built on the fly
6 | - DWARF information extracted from the executable (or a separate debug
7 | symbols file).
8 | """
9 | from __future__ import annotations
10 |
11 | import ctypes as ct
12 | import os
13 | from dataclasses import dataclass
14 | from enum import IntEnum
15 | from pathlib import Path
16 | from threading import Lock, Thread
17 | from time import sleep
18 | from typing import Any, Callable, Dict, List, Optional, Tuple, Type, TypeVar, Union
19 |
20 | from bcc import BPF, USDT, PerfSWConfig, PerfType
21 | from bcc import __version__ as bcc_version
22 | from bcc import lib as bcclib
23 | from pypsutil import Process
24 |
25 | from ...model import MemoryAllocType, Query
26 | from ..dwarf import DWARFPointer, ProcessMetadata, Struct, get_size
27 | from ..unwind import stack_data_t
28 | from .c_defs import *
29 | from .utils import CODE_BASE_PATH, defines_dict_to_c, intenum_to_c, load_c_file
30 |
31 | BCC_VERSION_TUPLE = tuple(int(part) for part in bcc_version.split("."))
32 |
33 |
34 | class InvalidStateException(Exception):
35 | """
36 | Invalid State of a BPFCollector Exception.
37 |
38 | This Exception occurs when an operation is performed on a BPFCollector
39 | which is not in the prerequisite state.
40 | """
41 |
42 |
43 | # pylint: disable=invalid-name
44 | class EventHandler:
45 | """
46 | Base class for handling events.
47 |
48 | The handle_event method dispatches to handle_{EventType} methods if they
49 | exist. This acts mostly as a namespace to not pollute the BPFCollector
50 | class itself.
51 | """
52 |
53 | def __init__(self) -> None:
54 | pass
55 |
56 | def handle_event(self, bpf_collector: BPFCollector, event: ct._CData) -> int:
57 | """
58 | Handle an event from EBPF ringbuffer.
59 | Every event should be tagged with a short int as its first member,
60 | identifying its type. It is then dispatched to the appropriate method,
61 | which will be able to make sense of the actual struct.
62 | """
63 | # All events should be tagged with the event's type
64 | event_stub = ct.cast(event, ct.POINTER(event_base)).contents
65 | event_type_name = EventType(event_stub.event_type).name
66 | pid = event_stub.pid
67 | method_name = f"handle_{event_type_name}"
68 | method: Callable[[BPFCollector, ct._CData, int], int] = getattr(
69 | self, method_name
70 | )
71 | return method(bpf_collector, event, pid)
72 |
73 | # pylint: disable=unused-argument
74 | def handle_ProcessExit(
75 | self, bpf_collector: BPFCollector, event: ct._CData, pid: int
76 | ) -> int:
77 | """
78 | Handle ProcessExit event.
79 | """
80 | return bpf_collector.cleanup_process(pid)
81 |
82 | # pylint: disable=unused-argument
83 | def handle_ProcessFork(
84 | self, bpf_collector: BPFCollector, event: ct._CData, pid: int
85 | ) -> int:
86 | """
87 | Handle ProcessEnter event.
88 | """
89 | return bpf_collector.setup_process(pid)
90 |
91 |
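# Illustrative aside (not part of the upstream source): thanks to the getattr
# dispatch above, a subclass only needs to define a handle_<EventType> method
# to receive that event. A minimal sketch, reusing the ExecutorRun event type
# defined in c_defs.py:

class LoggingEventHandler(EventHandler):
    """Hypothetical handler that just logs executor starts."""

    def handle_ExecutorRun(
        self, bpf_collector: BPFCollector, event: ct._CData, pid: int
    ) -> int:
        print(f"executor started in backend {pid}")
        return 0

# Note that handle_event uses getattr without a default, so an unexpected
# event type raises AttributeError: a subclass must cover every EventType it
# can receive.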
92 | @dataclass
93 | class CollectorOptions:
94 | """
95 | Base class for BPFCollector Options.
96 | """
97 |
98 | enable_perf_events: bool = True
99 | sample_freq: int = 1200
100 |
101 |
102 | T = TypeVar("T", bound="BPFCollector")
103 |
104 |
105 | class BPFCollector:
106 | """
107 | Workhorse for pgtracer.
108 |
109 | This class allows the user to load an EBPF program dynamically generated
110 | using supplied options and extracted metadata about the Postgres
111 | executable.
112 | """
113 |
114 | options_cls: Type[CollectorOptions] = CollectorOptions
115 | event_handler_cls: Type[EventHandler] = EventHandler
116 |
117 | ExecEndFuncs = [
118 | "ExecEndAgg",
119 | "ExecEndAppend",
120 | "ExecEndBitmapAnd",
121 | "ExecEndBitmapHeapScan",
122 | "ExecEndBitmapIndexScan",
123 | "ExecEndBitmapOr",
124 | "ExecEndCteScan",
125 | "ExecEndCustomScan",
126 | "ExecEndForeignScan",
127 | "ExecEndFunctionScan",
128 | "ExecEndGather",
129 | "ExecEndGatherMerge",
130 | "ExecEndGroup",
131 | "ExecEndHash",
132 | "ExecEndHashJoin",
133 | "ExecEndIncrementalSort",
134 | "ExecEndIndexOnlyScan",
135 | "ExecEndIndexScan",
136 | "ExecEndLimit",
137 | "ExecEndLockRows",
138 | "ExecEndMaterial",
139 | "ExecEndMemoize",
140 | "ExecEndMergeAppend",
141 | "ExecEndMergeJoin",
142 | "ExecEndModifyTable",
143 | "ExecEndNamedTuplestoreScan",
144 | "ExecEndNode",
145 | "ExecEndNestLoop",
146 | "ExecEndProjectSet",
147 | "ExecEndRecursiveUnion",
148 | "ExecEndResult",
149 | "ExecEndSampleScan",
150 | "ExecEndSeqScan",
151 | "ExecEndSetOp",
152 | "ExecEndSort",
153 | "ExecEndSubqueryScan",
154 | "ExecEndTableFuncScan",
155 | "ExecEndTidRangeScan",
156 | "ExecEndTidScan",
157 | "ExecEndUnique",
158 | "ExecEndValuesScan",
159 | "ExecEndWindowAgg",
160 | "ExecEndWorkTableScan",
161 | ]
162 |
163 | def __init__(
164 | self,
165 | metadata: ProcessMetadata,
166 | options: Optional[CollectorOptions] = None,
167 | include_children: bool = False,
168 | ):
169 | if options is None:
170 | options = self.options_cls()
171 | self.options = options
172 | self.include_children = include_children
173 | self.anon_map_fds: Dict[int, int] = {}
174 | self.ppid: Optional[int]
175 | if include_children:
176 | self.pid = -1
177 | self.ppid = metadata.pid
178 | else:
179 | self.pid = metadata.pid
180 | self.ppid = None
181 |
182 | self.metadata = metadata
183 | self.program = str(self.metadata.program).encode("utf8")
184 | # Old bcc versions don't support global usdt probes, so disable
185 | # memory tracking in that case.
186 | if self.include_children is False or BCC_VERSION_TUPLE >= (0, 19, 0):
187 | self.usdt_ctx = USDT(metadata.pid)
188 | self.enable_usdt_probes(self.usdt_ctx)
189 | else:
190 | self.usdt_ctx = None
191 | self.bpf = self.prepare_bpf()
192 | self.setup_bpf_state()
193 | self.event_handler: EventHandler = self.event_handler_cls()
194 | self.update_struct_defs()
195 | self.is_running = False
196 | self.background_thread: Optional[Thread] = None
197 | self.lock = Lock()
198 | self.sample_freq = options.sample_freq
199 | self.backend_type: Optional[IntEnum] = None
200 |
201 | @classmethod
202 | def from_pid(
203 | cls: Type[T], pid: int, options: CollectorOptions = CollectorOptions()
204 | ) -> T:
205 | """
206 | Build a BPFCollector from a pid.
207 | """
208 | # FIXME: make this configurable
209 | cache_dir = Path("~/.cache").expanduser() / "pgtracer"
210 | process = Process(pid=pid)
211 | # Check if we are given the postmaster pid, or a backend.
212 | # If our parent is not itself a postgres process, we were given the postmaster pid and instrument the whole cluster.
213 | pprocess = process.parent()
214 | include_children = bool(
215 | pprocess and pprocess.name() not in ("postgres", "postmaster")
216 | )
217 | processmetadata = ProcessMetadata(process, cache_dir=cache_dir)
218 | return cls(processmetadata, options, include_children=include_children)
219 |
220 | def update_struct_defs(self) -> None:
221 | """
222 | Update the ctypes struct definitions from the DWARF metadata.
223 |
224 | Some C structs used in EBPF must match what is defined by Postgres:
225 | so we build the class dynamically after the DWARF file has been loaded.
226 | """
227 | global instrument_type # pylint: disable=global-statement
228 | instrument_type = ct.c_byte * self.metadata.structs.Instrumentation.size
229 | # Update global struct definitions with actual sizes
230 | portal_data.update_fields(
231 | {
232 | "query": ct.c_char * MAX_QUERY_LENGTH,
233 | "instrument": instrument_type,
234 | "search_path": ct.c_char * MAX_SEARCHPATH_LENGTH,
235 | }
236 | )
237 | planstate_data.update_fields({"instrument": instrument_type})
238 | stack_sample.update_fields({"portal_data": portal_data})
239 |
240 | @property
241 | def constant_defines(self) -> Dict[str, int]:
242 | """
243 | Returns a list of constants to add to the ebpf program as #define
244 | directives.
245 | """
246 | constants = {
247 | "STACK_TOP_ADDR": self.metadata.stack_top,
248 | # TODO: find a way to extract those?
249 | "POSTGRES_EPOCH_JDATE": 2451545,
250 | "UNIX_EPOCH_JDATE": 2440588,
251 | "SECS_PER_DAY": 86400,
252 | # TODO: make those configurable?
253 | "MAX_QUERY_NUMBER": 10,
254 | "MAX_QUERY_LENGTH": MAX_QUERY_LENGTH,
255 | "MAX_STACK_READ": 4096,
256 | "MAX_SEARCHPATH_LENGTH": MAX_SEARCHPATH_LENGTH,
257 | "EVENTRING_PAGE_SIZE": 131072,
258 | "MEMORY_REQUEST_MAXSIZE": MEMORY_REQUEST_MAXSIZE,
259 | "MEMORY_PATH_SIZE": MEMORY_PATH_SIZE,
260 | "LIBBCC_MAJOR_VERSION": BCC_VERSION_TUPLE[0],
261 | "LIBBCC_MINOR_VERSION": BCC_VERSION_TUPLE[1],
262 | "LIBBCC_PATCH_VERSION": BCC_VERSION_TUPLE[2],
263 | }
264 | if self.ppid is not None:
265 | constants["POSTMASTER_PID"] = self.ppid
266 | else:
267 | constants["PID"] = self.pid
268 | return constants
269 |
270 | @property
271 | def struct_offsets_defines(self) -> Dict[str, int]:
272 | """
273 | Build C-Code for the eBPF code to easily access named members in
274 | structs.
275 |
276 | We read the offset in a struct for known members, so that the eBPF code
277 | can read those members from the Postgres struct.
278 |
279 | This is necessary because we can't include Postgres headers in the eBPF
280 | code.
281 | """
282 | # Returns a normalized way of DEFINING struct offsets
283 | s = self.metadata.structs
284 |
285 | return {
286 | f"STRUCT_{struct}_OFFSET_{member}": getattr(s, struct)
287 | .field_definition(member)
288 | .offset
289 | for struct, member in (
290 | ("Node", "type"),
291 | ("Plan", "type"),
292 | ("Plan", "startup_cost"),
293 | ("Plan", "total_cost"),
294 | ("Plan", "plan_rows"),
295 | ("Plan", "plan_width"),
296 | ("Plan", "parallel_aware"),
297 | ("PlannedStmt", "queryId"),
298 | ("PlanState", "instrument"),
299 | ("PlanState", "plan"),
300 | ("PlanState", "type"),
301 | ("PlanState", "lefttree"),
302 | ("PlanState", "righttree"),
303 | ("PortalData", "creation_time"),
304 | ("PortalData", "queryDesc"),
305 | ("QueryDesc", "instrument_options"),
306 | ("QueryDesc", "planstate"),
307 | ("QueryDesc", "sourceText"),
308 | ("QueryDesc", "plannedstmt"),
309 | )
310 | }
311 |
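# Illustrative aside (not part of the upstream source): each entry in the
# dict above is later rendered by defines_dict_to_c into a #define that the
# eBPF code can use. For example, if the DWARF metadata reported that
# PlanState.instrument lives at byte offset 40 (a made-up value), the
# generated C line would be:
#
#     #define STRUCT_PlanState_OFFSET_instrument 40
#
# which is what the OffsetFrom-style accesses in the bundled C code rely on.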
312 | def make_global_variables_enum(self) -> Type[IntEnum]:
313 | """
314 | Create an IntEnum mapping global variables names to their address in
315 | the program.
316 | """
317 | mapping = {}
318 |
319 | for key in ("ActivePortal", "namespace_search_path"):
320 | mapping[key] = self.metadata.global_variable(key)
321 | # Mypy complains about dynamic enums
322 | globalenum = IntEnum("GlobalVariables", mapping) # type: ignore
323 |
324 | return globalenum
325 |
326 | def make_struct_sizes_dict(self) -> Dict[str, int]:
327 | """
328 | Create a dictionary mapping struct name to their bytesize.
329 |
330 | Once again, this is because we can't include Postgres headers and call
331 | "sizeof".
332 | """
333 | mapping = {}
334 |
335 | for key in ("Instrumentation",):
336 | mapping[f"STRUCT_SIZE_{key}"] = getattr(self.metadata.structs, key).size
337 |
338 | return mapping
339 |
340 | def _attach_uprobe(self, function_name: str, ebpf_function: str) -> None:
341 | """
342 | Helper to attach a uprobe executing `ebpf_function` at every
343 | `function_name` location.
344 | """
345 | for addr in self.metadata.function_addresses(function_name):
346 | self.bpf.attach_uprobe(
347 | name=self.program,
348 | fn_name=ebpf_function.encode("utf8"),
349 | addr=addr,
350 | pid=self.pid,
351 | )
352 |
353 | def _attach_uretprobe(self, function_name: str, ebpf_function: str) -> None:
354 | """
355 | Helper to attach a uretprobe executing `ebpf_function` at every
356 | `function_name` location.
357 | """
358 | # TODO: make sure multiple addresses work too
359 | for addr in self.metadata.function_addresses(function_name):
360 | self.bpf.attach_uretprobe(
361 | name=self.program,
362 | fn_name=ebpf_function.encode("utf8"),
363 | addr=addr,
364 | pid=self.pid,
365 | )
366 |
367 | def background_polling(self, refresh_rate: int) -> None:
368 | """
369 | Run the polling in the background.
370 | """
371 | while self.is_running:
372 | self.bpf.ring_buffer_poll(refresh_rate)
373 | sleep(refresh_rate / 1000.0)
374 |
375 | def attach_probes(self) -> None:
376 | """
377 | Attach the required probes for this collector.
378 | """
379 | if self.options.enable_perf_events:
380 | self.bpf.attach_perf_event(
381 | ev_type=PerfType.SOFTWARE,
382 | ev_config=PerfSWConfig.CPU_CLOCK,
383 | fn_name=b"perf_event",
384 | pid=self.pid,
385 | sample_freq=self.sample_freq,
386 | )
387 |
388 | def enable_usdt_probes(self, usdt: USDT) -> None:
389 | """
390 | Enable USDT probes.
391 | """
392 |
393 | def start(self) -> None:
394 | """
395 | Starts the bpf collector.
396 | """
397 |
398 | if self.is_running:
399 | raise InvalidStateException("BPF Collector is already running")
400 | print("Starting eBPF collector...")
401 | self.bpf[b"event_ring"].open_ring_buffer(self._handle_event)
402 | self.attach_probes()
403 | self.is_running = True
404 | self.background_thread = Thread(target=self.background_polling, args=(100,))
405 | self.background_thread.start()
406 | print("eBPF collector started")
407 |
408 | def stop(self) -> None:
409 | """
410 | Stop polling the collector.
411 | """
412 | self.is_running = False
413 | if self.background_thread:
414 | self.background_thread.join()
415 | self.background_thread = None
416 | for (
417 | pid,
418 | fd,
419 | ) in self.anon_map_fds.copy().items(): # pylint: disable=invalid-name
420 | os.close(fd)
421 | try:
422 | del self.bpf[b"pid_queues"][ct.c_int(pid)]
423 | except KeyError:
424 | pass
425 | self.anon_map_fds.clear()
426 | self.bpf.cleanup()
427 |
428 | # pylint: disable=unused-argument
429 | def _handle_event(self, cpu: int, data: ct._CData, size: int) -> int:
430 | """
431 | Callback for the ring_buffer_poll. We actually dispatch this to the
432 | `EventHandler`
433 | """
434 | # Returning a negative value aborts polling
435 | if not self.is_running:
436 | return -1
437 | return self.event_handler.handle_event(self, data)
438 |
439 | def _optional_code(self) -> str:
440 | """
441 | Load additional code, depending on options or the specific
442 | Collector type.
443 | """
444 | buf = ""
445 | if self.options.enable_perf_events:
446 | buf += load_c_file("perf.c")
447 | return buf
448 |
449 | def build_memory_request(
450 | self,
451 | event_type: EventType,
452 | request_id: Id128,
453 | base_addr: int,
454 | base_type: Type[Union[ct._CData, Struct, DWARFPointer]],
455 | path: List[str],
456 | ) -> memory_request:
457 | """
458 | Build a memory request from a request_id, a base_addr, a known base_type living
459 | at this addr and a path describing which fields to follow to the final memory location.
460 |
461 | The fields definitions are extracted from the debug symbols.
462 | """
463 | memory_path = (ct.c_ulonglong * MEMORY_PATH_SIZE)()
464 | # We have the base address, the path, and finally an offset 0 to read the memory itself.
465 | mempath_length = len(path) + 1
466 | assert mempath_length <= MEMORY_PATH_SIZE
467 | memory_path[0] = base_addr
468 | current_type = base_type
469 | current_idx = 0
470 | for part in path:
471 | # If we follow a pointer, add a new item to the underlying path.
472 | # Otherwise, just add to the previous type.
473 | if issubclass(current_type, DWARFPointer):
474 | current_type = current_type.pointed_type
475 | current_idx += 1
476 | memory_path[current_idx] = 0
477 | if issubclass(current_type, Struct):
478 | attr = current_type.field_definition(part)
479 | if attr is None:
480 | raise AttributeError(f"Type {current_type} has no field {part}")
481 | current_type = attr.member_type
482 | memory_path[current_idx] += attr.offset
483 | else:
484 | raise AttributeError(
485 | f"Cannot dereference field {part} from type {current_type}"
486 | )
487 | # For convenience, support the last field as a pointer.
488 | if issubclass(current_type, DWARFPointer) or current_type == ct.c_char_p:
489 | memory_path[current_idx + 1] = 0
490 | mempath_length += 1
491 | size = get_size(current_type, dereference=True)
492 |
493 | return memory_request(
494 | event_type=event_type,
495 | request_id=request_id,
496 | path_size=mempath_length,
497 | size=size,
498 | memory_path=memory_path,
499 | )
500 |
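# Illustrative aside (not part of the upstream source): a worked example of
# the path encoding above, with made-up offsets. The query tracer builds
# requests with base_type=structs.QueryDesc and path=["planstate",
# "instrument"]; the resulting array is roughly:
#
#     memory_path[0] = base_addr + offsetof(QueryDesc, planstate)
#     memory_path[1] = offsetof(PlanState, instrument)  # after following the pointer
#     memory_path[2] = 0  # instrument is itself a pointer: dereference once more
#
# The eBPF side performs one read per array slot, adding each offset to the
# pointer value read at the previous step, and ships the final memory back in
# a memory_response.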
501 | def send_memory_request(self, pid: int, request: memory_request) -> None:
502 | """
503 | Sends a memory request to the ebpf program.
504 | """
505 | ret = -1
506 | if pid in self.anon_map_fds:
507 | map_fd = self.anon_map_fds[pid]
508 | ret = bcclib.bpf_update_elem(ct.c_int(map_fd), 0, ct.byref(request), 0)
509 | if ret < 0:
510 | raise ValueError("Something went wrong while sending a memory request")
511 |
512 | def preprocess_code(self, buf: str) -> str:
513 | """
514 | Preprocess code for things macros are not allowed to do with BCC.
515 | """
516 | if self.include_children:
517 | buf = buf.replace(
518 | "##CHECK_POSTMASTER##",
519 | """{
520 | u64 ppid;
521 | struct task_struct* task_p = (struct task_struct*)bpf_get_current_task();
522 | struct task_struct* parent_task_p = task_p->real_parent;
523 | ppid = parent_task_p->tgid;
524 | if (ppid != POSTMASTER_PID)
525 | return 0;
526 | };""",
527 | )
528 | else:
529 | buf = buf.replace("##CHECK_POSTMASTER##", "")
530 | return buf
531 |
532 | def prepare_bpf(self) -> BPF:
533 | """
534 | Generate the eBPF program, both from static code and dynamically
535 | generated defines and enums.
536 | """
537 | buf = defines_dict_to_c(self.constant_defines)
538 | buf += defines_dict_to_c(self.struct_offsets_defines)
539 | buf += defines_dict_to_c(self.make_struct_sizes_dict())
540 | buf += intenum_to_c(EventType)
541 | buf += intenum_to_c(MemoryAllocType)
542 | buf += intenum_to_c(self.make_global_variables_enum())
543 | buf += load_c_file("program.c")
544 | buf += self._optional_code()
545 | # Ok, now work around some limitations of the macro system with bcc and implement our own.
546 | buf = self.preprocess_code(buf)
547 | # Add the code directory as include dir
548 | cflags = [f"-I{CODE_BASE_PATH}"]
549 | # Suppress some common warnings depending on bcc / kernel combinations
550 | cflags.append("-Wno-macro-redefined")
551 | cflags.append("-Wno-ignored-attributes")
552 | # Only enable global memory probe if bcc version is recent enough
553 | kwargs: Dict[str, Any] = {}
554 | if self.include_children and BCC_VERSION_TUPLE >= (0, 19, 0):
555 | kwargs["attach_usdt_ignore_pid"] = True
556 | kwargs["usdt_contexts"] = [self.usdt_ctx]
557 | bpf = BPF(text=buf.encode("utf8"), cflags=cflags, debug=0, **kwargs)
558 | return bpf
559 |
560 | def setup_bpf_state(self) -> None:
561 | """
562 | Set up the initial BPF state.
563 | """
564 | if self.pid > 0:
565 | self.setup_process(self.pid)
566 |
567 | def setup_process(self, pid: int) -> int:
568 | """
569 | Callback when a new process is created.
570 | """
571 | if self.options.enable_perf_events:
572 | new_map = bcclib.bcc_create_map(
573 | BPF_MAP_TYPE_QUEUE, None, 0, ct.sizeof(memory_request), 1024, 0
574 | )
575 | self.bpf[b"pid_queues"][ct.c_int(pid)] = ct.c_int(new_map)
576 | self.anon_map_fds[pid] = new_map
577 | return 0
578 |
579 | def cleanup_process(self, pid: int) -> int:
580 | """
581 | Callback when a process exits.
582 | """
583 | # If we instrument a single pid, exit
584 | if self.pid == pid:
585 | print(f"Process {pid} is terminating, stopping collection")
586 | self.is_running = False
587 | else:
588 | try:
589 | if pid in self.anon_map_fds:
590 | try:
591 | del self.bpf[b"pid_queues"][ct.c_int(pid)]
592 | except KeyError:
593 | pass
594 | os.close(self.anon_map_fds[pid])
595 | del self.anon_map_fds[pid]
596 | except KeyError:
597 | return 0
598 | return 0
599 |
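# Illustrative aside (not part of the upstream source): a minimal sketch of
# driving the collector, assuming a running Postgres process with pid 12345
# and enough privileges to load eBPF programs. The pid and sample frequency
# are placeholders.

def _example_collector_session() -> None:
    collector = BPFCollector.from_pid(12345, CollectorOptions(sample_freq=500))
    collector.start()  # attaches probes and starts the polling thread
    sleep(10)  # let it observe some activity
    collector.stop()  # joins the thread and releases BPF resources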
--------------------------------------------------------------------------------
/src/pgtracer/ebpf/collector/c_defs.py:
--------------------------------------------------------------------------------
1 | """
2 | Datastructure definitions used in the C ebpf code.
3 | """
4 | from __future__ import annotations
5 |
6 | import ctypes as ct
7 | from enum import IntEnum
8 | from typing import Dict, List, Tuple, Type
9 |
10 | from ..unwind import stack_data_t
11 |
12 | BPF_MAP_TYPE_QUEUE = 22
13 |
14 |
15 | class Id128(ct.Structure):
16 | """
17 | Structure containing two u64s, usable either as a single 64-bit int or as a tuple of two 64-bit ints.
18 | """
19 |
20 | _fields_ = [("u1", ct.c_ulonglong), ("u2", ct.c_ulonglong)]
21 |
22 | @classmethod
23 | def from_int(cls, intvalue: int) -> Id128:
24 | """
25 | Create an Id128 from a single integer.
26 | """
27 | return cls(intvalue, 0)
28 |
29 | def as_int(self) -> int:
30 | """
31 | Interpret an Id128 as a single integer.
32 | """
33 | val: int = self.u1
34 | return val
35 |
36 | @classmethod
37 | def from_tuple(cls, inttuple: Tuple[int, int]) -> Id128:
38 | """
39 | Create an Id128 from a two-ints tuple.
40 | """
41 | return cls(*inttuple)
42 |
43 | def as_tuple(self) -> Tuple[int, int]:
44 | """
45 | Interpret an Id128 as a two-int tuple.
46 | """
47 | return (self.u1, self.u2)
48 |
49 |
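# Illustrative aside (not part of the upstream source): Id128 is just two
# 64-bit slots, so the conversions round-trip as expected:
#
#     key = Id128.from_tuple((42, 1700000000))
#     assert key.as_tuple() == (42, 1700000000)
#     assert Id128.from_int(7).as_int() == 7
#
# The query tracer uses the tuple form for portal keys (the second element
# appears to be the portal creation time) and the int form for plan node
# addresses.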
50 | # pylint: disable=invalid-name
51 | class EventType(IntEnum):
52 | """
53 | EventTypes generated by the EBPF code.
54 | """
55 |
56 | ExecutorRun = 1
57 | ExecutorFinish = 2
58 | DropPortalEnter = 3
59 | DropPortalReturn = 4
60 | ExecProcNodeFirst = 5
61 | ExecEndNode = 6
62 | KBlockRqIssue = 7
63 | StackSample = 8
64 | MemoryResponseQueryInstr = 9
65 | MemoryResponseNodeInstr = 10
66 | MemoryNodeData = 11
67 | GUCResponse = 12
68 | MemoryAccount = 13
69 | ProcessFork = 14
70 | ProcessExit = 15
71 |
72 |
73 | instrument_type = ct.c_byte * 0
74 |
75 |
76 | class StubStructure(ct.Structure):
77 | """
78 | StubStructure definition, whose actual fields must be updated at runtime.
79 | """
80 |
81 | _protofields: List[Tuple[str, Type[ct._CData]]] = []
82 |
83 | @classmethod
84 | def update_fields(cls, fields: Dict[str, Type[ct._CData]]) -> None:
85 | """
86 | Update the structure fields.
87 | """
88 | if hasattr(cls, "_fields_"):
89 | # We are not allowed to update it. But if all updated values are
90 | # the same as the first update, we don't care.
91 | fields_dict = dict(cls._fields_) # type: ignore
92 | for key, value in fields.items():
93 | if fields_dict[key] != value:
94 | raise ValueError("Cannot update a struct more than once.")
95 | return
96 | fields_dict = dict(cls._protofields)
97 | fields_dict.update(fields)
98 | cls._fields_ = list(fields_dict.items())
99 |
100 |
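# Illustrative aside (not part of the upstream source): this hook is how the
# collector injects sizes that are only known once DWARF metadata has been
# loaded. A sketch with a made-up Instrumentation size of 96 bytes:
#
#     instrument_type = ct.c_byte * 96
#     planstate_data.update_fields({"instrument": instrument_type})
#
# ctypes forbids redefining _fields_ once set, which is why update_fields
# tolerates repeat calls only when they carry identical values.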
101 | MAX_QUERY_LENGTH = 2048
102 | MAX_SEARCHPATH_LENGTH = 1024
103 |
104 |
105 | class event_base(ct.Structure):
106 | """
107 | Common fields for all events.
108 | """
109 |
110 | _fields_ = [("event_type", ct.c_short), ("pid", ct.c_int)]
111 |
112 |
113 | class portal_data(StubStructure):
114 | """
115 | Represents the portal_data associated with a portal.
116 | """
117 |
118 | _protofields = [
119 | ("event", event_base),
120 | ("portal_key", Id128),
121 | ("query_addr", ct.c_ulonglong),
122 | ("query_id", ct.c_ulonglong),
123 | ("startup_cost", ct.c_double),
124 | ("total_cost", ct.c_double),
125 | ("plan_rows", ct.c_double),
126 | ("query", ct.c_char * MAX_QUERY_LENGTH),
127 | ("instrument", instrument_type),
128 | ("search_path", ct.c_char * MAX_SEARCHPATH_LENGTH),
129 | ]
130 |
131 |
132 | class io_req_data(ct.Structure):
133 | """
134 | Represents the io_req_data coming from instrumenting the kernel.
135 | """
136 |
137 | _fields_ = [
138 | ("event", event_base),
139 | ("rwbs", ct.c_char * 8),
140 | ("bytes", ct.c_ulonglong),
141 | ]
142 |
143 |
144 | class plan_data(ct.Structure):
145 | """
146 | Represents the data associated with a PlanNode.
147 | """
148 |
149 | _fields_ = [
150 | ("plan_addr", ct.c_ulonglong),
151 | ("plan_tag", ct.c_int),
152 | ("startup_cost", ct.c_double),
153 | ("total_cost", ct.c_double),
154 | ("plan_rows", ct.c_double),
155 | ("plan_width", ct.c_int),
156 | ("parallel_aware", ct.c_bool),
157 | ]
158 |
159 |
160 | class planstate_data(StubStructure):
161 | """
162 | Represents the data associated with a PlanState node.
163 | """
164 |
165 | _protofields = [
166 | ("event", event_base),
167 | ("portal_key", Id128),
168 | ("planstate_addr", ct.c_ulonglong),
169 | ("planstate_tag", ct.c_int),
170 | ("lefttree", ct.c_ulonglong),
171 | ("righttree", ct.c_ulonglong),
172 | ("plan_data", plan_data),
173 | ("instrument", instrument_type),
174 | ("stack_capture", stack_data_t),
175 | ]
176 |
177 |
178 | MEMORY_REQUEST_MAXSIZE = 131072
179 | MEMORY_PATH_SIZE = 5
180 |
181 |
182 | class memory_request(ct.Structure):
183 | """
184 | Represents a memory request, to be processed in the perf event handler.
185 | """
186 |
187 | _fields_ = [
188 | ("event_type", ct.c_short),
189 | ("request_id", Id128),
190 | ("path_size", ct.c_int),
191 | ("size", ct.c_ulonglong),
192 | ("memory_path", ct.c_ulonglong * MEMORY_PATH_SIZE),
193 | ]
194 |
195 |
196 | class memory_response(ct.Structure):
197 | """
198 | Represents a memory response, sent back from the perf event handler.
199 | """
200 |
201 | _fields_ = [
202 | ("event", event_base),
203 | ("request_id", Id128),
204 | ("payload", ct.c_char * MEMORY_REQUEST_MAXSIZE),
205 | ]
206 |
207 | @property
208 | def payload_addr(self) -> int:
209 | """
210 | Returns the address of the payload field: useful to parse it into its
211 | own struct.
212 | """
213 | return ct.addressof(self) + memory_response.payload.offset
214 |
215 |
216 | class stack_sample(StubStructure):
217 | """
218 | Represents a stack sample, sent back from the perf event handler.
219 | """
220 |
221 | _protofields = [("portal_data", portal_data), ("stack_data", stack_data_t)]
222 |
--------------------------------------------------------------------------------
/src/pgtracer/ebpf/collector/guc.py:
--------------------------------------------------------------------------------
1 | """
2 | This module defines the collector for getting / setting GUC.
3 | """
4 | from __future__ import annotations
5 |
6 | import ctypes as ct
7 | import struct
8 | from dataclasses import dataclass
9 | from typing import Any, BinaryIO, Dict, Optional, Tuple, Type
10 |
11 | from elftools.elf.elffile import ELFFile
12 |
13 | from ...utils import readcstr
14 | from ..dwarf import ProcessMetadata, Struct
15 | from . import BPFCollector, CollectorOptions, EventHandler
16 | from .c_defs import event_base
17 | from .utils import load_c_file
18 |
19 | GUC_MAX_LENGTH = 128
20 |
21 |
22 | # pylint: disable=invalid-name
23 | class guc_request(ct.Structure):
24 | """
25 | A request to set a guc.
26 | """
27 |
28 | _fields_ = [
29 | ("guc_location", ct.c_ulonglong),
30 | ("guc_size", ct.c_int),
31 | ("payload", ct.c_byte * GUC_MAX_LENGTH),
32 | ]
33 |
34 |
35 | # pylint: disable=invalid-name
36 | class guc_response(ct.Structure):
37 | """
38 | A response to a guc_request.
39 | """
40 |
41 | _fields_ = [
42 | ("event", event_base),
43 | ("guc_location", ct.c_ulonglong),
44 | ("status", ct.c_bool),
45 | ]
46 |
47 |
48 | class GUCTracerOptions(CollectorOptions):
49 | """
50 | Dataclass for GUCTracerBPFCollector options.
51 | """
52 |
53 | sample_freq: int = 3000
54 | guc_to_watch: Dict[str, str] = {}
55 |
56 |
57 | class GUCTracerEventHandler(EventHandler):
58 | """
59 | EventHandler for the GUCTracerBPFCollector.
60 | """
61 |
62 | def __init__(self) -> None:
63 | super().__init__()
64 | self.pending_names_req: Dict[int, GUCDefinition] = {}
65 |
66 | # pylint: disable=invalid-name
67 | def handle_GUCResponse(
68 | self, bpf_collector: GUCTracerBPFCollector, event: ct._CData, pid: int
69 | ) -> int:
70 | """
71 | Handle GUCResponse messages.
72 | """
73 | event = ct.cast(event, ct.POINTER(guc_response)).contents
74 | guc_def, value = bpf_collector.pending_guc_sets.pop(event.guc_location)
75 | if event.status:
76 | print(
77 | f"GUC {guc_def.guc_name}@{event.guc_location} has been successfully set to {value}"
78 | )
79 | else:
80 | print(
81 | f"GUC {guc_def.guc_name}@{event.guc_location} has failed to be set to {value}"
82 | )
83 | return 0
84 |
85 |
86 | @dataclass
87 | class GUCDefinition:
88 | """
89 | A GUC definition, extracted from the binary.
90 | """
91 |
92 | guc_type: str
93 | guc_name: str
94 | guc_location: int
95 |
96 |
97 | class GUCTracerBPFCollector(BPFCollector):
98 | """
99 | BPF Collector tracing GUCs and potentially modifying them.
100 | """
101 |
102 | options_cls = GUCTracerOptions
103 | event_handler_cls = GUCTracerEventHandler
104 |
105 | GUC_TABLE_TYPE_TO_VARIABLE = {
106 | "config_bool": "ConfigureNamesBool",
107 | "config_int": "ConfigureNamesInt",
108 | "config_real": "ConfigureNamesReal",
109 | "config_string": "ConfigureNamesString",
110 | "config_enum": "ConfigureNamesEnum",
111 | }
112 |
113 | def __init__(
114 | self,
115 | metadata: ProcessMetadata,
116 | options: Optional[CollectorOptions] = None,
117 | include_children: bool = False,
118 | ):
119 | if include_children:
120 | raise NotImplementedError(
121 | "GUC Tracer does not support attaching to the whole cluster."
122 | )
123 | self.options: CollectorOptions
124 | self.guc_defs: Dict[str, GUCDefinition] = {}
125 | self.pending_guc_sets: Dict[int, Tuple[GUCDefinition, Any]] = {}
126 | # We must not rely on the debug-symbol elffile, but instead on the one
127 | # from the executable itself.
128 | with ELFFile.load_from_path(metadata.program) as elf:
129 | reladyn = elf.get_section_by_name(".rela.dyn")
130 | self.relocations: Dict[int, int] = {
131 | reloc["r_offset"]: reloc["r_addend"]
132 | for reloc in reladyn.iter_relocations()
133 | }
134 | self.ready = False
135 | super().__init__(metadata, options)
136 |
137 | def _relocate_addr(self, addr: int) -> int:
138 | """
139 | Relocate an address from the .rela.dyn section information.
140 | """
141 | if addr in self.relocations:
142 | return self.relocations[addr]
143 | return 0
144 |
145 | def _load_one_gucdef(
146 | self, addr: int, gucdef_type: Type[Struct], binfile: BinaryIO
147 | ) -> Optional[GUCDefinition]:
148 | """
149 | Load one GUC definition from the binary
150 | """
151 | # First look up its name. We could just use the base address
152 | # since it's the first member, but better make it correct.
153 | gen_definition = gucdef_type.field_definition("gen")
154 | if gen_definition is None:
155 | raise ValueError(
156 | f"Could not find member gen in struct {gucdef_type.__name__}"
157 | )
158 | name_definition = gen_definition.member_type.field_definition("name") # type: ignore
159 | if name_definition is None:
160 | raise ValueError(
161 | f"Could not find member name in struct {gen_definition.member_type.__name__}"
162 | )
163 | name_pointer_addr = addr + gen_definition.offset + name_definition.offset
164 | # Now lookup the relocation information for that address
165 | reloced_addr = self._relocate_addr(name_pointer_addr)
166 | if reloced_addr == 0:
167 | return None
168 | # Now we can read the data from the binary
169 | binfile.seek(reloced_addr)
170 | guc_bname = readcstr(binfile)
171 | guc_name = guc_bname.decode("utf8")
172 | # Now relocate the GUC global variable address
173 | variable_definition = gucdef_type.field_definition("variable")
174 | if variable_definition is None:
175 | raise ValueError(
176 | f"Could not find member variable in struct {gucdef_type.__name__}"
177 | )
178 |
179 | variable_pointer_addr = addr + variable_definition.offset
180 | reloced_addr = self._relocate_addr(variable_pointer_addr)
181 | return GUCDefinition(
182 | guc_name=guc_name,
183 | guc_type=gucdef_type.__name__.replace("config_", ""),
184 | guc_location=reloced_addr + self.metadata.base_addr,
185 | )
186 |
187 | def _load_guc_defs_from_binary(self) -> None:
188 | """
189 | Load GUC definitions from the binary executable.
190 | """
191 | with open(self.metadata.program, "rb") as programbin:
192 | for typname, variable_name in self.GUC_TABLE_TYPE_TO_VARIABLE.items():
193 | deftype = getattr(self.metadata.structs, typname)
194 | typsize = deftype.size
195 | variable_addr = self.metadata.global_variable(variable_name)
196 | if variable_addr is None:
197 | raise ValueError(
198 | f"Could not locate global variable {variable_name}"
199 | )
200 | addr = variable_addr - self.metadata.base_addr
201 |
202 | # Now iterate over the entries.
203 | while True:
204 | guc = self._load_one_gucdef(addr, deftype, programbin)
205 | if guc is None:
206 | break
207 | self.guc_defs[guc.guc_name] = guc
208 | addr += typsize
209 |
210 | def set_guc(self, guc_name: str, guc_value: str) -> None:
211 | """
212 | Send a request to set a GUC to a specific value.
213 | """
214 | guc_def = self.guc_defs[guc_name]
215 | guc_c_value: Optional[bytes] = None
216 | if guc_def.guc_type != "int":
217 | raise NotImplementedError("We only support ints for now.")
218 | guc_c_value = struct.pack("i", int(guc_value))
219 | guc_ct_value: ct._CData = ct.create_string_buffer(guc_c_value, GUC_MAX_LENGTH)
220 | guc_ct_value = ct.cast(
221 | guc_ct_value, ct.POINTER(ct.c_byte * GUC_MAX_LENGTH)
222 | ).contents
223 | guc_req = guc_request(
224 | ct.c_ulonglong(guc_def.guc_location), guc_size=4, payload=guc_ct_value
225 | )
226 | self.pending_guc_sets[guc_def.guc_location] = guc_def, guc_value
227 | self.bpf[b"gucs_to_set"].push(guc_req)
228 |
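# Illustrative aside (not part of the upstream source): a sketch of what a
# caller would do, assuming the collector is started and the traced binary
# exposes work_mem as an int GUC:
#
#     collector.set_guc("work_mem", "8192")
#
# The request is pushed onto the gucs_to_set BPF queue and applied from one
# of the uretprobes attached below; the GUCResponse event then reports
# success or failure.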
229 | def setup_bpf_state(self) -> None:
230 | super().setup_bpf_state()
231 | # Build a mapping of GUC names to variables addresses
232 | self._load_guc_defs_from_binary()
233 |
234 | @property
235 | def constant_defines(self) -> Dict[str, int]:
236 | constants = super().constant_defines
237 | constants["GUC_MAX_LENGTH"] = GUC_MAX_LENGTH
238 | return constants
239 |
240 | def attach_probes(self) -> None:
241 | super().attach_probes()
242 | # Attach at various not-too-intrusive points.
243 | self._attach_uretprobe("BeginCommand", "process_guc_uprobe")
244 | self._attach_uretprobe("printtup", "process_guc_uprobe")
245 |
246 | self._attach_uretprobe("launcher_determine_sleep", "process_guc_uprobe")
247 | self._attach_uretprobe("vacuum_delay_point", "process_guc_uprobe")
248 |
249 | def _optional_code(self) -> str:
250 | buf = super()._optional_code()
251 | buf += load_c_file("gucset.c")
252 | return buf
253 |
--------------------------------------------------------------------------------
/src/pgtracer/ebpf/collector/querytracer.py:
--------------------------------------------------------------------------------
1 | """
2 | BPF Collector tracing queries.
3 | """
4 | from __future__ import annotations
5 |
6 | import ctypes as ct
7 | from dataclasses import dataclass, field
8 | from enum import IntEnum
9 | from typing import Dict, List, Optional, Tuple
10 |
11 | from bcc import USDT
12 |
13 | from pgtracer.ebpf.dwarf import ProcessMetadata
14 | from pgtracer.model.plan import PlanState
15 | from pgtracer.model.query import Query
16 |
17 | from ...model import PlanState, Query, memory_account
18 | from . import BPFCollector, CollectorOptions, EventHandler, EventType
19 | from .c_defs import (
20 | Id128,
21 | io_req_data,
22 | memory_response,
23 | planstate_data,
24 | portal_data,
25 | stack_sample,
26 | )
27 | from .utils import load_c_file
28 |
29 |
30 | class InstrumentationFlags(IntEnum):
31 | """
32 | Instrumentation flags.
33 |
34 | Mimic the InstrumentOption enum from PG.
35 | We define it statically here as it can be used from options.
36 | """
37 |
38 | TIMER = 1 << 0
39 | BUFFERS = 1 << 1
40 | ROWS = 1 << 2
41 | WAL = 1 << 3
42 | ALL = 0x7FFFFFFF # INT32 Max
43 |
44 |
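# Illustrative aside (not part of the upstream source): the flags combine
# bitwise, so a caller wanting timing plus buffer usage would pass:
#
#     options = QueryTracerOptions(
#         instrument_flags=InstrumentationFlags.TIMER | InstrumentationFlags.BUFFERS,
#     )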
45 | @dataclass
46 | class QueryTracerOptions(CollectorOptions):
47 | """
48 | Dataclass for QueryTracerBPFCollector options.
49 | """
50 |
51 | instrument_flags: int = 0
52 | enable_nodes_collection: bool = False
53 | enable_query_discovery: bool = True
54 |
55 |
56 | @dataclass
57 | class PerProcessInfo:
58 | """
59 | Store information about the queries processed by a backend.
60 | """
61 |
62 | pid: int
63 | last_portal_key: Optional[Tuple[int, int]] = None
64 | query_history: List[Query] = field(default_factory=list)
65 | query_cache: Dict[Tuple[int, int], Query] = field(default_factory=dict)
66 | current_executor: Optional[Tuple[int, int]] = None
67 | current_query: Optional[Query] = None
68 |
69 |
70 | # pylint: disable=invalid-name
71 | class QueryTracerEventHandler(EventHandler):
72 | """
73 | EventHandler for QueryTracer.
74 | """
75 |
76 | def __init__(self) -> None:
77 | self.per_process_info: Dict[int, PerProcessInfo] = {}
78 | self.next_request_id = 0
79 | self.process_history: List[PerProcessInfo] = []
80 |
81 | def get_process_info(self, pid: int) -> PerProcessInfo:
82 | """
83 | Returns the process info for a given PID, creating it if needed.
84 | """
85 | if pid not in self.per_process_info:
86 | self.per_process_info[pid] = PerProcessInfo(pid)
87 | return self.per_process_info[pid]
88 |
89 | def _process_portal_data(
90 | self, bpf_collector: BPFCollector, event: portal_data, pid: int
91 | ) -> int:
92 | """
93 | Process the portal data. This is used both when a query starts, and when we see
94 | the first live query during query discovery.
95 | """
96 | key = event.portal_key.as_tuple()
97 | process_info = self.get_process_info(pid)
98 | process_info.current_executor = event.portal_key.as_tuple()
99 |
100 | if key not in process_info.query_cache:
101 | process_info.query_cache[key] = Query.from_event(
102 | bpf_collector.metadata, event
103 | )
104 | else:
105 | process_info.query_cache[key].update(bpf_collector.metadata, event)
106 | process_info.current_query = process_info.query_cache[key]
107 | # If perf events are enabled, start watching the query instrumentation.
108 | if bpf_collector.options.enable_perf_events:
109 | structs = bpf_collector.metadata.structs
110 | request = bpf_collector.build_memory_request(
111 | EventType.MemoryResponseQueryInstr,
112 | event.portal_key,
113 | event.query_addr,
114 | structs.QueryDesc,
115 | ["planstate", "instrument"],
116 | )
117 | bpf_collector.send_memory_request(pid, request)
118 | return 0
119 |
120 | def handle_ExecutorRun(
121 | self, bpf_collector: BPFCollector, event: ct._CData, pid: int
122 | ) -> int:
123 | """
124 | Handle ExecutorRun event. This event is produced by an uprobe on
125 | standard_ExecutorRun. See executorrun_enter in program.c.
126 |
127 | We record the fact that a query started, extracting relevant metadata
128 | already present at the query start.
129 | """
130 | if bpf_collector.options.enable_perf_events:
131 | bpf_collector.bpf[b"discovery_enabled"][ct.c_int(1)] = ct.c_bool(False)
132 | bpf_collector.bpf[b"discovery_enabled"][ct.c_int(2)] = ct.c_bool(False)
133 | event = ct.cast(event, ct.POINTER(portal_data)).contents
134 | return self._process_portal_data(bpf_collector, event, pid)
135 |
136 | # pylint: disable=unused-argument
137 | def handle_ExecutorFinish(
138 | self, bpf_collector: BPFCollector, event: ct._CData, pid: int
139 | ) -> int:
140 | """
141 | Handle ExecutorFinish event.
142 | """
143 | event = ct.cast(event, ct.POINTER(portal_data)).contents
144 | key = event.portal_key.as_tuple()
145 | process_info = self.get_process_info(pid)
146 | if process_info.current_executor:
147 | process_info.current_executor = None
148 | process_info.current_query = None
149 | if key in process_info.query_cache:
150 | process_info.query_cache[event.portal_key.as_tuple()].update(
151 | bpf_collector.metadata, event
152 | )
153 | return 0
154 |
155 | # pylint: disable=unused-argument
156 | def handle_DropPortalEnter(
157 | self, bpf_collector: BPFCollector, event: ct._CData, pid: int
158 | ) -> int:
159 | """
160 | Handle DropPortalEnter event. This event is produced by a uprobe on
161 | DropPortal. See portaldrop_enter in program.c.
162 |
163 | PortalDrop is called whenever a query is finished: once the last row
164 | has been read in the case of a single query, or when the cursor is
165 | closed in the case of a cursor.
166 |
167 | Since PortalDrop is responsible for cleaning up the portal, we record
168 | the instrumentation and other data about the query here, and remember
169 | its identifier. Only once we return from DropPortal will we actually
170 | clean up the query from our current cache, and append it to history.
171 | """
172 | event = ct.cast(event, ct.POINTER(portal_data)).contents
173 | process_info = self.get_process_info(pid)
174 | process_info.last_portal_key = event.portal_key.as_tuple()
175 | if process_info.last_portal_key in process_info.query_cache:
176 | process_info.query_cache[process_info.last_portal_key].update(
177 | bpf_collector.metadata, event
178 | )
179 | return 0
180 |
181 | # pylint: disable=unused-argument
182 | def handle_DropPortalReturn(
183 | self, bpf_collector: BPFCollector, event: ct._CData, pid: int
184 | ) -> int:
185 | """
186 | Handle DropPortalReturn event. This event is produced by an uretprobe on
187 | DropPortal. See portaldrop_return in program.c.
188 |
189 | We remove the query from the internal cache and append it to history.
190 | """
191 | event = ct.cast(event, ct.POINTER(portal_data)).contents
192 | process_info = self.get_process_info(pid)
193 | if process_info.last_portal_key is not None:
194 | if process_info.last_portal_key in process_info.query_cache:
195 | query = process_info.query_cache[process_info.last_portal_key]
196 | process_info.query_history.append(query)
197 | del process_info.query_cache[process_info.last_portal_key]
198 | process_info.last_portal_key = None
199 | process_info.current_executor = None
200 | process_info.current_query = None
201 | return 0
202 |
203 | def handle_ExecProcNodeFirst(
204 | self, bpf_collector: BPFCollector, event: ct._CData, pid: int
205 | ) -> int:
206 | """
207 | Handle ExecProcNodeFirst event. This event is produced by a uprobe on
208 | ExecProcNodeFirst.
209 |
210 | The goal here is to build a plan tree for the query.
211 | """
212 | event = ct.cast(event, ct.POINTER(planstate_data)).contents
213 | process_info = self.get_process_info(pid)
214 | query = process_info.query_cache.get(event.portal_key.as_tuple())
215 | if query is None:
216 | # We don't know this query: maybe it started running before us?
217 | return 0
218 | query.add_node_from_event(bpf_collector.metadata, event)
219 | if bpf_collector.options.enable_perf_events:
220 | request = bpf_collector.build_memory_request(
221 | EventType.MemoryResponseNodeInstr,
222 | Id128.from_int(event.planstate_addr),
223 | event.planstate_addr,
224 | bpf_collector.metadata.structs.PlanState,
225 | ["instrument"],
226 | )
227 | bpf_collector.send_memory_request(pid, request)
228 | return 0
229 |
230 | def handle_ExecEndNode(
231 | self, bpf_collector: BPFCollector, event: ct._CData, pid: int
232 | ) -> int:
233 | """
234 | Handle ExecEndNode event. This event is produced by a uprobe on
235 | ExecEndNode's implementations.
236 |
237 | Once the executor node is destroyed, we want to collect its
238 | instrumentation data, if any.
239 | """
240 | event = ct.cast(event, ct.POINTER(planstate_data)).contents
241 | process_info = self.get_process_info(pid)
242 | if process_info.last_portal_key is None:
243 | return 0
244 | query = process_info.query_cache.get(process_info.last_portal_key)
245 | if query is None:
246 | return 0
247 | node = query.nodes.get(event.planstate_addr)
248 | if node is None:
249 | return 0
250 | instrument_addr = ct.addressof(event.instrument)
251 | instrument = bpf_collector.metadata.structs.Instrumentation(instrument_addr)
252 | instrument.nloops = ct.c_double(instrument.nloops.value + 1) # type: ignore
253 | node.instrument = instrument
254 | return 0
255 |
256 | def handle_KBlockRqIssue(
257 | self, bpf_collector: BPFCollector, event: ct._CData, pid: int
258 | ) -> int:
259 | """
260 | Handle KBlockRqIssue event. This event is produced by a kernel
261 | tracepoint on block_rq_issue.
262 |
263 | This serves to keep a count of block IO performed by a device, which
264 | can be useful to compute "real" cache hit ratio.
265 | """
266 | event = ct.cast(event, ct.POINTER(io_req_data)).contents
267 | process_info = self.get_process_info(pid)
268 | # We try to attach it to a specific query.
269 | # If we don't have one, don't bother
270 | if not process_info.current_executor:
271 | return 0
272 | query = process_info.query_cache.get(process_info.current_executor)
273 | if query is None:
274 | return 0
275 | if b"R" in event.rwbs:
276 | query.io_counters["R"] += event.bytes
277 | elif b"W" in event.rwbs:
278 | query.io_counters["W"] += event.bytes
279 | return 0
280 |
281 | def handle_MemoryResponseQueryInstr(
282 | self, bpf_collector: BPFCollector, event: ct._CData, pid: int
283 | ) -> int:
284 | """
285 | Handle MemoryResponseQueryInstr
286 |
287 | We lookup the request_id, and update the given counters if needed.
288 | """
289 | ev = ct.cast(event, ct.POINTER(memory_response)).contents
290 |
291 | process_info = self.get_process_info(pid)
292 | if not process_info.current_executor:
293 | return 0
294 | # We have a memory response for the whole query
295 | query = process_info.query_cache.get(ev.request_id.as_tuple(), None)
296 | if query:
297 | instr = bpf_collector.metadata.structs.Instrumentation(ev.payload_addr)
298 | query.instrument = instr
299 | # Load all fields from the underlying memory.
300 | instr.as_dict(include_all=True)
301 | # Re-send the same request for continuous monitoring
302 | request = bpf_collector.build_memory_request(
303 | EventType.MemoryResponseQueryInstr,
304 | ev.request_id,
305 | query.addr,
306 | bpf_collector.metadata.structs.QueryDesc,
307 | ["planstate", "instrument"],
308 | )
309 |
310 | bpf_collector.send_memory_request(pid, request)
311 | return 0
312 |
313 | def handle_MemoryResponseNodeInstr(
314 | self, bpf_collector: BPFCollector, event: ct._CData, pid: int
315 | ) -> int:
316 | """
317 | Handle MemoryResponseNodeInstr produced as a response to some memory_request.
318 | """
319 | process_info = self.get_process_info(pid)
320 | if not process_info.current_executor:
321 | return 0
322 | query = process_info.query_cache.get(process_info.current_executor, None)
323 | ev = ct.cast(event, ct.POINTER(memory_response)).contents
324 | nodeid = ev.request_id.as_int()
325 | # We have a memory response for an individual node
326 | if query is not None and nodeid is not None:
327 | node = query.nodes.get(nodeid)
328 | if node is not None:
329 | instr = bpf_collector.metadata.structs.Instrumentation(ev.payload_addr)
330 | node.instrument = instr
331 | # Re-send the same request for continuous monitoring
332 | request = bpf_collector.build_memory_request(
333 | EventType.MemoryResponseNodeInstr,
334 | Id128.from_int(nodeid),
335 | nodeid,
336 | bpf_collector.metadata.structs.PlanState,
337 | ["instrument"],
338 | )
339 | bpf_collector.send_memory_request(pid, request)
340 | return 0
341 |
342 | def handle_MemoryNodeData(
343 | self, bpf_collector: BPFCollector, event: ct._CData, pid: int
344 | ) -> int:
345 | """
346 | Handle MemoryNodeData produced in response to a memory_request.
347 | """
348 | process_info = self.get_process_info(pid)
349 | if not process_info.current_executor:
350 | return 0
351 | ev = ct.cast(event, ct.POINTER(planstate_data)).contents
352 | query = process_info.query_cache.get(process_info.current_executor, None)
353 | if query is not None:
354 | node = query.add_node_from_event(bpf_collector.metadata, ev)
355 | if ev.lefttree and ev.lefttree not in query.nodes:
356 | leftchild = PlanState(ev.lefttree)
357 | leftchild.parent_node = node
358 | query.nodes[ev.lefttree] = leftchild
359 | node.children[leftchild] = None
360 | self._gather_node_info(bpf_collector, ev.lefttree, pid)
361 | if ev.righttree and ev.righttree not in query.nodes:
362 | rightchild = PlanState(ev.righttree)
363 | rightchild.parent_node = node
364 | query.nodes[ev.righttree] = rightchild
365 | node.children[rightchild] = None
366 | self._gather_node_info(bpf_collector, ev.righttree, pid)
367 | return 0
368 |
369 | def _gather_node_info(
370 | self, bpf_collector: BPFCollector, nodeaddr: int, pid: int
371 | ) -> None:
372 | """
373 | Send memory requests to gather information about a specific node.
374 | """
375 | req = bpf_collector.build_memory_request(
376 | EventType.MemoryNodeData,
377 | Id128.from_int(nodeaddr),
378 | nodeaddr,
379 | bpf_collector.metadata.structs.PlanState,
380 | [],
381 | )
382 | bpf_collector.send_memory_request(pid, req)
383 |
384 | def handle_StackSample(
385 | self, bpf_collector: BPFCollector, event: ct._CData, pid: int
386 | ) -> int:
387 | """
388 | Handle StackSample events produced during perf sampling.
389 | """
390 | ev = ct.cast(event, ct.POINTER(stack_sample)).contents
391 | process_info = self.get_process_info(pid)
392 | _, creation_time = ev.portal_data.portal_key.as_tuple()
393 | if creation_time:
394 | self._process_portal_data(bpf_collector, ev.portal_data, pid)
395 | bpf_collector.bpf[b"discovery_enabled"][ct.c_int(1)] = ct.c_bool(False)
396 | if process_info.current_query:
397 | # Now add the nodes from the stacktrace
398 | process_info.current_query.add_nodes_from_stack(
399 | bpf_collector.metadata, ev.stack_data
400 | )
401 | # And add memory_requests to gather their information.
402 | for node in process_info.current_query.nodes.values():
403 | if node.is_stub and node.addr:
404 | self._gather_node_info(bpf_collector, node.addr, pid)
405 | return 0
406 |
407 | def handle_MemoryAccount(
408 | self, bpf_collector: BPFCollector, event: ct._CData, pid: int
409 | ) -> int:
410 | """
411 | Handle MemoryAccount events produced by malloc instrumentation.
412 | """
413 | ev = ct.cast(event, ct.POINTER(memory_account)).contents
414 | process_info = self.get_process_info(pid)
415 | if process_info.current_query:
416 | process_info.current_query.memallocs.update(ev)
417 | return 0
418 |
419 |
420 | class QueryTracerBPFCollector(BPFCollector):
421 | """
422 | BPF Collector tracing queries and optionally individual nodes.
423 | """
424 |
425 | options_cls = QueryTracerOptions
426 | event_handler_cls = QueryTracerEventHandler
427 |
428 | def __init__(
429 | self,
430 | metadata: ProcessMetadata,
431 | options: Optional[QueryTracerOptions] = None,
432 | include_children: bool = False,
433 | ):
434 | self.options: QueryTracerOptions
435 | self.event_handler: QueryTracerEventHandler
436 | super().__init__(metadata, options, include_children)
437 |
438 | def attach_probes(self) -> None:
439 | super().attach_probes()
440 | self._attach_uprobe("PortalDrop", "portaldrop_enter")
441 | self._attach_uretprobe("PortalDrop", "portaldrop_return")
442 | self._attach_uprobe("standard_ExecutorStart", "executorstart_enter")
443 | self._attach_uprobe("standard_ExecutorRun", "executorrun_enter")
444 | self._attach_uprobe("ExecutorFinish", "executorfinish_enter")
445 | self._attach_uprobe("mmap", "mmap_enter")
446 | self.bpf.attach_uprobe(
447 | name=b"c", sym=b"mmap", fn_name=b"mmap_enter", pid=self.pid
448 | )
449 | self.bpf.attach_uprobe(
450 | name=b"c", sym=b"munmap", fn_name=b"munmap_enter", pid=self.pid
451 | )
452 | if self.options.enable_nodes_collection:
453 | self._attach_uprobe("ExecProcNodeFirst", "execprocnodefirst_enter")
454 | for func in self.ExecEndFuncs:
455 | self._attach_uprobe(func, "execendnode_enter")
456 |
457 | def enable_usdt_probes(self, usdt: USDT) -> None:
458 | usdt.enable_probe(probe="libc:memory_sbrk_less", fn_name="sbrk_less")
459 | usdt.enable_probe(probe="libc:memory_sbrk_more", fn_name="sbrk_more")
460 |
461 | @property
462 | def constant_defines(self) -> Dict[str, int]:
463 | constants = super().constant_defines
464 | # USER_INSTRUMENT_FLAGS is defined only if the user wants to
465 | # unconditionally turn on instrumentation.
466 | if self.options.instrument_flags:
467 | constants["USER_INSTRUMENT_FLAGS"] = self.options.instrument_flags
468 | if self.options.enable_query_discovery:
469 | if not self.ppid:
470 | constants["ENABLE_QUERY_DISCOVERY"] = True
471 | return constants
472 |
473 | def _optional_code(self) -> str:
474 | buf = super()._optional_code()
475 | if self.options.enable_nodes_collection:
476 | buf += load_c_file("plan.c")
477 | buf += load_c_file("block_rq.c")
478 | buf += load_c_file("memusage.c")
479 | return buf
480 |
481 | def setup_bpf_state(self) -> None:
482 | # FIXME: get rid of those magic numbers.
483 | super().setup_bpf_state()
484 | if self.options.enable_perf_events:
485 | self.bpf[b"discovery_enabled"][ct.c_int(1)] = ct.c_bool(
486 | self.options.enable_query_discovery
487 | )
488 | self.bpf[b"discovery_enabled"][ct.c_int(2)] = ct.c_bool(
489 | self.options.enable_query_discovery
490 | )
491 |
492 | def cleanup_process(self, pid: int) -> int:
493 | if pid in self.event_handler.per_process_info:
494 | self.event_handler.process_history.append(
495 | self.event_handler.per_process_info.pop(pid)
496 | )
497 | return super().cleanup_process(pid)
498 |
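# Illustrative aside (not part of the upstream source): a minimal end-to-end
# sketch with a placeholder pid; the bundled pgtrace_queries script is the
# canonical entry point.

def _example_query_tracing_session() -> None:
    options = QueryTracerOptions(enable_nodes_collection=True)
    collector = QueryTracerBPFCollector.from_pid(12345, options)
    collector.start()
    ...  # wait while the backend runs some queries
    collector.stop()
    for info in collector.event_handler.per_process_info.values():
        for query in info.query_history:
            print(query)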
--------------------------------------------------------------------------------
/src/pgtracer/ebpf/collector/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Various utilities for collector implementations.
3 | """
4 | from enum import IntEnum
5 | from pathlib import Path
6 | from typing import Any, Dict, Type
7 |
8 |
9 | def intenum_to_c(intenum: Type[IntEnum]) -> str:
10 | """
11 | Generate C code defining an enum corresponding to a Python IntEnum.
12 | """
13 | buf = f"enum {intenum.__name__} {{\n"
14 | members = []
15 |
16 | for member in intenum:
17 | members.append(f"{intenum.__name__}{member.name} = {member.value}")
18 | buf += ",\n".join(members)
19 | buf += "\n};\n"
20 |
21 | return buf
22 |
23 |
24 | def defines_dict_to_c(defines_dict: Dict[str, Any]) -> str:
25 | """
26 | Generate a string of C #define directives from a mapping.
27 | """
28 | return (
29 | "\n".join(f"#define {key} {value}" for key, value in defines_dict.items())
30 | + "\n"
31 | )
32 |
33 |
34 | CODE_BASE_PATH = Path(__file__).parent.parent / "code"
35 |
36 |
37 | def load_c_file(filename: str) -> str:
38 | """
39 | Loads a C file from the package code directory.
40 | """
41 | filepath = CODE_BASE_PATH / filename
42 | with filepath.open() as cfile:
43 | return cfile.read()
44 |
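# Illustrative aside (not part of the upstream source): given the EventType
# enum from c_defs.py, intenum_to_c produces C source along these lines
# (first entries shown):
#
#     enum EventType {
#     EventTypeExecutorRun = 1,
#     EventTypeExecutorFinish = 2,
#     ...
#     };
#
# and defines_dict_to_c({"MAX_QUERY_LENGTH": 2048}) yields the single line:
#
#     #define MAX_QUERY_LENGTH 2048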
--------------------------------------------------------------------------------
/src/pgtracer/ebpf/eh_frame_hdr.py:
--------------------------------------------------------------------------------
1 | """
2 | This module contains code for parsing an .eh_frame_hdr section.
3 | """
4 | from __future__ import annotations
5 |
6 | import struct
7 | from enum import IntEnum
8 | from typing import TYPE_CHECKING, Any, Iterable, Optional, Tuple, no_type_check
9 |
10 | from elftools.dwarf.callframe import CallFrameInfo
11 | from elftools.dwarf.enums import DW_EH_encoding_flags
12 | from elftools.elf.elffile import ELFFile
13 |
14 | if TYPE_CHECKING:
15 | from elftools.dwarf.callframe import CFIEntry
16 | from elftools.elf.sections import Section
17 |
18 | DW_EH_Encoding = IntEnum("DW_EH_Encoding", DW_EH_encoding_flags) # type: ignore
19 |
20 |
21 | class EhFrameHdr:
22 | """
23 | Parsed .eh_frame_hdr section
24 | """
25 |
26 | def __init__(self, section: Section, elffile: ELFFile):
27 | self.elffile = elffile
28 | self.section = section
29 | self.offset = self.section.global_offset
30 | self.eh_frame_hdr_start = self.section.stream.tell()
31 | # First read the fixed header
32 | (
33 | self.version,
34 | self.eh_frame_ptr_enc,
35 | self.fde_count_enc,
36 | self.table_enc,
37 | ) = self._unpack_from("<4B", offset=0)
38 | self.frame_ptr: int = self.read_value(self.eh_frame_ptr_enc) # type: ignore
39 | self.fde_count: int = self.read_value(self.fde_count_enc) # type: ignore
40 | self.table_start = self.section.stream.tell()
41 | self.dwarf_info = elffile.get_dwarf_info()
42 | self.cfi = CallFrameInfo(
43 | stream=self.dwarf_info.eh_frame_sec.stream,
44 | size=self.dwarf_info.eh_frame_sec.size,
45 | address=self.dwarf_info.eh_frame_sec.address,
46 | base_structs=self.dwarf_info.structs,
47 | for_eh_frame=True,
48 | )
49 |
50 | @no_type_check
51 | def read_value(
52 | self,
53 | encoding: int,
54 | offset: Optional[int] = None,
55 | relative: bool = True,
56 | program_counter: int = 0,
57 | ) -> int:
58 | """
59 | Read a value with the given encoding at the specific offset.
60 | relative indicates whether the offset is relative to the start of the
61 | section or absolute in the ELFFile.
62 | program_counter is the current program counter used for DW_EH_PE_pcrel calculations.
63 | """
64 | value_enc = encoding & 0x0F
65 | relative_enc = encoding & 0x70
66 | if value_enc == DW_EH_Encoding.DW_EH_PE_absptr:
67 | result = self._unpack_from("@B", offset=offset, relative=relative)
68 | elif value_enc == DW_EH_Encoding.DW_EH_PE_udata2:
69 | result = self._unpack_from("@H", offset=offset, relative=relative)
70 | elif value_enc == DW_EH_Encoding.DW_EH_PE_sdata2:
71 | result = self._unpack_from("@h", offset=offset, relative=relative)
72 | elif value_enc == DW_EH_Encoding.DW_EH_PE_udata4:
73 | result = self._unpack_from("@I", offset=offset, relative=relative)
74 | elif value_enc == DW_EH_Encoding.DW_EH_PE_sdata4:
75 | result = self._unpack_from("@i", offset=offset, relative=relative)
76 | elif value_enc == DW_EH_Encoding.DW_EH_PE_udata8:
77 | result = self._unpack_from("@Q", offset=offset, relative=relative)
78 | elif value_enc == DW_EH_Encoding.DW_EH_PE_sdata8:
79 | result = self._unpack_from("@q", offset=offset, relative=relative)
80 | else:
81 | raise ValueError(f"Unknown value encoding: {value_enc}")
82 |
83 | result = result[0]
84 |
85 | if relative_enc == DW_EH_Encoding.DW_EH_PE_absptr:
86 | pass
87 | elif relative_enc == DW_EH_Encoding.DW_EH_PE_pcrel:
88 | result += program_counter
89 | elif relative_enc == DW_EH_Encoding.DW_EH_PE_datarel:
90 | result += self.offset
91 | else:
92 | raise ValueError(f"Pointer encoding {relative_enc} not supported")
93 | return result
94 |
95 | @no_type_check
96 | def get_table_entry_size(self) -> int:
97 | """
98 | Returns the size of a table entry.
99 | """
100 | enc = self.table_enc & 0x0F
101 | if enc in (DW_EH_Encoding.DW_EH_PE_udata2, DW_EH_Encoding.DW_EH_PE_sdata2):
102 | return 4
103 | if enc in (DW_EH_Encoding.DW_EH_PE_udata4, DW_EH_Encoding.DW_EH_PE_sdata4):
104 | return 8
105 | if enc in (DW_EH_Encoding.DW_EH_PE_udata8, DW_EH_Encoding.DW_EH_PE_sdata8):
106 | return 16
107 | if enc == DW_EH_Encoding.DW_EH_PE_omit:
108 | return 0
109 | raise ValueError(f"Invalid table encoding: {enc}")
110 |
111 | def _read_section(
112 | self, size: int, offset: Optional[int], relative: bool = False
113 | ) -> Any:
114 | """
115 | Read `size` bytes from the underlying stream at the given `offset`.
116 | relative indicates whether the given offset is relative to the
117 | .eh_frame_hdr section start, or absolute in the ELFFile.
118 | """
119 | stream = self.section.stream
120 | if offset is not None:
121 | if relative:
122 | offset = offset + self.offset
123 | stream.seek(offset)
124 | return stream.read(size)
125 |
126 | def _unpack_from(
127 | self, fmt: str, offset: Optional[int] = None, relative: bool = False
128 | ) -> Tuple[int, ...]:
129 | """
130 | Unpack a value read at offset according to format.
131 | """
132 | size = struct.calcsize(fmt)
133 | buffer = self._read_section(size, offset, relative)
134 | return struct.unpack_from(fmt, buffer)
135 |
136 | def read_entry(self, offset: Optional[int] = None) -> Tuple[int, int]:
137 | """
138 |         Read a table entry at the given offset. .eh_frame_hdr table entries
139 |         are pairs of (initial location, offset of the corresponding FDE).
140 | """
141 | loc_val: int = self.read_value(self.table_enc, offset, relative=False)
142 | offset_val: int = self.read_value(self.table_enc)
143 | return (loc_val, offset_val)
144 |
145 | def iter_entries(self) -> Iterable[Tuple[int, int]]:
146 | """
147 |         Iterate over the .eh_frame_hdr table entries.
148 | """
149 | self.section.stream.seek(self.table_start)
150 | for _ in range(0, self.fde_count):
151 | yield self.read_entry()
152 |
153 | def find_fde(self, addrkey: int) -> Optional[CFIEntry]:
154 | """
155 |         Find an entry by doing a binary search over the sorted table.
156 | """
157 | minidx = 0
158 | maxidx = self.fde_count
159 | size = self.get_table_entry_size()
160 | while True:
161 | idx = minidx + (maxidx - minidx) // 2
162 | offset = self.table_start + idx * size
163 | (addr, loc) = self.read_entry(offset=offset)
164 |             # We found the key (or the closest entry below it); check the FDE range.
165 | if addrkey == addr or (minidx == idx and addrkey > addr):
166 | fde = self.cfi._parse_entry_at(
167 | loc - self.cfi.address
168 | ) # pylint: disable=protected-access
169 | if addrkey < fde.header.initial_location + fde.header.address_range:
170 | return fde
171 | # If the key is not in range, then we don't have an entry.
172 | return None
173 | if addrkey < addr:
174 | if maxidx == idx:
175 | return None
176 | maxidx = idx
177 | elif addrkey > addr:
178 | minidx = idx
179 |
180 | @classmethod
181 | def load_eh_frame_hdr(cls, elf_file: ELFFile) -> Optional[EhFrameHdr]:
182 | """
183 | Load an EHFrameHDR from an ELFFile.
184 | """
185 | eh_frame_hdr = elf_file.get_section_by_name(".eh_frame_hdr")
186 | if eh_frame_hdr is None:
187 | return None
188 |
189 | # pylint: disable=protected-access
190 | eh_frame_hdr = elf_file._read_dwarf_section(
191 | eh_frame_hdr, relocate_dwarf_sections=True
192 | )
193 | eh_frame_hdr_data = EhFrameHdr(eh_frame_hdr, elf_file)
194 | return eh_frame_hdr_data
195 |
--------------------------------------------------------------------------------
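Note: a minimal usage sketch of the EhFrameHdr class above, assuming the module is importable as pgtracer.ebpf.eh_frame_hdr; the binary path and the looked-up address are illustrative only.

    from elftools.elf.elffile import ELFFile

    from pgtracer.ebpf.eh_frame_hdr import EhFrameHdr

    with open("/usr/bin/true", "rb") as f:  # assumption: any local ELF binary
        elf = ELFFile(f)
        hdr = EhFrameHdr.load_eh_frame_hdr(elf)
        if hdr is not None:
            # Lookup addresses are relative to the binary's mapping base.
            fde = hdr.find_fde(0x1234)  # illustrative address
            if fde is not None:
                print(fde.header.initial_location, fde.header.address_range)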
/src/pgtracer/ebpf/unwind.py:
--------------------------------------------------------------------------------
1 | # pylint: disable=invalid-name
2 | """
3 | This module provides access to libunwind through ctypes.
4 | """
5 | from __future__ import annotations
6 |
7 | import ctypes as ct
8 | import ctypes.util
9 | import platform
10 | import re
11 | from functools import cached_property
12 | from pathlib import Path
13 | from typing import TYPE_CHECKING, Any, Generator, List, Optional, Tuple, Type, TypeVar
14 |
15 | from elftools.dwarf.callframe import CFARule, CFIEntry
16 | from elftools.dwarf.die import DIE, AttributeValue
17 | from elftools.dwarf.dwarf_expr import DWARFExprOp, DWARFExprParser
18 | from elftools.dwarf.locationlists import BaseAddressEntry, LocationEntry, LocationExpr
19 |
20 | from .dwarf import MappedRegion, ProcessMetadata, die_name
21 |
22 | if TYPE_CHECKING:
23 | try:
24 | from typing import TypeAlias # type: ignore
25 | except ImportError:
26 | from typing_extensions import TypeAlias
27 | CFuncPtr: TypeAlias = ct._FuncPointer # pylint: disable=protected-access
28 | Pointer: TypeAlias = ct.pointer
29 | SimpleCData = ct._SimpleCData[Any] # pylint: disable=protected-access
30 | else:
31 | # Make pylint happy
32 | CFuncPtr = object()
33 | Pointer = List
34 | SimpleCData = Any
35 |
36 |
37 | CT = TypeVar("CT", bound=SimpleCData)
38 |
39 | ARCH = platform.machine()
40 |
41 |
42 | def find_libunwind_version() -> Tuple[int, int]:
43 | """
44 | Returns the libunwind version.
45 | We try to extract this from the headers.
46 |
 47 |     TODO: maybe we should call cc to get the actual include dirs?
48 | """
49 | include_dir_candidates = [
50 | Path("/usr/include/"),
51 | Path(f"/usr/include/{ARCH}-linux-gnu/"),
52 | ]
53 | major_re = re.compile(r"#define UNW_VERSION_MAJOR\s+(\d+)")
54 | minor_re = re.compile(r"#define UNW_VERSION_MINOR\s+(\d+)")
55 | header_filename = Path("libunwind-common.h")
56 | major_version = None
57 | minor_version = None
58 | found = False
59 | for candidate in include_dir_candidates:
60 | include_file = candidate / header_filename
61 | if include_file.exists():
62 | with include_file.open() as f:
63 | for line in f:
64 | match = major_re.match(line)
65 | if match:
66 | found = True
67 | major_version = int(match.group(1))
68 | continue
69 | match = minor_re.match(line)
70 | if match:
71 | found = True
72 | minor_version = int(match.group(1))
73 | if found:
74 | break
75 | if major_version is None or minor_version is None:
 76 |         raise ValueError("Could not identify libunwind version!")
77 | return (major_version, minor_version)
78 |
79 |
80 | LIBUNWIND_VERSION = find_libunwind_version()
81 |
82 | UNW_PREFIX = f"_U{ARCH}_"
83 | libname = ctypes.util.find_library(f"unwind-{ARCH}")
84 | if libname is None:
85 | raise ImportError(f"Cannot load libunwind-{ARCH}")
86 | libunwind = ct.cdll.LoadLibrary(libname)
87 | if ARCH == "x86_64":
88 | UNW_TDEP_CURSOR_LEN = 127
89 | unw_word_t = ct.c_ulonglong
 90 |     UNW_WORD_T_FORMAT = "<Q"
179 | def unw_func(funcname: str) -> CFuncPtr:
180 | """
181 |     Returns the libunwind function pointer with that name; the exported
182 |     symbol names are prefixed differently depending on the architecture.
183 | """
184 | return getattr(libunwind, f"{UNW_PREFIX}{funcname}")
185 |
186 |
187 | class unw_dyn_remote_table_info_t(ct.Structure):
188 | """
189 | Mapping of unw_dyn_remote_table_info_t type.
190 | """
191 |
192 | _fields_ = [
193 | ("name_ptr", unw_word_t),
194 | ("segbase", unw_word_t),
195 | ("table_len", unw_word_t),
196 | ("table_data", unw_word_t),
197 | ]
198 |
199 |
200 | # We have to define the fields after the class, as it is a self-referencing
201 | # type.
202 | class unw_dyn_info_t(ct.Structure):
203 | """
204 | Mapping of unw_dyn_info_t type.
205 | """
206 |
207 |
208 | # Libunwind does not preserve perfect ABI compatibility.
209 | load_offset_field = []
210 | if LIBUNWIND_VERSION >= (1, 6):
211 | load_offset_field = [("load_offset", unw_word_t)]
212 |
213 | unw_dyn_info_t._fields_ = [ # pylint: disable=protected-access
214 | ("next", ct.POINTER(unw_dyn_info_t)),
215 | ("prev", ct.POINTER(unw_dyn_info_t)),
216 | ("start_ip", unw_word_t),
217 | ("end_ip", unw_word_t),
218 | ("gp", unw_word_t),
219 | ("format", ct.c_int32),
220 | ("pad", ct.c_int32),
221 | *load_offset_field,
222 |     ("rti", unw_dyn_remote_table_info_t),  # Supposed to be a union, but we will
223 |     # only ever use this member.
224 | ]
225 |
226 | unw_regnum_t = ct.c_int
227 | unw_fpreg_t = unw_tdep_fpreg_t
228 | # Opaque type
229 | unw_addr_space_t = ct.c_void_p
230 |
231 | # Definition of function types
232 | FIND_PROC_INFO_FUNCTYPE = ct.CFUNCTYPE(
233 | ct.c_int, # Return value
234 | unw_addr_space_t,
235 | unw_word_t,
236 | ct.POINTER(unw_proc_info_t),
237 | ct.c_int,
238 | ct.c_void_p,
239 | )
240 | PUT_UNWIND_INFO_FUNCTYPE = ct.CFUNCTYPE(
241 | None, unw_addr_space_t, ct.POINTER(unw_proc_info_t), ct.c_void_p
242 | )
243 | GET_DYN_INFO_LIST_ADDR_FUNCTYPE = ct.CFUNCTYPE(
244 | ct.c_int, unw_addr_space_t, ct.POINTER(unw_word_t), ct.c_void_p
245 | )
246 | ACCESS_MEM_FUNCTYPE = ct.CFUNCTYPE(
247 | ct.c_int,
248 | unw_addr_space_t,
249 | unw_word_t,
250 | ct.POINTER(unw_word_t),
251 | ct.c_int,
252 | ct.c_void_p,
253 | )
254 | ACCESS_REG_FUNCTYPE = ct.CFUNCTYPE(
255 | ct.c_int,
256 | unw_addr_space_t,
257 | unw_regnum_t,
258 | ct.POINTER(unw_word_t),
259 | ct.c_int,
260 | ct.c_void_p,
261 | )
262 | ACCESS_FPREG_FUNCTYPE = ct.CFUNCTYPE(
263 | ct.c_int,
264 | unw_addr_space_t,
265 | unw_regnum_t,
266 | ct.POINTER(unw_fpreg_t),
267 | ct.c_int,
268 | ct.c_void_p,
269 | )
270 | GET_PROC_NAME_FUNCTYPE = ct.CFUNCTYPE(
271 | ct.c_int,
272 | unw_addr_space_t,
273 | unw_word_t,
274 | ct.c_char_p,
275 | ct.c_size_t,
276 | ct.POINTER(unw_word_t),
277 | ct.c_void_p,
278 | )
279 |
280 | create_addr_space = unw_func("create_addr_space")
281 | create_addr_space.restype = ct.c_void_p
282 | create_addr_space.argtypes = [ct.c_void_p, ct.c_int]
283 |
284 | init_remote = unw_func("init_remote")
285 | init_remote.restype = ct.c_int
286 | init_remote.argtypes = [ct.c_void_p, ct.c_void_p, ct.c_int]
287 |
288 |
289 | dwarf_search_unwind_table = unw_func("dwarf_search_unwind_table")
290 | dwarf_search_unwind_table.restype = ct.c_int
291 | dwarf_search_unwind_table.argtypes = [
292 | unw_addr_space_t,
293 | unw_word_t,
294 | ct.POINTER(unw_dyn_info_t),
295 | ct.POINTER(unw_proc_info_t),
296 | ct.c_int,
297 | ct.c_void_p,
298 | ]
299 |
300 |
301 | class unw_cursor_t(ct.Structure):
302 | """
303 | Mapping of unw_cursor_t type.
304 | """
305 |
306 | _fields_ = [("opaque", unw_word_t * UNW_TDEP_CURSOR_LEN)]
307 |
308 |
309 | step = unw_func("step")
310 | step.restype = ct.c_int
311 | step.argtypes = [ct.POINTER(unw_cursor_t)]
312 |
313 | get_reg = unw_func("get_reg")
314 | get_reg.restype = ct.c_int
315 | get_reg.argtypes = [ct.POINTER(unw_cursor_t), unw_regnum_t, ct.POINTER(unw_word_t)]
316 |
317 |
318 | class unw_accesors(ct.Structure):
319 | """
320 | Mapping of unw_accessors type.
321 | """
322 |
323 | _fields_ = [
324 | ("find_proc_info", FIND_PROC_INFO_FUNCTYPE),
325 | ("put_unwind_info", PUT_UNWIND_INFO_FUNCTYPE),
326 | ("get_dyn_info_list_addr", GET_DYN_INFO_LIST_ADDR_FUNCTYPE),
327 | ("access_mem", ACCESS_MEM_FUNCTYPE),
328 | ("access_reg", ACCESS_REG_FUNCTYPE),
329 | ("access_fpreg", ACCESS_FPREG_FUNCTYPE),
330 | ("resume", ct.c_void_p), # Unused
331 | ("get_proc_name", GET_PROC_NAME_FUNCTYPE),
332 | ]
333 |
334 |
335 | class Frame:
336 | """
337 | A stack frame.
338 | """
339 |
340 | def __init__(
341 | self,
342 | stack: ct._CData,
343 | ip: int,
344 | die: DIE,
345 | start_addr: int,
346 | processmetadata: ProcessMetadata,
347 | cursor: unw_cursor_t,
348 | prev_frame: Optional[Frame] = None,
349 | next_frame: Optional[Frame] = None,
350 | ):
351 | self.stack = stack
352 | self.ip = ip
353 | self.die = die
354 |
355 | self.start_addr = start_addr
356 | self.processmetadata = processmetadata
357 | # We don't keep the cursor itself, we make a copy instead.
358 | self.cursor = unw_cursor_t()
359 | ct.pointer(self.cursor)[0] = cursor
360 | self.prev_frame = prev_frame
361 | self.next_frame = next_frame
362 |
363 | @cached_property
364 | def fde(self) -> Optional[CFIEntry]:
365 | """
366 | Returns the FDE associated with this call frame.
367 | """
368 | region = self.region
369 | if region is None:
370 | return None
371 | v_ip = self.ip - region.start
372 | if region.eh_frame_hdr is None:
373 | return None
374 | fde = region.eh_frame_hdr.find_fde(v_ip)
375 | return fde
376 |
377 | @cached_property
378 | def _expr_parser(self) -> DWARFExprParser:
379 | """
380 | DWARF Expr parser.
381 | """
382 | return DWARFExprParser(self.processmetadata.dwarf_info.structs)
383 |
384 | @cached_property
385 | def cfa_rule(self) -> Optional[CFARule]:
386 | """
387 | Returns the CFA rule associated with this call frame.
388 | """
389 | if self.fde is None:
390 | return None
391 | for row in reversed(self.fde.get_decoded().table):
392 | if row["pc"] < self.ip - self.region.start:
393 | return row["cfa"]
394 | return None
395 |
396 | @cached_property
397 | def cfa(self) -> Optional[int]:
398 | """
399 | Compute the CFA for this call frame.
400 | """
401 | if self.cfa_rule is None:
402 | return None
403 | cfa_reg_value = unw_word_t(0)
404 | get_reg(self.cursor, self.cfa_rule.reg, ct.byref(cfa_reg_value))
405 | return cfa_reg_value.value + self.cfa_rule.offset - self.start_addr # type: ignore
406 |
407 | @cached_property
408 | def region(self) -> MappedRegion:
409 | """
410 |         Return the MappedRegion corresponding to this Frame's IP.
411 | """
412 | region = self.processmetadata.map_for_addr(self.ip)
413 | if region is None:
414 | raise ValueError("This frame could not be associated to a region.")
415 | return region
416 |
417 | @cached_property
418 | def function_name(self) -> Optional[str]:
419 | """
420 |         Returns the function name associated with this frame's DIE.
421 | """
422 | if self.die is None:
423 | return None
424 | return die_name(self.die)
425 |
426 | def _get_parsed_expr_for_attribute(self, argnum: int) -> List[DWARFExprOp]:
427 | """
428 |         Returns a list of parsed DWARFExprOp for the attribute corresponding to the
429 | argnum'th argument.
430 | """
431 | curargnum = 0
432 | if self.die is None:
433 | return []
434 | for subdie in self.die.iter_children():
435 | if subdie.tag == "DW_TAG_formal_parameter":
436 | curargnum += 1
437 | if curargnum == argnum:
438 | locattr = subdie.attributes["DW_AT_location"]
439 | return self._get_parsed_exprs_from_loc(subdie, locattr)
440 | return []
441 |
442 | def _get_parsed_exprs_from_loc(
443 | self, die: DIE, locattr: AttributeValue
444 | ) -> List[DWARFExprOp]:
445 | """
446 | Returns a list of parsed DWARFExprOp for a given attribute.
447 | """
448 | expr = None
449 | loc = self.processmetadata.location_parser.parse_from_attribute(
450 | locattr, die.cu.header.version, die
451 | )
452 | if isinstance(loc, LocationExpr):
453 | expr = loc.loc_expr
454 | else:
455 | base_address = die.cu.get_top_DIE().attributes["DW_AT_low_pc"].value
456 | expr = None
457 | for entry in loc:
458 | if isinstance(entry, BaseAddressEntry):
459 | base_address = entry.base_address
460 | elif isinstance(entry, LocationEntry):
461 | start = entry.begin_offset + base_address
462 | end = entry.end_offset + base_address
463 | if start <= (self.ip - self.region.start) <= end:
464 | expr = entry.loc_expr
465 | break
466 | else:
467 | raise NotImplementedError(
468 | f"Location entries of type {type(entry)} are not supported"
469 | )
470 | if expr is None:
471 |             raise ValueError(f"Could not find LocationExpr in attr {locattr}")
472 | parsed_exprs: List[DWARFExprOp] = self._expr_parser.parse_expr(expr)
473 | return parsed_exprs
474 |
475 | def fetch_arg(self, argnum: int, ctype: Type[CT]) -> CT:
476 | """
477 | Fetch the argument number argnum, interpreting it as a ctype.
478 | """
479 | # We have all the registers set up correctly, fetch things directly.
480 | rv: CT
481 | if self.cfa is None:
482 | # Fetch the argument directly from the register
483 | argreg = unw_word_t(0)
484 |             ARGNUM_TO_REGNUM = {1: 5, 2: 4, 3: 1, 4: 2, 5: 8}  # x86-64 SysV: rdi, rsi, rdx, rcx, r8
485 | get_reg(self.cursor, ARGNUM_TO_REGNUM[argnum], ct.byref(argreg))
486 | return ctype(argreg.value)
487 | expr = self._get_parsed_expr_for_attribute(argnum)
488 | dwarf_stack: List[CT] = []
489 | for op in expr:
490 | rv = self.eval_expr(op, ctype, dwarf_stack)
491 | return rv
492 |
493 | def _read_arg_from_stack(self, offset: int, ctype: Type[CT]) -> CT:
494 | """
495 |         Read an argument of the given type at the given offset from the stack.
496 | """
497 | assert 0 <= offset < len(self.stack) # type: ignore
498 | return ctype.from_buffer(bytearray(self.stack)[offset:])
499 |
500 | def eval_expr(
501 | self, expr: DWARFExprOp, ctype: Type[CT], dwarf_stack: List[CT]
502 | ) -> CT:
503 | """
504 | Eval simple expressions.
505 | """
506 |         # Without a DIE we cannot interpret the expression.
507 |         if self.die is None:
508 |             raise ValueError(f"No DIE could be found for frame {self}")
509 | if expr.op_name == "DW_OP_fbreg":
510 | # If we are an inlined subroutine, lookup the parent frame base.
511 | die = self.die
512 | while die.tag == "DW_TAG_inlined_subroutine":
513 | if self.next_frame is None:
514 | raise ValueError("Cannot find parent frame of inlined subroutine")
515 | die = self.next_frame.die
516 | frameexpr = self.processmetadata.location_parser.parse_from_attribute(
517 | die.attributes["DW_AT_frame_base"],
518 | self.die.cu.header.version,
519 | self.die,
520 | )
521 | parsed_expr = self._expr_parser.parse_expr(frameexpr.loc_expr)
522 | for item in parsed_expr:
523 | base_value = self.eval_expr(item, ct.c_int, dwarf_stack) # type: ignore
524 | offset = base_value.value + expr.args[0]
525 | return self._read_arg_from_stack(offset, ctype)
526 | if expr.op_name == "DW_OP_call_frame_cfa":
527 | return ctype(self.cfa)
528 | if expr.op_name == "DW_OP_entry_value":
529 | # We evaluate the expression in the calling frame.
530 | for op in expr.args[0]:
531 | if self.next_frame is None:
532 | raise ValueError(
533 | "Cannot find parent frame for evaluation of entry point"
534 | )
535 | rv = self.next_frame.eval_expr(op, ctype, dwarf_stack)
536 | dwarf_stack.append(rv)
537 | return ctype(0)
538 | if expr.op_name == "DW_OP_stack_value":
539 | return dwarf_stack[-1]
540 | if expr.op_name.startswith("DW_OP_reg"):
541 |             regnum = expr.op - 0x50  # DW_OP_reg0..DW_OP_reg31 are opcodes 0x50..0x6f
542 | val = unw_word_t(0)
543 | get_reg(self.cursor, regnum, ct.byref(val))
544 | return ctype(val.value)
545 | raise NotImplementedError(f"Unsupported expr type: {expr.op_name}")
546 |
547 |
548 | class UnwindAddressSpace:
549 | """
550 | A virtual address space for use by libunwind.
551 | """
552 |
553 | def __init__(self, capture: stack_data_t, processmetadata: ProcessMetadata):
554 | self.capture = capture
555 | self.registers: List[ct.c_ulonglong] = [
556 | ct.c_ulonglong(getattr(self.capture, name)) for name in REG_NAMES
557 | ]
558 | self.processmetadata = processmetadata
559 | self.accessors = unw_accesors(
560 | find_proc_info=FIND_PROC_INFO_FUNCTYPE(self.find_proc_info),
561 | put_unwind_info=PUT_UNWIND_INFO_FUNCTYPE(self.put_unwind_info),
562 | get_dyn_info_list_addr=GET_DYN_INFO_LIST_ADDR_FUNCTYPE(
563 | self.get_dyn_info_list_addr
564 | ),
565 | access_mem=ACCESS_MEM_FUNCTYPE(self.access_mem),
566 | access_reg=ACCESS_REG_FUNCTYPE(self.access_reg),
567 | access_fpreg=ACCESS_FPREG_FUNCTYPE(self.access_reg),
568 | get_proc_name=GET_PROC_NAME_FUNCTYPE(self.get_proc_name),
569 | )
570 |
571 | # 0 takes the default byteorder
572 | self.unw_addr_space = create_addr_space(ct.byref(self.accessors), 0)
573 | if self.unw_addr_space == 0:
574 | raise RuntimeError("Something bad happened in create_addr_space")
575 | self.unw_cursor = unw_cursor_t()
576 | retval = init_remote(
577 | ct.byref(self.unw_cursor), self.unw_addr_space, 0
578 | ) # Don't use the opaque pointer for now
579 | if retval != 0:
580 | raise RuntimeError("Something bad happened in init_remote")
581 |
582 | def find_proc_info(
583 | self,
584 | addr_space: unw_addr_space_t,
585 | ip: int,
586 | pip: Pointer[unw_proc_info_t],
587 | need_unwind_info: ct.c_int,
588 | arg: ct.c_void_p,
589 | ) -> int:
590 | # pylint: disable=unused-argument,too-many-arguments
591 | """
592 | Implementation of libunwind find_proc_info callback.
593 | """
594 | # Find the top of the elfile.
595 | mmap = self.processmetadata.map_for_addr(ip)
596 |
597 | if mmap is None or mmap.eh_frame_hdr is None:
598 | return -UNW_ESTOPUNWIND
599 | pip[0] = unw_proc_info_t()
600 | dynamic_info = unw_dyn_info_t(
601 | start_ip=mmap.start,
602 | end_ip=mmap.end,
603 | format=UNW_INFO_FORMAT_REMOTE_TABLE,
604 | )
605 | dynamic_info.rti.name_ptr = 0
606 | # We only consider one specific binary. The virtual address space will
607 | # then consist of the actual stack and we will consider that the
608 | # eh_frame_hdr and everything else is located after that.
609 | dynamic_info.rti.segbase = mmap.start + mmap.eh_frame_hdr.offset
610 | dynamic_info.rti.table_data = (
611 | mmap.start + mmap.eh_frame_hdr.table_start + mmap.eh_frame_hdr.offset
612 | )
613 | dynamic_info.rti.table_len = (mmap.eh_frame_hdr.fde_count * 8) // ct.sizeof(
614 | unw_word_t
615 | )
616 | ret: int = dwarf_search_unwind_table(
617 | addr_space, ip, ct.byref(dynamic_info), pip, need_unwind_info, None
618 | )
619 | return ret
620 |
621 | def put_unwind_info(
622 | self,
623 | addr_space: unw_addr_space_t,
624 | pip: Pointer[unw_proc_info_t],
625 | arg: ct.c_void_p,
626 | ) -> None:
627 | """
628 | Implementation of libunwind put_unwind_info callback.
629 | """
630 | # pylint: disable=unused-argument
631 | return
632 |
633 | def get_dyn_info_list_addr(
634 | self,
635 | addr_space: unw_addr_space_t,
636 | dilap: Pointer[unw_word_t],
637 | arg: ct.c_void_p,
638 | ) -> int:
639 | """
640 | Implementation of libunwind get_dyn_info_list_addr callback.
641 | """
642 | # pylint: disable=unused-argument
643 | return -UNW_ENOINFO
644 |
645 | def access_mem(
646 | self,
647 | addr_space: unw_addr_space_t,
648 | addr: int,
649 | valp: Pointer[unw_word_t],
650 | write: int,
651 | arg: ct.c_void_p,
652 | ) -> int:
653 | """
654 | Implementation of libunwind access_mem callback.
655 | """
656 | # pylint: disable=unused-argument,too-many-arguments
657 | # We only support either file-mapped addresses, or addresses
658 |         # referring to the stack.
659 | region = self.processmetadata.map_for_addr(addr)
660 | if region is None:
661 | return -UNW_EINVAL
662 | if region.path == "[stack]":
663 | stack_idx = addr - self.capture.start_addr
664 | if stack_idx >= self.capture.size:
665 | return -UNW_EINVAL
666 | if write == 0:
667 | valp[0] = unw_word_t.from_buffer(
668 | bytearray(self.capture.stack[stack_idx : stack_idx + 8])
669 | )
670 | else:
671 | self.capture.stack[stack_idx] = valp.contents
672 | return 0
673 |
674 | # It's from the ELFFile itself.
675 | if region.real_path:
676 | if write == 0:
677 | with region.real_path.open("rb") as f:
678 | f.seek(addr - region.start)
679 | valp[0] = unw_word_t.from_buffer(
680 | bytearray(f.read(ct.sizeof(unw_word_t)))
681 | )
682 | return 0
683 | return -UNW_EINVAL
684 |
685 | # It's from anywhere else: return EINVAL
686 | return -UNW_EINVAL
687 |
688 | def access_reg(
689 | self,
690 | addr_space: unw_addr_space_t,
691 | regnum: int,
692 | valp: Pointer[unw_word_t],
693 | write: int,
694 | arg: ct.c_void_p,
695 | ) -> int:
696 | """
697 | Implementation of libunwind access_reg callback.
698 | """
699 | # pylint: disable=unused-argument,too-many-arguments
700 | if write == 0:
701 | valp[0] = unw_word_t(self.registers[regnum].value)
702 | else:
703 | self.registers[regnum] = valp.contents
704 | return 0
705 |
706 | def access_fpreg(
707 | self,
708 | addr_space: unw_addr_space_t,
709 | regnum: unw_regnum_t,
710 | fpvalp: Pointer[unw_fpreg_t],
711 | write: ct.c_int,
712 | arg: ct.c_void_p,
713 | ) -> int:
714 | """
715 | Implementation of libunwind access_fpreg callback.
716 | """
717 | # pylint: disable=unused-argument,too-many-arguments
718 | return -UNW_EINVAL
719 |
720 | def get_proc_name(
721 | self,
722 | addr_space: unw_addr_space_t,
723 | addr: unw_word_t,
724 | bufp: ct.c_char_p,
725 | buf_len: ct.c_size_t,
726 | offp: Pointer[unw_word_t],
727 | arg: ct.c_void_p,
728 | ) -> int:
729 | """
730 | Implementation of libunwind get_proc_name callback.
731 | """
732 | # pylint: disable=unused-argument,too-many-arguments
733 | return -UNW_EINVAL
734 |
735 | def ip(self) -> int:
736 | """
737 | Return the instruction pointer from the unwind cursor.
738 | """
739 | ip = unw_word_t(0)
740 | get_reg(self.unw_cursor, UNW_REG_IP, ct.byref(ip))
741 | return ip.value
742 |
743 | def dies_for_ip(self) -> Tuple[DIE, ...]:
744 | """
745 | Return a tuple of DIEs for a given ip.
746 | """
747 | ip = self.ip()
748 | region = self.processmetadata.map_for_addr(ip)
749 | if region is None:
750 | return (None,)
751 | if region.path == str(self.processmetadata.program_raw):
752 | dies = self.processmetadata.get_die_and_inlined_subdies_for_addr(
753 | ip - region.start
754 | )
755 | if dies is not None:
756 | return dies
757 | return (None,)
758 |
759 | def frames(self) -> Generator[Frame, None, None]:
760 | """
761 |         Yield the frames for this stack, innermost first.
762 | """
763 | cur = ct.byref(self.unw_cursor)
764 | prev_frame = None
765 | while True:
766 | # Extract the IP
767 | ip = self.ip()
768 | for die in self.dies_for_ip():
769 | # The cursor is copied by the frame, no need to
770 | # worry about it
771 | cur_frame = Frame(
772 | self.capture.stack,
773 | ip,
774 | die,
775 | self.capture.start_addr,
776 | self.processmetadata,
777 | self.unw_cursor,
778 | prev_frame=prev_frame,
779 | )
780 | if prev_frame is not None:
781 | prev_frame.next_frame = cur_frame
782 | yield prev_frame
783 | prev_frame = cur_frame
784 | if step(cur) <= 0:
785 | break
786 | if prev_frame is not None:
787 | yield prev_frame
788 |
--------------------------------------------------------------------------------
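Note on Frame.eval_expr above: DWARF encodes DW_OP_reg0 through DW_OP_reg31 as the consecutive opcodes 0x50..0x6f, which is why the register number is recovered as expr.op - 0x50. A self-contained illustration:

    DW_OP_REG0 = 0x50  # first opcode of the DW_OP_reg0..DW_OP_reg31 range

    def regnum_from_opcode(op: int) -> int:
        """Recover the DWARF register number from a DW_OP_reg* opcode."""
        if not DW_OP_REG0 <= op <= 0x6F:
            raise ValueError(f"not a DW_OP_reg* opcode: {op:#x}")
        return op - DW_OP_REG0

    assert regnum_from_opcode(0x55) == 5  # DW_OP_reg5, i.e. rdi on x86-64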
/src/pgtracer/model/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Models definitions for execution concepts we extract information about.
3 | """
4 | from .memory import MemoryAllocations, MemoryAllocType, memory_account
5 | from .plan import PlanState
6 | from .query import Query
7 |
8 | __all__ = [
9 | "Query",
10 | "PlanState",
11 | "memory_account",
12 | "MemoryAllocations",
13 | "MemoryAllocType",
14 | ]
15 |
--------------------------------------------------------------------------------
/src/pgtracer/model/memory.py:
--------------------------------------------------------------------------------
1 | """
2 | Classes storing information about memory allocations.
3 | """
4 |
5 | import ctypes as ct
6 | from dataclasses import dataclass
7 | from enum import IntEnum
8 |
9 |
10 | # pylint: disable=invalid-name
11 | class MemoryAllocType(IntEnum):
12 | """
13 | MemoryAllocation types.
14 | """
15 |
16 | Sbrk = 1
17 | Mmap = 2
18 |
19 |
20 | class memory_account(ct.Structure):
21 | """
22 |     Represents the data associated with a memory allocation or deallocation.
23 | """
24 |
25 | _fields_ = [
26 | ("event_type", ct.c_short),
27 | ("size", ct.c_longlong),
28 | ("kind", ct.c_short),
29 | ]
30 |
31 |
32 | @dataclass
33 | class MemoryAllocations:
34 | """
35 | Memory allocation counters.
36 | """
37 |
38 | mmap_alloc: int = 0
39 | mmap_free: int = 0
40 | sbrk_alloc: int = 0
41 | sbrk_free: int = 0
42 |
43 | current_running_mmap: int = 0
44 | current_running_sbrk: int = 0
45 |
46 | current_mem_peak: int = 0
47 |
48 | @property
49 | def mmap_total(self) -> int:
50 | """
51 | Compute the resulting mmaped total.
52 | """
53 | return self.mmap_alloc - self.mmap_free
54 |
55 | @property
56 | def sbrk_total(self) -> int:
57 | """
58 | Compute the resulting sbrk total.
59 | """
60 | return self.sbrk_alloc - self.sbrk_free
61 |
62 | @property
63 | def total_malloc(self) -> int:
64 | """
65 | Compute the total memory diff.
66 | """
67 | return self.mmap_total + self.sbrk_total
68 |
69 | def update(self, memory_account_event: memory_account) -> None:
70 | """
71 | Update the current totals.
72 | """
73 | if memory_account_event.kind == MemoryAllocType.Sbrk:
74 | self.current_running_sbrk += memory_account_event.size
75 | if memory_account_event.size > 0:
76 | self.sbrk_alloc += memory_account_event.size
77 | else:
78 | self.sbrk_free += -memory_account_event.size
79 | elif memory_account_event.kind == MemoryAllocType.Mmap:
80 | self.current_running_mmap += memory_account_event.size
81 | if memory_account_event.size > 0:
82 | self.mmap_alloc += memory_account_event.size
83 | else:
84 | self.mmap_free += -memory_account_event.size
85 | self.current_mem_peak = max(
86 | self.current_mem_peak, self.current_running_sbrk + self.current_running_mmap
87 | )
88 |
--------------------------------------------------------------------------------
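Note: a quick sketch of how these counters evolve, fed with hand-built events (the sizes are made up; in real use the events come from the eBPF collector):

    from pgtracer.model.memory import (
        MemoryAllocations,
        MemoryAllocType,
        memory_account,
    )

    allocs = MemoryAllocations()
    allocs.update(memory_account(event_type=0, size=4096, kind=int(MemoryAllocType.Sbrk)))
    allocs.update(memory_account(event_type=0, size=-1024, kind=int(MemoryAllocType.Sbrk)))
    assert allocs.sbrk_alloc == 4096 and allocs.sbrk_free == 1024
    assert allocs.sbrk_total == 3072
    assert allocs.current_mem_peak == 4096  # reached right after the first event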
/src/pgtracer/model/plan.py:
--------------------------------------------------------------------------------
1 | """
2 | This module contains definitions for representing PostgreSQL plans.
3 | """
4 | from __future__ import annotations
5 |
6 | import ctypes as ct
7 | from typing import TYPE_CHECKING, Dict, Optional
8 |
9 | from ..ebpf.collector.c_defs import plan_data, planstate_data
10 | from ..ebpf.dwarf import ProcessMetadata, Struct
11 | from ..utils import timespec_to_float
12 |
13 | if TYPE_CHECKING:
14 | from enum import IntEnum
15 |
16 |
17 | def explain_dict_to_str(parts: Dict[str, str]) -> str:
18 | """
19 | Format a dict in the commonly used key=value format.
20 | """
21 | return " ".join(f"{key}={value}" for key, value in parts.items())
22 |
23 |
24 | class PlanState:
25 | """
26 | Information collected from a PostgreSQL PlanState Node.
27 | """
28 |
29 | def __init__(self, addr: Optional[int]):
30 | self.addr = addr
31 | self.tag: Optional[IntEnum] = None
32 | self.instrument: Optional[Struct] = None
33 | self.parent_node: Optional[PlanState] = None
34 | self.plan_data: Optional[plan_data] = None
35 | self.is_stub = True
 36 |         # We're using a Dict as a poor man's ordered set
37 | self.children: Dict[PlanState, None] = {}
38 |
39 | def update(self, metadata: ProcessMetadata, event: planstate_data) -> None:
40 | """
41 | Update a Planstate from an event planstate_data.
42 | """
43 | instrument_addr = ct.addressof(event.instrument)
44 | tag = metadata.enums.NodeTag(event.plan_data.plan_tag) # type: ignore
45 | self.tag = tag
46 | self.instrument = metadata.structs.Instrumentation(instrument_addr)
47 | self.plan_data = plan_data()
48 | ct.pointer(self.plan_data)[0] = event.plan_data
49 |
50 | @property
51 | def title(self) -> str:
52 | """
53 | Return the node's title.
54 | """
55 | if self.tag is None:
56 | return "???"
57 | prefix = ""
58 | if self.plan_data and self.plan_data.parallel_aware:
59 | prefix = "Parallel "
60 | buf = f"{prefix}{str(self.tag.name[2:])}"
61 | # TODO: add additional information here
62 | return buf
63 |
64 | @property
65 | def cost(self) -> str:
66 | """
67 | Returns the "cost" section formatted similarly to PostgreSQL explain
68 | """
69 | if self.plan_data is None:
70 | parts = {"cost": "?..?", "rows": "?", "width": "?"}
71 | else:
72 | parts = {
73 | "cost": f"{self.plan_data.startup_cost:.2f}..{self.plan_data.total_cost:.2f}",
74 | "rows": f"{int(self.plan_data.plan_rows)}",
75 | "width": f"{int(self.plan_data.plan_width)}",
76 | }
77 | return f"({explain_dict_to_str(parts)})"
78 |
79 | @property
80 | def actual(self) -> str:
81 | """
82 | Returns the "actual" section formatted similarly to PostgreSQL explain.
83 | """
84 | if self.instrument is None:
85 | parts = {"time": "?..?", "rows": "?", "loops": "?"}
86 | else:
87 | total = timespec_to_float(self.instrument.counter)
88 | parts = {
 89 |                 "time": f"{(self.instrument.firsttuple.value * 1000):0.3f}..{(total * 1000):0.3f}",
90 | "rows": f"{int(self.instrument.tuplecount.value)}",
91 | "loops": f"{int(self.instrument.nloops.value)}",
92 | }
93 | return f"(actual {explain_dict_to_str(parts)})"
94 |
95 | @property
96 | def buffers(self) -> str:
97 | """
98 | Returns the "buffers" section formatted similarly to PostgreSQL
99 | explain.
100 | """
101 | if self.instrument is None:
102 | return ""
103 | bufusage_dict = self.instrument.bufusage.as_dict(include_all=True)
104 | parts = {}
105 | for key, value in bufusage_dict.items():
106 | if isinstance(value, (ct.c_long,)) and value.value != 0:
107 | parts[key] = str(value.value)
108 | if not parts:
109 | return ""
110 | return f"Buffers: {explain_dict_to_str(parts)}"
111 |
112 | def explain(self, indent_level: int = 0) -> str:
113 | """
114 | Format the plan represented by this node similarly to PostgreSQL
115 | explain.
116 | """
117 | if indent_level == 0:
118 | prefix = ""
119 | else:
120 | prefix = "\t" * indent_level + "-> "
121 | buf = f"{prefix}{self.title} {self.cost} {self.actual}"
122 | buffer_line = self.buffers
123 | if buffer_line:
124 | buf += "\n" + "\t" * (indent_level + 1) + buffer_line
125 | for child in self.children:
126 | buf += "\n"
127 | buf += child.explain(indent_level + 1)
128 | return buf
129 |
--------------------------------------------------------------------------------
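Note: explain_dict_to_str above simply joins key=value pairs with spaces, which is what produces the cost/actual sections; the values here are illustrative:

    from pgtracer.model.plan import explain_dict_to_str

    parts = {"cost": "0.00..35.50", "rows": "2550", "width": "4"}
    assert explain_dict_to_str(parts) == "cost=0.00..35.50 rows=2550 width=4"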
/src/pgtracer/model/query.py:
--------------------------------------------------------------------------------
1 | """
2 | This module contains definitions for representing PostgreSQL queries.
3 | """
4 | from __future__ import annotations
5 |
6 | import ctypes as ct
7 | from collections import defaultdict
8 | from datetime import datetime, timedelta
9 | from typing import TYPE_CHECKING, Any, Dict, Optional
10 |
11 | from ..ebpf.unwind import UnwindAddressSpace, stack_data_t
12 | from ..utils import timespec_to_timedelta
13 | from .memory import MemoryAllocations
14 | from .plan import PlanState
15 |
16 | if TYPE_CHECKING:
17 | from ..ebpf.collector import planstate_data, portal_data
18 | from ..ebpf.dwarf import ProcessMetadata
19 |
20 |
21 | FUNCTION_ARGS_MAPPING = {
22 | "ExecProcNodeFirst": 1,
23 | "ExecProcNodeInstr": 1,
24 | "ExecProcNode": 1,
25 | "ExecAgg": 1,
26 | "ExecAppend": 1,
27 | "ExecBitmapAnd": 1,
28 | "ExecBitmapHeapScan": 1,
29 | "ExecBitmapIndexScan": 1,
30 | "ExecBitmapOr": 1,
31 | "ExecCteScan": 1,
32 | "ExecCustomScan": 1,
33 | "ExecForeignScan": 1,
34 | "ExecFunctionScan": 1,
35 | "ExecGather": 1,
36 | "ExecGatherMerge": 1,
37 | "ExecGroup": 1,
38 | "ExecHash": 1,
39 | "ExecHashJoin": 1,
40 | "ExecIncrementalSort": 1,
41 | "ExecIndexOnlyScan": 1,
42 | "ExecIndexScan": 1,
43 | "ExecLimit": 1,
44 | "ExecLockRows": 1,
45 | "ExecMaterial": 1,
46 | "ExecMemoize": 1,
47 | "ExecMergeAppend": 1,
48 | "ExecMergeJoin": 1,
49 | "ExecModifyTable": 1,
50 | "ExecNamedTuplestoreScan": 1,
51 | "ExecNestLoop": 1,
52 | "ExecProjectSet": 1,
53 | "ExecRecursiveUnion": 1,
54 | "ExecResult": 1,
55 | "ExecSampleScan": 1,
56 | "ExecSeqScan": 1,
57 | "ExecSetOp": 1,
58 | "ExecSort": 1,
59 | "ExecSubqueryScan": 1,
60 | "ExecTableFuncScan": 1,
61 | "ExecTidRangeScan": 1,
62 | "ExecTidScan": 1,
63 | "ExecUnique": 1,
64 | "ExecValuesScan": 1,
65 | "ExecWindowAgg": 1,
66 | "ExecWorkTableScan": 1,
67 | "MultiExecHash": 1,
68 | "MultiExecBitmapIndexScan": 1,
69 | "MultiExecBitmapAnd": 1,
70 | "MultiExecBitmapOr": 1,
71 | }
72 |
73 |
74 | class Query:
75 | """
76 | A PostgreSQL Query.
77 | """
78 |
79 | def __init__(
80 | self,
81 | *,
82 | addr: int,
83 | query_id: int,
84 | startup_cost: float,
85 | total_cost: float,
86 | plan_rows: float,
87 | startts: Optional[float] = None,
88 | text: Optional[str] = None,
89 | # Instrumentation is a dynamically generated class, no way to check it
90 | instrument: Any = None,
91 | search_path: Optional[str] = None,
92 | ):
93 | self.addr = addr
94 | self.query_id = query_id
95 | self.startup_cost = startup_cost
96 | self.total_cost = total_cost
97 | self.plan_rows = plan_rows
98 | self.startts = startts
99 | self.text = text
100 | self.instrument = instrument
101 | self.search_path = search_path
102 | self.nodes: Dict[int, PlanState] = {}
103 | self.io_counters: Dict[str, int] = defaultdict(lambda: 0)
104 | self.memallocs: MemoryAllocations = MemoryAllocations()
105 |
106 | @property
107 | def root_node(self) -> PlanState:
108 | """
109 | Returns the plan's root node.
110 | """
111 | root_candidates = [
112 | node for node in self.nodes.values() if node.parent_node is None
113 | ]
114 | if len(root_candidates) == 0:
115 |             raise ValueError("Invalid plan: expected at least one root node")
116 | if len(root_candidates) > 1:
117 | # In that case, we need to build a "fake" parent node.
118 | root_node = PlanState(None)
119 | root_node.children = {c: None for c in root_candidates}
120 | else:
121 | root_node = root_candidates[0]
122 | return root_node
123 |
124 | @classmethod
125 | def from_event(cls, metadata: ProcessMetadata, event: portal_data) -> Query:
126 | """
127 | Build a query from portal_data event generated by eBPF.
128 | """
129 | instrument_addr = ct.addressof(event.instrument)
130 | instrument = metadata.structs.Instrumentation(instrument_addr)
131 | search_path = None
132 | if event.search_path:
133 | search_path = event.search_path.decode("utf8")
134 | _, creation_time = event.portal_key.as_tuple()
135 | return cls(
136 | addr=event.query_addr,
137 | query_id=event.query_id,
138 | startup_cost=event.startup_cost,
139 | total_cost=event.total_cost,
140 | plan_rows=event.plan_rows,
141 | startts=creation_time,
142 | text=event.query.decode("utf8"),
143 | instrument=instrument,
144 | search_path=search_path,
145 | )
146 |
147 | def update(self, metadata: ProcessMetadata, event: portal_data) -> None:
148 | """
149 | Update the query from an eBPF portal_data event.
150 | """
151 | instrument_addr = ct.addressof(event.instrument)
152 | instrument = metadata.structs.Instrumentation(instrument_addr)
153 | if instrument.running:
154 | self.instrument = instrument
155 | _, creation_time = event.portal_key.as_tuple()
156 | self.startts = creation_time or self.startts
157 | self.text = event.query.decode("utf-8") or self.text
158 | search_path = event.search_path.decode("utf8")
159 | self.search_path = search_path or self.search_path
160 |
161 | @property
162 | def start_datetime(self) -> Optional[datetime]:
163 | """
164 |         Returns the creation timestamp of the portal associated with this query.
165 | """
166 | if self.startts is None:
167 | return None
168 | return datetime.fromtimestamp(self.startts / 1000000)
169 |
170 | @property
171 | def runtime(self) -> Optional[timedelta]:
172 | """
173 | Returns the query's top-node total runtime.
174 | """
175 | if self.instrument and self.instrument.need_timer.value:
176 | return timespec_to_timedelta(self.instrument.counter)
177 | return None
178 |
179 | @property
180 | def shared_buffers_hitratio(self) -> Optional[float]:
181 | """
182 | Returns the hit ratio from the shared buffers.
183 | """
184 | if self.instrument is None:
185 | return None
186 | bufusage = self.instrument.bufusage
187 | total_blks = bufusage.shared_blks_hit.value + bufusage.shared_blks_read.value
188 | # If we didn't read any block, hit ratio is None
189 | if total_blks == 0:
190 | return None
191 | return float(bufusage.shared_blks_hit.value / total_blks * 100)
192 |
193 | @property
194 | def syscache_hitratio(self) -> Optional[float]:
195 | """
196 |         Returns the OS page cache hit ratio: block reads served without physical disk I/O.
197 | """
198 | if self.instrument is None:
199 | return None
200 | bufusage = self.instrument.bufusage
201 | # FIXME: don't assume a fixed block size, either pass it as an option
202 | # or query the actual value from the DB
203 | blksize = 8192
204 | total_blks = (
205 | bufusage.shared_blks_read.value
206 | + bufusage.local_blks_read.value
207 | + bufusage.temp_blks_read.value
208 | )
209 | total_bytes = total_blks * blksize
210 | if total_bytes == 0:
211 | return None
212 | bytes_hit = total_bytes - self.io_counters["R"]
213 | return float(bytes_hit / total_bytes * 100)
214 |
215 | def add_nodes_from_stack(
216 | self,
217 | metadata: ProcessMetadata,
218 | stack: stack_data_t,
219 | start_at: int = 0,
220 | base_node: Optional[PlanState] = None,
221 | ) -> None:
222 | """
223 | Process a capture stack to add node stubs to this query.
224 | """
225 | addr_space = UnwindAddressSpace(stack, metadata)
226 | nodes = self.nodes
227 | cur_node = base_node
228 | for idx, frame in enumerate(addr_space.frames()):
229 | if idx < start_at:
230 | continue
231 | if frame.function_name in FUNCTION_ARGS_MAPPING:
232 | argnum = FUNCTION_ARGS_MAPPING[frame.function_name]
233 | parent_addr = frame.fetch_arg(argnum, ct.c_ulonglong).value
234 | if cur_node and parent_addr == cur_node.addr:
235 | continue
236 | parent_node = nodes.get(parent_addr)
237 | if parent_node is None:
238 | parent_node = PlanState(parent_addr)
239 | nodes[parent_addr] = parent_node
240 | if cur_node:
241 | cur_node.parent_node = parent_node
242 | parent_node.children[cur_node] = None
243 |                 # The parent_node is already not a stub, meaning its ancestors
244 |                 # have been resolved. Stop walking the stack here.
245 | if not parent_node.is_stub:
246 | break
247 | cur_node = parent_node
248 |
249 | def add_node_from_event(
250 | self, metadata: ProcessMetadata, event: planstate_data
251 | ) -> PlanState:
252 | """
253 | Add a node from planstate_data event to this query plantree.
254 | We walk the stack up to understand where the nodes are located relative
255 | to each other.
256 | """
257 | nodes = self.nodes
258 | addr = event.planstate_addr
259 | planstate = nodes.get(addr)
260 | if planstate is None:
261 | planstate = PlanState(addr)
262 | nodes[addr] = planstate
263 | planstate.update(metadata, event)
264 | if not planstate.is_stub:
265 | return planstate
266 | self.add_nodes_from_stack(
267 | metadata, event.stack_capture, start_at=1, base_node=planstate
268 | )
269 | planstate.is_stub = False
270 | return planstate
271 |
--------------------------------------------------------------------------------
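Note: a hand-built sketch of the root_node fallback above; when several captured nodes end up parentless, a synthetic root is created to hold them (no eBPF involved, all values illustrative):

    from pgtracer.model.plan import PlanState
    from pgtracer.model.query import Query

    query = Query(addr=0, query_id=1, startup_cost=0.0, total_cost=1.0, plan_rows=1.0)
    node_a, node_b = PlanState(0x10), PlanState(0x20)
    query.nodes = {0x10: node_a, 0x20: node_b}

    root = query.root_node
    assert root.addr is None  # synthetic root, not a captured node
    assert set(root.children) == {node_a, node_b}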
/src/pgtracer/scripts/pgtrace_gucs.py:
--------------------------------------------------------------------------------
1 | """
 2 | This simple script reads and writes GUCs in a running PostgreSQL backend.
3 | """
4 | import argparse
5 |
6 | from pgtracer.ebpf.collector.guc import GUCTracerBPFCollector, GUCTracerOptions
7 |
8 |
9 | def main() -> None:
10 | """
11 | Entry point for the pgtrace_gucs script.
12 | """
13 | parser = argparse.ArgumentParser(
14 |         description="Read and / or write GUCs from a running PostgreSQL backend."
15 | )
16 | parser.add_argument("pid", type=int, help="PID to connect to")
17 |
18 | parser.add_argument(
19 | "--set-guc",
20 | metavar="GUC=VALUE",
21 | dest="set_gucs",
22 | nargs="+",
23 | default=[],
24 | help="Set a number of GUCs in the running backend",
25 | )
26 |
27 | args = parser.parse_args()
28 | pid = args.pid
29 |
30 | # Parse the set-guc option.
31 | set_gucs = {}
32 | for keyvalue in args.set_gucs:
33 |         key, value = keyvalue.split("=", 1)  # values may themselves contain "="
34 | set_gucs[key] = value
35 | options = GUCTracerOptions()
36 |
37 | collector = GUCTracerBPFCollector.from_pid(pid, options)
38 | collector.start()
39 | print(f"Backend is of type {str(collector.backend_type)}")
40 | seen = set()
41 | for gucname, gucvalue in set_gucs.items():
42 | collector.set_guc(gucname, gucvalue)
43 | while collector.is_running:
44 | with collector.lock:
45 | for guc in collector.guc_defs.values():
46 | if guc.guc_name is not None:
47 | seen.add(guc.guc_name)
48 | collector.stop()
49 |
50 |
51 | if __name__ == "__main__":
52 | main()
53 |
--------------------------------------------------------------------------------
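Note: the collector this script wraps can also be driven from Python directly; a hedged sketch using only the calls shown above (the PID is illustrative, and it must run with enough privileges to attach eBPF probes):

    from pgtracer.ebpf.collector.guc import GUCTracerBPFCollector, GUCTracerOptions

    collector = GUCTracerBPFCollector.from_pid(12345, GUCTracerOptions())
    collector.start()
    collector.set_guc("work_mem", "64MB")  # writes into the backend's memory
    collector.stop()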
/src/pgtracer/scripts/pgtrace_queries.py:
--------------------------------------------------------------------------------
1 | """
  2 | This simple script traces queries executed by a Postgres backend.
3 | """
4 |
5 | import argparse
6 | import sys
7 | import time
8 | from collections import defaultdict
9 | from datetime import timedelta
10 | from typing import Any, Dict, Optional
11 |
12 | from pgtracer.ebpf.collector.querytracer import (
13 | InstrumentationFlags,
14 | QueryTracerBPFCollector,
15 | QueryTracerOptions,
16 | )
17 | from pgtracer.ebpf.dwarf import Struct
18 | from pgtracer.model.query import Query
19 | from pgtracer.utils import timespec_to_timedelta
20 |
21 |
22 | def dump_dict(somedict: Dict[str, Any], indent: int = 0) -> str:
23 | """
24 | Dump a dictionary as an indented string of key / value pairs.
25 | """
26 | parts = []
27 | tabs = "\t" * indent
28 | for key, value in somedict.items():
29 | if isinstance(value, Struct):
30 | # Special case for timespec
31 | if value.__class__.__name__ == "timespec":
32 | try:
33 | value = timespec_to_timedelta(value)
34 | except OverflowError:
35 | # Ignore overflowing timespecs
36 | continue
37 | else:
38 | value = value.as_dict(include_all=True)
39 | if isinstance(value, dict):
40 | part = "\n" + dump_dict(value, indent + 1)
41 | else:
42 | if hasattr(value, "value"):
43 | part = value.value
44 | else:
45 | part = value
46 | parts.append(f"{tabs}{key}: {part}")
47 | return "\n".join(parts)
48 |
49 |
50 | def print_query(query: Query, options: QueryTracerOptions) -> None:
51 | """
52 | Print a query according to which collector options have been set.
53 | """
54 | parts = []
55 | start = ""
56 | if query.start_datetime is not None:
57 | start = query.start_datetime.isoformat()
58 | parts.append(f"{start} {query.text}")
59 | mapping = {}
60 | mapping["search_path"] = query.search_path
 61 |     mapping["query_id"] = str(query.query_id or "")
62 | mapping["startup_cost"] = str(query.startup_cost)
63 | mapping["total_cost"] = str(query.total_cost)
64 | mapping["plan_rows"] = str(query.plan_rows)
65 | mapping["peak_mem_alloc"] = str(query.memallocs.current_mem_peak)
 66 |     if query.instrument and query.instrument.need_timer:
67 | mapping["runtime"] = str(query.runtime)
68 | if options.instrument_flags & InstrumentationFlags.BUFFERS:
69 | mapping["written_bytes_to_disk"] = str(query.io_counters["W"])
70 | if query.shared_buffers_hitratio is not None:
71 | mapping["shared_buffers_hitratio"] = f"{query.shared_buffers_hitratio:0.2f}"
72 | else:
73 | mapping["shared_buffers_hitratio"] = None
74 | if query.syscache_hitratio is not None:
75 | mapping["syscache_hitratio"] = f"{query.syscache_hitratio:0.2f}"
76 | else:
77 | mapping["syscache_hitratio"] = None
78 | if query.instrument:
79 | mapping["buffer_usage"] = query.instrument.bufusage
80 | if options.instrument_flags & InstrumentationFlags.WAL and query.instrument:
81 | mapping["wal_usage"] = query.instrument.walusage
 82 |     print(parts[0])
83 | print(dump_dict(mapping, 1))
84 | if options.enable_nodes_collection:
85 | print(query.root_node.explain())
86 |
87 |
88 | LINE_UP = "\033[1A"
89 | LINE_CLEAR = "\x1b[2K"
90 |
91 |
92 | def print_running_query(
93 | query: Query, print_plan: bool, first_time: bool, clear_line: int = 0
94 | ) -> int:
95 | """
96 | Print the currently running query.
97 | """
98 | nb_lines = 0
99 | if first_time:
100 | print("Currently running:")
101 | print(query.text)
102 | if not print_plan:
103 | print("Tuples produced / tuple expected")
104 | print("")
105 | for _ in range(clear_line):
106 | print(LINE_UP, end=LINE_CLEAR)
107 | if print_plan and query.root_node:
108 | plan = query.root_node.explain()
109 | nb_lines = len(plan.split("\n"))
110 | print(plan)
111 | else:
112 | print(f"{int(query.instrument.tuplecount.value)} / {int(query.plan_rows)}")
113 | return nb_lines
114 |
115 |
116 | def main() -> None:
117 | """
118 | Entry point for the pgtrace_queries script.
119 | """
120 | parser = argparse.ArgumentParser(
121 | description="Dump a running backend execution plan"
122 | )
123 | parser.add_argument("pid", type=int, help="PID to connect to")
124 | parser.add_argument(
125 | "--instrument",
126 | "-I",
127 | type=str,
128 | default=None,
129 | nargs="*",
130 | choices=[flag.name for flag in InstrumentationFlags],
131 | action="extend",
132 | help="""Instrument flags to set. (warning: writes into backends
133 | memory!)""",
134 | )
135 | parser.add_argument(
136 | "--nodes-collection",
137 | "-n",
138 | default=False,
139 | action="store_true",
140 | help="""Collect information about individual execution nodes""",
141 | )
142 |
143 | args = parser.parse_args()
144 | pid = args.pid
145 | instrument_flags = 0
146 | if args.instrument:
147 | for flag in args.instrument:
148 | instrument_flags |= InstrumentationFlags[flag]
149 | options = QueryTracerOptions(
150 | instrument_flags=instrument_flags,
151 | enable_nodes_collection=args.nodes_collection,
152 | enable_perf_events=instrument_flags != 0,
153 | )
154 | collector = QueryTracerBPFCollector.from_pid(pid, options)
155 | collector.start()
156 | total_queries = 0
157 | last_running_query: Dict[int, Optional[Query]] = defaultdict(lambda: None)
158 | lines_to_clear = 0
159 | while collector.is_running:
160 | try:
161 | time.sleep(1)
162 | for (
163 | pid,
164 | process_info,
165 | ) in collector.event_handler.per_process_info.copy().items():
166 | if not process_info.query_history and process_info.current_query:
167 | first_time = (
168 | last_running_query[pid] is not process_info.current_query
169 | )
170 | if first_time:
171 | lines_to_clear = 0
172 | lines_to_clear = print_running_query(
173 | process_info.current_query,
174 | options.enable_nodes_collection,
175 | first_time,
176 | lines_to_clear,
177 | )
178 | last_running_query[pid] = process_info.current_query
179 | continue
180 | last_running_query[pid] = None
181 | for query in process_info.query_history:
182 | print_query(query, options)
183 | total_queries += len(process_info.query_history)
184 | process_info.query_history = []
185 | except KeyboardInterrupt:
186 | break
187 | collector.stop()
188 | total_processes = len(collector.event_handler.process_history) + len(
189 | collector.event_handler.per_process_info
190 | )
191 | print(f"Processed {total_queries} queries among {total_processes} processes")
192 |
193 |
194 | if __name__ == "__main__":
195 | main()
196 |
--------------------------------------------------------------------------------
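Note: dump_dict above renders nested dictionaries as indented key/value lines (nested levels are tab-indented); a standalone illustration with made-up values:

    from pgtracer.scripts.pgtrace_queries import dump_dict

    print(dump_dict({"query_id": 42, "io": {"R": 8192, "W": 0}}))
    # query_id: 42
    # io:
    # 	R: 8192
    # 	W: 0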
/src/pgtracer/utils.py:
--------------------------------------------------------------------------------
1 | """
2 | Miscellaneous utility functions.
3 | """
4 |
5 | import functools
6 | import itertools
7 | import re
8 | import subprocess
9 | from datetime import timedelta
10 | from typing import TYPE_CHECKING, BinaryIO, Optional, Union
11 |
12 | from pypsutil import Process
13 |
14 | from pgtracer.ebpf.dwarf import Struct
15 |
16 | if TYPE_CHECKING:
17 | from ctypes import _CData
18 | else:
19 | _CData = object
20 |
21 |
22 | def timespec_to_timedelta(timespec: Union[_CData, Struct]) -> timedelta:
23 | """
24 | Convert a timespec_t or instr_time struct to a timedelta.
25 | """
26 | # Can't really compare it to a proper class, so test on the class name
27 | if timespec.__class__.__name__ == "timespec":
28 | return timedelta(
29 | seconds=timespec.tv_sec.value, # type: ignore
30 | microseconds=timespec.tv_nsec.value / 1000, # type: ignore
31 | )
32 | if timespec.__class__.__name__ == "instr_time":
33 | return timedelta(seconds=timespec.ticks.value / 1000000000) # type: ignore
34 | raise ValueError("Expecting a timespec or instr_time struct")
35 |
36 |
37 | def timespec_to_float(timespec: _CData) -> float:
38 | """
39 | Convert a timespec_t or instr_time struct to a float representing the number of seconds.
40 | """
41 | if timespec.__class__.__name__ == "timespec":
42 | return float(timespec.tv_sec.value + timespec.tv_nsec.value / 1000000000) # type: ignore
43 | if timespec.__class__.__name__ == "instr_time":
44 | return float(timespec.ticks.value / 1000000000) # type: ignore
45 | raise ValueError("Expecting a timespec or instr_time struct")
46 |
47 |
48 | NSPID_PARSING_RE = re.compile(rb"^NSpid:\s+((?:(?:\d+)\s*)+)")
49 |
50 |
51 | def resolve_container_pid(container: str, container_pid: int) -> Optional[int]:
52 | """
53 | Resolve container_pid from the systemd-nspawn container `container`
54 | to a host pid.
55 | """
56 | # FIXME: this probably does not handle nested namespaces.
57 | completed_process = subprocess.run(
58 | ["machinectl", "show", container, "-p", "Leader"],
59 | capture_output=True,
60 | check=True,
61 | )
62 | container_leader_pid = int(completed_process.stdout.split(b"=")[1])
63 | # Now iterate over all child processes from this container.
64 | leader_process = Process(container_leader_pid)
65 | for child in leader_process.children(recursive=True):
66 | with open(f"/proc/{child.pid}/status", "rb") as statf:
67 | for line in statf:
68 | nspid_match = NSPID_PARSING_RE.match(line)
69 | if nspid_match:
70 | ns_pids = list(map(int, nspid_match.group(1).strip().split(b"\t")))
71 | if ns_pids[-1] == container_pid:
72 | return ns_pids[0]
73 | return None
74 |
75 |
76 | def readcstr(filelike: BinaryIO) -> bytes:
77 | """
78 | Read a NULL terminated C-string from a BinaryIO
79 | Courtesy of https://stackoverflow.com/a/32775270
80 | """
81 | toeof = iter(functools.partial(filelike.read, 1), b"")
82 | return b"".join(itertools.takewhile(b"\0".__ne__, toeof))
83 |
--------------------------------------------------------------------------------
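Note: a worked example of the NSpid parsing used by resolve_container_pid. /proc/<pid>/status lists a process's PID once per namespace level, host first, so when the last entry matches the in-container PID, the first entry is the host PID (values illustrative):

    from pgtracer.utils import NSPID_PARSING_RE

    line = b"NSpid:\t4242\t17\n"
    match = NSPID_PARSING_RE.match(line)
    assert match is not None
    ns_pids = list(map(int, match.group(1).strip().split(b"\t")))
    assert ns_pids == [4242, 17]  # host pid 4242, in-container pid 17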
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | """
2 | Pytest fixtures.
3 | """
4 |
5 | import os
6 | import re
7 | import subprocess
8 | from pathlib import Path
9 | from pwd import getpwnam
10 | from tempfile import TemporaryDirectory
11 | from typing import Iterator
12 |
13 | import port_for
14 | import psycopg
15 | import pytest
16 | from pytest import FixtureRequest
17 | from pytest_postgresql.config import get_config
18 | from pytest_postgresql.executor import PostgreSQLExecutor
19 | from pytest_postgresql.executor_noop import NoopExecutor
20 |
21 | from pgtracer.ebpf.collector import CollectorOptions
22 | from pgtracer.ebpf.collector.guc import GUCTracerBPFCollector
23 | from pgtracer.ebpf.collector.querytracer import (
24 | InstrumentationFlags,
25 | QueryTracerBPFCollector,
26 | )
27 | from pgtracer.utils import resolve_container_pid
28 |
29 |
30 | def pytest_addoption(parser):
31 | """
32 | Add the required options to pytest.
33 | """
34 | parser.addoption(
35 | "--container",
36 | help="Set this if the backend we are testing against is "
37 | "running inside a container.",
38 | )
39 |
40 |
41 | def pytest_configure(config):
42 | """
43 | Add used markers.
44 | """
45 | config.addinivalue_line(
46 | "markers", "slow: mark test as being 'slow', allowing to skip it"
47 | )
48 |
49 |
50 | @pytest.fixture(scope="session")
51 | def nonroot_postgres(request: FixtureRequest) -> Iterator[PostgreSQLExecutor]:
52 | """
53 | Returns a PostgreSQLExecutor to a newly created instance, running as the
54 | postgres user.
55 |
56 | FIXME: make the unix user used to run the instance configurable.
57 | """
58 |
59 | config = get_config(request)
60 |
61 | # If we have a host, use that instead of creating a new instance.
62 | if request.config.getoption("postgresql_host"):
63 | postgresql_executor = NoopExecutor(
64 | config.get("host"), 5432, "postgres", {}, "postgres"
65 | )
66 | postgresql_executor.unixsocketdir = None
67 | yield postgresql_executor
68 | return
69 |
70 | postgresql_ctl = config["exec"]
71 |
72 | if not os.path.exists(postgresql_ctl):
73 | pg_bindir = subprocess.check_output(
74 | ["pg_config", "--bindir"], universal_newlines=True
75 | ).strip()
76 | postgresql_ctl = os.path.join(pg_bindir, "pg_ctl")
77 |
78 | pg_passwd = getpwnam("postgres")
79 |
80 | with TemporaryDirectory() as tempdir_str:
81 | tmpdir = Path(tempdir_str)
82 | os.chown(tmpdir, pg_passwd.pw_uid, pg_passwd.pw_gid)
83 | pg_port = port_for.select_random()
84 | datadir = tmpdir / f"data-{pg_port}"
85 | unix_socket_dir = tmpdir / "unix-socket"
86 | postgresql_executor = PostgreSQLExecutor(
87 | executable=postgresql_ctl,
88 | shell=True,
89 | port=pg_port,
90 | host="localhost",
91 | unixsocketdir=str(unix_socket_dir),
92 | logfile=str(tmpdir / "pg_log"),
93 | dbname="postgres",
94 | startparams="",
95 | datadir=str(datadir),
96 | )
97 | postgresql_executor.VERSION_RE = re.compile(
 98 |             ".* (?P<version>\\d+((\\.\\d+)|beta\\d|rc\\d|dev))"
99 | )
100 | pid = os.fork()
101 | if pid == 0:
102 | try:
103 | os.setuid(pg_passwd.pw_uid)
104 | os.chdir(str(tmpdir))
105 | datadir.mkdir()
106 | unix_socket_dir.mkdir()
107 | postgresql_executor.start()
108 | postgresql_executor.wait_for_postgres()
109 | except Exception as exc: # pylint: disable=broad-except
110 | print(exc)
111 | os._exit(1) # pylint: disable=protected-access
112 | finally:
113 | os._exit(0) # pylint: disable=protected-access
114 | else:
115 | pid, return_code = os.waitpid(pid, 0)
116 | if return_code != 0:
117 | raise Exception("Could not start postgresql")
118 | try:
119 | yield postgresql_executor
120 | finally:
121 | pid = os.fork()
122 | if pid == 0:
123 | try:
124 | os.setuid(pg_passwd.pw_uid)
125 | postgresql_executor.stop()
126 | finally:
127 | os._exit(0) # pylint: disable=protected-access
128 | os.waitpid(pid, 0)
129 |
130 |
131 | @pytest.fixture
132 | def connection(nonroot_postgres): # pylint: disable=redefined-outer-name
133 | """
134 | Returns a connection to the temporary postgresql instance.
135 | """
136 | conn = psycopg.connect(
137 | port=nonroot_postgres.port,
138 | host=nonroot_postgres.unixsocketdir or nonroot_postgres.host,
139 | user=nonroot_postgres.user,
140 | )
141 | yield conn
142 | conn.close()
143 |
144 |
145 | def make_collector(
146 | cls, connection, config, **kwargs
147 | ): # pylint: disable=redefined-outer-name
148 | """
149 | Create a collector from a connection.
150 | """
151 | backend_pid = connection.info.backend_pid
152 | if config.getoption("container"):
153 | # If we have a container, look into it to translate the backend_pid
154 | # to the host namespace.
155 | backend_pid = resolve_container_pid(config.getoption("container"), backend_pid)
156 | options = cls.options_cls(**kwargs)
157 | collector = cls.from_pid(pid=backend_pid, options=options)
158 | collector.start()
159 | return collector
160 |
161 |
162 | @pytest.fixture
163 | def querytracer_factory(connection, request):
164 | def factory_func(**kwargs):
165 | kwargs.setdefault("enable_nodes_collection", True)
166 | return make_collector(
167 | QueryTracerBPFCollector, connection, request.config, **kwargs
168 | )
169 |
170 | return factory_func
171 |
172 |
173 | @pytest.fixture
174 | def querytracer(
175 | request: FixtureRequest, connection
176 | ): # pylint: disable=redefined-outer-name
177 | """
178 |     Returns a bpfcollector associated with the current connection.
179 | """
180 | collector = make_collector(
181 | QueryTracerBPFCollector,
182 | connection,
183 | request.config,
184 | enable_nodes_collection=True,
185 | )
186 | yield collector
187 | collector.stop()
188 |
189 |
190 | @pytest.fixture
191 | def querytracer_instrumented(
192 | request: FixtureRequest, connection
193 | ): # pylint: disable=redefined-outer-name
194 | """
195 | Returns a bpfcollector with instrumentation turned on.
196 | """
197 | collector = make_collector(
198 | QueryTracerBPFCollector,
199 | connection,
200 | request.config,
201 | instrument_flags=InstrumentationFlags.ALL,
202 | enable_perf_events=True,
203 | enable_query_discovery=True,
204 | enable_nodes_collection=True,
205 | )
206 | yield collector
207 | collector.stop()
208 |
209 |
210 | @pytest.fixture
211 | def guctracer(request: FixtureRequest, connection):
212 | """
213 | Fixture returning an instance of a GUCTracer.
214 | """
215 | collector = make_collector(GUCTracerBPFCollector, connection, request.config)
216 | yield collector
217 | collector.stop()
218 |
--------------------------------------------------------------------------------
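
The `nonroot_postgres` fixture above runs as root (the eBPF side requires it) but must start `pg_ctl` as the unprivileged `postgres` user, so it forks, drops privileges in the child, and reports failure through the exit status; `os._exit()` terminates the child on the spot, which is why the `finally` clause never overwrites an error status. A minimal sketch of that pattern, using a hypothetical `run_as` helper that is not part of the repo:

    import os
    from pwd import getpwnam

    def run_as(username, func):
        """Fork, become `username` in the child, run func(), and surface
        any failure to the parent through the child's exit status."""
        pid = os.fork()
        if pid == 0:  # child: never returns from this branch
            try:
                os.setuid(getpwnam(username).pw_uid)
                func()
            except Exception as exc:  # pylint: disable=broad-except
                print(exc)
                # _exit terminates immediately: no finally/atexit handlers run.
                os._exit(1)  # pylint: disable=protected-access
            os._exit(0)  # pylint: disable=protected-access
        _, status = os.waitpid(pid, 0)
        if status != 0:
            raise RuntimeError(f"could not run {func!r} as {username}")
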
/tests/scripts/setup_fedora_container.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 |
4 | # Create a directory for the container
5 | mkdir ~/fedora
6 | mkdir -p /var/lib/machines/fedora
7 | mount -o bind ~/fedora /var/lib/machines/fedora
8 | mkdir -p /etc/distro.repos.d
9 | # Configure yum repos for fedora
10 | cat << EOF > /etc/distro.repos.d/fedora.repo
11 | [fedora]
12 | name=Fedora \$releasever - \$basearch
13 | failovermethod=priority
14 | baseurl=http://download.fedoraproject.org/pub/fedora/linux/releases/\$releasever/Everything/\$basearch/os
15 | metalink=https://mirrors.fedoraproject.org/metalink?repo=fedora-\$releasever&arch=\$basearch
16 | enabled=1
17 | gpgcheck=1
18 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-fedora-\$releasever-\$basearch
19 | metadata_expire=1
20 | skip_if_unavailable=False
21 | EOF
22 |
23 | # Install the fedora key for f36
24 | # TODO: generalize it
25 | mkdir -p /etc/pki/rpm-gpg/
26 | wget https://getfedora.org/static/fedora.gpg -O /etc/pki/rpm-gpg/RPM-GPG-KEY-fedora-36-x86_64
27 |
28 | # Install the required packages in the container
29 | dnf -y --releasever=36 --best \
30 | --refresh \
31 | --setopt=install_weak_deps=False \
32 | --installroot=/var/lib/machines/fedora/ \
33 | install \
34 | dhcp-client dnf fedora-release glibc glibc-langpack-en glibc-langpack-de \
35 | iputils less ncurses passwd systemd \
36 | systemd-networkd systemd-resolved util-linux vim-default-editor \
37 | postgresql-server dnf-utils dnf-plugins-core \
38 | python-bcc python-pip libunwind
39 |
40 | rm /var/lib/machines/fedora/etc/resolv.conf
41 | cp /etc/resolv.conf /var/lib/machines/fedora/etc/resolv.conf
42 |
43 | systemd-nspawn -D /var/lib/machines/fedora/ /usr/bin/dnf --best -y --releasever=36 install postgresql-server
44 | systemd-nspawn -D /var/lib/machines/fedora/ /usr/bin/dnf -y --releasever=36 debuginfo-install postgresql-server
45 |
46 | # Set a dummy password for the root user
47 | systemd-nspawn --console=pipe -D /var/lib/machines/fedora/ passwd root --stdin << EOF
48 | fedora
49 | EOF
50 |
51 | systemctl start systemd-nspawn@fedora
52 | sleep 2
53 | systemd-run --machine fedora --pipe --wait /usr/bin/postgresql-setup --initdb
54 | systemd-run --machine fedora --pipe --wait /usr/bin/sed "s/#listen_addresses = 'localhost'/listen_addresses = '*'/" /var/lib/pgsql/data/postgresql.conf -i
55 | systemd-run --machine fedora --pipe --wait /usr/bin/bash -c 'echo "host all all 0.0.0.0/0 trust" > /var/lib/pgsql/data/pg_hba.conf'
56 | systemd-run --machine fedora --pipe --wait /usr/bin/systemctl enable postgresql --now
57 |
58 |
59 | systemd-run --machine fedora --pipe --wait /usr/sbin/ip link set up host0
60 | systemd-run --machine fedora --pipe --wait /usr/sbin/ip addr add 172.16.0.1/30 dev host0
61 | systemd-run --machine fedora --pipe --wait /usr/sbin/ip route add default dev host0
62 |
63 | # Now assign a static IP address to the host side of the veth pair
64 | ip link set up ve-fedora
65 | ip route add 172.16.0.0/30 dev ve-fedora
66 | ip addr add 172.16.0.2/30 dev ve-fedora
67 |
--------------------------------------------------------------------------------
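
PostgreSQL backends inside the container this script builds get pids in the container's PID namespace, while the eBPF probes attach from the host; that is what the `--container` option and the `resolve_container_pid` call in tests/conftest.py are for. One way such a translation can work, assuming a kernel that exposes the `NSpid:` field in `/proc/<pid>/status` (Linux 4.1+); `host_pid_for` is an illustrative sketch, not the repo's implementation:

    from pathlib import Path
    from typing import Optional

    def host_pid_for(container_pid: int) -> Optional[int]:
        """Scan /proc for the host pid whose NSpid chain ends in container_pid."""
        for status_file in Path("/proc").glob("[0-9]*/status"):
            try:
                for line in status_file.read_text().splitlines():
                    if line.startswith("NSpid:"):
                        pids = [int(part) for part in line.split()[1:]]
                        # First entry: pid on the host; last entry: pid in the
                        # innermost namespace, i.e. inside the container.
                        if len(pids) > 1 and pids[-1] == container_pid:
                            return pids[0]
            except (OSError, ValueError):
                continue  # the process vanished or the line was unparsable
        return None
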
/tests/test_bins/Makefile:
--------------------------------------------------------------------------------
1 | %.elf: %.elf.c
2 | gcc -Wl,--build-id -gdwarf-5 -O0 -c $*.elf.c -o $@
3 |
4 | %.main: %.main.c
5 | gcc -Wl,--build-id -gdwarf-5 -O0 $*.main.c -o $@
6 |
7 | all: test.elf test_stack.main
8 |
--------------------------------------------------------------------------------
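
The compiler flags matter here: `-gdwarf-5` pins the DWARF version the parser is tested against, `-O0` keeps frames and variables unoptimized, and `--build-id` gives each binary a stable identifier. To check what a build actually produced, the same pyelftools API the test suite relies on (`iter_CUs`, `get_top_DIE`) can dump the CU versions:

    from elftools.elf.elffile import ELFFile

    with open("tests/test_bins/test.elf", "rb") as f:
        elf = ELFFile(f)
        assert elf.has_dwarf_info()
        for cu in elf.get_dwarf_info().iter_CUs():
            # -gdwarf-5 should yield version 5 compilation units.
            print(cu["version"], cu.get_top_DIE().get_full_path())
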
/tests/test_bins/test.elf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Aiven-Open/pgtracer/5faf30a695f7b45711e987df8a82c904a36e581f/tests/test_bins/test.elf
--------------------------------------------------------------------------------
/tests/test_bins/test.elf.c:
--------------------------------------------------------------------------------
1 | typedef struct StructA {
2 | int a_int;
3 | float a_float;
4 | char* a_charp;
5 | } StructA;
6 |
7 | typedef struct StructB {
8 | StructA b_structa;
9 | StructA* b_structap;
10 | struct StructB* b_structbp;
11 | } StructB;
12 |
13 | StructA GLOBAL_STRUCT_A = {1, 1.0, "TEST"};
14 |
15 | StructB GLOBAL_STRUCT_B = {0};
16 |
--------------------------------------------------------------------------------
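
tests/test_dwarf.py below asserts that StructA is 16 bytes with members at offsets 0, 4 and 8: on an LP64 platform the `char *` is 8 bytes and 8-byte aligned, so `a_int` (4 bytes) and `a_float` (4 bytes) pack exactly in front of it. The same layout expressed in ctypes, assuming x86-64:

    import ctypes as ct

    class StructA(ct.Structure):
        _fields_ = [
            ("a_int", ct.c_int),       # offset 0, 4 bytes
            ("a_float", ct.c_float),   # offset 4, 4 bytes
            ("a_charp", ct.c_char_p),  # offset 8, 8 bytes on LP64
        ]

    assert ct.sizeof(StructA) == 16
    assert StructA.a_charp.offset == 8
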
/tests/test_bins/test_stack.main:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Aiven-Open/pgtracer/5faf30a695f7b45711e987df8a82c904a36e581f/tests/test_bins/test_stack.main
--------------------------------------------------------------------------------
/tests/test_bins/test_stack.main.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 |
3 | int func_1(int a, int b)
4 | {
5 | int c = a + b;
6 | return c;
7 | }
8 |
9 | int func_2(int a, int b)
10 | {
11 | return func_1(a + 1, b + 2);
12 | }
13 |
14 | int main(int argc, char** argv)
15 | {
16 | /*
17 | * Block until the testing program sends something on stdin.
18 | * This is to allow for the testing program to get our proc/maps
19 | */
20 | getchar();
21 | return func_2(10, 20);
22 | }
23 |
--------------------------------------------------------------------------------
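
The `getchar()` call is the synchronization point: tests/test_stack_unwinding.py starts this binary, reads its memory map while it sits blocked on stdin, attaches the uprobes, and only then feeds it a byte so `main -> func_2 -> func_1` executes under observation. The orchestration boils down to:

    import subprocess

    # Start the target; it parks itself in getchar() right away.
    proc = subprocess.Popen(["tests/test_bins/test_stack.main"], stdin=subprocess.PIPE)
    # While it is blocked, its memory map is stable and can be parsed.
    with open(f"/proc/{proc.pid}/maps") as maps_file:
        regions = maps_file.read()
    # Writing to stdin unblocks getchar(); the call chain now runs.
    proc.communicate(input=b"\n")
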
/tests/test_dwarf.py:
--------------------------------------------------------------------------------
1 | """
2 | This module tests some utilities from the dwarf module.
3 | """
4 |
5 | import ctypes as ct
6 | import os
7 | from pathlib import Path
8 | from unittest import TestCase
9 | from unittest.mock import patch
10 |
11 | from pgtracer.ebpf.dwarf import (
12 | DWARFPointer,
13 | ProcessMetadata,
14 | Struct,
15 | StructMemberDefinition,
16 | )
17 | from pgtracer.ebpf.eh_frame_hdr import EhFrameHdr
18 |
19 | TEST_BINARY = Path(__file__).parent / "test_bins" / "test.elf"
20 | TEST_EXEC_BINARY = Path(__file__).parent / "test_bins" / "test_stack.main"
21 |
22 |
23 | class MockProcess:
24 | """
25 | Mock a pypsutil.Process.
26 | """
27 |
28 | def __init__(self, binary):
29 | self.binary = binary
30 |
31 | def exe(self):
32 | """
33 | Returns a constant binary string.
34 | """
35 | return self.binary
36 |
37 | @property
38 | def pid(self):
39 | """
40 | Returns self pid. We only need an existing pid...
41 | """
42 | return os.getpid()
43 |
44 |
45 | class TestProcessMetadata(TestCase):
46 | """
47 | Test the dwarf helpers in ProcessMetadata.
48 | """
49 |
50 | @patch("pgtracer.ebpf.dwarf.get_mapped_regions", lambda process, root: [])
51 | def setUp(self):
52 | self.process_meta = ProcessMetadata(MockProcess(TEST_BINARY))
53 | self.exec_process_meta = ProcessMetadata(MockProcess(TEST_EXEC_BINARY))
54 |
55 | def test_struct(self):
56 | """
57 | Test the struct parsing helper.
58 | """
59 | structs = self.process_meta.structs
60 |
61 | StructA = structs.StructA # pylint: disable=invalid-name
62 | self.assertTrue(issubclass(StructA, Struct))
63 | self.assertEqual(StructA.size, 16)
64 |
65 | a_int = StructA.field_definition("a_int")
66 | self.assertIsInstance(a_int, StructMemberDefinition)
67 | self.assertEqual(a_int.offset, 0)
68 | self.assertEqual(a_int.member_type, ct.c_int)
69 |
70 | a_float = StructA.field_definition("a_float")
71 | self.assertEqual(a_float.offset, 4)
72 | self.assertEqual(a_float.member_type, ct.c_float)
73 |
74 | a_charp = StructA.field_definition("a_charp")
75 | self.assertEqual(a_charp.offset, 8)
76 | self.assertTrue(issubclass(a_charp.member_type, ct._Pointer))
77 | self.assertEqual(a_charp.member_type._type_, ct.c_byte)
78 |
79 | StructB = structs.StructB # pylint: disable=invalid-name
80 |
81 | b_structa = StructB.field_definition("b_structa")
82 | self.assertEqual(b_structa.offset, 0)
83 | self.assertEqual(b_structa.member_type, StructA)
84 |
85 | b_structap = StructB.field_definition("b_structap")
86 | self.assertEqual(b_structap.offset, StructA.size)
87 | self.assertTrue(issubclass(b_structap.member_type, DWARFPointer))
88 | self.assertEqual(b_structap.member_type.pointed_type, StructA)
89 |
90 | b_structbp = StructB.field_definition("b_structbp")
91 | self.assertEqual(b_structbp.offset, StructA.size + 8)
92 | self.assertTrue(issubclass(b_structbp.member_type, DWARFPointer))
93 | self.assertEqual(b_structbp.member_type.pointed_type, StructB)
94 |
95 | def test_eh_frame_hdr(self):
96 | """
97 | Test the eh_frame_hdr parser.
98 | """
99 | eh_frame_hdr = EhFrameHdr.load_eh_frame_hdr(self.exec_process_meta.elffile)
100 | all_entries = list(eh_frame_hdr.iter_entries())
101 | assert len(all_entries) == 5
102 | assert eh_frame_hdr.fde_count == 5
103 | assert eh_frame_hdr.find_fde(0) is None
104 | assert eh_frame_hdr.find_fde(0xFFFFFFFFF) is None
105 | assert eh_frame_hdr.find_fde(4412).header.initial_location == 4409
106 |
107 | def test_die_contains_addr(self):
108 | """
109 | Test the die_contains_addr helper.
110 | """
111 | dw = self.exec_process_meta.dwarf_info
112 | all_cus = list(dw.iter_CUs())
113 | # The CU at index 3 has a DW_AT_ranges attribute
114 | cu = all_cus[3]
115 | die = cu.get_top_DIE()
116 | assert self.exec_process_meta.die_contains_addr(die, 4096)
117 | assert self.exec_process_meta.die_contains_addr(die, 4100)
118 | assert not self.exec_process_meta.die_contains_addr(die, 4095)
119 | assert not self.exec_process_meta.die_contains_addr(die, 4118)
120 |
--------------------------------------------------------------------------------
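
The `find_fde` assertions above follow from the shape of `.eh_frame_hdr`: it is a table of (initial_location, FDE pointer) pairs sorted by address, so a lookup is a binary search plus a range check against the matched FDE's coverage. Conceptually (a sketch of the idea, not the parser's actual code):

    from bisect import bisect_right

    def find_fde(entries, pc):
        """entries: (start, end, fde) triples sorted by start address.
        Return the fde whose [start, end) range covers pc, else None."""
        starts = [start for start, _, _ in entries]
        i = bisect_right(starts, pc) - 1
        if i >= 0 and pc < entries[i][1]:
            return entries[i][2]
        return None  # pc lies before the first entry or past the matched range

This is why both `find_fde(0)` and `find_fde(0xFFFFFFFFF)` come back as None, while an address inside a covered function (4412) resolves to the FDE starting at 4409.
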
/tests/test_guctracer.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 | from time import sleep
3 | from unittest.mock import patch
4 |
5 | from pgtracer.ebpf.collector.guc import GUCTracerEventHandler
6 |
7 |
8 | def test_setting_one_guc(guctracer, connection):
9 | """
10 | Test to set a GUC in a running backend.
11 | """
12 | guc_has_been_set = False
13 | original_method = GUCTracerEventHandler.handle_GUCResponse
14 |
15 | def observe_guc_response(event_handler, collector, event, pid):
16 | nonlocal guc_has_been_set
17 | guc_has_been_set = True
18 | return original_method(event_handler, collector, event, pid)
19 |
20 | with patch(
21 | f"pgtracer.ebpf.collector.guc.GUCTracerEventHandler.handle_GUCResponse",
22 | observe_guc_response,
23 | ):
24 | # Set work_mem to 64kB
25 | guctracer.set_guc("work_mem", 64)
26 | start = datetime.now()
27 | while not guc_has_been_set and (datetime.now() - start) < timedelta(seconds=20):
28 | # Generate some activity to trigger the probe
29 | with connection.execute("SELECT 1") as cur:
30 | pass
31 | sleep(0.1)
32 | with connection.execute("show work_mem") as cur:
33 | result = cur.fetchall()
34 | val = result[0][0]
35 | # Depending on the version, it can come back as str or bytes
36 | if isinstance(val, bytes):
37 | val = val.decode("utf8")
38 | assert val == "64kB"
39 |
--------------------------------------------------------------------------------
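
The loop in `test_setting_one_guc` has to nudge the backend because the uprobe only fires when the traced process does work; polling with a deadline keeps the test bounded. The same pattern, factored into a hypothetical helper for illustration:

    from datetime import datetime, timedelta
    from time import sleep

    def wait_until(predicate, nudge=None, timeout=timedelta(seconds=20), interval=0.1):
        """Poll predicate() until it is true or the timeout elapses; optionally
        run nudge() each round to generate the activity the probe needs."""
        deadline = datetime.now() + timeout
        while datetime.now() < deadline:
            if predicate():
                return True
            if nudge is not None:
                nudge()
            sleep(interval)
        return predicate()
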
/tests/test_querytracer.py:
--------------------------------------------------------------------------------
1 | """
2 | This module acts as a general health check for the eBPF collector.
3 | """
4 | import re
5 | from collections import defaultdict
6 | from contextlib import ExitStack
7 | from datetime import timedelta
8 | from threading import Thread
9 | from time import sleep
10 | from unittest.mock import patch
11 |
12 | import pytest
13 | from flaky import flaky
14 |
15 | from pgtracer.ebpf.collector.querytracer import (
16 | InstrumentationFlags,
17 | QueryTracerEventHandler,
18 | )
19 | from pgtracer.utils import timespec_to_timedelta as tstimedelta
20 |
21 |
22 | def wait_for_collector(collector):
23 | """
24 | Wait for the collector to have at least one query.
25 | """
26 | tries = 0
27 | process_info = collector.event_handler.per_process_info[collector.pid]
28 | while len(process_info.query_history) == 0 and tries < 1000:
29 | tries += 1
30 | sleep(0.05)
31 |
32 |
33 | def test_basic_ebpf_collector(querytracer, connection):
34 | """
35 | Test that the most basic functionality of the eBPF collector works.
36 | """
37 | # Now try running a query, and see if we can get it back
38 | with connection.execute("SELECT now()") as cur:
39 | querystart = cur.fetchall()[0][0].replace(microsecond=0, tzinfo=None)
40 | wait_for_collector(querytracer)
41 | assert len(querytracer.event_handler.per_process_info) == 1
42 | process_info = querytracer.event_handler.per_process_info[querytracer.pid]
43 | assert len(process_info.query_history) == 1
44 | query = process_info.query_history[0]
45 | assert query.text == "SELECT now()"
46 | assert query.search_path == '"$user", public'
47 | assert query.start_datetime.replace(microsecond=0) == querystart
48 | assert query.runtime is None
49 | assert query.instrument.need_timer.value is False
50 | assert query.instrument.need_bufusage.value is False
51 | assert query.shared_buffers_hitratio is None
52 | assert query.syscache_hitratio is None
53 |
54 |
55 | def test_instrumentation(querytracer_instrumented, connection):
56 | """
57 | Test that turning instrumentation on works as expected.
58 | """
59 | connection.execute("SET track_io_timing = on")
60 | # We want to have at least a few system reads, so do what is necessary...
61 | with open("/proc/sys/vm/drop_caches", "wb") as procf:
62 | procf.write(b"1")
63 |
64 | with connection.execute("SELECT * FROM pg_attribute") as cur:
65 | cur.fetchall()
66 | wait_for_collector(querytracer_instrumented)
67 | assert len(querytracer_instrumented.event_handler.per_process_info) == 1
68 | process_info = querytracer_instrumented.event_handler.per_process_info[
69 | querytracer_instrumented.pid
70 | ]
71 |
72 | assert len(process_info.query_history) == 1
73 | query = process_info.query_history[0]
74 | assert query.instrument.need_timer.value is True
75 | assert query.instrument.need_bufusage.value is True
76 | assert query.runtime > timedelta(0)
77 | assert query.instrument.bufusage.shared_blks_hit.value > 0
78 | assert query.instrument.bufusage.shared_blks_read.value >= 0
79 | assert query.instrument.bufusage.temp_blks_read.value == 0
80 | assert query.instrument.bufusage.temp_blks_written.value == 0
81 | if connection.info.server_version >= 150000:
82 | assert tstimedelta(query.instrument.bufusage.temp_blk_read_time) == timedelta(0)
83 | assert tstimedelta(query.instrument.bufusage.temp_blk_write_time) == timedelta(
84 | 0
85 | )
86 | # We can't make any assumptions about the hit ratios, so just ensure they
87 | # have some valid values.
88 | assert 0 <= query.shared_buffers_hitratio < 100
89 | # The syscache_hitratio can be negative, when we actually end up reading
90 | # more blocks than what is accounted for by instrumentation.
91 | assert query.syscache_hitratio <= 100
92 |
93 | # Check that we don't crash without any instrumentation whatsoever
94 | query.instrument = None
95 | assert query.shared_buffers_hitratio is None
96 | assert query.syscache_hitratio is None
97 |
98 | # Generate some temp files for fun
99 | process_info.query_history = []
100 | connection.execute("SET work_mem = '64kB'")
101 | with connection.execute("SELECT * FROM generate_series(1, 10000) as t"):
102 | pass
103 | wait_for_collector(querytracer_instrumented)
104 | query = process_info.query_history[0]
105 | assert query.text == "SELECT * FROM generate_series(1, 10000) as t"
106 | assert query.instrument.bufusage.temp_blks_read.value > 0
107 | assert query.instrument.bufusage.temp_blks_written.value > 0
108 | if connection.info.server_version >= 150000:
109 | assert tstimedelta(query.instrument.bufusage.temp_blk_read_time) > timedelta(0)
110 | assert tstimedelta(query.instrument.bufusage.temp_blk_write_time) > timedelta(0)
111 |
112 | # Now do the same query with a big enough work_mem to trigger some memory allocations
113 | connection.execute("SET work_mem = '32MB'")
114 | process_info.query_history = []
115 | with connection.execute("SELECT * FROM generate_series(1, 10000) as t") as cur:
116 | pass
117 | wait_for_collector(querytracer_instrumented)
118 | query = process_info.query_history[0]
119 | # The split between sbrk / mmap, and whether sbrk is moved back to its
120 | # initial value, depend on malloc's state and configuration. So the best
121 | # we can test is that "something" happened.
122 | assert query.memallocs.current_mem_peak > 0
123 | # We can't assert anything meaningful about total_malloc but we can at least exercise the code
124 | assert query.memallocs.total_malloc is not None
125 |
126 |
127 | def test_plans(querytracer_instrumented, connection):
128 | """
129 | Test that we are able to build a plan.
130 | """
131 | with connection.execute(
132 | "SELECT * FROM (SELECT * FROM pg_class ORDER BY reltype LIMIT 10) t"
133 | ) as cur:
134 | cur.fetchall()
135 | wait_for_collector(querytracer_instrumented)
136 | process_info = querytracer_instrumented.event_handler.per_process_info[
137 | querytracer_instrumented.pid
138 | ]
139 | query = process_info.query_history[0]
140 | root_node = query.root_node
141 | NodeTag = querytracer_instrumented.metadata.enums.NodeTag
142 | assert root_node.tag == NodeTag.T_Limit
143 | assert len(root_node.children) == 1
144 | assert root_node.parent_node is None
145 | assert root_node.instrument.tuplecount.value == 10
146 |
147 | sort_node = list(root_node.children)[0]
148 | assert sort_node.tag == NodeTag.T_Sort
149 | assert len(sort_node.children) == 1
150 | assert sort_node.parent_node == root_node
151 | # FIXME: investigate why we can't fetch this value on ubuntu's PG11.
152 | if connection.info.server_version >= 120000:
153 | assert sort_node.instrument.tuplecount.value == 10
154 |
155 | seqscan_node = list(sort_node.children)[0]
156 | assert seqscan_node.tag == NodeTag.T_SeqScan
157 | assert len(seqscan_node.children) == 0
158 | assert seqscan_node.parent_node == sort_node
159 |
160 |
161 | def test_explain(querytracer, connection):
162 | """
163 | Test that we can render an EXPLAIN-like view of a collected plan.
164 | """
165 | # We have some trouble with collecting instrumentation for PG < 12
166 | if connection.info.server_version < 120000:
167 | return
168 | cost_snippet = r"\d+\.\d+\.\.\d+\.\d+"
169 | wanted_plan = rf"""Limit \(cost={cost_snippet} rows=10 width=\d+\) \(actual time=0.000...0.000 rows=0 loops=1\)
170 | \t-> Sort \(cost={cost_snippet} rows=\d+ width=\d+\) \(actual time=0.000...0.000 rows=0 loops=1\)
171 | \t\t-> SeqScan \(cost={cost_snippet} rows=\d+ width=\d+\) \(actual time=0.000...0.000 rows=0 loops=1\)"""
172 |
173 | with connection.execute(
174 | "SELECT * FROM (SELECT * FROM pg_class ORDER BY reltype LIMIT 10) t"
175 | ) as cur:
176 | cur.fetchall()
177 | wait_for_collector(querytracer)
178 | assert len(querytracer.event_handler.per_process_info) == 1
179 | process_info = querytracer.event_handler.per_process_info[querytracer.pid]
180 | query = process_info.query_history[0]
181 | root_node = query.root_node
182 | assert re.match(wanted_plan, root_node.explain())
183 |
184 |
185 | def background_query(connection, query):
186 | def execute_query():
187 | with connection.execute(query) as cur:
188 | cur.fetchall()
189 |
190 | newthread = Thread(target=execute_query)
191 | newthread.start()
192 | return newthread
193 |
194 |
195 | @pytest.mark.slow
196 | def test_long_query(querytracer_instrumented, connection):
197 | events = defaultdict(int)
198 |
199 | def event_handler_observer(method_name):
200 | original_method = getattr(QueryTracerEventHandler, method_name)
201 |
202 | def observe_event_handler(event_handler, bpf_collector, event, pid):
203 | events[method_name] += 1
204 | return original_method(event_handler, bpf_collector, event, pid)
205 |
206 | return observe_event_handler
207 |
208 | with ExitStack() as stack:
209 | for meth_name in (
210 | "handle_MemoryResponseNodeInstr",
211 | "handle_MemoryResponseQueryInstr",
212 | ):
213 | stack.enter_context(
214 | patch(
215 | f"pgtracer.ebpf.collector.querytracer.QueryTracerEventHandler.{meth_name}",
216 | event_handler_observer(meth_name),
217 | )
218 | )
219 | with connection.execute(
220 | """SELECT count(*) FROM (
221 | SELECT pg_sleep(0.01)
222 | FROM pg_class
223 | JOIN pg_attribute ON pg_class.oid = attrelid
224 | ) as s """
225 | ) as cur:
226 | cur.fetchall()
227 | wait_for_collector(querytracer_instrumented)
228 | assert events["handle_MemoryResponseQueryInstr"] > 0
229 | assert events["handle_MemoryResponseNodeInstr"] > 0
230 |
231 |
232 | @pytest.mark.slow
233 | @flaky(max_runs=5)
234 | def test_query_discovery(querytracer_factory, connection):
235 | """
236 | Test that query discovery picks up a query already running when the collector starts.
237 | """
238 | events = defaultdict(int)
239 |
240 | def event_handler_observer(method_name):
241 | original_method = getattr(QueryTracerEventHandler, method_name)
242 |
243 | def observe_event_handler(event_handler, bpf_collector, event, pid):
244 | events[method_name] += 1
245 | return original_method(event_handler, bpf_collector, event, pid)
246 |
247 | return observe_event_handler
248 |
249 | with ExitStack() as stack:
250 | for meth_name in ("handle_StackSample", "handle_MemoryNodeData"):
251 | stack.enter_context(
252 | patch(
253 | f"pgtracer.ebpf.collector.querytracer.QueryTracerEventHandler.{meth_name}",
254 | event_handler_observer(meth_name),
255 | )
256 | )
257 | thread = background_query(
258 | connection,
259 | """SELECT count(*) FROM (
260 | SELECT pg_sleep(0.01)
261 | FROM pg_class
262 | JOIN pg_attribute ON pg_class.oid = attrelid
263 | ) as s """,
264 | )
265 | # Now set up the collector.
266 | collector = None
267 | try:
268 | collector = querytracer_factory(
269 | instrument_flags=InstrumentationFlags.ALL,
270 | enable_perf_events=True,
271 | enable_query_discovery=True,
272 | enable_nodes_collection=True,
273 | sample_freq=1200,
274 | )
275 | # And wait for the query to finish
276 | thread.join()
277 | # Wait a few more seconds to make sure the collector has gathered everything
278 | sleep(3)
279 | finally:
280 | if collector is not None:
281 | collector.stop()
282 | assert events["handle_StackSample"] > 0
283 | assert events["handle_MemoryNodeData"] > 0
284 |
--------------------------------------------------------------------------------
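
`test_long_query` and `test_query_discovery` both count handler invocations by patching a wrapper over the original method; the tests inline it, but the pattern generalizes to any event handler. A compact sketch (`counting_patch` is illustrative, not repo code):

    from unittest.mock import patch

    def counting_patch(cls, method_name, counter):
        """Build a patcher that tallies calls to cls.method_name in
        counter[method_name] while delegating to the original method."""
        original = getattr(cls, method_name)

        def wrapper(*args, **kwargs):
            counter[method_name] += 1
            return original(*args, **kwargs)

        return patch(f"{cls.__module__}.{cls.__qualname__}.{method_name}", wrapper)

Used as `with counting_patch(QueryTracerEventHandler, "handle_StackSample", events): ...`, with `events = defaultdict(int)` as in the tests above.
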
/tests/test_stack_unwinding.py:
--------------------------------------------------------------------------------
1 | """
2 | This module tests the frame unwinding code.
3 | """
4 | import ctypes as ct
5 | import subprocess
6 | from pathlib import Path
7 | from unittest import TestCase
8 |
9 | from bcc import BPF
10 | from bcc.libbcc import lib as libbcc
11 | from pypsutil import Process
12 |
13 | from pgtracer.ebpf.collector import CODE_BASE_PATH
14 | from pgtracer.ebpf.dwarf import ProcessMetadata, die_name
15 | from pgtracer.ebpf.unwind import MAX_STACK_READ, UnwindAddressSpace, stack_data_t
16 |
17 | TEST_EBPF_PROGRAM = """
18 | /*
19 | * Fill in placeholders for generated defines
20 | */
21 | #define EVENTRING_PAGE_SIZE 1024
22 | #include "ebpf_maps.h"
23 | #include "stack.h"
24 |
25 | int capture_stack_enter(struct pt_regs *ctx)
26 | {
27 | struct stack_data_t* stack_data = event_ring.ringbuf_reserve(sizeof(struct
28 | stack_data_t));
29 | int i = 0, ret = 0;
30 | u64 maxread = MAX_STACK_READ;
31 | if (!stack_data) return -1;
32 | /* Retry the capture, halving the read size on each attempt. */
33 | while (i < 10) {
34 | ret = capture_stack(ctx, stack_data, maxread);
35 | i++;
36 | maxread = maxread / 2;
37 | }
38 | event_ring.ringbuf_submit(stack_data, 0);
39 | return 0;
40 | }
41 | """
42 |
43 |
44 | class TestStackUnwinding(TestCase):
45 | def setUp(self):
46 | self.captured_data = []
47 |
48 | def tearDown(self):
49 | for k in list(self.ebpf.uprobe_fds):
50 | self.ebpf.detach_uprobe_event(k)
51 |
52 | def _capture_data(self, cpu, data, size):
53 | content = stack_data_t()
54 | ct.pointer(content)[0] = ct.cast(data, ct.POINTER(stack_data_t)).contents
55 | self.captured_data.append(content)
56 |
57 | def test_simple_call_stack(self):
58 | # Load an eBPF program which will capture stacks.
59 | binpath = Path(__file__).parent / "test_bins" / "test_stack.main"
60 |
61 | # Run the program.
62 | program = subprocess.Popen([binpath], stdin=subprocess.PIPE)
63 | # Now get the stack base address for the program.
64 | pm = ProcessMetadata(Process(program.pid))
65 | bpf_prog = f"#define STACK_TOP_ADDR {pm.stack_top}\n"
66 | bpf_prog += f"#define MAX_STACK_READ {MAX_STACK_READ}\n"
67 | bpf_prog += TEST_EBPF_PROGRAM
68 |
69 | self.ebpf = BPF(
70 | text=bpf_prog.encode("utf8"),
71 | cflags=[f"-I{CODE_BASE_PATH}"],
72 | )
73 | self.ebpf.attach_uprobe(
74 | name=str(binpath).encode("utf8"),
75 | fn_name=b"capture_stack_enter",
76 | sym=b"func_1",
77 | )
78 | self.ebpf.attach_uprobe(
79 | name=str(binpath).encode("utf8"),
80 | fn_name=b"capture_stack_enter",
81 | sym=b"func_2",
82 | )
83 | self.ebpf[b"event_ring"].open_ring_buffer(self._capture_data)
84 | # Ok, now everything is ready for the program to actually run.
85 | program.communicate(input=b"C")
86 | # The program has now run to completion; poll the ring buffer and
87 | # check the captured stacks.
88 | self.ebpf.ring_buffer_poll()
89 | assert len(self.captured_data) == 2
90 |
91 | # First stack should be:
92 | # (???) libc
93 | # main
94 | # func_2
95 | address_space = UnwindAddressSpace(self.captured_data[0], pm)
96 | frames = list(address_space.frames())
97 | assert len(frames) == 3
98 | assert frames[0].region.path == str(binpath)
99 | assert die_name(frames[0].die) == "func_2"
100 | assert frames[1].region.path == str(binpath)
101 | assert die_name(frames[1].die) == "main"
102 | libname = Path(frames[2].region.path)
103 | # Remove all suffixes
104 | while libname.suffix != ".so":
105 | libname = libname.with_suffix("")
106 | assert libname.name == "libc.so"
107 | assert frames[2].die is None
108 |
109 | # Second stack should be:
110 | # (???) libc
111 | # main
112 | # func_2
113 | # func_1
114 | address_space = UnwindAddressSpace(self.captured_data[1], pm)
115 | frames = list(address_space.frames())
116 | assert len(frames) == 4
117 | assert frames[0].region.path == str(binpath)
118 | assert die_name(frames[0].die) == "func_1"
119 | assert frames[1].region.path == str(binpath)
120 | assert die_name(frames[1].die) == "func_2"
121 | assert frames[2].region.path == str(binpath)
122 | assert die_name(frames[2].die) == "main"
123 | libname = Path(frames[3].region.path)
124 | # Remove all suffixes
125 | while libname.suffix != ".so":
126 | libname = libname.with_suffix("")
127 | assert libname.name == "libc.so"
128 | assert frames[3].die is None
129 |
130 | # Check the argument values
131 | assert frames[0].fetch_arg(1, ct.c_int).value == 11
132 | assert frames[0].fetch_arg(2, ct.c_int).value == 22
133 | assert frames[1].fetch_arg(1, ct.c_int).value == 10
134 | assert frames[1].fetch_arg(2, ct.c_int).value == 20
135 |
--------------------------------------------------------------------------------
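
One detail of `_capture_data` above is worth isolating: the pointer BCC hands to a ring-buffer callback is only valid for the duration of the call, so the event must be deep-copied into a fresh ctypes struct before it is stored. The idiom on its own:

    import ctypes as ct

    def copy_event(data, struct_type):
        """Deep-copy a ring-buffer event into a fresh ctypes struct.
        `data` is the raw pointer BCC passes to the callback; the memory
        it points at is recycled once the callback returns."""
        out = struct_type()
        ct.pointer(out)[0] = ct.cast(data, ct.POINTER(struct_type)).contents
        return out
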