├── .github
    ├── CODEOWNERS
    ├── ISSUE_TEMPLATE
    │   ├── 01_question.md
    │   ├── 02_bug.md
    │   ├── 03_feature.md
    │   └── config.yml
    ├── PULL_REQUEST_TEMPLATE.md
    ├── dependabot.yml
    └── workflows
    │   ├── lint.yml
    │   ├── publish-pypi.yaml
    │   └── tests.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .pylintrc
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── SECURITY.md
├── mypy.ini
├── pyproject.toml
├── src
    └── pgtracer
    │   ├── __init__.py
    │   ├── ebpf
    │       ├── __init__.py
    │       ├── code
    │       │   ├── block_rq.c
    │       │   ├── data.h
    │       │   ├── ebpf_maps.h
    │       │   ├── gucset.c
    │       │   ├── memusage.c
    │       │   ├── perf.c
    │       │   ├── plan.c
    │       │   ├── program.c
    │       │   ├── stack.h
    │       │   └── utils.h
    │       ├── collector
    │       │   ├── __init__.py
    │       │   ├── c_defs.py
    │       │   ├── guc.py
    │       │   ├── querytracer.py
    │       │   └── utils.py
    │       ├── dwarf.py
    │       ├── eh_frame_hdr.py
    │       └── unwind.py
    │   ├── model
    │       ├── __init__.py
    │       ├── memory.py
    │       ├── plan.py
    │       └── query.py
    │   ├── scripts
    │       ├── pgtrace_gucs.py
    │       └── pgtrace_queries.py
    │   └── utils.py
└── tests
    ├── conftest.py
    ├── scripts
        └── setup_fedora_container.sh
    ├── test_bins
        ├── Makefile
        ├── test.elf
        ├── test.elf.c
        ├── test_stack.main
        └── test_stack.main.c
    ├── test_dwarf.py
    ├── test_guctracer.py
    ├── test_querytracer.py
    └── test_stack_unwinding.py


/.github/CODEOWNERS:
--------------------------------------------------------------------------------
1 | * @aiven/aiven-open-source
2 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/01_question.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: ❓ Ask a question
 3 | about: Got stuck or missing something from the docs? Ask away!
 4 | ---
 5 | 
 6 | # What can we help you with?
 7 | 
 8 | <!-- Try to explain your question with as much detail as you can provide. -->
 9 | 
10 | # Where would you expect to find this information?
11 | 
12 | <!-- Feel free to point us where with links or even proposing new sections or pages in the documentation. -->
13 | 
14 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/02_bug.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: 🐜 Report a bug
 3 | about: Spotted a problem? Let us know
 4 | ---
 5 | 
 6 | # What happened?
 7 | 
 8 | <!-- Try to be as precise as possible. If you can a small reproducer example would be great! -->
 9 | 
10 | # What did you expect to happen?
11 | 
12 | <!-- Please explain what would be the expected behavior for this particular case, ideally, with examples. -->
13 | 
14 | # What else do we need to know?
15 | 
16 | <!-- Include your platform, version, and any other information that seems relevant. -->
17 | 
18 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/03_feature.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: 💡 Feature suggestion
 3 | about: What would make this even better?
 4 | ---
 5 | 
 6 | # What is currently missing?
 7 | 
 8 | <!-- Please, describe what is currently missing and why should it be present in the project. -->
 9 | 
10 | # How could this be improved?
11 | 
12 | <!-- If you already know how this could be approached, please provide some brief explanation about it. -->
13 | 
14 | # Is this a feature you would work on yourself?
15 | 
16 | * [ ] I plan to open a pull request for this feature
17 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: true
2 | contact_links:
3 |   - name: Aiven Security Bug Bounty
4 |     url: https://hackerone.com/aiven_ltd
5 |     about: Our bug bounty program.
6 | 


--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
 1 | <!-- All contributors please complete these sections, including maintainers -->
 2 | # About this change - What it does
 3 | 
 4 | <!-- Provide a small sentence that summarizes the change. -->
 5 | 
 6 | <!-- Provide the issue number below if it exists. -->
 7 | Resolves: #xxxxx
 8 | 
 9 | # Why this way
10 | 
11 | <!-- Provide a small explanation on why this is the approach you took for solving this problem. -->
12 | 
13 | 


--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | # To get started with Dependabot version updates, you'll need to specify which
 2 | # package ecosystems to update and where the package manifests are located.
 3 | # Please see the documentation for all configuration options:
 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
 5 | 
 6 | version: 2
 7 | updates:
 8 |   - package-ecosystem: "pip" # See documentation for possible values
 9 |     directory: "/" # Location of package manifests
10 |     schedule:
11 |       interval: "weekly"
12 | 


--------------------------------------------------------------------------------
/.github/workflows/lint.yml:
--------------------------------------------------------------------------------
 1 | on:
 2 |   push:
 3 |     branches:
 4 |       - main
 5 |     tags:
 6 |       - '**'
 7 |   pull_request:
 8 | 
 9 | jobs:
10 | 
11 |   lint:
12 |     runs-on: ubuntu-22.04
13 |     strategy:
14 |       matrix:
15 |         # only use one version for the lint step
16 |         python-version: [3.9]
17 | 
18 |     steps:
19 | 
20 |       - id: checkout
21 |         uses: actions/checkout@v2
22 |         with:
23 |           # Do not persist the token during execution of this job.
24 |           persist-credentials: false
25 | 
26 |       - id: dependencies
27 |         run: |
28 |           # Must be installed via the system
29 |           sudo apt install python3-bpfcc python3-pip
30 |           pip install -U pip toml
31 |           pip install '.[lint]'
32 | 
33 |       - id: pylint
34 |         run: pylint --rcfile .pylintrc src/ || pylint-exit $? -efail
35 | 
36 |       - id: mypy
37 |         run: python -m mypy --strict src/ --python-version 3.8
38 | 
39 |       - id: validate-style
40 |         run: |
41 |           isort --recursive src/
42 |           black src/
43 |           if [ $(git diff --name-only --diff-filter=ACMR | wc -l ) != 0 ]; then
44 |             echo "Reformatting failed! Please run make fmt on your commits and resubmit!" 1>&2;
45 |             git diff;
46 |             exit 1;
47 |           fi
48 | 


--------------------------------------------------------------------------------
/.github/workflows/publish-pypi.yaml:
--------------------------------------------------------------------------------
 1 | # Based on https://packaging.python.org/guides/publishing-package-distribution-releases-using-github-actions-ci-cd-workflows/
 2 | 
 3 | name: Publish to PyPI
 4 | on:
 5 |   push:
 6 |     tags:
 7 |       - 'releases/**'
 8 | 
 9 | jobs:
10 |   build-n-publish:
11 |     name: Build and publish
12 |     runs-on: ubuntu-latest
13 | 
14 |     steps:
15 |     - uses: actions/checkout@v3
16 |       with:
17 |         fetch-depth: 0
18 | 
19 |     - name: Set up Python
20 |       uses: actions/setup-python@v4
21 |       with:
22 |         python-version: "3.8"
23 | 
24 |     - name: Install pypa/build
25 |       run: >-
26 |         python -m pip install build --user
27 |     - name: Build a binary wheel and a source tarball
28 |       run: >-
29 |         python -m
30 |         build
31 |         --sdist
32 |         --wheel
33 |         --outdir dist/
34 |         .
35 |     - name: Publish distribution to PyPI
36 |       if: startsWith(github.ref, 'refs/tags')
37 |       uses: pypa/gh-action-pypi-publish@release/v1
38 |       with:
39 |         password: ${{ secrets.PYPI_API_TOKEN }}
40 | 


--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
  1 | 
  2 | on:
  3 |   push:
  4 |     branches:
  5 |       - main
  6 |     tags:
  7 |       - '**'
  8 |   pull_request:
  9 | 
 10 | jobs:
 11 | 
 12 |   tests:
 13 |     runs-on: ubuntu-22.04
 14 |     # We don't fail on dev versions, as those are snapshots
 15 |     continue-on-error: ${{ matrix.experimental }}
 16 |     strategy:
 17 |       matrix:
 18 |         postgresql_version: [11, 12, 13, 14, ]
 19 |         experimental: [false]
 20 |         repo: ["pgdg"]
 21 |         pytest_args: ["-m 'not slow'"]
 22 |         include:
 23 |           # Define the current dev version to be experimental
 24 |           - postgresql_version: 16
 25 |             experimental: true
 26 |             repo: "pgdg-snapshot"
 27 |             pytest_args: "-m 'not slow'"
 28 |           # For latest stable version, include "slow" tests
 29 |           - postgresql_version: 15
 30 |             experimental: false
 31 |             repo: "pgdg"
 32 |             pytest_args: ""
 33 |     env:
 34 |       PGVERSION: ${{ matrix.postgresql_version }}
 35 |       DISTRO: ubuntu
 36 |     steps:
 37 | 
 38 |       - id: checkout
 39 |         uses: actions/checkout@v2
 40 |         with:
 41 |           # Do not persist the token during execution of this job.
 42 |           persist-credentials: false
 43 | 
 44 |       - id: dependencies
 45 |         run: |
 46 |           # Must be installed via the system
 47 |           sudo apt update
 48 |           sudo apt install curl ca-certificates gnupg
 49 |           sudo apt install python3-bpfcc python3-pip libunwind-dev linux-headers-$(uname -r)
 50 |           curl https://www.postgresql.org/media/keys/ACCC4CF8.asc | gpg --dearmor | sudo tee /etc/apt/trusted.gpg.d/apt.postgresql.org.gpg >/dev/null
 51 |           sudo sh -c 'echo "deb http://apt.postgresql.org/pub/repos/apt $(lsb_release -cs)-${{ matrix.repo }} main ${{ matrix.postgresql_version }}" > /etc/apt/sources.list.d/pgdg.list'
 52 |           sudo apt update
 53 |           # Install postgresql-common so that update alternatives doesn't fail
 54 |           sudo apt install postgresql-common postgresql-client-common
 55 |           sudo apt install postgresql-${{matrix.postgresql_version}} postgresql-${{matrix.postgresql_version}}-dbgsym
 56 |           sudo pip install -U pip toml
 57 |           # Install requirements from pyproject.toml
 58 |           sudo pip install -e '.[test]'
 59 | 
 60 |       - id: tests
 61 |         run: |
 62 |           sudo pytest --postgresql-exec /usr/lib/postgresql/${{matrix.postgresql_version}}/bin/pg_ctl --cov src/ --cov-report=xml ${{matrix.pytest_args}}
 63 | 
 64 |       - name: Upload coverage reports to Codecov
 65 |         uses: codecov/codecov-action@v3
 66 |         with:
 67 |           env_vars: PGVERSION
 68 |           fail_ci_if_error: true
 69 |           files: ./coverage.xml
 70 |           verbose: true
 71 |           name: codecov-umbrella
 72 | 
 73 |   tests_fedora_container:
 74 |     runs-on: ubuntu-22.04
 75 |     env:
 76 |       PGVERSION: 13
 77 |       DISTRO: fedora
 78 |     steps:
 79 |       - id: checkout
 80 |         uses: actions/checkout@v2
 81 |         with:
 82 |           persist-credentials: false
 83 |       - id: dependencies
 84 |         run: |
 85 |           sudo apt update
 86 |           sudo apt install dnf systemd-container
 87 |           sudo apt install postgresql-client
 88 |           sudo ./tests/scripts/setup_fedora_container.sh
 89 |           sudo apt install curl ca-certificates gnupg
 90 |           sudo apt install python3-bpfcc python3-pip libunwind-dev linux-headers-$(uname -r)
 91 |           # Also install it in the host, for the tests running outside the
 92 |           # container
 93 |           sudo pip install -U pip toml
 94 |           sudo pip install -e '.[test]'
 95 | 
 96 |       - id: fedora_tests
 97 |         run: |
 98 |           sudo pytest --postgresql-host 172.16.0.1 --container fedora --cov src/ --cov-report=xml -m "not slow"
 99 | 
100 |       - name: Upload coverage reports to Codecov
101 |         uses: codecov/codecov-action@v3
102 |         with:
103 |           env_vars: PGVERSION, DISTRO
104 |           fail_ci_if_error: true
105 |           files: ./coverage.xml
106 |           verbose: true
107 |           name: codecov
108 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | *.egg-info
3 | build
4 | *.tmp
5 | .coverage
6 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | repos:
 2 | - repo: https://github.com/pycqa/isort
 3 |   rev: 5.12.0
 4 |   hooks:
 5 |     - id: isort
 6 |       name: isort (python)
 7 | - repo: https://github.com/psf/black
 8 |   rev: 23.1.0
 9 |   hooks:
10 |     - id: black
11 | 


--------------------------------------------------------------------------------
/.pylintrc:
--------------------------------------------------------------------------------
1 | [MESSAGES CONTROL]
2 | disable=too-few-public-methods
3 | 


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
  1 | # Contributor Covenant Code of Conduct
  2 | 
  3 | ## Our Pledge
  4 | 
  5 | We as members, contributors, and leaders pledge to make participation in our
  6 | community a harassment-free experience for everyone, regardless of age, body
  7 | size, visible or invisible disability, ethnicity, sex characteristics, gender
  8 | identity and expression, level of experience, education, socio-economic status,
  9 | nationality, personal appearance, race, religion, or sexual identity
 10 | and orientation.
 11 | 
 12 | We pledge to act and interact in ways that contribute to an open, welcoming,
 13 | diverse, inclusive, and healthy community.
 14 | 
 15 | ## Our Standards
 16 | 
 17 | Examples of behavior that contributes to a positive environment for our
 18 | community include:
 19 | 
 20 | * Demonstrating empathy and kindness toward other people
 21 | * Being respectful of differing opinions, viewpoints, and experiences
 22 | * Giving and gracefully accepting constructive feedback
 23 | * Accepting responsibility and apologizing to those affected by our mistakes,
 24 |   and learning from the experience
 25 | * Focusing on what is best not just for us as individuals, but for the
 26 |   overall community
 27 | 
 28 | Examples of unacceptable behavior include:
 29 | 
 30 | * The use of sexualized language or imagery, and sexual attention or
 31 |   advances of any kind
 32 | * Trolling, insulting or derogatory comments, and personal or political attacks
 33 | * Public or private harassment
 34 | * Publishing others' private information, such as a physical or email
 35 |   address, without their explicit permission
 36 | * Other conduct which could reasonably be considered inappropriate in a
 37 |   professional setting
 38 | 
 39 | ## Enforcement Responsibilities
 40 | 
 41 | Community leaders are responsible for clarifying and enforcing our standards of
 42 | acceptable behavior and will take appropriate and fair corrective action in
 43 | response to any behavior that they deem inappropriate, threatening, offensive,
 44 | or harmful.
 45 | 
 46 | Community leaders have the right and responsibility to remove, edit, or reject
 47 | comments, commits, code, wiki edits, issues, and other contributions that are
 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation
 49 | decisions when appropriate.
 50 | 
 51 | ## Scope
 52 | 
 53 | This Code of Conduct applies within all community spaces, and also applies when
 54 | an individual is officially representing the community in public spaces.
 55 | Examples of representing our community include using an official e-mail address,
 56 | posting via an official social media account, or acting as an appointed
 57 | representative at an online or offline event.
 58 | 
 59 | ## Enforcement
 60 | 
 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be
 62 | reported to the community leaders responsible for enforcement at
 63 | opensource@aiven.io.
 64 | All complaints will be reviewed and investigated promptly and fairly.
 65 | 
 66 | All community leaders are obligated to respect the privacy and security of the
 67 | reporter of any incident.
 68 | 
 69 | ## Enforcement Guidelines
 70 | 
 71 | Community leaders will follow these Community Impact Guidelines in determining
 72 | the consequences for any action they deem in violation of this Code of Conduct:
 73 | 
 74 | ### 1. Correction
 75 | 
 76 | **Community Impact**: Use of inappropriate language or other behavior deemed
 77 | unprofessional or unwelcome in the community.
 78 | 
 79 | **Consequence**: A private, written warning from community leaders, providing
 80 | clarity around the nature of the violation and an explanation of why the
 81 | behavior was inappropriate. A public apology may be requested.
 82 | 
 83 | ### 2. Warning
 84 | 
 85 | **Community Impact**: A violation through a single incident or series
 86 | of actions.
 87 | 
 88 | **Consequence**: A warning with consequences for continued behavior. No
 89 | interaction with the people involved, including unsolicited interaction with
 90 | those enforcing the Code of Conduct, for a specified period of time. This
 91 | includes avoiding interactions in community spaces as well as external channels
 92 | like social media. Violating these terms may lead to a temporary or
 93 | permanent ban.
 94 | 
 95 | ### 3. Temporary Ban
 96 | 
 97 | **Community Impact**: A serious violation of community standards, including
 98 | sustained inappropriate behavior.
 99 | 
100 | **Consequence**: A temporary ban from any sort of interaction or public
101 | communication with the community for a specified period of time. No public or
102 | private interaction with the people involved, including unsolicited interaction
103 | with those enforcing the Code of Conduct, is allowed during this period.
104 | Violating these terms may lead to a permanent ban.
105 | 
106 | ### 4. Permanent Ban
107 | 
108 | **Community Impact**: Demonstrating a pattern of violation of community
109 | standards, including sustained inappropriate behavior,  harassment of an
110 | individual, or aggression toward or disparagement of classes of individuals.
111 | 
112 | **Consequence**: A permanent ban from any sort of public interaction within
113 | the community.
114 | 
115 | ## Attribution
116 | 
117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage],
118 | version 2.0, available at
119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
120 | 
121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct
122 | enforcement ladder](https://github.com/mozilla/diversity).
123 | 
124 | [homepage]: https://www.contributor-covenant.org
125 | 
126 | For answers to common questions about this code of conduct, see the FAQ at
127 | https://www.contributor-covenant.org/faq. Translations are available at
128 | https://www.contributor-covenant.org/translations.
129 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Welcome!
 2 | 
 3 | Contributions are very welcome on pgtracer. When contributing please keep this in mind:
 4 | 
 5 | - Open an issue to discuss new bigger features.
 6 | - Write code consistent with the project style and make sure the tests are passing.
 7 | - Stay in touch with us if we have follow up questions or requests for further changes.
 8 | 
 9 | # Development
10 | 
11 | ## Local Environment
12 | 
13 | 
14 | ## Tests
15 | 
16 | 
17 | ## Static checking and Linting
18 | 
19 | 
20 | ## Manual testing
21 | 
22 | 
23 | ### Configuration
24 | 
25 | 
26 | # Opening a PR
27 | 
28 | - Commit messages should describe the changes, not the filenames. Win our admiration by following
29 |   the [excellent advice from Chris Beams](https://chris.beams.io/posts/git-commit/) when composing
30 |   commit messages.
31 | - Choose a meaningful title for your pull request.
32 | - The pull request description should focus on what changed and why.
33 | - Check that the tests pass (and add test coverage for your changes if appropriate).
34 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | PGTracer
 2 | Copyright (C) 2022 Aiven
 3 | 
 4 | This program is free software: you can redistribute it and/or modify
 5 | it under the terms of the GNU General Public License as published by
 6 | the Free Software Foundation, either version 3 of the License, or
 7 | (at your option) any later version.
 8 | 
 9 | This program is distributed in the hope that it will be useful,
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12 | GNU General Public License for more details.
13 | 
14 | You should have received a copy of the GNU General Public License
15 | along with this program.  If not, see <http://www.gnu.org/licenses/>.
16 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include src/pgtracer/ebpf/code/*.c
2 | include src/pgtracer/ebpf/code/*.h
3 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | PGTracer
  2 | ========
  3 | 
  4 | PGTracer is a collection of tools to trace queries, execution plans and more in
  5 | PostgreSQL®, using eBPF.
  6 | 
  7 | Overview
  8 | ========
  9 | 
 10 | PGTracer offers a way to instrument PostgreSQL, using the Linux eBPF facility.
 11 | As it does advanced memory access, it needs the PostgreSQL debug symbols to
 12 | resolve symbols and offsets in structs.
 13 | 
 14 | Features
 15 | ============
 16 | 
 17 | * Attach to a running PostgreSQL backend, and dump every executed query along
 18 |   with it's search path
 19 | * Optionally turn on instrumentation (just like EXPLAIN ANALYZE does) to gather
 20 |   more information
 21 | 
 22 | Planned features:
 23 | * Gather information about individual execution nodes to print query plans
 24 | * Gather system information and link them to individual nodes (think syscalls,
 25 |   IO, memory allocation...)
 26 | * Build a TUI to explore the data
 27 | * Allow to follow a transaction
 28 | 
 29 | 
 30 | Install
 31 | ============
 32 | 
 33 | You will need a running PostgreSQL install, and it's debug symbols.
 34 | 
 35 | For pgtracer itself you will need:
 36 |  - libunwind installed on the system
 37 |  - the [BPF Compiler Collection](https://github.com/iovisor/bcc/blob/master/INSTALL.md)
 38 |  - several python packages as dependencies:
 39 |    - `psutil`
 40 |    - `pyelftools`
 41 | 
 42 | Support will vary depending on your Linux distribution, kernel version, and
 43 | library versions, as well as how PostgreSQL was compiled.
 44 | 
 45 | Please file a bug if it doesn't work as expected.
 46 | 
 47 | Ubuntu
 48 | ------------
 49 | 
 50 | To install the debug symbols, install the `postgresql-version-dbgsym` package. You may have to enable additional repositories though.
 51 | 
 52 | To run pgtracer you will need some python packages as well as packages only available from the repos.
 53 | 
 54 | ```
 55 | apt install python3-bpfcc python3-pip libunwind-dev
 56 | ```
 57 | 
 58 | Then upgrade pip using pip:
 59 | 
 60 | ```
 61 | pip install pip --upgrade
 62 | ```
 63 | 
 64 | And you are now ready to install the pgtracer package itself:
 65 | 
 66 | ```
 67 | git clone https://github.com/aiven/pgtracer.git
 68 | cd pgtracer
 69 | pip install .
 70 | ```
 71 | 
 72 | 
 73 | Fedora
 74 | ---------
 75 | 
 76 | To install the debugging symbols:
 77 | 
 78 | ```
 79 | yum install dnf-utils
 80 | debuginfo-install postgresql-server
 81 | ```
 82 | 
 83 | For the dependencies:
 84 | 
 85 | ```
 86 | yum install python3-bcc libunwind  python3-pip libunwind-devel
 87 | ```
 88 | 
 89 | Then install pgtracer itself:
 90 | 
 91 | ```
 92 | git clone https://github.com/aiven/pgtracer.git
 93 | cd pgtracer
 94 | pip install pip --upgrade
 95 | pip install .
 96 | ```
 97 | 
 98 | 
 99 | 
100 | Arch Linux
101 | ------------
102 | 
103 | To install PostgreSQL debug symbols, as root:
104 | 
105 | ```
106 | pacman -S debuginfod
107 | export DEBUGINFOD_URLS="https://debuginfod.archlinux.org/"
108 | debuginfod-find debuginfo /usr/bin/postgres
109 | ```
110 | 
111 | To install the required packages:
112 | 
113 | ```
114 | pacman -S python-bcc libunwind python-pip
115 | ```
116 | 
117 | Then install the pgtracer package itself:
118 | 
119 | ```
120 | git clone https://github.com/aiven/pgtracer.git
121 | cd pgtracer
122 | pip install .
123 | ```
124 | 
125 | 
126 | Usage
127 | =============
128 | 
129 | Currently, only one script comes with pgtracer: `pgtrace_queries`.
130 | Since pgtracer uses eBPF, it needs to be run as root.
131 | 
132 | ```
133 | usage: pgtrace_queries [-h] [--instrument [{TIMER,BUFFERS,ROWS,WAL,ALL} ...]] [--nodes-collection] pid
134 | 
135 | Dump a running backend execution plan
136 | 
137 | positional arguments:
138 |   pid                   PID to connect to
139 | 
140 | options:
141 |   -h, --help            show this help message and exit
142 |   --instrument [{TIMER,BUFFERS,ROWS,WAL,ALL} ...], -I [{TIMER,BUFFERS,ROWS,WAL,ALL} ...]
143 |                         Instrument flags to set. (warning: writes into backends memory!)
144 |   --nodes-collection, -n
145 |                         Collect information about individual execution nodes
146 | ```
147 | 
148 | 
149 | 
150 | Depending on the way the PostgreSQL binary have been compiled, you may need a
151 | more recent pyelftools version than what is packaged with your distribution:
152 | DWARF5 support is quite recent and continuously improving.
153 | 
154 | 
155 | 
156 | 
157 | 
158 | License
159 | =======
160 | pgtracer is licensed under the PostgreSQL license. Full license text is available in the [LICENSE](LICENSE) file.
161 | 
162 | Please note that the project explicitly does not require a CLA (Contributor License Agreement) from its contributors.
163 | 
164 | Contact
165 | ============
166 | Bug reports and patches are very welcome, please post them as GitHub issues and pull requests at https://github.com/aiven/pgtracer .
167 | To report any possible vulnerabilities or other serious issues please see our [security](SECURITY.md) policy.
168 | 
169 | Trademarks
170 | ==========
171 | 
172 | The terms Postgres and PostgreSQL are registered trademarks of the PostgreSQL Community Association of Canada.
173 | 


--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
 1 | # Security Policy
 2 | 
 3 | ## Supported Versions
 4 | 
 5 | We release patches for security vulnerabilities. Which versions are eligible
 6 | to receive such patches depend on the CVSS v3.0 Rating:
 7 | 
 8 | | CVSS v3.0 | Supported Versions                        |
 9 | | --------- | ----------------------------------------- |
10 | | 4.0-10.0  | Most recent release                       |
11 | 
12 | ## Reporting a Vulnerability
13 | 
14 | Please report (suspected) security vulnerabilities to our **[bug bounty
15 | program](https://hackerone.com/aiven_ltd)**. You will receive a response from
16 | us within 2 working days. If the issue is confirmed, we will release a patch as
17 | soon as possible depending on impact and complexity.
18 | 
19 | ## Qualifying Vulnerabilities
20 | 
21 | Any reproducible vulnerability that has a severe effect on the security or
22 | privacy of our users is likely to be in scope for the program.
23 | 
24 | We generally **aren't** interested in the following issues:
25 | * Social engineering (e.g. phishing, vishing, smishing) attacks
26 | * Brute force, DoS, text injection
27 | * Missing best practices such as HTTP security headers (CSP, X-XSS, etc.),
28 |   email (SPF/DKIM/DMARC records), SSL/TLS configuration.
29 | * Software version disclosure / Banner identification issues / Descriptive
30 |   error messages or headers (e.g. stack traces, application or server errors).
31 | * Clickjacking on pages with no sensitive actions
32 | * Theoretical vulnerabilities where you can't demonstrate a significant
33 |   security impact with a proof of concept.
34 | 


--------------------------------------------------------------------------------
/mypy.ini:
--------------------------------------------------------------------------------
 1 | [mypy]
 2 | python_version = 3.7
 3 | 
 4 | [mypy-elftools.*]
 5 | ignore_missing_imports = True
 6 | 
 7 | [mypy-psutil.*]
 8 | ignore_missing_imports = True
 9 | 
10 | [mypy-bcc.*]
11 | ignore_missing_imports = True
12 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "pgtracer"
 3 | description = "Tracing tools for PostgreSQL"
 4 | version = "0.1.0"
 5 | authors = [
 6 |     { name = "Ronan Dunklau", email = "ronan.dunklau@aiven.com" }
 7 | ]
 8 | dependencies = [
 9 |     "pyelftools",
10 |     "pypsutil"
11 | ]
12 | readme = "README.md"
13 | requires-python = ">=3.7"
14 | classifiers = [
15 |     "Programming Language :: Python :: 3",
16 |     "License :: OSI Approved :: PostgreSQL License",
17 |     "Operating System :: POSIX :: Linux",
18 | ]
19 | 
20 | 
21 | [project.scripts]
22 | pgtrace_queries = "pgtracer.scripts.pgtrace_queries:main"
23 | pgtrace_gucs = "pgtracer.scripts.pgtrace_gucs:main"
24 | 
25 | [project.optional-dependencies]
26 | lint = [
27 |   'black',
28 |   'isort',
29 |   'mypy',
30 |   'pylint',
31 |   'pylint-exit',
32 | ]
33 | 
34 | test = [
35 |   'psycopg',
36 |   'pytest',
37 |   'pytest-coverage',
38 |   'pytest-postgresql',
39 |   'flaky'
40 | ]
41 | 
42 | [tool.isort]
43 | profile = "black"
44 | 


--------------------------------------------------------------------------------
/src/pgtracer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Aiven-Open/pgtracer/5faf30a695f7b45711e987df8a82c904a36e581f/src/pgtracer/__init__.py


--------------------------------------------------------------------------------
/src/pgtracer/ebpf/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Aiven-Open/pgtracer/5faf30a695f7b45711e987df8a82c904a36e581f/src/pgtracer/ebpf/__init__.py


--------------------------------------------------------------------------------
/src/pgtracer/ebpf/code/block_rq.c:
--------------------------------------------------------------------------------
 1 | #include <linux/blkdev.h>
 2 | #include "data.h"
 3 | #include "utils.h"
 4 | 
 5 | struct io_req_data_t {
 6 | 	event_base event_base;
 7 | 	char	rwbs[8];
 8 | 	u64		bytes;
 9 | };
10 | 
11 | 
12 | TRACEPOINT_PROBE(block, block_rq_issue)
13 | {
14 | 	struct io_req_data_t *event;
15 | 	##CHECK_POSTMASTER##
16 | 	/* We need to filter on pid ourselves inside syscalls. */
17 | #ifdef PID
18 | 	if (bpf_get_current_pid_tgid() >> 32 != PID)
19 | 		return 1;
20 | #endif
21 | 
22 | 	event = event_ring.ringbuf_reserve(sizeof(struct io_req_data_t));
23 | 	if (!event)
24 | 		return 1;
25 | 
26 | 	fill_event_base(&(event->event_base), EventTypeKBlockRqIssue);
27 | 	event->bytes = args->nr_sector << 9;
28 |     if (event->bytes == 0) {
29 |         event->bytes = args->bytes;
30 |     }
31 | 	bpf_probe_read(&event->rwbs, sizeof(event->rwbs), args->rwbs);
32 | 	event_ring.ringbuf_submit(event, 0);
33 |     return 0;
34 | }
35 | 


--------------------------------------------------------------------------------
/src/pgtracer/ebpf/code/data.h:
--------------------------------------------------------------------------------
 1 | #ifndef DATA_H
 2 | #define DATA_H
 3 | #include "stack.h"
 4 | 
 5 | typedef struct event_base {
 6 | 	short event_type;
 7 | 	int pid;
 8 | } event_base;
 9 | 
10 | typedef struct Id128 {
11 | 	u64 u1;
12 | 	u64 u2;
13 | } Id128;
14 | 
15 | struct portal_data_t {
16 | 	event_base event_base;
17 | 	Id128 portal_key;
18 | 	u64 queryAddr;
19 | 	u64 query_id;
20 | 	double startup_cost;
21 | 	double total_cost;
22 | 	double plan_rows;
23 | 	char query[MAX_QUERY_LENGTH]; // Dynamically injected using defines
24 | 	char instrument[STRUCT_SIZE_Instrumentation]; // Dynamically injected using defines
25 | 	char search_path[MAX_SEARCHPATH_LENGTH];
26 | };
27 | 
28 | struct plan_data_t {
29 | 	u64 plan_addr;
30 | 	int plan_tag;
31 | 	double startup_cost;
32 | 	double total_cost;
33 | 	double plan_rows;
34 | 	int plan_width;
35 | 	bool parallel_aware;
36 | };
37 | 
38 | struct planstate_data_t {
39 | 	event_base event_base;
40 | 	Id128 portal_key;
41 | 	u64 planstate_addr;
42 | 	int planstate_tag;
43 | 	u64 lefttree;
44 | 	u64 righttree;
45 | 	struct plan_data_t plan_data;
46 | 	char instrument[STRUCT_SIZE_Instrumentation]; // Dynamically injected using defines
47 | 	struct stack_data_t stack_capture;
48 | };
49 | 
50 | #endif
51 | 


--------------------------------------------------------------------------------
/src/pgtracer/ebpf/code/ebpf_maps.h:
--------------------------------------------------------------------------------
1 | #ifndef EBPF_MAPS_H
2 | #define EBPF_MAPS_H
3 | /* Main ringbuf for communicating events to user space. */
4 | BPF_RINGBUF_OUTPUT(event_ring, EVENTRING_PAGE_SIZE);
5 | 
6 | #endif
7 | 


--------------------------------------------------------------------------------
/src/pgtracer/ebpf/code/gucset.c:
--------------------------------------------------------------------------------
 1 | #include "ebpf_maps.h"
 2 | 
 3 | struct guc_request_t {
 4 | 	u64 guc_location;
 5 | 	int guc_size;
 6 | 	char payload[GUC_MAX_LENGTH];
 7 | };
 8 | 
 9 | struct guc_response_t {
10 | 	short event_type;
11 | 	u64 guc_location;
12 | 	bool status;
13 | };
14 | 
15 | BPF_QUEUE(gucs_to_set, struct guc_request_t, 128);
16 | 
17 | 
18 | /* This will be attached at various points in the program flow,
19 |  * to override GUCs as seen fit.
20 |  * */
21 | int process_guc_uprobe(struct pt_regs *ctx)
22 | {
23 | 	struct guc_request_t guc_request;
24 | 	struct guc_response_t *guc_response;
25 | 	int i = 0;
26 | 	int size = 0;
27 | 	int ret;
28 | 	while (i < 20)
29 | 	{
30 | 		guc_response = event_ring.ringbuf_reserve(sizeof(struct guc_response_t));
31 | 		if (!guc_response)
32 | 			return 1;
33 | 		guc_response->event_type = EventTypeGUCResponse;
34 | 
35 | 		/* If no resquest to process, bail out */
36 | 		if (gucs_to_set.pop(&guc_request) < 0)
37 | 		{
38 | 			event_ring.ringbuf_discard(guc_response, 0);
39 | 			return 1;
40 | 		}
41 | 		guc_response->guc_location = guc_request.guc_location;
42 | 		size = guc_request.guc_size;
43 | 		clamp_umax(size, GUC_MAX_LENGTH);
44 | 		ret = -1;
45 | 		if (size > 0 && guc_request.guc_size <= GUC_MAX_LENGTH)
46 | 			ret = bpf_probe_write_user((void *) guc_request.guc_location, &(guc_request.payload), size);
47 | 		guc_response->status = (ret >= 0);
48 | 		event_ring.ringbuf_submit(guc_response, 0);
49 | 		i++;
50 | 	}
51 | 	return 0;
52 | }
53 | 


--------------------------------------------------------------------------------
/src/pgtracer/ebpf/code/memusage.c:
--------------------------------------------------------------------------------
 1 | #include "ebpf_maps.h"
 2 | #include "stack.h"
 3 | #include "linux/sched.h"
 4 | #include "utils.h"
 5 | #include "data.h"
 6 | 
 7 | #define offsetof(type, member)  __builtin_offsetof (type, member)
 8 | 
 9 | 
10 | struct memory_account_t {
11 | 	event_base event_base;
12 | 	long long size;
13 | 	short kind;
14 | };
15 | 
16 | 
17 | static inline int send_memory_account(long long size, short kind)
18 | {
19 | 	struct memory_account_t *account = event_ring.ringbuf_reserve(sizeof(struct memory_account_t));
20 | 	if (!account)
21 | 		return 1;
22 | 	fill_event_base(&(account->event_base), EventTypeMemoryAccount);
23 | 	account->size = size;
24 | 	account->kind = kind;
25 | 	event_ring.ringbuf_submit(account, 0);
26 | 	return 0;
27 | }
28 | 
29 | /*
30 |  * sbrk moves are instrumented through the convenient tracepoints.
31 |  */
32 | int sbrk_more(struct pt_regs *ctx)
33 | {
34 |     ##CHECK_POSTMASTER##
35 | 	size_t size;
36 | 	bpf_usdt_readarg(2, ctx, &size);
37 | 	return send_memory_account(size, MemoryAllocTypeSbrk);
38 | }
39 | 
40 | int sbrk_less(struct pt_regs *ctx)
41 | {
42 |     ##CHECK_POSTMASTER##
43 | 	size_t size;
44 | 	bpf_usdt_readarg(2, ctx, &size);
45 | 	return send_memory_account(-size, MemoryAllocTypeSbrk);
46 | }
47 | 
48 | /* 
49 |  * glibc doesn't offer tracepoints for mmap, so instrument the functions directly.
50 |  */
51 | 
52 | int mmap_enter(struct pt_regs *ctx)
53 | {
54 | 	##CHECK_POSTMASTER##
55 | 	size_t size = PT_REGS_PARM2(ctx);
56 | 	return send_memory_account(size, MemoryAllocTypeMmap);
57 | }
58 | 
59 | int munmap_enter(struct pt_regs *ctx)
60 | {
61 | 	##CHECK_POSTMASTER##
62 | 	size_t size = PT_REGS_PARM2(ctx);
63 | 	return send_memory_account(-size, MemoryAllocTypeMmap);
64 | }
65 | 


--------------------------------------------------------------------------------
/src/pgtracer/ebpf/code/perf.c:
--------------------------------------------------------------------------------
  1 | #include "ebpf_maps.h"
  2 | #include "stack.h"
  3 | #include "uapi/linux/bpf_perf_event.h"
  4 | #include "utils.h"
  5 | #include "data.h"
  6 | 
  7 | struct memory_request_t {
  8 | 	short event_type;
  9 |     Id128 request_id;
 10 | 	int path_size;
 11 |     u64 size;
 12 | 	u64 memory_path[MEMORY_PATH_SIZE];
 13 | };
 14 | 
 15 | struct memory_response_t {
 16 | 	event_base event_base;
 17 |     Id128 request_id;
 18 |     char payload[MEMORY_REQUEST_MAXSIZE];
 19 | };
 20 | 
 21 | /* 
 22 |  * We embed the whole portal_data_t 
 23 |  */
 24 | struct stack_sample_t {
 25 | 	struct portal_data_t portal_data;
 26 | 	struct stack_data_t stack_data;
 27 | };
 28 | 
 29 | # define QUERY_DISCOVERY_KEY 1
 30 | # define NODE_DISCOVERY_KEY 2
 31 | BPF_HASH(discovery_enabled, int, bool, 2);
 32 | 
 33 | BPF_QUEUE(memory_requests, struct memory_request_t, 1024);
 34 | /* Define one queue per process */
 35 | #if LIBBCC_VERSION_GEQ(0, 21, 0)
 36 | BPF_HASH_OF_MAPS(pid_queues, int, "memory_requests", 1024);
 37 | #else
 38 | BPF_HASH_OF_MAPS(pid_queues, "memory_requests", 1024);
 39 | #endif
 40 | 
 41 | /*
 42 |  * This code is run on perf event, with a specific frequency.
 43 |  * What we want is to be able to read specific memory locations whenever the perf
 44 |  * event is triggered.
 45 |  *
 46 |  * Userland code pushes memory locations to read to the memory_requests queues,
 47 |  * and sends the responses back through the same event_ringbuffer used
 48 |  * everywhere.
 49 |  */
 50 | int perf_event(struct bpf_perf_event_data *ctx)
 51 | {
 52 | 	##CHECK_POSTMASTER##
 53 |     struct memory_request_t request;
 54 | 	struct memory_response_t *response;
 55 | 	u64 size;
 56 | 	u64 memory_location;
 57 | 	int pid = (bpf_get_current_pid_tgid() >> 32);
 58 | 	int i = 0;
 59 | 	int j;
 60 | 	void * queue;
 61 | #ifdef ENABLE_QUERY_DISCOVERY
 62 | 	int key = QUERY_DISCOVERY_KEY;
 63 | 	bool *need_discovery;
 64 | 	need_discovery = discovery_enabled.lookup(&key);
 65 | 	bool need_query = (need_discovery && *need_discovery);
 66 | 	key = NODE_DISCOVERY_KEY;
 67 | 	need_discovery = discovery_enabled.lookup(&key);
 68 | 	bool need_node = (need_discovery && *need_discovery);
 69 | 	if (need_query || need_node)
 70 | 	{
 71 | 		void *activeportal = 0;
 72 | 		bpf_probe_read_user(&activeportal,
 73 | 							sizeof(void*),
 74 | 							(void *) GlobalVariablesActivePortal);
 75 | 		/* Only proceed if we have a current query. */
 76 | 		if(activeportal != 0)
 77 | 		{
 78 | 			struct stack_sample_t *stack_sample = event_ring.ringbuf_reserve(sizeof(struct stack_sample_t));
 79 | 
 80 | 			/* 
 81 | 			 * If we can't allocate for the stack sample, we keep going to the memory request code.
 82 | 			 */
 83 | 			if (stack_sample)
 84 | 			{
 85 | 				fill_event_base(&(stack_sample->portal_data.event_base), EventTypeStackSample);
 86 | 				if (need_query)
 87 | 				{
 88 | 					void *queryDesc = 0;
 89 | 					bpf_probe_read_user(&queryDesc, sizeof(void *),
 90 | 										OffsetFrom(activeportal, PortalData, queryDesc));
 91 | 					fill_portal_data(queryDesc, &stack_sample->portal_data);
 92 | 					stack_sample->portal_data.portal_key = get_portal_key(activeportal);
 93 | 				}
 94 | 				if (need_node)
 95 | 				{
 96 | 					capture_stack(&(ctx->regs), &(stack_sample->stack_data), MAX_STACK_READ);
 97 | 				}
 98 | 				event_ring.ringbuf_submit(stack_sample, 0);
 99 | 			}
100 | 		}
101 | 	}
102 | #endif
103 | 	queue = pid_queues.lookup(&pid);
104 | 	if (!queue)
105 | 		return 0;
106 | 	while (i < 5)
107 | 	{
108 | 
109 | 		/* No more requests to process. */
110 | 		if (bpf_map_pop_elem(queue, &request) < 0)
111 | 		{
112 | 			return 0;
113 | 		}
114 | 
115 | 		size = request.size;
116 | 		/* We treat those specially, as we have the opportunity to gather a bunch of
117 | 		 * data at the same time.
118 | 		 */
119 | 		if (request.event_type == EventTypeMemoryNodeData)
120 | 		{
121 | 			struct planstate_data_t *response = event_ring.ringbuf_reserve(sizeof(struct planstate_data_t));
122 | 			if (!response)
123 | 				return 1;
124 | 			fill_event_base(&(response->event_base), EventTypeMemoryNodeData);
125 | 			record_node((void *) request.memory_path[0], response, NULL, false);
126 | 			event_ring.ringbuf_submit(response, 0);
127 | 			i++;
128 | 			continue;
129 | 		}
130 | 		response = event_ring.ringbuf_reserve(sizeof(struct memory_response_t));
131 | 		if (!response)
132 | 			return 1;
133 | 
134 | 		fill_event_base(&(response->event_base), request.event_type);
135 | 		if (size >= MEMORY_REQUEST_MAXSIZE)
136 | 			size = MEMORY_REQUEST_MAXSIZE;
137 | 		/*
138 | 		 * request.path_size can't be greater than MEMORY_PATH_SIZE,
139 | 		 * but the eBPF verifier doesn't know this.
140 | 		 */
141 | 		memory_location = 0;
142 | 		j = 0;
143 | 		/* Chase pointers as needed */
144 | 		while(j < request.path_size - 1 && j < MEMORY_PATH_SIZE)
145 | 		{
146 | 			if (memory_location != 0)
147 | 			{
148 | 				if(bpf_probe_read_user(&memory_location, sizeof(u64),
149 | 													  (void *) memory_location))
150 | 				{
151 | 					/* We failed to read here, so bail out. */
152 | 					event_ring.ringbuf_discard(response, 0);
153 | 					return 0;
154 | 				}
155 | 			}
156 | 			memory_location = request.memory_path[j] + memory_location;
157 | 			j++;
158 | 		}
159 | 		if (bpf_probe_read_user(&response->payload, size, (void *) memory_location))
160 | 		{
161 | 			event_ring.ringbuf_discard(response, 0);
162 | 		} else {
163 | 			response->request_id = request.request_id;
164 | 			event_ring.ringbuf_submit(response, 0);
165 | 		}
166 | 		i++;
167 | 	}
168 | 	return 0;
169 | }
170 | 


--------------------------------------------------------------------------------
/src/pgtracer/ebpf/code/plan.c:
--------------------------------------------------------------------------------
 1 | #include "data.h"
 2 | #include "utils.h"
 3 | #include "stack.h"
 4 | 
 5 | int execprocnodefirst_enter(struct pt_regs *ctx);
 6 | int execendnode_enter(struct pt_regs *ctx);
 7 | 
 8 | /* 
 9 |  * On each first execution of a node, send the node information to the client
10 |  * side
11 |  */
12 | int execprocnodefirst_enter(struct pt_regs *ctx)
13 | {
14 | 	##CHECK_POSTMASTER##
15 | 	struct planstate_data_t *node;
16 | 	node = event_ring.ringbuf_reserve(sizeof(struct planstate_data_t));
17 | 	if (!node)
18 | 		return 0;
19 | 	fill_event_base(&(node->event_base), EventTypeExecProcNodeFirst);
20 | 	record_node((void *) PT_REGS_PARM1(ctx), node, ctx, true);
21 | 	event_ring.ringbuf_submit(node, 0);
22 | 	return 0;
23 | }
24 | 
25 | /*
26 |  * On each node teardown, send the node information to the client side (again)
27 |  */
28 | int execendnode_enter(struct pt_regs *ctx)
29 | {
30 | 	##CHECK_POSTMASTER##
31 | 	struct planstate_data_t *node;
32 | 	node = event_ring.ringbuf_reserve(sizeof(struct planstate_data_t));
33 | 	if (!node)
34 | 		return 0;
35 | 	fill_event_base(&(node->event_base), EventTypeExecEndNode);
36 | 	record_node((void *) PT_REGS_PARM1(ctx), node, ctx, true);
37 | 	event_ring.ringbuf_submit(node, 0);
38 | 	return 0;
39 | }
40 | 
41 | 


--------------------------------------------------------------------------------
/src/pgtracer/ebpf/code/program.c:
--------------------------------------------------------------------------------
  1 | #include <linux/sched.h>
  2 | #include "ebpf_maps.h"
  3 | #include "data.h"
  4 | #include "utils.h"
  5 | 
  6 | static int override_instrument_options(void * querydesc);
  7 | 
  8 | int executorstart_enter(struct pt_regs *ctx)
  9 | {
 10 | 	##CHECK_POSTMASTER##
 11 | 	void *queryDesc = (void *) PT_REGS_PARM1(ctx);
 12 | #ifdef USER_INSTRUMENT_FLAGS
 13 | 	override_instrument_options(queryDesc);
 14 | #endif
 15 | 	return 0;
 16 | }
 17 | 
 18 | int executorrun_enter(struct pt_regs *ctx)
 19 | {
 20 | 	u64 ppid; 
 21 | 	##CHECK_POSTMASTER##
 22 | 	void *queryDesc = (void *) PT_REGS_PARM1(ctx);
 23 | 	void *sourceText;
 24 | 	void *portaladdr;
 25 | 	void *search_path;
 26 | 	void *plan;
 27 | 
 28 | 	struct portal_data_t *event;
 29 | 	bpf_probe_read_user(&portaladdr,
 30 | 						sizeof(void*),
 31 | 						(void *) GlobalVariablesActivePortal);
 32 | 	Id128 key = get_portal_key(portaladdr);
 33 | 	event = event_ring.ringbuf_reserve(sizeof(struct portal_data_t));
 34 | 	if (!event)
 35 | 		return 1;
 36 | 	fill_event_base(&(event->event_base), EventTypeExecutorRun);
 37 | 	event->portal_key = key;
 38 | 	fill_portal_data(queryDesc, event);
 39 | 	bpf_probe_read_user(&search_path, sizeof(void *), (void *) GlobalVariablesnamespace_search_path);
 40 | 	bpf_probe_read_user_str(&event->search_path, MAX_SEARCHPATH_LENGTH,
 41 | 							search_path);
 42 | 	event_ring.ringbuf_submit(event, 0);
 43 | 	return 0;
 44 | }
 45 | 
 46 | int executorfinish_enter(struct pt_regs *ctx)
 47 | {
 48 | 	##CHECK_POSTMASTER##
 49 | 	void *portal;
 50 | 	void *queryDesc = (void *) PT_REGS_PARM1(ctx);
 51 | 	struct portal_data_t *event;
 52 | 	Id128 key;
 53 | 	bpf_probe_read_user(&portal,
 54 | 						sizeof(void*),
 55 | 						(void *) GlobalVariablesActivePortal);
 56 | 
 57 | 	key = get_portal_key((void*) portal);
 58 | 	event = event_ring.ringbuf_reserve(sizeof(struct portal_data_t));
 59 | 	if (!event)
 60 | 		return 1;
 61 | 	init_portal_data(event);
 62 | 	fill_portal_data(queryDesc, event);
 63 | 	fill_event_base(&(event->event_base), EventTypeExecutorFinish);
 64 | 	event->portal_key = key;
 65 | 	event_ring.ringbuf_submit(event, 0);
 66 | 	return 0;
 67 | }
 68 | 
 69 | int portaldrop_return(struct pt_regs *ctx)
 70 | {
 71 | 	##CHECK_POSTMASTER##
 72 | 	struct portal_data_t *event;
 73 | 	Id128 key = {0};
 74 | 	event = event_ring.ringbuf_reserve(sizeof(struct portal_data_t));
 75 | 	if (!event)
 76 | 		return 1;
 77 | 	init_portal_data(event);
 78 | 	fill_event_base(&(event->event_base), EventTypeDropPortalReturn);
 79 | 	event->portal_key = key;
 80 | 	event_ring.ringbuf_submit(event, 0);
 81 | 	return 0;
 82 | }
 83 | 
 84 | int portaldrop_enter(struct pt_regs *ctx)
 85 | {
 86 | 	##CHECK_POSTMASTER##
 87 | 	void *portal =  (void *) PT_REGS_PARM1(ctx);
 88 | 	Id128 key = get_portal_key(portal);
 89 | 	struct portal_data_t *event;
 90 | 	void *queryDesc;
 91 | 	event = event_ring.ringbuf_reserve(sizeof(struct portal_data_t));
 92 | 	if (!event)
 93 | 		return 1;
 94 | 	init_portal_data(event);
 95 | 	bpf_probe_read_user(&queryDesc, sizeof(void *),
 96 | 						OffsetFrom(portal, PortalData, queryDesc));
 97 | 	fill_portal_data(queryDesc, event);
 98 | 	fill_event_base(&(event->event_base), EventTypeDropPortalEnter);
 99 | 	event->portal_key = key;
100 | 	event_ring.ringbuf_submit(event, 0);
101 | 	return 0;
102 | }
103 | 
104 | /* When instrumenting a whole cluster, we also trace new processes.
105 |  * Additionally, specific collectors can embed code in here.
106 |  */
107 | #ifdef POSTMASTER_PID
108 | TRACEPOINT_PROBE(sched, sched_process_fork)
109 | {
110 | 	u32 pid = args->parent_pid;
111 | 	if (args->parent_pid != POSTMASTER_PID)
112 | 		return 0;
113 | 	struct event_base *event;
114 | 	event = event_ring.ringbuf_reserve(sizeof (struct event_base));
115 | 	if (!event)
116 | 		return 1;
117 | 	event->pid = args->child_pid;
118 | 	event->event_type = EventTypeProcessFork;
119 | 	event_ring.ringbuf_submit(event, 0);
120 | 	return 0;
121 | }
122 | #endif
123 | 
124 | TRACEPOINT_PROBE(sched, sched_process_exit)
125 | {
126 | 	##CHECK_POSTMASTER##
127 | #ifdef PID
128 | 	if (bpf_get_current_pid_tgid() >> 32 != PID)
129 | 		return 1;
130 | #endif
131 | 	struct event_base *event;
132 | 	event = event_ring.ringbuf_reserve(sizeof (struct event_base));
133 | 	if (!event)
134 | 		return 1;
135 | 	fill_event_base(event, EventTypeProcessExit);
136 | 	event_ring.ringbuf_submit(event, 0);
137 | 	return 0;
138 | }
139 | 
140 | 
141 | #ifdef USER_INSTRUMENT_FLAGS
142 | static int override_instrument_options(void * querydesc)
143 | {
144 | 	void * options_addr = OffsetFrom(querydesc, QueryDesc, instrument_options);
145 | 	int instr_options;
146 | 	bpf_probe_read_user(&instr_options,
147 | 						sizeof(int),
148 | 						options_addr);
149 | 	instr_options |= USER_INSTRUMENT_FLAGS;
150 | 	return bpf_probe_write_user(options_addr, &instr_options, sizeof(int));
151 | }
152 | #endif
153 | 
154 | #ifdef CAPTURE_PLANS
155 | #include "plan.h"
156 | #endif
157 | 


--------------------------------------------------------------------------------
/src/pgtracer/ebpf/code/stack.h:
--------------------------------------------------------------------------------
 1 | #ifndef STACK_H
 2 | #define STACK_H
 3 | #include <linux/sched.h>
 4 | 
 5 | struct stack_data_t {
 6 | 	u64 rax;
 7 | 	u64 rdx;
 8 | 	u64 rcx;
 9 | 	u64 rbx;
10 | 	u64 rsi;
11 | 	u64 rdi;
12 | 	u64 rbp;
13 | 	u64 rsp;
14 | 	u64 r8;
15 | 	u64 r9;
16 | 	u64 r10;
17 | 	u64 r11;
18 | 	u64 r12;
19 | 	u64 r13;
20 | 	u64 r14;
21 | 	u64 r15;
22 | 	u64 rip;
23 | 	u64 size;
24 | 	u64 start_addr;
25 | 	char stack[MAX_STACK_READ]; // Dynamically injected using defines
26 | };
27 | 
28 | /*
29 |  * Capture the current stack and register values.
30 |  */
31 | static inline int capture_stack(struct pt_regs *ctx, struct stack_data_t *stack_data, u64 max_read)
32 | {
33 | 	int ret = 0;
34 | 	stack_data->rax = ctx->ax;
35 | 	stack_data->rdx = ctx->dx;
36 | 	stack_data->rcx = ctx->cx;
37 | 	stack_data->rbx = ctx->bx;
38 | 	stack_data->rsi = ctx->si;
39 | 	stack_data->rdi = ctx->di;
40 | 	stack_data->rbp = ctx->bp;
41 | 	stack_data->rsp = ctx->sp;
42 | 	stack_data->r8 = ctx->r8;
43 | 	stack_data->r9 = ctx->r9;
44 | 	stack_data->r10 = ctx->r10;
45 | 	stack_data->r11 = ctx->r11;
46 | 	stack_data->r12 = ctx->r12;
47 | 	stack_data->r13 = ctx->r13;
48 | 	stack_data->r14 = ctx->r14;
49 | 	stack_data->r15 = ctx->r15;
50 | 	stack_data->rip = ctx->ip;
51 | 	stack_data->start_addr = stack_data->rsp;
52 | 	stack_data->size = (STACK_TOP_ADDR - stack_data->rsp);
53 | 	if (stack_data->size > max_read)
54 | 		stack_data->size = max_read;
55 | 	ret = bpf_probe_read_user(&stack_data->stack,
56 | 							  stack_data->size,
57 | 							  (void *) (stack_data->rsp));
58 | 	if (ret != 0)
59 | 	{
60 | 		stack_data->size = 0;
61 | 	}
62 | 	return ret;
63 | }
64 | 
65 | #endif
66 | 


--------------------------------------------------------------------------------
/src/pgtracer/ebpf/code/utils.h:
--------------------------------------------------------------------------------
  1 | #ifndef UTILS_H
  2 | #define UTILS_H
  3 | #define EPOCH_OFFSET ((POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY)
  4 | 
  5 | #define Offset(structname, member) (STRUCT_ ## structname ## _OFFSET_ ## member)
  6 | #define OffsetFrom(pointer, structname, member) ((void *) (pointer + Offset(structname, member)))
  7 | 
  8 | /* Reuse code from LIBCC for version matching */
  9 | #define __LIBBCC_VERSION(a, b, c) (((a) << 16) + ((b) << 8) + ((c) > 255 ? 255 : (c)))
 10 | #define LIBBCC_VERSION_CODE __LIBBCC_VERSION(LIBBCC_MAJOR_VERSION, LIBBCC_MINOR_VERSION, LIBBCC_PATCH_VERSION)
 11 | #define LIBBCC_VERSION_GEQ(a,b,c) LIBBCC_VERSION_CODE >= __LIBBCC_VERSION(a, b, c)
 12 | 
 13 | 
 14 | #include "data.h"
 15 | 
 16 | /* Clamp a value to a max value, and make the eBPF verifier happy. */
 17 | #define clamp_umax(VAR, UMAX)						\
 18 | 	asm volatile (							\
 19 | 		"if %0 <= %[max] goto +1\n"				\
 20 | 		"%0 = %[max]\n"						\
 21 | 		: "+r"(VAR)						\
 22 | 		: [max]"i"(UMAX)					\
 23 | 	)
 24 | 
 25 | static u64 pgts_to_unixts(u64 pgts)
 26 | {
 27 | 	ulong secs = (ulong) pgts / 1000000;
 28 | 	uint microsecs = (uint) pgts % 1000000;
 29 | 	return (secs + EPOCH_OFFSET) * 1000000 + microsecs;
 30 | }
 31 | 
 32 | 
 33 | // Handle code related to the portal information capture
 34 | static inline Id128 get_portal_key(void * portal)
 35 | {
 36 | 	Id128 ret;
 37 | 	u64 creation_time;
 38 | 	__builtin_memset(&ret, 0, sizeof(ret));
 39 | 	ret.u1 = bpf_get_current_pid_tgid();
 40 | 	bpf_probe_read_user(&creation_time,
 41 | 						sizeof(u64),
 42 | 						OffsetFrom(portal, PortalData, creation_time));
 43 | 	ret.u2 = pgts_to_unixts(creation_time);
 44 | 	return ret;
 45 | }
 46 | 
 47 | static inline void fill_event_base(event_base* event, short event_type)
 48 | {
 49 | 	event->event_type = event_type;
 50 | 	event->pid = (bpf_get_current_pid_tgid() >> 32);
 51 | }
 52 | 
 53 | static inline void fill_portal_data(void * queryDesc, struct portal_data_t* event)
 54 | {
 55 | 	void *sourceText;
 56 | 	void *planstate;
 57 | 	void *instrument;
 58 | 	void *plannedStmt;
 59 | 	void *plan;
 60 | 	int ret;
 61 | 	event->queryAddr = (u64) queryDesc;
 62 | 	bpf_probe_read_user(&sourceText,
 63 | 						sizeof(void *),
 64 | 						OffsetFrom(queryDesc, QueryDesc, sourceText));
 65 | 	bpf_probe_read_user_str(&event->query,
 66 | 							MAX_QUERY_LENGTH,
 67 | 							(void *) sourceText);
 68 | 	ret = bpf_probe_read_user(&plannedStmt,
 69 | 							  sizeof(void *),
 70 | 							  OffsetFrom(queryDesc, QueryDesc, plannedstmt));
 71 | 	if (plannedStmt && ret == 0)
 72 | 	{
 73 | 		bpf_probe_read_user(&event->query_id,
 74 | 							sizeof(u64),
 75 | 							OffsetFrom(plannedStmt, PlannedStmt, queryId));
 76 | 	}
 77 | 	ret = bpf_probe_read_user(&planstate,
 78 | 						sizeof(void *),
 79 | 						OffsetFrom(queryDesc, QueryDesc, planstate));
 80 | 	if (planstate && ret == 0)
 81 | 	{
 82 | 		ret = bpf_probe_read_user(&plan, sizeof(void *),
 83 | 								  OffsetFrom(planstate, PlanState, plan));
 84 | 		if (plan && ret == 0)
 85 | 		{
 86 | 			bpf_probe_read_user(&event->startup_cost,
 87 | 								sizeof(double),
 88 | 								OffsetFrom(plan, Plan, startup_cost));
 89 | 			bpf_probe_read_user(&event->total_cost,
 90 | 								sizeof(double),
 91 | 								OffsetFrom(plan, Plan, total_cost));
 92 | 			bpf_probe_read_user(&event->plan_rows,
 93 | 								sizeof(double),
 94 | 								OffsetFrom(plan, Plan, plan_rows));
 95 | 		}
 96 | 		ret = bpf_probe_read_user(&instrument,
 97 | 							sizeof(void *),
 98 | 							OffsetFrom(planstate, PlanState, instrument));
 99 | 		if (instrument && ret == 0)
100 | 		{
101 | 			bpf_probe_read_user(&event->instrument,
102 | 								STRUCT_SIZE_Instrumentation,
103 | 								instrument);
104 | 		}
105 | 	}
106 | }
107 | 
108 | static inline void init_portal_data(struct portal_data_t* event)
109 | {
110 | 	event->query[0] = 0;
111 | 	event->instrument[0] = 0;
112 | 	event->search_path[0] = 0;
113 | }
114 | 
115 | /*
116 |  * Record information about a PlanStateNode
117 |  */
118 | static inline void record_node(void * nodeaddr, struct planstate_data_t *node,
119 | 							   struct pt_regs *ctx, bool need_capture_stack)
120 | {
121 | 	void *portal;
122 | 	void *instrument;
123 | 	void *planaddr;
124 | 	bpf_probe_read_user(&portal,
125 | 						sizeof(void*),
126 | 						(void *) GlobalVariablesActivePortal);
127 | 	node->portal_key = get_portal_key(portal);
128 | 	node->planstate_addr = (u64) nodeaddr;
129 | 	if (need_capture_stack)
130 | 		capture_stack(ctx, &node->stack_capture, MAX_STACK_READ);
131 | 
132 | 	/* Read the associated Plan node, and it's estimates */
133 | 	bpf_probe_read_user(&planaddr,
134 | 						sizeof(void *),
135 | 						OffsetFrom(nodeaddr, PlanState, plan));
136 | 	node->plan_data.plan_addr = (u64) planaddr;
137 | 	bpf_probe_read_user(&node->plan_data.plan_tag,
138 | 						sizeof(int),
139 | 						OffsetFrom(planaddr, Plan, type));
140 | 
141 | 	bpf_probe_read_user(&node->plan_data.startup_cost,
142 | 						sizeof(double),
143 | 						OffsetFrom(planaddr, Plan, startup_cost));
144 | 	bpf_probe_read_user(&node->plan_data.total_cost,
145 | 						sizeof(double),
146 | 						OffsetFrom(planaddr, Plan, total_cost));
147 | 	bpf_probe_read_user(&node->plan_data.plan_rows,
148 | 						sizeof(double),
149 | 						OffsetFrom(planaddr, Plan, plan_rows));
150 | 	bpf_probe_read_user(&node->plan_data.plan_width,
151 | 						sizeof(int),
152 | 						OffsetFrom(planaddr, Plan, plan_width));
153 | 	bpf_probe_read_user(&node->plan_data.parallel_aware,
154 | 						sizeof(bool),
155 | 						OffsetFrom(planaddr, Plan, parallel_aware));
156 | 	/* Read the PlanState node data */
157 | 	bpf_probe_read_user(&node->planstate_tag,
158 | 						sizeof(int),
159 | 						OffsetFrom(nodeaddr, PlanState, type));
160 | 	bpf_probe_read_user(&node->lefttree,
161 | 						sizeof(void *),
162 | 						OffsetFrom(nodeaddr, PlanState, lefttree));
163 | 	bpf_probe_read_user(&node->righttree,
164 | 						sizeof(void *),
165 | 						OffsetFrom(nodeaddr, PlanState, righttree));
166 | 	bpf_probe_read_user(&instrument,
167 | 						sizeof(void *),
168 | 						OffsetFrom(nodeaddr, PlanState, instrument));
169 | 	if (instrument)
170 | 		bpf_probe_read_user(&node->instrument,
171 | 							STRUCT_SIZE_Instrumentation,
172 | 							instrument);
173 | }
174 | #endif
175 | 


--------------------------------------------------------------------------------
/src/pgtracer/ebpf/collector/__init__.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Workhorse for pgtracer.
  3 | 
  4 | The BPFCollector works by combining two things:
  5 |     - an ebpf program loaded in to the kernel, which is built on the fly
  6 |     - DWARF information extracted from the executable (or a separate debug
  7 |       symbols file).
  8 | """
  9 | from __future__ import annotations
 10 | 
 11 | import ctypes as ct
 12 | import os
 13 | from dataclasses import dataclass
 14 | from enum import IntEnum
 15 | from pathlib import Path
 16 | from threading import Lock, Thread
 17 | from time import sleep
 18 | from typing import Any, Callable, Dict, List, Optional, Tuple, Type, TypeVar, Union
 19 | 
 20 | from bcc import BPF, USDT, PerfSWConfig, PerfType
 21 | from bcc import __version__ as bcc_version
 22 | from bcc import lib as bcclib
 23 | from pypsutil import Process
 24 | 
 25 | from ...model import MemoryAllocType, Query
 26 | from ..dwarf import DWARFPointer, ProcessMetadata, Struct, get_size
 27 | from ..unwind import stack_data_t
 28 | from .c_defs import *
 29 | from .utils import CODE_BASE_PATH, defines_dict_to_c, intenum_to_c, load_c_file
 30 | 
 31 | BCC_VERSION_TUPLE = tuple(int(part) for part in bcc_version.split("."))
 32 | 
 33 | 
 34 | class InvalidStateException(Exception):
 35 |     """
 36 |     Invalid State of a BPFCollector Exception.
 37 | 
 38 |     This Exception occurs when an operation is performed on a BPFCollector
 39 |     which is not in the prerequisite state.
 40 |     """
 41 | 
 42 | 
 43 | # pylint: disable=invalid-name
 44 | class EventHandler:
 45 |     """
 46 |     Base class for handling events.
 47 | 
 48 |     The handle_event method dispatched to handle_{EventType} methods if they
 49 |     exist. This acts mostly as a namespace to not pollute the BPFCollector
 50 |     class itself.
 51 |     """
 52 | 
 53 |     def __init__(self) -> None:
 54 |         pass
 55 | 
 56 |     def handle_event(self, bpf_collector: BPFCollector, event: ct._CData) -> int:
 57 |         """
 58 |         Handle an event from EBPF ringbuffer.
 59 |         Every event should be tagged with a short int as the first member to
 60 |         handle it's type. It is then dispatched to the appropriate method,
 61 |         which will be able to make sense of the actual struct.
 62 |         """
 63 |         # All events should be tagged with the event's type
 64 |         event_stub = ct.cast(event, ct.POINTER(event_base)).contents
 65 |         event_type_name = EventType(event_stub.event_type).name
 66 |         pid = event_stub.pid
 67 |         method_name = f"handle_{event_type_name}"
 68 |         method: Callable[[BPFCollector, ct._CData, int], int] = getattr(
 69 |             self, method_name
 70 |         )
 71 |         return method(bpf_collector, event, pid)
 72 | 
 73 |     # pylint: disable=unused-argument
 74 |     def handle_ProcessExit(
 75 |         self, bpf_collector: BPFCollector, event: ct._CData, pid: int
 76 |     ) -> int:
 77 |         """
 78 |         Handle ProcessExit event.
 79 |         """
 80 |         return bpf_collector.cleanup_process(pid)
 81 | 
 82 |     # pylint: disable=unused-argument
 83 |     def handle_ProcessFork(
 84 |         self, bpf_collector: BPFCollector, event: ct._CData, pid: int
 85 |     ) -> int:
 86 |         """
 87 |         Handle ProcessEnter event.
 88 |         """
 89 |         return bpf_collector.setup_process(pid)
 90 | 
 91 | 
 92 | @dataclass
 93 | class CollectorOptions:
 94 |     """
 95 |     Base class for BPFCollector Options.
 96 |     """
 97 | 
 98 |     enable_perf_events: bool = True
 99 |     sample_freq: int = 1200
100 | 
101 | 
102 | T = TypeVar("T", bound="BPFCollector")
103 | 
104 | 
105 | class BPFCollector:
106 |     """
107 |     Workhorse for pgtracer.
108 | 
109 |     This class allows the user to load an EBPF program dynamically generated
110 |     using supplied options and extracted metadata about the Postgres
111 |     executable.
112 |     """
113 | 
114 |     options_cls: Type[CollectorOptions] = CollectorOptions
115 |     event_handler_cls: Type[EventHandler] = EventHandler
116 | 
117 |     ExecEndFuncs = [
118 |         "ExecEndAgg",
119 |         "ExecEndAppend",
120 |         "ExecEndBitmapAnd",
121 |         "ExecEndBitmapHeapScan",
122 |         "ExecEndBitmapIndexScan",
123 |         "ExecEndBitmapOr",
124 |         "ExecEndCteScan",
125 |         "ExecEndCustomScan",
126 |         "ExecEndForeignScan",
127 |         "ExecEndFunctionScan",
128 |         "ExecEndGather",
129 |         "ExecEndGatherMerge",
130 |         "ExecEndGroup",
131 |         "ExecEndHash",
132 |         "ExecEndHashJoin",
133 |         "ExecEndIncrementalSort",
134 |         "ExecEndIndexOnlyScan",
135 |         "ExecEndIndexScan",
136 |         "ExecEndLimit",
137 |         "ExecEndLockRows",
138 |         "ExecEndMaterial",
139 |         "ExecEndMemoize",
140 |         "ExecEndMergeAppend",
141 |         "ExecEndMergeJoin",
142 |         "ExecEndModifyTable",
143 |         "ExecEndNamedTuplestoreScan",
144 |         "ExecEndNode",
145 |         "ExecEndNestLoop",
146 |         "ExecEndProjectSet",
147 |         "ExecEndRecursiveUnion",
148 |         "ExecEndResult",
149 |         "ExecEndSampleScan",
150 |         "ExecEndSeqScan",
151 |         "ExecEndSetOp",
152 |         "ExecEndSort",
153 |         "ExecEndSubqueryScan",
154 |         "ExecEndTableFuncScan",
155 |         "ExecEndTidRangeScan",
156 |         "ExecEndTidScan",
157 |         "ExecEndUnique",
158 |         "ExecEndValuesScan",
159 |         "ExecEndWindowAgg",
160 |         "ExecEndWorkTableScan",
161 |     ]
162 | 
163 |     def __init__(
164 |         self,
165 |         metadata: ProcessMetadata,
166 |         options: Optional[CollectorOptions] = None,
167 |         include_children: bool = False,
168 |     ):
169 |         if options is None:
170 |             options = self.options_cls()
171 |         self.options = options
172 |         self.include_children = include_children
173 |         self.anon_map_fds: Dict[int, int] = {}
174 |         self.ppid: Optional[int]
175 |         if include_children:
176 |             self.pid = -1
177 |             self.ppid = metadata.pid
178 |         else:
179 |             self.pid = metadata.pid
180 |             self.ppid = None
181 | 
182 |         self.metadata = metadata
183 |         self.program = str(self.metadata.program).encode("utf8")
184 |         # Old bcc version don't support global usdt probes, so disable
185 |         # memory tracking in that case
186 |         if self.include_children is False or BCC_VERSION_TUPLE >= (0, 19, 0):
187 |             self.usdt_ctx = USDT(metadata.pid)
188 |             self.enable_usdt_probes(self.usdt_ctx)
189 |         else:
190 |             self.usdt_ctx = None
191 |         self.bpf = self.prepare_bpf()
192 |         self.setup_bpf_state()
193 |         self.event_handler: EventHandler = self.event_handler_cls()
194 |         self.update_struct_defs()
195 |         self.is_running = False
196 |         self.background_thread: Optional[Thread] = None
197 |         self.lock = Lock()
198 |         self.sample_freq = options.sample_freq
199 |         self.backend_type: Optional[IntEnum] = None
200 | 
201 |     @classmethod
202 |     def from_pid(
203 |         cls: Type[T], pid: int, options: CollectorOptions = CollectorOptions()
204 |     ) -> T:
205 |         """
206 |         Build a BPFCollector from a pid.
207 |         """
208 |         # FIXME: make this configurable
209 |         cache_dir = Path("~/.cache").expanduser() / "pgtracer"
210 |         process = Process(pid=pid)
211 |         # Check if we are given the postmaster pid, or a backend.
212 |         # If our parent is itself a postgres process, then we are instrumenting the whole backend.
213 |         pprocess = process.parent()
214 |         include_children = bool(
215 |             pprocess and pprocess.name() not in ("postgres", "postmaster")
216 |         )
217 |         processmetadata = ProcessMetadata(process, cache_dir=cache_dir)
218 |         return cls(processmetadata, options, include_children=include_children)
219 | 
220 |     def update_struct_defs(self) -> None:
221 |         """
222 |         Update the ctypes struct definitions from the DWARF metadata.
223 | 
224 |         Some C structs used in EBPF must match what is defined by Postgres:
225 |         so we build the class dynamically after the DWARF file has been loaded.
226 |         """
227 |         global instrument_type  # pylint: disable=global-statement
228 |         instrument_type = ct.c_byte * self.metadata.structs.Instrumentation.size
229 |         # Update global struct definitions with actual sizes
230 |         portal_data.update_fields(
231 |             {
232 |                 "query": ct.c_char * MAX_QUERY_LENGTH,
233 |                 "instrument": instrument_type,
234 |                 "search_path": ct.c_char * MAX_SEARCHPATH_LENGTH,
235 |             }
236 |         )
237 |         planstate_data.update_fields({"instrument": instrument_type})
238 |         stack_sample.update_fields({"portal_data": portal_data})
239 | 
240 |     @property
241 |     def constant_defines(self) -> Dict[str, int]:
242 |         """
243 |         Returns a list of constants to add to the ebpf program as #define
244 |         directives.
245 |         """
246 |         constants = {
247 |             "STACK_TOP_ADDR": self.metadata.stack_top,
248 |             # TODO: find a way to extract those ?
249 |             "POSTGRES_EPOCH_JDATE": 2451545,
250 |             "UNIX_EPOCH_JDATE": 2440588,
251 |             "SECS_PER_DAY": 86400,
252 |             # TODO: make those configurable ?
253 |             "MAX_QUERY_NUMBER": 10,
254 |             "MAX_QUERY_LENGTH": MAX_QUERY_LENGTH,
255 |             "MAX_STACK_READ": 4096,
256 |             "MAX_SEARCHPATH_LENGTH": MAX_SEARCHPATH_LENGTH,
257 |             "EVENTRING_PAGE_SIZE": 131072,
258 |             "MEMORY_REQUEST_MAXSIZE": MEMORY_REQUEST_MAXSIZE,
259 |             "MEMORY_PATH_SIZE": MEMORY_PATH_SIZE,
260 |             "LIBBCC_MAJOR_VERSION": BCC_VERSION_TUPLE[0],
261 |             "LIBBCC_MINOR_VERSION": BCC_VERSION_TUPLE[1],
262 |             "LIBBCC_PATCH_VERSION": BCC_VERSION_TUPLE[2],
263 |         }
264 |         if self.ppid is not None:
265 |             constants["POSTMASTER_PID"] = self.ppid
266 |         else:
267 |             constants["PID"] = self.pid
268 |         return constants
269 | 
270 |     @property
271 |     def struct_offsets_defines(self) -> Dict[str, int]:
272 |         """
273 |         Build C-Code for the eBPF code to easily access named members in
274 |         structs.
275 | 
276 |         We read the offset in a struct for known members, so that the eBPF code
277 |         can read those members from the Postgres struct.
278 | 
279 |         This is necessary because we can't include Postgres headers in the eBPF
280 |         code.
281 |         """
282 |         # Returns a normalized way of DEFINING struct offsets
283 |         s = self.metadata.structs
284 | 
285 |         return {
286 |             f"STRUCT_{struct}_OFFSET_{member}": getattr(s, struct)
287 |             .field_definition(member)
288 |             .offset
289 |             for struct, member in (
290 |                 ("Node", "type"),
291 |                 ("Plan", "type"),
292 |                 ("Plan", "startup_cost"),
293 |                 ("Plan", "total_cost"),
294 |                 ("Plan", "plan_rows"),
295 |                 ("Plan", "plan_width"),
296 |                 ("Plan", "parallel_aware"),
297 |                 ("PlannedStmt", "queryId"),
298 |                 ("PlanState", "instrument"),
299 |                 ("PlanState", "plan"),
300 |                 ("PlanState", "type"),
301 |                 ("PlanState", "lefttree"),
302 |                 ("PlanState", "righttree"),
303 |                 ("PortalData", "creation_time"),
304 |                 ("PortalData", "queryDesc"),
305 |                 ("QueryDesc", "instrument_options"),
306 |                 ("QueryDesc", "planstate"),
307 |                 ("QueryDesc", "sourceText"),
308 |                 ("QueryDesc", "plannedstmt"),
309 |             )
310 |         }
311 | 
312 |     def make_global_variables_enum(self) -> Type[IntEnum]:
313 |         """
314 |         Create an IntEnum mapping global variables names to their address in
315 |         the program.
316 |         """
317 |         mapping = {}
318 | 
319 |         for key in ("ActivePortal", "namespace_search_path"):
320 |             mapping[key] = self.metadata.global_variable(key)
321 |         # Mypy complains about dynamic enums
322 |         globalenum = IntEnum("GlobalVariables", mapping)  # type: ignore
323 | 
324 |         return globalenum
325 | 
326 |     def make_struct_sizes_dict(self) -> Dict[str, int]:
327 |         """
328 |         Create a dictionary mapping struct name to their bytesize.
329 | 
330 |         Once again, this is because we can't include Postgres header and call
331 |         "sizeof".
332 |         """
333 |         mapping = {}
334 | 
335 |         for key in ("Instrumentation",):
336 |             mapping[f"STRUCT_SIZE_{key}"] = getattr(self.metadata.structs, key).size
337 | 
338 |         return mapping
339 | 
340 |     def _attach_uprobe(self, function_name: str, ebpf_function: str) -> None:
341 |         """
342 |         Helper to attach a uprobe executing `ebpf_function` at every
343 |         `function_name` location.
344 |         """
345 |         for addr in self.metadata.function_addresses(function_name):
346 |             self.bpf.attach_uprobe(
347 |                 name=self.program,
348 |                 fn_name=ebpf_function.encode("utf8"),
349 |                 addr=addr,
350 |                 pid=self.pid,
351 |             )
352 | 
353 |     def _attach_uretprobe(self, function_name: str, ebpf_function: str) -> None:
354 |         """
355 |         Helper to attach a uretprobe executing `ebpf_function` at every
356 |         `function_name` location.
357 |         """
358 |         # TODO: make sure multiple addresses work too
359 |         for addr in self.metadata.function_addresses(function_name):
360 |             self.bpf.attach_uretprobe(
361 |                 name=self.program,
362 |                 fn_name=ebpf_function.encode("utf8"),
363 |                 addr=addr,
364 |                 pid=self.pid,
365 |             )
366 | 
367 |     def background_polling(self, refresh_rate: int) -> None:
368 |         """
369 |         Run the polling in the background.
370 |         """
371 |         while self.is_running:
372 |             self.bpf.ring_buffer_poll(refresh_rate)
373 |             sleep(refresh_rate / 1000.0)
374 | 
375 |     def attach_probes(self) -> None:
376 |         """
377 |         Attach the required probes for this collector.
378 |         """
379 |         if self.options.enable_perf_events:
380 |             self.bpf.attach_perf_event(
381 |                 ev_type=PerfType.SOFTWARE,
382 |                 ev_config=PerfSWConfig.CPU_CLOCK,
383 |                 fn_name=b"perf_event",
384 |                 pid=self.pid,
385 |                 sample_freq=self.sample_freq,
386 |             )
387 | 
388 |     def enable_usdt_probes(self, usdt: USDT) -> None:
389 |         """
390 |         Enable USDT probes.
391 |         """
392 | 
393 |     def start(self) -> None:
394 |         """
395 |         Starts the bpf collector.
396 |         """
397 | 
398 |         if self.is_running:
399 |             raise InvalidStateException("BPF Collector is already running")
400 |         print("Starting eBPF collector...")
401 |         self.bpf[b"event_ring"].open_ring_buffer(self._handle_event)
402 |         self.attach_probes()
403 |         self.is_running = True
404 |         self.background_thread = Thread(target=self.background_polling, args=(100,))
405 |         self.background_thread.start()
406 |         print("eBPF collector started")
407 | 
408 |     def stop(self) -> None:
409 |         """
410 |         Stop polling the collector.
411 |         """
412 |         self.is_running = False
413 |         if self.background_thread:
414 |             self.background_thread.join()
415 |             self.background_thread = None
416 |             for (
417 |                 pid,
418 |                 fd,
419 |             ) in self.anon_map_fds.copy().items():  # pylint: disable=invalid-name
420 |                 os.close(fd)
421 |                 try:
422 |                     del self.bpf[b"pid_queues"][ct.c_int(pid)]
423 |                 except KeyError:
424 |                     pass
425 |             self.anon_map_fds.clear()
426 |             self.bpf.cleanup()
427 | 
428 |     # pylint: disable=unused-argument
429 |     def _handle_event(self, cpu: int, data: ct._CData, size: int) -> int:
430 |         """
431 |         Callback for the ring_buffer_poll. We actually dispatch this to the
432 |         `EventHandler`
433 |         """
434 |         # Returning a negative value aborts polling
435 |         if not self.is_running:
436 |             return -1
437 |         return self.event_handler.handle_event(self, data)
438 | 
439 |     def _optional_code(self) -> str:
440 |         """
441 |         Load additional code, depending on options or the specific
442 |         Collector type.
443 |         """
444 |         buf = ""
445 |         if self.options.enable_perf_events:
446 |             buf += load_c_file("perf.c")
447 |         return buf
448 | 
449 |     def build_memory_request(
450 |         self,
451 |         event_type: EventType,
452 |         request_id: Id128,
453 |         base_addr: int,
454 |         base_type: Type[Union[ct._CData, Struct, DWARFPointer]],
455 |         path: List[str],
456 |     ) -> memory_request:
457 |         """
458 |         Build a memory request from a request_id, a base_addr, a known base_type living
459 |         at this addr and a path describing which fields to follow to the final memory location.
460 | 
461 |         The fields definitions are extracted from the debug symbols.
462 |         """
463 |         memory_path = (ct.c_ulonglong * MEMORY_PATH_SIZE)()
464 |         # We have the base address, the path, and finally an offset 0 to read the memory itself.
465 |         mempath_length = len(path) + 1
466 |         assert mempath_length <= MEMORY_PATH_SIZE
467 |         memory_path[0] = base_addr
468 |         current_type = base_type
469 |         current_idx = 0
470 |         for part in path:
471 |             # If we follow a pointer, add a new item to the underlying path.
472 |             # Otherwise, just add to the previous type.
473 |             if issubclass(current_type, DWARFPointer):
474 |                 current_type = current_type.pointed_type
475 |                 current_idx += 1
476 |                 memory_path[current_idx] = 0
477 |             if issubclass(current_type, Struct):
478 |                 attr = current_type.field_definition(part)
479 |                 if attr is None:
480 |                     raise AttributeError(f"Type {current_type} has no field {attr}")
481 |                 current_type = attr.member_type
482 |                 memory_path[current_idx] += attr.offset
483 |             else:
484 |                 raise AttributeError(
485 |                     f"Cannot dereference field {part} from type {current_type}"
486 |                 )
487 |         # For convenience, support the last field as a pointer.
488 |         if issubclass(current_type, DWARFPointer) or current_type == ct.c_char_p:
489 |             memory_path[current_idx + 1] = 0
490 |             mempath_length += 1
491 |         size = get_size(current_type, dereference=True)
492 | 
493 |         return memory_request(
494 |             event_type=event_type,
495 |             request_id=request_id,
496 |             path_size=mempath_length,
497 |             size=size,
498 |             memory_path=memory_path,
499 |         )
500 | 
501 |     def send_memory_request(self, pid: int, request: memory_request) -> None:
502 |         """
503 |         Sends a memory request to the ebpf program.
504 |         """
505 |         ret = -1
506 |         if pid in self.anon_map_fds:
507 |             map_fd = self.anon_map_fds[pid]
508 |             ret = bcclib.bpf_update_elem(ct.c_int(map_fd), 0, ct.byref(request), 0)
509 |         if ret < 0:
510 |             raise ValueError("Something went wrong while sending a memory request")
511 | 
512 |     def preprocess_code(self, buf: str) -> str:
513 |         """
514 |         Preprocess code for things macro are not allowed to do with BCC.
515 |         """
516 |         if self.include_children:
517 |             buf = buf.replace(
518 |                 "##CHECK_POSTMASTER##",
519 |                 """{ 
520 |                 u64 ppid; 
521 |                 struct task_struct* task_p = (struct task_struct*)bpf_get_current_task(); 
522 |                 struct task_struct* parent_task_p = task_p->real_parent; 
523 |                 ppid = parent_task_p->tgid; 
524 |                 if (ppid != POSTMASTER_PID) 
525 |                     return 0; 
526 |                 };""",
527 |             )
528 |         else:
529 |             buf = buf.replace("##CHECK_POSTMASTER##", "")
530 |         return buf
531 | 
532 |     def prepare_bpf(self) -> BPF:
533 |         """
534 |         Generate the eBPF program, both from static code and dynamically
535 |         generated defines and enums.
536 |         """
537 |         buf = defines_dict_to_c(self.constant_defines)
538 |         buf += defines_dict_to_c(self.struct_offsets_defines)
539 |         buf += defines_dict_to_c(self.make_struct_sizes_dict())
540 |         buf += intenum_to_c(EventType)
541 |         buf += intenum_to_c(MemoryAllocType)
542 |         buf += intenum_to_c(self.make_global_variables_enum())
543 |         buf += load_c_file("program.c")
544 |         buf += self._optional_code()
545 |         # Ok, now workaround some limitations of the macro system with bcc and implement our own.
546 |         buf = self.preprocess_code(buf)
547 |         # Add the code directory as include dir
548 |         cflags = [f"-I{CODE_BASE_PATH}"]
549 |         # Suppress some common warnings depending on bcc / kernel combinations
550 |         cflags.append("-Wno-macro-redefined")
551 |         cflags.append("-Wno-ignored-attributes")
552 |         # Only enable global memory probe if bcc version is recent enough
553 |         kwargs: Dict[str, Any] = {}
554 |         if self.include_children and BCC_VERSION_TUPLE >= (0, 19, 0):
555 |             kwargs["attach_usdt_ignore_pid"] = True
556 |             kwargs["usdt_contexts"] = [self.usdt_ctx]
557 |         bpf = BPF(text=buf.encode("utf8"), cflags=cflags, debug=0, **kwargs)
558 |         return bpf
559 | 
560 |     def setup_bpf_state(self) -> None:
561 |         """
562 |         Setup the initial BPF State
563 |         """
564 |         if self.pid > 0:
565 |             self.setup_process(self.pid)
566 | 
567 |     def setup_process(self, pid: int) -> int:
568 |         """
569 |         Callback when a new process is created.
570 |         """
571 |         if self.options.enable_perf_events:
572 |             new_map = bcclib.bcc_create_map(
573 |                 BPF_MAP_TYPE_QUEUE, None, 0, ct.sizeof(memory_request), 1024, 0
574 |             )
575 |             self.bpf[b"pid_queues"][ct.c_int(pid)] = ct.c_int(new_map)
576 |             self.anon_map_fds[pid] = new_map
577 |         return 0
578 | 
579 |     def cleanup_process(self, pid: int) -> int:
580 |         """
581 |         Callback when a process exits.
582 |         """
583 |         # If we instrument a single pid, exit
584 |         if self.pid == pid:
585 |             print(f"Process {pid} is terminating, stopping collection")
586 |             self.is_running = False
587 |         else:
588 |             try:
589 |                 if pid in self.anon_map_fds:
590 |                     try:
591 |                         del self.bpf[b"pid_queues"][ct.c_int(pid)]
592 |                     except KeyError:
593 |                         pass
594 |                     os.close(self.anon_map_fds[pid])
595 |                     del self.anon_map_fds[pid]
596 |             except KeyError:
597 |                 return 0
598 |         return 0
599 | 


--------------------------------------------------------------------------------
/src/pgtracer/ebpf/collector/c_defs.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Datastructure definitions used in the C ebpf code.
  3 | """
  4 | from __future__ import annotations
  5 | 
  6 | import ctypes as ct
  7 | from enum import IntEnum
  8 | from typing import Dict, List, Tuple, Type
  9 | 
 10 | from ..unwind import stack_data_t
 11 | 
 12 | BPF_MAP_TYPE_QUEUE = 22
 13 | 
 14 | 
 15 | class Id128(ct.Structure):
 16 |     """
 17 |     Structure containing two u64, to be used as either a single 8-bytes int or a two 8-bytes tuple.
 18 |     """
 19 | 
 20 |     _fields_ = [("u1", ct.c_ulonglong), ("u2", ct.c_ulonglong)]
 21 | 
 22 |     @classmethod
 23 |     def from_int(cls, intvalue: int) -> Id128:
 24 |         """
 25 |         Create an Id128 from a single integer.
 26 |         """
 27 |         return cls(intvalue, 0)
 28 | 
 29 |     def as_int(self) -> int:
 30 |         """
 31 |         Interpret an Id128 as a single integer.
 32 |         """
 33 |         val: int = self.u1
 34 |         return val
 35 | 
 36 |     @classmethod
 37 |     def from_tuple(cls, inttuple: Tuple[int, int]) -> Id128:
 38 |         """
 39 |         Create an Id128 from a two-ints tuple.
 40 |         """
 41 |         return cls(*inttuple)
 42 | 
 43 |     def as_tuple(self) -> Tuple[int, int]:
 44 |         """
 45 |         Interpret an Id128 as a two-int tuple.
 46 |         """
 47 |         return (self.u1, self.u2)
 48 | 
 49 | 
 50 | # pylint: disable=invalid-name
 51 | class EventType(IntEnum):
 52 |     """
 53 |     EventTypes generated by the EBPF code.
 54 |     """
 55 | 
 56 |     ExecutorRun = 1
 57 |     ExecutorFinish = 2
 58 |     DropPortalEnter = 3
 59 |     DropPortalReturn = 4
 60 |     ExecProcNodeFirst = 5
 61 |     ExecEndNode = 6
 62 |     KBlockRqIssue = 7
 63 |     StackSample = 8
 64 |     MemoryResponseQueryInstr = 9
 65 |     MemoryResponseNodeInstr = 10
 66 |     MemoryNodeData = 11
 67 |     GUCResponse = 12
 68 |     MemoryAccount = 13
 69 |     ProcessFork = 14
 70 |     ProcessExit = 15
 71 | 
 72 | 
 73 | instrument_type = ct.c_byte * 0
 74 | 
 75 | 
 76 | class StubStructure(ct.Structure):
 77 |     """
 78 |     StubStructure definition, which actual fields must be updated at runtime.
 79 |     """
 80 | 
 81 |     _protofields: List[Tuple[str, Type[ct._CData]]] = []
 82 | 
 83 |     @classmethod
 84 |     def update_fields(cls, fields: Dict[str, Type[ct._CData]]) -> None:
 85 |         """
 86 |         Update the structure fields.
 87 |         """
 88 |         if hasattr(cls, "_fields_"):
 89 |             # We are not allowed to update it. But if all updated values are
 90 |             # the same as the first update, we don't care.
 91 |             fields_dict = dict(cls._fields_)  # type: ignore
 92 |             for key, value in fields.items():
 93 |                 if fields_dict[key] != value:
 94 |                     raise ValueError("Cannot update a struct more than once.")
 95 |             return
 96 |         fields_dict = dict(cls._protofields)
 97 |         fields_dict.update(fields)
 98 |         cls._fields_ = list(fields_dict.items())
 99 | 
100 | 
101 | MAX_QUERY_LENGTH = 2048
102 | MAX_SEARCHPATH_LENGTH = 1024
103 | 
104 | 
105 | class event_base(ct.Structure):
106 |     """
107 |     Common fields for all events.
108 |     """
109 | 
110 |     _fields_ = [("event_type", ct.c_short), ("pid", ct.c_int)]
111 | 
112 | 
113 | class portal_data(StubStructure):
114 |     """
115 |     Represents the portal_data associated to a portal.
116 |     """
117 | 
118 |     _protofields = [
119 |         ("event", event_base),
120 |         ("portal_key", Id128),
121 |         ("query_addr", ct.c_ulonglong),
122 |         ("query_id", ct.c_ulonglong),
123 |         ("startup_cost", ct.c_double),
124 |         ("total_cost", ct.c_double),
125 |         ("plan_rows", ct.c_double),
126 |         ("query", ct.c_char * MAX_QUERY_LENGTH),
127 |         ("instrument", instrument_type),
128 |         ("search_path", ct.c_char * MAX_SEARCHPATH_LENGTH),
129 |     ]
130 | 
131 | 
132 | class io_req_data(ct.Structure):
133 |     """
134 |     Represents the io_req_data coming from instrumenting the kernel.
135 |     """
136 | 
137 |     _fields_ = [
138 |         ("event", event_base),
139 |         ("rwbs", ct.c_char * 8),
140 |         ("bytes", ct.c_ulonglong),
141 |     ]
142 | 
143 | 
144 | class plan_data(ct.Structure):
145 |     """
146 |     Represents the data associated with a PlanNode.
147 |     """
148 | 
149 |     _fields_ = [
150 |         ("plan_addr", ct.c_ulonglong),
151 |         ("plan_tag", ct.c_int),
152 |         ("startup_cost", ct.c_double),
153 |         ("total_cost", ct.c_double),
154 |         ("plan_rows", ct.c_double),
155 |         ("plan_width", ct.c_int),
156 |         ("parallel_aware", ct.c_bool),
157 |     ]
158 | 
159 | 
160 | class planstate_data(StubStructure):
161 |     """
162 |     Represents the data associated to a PlanState node.
163 |     """
164 | 
165 |     _protofields = [
166 |         ("event", event_base),
167 |         ("portal_key", Id128),
168 |         ("planstate_addr", ct.c_ulonglong),
169 |         ("planstate_tag", ct.c_int),
170 |         ("lefttree", ct.c_ulonglong),
171 |         ("righttree", ct.c_ulonglong),
172 |         ("plan_data", plan_data),
173 |         ("instrument", instrument_type),
174 |         ("stack_capture", stack_data_t),
175 |     ]
176 | 
177 | 
178 | MEMORY_REQUEST_MAXSIZE = 131072
179 | MEMORY_PATH_SIZE = 5
180 | 
181 | 
182 | class memory_request(ct.Structure):
183 |     """
184 |     Represents a memory request, to be processed in the perf event handler.
185 |     """
186 | 
187 |     _fields_ = [
188 |         ("event_type", ct.c_short),
189 |         ("request_id", Id128),
190 |         ("path_size", ct.c_int),
191 |         ("size", ct.c_ulonglong),
192 |         ("memory_path", ct.c_ulonglong * MEMORY_PATH_SIZE),
193 |     ]
194 | 
195 | 
196 | class memory_response(ct.Structure):
197 |     """
198 |     Represents a memory response, sent back from the perf event handler.
199 |     """
200 | 
201 |     _fields_ = [
202 |         ("event", event_base),
203 |         ("request_id", Id128),
204 |         ("payload", ct.c_char * MEMORY_REQUEST_MAXSIZE),
205 |     ]
206 | 
207 |     @property
208 |     def payload_addr(self) -> int:
209 |         """
210 |         Returns the address of the payload field: useful to parse it into it's
211 |         own struct.
212 |         """
213 |         return ct.addressof(self) + memory_response.payload.offset
214 | 
215 | 
216 | class stack_sample(StubStructure):
217 |     """
218 |     Represents a stack sample, sent back from the perf event handler.
219 |     """
220 | 
221 |     _protofields = [("portal_data", portal_data), ("stack_data", stack_data_t)]
222 | 


--------------------------------------------------------------------------------
/src/pgtracer/ebpf/collector/guc.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This module defines the collector for getting / setting GUC.
  3 | """
  4 | from __future__ import annotations
  5 | 
  6 | import ctypes as ct
  7 | import struct
  8 | from dataclasses import dataclass
  9 | from typing import Any, BinaryIO, Dict, Optional, Tuple, Type
 10 | 
 11 | from elftools.elf.elffile import ELFFile
 12 | 
 13 | from ...utils import readcstr
 14 | from ..dwarf import ProcessMetadata, Struct
 15 | from . import BPFCollector, CollectorOptions, EventHandler
 16 | from .c_defs import event_base
 17 | from .utils import load_c_file
 18 | 
 19 | GUC_MAX_LENGTH = 128
 20 | 
 21 | 
 22 | # pylint: disable=invalid-name
 23 | class guc_request(ct.Structure):
 24 |     """
 25 |     A request to set a guc.
 26 |     """
 27 | 
 28 |     _fields_ = [
 29 |         ("guc_location", ct.c_ulonglong),
 30 |         ("guc_size", ct.c_int),
 31 |         ("payload", ct.c_byte * GUC_MAX_LENGTH),
 32 |     ]
 33 | 
 34 | 
 35 | # pylint: disable=invalid-name
 36 | class guc_response(ct.Structure):
 37 |     """
 38 |     A response to a guc_request.
 39 |     """
 40 | 
 41 |     _fields_ = [
 42 |         ("event", event_base),
 43 |         ("guc_location", ct.c_ulonglong),
 44 |         ("status", ct.c_bool),
 45 |     ]
 46 | 
 47 | 
 48 | class GUCTracerOptions(CollectorOptions):
 49 |     """
 50 |     Dataclass for GUCTracerBPFCollector options.
 51 |     """
 52 | 
 53 |     sample_freq: int = 3000
 54 |     guc_to_watch: Dict[str, str] = {}
 55 | 
 56 | 
 57 | class GUCTracerEventHandler(EventHandler):
 58 |     """
 59 |     EventHandler for the GUCTracerBPFCollector.
 60 |     """
 61 | 
 62 |     def __init__(self) -> None:
 63 |         super().__init__()
 64 |         self.pending_names_req: Dict[int, GUCDefinition] = {}
 65 | 
 66 |     # pylint: disable=invalid-name
 67 |     def handle_GUCResponse(
 68 |         self, bpf_collector: GUCTracerBPFCollector, event: ct._CData, pid: int
 69 |     ) -> int:
 70 |         """
 71 |         Handle GUCResponse messages.
 72 |         """
 73 |         event = ct.cast(event, ct.POINTER(guc_response)).contents
 74 |         guc_def, value = bpf_collector.pending_guc_sets.pop(event.guc_location)
 75 |         if event.status:
 76 |             print(
 77 |                 f"GUC {guc_def.guc_name}@{event.guc_location} has been successfully set to {value}"
 78 |             )
 79 |         else:
 80 |             print(
 81 |                 f"GUC {guc_def.guc_name}@{event.guc_location} has failed to be set to {value}"
 82 |             )
 83 |         return 0
 84 | 
 85 | 
 86 | @dataclass
 87 | class GUCDefinition:
 88 |     """
 89 |     A GUC definition, extracted from the binary.
 90 |     """
 91 | 
 92 |     guc_type: str
 93 |     guc_name: str
 94 |     guc_location: int
 95 | 
 96 | 
 97 | class GUCTracerBPFCollector(BPFCollector):
 98 |     """
 99 |     BPF Collector tracing GUCs and potentially modifying them.
100 |     """
101 | 
102 |     options_cls = GUCTracerOptions
103 |     event_handler_cls = GUCTracerEventHandler
104 | 
105 |     GUC_TABLE_TYPE_TO_VARIABLE = {
106 |         "config_bool": "ConfigureNamesBool",
107 |         "config_int": "ConfigureNamesInt",
108 |         "config_real": "ConfigureNamesReal",
109 |         "config_string": "ConfigureNamesString",
110 |         "config_enum": "ConfigureNamesEnum",
111 |     }
112 | 
113 |     def __init__(
114 |         self,
115 |         metadata: ProcessMetadata,
116 |         options: Optional[CollectorOptions] = None,
117 |         include_children: bool = False,
118 |     ):
119 |         if include_children:
120 |             raise NotImplementedError(
121 |                 "GUC Tracer does not support attaching to the whole cluster."
122 |             )
123 |         self.options: CollectorOptions
124 |         self.guc_defs: Dict[str, GUCDefinition] = {}
125 |         self.pending_guc_sets: Dict[int, Tuple[GUCDefinition, Any]] = {}
126 |         # We must not rely on the debug symbol elffile, but instead the one
127 |         # from the executable itself
128 |         with ELFFile.load_from_path(metadata.program) as elf:
129 |             reladyn = elf.get_section_by_name(".rela.dyn")
130 |             self.relocations: Dict[int, int] = {
131 |                 reloc["r_offset"]: reloc["r_addend"]
132 |                 for reloc in reladyn.iter_relocations()
133 |             }
134 |         self.ready = False
135 |         super().__init__(metadata, options)
136 | 
137 |     def _relocate_addr(self, addr: int) -> int:
138 |         """
139 |         Relocate an address from the .rela.dyn section information.
140 |         """
141 |         if addr in self.relocations:
142 |             return self.relocations[addr]
143 |         return 0
144 | 
145 |     def _load_one_gucdef(
146 |         self, addr: int, gucdef_type: Type[Struct], binfile: BinaryIO
147 |     ) -> Optional[GUCDefinition]:
148 |         """
149 |         Load one GUC definition from the binary
150 |         """
151 |         # First lookup it's name. We could just use the base address
152 |         # since it's the first member but better make it correct
153 |         gen_definition = gucdef_type.field_definition("gen")
154 |         if gen_definition is None:
155 |             raise ValueError(
156 |                 f"Could not find member gen in struct {gucdef_type.__name__}"
157 |             )
158 |         name_definition = gen_definition.member_type.field_definition("name")  # type: ignore
159 |         if name_definition is None:
160 |             raise ValueError(
161 |                 f"Could not find member name in struct {gen_definition.member_type.__name__}"
162 |             )
163 |         name_pointer_addr = addr + gen_definition.offset + name_definition.offset
164 |         # Now lookup the relocation information for that address
165 |         reloced_addr = self._relocate_addr(name_pointer_addr)
166 |         if reloced_addr == 0:
167 |             return None
168 |         # Now we can read the data from the binary
169 |         binfile.seek(reloced_addr)
170 |         guc_bname = readcstr(binfile)
171 |         guc_name = guc_bname.decode("utf8")
172 |         # Now relocate the GUC global variable address
173 |         variable_definition = gucdef_type.field_definition("variable")
174 |         if variable_definition is None:
175 |             raise ValueError(
176 |                 f"Could not find member variable in struct {gucdef_type.__name__}"
177 |             )
178 | 
179 |         variable_pointer_addr = addr + variable_definition.offset
180 |         reloced_addr = self._relocate_addr(variable_pointer_addr)
181 |         return GUCDefinition(
182 |             guc_name=guc_name,
183 |             guc_type=gucdef_type.__name__.replace("config_", ""),
184 |             guc_location=reloced_addr + self.metadata.base_addr,
185 |         )
186 | 
187 |     def _load_guc_defs_from_binary(self) -> None:
188 |         """
189 |         Load GUC definitions from the binary executable.
190 |         """
191 |         with open(self.metadata.program, "rb") as programbin:
192 |             for typname, variable_name in self.GUC_TABLE_TYPE_TO_VARIABLE.items():
193 |                 deftype = getattr(self.metadata.structs, typname)
194 |                 typsize = deftype.size
195 |                 variable_addr = self.metadata.global_variable(variable_name)
196 |                 if variable_addr is None:
197 |                     raise ValueError(
198 |                         f"Could not locate global variable {variable_name}"
199 |                     )
200 |                 addr = variable_addr - self.metadata.base_addr
201 | 
202 |                 # Now iterate over the entries.
203 |                 while True:
204 |                     guc = self._load_one_gucdef(addr, deftype, programbin)
205 |                     if guc is None:
206 |                         break
207 |                     self.guc_defs[guc.guc_name] = guc
208 |                     addr += typsize
209 | 
210 |     def set_guc(self, guc_name: str, guc_value: str) -> None:
211 |         """
212 |         Send a request to set a GUC to a specific value.
213 |         """
214 |         guc_def = self.guc_defs[guc_name]
215 |         guc_c_value: Optional[bytes] = None
216 |         if guc_def.guc_type != "int":
217 |             raise NotImplementedError("We only support ints for now.")
218 |         guc_c_value = struct.pack("i", int(guc_value))
219 |         guc_ct_value: ct._CData = ct.create_string_buffer(guc_c_value, GUC_MAX_LENGTH)
220 |         guc_ct_value = ct.cast(
221 |             guc_ct_value, ct.POINTER(ct.c_byte * GUC_MAX_LENGTH)
222 |         ).contents
223 |         guc_req = guc_request(
224 |             ct.c_ulonglong(guc_def.guc_location), guc_size=4, payload=guc_ct_value
225 |         )
226 |         self.pending_guc_sets[guc_def.guc_location] = guc_def, guc_value
227 |         self.bpf[b"gucs_to_set"].push(guc_req)
228 | 
229 |     def setup_bpf_state(self) -> None:
230 |         super().setup_bpf_state()
231 |         # Build a mapping of GUC names to variables addresses
232 |         self._load_guc_defs_from_binary()
233 | 
234 |     @property
235 |     def constant_defines(self) -> Dict[str, int]:
236 |         constants = super().constant_defines
237 |         constants["GUC_MAX_LENGTH"] = GUC_MAX_LENGTH
238 |         return constants
239 | 
240 |     def attach_probes(self) -> None:
241 |         super().attach_probes()
242 |         # Attach at various not-too-intrusive points.
243 |         self._attach_uretprobe("BeginCommand", "process_guc_uprobe")
244 |         self._attach_uretprobe("printtup", "process_guc_uprobe")
245 | 
246 |         self._attach_uretprobe("launcher_determine_sleep", "process_guc_uprobe")
247 |         self._attach_uretprobe("vacuum_delay_point", "process_guc_uprobe")
248 | 
249 |     def _optional_code(self) -> str:
250 |         buf = super()._optional_code()
251 |         buf += load_c_file("gucset.c")
252 |         return buf
253 | 


--------------------------------------------------------------------------------
/src/pgtracer/ebpf/collector/querytracer.py:
--------------------------------------------------------------------------------
  1 | """
  2 | BPF Collector tracing queries.
  3 | """
  4 | from __future__ import annotations
  5 | 
  6 | import ctypes as ct
  7 | from dataclasses import dataclass, field
  8 | from enum import IntEnum
  9 | from typing import Dict, List, Optional, Tuple
 10 | 
 11 | from bcc import USDT
 12 | 
 13 | from pgtracer.ebpf.dwarf import ProcessMetadata
 14 | from pgtracer.model.plan import PlanState
 15 | from pgtracer.model.query import Query
 16 | 
 17 | from ...model import PlanState, Query, memory_account
 18 | from . import BPFCollector, CollectorOptions, EventHandler, EventType
 19 | from .c_defs import (
 20 |     Id128,
 21 |     io_req_data,
 22 |     memory_response,
 23 |     planstate_data,
 24 |     portal_data,
 25 |     stack_sample,
 26 | )
 27 | from .utils import load_c_file
 28 | 
 29 | 
 30 | class InstrumentationFlags(IntEnum):
 31 |     """
 32 |     Instrumentation flags.
 33 | 
 34 |     Mimic the InstrumentOption enum from PG.
 35 |     We define it statically here as it can be used from options.
 36 |     """
 37 | 
 38 |     TIMER = 1 << 0
 39 |     BUFFERS = 1 << 1
 40 |     ROWS = 1 << 2
 41 |     WAL = 1 << 3
 42 |     ALL = 0x7FFFFFFF  # INT32 Max
 43 | 
 44 | 
 45 | @dataclass
 46 | class QueryTracerOptions(CollectorOptions):
 47 |     """
 48 |     Dataclass for QueryTracerBPFCollector options.
 49 |     """
 50 | 
 51 |     instrument_flags: int = 0
 52 |     enable_nodes_collection: bool = False
 53 |     enable_query_discovery: bool = True
 54 | 
 55 | 
 56 | @dataclass
 57 | class PerProcessInfo:
 58 |     """
 59 |     Store information about the queries processed by a backend.
 60 |     """
 61 | 
 62 |     pid: int
 63 |     last_portal_key: Optional[Tuple[int, int]] = None
 64 |     query_history: List[Query] = field(default_factory=list)
 65 |     query_cache: Dict[Tuple[int, int], Query] = field(default_factory=dict)
 66 |     current_executor: Optional[Tuple[int, int]] = None
 67 |     current_query: Optional[Query] = None
 68 | 
 69 | 
 70 | # pylint: disable=invalid-name
 71 | class QueryTracerEventHandler(EventHandler):
 72 |     """
 73 |     EventHandler for QueryTracer.
 74 |     """
 75 | 
 76 |     def __init__(self) -> None:
 77 |         self.per_process_info: Dict[int, PerProcessInfo] = {}
 78 |         self.next_request_id = 0
 79 |         self.process_history: List[PerProcessInfo] = []
 80 | 
 81 |     def get_process_info(self, pid: int) -> PerProcessInfo:
 82 |         """
 83 |         Returns the process info for a given PID, creating it if needed.
 84 |         """
 85 |         if pid not in self.per_process_info:
 86 |             self.per_process_info[pid] = PerProcessInfo(pid)
 87 |         return self.per_process_info[pid]
 88 | 
 89 |     def _process_portal_data(
 90 |         self, bpf_collector: BPFCollector, event: portal_data, pid: int
 91 |     ) -> int:
 92 |         """
 93 |         Process the portal data. This is used both when a query starts, and when we see
 94 |         the first live query during query discovery.
 95 |         """
 96 |         key = event.portal_key.as_tuple()
 97 |         process_info = self.get_process_info(pid)
 98 |         process_info.current_executor = event.portal_key.as_tuple()
 99 | 
100 |         if key not in process_info.query_cache:
101 |             process_info.query_cache[key] = Query.from_event(
102 |                 bpf_collector.metadata, event
103 |             )
104 |         else:
105 |             process_info.query_cache[key].update(bpf_collector.metadata, event)
106 |         process_info.current_query = process_info.query_cache[key]
107 |         # If perf events are enabled, start watching the query instrumentation.
108 |         if bpf_collector.options.enable_perf_events:
109 |             structs = bpf_collector.metadata.structs
110 |             request = bpf_collector.build_memory_request(
111 |                 EventType.MemoryResponseQueryInstr,
112 |                 event.portal_key,
113 |                 event.query_addr,
114 |                 structs.QueryDesc,
115 |                 ["planstate", "instrument"],
116 |             )
117 |             bpf_collector.send_memory_request(pid, request)
118 |         return 0
119 | 
120 |     def handle_ExecutorRun(
121 |         self, bpf_collector: BPFCollector, event: ct._CData, pid: int
122 |     ) -> int:
123 |         """
124 |         Handle ExecutorRun event. This event is produced by an uprobe on
125 |         standard_ExecutorRun. See executorstart_enter in program.c.
126 | 
127 |         We record the fact that a query started, extracting relevant metadata
128 |         already present at the query start.
129 |         """
130 |         if bpf_collector.options.enable_perf_events:
131 |             bpf_collector.bpf[b"discovery_enabled"][ct.c_int(1)] = ct.c_bool(False)
132 |             bpf_collector.bpf[b"discovery_enabled"][ct.c_int(2)] = ct.c_bool(False)
133 |         event = ct.cast(event, ct.POINTER(portal_data)).contents
134 |         return self._process_portal_data(bpf_collector, event, pid)
135 | 
136 |     # pylint: disable=unused-argument
137 |     def handle_ExecutorFinish(
138 |         self, bpf_collector: BPFCollector, event: ct._CData, pid: int
139 |     ) -> int:
140 |         """
141 |         Handle ExecutorFinish event.
142 |         """
143 |         event = ct.cast(event, ct.POINTER(portal_data)).contents
144 |         key = event.portal_key.as_tuple()
145 |         process_info = self.get_process_info(pid)
146 |         if process_info.current_executor:
147 |             process_info.current_executor = None
148 |             process_info.current_query = None
149 |         if key in process_info.query_cache:
150 |             process_info.query_cache[event.portal_key.as_tuple()].update(
151 |                 bpf_collector.metadata, event
152 |             )
153 |         return 0
154 | 
155 |     # pylint: disable=unused-argument
156 |     def handle_DropPortalEnter(
157 |         self, bpf_collector: BPFCollector, event: ct._CData, pid: int
158 |     ) -> int:
159 |         """
160 |         Handle DropPortalEnter event. This event is produced by a uprobe on
161 |         DropPortal. See protaldrop_enter in program.c.
162 | 
163 |         PortalDrop is called whenever a query is finished: once the last row
164 |         has been read in the case of a single query, or when the cursor is
165 |         closed in the case of a cursor.
166 | 
167 |         Since PortalDrop is responsbile for cleaning up the portal, we record
168 |         the instrumentation and other data about the query here, and remember
169 |         it's identifier. Only once we return from DropPortal will we actually
170 |         clean up the query from our current cache, and append it to history.
171 |         """
172 |         event = ct.cast(event, ct.POINTER(portal_data)).contents
173 |         process_info = self.get_process_info(pid)
174 |         process_info.last_portal_key = event.portal_key.as_tuple()
175 |         if process_info.last_portal_key in process_info.query_cache:
176 |             process_info.query_cache[process_info.last_portal_key].update(
177 |                 bpf_collector.metadata, event
178 |             )
179 |         return 0
180 | 
181 |     # pylint: disable=unused-argument
182 |     def handle_DropPortalReturn(
183 |         self, bpf_collector: BPFCollector, event: ct._CData, pid: int
184 |     ) -> int:
185 |         """
186 |         Handle DropPortalReturn event. This event is produced by an uretprobe on
187 |         DropPortal. See protaldrop_return in program.c.
188 | 
189 |         We remove the query from the internal cache  and append it to history.
190 |         """
191 |         event = ct.cast(event, ct.POINTER(portal_data)).contents
192 |         process_info = self.get_process_info(pid)
193 |         if process_info.last_portal_key is not None:
194 |             if process_info.last_portal_key in process_info.query_cache:
195 |                 query = process_info.query_cache[process_info.last_portal_key]
196 |                 process_info.query_history.append(query)
197 |                 del process_info.query_cache[process_info.last_portal_key]
198 |             process_info.last_portal_key = None
199 |         process_info.current_executor = None
200 |         process_info.current_query = None
201 |         return 0
202 | 
203 |     def handle_ExecProcNodeFirst(
204 |         self, bpf_collector: BPFCollector, event: ct._CData, pid: int
205 |     ) -> int:
206 |         """
207 |         Handle ExecProcNodeFirst event. This event is produced by a uprobe on
208 |         ExecProcNodeFirst.
209 | 
210 |         The goal here is to build a plan tree for the query.
211 |         """
212 |         event = ct.cast(event, ct.POINTER(planstate_data)).contents
213 |         process_info = self.get_process_info(pid)
214 |         query = process_info.query_cache.get(event.portal_key.as_tuple())
215 |         if query is None:
216 |             # We don't know this query: maybe it started running before us ?
217 |             return 0
218 |         query.add_node_from_event(bpf_collector.metadata, event)
219 |         if bpf_collector.options.enable_perf_events:
220 |             request = bpf_collector.build_memory_request(
221 |                 EventType.MemoryResponseNodeInstr,
222 |                 Id128.from_int(event.planstate_addr),
223 |                 event.planstate_addr,
224 |                 bpf_collector.metadata.structs.PlanState,
225 |                 ["instrument"],
226 |             )
227 |             bpf_collector.send_memory_request(pid, request)
228 |         return 0
229 | 
230 |     def handle_ExecEndNode(
231 |         self, bpf_collector: BPFCollector, event: ct._CData, pid: int
232 |     ) -> int:
233 |         """
234 |         Handle ExecEndNode event. This event is produced by a uprobe on
235 |         ExecEndNode's implementations.
236 | 
237 |         Once the executor node is destroyed, we want to collect it's
238 |         instrumentation data if any.
239 |         """
240 |         event = ct.cast(event, ct.POINTER(planstate_data)).contents
241 |         process_info = self.get_process_info(pid)
242 |         if process_info.last_portal_key is None:
243 |             return 0
244 |         query = process_info.query_cache.get(process_info.last_portal_key)
245 |         if query is None:
246 |             return 0
247 |         node = query.nodes.get(event.planstate_addr)
248 |         if node is None:
249 |             return 0
250 |         instrument_addr = ct.addressof(event.instrument)
251 |         instrument = bpf_collector.metadata.structs.Instrumentation(instrument_addr)
252 |         instrument.nloops = ct.c_double(instrument.nloops.value + 1)  # type: ignore
253 |         node.instrument = instrument
254 |         return 0
255 | 
256 |     def handle_KBlockRqIssue(
257 |         self, bpf_collector: BPFCollector, event: ct._CData, pid: int
258 |     ) -> int:
259 |         """
260 |         Handle KBlockRqIssue event. This event is produced by a kernel
261 |         tracepoint on block_rq_issue.
262 | 
263 |         This serves to keep a count of block IO performed by a device, which
264 |         can be useful to compute "real" cache hit ratio.
265 |         """
266 |         event = ct.cast(event, ct.POINTER(io_req_data)).contents
267 |         process_info = self.get_process_info(pid)
268 |         # We try to attach it to a specific query.
269 |         # If we don't have one, don't bother
270 |         if not process_info.current_executor:
271 |             return 0
272 |         query = process_info.query_cache.get(process_info.current_executor)
273 |         if query is None:
274 |             return 0
275 |         if b"R" in event.rwbs:
276 |             query.io_counters["R"] += event.bytes
277 |         elif b"W" in event.rwbs:
278 |             query.io_counters["W"] += event.bytes
279 |         return 0
280 | 
281 |     def handle_MemoryResponseQueryInstr(
282 |         self, bpf_collector: BPFCollector, event: ct._CData, pid: int
283 |     ) -> int:
284 |         """
285 |         Handle MemoryResponseQueryInstr
286 | 
287 |         We lookup the request_id, and update the given counters if needed.
288 |         """
289 |         ev = ct.cast(event, ct.POINTER(memory_response)).contents
290 | 
291 |         process_info = self.get_process_info(pid)
292 |         if not process_info.current_executor:
293 |             return 0
294 |         # We have a memory response for the whole query
295 |         query = process_info.query_cache.get(ev.request_id.as_tuple(), None)
296 |         if query:
297 |             instr = bpf_collector.metadata.structs.Instrumentation(ev.payload_addr)
298 |             query.instrument = instr
299 |             # Load all fields from the underlying memory.
300 |             instr.as_dict(include_all=True)
301 |             # Re-send the same request for continuous monitoring
302 |             request = bpf_collector.build_memory_request(
303 |                 EventType.MemoryResponseQueryInstr,
304 |                 ev.request_id,
305 |                 query.addr,
306 |                 bpf_collector.metadata.structs.QueryDesc,
307 |                 ["planstate", "instrument"],
308 |             )
309 | 
310 |             bpf_collector.send_memory_request(pid, request)
311 |         return 0
312 | 
313 |     def handle_MemoryResponseNodeInstr(
314 |         self, bpf_collector: BPFCollector, event: ct._CData, pid: int
315 |     ) -> int:
316 |         """
317 |         Handle MemoryResponseNodeInstr produced as a response to some memory_request.
318 |         """
319 |         process_info = self.get_process_info(pid)
320 |         if not process_info.current_executor:
321 |             return 0
322 |         query = process_info.query_cache.get(process_info.current_executor, None)
323 |         ev = ct.cast(event, ct.POINTER(memory_response)).contents
324 |         nodeid = ev.request_id.as_int()
325 |         # We have a memory response for an individual node
326 |         if query is not None and nodeid is not None:
327 |             node = query.nodes.get(nodeid)
328 |             if node is not None:
329 |                 instr = bpf_collector.metadata.structs.Instrumentation(ev.payload_addr)
330 |                 node.instrument = instr
331 |                 # Re-send the same request for continuous monitoring
332 |                 request = bpf_collector.build_memory_request(
333 |                     EventType.MemoryResponseNodeInstr,
334 |                     Id128.from_int(nodeid),
335 |                     nodeid,
336 |                     bpf_collector.metadata.structs.PlanState,
337 |                     ["instrument"],
338 |                 )
339 |                 bpf_collector.send_memory_request(pid, request)
340 |         return 0
341 | 
342 |     def handle_MemoryNodeData(
343 |         self, bpf_collector: BPFCollector, event: ct._CData, pid: int
344 |     ) -> int:
345 |         """
346 |         Handle MemoryNodeData produced as a response for a memory_request.
347 |         """
348 |         process_info = self.get_process_info(pid)
349 |         if not process_info.current_executor:
350 |             return 0
351 |         ev = ct.cast(event, ct.POINTER(planstate_data)).contents
352 |         query = process_info.query_cache.get(process_info.current_executor, None)
353 |         if query is not None:
354 |             node = query.add_node_from_event(bpf_collector.metadata, ev)
355 |             if ev.lefttree and ev.lefttree not in query.nodes:
356 |                 leftchild = PlanState(ev.lefttree)
357 |                 leftchild.parent_node = node
358 |                 query.nodes[ev.lefttree] = leftchild
359 |                 node.children[leftchild] = None
360 |                 self._gather_node_info(bpf_collector, ev.lefttree, pid)
361 |             if ev.righttree and ev.righttree not in query.nodes:
362 |                 rightchild = PlanState(ev.righttree)
363 |                 rightchild.parent_node = node
364 |                 query.nodes[ev.righttree] = rightchild
365 |                 node.children[rightchild] = None
366 |                 self._gather_node_info(bpf_collector, ev.righttree, pid)
367 |         return 0
368 | 
369 |     def _gather_node_info(
370 |         self, bpf_collector: BPFCollector, nodeaddr: int, pid: int
371 |     ) -> None:
372 |         """
373 |         Send memory requests to gather information about a specific node.
374 |         """
375 |         req = bpf_collector.build_memory_request(
376 |             EventType.MemoryNodeData,
377 |             Id128.from_int(nodeaddr),
378 |             nodeaddr,
379 |             bpf_collector.metadata.structs.PlanState,
380 |             [],
381 |         )
382 |         bpf_collector.send_memory_request(pid, req)
383 | 
384 |     def handle_StackSample(
385 |         self, bpf_collector: BPFCollector, event: ct._CData, pid: int
386 |     ) -> int:
387 |         """
388 |         Handle StackSample events produced during perf sampling.
389 |         """
390 |         ev = ct.cast(event, ct.POINTER(stack_sample)).contents
391 |         process_info = self.get_process_info(pid)
392 |         _, creation_time = ev.portal_data.portal_key.as_tuple()
393 |         if creation_time:
394 |             self._process_portal_data(bpf_collector, ev.portal_data, pid)
395 |         bpf_collector.bpf[b"discovery_enabled"][ct.c_int(1)] = ct.c_bool(False)
396 |         if process_info.current_query:
397 |             # Now add the nodes from the stacktrace
398 |             process_info.current_query.add_nodes_from_stack(
399 |                 bpf_collector.metadata, ev.stack_data
400 |             )
401 |             # And add memory_requests to gather their information.
402 |             for node in process_info.current_query.nodes.values():
403 |                 if node.is_stub and node.addr:
404 |                     self._gather_node_info(bpf_collector, node.addr, pid)
405 |         return 0
406 | 
407 |     def handle_MemoryAccount(
408 |         self, bpf_collector: BPFCollector, event: ct._CData, pid: int
409 |     ) -> int:
410 |         """
411 |         Handle MemoryAccount events produced by malloc instrumentation.
412 |         """
413 |         ev = ct.cast(event, ct.POINTER(memory_account)).contents
414 |         process_info = self.get_process_info(pid)
415 |         if process_info.current_query:
416 |             process_info.current_query.memallocs.update(ev)
417 |         return 0
418 | 
419 | 
420 | class QueryTracerBPFCollector(BPFCollector):
421 |     """
422 |     BPF Collector tracing queries and optionally individual nodes.
423 |     """
424 | 
425 |     options_cls = QueryTracerOptions
426 |     event_handler_cls = QueryTracerEventHandler
427 | 
428 |     def __init__(
429 |         self,
430 |         metadata: ProcessMetadata,
431 |         options: Optional[QueryTracerOptions] = None,
432 |         include_children: bool = False,
433 |     ):
434 |         self.options: QueryTracerOptions
435 |         self.event_handler: QueryTracerEventHandler
436 |         super().__init__(metadata, options, include_children)
437 | 
438 |     def attach_probes(self) -> None:
439 |         super().attach_probes()
440 |         self._attach_uprobe("PortalDrop", "portaldrop_enter")
441 |         self._attach_uretprobe("PortalDrop", "portaldrop_return")
442 |         self._attach_uprobe("standard_ExecutorStart", "executorstart_enter")
443 |         self._attach_uprobe("standard_ExecutorRun", "executorrun_enter")
444 |         self._attach_uprobe("ExecutorFinish", "executorfinish_enter")
445 |         self._attach_uprobe("mmap", "mmap_enter")
446 |         self.bpf.attach_uprobe(
447 |             name=b"c", sym=b"mmap", fn_name=b"mmap_enter", pid=self.pid
448 |         )
449 |         self.bpf.attach_uprobe(
450 |             name=b"c", sym=b"munmap", fn_name=b"munmap_enter", pid=self.pid
451 |         )
452 |         if self.options.enable_nodes_collection:
453 |             self._attach_uprobe("ExecProcNodeFirst", "execprocnodefirst_enter")
454 |             for func in self.ExecEndFuncs:
455 |                 self._attach_uprobe(func, "execendnode_enter")
456 | 
457 |     def enable_usdt_probes(self, usdt: USDT) -> None:
458 |         usdt.enable_probe(probe="libc:memory_sbrk_less", fn_name="sbrk_less")
459 |         usdt.enable_probe(probe="libc:memory_sbrk_more", fn_name="sbrk_more")
460 | 
461 |     @property
462 |     def constant_defines(self) -> Dict[str, int]:
463 |         constants = super().constant_defines
464 |         # USER_INSTRUMENT_FLAGS is defined only if the user wants to
465 |         # inconditonally turn on instrumentation.
466 |         if self.options.instrument_flags:
467 |             constants["USER_INSTRUMENT_FLAGS"] = self.options.instrument_flags
468 |         if self.options.enable_query_discovery:
469 |             if not self.ppid:
470 |                 constants["ENABLE_QUERY_DISCOVERY"] = True
471 |         return constants
472 | 
473 |     def _optional_code(self) -> str:
474 |         buf = super()._optional_code()
475 |         if self.options.enable_nodes_collection:
476 |             buf += load_c_file("plan.c")
477 |         buf += load_c_file("block_rq.c")
478 |         buf += load_c_file("memusage.c")
479 |         return buf
480 | 
481 |     def setup_bpf_state(self) -> None:
482 |         # FIXME: get rid of those magic numbers.
483 |         super().setup_bpf_state()
484 |         if self.options.enable_perf_events:
485 |             self.bpf[b"discovery_enabled"][ct.c_int(1)] = ct.c_bool(
486 |                 self.options.enable_query_discovery
487 |             )
488 |             self.bpf[b"discovery_enabled"][ct.c_int(2)] = ct.c_bool(
489 |                 self.options.enable_query_discovery
490 |             )
491 | 
492 |     def cleanup_process(self, pid: int) -> int:
493 |         if pid in self.event_handler.per_process_info:
494 |             self.event_handler.process_history.append(
495 |                 self.event_handler.per_process_info.pop(pid)
496 |             )
497 |         return super().cleanup_process(pid)
498 | 


--------------------------------------------------------------------------------
/src/pgtracer/ebpf/collector/utils.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Various utilities for collector implementations.
 3 | """
 4 | from enum import IntEnum
 5 | from pathlib import Path
 6 | from typing import Any, Dict, Type
 7 | 
 8 | 
 9 | def intenum_to_c(intenum: Type[IntEnum]) -> str:
10 |     """
11 |     Generate C code defining an enum corresponding to a Python IntEnum.
12 |     """
13 |     buf = f"enum {intenum.__name__} {{\n"
14 |     members = []
15 | 
16 |     for member in intenum:
17 |         members.append(f"{intenum.__name__}{member.name} = {member.value}")
18 |     buf += ",\n".join(members)
19 |     buf += "\n};\n"
20 | 
21 |     return buf
22 | 
23 | 
24 | def defines_dict_to_c(defines_dict: Dict[str, Any]) -> str:
25 |     """
26 |     Generate a string of C #define directives from a mapping.
27 |     """
28 |     return (
29 |         "\n".join(f"#define {key} {value}" for key, value in defines_dict.items())
30 |         + "\n"
31 |     )
32 | 
33 | 
34 | CODE_BASE_PATH = Path(__file__).parent.parent / "code"
35 | 
36 | 
37 | def load_c_file(filename: str) -> str:
38 |     """
39 |     Loads a C file from the package code directory.
40 |     """
41 |     filepath = CODE_BASE_PATH / filename
42 |     with filepath.open() as cfile:
43 |         return cfile.read()
44 | 


--------------------------------------------------------------------------------
/src/pgtracer/ebpf/eh_frame_hdr.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This module contains code for parsing an .eh_frame_hdr section.
  3 | """
  4 | from __future__ import annotations
  5 | 
  6 | import struct
  7 | from enum import IntEnum
  8 | from typing import TYPE_CHECKING, Any, Iterable, Optional, Tuple, no_type_check
  9 | 
 10 | from elftools.dwarf.callframe import CallFrameInfo
 11 | from elftools.dwarf.enums import DW_EH_encoding_flags
 12 | from elftools.elf.elffile import ELFFile
 13 | 
 14 | if TYPE_CHECKING:
 15 |     from elftools.dwarf.callframe import CFIEntry
 16 |     from elftools.elf.sections import Section
 17 | 
 18 | DW_EH_Encoding = IntEnum("DW_EH_Encoding", DW_EH_encoding_flags)  # type: ignore
 19 | 
 20 | 
 21 | class EhFrameHdr:
 22 |     """
 23 |     Parsed .eh_frame_hdr section
 24 |     """
 25 | 
 26 |     def __init__(self, section: Section, elffile: ELFFile):
 27 |         self.elffile = elffile
 28 |         self.section = section
 29 |         self.offset = self.section.global_offset
 30 |         self.eh_frame_hdr_start = self.section.stream.tell()
 31 |         # First read the fixed header
 32 |         (
 33 |             self.version,
 34 |             self.eh_frame_ptr_enc,
 35 |             self.fde_count_enc,
 36 |             self.table_enc,
 37 |         ) = self._unpack_from("<4B", offset=0)
 38 |         self.frame_ptr: int = self.read_value(self.eh_frame_ptr_enc)  # type: ignore
 39 |         self.fde_count: int = self.read_value(self.fde_count_enc)  # type: ignore
 40 |         self.table_start = self.section.stream.tell()
 41 |         self.dwarf_info = elffile.get_dwarf_info()
 42 |         self.cfi = CallFrameInfo(
 43 |             stream=self.dwarf_info.eh_frame_sec.stream,
 44 |             size=self.dwarf_info.eh_frame_sec.size,
 45 |             address=self.dwarf_info.eh_frame_sec.address,
 46 |             base_structs=self.dwarf_info.structs,
 47 |             for_eh_frame=True,
 48 |         )
 49 | 
 50 |     @no_type_check
 51 |     def read_value(
 52 |         self,
 53 |         encoding: int,
 54 |         offset: Optional[int] = None,
 55 |         relative: bool = True,
 56 |         program_counter: int = 0,
 57 |     ) -> int:
 58 |         """
 59 |         Read a value with the given encoding at the specific offset.
 60 |         Relative indicate wether the offset is relative to the start of the
 61 |         section or absolute in the ELFFile.
 62 |         program_counter is the current program counter used for DW_EH_PE_pcrel calculations.
 63 |         """
 64 |         value_enc = encoding & 0x0F
 65 |         relative_enc = encoding & 0x70
 66 |         if value_enc == DW_EH_Encoding.DW_EH_PE_absptr:
 67 |             result = self._unpack_from("@B", offset=offset, relative=relative)
 68 |         elif value_enc == DW_EH_Encoding.DW_EH_PE_udata2:
 69 |             result = self._unpack_from("@H", offset=offset, relative=relative)
 70 |         elif value_enc == DW_EH_Encoding.DW_EH_PE_sdata2:
 71 |             result = self._unpack_from("@h", offset=offset, relative=relative)
 72 |         elif value_enc == DW_EH_Encoding.DW_EH_PE_udata4:
 73 |             result = self._unpack_from("@I", offset=offset, relative=relative)
 74 |         elif value_enc == DW_EH_Encoding.DW_EH_PE_sdata4:
 75 |             result = self._unpack_from("@i", offset=offset, relative=relative)
 76 |         elif value_enc == DW_EH_Encoding.DW_EH_PE_udata8:
 77 |             result = self._unpack_from("@Q", offset=offset, relative=relative)
 78 |         elif value_enc == DW_EH_Encoding.DW_EH_PE_sdata8:
 79 |             result = self._unpack_from("@q", offset=offset, relative=relative)
 80 |         else:
 81 |             raise ValueError(f"Unknown value encoding: {value_enc}")
 82 | 
 83 |         result = result[0]
 84 | 
 85 |         if relative_enc == DW_EH_Encoding.DW_EH_PE_absptr:
 86 |             pass
 87 |         elif relative_enc == DW_EH_Encoding.DW_EH_PE_pcrel:
 88 |             result += program_counter
 89 |         elif relative_enc == DW_EH_Encoding.DW_EH_PE_datarel:
 90 |             result += self.offset
 91 |         else:
 92 |             raise ValueError(f"Pointer encoding {relative_enc} not supported")
 93 |         return result
 94 | 
 95 |     @no_type_check
 96 |     def get_table_entry_size(self) -> int:
 97 |         """
 98 |         Returns the size of a table entry.
 99 |         """
100 |         enc = self.table_enc & 0x0F
101 |         if enc in (DW_EH_Encoding.DW_EH_PE_udata2, DW_EH_Encoding.DW_EH_PE_sdata2):
102 |             return 4
103 |         if enc in (DW_EH_Encoding.DW_EH_PE_udata4, DW_EH_Encoding.DW_EH_PE_sdata4):
104 |             return 8
105 |         if enc in (DW_EH_Encoding.DW_EH_PE_udata8, DW_EH_Encoding.DW_EH_PE_sdata8):
106 |             return 16
107 |         if enc == DW_EH_Encoding.DW_EH_PE_omit:
108 |             return 0
109 |         raise ValueError(f"Invalid table encoding: {enc}")
110 | 
111 |     def _read_section(
112 |         self, size: int, offset: Optional[int], relative: bool = False
113 |     ) -> Any:
114 |         """
115 |         Read `size` bytes from the underlying stream at the given `offset`.
116 |         relative indicates whether the given offset is relative to the
117 |         .eh_frame_hdr section start, or absolute in the ELFFile.
118 |         """
119 |         stream = self.section.stream
120 |         if offset is not None:
121 |             if relative:
122 |                 offset = offset + self.offset
123 |             stream.seek(offset)
124 |         return stream.read(size)
125 | 
126 |     def _unpack_from(
127 |         self, fmt: str, offset: Optional[int] = None, relative: bool = False
128 |     ) -> Tuple[int, ...]:
129 |         """
130 |         Unpack a value read at offset according to format.
131 |         """
132 |         size = struct.calcsize(fmt)
133 |         buffer = self._read_section(size, offset, relative)
134 |         return struct.unpack_from(fmt, buffer)
135 | 
136 |     def read_entry(self, offset: Optional[int] = None) -> Tuple[int, int]:
137 |         """
138 |         Read a table entry at the given offset. .eh_frame_hdr table entries are
139 |         couples of location / offset of the corresponding FDE.
140 |         """
141 |         loc_val: int = self.read_value(self.table_enc, offset, relative=False)
142 |         offset_val: int = self.read_value(self.table_enc)
143 |         return (loc_val, offset_val)
144 | 
145 |     def iter_entries(self) -> Iterable[Tuple[int, int]]:
146 |         """
147 |         Iter over .eh_frame_hdr table entries.
148 |         """
149 |         self.section.stream.seek(self.table_start)
150 |         for _ in range(0, self.fde_count):
151 |             yield self.read_entry()
152 | 
153 |     def find_fde(self, addrkey: int) -> Optional[CFIEntry]:
154 |         """
155 |         Find an antry by doing a binary search.
156 |         """
157 |         minidx = 0
158 |         maxidx = self.fde_count
159 |         size = self.get_table_entry_size()
160 |         while True:
161 |             idx = minidx + (maxidx - minidx) // 2
162 |             offset = self.table_start + idx * size
163 |             (addr, loc) = self.read_entry(offset=offset)
164 |             # We found the looked up key, now we need to find the right tag
165 |             if addrkey == addr or (minidx == idx and addrkey > addr):
166 |                 fde = self.cfi._parse_entry_at(
167 |                     loc - self.cfi.address
168 |                 )  # pylint: disable=protected-access
169 |                 if addrkey < fde.header.initial_location + fde.header.address_range:
170 |                     return fde
171 |                 # If the key is not in range, then we don't have an entry.
172 |                 return None
173 |             if addrkey < addr:
174 |                 if maxidx == idx:
175 |                     return None
176 |                 maxidx = idx
177 |             elif addrkey > addr:
178 |                 minidx = idx
179 | 
180 |     @classmethod
181 |     def load_eh_frame_hdr(cls, elf_file: ELFFile) -> Optional[EhFrameHdr]:
182 |         """
183 |         Load an EHFrameHDR from an ELFFile.
184 |         """
185 |         eh_frame_hdr = elf_file.get_section_by_name(".eh_frame_hdr")
186 |         if eh_frame_hdr is None:
187 |             return None
188 | 
189 |         # pylint: disable=protected-access
190 |         eh_frame_hdr = elf_file._read_dwarf_section(
191 |             eh_frame_hdr, relocate_dwarf_sections=True
192 |         )
193 |         eh_frame_hdr_data = EhFrameHdr(eh_frame_hdr, elf_file)
194 |         return eh_frame_hdr_data
195 | 


--------------------------------------------------------------------------------
/src/pgtracer/ebpf/unwind.py:
--------------------------------------------------------------------------------
  1 | # pylint: disable=invalid-name
  2 | """
  3 | This module provides access to libunwind through ctypes.
  4 | """
  5 | from __future__ import annotations
  6 | 
  7 | import ctypes as ct
  8 | import ctypes.util
  9 | import platform
 10 | import re
 11 | from functools import cached_property
 12 | from pathlib import Path
 13 | from typing import TYPE_CHECKING, Any, Generator, List, Optional, Tuple, Type, TypeVar
 14 | 
 15 | from elftools.dwarf.callframe import CFARule, CFIEntry
 16 | from elftools.dwarf.die import DIE, AttributeValue
 17 | from elftools.dwarf.dwarf_expr import DWARFExprOp, DWARFExprParser
 18 | from elftools.dwarf.locationlists import BaseAddressEntry, LocationEntry, LocationExpr
 19 | 
 20 | from .dwarf import MappedRegion, ProcessMetadata, die_name
 21 | 
 22 | if TYPE_CHECKING:
 23 |     try:
 24 |         from typing import TypeAlias  # type: ignore
 25 |     except ImportError:
 26 |         from typing_extensions import TypeAlias
 27 |     CFuncPtr: TypeAlias = ct._FuncPointer  # pylint: disable=protected-access
 28 |     Pointer: TypeAlias = ct.pointer
 29 |     SimpleCData = ct._SimpleCData[Any]  # pylint: disable=protected-access
 30 | else:
 31 |     # Make pylint happy
 32 |     CFuncPtr = object()
 33 |     Pointer = List
 34 |     SimpleCData = Any
 35 | 
 36 | 
 37 | CT = TypeVar("CT", bound=SimpleCData)
 38 | 
 39 | ARCH = platform.machine()
 40 | 
 41 | 
 42 | def find_libunwind_version() -> Tuple[int, int]:
 43 |     """
 44 |     Returns the libunwind version.
 45 |     We try to extract this from the headers.
 46 | 
 47 |     TODO: maybe we should call cc to get the actual include dirs ?
 48 |     """
 49 |     include_dir_candidates = [
 50 |         Path("/usr/include/"),
 51 |         Path(f"/usr/include/{ARCH}-linux-gnu/"),
 52 |     ]
 53 |     major_re = re.compile(r"#define UNW_VERSION_MAJOR\s+(\d+)")
 54 |     minor_re = re.compile(r"#define UNW_VERSION_MINOR\s+(\d+)")
 55 |     header_filename = Path("libunwind-common.h")
 56 |     major_version = None
 57 |     minor_version = None
 58 |     found = False
 59 |     for candidate in include_dir_candidates:
 60 |         include_file = candidate / header_filename
 61 |         if include_file.exists():
 62 |             with include_file.open() as f:
 63 |                 for line in f:
 64 |                     match = major_re.match(line)
 65 |                     if match:
 66 |                         found = True
 67 |                         major_version = int(match.group(1))
 68 |                         continue
 69 |                     match = minor_re.match(line)
 70 |                     if match:
 71 |                         found = True
 72 |                         minor_version = int(match.group(1))
 73 |         if found:
 74 |             break
 75 |     if major_version is None or minor_version is None:
 76 |         raise ValueError("Could not identify libunwind version !")
 77 |     return (major_version, minor_version)
 78 | 
 79 | 
 80 | LIBUNWIND_VERSION = find_libunwind_version()
 81 | 
 82 | UNW_PREFIX = f"_U{ARCH}_"
 83 | libname = ctypes.util.find_library(f"unwind-{ARCH}")
 84 | if libname is None:
 85 |     raise ImportError(f"Cannot load libunwind-{ARCH}")
 86 | libunwind = ct.cdll.LoadLibrary(libname)
 87 | if ARCH == "x86_64":
 88 |     UNW_TDEP_CURSOR_LEN = 127
 89 |     unw_word_t = ct.c_ulonglong
 90 |     UNW_WORD_T_FORMAT = "<Q"
 91 |     unw_tdep_fpreg_t = ct.c_longdouble
 92 |     MAX_STACK_READ = 1 << 16
 93 |     stack_array = ct.c_ubyte * MAX_STACK_READ
 94 |     REG_NAMES = [
 95 |         "rax",
 96 |         "rdx",
 97 |         "rcx",
 98 |         "rbx",
 99 |         "rsi",
100 |         "rdi",
101 |         "rbp",
102 |         "rsp",
103 |         "r8",
104 |         "r9",
105 |         "r10",
106 |         "r11",
107 |         "r12",
108 |         "r13",
109 |         "r14",
110 |         "r15",
111 |         "rip",
112 |     ]
113 |     UNW_REG_IP = REG_NAMES.index("rip")
114 | 
115 |     # This corresponds to the stack and registers captured from ebpf,
116 |     # and is architecture specific
117 |     class stack_data_t(ct.Structure):
118 |         """
119 |         Mapping of stack_data_t type, defined in ebpf code.
120 |         """
121 | 
122 |         _fields_ = [
123 |             ("rax", ct.c_ulong),
124 |             ("rdx", ct.c_ulong),
125 |             ("rcx", ct.c_ulong),
126 |             ("rbx", ct.c_ulong),
127 |             ("rsi", ct.c_ulong),
128 |             ("rdi", ct.c_ulong),
129 |             ("rbp", ct.c_ulong),
130 |             ("rsp", ct.c_ulong),
131 |             ("r8", ct.c_ulong),
132 |             ("r9", ct.c_ulong),
133 |             ("r10", ct.c_ulong),
134 |             ("r11", ct.c_ulong),
135 |             ("r12", ct.c_ulong),
136 |             ("r13", ct.c_ulong),
137 |             ("r14", ct.c_ulong),
138 |             ("r15", ct.c_ulong),
139 |             ("rip", ct.c_ulong),
140 |             ("size", ct.c_ulong),
141 |             ("start_addr", ct.c_ulong),
142 |             ("stack", stack_array),
143 |         ]
144 | 
145 |     class unw_tdep_proc_info_t(ct.Structure):
146 |         """
147 |         Mapping of unw_tdep_proc_info_t
148 |         """
149 | 
150 |         _fields_ = [("unused", ct.c_char)]
151 | 
152 |     class unw_proc_info_t(ct.Structure):
153 |         """
154 |         Mapping of unw_proc_info_t type
155 |         """
156 | 
157 |         _fields_ = [
158 |             ("start_ip", unw_word_t),
159 |             ("end_ip", unw_word_t),
160 |             ("lsda", unw_word_t),
161 |             ("handler", unw_word_t),
162 |             ("gp", unw_word_t),
163 |             ("flags", unw_word_t),
164 |             ("format", ct.c_int),
165 |             ("unwind_info_size", ct.c_int),
166 |             ("unwind_info", ct.c_void_p),
167 |             ("extra", unw_tdep_proc_info_t),
168 |         ]  # FIXME; this type
169 | 
170 | else:
171 |     raise NotImplementedError(f"Stack unwinding is not supporte for {ARCH} arch")
172 | 
173 | UNW_INFO_FORMAT_REMOTE_TABLE = 2
174 | UNW_ENOINFO = 10
175 | UNW_EINVAL = 8
176 | UNW_ESTOPUNWIND = 5
177 | 
178 | 
179 | def unw_func(funcname: str) -> CFuncPtr:
180 |     """
181 |     Returns the CPointer function of that name. Depending on the architecture,
182 |     the function names are not the same.
183 |     """
184 |     return getattr(libunwind, f"{UNW_PREFIX}{funcname}")
185 | 
186 | 
187 | class unw_dyn_remote_table_info_t(ct.Structure):
188 |     """
189 |     Mapping of unw_dyn_remote_table_info_t type.
190 |     """
191 | 
192 |     _fields_ = [
193 |         ("name_ptr", unw_word_t),
194 |         ("segbase", unw_word_t),
195 |         ("table_len", unw_word_t),
196 |         ("table_data", unw_word_t),
197 |     ]
198 | 
199 | 
200 | # We have to define the fields after the class, as it is a self-referencing
201 | # type.
202 | class unw_dyn_info_t(ct.Structure):
203 |     """
204 |     Mapping of unw_dyn_info_t type.
205 |     """
206 | 
207 | 
208 | # Libunwind does not preserve perfect ABI compatibility.
209 | load_offset_field = []
210 | if LIBUNWIND_VERSION >= (1, 6):
211 |     load_offset_field = [("load_offset", unw_word_t)]
212 | 
213 | unw_dyn_info_t._fields_ = [  # pylint: disable=protected-access
214 |     ("next", ct.POINTER(unw_dyn_info_t)),
215 |     ("prev", ct.POINTER(unw_dyn_info_t)),
216 |     ("start_ip", unw_word_t),
217 |     ("end_ip", unw_word_t),
218 |     ("gp", unw_word_t),
219 |     ("format", ct.c_int32),
220 |     ("pad", ct.c_int32),
221 |     *load_offset_field,
222 |     ("rti", unw_dyn_remote_table_info_t)  # Supposed to be an union, but we will
223 |     # only ever use this one.
224 | ]
225 | 
226 | unw_regnum_t = ct.c_int
227 | unw_fpreg_t = unw_tdep_fpreg_t
228 | # Opaque type
229 | unw_addr_space_t = ct.c_void_p
230 | 
231 | # Definition of function types
232 | FIND_PROC_INFO_FUNCTYPE = ct.CFUNCTYPE(
233 |     ct.c_int,  # Return value
234 |     unw_addr_space_t,
235 |     unw_word_t,
236 |     ct.POINTER(unw_proc_info_t),
237 |     ct.c_int,
238 |     ct.c_void_p,
239 | )
240 | PUT_UNWIND_INFO_FUNCTYPE = ct.CFUNCTYPE(
241 |     None, unw_addr_space_t, ct.POINTER(unw_proc_info_t), ct.c_void_p
242 | )
243 | GET_DYN_INFO_LIST_ADDR_FUNCTYPE = ct.CFUNCTYPE(
244 |     ct.c_int, unw_addr_space_t, ct.POINTER(unw_word_t), ct.c_void_p
245 | )
246 | ACCESS_MEM_FUNCTYPE = ct.CFUNCTYPE(
247 |     ct.c_int,
248 |     unw_addr_space_t,
249 |     unw_word_t,
250 |     ct.POINTER(unw_word_t),
251 |     ct.c_int,
252 |     ct.c_void_p,
253 | )
254 | ACCESS_REG_FUNCTYPE = ct.CFUNCTYPE(
255 |     ct.c_int,
256 |     unw_addr_space_t,
257 |     unw_regnum_t,
258 |     ct.POINTER(unw_word_t),
259 |     ct.c_int,
260 |     ct.c_void_p,
261 | )
262 | ACCESS_FPREG_FUNCTYPE = ct.CFUNCTYPE(
263 |     ct.c_int,
264 |     unw_addr_space_t,
265 |     unw_regnum_t,
266 |     ct.POINTER(unw_fpreg_t),
267 |     ct.c_int,
268 |     ct.c_void_p,
269 | )
270 | GET_PROC_NAME_FUNCTYPE = ct.CFUNCTYPE(
271 |     ct.c_int,
272 |     unw_addr_space_t,
273 |     unw_word_t,
274 |     ct.c_char_p,
275 |     ct.c_size_t,
276 |     ct.POINTER(unw_word_t),
277 |     ct.c_void_p,
278 | )
279 | 
280 | create_addr_space = unw_func("create_addr_space")
281 | create_addr_space.restype = ct.c_void_p
282 | create_addr_space.argtypes = [ct.c_void_p, ct.c_int]
283 | 
284 | init_remote = unw_func("init_remote")
285 | init_remote.restype = ct.c_int
286 | init_remote.argtypes = [ct.c_void_p, ct.c_void_p, ct.c_int]
287 | 
288 | 
289 | dwarf_search_unwind_table = unw_func("dwarf_search_unwind_table")
290 | dwarf_search_unwind_table.restype = ct.c_int
291 | dwarf_search_unwind_table.argtypes = [
292 |     unw_addr_space_t,
293 |     unw_word_t,
294 |     ct.POINTER(unw_dyn_info_t),
295 |     ct.POINTER(unw_proc_info_t),
296 |     ct.c_int,
297 |     ct.c_void_p,
298 | ]
299 | 
300 | 
301 | class unw_cursor_t(ct.Structure):
302 |     """
303 |     Mapping of unw_cursor_t type.
304 |     """
305 | 
306 |     _fields_ = [("opaque", unw_word_t * UNW_TDEP_CURSOR_LEN)]
307 | 
308 | 
309 | step = unw_func("step")
310 | step.restype = ct.c_int
311 | step.argtypes = [ct.POINTER(unw_cursor_t)]
312 | 
313 | get_reg = unw_func("get_reg")
314 | get_reg.restype = ct.c_int
315 | get_reg.argtypes = [ct.POINTER(unw_cursor_t), unw_regnum_t, ct.POINTER(unw_word_t)]
316 | 
317 | 
318 | class unw_accesors(ct.Structure):
319 |     """
320 |     Mapping of unw_accessors type.
321 |     """
322 | 
323 |     _fields_ = [
324 |         ("find_proc_info", FIND_PROC_INFO_FUNCTYPE),
325 |         ("put_unwind_info", PUT_UNWIND_INFO_FUNCTYPE),
326 |         ("get_dyn_info_list_addr", GET_DYN_INFO_LIST_ADDR_FUNCTYPE),
327 |         ("access_mem", ACCESS_MEM_FUNCTYPE),
328 |         ("access_reg", ACCESS_REG_FUNCTYPE),
329 |         ("access_fpreg", ACCESS_FPREG_FUNCTYPE),
330 |         ("resume", ct.c_void_p),  # Unused
331 |         ("get_proc_name", GET_PROC_NAME_FUNCTYPE),
332 |     ]
333 | 
334 | 
335 | class Frame:
336 |     """
337 |     A stack frame.
338 |     """
339 | 
340 |     def __init__(
341 |         self,
342 |         stack: ct._CData,
343 |         ip: int,
344 |         die: DIE,
345 |         start_addr: int,
346 |         processmetadata: ProcessMetadata,
347 |         cursor: unw_cursor_t,
348 |         prev_frame: Optional[Frame] = None,
349 |         next_frame: Optional[Frame] = None,
350 |     ):
351 |         self.stack = stack
352 |         self.ip = ip
353 |         self.die = die
354 | 
355 |         self.start_addr = start_addr
356 |         self.processmetadata = processmetadata
357 |         # We don't keep the cursor itself, we make a copy instead.
358 |         self.cursor = unw_cursor_t()
359 |         ct.pointer(self.cursor)[0] = cursor
360 |         self.prev_frame = prev_frame
361 |         self.next_frame = next_frame
362 | 
363 |     @cached_property
364 |     def fde(self) -> Optional[CFIEntry]:
365 |         """
366 |         Returns the FDE associated with this call frame.
367 |         """
368 |         region = self.region
369 |         if region is None:
370 |             return None
371 |         v_ip = self.ip - region.start
372 |         if region.eh_frame_hdr is None:
373 |             return None
374 |         fde = region.eh_frame_hdr.find_fde(v_ip)
375 |         return fde
376 | 
377 |     @cached_property
378 |     def _expr_parser(self) -> DWARFExprParser:
379 |         """
380 |         DWARF Expr parser.
381 |         """
382 |         return DWARFExprParser(self.processmetadata.dwarf_info.structs)
383 | 
384 |     @cached_property
385 |     def cfa_rule(self) -> Optional[CFARule]:
386 |         """
387 |         Returns the CFA rule associated with this call frame.
388 |         """
389 |         if self.fde is None:
390 |             return None
391 |         for row in reversed(self.fde.get_decoded().table):
392 |             if row["pc"] < self.ip - self.region.start:
393 |                 return row["cfa"]
394 |         return None
395 | 
396 |     @cached_property
397 |     def cfa(self) -> Optional[int]:
398 |         """
399 |         Compute the CFA for this call frame.
400 |         """
401 |         if self.cfa_rule is None:
402 |             return None
403 |         cfa_reg_value = unw_word_t(0)
404 |         get_reg(self.cursor, self.cfa_rule.reg, ct.byref(cfa_reg_value))
405 |         return cfa_reg_value.value + self.cfa_rule.offset - self.start_addr  # type: ignore
406 | 
407 |     @cached_property
408 |     def region(self) -> MappedRegion:
409 |         """
410 |         Return the MappedRegion correspoding to this Frame's IP.
411 |         """
412 |         region = self.processmetadata.map_for_addr(self.ip)
413 |         if region is None:
414 |             raise ValueError("This frame could not be associated to a region.")
415 |         return region
416 | 
417 |     @cached_property
418 |     def function_name(self) -> Optional[str]:
419 |         """
420 |         Returns the function name associated to this frame's DIE
421 |         """
422 |         if self.die is None:
423 |             return None
424 |         return die_name(self.die)
425 | 
426 |     def _get_parsed_expr_for_attribute(self, argnum: int) -> List[DWARFExprOp]:
427 |         """
428 |         Returns a list of parsed DwarfEXPROp for the attribute corresponding to the
429 |         argnum'th argument.
430 |         """
431 |         curargnum = 0
432 |         if self.die is None:
433 |             return []
434 |         for subdie in self.die.iter_children():
435 |             if subdie.tag == "DW_TAG_formal_parameter":
436 |                 curargnum += 1
437 |                 if curargnum == argnum:
438 |                     locattr = subdie.attributes["DW_AT_location"]
439 |                     return self._get_parsed_exprs_from_loc(subdie, locattr)
440 |         return []
441 | 
442 |     def _get_parsed_exprs_from_loc(
443 |         self, die: DIE, locattr: AttributeValue
444 |     ) -> List[DWARFExprOp]:
445 |         """
446 |         Returns a list of parsed DWARFExprOp for a given attribute.
447 |         """
448 |         expr = None
449 |         loc = self.processmetadata.location_parser.parse_from_attribute(
450 |             locattr, die.cu.header.version, die
451 |         )
452 |         if isinstance(loc, LocationExpr):
453 |             expr = loc.loc_expr
454 |         else:
455 |             base_address = die.cu.get_top_DIE().attributes["DW_AT_low_pc"].value
456 |             expr = None
457 |             for entry in loc:
458 |                 if isinstance(entry, BaseAddressEntry):
459 |                     base_address = entry.base_address
460 |                 elif isinstance(entry, LocationEntry):
461 |                     start = entry.begin_offset + base_address
462 |                     end = entry.end_offset + base_address
463 |                     if start <= (self.ip - self.region.start) <= end:
464 |                         expr = entry.loc_expr
465 |                         break
466 |                 else:
467 |                     raise NotImplementedError(
468 |                         f"Location entries of type {type(entry)} are not supported"
469 |                     )
470 |         if expr is None:
471 |             raise ValueError("Could not find LocationExpr in attr {locattr}")
472 |         parsed_exprs: List[DWARFExprOp] = self._expr_parser.parse_expr(expr)
473 |         return parsed_exprs
474 | 
475 |     def fetch_arg(self, argnum: int, ctype: Type[CT]) -> CT:
476 |         """
477 |         Fetch the argument number argnum, interpreting it as a ctype.
478 |         """
479 |         # We have all the registers set up correctly, fetch things directly.
480 |         rv: CT
481 |         if self.cfa is None:
482 |             # Fetch the argument directly from the register
483 |             argreg = unw_word_t(0)
484 |             ARGNUM_TO_REGNUM = {1: 5, 2: 4, 3: 1, 4: 2, 5: 8}
485 |             get_reg(self.cursor, ARGNUM_TO_REGNUM[argnum], ct.byref(argreg))
486 |             return ctype(argreg.value)
487 |         expr = self._get_parsed_expr_for_attribute(argnum)
488 |         dwarf_stack: List[CT] = []
489 |         for op in expr:
490 |             rv = self.eval_expr(op, ctype, dwarf_stack)
491 |         return rv
492 | 
493 |     def _read_arg_from_stack(self, offset: int, ctype: Type[CT]) -> CT:
494 |         """
495 |         Read an argument of givent type at the given offset from the stack.
496 |         """
497 |         assert 0 <= offset < len(self.stack)  # type: ignore
498 |         return ctype.from_buffer(bytearray(self.stack)[offset:])
499 | 
500 |     def eval_expr(
501 |         self, expr: DWARFExprOp, ctype: Type[CT], dwarf_stack: List[CT]
502 |     ) -> CT:
503 |         """
504 |         Eval simple expressions.
505 |         """
506 |         # It's a register
507 |         if self.die is None:
508 |             raise ValueError("No DIE could be found for frame {self}")
509 |         if expr.op_name == "DW_OP_fbreg":
510 |             # If we are an inlined subroutine, lookup the parent frame base.
511 |             die = self.die
512 |             while die.tag == "DW_TAG_inlined_subroutine":
513 |                 if self.next_frame is None:
514 |                     raise ValueError("Cannot find parent frame of inlined subroutine")
515 |                 die = self.next_frame.die
516 |             frameexpr = self.processmetadata.location_parser.parse_from_attribute(
517 |                 die.attributes["DW_AT_frame_base"],
518 |                 self.die.cu.header.version,
519 |                 self.die,
520 |             )
521 |             parsed_expr = self._expr_parser.parse_expr(frameexpr.loc_expr)
522 |             for item in parsed_expr:
523 |                 base_value = self.eval_expr(item, ct.c_int, dwarf_stack)  # type: ignore
524 |             offset = base_value.value + expr.args[0]
525 |             return self._read_arg_from_stack(offset, ctype)
526 |         if expr.op_name == "DW_OP_call_frame_cfa":
527 |             return ctype(self.cfa)
528 |         if expr.op_name == "DW_OP_entry_value":
529 |             # We evaluate the expression in the calling frame.
530 |             for op in expr.args[0]:
531 |                 if self.next_frame is None:
532 |                     raise ValueError(
533 |                         "Cannot find parent frame for evaluation of entry point"
534 |                     )
535 |                 rv = self.next_frame.eval_expr(op, ctype, dwarf_stack)
536 |             dwarf_stack.append(rv)
537 |             return ctype(0)
538 |         if expr.op_name == "DW_OP_stack_value":
539 |             return dwarf_stack[-1]
540 |         if expr.op_name.startswith("DW_OP_reg"):
541 |             regnum = expr.op - 0x50
542 |             val = unw_word_t(0)
543 |             get_reg(self.cursor, regnum, ct.byref(val))
544 |             return ctype(val.value)
545 |         raise NotImplementedError(f"Unsupported expr type: {expr.op_name}")
546 | 
547 | 
548 | class UnwindAddressSpace:
549 |     """
550 |     A virtual address space for use by libunwind.
551 |     """
552 | 
553 |     def __init__(self, capture: stack_data_t, processmetadata: ProcessMetadata):
554 |         self.capture = capture
555 |         self.registers: List[ct.c_ulonglong] = [
556 |             ct.c_ulonglong(getattr(self.capture, name)) for name in REG_NAMES
557 |         ]
558 |         self.processmetadata = processmetadata
559 |         self.accessors = unw_accesors(
560 |             find_proc_info=FIND_PROC_INFO_FUNCTYPE(self.find_proc_info),
561 |             put_unwind_info=PUT_UNWIND_INFO_FUNCTYPE(self.put_unwind_info),
562 |             get_dyn_info_list_addr=GET_DYN_INFO_LIST_ADDR_FUNCTYPE(
563 |                 self.get_dyn_info_list_addr
564 |             ),
565 |             access_mem=ACCESS_MEM_FUNCTYPE(self.access_mem),
566 |             access_reg=ACCESS_REG_FUNCTYPE(self.access_reg),
567 |             access_fpreg=ACCESS_FPREG_FUNCTYPE(self.access_reg),
568 |             get_proc_name=GET_PROC_NAME_FUNCTYPE(self.get_proc_name),
569 |         )
570 | 
571 |         # 0 takes the default byteorder
572 |         self.unw_addr_space = create_addr_space(ct.byref(self.accessors), 0)
573 |         if self.unw_addr_space == 0:
574 |             raise RuntimeError("Something bad happened in create_addr_space")
575 |         self.unw_cursor = unw_cursor_t()
576 |         retval = init_remote(
577 |             ct.byref(self.unw_cursor), self.unw_addr_space, 0
578 |         )  # Don't use the opaque pointer for now
579 |         if retval != 0:
580 |             raise RuntimeError("Something bad happened in init_remote")
581 | 
582 |     def find_proc_info(
583 |         self,
584 |         addr_space: unw_addr_space_t,
585 |         ip: int,
586 |         pip: Pointer[unw_proc_info_t],
587 |         need_unwind_info: ct.c_int,
588 |         arg: ct.c_void_p,
589 |     ) -> int:
590 |         # pylint: disable=unused-argument,too-many-arguments
591 |         """
592 |         Implementation of libunwind find_proc_info callback.
593 |         """
594 |         # Find the top of the elfile.
595 |         mmap = self.processmetadata.map_for_addr(ip)
596 | 
597 |         if mmap is None or mmap.eh_frame_hdr is None:
598 |             return -UNW_ESTOPUNWIND
599 |         pip[0] = unw_proc_info_t()
600 |         dynamic_info = unw_dyn_info_t(
601 |             start_ip=mmap.start,
602 |             end_ip=mmap.end,
603 |             format=UNW_INFO_FORMAT_REMOTE_TABLE,
604 |         )
605 |         dynamic_info.rti.name_ptr = 0
606 |         # We only consider one specific binary. The virtual address space will
607 |         # then consist of the actual stack and we will consider that the
608 |         # eh_frame_hdr and everything else is located after that.
609 |         dynamic_info.rti.segbase = mmap.start + mmap.eh_frame_hdr.offset
610 |         dynamic_info.rti.table_data = (
611 |             mmap.start + mmap.eh_frame_hdr.table_start + mmap.eh_frame_hdr.offset
612 |         )
613 |         dynamic_info.rti.table_len = (mmap.eh_frame_hdr.fde_count * 8) // ct.sizeof(
614 |             unw_word_t
615 |         )
616 |         ret: int = dwarf_search_unwind_table(
617 |             addr_space, ip, ct.byref(dynamic_info), pip, need_unwind_info, None
618 |         )
619 |         return ret
620 | 
621 |     def put_unwind_info(
622 |         self,
623 |         addr_space: unw_addr_space_t,
624 |         pip: Pointer[unw_proc_info_t],
625 |         arg: ct.c_void_p,
626 |     ) -> None:
627 |         """
628 |         Implementation of libunwind put_unwind_info callback.
629 |         """
630 |         # pylint: disable=unused-argument
631 |         return
632 | 
633 |     def get_dyn_info_list_addr(
634 |         self,
635 |         addr_space: unw_addr_space_t,
636 |         dilap: Pointer[unw_word_t],
637 |         arg: ct.c_void_p,
638 |     ) -> int:
639 |         """
640 |         Implementation of libunwind get_dyn_info_list_addr callback.
641 |         """
642 |         # pylint: disable=unused-argument
643 |         return -UNW_ENOINFO
644 | 
645 |     def access_mem(
646 |         self,
647 |         addr_space: unw_addr_space_t,
648 |         addr: int,
649 |         valp: Pointer[unw_word_t],
650 |         write: int,
651 |         arg: ct.c_void_p,
652 |     ) -> int:
653 |         """
654 |         Implementation of libunwind access_mem callback.
655 |         """
656 |         # pylint: disable=unused-argument,too-many-arguments
657 |         # We only support either file-mapped addresses, or addresses
658 |         # refering to the stack.
659 |         region = self.processmetadata.map_for_addr(addr)
660 |         if region is None:
661 |             return -UNW_EINVAL
662 |         if region.path == "[stack]":
663 |             stack_idx = addr - self.capture.start_addr
664 |             if stack_idx >= self.capture.size:
665 |                 return -UNW_EINVAL
666 |             if write == 0:
667 |                 valp[0] = unw_word_t.from_buffer(
668 |                     bytearray(self.capture.stack[stack_idx : stack_idx + 8])
669 |                 )
670 |             else:
671 |                 self.capture.stack[stack_idx] = valp.contents
672 |             return 0
673 | 
674 |         # It's from the ELFFile itself.
675 |         if region.real_path:
676 |             if write == 0:
677 |                 with region.real_path.open("rb") as f:
678 |                     f.seek(addr - region.start)
679 |                     valp[0] = unw_word_t.from_buffer(
680 |                         bytearray(f.read(ct.sizeof(unw_word_t)))
681 |                     )
682 |                     return 0
683 |             return -UNW_EINVAL
684 | 
685 |         # It's from anywhere else: return EINVAL
686 |         return -UNW_EINVAL
687 | 
688 |     def access_reg(
689 |         self,
690 |         addr_space: unw_addr_space_t,
691 |         regnum: int,
692 |         valp: Pointer[unw_word_t],
693 |         write: int,
694 |         arg: ct.c_void_p,
695 |     ) -> int:
696 |         """
697 |         Implementation of libunwind access_reg callback.
698 |         """
699 |         # pylint: disable=unused-argument,too-many-arguments
700 |         if write == 0:
701 |             valp[0] = unw_word_t(self.registers[regnum].value)
702 |         else:
703 |             self.registers[regnum] = valp.contents
704 |         return 0
705 | 
706 |     def access_fpreg(
707 |         self,
708 |         addr_space: unw_addr_space_t,
709 |         regnum: unw_regnum_t,
710 |         fpvalp: Pointer[unw_fpreg_t],
711 |         write: ct.c_int,
712 |         arg: ct.c_void_p,
713 |     ) -> int:
714 |         """
715 |         Implementation of libunwind access_fpreg callback.
716 |         """
717 |         # pylint: disable=unused-argument,too-many-arguments
718 |         return -UNW_EINVAL
719 | 
720 |     def get_proc_name(
721 |         self,
722 |         addr_space: unw_addr_space_t,
723 |         addr: unw_word_t,
724 |         bufp: ct.c_char_p,
725 |         buf_len: ct.c_size_t,
726 |         offp: Pointer[unw_word_t],
727 |         arg: ct.c_void_p,
728 |     ) -> int:
729 |         """
730 |         Implementation of libunwind get_proc_name callback.
731 |         """
732 |         # pylint: disable=unused-argument,too-many-arguments
733 |         return -UNW_EINVAL
734 | 
735 |     def ip(self) -> int:
736 |         """
737 |         Return the instruction pointer from the unwind cursor.
738 |         """
739 |         ip = unw_word_t(0)
740 |         get_reg(self.unw_cursor, UNW_REG_IP, ct.byref(ip))
741 |         return ip.value
742 | 
743 |     def dies_for_ip(self) -> Tuple[DIE, ...]:
744 |         """
745 |         Return a tuple of DIEs for a given ip.
746 |         """
747 |         ip = self.ip()
748 |         region = self.processmetadata.map_for_addr(ip)
749 |         if region is None:
750 |             return (None,)
751 |         if region.path == str(self.processmetadata.program_raw):
752 |             dies = self.processmetadata.get_die_and_inlined_subdies_for_addr(
753 |                 ip - region.start
754 |             )
755 |             if dies is not None:
756 |                 return dies
757 |         return (None,)
758 | 
759 |     def frames(self) -> Generator[Frame, None, None]:
760 |         """
761 |         Returns the list of frames for this stack.
762 |         """
763 |         cur = ct.byref(self.unw_cursor)
764 |         prev_frame = None
765 |         while True:
766 |             # Extract the IP
767 |             ip = self.ip()
768 |             for die in self.dies_for_ip():
769 |                 # The cursor is copied by the frame, no need to
770 |                 # worry about it
771 |                 cur_frame = Frame(
772 |                     self.capture.stack,
773 |                     ip,
774 |                     die,
775 |                     self.capture.start_addr,
776 |                     self.processmetadata,
777 |                     self.unw_cursor,
778 |                     prev_frame=prev_frame,
779 |                 )
780 |                 if prev_frame is not None:
781 |                     prev_frame.next_frame = cur_frame
782 |                     yield prev_frame
783 |                 prev_frame = cur_frame
784 |             if step(cur) <= 0:
785 |                 break
786 |         if prev_frame is not None:
787 |             yield prev_frame
788 | 


--------------------------------------------------------------------------------
/src/pgtracer/model/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Models definitions for execution concepts we extract information about.
 3 | """
 4 | from .memory import MemoryAllocations, MemoryAllocType, memory_account
 5 | from .plan import PlanState
 6 | from .query import Query
 7 | 
 8 | __all__ = [
 9 |     "Query",
10 |     "PlanState",
11 |     "memory_account",
12 |     "MemoryAllocations",
13 |     "MemoryAllocType",
14 | ]
15 | 


--------------------------------------------------------------------------------
/src/pgtracer/model/memory.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Classes storing information about memory allocations.
 3 | """
 4 | 
 5 | import ctypes as ct
 6 | from dataclasses import dataclass
 7 | from enum import IntEnum
 8 | 
 9 | 
10 | # pylint: disable=invalid-name
11 | class MemoryAllocType(IntEnum):
12 |     """
13 |     MemoryAllocation types.
14 |     """
15 | 
16 |     Sbrk = 1
17 |     Mmap = 2
18 | 
19 | 
20 | class memory_account(ct.Structure):
21 |     """
22 |     Represents the data associated to a memory allocation or deallocation.
23 |     """
24 | 
25 |     _fields_ = [
26 |         ("event_type", ct.c_short),
27 |         ("size", ct.c_longlong),
28 |         ("kind", ct.c_short),
29 |     ]
30 | 
31 | 
32 | @dataclass
33 | class MemoryAllocations:
34 |     """
35 |     Memory allocation counters.
36 |     """
37 | 
38 |     mmap_alloc: int = 0
39 |     mmap_free: int = 0
40 |     sbrk_alloc: int = 0
41 |     sbrk_free: int = 0
42 | 
43 |     current_running_mmap: int = 0
44 |     current_running_sbrk: int = 0
45 | 
46 |     current_mem_peak: int = 0
47 | 
48 |     @property
49 |     def mmap_total(self) -> int:
50 |         """
51 |         Compute the resulting mmaped total.
52 |         """
53 |         return self.mmap_alloc - self.mmap_free
54 | 
55 |     @property
56 |     def sbrk_total(self) -> int:
57 |         """
58 |         Compute the resulting sbrk total.
59 |         """
60 |         return self.sbrk_alloc - self.sbrk_free
61 | 
62 |     @property
63 |     def total_malloc(self) -> int:
64 |         """
65 |         Compute the total memory diff.
66 |         """
67 |         return self.mmap_total + self.sbrk_total
68 | 
69 |     def update(self, memory_account_event: memory_account) -> None:
70 |         """
71 |         Update the current totals.
72 |         """
73 |         if memory_account_event.kind == MemoryAllocType.Sbrk:
74 |             self.current_running_sbrk += memory_account_event.size
75 |             if memory_account_event.size > 0:
76 |                 self.sbrk_alloc += memory_account_event.size
77 |             else:
78 |                 self.sbrk_free += -memory_account_event.size
79 |         elif memory_account_event.kind == MemoryAllocType.Mmap:
80 |             self.current_running_mmap += memory_account_event.size
81 |             if memory_account_event.size > 0:
82 |                 self.mmap_alloc += memory_account_event.size
83 |             else:
84 |                 self.mmap_free += -memory_account_event.size
85 |         self.current_mem_peak = max(
86 |             self.current_mem_peak, self.current_running_sbrk + self.current_running_mmap
87 |         )
88 | 


--------------------------------------------------------------------------------
/src/pgtracer/model/plan.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This module contains definitions for representing PostgreSQL plans.
  3 | """
  4 | from __future__ import annotations
  5 | 
  6 | import ctypes as ct
  7 | from typing import TYPE_CHECKING, Dict, Optional
  8 | 
  9 | from ..ebpf.collector.c_defs import plan_data, planstate_data
 10 | from ..ebpf.dwarf import ProcessMetadata, Struct
 11 | from ..utils import timespec_to_float
 12 | 
 13 | if TYPE_CHECKING:
 14 |     from enum import IntEnum
 15 | 
 16 | 
 17 | def explain_dict_to_str(parts: Dict[str, str]) -> str:
 18 |     """
 19 |     Format a dict in the commonly used key=value format.
 20 |     """
 21 |     return " ".join(f"{key}={value}" for key, value in parts.items())
 22 | 
 23 | 
 24 | class PlanState:
 25 |     """
 26 |     Information collected from a PostgreSQL PlanState Node.
 27 |     """
 28 | 
 29 |     def __init__(self, addr: Optional[int]):
 30 |         self.addr = addr
 31 |         self.tag: Optional[IntEnum] = None
 32 |         self.instrument: Optional[Struct] = None
 33 |         self.parent_node: Optional[PlanState] = None
 34 |         self.plan_data: Optional[plan_data] = None
 35 |         self.is_stub = True
 36 |         # We're using a Dict as poor man's OrderedSet
 37 |         self.children: Dict[PlanState, None] = {}
 38 | 
 39 |     def update(self, metadata: ProcessMetadata, event: planstate_data) -> None:
 40 |         """
 41 |         Update a Planstate from an event planstate_data.
 42 |         """
 43 |         instrument_addr = ct.addressof(event.instrument)
 44 |         tag = metadata.enums.NodeTag(event.plan_data.plan_tag)  # type: ignore
 45 |         self.tag = tag
 46 |         self.instrument = metadata.structs.Instrumentation(instrument_addr)
 47 |         self.plan_data = plan_data()
 48 |         ct.pointer(self.plan_data)[0] = event.plan_data
 49 | 
 50 |     @property
 51 |     def title(self) -> str:
 52 |         """
 53 |         Return the node's title.
 54 |         """
 55 |         if self.tag is None:
 56 |             return "???"
 57 |         prefix = ""
 58 |         if self.plan_data and self.plan_data.parallel_aware:
 59 |             prefix = "Parallel "
 60 |         buf = f"{prefix}{str(self.tag.name[2:])}"
 61 |         # TODO: add additional information here
 62 |         return buf
 63 | 
 64 |     @property
 65 |     def cost(self) -> str:
 66 |         """
 67 |         Returns the "cost" section formatted similarly to PostgreSQL explain
 68 |         """
 69 |         if self.plan_data is None:
 70 |             parts = {"cost": "?..?", "rows": "?", "width": "?"}
 71 |         else:
 72 |             parts = {
 73 |                 "cost": f"{self.plan_data.startup_cost:.2f}..{self.plan_data.total_cost:.2f}",
 74 |                 "rows": f"{int(self.plan_data.plan_rows)}",
 75 |                 "width": f"{int(self.plan_data.plan_width)}",
 76 |             }
 77 |         return f"({explain_dict_to_str(parts)})"
 78 | 
 79 |     @property
 80 |     def actual(self) -> str:
 81 |         """
 82 |         Returns the "actual" section formatted similarly to PostgreSQL explain.
 83 |         """
 84 |         if self.instrument is None:
 85 |             parts = {"time": "?..?", "rows": "?", "loops": "?"}
 86 |         else:
 87 |             total = timespec_to_float(self.instrument.counter)
 88 |             parts = {
 89 |                 "time": f"{(self.instrument.firsttuple.value * 1000):0.3f}...{(total * 1000):0.3f}",
 90 |                 "rows": f"{int(self.instrument.tuplecount.value)}",
 91 |                 "loops": f"{int(self.instrument.nloops.value)}",
 92 |             }
 93 |         return f"(actual {explain_dict_to_str(parts)})"
 94 | 
 95 |     @property
 96 |     def buffers(self) -> str:
 97 |         """
 98 |         Returns the "buffers" section formatted similarly to PostgreSQL
 99 |         explain.
100 |         """
101 |         if self.instrument is None:
102 |             return ""
103 |         bufusage_dict = self.instrument.bufusage.as_dict(include_all=True)
104 |         parts = {}
105 |         for key, value in bufusage_dict.items():
106 |             if isinstance(value, (ct.c_long,)) and value.value != 0:
107 |                 parts[key] = str(value.value)
108 |         if not parts:
109 |             return ""
110 |         return f"Buffers: {explain_dict_to_str(parts)}"
111 | 
112 |     def explain(self, indent_level: int = 0) -> str:
113 |         """
114 |         Format the plan represented by this node similarly to PostgreSQL
115 |         explain.
116 |         """
117 |         if indent_level == 0:
118 |             prefix = ""
119 |         else:
120 |             prefix = "\t" * indent_level + "-> "
121 |         buf = f"{prefix}{self.title} {self.cost} {self.actual}"
122 |         buffer_line = self.buffers
123 |         if buffer_line:
124 |             buf += "\n" + "\t" * (indent_level + 1) + buffer_line
125 |         for child in self.children:
126 |             buf += "\n"
127 |             buf += child.explain(indent_level + 1)
128 |         return buf
129 | 


--------------------------------------------------------------------------------
/src/pgtracer/model/query.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This module contains definitions for representing PostgreSQL queries.
  3 | """
  4 | from __future__ import annotations
  5 | 
  6 | import ctypes as ct
  7 | from collections import defaultdict
  8 | from datetime import datetime, timedelta
  9 | from typing import TYPE_CHECKING, Any, Dict, Optional
 10 | 
 11 | from ..ebpf.unwind import UnwindAddressSpace, stack_data_t
 12 | from ..utils import timespec_to_timedelta
 13 | from .memory import MemoryAllocations
 14 | from .plan import PlanState
 15 | 
 16 | if TYPE_CHECKING:
 17 |     from ..ebpf.collector import planstate_data, portal_data
 18 |     from ..ebpf.dwarf import ProcessMetadata
 19 | 
 20 | 
 21 | FUNCTION_ARGS_MAPPING = {
 22 |     "ExecProcNodeFirst": 1,
 23 |     "ExecProcNodeInstr": 1,
 24 |     "ExecProcNode": 1,
 25 |     "ExecAgg": 1,
 26 |     "ExecAppend": 1,
 27 |     "ExecBitmapAnd": 1,
 28 |     "ExecBitmapHeapScan": 1,
 29 |     "ExecBitmapIndexScan": 1,
 30 |     "ExecBitmapOr": 1,
 31 |     "ExecCteScan": 1,
 32 |     "ExecCustomScan": 1,
 33 |     "ExecForeignScan": 1,
 34 |     "ExecFunctionScan": 1,
 35 |     "ExecGather": 1,
 36 |     "ExecGatherMerge": 1,
 37 |     "ExecGroup": 1,
 38 |     "ExecHash": 1,
 39 |     "ExecHashJoin": 1,
 40 |     "ExecIncrementalSort": 1,
 41 |     "ExecIndexOnlyScan": 1,
 42 |     "ExecIndexScan": 1,
 43 |     "ExecLimit": 1,
 44 |     "ExecLockRows": 1,
 45 |     "ExecMaterial": 1,
 46 |     "ExecMemoize": 1,
 47 |     "ExecMergeAppend": 1,
 48 |     "ExecMergeJoin": 1,
 49 |     "ExecModifyTable": 1,
 50 |     "ExecNamedTuplestoreScan": 1,
 51 |     "ExecNestLoop": 1,
 52 |     "ExecProjectSet": 1,
 53 |     "ExecRecursiveUnion": 1,
 54 |     "ExecResult": 1,
 55 |     "ExecSampleScan": 1,
 56 |     "ExecSeqScan": 1,
 57 |     "ExecSetOp": 1,
 58 |     "ExecSort": 1,
 59 |     "ExecSubqueryScan": 1,
 60 |     "ExecTableFuncScan": 1,
 61 |     "ExecTidRangeScan": 1,
 62 |     "ExecTidScan": 1,
 63 |     "ExecUnique": 1,
 64 |     "ExecValuesScan": 1,
 65 |     "ExecWindowAgg": 1,
 66 |     "ExecWorkTableScan": 1,
 67 |     "MultiExecHash": 1,
 68 |     "MultiExecBitmapIndexScan": 1,
 69 |     "MultiExecBitmapAnd": 1,
 70 |     "MultiExecBitmapOr": 1,
 71 | }
 72 | 
 73 | 
 74 | class Query:
 75 |     """
 76 |     A PostgreSQL Query.
 77 |     """
 78 | 
 79 |     def __init__(
 80 |         self,
 81 |         *,
 82 |         addr: int,
 83 |         query_id: int,
 84 |         startup_cost: float,
 85 |         total_cost: float,
 86 |         plan_rows: float,
 87 |         startts: Optional[float] = None,
 88 |         text: Optional[str] = None,
 89 |         # Instrumentation is a dynamically generated class, no way to check it
 90 |         instrument: Any = None,
 91 |         search_path: Optional[str] = None,
 92 |     ):
 93 |         self.addr = addr
 94 |         self.query_id = query_id
 95 |         self.startup_cost = startup_cost
 96 |         self.total_cost = total_cost
 97 |         self.plan_rows = plan_rows
 98 |         self.startts = startts
 99 |         self.text = text
100 |         self.instrument = instrument
101 |         self.search_path = search_path
102 |         self.nodes: Dict[int, PlanState] = {}
103 |         self.io_counters: Dict[str, int] = defaultdict(lambda: 0)
104 |         self.memallocs: MemoryAllocations = MemoryAllocations()
105 | 
106 |     @property
107 |     def root_node(self) -> PlanState:
108 |         """
109 |         Returns the plan's root node.
110 |         """
111 |         root_candidates = [
112 |             node for node in self.nodes.values() if node.parent_node is None
113 |         ]
114 |         if len(root_candidates) == 0:
115 |             raise ValueError("Invalid plan, we have no root node when we expect 1")
116 |         if len(root_candidates) > 1:
117 |             # In that case, we need to build a "fake" parent node.
118 |             root_node = PlanState(None)
119 |             root_node.children = {c: None for c in root_candidates}
120 |         else:
121 |             root_node = root_candidates[0]
122 |         return root_node
123 | 
124 |     @classmethod
125 |     def from_event(cls, metadata: ProcessMetadata, event: portal_data) -> Query:
126 |         """
127 |         Build a query from portal_data event generated by eBPF.
128 |         """
129 |         instrument_addr = ct.addressof(event.instrument)
130 |         instrument = metadata.structs.Instrumentation(instrument_addr)
131 |         search_path = None
132 |         if event.search_path:
133 |             search_path = event.search_path.decode("utf8")
134 |         _, creation_time = event.portal_key.as_tuple()
135 |         return cls(
136 |             addr=event.query_addr,
137 |             query_id=event.query_id,
138 |             startup_cost=event.startup_cost,
139 |             total_cost=event.total_cost,
140 |             plan_rows=event.plan_rows,
141 |             startts=creation_time,
142 |             text=event.query.decode("utf8"),
143 |             instrument=instrument,
144 |             search_path=search_path,
145 |         )
146 | 
147 |     def update(self, metadata: ProcessMetadata, event: portal_data) -> None:
148 |         """
149 |         Update the query from an eBPF portal_data event.
150 |         """
151 |         instrument_addr = ct.addressof(event.instrument)
152 |         instrument = metadata.structs.Instrumentation(instrument_addr)
153 |         if instrument.running:
154 |             self.instrument = instrument
155 |         _, creation_time = event.portal_key.as_tuple()
156 |         self.startts = creation_time or self.startts
157 |         self.text = event.query.decode("utf-8") or self.text
158 |         search_path = event.search_path.decode("utf8")
159 |         self.search_path = search_path or self.search_path
160 | 
161 |     @property
162 |     def start_datetime(self) -> Optional[datetime]:
163 |         """
164 |         Returns the creation timestamp of the portal associated to this query.
165 |         """
166 |         if self.startts is None:
167 |             return None
168 |         return datetime.fromtimestamp(self.startts / 1000000)
169 | 
170 |     @property
171 |     def runtime(self) -> Optional[timedelta]:
172 |         """
173 |         Returns the query's top-node total runtime.
174 |         """
175 |         if self.instrument and self.instrument.need_timer.value:
176 |             return timespec_to_timedelta(self.instrument.counter)
177 |         return None
178 | 
179 |     @property
180 |     def shared_buffers_hitratio(self) -> Optional[float]:
181 |         """
182 |         Returns the hit ratio from the shared buffers.
183 |         """
184 |         if self.instrument is None:
185 |             return None
186 |         bufusage = self.instrument.bufusage
187 |         total_blks = bufusage.shared_blks_hit.value + bufusage.shared_blks_read.value
188 |         # If we didn't read any block, hit ratio is None
189 |         if total_blks == 0:
190 |             return None
191 |         return float(bufusage.shared_blks_hit.value / total_blks * 100)
192 | 
193 |     @property
194 |     def syscache_hitratio(self) -> Optional[float]:
195 |         """
196 |         Returns the system's hit ratio.
197 |         """
198 |         if self.instrument is None:
199 |             return None
200 |         bufusage = self.instrument.bufusage
201 |         # FIXME: don't assume a fixed block size, either pass it as an option
202 |         # or query the actual value from the DB
203 |         blksize = 8192
204 |         total_blks = (
205 |             bufusage.shared_blks_read.value
206 |             + bufusage.local_blks_read.value
207 |             + bufusage.temp_blks_read.value
208 |         )
209 |         total_bytes = total_blks * blksize
210 |         if total_bytes == 0:
211 |             return None
212 |         bytes_hit = total_bytes - self.io_counters["R"]
213 |         return float(bytes_hit / total_bytes * 100)
214 | 
215 |     def add_nodes_from_stack(
216 |         self,
217 |         metadata: ProcessMetadata,
218 |         stack: stack_data_t,
219 |         start_at: int = 0,
220 |         base_node: Optional[PlanState] = None,
221 |     ) -> None:
222 |         """
223 |         Process a capture stack to add node stubs to this query.
224 |         """
225 |         addr_space = UnwindAddressSpace(stack, metadata)
226 |         nodes = self.nodes
227 |         cur_node = base_node
228 |         for idx, frame in enumerate(addr_space.frames()):
229 |             if idx < start_at:
230 |                 continue
231 |             if frame.function_name in FUNCTION_ARGS_MAPPING:
232 |                 argnum = FUNCTION_ARGS_MAPPING[frame.function_name]
233 |                 parent_addr = frame.fetch_arg(argnum, ct.c_ulonglong).value
234 |                 if cur_node and parent_addr == cur_node.addr:
235 |                     continue
236 |                 parent_node = nodes.get(parent_addr)
237 |                 if parent_node is None:
238 |                     parent_node = PlanState(parent_addr)
239 |                     nodes[parent_addr] = parent_node
240 |                 if cur_node:
241 |                     cur_node.parent_node = parent_node
242 |                     parent_node.children[cur_node] = None
243 |                 # The parent_node is already not a stub, meaning its ancestors
244 |                 # have been resolved. Stop walking the frame here
245 |                 if not parent_node.is_stub:
246 |                     break
247 |                 cur_node = parent_node
248 | 
249 |     def add_node_from_event(
250 |         self, metadata: ProcessMetadata, event: planstate_data
251 |     ) -> PlanState:
252 |         """
253 |         Add a node from planstate_data event to this query plantree.
254 |         We walk the stack up to understand where the nodes are located relative
255 |         to each other.
256 |         """
257 |         nodes = self.nodes
258 |         addr = event.planstate_addr
259 |         planstate = nodes.get(addr)
260 |         if planstate is None:
261 |             planstate = PlanState(addr)
262 |             nodes[addr] = planstate
263 |         planstate.update(metadata, event)
264 |         if not planstate.is_stub:
265 |             return planstate
266 |         self.add_nodes_from_stack(
267 |             metadata, event.stack_capture, start_at=1, base_node=planstate
268 |         )
269 |         planstate.is_stub = False
270 |         return planstate
271 | 


--------------------------------------------------------------------------------
/src/pgtracer/scripts/pgtrace_gucs.py:
--------------------------------------------------------------------------------
 1 | """
 2 | This simple script reads and writes GUCs in a running PostgreSQL backend
 3 | """
 4 | import argparse
 5 | 
 6 | from pgtracer.ebpf.collector.guc import GUCTracerBPFCollector, GUCTracerOptions
 7 | 
 8 | 
 9 | def main() -> None:
10 |     """
11 |     Entry point for the pgtrace_gucs script.
12 |     """
13 |     parser = argparse.ArgumentParser(
14 |         description="Run and / or write GUCs from a running PostgreSQL backend."
15 |     )
16 |     parser.add_argument("pid", type=int, help="PID to connect to")
17 | 
18 |     parser.add_argument(
19 |         "--set-guc",
20 |         metavar="GUC=VALUE",
21 |         dest="set_gucs",
22 |         nargs="+",
23 |         default=[],
24 |         help="Set a number of GUCs in the running backend",
25 |     )
26 | 
27 |     args = parser.parse_args()
28 |     pid = args.pid
29 | 
30 |     # Parse the set-guc option.
31 |     set_gucs = {}
32 |     for keyvalue in args.set_gucs:
33 |         key, value = keyvalue.split("=")
34 |         set_gucs[key] = value
35 |     options = GUCTracerOptions()
36 | 
37 |     collector = GUCTracerBPFCollector.from_pid(pid, options)
38 |     collector.start()
39 |     print(f"Backend is of type {str(collector.backend_type)}")
40 |     seen = set()
41 |     for gucname, gucvalue in set_gucs.items():
42 |         collector.set_guc(gucname, gucvalue)
43 |     while collector.is_running:
44 |         with collector.lock:
45 |             for guc in collector.guc_defs.values():
46 |                 if guc.guc_name is not None:
47 |                     seen.add(guc.guc_name)
48 |     collector.stop()
49 | 
50 | 
51 | if __name__ == "__main__":
52 |     main()
53 | 


--------------------------------------------------------------------------------
/src/pgtracer/scripts/pgtrace_queries.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This simple script trace queries executed by a Postgres backend.
  3 | """
  4 | 
  5 | import argparse
  6 | import sys
  7 | import time
  8 | from collections import defaultdict
  9 | from datetime import timedelta
 10 | from typing import Any, Dict, Optional
 11 | 
 12 | from pgtracer.ebpf.collector.querytracer import (
 13 |     InstrumentationFlags,
 14 |     QueryTracerBPFCollector,
 15 |     QueryTracerOptions,
 16 | )
 17 | from pgtracer.ebpf.dwarf import Struct
 18 | from pgtracer.model.query import Query
 19 | from pgtracer.utils import timespec_to_timedelta
 20 | 
 21 | 
 22 | def dump_dict(somedict: Dict[str, Any], indent: int = 0) -> str:
 23 |     """
 24 |     Dump a dictionary as an indented string of key / value pairs.
 25 |     """
 26 |     parts = []
 27 |     tabs = "\t" * indent
 28 |     for key, value in somedict.items():
 29 |         if isinstance(value, Struct):
 30 |             # Special case for timespec
 31 |             if value.__class__.__name__ == "timespec":
 32 |                 try:
 33 |                     value = timespec_to_timedelta(value)
 34 |                 except OverflowError:
 35 |                     # Ignore overflowing timespecs
 36 |                     continue
 37 |             else:
 38 |                 value = value.as_dict(include_all=True)
 39 |         if isinstance(value, dict):
 40 |             part = "\n" + dump_dict(value, indent + 1)
 41 |         else:
 42 |             if hasattr(value, "value"):
 43 |                 part = value.value
 44 |             else:
 45 |                 part = value
 46 |         parts.append(f"{tabs}{key}: {part}")
 47 |     return "\n".join(parts)
 48 | 
 49 | 
 50 | def print_query(query: Query, options: QueryTracerOptions) -> None:
 51 |     """
 52 |     Print a query according to which collector options have been set.
 53 |     """
 54 |     parts = []
 55 |     start = "<unknown>"
 56 |     if query.start_datetime is not None:
 57 |         start = query.start_datetime.isoformat()
 58 |     parts.append(f"{start} {query.text}")
 59 |     mapping = {}
 60 |     mapping["search_path"] = query.search_path
 61 |     mapping["query_id"] = str(query.query_id) or "<unavailable>"
 62 |     mapping["startup_cost"] = str(query.startup_cost)
 63 |     mapping["total_cost"] = str(query.total_cost)
 64 |     mapping["plan_rows"] = str(query.plan_rows)
 65 |     mapping["peak_mem_alloc"] = str(query.memallocs.current_mem_peak)
 66 |     if query.instrument.need_timer:
 67 |         mapping["runtime"] = str(query.runtime)
 68 |     if options.instrument_flags & InstrumentationFlags.BUFFERS:
 69 |         mapping["written_bytes_to_disk"] = str(query.io_counters["W"])
 70 |         if query.shared_buffers_hitratio is not None:
 71 |             mapping["shared_buffers_hitratio"] = f"{query.shared_buffers_hitratio:0.2f}"
 72 |         else:
 73 |             mapping["shared_buffers_hitratio"] = None
 74 |         if query.syscache_hitratio is not None:
 75 |             mapping["syscache_hitratio"] = f"{query.syscache_hitratio:0.2f}"
 76 |         else:
 77 |             mapping["syscache_hitratio"] = None
 78 |         if query.instrument:
 79 |             mapping["buffer_usage"] = query.instrument.bufusage
 80 |     if options.instrument_flags & InstrumentationFlags.WAL and query.instrument:
 81 |         mapping["wal_usage"] = query.instrument.walusage
 82 |     print(query.text)
 83 |     print(dump_dict(mapping, 1))
 84 |     if options.enable_nodes_collection:
 85 |         print(query.root_node.explain())
 86 | 
 87 | 
 88 | LINE_UP = "\033[1A"
 89 | LINE_CLEAR = "\x1b[2K"
 90 | 
 91 | 
 92 | def print_running_query(
 93 |     query: Query, print_plan: bool, first_time: bool, clear_line: int = 0
 94 | ) -> int:
 95 |     """
 96 |     Print the currently running query.
 97 |     """
 98 |     nb_lines = 0
 99 |     if first_time:
100 |         print("Currently running:")
101 |         print(query.text)
102 |         if not print_plan:
103 |             print("Tuples produced / tuple expected")
104 |             print("")
105 |     for _ in range(clear_line):
106 |         print(LINE_UP, end=LINE_CLEAR)
107 |     if print_plan and query.root_node:
108 |         plan = query.root_node.explain()
109 |         nb_lines = len(plan.split("\n"))
110 |         print(plan)
111 |     else:
112 |         print(f"{int(query.instrument.tuplecount.value)} / {int(query.plan_rows)}")
113 |     return nb_lines
114 | 
115 | 
116 | def main() -> None:
117 |     """
118 |     Entry point for the pgtrace_queries script.
119 |     """
120 |     parser = argparse.ArgumentParser(
121 |         description="Dump a running backend execution plan"
122 |     )
123 |     parser.add_argument("pid", type=int, help="PID to connect to")
124 |     parser.add_argument(
125 |         "--instrument",
126 |         "-I",
127 |         type=str,
128 |         default=None,
129 |         nargs="*",
130 |         choices=[flag.name for flag in InstrumentationFlags],
131 |         action="extend",
132 |         help="""Instrument flags to set. (warning: writes into backends
133 |         memory!)""",
134 |     )
135 |     parser.add_argument(
136 |         "--nodes-collection",
137 |         "-n",
138 |         default=False,
139 |         action="store_true",
140 |         help="""Collect information about individual execution nodes""",
141 |     )
142 | 
143 |     args = parser.parse_args()
144 |     pid = args.pid
145 |     instrument_flags = 0
146 |     if args.instrument:
147 |         for flag in args.instrument:
148 |             instrument_flags |= InstrumentationFlags[flag]
149 |     options = QueryTracerOptions(
150 |         instrument_flags=instrument_flags,
151 |         enable_nodes_collection=args.nodes_collection,
152 |         enable_perf_events=instrument_flags != 0,
153 |     )
154 |     collector = QueryTracerBPFCollector.from_pid(pid, options)
155 |     collector.start()
156 |     total_queries = 0
157 |     last_running_query: Dict[int, Optional[Query]] = defaultdict(lambda: None)
158 |     lines_to_clear = 0
159 |     while collector.is_running:
160 |         try:
161 |             time.sleep(1)
162 |             for (
163 |                 pid,
164 |                 process_info,
165 |             ) in collector.event_handler.per_process_info.copy().items():
166 |                 if not process_info.query_history and process_info.current_query:
167 |                     first_time = (
168 |                         last_running_query[pid] is not process_info.current_query
169 |                     )
170 |                     if first_time:
171 |                         lines_to_clear = 0
172 |                     lines_to_clear = print_running_query(
173 |                         process_info.current_query,
174 |                         options.enable_nodes_collection,
175 |                         first_time,
176 |                         lines_to_clear,
177 |                     )
178 |                     last_running_query[pid] = process_info.current_query
179 |                     continue
180 |                 last_running_query[pid] = None
181 |                 for query in process_info.query_history:
182 |                     print_query(query, options)
183 |                 total_queries += len(process_info.query_history)
184 |                 process_info.query_history = []
185 |         except KeyboardInterrupt:
186 |             break
187 |     collector.stop()
188 |     total_processes = len(collector.event_handler.process_history) + len(
189 |         collector.event_handler.per_process_info
190 |     )
191 |     print(f"Processed {total_queries} queries among {total_processes} processes")
192 | 
193 | 
194 | if __name__ == "__main__":
195 |     main()
196 | 


--------------------------------------------------------------------------------
/src/pgtracer/utils.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Miscellaneous utility functions.
 3 | """
 4 | 
 5 | import functools
 6 | import itertools
 7 | import re
 8 | import subprocess
 9 | from datetime import timedelta
10 | from typing import TYPE_CHECKING, BinaryIO, Optional, Union
11 | 
12 | from pypsutil import Process
13 | 
14 | from pgtracer.ebpf.dwarf import Struct
15 | 
16 | if TYPE_CHECKING:
17 |     from ctypes import _CData
18 | else:
19 |     _CData = object
20 | 
21 | 
22 | def timespec_to_timedelta(timespec: Union[_CData, Struct]) -> timedelta:
23 |     """
24 |     Convert a timespec_t or instr_time struct to a timedelta.
25 |     """
26 |     # Can't really compare it to a proper class, so test on the class name
27 |     if timespec.__class__.__name__ == "timespec":
28 |         return timedelta(
29 |             seconds=timespec.tv_sec.value,  # type: ignore
30 |             microseconds=timespec.tv_nsec.value / 1000,  # type: ignore
31 |         )
32 |     if timespec.__class__.__name__ == "instr_time":
33 |         return timedelta(seconds=timespec.ticks.value / 1000000000)  # type: ignore
34 |     raise ValueError("Expecting a timespec or instr_time struct")
35 | 
36 | 
37 | def timespec_to_float(timespec: _CData) -> float:
38 |     """
39 |     Convert a timespec_t or instr_time struct to a float representing the number of seconds.
40 |     """
41 |     if timespec.__class__.__name__ == "timespec":
42 |         return float(timespec.tv_sec.value + timespec.tv_nsec.value / 1000000000)  # type: ignore
43 |     if timespec.__class__.__name__ == "instr_time":
44 |         return float(timespec.ticks.value / 1000000000)  # type: ignore
45 |     raise ValueError("Expecting a timespec or instr_time struct")
46 | 
47 | 
48 | NSPID_PARSING_RE = re.compile(rb"^NSpid:\s+((?:(?:\d+)\s*)+)")
49 | 
50 | 
51 | def resolve_container_pid(container: str, container_pid: int) -> Optional[int]:
52 |     """
53 |     Resolve container_pid from the systemd-nspawn container `container`
54 |     to a host pid.
55 |     """
56 |     # FIXME: this probably does not handle nested namespaces.
57 |     completed_process = subprocess.run(
58 |         ["machinectl", "show", container, "-p", "Leader"],
59 |         capture_output=True,
60 |         check=True,
61 |     )
62 |     container_leader_pid = int(completed_process.stdout.split(b"=")[1])
63 |     # Now iterate over all child processes from this container.
64 |     leader_process = Process(container_leader_pid)
65 |     for child in leader_process.children(recursive=True):
66 |         with open(f"/proc/{child.pid}/status", "rb") as statf:
67 |             for line in statf:
68 |                 nspid_match = NSPID_PARSING_RE.match(line)
69 |                 if nspid_match:
70 |                     ns_pids = list(map(int, nspid_match.group(1).strip().split(b"\t")))
71 |                     if ns_pids[-1] == container_pid:
72 |                         return ns_pids[0]
73 |     return None
74 | 
75 | 
76 | def readcstr(filelike: BinaryIO) -> bytes:
77 |     """
78 |     Read a NULL terminated C-string from a BinaryIO
79 |     Courtesy of https://stackoverflow.com/a/32775270
80 |     """
81 |     toeof = iter(functools.partial(filelike.read, 1), b"")
82 |     return b"".join(itertools.takewhile(b"\0".__ne__, toeof))
83 | 


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Pytest fixtures.
  3 | """
  4 | 
  5 | import os
  6 | import re
  7 | import subprocess
  8 | from pathlib import Path
  9 | from pwd import getpwnam
 10 | from tempfile import TemporaryDirectory
 11 | from typing import Iterator
 12 | 
 13 | import port_for
 14 | import psycopg
 15 | import pytest
 16 | from pytest import FixtureRequest
 17 | from pytest_postgresql.config import get_config
 18 | from pytest_postgresql.executor import PostgreSQLExecutor
 19 | from pytest_postgresql.executor_noop import NoopExecutor
 20 | 
 21 | from pgtracer.ebpf.collector import CollectorOptions
 22 | from pgtracer.ebpf.collector.guc import GUCTracerBPFCollector
 23 | from pgtracer.ebpf.collector.querytracer import (
 24 |     InstrumentationFlags,
 25 |     QueryTracerBPFCollector,
 26 | )
 27 | from pgtracer.utils import resolve_container_pid
 28 | 
 29 | 
 30 | def pytest_addoption(parser):
 31 |     """
 32 |     Add the required options to pytest.
 33 |     """
 34 |     parser.addoption(
 35 |         "--container",
 36 |         help="Set this if the backend we are testing against is "
 37 |         "running inside a container.",
 38 |     )
 39 | 
 40 | 
 41 | def pytest_configure(config):
 42 |     """
 43 |     Add used markers.
 44 |     """
 45 |     config.addinivalue_line(
 46 |         "markers", "slow: mark test as being 'slow', allowing to skip it"
 47 |     )
 48 | 
 49 | 
 50 | @pytest.fixture(scope="session")
 51 | def nonroot_postgres(request: FixtureRequest) -> Iterator[PostgreSQLExecutor]:
 52 |     """
 53 |     Returns a PostgreSQLExecutor to a newly created instance, running as the
 54 |     postgres user.
 55 | 
 56 |     FIXME: make the unix user used to run the instance configurable.
 57 |     """
 58 | 
 59 |     config = get_config(request)
 60 | 
 61 |     # If we have a host, use that instead of creating a new instance.
 62 |     if request.config.getoption("postgresql_host"):
 63 |         postgresql_executor = NoopExecutor(
 64 |             config.get("host"), 5432, "postgres", {}, "postgres"
 65 |         )
 66 |         postgresql_executor.unixsocketdir = None
 67 |         yield postgresql_executor
 68 |         return
 69 | 
 70 |     postgresql_ctl = config["exec"]
 71 | 
 72 |     if not os.path.exists(postgresql_ctl):
 73 |         pg_bindir = subprocess.check_output(
 74 |             ["pg_config", "--bindir"], universal_newlines=True
 75 |         ).strip()
 76 |         postgresql_ctl = os.path.join(pg_bindir, "pg_ctl")
 77 | 
 78 |     pg_passwd = getpwnam("postgres")
 79 | 
 80 |     with TemporaryDirectory() as tempdir_str:
 81 |         tmpdir = Path(tempdir_str)
 82 |         os.chown(tmpdir, pg_passwd.pw_uid, pg_passwd.pw_gid)
 83 |         pg_port = port_for.select_random()
 84 |         datadir = tmpdir / f"data-{pg_port}"
 85 |         unix_socket_dir = tmpdir / "unix-socket"
 86 |         postgresql_executor = PostgreSQLExecutor(
 87 |             executable=postgresql_ctl,
 88 |             shell=True,
 89 |             port=pg_port,
 90 |             host="localhost",
 91 |             unixsocketdir=str(unix_socket_dir),
 92 |             logfile=str(tmpdir / "pg_log"),
 93 |             dbname="postgres",
 94 |             startparams="",
 95 |             datadir=str(datadir),
 96 |         )
 97 |         postgresql_executor.VERSION_RE = re.compile(
 98 |             ".* (?P<version>\\d+((\\.\\d+)|beta\\d|rc\\d|dev))"
 99 |         )
100 |         pid = os.fork()
101 |         if pid == 0:
102 |             try:
103 |                 os.setuid(pg_passwd.pw_uid)
104 |                 os.chdir(str(tmpdir))
105 |                 datadir.mkdir()
106 |                 unix_socket_dir.mkdir()
107 |                 postgresql_executor.start()
108 |                 postgresql_executor.wait_for_postgres()
109 |             except Exception as exc:  # pylint: disable=broad-except
110 |                 print(exc)
111 |                 os._exit(1)  # pylint: disable=protected-access
112 |             finally:
113 |                 os._exit(0)  # pylint: disable=protected-access
114 |         else:
115 |             pid, return_code = os.waitpid(pid, 0)
116 |             if return_code != 0:
117 |                 raise Exception("Could not start postgresql")
118 |             try:
119 |                 yield postgresql_executor
120 |             finally:
121 |                 pid = os.fork()
122 |                 if pid == 0:
123 |                     try:
124 |                         os.setuid(pg_passwd.pw_uid)
125 |                         postgresql_executor.stop()
126 |                     finally:
127 |                         os._exit(0)  # pylint: disable=protected-access
128 |                 os.waitpid(pid, 0)
129 | 
130 | 
131 | @pytest.fixture
132 | def connection(nonroot_postgres):  # pylint: disable=redefined-outer-name
133 |     """
134 |     Returns a connection to the temporary postgresql instance.
135 |     """
136 |     conn = psycopg.connect(
137 |         port=nonroot_postgres.port,
138 |         host=nonroot_postgres.unixsocketdir or nonroot_postgres.host,
139 |         user=nonroot_postgres.user,
140 |     )
141 |     yield conn
142 |     conn.close()
143 | 
144 | 
145 | def make_collector(
146 |     cls, connection, config, **kwargs
147 | ):  # pylint: disable=redefined-outer-name
148 |     """
149 |     Create a collector from a connection.
150 |     """
151 |     backend_pid = connection.info.backend_pid
152 |     if config.getoption("container"):
153 |         # If we have a container, look into it to translate the backend_pid
154 |         # to the host namespace.
155 |         backend_pid = resolve_container_pid(config.getoption("container"), backend_pid)
156 |     options = cls.options_cls(**kwargs)
157 |     collector = cls.from_pid(pid=backend_pid, options=options)
158 |     collector.start()
159 |     return collector
160 | 
161 | 
162 | @pytest.fixture
163 | def querytracer_factory(connection, request):
164 |     def factory_func(**kwargs):
165 |         kwargs.setdefault("enable_nodes_collection", True)
166 |         return make_collector(
167 |             QueryTracerBPFCollector, connection, request.config, **kwargs
168 |         )
169 | 
170 |     return factory_func
171 | 
172 | 
173 | @pytest.fixture
174 | def querytracer(
175 |     request: FixtureRequest, connection
176 | ):  # pylint: disable=redefined-outer-name
177 |     """
178 |     Returns a bpfcollector associated to the current connection.
179 |     """
180 |     collector = make_collector(
181 |         QueryTracerBPFCollector,
182 |         connection,
183 |         request.config,
184 |         enable_nodes_collection=True,
185 |     )
186 |     yield collector
187 |     collector.stop()
188 | 
189 | 
190 | @pytest.fixture
191 | def querytracer_instrumented(
192 |     request: FixtureRequest, connection
193 | ):  # pylint: disable=redefined-outer-name
194 |     """
195 |     Returns a bpfcollector with instrumentation turned on.
196 |     """
197 |     collector = make_collector(
198 |         QueryTracerBPFCollector,
199 |         connection,
200 |         request.config,
201 |         instrument_flags=InstrumentationFlags.ALL,
202 |         enable_perf_events=True,
203 |         enable_query_discovery=True,
204 |         enable_nodes_collection=True,
205 |     )
206 |     yield collector
207 |     collector.stop()
208 | 
209 | 
210 | @pytest.fixture
211 | def guctracer(request: FixtureRequest, connection):
212 |     """
213 |     Fixture returning an instance of a GUCTracer.
214 |     """
215 |     collector = make_collector(GUCTracerBPFCollector, connection, request.config)
216 |     yield collector
217 |     collector.stop()
218 | 


--------------------------------------------------------------------------------
/tests/scripts/setup_fedora_container.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | set -e
 3 | 
 4 | # Create a directory for the container
 5 | mkdir ~/fedora
 6 | mkdir -p /var/lib/machines/fedora
 7 | mount -o bind ~/fedora /var/lib/machines/fedora
 8 | mkdir -p /etc/distro.repos.d
 9 | # Configure yum repos for fedora
10 | cat << EOF > /etc/distro.repos.d/fedora.repo
11 | [fedora]
12 | name=Fedora  \$releasever – \$basearch
13 | failovermethod=priority
14 | baseurl=http://download.fedoraproject.org/pub/fedora/linux/releases/\$releasever/Everything/\$basearch/os
15 | metalink=https://mirrors.fedoraproject.org/metalink?repo=fedora-\$releasever&arch=\$basearch
16 | enabled=1
17 | gpgcheck=1
18 | gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-fedora-\$releasever-\$basearch
19 | metadata_expire=1
20 | skip_if_unavailable=False
21 | EOF
22 | 
23 | # Install the fedora key for f36
24 | # TODO: generalize it
25 | mkdir -p /etc/pki/rpm-gpg/
26 | wget https://getfedora.org/static/fedora.gpg -O /etc/pki/rpm-gpg/RPM-GPG-KEY-fedora-36-x86_64
27 | 
28 | # Install the required packages in the container
29 | dnf -y --releasever=36 --best \
30 |   --refresh \
31 |   --setopt=install_weak_deps=False \
32 |   --installroot=/var/lib/machines/fedora/ \
33 |   install \
34 |   dhcp-client dnf fedora-release glibc glibc-langpack-en glibc-langpack-de \
35 |   iputils less ncurses passwd systemd \
36 |   systemd-networkd systemd-resolved util-linux vim-default-editor \
37 |   postgresql-server dnf-utils dnf-plugins-core \
38 |   python-bcc python-pip libunwind
39 | 
40 | rm /var/lib/machines/fedora/etc/resolv.conf
41 | cp /etc/resolv.conf /var/lib/machines/fedora/etc/resolve.conf
42 | 
43 | systemd-nspawn -D /var/lib/machines/fedora/ /usr/bin/dnf --best -y --releasever=36 install postgresql-server
44 | systemd-nspawn -D /var/lib/machines/fedora/ /usr/bin/dnf -y --releasever=36 debuginfo-install postgresql-server
45 | 
46 | # Set a dummy password for the root user
47 | systemd-nspawn --console=pipe -D /var/lib/machines/fedora/ passwd root --stdin << EOF
48 | fedora
49 | EOF
50 | 
51 | systemctl start systemd-nspawn@fedora
52 | sleep 2
53 | systemd-run --machine fedora --pipe --wait  /usr/bin/postgresql-setup --initdb
54 | systemd-run --machine fedora --pipe --wait /usr/bin/sed "s/#listen_addresses = 'localhost'/listen_addresses = '*'/" /var/lib/pgsql/data/postgresql.conf -i
55 | systemd-run --machine fedora --pipe --wait /usr/bin/bash -c 'echo "host all all 0.0.0.0/0 trust" > /var/lib/pgsql/data/pg_hba.conf'
56 | systemd-run --machine fedora --pipe --wait /usr/bin/systemctl enable postgresql --now
57 | 
58 | 
59 | systemd-run --machine fedora --pipe --wait /usr/sbin/ip link set up host0
60 | systemd-run --machine fedora --pipe --wait /usr/sbin/ip addr add 172.16.0.1/30 dev host0
61 | systemd-run --machine fedora --pipe --wait /usr/sbin/ip route add default dev host0
62 | 
63 | # Ok, now we need to assign a static IP address
64 | ip link   set up ve-fedora
65 | ip route add 172.16.0.0/30 dev ve-fedora
66 | ip addr add 172.16.0.2/30 dev ve-fedora
67 | 


--------------------------------------------------------------------------------
/tests/test_bins/Makefile:
--------------------------------------------------------------------------------
1 | %.elf: %.elf.c
2 | 	gcc -Wl,--build-id -gdwarf-5 -O0 -c $*.elf.c -o $@
3 | 
4 | %.main: %.main.c
5 | 	gcc -Wl,--build-id -gdwarf-5 -O0 $*.main.c -o $@
6 | 
7 | all: test.elf test_stack.main
8 | 


--------------------------------------------------------------------------------
/tests/test_bins/test.elf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Aiven-Open/pgtracer/5faf30a695f7b45711e987df8a82c904a36e581f/tests/test_bins/test.elf


--------------------------------------------------------------------------------
/tests/test_bins/test.elf.c:
--------------------------------------------------------------------------------
 1 | typedef struct StructA {
 2 | 	int a_int;
 3 | 	float a_float;
 4 | 	char* a_charp;
 5 | } StructA;
 6 | 
 7 | typedef struct StructB {
 8 | 	StructA b_structa;
 9 | 	StructA* b_structap;
10 | 	struct StructB* b_structbp;
11 | } StructB;
12 | 
13 | StructA GLOBAL_STRUCT_A = {1, 1.0, "TEST"};
14 | 
15 | StructB GLOBAL_STRUCT_B = {0};
16 | 


--------------------------------------------------------------------------------
/tests/test_bins/test_stack.main:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Aiven-Open/pgtracer/5faf30a695f7b45711e987df8a82c904a36e581f/tests/test_bins/test_stack.main


--------------------------------------------------------------------------------
/tests/test_bins/test_stack.main.c:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | int func_1(int a, int b)
 4 | {
 5 | 	int c = a + b;
 6 | 	return c;
 7 | }
 8 | 
 9 | int func_2(int a, int b)
10 | {
11 | 	return func_1(a + 1, b + 2);
12 | }
13 | 
14 | int main(int argc, char** argv)
15 | {
16 | 	/*
17 | 	 * Block until the testing program sends something on stdin.
18 | 	 * This is to allow for the testing program to get our proc/maps
19 | 	 */
20 | 	getchar();
21 | 	return func_2(10, 20);
22 | }
23 | 


--------------------------------------------------------------------------------
/tests/test_dwarf.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This module tests some utilities from the dwarf module.
  3 | """
  4 | 
  5 | import ctypes as ct
  6 | import os
  7 | from pathlib import Path
  8 | from unittest import TestCase
  9 | from unittest.mock import patch
 10 | 
 11 | from pgtracer.ebpf.dwarf import (
 12 |     DWARFPointer,
 13 |     ProcessMetadata,
 14 |     Struct,
 15 |     StructMemberDefinition,
 16 | )
 17 | from pgtracer.ebpf.eh_frame_hdr import EhFrameHdr
 18 | 
 19 | TEST_BINARY = Path(__file__).parent / "test_bins" / "test.elf"
 20 | TEST_EXEC_BINARY = Path(__file__).parent / "test_bins" / "test_stack.main"
 21 | 
 22 | 
 23 | class MockProcess:
 24 |     """
 25 |     Mock a pypsutil.Process.
 26 |     """
 27 | 
 28 |     def __init__(self, binary):
 29 |         self.binary = binary
 30 | 
 31 |     def exe(self):
 32 |         """
 33 |         Returns a constant binary string.
 34 |         """
 35 |         return self.binary
 36 | 
 37 |     @property
 38 |     def pid(self):
 39 |         """
 40 |         Returns self pid. We only need an existing pid...
 41 |         """
 42 |         return os.getpid()
 43 | 
 44 | 
 45 | class TestProcessMetadata(TestCase):
 46 |     """
 47 |     Test the dwarf helpers in ProcessMetadata.
 48 |     """
 49 | 
 50 |     @patch("pgtracer.ebpf.dwarf.get_mapped_regions", lambda process, root: [])
 51 |     def setUp(self):
 52 |         self.process_meta = ProcessMetadata(MockProcess(TEST_BINARY))
 53 |         self.exec_process_meta = ProcessMetadata(MockProcess(TEST_EXEC_BINARY))
 54 | 
 55 |     def test_struct(self):
 56 |         """
 57 |         Test the struct parsing helper.
 58 |         """
 59 |         structs = self.process_meta.structs
 60 | 
 61 |         StructA = structs.StructA  # pylint: disable=invalid-name
 62 |         self.assertTrue(issubclass(StructA, Struct))
 63 |         self.assertEqual(StructA.size, 16)
 64 | 
 65 |         a_int = StructA.field_definition("a_int")
 66 |         self.assertIsInstance(a_int, StructMemberDefinition)
 67 |         self.assertEqual(a_int.offset, 0)
 68 |         self.assertEqual(a_int.member_type, ct.c_int)
 69 | 
 70 |         a_float = StructA.field_definition("a_float")
 71 |         self.assertEqual(a_float.offset, 4)
 72 |         self.assertEqual(a_float.member_type, ct.c_float)
 73 | 
 74 |         a_charp = StructA.field_definition("a_charp")
 75 |         self.assertEqual(a_charp.offset, 8)
 76 |         self.assertTrue(issubclass(a_charp.member_type, ct._Pointer))
 77 |         self.assertEqual(a_charp.member_type._type_, ct.c_byte)
 78 | 
 79 |         StructB = structs.StructB  # pylint: disable=invalid-name
 80 | 
 81 |         b_structa = StructB.field_definition("b_structa")
 82 |         self.assertEqual(b_structa.offset, 0)
 83 |         self.assertEqual(b_structa.member_type, StructA)
 84 | 
 85 |         b_structap = StructB.field_definition("b_structap")
 86 |         self.assertEqual(b_structap.offset, StructA.size)
 87 |         self.assertTrue(issubclass(b_structap.member_type, DWARFPointer))
 88 |         self.assertEqual(b_structap.member_type.pointed_type, StructA)
 89 | 
 90 |         b_structbp = StructB.field_definition("b_structbp")
 91 |         self.assertEqual(b_structbp.offset, StructA.size + 8)
 92 |         self.assertTrue(issubclass(b_structbp.member_type, DWARFPointer))
 93 |         self.assertEqual(b_structbp.member_type.pointed_type, StructB)
 94 | 
 95 |     def test_eh_frame_hdr(self):
 96 |         """
 97 |         The the eh_frame_hdr parser.
 98 |         """
 99 |         eh_frame_hdr = EhFrameHdr.load_eh_frame_hdr(self.exec_process_meta.elffile)
100 |         all_entries = list(eh_frame_hdr.iter_entries())
101 |         assert len(all_entries) == 5
102 |         assert eh_frame_hdr.fde_count == 5
103 |         assert eh_frame_hdr.find_fde(0) == None
104 |         assert eh_frame_hdr.find_fde(0xFFFFFFFFF) == None
105 |         assert eh_frame_hdr.find_fde(4412).header.initial_location == 4409
106 | 
107 |     def test_die_contains_addr(self):
108 |         dw = self.exec_process_meta.dwarf_info
109 |         all_cus = list(dw.iter_CUs())
110 |         # CU at index 3 as a DW_AT_ranges attribute
111 |         cu = all_cus[3]
112 |         die = cu.get_top_DIE()
113 |         assert self.exec_process_meta.die_contains_addr(die, 4096)
114 |         assert self.exec_process_meta.die_contains_addr(die, 4100)
115 |         assert not self.exec_process_meta.die_contains_addr(die, 4095)
116 |         assert not self.exec_process_meta.die_contains_addr(die, 4118)
117 | 


--------------------------------------------------------------------------------
/tests/test_guctracer.py:
--------------------------------------------------------------------------------
 1 | from datetime import datetime, timedelta
 2 | from time import sleep
 3 | from unittest.mock import patch
 4 | 
 5 | from pgtracer.ebpf.collector.guc import GUCTracerEventHandler
 6 | 
 7 | 
 8 | def test_setting_one_guc(guctracer, connection):
 9 |     """
10 |     Test to set a GUC in a running backend.
11 |     """
12 |     guc_has_been_set = False
13 |     original_method = GUCTracerEventHandler.handle_GUCResponse
14 | 
15 |     def observe_guc_response(event_handler, collector, event, pid):
16 |         nonlocal guc_has_been_set
17 |         guc_has_been_set = True
18 |         return original_method(event_handler, collector, event, pid)
19 | 
20 |     with patch(
21 |         f"pgtracer.ebpf.collector.guc.GUCTracerEventHandler.handle_GUCResponse",
22 |         observe_guc_response,
23 |     ):
24 |         # Set work_mem to 64kB
25 |         guctracer.set_guc("work_mem", 64)
26 |         start = datetime.now()
27 |         while not guc_has_been_set and (datetime.now() - start) < timedelta(seconds=20):
28 |             # Generate some activity to trigger the probe
29 |             with connection.execute("SELECT 1") as cur:
30 |                 pass
31 |             sleep(0.1)
32 |         with connection.execute("show work_mem") as cur:
33 |             result = cur.fetchall()
34 |             val = result[0][0]
35 |             # Depending on the version, it can come back as str or bytes
36 |             if isinstance(val, bytes):
37 |                 val = val.decode("utf8")
38 |             assert val == "64kB"
39 | 


--------------------------------------------------------------------------------
/tests/test_querytracer.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This module acts as a general health check for the eBPF collector.
  3 | """
  4 | import re
  5 | from collections import defaultdict
  6 | from contextlib import ExitStack
  7 | from datetime import timedelta
  8 | from threading import Thread
  9 | from time import sleep
 10 | from unittest.mock import patch
 11 | 
 12 | import pytest
 13 | from flaky import flaky
 14 | 
 15 | from pgtracer.ebpf.collector.querytracer import (
 16 |     InstrumentationFlags,
 17 |     QueryTracerEventHandler,
 18 | )
 19 | from pgtracer.utils import timespec_to_timedelta as tstimedelta
 20 | 
 21 | 
 22 | def wait_for_collector(collector):
 23 |     """
 24 |     Wait for the collector to have at least one query.
 25 |     """
 26 |     tries = 0
 27 |     process_info = collector.event_handler.per_process_info[collector.pid]
 28 |     while len(process_info.query_history) == 0 and tries < 1000:
 29 |         tries += 1
 30 |         sleep(0.05)
 31 | 
 32 | 
 33 | def test_basic_ebf_collector(querytracer, connection):
 34 |     """
 35 |     Test the most basic functionality of the ebpf collector works.
 36 |     """
 37 |     # Now try running a query, and see if we can get it back
 38 |     with connection.execute("SELECT now()") as cur:
 39 |         querystart = cur.fetchall()[0][0].replace(microsecond=0, tzinfo=None)
 40 |     wait_for_collector(querytracer)
 41 |     assert len(querytracer.event_handler.per_process_info) == 1
 42 |     process_info = querytracer.event_handler.per_process_info[querytracer.pid]
 43 |     assert len(process_info.query_history) == 1
 44 |     query = process_info.query_history[0]
 45 |     assert query.text == "SELECT now()"
 46 |     assert query.search_path == '"$user", public'
 47 |     assert query.start_datetime.replace(microsecond=0) == querystart
 48 |     assert query.runtime is None
 49 |     assert query.instrument.need_timer.value is False
 50 |     assert query.instrument.need_bufusage.value is False
 51 |     assert query.shared_buffers_hitratio is None
 52 |     assert query.syscache_hitratio is None
 53 | 
 54 | 
 55 | def test_instrumentation(querytracer_instrumented, connection):
 56 |     """
 57 |     Test that turning instrumentation on works as expected.
 58 |     """
 59 |     connection.execute("SET track_io_timing = on")
 60 |     # We want to have at least a few system reads, so do what is necessary...
 61 |     with open("/proc/sys/vm/drop_caches", "wb") as procf:
 62 |         procf.write(b"1")
 63 | 
 64 |     with connection.execute("SELECT * FROM pg_attribute") as cur:
 65 |         cur.fetchall()
 66 |     wait_for_collector(querytracer_instrumented)
 67 |     assert len(querytracer_instrumented.event_handler.per_process_info) == 1
 68 |     process_info = querytracer_instrumented.event_handler.per_process_info[
 69 |         querytracer_instrumented.pid
 70 |     ]
 71 | 
 72 |     assert len(process_info.query_history) == 1
 73 |     query = process_info.query_history[0]
 74 |     assert query.instrument.need_timer.value is True
 75 |     assert query.instrument.need_bufusage.value is True
 76 |     assert query.runtime > timedelta(0)
 77 |     assert query.instrument.bufusage.shared_blks_hit.value > 0
 78 |     assert query.instrument.bufusage.shared_blks_read.value >= 0
 79 |     assert query.instrument.bufusage.temp_blks_read.value == 0
 80 |     assert query.instrument.bufusage.temp_blks_written.value == 0
 81 |     if connection.info.server_version >= 150000:
 82 |         assert tstimedelta(query.instrument.bufusage.temp_blk_read_time) == timedelta(0)
 83 |         assert tstimedelta(query.instrument.bufusage.temp_blk_write_time) == timedelta(
 84 |             0
 85 |         )
 86 |     # We can't make any assumptions about the hit ratios, so just ensure they
 87 |     # have some valid values.
 88 |     assert 0 <= query.shared_buffers_hitratio < 100
 89 |     # The syscache_hitratio can be negative, when we actually end up reading
 90 |     # more blocks than what is accounted for by instrumentation.
 91 |     assert query.syscache_hitratio <= 100
 92 | 
 93 |     # Check that we don't crash without any instrumentation whatshowever
 94 |     query.instrument = None
 95 |     assert query.shared_buffers_hitratio is None
 96 |     assert query.syscache_hitratio is None
 97 | 
 98 |     # Generate some temp files for fun
 99 |     process_info.query_history = []
100 |     connection.execute("SET work_mem = '64kB'")
101 |     with connection.execute("SELECT * FROM generate_series(1, 10000) as t"):
102 |         pass
103 |     wait_for_collector(querytracer_instrumented)
104 |     query = process_info.query_history[0]
105 |     assert query.text == "SELECT * FROM generate_series(1, 10000) as t"
106 |     assert query.instrument.bufusage.temp_blks_read.value > 0
107 |     assert query.instrument.bufusage.temp_blks_written.value > 0
108 |     if connection.info.server_version >= 150000:
109 |         assert tstimedelta(query.instrument.bufusage.temp_blk_read_time) > timedelta(0)
110 |         assert tstimedelta(query.instrument.bufusage.temp_blk_write_time) > timedelta(0)
111 | 
112 |     # Now do the same query with a big enough work_mem to trigger some memory allocations
113 |     connection.execute("SET work_mem = '32MB'")
114 |     process_info.query_history = []
115 |     with connection.execute("SELECT * FROM generate_series(1, 10000) as t") as cur:
116 |         pass
117 |     wait_for_collector(querytracer_instrumented)
118 |     query = process_info.query_history[0]
119 |     # The reparatition between sbrk / mmap and wether we move sbrk back to it's initial
120 |     # value depends on the state of malloc and it's configuration. So best thing we can test is that "something"
121 |     # happened
122 |     assert query.memallocs.current_mem_peak > 0
123 |     # We can't assert anything meaningful about total_malloc but we can at least exercise the code
124 |     assert query.memallocs.total_malloc is not None
125 | 
126 | 
127 | def test_plans(querytracer_instrumented, connection):
128 |     """
129 |     Test that we are able to build a plans.
130 |     """
131 |     with connection.execute(
132 |         "SELECT * FROM (SELECT * FROM pg_class ORDER BY reltype LIMIT 10) t"
133 |     ) as cur:
134 |         cur.fetchall()
135 |     wait_for_collector(querytracer_instrumented)
136 |     process_info = querytracer_instrumented.event_handler.per_process_info[
137 |         querytracer_instrumented.pid
138 |     ]
139 |     query = process_info.query_history[0]
140 |     root_node = query.root_node
141 |     NodeTag = querytracer_instrumented.metadata.enums.NodeTag
142 |     assert root_node.tag == NodeTag.T_Limit
143 |     assert len(root_node.children) == 1
144 |     assert root_node.parent_node is None
145 |     assert root_node.instrument.tuplecount.value == 10
146 | 
147 |     sort_node = list(root_node.children)[0]
148 |     assert sort_node.tag == NodeTag.T_Sort
149 |     assert len(sort_node.children) == 1
150 |     assert sort_node.parent_node == root_node
151 |     # FIXME: investigate why we can't fetch this value on ubuntu's PG11.
152 |     if connection.info.server_version >= 120000:
153 |         assert sort_node.instrument.tuplecount.value == 10
154 | 
155 |     seqscan_node = list(sort_node.children)[0]
156 |     assert seqscan_node.tag == NodeTag.T_SeqScan
157 |     assert len(seqscan_node.children) == 0
158 |     assert seqscan_node.parent_node == sort_node
159 | 
160 | 
161 | def test_explain(querytracer, connection):
162 |     """
163 |     Test that we are able to build a plans.
164 |     """
165 |     # We have some trouble with collecting instrumentation for PG < 12
166 |     if connection.info.server_version < 120000:
167 |         return
168 |     cost_snippet = r"\d+\.\d+\..\d+\.\d+"
169 |     wanted_plan = rf"""Limit \(cost={cost_snippet} rows=10 width=\d+\) \(actual time=0.000...0.000 rows=0 loops=1\)
170 | \t-> Sort \(cost={cost_snippet} rows=\d+ width=\d+\) \(actual time=0.000...0.000 rows=0 loops=1\)
171 | \t\t-> SeqScan \(cost={cost_snippet} rows=\d+ width=\d+\) \(actual time=0.000...0.000 rows=0 loops=1\)"""
172 | 
173 |     with connection.execute(
174 |         "SELECT * FROM (SELECT * FROM pg_class ORDER BY reltype LIMIT 10) t"
175 |     ) as cur:
176 |         cur.fetchall()
177 |     wait_for_collector(querytracer)
178 |     assert len(querytracer.event_handler.per_process_info) == 1
179 |     process_info = querytracer.event_handler.per_process_info[querytracer.pid]
180 |     query = process_info.query_history[0]
181 |     root_node = query.root_node
182 |     assert re.match(wanted_plan, root_node.explain())
183 | 
184 | 
185 | def background_query(connection, query):
186 |     def execute_query():
187 |         with connection.execute(query) as cur:
188 |             cur.fetchall()
189 | 
190 |     newthread = Thread(target=execute_query)
191 |     newthread.start()
192 |     return newthread
193 | 
194 | 
195 | @pytest.mark.slow
196 | def test_long_query(querytracer_instrumented, connection):
197 |     events = defaultdict(int)
198 | 
199 |     def event_handler_observer(method_name):
200 |         original_method = getattr(QueryTracerEventHandler, method_name)
201 | 
202 |         def observe_event_handler(event_handler, bpf_collector, event, pid):
203 |             events[method_name] += 1
204 |             return original_method(event_handler, bpf_collector, event, pid)
205 | 
206 |         return observe_event_handler
207 | 
208 |     with ExitStack() as stack:
209 |         for meth_name in (
210 |             "handle_MemoryResponseNodeInstr",
211 |             "handle_MemoryResponseQueryInstr",
212 |         ):
213 |             stack.enter_context(
214 |                 patch(
215 |                     f"pgtracer.ebpf.collector.querytracer.QueryTracerEventHandler.{meth_name}",
216 |                     event_handler_observer(meth_name),
217 |                 )
218 |             )
219 |         with connection.execute(
220 |             """SELECT count(*) FROM (
221 |             SELECT pg_sleep(0.01)
222 |             FROM pg_class
223 |             JOIN pg_attribute ON pg_class.oid = attrelid
224 |             ) as s """
225 |         ) as cur:
226 |             cur.fetchall()
227 |         wait_for_collector(querytracer_instrumented)
228 |     assert events["handle_MemoryResponseQueryInstr"] > 0
229 |     assert events["handle_MemoryResponseNodeInstr"] > 0
230 | 
231 | 
232 | @pytest.mark.slow
233 | @flaky(max_runs=5)
234 | def test_query_discovery(querytracer_factory, connection):
235 |     """
236 |     Test that information is gathered during a query.
237 |     """
238 |     events = defaultdict(int)
239 | 
240 |     def event_handler_observer(method_name):
241 |         original_method = getattr(QueryTracerEventHandler, method_name)
242 | 
243 |         def observe_event_handler(event_handler, bpf_collector, event, pid):
244 |             events[method_name] += 1
245 |             return original_method(event_handler, bpf_collector, event, pid)
246 | 
247 |         return observe_event_handler
248 | 
249 |     with ExitStack() as stack:
250 |         for meth_name in ("handle_StackSample", "handle_MemoryNodeData"):
251 |             stack.enter_context(
252 |                 patch(
253 |                     f"pgtracer.ebpf.collector.querytracer.QueryTracerEventHandler.{meth_name}",
254 |                     event_handler_observer(meth_name),
255 |                 )
256 |             )
257 |         thread = background_query(
258 |             connection,
259 |             """SELECT count(*) FROM (
260 |             SELECT pg_sleep(0.01)
261 |             FROM pg_class
262 |             JOIN pg_attribute ON pg_class.oid = attrelid
263 |             ) as s """,
264 |         )
265 |         # Now set up the collector.
266 |         collector = None
267 |         try:
268 |             collector = querytracer_factory(
269 |                 instrument_flags=InstrumentationFlags.ALL,
270 |                 enable_perf_events=True,
271 |                 enable_query_discovery=True,
272 |                 enable_nodes_collection=True,
273 |                 sample_freq=1200,
274 |             )
275 |             # And wait for the query to finish
276 |             thread.join()
277 |             # Wait a few seconds more to make sure collector has gathered all info
278 |             sleep(3)
279 |         finally:
280 |             if collector is not None:
281 |                 collector.stop()
282 |     assert events["handle_StackSample"] > 0
283 |     assert events["handle_MemoryNodeData"] > 0
284 | 


--------------------------------------------------------------------------------
/tests/test_stack_unwinding.py:
--------------------------------------------------------------------------------
  1 | """
  2 | This module tests the frame unwinding code.
  3 | """
  4 | import ctypes as ct
  5 | import subprocess
  6 | from pathlib import Path
  7 | from unittest import TestCase
  8 | 
  9 | from bcc import BPF
 10 | from bcc.libbcc import lib as libbcc
 11 | from pypsutil import Process
 12 | 
 13 | from pgtracer.ebpf.collector import CODE_BASE_PATH
 14 | from pgtracer.ebpf.dwarf import ProcessMetadata, die_name
 15 | from pgtracer.ebpf.unwind import MAX_STACK_READ, UnwindAddressSpace, stack_data_t
 16 | 
 17 | TEST_EBPF_PROGRAM = """
 18 | /*
 19 |  * Fill in placeholders for generated defines
 20 |  */
 21 | #define EVENTRING_PAGE_SIZE 1024
 22 | #include "ebpf_maps.h"
 23 | #include "stack.h"
 24 | 
 25 | int capture_stack_enter(struct pt_regs *ctx)
 26 | {
 27 |     struct stack_data_t* stack_data = event_ring.ringbuf_reserve(sizeof(struct
 28 |     stack_data_t));
 29 |     int i = 0, ret = 0;
 30 |     u64 maxread = MAX_STACK_READ;
 31 |     if (!stack_data)
 32 |         return -1;
 33 |     while(stack_data && i < 10)
 34 |     {
 35 |         ret = capture_stack(ctx, stack_data, maxread);
 36 |         i++;
 37 |         maxread = maxread / 2;
 38 |     }
 39 |     event_ring.ringbuf_submit(stack_data, 0);
 40 | }
 41 | """
 42 | 
 43 | 
 44 | class TestStackUnwinding(TestCase):
 45 |     def setUp(self):
 46 |         self.captured_data = []
 47 | 
 48 |     def tearDown(self):
 49 |         for k, v in list(self.ebpf.uprobe_fds.items()):
 50 |             self.ebpf.detach_uprobe_event(k)
 51 | 
 52 |     def _capture_data(self, cpu, data, size):
 53 |         content = stack_data_t()
 54 |         ct.pointer(content)[0] = ct.cast(data, ct.POINTER(stack_data_t)).contents
 55 |         self.captured_data.append(content)
 56 | 
 57 |     def test_simple_call_stack(self):
 58 |         # Load an eBPF program which will capture stacks.
 59 |         binpath = Path(__file__).parent / "test_bins" / "test_stack.main"
 60 | 
 61 |         # Run the program.
 62 |         program = subprocess.Popen([binpath], stdin=subprocess.PIPE)
 63 |         # Now get the stack base address for the program.
 64 |         pm = ProcessMetadata(Process(program.pid))
 65 |         bpf_prog = f"#define STACK_TOP_ADDR {pm.stack_top}\n"
 66 |         bpf_prog += f"#define MAX_STACK_READ {MAX_STACK_READ}\n"
 67 |         bpf_prog += TEST_EBPF_PROGRAM
 68 | 
 69 |         self.ebpf = BPF(
 70 |             text=bpf_prog.encode("utf8"),
 71 |             cflags=[f"-I{CODE_BASE_PATH}"],
 72 |         )
 73 |         self.ebpf.attach_uprobe(
 74 |             name=str(binpath).encode("utf8"),
 75 |             fn_name=b"capture_stack_enter",
 76 |             sym=b"func_1",
 77 |         )
 78 |         self.ebpf.attach_uprobe(
 79 |             name=str(binpath).encode("utf8"),
 80 |             fn_name=b"capture_stack_enter",
 81 |             sym=b"func_2",
 82 |         )
 83 |         self.ebpf[b"event_ring"].open_ring_buffer(self._capture_data)
 84 |         # Ok, now everything is ready for the program to actually run.
 85 |         program.communicate(input=b"C")
 86 |         # Now that the ebpf program has been loaded, run the executable and
 87 |         # check the output.
 88 |         self.ebpf.ring_buffer_poll()
 89 |         assert len(self.captured_data) == 2
 90 | 
 91 |         # First stack should be:
 92 |         # (???) libc
 93 |         #   main
 94 |         #    func_2
 95 |         adress_space = UnwindAddressSpace(self.captured_data[0], pm)
 96 |         frames = list(adress_space.frames())
 97 |         assert len(frames) == 3
 98 |         assert frames[0].region.path == str(binpath)
 99 |         assert die_name(frames[0].die) == "func_2"
100 |         assert frames[1].region.path == str(binpath)
101 |         assert die_name(frames[1].die) == "main"
102 |         libname = Path(frames[2].region.path)
103 |         # Remove all suffixes
104 |         while libname.suffix != ".so":
105 |             libname = libname.with_suffix("")
106 |         assert libname.name == "libc.so"
107 |         assert frames[2].die is None
108 | 
109 |         # Second stack should be:
110 |         # (???) libc
111 |         #   main
112 |         #    func_2
113 |         #      func_1
114 |         adress_space = UnwindAddressSpace(self.captured_data[1], pm)
115 |         frames = list(adress_space.frames())
116 |         assert len(frames) == 4
117 |         assert frames[0].region.path == str(binpath)
118 |         assert die_name(frames[0].die) == "func_1"
119 |         assert frames[1].region.path == str(binpath)
120 |         assert die_name(frames[1].die) == "func_2"
121 |         assert frames[2].region.path == str(binpath)
122 |         assert die_name(frames[2].die) == "main"
123 |         libname = Path(frames[3].region.path)
124 |         # Remove all suffixes
125 |         while libname.suffix != ".so":
126 |             libname = libname.with_suffix("")
127 |         assert libname.name == "libc.so"
128 |         assert frames[3].die is None
129 | 
130 |         # Check the argument values
131 |         assert frames[0].fetch_arg(1, ct.c_int).value == 11
132 |         assert frames[0].fetch_arg(2, ct.c_int).value == 22
133 |         assert frames[1].fetch_arg(1, ct.c_int).value == 10
134 |         assert frames[1].fetch_arg(2, ct.c_int).value == 20
135 | 


--------------------------------------------------------------------------------