├── .github
│   ├── CODEOWNERS
│   ├── ISSUE_TEMPLATE
│   │   ├── Bug_report.md
│   │   ├── Feature_request.md
│   │   ├── Question.md
│   │   ├── agents-discussion.md
│   │   ├── apm-agents-meta.md
│   │   ├── design_issue.md
│   │   └── new-fields-issue.md
│   ├── dependabot.yml
│   ├── pull_request_template.md
│   └── workflows
│       ├── apm-agent-meta-issue-action.yml
│       ├── generate-plantuml.yml
│       └── merge-schedule.yml
├── .gitignore
├── LICENSE
├── README.md
├── apm-logo.svg
├── specs
│   ├── agents
│   │   ├── README.md
│   │   ├── breaking-changes.md
│   │   ├── configuration.md
│   │   ├── error-tracking.md
│   │   ├── field-limits.md
│   │   ├── handling-huge-traces
│   │   │   ├── README.md
│   │   │   ├── tracing-spans-compress.md
│   │   │   ├── tracing-spans-drop-fast-exit.md
│   │   │   ├── tracing-spans-dropped-stats.md
│   │   │   └── tracing-spans-limit.md
│   │   ├── log-correlation.md
│   │   ├── log-reformatting.md
│   │   ├── log-sending.md
│   │   ├── logging.md
│   │   ├── metadata.md
│   │   ├── metrics-health.md
│   │   ├── metrics-otel.md
│   │   ├── metrics.md
│   │   ├── mobile
│   │   │   ├── README.md
│   │   │   ├── configuration.md
│   │   │   ├── events.md
│   │   │   ├── metrics.md
│   │   │   └── session.md
│   │   ├── otel-distribution.md
│   │   ├── process-new-fields.md
│   │   ├── sanitization.md
│   │   ├── span-links.md
│   │   ├── trace-continuation.md
│   │   ├── tracing-api-otel.md
│   │   ├── tracing-api.md
│   │   ├── tracing-distributed-tracing.md
│   │   ├── tracing-instrumentation-aws-lambda.md
│   │   ├── tracing-instrumentation-aws.md
│   │   ├── tracing-instrumentation-azure-functions.md
│   │   ├── tracing-instrumentation-azure.md
│   │   ├── tracing-instrumentation-db.md
│   │   ├── tracing-instrumentation-graphql.md
│   │   ├── tracing-instrumentation-grpc.md
│   │   ├── tracing-instrumentation-http.md
│   │   ├── tracing-instrumentation-messaging.md
│   │   ├── tracing-sampling.md
│   │   ├── tracing-spans-destination.md
│   │   ├── tracing-spans-service-target.md
│   │   ├── tracing-spans.md
│   │   ├── tracing-transaction-grouping.md
│   │   ├── tracing-transactions.md
│   │   ├── transport.md
│   │   ├── uml
│   │   │   ├── kafka_consume.puml
│   │   │   ├── kafka_consume.svg
│   │   │   ├── publish.puml
│   │   │   └── publish.svg
│   │   └── universal-profiling-integration.md
│   ├── integrations
│   │   └── synthetics.md
│   └── terminology.md
└── tests
    └── agents
        ├── README.md
        ├── gherkin-specs
        │   ├── api_key.feature
        │   ├── azure_app_service_metadata.feature
        │   ├── azure_functions_metadata.feature
        │   ├── otel_bridge.feature
        │   ├── outcome.feature
        │   └── user_agent.feature
        └── json-specs
            ├── container_metadata_discovery.json
            ├── service_resource_inference.json
            ├── span_types.json
            ├── sql_signature_examples.json
            ├── sql_token_examples.json
            ├── w3c_distributed_tracing.json
            └── wildcard_matcher_tests.json
/.github/CODEOWNERS:
--------------------------------------------------------------------------------
1 | # Raising a PR against the agent spec folder will automatically request reviews from all agent teams
2 | # If the PR is not ready for review, create a draft PR instead
3 | # See also /.github/pull_request_template.md
4 | /specs/agents @elastic/ingest-otel-data @elastic/apm-agent-java @elastic/apm-agent-net @elastic/apm-agent-node-js @elastic/apm-agent-php @elastic/apm-agent-python @elastic/apm-agent-ruby @elastic/apm-agent-rum @elastic/apm-pm @elastic/apm-agent-ios @elastic/apm-agent-android
5 | /specs/agents/mobile @elastic/apm-agent-ios @elastic/apm-agent-android
6 | /.github/pull_request_template.md @elastic/ingest-otel-data @elastic/apm-agent-java @elastic/apm-agent-net @elastic/apm-agent-node-js @elastic/apm-agent-php @elastic/apm-agent-python @elastic/apm-agent-ruby @elastic/apm-agent-rum @elastic/apm-pm
7 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/Bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 |
5 | ---
6 |
7 | **Describe the bug**
8 | A clear and concise description of what the bug is.
9 |
10 | **To Reproduce**
11 | Steps to reproduce the behavior:
12 | 1. Use this config '...'
13 | 2. Then call '....'
14 | 3. Then do '....'
15 | 4. See error
16 |
17 | **Expected behavior**
18 | A clear and concise description of what you expected to happen.
19 |
20 | **Debug logs**
21 | Attach your debug logs.
22 | See the [documentation](https://www.elastic.co/guide/en/apm/agent/java/current/trouble-shooting.html#trouble-shooting-logging)
23 | about how to enable debug logging.
24 |
25 | <details>
26 | <summary>Click to expand</summary>
27 |
28 | ```
29 | replace this line with your debug logs
30 | ```
31 |
32 | </details>
33 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/Feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 |
5 | ---
6 |
7 | **Is your feature request related to a problem? Please describe.**
8 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
9 |
10 | **Describe the solution you'd like**
11 | A clear and concise description of what you want to happen.
12 |
13 | **Describe alternatives you've considered**
14 | A clear and concise description of any alternative solutions or features you've considered.
15 |
16 | **Additional context**
17 | Add any other context or screenshots about the feature request here.
18 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/Question.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Question
3 | about: Please use the discuss forum (https://discuss.elastic.co/c/apm) to ask questions
4 |
5 | ---
6 |
7 | Please use the [discuss forum](https://discuss.elastic.co/c/apm) to ask questions.
8 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/agents-discussion.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Agents discussion
3 | about: \
4 | Open a draft PR to change the specification to initiate a discussion.
5 | If discussion is required before a spec change proposal can even be assembled, create an Agent discussion issue first.
6 | labels: agents, discussion
7 | ---
8 |
9 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/apm-agents-meta.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Meta issue for APM agents.
3 | about: Template for a meta issue for APM agents that triggers creation of related agents sub-issues.
4 | labels: meta, apm-agents
5 | ---
6 |
7 | # Description
8 | Put the issue description here ...
9 |
10 |
11 |
17 |
18 | # Issues creation
19 |
20 | ## Create Spec issue?
21 | - [x] Spec issue
22 |
23 | ## Create APM Agent issues?
24 | - [x] elastic-otel-java
25 | - [x] elastic-otel-dotnet
26 | - [x] elastic-otel-node
27 | - [x] elastic-otel-python
28 | - [x] elastic-otel-php
29 | - [ ] apm-agent-java
30 | - [ ] apm-agent-dotnet
31 | - [ ] apm-agent-nodejs
32 | - [ ] apm-agent-python
33 | - [ ] apm-agent-go
34 | - [ ] apm-agent-php
35 | - [ ] apm-agent-ruby
36 |
37 |
38 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/design_issue.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Design issue
3 | labels: design
4 | about: Request UI/UX help for this project
5 | ---
6 |
7 | **Summary of the problem** (If there are multiple problems or use cases, prioritize them)
8 | For example: as a user, I want to quickly identify if one of my sites is down
9 |
10 | **User stories**
11 | For example: as an admin, I can create teams and invite new members
12 |
13 | **List known (technical) restrictions and requirements**
14 | For example: has to be scalable from 0-15k containers
15 |
16 | If in doubt, don’t hesitate to reach out to the `#observability-design` Slack channel.
17 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/new-fields-issue.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: New Metadata/Context Fields
3 | about: Template for proposing new metadata/context fields
4 | labels: agents, poll
5 | ---
6 |
7 |
10 |
11 | ## Proposed fields
12 |
13 | Add optional fields to
14 | - [ ] _span context_
15 | - [ ] _transaction context_
16 |
17 | As always, these fields should also be added to the _error context_.
18 |
19 | | Intake API field | Elasticsearch field | Elasticsearch Type |
20 | | -----------------|---------------------|-----------------------|
21 | | | | |
22 |
23 | ## JSON Schema
24 |
25 | ```
26 |
27 | ```
28 |
29 | ## Vote
30 |
31 |
35 |
36 | | Agent | Yes | No | Indifferent | N/A | Link to issue
37 | | --------|:----:|:---:|:-----------:|:----:|:-------------------:|
38 | | UI      | [ ] | [ ] | [ ] | [ ] | |
39 | | Server  | [ ] | [ ] | [ ] | [ ] | |
40 | | .NET    | [ ] | [ ] | [ ] | [ ] | |
41 | | Go      | [ ] | [ ] | [ ] | [ ] | |
42 | | Java    | [ ] | [ ] | [ ] | [ ] | |
43 | | Node.js | [ ] | [ ] | [ ] | [ ] | |
44 | | Python  | [ ] | [ ] | [ ] | [ ] | |
45 | | Ruby    | [ ] | [ ] | [ ] | [ ] | |
46 | | RUM     | [ ] | [ ] | [ ] | [ ] | |
47 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | ---
2 | version: 2
3 | updates:
4 | - package-ecosystem: "github-actions"
5 | directories:
6 | - "/"
7 | - "/.github/actions/*"
8 | schedule:
9 | interval: "weekly"
10 | day: "sunday"
11 | time: "22:00"
12 | groups:
13 | github-actions:
14 | patterns:
15 | - "*"
16 |
--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
1 |
8 |
9 |
17 |
18 | - [ ] Create PR as draft
19 | - [ ] Approval by at least one other agent
20 | - [ ] Mark as Ready for Review (automatically requests reviews from all agents and PM via [`CODEOWNERS`](https://github.com/elastic/apm/tree/main/.github/CODEOWNERS))
21 | - Remove PM from reviewers if impact on product is negligible
22 | - Remove agents from reviewers if the change is not relevant for them
23 | - [ ] Merge after 2 business days have passed without objections \
24 | To auto-merge the PR, add `/schedule YYYY-MM-DD` to the PR description.
25 |
26 |
31 |
32 | - May the instrumentation collect sensitive information, such as secrets or PII (ex. in headers)?
33 | - [ ] Yes
34 | - [ ] Add a section to the spec describing how agents should apply sanitization (such as `sanitize_field_names`)
35 | - [ ] No
36 | - [ ] Why?
37 | - [ ] n/a
38 | - [ ] Create PR as draft
39 | - [ ] Approval by at least one other agent
40 | - [ ] Mark as Ready for Review (automatically requests reviews from all agents and PM via [`CODEOWNERS`](https://github.com/elastic/apm/tree/main/.github/CODEOWNERS))
41 | - Remove PM from reviewers if impact on product is negligible
42 | - Remove agents from reviewers if the change is not relevant for them
43 | - [ ] Approved by at least 2 agents + PM (if relevant)
44 | - [ ] Merge after 7 days have passed without objections \
45 | To auto-merge the PR, add `/schedule YYYY-MM-DD` to the PR description.
46 | - [ ] [Create implementation issues through the meta issue template](https://github.com/elastic/apm/issues/new?assignees=&labels=meta%2C+apm-agents&template=apm-agents-meta.md) (this will automate issue creation for individual agents)
47 | - [ ] If this spec adds a new dynamic config option, [add it to central config](https://github.com/elastic/apm/blob/main/specs/agents/configuration.md#adding-a-new-configuration-option).
48 |
--------------------------------------------------------------------------------
/.github/workflows/apm-agent-meta-issue-action.yml:
--------------------------------------------------------------------------------
1 | name: "APM Agents meta issue handler"
2 | on:
3 | issues:
4 | types: [opened]
5 |
6 | permissions:
7 | contents: read
8 |
9 | jobs:
10 | meta-issue-handler:
11 | runs-on: ubuntu-latest
12 | steps:
13 | - name: Get token
14 | id: get_token
15 | uses: tibdex/github-app-token@3beb63f4bd073e61482598c45c71c1019b59b73a # v2.1.0
16 | with:
17 | app_id: ${{ secrets.OBS_AUTOMATION_APP_ID }}
18 | private_key: ${{ secrets.OBS_AUTOMATION_APP_PEM }}
19 | permissions: >-
20 | {
21 | "issues": "write",
22 | "members": "read"
23 | }
24 | - name: Check team membership for user
25 | uses: elastic/get-user-teams-membership@1.1.0
26 | id: checkUserMember
27 | with:
28 | username: ${{ github.actor }}
29 | team: 'observability'
30 | usernamesToExclude: |
31 | apmmachine
32 | GITHUB_TOKEN: ${{ steps.get_token.outputs.token }}
33 | - name: Create sub issues
34 | if: steps.checkUserMember.outputs.isTeamMember == 'true' && contains(github.event.issue.labels.*.name, 'meta') && contains(github.event.issue.labels.*.name, 'apm-agents')
35 | uses: elastic/gh-action-meta-subissues-creator@1.0.2
36 | id: create_sub_issues
37 | with:
38 | token: ${{ steps.get_token.outputs.token }}
39 | metaIssue: "${{ toJSON(github.event.issue) }}"
40 | bodyRegex: "(.*)(.*)(.*)"
41 | labelsToExclude: "meta,apm-agents"
42 | specLabels: "spec-poc,apm-agents"
43 |
--------------------------------------------------------------------------------
/.github/workflows/generate-plantuml.yml:
--------------------------------------------------------------------------------
1 | name: Generate PlantUML Diagrams
2 | on:
3 | push:
4 | paths:
5 | - "**.puml"
6 |
7 | permissions:
8 | contents: write
9 |
10 | jobs:
11 | plantuml:
12 | runs-on: ubuntu-latest
13 | steps:
14 | - name: Checkout Source
15 | uses: actions/checkout@v4
16 |
17 | - name: Get changed UML files
18 | id: getfile
19 | run: |
20 | echo "files=$(git diff --name-only HEAD^1 HEAD | grep .puml | xargs)" >> "$GITHUB_OUTPUT"
21 |
22 | - name: Generate SVG Diagrams
23 | uses: holowinski/plantuml-github-action@5ef932f0db4aa76e232fbf19f440248dd102b1d3
24 | with:
25 | args: -v -tsvg ${{steps.getfile.outputs.files}}
26 |
27 | - name: Push Local Changes
28 | uses: stefanzweifel/git-auto-commit-action@v5.0.1
29 | with:
30 | commit_message: "Generate SVG files for PlantUML diagrams"
31 | branch: ${{ github.head_ref }}
32 |
--------------------------------------------------------------------------------
/.github/workflows/merge-schedule.yml:
--------------------------------------------------------------------------------
1 | name: Merge Schedule
2 | on:
3 | pull_request_target:
4 | types:
5 | - opened
6 | - edited
7 | - synchronize
8 | schedule:
9 | # At 6pm EOB
10 | - cron: 0 18 * * *
11 |
12 | permissions:
13 | contents: write
14 |
15 | jobs:
16 | merge_schedule:
17 | runs-on: ubuntu-latest
18 | steps:
19 | - uses: gr2m/merge-schedule-action@v2
20 | with:
21 | # Merge method to use. Possible values are merge, squash or
22 | # rebase. Default is merge.
23 | merge_method: squash
24 | # Time zone to use. Default is UTC.
25 | time_zone: "America/Los_Angeles"
26 | env:
27 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
28 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Elastic APM
2 |
3 | This project includes resources and general issue tracking for Elastic APM.
4 |
5 | Help us make Elastic APM better by sharing your experience. [Schedule a 45-minute session](https://calendly.com/elasticapm-chris) with Elastic Product Management and share your feedback.
6 |
7 | ## What we do…
8 |
9 | | Project | Repo | Docs | Contrib
10 | | :- | :- | :- | :-
11 | | APM Server | [apm-server][] | [📘 elastic.co][apm-server-docs] | [📂 contrib][apm-server-contrib]
12 | | APM UI | [apm-ui][] | [📘 elastic.co][apm-ui-docs] | [📂 contrib][apm-ui-contrib]
13 | | Android Agent (🚧 In Development) | [apm-agent-android][] | [📘 elastic.co][apm-agent-android-docs] | [📂 contrib][apm-agent-android-contrib]
14 | | Go Agent | [apm-agent-go][] | [📘 elastic.co][apm-agent-go-docs] | [📂 contrib][apm-agent-go-contrib]
15 | | iOS Agent (🚧 In Development) | [apm-agent-ios][] | [📘 elastic.co][apm-agent-ios-docs] | [📂 contrib][apm-agent-ios-contrib]
16 | | Java Agent | [apm-agent-java][] | [📘 elastic.co][apm-agent-java-docs] | [📂 contrib][apm-agent-java-contrib]
17 | | JavaScript RUM Agent | [apm-agent-rum-js][] | [📘 elastic.co][apm-agent-js-base-docs] | [📂 contrib][apm-agent-rum-js-contrib]
18 | | Node.js Agent | [apm-agent-nodejs][] | [📘 elastic.co][apm-agent-nodejs-docs] | [📂 contrib][apm-agent-nodejs-contrib]
19 | | PHP Agent | [apm-agent-php][] | [📘 elastic.co][apm-agent-php-docs] | [📂 contrib][apm-agent-php-contrib]
20 | | Python Agent | [apm-agent-python][] | [📘 elastic.co][apm-agent-python-docs] | [📂 contrib][apm-agent-python-contrib]
21 | | Ruby Agent | [apm-agent-ruby][] | [📘 elastic.co][apm-agent-ruby-docs] | [📂 contrib][apm-agent-ruby-contrib]
22 | | .NET Agent | [apm-agent-dotnet][] | [📘 elastic.co][apm-agent-dotnet-docs] | [📂 contrib][apm-agent-dotnet-contrib]
23 |
24 |
25 | [apm-server-contrib]: https://github.com/elastic/apm-contrib/tree/main/apm-server
26 | [apm-server-docs]: https://www.elastic.co/guide/en/apm/guide/current/index.html
27 | [apm-server]: https://github.com/elastic/apm-server
28 |
29 | [apm-ui-contrib]: https://github.com/elastic/apm-contrib/tree/main/apm-ui
30 | [apm-ui-docs]: https://www.elastic.co/guide/en/kibana/current/xpack-apm.html
31 | [apm-ui]: https://github.com/elastic/kibana/tree/main/x-pack/plugins/apm
32 |
33 | [apm-agent-android-contrib]: https://github.com/elastic/apm-contrib/tree/main/apm-agent-android
34 | [apm-agent-android-docs]: https://www.elastic.co/guide/en/apm/agent/android/current/index.html
35 | [apm-agent-android]: https://github.com/elastic/apm-agent-android
36 |
37 | [apm-agent-go-contrib]: https://github.com/elastic/apm-contrib/tree/main/apm-agent-go
38 | [apm-agent-go-docs]: https://www.elastic.co/guide/en/apm/agent/go/current/index.html
39 | [apm-agent-go]: https://github.com/elastic/apm-agent-go
40 |
41 | [apm-agent-ios-contrib]: https://github.com/elastic/apm-contrib/tree/main/apm-agent-ios
42 | [apm-agent-ios-docs]: https://www.elastic.co/guide/en/apm/agent/swift/current/index.html
43 | [apm-agent-ios]: https://github.com/elastic/apm-agent-ios
44 |
45 | [apm-agent-java-contrib]: https://github.com/elastic/apm-contrib/tree/main/apm-agent-java
46 | [apm-agent-java-docs]: https://www.elastic.co/guide/en/apm/agent/java/current/index.html
47 | [apm-agent-java]: https://github.com/elastic/apm-agent-java
48 |
49 | [apm-agent-rum-js-contrib]: https://github.com/elastic/apm-contrib/tree/main/apm-agent-rum-js
50 | [apm-agent-js-base-docs]: https://www.elastic.co/guide/en/apm/agent/js-base/current/index.html
51 | [apm-agent-rum-js]: https://github.com/elastic/apm-agent-rum-js
52 |
53 | [apm-agent-nodejs-contrib]: https://github.com/elastic/apm-contrib/tree/main/apm-agent-nodejs
54 | [apm-agent-nodejs-docs]: https://www.elastic.co/guide/en/apm/agent/nodejs/current/index.html
55 | [apm-agent-nodejs]: https://github.com/elastic/apm-agent-nodejs
56 |
57 | [apm-agent-python-contrib]: https://github.com/elastic/apm-contrib/tree/main/apm-agent-python
58 | [apm-agent-python-docs]: https://www.elastic.co/guide/en/apm/agent/python/current/index.html
59 | [apm-agent-python]: https://github.com/elastic/apm-agent-python
60 |
61 | [apm-agent-ruby-contrib]: https://github.com/elastic/apm-contrib/tree/main/apm-agent-ruby
62 | [apm-agent-ruby-docs]: https://www.elastic.co/guide/en/apm/agent/ruby/current/index.html
63 | [apm-agent-ruby]: https://github.com/elastic/apm-agent-ruby
64 |
65 | [apm-agent-dotnet-contrib]: https://github.com/elastic/apm-contrib/tree/main/apm-agent-dotnet
66 | [apm-agent-dotnet-docs]: https://www.elastic.co/guide/en/apm/agent/dotnet/current/index.html
67 | [apm-agent-dotnet]: https://github.com/elastic/apm-agent-dotnet
68 |
69 | [apm-agent-php-contrib]: https://github.com/elastic/apm-contrib/tree/main/apm-agent-php
70 | [apm-agent-php-docs]: https://www.elastic.co/guide/en/apm/agent/php/current/index.html
71 | [apm-agent-php]: https://github.com/elastic/apm-agent-php
72 |
--------------------------------------------------------------------------------
/apm-logo.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/specs/agents/README.md:
--------------------------------------------------------------------------------
1 | # Building an agent
2 |
3 | So you want to build an agent for Elastic APM? That's great, here's what you need to know.
4 |
5 | **Note:** This is a living document.
6 | If you come across something weird or find something missing, please add it or open an issue.
7 |
8 | ---
9 |
10 | # Introduction
11 |
12 | The [Getting started with APM](https://www.elastic.co/guide/en/observability/current/apm.html) guide provides an overview of the big-picture architecture.
13 |
14 | Your agent will be talking to the APM Server using HTTP, sending data to it as JSON or ND-JSON. There are multiple categories of data that each agent captures and sends to the APM Server:
15 |
16 | - Trace data: transactions and spans (distributed tracing)
17 | - Errors/exceptions (i.e. for error tracking)
18 | - Metrics (host process-level metrics, and language/runtime-specific metrics)
19 |
20 | You can find details about each of these in the [APM Data Model](https://www.elastic.co/guide/en/observability/current/apm-data-model.html) documentation. The [Intake API](https://www.elastic.co/guide/en/observability/current/apm-api-events.html) documentation describes the wire format expected by APM Server. APM Server converts the data into Elasticsearch documents, and then the APM UI in Kibana provides visualisations over that data, as well as enabling you to dig into the data in an interactive way.
21 |
22 | # Guiding Philosophy
23 |
24 | 1. Agents should be good citizens of the programming language they are written for. Even though every agent reports to the same server API with the same JSON format, each agent should make as much sense as possible in the context of its language. We want to both streamline the agents to work the same in every context **but** also make them feel like they were built specifically for each language. It's up to you to figure out how this looks in the language you are writing your agent for.
25 |
26 | 2. Agents should be as close to zero configuration as possible.
27 |
28 | - Use sensible defaults, aligning across agents unless there is a compelling reason to have a language-specific default.
29 | - Agents should typically come with out-of-the-box instrumentation for the most popular frameworks or libraries of their relevant language.
30 | - Users should be able to disable specific instrumentation modules to reduce overhead, or where details are not interesting to them.
31 |
32 | 3. The overhead of agents must be kept to a minimum, and must not affect application behaviour.
33 |
34 |
35 | # Features to implement
36 |
37 | - [Transport](transport.md)
38 | - [Metadata](metadata.md)
39 | - Tracing
40 | - [Transactions](tracing-transactions.md)
41 | - [Transaction Grouping](tracing-transaction-grouping.md)
42 | - [Spans](tracing-spans.md)
43 | - [Span destination](tracing-spans-destination.md)
44 | - [Handling huge traces](handling-huge-traces/)
45 | - [Hard limit on number of spans to collect](handling-huge-traces/tracing-spans-limit.md)
46 | - [Collecting statistics about dropped spans](handling-huge-traces/tracing-spans-dropped-stats.md)
47 | - [Dropping fast exit spans](handling-huge-traces/tracing-spans-drop-fast-exit.md)
48 | - [Compressing spans](handling-huge-traces/tracing-spans-compress.md)
49 | - [Sampling](tracing-sampling.md)
50 | - [Distributed tracing](tracing-distributed-tracing.md)
51 | - [Tracer API](tracing-api.md)
52 | - Instrumentation
53 | - [AWS](tracing-instrumentation-aws.md)
54 | - [Databases](tracing-instrumentation-db.md)
55 | - [HTTP](tracing-instrumentation-http.md)
56 | - [Messaging systems](tracing-instrumentation-messaging.md)
57 | - [gRPC](tracing-instrumentation-grpc.md)
58 | - [GraphQL](tracing-instrumentation-graphql.md)
59 | - [OpenTelemetry API Bridge](tracing-api-otel.md)
60 | - [Error/exception tracking](error-tracking.md)
61 | - [Metrics](metrics.md)
62 | - [Logging Correlation](log-correlation.md)
63 | - [Agent Configuration](configuration.md)
64 | - [Agent logging](logging.md)
65 | - [Data sanitization](sanitization.md)
66 | - [Field limits](field-limits.md)
67 |
68 | # Processes
69 |
70 | - [Proposing changes to the specification](../../.github/pull_request_template.md)
71 | - [Proposing new fields to the intake API](process-new-fields.md)
72 |
--------------------------------------------------------------------------------
/specs/agents/breaking-changes.md:
--------------------------------------------------------------------------------
1 | # What is a breaking change in a version of an APM agent?
2 | A change is defined as breaking if it causes an application using an APM agent to break or if the APM product is no longer usable in a way that it previously was.
3 |
4 | Taken strictly, this definition could lead to treating every change in runtime behavior as a breaking change. At the same time, we need to be able to keep driving improvements to existing features of our APM product. This document gives some guidelines to help distinguish changes of implementation details from breaking changes.
5 |
6 | ## Types of breaking changes
7 | ### Instrumentation versions
8 | Each agent instruments a number of libraries that are used in its language ecosystem. These libraries may introduce new versions and breaking changes, and deprecate older versions, so APM agents will occasionally change their instrumentation of external libraries. The changes we consider breaking are those that remove support for older library versions. An agent may also continue supporting the instrumentation of an older library version but drop testing of it, for example because of conflicts in installing test suite dependencies. Such a change is not considered breaking as long as it is properly documented.
9 |
10 | ### Language and runtime support
11 | Similar to library instrumentation, APM agents typically support multiple versions of their language. Sometimes it is necessary to drop support for older language versions as they reach end of life (EOL).
12 |
13 | It is typically considered a breaking change when an APM agent drops support for a particular language version, but this may vary according to the conventions of the language ecosystem.
14 | For example, it is common practice for Go libraries to follow the Go project's release policy of only supporting the two most recent releases of Go.
15 |
16 | ### Configuration changes
17 | All agents support a set of configuration options and default values. Changes to the configuration offering can be categorized into three types:
18 |
19 | __Change in default configuration value__: Each APM agent configuration option has a default value. Sometimes we change what that default configuration value is. We should consider the _effect_ of changing the value when we evaluate whether the change is breaking. For example, the default configuration value could enrich the data and provide an enhanced experience to the user. In this case, we wouldn’t consider the change to be breaking. On the other hand, if a default value is changed, and as a consequence, removes some information the user was previously able to see, we would consider that a breaking change.
20 |
21 | __Removal of a configuration option__: It is a breaking change to remove a configuration option. For example, APM agents may have removed the option `active` in favor of a new option, `enabled`.
22 |
23 | __Change in configuration value behavior__: If the semantics of a configuration value are altered, the change is considered breaking. For example, the configuration option `span_frames_min_duration` can be set to an integer millisecond value, 0, or -1. At the time this document was written, setting this value to 0 means to collect no stack traces and -1 means to collect all stack traces. If there is a change in what the special values 0 and -1 mean, the change is a breaking one.
24 |
25 | ### Public API changes
26 | Each APM agent has a Public API that is marked as such and documented. Agents may make a change to their Public API in order to support new features, support new integrations, resolve inconsistencies between agents, or for other reasons.
27 |
28 | __Public API__: When the name of a Public API component is changed or if a component is removed, this change is considered breaking. Applications may depend on the APM agent’s Public API so the agent would ideally issue a deprecation warning and clearly document the upcoming change for users before the version including the change is released. For example, changing the Public API for setting the `service.destination.resource` value to setting two new fields instead (`service.target.name`, `service.target.type`) is considered to be a breaking change.
29 |
30 | __Public API behavior__: If the effects of using a part of the Public API or the semantics of that API are changed to enhance a user experience or enrich the data in some way, we don’t consider it a breaking change. A Public API behavior change that removes or alters some information that was there before is considered breaking.
31 |
32 | ### APM server support
33 | __Support for APM server versions__: If an APM agent removes support for an older APM server version, the change is considered breaking.
34 |
35 | __Support for APM server protocols__: Similarly, if the APM agent removes support for an APM server protocol, the change is breaking.
36 |
37 |
38 | ## What is not a Breaking change
39 | In general, we don’t consider changes in the data we collect to be breaking, unless they have security or privacy implications. Some examples of these changes are:
40 | - Span metadata, such as the span name or the structured `db.statement`, or destination granularity
41 | - Span compression (multiple similar or exact consecutive spans collapsed into one)
42 | - Trace structure (e.g. span links + handling of messaging)
43 |
44 |
--------------------------------------------------------------------------------
/specs/agents/error-tracking.md:
--------------------------------------------------------------------------------
1 | ## Error/exception tracking
2 |
3 | Agents support reporting exceptions/errors. Errors may come in one of two forms:
4 |
5 | - unhandled (or handled and explicitly reported) exceptions/errors
6 | - log records
7 |
8 | Agents should include exception handling in the instrumentation they provide, such that exceptions are reported to the APM Server automatically, without intervention. In addition, hooks into logging libraries may be provided such that logged errors are also sent to the APM Server.
9 |
10 | Error properties:
11 | * `id` (which in the case of errors is 128 bits, encoded as 32 hexadecimal digits)
12 |
13 | Additional properties that agents SHOULD collect when the error happens within the context of a transaction:
14 | * `trace_id`
15 | * `transaction_id`
16 | * `parent_id` (which is the `id` of the transaction or span that caused the error).
17 | * `transaction.sampled`
18 | * `transaction.name`†
19 | * `transaction.type`†
20 |
21 | † These properties may change during the lifetime of a transaction, for example if a user explicitly sets the transaction name after an error has been captured.
22 | It is a known and accepted limitation that these properties are not always consistent with the transaction.
23 | Agents MUST NOT buffer errors to ensure consistency as this comes at the expense of increased memory overhead.
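
For illustration, a minimal sketch of generating the 128-bit error `id` described above, encoded as 32 hexadecimal digits (the `ErrorIds` class and `newErrorId` helper are hypothetical, not part of any agent's API):

```java
import java.security.SecureRandom;

final class ErrorIds {
    private static final SecureRandom RANDOM = new SecureRandom();

    // Generates 16 random bytes (128 bits) and hex-encodes them to 32 digits.
    static String newErrorId() {
        byte[] bytes = new byte[16];
        RANDOM.nextBytes(bytes);
        StringBuilder sb = new StringBuilder(32);
        for (byte b : bytes) {
            sb.append(String.format("%02x", b));
        }
        return sb.toString();
    }
}
```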
24 |
25 | ### Impact on the `outcome`
26 |
27 | Tracking an error that's related to a transaction does not impact its `outcome`.
28 | A transaction might have multiple errors associated with it but still return with a 2xx status code.
29 | Hence, the status code is a more reliable signal for the outcome of the transaction.
30 | This, in turn, means that the `outcome` is always specific to the protocol.
31 |
--------------------------------------------------------------------------------
/specs/agents/field-limits.md:
--------------------------------------------------------------------------------
1 | ## Field limits
2 |
3 | The maximum lengths of metadata, transaction, span, and other fields are determined
4 | by the [APM Server Events Intake API schema](https://www.elastic.co/guide/en/apm/server/current/events-api.html).
5 | Except for special cases, fields are typically limited to 1024 unicode characters.
6 | Unless listed below as known "long fields", agents SHOULD truncate field values to 1024 characters, as specified [below](#truncating-field-values).
7 |
8 | ### Long fields
9 |
10 | Some APM event fields are not limited in the APM server intake API schema.
11 | Such fields are considered "long fields".
12 |
13 | Agents SHOULD treat the following fields as long fields:
14 |
15 | - `transaction.context.request.body`, `error.context.request.body`
16 | - `transaction.context.message.body`, `error.context.message.body`
17 | - `span.context.db.statement`
18 |
19 | In addition, agents MAY treat the following fields as long fields:
20 |
21 | - `error.exception.message`
22 | - `error.log.message`
23 |
24 | Agents SHOULD limit the maximum length of long fields by [truncating](#truncating-field-values) them to 10,000 unicode characters,
25 | or based on user configuration for long field length, as specified [below](#long_field_max_length-configuration).
26 |
27 | ### `long_field_max_length` configuration
28 |
29 | Agents MAY support the `long_field_max_length` configuration option to allow
30 | the user to configure this maximum length. This option defines a maximum number
31 | of unicode characters for each field.
32 |
33 | | | |
34 | |----------------|-----------|
35 | | Type | `Integer` |
36 | | Default | `10000` |
37 | | Dynamic | `false` |
38 | | Central config | `false` |
39 |
40 | Ultimately the maximum length of any field is limited by the [`max_event_size`](https://www.elastic.co/guide/en/apm/server/current/configuration-process.html#max_event_size)
41 | configured for the receiving APM server.
42 |
43 | ### Truncating field values
44 |
45 | When field values exceed the maximum allowed number of unicode characters, agents SHOULD truncate the values to fit the maximum allowed length,
46 | replacing the last character of the eventual value with the ellipsis character (unicode character `U+2026`: "…").
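
For illustration, a minimal sketch of such truncation, counting unicode code points since the limits are defined in characters (the `FieldLimits` class and `truncate` helper are hypothetical):

```java
final class FieldLimits {
    // Truncates `value` to at most `limit` unicode code points, replacing the
    // last kept character with the ellipsis (U+2026) when truncation occurs.
    static String truncate(String value, int limit) {
        if (value == null || value.codePointCount(0, value.length()) <= limit) {
            return value;
        }
        int end = value.offsetByCodePoints(0, limit - 1);
        return value.substring(0, end) + "\u2026";
    }
}
```

For example, `truncate(value, 10000)` would apply the default long-field limit, while regular fields would use a limit of 1024.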
47 |
--------------------------------------------------------------------------------
/specs/agents/handling-huge-traces/README.md:
--------------------------------------------------------------------------------
1 | # Handling huge traces
2 |
3 | Instrumenting applications that make lots of requests (such as 10k+) to backends like caches or databases can lead to several issues:
4 | - A significant performance impact in the target application.
5 | For example due to high allocation rate, network traffic, garbage collection, additional CPU cycles for serializing, compressing and sending spans, etc.
6 | - Dropping of events in agents or APM Server due to exhausted queues.
7 | - High load on the APM Server.
8 | - High storage costs.
9 | - Decreased performance of the Elastic APM UI due to slow searches and rendering of huge traces.
10 | - Loss of clarity and overview in the UI when analyzing the traces, leading to a decreased user experience.
11 |
12 | Agents can implement several strategies to mitigate these issues.
13 | These strategies are designed to capture significant information about relevant spans while at the same time limiting the trace to a manageable size.
14 | Applying any of these strategies inevitably leads to a loss of information.
15 | However, they aim to provide a better tradeoff between cost and insight by not capturing or summarizing less relevant data.
16 |
17 | - [Hard limit on number of spans to collect](tracing-spans-limit.md) \
18 | Even after applying the most advanced strategies, there must always be a hard limit on the number of spans we collect.
19 | This is the last line of defense that comes with the highest amount of data loss.
20 | - [Collecting statistics about dropped spans](tracing-spans-dropped-stats.md) \
21 | Makes sure even if dropping spans, we at least have stats about them.
22 | - [Dropping fast exit spans](tracing-spans-drop-fast-exit.md) \
23 | If a span was blazingly fast, it's probably not worth the cost to send and store it.
24 | - [Compressing spans](tracing-spans-compress.md) \
25 | If there are a bunch of very similar spans, we can represent them in a single document - a composite span.
26 |
27 | In a nutshell, this is how the different settings work in combination:
28 |
29 | ```java
30 | if (span.transaction.spanCount > transaction_max_spans) {
31 | // drop span
32 | // collect statistics for dropped spans
33 | } else if (compression possible) {
34 | // apply compression
35 | } else if (span.duration < exit_span_min_duration) {
36 | // drop span
37 | // collect statistics for dropped spans
38 | } else {
39 | // report span
40 | }
41 | ```
42 |
--------------------------------------------------------------------------------
/specs/agents/handling-huge-traces/tracing-spans-drop-fast-exit.md:
--------------------------------------------------------------------------------
1 | # Dropping fast exit spans
2 |
3 | If an exit span was really fast, chances are that it's not relevant for analyzing latency issues.
4 | Therefore, agents SHOULD implement the strategy laid out in this section to let users choose the level of detail/cost tradeoff that makes sense for them.
5 | If an agent implements this strategy, it MUST also implement [Collecting statistics about dropped spans](tracing-spans-dropped-stats.md).
6 |
7 | ## `exit_span_min_duration` configuration
8 |
9 | Sets the minimum duration of exit spans.
10 | Exit spans with a duration below this threshold are discarded where possible.
11 | If the exit span's duration is equal to or greater than the threshold, it is kept.
12 |
13 | In some cases exit spans cannot be discarded.
14 | For example, spans that propagate the trace context to downstream services,
15 | such as outgoing HTTP requests,
16 | can't be discarded.
17 | However, external calls that don't propagate context,
18 | such as calls to a database, can be discarded using this threshold.
19 |
20 | Additionally, spans that lead to an error can't be discarded.
21 |
22 | | | |
23 | |----------------|------------|
24 | | Type | [`GranularDuration`](../configuration.md#configuration-value-types) |
25 | | Default | `0ms` |
26 | | Central config | `true` |
27 |
28 | ## Interplay with span compression
29 |
30 | If an agent implements [span compression](tracing-spans-compress.md),
31 | the limit applies to the [composite span](tracing-spans-compress.md#composite-span).
32 |
33 | For example, if 10 Redis calls are compressed into a single composite span whose total duration is lower than `exit_span_min_duration`,
34 | it will be dropped.
35 | If, on the other hand, the individual Redis calls are below the threshold,
36 | but the sum of their durations is above it, the composite span will not be dropped.
37 |
38 | ## Limitations
39 |
40 | The limitations are based on the premise that the `parent_id` of each span and transaction that's stored in Elasticsearch
41 | should point to another valid transaction or span that's present in the Elasticsearch index.
42 |
43 | A span that refers to a missing span via its `parent_id` is also known as an "orphaned span".
44 |
45 | ### Spans that propagate context to downstream services can't be discarded
46 |
47 | We only know whether to discard after the call has ended.
48 | At that point,
49 | the trace has already continued on the downstream service.
50 | Discarding the span for the external request would orphan the transaction of the downstream call.
51 |
52 | Propagating the trace context to downstream services is also known as out-of-process context propagation.
53 |
54 | ## Implementation
55 |
56 | ### `discardable` flag
57 |
58 | Spans store an additional `discardable` flag in order to determine whether a span can be discarded.
59 | The default value is `true` for [exit spans](../tracing-spans.md#exit-spans) and `false` for any other span.
60 |
61 | According to the [limitations](#Limitations),
62 | there are certain situations where the `discardable` flag of a span is set to `false`:
63 | - The span's `outcome` field is set to anything other than `success`,
64 |   so spans whose outcome indicates an issue of potential interest are not discardable.
65 | - On out-of-process context propagation.
66 |
67 | ### Determining whether to report a span
68 |
69 | If the span's duration is less than `exit_span_min_duration` and the span is discardable (`discardable=true`),
70 | the `span_count.dropped` count is incremented, and the span will not be reported.
71 | We're deliberately using the same dropped counter we also use when dropping spans due to [`transaction_max_spans`](tracing-spans-limit.md#configuration-option-transaction_max_spans).
72 | This ensures that a dropped fast span doesn't consume from the max spans limit.
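
A minimal sketch of this decision on span end, assuming hypothetical `Span`, `Transaction`, and `SpanCount` types with the fields described in this spec:

```java
import java.time.Duration;

// Hypothetical minimal types, just enough to express the decision.
interface SpanCount { void incrementDropped(); }
interface Transaction { SpanCount getSpanCount(); void trackDroppedStats(Span s); }
interface Span { boolean isDiscardable(); Duration getDuration(); Transaction getTransaction(); }

final class FastExitSpanDropper {
    // Drops discardable exit spans that were faster than the configured
    // `exit_span_min_duration` threshold; returns whether to report the span.
    static boolean shouldReport(Span span, Duration exitSpanMinDuration) {
        Transaction tx = span.getTransaction();
        if (span.isDiscardable() && span.getDuration().compareTo(exitSpanMinDuration) < 0) {
            tx.getSpanCount().incrementDropped(); // same counter as transaction_max_spans drops
            tx.trackDroppedStats(span);           // retain statistics about the dropped span
            return false;
        }
        return true;
    }
}
```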
73 |
74 | ### Metric collection
75 |
76 | To reduce the data loss, agents [collect statistics about dropped spans](tracing-spans-dropped-stats.md).
77 | Dropped spans contribute to [breakdown metrics](https://docs.google.com/document/d/1-_LuC9zhmva0VvLgtI0KcHuLzNztPHbcM0ZdlcPUl64#heading=h.ondan294nbpt) the same way as non-discarded spans.
78 |
--------------------------------------------------------------------------------
/specs/agents/handling-huge-traces/tracing-spans-dropped-stats.md:
--------------------------------------------------------------------------------
1 | # Collecting statistics about dropped spans
2 |
3 | To still retain some information about dropped spans (for example due to [`transaction_max_spans`](tracing-spans-limit.md) or [`exit_span_min_duration`](tracing-spans-drop-fast-exit.md)),
4 | agents SHOULD collect statistics on the corresponding transaction about dropped spans.
5 | These statistics MUST only be sent for sampled transactions.
6 |
7 | Agents SHOULD only collect these statistics for exit spans that have a non-empty `service.target.type` (and `service.target.name`),
8 | or a non-empty `destination.service.resource` if they don't use [Service Target fields](../tracing-spans-service-target.md).
9 |
10 | This feature used to rely on the deprecated `destination.service.resource` field, which is replaced by `service.target.type`
11 | and `service.target.name`.
12 | However, in order to preserve compatibility, we still need to provide its value in dropped spans metrics.
13 |
14 | ## Use cases
15 |
16 | This allows APM Server to consider these metrics for the service destination metrics.
17 | In practice,
18 | this means that the service map, the dependencies table,
19 | and the backend details view can show accurate throughput statistics for backends like Redis,
20 | even if most of the spans are dropped.
21 |
22 | ## Data model
23 |
24 | This is an example of the statistics that are added to the `transaction` events sent via the intake v2 protocol.
25 |
26 | ```json
27 | {
28 | "dropped_spans_stats": [
29 | {
30 | "destination_service_resource": "example.com:443",
31 | "service_target_type": "http",
32 | "service_target_name": "example.com:443",
33 | "outcome": "failure",
34 | "duration.count": 28,
35 | "duration.sum.us": 123456
36 | },
37 | {
38 | "destination_service_resource": "mysql",
39 | "service_target_type": "mysql",
40 | "outcome": "success",
41 | "duration.count": 81,
42 | "duration.sum.us": 9876543
43 | }
44 | ]
45 | }
46 | ```
47 |
48 | ### Compatibility
49 |
50 | When the `service_target_*` fields are provided, APM server has to use those fields to identify the destination.
51 |
52 | When the `service_target_*` fields are not provided, APM server has to infer equivalent values using the algorithm
53 | described in [Service Target Fields](../tracing-spans-service-target.md).
54 |
55 | ## Limits
56 |
57 | To keep these structures from growing without bounds (which is only expected in pathological cases),
58 | agents MUST limit the size of the `dropped_spans_stats` to 128 entries per transaction.
59 | Any entries that would exceed the limit are silently dropped.
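
A minimal sketch of enforcing this bound, assuming a hypothetical per-transaction map keyed by (`service_target_type`, `service_target_name`, `outcome`):

```java
import java.util.HashMap;
import java.util.Map;

final class DroppedSpansStats {
    // Hypothetical aggregate matching one `dropped_spans_stats` entry.
    static final class Entry {
        long count; // maps to duration.count
        long sumUs; // maps to duration.sum.us
    }

    private static final int MAX_ENTRIES = 128;
    private final Map<String, Entry> entries = new HashMap<>();

    void track(String key, long durationUs) {
        Entry entry = entries.get(key);
        if (entry == null) {
            if (entries.size() >= MAX_ENTRIES) {
                return; // entries beyond the 128-entry limit are silently dropped
            }
            entry = new Entry();
            entries.put(key, entry);
        }
        entry.count++;
        entry.sumUs += durationUs;
    }
}
```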
60 |
61 | ## Effects on destination service metrics
62 |
63 | As laid out in the [span destination spec](tracing-spans-destination.md#contextdestinationserviceresource),
64 | APM Server tracks span destination metrics.
65 | To avoid dropped spans to skew latency metrics and cause throughput metrics to be under-counted,
66 | APM Server will take `dropped_spans_stats` into account when tracking span destination metrics.
67 |
--------------------------------------------------------------------------------
/specs/agents/handling-huge-traces/tracing-spans-limit.md:
--------------------------------------------------------------------------------
1 | # Hard limit on number of spans to collect
2 |
3 | This is the last line of defense that comes with the highest amount of data loss.
4 | This strategy MUST be implemented by all agents.
5 | Ideally, the other mechanisms limit the amount of spans enough so that the hard limit does not kick in.
6 |
7 | Agents SHOULD also [collect statistics about dropped spans](tracing-spans-dropped-stats.md) when implementing this spec.
8 |
9 | ## Configuration option `transaction_max_spans`
10 |
11 | Limits the number of spans that are recorded per transaction.
12 |
13 | This is helpful in cases where a transaction creates a very high number of spans (e.g. thousands of SQL queries).
14 |
15 | Setting an upper limit prevents overloading the agent and the APM Server with too much work in such edge cases.
16 |
17 | | | |
18 | |----------------|----------|
19 | | Type | `integer`|
20 | | Default | `500` |
21 | | Dynamic | `true` |
22 |
23 | ## Implementation
24 |
25 | ### Span count
26 |
27 | When a span is put in the agent's reporter queue, a counter should be incremented on its transaction, in order to later identify the _expected_ number of spans.
28 | In this way we can identify data loss, e.g. because events have been dropped.
29 |
30 | This counter SHOULD internally be named `reported` and MUST be mapped to `span_count.started` in the intake API.
31 | The word `started` is a misnomer but needs to be used for backward compatibility.
32 | The rest of the spec will refer to this field as `span_count.reported`.
33 |
34 | When a span is dropped, it is not reported to the APM Server,
35 | instead another counter is incremented to track the number of spans dropped.
36 | In this case the above mentioned counter for `reported` spans is not incremented.
37 |
38 | ```json
39 | "span_count": {
40 | "started": 500,
41 | "dropped": 42
42 | }
43 | ```
44 |
45 | The total number of spans that an agent created within a transaction is equal to `span_count.started + span_count.dropped`.
46 | Note that this might be an undercount, because spans that end *after* their
47 | transaction has been reported (typically when the transaction ends) will not be
48 | counted.
49 |
50 | ### Checking the limit
51 |
52 | Before creating a span,
53 | agents must determine whether that span would exceed the span limit.
54 | The limit is reached when the number of reported spans is greater than or equal to the max number of spans.
55 | In other words, the limit is reached if this condition is true:
56 |
57 | atomic_get(transaction.span_count.eligible_for_reporting) >= transaction_max_spans
58 |
59 | On span end, agents that support the concurrent creation of spans need to check the condition again.
60 | That is because any number of spans may be started before any of them end.
61 |
62 | ```java
63 | if (atomic_get(transaction.span_count.eligible_for_reporting) <= transaction_max_spans // optional optimization
64 |     && atomic_get_and_increment(transaction.span_count.eligible_for_reporting) < transaction_max_spans ) { // get_and_increment returns the pre-increment value
65 | should_be_reported = true
66 | atomic_increment(transaction.span_count.reported)
67 | } else {
68 | should_be_reported = false
69 | atomic_increment(transaction.span_count.dropped)
70 | transaction.track_dropped_stats(this)
71 | }
72 | ```
73 |
74 | `eligible_for_reporting` is another counter in the span_count object, but it's not reported to APM Server.
75 | It's similar to `reported` but the value may be higher.
76 |
77 | ### Configuration snapshot
78 |
79 | To ensure consistent behavior within one transaction,
80 | the `transaction_max_spans` option should be read once on transaction start.
81 | Even if the option is changed via remote config during the lifetime of a transaction,
82 | the value that has been read at the start of the transaction should be used.
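
A minimal sketch of such a snapshot, assuming a hypothetical `Config` type backed by remote configuration:

```java
// Hypothetical dynamic configuration source (its values may change at any time).
interface Config {
    int getTransactionMaxSpans();
}

final class TransactionState {
    // Snapshot taken once at transaction start; later remote-config updates
    // do not affect this in-flight transaction.
    final int maxSpans;

    TransactionState(Config config) {
        this.maxSpans = config.getTransactionMaxSpans();
    }
}
```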
83 |
84 | ### Metric collection
85 |
86 | Even though we can determine whether to drop a span before starting it, it's not legal to return a `null` or noop span in that case.
87 | That's because we're [collecting statistics about dropped spans](tracing-spans-dropped-stats.md) as well as
88 | [breakdown metrics](https://docs.google.com/document/d/1-_LuC9zhmva0VvLgtI0KcHuLzNztPHbcM0ZdlcPUl64#heading=h.ondan294nbpt)
89 | even for spans that exceed `transaction_max_spans`.
90 |
91 | For spans that are known to be dropped upfront, Agents SHOULD NOT collect information that is expensive to get and not needed for metrics collection.
92 | This includes capturing headers, request bodies, and summarizing SQL statements, for example.
93 |
--------------------------------------------------------------------------------
/specs/agents/log-correlation.md:
--------------------------------------------------------------------------------
1 | ## Log correlation
2 |
3 | Agents should provide instrumentation/hooks for popular logging libraries in order to decorate structured log records with trace context.
4 | In particular, logging that occurs within the context of a transaction should add the fields `trace.id` and `transaction.id`;
5 | logging that occurs within a span should add the fields `trace.id` and optionally `transaction.id`.
6 |
7 | By adding trace context to log records, users will be able to move between the APM UI and Logs UI.
8 |
9 | Logging frameworks and libraries may provide a way to inject key-value pairs into log messages,
10 | which allows those fields to be reused in log message formats (for example in plain text).
11 |
12 | Log correlation relies on two sets of fields:
13 | - [metadata fields](#service-correlation-fields)
14 |   - They allow building the per-service logs view in the UI.
15 |   - They are implicitly provided by the agent metadata when using log sending.
16 |   - When using ECS logging, they might be set by the application.
17 | - [per-log-event fields](#trace-correlation-fields): `trace.id`, `transaction.id` and `error.id`
18 |   - They allow building the per-trace/transaction/error logs view in the UI.
19 |   - They are added to the log event.
20 |   - They must be written in each log event document.
21 |
22 | The values for those fields can be set in two places:
23 | - when using [ecs-logging](https://github.com/elastic/ecs-logging) directly in the application
24 | - when the agent reformats a log event
25 |
26 | The values set at the application level have higher priority than the values set by agents.
27 | Agents must provide fallback values if they are not explicitly set by the application.
28 |
29 | If the values set in the application and the agent configuration differ, the resulting log
30 | messages won't correlate to the expected service in the UI. To prevent such inconsistencies,
31 | agents may issue a warning when there is a misconfiguration.
32 |
33 | ### Service correlation fields
34 |
35 | They allow building the per-service logs view in the UI.
36 | They are implicitly provided by the agent metadata when using log sending.
37 | When using ECS logging, they might be set by the application in the ECS logging configuration (a resolution sketch follows the list below).
38 |
39 | - `service.name`:
40 | - used to filter/link log messages to a given service.
41 | - must be provided even if there is no active transaction
42 | - Configuration source (in order of precedence):
43 | - Configured value
44 | - `ELASTIC_APM_SERVICE_NAME`
45 | - `OTEL_SERVICE_NAME`
46 | - `OTEL_RESOURCE_ATTRIBUTES` value for `service.name`
47 | - Default from Elastic Agent (if available)
48 | - `service.version`:
49 | - only used for service metadata correlation
50 | - must be provided even if there is no active transaction
51 | - Configuration source (in order of precedence):
52 | - Configured value
53 | - `ELASTIC_APM_SERVICE_VERSION`
54 | - `OTEL_RESOURCE_ATTRIBUTES` value for `service.version`
55 | - Default from Elastic Agent (if available)
56 | - `service.environment`:
57 | - allows to filter/link log messages to a given service/environment.
58 | - must be provided even if there is no active transaction
59 | - Configuration source (in order of precedence):
60 | - Configured value
61 | - `ELASTIC_APM_ENVIRONMENT`
62 | - `OTEL_RESOURCE_ATTRIBUTES` value for `deployment.environment`
63 | - Default from Elastic Agent (if available)
64 | - `service.node.name`:
65 | - must be provided even if there is no active transaction
66 | - Configuration source (in order of precedence):
67 | - Configured value
68 | - `ELASTIC_APM_SERVICE_NODE_NAME`
69 | - `OTEL_RESOURCE_ATTRIBUTES` value for `service.instance.id`
70 | - Default from Elastic Agent (if available)
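
A minimal sketch of resolving `service.name` according to the precedence above (the class and helper names are illustrative, and the `OTEL_RESOURCE_ATTRIBUTES` parsing is simplified):

```java
final class ServiceNameResolver {
    static String resolve(String configuredValue) {
        if (configuredValue != null) return configuredValue;
        String name = System.getenv("ELASTIC_APM_SERVICE_NAME");
        if (name != null) return name;
        name = System.getenv("OTEL_SERVICE_NAME");
        if (name != null) return name;
        String resourceAttrs = System.getenv("OTEL_RESOURCE_ATTRIBUTES");
        if (resourceAttrs != null) {
            // OTEL_RESOURCE_ATTRIBUTES is a comma-separated list of key=value pairs.
            for (String pair : resourceAttrs.split(",")) {
                String[] kv = pair.split("=", 2);
                if (kv.length == 2 && kv[0].trim().equals("service.name")) {
                    return kv[1].trim();
                }
            }
        }
        return defaultFromElasticAgent();
    }

    // Hypothetical stand-in for the default from Elastic Agent (if available).
    static String defaultFromElasticAgent() {
        return null;
    }
}
```

The other service correlation fields would resolve analogously with their respective environment variables.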
71 |
72 |
73 | The `container.id` field can also be used as a fallback to provide service-level correlation in the UI; however, agents ARE NOT expected to set it:
74 |
75 | - the log collector (Filebeat) is expected to set it when ingesting logs.
76 | - all data sent through the agent intake implicitly provides `container.id` through metadata, which also covers the log events that may be sent to apm-server.
77 |
78 | ### Trace correlation fields
79 |
80 | They allow building the per-trace/transaction/error logs view in the UI.
81 | They allow navigating from a log event to the trace/transaction/error in the UI.
82 | They should be added to the log event.
83 | They must be written in each log event document they relate to, whether reformatted or sent by the agent (a sketch follows the field list below).
84 |
85 | - `trace.id`
86 | - `transaction.id`
87 | - `error.id`
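
A minimal sketch of exposing these fields to a logging framework, assuming SLF4J's `MDC` as the key-value mechanism (the class and method names are illustrative):

```java
import org.slf4j.MDC;

final class TraceLogCorrelation {
    // When a transaction becomes active, expose the correlation ids so the
    // logging framework can write them into every log event it produces.
    static void onActivate(String traceId, String transactionId) {
        MDC.put("trace.id", traceId);
        MDC.put("transaction.id", transactionId);
    }

    // When the transaction deactivates, remove the fields again so unrelated
    // log events are not wrongly correlated.
    static void onDeactivate() {
        MDC.remove("trace.id");
        MDC.remove("transaction.id");
    }
}
```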
88 |
--------------------------------------------------------------------------------
/specs/agents/log-reformatting.md:
--------------------------------------------------------------------------------
1 | # Log reformatting
2 |
3 | Agents will be a critical part of log collection onboarding for
4 | application logs. This is primarily accomplished via the `log_ecs_reformatting`
5 | configuration option, described below.
6 |
7 | In future iterations, the shipping of ECS logs will become more automated by auto-parsing ECS-JSON logs in Filebeat
8 | and [automatically shipping log files](https://github.com/elastic/apm/issues/374) that got reformatted via
9 | `log_ecs_reformatting`.
10 |
11 | ## `log_ecs_reformatting` configuration
12 |
13 | Configures the agent to automatically format application logs as ECS-compatible JSON
14 | (if possible).
15 |
16 | The configuration option must be marked experimental for now to allow for breaking changes we may need to introduce.
17 | Once the end-to-end process for seamless log onboarding with Elastic Agent works, we'll remove the experimental flag.
18 |
19 | As the implementation of this configuration option will be specific for each supported log library,
20 | the supported technologies documentation should list the supported frameworks (including supported version ranges)
21 | and the agent version that introduced support for each logging library.
22 |
23 | | | |
24 | |----------------|---|
25 | | Valid options | `override`, `off` (case insensitive) |
26 | | Default | `off` |
27 | | Dynamic | `false` |
28 | | Central config | `false` |
29 |
30 | Not all agents will be able to automatically format logs in this way. Those
31 | agents should not implement this configuration option.
32 |
33 | For some agents, additional options make sense. For example, the Java agent
34 | also accepts the values `shade` and `replace`, where ECS-reformatted logs are written to a dedicated `.ecs.json`
35 | file in addition to (`shade`) or instead of (`replace`) the original log stream.
36 |
37 | When this option is set to `override`, the agent should format all logs from the
38 | app as ECS-compatible JSON, as shown in the
39 | [spec](https://github.com/elastic/ecs-logging/blob/main/spec/spec.json).
40 |
41 | For all options other than `off`, the [log correlation](log-correlation.md) should be implicitly enabled.
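
As an illustration, a plain-text log record and the ECS-JSON document it could be reformatted into (a sketch as a Python literal; the exact fields depend on the logging framework and agent):

```python
# Original plain-text output of the application's layout:
#   2024-05-01 12:00:00,000 [main] ERROR MyController - charge failed
#
# The same event after ECS reformatting, serialized as one ND-JSON line:
ecs_event = {
    "@timestamp": "2024-05-01T12:00:00.000Z",
    "log.level": "error",
    "message": "charge failed",
    "ecs.version": "1.2.0",
    "log.logger": "MyController",
    "process.thread.name": "main",
    "service.name": "my-service",
}
```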
42 |
43 | ## `log_ecs_formatter_allow_list` configuration
44 |
45 | Only formatters that match an item on this list will be automatically reformatted to ECS when `log_ecs_reformatting` is
46 | set to any option other than `off`. A "formatter" is a generic name used to describe the logging-framework-specific entity
47 | that is responsible for the formatting of log events. Currently this option is only implemented in the Java agent, where
48 | formatters are subtypes of `Layout` or `Encoder`, depending on the logging framework.
49 |
50 | | | |
51 | |----------------|---|
52 | | Type | `List<`[`WildcardMatcher`](../../tests/agents/json-specs/wildcard_matcher_tests.json)`>` |
53 | | Default | agent specific |
54 | | Dynamic | `false` |
55 | | Central config | `false` |
56 |
57 | ## Required fields
58 |
59 | The following fields are required:
60 |
61 | * `@timestamp`
62 | * `log.level`
63 | * `message`
64 | * `ecs.version`
65 |
66 | ## Recommended fields
67 |
68 | ### `service.name`
69 |
70 | See [Log correlation](log-correlation.md)
71 |
72 | ### `service.version`
73 |
74 | See [Log correlation](log-correlation.md)
75 |
76 | ### `service.environment`
77 |
78 | See [Log correlation](log-correlation.md)
79 |
80 | ### `event.dataset`
81 |
82 | The `event.dataset` field is used to power the [log anomaly chart in the logs UI](https://www.elastic.co/guide/en/observability/current/inspect-log-anomalies.html#anomalies-chart).
83 | The dataset can also be useful to filter for different log streams from the same pod, for example.
84 | This field should be a step more granular than
85 | `service.name` where possible. However, the cardinality of this field should be
86 | limited, so per-class or per-file logger names are not appropriate for this
87 | field.
88 |
89 | A good example is in the Java agent, where `event.dataset` is set to
90 | `${service.name}.${appender.name}`, where `appender.name` is the name of the
91 | log appender.
92 |
93 | If an agent doesn't have reasonable options for this field, it should be set
94 | to `${service.name}`.
95 |
96 | Some examples:
97 | - opbeans
98 | - opbeans.checkout
99 | - opbeans.login
100 | - opbeans.audit
101 |
102 | ## Testing
103 |
104 | Due to differences in the possible Agent implementations of this feature, no
105 | Gherkin spec is provided. Testing will primarily be accomplished via Opbeans.
106 | Each Agent team should update their Opbeans app so that it only relies on this
107 | configuration option to format ECS logs that will be picked up by Filebeat.
108 |
--------------------------------------------------------------------------------
/specs/agents/log-sending.md:
--------------------------------------------------------------------------------
1 | # Log sending
2 |
3 | ### `log_sending` configuration
4 |
5 | **Warning**: experimental feature, may be subject to change until GA. Also, only a small subset of agents will provide it before GA.
6 |
7 | Controls the ability to send logs directly from the agent to APM server.
8 |
9 | | | |
10 | |----------------|-----------------|
11 | | Valid options | `true`, `false` |
12 | | Default | `false` |
13 | | Dynamic | `true` |
14 | | Central config | `true` |
15 |
16 | When set to `true`, the agent will send log events to apm-server.
17 | Original log events are unaltered and written to their usual destinations (file, stdout, ...).
18 |
19 | The APM server only supports log events as of version 8.6; thus, trying to use this feature with an older version should
20 | issue a warning/error in the agent logs.
21 |
22 | ### Log event format
23 |
24 | On the agent side, there are two ways to get an ECS-formatted log message from a log event:
25 | - The application already uses [ecs-logging](https://github.com/elastic/ecs-logging)
26 | - The agent embeds a copy of [ecs-logging](https://github.com/elastic/ecs-logging), which might also be used for [log reformatting](./log-reformatting.md).
27 |
28 | In both cases, the output of [ecs-logging](https://github.com/elastic/ecs-logging) can be reused as follows:
29 |
30 | ```
31 | {"log":<ecs-log-event>}\n
32 | ```
33 |
34 | The ECS logging event `<ecs-log-event>` must not include an `EOL` character in order to preserve the ND-JSON
35 | format where each event is written to a single line.
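
A minimal Python sketch of this wrapping (assuming the event is available as a dict; `json.dumps` escapes newline characters inside string values, preserving the one-event-per-line format):

```python
import json

def to_intake_line(ecs_event: dict) -> str:
    # Wrap the ECS log event for the intake ND-JSON stream.
    return json.dumps({"log": ecs_event}, separators=(",", ":")) + "\n"

to_intake_line({"@timestamp": "2024-05-01T12:00:00.000Z",
                "log.level": "info",
                "message": "hello",
                "ecs.version": "1.2.0"})
# -> '{"log":{"@timestamp":"2024-05-01T12:00:00.000Z","log.level":"info",...}}\n'
```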
36 |
37 | ### Log event fields
38 |
39 | The ECS logging fields are the same as the ones defined in log reformatting:
40 | - [required fields](./log-reformatting.md#required-fields)
41 | - [recommended fields](./log-reformatting.md#recommended-fields)
42 |
43 | However, the values of `service.name` and `service.version` can be omitted, as they are redundant with the values that are
44 | already sent in the [ND-JSON metadata](metadata.md). In the case where the formatted ECS log event already contains
45 | them, the agent may send the event as-is rather than rewriting it, in order to reduce overhead.
46 |
47 | ### Agent log
48 |
49 | When `log_sending` option is enabled, agents may also send their own logs to APM server.
50 |
51 | Agents usually have internal debug/trace logging statements that allow diagnosing communication issues and inspecting the serialized data
52 | sent to APM server. Special care must be taken to ensure that sending APM agent logs does not trigger an exponential loop
53 | of log events or excessively large log events.
54 | For APM agent logs, ignoring those log statements is an acceptable compromise -- diagnosis of agent errors in serializing or communicating with APM server may rely on local logging.
55 |
56 | When the agent starts, agent log events might require some limited buffering until the agent initialization is complete.
57 | This allows capturing the early log messages emitted while the agent initializes, which often provide details about the agent
58 | setup and configuration that are required for support.
59 |
60 | For the `event.dataset` field, the `${service.name}.apm-agent` value should be used to allow keeping application logs
61 | and agent logs separate if needed.
62 |
--------------------------------------------------------------------------------
/specs/agents/logging.md:
--------------------------------------------------------------------------------
1 | # Agent logging
2 |
3 | ## `log_level` configuration
4 |
5 | Sets the logging level for the agent.
6 |
7 | This option is case-insensitive.
8 |
9 | | | |
10 | |----------------|---|
11 | | Valid options | `trace`, `debug`, `info`, `warning`, `error`, `critical`, `off` |
12 | | Default | `info` (soft default) |
13 | | Dynamic | `true` |
14 | | Central config | `true` |
15 |
16 | Note that this default is not enforced among all agents.
17 | If an agent development team thinks that a different default should be used
18 | (such as `warning`), that is acceptable.
19 |
20 | ## Mapping to native log levels
21 |
22 | Not all logging frameworks used by the different agents can natively work with these levels.
23 | Thus, agents will need to translate them, using their best judgment for the mapping.
24 |
25 | Some examples:
26 | If the logging framework used by an agent doesn't have `trace`,
27 | it would map it to the same level as `debug`.
28 | If the underlying logging framework doesn't support `critical`,
29 | agents can treat that as a synonym for `error` or `fatal`.
30 |
31 | The `off` level is a switch to completely turn off logging.
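
As an example, a sketch of such a mapping onto Python's `logging` module, which has no native `trace` or `off` level:

```python
import logging

LEVEL_MAP = {
    "trace": logging.DEBUG,        # no native trace level: same as debug
    "debug": logging.DEBUG,
    "info": logging.INFO,
    "warning": logging.WARNING,
    "error": logging.ERROR,
    "critical": logging.CRITICAL,
    "off": logging.CRITICAL + 1,   # higher than any level: disables logging
}

def to_native_level(log_level: str) -> int:
    # log_level is case-insensitive per this spec
    return LEVEL_MAP[log_level.strip().lower()]
```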
32 |
33 | ## Backwards compatibility
34 |
35 | Most agents have already implemented `log_level`,
36 | accepting a different set of levels.
37 | Those agents should still accept their "native" log levels to preserve backwards compatibility.
38 | However, in central config,
39 | there will only be a dropdown with the levels that are consistent across agents.
40 | Also, the documentation should not mention the old log levels going forward.
41 |
42 | ## Logging
43 |
44 | Agents may provide the following log-related features:
45 |
46 | - [Log correlation](log-correlation.md): inject service metadata, trace IDs and error IDs in log events.
47 | - [Log reformatting](log-reformatting.md): reformat plain-text logs to ECS, equivalent to using [ecs logging](https://github.com/elastic/ecs-logging)
48 | without modifying the application nor its dependencies.
49 | - [Log sending](log-sending.md): send logs directly to APM server.
50 |
51 | ## Logging Preamble
52 |
53 | The intention of this logging preamble is to ensure agent supportability. Relevant
54 | data about an agent (e.g. version) and the environment it is running in (e.g. host,
55 | operating system) should be provided in it.
56 |
57 | All agents MUST print this preamble on startup using the `info` logging level unless
58 | a different level is explicitly mentioned.
59 |
60 | The agent logging preamble consists of 3 blocks:
61 |
62 | * **Agent**: This block is mandatory and contains basic version and build date information.
63 | * **Environment**: This block is optional but for supportability reasons it should be provided.
64 | * **Configuration**: This block is mandatory and contains a minimum set of relevant configuration values.
65 |
66 | **Note** that this specification does not prescribe a specific format to be used for creating
67 | the log messages. It is up to the implementing agent to choose a format (e.g. ecs-logging format).
68 |
69 | ### Agent
70 |
71 | On startup, all APM agents MUST log basic information regarding their technology (language, runtime),
72 | and version information.
73 | This log message MUST provide sufficient data to uniquely identify the agent build that generated the
74 | log message. Hence, if e.g. the version information is not sufficient, agents
75 | MUST include further information (e.g. build timestamp, git hash) that uniquely identifies an agent build.
76 |
77 | This SHOULD be the very first log message that is created by an agent.
78 |
79 | Example:
80 |
81 | ```text
82 | Elastic APM .NET Agent, version: 1.19.1-preview, build date: 2022-10-27 10:55:42 UTC
83 | ```
84 |
85 | Agents SHOULD also detect when they are running in a non-final version (e.g. a debug
86 | or pre-release build) and report that fact using the `warning` logging level.
87 |
88 | Example:
89 |
90 | ```text
91 | This is a pre-release version and not intended for use in production environments!
92 | ```
93 |
94 | ### Environment
95 |
96 | Additionally, agents SHOULD report information about their environment (e.g. host, process, runtime).
97 |
98 | | Item | Description | Example |
99 | | - | - | - |
100 | | Process ID | The Process ID in decimal format. | `83606` |
101 | | Process Name | The executable image name or the full path to it. | `w3wp.exe`, `/usr/local/share/dotnet/dotnet` |
102 | | Command Line | The full command line used to launch this process as available to the runtime. [1] | `/Users/acme/some_app/bin/Debug/net7.0/some_app.dll foo=bar` |
103 | | Operating System | OS name and version in a human-readable format. | `macOS Version 12.6.1 (build 21G217)` |
104 | | CPU architecture | See table below. | `arm64` |
105 | | Host | The (optionally fully-qualified) host name. | `MacBook-Pro.localdomain` |
106 | | Time zone | The local time zone in UTC-offset notation. | `UTC+0200` |
107 | | Runtime | Name and version of the executing runtime. | `.NET Framework 4.8.4250.0`|
108 | | Framework | Name and version of the instrumented framework. | `Django 4.1.3`, `ASP.NET 4.8.4494.0`|
109 |
110 | [1]: Due to privacy concerns in the past (see e.g. [here](https://github.com/elastic/apm-agent-nodejs/issues/1916)),
111 | agents may decide to not log this information.
112 |
113 | **CPU Architecture:**
114 |
115 | This table provides an exemplary list of well-known values for reporting the CPU architecture.
116 | An agent can decide to use different values that might be readily available to their language/runtime
117 | ecosystem (e.g. Node.js' `os.arch()`).
118 |
119 | | Value | Description |
120 | | - | - |
121 | | `amd64` | AMD64 |
122 | | `arm32` |ARM32 |
123 | | `arm64` |ARM64 |
124 | | `ia64` | Itanium |
125 | | `ppc32` | 32-bit PowerPC |
126 | | `ppc64` | 64-bit PowerPC |
127 | | `s390x` | IBM z/Architecture |
128 | | `x86` | 32-bit x86 |
129 |
130 | ### Configuration
131 |
132 | The start of the configuration block MUST be denoted as such (e.g. `Agent Configuration:`).
133 |
134 | If configuration files are used in the configuration process, their fully-qualified paths
135 | SHOULD be logged.
136 |
137 | Configuration item names SHOULD be provided in normalized (lower-case, snake_case) notation.
138 | Configuration value strings MUST be printed in quotes (so accidental leading or trailing whitespace can be spotted).
139 |
140 | Agents SHOULD log all configuration items that do not have default values.
141 | At the very minimum, agents MUST provide information about following essential configuration items.
142 | Items denoted as *"Log always"* MUST be logged in any case (i.e. having a default value or a custom one).
143 |
144 | | Item | Needs masking | Log Always | Example |
145 | | - | - | - | - |
146 | | `server_url` | no | yes | `http://localhost:8200` [2] |
147 | | `service_name` | no | yes | `foo` |
148 | | `service_version` | no | yes | `42` |
149 | | `log_level` | no | yes | `warning` |
150 | | `secret_token` | yes | no | `[REDACTED]` |
151 | | `api_key` | yes | no | `[REDACTED]` |
152 |
153 | [2]: Agents MAY decide to mask potential sensitive data (e.g. basic authentication information)
154 | that could be part of this URL.
155 |
156 | For each configuration option its **source** SHOULD be reported. These sources can be:
157 |
158 | * `default`
159 | * `environment`: Environment variable
160 | * `file`: Configuration file
161 | * `central`: Central Configuration
162 | * **Note:** Agents MAY print their configuration block again on changes in the central configuration.
163 |
164 | Example:
165 |
166 | ```text
167 | Agent Configuration:
168 | - configuration files used:
169 | - '/path/to/some/config.json'
170 | - '/path/to/some/other/config.xml'
171 | - server_url: 'http://localhost:8200' (default)
172 | - secret_token: [REDACTED] (environment)
173 | - api_key: [REDACTED] (default)
174 | - service_name: 'unknown-dotnet-service' (default)
175 | - log_level: 'info' (file)
176 | - disable_metrics: '*' (file)
177 | ```
178 |
--------------------------------------------------------------------------------
/specs/agents/metrics.md:
--------------------------------------------------------------------------------
1 | ## Metrics
2 |
3 | Agents periodically collect and report various metrics, described below.
4 |
5 | ### System/process CPU/Heap
6 |
7 | All agents (excluding JavaScript RUM) should record the following basic system/process metrics:
8 |
9 | - `system.cpu.total.norm.pct`: system CPU usage since the last report, in the range `[0,1]` (0-100%)
10 | - `system.process.cpu.total.norm.pct`: process CPU usage since the last report, in the range `[0,1]` (0-100%)
11 | - `system.memory.total`: total usable (but not necessarily available) memory on the system, in bytes
12 | - `system.memory.actual.free`: total available memory on the system, in bytes
13 | - `system.process.memory.size`: process virtual memory size, in bytes
14 | - `system.process.memory.rss.bytes`: process resident set size, in bytes
15 |
16 | ### cgroup metrics
17 |
18 | Where applicable, all agents (excluding JavaScript RUM) should record the following cgroup metrics:
19 |
20 | - `system.process.cgroup.memory.mem.limit.bytes`
21 | - `system.process.cgroup.memory.mem.usage.bytes`
22 |
23 | #### Metrics source
24 |
25 | ##### [cgroup-v1](https://www.kernel.org/doc/Documentation/cgroup-v1/memory.txt)
26 | - `system.process.cgroup.memory.mem.limit.bytes` - based on the `memory.limit_in_bytes` file
27 | - `system.process.cgroup.memory.mem.usage.bytes` - based on the `memory.usage_in_bytes` file
28 |
29 | ##### [cgroup-v2](https://www.kernel.org/doc/Documentation/cgroup-v2.txt)
30 | - `system.process.cgroup.memory.mem.limit.bytes` - based on the `memory.max` file
31 | - `system.process.cgroup.memory.mem.usage.bytes` - based on the `memory.current` file
32 |
33 | #### Discovery of the memory files
34 |
35 | All files mentioned above are located in the same directory. Ideally, we can discover this directory by parsing the `/proc/self/mountinfo` file, looking for the memory mount line and extracting the path from it. An example of such a line is:
36 | ```
37 | 436 431 0:33 /docker/5042cfbb4ab36fcef9ca5f1eda54f40265c6ef3fe0694dfe34b9b474e70f8df5 /sys/fs/cgroup/memory ro,nosuid,nodev,noexec,relatime master:22 - cgroup memory rw,memory
38 | ```
39 | The regex `^\d+? \d+? .+? .+? (.*?) .*cgroup.*memory.*` works in the cgroup-v1 systems tested so far, where the first and only group should be the directory path. However, it will probably take a few iterations and tests on different container runtimes and OSs to get it right.
40 | There is no regex currently suggested for cgroup-v2. Look in other agent PRs to get ideas.
41 |
42 | Whenever agents fail to discover the memory mount path, they should default to `/sys/fs/cgroup/memory`.
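
A sketch of this discovery logic in Python, using the regex above (illustrative only, not an authoritative implementation):

```python
import re
from pathlib import Path

MEMORY_MOUNT_RE = re.compile(r"^\d+? \d+? .+? .+? (.*?) .*cgroup.*memory.*")
DEFAULT_MEMORY_MOUNT = "/sys/fs/cgroup/memory"

def discover_memory_mount(mountinfo: str = "/proc/self/mountinfo") -> str:
    # Scan mountinfo for the cgroup-v1 memory mount line and extract the
    # mount path (the first and only regex group); fall back to the default.
    try:
        for line in Path(mountinfo).read_text().splitlines():
            match = MEMORY_MOUNT_RE.match(line)
            if match:
                return match.group(1)
    except OSError:
        pass
    return DEFAULT_MEMORY_MOUNT
```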
43 |
44 | #### Special values for unlimited memory quota
45 |
46 | Special values are used to indicate that the cgroup is not configured with a memory limit. In cgroup v1, this value is numeric - `0x7ffffffffffff000` and in cgroup v2 it is represented by the string `max`.
47 | Agents should not send the `system.process.cgroup.memory.mem.limit.bytes` metric whenever these special values are set.
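
A sketch of how an agent might honor these special values when reading the limit file (file names per the tables above):

```python
from pathlib import Path

CGROUP_V1_UNLIMITED = 0x7FFFFFFFFFFFF000  # cgroup v1 "no limit" sentinel

def read_memory_limit(limit_file: str):
    # Returns the limit in bytes, or None when the cgroup is unlimited,
    # in which case the mem.limit.bytes metric should not be sent.
    raw = Path(limit_file).read_text().strip()
    if raw == "max" or int(raw) == CGROUP_V1_UNLIMITED:  # v2 / v1 sentinels
        return None
    return int(raw)
```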
48 |
49 | ### Runtime
50 |
51 | Agent should record runtime-specific metrics, such as garbage collection pauses. Due to their runtime-specific nature, these will differ for each agent.
52 |
53 | When capturing runtime metrics, keep in mind the end use-case: how will they be used? Is the format in which they are recorded appropriate for visualization in Kibana? Do not record metrics just because it is easy; record them because they are useful.
54 |
55 | ### Transaction and span breakdown
56 |
57 | Agents should record "breakdown metrics", which is a summarization of how much time is spent per span type/subtype in each transaction group. This is described in detail in the [Breakdown Graphs](https://docs.google.com/document/d/1-_LuC9zhmva0VvLgtI0KcHuLzNztPHbcM0ZdlcPUl64#heading=h.ondan294nbpt) document, so we do not repeat it here.
58 |
59 | ### Agent Health and Overhead Metrics
60 |
61 | Agents SHOULD record metrics which give insight into the agent's health state. This is explained in detail in [this spec](metrics-health.md).
62 |
63 | ### OpenTelemetry Metrics
64 |
65 | OpenTelemetry provides an API for collecting user defined metrics. Agents SHOULD allow custom metric collection via this API, this is described in detail in [this spec](metrics-otel.md).
66 |
67 | ## Shutdown behavior
68 |
69 | Agents should make an effort to flush any metrics before shutting down.
70 | If this cannot be achieved with shutdown hooks provided by the language/runtime, the agent should provide a public API that the user can call to flush any remaining data.
--------------------------------------------------------------------------------
/specs/agents/mobile/configuration.md:
--------------------------------------------------------------------------------
1 | ## Mobile Configuration
2 |
3 | This document describes the configurable parameters used in mobile agents. The ones supported
4 | by [central configuration](../configuration.md) can be set through Kibana's APM Settings.
5 |
6 | ### `recording` configuration
7 |
8 | A boolean specifying if the agent should be recording or not. When recording, the agent instruments incoming HTTP
9 | requests, tracks errors and collects and sends metrics. When not recording, the agent works as a noop, not collecting
10 | data and not communicating with the APM sever, except for polling the central configuration endpoint. As this is a
11 | reversible switch, agent threads are not being killed when inactivated, but they will be mostly idle in this state, so
12 | the overhead should be negligible.
13 |
14 | You can use this setting to dynamically disable Elastic APM at runtime.
15 |
16 | | | |
17 | |----------------|-----------|
18 | | Type | `Boolean` |
19 | | Default | `true` |
20 | | Central config | `true` |
21 |
--------------------------------------------------------------------------------
/specs/agents/mobile/metrics.md:
--------------------------------------------------------------------------------
1 | ## Mobile Metrics
2 |
3 | ### CPU metrics
4 | | Name | Type | Units | Description |
5 | |--------------------|------------------|------------|---------------------------------|
6 | | `system.cpu.usage` | Gauge | percentage | A percentage value of cpu usage |
7 |
8 | ### Memory Metrics
9 | | Name | Type | Units | Description |
10 | |------------------------|------------------|-------|-----------------------------------------|
11 | | `system.memory.usage` | Gauge | bytes | The application's memory usage in bytes |
12 |
13 |
14 | ### Application Metrics
15 | #### load times
16 | | Name | Type | Units | Description |
17 | |--------------------------------------|--------------------------------|---------|-----------------------------------------------------------------------|
18 | | `application.launch.time` | histogram(iOS), gauge(Android) | milliseconds | The amount of time spent launching the app |
19 |
20 | | Labels | Values | Description |
21 | |--------|-------------------------------------------------|-----------------------------------------------------|
22 | | `type` | `first draw`, `first draw (optimized)`, `resume`| The type of application launch that is being timed. |
23 |
24 | #### responsiveness
25 | | Name | Type | Units | Description |
26 | |----------------------------------------|-----------|---------|-------------------------------------------------------------|
27 | | `application.responsiveness.hangtime`  | histogram | milliseconds | The amount of time the application has spent unresponsive. |
28 |
29 | ### Application exit
30 | Traces application exit counts in both healthy and unhealthy (crashes) states
31 |
32 | | Name | Type | Units | Description |
33 | |---------------------|-------|-------|-------------------------------|
34 | | `application.exits` | count | unit | A count of application exits. |
35 |
36 |
37 | | Labels | Values | Description |
38 | |------------|------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------|
39 | | `appState` | `background`, `foreground` | This denotes whether the application exited in the background or foreground |
40 | | `type` | `memoryResourceLimit`, `AppWatchDog`, `BadAccess`, `Abnormal`, `IllegalInstruction`, `Normal` | The cause of the application exit. All but normal could be considered a crash. |
41 |
42 |
43 |
44 |
--------------------------------------------------------------------------------
/specs/agents/mobile/session.md:
--------------------------------------------------------------------------------
1 | # Session
2 |
3 | Status: [Experimental](https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/document-status.md)
4 |
5 | ### Overview
6 | A `session` is a collection of `logs`, `events`, `transactions` and `spans` (`LETS`) associated with a specific device within a specific period of time.
7 | A `session` is represented by a unique identifier that is attached to `LETS` as an attribute.
8 |
9 | The primary purpose of `sessions` is to provide insight into the series of user actions or events that lead up to a critical error or crash. Sessions also provide a means to quantify application usage.
10 |
11 | This document depends on the Open Telemetry semantic convention for [session](https://github.com/open-telemetry/semantic-conventions/blob/main/docs/general/session.md). Due to the dependency on Open Telemetry's events API this document's contents are subject to change and considered [experimental](https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/document-status.md).
12 |
13 | ### How a session operates
14 | - All `LETS` will have a `session` identifier attached as an attribute using the name `session.id`.
15 | - After the timeout period elapses, the `session` identifier will be refreshed.
16 | - The timeout period will be restarted when any `LETS` is recorded.
17 |
18 |
19 | #### The session timeout period can be customized.
20 | The default session timeout should be 30 minutes. Customizing the timeout should only be done when the agent is configured, and the value shouldn't be updated in the middle of a session.
21 |
22 | #### Max session length
23 | Sessions will be limited to a maximum length of four (4) hours. This limitation will be implemented in the mobile agent.
24 |
--------------------------------------------------------------------------------
/specs/agents/otel-distribution.md:
--------------------------------------------------------------------------------
1 |
2 | ## Terminology
3 |
4 | **Vanilla OpenTelemetry Distribution**: this is the "upstream" OpenTelemetry distribution that is maintained by the OpenTelemetry community.
5 | Implementation differs per platform, but it usually consists of an API/SDK and can also provide automatic instrumentation.
6 |
7 | **Elastic OpenTelemetry Distribution**: this is an OpenTelemetry distribution provided by Elastic that is derived from the
8 | _Vanilla OpenTelemetry Distribution_.
9 |
10 | ## General guidelines
11 |
12 | These statements are guiding principles of the Elastic OpenTelemetry distributions; they should be considered more as advice than strict rules.
13 |
14 | Elastic OpenTelemetry distribution SHOULD ideally:
15 | - behave as drop-in replacements of their upstream counterparts
16 | - provide a simple setup and favor onboarding experience (aka "things should work by default").
17 | - avoid capturing potentially confusing data (see [system metrics](#system-metrics) example below).
18 |
19 | ## Configuration
20 |
21 | Elastic OpenTelemetry distributions MAY override the default configuration.
22 | When doing so, user configuration should remain consistent with the vanilla distribution:
23 | - explicit user configuration SHOULD remain effective
24 | - overridden default configuration MUST have the ability to be restored to the upstream default
25 |
26 | Elastic specific configuration items MUST be prefixed with `ELASTIC_OTEL_`.
27 | For example, the [universal profiling integration](#universal-profiling-integration) can be enabled with `ELASTIC_OTEL_UNIVERSAL_PROFILING_INTEGRATION_ENABLED`.
28 |
29 | Elastic and platform specific configuration items must be prefixed with `ELASTIC_OTEL_${platform}_` to be consistent with
30 | the upstream `OTEL_${platform}_` prefix.
31 |
32 | When introducing new features, the decision between starting with a platform-specific or general namespace is made on a feature-by-feature basis:
33 | - a feature that can be aligned cross-platform, even if currently implemented in only one platform: use the `ELASTIC_OTEL_` prefix, for example [System metrics](#system-metrics).
34 | - a feature that we know will be platform-specific: use the `ELASTIC_OTEL_${platform}_` prefix.
35 |
36 | For simplicity, the configuration in this specification uses the "environment variable" syntax; some platforms like Java
37 | might also support other configuration mechanisms.
38 |
39 | ## Identification
40 |
41 | ### User Agent headers
42 |
43 | Per the [OpenTelemetry Specification](https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/protocol/exporter.md#user-agent), OpenTelemetry SDKs are expected to send a `User-Agent` header when exporting data to a backend. At a minimum, this header SHOULD identify the exporter, the language of its implementation, and the version of the exporter.
44 |
45 | Elastic distributions SHOULD configure a customized `User-Agent` header when possible[^1].
46 | This allows data exported from a vanilla SDK and an Elastic distribution to be easily distinguished.
47 |
48 | [^1]: Some OpenTelemetry SDKs (e.g. .NET) do not provide a mechanism to modify the `User-Agent` header. In this case, we accept their default.
49 |
50 | To conform with [RFC7231](https://datatracker.ietf.org/doc/html/rfc7231#section-5.5.3), the existing SDK `User-Agent` should be preceded by a product identifier and version for the Elastic distribution.
51 |
52 | ```
53 | <distro-product-id>/<distro-version>
54 | ```
55 |
56 | For example, in the .NET distribution, the `User-Agent` header would be configured as follows:
57 |
58 | ```
59 | elastic-otel-dotnet/1.0.0 OTel-OTLP-Exporter-Dotnet/1.6.0
60 | ```
61 |
62 | ### Telemetry resource attributes
63 |
64 | Per the [semantic conventions](https://opentelemetry.io/docs/specs/semconv/resource/#telemetry-sdk), SDKs are expected to include the following resource attributes on captured signals. These are used to identify the SDK where the data was captured and should not be modified.
65 |
66 | - `telemetry.sdk.name`
67 | - `telemetry.sdk.version`
68 | - `telemetry.sdk.language`
69 |
70 | In the above attributes, the name and version should be the OTel SDK name and version.
71 | The language should be the primary language that the SDK is intended for.
72 | It is expected that the OpenTelemetry SDK sets these values.
73 | Our distros should set them only if the SDK code does not do so automatically.
74 |
75 | Intake currently reads these attributes and uses them to populate the `agent.Name` and `agent.Version` fields.
76 |
77 | The semantic conventions also [define two experimental attributes](https://opentelemetry.io/docs/specs/semconv/resource/#telemetry-sdk-experimental) to identify the distribution:
78 |
79 | - `telemetry.distro.name`: must be set to `elastic`
80 | - `telemetry.distro.version`: must reflect the distribution version
81 |
82 | Distributions SHOULD set these attributes with appropriate values.
83 |
84 | ## Features
85 |
86 | ### Inferred spans
87 |
88 | Supported platforms: [Java](https://github.com/elastic/elastic-otel-java/tree/main/inferred-spans)
89 |
90 | Configuration to enable: `ELASTIC_OTEL_INFERRED_SPANS_ENABLED`
91 |
92 | Note: While the implementation is Java-only for now, it should probably have been using `ELASTIC_OTEL_JAVA_INFERRED_SPANS_ENABLED`
93 | instead, but we plan to fix this inconsistency once it has been contributed upstream.
94 |
95 | ### System metrics
96 |
97 | Supported platforms: Python
98 |
99 | These metrics are usually captured by a collector running locally. In cases where no collector is present, or a centralized
100 | collector is used, the user might opt in to also have the distribution collect them.
101 |
102 | These metrics are not captured by default in order to prevent duplicate metrics when they are also captured by a collector.
103 |
104 | Configuration to enable: `ELASTIC_OTEL_SYSTEM_METRICS_ENABLED`
105 |
106 | ### Cloud resource attributes
107 |
108 | Supported platforms: Java
109 |
110 | The cloud resource attributes ([semconv](https://opentelemetry.io/docs/specs/semconv/resource/cloud/)) are a subset of
111 | the [resource attributes](https://opentelemetry.io/docs/specs/semconv/resource/) providing equivalent attributes to the
112 | [cloud provider metadata](metadata.md#cloud-provider-metadata).
113 | These attributes are usually provided through a metadata HTTP(S) endpoint accessible from the application.
114 |
115 | Elastic OpenTelemetry distributions SHOULD capture those by default for a better onboarding experience.
116 | Users MUST be able to disable this default to minimize application startup overhead or if those attributes are provided through the collector.
117 |
118 | Elastic distributions MUST allow opting out of this behavior through explicit configuration.
119 | Implementation is currently platform specific:
120 | - Java: `OTEL_RESOURCE_PROVIDERS_${provider}_ENABLED=false`
121 | - NodeJS: `OTEL_NODE_RESOURCE_DETECTORS` ([doc](https://github.com/open-telemetry/opentelemetry-js-contrib/tree/main/metapackages/auto-instrumentations-node/#usage-auto-instrumentation))
122 |
123 | ### Universal profiling integration
124 |
125 | Supported platforms: [Java](https://github.com/elastic/elastic-otel-java/tree/main/universal-profiling-integration)
126 |
127 | For the configuration options see [this section](universal-profiling-integration.md#configuration-options).
128 |
--------------------------------------------------------------------------------
/specs/agents/process-new-fields.md:
--------------------------------------------------------------------------------
1 | # Process for adding new fields
2 |
3 | If an agent dev wants to show new data, they write a proposal for how it should be stored in the context.
4 | They should not add it to tags or custom.
5 | We can’t have agents just add data as they see fit, because then it won’t be aligned across agents,
6 | it will move around if they change their mind, etc.,
7 | which would break people’s assumptions about where it is. If they add it to tags,
8 | it would create new fields in the index, which would then stop being used once we standardize.
9 |
10 | * The proposal should specify how the data fits into a top level key under `context` in the Intake API and how it fits in the Elasticsearch events that get written by APM Server.
11 | For example `context.elasticsearch.url` in the intake API becomes `elasticsearch.url` in Elasticsearch, `context.elasticsearch.error_reason` becomes `elasticsearch.error_reason` etc.
12 | * The proposal needs to specify which fields should be indexed.
13 | An APM Server person might need to assist here to determine the right data type for the indexed fields.
14 | * The proposal should include the suggested [JSON Schema](https://github.com/elastic/apm-server/tree/main/docs/spec/v2) changes for all new fields.
15 | This forces alignment on the exact field names, JSON data type, length restrictions etc.
16 | * Make sure to check if [ECS](https://github.com/elastic/ecs) has defined appropriate fields for what you're proposing.
17 | * Agents should agree to the changes in a voting format (checkboxes),
18 | once they agree an issue should be created on the agent,
19 | apm-server and/or kibana repos to track the implementation.
20 | Once we have issues for all the implementations, the original one can be closed.
21 | * As soon as the JSON Schema changes have been merged into APM Server,
22 | agents can implement and test their implementation against the new schema.
23 | It is typically only a matter of a few hours to implement new fields in APM Server once the details are agreed upon.
24 | * Agent devs can release new versions that send the new fields as soon the changes are merged into APM Server.
25 | APM Server will not reject arbitrary fields in `context`, but fields that are not defined in APM Server are not stored, indexed or validated.
26 | When users upgrade their stack, the new fields will start to appear.
27 | * The UI will show every field under the existing top-level fields. E.g. everything under `request` shows up in the UI automatically. If we add a new top level fields, the UI also needs to get updated.
28 | * When we add data to the span context or transaction context,
29 | this data should also be allowed in the error context and arbitrary data in the error context should be shown, just like for spans and transactions.
30 | That way, when an error happens, we can supply the context in the error context directly.
31 | We have previously decided that we need an error context and that it's not enough to just link errors to their parent span.
32 | * Errors that are captured in instrumentations should include/copy all the contextual data that would go on that span into the error context
33 |
34 |
35 | Example:
36 |
37 | 1. We have built an Elasticsearch instrumentation that gets some useful context: `elasticsearch.url`, `elasticsearch.response_code`, `elasticsearch.error_reason`.
38 | 2. Agent dev opens a proposal that looks like this:
39 |
40 | **Proposal:**
41 |
42 | Add optional fields to
43 | - [x] _span context_
44 | - [ ] _transaction context_
45 |
46 | as always, this should also be added to the _error context_.
47 |
48 | | Intake API field | Elasticsearch field | Elasticsearch Type |
49 | | -----------------|-------------------------|---------------------|
50 | | `context.elasticsearch.url` | `elasticsearch.url` | not indexed |
51 | | `context.elasticsearch.response_code` | `elasticsearch.response_code` | indexed as keyword |
52 | | `context.elasticsearch.error_reason` | `elasticsearch.error_reason` | not indexed |
53 | | `context.elasticsearch.cluster_name` | `elasticsearch.cluster_name` | not indexed |
54 |
55 |
56 | JSON Schema:
57 | ```json
58 | {
59 | "url": {
60 | "type": ["string"]
61 | },
62 | "response_code": {
63 | "type": ["string"],
64 | "maxLength": 1024
65 | },
66 | "error_reason": {
67 | "type": ["string"],
68 | "maxLength": 10000
69 | },
70 | "cluster_name": {
71 | "type": ["string"],
72 | }
73 | }
74 | ```
75 | Not all agents will send `context.elasticsearch.cluster_name`. This is _fine_. We should still align on the ones we can.
76 |
77 | Note: As this is a new top level field, the UI needs an update.
78 |
79 | Agents OK with this change:
80 |
81 | - [ ] @elastic/apm-ui (if this is a new top level field)
82 | - [ ] RUM
83 | - [ ] Node.js
84 | - [ ] Java
85 | - [ ] ...
86 |
87 | 1. When agent devs and APM Server agree, APM Server implements the changes necessary
88 | 1. When merged into `main`, agent devs can implement the fields immediately.
89 | The agent tests against APM Server `main` now tests the integration with the new fields in the JSON Schema.
90 | 1. Agents can release when their tests are green. Next APM Server release will include the changes,
91 | which might include indexing some new fields.
92 |
--------------------------------------------------------------------------------
/specs/agents/sanitization.md:
--------------------------------------------------------------------------------
1 | The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD",
2 | "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this document are to
3 | be interpreted as described in
4 | [RFC 2119](https://www.ietf.org/rfc/rfc2119.txt).
5 |
6 | ## Data sanitization
7 |
8 | ### `sanitize_field_names` configuration
9 |
10 | Sometimes it is necessary to sanitize, i.e., remove,
11 | sensitive data sent to Elastic APM.
12 |
13 | This config accepts a list of wildcard patterns of field names which control
14 | how an agent will sanitize data.
15 |
16 | | | |
17 | |----------------|---|
18 | | Type | `List<`[`WildcardMatcher`](../../tests/agents/json-specs/wildcard_matcher_tests.json)`>` |
19 | | Default | `password, passwd, pwd, secret, *key, *token*, *session*, *credit*, *card*, *auth*, set-cookie, *principal*` |
20 | | Dynamic | `true` |
21 | | Central config | `true` |
22 |
23 | #### Configuration
24 |
25 | Agents MUST provide a minimum default configuration of
26 |
27 | [ 'password', 'passwd', 'pwd', 'secret', '*key', '*token*', '*session*',
28 | '*credit*','*card*', '*auth*', 'set-cookie', '*principal*' ]
29 |
30 | for the `sanitize_field_names` configuration value. Agents MAY include the
31 | following extra fields in their default configuration to avoid breaking changes:
32 |
33 | ['pw','pass','connect.sid']
34 |
35 | ## Sanitizing Values
36 |
37 | If a payload field's name (a header key, a form key) matches a configured
38 | wildcard, that field's _value_ MUST be redacted and the key itself
39 | MUST still be reported in the agent payload. Agents MAY choose the string
40 | they use to replace the value so long as it's consistent and does not reveal
41 | the value it has replaced. The replacement string SHOULD be `[REDACTED]`.
42 |
43 | Fields that MUST be sanitized are:
44 | - HTTP Request and Response headers (except [HTTP/2 pseudo-headers](https://datatracker.ietf.org/doc/html/rfc7540#section-8.1.2.3) which SHOULD NOT be redacted),
45 | - form fields in an `application/x-www-form-urlencoded` request body, and
46 | - HTTP Request cookies.
47 |
48 | Additionally, if cookie headers are parsed into name/value pairs and reported
49 | to APM Server via the agent (for example, `transaction.context.request.cookies`), the
50 | values of these pairs MUST be sanitized and the cookie header removed or redacted.
51 |
52 |
53 | The query string and other captured request bodies (such as `application/json`)
54 | SHOULD NOT be sanitized.
55 |
56 | Agents SHOULD NOT sanitize fields based on the _value_ of a particular field.
57 |
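For illustration, a Python sketch of the redaction rule, using `fnmatch` as a stand-in for the agents' case-insensitive [`WildcardMatcher`](../../tests/agents/json-specs/wildcard_matcher_tests.json):

```python
import fnmatch

REDACTED = "[REDACTED]"

def sanitize(fields: dict, patterns: list) -> dict:
    # Redact the value of any field whose *name* matches a pattern;
    # the key itself is kept so the field still appears in the payload.
    return {
        key: REDACTED
        if any(fnmatch.fnmatch(key.lower(), pattern.lower()) for pattern in patterns)
        else value
        for key, value in fields.items()
    }

sanitize({"Authorization": "Bearer abc", "Accept": "*/*"}, ["*auth*"])
# -> {'Authorization': '[REDACTED]', 'Accept': '*/*'}
```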
--------------------------------------------------------------------------------
/specs/agents/span-links.md:
--------------------------------------------------------------------------------
1 | ## Span Links
2 |
3 | A Span or Transaction MAY link to zero or more other Spans/Transactions that are causally related.
4 |
5 | Example use-cases for Span Links:
6 |
7 | 1. When a single transaction represents the batch processing of several messages, the agent is able to link back to the traces that have produced the messages.
8 | 2. When the agent receives a `traceparent` header from outside a trust boundary, it [can restart the trace](trace-continuation.md) (creating a different trace id with its own sampling decision) and link to the originating trace.
9 | 3. Close gap for the OTLP intake - [OTel's specification of span links](https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/overview.md#links-between-spans)
10 |
11 | Spans and Transactions MUST collect links in the `links` array with the following fields on each item:
12 | - `trace_id`: the id of the linked trace.
13 | - `span_id`: the id of the linked span or transaction.
14 |
15 | Example:
16 |
17 | ```
18 | "links": [
19 | {"trace_id": "traceId1", "span_id": "spanId1"},
20 | {"trace_id": "traceId2", "span_id": "spanId2"},
21 | ]
22 | ```
23 |
24 | ### API
25 |
26 | Agents MAY provide a public API to add span links at span/transaction creation.
27 | A use-case for users manually adding span links is for [batch message processing](tracing-instrumentation-messaging.md#batch-message-processing)
28 | that the APM agent does not or cannot instrument. (For some agents it would be
29 | a burden to internally support span links and *not* expose the API publicly.)
30 |
31 | If provided, the API SHOULD be written such that user code is not broken if/when
32 | support for span link *attributes* is added in the future.
33 |
34 | If provided, the API and semantics SHOULD be compatible with the
35 | [OpenTelemetry specification on specifying span links](https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/trace/api.md#specifying-links). A compatible API will facilitate
36 | [OpenTelemetry bridge](tracing-api-otel.md) support. OpenTelemetry requirements:
37 |
38 | - The public API SHOULD allow adding span links *after* span creation.
39 | - Links SHOULD preserve the order in which they are set.
40 |
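For illustration, a sketch of deriving span links from the `traceparent` headers of a message batch (a hypothetical helper, not an actual agent API; the resulting links would be passed to the span/transaction creation call):

```python
from dataclasses import dataclass

@dataclass
class SpanLink:
    trace_id: str
    span_id: str

def links_from_traceparents(traceparents: list) -> list:
    # A W3C traceparent header looks like:
    #   00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01
    links = []
    for header in traceparents:
        parts = header.split("-")
        if len(parts) == 4:
            links.append(SpanLink(trace_id=parts[1], span_id=parts[2]))
    return links
```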
--------------------------------------------------------------------------------
/specs/agents/trace-continuation.md:
--------------------------------------------------------------------------------
1 | ## Trace Continuation
2 |
3 | ### `trace_continuation_strategy` configuration
4 |
5 | | | |
6 | |----------------|---|
7 | | Valid options | `continue`, `restart`, `restart_external` |
8 | | Default | `continue` |
9 | | Dynamic | `true` |
10 | | Central config | `true` |
11 |
12 | The `traceparent` header of requests that are traced with our agents might have been added by a 3rd party component.
13 |
14 | This situation becomes more and more common as W3C Trace Context gets adopted. In such cases we can end up with traces where part of the trace is outside of our system.
15 |
16 | In order to handle this properly, the agent SHOULD offer several trace continuation strategies.
17 |
18 | The agent SHOULD offer a configuration called `trace_continuation_strategy` with the following values and behavior:
19 |
20 | - `continue`: The agent takes the `traceparent` header as it is and applies it to the new transaction.
21 | - `restart`: The agent always creates a new trace with a new trace id. In this case the agent MUST create a [span link](span-links.md) in the new transaction pointing to the original traceparent.
22 | - `restart_external`: The agent first checks the `tracestate` header. If the header contains the `es` vendor flag, it's treated as internal, otherwise (including the case when the `tracestate` header is not present) it's treated as external. In case of external calls the agent MUST create a new trace with a new trace id and MUST create a link in the new transaction pointing to the original trace.
23 |
24 | In the case of internal calls, the agent MUST use the `continue` strategy above.
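
A sketch of the strategy decision in Python (a hypothetical helper; the `tracestate` parsing is simplified):

```python
def should_restart_trace(strategy: str, tracestate) -> bool:
    # continue: always reuse the incoming traceparent.
    if strategy == "continue":
        return False
    # restart: always start a new trace (plus a span link to the original).
    if strategy == "restart":
        return True
    # restart_external: restart unless tracestate carries the "es" vendor key.
    vendor_keys = {
        entry.split("=", 1)[0].strip()
        for entry in (tracestate or "").split(",")
        if "=" in entry
    }
    return "es" not in vendor_keys
```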
--------------------------------------------------------------------------------
/specs/agents/tracing-api.md:
--------------------------------------------------------------------------------
1 | ## Tracer APIs
2 |
3 | All agents must provide a native API to enable developers to instrument their applications manually, in addition to any
4 | automatic instrumentation.
5 |
6 | Agents document their APIs in the elastic.co docs:
7 |
8 | - [Node.js Agent](https://www.elastic.co/guide/en/apm/agent/nodejs/current/api.html)
9 | - [Go Agent](https://www.elastic.co/guide/en/apm/agent/go/current/api.html)
10 | - [Java Agent](https://www.elastic.co/guide/en/apm/agent/java/current/public-api.html)
11 | - [.NET Agent](https://www.elastic.co/guide/en/apm/agent/dotnet/current/public-api.html)
12 | - [Python Agent](https://www.elastic.co/guide/en/apm/agent/python/current/api.html)
13 | - [Ruby Agent](https://www.elastic.co/guide/en/apm/agent/ruby/current/api.html)
14 | - [RUM JS Agent](https://www.elastic.co/guide/en/apm/agent/js-base/current/api.html)
15 |
16 | In addition, each agent may provide "bridge" implementations of the vendor-neutral [OpenTelemetry API](tracing-api-otel.md).
--------------------------------------------------------------------------------
/specs/agents/tracing-instrumentation-aws.md:
--------------------------------------------------------------------------------
1 | ## AWS services spans
2 |
3 | We describe how to instrument some of AWS' services in this document.
4 | Some of the services can use existing specs. When there are differences or additions, they have been noted below.
5 | The spec for [instrumenting AWS Lambda](tracing-instrumentation-aws-lambda.md) is in a separate document.
6 |
7 | ### S3 (Simple Storage Service)
8 |
9 | AWS Simple Storage Service offers object storage via a REST API. The objects are organized into buckets, which are
10 | themselves organized into regions.
11 |
12 | Field semantics and values for S3 are defined in the [S3 table within the database spec](tracing-instrumentation-db.md#aws-s3).
13 |
14 | ### DynamoDB
15 |
16 | AWS DynamoDB is a document database so instrumenting it will follow the [db spec](tracing-instrumentation-db.md).
17 | DynamoDB-specific specifications that supersede generic db field semantics are defined in the [DynamoDB table within the database spec](tracing-instrumentation-db.md#aws-dynamodb).
18 |
19 | ### SQS (Simple Queue Service)
20 |
21 | AWS Simple Queue Service is a message queuing service. The [messaging spec](tracing-instrumentation-messaging.md) can
22 | be used for instrumenting SQS, but the following specifications supersede those of the messaging spec.
23 |
24 | For a batch send message operation, the span name is `SQS SEND_BATCH to MyQueue`. The `span.action` is `send_batch`.
25 |
26 | The SQS API also includes delete message and batch delete message operations. These should be instrumented in addition
27 | to the operations described in the messaging spec. For a delete message operation, the span name is
28 | `SQS DELETE from MyQueue`.
29 | For a batch delete message operation, the span name is `SQS DELETE_BATCH from MyQueue`.
30 | The `span.action` is `delete_batch`.
31 |
32 | - **`context.destination.cloud.region`**: mandatory. The AWS region where the queue is.
33 |
34 | #### Distributed Tracing
35 |
36 | For distributed tracing, the SQS API has "message attributes" that can be used in lieu of headers.
37 |
38 | Agents should use an attribute name of `traceparent` when sending the trace parent header value via the SQS message attributes. Agents should use an attribute name of `tracestate` if sending trace state header value in an SQS message attribute. Agents should not prefix these names with an `elastic-` namespace.
39 |
40 | SQS has a documented limit of ten message attributes per message. Agents _should not_ add `traceparent` or `tracestate` headers to the message attributes if adding those fields would put an individual message over this limit. Agents _should_ log a message if they omit either `traceparent` or `tracestate` due to these length limits.
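
For illustration, a Python sketch of this injection using the SQS message-attribute structure (the attribute value shape matches the AWS API; the logger and limit handling are illustrative):

```python
SQS_MAX_MESSAGE_ATTRIBUTES = 10

def inject_trace_context(attributes: dict, traceparent, tracestate, logger):
    # Add traceparent/tracestate as message attributes, unless doing so
    # would exceed SQS's 10-attribute-per-message limit.
    for name, value in (("traceparent", traceparent), ("tracestate", tracestate)):
        if value is None:
            continue
        if len(attributes) >= SQS_MAX_MESSAGE_ATTRIBUTES:
            logger.warning("not adding %s: SQS message attribute limit reached", name)
            continue
        attributes[name] = {"DataType": "String", "StringValue": value}
    return attributes
```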
41 |
42 | ### SNS (AWS Simple Notification Service)
43 |
44 | The AWS Simple Notification Service can be instrumented using the [messaging spec](tracing-instrumentation-messaging.md),
45 | but the only action that is instrumented is [Publish](https://docs.aws.amazon.com/sns/latest/api/API_Publish.html). These specifications supersede those of the messaging spec:
46 |
47 | - `span.name` (see the sketch after this list):
48 |   - For a publish action including a `TopicArn`, the span name MUST be `SNS PUBLISH to <topic-name>`. For example, for a TopicArn of `arn:aws:sns:us-east-2:123456789012:My-Topic` the topic-name is `My-Topic`. (Implementation note: this can be extracted with the equivalent of this Python expression: `topicArn.split(':').pop()`.)
49 |   - For a publish action including a `TargetArn` (an endpoint ARN created via [CreatePlatformEndpoint](https://docs.aws.amazon.com/sns/latest/api/API_CreatePlatformEndpoint.html)), the span name MUST be `SNS PUBLISH to <application-name>`. For example, for a TargetArn of `arn:aws:sns:us-west-2:123456789012:endpoint/GCM/gcmpushapp/5e3e9847-3183-3f18-a7e8-671c3a57d4b3` the application-name is `endpoint/GCM/gcmpushapp`. The endpoint UUID represents a device and mobile app. For manageable cardinality, the UUID must be excluded from the span name. (Implementation note: this can be extracted with the equivalent of this Python expression: `targetArn.split(':').pop().rsplit('/', 1)[0]`.)
50 |   - For a publish action including a `PhoneNumber`, the span name MUST be `SNS PUBLISH to [PHONENUMBER]`. The actual phone number MUST NOT be included because it is [PII](https://en.wikipedia.org/wiki/Personal_data) and cardinality is too high.
51 | - `span.action`: 'publish'
52 |
53 | - **`context.destination.cloud.region`**: mandatory. The AWS region where the topic is.
54 |
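A sketch combining the naming rules above (a hypothetical helper built from the Python expressions in the implementation notes):

```python
def sns_span_name(topic_arn=None, target_arn=None, phone_number=None) -> str:
    if topic_arn:      # e.g. arn:aws:sns:us-east-2:123456789012:My-Topic
        return f"SNS PUBLISH to {topic_arn.split(':').pop()}"
    if target_arn:     # strip the trailing endpoint UUID for cardinality
        return f"SNS PUBLISH to {target_arn.split(':').pop().rsplit('/', 1)[0]}"
    if phone_number:   # never include the actual phone number (PII)
        return "SNS PUBLISH to [PHONENUMBER]"
    raise ValueError("publish requires a TopicArn, TargetArn, or PhoneNumber")
```
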
55 | For distributed tracing, the SNS API has "message attributes" that can be used in lieu of headers.
56 |
--------------------------------------------------------------------------------
/specs/agents/tracing-instrumentation-graphql.md:
--------------------------------------------------------------------------------
1 | ## GraphQL transactions and spans
2 |
3 | **NB:** This document is not guaranteed to be final.
4 |
5 | ### Problems with our current approach
6 |
7 | Grouping transactions by HTTP method and path fits perfectly for REST-style APIs and the like.
8 |
9 | With a GraphQL API, the client always requests the same endpoint.
10 | This means queries and mutations with very different costs and consequences all end up in the same transaction group.
11 | Spans are likewise impossible to tell apart.
12 |
13 | This document describes and explains a common approach to better support GraphQL.
14 |
15 | Example GraphQL query:
16 |
17 | ```graphql
18 | {
19 | user(id: "99") {
20 | name
21 | comments {
22 | body
23 | }
24 | }
25 | }
26 | ```
27 |
28 | Turns into an HTTP POST request like so:
29 |
30 | ```plain
31 | POST /graphql
32 | Content-Type: application/json
33 |
34 | {
35 | "query": "query ($id: ID!) {
36 | user(id: $id) {
37 | name
38 | comments {
39 | body
40 | }
41 | }
42 | }",
43 | "variables": { "id": "99" }
44 | }
45 | ```
46 |
47 | **Sidenote:** The Node.js agent already supports GraphQL. This document is written with that in mind but not necessarily with its implementation as a target result.
48 |
49 | ### Transactions
50 |
51 | #### Prefix
52 |
53 | To distinguish GraphQL transactions from others we prefix them with `GraphQL:`.
54 |
55 | #### Operation Name
56 |
57 | It is common (and [recommended](https://graphql.org/learn/queries/#operation-name)) to provide an _Operation Name_ for queries. Here for example `UserWithComments`:
58 |
59 | ```graphql
60 | query UserWithComments {
61 | user {
62 | id
63 | name
64 | comments {
65 | body
66 | }
67 | }
68 | }
69 | ```
70 |
71 | The point of these is to provide an easy way for developers, when things go wrong, to pinpoint where exactly they did so.
72 |
73 | This name is available on the server too and serves as a great distinguishing key.
74 |
75 | Transaction name examples:
76 | - `GraphQL: UserWithComments`
77 | - `GraphQL: UpdateUser`
78 |
79 | ##### Sidenote: Multiple endpoints
80 |
81 | The Node.js implementation adds the request path to the GraphQL span names.
82 |
83 | We consider serving multiple GraphQL endpoints whose queries share Operation Names unlikely enough that we do not address it in this document.
84 |
85 | #### Anonymous queries
86 |
87 | An Operation Name isn't required. When one isn't provided, it's hard for us to tell the queries apart.
88 |
89 | We considered hashing queries to tell them apart, but decided against it.
90 | Instead we will just consider all unnamed queries _unnamed_ and consequently put them in the same bucket.
91 |
92 |
93 | Rationale:
94 |
95 | 1. Some clients generate `id`s from hashing the contents of the query (see [apollo-tooling](https://github.com/apollographql/apollo-tooling/blob/1dfd737eaf85b89b2cfb13913342e091e3c03d18/packages/apollo-codegen-core/src/compiler/visitors/generateOperationId.ts#L5)). This would split the anonymous queries into separate buckets.
96 |
97 | A problem with this approach is that a user of the APM UI has no way to recognise queries in the transactions list before clicking through.
98 |
99 | Using just the `id` will not reveal the true culprit since there can be variables associated with the query. Different values for the variables can lead to very different workloads and response times.
100 |
101 | 2. Another approach is to simply label them `[unnamed]`.
102 |
103 | A problem with _that_ approach is that the contents, and thereby the relevant db queries and other sub-span actions the server might perform while resolving these queries, may be wildly different, making it hard to provide a _true_ sample waterfall.
104 |
105 | These two examples for example will look the same for the top-level GraphQL spans but will represent significantly different workloads.
106 |
107 | ```
108 | [- anonymous graphql span --------------]
109 | [- 1,000x SELECT * ---------------]
110 | [- 1,000 more SELECT * -]
111 |
112 | [- anonymous graphql span --------------]
113 | [- SELECT id FROM users WHERE id=? -]
114 | ```
115 |
116 | Neither of these options is perfect. Because the benefits of using `id`s could, in the worst case, be misleading anyway, we're going with option 2.
117 |
118 |
119 | To further help and nudge developers to use Operation Names for their queries, a tooltip will be shown in the Kibana UI, suggesting to use Operation Names.
120 |
121 | Transaction name examples:
122 | - `GraphQL: [unnamed]`
123 |
124 | #### Batching/Multiplexing queries
125 |
126 | Some clients and servers allow batching/multiplexing queries (see for example [apollo-link-batch-http](https://www.apollographql.com/docs/link/links/batch-http/#gatsby-focus-wrapper) or [dataloader](https://github.com/graphql/dataloader#batching)) allowing multiple queries to be run from the same HTTP request.
127 |
128 | If multiple queries are run from the same request, we join their operation names in the transaction name with a `+`.
129 |
130 | Transaction name examples:
131 | - `GraphQL: UserWithComments+PostWithSiblings+MoreThings+[unnamed]`
132 |
133 | To avoid very long transaction names, if a request has more than five queries, we abbreviate it to `[more-than-five-queries]`.
134 |
135 | Transaction name examples:
136 | - `GraphQL: [more-than-five-queries]`
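
A sketch of these naming rules (a hypothetical helper; `None` stands for an anonymous query):

```python
def graphql_transaction_name(operation_names: list) -> str:
    names = [name or "[unnamed]" for name in operation_names]
    if len(names) > 5:
        return "GraphQL: [more-than-five-queries]"
    return "GraphQL: " + "+".join(names)

graphql_transaction_name(["UserWithComments", None])
# -> 'GraphQL: UserWithComments+[unnamed]'
```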
137 |
138 | ### Spans
139 |
140 | The life cycle of responding to a GraphQL query is mostly split in two parts. First, the query (or queries) is read, parsed and analyzed. Second, it is executed.
141 |
142 | *(Figure: a span waterfall for a single GraphQL request, showing the parse/validate phases followed by the execution of each query.)*
143 |
144 | This example shows the server responding to one request, with two named queries, `SKUs` and `Names`.
145 |
146 | As each language's server implementation can vary slightly in its phases and their names, these spans might be named differently between agents.
147 |
148 | GraphQL spans have the following parameters:
149 |
150 | - `name: graphql.[action]`
151 | - `type: "app"`
152 | - `subtype: "graphql"`
153 | - `action: [action]`
154 |
--------------------------------------------------------------------------------
/specs/agents/tracing-instrumentation-grpc.md:
--------------------------------------------------------------------------------
1 | ## gRPC support in agents
2 |
3 | ### Header
4 |
5 | #### Value format
6 | The value format of the header is text, as other vendors use text and there's no advantage to using a binary encoding. See technical details about the gRPC header [here](https://github.com/grpc/grpc/blob/master/doc/PROTOCOL-HTTP2.md#requests).
7 |
8 | #### Key Names
9 | The header key names are `elastic-apm-traceparent` (for backwards compatibility with older agents) and `traceparent`.
10 |
11 | ### Instrumented calls
12 | Server and Client Unary request/response calls are instrumented. Support for other calls may be added later (e.g. client/server streaming, bidirectional streaming).
13 |
14 | ### Transaction/Span context schemas
15 |
16 | #### Transaction context
17 |
18 | * **name**: \<full method name\>, ex: `/helloworld.Greeter/SayHello`
19 | * **type**: `request`
20 | * **trace_context**: \<trace context\>
21 | * **result**: [\<gRPC status code\>](https://github.com/grpc/grpc/blob/master/doc/statuscodes.md#status-codes-and-their-use-in-grpc), ex: `OK`
22 | * **outcome**: See [Outcome](#outcome)
23 |
24 | #### Span context
25 |
26 | Note that the destination fields are optional as some gRPC libraries don't expose host and port information.
27 | See [apm#180](https://github.com/elastic/apm/issues/180) and [apm#115](https://github.com/elastic/apm/issues/115) for details on `destination` fields.
28 |
29 | * **name**: \<full method name\>, ex: `/helloworld.Greeter/SayHello`
30 | * **type**: `external`
31 | * **subtype**: `grpc`
32 | * **outcome**: See [Outcome](#outcome)
33 | * **destination**:
34 | * **address**: Either an IP (v4 or v6) or a host/domain name.
35 | * **port**: A port number; default ports should also be reported.
36 | * **service**:
37 | * **resource**: Capture host and port.
38 | * **name**: Capture the scheme, host, and non-default port.
39 | * **type**: Same as `span.type`
40 |
41 | #### Outcome
42 |
43 | With gRPC, the transaction and span outcome is set from the gRPC response status.
44 |
45 | If such a status is not available, we default to the following:
46 |
47 | - `failure` if an error is reported
48 | - `success` otherwise
49 |
50 | According to the [gRPC status codes reference spec](https://github.com/grpc/grpc/blob/master/doc/statuscodes.md), some
51 | statuses are not used by the gRPC client & server libraries, and thus some of them should be considered client-side errors.
52 |
53 | The gRPC `UNKNOWN` status refers to an error that is not known, thus we should treat it as a `failure` and NOT map it to
54 | an `unknown` outcome.
55 |
56 | For gRPC spans (from the client):
57 |
58 | - `OK` : `success`
59 | - anything else: `failure`
60 |
61 | For gRPC transactions (from the server):
62 |
63 | This mapping can be quite subjective: we know that some statuses are not used by the gRPC server & client
64 | implementations, so their meaning is application specific. However, we attempt to report as `failure` those
65 | outcomes that might require attention from the server's point of view, and to report as `success` all statuses
66 | that are only relevant on the client side.
67 |
68 | | status | outcome | justification |
69 | | ------------------------- | --------- | ------------------------------------------------ |
70 | | `OK` | `success` | |
71 | | `CANCELLED` | `success` | Operation cancelled by client |
72 | | `UNKNOWN` | `failure` | Error of an unknown type, but still an error |
73 | | `INVALID_ARGUMENT` (*) | `success` | Client-side error |
74 | | `DEADLINE_EXCEEDED` | `failure` | |
75 | | `NOT_FOUND` (*) | `success` | Client-side error (similar to HTTP 404) |
76 | | `ALREADY_EXISTS` (*) | `success` | Client-side error (similar to HTTP 409) |
77 | | `PERMISSION_DENIED` (*) | `success` | Client authentication (similar to HTTP 403) |
78 | | `RESOURCE_EXHAUSTED` (*) | `failure` | Likely used for server out of resources |
79 | | `FAILED_PRECONDITION` (*) | `failure` | Similar to UNAVAILABLE |
80 | | `ABORTED` (*) | `failure` | Similar to UNAVAILABLE |
81 | | `OUT_OF_RANGE` (*) | `success` | Client-side error (similar to HTTP 416) |
82 | | `UNIMPLEMENTED` | `success` | Client called a non-implemented feature |
83 | | `INTERNAL` | `failure` | Internal error (similar to HTTP 500) |
84 | | `UNAVAILABLE` | `failure` | Transient error, client may retry with backoff |
85 | | `DATA_LOSS` (*) | `failure` | Lost data should always be reported |
86 | | `UNAUTHENTICATED` (*) | `success` | Client-side authentication (similar to HTTP 401) |
87 |
88 | The statuses marked with (*) are not used by gRPC libraries and thus their actual meaning is contextual to the
89 | application.
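90 |
91 | As a sketch, the table boils down to a small lookup (status names follow the gRPC reference; the helper names are illustrative):
92 |
93 | ```python
94 | # Statuses reported as "success" for server transactions; everything else,
95 | # including UNKNOWN, is a "failure".
96 | SUCCESS_STATUSES = {
97 |     "OK", "CANCELLED", "INVALID_ARGUMENT", "NOT_FOUND", "ALREADY_EXISTS",
98 |     "PERMISSION_DENIED", "OUT_OF_RANGE", "UNIMPLEMENTED", "UNAUTHENTICATED",
99 | }
100 |
101 | def transaction_outcome(status, error_reported):
102 |     if status is None:
103 |         return "failure" if error_reported else "success"
104 |     return "success" if status in SUCCESS_STATUSES else "failure"
105 |
106 | def span_outcome(status, error_reported):
107 |     if status is None:
108 |         return "failure" if error_reported else "success"
109 |     return "success" if status == "OK" else "failure"
110 | ```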
90 |
91 | Also, the gRPC status code for a given transaction should be reported in the `transaction.result` field, so we retain the
92 | capability to detect an abnormal rate of a given status, similar to what we do with HTTP 4xx and 5xx errors.
93 |
--------------------------------------------------------------------------------
/specs/agents/tracing-instrumentation-http.md:
--------------------------------------------------------------------------------
1 | ## HTTP Transactions
2 |
3 | Agents should instrument HTTP request routers/handlers, starting a new transaction for each incoming HTTP request. When the request ends, the transaction should be ended, recording its duration.
4 |
5 | - The transaction `type` should be `request`.
6 | - The transaction `result` should be `HTTP Nxx`, where N is the first digit of the status code (e.g. `HTTP 4xx` for a 404)
7 | - The transaction `outcome` is set from the response status code (see [Outcome](#outcome)). \
8 |   As there's no browser API to get the status code of a page load, the RUM agent always reports `"unknown"` for those transactions.
9 |
10 | - The transaction `name` should be aggregatable, such as the route or handler name. Examples:
11 | - `GET /users/{id}`
12 | - `UsersController#index`
13 |
14 | It's up to you to pick a naming scheme that is the most natural for the language or web framework you are instrumenting.
15 |
16 | In case a name cannot be automatically determined, and a custom name has not been provided by other means, the transaction should be named `<method> unknown route`, e.g. `POST unknown route`. This would normally also apply to requests to unknown endpoints, e.g. the transaction for the request `GET /this/path/does/not/exist` would be named `GET unknown route`, whereas the transaction for the request `GET /users/123` would still be named `GET /users/{id}` even if the id `123` did not match any known user and the request resulted in a 404.
17 |
18 | In addition to the above properties, HTTP-specific properties should be recorded in the transaction `context`, for sampled transactions only. Refer to the [Intake API Transaction](https://www.elastic.co/guide/en/apm/server/current/transaction-api.html) documentation for a description of the various context fields.
19 |
20 | By default request bodies are not captured. It should be possible to configure agents to enable their capture using the config variable `ELASTIC_APM_CAPTURE_BODY`. By default agents will capture request headers, but it should be possible to disable their capture using the config variable `ELASTIC_APM_CAPTURE_HEADERS`.
21 |
22 | Captured request and response headers, cookies, and form bodies MUST be sanitised (i.e. secrets removed) according to [data sanitization rules](sanitization.md#data-sanitization).
23 |
24 |
25 | ### `transaction_ignore_urls` configuration
26 |
27 | Used to restrict requests to certain URLs from being instrumented.
28 |
29 | This property should be set to a list containing one or more strings.
30 | When an incoming HTTP request is detected,
31 | its request [`path`](https://tools.ietf.org/html/rfc3986#section-3.3)
32 | will be tested against each element in this list.
33 | For example, adding `/home/index` to this list would match and remove instrumentation from the following URLs:
34 |
35 | ```
36 | https://www.mycoolsite.com/home/index
37 | http://localhost/home/index
38 | http://whatever.com/home/index?value1=123
39 | ```
40 |
41 | | | |
42 | |----------------|---|
43 | | Type | `List<`[`WildcardMatcher`](../../tests/agents/json-specs/wildcard_matcher_tests.json)`>` |
44 | | Default | agent specific |
45 | | Dynamic | `true` |
46 | | Central config | `true` |
47 |
48 | ### `transaction_ignore_user_agents` configuration
49 |
50 | Used to restrict requests made by certain User-Agents from being instrumented.
51 |
52 | This property should be set to a list containing one or more strings.
53 | When an incoming HTTP request is detected, the `User-Agent` request headers will be tested against each element in this list and if a match is found, no trace will be captured for this request.
54 |
55 | | | |
56 | |----------------|---|
57 | | Type | `List<`[`WildcardMatcher`](../../tests/agents/json-specs/wildcard_matcher_tests.json)`>` |
58 | | Default | `` |
59 | | Dynamic | `true` |
60 | | Central config | `true` |
61 |
62 | ## HTTP client spans
63 |
64 | We capture spans for outbound HTTP requests. These should have a type of `external`, and subtype of `http`. The span name should have the format `<method> <host>`.
65 |
66 | For outbound HTTP request spans we capture the following http-specific span context:
67 |
68 | - `http.url` (the target URL) \
69 | The captured URL should have the userinfo (username and password), if any, redacted.
70 | - `http.status_code` (the response status code)
71 | - `outcome` is set from response status code (see [Outcome](#outcome) for details)
72 |
73 | ## Outcome
74 |
75 | For HTTP transactions (from the server perspective), status codes in the 4xx range (client errors) are not considered
76 | a `failure` as the failure has not been caused by the application itself but by the caller.
77 |
78 | For HTTP spans (from the client perspective), the span's `outcome` should be set to `"success"` if the status code is
79 | lower than 400 and to `"failure"` otherwise.
80 |
81 | For both transactions and spans, if there is no HTTP status we set `outcome` from the reported error:
82 |
83 | - `failure` if an error is reported
84 | - `success` otherwise
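85 |
86 | A sketch of both rules (the helper name and signature are illustrative):
87 |
88 | ```python
89 | def http_outcome(status_code, is_server, error_reported=False):
90 |     """Derive the outcome from an HTTP status code, if one is available."""
91 |     if status_code is None:
92 |         return "failure" if error_reported else "success"
93 |     if is_server:
94 |         # From the server's perspective, 4xx is the caller's fault.
95 |         return "failure" if status_code >= 500 else "success"
96 |     # From the client's perspective, anything >= 400 is a failure.
97 |     return "failure" if status_code >= 400 else "success"
98 | ```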
85 |
86 | ## Destination
87 |
88 | - `context.destination.address`: `url.host`
89 | - `context.destination.port`: `url.port`
90 | - `context.destination.service.*`: See [destination spec](tracing-spans-destination.md)
91 |
--------------------------------------------------------------------------------
/specs/agents/tracing-sampling.md:
--------------------------------------------------------------------------------
1 | ## Transaction sampling
2 |
3 | To reduce processing and storage overhead, transactions may be sampled by agents.
4 | Sampling here refers to "head-based sampling".
5 |
6 | Head-based sampling is where a sampling decision is made at the root of the distributed trace,
7 | before the details or outcome of the trace are known,
8 | and propagated throughout the trace.
9 |
10 | ### `transaction_sample_rate` configuration
11 |
12 | By default, all transactions will be sampled.
13 | Agents can be configured to sample probabilistically,
14 | by specifying a sampling probability in the range \[0,1\].
15 | e.g.
16 |
17 | - `1` means all transactions will be sampled (the default)
18 | - `0` means no transactions will be sampled
19 | - `0.5` means approximately 50% of transactions will be sampled
20 |
21 | The maximum precision of the sampling rate is `0.0001` (0.01%).
22 | The sampling rate should be rounded half away from zero to 4 decimal places.
23 | Values greater than `0` but less than `0.0001` should be rounded up to `0.0001`.
24 |
25 | e.g.
26 |
27 | 0.00001 -> 0.0001
28 | 0.55554 -> 0.5555
29 | 0.55555 -> 0.5556
30 | 0.55556 -> 0.5556
31 |
32 | The implementation will look something like `math.Round(sampleRate*10000)/10000`.
33 | It is recommended to do that calculation once rather than every time the sampling rate is queried.
34 | This is to ensure we are consistent when [propagating](#propagation) the sampling rate through `tracestate`.
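35 |
36 | For example, a sketch of the normalization in Python (note that `math.floor(x + 0.5)` rounds half away from zero for non-negative values, which a plain `round()` does not):
37 |
38 | ```python
39 | import math
40 |
41 | def normalize_sample_rate(rate):
42 |     """Round to 4 decimals, half away from zero; keep tiny non-zero rates at 0.0001."""
43 |     if 0 < rate < 0.0001:
44 |         return 0.0001
45 |     return math.floor(rate * 10000 + 0.5) / 10000
46 | ```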
35 |
36 | | | |
37 | |----------------|---------|
38 | | Valid options | \[0,1\] |
39 | | Type | `float` |
40 | | Default | `1` |
41 | | Dynamic | `true` |
42 | | Central config | `true` |
43 |
44 | ### Effect on metrics
45 |
46 | At the time of making a sampling decision,
47 | the sampling rate must be recorded so that it can be associated with every transaction and span in the trace.
48 | The sampling rate will be used by the server for scaling transaction and span metrics.
49 |
50 | Transaction metrics will be used by the UI to display transaction distributions and throughput,
51 | from the perspective of the transaction's service (grouped by `service.name` and `transaction.name`).
52 |
53 | Span metrics will be used by the UI for inter-service request rates on service maps,
54 | from the perspective of the requesting service (grouped by `service.name` and `destination.service.resource`).
55 | These are also referred to as edge metrics.
56 |
57 | The server will calculate metrics by measuring only events from sampled traces,
58 | and scaling the metrics based on the sampling rate associated with each one.
59 | For example if the sampling rate is 0.5,
60 | then each sampled transaction and span would be counted twice in metrics aggregations.
61 |
62 | Metrics will be more accurate when the sampling rate is high.
63 | With lower sampling rates the server is able to calculate representative, but less accurate, metrics.
64 | If the sampling rate is 0 then no metrics will be calculated at all.
65 |
66 | When the sampling rate is available Agents MUST record the sampling rate on transactions and spans as `sample_rate`, e.g.
67 |
68 | {"transaction":{"name":"GET /","sample_rate":0.1,...}}
69 | {"span":{"name":"SELECT FROM table","sample_rate":0.1,...}}
70 |
71 | See [Propagation section of this document](#propagation) for details about the case when the sampling rate is not available.
72 |
73 | For non-sampled transactions the `sample_rate` field MUST be set to 0,
74 | to ensure non-sampled transactions are not counted in transaction metrics.
75 | This is important to avoid double-counting,
76 | as non-sampled transactions will be represented in metrics calculated from sampled transactions.
77 |
78 | When calculating transaction metrics,
79 | if the `sample_rate` transaction field is missing,
80 | the server will count each transaction (sampled and unsampled) as a single event.
81 | This is required for backwards compatibility with agents that do not send a sampling rate.
82 |
83 | The server will only calculate span metrics for newer agents that include `sample_rate` in spans,
84 | as otherwise the representative counts will be incorrect for sampling rates less than 1.
85 |
86 | ### Non-sampled transactions
87 |
88 | In the case where the server version is not known, agents MUST assume that the server version is pre 8.0 and keep
89 | sending non-sampled transactions.
90 |
91 | Given that the health-check call to the APM Server might not have returned, or might have failed, by the time unsampled
92 | transactions are started, this conservative behavior avoids discarding relevant data.
93 |
94 | #### Pre 8.0
95 | When connected to an APM Server < 8.0, both sampled and non-sampled transactions MUST be captured by Elastic APM agents.
96 | Sampling controls how much data is captured for transactions:
97 | sampled transactions have complete context recorded and include spans;
98 | non-sampled transactions have limited context and no spans.
99 |
100 | For non-sampled transactions set the transaction attributes `sampled: false` and `sample_rate: 0`, and omit `context`.
101 | No spans should be captured.
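102 |
103 | For example:
104 |
105 |     {"transaction":{"name":"GET /","sampled":false,"sample_rate":0,...}}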
102 |
103 | #### Post 8.0
104 | When connected to an APM Server 8.0+, agents SHOULD NOT send non-sampled transactions, or capture spans for these transactions.
105 | Sampled transactions MUST be captured by Elastic APM agents.
106 |
107 | ### Propagation
108 |
109 | As mentioned above, the sampling decision must be propagated throughout the trace.
110 | We adhere to the W3C Trace-Context spec for this, propagating the decision through trace-flags: https://www.w3.org/TR/trace-context/#sampled-flag
111 |
112 | In addition to propagating the sampling decision (boolean), agents MUST also propagate the sampling rate to ensure it is consistently attached to all events in the trace.
113 | This is achieved by adding an `s` attribute to our [`es` `tracestate` key](tracing-distributed-tracing.md#tracestate) with the value of the sampling rate.
114 | e.g.
115 |
116 | tracestate: es=s:0.1,othervendor=
117 |
118 | As `tracestate` has modest size limits we must keep the size down.
119 | This is ensured as the `transaction_sample_rate` configuration option has a maximum precision of 4 decimal places.
120 |
121 | For non-root transactions the agent MUST parse incoming `tracestate` headers to identify the `es` entry and extract the `s` attribute.
122 | The `s` attribute value should be used to populate the `sample_rate` field of transactions and spans.
123 | If there is no `tracestate` or no valid `es` entry with an `s` attribute,
124 | then the agent MUST omit `sample_rate` from non-root transactions and their spans.
125 |
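126 | A sketch of the extraction, assuming the `es` entry carries `;`-separated `key:value` attributes as described in the distributed tracing spec:
127 |
128 | ```python
129 | def parse_sample_rate(tracestate):
130 |     """Extract the `s` attribute of the `es` tracestate entry, or None."""
131 |     if not tracestate:
132 |         return None
133 |     for entry in tracestate.split(","):
134 |         key, _, value = entry.strip().partition("=")
135 |         if key != "es":
136 |             continue
137 |         for attribute in value.split(";"):
138 |             name, _, val = attribute.partition(":")
139 |             if name == "s":
140 |                 try:
141 |                     return float(val)
142 |                 except ValueError:
143 |                     return None
144 |     return None
145 | ```
146 |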
--------------------------------------------------------------------------------
/specs/agents/tracing-spans-destination.md:
--------------------------------------------------------------------------------
1 | ## Span destination
2 |
3 | The span destination information is relevant for exit spans and helps to identify the downstream service.
4 | This information is used for the [service map](https://www.elastic.co/guide/en/kibana/current/service-maps.html),
5 | the [dependencies table](https://www.elastic.co/guide/en/kibana/current/service-overview.html#service-span-duration) in the service overview,
6 | and the [APM SIEM integration](https://www.elastic.co/blog/elastic-apm-7-6-0-released).
7 |
8 | ### Destination service fields
9 |
10 | In `context.destination.service`, `_.name` and `_.type` fields are deprecated and replaced by `context.service.target.*` fields.
11 | See [related specification](tracing-spans-service-target.md) for more details.
12 |
13 | The only field still required is `context.destination.service.resource` until APM server is able to infer it.
14 |
15 | #### Deprecated fields
16 |
17 | - `context.destination.service.name` : deprecated but still required in protocol, thus value should be an empty string `""`.
18 | - `context.destination.service.type` : deprecated but still required in protocol, thus value should be an empty string `""`.
19 |
20 | Agents MUST NOT manually set these fields.
21 | Agents MUST NOT offer non-deprecated public APIs to set them.
22 |
23 | The intake JSON spec until 7.14.0 requires the deprecated fields to be present if `context.destination.service.resource` is set.
24 | Future versions of APM Server will remove the fields from the intake API and drop them if sent by agents.
25 |
26 | Agents MAY omit the deprecated fields when sending spans to an APM Server version >= 7.14.0.
27 | Otherwise, the fields MUST be serialized as empty strings if `context.destination.service.resource` is set.
28 | Both options result in the fields being omitted from the Elasticsearch document.
29 |
30 | #### Destination resource
31 |
32 | - `context.destination.service.resource` :
33 | - ES field: `span.destination.service.resource`
34 | - Identifies unique destinations for each service.
35 | - value should be inferred from `context.service.target.*` fields
36 | - required for compatibility with existing features (Service Map, Dependencies) that rely on it
37 | - might become optional in the future once APM server is able to infer the value from `context.service.target.*` fields.
38 |
39 | Spans representing an external call MUST have `context.destination.service` information.
40 | If the span represents a call to an in-memory database, the information SHOULD still be set.
41 |
42 | Agents SHOULD have a generic component used in all tests that validates that the destination information is present for exit spans.
43 | Rather than opting into the validation, the testing should provide an opt-out if,
44 | for whatever reason, the destination information can't or shouldn't be collected for a particular exit span.
45 |
46 | **Usage**
47 |
48 | Each unique resource will result in a node on the [service map](https://www.elastic.co/guide/en/kibana/current/service-maps.html).
49 | Also, APM Server will roll up metrics based on the resource.
50 | These metrics are currently used for the [dependencies table](https://www.elastic.co/guide/en/kibana/current/service-overview.html#service-span-duration)
51 | on the service overview page.
52 | There are plans to use the service destination metrics in the service map, too.
53 |
54 | The metrics are calculated based on the (head-based) sampled span documents that are sent to APM Server.
55 | That's why agents have to send the [`sample_rate`](tracing-sampling.md#effect-on-metrics)
56 | attribute for transactions and spans:
57 | It is used by APM Server to extrapolate the service destination metrics based on the (head-based) sampled spans.
58 |
59 | **Cardinality**
60 |
61 | To avoid a huge impact on storage requirements for metrics,
62 | and to not "spam" the service map with lots of fine-grained nodes,
63 | the cardinality has to be kept low.
64 | However, the cardinality should not be too low, either,
65 | so that different clusters, instances, and queues can be displayed separately in the service map.
66 |
67 | The cardinality should be the same as or higher than that of `span.destination.service.name`.
68 | Higher, if there are individual sub-resources for a service, such as individual queues for a message broker.
69 | Same cardinality otherwise.
70 |
71 | **API**
72 |
73 | Agents SHOULD offer a public API to set this field so that users can customize the value if the generic mapping is not
74 | sufficient. If set to `null` or an empty value, agents MUST omit the `span.destination.service` field altogether, thus
75 | providing a way to manually disable the automatic setting/inference of this field (e.g. in order to remove a node
76 | from a service map or an external service from the dependencies table).
77 | A user-supplied value MUST have the highest precedence, regardless of whether it was set before or after the automatic setting is invoked.
78 |
79 | **Value**
80 |
81 | For all [exit spans](tracing-spans.md#exit-spans), unless the `context.destination.service.resource` field was set by the user to `null` or an empty
82 | string through the API, agents MUST infer the value of this field based on properties that are set on the span.
83 |
84 | If no value is set to the `context.destination.service.resource` field, the logic for automatically inferring
85 | it MUST be the following:
86 |
87 | ```groovy
88 | if (!span.context.service.target.name)
89 | span.context.service.target.type
90 | else if (!span.context.service.target.type)
91 | span.context.service.target.name
92 | else if (span.type == 'external')
93 | // Special case for HTTP, gRPC, and other rpc.system spans: skip the
94 | // "${service.target.type}/" prefix.
95 | span.context.service.target.name
96 | else
97 | "${span.context.service.target.type}/${span.context.service.target.name}"
98 | ```
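99 |
100 | For example, a MySQL span with `service.target.type: "mysql"` and `service.target.name: "mydb"` yields the resource `mysql/mydb`, whereas an HTTP span (where `span.type == 'external'`) with `service.target.name: "example.com:443"` yields just `example.com:443`.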
99 |
100 | If an agent API was used to set the `context.destination.service.resource` to `null` or an empty string, agents MUST
101 | omit the `context.destination.service` field from the reported span event.
102 |
103 | The inference of `context.destination.service.resource` SHOULD be implemented in a central place within the agent,
104 | such as an on-span-end callback or the setter of a dependent property,
105 | rather than being implemented for each individual library integration/instrumentation.
106 |
107 | For specific technologies, the field MAY be set non-centrally.
108 | However, updating the generic inference logic SHOULD be preferred, if feasible.
109 | Setting the value within a specific library integration/instrumentation is perfectly fine if there's only one canonical library for it.
110 | Examples: gRPC and cloud-provider specific backends.
111 |
112 | ### Destination fields
113 |
114 | These fields are used within the APM/SIEM integration.
115 | They don't play a role for service maps.
116 |
117 | Spans representing an external call SHOULD have `context.destination` information if it is easy to gather.
118 |
119 | Examples when the effort of capturing the address and port is not justified:
120 | * When the underlying protocol-layer code is not readily available in the instrumented code.
121 | * When the instrumentation captures the exit event,
122 | but the actual client is not bound to a specific connection (e.g. a client that does load balancing).
123 |
124 | #### `context.destination.address`
125 |
126 | ES field: [`destination.address`](https://www.elastic.co/guide/en/ecs/current/ecs-destination.html#_destination_field_details)
127 |
128 | Address is the destination network address: a hostname (e.g. `localhost`), an FQDN (e.g. `elastic.co`), an IPv4 address (e.g. `127.0.0.1`), or an IPv6 address (e.g. `::1`).
129 |
130 | Agents MAY offer a public API to set this field so that users can override the automatically discovered one.
131 | This includes the ability to set `null` or empty value in order to unset the automatically-set value.
132 | A user-supplied value MUST have the highest precedence, regardless of whether it was set before or after the automatic setting is invoked.
133 |
134 | #### `context.destination.port`
135 |
136 | ES field: [`destination.port`](https://www.elastic.co/guide/en/ecs/current/ecs-destination.html#_destination_field_details)
137 |
138 | Port is the destination network port (e.g. 443)
139 |
140 | Agents MAY offer a public API to set this field so that users can override the automatically discovered one.
141 | This includes the ability to set a non-positive value in order to unset the automatically-set value.
142 | A user-supplied value MUST have the highest precedence, regardless of whether it was set before or after the automatic setting is invoked.
143 |
--------------------------------------------------------------------------------
/specs/agents/tracing-transaction-grouping.md:
--------------------------------------------------------------------------------
1 | ## Transaction Grouping
2 |
3 | Even though agents should choose a transaction name that has a reasonable cardinality,
4 | they can't always guarantee that.
5 | For example,
6 | when the auto-instrumentation of a job scheduling framework sets the transaction name to the name of the instrumented job,
7 | the agent has no control over the job name itself.
8 | While usually the job name is expected to have low cardinality,
9 | users might set dynamic parts as part of the job name, such as a UUID.
10 |
11 | In order to give users an option to group transactions whose name contain dynamic parts that don't require code changes,
12 | agents MAY implement the following configuration option:
13 |
14 | ### `transaction_name_groups` configuration
15 |
16 | With this option,
17 | you can group transaction names that contain dynamic parts with a wildcard expression.
18 | For example,
19 | the pattern `GET /users/*/cart` would consolidate transactions,
20 | such as `GET /users/42/cart` and `GET /users/73/cart`, into a single transaction name `GET /users/*/cart`, hence reducing the transaction name cardinality.
21 | The first matching expression wins, so make sure to place more specific expressions before more generic ones, for example: `GET /users/*/cart, GET /users/*`.
22 |
23 | | | |
24 | |----------------|------------------------------------------------------------------------------------------|
25 | | Type | `List<`[`WildcardMatcher`](../../tests/agents/json-specs/wildcard_matcher_tests.json)`>` |
26 | | Default | `` |
27 | | Dynamic | `true` |
28 | | Central config | `true` |
29 |
30 | The `url_groups` option that the Java and PHP agents offered is deprecated in favor of `transaction_name_groups`.
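31 |
32 | A sketch of the grouping lookup, using a simplified wildcard matcher (real agents implement the full semantics of [wildcard_matcher_tests.json](../../tests/agents/json-specs/wildcard_matcher_tests.json)):
33 |
34 | ```python
35 | import re
36 |
37 | def wildcard_match(pattern, value):
38 |     # Simplified: '*' matches any substring, comparison is case-insensitive.
39 |     regex = "^" + ".*".join(re.escape(part) for part in pattern.split("*")) + "$"
40 |     return re.match(regex, value, re.IGNORECASE) is not None
41 |
42 | def apply_name_groups(name, groups):
43 |     """Return the first matching group pattern, or the original name."""
44 |     for pattern in groups:
45 |         if wildcard_match(pattern, name):
46 |             return pattern
47 |     return name
48 | ```
49 |
50 | For example, `apply_name_groups("GET /users/42/cart", ["GET /users/*/cart", "GET /users/*"])` returns `GET /users/*/cart`.
51 |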
31 | ### When to apply the grouping
32 |
33 | The grouping can be applied either every time the transaction name is set, or lazily, when the transaction name is read.
34 |
35 | It's not sufficient to only apply the grouping when the transaction ends.
36 | That's because when an error is tracked, the transaction name is copied to the error object.
37 | See also [the error spec](error-tracking.md).
38 |
39 | Agents MUST also ensure that the grouping is applied before breakdown metrics are reported.
40 |
--------------------------------------------------------------------------------
/specs/agents/tracing-transactions.md:
--------------------------------------------------------------------------------
1 | ## Transactions
2 |
3 | Transactions are a special kind of span.
4 | They represent the entry into a service.
5 | They are sometimes also referred to as local roots or entry spans.
6 |
7 | Transactions are created either by an agent's built-in auto-instrumentation or via the [tracer API](tracing-api.md).
8 |
9 | ### Transaction outcome
10 |
11 | The `outcome` property denotes whether the transaction represents a success or a failure from the perspective of the entity that produced the event.
12 | The APM Server converts this to the [`event.outcome`](https://www.elastic.co/guide/en/ecs/current/ecs-allowed-values-event-outcome.html) field.
13 | This property is optional to preserve backwards compatibility.
14 | If an agent doesn't report the `outcome` (or reports `null`), the APM Server will set it based on `context.http.response.status_code`. If the status code is not available, then it will be set to `"unknown"`.
15 |
16 | - `"failure"`: Indicates that this transaction describes a failed result. \
17 | Note that client errors (such as HTTP 4xx) don't fall into this category as they are not an error from the perspective of the server.
18 | - `"success"`: Indicates that this transaction describes a successful result.
19 | - `"unknown"`: Indicates that there's no information about the outcome.
20 | This is the default value that applies when an outcome has not been set explicitly.
21 | This may be the case when a user tracks a custom transaction without explicitly setting an outcome.
22 | For existing auto-instrumentations, agents should set the outcome either to `"failure"` or `"success"`.
23 |
24 | What counts as a failed or successful request depends on the protocol.
25 |
26 | The following protocols get their outcome from protocol-level attributes:
27 |
28 | - [gRPC](tracing-instrumentation-grpc.md#outcome)
29 | - [HTTP](tracing-instrumentation-http.md#outcome)
30 |
31 | For other protocols, we can default to the following behavior:
32 |
33 | - `failure` when an error is reported
34 | - `success` otherwise
35 |
36 | #### Error rate
37 |
38 | The error rate of a transaction group is based on the `outcome` of its transactions.
39 |
40 | error_rate = failure / (failure + success)
41 |
42 | Note that when calculating the error rate,
43 | transactions with an `unknown` or non-existent outcome are not considered.
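44 |
45 | For example, 10 `failure` and 40 `success` transactions give an error rate of 10 / (10 + 40) = 20%, regardless of how many additional transactions had an `unknown` outcome.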
44 |
45 | The calculation just looks at the subset of transactions where the result is known and extrapolates the error rate for the total population.
46 | This prevents `unknown` or non-existent outcomes from reducing the error rate,
47 | which would otherwise happen when looking at a mix of old and new agents,
48 | or when looking at RUM data (as page load transactions have an `unknown` outcome).
49 |
50 | Also note that this only reflects the error rate as perceived from the application itself.
51 | The error rate perceived by its clients is greater than or equal to that.
52 |
53 | #### Outcome API
54 |
55 | Agents should expose an API to manually override the outcome.
56 | This value must always take precedence over the automatically determined value.
57 | The documentation should clarify that transactions with `unknown` outcomes are ignored in the error rate calculation.
58 |
--------------------------------------------------------------------------------
/specs/agents/transport.md:
--------------------------------------------------------------------------------
1 | ## Transport
2 |
3 | Agents send data to the APM Server as JSON (application/json) or ND-JSON (application/x-ndjson) over HTTP. We describe here various details to guide transport implementation.
4 |
5 | ### User-Agent
6 |
7 | In order to help debugging and gathering usage statistics, agents should use one of the following values for the `User-Agent` HTTP header:
8 |
9 | - The header value should start with the agent's GitHub repository name as a prefix, plus the agent version: `apm-agent-${language}/${agent.version}`.
10 | - If both `service.name` and `service.version` are set, append ` (${service.name} ${service.version})`
11 | - If only `service.name` is set, append ` (${service.name})`
12 |
13 | An executable gherkin specification is also provided in [user_agent.feature](../../tests/agents/gherkin-specs/user_agent.feature).
14 |
15 | Examples:
16 | - `apm-agent-java/v1.25.0`
17 | - `apm-agent-ruby/4.4.0 (myservice)`
18 | - `apm-agent-python/6.4.0 (myservice v42.7)`
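19 |
20 | A sketch of the header construction (the helper name is illustrative):
21 |
22 | ```python
23 | def user_agent(language, agent_version, service_name=None, service_version=None):
24 |     """Build the User-Agent header value according to the rules above."""
25 |     value = f"apm-agent-{language}/{agent_version}"
26 |     if service_name and service_version:
27 |         value += f" ({service_name} {service_version})"
28 |     elif service_name:
29 |         value += f" ({service_name})"
30 |     return value
31 | ```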
19 |
20 | ### Background sending
21 |
22 | In order to avoid impacting application performance and behaviour, agents should (where possible) send data in a non-blocking manner, e.g. via a background thread/goroutine/process/what-have-you, or using asynchronous I/O.
23 |
24 | If data is sent by a background process, then there must be some kind of queuing between that background process and the application code. The queue should be limited in size to avoid memory exhaustion. In the event that the queue fills up, agents must drop events: either drop old events or simply stop recording new events.
25 |
26 | ### Batching/streaming data
27 |
28 | With the exception of the RUM agent (which does not maintain long-lived connections to the APM Server), agents should use the ND-JSON format. The ND-JSON format enables agents to stream data to the server as it is being collected, with one event being encoded per line. This format is supported since APM Server 6.5.0.
29 |
30 | Agents should implement one of two methods for sending events to the server:
31 |
32 | - batch events together and send a complete request after a given size is reached, or amount of time has elapsed
33 | - start streaming events immediately to the server using a chunked-encoding request, and end the request after a given amount of data has been sent, or amount of time has elapsed
34 |
35 | The streaming approach is preferred. There are two configuration options that agents should implement to control when data is sent:
36 |
37 | - [ELASTIC_APM_API_REQUEST_TIME](https://www.elastic.co/guide/en/apm/agent/python/current/configuration.html#config-api-request-time)
38 | - [ELASTIC_APM_API_REQUEST_SIZE](https://www.elastic.co/guide/en/apm/agent/python/current/configuration.html#config-api-request-size)
39 |
40 | All events can be streamed as described in the [Intake API](https://www.elastic.co/guide/en/apm/server/current/intake-api.html) documentation. Each line encodes a single event, with the first line in a stream encoding the special metadata "event" which is folded into all following events. This metadata "event" is used to describe static properties of the system, process, agent, etc.
41 |
42 | When the batching approach is employed, unhandled exceptions/unexpected errors should typically be sent immediately to ensure timely error visibility, and to avoid data loss due to process termination. Even when using streaming there may be circumstances in which the agent should block the application until events are sent, but this should be both rare and configurable, to avoid interrupting normal program operation. For example, an application may terminate itself after logging a message at "fatal" level. In such a scenario, it may be useful for the agent to optionally block until enqueued events are sent prior to process termination.
43 |
44 | ### Transport errors
45 |
46 | If the HTTP response status code isn’t 2xx or if a request is prematurely closed (either on the TCP or HTTP level) the request MUST be considered failed.
47 |
48 | When a request fails, the agent has no way of knowing exactly what data was successfully processed by the APM Server. And since the agent doesn’t keep a copy of the data that was sent, there’s no way for the agent to re-send any data. Furthermore, as the data waiting to be sent is already compressed, it’s impractical to recover any of it in a way so that it can be sent over a new HTTP request.
49 |
50 | The agent should therefore drop the entire compressed buffer: both the internal zlib buffer, and potentially the already compressed data if such data is also buffered. Data subsequently written to the compression library can be directed to a new HTTP request.
51 |
52 | The new HTTP request should not necessarily be started immediately after the previous HTTP request fails, as the reason for the failure might not have been resolved up-stream. Instead an incremental back-off algorithm SHOULD be used to delay new requests. The grace period should be calculated in seconds using the algorithm `min(reconnectCount++, 6) ** 2 ± 10%`, where `reconnectCount` starts at zero. So the delay after the first error is 0 seconds, then circa 1, 4, 9, 16, 25 and finally 36 seconds. We add ±10% jitter to the calculated grace period in case multiple agents entered the grace period simultaneously. This way they will not all try to reconnect at the same time.
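53 |
54 | A sketch of the delay computation (the helper name is illustrative):
55 |
56 | ```python
57 | import random
58 |
59 | def grace_period_seconds(reconnect_count):
60 |     """0 after the first error, then ~1, ~4, ~9, ~16, ~25, and finally ~36s."""
61 |     base = min(reconnect_count, 6) ** 2
62 |     return base + random.uniform(-0.1, 0.1) * base  # +/-10% jitter
63 | ```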
53 |
54 | Agents should support specifying multiple server URLs. When a transport error occurs, the agent should switch to another server URL at the same time as backing off.
55 |
56 | While the grace period is in effect, the agent may buffer the data that would have been sent had the grace period not been in effect. If buffering, the agent must ensure the memory used to buffer data does not grow indefinitely.
57 |
58 | ### Compression
59 |
60 | The APM Server accepts both uncompressed and compressed HTTP requests. The following compression formats are supported:
61 |
62 | - zlib data format (`Content-Encoding: deflate`)
63 | - gzip data format (`Content-Encoding: gzip`)
64 |
65 | Agents MUST compress the HTTP payload and SHOULD optimize for speed over compactness (typically known as the "best speed" level).
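66 |
67 | For example, with Python's standard library, the lowest gzip compression level corresponds to "best speed":
68 |
69 | ```python
70 | import gzip
71 |
72 | def compress_payload(ndjson_bytes):
73 |     """Compress an ND-JSON payload, favoring speed over compactness."""
74 |     return gzip.compress(ndjson_bytes, compresslevel=1)  # 1 == best speed
75 | ```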
66 |
67 | If the host part of the APM Server URL is either `localhost`, `127.0.0.1`, `::1`, or `0:0:0:0:0:0:0:1`, agents SHOULD disable compression.
68 | Agents MUST NOT use the compression level `NO_COMPRESSION` to disable compression.
69 | That's because the [Lambda extension](https://github.com/elastic/apm-aws-lambda/tree/main/apm-lambda-extension)
70 | would otherwise consider the data as being compressed (due to the `Content-Encoding` header) and send data to APM Server that's actually uncompressed.
71 |
72 | ### `context_propagation_only` configuration
73 |
74 | | | |
75 | |----------------|---|
76 | | Type | `boolean` |
77 | | Default | `false` |
78 | | Dynamic | `true` |
79 | | Central config | `true` |
80 |
81 | Agents MAY implement this configuration option.
82 | `context_propagation_only` is a boolean configuration option to have an APM
83 | agent perform trace-context propagation and log correlation *only*; and to
84 | explicitly *not* send event data to APM server. This allows an application to
85 | get automatic context propagation and log correlation, **without** having
86 | deployed an APM server for event collection.
87 |
88 | Agents that implement this configuration option:
89 |
90 | - MUST continue to propagate trace headers (`traceparent`, `tracestate`, etc.)
91 |   as normal;
92 | - MUST start a new trace (i.e. generate a trace-id) where they would normally start a transaction if no `traceparent` header is present, and propagate it;
93 | - MUST continue to support [log correlation](./log-correlation.md);
94 | - MUST NOT send event data to the APM server;
95 | - SHOULD attempt to reduce runtime overhead where possible. For example,
96 | because events will be dropped there is no need to collect stack traces,
97 | collect metrics, calculate breakdown metrics, or to create spans (other than
98 | the top-level transaction required for context propagation, similarly to non-sampled traces).
99 |
100 |
101 | ### `disable_send` configuration
102 |
103 | Agents MAY implement this configuration option.
104 | `disable_send` is a boolean configuration option to have an APM agent be fully
105 | functioning, but not communicate with an APM server. Use cases for this include
106 | testing and continuous integration (CI) systems.
107 |
108 | Agents that implement this configuration option:
109 |
110 | - MUST NOT attempt to communicate with APM server. This includes central configuration.
111 | - MUST NOT log warnings/errors related to failures to communicate with APM server.
112 | - SHOULD otherwise perform all functions.
113 |
--------------------------------------------------------------------------------
/specs/agents/uml/kafka_consume.puml:
--------------------------------------------------------------------------------
1 | @startuml kafka_consume
2 | hide footbox
3 | participant "Application (Consumer)" as app
4 | participant "APM agent" as apm
5 | queue Kafka as queue
6 |
7 | activate app
8 | loop while true
9 | activate apm #00BFB3
10 | apm -> apm: transaction.End()
11 | deactivate apm
12 |
13 | app -> queue: **consumer.Consume()**
14 | deactivate app
15 | activate queue
16 |
17 | group Message processing flow
18 | ... ~~blocking operation~~ ...
19 | queue --> app: message
20 |
21 | deactivate queue
22 | activate app
23 |
24 | apm -> apm: transaction.Start()
25 | activate apm #00BFB3
26 | end
27 |
28 | deactivate app
29 | deactivate apm
30 | end
31 | @enduml
--------------------------------------------------------------------------------
/specs/agents/uml/publish.puml:
--------------------------------------------------------------------------------
1 | @startuml publish
2 | hide footbox
3 | participant "Application (Publisher)" as app
4 | participant "APM agent" as apm
5 | queue "Messaging system" as queue
6 |
7 | activate app
8 | activate apm #00BFB3
9 | app -> queue: **publish message**
10 | deactivate app
11 | activate queue
12 | activate apm #1BA9F5
13 | apm -> apm: span.Start()
14 | note left of apm
15 | **Capture new span**
16 |
17 | There is an active transaction
18 | end note
19 | apm -> apm: span.End()
20 | queue --> app
21 | deactivate queue
22 | activate app
23 | deactivate apm
24 |
25 | @enduml
--------------------------------------------------------------------------------
/specs/agents/uml/publish.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/specs/integrations/synthetics.md:
--------------------------------------------------------------------------------
1 | ## Synthetics Integration
2 |
3 | Synthetic monitors play a crucial role in periodically checking the status of your services and applications on a global scale. General documentation about synthetic monitors can be found in
4 | [Synthetics getting started page](https://www.elastic.co/guide/en/observability/current/synthetics-get-started.html).
5 |
6 | This integration goes into more detail about how the Synthetics monitors are
7 | correlated with the APM traces. Synthetics traces can be categorized into two
8 | main types:
9 | 1. HTTP checks - These have a one-to-one mapping with APM transactions
10 | 2. Browser checks - These have a one-to-many mapping with APM transactions
11 |
12 | ### Correlation
13 |
14 | The Synthetics agent (including Heartbeat) is responsible for creating the
15 | [`traceparent`](../agents/tracing-distributed-tracing.md#trace_id-parent_id-and-traceparent)
16 | header for each outgoing network request associated with a test during every
17 | monitor execution.
18 |
19 | - `trace.id` and `parent.id`
20 |   - outgoing requests that are being explicitly traced by the Synthetics agent
21 | will have the `parent.id` and `trace.id` as part of the trace context.
22 | - must be unique for each step for a browser monitor
23 |   - must be unique for an HTTP monitor
24 | - `sampled` Flag
25 | - used to control the sampling decision for all the downstream services.
26 | - 100% sampling when tracing is enabled
27 |
28 | #### Browser checks
29 |
30 | When executing a Synthetics journey with tracing enabled for all outgoing requests (`**/*`) or for specific URLs via the `--apm_tracing_urls` flag, the Synthetics agent takes the following actions:
31 |
32 | 1. Adds the `traceparent` header to each matching outgoing request.
33 | 2. Includes `trace.id` and `parent.id` in all the step Elasticsearch (ES) documents for the journey.
34 |
35 | ```ts
36 | // run journey
37 | npx @elastic/synthetics --apm_tracing_urls "elastic.co/*"
38 |
39 | // example.journey.ts
40 | journey("elastic e2e", ({ page }) => {
41 | step("home page", async() => {
42 | await page.goto("https://www.elastic.co")
43 | })
44 | step("blog page", async() => {
45 | await page.goto("https://www.elastic.co/blog")
46 | })
47 | })
48 | ```
49 |
50 | Example of the tracing information added to the ES documents for two steps in the journey:
51 |
52 | ```json
53 | // Step - homepage
54 | {"type":"step/end","journey":{"name":"elastic e2e"},"step":{"name":"home page","index":1,"status":"failed","duration":{"us":17382122}}, "trace.id": "t1"}
55 | {"type":"journey/network_info","journey":{"name":"elastic e2e"},"step":{"name":"home page","index":1},"http":{"request":{"url":"http://www.elastic.co/","method":"GET"}},"trace.id": "t1", "span.id": "s1"}
56 |
57 |
58 | // Step - blog page
59 | {"type":"step/end","journey":{"name":"elastic e2e"},"step":{"name":"blog page","index":2,"status":"failed","duration":{"us":17382122}}, "trace.id": "t2"}
60 | {"type":"journey/network_info","journey":{"name":"elastic e2e"},"step":{"name":"blog page","index":2},"http":{"request":{"url":"http://www.elastic.co/blog","method":"GET"}},"trace.id": "t2", "span.id": "s2"}
61 | ```
62 |
63 | With this tracing information available in the ES documents for each step's network requests, the Synthetics UI can link back to the individual backend transactions in APM.
64 |
65 | #### HTTP Checks
66 |
67 | For the below HTTP monitor
68 |
69 | ```yml
70 | # heartbeat.yml
71 | heartbeat.monitors:
72 | - type: http
73 | id: test-http
74 | urls: ["https://www.example.com"]
75 | apm:
76 | enabled: true
77 | ```
78 |
79 | Heartbeat would add the `traceparent` header to the monitored URL and add the
80 | other tracing-related information to the ES documents.
81 |
82 | ```json
83 | {"event":{"action":"monitor.run"},"monitor":{"id":"test-http","type":"http","status":"up","duration":{"ms":112}}, "trace.id": "t1", "span.id": "s1"}
84 | ```
85 |
86 | It's important to note that there is no dedicated waterfall information for the HTTP checks in the Synthetics UI. Consequently, the linking here will directly take you to the transaction if the backend is also traced by Elastic APM or OpenTelemetry (OTel)-based agents. This works similarly to the Browser checks, where the network request is directly linked to the transaction.
87 |
88 | **NOTE: The correlation remains applicable even if downstream services are traced by OpenTelemetry (OTel)-based agents. This ensures a consistent and seamless tracing experience regardless of the underlying tracing infrastructure.**
89 |
90 | ### Identifying Synthetics trace
91 |
92 | When tracing is enabled on the Synthetics monitors, the agent appends `Elastic/Synthetics` to the HTTP `User-Agent` header for all outgoing requests. The tracing UI can use this information to identify the traces that originated from
93 | Synthetics, using the following approaches.
94 |
95 | - Elastic APM agents
96 | - The information is stored in `http.headers.user-agent`
97 | - OTel agents
98 | - The information is stored in `user_agent.original`
99 |
100 | The UI will check both of these fields to identify the Synthetics traces and will
101 | prefer `user_agent.original` if both are present.
102 |
103 | There is a limitation with this approach:
104 | - users can override the `User-Agent` header in the monitor configuration, which
105 |   might lead to users seeing only partial traces in the APM UI.
106 |
107 | When a trace is confirmed to have originated from Synthetics-based monitors, the
108 | Trace Explorer view can link back to the Synthetics waterfall.
109 |
110 | - `/app/synthetics/link-to/:span.id`
111 | - links back to the explicit browser waterfall step on the Synthetics UI, and
112 | it follows the format `/monitor/:monitorId/test-run/:runId/step/:stepIndex#:spanId`.
--------------------------------------------------------------------------------
/specs/terminology.md:
--------------------------------------------------------------------------------
1 | # Terminology
2 |
3 | This document describes terms and concepts often used within the APM ecosystem.
4 |
5 | #### APM
6 | Application Performance Monitoring is the concept of profiling and monitoring services and applications. For instance, it accounts for things like requests per second, but not how much free space is on a disk.
7 |
8 | #### Distributed Tracing
9 | Distributed tracing makes it possible to analyze performance throughout a microservice architecture all in one view. This is accomplished by tracing all of the requests - from the initial web request in the front-end service - to queries made through multiple back-end services. [Further reading](https://www.elastic.co/guide/en/apm/get-started/current/distributed-tracing.html)
10 |
11 | #### Instrumentation
12 | To be able to monitor an application it needs to be _instrumented_ by an APM Agent.
13 | Usually not everything is instrumented because that would incur a very large overhead. We are interested only in performance-sensitive activities, like database queries or web requests. Instrumentation can happen automatically (the Elastic APM Agents instrument many frameworks and databases automatically), or manually with a programmatic API.
14 |
15 | #### Library frames vs App frames
16 |
17 | > A **stack frame** is a frame of data that gets pushed onto the stack. In the case of a call stack, a stack frame would represent a function call and its argument data ([Source](https://stackoverflow.com/a/10057535/434980))
18 |
19 | We distinguish between the user's own code, and the code of the user's dependencies. Often the user is more interested in stack frames from their own code, so these are highlighted.
20 | [Further reading](https://www.elastic.co/guide/en/apm/agent/nodejs/master/performance-tuning.html#performance-source-lines)
21 |
22 | #### Real User Monitoring (RUM)
23 | Real User Monitoring (RUM) tries to capture the real user’s experience with the application. Generally this means monitoring the application on the user’s machine e.g. their browsers or mobile devices. The APM agent used in the browser is called [RUM agent](https://www.elastic.co/guide/en/apm/agent/rum-js/4.x/intro.html)
24 |
25 | #### Service
26 | The application/service being instrumented by APM. A service is uniquely identified by name + environment.
27 |
28 | #### Time to glass
29 | The time from when an event occurs in an application until it is visible to the user in the APM UI.
30 |
31 | ## Elastic APM Architecture
32 |
33 | The Elastic APM offering consists of APM Agents, APM Server, APM UI and Kibana dashboards.
34 |
35 | #### APM Agent
36 | An APM agent lives inside an application and will automatically collect APM data (transactions, spans, metrics and errors) and send it to the APM Server.
37 | [Further reading](https://www.elastic.co/guide/en/apm/get-started/current/components.html#_apm_agents)
38 |
39 | #### APM Server
40 | The APM Server receives data from the Elastic APM agents and stores the data into Elasticsearch.
41 | [Further reading](https://www.elastic.co/guide/en/apm/get-started/current/components.html#_apm_server)
42 |
43 | #### APM UI
44 | APM UI is a Kibana plugin which reads captured APM data from Elasticsearch, and displays it in curated charts and tables that allow the user to easily compare and debug their applications. This UI is only available with an Elastic Basic License.
45 | [Further reading](https://www.elastic.co/guide/en/kibana/7.3/xpack-apm.html)
46 |
47 | #### Kibana dashboards (aka APM dashboards)
48 | Custom Kibana dashboards made for APM. These used to be bundled with Kibana but are now located in the [apm-contrib repo](https://github.com/elastic/apm-contrib/tree/471ef577fe6ae583d49ced4b2047a3763fac7a7b/kibana). The dashboards are available without an Elastic license.
49 |
50 | ## APM documents
51 |
52 | #### Span
53 | Spans contain information about a specific code path that has been executed. They measure from the start to end of an activity, and they can have a parent/child relationship with other spans.
54 | [Further reading](https://www.elastic.co/guide/en/apm/get-started/current/transaction-spans.html)
55 |
56 | #### Transaction
57 | Transactions are a special kind of span that have additional attributes associated with them. They describe an event captured by an Elastic APM agent instrumenting a service. You can think of transactions as the highest level of work you’re measuring within a service.
58 | [Further reading](https://www.elastic.co/guide/en/apm/get-started/current/transactions.html)
59 |
60 | #### Trace
61 | A trace is a grouping of spans and transactions that all share the same `trace.id`.
62 |
63 | ## Sampling
64 |
65 | To reduce processing and storage overhead, transactions may be "sampled".
66 | Non-sampled transactions and spans will not be stored in Elasticsearch.
67 |
68 | In versions prior to 8.0, non-sampled transactions were stored, but no context was recorded and no related spans were captured.
69 |
70 | #### Adaptive sampling
71 | TODO
72 |
73 | #### Head based sampling
74 | Deciding whether to sample a trace before it has started. The decision to sample will often be very simplistic e.g. sample 1% of traces.
75 |
76 | #### Tail-based sampling
77 | Deciding whether to sample a trace after it has ended. This makes it possible to sample based on _interesting_ events like error occurrence, latency etc.
78 |
79 |
--------------------------------------------------------------------------------
/tests/agents/README.md:
--------------------------------------------------------------------------------
1 | # APM test fixtures
2 |
3 | Files provided here may be used by agents to ensure matching results across languages/agents.
4 |
5 | ## SQL signatures
6 |
7 | SQL-based data stores' span names are abbreviated versions of their queries, e.g. `SELECT * FROM users WHERE id=1` becomes `SELECT FROM users`.
8 |
9 | ### For precision
10 |
11 | - `tests/agents/sql_token_examples.json`
12 | - `tests/agents/sql_signature_examples.json`
13 |
14 | To get similar results across agents a set of `input -> expected output` examples are provided here as JSON files.
15 |
16 | Using these or complying with them isn't a requirement.
17 |
18 | - Reference issue: [elastic/apm#12](https://github.com/elastic/apm/issues/12).
19 | - Reference doc: [RFC: SQL parsing](https://docs.google.com/document/d/1sblkAP1NHqk4MtloUta7tXjDuI_l64sT2ZQ_UFHuytA/)
20 |
21 | ### For performance
22 |
23 | See [this distribution](https://github.com/johnthebrave/nlidb-datasets) for example queries.
24 |
--------------------------------------------------------------------------------
/tests/agents/gherkin-specs/api_key.feature:
--------------------------------------------------------------------------------
1 | Feature: APM server authentication with API key and secret token
2 |
3 | Scenario: A configured API key is sent in the Authorization header
4 | Given an agent configured with
5 | | setting | value |
6 | | api_key | RTNxMjlXNEJt |
7 | When the agent sends a request to APM server
8 | Then the Authorization header of the request is 'ApiKey RTNxMjlXNEJt'
9 |
10 | Scenario: A configured secret token is sent in the Authorization header
11 | Given an agent configured with
12 | | setting | value |
13 | | secret_token | secr3tT0ken |
14 | When the agent sends a request to APM server
15 | Then the Authorization header of the request is 'Bearer secr3tT0ken'
16 |
17 | Scenario: A configured API key takes precedence over a secret token
18 | Given an agent configured with
19 | | setting | value |
20 | | api_key | MjlXNEJasdfDt |
21 | | secret_token | secr3tT0ken |
22 | When the agent sends a request to APM server
23 | Then the Authorization header of the request is 'ApiKey MjlXNEJasdfDt'
24 |
25 |
--------------------------------------------------------------------------------
/tests/agents/gherkin-specs/azure_app_service_metadata.feature:
--------------------------------------------------------------------------------
1 | Feature: Extracting Metadata for Azure App Service
2 |
3 | Background:
4 | Given an agent configured with
5 | | setting | value |
6 | | cloud_provider | azure |
7 |
8 | Scenario Outline: Azure App Service with all environment variables present in expected format
9 | Given the following environment variables are present
10 | | name | value |
11 |       | WEBSITE_OWNER_NAME     | <WEBSITE_OWNER_NAME> |
12 | | WEBSITE_RESOURCE_GROUP | resource_group |
13 | | WEBSITE_SITE_NAME | site_name |
14 | | WEBSITE_INSTANCE_ID | instance_id |
15 | When cloud metadata is collected
16 | Then cloud metadata is not null
17 | And cloud metadata 'account.id' is 'f5940f10-2e30-3e4d-a259-63451ba6dae4'
18 | And cloud metadata 'provider' is 'azure'
19 | And cloud metadata 'instance.id' is 'instance_id'
20 | And cloud metadata 'instance.name' is 'site_name'
21 | And cloud metadata 'project.name' is 'resource_group'
22 | And cloud metadata 'region' is 'AustraliaEast'
23 | Examples:
24 | | WEBSITE_OWNER_NAME |
25 | | f5940f10-2e30-3e4d-a259-63451ba6dae4+elastic-apm-AustraliaEastwebspace |
26 | | f5940f10-2e30-3e4d-a259-63451ba6dae4+appsvc_linux_australiaeast-AustraliaEastwebspace-Linux |
27 |
28 | # WEBSITE_OWNER_NAME is expected to include a + character
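29 | # As the scenarios in this file suggest, the value has the shape
30 | # '{account.id}+{...}-{Region}webspace[-{OS}]': the account id is the part before
31 | # the '+', and the region is recovered from the suffix by stripping 'webspace'.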
29 | Scenario: WEBSITE_OWNER_NAME environment variable not expected format
30 | Given the following environment variables are present
31 | | name | value |
32 | | WEBSITE_OWNER_NAME | f5940f10-2e30-3e4d-a259-63451ba6dae4-elastic-apm-AustraliaEastwebspace |
33 | | WEBSITE_RESOURCE_GROUP | resource_group |
34 | | WEBSITE_SITE_NAME | site_name |
35 | | WEBSITE_INSTANCE_ID | instance_id |
36 | When cloud metadata is collected
37 | Then cloud metadata is null
38 |
39 | Scenario: Missing WEBSITE_OWNER_NAME environment variable
40 | Given the following environment variables are present
41 | | name | value |
42 | | WEBSITE_RESOURCE_GROUP | resource_group |
43 | | WEBSITE_SITE_NAME | site_name |
44 | | WEBSITE_INSTANCE_ID | instance_id |
45 | When cloud metadata is collected
46 | Then cloud metadata is null
47 |
48 | Scenario: Missing WEBSITE_RESOURCE_GROUP environment variable
49 | Given the following environment variables are present
50 | | name | value |
51 | | WEBSITE_OWNER_NAME | f5940f10-2e30-3e4d-a259-63451ba6dae4+elastic-apm-AustraliaEastwebspace |
52 | | WEBSITE_SITE_NAME | site_name |
53 | | WEBSITE_INSTANCE_ID | instance_id |
54 | When cloud metadata is collected
55 | Then cloud metadata is null
56 |
57 | Scenario: Missing WEBSITE_SITE_NAME environment variable
58 | Given the following environment variables are present
59 | | name | value |
60 | | WEBSITE_OWNER_NAME | f5940f10-2e30-3e4d-a259-63451ba6dae4+elastic-apm-AustraliaEastwebspace |
61 | | WEBSITE_RESOURCE_GROUP | resource_group |
62 | | WEBSITE_INSTANCE_ID | instance_id |
63 | When cloud metadata is collected
64 | Then cloud metadata is null
65 |
66 | Scenario: Missing WEBSITE_INSTANCE_ID environment variable
67 | Given the following environment variables are present
68 | | name | value |
69 | | WEBSITE_OWNER_NAME | f5940f10-2e30-3e4d-a259-63451ba6dae4+elastic-apm-AustraliaEastwebspace |
70 | | WEBSITE_RESOURCE_GROUP | resource_group |
71 | | WEBSITE_SITE_NAME | site_name |
72 | When cloud metadata is collected
73 | Then cloud metadata is null
--------------------------------------------------------------------------------
/tests/agents/gherkin-specs/azure_functions_metadata.feature:
--------------------------------------------------------------------------------
1 | Feature: Extracting Metadata for Azure Function Apps
2 |
3 | Background:
4 | Given an agent configured with
5 | | setting | value |
6 | | cloud_provider | azure |
7 |
8 |   Scenario: Azure Function App with minimum set of environment variables present in expected format
9 | Given the following environment variables are present
10 | | name | value |
11 | | FUNCTIONS_EXTENSION_VERSION | version |
12 | | WEBSITE_OWNER_NAME | d2cd53b3-acdc-4964-9563-3f5201556a81+faas_group-CentralUSwebspace-Linux |
13 | | WEBSITE_SITE_NAME | site_name |
14 | When cloud metadata is collected
15 | Then cloud metadata is not null
16 | And cloud metadata 'account.id' is 'd2cd53b3-acdc-4964-9563-3f5201556a81'
17 | And cloud metadata 'provider' is 'azure'
18 | And cloud metadata 'service.name' is 'functions'
19 | And cloud metadata 'instance.name' is 'site_name'
20 | And cloud metadata 'project.name' is 'faas_group'
21 | And cloud metadata 'region' is 'CentralUS'
22 |
23 |   Scenario: Azure Function App with typical set of environment variables present in expected format
24 | Given the following environment variables are present
25 | | name | value |
26 | | FUNCTIONS_EXTENSION_VERSION | version |
27 | | WEBSITE_OWNER_NAME | d2cd53b3-acdc-4964-9563-3f5201556a81+faas_group-CentralUSwebspace-Linux |
28 | | WEBSITE_SITE_NAME | site_name |
29 | | REGION_NAME | Central US |
30 | | WEBSITE_RESOURCE_GROUP | faas_group_from_env |
31 | When cloud metadata is collected
32 | Then cloud metadata is not null
33 | And cloud metadata 'account.id' is 'd2cd53b3-acdc-4964-9563-3f5201556a81'
34 | And cloud metadata 'provider' is 'azure'
35 | And cloud metadata 'service.name' is 'functions'
36 | And cloud metadata 'instance.name' is 'site_name'
37 | And cloud metadata 'project.name' is 'faas_group_from_env'
38 | And cloud metadata 'region' is 'Central US'
39 |
40 | Scenario: WEBSITE_OWNER_NAME environment variable not expected format
41 | Given the following environment variables are present
42 | | name | value |
43 | | WEBSITE_OWNER_NAME | d2cd53b3-acdc-4964-9563-3f5201556a81-faas_group-CentralUSwebspace-Linux |
44 | | WEBSITE_SITE_NAME | site_name |
45 | When cloud metadata is collected
46 | Then cloud metadata is null
47 |
48 | Scenario: Missing FUNCTIONS_EXTENSION_VERSION environment variable
49 | Given the following environment variables are present
50 | | name | value |
51 | | WEBSITE_OWNER_NAME | d2cd53b3-acdc-4964-9563-3f5201556a81+faas_group-CentralUSwebspace-Linux |
52 | | WEBSITE_SITE_NAME | site_name |
53 | When cloud metadata is collected
54 | Then cloud metadata is null
55 |
56 | Scenario: Missing WEBSITE_OWNER_NAME environment variable
57 | Given the following environment variables are present
58 | | name | value |
59 | | FUNCTIONS_EXTENSION_VERSION | version |
60 | | WEBSITE_SITE_NAME | site_name |
61 | When cloud metadata is collected
62 | Then cloud metadata is null
63 |
64 | Scenario: Missing WEBSITE_SITE_NAME environment variable
65 | Given the following environment variables are present
66 | | name | value |
67 | | FUNCTIONS_EXTENSION_VERSION | version |
68 | | WEBSITE_OWNER_NAME | d2cd53b3-acdc-4964-9563-3f5201556a81+faas_group-CentralUSwebspace-Linux |
69 | When cloud metadata is collected
70 | Then cloud metadata is null
71 |
--------------------------------------------------------------------------------
/tests/agents/gherkin-specs/outcome.feature:
--------------------------------------------------------------------------------
1 | Feature: Outcome
2 |
3 | Background: An agent with default configuration
4 | Given an agent
5 |
6 | # ---- user set outcome
7 |
8 | Scenario: User set outcome on span has priority over instrumentation
9 | Given an active span
10 | And the agent sets the span outcome to 'success'
11 | And a user sets the span outcome to 'failure'
12 | When the span ends
13 | Then the span outcome is 'failure'
14 |
15 | Scenario: User set outcome on transaction has priority over instrumentation
16 | Given an active transaction
17 | And the agent sets the transaction outcome to 'failure'
18 | And a user sets the transaction outcome to 'unknown'
19 | When the transaction ends
20 | Then the transaction outcome is 'unknown'
21 |
22 | # ---- span & transaction outcome from reported errors
23 |
24 | Scenario: span with error
25 | Given an active span
26 | And an error is reported to the span
27 | When the span ends
28 | Then the span outcome is 'failure'
29 |
30 | Scenario: span without error
31 | Given an active span
32 | When the span ends
33 | Then the span outcome is 'success'
34 |
35 | Scenario: transaction with error
36 | Given an active transaction
37 | And an error is reported to the transaction
38 | When the transaction ends
39 | Then the transaction outcome is 'failure'
40 |
41 | Scenario: transaction without error
42 | Given an active transaction
43 | When the transaction ends
44 | Then the transaction outcome is 'success'
45 |
46 | # ---- HTTP
47 |
48 | @http
49 |   Scenario Outline: HTTP transaction and span outcome
50 |     Given an active transaction
51 |     And an HTTP call is received that returns <status>
52 |     When the transaction ends
53 |     Then the transaction outcome is '<server>'
54 |     Given an active span
55 |     And an HTTP call is made that returns <status>
56 |     When the span ends
57 |     Then the span outcome is '<client>'
58 | Examples:
59 | | status | client | server |
60 | | 100 | success | success |
61 | | 200 | success | success |
62 | | 300 | success | success |
63 | | 400 | failure | success |
64 | | 404 | failure | success |
65 | | 500 | failure | failure |
66 | | -1 | failure | failure |
67 | # last row with negative status represents the case where the status is not available
68 | # for example when an exception/error is thrown without status (IO error, redirect loop, ...)
69 |
70 | # ---- gRPC
71 |
72 | # reference spec : https://github.com/grpc/grpc/blob/master/doc/statuscodes.md
73 |
74 | @grpc
75 |   Scenario Outline: gRPC transaction and span outcome
76 |     Given an active transaction
77 |     And a gRPC call is received that returns '<status>'
78 |     When the transaction ends
79 |     Then the transaction outcome is '<server>'
80 |     Given an active span
81 |     And a gRPC call is made that returns '<status>'
82 |     When the span ends
83 |     Then the span outcome is '<client>'
84 | Examples:
85 | | status | client | server |
86 | | OK | success | success |
87 | | CANCELLED | failure | success |
88 | | UNKNOWN | failure | failure |
89 | | INVALID_ARGUMENT | failure | success |
90 | | DEADLINE_EXCEEDED | failure | failure |
91 | | NOT_FOUND | failure | success |
92 | | ALREADY_EXISTS | failure | success |
93 | | PERMISSION_DENIED | failure | success |
94 | | RESOURCE_EXHAUSTED | failure | failure |
95 | | FAILED_PRECONDITION | failure | failure |
96 | | ABORTED | failure | failure |
97 | | OUT_OF_RANGE | failure | success |
98 | | UNIMPLEMENTED | failure | success |
99 | | INTERNAL | failure | failure |
100 | | UNAVAILABLE | failure | failure |
101 | | DATA_LOSS | failure | failure |
102 | | UNAUTHENTICATED | failure | success |
103 | | n/a | failure | failure |
104 | # last row with 'n/a' status represents the case where status is not available
105 |
--------------------------------------------------------------------------------
/tests/agents/gherkin-specs/user_agent.feature:
--------------------------------------------------------------------------------
1 | Feature: Agent Transport User agent Header
2 |
3 | Scenario: Default user-agent
4 | Given an agent
5 | When the agent sends a request to APM server
6 | Then the User-Agent header of the request matches regex '^apm-agent-[a-z]+/[^ ]* \(.*\)'
7 |
8 | Scenario: Default user-agent when setting invalid service
9 | Given an agent configured with
10 | | setting | value |
11 | | service_name | myService/()<>@ |
12 | When the agent sends a request to APM server
13 | Then the User-Agent header of the request matches regex '^apm-agent-[a-z]+/[^ ]* \(.*\)'
14 |
15 | Scenario: User-agent with service name only
16 | Given an agent configured with
17 | | setting | value |
18 | | service_name | myService |
19 | When the agent sends a request to APM server
20 | Then the User-Agent header of the request matches regex '^apm-agent-[a-z]+/[^ ]* \(myService\)'
21 |
22 | Scenario Outline: User-agent with service name and service version
23 | Given an agent configured with
24 | | setting | value |
25 |       | service_name    | <SERVICE_NAME>    |
26 |       | service_version | <SERVICE_VERSION> |
27 |     When the agent sends a request to APM server
28 |     Then the User-Agent header of the request matches regex '^apm-agent-[a-z]+/[^ ]* \(<ESCAPED_SERVICE_NAME> <ESCAPED_SERVICE_VERSION>\)'
29 | Examples:
30 | | SERVICE_NAME | ESCAPED_SERVICE_NAME | SERVICE_VERSION | ESCAPED_SERVICE_VERSION |
31 | | myService | myService | v42 | v42 |
32 | | myService | myService | 123(:\;)456 | 123_:_;_456 |
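33 | # As the second example suggests, characters that are not allowed inside a
34 | # User-Agent comment, such as '(', ')' and '\', are replaced with '_' when escaping.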
33 |
--------------------------------------------------------------------------------
/tests/agents/json-specs/container_metadata_discovery.json:
--------------------------------------------------------------------------------
1 | {
2 | "cgroup_v1_underscores": {
3 | "files": {
4 | "/proc/self/cgroup": [
5 | "1:name=systemd:/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod90d81341_92de_11e7_8cf2_507b9d4141fa.slice/crio-2227daf62df6694645fee5df53c1f91271546a9560e8600a525690ae252b7f63.scope"
6 | ]
7 | },
8 | "containerId": "2227daf62df6694645fee5df53c1f91271546a9560e8600a525690ae252b7f63",
9 | "podId": "90d81341-92de-11e7-8cf2-507b9d4141fa"
10 | },
11 | "cgroup_v1_openshift": {
12 | "files": {
13 | "/proc/self/cgroup": [
14 | "9:freezer:/kubepods.slice/kubepods-pod22949dce_fd8b_11ea_8ede_98f2b32c645c.slice/docker-b15a5bdedd2e7645c3be271364324321b908314e4c77857bbfd32a041148c07f.scope"
15 | ]
16 | },
17 | "containerId": "b15a5bdedd2e7645c3be271364324321b908314e4c77857bbfd32a041148c07f",
18 | "podId": "22949dce-fd8b-11ea-8ede-98f2b32c645c"
19 | },
20 | "cgroup_v1_ubuntu": {
21 | "files": {
22 | "/proc/self/cgroup": [
23 | "1:name=systemd:/user.slice/user-1000.slice/user@1000.service/apps.slice/apps-org.gnome.Terminal.slice/vte-spawn-75bc72bd-6642-4cf5-b62c-0674e11bfc84.scope"
24 | ]
25 | },
26 | "containerId": null,
27 | "podId": null
28 | },
29 | "cgroup_v1_awsEcs": {
30 | "files": {
31 | "/proc/self/cgroup": [
32 | "1:name=systemd:/ecs/03752a671e744971a862edcee6195646/03752a671e744971a862edcee6195646-4015103728"
33 | ]
34 | },
35 | "containerId": "03752a671e744971a862edcee6195646-4015103728",
36 | "podId": null
37 | },
38 | "cgroup_v2": {
39 | "files": {
40 | "/proc/self/cgroup": [
41 | "0::/"
42 | ],
43 | "/proc/self/mountinfo": [
44 | "3984 3905 0:73 / / rw,relatime shared:1863 master:1733 - overlay overlay rw,lowerdir=/var/lib/docker/overlay2/l/KEX7CWLHQCXQY2RHPGTXJ3C26N:/var/lib/docker/overlay2/l/2PVS7JRTRSTVZS4KSUAFML3BIV:/var/lib/docker/overlay2/l/52M7ARM4JDVHCJAYUI6JIKBO4B,upperdir=/var/lib/docker/overlay2/267f825fb89e584605bf161177451879c0ba8b15f7df9b51fb7843c7beb9ed25/diff,workdir=/var/lib/docker/overlay2/267f825fb89e584605bf161177451879c0ba8b15f7df9b51fb7843c7beb9ed25/work",
45 | "3985 3984 0:77 / /proc rw,nosuid,nodev,noexec,relatime shared:1864 - proc proc rw",
46 | "3986 3984 0:78 / /dev rw,nosuid shared:1865 - tmpfs tmpfs rw,size=65536k,mode=755,inode64",
47 | "3987 3986 0:79 / /dev/pts rw,nosuid,noexec,relatime shared:1866 - devpts devpts rw,gid=5,mode=620,ptmxmode=666",
48 | "3988 3984 0:80 / /sys ro,nosuid,nodev,noexec,relatime shared:1870 - sysfs sysfs ro",
49 | "3989 3988 0:30 / /sys/fs/cgroup rw,nosuid,nodev,noexec,relatime shared:1871 - cgroup2 cgroup rw",
50 | "3990 3986 0:76 / /dev/mqueue rw,nosuid,nodev,noexec,relatime shared:1867 - mqueue mqueue rw",
51 | "3991 3986 0:81 / /dev/shm rw,nosuid,nodev,noexec,relatime shared:1868 - tmpfs shm rw,size=65536k,inode64",
52 | "3992 3984 253:1 /var/lib/docker/volumes/9d18ce5b36572d85358fa936afe5a4bf95cca5c822b04941aa08c6118f6e0d33/_data /var rw,relatime shared:1872 master:1 - ext4 /dev/mapper/vgubuntu-root rw,errors=remount-ro",
53 | "3993 3984 0:82 / /run rw,nosuid,nodev,noexec,relatime shared:1873 - tmpfs tmpfs rw,inode64",
54 | "3994 3984 0:83 / /tmp rw,nosuid,nodev,noexec,relatime shared:1874 - tmpfs tmpfs rw,inode64",
55 | "3995 3984 253:1 /usr/lib/modules /usr/lib/modules ro,relatime shared:1875 - ext4 /dev/mapper/vgubuntu-root rw,errors=remount-ro",
56 | "3996 3984 253:1 /var/lib/docker/containers/6548c6863fb748e72d1e2a4f824fde92f720952d062dede1318c2d6219a672d6/resolv.conf /etc/resolv.conf rw,relatime shared:1876 - ext4 /dev/mapper/vgubuntu-root rw,errors=remount-ro",
57 | "3997 3984 253:1 /var/lib/docker/containers/6548c6863fb748e72d1e2a4f824fde92f720952d062dede1318c2d6219a672d6/hostname /etc/hostname rw,relatime shared:1877 - ext4 /dev/mapper/vgubuntu-root rw,errors=remount-ro",
58 | "3998 3984 253:1 /var/lib/docker/containers/6548c6863fb748e72d1e2a4f824fde92f720952d062dede1318c2d6219a672d6/hosts /etc/hosts rw,relatime shared:1878 - ext4 /dev/mapper/vgubuntu-root rw,errors=remount-ro"
59 | ]
60 | },
61 | "containerId": "6548c6863fb748e72d1e2a4f824fde92f720952d062dede1318c2d6219a672d6",
62 | "podId": null
63 | },
64 | "gardener": {
65 | "files": {
66 | "/proc/self/mountinfo": [
67 | "10112 5519 0:864 / / ro,relatime master:1972 - overlay overlay rw,lowerdir=/var/lib/containerd/io.containerd.snapshotter.v1.overlayfs/snapshots/35235/fs:/var/lib/containerd/io.containerd.snapshotter.v1.overlayfs/snapshots/27346/fs:/var/lib/containerd/io.containerd.snapshotter.v1.overlayfs/snapshots/27345/fs:/var/lib/containerd/io.containerd.snapsh",
68 | "10113 10112 0:884 / /proc rw,nosuid,nodev,noexec,relatime - proc proc rw",
69 | "10301 10112 0:926 / /dev rw,nosuid - tmpfs tmpfs rw,size=65536k,mode=755,inode64",
70 | "10302 10301 0:930 / /dev/pts rw,nosuid,noexec,relatime - devpts devpts rw,gid=5,mode=620,ptmxmode=666",
71 | "10519 10301 0:820 / /dev/mqueue rw,nosuid,nodev,noexec,relatime - mqueue mqueue rw",
72 | "10520 10112 0:839 / /sys ro,nosuid,nodev,noexec,relatime - sysfs sysfs ro",
73 | "10716 10520 0:26 /kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod121157b5_c67d_4c3e_9052_cb27bbb711fb.slice/cri-containerd-1cd3449e930b8a28c7595240fa32ba20c84f36d059e5fbe63104ad40057992d1.scope /sys/fs/cgroup ro,nosuid,nodev,noexec,relatime - cgroup2 cgroup rw",
74 | "10736 10112 8:3 /var/lib/kubelet/pods/121157b5-c67d-4c3e-9052-cb27bbb711fb/volumes/kubernetes.io~empty-dir/tmpdir /tmp rw,relatime - ext4 /dev/sda3 rw,discard,prjquota,errors=remount-ro",
75 | "10737 10112 0:786 / /vault/tls ro,relatime - tmpfs tmpfs rw,size=4194304k,inode64",
76 | "10738 10112 8:3 /var/lib/kubelet/pods/121157b5-c67d-4c3e-9052-cb27bbb711fb/etc-hosts /etc/hosts rw,relatime - ext4 /dev/sda3 rw,discard,prjquota,errors=remount-ro",
77 | "10739 10301 8:3 /var/lib/kubelet/pods/121157b5-c67d-4c3e-9052-cb27bbb711fb/containers/application-search-indexer/9bf2b38c /dev/termination-log rw,relatime - ext4 /dev/sda3 rw,discard,prjquota,errors=remount-ro",
78 | "10740 10112 8:3 /var/lib/containerd/io.containerd.grpc.v1.cri/sandboxes/26a006f558da58874bc37863efe9d2b5d715afc54453d95b22a7809a4e65566c/hostname /etc/hostname ro,relatime - ext4 /dev/sda3 rw,discard,prjquota,errors=remount-ro",
79 | "10741 10112 8:3 /var/lib/containerd/io.containerd.grpc.v1.cri/sandboxes/26a006f558da58874bc37863efe9d2b5d715afc54453d95b22a7809a4e65566c/resolv.conf /etc/resolv.conf ro,relatime - ext4 /dev/sda3 rw,discard,prjquota,errors=remount-ro",
80 | "10761 10301 0:788 / /dev/shm rw,nosuid,nodev,noexec,relatime - tmpfs shm rw,size=65536k,inode64",
81 | "10762 10112 0:787 / /var/run/secrets/kubernetes.io/serviceaccount ro,relatime - tmpfs tmpfs rw,size=4194304k,inode64",
82 | "5630 10113 0:884 /bus /proc/bus ro,nosuid,nodev,noexec,relatime - proc proc rw",
83 | "5631 10113 0:884 /fs /proc/fs ro,nosuid,nodev,noexec,relatime - proc proc rw",
84 | "5632 10113 0:884 /irq /proc/irq ro,nosuid,nodev,noexec,relatime - proc proc rw",
85 | "5633 10113 0:884 /sys /proc/sys ro,nosuid,nodev,noexec,relatime - proc proc rw",
86 | "5634 10113 0:931 / /proc/acpi ro,relatime - tmpfs tmpfs ro,inode64",
87 | "5635 10113 0:926 /null /proc/kcore rw,nosuid - tmpfs tmpfs rw,size=65536k,mode=755,inode64",
88 | "5636 10113 0:926 /null /proc/keys rw,nosuid - tmpfs tmpfs rw,size=65536k,mode=755,inode64",
89 | "5637 10113 0:926 /null /proc/timer_list rw,nosuid - tmpfs tmpfs rw,size=65536k,mode=755,inode64",
90 | "5639 10520 0:932 / /sys/firmware ro,relatime - tmpfs tmpfs ro,inode64"
91 | ],
92 | "/proc/self/cgroup": [
93 | "0::/kubepods.slice/kubepods-burstable.slice/kubepods-burstable-pod121157b5_c67d_4c3e_9052_cb27bbb711fb.slice/cri-containerd-1cd3449e930b8a28c7595240fa32ba20c84f36d059e5fbe63104ad40057992d1.scope"
94 | ]
95 | },
96 | "containerId": "1cd3449e930b8a28c7595240fa32ba20c84f36d059e5fbe63104ad40057992d1",
97 | "podId": "121157b5-c67d-4c3e-9052-cb27bbb711fb"
98 | }
99 | }
100 |
101 |
--------------------------------------------------------------------------------
/tests/agents/json-specs/service_resource_inference.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "span": {
4 | "exit": "true",
5 | "type": "custom",
6 | "subtype": "test-subtype"
7 | },
8 | "expected_resource": "test-subtype",
9 | "expected_service_target": {
10 | "type": "test-subtype"
11 | },
12 |     "failure_message": "In the absence of specific context fields, the subtype should be used"
13 | },
14 | {
15 | "span": {
16 | "exit": "true",
17 | "type": "custom",
18 | "subtype": "test-subtype",
19 | "context": {
20 | "service": {
21 | "target": {
22 | "type": "custom-service-type",
23 | "name": "custom-service-name"
24 | }
25 | }
26 | }
27 | },
28 | "expected_resource": "custom-service-type/custom-service-name",
29 | "expected_service_target": {
30 | "type": "custom-service-type",
31 | "name": "custom-service-name"
32 | },
33 |     "failure_message": "If the service target `type` or `name` is already set, the inference mechanism should not override it"
34 | },
35 | {
36 | "span": {
37 | "exit": "true",
38 | "type": "custom"
39 | },
40 | "expected_resource": "custom",
41 | "expected_service_target": {
42 | "type": "custom"
43 | },
44 | "failure_message": "In the absence of specific context fields and absence of subtype, the type should be used"
45 | },
46 | {
47 | "span": {
48 | "exit": "false",
49 | "type": "custom",
50 | "subtype": "test-subtype"
51 | },
52 | "expected_resource": null,
53 | "expected_service_target": null,
54 | "failure_message": "The output for non-exit spans should be `null`"
55 | },
56 | {
57 | "span": {
58 | "exit": "false",
59 | "type": "custom",
60 | "subtype": "proprietary-db",
61 | "context": {
62 | "db": {
63 | "instance": "myInstance"
64 | }
65 | }
66 | },
67 | "expected_resource": null,
68 | "expected_service_target": null,
69 | "failure_message": "The output for non-exit spans should be `null` even if exit-related context data is set"
70 | },
71 | {
72 | "span": {
73 | "exit": "true",
74 | "type": "db",
75 | "subtype": "mysql",
76 | "context": {
77 | "db": {
78 | "instance": "myInstance"
79 | }
80 | }
81 | },
82 | "expected_resource": "mysql/myInstance",
83 | "expected_service_target": {
84 | "type": "mysql",
85 | "name": "myInstance"
86 | },
87 | "failure_message": "If `context.db.instance` exists, the output should be: `${subtype}/${context.db.instance}`"
88 | },
89 | {
90 | "span": {
91 | "exit": "true",
92 | "type": "db",
93 | "subtype": "mysql",
94 | "context": {
95 | "db": {
96 | "type": "sql"
97 | }
98 | }
99 | },
100 | "expected_resource": "mysql",
101 | "expected_service_target": {
102 | "type": "mysql"
103 | },
104 | "failure_message": "If `context.db` exists without `context.db.instance`, the subtype should be used"
105 | },
106 | {
107 | "span": {
108 | "exit": "true",
109 | "type": "db",
110 | "context": {
111 | "db": {
112 | "instance": "myInstance"
113 | }
114 | }
115 | },
116 | "expected_resource": "db/myInstance",
117 | "expected_service_target": {
118 | "type": "db",
119 | "name": "myInstance"
120 | },
121 | "failure_message": "If `context.db.instance` exists and subtype is `null`, the output should be: `${type}/${context.db.instance}`"
122 | },
123 | {
124 | "span": {
125 | "exit": "true",
126 | "type": "db",
127 | "subtype": "elasticsearch",
128 | "context": {
129 | "db": {
130 | "type": "elasticsearch"
131 | },
132 | "http": {
133 | "url": "https://my-cluster.com:9200"
134 | }
135 | }
136 | },
137 | "expected_resource": "elasticsearch",
138 | "expected_service_target": {
139 | "type": "elasticsearch"
140 | },
141 | "failure_message": "If `context.db` exists without `context.db.instance`, the subtype should be used, even if `context.http` exists"
142 | },
143 | {
144 | "span": {
145 | "exit": "true",
146 | "type": "messaging",
147 | "subtype": "msg-http-client",
148 | "context": {
149 | "message": {
150 | "body": "Text message"
151 | },
152 | "http": {
153 | "url": "https://my-broker.com:8888"
154 | }
155 | }
156 | },
157 | "expected_resource": "msg-http-client",
158 | "expected_service_target": {
159 | "type": "msg-http-client"
160 | },
161 | "failure_message": "If `context.message` exists without `context.message.queue.name`, the subtype should be used, even if `context.http` exists"
162 | },
163 | {
164 | "span": {
165 | "exit": "true",
166 | "type": "external",
167 | "subtype": "http",
168 | "context": {
169 | "http": {
170 | "url": "http://my-cluster.com:9200"
171 | }
172 | }
173 | },
174 | "expected_resource": "my-cluster.com:9200",
175 | "expected_service_target": {
176 | "type": "http",
177 | "name": "my-cluster.com:9200"
178 | },
179 |     "failure_message": "If `context.http.url` exists, the output should be the `host:port` derived from it"
180 | },
181 | {
182 | "span": {
183 | "exit": "true",
184 | "type": "external",
185 | "subtype": "http",
186 | "context": {
187 | "http": {
188 | "url": "https://my-cluster.com"
189 | }
190 | }
191 | },
192 | "expected_resource": "my-cluster.com:443",
193 | "expected_service_target": {
194 | "type": "http",
195 | "name": "my-cluster.com:443"
196 | },
197 |     "failure_message": "If `context.http.url` has no explicit port and the scheme is `https`, the default port 443 should be appended to the host"
198 | },
199 | {
200 | "span": {
201 | "exit": "true",
202 | "type": "external",
203 | "subtype": "http",
204 | "context": {
205 | "http": {
206 | "url": "http://my-cluster.com"
207 | }
208 | }
209 | },
210 | "expected_resource": "my-cluster.com:80",
211 | "expected_service_target": {
212 | "type": "http",
213 | "name": "my-cluster.com:80"
214 | },
215 |     "failure_message": "If `context.http.url` has no explicit port and the scheme is `http`, the default port 80 should be appended to the host"
216 | },
217 | {
218 | "span": {
219 | "exit": "true",
220 | "type": "messaging",
221 | "context": {
222 | "message": {
223 | "body": "Text message",
224 | "queue": {
225 | "name": "myQueue"
226 | }
227 | }
228 | }
229 | },
230 | "expected_resource": "messaging/myQueue",
231 | "expected_service_target": {
232 | "type": "messaging",
233 | "name": "myQueue"
234 | },
235 |     "failure_message": "If `context.message` exists and subtype is `null`, output should be `${type}/${context.message.queue.name}`"
236 | },
237 | {
238 | "span": {
239 | "exit": "true",
240 | "type": "messaging",
241 | "subtype": "kafka",
242 | "context": {
243 | "message": {
244 | "body": "Text message",
245 | "queue": {
246 | "name": "myQueue"
247 | }
248 | }
249 | }
250 | },
251 | "expected_resource": "kafka/myQueue",
252 | "expected_service_target": {
253 | "type": "kafka",
254 | "name": "myQueue"
255 | },
256 |     "failure_message": "If `context.message` exists, output should be `${subtype}/${context.message.queue.name}`"
257 | },
258 | {
259 | "span": {
260 | "exit": "true",
261 | "type": "messaging",
262 | "subtype": "kafka",
263 | "context": {
264 | "message": {
265 | "body": "Text message"
266 | }
267 | }
268 | },
269 | "expected_resource": "kafka",
270 | "expected_service_target": {
271 | "type": "kafka"
272 | },
273 | "failure_message": "If `context.message` exists without `context.message.queue.name`, output should be `${subtype}`"
274 | }
275 | ]
276 |
--------------------------------------------------------------------------------
/tests/agents/json-specs/span_types.json:
--------------------------------------------------------------------------------
1 | {
2 | "__description": {
3 |     "<type>": "root element for type identified by '<type>'",
4 |     "<type>.__description": "description for '<type>' (optional)",
5 |     "<type>.__used_by": "list of agents that use '<type>' to help document alignment (optional)",
6 |     "<type>.allow_null_subtype": "true to allow null subtype, false by default if omitted",
7 |     "<type>.allow_unlisted_subtype": "true to allow unlisted subtypes, false by default if omitted",
8 |     "<type>.subtypes": "root element for sub-types of type '<type>', if omitted or empty subtype must be null, unless 'allow_unlisted_subtype' is set to true",
9 |     "<type>.subtypes.<subtype>": "sub-type element for <subtype>",
10 |     "<type>.subtypes.<subtype>.__description": "description of subtype <subtype> (optional)",
11 |     "<type>.subtypes.<subtype>.__used_by": "list of agents that use <subtype> to help document alignment (optional)"
12 | },
13 | "app": {
14 | "__description": "Spans within application (usually not calling an external system)",
15 | "allow_null_subtype": true,
16 | "subtypes": {
17 | "inferred": {
18 | "__description": "Sampling profiler inferred spans",
19 | "__used_by": [
20 | "java"
21 | ]
22 | },
23 | "internal": {
24 | "__description": "Application generic internal span for controller/handler/processing delegation",
25 | "__used_by": [
26 | ]
27 | },
28 | "controller": {
29 | "__description": "Deprecated: use app.internal instead",
30 | "__used_by": [
31 | "ruby"
32 | ]
33 | },
34 | "graphql": {
35 | "__description": "Deprecated: use app.internal instead",
36 | "__used_by": [
37 | "ruby"
38 | ]
39 | },
40 | "mailer": {
41 | "__description": "Deprecated: use app.internal instead",
42 | "__used_by": [
43 | "ruby"
44 | ]
45 | },
46 | "resource": {
47 | "__description": "Deprecated: use app.internal instead",
48 | "__used_by": [
49 | "ruby"
50 | ]
51 | },
52 | "handler": {
53 | "__description": "Deprecated: use app.internal instead",
54 | "__used_by": [
55 | "java"
56 | ]
57 | }
58 | }
59 | },
60 | "custom": {
61 | "__description": "API custom instrumentation",
62 | "__used_by": [
63 | "java",
64 | "ruby"
65 | ],
66 | "allow_null_subtype": true
67 | },
68 | "db": {
69 | "__description": "database span",
70 | "subtypes": {
71 | "cassandra": {
72 | "__description": "Cassandra",
73 | "__used_by": [
74 | "java"
75 | ]
76 | },
77 | "cosmosdb": {
78 | "__description": "Azure CosmosDB"
79 | },
80 | "db2": {
81 | "__description": "IBM DB2",
82 | "__used_by": [
83 | "java"
84 | ]
85 | },
86 | "derby": {
87 | "__description": "Apache Derby",
88 | "__used_by": [
89 | "java"
90 | ]
91 | },
92 | "dynamodb": {
93 | "__description": "AWS DynamoDB",
94 | "__used_by": [
95 | "ruby"
96 | ]
97 | },
98 | "elasticsearch": {
99 | "__description": "Elasticsearch",
100 | "__used_by": [
101 | "java",
102 | "ruby"
103 | ]
104 | },
105 | "graphql": {
106 | "__description": "GraphQL",
107 | "__used_by": [
108 | "nodejs"
109 | ]
110 | },
111 | "h2": {
112 | "__description": "H2",
113 | "__used_by": [
114 | "java"
115 | ]
116 | },
117 | "hsqldb": {
118 | "__description": "HSQLDB",
119 | "__used_by": [
120 | "java"
121 | ]
122 | },
123 | "ingres": {
124 | "__description": "Ingres"
125 | },
126 | "mariadb": {
127 | "__description": "MariaDB",
128 | "__used_by": [
129 | "java",
130 | "ruby"
131 | ]
132 | },
133 | "memcached": {
134 | "__description": "Memcached",
135 | "__used_by": [
136 | "nodejs"
137 | ]
138 | },
139 | "mongodb": {
140 | "__description": "MongoDB",
141 | "__used_by": [
142 | "java",
143 | "ruby"
144 | ]
145 | },
146 | "mssql": {
147 | "__description": "Microsoft SQL Server",
148 | "__used_by": [
149 | "nodejs",
150 | "java"
151 | ]
152 | },
153 | "mysql": {
154 | "__description": "MySQL",
155 | "__used_by": [
156 | "java",
157 | "ruby"
158 | ]
159 | },
160 | "oracle": {
161 | "__description": "Oracle Database",
162 | "__used_by": [
163 | "java"
164 | ]
165 | },
166 | "postgresql": {
167 | "__description": "PostgreSQL",
168 | "__used_by": [
169 | "ruby"
170 | ]
171 | },
172 | "redis": {
173 | "__description": "Redis",
174 | "__used_by": [
175 | "java",
176 | "ruby"
177 | ]
178 | },
179 | "sqlite": {
180 | "__description": "SQLite",
181 | "__used_by": [
182 | "ruby"
183 | ]
184 | },
185 | "sqlite3": {
186 | "__description": "Deprecated: use db/sqlite",
187 | "__used_by": [
188 | "ruby"
189 | ]
190 | },
191 | "sqlserver": {
192 | "__description": "Deprecated: use db/mssql",
193 | "__used_by": [
194 | "java"
195 | ]
196 | },
197 | "unknown": {
198 | "__description": "Unknown database",
199 | "__used_by": [
200 | "java",
201 | "ruby"
202 | ]
203 | }
204 | }
205 | },
206 | "external": {
207 | "__description": "Request to external service, usually in request/response pattern",
208 | "subtypes": {
209 | "dubbo": {
210 | "__description": "Apache Dubbo",
211 | "__used_by": [
212 | "java"
213 | ]
214 | },
215 | "grpc": {
216 | "__description": "gRPC",
217 | "__used_by": [
218 | "ruby",
219 | "java"
220 | ]
221 | },
222 | "http": {
223 | "__description": "HTTP client",
224 | "__used_by": [
225 | "ruby",
226 | "java"
227 | ]
228 | },
229 | "ldap": {
230 | "__description": "LDAP client",
231 | "__used_by": [
232 | "java"
233 | ]
234 | }
235 | }
236 | },
237 | "json": {
238 | "__description": "Deprecated: use app.internal instead",
239 | "subtypes": {
240 | "parse": {
241 | "__description": "JSON parsing"
242 | },
243 | "generate": {
244 | "__description": "JSON generation"
245 | }
246 | },
247 | "__used_by": [
248 | "ruby"
249 | ]
250 | },
251 | "messaging": {
252 | "__description": "Messaging",
253 | "subtypes": {
254 | "azurequeue": {
255 | "__description": "Azure Queue"
256 | },
257 | "azureservicebus": {
258 | "__description": "Azure Service Bus"
259 | },
260 | "jms": {
261 | "__description": "Java Messaging Service",
262 | "__used_by": [
263 | "java"
264 | ]
265 | },
266 | "kafka": {
267 | "__description": "Apache Kafka",
268 | "__used_by": [
269 | "java"
270 | ]
271 | },
272 | "rabbitmq": {
273 | "__description": "RabbitMQ",
274 | "__used_by": [
275 | "java"
276 | ]
277 | },
278 | "sns": {
279 | "__description": "AWS Simple Notification Service",
280 | "__used_by": [
281 | "ruby"
282 | ]
283 | },
284 | "sqs": {
285 | "__description": "AWS Simple Queue Service",
286 | "__used_by": [
287 | "ruby"
288 | ]
289 | }
290 | }
291 | },
292 | "process": {
293 | "__description": "External process",
294 | "__used_by": [
295 | "java"
296 | ]
297 | },
298 | "storage": {
299 | "subtypes": {
300 | "azureblob": {
301 | "__description": "Azure Blob Storage"
302 | },
303 | "azurefile": {
304 | "__description": "Azure Files"
305 | },
306 | "azuretable": {
307 | "__description": "Azure Storage Table",
308 | "__used_by": [
309 | "ruby"
310 | ]
311 | },
312 | "s3": {
313 | "__description": "AWS S3",
314 | "__used_by": [
315 | "ruby"
316 | ]
317 | }
318 | }
319 | },
320 | "template": {
321 | "__description": "Template engines (no sub-type for now as really platform-specific)",
322 | "__used_by": [
323 | "java",
324 | "ruby"
325 | ],
326 | "allow_unlisted_subtype": true
327 | },
328 | "websocket": {
329 | "__description": "Websockets",
330 | "subtypes": {
331 | "send": {
332 | "__used_by": [
333 | "nodejs"
334 | ]
335 | }
336 | }
337 | }
338 | }
339 |
--------------------------------------------------------------------------------
/tests/agents/json-specs/sql_signature_examples.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "input": "",
4 | "output": ""
5 | },
6 | {
7 | "input": " ",
8 | "output": ""
9 | },
10 | {
11 | "input": "SELECT * FROM foo.bar",
12 | "output": "SELECT FROM foo.bar"
13 | },
14 | {
15 | "input": "SELECT * FROM foo.bar.baz",
16 | "output": "SELECT FROM foo.bar.baz"
17 | },
18 | {
19 | "input": "SELECT * FROM `foo.bar`",
20 | "output": "SELECT FROM foo.bar"
21 | },
22 | {
23 | "input": "SELECT * FROM \"foo.bar\"",
24 | "output": "SELECT FROM foo.bar"
25 | },
26 | {
27 | "input": "SELECT * FROM [foo.bar]",
28 | "output": "SELECT FROM foo.bar"
29 | },
30 | {
31 | "input": "SELECT (x, y) FROM foo,bar,baz",
32 | "output": "SELECT FROM foo"
33 | },
34 | {
35 | "input": "SELECT * FROM foo JOIN bar",
36 | "output": "SELECT FROM foo"
37 | },
38 | {
39 | "input": "SELECT * FROM dollar$bill",
40 | "output": "SELECT FROM dollar$bill"
41 | },
42 | {
43 | "input": "SELECT id FROM \"myta\n-æøåble\" WHERE id = 2323",
44 | "output": "SELECT FROM myta\n-æøåble"
45 | },
46 | {
47 | "input": "SELECT * FROM foo-- abc\n./*def*/bar",
48 | "output": "SELECT FROM foo.bar"
49 | },
50 | {
51 | "comment": "We capture the first table of the outermost select statement",
52 | "input": "SELECT *,(SELECT COUNT(*) FROM table2 WHERE table2.field1 = table1.id) AS count FROM table1 WHERE table1.field1 = 'value'",
53 | "output": "SELECT FROM table1"
54 | },
55 | {
56 | "comment": "If the outermost select operates on derived tables, then we just return 'SELECT' (i.e. the fallback)",
57 | "input": "SELECT * FROM (SELECT foo FROM bar) AS foo_bar",
58 | "output": "SELECT"
59 | },
60 | {
61 | "input": "DELETE FROM foo.bar WHERE baz=1",
62 | "output": "DELETE FROM foo.bar"
63 | },
64 | {
65 | "input": "UPDATE IGNORE foo.bar SET bar=1 WHERE baz=2",
66 | "output": "UPDATE foo.bar"
67 | },
68 | {
69 | "input": "UPDATE ONLY foo AS bar SET baz=1",
70 | "output": "UPDATE foo"
71 | },
72 | {
73 | "input": "INSERT INTO foo.bar (col) VALUES(?)",
74 | "output": "INSERT INTO foo.bar"
75 | },
76 | {
77 | "input": "INSERT LOW_PRIORITY IGNORE INTO foo.bar (col) VALUES(?)",
78 | "output": "INSERT INTO foo.bar"
79 | },
80 | {
81 | "input": "CALL foo(bar, 123)",
82 | "output": "CALL foo"
83 | },
84 | {
85 | "comment": "For DDL we only capture the first token",
86 | "input": "ALTER TABLE foo ADD ()",
87 | "output": "ALTER"
88 | },
89 | {
90 | "input": "CREATE TABLE foo ...",
91 | "output": "CREATE"
92 | },
93 | {
94 | "input": "DROP TABLE foo",
95 | "output": "DROP"
96 | },
97 | {
98 | "input": "SAVEPOINT x_asd1234",
99 | "output": "SAVEPOINT"
100 | },
101 | {
102 | "input": "BEGIN",
103 | "output": "BEGIN"
104 | },
105 | {
106 | "input": "COMMIT",
107 | "output": "COMMIT"
108 | },
109 | {
110 | "input": "ROLLBACK",
111 | "output": "ROLLBACK"
112 | },
113 | {
114 | "comment": "For broken statements we only capture the first token",
115 | "input": "SELECT * FROM (SELECT EOF",
116 | "output": "SELECT"
117 | },
118 | {
119 | "input": "SELECT 'neverending literal FROM (SELECT * FROM ...",
120 | "output": "SELECT"
121 | },
122 | {
123 | "input": "INSERT COIN TO PLAY",
124 | "output": "INSERT"
125 | },
126 | {
127 | "input": "INSERT $2 INTO",
128 | "output": "INSERT"
129 | },
130 | {
131 | "input": "UPDATE 99",
132 | "output": "UPDATE"
133 | },
134 | {
135 | "input": "DELETE 99",
136 | "output": "DELETE"
137 | },
138 | {
139 | "input": "DELETE FROM",
140 | "output": "DELETE"
141 | },
142 | {
143 | "input": "CALL",
144 | "output": "CALL"
145 | }
146 | ]
147 |
--------------------------------------------------------------------------------
/tests/agents/json-specs/sql_token_examples.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "name": "whitespace-only",
4 | "comment": "whitespace between tokens is ignored",
5 | "input": " "
6 | },
7 | {
8 | "name": "keywords",
9 | "comment": "keywords each have their own kind, and are scanned case-insensitively",
10 | "input": "INSERT or rEpLaCe",
11 | "tokens": [
12 | {
13 | "kind": "INSERT",
14 | "text": "INSERT"
15 | },
16 | {
17 | "kind": "OR",
18 | "text": "or"
19 | },
20 | {
21 | "kind": "REPLACE",
22 | "text": "rEpLaCe"
23 | }
24 | ]
25 | },
26 | {
27 | "name": "qualified-table",
28 | "input": "schema.Abc_123",
29 | "tokens": [
30 | {
31 | "kind": "IDENT",
32 | "text": "schema"
33 | },
34 | {
35 | "kind": "PERIOD",
36 | "text": "."
37 | },
38 | {
39 | "kind": "IDENT",
40 | "text": "Abc_123"
41 | }
42 | ]
43 | },
44 | {
45 | "name": "dollar-variable",
46 | "comment": "dollar variables mustn't confuse dollar quoting",
47 | "input": "$123",
48 | "tokens": [
49 | {
50 | "kind": "OTHER",
51 | "text": "$123"
52 | }
53 | ]
54 | },
55 | {
56 | "name": "identifiers",
57 | "input": "_foo foo$",
58 | "tokens": [
59 | {
60 | "kind": "IDENT",
61 | "text": "_foo"
62 | },
63 | {
64 | "kind": "IDENT",
65 | "text": "foo$"
66 | }
67 | ]
68 | },
69 | {
70 | "name": "quoted-identifiers",
71 | "input": "`SELECT` \"SELECT \"\"\" [SELECT '']",
72 | "tokens": [
73 | {
74 | "kind": "IDENT",
75 | "text": "SELECT"
76 | },
77 | {
78 | "kind": "IDENT",
79 | "text": "SELECT \"\""
80 | },
81 | {
82 | "kind": "IDENT",
83 | "text": "SELECT ''"
84 | }
85 | ]
86 | },
87 | {
88 | "name": "punctuation",
89 | "input": "().",
90 | "tokens": [
91 | {
92 | "kind": "LPAREN",
93 | "text": "("
94 | },
95 | {
96 | "kind": "RPAREN",
97 | "text": ")"
98 | },
99 | {
100 | "kind": "PERIOD",
101 | "text": "."
102 | }
103 | ]
104 | },
105 | {
106 | "name": "comments",
107 | "input": "/* /*nested*/ */ -- SELECT /*",
108 | "tokens": [
109 | {
110 | "kind": "COMMENT",
111 | "text": "/* /*nested*/ */"
112 | },
113 | {
114 | "kind": "COMMENT",
115 | "text": "-- SELECT /*"
116 | }
117 | ]
118 | },
119 | {
120 | "name": "CQL line comment",
121 | "input": "/* /*nested*/ */ // SELECT /*",
122 | "tokens": [
123 | {
124 | "kind": "COMMENT",
125 | "text": "/* /*nested*/ */"
126 | },
127 | {
128 | "kind": "COMMENT",
129 | "text": "// SELECT /*"
130 | }
131 | ]
132 | },
133 | {
134 | "name": "string-literal",
135 | "input": "'abc '' def\\''",
136 | "tokens": [
137 | {
138 | "kind": "STRING",
139 | "text": "'abc '' def\\''"
140 | }
141 | ]
142 | },
143 | {
144 | "name": "dollar-quoted-string-literal",
145 | "input": "$$f$o$o$$ $$ $$ $foo$'`$$$$\"$foo$ $foo $bar",
146 | "tokens": [
147 | {
148 | "kind": "STRING",
149 | "text": "$$f$o$o$$"
150 | },
151 | {
152 | "kind": "STRING",
153 | "text": "$$ $$"
154 | },
155 | {
156 | "kind": "STRING",
157 | "text": "$foo$'`$$$$\"$foo$"
158 | },
159 | {
160 | "kind": "OTHER",
161 | "text": "$foo"
162 | },
163 | {
164 | "kind": "OTHER",
165 | "text": "$bar"
166 | }
167 | ]
168 | },
169 | {
170 | "name": "unterminated-dollar-quoted-string-literal",
171 | "comment": "Unterminated dollar-quoted string rewinds back to the first whitespace, under the assumption that the input is valid and we've interpreted it wrongly",
172 | "input": "$foo$ banana $",
173 | "tokens": [
174 | {
175 | "kind": "OTHER",
176 | "text": "$foo$"
177 | },
178 | {
179 | "kind": "IDENT",
180 | "text": "banana"
181 | },
182 | {
183 | "kind": "OTHER",
184 | "text": "$"
185 | }
186 | ]
187 | },
188 | {
189 | "name": "numeric-literals",
190 | "input": "123 123.456 123E45 123e+45 123e-45 1.2.3",
191 | "tokens": [
192 | {
193 | "kind": "NUMBER",
194 | "text": "123"
195 | },
196 | {
197 | "kind": "NUMBER",
198 | "text": "123.456"
199 | },
200 | {
201 | "kind": "NUMBER",
202 | "text": "123E45"
203 | },
204 | {
205 | "kind": "NUMBER",
206 | "text": "123e+45"
207 | },
208 | {
209 | "kind": "NUMBER",
210 | "text": "123e-45"
211 | },
212 | {
213 | "kind": "NUMBER",
214 | "text": "1.2"
215 | },
216 | {
217 | "kind": "PERIOD",
218 | "text": "."
219 | },
220 | {
221 | "kind": "NUMBER",
222 | "text": "3"
223 | }
224 | ]
225 | },
226 | {
227 | "name": "unicode",
228 | "input": "选择 FROM foo",
229 | "tokens": [
230 | {
231 | "kind": "IDENT",
232 | "text": "选择"
233 | },
234 | {
235 | "kind": "FROM",
236 | "text": "FROM"
237 | },
238 | {
239 | "kind": "IDENT",
240 | "text": "foo"
241 | }
242 | ]
243 | }
244 | ]
245 |
--------------------------------------------------------------------------------
/tests/agents/json-specs/wildcard_matcher_tests.json:
--------------------------------------------------------------------------------
1 | {
2 | "testMatchesStartsWith": {
3 | "foo*": {
4 | "foo": true,
5 | "foobar": true,
6 | "bar": false,
7 | "barfoo": false,
8 | "rfoo": false
9 | }
10 | },
11 | "testWildcardInTheMiddle": {
12 | "/foo/*/baz": {
13 | "/foo/bar/baz": true,
14 | "/foo/bar": false
15 | }
16 | },
17 | "testCompoundWildcardMatcher": {
18 | "*foo*foo*": {
19 | "foofoo": true,
20 | "foo/bar/foo": true,
21 | "/foo/bar/foo/bar": true,
22 | "foo": false
23 | }
24 | },
25 | "testCompoundWildcardMatcher3": {
26 | "*foo*oo*": {
27 | "foooo": true,
28 | "foofoo": true,
29 | "foo/bar/foo": true,
30 | "/foo/bar/foo/bar": true,
31 | "foo": false,
32 | "fooo": false
33 | }
34 | },
35 | "testCompoundWildcardMatcher2": {
36 | "*foo*bar*": {
37 | "foobar": true,
38 | "foo/bar/foo/baz": true,
39 | "/foo/bar/baz": true,
40 | "bar/foo": false,
41 | "barfoo": false
42 | }
43 | },
44 | "testCompoundWildcardMatcher4": {
45 | "*foo*far*": {
46 | "foofar": true,
47 | "foo/far/foo/baz": true,
48 | "/foo/far/baz": true,
49 | "/far/foo": false,
50 | "farfoo": false
51 | }
52 | },
53 | "testMatchBetween": {
54 | "*foo*foo*": {
55 | "foofoo": true,
56 | "foo/foo/foo/baz": true,
57 | "/foo/foo/baz": true,
58 | "/foo/foo": true,
59 | "foobar": false
60 | }
61 | },
62 | "testComplexExpressions": {
63 | "/foo/*/baz*": {
64 | "/foo/a/bar/b/baz": true
65 | },
66 | "/foo/*/bar/*/baz": {
67 | "/foo/a/bar/b/baz": true
68 | }
69 | },
70 | "testInfixEmptyMatcher": {
71 | "**": {
72 | "": true,
73 | "foo": true
74 | }
75 | },
76 | "testMatchesEndsWith": {
77 | "*foo": {
78 | "foo": true,
79 | "foobar": false,
80 | "bar": false,
81 | "barfoo": true,
82 | "foor": false
83 | }
84 | },
85 | "testMatchesEquals": {
86 | "foo": {
87 | "foo": true,
88 | "foobar": false,
89 | "bar": false,
90 | "barfoo": false
91 | }
92 | },
93 | "testMatchesInfix": {
94 | "*foo*": {
95 | "foo": true,
96 | "foobar": true,
97 | "bar": false,
98 | "barfoo": true,
99 | "barfoobaz": true
100 | }
101 | },
102 | "testMatchesNoWildcard": {
103 | "foo": {
104 | "foo": true,
105 | "foobar": false
106 | }
107 | },
108 | "testMatchesStartsWith_ignoreCase": {
109 | "foo*": {
110 | "foo": true,
111 | "foobar": true,
112 | "bar": false,
113 | "barfoo": false
114 | }
115 | },
116 | "testInfixEmptyMatcher_ignoreCase": {
117 | "**": {
118 | "": true,
119 | "foo": true
120 | }
121 | },
122 | "testMatchesEndsWith_ignoreCase": {
123 | "*foo": {
124 | "fOo": true,
125 | "foobar": false,
126 | "bar": false,
127 | "baRFoo": true
128 | }
129 | },
130 | "testMatchesEquals_ignoreCase": {
131 | "foo": {
132 | "fOo": true,
133 | "foOBar": false,
134 | "BAR": false,
135 | "barfoo": false
136 | }
137 | },
138 | "testMatchesInfix_ignoreCase": {
139 | "*foo*": {
140 | "FOO": true,
141 | "foOBar": true,
142 | "BAR": false,
143 | "baRFOo": true,
144 | "BARFOOBAZ": true
145 | }
146 | },
147 | "testMatchesInfix_caseSensitive": {
148 | "(?-i)*foo*": {
149 | "foo": true,
150 | "FOO": false
151 | }
152 | },
153 | "testMatchesNoWildcard_ignoreCase": {
154 | "foo": {
155 | "FOO": true,
156 | "foobar": false
157 | }
158 | },
159 | "testNeedleLongerThanHaystack": {
160 | "*foo": {
161 | "baz": false
162 | },
163 | "*foob": {
164 | "baz": false
165 | },
166 | "*fooba": {
167 | "baz": false
168 | },
169 | "*foobar": {
170 | "baz": false
171 | },
172 | "foo*": {
173 | "baz": false
174 | },
175 | "foob*": {
176 | "baz": false
177 | },
178 | "fooba*": {
179 | "baz": false
180 | },
181 | "foobar*": {
182 | "baz": false
183 | },
184 | "*foobar*": {
185 | "baz": false
186 | }
187 | },
188 | "testSingleCharacterWildcardNotSupported": {
189 | "fo?": {
190 | "foo": false,
191 | "fo?": true
192 | }
193 | }
194 | }
195 |
--------------------------------------------------------------------------------