├── .python-version
├── .idea
    ├── .gitignore
    ├── vcs.xml
    ├── modules.xml
    ├── misc.xml
    ├── runConfigurations
    │   ├── Make_Lint_Fix.xml
    │   ├── All_Quality_Checks.xml
    │   ├── Black_Format.xml
    │   ├── Ruff_Lint.xml
    │   ├── Mypy_Type_Check.xml
    │   └── Ruff_Lint_Fix.xml
    ├── platform-problem-monitoring-core.iml
    └── inspectionProfiles
    │   └── Project_Default.xml
├── src
    ├── tests
    │   ├── fixtures
    │   │   ├── current_date_time.txt
    │   │   ├── __init__.py
    │   │   ├── lucene_query.json
    │   │   └── previous_normalization_results.json
    │   ├── __init__.py
    │   ├── test_step6_extract_fields.py
    │   └── test_step7_normalize_messages.py
    └── platform_problem_monitoring_core
    │   ├── __init__.py
    │   ├── step1_prepare.py
    │   ├── step6_extract_fields.py
    │   ├── step12_cleanup.py
    │   ├── step11_store_new_state.py
    │   ├── utils.py
    │   ├── step2_download_previous_state.py
    │   ├── step10_send_email_report.py
    │   ├── step3_retrieve_hourly_problem_numbers.py
    │   ├── step4_generate_trend_chart.py
    │   ├── step8_compare_normalizations.py
    │   ├── step5_download_logstash_documents.py
    │   └── step7_normalize_messages.py
├── assets
    ├── sample-trend-and-report-input-data
    │   ├── start_date_time.txt
    │   ├── trend_chart.png
    │   ├── lucene_query.json
    │   ├── email_body.txt
    │   ├── hourly_problem_numbers.json
    │   ├── norm_results.json
    │   ├── norm_results_prev.json
    │   └── comparison_results.json
    └── readme-hero-image.png
├── etc
    ├── main.conf.dist
    └── lucene_query.json.dist
├── .gitignore
├── .github
    └── workflows
    │   ├── tests.yml
    │   ├── code-quality.yml
    │   └── release.yml
├── docs
    ├── NOTES.md
    ├── DEVELOPMENT.md
    ├── QUALITY.md
    ├── JETBRAINS_SETUP.md
    └── RELEASE_MANAGEMENT.md
├── LICENSE.txt
├── .vscode
    └── settings.json
├── .pre-commit-config.yaml
├── pyproject.toml
├── Makefile
├── README.md
└── bin
    └── ppmc


/.python-version:
--------------------------------------------------------------------------------
1 | 3.10.16
2 | 


--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
1 | # Default ignored files
2 | /shelf/
3 | /workspace.xml
4 | 


--------------------------------------------------------------------------------
/src/tests/fixtures/current_date_time.txt:
--------------------------------------------------------------------------------
1 | 2025-03-05T19:23:10.832778+00:00
2 | 


--------------------------------------------------------------------------------
/src/tests/__init__.py:
--------------------------------------------------------------------------------
1 | """Test package for platform_problem_monitoring_core."""
2 | 


--------------------------------------------------------------------------------
/assets/sample-trend-and-report-input-data/start_date_time.txt:
--------------------------------------------------------------------------------
1 | 2025-03-06T00:00:00Z
2 | 


--------------------------------------------------------------------------------
/src/tests/fixtures/__init__.py:
--------------------------------------------------------------------------------
1 | """Test fixtures for platform_problem_monitoring_core tests."""
2 | 


--------------------------------------------------------------------------------
/assets/readme-hero-image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dx-tooling/platform-problem-monitoring-core/HEAD/assets/readme-hero-image.png


--------------------------------------------------------------------------------
/assets/sample-trend-and-report-input-data/trend_chart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dx-tooling/platform-problem-monitoring-core/HEAD/assets/sample-trend-and-report-input-data/trend_chart.png


--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="VcsDirectoryMappings">
4 |     <mapping directory="" vcs="Git" />
5 |   </component>
6 | </project>
7 | 


--------------------------------------------------------------------------------
/.idea/modules.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <project version="4">
3 |   <component name="ProjectModuleManager">
4 |     <modules>
5 |       <module fileurl="file://$PROJECT_DIR$/.idea/platform-problem-monitoring-core.iml" filepath="$PROJECT_DIR$/.idea/platform-problem-monitoring-core.iml" />
6 |     </modules>
7 |   </component>
8 | </project>
9 | 


--------------------------------------------------------------------------------
/src/platform_problem_monitoring_core/__init__.py:
--------------------------------------------------------------------------------
 1 | """Platform Problem Monitoring Core.
 2 | 
 3 | A tool for monitoring platform problems using Elasticsearch logs.
 4 | """
 5 | 
 6 | from importlib.metadata import PackageNotFoundError, version
 7 | 
 8 | try:
 9 |     __version__ = version("platform_problem_monitoring_core")
10 | except PackageNotFoundError:
11 |     __version__ = "0.1.0"  # Default version if package is not installed
12 | 


--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <project version="4">
 3 |   <component name="Black">
 4 |     <option name="enabledOnReformat" value="true" />
 5 |     <option name="enabledOnSave" value="true" />
 6 |     <option name="sdkName" value="Python 3.13 (platform-problem-monitoring-core)" />
 7 |   </component>
 8 |   <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.13 (platform-problem-monitoring-core)" project-jdk-type="Python SDK" />
 9 | </project>
10 | 


--------------------------------------------------------------------------------
/etc/main.conf.dist:
--------------------------------------------------------------------------------
 1 | REMOTE_STATE_S3_BUCKET_NAME=""
 2 | REMOTE_STATE_S3_FOLDER_NAME=""
 3 | 
 4 | ELASTICSEARCH_SERVER_BASE_URL=""
 5 | ELASTICSEARCH_LUCENE_QUERY_FILE_PATH=""
 6 | 
 7 | KIBANA_DISCOVER_BASE_URL=""
 8 | KIBANA_DOCUMENT_DEEPLINK_URL_STRUCTURE="https://example.com/kibana/_plugin/kibana/app/discover#/doc/logstash-*/{{index}}?id={{id}}"
 9 | 
10 | SMTP_SERVER_HOSTNAME=""
11 | SMTP_SERVER_PORT=""
12 | SMTP_SERVER_USERNAME=""
13 | SMTP_SERVER_PASSWORD=""
14 | SMTP_SENDER_ADDRESS=""
15 | SMTP_RECEIVER_ADDRESS=""
16 | 
17 | # Number of hours to look back for problem trends
18 | TREND_HOURS_BACK="24"
19 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | .idea/workspace.xml
 2 | .idea/tasks.xml
 3 | .idea/dictionaries/
 4 | .idea/shelf/
 5 | .idea/usage.statistics.xml
 6 | .idea/contentModel.xml
 7 | .idea/dataSources/
 8 | .idea/dataSources.local.xml
 9 | .idea/httpRequests/
10 | .idea/caches/
11 | # Keep .idea/runConfigurations
12 | # Keep .idea/inspectionProfiles
13 | # Keep .idea/misc.xml
14 | # Keep .idea/modules.xml
15 | 
16 | venv/
17 | main.conf
18 | etc/lucene_query.json
19 | 
20 | # Python package build artifacts
21 | *.egg-info/
22 | *.egg
23 | dist/
24 | build/
25 | __pycache__/
26 | *.py[cod]
27 | *$py.class
28 | references/
29 | coverage.xml
30 | 
31 | # Cache directories
32 | .mypy_cache/
33 | .pytest_cache/
34 | .ruff_cache/
35 | .coverage
36 | htmlcov/
37 | 


--------------------------------------------------------------------------------
/etc/lucene_query.json.dist:
--------------------------------------------------------------------------------
 1 | {
 2 |     "query": {
 3 |         "bool": {
 4 |             "should": [
 5 |                 { "match": { "message": "error" } },
 6 |                 { "match": { "message": "failure" } },
 7 |                 { "match": { "message": "critical" } },
 8 |                 { "match": { "message": "alert" } },
 9 |                 { "match": { "message": "exception" } }
10 |             ],
11 |             "must_not": [
12 |                 { "match": { "message": "User Deprecated" } },
13 |                 { "match": { "message": "logstash" } },
14 |                 { "term": { "syslog_program": "dd.collector" } },
15 |                 { "term": { "syslog_program": "dd.forwarder" } },
16 |                 { "term": { "syslog_program": "dd.dogstatsd" } }
17 |             ],
18 |             "minimum_should_match": 1
19 |         }
20 |     }
21 | }
22 | 


--------------------------------------------------------------------------------
/src/tests/fixtures/lucene_query.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "query": {
 3 |         "bool": {
 4 |             "should": [
 5 |                 { "match": { "message": "error" } },
 6 |                 { "match": { "message": "failure" } },
 7 |                 { "match": { "message": "critical" } },
 8 |                 { "match": { "message": "alert" } },
 9 |                 { "match": { "message": "exception" } }
10 |             ],
11 |             "must_not": [
12 |                 { "match": { "message": "User Deprecated" } },
13 |                 { "match": { "message": "logstash" } },
14 |                 { "term": { "syslog_program": "dd.collector" } },
15 |                 { "term": { "syslog_program": "dd.forwarder" } },
16 |                 { "term": { "syslog_program": "dd.dogstatsd" } }
17 |             ],
18 |             "minimum_should_match": 1
19 |         }
20 |     }
21 | }
22 | 


--------------------------------------------------------------------------------
/.github/workflows/tests.yml:
--------------------------------------------------------------------------------
 1 | name: Tests
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ main ]
 6 |   pull_request:
 7 |     branches: [ main ]
 8 | 
 9 | jobs:
10 |   test:
11 |     runs-on: ubuntu-latest
12 |     env:
13 |       CI: true
14 |     strategy:
15 |       matrix:
16 |         python-version: ["3.10", "3.11", "3.12", "3.13"]
17 | 
18 |     steps:
19 |       - uses: actions/checkout@v3
20 | 
21 |       - name: Set up Python ${{ matrix.python-version }}
22 |         uses: actions/setup-python@v4
23 |         with:
24 |           python-version: ${{ matrix.python-version }}
25 |           cache: 'pip'
26 | 
27 |       - name: Install dependencies
28 |         run: |
29 |           python -m pip install --upgrade pip
30 |           pip install -e ".[dev]"
31 | 
32 |       - name: Run tests
33 |         run: |
34 |           # Run tests with coverage
35 |           make test-coverage
36 | 


--------------------------------------------------------------------------------
/.idea/runConfigurations/Make_Lint_Fix.xml:
--------------------------------------------------------------------------------
 1 | <component name="ProjectRunConfigurationManager">
 2 |   <configuration default="false" name="Make Lint Fix" type="ShConfigurationType">
 3 |     <option name="SCRIPT_TEXT" value="make lint-fix" />
 4 |     <option name="INDEPENDENT_SCRIPT_PATH" value="true" />
 5 |     <option name="SCRIPT_PATH" value="" />
 6 |     <option name="SCRIPT_OPTIONS" value="" />
 7 |     <option name="INDEPENDENT_SCRIPT_WORKING_DIRECTORY" value="true" />
 8 |     <option name="SCRIPT_WORKING_DIRECTORY" value="$PROJECT_DIR$" />
 9 |     <option name="INDEPENDENT_INTERPRETER_PATH" value="true" />
10 |     <option name="INTERPRETER_PATH" value="/bin/bash" />
11 |     <option name="INTERPRETER_OPTIONS" value="" />
12 |     <option name="EXECUTE_IN_TERMINAL" value="true" />
13 |     <option name="EXECUTE_SCRIPT_FILE" value="false" />
14 |     <envs />
15 |     <method v="2" />
16 |   </configuration>
17 | </component>
18 | 


--------------------------------------------------------------------------------
/.github/workflows/code-quality.yml:
--------------------------------------------------------------------------------
 1 | name: Code Quality
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ main ]
 6 |   pull_request:
 7 |     branches: [ main ]
 8 | 
 9 | jobs:
10 |   quality:
11 |     runs-on: ubuntu-latest
12 |     env:
13 |       CI: true
14 |     strategy:
15 |       matrix:
16 |         python-version: ["3.10", "3.11", "3.12", "3.13"]
17 | 
18 |     steps:
19 |       - uses: actions/checkout@v3
20 | 
21 |       - name: Set up Python ${{ matrix.python-version }}
22 |         uses: actions/setup-python@v4
23 |         with:
24 |           python-version: ${{ matrix.python-version }}
25 |           cache: 'pip'
26 | 
27 |       - name: Install dependencies
28 |         run: |
29 |           python -m pip install --upgrade pip
30 |           pip install -e ".[dev]"
31 | 
32 |       - name: Run code quality checks
33 |         run: |
34 |           # Run all code quality checks via the Makefile
35 |           make ci-quality
36 | 


--------------------------------------------------------------------------------
/.idea/runConfigurations/All_Quality_Checks.xml:
--------------------------------------------------------------------------------
 1 | <component name="ProjectRunConfigurationManager">
 2 |   <configuration default="false" name="All Quality Checks" type="ShConfigurationType">
 3 |     <option name="SCRIPT_TEXT" value="make quality" />
 4 |     <option name="INDEPENDENT_SCRIPT_PATH" value="true" />
 5 |     <option name="SCRIPT_PATH" value="" />
 6 |     <option name="SCRIPT_OPTIONS" value="" />
 7 |     <option name="INDEPENDENT_SCRIPT_WORKING_DIRECTORY" value="true" />
 8 |     <option name="SCRIPT_WORKING_DIRECTORY" value="$PROJECT_DIR$" />
 9 |     <option name="INDEPENDENT_INTERPRETER_PATH" value="true" />
10 |     <option name="INTERPRETER_PATH" value="/bin/bash" />
11 |     <option name="INTERPRETER_OPTIONS" value="" />
12 |     <option name="EXECUTE_IN_TERMINAL" value="true" />
13 |     <option name="EXECUTE_SCRIPT_FILE" value="false" />
14 |     <envs />
15 |     <method v="2" />
16 |   </configuration>
17 | </component>
18 | 


--------------------------------------------------------------------------------
/.idea/runConfigurations/Black_Format.xml:
--------------------------------------------------------------------------------
 1 | <component name="ProjectRunConfigurationManager">
 2 |   <configuration default="false" name="Black Format" type="ShConfigurationType">
 3 |     <option name="SCRIPT_TEXT" value="black src/platform_problem_monitoring_core" />
 4 |     <option name="INDEPENDENT_SCRIPT_PATH" value="true" />
 5 |     <option name="SCRIPT_PATH" value="" />
 6 |     <option name="SCRIPT_OPTIONS" value="" />
 7 |     <option name="INDEPENDENT_SCRIPT_WORKING_DIRECTORY" value="true" />
 8 |     <option name="SCRIPT_WORKING_DIRECTORY" value="$PROJECT_DIR$" />
 9 |     <option name="INDEPENDENT_INTERPRETER_PATH" value="true" />
10 |     <option name="INTERPRETER_PATH" value="/bin/bash" />
11 |     <option name="INTERPRETER_OPTIONS" value="" />
12 |     <option name="EXECUTE_IN_TERMINAL" value="true" />
13 |     <option name="EXECUTE_SCRIPT_FILE" value="false" />
14 |     <envs />
15 |     <method v="2" />
16 |   </configuration>
17 | </component>
18 | 


--------------------------------------------------------------------------------
/.idea/runConfigurations/Ruff_Lint.xml:
--------------------------------------------------------------------------------
 1 | <component name="ProjectRunConfigurationManager">
 2 |   <configuration default="false" name="Ruff Lint" type="ShConfigurationType">
 3 |     <option name="SCRIPT_TEXT" value="ruff check src/platform_problem_monitoring_core" />
 4 |     <option name="INDEPENDENT_SCRIPT_PATH" value="true" />
 5 |     <option name="SCRIPT_PATH" value="" />
 6 |     <option name="SCRIPT_OPTIONS" value="" />
 7 |     <option name="INDEPENDENT_SCRIPT_WORKING_DIRECTORY" value="true" />
 8 |     <option name="SCRIPT_WORKING_DIRECTORY" value="$PROJECT_DIR$" />
 9 |     <option name="INDEPENDENT_INTERPRETER_PATH" value="true" />
10 |     <option name="INTERPRETER_PATH" value="/bin/bash" />
11 |     <option name="INTERPRETER_OPTIONS" value="" />
12 |     <option name="EXECUTE_IN_TERMINAL" value="true" />
13 |     <option name="EXECUTE_SCRIPT_FILE" value="false" />
14 |     <envs />
15 |     <method v="2" />
16 |   </configuration>
17 | </component>
18 | 


--------------------------------------------------------------------------------
/.idea/runConfigurations/Mypy_Type_Check.xml:
--------------------------------------------------------------------------------
 1 | <component name="ProjectRunConfigurationManager">
 2 |   <configuration default="false" name="Mypy Type Check" type="ShConfigurationType">
 3 |     <option name="SCRIPT_TEXT" value="mypy src/platform_problem_monitoring_core" />
 4 |     <option name="INDEPENDENT_SCRIPT_PATH" value="true" />
 5 |     <option name="SCRIPT_PATH" value="" />
 6 |     <option name="SCRIPT_OPTIONS" value="" />
 7 |     <option name="INDEPENDENT_SCRIPT_WORKING_DIRECTORY" value="true" />
 8 |     <option name="SCRIPT_WORKING_DIRECTORY" value="$PROJECT_DIR$" />
 9 |     <option name="INDEPENDENT_INTERPRETER_PATH" value="true" />
10 |     <option name="INTERPRETER_PATH" value="/bin/bash" />
11 |     <option name="INTERPRETER_OPTIONS" value="" />
12 |     <option name="EXECUTE_IN_TERMINAL" value="true" />
13 |     <option name="EXECUTE_SCRIPT_FILE" value="false" />
14 |     <envs />
15 |     <method v="2" />
16 |   </configuration>
17 | </component>
18 | 


--------------------------------------------------------------------------------
/.idea/runConfigurations/Ruff_Lint_Fix.xml:
--------------------------------------------------------------------------------
 1 | <component name="ProjectRunConfigurationManager">
 2 |   <configuration default="false" name="Ruff Lint Fix" type="ShConfigurationType" focusToolWindowBeforeRun="true">
 3 |     <option name="SCRIPT_TEXT" value="ruff check --fix src/platform_problem_monitoring_core" />
 4 |     <option name="INDEPENDENT_SCRIPT_PATH" value="true" />
 5 |     <option name="SCRIPT_PATH" value="" />
 6 |     <option name="SCRIPT_OPTIONS" value="" />
 7 |     <option name="INDEPENDENT_SCRIPT_WORKING_DIRECTORY" value="true" />
 8 |     <option name="SCRIPT_WORKING_DIRECTORY" value="$PROJECT_DIR$" />
 9 |     <option name="INDEPENDENT_INTERPRETER_PATH" value="true" />
10 |     <option name="INTERPRETER_PATH" value="/bin/bash" />
11 |     <option name="INTERPRETER_OPTIONS" value="" />
12 |     <option name="EXECUTE_IN_TERMINAL" value="true" />
13 |     <option name="EXECUTE_SCRIPT_FILE" value="false" />
14 |     <envs />
15 |     <method v="2" />
16 |   </configuration>
17 | </component>
18 | 


--------------------------------------------------------------------------------
/docs/NOTES.md:
--------------------------------------------------------------------------------
 1 | # Notes
 2 | 
 3 | ## Runbook
 4 | 
 5 |     python3 -m platform_problem_monitoring_core.step5_download_logstash_documents \
 6 |         --elasticsearch-url "http://127.0.0.1:9201" \
 7 |         --query-file "/Users/manuel/git/github/dx-tooling/platform-problem-monitoring-core/src/lucene_query.json" \
 8 |         --start-date-time-file "/tmp/latest-date-time.txt" \
 9 |         --output-file "/tmp/docs.json" \
10 |         --current-date-time-file "/tmp/cur-date-time.txt"
11 | 
12 |     curl -s -X GET "http://127.0.0.1:9201/_search?pretty" -H 'Content-Type: application/json' -d'
13 |     {
14 |         "query": {
15 |             "query_string": {
16 |                 "query": "@timestamp: ['2025-03-04T00:00:00.000' TO '2025-03-04T01:00:00.000'] AND type: \"symfony-errors\""
17 |             }
18 |         }
19 |     }
20 |     '
21 | 
22 | ## TODOs & Ideas
23 | 
24 | - add step 12 (cleanup) to ppmc
25 | - add ppmc option to disable cleanup step 12
26 | - allow the local filesystem as a state storage alternative
27 | 


--------------------------------------------------------------------------------
/.idea/platform-problem-monitoring-core.iml:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <module type="PYTHON_MODULE" version="4">
 3 |   <component name="NewModuleRootManager">
 4 |     <content url="file://$MODULE_DIR$">
 5 |       <sourceFolder url="file://$MODULE_DIR$/src" isTestSource="false" />
 6 |       <excludeFolder url="file://$MODULE_DIR$/venv" />
 7 |       <excludeFolder url="file://$MODULE_DIR$/.mypy_cache" />
 8 |       <excludeFolder url="file://$MODULE_DIR$/.pytest_cache" />
 9 |       <excludeFolder url="file://$MODULE_DIR$/.ruff_cache" />
10 |       <excludeFolder url="file://$MODULE_DIR$/build" />
11 |       <excludeFolder url="file://$MODULE_DIR$/dist" />
12 |     </content>
13 |     <orderEntry type="jdk" jdkName="Python 3.13 (platform-problem-monitoring-core)" jdkType="Python SDK" />
14 |     <orderEntry type="sourceFolder" forTests="false" />
15 |   </component>
16 |   <component name="PyDocumentationSettings">
17 |     <option name="format" value="GOOGLE" />
18 |     <option name="myDocStringFormat" value="Google" />
19 |   </component>
20 | </module>
21 | 


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2025 Manuel Kießling
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "python.linting.enabled": true,
 3 |     "python.linting.mypyEnabled": true,
 4 |     "python.linting.flake8Enabled": false,
 5 |     "python.linting.banditEnabled": true,
 6 |     "python.formatting.provider": "black",
 7 |     "python.formatting.blackArgs": ["--line-length", "100"],
 8 |     "editor.formatOnSave": true,
 9 |     "editor.codeActionsOnSave": {
10 |         "source.organizeImports": "explicit",
11 |         "source.fixAll": "explicit"
12 |     },
13 |     "python.linting.ignorePatterns": [
14 |         ".vscode/*.py",
15 |         "**/site-packages/**/*.py",
16 |         "venv/**/*.py"
17 |     ],
18 |     "python.linting.mypyArgs": [
19 |         "--config-file=pyproject.toml"
20 |     ],
21 |     "[python]": {
22 |         "editor.rulers": [100],
23 |         "editor.tabSize": 4,
24 |         "editor.insertSpaces": true,
25 |         "editor.detectIndentation": false
26 |     },
27 |     "files.exclude": {
28 |         "**/__pycache__": true,
29 |         "**/.mypy_cache": true,
30 |         "**/.pytest_cache": true,
31 |         "**/.ruff_cache": true,
32 |         "**/*.egg-info": true
33 |     },
34 |     "python.analysis.typeCheckingMode": "strict",
35 |     "python.analysis.extraPaths": ["src"]
36 | }
37 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | repos:
 2 | -   repo: https://github.com/pre-commit/pre-commit-hooks
 3 |     rev: v4.4.0
 4 |     hooks:
 5 |     -   id: trailing-whitespace
 6 |     -   id: end-of-file-fixer
 7 |     -   id: check-yaml
 8 |     -   id: check-added-large-files
 9 |     -   id: check-json
10 |     -   id: check-toml
11 |     -   id: detect-private-key
12 | 
13 | -   repo: local
14 |     hooks:
15 |     -   id: black
16 |         name: black
17 |         entry: make format-check-files
18 |         language: system
19 |         types: [python]
20 |         pass_filenames: true
21 | 
22 |     -   id: isort
23 |         name: isort
24 |         entry: make isort-check-files
25 |         language: system
26 |         types: [python]
27 |         pass_filenames: true
28 | 
29 |     -   id: ruff
30 |         name: ruff
31 |         entry: make lint-files
32 |         language: system
33 |         types: [python]
34 |         pass_filenames: true
35 | 
36 |     -   id: mypy
37 |         name: mypy
38 |         entry: make type-check-files
39 |         language: system
40 |         types: [python]
41 |         pass_filenames: true
42 | 
43 |     -   id: bandit
44 |         name: bandit
45 |         entry: make security-check-files
46 |         language: system
47 |         types: [python]
48 |         exclude: ^src/tests/
49 |         pass_filenames: true
50 | 


--------------------------------------------------------------------------------
/assets/sample-trend-and-report-input-data/lucene_query.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "query": {
 3 |         "bool": {
 4 |             "should": [
 5 |                 {
 6 |                     "match": {
 7 |                         "message": "error"
 8 |                     }
 9 |                 },
10 |                 {
11 |                     "match": {
12 |                         "message": "failure"
13 |                     }
14 |                 },
15 |                 {
16 |                     "match": {
17 |                         "message": "exception"
18 |                     }
19 |                 },
20 |                 {
21 |                     "match": {
22 |                         "message": "warning"
23 |                     }
24 |                 },
25 |                 {
26 |                     "match": {
27 |                         "message": "critical"
28 |                     }
29 |                 }
30 |             ],
31 |             "must_not": [
32 |                 {
33 |                     "match": {
34 |                         "message": "User Deprecated"
35 |                     }
36 |                 },
37 |                 {
38 |                     "match": {
39 |                         "message": "debug"
40 |                     }
41 |                 }
42 |             ],
43 |             "minimum_should_match": 1
44 |         }
45 |     }
46 | }
47 | 


--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
 1 | name: Release
 2 | 
 3 | on:
 4 |   push:
 5 |     tags:
 6 |       - 'v*.*.*'
 7 | 
 8 | permissions:
 9 |   contents: write  # Needed for creating releases and uploading assets
10 | 
11 | jobs:
12 |   release:
13 |     runs-on: ubuntu-latest
14 |     steps:
15 |       - uses: actions/checkout@v3
16 |         with:
17 |           fetch-depth: 0
18 | 
19 |       - name: Set up Python
20 |         uses: actions/setup-python@v4
21 |         with:
22 |           python-version: '3.10'
23 |           cache: 'pip'
24 | 
25 |       - name: Install dependencies
26 |         run: |
27 |           python -m pip install --upgrade pip
28 |           pip install build twine wheel
29 |           pip install -e ".[dev]"
30 | 
31 |       - name: Run tests
32 |         run: make test-coverage
33 | 
34 |       - name: Build package
35 |         run: python -m build
36 | 
37 |       - name: Get version from tag
38 |         id: get_version
39 |         run: echo "VERSION=${GITHUB_REF#refs/tags/v}" >> $GITHUB_OUTPUT
40 | 
41 |       - name: Create Release
42 |         uses: softprops/action-gh-release@v1
43 |         with:
44 |           name: Release v${{ steps.get_version.outputs.VERSION }}
45 |           draft: false
46 |           prerelease: false
47 |           generate_release_notes: true
48 |           files: |
49 |             dist/*.whl
50 |             dist/*.tar.gz
51 | 


--------------------------------------------------------------------------------
/.idea/inspectionProfiles/Project_Default.xml:
--------------------------------------------------------------------------------
 1 | <component name="InspectionProjectProfileManager">
 2 |   <profile version="1.0">
 3 |     <option name="myName" value="Project Default" />
 4 |     <inspection_tool class="PyCompatibilityInspection" enabled="true" level="WARNING" enabled_by_default="true">
 5 |       <option name="ourVersions">
 6 |         <value>
 7 |           <list size="1">
 8 |             <item index="0" class="java.lang.String" itemvalue="3.10" />
 9 |           </list>
10 |         </value>
11 |       </option>
12 |     </inspection_tool>
13 |     <inspection_tool class="PyPackageRequirementsInspection" enabled="true" level="WARNING" enabled_by_default="true">
14 |       <option name="ignoredPackages">
15 |         <value>
16 |           <list size="0" />
17 |         </value>
18 |       </option>
19 |     </inspection_tool>
20 |     <inspection_tool class="PyPep8Inspection" enabled="true" level="WEAK WARNING" enabled_by_default="true">
21 |       <option name="ignoredErrors">
22 |         <list />
23 |       </option>
24 |     </inspection_tool>
25 |     <inspection_tool class="PyPep8NamingInspection" enabled="true" level="WEAK WARNING" enabled_by_default="true" />
26 |     <inspection_tool class="PyTypeCheckerInspection" enabled="true" level="WARNING" enabled_by_default="true" />
27 |     <inspection_tool class="PyUnresolvedReferencesInspection" enabled="true" level="WARNING" enabled_by_default="true" />
28 |     <inspection_tool class="SpellCheckingInspection" enabled="false" level="TYPO" enabled_by_default="false">
29 |       <option name="processCode" value="true" />
30 |       <option name="processLiterals" value="true" />
31 |       <option name="processComments" value="true" />
32 |     </inspection_tool>
33 |   </profile>
34 | </component>
35 | 


--------------------------------------------------------------------------------
/docs/DEVELOPMENT.md:
--------------------------------------------------------------------------------
 1 | # Development
 2 | 
 3 | ## Development Setup
 4 | 
 5 | 1. **Clone the repository:**
 6 |    ```bash
 7 |    git clone https://github.com/dx-tooling/platform-problem-monitoring-core.git
 8 |    cd platform-problem-monitoring-core
 9 |    ```
10 | 
11 | 2. **Install development dependencies:**
12 |    ```bash
13 |    make install
14 |    ```
15 |    This creates a virtual environment, installs the package and all development dependencies, and sets up pre-commit hooks.
16 | 
17 | 3. **Activate the virtual environment:**
18 |    ```bash
19 |    source venv/bin/activate  # On Windows: venv\Scripts\activate
20 |    ```
21 | 
22 | ## Code Quality Tools
23 | 
24 | This project uses a unified approach to code quality with all tools configured in `pyproject.toml` and executed via:
25 | 
26 | 1. **Pre-commit hooks** - Run automatically before each commit
27 | 2. **Make commands** - Run manually or in CI
28 | 
29 | Available make commands:
30 | 
31 | ```bash
32 |   make install        Install package and development dependencies
33 |   make activate-venv  Instructions to activate the virtual environment
34 |   make format         Format code with black and isort
35 |   make format-check   Check if code is properly formatted without modifying files
36 |   make lint           Run linters (ruff)
37 |   make lint-fix       Run linters and auto-fix issues where possible
38 |   make type-check     Run mypy type checking
39 |   make security-check Run bandit security checks
40 |   make quality        Run all code quality checks (with formatting)
41 |   make ci-quality     Run all code quality checks (without modifying files)
42 |   make test           Run tests
43 |   make test-verbose   Run tests with verbose output
44 |   make test-coverage  Run tests with coverage report
45 |   make test-file      Run tests for a specific file (usage: make test-file file=path/to/test_file.py)
46 |   make update-deps    Update all dependencies to their latest semver-compatible versions
47 |   make bump-version   Update the version number in pyproject.toml
48 |   make release        Create a new release tag (after running quality checks and tests)
49 |   make clean          Remove build artifacts and cache directories
50 | ```
51 | 
52 | The pre-commit hooks are configured to use the same Makefile targets, ensuring consistency between local development and CI environments.
53 | 


--------------------------------------------------------------------------------
/src/platform_problem_monitoring_core/step1_prepare.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | """Prepare environment for a process run."""
 3 | 
 4 | import argparse
 5 | import os
 6 | import sys
 7 | import tempfile
 8 | from pathlib import Path
 9 | 
10 | from platform_problem_monitoring_core.utils import ensure_dir_exists, logger
11 | 
12 | 
def prepare_environment() -> str:
    """
    Prepare environment for a process run.

    Creates a temporary work directory for storing intermediate files and
    checks that it is usable. If any check fails, the freshly created
    directory is removed again so that a failed run does not leave a stale
    directory behind.

    Returns:
        Path to the temporary work folder

    Raises:
        PermissionError: If unable to create or write to the temporary directory
        OSError: If any other OS-level error occurs
    """
    # Local import: shutil is only needed on the failure path below.
    import shutil

    logger.info("Preparing environment for process run")

    # Stays None until mkdtemp succeeds, so the error handler knows whether
    # there is anything to clean up.
    work_dir: str | None = None
    try:
        # Create temporary work directory
        work_dir = tempfile.mkdtemp(prefix="platform_problem_monitoring_")
        logger.info(f"Created temporary work directory: {work_dir}")

        # Check if directory exists and is writable
        work_path = Path(work_dir)
        if not work_path.exists():
            error_msg = f"Failed to create temporary directory: {work_dir}"
            raise FileNotFoundError(error_msg)

        if not os.access(work_dir, os.W_OK):
            error_msg = f"No write access to temporary directory: {work_dir}"
            raise PermissionError(error_msg)

        # Create and immediately remove a probe subdirectory. This isn't
        # strictly necessary but demonstrates that the directory is writable.
        # NOTE(review): ensure_dir_exists is a project helper; presumably it
        # creates the directory if missing - confirm against utils.py.
        test_subdir = work_path / "test"
        ensure_dir_exists(str(test_subdir))
        test_subdir.rmdir()  # Clean up the probe directory

        logger.info("Environment preparation complete")
        return work_dir
    except OSError as e:
        # FileNotFoundError and PermissionError are OSError subclasses, so
        # this single handler covers every failure raised above.
        logger.error(f"Failed to prepare environment: {str(e)}")
        if work_dir is not None:
            # Best-effort removal; the original error is what gets re-raised.
            shutil.rmtree(work_dir, ignore_errors=True)
        raise
54 | 
55 | 
def main() -> None:
    """
    Run the preparation step as a command line script.

    Prints the path of the created work directory to stdout so the next step
    can pick it up, then exits with status 0. Any failure is logged and the
    process exits with status 1.
    """
    # The script takes no options; parsing still provides --help and rejects
    # unexpected arguments.
    argparse.ArgumentParser(description="Prepare environment for a process run").parse_args()

    try:
        created_dir = prepare_environment()
        # Print the work directory path for the next step to use
        print(created_dir)
    except Exception as e:
        logger.error(f"Error preparing environment: {str(e)}")
        sys.exit(1)
    else:
        sys.exit(0)


if __name__ == "__main__":
    main()
74 | 


--------------------------------------------------------------------------------
/assets/sample-trend-and-report-input-data/email_body.txt:
--------------------------------------------------------------------------------
  1 | 
  2 | PLATFORM PROBLEM MONITORING REPORT
  3 | =================================
  4 | Generated: 2025-03-09 15:59:20 UTC
  5 | 
  6 | SUMMARY
  7 | -------
  8 | Current problem patterns: 0
  9 | Previous problem patterns: 0
 10 | New problem patterns: 3
 11 | Disappeared problem patterns: 5
 12 | 
 13 | NEW PROBLEM PATTERNS
 14 | ===================
 15 | These patterns have appeared since the last report.
 16 | 
 17 | 1. [32] SSL certificate for <*> is expiring in <*> days
 18 |    Sample documents:
 19 | 
 20 | 2. [21] Disk usage warning: <*> is at <*>% capacity
 21 |    Sample documents:
 22 | 
 23 | 3. [18] Connection reset by peer while sending request to <*>
 24 |    Sample documents:
 25 | 
 26 | 
 27 | 
 28 | DISAPPEARED PROBLEM PATTERNS
 29 | ==========================
 30 | These patterns were present in the previous report but are no longer occurring.
 31 | 
 32 | 1. [245] Error connecting to database at <*>: Connection timed out
 33 |    Sample documents:
 34 | 
 35 | 2. [124] Exception in thread "main" java.lang.OutOfMemoryError: <*>
 36 |    Sample documents:
 37 | 
 38 | 3. [89] Kubernetes pod <*> in namespace <*> failed health check
 39 |    Sample documents:
 40 | 
 41 | 4. [54] Failed to process message from queue <*>: <*>
 42 |    Sample documents:
 43 | 
 44 | 5. [42] Cache invalidation failed for key <*>
 45 |    Sample documents:
 46 | 
 47 | 
 48 | 
 49 | INCREASED PROBLEM PATTERNS
 50 | ========================
 51 | These patterns have increased in occurrence count since the last report.
 52 | 
 53 | 1. [14] (+0, +0.0%) Failed to process job <*> - timeout after <*> seconds
 54 |    Sample documents:
 55 | 
 56 | 
 57 | 
 58 | DECREASED PROBLEM PATTERNS
 59 | ========================
 60 | These patterns have decreased in occurrence count since the last report.
 61 | 
 62 | 1. [72] (-0, -0.0%) Failed to authenticate user <*> - invalid credentials
 63 |    Sample documents:
 64 | 
 65 | 2. [58] (-0, -0.0%) API rate limit exceeded for user ID <*>
 66 |    Sample documents:
 67 | 
 68 | 3. [12] (-0, -0.0%) HTTP request failed: <*> <*> returned status code <*>
 69 |    Sample documents:
 70 | 
 71 | 
 72 | 
 73 | TOP 25 CURRENT PROBLEM PATTERNS
 74 | ==============================
 75 | The most frequent problem patterns in the current report.
 76 | 
 77 | 1. [72] Failed to authenticate user <*> - invalid credentials
 78 |    Sample documents:
 79 | 
 80 | 2. [58] API rate limit exceeded for user ID <*>
 81 |    Sample documents:
 82 | 
 83 | 3. [32] SSL certificate for <*> is expiring in <*> days
 84 |    Sample documents:
 85 | 
 86 | 4. [21] Disk usage warning: <*> is at <*>% capacity
 87 |    Sample documents:
 88 | 
 89 | 5. [18] Connection reset by peer while sending request to <*>
 90 |    Sample documents:
 91 | 
 92 | 6. [14] Failed to process job <*> - timeout after <*> seconds
 93 |    Sample documents:
 94 | 
 95 | 7. [12] AWS S3 access denied: <*>
 96 |    Sample documents:
 97 | 
 98 | 
 99 | 
100 | This is an automated report from the Platform Problem Monitoring system.
101 | 


--------------------------------------------------------------------------------
/docs/QUALITY.md:
--------------------------------------------------------------------------------
  1 | # Code Quality Guidelines
  2 | 
  3 | This document describes the code quality tools and practices used in this project.
  4 | 
  5 | ## Code Quality Tools
  6 | 
  7 | We use the following tools to maintain high code quality:
  8 | 
  9 | ### Ruff
 10 | 
 11 | [Ruff](https://github.com/charliermarsh/ruff) is an extremely fast Python linter, written in Rust. It includes many checks from tools like flake8, isort, pycodestyle, and many plugins.
 12 | 
 13 | ```bash
 14 | # Run Ruff
 15 | make lint
 16 | ```
 17 | 
 18 | ### Black
 19 | 
 20 | [Black](https://github.com/psf/black) is an uncompromising code formatter for Python. It applies a consistent style by reformatting your code.
 21 | 
 22 | ```bash
 23 | # Format code with Black
 24 | make format
 25 | ```
 26 | 
 27 | ### isort
 28 | 
 29 | [isort](https://github.com/PyCQA/isort) sorts your imports alphabetically and automatically separates them into sections and by type.
 30 | 
 31 | ```bash
 32 | # Run isort (included in format command)
 33 | make format
 34 | ```
 35 | 
 36 | ### mypy
 37 | 
 38 | [mypy](https://github.com/python/mypy) is an optional static type checker for Python. It helps catch common errors before runtime.
 39 | 
 40 | ```bash
 41 | # Run mypy
 42 | make type-check
 43 | ```
 44 | 
 45 | ### Bandit
 46 | 
 47 | [Bandit](https://github.com/PyCQA/bandit) is a tool designed to find common security issues in Python code.
 48 | 
 49 | ```bash
 50 | # Run security checks
 51 | make security-check
 52 | ```
 53 | 
 54 | ### pre-commit
 55 | 
 56 | [pre-commit](https://pre-commit.com/) runs these checks automatically before each commit, ensuring that only quality code enters the repository.
 57 | 
 58 | ```bash
 59 | # Install pre-commit hooks
 60 | pre-commit install
 61 | ```
 62 | 
 63 | ## Running All Checks
 64 | 
 65 | You can run all quality checks at once:
 66 | 
 67 | ```bash
 68 | make quality
 69 | ```
 70 | 
 71 | ## VS Code Integration
 72 | 
 73 | This project includes VS Code settings that integrate all these tools into your editor. With the proper extensions installed, you'll get:
 74 | 
 75 | - Real-time type checking
 76 | - Automatic formatting on save
 77 | - Inline error highlighting
 78 | - Code actions to fix issues
 79 | 
 80 | ## Recommended VS Code Extensions
 81 | 
 82 | - Python (Microsoft)
 83 | - Pylance (Microsoft)
 84 | - Ruff (Astral Software)
 85 | - Even Better TOML (tamasfe)
 86 | - YAML (Red Hat)
 87 | 
 88 | ## Code Style Guidelines
 89 | 
 90 | 1. **Type Annotations**: All functions should have complete type annotations.
 91 | 2. **Docstrings**: All public methods and functions should have Google-style docstrings.
 92 | 3. **Line Length**: Maximum line length is 120 characters.
 93 | 4. **Imports**: Imports should be sorted by isort with the Black profile.
 94 | 5. **Naming**: Follow PEP8 naming conventions:
 95 |    - Classes: `PascalCase`
 96 |    - Functions, methods, variables: `snake_case`
 97 |    - Constants: `UPPER_SNAKE_CASE`
 98 |    - Private members: start with underscore `_private_method()`
 99 | 
100 | ## Continuous Integration
101 | 
102 | These quality checks are also run in CI to ensure that all code entering the main branch maintains the expected level of quality.
103 | 


--------------------------------------------------------------------------------
/assets/sample-trend-and-report-input-data/hourly_problem_numbers.json:
--------------------------------------------------------------------------------
  1 | [
  2 |     {
  3 |         "start_time": "2025-03-07T00:00:00Z",
  4 |         "end_time": "2025-03-07T01:00:00Z",
  5 |         "count": 352
  6 |     },
  7 |     {
  8 |         "start_time": "2025-03-07T01:00:00Z",
  9 |         "end_time": "2025-03-07T02:00:00Z",
 10 |         "count": 378
 11 |     },
 12 |     {
 13 |         "start_time": "2025-03-07T02:00:00Z",
 14 |         "end_time": "2025-03-07T03:00:00Z",
 15 |         "count": 365
 16 |     },
 17 |     {
 18 |         "start_time": "2025-03-07T03:00:00Z",
 19 |         "end_time": "2025-03-07T04:00:00Z",
 20 |         "count": 342
 21 |     },
 22 |     {
 23 |         "start_time": "2025-03-07T04:00:00Z",
 24 |         "end_time": "2025-03-07T05:00:00Z",
 25 |         "count": 320
 26 |     },
 27 |     {
 28 |         "start_time": "2025-03-07T05:00:00Z",
 29 |         "end_time": "2025-03-07T06:00:00Z",
 30 |         "count": 298
 31 |     },
 32 |     {
 33 |         "start_time": "2025-03-07T06:00:00Z",
 34 |         "end_time": "2025-03-07T07:00:00Z",
 35 |         "count": 274
 36 |     },
 37 |     {
 38 |         "start_time": "2025-03-07T07:00:00Z",
 39 |         "end_time": "2025-03-07T08:00:00Z",
 40 |         "count": 286
 41 |     },
 42 |     {
 43 |         "start_time": "2025-03-07T08:00:00Z",
 44 |         "end_time": "2025-03-07T09:00:00Z",
 45 |         "count": 310
 46 |     },
 47 |     {
 48 |         "start_time": "2025-03-07T09:00:00Z",
 49 |         "end_time": "2025-03-07T10:00:00Z",
 50 |         "count": 267
 51 |     },
 52 |     {
 53 |         "start_time": "2025-03-07T10:00:00Z",
 54 |         "end_time": "2025-03-07T11:00:00Z",
 55 |         "count": 243
 56 |     },
 57 |     {
 58 |         "start_time": "2025-03-07T11:00:00Z",
 59 |         "end_time": "2025-03-07T12:00:00Z",
 60 |         "count": 218
 61 |     },
 62 |     {
 63 |         "start_time": "2025-03-07T12:00:00Z",
 64 |         "end_time": "2025-03-07T13:00:00Z",
 65 |         "count": 203
 66 |     },
 67 |     {
 68 |         "start_time": "2025-03-07T13:00:00Z",
 69 |         "end_time": "2025-03-07T14:00:00Z",
 70 |         "count": 185
 71 |     },
 72 |     {
 73 |         "start_time": "2025-03-07T14:00:00Z",
 74 |         "end_time": "2025-03-07T15:00:00Z",
 75 |         "count": 176
 76 |     },
 77 |     {
 78 |         "start_time": "2025-03-07T15:00:00Z",
 79 |         "end_time": "2025-03-07T16:00:00Z",
 80 |         "count": 162
 81 |     },
 82 |     {
 83 |         "start_time": "2025-03-07T16:00:00Z",
 84 |         "end_time": "2025-03-07T17:00:00Z",
 85 |         "count": 143
 86 |     },
 87 |     {
 88 |         "start_time": "2025-03-07T17:00:00Z",
 89 |         "end_time": "2025-03-07T18:00:00Z",
 90 |         "count": 132
 91 |     },
 92 |     {
 93 |         "start_time": "2025-03-07T18:00:00Z",
 94 |         "end_time": "2025-03-07T19:00:00Z",
 95 |         "count": 124
 96 |     },
 97 |     {
 98 |         "start_time": "2025-03-07T19:00:00Z",
 99 |         "end_time": "2025-03-07T20:00:00Z",
100 |         "count": 115
101 |     },
102 |     {
103 |         "start_time": "2025-03-07T20:00:00Z",
104 |         "end_time": "2025-03-07T21:00:00Z",
105 |         "count": 108
106 |     },
107 |     {
108 |         "start_time": "2025-03-07T21:00:00Z",
109 |         "end_time": "2025-03-07T22:00:00Z",
110 |         "count": 93
111 |     },
112 |     {
113 |         "start_time": "2025-03-07T22:00:00Z",
114 |         "end_time": "2025-03-07T23:00:00Z",
115 |         "count": 84
116 |     },
117 |     {
118 |         "start_time": "2025-03-07T23:00:00Z",
119 |         "end_time": "2025-03-08T00:00:00Z",
120 |         "count": 72
121 |     }
122 | ]
123 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
  1 | [build-system]
  2 | requires = ["setuptools>=42", "wheel"]
  3 | build-backend = "setuptools.build_meta"
  4 | 
  5 | [project]
  6 | name = "platform_problem_monitoring_core"
  7 | version = "0.1.11"
  8 | description = "A tool for monitoring platform problems using Elasticsearch logs"
  9 | authors = [
 10 |     {name = "Platform Team"}
 11 | ]
 12 | readme = "README.md"
 13 | requires-python = ">=3.10"
 14 | license = {text = "Proprietary"}
 15 | dependencies = [
 16 |     "boto3>=1.28.0",
 17 |     "drain3>=0.9.6",
 18 |     "jinja2>=3.0.0",
 19 |     "argparse>=1.4.0",
 20 |     "requests>=2.25.0",
 21 |     "matplotlib>=3.7.0",
 22 |     "seaborn>=0.12.0",
 23 |     "typing-extensions>=4.0.0",
 24 | ]
 25 | 
 26 | [project.optional-dependencies]
 27 | dev = [
 28 |     "pytest>=7.0.0",
 29 |     "pytest-cov>=4.1.0",
 30 |     "black>=23.0.0",
 31 |     "flake8>=6.0.0",
 32 |     "mypy>=1.0.0",
 33 |     "isort>=5.12.0",
 34 |     "ruff>=0.1.9",
 35 |     "pre-commit>=3.3.2",
 36 |     "bandit>=1.7.5",
 37 |     "types-requests>=2.0.0",
 38 |     "types-boto3>=1.0.0",
 39 |     "types-seaborn>=0.12.0",
 40 | ]
 41 | 
 42 | [tool.setuptools]
 43 | packages = ["platform_problem_monitoring_core"]
 44 | package-dir = {"" = "src"}
 45 | 
 46 | # Add package data to include resources in the wheel
 47 | [tool.setuptools.package-data]
 48 | platform_problem_monitoring_core = ["resources/*.html"]
 49 | 
 50 | # Add data files to include configuration templates
 51 | [tool.setuptools.data-files]
 52 | "etc/platform_problem_monitoring_core" = ["etc/*.dist"]
 53 | "bin" = ["bin/*"]
 54 | 
 55 | [tool.black]
 56 | line-length = 120
 57 | target-version = ["py310", "py311", "py312", "py313"]
 58 | 
 59 | [tool.mypy]
 60 | python_version = "3.13"
 61 | warn_return_any = true
 62 | warn_unused_configs = true
 63 | disallow_untyped_defs = true
 64 | disallow_incomplete_defs = true
 65 | check_untyped_defs = true
 66 | disallow_untyped_decorators = true
 67 | no_implicit_optional = true
 68 | strict_optional = true
 69 | warn_redundant_casts = true
 70 | warn_unused_ignores = true
 71 | warn_no_return = true
 72 | warn_unreachable = true
 73 | 
 74 | [[tool.mypy.overrides]]
 75 | module = "drain3.*"
 76 | ignore_missing_imports = true
 77 | 
 78 | [tool.isort]
 79 | profile = "black"
 80 | line_length = 120
 81 | multi_line_output = 3
 82 | 
 83 | [tool.pytest]
 84 | testpaths = ["src/tests"]
 85 | python_files = "test_*.py"
 86 | python_classes = "Test*"
 87 | python_functions = "test_*"
 88 | filterwarnings = [
 89 |     "ignore::DeprecationWarning",
 90 |     "ignore::PendingDeprecationWarning",
 91 | ]
 92 | 
 93 | [tool.pytest.ini_options]
 94 | minversion = "7.0"
 95 | addopts = "--strict-markers"
 96 | markers = [
 97 |     "slow: marks tests as slow (deselect with '-m \"not slow\"')",
 98 |     "integration: marks tests as integration tests (deselect with '-m \"not integration\"')",
 99 | ]
100 | 
101 | [tool.coverage.run]
102 | source = ["platform_problem_monitoring_core"]
103 | omit = ["*/tests/*", "*/venv/*"]
104 | 
105 | [tool.coverage.report]
106 | exclude_lines = [
107 |     "pragma: no cover",
108 |     "def __repr__",
109 |     "raise NotImplementedError",
110 |     "if __name__ == .__main__.:",
111 |     "pass",
112 |     "raise ImportError",
113 | ]
114 | 
115 | [tool.ruff]
116 | # General configuration
117 | line-length = 120
118 | target-version = "py313"
119 | 
120 | [tool.ruff.lint]
121 | # Enable pycodestyle (E), Pyflakes (F), McCabe complexity (C90), isort (I),
122 | # pep8-naming (N), flake8-builtins (A), flake8-bugbear (B), flake8-comprehensions (C4),
123 | # flake8-docstrings (D), flake8-errmsg (EM), flake8-logging-format (G), flake8-simplify (SIM),
124 | # flake8-unused-arguments (ARG), flake8-pytest-style (PT), flake8-use-pathlib (PTH)
125 | select = ["E", "F", "C90", "I", "N", "A", "B", "C4", "D", "EM", "G", "SIM", "ARG", "PT", "PTH"]
126 | ignore = ["D203", "D212"]
127 | 
128 | [tool.ruff.lint.pydocstyle]
129 | convention = "google"
130 | 
131 | [tool.bandit]
132 | exclude_dirs = ["venv"]
133 | skips = ["B101"]  # Skip assert warning as we use it in tests
134 | 


--------------------------------------------------------------------------------
/docs/JETBRAINS_SETUP.md:
--------------------------------------------------------------------------------
  1 | # JetBrains IDE Setup Guide
  2 | 
  3 | This guide explains how to set up and use JetBrains IDEs (PyCharm, IntelliJ IDEA, etc.) with this project, particularly focusing on the code quality tools.
  4 | 
  5 | ## Initial Setup
  6 | 
  7 | 1. Open the project in your JetBrains IDE
  8 | 2. Ensure you've installed the project dependencies:
  9 |    ```bash
 10 |    make install
 11 |    ```
 12 | 3. The IDE should automatically detect the project structure and Python interpreter from the `.idea` directory settings
 13 | 
 14 | ## Python SDK Setup
 15 | 
 16 | If the Python interpreter isn't automatically detected:
 17 | 
 18 | 1. Go to `File > Project Structure`
 19 | 2. Under Project Settings > Project, select the Python interpreter from your virtual environment
 20 | 3. Make sure it's pointing to the `venv/bin/python` interpreter in your project directory
 21 | 
 22 | ## Run Configurations
 23 | 
 24 | We've included several predefined run configurations to help you verify code quality:
 25 | 
 26 | - **Black Format**: Formats your code according to Black style
 27 | - **Ruff Lint**: Runs the Ruff linter to check for code issues
 28 | - **Ruff Lint Fix**: Runs the Ruff linter and automatically fixes issues where possible
 29 | - **Mypy Type Check**: Verifies type annotations
 30 | - **All Quality Checks**: Runs all quality checks at once
 31 | - **Make Lint Fix**: Runs make lint-fix to automatically fix linting issues
 32 | 
 33 | To run any of these:
 34 | 
 35 | 1. Click on the run configuration dropdown in the top-right toolbar
 36 | 2. Select the desired configuration
 37 | 3. Click the run button (green triangle)
 38 | 
 39 | ## Code Inspection
 40 | 
 41 | We've configured the IDE's inspection profiles to match our quality standards:
 42 | 
 43 | 1. Type checking is enabled with strict mode
 44 | 2. PEP 8 style checking is enabled
 45 | 3. Python version compatibility checks are enabled
 46 | 
 47 | ## External Tools Integration
 48 | 
 49 | ### Black
 50 | 
 51 | Black auto-formatting is enabled in the editor:
 52 | 
 53 | 1. The code will be auto-formatted on save
 54 | 2. You can also press `Ctrl+Alt+L` (or `Cmd+Alt+L` on macOS) to format the current file
 55 | 
 56 | ### Ruff
 57 | 
 58 | Ruff can both check for issues and fix them:
 59 | 
 60 | 1. Run "Ruff Lint" to check for issues
 61 | 2. Run "Ruff Lint Fix" to automatically fix issues where possible
 62 | 3. From the terminal: `make lint` to check, `make lint-fix` to check and fix
 63 | 
 64 | ### Keyboard Shortcuts
 65 | 
 66 | - **Reformat Code**: `Ctrl+Alt+L` (Windows/Linux) or `Cmd+Alt+L` (macOS)
 67 | - **Run Current Configuration**: `Shift+F10` (Windows/Linux) or `Ctrl+R` (macOS)
 68 | - **Debug Current Configuration**: `Shift+F9` (Windows/Linux) or `Ctrl+D` (macOS)
 69 | 
 70 | ## Using the Terminal Tool Window
 71 | 
 72 | You can also run the Makefile commands directly from the Terminal tool window:
 73 | 
 74 | 1. Open the Terminal tool window (`Alt+F12` or `View > Tool Windows > Terminal`)
 75 | 2. Run commands like:
 76 |    ```bash
 77 |    make quality
 78 |    make lint
 79 |    make lint-fix
 80 |    make format
 81 |    ```
 82 | 
 83 | ## Code Commits
 84 | 
 85 | When committing code, the pre-commit hooks will run automatically if you've installed them with:
 86 | 
 87 | ```bash
 88 | pre-commit install
 89 | ```
 90 | 
 91 | This helps catch issues before they're committed to the repository.
 92 | 
 93 | ## Best Practices
 94 | 
 95 | 1. **Enable Auto Import**: Under Settings > Editor > General > Auto Import, enable "Add unambiguous imports on the fly"
 96 | 2. **Use Type Hints**: The IDE will show type hint errors as you type
 97 | 3. **Run Type Checking Often**: Use the Mypy run configuration frequently to catch type issues
 98 | 4. **Fix Linting Issues Automatically**: Use `make lint-fix` to automatically fix many common issues
 99 | 
100 | ## Troubleshooting
101 | 
102 | If you experience issues with the IDE:
103 | 
104 | 1. **Invalidate Caches**: Try `File > Invalidate Caches and Restart`
105 | 2. **Sync Project with pyproject.toml**: Ensure the IDE settings match the `pyproject.toml` settings
106 | 3. **Check the Terminal**: Run commands directly in the terminal to see if errors are IDE-specific
107 | 


--------------------------------------------------------------------------------
/src/platform_problem_monitoring_core/step6_extract_fields.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """Extract relevant fields from logstash documents."""
  3 | 
  4 | import argparse
  5 | import json
  6 | import sys
  7 | from pathlib import Path
  8 | 
  9 | from platform_problem_monitoring_core.utils import load_json, logger
 10 | 
 11 | 
 12 | def extract_fields(logstash_file: str, output_file: str) -> None:
 13 |     """
 14 |     Extract relevant fields from logstash documents.
 15 | 
 16 |     Args:
 17 |         logstash_file: Path to the logstash documents file
 18 |         output_file: Path to store the extracted fields
 19 | 
 20 |     Raises:
 21 |         FileNotFoundError: If the logstash file doesn't exist
 22 |         json.JSONDecodeError: If the file contains invalid JSON
 23 |         OSError: If the output cannot be written
 24 |     """
 25 |     logger.info("Extracting fields from logstash documents")
 26 |     logger.info(f"Logstash file: {logstash_file}")
 27 |     logger.info(f"Output file: {output_file}")
 28 | 
 29 |     # Load logstash documents
 30 |     documents = load_json(logstash_file)
 31 |     logger.info(f"Loaded {len(documents)} logstash documents")
 32 | 
 33 |     # Ensure the output directory exists
 34 |     output_path = Path(output_file)
 35 |     output_path.parent.mkdir(parents=True, exist_ok=True)
 36 | 
 37 |     # Process the documents and write them to the output file
 38 |     processed_count = 0
 39 |     skipped_count = 0
 40 | 
 41 |     try:
 42 |         # Open output file for writing
 43 |         with Path(output_file).open("w") as f:
 44 |             # Process each document
 45 |             for doc in documents:
 46 |                 try:
 47 |                     # Extract required fields: index name, document id, and message
 48 |                     index_name = doc.get("_index", "unknown")
 49 |                     doc_id = doc.get("_id", "unknown")
 50 | 
 51 |                     # Extract message from _source
 52 |                     source = doc.get("_source", {})
 53 |                     message = source.get("message", "")
 54 | 
 55 |                     if not message:
 56 |                         skipped_count += 1
 57 |                         continue
 58 | 
 59 |                     # Write extracted fields to output file as JSON
 60 |                     extracted = {"index": index_name, "id": doc_id, "message": message}
 61 |                     f.write(json.dumps(extracted) + "\n")
 62 |                     processed_count += 1
 63 | 
 64 |                     # Log progress for large document sets
 65 |                     if processed_count % 10000 == 0:
 66 |                         logger.info(f"Processed {processed_count} documents so far")
 67 |                 except (KeyError, TypeError) as e:
 68 |                     skipped_count += 1
 69 |                     logger.warning(f"Error processing document: {e}")
 70 |                     continue
 71 |                 except Exception as e:
 72 |                     logger.warning(f"Unexpected error processing document: {e}")
 73 |                     skipped_count += 1
 74 |                     continue
 75 | 
 76 |         logger.info(f"Extracted fields from {processed_count} documents")
 77 |         if skipped_count > 0:
 78 |             logger.warning(f"Skipped {skipped_count} documents due to errors or missing fields")
 79 |     except OSError as e:
 80 |         logger.error(f"Error writing to output file: {e}")
 81 |         error_msg = f"Failed to write to output file {output_file}: {e}"
 82 |         raise OSError(error_msg) from e
 83 | 
 84 |     logger.info("Field extraction completed")
 85 | 
 86 | 
 87 | def main() -> None:
 88 |     """Parse command line arguments and extract fields from logstash documents."""
 89 |     parser = argparse.ArgumentParser(description="Extract fields from logstash documents")
 90 |     parser.add_argument("--logstash-file", required=True, help="Path to the logstash documents file")
 91 |     parser.add_argument("--output-file", required=True, help="Path to store the extracted fields")
 92 | 
 93 |     args = parser.parse_args()
 94 | 
 95 |     try:
 96 |         extract_fields(args.logstash_file, args.output_file)
 97 |         sys.exit(0)
 98 |     except Exception as e:
 99 |         logger.error(f"Error extracting fields: {e}")
100 |         sys.exit(1)
101 | 
102 | 
103 | if __name__ == "__main__":
104 |     main()
105 | 


--------------------------------------------------------------------------------
/src/platform_problem_monitoring_core/step12_cleanup.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """Clean up work environment."""
  3 | 
  4 | import argparse
  5 | import logging
  6 | import os
  7 | import shutil
  8 | import sys
  9 | from pathlib import Path
 10 | from typing import List
 11 | 
 12 | from platform_problem_monitoring_core.utils import logger
 13 | 
 14 | 
 15 | def _verify_safe_path(work_dir: Path) -> None:
 16 |     """
 17 |     Verify that the path is safe to remove.
 18 | 
 19 |     Args:
 20 |         work_dir: Path to verify
 21 | 
 22 |     Raises:
 23 |         ValueError: If the path is not a directory, doesn't exist, or doesn't look like a temporary work directory
 24 |     """
 25 |     # Check if the directory exists
 26 |     if not work_dir.exists():
 27 |         error_msg = f"Directory does not exist: {work_dir}"
 28 |         raise ValueError(error_msg)
 29 | 
 30 |     # Check if the path is a directory
 31 |     if not work_dir.is_dir():
 32 |         error_msg = f"Path is not a directory: {work_dir}"
 33 |         raise ValueError(error_msg)
 34 | 
 35 |     # Verify that the directory looks like a temporary work directory
 36 |     # This is a safety check to avoid accidentally deleting important directories
 37 |     if not work_dir.name.startswith("platform_problem_monitoring_"):
 38 |         error_msg = f"Directory does not appear to be a temporary work directory: {work_dir}"
 39 |         raise ValueError(error_msg)
 40 | 
 41 | 
 42 | def _list_remaining_files(work_dir: Path) -> List[str]:
 43 |     """
 44 |     List files remaining in the directory.
 45 | 
 46 |     Args:
 47 |         work_dir: Path to the directory
 48 | 
 49 |     Returns:
 50 |         List of files found in the directory
 51 |     """
 52 |     files = []
 53 |     try:
 54 |         for root, _dirs, filenames in os.walk(work_dir):
 55 |             for filename in filenames:
 56 |                 file_path = Path(root) / filename
 57 |                 files.append(str(file_path.relative_to(work_dir)))
 58 |         return files
 59 |     except (OSError, ValueError) as e:
 60 |         logger.warning(f"Error listing files in {work_dir}: {e}")
 61 |         return []
 62 | 
 63 | 
 64 | def cleanup_environment(work_dir: str) -> None:
 65 |     """
 66 |     Clean up the work environment by removing the temporary work directory.
 67 | 
 68 |     Args:
 69 |         work_dir: Path to the temporary work folder to remove
 70 | 
 71 |     Raises:
 72 |         ValueError: If the path is not suitable for removal
 73 |         OSError: If there's an error removing the directory
 74 |     """
 75 |     logger.info("Cleaning up work environment")
 76 |     logger.info(f"Removing temporary work directory: {work_dir}")
 77 | 
 78 |     # Convert to Path object
 79 |     work_path = Path(work_dir)
 80 | 
 81 |     try:
 82 |         # Check if the path is safe to remove
 83 |         _verify_safe_path(work_path)
 84 | 
 85 |         # Optional: List files before deletion (for debugging if needed)
 86 |         if logger.isEnabledFor(logging.DEBUG):
 87 |             files = _list_remaining_files(work_path)
 88 |             if files:
 89 |                 logger.debug(f"Files to be removed: {', '.join(files)}")
 90 | 
 91 |         # Remove the directory and all its contents
 92 |         shutil.rmtree(work_dir)
 93 |         logger.info(f"Successfully removed directory: {work_dir}")
 94 | 
 95 |     except ValueError as e:
 96 |         # Non-fatal errors (directory doesn't exist or isn't a temp directory)
 97 |         logger.warning(f"Skipping cleanup: {str(e)}")
 98 |     except OSError as e:
 99 |         error_msg = f"Error removing directory {work_dir}: {str(e)}"
100 |         logger.error(error_msg)
101 |         raise OSError(error_msg) from e
102 | 
103 |     logger.info("Cleanup complete")
104 | 
105 | 
def main() -> None:
    """Parse the command line, run the cleanup, and exit with 0 on success or 1 on error."""
    parser = argparse.ArgumentParser(description="Clean up work environment")
    parser.add_argument("--work-dir", required=True, help="Path to the temporary work folder to remove")
    options = parser.parse_args()

    exit_code = 0
    try:
        cleanup_environment(options.work_dir)
    except Exception as exc:
        logger.error(f"Error cleaning up environment: {exc!s}")
        exit_code = 1

    sys.exit(exit_code)
119 | 
120 | 
# Run the command-line entry point only when this file is executed as a script
if __name__ == "__main__":
    main()
123 | 


--------------------------------------------------------------------------------
/docs/RELEASE_MANAGEMENT.md:
--------------------------------------------------------------------------------
  1 | # Platform Problem Monitoring Core - Release Management
  2 | 
  3 | This document outlines the release process for the Platform Problem Monitoring Core package, including version management, artifact creation, and publishing.
  4 | 
  5 | ## Release Artifacts
  6 | 
  7 | Each release includes the following artifacts:
  8 | 
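- **Source Distribution (.tar.gz)** - Contains the raw source code of the package
- **Wheel Distribution (.whl)** - A pre-built package that's ready to install
- **Configuration Templates Archive (`additional_assets.zip`)** - Contains the configuration templates needed to set up the package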
 11 | 
 12 | ## Release Workflow
 13 | 
 14 | ### Automated GitHub Actions Workflow
 15 | 
 16 | The release process uses a GitHub Actions workflow (`.github/workflows/release.yml`) that:
 17 | 
 18 | 1. Builds Python packages (wheel and source distribution)
 19 | 2. Creates a configuration templates archive
 20 | 3. Creates a GitHub Release with auto-generated release notes
 21 | 4. Attaches all artifacts to the release
 22 | 
 23 | The workflow is triggered whenever a tag with the format `v*.*.*` is pushed to the repository.
 24 | 
 25 | ## Step-by-Step Release Process
 26 | 
 27 | ### 1. Prepare for Release
 28 | 
 29 | Ensure all changes are committed, CI passes, and the code is ready for release:
 30 | 
 31 | ```bash
 32 | # Pull latest changes
 33 | git checkout main
 34 | git pull origin main
 35 | 
 36 | # Run quality checks and tests
 37 | make ci-quality
 38 | make test-coverage
 39 | ```
 40 | 
 41 | ### 2. Update Version Number
 42 | 
 43 | Update the version in `pyproject.toml`:
 44 | 
 45 | ```bash
 46 | # Option 1: Manual edit
 47 | # Edit pyproject.toml and change version = "x.y.z"
 48 | 
 49 | # Option 2: Using make command
 50 | make bump-version
 51 | ```
 52 | 
 53 | The `bump-version` make command will:
 54 | 1. Show current version
 55 | 2. Prompt for new version
 56 | 3. Update `pyproject.toml`
 57 | 
 58 | ### 3. Commit Version Change
 59 | 
 60 | ```bash
 61 | git add pyproject.toml
 62 | git commit -m "Bump version to x.y.z"
 63 | git push origin main
 64 | ```
 65 | 
 66 | ### 4. Create Release Tag
 67 | 
 68 | ```bash
 69 | # Option 1: Manual tagging
 70 | git tag -a "vx.y.z" -m "Release vx.y.z"
 71 | 
 72 | # Option 2: Using make command
 73 | make release
 74 | ```
 75 | 
 76 | The `release` make command will:
 77 | 1. Run quality checks and tests
 78 | 2. Create a new annotated git tag based on the version in pyproject.toml
 79 | 
 80 | ### 5. Push Tag to Trigger Release
 81 | 
 82 | ```bash
 83 | git push origin vx.y.z
 84 | ```
 85 | 
 86 | This will trigger the GitHub Actions release workflow.
 87 | 
 88 | ### 6. Verify Release
 89 | 
 90 | 1. Go to the GitHub repository's Actions tab
 91 | 2. Check that the release workflow completed successfully
 92 | 3. Go to the Releases page to verify that the release was created with all artifacts
 93 | 
 94 | ## Installation from Release Artifacts
 95 | 
 96 | The released package can be installed in two ways:
 97 | 
 98 | ### 1. Using pip directly from GitHub (for applications)
 99 | 
100 | ```bash
101 | pip install https://github.com/dx-tooling/platform-problem-monitoring-core/releases/download/vX.Y.Z/platform_problem_monitoring_core-X.Y.Z-py3-none-any.whl
102 | ```
103 | 
104 | ### 2. For development or customization
105 | 
106 | 1. Download both the wheel file and `additional_assets.zip` from the releases page
107 | 2. Extract the configuration templates
108 | 3. Follow the setup instructions in the README
109 | 
110 | ## Versioning Scheme
111 | 
112 | This project follows [Semantic Versioning](https://semver.org/):
113 | 
114 | * **MAJOR version** (x.0.0) - Incompatible API changes
115 | * **MINOR version** (0.x.0) - Add functionality in a backward compatible manner
116 | * **PATCH version** (0.0.x) - Backward compatible bug fixes
117 | 
118 | ## Release Notes Guidelines
119 | 
120 | When creating a new release:
121 | 
122 | 1. Provide a summary of key changes
123 | 2. List new features
124 | 3. Document any breaking changes
125 | 4. Include any migration instructions
126 | 5. Acknowledge contributors
127 | 
128 | ## Troubleshooting Release Issues
129 | 
130 | ### Common Problems and Solutions
131 | 
132 | 1. **Release workflow fails**
133 |    - Check that all test dependencies are properly installed
134 |    - Verify that tests pass locally
135 | 
136 | 2. **Missing configuration files in the release**
137 |    - Check the paths in the "Create configuration archive" step
138 |    - Ensure all required files exist in the repository
139 | 
140 | 3. **Wrong version number**
141 |    - Check that the version in `pyproject.toml` matches the git tag
142 |    - Ensure the tag follows the format `vX.Y.Z`
143 | 


--------------------------------------------------------------------------------
/src/tests/fixtures/previous_normalization_results.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "patterns": [
 3 |         {
 4 |             "cluster_id": 3,
 5 |             "count": 713,
 6 |             "pattern": "[TIMESTAMP] request.ERROR: Uncaught PHP Exception Symfony\\Component\\HttpKernel\\Exception\\NotFoundHttpException: \"Component \"LinkedIcon\" not found.\" at /opt/website/prod/backend-app/vendor/symfony/ux-live-component/src/EventListener/LiveComponentSubscriber.php line <NUM> {\"exception\": \"[object] (Symfony\\\\Component\\\\HttpKernel\\\\Exception\\\\NotFoundHttpException(code: <NUM>): Component \\\"LinkedIcon\\\" not found. at /opt/website/prod/backend-app/vendor/symfony/ux-live-component/src/EventListener/LiveComponentSubscriber.php:<NUM>)\\n[previous exception] [object] (InvalidArgumentException(code: <NUM>): Unknown component \\\"LinkedIcon\\\". And no matching anonymous component template was found. at /opt/website/prod/backend-app/vendor/symfony/ux-twig-component/src/ComponentFactory.php:<NUM>)\"} []",
 7 |             "first_seen": "logstash-symfony-errors-2025.03.05:dLi0ZpUBJdUWaJfyqGCm",
 8 |             "last_seen": "logstash-symfony-errors-2025.03.05:0Pu0ZpUBDnR7VQTqsAl9",
 9 |             "sample_log_lines": [],
10 |             "sample_doc_references": [
11 |                 "logstash-symfony-errors-2025.03.05:dLi0ZpUBJdUWaJfyqGCm",
12 |                 "logstash-symfony-errors-2025.03.05:9Pu0ZpUBDnR7VQTqqQi2",
13 |                 "logstash-symfony-errors-2025.03.05:ebi0ZpUBJdUWaJfyqmAj",
14 |                 "logstash-symfony-errors-2025.03.05:c7i0ZpUBJdUWaJfyrWGb",
15 |                 "logstash-symfony-errors-2025.03.05:0Pu0ZpUBDnR7VQTqsAl9"
16 |             ]
17 |         },
18 |         {
19 |             "cluster_id": 4,
20 |             "count": 713,
21 |             "pattern": "[TIMESTAMP] application_events.INFO: {\"applicationEvent\": {\"id\": \"<UUID>\", \"eventCategory\": \"<NUM>\", \"eventCategoryTitle\": \"error\", \"eventType\": \"<NUM>\", \"eventTypeTitle\": \"exception\", \"occuredAt\": \"TIMESTAMP\", \"affectedUserId\": <*> \"affectedUserIsJobofferer\": <*> \"affectedUserIsJobseeker\": <*> \"affectedUserRegisteredAt\": <*> \"metric\": null, \"errorMessage\": \"Component \\\"LinkedIcon\\\" not found.\", \"additionalData\": {\"throwableClass\": \"Symfony\\\\Component\\\\HttpKernel\\\\Exception\\\\NotFoundHttpException\", \"file\": \"/opt/website/prod/backend-app/vendor/symfony/ux-live-component/src/EventListener/LiveComponentSubscriber.php\", \"line\": \"<NUM>\"}, \"requestId\": \"<HEX>\", \"sessionId\": <*> \"clientId\": \"<HEX>\", \"isProbablyBotRequest\": \"<NUM>\"}} [] []",
22 |             "first_seen": "logstash-symfony-application-events-2025.03.05:8Pu0ZpUBDnR7VQTqqAis",
23 |             "last_seen": "logstash-symfony-application-events-2025.03.05:0fu0ZpUBDnR7VQTqsAl-",
24 |             "sample_log_lines": [],
25 |             "sample_doc_references": [
26 |                 "logstash-symfony-application-events-2025.03.05:8Pu0ZpUBDnR7VQTqqAis",
27 |                 "logstash-symfony-application-events-2025.03.05:8fu0ZpUBDnR7VQTqqAjK",
28 |                 "logstash-symfony-application-events-2025.03.05:dri0ZpUBJdUWaJfyqWBl",
29 |                 "logstash-symfony-application-events-2025.03.05:cbi0ZpUBJdUWaJfyrGG3",
30 |                 "logstash-symfony-application-events-2025.03.05:0fu0ZpUBDnR7VQTqsAl-"
31 |             ]
32 |         },
33 |         {
34 |             "cluster_id": 1,
35 |             "count": 1,
36 |             "pattern": "[TIMESTAMP] app.INFO: Application Appointment Scheduling API request for handling superchat error: No mapping found for superchat message id 'ms_nVVaoWgbvdQk5lIFyYWTK'. This will not affect any schedulings. [] []",
37 |             "first_seen": "logstash-symfony-main-2025.03.05:9_qzZpUBDnR7VQTqSpIZ",
38 |             "last_seen": "logstash-symfony-main-2025.03.05:9_qzZpUBDnR7VQTqSpIZ",
39 |             "sample_log_lines": [],
40 |             "sample_doc_references": [
41 |                 "logstash-symfony-main-2025.03.05:9_qzZpUBDnR7VQTqSpIZ"
42 |             ]
43 |         },
44 |         {
45 |             "cluster_id": 2,
46 |             "count": 1,
47 |             "pattern": "remote_addr=\"<IP>\" - x_forwarded_for=\"<IP>, <IP>\" - cf_connecting_ip=\"<IP>\" - - [05/Mar/2025:<TIME> +<NUM>] \"POST /_/application-appointment-schedulings/superchat-error<<?PARAMS>> HTTP/<NUM>.<NUM>\" <NUM> \"-\" \"Apache-HttpClient/4.5.14 (Java/21.0.6)\" request_time=\"<NUM>.<NUM>\" upstream_connect_time=\"<NUM>.<NUM>\" upstream_header_time=\"<NUM>.<NUM>\" upstream_response_time=\"<NUM>.<NUM>\" request_id=\"<HEX>\" session_id=\"-\"",
48 |             "first_seen": "logstash-nginx-access-2025.03.05:8vqzZpUBDnR7VQTqVZbV",
49 |             "last_seen": "logstash-nginx-access-2025.03.05:8vqzZpUBDnR7VQTqVZbV",
50 |             "sample_log_lines": [],
51 |             "sample_doc_references": [
52 |                 "logstash-nginx-access-2025.03.05:8vqzZpUBDnR7VQTqVZbV"
53 |             ]
54 |         }
55 |     ]
56 | }
57 | 


--------------------------------------------------------------------------------
/src/platform_problem_monitoring_core/step11_store_new_state.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """Store new state to S3."""
  3 | 
  4 | import argparse
  5 | import sys
  6 | from pathlib import Path
  7 | 
  8 | import boto3
  9 | from botocore.exceptions import ClientError, NoCredentialsError
 10 | 
 11 | from platform_problem_monitoring_core.utils import logger
 12 | 
 13 | 
 14 | def store_new_state(s3_bucket: str, s3_folder: str, date_time_file: str, norm_results_file: str) -> None:
 15 |     """
 16 |     Store new state to S3.
 17 | 
 18 |     Args:
 19 |         s3_bucket: S3 bucket name
 20 |         s3_folder: S3 folder name
 21 |         date_time_file: Path to the date and time file to upload
 22 |         norm_results_file: Path to the normalization results file to upload
 23 | 
 24 |     Raises:
 25 |         FileNotFoundError: If either input file doesn't exist
 26 |         NoCredentialsError: If AWS credentials are not found
 27 |         ClientError: If any AWS S3 operation fails
 28 |     """
 29 |     logger.info("Storing new state")
 30 |     logger.info(f"S3 bucket: {s3_bucket}")
 31 |     logger.info(f"S3 folder: {s3_folder}")
 32 |     logger.info(f"Date time file: {date_time_file}")
 33 |     logger.info(f"Normalization results file: {norm_results_file}")
 34 | 
 35 |     # Check if files exist
 36 |     date_time_path = Path(date_time_file)
 37 |     norm_results_path = Path(norm_results_file)
 38 | 
 39 |     if not date_time_path.exists():
 40 |         error_msg = f"Date time file not found: {date_time_file}"
 41 |         logger.error(error_msg)
 42 |         raise FileNotFoundError(error_msg)
 43 | 
 44 |     if not norm_results_path.exists():
 45 |         error_msg = f"Normalization results file not found: {norm_results_file}"
 46 |         logger.error(error_msg)
 47 |         raise FileNotFoundError(error_msg)
 48 | 
 49 |     try:
 50 |         # Create S3 client
 51 |         s3_client = boto3.client("s3")
 52 | 
 53 |         # Test connection to S3 by checking bucket existence
 54 |         s3_client.head_bucket(Bucket=s3_bucket)
 55 |         logger.info(f"Successfully connected to S3 bucket: {s3_bucket}")
 56 | 
 57 |         # Upload date time file
 58 |         date_time_key = f"{s3_folder}/current_date_time.txt"
 59 |         try:
 60 |             logger.info(f"Uploading date time file to s3://{s3_bucket}/{date_time_key}")
 61 |             s3_client.upload_file(date_time_file, s3_bucket, date_time_key)
 62 |             logger.info("Date time file uploaded successfully")
 63 |         except ClientError as e:
 64 |             error_code = e.response.get("Error", {}).get("Code", "Unknown")
 65 |             error_msg = f"Failed to upload date time file: Error {error_code} - {str(e)}"
 66 |             logger.error(error_msg)
 67 |             raise ClientError(e.response, e.operation_name) from e
 68 | 
 69 |         # Upload normalization results file
 70 |         norm_results_key = f"{s3_folder}/norm_results.json"
 71 |         try:
 72 |             logger.info(f"Uploading normalization results to s3://{s3_bucket}/{norm_results_key}")
 73 |             s3_client.upload_file(norm_results_file, s3_bucket, norm_results_key)
 74 |             logger.info("Normalization results uploaded successfully")
 75 |         except ClientError as e:
 76 |             error_code = e.response.get("Error", {}).get("Code", "Unknown")
 77 |             error_msg = f"Failed to upload normalization results: Error {error_code} - {str(e)}"
 78 |             logger.error(error_msg)
 79 |             raise ClientError(e.response, e.operation_name) from e
 80 | 
 81 |     except NoCredentialsError as e:
 82 |         logger.error(f"AWS credentials not found: {e}")
 83 |         raise
 84 |     except ClientError as e:
 85 |         if e.response.get("Error", {}).get("Code") == "NoSuchBucket":
 86 |             logger.error(f"S3 bucket not found: {s3_bucket}")
 87 |         else:
 88 |             logger.error(f"AWS S3 error: {e}")
 89 |         raise
 90 |     except Exception as e:
 91 |         logger.error(f"Unexpected error storing state: {e}")
 92 |         raise
 93 | 
 94 |     logger.info("New state stored successfully")
 95 | 
 96 | 
 97 | def main() -> None:
 98 |     """Parse command line arguments and store new state."""
 99 |     parser = argparse.ArgumentParser(description="Store new state to S3")
100 |     parser.add_argument("--s3-bucket", required=True, help="S3 bucket name")
101 |     parser.add_argument("--s3-folder", required=True, help="S3 folder name")
102 |     parser.add_argument("--date-time-file", required=True, help="Path to the date and time file to upload")
103 |     parser.add_argument(
104 |         "--norm-results-file",
105 |         required=True,
106 |         help="Path to the normalization results file to upload",
107 |     )
108 | 
109 |     args = parser.parse_args()
110 | 
111 |     try:
112 |         store_new_state(args.s3_bucket, args.s3_folder, args.date_time_file, args.norm_results_file)
113 |         sys.exit(0)
114 |     except Exception as e:
115 |         logger.error(f"Error storing new state: {e}")
116 |         sys.exit(1)
117 | 
118 | 
# Run the command-line entry point only when this file is executed as a script
if __name__ == "__main__":
    main()
121 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
# The targets listed here are commands rather than files to build, so they are declared phony
.PHONY: install format format-check lint lint-fix type-check security-check quality ci-quality venv clean help activate-venv
.PHONY: format-check-files isort-check-files lint-files type-check-files security-check-files
.PHONY: test test-verbose test-coverage test-file update-deps
.PHONY: bump-version release

# Interpreter used to create the virtual environment
PYTHON = python3
# Package name passed to pytest's --cov option
PACKAGE = platform_problem_monitoring_core

# Detect if we're running in CI environment
ifeq ($(CI),true)
    # In CI, use commands directly (no venv prefix)
    CMD_PREFIX =
else
    # Locally, use commands from venv
    CMD_PREFIX = venv/bin/
endif

# Location of the virtual environment's executables (used by the venv target)
VENV_BIN = venv/bin
 19 | 
# First target in the file, hence the default goal: print a summary of the available targets
help:
	@echo "Available commands:"
	@echo "  make install        Install package and development dependencies"
	@echo "  make activate-venv  Instructions to activate the virtual environment"
	@echo "  make format         Format code with black and isort"
	@echo "  make format-check   Check if code is properly formatted without modifying files"
	@echo "  make lint           Run linters (ruff)"
	@echo "  make lint-fix       Run linters and auto-fix issues where possible"
	@echo "  make type-check     Run mypy type checking"
	@echo "  make security-check Run bandit security checks"
	@echo "  make quality        Run all code quality checks (with formatting)"
	@echo "  make ci-quality     Run all code quality checks (without modifying files)"
	@echo "  make test           Run tests"
	@echo "  make test-verbose   Run tests with verbose output"
	@echo "  make test-coverage  Run tests with coverage report"
	@echo "  make test-file      Run tests for a specific file (usage: make test-file file=path/to/test_file.py)"
	@echo "  make update-deps    Update all dependencies to their latest semver-compatible versions"
	@echo "  make bump-version   Update the version number in pyproject.toml"
	@echo "  make release        Create a new release tag (after running quality checks and tests)"
	@echo "  make clean          Remove build artifacts and cache directories"
	@echo "  make help           Show this help message"
 41 | 
# Create the virtual environment, install the package in editable mode with its
# dev extras, and register the pre-commit hooks
venv:
	$(PYTHON) -m venv venv
	$(VENV_BIN)/pip install --upgrade pip
	$(VENV_BIN)/pip install -e ".[dev]"
	$(VENV_BIN)/pre-commit install

# Alias: installing means building the virtual environment
install: venv
 49 | 
# This doesn't actually activate, but shows how to activate
# (the target only prints the commands for the user to run in their own shell)
activate-venv:
	@echo "To activate the virtual environment, run:"
	@echo "  source venv/bin/activate"
	@echo ""
	@echo "After you're done, you can deactivate by running:"
	@echo "  deactivate"
 57 | 
# Rewrite the files under src in place with black and isort
format:
	$(CMD_PREFIX)black src
	$(CMD_PREFIX)isort src

# Same tools in --check mode: report formatting problems without modifying files
format-check:
	$(CMD_PREFIX)black --check src
	$(CMD_PREFIX)isort --check src
 65 | 
# Pre-commit compatible targets that operate on specific files
# $(filter-out $@,$(MAKECMDGOALS)) expands to every goal given on the command
# line except the target itself, i.e. the file names passed after the target
format-check-files:
	$(CMD_PREFIX)black --check $(filter-out $@,$(MAKECMDGOALS))

isort-check-files:
	$(CMD_PREFIX)isort --check $(filter-out $@,$(MAKECMDGOALS))
 72 | 
# Lint everything under src with ruff
lint:
	$(CMD_PREFIX)ruff check src

# Lint and let ruff apply its automatic fixes
lint-fix:
	$(CMD_PREFIX)ruff check --fix src

# Lint only the files passed as additional goals on the command line
lint-files:
	$(CMD_PREFIX)ruff check $(filter-out $@,$(MAKECMDGOALS))

# Type-check everything under src with mypy
type-check:
	$(CMD_PREFIX)mypy src

# Type-check only the files passed as additional goals on the command line
type-check-files:
	$(CMD_PREFIX)mypy $(filter-out $@,$(MAKECMDGOALS))
 87 | 
# Scan src recursively with bandit, excluding the tests directory
security-check:
	$(CMD_PREFIX)bandit -r src -x src/tests
 90 | 
 91 | security-check-files:
 92 | 	if [[ "$(filter-out $@,$(MAKECMDGOALS))" == *"test_"* ]]; then \
 93 | 		echo "Skipping security check for test file"; \
 94 | 	else \
 95 | 		$(CMD_PREFIX)bandit $(filter-out $@,$(MAKECMDGOALS)); \
 96 | 	fi
 97 | 
# Dependency management
# Re-install the package in editable mode with its dev extras, letting pip upgrade it
update-deps:
	$(CMD_PREFIX)pip install --upgrade -e ".[dev]"
	@echo "Dependencies updated to their latest semver-compatible versions"
102 | 
# Test targets
# Run the whole test suite
test:
	$(CMD_PREFIX)pytest src/tests

# Run the whole test suite with one line of output per test
test-verbose:
	$(CMD_PREFIX)pytest -v src/tests

# Run the suite with coverage for $(PACKAGE); prints missing lines and writes an XML report
test-coverage:
	$(CMD_PREFIX)pytest --cov=$(PACKAGE) --cov-report=term-missing --cov-report=xml src/tests

# Run a single test file, passed as: make test-file file=path/to/test_file.py
test-file:
	$(CMD_PREFIX)pytest $(file) -v
115 | 
# All quality checks for local use; the format step rewrites files
quality: format lint type-check security-check

# All quality checks without modifying files (format-check instead of format)
ci-quality: format-check lint type-check security-check
119 | 
# Remove build artifacts, tool caches, coverage output and the virtual environment
clean:
	rm -rf build/
	rm -rf dist/
	rm -rf *.egg-info
	find . -type d -name __pycache__ -exec rm -rf {} +
	find . -type d -name .mypy_cache -exec rm -rf {} +
	find . -type d -name .pytest_cache -exec rm -rf {} +
	find . -type d -name .ruff_cache -exec rm -rf {} +
	find . -type f -name "*.pyc" -delete
	rm -f .coverage
	rm -f coverage.xml
	# venv is a directory (created by the venv target), so it needs -r to be removed
	rm -rf venv
132 | 
# This allows passing filenames as arguments to make targets
# Catch-all rule with a no-op recipe: the extra goals (file names) match it
# instead of failing with "No rule to make target".
# Side effect: a mistyped target name also matches and silently does nothing.
%:
	@:
136 | 
# Version Management
# Show the current version, prompt for a new one and write it to pyproject.toml.
# Portability notes: the prompt uses printf + read because `read -p` is not
# POSIX, and sed uses -i.bak (suffix attached) because that form is accepted by
# both GNU and BSD sed; the backup file is removed afterwards.
bump-version:
	@echo "Current version: $(shell grep -m 1 version pyproject.toml | cut -d '"' -f 2)"
	@printf "New version: "; read new_version; \
	if [ -z "$$new_version" ]; then \
		echo "No version entered; pyproject.toml left unchanged" >&2; \
		exit 1; \
	fi; \
	sed -i.bak "s/version = \"[0-9]*\.[0-9]*\.[0-9]*\"/version = \"$$new_version\"/" pyproject.toml && \
	rm -f pyproject.toml.bak
142 | 
# Creating a new release
# Runs the quality checks and the coverage run first, then creates an annotated
# tag named after the version in pyproject.toml. The commands are chained with
# && so that a failed `git tag` (e.g. the tag already exists) fails the target
# instead of printing the success message.
release: ci-quality test-coverage
	@version=$$(grep -m 1 version pyproject.toml | cut -d '"' -f 2) && \
	git tag -a "v$$version" -m "Release v$$version" && \
	echo "Created tag v$$version. Push with: git push origin v$$version"
148 | 


--------------------------------------------------------------------------------
/assets/sample-trend-and-report-input-data/norm_results.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     "patterns": [
  3 |         {
  4 |             "cluster_id": "2",
  5 |             "count": 72,
  6 |             "pattern": "Failed to authenticate user <*> - invalid credentials",
  7 |             "first_seen": "2025-03-07T00:03:12Z",
  8 |             "last_seen": "2025-03-07T23:45:27Z",
  9 |             "sample_log_lines": [
 10 |                 "Failed to authenticate user customer@example.org - invalid credentials",
 11 |                 "Failed to authenticate user guest-user - invalid credentials",
 12 |                 "Failed to authenticate user api-client-456 - invalid credentials"
 13 |             ],
 14 |             "sample_doc_references": [
 15 |                 "logs-2025.03.07/doc19",
 16 |                 "logs-2025.03.07/doc82",
 17 |                 "logs-2025.03.07/doc143"
 18 |             ]
 19 |         },
 20 |         {
 21 |             "cluster_id": "3",
 22 |             "count": 58,
 23 |             "pattern": "API rate limit exceeded for user ID <*>",
 24 |             "first_seen": "2025-03-07T00:12:43Z",
 25 |             "last_seen": "2025-03-07T23:51:16Z",
 26 |             "sample_log_lines": [
 27 |                 "API rate limit exceeded for user ID 6142",
 28 |                 "API rate limit exceeded for user ID 9037",
 29 |                 "API rate limit exceeded for user ID 2384"
 30 |             ],
 31 |             "sample_doc_references": [
 32 |                 "logs-2025.03.07/doc27",
 33 |                 "logs-2025.03.07/doc94",
 34 |                 "logs-2025.03.07/doc185"
 35 |             ]
 36 |         },
 37 |         {
 38 |             "cluster_id": "6",
 39 |             "count": 32,
 40 |             "pattern": "SSL certificate for <*> is expiring in <*> days",
 41 |             "first_seen": "2025-03-07T04:15:22Z",
 42 |             "last_seen": "2025-03-07T23:15:22Z",
 43 |             "sample_log_lines": [
 44 |                 "SSL certificate for api.example.com is expiring in 7 days",
 45 |                 "SSL certificate for dashboard.example.org is expiring in 5 days",
 46 |                 "SSL certificate for auth.example.net is expiring in 3 days"
 47 |             ],
 48 |             "sample_doc_references": [
 49 |                 "logs-2025.03.07/doc38",
 50 |                 "logs-2025.03.07/doc123",
 51 |                 "logs-2025.03.07/doc208"
 52 |             ]
 53 |         },
 54 |         {
 55 |             "cluster_id": "7",
 56 |             "count": 21,
 57 |             "pattern": "Disk usage warning: <*> is at <*>% capacity",
 58 |             "first_seen": "2025-03-07T05:32:16Z",
 59 |             "last_seen": "2025-03-07T22:47:53Z",
 60 |             "sample_log_lines": [
 61 |                 "Disk usage warning: /var/log is at 85% capacity",
 62 |                 "Disk usage warning: /home is at 92% capacity",
 63 |                 "Disk usage warning: /tmp is at 88% capacity"
 64 |             ],
 65 |             "sample_doc_references": [
 66 |                 "logs-2025.03.07/doc56",
 67 |                 "logs-2025.03.07/doc127",
 68 |                 "logs-2025.03.07/doc198"
 69 |             ]
 70 |         },
 71 |         {
 72 |             "cluster_id": "8",
 73 |             "count": 18,
 74 |             "pattern": "Connection reset by peer while sending request to <*>",
 75 |             "first_seen": "2025-03-07T01:42:19Z",
 76 |             "last_seen": "2025-03-07T22:38:11Z",
 77 |             "sample_log_lines": [
 78 |                 "Connection reset by peer while sending request to https://api.payment-provider.com/v2/transactions",
 79 |                 "Connection reset by peer while sending request to https://auth.partner-service.org/oauth/token",
 80 |                 "Connection reset by peer while sending request to https://cdn.assets.com/resource"
 81 |             ],
 82 |             "sample_doc_references": [
 83 |                 "logs-2025.03.07/doc43",
 84 |                 "logs-2025.03.07/doc137",
 85 |                 "logs-2025.03.07/doc219"
 86 |             ]
 87 |         },
 88 |         {
 89 |             "cluster_id": "9",
 90 |             "count": 14,
 91 |             "pattern": "Failed to process job <*> - timeout after <*> seconds",
 92 |             "first_seen": "2025-03-07T03:17:09Z",
 93 |             "last_seen": "2025-03-07T21:05:33Z",
 94 |             "sample_log_lines": [
 95 |                 "Failed to process job export-user-data-5782 - timeout after 60 seconds",
 96 |                 "Failed to process job generate-report-4213 - timeout after 120 seconds",
 97 |                 "Failed to process job sync-inventory-8974 - timeout after 180 seconds"
 98 |             ],
 99 |             "sample_doc_references": [
100 |                 "logs-2025.03.07/doc72",
101 |                 "logs-2025.03.07/doc158",
102 |                 "logs-2025.03.07/doc241"
103 |             ]
104 |         },
105 |         {
106 |             "cluster_id": "10",
107 |             "count": 12,
108 |             "pattern": "AWS S3 access denied: <*>",
109 |             "first_seen": "2025-03-07T08:05:27Z",
110 |             "last_seen": "2025-03-07T19:32:41Z",
111 |             "sample_log_lines": [
112 |                 "AWS S3 access denied: User not authorized to perform s3:PutObject on resource arn:aws:s3:::backups/daily",
113 |                 "AWS S3 access denied: The specified bucket does not exist",
114 |                 "AWS S3 access denied: The AWS Access Key Id you provided does not exist in our records"
115 |             ],
116 |             "sample_doc_references": [
117 |                 "logs-2025.03.07/doc98",
118 |                 "logs-2025.03.07/doc172",
119 |                 "logs-2025.03.07/doc254"
120 |             ]
121 |         }
122 |     ]
123 | }
124 | 


--------------------------------------------------------------------------------
/src/platform_problem_monitoring_core/utils.py:
--------------------------------------------------------------------------------
  1 | """Utility functions for Platform Problem Monitoring Core."""
  2 | 
  3 | import configparser
  4 | import json
  5 | import logging
  6 | import sys
  7 | from logging import Logger
  8 | from pathlib import Path
  9 | from typing import Any, Dict
 10 | 
 11 | 
 12 | def setup_logger(name: str = "platform_problem_monitoring", level: int = logging.INFO) -> Logger:
 13 |     """
 14 |     Configure and return a logger instance.
 15 | 
 16 |     Args:
 17 |         name: The name for the logger
 18 |         level: The logging level
 19 | 
 20 |     Returns:
 21 |         Configured logger instance
 22 |     """
 23 |     logger_instance = logging.getLogger(name)
 24 | 
 25 |     # Only configure if handlers haven't been added already
 26 |     if not logger_instance.handlers:
 27 |         logger_instance.setLevel(level)
 28 |         formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
 29 | 
 30 |         console_handler = logging.StreamHandler(sys.stdout)
 31 |         console_handler.setFormatter(formatter)
 32 | 
 33 |         logger_instance.addHandler(console_handler)
 34 | 
 35 |     return logger_instance
 36 | 
 37 | 
# Create a default logger instance for backward compatibility
# It is built at import time with setup_logger's default name and level
logger = setup_logger()
 40 | 
 41 | 
 42 | def load_config(config_file_path: str) -> Dict[str, str]:
 43 |     """
 44 |     Load configuration from a file.
 45 | 
 46 |     Args:
 47 |         config_file_path: Path to the configuration file
 48 | 
 49 |     Returns:
 50 |         Dictionary containing configuration values
 51 | 
 52 |     Raises:
 53 |         FileNotFoundError: If the configuration file doesn't exist
 54 |         ValueError: If the configuration file is invalid
 55 |     """
 56 |     config_path = Path(config_file_path)
 57 | 
 58 |     if not config_path.exists():
 59 |         error_msg = f"Configuration file not found: {config_file_path}"
 60 |         logger.error(error_msg)
 61 |         raise FileNotFoundError(error_msg)
 62 | 
 63 |     # For backwards compatibility check file format
 64 |     with config_path.open("r") as f:
 65 |         first_line = f.readline().strip()
 66 |         f.seek(0)  # Reset file pointer
 67 | 
 68 |         # If it's a standard KEY=VALUE format without sections
 69 |         if first_line and "=" in first_line and not first_line.startswith("["):
 70 |             return _parse_key_value_config(f)
 71 | 
 72 |         # Otherwise use configparser
 73 |         return _parse_ini_config(config_path)
 74 | 
 75 | 
 76 | def _parse_key_value_config(file_obj: Any) -> Dict[str, str]:
 77 |     """
 78 |     Parse a config file with KEY=VALUE format.
 79 | 
 80 |     Args:
 81 |         file_obj: File object to parse
 82 | 
 83 |     Returns:
 84 |         Dictionary of parsed configuration
 85 |     """
 86 |     config: Dict[str, str] = {}
 87 | 
 88 |     for line in file_obj:
 89 |         line = line.strip()
 90 |         if not line or line.startswith("#"):
 91 |             continue
 92 | 
 93 |         if "=" in line:
 94 |             key, value = line.split("=", 1)
 95 |             config[key.strip()] = value.strip().strip('"')
 96 |         else:
 97 |             logger.warning("Ignoring invalid line in config file: %s", line)
 98 | 
 99 |     return config
100 | 
101 | 
def _parse_ini_config(config_path: Path) -> Dict[str, str]:
    """
    Read an INI-style config file and flatten it into a single dictionary.

    Options inside a section are exposed as "<section>.<option>"; options from the
    DEFAULT section are additionally exposed under their bare option name.

    Args:
        config_path: Path to the config file

    Returns:
        Dictionary of parsed configuration

    Raises:
        ValueError: If configparser rejects the file
    """
    ini = configparser.ConfigParser()

    try:
        ini.read(config_path)

        # Flatten "section -> option" into dotted keys for backward compatibility
        flattened: Dict[str, str] = {
            f"{section_name}.{option}": setting
            for section_name in ini.sections()
            for option, setting in ini[section_name].items()
        }

        # DEFAULT entries are written last, under their plain names
        flattened.update(ini.defaults())
    except configparser.Error as e:
        error_msg = f"Invalid configuration file: {e}"
        logger.error(error_msg)
        raise ValueError(error_msg) from e

    return flattened
133 | 
134 | 
def save_json(data: Any, file_path: str) -> None:
    """
    Serialize data as indented JSON into the given file.

    Missing parent directories of the target are created first.

    Args:
        data: Data to save
        file_path: Path to the file

    Raises:
        OSError: If there's an error writing to the file
        TypeError: If data is not JSON serializable
    """
    destination = Path(file_path)
    destination.parent.mkdir(exist_ok=True, parents=True)

    try:
        with destination.open("w") as handle:
            json.dump(data, handle, indent=2)
    except (OSError, TypeError) as failure:
        # Log for the operator, then let the caller see the original exception
        logger.error("Failed to save JSON file %s: %s", file_path, str(failure))
        raise
158 | 
159 | 
def load_json(file_path: str) -> Any:
    """
    Load data from a JSON file.

    Args:
        file_path: Path to the file

    Returns:
        Loaded data

    Raises:
        FileNotFoundError: If the file doesn't exist
        json.JSONDecodeError: If the file isn't valid JSON
    """
    path = Path(file_path)

    try:
        with path.open("r") as f:
            return json.load(f)
    except FileNotFoundError as e:
        error_msg = f"File not found: {file_path}"
        logger.error(error_msg)
        raise FileNotFoundError(error_msg) from e
    except json.JSONDecodeError as e:
        logger.error(f"Invalid JSON in {file_path}: {e}")
        # Build the new message from e.msg rather than str(e): JSONDecodeError appends
        # ": line X column Y (char Z)" itself, so embedding str(e) would repeat the position.
        raise json.JSONDecodeError(f"Invalid JSON in {file_path}: {e.msg}", e.doc, e.pos) from e
187 | 
188 | 
def ensure_dir_exists(path: str) -> None:
    """
    Create the directory at the given path, including missing parents.

    Calling this for a directory that already exists is not an error.

    Args:
        path: Directory path

    Raises:
        OSError: If directory creation fails for reasons other than it already existing
    """
    directory = Path(path)
    directory.mkdir(exist_ok=True, parents=True)
200 | 


--------------------------------------------------------------------------------
/src/tests/test_step6_extract_fields.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """Unit tests for step6_extract_fields.py."""
  3 | 
  4 | import json
  5 | import tempfile
  6 | from pathlib import Path
  7 | from typing import Any, Dict, List
  8 | 
  9 | import pytest
 10 | 
 11 | from platform_problem_monitoring_core.step6_extract_fields import extract_fields
 12 | 
 13 | 
 14 | class TestStep6ExtractFields:
 15 |     """Tests for the extract_fields function."""
 16 | 
 17 |     @pytest.fixture
 18 |     def logstash_documents_path(self) -> str:
 19 |         """Return the path to the logstash_documents.json fixture."""
 20 |         return str(Path(__file__).parent / "fixtures" / "logstash_documents.json")
 21 | 
 22 |     @pytest.fixture
 23 |     def sample_logstash_data(self) -> List[Dict[str, Any]]:
 24 |         """Create a small sample of logstash data for testing."""
 25 |         return [
 26 |             {"_index": "logstash-test-index-1", "_id": "test-id-1", "_source": {"message": "Test message 1"}},
 27 |             {"_index": "logstash-test-index-2", "_id": "test-id-2", "_source": {"message": "Test message 2"}},
 28 |             {
 29 |                 "_index": "logstash-test-index-3",
 30 |                 "_id": "test-id-3",
 31 |                 "_source": {
 32 |                     # Missing message field
 33 |                 },
 34 |             },
 35 |         ]
 36 | 
 37 |     def test_extract_fields_with_sample_data(self, sample_logstash_data: List[Dict[str, Any]]) -> None:
 38 |         """Test extract_fields with a small sample of data."""
 39 |         # Create temporary files for input and output
 40 |         with (
 41 |             tempfile.NamedTemporaryFile(mode="w+", delete=False) as input_file,
 42 |             tempfile.NamedTemporaryFile(mode="w+", delete=False) as output_file,
 43 |         ):
 44 | 
 45 |             # Write sample data to input file
 46 |             json.dump(sample_logstash_data, input_file)
 47 |             input_file.flush()
 48 | 
 49 |             input_path = Path(input_file.name)
 50 |             output_path = Path(output_file.name)
 51 | 
 52 |             try:
 53 |                 # Run the extract_fields function
 54 |                 extract_fields(str(input_path), str(output_path))
 55 | 
 56 |                 # Read and verify the output
 57 |                 with output_path.open("r") as f:
 58 |                     lines = f.readlines()
 59 | 
 60 |                 # Should have 2 lines (one document has no message)
 61 |                 assert len(lines) == 2
 62 | 
 63 |                 # Verify the content of each line
 64 |                 extracted1 = json.loads(lines[0])
 65 |                 assert extracted1["index"] == "logstash-test-index-1"
 66 |                 assert extracted1["id"] == "test-id-1"
 67 |                 assert extracted1["message"] == "Test message 1"
 68 | 
 69 |                 extracted2 = json.loads(lines[1])
 70 |                 assert extracted2["index"] == "logstash-test-index-2"
 71 |                 assert extracted2["id"] == "test-id-2"
 72 |                 assert extracted2["message"] == "Test message 2"
 73 |             finally:
 74 |                 # Clean up temporary files
 75 |                 input_path.unlink()
 76 |                 output_path.unlink()
 77 | 
 78 |     def test_extract_fields_with_fixture(self, logstash_documents_path: str) -> None:
 79 |         """Test extract_fields with the logstash_documents.json fixture."""
 80 |         # Create a temporary file for output
 81 |         with tempfile.NamedTemporaryFile(mode="w+", delete=False) as output_file:
 82 |             output_path = Path(output_file.name)
 83 | 
 84 |             try:
 85 |                 # Run the extract_fields function
 86 |                 extract_fields(logstash_documents_path, str(output_path))
 87 | 
 88 |                 # Read and verify the output
 89 |                 with output_path.open("r") as f:
 90 |                     lines = f.readlines()
 91 | 
 92 |                 # Verify we have output lines
 93 |                 assert len(lines) > 0
 94 | 
 95 |                 # Verify the structure of the first line
 96 |                 first_extracted = json.loads(lines[0])
 97 |                 assert "index" in first_extracted
 98 |                 assert "id" in first_extracted
 99 |                 assert "message" in first_extracted
100 | 
101 |                 # Verify all lines are valid JSON
102 |                 for line in lines:
103 |                     extracted = json.loads(line)
104 |                     assert isinstance(extracted, dict)
105 |                     assert "index" in extracted
106 |                     assert "id" in extracted
107 |                     assert "message" in extracted
108 |                     assert extracted["message"]  # Message should not be empty
109 |             finally:
110 |                 # Clean up temporary file
111 |                 output_path.unlink()
112 | 
113 |     def test_extract_fields_with_missing_file(self) -> None:
114 |         """Test extract_fields with a non-existent input file."""
115 |         with tempfile.NamedTemporaryFile(mode="w+", delete=False) as output_file:
116 |             output_path = Path(output_file.name)
117 | 
118 |             try:
119 |                 # Try to run extract_fields with a non-existent input file
120 |                 with pytest.raises(FileNotFoundError):
121 |                     extract_fields("non_existent_file.json", str(output_path))
122 |             finally:
123 |                 # Clean up temporary file
124 |                 output_path.unlink()
125 | 
126 |     def test_extract_fields_with_invalid_json(self) -> None:
127 |         """Test extract_fields with invalid JSON input."""
128 |         # Create temporary files for input and output
129 |         with (
130 |             tempfile.NamedTemporaryFile(mode="w+", delete=False) as input_file,
131 |             tempfile.NamedTemporaryFile(mode="w+", delete=False) as output_file,
132 |         ):
133 | 
134 |             # Write invalid JSON to input file
135 |             input_file.write("{invalid json")
136 |             input_file.flush()
137 | 
138 |             input_path = Path(input_file.name)
139 |             output_path = Path(output_file.name)
140 | 
141 |             try:
142 |                 # Try to run extract_fields with invalid JSON
143 |                 with pytest.raises(json.JSONDecodeError):
144 |                     extract_fields(str(input_path), str(output_path))
145 |             finally:
146 |                 # Clean up temporary files
147 |                 input_path.unlink()
148 |                 output_path.unlink()
149 | 


--------------------------------------------------------------------------------
/assets/sample-trend-and-report-input-data/norm_results_prev.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     "patterns": [
  3 |         {
  4 |             "cluster_id": "1",
  5 |             "count": 245,
  6 |             "pattern": "Error connecting to database at <*>: Connection timed out",
  7 |             "first_seen": "2025-03-06T00:15:32Z",
  8 |             "last_seen": "2025-03-06T23:45:17Z",
  9 |             "sample_log_lines": [
 10 |                 "Error connecting to database at 10.0.1.42:3306: Connection timed out",
 11 |                 "Error connecting to database at db.example.com:3306: Connection timed out",
 12 |                 "Error connecting to database at 192.168.1.100:3306: Connection timed out"
 13 |             ],
 14 |             "sample_doc_references": [
 15 |                 "logs-2025.03.06/doc1",
 16 |                 "logs-2025.03.06/doc145",
 17 |                 "logs-2025.03.06/doc231"
 18 |             ]
 19 |         },
 20 |         {
 21 |             "cluster_id": "2",
 22 |             "count": 187,
 23 |             "pattern": "Failed to authenticate user <*> - invalid credentials",
 24 |             "first_seen": "2025-03-06T01:12:07Z",
 25 |             "last_seen": "2025-03-06T23:10:31Z",
 26 |             "sample_log_lines": [
 27 |                 "Failed to authenticate user john.doe@example.com - invalid credentials",
 28 |                 "Failed to authenticate user admin - invalid credentials",
 29 |                 "Failed to authenticate user service-account-123 - invalid credentials"
 30 |             ],
 31 |             "sample_doc_references": [
 32 |                 "logs-2025.03.06/doc21",
 33 |                 "logs-2025.03.06/doc98",
 34 |                 "logs-2025.03.06/doc176"
 35 |             ]
 36 |         },
 37 |         {
 38 |             "cluster_id": "3",
 39 |             "count": 163,
 40 |             "pattern": "API rate limit exceeded for user ID <*>",
 41 |             "first_seen": "2025-03-06T00:05:23Z",
 42 |             "last_seen": "2025-03-06T23:58:41Z",
 43 |             "sample_log_lines": [
 44 |                 "API rate limit exceeded for user ID 5723",
 45 |                 "API rate limit exceeded for user ID 8932",
 46 |                 "API rate limit exceeded for user ID 1047"
 47 |             ],
 48 |             "sample_doc_references": [
 49 |                 "logs-2025.03.06/doc15",
 50 |                 "logs-2025.03.06/doc112",
 51 |                 "logs-2025.03.06/doc201"
 52 |             ]
 53 |         },
 54 |         {
 55 |             "cluster_id": "4",
 56 |             "count": 124,
 57 |             "pattern": "Exception in thread \"main\" java.lang.OutOfMemoryError: <*>",
 58 |             "first_seen": "2025-03-06T03:25:48Z",
 59 |             "last_seen": "2025-03-06T22:17:03Z",
 60 |             "sample_log_lines": [
 61 |                 "Exception in thread \"main\" java.lang.OutOfMemoryError: Java heap space",
 62 |                 "Exception in thread \"main\" java.lang.OutOfMemoryError: GC overhead limit exceeded",
 63 |                 "Exception in thread \"main\" java.lang.OutOfMemoryError: unable to create new native thread"
 64 |             ],
 65 |             "sample_doc_references": [
 66 |                 "logs-2025.03.06/doc42",
 67 |                 "logs-2025.03.06/doc87",
 68 |                 "logs-2025.03.06/doc109"
 69 |             ]
 70 |         },
 71 |         {
 72 |             "cluster_id": "5",
 73 |             "count": 89,
 74 |             "pattern": "Kubernetes pod <*> in namespace <*> failed health check",
 75 |             "first_seen": "2025-03-06T00:32:11Z",
 76 |             "last_seen": "2025-03-06T23:47:29Z",
 77 |             "sample_log_lines": [
 78 |                 "Kubernetes pod web-server-5d4d7 in namespace production failed health check",
 79 |                 "Kubernetes pod db-backup-3f2a1 in namespace data-services failed health check",
 80 |                 "Kubernetes pod cache-9b3c8 in namespace frontend failed health check"
 81 |             ],
 82 |             "sample_doc_references": [
 83 |                 "logs-2025.03.06/doc53",
 84 |                 "logs-2025.03.06/doc167",
 85 |                 "logs-2025.03.06/doc214"
 86 |             ]
 87 |         },
 88 |         {
 89 |             "cluster_id": "11",
 90 |             "count": 54,
 91 |             "pattern": "Failed to process message from queue <*>: <*>",
 92 |             "first_seen": "2025-03-06T02:17:42Z",
 93 |             "last_seen": "2025-03-06T22:35:09Z",
 94 |             "sample_log_lines": [
 95 |                 "Failed to process message from queue orders: JSON parse error at line 1 column 24",
 96 |                 "Failed to process message from queue notifications: Message expired",
 97 |                 "Failed to process message from queue user-events: Unknown message format"
 98 |             ],
 99 |             "sample_doc_references": [
100 |                 "logs-2025.03.06/doc67",
101 |                 "logs-2025.03.06/doc132",
102 |                 "logs-2025.03.06/doc223"
103 |             ]
104 |         },
105 |         {
106 |             "cluster_id": "12",
107 |             "count": 42,
108 |             "pattern": "Cache invalidation failed for key <*>",
109 |             "first_seen": "2025-03-06T04:28:16Z",
110 |             "last_seen": "2025-03-06T21:51:48Z",
111 |             "sample_log_lines": [
112 |                 "Cache invalidation failed for key user:profile:12345",
113 |                 "Cache invalidation failed for key product:catalog:recent",
114 |                 "Cache invalidation failed for key system:config:endpoints"
115 |             ],
116 |             "sample_doc_references": [
117 |                 "logs-2025.03.06/doc84",
118 |                 "logs-2025.03.06/doc156",
119 |                 "logs-2025.03.06/doc238"
120 |             ]
121 |         },
122 |         {
123 |             "cluster_id": "13",
124 |             "count": 29,
125 |             "pattern": "HTTP request failed: <*> <*> returned status code <*>",
126 |             "first_seen": "2025-03-06T07:14:37Z",
127 |             "last_seen": "2025-03-06T23:08:21Z",
128 |             "sample_log_lines": [
129 |                 "HTTP request failed: GET https://api.analytics.com/v1/reports returned status code 503",
130 |                 "HTTP request failed: POST https://payments.example.org/process returned status code 429",
131 |                 "HTTP request failed: PUT https://inventory.example.net/items/update returned status code 400"
132 |             ],
133 |             "sample_doc_references": [
134 |                 "logs-2025.03.06/doc103",
135 |                 "logs-2025.03.06/doc189",
136 |                 "logs-2025.03.06/doc247"
137 |             ]
138 |         }
139 |     ]
140 | }
141 | 


--------------------------------------------------------------------------------
/src/platform_problem_monitoring_core/step2_download_previous_state.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """Download previous state from S3."""
  3 | 
  4 | import argparse
  5 | import datetime
  6 | import sys
  7 | from datetime import timezone
  8 | from pathlib import Path
  9 | 
 10 | import boto3
 11 | from botocore.exceptions import ClientError, NoCredentialsError
 12 | 
 13 | from platform_problem_monitoring_core.utils import logger, save_json
 14 | 
 15 | 
 16 | def _create_fallback_date_time_file(date_time_path: Path) -> None:
 17 |     """
 18 |     Create a fallback date time file with timestamp from 24 hours ago.
 19 | 
 20 |     Args:
 21 |         date_time_path: Path where to create the file
 22 | 
 23 |     Raises:
 24 |         OSError: If unable to write to the file
 25 |     """
 26 |     now = datetime.datetime.now(timezone.utc)
 27 |     yesterday = now - datetime.timedelta(days=1)
 28 | 
 29 |     try:
 30 |         with date_time_path.open("w") as f:
 31 |             f.write(yesterday.isoformat())
 32 |         logger.info(f"Created fallback date time file at {date_time_path}")
 33 |     except OSError as write_error:
 34 |         error_msg = f"Failed to create fallback date time file: {write_error}"
 35 |         logger.error(error_msg)
 36 |         raise OSError(error_msg) from write_error
 37 | 
 38 | 
 39 | def _create_empty_norm_results_file(norm_results_path: Path) -> None:
 40 |     """
 41 |     Create an empty normalization results file.
 42 | 
 43 |     Args:
 44 |         norm_results_path: Path where to create the file
 45 | 
 46 |     Raises:
 47 |         OSError: If unable to write to the file
 48 |     """
 49 |     try:
 50 |         save_json({}, str(norm_results_path))
 51 |         logger.info(f"Created empty normalization results file at {norm_results_path}")
 52 |     except OSError as write_error:
 53 |         error_msg = f"Failed to create empty normalization results file: {write_error}"
 54 |         logger.error(error_msg)
 55 |         raise OSError(error_msg) from write_error
 56 | 
 57 | 
 58 | def download_previous_state(s3_bucket: str, s3_folder: str, date_time_file: str, norm_results_file: str) -> None:
 59 |     """
 60 |     Download previous state from S3.
 61 | 
 62 |     Args:
 63 |         s3_bucket: S3 bucket name
 64 |         s3_folder: S3 folder name
 65 |         date_time_file: Path to store the start date and time
 66 |         norm_results_file: Path to store the previous normalization results
 67 | 
 68 |     Raises:
 69 |         NoCredentialsError: If AWS credentials are not found
 70 |         ClientError: If any AWS S3 operation fails
 71 |         OSError: If any file operation fails
 72 |     """
 73 |     logger.info("Downloading previous state")
 74 |     logger.info(f"S3 bucket: {s3_bucket}")
 75 |     logger.info(f"S3 folder: {s3_folder}")
 76 | 
 77 |     # Ensure parent directories exist
 78 |     date_time_path = Path(date_time_file)
 79 |     norm_results_path = Path(norm_results_file)
 80 | 
 81 |     date_time_path.parent.mkdir(parents=True, exist_ok=True)
 82 |     norm_results_path.parent.mkdir(parents=True, exist_ok=True)
 83 | 
 84 |     try:
 85 |         # Create S3 client
 86 |         s3_client = boto3.client("s3")
 87 | 
 88 |         # Test connection to S3
 89 |         s3_client.head_bucket(Bucket=s3_bucket)
 90 |         logger.info(f"Successfully connected to S3 bucket: {s3_bucket}")
 91 | 
 92 |         # Download date time file
 93 |         date_time_key = f"{s3_folder}/current_date_time.txt"
 94 |         try:
 95 |             logger.info(f"Downloading date time file from s3://{s3_bucket}/{date_time_key}")
 96 |             s3_client.download_file(s3_bucket, date_time_key, date_time_file)
 97 |             logger.info(f"Date time file downloaded to {date_time_file}")
 98 |         except ClientError as e:
 99 |             error_code = e.response.get("Error", {}).get("Code", "Unknown")
100 |             if error_code == "404" or error_code == "NoSuchKey":
101 |                 logger.warning(f"Date time file not found in S3: {date_time_key}")
102 |             else:
103 |                 logger.warning(f"Failed to download date time file: {e}")
104 | 
105 |             logger.info("Using fallback: 24 hours ago")
106 |             _create_fallback_date_time_file(date_time_path)
107 | 
108 |         # Download normalization results file
109 |         norm_results_key = f"{s3_folder}/norm_results.json"
110 |         try:
111 |             logger.info(f"Downloading normalization results from s3://{s3_bucket}/{norm_results_key}")
112 |             s3_client.download_file(s3_bucket, norm_results_key, norm_results_file)
113 |             logger.info(f"Normalization results downloaded to {norm_results_file}")
114 |         except ClientError as e:
115 |             error_code = e.response.get("Error", {}).get("Code", "Unknown")
116 |             if error_code == "404" or error_code == "NoSuchKey":
117 |                 logger.warning(f"Normalization results file not found in S3: {norm_results_key}")
118 |             else:
119 |                 logger.warning(f"Failed to download normalization results: {e}")
120 | 
121 |             logger.info("Creating empty normalization results file")
122 |             _create_empty_norm_results_file(norm_results_path)
123 | 
124 |     except NoCredentialsError as e:
125 |         logger.error(f"AWS credentials not found: {e}")
126 |         raise
127 |     except ClientError as e:
128 |         if e.response.get("Error", {}).get("Code") == "NoSuchBucket":
129 |             logger.error(f"S3 bucket not found: {s3_bucket}")
130 |         else:
131 |             logger.error(f"AWS S3 error: {e}")
132 |         raise
133 |     except Exception as e:
134 |         logger.error(f"Unexpected error: {e}")
135 |         raise
136 | 
137 |     logger.info("Previous state download completed")
138 | 
139 | 
def main() -> None:
    """
    Command line entry point: parse the arguments and download the previous state.

    Exits the process with status 0 on success and status 1 if the download raises.
    """
    arg_parser = argparse.ArgumentParser(description="Download previous state from S3")

    # All options are mandatory; each pair is (flag, help text)
    required_options = (
        ("--s3-bucket", "S3 bucket name"),
        ("--s3-folder", "S3 folder name"),
        ("--date-time-file", "Path to store the start date and time"),
        ("--norm-results-file", "Path to store the previous normalization results"),
    )
    for flag, help_text in required_options:
        arg_parser.add_argument(flag, required=True, help=help_text)

    cli_args = arg_parser.parse_args()

    try:
        download_previous_state(
            cli_args.s3_bucket,
            cli_args.s3_folder,
            cli_args.date_time_file,
            cli_args.norm_results_file,
        )
    except Exception as e:
        logger.error(f"Error downloading previous state: {e}")
        sys.exit(1)

    sys.exit(0)
160 | 
161 | 
# Run the step when this module is executed as a script; main() sets the exit status itself.
if __name__ == "__main__":
    main()
164 | 


--------------------------------------------------------------------------------
/src/tests/test_step7_normalize_messages.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """Unit tests for step7_normalize_messages.py."""
  3 | 
  4 | import json
  5 | import tempfile
  6 | from pathlib import Path
  7 | from typing import Any, Dict, List
  8 | 
  9 | import pytest
 10 | 
 11 | from platform_problem_monitoring_core.step7_normalize_messages import normalize_messages
 12 | 
 13 | 
 14 | class TestStep7NormalizeMessages:
 15 |     """Tests for the normalize_messages function."""
 16 | 
 17 |     @pytest.fixture
 18 |     def sample_extracted_data(self) -> List[Dict[str, Any]]:
 19 |         """Provide sample extracted data for testing."""
 20 |         return [
 21 |             {
 22 |                 "id": "test-id-1",
 23 |                 "index": "logstash-test-index-1",
 24 |                 "message": (
 25 |                     "Error occurred at 2025-03-05T10:57:14.135052+00:00 in file "
 26 |                     "/opt/website/prod/backend-app/src/App/Controller.php"
 27 |                 ),
 28 |             },
 29 |             {
 30 |                 "id": "test-id-2",
 31 |                 "index": "logstash-test-index-2",
 32 |                 "message": "User 133d8fdf-5a47-11eb-9edb-0685f7490bd8 logged in from IP 10.0.11.128",
 33 |             },
 34 |             {
 35 |                 "id": "test-id-3",
 36 |                 "index": "logstash-test-index-3",
 37 |                 "message": 'Request from session_id="cpnfegs3qjho575ua7dk1o533o" failed with status code 400',
 38 |             },
 39 |         ]
 40 | 
 41 |     @pytest.fixture
 42 |     def extracted_fields_path(self) -> str:
 43 |         """Provide the path to the extracted_fields.jsonl fixture."""
 44 |         return "src/tests/fixtures/extracted_fields.jsonl"
 45 | 
 46 |     def test_normalize_messages_with_sample_data(self, sample_extracted_data: List[Dict[str, Any]]) -> None:
 47 |         """Test normalize_messages with a small sample of data."""
 48 |         # Create temporary files for input and output
 49 |         with (
 50 |             tempfile.NamedTemporaryFile(mode="w+", delete=False) as input_file,
 51 |             tempfile.NamedTemporaryFile(mode="w+", delete=False) as output_file,
 52 |         ):
 53 | 
 54 |             # Write sample data to input file (one JSON object per line)
 55 |             for doc in sample_extracted_data:
 56 |                 input_file.write(json.dumps(doc) + "\n")
 57 |             input_file.flush()
 58 | 
 59 |             input_path = Path(input_file.name)
 60 |             output_path = Path(output_file.name)
 61 | 
 62 |             try:
 63 |                 # Run the normalize_messages function
 64 |                 normalize_messages(str(input_path), str(output_path))
 65 | 
 66 |                 # Read and verify the output
 67 |                 with output_path.open("r") as f:
 68 |                     output_content = f.read()
 69 |                     normalized_data = json.loads(output_content)
 70 | 
 71 |                 # Verify the structure of the output
 72 |                 assert "patterns" in normalized_data
 73 |                 assert isinstance(normalized_data["patterns"], list)
 74 | 
 75 |                 # Should have patterns for our sample data
 76 |                 assert len(normalized_data["patterns"]) > 0
 77 | 
 78 |                 # Check that each pattern has the expected fields
 79 |                 for pattern in normalized_data["patterns"]:
 80 |                     assert "cluster_id" in pattern
 81 |                     assert "count" in pattern
 82 |                     assert "pattern" in pattern
 83 |                     assert "sample_doc_references" in pattern
 84 | 
 85 |                 # Check for masking in patterns
 86 |                 patterns_text = " ".join([p["pattern"] for p in normalized_data["patterns"]])
 87 |                 # Check that sensitive information is masked
 88 |                 assert "2025-03-05T10:57:14.135052+00:00" not in patterns_text
 89 |                 assert "TIMESTAMP" in patterns_text or "timestamp" in patterns_text.lower()
 90 |                 assert "133d8fdf-5a47-11eb-9edb-0685f7490bd8" not in patterns_text
 91 |                 assert "UUID" in patterns_text or "uuid" in patterns_text.lower()
 92 |                 assert "10.0.11.128" not in patterns_text
 93 |                 assert "IP" in patterns_text or "ip" in patterns_text.lower()
 94 | 
 95 |             finally:
 96 |                 # Clean up temporary files
 97 |                 Path(input_file.name).unlink(missing_ok=True)
 98 |                 Path(output_file.name).unlink(missing_ok=True)
 99 | 
100 |     def test_normalize_messages_with_fixture(self, extracted_fields_path: str) -> None:
101 |         """Test normalize_messages with the extracted_fields.jsonl fixture."""
102 |         # Create a temporary file for output
103 |         with tempfile.NamedTemporaryFile(mode="w+", delete=False) as output_file:
104 |             output_path = Path(output_file.name)
105 | 
106 |             try:
107 |                 # Run the normalize_messages function
108 |                 normalize_messages(extracted_fields_path, str(output_path))
109 | 
110 |                 # Read and verify the output
111 |                 with output_path.open("r") as f:
112 |                     output_content = f.read()
113 |                     normalized_data = json.loads(output_content)
114 | 
115 |                 # Verify the structure of the output
116 |                 assert "patterns" in normalized_data
117 |                 assert isinstance(normalized_data["patterns"], list)
118 |                 assert len(normalized_data["patterns"]) > 0
119 | 
120 |                 # Check that each pattern has the expected fields
121 |                 for pattern in normalized_data["patterns"]:
122 |                     assert "cluster_id" in pattern
123 |                     assert "count" in pattern
124 |                     assert "pattern" in pattern
125 |                     assert "sample_doc_references" in pattern
126 | 
127 |                 # Verify that at least some patterns have multiple occurrences
128 |                 assert any(pattern["count"] > 1 for pattern in normalized_data["patterns"])
129 | 
130 |             finally:
131 |                 # Clean up temporary file
132 |                 Path(output_file.name).unlink(missing_ok=True)
133 | 
134 |     def test_normalize_messages_with_missing_file(self) -> None:
135 |         """Test normalize_messages with a non-existent input file."""
136 |         # Create a temporary file for output
137 |         with tempfile.NamedTemporaryFile(mode="w+", delete=False) as output_file:
138 |             output_path = Path(output_file.name)
139 | 
140 |             try:
141 |                 # Try to run normalize_messages with a non-existent input file
142 |                 with pytest.raises(FileNotFoundError):
143 |                     normalize_messages("non_existent_file.jsonl", str(output_path))
144 |             finally:
145 |                 # Clean up temporary file
146 |                 Path(output_file.name).unlink(missing_ok=True)
147 | 
148 |     def test_normalize_messages_with_invalid_json(self) -> None:
149 |         """Test normalize_messages with invalid JSON input."""
150 |         # Create temporary files for input and output
151 |         with (
152 |             tempfile.NamedTemporaryFile(mode="w+", delete=False) as input_file,
153 |             tempfile.NamedTemporaryFile(mode="w+", delete=False) as output_file,
154 |         ):
155 | 
156 |             # Write invalid JSON to input file
157 |             input_file.write("{invalid json")
158 |             input_file.flush()
159 | 
160 |             input_path = Path(input_file.name)
161 |             output_path = Path(output_file.name)
162 | 
163 |             try:
164 |                 # The function logs a warning but doesn't raise an exception for invalid JSON
165 |                 normalize_messages(str(input_path), str(output_path))
166 | 
167 |                 # Verify that the output file exists and contains valid JSON
168 |                 with output_path.open("r") as f:
169 |                     output_content = f.read()
170 |                     normalized_data = json.loads(output_content)
171 | 
172 |                 # Should have an empty patterns list since no valid input was processed
173 |                 assert "patterns" in normalized_data
174 |                 assert isinstance(normalized_data["patterns"], list)
175 |                 assert len(normalized_data["patterns"]) == 0
176 | 
177 |             finally:
178 |                 # Clean up temporary files
179 |                 Path(input_file.name).unlink(missing_ok=True)
180 |                 Path(output_file.name).unlink(missing_ok=True)
181 | 


--------------------------------------------------------------------------------
/src/platform_problem_monitoring_core/step10_send_email_report.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """Send email report."""
  3 | 
  4 | import argparse
  5 | import smtplib
  6 | import ssl
  7 | import sys
  8 | from email.mime.multipart import MIMEMultipart
  9 | from email.mime.text import MIMEText
 10 | from pathlib import Path
 11 | 
 12 | from platform_problem_monitoring_core.utils import logger
 13 | 
 14 | 
 15 | def wrap_long_lines(content: str, max_line_length: int = 990) -> str:
 16 |     """
 17 |     Wrap long lines in content to ensure they don't exceed max_line_length.
 18 | 
 19 |     Uses a more careful approach for HTML content.
 20 | 
 21 |     Args:
 22 |         content: The content to wrap
 23 |         max_line_length: Maximum length for each line (default 990, as per RFC 5322)
 24 | 
 25 |     Returns:
 26 |         Content with lines wrapped to max_line_length
 27 |     """
 28 |     result = []
 29 | 
 30 |     for line in content.splitlines():
 31 |         if len(line) <= max_line_length:
 32 |             # Line is already short enough
 33 |             result.append(line)
 34 |             continue
 35 | 
 36 |         # For HTML content, we need to be careful about where we insert line breaks
 37 |         current_position = 0
 38 |         line_length = len(line)
 39 | 
 40 |         while current_position < line_length:
 41 |             # Determine where to cut the line
 42 |             end_pos = min(current_position + max_line_length, line_length)
 43 | 
 44 |             # If we're in the middle of an HTML tag, try to find the end of it
 45 |             if "<" in line[current_position:end_pos]:
 46 |                 # We are potentially cutting through an HTML tag
 47 |                 last_open_tag = line.rfind("<", current_position, end_pos)
 48 |                 last_close_tag = line.rfind(">", current_position, end_pos)
 49 | 
 50 |                 if last_open_tag > last_close_tag:
 51 |                     # We're inside a tag, cut before the tag starts
 52 |                     if last_open_tag > current_position:
 53 |                         end_pos = last_open_tag
 54 |                     else:
 55 |                         # The tag itself is very long, find the next closing bracket
 56 |                         next_close = line.find(">", current_position)
 57 |                         if next_close != -1 and next_close < current_position + max_line_length * 2:
 58 |                             # If closing tag is within reasonable distance, include the whole tag
 59 |                             end_pos = next_close + 1
 60 | 
 61 |             # Add the segment to result
 62 |             result.append(line[current_position:end_pos])
 63 |             current_position = end_pos
 64 | 
 65 |     return "\n".join(result)
 66 | 
 67 | 
 68 | def send_email_report(
 69 |     html_file: str,
 70 |     text_file: str,
 71 |     subject: str,
 72 |     smtp_host: str,
 73 |     smtp_port: int,
 74 |     smtp_user: str,
 75 |     smtp_pass: str,
 76 |     sender: str,
 77 |     receiver: str,
 78 |     use_tls: bool = True,
 79 | ) -> None:
 80 |     """
 81 |     Send email report.
 82 | 
 83 |     Args:
 84 |         html_file: Path to the HTML email body file
 85 |         text_file: Path to the plaintext email body file
 86 |         subject: Email subject
 87 |         smtp_host: SMTP server hostname
 88 |         smtp_port: SMTP server port
 89 |         smtp_user: SMTP username
 90 |         smtp_pass: SMTP password
 91 |         sender: Sender email address
 92 |         receiver: Receiver email address
 93 |         use_tls: Whether to use TLS encryption (default: True)
 94 | 
 95 |     Raises:
 96 |         FileNotFoundError: If either email body file doesn't exist
 97 |         smtplib.SMTPException: If there's an error sending the email
 98 |         OSError: If there's an error reading the email body files
 99 |     """
100 |     logger.info("Sending email report")
101 |     logger.info(f"HTML file: {html_file}")
102 |     logger.info(f"Text file: {text_file}")
103 |     logger.info(f"Subject: {subject}")
104 |     logger.info(f"SMTP host: {smtp_host}")
105 |     logger.info(f"SMTP port: {smtp_port}")
106 |     logger.info(f"SMTP user: {smtp_user}")
107 |     logger.info(f"Sender: {sender}")
108 |     logger.info(f"Receiver: {receiver}")
109 |     logger.info(f"Use TLS: {use_tls}")
110 | 
111 |     # Check if files exist
112 |     html_path = Path(html_file)
113 |     text_path = Path(text_file)
114 | 
115 |     if not html_path.exists():
116 |         error_msg = f"HTML email body file not found: {html_file}"
117 |         logger.error(error_msg)
118 |         raise FileNotFoundError(error_msg)
119 | 
120 |     if not text_path.exists():
121 |         error_msg = f"Text email body file not found: {text_file}"
122 |         logger.error(error_msg)
123 |         raise FileNotFoundError(error_msg)
124 | 
125 |     try:
126 |         # Read the email bodies
127 |         with html_path.open("r") as f:
128 |             html_body = f.read()
129 | 
130 |         with text_path.open("r") as f:
131 |             text_body = f.read()
132 | 
133 |         # Wrap long lines to avoid SMTP line length limits (RFC 5322 says 998 characters max)
134 |         # Use a more conservative 900 characters to be safe with different SMTP servers
135 |         html_body = wrap_long_lines(html_body, max_line_length=900)
136 |         text_body = wrap_long_lines(text_body, max_line_length=900)
137 | 
138 |         # Create message
139 |         msg = MIMEMultipart("alternative")
140 |         msg["Subject"] = subject
141 |         msg["From"] = sender
142 |         msg["To"] = receiver
143 | 
144 |         # Attach parts
145 |         msg.attach(MIMEText(text_body, "plain"))
146 |         msg.attach(MIMEText(html_body, "html"))
147 | 
148 |         logger.info(f"Connecting to SMTP server {smtp_host}:{smtp_port}")
149 | 
150 |         # Send the email
151 |         server = None
152 |         try:
153 |             server = smtplib.SMTP(smtp_host, smtp_port, timeout=30)
154 | 
155 |             # Optional: Enable debug output
156 |             # server.set_debuglevel(1)
157 | 
158 |             # Use TLS if requested
159 |             if use_tls:
160 |                 context = ssl.create_default_context()
161 |                 server.starttls(context=context)
162 | 
163 |             server.login(smtp_user, smtp_pass)
164 |             server.sendmail(sender, receiver, msg.as_string())
165 |             logger.info("Email report sent successfully")
166 |         except smtplib.SMTPException as e:
167 |             error_msg = f"SMTP error: {str(e)}"
168 |             logger.error(error_msg)
169 |             raise
170 |         finally:
171 |             if server is not None:
172 |                 server.quit()
173 |                 logger.debug("SMTP connection closed")
174 | 
175 |     except (OSError, smtplib.SMTPException) as e:
176 |         if isinstance(e, FileNotFoundError):
177 |             logger.error(f"Email body file not found: {e}")
178 |         elif isinstance(e, smtplib.SMTPException):
179 |             logger.error(f"SMTP error: {e}")
180 |         else:
181 |             logger.error(f"Error sending email: {e}")
182 |         raise
183 | 
184 | 
def main() -> None:
    """Parse the command line, send the report, and exit with 0 on success or 1 on failure."""
    parser = argparse.ArgumentParser(description="Send email report")

    # All mandatory options, in the order they should appear in the help output.
    mandatory_options = [
        ("--html-file", {"help": "Path to the HTML email body file"}),
        ("--text-file", {"help": "Path to the plaintext email body file"}),
        ("--subject", {"help": "Email subject"}),
        ("--smtp-host", {"help": "SMTP server hostname"}),
        ("--smtp-port", {"type": int, "help": "SMTP server port"}),
        ("--smtp-user", {"help": "SMTP username"}),
        ("--smtp-pass", {"help": "SMTP password"}),
        ("--sender", {"help": "Sender email address"}),
        ("--receiver", {"help": "Receiver email address"}),
    ]
    for flag, extra in mandatory_options:
        parser.add_argument(flag, required=True, **extra)
    parser.add_argument("--no-tls", action="store_true", help="Disable TLS encryption")

    options = parser.parse_args()

    try:
        send_email_report(
            html_file=options.html_file,
            text_file=options.text_file,
            subject=options.subject,
            smtp_host=options.smtp_host,
            smtp_port=options.smtp_port,
            smtp_user=options.smtp_user,
            smtp_pass=options.smtp_pass,
            sender=options.sender,
            receiver=options.receiver,
            # The flag is negative on the command line, positive in the API.
            use_tls=not options.no_tls,
        )
    except Exception as e:
        logger.error(f"Error sending email report: {str(e)}")
        sys.exit(1)

    sys.exit(0)


if __name__ == "__main__":
    main()
222 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Platform Problem Monitoring — Core Application
  2 | 
  3 | [![Tests](https://github.com/dx-tooling/platform-problem-monitoring-core/actions/workflows/tests.yml/badge.svg)](https://github.com/dx-tooling/platform-problem-monitoring-core/actions/workflows/tests.yml)
  4 | [![Code Quality](https://github.com/dx-tooling/platform-problem-monitoring-core/actions/workflows/code-quality.yml/badge.svg)](https://github.com/dx-tooling/platform-problem-monitoring-core/actions/workflows/code-quality.yml)
  5 | 
  6 | A proactive monitoring solution that automatically analyzes your Elasticsearch logs, detects patterns, and delivers concise email reports about your platform's health.
  7 | 
  8 | If you already have an ELK stack up and running, Platform Problem Monitoring delivers *this* ↓ into your mailbox every hour:
  9 | 
 10 | <p align="center">
 11 |   <img src="assets/readme-hero-image.png" alt="Platform Problem Monitoring">
 12 | </p>
 13 | 
 14 | ## What This Tool Does
 15 | 
 16 | Platform Problem Monitoring Core helps platform engineers and system administrators by:
 17 | 
 18 | - **Detecting problems automatically** — Identifies errors, exceptions, and warnings in your logs without manual searching
 19 | - **Recognizing patterns** — Normalizes similar log messages to reveal systemic issues
 20 | - **Tracking changes over time** — Compares current issues with previous runs to show what's new, increasing, or decreasing
 21 | - **Delivering digestible reports** — Sends clear, well-formatted email reports with Kibana links to examples
 22 | 
 23 | ## Is This Tool Right For You?
 24 | 
 25 | This tool is ideal if:
 26 | 
 27 | - You already have an ELK (Elasticsearch, Logstash, Kibana) stack collecting logs
 28 | - You want automated, periodic health assessments of your platform
 29 | - You prefer receiving digestible summaries rather than real-time alerts
 30 | - You need to understand patterns and trends in your platform's problems
 31 | 
 32 | ## Prerequisites
 33 | 
 34 | - **Python 3.10+** installed on the host system
 35 | - **Network access** to:
 36 |   - Your Elasticsearch server
 37 |   - An AWS S3 bucket (for state storage between runs)
 38 |   - An SMTP server (for sending reports)
 39 | - **Credentials** for all these services
 40 | 
 41 | ## Quick Start
 42 | 
 43 | 1. **Clone the repository:**
 44 |    ```bash
 45 |    git clone https://github.com/dx-tooling/platform-problem-monitoring-core.git
 46 |    cd platform-problem-monitoring-core
 47 |    ```
 48 | 
 49 | 2. **Set up a virtual environment:**
 50 |    ```bash
 51 |    python3 -m venv venv
 52 |    source venv/bin/activate  # On Windows: venv\Scripts\activate
 53 |    ```
 54 | 
 55 | 3. **Install the package:**
 56 |    ```bash
 57 |    pip3 install -e .
 58 |    ```
 59 | 
 60 | 4. **Create a configuration file:**
 61 |    ```bash
 62 |    mkdir -p /etc/ppmc
 63 |    cp etc/main.conf.dist /etc/ppmc/main.conf
 64 |    ```
 65 | 
 66 | 5. **Edit the configuration:**
 67 |    ```
 68 |    REMOTE_STATE_S3_BUCKET_NAME="your-s3-bucket"
 69 |    REMOTE_STATE_S3_FOLDER_NAME="platform-monitoring"
 70 | 
 71 |    ELASTICSEARCH_SERVER_BASE_URL="https://your-elasticsearch-server:9200"
 72 |    ELASTICSEARCH_LUCENE_QUERY_FILE_PATH="path/to/lucene_query.json"
 73 | 
 74 |    KIBANA_DISCOVER_BASE_URL="https://your-kibana-server:5601"
 75 |    KIBANA_DOCUMENT_DEEPLINK_URL_STRUCTURE="https://your-kibana-server:5601/app/discover#/doc/logstash-*/{{index}}?id={{id}}"
 76 | 
 77 |    SMTP_SERVER_HOSTNAME="smtp.example.com"
 78 |    SMTP_SERVER_PORT="587"
 79 |    SMTP_SERVER_USERNAME="your-smtp-username"
 80 |    SMTP_SERVER_PASSWORD="your-smtp-password"
 81 |    SMTP_SENDER_ADDRESS="monitoring@example.com"
 82 |    SMTP_RECEIVER_ADDRESS="alerts@example.com"
 83 |    ```
 84 | 
 85 | 6. **Set up the Elasticsearch query:**
 86 |    ```bash
 87 |    cp etc/lucene_query.json.dist /etc/ppmc/lucene_query.json
 88 |    ```
 89 | 
 90 |    This sample query looks for error messages while filtering out noise:
 91 |    ```json
 92 |    {
 93 |        "query": {
 94 |            "bool": {
 95 |                "should": [
 96 |                    { "match": { "message": "error" } },
 97 |                    { "match": { "message": "failure" } },
 98 |                    { "match": { "message": "critical" } },
 99 |                    { "match": { "message": "alert" } },
100 |                    { "match": { "message": "exception" } }
101 |                ],
102 |                "must_not": [
103 |                    { "match": { "message": "User Deprecated" } },
104 |                    { "match": { "message": "logstash" } },
105 |                    { "term": { "syslog_program": "dd.collector" } }
106 |                ],
107 |                "minimum_should_match": 1
108 |            }
109 |        }
110 |    }
111 |    ```
112 | 
113 | 7. **Run the tool:**
114 |    ```bash
115 |    ./bin/ppmc /etc/ppmc/main.conf
116 |    ```
117 | 
118 | ## How It Works
119 | 
120 | When executed, the tool:
121 | 
122 | 1. **Prepares the environment** by creating a temporary work directory
123 | 2. **Downloads previous state** from S3 (for comparison)
124 | 3. **Queries Elasticsearch** for new problem-related log messages since the last run
125 | 4. **Extracts relevant fields** from the returned documents
126 | 5. **Normalizes messages** by replacing dynamic parts like UUIDs, timestamps, and specific values with placeholders
127 | 6. **Compares current patterns** with the previous run to identify new, increased, and decreased issues
128 | 7. **Generates an email report** with detailed information about all identified issues
129 | 8. **Sends the report** via your configured SMTP server
130 | 9. **Stores the current state** in S3 for the next run
131 | 10. **Cleans up** temporary files
132 | 
133 | ## Common Configuration Scenarios
134 | 
135 | ### Example: Monitoring a Kubernetes Cluster
136 | 
137 | For a Kubernetes deployment, you might want to focus on pod-related errors:
138 | 
139 | ```json
140 | {
141 |     "query": {
142 |         "bool": {
143 |             "should": [
144 |                 { "match": { "kubernetes.pod.name": "*" } },
145 |                 { "match_phrase": { "message": "error" } },
146 |                 { "match_phrase": { "message": "exception" } }
147 |             ],
148 |             "must_not": [
149 |                 { "match": { "message": "liveness probe failed" } }
150 |             ],
151 |             "minimum_should_match": 2
152 |         }
153 |     }
154 | }
155 | ```
156 | 
157 | ### Example: Monitoring Web Services
158 | 
159 | For web services, you might focus on HTTP errors and performance issues:
160 | 
161 | ```json
162 | {
163 |     "query": {
164 |         "bool": {
165 |             "should": [
166 |                 { "range": { "http.response.status_code": { "gte": 500 } } },
167 |                 { "range": { "response_time_ms": { "gte": 1000 } } },
168 |                 { "match_phrase": { "message": "timed out" } }
169 |             ],
170 |             "minimum_should_match": 1
171 |         }
172 |     }
173 | }
174 | ```
175 | 
176 | ## Scheduled Monitoring
177 | 
178 | To run the tool periodically, set up a cron job:
179 | 
180 | ```bash
181 | # Run every 6 hours
182 | 0 */6 * * * cd /path/to/platform-problem-monitoring-core && ./bin/ppmc ./etc/main.conf >> /var/log/platform-monitoring.log 2>&1
183 | ```
184 | 
185 | ## Advanced Configuration
186 | 
187 | ### Configuring AWS Credentials
188 | 
189 | The tool uses boto3's default credential resolution. You can:
190 | 
191 | 1. Set environment variables: `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`
192 | 2. Use a shared credentials file (`~/.aws/credentials`)
193 | 3. Use IAM roles if running on EC2 instances
194 | 
195 | ## Troubleshooting
196 | 
197 | ### No Reports Being Sent
198 | 
199 | 1. Check Elasticsearch connectivity: `curl -X GET https://your-elasticsearch-server:9200/_cat/indices`
200 | 2. Verify S3 bucket permissions
201 | 3. Test SMTP connectivity and STARTTLS support: `openssl s_client -starttls smtp -connect smtp.example.com:587`
202 | 4. Check your query matches actual log patterns
203 | 
204 | ### Reports Missing Expected Issues
205 | 
206 | 1. Test your Elasticsearch query directly in Kibana
207 | 2. Check the date range — are you missing events due to time zone issues?
208 | 3. Adjust the Lucene query to be more inclusive
209 | 
210 | ### Performance Issues
211 | 
212 | For large log volumes:
213 | 
214 | 1. Increase the time between runs to process more logs at once
215 | 2. Optimize your Elasticsearch query with more specific filters
216 | 3. Ensure the host running the tool has sufficient memory
217 | 
218 | ## Getting Help
219 | 
220 | If you encounter problems or have questions, please:
221 | 
222 | 1. Check the detailed logs in your temporary work directory
223 | 2. Open an issue in our repository with your configuration (with sensitive data removed)
224 | 3. Include error messages and steps to reproduce the issue
225 | 
226 | ## License
227 | 
228 | This project is available under the MIT License — Copyright (c) 2025 Manuel Kießling.
229 | 


--------------------------------------------------------------------------------
/src/platform_problem_monitoring_core/step3_retrieve_hourly_problem_numbers.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """Retrieve number of problem logstash documents per hour."""
  3 | 
  4 | import argparse
  5 | import datetime
  6 | import json
  7 | import sys
  8 | import time
  9 | from datetime import timezone
 10 | from pathlib import Path
 11 | from typing import Any, Dict, List
 12 | 
 13 | import requests
 14 | from requests.exceptions import ConnectionError as RequestsConnectionError
 15 | from requests.exceptions import HTTPError, Timeout
 16 | 
 17 | from platform_problem_monitoring_core.utils import load_json, logger, save_json
 18 | 
 19 | 
 20 | def _generate_hour_ranges(hours_back: int) -> List[Dict[str, str]]:
 21 |     """
 22 |     Generate a list of hour ranges going back from now.
 23 | 
 24 |     Args:
 25 |         hours_back: Number of hours to go back in time
 26 | 
 27 |     Returns:
 28 |         List of dictionaries containing start and end times for each hour
 29 |     """
 30 |     now = datetime.datetime.now(timezone.utc)
 31 |     ranges = []
 32 | 
 33 |     for i in range(hours_back - 1, -1, -1):  # Go from oldest to newest
 34 |         end_time = now - datetime.timedelta(hours=i)
 35 |         start_time = end_time - datetime.timedelta(hours=1)
 36 |         ranges.append({"start": start_time.isoformat(), "end": end_time.isoformat()})
 37 | 
 38 |     return ranges
 39 | 
 40 | 
 41 | def _add_time_range_to_query(query_data: Dict[str, Any], start_time: str, end_time: str) -> Dict[str, Any]:
 42 |     """
 43 |     Add time range filter to the Elasticsearch query.
 44 | 
 45 |     Args:
 46 |         query_data: Original query data
 47 |         start_time: Start time in ISO format
 48 |         end_time: End time in ISO format
 49 | 
 50 |     Returns:
 51 |         Updated query data with time range filter
 52 |     """
 53 |     # Create a deep copy to avoid modifying the original
 54 |     query_copy: Dict[str, Any] = json.loads(json.dumps(query_data))
 55 | 
 56 |     if "query" in query_copy:
 57 |         if "bool" in query_copy["query"]:
 58 |             if "filter" not in query_copy["query"]["bool"]:
 59 |                 query_copy["query"]["bool"]["filter"] = []
 60 | 
 61 |             # Add time range filter
 62 |             query_copy["query"]["bool"]["filter"].append({"range": {"@timestamp": {"gte": start_time, "lt": end_time}}})
 63 |         else:
 64 |             # If there's no bool query, create one
 65 |             original_query = query_copy["query"]
 66 |             query_copy["query"] = {
 67 |                 "bool": {
 68 |                     "must": [original_query],
 69 |                     "filter": [{"range": {"@timestamp": {"gte": start_time, "lt": end_time}}}],
 70 |                 }
 71 |             }
 72 |     else:
 73 |         # If there's no query at all, create a simple one
 74 |         query_copy["query"] = {"bool": {"filter": [{"range": {"@timestamp": {"gte": start_time, "lt": end_time}}}]}}
 75 | 
 76 |     return query_copy
 77 | 
 78 | 
 79 | def _query_elasticsearch_for_hour(
 80 |     elasticsearch_url: str,
 81 |     query_data: Dict[str, Any],
 82 |     start_time: str,
 83 |     end_time: str,
 84 |     max_retries: int = 3,
 85 |     timeout: int = 30,
 86 | ) -> int:
 87 |     """
 88 |     Query Elasticsearch for the number of documents in a specific hour.
 89 | 
 90 |     Args:
 91 |         elasticsearch_url: Elasticsearch server URL
 92 |         query_data: Query to execute
 93 |         start_time: Start time in ISO format
 94 |         end_time: End time in ISO format
 95 |         max_retries: Maximum number of connection retry attempts
 96 |         timeout: Connection timeout in seconds
 97 | 
 98 |     Returns:
 99 |         Number of matching documents
100 | 
101 |     Raises:
102 |         RequestsConnectionError: If unable to connect to Elasticsearch
103 |         HTTPError: If Elasticsearch returns an error response
104 |         Timeout: If the connection times out
105 |     """
106 |     # Add time range to query
107 |     query_with_time = _add_time_range_to_query(json.loads(json.dumps(query_data)), start_time, end_time)
108 | 
109 |     # Create the search URL
110 |     search_url = f"{elasticsearch_url.rstrip('/')}/logstash-*/_count"
111 |     headers = {"Content-Type": "application/json"}
112 | 
113 |     retry_count = 0
114 |     last_error = None
115 | 
116 |     while retry_count < max_retries:
117 |         try:
118 |             # Execute the query
119 |             response = requests.post(search_url, headers=headers, json=query_with_time, timeout=timeout)
120 |             response.raise_for_status()
121 | 
122 |             # Extract the count from the response
123 |             result = response.json()
124 |             count: int = result.get("count", 0)
125 |             return count
126 | 
127 |         except (RequestsConnectionError, HTTPError, Timeout) as e:
128 |             last_error = e
129 |             retry_count += 1
130 |             logger.warning(f"Query attempt {retry_count} failed for range {start_time} to {end_time}: {str(e)}")
131 | 
132 |             if retry_count < max_retries:
133 |                 # Exponential backoff: 1s, 2s, 4s, etc.
134 |                 wait_time = 2 ** (retry_count - 1)
135 |                 logger.info(f"Retrying in {wait_time} seconds...")
136 |                 time.sleep(wait_time)
137 | 
138 |     # If we get here, all retries failed
139 |     error_msg = f"Failed to query Elasticsearch after {max_retries} attempts: {str(last_error)}"
140 |     logger.error(error_msg)
141 |     raise RequestsConnectionError(error_msg) from last_error
142 | 
143 | 
def retrieve_hourly_problem_numbers(elasticsearch_url: str, query_file: str, hours_back: int, output_file: str) -> None:
    """
    Count problem documents for each of the last hours and store the counts as JSON.

    The output is a list with one entry per hour, oldest first, each holding
    "start_time", "end_time" and "count". Nothing is written if any hour fails.

    Args:
        elasticsearch_url: Elasticsearch server URL
        query_file: Path to the Lucene query file
        hours_back: Number of hours to go back in time
        output_file: Path to store the hourly numbers

    Raises:
        Exception: Whatever loading the query, querying an hour range, or
            writing the output raises is propagated unchanged
    """
    logger.info("Retrieving hourly problem numbers")
    logger.info(f"Elasticsearch URL: {elasticsearch_url}")
    logger.info(f"Query file: {query_file}")
    logger.info(f"Hours back: {hours_back}")
    logger.info(f"Output file: {output_file}")

    query_data = load_json(query_file)
    logger.info(f"Loaded query: {json.dumps(query_data, indent=2)}")

    hour_ranges = _generate_hour_ranges(hours_back)
    logger.info(f"Generated {len(hour_ranges)} hour ranges")

    results = []
    for window in hour_ranges:
        start_time, end_time = window["start"], window["end"]

        try:
            count = _query_elasticsearch_for_hour(elasticsearch_url, query_data, start_time, end_time)
        except Exception as e:
            # Record which window failed, then abort the whole run.
            logger.error(f"Error querying hour range {start_time} to {end_time}: {str(e)}")
            raise

        results.append({"start_time": start_time, "end_time": end_time, "count": count})
        logger.info(f"Hour {start_time} to {end_time}: {count} documents")

    # The target directory may not exist yet.
    Path(output_file).parent.mkdir(parents=True, exist_ok=True)

    save_json(results, output_file)
    logger.info(f"Results saved to {output_file}")
196 | 
197 | 
def main() -> None:
    """Parse the command line, retrieve the hourly numbers, and exit with 0 on success or 1 on failure."""
    parser = argparse.ArgumentParser(description="Retrieve number of problem logstash documents per hour")

    for flag, description in (
        ("--elasticsearch-url", "Elasticsearch server URL"),
        ("--query-file", "Path to the Lucene query file"),
    ):
        parser.add_argument(flag, required=True, help=description)
    # The only optional setting: defaults to a full day of history.
    parser.add_argument("--hours-back", type=int, default=24, help="Number of hours to go back in time")
    parser.add_argument("--output-file", required=True, help="Path to store the hourly numbers")

    cli = parser.parse_args()

    try:
        retrieve_hourly_problem_numbers(cli.elasticsearch_url, cli.query_file, cli.hours_back, cli.output_file)
    except Exception as e:
        logger.error(f"Error retrieving hourly problem numbers: {str(e)}")
        sys.exit(1)

    sys.exit(0)


if __name__ == "__main__":
    main()
218 | 


--------------------------------------------------------------------------------
/src/platform_problem_monitoring_core/step4_generate_trend_chart.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """Generate trend bar chart for problem logstash documents per hour."""
  3 | 
  4 | import argparse
  5 | import sys
  6 | from datetime import datetime
  7 | from pathlib import Path
  8 | from typing import List, Tuple
  9 | 
 10 | import matplotlib
 11 | 
 12 | matplotlib.use("Agg")  # Non-interactive backend
 13 | import matplotlib.dates as mdates
 14 | import matplotlib.pyplot as plt
 15 | import seaborn as sns
 16 | 
 17 | from platform_problem_monitoring_core.utils import load_json, logger
 18 | 
 19 | 
 20 | def _parse_hourly_data(hourly_data_file: str) -> Tuple[List[datetime], List[int]]:
 21 |     """
 22 |     Parse hourly data from JSON file.
 23 | 
 24 |     Args:
 25 |         hourly_data_file: Path to the hourly data JSON file
 26 | 
 27 |     Returns:
 28 |         Tuple of (timestamps, counts)
 29 | 
 30 |     Raises:
 31 |         FileNotFoundError: If the hourly data file doesn't exist
 32 |         json.JSONDecodeError: If the file contains invalid JSON
 33 |         KeyError: If required fields are missing in the JSON data
 34 |     """
 35 |     data = load_json(hourly_data_file)
 36 |     timestamps: List[datetime] = []
 37 |     counts: List[int] = []
 38 | 
 39 |     if not data:
 40 |         logger.warning("Hourly data file is empty or contains no valid entries")
 41 |         return timestamps, counts
 42 | 
 43 |     try:
 44 |         for entry in data:
 45 |             # Parse the end time as that represents the hour's data point
 46 |             # Handle both "Z" (UTC) suffix and +00:00 format
 47 |             end_time = entry.get("end_time", "")
 48 |             if not end_time:
 49 |                 logger.warning(f"Skipping entry with missing end_time: {entry}")
 50 |                 continue
 51 | 
 52 |             # Standardize date format
 53 |             end_time = end_time.replace("Z", "+00:00")
 54 |             timestamps.append(datetime.fromisoformat(end_time))
 55 | 
 56 |             # Get count, defaulting to 0 if missing
 57 |             count = entry.get("count", 0)
 58 |             counts.append(count)
 59 |     except (ValueError, TypeError) as e:
 60 |         logger.error(f"Error parsing hourly data: {e}")
 61 |         error_msg = f"Invalid date format in hourly data: {e}"
 62 |         raise ValueError(error_msg) from e
 63 | 
 64 |     return timestamps, counts
 65 | 
 66 | 
def _format_x_axis_labels(ax: plt.Axes, timestamps: List[datetime]) -> None:
    """
    Format x-axis labels for better readability.

    Configures the tick locator/formatter, label rotation and bottom margin,
    then places explicit date-time ticks at the first, middle and last entry
    of ``timestamps`` (first and last only when fewer than three entries are
    given), hides all axis spines and adds tick padding. "First", "middle" and
    "last" refer to list positions; the list is not sorted here.

    Returns early, after logging a warning, when ``timestamps`` is empty.

    Args:
        ax: Matplotlib axes object
        timestamps: List of datetime objects

    Returns:
        None; ``ax`` and the current pyplot figure are modified in place
    """
    if not timestamps:
        logger.warning("No timestamps provided for axis formatting")
        return

    # Set major ticks every two hours, labelled as HH:MM
    # NOTE(review): the explicit set_xticks()/set_xticklabels() calls further down
    # presumably replace this locator and formatter - confirm whether these two
    # lines still have a visible effect on the rendered chart
    ax.xaxis.set_major_locator(mdates.HourLocator(interval=2))
    ax.xaxis.set_major_formatter(mdates.DateFormatter("%H:%M"))

    # Rotate labels for better readability but provide more space
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor", fontsize=9)

    # Add extra space at the bottom to prevent labels from being cut off
    # (operates on pyplot's current figure, not on ax.figure explicitly)
    plt.gcf().subplots_adjust(bottom=0.15)

    # Add date labels directly to the main x-axis with more padding
    # Place them at the beginning, middle and end for better orientation
    # Convert datetime objects to matplotlib date numbers
    date_nums = mdates.date2num(timestamps)
    if len(timestamps) >= 3:
        middle_idx = len(timestamps) // 2
        ax.set_xticks([date_nums[0], date_nums[middle_idx], date_nums[-1]])
        ax.set_xticklabels(
            [t.strftime("%Y-%m-%d %H:%M") for t in [timestamps[0], timestamps[middle_idx], timestamps[-1]]], fontsize=8
        )
    else:
        # One or two timestamps: only first and last are labelled; with a single
        # timestamp both positions are the same value
        ax.set_xticks([date_nums[0], date_nums[-1]])
        ax.set_xticklabels([t.strftime("%Y-%m-%d %H:%M") for t in [timestamps[0], timestamps[-1]]], fontsize=8)

    # Remove all spines from the axis
    for spine in ax.spines.values():
        spine.set_visible(False)

    # Add a bit more padding at the bottom
    ax.tick_params(axis="x", which="major", pad=8)
109 | 
110 | 
def _build_empty_chart() -> "plt.Figure":
    """Create a placeholder figure stating that no data is available."""
    fig, ax = plt.subplots(figsize=(11, 5))
    ax.text(0.5, 0.5, "No data available for the selected time period", ha="center", va="center", fontsize=12)
    ax.set_axis_off()
    return fig


def _build_bar_chart(timestamps: List[datetime], counts: List[int]) -> "plt.Figure":
    """Create the bar chart figure with one labelled bar per timestamp."""
    # The seaborn style has to be set before the figure is created so that the
    # new axes pick it up
    sns.set_style("whitegrid")
    sns.set_context("notebook", font_scale=1.1)

    # Slightly larger size for more padding, white background for figure and axes
    fig, ax = plt.subplots(figsize=(11, 5))
    try:
        fig.patch.set_facecolor("white")
        ax.patch.set_facecolor("white")

        # Add padding around the plot area
        fig.subplots_adjust(left=0.1, right=0.95, top=0.9, bottom=0.15)

        # Bars are positioned on matplotlib date numbers (unit: days)
        date_nums = mdates.date2num(timestamps)
        bars = ax.bar(date_nums, counts, width=0.02, color=sns.color_palette("deep")[0], alpha=0.7)

        ax.set_title(" ", pad=20, fontsize=12, fontweight="bold")
        ax.set_ylabel("Number of Problems", fontsize=10, labelpad=10)

        # Anchor the y-axis at 0 and add 15% headroom for the value labels
        _, y_max = ax.get_ylim()
        ax.set_ylim(0, y_max * 1.15)

        # Only show horizontal grid lines
        ax.grid(True, axis="y", linestyle="--", alpha=0.7)
        ax.grid(False, axis="x")

        _format_x_axis_labels(ax, timestamps)

        # Value labels sit slightly above each bar; the offset is relative to the
        # y-limit measured before the headroom was added
        for bar in bars:
            height = bar.get_height()
            ax.text(
                bar.get_x() + bar.get_width() / 2.0,
                height + (y_max * 0.02),
                f"{int(height)}",
                ha="center",
                va="bottom",
                fontsize=9,
            )
    except Exception:
        # Do not leave a half-built figure registered with pyplot
        plt.close(fig)
        raise
    return fig


def generate_trend_chart(hourly_data_file: str, output_image_file: str) -> None:
    """
    Generate trend bar chart for problem logstash documents per hour.

    When the hourly data contains no usable data points, a placeholder image
    with a "no data" message is written instead of failing. The parent
    directory of the output file is created if necessary, and the figure is
    always closed, whether or not saving succeeds.

    Args:
        hourly_data_file: Path to the hourly data JSON file
        output_image_file: Path to store the generated chart image

    Raises:
        ValueError: If the data cannot be parsed correctly
        OSError: If the output directory cannot be created or the image cannot be written

    Exceptions raised while loading the hourly data file propagate unchanged.
    """
    logger.info("Generating trend chart")
    logger.info(f"Hourly data file: {hourly_data_file}")
    logger.info(f"Output image file: {output_image_file}")

    timestamps, counts = _parse_hourly_data(hourly_data_file)

    if timestamps:
        logger.info(f"Parsed {len(timestamps)} data points")
        fig = _build_bar_chart(timestamps, counts)
    else:
        logger.warning("No data points found in hourly data file")
        fig = _build_empty_chart()

    try:
        # Ensure output directory exists
        Path(output_image_file).parent.mkdir(parents=True, exist_ok=True)

        # Use tight_layout with padding parameter for more breathing space
        fig.tight_layout(pad=1.5)

        # Save with a white background and extra padding around the entire figure
        fig.savefig(output_image_file, dpi=300, bbox_inches="tight", facecolor="white", pad_inches=0.25)
        logger.info(f"Chart saved to {output_image_file}")
    except OSError as e:
        logger.error(f"Failed to save chart to {output_image_file}: {e}")
        raise
    finally:
        # Close this specific figure to free memory, even when mkdir/layout/save failed
        plt.close(fig)
204 | 
205 | 
def main() -> None:
    """Parse the command line and generate the chart, exiting 0 on success and 1 on any error."""
    cli = argparse.ArgumentParser(description="Generate trend bar chart for problem logstash documents per hour")
    required_options = (
        ("--hourly-data-file", "Path to the hourly data JSON file"),
        ("--output-file", "Path to store the generated chart image"),
    )
    for flag, help_text in required_options:
        cli.add_argument(flag, required=True, help=help_text)

    options = cli.parse_args()

    try:
        generate_trend_chart(options.hourly_data_file, options.output_file)
    except Exception as e:
        logger.error(f"Error generating trend chart: {str(e)}")
        sys.exit(1)
    else:
        sys.exit(0)


if __name__ == "__main__":
    main()
224 | 


--------------------------------------------------------------------------------
/assets/sample-trend-and-report-input-data/comparison_results.json:
--------------------------------------------------------------------------------
  1 | {
  2 |     "new_patterns": [
  3 |         {
  4 |             "cluster_id": "6",
  5 |             "count": 32,
  6 |             "pattern": "SSL certificate for <*> is expiring in <*> days",
  7 |             "first_seen": "2025-03-07T04:15:22Z",
  8 |             "last_seen": "2025-03-07T23:15:22Z",
  9 |             "sample_log_lines": [
 10 |                 "SSL certificate for api.example.com is expiring in 7 days",
 11 |                 "SSL certificate for dashboard.example.org is expiring in 5 days",
 12 |                 "SSL certificate for auth.example.net is expiring in 3 days"
 13 |             ],
 14 |             "sample_doc_references": [
 15 |                 "logs-2025.03.07/doc38",
 16 |                 "logs-2025.03.07/doc123",
 17 |                 "logs-2025.03.07/doc208"
 18 |             ]
 19 |         },
 20 |         {
 21 |             "cluster_id": "7",
 22 |             "count": 21,
 23 |             "pattern": "Disk usage warning: <*> is at <*>% capacity",
 24 |             "first_seen": "2025-03-07T05:32:16Z",
 25 |             "last_seen": "2025-03-07T22:47:53Z",
 26 |             "sample_log_lines": [
 27 |                 "Disk usage warning: /var/log is at 85% capacity",
 28 |                 "Disk usage warning: /home is at 92% capacity",
 29 |                 "Disk usage warning: /tmp is at 88% capacity"
 30 |             ],
 31 |             "sample_doc_references": [
 32 |                 "logs-2025.03.07/doc56",
 33 |                 "logs-2025.03.07/doc127",
 34 |                 "logs-2025.03.07/doc198"
 35 |             ]
 36 |         },
 37 |         {
 38 |             "cluster_id": "8",
 39 |             "count": 18,
 40 |             "pattern": "Connection reset by peer while sending request to <*>",
 41 |             "first_seen": "2025-03-07T01:42:19Z",
 42 |             "last_seen": "2025-03-07T22:38:11Z",
 43 |             "sample_log_lines": [
 44 |                 "Connection reset by peer while sending request to https://api.payment-provider.com/v2/transactions",
 45 |                 "Connection reset by peer while sending request to https://auth.partner-service.org/oauth/token",
 46 |                 "Connection reset by peer while sending request to https://cdn.assets.com/resource"
 47 |             ],
 48 |             "sample_doc_references": [
 49 |                 "logs-2025.03.07/doc43",
 50 |                 "logs-2025.03.07/doc137",
 51 |                 "logs-2025.03.07/doc219"
 52 |             ]
 53 |         }
 54 |     ],
 55 |     "disappeared_patterns": [
 56 |         {
 57 |             "cluster_id": "1",
 58 |             "count": 245,
 59 |             "pattern": "Error connecting to database at <*>: Connection timed out",
 60 |             "first_seen": "2025-03-06T00:15:32Z",
 61 |             "last_seen": "2025-03-06T23:45:17Z",
 62 |             "sample_log_lines": [
 63 |                 "Error connecting to database at 10.0.1.42:3306: Connection timed out",
 64 |                 "Error connecting to database at db.example.com:3306: Connection timed out",
 65 |                 "Error connecting to database at 192.168.1.100:3306: Connection timed out"
 66 |             ],
 67 |             "sample_doc_references": [
 68 |                 "logs-2025.03.06/doc1",
 69 |                 "logs-2025.03.06/doc145",
 70 |                 "logs-2025.03.06/doc231"
 71 |             ]
 72 |         },
 73 |         {
 74 |             "cluster_id": "4",
 75 |             "count": 124,
 76 |             "pattern": "Exception in thread \"main\" java.lang.OutOfMemoryError: <*>",
 77 |             "first_seen": "2025-03-06T03:25:48Z",
 78 |             "last_seen": "2025-03-06T22:17:03Z",
 79 |             "sample_log_lines": [
 80 |                 "Exception in thread \"main\" java.lang.OutOfMemoryError: Java heap space",
 81 |                 "Exception in thread \"main\" java.lang.OutOfMemoryError: GC overhead limit exceeded",
 82 |                 "Exception in thread \"main\" java.lang.OutOfMemoryError: unable to create new native thread"
 83 |             ],
 84 |             "sample_doc_references": [
 85 |                 "logs-2025.03.06/doc42",
 86 |                 "logs-2025.03.06/doc87",
 87 |                 "logs-2025.03.06/doc109"
 88 |             ]
 89 |         },
 90 |         {
 91 |             "cluster_id": "5",
 92 |             "count": 89,
 93 |             "pattern": "Kubernetes pod <*> in namespace <*> failed health check",
 94 |             "first_seen": "2025-03-06T00:32:11Z",
 95 |             "last_seen": "2025-03-06T23:47:29Z",
 96 |             "sample_log_lines": [
 97 |                 "Kubernetes pod web-server-5d4d7 in namespace production failed health check",
 98 |                 "Kubernetes pod db-backup-3f2a1 in namespace data-services failed health check",
 99 |                 "Kubernetes pod cache-9b3c8 in namespace frontend failed health check"
100 |             ],
101 |             "sample_doc_references": [
102 |                 "logs-2025.03.06/doc53",
103 |                 "logs-2025.03.06/doc167",
104 |                 "logs-2025.03.06/doc214"
105 |             ]
106 |         },
107 |         {
108 |             "cluster_id": "11",
109 |             "count": 54,
110 |             "pattern": "Failed to process message from queue <*>: <*>",
111 |             "first_seen": "2025-03-06T02:17:42Z",
112 |             "last_seen": "2025-03-06T22:35:09Z",
113 |             "sample_log_lines": [
114 |                 "Failed to process message from queue orders: JSON parse error at line 1 column 24",
115 |                 "Failed to process message from queue notifications: Message expired",
116 |                 "Failed to process message from queue user-events: Unknown message format"
117 |             ],
118 |             "sample_doc_references": [
119 |                 "logs-2025.03.06/doc67",
120 |                 "logs-2025.03.06/doc132",
121 |                 "logs-2025.03.06/doc223"
122 |             ]
123 |         },
124 |         {
125 |             "cluster_id": "12",
126 |             "count": 42,
127 |             "pattern": "Cache invalidation failed for key <*>",
128 |             "first_seen": "2025-03-06T04:28:16Z",
129 |             "last_seen": "2025-03-06T21:51:48Z",
130 |             "sample_log_lines": [
131 |                 "Cache invalidation failed for key user:profile:12345",
132 |                 "Cache invalidation failed for key product:catalog:recent",
133 |                 "Cache invalidation failed for key system:config:endpoints"
134 |             ],
135 |             "sample_doc_references": [
136 |                 "logs-2025.03.06/doc84",
137 |                 "logs-2025.03.06/doc156",
138 |                 "logs-2025.03.06/doc238"
139 |             ]
140 |         }
141 |     ],
142 |     "increased_patterns": [
143 |         {
144 |             "cluster_id": "9",
145 |             "current_count": 14,
146 |             "previous_count": 5,
147 |             "difference": 9,
148 |             "percentage_change": 180.0,
149 |             "pattern": "Failed to process job <*> - timeout after <*> seconds",
150 |             "first_seen": "2025-03-07T03:17:09Z",
151 |             "last_seen": "2025-03-07T21:05:33Z",
152 |             "sample_log_lines": [
153 |                 "Failed to process job export-user-data-5782 - timeout after 60 seconds",
154 |                 "Failed to process job generate-report-4213 - timeout after 120 seconds",
155 |                 "Failed to process job sync-inventory-8974 - timeout after 180 seconds"
156 |             ],
157 |             "sample_doc_references": [
158 |                 "logs-2025.03.07/doc72",
159 |                 "logs-2025.03.07/doc158",
160 |                 "logs-2025.03.07/doc241"
161 |             ]
162 |         }
163 |     ],
164 |     "decreased_patterns": [
165 |         {
166 |             "cluster_id": "2",
167 |             "current_count": 72,
168 |             "previous_count": 187,
169 |             "difference": -115,
170 |             "percentage_change": -61.5,
171 |             "pattern": "Failed to authenticate user <*> - invalid credentials",
172 |             "first_seen": "2025-03-07T00:03:12Z",
173 |             "last_seen": "2025-03-07T23:45:27Z",
174 |             "sample_log_lines": [
175 |                 "Failed to authenticate user customer@example.org - invalid credentials",
176 |                 "Failed to authenticate user guest-user - invalid credentials",
177 |                 "Failed to authenticate user api-client-456 - invalid credentials"
178 |             ],
179 |             "sample_doc_references": [
180 |                 "logs-2025.03.07/doc19",
181 |                 "logs-2025.03.07/doc82",
182 |                 "logs-2025.03.07/doc143"
183 |             ]
184 |         },
185 |         {
186 |             "cluster_id": "3",
187 |             "current_count": 58,
188 |             "previous_count": 163,
189 |             "difference": -105,
190 |             "percentage_change": -64.4,
191 |             "pattern": "API rate limit exceeded for user ID <*>",
192 |             "first_seen": "2025-03-07T00:12:43Z",
193 |             "last_seen": "2025-03-07T23:51:16Z",
194 |             "sample_log_lines": [
195 |                 "API rate limit exceeded for user ID 6142",
196 |                 "API rate limit exceeded for user ID 9037",
197 |                 "API rate limit exceeded for user ID 2384"
198 |             ],
199 |             "sample_doc_references": [
200 |                 "logs-2025.03.07/doc27",
201 |                 "logs-2025.03.07/doc94",
202 |                 "logs-2025.03.07/doc185"
203 |             ]
204 |         },
205 |         {
206 |             "cluster_id": "13",
207 |             "current_count": 12,
208 |             "previous_count": 29,
209 |             "difference": -17,
210 |             "percentage_change": -58.6,
211 |             "pattern": "HTTP request failed: <*> <*> returned status code <*>",
212 |             "first_seen": "2025-03-07T08:43:18Z",
213 |             "last_seen": "2025-03-07T20:17:32Z",
214 |             "sample_log_lines": [
215 |                 "HTTP request failed: GET https://metrics.example.com/api/collect returned status code 502",
216 |                 "HTTP request failed: POST https://auth.thirdparty.org/authorize returned status code 429",
217 |                 "HTTP request failed: DELETE https://cloud-storage.example.net/objects/temp returned status code 403"
218 |             ],
219 |             "sample_doc_references": [
220 |                 "logs-2025.03.07/doc87",
221 |                 "logs-2025.03.07/doc164",
222 |                 "logs-2025.03.07/doc232"
223 |             ]
224 |         }
225 |     ]
226 | }
227 | 


--------------------------------------------------------------------------------
/bin/ppmc:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env bash
  2 | 
  3 | # Platform Problem Monitoring Core - Main execution script
  4 | 
  5 | set -e  # Exit immediately if a command exits with a non-zero status
  6 | 
  7 | # Check if configuration file is provided
  8 | if [ $# -ne 1 ]; then
  9 |     echo "Usage: $0 <path-to-config-file>"
 10 |     exit 1
 11 | fi
 12 | 
 13 | CONFIG_FILE="$1"
 14 | 
 15 | # Check if configuration file exists
 16 | if [ ! -f "$CONFIG_FILE" ]; then
 17 |     echo "Error: Configuration file not found: $CONFIG_FILE"
 18 |     exit 1
 19 | fi
 20 | 
 21 | # Source the configuration file
 22 | source "$CONFIG_FILE"
 23 | 
# Step 0: Prepare Python environment
#
# Locates this script on disk, picks a Python interpreter, and decides whether
# the package is already installed for that interpreter. If it is not, the
# script assumes it lives in <project>/bin, creates/activates <project>/venv
# and installs the project into it in editable mode.
# On exit from this section PYTHON_CMD holds the command used for all steps.
echo "Step 0: Preparing Python environment..."

# Resolve the actual script location, even when called through a symlink
SOURCE=${BASH_SOURCE[0]}
if [ -z "$SOURCE" ]; then
    echo "Failed to determine script source" >&2
    exit 1
fi

# Follow the chain of symlinks until SOURCE names a non-link path
while [ -L "$SOURCE" ]; do
    DIR=$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd )
    # NOTE(review): with `set -e` active, a failing command substitution in the
    # assignment above presumably aborts the script before this check - confirm
    if [ -z "$DIR" ]; then
        echo "Failed to resolve symlink directory" >&2
        exit 1
    fi
    SOURCE=$(readlink "$SOURCE")
    # A relative link target is relative to the directory containing the link
    [[ $SOURCE != /* ]] && SOURCE=$DIR/$SOURCE
done

# Get the script directory
SCRIPT_DIR=$( cd -P "$( dirname "$SOURCE" )" >/dev/null 2>&1 && pwd )
echo "Script directory: $SCRIPT_DIR"

# Try to determine which Python executable to use
PYTHON_CMD="python3"

# Check if Python 3 is available
if ! command -v python3 &>/dev/null; then
    # Fall back to just 'python' if python3 is not available
    if command -v python &>/dev/null; then
        PYTHON_CMD="python"
    else
        echo "Error: Neither python3 nor python found in PATH"
        exit 1
    fi
fi

echo "Using Python: $($PYTHON_CMD --version)"

# More robust check for installed package
# First try to import the package; only if that fails, look for it in `pip list`
PACKAGE_INSTALLED=false
if $PYTHON_CMD -c "import platform_problem_monitoring_core" &>/dev/null; then
    echo "Package found in primary Python installation"
    PACKAGE_INSTALLED=true
# Try with pip list as a fallback check
elif $PYTHON_CMD -m pip list | grep -i "platform-problem-monitoring-core" &>/dev/null; then
    echo "Package found in pip list"
    PACKAGE_INSTALLED=true
fi

# Determine if we're running from source or an installed package
if [ "$PACKAGE_INSTALLED" = true ]; then
    echo "Running from installed package"
    # Self-assignment: leaves PYTHON_CMD unchanged (the interpreter detected above is kept)
    PYTHON_CMD=$PYTHON_CMD
else
    # If running from source, detect package root and create/use virtual environment
    # When running from source, the script should be in bin/ which is at the root of the project
    PACKAGE_ROOT=$( cd -P "$SCRIPT_DIR/.." >/dev/null 2>&1 && pwd )

    # Verify this looks like our project directory
    if [ -f "$PACKAGE_ROOT/pyproject.toml" ]; then
        echo "Running from source, package root directory: $PACKAGE_ROOT"
        echo "Setting up virtual environment"
        # NOTE(review): this changes the working directory for the rest of the
        # script, so relative paths coming from the config file are presumably
        # resolved against the package root from here on - confirm this is intended
        cd "$PACKAGE_ROOT"

        if [ ! -d "venv" ]; then
            echo "Creating virtual environment"
            $PYTHON_CMD -m venv venv
        fi

        # Activate the venv, then upgrade pip and install the project in editable mode on every run
        source venv/bin/activate
        pip install --upgrade pip
        pip install -e . -q
    else
        echo "Error: Cannot determine package root directory"
        echo "Neither running from installed package nor from recognizable source directory"
        exit 1
    fi
    # From here on use plain "python" (presumably the activated venv's interpreter)
    PYTHON_CMD="python"
fi
105 | 
# Define file paths for intermediate results
# (initialised empty here; the real values are assigned once the work directory is known)
WORK_DIR=""
START_DATE_TIME_FILE=""
CURRENT_DATE_TIME_FILE=""
LOGSTASH_DOCUMENTS_FILE=""
EXTRACTED_FIELDS_FILE=""
NORM_RESULTS_PREV_FILE=""
NORM_RESULTS_FILE=""
COMPARISON_RESULTS_FILE=""
HTML_EMAIL_BODY_FILE=""
TEXT_EMAIL_BODY_FILE=""
HOURLY_DATA_FILE=""
TREND_CHART_FILE=""

echo "Starting Platform Problem Monitoring process..."

# Step 1: Prepare application environment
echo "Step 1: Preparing application environment..."
# Capture all output but only use the last line as the work directory.
# `set -e` is active, so the exit status has to be tested on the command itself:
# a separate `$?` check on the following line would never be reached on failure.
if ! PREPARE_OUTPUT=$($PYTHON_CMD -m platform_problem_monitoring_core.step1_prepare); then
    # Show whatever the step printed before it failed
    echo "$PREPARE_OUTPUT"
    echo "Error: Failed to prepare environment"
    exit 1
fi
# Display the output for logging purposes
echo "$PREPARE_OUTPUT"
# Extract only the last line as the work directory
WORK_DIR=$(echo "$PREPARE_OUTPUT" | tail -n 1)
# Without a work directory every path below would point at the filesystem root
if [ -z "$WORK_DIR" ]; then
    echo "Error: Failed to prepare environment"
    exit 1
fi
echo "Work directory created: $WORK_DIR"

# Define paths for intermediate files
START_DATE_TIME_FILE="$WORK_DIR/start_date_time.txt"
CURRENT_DATE_TIME_FILE="$WORK_DIR/current_date_time.txt"
LOGSTASH_DOCUMENTS_FILE="$WORK_DIR/logstash_documents.json"
EXTRACTED_FIELDS_FILE="$WORK_DIR/extracted_fields.jsonl"
NORM_RESULTS_PREV_FILE="$WORK_DIR/norm_results_prev.json"
NORM_RESULTS_FILE="$WORK_DIR/norm_results.json"
COMPARISON_RESULTS_FILE="$WORK_DIR/comparison_results.json"
HTML_EMAIL_BODY_FILE="$WORK_DIR/email_body.html"
TEXT_EMAIL_BODY_FILE="$WORK_DIR/email_body.txt"
HOURLY_DATA_FILE="$WORK_DIR/hourly_problem_numbers.json"
TREND_CHART_FILE="$WORK_DIR/trend_chart.png"
148 | 
# Steps 2-11 each run one Python module of the pipeline.
# The script runs under `set -e`, so each module invocation is used directly as
# the condition of an `if !`: a separate `if [ $? -ne 0 ]` on the next line would
# never be reached, because the shell exits on the failing command first and the
# step-specific error message would be lost.

# Step 2: Download previous state
echo "Step 2: Downloading previous state..."
if ! $PYTHON_CMD -m platform_problem_monitoring_core.step2_download_previous_state \
    --s3-bucket "$REMOTE_STATE_S3_BUCKET_NAME" \
    --s3-folder "$REMOTE_STATE_S3_FOLDER_NAME" \
    --date-time-file "$START_DATE_TIME_FILE" \
    --norm-results-file "$NORM_RESULTS_PREV_FILE"; then
    echo "Error: Failed to download previous state"
    exit 1
fi
echo "Previous state downloaded successfully"

# Step 3: Retrieve hourly problem numbers
echo "Step 3: Retrieving hourly problem numbers..."
if ! $PYTHON_CMD -m platform_problem_monitoring_core.step3_retrieve_hourly_problem_numbers \
    --elasticsearch-url "$ELASTICSEARCH_SERVER_BASE_URL" \
    --query-file "$ELASTICSEARCH_LUCENE_QUERY_FILE_PATH" \
    --hours-back "${TREND_HOURS_BACK:-24}" \
    --output-file "$HOURLY_DATA_FILE"; then
    echo "Error: Failed to retrieve hourly problem numbers"
    exit 1
fi
echo "Hourly problem numbers retrieved successfully"

# Step 4: Generate trend chart
echo "Step 4: Generating trend chart..."
if ! $PYTHON_CMD -m platform_problem_monitoring_core.step4_generate_trend_chart \
    --hourly-data-file "$HOURLY_DATA_FILE" \
    --output-file "$TREND_CHART_FILE"; then
    echo "Error: Failed to generate trend chart"
    exit 1
fi
echo "Trend chart generated successfully"

# Step 5: Download logstash documents
echo "Step 5: Downloading logstash documents..."
if ! $PYTHON_CMD -m platform_problem_monitoring_core.step5_download_logstash_documents \
    --elasticsearch-url "$ELASTICSEARCH_SERVER_BASE_URL" \
    --query-file "$ELASTICSEARCH_LUCENE_QUERY_FILE_PATH" \
    --start-date-time-file "$START_DATE_TIME_FILE" \
    --output-file "$LOGSTASH_DOCUMENTS_FILE" \
    --current-date-time-file "$CURRENT_DATE_TIME_FILE"; then
    echo "Error: Failed to download logstash documents"
    exit 1
fi
echo "Logstash documents downloaded successfully"

# Step 6: Extract fields from logstash documents
echo "Step 6: Extracting fields from logstash documents..."
if ! $PYTHON_CMD -m platform_problem_monitoring_core.step6_extract_fields \
    --logstash-file "$LOGSTASH_DOCUMENTS_FILE" \
    --output-file "$EXTRACTED_FIELDS_FILE"; then
    echo "Error: Failed to extract fields"
    exit 1
fi
echo "Fields extracted successfully"

# Step 7: Normalize messages
echo "Step 7: Normalizing messages..."
if ! $PYTHON_CMD -m platform_problem_monitoring_core.step7_normalize_messages \
    --fields-file "$EXTRACTED_FIELDS_FILE" \
    --output-file "$NORM_RESULTS_FILE"; then
    echo "Error: Failed to normalize messages"
    exit 1
fi
echo "Messages normalized successfully"

# Step 8: Compare normalizations
echo "Step 8: Comparing normalization results..."
if ! $PYTHON_CMD -m platform_problem_monitoring_core.step8_compare_normalizations \
    --current-file "$NORM_RESULTS_FILE" \
    --previous-file "$NORM_RESULTS_PREV_FILE" \
    --output-file "$COMPARISON_RESULTS_FILE"; then
    echo "Error: Failed to compare normalization results"
    exit 1
fi
echo "Normalization results compared successfully"

# Step 9: Generate email bodies
# The ${VAR:+--flag "$VAR"} expansions add an option only when VAR is set and non-empty
echo "Step 9: Generating email bodies..."
if ! $PYTHON_CMD -m platform_problem_monitoring_core.step9_generate_email_bodies \
    --comparison-file "$COMPARISON_RESULTS_FILE" \
    --norm-results-file "$NORM_RESULTS_FILE" \
    --html-output "$HTML_EMAIL_BODY_FILE" \
    --text-output "$TEXT_EMAIL_BODY_FILE" \
    --trend-chart-file "$TREND_CHART_FILE" \
    --hours-back "${TREND_HOURS_BACK:-24}" \
    ${KIBANA_DISCOVER_BASE_URL:+--kibana-url "$KIBANA_DISCOVER_BASE_URL"} \
    ${KIBANA_DOCUMENT_DEEPLINK_URL_STRUCTURE:+--kibana-deeplink-structure "$KIBANA_DOCUMENT_DEEPLINK_URL_STRUCTURE"} \
    ${ELASTICSEARCH_LUCENE_QUERY_FILE_PATH:+--elasticsearch-query-file "$ELASTICSEARCH_LUCENE_QUERY_FILE_PATH"} \
    ${START_DATE_TIME_FILE:+--start-date-time-file "$START_DATE_TIME_FILE"}; then
    echo "Error: Failed to generate email bodies"
    exit 1
fi
echo "Email bodies generated successfully"

# Step 10: Send email report
# NOTE(security): the SMTP password is handed over as a command-line argument and
# is therefore presumably visible in the process list while this step runs -
# consider passing it via environment variable or file if step 10 supports that
echo "Step 10: Sending email report..."
EMAIL_SUBJECT="Platform Problem Monitoring Report $(date +"%Y-%m-%d")"
if ! $PYTHON_CMD -m platform_problem_monitoring_core.step10_send_email_report \
    --html-file "$HTML_EMAIL_BODY_FILE" \
    --text-file "$TEXT_EMAIL_BODY_FILE" \
    --subject "$EMAIL_SUBJECT" \
    --smtp-host "$SMTP_SERVER_HOSTNAME" \
    --smtp-port "$SMTP_SERVER_PORT" \
    --smtp-user "$SMTP_SERVER_USERNAME" \
    --smtp-pass "$SMTP_SERVER_PASSWORD" \
    --sender "$SMTP_SENDER_ADDRESS" \
    --receiver "$SMTP_RECEIVER_ADDRESS"; then
    echo "Error: Failed to send email report"
    exit 1
fi
echo "Email report sent successfully"

# Step 11: Store new state
echo "Step 11: Storing new state..."
if ! $PYTHON_CMD -m platform_problem_monitoring_core.step11_store_new_state \
    --s3-bucket "$REMOTE_STATE_S3_BUCKET_NAME" \
    --s3-folder "$REMOTE_STATE_S3_FOLDER_NAME" \
    --date-time-file "$CURRENT_DATE_TIME_FILE" \
    --norm-results-file "$NORM_RESULTS_FILE"; then
    echo "Error: Failed to store new state"
    exit 1
fi
echo "New state stored successfully"
284 | 
# Step 12 (cleanup) is intentionally not run for now: keeping the files in the
# work folder helps with debugging

# Final summary: one line per item, pointing at the artefacts of this run
printf '%s\n' \
    "Steps 1-11 completed successfully" \
    "Work directory: $WORK_DIR" \
    "Downloaded documents: $LOGSTASH_DOCUMENTS_FILE" \
    "Extracted fields: $EXTRACTED_FIELDS_FILE" \
    "Normalization results: $NORM_RESULTS_FILE" \
    "Comparison results: $COMPARISON_RESULTS_FILE" \
    "Email bodies: $HTML_EMAIL_BODY_FILE, $TEXT_EMAIL_BODY_FILE" \
    "Email report sent to: $SMTP_RECEIVER_ADDRESS"

exit 0
298 | 


--------------------------------------------------------------------------------
/src/platform_problem_monitoring_core/step8_compare_normalizations.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """Compare normalization results between current and previous runs."""
  3 | 
  4 | import argparse
  5 | import json
  6 | import sys
  7 | from typing import List, TypedDict
  8 | 
  9 | try:
 10 |     from typing import NotRequired
 11 | except ImportError:
 12 |     from typing_extensions import NotRequired
 13 | 
 14 | from platform_problem_monitoring_core.utils import load_json, logger, save_json
 15 | 
 16 | 
class PatternDict(TypedDict):
    """Type for pattern dictionaries.

    Describes one normalized log-message pattern as it appears in the
    comparison output. The first seven keys are required; the keys marked
    ``NotRequired`` may be absent.
    """

    # Required keys
    cluster_id: str  # Identifier of the cluster this pattern belongs to
    count: int  # Number of occurrences of the pattern
    pattern: str  # Normalized message text
    first_seen: str  # Timestamp string of the first occurrence
    last_seen: str  # Timestamp string of the last occurrence
    sample_log_lines: List[str]  # Example log lines matching the pattern
    sample_doc_references: List[str]  # References to example documents
    # Fields required by step9_generate_email_bodies
    # NOTE(review): the sample comparison_results.json under assets/ uses the keys
    # "difference" and "percentage_change" for the last two values - confirm which
    # key names step9 actually expects
    current_count: NotRequired[int]  # Current count (same as count for clarity)
    previous_count: NotRequired[int]  # Count from previous run
    absolute_change: NotRequired[int]  # Absolute difference between counts
    percent_change: NotRequired[float]  # Percentage difference
 32 | 
 33 | 
 34 | # Define a function to get the count safely with a proper return type
 35 | def get_count(pattern: PatternDict) -> int:
 36 |     """Safely get the count from a pattern dictionary.
 37 | 
 38 |     Args:
 39 |         pattern: The pattern dictionary.
 40 | 
 41 |     Returns:
 42 |         The count as an integer (defaults to 0 if missing).
 43 |     """
 44 |     return pattern["count"]
 45 | 
 46 | 
 47 | def _find_new_patterns(current_dict: dict, previous_dict: dict) -> List[PatternDict]:
 48 |     """
 49 |     Find patterns that are in the current data but not in the previous data.
 50 | 
 51 |     Args:
 52 |         current_dict: Dictionary containing the current normalization results.
 53 |         previous_dict: Dictionary containing the previous normalization results.
 54 | 
 55 |     Returns:
 56 |         List of new patterns that weren't in the previous data.
 57 |     """
 58 |     new_patterns: List[PatternDict] = []
 59 |     current_patterns = current_dict.get("patterns", [])
 60 |     previous_patterns = {p["pattern"]: p for p in previous_dict.get("patterns", [])}
 61 | 
 62 |     # Find patterns that are in current but not in previous
 63 |     for pattern in current_patterns:
 64 |         if pattern["pattern"] not in previous_patterns:
 65 |             # Ensure all required fields are present
 66 |             new_pattern: PatternDict = {
 67 |                 "cluster_id": pattern["cluster_id"],
 68 |                 "count": pattern["count"],
 69 |                 "pattern": pattern["pattern"],
 70 |                 "first_seen": pattern.get("first_seen", ""),
 71 |                 "last_seen": pattern.get("last_seen", ""),
 72 |                 "sample_log_lines": pattern.get("sample_log_lines", []),
 73 |                 "sample_doc_references": pattern.get("sample_doc_references", []),
 74 |             }
 75 |             new_patterns.append(new_pattern)
 76 | 
 77 |     # Sort new patterns by count (descending)
 78 |     new_patterns.sort(key=get_count, reverse=True)
 79 |     return new_patterns
 80 | 
 81 | 
 82 | def _find_disappeared_patterns(current_dict: dict, previous_dict: dict) -> List[PatternDict]:
 83 |     """
 84 |     Find patterns that are in the previous data but not in the current data.
 85 | 
 86 |     Args:
 87 |         current_dict: Dictionary containing the current normalization results.
 88 |         previous_dict: Dictionary containing the previous normalization results.
 89 | 
 90 |     Returns:
 91 |         List of patterns that disappeared from previous to current data.
 92 |     """
 93 |     disappeared_patterns: List[PatternDict] = []
 94 |     current_patterns = {p["pattern"]: p for p in current_dict.get("patterns", [])}
 95 |     previous_patterns = previous_dict.get("patterns", [])
 96 | 
 97 |     # Extract the disappeared patterns
 98 |     for pattern in previous_patterns:
 99 |         if pattern["pattern"] not in current_patterns:
100 |             # Ensure all required fields are present
101 |             disappeared_pattern: PatternDict = {
102 |                 "cluster_id": pattern["cluster_id"],
103 |                 "count": pattern["count"],
104 |                 "pattern": pattern["pattern"],
105 |                 "first_seen": pattern.get("first_seen", ""),
106 |                 "last_seen": pattern.get("last_seen", ""),
107 |                 "sample_log_lines": pattern.get("sample_log_lines", []),
108 |                 "sample_doc_references": pattern.get("sample_doc_references", []),
109 |             }
110 |             disappeared_patterns.append(disappeared_pattern)
111 | 
112 |     # Sort disappeared patterns by count (descending)
113 |     disappeared_patterns.sort(key=get_count, reverse=True)
114 |     return disappeared_patterns
115 | 
116 | 
117 | def _find_increased_patterns(current_dict: dict, previous_dict: dict) -> List[PatternDict]:
118 |     """
119 |     Find patterns that have increased in count from the previous data to the current data.
120 | 
121 |     Args:
122 |         current_dict: Dictionary containing the current normalization results.
123 |         previous_dict: Dictionary containing the previous normalization results.
124 | 
125 |     Returns:
126 |         List of patterns with increased counts.
127 |     """
128 |     increased_patterns: List[PatternDict] = []
129 | 
130 |     # Build dictionaries for easier lookup
131 |     current_patterns = {p["pattern"]: p for p in current_dict.get("patterns", [])}
132 |     previous_patterns = {p["pattern"]: p for p in previous_dict.get("patterns", [])}
133 | 
134 |     # Find patterns with increased counts
135 |     for pattern_text, current_pattern in current_patterns.items():
136 |         if pattern_text in previous_patterns:
137 |             previous_count = previous_patterns[pattern_text]["count"]
138 |             current_count = current_pattern["count"]
139 | 
140 |             # Only include patterns with actual increases
141 |             if current_count > previous_count:
142 |                 # Calculate absolute and percentage change
143 |                 absolute_change = current_count - previous_count
144 |                 percent_change = round((absolute_change / previous_count) * 100, 1) if previous_count > 0 else 100
145 | 
146 |                 # Ensure all required fields are present
147 |                 increased_pattern: PatternDict = {
148 |                     "cluster_id": current_pattern["cluster_id"],
149 |                     "count": current_count,
150 |                     "current_count": current_count,
151 |                     "previous_count": previous_count,
152 |                     "absolute_change": absolute_change,
153 |                     "percent_change": percent_change,
154 |                     "pattern": pattern_text,
155 |                     "first_seen": current_pattern.get("first_seen", ""),
156 |                     "last_seen": current_pattern.get("last_seen", ""),
157 |                     "sample_log_lines": current_pattern.get("sample_log_lines", []),
158 |                     "sample_doc_references": current_pattern.get("sample_doc_references", []),
159 |                 }
160 |                 increased_patterns.append(increased_pattern)
161 | 
162 |     # Sort by percent change (descending), then by absolute change for ties, then by count
163 |     increased_patterns.sort(key=lambda p: (p["percent_change"], p["absolute_change"], p["count"]), reverse=True)
164 |     return increased_patterns
165 | 
166 | 
167 | def _find_decreased_patterns(current_dict: dict, previous_dict: dict) -> List[PatternDict]:
168 |     """
169 |     Find patterns that have decreased in count from the previous data to the current data.
170 | 
171 |     Args:
172 |         current_dict: Dictionary containing the current normalization results.
173 |         previous_dict: Dictionary containing the previous normalization results.
174 | 
175 |     Returns:
176 |         List of patterns with decreased counts.
177 |     """
178 |     decreased_patterns: List[PatternDict] = []
179 | 
180 |     # Build dictionaries for easier lookup
181 |     current_patterns = {p["pattern"]: p for p in current_dict.get("patterns", [])}
182 |     previous_patterns = {p["pattern"]: p for p in previous_dict.get("patterns", [])}
183 | 
184 |     # Find patterns with decreased counts
185 |     for pattern_text, current_pattern in current_patterns.items():
186 |         if pattern_text in previous_patterns:
187 |             previous_count = previous_patterns[pattern_text]["count"]
188 |             current_count = current_pattern["count"]
189 | 
190 |             # Only include patterns with actual decreases
191 |             if current_count < previous_count:
192 |                 # Calculate absolute and percentage change
193 |                 absolute_change = previous_count - current_count
194 |                 percent_change = round((absolute_change / previous_count) * 100, 1) if previous_count > 0 else 0
195 | 
196 |                 # Ensure all required fields are present
197 |                 decreased_pattern: PatternDict = {
198 |                     "cluster_id": current_pattern["cluster_id"],
199 |                     "count": current_count,
200 |                     "current_count": current_count,
201 |                     "previous_count": previous_count,
202 |                     "absolute_change": absolute_change,
203 |                     "percent_change": percent_change,
204 |                     "pattern": pattern_text,
205 |                     "first_seen": current_pattern.get("first_seen", ""),
206 |                     "last_seen": current_pattern.get("last_seen", ""),
207 |                     "sample_log_lines": current_pattern.get("sample_log_lines", []),
208 |                     "sample_doc_references": current_pattern.get("sample_doc_references", []),
209 |                 }
210 |                 decreased_patterns.append(decreased_pattern)
211 | 
212 |     # Sort by percent change (descending), then by absolute change for ties, then by count
213 |     decreased_patterns.sort(key=lambda p: (p["percent_change"], p["absolute_change"], p["count"]), reverse=True)
214 |     return decreased_patterns
215 | 
216 | 
def compare_normalizations(current_file: str, previous_file: str, output_file: str) -> None:
    """
    Diff two normalization result files and write the outcome as JSON.

    The output holds the number of patterns on each side plus four lists:
    new, disappeared, increased and decreased patterns.

    Args:
        current_file: Path to the current normalization results file
        previous_file: Path to the previous normalization results file
        output_file: Path to store the comparison results

    Raises:
        Exception: Any error is logged and then re-raised unchanged.
    """
    logger.info("Comparing normalization results")
    logger.info(f"Current file: {current_file}")
    logger.info(f"Previous file: {previous_file}")
    logger.info(f"Output file: {output_file}")

    try:
        # Read both result files before touching their contents.
        current_data = load_json(current_file)
        previous_data = load_json(previous_file)

        current_patterns = current_data.get("patterns", [])
        previous_patterns = previous_data.get("patterns", [])

        logger.info(f"Current patterns: {len(current_patterns)}")
        logger.info(f"Previous patterns: {len(previous_patterns)}")

        # The helpers expect dicts with a "patterns" key, so wrap the lists once.
        current_view = {"patterns": current_patterns}
        previous_view = {"patterns": previous_patterns}

        # In current but not in previous.
        new_patterns = _find_new_patterns(current_view, previous_view)
        logger.info(f"New patterns: {len(new_patterns)}")

        # In previous but not in current.
        disappeared_patterns = _find_disappeared_patterns(current_view, previous_view)
        logger.info(f"Disappeared patterns: {len(disappeared_patterns)}")

        # Present on both sides with a higher count now.
        increased_patterns = _find_increased_patterns(current_view, previous_view)
        logger.info(f"Increased patterns: {len(increased_patterns)}")

        # Present on both sides with a lower count now.
        decreased_patterns = _find_decreased_patterns(current_view, previous_view)
        logger.info(f"Decreased patterns: {len(decreased_patterns)}")

        save_json(
            {
                "current_patterns_count": len(current_patterns),
                "previous_patterns_count": len(previous_patterns),
                "new_patterns": new_patterns,
                "disappeared_patterns": disappeared_patterns,
                "increased_patterns": increased_patterns,
                "decreased_patterns": decreased_patterns,
            },
            output_file,
        )
        logger.info(f"Comparison results saved to {output_file}")

    except FileNotFoundError as exc:
        logger.error(f"File not found: {exc}")
        raise
    except json.JSONDecodeError as exc:
        logger.error(f"Invalid JSON: {exc}")
        raise
    except Exception as exc:
        logger.error(f"Error comparing normalization results: {exc}")
        raise
284 | 
285 | 
def main() -> None:
    """Command line entry point: read the three file paths and run the comparison.

    Exits with status 1 if the comparison raises.
    """
    cli = argparse.ArgumentParser(description="Compare normalization results")
    # All three options are mandatory.
    for flag, help_text in (
        ("--current-file", "Path to the current normalization results file"),
        ("--previous-file", "Path to the previous normalization results file"),
        ("--output-file", "Path to store the comparison results"),
    ):
        cli.add_argument(flag, required=True, help=help_text)

    options = cli.parse_args()

    try:
        compare_normalizations(options.current_file, options.previous_file, options.output_file)
    except Exception as exc:
        logger.error(f"Error comparing normalization results: {exc}")
        sys.exit(1)
300 | 
301 | 
302 | if __name__ == "__main__":
303 |     main()
304 | 


--------------------------------------------------------------------------------
/src/platform_problem_monitoring_core/step5_download_logstash_documents.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """Download logstash documents from Elasticsearch."""
  3 | 
  4 | import argparse
  5 | import datetime
  6 | import json
  7 | import sys
  8 | import time
  9 | from datetime import timezone
 10 | from pathlib import Path
 11 | from typing import Any, Dict, List
 12 | 
 13 | import requests
 14 | from requests.exceptions import ConnectionError as RequestsConnectionError
 15 | from requests.exceptions import HTTPError, Timeout
 16 | 
 17 | from platform_problem_monitoring_core.utils import load_json, logger, save_json
 18 | 
 19 | 
 20 | def _add_time_range_to_query(
 21 |     query_data: Dict[str, Any], start_date_time: str, current_date_time: str
 22 | ) -> Dict[str, Any]:
 23 |     """
 24 |     Add time range filter to the Elasticsearch query.
 25 | 
 26 |     Args:
 27 |         query_data: Original query data
 28 |         start_date_time: Start date and time in ISO format
 29 |         current_date_time: Current date and time in ISO format
 30 | 
 31 |     Returns:
 32 |         Updated query data with time range filter
 33 |     """
 34 |     # Create a deep copy of the query to avoid modifying the original
 35 |     query_copy: Dict[str, Any] = json.loads(json.dumps(query_data))
 36 | 
 37 |     if "query" in query_copy:
 38 |         if "bool" in query_copy["query"]:
 39 |             if "filter" not in query_copy["query"]["bool"]:
 40 |                 query_copy["query"]["bool"]["filter"] = []
 41 | 
 42 |             # Add time range filter
 43 |             query_copy["query"]["bool"]["filter"].append(
 44 |                 {"range": {"@timestamp": {"gte": start_date_time, "lte": current_date_time}}}
 45 |             )
 46 |         else:
 47 |             # If there's no bool query, create one
 48 |             original_query = query_copy["query"]
 49 |             query_copy["query"] = {
 50 |                 "bool": {
 51 |                     "must": [original_query],
 52 |                     "filter": [{"range": {"@timestamp": {"gte": start_date_time, "lte": current_date_time}}}],
 53 |                 }
 54 |             }
 55 |     else:
 56 |         # If there's no query at all, create a simple one
 57 |         query_copy["query"] = {
 58 |             "bool": {"filter": [{"range": {"@timestamp": {"gte": start_date_time, "lte": current_date_time}}}]}
 59 |         }
 60 | 
 61 |     return query_copy
 62 | 
 63 | 
 64 | def _get_start_date_time(start_date_time_file: str) -> str:
 65 |     """
 66 |     Read start date and time from file or use default.
 67 | 
 68 |     Args:
 69 |         start_date_time_file: Path to the file containing the start date and time
 70 | 
 71 |     Returns:
 72 |         Start date and time in ISO format
 73 |     """
 74 |     try:
 75 |         with Path(start_date_time_file).open("r") as f:
 76 |             start_date_time = f.read().strip()
 77 |             logger.info(f"Start date and time: {start_date_time}")
 78 |             return start_date_time
 79 |     except FileNotFoundError:
 80 |         logger.warning(f"Start date and time file not found: {start_date_time_file}")
 81 |         # Default to 24 hours ago if file not found
 82 |         # Using timezone-aware approach to address deprecation warning
 83 |         start_date_time = (datetime.datetime.now(timezone.utc) - datetime.timedelta(days=1)).isoformat()
 84 |         logger.info(f"Using default start date and time: {start_date_time}")
 85 |         return start_date_time
 86 | 
 87 | 
 88 | def _save_current_date_time(current_date_time_file: str, current_date_time: str) -> None:
 89 |     """
 90 |     Save current date and time to file for the next run.
 91 | 
 92 |     Args:
 93 |         current_date_time_file: Path to store the current date and time
 94 |         current_date_time: Current date and time in ISO format
 95 | 
 96 |     Raises:
 97 |         OSError: If unable to write to the file
 98 |     """
 99 |     try:
100 |         with Path(current_date_time_file).open("w") as f:
101 |             f.write(current_date_time)
102 |     except (OSError, IOError) as e:
103 |         logger.error(f"Failed to save current date and time to {current_date_time_file}: {e}")
104 |         raise
105 | 
106 | 
def _verify_elasticsearch_connection(elasticsearch_url: str, max_retries: int = 3, timeout: int = 30) -> None:
    """
    Verify Elasticsearch server is available.

    Issues a GET against the server URL and logs the reported version. On a
    connection error, HTTP error status or timeout the request is retried with
    exponential backoff (1s, 2s, 4s, ...).

    Args:
        elasticsearch_url: Elasticsearch server URL
        max_retries: Maximum number of connection attempts
        timeout: Connection timeout in seconds

    Raises:
        requests.exceptions.ConnectionError: If every attempt failed. The last
            underlying error (connection error, HTTP error or timeout) is
            attached as ``__cause__``.

    Note:
        Only the three error types above are retried. Anything else, such as a
        failure while decoding the response body, propagates immediately.
    """
    last_error = None

    for attempt in range(1, max_retries + 1):
        try:
            # Do a simple request first to verify the server is available
            response = requests.get(elasticsearch_url, timeout=timeout)
            response.raise_for_status()  # Raise exception for 4XX/5XX responses

            server_info = response.json()
            es_version = server_info.get("version", {}).get("number", "unknown")
            logger.info(f"Connected to Elasticsearch version: {es_version}")
            return

        except (RequestsConnectionError, HTTPError, Timeout) as e:
            last_error = e
            logger.warning(f"Connection attempt {attempt} failed: {str(e)}")

            if attempt < max_retries:
                # Exponential backoff: 1s, 2s, 4s, etc.
                wait_time = 2 ** (attempt - 1)
                logger.info(f"Retrying in {wait_time} seconds...")
                time.sleep(wait_time)

    # If we get here, all retries failed
    error_msg = f"Failed to connect to Elasticsearch after {max_retries} attempts: {str(last_error)}"
    logger.error(error_msg)
    # Chain the original failure so its type and traceback remain inspectable.
    raise RequestsConnectionError(error_msg) from last_error
150 | 
151 | 
def _download_documents_with_pagination(
    elasticsearch_url: str, query_data: Dict[str, Any], timeout: int = 30
) -> List[Dict[str, Any]]:
    """
    Download documents from Elasticsearch using pagination.

    Runs the query against ``logstash-*`` with the scroll API, 1000 hits per
    page, and keeps requesting pages until one comes back empty. The scroll
    context is cleared afterwards on a best-effort basis.

    Args:
        elasticsearch_url: Elasticsearch server URL
        query_data: Query to execute
        timeout: Connection timeout in seconds

    Returns:
        List of downloaded documents (the raw hit objects). If a follow-up
        scroll request fails, the error is logged and the documents collected
        so far are returned.

    Raises:
        ConnectionError: If unable to connect to the server for the initial search
        HTTPError: If the server returns an error response to the initial search
        Timeout: If the initial search times out
    """
    documents: List[Dict[str, Any]] = []
    page_size = 1000  # Number of documents per page
    scroll_id = None
    scroll_timeout = "5m"  # Keep the search context alive for 5 minutes
    headers = {"Content-Type": "application/json"}

    # Build the endpoint URLs once; they do not change between pages.
    base_url = elasticsearch_url.rstrip("/")
    search_url = f"{base_url}/logstash-*/_search"
    scroll_url = f"{base_url}/_search/scroll"
    search_params: Dict[str, str] = {"scroll": scroll_timeout, "size": str(page_size)}

    try:
        # Initial search
        search_response = requests.post(
            search_url, params=search_params, headers=headers, json=query_data, timeout=timeout
        )
        search_response.raise_for_status()  # Raise exception for 4XX/5XX responses

        # Process the first batch of results
        response = search_response.json()
        scroll_id = response.get("_scroll_id")
        hits = response.get("hits", {}).get("hits", [])

        # "hits.total" is an object ({"value": N, ...}) on recent servers but a
        # plain integer on older ones or with rest_total_hits_as_int; accept both
        # so a purely informational log line cannot abort the download.
        total_info = response.get("hits", {}).get("total", 0)
        total_docs = total_info.get("value", 0) if isinstance(total_info, dict) else total_info

        logger.info(f"Found {total_docs} documents matching the query")

        # Process the first page of results
        documents.extend(hits)
        logger.info(f"Downloaded {len(hits)} documents (page 1)")

        # Continue scrolling until all documents are retrieved
        page = 1

        while scroll_id and len(hits) > 0:
            page += 1
            try:
                # Use the scroll API directly
                scroll_data = {"scroll": scroll_timeout, "scroll_id": scroll_id}

                scroll_response = requests.post(scroll_url, headers=headers, json=scroll_data, timeout=timeout)
                scroll_response.raise_for_status()

                response = scroll_response.json()
                scroll_id = response.get("_scroll_id")
                hits = response.get("hits", {}).get("hits", [])

                if hits:
                    documents.extend(hits)
                    logger.info(f"Downloaded {len(hits)} documents (page {page})")

                    # Add a small delay to avoid overwhelming the Elasticsearch server
                    time.sleep(0.1)
            except (RequestsConnectionError, HTTPError, Timeout) as e:
                logger.error(f"Error during scroll operation on page {page}: {str(e)}")
                # Continue with documents retrieved so far
                break

    except (RequestsConnectionError, HTTPError, Timeout) as e:
        logger.error(f"Error during initial search: {str(e)}")
        raise
    finally:
        # Clear the scroll context to free up resources
        if scroll_id:
            try:
                clear_scroll_data = {"scroll_id": [scroll_id]}

                requests.delete(scroll_url, headers=headers, json=clear_scroll_data, timeout=timeout)
            except Exception as e:
                # Best effort only: a failed cleanup must not mask the result.
                logger.warning(f"Failed to clear scroll context: {str(e)}")

    return documents
242 | 
243 | 
def download_logstash_documents(
    elasticsearch_url: str,
    query_file: str,
    start_date_time_file: str,
    output_file: str,
    current_date_time_file: str,
) -> None:
    """
    Fetch all logstash documents for the current time window and store them.

    Steps, in order: load the query, determine the window start, take the
    current UTC time as window end and persist it, check that Elasticsearch is
    reachable, add the time range to the query, download every matching
    document and write the result to ``output_file``.

    Args:
        elasticsearch_url: Elasticsearch server URL
        query_file: Path to the Lucene query file
        start_date_time_file: Path to the file containing the start date and time
        output_file: Path to store the downloaded logstash documents
        current_date_time_file: Path to store the current date and time

    Raises:
        RequestsConnectionError: If unable to connect to Elasticsearch
        OSError: If unable to write the current date and time file

    Note:
        Errors raised by ``load_json`` / ``save_json`` propagate unchanged.
        Presumably these include FileNotFoundError and JSONDecodeError for the
        query file (confirm against ``utils``).
    """
    logger.info("Downloading logstash documents")
    logger.info(f"Elasticsearch URL: {elasticsearch_url}")
    logger.info(f"Query file: {query_file}")
    logger.info(f"Start date and time file: {start_date_time_file}")
    logger.info(f"Output file: {output_file}")
    logger.info(f"Current date and time file: {current_date_time_file}")

    # Query as configured, before any time restriction is applied.
    base_query = load_json(query_file)
    logger.info(f"Loaded query: {json.dumps(base_query, indent=2)}")

    # Lower bound of the window.
    window_start = _get_start_date_time(start_date_time_file)

    # Upper bound of the window: "now", timezone-aware in UTC.
    window_end = datetime.datetime.now(timezone.utc).isoformat()
    logger.info(f"Current date and time: {window_end}")

    # Persist the upper bound so the next run can pick up from here.
    _save_current_date_time(current_date_time_file, window_end)

    # Fail early if the server cannot be reached.
    _verify_elasticsearch_connection(elasticsearch_url)

    ranged_query = _add_time_range_to_query(base_query, window_start, window_end)
    logger.info(f"Modified query with time range: {json.dumps(ranged_query, indent=2)}")

    fetched = _download_documents_with_pagination(elasticsearch_url, ranged_query)
    logger.info(f"Downloaded a total of {len(fetched)} documents")

    save_json(fetched, output_file)
    logger.info(f"Saved documents to {output_file}")

    logger.info("Logstash documents downloaded successfully")
306 | 
307 | 
def main() -> None:
    """Command line entry point: parse the options and run the download.

    Exits with status 0 on success and 1 if the download raises.
    """
    cli = argparse.ArgumentParser(description="Download logstash documents from Elasticsearch")
    # Every option is mandatory.
    for flag, help_text in (
        ("--elasticsearch-url", "Elasticsearch server URL"),
        ("--query-file", "Path to the Lucene query file"),
        ("--start-date-time-file", "Path to the file containing the start date and time"),
        ("--output-file", "Path to store the downloaded logstash documents"),
        ("--current-date-time-file", "Path to store the current date and time"),
    ):
        cli.add_argument(flag, required=True, help=help_text)

    options = cli.parse_args()

    try:
        download_logstash_documents(
            options.elasticsearch_url,
            options.query_file,
            options.start_date_time_file,
            options.output_file,
            options.current_date_time_file,
        )
    except Exception as exc:
        logger.error(f"Error downloading logstash documents: {str(exc)}")
        sys.exit(1)
    else:
        sys.exit(0)
335 | 
336 | 
337 | if __name__ == "__main__":
338 |     main()
339 | 


--------------------------------------------------------------------------------
/src/platform_problem_monitoring_core/step7_normalize_messages.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """Normalize messages using drain3 for pattern recognition."""
  3 | 
  4 | import argparse
  5 | import json
  6 | import re
  7 | import sys
  8 | from pathlib import Path
  9 | from typing import Any, Dict, List, TypedDict
 10 | 
 11 | from drain3 import TemplateMiner
 12 | from drain3.masking import MaskingInstruction
 13 | from drain3.template_miner_config import TemplateMinerConfig
 14 | 
 15 | from platform_problem_monitoring_core.utils import logger, save_json
 16 | 
 17 | 
 18 | def configure_template_miner() -> TemplateMiner:
 19 |     """
 20 |     Configure the drain3 template miner with custom masking instructions.
 21 | 
 22 |     Returns:
 23 |         Configured TemplateMiner instance
 24 |     """
 25 |     config = TemplateMinerConfig()
 26 |     config.mask_prefix = "<"
 27 |     config.mask_suffix = ">"
 28 | 
 29 |     # Clear default masking instructions and add custom ones
 30 |     config.masking_instructions = []
 31 | 
 32 |     # IP addresses
 33 |     config.masking_instructions.append(
 34 |         MaskingInstruction(pattern=r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", mask_with="IP")
 35 |     )
 36 | 
 37 |     # Timestamps in various formats
 38 |     config.masking_instructions.append(
 39 |         MaskingInstruction(
 40 |             pattern=r"\[\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d+)?(\+|-)\d{2}:\d{2}\]",
 41 |             mask_with="[TIMESTAMP]",
 42 |         )
 43 |     )
 44 |     config.masking_instructions.append(
 45 |         MaskingInstruction(pattern=r"\[\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}(\.\d+)?\]", mask_with="[TIMESTAMP]")
 46 |     )
 47 |     config.masking_instructions.append(
 48 |         MaskingInstruction(
 49 |             pattern=r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d+)?(\+|-)\d{2}:\d{2}",
 50 |             mask_with="TIMESTAMP",
 51 |         )
 52 |     )
 53 |     config.masking_instructions.append(
 54 |         MaskingInstruction(pattern=r"\d{2}/[A-Za-z]{3}/\d{4}:\d{2}:\d{2}:\d{2} (\+|-)\d{4}", mask_with="TIMESTAMP")
 55 |     )
 56 |     config.masking_instructions.append(
 57 |         MaskingInstruction(pattern=r"[A-Z][a-z]{2} \d{1,2} \d{2}:\d{2}:\d{2}", mask_with="TIMESTAMP")
 58 |     )
 59 |     config.masking_instructions.append(MaskingInstruction(pattern=r"\d{4}-\d{2}-\d{2}", mask_with="DATE"))
 60 |     config.masking_instructions.append(MaskingInstruction(pattern=r"\d{2}:\d{2}:\d{2}(\.\d+)?", mask_with="TIME"))
 61 | 
 62 |     # UUIDs
 63 |     config.masking_instructions.append(
 64 |         MaskingInstruction(
 65 |             pattern=r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}",
 66 |             mask_with="UUID",
 67 |         )
 68 |     )
 69 | 
 70 |     # Hexadecimal identifiers
 71 |     config.masking_instructions.append(MaskingInstruction(pattern=r"\b[0-9a-f]{16,}\b", mask_with="HEX"))
 72 | 
 73 |     # Process IDs
 74 |     config.masking_instructions.append(MaskingInstruction(pattern=r"\[\d+\]", mask_with="[PID]"))
 75 | 
 76 |     # Line numbers in stack traces
 77 |     config.masking_instructions.append(MaskingInstruction(pattern=r"line:? \d+", mask_with="line: NUM"))
 78 |     config.masking_instructions.append(MaskingInstruction(pattern=r":\d+\)", mask_with=":NUM)"))
 79 | 
 80 |     # Query parameters in URLs
 81 |     config.masking_instructions.append(MaskingInstruction(pattern=r'\?[^"\'<>\s]*', mask_with="?PARAMS"))
 82 | 
 83 |     return TemplateMiner(config=config)
 84 | 
 85 | 
 86 | def protect_file_paths(line: str) -> str:
 87 |     """
 88 |     Identify and protect file paths in log messages.
 89 | 
 90 |     Args:
 91 |         line: Log message line
 92 | 
 93 |     Returns:
 94 |         Line with protected file paths
 95 |     """
 96 |     # Common file path patterns in error messages
 97 |     file_path_patterns = [
 98 |         # Pattern for PHP errors: in /path/to/file.php on line 123
 99 |         r"(in\s+)(/[^\s:]+)(\s+on\s+line\s+)(\d+)",
100 |         # Pattern for stack traces: at /path/to/file.php:123
101 |         r"(at\s+)(/[^\s:]+):(\d+)",
102 |         # Pattern for file paths with line numbers: /path/to/file.php:123
103 |         r"(^|\s)(/[^\s:]+):(\d+)(\s|$)",
104 |         # Pattern for file paths in quotes: '/path/to/file.php'
105 |         r'[\'"](/[^\'"]+)[\'"]',
106 |     ]
107 | 
108 |     for pattern in file_path_patterns:
109 |         if "on line" in pattern:
110 |             # For PHP-style errors: "in /path/to/file.php on line 123"
111 |             line = re.sub(pattern, r"\1\2\3<NUM>", line)
112 |         elif "at" in pattern:
113 |             # For stack traces: "at /path/to/file.php:123"
114 |             line = re.sub(pattern, r"\1\2:<NUM>", line)
115 |         elif ":" in pattern:
116 |             # For general file paths with line numbers: "/path/to/file.php:123"
117 |             line = re.sub(pattern, r"\1\2:<NUM>\4", line)
118 | 
119 |     return line
120 | 
121 | 
def normalize_json(json_obj: Any) -> Any:
    """
    Recursively mask the variable parts of a decoded JSON value, keeping its structure.

    Dictionaries keep their keys and lists keep their length and order; only leaf
    values are rewritten:

    * a string that is entirely a lowercase UUID becomes ``"<UUID>"``
    * a string that is entirely an ISO timestamp with UTC offset becomes ``"<TIMESTAMP>"``
    * a string that is entirely an e-mail address becomes ``"<EMAIL>"``
    * ``int`` and ``float`` values become ``"<NUM>"``
    * every other value (other strings, booleans, ``None``) is returned unchanged

    Args:
        json_obj: JSON object to normalize

    Returns:
        Normalized JSON object
    """
    if isinstance(json_obj, dict):
        # Keep the keys as is, normalize the values
        return {key: normalize_json(value) for key, value in json_obj.items()}

    if isinstance(json_obj, list):
        return [normalize_json(item) for item in json_obj]

    if isinstance(json_obj, str):
        if re.match(r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", json_obj):
            return "<UUID>"
        if re.match(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d+)?(\+|-)\d{2}:\d{2}$", json_obj):
            return "<TIMESTAMP>"
        if re.match(r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$", json_obj):
            return "<EMAIL>"
        # Any other string is structural/descriptive text: keep it
        return json_obj

    # bool is a subclass of int, so it has to be excluded explicitly;
    # otherwise true/false would be reported as numbers.
    if isinstance(json_obj, (int, float)) and not isinstance(json_obj, bool):
        return "<NUM>"

    # Booleans and None are kept as is
    return json_obj
159 | 
160 | 
def preprocess_log_line(line: str) -> str:
    """
    Mask the variable parts of a raw log line before it is handed to the template miner.

    The steps run in a fixed order, each one working on the output of the previous:

    1. bracketed and bare ISO-style timestamps are replaced by TIMESTAMP markers
    2. line numbers next to file paths are masked via ``protect_file_paths``
    3. every ``{...}`` span that parses as JSON is replaced by its
       ``normalize_json`` form; spans that do not parse are left alone
    4. HTTP request lines keep verb and path, while the query string and the
       protocol version digits are masked
    5. remaining stand-alone integers and decimals become ``<NUM>`` / ``<NUM>.<NUM>``

    Args:
        line: Log message line

    Returns:
        Preprocessed log line
    """
    # Step 1: timestamps (bracketed forms first, then the bare ISO form)
    timestamp_masks = (
        (r"\[\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d+)?(\+|-)\d{2}:\d{2}\]", "[TIMESTAMP]"),
        (r"\[\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}(\.\d+)?\]", "[TIMESTAMP]"),
        (r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(\.\d+)?(\+|-)\d{2}:\d{2}", "TIMESTAMP"),
    )
    for stamp_regex, stamp_mask in timestamp_masks:
        line = re.sub(stamp_regex, stamp_mask, line)

    # Step 2: file paths
    line = protect_file_paths(line)

    # Step 3: embedded JSON. The greedy match takes everything from the first
    # "{" to the last "}" on a line as one candidate.
    for candidate in re.findall(r"(\{.*\})", line):
        try:
            decoded = json.loads(candidate)
        except json.JSONDecodeError:
            # Not valid JSON: leave the span to the generic masking below
            continue
        line = line.replace(candidate, json.dumps(normalize_json(decoded)))

    # Step 4: HTTP request lines
    def mask_request_line(match: re.Match) -> str:
        method = match.group(1)
        target = match.group(2)
        # The protocol token is the last space-separated piece of the whole match
        protocol = re.sub(r"\d+", "<NUM>", match.group(0).split(" ")[-1])

        # Everything from the first "?" on is the query string
        resource, query_marker, _ = target.partition("?")
        if query_marker:
            return f"{method} {resource}<?PARAMS> {protocol}"
        return f"{method} {resource} {protocol}"

    line = re.sub(r"(GET|POST|PUT|DELETE|PATCH|HEAD|OPTIONS) ([^ ]+) HTTP/\d+\.\d+", mask_request_line, line)

    # Step 5: stand-alone numbers. The lookarounds skip digits glued to
    # letters, digits or any of "/ _ . -", i.e. digits inside identifiers,
    # URLs and paths.
    number_masks = (
        (r"(?<![a-zA-Z0-9/_.-])(\d+)(?![a-zA-Z0-9/_.-])", "<NUM>"),
        (r"(?<![a-zA-Z0-9/_.-])(\d+\.\d+)(?![a-zA-Z0-9/_.-])", "<NUM>.<NUM>"),
    )
    for number_regex, number_mask in number_masks:
        line = re.sub(number_regex, number_mask, line)

    return line
231 | 
232 | 
def post_process_template(template: str) -> str:
    """
    Post-process a mined template to make it more readable.

    The rewrites run in this order:

    1. a mask repeated back to back (optionally separated by whitespace) is
       collapsed into a single occurrence
    2. ``<NUM>:<NUM>:<NUM>`` with an optional ``.<NUM>`` fraction becomes ``<TIME>``
    3. ``<NUM>-<NUM>-<NUM>`` becomes ``<DATE>``
    4. a bracketed ``<DATE>`` followed by ``<TIME>`` -- joined by ``T``, by one
       space, or directly -- becomes ``[<TIMESTAMP>]``; the ``T``/direct forms
       may carry a ``+<NUM>`` / ``-<NUM>`` / ``+<NUM>:<NUM>`` offset

    Args:
        template: Template to post-process

    Returns:
        Post-processed template
    """
    # Replace consecutive masked parameters with a single mask.
    # NOTE(review): this runs before the date/time rules, so a date and a time
    # written as bare <NUM> groups and separated only by whitespace lose one
    # <NUM> to this merge and are not recognised as a timestamp below.
    template = re.sub(r"(<[^>]+>)(\s*\1)+", r"\1", template)

    # Replace patterns like <NUM>:<NUM>:<NUM> with <TIME>
    template = re.sub(r"<NUM>:<NUM>:<NUM>(\.<NUM>)?", "<TIME>", template)

    # Replace patterns like <NUM>-<NUM>-<NUM> with <DATE>
    template = re.sub(r"<NUM>-<NUM>-<NUM>", "<DATE>", template)

    # Replace patterns like [<DATE>T<TIME>+<NUM>:<NUM>] with [<TIMESTAMP>]
    template = re.sub(r"\[<DATE>T<TIME>(\+|-)<NUM>:<NUM>\]", "[<TIMESTAMP>]", template)

    # Handle alternative format with space instead of 'T'
    template = re.sub(r"\[<DATE> <TIME>\]", "[<TIMESTAMP>]", template)

    # Catch-all for the remaining bracketed timestamp shapes. It has to be
    # written in terms of <DATE>/<TIME>: the substitutions above leave no
    # <NUM>-<NUM>-<NUM> or <NUM>:<NUM>:<NUM> behind for a raw-<NUM> pattern to
    # match. The offset, including its ":<NUM>" part, is optional.
    template = re.sub(
        r"\[<DATE>T?<TIME>(?:[+-]<NUM>(?::<NUM>)?)?\]",
        "[<TIMESTAMP>]",
        template,
    )

    return template
267 | 
268 | 
class PatternResult(TypedDict):
    """One entry of the pattern list built by ``_prepare_results``."""

    # Key of the cluster in ``template_miner.drain.id_to_cluster``.
    cluster_id: str
    # Size of the cluster (``cluster.size``); the result list is sorted on this value.
    count: int
    # Cluster template after ``post_process_template``.
    pattern: str
    # First retained document reference of the cluster, or "" when none was recorded.
    # Note: this is a document reference string, not a point in time.
    first_seen: str
    # Last retained document reference of the cluster, or "" when none was recorded.
    last_seen: str
    # ``cluster.get_sample_logs()`` when the cluster offers that method, otherwise [].
    sample_log_lines: List[str]
    # Retained document references of the cluster, or [] when none was recorded.
    sample_doc_references: List[str]
279 | 
280 | 
def _process_document(doc: dict, template_miner: TemplateMiner, pattern_doc_references: dict) -> bool:
    """
    Feed the message of one document to the template miner and remember where it came from.

    The message is run through ``preprocess_log_line`` first. The reference
    ``"<index>:<id>"`` of the document is then appended to the list kept for the
    cluster the miner assigned; once a list holds more than five references the
    oldest one is dropped. ``"unknown"`` stands in for a missing index or id.

    Args:
        doc: Document to process
        template_miner: Configured template miner
        pattern_doc_references: Dictionary to store document references for each pattern;
            updated in place

    Returns:
        True if the message was added to the miner, False if the document has
        no (or an empty) message
    """
    index_name = doc.get("index", "unknown")
    doc_id = doc.get("id", "unknown")
    message = doc.get("message", "")

    # Nothing to mine without a message
    if not message:
        return False

    # Pre-process, then let the miner assign the message to a cluster
    mining_outcome = template_miner.add_log_message(preprocess_log_line(message))

    # Reference list of that cluster, created on first use
    references = pattern_doc_references.setdefault(mining_outcome["cluster_id"], [])
    references.append(f"{index_name}:{doc_id}")

    # Keep only the 5 most recent references
    if len(references) > 5:
        references.pop(0)

    return True
318 | 
319 | 
# Named, fully typed sort key; ``_prepare_results`` passes it to ``list.sort``.
def get_count(pattern: PatternResult) -> int:
    """Return the ``count`` entry of a pattern dictionary.

    Args:
        pattern: The pattern dictionary.

    Returns:
        The value stored under ``"count"``.

    Raises:
        KeyError: If the dictionary has no ``"count"`` entry.
    """
    return pattern["count"]
331 | 
332 | 
def _prepare_results(template_miner: TemplateMiner, pattern_doc_references: dict) -> List[PatternResult]:
    """
    Turn the clusters held by the template miner into a list of pattern results.

    One entry is produced per item of ``template_miner.drain.id_to_cluster``.
    ``first_seen`` and ``last_seen`` are the first and last document reference
    recorded for the cluster (empty strings when there is none).

    Args:
        template_miner: The template miner instance.
        pattern_doc_references: Dictionary mapping cluster IDs to document references.

    Returns:
        List of pattern results, highest count first.
    """
    results: List[PatternResult] = []

    for cluster_id, cluster in template_miner.drain.id_to_cluster.items():
        # Readable form of the mined template
        template = post_process_template(cluster.get_template())

        # References recorded for this cluster; empty when it has none
        references = pattern_doc_references.get(cluster_id, [])

        entry: PatternResult = {
            "cluster_id": cluster_id,
            "count": cluster.size,
            "pattern": template,
            "first_seen": references[0] if references else "",
            "last_seen": references[-1] if references else "",
            # Not every cluster object offers sample logs
            "sample_log_lines": cluster.get_sample_logs() if hasattr(cluster, "get_sample_logs") else [],
            "sample_doc_references": references,
        }
        results.append(entry)

    # Most frequent patterns first
    results.sort(key=get_count, reverse=True)
    return results
370 | 
371 | 
def normalize_messages(fields_file: str, output_file: str) -> None:
    """
    Mine message patterns from the extracted fields and store the summary.

    The fields file is read line by line; every non-blank line is parsed as a
    JSON document and passed to ``_process_document``. Lines that are not valid
    JSON, or whose processing raises, are logged as warnings and skipped. The
    resulting patterns are written with ``save_json`` under the key ``"patterns"``.

    Args:
        fields_file: Path to the extracted fields file
        output_file: Path to store the normalization results

    Raises:
        FileNotFoundError: Logged and re-raised (reported as a missing fields file)
        Exception: Any other error is logged and re-raised
    """
    logger.info("Normalizing messages")
    logger.info(f"Fields file: {fields_file}")
    logger.info(f"Output file: {output_file}")

    miner = configure_template_miner()

    # Document references collected per cluster
    references_by_cluster: Dict[str, List[str]] = {}

    try:
        processed = 0
        with Path(fields_file).open("r") as fields_handle:
            for raw_line in fields_handle:
                record = raw_line.strip()
                if not record:
                    # Blank lines carry no document
                    continue

                try:
                    document = json.loads(record)
                    if _process_document(document, miner, references_by_cluster):
                        processed += 1
                        # Progress report every 1000 accepted messages
                        if processed % 1000 == 0:
                            logger.info(f"Processed {processed} messages")
                except json.JSONDecodeError:
                    logger.warning(f"Invalid JSON line: {record}")
                    continue
                except Exception as e:
                    # Best effort: one bad document must not stop the run
                    logger.warning(f"Error processing message: {e}")
                    continue

        logger.info(f"Processed {processed} messages in total")
        logger.info(f"Found {len(miner.drain.clusters)} unique patterns")

        patterns = _prepare_results(miner, references_by_cluster)

        save_json({"patterns": patterns}, output_file)
        logger.info(f"Normalization results saved to {output_file}")

    except FileNotFoundError:
        logger.error(f"Fields file not found: {fields_file}")
        raise
    except Exception as e:
        logger.error(f"Error normalizing messages: {e}")
        raise
429 | 
430 | 
def main() -> None:
    """Command line entry point: read the two required paths and run the normalization.

    Exits with status 1 when ``normalize_messages`` raises.
    """
    parser = argparse.ArgumentParser(description="Normalize messages and summarize them")

    # Both options are mandatory
    required_options = (
        ("--fields-file", "Path to the extracted fields file"),
        ("--output-file", "Path to store the normalization results"),
    )
    for flag, help_text in required_options:
        parser.add_argument(flag, required=True, help=help_text)

    cli_args = parser.parse_args()

    try:
        normalize_messages(cli_args.fields_file, cli_args.output_file)
    except Exception as e:
        logger.error(f"Error normalizing messages: {e}")
        sys.exit(1)
444 | 
445 | 
# Run the command line entry point only when the module is executed as a script.
if __name__ == "__main__":
    main()
448 | 


--------------------------------------------------------------------------------