├── .clang-format
├── .envrc
├── .flake8
├── .github
    ├── dependabot.yml
    └── workflows
    │   ├── mega-linter.yml
    │   └── python-package.yml
├── .gitignore
├── .mega-linter.yml
├── .pre-commit-config.yaml
├── .pylintrc
├── .vscode
    └── settings.json
├── .yamlfmt.yml
├── .yamllint.yml
├── CPPLINT.cfg
├── LICENSE
├── MANIFEST.in
├── README.md
├── ci
    ├── copyright-ignore
    ├── flake8-ignore
    ├── pylint-ignore
    ├── pytest
    │   ├── test_general.py
    │   └── yaml.yaml
    └── run-tests.sh
├── cplusutilities
    ├── Download-generic.sh
    ├── Download.sh
    ├── README.md
    ├── clean_HF_TreeCreator_env.sh
    ├── download_from_grid.sh
    ├── downloader-generic.sh
    ├── downloader.sh
    ├── mass_fitter.C
    ├── merge_and_fit_invmasshisto.C
    ├── post_download.sh
    ├── post_download_all.sh
    ├── run_downloader
    └── run_mass_fitter.sh
├── figures
    ├── ALICE_all.png
    ├── LHCparticle.jpg
    ├── Lambda_peak.png
    ├── SelectionVar.png
    └── SideBands.png
├── machine_learning_hep
    ├── README.md
    ├── __init__.py
    ├── __main__.py
    ├── analysis
    │   ├── README.md
    │   ├── __init__.py
    │   ├── analyzer.py
    │   ├── analyzer_jets.py
    │   ├── analyzer_manager.py
    │   ├── analyzerdhadrons.py
    │   ├── analyzerdhadrons_mult.py
    │   ├── do_systematics.py
    │   ├── systematics.py
    │   └── utils.py
    ├── bitwise.py
    ├── clean.sh
    ├── clean_analysis.sh
    ├── clean_results.sh
    ├── config.py
    ├── correlations.py
    ├── data
    │   ├── __init__.py
    │   ├── config_model_parameters.yml
    │   ├── config_run_parameters.yml
    │   ├── data_run3
    │   │   ├── database_ml_parameters_D0Jet_pp.yml
    │   │   ├── database_ml_parameters_D0pp_jet_run2cmp.yml
    │   │   ├── database_ml_parameters_Dp.yml
    │   │   ├── database_ml_parameters_DpJet_pp.yml
    │   │   ├── database_ml_parameters_JPsiJet_pp.yml
    │   │   ├── database_ml_parameters_Jet_pp.yml
    │   │   ├── database_ml_parameters_LcJet_pp.yml
    │   │   ├── database_ml_parameters_LcJet_pp_hp24.yml
    │   │   ├── database_ml_parameters_LcToPKPi_multiclass.yml
    │   │   ├── database_ml_parameters_LcToPKPi_newformat.yml
    │   │   ├── database_ml_parameters_LcToPKPi_newformat_mult_ana.yml
    │   │   ├── database_variations_D0Jet_pp_jet_obs.yml
    │   │   └── database_variations_LcJet_pp_jet_obs.yml
    │   ├── database_run_list.yml
    │   └── fonll
    │   │   ├── DmesonLcPredictions_13TeV_y05_FFee_BRpythia8.root
    │   │   ├── DmesonLcPredictions_13TeV_y05_FFee_BRpythia8_SepContr_PDG2020.root
    │   │   ├── DmesonLcPredictions_13TeV_y05_FFptDepLHCb_BRpythia8.root
    │   │   ├── DmesonLcPredictions_13TeV_y05_FFptDepLHCb_BRpythia8_PDG2020.root
    │   │   ├── DmesonLcPredictions_502TeV_y05_FFee_BRpythia8.root
    │   │   └── DmesonLcPredictions_502TeV_y05_FFptDepLHCb_BRpythia8.root
    ├── do_variations.py
    ├── fitting
    │   ├── README.md
    │   ├── __init__.py
    │   ├── fitters.py
    │   ├── helpers.py
    │   ├── roofitter.py
    │   ├── simple_fit.py
    │   └── utils.py
    ├── globalfitter.py
    ├── hf_analysis_utils.py
    ├── hf_pt_spectrum.py
    ├── io.py
    ├── logger.py
    ├── mlperformance.py
    ├── models.py
    ├── multiprocesser.py
    ├── optimisation
    │   ├── README.md
    │   ├── bayesian_opt.py
    │   ├── grid_search.py
    │   └── metrics.py
    ├── optimiser.py
    ├── optimization.py
    ├── plotting
    │   ├── __init__.py
    │   ├── compare_results.py
    │   └── plot_jetsubstructure_run3.py
    ├── processer.py
    ├── processer_jet.py
    ├── processerdhadrons.py
    ├── processerdhadrons_mult.py
    ├── root.py
    ├── selectionutils.py
    ├── steer_analysis.py
    ├── submission
    │   ├── __init__.py
    │   ├── all_off.yml
    │   ├── analysis.yml
    │   ├── analyzer.yml
    │   ├── data.yml
    │   ├── full_analysis.yml
    │   ├── mc.yml
    │   ├── mlapp.yml
    │   ├── mltrain.yml
    │   ├── preprocess.yml
    │   └── processor.yml
    ├── submit.sh
    ├── submit_variations.sh
    ├── templates_keras.py
    ├── templates_scikit.py
    ├── templates_xgboost.py
    ├── utilities.py
    ├── utilities_files.py
    ├── utilities_plot.py
    ├── utils
    │   ├── __init__.py
    │   ├── compare_directories.sh
    │   ├── compare_root_files.py
    │   ├── dl_train.py
    │   └── hist.py
    ├── vary_bdt.py
    └── workflow
    │   └── workflow_base.py
├── pyproject.toml
├── requirements.txt
└── run_hfjets.py


/.clang-format:
--------------------------------------------------------------------------------
 1 | BasedOnStyle: Google
 2 | AccessModifierOffset: -1
 3 | AlignEscapedNewlinesLeft: true
 4 | AlignTrailingComments: true
 5 | AllowAllParametersOfDeclarationOnNextLine: false
 6 | AllowShortFunctionsOnASingleLine: true
 7 | AllowShortIfStatementsOnASingleLine: false
 8 | AllowShortLoopsOnASingleLine: false
 9 | #AlwaysBreakBeforeMultilineStrings: true
10 | AlwaysBreakTemplateDeclarations: true
11 | BinPackParameters: true
12 | BreakBeforeBinaryOperators: false
13 | BreakBeforeBraces: Linux
14 | BreakBeforeTernaryOperators: true
15 | BreakConstructorInitializersBeforeComma: false
16 | ColumnLimit: 0
17 | CommentPragmas:  '^ IWYU pragma:'
18 | ConstructorInitializerAllOnOneLineOrOnePerLine: true
19 | ConstructorInitializerIndentWidth: 2
20 | ContinuationIndentWidth: 2
21 | Cpp11BracedListStyle: true
22 | DerivePointerBinding: false
23 | ExperimentalAutoDetectBinPacking: false
24 | IndentCaseLabels: true
25 | IndentFunctionDeclarationAfterType: true
26 | IndentWidth:     2
27 | # It is broken on windows. Breaks all #include "header.h"
28 | ---
29 | Language:        Cpp
30 | MaxEmptyLinesToKeep: 1
31 | KeepEmptyLinesAtTheStartOfBlocks: true
32 | NamespaceIndentation: None
33 | ObjCSpaceAfterProperty: false
34 | ObjCSpaceBeforeProtocolList: false
35 | PenaltyBreakBeforeFirstCallParameter: 1
36 | PenaltyBreakComment: 300
37 | PenaltyBreakFirstLessLess: 120
38 | PenaltyBreakString: 1000
39 | PenaltyExcessCharacter: 1000000
40 | PenaltyReturnTypeOnItsOwnLine: 200
41 | SortIncludes: false
42 | SpaceBeforeAssignmentOperators: true
43 | SpaceBeforeParens: ControlStatements
44 | SpaceInEmptyParentheses: false
45 | SpacesBeforeTrailingComments: 1
46 | SpacesInAngles:  false
47 | SpacesInContainerLiterals: true
48 | SpacesInCStyleCastParentheses: false
49 | SpacesInParentheses: false
50 | Standard:        Cpp11
51 | TabWidth:        2
52 | UseTab:          Never
53 | ---
54 | # Do not format protobuf files
55 | Language: Proto
56 | DisableFormat: true
57 | # ---
58 | # # Since clang-format 13.0.0
59 | # Language: Json
60 | # # O2 dumps JSON files with 4-space indents.
61 | # IndentWidth: 4
62 | 


--------------------------------------------------------------------------------
/.envrc:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Environment setup for MLHEP; presumably evaluated by direnv (uses layout/path_add/PATH_add) - confirm.
 3 | if [[ $(hostname) == "alicecerno2" || $(hostname) == "alipap1" ]] && [[ -z ${ROOTSYS} ]]; then
 4 |   # Only applied on the two hosts above, and only when ROOTSYS is unset or empty.
 5 |   PYTHON_VERSION=3.10.14
 6 |   ROOT_VERSION=v6-32-04
 7 |   ROOUNFOLD_VERSION=2.0.1
 8 |   # The versions above select the matching install/build directories under PREFIX below.
 9 |   PREFIX=/home/pyadmin/software_mlhep
10 |   layout python ${PREFIX}/install/pyenv/versions/${PYTHON_VERSION}/bin/python3
11 |   path_add PYTHONPATH ${PREFIX}/install/root-${ROOT_VERSION}_py-${PYTHON_VERSION}/lib
12 |   PATH_add ${PREFIX}/install/root-${ROOT_VERSION}_py-${PYTHON_VERSION}/bin
13 |   path_add LD_LIBRARY_PATH ${PREFIX}/install/root-${ROOT_VERSION}_py-${PYTHON_VERSION}/lib
14 |   # path_add LD_LIBRARY_PATH ${PREFIX}/install/RooUnfold-${ROOUNFOLD_VERSION}_root-${ROOT_VERSION}_py-${PYTHON_VERSION}/lib
15 |   path_add LD_LIBRARY_PATH ${PREFIX}/build/RooUnfold-${ROOUNFOLD_VERSION}_root-${ROOT_VERSION}_py-${PYTHON_VERSION}
16 | 
17 | fi
18 | 


--------------------------------------------------------------------------------
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 120
3 | extend-ignore = E203
4 | 


--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | # Dependabot configuration
 3 | # Reference: https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
 4 | 
 5 | version: 2
 6 | updates:
 7 |   - package-ecosystem: "github-actions" # See documentation for possible values
 8 |     directory: "/" # Location of package manifests
 9 |     schedule:
10 |       interval: "weekly"
11 | 


--------------------------------------------------------------------------------
/.github/workflows/mega-linter.yml:
--------------------------------------------------------------------------------
  1 | ---
  2 | # MegaLinter GitHub Action configuration file
  3 | # More info at https://megalinter.io
  4 | name: MegaLinter
  5 | 
  6 | 'on':
  7 |   # Trigger mega-linter at every push. Action will also be visible from Pull Requests to master
  8 |   push: # Comment this line to trigger action only on pull-requests (not recommended if you don't pay for GH Actions)
  9 |   pull_request:
 10 |     branches: [master, run3]
 11 | 
 12 | permissions:
 13 |   # Give the default GITHUB_TOKEN write permission to commit and push, comment issues & post new PR
 14 |   # Remove the ones you do not need
 15 |   contents: write
 16 |   issues: write
 17 |   pull-requests: write
 18 | 
 19 | env: # Comment env block if you don't want to apply fixes
 20 |   # Apply linter fixes configuration
 21 |   APPLY_FIXES: all # When active, APPLY_FIXES must also be defined as environment variable (in github/workflows/mega-linter.yml or other CI tool)
 22 |   APPLY_FIXES_EVENT: push # Decide which event triggers application of fixes in a commit or a PR (pull_request, push, all)
 23 |   APPLY_FIXES_MODE: pull_request # If APPLY_FIXES is used, defines if the fixes are directly committed (commit) or posted in a PR (pull_request)
 24 | 
 25 | concurrency:
 26 |   group: ${{ github.ref }}-${{ github.workflow }}
 27 |   cancel-in-progress: true
 28 | 
 29 | jobs:
 30 |   megalinter:
 31 |     name: MegaLinter
 32 |     runs-on: ubuntu-latest
 33 |     steps:
 34 |       # Git Checkout
 35 |       - name: Checkout Code
 36 |         uses: actions/checkout@v4
 37 |         with:
 38 |           token: ${{ secrets.PAT || secrets.GITHUB_TOKEN }}
 39 |           fetch-depth: 0 # If you use VALIDATE_ALL_CODEBASE = true, you can remove this line to improve performances
 40 | 
 41 |       # MegaLinter
 42 |       - name: MegaLinter
 43 |         id: ml
 44 |         # You can override MegaLinter flavor used to have faster performances
 45 |         # More info at https://megalinter.io/flavors/
 46 |         uses: oxsecurity/megalinter@v8.5.0
 47 |         env:
 48 |           # All available variables are described in documentation
 49 |           # https://megalinter.io/configuration/
 50 |           VALIDATE_ALL_CODEBASE: false # ${{ github.event_name == 'push' && github.ref == 'refs/heads/master' }} # Validates all source when push on master, else just the git diff with master. Override with true if you always want to lint all sources
 51 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
 52 |           # ADD YOUR CUSTOM ENV VARIABLES HERE OR DEFINE THEM IN A FILE .mega-linter.yml AT THE ROOT OF YOUR REPOSITORY
 53 |           # DISABLE: COPYPASTE,SPELL # Uncomment to disable copy-paste and spell checks
 54 | 
 55 |       # Upload MegaLinter artifacts
 56 |       - name: Archive production artifacts
 57 |         if: success() || failure()
 58 |         uses: actions/upload-artifact@v4
 59 |         with:
 60 |           name: MegaLinter reports
 61 |           path: |
 62 |             megalinter-reports
 63 |             mega-linter.log
 64 | 
 65 |       # Create pull request if applicable (for now works only on PR from same repository, not from forks)
 66 |       - name: Print PR condition
 67 |         run: |
 68 |           # Print the condition
 69 |           echo "(${{ env.APPLY_FIXES_EVENT }} == 'all' || ${{ env.APPLY_FIXES_EVENT }} == ${{ github.event_name }}) && ${{ env.APPLY_FIXES_MODE }} == 'pull_request' && (${{ github.event_name }} == 'push' || ${{ github.event.pull_request.head.repo.full_name }} == ${{ github.repository }})"
 70 |       - name: Create Pull Request with applied fixes
 71 |         id: cpr
 72 |         if: (env.APPLY_FIXES_EVENT == 'all' || env.APPLY_FIXES_EVENT == github.event_name) && env.APPLY_FIXES_MODE == 'pull_request' && (github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository)
 73 |         uses: peter-evans/create-pull-request@v7
 74 |         with:
 75 |           token: ${{ secrets.PAT || secrets.GITHUB_TOKEN }}
 76 |           commit-message: "[MegaLinter] Apply linters automatic fixes"
 77 |           title: "[MegaLinter] Apply linters automatic fixes"
 78 |           body: "Please merge this pull request to apply automatic fixes by MegaLinter."
 79 |           labels: bot
 80 |           branch: patch-${{ github.workflow }}-${{ github.ref_name }}
 81 |           delete-branch: true
 82 |       - name: Create PR output
 83 |         if: steps.ml.outputs.has_updated_sources == 1
 84 |         run: |
 85 |           echo "::error::MegaLinter has fixed some files."
 86 |           if [ ${{ github.event_name }} == 'push' ]; then
 87 |             echo "::error::Merge pull request ${{ steps.cpr.outputs.pull-request-url }} to apply automatic fixes."
 88 |           elif [ ${{ github.event_name }} == 'pull_request' ]; then
 89 |             echo "::error::Check ${{ github.event.pull_request.head.repo.html_url }}/pulls to apply automatic fixes."
 90 |             echo "::notice::Actions must be allowed in your repository. See ${{ github.event.pull_request.head.repo.html_url }}/settings/actions"
 91 |           fi
 92 |           exit 1
 93 | 
 94 |       # Push new commit if applicable (for now works only on PR from same repository, not from forks)
 95 |       - name: Print commit condition
 96 |         run: |
 97 |           # Print the condition
 98 |           echo "${{ steps.ml.outputs.has_updated_sources }} == 1 && (${{ env.APPLY_FIXES_EVENT }} == 'all' || ${{ env.APPLY_FIXES_EVENT }} == ${{ github.event_name }}) && ${{ env.APPLY_FIXES_MODE }} == 'commit' && ${{ github.ref }} != 'refs/heads/master' && (${{ github.event_name }} == 'push' || ${{ github.event.pull_request.head.repo.full_name }} == ${{ github.repository }})"
 99 |       - name: Prepare commit
100 |         if: steps.ml.outputs.has_updated_sources == 1 && (env.APPLY_FIXES_EVENT == 'all' || env.APPLY_FIXES_EVENT == github.event_name) && env.APPLY_FIXES_MODE == 'commit' && github.ref != 'refs/heads/master' && (github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository)
101 |         run: sudo chown -Rc $UID .git/
102 |       - name: Commit and push applied linter fixes
103 |         if: steps.ml.outputs.has_updated_sources == 1 && (env.APPLY_FIXES_EVENT == 'all' || env.APPLY_FIXES_EVENT == github.event_name) && env.APPLY_FIXES_MODE == 'commit' && github.ref != 'refs/heads/master' && (github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository)
104 |         uses: stefanzweifel/git-auto-commit-action@v5
105 |         with:
106 |           branch: ${{ github.event.pull_request.head.ref || github.head_ref || github.ref }}
107 |           commit_message: "[MegaLinter] Apply linters fixes"
108 |           commit_user_name: megalinter-bot
109 |           commit_user_email: nicolas.vuillamy@ox.security
110 | 


--------------------------------------------------------------------------------
/.github/workflows/python-package.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Test package
 3 | 
 4 | 'on':
 5 |   pull_request:
 6 |     branches:
 7 |       - master
 8 |       - run3
 9 |     paths:
10 |       - "**.py"
11 |   push:
12 |     branches:
13 |       - master
14 |       - run3
15 |     paths:
16 |       - "**.py"
17 | 
18 | permissions:
19 |   contents: read
20 |   pull-requests: read
21 | 
22 | concurrency:
23 |   group: ${{ github.ref }}-${{ github.workflow }}
24 |   cancel-in-progress: true
25 | 
26 | jobs:
27 |   build-os-latest:
28 |     runs-on: ${{ matrix.os }}
29 |     strategy:
30 |       max-parallel: 6
31 |       matrix:
32 |         os: [ubuntu-latest, macos-latest]
33 |         python-version: ['3.10', '3.11', '3.12', '3.13']
34 |         test-tool: [pytest]
35 | 
36 |     steps:
37 |       - uses: actions/checkout@v4
38 |       - name: Set up Python ${{ matrix.python-version }}
39 |         uses: actions/setup-python@v5
40 |         with:
41 |           python-version: ${{ matrix.python-version }}
42 |       - name: Install dependencies
43 |         run: |
44 |           python -m pip install --upgrade pip
45 |           python -m pip install --upgrade setuptools
46 |           pip install -r requirements.txt
47 |       - name: Install test tool ${{ matrix.test-tool }}
48 |         run: |
49 |           pip install ${{ matrix.test-tool }}
50 |       - name: Run on pull_request
51 |         if: github.event_name == 'pull_request'
52 |         run: |
53 |           git fetch --no-tags --prune --depth=1 origin +refs/heads/*:refs/remotes/origin/*
54 |           changed_files="$(git diff --name-only origin/${{ github.base_ref }})"
55 |           # shellcheck disable=SC2086 # Ignore unquoted options.
56 |           ci/run-tests.sh --tests ${{ matrix.test-tool }} --files $changed_files
57 |       - name: Run on push
58 |         if: github.event_name == 'push'
59 |         run: |-
60 |           ci/run-tests.sh --tests ${{ matrix.test-tool }}
61 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # If python build in source tree
 2 | build/*
 3 | dist/*
 4 | *.egg-info
 5 | **/__pycache__
 6 | .direnv
 7 | 
 8 | # Python compiled
 9 | *.pyc
10 | *.pyd
11 | *.pyo
12 | 
13 | # object files
14 | *.slo
15 | *.lo
16 | *.o
17 | *.obj
18 | 
19 | # dynamic libraries
20 | *.so
21 | *.dylib
22 | *.dll
23 | 
24 | # static libraries
25 | *.lai
26 | *.la
27 | *.a
28 | *.lib
29 | 
30 | # executables
31 | *.exe
32 | *.out
33 | *.app
34 | 
35 | #input and output data
36 | machine_learning_hep/data/**/*.root
37 | machine_learning_hep/D0kINT7HighMultwithJets
38 | machine_learning_hep/LckINT7HighMultwithJets
39 | 
40 | dataframes_*
41 | plots_*
42 | output_*
43 | *.json
44 | *.h5
45 | *.png
46 | *.log
47 | 
48 | # editors
49 | *.swp
50 | *~
51 | *.vscode
52 | **/.mypy_cache
53 | setup.cfg
54 | *.code-workspace
55 | 
56 | # macOS
57 | .DS_Store
58 | 
59 | # linters
60 | megalinter-reports
61 | 


--------------------------------------------------------------------------------
/.mega-linter.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | # Configuration file for Mega-Linter
 3 | # See all available variables at https://oxsecurity.github.io/megalinter/configuration/ and in linters documentation
 4 | 
 5 | APPLY_FIXES: all # all, none, or list of linter keys
 6 | DEFAULT_BRANCH: run3 # Usually master or main
 7 | # ENABLE: # If you use ENABLE variable, all other languages/formats/tooling-formats will be disabled by default
 8 | # ENABLE_LINTERS: # If you use ENABLE_LINTERS variable, all other linters will be disabled by default
 9 | DISABLE:
10 |   - C
11 |   - COPYPASTE # abusive copy-pastes
12 |   - SPELL # spelling mistakes
13 | DISABLE_LINTERS:
14 |   - BASH_EXEC
15 |   - BASH_SHFMT
16 |   - JSON_PRETTIER
17 |   - PYTHON_BLACK
18 |   - PYTHON_FLAKE8
19 |   - PYTHON_ISORT
20 |   - REPOSITORY_DEVSKIM
21 |   - REPOSITORY_GRYPE
22 |   - REPOSITORY_KICS
23 |   - REPOSITORY_SECRETLINT
24 |   - REPOSITORY_TRIVY
25 |   - YAML_PRETTIER
26 |   - YAML_V8R
27 | DISABLE_ERRORS_LINTERS: # If errors are found by these linters, they will be considered as non blocking.
28 |   - PYTHON_BANDIT # The bandit check is overly broad and complains about subprocess usage.
29 | SHOW_ELAPSED_TIME: true
30 | FILEIO_REPORTER: false
31 | GITHUB_COMMENT_REPORTER: false
32 | UPDATED_SOURCES_REPORTER: true
33 | PRINT_ALPACA: false # Don't print ASCII alpaca in the log
34 | PRINT_ALL_FILES: true # Print all processed files
35 | FLAVOR_SUGGESTIONS: false # Don't show suggestions about different MegaLinter flavors
36 | PYTHON_ISORT_CONFIG_FILE: pyproject.toml
37 | PYTHON_PYRIGHT_CONFIG_FILE: pyproject.toml
38 | PYTHON_RUFF_CONFIG_FILE: pyproject.toml
39 | CPP_CPPLINT_FILE_EXTENSIONS: [".C", ".c", ".c++", ".cc", ".cl", ".cpp", ".cu", ".cuh", ".cxx", ".cxx.in", ".h", ".h++", ".hh", ".h.in", ".hpp", ".hxx", ".inc", ".inl", ".macro"]
40 | CPP_CLANG_FORMAT_FILE_EXTENSIONS: [".C", ".c", ".c++", ".cc", ".cl", ".cpp", ".cu", ".cuh", ".cxx", ".cxx.in", ".h", ".h++", ".hh", ".h.in", ".hpp", ".hxx", ".inc", ".inl", ".macro"]
41 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | # See https://pre-commit.com for more information
 3 | # See https://pre-commit.com/hooks.html for more hooks
 4 | 
 5 | repos:
 6 |   - repo: https://github.com/pre-commit/pre-commit-hooks
 7 |     rev: v5.0.0
 8 |     hooks:
 9 |       - id: check-added-large-files
10 |       - id: check-ast
11 |       - id: check-builtin-literals
12 |       - id: check-docstring-first
13 |       - id: check-executables-have-shebangs
14 |       - id: check-merge-conflict
15 |       - id: check-symlinks
16 |       - id: check-toml
17 |       - id: check-yaml
18 |       - id: debug-statements
19 |       - id: end-of-file-fixer
20 |       - id: mixed-line-ending
21 |       - id: name-tests-test
22 |       - id: requirements-txt-fixer
23 |       - id: trailing-whitespace
24 |   - repo: https://github.com/astral-sh/ruff-pre-commit
25 |     rev: v0.11.2 # ruff version
26 |     hooks:
27 |       - id: ruff # linter
28 |         args: ["--fix"]
29 |       - id: ruff-format # formatter
30 |   - repo: https://github.com/asottile/pyupgrade
31 |     rev: v3.19.1
32 |     hooks:
33 |       - id: pyupgrade
34 |         args: ["--py310-plus"]
35 |   - repo: https://github.com/shellcheck-py/shellcheck-py
36 |     rev: v0.10.0.1
37 |     hooks:
38 |       - id: shellcheck
39 |   - repo: https://github.com/google/yamlfmt
40 |     rev: v0.16.0
41 |     hooks:
42 |       - id: yamlfmt
43 |   - repo: https://github.com/adrienverge/yamllint
44 |     rev: v1.36.2
45 |     hooks:
46 |       - id: yamllint
47 | 


--------------------------------------------------------------------------------
/.pylintrc:
--------------------------------------------------------------------------------
 1 | [FORMAT]
 2 | indent-string='    '
 3 | max-line-length=120
 4 | 
 5 | [BASIC]
 6 | variable-rgx=(?:(?P<snake>[a-z_]+))
 7 | 
 8 | [TYPECHECK]
 9 | generated-members=RdBu
10 | 
11 | [DESIGN]
12 | max-args=10
13 | max-locals=40
14 | 
15 | [MESSAGES CONTROL]
16 | disable=
17 |     useless-suppression,
18 |     too-few-public-methods,
19 |     too-many-arguments,
20 |     too-many-branches,
21 |     too-many-instance-attributes,
22 |     too-many-lines,
23 |     too-many-locals,
24 |     too-many-nested-blocks,
25 |     too-many-positional-arguments,
26 |     too-many-public-methods,
27 |     too-many-return-statements,
28 |     too-many-statements
29 | 
30 | [MISCELLANEOUS]
31 | notes=FIXME,XXX
32 | 
33 | [IMPORTS]
34 | ignored-modules=ROOT,yaml,pandas,numpy,shap,uproot
35 | 


--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 |     "editor.rulers": [120,],
3 |     "files.trimTrailingWhitespace": true,
4 |     "cmake.configureOnOpen": false,
5 | }
6 | 


--------------------------------------------------------------------------------
/.yamlfmt.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | # yamlfmt configuration
 3 | # Reference: https://github.com/google/yamlfmt/blob/main/docs/config-file.md#configuration-1
 4 | 
 5 | formatter:
 6 |   type: basic
 7 |   indent: 2
 8 |   include_document_start: true
 9 |   line_ending: lf
10 |   retain_line_breaks_single: true
11 |   max_line_length: -1
12 |   drop_merge_tag: true
13 |   pad_line_comments: 1
14 |   trim_trailing_whitespace: true
15 |   eof_newline: true
16 | 


--------------------------------------------------------------------------------
/.yamllint.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | # yamllint configuration
 3 | # Reference: https://yamllint.readthedocs.io/en/stable/rules.html
 4 | 
 5 | extends: default
 6 | rules:
 7 |   line-length:
 8 |     max: 120
 9 |     level: warning
10 |   indentation:
11 |     spaces: 2
12 |     level: warning
13 |   comments:
14 |     require-starting-space: true
15 |     min-spaces-from-content: 1
16 | 


--------------------------------------------------------------------------------
/CPPLINT.cfg:
--------------------------------------------------------------------------------
1 | filter=-build/c++11,-build/namespaces,-readability/fn_size,-readability/todo,-runtime/references,-whitespace/blank_line,-whitespace/braces,-whitespace/comments,-whitespace/indent_namespace,-whitespace/line_length,-whitespace/semicolon,-whitespace/todo
2 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include machine_learning_hep/submission/default_complete.yml
2 | include machine_learning_hep/submission/default_pre.yml
3 | include machine_learning_hep/submission/default_train.yml
4 | include machine_learning_hep/submission/default_apply.yml
5 | include machine_learning_hep/submission/default_ana.yml
6 | include machine_learning_hep/data/config_model_parameters.yml
7 | include machine_learning_hep/data/database_run_list.yml
8 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Machine learning package for high-energy physics
 2 | 
 3 | ![LHC Particle](figures/LHCparticle.jpg)
 4 | 
 5 | ## Overview of the package:
 6 | This software provides a flexible, modular and easy-to-use package to perform classification using Scikit, XGBoost and Keras algorithms. The first purpose of the package is to provide tools for high-energy physicists to perform optimisation of rare signals produced in ultra-relativistic proton-proton and heavy-ion collisions. 
 7 | 
 8 | ## The package (v0) provides tools to:
 9 | - convert ROOT datasets into Pandas Dataframes
10 | - create training and testing dataset starting from samples of data and Monte-Carlo simulations
11 | - perform Principal-Component-Analysis
12 | - train and test models using Scikit, XGBoost and Keras algorithms
13 | - validate results with a large set of tools and a user-friendly interface
14 | - convert Pandas Dataframes to ROOT objects, including algorithm decisions and probabilities
15 | 
16 | ## Instructions and tutorials
17 | Instructions for installing and running the package are provided in the Wiki section of this repository [wiki](https://github.com/ginnocen/MachineLearningHEP/wiki).
18 | 
19 | ## The ALICE Collaboration at CERN
20 | Visit the collaboration website for more information about studies of hot nuclear matter at the Large Hadron Collider at CERN
21 | http://alice-collaboration.web.cern.ch
22 | 
23 | ## Contacts
24 | For any questions please contact <ginnocen@cern.ch>
25 | 
26 | # Installation
27 | 
28 | ## Usage with aliBuild software stack
29 | 
30 | This package depends on functionality offered by external packages, e.g. RooUnfold and O2Physics.
31 | In order to use these packages from the aliBuild software stack, you should first install the aliBuild packages, and set up mlhep within the aliBuild environment.
32 | To install the python dependencies run the following (from within the aliBuild environment) in the root directory of mlhep:
33 | ```
34 | python3 -m pip install -r requirements.txt
35 | ```
36 | 


--------------------------------------------------------------------------------
/ci/copyright-ignore:
--------------------------------------------------------------------------------
1 | # Applies anyway only to .py files so no need to put yaml and others
2 | 
3 | # That's just pytest stuff
4 | ci/pytest
5 | 
6 | # And the setup
7 | setup.py
8 | 


--------------------------------------------------------------------------------
/ci/flake8-ignore:
--------------------------------------------------------------------------------
1 | # Legacy code
2 | machine_learning_hep/analysis/analyzer_jet_legacy.py
3 | machine_learning_hep/analysis/analyzer_Dhadrons.py
4 | 
5 | # Don't flake8 setup
6 | setup.py
7 | 


--------------------------------------------------------------------------------
/ci/pylint-ignore:
--------------------------------------------------------------------------------
1 | # Legacy code
2 | machine_learning_hep/analysis/analyzer_jet_legacy.py
3 | machine_learning_hep/analysis/analyzer_Dhadrons.py
4 | machine_learning_hep/analysis/analyzer_back.py
5 | 
6 | # Don't pylint setup
7 | setup.py
8 | 


--------------------------------------------------------------------------------
/ci/pytest/test_general.py:
--------------------------------------------------------------------------------
1 | from machine_learning_hep.io import parse_yaml
2 | 
3 | YAML_PATH = "ci/pytest/yaml.yaml"
4 | 
5 | def test_yaml():  # Smoke test: parse_yaml must return a dict for the sample YAML file.
6 |     assert isinstance(parse_yaml(YAML_PATH), dict)
7 | 


--------------------------------------------------------------------------------
/ci/pytest/yaml.yaml:
--------------------------------------------------------------------------------
 1 | ---
 2 | test:
 3 |   test1: [42, "test", Null]
 4 |   test2:
 5 |     tea: False
 6 |     coffee: True
 7 |     list_of_lists:
 8 |       - [1, 2, 3, 4, 5]
 9 |       - ["ABC", "DEF", "GHI"]
10 |       - [Null, False, True, true, false, null]
11 |   test3: 42
12 | 


--------------------------------------------------------------------------------
/cplusutilities/README.md:
--------------------------------------------------------------------------------
  1 | # Getting and processing TTreeCreator output
  2 | 
  3 | Instructions to download the output from the LEGO train (can be run as part of the package or stand-alone), and merge the files (only stand-alone). The instructions assume you are a user of the `aliceml` machine. With some small changes, however, the instructions apply to any system.
  4 | 
  5 | Completing steps 1) - 4) will make the data ready for being used in the `MLHEP` Python analysis package.
  6 | 
  7 | ## 1) Setup your environment
  8 | 
  9 | Start by logging in
 10 | ```
 11 | ssh -X username@lxplus.cern.ch #only when needed
 12 | ssh -X username@aliceml
 13 | ```
 14 | > Please have a look at section 4) if you want to use these scripts on a local system or a different server, as some packages are already pre-installed at aliceml which you might need to install yourself first.
 15 | 
 16 | ### a) Building and loading the virtual environment
 17 | 
 18 | On `aliceml`, you should create (only once) and load your personal virtual environment:
 19 | ```
 20 | ml-create-virtualenv     #only once to create the environment
 21 | ml-activate-virtualenv   #start (and enable python) in virtual environment
 22 | ml-activate-root         #Enable system-wide ROOT installation
 23 | ```
 24 | and clone+install this git repository (see https://github.com/ginnocen/MachineLearningHEP/wiki)
 25 | 
 26 | ## 2) Obtain a certificate and `JAliEn`
 27 | 
 28 | Before downloading, one has to enter the JAliEn environment manually. Please make sure your GRID certificates are copied to the server. If you haven't done that yet, [this tutorial](https://alice-doc.github.io/alice-analysis-tutorial/start/cert.html#convert-your-certificate-for-using-the-grid-tools) explains how to do it. It also describes how to obtain a certificate if you don't have one.
 29 | 
 30 | Having the certificate, load the `JAliEn` environment once and exit afterwards.
 31 | ```
 32 | jalien
 33 | #Enter Grid Certificate password
 34 | exit
 35 | ```
 36 | > **NB:** If you get the error: "**JBox isn't running, so we won't start JSh.**", your grid certificates probably don't have the right permissions. Correct them in `~/.globus/` using:
 37 | 
 38 | ```bash
 39 | chmod 0440 usercert.pem
 40 | chmod 0400 userkey.pem
 41 | ```
 42 | 
 43 | ## 3) Downloading productions from the GRID
 44 | 
 45 | In the following, `$MLHEP` is considered to point to the top directory of the **MLHEP** package.
 46 | 
 47 | Downloading is done with `$MLHEP/cplusutilities/Download-generic.sh`. **Note** that you have to be in that directory to run it as it uses helper scripts and finds them relatively to its own directory (to be updated):
 48 | 
 49 | ```bash
 50 | ./Download-generic.sh <train_name> <top_save_dir> [<grid_merging_stage>]
 51 | ```
 52 | 
 53 | If you steer the script without arguments, they will be asked for. `<train_name>` has the format: `<ID>_<date>-<time>`. This is used to automatically find the data. `<top_save_dir>` is the directory where the output should be saved. Please don't use your personal `/home/<user>/` directory (or anything below) to save grid productions but synchronise with your collaborators and use `/data/TTrees/`. In there, it has become quite usual to call the next directory corresponding to the hadrons taken into account, the used triggers and whether or not jet variables have been filled. So a possible value `<top_save_dir>` might be `/data/TTree/D0DsLckINT7HighMultwithJets/`. Below, the entire structure is created automatically and would be
 54 | 
 55 | ```
 56 | <top_save_dir>/<AliPhysics_tag>/<data_or_mc_production>/<train_name>/unmerged/child_<ID>
 57 | ```
 58 | 
 59 | `<grid_merging_stage>` is optional and refers to the GRID merging stage, which should be in the format `Stage_#`. If this argument is empty, JAliEn will download the unmerged files from GRID.
 60 | 
 61 | 
 62 | ### a) Hardcoded values
 63 | 
 64 | A few variables are hardcoded in `Download.sh` in addition, but usually, they don't need to be changed:
 65 | 1) The number of files to download from GRID. By default all files will be downloaded: `nfiles="/*/"`. For test runs, one can add some zeros (`"/000*/"`, assuming 1000 < jobs < 9999) to download less files.
 66 | 2) The file to be downloaded is by default `AnalysisResults.root`.
 67 | 3) There are hardcoded paths for the different datasets from where to get the LEGO train output. Unfortunately, it is not possible to automatically get these from the train config, as some of the children are split into multiple paths when the output is too big. For debugging purposes, the script will print the hardcoded paths together with the ones it can get from the train config.
 68 | Point 1) and 3) are only valid for the old downloader, `Download.sh`.
 69 | 
 70 | ### b) The screen program
 71 | 
 72 | If you download and process the full statistics, using `screen` can be very convenient. It starts an additional server in the background which can be kept running even when you log off from the machine. To use it, do:
 73 | 
 74 | ```bash
 75 | ssh -X username@lxplus.cern.ch
 76 | screen    #A empty terminal will pop up
 77 | #Do everything till the Download.sh script is running
 78 | #Important to do this from lxplus, JAliEn together with screen on aliceml will not work
 79 | ```
 80 | When the script is running, you can detach from it by pressing **Ctrl-a d**. You will find yourself back in the previous terminal. Before quitting, there is some information you should remember:
 81 | ```
 82 | screen -list     #Should print something like: "There is a screen on: 32693.pts-30.lxplus008    (Detached)"
 83 | hostname         #Should print something like: "lxplus008.cern.ch"
 84 | ```
 85 | You need to save/remember this info because your job is run on a specific node on `lxplus` and you need to go back to exactly that node (which in this case is `008`) later. Now you can disconnect safely and come back later.
 86 | 
 87 | When the script is ready, or you want to check the progress, just do:
 88 | ```
 89 | ssh username@lxplus008.cern.ch          # Now you explicitly ask for connecting to the specific node
 90 | screen -list
 91 | screen -rD 32693.pts-30.lxplus008       #Change to your situation
 92 | ```
 93 | Is the download finished? Then you can exit `screen` with `Ctrl-d`. If not, you can disconnect again and come back later by detaching again with **Ctrl-a d**.
 94 | 
 95 | 
 96 | ## 4) Post download merging
 97 | 
 98 | At this point everything you want to use later should be downloaded as described above. Since the single `AnalysisResults.root` can be very different in size and also very small (O(kB-MB)) we merge them together to end up with files of roughly the same size. If we don't do that, the analysis with the `MLHEP` package will spend a significant amount of time with opening and closing files which is quite some overhead.
 99 | 
100 | To equalize the files sizes, one can use `MLHEP/cplusutilities/post_download.sh`. Because it uses ROOT's `hadd`, ROOT needs to be available. You can load the version centrally provided on the machine (`ml-activate-root`) or you use a version from `cvmfs`.
101 | 
102 | ```bash
103 | ./post_download.sh --help
104 | ```
105 | 
106 | will print the options you can set; they are largely self-explanatory. However, usually you only need to run
107 | 
108 | ```bash
109 | ./post_download.sh --input /path/where/data/is/stored/upto/trainID --target-size 500000 --jobs 50
110 | ./post_download.sh --input /path/where/data/is/stored/upto/trainID --target-size 500000 --jobs 50 --max-search-depth 8
111 | ```
112 | 
113 | which would try to merge the single `AnalysisResults.root` files to `500,000 kB` files using 50 parallel jobs. The  second line, with the additional `--max-search-depth 8` argument is needed for `Download-generic.sh`.  In addition, it is very handy to use
114 | 
115 | ```bash
116 | ./post_download_all.sh <some_top_dir>
117 | ```
118 | 
119 | This script searches for `unmerged` directories below `<some_top_dir>` and calls `post_download.sh` for each directory found. In case a `merged` directory already exists, that one will be skipped.
120 | 
121 | 
122 | ## 5) Installation on a local system/different server
123 | 
124 | The JAliEn tool is needed for downloading from the GRID. An installation of alibuild is needed (follow https://alice-doc.github.io/alice-analysis-tutorial/building/). Afterwards, one can build JAliEn (the installation should take a few minutes only).
125 | ```
126 | mkdir -p ~/alice
127 | cd ~/alice/
128 | git clone https://github.com/alisw/alidist
129 | aliBuild build JAliEn --defaults jalien -z jalien
130 | ```
131 | > JAliEn is already installed at lxplus. To enter the environment, do '/cvmfs/alice.cern.ch/bin/alienv enter JAliEn'
132 | 
133 | You may need to edit the hard-coded jalien path (`/opt/jalien/src/jalien/jalien`) in `downloader.sh` to
134 | something appropriate for your system, e.g. simply `jalien`.
135 | 
136 | ROOT is needed for the merging of the files. If this is not yet installed, please follow the instructions below. **Please note that these instructions don't build against a specific python version, which you might need for ML studies.**
137 | ```
138 | git clone http://github.com/root-project/root.git
139 | cd root
140 | git checkout -b v6-14-04 v6-14-04    #Or any other version listed by 'git tag -l'
141 | mkdir -p ../rootbuild
142 | cd ../rootbuild
143 | cmake ../root
144 | ```
145 | Please change **N** into the number of cores to be used.
146 | ```
147 | cmake --build . -- -jN
148 | ```
149 | When ready, source ROOT in your ~/.bashrc
150 | ```
151 | source $HOME/alice/rootbuild/bin/thisroot.sh    #change directory if needed
152 | ```
153 | and source your .bashrc
154 | ```
155 | source ~/.bashrc
156 | ```
157 | 
158 | ## In case of problems:
159 | 
160 | For problems, contact luuk.vermunt@cern.ch
161 | 


--------------------------------------------------------------------------------
/cplusutilities/download_from_grid.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Script to download any files from the Grid
 4 | # Usage: <this script> <Grid source path> <local target path> <file names>
 5 | speed=10 # number of download threads started per second
 6 | 
 7 | # Check correct input: the third argument (first file name) must not be empty.
 8 | 
 9 | if [ -z "$3" ]; then
10 |     echo "Usage: $0 <Grid source path> <local target path> <file names>"
11 |     exit 1
12 | fi
13 | 
14 | # Check correct environment: ALIPHYSICS_RELEASE must be set and contain "JALIEN".
15 | 
16 | if [ -z "$ALIPHYSICS_RELEASE" ] || [ -z "$(echo $ALIPHYSICS_RELEASE | grep JALIEN)" ]; then
17 |     echo "Error: Load the JALIEN flavour of AliPhysics and run the script again."
18 |     echo '/cvmfs/alice.cern.ch/bin/alienv enter AliPhysics/vAN-'$(date --date="-2 days" +%Y%m%d)'_JALIEN-1'
19 |     exit 1
20 | fi
21 | 
22 | path_grid="$1" # Grid path
23 | shift
24 | path_local="$1" # Local path
25 | shift
26 | filenames="$@" # Names of files (all remaining arguments joined into one string)
27 | 
28 | # Create list of files: one alien_find query per file name, results appended to the list file.
29 | # The timestamp and BASHPID are used to tag the names of the list file and the log file.
30 | timestamp=$(date +%Y%m%d_%H%M%S)_${BASHPID}
31 | inputlist=filelist_${timestamp}.txt
32 | logfile=stdouterr_${timestamp}.log
33 | rm -f $inputlist $logfile
34 | echo "Creating list of files"
35 | for file in $filenames; do
36 |     alien_find $path_grid/ $file >> $inputlist
37 |     if [ ! $? -eq 0 ]; then echo "Error"; exit 1; fi
38 | done
39 | nfiles=$(cat $inputlist | wc -l)
40 | 
41 | # Display summary and ask for confirmation.
42 | 
43 | echo "Source Grid path: $path_grid"
44 | echo "Target local path: $path_local"
45 | echo "File names: $filenames"
46 | echo "Number of files: $nfiles"
47 | 
48 | echo -e "\nDo you wish to continue? (y/n)"
49 | while true; do
50 |     read -p "Answer: " yn
51 |     case $yn in
52 |         [y] ) echo "Proceeding"; break;;
53 |         [n] ) echo "Aborting"; rm -f $inputlist; exit 0; break;;
54 |         * ) echo "Please answer y or n.";;
55 |     esac
56 | done
57 | 
58 | # Download.
59 | # Each alien_cp is started in the background; the sleep of 1/speed seconds spaces out the starts.
60 | delay=$(echo "scale=10 ; 1 / $speed" | bc)
61 | 
62 | i_file=0
63 | for file in $(cat $inputlist); do
64 |     i_file=$((i_file + 1))
65 |     target_file="$path_local/${file/$path_grid/}" # Grid path removed from the file path (first match)
66 |     echo "$i_file/$nfiles $file"
67 |     mkdir -p $(dirname $target_file)
68 |     if [ ! $? -eq 0 ]; then echo "Error"; exit 1; fi
69 |     path_alien="alien://${file}"
70 |     alien_cp -f ${path_alien} ${target_file} >> $logfile 2>&1 &
71 |     sleep $delay
72 | done
73 | 
74 | # Watch progress.
75 | # Success and error markers are counted in the log file every $pause seconds until they add up to $nfiles.
76 | nsuccess=0
77 | done=0
78 | pause=2
79 | while [ $done -lt $nfiles ]; do
80 |     nsuccess=$(cat $logfile | grep -e "MESSAGE: \[SUCCESS\]" -e "TARGET VALID" | wc -l)
81 |     nerror=$(cat $logfile | grep "MESSAGE: \[ERROR\]" | wc -l)
82 |     done=$((nsuccess + nerror))
83 |     echo -e "Completed: $nsuccess/$nfiles\tFailed: $nerror/$nfiles\tDone: $done/$nfiles"
84 |     if [ $done -lt $nfiles ]; then
85 |         sleep $pause
86 |     fi
87 | done
88 | 
89 | # Clean: remove the temporary list and log files.
90 | 
91 | rm -f $inputlist $logfile
92 | 
93 | exit 0
94 | 
95 | 


--------------------------------------------------------------------------------
/cplusutilities/downloader-generic.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Download LEGO train output from the Grid with JAliEn and list the downloaded files.
 4 | #
 5 | # Arguments:
 6 | #   $1  Grid path (may be a pattern) to copy from
 7 | #   $2  local directory the files are copied into
 8 | #   $3  file name searched for when building the list of downloaded files
 9 | #
10 | # The list of downloaded files is written to <local directory>/listfiles.txt.
11 | 
12 | DOWNLDOUTPUTPATH=$1
13 | SAVEDIR=$2
14 | DOWNLOAD_FILENAME=$3
15 | 
16 | # Expansions are quoted so that patterns in the Grid path are passed on literally
17 | # instead of being expanded by the local shell.
18 | printf "Downloading LEGO train files from: %s to %s \n" "$DOWNLDOUTPUTPATH" "$SAVEDIR"
19 | 
20 | cmd=$(printf "cp -T 32 %s file:%s/\n" "$DOWNLDOUTPUTPATH" "$SAVEDIR")
21 | 
22 | jalien << EOF
23 | $cmd
24 | exit
25 | EOF
26 | 
27 | nameoutputlist="$SAVEDIR/listfiles.txt"
28 | # find exits with status 0 even if nothing matches, so an empty list is treated as a failure too.
29 | if ! find "$SAVEDIR" -not -type d -name "$DOWNLOAD_FILENAME" > "$nameoutputlist" || [ ! -s "$nameoutputlist" ]; then
30 |   printf "\r                         \e[1;31mWarning: No files were downloaded. Did you enter JAliEn environment before? Are you connected to internet? Did you set the correct path?\e[0m" > /dev/tty
31 |   # Placeholder entry marking the failed download in the list file.
32 |   printf "%s/printing-line-to-give-a-warning-as-no-files-were-downloaded/%s" "$SAVEDIR" "$DOWNLOAD_FILENAME" >> "$nameoutputlist"
33 | else
34 |   NDWNLFILES=$(wc -l < "$nameoutputlist")
35 |   printf "\r                         \e[1;32mSuccessfully. %s files downloaded.\e[0m" "$NDWNLFILES" > /dev/tty
36 | fi


--------------------------------------------------------------------------------
/cplusutilities/downloader.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | DWNLDOUTPUTPATH=$1
 4 | CHILD=$2
 5 | NFILES=$3
 6 | DWNLDOUTPUTFILE=$4
 7 | BASEDIR=$5
 8 | TRAINNAME=$6
 9 | DATASETWITHCHILDS=$7
10 | LOCALCHILD=$8
11 | STAGE=$9
12 | 
13 | SAVEDIR=$(printf "%s/%s/unmerged/child_%s" $BASEDIR $TRAINNAME $LOCALCHILD)
14 | mkdir -p -m 777 $SAVEDIR
15 | if [ $? -ne 0 ]; then
16 |   printf "Error: Could not create output directory. Is $SAVEDIR writable? Returning... \n\n"
17 |   exit
18 | else
19 |   printf "Created directory: $SAVEDIR \n"
20 | fi
21 | 
22 | if [ -z "$9" ]; then
23 |   #do nothing, if-statement to be reversed
24 |   dummy=1
25 | else
26 |   SAVEDIR=$(printf "%s/%s/unmerged/child_%s/%s/" $BASEDIR $TRAINNAME $LOCALCHILD $STAGE)
27 |   mkdir -p -m 777 $SAVEDIR
28 |   if [ $? -ne 0 ]; then
29 |     printf "Error: Could not create output directory. Is $SAVEDIR writable? Returning... \n\n"
30 |     exit
31 |   else
32 |     printf "Created directory: $SAVEDIR \n"
33 |   fi
34 | fi
35 | 
36 | if [ $DATASETWITHCHILDS -eq 1 ]; then
37 |   DWNLDOUTPUTPATH=$(printf "%s/%s_child_%s/%s" $DWNLDOUTPUTPATH $TRAINNAME $CHILD $STAGE)
38 | else
39 |   DWNLDOUTPUTPATH=$(printf "%s/%s/%s" $DWNLDOUTPUTPATH $TRAINNAME $STAGE)
40 | fi
41 | printf "Downloading LEGO train files from: %s\n" $DWNLDOUTPUTPATH
42 | 
43 | cmd=$(printf "cp -T 32 %s/%s/%s.root file:%s/\n" $DWNLDOUTPUTPATH "$NFILES" $DWNLDOUTPUTFILE $SAVEDIR)
44 | 
45 | jalien << EOF
46 | $cmd
47 | exit
48 | EOF
49 | 
50 | nameoutputlist=$(printf "listfiles_%s_child_%s%s.txt" $TRAINNAME $LOCALCHILD $STAGE)
51 | find $SAVEDIR/$NFILES/$DWNLDOUTPUTFILE.root -maxdepth 1 -not -type d> $nameoutputlist
52 | if [ $? -ne 0 ]; then
53 |   printf "\r                         \e[1;31mWarning: No files were downloaded. Did you enter JAliEn environment before? Are you connected to internet? Did you set the correct path?\e[0m" > /dev/tty
54 |   printf "$SAVEDIR/printing-line-to-give-a-warning-as-no-files-were-downloaded/$DWNLDOUTPUTFILE.root" >> $nameoutputlist
55 | else
56 |   NDWNLFILES=$(wc -l < "$nameoutputlist")
57 |   printf "\r                         \e[1;32mSuccessfully. %s files downloaded.\e[0m" $NDWNLFILES > /dev/tty
58 | fi
59 | 
60 | mv $nameoutputlist $SAVEDIR
61 | 


--------------------------------------------------------------------------------
/cplusutilities/merge_and_fit_invmasshisto.C:
--------------------------------------------------------------------------------
  1 | /*
  2 |  Macro to fit invariant mass distributions
  3 |  1) input files:
  4 |     masshistoLctopK0sPbPbCen[010,3050]data1_[prob_value]_[18r,18q].root
  5 |  2) output file (containing h_raw_signal_prob[prob], h_invmass_[ptmin]_[ptmax]_prob[prob], h_invmasstot_[ptmin]_[ptmax]_prob[prob]):
  6 |     raw_yields_[010,3050].root
  7 |  
  8 |  .L merge_and_fit_invmasshisto.C
  9 |  merge_and_fit_invmasshisto()
 10 |  */
 11 | 
 12 | #include "AliHFInvMassFitter.h"
 13 | 
 14 | 
 15 | Double_t mass = 2.286;          // initial value of the Gaussian mean in all fits, centre of the fit range
 16 | Double_t min_fit = mass-0.14;   // lower edge of the fit range
 17 | Double_t max_fit = mass+0.14;   // upper edge of the fit range
 18 | Int_t signal_fit_f = 0;    // kGaus=0, k2Gaus=1, k2GausSigmaRatioPar=2
 19 | Int_t background_fit_f = 2;// kExpo=0, kLin=1, kPol2=2, kNoBk=3, kPow=4, kPowEx=5
 20 | Int_t rebin = 4;                // rebinning factor applied to the histograms before fitting
 21 | // pT binning: bin edges (nptbins+1 values) and one input directory per pT bin
 22 | const int nptbins = 1;
 23 | float ptlimits[nptbins+1] = {6, 8};
 24 | TString dir[nptbins] = {"/home/ginnocen/MachineLearningHEP/machine_learning_hep/datapklanalysisLHC18q"};
 25 | // selection values, used as strings in the input file names and in the names of the output objects
 26 | const int nprob_test = 1;
 27 | TString   prob_test[nprob_test] = {"0.70"};
 28 | // sample labels used in histogram names and titles; cent is used in the output file name
 29 | const int nfiles = 1;
 30 | TString   files[nfiles] = {"LHC18q"};
 31 | TString   cent = "3050";
 32 | // if fixsigma is true, the Gaussian sigma of the first fits is fixed to sigmafix[pT bin]
 33 | bool fixsigma = true;
 34 | float sigmafix[nptbins] = {0.013};
 35 | 
 36 | // Fit the "hmass" invariant-mass histograms for every pT bin and selection value:
 37 | // each input sample is fitted separately, then the sum of all samples is fitted and
 38 | // its raw yield is stored. The histograms, the per-selection canvases and the
 39 | // raw-yield histograms are written to raw_yields_<cent>.root.
 40 | // The macro stops with an error message if an input file or its histogram is missing.
 41 | void merge_and_fit_invmasshisto(){
 42 | 
 43 |     TFile *fout = new TFile(Form("raw_yields_%s.root",cent.Data()),"RECREATE");
 44 |     // one raw-yield histogram (raw yield vs pT) per selection value
 45 |     TH1F* h_raw_signal[nprob_test];
 46 |     for(int k=0; k<nprob_test; k++){
 47 |         h_raw_signal[k] = new TH1F(Form("h_raw_signal_prob%s",prob_test[k].Data()),";#it{p}_{T} (GeV/#it{c});raw yield",nptbins,ptlimits);
 48 |     }
 49 | 
 50 |     for(int i=0; i<nptbins; i++){
 51 |         // summary canvas: one column per selection value, two rows
 52 |         TCanvas *c_fit = new TCanvas(Form("fits_%.0f_%.0f",ptlimits[i],ptlimits[i+1]),Form("fits_%.0f_%.0f",ptlimits[i],ptlimits[i+1]));
 53 |         c_fit->Divide(nprob_test,2);
 54 | 
 55 |         for(int iprob=0; iprob<nprob_test; iprob++){
 56 |             TH1F *hmerge=0x0;
 57 |             // detailed canvas (3x2 pads): sample ifil uses pads ifil+1 and ifil+4, the merged histogram pads 3 and 6
 58 |             TCanvas *c_fit_prob = new TCanvas(Form("fits_prob%s_%.0f_%.0f",prob_test[iprob].Data(),ptlimits[i],ptlimits[i+1]),Form("fits_prob%s_%.0f_%.0f",prob_test[iprob].Data(),ptlimits[i],ptlimits[i+1]));
 59 |             c_fit_prob->Divide(3,2);
 60 |             // single period analysis
 61 |             for(int ifil=0; ifil<nfiles; ifil++){
 62 |                 // NOTE(review): the file name does not depend on ifil, so every iteration opens the same file - confirm this is intended
 63 |                 TString fname = Form("%s/masshisto%.0f_%.0f_%s.root",dir[i].Data(),ptlimits[i], ptlimits[i+1],prob_test[iprob].Data());
 64 |                 // print the name of the file that is actually opened
 65 |                 std::cout<<fname<<std::endl;
 66 |                 TFile *f = TFile::Open(fname);
 67 |                 if(!f || f->IsZombie()){
 68 |                     std::cerr<<"Error: cannot open "<<fname<<std::endl;
 69 |                     return;
 70 |                 }
 71 |                 TH1F *h = (TH1F*)f->Get("hmass");
 72 |                 if(!h){
 73 |                     std::cerr<<"Error: histogram hmass not found in "<<fname<<std::endl;
 74 |                     return;
 75 |                 }
 76 |                 h->SetName(Form("h_invmass%s_%.0f_%.0f_prob%s",files[ifil].Data(),ptlimits[i],ptlimits[i+1],prob_test[iprob].Data()));
 77 |                 // accumulate the sum of all samples before the single histogram is rebinned
 78 |                 if(ifil==0){
 79 |                     hmerge=(TH1F*)h->Clone("h_merge");
 80 |                     hmerge->SetName(Form("h_invmasstot_%.0f_%.0f_prob%s",ptlimits[i],ptlimits[i+1],prob_test[iprob].Data()));
 81 |                 }
 82 |                 else{
 83 |                     hmerge->Add(h);
 84 |                 }
 85 |                 c_fit_prob->cd(ifil+1);
 86 |                 gPad->SetTicks();
 87 |                 h->Rebin(rebin);
 88 |                 Float_t bin_width = h->GetXaxis()->GetBinWidth(3);
 89 |                 TString histo_title=Form("%.0f-%.0f prob>%s %s; #it{M} (GeV/#it{c}^{2}); Counts/%.0f MeV/#it{c}^{2}",ptlimits[i],ptlimits[i+1],prob_test[iprob].Data(),files[ifil].Data(),bin_width*1000.);
 90 |                 h->SetTitle(histo_title.Data());
 91 |                 AliHFInvMassFitter *fitter = new AliHFInvMassFitter(h,min_fit,max_fit,background_fit_f,signal_fit_f);
 92 |                 fitter->SetUseLikelihoodFit();
 93 |                 fitter->SetInitialGaussianMean(mass);
 94 |                 if(fixsigma)fitter->SetFixGaussianSigma(sigmafix[i]);
 95 |                 Bool_t out=fitter->MassFitter(0);
 96 |                 if(!out) {
 97 |                     fitter->GetHistoClone()->Draw();
 98 |                 }
 99 |                 fitter->DrawHere(gPad);
100 |                 Double_t sigma=fitter->GetSigma();
101 |                 fout->cd();
102 |                 h->Write();
103 | 
104 |                 // subtracting background: fill a histogram from the fitted background function
105 |                 TF1 *bkgf = (TF1*)fitter->GetBackgroundFullRangeFunc();
106 |                 TH1F *hbkg = (TH1F*)h->Clone("hbkg");
107 |                 TH1F *hsigsub = (TH1F*)h->Clone("hsigsub");
108 |                 for(int j=0; j<h->GetNbinsX(); j++){
109 |                     float bkg=bkgf->Eval(hbkg->GetBinCenter(j+1));
110 |                     float bkge=TMath::Sqrt(bkg);
111 |                     hbkg->SetBinContent(j+1,bkg);
112 |                     hbkg->SetBinError(j+1,bkge);
113 |                 }
114 |                 hsigsub->Add(hbkg,-1.);
115 |                 c_fit_prob->cd(ifil+4);
116 |                 // signal-only fit (background option 3 = kNoBk) with sigma fixed to the value of the first fit
117 |                 AliHFInvMassFitter *fitterbkg = new AliHFInvMassFitter(hsigsub,min_fit,max_fit,3,signal_fit_f);
118 |                 fitterbkg->SetUseChi2Fit();
119 |                 fitterbkg->SetInitialGaussianMean(mass);
120 |                 fitterbkg->SetFixGaussianSigma(sigma);
121 |                 Bool_t out2=fitterbkg->MassFitter(0);
122 |                 if(!out2) {
123 |                     fitterbkg->GetHistoClone()->Draw();
124 |                 }
125 |                 fitterbkg->DrawHere(gPad);
126 |             }
127 |             // fit of the histogram merged over all samples
128 |             hmerge->Rebin(rebin);
129 |             Float_t bin_width = hmerge->GetXaxis()->GetBinWidth(3);
130 |             TString histo_title=Form("%.0f-%.0f prob>%s; #it{M} (GeV/#it{c}^{2}); Counts/%.0f MeV/#it{c}^{2}",ptlimits[i],ptlimits[i+1],prob_test[iprob].Data(),bin_width*1000.);
131 |             hmerge->SetTitle(histo_title.Data());
132 |             c_fit->cd(iprob+1);
133 |             gPad->SetTicks();
134 |             AliHFInvMassFitter *fitter = new AliHFInvMassFitter(hmerge,min_fit,max_fit,background_fit_f,signal_fit_f);
135 |             fitter->SetUseLikelihoodFit();
136 |             fitter->SetInitialGaussianMean(mass);
137 |             if(fixsigma)fitter->SetFixGaussianSigma(sigmafix[i]);
138 |             Bool_t out=fitter->MassFitter(0);
139 |             if(!out) {
140 |                 fitter->GetHistoClone()->Draw();
141 |             }
142 |             fitter->DrawHere(gPad);
143 |             c_fit_prob->cd(3);
144 |             fitter->DrawHere(gPad);
145 |             Double_t sigma=fitter->GetSigma();
146 |             double rawyield = fitter->GetRawYield();
147 |             double rawyielderr = fitter->GetRawYieldError();
148 |             h_raw_signal[iprob]->SetBinContent(i+1,rawyield);
149 |             h_raw_signal[iprob]->SetBinError(i+1,rawyielderr);
150 | 
151 |             fout->cd();
152 |             hmerge->Write();
153 | 
154 |             // subtracting background: fill a histogram from the fitted background function
155 |             TF1 *bkgf = (TF1*)fitter->GetBackgroundFullRangeFunc();
156 |             TH1F *hbkg = (TH1F*)hmerge->Clone("hbkg");
157 |             TH1F *hsigsub = (TH1F*)hmerge->Clone("hsigsub");
158 |             for(int j=0; j<hmerge->GetNbinsX(); j++){
159 |                 float bkg=bkgf->Eval(hbkg->GetBinCenter(j+1));
160 |                 float bkge=TMath::Sqrt(bkg);
161 |                 hbkg->SetBinContent(j+1,bkg);
162 |                 hbkg->SetBinError(j+1,bkge);
163 |             }
164 |             hsigsub->Add(hbkg,-1.);
165 |             c_fit_prob->cd(6);
166 |             // signal-only fit (background option 3 = kNoBk) with sigma fixed to the value of the first fit
167 |             AliHFInvMassFitter *fitterbkg = new AliHFInvMassFitter(hsigsub,min_fit,max_fit,3,signal_fit_f);
168 |             fitterbkg->SetUseChi2Fit();
169 |             fitterbkg->SetInitialGaussianMean(mass);
170 |             fitterbkg->SetFixGaussianSigma(sigma);
171 |             Bool_t out2=fitterbkg->MassFitter(0);
172 |             if(!out2) {
173 |                 fitterbkg->GetHistoClone()->Draw();
174 |             }
175 |             fitterbkg->DrawHere(gPad);
176 |             // second row of the summary canvas; the pad index follows the number of columns
177 |             // (identical to the former hard-coded iprob+4 when nprob_test is 3)
178 |             c_fit->cd(iprob+1+nprob_test);
179 |             fitterbkg->DrawHere(gPad);
180 |             c_fit_prob->Write();
181 |         }
182 |     }
183 |     fout->cd();
184 |     for(int k=0; k<nprob_test; k++){
185 |         h_raw_signal[k]->Write();
186 |     }
187 | }
188 | 


--------------------------------------------------------------------------------
/cplusutilities/post_download_all.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Run the merging script for all "unmerged" directories in the input path.
 4 | # Usage: ./post_download_all.sh <top directory>
 5 | # post_download.sh is called as ./post_download.sh, i.e. relative to the current directory.
 6 | 
 7 | path="$1"
 8 | if [ ! -d "$path" ]
 9 | then
10 |   echo "Input path $path does not exist!"
11 |   exit 1
12 | fi
13 | 
14 | # The find results are read NUL-separated on file descriptor 3, so paths containing
15 | # whitespace stay intact and the standard input of post_download.sh is not consumed.
16 | while IFS= read -r -d '' dir <&3
17 | do
18 |   # Strip only the final "unmerged" path component; parent directories whose names
19 |   # contain "unmerged" are left untouched.
20 |   dirbase="${dir%unmerged}"
21 |   dirout="${dirbase}merged"
22 |   if [ -d "$dirout" ]
23 |   then
24 |     echo "Output directory $dirout already exists. Skipping"
25 |     continue
26 |   fi
27 |   ./post_download.sh --input "$dirbase" --target-size 500000 --jobs 50 -f
28 | done 3< <(find "$path" -type d -name unmerged -print0)
29 | 
30 | exit 0


--------------------------------------------------------------------------------
/cplusutilities/run_downloader:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Run the command given as arguments in the background and show a progress
 4 | # indicator on the terminal until it has finished.
 5 | #
 6 | # The status message is built from the positional parameters $3, $4, $5, $6, $8, $9
 7 | # and ${11} of that command line (presumably mirroring the Grid path assembled in
 8 | # downloader.sh - confirm against the caller).
 9 | 
10 | "$@" &
11 | 
12 | START=$(date +%s)
13 | 
14 | # generic downloader
15 | if [ "$#" -eq "5" ]; then
16 |     printcmd=$(printf "   \033[0;37mDownloading LEGO train files from: %s \033[0m" "$3")
17 | # old downloader
18 | else
19 |     # ${11} needs braces: "$11" expands to "$1" followed by a literal "1" and is never empty
20 |     if [ -z "${11}" ]; then
21 |       if [ "$9" -eq 1 ]; then
22 |         printcmd=$(printf "   \033[0;37mDownloading LEGO train files from: %s/%s_child_%s/%s/%s.root\033[0m" "$3" "$8" "$4" "$5" "$6")
23 |       else
24 |         printcmd=$(printf "   \033[0;37mDownloading LEGO train files from: %s/%s/%s/%s.root\033[0m" "$3" "$8" "$5" "$6")
25 |       fi
26 |     else
27 |       if [ "$9" -eq 1 ]; then
28 |         printcmd=$(printf "   \033[0;37mDownloading LEGO train files from: %s/%s_child_%s/%s/%s/%s.root\033[0m" "$3" "$8" "$4" "${11}" "$5" "$6")
29 |       else
30 |         printcmd=$(printf "   \033[0;37mDownloading LEGO train files from: %s/%s/%s/%s/%s.root\033[0m" "$3" "$8" "${11}" "$5" "$6")
31 |       fi
32 |     fi
33 | fi
34 | 
35 | # The message is passed as an argument, not as the format string, so "%" or "\" in a path is printed verbatim.
36 | printf "%s\n" "$printcmd" > /dev/tty
37 | 
38 | printf "   Running downloader: " > /dev/tty
39 | # kill -0 only tests whether the background job still exists; its error message
40 | # at the end of the job is suppressed.
41 | while kill -0 $! 2>/dev/null; do
42 |   printf "." > /dev/tty
43 |   sleep 2
44 | 
45 |   # Restart the line of dots once more than 10 seconds have passed.
46 |   if [ "$(( $(date +%s) - $START ))" -gt "10" ]; then
47 |     printf "\r                               " > /dev/tty
48 |     printf "\r   Running downloader: " > /dev/tty
49 |     START=$(date +%s)
50 |   fi
51 | 
52 | done
53 | 
54 | printf "\r                      " > /dev/tty
55 | printf "\r   Downloader finished\n\n" > /dev/tty


--------------------------------------------------------------------------------
/cplusutilities/run_mass_fitter.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Compile mass_fitter.C and run mass_fitter() in ROOT batch mode.
 4 | # Arguments:
 5 | #   $1  first argument passed to mass_fitter()
 6 | #   $2  second argument passed to mass_fitter()
 7 | MCORDATA=$1
 8 | ISML=$2
 9 | 
10 | #AliPhysics is needed for the mass fit
11 | if [ -z "${ALICE_PHYSICS}" ]
12 | then
13 |   #load yesterday's tag; GNU date is tried first, the BSD date syntax is the fallback
14 |   yesterday=$(date --date="-1 day" +%Y%m%d 2>/dev/null || date -v-1d +%Y%m%d)
15 |   eval "$(/cvmfs/alice.cern.ch/bin/alienv printenv "AliPhysics/vAN-${yesterday}_ROOT6-1")"
16 | fi
17 | 
18 | root -b -l <<EOF
19 | .L mass_fitter.C+
20 | mass_fitter("$MCORDATA","$ISML");
21 | .q
22 | EOF


--------------------------------------------------------------------------------
/figures/ALICE_all.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alisw/MachineLearningHEP/293e78c1c1b15e8f4c712d86c43d48c6e01b359d/figures/ALICE_all.png


--------------------------------------------------------------------------------
/figures/LHCparticle.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alisw/MachineLearningHEP/293e78c1c1b15e8f4c712d86c43d48c6e01b359d/figures/LHCparticle.jpg


--------------------------------------------------------------------------------
/figures/Lambda_peak.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alisw/MachineLearningHEP/293e78c1c1b15e8f4c712d86c43d48c6e01b359d/figures/Lambda_peak.png


--------------------------------------------------------------------------------
/figures/SelectionVar.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alisw/MachineLearningHEP/293e78c1c1b15e8f4c712d86c43d48c6e01b359d/figures/SelectionVar.png


--------------------------------------------------------------------------------
/figures/SideBands.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alisw/MachineLearningHEP/293e78c1c1b15e8f4c712d86c43d48c6e01b359d/figures/SideBands.png


--------------------------------------------------------------------------------
/machine_learning_hep/README.md:
--------------------------------------------------------------------------------
1 | # Instructions
2 | 
3 | To run the package: Set your preferences in `submission/default_complete.yml` and run:
4 | ```
5 | mlhep -r submission/default_complete.yml -d data/database_ml_parameters_[CASE].yml
6 | ```
7 | 
8 | ..Instructions to be completed..
9 | 


--------------------------------------------------------------------------------
/machine_learning_hep/__init__.py:
--------------------------------------------------------------------------------
 1 | #############################################################################
 2 | ##  © Copyright CERN 2018. All rights not expressly granted are reserved.  ##
 3 | ##                 Author: Gian.Michele.Innocenti@cern.ch                  ##
 4 | ## This program is free software: you can redistribute it and/or modify it ##
 5 | ##  under the terms of the GNU General Public License as published by the  ##
 6 | ## Free Software Foundation, either version 3 of the License, or (at your  ##
 7 | ## option) any later version. This program is distributed in the hope that ##
 8 | ##  it will be useful, but WITHOUT ANY WARRANTY; without even the implied  ##
 9 | ##     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.    ##
10 | ##           See the GNU General Public License for more details.          ##
11 | ##    You should have received a copy of the GNU General Public License    ##
12 | ##   along with this program. if not, see <https://www.gnu.org/licenses/>. ##
13 | #############################################################################
14 | 


--------------------------------------------------------------------------------
/machine_learning_hep/__main__.py:
--------------------------------------------------------------------------------
 1 | #############################################################################
 2 | ##  © Copyright CERN 2023. All rights not expressly granted are reserved.  ##
 3 | ##                                                                         ##
 4 | ## This program is free software: you can redistribute it and/or modify it ##
 5 | ##  under the terms of the GNU General Public License as published by the  ##
 6 | ## Free Software Foundation, either version 3 of the License, or (at your  ##
 7 | ## option) any later version. This program is distributed in the hope that ##
 8 | ##  it will be useful, but WITHOUT ANY WARRANTY; without even the implied  ##
 9 | ##     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.    ##
10 | ##           See the GNU General Public License for more details.          ##
11 | ##    You should have received a copy of the GNU General Public License    ##
12 | ##   along with this program. if not, see <https://www.gnu.org/licenses/>. ##
13 | #############################################################################
14 | 
15 | import sys
16 | 
17 | from machine_learning_hep.steer_analysis import main
18 | # Run the steering entry point and pass its return value to sys.exit.
19 | sys.exit(main())
20 | 


--------------------------------------------------------------------------------
/machine_learning_hep/analysis/README.md:
--------------------------------------------------------------------------------
  1 | # Analysis and systematics
  2 | 
  3 | ## Overview
  4 | 
  5 | First of all, everything in here is basically an **Analyzer**. These objects can be handled by an `AnalyzerManager`.
  6 | 
  7 | 
  8 | ## Applying additional analysis cuts
  9 | 
 10 | In order to place additional cuts before a mass histogram is filled, those have to be set in the corresponding analysis section in the database. There, one cut must be put per analysis pT bin. If no cut should be applied in a bin, just put `Null`; otherwise, cuts are formulated as strings which are directly used in a `pandas.DataFrame.query`, meaning that all names used **must** exist as a column in the dataframe in the analysis. The flag `use_cuts` controls whether the cuts should be applied or not. An example implementation in the database could look like
 11 | 
 12 | ```yaml
 13 | # within an analysis section, assuming 4 pT bins
 14 |   use_cuts: True
 15 |   cuts:
 16 |     - "p_prong0 > 2 or p_prong1 < 1"
 17 |     - Null
 18 |     - "abs(eta_cand) < 1.2"
 19 |     - Null
 20 | ```
 21 | 
 22 | The cuts can then be accessed in `processer_<type>.process_histomass_single`. The database flag `use_cuts` is translated into the member `self.do_custom_analysis_cuts`, which should be checked to be `True` before applying any cuts in order not to circumvent its purpose. Then, there is a helper function in `Processer` so if you have a dataframe corresponding to a certain pT bin, you can just do (the code snippet is shown after the following section)
 23 | 
 24 | ## Using efficiencies from another analysis
 25 | 
 26 | To use the efficiencies from another analysis for a certain multiplicity bin one can use the fields `path_eff` and `mult_bin_eff` when using the analyzer class `AnalyzerDhadrons_mult`. When using this feature, both fields have to contain a list as long as the number of multiplicity bins. The first list lists the corresponding file to be used and  the second list entries are integers referring to the i'th multiplicity bin histogram inside the file. `null` entries can be used to use the efficiencies of this very analysis multiplicity bin (which is of course the default when none of the lists is present).
 27 | 
 28 | 
 29 | ```python
 30 | if self.do_custom_analysis_cuts:
 31 |     df = self.apply_cuts_ptbin(df, ipt)
 32 | 
 33 | ```
 34 | 
 35 | which would apply the cuts defined for the `ipt`'th bin and returns the skimmed dataframe. Nothing is done when there was no cut defined and you would just get back the dataframe you put in.
 36 | 
 37 | ## Analysis and systematic implementation and workflow
 38 | 
 39 | A specific analysis or systematics is derived from `Analyzer`. This `AnalyzerDerived` can then implement any analysis step method. Note, that passing arguments to those methods is at the moment not supported. However, as they have access to the entire configuration via the database dictionary, this will probably not be needed as all specifics can be derived from that database.
 40 | You can try to run the step `ana_step` on an object of `AnalyzerDerived` by doing
 41 | 
 42 | ```python
 43 | method_executed = analyzer_derived.analysis_step("ana_step")
 44 | ```
 45 | 
 46 | If a method with this name is found, it will be executed. If not, this returns `False`. (Of course, `analyzer_derived.ana_step()` works just as well...).
 47 | 
 48 | `Analyzer` objects can be wrapped into an `AnalyzerManager` which has the following constructor
 49 | 
 50 | ```python
 51 | class AnalyzerManager:
 52 |     def __init__(self, ana_class, database, case, typean, doperiodbyperiod, *args):
 53 |         ...
 54 | ```
 55 | 
 56 | where `ana_class` is the class deriving from `Analyzer` which should be instantiated, and `database` is the dictionary derived from the `database_ml_parameters_<particle>.yml` and hence contains all information needed. `case` is passed for backwards-compatibility in some sense but might become obsolete at some point as it is basically the particle name and not used to specifically run/determine anything. On the other hand, `typean` is the analysis to be run and needed to pick up the correct parameters from `database` as they might differ from analysis to analysis. The boolean variable `doperiodbyperiod` specifies whether analyses should be run for each single data-taking period specified in `database`. `*args` gives the opportunity to specify additional arguments which should be passed to each `Analyzer`.
 57 | 
 58 | Say now, we have a class `AwesomeAnalyzer` deriving from `Analyzer` so we can do the following
 59 | 
 60 | ```python
 61 | ana_mgr = AnalyzerManager(AwesomeAnalyzer, database, case, typean, doperiodbyperiod, add_arg1, add_arg2)
 62 | ana_steps = ["fit", "make_cross_section", "plot_all"]
 63 | 
 64 | # The steps are passed as one single list since `analyze` takes exactly one argument...
 65 | ana_mgr.analyze(ana_steps)
 66 | 
 67 | # ...so you could also do...
 68 | ana_mgr.analyze(["fit", "make_cross_section", "plot_all"])
 69 | 
 70 | # ...or only
 71 | ana_mgr.analyze(["summarize"])
 72 | ```
 73 | 
 74 | Note that the analysis steps are executed in the exact same order they were passed. The `AnalyzerManager` then first loops over each step and inside this loop it loops over all `Analyzer`s. The other way round might be dangerous as some steps might depend on others and it could be the case that this is not accounted for in the specific implementation of the `AwesomeAnalyzer` (ok, then it might be not so awesome...).
 75 | 
 76 | In the very same way it works with systematics. A class handling those would as well derive from `Analyzer` and it can hence as well be treated by an `AnalyzerManager` object in the very same way.
 77 | 
 78 | ## Implementing an after-burner
 79 | 
 80 | If `doperiodbyperiod` was enabled, it might be necessary to run some final jobs, for instance in order to merge data. Therefore, `Analyzer` implements a method `get_after_burner(self)` which, in the default implementation, just returns `None`. If so, the `AnalyzerManager` is smart enough to not run any after-burner steps. If the method is implemented, an object deriving from `AnalyzerAfterBurner` is expected (which in turn derives actually from `Analyzer`).
 81 | 
 82 | Every time a specific analysis step has been run for all period-analyses, the after-burner is invoked, basically by
 83 | 
 84 | ```python
 85 | after_burner.analysis_step("fit")
 86 | ```
 87 | Classes deriving from `AnalyzerAfterBurner` have access to all per-period `Analyzer`s through the member list `analyzers`. Hence, they can access all of them in the corresponding method. There are two things to note here:
 88 | 
 89 | 1. The after-burner is called for each analysis step; however, it is safe if a step is not implemented there - simply nothing happens.
 90 | 2. To be meaningful, the after-burner method has to have the same name as the individual analysis step done before.
 91 | 
 92 | One use-case of the after-burner is for example the class `Systematics` in `systematics.py` which at the moment does systematic studies of the ML working point (basically a variation) and of the MC pT shape.
 93 | 
 94 | ## Implementing an Analyzer
 95 | 
 96 | Any analyzer or systematic class derives from `Analyzer`. That means, you start off like this
 97 | 
 98 | ```python
 99 | from machine_learning_hep.analysis.analyzer import Analyzer
100 | 
101 | class AwesomeAnalyzer(Analyzer):
102 |     def __init__(self, datap, case, typean, period, few, more, arguments):
103 |         super().__init__(datap, case, typean, period)
104 | 
105 |     # awesome implementations
106 | ```
107 | 
108 | It is required that the base class gets the database dictionary, the particle case, the analysis type and the period. Hence, these four arguments need to correspond to the first four arguments of your `AwesomeAnalyzer`. After the base class's `__init__` has been called these are automatically available in your `AwesomeAnalyzer` as class members
109 | 
110 | * `self.datap`
111 | * `self.case`
112 | * `self.typean`
113 | * `self.period`
114 | 
115 | In addition there is also a logger in `self.logger` you can use to issue more important output for the user.
116 | 
117 | `self.period` will have the period number ranging from `0` to `n_period - 1`. It is `None` if this analyzer has to expect period-merged input. This info can be used if, for instance, a method should only be executed for a certain period or a period-merged analysis. At the beginning of such a method you might write
118 | 
119 | ```python
120 |     def my_analysis_step(self):
121 |         if self.period is None:
122 |             return
123 |         # Following implementation only run for per-period run
124 | ```
125 | 
126 | 
127 | 
128 | 


--------------------------------------------------------------------------------
/machine_learning_hep/analysis/__init__.py:
--------------------------------------------------------------------------------
 1 | #############################################################################
 2 | ##  © Copyright CERN 2018. All rights not expressly granted are reserved.  ##
 3 | ##                 Author: Gian.Michele.Innocenti@cern.ch                  ##
 4 | ## This program is free software: you can redistribute it and/or modify it ##
 5 | ##  under the terms of the GNU General Public License as published by the  ##
 6 | ## Free Software Foundation, either version 3 of the License, or (at your  ##
 7 | ## option) any later version. This program is distributed in the hope that ##
 8 | ##  it will be useful, but WITHOUT ANY WARRANTY; without even the implied  ##
 9 | ##     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.    ##
10 | ##           See the GNU General Public License for more details.          ##
11 | ##    You should have received a copy of the GNU General Public License    ##
12 | ##   along with this program. if not, see <https://www.gnu.org/licenses/>. ##
13 | #############################################################################
14 | 


--------------------------------------------------------------------------------
/machine_learning_hep/analysis/analyzer.py:
--------------------------------------------------------------------------------
 1 | #############################################################################
 2 | ##  © Copyright CERN 2023. All rights not expressly granted are reserved.  ##
 3 | ##                 Author: Gian.Michele.Innocenti@cern.ch                  ##
 4 | ## This program is free software: you can redistribute it and/or modify it ##
 5 | ##  under the terms of the GNU General Public License as published by the  ##
 6 | ## Free Software Foundation, either version 3 of the License, or (at your  ##
 7 | ## option) any later version. This program is distributed in the hope that ##
 8 | ##  it will be useful, but WITHOUT ANY WARRANTY; without even the implied  ##
 9 | ##     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.    ##
10 | ##           See the GNU General Public License for more details.          ##
11 | ##    You should have received a copy of the GNU General Public License    ##
12 | ##   along with this program. if not, see <https://www.gnu.org/licenses/>. ##
13 | #############################################################################
14 | 
15 | import os
16 | from os import makedirs
17 | from os.path import exists, join
18 | 
19 | from machine_learning_hep.io import dump_yaml_from_dict
20 | 
21 | # HF specific imports
22 | from machine_learning_hep.workflow.workflow_base import WorkflowBase
23 | 
24 | 
class Analyzer(WorkflowBase):
    """Base class of analyzers.

    On construction it makes sure that the MC and data result directories of the analysis
    exist and dumps the database as YAML into the data result directory.
    """

    def __init__(self, datap, case, typean, period):
        """
        Args:
            datap: database dictionary. ``datap["analysis"][typean]`` must provide the
                sections "mc" and "data", each with the per-period list "results" and the
                entry "resultsallp"; "prefix_dir_res" is optional.
            case: used as top-level key and in the file name of the dumped database
            typean: name of the analysis section in the database
            period: index into the "results" lists, or None to use "resultsallp"
        """
        super().__init__(datap, case, typean, period)

        # The only thing here is to dump the database in the data analysis directory
        for mcordata in ("mc", "data"):
            dp = datap["analysis"][typean][mcordata]
            prefix_dir_res = dp.get("prefix_dir_res", "")
            # Per-period directory if a period is given, otherwise the period-merged one
            results_path = dp["results"][period] if period is not None else dp["resultsallp"]
            results_dir = prefix_dir_res + os.path.expandvars(results_path)
            if not exists(results_dir):
                # Create output directories in case they do not exist. exist_ok covers the
                # case that another job creates the directory between check and creation.
                makedirs(results_dir, exist_ok=True)
            if mcordata == "data":
                dump_yaml_from_dict({case: datap}, join(results_dir, f"database_{case}_{typean}.yml"))
43 | 
44 | 
class AnalyzerAfterBurner(WorkflowBase):
    """Workflow object which is constructed without a period (``None`` is passed as period).

    The attributes ``analyzers`` and ``analyzer_merged`` start out as ``None`` and are meant
    to be filled from outside after construction.
    """

    def __init__(self, datap, case, typean):
        """
        Args:
            datap: database dictionary, forwarded to the base class
            case: forwarded to the base class
            typean: analysis name, forwarded to the base class
        """
        # No period is associated with an after-burner
        super().__init__(datap, case, typean, None)

        # Placeholders until the owning code assigns the actual objects
        self.analyzers = self.analyzer_merged = None
51 | 


--------------------------------------------------------------------------------
/machine_learning_hep/analysis/analyzer_manager.py:
--------------------------------------------------------------------------------
  1 | #  © Copyright CERN 2018. All rights not expressly granted are reserved.  #
  2 | #                 Author: Gian.Michele.Innocenti@cern.ch                  #
  3 | # This program is free software: you can redistribute it and/or modify it #
  4 | #  under the terms of the GNU General Public License as published by the  #
  5 | # Free Software Foundation, either version 3 of the License, or (at your  #
  6 | # option) any later version. This program is distributed in the hope that #
  7 | #  it will be useful, but WITHOUT ANY WARRANTY; without even the implied  #
  8 | #     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.    #
  9 | #           See the GNU General Public License for more details.          #
 10 | #    You should have received a copy of the GNU General Public License    #
 11 | #   along with this program. if not, see <https://www.gnu.org/licenses/>. #
 12 | 
 13 | from machine_learning_hep.logger import get_logger
 14 | 
 15 | 
 16 | # pylint: disable=too-many-instance-attributes
 17 | class AnalyzerManager:
 18 |     """
 19 |     Manager class handling analysis and systematic objects
 20 |     """
 21 | 
 22 |     def __init__(self, ana_class, database, case, typean, doperiodbyperiod, *args):
 23 |         self.ana_class = ana_class
 24 |         self.database = database
 25 |         self.case = case
 26 |         self.typean = typean
 27 |         self.doperiodbyperiod = doperiodbyperiod
 28 | 
 29 |         # Additional arguments to be forwarded to the analyzers
 30 |         self.add_args = args
 31 | 
 32 |         self.logger = get_logger()
 33 | 
 34 |         self.analyzers = []
 35 |         self.after_burner = None
 36 | 
 37 |         self.is_initialized = False
 38 | 
 39 |     def get_analyzers(self, none_for_unused_period=True):
 40 |         self.initialize()
 41 |         if not none_for_unused_period:
 42 |             return self.analyzers
 43 | 
 44 |         useperiod = self.database["analysis"][self.typean]["useperiod"]
 45 |         analyzers = [None] * (len(useperiod) + 1)
 46 |         for a in self.analyzers:
 47 |             if a.period is not None:
 48 |                 analyzers[a.period] = a
 49 |         analyzers[-1] = self.analyzers[-1]
 50 |         return analyzers
 51 | 
 52 |     def initialize(self):
 53 |         """
 54 |         Collect all analyzer objects required in a list and initialises the after_burner if present
 55 |         """
 56 | 
 57 |         if self.is_initialized:
 58 |             return
 59 | 
 60 |         self.logger.info("Initialize analyzer manager for analyzer %s", self.ana_class.__name__)
 61 | 
 62 |         useperiod = self.database["analysis"][self.typean]["useperiod"]
 63 | 
 64 |         for ip, period in enumerate(useperiod):
 65 |             if self.doperiodbyperiod and period:
 66 |                 self.analyzers.append(self.ana_class(self.database, self.case, self.typean, ip, *self.add_args))
 67 |         self.analyzers.append(self.ana_class(self.database, self.case, self.typean, None, *self.add_args))
 68 | 
 69 |         if self.doperiodbyperiod:
 70 |             # get after-burner, if any
 71 |             self.after_burner = self.analyzers[-1].get_after_burner()
 72 |             if self.after_burner:
 73 |                 self.after_burner.analyzers = self.analyzers[:-1]
 74 |                 self.after_burner.analyzer_merged = self.analyzers[-1]
 75 | 
 76 |         self.is_initialized = True
 77 | 
 78 |     def analyze(self, ana_steps):
 79 |         """
 80 |         Gives a list of analyzers and analysis steps do each step for each analyzer
 81 |         Args:
 82 |             ana_steps: list of analysis steps as strings
 83 |         """
 84 | 
 85 |         if not ana_steps:
 86 |             self.logger.info("No analysis steps to be done for Analyzer class %s. Return...", self.ana_class.__name__)
 87 |             return
 88 | 
 89 |         self.initialize()
 90 | 
 91 |         self.logger.info(
 92 |             "Run all registered analyzers of type %s for following analysis steps: %s",
 93 |             self.ana_class.__name__,
 94 |             ana_steps,
 95 |         )
 96 | 
 97 |         # Collect potentially failed systematic steps
 98 |         failed_steps = []
 99 |         failed_steps_after_burner = []
100 |         for step in ana_steps:
101 |             if self.doperiodbyperiod:
102 |                 for analyzer in self.analyzers[:-1]:
103 |                     if not analyzer.step(step):
104 |                         failed_steps.append((analyzer.__class__.__name__, step))
105 |                         # If analysis step could not be found here,
106 |                         # we don't need to go on trying this steps since all analyzers are of the
107 |                         # same class
108 |                         break
109 | 
110 |                 # Run after-burner if one was provided by the analyzer object
111 |                 if self.after_burner and not self.after_burner.step(step):
112 |                     failed_steps_after_burner.append((self.after_burner.__class__.__name__, step))
113 | 
114 |             # Do analysis step for period-merged analyzer
115 |             self.analyzers[-1].step(step)
116 | 
117 |         if failed_steps:
118 |             self.logger.error("Following analysis steps could not be found:")
119 |             for fs in failed_steps:
120 |                 print(f"Analyzer class: {fs[0]}, analysis step: {fs[1]}")
121 | 


--------------------------------------------------------------------------------
/machine_learning_hep/analysis/utils.py:
--------------------------------------------------------------------------------
 1 | #############################################################################
 2 | ##  © Copyright CERN 2023. All rights not expressly granted are reserved.  ##
 3 | ##                 Author: Gian.Michele.Innocenti@cern.ch                  ##
 4 | ## This program is free software: you can redistribute it and/or modify it ##
 5 | ##  under the terms of the GNU General Public License as published by the  ##
 6 | ## Free Software Foundation, either version 3 of the License, or (at your  ##
 7 | ## option) any later version. This program is distributed in the hope that ##
 8 | ##  it will be useful, but WITHOUT ANY WARRANTY; without even the implied  ##
 9 | ##     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.    ##
10 | ##           See the GNU General Public License for more details.          ##
11 | ##    You should have received a copy of the GNU General Public License    ##
12 | ##   along with this program. if not, see <https://www.gnu.org/licenses/>. ##
13 | #############################################################################
14 | 
15 | import tempfile
16 | from os.path import join
17 | 
18 | from machine_learning_hep.logger import get_logger
19 | from machine_learning_hep.utilities import mergerootfiles
20 | 
21 | 
def multi_preparenorm(database, typean, doperiodbyperiod):
    """Copy the per-period "correctionsweights.root" files and merge them.

    For each entry of the data "results" list of the analysis, the file found in the
    corresponding validation directory is passed through ``mergerootfiles`` into the
    result directory. Afterwards, the files of the used periods (only collected if
    ``doperiodbyperiod`` is set) are merged into the "resultsallp" directory.

    Args:
        database: database dictionary; reads ``database["validation"]["data"]["dir"]`` and
            "results", "resultsallp" and "useperiod" of ``database["analysis"][typean]``
        typean: name of the analysis section in the database
        doperiodbyperiod: whether per-period files are collected for the final merge
    """
    logger = get_logger()
    filename = "correctionsweights.root"

    validation_dirs = database["validation"]["data"]["dir"]
    resultsdata = database["analysis"][typean]["data"]["results"]

    # (origin, target) file pairs; zip stops at the shorter of the two lists
    file_pairs = [
        (join(val_dir, filename), join(res_dir, filename))
        for res_dir, val_dir in zip(resultsdata, validation_dirs)
    ]

    f_normmerged = join(database["analysis"][typean]["data"]["resultsallp"], filename)

    files_to_merge = []
    useperiod = database["analysis"][typean]["useperiod"]

    with tempfile.TemporaryDirectory() as tmp_merged_dir:
        for indexp, _ in enumerate(resultsdata):
            origin, target = file_pairs[indexp]
            logger.info("Origin path: %s, target path: %s", origin, target)
            mergerootfiles([origin], target, tmp_merged_dir)
            if doperiodbyperiod and useperiod[indexp]:
                files_to_merge.append(target)

        # NOTE(review): the list is empty if doperiodbyperiod is not set - confirm that
        # mergerootfiles is fine with an empty input list
        mergerootfiles(files_to_merge, f_normmerged, tmp_merged_dir)
47 | 


--------------------------------------------------------------------------------
/machine_learning_hep/bitwise.py:
--------------------------------------------------------------------------------
 1 | #############################################################################
 2 | ##  © Copyright CERN 2023. All rights not expressly granted are reserved.  ##
 3 | ##                 Author: Gian.Michele.Innocenti@cern.ch                  ##
 4 | ## This program is free software: you can redistribute it and/or modify it ##
 5 | ##  under the terms of the GNU General Public License as published by the  ##
 6 | ## Free Software Foundation, either version 3 of the License, or (at your  ##
 7 | ## option) any later version. This program is distributed in the hope that ##
 8 | ##  it will be useful, but WITHOUT ANY WARRANTY; without even the implied  ##
 9 | ##     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.    ##
10 | ##           See the GNU General Public License for more details.          ##
11 | ##    You should have received a copy of the GNU General Public License    ##
12 | ##   along with this program. if not, see <https://www.gnu.org/licenses/>. ##
13 | #############################################################################
14 | 
15 | """
16 | Methods to: perform bitwise operations on dataframes
17 | """
18 | 
19 | import operator
20 | from functools import reduce
21 | 
22 | import numpy as np
23 | 
24 | from .logger import get_logger
25 | 
26 | 
def tag_bit_df(dfin, namebitmap, activatedbit, absval=False):
    """Tag the rows of a dataframe according to the bits set in a bitmap column.

    Args:
        dfin: dataframe holding the bitmap column
        namebitmap: name of the bitmap column, converted to an integer array
        activatedbit: pair of iterables of bit positions; the bits of the first one all
            have to be set, the bits of the second one all have to be unset
        absval: if True, the absolute value of the bitmap is tested

    Returns:
        boolean array with one entry per row, True where both conditions hold

    Raises:
        Any exception of the underlying operations is logged and re-raised.
    """

    def _bitmask(bits):
        # OR of the single-bit masks; no bits give an empty mask (0)
        return reduce(operator.or_, ((1 << bit) for bit in bits), 0)

    try:
        bitmap = dfin[namebitmap].to_numpy(dtype="int")
        if absval:
            bitmap = abs(bitmap)
        mask_on, mask_off = _bitmask(activatedbit[0]), _bitmask(activatedbit[1])
        all_required_set = np.bitwise_and(bitmap, mask_on) == mask_on
        none_forbidden_set = np.bitwise_and(bitmap, mask_off) == 0
        return np.logical_and(all_required_set, none_forbidden_set)
    except Exception:
        get_logger().exception("%s, %s", dfin, namebitmap)
        raise
38 | 
39 | 
def filter_bit_df(dfin, namebitmap, activatedbit):
    """Return the part of ``dfin`` selected by the mask from ``tag_bit_df``.

    Args:
        dfin: dataframe holding the bitmap column
        namebitmap: name of the bitmap column
        activatedbit: pair of bit-position iterables, passed on to ``tag_bit_df``
    """
    selection = tag_bit_df(dfin, namebitmap, activatedbit)
    return dfin[selection]
42 | 


--------------------------------------------------------------------------------
/machine_learning_hep/clean.sh:
--------------------------------------------------------------------------------
 1 | #############################################################################
 2 | ##  © Copyright CERN 2018. All rights not expressly granted are reserved.  ##
 3 | ##                 Author: Gian.Michele.Innocenti@cern.ch                  ##
 4 | ## This program is free software: you can redistribute it and/or modify it ##
 5 | ##  under the terms of the GNU General Public License as published by the  ##
 6 | ## Free Software Foundation, either version 3 of the License, or (at your  ##
 7 | ## option) any later version. This program is distributed in the hope that ##
 8 | ##  it will be useful, but WITHOUT ANY WARRANTY; without even the implied  ##
 9 | ##     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.    ##
10 | ##           See the GNU General Public License for more details.          ##
11 | ##    You should have received a copy of the GNU General Public License    ##
12 | ##   along with this program. if not, see <https://www.gnu.org/licenses/>. ##
13 | #############################################################################
14 | 
15 | 
# Remove build leftovers, caches and outputs of previous runs from this directory.
# All removals use -f so that files which do not exist are skipped silently.

# macOS artefacts
rm -rf -- *exe.dSYM
rm -f plots/.DS_Store .DS_Store ../.DS_Store ../../.DS_Store

# Python byte-code caches
rm -rf __pycache__/ utilities/__pycache__/ ../utilities/__pycache__/

# Plots and outputs
rm -rf plots_* output_*
rm -f -- *.json *.h5
rm -f mlplot*/*.* mlout*/*.*

# Pickle files and directories (this also covers mcpkl*, datapkl* and *pklanalysis*)
rm -rf -- *pkl*
34 | 


--------------------------------------------------------------------------------
/machine_learning_hep/clean_analysis.sh:
--------------------------------------------------------------------------------
 1 | #############################################################################
 2 | ##  © Copyright CERN 2018. All rights not expressly granted are reserved.  ##
 3 | ##                 Author: Gian.Michele.Innocenti@cern.ch                  ##
 4 | ## This program is free software: you can redistribute it and/or modify it ##
 5 | ##  under the terms of the GNU General Public License as published by the  ##
 6 | ## Free Software Foundation, either version 3 of the License, or (at your  ##
 7 | ## option) any later version. This program is distributed in the hope that ##
 8 | ##  it will be useful, but WITHOUT ANY WARRANTY; without even the implied  ##
 9 | ##     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.    ##
10 | ##           See the GNU General Public License for more details.          ##
11 | ##    You should have received a copy of the GNU General Public License    ##
12 | ##   along with this program. if not, see <https://www.gnu.org/licenses/>. ##
13 | #############################################################################
14 | 
15 | 
# Remove analysis outputs from this directory.
# All removals use -f so that files which do not exist are skipped silently.

rm -rf -- *exe.dSYM
rm -f plots/*.pdf*
rm -f outputhisto/*.root
rm -f .DS_Store ../.DS_Store
rm -f -- *.json *.h5
23 | 
24 | 


--------------------------------------------------------------------------------
/machine_learning_hep/clean_results.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Delete per-period result directories.
 4 | 
 5 | dir_root="$1"
 6 | 
 7 | [ "$dir_root" ] || { echo "Error: Empty directory."; exit 1; }
 8 | 
 9 | [ -d "$dir_root" ] || { echo "Error: Directory $dir_root does not exist."; exit 1; }
10 | 
11 | pattern="pp_201*"
12 | dir_found=$(find $dir_root -type d -name $pattern)
13 | 
14 | [ "$dir_found" ] || { echo "Nothing found."; exit 0; }
15 | 
16 | echo "Found these directories to delete:"
17 | for d in $dir_found; do echo $d; done
18 | echo "Do you wish to delete them? (y/n)"
19 | while true; do
20 |   read -p "Answer: " yn
21 |   case $yn in
22 |     [Yy] )
23 |       echo "Deleting"
24 |       for d in $dir_found; do rm -rf $d; done
25 |       break;;
26 |     [Nn] )
27 |       echo "Skipping"; break;;
28 |     * )
29 |       echo "Please answer y or n.";;
30 |   esac
31 | done
32 | 
33 | exit 0
34 | 
35 | 


--------------------------------------------------------------------------------
/machine_learning_hep/config.py:
--------------------------------------------------------------------------------
 1 | #############################################################################
 2 | ##  © Copyright CERN 2018. All rights not expressly granted are reserved.  ##
 3 | ##                 Author: Gian.Michele.Innocenti@cern.ch                  ##
 4 | ## This program is free software: you can redistribute it and/or modify it ##
 5 | ##  under the terms of the GNU General Public License as published by the  ##
 6 | ## Free Software Foundation, either version 3 of the License, or (at your  ##
 7 | ## option) any later version. This program is distributed in the hope that ##
 8 | ##  it will be useful, but WITHOUT ANY WARRANTY; without even the implied  ##
 9 | ##     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.    ##
10 | ##           See the GNU General Public License for more details.          ##
11 | ##    You should have received a copy of the GNU General Public License    ##
12 | ##   along with this program. if not, see <https://www.gnu.org/licenses/>. ##
13 | #############################################################################
14 | 
15 | """
16 | Methods to: update/assert database and run configuration
17 | """
18 | 
19 | from itertools import product
20 | 
21 | from machine_learning_hep.do_variations import modify_dictionary
22 | from machine_learning_hep.logger import get_logger
23 | 
24 | 
25 | # disable pylint unused-argument because this is done already in view of updating the
26 | # database depending on info in there
def update_config(database: dict, run_config: dict, database_overwrite=None):  # pylint: disable=unused-argument
    """Update database before usage

    1. overwrite fields with additional user configuration, if given
    2. if the database flag "doml" is not set, append "_std" to the skimmed pickle paths
       and reset the ML probability cuts
    The content of the database is adjusted in place ==> no return value

    Args:
        database: dict
            input database as read from YAML; the content under its first key is updated
        run_config: dict
            input run configuration as read from default_<stage>.yaml (currently not used)
        database_overwrite: dict (optional)
            substructure corresponding to database, used to overwrite
            corresponding fields in database
    """

    log = get_logger()

    # The actual database sits under the first top-level key (the case)
    case = list(database.keys())[0]
    database = database[case]

    # First overwrite as required by the user
    if database_overwrite:
        log.info("Updating database fields with custom user input")
        modify_dictionary(database, database_overwrite, True)

    # Nothing else to adjust for an ML analysis
    if database["doml"]:
        return

    log.info("Not an ML analysis, adjust paths and settings accordingly")

    def neutral_cuts(pcut):
        # One cut value per entry of pcut: all 0, except for MultiClassification,
        # where the first entry (bkg cut) is set to 1 which is the loosest one
        return [1 if i == 0 and database["ml"]["mltype"] == "MultiClassification" else 0 for i in range(len(pcut))]

    data_mc = ("data", "mc")
    pkl_keys = ("pkl_skimmed_dec", "pkl_skimmed_decmerged")

    # Append "_std" to paths where necessary; the lists are modified in place
    for sample, pkl_key in product(data_mc, pkl_keys):
        database["mlapplication"][sample][pkl_key][:] = [
            f"{path}_std" for path in database["mlapplication"][sample][pkl_key]
        ]

    # Reset the ML working points
    for sample in data_mc:
        database["mlapplication"]["probcutpresel"][sample] = [
            neutral_cuts(pcut) for pcut in database["mlapplication"]["probcutpresel"][sample]
        ]
    database["mlapplication"]["probcutoptimal"] = [
        neutral_cuts(pcut) for pcut in database["mlapplication"]["probcutoptimal"]
    ]
77 | 


--------------------------------------------------------------------------------
/machine_learning_hep/data/__init__.py:
--------------------------------------------------------------------------------
 1 | #############################################################################
 2 | ##  © Copyright CERN 2018. All rights not expressly granted are reserved.  ##
 3 | ##                 Author: Gian.Michele.Innocenti@cern.ch                  ##
 4 | ## This program is free software: you can redistribute it and/or modify it ##
 5 | ##  under the terms of the GNU General Public License as published by the  ##
 6 | ## Free Software Foundation, either version 3 of the License, or (at your  ##
 7 | ## option) any later version. This program is distributed in the hope that ##
 8 | ##  it will be useful, but WITHOUT ANY WARRANTY; without even the implied  ##
 9 | ##     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.    ##
10 | ##           See the GNU General Public License for more details.          ##
11 | ##    You should have received a copy of the GNU General Public License    ##
12 | ##   along with this program. if not, see <https://www.gnu.org/licenses/>. ##
13 | #############################################################################
14 | 


--------------------------------------------------------------------------------
/machine_learning_hep/data/config_model_parameters.yml:
--------------------------------------------------------------------------------
  1 | BinaryClassification:
  2 | 
  3 |   keras:
  4 |     keras_classifier:
  5 |       activate: False
  6 |       layers:
  7 |         - {"n_nodes": 12, "activation": "relu"}
  8 |       optimizer: "adam"
  9 |       loss: "binary_crossentropy"
 10 |       epochs: 30
 11 |       batch_size: 50
 12 | 
 13 |   scikit:
 14 |     scikit_random_forest_classifier:
 15 |       activate: False
 16 |       central_params:
 17 |         max_depth: 5
 18 |         n_estimators: 10
 19 |         max_features: 1
 20 | 
 21 |       grid_search:
 22 |         params:
 23 |           n_estimators: [3, 10, 50, 100]
 24 |           max_features: [2,4,6,8]
 25 |           max_depth: [1,4]
 26 |         refit: AUC
 27 |         scoring: ["AUC", "Accuracy"]
 28 | 
 29 |     scikit_adaboost_classifier:
 30 |       activate: False
 31 |       central_params: {}
 32 |         #max_depth: 3       # 1 default
 33 |         #n_estimators: 50   # 50 default
 34 |         #learning_rate: 0.5 # 1 default
 35 | 
 36 |       grid_search:
 37 |         params:
 38 |           n_estimators: [3, 10, 50, 100]
 39 |           learning_rate: [0.1,0.5,0.9]
 40 |         refit: AUC
 41 |         scoring: ["AUC", "Accuracy"]
 42 | 
 43 |     scikit_decision_tree_classifier:
 44 |       activate: False
 45 |       central_params:
 46 |         max_depth: 5
 47 | 
 48 | #not default parameters
 49 |   xgboost:
 50 |     xgboost_classifier:
 51 |       activate: True
 52 |       central_params:
 53 |         max_depth: 3
 54 |         learning_rate: 0.1
 55 |         n_estimators: 850
 56 |         objective: 'binary:logistic'
 57 |         n_jobs: 10
 58 |         gamma: 0.
 59 |         min_child_weight: 3
 60 |         subsample: 0.8
 61 |         colsample_bytree: 0.8
 62 |         colsample_bynode: 1
 63 |         random_state: 0
 64 |         tree_method: 'hist'
 65 |       #      early_stopping_rounds: 10
 66 |       grid_search:
 67 |         params:
 68 |           min_child_weight: [3] #[1, 3]
 69 |           max_depth: [2] #[3, 6]
 70 |           gamma: [0.2]
 71 |           subsample: [0.8] #[0.6, 0.8, 0.9]
 72 |           colsample_bytree: [0.8] #[0.6, 0.8, 0.9]
 73 |           learning_rate: [0.05, 0.15] #[0.05, 0.1, 0.5]
 74 |           n_estimators: [200, 300] #[500, 800, 1000]
 75 |           objective: ["binary:logistic"]
 76 |         refit: AUC
 77 |         scoring: ["AUC", "Accuracy"]
 78 | 
 79 | 
 80 | MultiClassification:
 81 | 
 82 |   keras:
 83 |     keras_classifier:
 84 |       activate: False
 85 |       layers:
 86 |         - {"n_nodes": 12, "activation": "relu"}
 87 |       optimizer: "adam"
 88 |       loss: "binary_crossentropy"
 89 |       epochs: 30
 90 |       batch_size: 50
 91 | 
 92 |   scikit:
 93 |     scikit_random_forest_classifier:
 94 |       activate: False
 95 |       central_params:
 96 |         max_depth: 5
 97 |         n_estimators: 10
 98 |         max_features: 1
 99 | 
100 |       grid_search:
101 |         params:
102 |           n_estimators: [3, 10, 50, 100]
103 |           max_features: [2,4,6,8]
104 |           max_depth: [1,4]
105 |         refit: AUC
106 |         scoring: ["AUC", "Accuracy"]
107 | 
108 |     scikit_adaboost_classifier:
109 |       activate: False
110 |       central_params: {}
111 |         #max_depth: 3       # 1 default
112 |         #n_estimators: 50   # 50 default
113 |         #learning_rate: 0.5 # 1 default
114 | 
115 |       grid_search:
116 |         params:
117 |           n_estimators: [3, 10, 50, 100]
118 |           learning_rate: [0.1,0.5,0.9]
119 |         refit: AUC
120 |         scoring: ["AUC", "Accuracy"]
121 | 
122 |     scikit_decision_tree_classifier:
123 |       activate: False
124 |       central_params:
125 |         max_depth: 5
126 | 
127 | #not default parameters
128 |   xgboost:
129 |     xgboost_classifier:
130 |       activate: True
131 |       central_params:
132 |         max_depth: 3
133 |         learning_rate: 0.1
134 |         n_estimators: 850
135 |         objective: 'multi:softprob'
136 |         n_jobs: 10
137 |         gamma: 0.
138 |         min_child_weight: 3
139 |         subsample: 0.8
140 |         colsample_bytree: 0.8
141 |         colsample_bynode: 1
142 |         random_state: 0
143 |         tree_method: 'hist'
144 |       #      early_stopping_rounds: 10
145 |       grid_search:
146 |         params:
147 |           min_child_weight: [3] #[1, 3]
148 |           max_depth: [2] #[3, 6]
149 |           gamma: [0.2]
150 |           subsample: [0.8] #[0.6, 0.8, 0.9]
151 |           colsample_bytree: [0.8] #[0.6, 0.8, 0.9]
152 |           learning_rate: [0.05, 0.15] #[0.05, 0.1, 0.5]
153 |           n_estimators: [200, 300] #[500, 800, 1000]
154 |           objective: ["multi:softprob"]
155 |         refit: AUC
156 |         scoring: ["AUC", "Accuracy"]
157 | 
158 | 
159 | Regression:
160 | 
161 |   scikit:
162 | 
163 |     scikit_linear_regression:
164 |       activate: True
165 |       central_params: {}
166 | 
167 |     scikit_ridge_regression:
168 |       activate: True
169 |       central_params:
170 |         alpha: 1
171 |         solver: "cholesky"
172 | 
173 |     scikit_lasso_regression:
174 |       activate: True
175 |       central_params:
176 |         alpha: 0.1
177 | 


--------------------------------------------------------------------------------
/machine_learning_hep/data/config_run_parameters.yml:
--------------------------------------------------------------------------------
  1 | nevt_sig:
  2 |   default: 1000
  3 |   type_as: [42]
  4 | nevt_bkg:
  5 |   default: 1000
  6 |   type_as: [42]
  7 | 
  8 | mltype:
  9 |   choices: ["BinaryClassification", "Regression"]
 10 |   default: "BinaryClassification"
 11 | 
 12 | mlsubtype:
 13 |   default: "HFmeson"
 14 | 
 15 | case:
 16 |   choices: ["Dplus", "Ds", "Dzero", "Dstar", "LctopKpi", "LctopK0s", "PIDKaon", "PIDPion", "hypertritium", "lightquarkjet" ]
 17 |   default: "Dzero"
 18 | 
 19 | usefileserver:
 20 |   default: False
 21 | 
 22 | binmin:
 23 |   default: 2
 24 |   type_as: [42., 42]
 25 | 
 26 | binmax:
 27 |   default: 4
 28 |   type_as: [42., 42]
 29 | 
 30 | test_frac:
 31 |   default: 0.2
 32 |   type_as: [42., 42]
 33 | 
 34 | rnd_splt:
 35 |   default: 12
 36 |   type_as: [42]
 37 | 
 38 | rnd_shuffle:
 39 |  default: 12
 40 |  type_as: [42]
 41 | 
 42 | nkfolds:
 43 |   default: 5
 44 |   type_as: [42]
 45 | 
 46 | ncores:
 47 |   default: -1
 48 |   type_as: [42]
 49 | 
 50 | loadsampleoption:
 51 |   default: True
 52 |   type_as: [True]
 53 | 
 54 | docorrelation:
 55 |   default: False
 56 |   type_as: [True]
 57 | 
 58 | dostandard:
 59 |   default: False
 60 |   type_as: [True]
 61 | 
 62 | dopca:
 63 |   default: False
 64 |   type_as: [True]
 65 | 
 66 | dotraining:
 67 |   default: True
 68 |   type_as: [True]
 69 | 
 70 | dotesting:
 71 |   default: True
 72 |   type_as: [True]
 73 | 
 74 | applytodatamc:
 75 |   default: True
 76 |   type_as: [True]
 77 | 
 78 | docrossvalidation:
 79 |   default: True
 80 |   type_as: [True]
 81 | 
 82 | dolearningcurve:
 83 |   default: True
 84 |   type_as: [True]
 85 | 
 86 | doROC:
 87 |   default: True
 88 |   type_as: [True]
 89 |   depends:
 90 |     parameter: "mltype"
 91 |     value: "Regression"
 92 |     set: False
 93 | 
 94 | doboundary:
 95 |   default: True
 96 |   type_as: [True]
 97 |   depends:
 98 |     parameter: "mltype"
 99 |     value: "Regression"
100 |     set: False
101 | 
102 | doimportance:
103 |   default: True
104 |   type_as: [True]
105 |   depends:
106 |     parameter: "mltype"
107 |     value: "Regression"
108 |     set: False
109 | 
110 | dopltregressionxy:
111 |   default: False
112 |   type_as: [True]
113 |   depends:
114 |     parameter: "mltype"
115 |     value: "BinaryClassification"
116 |     set: False
117 | 
118 | dogridsearch:
119 |   default: False
120 |   type_as: [True]
121 | 
122 | dosignifopt:
123 |   default: False
124 |   type_as: [True]
125 | 
126 | doefficiency:
127 |   default: False
128 |   type_as: [True]
129 | 
130 | # These configuration defaults are filled on the fly from the models database
131 | # config_model_parameters.yml
132 | activate_models:
133 |   default: {}
134 |   type_as: [{}]
135 | 


--------------------------------------------------------------------------------
/machine_learning_hep/data/database_run_list.yml:
--------------------------------------------------------------------------------
1 | HighMultSPD2018: [287658, 287657, 287656, 287654, 287578, 287575, 287524, 287521, 287518, 287517, 287516, 287513, 287486, 287484, 287481, 287480, 287451, 287413, 287389, 287388, 287387, 287385, 287381, 287380, 287360, 287356, 287355, 287353, 287349, 287347, 287346, 287344, 287343, 287325, 287324, 287323, 287283, 287254, 287251, 287250, 287249, 287248, 287209, 287208, 287204, 287203, 287202, 287201, 287185, 287155, 287137, 287077, 287072, 287071, 287066, 287064, 287063, 287021, 287000, 287977, 287975, 287941, 287923, 287915, 287913, 287912, 287911, 287885, 287884, 287877, 287876, 287784, 287783, 288804, 288806, 288943, 289165, 289166, 289167, 289169, 289172, 289175, 289176, 289177, 289198, 289199, 289200, 289201, 289971, 289966, 289965, 289943, 289941, 289940, 289935, 289931, 289928, 289884, 289880, 289879, 289857, 289856, 289855, 289854, 289852, 289849, 289830, 289818, 289817, 289816, 289815, 289814, 289811, 289808, 289775, 289757, 289732, 289731, 289729, 289724, 289723, 289721, 289666, 289664, 289660, 289659, 289658, 289657, 289634, 289632, 289625, 289582, 289577, 289576, 289574, 289547, 289521, 289494, 289493, 289468, 289466, 289465, 289463, 289462, 289444, 289426, 289374, 289373, 289370, 289369, 289368, 289367, 289366, 289365, 289356, 289355, 289354, 289353, 289309, 289308, 289306, 289303, 289300, 289281, 289280, 289278, 289277, 289276, 289275, 289254, 289253, 289249, 289247, 289243, 289242, 289241, 289240, 289666, 289664, 289660, 289659, 289658, 289657, 289634, 289632, 289625, 289582, 289577, 289576, 289574, 292839, 292836, 292834, 292832, 292831, 292811, 292810, 292809, 292804, 292803, 292754, 292750, 292748, 292747, 292744, 292739, 292737, 292704, 292701, 292698, 292696, 292695, 292693, 292586, 292584, 292563, 292560, 292559, 292557, 292554, 292553, 292526, 292524, 292523, 292521, 292500, 292497, 292496, 292495, 292461, 292460, 292457, 292456, 292434, 292432, 292430, 292429, 292428, 292406, 292405, 292398, 292397, 292298, 292273, 292265, 292242, 292241, 
292240, 292218, 292192, 292168, 292167, 292166, 292164, 292163, 292162, 292161, 292160, 292140, 292115, 292114, 292109, 292108, 292107, 292106, 292081, 292080, 292077, 292075, 292067, 292062, 292061, 292060, 292040, 292012, 291982, 291977, 291976, 291953, 291948, 291946, 291945, 291944, 291943, 291942, 291803, 291796, 291795, 291769, 291768, 291766, 291762, 291760, 291756, 291755, 291729, 291706, 291698, 291697, 291690, 291665, 291661, 291657, 291626, 291624, 291622, 291618, 291615, 291614, 291590, 291485, 291484, 291482, 291481, 291457, 291456, 291453, 291451, 291447, 291424, 291420, 291417, 291416, 291402, 291400, 291399, 291397, 291377, 291375, 291373, 291363, 291362, 291361, 291360, 291286, 291285, 291284, 291282, 291266, 291265, 291263, 291262, 291257, 291240, 291209, 291188, 291143, 291116, 291111, 291110, 291101, 291100, 291093, 291069, 291066, 291065, 291041, 291037, 291035, 291006, 291005, 291004, 291003, 291002, 290980, 290979, 290976, 290975, 290974, 290948, 290944, 290943, 290941, 290935, 290932, 290895, 290894, 290888, 290887, 290886, 290862, 290860, 290853, 290848, 290846, 290843, 290841, 290790, 290787, 290766, 290689, 290687, 290665, 290660, 290645, 290632, 290627, 290615, 290614, 290613, 290612, 290590, 290588, 290553, 290550, 290549, 290544, 290540, 290539, 290538, 290501, 290500, 290499, 290469, 290467, 290459, 290458, 290456, 290427, 290426, 290425, 290423, 290412, 290411, 290404, 290401, 290399, 290376, 290375, 290374, 290350, 290327, 290323, 291373, 293898, 293896, 293893, 293891, 293886, 293856, 293831, 293830, 293829, 293809, 293807, 293806, 293805, 293802, 293776, 293774, 293773, 293770, 293741, 293740, 293698, 293696, 293695, 293692, 293691, 293588, 293587, 293583, 293582, 293579, 293578, 293573, 293571, 293570, 293475, 293496, 293494, 293474, 293424, 293413, 293392, 293386, 293368, 294925, 294916, 294884, 294883, 294880, 294875, 294852, 294818, 294817, 294816, 294815, 294813, 294809, 294805, 294775, 294774, 294772, 294769, 294749, 294747, 
294746, 294745, 294744, 294742, 294741, 294722, 294718, 294715, 294710, 294703, 294653, 294636, 294633, 294632, 294593, 294591, 294590, 294587, 294586, 294563, 294562, 294558, 294556, 294553, 294531, 294530, 294529, 294527, 294526, 294525, 294524, 294310, 294308, 294307, 294305, 294242, 294241, 294212, 294210, 294208, 294205, 294201, 294200, 294199, 294156, 294155, 294154, 294152, 294131, 294013, 294012, 294011, 294010, 294009]
2 | 
3 | V0vspt_perc_v0m_2016: [258537, 258499, 258477, 258456, 258454, 258452, 258426, 258393, 258391, 258387, 258359, 258336, 258332, 258307, 258306, 258303, 258302, 258301, 258299, 258278, 258274, 258273, 258271, 258270, 258258, 258257, 258256, 258204, 258203, 258202, 258198, 258197, 258178, 258117, 258114, 258113, 258109, 258108, 258107, 258063, 258062, 258060, 258059, 258053, 258049, 258045, 258042, 258041, 258039, 258019, 258017, 258014, 258012, 258008, 258003, 257992, 257989, 257986, 257979, 257963, 257960, 257957, 257939, 257937, 257936, 257855, 257853, 257851, 257850, 257804, 257803, 257800, 257799, 257798, 257797, 257773, 257765, 257757, 257754, 257737, 257735, 257734, 257733, 257727, 257725, 257724, 257697, 257694, 257692, 257691, 257689, 257688, 257687, 257685, 257684, 257682, 257644, 257642, 257636, 257635, 257632, 257630, 257606, 257605, 257604, 257601, 257595, 257594, 257592, 257590, 257588, 257587, 257566, 257562, 257561, 257560, 257541, 257540, 257539, 257537, 257531, 257530, 257492, 257491, 257490, 257488, 257487, 257474, 257468, 257457, 257433, 257364, 257358, 257330, 257322, 257320, 257318, 257260, 257224, 257209, 257206, 257204, 257144, 257141, 257139, 257138, 257137, 257136, 257100, 257095, 257092, 257086, 257084, 257082, 257080, 257077, 257012, 257011, 256944, 256942, 256941, 258498, 258388, 258280, 257932, 257912, 257901, 257071, 259888, 259868, 259867, 259866, 259860, 259842, 259841, 259822, 259789, 259788, 259781, 259756, 259752, 259751, 259750, 259748, 259747, 259477, 259473, 259396, 259395, 259394, 259389, 259388, 259382, 259378, 259342, 259341, 259340, 259339, 259336, 259334, 259307, 259305, 259303, 259302, 259274, 259273, 259272, 259271, 259270, 259269, 259264, 259263, 259261, 259257, 259204, 259164, 259162, 259118, 259117, 259099, 259096, 259091, 259090, 259088, 258964, 258962, 259381, 259086, 264035, 264033, 263985, 263984, 263981, 263978, 263977, 263923, 263920, 263917, 263916, 263905, 263866, 263863, 263810, 263803, 263793, 263792, 
263790, 263787, 263786, 263785, 263784, 263744, 263743, 263741, 263739, 263738, 263737, 263691, 263690, 263682, 263663, 263662, 263657, 263654, 263652, 263647, 263529, 263497, 263496, 263490, 263487, 263332, 263331, 262858, 262855, 262853, 262849, 262847, 262844, 262842, 262841, 262778, 262777, 262776, 262768, 262760, 262727, 262725, 262723, 262719, 262717, 262713, 262708, 262706, 262705, 262428, 262426, 262425, 262424, 263979, 262635, 262632, 262628, 262624, 262594, 262593, 262583, 262578, 262574, 262572, 262571, 262570, 262569, 262568, 262567, 262563, 262537, 262533, 262532, 262528, 262492, 262490, 262489, 262487, 262451, 262450, 262430, 264347, 264346, 264345, 264341, 264336, 264312, 264306, 264305, 264281, 264279, 264277, 264273, 264267, 264266, 264265, 264264, 264262, 264261, 264260, 264259, 264238, 264235, 264233, 264232, 264198, 264197, 264194, 264190, 264188, 264168, 264164, 264139, 264138, 264137, 264129, 264110, 264109, 264086, 264085, 264082, 264078, 264076] 
4 | 
5 | 


--------------------------------------------------------------------------------
/machine_learning_hep/data/fonll/DmesonLcPredictions_13TeV_y05_FFee_BRpythia8.root:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alisw/MachineLearningHEP/293e78c1c1b15e8f4c712d86c43d48c6e01b359d/machine_learning_hep/data/fonll/DmesonLcPredictions_13TeV_y05_FFee_BRpythia8.root


--------------------------------------------------------------------------------
/machine_learning_hep/data/fonll/DmesonLcPredictions_13TeV_y05_FFee_BRpythia8_SepContr_PDG2020.root:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alisw/MachineLearningHEP/293e78c1c1b15e8f4c712d86c43d48c6e01b359d/machine_learning_hep/data/fonll/DmesonLcPredictions_13TeV_y05_FFee_BRpythia8_SepContr_PDG2020.root


--------------------------------------------------------------------------------
/machine_learning_hep/data/fonll/DmesonLcPredictions_13TeV_y05_FFptDepLHCb_BRpythia8.root:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alisw/MachineLearningHEP/293e78c1c1b15e8f4c712d86c43d48c6e01b359d/machine_learning_hep/data/fonll/DmesonLcPredictions_13TeV_y05_FFptDepLHCb_BRpythia8.root


--------------------------------------------------------------------------------
/machine_learning_hep/data/fonll/DmesonLcPredictions_13TeV_y05_FFptDepLHCb_BRpythia8_PDG2020.root:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alisw/MachineLearningHEP/293e78c1c1b15e8f4c712d86c43d48c6e01b359d/machine_learning_hep/data/fonll/DmesonLcPredictions_13TeV_y05_FFptDepLHCb_BRpythia8_PDG2020.root


--------------------------------------------------------------------------------
/machine_learning_hep/data/fonll/DmesonLcPredictions_502TeV_y05_FFee_BRpythia8.root:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alisw/MachineLearningHEP/293e78c1c1b15e8f4c712d86c43d48c6e01b359d/machine_learning_hep/data/fonll/DmesonLcPredictions_502TeV_y05_FFee_BRpythia8.root


--------------------------------------------------------------------------------
/machine_learning_hep/data/fonll/DmesonLcPredictions_502TeV_y05_FFptDepLHCb_BRpythia8.root:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alisw/MachineLearningHEP/293e78c1c1b15e8f4c712d86c43d48c6e01b359d/machine_learning_hep/data/fonll/DmesonLcPredictions_502TeV_y05_FFptDepLHCb_BRpythia8.root


--------------------------------------------------------------------------------
/machine_learning_hep/fitting/README.md:
--------------------------------------------------------------------------------
  1 | # Fitting
  2 | 
  3 | ## Introduction
  4 | 
  5 | The fitting sub-module is split into three parts: `fitting.py`, `utils.py` and `helper.py`. The first two are in principle independent of this package and do not know anything about its structure. For instance, there is no usage of any database configuration. Hence, the classes and functions found in `fitting.py` and `utils.py` are self-contained.
  6 | On the other hand, `helper.py` contains the interfaces between this package and the fit classes. Here, the desired configuration and initialisation for the fitters is extracted and the fit objects are instantiated accordingly. Finally, the class `MLFitter` is a wrapper around all fits needed in this package.
  7 | 
  8 | ## Structure
  9 | 
 10 | ### `fitting.py`
 11 | 
 12 | **FitBase**
 13 | This is the base class all concrete fit classes derive from. Each fit then has a kernel which can be accessed via `self.kernel`. This object will be defined and set in deriving classes and is responsible for the actual fitting procedure.
 14 | 
 15 | **FitROOT**
 16 | This is the base class for all fits depending on `ROOT`. Such an object collects at least `ROOT` objects in `self.root_objects` which can be serialised along with a corresponding fit object in order to recover the fit object in a later run again.
 17 | 
 18 | **FitAliHF**
 19 | This class uses the `AliHFInvMassFitter` as it is defined in [AliPhysics](https://github.com/alisw/AliPhysics/blob/master/PWGHF/vertexingHF/AliHFInvMassFitter.h) and hence it comes with all its features.
 20 | 
 21 | **FitROOTGauss**
 22 | This class implements a simple Gaussian fit.
 23 | 
 24 | ### `utils.py`
 25 | 
 26 | **save_fit** and **load_fit** are used to serialise a fit object to disk or to read a serialised configuration back and construct a fit object from that.
 27 | 
 28 | ### `helper.py`
 29 | Here, all `MLHEP` specific classes and helper functions are defined.
 30 | 
 31 | **MLFitParsFactory**
 32 | This class builds an abstraction layer and is responsible for interpreting the fit configuration given in the databases (such as `database_ml_parameters_<particlename>.yml`). Fit configurations are packed in a unified way to be used further to create and initialise fits according to those defined in `fitting.py`.
 33 | 
 34 | **MLFitter**
 35 | All fits used in an analysis run are handled here.
 36 | 
 37 | ## Database settings
 38 | 
 39 | A full configuration of the raw yield (aka multi trial) systematics in a database looks like
 40 | 
 41 | ```yaml
 42 | systematics:
 43 |   # For now don't do these things per pT bin
 44 |   max_chisquare_ndf: 2. # optional, maximum red. chi2 accepted (default: 2.)
 45 |   min_signif: 3. # optional, minimal significance required (default: 3.)
 46 |   rebin: [-1,0,1] # required, for no variation just put [0]
 47 |   massmin: [2.14, 2.13, 2.12, 2.15, 2.17] # required, for no variation put [<min_value>]
 48 |   massmax: [2.436, 2.435, 2.434, 2.437, 2.438] # required, for no variation put [<max_value>]
 49 |   bincount_sigma: [3, 5] # required (at the moment, will be made optional)
 50 |   bkg_funcs: [kExpo, kLin, Pol2, Pol3, Pol4, Pol5] # required
 51 |   # Whether to include the free sigma option in the derivation of raw yield uncertainty in given pT bin
 52 |   consider_free_sigma: [False, False, False, False, False, False] # optional, one value for each pT bin, choose between True or False
 53 |   # Put relative variation from central sigma separately for varying up/down, e.g.
 54 |   # Whenever it evaluates to False it's not taken into account
 55 |   rel_var_sigma_up: [0.1, 0.05, False, 0.2, False, False] # optional
 56 |   rel_var_sigma_down: [False, 0.1, False, False, 0.1, False] # optional
 57 | ```
 58 | 
 59 | The bin count is at the moment always taken into account and it should be seen as a cross-check
 60 | 
 61 | ## Example usage
 62 | 
 63 | Here is a small example (as it actually looks like in the package but with some further comments)
 64 | 
 65 | ```python
 66 | 
 67 | """
 68 | Create an MLFitter object given
 69 | 1. config_database: the configuration dictionary where the fit parameters can be found. This is forwarded to an MLFitParsFactory object internally.
 70 | 2. analysis_type: the specified analysis-section to be looked up in the config_database where the fit parameters for the specified analysis are defined
 71 | 3. histogram_filepath_data, histogram_filepath_mc: file paths to ROOT files where histograms can be found which should be fitted.
 72 | """
 73 | fitter = MLFitter(config_database, analysis_type, histogram_filepath_data, histogram_filepath_mc)
 74 | 
 75 | """
 76 | This performs fits in inclusive bins of the second binning variable defined in the analysis section
 77 | of the database. These pre-fits are used to initialise the central fit. How to do that is derived
 78 | from the database parameters and handled by the MLFitter and MLFitParsFactory objects.
 79 | """
 80 | fitter.perform_pre_fits()
 81 | 
 82 | # Central fits are performed.
 83 | fitter.perform_central_fits()
 84 | 
 85 | # Specify a file where fit summary plots will be saved.
 86 | fileout_name = "summary_fit_plots.root"
 87 | fileout = TFile(fileout_name, "RECREATE")
 88 | 
 89 | """
 90 | Fit plots are saved in the directory fit_plots_save_dir and summary plots are also saved in the
 91 | specified ROOT file (can also be an abstract TDirectory) given that it is not None.
 92 | """
 93 | fitter.draw_fits(fit_plots_save_dir, fileout)
 94 | fileout.Close()
 95 | 
 96 | # Serialize all fits to the directory fit_save_dir
 97 | fitter.save_fits(fit_save_dir)
 98 | 
 99 | # ... do something in the meantime or re-start the analysis workflow ...
100 | 
101 | # Look for the fitter
102 | if not fitter:
103 |     fitter = MLFitter(config_database, analysis_type, histogram_filepath_data, histogram_filepath_mc)
104 |     # Read back fits serialised to fit_save_dir if possible
105 |     if not fitter.load_fits(fit_save_dir):
106 |         print(f"FATAL: Cannot load fits from dir {fit_save_dir}")
107 |         return
108 | 
109 | # Get a fit passing the bins ibin1 and ibin2 of the fit variables the fits were done in differentially
110 | fit = fitter.get_central_fit(ibin1, ibin2)
111 | 
112 | # If the fit could not be loaded or was not successful back then, return (or do something else...)
113 | if not fit:
114 |     print(f"FATAL: Cannot access fit in bins ({ibin1}, {ibin2})")
115 |     return
116 | if not fit.success:
117 |     print(f"Fit in bins ({ibin1}, {ibin2}) not successful, skip...")
118 |     return
119 | ```
120 |     
121 | 
122 | 
123 | 
124 | 
125 | 
126 | 
127 | 


--------------------------------------------------------------------------------
/machine_learning_hep/fitting/__init__.py:
--------------------------------------------------------------------------------
 1 | #############################################################################
 2 | ##  © Copyright CERN 2018. All rights not expressly granted are reserved.  ##
 3 | ##                 Author: Gian.Michele.Innocenti@cern.ch                  ##
 4 | ## This program is free software: you can redistribute it and/or modify it ##
 5 | ##  under the terms of the GNU General Public License as published by the  ##
 6 | ## Free Software Foundation, either version 3 of the License, or (at your  ##
 7 | ## option) any later version. This program is distributed in the hope that ##
 8 | ##  it will be useful, but WITHOUT ANY WARRANTY; without even the implied  ##
 9 | ##     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.    ##
10 | ##           See the GNU General Public License for more details.          ##
11 | ##    You should have received a copy of the GNU General Public License    ##
12 | ##   along with this program. if not, see <https://www.gnu.org/licenses/>. ##
13 | #############################################################################
14 | 


--------------------------------------------------------------------------------
/machine_learning_hep/fitting/utils.py:
--------------------------------------------------------------------------------
  1 | #############################################################################
  2 | ##  © Copyright CERN 2018. All rights not expressly granted are reserved.  ##
  3 | ##                 Author: Gian.Michele.Innocenti@cern.ch                  ##
  4 | ## This program is free software: you can redistribute it and/or modify it ##
  5 | ##  under the terms of the GNU General Public License as published by the  ##
  6 | ## Free Software Foundation, either version 3 of the License, or (at your  ##
  7 | ## option) any later version. This program is distributed in the hope that ##
  8 | ##  it will be useful, but WITHOUT ANY WARRANTY; without even the implied  ##
  9 | ##     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.    ##
 10 | ##           See the GNU General Public License for more details.          ##
 11 | ##    You should have received a copy of the GNU General Public License    ##
 12 | ##   along with this program. if not, see <https://www.gnu.org/licenses/>. ##
 13 | #############################################################################
 14 | 
 15 | 
 16 | """
 17 | Common utility functions for fitting.
 18 | Interfacing with
 19 |     1. OS / serialization of fitters
 20 |     2. user configuration database
 21 | Providing and storing fitters
 22 | """
 23 | 
 24 | import inspect
 25 | from math import ceil
 26 | from os.path import join
 27 | 
 28 | # pylint: disable=import-error, no-name-in-module, unused-import
 29 | from ROOT import TFile
 30 | 
 31 | from machine_learning_hep.io import checkdir, dump_yaml_from_dict, parse_yaml
 32 | from machine_learning_hep.logger import get_logger
 33 | 
 34 | 
 35 | def construct_rebinning(histo, rebin):
 36 |     try:
 37 |         iter(rebin)
 38 |         min_rebin = rebin[0]
 39 |         rebin_min_entries_per_bin = rebin[1]
 40 |         max_rebin = rebin[2]
 41 |         entries_per_bin = histo.Integral() / histo.GetNbinsX()
 42 |         rebin = rebin_min_entries_per_bin / entries_per_bin
 43 |         if rebin > max_rebin:
 44 |             return max_rebin
 45 |         if min_rebin and min_rebin < rebin:
 46 |             return min_rebin
 47 |         if rebin < 1:
 48 |             return None
 49 |         return ceil(rebin)
 50 |     except TypeError:
 51 |         return rebin
 52 | 
 53 | 
 54 | def save_fit(fit, save_dir, annotations=None):
 55 |     if not fit.has_attempt:
 56 |         get_logger().warning("Fit has not been done and will hence not be saved")
 57 |         return
 58 | 
 59 |     checkdir(save_dir)
 60 | 
 61 |     root_file_name = join(save_dir, "root_objects.root")
 62 |     root_file = TFile.Open(root_file_name, "RECREATE")
 63 |     root_file.cd()
 64 | 
 65 |     for name, root_object in fit.root_objects.items():
 66 |         if root_object:
 67 |             root_object.Write(name)
 68 |     fit.kernel.Write("kernel")
 69 |     root_file.Close()
 70 | 
 71 |     yaml_path = join(save_dir, "init_pars.yaml")
 72 |     dump_yaml_from_dict(fit.init_pars, yaml_path)
 73 | 
 74 |     yaml_path = join(save_dir, "fit_pars.yaml")
 75 |     dump_yaml_from_dict(fit.fit_pars, yaml_path)
 76 | 
 77 |     class_name = fit.__class__.__name__
 78 |     meta_info = {"fit_class": class_name, "success": fit.success}
 79 |     if annotations:
 80 |         meta_info["annotations"] = annotations
 81 | 
 82 |     yaml_path = join(save_dir, "meta.yaml")
 83 |     dump_yaml_from_dict(meta_info, yaml_path)
 84 | 
 85 | 
 86 | def load_fit(save_dir):
 87 |     yaml_path = join(save_dir, "meta.yaml")
 88 |     meta_info = parse_yaml(yaml_path)
 89 | 
 90 |     yaml_path = join(save_dir, "init_pars.yaml")
 91 | 
 92 |     # pylint: disable=import-outside-toplevel
 93 |     import machine_learning_hep.fitting.fitters as search_module
 94 | 
 95 |     # pylint: enable=import-outside-toplevel
 96 |     fit_classes = {
 97 |         f[0]: getattr(search_module, f[0])
 98 |         for f in inspect.getmembers(search_module, inspect.isclass)
 99 |         if f[1].__module__ == search_module.__name__
100 |     }
101 |     fit = None
102 |     if meta_info["fit_class"] in fit_classes:
103 |         fit = fit_classes[meta_info["fit_class"]](parse_yaml(yaml_path))
104 |     else:
105 |         get_logger().fatal("Fit class %s is invalid")
106 | 
107 |     yaml_path = join(save_dir, "fit_pars.yaml")
108 |     fit.fit_pars = parse_yaml(yaml_path)
109 | 
110 |     root_file_name = join(save_dir, "root_objects.root")
111 |     root_file = TFile.Open(root_file_name, "READ")
112 | 
113 |     keys = root_file.GetListOfKeys()
114 | 
115 |     root_objects = {}
116 |     for k in keys:
117 |         if k.GetName() == "kernel":
118 |             fit.kernel = k.ReadObj()
119 |             continue
120 |         obj = k.ReadObj()
121 |         obj.SetDirectory(0)
122 |         root_objects[k.GetName()] = obj
123 |     root_file.Close()
124 | 
125 |     fit.set_root_objects(root_objects)
126 |     fit.success = meta_info["success"]
127 |     fit.init_fit()
128 | 
129 |     if "annotations" not in meta_info:
130 |         return fit
131 |     return fit, meta_info["annotations"]
132 | 


--------------------------------------------------------------------------------
/machine_learning_hep/io.py:
--------------------------------------------------------------------------------
 1 | #  © Copyright CERN 2018. All rights not expressly granted are reserved.  #
 2 | #                 Author: Gian.Michele.Innocenti@cern.ch                  #
 3 | # This program is free software: you can redistribute it and/or modify it #
 4 | #  under the terms of the GNU General Public License as published by the  #
 5 | # Free Software Foundation, either version 3 of the License, or (at your  #
 6 | # option) any later version. This program is distributed in the hope that #
 7 | #  it will be useful, but WITHOUT ANY WARRANTY; without even the implied  #
 8 | #     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.    #
 9 | #           See the GNU General Public License for more details.          #
10 | #    You should have received a copy of the GNU General Public License    #
11 | #   along with this program. if not, see <https://www.gnu.org/licenses/>. #
12 | 
13 | """
14 | Methods to: manage input/output
15 | """
16 | 
17 | import os
18 | from inspect import isclass
19 | from numbers import Number
20 | 
21 | import yaml  # pylint: disable=import-error
22 | 
23 | from machine_learning_hep.logger import get_logger
24 | 
25 | 
def dict_yamlable(params):
    """Build a copy of a dictionary that is meant to be passed to yaml.safe_dump

    Nested dictionaries are converted recursively. Numbers, strings, lists and tuples are
    taken over unchanged (the content of lists and tuples is not inspected). Any other value
    is replaced by the string "custom:<name>" where <name> is the name of the class itself
    if the value is a class, and the name of the value's class otherwise.

    Args:
        params: dict
            dictionary to convert (left unmodified)
    Returns:
        dict: new dictionary with the replacements described above

    """
    plain_types = (Number, str, list, tuple)
    converted = {}
    for key, value in params.items():
        if isinstance(value, dict):
            converted[key] = dict_yamlable(value)
            continue
        if isinstance(value, plain_types):
            # This we can handle with standard PyYAML
            converted[key] = value
            continue
        tag = value.__name__ if isclass(value) else value.__class__.__name__
        converted[key] = f"custom:{tag}"
    return converted
48 | 
49 | 
def parse_yaml(filepath):
    """
    Read a YAML file with yaml.safe_load and return the parsed content

    A critical message is logged if the path is not an existing file; the file is
    attempted to be opened afterwards in any case.

    Args:
        filepath: Path to the YAML file to be parsed.
    Returns:
        whatever yaml.safe_load yields for the file content
    """
    file_found = os.path.isfile(filepath)
    if not file_found:
        get_logger().critical("YAML file %s does not exist.", filepath)
    with open(filepath, "r", encoding="utf-8") as yaml_file:
        content = yaml.safe_load(yaml_file)
    return content
60 | 
61 | 
def dump_yaml_from_dict(to_yaml, path):
    """
    Write an object to a YAML file using yaml.safe_dump

    The file is written in block style, non-ASCII characters are escaped and the key order
    of dictionaries is kept.

    Args:
        to_yaml: object to be dumped
        path: output file path, a leading "~" is expanded to the user's home directory
    """
    dump_options = {"default_flow_style": False, "allow_unicode": False, "sort_keys": False}
    target = os.path.expanduser(path)
    with open(target, "w", encoding="utf-8") as stream:
        yaml.safe_dump(to_yaml, stream, **dump_options)
66 | 
67 | 
def checkdir(path):
    """
    Check for existence of directory and create if not existing

    Missing parent directories are created as well. Nothing is done if the path exists
    already (no matter whether it is a directory or not).

    Args:
        path: directory to be created
    """
    if not os.path.exists(path):
        # exist_ok guards against another process creating the directory between the check
        # above and this call, which would otherwise raise FileExistsError
        os.makedirs(path, exist_ok=True)
74 | 
75 | 
def print_dict(to_be_printed, indent=0, skip=None):
    """
    Print a (nested) dictionary with one tab of indentation per nesting level

    Each key is printed on its own line. A non-dictionary value follows on the next line,
    indented by one more tab; a dictionary value is printed recursively.

    Args:
        to_be_printed: dictionary to print
        indent: number of tabs put in front of the keys
        skip: list of keys to leave out; only honoured if it is a list and only applied to
              the dictionary passed in, not to nested ones
    """
    hidden_keys = skip if isinstance(skip, list) else []
    for key, value in to_be_printed.items():
        if key in hidden_keys:
            continue
        print("\t" * indent + str(key))
        if not isinstance(value, dict):
            print("\t" * (indent + 1) + str(value))
            continue
        print_dict(value, indent + 1)
85 | 


--------------------------------------------------------------------------------
/machine_learning_hep/logger.py:
--------------------------------------------------------------------------------
  1 | #############################################################################
  2 | ##  © Copyright CERN 2024. All rights not expressly granted are reserved.  ##
  3 | ##                 Author: Gian.Michele.Innocenti@cern.ch                  ##
  4 | ## This program is free software: you can redistribute it and/or modify it ##
  5 | ##  under the terms of the GNU General Public License as published by the  ##
  6 | ## Free Software Foundation, either version 3 of the License, or (at your  ##
  7 | ## option) any later version. This program is distributed in the hope that ##
  8 | ##  it will be useful, but WITHOUT ANY WARRANTY; without even the implied  ##
  9 | ##     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.    ##
 10 | ##           See the GNU General Public License for more details.          ##
 11 | ##    You should have received a copy of the GNU General Public License    ##
 12 | ##   along with this program. if not, see <https://www.gnu.org/licenses/>. ##
 13 | #############################################################################
 14 | 
 15 | """
 16 | Methods to: provide and manage central logging utility
 17 | """
 18 | 
 19 | import logging
 20 | import sys
 21 | from copy import copy
 22 | 
 23 | 
 24 | class ExitHandler(logging.Handler):
 25 |     """
 26 |     Add custom logging handler to exit on certain logging level
 27 |     """
 28 | 
 29 |     def emit(self, record):
 30 |         logging.shutdown()
 31 |         sys.exit(1)
 32 | 
 33 | 
 34 | class MLLoggerFormatter(logging.Formatter):
 35 |     """
 36 |     A custom formatter that colors the levelname on request
 37 |     """
 38 | 
 39 |     # color names to indices
 40 |     color_map = {
 41 |         "black": 0,
 42 |         "red": 1,
 43 |         "green": 2,
 44 |         "yellow": 3,
 45 |         "blue": 4,
 46 |         "magenta": 5,
 47 |         "cyan": 6,
 48 |         "white": 7,
 49 |     }
 50 | 
 51 |     level_map = {
 52 |         logging.DEBUG: (None, "blue", False),
 53 |         logging.INFO: (None, "green", False),
 54 |         logging.WARNING: (None, "yellow", False),
 55 |         logging.ERROR: (None, "red", False),
 56 |         logging.CRITICAL: ("red", "white", True),
 57 |     }
 58 |     csi = "\x1b["
 59 |     reset = "\x1b[0m"
 60 | 
 61 |     # Define default format string
 62 |     def __init__(self, fmt=None, datefmt=None, style="%", color=False):
 63 |         fmt = fmt or "%(levelname)s %(asctime)s - %(pathname)s:%(lineno)d:\n ↳ %(message)s"
 64 |         logging.Formatter.__init__(self, fmt, datefmt, style)
 65 |         self.color = color
 66 | 
 67 |     def format(self, record):
 68 |         # Copy the record so the global format is kept
 69 |         cached_record = copy(record)
 70 |         requ_color = self.color
 71 |         # Could be a lambda so check for callable property
 72 |         if callable(self.color):
 73 |             requ_color = self.color()
 74 |         # Make sure levelname takes same space for all cases
 75 |         cached_record.levelname = f"{cached_record.levelname:8}"
 76 |         # Colorize if requested
 77 |         if record.levelno in self.level_map and requ_color:
 78 |             bg, fg, bold = self.level_map[record.levelno]
 79 |             params = []
 80 |             if bg in self.color_map:
 81 |                 params.append(str(self.color_map[bg] + 40))
 82 |             if fg in self.color_map:
 83 |                 params.append(str(self.color_map[fg] + 30))
 84 |             if bold:
 85 |                 params.append("1")
 86 |             if params:
 87 |                 cached_record.levelname = "".join(
 88 |                     (self.csi, ";".join(params), "m", cached_record.levelname, self.reset)
 89 |                 )
 90 |         return logging.Formatter.format(self, cached_record)
 91 | 
 92 | 
 93 | def configure_logger(debug, logfile=None, quiet=False):
 94 |     """
 95 |     Basic configuration adding a custom formatted StreamHandler and turning on
 96 |     debug info if requested.
 97 |     """
 98 |     logger = logging.getLogger("MachinelearningHEP")
 99 |     if logger.hasHandlers():
100 |         return
101 | 
102 |     logger.setLevel(logging.DEBUG if debug else logging.INFO)
103 | 
104 |     sh = logging.StreamHandler()
105 |     formatter = MLLoggerFormatter(
106 |         color=lambda: getattr(sh.stream, "isatty", None), fmt="%(levelname)s ➞ %(message)s" if quiet else None
107 |     )
108 | 
109 |     sh.setFormatter(formatter)
110 |     logger.addHandler(sh)
111 | 
112 |     # Add logfile on request
113 |     if logfile is not None:
114 |         # Specify output format
115 |         fh = logging.FileHandler(logfile)
116 |         fh.setFormatter(MLLoggerFormatter())
117 |         logger.addHandler(fh)
118 | 
119 |     # Add handler to exit at critical. Do this as the last step so all former
120 |     # logger flush before aborting
121 |     logger.addHandler(ExitHandler(logging.CRITICAL))
122 | 
123 | 
def get_logger():
    """
    Return the global logger of this package, named "MachinelearningHEP".

    The logger is returned as it is, handlers and formatters are not set here.
    """
    package_logger = logging.getLogger("MachinelearningHEP")
    return package_logger
130 | 


--------------------------------------------------------------------------------
/machine_learning_hep/optimisation/README.md:
--------------------------------------------------------------------------------
 1 | # Machine Learning and optimisation
 2 | 
 3 | 
 4 | ## Basic Machine Learning
 5 | 
 6 | ML and optimisation `Python` source files to be moved here...
 7 | 
 8 | 
 9 | ## Bayesian optimisation
10 | 
11 | Bayesian optimisation can be used instead of a brute-force grid search to optimise the hyperparameters of a model. It might be superior in the sense that it does not try all possible combinations of the parameters to be varied. Instead, it uses the performance and parameter settings of previous trials to decide on the settings for the next **trial**.
12 | 
13 | This package uses Bayesian optimisation for its models via the [hyperopt package](https://github.com/hyperopt/hyperopt). The parameter space from which the values are drawn has to be defined. For XGBoost this is already done in `templates_xgboost.py`, where one can find the corresponding implementation:
14 | 
15 | ```python
16 | 
17 | def xgboost_classifier_bayesian_space():
18 |     return {"max_depth": hp.quniform("x_max_depth", 1, 6, 1),
19 |             "n_estimators": hp.quniform("x_n_estimators", 600, 1000, 1),
20 |             "min_child_weight": hp.quniform("x_min_child", 1, 4, 1),
21 |             "subsample": hp.uniform("x_subsample", 0.5, 0.9),
22 |             "gamma": hp.uniform("x_gamma", 0.0, 0.2),
23 |             "colsample_bytree": hp.uniform("x_colsample_bytree", 0.5, 0.9),
24 |             "reg_lambda": hp.uniform("x_reg_lambda", 0, 1),
25 |             "reg_alpha": hp.uniform("x_reg_alpha", 0, 1),
26 |             "learning_rate": hp.uniform("x_learning_rate", 0.05, 0.35),
27 |             "max_delta_step": hp.quniform("x_max_delta_step", 0, 8, 2)}
28 | ```
29 | 
30 | In this case all parameters are uniformly distributed (there are more ways to do that, see [hyperopt's wiki](https://github.com/hyperopt/hyperopt/wiki/FMin#21-parameter-expressions)). For each trial, a new set is drawn and used for fitting the next model.
31 | 
32 | More explanation is coming soon...
33 | 
34 | 
35 | ## How to use it
36 | 
37 | In order to use Bayesian optimisation for a model, you need to do the following (`templates_xgboost.py` is taken here as an example):
38 | 
39 | 1. Derive a class from `BayesianOpt` (if you are interested, you can find it in `optimisation/bayesian_opt.py` in this package).
40 | 2. Implement the method `yield_model_(self, model_config, space)` which must return a model constructed from the configuration parameters passed via `model_config` (parameters for central model) and `space` (drawn from the parameter space as explained above). The user still has the freedom to combine/overwrite parameters of the central configuration. Of course, it would be unnecessary overhead to draw many parameters but only use a few... **NOTE** that all parameters in `space` are floating point numbers and it is your responsibility to cast them to integers where needed (as is necessary, e.g., for XGBoost's `n_estimators`).
41 | 3. Implement the method `save_model_(self, model, out_dir)` where `out_dir` is the directory to save the `model` in. This has to be done by the user because `BayesianOpt` has inherently no idea about the actual model implementation; hence, it cannot know how to save it.
42 | 
43 | These first three steps are in principle independent of the package and one could use this class easily somewhere else to use this type of optimisation. At some point, it has to be made sure that further members of `BayesianOpt` are set, such as the training data and other parameters:
44 | 
45 | ```python
46 | 
47 | # Train samples
48 | self.x_train = None
49 | self.y_train = None
50 | 
51 | # Nominal model configuration dict
52 | self.model_config = model_config
53 | 
54 | # Space to draw parameter values for Bayesian optimisation
55 | self.space = space
56 | 
57 | # KFolds for CV
58 | self.nkfolds = 1
59 | 
60 | # Number of trials
61 | self.n_trials = 100
62 | 
63 | # Scorers
64 | self.scoring = None
65 | # Optimise with this score
66 | self.scoring_opt = None
67 | 
68 | # Min- or maximise?
69 | self.low_is_better = True
70 | 
71 | ```
72 | 
73 | Afterwards, it can be run as follows (the comments indicate which members **must** be set):
74 | 
75 | 
76 | ```python
77 | bayes_opt.x_train = x_train # must be set
78 | bayes_opt.y_train = y_train # must be set
79 | bayes_opt.nkfolds = 5 # can be changed, default is 1
80 | bayes_opt.scoring = {"AUC": auc_scorer, "Accuracy": accuracy_scorer} # needs to be a dictionary mapping each name to its scoring function; all metrics are evaluated
81 | bayes_opt.scoring_opt = "AUC" # must be one key from the above, this is used for optimisation
82 | bayes_opt.low_is_better = False # indicate if metric needs to be minimised or maximised
83 | bayes_opt.n_trials = 100 # can be changed, default is indeed 100
84 | 
85 | bayes_opt.optimise(ncores=ncores) # optimisation, number of cores can be set on-the-fly
86 | bayes_opt.save(out_dir) # save results and model in output_dir
87 | bayes_opt.plot(out_dir) # plot results in output_dir
88 | 
89 | ```
90 | 
91 | The **package specific** part is to provide a function `<full_model_name>_bayesian_opt(model_config)` in the `templates_<model_class>.py` whose only task is to return an instance of your derived `BayesianOpt` class. `model_config` are again the central model parameters which - in the MLHEP package - are defined in `data/config_model_parameters.yml`. This can just be forwarded as the first argument of the constructor while the second one should be the space needed by `hyperopt`. For XGBoost models in the package, this space is constructed and returned by `xgboost_classifier_bayesian_space()` as shown above.
92 | 
93 | That's it and as you can see, you need roughly only 30 lines to put a full blown Bayesian optimisation in place.
94 | 


--------------------------------------------------------------------------------
/machine_learning_hep/optimisation/grid_search.py:
--------------------------------------------------------------------------------
  1 | #############################################################################
  2 | ##  © Copyright CERN 2018. All rights not expressly granted are reserved.  ##
  3 | ##                 Author: Gian.Michele.Innocenti@cern.ch                  ##
  4 | ## This program is free software: you can redistribute it and/or modify it ##
  5 | ##  under the terms of the GNU General Public License as published by the  ##
  6 | ## Free Software Foundation, either version 3 of the License, or (at your  ##
  7 | ## option) any later version. This program is distributed in the hope that ##
  8 | ##  it will be useful, but WITHOUT ANY WARRANTY; without even the implied  ##
  9 | ##     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.    ##
 10 | ##           See the GNU General Public License for more details.          ##
 11 | ##    You should have received a copy of the GNU General Public License    ##
 12 | ##   along with this program. if not, see <https://www.gnu.org/licenses/>. ##
 13 | #############################################################################
 14 | 
 15 | """
 16 | Methods to do grid-search hyper-parameters optimization
 17 | """
 18 | 
 19 | import itertools
 20 | import pickle
 21 | from os.path import join as osjoin
 22 | 
 23 | import matplotlib.pyplot as plt
 24 | import pandas as pd
 25 | from sklearn.model_selection import GridSearchCV
 26 | 
 27 | from machine_learning_hep.io import dump_yaml_from_dict, parse_yaml, print_dict
 28 | from machine_learning_hep.logger import get_logger
 29 | from machine_learning_hep.models import savemodels
 30 | from machine_learning_hep.optimisation.metrics import get_scorers
 31 | from machine_learning_hep.utilities import openfile
 32 | 
 33 | 
 34 | def do_gridsearch(names, classifiers, grid_params, x_train, y_train, nkfolds, out_dirs, ncores=-1):
 35 |     """Hyperparameter grid search for a list of classifiers
 36 | 
 37 |     Given a list of classifiers, do a hyperparameter grid search based on a corresponding
 38 |     set of parameters
 39 | 
 40 |     Args:
 41 |         names: iteratable of classifier names
 42 |         classifiers: iterable of classifiers
 43 |         grid_params: iterable of parameters used to perform the grid search
 44 |         x_train: feature dataframe
 45 |         y_train: targets dataframe
 46 |         nkfolds: int, cross-validation generator or an iterable
 47 |         out_dirs: Write parameters and pickle of summary dataframe
 48 |         ncores: number of cores to distribute jobs to
 49 |     Returns:
 50 |         lists of grid search models, the best model and scoring dataframes
 51 |     """
 52 | 
 53 |     logger = get_logger()
 54 | 
 55 |     for clf_name, clf, gps, out_dir in zip(names, classifiers, grid_params, out_dirs):
 56 |         if not gps:
 57 |             logger.info("Nothing to be done for grid search of model %s", clf_name)
 58 |             continue
 59 |         logger.info("Grid search for model %s with following parameters:", clf_name)
 60 |         print_dict(gps)
 61 | 
 62 |         # To work for probabilities. This will call model.decision_function or
 63 |         # model.predict_proba as it is done for the nominal ROC curves as well to decide on the
 64 |         # performance
 65 |         scoring = get_scorers(gps["scoring"])
 66 | 
 67 |         grid_search = GridSearchCV(
 68 |             clf,
 69 |             gps["params"],
 70 |             cv=nkfolds,
 71 |             refit=gps["refit"],
 72 |             scoring=scoring,
 73 |             n_jobs=ncores,
 74 |             verbose=2,
 75 |             return_train_score=True,
 76 |         )
 77 |         grid_search.fit(x_train, y_train)
 78 |         cvres = grid_search.cv_results_
 79 | 
 80 |         # Save the results as soon as we have them in case something goes wrong later
 81 |         # (would be quite unfortunate to loose grid search reults...)
 82 |         out_file = osjoin(out_dir, "results.pkl")
 83 |         pickle.dump(pd.DataFrame(cvres), openfile(out_file, "wb"), protocol=4)
 84 |         # Parameters
 85 |         dump_yaml_from_dict(gps, osjoin(out_dir, "parameters.yaml"))
 86 |         savemodels((clf_name,), (grid_search.best_estimator_,), out_dir, "")
 87 | 
 88 | 
 89 | # pylint: disable=too-many-locals, too-many-statements
 90 | def perform_plot_gridsearch(names, out_dirs):
 91 |     """
 92 |     Function for grid scores plotting (working with scikit 0.20)
 93 |     """
 94 |     logger = get_logger()
 95 | 
 96 |     for name, out_dir in zip(names, out_dirs):
 97 |         # Read written results
 98 |         gps = parse_yaml(osjoin(out_dir, "parameters.yaml"))
 99 |         score_obj = pickle.load(openfile(osjoin(out_dir, "results.pkl"), "rb"))
100 | 
101 |         param_keys = [f"param_{key}" for key in gps["params"].keys()]
102 |         if not param_keys:
103 |             logger.warning("Add at least 1 parameter (even just 1 value)")
104 |             continue
105 | 
106 |         # Re-arrange scoring such that the refitted one is always on top
107 |         score_names = gps["scoring"]
108 |         refit_score = gps["refit"]
109 |         del score_names[score_names.index(refit_score)]
110 |         score_names.insert(0, refit_score)
111 | 
112 |         # Extract scores
113 |         x_labels = []
114 |         y_values = {}
115 |         y_errors = {}
116 | 
117 |         for sn in score_names:
118 |             y_values[sn] = {"train": [], "test": []}
119 |             y_errors[sn] = {"train": [], "test": []}
120 | 
121 |         # Get indices of values to put on x-axis and identify parameter combination
122 |         values_indices = [range(len(values)) for values in gps["params"].values()]
123 | 
124 |         y_axis_mins = {sn: 9999 for sn in score_names}
125 |         y_axis_maxs = {sn: -9999 for sn in score_names}
126 |         for indices, case in zip(itertools.product(*values_indices), itertools.product(*list(gps["params"].values()))):
127 |             df_case = score_obj.copy()
128 |             for i_case, i_key in zip(case, param_keys):
129 |                 df_case = df_case.loc[df_case[i_key] == df_case[i_key].dtype.type(i_case)]
130 | 
131 |             x_labels.append(",".join([str(i) for i in indices]))
132 |             # As we just nailed it down to one value
133 |             for sn in score_names:
134 |                 for tt in ("train", "test"):
135 |                     y_values[sn][tt].append(df_case[f"mean_{tt}_{sn}"].values[0])
136 |                     y_errors[sn][tt].append(df_case[f"std_{tt}_{sn}"].values[0])
137 |                     y_axis_mins[sn] = min(y_axis_mins[sn], y_values[sn][tt][-1])
138 |                     y_axis_maxs[sn] = max(y_axis_maxs[sn], y_values[sn][tt][-1])
139 | 
140 |         # Prepare text for parameters
141 |         text_parameters = "\n".join([f"{key}: {values}" for key, values in gps["params"].items()])
142 | 
143 |         # To determine fontsizes later
144 |         figsize = (35, 18 * len(score_names))
145 |         fig, axes = plt.subplots(len(score_names), 1, sharex=True, gridspec_kw={"hspace": 0.05}, figsize=figsize)
146 |         ax_plot = dict(zip(score_names, axes))
147 | 
148 |         # The axes to put the parameter list
149 |         ax_main = axes[-1]
150 |         # The axes with the title being on top
151 |         ax_top = axes[0]
152 | 
153 |         points_per_inch = 72
154 |         markerstyles = ["o", "+"]
155 |         markersize = 20
156 | 
157 |         for sn in score_names:
158 |             ax = ax_plot[sn]
159 |             ax_min = y_axis_mins[sn] - (y_axis_maxs[sn] - y_axis_mins[sn]) / 10.0
160 |             ax_max = y_axis_maxs[sn] + (y_axis_maxs[sn] - y_axis_mins[sn]) / 10.0
161 |             ax.set_ylim(ax_min, ax_max)
162 |             ax.set_ylabel(f"mean {sn}", fontsize=20)
163 |             ax.get_yaxis().set_tick_params(labelsize=20)
164 | 
165 |             for j, tt in enumerate(("train", "test")):
166 |                 markerstyle = markerstyles[j % len(markerstyles)]
167 | 
168 |                 ax.errorbar(
169 |                     range(len(x_labels)),
170 |                     y_values[sn][tt],
171 |                     yerr=y_errors[sn][tt],
172 |                     ls="",
173 |                     marker=markerstyle,
174 |                     markersize=markersize,
175 |                     label=f"{sn} ({tt})",
176 |                 )
177 | 
178 |                 # Add values to points
179 |                 ylim = ax.get_ylim()
180 |                 plot_labels_offset = (ylim[1] - ylim[0]) / 40
181 |                 for x, y in enumerate(y_values[sn][tt]):
182 |                     ax.text(x, y - plot_labels_offset, f"{y:.4f}", fontsize=20)
183 | 
184 |         ax_main.set_xlabel("parameter indices", fontsize=20)
185 |         ax_top.set_title(f"Grid search {name}", fontsize=30)
186 |         ax_main.get_xaxis().set_tick_params(labelsize=20)
187 |         ax_main.set_xticks(range(len(x_labels)))
188 |         ax_main.set_xticklabels(x_labels, rotation=45)
189 | 
190 |         text_point_size = int(4 * fig.dpi / points_per_inch * figsize[1] / len(gps["params"]))
191 |         xlim = ax_main.get_xlim()
192 |         ylim = ax_main.get_ylim()
193 | 
194 |         xlow = xlim[0] + (xlim[1] - xlim[0]) / 100
195 |         ylow = ylim[0] + (ylim[1] - ylim[0]) / 3
196 |         ax_main.text(xlow, ylow, text_parameters, fontsize=text_point_size)
197 | 
198 |         for ax in ax_plot.values():
199 |             ax.legend(loc="center right", fontsize=20)
200 |         plotname = osjoin(out_dir, "GridSearchResults.png")
201 |         plt.savefig(plotname)
202 |         plt.close(fig)
203 | 


--------------------------------------------------------------------------------
/machine_learning_hep/optimisation/metrics.py:
--------------------------------------------------------------------------------
 1 | #############################################################################
 2 | ##  © Copyright CERN 2018. All rights not expressly granted are reserved.  ##
 3 | ##                 Author: Gian.Michele.Innocenti@cern.ch                  ##
 4 | ## This program is free software: you can redistribute it and/or modify it ##
 5 | ##  under the terms of the GNU General Public License as published by the  ##
 6 | ## Free Software Foundation, either version 3 of the License, or (at your  ##
 7 | ## option) any later version. This program is distributed in the hope that ##
 8 | ##  it will be useful, but WITHOUT ANY WARRANTY; without even the implied  ##
 9 | ##     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.    ##
10 | ##           See the GNU General Public License for more details.          ##
11 | ##    You should have received a copy of the GNU General Public License    ##
12 | ##   along with this program. if not, see <https://www.gnu.org/licenses/>. ##
13 | #############################################################################
14 | 
15 | """
16 | Metrics for (ML) optimisation
17 | """
18 | 
19 | from sklearn.metrics import accuracy_score, make_scorer, roc_auc_score
20 | 
21 | 
def get_scorers(score_names):
    """Build the scorers for the requested score names

    Args:
        score_names: iterable of names. Available names are "AUC" and "Accuracy", any other
                     name is ignored.
    Returns:
        dictionary mapping the recognised names to their scorers
    """
    # Scorers are only built when requested
    available = (
        ("AUC", lambda: make_scorer(roc_auc_score, needs_threshold=True)),
        ("Accuracy", lambda: make_scorer(accuracy_score)),
    )

    scorers = {}
    for requested in score_names:
        for label, build in available:
            if requested == label:
                scorers[label] = build()
                break

    return scorers
39 | 


--------------------------------------------------------------------------------
/machine_learning_hep/optimization.py:
--------------------------------------------------------------------------------
  1 | #############################################################################
  2 | ##  © Copyright CERN 2018. All rights not expressly granted are reserved.  ##
  3 | ##                 Author: Gian.Michele.Innocenti@cern.ch                  ##
  4 | ## This program is free software: you can redistribute it and/or modify it ##
  5 | ##  under the terms of the GNU General Public License as published by the  ##
  6 | ## Free Software Foundation, either version 3 of the License, or (at your  ##
  7 | ## option) any later version. This program is distributed in the hope that ##
  8 | ##  it will be useful, but WITHOUT ANY WARRANTY; without even the implied  ##
  9 | ##     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.    ##
 10 | ##           See the GNU General Public License for more details.          ##
 11 | ##    You should have received a copy of the GNU General Public License    ##
 12 | ##   along with this program. if not, see <https://www.gnu.org/licenses/>. ##
 13 | #############################################################################
 14 | 
 15 | """
 16 | Methods to: utility methods to compute efficiency and study expected significance
 17 | """
 18 | 
 19 | import matplotlib.pyplot as plt
 20 | import numpy as np
 21 | from matplotlib.ticker import MultipleLocator
 22 | from ROOT import TH1F, TFile  # pylint: disable=import-error,no-name-in-module
 23 | 
 24 | from machine_learning_hep.logger import get_logger
 25 | 
 26 | 
 27 | def select_by_threshold(df_label, label, thr, name):
 28 |     # Changed from >= to > since we use that atm for the nominal selection
 29 |     # See processer.py self.l_selml
 30 |     if label == "bkg":
 31 |         return df_label[df_label[f"y_test_prob{name}{label}"].values <= thr]
 32 |     if label == "":
 33 |         return df_label[df_label[f"y_test_prob{name}{label}"].values > thr]
 34 |     return df_label[df_label[f"y_test_prob{name}{label}"].values >= thr]
 35 | 
 36 | 
 37 | def get_x_axis(num_steps, class_label):
 38 |     ns_left = int(num_steps / 10) - 1
 39 |     ns_right = num_steps - ns_left
 40 |     if class_label == "bkg":
 41 |         ns_left, ns_right = ns_right, ns_left
 42 |     x_axis_left = np.linspace(0.0, 0.49, ns_left)
 43 |     x_axis_right = np.linspace(0.5, 1.0, ns_right)
 44 |     x_axis = np.concatenate((x_axis_left, x_axis_right))
 45 |     return x_axis
 46 | 
 47 | 
 48 | def calc_bkg(
 49 |     df_bkg,
 50 |     name,
 51 |     num_steps,
 52 |     fit_region,
 53 |     bkg_func,
 54 |     bin_width,
 55 |     sig_region,
 56 |     save_fit,  # pylint: disable=too-many-arguments
 57 |     out_dir,
 58 |     pt_lims,
 59 |     invmassvar,
 60 |     mltype,
 61 | ):
 62 |     """
 63 |     Estimate the number of background candidates under the signal peak. This is obtained
 64 |     from real data with a fit of the sidebands of the invariant mass distribution.
 65 |     """
 66 |     logger = get_logger()
 67 |     class_label = "bkg" if mltype == "MultiClassification" else ""
 68 |     x_axis = get_x_axis(num_steps, class_label)
 69 |     bkg_array = []
 70 |     bkg_err_array = []
 71 |     num_bins = (fit_region[1] - fit_region[0]) / bin_width
 72 |     num_bins = int(round(num_bins))
 73 |     bin_width = (fit_region[1] - fit_region[0]) / num_bins
 74 | 
 75 |     if save_fit:
 76 |         logger.debug("Saving bkg fits to file")
 77 |         pt_min = pt_lims[0]
 78 |         pt_max = pt_lims[1]
 79 |         out_file = TFile(f"{out_dir}/bkg_fits_{name}_pt{pt_min:.1f}_{pt_max:.1f}.root", "recreate")
 80 |         out_file.cd()
 81 | 
 82 |     logger.debug("To fit the bkg a %s function is used", bkg_func)
 83 |     for thr in x_axis:
 84 |         bkg = 0.0
 85 |         bkg_err = 0.0
 86 |         hmass = TH1F(f"hmass_{thr:.5f}", "", num_bins, fit_region[0], fit_region[1])
 87 |         df_bkg_sel = select_by_threshold(df_bkg, class_label, thr, name)
 88 |         sel_mass_array = df_bkg_sel[invmassvar].values
 89 | 
 90 |         if len(sel_mass_array) > 5:
 91 |             for mass_value in np.nditer(sel_mass_array):
 92 |                 hmass.Fill(mass_value)
 93 |             fit = hmass.Fit(bkg_func, "Q", "", fit_region[0], fit_region[1])
 94 |             if save_fit:
 95 |                 hmass.Write()
 96 |             if int(fit) == 0:
 97 |                 fit_func = hmass.GetFunction(bkg_func)
 98 |                 bkg = fit_func.Integral(sig_region[0], sig_region[1]) / bin_width
 99 |                 bkg_err = fit_func.IntegralError(sig_region[0], sig_region[1]) / bin_width
100 |                 del fit_func
101 |         elif save_fit:
102 |             hmass.Write()
103 | 
104 |         bkg_array.append(bkg)
105 |         bkg_err_array.append(bkg_err)
106 |         del hmass
107 | 
108 |     out_file.Close()
109 |     return bkg_array, bkg_err_array, x_axis
110 | 
111 | 
def calc_signif(sig_array, sig_err_array, bkg_array, bkg_err_array):
    """
    Calculate the expected signal significance, sig / sqrt(sig + bkg), and its
    uncertainty for each threshold on the ML model output.

    The four inputs are iterated in parallel. Wherever sig or sig + bkg is not
    positive, both the significance and its error are set to 0.0.
    Returns the list of significances and the list of their errors.
    """
    signif_array = []
    signif_err_array = []

    for sig, sig_err, bkg, bkg_err in zip(sig_array, sig_err_array, bkg_array, bkg_err_array):
        if not (sig > 0 and (sig + bkg) > 0):
            signif_array.append(0.0)
            signif_err_array.append(0.0)
            continue

        signif = sig / np.sqrt(sig + bkg)
        rel_err_sq = (sig_err**2 + bkg_err**2) / (4 * (sig + bkg) ** 2) + (bkg / (sig + bkg)) * sig_err**2 / sig**2
        signif_array.append(signif)
        signif_err_array.append(signif * np.sqrt(rel_err_sq))

    return signif_array, signif_err_array
134 | 
135 | 
def calc_eff(num, den):
    """
    Return the efficiency num / den together with its binomial uncertainty
    sqrt(eff * (1 - eff) / den).
    """
    ratio = num / den
    return ratio, np.sqrt(ratio * (1 - ratio) / den)
141 | 
142 | 
def calc_sigeff_steps(num_steps, df_sig, name, mltype):
    """
    Compute the signal selection efficiency at each scanned threshold.
      num_steps : number of thresholds to scan
      df_sig : dataframe with the signal candidates
      name : model name, forwarded to select_by_threshold
      mltype : "MultiClassification" scans the "bkg" class output, anything else the default one
    Returns the list of efficiencies, the list of their errors and the array of
    thresholds. For an empty df_sig an error is logged and both lists are filled
    with zeros.
    """
    logger = get_logger()
    class_label = "bkg" if mltype == "MultiClassification" else ""
    x_axis = get_x_axis(num_steps, class_label)

    if df_sig.empty:
        logger.error("In division denominator is empty")
        return [0] * num_steps, [0] * num_steps, x_axis

    n_total = len(df_sig)
    # One (efficiency, error) pair per threshold.
    eff_pairs = [calc_eff(len(select_by_threshold(df_sig, class_label, thr, name)), n_total) for thr in x_axis]
    eff_array = [pair[0] for pair in eff_pairs]
    eff_err_array = [pair[1] for pair in eff_pairs]
    return eff_array, eff_err_array, x_axis
162 | 
163 | 
def prepare_eff_signif_figure(y_label, mltype):
    """
    Create a 20x15 figure with a single axis for a quantity plotted versus the ML threshold.
      y_label : label of the y axis
      mltype : "MultiClassification" labels the x axis "Bkg threshold", anything else "Prompt threshold"
    The x axis spans [0, 1] with major ticks every 0.1. Returns the figure.
    """
    if mltype == "MultiClassification":
        threshold_kind = "Bkg"
    else:
        threshold_kind = "Prompt"

    figure = plt.figure(figsize=(20, 15))
    axis = plt.subplot(1, 1, 1)
    axis.set_xlabel(f"{threshold_kind} threshold", fontsize=30)
    axis.set_ylabel(y_label, fontsize=30)
    axis.xaxis.set_major_locator(MultipleLocator(0.1))
    axis.set_xlim(0.0, 1.0)
    axis.tick_params(labelsize=20)
    return figure
174 | 


--------------------------------------------------------------------------------
/machine_learning_hep/plotting/__init__.py:
--------------------------------------------------------------------------------
 1 | #  © Copyright CERN 2018. All rights not expressly granted are reserved.  #
 2 | #                 Author: Gian.Michele.Innocenti@cern.ch                  #
 3 | # This program is free software: you can redistribute it and/or modify it #
 4 | #  under the terms of the GNU General Public License as published by the  #
 5 | # Free Software Foundation, either version 3 of the License, or (at your  #
 6 | # option) any later version. This program is distributed in the hope that #
 7 | #  it will be useful, but WITHOUT ANY WARRANTY; without even the implied  #
 8 | #     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.    #
 9 | #           See the GNU General Public License for more details.          #
10 | #    You should have received a copy of the GNU General Public License    #
11 | #   along with this program. if not, see <https://www.gnu.org/licenses/>. #
12 | 


--------------------------------------------------------------------------------
/machine_learning_hep/root.py:
--------------------------------------------------------------------------------
  1 | #############################################################################
  2 | ##  © Copyright CERN 2018. All rights not expressly granted are reserved.  ##
  3 | ##                 Author: Gian.Michele.Innocenti@cern.ch                  ##
  4 | ## This program is free software: you can redistribute it and/or modify it ##
  5 | ##  under the terms of the GNU General Public License as published by the  ##
  6 | ## Free Software Foundation, either version 3 of the License, or (at your  ##
  7 | ## option) any later version. This program is distributed in the hope that ##
  8 | ##  it will be useful, but WITHOUT ANY WARRANTY; without even the implied  ##
  9 | ##     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.    ##
 10 | ##           See the GNU General Public License for more details.          ##
 11 | ##    You should have received a copy of the GNU General Public License    ##
 12 | ##   along with this program. if not, see <https://www.gnu.org/licenses/>. ##
 13 | #############################################################################
 14 | 
 15 | """
 16 | Methods to: read and write a ROOT TNtuple
 17 | """
 18 | 
 19 | import array
 20 | import ast
 21 | 
 22 | import numpy as np
 23 | from ROOT import TFile, TNtuple  # pylint: disable=import-error,no-name-in-module
 24 | 
 25 | from machine_learning_hep.logger import get_logger
 26 | 
 27 | 
 28 | def read_ntuple(ntuple, variables):
 29 |     """
 30 |     Return a numpy array with the values from TNtuple.
 31 |       ntuple : input TNtuple
 32 |       variables : list of ntuple variables to read
 33 |     """
 34 |     logger = get_logger()
 35 |     code_list = []
 36 |     for v in variables:
 37 |         code_list += [compile("i.%s" % v, "<string>", "eval")]
 38 |     nentries = ntuple.GetEntries()
 39 |     nvars = len(variables)
 40 |     myarray = np.zeros((nentries, nvars))
 41 |     for n, _ in enumerate(ntuple):
 42 |         for m, v in enumerate(code_list):
 43 |             myarray[n][m] = ast.literal_eval(v)
 44 |         if n % 100000 == 0:
 45 |             logger.info("%d/%d", n, nentries)
 46 |     return myarray
 47 | 
 48 | 
 49 | def read_ntuple_ml(ntuple, variablesfeatures, variablesothers, variabley):
 50 |     """
 51 |     Return a numpy array with the values from TNtuple.
 52 |       ntuple : input TNtuple
 53 |       variables : list of ntuple variables to read
 54 |     """
 55 |     logger = get_logger()
 56 |     code_listfeatures = []
 57 |     code_listothers = []
 58 |     for v in variablesfeatures:
 59 |         code_listfeatures += [compile("i.%s" % v, "<string>", "eval")]
 60 |     for v in variablesothers:
 61 |         code_listothers += [compile("i.%s" % v, "<string>", "eval")]
 62 |     codevariabley = compile("i.%s" % variabley, "<string>", "eval")
 63 |     nentries = ntuple.GetEntries()
 64 |     nvars = len(variablesfeatures)
 65 |     nvarsothers = len(variablesothers)
 66 |     arrayfeatures = np.zeros((nentries, nvars))
 67 |     arrayothers = np.zeros((nentries, nvarsothers))
 68 |     arrayy = np.zeros(nentries)
 69 |     for n, _ in enumerate(ntuple):
 70 |         for m, v in enumerate(code_listfeatures):
 71 |             arrayfeatures[n][m] = ast.literal_eval(v)
 72 |         for m, v in enumerate(code_listothers):
 73 |             arrayothers[n][m] = ast.literal_eval(v)
 74 |         arrayy[n] = ast.literal_eval(codevariabley)
 75 |         if n % 100000 == 0:
 76 |             logger.info("%d/%d", n, nentries)
 77 |     return arrayfeatures, arrayothers, arrayy
 78 | 
 79 | 
 80 | def fill_ntuple(tupname, data, names):
 81 |     """
 82 |     Create and fill ROOT NTuple with the data sample.
 83 |       tupname : name of the NTuple
 84 |       data : data sample
 85 |       names : names of the NTuple variables
 86 |     """
 87 |     variables = ""
 88 |     for n in names:
 89 |         variables += "%s:" % n
 90 |     variables = variables[:-1]
 91 |     values = len(names) * [0.0]
 92 |     avalues = array.array("f", values)
 93 |     nt = TNtuple(tupname, "", variables)
 94 |     for d in data:
 95 |         for i in range(len(names)):
 96 |             avalues[i] = d[i]
 97 |         nt.Fill(avalues)
 98 |     nt.Write()
 99 | 
100 | 
def write_tree(filename, treename, dataframe):
    """
    Write the content of a dataframe into a TNtuple stored in a new ROOT file.
      filename : path of the output file (opened in "recreate" mode)
      treename : name of the TNtuple
      dataframe : data to store; its column labels become the variable names
    """
    listvar = list(dataframe)
    values = dataframe.values
    fout = TFile.Open(filename, "recreate")
    try:
        fout.cd()
        fill_ntuple(treename, values, listvar)
    finally:
        # Close the file explicitly so it does not stay open after the call.
        if fout:
            fout.Close()
107 | 
108 | 
def save_root_object(obj, path, name=None, extension="pdf"):
    """
    Save a ROOT object as <path>/<name>.<extension> through its SaveAs method.
    When no name is given, the name returned by obj.GetName() is used.
        obj : object to save
        path : directory to save the object in
        name : base name of the output file
        extension : extension of the output file (e.g. pdf, png, eps)
    """
    if name is None:
        name = obj.GetName()
    obj.SaveAs(f"{path}/{name}.{extension}")
120 | 


--------------------------------------------------------------------------------
/machine_learning_hep/selectionutils.py:
--------------------------------------------------------------------------------
  1 | #############################################################################
  2 | ##  © Copyright CERN 2024. All rights not expressly granted are reserved.  ##
  3 | ##                 Author: Gian.Michele.Innocenti@cern.ch                  ##
  4 | ## This program is free software: you can redistribute it and/or modify it ##
  5 | ##  under the terms of the GNU General Public License as published by the  ##
  6 | ## Free Software Foundation, either version 3 of the License, or (at your  ##
  7 | ## option) any later version. This program is distributed in the hope that ##
  8 | ##  it will be useful, but WITHOUT ANY WARRANTY; without even the implied  ##
  9 | ##     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.    ##
 10 | ##           See the GNU General Public License for more details.          ##
 11 | ##    You should have received a copy of the GNU General Public License    ##
 12 | ##   along with this program. if not, see <https://www.gnu.org/licenses/>. ##
 13 | #############################################################################
 14 | 
 15 | """
 16 | utilities for fiducial acceptance, pid, single topological variable selections and normalization
 17 | """
 18 | 
 19 | import numba
 20 | import numpy as np
 21 | from ROOT import TH1F  # pylint: disable=import-error, no-name-in-module
 22 | 
 23 | from machine_learning_hep.bitwise import filter_bit_df, tag_bit_df
 24 | 
 25 | 
 26 | # @numba.njit
 27 | def selectcandidateml(array_prob, probcut):
 28 |     array_is_sel = []
 29 |     for prob in array_prob:
 30 |         if prob > probcut:
 31 |             array_is_sel.append(True)
 32 |         else:
 33 |             array_is_sel.append(False)
 34 |     return array_is_sel
 35 | 
 36 | 
@numba.njit
def select_runs(good_runlist, array_run):
    """
    Flag the entries of array_run whose value appears in good_runlist.
      good_runlist : array of accepted run numbers
      array_run : array with the run number of each entry
    Returns a boolean numpy array of length len(array_run), True where the run
    number was found in good_runlist.
    """
    # NOTE(review): the output is sized with len(array_run), so array_run is presumably 1D - confirm.
    array_run_sel = np.zeros(len(array_run), np.bool_)
    # np.ndenumerate yields (index, value) pairs.
    for i, candrun in np.ndenumerate(array_run):
        for _, goodrun in np.ndenumerate(good_runlist):
            if candrun == goodrun:
                array_run_sel[i] = True
                # One match is enough, stop scanning the list of good runs.
                break
    return array_run_sel
 46 | 
 47 | 
 48 | # (pt > 5 and abs(y) < 0.8) or (pt <= 5 and abs(y) < ...)
 49 | # @numba.njit
 50 | def selectfidacc(array_pt, array_y):
 51 |     array_is_sel = []
 52 |     for icand, pt in enumerate(array_pt):
 53 |         if pt > 5:
 54 |             if abs(array_y[icand]) < 0.8:
 55 |                 array_is_sel.append(True)
 56 |             else:
 57 |                 array_is_sel.append(False)
 58 |         else:
 59 |             yfid = -0.2 / 15 * pt**2 + 1.9 / 15 * pt + 0.5
 60 |             if abs(array_y[icand]) < yfid:
 61 |                 array_is_sel.append(True)
 62 |             else:
 63 |                 array_is_sel.append(False)
 64 |     return array_is_sel
 65 | 
 66 | 
 67 | # pylint: disable=too-many-arguments
 68 | # @numba.njit
 69 | def selectpid_dstokkpi(
 70 |     array_nsigma_tpc_pi_0,
 71 |     array_nsigma_tpc_k_0,
 72 |     array_nsigma_tof_pi_0,
 73 |     array_nsigma_tof_k_0,
 74 |     array_nsigma_tpc_k_1,
 75 |     array_nsigma_tof_k_1,
 76 |     array_nsigma_tpc_pi_2,
 77 |     array_nsigma_tpc_k_2,
 78 |     array_nsigma_tof_pi_2,
 79 |     array_nsigma_tof_k_2,
 80 |     nsigmacut,
 81 | ):
 82 |     array_is_pid_sel = []
 83 | 
 84 |     for icand, _ in enumerate(array_nsigma_tpc_pi_0):
 85 |         is_track_0_sel = (
 86 |             array_nsigma_tpc_pi_0[icand] < nsigmacut
 87 |             or array_nsigma_tof_pi_0[icand] < nsigmacut
 88 |             or array_nsigma_tpc_k_0[icand] < nsigmacut
 89 |             or array_nsigma_tof_k_0[icand] < nsigmacut
 90 |         )
 91 |         # second track must be a kaon
 92 |         is_track_1_sel = array_nsigma_tpc_k_1[icand] < nsigmacut or array_nsigma_tof_k_1[icand] < nsigmacut
 93 |         is_track_2_sel = (
 94 |             array_nsigma_tpc_pi_2[icand] < nsigmacut
 95 |             or array_nsigma_tof_pi_2[icand] < nsigmacut
 96 |             or array_nsigma_tpc_k_2[icand] < nsigmacut
 97 |             or array_nsigma_tof_k_2[icand] < nsigmacut
 98 |         )
 99 |         if is_track_0_sel and is_track_1_sel and is_track_2_sel:
100 |             array_is_pid_sel.append(True)
101 |         else:
102 |             array_is_pid_sel.append(False)
103 |     return array_is_pid_sel
104 | 
105 | 
106 | # @numba.njit
def selectpid_dzerotokpi(
    array_nsigma_tpc_pi_0,
    array_nsigma_tpc_k_0,
    array_nsigma_tof_pi_0,
    array_nsigma_tof_k_0,
    array_nsigma_tpc_pi_1,
    array_nsigma_tpc_k_1,
    array_nsigma_tof_pi_1,
    array_nsigma_tof_k_1,
    nsigmacut,
):
    """
    PID selection for two-track candidates, evaluated candidate by candidate.

    A track passes if at least one of its four nsigma values (pion or kaon
    hypothesis, TPC or TOF) is below nsigmacut. A candidate is selected if both
    tracks pass.
    Returns a list of booleans, one per element of array_nsigma_tpc_pi_0.
    """

    def _any_below_cut(icand, *nsigma_arrays):
        return any(nsigma[icand] < nsigmacut for nsigma in nsigma_arrays)

    array_is_pid_sel = []
    for icand, _ in enumerate(array_nsigma_tpc_pi_0):
        track_0_ok = _any_below_cut(
            icand, array_nsigma_tpc_pi_0, array_nsigma_tof_pi_0, array_nsigma_tpc_k_0, array_nsigma_tof_k_0
        )
        track_1_ok = _any_below_cut(
            icand, array_nsigma_tpc_pi_1, array_nsigma_tof_pi_1, array_nsigma_tpc_k_1, array_nsigma_tof_k_1
        )
        array_is_pid_sel.append(track_0_ok and track_1_ok)
    return array_is_pid_sel
138 | 
139 | 
140 | # @numba.njit
def selectpid_lctov0bachelor(array_nsigma_tpc, array_nsigma_tof, nsigmacut):
    """
    PID selection of a single track, evaluated candidate by candidate.

    The nsigma values are those for the desired species (i.e. p in case of pK0s or
    pi in case of piL). A candidate is selected if its TPC or its TOF nsigma is
    below nsigmacut.
    Returns a list of booleans, one per element of array_nsigma_tpc.
    """
    return [
        bool(array_nsigma_tpc[icand] < nsigmacut or array_nsigma_tof[icand] < nsigmacut)
        for icand, _ in enumerate(array_nsigma_tpc)
    ]
152 | 
153 | 
154 | # @numba.njit
def selectcand_lincut(array_cut_var, minvalue, maxvalue, isabs):
    """
    Select candidates whose cut variable lies strictly between minvalue and maxvalue.
      array_cut_var : values of the cut variable, one per candidate
      minvalue : exclusive lower limit
      maxvalue : exclusive upper limit
      isabs : if true, the absolute value of the variable is tested
    Returns a list of booleans, one per element of array_cut_var.
    """
    array_is_sel = []
    for icand, _ in enumerate(array_cut_var):
        value = abs(array_cut_var[icand]) if isabs else array_cut_var[icand]
        array_is_sel.append(bool(minvalue < value < maxvalue))
    return array_is_sel
167 | 
168 | 
def gethistonormforselevt(df_evt, dfevtevtsel, label):
    """
    Build three single-bin histograms with the event counts used for normalisation.
      df_evt : event dataframe with the fIsEventReject bit-mask column
      dfevtevtsel : dataframe of the selected events
      label : suffix appended to the histogram names
    Returns the histograms "sel_<label>", "novtx_<label>" and "vtxout_<label>"
    whose only bin holds, respectively, the number of rows of dfevtevtsel and of
    the two bit-based sub-samples of df_evt computed below.
    """

    def _single_bin_histo(prefix):
        histo_name = prefix + label
        return TH1F(histo_name, histo_name, 1, -0.5, 0.5)

    histo_sel = _single_bin_histo("sel_")
    histo_novtx = _single_bin_histo("novtx_")
    histo_vtxout = _single_bin_histo("vtxout_")

    # Base sample obtained from the bit lists [[], [0, 5, 6, 10, 11]] on fIsEventReject.
    df_base = filter_bit_df(df_evt, "fIsEventReject", [[], [0, 5, 6, 10, 11]])
    # NOTE(review): presumably the events without a reconstructed vertex - confirm the bit meaning.
    no_vtx_mask = tag_bit_df(df_base, "fIsEventReject", [[], [1, 2, 7, 12]])
    n_no_vtx = len(df_base[no_vtx_mask])
    # events with reco zvtx > 10 cm after previous selection
    n_vtx_out = len(filter_bit_df(df_base, "fIsEventReject", [[3], [1, 2, 7, 12]]))

    histo_sel.SetBinContent(1, len(dfevtevtsel))
    histo_novtx.SetBinContent(1, n_no_vtx)
    histo_vtxout.SetBinContent(1, n_vtx_out)
    return histo_sel, histo_novtx, histo_vtxout
185 | 


--------------------------------------------------------------------------------
/machine_learning_hep/submission/__init__.py:
--------------------------------------------------------------------------------
 1 | #############################################################################
 2 | ##  © Copyright CERN 2018. All rights not expressly granted are reserved.  ##
 3 | ##                 Author: Gian.Michele.Innocenti@cern.ch                  ##
 4 | ## This program is free software: you can redistribute it and/or modify it ##
 5 | ##  under the terms of the GNU General Public License as published by the  ##
 6 | ## Free Software Foundation, either version 3 of the License, or (at your  ##
 7 | ## option) any later version. This program is distributed in the hope that ##
 8 | ##  it will be useful, but WITHOUT ANY WARRANTY; without even the implied  ##
 9 | ##     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.    ##
10 | ##           See the GNU General Public License for more details.          ##
11 | ##    You should have received a copy of the GNU General Public License    ##
12 | ##   along with this program. if not, see <https://www.gnu.org/licenses/>. ##
13 | #############################################################################
14 | 


--------------------------------------------------------------------------------
/machine_learning_hep/submission/all_off.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | case: XXXX # used to find the database file unless specified explicitly as do_entire_analysis -d database_analysis
 3 | download:
 4 |   alice:
 5 |     activate: false
 6 | conversion: # pkl
 7 |   mc:
 8 |     activate: false
 9 |   data:
10 |     activate: false
11 | skimming: # pkl_skimmed (pklsk), pkl_evtcounter_all
12 |   mc:
13 |     activate: false
14 |   data:
15 |     activate: false
16 | merging: # pkl_skimmed_merge_for_ml (pklskml)
17 |   mc:
18 |     activate: false
19 |   data:
20 |     activate: false
21 | mergingperiods: # pkl_skimmed_merge_for_ml_all
22 |   mc:
23 |     activate: false
24 |   data:
25 |     activate: false
26 | 
27 | ml_study: # mlout, mlplot
28 |   activate: false
29 |   dotraining: false
30 |   dotesting: false
31 |   doplotdistr: false
32 |   doroc: false
33 |   doroctraintest: false
34 |   doimportance: false
35 |   doimportanceshap: false
36 |   docorrelation: false
37 |   dolearningcurve: false
38 |   doapplytodatamc: false
39 |   doscancuts: false
40 |   doefficiency: false
41 |   dosignifopt: false
42 |   doboundary: false
43 |   docrossvalidation: false
44 |   dogridsearch: false
45 |   dobayesianopt: false
46 | 
47 | mlapplication:
48 |   data:
49 |     doapply: false # pkl_skimmed_dec (pklskdec)
50 |     domergeapply: false # pkl_skimmed_decmerged (pklskdecmerged)
51 |     docontinueafterstop: false # set to true to resume interrupted processing (existing corrupted output will be overwritten)
52 |   mc:
53 |     doapply: false # pkl_skimmed_dec (pklskdec)
54 |     domergeapply: false # pkl_skimmed_decmerged (pklskdecmerged)
55 |     docontinueafterstop: false # set to true to resume interrupted processing (existing corrupted output will be overwritten)
56 | 
57 | analysis:
58 |   type: "YYYY" # used unless specified explicitly as do_entire_analysis -a type_ana
59 |   # Do each period separately including merged (true)
60 |   # Do only merged (false)
61 |   doperperiod: false
62 |   data:
63 |     histomass: false # processer: process_histomass
64 |   mc:
65 |     histomass: false # processer: process_histomass
66 |     efficiency: false # processer: process_efficiency
67 |   steps: # analyzer methods to run (uncomment to activate)
68 |   ##### Inclusive hadrons
69 |   # fit:
70 |   # efficiency:
71 |   # makenormyields:
72 |   ##### Jets
73 |   # init:
74 |   # calculate_efficiencies:
75 |   # qa:
76 |   # fit:
77 |   # estimate_feeddown:
78 |   # analyze_with_sidesub:
79 |   # analyze_with_sigextr:
80 | 
81 | systematics:
82 |   cutvar:
83 |     activate: false
84 |     do_only_analysis: false # This can be done anytime when mass and efficiency histograms have been produced already for a number of trials
 85 |     resume: false # mass and efficiency histograms that already exist are not produced again; processing continues with the remaining trials
86 |   mcptshape:
87 |     activate: false
88 | 


--------------------------------------------------------------------------------
/machine_learning_hep/submission/analysis.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | case: XXXX # used to find the database file unless specified explicitly as do_entire_analysis -d database_analysis
 3 | download:
 4 |   alice:
 5 |     activate: false
 6 | conversion: # pkl
 7 |   mc:
 8 |     activate: false
 9 |   data:
10 |     activate: false
11 | skimming: # pkl_skimmed (pklsk), pkl_evtcounter_all
12 |   mc:
13 |     activate: false
14 |   data:
15 |     activate: false
16 | merging: # pkl_skimmed_merge_for_ml (pklskml)
17 |   mc:
18 |     activate: false
19 |   data:
20 |     activate: false
21 | mergingperiods: # pkl_skimmed_merge_for_ml_all
22 |   mc:
23 |     activate: false
24 |   data:
25 |     activate: false
26 | 
27 | ml_study: # mlout, mlplot
28 |   activate: false
29 |   dotraining: false
30 |   dotesting: false
31 |   doplotdistr: false
32 |   doroc: false
33 |   doroctraintest: false
34 |   doimportance: false
35 |   doimportanceshap: false
36 |   docorrelation: false
37 |   dolearningcurve: false
38 |   doapplytodatamc: false
39 |   doscancuts: false
40 |   doefficiency: false
41 |   dosignifopt: false
42 |   doboundary: false
43 |   docrossvalidation: false
44 |   dogridsearch: false
45 |   dobayesianopt: false
46 | 
47 | mlapplication:
48 |   data:
49 |     doapply: false # pkl_skimmed_dec (pklskdec)
50 |     domergeapply: false # pkl_skimmed_decmerged (pklskdecmerged)
51 |     docontinueafterstop: false # set to true to resume interrupted processing (existing corrupted output will be overwritten)
52 |   mc:
53 |     doapply: false # pkl_skimmed_dec (pklskdec)
54 |     domergeapply: false # pkl_skimmed_decmerged (pklskdecmerged)
55 |     docontinueafterstop: false # set to true to resume interrupted processing (existing corrupted output will be overwritten)
56 | 
57 | analysis:
58 |   type: "YYYY" # used unless specified explicitly as do_entire_analysis -a type_ana
59 |   # Do each period separately including merged (true)
60 |   # Do only merged (false)
61 |   doperperiod: false
62 |   data:
63 |     histomass: true # processer: process_histomass
64 |   mc:
65 |     histomass: true # processer: process_histomass
66 |     efficiency: true # processer: process_efficiency
67 |   steps: # analyzer methods to run (uncomment to activate)
68 |     ##### Inclusive hadrons
69 |     # fit:
70 |     # efficiency:
71 |     # makenormyields:
72 |     ##### Jets
73 |     init:
74 |     calculate_efficiencies:
75 |     qa:
76 |     fit:
77 |     estimate_feeddown:
78 |     analyze_with_sidesub:
79 |     # analyze_with_sigextr:
80 | 
81 | systematics:
82 |   cutvar:
83 |     activate: false
84 |     do_only_analysis: false # This can be done anytime when mass and efficiency histograms have been produced already for a number of trials
 85 |     resume: false # mass and efficiency histograms that already exist are not produced again; processing continues with the remaining trials
86 |   mcptshape:
87 |     activate: false
88 | 


--------------------------------------------------------------------------------
/machine_learning_hep/submission/analyzer.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | case: XXXX # used to find the database file unless specified explicitly as do_entire_analysis -d database_analysis
 3 | download:
 4 |   alice:
 5 |     activate: false
 6 | conversion: # pkl
 7 |   mc:
 8 |     activate: false
 9 |   data:
10 |     activate: false
11 | skimming: # pkl_skimmed (pklsk), pkl_evtcounter_all
12 |   mc:
13 |     activate: false
14 |   data:
15 |     activate: false
16 | merging: # pkl_skimmed_merge_for_ml (pklskml)
17 |   mc:
18 |     activate: false
19 |   data:
20 |     activate: false
21 | mergingperiods: # pkl_skimmed_merge_for_ml_all
22 |   mc:
23 |     activate: false
24 |   data:
25 |     activate: false
26 | 
27 | ml_study: # mlout, mlplot
28 |   activate: false
29 |   dotraining: false
30 |   dotesting: false
31 |   doplotdistr: false
32 |   doroc: false
33 |   doroctraintest: false
34 |   doimportance: false
35 |   doimportanceshap: false
36 |   docorrelation: false
37 |   dolearningcurve: false
38 |   doapplytodatamc: false
39 |   doscancuts: false
40 |   doefficiency: false
41 |   dosignifopt: false
42 |   doboundary: false
43 |   docrossvalidation: false
44 |   dogridsearch: false
45 |   dobayesianopt: false
46 | 
47 | mlapplication:
48 |   data:
49 |     doapply: false # pkl_skimmed_dec (pklskdec)
50 |     domergeapply: false # pkl_skimmed_decmerged (pklskdecmerged)
51 |     docontinueafterstop: false # set to true to resume interrupted processing (existing corrupted output will be overwritten)
52 |   mc:
53 |     doapply: false # pkl_skimmed_dec (pklskdec)
54 |     domergeapply: false # pkl_skimmed_decmerged (pklskdecmerged)
55 |     docontinueafterstop: false # set to true to resume interrupted processing (existing corrupted output will be overwritten)
56 | 
57 | analysis:
58 |   type: "YYYY" # used unless specified explicitly as do_entire_analysis -a type_ana
59 |   # Do each period separately including merged (true)
60 |   # Do only merged (false)
61 |   doperperiod: false
62 |   data:
63 |     histomass: false # processer: process_histomass
64 |   mc:
65 |     histomass: false # processer: process_histomass
66 |     efficiency: false # processer: process_efficiency
67 |   steps: # analyzer methods to run (uncomment to activate)
68 |     ##### Inclusive hadrons
69 |     # fit:
70 |     # efficiency:
71 |     # makenormyields:
72 |     ##### Jets
73 |     init:
74 |     calculate_efficiencies:
75 |     qa:
76 |     fit:
77 |     estimate_feeddown:
78 |     analyze_with_sidesub:
79 |     # analyze_with_sigextr:
80 | 
81 | systematics:
82 |   cutvar:
83 |     activate: false
84 |     do_only_analysis: false # This can be done anytime when mass and efficiency histograms have been produced already for a number of trials
 85 |     resume: false # mass and efficiency histograms that already exist are not produced again; processing continues with the remaining trials
86 |   mcptshape:
87 |     activate: false
88 | 


--------------------------------------------------------------------------------
/machine_learning_hep/submission/data.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | case: XXXX # used to find the database file unless specified explicitly as do_entire_analysis -d database_analysis
 3 | download:
 4 |   alice:
 5 |     activate: false
 6 | conversion: # pkl
 7 |   mc:
 8 |     activate: false
 9 |   data:
10 |     activate: true
11 | skimming: # pkl_skimmed (pklsk), pkl_evtcounter_all
12 |   mc:
13 |     activate: false
14 |   data:
15 |     activate: true
16 | merging: # pkl_skimmed_merge_for_ml (pklskml)
17 |   mc:
18 |     activate: false
19 |   data:
20 |     activate: false
21 | mergingperiods: # pkl_skimmed_merge_for_ml_all
22 |   mc:
23 |     activate: false
24 |   data:
25 |     activate: false
26 | 
27 | ml_study: # mlout, mlplot
28 |   activate: false
29 |   dotraining: false
30 |   dotesting: false
31 |   doplotdistr: false
32 |   doroc: false
33 |   doroctraintest: false
34 |   doimportance: false
35 |   doimportanceshap: false
36 |   docorrelation: false
37 |   dolearningcurve: false
38 |   doapplytodatamc: false
39 |   doscancuts: false
40 |   doefficiency: false
41 |   dosignifopt: false
42 |   doboundary: false
43 |   docrossvalidation: false
44 |   dogridsearch: false
45 |   dobayesianopt: false
46 | 
47 | mlapplication:
48 |   data:
49 |     doapply: false # pkl_skimmed_dec (pklskdec)
50 |     domergeapply: false # pkl_skimmed_decmerged (pklskdecmerged)
51 |     docontinueafterstop: false # set to true to resume interrupted processing (existing corrupted output will be overwritten)
52 |   mc:
53 |     doapply: false # pkl_skimmed_dec (pklskdec)
54 |     domergeapply: false # pkl_skimmed_decmerged (pklskdecmerged)
55 |     docontinueafterstop: false # set to true to resume interrupted processing (existing corrupted output will be overwritten)
56 | 
57 | analysis:
58 |   type: "YYYY" # used unless specified explicitly as do_entire_analysis -a type_ana
59 |   # Do each period separately including merged (true)
60 |   # Do only merged (false)
61 |   doperperiod: false
62 |   data:
63 |     histomass: false # processer: process_histomass
64 |   mc:
65 |     histomass: false # processer: process_histomass
66 |     efficiency: false # processer: process_efficiency
67 |   steps:
68 | 
69 | systematics:
70 |   cutvar:
71 |     activate: false
72 |     do_only_analysis: false # This can be done anytime when mass and efficiency histograms have been produced already for a number of trials
 73 |     resume: false # mass and efficiency histograms that already exist are not produced again; processing continues with the remaining trials
74 |   mcptshape:
75 |     activate: false
76 | 


--------------------------------------------------------------------------------
/machine_learning_hep/submission/full_analysis.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | case: XXXX # used to find the database file unless specified explicitly as do_entire_analysis -d database_analysis
 3 | download:
 4 |   alice:
 5 |     activate: false
 6 | conversion: # pkl
 7 |   mc:
 8 |     activate: true
 9 |   data:
10 |     activate: true
11 | skimming: # pkl_skimmed (pklsk), pkl_evtcounter_all
12 |   mc:
13 |     activate: true
14 |   data:
15 |     activate: true
16 | merging: # pkl_skimmed_merge_for_ml (pklskml)
17 |   mc:
18 |     activate: false
19 |   data:
20 |     activate: false
21 | mergingperiods: # pkl_skimmed_merge_for_ml_all
22 |   mc:
23 |     activate: false
24 |   data:
25 |     activate: false
26 | 
27 | ml_study: # mlout, mlplot
28 |   activate: false
29 |   dotraining: false
30 |   dotesting: false
31 |   doplotdistr: false
32 |   doroc: false
33 |   doroctraintest: false
34 |   doimportance: false
35 |   doimportanceshap: false
36 |   docorrelation: false
37 |   dolearningcurve: false
38 |   doapplytodatamc: false
39 |   doscancuts: false
40 |   doefficiency: false
41 |   dosignifopt: false
42 |   doboundary: false
43 |   docrossvalidation: false
44 |   dogridsearch: false
45 |   dobayesianopt: false
46 | 
47 | mlapplication:
48 |   data:
49 |     doapply: false # pkl_skimmed_dec (pklskdec)
50 |     domergeapply: false # pkl_skimmed_decmerged (pklskdecmerged)
51 |     docontinueafterstop: false # set to true to resume interrupted processing (existing corrupted output will be overwritten)
52 |   mc:
53 |     doapply: false # pkl_skimmed_dec (pklskdec)
54 |     domergeapply: false # pkl_skimmed_decmerged (pklskdecmerged)
55 |     docontinueafterstop: false # set to true to resume interrupted processing (existing corrupted output will be overwritten)
56 | 
57 | analysis:
58 |   type: "YYYY" # used unless specified explicitly as do_entire_analysis -a type_ana
59 |   # Do each period separately including merged (true)
60 |   # Do only merged (false)
61 |   doperperiod: false
62 |   data:
63 |     histomass: true # processor: process_histomass
64 |   mc:
65 |     histomass: true # processor: process_histomass
66 |     efficiency: true # processor: process_efficiency
67 |   steps: # analyzer methods to run (uncomment to activate)
68 |     ##### Inclusive hadrons
69 |     # fit:
70 |     # efficiency:
71 |     # makenormyields:
72 |     ##### Jets
73 |     init:
74 |     calculate_efficiencies:
75 |     qa:
76 |     fit:
77 |     estimate_feeddown:
78 |     analyze_with_sidesub:
79 |     # analyze_with_sigextr:
80 | 
81 | systematics:
82 |   cutvar:
83 |     activate: false
84 |     do_only_analysis: false # This can be done anytime when mass and efficiency histograms have been produced already for a number of trials
85 |     resume: false # already done mass and efficiency histograms will not be done again, continue with left trials
86 |   mcptshape:
87 |     activate: false
88 | 


--------------------------------------------------------------------------------
/machine_learning_hep/submission/mc.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | case: XXXX # used to find the database file unless specified explicitly as do_entire_analysis -d database_analysis
 3 | download:
 4 |   alice:
 5 |     activate: false
 6 | conversion: # pkl
 7 |   mc:
 8 |     activate: true
 9 |   data:
10 |     activate: false
11 | skimming: # pkl_skimmed (pklsk), pkl_evtcounter_all
12 |   mc:
13 |     activate: true
14 |   data:
15 |     activate: false
16 | merging: # pkl_skimmed_merge_for_ml (pklskml)
17 |   mc:
18 |     activate: false
19 |   data:
20 |     activate: false
21 | mergingperiods: # pkl_skimmed_merge_for_ml_all
22 |   mc:
23 |     activate: false
24 |   data:
25 |     activate: false
26 | 
27 | ml_study: # mlout, mlplot
28 |   activate: false
29 |   dotraining: false
30 |   dotesting: false
31 |   doplotdistr: false
32 |   doroc: false
33 |   doroctraintest: false
34 |   doimportance: false
35 |   doimportanceshap: false
36 |   docorrelation: false
37 |   dolearningcurve: false
38 |   doapplytodatamc: false
39 |   doscancuts: false
40 |   doefficiency: false
41 |   dosignifopt: false
42 |   doboundary: false
43 |   docrossvalidation: false
44 |   dogridsearch: false
45 |   dobayesianopt: false
46 | 
47 | mlapplication:
48 |   data:
49 |     doapply: false # pkl_skimmed_dec (pklskdec)
50 |     domergeapply: false # pkl_skimmed_decmerged (pklskdecmerged)
51 |     docontinueafterstop: false # set to true to resume interrupted processing (existing corrupted output will be overwritten)
52 |   mc:
53 |     doapply: false # pkl_skimmed_dec (pklskdec)
54 |     domergeapply: false # pkl_skimmed_decmerged (pklskdecmerged)
55 |     docontinueafterstop: false # set to true to resume interrupted processing (existing corrupted output will be overwritten)
56 | 
57 | analysis:
58 |   type: "YYYY" # used unless specified explicitly as do_entire_analysis -a type_ana
59 |   # Do each period separately including merged (true)
60 |   # Do only merged (false)
61 |   doperperiod: false
62 |   data:
63 |     histomass: false # processor: process_histomass
64 |   mc:
65 |     histomass: false # processor: process_histomass
66 |     efficiency: false # processor: process_efficiency
67 |   steps:
68 | 
69 | systematics:
70 |   cutvar:
71 |     activate: false
72 |     do_only_analysis: false # This can be done anytime when mass and efficiency histograms have been produced already for a number of trials
73 |     resume: false # already done mass and efficiency histograms will not be done again, continue with left trials
74 |   mcptshape:
75 |     activate: false
76 | 


--------------------------------------------------------------------------------
/machine_learning_hep/submission/mlapp.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | case: XXXX # used to find the database file unless specified explicitly as do_entire_analysis -d database_analysis
 3 | download:
 4 |   alice:
 5 |     activate: false
 6 | conversion: # pkl
 7 |   mc:
 8 |     activate: false
 9 |   data:
10 |     activate: false
11 | skimming: # pkl_skimmed (pklsk), pkl_evtcounter_all
12 |   mc:
13 |     activate: false
14 |   data:
15 |     activate: false
16 | merging: # pkl_skimmed_merge_for_ml (pklskml)
17 |   mc:
18 |     activate: false
19 |   data:
20 |     activate: false
21 | mergingperiods: # pkl_skimmed_merge_for_ml_all
22 |   mc:
23 |     activate: false
24 |   data:
25 |     activate: false
26 | 
27 | ml_study: # mlout, mlplot
28 |   activate: false
29 |   dotraining: false
30 |   dotesting: false
31 |   doplotdistr: false
32 |   doroc: false
33 |   doroctraintest: false
34 |   doimportance: false
35 |   doimportanceshap: false
36 |   docorrelation: false
37 |   dolearningcurve: false
38 |   doapplytodatamc: false
39 |   doscancuts: false
40 |   doefficiency: false
41 |   dosignifopt: false
42 |   doboundary: false
43 |   docrossvalidation: false
44 |   dogridsearch: false
45 |   dobayesianopt: false
46 | 
47 | mlapplication:
48 |   data:
49 |     doapply: true # pkl_skimmed_dec (pklskdec)
50 |     domergeapply: true # pkl_skimmed_decmerged (pklskdecmerged)
51 |     docontinueafterstop: false # set to true to resume interrupted processing (existing corrupted output will be overwritten)
52 |   mc:
53 |     doapply: true # pkl_skimmed_dec (pklskdec)
54 |     domergeapply: true # pkl_skimmed_decmerged (pklskdecmerged)
55 |     docontinueafterstop: false # set to true to resume interrupted processing (existing corrupted output will be overwritten)
56 | 
57 | analysis:
58 |   type: "YYYY" # used unless specified explicitly as do_entire_analysis -a type_ana
59 |   # Do each period separately including merged (true)
60 |   # Do only merged (false)
61 |   doperperiod: false
62 |   data:
63 |     histomass: false # processor: process_histomass
64 |   mc:
65 |     histomass: false # processor: process_histomass
66 |     efficiency: false # processor: process_efficiency
67 |   steps:
68 | 
69 | systematics:
70 |   cutvar:
71 |     activate: false
72 |     do_only_analysis: false # This can be done anytime when mass and efficiency histograms have been produced already for a number of trials
73 |     resume: false # already done mass and efficiency histograms will not be done again, continue with left trials
74 |   mcptshape:
75 |     activate: false
76 | 


--------------------------------------------------------------------------------
/machine_learning_hep/submission/mltrain.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | case: XXXX # used to find the database file unless specified explicitly as do_entire_analysis -d database_analysis
 3 | download:
 4 |   alice:
 5 |     activate: false
 6 | conversion: # pkl
 7 |   mc:
 8 |     activate: false
 9 |   data:
10 |     activate: false
11 | skimming: # pkl_skimmed (pklsk), pkl_evtcounter_all
12 |   mc:
13 |     activate: false
14 |   data:
15 |     activate: false
16 | merging: # pkl_skimmed_merge_for_ml (pklskml)
17 |   mc:
18 |     activate: false
19 |   data:
20 |     activate: false
21 | mergingperiods: # pkl_skimmed_merge_for_ml_all
22 |   mc:
23 |     activate: false
24 |   data:
25 |     activate: false
26 | 
27 | ml_study: # mlout, mlplot
28 |   activate: true
29 |   dotraining: true
30 |   dotesting: true
31 |   doplotdistr: false
32 |   doroc: true
33 |   doroctraintest: true
34 |   doimportance: false
35 |   doimportanceshap: false
36 |   docorrelation: false
37 |   dolearningcurve: false
38 |   doapplytodatamc: false
39 |   doscancuts: false
40 |   doefficiency: false
41 |   dosignifopt: false
42 |   doboundary: false
43 |   docrossvalidation: false
44 |   dogridsearch: false
45 |   dobayesianopt: false
46 | 
47 | mlapplication:
48 |   data:
49 |     doapply: false # pkl_skimmed_dec (pklskdec)
50 |     domergeapply: false # pkl_skimmed_decmerged (pklskdecmerged)
51 |     docontinueafterstop: false # set to true to resume interrupted processing (existing corrupted output will be overwritten)
52 |   mc:
53 |     doapply: false # pkl_skimmed_dec (pklskdec)
54 |     domergeapply: false # pkl_skimmed_decmerged (pklskdecmerged)
55 |     docontinueafterstop: false # set to true to resume interrupted processing (existing corrupted output will be overwritten)
56 | 
57 | analysis:
58 |   type: "YYYY" # used unless specified explicitly as do_entire_analysis -a type_ana
59 |   # Do each period separately including merged (true)
60 |   # Do only merged (false)
61 |   doperperiod: false
62 |   data:
63 |     histomass: false # processor: process_histomass
64 |   mc:
65 |     histomass: false # processor: process_histomass
66 |     efficiency: false # processor: process_efficiency
67 |   steps:
68 | 
69 | systematics:
70 |   cutvar:
71 |     activate: false
72 |     do_only_analysis: false # This can be done anytime when mass and efficiency histograms have been produced already for a number of trials
73 |     resume: false # already done mass and efficiency histograms will not be done again, continue with left trials
74 |   mcptshape:
75 |     activate: false
76 | 


--------------------------------------------------------------------------------
/machine_learning_hep/submission/preprocess.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | case: XXXX # used to find the database file unless specified explicitly as do_entire_analysis -d database_analysis
 3 | download:
 4 |   alice:
 5 |     activate: false
 6 | conversion: # pkl
 7 |   mc:
 8 |     activate: true
 9 |   data:
10 |     activate: true
11 | skimming: # pkl_skimmed (pklsk), pkl_evtcounter_all
12 |   mc:
13 |     activate: true
14 |   data:
15 |     activate: true
16 | merging: # pkl_skimmed_merge_for_ml (pklskml)
17 |   mc:
18 |     activate: false
19 |   data:
20 |     activate: false
21 | mergingperiods: # pkl_skimmed_merge_for_ml_all
22 |   mc:
23 |     activate: false
24 |   data:
25 |     activate: false
26 | 
27 | ml_study: # mlout, mlplot
28 |   activate: false
29 |   dotraining: false
30 |   dotesting: false
31 |   doplotdistr: false
32 |   doroc: false
33 |   doroctraintest: false
34 |   doimportance: false
35 |   doimportanceshap: false
36 |   docorrelation: false
37 |   dolearningcurve: false
38 |   doapplytodatamc: false
39 |   doscancuts: false
40 |   doefficiency: false
41 |   dosignifopt: false
42 |   doboundary: false
43 |   docrossvalidation: false
44 |   dogridsearch: false
45 |   dobayesianopt: false
46 | 
47 | mlapplication:
48 |   data:
49 |     doapply: false # pkl_skimmed_dec (pklskdec)
50 |     domergeapply: false # pkl_skimmed_decmerged (pklskdecmerged)
51 |     docontinueafterstop: false # set to true to resume interrupted processing (existing corrupted output will be overwritten)
52 |   mc:
53 |     doapply: false # pkl_skimmed_dec (pklskdec)
54 |     domergeapply: false # pkl_skimmed_decmerged (pklskdecmerged)
55 |     docontinueafterstop: false # set to true to resume interrupted processing (existing corrupted output will be overwritten)
56 | 
57 | analysis:
58 |   type: "YYYY" # used unless specified explicitly as do_entire_analysis -a type_ana
59 |   # Do each period separately including merged (true)
60 |   # Do only merged (false)
61 |   doperperiod: false
62 |   data:
63 |     histomass: false # processor: process_histomass
64 |   mc:
65 |     histomass: false # processor: process_histomass
66 |     efficiency: false # processor: process_efficiency
67 |   steps:
68 | 
69 | systematics:
70 |   cutvar:
71 |     activate: false
72 |     do_only_analysis: false # This can be done anytime when mass and efficiency histograms have been produced already for a number of trials
73 |     resume: false # already done mass and efficiency histograms will not be done again, continue with left trials
74 |   mcptshape:
75 |     activate: false
76 | 


--------------------------------------------------------------------------------
/machine_learning_hep/submission/processor.yml:
--------------------------------------------------------------------------------
 1 | ---
 2 | case: XXXX # used to find the database file unless specified explicitly as do_entire_analysis -d database_analysis
 3 | download:
 4 |   alice:
 5 |     activate: false
 6 | conversion: # pkl
 7 |   mc:
 8 |     activate: false
 9 |   data:
10 |     activate: false
11 | skimming: # pkl_skimmed (pklsk), pkl_evtcounter_all
12 |   mc:
13 |     activate: false
14 |   data:
15 |     activate: false
16 | merging: # pkl_skimmed_merge_for_ml (pklskml)
17 |   mc:
18 |     activate: false
19 |   data:
20 |     activate: false
21 | mergingperiods: # pkl_skimmed_merge_for_ml_all
22 |   mc:
23 |     activate: false
24 |   data:
25 |     activate: false
26 | 
27 | ml_study: # mlout, mlplot
28 |   activate: false
29 |   dotraining: false
30 |   dotesting: false
31 |   doplotdistr: false
32 |   doroc: false
33 |   doroctraintest: false
34 |   doimportance: false
35 |   doimportanceshap: false
36 |   docorrelation: false
37 |   dolearningcurve: false
38 |   doapplytodatamc: false
39 |   doscancuts: false
40 |   doefficiency: false
41 |   dosignifopt: false
42 |   doboundary: false
43 |   docrossvalidation: false
44 |   dogridsearch: false
45 |   dobayesianopt: false
46 | 
47 | mlapplication:
48 |   data:
49 |     doapply: false # pkl_skimmed_dec (pklskdec)
50 |     domergeapply: false # pkl_skimmed_decmerged (pklskdecmerged)
51 |     docontinueafterstop: false # set to true to resume interrupted processing (existing corrupted output will be overwritten)
52 |   mc:
53 |     doapply: false # pkl_skimmed_dec (pklskdec)
54 |     domergeapply: false # pkl_skimmed_decmerged (pklskdecmerged)
55 |     docontinueafterstop: false # set to true to resume interrupted processing (existing corrupted output will be overwritten)
56 | 
57 | analysis:
58 |   type: "YYYY" # used unless specified explicitly as do_entire_analysis -a type_ana
59 |   # Do each period separately including merged (true)
60 |   # Do only merged (false)
61 |   doperperiod: false
62 |   data:
63 |     histomass: true # processor: process_histomass
64 |   mc:
65 |     histomass: true # processor: process_histomass
66 |     efficiency: true # processor: process_efficiency
67 |   steps:
68 | 
69 | systematics:
70 |   cutvar:
71 |     activate: false
72 |     do_only_analysis: false # This can be done anytime when mass and efficiency histograms have been produced already for a number of trials
73 |     resume: false # already done mass and efficiency histograms will not be done again, continue with left trials
74 |   mcptshape:
75 |     activate: false
76 | 


--------------------------------------------------------------------------------
/machine_learning_hep/submit.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | ##### Configuration
 4 | 
 5 | # Analysis stage
 6 | 
 7 | # STAGE="all_off"        # all steps disabled
 8 | STAGE="full_analysis"    # stage preprocess + stage analysis (requires train output in "(data|mc)/prefix_dir")
 9 | # STAGE="preprocess"     # conversion, skimming (requires train output in "(data|mc)/prefix_dir")
10 | # STAGE="data"           # stage preprocess: data (requires train output in "data/prefix_dir")
11 | # STAGE="mc"             # stage preprocess: mc (requires train output in "mc/prefix_dir")
12 | # STAGE="mltrain"        # ml_study
13 | # STAGE="mlapp"          # mlapplication
14 | # STAGE="analysis"       # stage processor + stage analyzer (requires stage preprocess done)
15 | # STAGE="processor"      # analysis/(data|mc)/(histomass|efficiency) (requires stage preprocess done)
16 | # STAGE="analyzer"       # analysis/steps (requires stage processor done)
17 | # STAGE="variations"     # run analysis variations (requires stage analyzer done)
18 | # STAGE="systematics"    # calculate and plot systematics (requires stage variations done)
19 | # STAGE="plotting"       # make analysis plots (requires stage systematics done)
20 | 
21 | # Suffix of the analysis database name
22 | 
23 | DATABASE="D0Jet_pp"
24 | # DATABASE="LcJet_pp"
25 | 
26 | # Name of the analysis section in the analysis database
27 | 
28 | ANALYSIS="jet_obs"
29 | 
30 | ##### Initialisation
31 | 
32 | DIR_THIS="$(dirname "$(realpath "$0")")"  # This directory
33 | DBDIR="data/data_run3"
34 | DB_DEFAULT="${DIR_THIS}/${DBDIR}/database_ml_parameters_${DATABASE}.yml"
35 | LOG="log_${STAGE}_${DATABASE}_${ANALYSIS}.log"
36 | LOG_ERR="${LOG/.log/_err.log}"
37 | LOG_TMP="${LOG/.log/_tmp.log}"
38 | 
39 | ##### Execution
40 | 
41 | echo "$(date) Start"
42 | echo "Running the \"${STAGE}\" stage of the \"${ANALYSIS}\" analysis from the \"${DATABASE}\" database"
43 | 
44 | if [[ "${STAGE}" == "plotting" ]]; then
45 |     echo "Log file: $LOG"
46 |     python "${DIR_THIS}/plotting/plot_jetsubstructure_run3.py" -d "${DB_DEFAULT}" -a "${ANALYSIS}" > "${LOG}" 2>&1
47 | elif [[ "${STAGE}" == "systematics" ]]; then
48 |     echo "Log file: $LOG"
49 |     python "${DIR_THIS}/analysis/do_systematics.py" -d "${DB_DEFAULT}" -a "${ANALYSIS}" > "${LOG}" 2>&1
50 | elif [[ "${STAGE}" == "variations" ]]; then
51 |     DB_VARIATION="${DIR_THIS}/${DBDIR}/database_variations_${DATABASE}_${ANALYSIS}.yml"
52 |     CONFIG_FILE="${DIR_THIS}/submission/analysis.yml"
53 |     "${DIR_THIS}/submit_variations.sh" "${DB_DEFAULT}" "${DB_VARIATION}" "${ANALYSIS}" "${CONFIG_FILE}"
54 | else
55 |     echo "Log file: $LOG"
56 |     CONFIG_FILE="${DIR_THIS}/submission/${STAGE}.yml"
57 |     CMD_ANA="mlhep -a ${ANALYSIS} -r ${CONFIG_FILE} -d ${DB_DEFAULT} -b --delete"
58 |     ${CMD_ANA} > "${LOG}" 2>&1
59 | fi || echo "Error"
60 | 
61 | ml-log() { grep -e "Initial" -e "Unpacking" -e "Skimming" -e "Process" -e "Run workflow step" -e "Running analysis" -e "Analysis complete" -e "Done" -e "CRITICAL" -B 1 "$1" | grep -v "\--"; }
62 | 
63 | echo "Grepping issues into ${LOG_ERR}"
64 | grep -e "Error in " -e "Failed " "${LOG}" > "${LOG_ERR}"
65 | grep -A 1 -e WARN -e ERROR -e FATAL -e CRITICAL "${LOG}" >> "${LOG_ERR}"
66 | 
67 | echo "Grepping timestamps into ${LOG_TMP}"
68 | ml-log "${LOG}" > "${LOG_TMP}"
69 | 
70 | echo "$(date) Done"
71 | 


--------------------------------------------------------------------------------
/machine_learning_hep/submit_variations.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | [ "$4" ] || { echo "Usage: $0 <default database> <variation database> <analysis>"; exit 0; }
 4 | 
 5 | ErrExit() { echo "Error running variations"; exit 1; }
 6 | 
 7 | # This directory
 8 | DIR_THIS="$(dirname "$(realpath "$0")")"
 9 | 
10 | DB_DEFAULT="$1"
11 | DB_VARIATION="$2"
12 | ANALYSIS="$3"
13 | CONFIG_FILE="$4"
14 | RUN=0
15 | CMD_VAR="python ${DIR_THIS}/do_variations.py ${DB_DEFAULT} ${DB_VARIATION}"
16 | declare -a NJOBS  # number of parallel jobs
17 | NJOBS[0]=50  # for variations without processor
18 | NJOBS[1]=5   # for variations with processor
19 | SCRIPT="script.sh" # name of the script with the execution lines
20 | 
21 | ${CMD_VAR} || ErrExit
22 | 
23 | echo -e "\nDo you wish to run these variations?"
24 | while true; do
25 |   read -r -p "Answer: " yn
26 |   case $yn in
27 |     [y] ) echo "Proceeding"; RUN=1; break;;
28 |     [n] ) echo "Aborting"; break;;
29 |     * ) echo "Please answer y or n.";;
30 |   esac
31 | done
32 | 
33 | if ((RUN)); then
34 |   echo -e "\nRunning variations"
35 |   for PROC in 0 1; do
36 |     ${CMD_VAR} -a "${ANALYSIS}" -r "${CONFIG_FILE}" -s "$SCRIPT" -p $PROC && parallel --will-cite --progress -j "${NJOBS[$PROC]}" < "$SCRIPT"
37 |   done || ErrExit
38 | else
39 |   echo -e "\nCleaning"
40 |   ${CMD_VAR} -c -s "$SCRIPT" || ErrExit
41 | fi
42 | 


--------------------------------------------------------------------------------
/machine_learning_hep/templates_keras.py:
--------------------------------------------------------------------------------
  1 | #############################################################################
  2 | ##  © Copyright CERN 2018. All rights not expressly granted are reserved.  ##
  3 | ##                 Author: Gian.Michele.Innocenti@cern.ch                  ##
  4 | ## This program is free software: you can redistribute it and/or modify it ##
  5 | ##  under the terms of the GNU General Public License as published by the  ##
  6 | ## Free Software Foundation, either version 3 of the License, or (at your  ##
  7 | ## option) any later version. This program is distributed in the hope that ##
  8 | ##  it will be useful, but WITHOUT ANY WARRANTY; without even the implied  ##
  9 | ##     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.    ##
 10 | ##           See the GNU General Public License for more details.          ##
 11 | ##    You should have received a copy of the GNU General Public License    ##
 12 | ##   along with this program. if not, see <https://www.gnu.org/licenses/>. ##
 13 | #############################################################################
 14 | 
 15 | from copy import deepcopy
 16 | 
 17 | from hyperopt import hp
 18 | from hyperopt.pyll import scope
 19 | from keras.layers import Dense, Input
 20 | from keras.models import Model
 21 | from keras.wrappers.scikit_learn import KerasClassifier
 22 | 
 23 | from machine_learning_hep.optimisation.bayesian_opt import BayesianOpt
 24 | from machine_learning_hep.optimisation.metrics import get_scorers
 25 | 
 26 | 
 27 | def keras_classifier_(model_config, input_length):
 28 |     """
 29 |     NN for binary classification with 1 hidden layers
 30 |     """
 31 |     # Create layers
 32 |     inputs = Input(shape=(input_length,))
 33 |     layer = Dense(model_config["layers"][0]["n_nodes"], activation=model_config["layers"][0]["activation"])(inputs)
 34 |     predictions = Dense(1, activation="sigmoid")(layer)
 35 |     # Build model from layers
 36 |     model = Model(inputs=inputs, outputs=predictions)
 37 |     model.compile(loss=model_config["loss"], optimizer=model_config["optimizer"], metrics=["accuracy"])
 38 |     return model
 39 | 
 40 | 
 41 | def keras_classifier(model_config, input_length):
 42 |     return KerasClassifier(
 43 |         build_fn=lambda: keras_classifier_(model_config, input_length),
 44 |         epochs=model_config["epochs"],
 45 |         batch_size=model_config["batch_size"],
 46 |         verbose=1,
 47 |     )
 48 | 
 49 | 
 50 | def keras_classifier_bayesian_space():
 51 |     return {
 52 |         "n_nodes": hp.choice(
 53 |             "x_n_nodes",
 54 |             [
 55 |                 [scope.int(hp.quniform("x_n_nodes_1", 12, 64, 1)), scope.int(hp.quniform("x_n_nodes_2", 12, 64, 1))],
 56 |                 [
 57 |                     scope.int(hp.quniform("x_n_nodes_1", 12, 64, 1)),
 58 |                     scope.int(hp.quniform("x_n_nodes_2", 12, 64, 1)),
 59 |                     scope.int(hp.quniform("x_n_nodes_3", 12, 64, 1)),
 60 |                 ],
 61 |             ],
 62 |         ),
 63 |         "activation_0": hp.choice("x_activation_0", ["relu", "sigmoid"]),
 64 |         "activation_1": hp.choice("x_activation_1", ["relu", "sigmoid"]),
 65 |         "epochs": scope.int(hp.quniform("x_epochs", 50, 100, 1)),
 66 |         "batch_size": scope.int(hp.quniform("x_batch_size", 28, 256, 1)),
 67 |     }
 68 | 
 69 | 
 70 | class KerasClassifierBayesianOpt(BayesianOpt):  # pylint: disable=too-many-instance-attributes
 71 |     def __init__(self, model_config, space, input_length):
 72 |         super().__init__(model_config, space)
 73 |         self.input_length = input_length
 74 |         # Cache drawn space and model config to build the model several times in
 75 |         # self.get_scikit_model (should have these available but cannot take arguments
 76 |         self.model_config_tmp = None
 77 |         self.space_tmp = None
 78 | 
 79 |     def get_scikit_model(self):
 80 |         """Just a helper funtion
 81 | 
 82 |         KerasClassifier needs something callable to obtain the model
 83 | 
 84 |         """
 85 |         inputs = Input(shape=(self.input_length,))
 86 |         layer = Dense(self.space_tmp["n_nodes"][0], activation=self.space_tmp["activation_0"])(inputs)
 87 |         for i, n_nodes in enumerate(self.space_tmp["n_nodes"][1:]):
 88 |             layer = Dense(n_nodes, activation=self.space_tmp[f"activation_{(i + 1) % 2}"])(layer)
 89 |         predictions = Dense(1, activation="sigmoid")(layer)
 90 |         # Build model from layers
 91 |         model = Model(inputs=inputs, outputs=predictions)
 92 |         model.compile(
 93 |             loss=self.model_config_tmp["loss"], optimizer=self.model_config_tmp["optimizer"], metrics=["accuracy"]
 94 |         )
 95 |         return model
 96 | 
 97 |     def yield_model_(self, model_config, space):
 98 |         self.space_tmp = deepcopy(space)
 99 |         self.model_config_tmp = deepcopy(model_config)
100 | 
101 |         return KerasClassifier(
102 |             build_fn=self.get_scikit_model, epochs=space["epochs"], batch_size=space["batch_size"], verbose=1
103 |         ), space
104 | 
105 |     def save_model_(self, model, out_dir):
106 |         """Not implemented yet"""
107 | 
108 | 
def keras_classifier_bayesian_opt(model_config, input_length):
    """
    Create and configure the Bayesian optimiser for the Keras classifier.

    The optimiser is set up with 3 folds, 30 trials and the AUC and Accuracy scorers,
    with AUC as the score to optimise (higher is better).
    """
    search_space = keras_classifier_bayesian_space()
    optimiser = KerasClassifierBayesianOpt(model_config, search_space, input_length)
    optimiser.nkfolds = 3
    optimiser.scoring = get_scorers(["AUC", "Accuracy"])
    optimiser.scoring_opt = "AUC"
    optimiser.low_is_better = False
    optimiser.n_trials = 30
    return optimiser
117 | 


--------------------------------------------------------------------------------
/machine_learning_hep/templates_scikit.py:
--------------------------------------------------------------------------------
 1 | #############################################################################
 2 | ##  © Copyright CERN 2018. All rights not expressly granted are reserved.  ##
 3 | ##                 Author: Gian.Michele.Innocenti@cern.ch                  ##
 4 | ## This program is free software: you can redistribute it and/or modify it ##
 5 | ##  under the terms of the GNU General Public License as published by the  ##
 6 | ## Free Software Foundation, either version 3 of the License, or (at your  ##
 7 | ## option) any later version. This program is distributed in the hope that ##
 8 | ##  it will be useful, but WITHOUT ANY WARRANTY; without even the implied  ##
 9 | ##     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.    ##
10 | ##           See the GNU General Public License for more details.          ##
11 | ##    You should have received a copy of the GNU General Public License    ##
12 | ##   along with this program. if not, see <https://www.gnu.org/licenses/>. ##
13 | #############################################################################
14 | 
15 | from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
16 | from sklearn.linear_model import Lasso, LinearRegression, Ridge
17 | from sklearn.tree import DecisionTreeClassifier
18 | 
19 | 
def scikit_random_forest_classifier(model_config):
    """Create a RandomForestClassifier with max_depth, n_estimators and max_features from model_config."""
    depth = model_config["max_depth"]
    n_trees = model_config["n_estimators"]
    features = model_config["max_features"]
    return RandomForestClassifier(max_depth=depth, n_estimators=n_trees, max_features=features)
26 | 
27 | 
def scikit_adaboost_classifier(model_config):  # pylint: disable=W0613
    """Create an AdaBoostClassifier with default settings; model_config is not used."""
    classifier = AdaBoostClassifier()
    return classifier
30 | 
31 | 
def scikit_decision_tree_classifier(model_config):
    """Create a DecisionTreeClassifier with max_depth taken from model_config."""
    depth = model_config["max_depth"]
    return DecisionTreeClassifier(max_depth=depth)
34 | 
35 | 
def scikit_linear_regression(model_config):  # pylint: disable=W0613
    """Create a LinearRegression model with default settings; model_config is not used."""
    regressor = LinearRegression()
    return regressor
38 | 
39 | 
def scikit_ridge_regression(model_config):
    """Create a Ridge regression model with alpha and solver taken from model_config."""
    alpha = model_config["alpha"]
    solver = model_config["solver"]
    return Ridge(alpha=alpha, solver=solver)
42 | 
43 | 
def scikit_lasso_regression(model_config):
    """Create a Lasso regression model with alpha taken from model_config."""
    alpha = model_config["alpha"]
    return Lasso(alpha=alpha)
46 | 


--------------------------------------------------------------------------------
/machine_learning_hep/templates_xgboost.py:
--------------------------------------------------------------------------------
 1 | #############################################################################
 2 | ##  © Copyright CERN 2023. All rights not expressly granted are reserved.  ##
 3 | ##                 Author: Gian.Michele.Innocenti@cern.ch                  ##
 4 | ## This program is free software: you can redistribute it and/or modify it ##
 5 | ##  under the terms of the GNU General Public License as published by the  ##
 6 | ## Free Software Foundation, either version 3 of the License, or (at your  ##
 7 | ## option) any later version. This program is distributed in the hope that ##
 8 | ##  it will be useful, but WITHOUT ANY WARRANTY; without even the implied  ##
 9 | ##     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.    ##
10 | ##           See the GNU General Public License for more details.          ##
11 | ##    You should have received a copy of the GNU General Public License    ##
12 | ##   along with this program. if not, see <https://www.gnu.org/licenses/>. ##
13 | #############################################################################
14 | 
15 | import pickle
16 | from os.path import join
17 | 
18 | from hyperopt import hp
19 | from hyperopt.pyll import scope
20 | from xgboost import XGBClassifier
21 | 
22 | from machine_learning_hep.optimisation.bayesian_opt import BayesianOpt
23 | from machine_learning_hep.optimisation.metrics import get_scorers
24 | 
25 | 
def xgboost_classifier(model_config):
    """
    Create an XGBClassifier from the keyword arguments in model_config.

    The verbosity defaults to 1. A "verbosity" entry in model_config overrides the default
    instead of clashing with it as a duplicate keyword argument.
    """
    params = {"verbosity": 1, **model_config}
    return XGBClassifier(**params)
32 | 
33 | 
def xgboost_classifier_bayesian_space():
    """Return the hyperopt search space for the XGBoost classifier hyperparameters."""

    def integer(label, low, high):
        # Uniform draw quantised in steps of 1 and cast to int
        return scope.int(hp.quniform(label, low, high, 1))

    space = {}
    space["max_depth"] = integer("x_max_depth", 1, 3)
    space["n_estimators"] = integer("x_n_estimators", 100, 1000)
    space["min_child_weight"] = integer("x_min_child", 1, 10)
    space["subsample"] = hp.uniform("x_subsample", 0.5, 0.9)
    space["gamma"] = hp.uniform("x_gamma", 0.0, 0.2)
    space["colsample_bytree"] = hp.uniform("x_colsample_bytree", 0.5, 1.0)
    space["colsample_bylevel"] = hp.uniform("x_colsample_bylevel", 0.5, 1.0)
    space["colsample_bynode"] = hp.uniform("x_colsample_bynode", 0.5, 1.0)
    # Currently disabled:
    # space["max_delta_step"] = integer("x_max_delta_step", 0, 8)
    space["reg_lambda"] = hp.uniform("x_reg_lambda", 0, 1)
    space["reg_alpha"] = hp.uniform("x_reg_alpha", 0, 1)
    space["learning_rate"] = hp.uniform("x_learning_rate", 0.01, 0.5)
    return space
49 | 
50 | 
class XGBoostClassifierBayesianOpt(BayesianOpt):
    """
    Bayesian optimisation of an XGBoost classifier
    """

    def yield_model_(self, model_config, space):
        """
        Build the next model to be tried

        :param model_config: not used here; the settings come from the search space
        :param space: search space handed to self.next_params
        :return: tuple (model, parameters used to build it)
        """
        params = self.next_params(space)
        # Every trial is configured with early stopping after 10 rounds
        params["early_stopping_rounds"] = 10
        model = xgboost_classifier(params)
        return model, params

    def save_model_(self, model, out_dir):
        """
        Write the model to out_dir, once pickled (.sav) and once via model.save_model (.model)
        """
        pickle_path = join(out_dir, "xgboost_classifier.sav")
        with open(pickle_path, "wb") as stream:
            pickle.dump(model, stream, protocol=4)
        model.save_model(join(out_dir, "xgboost_classifier.model"))
63 | 
64 | 
def xgboost_classifier_bayesian_opt(model_config):
    """
    Create and configure the Bayesian optimiser for the XGBoost classifier

    :param model_config: model configuration passed on to XGBoostClassifierBayesianOpt
    :return: optimiser set up with 3 folds, AUC and Accuracy scorers (AUC is the one
             optimised, higher is better), 100 trials and score_train_test_diff of 0.01
    """
    optimiser = XGBoostClassifierBayesianOpt(model_config, xgboost_classifier_bayesian_space())
    optimiser.nkfolds = 3
    optimiser.scoring = get_scorers(["AUC", "Accuracy"])
    optimiser.scoring_opt = "AUC"
    optimiser.low_is_better = False
    optimiser.n_trials = 100
    optimiser.score_train_test_diff = 0.01
    return optimiser
74 | 


--------------------------------------------------------------------------------
/machine_learning_hep/utilities_files.py:
--------------------------------------------------------------------------------
  1 | #  © Copyright CERN 2024. All rights not expressly granted are reserved.  #
  2 | #                 Author: Gian.Michele.Innocenti@cern.ch                  #
  3 | # This program is free software: you can redistribute it and/or modify it #
  4 | #  under the terms of the GNU General Public License as published by the  #
  5 | # Free Software Foundation, either version 3 of the License, or (at your  #
  6 | # option) any later version. This program is distributed in the hope that #
  7 | #  it will be useful, but WITHOUT ANY WARRANTY; without even the implied  #
  8 | #     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.    #
  9 | #           See the GNU General Public License for more details.          #
 10 | #    You should have received a copy of the GNU General Public License    #
 11 | #   along with this program. if not, see <https://www.gnu.org/licenses/>. #
 12 | 
 13 | import glob
 14 | import os
 15 | import shutil
 16 | from pathlib import Path
 17 | from typing import Union
 18 | 
 19 | from .logger import get_logger
 20 | 
 21 | logger = get_logger()
 22 | 
 23 | 
 24 | def list_folders(main_dir: str, filenameinput: str, maxfiles: int, select=None):  # pylint: disable=too-many-branches
 25 |     """
 26 |     Return folders under main_dir which contain filenameinput
 27 | 
 28 |     :param maxfiles: limit to maxfiles
 29 |     :param select: iterable of substrings that must be contained in folders
 30 |     """
 31 |     if not os.path.isdir(main_dir):
 32 |         logger.error("Input directory <%s> does not exist", main_dir)
 33 | 
 34 |     files = glob.glob(f"{main_dir}/**/{filenameinput}", recursive=True)
 35 |     listfolders = [os.path.relpath(os.path.dirname(file), main_dir) for file in files]
 36 | 
 37 |     if select:
 38 |         # Select only folders with a matching sub-string in their paths
 39 |         list_folders_tmp = []
 40 |         for sel_sub_string in select:
 41 |             list_folders_tmp.extend([folder for folder in listfolders if sel_sub_string in folder])
 42 |         listfolders = list_folders_tmp
 43 | 
 44 |     if maxfiles != -1:
 45 |         listfolders = listfolders[:maxfiles]
 46 | 
 47 |     return listfolders
 48 | 
 49 | 
 50 | def create_folder_struc(maindir: str, listpath: list[str]):
 51 |     """
 52 |     Reproduce the folder structure as input
 53 |     """
 54 |     for path in listpath:
 55 |         path_elements = path.split("/")
 56 |         folder = maindir
 57 |         for element in path_elements:
 58 |             folder = os.path.join(folder, element)
 59 |             if not os.path.exists(folder):
 60 |                 os.makedirs(folder)
 61 | 
 62 | 
 63 | def checkdirs(dirs: Union[list[str], str]) -> list[str]:
 64 |     """
 65 |     Return list of existing directories
 66 |     """
 67 |     if isinstance(dirs, str):
 68 |         exdirs = [dirs] if Path(dirs).exists() else []
 69 |     else:
 70 |         exdirs = [d for d in dirs if Path(d).exists()]
 71 |     return exdirs
 72 | 
 73 | 
 74 | def checkmakedir(mydir: str):
 75 |     """
 76 |     Makes directory using 'mkdir'
 77 |     """
 78 |     logger.debug("Creating folder %s", mydir)
 79 |     os.makedirs(mydir, exist_ok=True)
 80 | 
 81 | 
 82 | def checkmakedirlist(dirlist: list[str]):
 83 |     """
 84 |     Makes directories from list using 'mkdir'
 85 |     """
 86 |     for mydir in dirlist:
 87 |         checkmakedir(mydir)
 88 | 
 89 | 
 90 | def delete_dir(path: str):
 91 |     """
 92 |     Delete directory if it exists. Return True if success, False otherwise.
 93 |     """
 94 |     if not os.path.isdir(path):
 95 |         logger.warning("Directory %s does not exist", path)
 96 |         return True
 97 |     logger.warning("Deleting directory %s", path)
 98 |     try:
 99 |         shutil.rmtree(path)
100 |     except OSError:
101 |         logger.error("Failed to delete directory %s", path)
102 |         return False
103 |     return True
104 | 
105 | 
def delete_dirlist(dirlist: list[str]):
    """
    Delete the directories in dirlist one after the other, stopping at the first failure.

    :return: True if all deletions succeeded, False otherwise
    """
    return all(delete_dir(directory) for directory in dirlist)
114 | 
115 | 
def appendfiletolist(mylist: list[str], namefile: str):
    """
    Return a new list in which namefile is joined onto every path of mylist
    """
    joined = []
    for folder in mylist:
        joined.append(os.path.join(folder, namefile))
    return joined
121 | 
122 | 
def appendmainfoldertolist(prefolder: str, mylist: list[str]):
    """
    Return a new list in which every path of mylist is prefixed with prefolder
    """
    prefixed = []
    for entry in mylist:
        prefixed.append(os.path.join(prefolder, entry))
    return prefixed
128 | 
129 | 
def createlist(prefolder: str, mylistfolder: list[str], namefile: str):
    """
    Build the paths prefolder/<folder>/namefile for every folder in mylistfolder.

    :return: list of paths, or an empty list if namefile is empty or None
    """
    if namefile:
        return appendmainfoldertolist(prefolder, appendfiletolist(mylistfolder, namefile))
    return []
139 | 


--------------------------------------------------------------------------------
/machine_learning_hep/utils/__init__.py:
--------------------------------------------------------------------------------
 1 | #############################################################################
 2 | ##  © Copyright CERN 2025. All rights not expressly granted are reserved.  ##
 3 | ## This program is free software: you can redistribute it and/or modify it ##
 4 | ##  under the terms of the GNU General Public License as published by the  ##
 5 | ## Free Software Foundation, either version 3 of the License, or (at your  ##
 6 | ## option) any later version. This program is distributed in the hope that ##
 7 | ##  it will be useful, but WITHOUT ANY WARRANTY; without even the implied  ##
 8 | ##     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.    ##
 9 | ##           See the GNU General Public License for more details.          ##
10 | ##    You should have received a copy of the GNU General Public License    ##
11 | ##   along with this program. if not, see <https://www.gnu.org/licenses/>. ##
12 | #############################################################################
13 | 


--------------------------------------------------------------------------------
/machine_learning_hep/utils/compare_directories.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Compare ROOT files between two directories.
 4 | 
 5 | [ "$2" ] || { echo "Provide two paths to compare."; exit 1; }
 6 | 
 7 | for d in "$1" "$2"; do
 8 |     [ -d "$d" ] || { echo "Path $d is not a directory."; exit 1; }
 9 | done
10 | 
11 | dir_this="$(dirname "$(realpath "$0")")"
12 | dir_pwd="$PWD"
13 | dir_1="$(realpath "$1")"
14 | dir_2="$(realpath "$2")"
15 | 
16 | shift
17 | shift
18 | 
19 | readarray -t files < <(find "$dir_1" -maxdepth 3 -name "*.root")
20 | [ ${#files[@]} -eq 0 ] && { echo "No ROOT files found in $dir_1."; exit 0; }
21 | 
22 | for f in "${files[@]}"; do
23 |     file_1="$f"
24 |     file_2="${file_1/$dir_1/$dir_2}"
25 |     [ -f "$file_2" ] || { echo "File $file_2 does not exist. Skipping."; continue; }
26 |     echo "Comparing $file_1 and $file_2"
27 |     dir_out="${file_1/$dir_1\//}"
28 |     dir_out="${dir_out/.root/}"
29 |     echo "Output dir $dir_out"
30 |     mkdir -p "$dir_out"
31 |     cd "$dir_out" || { echo "Cannot enter $dir_out"; exit 1; }
32 |     "${dir_this}/compare_root_files.py" "$file_1" "$file_2" "$@" > "diff.txt" 2>&1
33 |     cd "$dir_pwd" || { echo "Cannot enter $dir_pwd"; exit 1; }
34 | done
35 | 


--------------------------------------------------------------------------------
/machine_learning_hep/utils/dl_train.py:
--------------------------------------------------------------------------------
 1 | #!/bin/env python3
 2 | 
 3 | """This module downloads AO2Ds from an ALICE hyperloop train run"""
 4 | 
 5 | import argparse
 6 | import os
 7 | from pathlib import PurePosixPath
 8 | 
 9 | import requests  # pylint: disable=import-error
10 | 
11 | try:
12 |     from alienpy import alien, xrd_core
13 | except ImportError:
14 |     print("Could not import alien, install with pip install alienpy")
15 | 
16 | 
def get_train_spec(train_id: int):
    """Retrieve train spec from hyperloop interface

    The request authenticates with the per-user token certificate and key in /tmp.
    Returns the response object; an SSL error is printed and re-raised.
    """
    # https://alimonitor.cern.ch/hyperloop/train-run/131050
    uid = os.getuid()
    client_cert = (f"/tmp/tokencert_{uid}.pem", f"/tmp/tokenkey_{uid}.pem")
    url = f"https://alimonitor.cern.ch/alihyperloop-data/trains/train.jsp?train_id={train_id}"
    try:
        # NOTE(review): verify=False switches off verification of the server certificate
        response = requests.get(url, verify=False, cert=client_cert, timeout=10)
    except requests.exceptions.SSLError as e:
        print(f"SSL Error: {e}")
        raise
    return response
31 | 
32 | 
def find_ao2ds(ali: alien.AliEn, aliendir: str) -> list[str]:
    """Find AO2Ds in train output directory

    Runs a find for AO2D.root below <aliendir>/AOD through the given session and
    returns the whitespace-separated entries of its output, or an empty list if
    the command fails.
    """
    search_root = PurePosixPath(aliendir) / "AOD"
    cmd_find = f"find {search_root} AO2D.root"
    print(cmd_find)
    result = ali.run(cmd_find)
    if result.exitcode == 0:
        return result.out.split()
    print(f"Failed to run search: {cmd_find}\n{result.out}")
    return []
42 | 
43 | 
def main():
    """CLI interface

    Looks up the output directories of the given train, lists the AO2D files found
    in them and copies them below <prefix>/<train_id>/ unless a dry run is requested.
    """
    parser = argparse.ArgumentParser(description="Download AO2Ds from hyperloop train")
    parser.add_argument("train_id", type=int, help="train ID")
    parser.add_argument("--prefix", "-p", default="/data2/MLhep/trains/")
    parser.add_argument("--dry-run", "-n", action="store_true", help="dry run")
    args = parser.parse_args()

    print("Obtaining train spec ..")
    job_results = get_train_spec(args.train_id).json()["jobResults"]
    outputdirs = [job["outputdir"] for job in job_results]

    print("Finding AO2Ds ..")
    session = alien.AliEn()
    src = []
    for outputdir in outputdirs:
        src.extend(find_ao2ds(session, outputdir))

    # Local target: <prefix>/<train_id>/<grid path without leading slash>
    target_root = PurePosixPath(args.prefix) / str(args.train_id)
    dst = ["file:" + str(target_root / file.lstrip("/")) for file in src]

    print("Files to copy:")
    for source, target in zip(src, dst):
        print(f"{source} -> {target}")

    if args.dry_run:
        return

    print("Copying ..")
    xrd_core.DO_XrootdCp(session.wb(), api_src=src, api_dst=dst)


if __name__ == "__main__":
    main()
71 | 


--------------------------------------------------------------------------------
/machine_learning_hep/vary_bdt.py:
--------------------------------------------------------------------------------
 1 | #  © Copyright CERN 2018. All rights not expressly granted are reserved.  #
 2 | #                 Author: Gian.Michele.Innocenti@cern.ch                  #
 3 | # This program is free software: you can redistribute it and/or modify it #
 4 | #  under the terms of the GNU General Public License as published by the  #
 5 | # Free Software Foundation, either version 3 of the License, or (at your  #
 6 | # option) any later version. This program is distributed in the hope that #
 7 | #  it will be useful, but WITHOUT ANY WARRANTY; without even the implied  #
 8 | #     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.    #
 9 | #           See the GNU General Public License for more details.          #
10 | #    You should have received a copy of the GNU General Public License    #
11 | #   along with this program. if not, see <https://www.gnu.org/licenses/>. #
12 | 
13 | """
14 | Generate BDT cut variations
15 | Author: Vit Kucera <vit.kucera@cern.ch>
16 | """
17 | 
18 | 
def main():
    """
    Print BDT cut variations for each configured hadron species.

    For every species, n_steps equidistant variations are computed between the default
    cuts and the minimum cuts, and between the default cuts and the maximum cuts.
    The result is printed as a YAML-like block with the keys activate, label, use_cuts,
    cuts_num (numbers) and cuts (formatted selection strings).
    """
    n_steps = 5
    print_default = False

    dic_cuts = {
        "d0": {
            "string": "mlBkgScore < %g",
            "cuts_default": [0.02, 0.02, 0.02, 0.05, 0.06, 0.08, 0.08, 0.10, 0.10, 0.20, 0.25, 0.30],  # default
            "cuts_min": [0.008, 0.008, 0.0087, 0.017, 0.024, 0.031, 0.028, 0.042, 0.038, 0.052, 0.067, 0.060],  # tight
            "cuts_max": [0.045, 0.053, 0.054, 0.19, 0.22, 0.33, 0.46, 0.38, 0.50, 0.50, 0.50, 0.50],  # loose
        },
        "lc": {
            "string": "mlPromptScore > %g",
            "cuts_default": [0.97, 0.9, 0.9, 0.85, 0.85, 0.8, 0.8, 0.6, 0.6],  # default
            "cuts_min": [0.961, 0.83, 0.84, 0.74, 0.74, 0.62, 0.63, 0.15, 0.15],  # loose
            "cuts_max": [0.978, 0.94, 0.937, 0.915, 0.91, 0.89, 0.88, 0.85, 0.85],  # tight
        },
    }

    prefix_item = "    - "

    def format_list(str_format: str, values: list):
        return [str_format % val for val in values]

    def format_comment(comment: str):
        return f" # {comment}"

    def scan(defaults: list, limits: list):
        # n_steps lists of cuts going from the defaults (excluded) towards the limits
        deltas = [(limit - default) / n_steps for limit, default in zip(limits, defaults)]
        return [
            [round(default + (i + 1) * delta, 6) for default, delta in zip(defaults, deltas)] for i in range(n_steps)
        ]

    for hf, cuts in dic_cuts.items():
        cuts_default = cuts["cuts_default"]
        fmt = cuts["string"]
        # For a "greater than" selection, lower thresholds are looser; otherwise they are tighter.
        word_down, word_up = ("loose", "tight") if ">" in fmt else ("tight", "loose")

        list_down = scan(cuts_default, cuts["cuts_min"])
        list_up = scan(cuts_default, cuts["cuts_max"])
        labels_down = [f"{word_down} {i + 1}" for i in range(n_steps)]
        labels_up = [f"{word_up} {i + 1}" for i in range(n_steps)]

        # Variations in printing order: farthest below default first, farthest above last
        rows = list(zip(reversed(list_down), reversed(labels_down)))
        if print_default:
            rows.append((cuts_default, "default"))
        rows.extend(zip(list_up, labels_up))

        labels = [label for _, label in rows]
        n_items = len(rows)

        # Print flags and labels
        print(f"{hf}:")
        print(f"  activate: [{', '.join(n_items * ['yes'])}]")
        print("  label:", labels)
        print("  use_cuts:", n_items * [True])

        # Print numeric variations
        print("  cuts_num:")
        for var, label in rows:
            print(prefix_item, var, format_comment(label))

        # Print formatted variations
        print("  cuts:")
        for var, label in rows:
            print(prefix_item, format_list(fmt, var), format_comment(label))


if __name__ == "__main__":
    main()
99 | 


--------------------------------------------------------------------------------
/machine_learning_hep/workflow/workflow_base.py:
--------------------------------------------------------------------------------
  1 | #############################################################################
  2 | ##  © Copyright CERN 2024. All rights not expressly granted are reserved.  ##
  3 | ##                 Author: Gian.Michele.Innocenti@cern.ch                  ##
  4 | ## This program is free software: you can redistribute it and/or modify it ##
  5 | ##  under the terms of the GNU General Public License as published by the  ##
  6 | ## Free Software Foundation, either version 3 of the License, or (at your  ##
  7 | ## option) any later version. This program is distributed in the hope that ##
  8 | ##  it will be useful, but WITHOUT ANY WARRANTY; without even the implied  ##
  9 | ##     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.    ##
 10 | ##           See the GNU General Public License for more details.          ##
 11 | ##    You should have received a copy of the GNU General Public License    ##
 12 | ##   along with this program. if not, see <https://www.gnu.org/licenses/>. ##
 13 | #############################################################################
 14 | 
 15 | from functools import reduce
 16 | from os.path import join
 17 | 
 18 | # pylint: disable=import-error, no-name-in-module
 19 | from ROOT import gStyle
 20 | 
 21 | # HF specific imports
 22 | from machine_learning_hep.logger import get_logger
 23 | 
 24 | 
 25 | # pylint: disable=too-few-public-methods
 26 | class WorkflowBase:
 27 |     """
 28 |     Base class for all workflows related classes including systematics
 29 |     """
 30 | 
 31 |     species = "workflow_base"
 32 | 
 33 |     def __init__(self, datap, case, typean, period=None):
 34 |         self.logger = get_logger()
 35 |         self.datap = datap
 36 |         self.case = case
 37 |         self.typean = typean
 38 |         self.period = period
 39 | 
 40 |     def cfg(self, param, default=None):
 41 |         return reduce(
 42 |             lambda d, key: d.get(key, default) if isinstance(d, dict) else default,
 43 |             param.split("."),
 44 |             self.datap["analysis"][self.typean],
 45 |         )
 46 | 
 47 |     @staticmethod
 48 |     def loadstyle():
 49 |         gStyle.SetOptStat(0)
 50 |         gStyle.SetOptStat(0000)
 51 |         gStyle.SetPalette(1)
 52 |         gStyle.SetNumberContours(100)
 53 |         gStyle.SetCanvasColor(0)
 54 |         gStyle.SetFrameFillColor(0)
 55 | 
 56 |     @staticmethod
 57 |     def make_pre_suffix(args):
 58 |         """
 59 |         Construct a common file suffix from args
 60 |         """
 61 |         try:
 62 |             _ = iter(args)
 63 |         except TypeError:
 64 |             args = [args]
 65 |         else:
 66 |             if isinstance(args, str):
 67 |                 args = [args]
 68 |         args = [str(a) for a in args]
 69 |         return "_".join(args)
 70 | 
 71 |     @staticmethod
 72 |     def make_file_path(directory, filename, extension, prefix=None, suffix=None):
 73 |         if prefix is not None:
 74 |             filename = WorkflowBase.make_pre_suffix(prefix) + "_" + filename
 75 |         if suffix is not None:
 76 |             filename = filename + "_" + WorkflowBase.make_pre_suffix(suffix)
 77 |         extension = extension.replace(".", "")
 78 |         return join(directory, filename + "." + extension)
 79 | 
 80 |     def step(self, step: str):
 81 |         """
 82 |         Given a workflow steps as string, find the corresponding method and call it.
 83 |         Args:
 84 |             step: workflow step as string
 85 |         Returns:
 86 |             True if the step was found and executed, False otherwise
 87 |         """
 88 |         if not hasattr(self, step):
 89 |             self.logger.error("Could not run workflow step %s for workflow %s", step, self.__class__.__name__)
 90 |             return False
 91 |         self.logger.info("Run workflow step %s for workflow %s", step, self.__class__.__name__)
 92 |         getattr(self, step)()
 93 |         return True
 94 | 
 95 |     def get_after_burner(self):
 96 |         """
 97 |         Return an after-burner object to be run after per-period workflow steps, OPTIONAL
 98 |         Can be overwritten by deriving class
 99 |         """
100 |         return None
101 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["setuptools >= 61.0", "setuptools-scm>=8.0"]
 3 | build-backend = "setuptools.build_meta"
 4 | 
 5 | [project]
 6 | name = "MachineLearningHEP"
 7 | dynamic = ["version"]
 8 | dependencies = [
 9 |   "hyperopt>=0.2.3",
10 |   "Jinja2>=2.10.3",
11 |   "keras>=2.3.1",
12 |   "klein>=17.10.0",
13 |   "lz4>=2.1.10",
14 |   "matplotlib>=3.0.3",
15 |   "numba>=0.48.0",
16 |   "numpy>=1.17.4",
17 |   "onnx>=1.12.0",
18 |   "onnxmltools>=1.12.0",
19 |   "onnxconverter-common>=1.13.0",
20 |   "pandas>=0.24.2",
21 |   "pyarrow",
22 |   "pylint",
23 |   "PyYaml>=5.1",
24 |   "scikit-learn>=0.20.3",
25 |   "scipy>=1.4.1",
26 |   "seaborn>=0.11.1",
27 |   "shap>=v0.23.0",
28 |   "tensorflow>=2.3.1",
29 |   "twisted>=19.2.0",
30 |   "uproot>=3.4.18",
31 |   "xgboost>=0.90",
32 |   "zstandard>=0.21.0",
33 | ]
34 | requires-python = ">= 3.10"
35 | authors = [
36 |   {name = "Gian Michele Innocenti", email = "gian.michele.innocenti@cern.ch"},
37 |   {name = "Jochen Klein", email = "jochen.klein@cern.ch"},
38 | ]
39 | maintainers = [
40 |   {name = "Jochen Klein", email = "jochen.klein@cern.ch"}
41 | ]
42 | description = "Machine Learning package for HEP"
43 | readme = "README.md"
44 | license = {file = "LICENSE"}
45 | keywords=['HEP', 'Computing', 'MachineLearning']
46 | classifiers=[
47 |   "Development Status :: 3 - Alpha",
48 |   "Intended Audience :: Education",
49 |   "Topic :: Scientific/Engineering :: Physics",
50 |   "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
51 |   "Programming Language :: Python",
52 | ]
53 | 
54 | [project.urls]
55 | Homepage = "https://github.com/alisw/MachineLearningHEP"
56 | Repository = "https://github.com/alisw/MachineLearningHEP"
57 | 
58 | [project.scripts]
59 | mlhep = "machine_learning_hep.steer_analysis:main"
60 | dl_train = "machine_learning_hep.utils.dl_train:main"
61 | 
62 | [tool.setuptools.packages.find]
63 | where = ["."]
64 | include = ["machine_learning_hep*"]
65 | 
66 | [tool.setuptools_scm]
67 | 
68 | [tool.ruff]
69 | line-length = 120
70 | 
71 | [tool.ruff.lint]
72 | select = [ # defaults: ["E4", "E7", "E9", "F"], see https://docs.astral.sh/ruff/rules/
73 |   "A",    # flake8-builtins
74 |   "ARG",  # flake8-unused-arguments
75 |   "B",    # flake8-bugbear
76 |   "C4",   # flake8-comprehensions
77 |   "E",    # pycodestyle Error
78 |   "F",    # Pyflakes
79 |   "FLY",  # flynt
80 |   "FURB", # refurb
81 |   "I",    # isort
82 |   "NPY",  # NumPy-specific rules
83 |   "PD",   # pandas-vet
84 |   "PL",   # Pylint
85 |   "RUF",  # Ruff-specific rules
86 |   "SIM",  # flake8-simplify
87 |   "UP",   # pyupgrade
88 |   "W",    # pycodestyle Warning
89 | ]
90 | ignore = [
91 |   "PD901", # pandas-df-variable-name
92 |   "PLR09", # too-many-...
93 | ]
94 | 
95 | [tool.pyright]
96 | reportMissingImports = false
97 | reportUnboundVariable = false
98 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | absl-py==1.4.0
 2 | astroid==2.15.0
 3 | astunparse==1.6.3
 4 | attrs==22.2.0
 5 | Automat==22.10.0
 6 | awkward==2.1.0
 7 | awkward-cpp==12
 8 | cachetools==5.3.0
 9 | certifi==2022.12.7
10 | charset-normalizer==3.1.0
11 | cloudpickle==2.2.1
12 | constantly==15.1.0
13 | contourpy==1.0.7
14 | cycler==0.11.0
15 | dill==0.3.6
16 | flatbuffers==23.3.3
17 | fonttools==4.39.0
18 | future==0.18.3
19 | gast==0.4.0
20 | google-auth==2.16.2
21 | google-auth-oauthlib==0.4.6
22 | google-pasta==0.2.0
23 | grpcio==1.51.3
24 | h5py==3.8.0
25 | hyperlink==21.0.0
26 | hyperopt==0.2.7
27 | idna==3.4
28 | importlib-metadata==6.0.0
29 | importlib-resources==5.12.0
30 | incremental==22.10.0
31 | isort==5.12.0
32 | Jinja2==3.1.2
33 | joblib==1.2.0
34 | keras==2.11.0
35 | kiwisolver==1.4.4
36 | klein==21.8.0
37 | lazy-object-proxy==1.9.0
38 | libclang==15.0.6.1
39 | llvmlite==0.39.1
40 | lz4==4.3.2
41 | Markdown==3.4.1
42 | MarkupSafe==2.1.2
43 | matplotlib==3.7.1
44 | mccabe==0.7.0
45 | munch==4.0.0
46 | networkx==3.0
47 | numba==0.56.4
48 | numpy==1.23.5
49 | oauthlib==3.2.2
50 | onnx==1.12.0
51 | onnxconverter-common==1.13.0
52 | onnxmltools==1.12.0
53 | opt-einsum==3.3.0
54 | packaging==23.0
55 | pandas==1.5.3
56 | Pillow==9.4.0
57 | platformdirs==3.1.1
58 | protobuf==3.19.6
59 | py4j==0.10.9.7
60 | pyarrow==16.0.0
61 | pyasn1==0.4.8
62 | pyasn1-modules==0.2.8
63 | pylint==2.17.0
64 | pyparsing==3.0.9
65 | python-dateutil==2.8.2
66 | pytz==2022.7.1
67 | PyYAML==6.0
68 | requests==2.28.2
69 | requests-oauthlib==1.3.1
70 | rsa==4.9
71 | scikit-learn==1.2.2
72 | scipy==1.10.1
73 | seaborn==0.12.2
74 | shap==0.41.0
75 | six==1.16.0
76 | slicer==0.0.7
77 | tensorboard==2.11.2
78 | tensorboard-data-server==0.6.1
79 | tensorboard-plugin-wit==1.8.1
80 | tensorflow==2.11.0
81 | tensorflow-estimator==2.11.0
82 | tensorflow-io-gcs-filesystem==0.31.0
83 | termcolor==2.2.0
84 | threadpoolctl==3.1.0
85 | tomli==2.0.1
86 | tomlkit==0.11.6
87 | tqdm==4.65.0
88 | Tubes==0.2.1
89 | Twisted==22.10.0
90 | typing-extensions==4.5.0
91 | uproot==5.0.4
92 | urllib3==1.26.15
93 | Werkzeug==2.2.3
94 | wrapt==1.15.0
95 | xgboost==1.7.4
96 | zipp==3.15.0
97 | zope.interface==5.5.2
98 | zstandard==0.21.0
99 | 


--------------------------------------------------------------------------------
/run_hfjets.py:
--------------------------------------------------------------------------------
 1 | #!/bin/env python3
 2 | #  © Copyright CERN 2024. All rights not expressly granted are reserved.  #
 3 | #                                                                         #
 4 | # This program is free software: you can redistribute it and/or modify it #
 5 | #  under the terms of the GNU General Public License as published by the  #
 6 | # Free Software Foundation, either version 3 of the License, or (at your  #
 7 | # option) any later version. This program is distributed in the hope that #
 8 | #  it will be useful, but WITHOUT ANY WARRANTY; without even the implied  #
 9 | #     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.    #
10 | #           See the GNU General Public License for more details.          #
11 | #    You should have received a copy of the GNU General Public License    #
12 | #   along with this program. if not, see <https://www.gnu.org/licenses/>. #
13 | 
14 | import argparse
15 | import subprocess
16 | import sys
17 | 
# Command-line options selecting the database, the analysis and the steps to run
parser = argparse.ArgumentParser()
parser.add_argument("--case", "-c", default="d0jet")
parser.add_argument("--analysis", "-a", default="jet_obs")
parser.add_argument("--steps", "-s", nargs="+", default=["analyzer"])
parser.add_argument("--interactive", "-i", action="store_true")
parser.add_argument("--delete", "-d", action="store_true")
# parser.add_argument('--dryrun', '-n', action='store_true')
args = parser.parse_args()

DB = f"machine_learning_hep/data/data_run3/database_ml_parameters_{args.case}.yml"

# Run mlhep once per requested step; a failing step stops the script (check=True).
for step in args.steps:
    # Pass the arguments as a list (no shell), so that values taken from the
    # command line are never interpreted by a shell.
    cmd = ["mlhep", "-r", f"machine_learning_hep/submission/{step}.yml", "-d", DB]
    if not args.interactive:
        cmd.append("-b")
    cmd += ["-a", args.analysis]
    if args.delete:
        cmd.append("--delete")
    subprocess.run(
        cmd,
        stdout=sys.stdout,
        stderr=sys.stderr,
        check=True,
    )
39 | 


--------------------------------------------------------------------------------