├── .github └── workflows │ └── pypi.yml ├── .gitignore ├── .readthedocs.yaml ├── LICENSE ├── README.md ├── docs ├── Makefile ├── conf.py ├── index.rst ├── install │ └── installation.rst └── make.bat ├── pynuml ├── .gitignore ├── __init__.py ├── io │ ├── __init__.py │ ├── file.py │ ├── h5interface.py │ └── out.py ├── labels │ ├── __init__.py │ ├── ccqe.py │ ├── flavor.py │ ├── pdk.py │ ├── simple.py │ └── standard.py ├── meta.yaml ├── plot │ ├── __init__.py │ └── graph.py └── process │ ├── __init__.py │ ├── base.py │ ├── hitgraph.py │ └── spmap.py ├── pyproject.toml ├── scripts └── install_ph5concat_conda.sh └── tests └── test_process.py /.github/workflows/pypi.yml: -------------------------------------------------------------------------------- 1 | name: pypi 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | permissions: 8 | contents: read 9 | 10 | jobs: 11 | deploy: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v4 17 | - name: Set up Python 18 | uses: actions/setup-python@v3 19 | with: 20 | python-version: '3.x' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install build 25 | - name: Build package 26 | run: python -m build 27 | - name: Publish package 28 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 29 | with: 30 | user: __token__ 31 | password: ${{ secrets.PYPI_API_TOKEN }} 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info 2 | build 3 | .ipynb_checkpoints 4 | __pycache__ 5 | .vscode 6 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the OS, Python version and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.11" 13 | # You can also specify other tool versions: 14 | # nodejs: "19" 15 | # rust: "1.64" 16 | # golang: "1.19" 17 | 18 | # Build documentation in the "docs/" directory with Sphinx 19 | sphinx: 20 | configuration: docs/conf.py 21 | 22 | # Optionally build your docs in additional formats such as PDF and ePub 23 | # formats: 24 | # - pdf 25 | # - epub 26 | 27 | # Optional but recommended, declare the Python requirements required 28 | # to build your documentation 29 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 30 | # python: 31 | # install: 32 | # - requirements: docs/requirements.txt -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2023 v hewes 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice 
shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | The `pynuml` package has been incorporated into the [nugraph](github.com/nugraph/nugraph) repository, and any new development should be carried out there. This repository persists for legacy purposes, but is no longer actively maintained. 2 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | # -- Project information ----------------------------------------------------- 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 8 | 9 | project = 'pynuml' 10 | copyright = '2023, v hewes' 11 | author = 'v hewes' 12 | 13 | # -- General configuration --------------------------------------------------- 14 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 15 | 16 | extensions = [] 17 | 18 | templates_path = ['_templates'] 19 | exclude_patterns = [] 20 | 21 | 22 | 23 | # -- Options for HTML output ------------------------------------------------- 24 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 25 | 26 | html_theme = 'sphinx_rtd_theme' 27 | html_static_path = ['_static'] 28 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | :github_url: https://github.com/nugraph/pynuml 2 | 3 | pynuml Documentation 4 | ==================== 5 | 6 | **pynuml** is a python package providing a data interface for machine learning in neutrino physics. 
It utilises the **NuML** HDF5 event file format to efficiently preprocess physics events into ML objects for training neural networks. It is designed to abstract away many aspects of a typical ML workflow: 7 | 8 | - Efficiently iterate over large HDF5 datasets 9 | - Generate semantic and instance labels for particles 10 | - Preprocess events into ML objects 11 | 12 | .. toctree:: 13 | :maxdepth: 1 14 | :caption: Installation 15 | 16 | install/installation 17 | 18 | .. toctree:: 19 | :maxdepth: 1 20 | :caption: Getting Started -------------------------------------------------------------------------------- /docs/install/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | In order to best make use of the **pynuml** package, it is strongly encouraged to install the provided numl Anaconda environment. Parallel processing functionality requires an MPI installation, which will be automatically configured when you install the `numl` conda environment. 5 | 6 | Installing the numl conda environment 7 | ------------------------------------- 8 | 9 | Installing **pynuml** requires an Anaconda installation that utilises `conda-forge`. If you need to install Anaconda, we recommend using the `Mambaforge`_ variant. 10 | 11 | A conda environment for numl is available via the anaconda client, and can be installed using:: 12 | 13 | mamba install -y anaconda-client 14 | mamba env create numl/numl 15 | 16 | Once installed, this environment will need to be activated at the start of each terminal session:: 17 | 18 | mamba activate numl 19 | 20 | .. _Mambaforge: https://github.com/conda-forge/miniforge#mambaforge 21 | 22 | This environment contains the most recent version of **pynuml** published to conda. 23 | 24 | Installing with Anaconda 25 | ------------------------ 26 | 27 | It is also possible to install **pynuml** on its own via Anaconda, using the **numl** channel:: 28 | 29 | mamba install -c numl pynuml 30 | 31 | Installing with pip 32 | ------------------- 33 | 34 | **pynuml** is also available on PyPi, although this installation method is not recommended, as **pynuml** has non-python dependencies that cannot be installed by pip. If the user has installed those dependencies manually, then the package can be installed using:: 35 | 36 | pip install pynuml 37 | 38 | Installing for development 39 | -------------------------- 40 | 41 | If you're installing **pynuml** for development, you can install the numl Anaconda environment as outlined above, and then clone the repository directly and install it in editable mode:: 42 | 43 | git clone https://github.com/nugraph/pynuml 44 | pip install --no-deps -e ./pynuml 45 | 46 | This will uninstall the conda release of pynuml installed by default as part of the numl environment, and override it with your local repository. If installed in editable mode, any changes made to the package will instantaneously be reflected when the module is imported in Python. -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /pynuml/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.pyc 3 | .ipynb_checkpoints/ 4 | -------------------------------------------------------------------------------- /pynuml/__init__.py: -------------------------------------------------------------------------------- 1 | '''Standardised ML input processing for particle physics''' 2 | 3 | __version__ = '24.6.dev0' 4 | 5 | from . import io 6 | from . import labels 7 | from . import process 8 | from . import plot 9 | -------------------------------------------------------------------------------- /pynuml/io/__init__.py: -------------------------------------------------------------------------------- 1 | from .file import Event, File 2 | from .h5interface import H5Interface 3 | from .out import PTOut, H5Out 4 | -------------------------------------------------------------------------------- /pynuml/io/file.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from abc import ABC 3 | from typing import Any, Callable, Dict, List, Tuple 4 | import psutil 5 | 6 | import h5py 7 | import numpy as np 8 | import pandas as pd 9 | from mpi4py import MPI 10 | 11 | class Event: 12 | def __init__(self, 13 | index: int, 14 | event_id: np.ndarray, 15 | data: Dict[str, pd.DataFrame] = {}): 16 | self.index = index 17 | self.event_id = event_id 18 | self.data = data.copy() 19 | 20 | @property 21 | def name(self): 22 | r, sr, evt = self.event_id 23 | return f'r{r}_sr{sr}_evt{evt}' 24 | 25 | def __setitem__(self, key: str, item: pd.DataFrame): 26 | if type(key) != str: 27 | raise Exception('Key must be a string!') 28 | if type(item) != pd.DataFrame: 29 | raise Exception('Value must be a pandas DataFrame!') 30 | self.data[key] = item 31 | 32 | def __getitem__(self, key: str): 33 | if type(key) != str: 34 | raise Exception('Key must be a string!') 35 | return self.data[key] 36 | 37 | def __str__(self): 38 | ret = f'event {self.event_id}\n' 39 | for group, df in self.data.items(): 40 | ret += f' {group} ({df.shape[0]} rows):\n' 41 | for key in df.keys(): 42 | ret += f' {key}\n' 43 | return ret 44 | 45 | class File: 46 | def __init__(self, fname: str, parKey: str = "/event_table/event_id"): 47 | self._colmap = { 48 | "event_table": { 49 | "nu_dir": [ "nu_dir_x", "nu_dir_y", "nu_dir_z" ], 50 | "nu_vtx": [ "nu_vtx_x", "nu_vtx_y", "nu_vtx_z" ], 51 | "nu_vtx_corr": [ "nu_vtx_corr_x", "nu_vtx_corr_y", "nu_vtx_corr_z" ], 52 | }, 53 | "particle_table": { 54 | "start_position": [ "start_position_x", "start_position_y", "start_position_z" ], 55 | "end_position": [ "end_position_x", "end_position_y", "end_position_z" ], 56 | "start_position_corr": [ "start_position_corr_x", "start_position_corr_y", "start_position_corr_z" ], 57 | "end_position_corr": [ "end_position_corr_x", "end_position_corr_y", 
"end_position_corr_z" ], 58 | }, 59 | "spacepoint_table": { 60 | "hit_id": [ "hit_id_u", "hit_id_v", "hit_id_y" ], 61 | "position": [ "position_x", "position_y", "position_z" ], 62 | }, 63 | "pandoraPrimary_table": { 64 | "vtx": [ "vtx_x", "vtx_y", "vtx_z" ], 65 | }, 66 | } 67 | 68 | # open the input HDF5 file in parallel 69 | self._fd = h5py.File(fname, "r", driver='mpio', comm=MPI.COMM_WORLD) 70 | 71 | # check if data partitioning key datasets exists in the file 72 | if parKey not in self._fd.keys(): 73 | raise Exception(f'Error: dataset {parKey} is not found in file {fname}!') 74 | 75 | # parse the name of data partitioning key 76 | import os.path 77 | self._parTable = os.path.dirname(parKey) 78 | # remove leading '/' 79 | if self._parTable[0] == '/': self._parTable = self._parTable[1:] 80 | 81 | # extract dataset names: partitioning key, seq, and seq_cnt 82 | self._par_name = os.path.basename(parKey) 83 | self._seq_name = self._par_name + ".seq" 84 | self._cnt_name = self._par_name + ".seq_cnt" 85 | 86 | # obtain metadata of dataset parKey, later the dataset will be read 87 | # into self._index as a numpy array in data_partition() 88 | self._index = self._fd.get(parKey) 89 | self._num_events = self._index.shape[0] 90 | 91 | # self._groups is a python list, each member is a 2-element list consisting 92 | # of a group name, and a python list of dataset names 93 | self._groups = [] 94 | 95 | # a python dictionary storing a sequence-count dataset in each group, keys 96 | # are group names, values are the sequence-count dataset subarrays assigned 97 | # to this process 98 | self._seq_cnt = {} 99 | self._evt_seq = {} 100 | 101 | self._whole_seq_cnt = {} 102 | self._whole_seq = {} 103 | 104 | self._use_seq_cnt = True 105 | 106 | # partition based on event amount of particle table (default) 107 | self._evt_part = 2 108 | 109 | # a python nested dictionary storing datasets of each group read from the 110 | # input file. 
keys of self._data are group names, values are python 111 | # dictionaries, each has names of dataset in that group as keys, and values 112 | # storing dataset subarrays 113 | self._data = {} 114 | 115 | # _starts: data partition start indeices of all processes 116 | # _counts: data cmount assigned to each process 117 | starts = None 118 | counts = None 119 | 120 | # starting array index of parKey assigned to this process 121 | self._my_start = -1 122 | 123 | # number of array elements of parKey assigned to this process 124 | self._my_count = -1 125 | 126 | def __del__(self): 127 | if hasattr(self, '_fd') and self._fd: self._fd.close() 128 | 129 | def __len__(self): 130 | # inquire the number of unique event IDs in the input file 131 | return self._num_events 132 | 133 | def __str__(self): 134 | ret = "" 135 | for k1 in self._fd.keys(): 136 | ret += f"{k1}:\n" 137 | for k2 in self._fd[k1].keys(): 138 | if self._seq_name in k2: continue 139 | ret += f" {k2}\n" 140 | return ret 141 | 142 | def __getitem__(self, idx: int): 143 | """load a single event from file""" 144 | self.read_data(idx, 1) 145 | ret = self.build_evt(idx, 1) 146 | return ret[0] if len(ret) else None 147 | 148 | def check_shape0(self, 149 | group: str, 150 | keys: List[str] = []) -> None: 151 | # Check if shape[0] of all datasets in keys are of the same size 152 | shape0 = self._fd[group][keys[0]].shape[0] 153 | for k in keys[1:]: 154 | if k == self._cnt_name: continue # exception is seq_cnt dataset 155 | if shape0 != self._fd[group][k].shape[0]: 156 | raise Exception(f'Dataset "/{group}/{k}" shape[0]={self._fd[group][k].shape[0]} inconsistent with {keys[0]}.shape[0]={shape0}') 157 | 158 | def add_group(self, 159 | group: str, 160 | keys: List[str] = []) -> None: 161 | 162 | # if no keys specified, append all columns in HDF5 group 163 | if not keys: 164 | # retrieve all the dataset names of the group 165 | keys = list(self._fd[group].keys()) 166 | # datasets seq and seq_cnt are not needed 167 | if group != self._parTable and self._par_name in keys: keys.remove(self._par_name) 168 | if self._seq_name in keys: keys.remove(self._seq_name) 169 | if self._cnt_name in keys: keys.remove(self._cnt_name) 170 | else: 171 | # Check if datasets in keys are available in the file 172 | for k in keys: 173 | if k not in self._fd[group].keys(): 174 | raise Exception(f'Dataset "/{group}/{k}" does not exist') 175 | 176 | # if group does not already exist, just add it 177 | if not self._groups or group not in self._groups[:][0]: 178 | self.check_shape0(group, keys) 179 | self._groups.append([ group, keys ]) 180 | return 181 | 182 | # if group is already present, need to figure out whether any extra keys need to be added 183 | for g, k in self._groups: 184 | if g == group: 185 | self.check_shape0(group, keys) 186 | for key in keys: 187 | if key not in k: 188 | k.append(key) 189 | return 190 | raise Exception(f'group "{group}" not found.') 191 | 192 | def keys(self): 193 | return self._fd.keys() 194 | 195 | def _cols(self, 196 | group: str, 197 | key: str) -> List[str]: 198 | if key == self._par_name: return [ "run", "subrun", "event" ] 199 | if group in self._colmap and key in self._colmap[group].keys(): return self._colmap[group][key] 200 | elif self._fd[group][key].shape[1]==1: return [key] 201 | else: return [ key+"_"+str(c) for c in range(0,self._fd[group][key].shape[1])] 202 | 203 | def get_dataframe(self, 204 | group: str, 205 | keys: List[str] = []) -> pd.DataFrame: 206 | if not keys: 207 | keys = list(self._fd[group].keys()) 208 | if 
self._seq_name in keys: keys.remove(self._seq_name) 209 | if self._cnt_name in keys: keys.remove(self._cnt_name) 210 | dfs = [ pd.DataFrame(np.array(self._fd[group][key]), columns=self._cols(group, key)) for key in keys ] 211 | return pd.concat(dfs, axis="columns").set_index(["run","subrun","event"]) 212 | 213 | def get_dataframe_evt(self, 214 | group: str, 215 | keys: List[str] = []) -> pd.DataFrame: 216 | if not keys: 217 | keys = list(self._data[group].keys()) 218 | if self._seq_name in keys: keys.remove(self._seq_name) 219 | if self._cnt_name in keys: keys.remove(self._cnt_name) 220 | dfs = [ pd.DataFrame(np.array(self._data[group][key]), columns=self._cols(group, key)) for key in keys ] 221 | df = pd.concat(dfs, axis="columns") 222 | evt_idx_col = [] 223 | for seq in self._seq_cnt[group]: 224 | evt_idx_col += seq[1]*[seq[0]] 225 | df['evt_idx'] = evt_idx_col 226 | return df 227 | 228 | def index(self, idx: int): 229 | """get the index for a given row""" 230 | return self._my_index[idx - self._my_start] 231 | 232 | def read_seq(self) -> None: 233 | for group, datasets in self._groups: 234 | try: 235 | # read an HDF5 dataset into a numpy array 236 | self._whole_seq[group] = np.array(self._fd[group+"/"+self._seq_name]) 237 | except KeyError: 238 | print(f"Error: dataset {group}/{self._seq_name} does not exist") 239 | sys.stdout.flush() 240 | sys.exit(1) 241 | 242 | def read_seq_cnt(self) -> None: 243 | # Dataset event_id.seq_cnt stores the event IDs sorted in an increasing 244 | # order. There is no duplicated values and gaps may exist between any 245 | # two consecutive elements. Note dataset event_id.seq_cnt in group 246 | # self._parTable contains all event IDs with no gap. 247 | for group, datasets in self._groups: 248 | try: 249 | # read an HDF5 dataset into a numpy array 250 | self._whole_seq_cnt[group] = np.array(self._fd[group+"/"+self._cnt_name]) 251 | except KeyError: 252 | print(f"Error: dataset {group}/{self._cnt_name} does not exist") 253 | sys.stdout.flush() 254 | sys.exit(1) 255 | 256 | def data_partition(self) -> None: 257 | # Calculate the start indices and counts of evt.seq assigned to each process 258 | # self._starts: a numpy array of size nprocs 259 | # self._counts: a numpy array of size nprocs 260 | # Note self._starts and self._counts are matter only in root process. 261 | # self._my_start: (== self._starts[rank]) this process's start 262 | # self._my_count: (== self._counts[rank]) this process's count 263 | # self._my_index: partitioned dataset i.e. assigned to this process 264 | 265 | comm = MPI.COMM_WORLD 266 | rank = comm.Get_rank() 267 | nprocs = comm.Get_size() 268 | self._starts = np.zeros(nprocs, dtype=int) 269 | self._counts = np.zeros(nprocs, dtype=int) 270 | 271 | if rank == 0: 272 | if self._use_seq_cnt: 273 | self.read_seq_cnt() 274 | else: 275 | self.read_seq() 276 | 277 | num_events = self._num_events 278 | 279 | if self._evt_part == 0: 280 | # Below implements event ID based partitioning, which 281 | # calculates the start and count of evt.seq id for each process 282 | _count = num_events // nprocs 283 | for j in range(num_events % nprocs): 284 | self._starts[j] = _count * j + j 285 | self._counts[j] = _count + 1 286 | 287 | for j in range(num_events % nprocs, nprocs): 288 | self._starts[j] = _count * j + num_events % nprocs 289 | self._counts[j] = _count 290 | 291 | elif self._evt_part == 1: 292 | # event amount based partitioning, which calculates event sizes 293 | # across all groups. 
Note it is possible multiple consecutive rows 294 | # a dataset have the same event ID. It is also possible some event 295 | # IDs contain no data. First, we accumulate numbers of events 296 | # across all groups 297 | evt_size = np.zeros(num_events, dtype=int) 298 | if self._use_seq_cnt: 299 | for group, datasets in self._groups: 300 | seq_cnt = self._whole_seq_cnt[group] 301 | num_datasets = len(datasets) 302 | for i in range(seq_cnt.shape[0]): 303 | evt_size[seq_cnt[i, 0]] += seq_cnt[i, 1] * num_datasets 304 | else: 305 | for group, datasets in self._groups: 306 | seq = self._whole_seq[group] 307 | for i in range(seq.shape[0]): 308 | evt_size[seq[i, 0]] += 1 309 | 310 | # now we have collected the number of events per event ID across all groups 311 | total_evt_num = np.sum(evt_size) 312 | avg_evt_num = total_evt_num // nprocs 313 | avg_evt = total_evt_num // num_events / 2 314 | 315 | # assign ranges of event IDs to individual processes 316 | acc_evt_num = 0 317 | rank_id = 0 318 | for j in range(num_events): 319 | if rank_id == nprocs - 1: break 320 | if acc_evt_num + evt_size[j] >= avg_evt_num: 321 | remain_l = avg_evt_num - acc_evt_num 322 | remain_r = evt_size[j] - remain_l 323 | if remain_l > remain_r and remain_l > avg_evt: 324 | # assign event j to rank_id 325 | self._counts[rank_id] += 1 326 | acc_evt_num = 0 327 | else: 328 | # assign event j to rank_id+1 329 | self._counts[rank_id+1] = 1 330 | acc_evt_num = evt_size[j] 331 | # done with rank_id i 332 | rank_id += 1 333 | self._starts[rank_id] = self._starts[rank_id-1] + self._counts[rank_id-1] 334 | else: 335 | self._counts[rank_id] += 1 336 | acc_evt_num += evt_size[j] 337 | self._counts[nprocs-1] += num_events - j 338 | 339 | elif self._evt_part == 2: 340 | # use event amounts in the particle_table only to partition events 341 | seq_cnt = self._whole_seq_cnt['particle_table'] 342 | total_evt_num = np.sum(seq_cnt[:,1]) 343 | avg_evt_num = total_evt_num // nprocs 344 | avg_evt = total_evt_num // seq_cnt.shape[0] / 2 345 | 346 | self._starts[0] = seq_cnt[0,0] 347 | acc_evt_num = 0 348 | rank_id = 0 349 | for j in range(seq_cnt.shape[0]): 350 | if rank_id == nprocs - 1: break 351 | if acc_evt_num + seq_cnt[j,1] >= avg_evt_num: 352 | remain_l = avg_evt_num - acc_evt_num 353 | remain_r = seq_cnt[j,1] - remain_l 354 | # if remain_r > remain_l: 355 | if remain_l > remain_r and remain_l > avg_evt: 356 | # assign event j to rank_id 357 | self._counts[rank_id] = seq_cnt[j+1, 0] - self._starts[rank_id] 358 | self._starts[rank_id+1] = seq_cnt[j+1, 0] 359 | acc_evt_num = 0 360 | else: 361 | # assign event j to rank_id+1 362 | self._counts[rank_id] = seq_cnt[j, 0] - self._starts[rank_id] 363 | self._starts[rank_id+1] = seq_cnt[j, 0] 364 | acc_evt_num = seq_cnt[j, 1] 365 | # done with rank_id 366 | rank_id += 1 367 | else: 368 | acc_evt_num += seq_cnt[j, 1] 369 | 370 | self._counts[nprocs-1] = num_events - self._starts[nprocs-1] 371 | 372 | # All processes participate the collective communication, scatter. 373 | # Root distributes start and count to all processes. Note only root process 374 | # uses self._starts and self._counts. 
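        # Illustrative sketch (not taken from any input file): with
        # num_events = 10 and nprocs = 4 under the event-ID based branch
        # (self._evt_part == 0) above, the result is
        #     self._starts = [0, 3, 6, 8]   and   self._counts = [3, 3, 2, 2],
        # and the Scatter below hands each rank its own (start, count) pair.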
375 | start_count = np.empty([nprocs, 2], dtype=int) 376 | start_count[:, 0] = self._starts[:] 377 | start_count[:, 1] = self._counts[:] 378 | recvbuf = np.empty(2, dtype=int) 379 | comm.Scatter(start_count, recvbuf, root=0) 380 | self._my_start = recvbuf[0] 381 | self._my_count = recvbuf[1] 382 | 383 | # This process is assigned event IDs of range from self._my_start to 384 | # (self._my_start + self._my_count - 1) 385 | 386 | # each process reads its share of dataset and stores it in a numpy 387 | # array 388 | self._my_index = np.array(self._index[self._my_start : self._my_start + self._my_count, :]) 389 | 390 | def binary_search_min(self, key, base, nmemb): 391 | low = 0 392 | high = nmemb 393 | while low != high: 394 | mid = (low + high) // 2 395 | if base[mid] < key: 396 | low = mid + 1 397 | else: 398 | high = mid 399 | return low 400 | 401 | def binary_search_max(self, key, base, nmemb): 402 | low = 0 403 | high = nmemb 404 | while low != high: 405 | mid = (low + high) // 2 406 | if base[mid] <= key: 407 | low = mid + 1 408 | else: 409 | high = mid 410 | return (low - 1) 411 | 412 | def calc_bound_seq(self, group): 413 | # return the lower and upper array indices of subarray assigned to this 414 | # process, using the partition sequence dataset 415 | 416 | comm = MPI.COMM_WORLD 417 | rank = comm.Get_rank() 418 | nprocs = comm.Get_size() 419 | 420 | displ = np.zeros([nprocs], dtype=int) 421 | count = np.zeros([nprocs], dtype=int) 422 | bounds = np.zeros([nprocs, 2], dtype=int) 423 | 424 | all_evt_seq = None 425 | if rank == 0: 426 | # root reads the entire dataset self._seq_name, if not already 427 | if not self._whole_seq: self.read_seq() 428 | 429 | all_evt_seq = self._whole_seq[group] 430 | dim = len(all_evt_seq) 431 | 432 | # calculate displ, count to be used in scatterV for all processes 433 | for i in range(nprocs): 434 | if self._counts[i] == 0: continue 435 | end = self._starts[i] + self._counts[i] - 1 436 | bounds[i, 0] = self.binary_search_min(self._starts[i], all_evt_seq, dim) 437 | bounds[i, 1] = self.binary_search_max(end, all_evt_seq, dim) 438 | displ[i] = bounds[i, 0] 439 | count[i] = bounds[i, 1] - bounds[i, 0] + 1 440 | 441 | lower_upper = np.empty([2], dtype=int) 442 | 443 | # root distributes start and end indices to all processes 444 | comm.Scatter(bounds, lower_upper, root=0) 445 | 446 | # this process is assigned array indices from lower to upper 447 | lower = 0 448 | upper = 0 449 | if self._my_count > 0: 450 | lower = lower_upper[0] 451 | upper = lower_upper[1] + 1 452 | 453 | # root scatters the subarray of evt_seq to all processes 454 | self._evt_seq[group] = np.zeros(upper - lower, dtype=np.int64) 455 | comm.Scatterv([all_evt_seq, count, displ, MPI.LONG_LONG], self._evt_seq[group], root=0) 456 | 457 | return lower, upper 458 | 459 | def calc_bound_seq_cnt(self, group): 460 | # return the lower and upper array indices of subarray assigned to this 461 | # process, using the partition sequence-count dataset 462 | 463 | comm = MPI.COMM_WORLD 464 | rank = comm.Get_rank() 465 | nprocs = comm.Get_size() 466 | 467 | displ = np.zeros([nprocs], dtype=int) 468 | count = np.zeros([nprocs], dtype=int) 469 | seq_cnt = np.zeros([nprocs, 2], dtype=int) 470 | 471 | all_seq_cnt = None 472 | if rank == 0: 473 | # root reads the entire dataset self._cnt_name, if not already 474 | if not self._whole_seq_cnt: self.read_seq_cnt() 475 | 476 | all_seq_cnt = self._whole_seq_cnt[group] 477 | dim = len(all_seq_cnt) 478 | 479 | # calculate displ, count for all processes to be used in 
scatterV 480 | recv_rank = 0 # receiver rank 481 | displ[recv_rank] = 0 482 | seq_cnt[recv_rank, 0] = 0 483 | seq_end = self._starts[recv_rank] + self._counts[recv_rank] 484 | seq_id = 0 485 | for i in range(dim): 486 | if all_seq_cnt[i, 0] >= seq_end : 487 | seq_cnt[recv_rank, 1] = i - displ[recv_rank] 488 | recv_rank += 1 # move on to the next receiver rank 489 | seq_end = self._starts[recv_rank] + self._counts[recv_rank] 490 | displ[recv_rank] = i 491 | seq_cnt[recv_rank, 0] = seq_id 492 | seq_id += all_seq_cnt[i, 1] 493 | 494 | # last receiver rank 495 | seq_cnt[recv_rank, 1] = dim - displ[recv_rank] 496 | 497 | displ[:] *= 2 498 | count[:] = seq_cnt[:, 1] * 2 499 | 500 | # root distributes seq_cnt to all processes 501 | my_seq_cnt = np.empty([2], dtype=int) 502 | comm.Scatter(seq_cnt, my_seq_cnt, root=0) 503 | 504 | # self._seq_cnt[group][:, 0] is the event ID 505 | # self._seq_cnt[group][:, 1] is the number of elements 506 | self._seq_cnt[group] = np.empty([my_seq_cnt[1], 2], dtype=np.int64) 507 | 508 | # root scatters the subarray of evt_seq to all processes 509 | comm.Scatterv([all_seq_cnt, count, displ, MPI.LONG_LONG], self._seq_cnt[group], root=0) 510 | 511 | lower = 0 512 | upper = 0 513 | if self._my_count > 0: 514 | lower = my_seq_cnt[0] 515 | upper = my_seq_cnt[0] + np.sum(self._seq_cnt[group][:, 1]) 516 | 517 | # this process is assigned array indices from lower to upper 518 | 519 | return lower, upper 520 | 521 | def read_data(self, 522 | start: int, 523 | count: int) -> None: 524 | # (sequentially) read subarrays of all datasets in all groups that fall 525 | # in the range of self._seq_name, starting from 'start' and amount of 'count' 526 | 527 | for group, datasets in self._groups: 528 | if self._use_seq_cnt: 529 | # use evt_id.seq_cnt to calculate subarray boundaries 530 | # reads the entire dataset self._cnt_name, if not already 531 | if not self._whole_seq_cnt or group not in self._whole_seq_cnt.keys(): 532 | self.read_seq_cnt() 533 | all_seq_cnt = self._whole_seq_cnt[group] 534 | # search indices of start and end in all_seq_cnt 535 | # all_seq_cnt[:,0] are all unique 536 | ilower = np.searchsorted(all_seq_cnt[:,0], start) 537 | iupper = np.searchsorted(all_seq_cnt[:,0], start+count) 538 | self._seq_cnt[group] = np.array(all_seq_cnt[ilower:iupper], dtype=np.int64) 539 | lower = np.sum(all_seq_cnt[0:ilower, 1]) 540 | upper = lower + np.sum(all_seq_cnt[ilower:iupper, 1]) 541 | else: 542 | # use evt_id.seq to calculate subarray boundaries 543 | # root reads the entire dataset self._seq_name, if not already 544 | if not self._whole_seq: self.read_seq() 545 | all_evt_seq = self._whole_seq[group] 546 | dim = len(all_evt_seq) 547 | # search indices of start and end in all_seq 548 | # all_seq[:] are not unique 549 | end = start + count - 1 550 | lower = self.binary_search_min(start, all_evt_seq, dim) 551 | upper = self.binary_search_max(end, all_evt_seq, dim) 552 | upper += 1 553 | self._evt_seq[group] = np.array(all_evt_seq[lower:upper], dtype=np.int64) 554 | 555 | # Iterate through all the datasets and read the subarray from index lower 556 | # to upper and store it into a dictionary with the names of group and 557 | # dataset as the key. 
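            # For example (an illustrative sketch using a group name from
            # self._colmap): after this loop, self._data["spacepoint_table"]["position"]
            # would hold rows [lower:upper] of the HDF5 dataset
            # /spacepoint_table/position as a numpy array.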
558 | self._data[group] = {} 559 | for dset in datasets: 560 | # read subarray into a numpy array 561 | self._data[group][dset] = np.array(self._fd[group][dset][lower : upper]) 562 | 563 | self._my_start = start 564 | self._my_count = count 565 | # read assigned partitioning key dataset into a numpy array 566 | self._my_index = np.array(self._index[start : start + count, :]) 567 | 568 | def read_data_all(self, 569 | use_seq_cnt: bool = True, 570 | evt_part: int = 2, 571 | profile: bool = False) -> None: 572 | # use_seq_cnt: True - use event.seq_cnt dataset to calculate partitioning 573 | # starts and counts 574 | # False - use event.seq dataset to calculate starts and counts 575 | # evt_part: 0 - partition based on event IDs 576 | # 1 - partition based on event amount 577 | # 2 - partition based on event amount of particle table (default) 578 | # Parallel read dataset subarrays assigned to this process ranging from 579 | # array index of self._my_start to (self._my_start + self._my_count - 1) 580 | if profile: 581 | par_time = 0 582 | bnd_time = 0 583 | rds_time = 0 584 | time_s = MPI.Wtime() 585 | 586 | self._use_seq_cnt = use_seq_cnt 587 | self._evt_part = evt_part 588 | 589 | # calculate the data partitioning start indices and amounts assigned to 590 | # each process. Set self._starts, self._counts, self._my_start, 591 | # self._my_count, and self._my_index 592 | self.data_partition() 593 | 594 | if profile: 595 | time_e = MPI.Wtime() 596 | par_time = time_e - time_s 597 | time_s = time_e 598 | 599 | for group, datasets in self._groups: 600 | if self._use_seq_cnt: 601 | # use evt_id.seq_cnt to calculate subarray boundaries 602 | lower, upper = self.calc_bound_seq_cnt(group) 603 | else: 604 | # use evt_id.seq to calculate subarray boundaries 605 | lower, upper = self.calc_bound_seq(group) 606 | 607 | if profile: 608 | time_e = MPI.Wtime() 609 | bnd_time += time_e - time_s 610 | time_s = time_e 611 | 612 | # Iterate through all the datasets and read the subarray from index lower 613 | # to upper and store it into a dictionary with the names of group and 614 | # dataset as the key. 
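            # Note: each rank slices only its own [lower:upper] range, so the
            # read below touches just this rank's share of every dataset in the
            # group, and memory use scales with the partition size rather than
            # with the whole file.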
615 | self._data[group] = {} 616 | for dset in datasets: 617 | # read subarray into a numpy array 618 | self._data[group][dset] = np.array(self._fd[group][dset][lower : upper]) 619 | 620 | if profile: 621 | time_e = MPI.Wtime() 622 | rds_time += time_e - time_s 623 | time_s = time_e 624 | 625 | if profile: 626 | rank = MPI.COMM_WORLD.Get_rank() 627 | nprocs = MPI.COMM_WORLD.Get_size() 628 | 629 | total_t = np.array([par_time, bnd_time, rds_time]) 630 | max_total_t = np.zeros(3) 631 | MPI.COMM_WORLD.Reduce(total_t, max_total_t, op=MPI.MAX, root = 0) 632 | min_total_t = np.zeros(3) 633 | MPI.COMM_WORLD.Reduce(total_t, min_total_t, op=MPI.MIN, root = 0) 634 | if rank == 0: 635 | print("---- Timing break down of the file read phase (in seconds) -------") 636 | if self._use_seq_cnt: 637 | print(f'Use "{self._cnt_name}" to calculate subarray boundaries') 638 | else: 639 | print(f'Use "{self._seq_name}" to calculate subarray boundaries') 640 | 641 | print("data partitioning time ", end='') 642 | print("MAX=%8.2f MIN=%8.2f" % (max_total_t[0], min_total_t[0])) 643 | print("calc boundaries time ", end='') 644 | print("MAX=%8.2f MIN=%8.2f" % (max_total_t[1], min_total_t[1])) 645 | print("read datasets time ", end='') 646 | print("MAX=%8.2f MIN=%8.2f" % (max_total_t[2], min_total_t[2])) 647 | print("(MAX and MIN timings are among %d processes)" % nprocs) 648 | 649 | def build_evt(self, 650 | start: int = None, 651 | count: int = None) -> List[Dict]: 652 | # This process is responsible for event IDs from start to (start+count-1). 653 | # All data of the same event ID will be used to create a graph. 654 | # This function collects all data based on self._seq_name, or 655 | # self._cnt_name into a python list containing Pandas DataFrames, one 656 | # for a unique event ID. 
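        # A minimal usage sketch (illustrative; the file name and dataset names
        # are assumptions based on self._colmap, not a documented schema):
        #
        #   f = pynuml.io.File("events.h5")
        #   f.add_group("particle_table", ["g4_id", "parent_id", "type"])
        #   f.add_group("spacepoint_table")
        #   f.read_data_all()            # parallel read of this rank's partition
        #   for evt in f.build_evt():    # one Event object per event ID
        #       df = evt["particle_table"]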
657 | if not self._groups: 658 | raise Exception('cannot build event without adding any HDF5 groups') 659 | 660 | ret_list = [] 661 | 662 | if start is None: start = self._my_start 663 | if count is None: count = self._my_count 664 | 665 | if self._use_seq_cnt: 666 | # track the latest used index per group 667 | idx_grp = dict.fromkeys(self._data.keys(), 0) 668 | 669 | # accumulate starting array index per group 670 | idx_start = dict.fromkeys(self._data.keys(), 0) 671 | 672 | # whether idx is presented in a group's _seq_cnt[:,0] 673 | idx_found = dict.fromkeys(self._data.keys(), False) 674 | 675 | # Iterate through assigned event IDs 676 | for idx in range(int(start), int(start+count)): 677 | # check if idx is missing in all groups 678 | is_missing = True 679 | if self._use_seq_cnt: 680 | for group in self._data.keys(): 681 | idx_found[group] = False 682 | dim = self._seq_cnt[group].shape[0] 683 | 684 | # check against the max of this group's 685 | if idx > self._seq_cnt[group][dim-1, 0]: 686 | continue 687 | 688 | # check and search for idx in _seq_cnt[group][:,0] 689 | if idx == idx_grp[group]: 690 | # this is most likely the case when building all graphs 691 | # for all events at once 692 | idx_found[group] = True 693 | idx_grp[group] = idx 694 | elif idx - idx_grp[group] <= 8: 695 | # linear search for idx in _seq_cnt[group][:,0] 696 | # if distance is less than 8, linear search is faster 697 | for jj in range(idx_grp[group], dim): 698 | if idx == self._seq_cnt[group][jj, 0]: 699 | idx_found[group] = True 700 | idx_grp[group] = jj 701 | break 702 | elif idx < self._seq_cnt[group][jj, 0]: 703 | break 704 | else: 705 | # binary search for idx in _seq_cnt[group][:,0] 706 | # Note there is no duplicated values in 707 | # _seq_cnt[group][:,0] and the values are sorted in an 708 | # increasing order 709 | low = idx_grp[group] 710 | high = dim 711 | while low < high: 712 | mid = (low + high) // 2 713 | if self._seq_cnt[group][mid, 0] < idx: 714 | low = mid + 1 715 | elif self._seq_cnt[group][mid, 0] > idx: 716 | high = mid 717 | else: 718 | idx_found[group] = True 719 | idx_grp[group] = mid 720 | break 721 | 722 | if idx_found[group]: 723 | if idx == start: 724 | # Calculate starting array index only necessary for 725 | # first idx. For 2nd and later, idx_start is 726 | # accumulated later 727 | idx_start[group] = self._seq_cnt[group][0:idx_grp[group], 1].sum() 728 | # skip self._parTable group, as it is not used to 729 | # determine whether idx is missing. 
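                        # (the partitioning-key table has exactly one row per
                        # event ID, so checking it would mark every idx as present)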
730 | if group != self._parTable: 731 | is_missing = False 732 | else: 733 | for group in self._data.keys(): 734 | dim = len(self._evt_seq[group]) 735 | # dataset event_id.seq may contain duplicated event IDs 736 | # IDs in this dataset are sorted in a monotonically non-decreasing order 737 | lower = self.binary_search_min(idx, self._evt_seq[group], dim) 738 | upper = self.binary_search_max(idx, self._evt_seq[group], dim) + 1 739 | if lower < upper: 740 | is_missing = False 741 | break 742 | 743 | # this idx is missing in all groups 744 | if is_missing: 745 | continue 746 | 747 | # for each event seq ID, create a dictionary, ret 748 | # first item: key is "index" and value is the event seq ID 749 | # remaining items: key is group name and value is a Pandas DataFrame 750 | # containing the dataset subarray in this group with the event ID, idx 751 | ret = Event(idx, self.index(idx)) 752 | 753 | # Iterate through all groups 754 | for group in self._data.keys(): 755 | 756 | if self._use_seq_cnt: 757 | # Note self._seq_cnt[group][:, 0] is the event ID 758 | # Note self._seq_cnt[group][:, 1] is the number of elements 759 | 760 | if not idx_found[group]: 761 | # For idx is missing from this group but not in other 762 | # groups, create an empty Pandas DataFrame 763 | dfs = [] 764 | for dataset in self._data[group].keys(): 765 | data_dataframe = pd.DataFrame(columns=self._cols(group, dataset)) 766 | dfs.append(data_dataframe) 767 | ret[group] = pd.concat(dfs, axis="columns") 768 | continue 769 | 770 | if group == self._parTable: 771 | # Special treatment for group self._parTable, as its 772 | # seq_cnt[:,1] contains all 1s and earlier increment of 773 | # idx_grp[group] may be skipped due to missing idx 774 | lower = idx_grp[group] 775 | upper = lower + 1 776 | else: 777 | lower = idx_start[group] 778 | upper = self._seq_cnt[group][idx_grp[group], 1] + lower 779 | 780 | # The range from lower to upper (exclusively) is subarray 781 | # indices of elements belonging to the same event ID, idx 782 | 783 | if count > 1: 784 | # increment start array indices to avoid searching the 785 | # already-done data 786 | idx_start[group] += self._seq_cnt[group][idx_grp[group], 1] 787 | idx_grp[group] += 1 788 | 789 | else: 790 | # Note self._evt_seq stores event ID values and is already sorted in 791 | # an increasing order 792 | dim = len(self._evt_seq[group]) 793 | 794 | # Find the local start and end row indices for this event ID, idx 795 | lower = self.binary_search_min(idx, self._evt_seq[group], dim) 796 | upper = self.binary_search_max(idx, self._evt_seq[group], dim) + 1 797 | 798 | # dfs is a python list containing Pandas DataFrame objects 799 | dfs = [] 800 | for dataset in self._data[group].keys(): 801 | if lower >= upper: 802 | # idx is missing from the dataset self._seq_name, 803 | # In this case, create an empty numpy array 804 | data = np.array([]) 805 | else: 806 | # array elements from lower to upper of this dataset have the 807 | # event ID == idx 808 | data = self._data[group][dataset][lower : upper] 809 | 810 | # create a Pandas DataFrame to store the numpy array 811 | df = pd.DataFrame(data, columns=self._cols(group, dataset)) 812 | for col in df.columns: 813 | if df[col].dtype == '|S64' or df[col].dtype == 'object': 814 | df[col] = df[col].str.decode('utf-8') 815 | dfs.append(df) 816 | 817 | # concatenate into the dictionary "ret" with group names as keys 818 | ret[group] = pd.concat(dfs, axis="columns") 819 | 820 | # Add all dictionaries "ret" into a list. 
821 | # Each of them corresponds to the data of one single event ID 822 | ret_list.append(ret) 823 | 824 | return ret_list 825 | 826 | def process(self, 827 | processor: Callable[[Event], Tuple[str, Any]], 828 | out: Callable[[Any, str], None]) -> None: 829 | '''Process all events in this data partition''' 830 | xproc = psutil.Process() 831 | comm = MPI.COMM_WORLD 832 | nprocs = comm.Get_size() 833 | rank = comm.Get_rank() 834 | if rank == 0: 835 | out.write_metadata(processor.metadata) 836 | self.read_data_all() 837 | 838 | verbose = False 839 | 840 | # whether or not to build graphs one event at a time 841 | build_one_evt_at_a_time = True 842 | 843 | if build_one_evt_at_a_time == False: 844 | evt_list = self.build_evt() 845 | for evt in evt_list: 846 | name, data = processor(evt) 847 | if data is not None: out(name, data) 848 | if verbose: 849 | print("Build all events: MPI rank %3d Memory footprint = %8.1f MiB" % 850 | (rank, xproc.memory_info().rss/ 1024.0 ** 2)) 851 | else: 852 | # Iterate through assigned event IDs 853 | for idx in range(int(self._my_start), int(self._my_start+self._my_count)): 854 | evt = self.build_evt(idx, 1) 855 | if len(evt) > 0: 856 | name, data = processor(evt[0]) 857 | if data is not None: out(name, data) 858 | if verbose: 859 | print("Build 1 event at a time: MPI rank %-3d Memory footprint = %8.1f MiB" % 860 | (rank, xproc.memory_info().rss/ 1024.0 ** 2)) 861 | 862 | -------------------------------------------------------------------------------- /pynuml/io/h5interface.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import h5py 4 | import numpy as np 5 | import torch 6 | from torch_geometric.data import Data, HeteroData 7 | 8 | 9 | class H5Interface: 10 | def __init__(self, file: h5py.File): 11 | self.f = file 12 | 13 | def save_data(self, data: Data) -> None: 14 | code 15 | 16 | def _add_dataset(self, key: str, val: Any) -> None: 17 | if np.isscalar(val): 18 | self._data = self._data + (val,) 19 | field = (key, type(val)) 20 | else: 21 | if val.nelement() == 0: # save tensor with zero-sized dimension as a scalar 0 22 | # HDF5 compound data type does not allow zero-size dimension 23 | # ValueError: Zero-sized dimension specified (zero-sized dimension specified) 24 | self._data = self._data + (0,) 25 | field = (key, val.numpy().dtype) 26 | else: 27 | val = val.numpy() # convert a tensor to numpy 28 | self._data = self._data + (val,) 29 | field = (key, val.dtype, val.shape) 30 | self._fields.append(field) 31 | 32 | def save_heterodata(self, data: HeteroData) -> None: 33 | 34 | self._data = () 35 | self._fields = [] 36 | 37 | nodes, edges = data.metadata() 38 | 39 | # save node stores 40 | for node in nodes: 41 | if "_" in node: 42 | raise Exception(f'"{node}" is not a valid node store name! Underscores are not supported.') 43 | for key in data[node].keys(): 44 | self._add_dataset(f'{node}/{key}', data[node][key]) 45 | 46 | # save edge stores 47 | for edge in edges: 48 | for tmp in edge: 49 | if "_" in tmp: 50 | raise Exception(f'"{tmp}" is not a valid edge store name component! 
Underscores are not supported.') 51 | name = "_".join(edge) 52 | for key in data[edge].keys(): 53 | self._add_dataset(f'{name}/{key}', data[edge][key]) 54 | 55 | def save(self, name: str, data: Any) -> None: 56 | if isinstance(data, Data): 57 | self.save_data(data) 58 | elif isinstance(data, HeteroData): 59 | self.save_heterodata(data) 60 | else: 61 | raise NotImplementedError(f'No save method implemented for {type(data)}!') 62 | 63 | # create a scalar dataset of compound data type 64 | ctype = np.dtype(self._fields) 65 | ds = self.f.create_dataset(f'/dataset/{name}', shape=(), dtype=ctype, data=self._data) 66 | del ctype, self._fields, self._data, ds 67 | 68 | def load_heterodata(self, name: str) -> HeteroData: 69 | data = HeteroData() 70 | # Read the whole dataset idx, dataset name is self.groups[idx] 71 | group = self.f[f'dataset/{name}'][()] 72 | for dataset in group.dtype.names: 73 | store, attr = dataset.split('/') 74 | if "_" in store: store = tuple(store.split("_")) 75 | if group[dataset].ndim == 0: 76 | if attr == 'edge_index': # empty edge tensor 77 | data[store][attr] = torch.LongTensor([[],[]]) 78 | else: # scalar 79 | data[store][attr] = torch.as_tensor(group[dataset][()]) 80 | else: # multi-dimension array 81 | data[store][attr] = torch.as_tensor(group[dataset][:]) 82 | return data 83 | 84 | def keys(self) -> list[str]: 85 | return list(self.f['dataset'].keys()) -------------------------------------------------------------------------------- /pynuml/io/out.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from typing import Any 4 | 5 | import h5py 6 | from mpi4py import MPI 7 | 8 | 9 | class PTOut: 10 | def __init__(self, outdir: str): 11 | self.outdir = outdir 12 | isExist = os.path.exists(outdir) 13 | if not isExist: 14 | rank = MPI.COMM_WORLD.Get_rank() 15 | if rank == 0: 16 | print("Error: output directory does not exist", outdir) 17 | sys.stdout.flush() 18 | MPI.COMM_WORLD.Abort(1) 19 | 20 | def __call__(self, name: str, obj: Any) -> None: 21 | import torch 22 | torch.save(obj, os.path.join(self.outdir, name)+".pt") 23 | 24 | def write_metadata(metadata: dict[str, Any]) -> None: 25 | raise NotImplementedError 26 | 27 | def exists(self, name: str) -> bool: 28 | return os.path.exists(os.path.join(self.outdir, name)+".pt") 29 | 30 | class H5Out: 31 | def __init__(self, fname: str, overwrite: bool = False): 32 | # This implements one-file-per-process I/O strategy. 
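        # e.g. (illustrative) H5Out("out/graphs") on 4 MPI ranks produces
        # out/graphs.0000.h5 ... out/graphs.0003.h5, one file per rank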
33 | # append MPI process rank to the output file name 34 | rank = MPI.COMM_WORLD.Get_rank() 35 | file_ext = ".{:04d}.h5" 36 | self.fname = fname + file_ext.format(rank) 37 | if os.path.exists(self.fname): 38 | if overwrite: 39 | os.remove(self.fname) 40 | else: 41 | print(f"Error: file already exists: {self.fname}") 42 | sys.stdout.flush() 43 | MPI.COMM_WORLD.Abort(1) 44 | # open/create the HDF5 file 45 | self.f = h5py.File(self.fname, "w") 46 | 47 | from .h5interface import H5Interface 48 | self.interface = H5Interface(self.f) 49 | # print(f"{rank}: creating {self.fname}") 50 | # sys.stdout.flush() 51 | 52 | def __call__(self, name: str, obj: Any) -> None: 53 | """ 54 | for key, val in obj: 55 | # set chunk sizes to val shape, so there is only one chunk per dataset 56 | # if isinstance(val, torch.Tensor) and val.nelement() == 0 : 57 | # print("zero val ",name,"/",key," shape=",val.shape) 58 | if isinstance(val, torch.Tensor) and val.nelement() > 0 : 59 | # Note compressed datasets can only be read/written in MPI collective I/O mode in HDF5 60 | self.f.create_dataset(f"/{name}/{key}", data=val, chunks=val.shape, compression="gzip") 61 | # The line below is to not enable chunking/compression 62 | # self.f.create_dataset(f"/{name}/{key}", data=val) 63 | else: 64 | # if data is not a tensor or is empty, then disable chunking/compression 65 | self.f.create_dataset(f"/{name}/{key}", data=val) 66 | """ 67 | import numpy as np 68 | import torch_geometric as pyg 69 | 70 | # collect and construct fields of compound data type 71 | fields = [] 72 | data = () 73 | 74 | # special treatment for heterograph object 75 | if isinstance(obj, pyg.data.HeteroData): 76 | self.interface.save(name, obj) 77 | return 78 | for key, val in obj: 79 | if np.isscalar(val): # only n_sp is a scalar 80 | data = data + (val,) 81 | field = (key, type(val)) 82 | else: 83 | if val.nelement() == 0: # save tensor with zero-sized dimension as a scalar 0 84 | # HDF5 compound data type does not allow zero-size dimension 85 | # ValueError: Zero-sized dimension specified (zero-sized dimension specified) 86 | val = val.numpy() # convert a tensor to numpy 87 | data = data + (0,) 88 | field = (key, val.dtype) 89 | else: 90 | val = val.numpy() # convert a tensor to numpy 91 | data = data + (val,) 92 | field = (key, val.dtype, val.shape) 93 | fields.append(field) 94 | ctype = np.dtype(fields) 95 | # create a scalar dataset of compound data type 96 | ds = self.f.create_dataset(f"/{name}", shape=(), dtype=ctype, data=data) 97 | del ctype, fields, data, ds 98 | 99 | def write_metadata(self, metadata: dict[str, Any]) -> None: 100 | for key, val in metadata.items(): 101 | self.f[key] = val 102 | 103 | def __del__(self): 104 | if self.f != None: self.f.close() -------------------------------------------------------------------------------- /pynuml/labels/__init__.py: -------------------------------------------------------------------------------- 1 | from .standard import StandardLabels 2 | from .simple import SimpleLabels 3 | from .flavor import FlavorLabels 4 | from .pdk import PDKLabels -------------------------------------------------------------------------------- /pynuml/labels/ccqe.py: -------------------------------------------------------------------------------- 1 | def ccqe(part): 2 | # get primary for each particle 3 | part = part.set_index("g4_id", drop=False) 4 | 5 | # convert from PDG code to label 6 | def label(pdg): 7 | if abs(pdg) == 11: return 0 # electron 8 | if abs(pdg) == 13: return 1 # muon 9 | return 2 # hadronic 10 | 
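    # e.g. label(-13) -> 1 (muon), label(2212) -> 2 (proton treated as hadronic)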
11 | # trace lineage back from particle to primary and get label 12 | def func(row): 13 | gid = row.g4_id 14 | pid = row.parent_id 15 | while True: 16 | if pid == 0: return label(part.type[gid]) 17 | # if not pid in part.g4_id: return label(part.type[gid]) 18 | # gid = part.g4_id[pid] 19 | try: 20 | gid = part.g4_id[pid] 21 | except KeyError: 22 | return 2 23 | pid = part.parent_id[pid] 24 | 25 | # apply backtrace function to get labels 26 | part["semantic_label"] = part.apply(func, axis=1) 27 | return part.reset_index(drop=True)[["g4_id", "semantic_label"]] 28 | 29 | def panoptic_label(part): 30 | part = semantic_label(part) 31 | part["instance_label"] = -1 32 | return part 33 | 34 | def edge_label(edge): 35 | 36 | # False 37 | edge["label"] = 0 38 | 39 | # EM shower 40 | mask_e = (edge.label_1 == 0) & (edge.label_2 == 0) 41 | edge.loc[mask_e, "label"] = 1 42 | 43 | # Muon 44 | mask_part = (edge.g4_id_1 == edge.g4_id_2) 45 | mask_mu = (edge.label_1 == 1) & (edge.label_2 == 1) 46 | edge.loc[mask_part & mask_mu, "label"] = 2 47 | 48 | # Hadronic 49 | mask_had = (edge.label_1 == 2) & (edge.label_2 == 2) 50 | edge.loc[mask_part & mask_had, "label"] = 3 51 | 52 | return edge 53 | -------------------------------------------------------------------------------- /pynuml/labels/flavor.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | class FlavorLabels: 4 | def __init__(self): 5 | self._labels = ( 6 | 'cc_nue', 7 | 'cc_numu', 8 | 'cc_nutau', 9 | 'nc') 10 | 11 | @property 12 | def labels(self): 13 | return self._labels 14 | 15 | def label(self, idx: int): 16 | if not 0 <= label < len(self._labels): 17 | raise Exception(f'index {idx} out of range for {len(self._labels)} labels.') 18 | return self._labels[idx] 19 | 20 | def index(self, name: str): 21 | if name not in self._labels: 22 | raise Exception(f'"{name}" is not the name of a class.') 23 | return self._labels.index(name) 24 | 25 | @property 26 | def cc_nue(self): 27 | return self.index('cc_nue') 28 | 29 | @property 30 | def cc_numu(self): 31 | return self.index('cc_numu') 32 | 33 | @property 34 | def cc_nutau(self): 35 | return self.index('cc_nutau') 36 | 37 | @property 38 | def nc(self): 39 | return self.index('nc') 40 | 41 | def __call__(self, event: pd.Series): 42 | if not event.is_cc: 43 | return self.nc 44 | pdg = abs(event.nu_pdg) 45 | if pdg == 12: 46 | return self.cc_nue 47 | if pdg == 14: 48 | return self.cc_numu 49 | if pdg == 16: 50 | return self.cc_nutau 51 | raise Exception(f'PDG code {event.nu_pdg} not recognised.') -------------------------------------------------------------------------------- /pynuml/labels/pdk.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | class PDKLabels: 4 | def __init__(self): 5 | self._labels = ('nu', 'pdk') 6 | 7 | @property 8 | def labels(self): 9 | return self._labels 10 | 11 | def label(self, idx: int): 12 | if not 0 <= label < len(self._labels): 13 | raise Exception(f'index {idx} out of range for {len(self._labels)} labels.') 14 | return self._labels[idx] 15 | 16 | def index(self, name: str): 17 | if name not in self._labels: 18 | raise Exception(f'"{name}" is not the name of a class.') 19 | return self._labels.index(name) 20 | 21 | @property 22 | def nu(self): 23 | return self.index('nu') 24 | 25 | @property 26 | def pdk(self): 27 | return self.index('pdk') 28 | 29 | def __call__(self, event: pd.Series): 30 | if 12 <= abs(event.nu_pdg) <= 16: 31 | return self.nu 32 
| else: 33 | return self.pdk -------------------------------------------------------------------------------- /pynuml/labels/simple.py: -------------------------------------------------------------------------------- 1 | from .standard import StandardLabels 2 | 3 | class SimpleLabels(StandardLabels): 4 | def __init__(self, 5 | gamma_threshold: float = 0.02, 6 | hadron_threshold: float = 0.2): 7 | super(SimpleLabels, self).__init__(gamma_threshold, hadron_threshold) 8 | 9 | self._labels = [ 10 | 'MIP', 11 | 'HIP', 12 | 'shower', 13 | 'michel', 14 | 'diffuse', 15 | 'invisible' 16 | ] 17 | 18 | @property 19 | def pion(self): 20 | return self.index('MIP') 21 | 22 | @property 23 | def muon(self): 24 | return self.index('MIP') 25 | 26 | @property 27 | def kaon(self): 28 | return self.index('HIP') 29 | 30 | @property 31 | def hadron(self): 32 | return self.index('HIP') 33 | 34 | @property 35 | def delta(self): 36 | return self.index('MIP') -------------------------------------------------------------------------------- /pynuml/labels/standard.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import particle 3 | 4 | class StandardLabels: 5 | 6 | def __init__(self, 7 | gamma_threshold: float = 0.02, 8 | hadron_threshold: float = 0.2): 9 | self._labels = [ 10 | 'pion', 11 | 'muon', 12 | 'kaon', 13 | 'hadron', 14 | 'shower', 15 | 'michel', 16 | 'diffuse', 17 | 'invisible' 18 | ] 19 | self._gamma_threshold = gamma_threshold 20 | self._hadron_threshold = hadron_threshold 21 | 22 | @property 23 | def labels(self): 24 | return self._labels 25 | 26 | def label(self, idx: int): 27 | if not 0 <= label < len(self._labels): 28 | raise Exception(f'index {idx} out of range for {len(self._labels)} labels.') 29 | return self._labels[idx] 30 | 31 | def index(self, name: str): 32 | if name not in self._labels: 33 | raise Exception(f'"{name}" is not the name of a class.') 34 | return self._labels.index(name) 35 | 36 | @property 37 | def pion(self): 38 | return self.index('pion') 39 | 40 | @property 41 | def muon(self): 42 | return self.index('muon') 43 | 44 | @property 45 | def kaon(self): 46 | return self.index('kaon') 47 | 48 | @property 49 | def hadron(self): 50 | return self.index('hadron') 51 | 52 | @property 53 | def shower(self): 54 | return self.index('shower') 55 | 56 | @property 57 | def michel(self): 58 | return self.index('michel') 59 | 60 | @property 61 | def diffuse(self): 62 | return self.index('diffuse') 63 | 64 | @property 65 | def invisible(self): 66 | return self.index('invisible') 67 | 68 | def __call__(self, 69 | part: pd.DataFrame): 70 | '''Standard labelling function. 71 | 72 | Pion, Muon, Kaon, Hadron, EM shower, Michel electron, 73 | diffuse activity. 
74 | ''' 75 | 76 | def walk(part, particles, depth, sl, il): 77 | def s(part, particles): 78 | sl, slc = -1, None 79 | parent_type = 0 if part.parent_id == 0 else particles.type[part.parent_id] 80 | 81 | def pion_labeler(part, parent_type): 82 | sl = self.pion 83 | slc = None 84 | return sl, slc 85 | 86 | def muon_labeler(part, parent_type): 87 | sl = self.muon 88 | slc = None 89 | return sl, slc 90 | 91 | def kaon_labeler(part, parent_type): 92 | sl = self.kaon 93 | slc = None 94 | return sl, slc 95 | 96 | def neutral_pions_kaons_labeler(part, parent_type): 97 | sl = self.invisible 98 | slc = None 99 | return sl, slc 100 | 101 | def electron_positron_labeler(part, parent_type): 102 | if part.start_process == 'primary': 103 | sl = self.shower 104 | slc = self.shower 105 | elif abs(parent_type) == 13 and (part.start_process == 'muMinusCaptureAtRest' \ 106 | or part.start_process == 'muPlusCaptureAtRest' or part.start_process == 'Decay'): 107 | sl = self.michel 108 | slc = self.michel 109 | elif part.start_process == 'conv' or part.end_process == 'conv' \ 110 | or part.start_process == 'compt' or part.end_process == 'compt': 111 | if part.momentum >= self._gamma_threshold: 112 | sl = self.shower 113 | slc = self.shower 114 | else: 115 | sl = self.diffuse 116 | slc = self.diffuse 117 | elif part.start_process == 'muIoni' or part.start_process == 'hIoni' \ 118 | or part.start_process == 'eIoni': 119 | if part.start_process == 'muIoni': 120 | sl = self.muon 121 | slc = None 122 | elif part.start_process == 'hIoni': 123 | if abs(parent_type) == 2212: 124 | sl = self.hadron 125 | if part.momentum <= 0.0015: sl = self.diffuse 126 | else: 127 | sl = self.pion 128 | slc = None 129 | else: 130 | sl = self.diffuse 131 | slc = None 132 | elif part.start_process == 'eBrem' or part.end_process == 'phot' \ 133 | or part.end_process == 'photonNuclear' or part.end_process == 'eIoni': 134 | sl = self.diffuse 135 | slc = None 136 | elif part.end_process == 'StepLimiter' or part.end_process == 'annihil' \ 137 | or part.end_process == 'eBrem' or part.start_process == 'hBertiniCaptureAtRest' \ 138 | or part.end_process == 'FastScintillation' or part.start_process == 'muPairProd' \ 139 | or part.start_process == 'phot': 140 | sl = self.diffuse 141 | slc = self.diffuse 142 | else: 143 | raise Exception(f'labelling failed for electron with start process "{part.start_process}" and end process "{part.end_process}"') 144 | 145 | return sl, slc 146 | 147 | def gamma_labeler(part, parent_type): 148 | if part.start_process == 'conv' or part.end_process == 'conv' \ 149 | or part.start_process == 'compt' or part.end_process == 'compt': 150 | if part.momentum >= self._gamma_threshold: 151 | sl = self.shower 152 | slc = self.shower 153 | else: 154 | sl = self.diffuse 155 | slc = self.diffuse 156 | elif part.start_process == 'eBrem' or part.end_process == 'phot' \ 157 | or part.end_process == 'photonNuclear': 158 | sl = self.diffuse 159 | slc = None 160 | else: 161 | raise Exception(f'labelling failed for photon with start process "{part.start_process}" and end process "{part.end_process}"') 162 | return sl, slc 163 | 164 | def unlabeled_particle(part, parent_type): 165 | raise Exception(f"particle not recognised!
PDG code {part.type}, parent PDG code {parent_type}, start process {part.start_process}, end process {part.end_process}") 166 | 167 | particle_processor = { 168 | 211: pion_labeler, 169 | 221: pion_labeler, 170 | 331: pion_labeler, 171 | 223: pion_labeler, 172 | 13: muon_labeler, 173 | 321: kaon_labeler, 174 | 111: neutral_pions_kaons_labeler, 175 | 311: neutral_pions_kaons_labeler, 176 | 310: neutral_pions_kaons_labeler, 177 | 130: neutral_pions_kaons_labeler, 178 | 113: neutral_pions_kaons_labeler, 179 | 411: kaon_labeler, # D meson 180 | 11: electron_positron_labeler, 181 | 22: gamma_labeler 182 | } 183 | 184 | if particle.pdgid.charge(part.type) == 0 and part.end_process == 'CoupledTransportation': 185 | # neutral particle left the volume boundary 186 | sl = self.invisible 187 | else: 188 | func = particle_processor.get(abs(part.type), lambda x ,y: (-1, None)) 189 | sl, slc = func(part, parent_type) 190 | 191 | # baryon interactions - hadron or diffuse 192 | if (particle.pdgid.is_baryon(part.type) and particle.pdgid.charge(part.type) == 0) \ 193 | or particle.pdgid.is_nucleus(part.type): 194 | sl = self.diffuse 195 | if particle.pdgid.is_baryon(part.type) and particle.pdgid.charge(part.type) != 0: 196 | if abs(part.type) == 2212 and part.momentum >= self._hadron_threshold: 197 | sl = self.hadron 198 | else: 199 | sl = self.diffuse 200 | 201 | # call a charged tau highly ionising - should revisit this 202 | if abs(part.type) == 15: 203 | sl = self.hadron 204 | 205 | # check to make sure particle was assigned 206 | if sl == -1: 207 | unlabeled_particle(part, parent_type) 208 | 209 | return sl, slc 210 | 211 | def i(part, particles, sl): 212 | il, ilc = -1, None 213 | if sl == self.muon and part.start_process == 'muIoni': 214 | il = part.parent_id 215 | elif (sl == self.pion or sl == self.hadron) and part.start_process == 'hIoni': 216 | il = part.parent_id 217 | elif sl != self.diffuse and sl != self.invisible: 218 | il = part.g4_id 219 | if sl == self.shower: ilc = il 220 | if sl == self.michel: ilc = il 221 | return il, ilc 222 | 223 | if sl is not None: slc = sl 224 | else: sl, slc = s(part, particles) 225 | 226 | if il is not None: ilc = il 227 | else: il, ilc = i(part, particles, sl) 228 | 229 | ret = [ { 230 | "g4_id": part.g4_id, 231 | "parent_id": part.parent_id, 232 | "type": part.type, 233 | "start_process": part.start_process, 234 | "end_process": part.end_process, 235 | "momentum": part.momentum, 236 | "semantic_label": sl, 237 | "instance_label": il } ] 238 | for _, row in particles[(part.g4_id==particles.parent_id)].iterrows(): 239 | ret += walk(row, particles, depth+1, slc, ilc) 240 | return ret 241 | 242 | ret = [] 243 | part = part.set_index("g4_id", drop=False) 244 | primaries = part[(part.parent_id==0)] 245 | for _, primary in primaries.iterrows(): 246 | ret += walk(primary, part, 0, None, None) 247 | if len(ret)==0: return 248 | labels = pd.DataFrame.from_dict(ret) 249 | instances = { val: i for i, val in enumerate(labels[(labels.instance_label>=0)].instance_label.unique()) } 250 | 251 | def alias_instance(row, instances): 252 | if row.instance_label == -1: return -1 253 | return instances[row.instance_label] 254 | 255 | labels["instance_label"] = labels.apply(alias_instance, args=[instances], axis="columns") 256 | return labels 257 | 258 | def validate(self, labels: pd.Series): 259 | mask = (labels < 0) | (labels >= len(self._labels) - 1) 260 | if mask.any(): 261 | raise Exception(f'{mask.sum()} semantic labels are out of range: {labels[mask]}.') 262 | 
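A minimal usage sketch for the semantic labeller above, assuming only the particle-table columns it reads (g4_id, parent_id, type, momentum, start_process, end_process); the single primary muon below is an illustrative row, not taken from a real event file:

    import pandas as pd
    import pynuml

    # illustrative one-particle event: a single primary muon (PDG code 13)
    part = pd.DataFrame([{
        "g4_id": 1, "parent_id": 0, "type": 13, "momentum": 1.0,
        "start_process": "primary", "end_process": "Decay"}])

    labeller = pynuml.labels.StandardLabels()
    labels = labeller(part)
    # semantic_label holds the class index (here labeller.muon);
    # instance_label is a per-event instance index starting from zero
    print(labels[["g4_id", "semantic_label", "instance_label"]])
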
-------------------------------------------------------------------------------- /pynuml/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set name = "pynuml" %} 2 | {% set version = "23.11.0" %} 3 | 4 | package: 5 | name: {{ name|lower }} 6 | version: {{ version }} 7 | 8 | source: 9 | url: https://pypi.io/packages/source/{{ name[0] }}/{{ name }}/pynuml-{{ version }}.tar.gz 10 | sha256: 1a7e61864cfeb0b27c6a93646c33e3f457bbc384eb86aee4df76b5e02898d02f 11 | 12 | build: 13 | noarch: python 14 | script: {{ PYTHON }} -m pip install . -vv 15 | number: 0 16 | 17 | requirements: 18 | host: 19 | - python >=3.7 20 | - flit >=3.2,<4 21 | - pip 22 | run: 23 | - python >=3.7 24 | - h5py >=3.7.0 25 | - mpi4py 26 | - pandas 27 | - particle 28 | - plotly 29 | - pytorch >=1.12.1 30 | - pyg >=2.1.0 31 | 32 | test: 33 | imports: 34 | - pynuml 35 | commands: 36 | - pip check 37 | requires: 38 | - pip 39 | 40 | about: 41 | summary: Standardised ML input processing for particle physics 42 | license: MIT 43 | license_file: LICENSE 44 | 45 | extra: 46 | recipe-maintainers: 47 | - vhewes 48 | -------------------------------------------------------------------------------- /pynuml/plot/__init__.py: -------------------------------------------------------------------------------- 1 | from .graph import GraphPlot -------------------------------------------------------------------------------- /pynuml/plot/graph.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from torch_geometric.data import Batch, HeteroData 3 | import plotly.express as px 4 | from plotly.graph_objects import FigureWidget 5 | import warnings 6 | 7 | class GraphPlot: 8 | def __init__(self, 9 | planes: list[str], 10 | classes: list[str], 11 | filter_threshold: float = 0.5): 12 | self._planes = planes 13 | self._classes = classes 14 | self._labels = pd.CategoricalDtype(['background']+classes, ordered=True) 15 | self._cmap = { c: px.colors.qualitative.Plotly[i] for i, c in enumerate(classes) } 16 | self._cmap['background'] = 'lightgrey' 17 | self.filter_threshold = filter_threshold 18 | 19 | # temporarily silence this pandas warning triggered by plotly, 20 | # which we don't have any power to fix but will presumably 21 | # be fixed on their end at some point 22 | warnings.filterwarnings("ignore", ".*The default of observed=False is deprecated and will be changed to True in a future version of pandas.*") 23 | self._truth_cols = ( 'g4_id', 'parent_id', 'pdg' ) 24 | 25 | def to_dataframe(self, data: HeteroData): 26 | def to_categorical(arr): 27 | return pd.Categorical.from_codes(codes=arr+1, dtype=self._labels) 28 | if isinstance(data, Batch): 29 | raise Exception('to_dataframe does not support batches!') 30 | dfs = [] 31 | for p in self._planes: 32 | plane = data[p].to_dict() 33 | df = pd.DataFrame(plane['id'], columns=['id']) 34 | df['plane'] = p 35 | df[['wire','time']] = plane['pos'] 36 | if "c" in plane: 37 | df[["x", "y", "z"]] = plane["c"] 38 | df['y_filter'] = plane['y_semantic'] != -1 39 | mask = df.y_filter.values 40 | df['y_semantic'] = to_categorical(plane['y_semantic']) 41 | df['y_instance'] = plane['y_instance'].numpy().astype(str) 42 | 43 | # add detailed truth information if it's available 44 | for col in self._truth_cols: 45 | if col in plane.keys(): 46 | df[col] = plane[col].numpy() 47 | 48 | # add model prediction if it's available 49 | if 'x_semantic' in plane.keys(): 50 | df['x_semantic'] = 
to_categorical(plane['x_semantic'].argmax(dim=-1).detach()) 51 | df[self._classes] = plane['x_semantic'].detach() 52 | if 'x_filter' in plane.keys(): 53 | df['x_filter'] = plane['x_filter'].detach() 54 | if "i" in plane.keys(): 55 | df["i"] = plane["i"].numpy().astype(str) 56 | 57 | dfs.append(df) 58 | df = pd.concat(dfs) 59 | md = data['metadata'] 60 | df['run'] = md.run.item() 61 | df['subrun'] = md.subrun.item() 62 | df['event'] = md.event.item() 63 | return df 64 | 65 | def plot(self, 66 | data: HeteroData, 67 | target: str = 'hits', 68 | how: str = 'none', 69 | filter: str = 'show', 70 | xyz: bool = False, 71 | width: int = None, 72 | height: int = None, 73 | title: bool = True) -> FigureWidget: 74 | 75 | df = self.to_dataframe(data) 76 | 77 | # no colour 78 | if target == 'hits': 79 | opts = { 80 | 'title': 'Graph hits', 81 | } 82 | 83 | # semantic labels 84 | elif target == 'semantic': 85 | if how == 'true': 86 | opts = { 87 | 'title': 'True semantic labels', 88 | 'labels': { 'y_semantic': 'Semantic label' }, 89 | 'color': 'y_semantic', 90 | 'color_discrete_map': self._cmap, 91 | } 92 | elif how == 'pred': 93 | opts = { 94 | 'title': 'Predicted semantic labels', 95 | 'labels': { 'x_semantic': 'Semantic label' }, 96 | 'color': 'x_semantic', 97 | 'color_discrete_map': self._cmap, 98 | } 99 | elif how in self._classes: 100 | opts = { 101 | 'title': f'Predicted semantic label strength for {how} class', 102 | 'labels': { how: f'{how} probability' }, 103 | 'color': how, 104 | 'color_continuous_scale': px.colors.sequential.Reds, 105 | } 106 | else: 107 | raise Exception('for semantic labels, "how" must be one of "true", "pred" or the name of a class.') 108 | 109 | # instance labels 110 | elif target == 'instance': 111 | if how == 'true': 112 | opts = { 113 | 'title': 'True instance labels', 114 | 'labels': { 'y_instance': 'Instance label' }, 115 | 'color': 'y_instance', 116 | 'symbol': 'y_semantic', 117 | 'color_discrete_map': self._cmap, 118 | } 119 | elif how == 'pred': 120 | opts = { 121 | 'title': 'Predicted instance labels', 122 | 'labels': { 'i': 'Instance label' }, 123 | 'color': 'i', 124 | 'color_discrete_map': self._cmap, 125 | } 126 | else: 127 | raise Exception('for instance labels, "how" must be one of "true" or "pred".') 128 | 129 | # filter labels 130 | elif target == 'filter': 131 | if how == 'true': 132 | opts = { 133 | 'title': 'True filter labels', 134 | 'labels': { 'y_filter': 'Filter label' }, 135 | 'color': 'y_filter', 136 | 'color_discrete_map': { 0: 'coral', 1: 'mediumseagreen' }, 137 | } 138 | elif how == 'pred': 139 | opts = { 140 | 'title': 'Predicted filter labels', 141 | 'labels': { 'x_filter': 'Filter label' }, 142 | 'color': 'x_filter', 143 | 'color_continuous_scale': px.colors.sequential.Reds, 144 | } 145 | else: 146 | raise Exception('for filter labels, "how" must be one of "true" or "pred".') 147 | 148 | else: 149 | raise Exception('"target" must be one of "hits", "semantic", "instance" or "filter".') 150 | 151 | if filter == 'none': 152 | # don't do any filtering 153 | pass 154 | elif filter == 'show': 155 | # show hits predicted to be background in grey 156 | if target == 'semantic' and how == 'pred': 157 | df.x_semantic[df.x_filter < self.filter_threshold] = 'background' 158 | elif filter == 'true': 159 | # remove true background hits 160 | df = df[df.y_filter.values] 161 | opts['title'] += ' (filtered by truth)' 162 | elif filter == 'pred': 163 | # remove predicted background hits 164 | df = df[df.x_filter > self.filter_threshold] 165 | opts['title'] += 
' (filtered by prediction)' 166 | else: 167 | raise Exception('"filter" must be one of "none", "show", "true" or "pred".') 168 | 169 | if not title: 170 | opts.pop('title') 171 | 172 | # set hover data 173 | opts['hover_data'] = { 174 | 'y_semantic': True, 175 | "y_instance": True, 176 | 'wire': ':.1f', 177 | 'time': ':.1f', 178 | } 179 | opts['labels'] = { 180 | 'y_filter': 'filter truth', 181 | 'y_semantic': 'semantic truth', 182 | 'y_instance': 'instance truth', 183 | } 184 | if 'x_filter' in df: 185 | opts['hover_data']['x_filter'] = True 186 | opts['labels']['x_filter'] = 'filter prediction' 187 | if 'x_semantic' in df: 188 | opts['hover_data']['x_semantic'] = True 189 | opts['labels']['x_semantic'] = 'semantic prediction' 190 | if 'i' in df: 191 | opts['hover_data']['i'] = ':.4f' 192 | opts['labels']['i'] = 'instance prediction' 193 | for col in self._truth_cols: 194 | if col in df: 195 | opts['hover_data'][col] = True 196 | 197 | if xyz: 198 | fig = px.scatter_3d(df, x="x", y="y", z="z", 199 | width=width, height=height, **opts) 200 | fig.update_traces(marker_size=1) 201 | else: 202 | fig = px.scatter(df, x='wire', y='time', facet_col='plane', 203 | width=width, height=height, **opts) 204 | fig.update_xaxes(matches=None) 205 | for a in fig.layout.annotations: 206 | a.text = a.text.replace('plane=', '') 207 | 208 | # set the legend to horizontal 209 | fig.update_layout( 210 | legend_orientation='h', 211 | legend_yanchor='bottom', legend_y=1.05, 212 | legend_xanchor='right', legend_x=1, 213 | margin_l=20, margin_r=20, margin_t=20, margin_b=20, 214 | title_automargin=title, 215 | ) 216 | 217 | return FigureWidget(fig) -------------------------------------------------------------------------------- /pynuml/process/__init__.py: -------------------------------------------------------------------------------- 1 | from .hitgraph import HitGraphProducer 2 | -------------------------------------------------------------------------------- /pynuml/process/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from typing import Any, Dict, List, Tuple 3 | 4 | from ..io import File 5 | 6 | class ProcessorBase(ABC): 7 | '''Base class for event processing''' 8 | 9 | def __init__(self, file: File): 10 | for group, keys in self.columns.items(): 11 | file.add_group(group, keys) 12 | 13 | @property 14 | def columns(self) -> Dict[str, List[str]]: 15 | raise NotImplementedError 16 | 17 | def __call__(self, evt: Any) -> Tuple[str, Any]: 18 | raise NotImplementedError -------------------------------------------------------------------------------- /pynuml/process/hitgraph.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Callable 2 | import numpy as np 3 | import pandas as pd 4 | 5 | import torch 6 | import torch_geometric as pyg 7 | 8 | from .base import ProcessorBase 9 | 10 | class HitGraphProducer(ProcessorBase): 11 | '''Process event into graphs''' 12 | 13 | def __init__(self, 14 | file: 'pynuml.io.File', 15 | semantic_labeller: Callable = None, 16 | event_labeller: Callable = None, 17 | label_vertex: bool = False, 18 | label_position: bool = False, 19 | planes: list[str] = ['u','v','y'], 20 | node_pos: list[str] = ['local_wire','local_time'], 21 | pos_norm: list[float] = [0.3,0.055], 22 | node_feats: list[str] = ['integral','rms'], 23 | lower_bound: int = 20, 24 | store_detailed_truth: bool = False): 25 | 26 | self.semantic_labeller = semantic_labeller 27 | self.event_labeller 
= event_labeller 28 | self.label_vertex = label_vertex 29 | self.label_position = label_position 30 | self.planes = planes 31 | self.node_pos = node_pos 32 | self.pos_norm = torch.tensor(pos_norm).float() 33 | self.node_feats = node_feats 34 | self.lower_bound = lower_bound 35 | self.store_detailed_truth = store_detailed_truth 36 | 37 | self.transform = pyg.transforms.Compose(( 38 | pyg.transforms.Delaunay(), 39 | pyg.transforms.FaceToEdge())) 40 | 41 | super().__init__(file) 42 | 43 | @property 44 | def columns(self) -> dict[str, list[str]]: 45 | groups = { 46 | 'hit_table': ['hit_id','local_plane','local_time','local_wire','integral','rms'], 47 | 'spacepoint_table': [] 48 | } 49 | if self.semantic_labeller: 50 | groups['particle_table'] = ['g4_id','parent_id','type','momentum','start_process','end_process'] 51 | groups['edep_table'] = [] 52 | if self.event_labeller: 53 | groups['event_table'] = ['is_cc', 'nu_pdg'] 54 | if self.label_vertex: 55 | keys = ['nu_vtx_corr','nu_vtx_wire_pos','nu_vtx_wire_time'] 56 | if 'event_table' in groups: 57 | groups['event_table'].extend(keys) 58 | else: 59 | groups['event_table'] = keys 60 | if self.label_position: 61 | groups["edep_table"] = [] 62 | return groups 63 | 64 | @property 65 | def metadata(self): 66 | metadata = { 'planes': self.planes } 67 | if self.semantic_labeller is not None: 68 | metadata['semantic_classes'] = self.semantic_labeller.labels[:-1] 69 | if self.event_labeller is not None: 70 | metadata['event_classes'] = self.event_labeller.labels 71 | return metadata 72 | 73 | def __call__(self, evt: 'pynuml.io.Event') -> tuple[str, Any]: 74 | 75 | if self.event_labeller or self.label_vertex: 76 | event = evt['event_table'].squeeze() 77 | 78 | hits = evt['hit_table'] 79 | spacepoints = evt['spacepoint_table'].reset_index(drop=True) 80 | 81 | # discard any events with pathologically large hit integrals 82 | # this is a hotfix that should be removed once the dataset is fixed 83 | if hits.integral.max() > 1e6: 84 | print('found event with pathologically large hit integral, skipping') 85 | return evt.name, None 86 | 87 | # handle energy depositions 88 | if self.semantic_labeller: 89 | edeps = evt['edep_table'] 90 | energy_col = 'energy' if 'energy' in edeps.columns else 'energy_fraction' # for backwards compatibility 91 | 92 | # get ID of max particle 93 | g4_id = edeps[[energy_col, 'g4_id', 'hit_id']] 94 | g4_id = g4_id.sort_values(by=[energy_col], 95 | ascending=False, 96 | kind='mergesort').drop_duplicates('hit_id') 97 | hits = g4_id.merge(hits, on='hit_id', how='right') 98 | 99 | # charge-weighted average of 3D position 100 | if self.label_position: 101 | edeps = edeps[["hit_id", "energy", "x_position", "y_position", "z_position"]] 102 | for col in ["x_position", "y_position", "z_position"]: 103 | edeps.loc[:, col] *= edeps.energy 104 | edeps = edeps.groupby("hit_id").sum() 105 | for col in ["x_position", "y_position", "z_position"]: 106 | edeps.loc[:, col] /= edeps.energy 107 | edeps = edeps.drop("energy", axis="columns") 108 | hits = edeps.merge(hits, on="hit_id", how="right") 109 | 110 | hits['filter_label'] = ~hits[energy_col].isnull() 111 | hits = hits.drop(energy_col, axis='columns') 112 | 113 | # reset spacepoint index 114 | spacepoints = spacepoints.reset_index(names='index_3d') 115 | 116 | # skip events with fewer than lower_bnd simulated hits in any plane. 
117 | # note that we can't just do a pandas groupby here, because that will 118 | # skip over any planes with zero hits 119 | for i in range(len(self.planes)): 120 | planehits = hits[hits.local_plane==i] 121 | nhits = planehits.filter_label.sum() if self.semantic_labeller else planehits.shape[0] 122 | if nhits < self.lower_bound: 123 | return evt.name, None 124 | 125 | # get labels for each particle 126 | if self.semantic_labeller: 127 | particles = self.semantic_labeller(evt['particle_table']) 128 | try: 129 | hits = hits.merge(particles, on='g4_id', how='left') 130 | except: 131 | print('exception occurred when merging hits and particles') 132 | print('hit table:', hits) 133 | print('particle table:', particles) 134 | print('skipping this event') 135 | return evt.name, None 136 | mask = (~hits.g4_id.isnull()) & (hits.semantic_label.isnull()) 137 | if mask.any(): 138 | print(f'found {mask.sum()} orphaned hits.') 139 | return evt.name, None 140 | del mask 141 | 142 | data = pyg.data.HeteroData() 143 | 144 | # event metadata 145 | r, sr, e = evt.event_id 146 | data['metadata'].run = r 147 | data['metadata'].subrun = sr 148 | data['metadata'].event = e 149 | 150 | # spacepoint nodes 151 | if "position_x" in spacepoints.keys(): 152 | data["sp"].pos = torch.tensor(spacepoints[[f"position_{c}" for c in ("x", "y", "z")]].values).float() 153 | else: 154 | data['sp'].num_nodes = spacepoints.shape[0] 155 | 156 | # draw graph edges 157 | for i, plane_hits in hits.groupby('local_plane'): 158 | 159 | p = self.planes[i] 160 | plane_hits = plane_hits.reset_index(drop=True).reset_index(names='index_2d') 161 | 162 | # node position 163 | pos = torch.tensor(plane_hits[self.node_pos].values).float() 164 | data[p].pos = pos * self.pos_norm[None,:] 165 | 166 | # node features 167 | data[p].x = torch.tensor(plane_hits[self.node_feats].values).float() 168 | 169 | # node true position 170 | if self.label_position: 171 | data[p].c = torch.tensor(plane_hits[["x_position", "y_position", "z_position"]].values).float() 172 | 173 | # hit indices 174 | data[p].id = torch.tensor(plane_hits['hit_id'].values).long() 175 | 176 | # 2D edges 177 | data[p, 'plane', p].edge_index = self.transform(data[p]).edge_index 178 | 179 | # 3D edges 180 | edge3d = spacepoints.merge(plane_hits[['hit_id','index_2d']].add_suffix(f'_{p}'), 181 | on=f'hit_id_{p}', 182 | how='inner') 183 | edge3d = edge3d[[f'index_2d_{p}','index_3d']].values.transpose() 184 | edge3d = torch.tensor(edge3d) if edge3d.size else torch.empty((2,0)) 185 | data[p, 'nexus', 'sp'].edge_index = edge3d.long() 186 | 187 | # truth information 188 | if self.semantic_labeller: 189 | data[p].y_semantic = torch.tensor(plane_hits['semantic_label'].fillna(-1).values).long() 190 | data[p].y_instance = torch.tensor(plane_hits['instance_label'].fillna(-1).values).long() 191 | if self.store_detailed_truth: 192 | data[p].g4_id = torch.tensor(plane_hits['g4_id'].fillna(-1).values).long() 193 | data[p].parent_id = torch.tensor(plane_hits['parent_id'].fillna(-1).values).long() 194 | data[p].pdg = torch.tensor(plane_hits['type'].fillna(-1).values).long() 195 | if self.label_vertex: 196 | vtx_2d = torch.tensor([ event[f'nu_vtx_wire_pos_{i}'], event.nu_vtx_wire_time ]).float() 197 | data[p].y_vtx = vtx_2d * self.pos_norm[None,:] 198 | 199 | # event label 200 | if self.event_labeller: 201 | data['evt'].y = torch.tensor(self.event_labeller(event)).long() 202 | 203 | # 3D vertex truth 204 | if self.label_vertex: 205 | vtx_3d = [ [ event.nu_vtx_corr_x, event.nu_vtx_corr_y, event.nu_vtx_corr_z 
] ] 206 | data['evt'].y_vtx = torch.tensor(vtx_3d).float() 207 | 208 | return evt.name, data -------------------------------------------------------------------------------- /pynuml/process/spmap.py: -------------------------------------------------------------------------------- 1 | import pynuml 2 | 3 | def process_event(key, out, sp, hit, part, edep, l=standard, voxelsize=1): 4 | """Process an event into a 3D pixel map""" 5 | import numpy as np, torch, MinkowskiEngine as ME 6 | 7 | # skip any events with no simulated hits 8 | if (hit.index==key).sum() == 0: return 9 | if (edep.index==key).sum() == 0: return 10 | 11 | # label true particles 12 | evt_part = part.loc[key].reset_index(drop=True) 13 | evt_part = l.panoptic_label(evt_part) 14 | 15 | # get energy depositions and ground truth 16 | evt_edep = edep.loc[key].reset_index(drop=True) 17 | evt_edep = evt_edep.merge(evt_part[["g4_id", "semantic_label"]], on="g4_id", how="left").drop("g4_id", axis="columns") 18 | scores = evt_edep.groupby(["hit_id", "semantic_label"]).agg({"energy": "sum"}).reset_index() 19 | 20 | # class number and names 21 | n = len(l.label) - 1 22 | lnames = [ it.name for it in l.label ][:-1] 23 | noise = np.zeros(n) 24 | noise[l.label.diffuse.value] = 1 25 | 26 | def fractional_truth(row, n): 27 | label = np.zeros(n) 28 | label[int(row.semantic_label)] = row.energy 29 | return label 30 | scores["slabel"] = scores.apply(fractional_truth, args=[n], axis="columns") 31 | scores = scores.groupby("hit_id").agg({"slabel": "sum"}) 32 | 33 | # Propagate labels to hits 34 | evt_hit = hit.loc[key].reset_index(drop=True).merge(scores, on="hit_id", how="inner") 35 | evt_sp = sp.loc[key].reset_index(drop=True) 36 | 37 | # skip events with fewer than 50 simulated hits in any plane, or fewer than 50 spacepoints 38 | for i in range(3): 39 | if (evt_hit.global_plane==i).sum() < 50: return 40 | if evt_sp.shape[0] < 50: return 41 | 42 | # merge hits into spacepoints 43 | for plane in ["u","v","y"]: 44 | evt_sp = evt_sp.merge(evt_hit[["hit_id","integral","slabel"]].add_suffix(f"_{plane}"), on=f"hit_id_{plane}", how="left") 45 | evt_sp[f"integral_{plane}"] = evt_sp[f"integral_{plane}"].fillna(0) 46 | 47 | def merge_truth(row, n): 48 | labels = np.zeros(n) 49 | for plane in ["u","v","y"]: 50 | vals = row[f"slabel_{plane}"] 51 | if type(vals) != float: labels += vals 52 | return labels 53 | 54 | evt_sp["slabel"] = evt_sp.apply(merge_truth, args=[len(l.label)-1], axis="columns") 55 | evt_sp = evt_sp[["slabel", "position_x", "position_y", "position_z", "integral_u", "integral_v", "integral_y"]] 56 | 57 | # voxelise spacepoints and aggregate labels 58 | def voxelise(row): 59 | return np.floor(row.position_x/voxelsize), np.floor(row.position_y/voxelsize), np.floor(row.position_z/voxelsize) 60 | evt_sp["c"] = evt_sp.apply(voxelise, axis="columns") 61 | evt_sp = evt_sp.drop(["position_x", "position_y", "position_z"], axis="columns") 62 | evt_sp = evt_sp.groupby("c").agg({"integral_u": "sum", "integral_v": "sum", "integral_y": "sum", "slabel": "sum"}).reset_index() 63 | def norm_truth(row, noise): 64 | lsum = row.slabel.sum() 65 | return noise if lsum == 0 else row.slabel / lsum 66 | evt_sp["slabel"] = evt_sp.apply(norm_truth, args=[noise], axis="columns") 67 | 68 | spm = { 69 | "f": torch.tensor(evt_sp[["integral_u", "integral_v", "integral_y"]].to_numpy()).float(), 70 | "c": torch.tensor(evt_sp["c"]).int(), 71 | "ys": torch.tensor(evt_sp["slabel"]).float() 72 | } 73 | out.save(spm, f"r{key[0]}_sr{key[1]}_evt{key[2]}") 74 | 75 | def 
process_file(out, fname, p=process_event, l=standard, voxelsize=1): 76 | """Process all events in a file into graphs""" 77 | f = NuMLFile(fname) 78 | 79 | evt = f.get_dataframe("event_table", ["event_id"]) 80 | sp = f.get_dataframe("spacepoint_table") 81 | hit = f.get_dataframe("hit_table") 82 | part = f.get_dataframe("particle_table", ["event_id", "g4_id", "parent_id", "type", "momentum", "start_process", "end_process"]) 83 | edep = f.get_dataframe("edep_table") 84 | 85 | # loop over events in file 86 | for key in evt.index: p(key, out, sp, hit, part, edep, l, voxelsize) 87 | 88 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["flit_core >=3.2,<4"] 3 | build-backend = "flit_core.buildapi" 4 | 5 | [project] 6 | name = "pynuml" 7 | authors = [{name = "v hewes", email = "vhewes@fnal.gov"}] 8 | requires-python = '>=3.7' 9 | readme = "README.md" 10 | license = {file = "LICENSE"} 11 | 12 | classifiers = [ 13 | "Intended Audience :: Science/Research", 14 | "License :: OSI Approved :: MIT License", 15 | ] 16 | dependencies = [ 17 | "h5py>=3.7.0", 18 | "mpi4py", 19 | "pandas", 20 | "particle", 21 | "plotly", 22 | "torch>=1.12.1", 23 | "torch-geometric>=2.1.0", 24 | ] 25 | dynamic = ["version", "description"] 26 | 27 | [project.urls] 28 | Home = "https://github.com/nugraph/pynuml" 29 | -------------------------------------------------------------------------------- /scripts/install_ph5concat_conda.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | git clone https://github.com/NU-CUCIS/ph5concat 3 | cd ph5concat 4 | autoreconf -i 5 | ./configure --prefix=$CONDA_PREFIX \ 6 | --with-mpi=$CONDA_PREFIX \ 7 | --with-hdf5=$CONDA_PREFIX \ 8 | CFLAGS="-O2 -DNDEBUG" \ 9 | CXXFLAGS="-O2 -DNDEBUG" \ 10 | LIBS="-ldl -lz" \ 11 | --enable-profiling 12 | make install 13 | cd .. 
14 | rm -fr ph5concat 15 | -------------------------------------------------------------------------------- /tests/test_process.py: -------------------------------------------------------------------------------- 1 | """Test pynuml graph processing and plotting""" 2 | import pynuml 3 | 4 | def test_process_uboone(): 5 | """Test graph processing with MicroBooNE open data release""" 6 | f = pynuml.io.File("/raid/nugraph/uboone-opendata/uboone-opendata.evt.h5") 7 | processor = pynuml.process.HitGraphProducer( 8 | file=f, 9 | semantic_labeller=pynuml.labels.StandardLabels(), 10 | event_labeller=pynuml.labels.FlavorLabels(), 11 | label_vertex=True) 12 | plot = pynuml.plot.GraphPlot( 13 | planes=["u", "v", "y"], 14 | classes=pynuml.labels.StandardLabels().labels[:-1]) 15 | f.read_data(0, 100) 16 | evts = f.build_evt() 17 | for evt in evts: 18 | _, data = processor(evt) 19 | if not data: 20 | continue 21 | plot.plot(data, target='semantic', how='true', filter='show') 22 | plot.plot(data, target='instance', how='true', filter='true') 23 | 24 | def test_process_dune_nutau(): 25 | """Test graph processing with DUNE beam nutau dataset""" 26 | f = pynuml.io.File("/raid/nugraph/dune-nutau/test.evt.h5") 27 | processor = pynuml.process.HitGraphProducer( 28 | file=f, 29 | semantic_labeller=pynuml.labels.StandardLabels(), 30 | event_labeller=pynuml.labels.FlavorLabels(), 31 | label_position=True) 32 | plot = pynuml.plot.GraphPlot( 33 | planes=["u", "v", "y"], 34 | classes=pynuml.labels.StandardLabels().labels[:-1]) 35 | f.read_data(0, 100) 36 | evts = f.build_evt() 37 | for evt in evts: 38 | _, data = processor(evt) 39 | if not data: 40 | continue 41 | plot.plot(data, target="filter", how="true", filter="show") 42 | plot.plot(data, target='semantic', how='true', filter='show') 43 | plot.plot(data, target='instance', how='true', filter='true') 44 | plot.plot(data, target="semantic", how="true", filter="show", xyz=True) 45 | --------------------------------------------------------------------------------
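The tests above exercise the full graph producer end to end; the event labellers can also be called directly on a single row of the event table. A minimal sketch, where the pandas Series is illustrative rather than read from an event file:

    import pandas as pd
    import pynuml

    # illustrative event record containing the two fields FlavorLabels reads
    event = pd.Series({"is_cc": True, "nu_pdg": 14})

    flavor = pynuml.labels.FlavorLabels()
    label = flavor(event)           # index of the charged-current numu class
    assert label == flavor.cc_numu
    print(flavor.labels[label])     # -> 'cc_numu'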