├── .github └── workflows │ └── pypi.yml ├── .gitignore ├── .readthedocs.yaml ├── LICENSE ├── README.md ├── docs ├── Makefile ├── conf.py ├── index.rst ├── install │ └── installation.rst └── make.bat ├── pynuml ├── .gitignore ├── __init__.py ├── io │ ├── __init__.py │ ├── file.py │ ├── h5interface.py │ └── out.py ├── labels │ ├── __init__.py │ ├── ccqe.py │ ├── flavor.py │ ├── pdk.py │ ├── simple.py │ └── standard.py ├── meta.yaml ├── plot │ ├── __init__.py │ └── graph.py └── process │ ├── __init__.py │ ├── base.py │ ├── hitgraph.py │ └── spmap.py ├── pyproject.toml ├── scripts └── install_ph5concat_conda.sh └── tests └── test_process.py /.github/workflows/pypi.yml: -------------------------------------------------------------------------------- 1 | name: pypi 2 | 3 | on: 4 | release: 5 | types: [published] 6 | 7 | permissions: 8 | contents: read 9 | 10 | jobs: 11 | deploy: 12 | 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v4 17 | - name: Set up Python 18 | uses: actions/setup-python@v3 19 | with: 20 | python-version: '3.x' 21 | - name: Install dependencies 22 | run: | 23 | python -m pip install --upgrade pip 24 | pip install build 25 | - name: Build package 26 | run: python -m build 27 | - name: Publish package 28 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 29 | with: 30 | user: __token__ 31 | password: ${{ secrets.PYPI_API_TOKEN }} 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.egg-info 2 | build 3 | .ipynb_checkpoints 4 | __pycache__ 5 | .vscode 6 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the OS, Python version and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.11" 13 | # You can also specify other tool versions: 14 | # nodejs: "19" 15 | # rust: "1.64" 16 | # golang: "1.19" 17 | 18 | # Build documentation in the "docs/" directory with Sphinx 19 | sphinx: 20 | configuration: docs/conf.py 21 | 22 | # Optionally build your docs in additional formats such as PDF and ePub 23 | # formats: 24 | # - pdf 25 | # - epub 26 | 27 | # Optional but recommended, declare the Python requirements required 28 | # to build your documentation 29 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 30 | # python: 31 | # install: 32 | # - requirements: docs/requirements.txt -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2023 v hewes 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice 
shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | The `pynuml` package has been incorporated into the [nugraph](github.com/nugraph/nugraph) repository, and any new development should be carried out there. This repository persists for legacy purposes, but is no longer actively maintained. 2 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | # -- Project information ----------------------------------------------------- 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 8 | 9 | project = 'pynuml' 10 | copyright = '2023, v hewes' 11 | author = 'v hewes' 12 | 13 | # -- General configuration --------------------------------------------------- 14 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 15 | 16 | extensions = [] 17 | 18 | templates_path = ['_templates'] 19 | exclude_patterns = [] 20 | 21 | 22 | 23 | # -- Options for HTML output ------------------------------------------------- 24 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 25 | 26 | html_theme = 'sphinx_rtd_theme' 27 | html_static_path = ['_static'] 28 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | :github_url: https://github.com/nugraph/pynuml 2 | 3 | pynuml Documentation 4 | ==================== 5 | 6 | **pynuml** is a python package providing a data interface for machine learning in neutrino physics. 
It utilises the **NuML** HDF5 event file format to efficiently preprocess physics events into ML objects for training neural networks. It is designed to abstract away many aspects of a typical ML workflow: 7 | 8 | - Efficiently iterate over large HDF5 datasets 9 | - Generate semantic and instance labels for particles 10 | - Preprocess events into ML objects 11 | 12 | .. toctree:: 13 | :maxdepth: 1 14 | :caption: Installation 15 | 16 | install/installation 17 | 18 | .. toctree:: 19 | :maxdepth: 1 20 | :caption: Getting Started -------------------------------------------------------------------------------- /docs/install/installation.rst: -------------------------------------------------------------------------------- 1 | Installation 2 | ============ 3 | 4 | In order to best make use of the **pynuml** package, it is strongly encouraged to install the provided numl Anaconda environment. Parallel processing functionality requires an MPI installation, which will be automatically configured when you install the `numl` conda environment. 5 | 6 | Installing the numl conda environment 7 | ------------------------------------- 8 | 9 | Installing **pynuml** requires an Anaconda installation that utilises `conda-forge`. If you need to install Anaconda, we recommend using the `Mambaforge`_ variant. 10 | 11 | A conda environment for numl is available via the anaconda client, and can be installed using:: 12 | 13 | mamba install -y anaconda-client 14 | mamba env create numl/numl 15 | 16 | Once installed, this environment will need to be activated at the start of each terminal session:: 17 | 18 | mamba activate numl 19 | 20 | .. _Mambaforge: https://github.com/conda-forge/miniforge#mambaforge 21 | 22 | This environment contains the most recent version of **pynuml** published to conda. 23 | 24 | Installing with Anaconda 25 | ------------------------ 26 | 27 | It is also possible to install **pynuml** on its own via Anaconda, using the **numl** channel:: 28 | 29 | mamba install -c numl pynuml 30 | 31 | Installing with pip 32 | ------------------- 33 | 34 | **pynuml** is also available on PyPi, although this installation method is not recommended, as **pynuml** has non-python dependencies that cannot be installed by pip. If the user has installed those dependencies manually, then the package can be installed using:: 35 | 36 | pip install pynuml 37 | 38 | Installing for development 39 | -------------------------- 40 | 41 | If you're installing **pynuml** for development, you can install the numl Anaconda environment as outlined above, and then clone the repository directly and install it in editable mode:: 42 | 43 | git clone https://github.com/nugraph/pynuml 44 | pip install --no-deps -e ./pynuml 45 | 46 | This will uninstall the conda release of pynuml installed by default as part of the numl environment, and override it with your local repository. If installed in editable mode, any changes made to the package will instantaneously be reflected when the module is imported in Python. -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /pynuml/.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.pyc 3 | .ipynb_checkpoints/ 4 | -------------------------------------------------------------------------------- /pynuml/__init__.py: -------------------------------------------------------------------------------- 1 | '''Standardised ML input processing for particle physics''' 2 | 3 | __version__ = '24.6.dev0' 4 | 5 | from . import io 6 | from . import labels 7 | from . import process 8 | from . import plot 9 | -------------------------------------------------------------------------------- /pynuml/io/__init__.py: -------------------------------------------------------------------------------- 1 | from .file import Event, File 2 | from .h5interface import H5Interface 3 | from .out import PTOut, H5Out 4 | -------------------------------------------------------------------------------- /pynuml/io/file.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from abc import ABC 3 | from typing import Any, Callable, Dict, List, Tuple 4 | import psutil 5 | 6 | import h5py 7 | import numpy as np 8 | import pandas as pd 9 | from mpi4py import MPI 10 | 11 | class Event: 12 | def __init__(self, 13 | index: int, 14 | event_id: np.ndarray, 15 | data: Dict[str, pd.DataFrame] = {}): 16 | self.index = index 17 | self.event_id = event_id 18 | self.data = data.copy() 19 | 20 | @property 21 | def name(self): 22 | r, sr, evt = self.event_id 23 | return f'r{r}_sr{sr}_evt{evt}' 24 | 25 | def __setitem__(self, key: str, item: pd.DataFrame): 26 | if type(key) != str: 27 | raise Exception('Key must be a string!') 28 | if type(item) != pd.DataFrame: 29 | raise Exception('Value must be a pandas DataFrame!') 30 | self.data[key] = item 31 | 32 | def __getitem__(self, key: str): 33 | if type(key) != str: 34 | raise Exception('Key must be a string!') 35 | return self.data[key] 36 | 37 | def __str__(self): 38 | ret = f'event {self.event_id}\n' 39 | for group, df in self.data.items(): 40 | ret += f' {group} ({df.shape[0]} rows):\n' 41 | for key in df.keys(): 42 | ret += f' {key}\n' 43 | return ret 44 | 45 | class File: 46 | def __init__(self, fname: str, parKey: str = "/event_table/event_id"): 47 | self._colmap = { 48 | "event_table": { 49 | "nu_dir": [ "nu_dir_x", "nu_dir_y", "nu_dir_z" ], 50 | "nu_vtx": [ "nu_vtx_x", "nu_vtx_y", "nu_vtx_z" ], 51 | "nu_vtx_corr": [ "nu_vtx_corr_x", "nu_vtx_corr_y", "nu_vtx_corr_z" ], 52 | }, 53 | "particle_table": { 54 | "start_position": [ "start_position_x", "start_position_y", "start_position_z" ], 55 | "end_position": [ "end_position_x", "end_position_y", "end_position_z" ], 56 | "start_position_corr": [ "start_position_corr_x", "start_position_corr_y", "start_position_corr_z" ], 57 | "end_position_corr": [ "end_position_corr_x", "end_position_corr_y", 
"end_position_corr_z" ], 58 | }, 59 | "spacepoint_table": { 60 | "hit_id": [ "hit_id_u", "hit_id_v", "hit_id_y" ], 61 | "position": [ "position_x", "position_y", "position_z" ], 62 | }, 63 | "pandoraPrimary_table": { 64 | "vtx": [ "vtx_x", "vtx_y", "vtx_z" ], 65 | }, 66 | } 67 | 68 | # open the input HDF5 file in parallel 69 | self._fd = h5py.File(fname, "r", driver='mpio', comm=MPI.COMM_WORLD) 70 | 71 | # check if data partitioning key datasets exists in the file 72 | if parKey not in self._fd.keys(): 73 | raise Exception(f'Error: dataset {parKey} is not found in file {fname}!') 74 | 75 | # parse the name of data partitioning key 76 | import os.path 77 | self._parTable = os.path.dirname(parKey) 78 | # remove leading '/' 79 | if self._parTable[0] == '/': self._parTable = self._parTable[1:] 80 | 81 | # extract dataset names: partitioning key, seq, and seq_cnt 82 | self._par_name = os.path.basename(parKey) 83 | self._seq_name = self._par_name + ".seq" 84 | self._cnt_name = self._par_name + ".seq_cnt" 85 | 86 | # obtain metadata of dataset parKey, later the dataset will be read 87 | # into self._index as a numpy array in data_partition() 88 | self._index = self._fd.get(parKey) 89 | self._num_events = self._index.shape[0] 90 | 91 | # self._groups is a python list, each member is a 2-element list consisting 92 | # of a group name, and a python list of dataset names 93 | self._groups = [] 94 | 95 | # a python dictionary storing a sequence-count dataset in each group, keys 96 | # are group names, values are the sequence-count dataset subarrays assigned 97 | # to this process 98 | self._seq_cnt = {} 99 | self._evt_seq = {} 100 | 101 | self._whole_seq_cnt = {} 102 | self._whole_seq = {} 103 | 104 | self._use_seq_cnt = True 105 | 106 | # partition based on event amount of particle table (default) 107 | self._evt_part = 2 108 | 109 | # a python nested dictionary storing datasets of each group read from the 110 | # input file. 
keys of self._data are group names, values are python 111 | # dictionaries, each has names of dataset in that group as keys, and values 112 | # storing dataset subarrays 113 | self._data = {} 114 | 115 | # _starts: data partition start indeices of all processes 116 | # _counts: data cmount assigned to each process 117 | starts = None 118 | counts = None 119 | 120 | # starting array index of parKey assigned to this process 121 | self._my_start = -1 122 | 123 | # number of array elements of parKey assigned to this process 124 | self._my_count = -1 125 | 126 | def __del__(self): 127 | if hasattr(self, '_fd') and self._fd: self._fd.close() 128 | 129 | def __len__(self): 130 | # inquire the number of unique event IDs in the input file 131 | return self._num_events 132 | 133 | def __str__(self): 134 | ret = "" 135 | for k1 in self._fd.keys(): 136 | ret += f"{k1}:\n" 137 | for k2 in self._fd[k1].keys(): 138 | if self._seq_name in k2: continue 139 | ret += f" {k2}\n" 140 | return ret 141 | 142 | def __getitem__(self, idx: int): 143 | """load a single event from file""" 144 | self.read_data(idx, 1) 145 | ret = self.build_evt(idx, 1) 146 | return ret[0] if len(ret) else None 147 | 148 | def check_shape0(self, 149 | group: str, 150 | keys: List[str] = []) -> None: 151 | # Check if shape[0] of all datasets in keys are of the same size 152 | shape0 = self._fd[group][keys[0]].shape[0] 153 | for k in keys[1:]: 154 | if k == self._cnt_name: continue # exception is seq_cnt dataset 155 | if shape0 != self._fd[group][k].shape[0]: 156 | raise Exception(f'Dataset "/{group}/{k}" shape[0]={self._fd[group][k].shape[0]} inconsistent with {keys[0]}.shape[0]={shape0}') 157 | 158 | def add_group(self, 159 | group: str, 160 | keys: List[str] = []) -> None: 161 | 162 | # if no keys specified, append all columns in HDF5 group 163 | if not keys: 164 | # retrieve all the dataset names of the group 165 | keys = list(self._fd[group].keys()) 166 | # datasets seq and seq_cnt are not needed 167 | if group != self._parTable and self._par_name in keys: keys.remove(self._par_name) 168 | if self._seq_name in keys: keys.remove(self._seq_name) 169 | if self._cnt_name in keys: keys.remove(self._cnt_name) 170 | else: 171 | # Check if datasets in keys are available in the file 172 | for k in keys: 173 | if k not in self._fd[group].keys(): 174 | raise Exception(f'Dataset "/{group}/{k}" does not exist') 175 | 176 | # if group does not already exist, just add it 177 | if not self._groups or group not in self._groups[:][0]: 178 | self.check_shape0(group, keys) 179 | self._groups.append([ group, keys ]) 180 | return 181 | 182 | # if group is already present, need to figure out whether any extra keys need to be added 183 | for g, k in self._groups: 184 | if g == group: 185 | self.check_shape0(group, keys) 186 | for key in keys: 187 | if key not in k: 188 | k.append(key) 189 | return 190 | raise Exception(f'group "{group}" not found.') 191 | 192 | def keys(self): 193 | return self._fd.keys() 194 | 195 | def _cols(self, 196 | group: str, 197 | key: str) -> List[str]: 198 | if key == self._par_name: return [ "run", "subrun", "event" ] 199 | if group in self._colmap and key in self._colmap[group].keys(): return self._colmap[group][key] 200 | elif self._fd[group][key].shape[1]==1: return [key] 201 | else: return [ key+"_"+str(c) for c in range(0,self._fd[group][key].shape[1])] 202 | 203 | def get_dataframe(self, 204 | group: str, 205 | keys: List[str] = []) -> pd.DataFrame: 206 | if not keys: 207 | keys = list(self._fd[group].keys()) 208 | if 
self._seq_name in keys: keys.remove(self._seq_name) 209 | if self._cnt_name in keys: keys.remove(self._cnt_name) 210 | dfs = [ pd.DataFrame(np.array(self._fd[group][key]), columns=self._cols(group, key)) for key in keys ] 211 | return pd.concat(dfs, axis="columns").set_index(["run","subrun","event"]) 212 | 213 | def get_dataframe_evt(self, 214 | group: str, 215 | keys: List[str] = []) -> pd.DataFrame: 216 | if not keys: 217 | keys = list(self._data[group].keys()) 218 | if self._seq_name in keys: keys.remove(self._seq_name) 219 | if self._cnt_name in keys: keys.remove(self._cnt_name) 220 | dfs = [ pd.DataFrame(np.array(self._data[group][key]), columns=self._cols(group, key)) for key in keys ] 221 | df = pd.concat(dfs, axis="columns") 222 | evt_idx_col = [] 223 | for seq in self._seq_cnt[group]: 224 | evt_idx_col += seq[1]*[seq[0]] 225 | df['evt_idx'] = evt_idx_col 226 | return df 227 | 228 | def index(self, idx: int): 229 | """get the index for a given row""" 230 | return self._my_index[idx - self._my_start] 231 | 232 | def read_seq(self) -> None: 233 | for group, datasets in self._groups: 234 | try: 235 | # read an HDF5 dataset into a numpy array 236 | self._whole_seq[group] = np.array(self._fd[group+"/"+self._seq_name]) 237 | except KeyError: 238 | print(f"Error: dataset {group}/{self._seq_name} does not exist") 239 | sys.stdout.flush() 240 | sys.exit(1) 241 | 242 | def read_seq_cnt(self) -> None: 243 | # Dataset event_id.seq_cnt stores the event IDs sorted in an increasing 244 | # order. There is no duplicated values and gaps may exist between any 245 | # two consecutive elements. Note dataset event_id.seq_cnt in group 246 | # self._parTable contains all event IDs with no gap. 247 | for group, datasets in self._groups: 248 | try: 249 | # read an HDF5 dataset into a numpy array 250 | self._whole_seq_cnt[group] = np.array(self._fd[group+"/"+self._cnt_name]) 251 | except KeyError: 252 | print(f"Error: dataset {group}/{self._cnt_name} does not exist") 253 | sys.stdout.flush() 254 | sys.exit(1) 255 | 256 | def data_partition(self) -> None: 257 | # Calculate the start indices and counts of evt.seq assigned to each process 258 | # self._starts: a numpy array of size nprocs 259 | # self._counts: a numpy array of size nprocs 260 | # Note self._starts and self._counts are matter only in root process. 261 | # self._my_start: (== self._starts[rank]) this process's start 262 | # self._my_count: (== self._counts[rank]) this process's count 263 | # self._my_index: partitioned dataset i.e. assigned to this process 264 | 265 | comm = MPI.COMM_WORLD 266 | rank = comm.Get_rank() 267 | nprocs = comm.Get_size() 268 | self._starts = np.zeros(nprocs, dtype=int) 269 | self._counts = np.zeros(nprocs, dtype=int) 270 | 271 | if rank == 0: 272 | if self._use_seq_cnt: 273 | self.read_seq_cnt() 274 | else: 275 | self.read_seq() 276 | 277 | num_events = self._num_events 278 | 279 | if self._evt_part == 0: 280 | # Below implements event ID based partitioning, which 281 | # calculates the start and count of evt.seq id for each process 282 | _count = num_events // nprocs 283 | for j in range(num_events % nprocs): 284 | self._starts[j] = _count * j + j 285 | self._counts[j] = _count + 1 286 | 287 | for j in range(num_events % nprocs, nprocs): 288 | self._starts[j] = _count * j + num_events % nprocs 289 | self._counts[j] = _count 290 | 291 | elif self._evt_part == 1: 292 | # event amount based partitioning, which calculates event sizes 293 | # across all groups. 
Note it is possible multiple consecutive rows 294 | # a dataset have the same event ID. It is also possible some event 295 | # IDs contain no data. First, we accumulate numbers of events 296 | # across all groups 297 | evt_size = np.zeros(num_events, dtype=int) 298 | if self._use_seq_cnt: 299 | for group, datasets in self._groups: 300 | seq_cnt = self._whole_seq_cnt[group] 301 | num_datasets = len(datasets) 302 | for i in range(seq_cnt.shape[0]): 303 | evt_size[seq_cnt[i, 0]] += seq_cnt[i, 1] * num_datasets 304 | else: 305 | for group, datasets in self._groups: 306 | seq = self._whole_seq[group] 307 | for i in range(seq.shape[0]): 308 | evt_size[seq[i, 0]] += 1 309 | 310 | # now we have collected the number of events per event ID across all groups 311 | total_evt_num = np.sum(evt_size) 312 | avg_evt_num = total_evt_num // nprocs 313 | avg_evt = total_evt_num // num_events / 2 314 | 315 | # assign ranges of event IDs to individual processes 316 | acc_evt_num = 0 317 | rank_id = 0 318 | for j in range(num_events): 319 | if rank_id == nprocs - 1: break 320 | if acc_evt_num + evt_size[j] >= avg_evt_num: 321 | remain_l = avg_evt_num - acc_evt_num 322 | remain_r = evt_size[j] - remain_l 323 | if remain_l > remain_r and remain_l > avg_evt: 324 | # assign event j to rank_id 325 | self._counts[rank_id] += 1 326 | acc_evt_num = 0 327 | else: 328 | # assign event j to rank_id+1 329 | self._counts[rank_id+1] = 1 330 | acc_evt_num = evt_size[j] 331 | # done with rank_id i 332 | rank_id += 1 333 | self._starts[rank_id] = self._starts[rank_id-1] + self._counts[rank_id-1] 334 | else: 335 | self._counts[rank_id] += 1 336 | acc_evt_num += evt_size[j] 337 | self._counts[nprocs-1] += num_events - j 338 | 339 | elif self._evt_part == 2: 340 | # use event amounts in the particle_table only to partition events 341 | seq_cnt = self._whole_seq_cnt['particle_table'] 342 | total_evt_num = np.sum(seq_cnt[:,1]) 343 | avg_evt_num = total_evt_num // nprocs 344 | avg_evt = total_evt_num // seq_cnt.shape[0] / 2 345 | 346 | self._starts[0] = seq_cnt[0,0] 347 | acc_evt_num = 0 348 | rank_id = 0 349 | for j in range(seq_cnt.shape[0]): 350 | if rank_id == nprocs - 1: break 351 | if acc_evt_num + seq_cnt[j,1] >= avg_evt_num: 352 | remain_l = avg_evt_num - acc_evt_num 353 | remain_r = seq_cnt[j,1] - remain_l 354 | # if remain_r > remain_l: 355 | if remain_l > remain_r and remain_l > avg_evt: 356 | # assign event j to rank_id 357 | self._counts[rank_id] = seq_cnt[j+1, 0] - self._starts[rank_id] 358 | self._starts[rank_id+1] = seq_cnt[j+1, 0] 359 | acc_evt_num = 0 360 | else: 361 | # assign event j to rank_id+1 362 | self._counts[rank_id] = seq_cnt[j, 0] - self._starts[rank_id] 363 | self._starts[rank_id+1] = seq_cnt[j, 0] 364 | acc_evt_num = seq_cnt[j, 1] 365 | # done with rank_id 366 | rank_id += 1 367 | else: 368 | acc_evt_num += seq_cnt[j, 1] 369 | 370 | self._counts[nprocs-1] = num_events - self._starts[nprocs-1] 371 | 372 | # All processes participate the collective communication, scatter. 373 | # Root distributes start and count to all processes. Note only root process 374 | # uses self._starts and self._counts. 
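        # Illustrative sketch (not taken from any input file): with
        # num_events = 10 and nprocs = 4 under the event-ID based branch
        # (self._evt_part == 0) above, the result is
        #     self._starts = [0, 3, 6, 8]   and   self._counts = [3, 3, 2, 2],
        # and the Scatter below hands each rank its own (start, count) pair.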
375 | start_count = np.empty([nprocs, 2], dtype=int) 376 | start_count[:, 0] = self._starts[:] 377 | start_count[:, 1] = self._counts[:] 378 | recvbuf = np.empty(2, dtype=int) 379 | comm.Scatter(start_count, recvbuf, root=0) 380 | self._my_start = recvbuf[0] 381 | self._my_count = recvbuf[1] 382 | 383 | # This process is assigned event IDs of range from self._my_start to 384 | # (self._my_start + self._my_count - 1) 385 | 386 | # each process reads its share of dataset and stores it in a numpy 387 | # array 388 | self._my_index = np.array(self._index[self._my_start : self._my_start + self._my_count, :]) 389 | 390 | def binary_search_min(self, key, base, nmemb): 391 | low = 0 392 | high = nmemb 393 | while low != high: 394 | mid = (low + high) // 2 395 | if base[mid] < key: 396 | low = mid + 1 397 | else: 398 | high = mid 399 | return low 400 | 401 | def binary_search_max(self, key, base, nmemb): 402 | low = 0 403 | high = nmemb 404 | while low != high: 405 | mid = (low + high) // 2 406 | if base[mid] <= key: 407 | low = mid + 1 408 | else: 409 | high = mid 410 | return (low - 1) 411 | 412 | def calc_bound_seq(self, group): 413 | # return the lower and upper array indices of subarray assigned to this 414 | # process, using the partition sequence dataset 415 | 416 | comm = MPI.COMM_WORLD 417 | rank = comm.Get_rank() 418 | nprocs = comm.Get_size() 419 | 420 | displ = np.zeros([nprocs], dtype=int) 421 | count = np.zeros([nprocs], dtype=int) 422 | bounds = np.zeros([nprocs, 2], dtype=int) 423 | 424 | all_evt_seq = None 425 | if rank == 0: 426 | # root reads the entire dataset self._seq_name, if not already 427 | if not self._whole_seq: self.read_seq() 428 | 429 | all_evt_seq = self._whole_seq[group] 430 | dim = len(all_evt_seq) 431 | 432 | # calculate displ, count to be used in scatterV for all processes 433 | for i in range(nprocs): 434 | if self._counts[i] == 0: continue 435 | end = self._starts[i] + self._counts[i] - 1 436 | bounds[i, 0] = self.binary_search_min(self._starts[i], all_evt_seq, dim) 437 | bounds[i, 1] = self.binary_search_max(end, all_evt_seq, dim) 438 | displ[i] = bounds[i, 0] 439 | count[i] = bounds[i, 1] - bounds[i, 0] + 1 440 | 441 | lower_upper = np.empty([2], dtype=int) 442 | 443 | # root distributes start and end indices to all processes 444 | comm.Scatter(bounds, lower_upper, root=0) 445 | 446 | # this process is assigned array indices from lower to upper 447 | lower = 0 448 | upper = 0 449 | if self._my_count > 0: 450 | lower = lower_upper[0] 451 | upper = lower_upper[1] + 1 452 | 453 | # root scatters the subarray of evt_seq to all processes 454 | self._evt_seq[group] = np.zeros(upper - lower, dtype=np.int64) 455 | comm.Scatterv([all_evt_seq, count, displ, MPI.LONG_LONG], self._evt_seq[group], root=0) 456 | 457 | return lower, upper 458 | 459 | def calc_bound_seq_cnt(self, group): 460 | # return the lower and upper array indices of subarray assigned to this 461 | # process, using the partition sequence-count dataset 462 | 463 | comm = MPI.COMM_WORLD 464 | rank = comm.Get_rank() 465 | nprocs = comm.Get_size() 466 | 467 | displ = np.zeros([nprocs], dtype=int) 468 | count = np.zeros([nprocs], dtype=int) 469 | seq_cnt = np.zeros([nprocs, 2], dtype=int) 470 | 471 | all_seq_cnt = None 472 | if rank == 0: 473 | # root reads the entire dataset self._cnt_name, if not already 474 | if not self._whole_seq_cnt: self.read_seq_cnt() 475 | 476 | all_seq_cnt = self._whole_seq_cnt[group] 477 | dim = len(all_seq_cnt) 478 | 479 | # calculate displ, count for all processes to be used in 
scatterV 480 | recv_rank = 0 # receiver rank 481 | displ[recv_rank] = 0 482 | seq_cnt[recv_rank, 0] = 0 483 | seq_end = self._starts[recv_rank] + self._counts[recv_rank] 484 | seq_id = 0 485 | for i in range(dim): 486 | if all_seq_cnt[i, 0] >= seq_end : 487 | seq_cnt[recv_rank, 1] = i - displ[recv_rank] 488 | recv_rank += 1 # move on to the next receiver rank 489 | seq_end = self._starts[recv_rank] + self._counts[recv_rank] 490 | displ[recv_rank] = i 491 | seq_cnt[recv_rank, 0] = seq_id 492 | seq_id += all_seq_cnt[i, 1] 493 | 494 | # last receiver rank 495 | seq_cnt[recv_rank, 1] = dim - displ[recv_rank] 496 | 497 | displ[:] *= 2 498 | count[:] = seq_cnt[:, 1] * 2 499 | 500 | # root distributes seq_cnt to all processes 501 | my_seq_cnt = np.empty([2], dtype=int) 502 | comm.Scatter(seq_cnt, my_seq_cnt, root=0) 503 | 504 | # self._seq_cnt[group][:, 0] is the event ID 505 | # self._seq_cnt[group][:, 1] is the number of elements 506 | self._seq_cnt[group] = np.empty([my_seq_cnt[1], 2], dtype=np.int64) 507 | 508 | # root scatters the subarray of evt_seq to all processes 509 | comm.Scatterv([all_seq_cnt, count, displ, MPI.LONG_LONG], self._seq_cnt[group], root=0) 510 | 511 | lower = 0 512 | upper = 0 513 | if self._my_count > 0: 514 | lower = my_seq_cnt[0] 515 | upper = my_seq_cnt[0] + np.sum(self._seq_cnt[group][:, 1]) 516 | 517 | # this process is assigned array indices from lower to upper 518 | 519 | return lower, upper 520 | 521 | def read_data(self, 522 | start: int, 523 | count: int) -> None: 524 | # (sequentially) read subarrays of all datasets in all groups that fall 525 | # in the range of self._seq_name, starting from 'start' and amount of 'count' 526 | 527 | for group, datasets in self._groups: 528 | if self._use_seq_cnt: 529 | # use evt_id.seq_cnt to calculate subarray boundaries 530 | # reads the entire dataset self._cnt_name, if not already 531 | if not self._whole_seq_cnt or group not in self._whole_seq_cnt.keys(): 532 | self.read_seq_cnt() 533 | all_seq_cnt = self._whole_seq_cnt[group] 534 | # search indices of start and end in all_seq_cnt 535 | # all_seq_cnt[:,0] are all unique 536 | ilower = np.searchsorted(all_seq_cnt[:,0], start) 537 | iupper = np.searchsorted(all_seq_cnt[:,0], start+count) 538 | self._seq_cnt[group] = np.array(all_seq_cnt[ilower:iupper], dtype=np.int64) 539 | lower = np.sum(all_seq_cnt[0:ilower, 1]) 540 | upper = lower + np.sum(all_seq_cnt[ilower:iupper, 1]) 541 | else: 542 | # use evt_id.seq to calculate subarray boundaries 543 | # root reads the entire dataset self._seq_name, if not already 544 | if not self._whole_seq: self.read_seq() 545 | all_evt_seq = self._whole_seq[group] 546 | dim = len(all_evt_seq) 547 | # search indices of start and end in all_seq 548 | # all_seq[:] are not unique 549 | end = start + count - 1 550 | lower = self.binary_search_min(start, all_evt_seq, dim) 551 | upper = self.binary_search_max(end, all_evt_seq, dim) 552 | upper += 1 553 | self._evt_seq[group] = np.array(all_evt_seq[lower:upper], dtype=np.int64) 554 | 555 | # Iterate through all the datasets and read the subarray from index lower 556 | # to upper and store it into a dictionary with the names of group and 557 | # dataset as the key. 
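            # For example (an illustrative sketch using a group name from
            # self._colmap): after this loop, self._data["spacepoint_table"]["position"]
            # would hold rows [lower:upper] of the HDF5 dataset
            # /spacepoint_table/position as a numpy array.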
558 | self._data[group] = {} 559 | for dset in datasets: 560 | # read subarray into a numpy array 561 | self._data[group][dset] = np.array(self._fd[group][dset][lower : upper]) 562 | 563 | self._my_start = start 564 | self._my_count = count 565 | # read assigned partitioning key dataset into a numpy array 566 | self._my_index = np.array(self._index[start : start + count, :]) 567 | 568 | def read_data_all(self, 569 | use_seq_cnt: bool = True, 570 | evt_part: int = 2, 571 | profile: bool = False) -> None: 572 | # use_seq_cnt: True - use event.seq_cnt dataset to calculate partitioning 573 | # starts and counts 574 | # False - use event.seq dataset to calculate starts and counts 575 | # evt_part: 0 - partition based on event IDs 576 | # 1 - partition based on event amount 577 | # 2 - partition based on event amount of particle table (default) 578 | # Parallel read dataset subarrays assigned to this process ranging from 579 | # array index of self._my_start to (self._my_start + self._my_count - 1) 580 | if profile: 581 | par_time = 0 582 | bnd_time = 0 583 | rds_time = 0 584 | time_s = MPI.Wtime() 585 | 586 | self._use_seq_cnt = use_seq_cnt 587 | self._evt_part = evt_part 588 | 589 | # calculate the data partitioning start indices and amounts assigned to 590 | # each process. Set self._starts, self._counts, self._my_start, 591 | # self._my_count, and self._my_index 592 | self.data_partition() 593 | 594 | if profile: 595 | time_e = MPI.Wtime() 596 | par_time = time_e - time_s 597 | time_s = time_e 598 | 599 | for group, datasets in self._groups: 600 | if self._use_seq_cnt: 601 | # use evt_id.seq_cnt to calculate subarray boundaries 602 | lower, upper = self.calc_bound_seq_cnt(group) 603 | else: 604 | # use evt_id.seq to calculate subarray boundaries 605 | lower, upper = self.calc_bound_seq(group) 606 | 607 | if profile: 608 | time_e = MPI.Wtime() 609 | bnd_time += time_e - time_s 610 | time_s = time_e 611 | 612 | # Iterate through all the datasets and read the subarray from index lower 613 | # to upper and store it into a dictionary with the names of group and 614 | # dataset as the key. 
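            # Note: each rank slices only its own [lower:upper] range, so the
            # read below touches just this rank's share of every dataset in the
            # group, and memory use scales with the partition size rather than
            # with the whole file.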
615 | self._data[group] = {} 616 | for dset in datasets: 617 | # read subarray into a numpy array 618 | self._data[group][dset] = np.array(self._fd[group][dset][lower : upper]) 619 | 620 | if profile: 621 | time_e = MPI.Wtime() 622 | rds_time += time_e - time_s 623 | time_s = time_e 624 | 625 | if profile: 626 | rank = MPI.COMM_WORLD.Get_rank() 627 | nprocs = MPI.COMM_WORLD.Get_size() 628 | 629 | total_t = np.array([par_time, bnd_time, rds_time]) 630 | max_total_t = np.zeros(3) 631 | MPI.COMM_WORLD.Reduce(total_t, max_total_t, op=MPI.MAX, root = 0) 632 | min_total_t = np.zeros(3) 633 | MPI.COMM_WORLD.Reduce(total_t, min_total_t, op=MPI.MIN, root = 0) 634 | if rank == 0: 635 | print("---- Timing break down of the file read phase (in seconds) -------") 636 | if self._use_seq_cnt: 637 | print(f'Use "{self._cnt_name}" to calculate subarray boundaries') 638 | else: 639 | print(f'Use "{self._seq_name}" to calculate subarray boundaries') 640 | 641 | print("data partitioning time ", end='') 642 | print("MAX=%8.2f MIN=%8.2f" % (max_total_t[0], min_total_t[0])) 643 | print("calc boundaries time ", end='') 644 | print("MAX=%8.2f MIN=%8.2f" % (max_total_t[1], min_total_t[1])) 645 | print("read datasets time ", end='') 646 | print("MAX=%8.2f MIN=%8.2f" % (max_total_t[2], min_total_t[2])) 647 | print("(MAX and MIN timings are among %d processes)" % nprocs) 648 | 649 | def build_evt(self, 650 | start: int = None, 651 | count: int = None) -> List[Dict]: 652 | # This process is responsible for event IDs from start to (start+count-1). 653 | # All data of the same event ID will be used to create a graph. 654 | # This function collects all data based on self._seq_name, or 655 | # self._cnt_name into a python list containing Pandas DataFrames, one 656 | # for a unique event ID. 
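        # A minimal usage sketch (illustrative; the file name and dataset names
        # are assumptions based on self._colmap, not a documented schema):
        #
        #   f = pynuml.io.File("events.h5")
        #   f.add_group("particle_table", ["g4_id", "parent_id", "type"])
        #   f.add_group("spacepoint_table")
        #   f.read_data_all()            # parallel read of this rank's partition
        #   for evt in f.build_evt():    # one Event object per event ID
        #       df = evt["particle_table"]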
657 | if not self._groups: 658 | raise Exception('cannot build event without adding any HDF5 groups') 659 | 660 | ret_list = [] 661 | 662 | if start is None: start = self._my_start 663 | if count is None: count = self._my_count 664 | 665 | if self._use_seq_cnt: 666 | # track the latest used index per group 667 | idx_grp = dict.fromkeys(self._data.keys(), 0) 668 | 669 | # accumulate starting array index per group 670 | idx_start = dict.fromkeys(self._data.keys(), 0) 671 | 672 | # whether idx is presented in a group's _seq_cnt[:,0] 673 | idx_found = dict.fromkeys(self._data.keys(), False) 674 | 675 | # Iterate through assigned event IDs 676 | for idx in range(int(start), int(start+count)): 677 | # check if idx is missing in all groups 678 | is_missing = True 679 | if self._use_seq_cnt: 680 | for group in self._data.keys(): 681 | idx_found[group] = False 682 | dim = self._seq_cnt[group].shape[0] 683 | 684 | # check against the max of this group's 685 | if idx > self._seq_cnt[group][dim-1, 0]: 686 | continue 687 | 688 | # check and search for idx in _seq_cnt[group][:,0] 689 | if idx == idx_grp[group]: 690 | # this is most likely the case when building all graphs 691 | # for all events at once 692 | idx_found[group] = True 693 | idx_grp[group] = idx 694 | elif idx - idx_grp[group] <= 8: 695 | # linear search for idx in _seq_cnt[group][:,0] 696 | # if distance is less than 8, linear search is faster 697 | for jj in range(idx_grp[group], dim): 698 | if idx == self._seq_cnt[group][jj, 0]: 699 | idx_found[group] = True 700 | idx_grp[group] = jj 701 | break 702 | elif idx < self._seq_cnt[group][jj, 0]: 703 | break 704 | else: 705 | # binary search for idx in _seq_cnt[group][:,0] 706 | # Note there is no duplicated values in 707 | # _seq_cnt[group][:,0] and the values are sorted in an 708 | # increasing order 709 | low = idx_grp[group] 710 | high = dim 711 | while low < high: 712 | mid = (low + high) // 2 713 | if self._seq_cnt[group][mid, 0] < idx: 714 | low = mid + 1 715 | elif self._seq_cnt[group][mid, 0] > idx: 716 | high = mid 717 | else: 718 | idx_found[group] = True 719 | idx_grp[group] = mid 720 | break 721 | 722 | if idx_found[group]: 723 | if idx == start: 724 | # Calculate starting array index only necessary for 725 | # first idx. For 2nd and later, idx_start is 726 | # accumulated later 727 | idx_start[group] = self._seq_cnt[group][0:idx_grp[group], 1].sum() 728 | # skip self._parTable group, as it is not used to 729 | # determine whether idx is missing. 
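                        # (the partitioning-key table has exactly one row per
                        # event ID, so checking it would mark every idx as present)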
730 | if group != self._parTable: 731 | is_missing = False 732 | else: 733 | for group in self._data.keys(): 734 | dim = len(self._evt_seq[group]) 735 | # dataset event_id.seq may contain duplicated event IDs 736 | # IDs in this dataset are sorted in a monotonically non-decreasing order 737 | lower = self.binary_search_min(idx, self._evt_seq[group], dim) 738 | upper = self.binary_search_max(idx, self._evt_seq[group], dim) + 1 739 | if lower < upper: 740 | is_missing = False 741 | break 742 | 743 | # this idx is missing in all groups 744 | if is_missing: 745 | continue 746 | 747 | # for each event seq ID, create a dictionary, ret 748 | # first item: key is "index" and value is the event seq ID 749 | # remaining items: key is group name and value is a Pandas DataFrame 750 | # containing the dataset subarray in this group with the event ID, idx 751 | ret = Event(idx, self.index(idx)) 752 | 753 | # Iterate through all groups 754 | for group in self._data.keys(): 755 | 756 | if self._use_seq_cnt: 757 | # Note self._seq_cnt[group][:, 0] is the event ID 758 | # Note self._seq_cnt[group][:, 1] is the number of elements 759 | 760 | if not idx_found[group]: 761 | # For idx is missing from this group but not in other 762 | # groups, create an empty Pandas DataFrame 763 | dfs = [] 764 | for dataset in self._data[group].keys(): 765 | data_dataframe = pd.DataFrame(columns=self._cols(group, dataset)) 766 | dfs.append(data_dataframe) 767 | ret[group] = pd.concat(dfs, axis="columns") 768 | continue 769 | 770 | if group == self._parTable: 771 | # Special treatment for group self._parTable, as its 772 | # seq_cnt[:,1] contains all 1s and earlier increment of 773 | # idx_grp[group] may be skipped due to missing idx 774 | lower = idx_grp[group] 775 | upper = lower + 1 776 | else: 777 | lower = idx_start[group] 778 | upper = self._seq_cnt[group][idx_grp[group], 1] + lower 779 | 780 | # The range from lower to upper (exclusively) is subarray 781 | # indices of elements belonging to the same event ID, idx 782 | 783 | if count > 1: 784 | # increment start array indices to avoid searching the 785 | # already-done data 786 | idx_start[group] += self._seq_cnt[group][idx_grp[group], 1] 787 | idx_grp[group] += 1 788 | 789 | else: 790 | # Note self._evt_seq stores event ID values and is already sorted in 791 | # an increasing order 792 | dim = len(self._evt_seq[group]) 793 | 794 | # Find the local start and end row indices for this event ID, idx 795 | lower = self.binary_search_min(idx, self._evt_seq[group], dim) 796 | upper = self.binary_search_max(idx, self._evt_seq[group], dim) + 1 797 | 798 | # dfs is a python list containing Pandas DataFrame objects 799 | dfs = [] 800 | for dataset in self._data[group].keys(): 801 | if lower >= upper: 802 | # idx is missing from the dataset self._seq_name, 803 | # In this case, create an empty numpy array 804 | data = np.array([]) 805 | else: 806 | # array elements from lower to upper of this dataset have the 807 | # event ID == idx 808 | data = self._data[group][dataset][lower : upper] 809 | 810 | # create a Pandas DataFrame to store the numpy array 811 | df = pd.DataFrame(data, columns=self._cols(group, dataset)) 812 | for col in df.columns: 813 | if df[col].dtype == '|S64' or df[col].dtype == 'object': 814 | df[col] = df[col].str.decode('utf-8') 815 | dfs.append(df) 816 | 817 | # concatenate into the dictionary "ret" with group names as keys 818 | ret[group] = pd.concat(dfs, axis="columns") 819 | 820 | # Add all dictionaries "ret" into a list. 
821 | # Each of them corresponds to the data of one single event ID 822 | ret_list.append(ret) 823 | 824 | return ret_list 825 | 826 | def process(self, 827 | processor: Callable[[Event], Tuple[str, Any]], 828 | out: Callable[[Any, str], None]) -> None: 829 | '''Process all events in this data partition''' 830 | xproc = psutil.Process() 831 | comm = MPI.COMM_WORLD 832 | nprocs = comm.Get_size() 833 | rank = comm.Get_rank() 834 | if rank == 0: 835 | out.write_metadata(processor.metadata) 836 | self.read_data_all() 837 | 838 | verbose = False 839 | 840 | # whether or not to build graphs one event at a time 841 | build_one_evt_at_a_time = True 842 | 843 | if build_one_evt_at_a_time == False: 844 | evt_list = self.build_evt() 845 | for evt in evt_list: 846 | name, data = processor(evt) 847 | if data is not None: out(name, data) 848 | if verbose: 849 | print("Build all events: MPI rank %3d Memory footprint = %8.1f MiB" % 850 | (rank, xproc.memory_info().rss/ 1024.0 ** 2)) 851 | else: 852 | # Iterate through assigned event IDs 853 | for idx in range(int(self._my_start), int(self._my_start+self._my_count)): 854 | evt = self.build_evt(idx, 1) 855 | if len(evt) > 0: 856 | name, data = processor(evt[0]) 857 | if data is not None: out(name, data) 858 | if verbose: 859 | print("Build 1 event at a time: MPI rank %-3d Memory footprint = %8.1f MiB" % 860 | (rank, xproc.memory_info().rss/ 1024.0 ** 2)) 861 | 862 | -------------------------------------------------------------------------------- /pynuml/io/h5interface.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import h5py 4 | import numpy as np 5 | import torch 6 | from torch_geometric.data import Data, HeteroData 7 | 8 | 9 | class H5Interface: 10 | def __init__(self, file: h5py.File): 11 | self.f = file 12 | 13 | def save_data(self, data: Data) -> None: 14 | code 15 | 16 | def _add_dataset(self, key: str, val: Any) -> None: 17 | if np.isscalar(val): 18 | self._data = self._data + (val,) 19 | field = (key, type(val)) 20 | else: 21 | if val.nelement() == 0: # save tensor with zero-sized dimension as a scalar 0 22 | # HDF5 compound data type does not allow zero-size dimension 23 | # ValueError: Zero-sized dimension specified (zero-sized dimension specified) 24 | self._data = self._data + (0,) 25 | field = (key, val.numpy().dtype) 26 | else: 27 | val = val.numpy() # convert a tensor to numpy 28 | self._data = self._data + (val,) 29 | field = (key, val.dtype, val.shape) 30 | self._fields.append(field) 31 | 32 | def save_heterodata(self, data: HeteroData) -> None: 33 | 34 | self._data = () 35 | self._fields = [] 36 | 37 | nodes, edges = data.metadata() 38 | 39 | # save node stores 40 | for node in nodes: 41 | if "_" in node: 42 | raise Exception(f'"{node}" is not a valid node store name! Underscores are not supported.') 43 | for key in data[node].keys(): 44 | self._add_dataset(f'{node}/{key}', data[node][key]) 45 | 46 | # save edge stores 47 | for edge in edges: 48 | for tmp in edge: 49 | if "_" in tmp: 50 | raise Exception(f'"{tmp}" is not a valid edge store name component! 
Underscores are not supported.') 51 | name = "_".join(edge) 52 | for key in data[edge].keys(): 53 | self._add_dataset(f'{name}/{key}', data[edge][key]) 54 | 55 | def save(self, name: str, data: Any) -> None: 56 | if isinstance(data, Data): 57 | self.save_data(data) 58 | elif isinstance(data, HeteroData): 59 | self.save_heterodata(data) 60 | else: 61 | raise NotImplementedError(f'No save method implemented for {type(data)}!') 62 | 63 | # create a scalar dataset of compound data type 64 | ctype = np.dtype(self._fields) 65 | ds = self.f.create_dataset(f'/dataset/{name}', shape=(), dtype=ctype, data=self._data) 66 | del ctype, self._fields, self._data, ds 67 | 68 | def load_heterodata(self, name: str) -> HeteroData: 69 | data = HeteroData() 70 | # Read the whole dataset idx, dataset name is self.groups[idx] 71 | group = self.f[f'dataset/{name}'][()] 72 | for dataset in group.dtype.names: 73 | store, attr = dataset.split('/') 74 | if "_" in store: store = tuple(store.split("_")) 75 | if group[dataset].ndim == 0: 76 | if attr == 'edge_index': # empty edge tensor 77 | data[store][attr] = torch.LongTensor([[],[]]) 78 | else: # scalar 79 | data[store][attr] = torch.as_tensor(group[dataset][()]) 80 | else: # multi-dimension array 81 | data[store][attr] = torch.as_tensor(group[dataset][:]) 82 | return data 83 | 84 | def keys(self) -> list[str]: 85 | return list(self.f['dataset'].keys()) -------------------------------------------------------------------------------- /pynuml/io/out.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | from typing import Any 4 | 5 | import h5py 6 | from mpi4py import MPI 7 | 8 | 9 | class PTOut: 10 | def __init__(self, outdir: str): 11 | self.outdir = outdir 12 | isExist = os.path.exists(outdir) 13 | if not isExist: 14 | rank = MPI.COMM_WORLD.Get_rank() 15 | if rank == 0: 16 | print("Error: output directory does not exist", outdir) 17 | sys.stdout.flush() 18 | MPI.COMM_WORLD.Abort(1) 19 | 20 | def __call__(self, name: str, obj: Any) -> None: 21 | import torch 22 | torch.save(obj, os.path.join(self.outdir, name)+".pt") 23 | 24 | def write_metadata(metadata: dict[str, Any]) -> None: 25 | raise NotImplementedError 26 | 27 | def exists(self, name: str) -> bool: 28 | return os.path.exists(os.path.join(self.outdir, name)+".pt") 29 | 30 | class H5Out: 31 | def __init__(self, fname: str, overwrite: bool = False): 32 | # This implements one-file-per-process I/O strategy. 
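        # e.g. (illustrative) H5Out("out/graphs") on 4 MPI ranks produces
        # out/graphs.0000.h5 ... out/graphs.0003.h5, one file per rank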
33 | # append MPI process rank to the output file name 34 | rank = MPI.COMM_WORLD.Get_rank() 35 | file_ext = ".{:04d}.h5" 36 | self.fname = fname + file_ext.format(rank) 37 | if os.path.exists(self.fname): 38 | if overwrite: 39 | os.remove(self.fname) 40 | else: 41 | print(f"Error: file already exists: {self.fname}") 42 | sys.stdout.flush() 43 | MPI.COMM_WORLD.Abort(1) 44 | # open/create the HDF5 file 45 | self.f = h5py.File(self.fname, "w") 46 | 47 | from .h5interface import H5Interface 48 | self.interface = H5Interface(self.f) 49 | # print(f"{rank}: creating {self.fname}") 50 | # sys.stdout.flush() 51 | 52 | def __call__(self, name: str, obj: Any) -> None: 53 | """ 54 | for key, val in obj: 55 | # set chunk sizes to val shape, so there is only one chunk per dataset 56 | # if isinstance(val, torch.Tensor) and val.nelement() == 0 : 57 | # print("zero val ",name,"/",key," shape=",val.shape) 58 | if isinstance(val, torch.Tensor) and val.nelement() > 0 : 59 | # Note compressed datasets can only be read/written in MPI collective I/O mode in HDF5 60 | self.f.create_dataset(f"/{name}/{key}", data=val, chunks=val.shape, compression="gzip") 61 | # The line below is to not enable chunking/compression 62 | # self.f.create_dataset(f"/{name}/{key}", data=val) 63 | else: 64 | # if data is not a tensor or is empty, then disable chunking/compression 65 | self.f.create_dataset(f"/{name}/{key}", data=val) 66 | """ 67 | import numpy as np 68 | import torch_geometric as pyg 69 | 70 | # collect and construct fields of compound data type 71 | fields = [] 72 | data = () 73 | 74 | # special treatment for heterograph object 75 | if isinstance(obj, pyg.data.HeteroData): 76 | self.interface.save(name, obj) 77 | return 78 | for key, val in obj: 79 | if np.isscalar(val): # only n_sp is a scalar 80 | data = data + (val,) 81 | field = (key, type(val)) 82 | else: 83 | if val.nelement() == 0: # save tensor with zero-sized dimension as a scalar 0 84 | # HDF5 compound data type does not allow zero-size dimension 85 | # ValueError: Zero-sized dimension specified (zero-sized dimension specified) 86 | val = val.numpy() # convert a tensor to numpy 87 | data = data + (0,) 88 | field = (key, val.dtype) 89 | else: 90 | val = val.numpy() # convert a tensor to numpy 91 | data = data + (val,) 92 | field = (key, val.dtype, val.shape) 93 | fields.append(field) 94 | ctype = np.dtype(fields) 95 | # create a scalar dataset of compound data type 96 | ds = self.f.create_dataset(f"/{name}", shape=(), dtype=ctype, data=data) 97 | del ctype, fields, data, ds 98 | 99 | def write_metadata(self, metadata: dict[str, Any]) -> None: 100 | for key, val in metadata.items(): 101 | self.f[key] = val 102 | 103 | def __del__(self): 104 | if self.f != None: self.f.close() -------------------------------------------------------------------------------- /pynuml/labels/__init__.py: -------------------------------------------------------------------------------- 1 | from .standard import StandardLabels 2 | from .simple import SimpleLabels 3 | from .flavor import FlavorLabels 4 | from .pdk import PDKLabels -------------------------------------------------------------------------------- /pynuml/labels/ccqe.py: -------------------------------------------------------------------------------- 1 | def ccqe(part): 2 | # get primary for each particle 3 | part = part.set_index("g4_id", drop=False) 4 | 5 | # convert from PDG code to label 6 | def label(pdg): 7 | if abs(pdg) == 11: return 0 # electron 8 | if abs(pdg) == 13: return 1 # muon 9 | return 2 # hadronic 10 | 
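    # e.g. label(-13) -> 1 (muon), label(2212) -> 2 (proton treated as hadronic)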
11 | # trace lineage back from particle to primary and get label 12 | def func(row): 13 | gid = row.g4_id 14 | pid = row.parent_id 15 | while True: 16 | if pid == 0: return label(part.type[gid]) 17 | # if not pid in part.g4_id: return label(part.type[gid]) 18 | # gid = part.g4_id[pid] 19 | try: 20 | gid = part.g4_id[pid] 21 | except KeyError: 22 | return 2 23 | pid = part.parent_id[pid] 24 | 25 | # apply backtrace function to get labels 26 | part["semantic_label"] = part.apply(func, axis=1) 27 | return part.reset_index(drop=True)[["g4_id", "semantic_label"]] 28 | 29 | def panoptic_label(part): 30 | part = semantic_label(part) 31 | part["instance_label"] = -1 32 | return part 33 | 34 | def edge_label(edge): 35 | 36 | # False 37 | edge["label"] = 0 38 | 39 | # EM shower 40 | mask_e = (edge.label_1 == 0) & (edge.label_2 == 0) 41 | edge.loc[mask_e, "label"] = 1 42 | 43 | # Muon 44 | mask_part = (edge.g4_id_1 == edge.g4_id_2) 45 | mask_mu = (edge.label_1 == 1) & (edge.label_2 == 1) 46 | edge.loc[mask_part & mask_mu, "label"] = 2 47 | 48 | # Hadronic 49 | mask_had = (edge.label_1 == 2) & (edge.label_2 == 2) 50 | edge.loc[mask_part & mask_had, "label"] = 3 51 | 52 | return edge 53 | -------------------------------------------------------------------------------- /pynuml/labels/flavor.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | class FlavorLabels: 4 | def __init__(self): 5 | self._labels = ( 6 | 'cc_nue', 7 | 'cc_numu', 8 | 'cc_nutau', 9 | 'nc') 10 | 11 | @property 12 | def labels(self): 13 | return self._labels 14 | 15 | def label(self, idx: int): 16 | if not 0 <= label < len(self._labels): 17 | raise Exception(f'index {idx} out of range for {len(self._labels)} labels.') 18 | return self._labels[idx] 19 | 20 | def index(self, name: str): 21 | if name not in self._labels: 22 | raise Exception(f'"{name}" is not the name of a class.') 23 | return self._labels.index(name) 24 | 25 | @property 26 | def cc_nue(self): 27 | return self.index('cc_nue') 28 | 29 | @property 30 | def cc_numu(self): 31 | return self.index('cc_numu') 32 | 33 | @property 34 | def cc_nutau(self): 35 | return self.index('cc_nutau') 36 | 37 | @property 38 | def nc(self): 39 | return self.index('nc') 40 | 41 | def __call__(self, event: pd.Series): 42 | if not event.is_cc: 43 | return self.nc 44 | pdg = abs(event.nu_pdg) 45 | if pdg == 12: 46 | return self.cc_nue 47 | if pdg == 14: 48 | return self.cc_numu 49 | if pdg == 16: 50 | return self.cc_nutau 51 | raise Exception(f'PDG code {event.nu_pdg} not recognised.') -------------------------------------------------------------------------------- /pynuml/labels/pdk.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | class PDKLabels: 4 | def __init__(self): 5 | self._labels = ('nu', 'pdk') 6 | 7 | @property 8 | def labels(self): 9 | return self._labels 10 | 11 | def label(self, idx: int): 12 | if not 0 <= label < len(self._labels): 13 | raise Exception(f'index {idx} out of range for {len(self._labels)} labels.') 14 | return self._labels[idx] 15 | 16 | def index(self, name: str): 17 | if name not in self._labels: 18 | raise Exception(f'"{name}" is not the name of a class.') 19 | return self._labels.index(name) 20 | 21 | @property 22 | def nu(self): 23 | return self.index('nu') 24 | 25 | @property 26 | def pdk(self): 27 | return self.index('pdk') 28 | 29 | def __call__(self, event: pd.Series): 30 | if 12 <= abs(event.nu_pdg) <= 16: 31 | return self.nu 32 
| else: 33 | return self.pdk -------------------------------------------------------------------------------- /pynuml/labels/simple.py: -------------------------------------------------------------------------------- 1 | from .standard import StandardLabels 2 | 3 | class SimpleLabels(StandardLabels): 4 | def __init__(self, 5 | gamma_threshold: float = 0.02, 6 | hadron_threshold: float = 0.2): 7 | super(SimpleLabels, self).__init__(gamma_threshold, hadron_threshold) 8 | 9 | self._labels = [ 10 | 'MIP', 11 | 'HIP', 12 | 'shower', 13 | 'michel', 14 | 'diffuse', 15 | 'invisible' 16 | ] 17 | 18 | @property 19 | def pion(self): 20 | return self.index('MIP') 21 | 22 | @property 23 | def muon(self): 24 | return self.index('MIP') 25 | 26 | @property 27 | def kaon(self): 28 | return self.index('HIP') 29 | 30 | @property 31 | def hadron(self): 32 | return self.index('HIP') 33 | 34 | @property 35 | def delta(self): 36 | return self.index('MIP') -------------------------------------------------------------------------------- /pynuml/labels/standard.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import particle 3 | 4 | class StandardLabels: 5 | 6 | def __init__(self, 7 | gamma_threshold: float = 0.02, 8 | hadron_threshold: float = 0.2): 9 | self._labels = [ 10 | 'pion', 11 | 'muon', 12 | 'kaon', 13 | 'hadron', 14 | 'shower', 15 | 'michel', 16 | 'diffuse', 17 | 'invisible' 18 | ] 19 | self._gamma_threshold = gamma_threshold 20 | self._hadron_threshold = hadron_threshold 21 | 22 | @property 23 | def labels(self): 24 | return self._labels 25 | 26 | def label(self, idx: int): 27 | if not 0 <= label < len(self._labels): 28 | raise Exception(f'index {idx} out of range for {len(self._labels)} labels.') 29 | return self._labels[idx] 30 | 31 | def index(self, name: str): 32 | if name not in self._labels: 33 | raise Exception(f'"{name}" is not the name of a class.') 34 | return self._labels.index(name) 35 | 36 | @property 37 | def pion(self): 38 | return self.index('pion') 39 | 40 | @property 41 | def muon(self): 42 | return self.index('muon') 43 | 44 | @property 45 | def kaon(self): 46 | return self.index('kaon') 47 | 48 | @property 49 | def hadron(self): 50 | return self.index('hadron') 51 | 52 | @property 53 | def shower(self): 54 | return self.index('shower') 55 | 56 | @property 57 | def michel(self): 58 | return self.index('michel') 59 | 60 | @property 61 | def diffuse(self): 62 | return self.index('diffuse') 63 | 64 | @property 65 | def invisible(self): 66 | return self.index('invisible') 67 | 68 | def __call__(self, 69 | part: pd.DataFrame): 70 | '''Standard labelling function. 71 | 72 | Pion, Muon, Kaon, Hadron, EM shower, Michel electron, 73 | diffuse activity. 
74 | ''' 75 | 76 | def walk(part, particles, depth, sl, il): 77 | def s(part, particles): 78 | sl, slc = -1, None 79 | parent_type = 0 if part.parent_id == 0 else particles.type[part.parent_id] 80 | 81 | def pion_labeler(part, parent_type): 82 | sl = self.pion 83 | slc = None 84 | return sl, slc 85 | 86 | def muon_labeler(part, parent_type): 87 | sl = self.muon 88 | slc = None 89 | return sl, slc 90 | 91 | def kaon_labeler(part, parent_type): 92 | sl = self.kaon 93 | slc = None 94 | return sl, slc 95 | 96 | def neutral_pions_kaons_labeler(part, parent_type): 97 | sl = self.invisible 98 | slc = None 99 | return sl, slc 100 | 101 | def electron_positron_labeler(part, parent_type): 102 | if part.start_process == 'primary': 103 | sl = self.shower 104 | slc = self.shower 105 | elif abs(parent_type) == 13 and (part.start_process == 'muMinusCaptureAtRest' \ 106 | or part.start_process == 'muPlusCaptureAtRest' or part.start_process == 'Decay'): 107 | sl = self.michel 108 | slc = self.michel 109 | elif part.start_process == 'conv' or part.end_process == 'conv' \ 110 | or part.start_process == 'compt' or part.end_process == 'compt': 111 | if part.momentum >= self._gamma_threshold: 112 | sl = self.shower 113 | slc = self.shower 114 | else: 115 | sl = self.diffuse 116 | slc = self.diffuse 117 | elif part.start_process == 'muIoni' or part.start_process == 'hIoni' \ 118 | or part.start_process == 'eIoni': 119 | if part.start_process == 'muIoni': 120 | sl = self.muon 121 | slc = None 122 | elif part.start_process == 'hIoni': 123 | if abs(parent_type) == 2212: 124 | sl = self.hadron 125 | if part.momentum <= 0.0015: sl = self.diffuse 126 | else: 127 | sl = self.pion 128 | slc = None 129 | else: 130 | sl = self.diffuse 131 | slc = None 132 | elif part.start_process == 'eBrem' or part.end_process == 'phot' \ 133 | or part.end_process == 'photonNuclear' or part.end_process == 'eIoni': 134 | sl = self.diffuse 135 | slc = None 136 | elif part.end_process == 'StepLimiter' or part.end_process == 'annihil' \ 137 | or part.end_process == 'eBrem' or part.start_process == 'hBertiniCaptureAtRest' \ 138 | or part.end_process == 'FastScintillation' or part.start_process == 'muPairProd' \ 139 | or part.start_process == 'phot': 140 | sl = self.diffuse 141 | slc = self.diffuse 142 | else: 143 | raise Exception(f'labelling failed for electron with start process "{part.start_process}" and end process "{part.end_process}"') 144 | 145 | return sl, slc 146 | 147 | def gamma_labeler(part, parent_type): 148 | if part.start_process == 'conv' or part.end_process == 'conv' \ 149 | or part.start_process == 'compt' or part.end_process == 'compt': 150 | if part.momentum >= self._gamma_threshold: 151 | sl = self.shower 152 | slc = self.shower 153 | else: 154 | sl = self.diffuse 155 | slc = self.diffuse 156 | elif part.start_process == 'eBrem' or part.end_process == 'phot' \ 157 | or part.end_process == 'photonNuclear': 158 | sl = self.diffuse 159 | slc = None 160 | else: 161 | raise Exception(f'labelling failed for photon with start process "{part.start_process}" and end process "{part.end_process}"') 162 | return sl, slc 163 | 164 | def unlabeled_particle(part, parent_type): 165 | raise Exception(f"particle not recognised!
PDG code {part.type}, parent PDG code {parent_type}, start process {part.start_process}, end process {part.end_process}") 166 | 167 | particle_processor = { 168 | 211: pion_labeler, 169 | 221: pion_labeler, 170 | 331: pion_labeler, 171 | 223: pion_labeler, 172 | 13: muon_labeler, 173 | 321: kaon_labeler, 174 | 111: neutral_pions_kaons_labeler, 175 | 311: neutral_pions_kaons_labeler, 176 | 310: neutral_pions_kaons_labeler, 177 | 130: neutral_pions_kaons_labeler, 178 | 113: neutral_pions_kaons_labeler, 179 | 411: kaon_labeler, # D meson 180 | 11: electron_positron_labeler, 181 | 22: gamma_labeler 182 | } 183 | 184 | if particle.pdgid.charge(part.type) == 0 and part.end_process == 'CoupledTransportation': 185 | # neutral particle left the volume boundary 186 | sl = self.invisible 187 | else: 188 | func = particle_processor.get(abs(part.type), lambda x ,y: (-1, None)) 189 | sl, slc = func(part, parent_type) 190 | 191 | # baryon interactions - hadron or diffuse 192 | if (particle.pdgid.is_baryon(part.type) and particle.pdgid.charge(part.type) == 0) \ 193 | or particle.pdgid.is_nucleus(part.type): 194 | sl = self.diffuse 195 | if particle.pdgid.is_baryon(part.type) and particle.pdgid.charge(part.type) != 0: 196 | if abs(part.type) == 2212 and part.momentum >= self._hadron_threshold: 197 | sl = self.hadron 198 | else: 199 | sl = self.diffuse 200 | 201 | # call a charged tau highly ionising - should revisit this 202 | if abs(part.type) == 15: 203 | sl = self.hadron 204 | 205 | # check to make sure particle was assigned 206 | if sl == -1: 207 | unlabeled_particle(part, parent_type) 208 | 209 | return sl, slc 210 | 211 | def i(part, particles, sl): 212 | il, ilc = -1, None 213 | if sl == self.muon and part.start_process == 'muIoni': 214 | il = part.parent_id 215 | elif (sl == self.pion or sl == self.hadron) and part.start_process == 'hIoni': 216 | il = part.parent_id 217 | elif sl != self.diffuse and sl != self.invisible: 218 | il = part.g4_id 219 | if sl == self.shower: ilc = il 220 | if sl == self.michel: ilc = il 221 | return il, ilc 222 | 223 | if sl is not None: slc = sl 224 | else: sl, slc = s(part, particles) 225 | 226 | if il is not None: ilc = il 227 | else: il, ilc = i(part, particles, sl) 228 | 229 | ret = [ { 230 | "g4_id": part.g4_id, 231 | "parent_id": part.parent_id, 232 | "type": part.type, 233 | "start_process": part.start_process, 234 | "end_process": part.end_process, 235 | "momentum": part.momentum, 236 | "semantic_label": sl, 237 | "instance_label": il } ] 238 | for _, row in particles[(part.g4_id==particles.parent_id)].iterrows(): 239 | ret += walk(row, particles, depth+1, slc, ilc) 240 | return ret 241 | 242 | ret = [] 243 | part = part.set_index("g4_id", drop=False) 244 | primaries = part[(part.parent_id==0)] 245 | for _, primary in primaries.iterrows(): 246 | ret += walk(primary, part, 0, None, None) 247 | if len(ret)==0: return 248 | labels = pd.DataFrame.from_dict(ret) 249 | instances = { val: i for i, val in enumerate(labels[(labels.instance_label>=0)].instance_label.unique()) } 250 | 251 | def alias_instance(row, instances): 252 | if row.instance_label == -1: return -1 253 | return instances[row.instance_label] 254 | 255 | labels["instance_label"] = labels.apply(alias_instance, args=[instances], axis="columns") 256 | return labels 257 | 258 | def validate(self, labels: pd.Series): 259 | mask = (labels < 0) | (labels >= len(self._labels) - 1) 260 | if mask.any(): 261 | raise Exception(f'{mask.sum()} semantic labels are out of range: {labels[mask]}.') 262 | 
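A minimal usage sketch for the semantic labeller above, assuming only the particle-table columns it reads (g4_id, parent_id, type, momentum, start_process, end_process); the single primary muon below is an illustrative row, not taken from a real event file:

    import pandas as pd
    import pynuml

    # illustrative one-particle event: a single primary muon (PDG code 13)
    part = pd.DataFrame([{
        "g4_id": 1, "parent_id": 0, "type": 13, "momentum": 1.0,
        "start_process": "primary", "end_process": "Decay"}])

    labeller = pynuml.labels.StandardLabels()
    labels = labeller(part)
    # semantic_label holds the class index (here labeller.muon);
    # instance_label is a per-event instance index starting from zero
    print(labels[["g4_id", "semantic_label", "instance_label"]])
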
-------------------------------------------------------------------------------- /pynuml/meta.yaml: -------------------------------------------------------------------------------- 1 | {% set name = "pynuml" %} 2 | {% set version = "23.11.0" %} 3 | 4 | package: 5 | name: {{ name|lower }} 6 | version: {{ version }} 7 | 8 | source: 9 | url: https://pypi.io/packages/source/{{ name[0] }}/{{ name }}/pynuml-{{ version }}.tar.gz 10 | sha256: 1a7e61864cfeb0b27c6a93646c33e3f457bbc384eb86aee4df76b5e02898d02f 11 | 12 | build: 13 | noarch: python 14 | script: {{ PYTHON }} -m pip install . -vv 15 | number: 0 16 | 17 | requirements: 18 | host: 19 | - python >=3.7 20 | - flit >=3.2,<4 21 | - pip 22 | run: 23 | - python >=3.7 24 | - h5py >=3.7.0 25 | - mpi4py 26 | - pandas 27 | - particle 28 | - plotly 29 | - pytorch >=1.12.1 30 | - pyg >=2.1.0 31 | 32 | test: 33 | imports: 34 | - pynuml 35 | commands: 36 | - pip check 37 | requires: 38 | - pip 39 | 40 | about: 41 | summary: Standardised ML input processing for particle physics 42 | license: MIT 43 | license_file: LICENSE 44 | 45 | extra: 46 | recipe-maintainers: 47 | - vhewes 48 | -------------------------------------------------------------------------------- /pynuml/plot/__init__.py: -------------------------------------------------------------------------------- 1 | from .graph import GraphPlot -------------------------------------------------------------------------------- /pynuml/plot/graph.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from torch_geometric.data import Batch, HeteroData 3 | import plotly.express as px 4 | from plotly.graph_objects import FigureWidget 5 | import warnings 6 | 7 | class GraphPlot: 8 | def __init__(self, 9 | planes: list[str], 10 | classes: list[str], 11 | filter_threshold: float = 0.5): 12 | self._planes = planes 13 | self._classes = classes 14 | self._labels = pd.CategoricalDtype(['background']+classes, ordered=True) 15 | self._cmap = { c: px.colors.qualitative.Plotly[i] for i, c in enumerate(classes) } 16 | self._cmap['background'] = 'lightgrey' 17 | self.filter_threshold = filter_threshold 18 | 19 | # temporarily silence this pandas warning triggered by plotly, 20 | # which we don't have any power to fix but will presumably 21 | # be fixed on their end at some point 22 | warnings.filterwarnings("ignore", ".*The default of observed=False is deprecated and will be changed to True in a future version of pandas.*") 23 | self._truth_cols = ( 'g4_id', 'parent_id', 'pdg' ) 24 | 25 | def to_dataframe(self, data: HeteroData): 26 | def to_categorical(arr): 27 | return pd.Categorical.from_codes(codes=arr+1, dtype=self._labels) 28 | if isinstance(data, Batch): 29 | raise Exception('to_dataframe does not support batches!') 30 | dfs = [] 31 | for p in self._planes: 32 | plane = data[p].to_dict() 33 | df = pd.DataFrame(plane['id'], columns=['id']) 34 | df['plane'] = p 35 | df[['wire','time']] = plane['pos'] 36 | if "c" in plane: 37 | df[["x", "y", "z"]] = plane["c"] 38 | df['y_filter'] = plane['y_semantic'] != -1 39 | mask = df.y_filter.values 40 | df['y_semantic'] = to_categorical(plane['y_semantic']) 41 | df['y_instance'] = plane['y_instance'].numpy().astype(str) 42 | 43 | # add detailed truth information if it's available 44 | for col in self._truth_cols: 45 | if col in plane.keys(): 46 | df[col] = plane[col].numpy() 47 | 48 | # add model prediction if it's available 49 | if 'x_semantic' in plane.keys(): 50 | df['x_semantic'] = 
to_categorical(plane['x_semantic'].argmax(dim=-1).detach()) 51 | df[self._classes] = plane['x_semantic'].detach() 52 | if 'x_filter' in plane.keys(): 53 | df['x_filter'] = plane['x_filter'].detach() 54 | if "i" in plane.keys(): 55 | df["i"] = plane["i"].numpy().astype(str) 56 | 57 | dfs.append(df) 58 | df = pd.concat(dfs) 59 | md = data['metadata'] 60 | df['run'] = md.run.item() 61 | df['subrun'] = md.subrun.item() 62 | df['event'] = md.event.item() 63 | return df 64 | 65 | def plot(self, 66 | data: HeteroData, 67 | target: str = 'hits', 68 | how: str = 'none', 69 | filter: str = 'show', 70 | xyz: bool = False, 71 | width: int = None, 72 | height: int = None, 73 | title: bool = True) -> FigureWidget: 74 | 75 | df = self.to_dataframe(data) 76 | 77 | # no colour 78 | if target == 'hits': 79 | opts = { 80 | 'title': 'Graph hits', 81 | } 82 | 83 | # semantic labels 84 | elif target == 'semantic': 85 | if how == 'true': 86 | opts = { 87 | 'title': 'True semantic labels', 88 | 'labels': { 'y_semantic': 'Semantic label' }, 89 | 'color': 'y_semantic', 90 | 'color_discrete_map': self._cmap, 91 | } 92 | elif how == 'pred': 93 | opts = { 94 | 'title': 'Predicted semantic labels', 95 | 'labels': { 'x_semantic': 'Semantic label' }, 96 | 'color': 'x_semantic', 97 | 'color_discrete_map': self._cmap, 98 | } 99 | elif how in self._classes: 100 | opts = { 101 | 'title': f'Predicted semantic label strength for {how} class', 102 | 'labels': { how: f'{how} probability' }, 103 | 'color': how, 104 | 'color_continuous_scale': px.colors.sequential.Reds, 105 | } 106 | else: 107 | raise Exception('for semantic labels, "how" must be one of "true", "pred" or the name of a class.') 108 | 109 | # instance labels 110 | elif target == 'instance': 111 | if how == 'true': 112 | opts = { 113 | 'title': 'True instance labels', 114 | 'labels': { 'y_instance': 'Instance label' }, 115 | 'color': 'y_instance', 116 | 'symbol': 'y_semantic', 117 | 'color_discrete_map': self._cmap, 118 | } 119 | elif how == 'pred': 120 | opts = { 121 | 'title': 'Predicted instance labels', 122 | 'labels': { 'i': 'Instance label' }, 123 | 'color': 'i', 124 | 'color_discrete_map': self._cmap, 125 | } 126 | else: 127 | raise Exception('for instance labels, "how" must be one of "true" or "pred".') 128 | 129 | # filter labels 130 | elif target == 'filter': 131 | if how == 'true': 132 | opts = { 133 | 'title': 'True filter labels', 134 | 'labels': { 'y_filter': 'Filter label' }, 135 | 'color': 'y_filter', 136 | 'color_discrete_map': { 0: 'coral', 1: 'mediumseagreen' }, 137 | } 138 | elif how == 'pred': 139 | opts = { 140 | 'title': 'Predicted filter labels', 141 | 'labels': { 'x_filter': 'Filter label' }, 142 | 'color': 'x_filter', 143 | 'color_continuous_scale': px.colors.sequential.Reds, 144 | } 145 | else: 146 | raise Exception('for filter labels, "how" must be one of "true" or "pred".') 147 | 148 | else: 149 | raise Exception('"target" must be one of "hits", "semantic", "instance" or "filter".') 150 | 151 | if filter == 'none': 152 | # don't do any filtering 153 | pass 154 | elif filter == 'show': 155 | # show hits predicted to be background in grey 156 | if target == 'semantic' and how == 'pred': 157 | df.x_semantic[df.x_filter < self.filter_threshold] = 'background' 158 | elif filter == 'true': 159 | # remove true background hits 160 | df = df[df.y_filter.values] 161 | opts['title'] += ' (filtered by truth)' 162 | elif filter == 'pred': 163 | # remove predicted background hits 164 | df = df[df.x_filter > self.filter_threshold] 165 | opts['title'] += 
' (filtered by prediction)' 166 | else: 167 | raise Exception('"filter" must be one of "none", "show", "true" or "pred".') 168 | 169 | if not title: 170 | opts.pop('title') 171 | 172 | # set hover data 173 | opts['hover_data'] = { 174 | 'y_semantic': True, 175 | "y_instance": True, 176 | 'wire': ':.1f', 177 | 'time': ':.1f', 178 | } 179 | opts['labels'] = { 180 | 'y_filter': 'filter truth', 181 | 'y_semantic': 'semantic truth', 182 | 'y_instance': 'instance truth', 183 | } 184 | if 'x_filter' in df: 185 | opts['hover_data']['x_filter'] = True 186 | opts['labels']['x_filter'] = 'filter prediction' 187 | if 'x_semantic' in df: 188 | opts['hover_data']['x_semantic'] = True 189 | opts['labels']['x_semantic'] = 'semantic prediction' 190 | if 'i' in df: 191 | opts['hover_data']['i'] = ':.4f' 192 | opts['labels']['i'] = 'instance prediction' 193 | for col in self._truth_cols: 194 | if col in df: 195 | opts['hover_data'][col] = True 196 | 197 | if xyz: 198 | fig = px.scatter_3d(df, x="x", y="y", z="z", 199 | width=width, height=height, **opts) 200 | fig.update_traces(marker_size=1) 201 | else: 202 | fig = px.scatter(df, x='wire', y='time', facet_col='plane', 203 | width=width, height=height, **opts) 204 | fig.update_xaxes(matches=None) 205 | for a in fig.layout.annotations: 206 | a.text = a.text.replace('plane=', '') 207 | 208 | # set the legend to horizontal 209 | fig.update_layout( 210 | legend_orientation='h', 211 | legend_yanchor='bottom', legend_y=1.05, 212 | legend_xanchor='right', legend_x=1, 213 | margin_l=20, margin_r=20, margin_t=20, margin_b=20, 214 | title_automargin=title, 215 | ) 216 | 217 | return FigureWidget(fig) -------------------------------------------------------------------------------- /pynuml/process/__init__.py: -------------------------------------------------------------------------------- 1 | from .hitgraph import HitGraphProducer 2 | -------------------------------------------------------------------------------- /pynuml/process/base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC 2 | from typing import Any, Dict, List, Tuple 3 | 4 | from ..io import File 5 | 6 | class ProcessorBase(ABC): 7 | '''Base class for event processing''' 8 | 9 | def __init__(self, file: File): 10 | for group, keys in self.columns.items(): 11 | file.add_group(group, keys) 12 | 13 | @property 14 | def columns(self) -> Dict[str, List[str]]: 15 | raise NotImplementedError 16 | 17 | def __call__(self, evt: Any) -> Tuple[str, Any]: 18 | raise NotImplementedError -------------------------------------------------------------------------------- /pynuml/process/hitgraph.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Callable 2 | import numpy as np 3 | import pandas as pd 4 | 5 | import torch 6 | import torch_geometric as pyg 7 | 8 | from .base import ProcessorBase 9 | 10 | class HitGraphProducer(ProcessorBase): 11 | '''Process event into graphs''' 12 | 13 | def __init__(self, 14 | file: 'pynuml.io.File', 15 | semantic_labeller: Callable = None, 16 | event_labeller: Callable = None, 17 | label_vertex: bool = False, 18 | label_position: bool = False, 19 | planes: list[str] = ['u','v','y'], 20 | node_pos: list[str] = ['local_wire','local_time'], 21 | pos_norm: list[float] = [0.3,0.055], 22 | node_feats: list[str] = ['integral','rms'], 23 | lower_bound: int = 20, 24 | store_detailed_truth: bool = False): 25 | 26 | self.semantic_labeller = semantic_labeller 27 | self.event_labeller 
= event_labeller 28 | self.label_vertex = label_vertex 29 | self.label_position = label_position 30 | self.planes = planes 31 | self.node_pos = node_pos 32 | self.pos_norm = torch.tensor(pos_norm).float() 33 | self.node_feats = node_feats 34 | self.lower_bound = lower_bound 35 | self.store_detailed_truth = store_detailed_truth 36 | 37 | self.transform = pyg.transforms.Compose(( 38 | pyg.transforms.Delaunay(), 39 | pyg.transforms.FaceToEdge())) 40 | 41 | super().__init__(file) 42 | 43 | @property 44 | def columns(self) -> dict[str, list[str]]: 45 | groups = { 46 | 'hit_table': ['hit_id','local_plane','local_time','local_wire','integral','rms'], 47 | 'spacepoint_table': [] 48 | } 49 | if self.semantic_labeller: 50 | groups['particle_table'] = ['g4_id','parent_id','type','momentum','start_process','end_process'] 51 | groups['edep_table'] = [] 52 | if self.event_labeller: 53 | groups['event_table'] = ['is_cc', 'nu_pdg'] 54 | if self.label_vertex: 55 | keys = ['nu_vtx_corr','nu_vtx_wire_pos','nu_vtx_wire_time'] 56 | if 'event_table' in groups: 57 | groups['event_table'].extend(keys) 58 | else: 59 | groups['event_table'] = keys 60 | if self.label_position: 61 | groups["edep_table"] = [] 62 | return groups 63 | 64 | @property 65 | def metadata(self): 66 | metadata = { 'planes': self.planes } 67 | if self.semantic_labeller is not None: 68 | metadata['semantic_classes'] = self.semantic_labeller.labels[:-1] 69 | if self.event_labeller is not None: 70 | metadata['event_classes'] = self.event_labeller.labels 71 | return metadata 72 | 73 | def __call__(self, evt: 'pynuml.io.Event') -> tuple[str, Any]: 74 | 75 | if self.event_labeller or self.label_vertex: 76 | event = evt['event_table'].squeeze() 77 | 78 | hits = evt['hit_table'] 79 | spacepoints = evt['spacepoint_table'].reset_index(drop=True) 80 | 81 | # discard any events with pathologically large hit integrals 82 | # this is a hotfix that should be removed once the dataset is fixed 83 | if hits.integral.max() > 1e6: 84 | print('found event with pathologically large hit integral, skipping') 85 | return evt.name, None 86 | 87 | # handle energy depositions 88 | if self.semantic_labeller: 89 | edeps = evt['edep_table'] 90 | energy_col = 'energy' if 'energy' in edeps.columns else 'energy_fraction' # for backwards compatibility 91 | 92 | # get ID of max particle 93 | g4_id = edeps[[energy_col, 'g4_id', 'hit_id']] 94 | g4_id = g4_id.sort_values(by=[energy_col], 95 | ascending=False, 96 | kind='mergesort').drop_duplicates('hit_id') 97 | hits = g4_id.merge(hits, on='hit_id', how='right') 98 | 99 | # charge-weighted average of 3D position 100 | if self.label_position: 101 | edeps = edeps[["hit_id", "energy", "x_position", "y_position", "z_position"]] 102 | for col in ["x_position", "y_position", "z_position"]: 103 | edeps.loc[:, col] *= edeps.energy 104 | edeps = edeps.groupby("hit_id").sum() 105 | for col in ["x_position", "y_position", "z_position"]: 106 | edeps.loc[:, col] /= edeps.energy 107 | edeps = edeps.drop("energy", axis="columns") 108 | hits = edeps.merge(hits, on="hit_id", how="right") 109 | 110 | hits['filter_label'] = ~hits[energy_col].isnull() 111 | hits = hits.drop(energy_col, axis='columns') 112 | 113 | # reset spacepoint index 114 | spacepoints = spacepoints.reset_index(names='index_3d') 115 | 116 | # skip events with fewer than lower_bnd simulated hits in any plane. 
117 | # note that we can't just do a pandas groupby here, because that will 118 | # skip over any planes with zero hits 119 | for i in range(len(self.planes)): 120 | planehits = hits[hits.local_plane==i] 121 | nhits = planehits.filter_label.sum() if self.semantic_labeller else planehits.shape[0] 122 | if nhits < self.lower_bound: 123 | return evt.name, None 124 | 125 | # get labels for each particle 126 | if self.semantic_labeller: 127 | particles = self.semantic_labeller(evt['particle_table']) 128 | try: 129 | hits = hits.merge(particles, on='g4_id', how='left') 130 | except: 131 | print('exception occurred when merging hits and particles') 132 | print('hit table:', hits) 133 | print('particle table:', particles) 134 | print('skipping this event') 135 | return evt.name, None 136 | mask = (~hits.g4_id.isnull()) & (hits.semantic_label.isnull()) 137 | if mask.any(): 138 | print(f'found {mask.sum()} orphaned hits.') 139 | return evt.name, None 140 | del mask 141 | 142 | data = pyg.data.HeteroData() 143 | 144 | # event metadata 145 | r, sr, e = evt.event_id 146 | data['metadata'].run = r 147 | data['metadata'].subrun = sr 148 | data['metadata'].event = e 149 | 150 | # spacepoint nodes 151 | if "position_x" in spacepoints.keys(): 152 | data["sp"].pos = torch.tensor(spacepoints[[f"position_{c}" for c in ("x", "y", "z")]].values).float() 153 | else: 154 | data['sp'].num_nodes = spacepoints.shape[0] 155 | 156 | # draw graph edges 157 | for i, plane_hits in hits.groupby('local_plane'): 158 | 159 | p = self.planes[i] 160 | plane_hits = plane_hits.reset_index(drop=True).reset_index(names='index_2d') 161 | 162 | # node position 163 | pos = torch.tensor(plane_hits[self.node_pos].values).float() 164 | data[p].pos = pos * self.pos_norm[None,:] 165 | 166 | # node features 167 | data[p].x = torch.tensor(plane_hits[self.node_feats].values).float() 168 | 169 | # node true position 170 | if self.label_position: 171 | data[p].c = torch.tensor(plane_hits[["x_position", "y_position", "z_position"]].values).float() 172 | 173 | # hit indices 174 | data[p].id = torch.tensor(plane_hits['hit_id'].values).long() 175 | 176 | # 2D edges 177 | data[p, 'plane', p].edge_index = self.transform(data[p]).edge_index 178 | 179 | # 3D edges 180 | edge3d = spacepoints.merge(plane_hits[['hit_id','index_2d']].add_suffix(f'_{p}'), 181 | on=f'hit_id_{p}', 182 | how='inner') 183 | edge3d = edge3d[[f'index_2d_{p}','index_3d']].values.transpose() 184 | edge3d = torch.tensor(edge3d) if edge3d.size else torch.empty((2,0)) 185 | data[p, 'nexus', 'sp'].edge_index = edge3d.long() 186 | 187 | # truth information 188 | if self.semantic_labeller: 189 | data[p].y_semantic = torch.tensor(plane_hits['semantic_label'].fillna(-1).values).long() 190 | data[p].y_instance = torch.tensor(plane_hits['instance_label'].fillna(-1).values).long() 191 | if self.store_detailed_truth: 192 | data[p].g4_id = torch.tensor(plane_hits['g4_id'].fillna(-1).values).long() 193 | data[p].parent_id = torch.tensor(plane_hits['parent_id'].fillna(-1).values).long() 194 | data[p].pdg = torch.tensor(plane_hits['type'].fillna(-1).values).long() 195 | if self.label_vertex: 196 | vtx_2d = torch.tensor([ event[f'nu_vtx_wire_pos_{i}'], event.nu_vtx_wire_time ]).float() 197 | data[p].y_vtx = vtx_2d * self.pos_norm[None,:] 198 | 199 | # event label 200 | if self.event_labeller: 201 | data['evt'].y = torch.tensor(self.event_labeller(event)).long() 202 | 203 | # 3D vertex truth 204 | if self.label_vertex: 205 | vtx_3d = [ [ event.nu_vtx_corr_x, event.nu_vtx_corr_y, event.nu_vtx_corr_z 
] ] 206 | data['evt'].y_vtx = torch.tensor(vtx_3d).float() 207 | 208 | return evt.name, data -------------------------------------------------------------------------------- /pynuml/process/spmap.py: -------------------------------------------------------------------------------- 1 | import pynuml 2 | 3 | def process_event(key, out, sp, hit, part, edep, l=standard, voxelsize=1): 4 | """Process an event into a 3D pixel map""" 5 | import numpy as np, torch, MinkowskiEngine as ME 6 | 7 | # skip any events with no simulated hits 8 | if (hit.index==key).sum() == 0: return 9 | if (edep.index==key).sum() == 0: return 10 | 11 | # label true particles 12 | evt_part = part.loc[key].reset_index(drop=True) 13 | evt_part = l.panoptic_label(evt_part) 14 | 15 | # get energy depositions and ground truth 16 | evt_edep = edep.loc[key].reset_index(drop=True) 17 | evt_edep = evt_edep.merge(evt_part[["g4_id", "semantic_label"]], on="g4_id", how="left").drop("g4_id", axis="columns") 18 | scores = evt_edep.groupby(["hit_id", "semantic_label"]).agg({"energy": "sum"}).reset_index() 19 | 20 | # class number and names 21 | n = len(l.label) - 1 22 | lnames = [ it.name for it in l.label ][:-1] 23 | noise = np.zeros(n) 24 | noise[l.label.diffuse.value] = 1 25 | 26 | def fractional_truth(row, n): 27 | label = np.zeros(n) 28 | label[int(row.semantic_label)] = row.energy 29 | return label 30 | scores["slabel"] = scores.apply(fractional_truth, args=[n], axis="columns") 31 | scores = scores.groupby("hit_id").agg({"slabel": "sum"}) 32 | 33 | # Propagate labels to hits 34 | evt_hit = hit.loc[key].reset_index(drop=True).merge(scores, on="hit_id", how="inner") 35 | evt_sp = sp.loc[key].reset_index(drop=True) 36 | 37 | # skip events with fewer than 50 simulated hits in any plane, or fewer than 50 spacepoints 38 | for i in range(3): 39 | if (evt_hit.global_plane==i).sum() < 50: return 40 | if evt_sp.shape[0] < 50: return 41 | 42 | # merge hits into spacepoints 43 | for plane in ["u","v","y"]: 44 | evt_sp = evt_sp.merge(evt_hit[["hit_id","integral","slabel"]].add_suffix(f"_{plane}"), on=f"hit_id_{plane}", how="left") 45 | evt_sp[f"integral_{plane}"] = evt_sp[f"integral_{plane}"].fillna(0) 46 | 47 | def merge_truth(row, n): 48 | labels = np.zeros(n) 49 | for plane in ["u","v","y"]: 50 | vals = row[f"slabel_{plane}"] 51 | if type(vals) != float: labels += vals 52 | return labels 53 | 54 | evt_sp["slabel"] = evt_sp.apply(merge_truth, args=[len(l.label)-1], axis="columns") 55 | evt_sp = evt_sp[["slabel", "position_x", "position_y", "position_z", "integral_u", "integral_v", "integral_y"]] 56 | 57 | # voxelise spacepoints and aggregate labels 58 | def voxelise(row): 59 | return np.floor(row.position_x/voxelsize), np.floor(row.position_y/voxelsize), np.floor(row.position_z/voxelsize) 60 | evt_sp["c"] = evt_sp.apply(voxelise, axis="columns") 61 | evt_sp = evt_sp.drop(["position_x", "position_y", "position_z"], axis="columns") 62 | evt_sp = evt_sp.groupby("c").agg({"integral_u": "sum", "integral_v": "sum", "integral_y": "sum", "slabel": "sum"}).reset_index() 63 | def norm_truth(row, noise): 64 | lsum = row.slabel.sum() 65 | return noise if lsum == 0 else row.slabel / lsum 66 | evt_sp["slabel"] = evt_sp.apply(norm_truth, args=[noise], axis="columns") 67 | 68 | spm = { 69 | "f": torch.tensor(evt_sp[["integral_u", "integral_v", "integral_y"]].to_numpy()).float(), 70 | "c": torch.tensor(evt_sp["c"]).int(), 71 | "ys": torch.tensor(evt_sp["slabel"]).float() 72 | } 73 | out.save(spm, f"r{key[0]}_sr{key[1]}_evt{key[2]}") 74 | 75 | def 
process_file(out, fname, p=process_event, l=standard, voxelsize=1): 76 | """Process all events in a file into graphs""" 77 | f = NuMLFile(fname) 78 | 79 | evt = f.get_dataframe("event_table", ["event_id"]) 80 | sp = f.get_dataframe("spacepoint_table") 81 | hit = f.get_dataframe("hit_table") 82 | part = f.get_dataframe("particle_table", ["event_id", "g4_id", "parent_id", "type", "momentum", "start_process", "end_process"]) 83 | edep = f.get_dataframe("edep_table") 84 | 85 | # loop over events in file 86 | for key in evt.index: p(key, out, sp, hit, part, edep, l, voxelsize) 87 | 88 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["flit_core >=3.2,<4"] 3 | build-backend = "flit_core.buildapi" 4 | 5 | [project] 6 | name = "pynuml" 7 | authors = [{name = "v hewes", email = "vhewes@fnal.gov"}] 8 | requires-python = '>=3.7' 9 | readme = "README.md" 10 | license = {file = "LICENSE"} 11 | 12 | classifiers = [ 13 | "Intended Audience :: Science/Research", 14 | "License :: OSI Approved :: MIT License", 15 | ] 16 | dependencies = [ 17 | "h5py>=3.7.0", 18 | "mpi4py", 19 | "pandas", 20 | "particle", 21 | "plotly", 22 | "torch>=1.12.1", 23 | "torch-geometric>=2.1.0", 24 | ] 25 | dynamic = ["version", "description"] 26 | 27 | [project.urls] 28 | Home = "https://github.com/nugraph/pynuml" 29 | -------------------------------------------------------------------------------- /scripts/install_ph5concat_conda.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | git clone https://github.com/NU-CUCIS/ph5concat 3 | cd ph5concat 4 | autoreconf -i 5 | ./configure --prefix=$CONDA_PREFIX \ 6 | --with-mpi=$CONDA_PREFIX \ 7 | --with-hdf5=$CONDA_PREFIX \ 8 | CFLAGS="-O2 -DNDEBUG" \ 9 | CXXFLAGS="-O2 -DNDEBUG" \ 10 | LIBS="-ldl -lz" \ 11 | --enable-profiling 12 | make install 13 | cd .. 
14 | rm -fr ph5concat 15 | -------------------------------------------------------------------------------- /tests/test_process.py: -------------------------------------------------------------------------------- 1 | """Test pynuml graph processing and plotting""" 2 | import pynuml 3 | 4 | def test_process_uboone(): 5 | """Test graph processing with MicroBooNE open data release""" 6 | f = pynuml.io.File("/raid/nugraph/uboone-opendata/uboone-opendata.evt.h5") 7 | processor = pynuml.process.HitGraphProducer( 8 | file=f, 9 | semantic_labeller=pynuml.labels.StandardLabels(), 10 | event_labeller=pynuml.labels.FlavorLabels(), 11 | label_vertex=True) 12 | plot = pynuml.plot.GraphPlot( 13 | planes=["u", "v", "y"], 14 | classes=pynuml.labels.StandardLabels().labels[:-1]) 15 | f.read_data(0, 100) 16 | evts = f.build_evt() 17 | for evt in evts: 18 | _, data = processor(evt) 19 | if not data: 20 | continue 21 | plot.plot(data, target='semantic', how='true', filter='show') 22 | plot.plot(data, target='instance', how='true', filter='true') 23 | 24 | def test_process_dune_nutau(): 25 | """Test graph processing with DUNE beam nutau dataset""" 26 | f = pynuml.io.File("/raid/nugraph/dune-nutau/test.evt.h5") 27 | processor = pynuml.process.HitGraphProducer( 28 | file=f, 29 | semantic_labeller=pynuml.labels.StandardLabels(), 30 | event_labeller=pynuml.labels.FlavorLabels(), 31 | label_position=True) 32 | plot = pynuml.plot.GraphPlot( 33 | planes=["u", "v", "y"], 34 | classes=pynuml.labels.StandardLabels().labels[:-1]) 35 | f.read_data(0, 100) 36 | evts = f.build_evt() 37 | for evt in evts: 38 | _, data = processor(evt) 39 | if not data: 40 | continue 41 | plot.plot(data, target="filter", how="true", filter="show") 42 | plot.plot(data, target='semantic', how='true', filter='show') 43 | plot.plot(data, target='instance', how='true', filter='true') 44 | plot.plot(data, target="semantic", how="true", filter="show", xyz=True) 45 | --------------------------------------------------------------------------------
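The tests above exercise the full graph producer end to end; the event labellers can also be called directly on a single row of the event table. A minimal sketch, where the pandas Series is illustrative rather than read from an event file:

    import pandas as pd
    import pynuml

    # illustrative event record containing the two fields FlavorLabels reads
    event = pd.Series({"is_cc": True, "nu_pdg": 14})

    flavor = pynuml.labels.FlavorLabels()
    label = flavor(event)           # index of the charged-current numu class
    assert label == flavor.cc_numu
    print(flavor.labels[label])     # -> 'cc_numu'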