├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── setup.cfg ├── setup.py └── trackml ├── __init__.py ├── dataset.py ├── randomize.py ├── score.py ├── utils.py └── weights.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | .static_storage/ 57 | .media/ 58 | local_settings.py 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # Environments 86 | .env 87 | .venv 88 | env/ 89 | venv/ 90 | ENV/ 91 | env.bak/ 92 | venv.bak/ 93 | 94 | # Spyder project settings 95 | .spyderproject 96 | .spyproject 97 | 98 | # Rope project settings 99 | .ropeproject 100 | 101 | # mkdocs documentation 102 | /site 103 | 104 | # mypy 105 | .mypy_cache/ 106 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2018 Moritz Kiehn 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 
20 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | include README.md
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | TrackML utility library
2 | =======================
3 |
4 | A python library to simplify working with the
5 | [High Energy Physics Tracking Machine Learning challenge][trackml]
6 | dataset. It can be used for the datasets of both the
7 | [accuracy phase][trackml_kaggle] and the
8 | [throughput phase][trackml_codalab].
9 |
10 | Installation
11 | ------------
12 |
13 | The package can be installed as a user package via
14 |
15 |     pip install --user path/to/repository
16 |
17 | To make a local checkout of the repository available directly, it can also be
18 | installed in development mode
19 |
20 |     pip install --user --editable path/to/repository
21 |
22 | In both cases, the package can be imported via `import trackml` without
23 | additional configuration. In the latter case, changes made to the code are
24 | immediately visible without having to reinstall the package.
25 |
26 | Usage
27 | -----
28 |
29 | To read the data for one event from the training dataset including the ground
30 | truth information:
31 |
32 | ```python
33 | from trackml.dataset import load_event
34 |
35 | hits, cells, particles, truth = load_event('path/to/event000000123')
36 | ```
37 |
38 | For the test dataset only the hit information is available. To read only this
39 | data:
40 |
41 | ```python
42 | from trackml.dataset import load_event
43 |
44 | hits, cells = load_event('path/to/event000000456', parts=['hits', 'cells'])
45 | ```
46 |
47 | To iterate over events in a dataset:
48 |
49 | ```python
50 | from trackml.dataset import load_dataset
51 |
52 | for event_id, hits, cells, particles, truth in load_dataset('path/to/dataset'):
53 |     ...
54 | ```
55 |
56 | To read a single event and compute additional columns derived from the
57 | stored data:
58 |
59 | ```python
60 | from trackml.dataset import load_event
61 | from trackml.utils import add_position_quantities, add_momentum_quantities, decode_particle_id
62 |
63 | # get the particles data (note the trailing comma: load_event returns a tuple)
64 | particles, = load_event('path/to/event000000123', parts=['particles'])
65 | # decode particle id into vertex id, generation, etc.
66 | particles = decode_particle_id(particles)
67 | # add vertex rho, phi, r
68 | particles = add_position_quantities(particles, prefix='v')
69 | # add momentum eta, p, pt
70 | particles = add_momentum_quantities(particles)
71 | ```
72 |
73 | The dataset path can be the path to a directory or to a zip file containing the
74 | events' `.csv` files. Each event is lazily loaded during the iteration. Options
75 | are available to read only a subset of the available events or only selected
76 | parts, e.g. only hits or only particles.
77 |
78 | To generate a random test submission from truth information and compute the
79 | expected score:
80 |
81 | ```python
82 | from trackml.randomize import shuffle_hits
83 | from trackml.score import score_event
84 |
85 | shuffled = shuffle_hits(truth, 0.05)  # 5% probability to reassign a hit
86 | score = score_event(truth, shuffled)
87 | ```
88 |
89 | All methods either take or return `pandas.DataFrame` objects. You can have a
90 | look at the function docstrings for detailed information.
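As a minimal end-to-end sketch, the snippets above can be combined to score
deliberately degraded submissions over the first few events of a dataset (the
dataset path is a placeholder, and the event count and 10% shuffle probability
are arbitrary):

```python
from trackml.dataset import load_dataset
from trackml.randomize import shuffle_hits
from trackml.score import score_event

scores = []
for event_id, hits, cells, particles, truth in load_dataset('path/to/dataset', nevents=3):
    # degrade the ground truth: each hit is reassigned with 10% probability
    submission = shuffle_hits(truth, 0.10)
    scores.append(score_event(truth, submission))

print('mean score over {} events: {:.3f}'.format(len(scores), sum(scores) / len(scores)))
```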
91 |
92 | Authors
93 | -------
94 |
95 | The library was written by
96 |
97 | * Moritz Kiehn
98 |
99 | with contributions from
100 |
101 | * Sabrina Amrouche
102 | * David Rousseau
103 | * Ilija Vukotic
104 | * Nimar Arora
105 | * Jon Nordby
106 | * Yerkebulan Berdibekov
107 | * Victor Estrade
108 |
109 | License
110 | -------
111 |
112 | All code is licensed under the [MIT license][mit_license].
113 |
114 | Dataset
115 | -------
116 |
117 | A dataset comprises multiple independent events, where each event
118 | contains simulated measurements (essentially 3D points) of particles
119 | generated in a collision between proton bunches at the
120 | [Large Hadron Collider][lhc] at [CERN][cern]. The goal of the tracking
121 | machine learning challenge is to group the recorded measurements or hits
122 | for each event into tracks, sets of hits that belong to the same initial
123 | particle. A solution must uniquely associate each hit to one track. The
124 | training dataset contains the recorded hits, their ground truth
125 | counterpart and their association to particles, and the initial
126 | parameters of those particles. The test dataset contains only the
127 | recorded hits.
128 |
129 | Each dataset is usually provided as a single archive file. Once
130 | unzipped, the dataset comprises a set of `.csv[.gz]` files. Each event
131 | can have up to four associated files that contain hits, hit cells,
132 | particles, and the ground truth association between them. The common
133 | prefix, e.g. `event000000010`, is always `event` followed by 9 digits.
134 |
135 |     event000000000-hits.csv
136 |     event000000000-cells.csv
137 |     event000000000-particles.csv
138 |     event000000000-truth.csv
139 |     event000000001-hits.csv
140 |     event000000001-cells.csv
141 |     event000000001-particles.csv
142 |     event000000001-truth.csv
143 |
144 | Submissions must be provided as a single `.csv` file for the whole
145 | dataset, e.g.
146 |
147 |     submission-test.csv
148 |     submission-final.csv
149 |
150 | ### Event hits
151 |
152 | The hits file contains the following values for each hit/entry:
153 |
154 | * **hit_id**: numerical identifier of the hit inside the event.
155 | * **x, y, z**: measured x, y, z position (in millimeters) of the hit in
156 |   global coordinates.
157 | * **volume_id**: numerical identifier of the detector group.
158 | * **layer_id**: numerical identifier of the detector layer inside the
159 |   group.
160 | * **module_id**: numerical identifier of the detector module inside
161 |   the layer.
162 |
163 | The volume/layer/module id could in principle be deduced from x, y, z. They
164 | are given here to simplify detector-specific data handling.
165 |
166 | ### Event truth
167 |
168 | The truth file contains the mapping between hits and generating particles and
169 | the true particle state at each measured hit. Each entry maps one hit to one
170 | particle.
171 |
172 | * **hit_id**: numerical identifier of the hit as defined in the hits file.
173 | * **particle_id**: numerical identifier of the generating particle as defined
174 |   in the particles file. A value of 0 means that the hit did not originate
175 |   from a reconstructible particle, but e.g. from detector noise.
176 | * **tx, ty, tz**: true intersection point in global coordinates (in
177 |   millimeters) between the particle trajectory and the sensitive surface.
178 | * **tpx, tpy, tpz**: true particle momentum (in GeV/c) in the global
179 |   coordinate system at the intersection point.
The corresponding vector
180 |   is tangent to the particle trajectory at the intersection point.
181 | * **weight**: per-hit weight used for the scoring metric; the sum of all
182 |   weights within one event equals one.
183 |
184 | ### Event particles
185 |
186 | The particles file contains the following values for each particle/entry:
187 |
188 | * **particle_id**: numerical identifier of the particle inside the event.
189 | * **particle_type**: numerical identifier of the particle type; the
190 |   [Particle Data Group Monte Carlo numbering scheme][pdg_mc_numbering]
191 |   is used to identify specific particle types.
192 | * **vx, vy, vz**: initial position or vertex (in millimeters) in global
193 |   coordinates.
194 | * **px, py, pz**: initial momentum (in GeV/c) along each global axis.
195 | * **q**: particle charge (as multiple of the absolute electron charge).
196 | * **nhits**: number of hits generated by this particle.
197 |
198 | All entries contain the generated information or ground truth.
199 |
200 | ### Event hit cells
201 |
202 | The cells file contains the constituent active detector cells that comprise each
203 | hit. The cells can be used to refine the hit-to-track association.
204 | A cell is the smallest granularity inside each detector module, much like a
205 | pixel on a screen, except that depending on the **volume_id** a cell can be a square
206 | or a long rectangle. It is identified by two channel identifiers that are unique
207 | within each detector module and encode the position, much like column/row
208 | numbers of a matrix. A cell can provide signal information that the detector
209 | module has recorded in addition to the position. Depending on the detector type,
210 | only one of the channel identifiers may be valid, e.g. for the strip detectors,
211 | and the signal value may have a different resolution.
212 |
213 | * **hit_id**: numerical identifier of the hit as defined in the hits file.
214 | * **ch0, ch1**: channel identifier/coordinates unique within one module.
215 | * **value**: signal value information, e.g. how much charge a particle has
216 |   deposited.
217 |
218 | ### Dataset submission information
219 |
220 | The submission file must associate each hit in each event to one and only one
221 | reconstructed particle track. The reconstructed tracks must be uniquely
222 | identified only within each event.
223 |
224 | * **event_id**: numerical identifier of the event; corresponds to the number
225 |   found in the per-event file name prefix.
226 | * **hit_id**: numerical identifier of the hit inside the event as defined in
227 |   the per-event hits file.
228 | * **track_id**: user-defined numerical identifier (non-negative integer) of
229 |   the track.
230 |
231 | ### Additional detector geometry information
232 |
233 | The detector is built from silicon slabs (or modules, rectangular or
234 | trapezoidal), arranged in cylinders and disks, which measure the position (or
235 | hits) of the particles that cross them. The detector modules are organized
236 | into detector groups or volumes identified by a **volume_id**. Inside a volume they
237 | are further grouped into layers identified by a **layer_id**. Each layer can contain
238 | an arbitrary number of detector modules, the smallest geometrically distinct
239 | detector object, each identified by a **module_id**. Within each group, detector
240 | modules are of the same type and have, e.g., the same granularity.
All simulated
241 | detector modules are so-called semiconductor sensors that are built from thin
242 | silicon sensor chips. Each module can be represented by a two-dimensional,
243 | planar, bounded sensitive surface. These sensitive surfaces are subdivided into
244 | regular grids that define the detector cells, the smallest granularity within
245 | the detector.
246 |
247 | Each module has a different position and orientation described in the detectors
248 | file. A local, right-handed coordinate system is defined on each sensitive
249 | surface such that the first two coordinates u and v are on the sensitive surface
250 | and the third coordinate w is normal to the surface. The orientation and
251 | position are defined by the following transformation
252 |
253 |     pos_xyz = rotation_matrix * pos_uvw + offset
254 |
255 | that transforms a position described in local coordinates u,v,w into the
256 | equivalent position x,y,z in global coordinates using a rotation matrix and
257 | an offset.
258 |
259 | * **volume_id**: numerical identifier of the detector group.
260 | * **layer_id**: numerical identifier of the detector layer inside the
261 |   group.
262 | * **module_id**: numerical identifier of the detector module inside
263 |   the layer.
264 | * **cx, cy, cz**: position of the local origin described in the global
265 |   coordinate system (in millimeters).
266 | * **rot_xu, rot_xv, rot_xw, rot_yu, ...**: components of the rotation matrix
267 |   to rotate from local u,v,w to global x,y,z coordinates.
268 | * **module_t**: thickness of the detector module (in millimeters).
269 | * **module_minhu, module_maxhu**: the minimum/maximum half-length of the
270 |   module boundary along the local u direction (in millimeters).
271 | * **module_hv**: the half-length of the module boundary along the local v
272 |   direction (in millimeters).
273 | * **pitch_u, pitch_v**: the size of detector cells along the local u and v
274 |   direction (in millimeters).
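As an illustration, here is a short sketch of applying this transformation with
`numpy`, assuming the detector description has been read into a
`pandas.DataFrame` (the file path is a placeholder, and the remaining rotation
component names are assumed to follow the pattern above, i.e. `rot_yv`, ...,
`rot_zw`):

```python
import numpy as np
import pandas as pd

detectors = pd.read_csv('path/to/detectors.csv')
module = detectors.iloc[0]

# assemble the rotation matrix row by row from its stored components
rotation = np.array([
    [module['rot_xu'], module['rot_xv'], module['rot_xw']],
    [module['rot_yu'], module['rot_yv'], module['rot_yw']],
    [module['rot_zu'], module['rot_zv'], module['rot_zw']],
])
offset = np.asarray(module[['cx', 'cy', 'cz']], dtype=float)

# a local position on the sensitive surface (w=0), in millimeters
pos_uvw = np.array([0.1, -0.2, 0.0])
pos_xyz = rotation.dot(pos_uvw) + offset
```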
275 | 276 | 277 | [cern]: https://home.cern 278 | [lhc]: https://home.cern/topics/large-hadron-collider 279 | [mit_license]: http://www.opensource.org/licenses/MIT 280 | [trackml]: https://sites.google.com/site/trackmlparticle/ 281 | [trackml_kaggle]: https://www.kaggle.com/c/trackml-particle-identification 282 | [trackml_codalab]: https://competitions.codalab.org/competitions/20112 283 | [pdg_mc_numbering]: http://pdg.lbl.gov/2018/reviews/rpp2018-rev-monte-carlo-numbering.pdf 284 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal=1 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os.path as op 3 | 4 | from setuptools import setup 5 | 6 | here = op.abspath(op.dirname(__file__)) 7 | 8 | # Get the long description from the README file 9 | with io.open(op.join(here, 'README.md'), mode='rt', encoding='utf-8') as f: 10 | long_description = f.read() 11 | 12 | setup( 13 | name='trackml', 14 | version='3', 15 | description='TrackML utility library', 16 | long_description=long_description, 17 | long_description_content_type='text/markdown', 18 | url='https://github.com/LAL/trackml-library', 19 | author='Moritz Kiehn', 20 | author_email='msmk@cern.ch', 21 | classifiers=[ 22 | 'Development Status :: 5 - Production/Stable', 23 | 'Intended Audience :: Science/Research', 24 | 'Topic :: Scientific/Engineering :: Information Analysis', 25 | 'Topic :: Scientific/Engineering :: Physics', 26 | 'License :: OSI Approved :: MIT License', 27 | 'Programming Language :: Python :: 2', 28 | 'Programming Language :: Python :: 2.7', 29 | 'Programming Language :: Python :: 3', 30 | 'Programming Language :: Python :: 3.4', 31 | 'Programming Language :: Python :: 3.5', 32 | 'Programming Language :: Python :: 3.6', 33 | ], 34 | packages=['trackml'], 35 | install_requires=[ 36 | 'numpy', 37 | 'pandas>=0.21.0', 38 | ], 39 | python_requires='>=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*', 40 | ) 41 | -------------------------------------------------------------------------------- /trackml/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LAL/trackml-library/53a165e15a2c885f54c2bef1bd1ed53db6ed9648/trackml/__init__.py -------------------------------------------------------------------------------- /trackml/dataset.py: -------------------------------------------------------------------------------- 1 | """TrackML dataset loading""" 2 | 3 | __authors__ = ['Moritz Kiehn', 'Sabrina Amrouche', 'Nimar Arora'] 4 | 5 | import glob 6 | import os 7 | import os.path as op 8 | import re 9 | import zipfile 10 | 11 | import pandas 12 | 13 | CELLS_DTYPES = dict([ 14 | ('hit_id', 'i4'), 15 | ('ch0', 'i4'), 16 | ('ch1', 'i4'), 17 | ('value', 'f4'), 18 | ]) 19 | HITS_DTYPES = dict([ 20 | ('hit_id', 'i4'), 21 | ('x', 'f4'), 22 | ('y', 'f4'), 23 | ('z','f4'), 24 | ('volume_id', 'i4'), 25 | ('layer_id', 'i4'), 26 | ('module_id', 'i4'), 27 | ]) 28 | PARTICLES_DTYPES = dict([ 29 | ('particle_id', 'i8'), 30 | ('particle_type', 'i4'), 31 | ('vx', 'f4'), 32 | ('vy', 'f4'), 33 | ('vz', 'f4'), 34 | ('px', 'f4'), 35 | ('py', 'f4'), 36 | ('pz', 'f4'), 37 | ('q', 'i4'), 38 | ('nhits', 'i4'), 39 | ]) 40 | TRUTH_DTYPES = dict([ 41 | ('hit_id', 'i4'), 42 | ('particle_id', 'i8'), 43 
|     ('tx', 'f4'),
44 |     ('ty', 'f4'),
45 |     ('tz', 'f4'),
46 |     ('tpx', 'f4'),
47 |     ('tpy', 'f4'),
48 |     ('tpz', 'f4'),
49 |     ('weight', 'f4'),
50 | ])
51 | DTYPES = {
52 |     'cells': CELLS_DTYPES,
53 |     'hits': HITS_DTYPES,
54 |     'particles': PARTICLES_DTYPES,
55 |     'truth': TRUTH_DTYPES,
56 | }
57 | DEFAULT_PARTS = ['hits', 'cells', 'particles', 'truth']
58 |
59 | def _load_event_data(prefix, name):
60 |     """Load per-event data for one single type, e.g. hits or particles.
61 |     """
62 |     # csv files can be individually zipped with extension .csv.gz
63 |     expr = '{!s}-{}.csv*'.format(prefix, name)
64 |     files = glob.glob(expr)
65 |     dtype = DTYPES[name]
66 |     if len(files) == 1:
67 |         return pandas.read_csv(files[0], header=0, index_col=False, dtype=dtype)
68 |     elif len(files) == 0:
69 |         raise Exception('No file matches \'{}\''.format(expr))
70 |     else:
71 |         raise Exception('More than one file matches \'{}\''.format(expr))
72 |
73 | def load_event_hits(prefix):
74 |     """Load the hits information for a single event with the given prefix.
75 |     """
76 |     return _load_event_data(prefix, 'hits')
77 |
78 | def load_event_cells(prefix):
79 |     """Load the hit cells information for a single event with the given prefix.
80 |     """
81 |     return _load_event_data(prefix, 'cells')
82 |
83 | def load_event_particles(prefix):
84 |     """Load the particles information for a single event with the given prefix.
85 |     """
86 |     return _load_event_data(prefix, 'particles')
87 |
88 | def load_event_truth(prefix):
89 |     """Load only the truth information for a single event with the given prefix.
90 |     """
91 |     return _load_event_data(prefix, 'truth')
92 |
93 | def load_event(prefix, parts=DEFAULT_PARTS):
94 |     """Load data for a single event with the given prefix.
95 |
96 |     Parameters
97 |     ----------
98 |     prefix : str or pathlib.Path
99 |         The common prefix name for the event files (i.e. without `-hits.csv`).
100 |     parts : List[{'hits', 'cells', 'particles', 'truth'}], optional
101 |         Which parts of the event files to load.
102 |
103 |     Returns
104 |     -------
105 |     tuple
106 |         Contains a `pandas.DataFrame` for each element of `parts`. Each
107 |         element has field names identical to the CSV column names with
108 |         appropriate types.
109 |     """
110 |     return tuple(_load_event_data(prefix, name) for name in parts)
111 |
112 | def load_dataset(path, skip=None, nevents=None, parts=DEFAULT_PARTS):
113 |     """Provide an iterator over (all) events in a dataset.
114 |
115 |     Parameters
116 |     ----------
117 |     path : str or pathlib.Path
118 |         Path to a directory or a zip file containing event files.
119 |     skip : int, optional
120 |         Skip the first `skip` events.
121 |     nevents : int, optional
122 |         Only load a maximum of `nevents` events.
123 |     parts : List[{'hits', 'cells', 'particles', 'truth'}], optional
124 |         Which parts of each event's files to load.
125 |
126 |     Yields
127 |     ------
128 |     event_id : int
129 |         The event identifier.
130 |     *data
131 |         Event data element as specified in `parts`.
132 |     """
133 |     # extract a sorted list of event file prefixes.
134 |     def list_prefixes(files):
135 |         # Note: the file names may optionally have a directory prefix if they
136 |         # are derived from a zipfile, for example. Hence the regular expression
137 |         # can't be anchored at the beginning of the file name.
138 |         regex = re.compile(r'.*event\d{9}-[a-zA-Z]+\.csv(\.gz)?$')
139 |         files = filter(regex.match, files)
140 |         prefixes = set(_.split('-', 1)[0] for _ in files)
141 |         prefixes = sorted(prefixes)
142 |         if skip is not None:
143 |             prefixes = prefixes[skip:]
144 |         if nevents is not None:
145 |             prefixes = prefixes[:nevents]
146 |         return prefixes
147 |
148 |     # TODO use `yield from` once we increase the python requirement
149 |     if op.isdir(path):
150 |         for x in _iter_dataset_dir(path, list_prefixes(os.listdir(path)), parts):
151 |             yield x
152 |     else:
153 |         with zipfile.ZipFile(path, mode='r') as z:
154 |             for x in _iter_dataset_zip(z, list_prefixes(z.namelist()), parts):
155 |                 yield x
156 |
157 | def _extract_event_id(prefix):
158 |     """Extract event_id from prefix.
159 |
160 |     E.g. event_id=1 from `event000000001` or from `train_1/event000000001`
161 |     """
162 |     regex = r'.*event(\d+)'
163 |     groups = re.findall(regex, prefix)
164 |     return int(groups[0])
165 |
166 | def _iter_dataset_dir(directory, prefixes, parts):
167 |     """Iterate over selected event files inside a directory.
168 |     """
169 |     for p in prefixes:
170 |         yield (_extract_event_id(p),) + load_event(op.join(directory, p), parts)
171 |
172 | def _iter_dataset_zip(zipfile, prefixes, parts):
173 |     """Iterate over selected event files inside a zip archive.
174 |     """
175 |     for p in prefixes:
176 |         files = [zipfile.open('{}-{}.csv'.format(p, _), mode='r') for _ in parts]
177 |         dtypes = [DTYPES[_] for _ in parts]
178 |         data = tuple(pandas.read_csv(f, header=0, index_col=False, dtype=d)
179 |                      for f, d in zip(files, dtypes))
180 |         yield (_extract_event_id(p),) + data
--------------------------------------------------------------------------------
/trackml/randomize.py:
--------------------------------------------------------------------------------
1 | """TrackML randomized submissions from truth"""
2 |
3 | __authors__ = ['Moritz Kiehn']
4 |
5 | import pandas
6 | import numpy
7 | import numpy.random
8 |
9 | def _make_submission(hit_ids, track_ids, renumber=True):
10 |     """Create a submission DataFrame with hit_id and track_id columns.
11 |
12 |     Optionally renumbers the track_id to random small integers.
13 |     """
14 |     if renumber:
15 |         unique_ids, inverse = numpy.unique(track_ids, return_inverse=True)
16 |         numbers = numpy.arange(1, len(unique_ids) + 1, dtype=unique_ids.dtype)
17 |         numpy.random.shuffle(numbers)
18 |         track_ids = numbers[inverse]
19 |     return pandas.DataFrame({'hit_id': hit_ids, 'track_id': track_ids})
20 |
21 | def set_seed(seed):
22 |     """Set the random seed used for randomness in this module."""
23 |     numpy.random.seed(seed)
24 |
25 | def random_solution(hits, ntracks):
26 |     """Generate a completely random solution with the given number of tracks.
27 |
28 |     Parameters
29 |     ----------
30 |     hits : pandas.DataFrame
31 |         Hits information. Must contain a hit_id column.
32 |     ntracks : int
33 |         Number of tracks the submission should contain.
34 |     """
35 |     ids = numpy.random.randint(1, ntracks + 1, size=len(hits), dtype='i4')
36 |     return _make_submission(hits['hit_id'], ids, renumber=False)
37 |
38 | def drop_hits(truth, probability):
39 |     """Drop hits from each track with a certain probability.
40 |
41 |     Each dropped hit is assigned to a new particle that only contains this hit.
42 |
43 |     Parameters
44 |     ----------
45 |     truth : pandas.DataFrame
46 |         Truth mapping. Must contain hit_id and particle_id columns.
47 |     probability : float
48 |         The probability for a single hit to be dropped from the track.
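
    Returns
    -------
    pandas.DataFrame
        A submission with hit_id and track_id columns, where each dropped
        hit is assigned to its own single-hit track.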
49 | """ 50 | out = numpy.array(truth['particle_id'], copy=True) 51 | dropped_mask = (numpy.random.random_sample(len(out)) < probability) 52 | dropped_count = numpy.count_nonzero(dropped_mask) 53 | fakeid0 = numpy.max(out) + 1 54 | fakeids = numpy.arange(fakeid0, fakeid0 + dropped_count, dtype='i8') 55 | # replace masked particle ids with fakes ones 56 | numpy.place(out, dropped_mask, fakeids) 57 | return _make_submission(truth['hit_id'], out) 58 | 59 | def shuffle_hits(truth, probability): 60 | """Randomly assign hits to a wrong particle with a certain probability. 61 | 62 | Parameters 63 | ---------- 64 | truth : pandas.DataFrame 65 | Truth mapping must contain hit_id and particle_id columns. 66 | probability : float 67 | The probability for a single hit to be reassigned to a different track. 68 | """ 69 | out = numpy.array(truth['particle_id'], copy=True) 70 | shuffled_mask = (numpy.random.random_sample(len(out)) < probability) 71 | shuffled_count = numpy.count_nonzero(shuffled_mask) 72 | wrongparticles = numpy.random.choice(numpy.unique(out), size=shuffled_count) 73 | # replace masked particle ids with random valid ids 74 | numpy.place(out, shuffled_mask, wrongparticles) 75 | return _make_submission(truth['hit_id'], out) 76 | -------------------------------------------------------------------------------- /trackml/score.py: -------------------------------------------------------------------------------- 1 | """TrackML scoring metric""" 2 | 3 | __authors__ = ['Sabrina Amrouche', 'David Rousseau', 'Moritz Kiehn', 4 | 'Ilija Vukotic'] 5 | 6 | import numpy 7 | import pandas 8 | 9 | def _analyze_tracks(truth, submission): 10 | """Compute the majority particle, hit counts, and weight for each track. 11 | 12 | Parameters 13 | ---------- 14 | truth : pandas.DataFrame 15 | Truth information. Must have hit_id, particle_id, and weight columns. 16 | submission : pandas.DataFrame 17 | Proposed hit/track association. Must have hit_id and track_id columns. 18 | 19 | Returns 20 | ------- 21 | pandas.DataFrame 22 | Contains track_id, nhits, major_particle_id, major_particle_nhits, 23 | major_nhits, and major_weight columns. 
24 | """ 25 | # true number of hits for each particle_id 26 | particles_nhits = truth['particle_id'].value_counts(sort=False) 27 | total_weight = truth['weight'].sum() 28 | # combined event with minimal reconstructed and truth information 29 | event = pandas.merge(truth[['hit_id', 'particle_id', 'weight']], 30 | submission[['hit_id', 'track_id']], 31 | on=['hit_id'], how='left', validate='one_to_one') 32 | event.drop('hit_id', axis=1, inplace=True) 33 | event.sort_values(by=['track_id', 'particle_id'], inplace=True) 34 | 35 | # ASSUMPTIONs: 0 <= track_id, 0 <= particle_id 36 | 37 | tracks = [] 38 | # running sum for the reconstructed track we are currently in 39 | rec_track_id = -1 40 | rec_nhits = 0 41 | # running sum for the particle we are currently in (in this track_id) 42 | cur_particle_id = -1 43 | cur_nhits = 0 44 | cur_weight = 0 45 | # majority particle with most hits up to now (in this track_id) 46 | maj_particle_id = -1 47 | maj_nhits = 0 48 | maj_weight = 0 49 | 50 | for hit in event.itertuples(index=False): 51 | # we reached the next track so we need to finish the current one 52 | if (rec_track_id != -1) and (rec_track_id != hit.track_id): 53 | # could be that the current particle is the majority one 54 | if maj_nhits < cur_nhits: 55 | maj_particle_id = cur_particle_id 56 | maj_nhits = cur_nhits 57 | maj_weight = cur_weight 58 | # store values for this track 59 | tracks.append((rec_track_id, rec_nhits, maj_particle_id, 60 | particles_nhits[maj_particle_id], maj_nhits, 61 | maj_weight / total_weight)) 62 | 63 | # setup running values for next track (or first) 64 | if rec_track_id != hit.track_id: 65 | rec_track_id = hit.track_id 66 | rec_nhits = 1 67 | cur_particle_id = hit.particle_id 68 | cur_nhits = 1 69 | cur_weight = hit.weight 70 | maj_particle_id = -1 71 | maj_nhits = 0 72 | maj_weights = 0 73 | continue 74 | 75 | # hit is part of the current reconstructed track 76 | rec_nhits += 1 77 | 78 | # reached new particle within the same reconstructed track 79 | if cur_particle_id != hit.particle_id: 80 | # check if last particle has more hits than the majority one 81 | # if yes, set the last particle as the new majority particle 82 | if maj_nhits < cur_nhits: 83 | maj_particle_id = cur_particle_id 84 | maj_nhits = cur_nhits 85 | maj_weight = cur_weight 86 | # reset runnig values for current particle 87 | cur_particle_id = hit.particle_id 88 | cur_nhits = 1 89 | cur_weight = hit.weight 90 | # hit belongs to the same particle within the same reconstructed track 91 | else: 92 | cur_nhits += 1 93 | cur_weight += hit.weight 94 | 95 | # last track is not handled inside the loop 96 | if maj_nhits < cur_nhits: 97 | maj_particle_id = cur_particle_id 98 | maj_nhits = cur_nhits 99 | maj_weight = cur_weight 100 | # store values for the last track 101 | tracks.append((rec_track_id, rec_nhits, maj_particle_id, 102 | particles_nhits[maj_particle_id], maj_nhits, maj_weight / total_weight)) 103 | 104 | cols = ['track_id', 'nhits', 105 | 'major_particle_id', 'major_particle_nhits', 106 | 'major_nhits', 'major_weight'] 107 | return pandas.DataFrame.from_records(tracks, columns=cols) 108 | 109 | def score_event(truth, submission): 110 | """Compute the TrackML event score for a single event. 111 | 112 | Parameters 113 | ---------- 114 | truth : pandas.DataFrame 115 | Truth information. Must have hit_id, particle_id, and weight columns. 116 | submission : pandas.DataFrame 117 | Proposed hit/track association. Must have hit_id and track_id columns. 
118 | """ 119 | tracks = _analyze_tracks(truth, submission) 120 | purity_rec = numpy.true_divide(tracks['major_nhits'], tracks['nhits']) 121 | purity_maj = numpy.true_divide(tracks['major_nhits'], tracks['major_particle_nhits']) 122 | good_track = (0.5 < purity_rec) & (0.5 < purity_maj) 123 | return tracks['major_weight'][good_track].sum() 124 | -------------------------------------------------------------------------------- /trackml/utils.py: -------------------------------------------------------------------------------- 1 | """TrackML utility functions""" 2 | 3 | __authors__ = ['Moritz Kiehn'] 4 | 5 | import numpy as np 6 | 7 | def add_position_quantities(data, prefix=''): 8 | """Add derived position quantities rho, phi, and r. 9 | """ 10 | x = data['{}x'.format(prefix)] 11 | y = data['{}y'.format(prefix)] 12 | z = data['{}z'.format(prefix)] 13 | t = np.hypot(x, y) 14 | data['{}rho'.format(prefix)] = t 15 | data['{}phi'.format(prefix)] = np.arctan2(y, x) 16 | data['{}r'.format(prefix)] = np.hypot(t, z) 17 | return data 18 | 19 | def add_momentum_quantities(data, prefix=''): 20 | """Add derived momentum quantities pt, pphi, peta, p. 21 | """ 22 | px = data['{}px'.format(prefix)] 23 | py = data['{}py'.format(prefix)] 24 | pz = data['{}pz'.format(prefix)] 25 | pt = np.hypot(px, py) 26 | p = np.hypot(pt, pz) 27 | data['{}pt'.format(prefix)] = pt 28 | data['{}pphi'.format(prefix)] = np.arctan2(py, px) 29 | data['{}peta'.format(prefix)] = np.arctanh(pz / p) 30 | data['{}p'.format(prefix)] = p 31 | return data 32 | 33 | def decode_particle_id(data): 34 | """Decode particle_id into vertex id, generation, etc. 35 | """ 36 | components = [ 37 | ('vertex_id', 0xfff0000000000000, 13 * 4), 38 | ('primary_id', 0x000ffff000000000, 9 * 4), 39 | ('generation', 0x0000000fff000000, 6 * 4), 40 | ('secondary_id', 0x0000000000fff000, 3 * 4), 41 | ('process', 0x0000000000000fff, 0), 42 | ] 43 | pid = data['particle_id'].values.astype('u8') 44 | for name, mask, shift in components: 45 | data[name] = (pid & mask) >> shift 46 | return data 47 | -------------------------------------------------------------------------------- /trackml/weights.py: -------------------------------------------------------------------------------- 1 | """TrackML metric weight calculation""" 2 | 3 | from __future__ import print_function 4 | 5 | __authors__ = ['Moritz Kiehn'] 6 | 7 | import math 8 | 9 | import numpy 10 | import pandas 11 | 12 | from .utils import decode_particle_id 13 | 14 | def _compute_order_weight_matrix(proposal, min_hits, max_hits): 15 | """Compute the hit order weight matrix. 16 | 17 | Returns 18 | ------- 19 | numpy.ndarray 20 | Weight matrix indexed by (nhits, ihit), i.e. the total number of 21 | hits in the tracks and the hit index. 22 | """ 23 | w = numpy.zeros((max_hits + 1, max_hits)) 24 | for nhits in range(min_hits, max_hits + 1): 25 | # scale proposal weights to the number of hits on track 26 | supports = numpy.arange(len(proposal)) * (nhits - 1) / (len(proposal) - 1) 27 | # compute normalized weights so that a full track has a sum of 1 28 | weights = numpy.interp(numpy.arange(nhits), supports, proposal) 29 | weights /= weights.sum() 30 | w[nhits, :nhits] = weights 31 | return w 32 | 33 | ORDER_PROPOSAL = [10., 8., 6., 5., 3., 3., 3., 5., 6.] 
34 | ORDER_MIN_HITS = 4
35 | ORDER_MAX_HITS = 20
36 | ORDER_MATRIX = _compute_order_weight_matrix(ORDER_PROPOSAL, ORDER_MIN_HITS, ORDER_MAX_HITS)
37 |
38 | def print_order_weight_matrix(prefix=''):
39 |     print(prefix, 'order weight matrix (weights in percent):', sep='')
40 |     print(prefix, 'nhits | ihit', sep='')
41 |     print(prefix, '      |', sep='', end='')
42 |     for i in range(len(ORDER_MATRIX[1:][0])):
43 |         print(' {:2d}'.format(i), end='')
44 |     print()
45 |     print(prefix, '------+' + len(ORDER_MATRIX[1:][0]) * 3 * '-', sep='')
46 |     for nhits, row in enumerate(ORDER_MATRIX[1:], start=1):
47 |         print(prefix, ' {: 3d}  |'.format(nhits), sep='', end='')
48 |         for ihit in range(nhits):
49 |             print(' {:2.0f}'.format(100 * row[ihit]), end='')
50 |         print()
51 |
52 | def weight_order(args):
53 |     """Return the weight due to the hit order along the track.
54 |     """
55 |     ihit, nhits = args
56 |     if nhits < ORDER_MIN_HITS:
57 |         return 0.
58 |     if ORDER_MAX_HITS < nhits:
59 |         nhits = ORDER_MAX_HITS
60 |     if ORDER_MAX_HITS <= ihit:
61 |         print("warning: long true track ihit", ihit, "proceeding with weight zero.")
62 |         return 0.
63 |     if nhits <= ihit:
64 |         raise Exception("hit index ", ihit, " is larger than total number of hits ", nhits)
65 |     if nhits < 0:
66 |         raise Exception("total number of hits ", nhits, " is below zero")
67 |     if ihit < 0:
68 |         raise Exception("hit index ", ihit, " is below zero")
69 |     return ORDER_MATRIX[nhits, ihit]
70 |
71 | def weight_pt(pt, pt_inf=0.5, pt_sup=3, w_min=0.2, w_max=1.):
72 |     """Return the transverse momentum dependent hit weight.
73 |     """
74 |     # lower cut just to be sure, should not happen except maybe for noise hits
75 |     xp = [min(0.05, pt_inf), pt_inf, pt_sup]
76 |     fp = [w_min, w_min, w_max]
77 |     return numpy.interp(pt, xp, fp, left=0.0, right=w_max)
78 |
79 | # particle id for noise hits
80 | INVALID_PARTICLE_ID = 0
81 |
82 | def weight_hits_phase1(truth, particles):
83 |     """Compute per-hit weights for the phase 1 scoring metric.
84 |
85 |     Hits w/ invalid particle ids, e.g. noise hits, have zero weight.
86 |
87 |     Parameters
88 |     ----------
89 |     truth : pandas.DataFrame
90 |         Truth information. Must have hit_id, particle_id, and tz columns.
91 |     particles : pandas.DataFrame
92 |         Particle information. Must have particle_id, vz, px, py, and nhits
93 |         columns.
94 |
95 |     Returns
96 |     -------
97 |     pandas.DataFrame
98 |         `truth` augmented with additional columns: particle_nhits, ihit,
99 |         weight_order, weight_pt, and weight.
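        The weight column is normalized so that it sums to one over the
        whole event.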
100 | """ 101 | # fill selected per-particle information for each hit 102 | selected = pandas.DataFrame({ 103 | 'particle_id': particles['particle_id'], 104 | 'particle_vz': particles['vz'], 105 | 'particle_nhits': particles['nhits'], 106 | 'weight_pt': weight_pt(numpy.hypot(particles['px'], particles['py'])), 107 | }) 108 | combined = pandas.merge(truth, selected, 109 | how='left', on=['particle_id'], 110 | validate='many_to_one') 111 | 112 | # fix pt weight for hits w/o associated particle 113 | combined['weight_pt'].fillna(0.0, inplace=True) 114 | # fix nhits for hits w/o associated particle 115 | combined['particle_nhits'].fillna(0.0, inplace=True) 116 | combined['particle_nhits'] = combined['particle_nhits'].astype('i4') 117 | # compute hit count and order using absolute distance from particle vertex 118 | combined['abs_dvz'] = numpy.absolute(combined['tz'] - combined['particle_vz']) 119 | combined['ihit'] = combined.groupby('particle_id')['abs_dvz'].rank().transform(lambda x: x - 1).fillna(0.0).astype('i4') 120 | # compute order-dependent weight 121 | combined['weight_order'] = combined[['ihit', 'particle_nhits']].apply(weight_order, axis=1) 122 | 123 | # compute combined weight normalized to 1 124 | w = combined['weight_pt'] * combined['weight_order'] 125 | w /= w.sum() 126 | combined['weight'] = w 127 | 128 | # return w/o intermediate columns 129 | return combined.drop(columns=['particle_vz', 'abs_dvz']) 130 | 131 | def weight_hits_phase2(truth, particles): 132 | """Compute per-hit weights for the phase 2 scoring metric. 133 | 134 | This is the phase 1 metric with an additional particle preselection, i.e. 135 | only a subset of the particles have a non-zero score. 136 | 137 | Parameters 138 | ---------- 139 | truth : pandas.DataFrame 140 | Truth information. Must have hit_id, particle_id, and tz columns. 141 | particles : pandas.DataFrame 142 | Particle information. Must have particle_id, vz, px, py, and nhits 143 | columns. 144 | 145 | Returns 146 | ------- 147 | pandas.DataFrame 148 | `truth` augmented with additional columns: particle_nhits, ihit, 149 | weight_order, weight_pt, and weight. 
150 | """ 151 | # fill selected per-particle information for each hit 152 | selected = pandas.DataFrame({ 153 | 'particle_id': particles['particle_id'], 154 | 'particle_vz': particles['vz'], 155 | 'particle_nhits': particles['nhits'], 156 | 'weight_pt': weight_pt(numpy.hypot(particles['px'], particles['py'])), 157 | }) 158 | selected = decode_particle_id(selected) 159 | combined = pandas.merge(truth, selected, 160 | how='left', on=['particle_id'], 161 | validate='many_to_one') 162 | 163 | # fix pt weight for hits w/o associated particle 164 | combined['weight_pt'].fillna(0.0, inplace=True) 165 | # fix nhits for hits w/o associated particle 166 | combined['particle_nhits'].fillna(0.0, inplace=True) 167 | combined['particle_nhits'] = combined['particle_nhits'].astype('i4') 168 | 169 | # compute hit count and order using absolute distance from particle vertex 170 | combined['abs_dvz'] = numpy.absolute(combined['tz'] - combined['particle_vz']) 171 | combined['ihit'] = combined.groupby('particle_id')['abs_dvz'].rank().transform(lambda x: x - 1).fillna(0.0).astype('i4') 172 | # compute order-dependent weight 173 | combined['weight_order'] = combined[['ihit', 'particle_nhits']].apply(weight_order, axis=1) 174 | 175 | # compute normalized combined weight w/ extra particle selection 176 | weight = combined['weight_pt'] * combined['weight_order'] 177 | weight[combined['generation'] != 0] = 0 178 | weight /= weight.sum() 179 | # normalize total event weight 180 | combined['weight'] = weight 181 | 182 | # return w/o intermediate columns 183 | return combined.drop(columns=['particle_vz', 'abs_dvz']) 184 | --------------------------------------------------------------------------------