├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── setup.cfg ├── setup.py └── trackml ├── __init__.py ├── dataset.py ├── randomize.py ├── score.py ├── utils.py └── weights.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | .static_storage/ 57 | .media/ 58 | local_settings.py 59 | 60 | # Flask stuff: 61 | instance/ 62 | .webassets-cache 63 | 64 | # Scrapy stuff: 65 | .scrapy 66 | 67 | # Sphinx documentation 68 | docs/_build/ 69 | 70 | # PyBuilder 71 | target/ 72 | 73 | # Jupyter Notebook 74 | .ipynb_checkpoints 75 | 76 | # pyenv 77 | .python-version 78 | 79 | # celery beat schedule file 80 | celerybeat-schedule 81 | 82 | # SageMath parsed files 83 | *.sage.py 84 | 85 | # Environments 86 | .env 87 | .venv 88 | env/ 89 | venv/ 90 | ENV/ 91 | env.bak/ 92 | venv.bak/ 93 | 94 | # Spyder project settings 95 | .spyderproject 96 | .spyproject 97 | 98 | # Rope project settings 99 | .ropeproject 100 | 101 | # mkdocs documentation 102 | /site 103 | 104 | # mypy 105 | .mypy_cache/ 106 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2018 Moritz Kiehn 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 
20 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | include README.md
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | TrackML utility library
2 | =======================
3 |
4 | A python library to simplify working with the
5 | [High Energy Physics Tracking Machine Learning challenge][trackml]
6 | dataset. It can be used for the datasets of both the
7 | [accuracy phase][trackml_kaggle] and the
8 | [throughput phase][trackml_codalab].
9 |
10 | Installation
11 | ------------
12 |
13 | The package can be installed as a user package via
14 |
15 |     pip install --user path/to/repository
16 |
17 | To make a local checkout of the repository available directly, it can also be
18 | installed in development mode
19 |
20 |     pip install --user --editable path/to/repository
21 |
22 | In both cases, the package can be imported via `import trackml` without
23 | additional configuration. In the latter case, changes made to the code are
24 | immediately visible without having to reinstall the package.
25 |
26 | Usage
27 | -----
28 |
29 | To read the data for one event from the training dataset including the ground
30 | truth information:
31 |
32 | ```python
33 | from trackml.dataset import load_event
34 |
35 | hits, cells, particles, truth = load_event('path/to/event000000123')
36 | ```
37 |
38 | For the test dataset only the hit information is available. To read only this
39 | data:
40 |
41 | ```python
42 | from trackml.dataset import load_event
43 |
44 | hits, cells = load_event('path/to/event000000456', parts=['hits', 'cells'])
45 | ```
46 |
47 | To iterate over events in a dataset:
48 |
49 | ```python
50 | from trackml.dataset import load_dataset
51 |
52 | for event_id, hits, cells, particles, truth in load_dataset('path/to/dataset'):
53 |     ...
54 | ```
55 |
56 | To read a single event and compute additional columns derived from the
57 | stored data:
58 |
59 | ```python
60 | from trackml.dataset import load_event
61 | from trackml.utils import add_position_quantities, add_momentum_quantities, decode_particle_id
62 |
63 | # get the particles data (note the trailing comma: load_event returns a tuple)
64 | particles, = load_event('path/to/event000000123', parts=['particles'])
65 | # decode particle id into vertex id, generation, etc.
66 | particles = decode_particle_id(particles)
67 | # add vertex rho, phi, r
68 | particles = add_position_quantities(particles, prefix='v')
69 | # add momentum eta, p, pt
70 | particles = add_momentum_quantities(particles)
71 | ```
72 |
73 | The dataset path can be the path to a directory or to a zip file containing the
74 | events' `.csv` files. Each event is lazily loaded during the iteration. Options
75 | are available to read only a subset of the available events or only selected
76 | parts, e.g. only hits or only particles.
77 |
78 | To generate a random test submission from truth information and compute the
79 | expected score:
80 |
81 | ```python
82 | from trackml.randomize import shuffle_hits
83 | from trackml.score import score_event
84 |
85 | shuffled = shuffle_hits(truth, 0.05)  # 5% probability to reassign a hit
86 | score = score_event(truth, shuffled)
87 | ```
88 |
89 | All methods either take or return `pandas.DataFrame` objects. You can have a
90 | look at the function docstrings for detailed information.
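As a minimal end-to-end sketch, the snippets above can be combined to score
deliberately degraded submissions over the first few events of a dataset (the
dataset path is a placeholder, and the event count and 10% shuffle probability
are arbitrary):

```python
from trackml.dataset import load_dataset
from trackml.randomize import shuffle_hits
from trackml.score import score_event

scores = []
for event_id, hits, cells, particles, truth in load_dataset('path/to/dataset', nevents=3):
    # degrade the ground truth: each hit is reassigned with 10% probability
    submission = shuffle_hits(truth, 0.10)
    scores.append(score_event(truth, submission))

print('mean score over {} events: {:.3f}'.format(len(scores), sum(scores) / len(scores)))
```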
91 |
92 | Authors
93 | -------
94 |
95 | The library was written by
96 |
97 | * Moritz Kiehn
98 |
99 | with contributions from
100 |
101 | * Sabrina Amrouche
102 | * David Rousseau
103 | * Ilija Vukotic
104 | * Nimar Arora
105 | * Jon Nordby
106 | * Yerkebulan Berdibekov
107 | * Victor Estrade
108 |
109 | License
110 | -------
111 |
112 | All code is licensed under the [MIT license][mit_license].
113 |
114 | Dataset
115 | -------
116 |
117 | A dataset comprises multiple independent events, where each event
118 | contains simulated measurements (essentially 3D points) of particles
119 | generated in a collision between proton bunches at the
120 | [Large Hadron Collider][lhc] at [CERN][cern]. The goal of the tracking
121 | machine learning challenge is to group the recorded measurements or hits
122 | for each event into tracks, sets of hits that belong to the same initial
123 | particle. A solution must uniquely associate each hit to one track. The
124 | training dataset contains the recorded hits, their ground truth
125 | counterpart and their association to particles, and the initial
126 | parameters of those particles. The test dataset contains only the
127 | recorded hits.
128 |
129 | Each dataset is usually provided as a single archive file. Once
130 | unzipped, the dataset comprises a set of `.csv[.gz]` files. Each event
131 | can have up to four associated files that contain hits, hit cells,
132 | particles, and the ground truth association between them. The common
133 | prefix, e.g. `event000000010`, is always `event` followed by 9 digits.
134 |
135 |     event000000000-hits.csv
136 |     event000000000-cells.csv
137 |     event000000000-particles.csv
138 |     event000000000-truth.csv
139 |     event000000001-hits.csv
140 |     event000000001-cells.csv
141 |     event000000001-particles.csv
142 |     event000000001-truth.csv
143 |
144 | Submissions must be provided as a single `.csv` file for the whole
145 | dataset, e.g.
146 |
147 |     submission-test.csv
148 |     submission-final.csv
149 |
150 | ### Event hits
151 |
152 | The hits file contains the following values for each hit/entry:
153 |
154 | * **hit_id**: numerical identifier of the hit inside the event.
155 | * **x, y, z**: measured x, y, z position (in millimeters) of the hit in
156 |   global coordinates.
157 | * **volume_id**: numerical identifier of the detector group.
158 | * **layer_id**: numerical identifier of the detector layer inside the
159 |   group.
160 | * **module_id**: numerical identifier of the detector module inside
161 |   the layer.
162 |
163 | The volume/layer/module id could in principle be deduced from x, y, z. They
164 | are given here to simplify detector-specific data handling.
165 |
166 | ### Event truth
167 |
168 | The truth file contains the mapping between hits and generating particles and
169 | the true particle state at each measured hit. Each entry maps one hit to one
170 | particle.
171 |
172 | * **hit_id**: numerical identifier of the hit as defined in the hits file.
173 | * **particle_id**: numerical identifier of the generating particle as defined
174 |   in the particles file. A value of 0 means that the hit did not originate
175 |   from a reconstructible particle, but e.g. from detector noise.
176 | * **tx, ty, tz**: true intersection point in global coordinates (in
177 |   millimeters) between the particle trajectory and the sensitive surface.
178 | * **tpx, tpy, tpz**: true particle momentum (in GeV/c) in the global
179 |   coordinate system at the intersection point.
The corresponding vector
180 |   is tangent to the particle trajectory at the intersection point.
181 | * **weight**: per-hit weight used for the scoring metric; the sum of all
182 |   weights within one event equals one.
183 |
184 | ### Event particles
185 |
186 | The particles file contains the following values for each particle/entry:
187 |
188 | * **particle_id**: numerical identifier of the particle inside the event.
189 | * **particle_type**: numerical identifier of the particle type; the
190 |   [Particle Data Group Monte Carlo numbering scheme][pdg_mc_numbering]
191 |   is used to identify specific particle types.
192 | * **vx, vy, vz**: initial position or vertex (in millimeters) in global
193 |   coordinates.
194 | * **px, py, pz**: initial momentum (in GeV/c) along each global axis.
195 | * **q**: particle charge (as multiple of the absolute electron charge).
196 | * **nhits**: number of hits generated by this particle.
197 |
198 | All entries contain the generated information or ground truth.
199 |
200 | ### Event hit cells
201 |
202 | The cells file contains the constituent active detector cells that comprise each
203 | hit. The cells can be used to refine the hit-to-track association.
204 | A cell is the smallest granularity inside each detector module, much like a
205 | pixel on a screen, except that depending on the **volume_id** a cell can be a square
206 | or a long rectangle. It is identified by two channel identifiers that are unique
207 | within each detector module and encode the position, much like column/row
208 | numbers of a matrix. A cell can provide signal information that the detector
209 | module has recorded in addition to the position. Depending on the detector type,
210 | only one of the channel identifiers may be valid, e.g. for the strip detectors,
211 | and the signal value may have a different resolution.
212 |
213 | * **hit_id**: numerical identifier of the hit as defined in the hits file.
214 | * **ch0, ch1**: channel identifier/coordinates unique within one module.
215 | * **value**: signal value information, e.g. how much charge a particle has
216 |   deposited.
217 |
218 | ### Dataset submission information
219 |
220 | The submission file must associate each hit in each event to one and only one
221 | reconstructed particle track. The reconstructed tracks must be uniquely
222 | identified only within each event.
223 |
224 | * **event_id**: numerical identifier of the event; corresponds to the number
225 |   found in the per-event file name prefix.
226 | * **hit_id**: numerical identifier of the hit inside the event as defined in
227 |   the per-event hits file.
228 | * **track_id**: user-defined numerical identifier (non-negative integer) of
229 |   the track.
230 |
231 | ### Additional detector geometry information
232 |
233 | The detector is built from silicon slabs (or modules, rectangular or
234 | trapezoidal), arranged in cylinders and disks, which measure the position (or
235 | hits) of the particles that cross them. The detector modules are organized
236 | into detector groups or volumes identified by a **volume_id**. Inside a volume they
237 | are further grouped into layers identified by a **layer_id**. Each layer can contain
238 | an arbitrary number of detector modules, the smallest geometrically distinct
239 | detector object, each identified by a **module_id**. Within each group, detector
240 | modules are of the same type and have, e.g., the same granularity.
All simulated
241 | detector modules are so-called semiconductor sensors that are built from thin
242 | silicon sensor chips. Each module can be represented by a two-dimensional,
243 | planar, bounded sensitive surface. These sensitive surfaces are subdivided into
244 | regular grids that define the detector cells, the smallest granularity within
245 | the detector.
246 |
247 | Each module has a different position and orientation described in the detectors
248 | file. A local, right-handed coordinate system is defined on each sensitive
249 | surface such that the first two coordinates u and v are on the sensitive surface
250 | and the third coordinate w is normal to the surface. The orientation and
251 | position are defined by the following transformation
252 |
253 |     pos_xyz = rotation_matrix * pos_uvw + offset
254 |
255 | that transforms a position described in local coordinates u,v,w into the
256 | equivalent position x,y,z in global coordinates using a rotation matrix and
257 | an offset.
258 |
259 | * **volume_id**: numerical identifier of the detector group.
260 | * **layer_id**: numerical identifier of the detector layer inside the
261 |   group.
262 | * **module_id**: numerical identifier of the detector module inside
263 |   the layer.
264 | * **cx, cy, cz**: position of the local origin described in the global
265 |   coordinate system (in millimeters).
266 | * **rot_xu, rot_xv, rot_xw, rot_yu, ...**: components of the rotation matrix
267 |   to rotate from local u,v,w to global x,y,z coordinates.
268 | * **module_t**: thickness of the detector module (in millimeters).
269 | * **module_minhu, module_maxhu**: the minimum/maximum half-length of the
270 |   module boundary along the local u direction (in millimeters).
271 | * **module_hv**: the half-length of the module boundary along the local v
272 |   direction (in millimeters).
273 | * **pitch_u, pitch_v**: the size of detector cells along the local u and v
274 |   direction (in millimeters).
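As an illustration, here is a short sketch of applying this transformation with
`numpy`, assuming the detector description has been read into a
`pandas.DataFrame` (the file path is a placeholder, and the remaining rotation
component names are assumed to follow the pattern above, i.e. `rot_yv`, ...,
`rot_zw`):

```python
import numpy as np
import pandas as pd

detectors = pd.read_csv('path/to/detectors.csv')
module = detectors.iloc[0]

# assemble the rotation matrix row by row from its stored components
rotation = np.array([
    [module['rot_xu'], module['rot_xv'], module['rot_xw']],
    [module['rot_yu'], module['rot_yv'], module['rot_yw']],
    [module['rot_zu'], module['rot_zv'], module['rot_zw']],
])
offset = np.asarray(module[['cx', 'cy', 'cz']], dtype=float)

# a local position on the sensitive surface (w=0), in millimeters
pos_uvw = np.array([0.1, -0.2, 0.0])
pos_xyz = rotation.dot(pos_uvw) + offset
```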
275 | 276 | 277 | [cern]: https://home.cern 278 | [lhc]: https://home.cern/topics/large-hadron-collider 279 | [mit_license]: http://www.opensource.org/licenses/MIT 280 | [trackml]: https://sites.google.com/site/trackmlparticle/ 281 | [trackml_kaggle]: https://www.kaggle.com/c/trackml-particle-identification 282 | [trackml_codalab]: https://competitions.codalab.org/competitions/20112 283 | [pdg_mc_numbering]: http://pdg.lbl.gov/2018/reviews/rpp2018-rev-monte-carlo-numbering.pdf 284 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [bdist_wheel] 2 | universal=1 3 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os.path as op 3 | 4 | from setuptools import setup 5 | 6 | here = op.abspath(op.dirname(__file__)) 7 | 8 | # Get the long description from the README file 9 | with io.open(op.join(here, 'README.md'), mode='rt', encoding='utf-8') as f: 10 | long_description = f.read() 11 | 12 | setup( 13 | name='trackml', 14 | version='3', 15 | description='TrackML utility library', 16 | long_description=long_description, 17 | long_description_content_type='text/markdown', 18 | url='https://github.com/LAL/trackml-library', 19 | author='Moritz Kiehn', 20 | author_email='msmk@cern.ch', 21 | classifiers=[ 22 | 'Development Status :: 5 - Production/Stable', 23 | 'Intended Audience :: Science/Research', 24 | 'Topic :: Scientific/Engineering :: Information Analysis', 25 | 'Topic :: Scientific/Engineering :: Physics', 26 | 'License :: OSI Approved :: MIT License', 27 | 'Programming Language :: Python :: 2', 28 | 'Programming Language :: Python :: 2.7', 29 | 'Programming Language :: Python :: 3', 30 | 'Programming Language :: Python :: 3.4', 31 | 'Programming Language :: Python :: 3.5', 32 | 'Programming Language :: Python :: 3.6', 33 | ], 34 | packages=['trackml'], 35 | install_requires=[ 36 | 'numpy', 37 | 'pandas>=0.21.0', 38 | ], 39 | python_requires='>=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*', 40 | ) 41 | -------------------------------------------------------------------------------- /trackml/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LAL/trackml-library/53a165e15a2c885f54c2bef1bd1ed53db6ed9648/trackml/__init__.py -------------------------------------------------------------------------------- /trackml/dataset.py: -------------------------------------------------------------------------------- 1 | """TrackML dataset loading""" 2 | 3 | __authors__ = ['Moritz Kiehn', 'Sabrina Amrouche', 'Nimar Arora'] 4 | 5 | import glob 6 | import os 7 | import os.path as op 8 | import re 9 | import zipfile 10 | 11 | import pandas 12 | 13 | CELLS_DTYPES = dict([ 14 | ('hit_id', 'i4'), 15 | ('ch0', 'i4'), 16 | ('ch1', 'i4'), 17 | ('value', 'f4'), 18 | ]) 19 | HITS_DTYPES = dict([ 20 | ('hit_id', 'i4'), 21 | ('x', 'f4'), 22 | ('y', 'f4'), 23 | ('z','f4'), 24 | ('volume_id', 'i4'), 25 | ('layer_id', 'i4'), 26 | ('module_id', 'i4'), 27 | ]) 28 | PARTICLES_DTYPES = dict([ 29 | ('particle_id', 'i8'), 30 | ('particle_type', 'i4'), 31 | ('vx', 'f4'), 32 | ('vy', 'f4'), 33 | ('vz', 'f4'), 34 | ('px', 'f4'), 35 | ('py', 'f4'), 36 | ('pz', 'f4'), 37 | ('q', 'i4'), 38 | ('nhits', 'i4'), 39 | ]) 40 | TRUTH_DTYPES = dict([ 41 | ('hit_id', 'i4'), 42 | ('particle_id', 'i8'), 43 
|     ('tx', 'f4'),
44 |     ('ty', 'f4'),
45 |     ('tz', 'f4'),
46 |     ('tpx', 'f4'),
47 |     ('tpy', 'f4'),
48 |     ('tpz', 'f4'),
49 |     ('weight', 'f4'),
50 | ])
51 | DTYPES = {
52 |     'cells': CELLS_DTYPES,
53 |     'hits': HITS_DTYPES,
54 |     'particles': PARTICLES_DTYPES,
55 |     'truth': TRUTH_DTYPES,
56 | }
57 | DEFAULT_PARTS = ['hits', 'cells', 'particles', 'truth']
58 |
59 | def _load_event_data(prefix, name):
60 |     """Load per-event data for one single type, e.g. hits or particles.
61 |     """
62 |     # csv files can be individually zipped with extension .csv.gz
63 |     expr = '{!s}-{}.csv*'.format(prefix, name)
64 |     files = glob.glob(expr)
65 |     dtype = DTYPES[name]
66 |     if len(files) == 1:
67 |         return pandas.read_csv(files[0], header=0, index_col=False, dtype=dtype)
68 |     elif len(files) == 0:
69 |         raise Exception('No file matches \'{}\''.format(expr))
70 |     else:
71 |         raise Exception('More than one file matches \'{}\''.format(expr))
72 |
73 | def load_event_hits(prefix):
74 |     """Load the hits information for a single event with the given prefix.
75 |     """
76 |     return _load_event_data(prefix, 'hits')
77 |
78 | def load_event_cells(prefix):
79 |     """Load the hit cells information for a single event with the given prefix.
80 |     """
81 |     return _load_event_data(prefix, 'cells')
82 |
83 | def load_event_particles(prefix):
84 |     """Load the particles information for a single event with the given prefix.
85 |     """
86 |     return _load_event_data(prefix, 'particles')
87 |
88 | def load_event_truth(prefix):
89 |     """Load only the truth information for a single event with the given prefix.
90 |     """
91 |     return _load_event_data(prefix, 'truth')
92 |
93 | def load_event(prefix, parts=DEFAULT_PARTS):
94 |     """Load data for a single event with the given prefix.
95 |
96 |     Parameters
97 |     ----------
98 |     prefix : str or pathlib.Path
99 |         The common prefix name for the event files (i.e. without `-hits.csv`).
100 |     parts : List[{'hits', 'cells', 'particles', 'truth'}], optional
101 |         Which parts of the event files to load.
102 |
103 |     Returns
104 |     -------
105 |     tuple
106 |         Contains a `pandas.DataFrame` for each element of `parts`. Each
107 |         element has field names identical to the CSV column names with
108 |         appropriate types.
109 |     """
110 |     return tuple(_load_event_data(prefix, name) for name in parts)
111 |
112 | def load_dataset(path, skip=None, nevents=None, parts=DEFAULT_PARTS):
113 |     """Provide an iterator over (all) events in a dataset.
114 |
115 |     Parameters
116 |     ----------
117 |     path : str or pathlib.Path
118 |         Path to a directory or a zip file containing event files.
119 |     skip : int, optional
120 |         Skip the first `skip` events.
121 |     nevents : int, optional
122 |         Only load a maximum of `nevents` events.
123 |     parts : List[{'hits', 'cells', 'particles', 'truth'}], optional
124 |         Which parts of each event's files to load.
125 |
126 |     Yields
127 |     ------
128 |     event_id : int
129 |         The event identifier.
130 |     *data
131 |         Event data element as specified in `parts`.
132 |     """
133 |     # extract a sorted list of event file prefixes.
134 |     def list_prefixes(files):
135 |         # Note: the file names may optionally have a directory prefix if they
136 |         # are derived from a zipfile, for example. Hence the regular expression
137 |         # can't be anchored at the beginning of the file name.
138 |         regex = re.compile(r'.*event\d{9}-[a-zA-Z]+\.csv(\.gz)?$')
139 |         files = filter(regex.match, files)
140 |         prefixes = set(_.split('-', 1)[0] for _ in files)
141 |         prefixes = sorted(prefixes)
142 |         if skip is not None:
143 |             prefixes = prefixes[skip:]
144 |         if nevents is not None:
145 |             prefixes = prefixes[:nevents]
146 |         return prefixes
147 |
148 |     # TODO use `yield from` once we increase the python requirement
149 |     if op.isdir(path):
150 |         for x in _iter_dataset_dir(path, list_prefixes(os.listdir(path)), parts):
151 |             yield x
152 |     else:
153 |         with zipfile.ZipFile(path, mode='r') as z:
154 |             for x in _iter_dataset_zip(z, list_prefixes(z.namelist()), parts):
155 |                 yield x
156 |
157 | def _extract_event_id(prefix):
158 |     """Extract event_id from prefix.
159 |
160 |     E.g. event_id=1 from `event000000001` or from `train_1/event000000001`
161 |     """
162 |     regex = r'.*event(\d+)'
163 |     groups = re.findall(regex, prefix)
164 |     return int(groups[0])
165 |
166 | def _iter_dataset_dir(directory, prefixes, parts):
167 |     """Iterate over selected event files inside a directory.
168 |     """
169 |     for p in prefixes:
170 |         yield (_extract_event_id(p),) + load_event(op.join(directory, p), parts)
171 |
172 | def _iter_dataset_zip(zipfile, prefixes, parts):
173 |     """Iterate over selected event files inside a zip archive.
174 |     """
175 |     for p in prefixes:
176 |         files = [zipfile.open('{}-{}.csv'.format(p, _), mode='r') for _ in parts]
177 |         dtypes = [DTYPES[_] for _ in parts]
178 |         data = tuple(pandas.read_csv(f, header=0, index_col=False, dtype=d)
179 |                      for f, d in zip(files, dtypes))
180 |         yield (_extract_event_id(p),) + data
--------------------------------------------------------------------------------
/trackml/randomize.py:
--------------------------------------------------------------------------------
1 | """TrackML randomized submissions from truth"""
2 |
3 | __authors__ = ['Moritz Kiehn']
4 |
5 | import pandas
6 | import numpy
7 | import numpy.random
8 |
9 | def _make_submission(hit_ids, track_ids, renumber=True):
10 |     """Create a submission DataFrame with hit_id and track_id columns.
11 |
12 |     Optionally renumbers the track_id to random small integers.
13 |     """
14 |     if renumber:
15 |         unique_ids, inverse = numpy.unique(track_ids, return_inverse=True)
16 |         numbers = numpy.arange(1, len(unique_ids) + 1, dtype=unique_ids.dtype)
17 |         numpy.random.shuffle(numbers)
18 |         track_ids = numbers[inverse]
19 |     return pandas.DataFrame({'hit_id': hit_ids, 'track_id': track_ids})
20 |
21 | def set_seed(seed):
22 |     """Set the random seed used for randomness in this module."""
23 |     numpy.random.seed(seed)
24 |
25 | def random_solution(hits, ntracks):
26 |     """Generate a completely random solution with the given number of tracks.
27 |
28 |     Parameters
29 |     ----------
30 |     hits : pandas.DataFrame
31 |         Hits information. Must contain a hit_id column.
32 |     ntracks : int
33 |         Number of tracks the submission should contain.
34 |     """
35 |     ids = numpy.random.randint(1, ntracks + 1, size=len(hits), dtype='i4')
36 |     return _make_submission(hits['hit_id'], ids, renumber=False)
37 |
38 | def drop_hits(truth, probability):
39 |     """Drop hits from each track with a certain probability.
40 |
41 |     Each dropped hit is assigned to a new particle that only contains this hit.
42 |
43 |     Parameters
44 |     ----------
45 |     truth : pandas.DataFrame
46 |         Truth mapping. Must contain hit_id and particle_id columns.
47 |     probability : float
48 |         The probability for a single hit to be dropped from the track.
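
    Returns
    -------
    pandas.DataFrame
        A submission with hit_id and track_id columns, where each dropped
        hit is assigned to its own single-hit track.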
49 | """ 50 | out = numpy.array(truth['particle_id'], copy=True) 51 | dropped_mask = (numpy.random.random_sample(len(out)) < probability) 52 | dropped_count = numpy.count_nonzero(dropped_mask) 53 | fakeid0 = numpy.max(out) + 1 54 | fakeids = numpy.arange(fakeid0, fakeid0 + dropped_count, dtype='i8') 55 | # replace masked particle ids with fakes ones 56 | numpy.place(out, dropped_mask, fakeids) 57 | return _make_submission(truth['hit_id'], out) 58 | 59 | def shuffle_hits(truth, probability): 60 | """Randomly assign hits to a wrong particle with a certain probability. 61 | 62 | Parameters 63 | ---------- 64 | truth : pandas.DataFrame 65 | Truth mapping must contain hit_id and particle_id columns. 66 | probability : float 67 | The probability for a single hit to be reassigned to a different track. 68 | """ 69 | out = numpy.array(truth['particle_id'], copy=True) 70 | shuffled_mask = (numpy.random.random_sample(len(out)) < probability) 71 | shuffled_count = numpy.count_nonzero(shuffled_mask) 72 | wrongparticles = numpy.random.choice(numpy.unique(out), size=shuffled_count) 73 | # replace masked particle ids with random valid ids 74 | numpy.place(out, shuffled_mask, wrongparticles) 75 | return _make_submission(truth['hit_id'], out) 76 | -------------------------------------------------------------------------------- /trackml/score.py: -------------------------------------------------------------------------------- 1 | """TrackML scoring metric""" 2 | 3 | __authors__ = ['Sabrina Amrouche', 'David Rousseau', 'Moritz Kiehn', 4 | 'Ilija Vukotic'] 5 | 6 | import numpy 7 | import pandas 8 | 9 | def _analyze_tracks(truth, submission): 10 | """Compute the majority particle, hit counts, and weight for each track. 11 | 12 | Parameters 13 | ---------- 14 | truth : pandas.DataFrame 15 | Truth information. Must have hit_id, particle_id, and weight columns. 16 | submission : pandas.DataFrame 17 | Proposed hit/track association. Must have hit_id and track_id columns. 18 | 19 | Returns 20 | ------- 21 | pandas.DataFrame 22 | Contains track_id, nhits, major_particle_id, major_particle_nhits, 23 | major_nhits, and major_weight columns. 
24 | """ 25 | # true number of hits for each particle_id 26 | particles_nhits = truth['particle_id'].value_counts(sort=False) 27 | total_weight = truth['weight'].sum() 28 | # combined event with minimal reconstructed and truth information 29 | event = pandas.merge(truth[['hit_id', 'particle_id', 'weight']], 30 | submission[['hit_id', 'track_id']], 31 | on=['hit_id'], how='left', validate='one_to_one') 32 | event.drop('hit_id', axis=1, inplace=True) 33 | event.sort_values(by=['track_id', 'particle_id'], inplace=True) 34 | 35 | # ASSUMPTIONs: 0 <= track_id, 0 <= particle_id 36 | 37 | tracks = [] 38 | # running sum for the reconstructed track we are currently in 39 | rec_track_id = -1 40 | rec_nhits = 0 41 | # running sum for the particle we are currently in (in this track_id) 42 | cur_particle_id = -1 43 | cur_nhits = 0 44 | cur_weight = 0 45 | # majority particle with most hits up to now (in this track_id) 46 | maj_particle_id = -1 47 | maj_nhits = 0 48 | maj_weight = 0 49 | 50 | for hit in event.itertuples(index=False): 51 | # we reached the next track so we need to finish the current one 52 | if (rec_track_id != -1) and (rec_track_id != hit.track_id): 53 | # could be that the current particle is the majority one 54 | if maj_nhits < cur_nhits: 55 | maj_particle_id = cur_particle_id 56 | maj_nhits = cur_nhits 57 | maj_weight = cur_weight 58 | # store values for this track 59 | tracks.append((rec_track_id, rec_nhits, maj_particle_id, 60 | particles_nhits[maj_particle_id], maj_nhits, 61 | maj_weight / total_weight)) 62 | 63 | # setup running values for next track (or first) 64 | if rec_track_id != hit.track_id: 65 | rec_track_id = hit.track_id 66 | rec_nhits = 1 67 | cur_particle_id = hit.particle_id 68 | cur_nhits = 1 69 | cur_weight = hit.weight 70 | maj_particle_id = -1 71 | maj_nhits = 0 72 | maj_weights = 0 73 | continue 74 | 75 | # hit is part of the current reconstructed track 76 | rec_nhits += 1 77 | 78 | # reached new particle within the same reconstructed track 79 | if cur_particle_id != hit.particle_id: 80 | # check if last particle has more hits than the majority one 81 | # if yes, set the last particle as the new majority particle 82 | if maj_nhits < cur_nhits: 83 | maj_particle_id = cur_particle_id 84 | maj_nhits = cur_nhits 85 | maj_weight = cur_weight 86 | # reset runnig values for current particle 87 | cur_particle_id = hit.particle_id 88 | cur_nhits = 1 89 | cur_weight = hit.weight 90 | # hit belongs to the same particle within the same reconstructed track 91 | else: 92 | cur_nhits += 1 93 | cur_weight += hit.weight 94 | 95 | # last track is not handled inside the loop 96 | if maj_nhits < cur_nhits: 97 | maj_particle_id = cur_particle_id 98 | maj_nhits = cur_nhits 99 | maj_weight = cur_weight 100 | # store values for the last track 101 | tracks.append((rec_track_id, rec_nhits, maj_particle_id, 102 | particles_nhits[maj_particle_id], maj_nhits, maj_weight / total_weight)) 103 | 104 | cols = ['track_id', 'nhits', 105 | 'major_particle_id', 'major_particle_nhits', 106 | 'major_nhits', 'major_weight'] 107 | return pandas.DataFrame.from_records(tracks, columns=cols) 108 | 109 | def score_event(truth, submission): 110 | """Compute the TrackML event score for a single event. 111 | 112 | Parameters 113 | ---------- 114 | truth : pandas.DataFrame 115 | Truth information. Must have hit_id, particle_id, and weight columns. 116 | submission : pandas.DataFrame 117 | Proposed hit/track association. Must have hit_id and track_id columns. 
118 | """ 119 | tracks = _analyze_tracks(truth, submission) 120 | purity_rec = numpy.true_divide(tracks['major_nhits'], tracks['nhits']) 121 | purity_maj = numpy.true_divide(tracks['major_nhits'], tracks['major_particle_nhits']) 122 | good_track = (0.5 < purity_rec) & (0.5 < purity_maj) 123 | return tracks['major_weight'][good_track].sum() 124 | -------------------------------------------------------------------------------- /trackml/utils.py: -------------------------------------------------------------------------------- 1 | """TrackML utility functions""" 2 | 3 | __authors__ = ['Moritz Kiehn'] 4 | 5 | import numpy as np 6 | 7 | def add_position_quantities(data, prefix=''): 8 | """Add derived position quantities rho, phi, and r. 9 | """ 10 | x = data['{}x'.format(prefix)] 11 | y = data['{}y'.format(prefix)] 12 | z = data['{}z'.format(prefix)] 13 | t = np.hypot(x, y) 14 | data['{}rho'.format(prefix)] = t 15 | data['{}phi'.format(prefix)] = np.arctan2(y, x) 16 | data['{}r'.format(prefix)] = np.hypot(t, z) 17 | return data 18 | 19 | def add_momentum_quantities(data, prefix=''): 20 | """Add derived momentum quantities pt, pphi, peta, p. 21 | """ 22 | px = data['{}px'.format(prefix)] 23 | py = data['{}py'.format(prefix)] 24 | pz = data['{}pz'.format(prefix)] 25 | pt = np.hypot(px, py) 26 | p = np.hypot(pt, pz) 27 | data['{}pt'.format(prefix)] = pt 28 | data['{}pphi'.format(prefix)] = np.arctan2(py, px) 29 | data['{}peta'.format(prefix)] = np.arctanh(pz / p) 30 | data['{}p'.format(prefix)] = p 31 | return data 32 | 33 | def decode_particle_id(data): 34 | """Decode particle_id into vertex id, generation, etc. 35 | """ 36 | components = [ 37 | ('vertex_id', 0xfff0000000000000, 13 * 4), 38 | ('primary_id', 0x000ffff000000000, 9 * 4), 39 | ('generation', 0x0000000fff000000, 6 * 4), 40 | ('secondary_id', 0x0000000000fff000, 3 * 4), 41 | ('process', 0x0000000000000fff, 0), 42 | ] 43 | pid = data['particle_id'].values.astype('u8') 44 | for name, mask, shift in components: 45 | data[name] = (pid & mask) >> shift 46 | return data 47 | -------------------------------------------------------------------------------- /trackml/weights.py: -------------------------------------------------------------------------------- 1 | """TrackML metric weight calculation""" 2 | 3 | from __future__ import print_function 4 | 5 | __authors__ = ['Moritz Kiehn'] 6 | 7 | import math 8 | 9 | import numpy 10 | import pandas 11 | 12 | from .utils import decode_particle_id 13 | 14 | def _compute_order_weight_matrix(proposal, min_hits, max_hits): 15 | """Compute the hit order weight matrix. 16 | 17 | Returns 18 | ------- 19 | numpy.ndarray 20 | Weight matrix indexed by (nhits, ihit), i.e. the total number of 21 | hits in the tracks and the hit index. 22 | """ 23 | w = numpy.zeros((max_hits + 1, max_hits)) 24 | for nhits in range(min_hits, max_hits + 1): 25 | # scale proposal weights to the number of hits on track 26 | supports = numpy.arange(len(proposal)) * (nhits - 1) / (len(proposal) - 1) 27 | # compute normalized weights so that a full track has a sum of 1 28 | weights = numpy.interp(numpy.arange(nhits), supports, proposal) 29 | weights /= weights.sum() 30 | w[nhits, :nhits] = weights 31 | return w 32 | 33 | ORDER_PROPOSAL = [10., 8., 6., 5., 3., 3., 3., 5., 6.] 
34 | ORDER_MIN_HITS = 4
35 | ORDER_MAX_HITS = 20
36 | ORDER_MATRIX = _compute_order_weight_matrix(ORDER_PROPOSAL, ORDER_MIN_HITS, ORDER_MAX_HITS)
37 |
38 | def print_order_weight_matrix(prefix=''):
39 |     print(prefix, 'order weight matrix (weights in percent):', sep='')
40 |     print(prefix, 'nhits | ihit', sep='')
41 |     print(prefix, '      |', sep='', end='')
42 |     for i in range(len(ORDER_MATRIX[1:][0])):
43 |         print(' {:2d}'.format(i), end='')
44 |     print()
45 |     print(prefix, '------+' + len(ORDER_MATRIX[1:][0]) * 3 * '-', sep='')
46 |     for nhits, row in enumerate(ORDER_MATRIX[1:], start=1):
47 |         print(prefix, ' {: 3d}  |'.format(nhits), sep='', end='')
48 |         for ihit in range(nhits):
49 |             print(' {:2.0f}'.format(100 * row[ihit]), end='')
50 |         print()
51 |
52 | def weight_order(args):
53 |     """Return the weight due to the hit order along the track.
54 |     """
55 |     ihit, nhits = args
56 |     if nhits < ORDER_MIN_HITS:
57 |         return 0.
58 |     if ORDER_MAX_HITS < nhits:
59 |         nhits = ORDER_MAX_HITS
60 |     if ORDER_MAX_HITS <= ihit:
61 |         print("warning: long true track ihit", ihit, "proceeding with weight zero.")
62 |         return 0.
63 |     if nhits <= ihit:
64 |         raise Exception("hit index ", ihit, " is larger than total number of hits ", nhits)
65 |     if nhits < 0:
66 |         raise Exception("total number of hits ", nhits, " is below zero")
67 |     if ihit < 0:
68 |         raise Exception("hit index ", ihit, " is below zero")
69 |     return ORDER_MATRIX[nhits, ihit]
70 |
71 | def weight_pt(pt, pt_inf=0.5, pt_sup=3, w_min=0.2, w_max=1.):
72 |     """Return the transverse momentum dependent hit weight.
73 |     """
74 |     # lower cut just to be sure, should not happen except maybe for noise hits
75 |     xp = [min(0.05, pt_inf), pt_inf, pt_sup]
76 |     fp = [w_min, w_min, w_max]
77 |     return numpy.interp(pt, xp, fp, left=0.0, right=w_max)
78 |
79 | # particle id for noise hits
80 | INVALID_PARTICLE_ID = 0
81 |
82 | def weight_hits_phase1(truth, particles):
83 |     """Compute per-hit weights for the phase 1 scoring metric.
84 |
85 |     Hits w/ invalid particle ids, e.g. noise hits, have zero weight.
86 |
87 |     Parameters
88 |     ----------
89 |     truth : pandas.DataFrame
90 |         Truth information. Must have hit_id, particle_id, and tz columns.
91 |     particles : pandas.DataFrame
92 |         Particle information. Must have particle_id, vz, px, py, and nhits
93 |         columns.
94 |
95 |     Returns
96 |     -------
97 |     pandas.DataFrame
98 |         `truth` augmented with additional columns: particle_nhits, ihit,
99 |         weight_order, weight_pt, and weight.
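        The weight column is normalized so that it sums to one over the
        whole event.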
100 | """ 101 | # fill selected per-particle information for each hit 102 | selected = pandas.DataFrame({ 103 | 'particle_id': particles['particle_id'], 104 | 'particle_vz': particles['vz'], 105 | 'particle_nhits': particles['nhits'], 106 | 'weight_pt': weight_pt(numpy.hypot(particles['px'], particles['py'])), 107 | }) 108 | combined = pandas.merge(truth, selected, 109 | how='left', on=['particle_id'], 110 | validate='many_to_one') 111 | 112 | # fix pt weight for hits w/o associated particle 113 | combined['weight_pt'].fillna(0.0, inplace=True) 114 | # fix nhits for hits w/o associated particle 115 | combined['particle_nhits'].fillna(0.0, inplace=True) 116 | combined['particle_nhits'] = combined['particle_nhits'].astype('i4') 117 | # compute hit count and order using absolute distance from particle vertex 118 | combined['abs_dvz'] = numpy.absolute(combined['tz'] - combined['particle_vz']) 119 | combined['ihit'] = combined.groupby('particle_id')['abs_dvz'].rank().transform(lambda x: x - 1).fillna(0.0).astype('i4') 120 | # compute order-dependent weight 121 | combined['weight_order'] = combined[['ihit', 'particle_nhits']].apply(weight_order, axis=1) 122 | 123 | # compute combined weight normalized to 1 124 | w = combined['weight_pt'] * combined['weight_order'] 125 | w /= w.sum() 126 | combined['weight'] = w 127 | 128 | # return w/o intermediate columns 129 | return combined.drop(columns=['particle_vz', 'abs_dvz']) 130 | 131 | def weight_hits_phase2(truth, particles): 132 | """Compute per-hit weights for the phase 2 scoring metric. 133 | 134 | This is the phase 1 metric with an additional particle preselection, i.e. 135 | only a subset of the particles have a non-zero score. 136 | 137 | Parameters 138 | ---------- 139 | truth : pandas.DataFrame 140 | Truth information. Must have hit_id, particle_id, and tz columns. 141 | particles : pandas.DataFrame 142 | Particle information. Must have particle_id, vz, px, py, and nhits 143 | columns. 144 | 145 | Returns 146 | ------- 147 | pandas.DataFrame 148 | `truth` augmented with additional columns: particle_nhits, ihit, 149 | weight_order, weight_pt, and weight. 
150 | """ 151 | # fill selected per-particle information for each hit 152 | selected = pandas.DataFrame({ 153 | 'particle_id': particles['particle_id'], 154 | 'particle_vz': particles['vz'], 155 | 'particle_nhits': particles['nhits'], 156 | 'weight_pt': weight_pt(numpy.hypot(particles['px'], particles['py'])), 157 | }) 158 | selected = decode_particle_id(selected) 159 | combined = pandas.merge(truth, selected, 160 | how='left', on=['particle_id'], 161 | validate='many_to_one') 162 | 163 | # fix pt weight for hits w/o associated particle 164 | combined['weight_pt'].fillna(0.0, inplace=True) 165 | # fix nhits for hits w/o associated particle 166 | combined['particle_nhits'].fillna(0.0, inplace=True) 167 | combined['particle_nhits'] = combined['particle_nhits'].astype('i4') 168 | 169 | # compute hit count and order using absolute distance from particle vertex 170 | combined['abs_dvz'] = numpy.absolute(combined['tz'] - combined['particle_vz']) 171 | combined['ihit'] = combined.groupby('particle_id')['abs_dvz'].rank().transform(lambda x: x - 1).fillna(0.0).astype('i4') 172 | # compute order-dependent weight 173 | combined['weight_order'] = combined[['ihit', 'particle_nhits']].apply(weight_order, axis=1) 174 | 175 | # compute normalized combined weight w/ extra particle selection 176 | weight = combined['weight_pt'] * combined['weight_order'] 177 | weight[combined['generation'] != 0] = 0 178 | weight /= weight.sum() 179 | # normalize total event weight 180 | combined['weight'] = weight 181 | 182 | # return w/o intermediate columns 183 | return combined.drop(columns=['particle_vz', 'abs_dvz']) 184 | --------------------------------------------------------------------------------