├── tests ├── __init__.py ├── test_helpers.py ├── test_get_zarr.py ├── test_data_reduction.py ├── test_lone_hit_integration.py ├── test_exhaust_plugin.py ├── test_config.py ├── test_statistics.py ├── test_peak_properties.py ├── test_peak_merging.py ├── test_pulse_processing.py ├── test_inline_plugin.py ├── test_down_chunk_plugin.py ├── test_saving.py ├── test_cut_plugin.py ├── test_sort.py ├── test_overlap_plugin.py ├── test_fixed_plugin_cache.py ├── test_mailbox.py └── test_peak_splitting.py ├── docs ├── source │ ├── __init__.py │ ├── developer │ │ ├── contributing.md │ │ ├── overlap_window.jpg │ │ ├── release.rst │ │ ├── documentation.rst │ │ ├── corrections.rst │ │ ├── pipeline.rst │ │ ├── storage.rst │ │ ├── overlaps.rst │ │ └── parallel.rst │ ├── reference │ │ ├── strax.storage.rst │ │ ├── strax.processing.rst │ │ └── strax.rst │ ├── basics │ │ └── setup.rst │ ├── build_release_notes.py │ ├── index.rst │ └── advanced │ │ ├── out_of_core.rst │ │ ├── chunking.rst │ │ ├── fuzzy_for.rst │ │ ├── superrun.rst │ │ └── recompression.rst ├── make_docs.sh ├── pull_request_template.md └── make.bat ├── strax ├── processing │ ├── __init__.py │ ├── data_reduction.py │ ├── statistics.py │ └── peak_properties.py ├── scripts │ ├── __init__.py │ └── rechunker.py ├── storage │ ├── __init__.py │ └── zipfiles.py ├── processor.py ├── plugins │ ├── __init__.py │ ├── exhaust_plugin.py │ ├── merge_only_plugin.py │ ├── down_chunking_plugin.py │ ├── cut_plugin.py │ ├── loop_plugin.py │ ├── overlap_window_plugin.py │ └── parrallel_source_plugin.py ├── processors │ ├── __init__.py │ ├── base.py │ └── single_thread.py ├── __init__.py ├── sort_enforcement.py └── io.py ├── MANIFEST.in ├── pytest.ini ├── .git-blame-ignore-revs ├── .coveragerc ├── .bumpversion.cfg ├── .readthedocs.yml ├── .gitattributes ├── .github ├── ISSUE_TEMPLATE │ └── bug_report.md ├── dependabot.yml └── workflows │ ├── pypi_install.yml │ ├── test_install.yml │ └── pytest.yml ├── setup.cfg ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── CONTRIBUTING.md ├── README.md ├── .pylintrc ├── pyproject.toml └── CODE-OF-CONDUCT.md /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/source/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /strax/processing/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /strax/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /strax/storage/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include *.txt 2 | include *.md 3 | -------------------------------------------------------------------------------- /docs/source/developer/contributing.md: -------------------------------------------------------------------------------- 1 | ../../../CONTRIBUTING.md -------------------------------------------------------------------------------- /pytest.ini: 
-------------------------------------------------------------------------------- 1 | [pytest] 2 | filterwarnings = 3 | ignore::numba.NumbaExperimentalFeatureWarning 4 | -------------------------------------------------------------------------------- /docs/source/developer/overlap_window.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/AxFoundation/strax/HEAD/docs/source/developer/overlap_window.jpg -------------------------------------------------------------------------------- /strax/processor.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # Legacy import, used in a single place in straxen. 3 | from .processors.threaded_mailbox import SHMExecutor 4 | -------------------------------------------------------------------------------- /.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | 42ad25e0e6834c82f415149b206b49bf0cf5654f 2 | 5c4a277fbb155529dc5cf06435f0d7419977d7dd 3 | 8e431b2f5827d3f9c69088bd68b432dc6d4d4769 4 | -------------------------------------------------------------------------------- /docs/make_docs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | make clean 3 | rm -r source/reference 4 | sphinx-apidoc -o source/reference ../strax 5 | rm source/reference/modules.rst 6 | make html 7 | -------------------------------------------------------------------------------- /.coveragerc: -------------------------------------------------------------------------------- 1 | # .coveragerc to control coverage.py 2 | [report] 3 | # Regexes for lines to exclude from consideration 4 | exclude_lines = 5 | if __name__ == .__main__.: 6 | raise 7 | 8 | ignore_errors = True 9 | -------------------------------------------------------------------------------- /.bumpversion.cfg: -------------------------------------------------------------------------------- 1 | [bumpversion] 2 | current_version = 2.2.1 3 | files = strax/__init__.py docs/source/conf.py 4 | commit = True 5 | tag = True 6 | 7 | [bumpversion:file:pyproject.toml] 8 | search = version = "{current_version}" 9 | replace = version = "{new_version}" 10 | -------------------------------------------------------------------------------- /strax/plugins/__init__.py: -------------------------------------------------------------------------------- 1 | from .plugin import * 2 | from .cut_plugin import * 3 | from .loop_plugin import * 4 | from .merge_only_plugin import * 5 | from .overlap_window_plugin import * 6 | from .parrallel_source_plugin import * 7 | from .down_chunking_plugin import * 8 | from .exhaust_plugin import * 9 | -------------------------------------------------------------------------------- /docs/source/developer/release.rst: -------------------------------------------------------------------------------- 1 | Release procedure 2 | ================== 3 | 4 | - Update personal fork & local master to Axfoundation fork 5 | - Edit and commit HISTORY.md 6 | - bumpversion patch (or minor/major, as appropriate) 7 | - Push to personal and AxFoundation fork, with --tags 8 | - fast-foward and push AxFoundation/stable 9 | - Add release info on release page of github website 10 | -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yml 2 | 3 | # Required 4 | 
version: 2 5 | 6 | sphinx: 7 | configuration: docs/source/conf.py 8 | 9 | build: 10 | os: ubuntu-22.04 11 | apt_packages: 12 | - graphviz 13 | tools: 14 | python: "3.10" 15 | 16 | python: 17 | install: 18 | - method: pip 19 | path: . 20 | extra_requirements: 21 | - docs 22 | 23 | formats: 24 | - htmlzip 25 | - epub 26 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # https://www.aleksandrhovhannisyan.com/blog/crlf-vs-lf-normalizing-line-endings-in-git/#a-simple-gitattributes-config 2 | # We'll let Git's auto-detection algorithm infer if a file is text. If it is, 3 | # enforce LF line endings regardless of OS or git configurations. 4 | * text=auto eol=lf 5 | 6 | # Isolate binary files in case the auto-detection algorithm fails and 7 | # marks them as text files (which could brick them). 8 | *.{png,jpg,jpeg,gif,webp,woff,woff2} binary 9 | -------------------------------------------------------------------------------- /strax/processors/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import * 2 | from .threaded_mailbox import * 3 | from .single_thread import * 4 | 5 | # This is redundant with the star-imports above, but some flake8 6 | # versions require this 7 | from .threaded_mailbox import ThreadedMailboxProcessor 8 | from .single_thread import SingleThreadProcessor 9 | 10 | PROCESSORS = { 11 | "default": SingleThreadProcessor, 12 | "threaded_mailbox": ThreadedMailboxProcessor, 13 | "single_thread": SingleThreadProcessor, 14 | } 15 | -------------------------------------------------------------------------------- /docs/pull_request_template.md: -------------------------------------------------------------------------------- 1 | **What is the problem / what does the code in this PR do** 2 | 3 | **Can you briefly describe how it works?** 4 | 5 | **Can you give a minimal working example (or illustrate with a figure)?** 6 | 7 | Please include the following if applicable: 8 | - Update the docstring(s) 9 | - Update the documentation 10 | - Tests to check the (new) code is working as desired. 11 | - Does it solve one of the open issues on github? 12 | 13 | Please make sure that all automated tests have passed before asking for a review (you can save the PR as a draft otherwise). 14 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Insert the MWE of how to reproduce the error 15 | ```python 16 | YOUR CODE GOES HERE 17 | ``` 18 | 19 | **Expected behavior** 20 | A clear and concise description of what you expected to happen. 21 | 22 | **Screenshots** 23 | If applicable, add screenshots to help explain your problem. 
24 | 25 | **Versions** 26 | Please add the versions of strax and any related packages 27 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # Set update schedule for GitHub Actions to check they are up to date 2 | # If one of the github actions is out of date, dependabot will open a 3 | # PR to update the version of that action 4 | 5 | version: 2 6 | updates: 7 | # Maintain the requirements in the github actions 8 | - package-ecosystem: "github-actions" 9 | directory: "/" 10 | schedule: 11 | # Check for updates to GitHub Actions every month 12 | interval: "monthly" 13 | # Maintain the requirements in the requirements folder 14 | - package-ecosystem: "pip" 15 | directory: "/" 16 | schedule: 17 | # Check for updates to requirements every month 18 | interval: "monthly" 19 | open-pull-requests-limit: 15 20 | -------------------------------------------------------------------------------- /strax/plugins/exhaust_plugin.py: -------------------------------------------------------------------------------- 1 | from .plugin import Plugin 2 | 3 | 4 | class ExhaustPlugin(Plugin): 5 | """Plugin that exhausts all chunks when fetching data.""" 6 | 7 | def _fetch_chunk(self, d, iters, check_end_not_before=None): 8 | while super()._fetch_chunk(d, iters, check_end_not_before=check_end_not_before): 9 | pass 10 | return False 11 | 12 | def do_compute(self, chunk_i=None, **kwargs): 13 | if chunk_i != self.first_chunk: 14 | raise RuntimeError( 15 | f"{self.__class__.__name__} is an ExhaustPlugin. " 16 | "It should read all chunks together and process them together." 17 | ) 18 | return super().do_compute(chunk_i=chunk_i, **kwargs) 19 | -------------------------------------------------------------------------------- /tests/test_helpers.py: -------------------------------------------------------------------------------- 1 | from strax import testutils 2 | from hypothesis import given 3 | import strax 4 | 5 | 6 | @given(testutils.sorted_bounds()) 7 | def test_sorted_bounds(bs): 8 | assert is_sorted(bs) 9 | 10 | 11 | @given(testutils.sorted_bounds(disjoint=True)) 12 | def test_disjoint_bounds(bs): 13 | assert is_sorted(bs) 14 | assert is_disjoint(bs) 15 | 16 | 17 | @given(testutils.disjoint_sorted_intervals) 18 | def test_dsi(intvs): 19 | bs = list(zip(intvs["time"].tolist(), strax.endtime(intvs).tolist())) 20 | assert is_sorted(bs) 21 | assert is_disjoint(bs) 22 | 23 | 24 | def is_sorted(bs): 25 | return bs == sorted(bs) 26 | 27 | 28 | def is_disjoint(bs): 29 | return all([bs[i][1] <= bs[i + 1][0] for i in range(len(bs) - 1)]) 30 | -------------------------------------------------------------------------------- /tests/test_get_zarr.py: -------------------------------------------------------------------------------- 1 | import strax 2 | from strax.testutils import Records, Peaks, run_id 3 | import tempfile 4 | import numpy as np 5 | 6 | 7 | def test_get_zarr(): 8 | """Get a context for the tests below.""" 9 | with tempfile.TemporaryDirectory() as temp_dir: 10 | context = strax.Context( 11 | storage=strax.DataDirectory(temp_dir, deep_scan=True), 12 | register=[Records, Peaks], 13 | use_per_run_defaults=True, 14 | ) 15 | records = context.get_array(run_id, "records") 16 | peaks = context.get_array(run_id, "peaks") 17 | zgrp = context.get_zarr(run_id, ("records", "peaks"), storage="memory://") 18 | 19 | assert np.all(zgrp.records["time"] == records["time"]) 20 | assert
np.all(zgrp.peaks["time"] == peaks["time"]) 21 | -------------------------------------------------------------------------------- /docs/source/developer/documentation.rst: -------------------------------------------------------------------------------- 1 | Writing documentation 2 | ====================== 3 | 4 | To write documentation, please refer to the existing pages for examples. To add new pages: 5 | - Add a new ``.rst`` file in the basics/advanced/developer folder within ./docs. 6 | - Add a link to the file in ``docs/index.rst``. 7 | - Run ``bash make_docs.sh``. This runs sphinx locally, which allows you to 8 | preview whether the results look as desired. Several modules need to be 9 | installed in order to run this script. 10 | - Add the new ``.rst`` file and ``index.rst`` to git. 11 | 12 | Updating ``docs/reference`` 13 | --------------------------- 14 | The ``docs/reference`` folder is only updated by ``bash make_docs.sh``. 15 | In case modules are added/removed, one needs to rerun this script and commit 16 | the changes to the files in ``docs/reference``. 17 | -------------------------------------------------------------------------------- /strax/processors/base.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import typing as ty 3 | 4 | import strax 5 | 6 | export, __all__ = strax.exporter() 7 | 8 | 9 | @export 10 | class ProcessorComponents(ty.NamedTuple): 11 | """Specification to assemble a processor.""" 12 | 13 | plugins: ty.Dict[str, strax.Plugin] 14 | loaders: ty.Dict[str, ty.Callable] 15 | # Required for inline ParallelSource plugin. 16 | loader_plugins: ty.Dict[str, strax.Plugin] 17 | savers: ty.Dict[str, ty.List[strax.Saver]] 18 | targets: ty.Tuple[str] 19 | 20 | 21 | @export 22 | class BaseProcessor: 23 | components: ProcessorComponents 24 | 25 | def __init__(self, components: ProcessorComponents, **kwargs): 26 | self.log = logging.getLogger(self.__class__.__name__) 27 | self.components = components 28 | 29 | def iter(self): 30 | raise NotImplementedError 31 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [aliases] 2 | test=pytest 3 | 4 | [mypy] 5 | disable_error_code = attr-defined, name-defined, union-attr 6 | 7 | [flake8] 8 | # Set maximum width of the line to 100 9 | max-line-length = 100 10 | 11 | # E203 whitespace before ':' 12 | # E402 module level import not at top of file 13 | # E501 line too long 14 | # E731 do not assign a lambda expression, use a def 15 | # F541 f-string is missing placeholders 16 | # F401 imported but unused 17 | # F403 unable to detect undefined names 18 | # F405 name may be undefined, or defined from star imports 19 | # W503 line break before binary operator 20 | 21 | ignore = E203, E731, F541, W503 22 | 23 | per-file-ignores = 24 | strax/*__init__.py: F401, F403 25 | strax/plugin.py: F401 26 | strax/processing/general.py: E402 27 | tests/*: F403, F405 28 | tests/plugins/test_plugins.py: F401 29 | docs/source/build_datastructure_doc.py: E501 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Jupyter 2 | .ipynb_checkpoints 3 | 4 | # Data (temporary?)
5 | *.json 6 | *.bz2 7 | *.zstd 8 | *.npy 9 | *.blosc 10 | *.h5 11 | strax_data 12 | from_fake_daq 13 | from_eb 14 | from_eb_finished 15 | resource_cache 16 | raw 17 | reduced_raw 18 | processed 19 | temp_processed 20 | custom_data 21 | test_input_data 22 | *.zip 23 | 24 | # cProfile output 25 | *.prof 26 | 27 | # Python cache 28 | *.py[cod] 29 | __pycache__ 30 | 31 | # Testing caches 32 | .pytest_cache 33 | *pytestdebug.log 34 | .hypothesis 35 | 36 | # Packages 37 | .eggs 38 | *.egg 39 | *.egg-info 40 | dist 41 | build 42 | eggs 43 | parts 44 | var 45 | sdist 46 | develop-eggs 47 | .installed.cfg 48 | lib 49 | lib64 50 | 51 | # Sphinx 52 | docs/_build 53 | docs/source/developer/contributing.rst 54 | docs/source/reference/release_notes.rst 55 | 56 | # Pycharm 57 | .idea 58 | 59 | # coverage 60 | .coverage 61 | 62 | # DS_Store 63 | .DS_Store 64 | -------------------------------------------------------------------------------- /.github/workflows/pypi_install.yml: -------------------------------------------------------------------------------- 1 | # Pipy upload strax after a release (or manually). 2 | # Mostly based on https://github.com/marketplace/actions/pypi-publish 3 | name: Pipy 4 | 5 | on: 6 | workflow_dispatch: 7 | release: 8 | types: [created] 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | steps: 13 | # Setup steps 14 | - name: Setup python 15 | uses: actions/setup-python@v5 16 | with: 17 | python-version: "3.10" 18 | - name: Checkout repo 19 | uses: actions/checkout@v4 20 | - name: Install dependencies 21 | run: pip install build 22 | - name: Build package 23 | run: python -m build 24 | # Do the publish 25 | - name: Publish a Python distribution to PyPI 26 | uses: pypa/gh-action-pypi-publish@master 27 | with: 28 | user: ${{ secrets.pipy_token }} 29 | password: ${{ secrets.pypi_password }} 30 | -------------------------------------------------------------------------------- /.github/workflows/test_install.yml: -------------------------------------------------------------------------------- 1 | # Test if we can actually install strax by installing 2 | name: Installation test 3 | 4 | on: 5 | workflow_dispatch: 6 | release: 7 | types: [created] 8 | pull_request: 9 | branches: 10 | - master 11 | - stable 12 | push: 13 | branches: 14 | - master 15 | 16 | jobs: 17 | build: 18 | name: "py${{ matrix.python-version }}" 19 | runs-on: ubuntu-latest 20 | strategy: 21 | fail-fast: false 22 | matrix: 23 | python-version: ["3.10", "3.11"] 24 | steps: 25 | - name: Setup python 26 | uses: actions/setup-python@v5 27 | with: 28 | python-version: ${{ matrix.python-version }} 29 | - name: Checkout repo 30 | uses: actions/checkout@v4 31 | - name: Install strax 32 | run: pip install . 33 | - name: Test import 34 | run: python -c "import strax; print(strax.__version__)" 35 | - name: goodbye 36 | run: echo goodbye 37 | -------------------------------------------------------------------------------- /docs/source/reference/strax.storage.rst: -------------------------------------------------------------------------------- 1 | strax.storage package 2 | ===================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | strax.storage.common module 8 | --------------------------- 9 | 10 | .. automodule:: strax.storage.common 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | strax.storage.files module 16 | -------------------------- 17 | 18 | .. 
automodule:: strax.storage.files 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | strax.storage.mongo module 24 | -------------------------- 25 | 26 | .. automodule:: strax.storage.mongo 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | strax.storage.zipfiles module 32 | ----------------------------- 33 | 34 | .. automodule:: strax.storage.zipfiles 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | Module contents 40 | --------------- 41 | 42 | .. automodule:: strax.storage 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | -------------------------------------------------------------------------------- /docs/source/basics/setup.rst: -------------------------------------------------------------------------------- 1 | Setting up strax 2 | ================ 3 | 4 | To install the latest stable version (from PyPI), run `pip install strax`. 5 | Dependencies should install automatically: 6 | numpy, pandas, numba, two compression libraries (blosc and zstd) 7 | and a few miscellaneous pure-python packages. Strax requires python >= 3.10. 8 | 9 | If you want to try out strax on XENON1T data, you're probably better off installing strax's XENON bindings, `straxen <https://github.com/XENONnT/straxen>`_. Strax will be automatically installed along with straxen. 10 | 11 | You might want to install some dependencies (such as numpy and numba) via conda rather than pip, but it's up to you. 12 | 13 | You can also clone the repository, then set up a developer installation with `python setup.py develop`. 14 | 15 | If you experience problems during installation, try installing 16 | exactly the same versions of the dependencies as used in the GitHub Actions integrated testing. 17 | Clone the repository, then do `pip install -r strax/extra_requirements/requirements-tests.txt`.
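To check that the installation worked, you can import strax and print its version; the automated installation test in this repository performs the same check::

    import strax
    print(strax.__version__)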
18 | -------------------------------------------------------------------------------- /strax/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | __version__ = "2.2.1" 3 | 4 | # Glue the package together 5 | # See https://www.youtube.com/watch?v=0oTh1CXRaQ0 if this confuses you 6 | # The order of subpackes is not invariant, since we use strax.xxx inside strax 7 | from .sort_enforcement import * 8 | from .utils import * 9 | from .chunk import * 10 | from .dtypes import * 11 | from strax.processing.general import * 12 | 13 | from .storage.common import * 14 | from .storage.files import * 15 | from .storage.file_rechunker import * 16 | from .storage.mongo import * 17 | from .storage.zipfiles import * 18 | 19 | from .config import * 20 | from .plugins import * 21 | 22 | from .mailbox import * 23 | from .processor import * 24 | from .processors import * 25 | from .context import * 26 | from .run_selection import * 27 | 28 | from .io import * 29 | 30 | from strax.processing.data_reduction import * 31 | from strax.processing.pulse_processing import * 32 | from strax.processing.peak_building import * 33 | from strax.processing.peak_merging import * 34 | from strax.processing.peak_splitting import * 35 | from strax.processing.peak_properties import * 36 | from strax.processing.hitlets import * 37 | from strax.processing.statistics import * 38 | -------------------------------------------------------------------------------- /strax/plugins/merge_only_plugin.py: -------------------------------------------------------------------------------- 1 | import strax 2 | from .plugin import Plugin, SaveWhen 3 | 4 | export, __all__ = strax.exporter() 5 | 6 | 7 | ## 8 | # "Plugins" for internal use 9 | # These do not actually do computations, but do other tasks 10 | # for which posing as a plugin is helpful. 11 | # Do not subclass unless you know what you are doing.. 
12 | ## 13 | 14 | 15 | @export 16 | class MergeOnlyPlugin(Plugin): 17 | """Plugin that merges data from its dependencies.""" 18 | 19 | save_when = SaveWhen.EXPLICIT 20 | 21 | def infer_dtype(self): 22 | deps_by_kind = self.dependencies_by_kind() 23 | if len(deps_by_kind) != 1: 24 | raise ValueError( 25 | "MergeOnlyPlugins can only merge data of the same kind, but got multiple kinds: " 26 | + str(deps_by_kind) 27 | ) 28 | 29 | return strax.merged_dtype( 30 | [ 31 | self.deps[d].dtype_for(d) 32 | # Sorting is needed here to match what strax.Chunk does in merging 33 | for d in sorted(self.depends_on) 34 | ] 35 | ) 36 | 37 | def compute(self, **kwargs): 38 | return kwargs[list(kwargs.keys())[0]] 39 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v5.0.0 6 | hooks: 7 | - id: trailing-whitespace 8 | - id: end-of-file-fixer 9 | - id: check-yaml 10 | - id: check-added-large-files 11 | 12 | - repo: https://github.com/psf/black 13 | rev: 24.10.0 14 | hooks: 15 | - id: black 16 | args: [--safe, --line-length=100, --preview] 17 | language_version: python3 18 | 19 | - repo: https://github.com/pycqa/docformatter 20 | rev: v1.7.6 21 | hooks: 22 | - id: docformatter 23 | additional_dependencies: [tomli] 24 | args: [--config, pyproject.toml] 25 | 26 | - repo: https://github.com/pre-commit/mirrors-mypy 27 | rev: v1.11.2 28 | hooks: 29 | - id: mypy 30 | additional_dependencies: [ 31 | types-PyYAML, types-tqdm, types-pytz, 32 | types-requests, types-setuptools, 33 | ] 34 | 35 | - repo: https://github.com/pycqa/flake8 36 | rev: 7.1.1 37 | hooks: 38 | - id: flake8 39 | 40 | ci: 41 | autoupdate_schedule: weekly 42 | -------------------------------------------------------------------------------- /docs/source/build_release_notes.py: -------------------------------------------------------------------------------- 1 | from m2r import convert 2 | import os 3 | 4 | header = """ 5 | Release notes 6 | ============== 7 | 8 | """ 9 | 10 | 11 | def convert_release_notes(): 12 | """Convert the release notes to an RST page with links to PRs.""" 13 | this_dir = os.path.dirname(os.path.realpath(__file__)) 14 | notes = os.path.join(this_dir, "..", "..", "HISTORY.md") 15 | with open(notes, "r") as f: 16 | notes = f.read() 17 | rst = convert(notes) 18 | with_ref = "" 19 | for line in rst.split("\n"): 20 | # Get URL for PR 21 | if "#" in line: 22 | pr_number = line.split("#")[1] 23 | while len(pr_number): 24 | try: 25 | pr_number = int(pr_number) 26 | break 27 | except ValueError: 28 | # Too many tailing characters to be an int 29 | pr_number = pr_number[:-1] 30 | if pr_number: 31 | line = line.replace( 32 | f"#{pr_number}", 33 | f"`#{pr_number} `_", 34 | ) 35 | with_ref += line + "\n" 36 | target = os.path.join(this_dir, "reference", "release_notes.rst") 37 | 38 | with open(target, "w") as f: 39 | f.write(header + with_ref) 40 | 41 | 42 | if __name__ == "__main__": 43 | convert_release_notes() 44 | -------------------------------------------------------------------------------- /strax/sort_enforcement.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numba.extending import register_jitable 3 | 4 | # Define error message as a constant 5 
| UNSTABLE_SORT_MESSAGE = ( 6 | "quicksort and heapsort are not allowed due to non-deterministic behavior.\n" 7 | "Please use mergesort for deterministic sorting behavior." 8 | ) 9 | 10 | 11 | # Define custom exception for sorting errors 12 | class SortingError(Exception): 13 | pass 14 | 15 | 16 | def stable_sort(arr, kind="mergesort", **kwargs): 17 | """Stable sort function using mergesort, w/o numba optimization. 18 | 19 | Args: 20 | arr: numpy array to sort 21 | kind: sorting algorithm to use (only 'mergesort' is allowed) 22 | 23 | Returns: 24 | Sorted array using mergesort algorithm 25 | 26 | """ 27 | if kind != "mergesort": 28 | raise SortingError(UNSTABLE_SORT_MESSAGE) 29 | return np.sort(arr, kind="mergesort", **kwargs) 30 | 31 | 32 | @register_jitable 33 | def stable_argsort(arr, kind="mergesort"): 34 | """Numba-optimized stable argsort function using mergesort. 35 | 36 | Args: 37 | arr: numpy array to sort 38 | kind: sorting algorithm to use (only 'mergesort' is allowed) 39 | 40 | Returns: 41 | Indices that would sort the array using mergesort algorithm 42 | 43 | """ 44 | if kind != "mergesort": 45 | raise SortingError(UNSTABLE_SORT_MESSAGE) 46 | return np.argsort(arr, kind="mergesort") 47 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 3-Clause License 2 | 3 | Copyright (c) 2018-2023, strax developers. 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | * Neither the name of the copyright holder nor the names of its 17 | contributors may be used to endorse or promote products derived from 18 | this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 21 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 22 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 23 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 24 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 25 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 26 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 27 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 28 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 29 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
30 | -------------------------------------------------------------------------------- /tests/test_data_reduction.py: -------------------------------------------------------------------------------- 1 | from hypothesis import given, settings 2 | 3 | from strax.testutils import * 4 | 5 | 6 | # TODO: test with multiple fake pulses and dt != 1 7 | @settings(deadline=None) 8 | @given(single_fake_pulse) 9 | def test_cut_outside_hits(records): 10 | hits = strax.find_hits(records, min_amplitude=1) 11 | 12 | # Set all record waveforms to 1 (still and 0 out of bounds) 13 | for r in records: 14 | r["data"] = 0 15 | r["data"][: r["length"]] = 1 16 | assert np.all(np.in1d(r["data"], [0, 1])) 17 | 18 | left_extension = 2 19 | right_extension = 3 20 | 21 | records_out = strax.cut_outside_hits( 22 | records, hits, left_extension=left_extension, right_extension=right_extension 23 | ) 24 | 25 | assert len(records_out) == len(records) 26 | if len(records) == 0: 27 | return 28 | 29 | # All fields except data are unchanged 30 | for x in records.dtype.names: 31 | if x == "data": 32 | continue 33 | if x == "reduction_level": 34 | np.testing.assert_array_equal( 35 | records_out[x], 36 | np.ones(len(records), dtype=np.int16) * strax.ReductionLevel.HITS_ONLY, 37 | ) 38 | else: 39 | np.testing.assert_array_equal(records_out[x], records[x], err_msg=f"Field {x} mangled!") 40 | 41 | records = records_out 42 | 43 | # Super-laborious dumb check 44 | for r in records: 45 | for i, w in enumerate(r["data"][: r["length"]]): 46 | t = r["time"] + i * r["dt"] 47 | for h in hits: 48 | if h["time"] - left_extension <= t < strax.endtime(h) + right_extension: 49 | assert w == 1, f"Position {i} should be preserved" 50 | break 51 | else: 52 | assert w == 0, f"Position {i} should be cut" 53 | -------------------------------------------------------------------------------- /tests/test_lone_hit_integration.py: -------------------------------------------------------------------------------- 1 | from strax.testutils import several_fake_records 2 | import numpy as np 3 | from hypothesis import given, settings 4 | import hypothesis.strategies as st 5 | 6 | import strax 7 | 8 | 9 | @settings(deadline=None) 10 | @given( 11 | several_fake_records, 12 | st.integers(min_value=0, max_value=100), 13 | st.integers(min_value=0, max_value=100), 14 | ) 15 | def test_lone_hits_integration_bounds(records, left_extension, right_extension): 16 | """Loops over hits and tests if integration bounds overlap.""" 17 | n_channel = 0 18 | if len(records): 19 | n_channel = records["channel"].max() + 1 20 | 21 | hits = strax.find_hits(records, np.ones(n_channel)) 22 | 23 | strax.find_hit_integration_bounds( 24 | hits, 25 | np.zeros(0, dtype=strax.time_dt_fields), 26 | records, 27 | (left_extension, right_extension), 28 | n_channel, 29 | allow_bounds_beyond_records=False, 30 | ) 31 | _test_overlap(hits) 32 | 33 | hits["left_integration"] = 0 34 | hits["right_integration"] = 0 35 | 36 | strax.find_hit_integration_bounds( 37 | hits, 38 | np.zeros(0, dtype=strax.time_dt_fields), 39 | records, 40 | (left_extension, right_extension), 41 | n_channel, 42 | allow_bounds_beyond_records=True, 43 | ) 44 | _test_overlap(hits) 45 | 46 | 47 | def _test_overlap(hits): 48 | tester = np.zeros(len(hits), dtype=strax.time_fields) 49 | tester["time"] = hits["time"] - (hits["left_integration"] - hits["left"]) * hits["dt"] 50 | tester["endtime"] = hits["time"] + (hits["right_integration"] - hits["left"]) * hits["dt"] 51 | 52 | for ch in np.unique(hits["channel"]): 53 | mask = 
hits["channel"] == ch 54 | test_ch = np.all((tester[mask]["endtime"][:-1] - tester[mask]["time"][1:]) <= 0) 55 | assert np.all(test_ch), "Hits overlap!" 56 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | ## Contribution guidelines 2 | 3 | You're welcome to contribute to strax! 4 | 5 | Currently, many features are still in significant flux, and the documentation is still very basic. Until more people start getting involved in development, we're probably not even following our own advice below... 6 | 7 | ### Please fork 8 | Please work in a fork, then submit pull requests. 9 | Only maintainers sometimes work in branches if there is a good reason for it. 10 | 11 | ### No large files 12 | Avoid committing large (> 100 kB) files. We'd like to keep the repository no more than a few MB. 13 | 14 | For example, do not commit jupyter notebooks with high-resolution plots (clear the output first), or long configuration files, or binary test data. 15 | 16 | While it's possible to rewrite history to remove large files, this is a bit of work and messes with the repository's consistency. Once data has gone to master it's especially difficult, then there's a risk of others merging the files back in later unless they cooperate in the history-rewriting. 17 | 18 | This is one reason to prefer forks over branches; if you commit a huge file by mistake it's just in your fork. 19 | 20 | ### Code style 21 | Of course, please write nice and clean code :-) 22 | 23 | PEP8-compatibility is great (you can test with flake8) but not as important as other good coding habits such as avoiding duplication. See e.g. the [famous beyond PEP8 talk](https://www.youtube.com/watch?v=wf-BqAjZb8M). 24 | 25 | In particular, don't go into code someone else is maintaining to "PEP8-ify" it (or worse, use some automatic styling tool) 26 | 27 | Other style guidelines (docstrings etc.) are yet to be determined. 28 | 29 | ### Pull requests 30 | When accepting pull requests, preferrably squash as it attributes all the commits to one single pull request. One might consider merging the pull request without squashing if it's a few commits that mostly outline discrete steps of an implementation which seem worth keeping. 
31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # strax 2 | Streaming analysis for xenon experiments 3 | 4 | [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.1340632.svg)](https://doi.org/10.5281/zenodo.1340632) 5 | [![Readthedocs Badge](https://readthedocs.org/projects/strax/badge/?version=latest)](https://strax.readthedocs.io/en/latest/?badge=latest) 6 | [![Coverage Status](https://coveralls.io/repos/github/AxFoundation/strax/badge.svg?branch=master)](https://coveralls.io/github/AxFoundation/strax?branch=master) 7 | [![tests](https://github.com/AxFoundation/strax/actions/workflows/pytest.yml/badge.svg?branch=master)](https://github.com/AxFoundation/strax/actions/workflows/pytest.yml) 8 | [![CodeFactor](https://www.codefactor.io/repository/github/axfoundation/strax/badge)](https://www.codefactor.io/repository/github/axfoundation/strax) 9 | [![pre-commit.ci status](https://results.pre-commit.ci/badge/github/AxFoundation/strax/master.svg)](https://results.pre-commit.ci/latest/github/AxFoundation/strax/master) 10 | 11 | [![PyPI version shields.io](https://img.shields.io/pypi/v/strax.svg)](https://pypi.python.org/pypi/strax/) 12 | [![Python Versions](https://img.shields.io/pypi/pyversions/strax.svg)](https://pypi.python.org/pypi/strax) 13 | [![PyPI downloads](https://img.shields.io/pypi/dm/strax.svg)](https://pypistats.org/packages/strax) 14 | [![Join the chat at https://gitter.im/AxFoundation/strax](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/AxFoundation/strax?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) 15 | 16 | 17 | Strax is an analysis framework for pulse-only digitization data, specialized for live data reduction at speeds of 50-100 MB(raw) / core / sec. For more information, please see the [strax documentation](https://strax.readthedocs.io). 18 | 19 | Strax' primary aim is to support noble liquid TPC dark matter searches, such as XENONnT. The XENON-specific algorithms live in the separate package [straxen](https://github.com/XENONnT/straxen). If you want to try out strax, you probably want to start there. This package only contains the core framework and basic algorithms any TPCs would want to use. -------------------------------------------------------------------------------- /docs/source/developer/corrections.rst: -------------------------------------------------------------------------------- 1 | Corrections 2 | =========== 3 | 4 | Overview 5 | --------- 6 | Corrections is a centralized interface to store, query, and retrieve information about detector effects (corrections); this information can be used during the event-building process to remove (correct) such effects for a given data type. The information is stored in MongoDB as a collection in ``pandas.DataFrame()`` format with a ``pandas.DatetimeIndex()``, which allows tracking of time-dependent information, since detector conditions often change over time. Corrections also adds the functionality to differentiate between ONLINE and OFFLINE versioning: ONLINE corrections are used during online processing and, therefore, changes in the past are not allowed, while OFFLINE versions are meant for re-processing, where changes in the past are allowed.
Below we explain the key features of the corrections class: 7 | 8 | * ``read``: Retrieve the entire collection as a ``pandas.DataFrame()``; 9 | * ``read_at``: Retrieve part of a collection based on a time period (indexes), with a limit on the number of rows (documents). Using indexes greatly reduces the number of documents MongoDB needs to scan, so this is a faster method for querying specific information; 10 | * ``write``: Store (save) the entire collection as a ``pandas.DataFrame()`` in the DB; 11 | * ``interpolate``: Data in any DB is often limited, so interpolation is needed when trying to retrieve information at a given time (DateTime). The user has the option to use pandas interpolation methods; see, e.g. `link `_. 12 | 13 | 14 | Finally, a few remarks regarding modifications of a collection (``pandas.DataFrame()``). By convention, the user should provide dates (index) in UTC format. In addition, the user has the flexibility to modify or add rows (documents) in any ``pandas.DataFrame()`` (collection), with the only requirement that changes in the past are allowed for OFFLINE values only; for instance, the user may later want to add a new date (DateTime index) or fill in non-physical values (NaNs). 15 | -------------------------------------------------------------------------------- /strax/plugins/down_chunking_plugin.py: -------------------------------------------------------------------------------- 1 | from typing import Generator 2 | 3 | import strax 4 | from .plugin import Plugin 5 | 6 | export, __all__ = strax.exporter() 7 | 8 | 9 | ## 10 | # Plugin which allows the use of yield in a plugin's compute method. 11 | # Allows to chunk down output before storing to disk. 12 | # Only works if multiprocessing is omitted. 13 | ## 14 | 15 | 16 | @export 17 | class DownChunkingPlugin(Plugin): 18 | """Plugin whose compute method yields output, allowing it to be chunked down before saving.""" 19 | 20 | parallel = False 21 | 22 | def __init__(self): 23 | super().__init__() 24 | 25 | if self.parallel: 26 | raise NotImplementedError( 27 | f'Plugin "{self.__class__.__name__}" is a DownChunkingPlugin which ' 28 | "currently does not support parallel processing." 29 | ) 30 | 31 | def _iter_compute(self, chunk_i, **inputs_merged): 32 | return self.do_compute(chunk_i=chunk_i, **inputs_merged) 33 | 34 | def _fix_output(self, result, start, end, superrun, subruns, _dtype=None): 35 | """Wrapper around _fix_output to support the return of iterators.""" 36 | if not isinstance(result, Generator): 37 | raise ValueError( 38 | f"Plugin {self.__class__.__name__} should return a generator in compute method." 39 | ) 40 | 41 | for _result in result: 42 | if isinstance(_result, dict): 43 | values = _result.values() 44 | else: 45 | if self.multi_output: 46 | raise ValueError( 47 | f"{self.__class__.__name__} is multi-output and should " 48 | "provide a generator of dict output." 49 | ) 50 | values = [_result] 51 | if not all(isinstance(v, strax.Chunk) for v in values): 52 | raise ValueError( 53 | f"Plugin {self.__class__.__name__} should yield (dict of) " 54 | "strax.Chunk in compute method." 55 | ) 56 | yield self.superrun_transformation(_result, superrun, subruns) 57 | -------------------------------------------------------------------------------- /docs/source/reference/strax.processing.rst: -------------------------------------------------------------------------------- 1 | strax.processing package 2 | ======================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | strax.processing.data\_reduction module 8 | --------------------------------------- 9 | 10 | ..
automodule:: strax.processing.data_reduction 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | strax.processing.general module 16 | ------------------------------- 17 | 18 | .. automodule:: strax.processing.general 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | strax.processing.hitlets module 24 | ------------------------------- 25 | 26 | .. automodule:: strax.processing.hitlets 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | strax.processing.peak\_building module 32 | -------------------------------------- 33 | 34 | .. automodule:: strax.processing.peak_building 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | strax.processing.peak\_merging module 40 | ------------------------------------- 41 | 42 | .. automodule:: strax.processing.peak_merging 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | strax.processing.peak\_properties module 48 | ---------------------------------------- 49 | 50 | .. automodule:: strax.processing.peak_properties 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | strax.processing.peak\_splitting module 56 | --------------------------------------- 57 | 58 | .. automodule:: strax.processing.peak_splitting 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | strax.processing.pulse\_processing module 64 | ----------------------------------------- 65 | 66 | .. automodule:: strax.processing.pulse_processing 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | 71 | strax.processing.statistics module 72 | ---------------------------------- 73 | 74 | .. automodule:: strax.processing.statistics 75 | :members: 76 | :undoc-members: 77 | :show-inheritance: 78 | 79 | Module contents 80 | --------------- 81 | 82 | .. 
automodule:: strax.processing 83 | :members: 84 | :undoc-members: 85 | :show-inheritance: 86 | -------------------------------------------------------------------------------- /.pylintrc: -------------------------------------------------------------------------------- 1 | [MESSAGES CONTROL] 2 | # Jelle: CodeFactor has a whitelist of pylint messages 3 | # I removed: 4 | # - cyclic-import (we use this all the time in strax, see __init__.py) 5 | # - no-else-return (I think this makes sense for symmetric conditions, see https://dmerej.info/blog/post/else-after-return-yea-or-nay/) 6 | # - len-as-condition (if you do 'if data' on a numpy array it will crash) 7 | # - unnecessary-pass (sometimes pass makes the code more readable) 8 | disable=all 9 | enable=assert-on-tuple,astroid-error,bad-except-order,bad-inline-option,bad-option-value,bad-reversed-sequence,bare-except,binary-op-exception,boolean-datetime,catching-non-exception,cell-var-from-loop,confusing-with-statement,consider-merging-isinstance,consider-using-enumerate,consider-using-ternary,continue-in-finally,deprecated-pragma,django-not-available,duplicate-except,duplicate-key,eval-used,exec-used,expression-not-assigned,fatal,file-ignored,fixme,global-at-module-level,global-statement,global-variable-not-assigned,global-variable-undefined,http-response-with-content-type-json,http-response-with-json-dumps,invalid-all-object,invalid-characters-in-docstring,literal-comparison,locally-disabled,locally-enabled,lost-exception,lowercase-l-suffix,misplaced-bare-raise,missing-final-newline,missing-kwoa,mixed-line-endings,model-has-unicode,model-missing-unicode,model-no-explicit-unicode,model-unicode-not-callable,multiple-imports,multiple-statements,new-db-field-with-default,no-else-raise,non-ascii-bytes-literals,nonexistent-operator,not-an-iterable,not-in-loop,notimplemented-raised,overlapping-except,parse-error,pointless-statement,pointless-string-statement,raising-bad-type,raising-non-exception,raw-checker-failed,redefine-in-handler,redefined-argument-from-local,redefined-builtin,redundant-content-type-for-json-response,reimported,relative-import,return-outside-function,simplifiable-if-statement,singleton-comparison,syntax-error,trailing-comma-tuple,trailing-newlines,unbalanced-tuple-unpacking,undefined-all-variable,undefined-loop-variable,unexpected-line-ending-format,unidiomatic-typecheck,unnecessary-lambda,unnecessary-semicolon,unneeded-not,unpacking-non-sequence,unreachable,unrecognized-inline-option,used-before-assignment,useless-else-on-loop,using-constant-test,wildcard-import,yield-outside-function,useless-return 10 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool] 2 | [tool.poetry] 3 | name = "strax" 4 | version = "2.2.1" 5 | description = "Streaming analysis for xenon TPCs" 6 | readme = "README.md" 7 | authors = [ 8 | "strax developers", 9 | ] 10 | classifiers = [ 11 | "Development Status :: 5 - Production/Stable", 12 | "License :: OSI Approved :: BSD License", 13 | "Natural Language :: English", 14 | "Programming Language :: Python :: 3.10", 15 | "Programming Language :: Python :: 3.11", 16 | "Programming Language :: Python :: 3.12", 17 | "Intended Audience :: Science/Research", 18 | "Programming Language :: Python :: Implementation :: CPython", 19 | "Topic :: Scientific/Engineering :: Physics", 20 | ] 21 | repository = "https://github.com/AxFoundation/strax" 22 | 23 | [tool.poetry.scripts] 24 | 
rechunker = "strax.scripts.rechunker:main" 25 | 26 | [tool.poetry.dependencies] 27 | python = ">=3.10,<3.13" 28 | blosc = "*" 29 | click = "*" 30 | deepdiff = "*" 31 | dill = "*" 32 | fsspec = "*" 33 | immutabledict = "*" 34 | lz4 = "*" 35 | numba = ">=0.43.1" 36 | numexpr = "*" 37 | numpy = ">=1.18.5" 38 | numcodecs = "<0.16.0" 39 | packaging = "*" 40 | pandas = "*" 41 | psutil = "*" 42 | pymongo = "*" 43 | scipy = "*" 44 | tqdm = ">=4.46.0" 45 | zarr = "<3.0.0" 46 | zstd = "*" 47 | zstandard = "*" 48 | sphinx = { version = "*", optional = true } 49 | sphinx_rtd_theme = { version = "*", optional = true } 50 | nbsphinx = { version = "*", optional = true } 51 | recommonmark = { version = "*", optional = true } 52 | graphviz = { version = "*", optional = true } 53 | m2r = { version = "*", optional = true } 54 | mistune = { version = "0.8.4", optional = true } 55 | urllib3 = { version = "2.2.2", optional = true } 56 | lxml_html_clean = { version = "*", optional = true } 57 | 58 | [tool.poetry.extras] 59 | docs = [ 60 | "sphinx", 61 | "sphinx_rtd_theme", 62 | "nbsphinx", 63 | "recommonmark", 64 | "graphviz", 65 | "m2r", 66 | "mistune", 67 | "urllib3", 68 | "lxml_html_clean", 69 | ] 70 | 71 | [build-system] 72 | requires = ["poetry-core>=1.0.8", "setuptools>=61.0"] 73 | build-backend = "poetry.core.masonry.api" 74 | 75 | [tool.black] 76 | line-length = 100 77 | preview = true 78 | 79 | [tool.docformatter] 80 | recursive = true 81 | in-place = true 82 | style = "sphinx" 83 | wrap-summaries = 100 84 | wrap-descriptions = 100 85 | blank = true 86 | -------------------------------------------------------------------------------- /docs/source/reference/strax.rst: -------------------------------------------------------------------------------- 1 | strax package 2 | ============= 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | strax.processing 11 | strax.storage 12 | 13 | Submodules 14 | ---------- 15 | 16 | strax.chunk module 17 | ------------------ 18 | 19 | .. automodule:: strax.chunk 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | 24 | strax.config module 25 | ------------------- 26 | 27 | .. automodule:: strax.config 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | 32 | strax.context module 33 | -------------------- 34 | 35 | .. automodule:: strax.context 36 | :members: 37 | :undoc-members: 38 | :show-inheritance: 39 | 40 | strax.corrections module 41 | ------------------------ 42 | 43 | .. automodule:: strax.corrections 44 | :members: 45 | :undoc-members: 46 | :show-inheritance: 47 | 48 | strax.dtypes module 49 | ------------------- 50 | 51 | .. automodule:: strax.dtypes 52 | :members: 53 | :undoc-members: 54 | :show-inheritance: 55 | 56 | strax.io module 57 | --------------- 58 | 59 | .. automodule:: strax.io 60 | :members: 61 | :undoc-members: 62 | :show-inheritance: 63 | 64 | strax.mailbox module 65 | -------------------- 66 | 67 | .. automodule:: strax.mailbox 68 | :members: 69 | :undoc-members: 70 | :show-inheritance: 71 | 72 | strax.plugin module 73 | ------------------- 74 | 75 | .. automodule:: strax.plugin 76 | :members: 77 | :undoc-members: 78 | :show-inheritance: 79 | 80 | strax.processor module 81 | ---------------------- 82 | 83 | .. automodule:: strax.processor 84 | :members: 85 | :undoc-members: 86 | :show-inheritance: 87 | 88 | strax.run\_selection module 89 | --------------------------- 90 | 91 | .. 
automodule:: strax.run_selection 92 | :members: 93 | :undoc-members: 94 | :show-inheritance: 95 | 96 | strax.testutils module 97 | ---------------------- 98 | 99 | .. automodule:: strax.testutils 100 | :members: 101 | :undoc-members: 102 | :show-inheritance: 103 | 104 | strax.utils module 105 | ------------------ 106 | 107 | .. automodule:: strax.utils 108 | :members: 109 | :undoc-members: 110 | :show-inheritance: 111 | 112 | Module contents 113 | --------------- 114 | 115 | .. automodule:: strax 116 | :members: 117 | :undoc-members: 118 | :show-inheritance: 119 | -------------------------------------------------------------------------------- /docs/source/developer/pipeline.rst: -------------------------------------------------------------------------------- 1 | Pipeline 2 | ========= 3 | 4 | This describes how strax chains computations from multiple plugins together in a pipeline. 5 | 6 | In python, pipeline components can offer two semantics. In **pull-semantics**, usually implemented with generators, somebody calls ``next`` to pull output, and ``StopIteration`` signals nothing more is coming. In **push-semantics**, usually implemented with coroutines, input is pushed in with a ``send`` method. If cleanup is required, a ``close`` method must be invoked. These can be chained together to make pipelines. Either can also be implemented with custom classes instead of standard python generators/coroutines. (A short illustration of both semantics is given at the end of this page.) 7 | 8 | Strax primarily uses pull-semantics: 9 | * Loaders are plain iterators; 10 | * Plugins iterate over inputs, and expect their results to be iterated over; 11 | * Savers use both semantics. Usually they iterate over their input. However, during multiprocessing, savers have their inputs sent into them, and must be explicitly closed. 12 | 13 | Mailboxes 14 | ---------- 15 | Strax could not be built by just chaining iterators or coroutines. 16 | * Pipelines can have multiple inputs and outputs, which generally come at different speeds; we cannot simply push on or pull from one endpoint. 17 | * For parallelization, we must run the same computation on several chunks at a time, then gather the results. 18 | 19 | The *mailbox* class provides the additional machinery that handles this. During processing, each data type has a mailbox. 20 | A data type's mailbox iterates over the results of the plugin or loader that produces it. It also provides an iterator to each plugin that needs it as an input. 21 | 22 | The input iterators given to the plugins must be somewhat magical. If we call ``next``, but the input is not yet available, we must pause (and do something else) until it is. 23 | To enable this suspending, strax runs each plugin in a separate thread. (We could use a framework like ``asyncio`` instead if we wanted to run our own scheduler; for now we just use the OS's scheduler.) 24 | 25 | The threads in strax are thus motivated by `concurrency `_, not parallelism. As a bonus, they do allow different plugins to run simultaneously. The benefit is limited by python's global interpreter lock, but this does not affect IO or computations in numpy and numba. 26 | 27 | 28 | 29 | Exception propagation 30 | ------------------------ 31 | 32 | TODO: document MailboxKilled etc.
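Illustration: pull versus push semantics
----------------------------------------

The snippet below is a minimal, self-contained sketch of the two semantics described at the top of this page, using plain Python generators and coroutines. It is illustrative only: the names (``loader``, ``plugin``, ``saver``) are hypothetical stand-ins, not strax's actual classes, which add mailboxes, chunking and error handling on top of this pattern. ::

    # Pull-semantics: a consumer calls next() on a chain of generators.
    def loader():
        yield from [1, 2, 3]              # e.g. chunks read from storage

    def plugin(source):
        for chunk in source:              # iterate over the input...
            yield chunk * 10              # ...and expect the result to be iterated over

    assert list(plugin(loader())) == [10, 20, 30]

    # Push-semantics: a producer sends data into a coroutine and closes it when done.
    def saver():
        received = []
        try:
            while True:
                received.append((yield))  # wait for data to be pushed in
        except GeneratorExit:
            print("closed after receiving", received)  # cleanup when close() is called

    s = saver()
    next(s)                               # prime the coroutine
    for chunk in plugin(loader()):
        s.send(chunk)
    s.close()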
33 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | ====== 2 | Strax 3 | ====== 4 | 5 | Github page: https://github.com/AxFoundation/strax 6 | 7 | Strax is an analysis framework for pulse-only digitization data, 8 | specialized for live data processing at speeds of 50-100 MB(raw) / core / sec. 9 | 10 | For comparison, this is more than 100x faster than the XENON1T processor `pax `_, 11 | and does not require a preprocessing stage ('eventbuilder'). 12 | It achieves this due to using `numpy `_ `structured arrays `_ internally, 13 | which are supported by the amazing just-in-time compiler `numba `_. 14 | 15 | Strax is primarily developed for the XENONnT experiment, although the configuration and specific algorithms for XENONnT are hosted at ``_. You can find its documentation `here `_. 16 | 17 | You might also find these presentations useful: 18 | 19 | * `Talk on strax at the first XENONnT software telecon (May 2018) `_ 20 | * `Talk on strax for DAQ experts (May 2018) `_ 21 | 22 | 23 | .. toctree:: 24 | :maxdepth: 1 25 | :caption: Setup and basics 26 | 27 | basics/setup 28 | basics/overview 29 | 30 | .. toctree:: 31 | :maxdepth: 1 32 | :caption: Advanced usage 33 | 34 | advanced/plugin_dev 35 | advanced/config 36 | advanced/chunking 37 | advanced/superrun 38 | advanced/out_of_core 39 | advanced/recompression 40 | advanced/fuzzy_for 41 | 42 | .. toctree:: 43 | :maxdepth: 1 44 | :caption: Developer documentation 45 | 46 | developer/pipeline 47 | developer/parallel 48 | developer/overlaps 49 | developer/storage 50 | developer/contributing 51 | developer/documentation 52 | developer/release 53 | 54 | The above pages describe how strax's processing framework works under the hood, and explains some implementation choices. It's meant for people who want to do core development on strax; users or even plugin developers should not need it. 55 | 56 | .. toctree:: 57 | :maxdepth: 1 58 | :caption: API Reference 59 | 60 | reference/strax 61 | 62 | .. 
toctree:: 63 | :maxdepth: 2 64 | :caption: Release notes 65 | 66 | reference/release_notes 67 | 68 | 69 | * :ref:`genindex` 70 | * :ref:`modindex` 71 | -------------------------------------------------------------------------------- /tests/test_exhaust_plugin.py: -------------------------------------------------------------------------------- 1 | from typing import Tuple 2 | import numpy as np 3 | import strax 4 | from strax import Plugin, ExhaustPlugin 5 | 6 | 7 | @strax.takes_config( 8 | strax.Option(name="n_chunks", default=10), 9 | strax.Option(name="n_items", default=10), 10 | ) 11 | class ToExhaust(Plugin): 12 | depends_on: Tuple = tuple() 13 | provides: str = "to_exhaust" 14 | 15 | dtype = strax.time_fields 16 | 17 | source_done = False 18 | 19 | def compute(self, chunk_i): 20 | data = np.empty(self.config["n_items"], dtype=self.dtype) 21 | data["time"] = np.arange(self.config["n_items"]) + chunk_i * self.config["n_items"] 22 | data["endtime"] = data["time"] 23 | 24 | if chunk_i == self.config["n_chunks"] - 1: 25 | self.source_done = True 26 | 27 | return self.chunk( 28 | data=data, 29 | start=int(data[0]["time"]), 30 | end=int(strax.endtime(data[-1])) + 1, # to make sure that data is continuous 31 | ) 32 | 33 | def source_finished(self): 34 | return self.source_done 35 | 36 | def is_ready(self, chunk_i): 37 | if "ready" not in self.__dict__: 38 | self.ready = False 39 | self.ready ^= True # Flip 40 | return self.ready 41 | 42 | 43 | @strax.takes_config( 44 | strax.Option(name="n_chunks", default=10), 45 | strax.Option(name="n_items", default=10), 46 | ) 47 | class Exhausted(ExhaustPlugin): 48 | depends_on: str = "to_exhaust" 49 | provides: str = "exhausted" 50 | 51 | dtype = strax.time_fields 52 | 53 | def compute(self, to_exhaust): 54 | return to_exhaust 55 | 56 | def _fetch_chunk(self, d, iters, check_end_not_before=None): 57 | flag = self.input_buffer[d] is None # only check if we have not read anything yet 58 | super()._fetch_chunk(d, iters, check_end_not_before=check_end_not_before) 59 | if flag and (len(self.input_buffer[d]) != self.config["n_chunks"] * self.config["n_items"]): 60 | raise RuntimeError("Exhausted plugin did not read all chunks!") 61 | return False 62 | 63 | 64 | def test_exhaust_plugin(): 65 | """Test the ExhaustPlugin, about whether it can really exhaust the data or not.""" 66 | st = strax.Context(storage=[]) 67 | st.register((ToExhaust, Exhausted)) 68 | st.storage = [ 69 | strax.DataDirectory( 70 | "./strax_data", 71 | provide_run_metadata=True, 72 | ) 73 | ] 74 | run_id = "000000" 75 | st.make(run_id, "to_exhaust") 76 | st.get_array(run_id, "exhausted") 77 | -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | import strax 2 | import numpy as np 3 | import tempfile 4 | import unittest 5 | import hypothesis 6 | from hypothesis import given 7 | 8 | 9 | @strax.takes_config( 10 | strax.Option(name="int_option", type=int, default=42), 11 | strax.Option(name="str_option", type=str, default="forty_two"), 12 | strax.Config(name="mixed", type=int, default=42), 13 | ) 14 | class DummyPlugin(strax.Plugin): 15 | depends_on = () 16 | provides = ("dummy_data",) 17 | dtype = strax.dtypes.time_fields + [ 18 | (("Some data description", "some_data_name"), np.int32), 19 | ] 20 | 21 | int_config = strax.Config(type=int, default=42) 22 | str_config = strax.Config(type=str, default="forty_two") 23 | 24 | 25 | class 
TestPluginConfig(unittest.TestCase): 26 | @staticmethod 27 | def get_plugin(config): 28 | with tempfile.TemporaryDirectory() as temp_dir: 29 | context = strax.Context( 30 | storage=strax.DataDirectory(temp_dir, deep_scan=True), 31 | config=config, 32 | register=[DummyPlugin], 33 | use_per_run_defaults=True, 34 | ) 35 | 36 | return context.get_single_plugin("321", "dummy_data") 37 | 38 | def test_config_defaults(self): 39 | p = self.get_plugin({}) 40 | assert p.int_config == p.int_option == 42 41 | assert p.str_option == p.str_config == "forty_two" 42 | 43 | @given( 44 | hypothesis.strategies.integers(), 45 | hypothesis.strategies.text(), 46 | ) 47 | def test_config_attr_access(self, int_value, str_value): 48 | config = { 49 | "int_config": int_value, 50 | "str_config": str_value, 51 | "int_option": int_value, 52 | "str_option": str_value, 53 | } 54 | p = self.get_plugin(config) 55 | 56 | assert p.int_config == p.int_option == int_value 57 | assert p.str_option == p.str_config == str_value 58 | 59 | @given( 60 | hypothesis.strategies.integers(), 61 | hypothesis.strategies.text(), 62 | ) 63 | def test_config_dict_access(self, int_value, str_value): 64 | """Test backward compatibility.""" 65 | config = { 66 | "int_config": int_value, 67 | "str_config": str_value, 68 | "int_option": int_value, 69 | "str_option": str_value, 70 | } 71 | 72 | p = self.get_plugin(config) 73 | assert p.config["int_config"] == p.config["int_option"] == int_value 74 | assert p.config["str_config"] == p.config["str_option"] == str_value 75 | 76 | def test_config_backward_compatibility(self): 77 | p = self.get_plugin({}) 78 | assert p.mixed == 42 79 | -------------------------------------------------------------------------------- /docs/source/developer/storage.rst: -------------------------------------------------------------------------------- 1 | Storage 2 | ======== 3 | 4 | Overview 5 | --------- 6 | Players in strax's storage system take on one of three roles: 7 | * ``StorageFrontend``: Find data locations, and communicate this to one or more ``StorageBackend`` instances; 8 | * ``StorageBackend``: load pieces of data, and create instances of ``Saver``; 9 | * ``Saver``: save pieces of data to a specific location. 10 | 11 | As an example, a ``StorageFrontend`` could talk to a database that tracks which data is stored where. 12 | A ``StorageBackend`` then retrieves data from local disks, while another might retrieve it remotely using SSH or other transfer systems. 13 | The front-end decides which backend is appropriate for a given request. Finally, a ``Savers`` guides the process of writing a particular 14 | piece of data to disk or databases (potentially from multiple cores), compressing and rechunking as needed. 15 | 16 | To implement a new way of storing and/or tracking data, you must implement (subclass) all or some of these classes. 17 | This means subclassing them and overriding a few specific methods 18 | (called 'abstract methods' because they ``raise NotImplementedError`` if they are not overridden). 19 | 20 | Keys 21 | ----- 22 | In strax, a piece of data is identified by a *DataKey*, consisting of three components: 23 | * The run id 24 | * The data type 25 | * The complete *lineage* of the data. 
This includes, for the data type itself, and all types it depends on (and their dependencies, and so forth): 26 | * The plugin class name that produced the data; 27 | * The version string of the plugin; 28 | * The values of all configuration options the plugin took (whether they were explicitly specified or left as default). 29 | 30 | When you ask for data using ``Context.get_xxx``, the context will produce a key like this, and pass it to the ``StorageFrontend``. 31 | It then looks for a filename or database collection name that matches this key -- something a ``StorageBackend`` understands. which is therefore generically called a *backend key*. 32 | The matching between DataKey and backend key can be done very strictly, or more loosely, depending on how the context is configured. 33 | This way you can choose to be completely sure about what data you get, or be more flexible and load whatever is available. 34 | TODO: ref context documentation. 35 | 36 | 37 | Run-level metadata 38 | ------------------- 39 | Metadata can be associated with a run, but no particular data type. The ``StorageFrontend`` must take care of saving and loading these. 40 | 41 | Such run-level metadata can be crucial in providing run-dependent default setting for configuration options, for example, calibrated quantities necessary 42 | for data processing (e.g. electron lifetime and PMT gains). 43 | -------------------------------------------------------------------------------- /tests/test_statistics.py: -------------------------------------------------------------------------------- 1 | import strax 2 | import numpy as np 3 | from scipy.stats import norm 4 | from strax.processing.hitlets import highest_density_region_width 5 | 6 | 7 | def test_highest_density_region(): 8 | """Unity test for highest density regions.""" 9 | # Some distribution: 10 | distribution = np.array([0, 0, 3, 4, 2, 0, 1]) 11 | # Truth dict always stores fraction desired, intervals: 12 | truth_dict = {0.2: [[2, 4]], 0.7: [[2, 5], [6, 7]]} 13 | _test_highest_density_region(distribution, truth_dict) 14 | 15 | # Distribution with an offset: 16 | distribution = np.array([0, 0, 3, 4, 2, 0, 1]) + 2 17 | truth_dict = {0.2: [[2, 5]], 0.7: [[0, len(distribution)]]} 18 | _test_highest_density_region(distribution, truth_dict) 19 | 20 | 21 | def _test_highest_density_region(distribution, truth_dict): 22 | intervals, heights = strax.highest_density_region( 23 | distribution, 24 | np.array(list(truth_dict.keys())), 25 | only_upper_part=True, 26 | ) 27 | for fraction_ind, (key, values) in enumerate(truth_dict.items()): 28 | for ind_interval, interval in enumerate(values): 29 | int_found = intervals[fraction_ind, :, ind_interval] 30 | mes = ( 31 | f"Have not found the correct edges for a fraction of {key}% found {int_found}, but" 32 | f" expected {interval}" 33 | ) 34 | assert np.all(int_found == interval), mes 35 | 36 | 37 | def test_too_small_buffer(): 38 | """Unit test to check whether a too small buffer leads to np.nans.""" 39 | distribution = np.ones(1000) 40 | distribution[::4] = 0 41 | indicies, _ = strax.highest_density_region( 42 | distribution, 43 | np.array([0.5]), 44 | only_upper_part=True, 45 | ) 46 | assert np.all(indicies == -1) 47 | 48 | width = highest_density_region_width( 49 | distribution, fractions_desired=np.array([0.5]), _buffer_size=10 50 | ) 51 | assert np.all(np.isnan(width)) 52 | 53 | 54 | def test_true_hdr(): 55 | """Tests if highest density region returns for a normal distribution the expected -1/1 56 | boundaries for 68.27% coverage. 
57 | 58 | We are not using a very high precision here, to reduce the total test time. 59 | 60 | """ 61 | x = np.arange(-5, 5, 10**-4) 62 | data = norm.pdf(x) 63 | data /= np.sum(data) 64 | index, _ = strax.highest_density_region(data, fractions_desired=np.array([0.6827])) 65 | a_index = index[0, 0, 0] 66 | b_index = index[0, 1, 0] 67 | area = np.sum(data[a_index : (b_index - 1)]) 68 | 69 | assert np.isclose(area, 0.6827, rtol=10**-4), (area, 0.6827) 70 | assert np.isclose(x[a_index], -1, rtol=10**-3), (x[a_index], -1) 71 | assert np.isclose(x[b_index - 1], 1, rtol=10**-3), (x[b_index - 1], 1) 72 | -------------------------------------------------------------------------------- /docs/source/developer/overlaps.rst: -------------------------------------------------------------------------------- 1 | Chunk boundary handling 2 | ======================== 3 | 4 | Many algorithms need to look back and ahead in the data. For example, we want to group nearby PMT pulses together into peaks. 5 | Or, to ensure non-overlapping events, we want to group triggers with others that occur just before or ahead. 6 | 7 | During online processing, however, not all data is available. How can you look ahead if the data has not been acquired? 8 | Even during offline processing, you may not be able to keep all data of a given type for a run in RAM. 9 | 10 | Overlap window plugins 11 | ----------------------- 12 | 13 | Strax includes the `OverlapWindowPlugin` to deal with this case. To use it, specify a **window size**: a maximum duration over which the algorithm needs to look back or ahead. Then write your algorithm as if there are no chunk breaks -- everything will be taken care of behind the scenes. 14 | 15 | The illustration below shows how `OverlapWindowPlugin` works. Imagine this is an event finder plugin, which finds events (green and red dashes) in a stream of peaks (continuous blue line). 16 | 17 | .. image:: overlap_window.jpg 18 | 19 | * Outputs too close to the end of a chunk are discarded, except for the last chunk. When a chunk arrives, it is generally not known to be the last, so we keep this data around internally and emit it once we get word there will be no more chunks. 20 | * Inputs close to the end of a chunk (pink region) are cached, and added to the input for the next chunk. Note we must cache *two* windows of input: to interpret data at (end - one window) we need all data from (end - two windows) to (end). 21 | * For the next chunk, outputs fully in the region between (end - window) and (end) of the previous chunk are discarded. These are invalid, and moreover, we sent out the valid outputs for that range during the previous chunks. 22 | 23 | Note from the figure that outputs straddling (end - window) are initially discarded; they are recomputed during the next chunk. 24 | 25 | If the endtimes of two objects are separated by more than a window size, they must be guaranteed to not influence each other. If your algorithm does not have this guarantee, you cannot use `OverlapWindowPlugin` and must implement a custom solution. Make sure your window is large enough so this guarantee holds even if the objects themselves have a relevant length. 26 | 27 | Chunk breaks and the DAQ reader 28 | -------------------------------- 29 | 30 | Low-level datastreams are too large to be routed through a single core. Instead, each CPU sees only a chunk of data. However, the `OverlapWindowPlugin` will not work here: because it keeps state (the cached input and the temporarily cached output), it cannot be parallelized.
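To make the `OverlapWindowPlugin` usage described above more concrete, here is a minimal sketch of a subclass. The plugin name, data types and window value are invented for illustration, and the sketch only assumes that a subclass provides a window size via ``get_window_size`` plus a regular ``compute`` method:

.. code-block:: python

    import numpy as np
    import strax

    class NearbyPeakCount(strax.OverlapWindowPlugin):
        """Hypothetical example: count other peaks within 1 us of each peak."""

        depends_on = "peaks"
        provides = "nearby_peak_count"
        dtype = strax.time_fields + [
            (("Number of other peaks within the window", "n_nearby"), np.int32)
        ]
        window_ns = 1_000

        def get_window_size(self):
            # Maximum duration (in ns) the algorithm looks back or ahead
            return self.window_ns

        def compute(self, peaks):
            result = np.zeros(len(peaks), dtype=self.dtype)
            result["time"] = peaks["time"]
            result["endtime"] = strax.endtime(peaks)
            for i, p in enumerate(peaks):
                nearby = np.abs(peaks["time"] - p["time"]) <= self.window_ns
                result["n_nearby"][i] = nearby.sum() - 1  # exclude the peak itself
            return result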
31 | 32 | For the low-level datastream, we take a different approach 33 | 34 | TODO: document pre- and post-chunk stuff here. 35 | -------------------------------------------------------------------------------- /tests/test_peak_properties.py: -------------------------------------------------------------------------------- 1 | import strax 2 | import numpy as np 3 | from hypothesis import given, strategies, settings 4 | 5 | 6 | def get_filled_peaks(peak_length, data_length, n_widths): 7 | dtype = [ 8 | (("Start time since unix epoch [ns]", "time"), np.int64), 9 | (("dt in ns", "dt"), np.int64), 10 | (("length of p", "length"), np.int16), 11 | (("area of p", "area"), np.float64), 12 | (("data of p", "data"), (np.float64, data_length)), 13 | ] 14 | if n_widths is not None: 15 | dtype += [ 16 | (("center_time of p", "center_time"), np.int64), 17 | (("median_time of p", "median_time"), np.float64), 18 | (("width of p", "width"), (np.float64, n_widths)), 19 | ( 20 | ("area_decile_from_midpoint of p", "area_decile_from_midpoint"), 21 | (np.float64, n_widths), 22 | ), 23 | ] 24 | peaks = np.zeros(peak_length, dtype=dtype) 25 | dt = 1 26 | peaks["time"] = np.arange(peak_length) * dt 27 | peaks["dt"] = dt 28 | 29 | # Fill the peaks with random length data 30 | for p in peaks: 31 | length = np.random.randint(1, data_length) 32 | p["length"] = length 33 | wf = np.random.random(size=length) 34 | p["data"][:length] = wf 35 | if len(peaks): 36 | # Compute sum area 37 | peaks["area"] = np.sum(peaks["data"], axis=1) 38 | return peaks 39 | 40 | 41 | @settings(max_examples=100, deadline=None) 42 | @given( 43 | # number of peaks 44 | strategies.integers(min_value=0, max_value=20), 45 | # length of the data field in the peaks 46 | strategies.integers(min_value=2, max_value=20), 47 | ) 48 | def test_index_of_fraction(peak_length, data_length): 49 | """Test strax.index_of_fraction.""" 50 | peaks = get_filled_peaks(peak_length, data_length, n_widths=None) 51 | 52 | fraction_desired = np.random.random(size=peak_length) 53 | res = strax.index_of_fraction(peaks, fraction_desired) 54 | assert len(res) == len(peaks), "Lost peaks" 55 | if len(peaks): 56 | assert np.max(res) <= data_length, "Index returned out of bound" 57 | 58 | 59 | @settings(max_examples=100, deadline=None) 60 | @given( 61 | # number of peaks 62 | strategies.integers(min_value=0, max_value=20), 63 | # length of the data field in the peaks 64 | strategies.integers(min_value=2, max_value=20), 65 | # Number of widths to compute 66 | strategies.integers(min_value=2, max_value=10), 67 | ) 68 | def test_compute_center_time_widths(peak_length, data_length, n_widths): 69 | """Test strax.compute_properties.""" 70 | peaks = get_filled_peaks(peak_length, data_length, n_widths) 71 | 72 | # Make a copy of peaks to test that they don't remain the same later 73 | pre_peaks = peaks.copy() 74 | strax.compute_properties(peaks) 75 | 76 | assert len(pre_peaks) == len(peaks), "Lost peaks" 77 | if np.sum(peaks["area"] > 0) > 10: 78 | mess = ( 79 | "Highly unlikely that from at least 10 positive area peaks " 80 | "none were able to compute the width" 81 | ) 82 | assert np.any(peaks["width"] != pre_peaks["width"]), mess 83 | -------------------------------------------------------------------------------- /tests/test_peak_merging.py: -------------------------------------------------------------------------------- 1 | import hypothesis 2 | import numpy as np 3 | 4 | import strax 5 | from strax.testutils import disjoint_sorted_intervals, fake_hits 6 | 7 | 8 | 
@hypothesis.given(disjoint_sorted_intervals, disjoint_sorted_intervals) 9 | @hypothesis.settings(max_examples=1000, deadline=None) 10 | def test_replace_merged(intervals, merge_instructions): 11 | # First we have to create some merged intervals. 12 | # We abuse the interval generation mechanism to create 'merge_instructions' 13 | # i.e. something to tell us which indices of intervals must be merged 14 | # together. 15 | 16 | merged_itvs = [] 17 | to_remove = [] 18 | for x in merge_instructions: 19 | start, end_inclusive = x["time"], x["time"] + x["length"] - 1 20 | if end_inclusive == start or end_inclusive >= len(intervals): 21 | # Pointless / invalid merge instruction 22 | continue 23 | to_remove.extend(list(range(start, end_inclusive + 1))) 24 | new = np.zeros(1, strax.interval_dtype)[0] 25 | new["time"] = intervals[start]["time"] 26 | new["length"] = strax.endtime(intervals[end_inclusive]) - new["time"] 27 | new["dt"] = 1 28 | merged_itvs.append(new) 29 | removed_itvs = [] 30 | kept_itvs = [] 31 | for i, itv in enumerate(intervals): 32 | if i in to_remove: 33 | removed_itvs.append(itv) 34 | else: 35 | kept_itvs.append(itv) 36 | 37 | kept_itvs = np.array(kept_itvs) 38 | merged_itvs = np.array(merged_itvs) 39 | 40 | result = strax.replace_merged(intervals, merged_itvs) 41 | assert len(result) == len(merged_itvs) + len(kept_itvs) 42 | assert np.all(np.diff(result["time"]) > 0), "Not sorted" 43 | assert np.all(result["time"][1:] - strax.endtime(result)[:-1] >= 0), "Overlap" 44 | for x in kept_itvs: 45 | assert x in result, "Removed too many" 46 | for x in merged_itvs: 47 | assert x in result, "Didn't put in merged" 48 | for x in result: 49 | assert np.isin(x, merged_itvs) or np.isin(x, kept_itvs), "Invented itv" 50 | 51 | 52 | @hypothesis.given( 53 | fake_hits, 54 | hypothesis.strategies.integers(min_value=0, max_value=int(1e18)), 55 | hypothesis.strategies.integers(min_value=0, max_value=100), 56 | hypothesis.strategies.integers(min_value=1, max_value=2), 57 | ) 58 | @hypothesis.settings(deadline=None) 59 | def test_add_lone_hits(hits, time_offset, peak_length, dt): 60 | peak = np.zeros(1, dtype=strax.peak_dtype()) 61 | peak["time"] = time_offset 62 | hits["time"] += time_offset 63 | peak["length"] = peak_length 64 | hits["area"] = 1 65 | peak["dt"] = dt 66 | 67 | to_pe = np.ones(10000) 68 | strax.add_lone_hits(peak, hits, to_pe) 69 | 70 | if not peak_length: 71 | assert peak["area"] == 0 72 | assert peak["data"].sum() == 0 73 | return 74 | 75 | split_hits = strax.split_by_containment(hits, peak)[0] 76 | dummy_peak = np.zeros(peak_length) 77 | 78 | for h in split_hits: 79 | dummy_peak[(h["time"] - time_offset) // dt] += h["area"] 80 | peak = peak[0] 81 | assert peak["area"] == np.sum(split_hits["area"]) 82 | assert np.all(peak["data"][:peak_length] == dummy_peak) 83 | -------------------------------------------------------------------------------- /.github/workflows/pytest.yml: -------------------------------------------------------------------------------- 1 | # Test strax on each PR 2 | name: tests 3 | 4 | # Trigger this code when a new release is published 5 | on: 6 | workflow_dispatch: 7 | release: 8 | types: [ created ] 9 | pull_request: 10 | branches: 11 | - master 12 | - stable 13 | push: 14 | branches: 15 | - master 16 | 17 | jobs: 18 | test: 19 | name: "${{ matrix.test }}_py${{ matrix.python-version }}" 20 | runs-on: ubuntu-latest 21 | strategy: 22 | fail-fast: false 23 | matrix: 24 | python-version: ["3.10", "3.11"] 25 | test: ["coveralls", "pytest"] 26 | # Installation on 
py3.10 is rather slow at the moment 27 | exclude: 28 | - python-version: "3.11" 29 | test: coveralls 30 | steps: 31 | - name: Checkout repo 32 | uses: actions/checkout@v4 33 | - name: Setup python 34 | uses: actions/setup-python@v5 35 | with: 36 | python-version: ${{ matrix.python-version }} 37 | - name: Install dependencies 38 | run: sudo apt-get install -y graphviz 39 | - name: Install requirements for Python 3.10 40 | if: matrix.python-version == '3.10' 41 | run: pip install git+https://github.com/XENONnT/base_environment.git@el7.2025.01.3 --force-reinstall 42 | - name: Install requirements for Python 3.11 43 | if: matrix.python-version == '3.11' 44 | run: pip install git+https://github.com/XENONnT/base_environment.git --force-reinstall 45 | - name: Install strax 46 | run: pip install . 47 | - name: Start MongoDB 48 | uses: supercharge/mongodb-github-action@1.11.0 49 | with: 50 | mongodb-version: 4.2 51 | # Perform coveralls (if coverage is set to True) or pytest 52 | - name: Test package 53 | if: matrix.test == 'pytest' 54 | env: 55 | TEST_MONGO_URI: 'mongodb://localhost:27017/' 56 | run: | 57 | pytest -v --durations 0 58 | - name: Coverage run 59 | if: matrix.test == 'coveralls' 60 | env: 61 | NUMBA_DISABLE_JIT: 1 62 | TEST_MONGO_URI: 'mongodb://localhost:27017/' 63 | run: | 64 | coverage run --source=strax -m pytest --durations 0 -v 65 | - name: Coverage run - backward compatibility check with straxen 66 | if: matrix.test == 'coveralls' 67 | env: 68 | NUMBA_DISABLE_JIT: 1 69 | TEST_MONGO_URI: 'mongodb://localhost:27017/' 70 | run: | 71 | echo "clone straxen" 72 | straxen_dir="../straxen/" 73 | git clone --single-branch --branch master https://github.com/XENONnT/straxen.git $straxen_dir 74 | bash $straxen_dir/.github/scripts/create_pre_apply_function.sh $HOME 75 | pip install -e $straxen_dir # Reinstall since tests might reflect new code. 76 | echo "Run straxen tests" 77 | coverage run --append --source=strax -m pytest $straxen_dir 78 | coverage report 79 | - name: Coveralls upload 80 | if: matrix.test == 'coveralls' 81 | env: 82 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 83 | run: | 84 | coverage report 85 | coveralls --service=github 86 | - name: goodbye 87 | run: echo goodbye 88 | -------------------------------------------------------------------------------- /tests/test_pulse_processing.py: -------------------------------------------------------------------------------- 1 | from strax.testutils import single_fake_pulse 2 | 3 | import numpy as np 4 | from hypothesis import given, settings 5 | from scipy.ndimage import convolve1d 6 | 7 | import strax 8 | 9 | 10 | def _find_hits(r): 11 | # Test pulses have dt=1 and time=0 12 | # hm, maybe this doesn't test everything? 
13 | 14 | hits = strax.find_hits(r, min_amplitude=1) 15 | 16 | # dt = 1, so: 17 | np.testing.assert_equal(hits["time"], hits["left"]) 18 | 19 | # NB: exclusive right bound, no + 1 here 20 | np.testing.assert_equal(hits["length"], hits["right"] - hits["left"]) 21 | 22 | # Check hits are properly integrated 23 | for h in hits: 24 | q = r[h["record_i"]] 25 | assert q["data"][h["left"] : h["right"]].sum() == h["area"] 26 | 27 | return list(zip(hits["left"], hits["right"])) 28 | 29 | 30 | def test_find_hits(): 31 | """Tests the hitfinder with simple example pulses.""" 32 | for w, should_find_intervals in [ 33 | ([], []), 34 | ([1], [(0, 1)]), 35 | ([1, 0], [(0, 1)]), 36 | ([1, 0, 1], [(0, 1), (2, 3)]), 37 | ([1, 0, 1, 0], [(0, 1), (2, 3)]), 38 | ([1, 0, 1, 0, 1], [(0, 1), (2, 3), (4, 5)]), 39 | ([0, 1, 2, 0, 4, -1, 60, 700, -4], [(1, 3), (4, 5), (6, 8)]), 40 | ([1, 1, 2, 0, 4, -1, 60, 700, -4], [(0, 3), (4, 5), (6, 8)]), 41 | ([1, 0, 2, 3, 4, -1, 60, 700, -4], [(0, 1), (2, 5), (6, 8)]), 42 | ([1, 0, 2, 3, 4, -1, 60, 700, 800], [(0, 1), (2, 5), (6, 9)]), 43 | ([0, 0, 2, 3, 4, -1, 60, 700, 800], [(2, 5), (6, 9)]), 44 | ]: 45 | records = np.zeros(1, strax.record_dtype(9)) 46 | records[0]["data"][: len(w)] = w 47 | records["dt"] = 1 48 | records["length"] = 9 49 | 50 | results = _find_hits(records) 51 | assert len(results) == len(should_find_intervals) 52 | assert results == should_find_intervals 53 | 54 | 55 | @settings(deadline=None) 56 | @given(single_fake_pulse) 57 | def test_find_hits_randomize(records): 58 | """Tests the hitfinder with whatever hypothesis can throw at it (well, pulse only takes (0, 1), 59 | and we only test a single pulse at a time)""" 60 | results = _find_hits(records) 61 | w = records[0]["data"] 62 | 63 | # Check for false positives 64 | for ll, rr in results: 65 | assert np.all(w[ll:rr] == 1) 66 | 67 | # Check for false negatives 68 | for i in range(len(results) - 1): 69 | l_ = results[i][1] 70 | r_ = results[i + 1][0] 71 | assert not np.any(w[l_:r_] == 1) 72 | 73 | 74 | def test_filter_waveforms(): 75 | """Test that filter_records gives the same output as a simple convolution applied to the 76 | original pulse (before splitting into records)""" 77 | wv = np.random.randn(300) 78 | ir = np.random.randn(41) 79 | ir[10] += 10 # Because it crashes for max at edges 80 | origin = np.argmax(ir) - (len(ir) // 2) 81 | wv_after = convolve1d(wv, ir, mode="constant", origin=origin) 82 | 83 | wvs = wv.reshape(3, 100) 84 | wvs = strax.filter_waveforms( 85 | wvs, 86 | ir, 87 | prev_r=np.array([strax.NO_RECORD_LINK, 0, 1]), 88 | next_r=np.array([1, 2, strax.NO_RECORD_LINK]), 89 | ) 90 | wv_after_2 = np.reshape(wvs, -1) 91 | 92 | assert np.abs(wv_after - wv_after_2).sum() < 1e-9 93 | -------------------------------------------------------------------------------- /docs/source/advanced/out_of_core.rst: -------------------------------------------------------------------------------- 1 | Out of core computation 2 | ======================= 3 | 4 | Overview and motivation 5 | ------------------------ 6 | Many times analyses involve performing some computation not implemented by a plugin (e.g. plotting) 7 | that require loading more data than can fit into memory, 8 | these type of tasks are commonly reffered to as out-of-core computations. 9 | Out-of-core algorithms usually involve a few repeating steps: 10 | 11 | 1. chunk the dataset into managable sizes 12 | 2. load the data chunk by chunk 13 | 3. perform some computation on each chunk 14 | 4. 
save a summary of the results for each chunk 15 | 5. perform some combination of the per-chunk results into a final result. 16 | 17 | While it is of course possible to implement these operations yourself, it can be tedious and repetitive, and the code becomes tightly coupled to the specific calculation being performed. 18 | A better approach is to use abstractions of commonly performed operations that use out-of-core algorithms under the hood to get the same result as if the operations were performed on the entire dataset. 19 | Code written using these abstractions can then run on in-memory and out-of-core datasets alike. 20 | More importantly, the implementations of these algorithms can be written once, packaged, and then used by all. 21 | 22 | Data chunking 23 | ------------- 24 | The zarr package provides an abstraction of the data-access API of numpy arrays for chunked and compressed data stored in memory or on disk. 25 | zarr provides an array abstraction with identical behavior to a numpy array when accessing data, but where the underlying data is actually a collection of (optionally) compressed chunks. 26 | The strax context provides a convenience method for loading data directly into zarr arrays. 27 | 28 | .. code-block:: python 29 | 30 | import strax 31 | 32 | context = strax.Context(**CONTEXT_KWARGS) 33 | 34 | # you can pass the same arguments you pass to context.get_array() 35 | zgrp = context.get_zarr(RUN_IDs, DATA_TYPES, **GET_ARRAY_KWARGS) 36 | 37 | # the zarr group contains multiple arrays, one for each data type 38 | z = zgrp.data_type 39 | 40 | # individual arrays are also accessible via the __getitem__ interface 41 | z = zgrp['data_type'] 42 | 43 | # numpy-like data access, abstracting away the underlying 44 | # data reading, which may include reading multiple chunks from disk/memory 45 | # and decompressing and concatenating them to return an in-memory numpy array 46 | z[:100] 47 | 48 | 49 | Data processing 50 | --------------- 51 | The dask package provides abstractions for most of the numpy and pandas APIs. 52 | The dask.Array and dask.DataFrame objects implement their respective APIs 53 | using fully distributed algorithms, only loading a fraction of the total data into memory 54 | at any given moment for a given computing partition (thread/process/HPC-job). 55 | 56 | .. code-block:: python 57 | 58 | import dask.array as da 59 | 60 | # easily convert to the dask.Array abstraction for processing 61 | darr = da.from_zarr(z) 62 | 63 | # it's recommended to rechunk to sizes more appropriate for processing; 64 | # note that rechunk returns a new array. See the dask documentation for details. 65 | darr = darr.rechunk(CHUNK_SIZE) 66 | 67 | # you can also convert the dask.Array abstraction 68 | # to a dask.DataFrame abstraction if you need the pandas API 69 | ddf = darr.to_dask_dataframe() 70 | -------------------------------------------------------------------------------- /CODE-OF-CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation.
11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 
67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | -------------------------------------------------------------------------------- /tests/test_inline_plugin.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | from unittest import TestCase 3 | import immutabledict 4 | import strax 5 | from strax.testutils import Records, Peaks, PeakClassification, run_id 6 | 7 | 8 | class ParallelPeaks(Peaks): 9 | parallel = "process" 10 | 11 | 12 | class ParallelPeakClassification(PeakClassification): 13 | parallel = "process" 14 | save_when = {k: strax.SaveWhen.EXPLICIT for k in PeakClassification.provides} 15 | save_when["lone_hits"] = strax.SaveWhen.ALWAYS 16 | save_when = immutabledict.immutabledict(save_when) 17 | 18 | 19 | class ParallelEnds(strax.Plugin): 20 | """The most stupid plugin to make sure that we depend on _some_ of the output of 21 | ParallelPeakClassification.""" 22 | 23 | parallel = "process" 24 | provides = "parallel_ends" 25 | depends_on = "peak_classification" 26 | dtype = strax.time_fields 27 | 28 | def compute(self, peaks): 29 | return {"time": peaks["time"], "endtime": strax.endtime(peaks)} 30 | 31 | 32 | class TestInline(TestCase): 33 | store_at = "./.test_inline" 34 | 35 | def setUp(self) -> None: 36 | st = strax.context.Context( 37 | allow_multiprocess=True, 38 | allow_lazy=False, 39 | max_messages=4, 40 | timeout=60, 41 | config=dict(bonus_area=9), 42 | ) 43 | st.storage = [strax.DataDirectory(self.store_at)] 44 | for p in [Records, ParallelPeaks, ParallelPeakClassification, ParallelEnds]: 45 | st.register(p) 46 | self.st = st 47 | assert not any(st.is_stored(run_id, t) for t in st._plugin_class_registry.keys()) 48 | 49 | def tearDown(self) -> None: 50 | shutil.rmtree(self.store_at) 51 | 52 | def test_inline(self, **make_kwargs): 53 | st = self.st 54 | targets = ("records", "parallel_ends") 55 | st.make( 56 | run_id, 57 | targets, 58 | allow_multiple=True, 59 | processor="threaded_mailbox", 60 | **make_kwargs, 61 | ) 62 | for target in list(st._plugin_class_registry.keys()): 63 | should_be_stored = st.get_save_when(target) == strax.SaveWhen.ALWAYS 64 | if target in targets and not should_be_stored: 65 | # redundant check but just in case someone ever changes 66 | # this test the records test plugin 67 | should_be_stored = st.get_save_when(target) == strax.SaveWhen.TARGET 68 | assert st.is_stored(run_id, target) == should_be_stored 69 | 70 | def test_inline_with_multi_processing(self, **make_kwargs): 71 | self.test_inline(max_workers=2, **make_kwargs) 72 | 73 | def test_inline_with_temp_config(self, **make_kwargs): 74 | self.test_inline_with_multi_processing(config=dict(secret_time_offset=10), **make_kwargs) 75 | 76 | def test_inline_bare(self, n_chunks=3): 77 | """Get the plugin from a bare processor and run in this thread.""" 78 | st = self.st 79 | st.set_config(dict(n_chunks=n_chunks)) 80 | targets = list(st._plugin_class_registry.keys()) 81 | components = st.get_components(run_id, targets=targets) 82 | parallel_components = strax.ParallelSourcePlugin.inline_plugins( 83 | components, start_from="records", log=st.log 84 | ) 85 | parallel_plugin = parallel_components.plugins["parallel_ends"] 86 | for chunk_i in range(n_chunks): 87 | assert len(parallel_plugin.do_compute(chunk_i=chunk_i)) 88 | 
-------------------------------------------------------------------------------- /tests/test_down_chunk_plugin.py: -------------------------------------------------------------------------------- 1 | from immutabledict import immutabledict 2 | from strax.testutils import RecordsWithTimeStructure, DownSampleRecords, run_id 3 | import strax 4 | import numpy as np 5 | 6 | import os 7 | import tempfile 8 | import shutil 9 | import uuid 10 | import unittest 11 | 12 | 13 | class TestContext(unittest.TestCase): 14 | """Tests for DownChunkPlugin class.""" 15 | 16 | def setUp(self): 17 | """Make temp folder to write data to.""" 18 | temp_folder = uuid.uuid4().hex 19 | self.tempdir = os.path.join(tempfile.gettempdir(), temp_folder) 20 | assert not os.path.exists(self.tempdir) 21 | 22 | def tearDown(self): 23 | if os.path.exists(self.tempdir): 24 | shutil.rmtree(self.tempdir) 25 | 26 | def test_down_chunking(self): 27 | st = self.get_context() 28 | st.register(RecordsWithTimeStructure) 29 | st.register(DownSampleRecords) 30 | 31 | st.make(run_id, "records") 32 | st.make(run_id, "records_down_chunked") 33 | 34 | chunks_records = st.get_metadata(run_id, "records")["chunks"] 35 | chunks_records_down_chunked = st.get_metadata(run_id, "records_down_chunked")["chunks"] 36 | 37 | _chunks_are_downsampled = len(chunks_records) * 2 == len(chunks_records_down_chunked) 38 | assert _chunks_are_downsampled 39 | 40 | _chunks_are_continues = np.all( 41 | [ 42 | chunks_records_down_chunked[i]["end"] == chunks_records_down_chunked[i + 1]["start"] 43 | for i in range(len(chunks_records_down_chunked) - 1) 44 | ] 45 | ) 46 | assert _chunks_are_continues 47 | 48 | def test_down_chunking_multi_processing(self): 49 | st = self.get_context(allow_multiprocess=True) 50 | st.register(RecordsWithTimeStructure) 51 | st.register(DownSampleRecords) 52 | 53 | st.make(run_id, "records", max_workers=1) 54 | 55 | class TestMultiProcessing(DownSampleRecords): 56 | parallel = True 57 | 58 | st.register(TestMultiProcessing) 59 | with self.assertRaises(NotImplementedError): 60 | st.make(run_id, "records_down_chunked", max_workers=2) 61 | 62 | def test_down_chunking_multi_output(self): 63 | st = self.get_context(allow_multiprocess=True) 64 | st.register(RecordsWithTimeStructure) 65 | st.register(DownSampleRecords) 66 | 67 | st.make(run_id, "records", max_workers=1) 68 | 69 | class TestMultiOutput(DownSampleRecords): 70 | provides = ("records_down_chunked", "records_down_chunked_copy") 71 | data_kind = immutabledict(zip(provides, provides)) 72 | 73 | def infer_dtype(self): 74 | return {p: DownSampleRecords.dtype for p in self.provides} 75 | 76 | def compute(self, records, start, end): 77 | for r in super().compute(records, start, end): 78 | yield r 79 | 80 | st.register(TestMultiOutput) 81 | with self.assertRaises(ValueError): 82 | st.make(run_id, "records_down_chunked", max_workers=2) 83 | 84 | def get_context(self, **kwargs): 85 | """Simple context to run tests.""" 86 | st = strax.Context(storage=self.get_mock_sf(), check_available=("records",), **kwargs) 87 | return st 88 | 89 | def get_mock_sf(self): 90 | mock_rundb = [{"name": "0", strax.RUN_DEFAULTS_KEY: dict(base_area=43)}] 91 | sf = strax.DataDirectory(path=self.tempdir, deep_scan=True, provide_run_metadata=True) 92 | for d in mock_rundb: 93 | sf.write_run_metadata(d["name"], d) 94 | return sf 95 | -------------------------------------------------------------------------------- /tests/test_saving.py: -------------------------------------------------------------------------------- 1 | 
import unittest 2 | import strax 3 | from strax.testutils import Records, Peaks 4 | import os 5 | import tempfile 6 | 7 | from strax import RUN_METADATA_PATTERN 8 | 9 | 10 | class TestPerRunDefaults(unittest.TestCase): 11 | """Test the saving behavior of the context.""" 12 | 13 | def setUp(self): 14 | self.test_run_id = "0" 15 | self.target = "records" 16 | self.tempdir = tempfile.TemporaryDirectory() 17 | self.path = self.tempdir.name 18 | self.st = strax.Context( 19 | use_per_run_defaults=True, register=[Records], storage=[strax.DataDirectory(self.path)] 20 | ) 21 | assert not self.st.is_stored(self.test_run_id, self.target) 22 | 23 | def tearDown(self): 24 | self.tempdir.cleanup() 25 | 26 | def test_savewhen_never(self, **kwargs): 27 | self.set_save_when("NEVER") 28 | self.st.make(self.test_run_id, self.target, **kwargs) 29 | assert not self.is_stored() 30 | 31 | def test_savewhen_never_with_save(self): 32 | should_fail_with_save = self.test_savewhen_never 33 | self.assertRaises(ValueError, should_fail_with_save, save=self.target) 34 | 35 | def test_savewhen_explict_without_save(self): 36 | self.set_save_when("EXPLICIT") 37 | self.st.make(self.test_run_id, self.target) 38 | assert not self.is_stored() 39 | 40 | def test_savewhen_explict_with_save(self): 41 | self.set_save_when("EXPLICIT") 42 | self.st.make(self.test_run_id, self.target, save=self.target) 43 | assert self.is_stored() 44 | 45 | def test_savewhen_target(self): 46 | self.set_save_when("TARGET") 47 | self.st.make(self.test_run_id, self.target) 48 | assert self.is_stored() 49 | 50 | def test_savewhen_always(self): 51 | self.set_save_when("ALWAYS") 52 | self.st.make(self.test_run_id, self.target) 53 | assert self.is_stored() 54 | 55 | def is_stored(self): 56 | return self.st.is_stored(self.test_run_id, self.target) 57 | 58 | def set_save_when(self, mode: str): 59 | if not hasattr(strax.SaveWhen, mode.upper()): 60 | raise ValueError(f"No such saving mode {mode}") 61 | save_mode = getattr(strax.SaveWhen, mode.upper()) 62 | self.st._plugin_class_registry[self.target].save_when = save_mode 63 | 64 | def test_raise_corruption(self): 65 | self.set_save_when("ALWAYS") 66 | self.st.make(self.test_run_id, self.target) 67 | assert self.is_stored() 68 | storage = self.st.storage[0] 69 | data_key = self.st.key_for(self.test_run_id, self.target) 70 | data_path = os.path.join(storage.path, str(data_key)) 71 | assert os.path.exists(data_path) 72 | metadata = storage.backends[0].get_metadata(data_path) 73 | assert isinstance(metadata, dict) 74 | 75 | # copied from FileSytemBackend (maybe abstractify the method separately?) 
76 | prefix = strax.dirname_to_prefix(data_path) 77 | metadata_json = RUN_METADATA_PATTERN % prefix 78 | md_path = os.path.join(data_path, metadata_json) 79 | assert os.path.exists(md_path) 80 | 81 | # Corrupt the metadata (making it non-JSON parsable) 82 | md_file = open(md_path, "a") 83 | # Append 'hello' at the end of file 84 | md_file.write("Adding a non-JSON line to the file to corrupt the metadata") 85 | # Close the file 86 | md_file.close() 87 | 88 | # Now we should get an error since the metadata data is corrupted 89 | with self.assertRaises(strax.DataCorrupted): 90 | self.st.get_array(self.test_run_id, self.target) 91 | 92 | # Also test the error is raised if be build a target that depends on corrupted data 93 | self.st.register(Peaks) 94 | with self.assertRaises(strax.DataCorrupted): 95 | self.st.get_array(self.test_run_id, "peaks") 96 | 97 | # Cleanup if someone wants to re-use this self.st 98 | del self.st._plugin_class_registry["peaks"] 99 | -------------------------------------------------------------------------------- /docs/source/developer/parallel.rst: -------------------------------------------------------------------------------- 1 | Parallelization 2 | ================ 3 | 4 | Strax can process data at 50-100 raw-MB /sec single core, which is not enough for live online processing at high DAQ rates. We must thus parallelize at least some of the signal processing. 5 | 6 | Not all plugins can be parallelized. For example, we cannot assign event numbers (0, 1, 2, ...) in parallel if we want unique numbers that increment without gaps. We also cannot save to a single file in parallel. 7 | 8 | Multithreading 9 | --------------- 10 | To get parallelization, plugins can defer computations to a pool of **threads** or **processes**. If they do, they yield futures to the output mailbox instead of the actual results (numpy arrays). The mailbox awaits the futures and ensures each consumer gets the results in order. 11 | 12 | A plugin indicates to strax it is paralellizable by setting its ``parallel`` attribute to True. This usually causes strax to outsource computations to a pool of threads. Every chunk will result in a call to the thread pool. This has little overhead, though the performance gain is limited by the global interpreter lock. If the computation is in pure python, there is no benefit; however, numpy and numba code can benefit significantly (until the pure-python overhead around it becomes the limiting factor, at high numbers of cores). 13 | 14 | Loaders use multithreading by default, since their work is eminently parallelizable: they just load some data and decompress it (using low-level compressors that happily release the GIL). Savers that rechunk the data (e.g. to achieve more sysadmin-friendly filesizes) are not parallelizable. Savers that do not rechunk use multithreading just like loaders. 15 | 16 | 17 | Multiprocessing 18 | ---------------- 19 | 20 | Strax can also use multiprocessing for parallelization. This is useful to free pure-python computations from the shackles of the GIL. Low-level plugins deal with a massive data flow, so parallelizing theircomputations in separate processes is very inefficient due to data transfer overhead. Thread parallelization works fine (since the algorithms are implemented in numba) until you reach ~10 cores, when the GIL becomes binding due to pure-python overhead. 21 | 22 | You can set the ``parallel`` attribute to ``process``, to suggest strax should use a process pool instead of a thread pool. 
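For example, a minimal sketch might look as follows (the plugin and its data types are invented for illustration; only the ``parallel`` attribute, the ``allow_multiprocess`` context option and the ``max_workers`` argument come from the text, and a plugin providing ``records`` is assumed to be registered as well):

.. code-block:: python

    import strax

    class SomeHeavyPlugin(strax.Plugin):
        """Hypothetical plugin whose compute is worth running in a process pool."""

        parallel = "process"  # or True, to only allow thread-level parallelization
        depends_on = "records"
        provides = "some_heavy_output"
        dtype = strax.time_fields

        def compute(self, records):
            # Placeholder computation
            return dict(time=records["time"], endtime=strax.endtime(records))

    # Assumes a plugin providing 'records' is also registered in this context
    st = strax.Context(allow_multiprocess=True, register=[SomeHeavyPlugin])
    # Multiprocessing is only used when max_workers > 1:
    arr = st.get_array("some_run", "some_heavy_output", max_workers=4)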
This is often not a good idea: multiprocessing incurs overhead from (1) forking the strax process and (2) pickling and unpickling the results in the child and parent processes. Strax will still not use multiprocessing at all unless you: 23 | - Set the allow_multiprocess context option to True, 24 | - Set max_workers to a value higher than 1 in the get_xxx call. 25 | 26 | During multiprocessing, computations of chunks from ``parallel='process'`` plugins will be outsourced to a process pool. Additionally, to avoid data transfer overhead, strax attempts to gather as many savers, dependencies, and savers of dependencies of a ``parallel='process'`` plugin to "inline" them: their computations are set to happen immedately after the main plugin's computation in the same process. This is achieved behind the scenes by replacing the plugin with a container-like plugin called ParallelSourcePlugin. Only parallelizable plugins and savers that do not rechunk will be inlined. 27 | 28 | Since savers can become inlined, they should work even if they are forked. That implies they cannot keep state, and must store metadata for each chunk in their backend as it arrives. For example, the FileStore backend produces a json file with metadata for each chunk. When the saver is closed, all the json files are read in and concatenated. A saver that can't do this should set `allow_fork = False`. 29 | 30 | 31 | Multi-run parallelization: a different problem 32 | ------------------------------------------------ 33 | 34 | Paralellizing quick (re)processing of many runs is a different problem altogether. It is easier in one way: since runs are assumed to be independent, we can simply process each run on a single core, and use our multiple cores to process multiple runs. However, it is harder in another: the large volume of desired result data may exceed available RAM. We can use Dask dataframes for this. Probably we can just copy/reuse the code in hax. 35 | -------------------------------------------------------------------------------- /tests/test_cut_plugin.py: -------------------------------------------------------------------------------- 1 | from strax import testutils 2 | import strax 3 | import numpy as np 4 | from hypothesis import given, strategies, example, settings 5 | 6 | # Initialize. We test both dt time-fields and time time-field 7 | _dtype_name = "var" 8 | _cut_dtype = ("variable 0", _dtype_name) 9 | full_dt_dtype = [(_cut_dtype, np.float64)] + strax.time_dt_fields 10 | full_time_dtype = [(_cut_dtype, np.float64)] + strax.time_fields 11 | 12 | 13 | def get_some_array(disjoint=True): 14 | # Either 0 or 1 15 | take_dt = np.random.choice(2) 16 | 17 | # Stolen from testutils.bounds_to_intervals 18 | def bounds_to_intervals(bs, dt=1): 19 | x = np.zeros(len(bs), dtype=full_dt_dtype if take_dt else full_time_dtype) 20 | x["time"] = [x[0] for x in bs] 21 | # Remember: exclusive right bound... 
22 | if take_dt: 23 | x["length"] = [x[1] - x[0] for x in bs] 24 | x["dt"] = 1 25 | else: 26 | x["endtime"] = x["time"] + ([x[1] - x[0] for x in bs]) * dt 27 | return x 28 | 29 | # Randomly input either of full_dt_dtype or full_time_dtype 30 | sorted_intervals = testutils.sorted_bounds(disjoint=disjoint).map(bounds_to_intervals) 31 | return sorted_intervals 32 | 33 | 34 | @given( 35 | get_some_array().filter(lambda x: len(x) >= 0), strategies.integers(min_value=-10, max_value=10) 36 | ) 37 | @settings(deadline=None) 38 | # Examples for readability 39 | @example( 40 | input_peaks=np.array( 41 | [(-11, 0, 1), (0, 1, 3), (-5, 3, 5), (11, 5, 7), (7, 7, 9)], 42 | dtype=[(_cut_dtype, np.float64)] + strax.time_fields, 43 | ), 44 | cut_threshold=5, 45 | ) 46 | @example( 47 | input_peaks=np.array( 48 | [(0, 0, 1, 1), (1, 1, 1, 1), (5, 2, 2, 1), (11, 4, 2, 4)], 49 | dtype=[(_cut_dtype, np.int16)] + strax.time_dt_fields, 50 | ), 51 | cut_threshold=-1, 52 | ) 53 | def test_cut_plugin(input_peaks, cut_threshold): 54 | # Just one chunk will do 55 | chunks = [input_peaks] 56 | _dtype = input_peaks.dtype 57 | 58 | class ToBeCut(strax.Plugin): 59 | """Data to be cut with strax.CutPlugin.""" 60 | 61 | depends_on = tuple() 62 | dtype = _dtype 63 | provides = "to_be_cut" 64 | data_kind = "to_be_cut" # match with depends_on below 65 | 66 | def compute(self, chunk_i): 67 | data = chunks[chunk_i] 68 | return self.chunk( 69 | data=data, 70 | start=(int(data[0]["time"]) if len(data) else np.arange(len(chunks))[chunk_i]), 71 | end=( 72 | int(strax.endtime(data[-1])) 73 | if len(data) 74 | else np.arange(1, len(chunks) + 1)[chunk_i] 75 | ), 76 | ) 77 | 78 | # Hack to make peak output stop after a few chunks 79 | def is_ready(self, chunk_i): 80 | return chunk_i < len(chunks) 81 | 82 | def source_finished(self): 83 | return True 84 | 85 | class CutSomething(strax.CutPlugin): 86 | """Minimal working example of CutPlugin.""" 87 | 88 | depends_on = ("to_be_cut",) 89 | 90 | def cut_by(self, to_be_cut): 91 | return to_be_cut[_dtype_name] > cut_threshold 92 | 93 | st = strax.Context(storage=[]) 94 | st.register(ToBeCut) 95 | st.register(CutSomething) 96 | 97 | result = st.get_array(run_id="some_run", targets=strax.camel_to_snake(CutSomething.__name__)) 98 | correct_answer = np.sum(input_peaks[_dtype_name] > cut_threshold) 99 | assert len(result) == len(input_peaks), "WTF??" 100 | assert correct_answer == np.sum( 101 | result["cut_something"] 102 | ), "Cut plugin does not give boolean arrays correctly" 103 | 104 | if len(input_peaks): 105 | assert ( 106 | strax.endtime(input_peaks).max() == strax.endtime(result).max() 107 | ), "last end time got scrambled" 108 | assert np.all(input_peaks["time"] == result["time"]), "(start) times got scrambled" 109 | assert np.all( 110 | strax.endtime(input_peaks) == strax.endtime(result) 111 | ), "Some end times got scrambled" 112 | -------------------------------------------------------------------------------- /docs/source/advanced/chunking.rst: -------------------------------------------------------------------------------- 1 | Strax data model 2 | ================= 3 | 4 | Data type and kind 5 | ------------------- 6 | 7 | All data lives in *data types*, such as `raw_records` or `peak_basics`. Each of these has a fixed numpy datatype. 8 | 9 | If a single row of two data types refers to the same physical / logical thing, such as an event or a peak, we say those data types have the same `data_kind`. 10 | 11 | 12 | The Laws of Chunking 13 | --------------------- 14 | You shall obey them. 
15 | 16 | 1. Each data row corresponds to a time interval. Time and (endtime or (dt and length)) are mandatory fields in all datatypes. 17 | 2. Strax handles data in chunks. A chunk is also an interval (containing rows of data which are individually intervals). 18 | 3. Suppose you have a chunk of some datatype reaching from [t0, t1), then: 19 | 20 | a. It contains all and only data that starts >= t0 and ends <= t1; 21 | b. All data outside the chunk ends <= t0, or starts >= t1. (Remember intervals are half-open; the boundary cases are not ambiguous.) 22 | c. In particular, every data row lies completely in one chunk. No data whatsoever lies partially in more than one chunk. This means chunks cannot be split at arbitrary times. 23 | 24 | 4. Zero-duration data rows are not allowed. Zero-duration chunks are allowed, but they cannot contain data. 25 | 26 | 27 | Incoming data 28 | ------------- 29 | From the perspective of a plugin, all incoming data is time-synchronized and merged by kind. Specifically: 30 | 31 | * Data of the same kind is merged into a single array. If you depend on `events`, `peaks` and `peak_basics`, you will get two arrays: `events` and `peaks`. The second will be the merged array of `peaks` and `peak_basics`. 32 | * Data of different kinds are synchronized by time. Strax will fetch a chunk of the first kind (`events`), then fetch as much as needed from the second kind (`peaks`) until you have all peaks that end before or at exactly the same time as the last event. 33 | 34 | This example is a bit odd: when loading data of multiple kinds that are contained in each other, e.g. events and peaks, you very often want to use a `LoopPlugin` rather than a straight-up Plugin. 35 | 36 | Outgoing data 37 | ------------- 38 | Plugins can chunk their output as they wish, including withholding some data until the next chunk is sent out. Of course this requires keeping state, which means you cannot parallelize: see the chunk boundary handling section later in this documentation. 39 | 40 | Savers, too, are free to chunk their data as they like; for example, to create files of convenient sizes. This affects the chunks you get when loading or reprocessing data. If you don't want this, e.g. if the next plugin in line assumes a particular kind of chunking you want to preserve, set the attribute `rechunk_on_save = False`. 41 | 42 | In cases where rechunking is permitted, a plugin can also specify a desired minimum uncompressed chunk size via the `chunk_target_size` attribute, with 200 MB as the default value. Chunks are concatenated until this desired size is exceeded, or all chunks have been combined, whereupon the data is compressed and written to disk. 43 | 44 | 45 | Sorted output requirement 46 | -------------------------- 47 | Strax requires that all output is sorted by time inside chunks. 48 | 49 | Additionally, most or all plugins will assume that incoming data is time-ordered between chunks. That is, a subsequent chunk should not contain any data that starts before an item from a previous chunk ends. Strax data must either consist of disjoint things, or, if there are overlaps, chunk boundaries must fall in places where gaps exist. 50 | 51 | It would be much harder to code an algorithm if you did not know when you have seen all input before a certain time. Essentially you would have to wait until the end of the run before you could process any data, which goes against the idea of processing your data as a stream.
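To make the interval requirement from the chunking laws above concrete, here is a small sketch (the extra field name is made up) of the two allowed ways to specify the mandatory time information:

.. code-block:: python

    import numpy as np
    import strax

    # Datatype with an explicit endtime field:
    dtype_with_endtime = strax.time_fields + [(("Some quantity", "x"), np.float32)]

    # Datatype with dt and length instead of endtime:
    dtype_with_dt = strax.time_dt_fields + [(("Some quantity", "x"), np.float32)]

    rows = np.zeros(3, dtype=dtype_with_dt)
    rows["time"] = [0, 10, 20]   # ns since unix epoch
    rows["dt"] = 2               # ns per sample
    rows["length"] = 5           # samples, so each row spans 10 ns

    # strax.endtime works with either convention;
    # here it evaluates time + dt * length:
    print(strax.endtime(rows))   # expected: [10 20 30]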
52 | 53 | If your plugin removes or adds items from the original incoming array, it must output a different *data kind*. For example, during the initial data reduction steps, we remove items from 'raw_records' to make a new data kind 'records'. Here we change data kind, even though the fields in the output data type are identical to the fields in the input data type. 54 | -------------------------------------------------------------------------------- /docs/source/advanced/fuzzy_for.rst: -------------------------------------------------------------------------------- 1 | Fuzzy for functionality 2 | ======================= 3 | Since strax tracks lineages, updates to low level plugins may change the 4 | availability of high level data. When a low level plugin is changed (for example 5 | the version of a plugin is incremented), strax will recognize that the data corresponding 6 | to the plugin whereof the version is changed is not stored (since only the 7 | previous version is stored). This safeguards that the data that the user is loading 8 | is always consistent with the context. 9 | 10 | **This functionality can partially be disabled using fuzzy-for settings. This should 11 | only be done temporarily or for quick checks as strax is not anymore checking if 12 | the entire ancestry of the requested and the delivered data is consistent.** 13 | 14 | When to use 15 | ----------- 16 | There are situations where the above robustness of the context is not what the user 17 | wants. Such situations can be if a user is developing a new plugin on the master 18 | branch, when the master branch has some changes in the lower level plugins. 19 | The user in this case cannot easily check if the plugin works on data, as no data 20 | is available in the context of the master branch. In this case, the user might want 21 | to tell the context to just load whatever data is available, ignoring changes in 22 | a specific plugin. Another example would be if a dataset was simulated with specific 23 | instructions and a user wants to quickly look at the data in the simulated dataset 24 | without having to manually check which context was used for simulating this data 25 | (of course, the best way to solve this would be to open the metadata that is stored 26 | with the simulation files and construct the context from those options). 27 | 28 | How to use 29 | ---------- 30 | There are two ways of ignoring the lineage. Both are set in the context config 31 | (see context.context_config): 32 | - ``fuzzy_for_options`` a tuple of options to specify that each option with a 33 | name in the tuple can be ignored 34 | - ``fuzzy_for`` a tuple of data-types to ignore. 35 | 36 | In the example below, we will use setting the ``fuzzy_for`` option. We will use 37 | the online context from `straxen `_ to illustrate 38 | how the options are set. 39 | 40 | 41 | .. code-block:: python 42 | 43 | import straxen 44 | # Use a context that can load data from a datatype 'peak-basics' 45 | st = straxen.contexts.xenonnt_online() 46 | run_id, target = '022880', 'peak_basics' 47 | 48 | # Check if the data is stored for this run and datatype 49 | print(f'{run_id} {target} is stored: {st.is_stored(run_id, target)}') 50 | 51 | # Now let's mimic the situation wherein the version of the plugin that provides 52 | # peak basics has changed (it has a different version). 
We will do so by changing 53 | # the version of the plugin below 54 | PeakBasics = st._plugin_class_registry[target] 55 | PeakBasics.__version__ = 'does_not_exist' 56 | print(f'{run_id} {target} is stored: {st.is_stored(run_id, target)}') 57 | 58 | # The print statement will tell us the data is not stored. To load the data 59 | # from the default version of PeakBasics we will use the fuzzy-for option: 60 | st.context_config['fuzzy_for'] = (target,) 61 | print(f'{run_id} {target} is stored: {st.is_stored(run_id, target)}') 62 | 63 | The block above prints: 64 | 65 | .. code-block:: bash 66 | 67 | 022880 peak_basics is stored: True 68 | 022880 peak_basics is stored: False 69 | 022880 peak_basics is stored: True 70 | 71 | Is it advisable / safe to use? 72 | ------------------------------ 73 | For running production analyses, one should never base results on a context where 74 | fuzziness is enabled. 75 | 76 | For quick tests, it is safe to use. If new data is made based on a fuzzy context, 77 | it is not stored, to prevent the creation of data files with unreproducible 78 | results. 79 | 80 | Additionally (depending on the StorageFrontend), loading data with fuzzy options 81 | will generally be much slower. For example, the most commonly used StorageFrontend, 82 | the DataDirectory, scans all folders within its parent directory and filters the 83 | metadata in search of a folder with a lineage compatible with the fuzzy-for 84 | options. 85 | -------------------------------------------------------------------------------- /strax/scripts/rechunker.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import argparse 3 | 4 | import strax 5 | import pandas as pd 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser( 10 | description="Rechunker for FileSytemBackend. Interfaces with strax.rechunker. " 11 | "Please see the documentation of strax.rechunker for more information: " 12 | "https://github.com/AxFoundation/strax/blob/31c114c5f8329e53289d5127fb2125e71c3d6aae/strax/storage/files.py#L371" # noqa 13 | ) 14 | parser.add_argument( 15 | "--source", 16 | type=str, 17 | help="Target directory to rechunk, should be a folder in a " 18 | "strax.DataDirectory (one datatype)", 19 | ) 20 | parser.add_argument( 21 | "--dest", 22 | "--destination", 23 | default=None, 24 | dest="dest", 25 | type=str, 26 | help="Where to store rechunked data. If nothing is specified, replace the source.", 27 | ) 28 | parser.add_argument( 29 | "--compressor", 30 | choices=list(strax.io.COMPRESSORS.keys()), 31 | help="Recompress using one of these compressors. 
If nothing specified, " 32 | "use the same compressor as the source", 33 | ) 34 | parser.add_argument( 35 | "--rechunk", default=True, choices=[True, False], type=bool, help="rechunk the data" 36 | ) 37 | parser.add_argument( 38 | "--target_size_mb", 39 | "--target-size-mb", 40 | dest="target_size_mb", 41 | type=int, 42 | default=strax.DEFAULT_CHUNK_SIZE_MB, 43 | help="Target size MB (uncompressed) of the rechunked data", 44 | ) 45 | parser.add_argument( 46 | "--write_stats_to", 47 | "--write-stats-to", 48 | dest="write_stats_to", 49 | type=str, 50 | default=None, 51 | help="Write some information to this file (csv format)", 52 | ) 53 | parser.add_argument( 54 | "--parallel", 55 | type=str, 56 | default="False", 57 | choices=["False", "True", "thread", "process"], 58 | help="Parallelize using threadpool or processpool", 59 | ) 60 | parser.add_argument( 61 | "--max_workers", type=int, default=4, help="Max workers if parallel is specified" 62 | ) 63 | parser.add_argument("--profile_memory", action="store_true", help="Profile memory usage") 64 | args = parser.parse_args() 65 | return args 66 | 67 | 68 | def main(): 69 | args = parse_args() 70 | if args.profile_memory: 71 | from memory_profiler import memory_usage 72 | import time 73 | 74 | start = time.time() 75 | mem = memory_usage(proc=(rechunk, (args,))) 76 | print(f"Memory profiler says peak RAM usage was: {max(mem):.1f} MB") 77 | print(f"Took {time.time() - start:.1f} s = {(time.time() - start) / 3600:.2f} h ") 78 | print("Bye, bye") 79 | else: 80 | rechunk(args) 81 | 82 | 83 | def rechunk(args): 84 | source_mb = strax.utils.dir_size_mb(args.source) 85 | report = strax.rechunker( 86 | source_directory=args.source, 87 | dest_directory=args.dest, 88 | replace=args.dest is None, 89 | compressor=args.compressor, 90 | target_size_mb=args.target_size_mb, 91 | rechunk=args.rechunk, 92 | parallel={"False": False, "True": True}.get(args.parallel, args.parallel), 93 | max_workers=args.max_workers, 94 | ) 95 | if args.dest is not None: 96 | recompressed_mb = strax.utils.dir_size_mb(report.get("dest_directory", args.dest)) 97 | else: 98 | recompressed_mb = strax.utils.dir_size_mb(args.source) 99 | report.update(dict(source_mb=source_mb, dest_mb=recompressed_mb)) 100 | if args.write_stats_to: 101 | if os.path.exists(args.write_stats_to): 102 | df = pd.read_csv(args.write_stats_to) 103 | else: 104 | df = pd.DataFrame() 105 | df_new = pd.concat([df, pd.DataFrame({k: [v] for k, v in report.items()})]) 106 | df_new.to_csv(args.write_stats_to, index=False) 107 | 108 | print(f"Re-compressed {args.source}") 109 | for k, v in report.items(): 110 | print(f"\t{k:16}\t{v}") 111 | 112 | 113 | if __name__ == "__main__": 114 | main() 115 | -------------------------------------------------------------------------------- /strax/processors/single_thread.py: -------------------------------------------------------------------------------- 1 | import typing as ty 2 | 3 | from .base import BaseProcessor, ProcessorComponents 4 | from .post_office import PostOffice, Spy 5 | 6 | 7 | import strax 8 | 9 | export, __all__ = strax.exporter() 10 | 11 | 12 | @export 13 | class SingleThreadProcessor(BaseProcessor): 14 | def __init__( 15 | self, components: ProcessorComponents, allow_rechunk=True, is_superrun=False, **kwargs 16 | ): 17 | super().__init__(components, allow_rechunk=allow_rechunk, is_superrun=is_superrun, **kwargs) 18 | 19 | self.log.debug("Processor components are: " + str(components)) 20 | 21 | # Do not use executors: work in one thread in one process 22 | 
self.process_executor = self.thread_executor = None 23 | 24 | self.post_office = PostOffice() 25 | 26 | for d, loader in components.loaders.items(): 27 | assert d not in components.plugins 28 | self.post_office.register_producer(loader(executor=self.thread_executor), topic=d) 29 | 30 | plugins_seen: ty.List[strax.Plugin] = [] 31 | for d, p in components.plugins.items(): 32 | # Multi-output plugins are listed multiple times in components.plugins; 33 | # ensure we only process each plugin once. 34 | if p in plugins_seen: 35 | continue 36 | plugins_seen.append(p) 37 | 38 | # Some data_types might be already saved and can be loaded; 39 | # remove them from the list of provides 40 | self.post_office.register_producer( 41 | p.iter(iters={dep: self.post_office.get_iter(dep, d) for dep in p.depends_on}), 42 | topic=strax.to_str_tuple(p.provides), 43 | registered=tuple(components.loaders), 44 | ) 45 | 46 | dtypes_built = {d: p for p in components.plugins.values() for d in p.provides} 47 | for d, savers in components.savers.items(): 48 | for saver in savers: 49 | if d in dtypes_built: 50 | rechunk = dtypes_built[d].can_rechunk(d) and allow_rechunk 51 | else: 52 | rechunk = is_superrun and allow_rechunk 53 | 54 | self.post_office.register_spy(SaverSpy(saver, rechunk=rechunk), topic=d) 55 | 56 | def iter(self): 57 | target = self.components.targets[0] 58 | final_generator = self.post_office.get_iter(topic=target, reader="FINAL") 59 | 60 | self.log.debug(f"Yielding {target}") 61 | 62 | try: 63 | yield from final_generator 64 | 65 | except Exception: 66 | # Exception in one of the producers. Close savers (they will record 67 | # the exception from sys.exc_info()) then reraise. 68 | self.log.fatal(f"Exception during processing, closing savers and reraising") 69 | self.post_office.kill_spies() 70 | raise 71 | 72 | except GeneratorExit: 73 | self.log.fatal( 74 | "Exception in code that called the processor: detected " 75 | "GeneratorExit from python shutting down. " 76 | "Closing savers and exiting." 77 | ) 78 | # Strax savers look at sys.exc_info(). Having only "GeneratorExit" 79 | # there is unhelpful.. 
this should set it to something better: 80 | try: 81 | raise RuntimeError("Exception in caller, see log for details") 82 | except RuntimeError: 83 | self.post_office.kill_spies() 84 | 85 | self.log.debug("Processing finished") 86 | 87 | 88 | class SaverSpy(Spy): 89 | """Spy that saves messages to a saver.""" 90 | 91 | def __init__(self, saver, rechunk=False): 92 | self.saver = saver 93 | self.rechunker = strax.Rechunker(rechunk, self.saver.md["run_id"]) 94 | self.chunk_number = 0 95 | 96 | def receive(self, chunk): 97 | self._save_chunk(self.rechunker.receive(chunk)) 98 | 99 | def _save_chunk(self, chunks): 100 | for chunk in chunks: 101 | if chunk is None: 102 | continue 103 | self.saver.save(chunk, self.chunk_number) 104 | self.chunk_number += 1 105 | 106 | def close(self): 107 | self._save_chunk(self.rechunker.flush()) 108 | self.saver.close() 109 | -------------------------------------------------------------------------------- /tests/test_sort.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import numpy as np 3 | import warnings 4 | from hypothesis import given, strategies 5 | from hypothesis.extra.numpy import arrays, integer_dtypes 6 | from strax.sort_enforcement import SortingError, stable_sort, stable_argsort 7 | 8 | 9 | class TestSortEnforcement(unittest.TestCase): 10 | @given(arrays(integer_dtypes(), strategies.integers(1, 100))) 11 | def test_explicit_stable_sort(self, arr): 12 | """Test explicit stable_sort function with generated arrays.""" 13 | with warnings.catch_warnings(): 14 | warnings.simplefilter("error") # Turn warnings into errors 15 | sorted_arr = stable_sort(arr) 16 | np.testing.assert_array_equal(sorted_arr, np.sort(arr, kind="mergesort")) 17 | # Verify the array is actually sorted 18 | self.assertTrue(np.all(sorted_arr[:-1] <= sorted_arr[1:])) 19 | 20 | @given(arrays(integer_dtypes(), strategies.integers(1, 100))) 21 | def test_explicit_stable_argsort(self, arr): 22 | """Test explicit stable_argsort function with generated arrays.""" 23 | with warnings.catch_warnings(): 24 | warnings.simplefilter("error") # Turn warnings into errors 25 | sorted_indices = stable_argsort(arr) 26 | np.testing.assert_array_equal(sorted_indices, np.argsort(arr, kind="mergesort")) 27 | # Verify the indices actually sort the array 28 | sorted_arr = arr[sorted_indices] 29 | self.assertTrue(np.all(sorted_arr[:-1] <= sorted_arr[1:])) 30 | 31 | @given( 32 | arrays(integer_dtypes(), strategies.integers(1, 100)), 33 | strategies.sampled_from(["quicksort", "heapsort"]), 34 | ) 35 | def test_wrapped_quicksort_rejection(self, arr, sort_kind): 36 | """Test that quicksort and heapsort raise errors in wrapped functions.""" 37 | with self.assertRaises(SortingError): 38 | stable_sort(arr, kind=sort_kind) 39 | with self.assertRaises(SortingError): 40 | stable_argsort(arr, kind=sort_kind) 41 | 42 | @given(arrays(integer_dtypes(), strategies.integers(1, 100))) 43 | def test_original_numpy_unaffected(self, arr): 44 | """Test that original numpy sort functions still work with quicksort.""" 45 | try: 46 | quicksort_result = np.sort(arr, kind="quicksort") 47 | self.assertTrue(np.all(quicksort_result[:-1] <= quicksort_result[1:])) 48 | 49 | quicksort_indices = np.argsort(arr, kind="quicksort") 50 | sorted_arr = arr[quicksort_indices] 51 | self.assertTrue(np.all(sorted_arr[:-1] <= sorted_arr[1:])) 52 | except Exception as e: 53 | self.fail(f"numpy sort with quicksort raised an unexpected exception: {e}") 54 | 55 | @given( 56 | strategies.lists( 57 | 
strategies.tuples( 58 | strategies.integers(1, 10), # num field 59 | strategies.text(min_size=1, max_size=1), # letter field 60 | ), 61 | min_size=1, 62 | max_size=100, 63 | ) 64 | ) 65 | def test_sort_stability(self, data): 66 | """Test that wrapped sorting is stable using generated structured arrays.""" 67 | # Convert list of tuples to structured array 68 | arr = np.array(data, dtype=[("num", int), ("letter", "U1")]) 69 | 70 | # First sort by letter to establish initial order 71 | arr_by_letter = stable_sort(arr, order="letter") 72 | # Then sort by number - if sort is stable, items with same number 73 | # should maintain their relative order from the letter sort 74 | final_sort = stable_sort(arr_by_letter, order="num") 75 | 76 | # Verify sorting works correctly 77 | for i in range(len(final_sort) - 1): 78 | # Check primary sort key (number) 79 | self.assertTrue( 80 | final_sort[i]["num"] <= final_sort[i + 1]["num"], 81 | f"Primary sort failed: {final_sort[i]} should come before {final_sort[i + 1]}", 82 | ) 83 | 84 | # If numbers are equal, check that letter order is preserved 85 | if final_sort[i]["num"] == final_sort[i + 1]["num"]: 86 | self.assertTrue( 87 | final_sort[i]["letter"] <= final_sort[i + 1]["letter"], 88 | f"Stability violated: for equal numbers {final_sort[i]['num']}, " 89 | f"letter {final_sort[i]['letter']} should come " 90 | f"before or equal to {final_sort[i + 1]['letter']}", 91 | ) 92 | 93 | 94 | if __name__ == "__main__": 95 | unittest.main(verbosity=2) 96 | -------------------------------------------------------------------------------- /tests/test_overlap_plugin.py: -------------------------------------------------------------------------------- 1 | from strax import testutils 2 | 3 | import numpy as np 4 | 5 | from hypothesis import given, strategies, example, settings 6 | 7 | import strax 8 | 9 | 10 | @given( 11 | testutils.disjoint_sorted_intervals.filter(lambda x: len(x) > 0), 12 | strategies.integers(min_value=0, max_value=3), 13 | ) 14 | @settings(deadline=None) 15 | # Examples that trigger issue #49 16 | @example( 17 | input_peaks=np.array([(0, 1, 1, 0), (1, 10, 1, 0), (11, 1, 1, 0)], dtype=strax.interval_dtype), 18 | split_i=2, 19 | ) 20 | @example( 21 | input_peaks=np.array( 22 | [(0, 1, 1, 0), (1, 1, 1, 0), (2, 9, 1, 0), (11, 1, 1, 0)], dtype=strax.interval_dtype 23 | ), 24 | split_i=3, 25 | ) 26 | # Other example that caused failures at some point 27 | @example( 28 | input_peaks=np.array([(0, 1, 1, 0), (7, 6, 1, 0), (13, 1, 1, 0)], dtype=strax.interval_dtype), 29 | split_i=2, 30 | ) 31 | def test_overlap_plugin(input_peaks, split_i): 32 | """Counting the number of nearby peaks should not depend on how peaks are chunked.""" 33 | chunks = np.split(input_peaks, [split_i]) 34 | chunks = [c for c in chunks if not len(c) == 0] 35 | 36 | class Peaks(strax.Plugin): 37 | depends_on = tuple() 38 | dtype = strax.interval_dtype 39 | 40 | def compute(self, chunk_i): 41 | data = chunks[chunk_i] 42 | return self.chunk( 43 | data=data, start=int(data[0]["time"]), end=int(strax.endtime(data[-1])) 44 | ) 45 | 46 | # Hack to make peak output stop after a few chunks 47 | def is_ready(self, chunk_i): 48 | return chunk_i < len(chunks) 49 | 50 | def source_finished(self): 51 | return True 52 | 53 | window = 10 54 | 55 | # Note we must apply this to endtime, not time, since 56 | # peaks straddling the overlap threshold are assigned to the NEXT window. 57 | # If we used time it would fail on examples with peaks larger than window. 
58 | # In real life, the window should simply be chosen large enough that this 59 | # is not an issue. 60 | def count_in_window(ts, w=window): 61 | # Terribly inefficient algorithm... 62 | result = np.zeros(len(ts), dtype=np.int16) 63 | for i, t in enumerate(ts): 64 | result[i] = ((ts < t + w) & (ts > t - w)).sum() 65 | return result 66 | 67 | class WithinWindow(strax.OverlapWindowPlugin): 68 | depends_on = ("peaks",) 69 | dtype = [("n_within_window", np.int16)] + strax.time_fields 70 | 71 | def get_window_size(self): 72 | return window 73 | 74 | def compute(self, peaks): 75 | return dict( 76 | n_within_window=count_in_window(strax.endtime(peaks)), 77 | time=peaks["time"][:1], 78 | endtime=strax.endtime(peaks)[-1:], 79 | ) 80 | 81 | class MultipleWithinWindow(WithinWindow): 82 | provides = ("within_window", "multiple_within_window") 83 | data_kind = dict( 84 | within_window="within_window", multiple_within_window="multiple_within_window" 85 | ) 86 | dtype = dict( 87 | within_window=[("n_within_window", np.int16)] + strax.time_fields, 88 | multiple_within_window=[("window_length", np.int16)] + strax.time_fields, 89 | ) 90 | 91 | def compute(self, peaks): 92 | within_window = dict( 93 | n_within_window=count_in_window(strax.endtime(peaks)), 94 | time=peaks["time"][:1], 95 | endtime=strax.endtime(peaks)[-1:], 96 | ) 97 | multiple_within_window = dict( 98 | window_length=peaks["length"], 99 | time=peaks["time"], 100 | endtime=strax.endtime(peaks), 101 | ) 102 | return dict( 103 | within_window=within_window, 104 | multiple_within_window=multiple_within_window, 105 | ) 106 | 107 | st = strax.Context(storage=[]) 108 | st.register(Peaks) 109 | for plugin in (WithinWindow, MultipleWithinWindow): 110 | st.register(plugin) 111 | 112 | result = st.get_array(run_id="some_run", targets="within_window") 113 | expected = count_in_window(strax.endtime(input_peaks)) 114 | 115 | assert len(expected) == len(input_peaks), "WTF??" 116 | assert isinstance(result, np.ndarray), "Did not get an array" 117 | assert len(result) == len(expected), "Result has wrong length" 118 | np.testing.assert_equal(result["n_within_window"], expected, "Counting went wrong") 119 | -------------------------------------------------------------------------------- /strax/processing/data_reduction.py: -------------------------------------------------------------------------------- 1 | """Functions to perform in-place pulse-level data reduction.""" 2 | 3 | import numpy as np 4 | import numba 5 | from enum import IntEnum 6 | 7 | import strax 8 | from strax.processing.pulse_processing import NO_RECORD_LINK, record_links 9 | 10 | export, __all__ = strax.exporter() 11 | 12 | 13 | @export 14 | class ReductionLevel(IntEnum): 15 | """Identifies what type of data reduction has been used on a record.""" 16 | 17 | # Record not modified 18 | NO_REDUCTION = 0 19 | # Samples near pulse start/end were removed 20 | BASELINE_CUT = 1 21 | # Samples far from a threshold excursion were removed 22 | HITS_ONLY = 2 23 | # The record has been replaced with a simpler waveform 24 | WAVEFORM_REPLACED = 3 25 | # The raw waveform has been deleted, only metadata survives 26 | METADATA_ONLY = 4 27 | 28 | 29 | @export 30 | @numba.njit(nogil=True, cache=True) 31 | def cut_baseline(records, n_before=48, n_after=30): 32 | """Replace first n_before and last n_after samples of pulses by 0.""" 33 | # records.data.shape[1] gives a numba error (file issue?) 
34 | if not len(records): 35 | return 36 | samples_per_record = len(records[0]["data"]) 37 | 38 | for d_i, d in enumerate(records): 39 | if d.record_i == 0: 40 | d.data[:n_before] = 0 41 | 42 | clear_from = d.pulse_length - n_after 43 | clear_from -= d.record_i.astype(np.int32) * samples_per_record 44 | clear_from = max(0, clear_from) 45 | if clear_from < samples_per_record: 46 | d.data[clear_from:] = 0 47 | d["reduction_level"] = ReductionLevel.BASELINE_CUT 48 | 49 | 50 | @export 51 | def cut_outside_hits(records, hits, left_extension=2, right_extension=15): 52 | """Return records with waveforms zeroed if not within left_extension or right_extension of hits. 53 | These extensions properly account for breaking of pulses into records. 54 | 55 | If you pass an incomplete (e.g. cut) set of records, we will not save data around hits found in 56 | the removed records, even if this stretches into records that you did pass. 57 | 58 | """ 59 | if not len(records): 60 | return records 61 | 62 | # Create a copy of records with blanked data 63 | # Even a simple records.copy() is mightily slow in numba, 64 | # and assignments to struct arrays seem troublesome. 65 | # The obvious solution: 66 | # new_recs = records.copy() 67 | # new_recs['data'] = 0 68 | # is quite slow. 69 | # Replacing the last = with *= gives a factor 2 speed boost. 70 | # But ~40% faster still is this: 71 | meta_fields = [x for x in records.dtype.names if x not in ["data", "reduction_level"]] 72 | 73 | new_recs = np.zeros(len(records), dtype=records.dtype) 74 | new_recs[meta_fields] = records[meta_fields] 75 | new_recs["reduction_level"] = ReductionLevel.HITS_ONLY 76 | 77 | _cut_outside_hits(records, hits, new_recs, left_extension, right_extension) 78 | 79 | return new_recs 80 | 81 | 82 | @numba.njit(nogil=True, cache=True) 83 | def _cut_outside_hits(records, hits, new_recs, left_extension=2, right_extension=15): 84 | if not len(records): 85 | return 86 | samples_per_record = len(records[0]["data"]) 87 | 88 | previous_record, next_record = record_links(records) 89 | 90 | for hit_i, h in enumerate(hits): 91 | rec_i = h["record_i"] 92 | r = records[rec_i] 93 | 94 | # Indices to keep, with 0 at the start of this record 95 | start_keep = h["left"] - left_extension 96 | end_keep = h["right"] + right_extension 97 | 98 | # Indices of samples to keep in this record 99 | (a, b), _ = strax.overlap_indices(0, r["length"], start_keep, end_keep - start_keep) 100 | new_recs[rec_i]["data"][a:b] = records[rec_i]["data"][a:b] 101 | 102 | # Keep samples in previous record, if there was one 103 | if start_keep < 0: 104 | prev_ri = previous_record[rec_i] 105 | if prev_ri != NO_RECORD_LINK: 106 | # Note start_keep is negative, so this keeps the 107 | # last few samples of the previous record 108 | a_prev = start_keep 109 | new_recs[prev_ri]["data"][a_prev:] = records[prev_ri]["data"][a_prev:] 110 | 111 | # Same for the next record, if there is one 112 | if end_keep > samples_per_record: 113 | next_ri = next_record[rec_i] 114 | if next_ri != NO_RECORD_LINK: 115 | b_next = end_keep - samples_per_record 116 | new_recs[next_ri]["data"][:b_next] = records[next_ri]["data"][:b_next] 117 | -------------------------------------------------------------------------------- /tests/test_fixed_plugin_cache.py: -------------------------------------------------------------------------------- 1 | from strax.testutils import Records, Peaks 2 | import strax 3 | import unittest 4 | import numpy as np 5 | 6 | 7 | class ChannelIsRunidRecords(Records): 8 | """Set the channel 
field equal to the run_id.""" 9 | 10 | def compute(self, chunk_i): 11 | res = super().compute(chunk_i) 12 | res.data["channel"][:] = int(self.run_id) 13 | return res 14 | 15 | 16 | class MaxChannelPeaks(Peaks): 17 | def infer_dtype(self): 18 | # We are going to check later that the infer_dtype is always called. 19 | dtype = strax.peak_dtype() + [(("PMT with median most records", "max_pmt"), np.int16)] 20 | self.dtype_is_set = True 21 | return dtype 22 | 23 | def compute(self, records): 24 | assert np.all(records["channel"] == int(self.run_id)) 25 | res = super().compute(records) 26 | res["max_pmt"] = records["channel"].mean() 27 | return res 28 | 29 | 30 | class TestContextFixedPluginCache(unittest.TestCase): 31 | """Test the _fixed_plugin_cache of a context.""" 32 | 33 | def test_load_runs(self, n_runs=3, config_update=None, **kwargs): 34 | """Try loading data for n_runs to make sure that we are.""" 35 | run_ids = [str(r) for r in range(n_runs)] 36 | st = self.get_context(use_per_run_defaults=False) 37 | if config_update is not None: 38 | st.set_context_config(config_update) 39 | data = st.get_array(run_ids, "records", **kwargs) 40 | run_id_channel_diff = data["run_id"].astype(np.int64) - data["channel"] 41 | assert np.all(run_id_channel_diff == 0) 42 | 43 | # To be sure also double check Peaks as self.deps of the Plugin 44 | # class should be correctly taken care of by the context. 45 | peaks_data = st.get_array(run_ids, "peaks") 46 | run_id_max_pmt_diff = peaks_data["max_pmt"] - peaks_data["run_id"].astype(np.int64) 47 | assert np.all(run_id_max_pmt_diff == 0) 48 | 49 | def test_get_plugin(self, n_runs=3): 50 | run_ids = [str(r) for r in range(n_runs)] 51 | st = self.get_context(use_per_run_defaults=False) 52 | plugins_seen = [] 53 | for run in run_ids: 54 | p = st.get_single_plugin(run, "records") 55 | plugins_seen.append(p) 56 | assert p.run_id == run 57 | 58 | # If we passed around a reference instead of a copy of the 59 | # plugin, this would be a problem. 60 | for r_i, run in enumerate(run_ids): 61 | assert plugins_seen[r_i].run_id == run 62 | 63 | def test_load_runs_multicore(self): 64 | """Load the runs. 
65 | 66 | If the references are mixed up the results are inconsistent 67 | 68 | """ 69 | multicore_config = dict( 70 | allow_lazy=False, 71 | timeout=60, 72 | allow_multiprocess=True, 73 | ) 74 | self.test_load_runs(n_runs=10, config_update=multicore_config, max_workers=10) 75 | 76 | def test_cache_changes(self): 77 | """ 78 | Test that the _fixed_plugin_cache changes if we: 79 | - Change the config 80 | - Change the version of a plugin 81 | 82 | """ 83 | st = self.get_context(use_per_run_defaults=False) 84 | 85 | # Compute the key/hash under which we will store the plugins 86 | first_key = st._context_hash() 87 | assert first_key is not None 88 | 89 | # Change the config triggers a new key 90 | st.set_config({"bla": 1}) 91 | second_key = st._context_hash() 92 | 93 | # Change the version of a plugin triggers a new key 94 | st._plugin_class_registry["records"].__version__ = -1 95 | third_key = st._context_hash() 96 | 97 | assert first_key != second_key != third_key 98 | 99 | def test_set_dtype(self): 100 | st = self.get_context(use_per_run_defaults=False) 101 | 102 | # Compute the key/hash under which we will store the plugins 103 | st.key_for("0", "peaks") 104 | assert st._fixed_plugin_cache[st._context_hash()]["peaks"].dtype_is_set 105 | 106 | # Now recreate for a new run 107 | st.key_for("1", "peaks") 108 | assert st._fixed_plugin_cache[st._context_hash()]["peaks"].dtype_is_set 109 | 110 | @staticmethod 111 | def get_context(use_per_run_defaults: bool): 112 | """Get simple context.""" 113 | st = strax.Context( 114 | storage=[], register=(ChannelIsRunidRecords, MaxChannelPeaks), config=dict(bonus_area=1) 115 | ) 116 | st.set_context_config({"use_per_run_defaults": use_per_run_defaults}) 117 | return st 118 | -------------------------------------------------------------------------------- /strax/storage/zipfiles.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import os.path as osp 4 | import shutil 5 | import zipfile 6 | 7 | import strax 8 | from .files import RUN_METADATA_PATTERN 9 | 10 | export, __all__ = strax.exporter() 11 | 12 | 13 | @export 14 | class ZipDirectory(strax.StorageFrontend): 15 | """ZipFile-based storage frontend for strax. 16 | 17 | All data for one run is assumed to be in a single zip file .zip, with the same 18 | file/directory structure as created by FileStore. 19 | 20 | We cannot write zip files directly (this would result in concurrency hell), instead these zip 21 | files are made by zipping stuff from FileSytemBackend. 
22 | 23 | """ 24 | 25 | storage_typ = strax.StorageType.COMPRESSED 26 | 27 | def __init__(self, path=".", *args, readonly=True, **kwargs): 28 | if not readonly: 29 | raise NotImplementedError("Zipfiles are currently read-only") 30 | super().__init__(*args, readonly=readonly, **kwargs) 31 | self.backends = [ZipFileBackend()] 32 | self.path = path 33 | if not osp.exists(path): 34 | os.makedirs(path) 35 | 36 | def _find(self, key, write, allow_incomplete, fuzzy_for, fuzzy_for_options): 37 | assert not write 38 | 39 | # Check exact match / write case 40 | bk = self._backend_key(key) 41 | with zipfile.ZipFile(self._zipname(key)) as zp: 42 | try: 43 | dirname = str(key) 44 | prefix = strax.dirname_to_prefix(dirname) 45 | zp.getinfo(f"{dirname}/{RUN_METADATA_PATTERN % prefix}") 46 | return bk 47 | except KeyError: 48 | pass 49 | 50 | if not len(fuzzy_for) and not len(fuzzy_for_options): 51 | raise strax.DataNotAvailable 52 | 53 | raise NotImplementedError("Fuzzy matching within zipfiles not yet implemented") 54 | 55 | def run_metadata(self, run_id): 56 | with zipfile.ZipFile(self._zipname(run_id)) as zp: 57 | try: 58 | with zp.open(RUN_METADATA_PATTERN % run_id) as f: 59 | return json.loads(f.read()) 60 | except KeyError: 61 | raise strax.RunMetadataNotAvailable 62 | 63 | def write_run_metadata(self, run_id, metadata): 64 | raise NotImplementedError("Zipfiles cannot write") 65 | 66 | def remove(self, key): 67 | raise NotImplementedError("Zipfiles cannot write") 68 | 69 | def _set_write_complete(self, key): 70 | raise NotImplementedError("Zipfiles cannot write") 71 | 72 | def _backend_key(self, key): 73 | return (self.backends[0].__class__.__name__, (self._zipname(key), str(key))) 74 | 75 | def _zipname(self, key): 76 | zipname = osp.join(self.path, key.run_id + ".zip") 77 | # Since we're never writing, this check can be here 78 | # is this a bad idea? 79 | if not osp.exists(zipname): 80 | raise strax.DataNotAvailable 81 | return zipname 82 | 83 | @staticmethod 84 | def zip_dir(input_dir, output_zipfile, delete=False): 85 | """Zips subdirectories of input_dir to output_zipfile (without compression). 86 | 87 | Travels into subdirectories, but not sub-subdirectories. Skips any other files in directory. 
88 | :param delete: If True, delete original directories 89 | 90 | """ 91 | with zipfile.ZipFile(output_zipfile, mode="w") as zp: 92 | for dirn in os.listdir(input_dir): 93 | full_dirn = os.path.join(input_dir, dirn) 94 | if not osp.isdir(full_dirn): 95 | continue 96 | for fn in os.listdir(full_dirn): 97 | zp.write(os.path.join(full_dirn, fn), arcname=os.path.join(dirn, fn)) 98 | if delete: 99 | shutil.rmtree(full_dirn) 100 | 101 | 102 | @export 103 | class ZipFileBackend(strax.StorageBackend): 104 | def _read_chunk(self, zipn_and_dirn, chunk_info, dtype, compressor): 105 | zipn, dirn = zipn_and_dirn 106 | with zipfile.ZipFile(zipn) as zp: 107 | with zp.open(dirn + "/" + chunk_info["filename"]) as f: 108 | return strax.load_file(f, dtype=dtype, compressor=compressor) 109 | 110 | def _get_metadata(self, zipn_and_dirn): 111 | zipn, dirn = zipn_and_dirn 112 | with zipfile.ZipFile(zipn) as zp: 113 | prefix = strax.dirname_to_prefix(dirn) 114 | with zp.open(f"{dirn}/{RUN_METADATA_PATTERN % prefix}") as f: 115 | return json.loads(f.read()) 116 | 117 | def saver(self, *args, **kwargs): 118 | raise NotImplementedError("Zipfiles cannot write") 119 | -------------------------------------------------------------------------------- /strax/plugins/cut_plugin.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | import numpy as np 3 | import strax 4 | from .plugin import Plugin, SaveWhen 5 | from .merge_only_plugin import MergeOnlyPlugin 6 | 7 | export, __all__ = strax.exporter() 8 | 9 | 10 | @export 11 | class CutPlugin(Plugin): 12 | """Generate a plugin that provides a boolean for a given cut specified by 'cut_by'.""" 13 | 14 | save_when = SaveWhen.TARGET 15 | 16 | def __init__(self): 17 | super().__init__() 18 | 19 | compute_pars = list(inspect.signature(self.cut_by).parameters.keys()) 20 | if "chunk_i" in compute_pars: 21 | self.compute_takes_chunk_i = True 22 | del compute_pars[compute_pars.index("chunk_i")] 23 | if "start" in compute_pars: 24 | if "end" not in compute_pars: 25 | raise ValueError(f"Compute of {self} takes start, so it should also take end.") 26 | self.compute_takes_start_end = True 27 | del compute_pars[compute_pars.index("start")] 28 | del compute_pars[compute_pars.index("end")] 29 | self.compute_pars = compute_pars 30 | 31 | _name = strax.camel_to_snake(self.__class__.__name__) 32 | if not hasattr(self, "provides"): 33 | self.provides = _name 34 | if not hasattr(self, "cut_name"): 35 | self.cut_name = _name 36 | if not hasattr(self, "cut_description"): 37 | _description = _name 38 | if "cut_" not in _description: 39 | _description = "Cut by " + _description 40 | else: 41 | _description = " ".join(_description.split("_")) 42 | self.cut_description = _description 43 | 44 | def infer_dtype(self): 45 | dtype = [(self.cut_name, bool, self.cut_description)] 46 | # Alternatively one could use time_dt_fields for low level plugins. 
47 | dtype = strax.time_fields + dtype 48 | return dtype 49 | 50 | def compute(self, **kwargs): 51 | if hasattr(self, "cut_by"): 52 | cut_by = self.cut_by 53 | else: 54 | raise NotImplementedError(f"{self.cut_name} does not have attribute 'cut_by'") 55 | 56 | # Take shape of the first data_type like in strax.plugin 57 | buff = list(kwargs.values())[0] 58 | 59 | # Generate result buffer 60 | r = np.zeros(len(buff), self.dtype) 61 | r["time"] = buff["time"] 62 | r["endtime"] = strax.endtime(buff) 63 | r[self.cut_name] = cut_by(**kwargs) 64 | return r 65 | 66 | def cut_by(self, **kwargs): 67 | # This should be provided by the user making a CutPlugin 68 | raise NotImplementedError() 69 | 70 | 71 | @export 72 | class CutList(MergeOnlyPlugin): 73 | """Base class that merges all existing cuts into a single array which can be loaded by the 74 | analysts.""" 75 | 76 | __version__ = "0.0.0" 77 | 78 | save_when = SaveWhen.TARGET 79 | cuts = () 80 | # need to declare depends_on here to satisfy strax 81 | # https://github.com/AxFoundation/strax/blob/df18c9cef38ea1cee9737d56b1bea078ebb246a9/strax/plugin.py#L99 82 | depends_on = () 83 | _depends_on = () 84 | 85 | def infer_dtype(self): 86 | dtype = super().infer_dtype() 87 | dtype += [ 88 | ( 89 | ( 90 | f"Boolean AND of all cuts in {self.accumulated_cuts_string}", 91 | self.accumulated_cuts_string, 92 | ), 93 | bool, 94 | ) 95 | ] 96 | return dtype 97 | 98 | def compute(self, **kwargs): 99 | cuts = super().compute(**kwargs) 100 | cuts_joint = np.zeros(len(cuts), self.dtype) 101 | strax.copy_to_buffer( 102 | cuts, cuts_joint, f"_copy_cuts_{strax.deterministic_hash(self.depends_on)}" 103 | ) 104 | cuts_joint[self.accumulated_cuts_string] = get_accumulated_bool(cuts) 105 | return cuts_joint 106 | 107 | @property # type: ignore 108 | def depends_on(self): # noqa 109 | if not len(self._depends_on): 110 | deps = [] 111 | for c in self.cuts: 112 | deps.extend(strax.to_str_tuple(c.provides)) 113 | self._depends_on = tuple(deps) 114 | return self._depends_on 115 | 116 | @depends_on.setter 117 | def depends_on(self, str_or_tuple): 118 | self._depends_on = strax.to_str_tuple(str_or_tuple) 119 | 120 | 121 | @export 122 | def get_accumulated_bool(array): 123 | """Computes accumulated boolean over all cuts. 124 | 125 | :param array: Array containing merged cuts. 
126 | 127 | """ 128 | fields = array.dtype.names 129 | fields = np.array([f for f in fields if f not in ("time", "endtime")]) 130 | 131 | res = np.ones(len(array), bool) 132 | for field in fields: 133 | res &= array[field] 134 | return res 135 | -------------------------------------------------------------------------------- /tests/test_mailbox.py: -------------------------------------------------------------------------------- 1 | import concurrent.futures 2 | import threading 3 | import time 4 | 5 | import numpy as np 6 | import pytest 7 | 8 | import strax 9 | 10 | SHORT_TIMEOUT = 0.1 11 | LONG_TIMEOUT = 5 * SHORT_TIMEOUT 12 | 13 | 14 | def reader(source, reader_sleeps=0, name=""): 15 | result = [] 16 | for x in source: 17 | print(f"Reader {name} got {x}, sleeping for {reader_sleeps}") 18 | time.sleep(reader_sleeps) 19 | print(f"Reader {name} awoke") 20 | result.append(x) 21 | return result 22 | 23 | 24 | def mailbox_tester( 25 | messages, 26 | numbers=None, 27 | lazy=False, 28 | reader_sleeps=0.0, 29 | max_messages=100, 30 | expected_result=None, 31 | timeout=SHORT_TIMEOUT, 32 | result_timeout=LONG_TIMEOUT, 33 | ): 34 | if numbers is None: 35 | numbers = np.arange(len(messages)) 36 | if expected_result is None: 37 | messages = np.asarray(messages) 38 | expected_result = messages[strax.stable_argsort(numbers)] 39 | 40 | mb = strax.Mailbox(max_messages=max_messages, timeout=timeout, lazy=lazy) 41 | 42 | n_readers = 2 43 | 44 | with concurrent.futures.ThreadPoolExecutor() as tp: 45 | futures = [ 46 | tp.submit(reader, source=mb.subscribe(), reader_sleeps=reader_sleeps) 47 | for _ in range(n_readers) 48 | ] 49 | 50 | for i, _ in enumerate(messages): 51 | mb.send(messages[i], msg_number=numbers[i]) 52 | print(f"Sent message {i}. Now {len(mb._mailbox)} ms in mailbox.") 53 | 54 | mb.close() 55 | 56 | # Results must be equal 57 | for f in futures: 58 | np.testing.assert_equal(f.result(timeout=result_timeout), expected_result) 59 | 60 | 61 | def test_highlevel(): 62 | """Test highlevel mailbox API.""" 63 | for lazy in [False, True]: 64 | n_threads_start = len(threading.enumerate()) 65 | print(f"Lazy mode: {lazy}") 66 | 67 | mb = strax.Mailbox(lazy=lazy) 68 | mb.add_sender(iter(list(range(10)))) 69 | 70 | def test_reader(source): 71 | test_reader.got = r = [] 72 | for s in source: 73 | r.append(s) 74 | 75 | mb.add_reader(test_reader) 76 | mb.start() 77 | time.sleep(SHORT_TIMEOUT) 78 | assert hasattr(test_reader, "got") 79 | assert test_reader.got == list(range(10)) 80 | mb.cleanup() 81 | threads = [f"{t.name} is dead: {True ^ t.is_alive()}" for t in threading.enumerate()] 82 | assert ( 83 | len(threads) == n_threads_start 84 | ), f"Not all threads died. \n Threads running are:{threads}" 85 | 86 | 87 | def test_result_timeout(): 88 | """Test that our mailbox tester actually times out. 
89 | 90 | (if not, the other tests might hang indefinitely if something is broken) 91 | 92 | """ 93 | with pytest.raises(concurrent.futures.TimeoutError): 94 | mailbox_tester([0, 1], numbers=[1, 2], timeout=2 * LONG_TIMEOUT) 95 | 96 | 97 | def test_read_timeout(): 98 | """Subscribers time out if we cannot read for too long.""" 99 | with pytest.raises(strax.MailboxReadTimeout): 100 | mailbox_tester([0, 1], numbers=[1, 2]) 101 | 102 | 103 | def test_write_timeout(): 104 | """Writers time out if we cannot write for too long.""" 105 | with pytest.raises(strax.MailboxFullTimeout): 106 | mailbox_tester([0, 1, 2, 3, 4], max_messages=1, reader_sleeps=LONG_TIMEOUT) 107 | 108 | 109 | def test_reversed(): 110 | """Mailbox sorts messages properly.""" 111 | mailbox_tester(np.arange(10), numbers=np.arange(10)[::-1]) 112 | 113 | 114 | def test_deadlock_regression(): 115 | """A reader thread may start after the first message is processed.""" 116 | # Test cannot run in lazy mode, cannot send without active subscriber 117 | mb = strax.Mailbox(timeout=SHORT_TIMEOUT) 118 | mb.send(0) 119 | 120 | readers = [ 121 | threading.Thread(target=reader, kwargs=dict(source=mb.subscribe(), name=str(i))) 122 | for i in range(2) 123 | ] 124 | readers[0].start() 125 | time.sleep(SHORT_TIMEOUT) 126 | 127 | readers[1].start() 128 | mb.send(1) 129 | mb.close() 130 | 131 | for t in readers: 132 | t.join(SHORT_TIMEOUT) 133 | assert not t.is_alive() 134 | 135 | 136 | def test_close_protection(): 137 | """Cannot send messages to a closed mailbox.""" 138 | mb = strax.Mailbox() 139 | mb.close() 140 | with pytest.raises(strax.MailBoxAlreadyClosed): 141 | mb.send(0) 142 | 143 | 144 | def test_valid_msg_number(): 145 | """Message numbers are non-negative integers.""" 146 | mb = strax.Mailbox() 147 | with pytest.raises(strax.InvalidMessageNumber): 148 | mb.send(0, msg_number=-1) 149 | with pytest.raises(strax.InvalidMessageNumber): 150 | mb.send(0, msg_number="???") 151 | 152 | 153 | # Task for in the next test, must be global since we're using ProcessPool 154 | # (which must pickle) 155 | def _task(i): 156 | time.sleep(SHORT_TIMEOUT) 157 | return i 158 | 159 | 160 | def test_futures(): 161 | """Mailbox awaits futures before passing them to readers.""" 162 | # Timeouts are longer for this example, 163 | # since they involve creating subprocesses. 164 | exc = concurrent.futures.ProcessPoolExecutor() 165 | futures = [exc.submit(_task, i) for i in range(3)] 166 | mailbox_tester( 167 | futures, 168 | expected_result=[0, 1, 2], 169 | result_timeout=5 * LONG_TIMEOUT, 170 | timeout=5 * LONG_TIMEOUT, 171 | ) 172 | -------------------------------------------------------------------------------- /docs/source/advanced/superrun.rst: -------------------------------------------------------------------------------- 1 | Superruns 2 | ========= 3 | 4 | Overview and motivation 5 | ------------------------ 6 | A superrun is a run defined by (parts of) other runs, which are called 'subruns'. 7 | Superrun names start with an underscore. Regular run names cannot start with an underscore. 8 | 9 | Strax builds data for a superrun by loading (and potentially building) each of the subruns, then 10 | slicing and concatenating them as necessary. In addition superruns can be stored to disk as a 11 | rechunked representation of its subruns. This currently only works for static lineages e.g. without 12 | default-by-run_id settings. Stored superruns have the advantage that loading data is much faster 13 | and different data_types of the same kind can be combined. 
14 | 15 | Superruns are useful to track common groupings of data. For example: 16 | 17 | * 'Minimum bias' runs, consisting only of low-energy events, events passing some cuts, DM-candidates, PMT flashes, or other things of interest. The low-level data of these is much smaller than that of all the full runs, and can be brought to a local analysis facility, enabling on-site low-level waveform watching. 18 | * Grouping similar runs. For example, shifters might group good runs from a week of calibration data with some source under a single name, e.g. ``_kr_feb2019``. 19 | 20 | 21 | Superruns can be built from other superruns. Thus, _sr1_v0.2 could be built from 22 | _background_january, _background_february, etc. 23 | 24 | Defining superruns and making data: 25 | ----------------------------------- 26 | Use the `define_run` context method to define a new superrun. Currently, superruns can only be 27 | defined from a list of run_ids: 28 | 29 | 30 | .. code-block:: python 31 | 32 | st.define_run('_awesome_superrun', ['123', '124']) 33 | 34 | 35 | From a dictionary of time range tuples. The times must be 64-bit integer UTC timestamps since the unix epoch: 36 | 37 | .. code-block:: python 38 | 39 | st.define_run('_awesome_superrun', { 40 | '123': [(start, stop), (start, stop), ...], 41 | '124': [(start, stop), (start, stop), ...],}) 42 | 43 | From a dataframe (or record array) with strax data: 44 | 45 | 46 | .. code-block:: python 47 | 48 | st.define_run('_awesome_superrun', events_df) 49 | st.define_run('_awesome_superrun', events_df, from_run='123') 50 | 51 | In this case, the run will be made of the time ranges that correspond exactly to ``events_df``. If ``events_df`` already has a ``run_id`` field (e.g. because it consists of data from multiple runs), you do not need to pass `from_run`; it will be read off from the data. 52 | 53 | It is up to the storage frontend to process your request for defining a run. As a normal user, you 54 | generally only have permissions to create a new run in the `DataDirectory` (local files) storage 55 | frontend, where runs are recorded in json files. 56 | 57 | Making superrun data is as easy as creating any other data. Once a superrun is defined, we can make, 58 | for example, event_info via: 59 | 60 | 61 | .. code-block:: python 62 | 63 | st.make('_awesome_superrun', 'event_info') 64 | 65 | For bookkeeping, each stored superrun chunk contains information about its constituents in a field 66 | called subruns, e.g.: 67 | 68 | 69 | .. code-block:: python 70 | 71 | {'0': {'end': 10, 'start': 0}, 72 | '1': {'end': 30, 'start': 20}, 73 | '2': {'end': 50, 'start': 40}} 74 | 75 | Here the keys represent the subrun_ids, and start/end are the start and end of the corresponding 76 | first/last chunk included in the superrun chunk. The same information can also be found in the 77 | metadata of the individual chunks: 78 | 79 | .. code-block:: python 80 | 81 | {'chunk_i': 0, 82 | 'end': 50, 83 | 'filename': 'records-j3nd2fjbiq-000000', 84 | 'filesize': 2343, 85 | 'first_endtime': 1, 86 | 'first_time': 0, 87 | 'last_endtime': 50, 88 | 'last_time': 49, 89 | 'n': 300, 90 | 'nbytes': 77100, 91 | 'run_id': '_superrun_test', 92 | 'start': 0, 93 | 'subruns': {'0': {'end': 10, 'start': 0}, 94 | '1': {'end': 30, 'start': 20}, 95 | '2': {'end': 50, 'start': 40}}} 96 | 97 | After creating the data, we can load the superrun as we are used to and combine it with other data_types 98 | of the same kind too.
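For example (a hedged sketch: ``peak_basics`` and ``peak_positions`` are assumed, straxen-style data_type names of the same kind; they are not defined by strax itself):

.. code-block:: python

    # A superrun is loaded like any other run
    df = st.get_df('_awesome_superrun', 'event_info')

    # Several data_types of the same kind can be requested together;
    # strax merges them into a single array
    peaks = st.get_array('_awesome_superrun', ('peak_basics', 'peak_positions'))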
99 | 100 | To work more easily with superruns all chunks have also the properties `chunk.is_superun` as well as 101 | `chunk.first_subrun` and `chunk.last_subrun`. 102 | 103 | If you wish to make/store a superrun you have to specify the context option: 104 | 105 | 106 | .. code-block:: python 107 | 108 | st.set_context_config({'write_superruns': True}) 109 | 110 | 111 | Superruns follow the same saving rules (SaveWhen.TARGET, SaveWhen.EXPLICIT or SaveWhen.ALWAYS) as regular runs. 112 | 113 | How superruns work 114 | -------------------- 115 | 116 | As mentioned above, strax builds data for superruns by slicing data of the subruns. Thus, peaks 117 | from a superrun come from the peaks of the subruns, which are built from their own records as usual. 118 | 119 | Defaults for settings can be runid-dependent in strax, although this is not preferred any longer. 120 | If an option specifies ``default_per_run=[(run, setting), (run2, setting2)]``, then runs in between 121 | run and run2 will use setting, and runs after run2 ``setting2``. Superruns store a deterministic hash 122 | of this ``default_per_run`` specification for tracking purposes. 123 | 124 | You cannot currently go directly from the superrun's records to the superrun's peaks. This would be 125 | tricky to implement, since (1) (2) even with the same settings, many plugins choose to do something 126 | different depending on the run_id. For example, in straxen the gain model is specified by a file, 127 | but which gains from the file are actually used is dependent on the runid. 128 | 129 | Thus, superruns won't help build data faster, but they will speed up loading data after it has been 130 | built. This is important, because strax' overhead for loading a run is larger than hax, due to its 131 | version and option tracking (this is only true if per-run-default options are allowed). 132 | -------------------------------------------------------------------------------- /strax/plugins/loop_plugin.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import strax 3 | from .plugin import Plugin 4 | from immutabledict import immutabledict 5 | from warnings import warn 6 | 7 | export, __all__ = strax.exporter() 8 | 9 | 10 | @export 11 | class LoopPlugin(Plugin): 12 | """Plugin that disguises multi-kind data-iteration by an event loop.""" 13 | 14 | # time_selection: Kind of time selection to apply: 15 | # - touching: select things that (partially) overlap with the range. 16 | # NB! Use this option with care since if e.g. two events are 17 | # adjacent, touching windows might return ambiguous results as peaks 18 | # may be touching both events. 19 | # The number of samples to be desired to overlapped can be set by 20 | # self.touching_window. Otherwise 0 is assumed (see strax.touching_windows) 21 | # - fully_contained: (default) select things fully contained in the range 22 | time_selection = "fully_contained" 23 | 24 | def compute(self, **kwargs): 25 | # If not otherwise specified, data kind to loop over 26 | # is that of the first dependency (e.g. events) 27 | # Can't be in __init__: deps not initialized then 28 | if hasattr(self, "loop_over"): 29 | loop_over = self.loop_over 30 | else: 31 | loop_over = self.deps[self.depends_on[0]].data_kind 32 | if not isinstance(loop_over, str): 33 | raise TypeError('Please add "loop_over = " to your plugin definition') 34 | 35 | # Group into lists of things (e.g. peaks) 36 | # contained in the base things (e.g. 
events) 37 | base = kwargs[loop_over] 38 | if len(base) > 1: 39 | assert np.all(base[1:]["time"] >= strax.endtime(base[:-1])), f"{base}s overlap" 40 | 41 | for k, things in kwargs.items(): 42 | # Check for sorting 43 | difs = np.diff(things["time"]) 44 | if difs.min(initial=0) < 0: 45 | i_bad = np.argmin(difs) 46 | examples = things[i_bad - 1 : i_bad + 3] 47 | t0 = examples["time"].min() 48 | raise ValueError( 49 | f"Expected {k} to be sorted, but found " 50 | + str([(x["time"] - t0, strax.endtime(x) - t0) for x in examples]) 51 | ) 52 | 53 | if k != loop_over: 54 | if self.time_selection == "fully_contained": 55 | r = strax.split_by_containment(things, base) 56 | elif self.time_selection == "touching": 57 | # Experimental feature that should be handled with care: 58 | # github.com/AxFoundation/strax/pull/424 59 | warn( 60 | f"{self.__class__.__name__} has a touching time " 61 | "selection. This may lead to ambiguous results as two " 62 | f"{loop_over}'s may contain the same {k}, thereby a " 63 | f"given {k} can be included multiple times." 64 | ) 65 | window = 0 66 | if hasattr(self, "touching_window"): 67 | window = self.touching_window 68 | r = strax.split_touching_windows(things, base, window=window) 69 | else: 70 | raise RuntimeError("Unknown time_selection") 71 | if len(r) != len(base): 72 | raise RuntimeError(f"Split {k} into {len(r)}, should be {len(base)}!") 73 | kwargs[k] = r 74 | 75 | if self.multi_output: 76 | # This is the a-typical case. Most of the time you just have 77 | # one output. Just doing the same as below but this time we 78 | # need to create a dict for the outputs. 79 | # NB: both outputs will need to have the same length as the 80 | # base! 81 | results = {k: np.zeros(len(base), dtype=self.dtype[k]) for k in self.provides} 82 | deps_by_kind = self.dependencies_by_kind() 83 | 84 | for i, base_chunk in enumerate(base): 85 | res = self.compute_loop( 86 | base_chunk, **{k: kwargs[k][i] for k in deps_by_kind if k != loop_over} 87 | ) 88 | if not isinstance(res, (dict, immutabledict)): 89 | raise AttributeError("Please provide result in compute loop as dict") 90 | # Convert from dict to array row: 91 | for provides, r in res.items(): 92 | for k, v in r.items(): 93 | if np.shape(v) != np.shape(results[provides][i][k]): 94 | # Make sure that the buffer length as 95 | # defined by the base matches the output of 96 | # the compute argument. 97 | raise ValueError( 98 | f"{provides} returned an improper length array " 99 | f"that is not equal to the {loop_over} " 100 | "data-kind! Are you sure a LoopPlugin is the " 101 | "right Plugin for your application?" 102 | ) 103 | results[provides][i][k] = v 104 | else: 105 | # Normally you end up here were we are going to loop over 106 | # base and add the results to the right format. 
107 | results = np.zeros(len(base), dtype=self.dtype) 108 | deps_by_kind = self.dependencies_by_kind() 109 | 110 | for i, base_chunk in enumerate(base): 111 | r = self.compute_loop( 112 | base_chunk, **{k: kwargs[k][i] for k in deps_by_kind if k != loop_over} 113 | ) 114 | if not isinstance(r, (dict, immutabledict)): 115 | raise AttributeError("Please provide result in compute loop as dict") 116 | # Convert from dict to array row: 117 | for k, v in r.items(): 118 | results[i][k] = v 119 | return results 120 | 121 | def compute_loop(self, *args, **kwargs): 122 | raise NotImplementedError 123 | -------------------------------------------------------------------------------- /strax/processing/statistics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numba 3 | 4 | 5 | import strax 6 | from strax.sort_enforcement import stable_argsort, stable_sort 7 | 8 | export, __all__ = strax.exporter() 9 | 10 | 11 | @export 12 | @numba.njit(nogil=True, cache=True) 13 | def _compute_hdr_core(data, fractions_desired, only_upper_part=False, _buffer_size=10): 14 | """Core computation for highest density region initialization.""" 15 | fi = 0 16 | res = np.zeros((len(fractions_desired), 2, _buffer_size), dtype=np.int32) 17 | res_amp = np.zeros(len(fractions_desired), dtype=np.float32) 18 | 19 | area_tot = np.sum(data) 20 | if area_tot <= 0: 21 | raise ValueError( 22 | "Highest density regions are not defined for distributions " 23 | "with a total probability of less-equal 0." 24 | ) 25 | 26 | max_to_min = stable_argsort(data)[::-1] 27 | return max_to_min, area_tot, res, res_amp, fi 28 | 29 | 30 | @export 31 | @numba.njit(nogil=True, cache=True) 32 | def _process_intervals_numba(ind, gaps, fi, res, g0, _buffer_size): 33 | """Process intervals using numba. 34 | 35 | Args: 36 | ind: Sorted indices 37 | gaps: Gap indices 38 | fi: Current fraction index 39 | res: Result buffer 40 | g0: Start index 41 | _buffer_size: Maximum number of intervals 42 | 43 | Returns: 44 | tuple: (fi + 1, res) Updated fraction index and result buffer 45 | 46 | """ 47 | if len(gaps) > _buffer_size: 48 | res[fi, 0, :] = -1 49 | res[fi, 1, :] = -1 50 | return fi + 1, res 51 | 52 | g_ind = -1 53 | for g_ind, g in enumerate(gaps): 54 | interval = ind[g0:g] 55 | res[fi, 0, g_ind] = interval[0] 56 | res[fi, 1, g_ind] = interval[-1] + 1 57 | g0 = g 58 | 59 | interval = ind[g0:] 60 | res[fi, 0, g_ind + 1] = interval[0] 61 | res[fi, 1, g_ind + 1] = interval[-1] + 1 62 | return fi + 1, res 63 | 64 | 65 | @export 66 | @numba.njit(nogil=True, cache=True) 67 | def _compute_fraction_seen(data, max_to_min, j, lowest_sample_seen, area_tot, only_upper_part): 68 | """Compute fraction seen (numba-compilable part). 
69 | 70 | Args: 71 | data: Input distribution 72 | max_to_min: Sorted indices from max to min 73 | j: Current index 74 | lowest_sample_seen: Current lowest sample 75 | area_tot: Total area 76 | only_upper_part: If True, only compute area between max and current height 77 | 78 | Returns: 79 | tuple: (fraction_seen, sorted_data_max_to_j, actual_lowest) 80 | 81 | """ 82 | lowest_sample_seen *= int(only_upper_part) 83 | sorted_data_max_to_j = data[max_to_min[:j]] 84 | return ( 85 | np.sum(sorted_data_max_to_j - lowest_sample_seen) / area_tot, 86 | sorted_data_max_to_j, 87 | lowest_sample_seen, 88 | ) 89 | 90 | 91 | @export 92 | @numba.njit(nogil=True, cache=True) 93 | def _compute_true_height(sorted_data_sum, j, g, lowest_sample_seen): 94 | """Compute true height (numba-compilable part). 95 | 96 | Args: 97 | sorted_data_sum: Sum of sorted data 98 | j: Current index 99 | g: Fraction ratio 100 | lowest_sample_seen: Current lowest sample 101 | 102 | Returns: 103 | float: True height value 104 | 105 | """ 106 | return (1 - g) * sorted_data_sum / j + g * lowest_sample_seen 107 | 108 | 109 | @export 110 | def highest_density_region(data, fractions_desired, only_upper_part=False, _buffer_size=10): 111 | """Compute highest density region for a given sampled distribution. 112 | 113 | This function splits only the stable sort operation into Python, keeping all other 114 | computations numba-accelerated for maximum performance. 115 | 116 | Args: 117 | data: Sampled distribution 118 | fractions_desired: Area/probability for which HDR should be computed 119 | only_upper_part: If True, only compute area between max and current height 120 | _buffer_size: Size of result buffer (max number of allowed intervals) 121 | 122 | Returns: 123 | tuple: (res, res_amp) where res contains interval indices and res_amp contains 124 | amplitudes for desired fractions 125 | 126 | """ 127 | # Initialize using numba 128 | max_to_min, area_tot, res, res_amp, fi = _compute_hdr_core( 129 | data, fractions_desired, only_upper_part, _buffer_size 130 | ) 131 | 132 | lowest_sample_seen = np.inf 133 | for j in range(1, len(data)): 134 | if lowest_sample_seen == data[max_to_min[j]]: 135 | continue 136 | 137 | lowest_sample_seen = data[max_to_min[j]] 138 | 139 | # Compute fraction seen (numba) 140 | fraction_seen, sorted_data_max_to_j, actual_lowest = _compute_fraction_seen( 141 | data, max_to_min, j, lowest_sample_seen, area_tot, only_upper_part 142 | ) 143 | 144 | m = fractions_desired[fi:] <= fraction_seen 145 | if not np.any(m): 146 | continue 147 | 148 | for fraction_desired in fractions_desired[fi : fi + np.sum(m)]: 149 | g = fraction_desired / fraction_seen 150 | # Compute true height (numba) 151 | true_height = _compute_true_height(np.sum(sorted_data_max_to_j), j, g, actual_lowest) 152 | res_amp[fi] = true_height 153 | 154 | # Only stable_sort in Python mode 155 | with numba.objmode(ind="int64[:]"): 156 | ind = stable_sort(max_to_min[:j]) 157 | 158 | # Rest stays in numba mode 159 | gaps = np.arange(1, len(ind) + 1) 160 | diff = ind[1:] - ind[:-1] 161 | gaps = gaps[:-1][diff > 1] 162 | 163 | # Process intervals with numba 164 | fi, res = _process_intervals_numba(ind, gaps, fi, res, 0, _buffer_size) 165 | 166 | if fi == len(fractions_desired): 167 | return res, res_amp 168 | 169 | # Handle remaining fractions (in numba) 170 | res[fi:, 0, 0] = 0 171 | res[fi:, 1, 0] = len(data) 172 | for ind, fraction_desired in enumerate(fractions_desired[fi:]): 173 | res_amp[fi + ind] = (1 - fraction_desired) * np.sum(data) / len(data) 174 | 175 
| return res, res_amp 176 | -------------------------------------------------------------------------------- /strax/processing/peak_properties.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numba 3 | 4 | import strax 5 | 6 | export, __all__ = strax.exporter() 7 | 8 | 9 | @export 10 | @numba.njit(cache=True, nogil=True) 11 | def index_of_fraction(peaks, fractions_desired): 12 | """Return the (fractional) indices at which the peaks reach fractions_desired of their area. 13 | 14 | :param peaks: strax peak(let)s or other data-bearing dtype 15 | :param fractions_desired: array of floats between 0 and 1 16 | :return: (len(peaks), len(fractions_desired)) array of floats 17 | 18 | """ 19 | results = np.zeros((len(peaks), len(fractions_desired)), dtype=np.float32) 20 | 21 | for p_i, p in enumerate(peaks): 22 | if p["area"] <= 0: 23 | continue # TODO: These occur a lot. Investigate! 24 | compute_index_of_fraction(p, fractions_desired, results[p_i]) 25 | return results 26 | 27 | 28 | @export 29 | @numba.njit(nogil=True, cache=True) 30 | def compute_index_of_fraction(peak, fractions_desired, result): 31 | """Store the (fractional) indices at which peak reaches fractions_desired of their area in 32 | result. 33 | 34 | :param peak: single strax peak(let) or other data-bearing dtype 35 | :param fractions_desired: array of floats between 0 and 1 36 | :return: len(fractions_desired) array of floats 37 | 38 | """ 39 | area_tot = peak["area"] 40 | fraction_seen = 0 41 | current_fraction_index = 0 42 | needed_fraction = fractions_desired[current_fraction_index] 43 | for i, x in enumerate(peak["data"][: peak["length"]]): 44 | # How much of the area is in this sample? 45 | fraction_this_sample = x / area_tot 46 | 47 | # Are we passing any desired fractions in this sample? 48 | while fraction_seen + fraction_this_sample >= needed_fraction: 49 | area_needed = area_tot * (needed_fraction - fraction_seen) 50 | if x != 0: 51 | result[current_fraction_index] = i + area_needed / x 52 | else: 53 | result[current_fraction_index] = i 54 | 55 | # Advance to the next fraction 56 | current_fraction_index += 1 57 | if current_fraction_index > len(fractions_desired) - 1: 58 | break 59 | needed_fraction = fractions_desired[current_fraction_index] 60 | 61 | if current_fraction_index > len(fractions_desired) - 1: 62 | break 63 | 64 | # Add this sample's area to the area seen 65 | fraction_seen += fraction_this_sample 66 | 67 | if needed_fraction == 1: 68 | # Sometimes floating-point errors prevent the full area 69 | # from being reached before the waveform ends 70 | result[-1] = peak["length"] 71 | 72 | 73 | @export 74 | def compute_widths(peaks): 75 | """Compute widths in ns at desired area fractions for peaks. 76 | 77 | :param peaks: single strax peak(let) or other data-bearing dtype 78 | 79 | """ 80 | 81 | desired_widths = np.linspace(0, 1, len(peaks[0]["width"])) 82 | # 0% are width is 0 by definition, and it messes up the calculation below 83 | desired_widths = desired_widths[1:] 84 | 85 | # Which area fractions do we need times for? 
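    # For a width spanning a fraction w of the area (symmetric around the median),
    # we need the times at which area fractions 0.5 - w/2 and 0.5 + w/2 are reached.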
86 | desired_fr = np.concatenate([0.5 - desired_widths / 2, 0.5 + desired_widths / 2]) 87 | 88 | # We lose the 50% fraction with this operation, let's add it back 89 | desired_fr = strax.stable_sort(np.unique(np.append(desired_fr, [0.5]))) 90 | 91 | fr_times = index_of_fraction(peaks, desired_fr) 92 | fr_times *= peaks["dt"].reshape(-1, 1) 93 | 94 | i = len(desired_fr) // 2 95 | median_time = fr_times[:, i] 96 | width = fr_times[:, i:] - fr_times[:, ::-1][:, i:] 97 | area_decile_from_midpoint = fr_times[:, ::2] - fr_times[:, i].reshape(-1, 1) 98 | return median_time, width, area_decile_from_midpoint 99 | 100 | 101 | @numba.njit(cache=True, nogil=True) 102 | def compute_center_time(peaks): 103 | """Compute the center time of the peaks. 104 | 105 | :param peaks: single strax peak(let) or other data-bearing dtype 106 | 107 | """ 108 | center_time = np.zeros(len(peaks), dtype=np.int64) 109 | for p_i, p in enumerate(peaks): 110 | data = p["data"][: p["length"]] 111 | if data.sum() == 0.0: 112 | # Zero-area peaks have centertime at startime 113 | center_time[p_i] = p["time"] 114 | continue 115 | t = np.average(np.arange(p["length"]), weights=data) 116 | center_time[p_i] = (t + 1 / 2) * p["dt"] 117 | center_time[p_i] += p["time"] # converting from float to int, implicit floor 118 | center_time = np.clip(center_time, peaks["time"], strax.endtime(peaks)) 119 | return center_time 120 | 121 | 122 | @export 123 | @numba.njit(cache=True, nogil=True) 124 | def compute_area_fraction_top(peaks, n_top_channels): 125 | """Compute the area fraction top for peaks.""" 126 | area_fraction_top = np.zeros(len(peaks), dtype=np.float32) 127 | for peak_i in range(len(peaks)): 128 | p = peaks[peak_i] 129 | area_top = p["area_per_channel"][:n_top_channels].sum() 130 | # Non-positive-area peaks get NaN AFT 131 | if p["area"] > 0: 132 | area_fraction_top[peak_i] = area_top / p["area"] 133 | else: 134 | area_fraction_top[peak_i] = np.nan 135 | return area_fraction_top 136 | 137 | 138 | @export 139 | def compute_properties(peaks, n_top_channels=0, select_peaks_indices=None): 140 | """Compute properties: median_time, width, area_decile_from_midpoint, 141 | center_time, and area_fraction_top for peaks. 
142 | 143 | :param peaks: single strax peak(let) or other data-bearing dtype 144 | :param select_peaks_indices: array of integers informing which peaks to compute default to None 145 | in which case compute for all peaks 146 | 147 | """ 148 | if not len(peaks) or (select_peaks_indices is not None and not len(select_peaks_indices)): 149 | return 150 | 151 | if select_peaks_indices is None: 152 | select_peaks_indices = slice(None) 153 | 154 | median_time, width, area_decile_from_midpoint = compute_widths(peaks[select_peaks_indices]) 155 | peaks["median_time"][select_peaks_indices] = median_time 156 | peaks["width"][select_peaks_indices] = width 157 | peaks["area_decile_from_midpoint"][select_peaks_indices] = area_decile_from_midpoint 158 | 159 | center_time = compute_center_time(peaks[select_peaks_indices]) 160 | peaks["center_time"][select_peaks_indices] = center_time 161 | 162 | if n_top_channels > 0: 163 | area_fraction_top = compute_area_fraction_top(peaks[select_peaks_indices], n_top_channels) 164 | peaks["area_fraction_top"][select_peaks_indices] = area_fraction_top 165 | -------------------------------------------------------------------------------- /tests/test_peak_splitting.py: -------------------------------------------------------------------------------- 1 | import strax 2 | import numpy as np 3 | from hypothesis import given, settings, strategies 4 | 5 | 6 | def get_int_array(min_value=0, max_value=1, min_size=0, max_size=20) -> strategies.lists: 7 | """Get array with ints. 8 | 9 | :param min_value: min value of items in array 10 | :param max_value: max value of items in array 11 | :param min_size: min number of samples in array 12 | :param max_size: max number of samples in array 13 | :return: strategies.lists of integers of specified format 14 | 15 | """ 16 | return strategies.lists( 17 | strategies.integers(min_value=min_value, max_value=max_value), 18 | min_size=min_size, 19 | max_size=max_size, 20 | ) 21 | 22 | 23 | def get_float_array(min_value=0, max_value=1, min_size=0, max_size=20): 24 | """Get array with floats. 25 | 26 | :param min_value: min value of items in array 27 | :param max_value: max value of items in array 28 | :param min_size: min number of samples in array 29 | :param max_size: max number of samples in array 30 | :return: strategies.lists of floats of specified format 31 | 32 | """ 33 | return strategies.lists( 34 | strategies.floats(min_value=min_value, max_value=max_value), 35 | min_size=min_size, 36 | max_size=max_size, 37 | ) 38 | 39 | 40 | @given( 41 | get_float_array(), 42 | get_int_array(max_value=100), 43 | get_float_array(min_size=20, max_size=150, max_value=100), 44 | ) 45 | @settings(deadline=None) 46 | def test_local_minimum(min_heights, min_ratios, w): 47 | """See _test_splitter_inner.""" 48 | _test_splitter_inner(min_heights, min_ratios, w, "natural_breaks") 49 | 50 | 51 | @given( 52 | get_float_array(), 53 | get_int_array(max_value=100), 54 | get_float_array(min_size=20, max_size=150, max_value=100), 55 | ) 56 | @settings(deadline=None) 57 | def test_natural_breaks(min_heights, min_ratios, w): 58 | """See _test_splitter_inner.""" 59 | _test_splitter_inner(min_heights, min_ratios, w, "local_minimum") 60 | 61 | 62 | def _test_splitter_inner(min_heights, min_ratios, waveform, splitter): 63 | """Test the specified splitting algorithm. 
64 | 
65 |     :param min_heights: list of the minimum heights of the peaks to have a split
66 |     :param min_ratios: list of the ratios of the peaks to have a split
67 |     :param waveform: list of waveform samples (will be converted to an array)
68 |     :param splitter: either 'local_minimum' or 'natural_breaks'
69 | 
70 |     """
71 |     test_splitter = {
72 |         "local_minimum": strax.processing.peak_splitting.LocalMinimumSplitter(),
73 |         "natural_breaks": strax.processing.peak_splitting.NaturalBreaksSplitter(),
74 |     }.get(splitter, None)
75 |     print(f"Testing {splitter}")
76 |     if test_splitter is None:
77 |         raise NotImplementedError(f"Unknown splitter {splitter}")
78 | 
79 |     NO_MORE_SPLITS = strax.processing.peak_splitting.NO_MORE_SPLITS
80 | 
81 |     # mimic a peak
82 |     waveform = np.array(waveform)
83 | 
84 |     for min_height, min_ratio in zip(min_heights, min_ratios):
85 |         # Split according to the different splitters
86 |         if splitter == "local_minimum":
87 |             my_splits = test_splitter.find_split_points(
88 |                 waveform, dt=None, peak_i=None, min_height=min_height, min_ratio=min_ratio
89 |             )
90 |         elif splitter == "natural_breaks":
91 |             # Use min-height here as threshold (>1 meaningless)
92 |             threshold = np.array([min_height])
93 |             my_splits = test_splitter.find_split_points(
94 |                 waveform,
95 |                 dt=1,
96 |                 peak_i=np.int64(0),
97 |                 threshold=threshold,
98 |                 normalize=0,
99 |                 split_low=0,
100 |                 filter_wing_width=0,
101 |             )
102 | 
103 |         my_splits = np.array(list(my_splits))
104 | 
105 |         assert len(my_splits) >= 1
106 |         # get left and right from found splits
107 |         split_checks = [(int(split - 1), int(split + 1), int(split)) for split in my_splits[:, 0]]
108 | 
109 |         # discard last two split-entries if they exist
110 |         # they are len(waveform) and NO_MORE_SPLITS --> nothing to test
111 |         split_checks = split_checks[:-2]
112 | 
113 |         # This check does not have to hold for the natural breaks
114 |         # algorithm, as it uses a moving average
115 |         if splitter == "local_minimum":
116 |             # check that the samples left and right of the split index are larger or equal
117 |             for left, right, split in split_checks:
118 |                 assert waveform[left] >= waveform[split]
119 |                 assert waveform[right] >= waveform[split]
120 | 
121 |         assert len(my_splits) <= int(len(waveform) / 2) + 1
122 |         assert min(my_splits[:, 0]) == NO_MORE_SPLITS
123 |         assert my_splits[-1, 0] == NO_MORE_SPLITS
124 | 
125 | 
126 | def test_splitter_outer():
127 |     data = [0, 2, 2, 0, 2, 2, 1]
128 |     records = np.zeros(1, dtype=strax.record_dtype(len(data)))
129 |     records["dt"] = 1
130 |     records["data"] = data
131 |     records["length"] = len(data)
132 |     records["pulse_length"] = len(data)
133 |     to_pe = np.ones(10)
134 | 
135 |     hits = strax.find_hits(records, np.ones(1))
136 |     hits["left_integration"] = hits["left"]
137 |     hits["right_integration"] = hits["right"]
138 |     peaks = np.zeros(1, dtype=strax.peak_dtype())
139 |     hitlets = np.zeros(1, dtype=strax.hitlet_with_data_dtype(10))
140 |     for data_type in (peaks, hitlets):
141 |         data_type["dt"] = 1
142 |         data_type["data"][0, : len(data)] = data
143 |         data_type["length"] = len(data)
144 | 
145 |     rlinks = strax.record_links(records)
146 |     peaks = strax.split_peaks(
147 |         peaks,
148 |         hits,
149 |         records,
150 |         rlinks,
151 |         to_pe,
152 |         algorithm="local_minimum",
153 |         data_type="peaks",
154 |         min_height=1,
155 |         min_ratio=0,
156 |     )
157 | 
158 |     hitlets = strax.split_peaks(
159 |         hitlets,
160 |         hits,
161 |         records,
162 |         rlinks,
163 |         to_pe,
164 |         algorithm="local_minimum",
165 |         data_type="hitlets",
166 |         min_height=1,
167 |         min_ratio=0,
168 |     )
169 | 
170 |     for name, data_type in
zip(("peaks", "hitlets"), (peaks, hitlets)): 171 | data = data_type[0]["data"][: data_type[0]["length"]] 172 | assert np.all( 173 | data == [0, 2, 2] 174 | ), f"Wrong split for {name}, got {data}, expected {[0, 2, 2]}." 175 | data = data_type[1]["data"][: data_type[1]["length"]] 176 | assert np.all( 177 | data == [0, 2, 2, 1] 178 | ), f"Wrong split for {name}, got {data}, expected {[0, 2, 2, 1]}." 179 | -------------------------------------------------------------------------------- /strax/plugins/overlap_window_plugin.py: -------------------------------------------------------------------------------- 1 | import strax 2 | from .plugin import Plugin 3 | 4 | export, __all__ = strax.exporter() 5 | 6 | 7 | @export 8 | class OverlapWindowPlugin(Plugin): 9 | """Plugin whose computation depends on having its inputs extend a certain window on both sides. 10 | 11 | Current implementation assumes: 12 | - All inputs are sorted by *endtime*. Since everything in strax is sorted 13 | by time, this only works for disjoint intervals such as peaks or events, 14 | but NOT records! 15 | - You must read time info for your data kind, or create a new data kind. 16 | 17 | """ 18 | 19 | parallel = False 20 | max_trials = 10 21 | 22 | def __init__(self): 23 | super().__init__() 24 | self.cached_input = {} 25 | self.init_cached_results() 26 | self.sent_until = 0 27 | if self.clean_chunk_after_compute: 28 | raise ValueError( 29 | "OverlapWindowPlugin cannot clean chunks after compute because you need them later." 30 | ) 31 | # This guy can have a logger, it's not parallelized anyway 32 | 33 | def get_window_size(self): 34 | """Return the required window size in nanoseconds.""" 35 | raise NotImplementedError 36 | 37 | def _get_window_size(self): 38 | window_size = self.get_window_size() 39 | if isinstance(window_size, (int, float)): 40 | return window_size, window_size 41 | elif isinstance(window_size, (list, tuple)) and len(window_size) == 2: 42 | if window_size[0] < 0 or window_size[1] < 0: 43 | raise ValueError("Window size elements must be non-negative") 44 | return window_size 45 | else: 46 | raise ValueError( 47 | "Window size must be an integer(float) or a tuple of two integer(float)s" 48 | ) 49 | 50 | def init_cached_results(self): 51 | if self.multi_output: 52 | self.cached_results = {} 53 | else: 54 | self.cached_results = None 55 | 56 | def iter(self, iters, executor=None): 57 | yield from super().iter(iters, executor=executor) 58 | 59 | # Yield final results, kept at bay in fear of a new chunk 60 | yield self.cached_results 61 | 62 | def do_compute(self, chunk_i=None, **kwargs): 63 | if not len(kwargs): 64 | raise RuntimeError("OverlapWindowPlugin must have a dependency") 65 | 66 | # Add cached inputs to compute arguments 67 | for data_kind, chunk in kwargs.items(): 68 | if len(self.cached_input): 69 | kwargs[data_kind] = strax.Chunk.concatenate( 70 | [self.cached_input[data_kind], chunk], self.allow_superrun 71 | ) 72 | 73 | # When does this batch of inputs end? 74 | ends = [c.end for c in kwargs.values()] 75 | if not len(set(ends)) == 1: 76 | raise RuntimeError(f"OverlapWindowPlugin got incongruent inputs: {kwargs}") 77 | end = ends[0] 78 | 79 | window_size = self._get_window_size() 80 | # When can we no longer trust our results? 
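        # Results closer than one (right) window to the chunk end may still change
        # once input from the next chunk arrives, so they are cached instead of sent.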
81 |         # Take slightly larger windows for safety: it is very easy for me
82 |         # (or the user) to have made an off-by-one error
83 |         invalid_beyond = int(end - 2 * window_size[1] - 1)
84 | 
85 |         # Compute new results
86 |         result = super().do_compute(chunk_i=chunk_i, **kwargs)
87 | 
88 |         # Throw away results we already sent out.
89 |         # No error is raised here even though allow_early_split=False,
90 |         # because result.split(t=invalid_beyond, allow_early_split=True) tunes
91 |         # sent_until such that it does not overlap with result and
92 |         # sent_until <= invalid_beyond
93 |         if self.multi_output:
94 |             # when multi_output=True, the result is a dict
95 |             for data_type in result:
96 |                 result[data_type] = result[data_type].split(
97 |                     t=self.sent_until, allow_early_split=False
98 |                 )[1]
99 |         else:
100 |             result = result.split(t=self.sent_until, allow_early_split=False)[1]
101 | 
102 |         # Prepare to send out valid results, cache the rest
103 |         # Do not modify result anymore after these lines
104 |         # Note result.end <= invalid_beyond, with equality if there are no overlaps
105 |         if self.multi_output:
106 |             prev_split = self.cache_beyond(result, invalid_beyond, self.cached_results)
107 |             for data_type in result:
108 |                 result[data_type], self.cached_results[data_type] = result[data_type].split(
109 |                     t=prev_split, allow_early_split=True
110 |                 )
111 |             if len(set([c.start for c in self.cached_results.values()])) != 1:
112 |                 raise ValueError("Output start time inconsistency has not been resolved?")
113 |             self.sent_until = prev_split
114 |         else:
115 |             result, self.cached_results = result.split(t=invalid_beyond, allow_early_split=True)
116 |             self.sent_until = self.cached_results.start
117 | 
118 |         # Cache a necessary amount of input for next time
119 |         # Again, take a bit of overkill for good measure
120 |         # cache_inputs_beyond is smaller than sent_until
121 |         cache_inputs_beyond = int(self.sent_until - 2 * window_size[0] - 1)
122 | 
123 |         # Cache inputs, make sure that the chunks start at the same time to
124 |         # prevent issues in input buffers later on
125 |         self.cache_beyond(kwargs, cache_inputs_beyond, self.cached_input)
126 |         return result
127 | 
128 |     def cache_beyond(self, io, prev_split, cached):
129 |         original_prev_split = prev_split
130 |         for try_counter in range(self.max_trials):
131 |             for data, chunk in io.items():
132 |                 # data here can be either a data_kind or a data_type
133 |                 # do not temporarily modify result here because it will be used later;
134 |                 # keep its original value!
135 |                 cached[data] = chunk.split(t=prev_split, allow_early_split=True)[1]
136 |                 prev_split = cached[data].start
137 |             unique_starts = set([c.start for c in cached.values()])
138 |             if len(unique_starts) == 1:
139 |                 self.log.debug(
140 |                     f"Success after {try_counter}. "
141 |                     f"Extra time is {original_prev_split - prev_split} ns"
142 |                 )
143 |                 break
144 |             else:
145 |                 self.log.debug(
146 |                     f"Inconsistent start times of the cached chunks {io} after"
147 |                     f" {try_counter}/{self.max_trials} passes."
148 | ) 149 | else: 150 | raise ValueError( 151 | f"Buffer start time inconsistency cannot be resolved after {self.max_trials} tries" 152 | ) 153 | return prev_split 154 | -------------------------------------------------------------------------------- /strax/io.py: -------------------------------------------------------------------------------- 1 | """Read/write numpy arrays to/from compressed files or file-like objects.""" 2 | 3 | import os 4 | import bz2 5 | import json 6 | 7 | import numpy as np 8 | import blosc 9 | import zstd 10 | import zstandard 11 | import lz4.frame as lz4 12 | from ast import literal_eval 13 | 14 | import strax 15 | from strax import RUN_METADATA_PATTERN 16 | 17 | export, __all__ = strax.exporter() 18 | __all__.extend(["DECOMPRESS_BUFFER_SIZE"]) 19 | 20 | DECOMPRESS_BUFFER_SIZE = 64 * 1024 * 1024 # 64 MB 21 | 22 | # use tqdm as loaded in utils (from tqdm.notebook when in a jupyter env) 23 | tqdm = strax.utils.tqdm 24 | 25 | blosc.set_releasegil(True) 26 | blosc.set_nthreads(1) 27 | 28 | 29 | def _bz2_decompress(f, buffer_size=DECOMPRESS_BUFFER_SIZE): 30 | decompressor = bz2.BZ2Decompressor() 31 | data = bytearray() # Efficient mutable storage 32 | for d in iter(lambda: f.read(buffer_size), b""): 33 | data.extend(decompressor.decompress(d)) 34 | return data 35 | 36 | 37 | # zstd's default compression level is 3: 38 | # https://github.com/sergey-dryabzhinsky/python-zstd/blob/eba9e633e0bc0e9c9762c985d0433e08405fd097/src/python-zstd.h#L53 39 | # we also need to constraint the number of worker threads to 1 40 | # https://github.com/sergey-dryabzhinsky/python-zstd/blob/eba9e633e0bc0e9c9762c985d0433e08405fd097/src/python-zstd.h#L98 41 | _zstd_compress = lambda data: zstd.compress(data, 3, 1) 42 | 43 | 44 | def _zstd_decompress(f, chunk_size=64 * 1024 * 1024): 45 | decompressor = zstandard.ZstdDecompressor().decompressobj() 46 | data = bytearray() # Efficient mutable storage 47 | for d in iter(lambda: f.read(chunk_size), b""): 48 | data.extend(decompressor.decompress(d)) 49 | return data 50 | 51 | 52 | def _blosc_compress(data): 53 | if data.nbytes >= blosc.MAX_BUFFERSIZE: 54 | raise ValueError("Blosc's input buffer cannot exceed ~2 GB") 55 | return blosc.compress(data, shuffle=False) 56 | 57 | 58 | def _blosc_decompress(f): 59 | data = f.read() 60 | data = blosc.decompress(data) 61 | return data 62 | 63 | 64 | def _lz4_decompress(f, buffer_size=DECOMPRESS_BUFFER_SIZE): 65 | decompressor = lz4.LZ4FrameDecompressor() 66 | data = bytearray() # Efficient mutable storage 67 | for d in iter(lambda: f.read(buffer_size), b""): 68 | data.extend(decompressor.decompress(d)) 69 | return data 70 | 71 | 72 | COMPRESSORS = dict( 73 | bz2=dict(compress=bz2.compress, decompress=bz2.decompress, _decompress=_bz2_decompress), 74 | zstd=dict(compress=_zstd_compress, decompress=zstd.decompress, _decompress=_zstd_decompress), 75 | blosc=dict( 76 | compress=_blosc_compress, decompress=blosc.decompress, _decompress=_blosc_decompress 77 | ), 78 | lz4=dict(compress=lz4.compress, decompress=lz4.decompress, _decompress=_lz4_decompress), 79 | ) 80 | 81 | 82 | @export 83 | def load_file(f, compressor, dtype): 84 | """Read and return data from file. 85 | 86 | :param f: file name or handle to read from 87 | :param compressor: compressor to use for decompressing. If not passed, will try to load it from 88 | json metadata file. 
89 | :param dtype: numpy dtype of data to load 90 | 91 | """ 92 | if isinstance(f, str): 93 | with open(f, mode="rb") as write_file: 94 | return _load_file(write_file, compressor, dtype) 95 | else: 96 | return _load_file(f, compressor, dtype) 97 | 98 | 99 | def _load_file(f, compressor, dtype): 100 | try: 101 | data = COMPRESSORS[compressor]["_decompress"](f) 102 | if not len(data): 103 | return np.zeros(0, dtype=dtype) 104 | try: 105 | return np.frombuffer(data, dtype=dtype) 106 | except ValueError as e: 107 | raise ValueError(f"ValueError while loading data with dtype =\n\t{dtype}") from e 108 | 109 | except Exception: 110 | raise strax.DataCorrupted( 111 | f"Fatal Error while reading file {f}: " + strax.utils.formatted_exception() 112 | ) 113 | 114 | 115 | @export 116 | def save_file(f, data, compressor="zstd"): 117 | """Save data to file and return number of bytes written. 118 | 119 | :param f: file name or handle to save to 120 | :param data: data (numpy array) to save 121 | :param compressor: compressor to use 122 | 123 | """ 124 | if isinstance(f, str): 125 | final_fn = f 126 | temp_fn = f + "_temp" 127 | with open(temp_fn, mode="wb") as write_file: 128 | result = _save_file(write_file, data, compressor) 129 | os.rename(temp_fn, final_fn) 130 | return result 131 | else: 132 | return _save_file(f, data, compressor) 133 | 134 | 135 | def _save_file(f, data, compressor="zstd"): 136 | assert isinstance(data, np.ndarray), "Please pass a numpy array" 137 | d_comp = COMPRESSORS[compressor]["compress"](data) 138 | f.write(d_comp) 139 | return len(d_comp) 140 | 141 | 142 | @export 143 | def dry_load_files(dirname, chunk_numbers=None, disable=False, **kwargs): 144 | prefix = strax.storage.files.dirname_to_prefix(dirname) 145 | metadata_json = RUN_METADATA_PATTERN % prefix 146 | md_path = os.path.join(dirname, metadata_json) 147 | 148 | with open(md_path, mode="r") as f: 149 | metadata = json.loads(f.read()) 150 | 151 | dtype = literal_eval(metadata["dtype"]) 152 | 153 | def load_chunk(chunk_info): 154 | if chunk_info["n"] != 0: 155 | data = load_file( 156 | os.path.join(dirname, f"{prefix}-{chunk_info['chunk_i']:06d}"), 157 | metadata["compressor"], 158 | dtype, 159 | ) 160 | if len(data) != chunk_info["n"]: 161 | raise ValueError( 162 | f"Chunk {chunk_info['chunk_i']:06d} has {len(data)} " 163 | f"items, but metadata says {chunk_info['n']}." 164 | ) 165 | else: 166 | data = np.empty(0, dtype) 167 | return data 168 | 169 | # Load all chunks if chunk_numbers is None, otherwise load the specified chunk 170 | if chunk_numbers is None: 171 | chunk_numbers = list(range(len(metadata["chunks"]))) 172 | else: 173 | if not isinstance(chunk_numbers, (int, list, tuple)): 174 | raise ValueError( 175 | f"Chunk number must be int, list, or tuple, not {type(chunk_numbers)}." 
176 |             )
177 |         chunk_numbers = (
178 |             chunk_numbers if isinstance(chunk_numbers, (list, tuple)) else [chunk_numbers]
179 |         )
180 |         if max(chunk_numbers) >= len(metadata["chunks"]):
181 |             raise ValueError(f"Chunk {max(chunk_numbers):06d} does not exist in {dirname}.")
182 | 
183 |     results = []
184 |     for c in tqdm(chunk_numbers, disable=disable):
185 |         chunk_info = metadata["chunks"][c]
186 |         x = load_chunk(chunk_info)
187 |         x = strax.apply_selection(x, **kwargs)
188 |         results.append(x)
189 | 
190 |     # No need to hstack if only one chunk is loaded
191 |     if len(results) == 1:
192 |         results = results[0]
193 |     else:
194 |         results = np.hstack(results)
195 |     return results if len(results) else np.empty(0, dtype)
196 | 
--------------------------------------------------------------------------------
/docs/source/advanced/recompression.rst:
--------------------------------------------------------------------------------
1 | Recompressing & moving data
2 | ===========================
3 | There are two options for recompressing data:
4 | - via the context :py:func:`context.copy_to_frontend`
5 | - via a dedicated script ``rechunker`` that only works for filesystem backends and works outside the context.
6 | 
7 | In order to recompress data with another compression algorithm the
8 | :py:func:`context.copy_to_frontend` function can be used.
9 | The function works on a per run_id-, per datatype- basis. In the example
10 | below, records data is copied to a second frontend.
11 | 
12 | 
13 | .. code-block:: python
14 | 
15 |     import strax
16 |     import os
17 |     # Naturally, these plugins (Records and Peaks) only serve as examples
18 |     # and are best replaced by a fully constructed context
19 |     from strax.testutils import Records, Peaks, run_id
20 | 
21 |     # Initialize context (st):
22 |     st = strax.Context(register=[Records, Peaks])
23 | 
24 |     # Initialize frontends
25 |     storage_frontend_A = strax.DataDirectory('./folder_A')
26 |     storage_frontend_B = strax.DataDirectory('./folder_B',
27 |                                              readonly=True)
28 |     st.storage = [storage_frontend_A,
29 |                   storage_frontend_B]
30 | 
31 |     # In this example, we will only consider records
32 |     target = "records"
33 | 
34 |     print(f'Are records stored?\n{st.is_stored(run_id, target)}')
35 | 
36 |     # Make the data (stores to every frontend available)
37 |     st.get_array(run_id, 'records')
38 | 
39 |     for sf in st.storage:
40 |         print(f'{target} stored in\n\t{sf}?\n\t{st._is_stored_in_sf(run_id, target, sf)}')
41 | 
42 | Which prints:
43 | 
44 | .. code-block:: rst
45 | 
46 |     Are records stored?
47 |     False
48 |     records stored in
49 |     strax.storage.files.DataDirectory, path: ./folder_A?
50 |     True
51 |     records stored in
52 |     strax.storage.files.DataDirectory, readonly: True, path: ./folder_B?
53 |     False
54 | 
55 | Copy
56 | ____
57 | In the example above, `storage_frontend_B` was readonly; therefore,
58 | no data was stored there when the records were created.
59 | Below, we will copy the data from `storage_frontend_A` to
60 | `storage_frontend_B`.
61 | 
62 | .. code-block:: python
63 | 
64 |     # First set storage_frontend_B to readonly=False such that we can copy
65 |     # data there
66 |     storage_frontend_B.readonly = False
67 | 
68 |     # In the st.storage-list, storage_frontend_B is index 1
69 |     index_frontend_B = 1
70 |     st.copy_to_frontend(run_id, target,
71 |                         target_frontend_id=index_frontend_B)
72 | 
73 |     for sf in [storage_frontend_A, storage_frontend_B]:
74 |         print(f'{target} stored in\n\t{sf}?\n\t{st._is_stored_in_sf(run_id, target, sf)}')
75 | 
76 | 
77 | Which prints the following (so we can see that the copy to `folder_B`
78 | was successful):
79 | 
80 | .. code-block:: rst
81 | 
82 |     records stored in
83 |     strax.storage.files.DataDirectory, path: ./folder_A?
84 |     True
85 |     records stored in
86 |     strax.storage.files.DataDirectory, path: ./folder_B?
87 |     True
88 | 
89 | Copy and recompress
90 | ___________________
91 | Now, with a third storage frontend, we will recompress the data to
92 | reduce the size on disk.
93 | 
94 | .. code-block:: python
95 | 
96 |     # Recompression with a different compressor
97 |     # See strax.io.COMPRESSORS for more compressors
98 |     target_compressor = 'bz2'
99 | 
100 |     # Add the extra storage frontend
101 |     index_frontend_C = 2
102 |     storage_frontend_C = strax.DataDirectory('./folder_C')
103 |     st.storage.append(storage_frontend_C)
104 | 
105 |     # Copy and recompress
106 |     st.copy_to_frontend(run_id, target,
107 |                         target_frontend_id=index_frontend_C,
108 |                         target_compressor=target_compressor)
109 | 
110 |     for sf in st.storage:
111 |         first_chunk = os.path.join(sf.path,
112 |                                    '0-records-sqcyyhsfpv',
113 |                                    'records-sqcyyhsfpv-000000')
114 |         print(f'In {sf.path}, the first chunk is {os.path.getsize(first_chunk)} kB')
115 | 
116 | Which outputs:
117 | 
118 | .. code-block:: rst
119 | 
120 |     In ./folder_A, the first chunk is 275 kB
121 |     In ./folder_B, the first chunk is 275 kB
122 |     In ./folder_C, the first chunk is 65 kB
123 | 
124 | From the output we can see that the first chunk in folder_C is much
125 | smaller than in folder_A/folder_B. This comes
126 | from the fact that `bz2` compresses the data much more than the default
127 | compressor `blosc`.
128 | 
129 | How does this work?
130 | ___________________
131 | Strax knows from the metadata stored with the data which compressor
132 | the data was written with. It is possible to use a different
133 | compressor when re-writing the data to disk (as was done for
134 | folder_C in the example above).
135 | 
136 | 
137 | 
138 | As such, for further use, it does not matter if the data is coming from
139 | either of folders folder_A-folder_C as the metadata will tell strax
140 | which compressor to use. Different compressors may have different
141 | performance for loading/writing data.
142 | 
143 | Rechunker script
144 | ================
145 | From strax v1.2.2 onwards, a ``rechunker`` script is automatically installed with strax.
146 | It can be used to re-write data in the ``FileSystem`` backend.
147 | 
148 | 
149 | For example:
150 | 
151 | .. code-block:: bash
152 | 
153 |     rechunker --source 009104-raw_records_aqmon-rfzvpzj4mf --compressor zstd
154 | 
155 | will output:
156 | 
157 | 
158 | .. 
code-block:: rst 159 | 160 | Will write to /tmp/tmpoj0xpr78 and make sub-folder 009104-raw_records_aqmon-rfzvpzj4mf 161 | Rechunking 009104-raw_records_aqmon-rfzvpzj4mf to /tmp/tmpoj0xpr78/009104-raw_records_aqmon-rfzvpzj4mf 162 | move /tmp/tmpoj0xpr78/009104-raw_records_aqmon-rfzvpzj4mf to 009104-raw_records_aqmon-rfzvpzj4mf 163 | Re-compressed 009104-raw_records_aqmon-rfzvpzj4mf 164 | backend_key 009104-raw_records_aqmon-rfzvpzj4mf 165 | load_time 0.4088103771209717 166 | write_time 0.07699322700500488 167 | uncompressed_mb 1.178276 168 | source_compressor zstd 169 | dest_compressor zstd 170 | source_mb 0.349217 171 | dest_mb 0.349218 172 | 173 | Using script to profile write/read rates for compressors 174 | -------------------------------------------------------- 175 | This script can easily be used to profile different compressors: 176 | 177 | .. code-block:: bash 178 | 179 | for COMPRESSOR in zstd bz2 lz4 blosc zstd; \ 180 | do echo $COMPRESSOR; \ 181 | rechunker \ 182 | --source 009104-raw_records-rfzvpzj4mf \ 183 | --write_stats_to test.csv \ 184 | --compressor $COMPRESSOR; \ 185 | done 186 | 187 | We can check the output in python using: 188 | 189 | .. code-block:: python 190 | 191 | >>> import pandas as pd 192 | >>> df = pd.read_csv('test.csv') 193 | >>> df['read_mbs'] = df['uncompressed_mb']/df['load_time'] 194 | >>> df['write_mbs'] = df['uncompressed_mb']/df['write_time'] 195 | >>> print(df[['source_compressor', 'read_mbs', 'dest_compressor', 'write_mbs']].to_string()) 196 | source_compressor read_mbs dest_compressor write_mbs 197 | 0 zstd 313.922890 zstd 298.429123 198 | 1 zstd 284.530054 bz2 8.932259 199 | 2 bz2 20.289876 lz4 228.932498 200 | 3 lz4 372.491150 blosc 433.494794 201 | 4 blosc 725.154966 zstd 215.765177 202 | -------------------------------------------------------------------------------- /strax/plugins/parrallel_source_plugin.py: -------------------------------------------------------------------------------- 1 | import strax 2 | from .plugin import Plugin 3 | 4 | export, __all__ = strax.exporter() 5 | 6 | 7 | @export 8 | class ParallelSourcePlugin(Plugin): 9 | """An plugin that inlines the computations of other plugins and the saving of their results. 10 | 11 | This evades data transfer (pickling and/or memory copy) penalties while multiprocessing. 12 | 13 | """ 14 | 15 | parallel = "process" 16 | # should we set this here? 17 | input_timeout = 300 18 | 19 | @classmethod 20 | def inline_plugins(cls, components, start_from, log): 21 | plugins = components.plugins.copy() 22 | loader_plugins = components.loader_plugins.copy() 23 | log.debug(f"Try to inline plugins starting from {start_from}") 24 | 25 | sub_plugins = {start_from: plugins[start_from]} 26 | del plugins[start_from] 27 | 28 | # Gather all plugins that do not rechunk and which branch out as a 29 | # simple tree from the input plugin. 30 | # We'll run these all together in one process. 31 | while True: 32 | # Scan for plugins we can inline 33 | for p in plugins.values(): 34 | if p.parallel and all([d in sub_plugins for d in p.depends_on]): 35 | for d in p.provides: 36 | sub_plugins[d] = p 37 | if d in plugins: 38 | del plugins[d] 39 | # Rescan 40 | break 41 | else: 42 | # No more plugins we can inline 43 | break 44 | log.debug(f"Trying to inline the following sub-plugins: {sub_plugins}") 45 | if len(set(list(sub_plugins.values()))) == 1: 46 | # Just one plugin to inline: no use 47 | log.debug("Just one plugin to inline: skipping") 48 | return components 49 | 50 | # Which data types should we output? 
Three cases follow. 51 | outputs_to_send = set() 52 | 53 | # Case 1. Requested as a final target 54 | for p in sub_plugins.values(): 55 | outputs_to_send.update(set(components.targets).intersection(set(p.provides))) 56 | # Case 2. Requested by a plugin we did not inline 57 | for d, p in plugins.items(): 58 | outputs_to_send.update(set(p.depends_on)) 59 | outputs_to_send &= sub_plugins.keys() 60 | 61 | # Inline savers that do not require rechunking 62 | savers = components.savers 63 | sub_savers = dict() 64 | for p in sub_plugins.values(): 65 | for d in p.provides: 66 | if d not in savers: 67 | continue 68 | if p.can_rechunk(d): 69 | # Case 3. has a saver we can't inline 70 | outputs_to_send.add(d) 71 | continue 72 | 73 | remaining_savers = [] 74 | for s_i, s in enumerate(savers[d]): 75 | if not s.allow_fork: 76 | # Case 3 again, cannot inline saver 77 | outputs_to_send.add(d) 78 | remaining_savers.append(s) 79 | continue 80 | if d not in sub_savers: 81 | sub_savers[d] = [] 82 | s.is_forked = True 83 | sub_savers[d].append(s) 84 | savers[d] = remaining_savers 85 | 86 | if not len(savers[d]): 87 | del savers[d] 88 | 89 | p = cls(depends_on=sub_plugins[start_from].depends_on) 90 | p.run_id = sub_plugins[start_from]._run_id 91 | p.sub_plugins = sub_plugins 92 | assert len(outputs_to_send) 93 | p.provides = tuple(outputs_to_send) 94 | p.sub_savers = sub_savers 95 | p.start_from = start_from 96 | if p.multi_output: 97 | p.dtype = {} 98 | for d in outputs_to_send: 99 | if d in p.sub_plugins: 100 | p.dtype[d] = p.sub_plugins[d].dtype_for(d) 101 | else: 102 | log.debug(f"Finding plugin that provides {d}") 103 | # Need to do some more work to get the plugin that 104 | # provides this data-type. 105 | for sp in p.sub_plugins.values(): 106 | if d in sp.provides: 107 | log.debug(f"{sp} provides {d}") 108 | p.dtype[d] = sp.dtype_for(d) 109 | break 110 | else: 111 | to_send = list(outputs_to_send)[0] 112 | p.dtype = p.sub_plugins[to_send].dtype_for(to_send) 113 | for d in p.provides: 114 | plugins[d] = p 115 | 116 | log.debug(f"Trying to find plugins for dependencies: {p.depends_on}") 117 | 118 | p.deps = { 119 | d: plugins[d] if plugins.get(d, None) else loader_plugins[d] for d in p.depends_on 120 | } 121 | 122 | log.debug(f"Inlined plugins: {p.sub_plugins}.Inlined savers: {p.sub_savers}") 123 | 124 | return strax.ProcessorComponents( 125 | plugins, components.loaders, components.loader_plugins, savers, components.targets 126 | ) 127 | 128 | def __init__(self, depends_on): 129 | self.depends_on = depends_on 130 | super().__init__() 131 | 132 | def source_finished(self): 133 | return self.sub_plugins[self.start_from].source_finished() 134 | 135 | def is_ready(self, chunk_i): 136 | return self.sub_plugins[self.start_from].is_ready(chunk_i) 137 | 138 | def do_compute(self, chunk_i=None, **kwargs): 139 | results = kwargs 140 | 141 | # Run the different plugin computations 142 | while True: 143 | for output_name, p in self.sub_plugins.items(): 144 | if output_name in results: 145 | continue 146 | if any([d not in results for d in p.depends_on]): 147 | continue 148 | compute_kwargs = dict(chunk_i=chunk_i) 149 | 150 | for kind, d_of_kind in p.dependencies_by_kind().items(): 151 | compute_kwargs[kind] = strax.Chunk.merge([results[d] for d in d_of_kind]) 152 | 153 | # Store compute result(s) 154 | r = p.do_compute(**compute_kwargs) 155 | if p.multi_output: 156 | for d in r: 157 | results[d] = r[d] 158 | else: 159 | results[output_name] = r 160 | 161 | # Rescan plugins to see if we can compute anything more 162 | 
break 163 | 164 | else: 165 | # Nothing further to compute 166 | break 167 | for d in self.provides: 168 | assert d in results, f"Output {d} missing!" 169 | 170 | # Save anything we can through the inlined savers 171 | for d, savers in self.sub_savers.items(): 172 | for s in savers: 173 | s.save(chunk=results[d], chunk_i=chunk_i) 174 | 175 | # Remove results we do not need to send 176 | for d in list(results.keys()): 177 | if d not in self.provides: 178 | del results[d] 179 | 180 | if self.multi_output: 181 | for k in self.provides: 182 | assert k in results 183 | assert isinstance(results[k], strax.Chunk) 184 | r0 = results[k] 185 | else: 186 | results = r0 = results[self.provides[0]] 187 | assert isinstance(r0, strax.Chunk) 188 | 189 | return self._fix_output( 190 | results, start=r0.start, end=r0.end, superrun=r0.superrun, subruns=r0.subruns 191 | ) 192 | 193 | def cleanup(self, wait_for): 194 | print(f"{self.__class__.__name__} terminated. Waiting for {len(wait_for)} pending futures.") 195 | for savers in self.sub_savers.values(): 196 | for s in savers: 197 | s.close(wait_for=wait_for) 198 | super().cleanup(wait_for) 199 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | REM Command file for Sphinx documentation 4 | 5 | if "%SPHINXBUILD%" == "" ( 6 | set SPHINXBUILD=sphinx-build 7 | ) 8 | set BUILDDIR=build 9 | set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% source 10 | set I18NSPHINXOPTS=%SPHINXOPTS% source 11 | if NOT "%PAPER%" == "" ( 12 | set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS% 13 | set I18NSPHINXOPTS=-D latex_paper_size=%PAPER% %I18NSPHINXOPTS% 14 | ) 15 | 16 | if "%1" == "" goto help 17 | 18 | if "%1" == "help" ( 19 | :help 20 | echo.Please use `make ^` where ^ is one of 21 | echo. html to make standalone HTML files 22 | echo. dirhtml to make HTML files named index.html in directories 23 | echo. singlehtml to make a single large HTML file 24 | echo. pickle to make pickle files 25 | echo. json to make JSON files 26 | echo. htmlhelp to make HTML files and a HTML help project 27 | echo. qthelp to make HTML files and a qthelp project 28 | echo. devhelp to make HTML files and a Devhelp project 29 | echo. epub to make an epub 30 | echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter 31 | echo. text to make text files 32 | echo. man to make manual pages 33 | echo. texinfo to make Texinfo files 34 | echo. gettext to make PO message catalogs 35 | echo. changes to make an overview over all changed/added/deprecated items 36 | echo. xml to make Docutils-native XML files 37 | echo. pseudoxml to make pseudoxml-XML files for display purposes 38 | echo. linkcheck to check all external links for integrity 39 | echo. doctest to run all doctests embedded in the documentation if enabled 40 | echo. coverage to run coverage check of the documentation if enabled 41 | goto end 42 | ) 43 | 44 | if "%1" == "clean" ( 45 | for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i 46 | del /q /s %BUILDDIR%\* 47 | goto end 48 | ) 49 | 50 | 51 | REM Check if sphinx-build is available and fallback to Python version if any 52 | %SPHINXBUILD% 1>NUL 2>NUL 53 | if errorlevel 9009 goto sphinx_python 54 | goto sphinx_ok 55 | 56 | :sphinx_python 57 | 58 | set SPHINXBUILD=python -m sphinx.__init__ 59 | %SPHINXBUILD% 2> nul 60 | if errorlevel 9009 ( 61 | echo. 62 | echo.The 'sphinx-build' command was not found. 
Make sure you have Sphinx 63 | echo.installed, then set the SPHINXBUILD environment variable to point 64 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 65 | echo.may add the Sphinx directory to PATH. 66 | echo. 67 | echo.If you don't have Sphinx installed, grab it from 68 | echo.http://sphinx-doc.org/ 69 | exit /b 1 70 | ) 71 | 72 | :sphinx_ok 73 | 74 | 75 | if "%1" == "html" ( 76 | %SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html 77 | if errorlevel 1 exit /b 1 78 | echo. 79 | echo.Build finished. The HTML pages are in %BUILDDIR%/html. 80 | goto end 81 | ) 82 | 83 | if "%1" == "dirhtml" ( 84 | %SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml 85 | if errorlevel 1 exit /b 1 86 | echo. 87 | echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml. 88 | goto end 89 | ) 90 | 91 | if "%1" == "singlehtml" ( 92 | %SPHINXBUILD% -b singlehtml %ALLSPHINXOPTS% %BUILDDIR%/singlehtml 93 | if errorlevel 1 exit /b 1 94 | echo. 95 | echo.Build finished. The HTML pages are in %BUILDDIR%/singlehtml. 96 | goto end 97 | ) 98 | 99 | if "%1" == "pickle" ( 100 | %SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle 101 | if errorlevel 1 exit /b 1 102 | echo. 103 | echo.Build finished; now you can process the pickle files. 104 | goto end 105 | ) 106 | 107 | if "%1" == "json" ( 108 | %SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json 109 | if errorlevel 1 exit /b 1 110 | echo. 111 | echo.Build finished; now you can process the JSON files. 112 | goto end 113 | ) 114 | 115 | if "%1" == "htmlhelp" ( 116 | %SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp 117 | if errorlevel 1 exit /b 1 118 | echo. 119 | echo.Build finished; now you can run HTML Help Workshop with the ^ 120 | .hhp project file in %BUILDDIR%/htmlhelp. 121 | goto end 122 | ) 123 | 124 | if "%1" == "qthelp" ( 125 | %SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp 126 | if errorlevel 1 exit /b 1 127 | echo. 128 | echo.Build finished; now you can run "qcollectiongenerator" with the ^ 129 | .qhcp project file in %BUILDDIR%/qthelp, like this: 130 | echo.^> qcollectiongenerator %BUILDDIR%\qthelp\strax.qhcp 131 | echo.To view the help file: 132 | echo.^> assistant -collectionFile %BUILDDIR%\qthelp\strax.ghc 133 | goto end 134 | ) 135 | 136 | if "%1" == "devhelp" ( 137 | %SPHINXBUILD% -b devhelp %ALLSPHINXOPTS% %BUILDDIR%/devhelp 138 | if errorlevel 1 exit /b 1 139 | echo. 140 | echo.Build finished. 141 | goto end 142 | ) 143 | 144 | if "%1" == "epub" ( 145 | %SPHINXBUILD% -b epub %ALLSPHINXOPTS% %BUILDDIR%/epub 146 | if errorlevel 1 exit /b 1 147 | echo. 148 | echo.Build finished. The epub file is in %BUILDDIR%/epub. 149 | goto end 150 | ) 151 | 152 | if "%1" == "latex" ( 153 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 154 | if errorlevel 1 exit /b 1 155 | echo. 156 | echo.Build finished; the LaTeX files are in %BUILDDIR%/latex. 157 | goto end 158 | ) 159 | 160 | if "%1" == "latexpdf" ( 161 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 162 | cd %BUILDDIR%/latex 163 | make all-pdf 164 | cd %~dp0 165 | echo. 166 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 167 | goto end 168 | ) 169 | 170 | if "%1" == "latexpdfja" ( 171 | %SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex 172 | cd %BUILDDIR%/latex 173 | make all-pdf-ja 174 | cd %~dp0 175 | echo. 176 | echo.Build finished; the PDF files are in %BUILDDIR%/latex. 
177 | goto end 178 | ) 179 | 180 | if "%1" == "text" ( 181 | %SPHINXBUILD% -b text %ALLSPHINXOPTS% %BUILDDIR%/text 182 | if errorlevel 1 exit /b 1 183 | echo. 184 | echo.Build finished. The text files are in %BUILDDIR%/text. 185 | goto end 186 | ) 187 | 188 | if "%1" == "man" ( 189 | %SPHINXBUILD% -b man %ALLSPHINXOPTS% %BUILDDIR%/man 190 | if errorlevel 1 exit /b 1 191 | echo. 192 | echo.Build finished. The manual pages are in %BUILDDIR%/man. 193 | goto end 194 | ) 195 | 196 | if "%1" == "texinfo" ( 197 | %SPHINXBUILD% -b texinfo %ALLSPHINXOPTS% %BUILDDIR%/texinfo 198 | if errorlevel 1 exit /b 1 199 | echo. 200 | echo.Build finished. The Texinfo files are in %BUILDDIR%/texinfo. 201 | goto end 202 | ) 203 | 204 | if "%1" == "gettext" ( 205 | %SPHINXBUILD% -b gettext %I18NSPHINXOPTS% %BUILDDIR%/locale 206 | if errorlevel 1 exit /b 1 207 | echo. 208 | echo.Build finished. The message catalogs are in %BUILDDIR%/locale. 209 | goto end 210 | ) 211 | 212 | if "%1" == "changes" ( 213 | %SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes 214 | if errorlevel 1 exit /b 1 215 | echo. 216 | echo.The overview file is in %BUILDDIR%/changes. 217 | goto end 218 | ) 219 | 220 | if "%1" == "linkcheck" ( 221 | %SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck 222 | if errorlevel 1 exit /b 1 223 | echo. 224 | echo.Link check complete; look for any errors in the above output ^ 225 | or in %BUILDDIR%/linkcheck/output.txt. 226 | goto end 227 | ) 228 | 229 | if "%1" == "doctest" ( 230 | %SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest 231 | if errorlevel 1 exit /b 1 232 | echo. 233 | echo.Testing of doctests in the sources finished, look at the ^ 234 | results in %BUILDDIR%/doctest/output.txt. 235 | goto end 236 | ) 237 | 238 | if "%1" == "coverage" ( 239 | %SPHINXBUILD% -b coverage %ALLSPHINXOPTS% %BUILDDIR%/coverage 240 | if errorlevel 1 exit /b 1 241 | echo. 242 | echo.Testing of coverage in the sources finished, look at the ^ 243 | results in %BUILDDIR%/coverage/python.txt. 244 | goto end 245 | ) 246 | 247 | if "%1" == "xml" ( 248 | %SPHINXBUILD% -b xml %ALLSPHINXOPTS% %BUILDDIR%/xml 249 | if errorlevel 1 exit /b 1 250 | echo. 251 | echo.Build finished. The XML files are in %BUILDDIR%/xml. 252 | goto end 253 | ) 254 | 255 | if "%1" == "pseudoxml" ( 256 | %SPHINXBUILD% -b pseudoxml %ALLSPHINXOPTS% %BUILDDIR%/pseudoxml 257 | if errorlevel 1 exit /b 1 258 | echo. 259 | echo.Build finished. The pseudo-XML files are in %BUILDDIR%/pseudoxml. 260 | goto end 261 | ) 262 | 263 | :end 264 | --------------------------------------------------------------------------------