├── doc ├── _static │ └── .keep ├── _templates │ └── .keep ├── .gitignore ├── modules.rst ├── user.rst ├── mvf_v3.rst ├── index.rst ├── data_set_format.rst ├── Makefile ├── signs.rst ├── katdal.rst ├── mvf_v2.rst ├── tuning.rst ├── conf.py └── intro.rst ├── katdal ├── test │ ├── __init__.py │ ├── conftest.py │ ├── test_categorical.py │ ├── test_chunkstore_npy.py │ ├── test_chunkstore_dict.py │ ├── test_van_vleck.py │ ├── test_spectral_window.py │ ├── test_concatdata.py │ ├── s3_utils.py │ ├── test_sensordata.py │ ├── test_lazy_indexer.py │ └── test_vis_flags_weights.py ├── flags.py ├── chunkstore_dict.py ├── van_vleck.py ├── chunkstore_npy.py ├── __init__.py ├── averager.py ├── spectral_window.py └── ms_async.py ├── pyproject.toml ├── MANIFEST.in ├── pytest.ini ├── setup.cfg ├── Jenkinsfile ├── requirements.txt ├── test-requirements.txt ├── doc-requirements.txt ├── .gitignore ├── LICENSE.txt ├── setup.py ├── scripts ├── mvf_read_benchmark.py ├── spectrogram_plot_example.py ├── mvf_copy.py ├── mvf_download.py └── mvf_rechunk.py ├── NEWS.rst └── README.rst /doc/_static/.keep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /doc/_templates/.keep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /katdal/test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /doc/.gitignore: -------------------------------------------------------------------------------- 1 | _build 2 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel", "katversion"] 3 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE.txt README.rst NEWS.rst requirements.txt test-requirements.txt 2 | -------------------------------------------------------------------------------- /doc/modules.rst: -------------------------------------------------------------------------------- 1 | katdal 2 | ====== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | katdal 8 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | markers = 3 | expected_duration(seconds): The expected duration of a test, in seconds. 4 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | 4 | [darglint] 5 | docstring_style=numpy 6 | strictness=long 7 | -------------------------------------------------------------------------------- /doc/user.rst: -------------------------------------------------------------------------------- 1 | User guide 2 | ========== 3 | 4 | .. 
toctree:: 5 | :maxdepth: 2 6 | :caption: Contents: 7 | 8 | intro 9 | tuning 10 | signs 11 | -------------------------------------------------------------------------------- /Jenkinsfile: -------------------------------------------------------------------------------- 1 | #!groovy 2 | 3 | @Library('katsdpjenkins') _ 4 | katsdp.killOldJobs() 5 | katsdp.setDependencies([ 6 | 'ska-sa/katsdpdockerbase/master', 7 | 'ska-sa/katpoint/master', 8 | 'ska-sa/katsdptelstate/master']) 9 | katsdp.standardBuild() 10 | katsdp.mail('ludwig@ska.ac.za') 11 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -c https://raw.githubusercontent.com/ska-sa/katsdpdockerbase/master/docker-base-build/base-requirements.txt 2 | 3 | botocore 4 | cityhash 5 | dask[array] 6 | h5py 7 | numba 8 | numpy 9 | packaging 10 | pyjwt 11 | requests 12 | 13 | katpoint @ git+https://github.com/ska-sa/katpoint 14 | katsdptelstate[rdb] @ git+https://github.com/ska-sa/katsdptelstate 15 | -------------------------------------------------------------------------------- /doc/mvf_v3.rst: -------------------------------------------------------------------------------- 1 | MVF version 3 (early MeerKAT) 2 | ============================= 3 | 4 | The version 3 format is an evolution of the v2 format, and continues to 5 | use HDF5 as the underlying format. It was used for early engineering and 6 | commissioning of MeerKAT, but was replaced by v4 before science operations 7 | started. 8 | 9 | At present there is no detailed documentation. 10 | -------------------------------------------------------------------------------- /test-requirements.txt: -------------------------------------------------------------------------------- 1 | -c https://raw.githubusercontent.com/ska-sa/katsdpdockerbase/master/docker-base-build/base-requirements.txt 2 | 3 | cffi==1.15.1 # via cryptography 4 | coverage 5 | cryptography==38.0.3 6 | packaging 7 | pycparser==2.21 # via cffi 8 | pyparsing # via packaging 9 | pytest 10 | pytest-cov 11 | -------------------------------------------------------------------------------- /doc-requirements.txt: -------------------------------------------------------------------------------- 1 | alabaster 2 | babel 3 | certifi 4 | chardet 5 | docutils 6 | idna 7 | imagesize 8 | jinja2 9 | markupsafe 10 | pygments 11 | pytz 12 | requests 13 | snowballstemmer 14 | sphinx 15 | sphinx-rtd-theme 16 | sphinxcontrib-applehelp 17 | sphinxcontrib-devhelp 18 | sphinxcontrib-htmlhelp 19 | sphinxcontrib-jsmath 20 | sphinxcontrib-qthelp 21 | sphinxcontrib-serializinghtml 22 | sphinxcontrib-websupport 23 | typing; python_version<'3' 24 | urllib3 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | .eggs 13 | parts 14 | bin 15 | var 16 | sdist 17 | develop-eggs 18 | .installed.cfg 19 | lib 20 | lib64 21 | __pycache__ 22 | 23 | # Installer logs 24 | pip-log.txt 25 | 26 | # Unit test / coverage reports 27 | .coverage 28 | .tox 29 | nosetests.xml 30 | 31 | # Developer tools 32 | *~ 33 | .ropeproject 34 | 35 | # Visual Studio Code settings 36 | .vscode 37 | 38 | -------------------------------------------------------------------------------- /doc/index.rst: 
-------------------------------------------------------------------------------- 1 | .. katdal documentation master file, created by 2 | sphinx-quickstart on Sun Jun 2 11:18:58 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to katdal's documentation! 7 | ================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | user 14 | data_set_format 15 | API reference <modules> 16 | 17 | 18 | Indices and tables 19 | ================== 20 | 21 | * :ref:`genindex` 22 | * :ref:`modindex` 23 | * :ref:`search` 24 | -------------------------------------------------------------------------------- /doc/data_set_format.rst: -------------------------------------------------------------------------------- 1 | Data set format reference 2 | ========================= 3 | 4 | In most cases users should not need to know the details of the data set formats, 5 | because katdal exists to hide these details and present a consistent, 6 | user-friendly view. It also contains workarounds for known issues in older data 7 | sets (which are not documented here). This is reference documentation useful to 8 | katdal developers and to power users who need to extract information not 9 | presented by the katdal interface. 10 | 11 | .. toctree:: 12 | :maxdepth: 1 13 | :caption: Contents: 14 | 15 | mvf_v1 16 | mvf_v2 17 | mvf_v3 18 | mvf_v4 19 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = . 8 | BUILDDIR = _build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | apidoc: 15 | sphinx-apidoc -f -o . ../katdal ../katdal/test/*.py 16 | 17 | .PHONY: help apidoc Makefile 18 | 19 | # Catch-all target: route all unknown targets to Sphinx using the new 20 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 21 | %: Makefile 22 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 23 | -------------------------------------------------------------------------------- /doc/signs.rst: -------------------------------------------------------------------------------- 1 | Sign conventions 2 | ---------------- 3 | 4 | Visibilities 5 | ============ 6 | 7 | For a wave with frequency :math:`\omega` and wave number :math:`k`, the 8 | phasor is 9 | 10 | .. math:: e^{(\omega t - kz)i} 11 | 12 | Visibilities are then :math:`e_1 \overline{e_2}`. 13 | 14 | In KAT-7, the opposite sign convention is used in the HDF5 files, but katdal 15 | conjugates the visibilities to match MeerKAT. 16 | 17 | Baseline coordinates 18 | ==================== 19 | 20 | The UVW coordinates for the baseline (A, B) are 21 | :math:`(u, v, w)_A - (u, v, w)_B`. Combined with the above, this means 22 | that ideal visibilities (ignoring any effects apart from geometric 23 | delay) are 24 | 25 | .. math:: V(u, v, w) = \int \frac{I(l, m)}{n} e^{2\pi i(ul + vm + w(n - 1))}\ dl\ dm 26 | 27 | Polarisation 28 | ============ 29 | 30 | KAT-7 and MeerKAT are linear feed systems.
On MeerKAT, if one points 31 | one's right thumb in the direction of vertical polarisation and the 32 | right index finger in the direction of horizontal polarisation, then the 33 | right middle finger points from the antenna towards the source. 34 | 35 | When exporting to a Measurement Set, katdal maps H to (IEEE) x and V to 36 | y, and introduces a 90° offset to the parallactic angle rotation. 37 | 38 | KAT-7 has the opposite convention for polarisation (due to the lack of a 39 | sub-reflector). katdal does **not** make any effort to compensate for 40 | this. Measurement sets exported from KAT-7 data should thus not be used 41 | for polarimetry without further correction. 42 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011-2024, National Research Foundation (SARAO) 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | 3. Neither the name of the copyright holder nor the names of its contributors 15 | may be used to endorse or promote products derived from this software 16 | without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 20 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /katdal/flags.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2011,2019, National Research Foundation (SARAO) 3 | # 4 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 5 | # this file except in compliance with the License. You may obtain a copy 6 | # of the License at 7 | # 8 | # https://opensource.org/licenses/BSD-3-Clause 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | ################################################################################ 16 | 17 | """Definitions of flag bits""" 18 | 19 | NAMES = ('reserved0', 'static', 'cam', 'data_lost', 20 | 'ingest_rfi', 'predicted_rfi', 'cal_rfi', 'postproc') 21 | DESCRIPTIONS = ('reserved - bit 0', 22 | 'predefined static flag list', 23 | 'flag based on live CAM information', 24 | 'no data was received', 25 | 'RFI detected in ingest', 26 | 'RFI predicted from space based pollutants', 27 | 'RFI detected in calibration', 28 | 'some correction/postprocessing step could not be applied') 29 | 30 | STATIC_BIT = 1 31 | CAM_BIT = 2 32 | DATA_LOST_BIT = 3 33 | INGEST_RFI_BIT = 4 34 | PREDICTED_RFI_BIT = 5 35 | CAL_RFI_BIT = 6 36 | POSTPROC_BIT = 7 37 | 38 | STATIC = 1 << STATIC_BIT 39 | CAM = 1 << CAM_BIT 40 | DATA_LOST = 1 << DATA_LOST_BIT 41 | INGEST_RFI = 1 << INGEST_RFI_BIT 42 | PREDICTED_RFI = 1 << PREDICTED_RFI_BIT 43 | CAL_RFI = 1 << CAL_RFI_BIT 44 | POSTPROC = 1 << POSTPROC_BIT 45 | -------------------------------------------------------------------------------- /katdal/test/conftest.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2023, National Research Foundation (SARAO) 3 | # 4 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 5 | # this file except in compliance with the License. You may obtain a copy 6 | # of the License at 7 | # 8 | # https://opensource.org/licenses/BSD-3-Clause 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | ################################################################################ 16 | 17 | import pytest 18 | 19 | TEST_DURATION_TOLERANCE = 0.1 20 | 21 | 22 | def pytest_addoption(parser): 23 | parser.addoption( 24 | "--check-durations", 25 | action="store_true", 26 | help="Verify how long some tests run (the ones with an `expected_duration` mark)", 27 | ) 28 | 29 | 30 | @pytest.hookimpl(hookwrapper=True) 31 | def pytest_runtest_makereport(item, call): 32 | """Optionally override pytest report creation to verify test duration.""" 33 | report = (yield).get_result() 34 | # Only continue if the user requests this and the test has an expected_duration mark 35 | check_durations = item.config.getoption("--check-durations", default=False) 36 | mark = item.get_closest_marker("expected_duration") 37 | if not check_durations or mark is None: 38 | return report 39 | # The test will take at least as long as the expected duration and probably a bit longer 40 | minimum = mark.args[0] 41 | maximum = minimum + TEST_DURATION_TOLERANCE 42 | # Only verify duration if the test itself passes (and we are in the 'call' phase of test) 43 | if ( 44 | report.when == 'call' 45 | and report.passed 46 | and not minimum <= report.duration <= maximum 47 | ): 48 | # Mark test as failed and report the timing discrepancy 49 | report.outcome = 'failed' 50 | report.longrepr = (f"\nTest took {report.duration:g} seconds, " 51 | f"which is outside the range [{minimum:g}, {maximum:g}]\n") 52 | return report 53 | -------------------------------------------------------------------------------- /katdal/test/test_categorical.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2017-2018,2021, National Research Foundation (SARAO) 3 | # 4 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 5 | # this file except in compliance with the License. You may obtain a copy 6 | # of the License at 7 | # 8 | # https://opensource.org/licenses/BSD-3-Clause 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | ################################################################################ 16 | 17 | """Tests for :py:mod:`katdal.categorical`.""" 18 | 19 | import numpy as np 20 | from numpy.testing import assert_array_equal 21 | 22 | from katdal.categorical import _single_event_per_dump, sensor_to_categorical 23 | 24 | 25 | def test_dump_to_event_parsing(): 26 | values = np.array(list('ABCDEFGH')) 27 | events = np.array([0, 0, 1, 3, 3, 4, 4, 6, 8]) 28 | greedy = np.array([1, 0, 0, 1, 1, 0, 0, 0]) 29 | cleaned = list(_single_event_per_dump(events, greedy)) 30 | new_values = values[cleaned] 31 | new_events = events[cleaned] 32 | assert_array_equal(cleaned, [0, 2, 4, 6, 7], 'Dump->event parser failed') 33 | assert_array_equal(new_values, list('ACEGH'), 'Dump->event parser failed') 34 | assert_array_equal(new_events, [0, 1, 3, 5, 6], 'Dump->event parser failed') 35 | 36 | 37 | def test_categorical_sensor_creation(): 38 | timestamps = [-363.784, 2.467, 8.839, 8.867, 15.924, 48.925, 54.897, 88.982] 39 | values = ['stop', 'slew', 'track', 'slew', 'track', 'slew', 'track', 'slew'] 40 | dump_period = 8. 41 | dump_times = np.arange(4., 100., dump_period) 42 | categ = sensor_to_categorical(timestamps, values, dump_times, dump_period, 43 | greedy_values=('slew', 'stop'), 44 | initial_value='slew') 45 | assert_array_equal(categ.unique_values, ['slew', 'track'], 46 | 'Sensor->categorical failed') 47 | assert_array_equal(categ.events, [0, 2, 6, 7, 11, 12], 48 | 'Sensor->categorical failed') 49 | assert_array_equal(categ.indices, [0, 1, 0, 1, 0], 50 | 'Sensor->categorical failed') 51 | -------------------------------------------------------------------------------- /katdal/test/test_chunkstore_npy.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2017-2018,2021-2022, National Research Foundation (SARAO) 3 | # 4 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 5 | # this file except in compliance with the License. You may obtain a copy 6 | # of the License at 7 | # 8 | # https://opensource.org/licenses/BSD-3-Clause 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | ################################################################################ 16 | 17 | """Tests for :py:mod:`katdal.chunkstore_npy`.""" 18 | 19 | import os 20 | import shutil 21 | import tempfile 22 | 23 | import pytest 24 | 25 | from katdal.chunkstore import StoreUnavailable 26 | from katdal.chunkstore_npy import NpyFileChunkStore 27 | from katdal.test.test_chunkstore import ChunkStoreTestBase, generate_arrays 28 | 29 | 30 | class TestNpyFileChunkStore(ChunkStoreTestBase): 31 | """Test NPY file functionality using a temporary directory.""" 32 | 33 | @classmethod 34 | def setup_class(cls): 35 | """Create temp dir to store NPY files and build ChunkStore on that.""" 36 | cls.arrays = generate_arrays() 37 | cls.tempdir = tempfile.mkdtemp() 38 | cls.store = NpyFileChunkStore(cls.tempdir) 39 | 40 | @classmethod 41 | def teardown_class(cls): 42 | shutil.rmtree(cls.tempdir) 43 | 44 | def setup_method(self): 45 | # Clean out data created by previous tests 46 | for entry in os.scandir(self.tempdir): 47 | if not entry.name.startswith('.') and entry.is_dir(): 48 | shutil.rmtree(entry.path) 49 | 50 | def test_store_unavailable(self): 51 | with pytest.raises(StoreUnavailable): 52 | NpyFileChunkStore('hahahahahaha') 53 | 54 | 55 | class TestNpyFileChunkStoreDirectWrite(TestNpyFileChunkStore): 56 | """Test NPY file functionality with O_DIRECT writes.""" 57 | 58 | @classmethod 59 | def setup_class(cls): 60 | """Create temp dir to store NPY files and build ChunkStore on that.""" 61 | cls.tempdir = tempfile.mkdtemp() 62 | try: 63 | cls.store = NpyFileChunkStore(cls.tempdir, direct_write=True) 64 | except StoreUnavailable as e: 65 | if 'not supported' in str(e): 66 | pytest.skip(str(e)) 67 | raise 68 | -------------------------------------------------------------------------------- /katdal/test/test_chunkstore_dict.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2017-2018,2021-2022, National Research Foundation (SARAO) 3 | # 4 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 5 | # this file except in compliance with the License. You may obtain a copy 6 | # of the License at 7 | # 8 | # https://opensource.org/licenses/BSD-3-Clause 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | ################################################################################ 16 | 17 | """Tests for :py:mod:`katdal.chunkstore_dict`.""" 18 | 19 | import time 20 | 21 | import numpy as np 22 | import dask.array as da 23 | 24 | from katdal.chunkstore_dict import DictChunkStore 25 | from katdal.test.test_chunkstore import ChunkStoreTestBase, generate_arrays 26 | 27 | 28 | class TestDictChunkStore(ChunkStoreTestBase): 29 | def setup_method(self): 30 | self.arrays = generate_arrays() 31 | self.store = DictChunkStore(**self.arrays) 32 | # This store is prepopulated so missing chunks can't be checked 33 | self.preloaded_chunks = True 34 | 35 | 36 | def test_basic_overheads(): 37 | """Check overheads of creating and transferring dask array between stores.""" 38 | # The array is about 1 GB in size 39 | shape = (100, 1000, 1000) 40 | x = np.ones(shape) 41 | y = np.zeros(shape) 42 | store1 = DictChunkStore(x=x) 43 | store2 = DictChunkStore(y=y) 44 | # We have 1000 chunks of about 1 MB each 45 | chunk_size = (1, 100, 1000) 46 | chunks = da.core.normalize_chunks(chunk_size, shape) 47 | # Check that the time to set up dask arrays is not grossly inflated 48 | start_time = time.process_time() 49 | dx = store1.get_dask_array('x', chunks, float) 50 | py = store2.put_dask_array('y', dx) 51 | setup_duration = time.process_time() - start_time 52 | assert setup_duration < 1.0 53 | # Use basic array copy as a reference 54 | start_time = time.process_time() 55 | y[:] = x 56 | copy_duration = time.process_time() - start_time 57 | # Check ChunkStore / dask overhead on top of basic memory copy 58 | start_time = time.process_time() 59 | success = py.compute() 60 | dask_duration = time.process_time() - start_time 61 | assert dask_duration < 10 * copy_duration 62 | np.testing.assert_equal(success, None) 63 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ################################################################################ 4 | # Copyright (c) 2011,2013,2016-2023, National Research Foundation (SARAO) 5 | # 6 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 7 | # this file except in compliance with the License. You may obtain a copy 8 | # of the License at 9 | # 10 | # https://opensource.org/licenses/BSD-3-Clause 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | ################################################################################ 18 | 19 | import os.path 20 | 21 | from setuptools import find_packages, setup 22 | 23 | here = os.path.dirname(__file__) 24 | readme = open(os.path.join(here, 'README.rst')).read() 25 | news = open(os.path.join(here, 'NEWS.rst')).read() 26 | long_description = readme + '\n\n' + news 27 | 28 | setup(name='katdal', 29 | description='Karoo Array Telescope data access library for interacting ' 30 | 'with data sets in the MeerKAT Visibility Format (MVF)', 31 | long_description=long_description, 32 | long_description_content_type='text/x-rst', 33 | author='Ludwig Schwardt', 34 | author_email='ludwig@ska.ac.za', 35 | packages=find_packages(), 36 | scripts=[ 37 | 'scripts/mvf_copy.py', 38 | 'scripts/mvf_download.py', 39 | 'scripts/mvftoms.py', 40 | ], 41 | url='https://github.com/ska-sa/katdal', 42 | license='Modified BSD', 43 | classifiers=[ 44 | 'Development Status :: 4 - Beta', 45 | 'Intended Audience :: Developers', 46 | 'License :: OSI Approved :: BSD License', 47 | 'Operating System :: OS Independent', 48 | 'Programming Language :: Python', 49 | 'Programming Language :: Python :: 3', 50 | 'Topic :: Software Development :: Libraries :: Python Modules', 51 | 'Topic :: Scientific/Engineering :: Astronomy'], 52 | platforms=['OS Independent'], 53 | keywords='meerkat ska', 54 | python_requires='>=3.6', 55 | setup_requires=['katversion'], 56 | use_katversion=True, 57 | install_requires=[ 58 | 'numpy >= 1.12.0', 59 | 'katpoint >= 0.9, < 1', 60 | 'h5py >= 2.3', 61 | 'numba', 62 | 'katsdptelstate[rdb] >= 0.10', 63 | 'dask[array] >= 2.7.0', 64 | 'requests >= 2.18.0', 65 | 'pyjwt >= 2', 66 | 'cityhash >= 0.2.2', 67 | 'packaging', 68 | ], 69 | extras_require={ 70 | 'ms': ['python-casacore >= 2.2.1'], 71 | 's3': [], 72 | 's3credentials': ['botocore'] 73 | }, 74 | tests_require=['cryptography', 'pytest']) 75 | -------------------------------------------------------------------------------- /katdal/chunkstore_dict.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2017-2018,2021, National Research Foundation (SARAO) 3 | # 4 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 5 | # this file except in compliance with the License. You may obtain a copy 6 | # of the License at 7 | # 8 | # https://opensource.org/licenses/BSD-3-Clause 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | ################################################################################ 16 | 17 | """A store of chunks (i.e. N-dimensional arrays) based on a dict of arrays.""" 18 | 19 | from .chunkstore import BadChunk, ChunkNotFound, ChunkStore 20 | 21 | 22 | class DictChunkStore(ChunkStore): 23 | """A store of chunks (i.e. N-dimensional arrays) based on a dict of arrays. 24 | 25 | This interprets all keyword arguments as NumPy arrays and stores them in 26 | an `arrays` dict. Each array is identified by its corresponding keyword. 
27 | New arrays cannot be added via :meth:`put` - they all need to be in place 28 | at store initialisation (or can be added afterwards via direct insertion 29 | into the `arrays` dict). The `put` method is only useful for in-place 30 | modification of existing arrays. 31 | """ 32 | 33 | def __init__(self, **kwargs): 34 | error_map = {KeyError: ChunkNotFound, IndexError: ChunkNotFound} 35 | super().__init__(error_map) 36 | self.arrays = kwargs 37 | 38 | def get_chunk(self, array_name, slices, dtype): 39 | """See the docstring of :meth:`ChunkStore.get_chunk`.""" 40 | chunk_name, shape = self.chunk_metadata(array_name, slices, dtype=dtype) 41 | with self._standard_errors(chunk_name): 42 | array = self.arrays[array_name] 43 | # Ensure that chunk is array (otherwise 0-dim array becomes number) 44 | chunk = array[slices] if slices != () else array 45 | if chunk.shape != shape or chunk.dtype != dtype: 46 | raise BadChunk(f'Chunk {chunk_name!r}: requested dtype {chunk.dtype} and/or shape ' 47 | f'{chunk.shape} differs from expected dtype {dtype} and shape {shape}') 48 | return chunk 49 | 50 | def create_array(self, array_name): 51 | if array_name not in self.arrays: 52 | raise NotImplementedError 53 | 54 | def put_chunk(self, array_name, slices, chunk): 55 | """See the docstring of :meth:`ChunkStore.put_chunk`.""" 56 | self.chunk_metadata(array_name, slices, chunk=chunk) 57 | self.get_chunk(array_name, slices, chunk.dtype)[()] = chunk 58 | 59 | get_chunk.__doc__ = ChunkStore.get_chunk.__doc__ 60 | put_chunk.__doc__ = ChunkStore.put_chunk.__doc__ 61 | -------------------------------------------------------------------------------- /scripts/mvf_read_benchmark.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ################################################################################ 4 | # Copyright (c) 2018-2021,2023, National Research Foundation (SARAO) 5 | # 6 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 7 | # this file except in compliance with the License. You may obtain a copy 8 | # of the License at 9 | # 10 | # https://opensource.org/licenses/BSD-3-Clause 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | ################################################################################ 18 | 19 | import argparse 20 | import logging 21 | import time 22 | 23 | import dask 24 | import numpy as np 25 | 26 | import katdal 27 | from katdal.lazy_indexer import DaskLazyIndexer 28 | 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument('filename') 31 | parser.add_argument('--time', type=int, default=10, help='Number of times to read per batch') 32 | parser.add_argument('--channels', type=int, help='Number of channels to read') 33 | parser.add_argument('--dumps', type=int, help='Number of times to read') 34 | parser.add_argument('--joint', action='store_true', help='Load vis, weights, flags together') 35 | parser.add_argument('--applycal', help='Calibration solutions to apply') 36 | parser.add_argument('--workers', type=int, help='Number of dask workers') 37 | args = parser.parse_args() 38 | 39 | logging.basicConfig(level='INFO', format='%(asctime)s [%(levelname)s] %(message)s') 40 | if args.workers is not None: 41 | dask.config.set(num_workers=args.workers) 42 | logging.info('Starting') 43 | kwargs = {} 44 | if args.applycal is not None: 45 | kwargs['applycal'] = args.applycal 46 | f = katdal.open(args.filename, **kwargs) 47 | logging.info('File loaded, shape %s', f.shape) 48 | if args.channels: 49 | f.select(channels=np.s_[:args.channels]) 50 | if args.dumps: 51 | f.select(dumps=np.s_[:args.dumps]) 52 | # Trigger creation of the dask graphs, population of sensor cache for applycal etc 53 | _ = (f.vis[0, 0, 0], f.weights[0, 0, 0], f.flags[0, 0, 0]) 54 | logging.info('Selection complete') 55 | start = time.time() 56 | last_time = start 57 | for st in range(0, f.shape[0], args.time): 58 | et = st + args.time 59 | if args.joint: 60 | vis, weights, flags = DaskLazyIndexer.get([f.vis, f.weights, f.flags], np.s_[st:et]) 61 | else: 62 | vis = f.vis[st:et] 63 | weights = f.weights[st:et] 64 | flags = f.flags[st:et] 65 | current_time = time.time() 66 | elapsed = current_time - last_time 67 | last_time = current_time 68 | size = np.prod(vis.shape) * 10 69 | logging.info('Loaded %d dumps (%.3f MB/s)', vis.shape[0], size / elapsed / 1e6) 70 | size = np.prod(f.shape) * 10 71 | elapsed = time.time() - start 72 | logging.info('Loaded %d bytes in %.3f s (%.3f MB/s)', size, elapsed, size / elapsed / 1e6) 73 | -------------------------------------------------------------------------------- /katdal/test/test_van_vleck.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2020-2021, National Research Foundation (SARAO) 3 | # 4 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 5 | # this file except in compliance with the License. You may obtain a copy 6 | # of the License at 7 | # 8 | # https://opensource.org/licenses/BSD-3-Clause 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | ################################################################################ 16 | 17 | import numpy as np 18 | 19 | from katdal.van_vleck import autocorr_lookup_table, norm0_cdf 20 | 21 | 22 | def test_norm0_cdf(): 23 | scale = 2.0 24 | x = np.array([0.0, 2.0, 4.0, 6.0]) 25 | # Generated by scipy.stats.norm.cdf(x, scale=2.0) 26 | expected = np.array([0.5, 0.8413447460685429, 0.9772498680518208, 0.9986501019683699]) 27 | actual = norm0_cdf(x, scale) 28 | np.testing.assert_allclose(actual, expected, rtol=0., atol=np.finfo(float).eps) 29 | actual = norm0_cdf(-x, scale) 30 | np.testing.assert_allclose(actual, 1.0 - expected, rtol=0., atol=np.finfo(float).eps) 31 | actual = norm0_cdf(x[-1], scale) 32 | np.testing.assert_allclose(actual, expected[-1], rtol=0., atol=np.finfo(float).eps) 33 | 34 | 35 | def test_autocorr_correction(): 36 | # 15-level "4-bit" KAT-7 requantiser (contiguous ints for now) 37 | levels = np.arange(-7., 8.) 38 | quantised_ac_table, true_ac_table = autocorr_lookup_table(levels) 39 | N = 100000 40 | rs = np.random.RandomState(42) 41 | autocorrs = [0.06, 0.2, 1.0, 10.0, 100.0, 1000.0] 42 | # Excess above usual sample standard deviation due to loss of information caused by quantisation, 43 | # generated by Bayesian quantisation correction code (ask the author for details) 44 | rtol_factors = [2.5, 1.24, 1.16, 1.02, 1.5, 2.9] 45 | for true_ac, rtol_factor in zip(autocorrs, rtol_factors): 46 | # Generate complex random voltages with appropriate variance 47 | scale = np.sqrt(true_ac / 2.) 48 | x = rs.normal(scale=scale, size=N) + 1j * rs.normal(scale=scale, size=N) 49 | # Estimate power of the unquantised complex signal as a sanity check 50 | unquantised_sample_ac = x.dot(x.conj()).real / N 51 | # The standard deviation of sample variance of N complex normals of variance `var` 52 | # is var / sqrt(N). Use rtol since stdev is proportional to var and set it to 3 sigma. 53 | rtol = 3.0 / np.sqrt(N) 54 | np.testing.assert_allclose(unquantised_sample_ac, true_ac, rtol=rtol) 55 | # Quantise x to the nearest integer and clip (assumes levels are contiguous ints) 56 | xq = x.round() 57 | np.clip(xq.real, levels[0], levels[-1], out=xq.real) 58 | np.clip(xq.imag, levels[0], levels[-1], out=xq.imag) 59 | # Estimate power of the quantised signal and correct the effects of quantisation 60 | quantised_sample_ac = xq.dot(xq.conj()).real / N 61 | corrected_ac = np.interp(quantised_sample_ac, quantised_ac_table, true_ac_table) 62 | np.testing.assert_allclose(corrected_ac, true_ac, rtol=rtol_factor * rtol) 63 | np.testing.assert_allclose(corrected_ac, unquantised_sample_ac, rtol=rtol_factor * rtol) 64 | -------------------------------------------------------------------------------- /doc/katdal.rst: -------------------------------------------------------------------------------- 1 | katdal package 2 | ============== 3 | 4 | Submodules 5 | ---------- 6 | 7 | katdal.applycal module 8 | ---------------------- 9 | 10 | .. automodule:: katdal.applycal 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | katdal.averager module 16 | ---------------------- 17 | 18 | .. automodule:: katdal.averager 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | katdal.categorical module 24 | ------------------------- 25 | 26 | .. automodule:: katdal.categorical 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | katdal.chunkstore module 32 | ------------------------ 33 | 34 | .. 
automodule:: katdal.chunkstore 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | katdal.chunkstore\_dict module 40 | ------------------------------ 41 | 42 | .. automodule:: katdal.chunkstore_dict 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | katdal.chunkstore\_npy module 48 | ----------------------------- 49 | 50 | .. automodule:: katdal.chunkstore_npy 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | katdal.chunkstore\_s3 module 56 | ---------------------------- 57 | 58 | .. automodule:: katdal.chunkstore_s3 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | katdal.concatdata module 64 | ------------------------ 65 | 66 | .. automodule:: katdal.concatdata 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | 71 | katdal.dataset module 72 | --------------------- 73 | 74 | .. automodule:: katdal.dataset 75 | :members: 76 | :undoc-members: 77 | :show-inheritance: 78 | 79 | katdal.datasources module 80 | ------------------------- 81 | 82 | .. automodule:: katdal.datasources 83 | :members: 84 | :undoc-members: 85 | :show-inheritance: 86 | 87 | katdal.flags module 88 | ------------------- 89 | 90 | .. automodule:: katdal.flags 91 | :members: 92 | :undoc-members: 93 | :show-inheritance: 94 | 95 | katdal.h5datav1 module 96 | ---------------------- 97 | 98 | .. automodule:: katdal.h5datav1 99 | :members: 100 | :undoc-members: 101 | :show-inheritance: 102 | 103 | katdal.h5datav2 module 104 | ---------------------- 105 | 106 | .. automodule:: katdal.h5datav2 107 | :members: 108 | :undoc-members: 109 | :show-inheritance: 110 | 111 | katdal.h5datav3 module 112 | ---------------------- 113 | 114 | .. automodule:: katdal.h5datav3 115 | :members: 116 | :undoc-members: 117 | :show-inheritance: 118 | 119 | katdal.lazy\_indexer module 120 | --------------------------- 121 | 122 | .. automodule:: katdal.lazy_indexer 123 | :members: 124 | :undoc-members: 125 | :show-inheritance: 126 | 127 | katdal.ms\_async module 128 | ----------------------- 129 | 130 | .. automodule:: katdal.ms_async 131 | :members: 132 | :undoc-members: 133 | :show-inheritance: 134 | 135 | katdal.ms\_extra module 136 | ----------------------- 137 | 138 | .. automodule:: katdal.ms_extra 139 | :members: 140 | :undoc-members: 141 | :show-inheritance: 142 | 143 | katdal.sensordata module 144 | ------------------------ 145 | 146 | .. automodule:: katdal.sensordata 147 | :members: 148 | :undoc-members: 149 | :show-inheritance: 150 | 151 | katdal.spectral\_window module 152 | ------------------------------ 153 | 154 | .. automodule:: katdal.spectral_window 155 | :members: 156 | :undoc-members: 157 | :show-inheritance: 158 | 159 | katdal.visdatav4 module 160 | ----------------------- 161 | 162 | .. automodule:: katdal.visdatav4 163 | :members: 164 | :undoc-members: 165 | :show-inheritance: 166 | 167 | 168 | Module contents 169 | --------------- 170 | 171 | .. automodule:: katdal 172 | :members: 173 | :undoc-members: 174 | :show-inheritance: 175 | -------------------------------------------------------------------------------- /katdal/van_vleck.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2012,2020,2021, National Research Foundation (SARAO) 3 | # 4 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 5 | # this file except in compliance with the License. 
You may obtain a copy 6 | # of the License at 7 | # 8 | # https://opensource.org/licenses/BSD-3-Clause 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | ################################################################################ 16 | 17 | """Routines for performing quantisation (Van Vleck) correction.""" 18 | 19 | import math 20 | 21 | import numba 22 | import numpy as np 23 | 24 | 25 | @numba.vectorize(['f8(f8, f8)'], nopython=True, cache=True) 26 | def norm0_cdf(x, scale): 27 | """Fast zero-mean (loc=0) implementation of :meth:`scipy.stats.norm.cdf`.""" 28 | return 0.5 * (math.erf(np.sqrt(0.5) * x / scale) + 1.) 29 | 30 | 31 | def _quant_norm0_pmf(levels, var=1.0): 32 | """Probability mass function of quantised zero-mean normal variable.""" 33 | edges = np.r_[-np.inf, levels[:-1] + np.diff(levels) / 2., np.inf] 34 | return np.diff(norm0_cdf(edges, np.sqrt(var))) 35 | 36 | 37 | def _squared_quant_norm0_mean(levels, var=1.0): 38 | """Mean of squared quantised zero-mean normal variable (same shape as `var`).""" 39 | levels = np.asarray(levels) 40 | # Allow var and levels to be broadcast against each other, with levels as last dimension 41 | var = np.asarray(var)[..., np.newaxis] 42 | pmf = _quant_norm0_pmf(levels, var) 43 | return pmf.dot(levels * levels) 44 | 45 | 46 | def autocorr_lookup_table(levels, size=4000): 47 | """Lookup table that corrects complex autocorrelation quantisation effects. 48 | 49 | This maps the variance of a quantised complex voltage signal to the variance 50 | of the unquantised signal under the assumption that the signal is proper 51 | (circularly-symmetric) complex normally distributed. 52 | 53 | Parameters 54 | ---------- 55 | levels : sequence of float 56 | Quantisation levels for real and imaginary components of voltage signal 57 | size : int, optional 58 | Size of lookup table 59 | 60 | Returns 61 | ------- 62 | quantised_autocorr_table, true_autocorr_table : array of float, shape (`size`,) 63 | Lookup table associating quantised autocorrelations and unquantised 64 | autocorrelations (i.e. 
power/variance of complex signals) 65 | """ 66 | # Terminology: 67 | # x = Proper complex normal voltage signal (zero-mean) 68 | # rxx = Power (variance) *per* real/imag component of unquantised / true x 69 | # sxx = Power (variance) *per* real/imag component of quantised x 70 | abs_levels = np.abs(levels) 71 | sxx_min_nonzero = abs_levels[abs_levels > 0].min() ** 2 72 | sxx_max = abs_levels.max() ** 2 73 | # Sweep across range of true power values, placing more table entries at tricky lower end 74 | rxx_grid = np.r_[np.logspace(-2.4, 0, size // 2, endpoint=False), 75 | np.logspace(0, np.log10(sxx_max / sxx_min_nonzero) + 8, size - 2 - size // 2)] 76 | # Shift the table to place inflection point at minimum non-zero sxx 77 | rxx_grid *= sxx_min_nonzero 78 | # Map true power to expected quantised power 79 | sxx_mean = _squared_quant_norm0_mean(levels, rxx_grid) 80 | # Extend quantised power values to its maximum range 81 | sxx_table = np.r_[0., sxx_mean, sxx_max] 82 | # Replace asymptotic with linear decay at bottom end, and clip unbounded growth at top end 83 | rxx_table = np.r_[0., rxx_grid, rxx_grid[-1]] 84 | # The factor 2 converts power per real/imag component to power/variance of complex signal 85 | return 2. * sxx_table, 2. * rxx_table 86 | -------------------------------------------------------------------------------- /katdal/test/test_spectral_window.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Copyright (c) 2018,2021-2022, National Research Foundation (SARAO) 3 | # 4 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 5 | # this file except in compliance with the License. You may obtain a copy 6 | # of the License at 7 | # 8 | # https://opensource.org/licenses/BSD-3-Clause 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | ############################################################################### 16 | 17 | """Tests for :py:mod:`katdal.spectral_window`.""" 18 | 19 | import numpy as np 20 | from numpy.testing import assert_array_almost_equal, assert_array_equal 21 | 22 | from katdal.spectral_window import SpectralWindow 23 | 24 | 25 | class TestSpectralWindow: 26 | def setup_method(self): 27 | self.lsb = SpectralWindow(1000.0, 10.0, 6, sideband=-1, product='lsb') 28 | self.usb = SpectralWindow(1000.0, 10.0, 6, sideband=1, band='X') 29 | self.odd = SpectralWindow(1000.0, 10.0, 5, sideband=1) 30 | # channel_width will not be an exact float. The values have been 31 | # chosen so that bandwidth / num_chans * num_chans does not quite 32 | # equal bandwidth. 
33 | self.inexact = SpectralWindow(1000.0, None, 14, sideband=1, 34 | bandwidth=230.0) 35 | 36 | def test_width_properties(self): 37 | assert self.lsb.channel_width == 10.0 38 | assert self.lsb.bandwidth == 60.0 39 | assert self.inexact.channel_width == 230.0 / 14 40 | assert self.inexact.bandwidth == 230.0 41 | 42 | def test_channel_freqs(self): 43 | assert_array_equal(self.lsb.channel_freqs, 44 | [1030.0, 1020.0, 1010.0, 1000.0, 990.0, 980.0]) 45 | assert_array_equal(self.usb.channel_freqs, 46 | [970.0, 980.0, 990.0, 1000.0, 1010.0, 1020.0]) 47 | assert_array_equal(self.odd.channel_freqs, 48 | [980.0, 990.0, 1000.0, 1010.0, 1020.0]) 49 | assert_array_almost_equal(self.inexact.channel_freqs, 50 | np.arange(14) * 230.0 / 14 + 885.0) 51 | # Check that the exactly representable values are exact 52 | assert self.inexact.channel_freqs[0] == 885.0 53 | assert self.inexact.channel_freqs[7] == 1000.0 54 | 55 | def test_repr(self): 56 | # Just a smoke test to check that it doesn't crash 57 | repr(self.lsb) 58 | repr(self.usb) 59 | 60 | def test_subrange(self): 61 | lsb_sub = self.lsb.subrange(0, 3) 62 | assert_array_equal(lsb_sub.channel_freqs, [1030.0, 1020.0, 1010.0]) 63 | assert lsb_sub.product == self.lsb.product 64 | usb_sub = self.usb.subrange(2, 6) 65 | assert_array_equal(usb_sub.channel_freqs, 66 | [990.0, 1000.0, 1010.0, 1020.0]) 67 | assert usb_sub.band == self.usb.band 68 | # Check that updated bandwidth doesn't have rounding errors 69 | inexact_sub = self.inexact.subrange(0, 7) 70 | assert inexact_sub.bandwidth == 115.0 71 | 72 | def test_rechannelise_same(self): 73 | lsb = self.lsb.rechannelise(6) 74 | assert lsb == self.lsb 75 | 76 | def test_rechannelise_to_even(self): 77 | lsb = self.lsb.rechannelise(2) 78 | assert_array_equal(lsb.channel_freqs, [1020.0, 990.0]) 79 | usb = self.usb.rechannelise(2) 80 | assert_array_equal(usb.channel_freqs, [980.0, 1010.0]) 81 | 82 | def test_rechannelise_to_odd(self): 83 | lsb = self.lsb.rechannelise(3) 84 | assert_array_equal(lsb.channel_freqs, [1025.0, 1005.0, 985.0]) 85 | usb = self.usb.rechannelise(3) 86 | assert_array_equal(usb.channel_freqs, [975.0, 995.0, 1015.0]) 87 | odd = self.odd.rechannelise(1) 88 | assert_array_equal(odd.channel_freqs, [1000.0]) 89 | -------------------------------------------------------------------------------- /doc/mvf_v2.rst: -------------------------------------------------------------------------------- 1 | .. _hdf5_format_v2: 2 | 3 | MVF version 2 (KAT-7) 4 | ====================== 5 | 6 | .. sectionauthor:: Simon Ratcliffe, Ludwig Schwardt 7 | 8 | Introduction 9 | ------------ 10 | 11 | With the introduction of the KAT-7 correlator, we have taken the opportunity to revisit the correlator data storage format. This document describes the updated format. 12 | 13 | Basic Concept 14 | ------------- 15 | A single HDF5 file corresponds to a single observation (a contiguous telescope time segment for a specified subarray). 16 | 17 | At the highest level, the file is split into Data and MetaData. 18 | 19 | MetaData contains two distinct types of information: 20 | 21 | * Configuration is known a priori and is static for the duration of the observation. 22 | * Sensors contain dynamic information provided in the form of katcp sensors, which is typically only fully known post observation. 23 | 24 | Flags and History are special-case objects that get populated during run time but not from sensors. These are also the only groups that could get updated post augmentation.
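As an illustration of this split, the sketch below reads a few of the datasets and attributes listed in the HDF5 Format section further down, using nothing but ``h5py``. It is a minimal, hypothetical example (the filename is made up); in practice such files are opened via ``katdal.open``, which also applies the workarounds for known issues in older data sets.

.. code:: python

    import h5py

    # Hypothetical filename - any MVF v2 (KAT-7) HDF5 file would do
    with h5py.File('1234567890.h5', 'r') as f:
        print('Format version:', f.attrs['version'])
        # (Nt, Nf, Nbl, 2) float32 array of real/imag visibility components
        raw_vis = f['Data/correlator_data'][:]
        vis = raw_vis[..., 0] + 1j * raw_vis[..., 1]
        # (Nt) float64 timestamps (UT seconds since Unix epoch)
        timestamps = f['Data/timestamps'][:]
        # (Nt, Nf, Nbl) uint8 flags, with one bit per flag type
        flags = f['Markup/flags'][:]
        print(vis.shape, timestamps.shape, flags.shape)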
25 | 26 | Some datasets such as the noise_diode flags are synthesised from sensor information post capture. These base sensors could then be removed if space is a concern. 27 | 28 | A major/minor version number is included in the file. The major indicates the overall structural philosophy (this document describes version 2.x). The minor is used 29 | to identify the mandatory members of the MetaData and Markup groups included in the file. This allows addition of members (and modification of existing members) to the required list without wholesale changes to the file structure. The mandatory members are described in the following document: TBA. 30 | 31 | If used to store voltage data, both correlator_data and timestamps are omitted, as timing is synthesised on the fly. 32 | 33 | Nut - number of correlator timeslots in this observation 34 | Nt - number of averaged timeslots 35 | Nuf - number of correlator frequency channels 36 | Nf - number of averaged frequency channels 37 | Nbl - number of baselines 38 | Np - number of polarisation products 39 | Na - number of antennas in a given subarray 40 | AntennaK - first antenna in a given subarray 41 | AntennaN - last antenna in a given subarray 42 | 43 | HDF5 Format 44 | ----------- 45 | 46 | The structural format is shown below. 47 | 48 | Groups are named using CamelCase, while datasets are all lower case with underscores. 49 | Attributes are indicated next to a group in {}:: 50 | 51 | / {augment_ts} 52 | {experiment_id} 53 | {version} 54 | 55 | /Data/ {ts_of_first_timeslot} 56 | /correlator_data - (Nt,Nf,Nbl,2) array of float32 visibilities (real and imag components) 57 | /timestamps - (Nt) array of float64 timestamps (UT seconds since Unix epoch) 58 | /voltage_data - (optional) (Na, Nt, Nf) array of 8-bit voltage samples 59 | 60 | /MetaData/ 61 | /Configuration/ 62 | /Antennas/ {num_antennas, subarray_id} 63 | /AntennaK..N/ {description, delays, diameter, location, etc...} 64 | / beam_pattern 65 | / h_coupler_noise_diode_model 66 | / h_pin_noise_diode_model 67 | / v_coupler_noide_diode_model 68 | / v_pin_noise_diode_model 69 | /Correlator/ {num_channels, center_freq, channel_bw, etc...} 70 | /Observation/ {type, pi, contact, sw_build_versions, etc...} 71 | /PostProcessing/ {channel_averaging, rfi_threshold, etc...} 72 | /time_averaging - TBD detail of baseline dep time avg 73 | /Sensors/ 74 | /Antennas/ {num_antennas, subarray_id} 75 | /AntennaK..N/ 76 | /... - dataset per antenna and pedestal sensor 77 | /DBE/ 78 | /... - dataset per DBE sensor 79 | /Enviro/ 80 | /... - dataset per enviro sensor 81 | /Other/ 82 | /... - dataset per other sensor 83 | /RFE/ 84 | /... - dataset per RFE sensor 85 | /Source/ 86 | /phase_center 87 | /antenna_target - array of target sensors for each antenna 88 | 89 | /Markup/ 90 | /dropped_data - (optional) describes data dropped by receivers 91 | /flags - (Nt,Nf,Nbl) post-averaged uint8 flags - 1 bit per flag, packed 92 | /flags_description - (Nflags,3) index, name and description for each packed flag type 93 | /flags_full - (optional) (Nut,Nuf,Nbl) pre-averaged uint8 flags - 1 bit per flag, packed 94 | /labels - (optional) descriptions of intent of each observational phase (e.g. scan, slew, cal, etc..)
95 | /noise_diode - (Nt,Na) noise diode state during this averaged timeslot 96 | /noise_diode_full - (optional) (Nut,Na) noise diode state per correlator timeslot 97 | /weights - (Nt,Nf,Nbl,Nweights) weights for each sample 98 | 99 | /History/ 100 | /augment_log - Log output of augmentation process 101 | /script_log - Log output of observation script 102 | -------------------------------------------------------------------------------- /doc/tuning.rst: -------------------------------------------------------------------------------- 1 | Tuning your application 2 | ======================= 3 | It is possible to load data at high bandwidth using katdal: rates over 4 | 2.5 GB/s have been seen when loading from a local disk. However, it 5 | requires an understanding of the storage layout and choice of an 6 | appropriate access pattern. 7 | 8 | This chapter is aimed at loading :doc:`mvf_v4` data, as older versions 9 | typically contain far less data. Some of the advice is generic but some 10 | of the methods described here will not work on older data sets. 11 | 12 | Chunking 13 | -------- 14 | The most important thing to understand is that the data is split into 15 | chunks, each of which is stored as a file on disk or an object in an S3 16 | store. Retrieving any element of a chunk causes the entire chunk to be 17 | retrieved. Thus, aligning accesses to whole chunks will give the best 18 | performance, as data is not discarded. 19 | 20 | As an illustration, consider an application that has an outer loop over 21 | the baselines, and loads data for one baseline at a time. Chunks 22 | typically span all baselines, so each time one baseline is loaded, 23 | katdal will actually load the entire data set. If the application can 24 | be redesigned to fetch data for a small time range for all baselines it 25 | will perform much better. 26 | 27 | When using MVF v4, katdal uses `dask`_ to manage the chunking. After 28 | opening a data set, you can determine the chunking for a particular 29 | array by examining its ``dataset`` member: 30 | 31 | .. code:: python 32 | 33 | >>> d.vis.dataset 34 | dask.array<1556179171-sdp, shape=(38, 4096, 40), dtype=complex64, chunksize=(32, 1024, 40)> 35 | >>> d.vis.dataset.chunks 36 | ((32, 6), (1024, 1024, 1024, 1024), (40,)) 37 | 38 | .. _dask: https://docs.dask.org/ 39 | 40 | For this data set, it will be optimal to load visibilities in 32 × 1024 41 | × 40 element pieces. 42 | 43 | Note that the chunking scheme may be different for visibilities, flags 44 | and weights. 45 | 46 | Joint loading 47 | ------------- 48 | The values returned by katdal are not the raw values stored in the 49 | chunks: there is processing involved, such as application of calibration 50 | solutions and flagging of missing data. Some of this processing is 51 | common between visibilities, flags and weights. It's thus more efficient 52 | to load the visibilities, flags and weights as a single operation rather 53 | than as three separate operations. 54 | 55 | This can be achieved using :meth:`.DaskLazyIndexer.get`. For example, 56 | replace 57 | 58 | .. code:: python 59 | 60 | vis = d.vis[idx] 61 | flags = d.flags[idx] 62 | weights = d.weights[idx] 63 | 64 | with 65 | 66 | .. code:: python 67 | 68 | vis, flags, weights = DaskLazyIndexer.get([d.vis, d.flags, d.weights], idx) 69 | 70 | Parallelism 71 | ----------- 72 | Dask uses multiple worker threads. It defaults to one thread per CPU 73 | core, but for I/O-bound tasks this is often not enough to achieve 74 | maximum throughput. Refer to the dask `scheduler`_ documentation for 75 | details of how to configure the number of workers.
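For example, a load script could request more workers than CPU cores before touching the data. The worker count below is an arbitrary illustration; ``mvf_read_benchmark.py`` in this repository sets the same configuration value via its ``--workers`` option.

.. code:: python

    import dask

    # Chunk loading is largely I/O-bound, so oversubscribe the CPU cores
    dask.config.set(num_workers=16)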
Refer to the dask `scheduler`_ documentation for 75 | details of how to configure the number of workers. 76 | 77 | .. _scheduler: https://docs.dask.org/en/latest/scheduling.html 78 | 79 | More workers only helps if there is enough parallel work to be 80 | performed, which means there need to be at least as many chunks loaded 81 | at a time as there are workers (and preferably many more). It's thus 82 | advisable to load as much data at a time as possible without running out 83 | of memory. 84 | 85 | Selection 86 | --------- 87 | Using :meth:`DataSet.select` is relatively expensive. For the best 88 | performance, it should only be used occasionally (for example, to filter 89 | out unwanted data at the start), with array access notation or 90 | :meth:`.DaskLazyIndexer.get` used to break up large data sets into 91 | manageable pieces. 92 | 93 | Dask also performs better with selections that select contiguous data. 94 | You might be able to get a little more performance by using 95 | :meth:`.DataSet.scans` (which will yield a series of contiguous 96 | selections) rather than using :meth:`~.DataSet.select` with 97 | ``scans='track'``. 98 | 99 | When using MVF v4 one can also pass a `preselect` parameter to :meth:`katdal.open` 100 | which allows slicing a subset of the data (time and frequency). It is more 101 | limited than :meth:`DataSet.select` (it can only select contiguous ranges, and 102 | can only specify the selection in terms of channels and dumps), but if a script 103 | is only interested in working on a subset of data, this method can be more 104 | efficient and uses less memory. 105 | 106 | Network versus local disk 107 | ------------------------- 108 | When loading data from the network, latency is typically higher, and so 109 | more workers will be needed to achieve peak throughput. Network access 110 | is also more sensitive to access patterns that are mis-aligned with 111 | chunks, because chunks are not cached in memory by the operation system 112 | and hence must be re-fetched over the network if they are accessed 113 | again. 114 | 115 | Benchmarking 116 | ------------ 117 | To assist with testing out the effects of changing these tuning 118 | parameters, the katdal source code includes a script called 119 | ``mvf_read_benchmark.py`` that allows a data set to be loaded in 120 | various ways and reports the average throughput. The command-line 121 | options are somewhat limited so you may need to edit it yourself, for 122 | example, to add a custom selection. 123 | -------------------------------------------------------------------------------- /katdal/test/test_concatdata.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2020-2022, National Research Foundation (SARAO) 3 | # 4 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 5 | # this file except in compliance with the License. You may obtain a copy 6 | # of the License at 7 | # 8 | # https://opensource.org/licenses/BSD-3-Clause 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | ################################################################################ 16 | 17 | """Tests for :py:mod:`katdal.concatdata`.""" 18 | 19 | import numpy as np 20 | import pytest 21 | 22 | from katdal.categorical import CategoricalData 23 | from katdal.concatdata import ConcatenatedSensorCache 24 | from katdal.sensordata import SensorCache, SimpleSensorGetter 25 | 26 | 27 | class TestConcatenatedSensorCache: 28 | @staticmethod 29 | def _make_cache(timestamps, sensors): 30 | cache_data = {} 31 | for name, ts, values in sensors: 32 | sd = SimpleSensorGetter(name, np.asarray(ts), np.asarray(values)) 33 | cache_data[name] = sd 34 | return SensorCache(cache_data, timestamps, 2.0) 35 | 36 | def setup_method(self): 37 | self.timestamps1 = np.arange(100.0, 110.0, 2.0) 38 | self.timestamps2 = np.arange(1000.0, 1006.0, 2.0) 39 | sensors1 = [ 40 | ('foo', [104.0, 107.0], [3.0, 6.0]), 41 | ('cat', [102.0, 110.0], ['hello', 'world']), 42 | ('int_missing', [105.0], [42]) 43 | ] 44 | sensors2 = [ 45 | ('foo', [995.0, 1010.0], [10.0, 25.0]), 46 | ('cat', [1000.0, 1002.0, 1004.0, 1006.0], ['world', 'hello', 'again', 'hello']), 47 | ('float_missing', [995.0], [3.0]) 48 | ] 49 | self.cache1 = self._make_cache(self.timestamps1, sensors1) 50 | self.cache2 = self._make_cache(self.timestamps2, sensors2) 51 | self.keep = np.array([True, False, True, False, False, True, False, True]) 52 | self.cache = ConcatenatedSensorCache([self.cache1, self.cache2], keep=self.keep) 53 | 54 | def test_timestamps(self): 55 | np.testing.assert_array_equal( 56 | self.cache.timestamps, 57 | np.concatenate([self.timestamps1, self.timestamps2]) 58 | ) 59 | 60 | def test_float(self): 61 | data = self.cache.get('foo') 62 | np.testing.assert_allclose(data, [3.0, 3, 3, 5, 6, 15, 17, 19]) 63 | 64 | def test_categorical(self): 65 | data = self.cache.get('cat') 66 | assert data.unique_values == ['hello', 'world', 'again'] 67 | H = 'hello' 68 | W = 'world' 69 | A = 'again' 70 | np.testing.assert_array_equal(data[:], [H, H, H, H, H, W, H, A]) 71 | 72 | def test_float_missing(self): 73 | data = self.cache.get('float_missing') 74 | np.testing.assert_array_equal(data, [np.nan] * 5 + [3.0] * 3) 75 | 76 | def test_int_missing(self): 77 | data = self.cache.get('int_missing') 78 | np.testing.assert_array_equal(data[:], [42] * 5 + [-1] * 3) 79 | 80 | def test_missing_select(self): 81 | data = self.cache['int_missing'] 82 | np.testing.assert_array_equal(data[:], [42, 42, -1, -1]) 83 | 84 | def test_float_select(self): 85 | data = self.cache['foo'] 86 | np.testing.assert_allclose(data, [3.0, 3, 15, 19]) 87 | 88 | def test_categorical_select(self): 89 | data = self.cache['cat'] 90 | np.testing.assert_array_equal(data, ['hello', 'hello', 'world', 'again']) 91 | 92 | def test_no_extract(self): 93 | data = self.cache.get('foo', extract=False) 94 | values = data.get() 95 | np.testing.assert_array_equal(values.timestamp, [104.0, 107.0, 995.0, 1010.0]) 96 | np.testing.assert_array_equal(values.value, [3.0, 6.0, 10.0, 25.0]) 97 | 98 | def test_no_extract_missing(self): 99 | data = self.cache.get('float_missing', extract=False) 100 | values = data.get() 101 | np.testing.assert_array_equal(values.timestamp, [995.0]) 102 | np.testing.assert_array_equal(values.value, [3.0]) 103 | 104 | def test_missing_sensor(self): 105 | with pytest.raises(KeyError): 106 | self.cache['sir_not_appearing_in_this_cache'] 107 | 108 | def test_partially_extract(self): 109 | self.cache1['foo'] 110 | data = self.cache.get('foo', extract=False) 111 | 
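# cache1 already holds 'foo' as an extracted array; the concatenated cache
# should return the same values whether or not extraction is requested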
np.testing.assert_array_equal(data, self.cache.get('foo', extract=True)) 112 | 113 | def test_setitem_categorical(self): 114 | data = CategoricalData(['x', 'y', 'x'], [0, 2, 4, 8]) 115 | self.cache['dog'] = data 116 | ans = self.cache.get('dog') 117 | assert data.unique_values == ans.unique_values 118 | np.testing.assert_array_equal(data.events, ans.events) 119 | np.testing.assert_array_equal(data.indices, ans.indices) 120 | 121 | def test_setitem_array(self): 122 | data = np.array([1.0, 2, 3, 5, 8, 13, 21, 34]) 123 | self.cache['fib'] = data 124 | ans = self.cache.get('fib') 125 | np.testing.assert_array_equal(data, ans) 126 | 127 | def test_len(self): 128 | assert len(self.cache) == 4 129 | 130 | def test_keys(self): 131 | assert sorted(self.cache.keys()) == ['cat', 'float_missing', 'foo', 'int_missing'] 132 | 133 | def test_contains(self): 134 | assert 'cat' in self.cache 135 | assert 'float_missing' in self.cache 136 | assert 'int_missing' in self.cache 137 | assert 'dog' not in self.cache 138 | assert '' not in self.cache 139 | -------------------------------------------------------------------------------- /katdal/chunkstore_npy.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2017-2018,2020-2021, National Research Foundation (SARAO) 3 | # 4 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 5 | # this file except in compliance with the License. You may obtain a copy 6 | # of the License at 7 | # 8 | # https://opensource.org/licenses/BSD-3-Clause 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | ################################################################################ 16 | 17 | """A store of chunks (i.e. N-dimensional arrays) based on NPY files.""" 18 | 19 | import contextlib 20 | import errno 21 | import mmap 22 | import os 23 | 24 | import numpy as np 25 | 26 | from .chunkstore import (BadChunk, ChunkNotFound, ChunkStore, StoreUnavailable, 27 | npy_header_and_body) 28 | 29 | 30 | def _write_chunk(filename, chunk, direct_write): 31 | if not direct_write: 32 | return np.save(filename, chunk, allow_pickle=False) 33 | header, chunk = npy_header_and_body(chunk) 34 | size = len(header) + chunk.nbytes 35 | gran = mmap.ALLOCATIONGRANULARITY 36 | aligned_size = (size + gran - 1) // gran * gran 37 | with contextlib.closing(mmap.mmap(-1, aligned_size)) as aligned: 38 | aligned.write(header) 39 | aligned.write(chunk) 40 | aligned.seek(0) 41 | fd = os.open(filename, os.O_RDWR | os.O_CREAT | os.O_TRUNC | os.O_DIRECT, 0o666) 42 | try: 43 | os.write(fd, aligned) 44 | # We had to round the size up to a page, now correct back to exact size 45 | os.ftruncate(fd, size) 46 | finally: 47 | os.close(fd) 48 | 49 | 50 | class NpyFileChunkStore(ChunkStore): 51 | """A store of chunks (i.e. N-dimensional arrays) based on NPY files. 52 | 53 | Each chunk is stored in a separate binary file in NumPy ``.npy`` format. 
54 | The filename is constructed as 55 | 56 | "//.npy" 57 | 58 | where "" is the chunk store directory specified on construction, 59 | "" is the name of the parent array of the chunk and "" is 60 | the index string of each chunk (e.g. "00001_00512"). 61 | 62 | For a description of the ``.npy`` format, see :py:mod:`numpy.lib.format` 63 | or the relevant NumPy Enhancement Proposal 64 | `here `_. 65 | 66 | Parameters 67 | ---------- 68 | path : string 69 | Top-level directory that contains NPY files of chunk store 70 | direct_write : bool 71 | If true, use ``O_DIRECT`` when writing the file. This bypasses the 72 | OS page cache, which can be useful to avoid filling it up with 73 | files that won't be read again. 74 | 75 | Raises 76 | ------ 77 | :exc:`chunkstore.StoreUnavailable` 78 | If path does not exist / is not readable 79 | :exc:`chunkstore.StoreUnavailable` 80 | If `direct_write` was requested but is not available 81 | """ 82 | 83 | def __init__(self, path, direct_write=False): 84 | super().__init__({IOError: ChunkNotFound, ValueError: ChunkNotFound}) 85 | if not os.path.isdir(path): 86 | raise StoreUnavailable(f'Directory {path!r} does not exist') 87 | self.path = path 88 | self.direct_write = direct_write 89 | if direct_write and not hasattr(os, 'O_DIRECT'): 90 | raise StoreUnavailable('direct_write requested but not supported on this OS') 91 | 92 | def get_chunk(self, array_name, slices, dtype): 93 | """See the docstring of :meth:`ChunkStore.get_chunk`.""" 94 | chunk_name, shape = self.chunk_metadata(array_name, slices, dtype=dtype) 95 | filename = os.path.join(self.path, chunk_name) + '.npy' 96 | with self._standard_errors(chunk_name): 97 | chunk = np.load(filename, allow_pickle=False) 98 | if chunk.shape != shape or chunk.dtype != dtype: 99 | raise BadChunk(f'Chunk {chunk_name!r}: NPY file dtype {chunk.dtype} and/or shape ' 100 | f'{chunk.shape} differs from expected dtype {dtype} and shape {shape}') 101 | return chunk 102 | 103 | def create_array(self, array_name): 104 | """See the docstring of :meth:`ChunkStore.create_array`.""" 105 | # Ensure any subdirectories are in place 106 | array_dir = os.path.join(self.path, array_name) 107 | try: 108 | os.makedirs(array_dir) 109 | except OSError as e: 110 | # Be happy if someone already created the path 111 | if e.errno != errno.EEXIST: 112 | raise 113 | 114 | def put_chunk(self, array_name, slices, chunk): 115 | """See the docstring of :meth:`ChunkStore.put_chunk`.""" 116 | chunk_name, _ = self.chunk_metadata(array_name, slices, chunk=chunk) 117 | base_filename = os.path.join(self.path, chunk_name) 118 | with self._standard_errors(chunk_name): 119 | # Rename the file when done writing to make put_chunk() atomic 120 | temp_filename = base_filename + '.writing.npy' 121 | _write_chunk(temp_filename, chunk, self.direct_write) 122 | os.rename(temp_filename, base_filename + '.npy') 123 | 124 | def mark_complete(self, array_name): 125 | """See the docstring of :meth:`ChunkStore.mark_complete`.""" 126 | self.create_array(array_name) 127 | touch_file = os.path.join(self.path, array_name, 'complete') 128 | with open(touch_file, 'a'): 129 | os.utime(touch_file, None) 130 | 131 | def is_complete(self, array_name): 132 | """See the docstring of :meth:`ChunkStore.is_complete`.""" 133 | touch_file = os.path.join(self.path, array_name, 'complete') 134 | return os.path.isfile(touch_file) 135 | 136 | get_chunk.__doc__ = ChunkStore.get_chunk.__doc__ 137 | put_chunk.__doc__ = ChunkStore.put_chunk.__doc__ 138 | mark_complete.__doc__ = 
ChunkStore.mark_complete.__doc__ 139 | is_complete.__doc__ = ChunkStore.is_complete.__doc__ 140 | -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/master/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 14 | # 15 | import os 16 | import sys 17 | 18 | sys.path.insert(0, os.path.abspath('..')) 19 | import katdal # noqa: E402 20 | 21 | # -- Project information ----------------------------------------------------- 22 | 23 | project = 'katdal' 24 | copyright = '2019, South African Radio Astronomy Observatory' 25 | author = 'South African Radio Astronomy Observatory' 26 | 27 | # The short X.Y version 28 | version = '.'.join(katdal.__version__.split('.')[:2]) 29 | # The full version, including alpha/beta/rc tags 30 | release = katdal.__version__ 31 | 32 | 33 | # -- General configuration --------------------------------------------------- 34 | 35 | # If your documentation needs a minimal Sphinx version, state it here. 36 | # 37 | # needs_sphinx = '1.0' 38 | 39 | # Add any Sphinx extension module names here, as strings. They can be 40 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 41 | # ones. 42 | extensions = [ 43 | 'sphinx.ext.autodoc', 44 | 'sphinx.ext.napoleon', 45 | 'sphinx.ext.mathjax', 46 | 'sphinx.ext.intersphinx' 47 | ] 48 | 49 | # Add any paths that contain templates here, relative to this directory. 50 | templates_path = ['_templates'] 51 | 52 | # The suffix(es) of source filenames. 53 | # You can specify multiple suffix as a list of string: 54 | # 55 | # source_suffix = ['.rst', '.md'] 56 | source_suffix = '.rst' 57 | 58 | # The master toctree document. 59 | master_doc = 'index' 60 | 61 | # The language for content autogenerated by Sphinx. Refer to documentation 62 | # for a list of supported languages. 63 | # 64 | # This is also used if you do content translation via gettext catalogs. 65 | # Usually you set "language" from the command line for these cases. 66 | language = None 67 | 68 | # List of patterns, relative to source directory, that match files and 69 | # directories to ignore when looking for source files. 70 | # This pattern also affects html_static_path and html_extra_path. 71 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 72 | 73 | # The name of the Pygments (syntax highlighting) style to use. 74 | pygments_style = None 75 | 76 | autodoc_member_order = 'bysource' 77 | 78 | intersphinx_mapping = { 79 | 'katsdptelstate': ('https://katsdptelstate.readthedocs.io/en/latest', None), 80 | 'katpoint': ('https://katpoint.readthedocs.io/en/latest', None) 81 | } 82 | 83 | 84 | # -- Options for HTML output ------------------------------------------------- 85 | 86 | # The theme to use for HTML and HTML Help pages. See the documentation for 87 | # a list of builtin themes. 
88 | # 89 | html_theme = 'sphinx_rtd_theme' 90 | 91 | # Theme options are theme-specific and customize the look and feel of a theme 92 | # further. For a list of options available for each theme, see the 93 | # documentation. 94 | # 95 | # html_theme_options = {} 96 | 97 | # Add any paths that contain custom static files (such as style sheets) here, 98 | # relative to this directory. They are copied after the builtin static files, 99 | # so a file named "default.css" will overwrite the builtin "default.css". 100 | html_static_path = ['_static'] 101 | 102 | # Custom sidebar templates, must be a dictionary that maps document names 103 | # to template names. 104 | # 105 | # The default sidebars (for documents that don't match any pattern) are 106 | # defined by theme itself. Builtin themes are using these templates by 107 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 108 | # 'searchbox.html']``. 109 | # 110 | # html_sidebars = {} 111 | 112 | 113 | # -- Options for HTMLHelp output --------------------------------------------- 114 | 115 | # Output file base name for HTML help builder. 116 | htmlhelp_basename = 'katdaldoc' 117 | 118 | 119 | # -- Options for LaTeX output ------------------------------------------------ 120 | 121 | latex_elements = { 122 | # The paper size ('letterpaper' or 'a4paper'). 123 | # 124 | # 'papersize': 'letterpaper', 125 | 126 | # The font size ('10pt', '11pt' or '12pt'). 127 | # 128 | # 'pointsize': '10pt', 129 | 130 | # Additional stuff for the LaTeX preamble. 131 | # 132 | # 'preamble': '', 133 | 134 | # Latex figure (float) alignment 135 | # 136 | # 'figure_align': 'htbp', 137 | } 138 | 139 | # Grouping the document tree into LaTeX files. List of tuples 140 | # (source start file, target name, title, 141 | # author, documentclass [howto, manual, or own class]). 142 | latex_documents = [ 143 | (master_doc, 'katdal.tex', 'katdal Documentation', 144 | 'Ludwig Schwardt', 'manual'), 145 | ] 146 | 147 | 148 | # -- Options for manual page output ------------------------------------------ 149 | 150 | # One entry per manual page. List of tuples 151 | # (source start file, name, description, authors, manual section). 152 | man_pages = [ 153 | (master_doc, 'katdal', 'katdal Documentation', 154 | [author], 1) 155 | ] 156 | 157 | 158 | # -- Options for Texinfo output ---------------------------------------------- 159 | 160 | # Grouping the document tree into Texinfo files. List of tuples 161 | # (source start file, target name, title, author, 162 | # dir menu entry, description, category) 163 | texinfo_documents = [ 164 | (master_doc, 'katdal', 'katdal Documentation', 165 | author, 'katdal', 'One line description of project.', 166 | 'Miscellaneous'), 167 | ] 168 | 169 | 170 | # -- Options for Epub output ------------------------------------------------- 171 | 172 | # Bibliographic Dublin Core info. 173 | epub_title = project 174 | 175 | # The unique identifier of the text. This can be a ISBN number 176 | # or the project homepage. 177 | # 178 | # epub_identifier = '' 179 | 180 | # A unique identification for the text. 181 | # 182 | # epub_uid = '' 183 | 184 | # A list of files that should not be packed into the epub file. 
185 | epub_exclude_files = ['search.html'] 186 | 187 | 188 | # -- Extension configuration ------------------------------------------------- 189 | -------------------------------------------------------------------------------- /katdal/__init__.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2011-2021, National Research Foundation (SARAO) 3 | # 4 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 5 | # this file except in compliance with the License. You may obtain a copy 6 | # of the License at 7 | # 8 | # https://opensource.org/licenses/BSD-3-Clause 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | ################################################################################ 16 | 17 | """Data access library for data sets in the MeerKAT Visibility Format (MVF).""" 18 | 19 | import logging as _logging 20 | import urllib.parse 21 | 22 | from .concatdata import ConcatenatedDataSet 23 | from .dataset import DataSet, WrongVersion # noqa: F401 24 | from .datasources import open_data_source 25 | from .h5datav1 import H5DataV1 26 | from .h5datav2 import H5DataV2 27 | from .h5datav3 import H5DataV3 28 | from .lazy_indexer import LazyTransform, dask_getitem # noqa: F401 29 | from .spectral_window import SpectralWindow # noqa: F401 30 | from .visdatav4 import VisibilityDataV4 31 | 32 | 33 | # Setup library logger and add a print-like handler used when no logging is configured 34 | class _NoConfigFilter(_logging.Filter): 35 | """Filter which only allows event if top-level logging is not configured.""" 36 | 37 | def filter(self, record): 38 | return 1 if not _logging.root.handlers else 0 39 | 40 | 41 | _no_config_handler = _logging.StreamHandler() 42 | _no_config_handler.setFormatter(_logging.Formatter(_logging.BASIC_FORMAT)) 43 | _no_config_handler.addFilter(_NoConfigFilter()) 44 | logger = _logging.getLogger(__name__) 45 | logger.addHandler(_no_config_handler) 46 | 47 | # BEGIN VERSION CHECK 48 | # Get package version when locally imported from repo or via -e develop install 49 | try: 50 | import katversion as _katversion 51 | except ImportError: 52 | import time as _time 53 | __version__ = "0.0+unknown.{}".format(_time.strftime('%Y%m%d%H%M')) 54 | else: 55 | __version__ = _katversion.get_version(__path__[0]) 56 | # END VERSION CHECK 57 | 58 | # ----------------------------------------------------------------------------- 59 | # -- Top-level functions passed on to the appropriate format handler 60 | # ----------------------------------------------------------------------------- 61 | 62 | formats = [H5DataV3, H5DataV2, H5DataV1] 63 | 64 | 65 | def _file_action(action, filename, *args, **kwargs): 66 | """Perform action on data file using the appropriate format class. 
67 | 68 | Parameters 69 | ---------- 70 | action : string 71 | Name of method to call on format class 72 | filename : string 73 | Data file name 74 | args, kwargs : extra parameters to method (optional) 75 | 76 | Returns 77 | ------- 78 | result : object 79 | Result of action 80 | 81 | """ 82 | for format in formats: 83 | try: 84 | result = getattr(format, action)(filename, *args, **kwargs) 85 | break 86 | except WrongVersion: 87 | continue 88 | else: 89 | raise WrongVersion(f"File '{filename}' has unknown data file format or version") 90 | return result 91 | 92 | 93 | def open(filename, ref_ant='', time_offset=0.0, **kwargs): 94 | """Open data file(s) with loader of the appropriate version. 95 | 96 | Parameters 97 | ---------- 98 | filename : string or sequence of strings 99 | Data file name or list of file names 100 | ref_ant : string, optional 101 | Name of reference antenna (default is first antenna in use) 102 | time_offset : float, optional 103 | Offset to add to all timestamps, in seconds 104 | kwargs : dict, optional 105 | Extra keyword arguments are passed on to underlying accessor class: 106 | 107 | mode (string, optional) 108 | [H5DataV*] File opening mode (e.g. 'r+' to open file in write mode) 109 | quicklook (bool) 110 | [H5DataV2] True if synthesised timestamps should be used to 111 | partition data set even if real timestamps are irregular, thereby 112 | avoiding the slow loading of real timestamps at the cost of 113 | slightly inaccurate label borders 114 | 115 | See the documentation of :class:`VisibilityDataV4` for the keywords 116 | it accepts. 117 | 118 | Returns 119 | ------- 120 | data : :class:`DataSet` object 121 | Object providing :class:`DataSet` interface to file(s) 122 | 123 | """ 124 | if isinstance(filename, str): 125 | filenames = [filename] 126 | else: 127 | unexpected = set(kwargs.get('preselect', {})) - {'channels'} 128 | if unexpected: 129 | raise IndexError(f'Unsupported preselect key(s) for ConcatenatedDataSet: {unexpected}') 130 | filenames = filename 131 | datasets = [] 132 | for f in filenames: 133 | # V4 RDB file or live telstate with optional URL-style query string 134 | parsed = urllib.parse.urlsplit(f) 135 | if parsed.path.endswith('.rdb') or parsed.scheme != '': 136 | dataset = VisibilityDataV4(open_data_source(f, **kwargs), 137 | ref_ant, time_offset, **kwargs) 138 | else: 139 | if 'preselect' in kwargs: 140 | raise TypeError('preselect is not supported for this format') 141 | dataset = _file_action('__call__', f, ref_ant, time_offset, **kwargs) 142 | datasets.append(dataset) 143 | return datasets[0] if isinstance(filename, str) else ConcatenatedDataSet(datasets) 144 | 145 | 146 | def get_ants(filename): 147 | """Quick look function to get the list of antennas in a data file. 148 | 149 | Parameters 150 | ---------- 151 | filename : string 152 | Data file name 153 | 154 | Returns 155 | ------- 156 | antennas : list of :class:`katpoint.Antenna` objects 157 | 158 | """ 159 | return _file_action('_get_ants', filename) 160 | 161 | 162 | def get_targets(filename): 163 | """Quick look function to get the list of targets in a data file. 
164 | 165 | Parameters 166 | ---------- 167 | filename : string 168 | Data file name 169 | 170 | Returns 171 | ------- 172 | targets : :class:`katpoint.Catalogue` object 173 | All targets in file 174 | 175 | """ 176 | return _file_action('_get_targets', filename) 177 | -------------------------------------------------------------------------------- /scripts/spectrogram_plot_example.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | ################################################################################ 4 | # Copyright (c) 2012-2016,2018,2021, National Research Foundation (SARAO) 5 | # 6 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 7 | # this file except in compliance with the License. You may obtain a copy 8 | # of the License at 9 | # 10 | # https://opensource.org/licenses/BSD-3-Clause 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | ################################################################################ 18 | 19 | # 20 | # Plot spectrogram of entire dataset in an efficient way that only loads 21 | # enough data that will fit onto the screen. 22 | # 23 | # Ludwig Schwardt 24 | # 26 June 2012 25 | # 26 | 27 | import optparse 28 | import time 29 | 30 | import matplotlib.pyplot as plt 31 | import numpy as np 32 | 33 | import katdal 34 | 35 | 36 | class ResampledImage: 37 | """Image that only loads enough data that will fit onto screen pixels. 
38 | 39 | Parameters 40 | ---------- 41 | data : array-like, shape at least (N, M) 42 | Data object with ndarray interface 43 | extract : function, signature ``xy_data = f(data, x, y)``, optional 44 | Function used to extract 2-D image array from data object given x and 45 | y indices, using getitem interface on data by default 46 | autoscale : {False, True}, optional 47 | True if image should be renormalised after each update or zoom 48 | ax : :class:`matplotlib.axes.Axes` object or None, optional 49 | Axes onto which to plot image 50 | kwargs : dict, optional 51 | Additional parameters are passed on to underlying imshow 52 | 53 | """ 54 | def __init__(self, data, extract=None, autoscale=False, ax=None, **kwargs): 55 | self.data = data 56 | self.extract = extract if extract is not None else lambda d, x, y: d[y, x] 57 | self.autoscale = autoscale 58 | self.ax = ax if ax is not None else plt.gca() 59 | kwargs.update({'aspect': 'auto', 'origin': 'lower', 'interpolation': 'nearest', 60 | 'extent': (-0.5, data.shape[1] - 0.5, -0.5, data.shape[0] - 0.5)}) 61 | self.image = self.ax.imshow([[0]], **kwargs) 62 | self.update() 63 | # Connect to all events that change the data limits or the number of pixels in image 64 | self.ax.callbacks.connect('xlim_changed', self.update) 65 | self.ax.callbacks.connect('ylim_changed', self.update) 66 | self.ax.figure.canvas.mpl_connect('resize_event', self.update) 67 | 68 | def update(self, param=None): 69 | """Load required data and update image.""" 70 | data_limits, view_limits = self.ax.dataLim, self.ax.viewLim 71 | display_limits = self.ax.get_window_extent() 72 | # print "data =", data_limits.extents[[0, 2, 1, 3]].tolist() 73 | # print "view =", view_limits.extents[[0, 2, 1, 3]].tolist() 74 | # print "display =", display_limits.extents[[0, 2, 1, 3]].tolist() 75 | data_scale_x = self.data.shape[1] / data_limits.width 76 | data_scale_y = self.data.shape[0] / data_limits.height 77 | x_from = max(int(np.floor(data_scale_x * (view_limits.x0 - data_limits.x0))), 0) 78 | y_from = max(int(np.floor(data_scale_y * (view_limits.y0 - data_limits.y0))), 0) 79 | x_to = max(int(np.ceil(data_scale_x * (view_limits.x1 - data_limits.x0))), x_from + 1) 80 | y_to = max(int(np.ceil(data_scale_y * (view_limits.y1 - data_limits.y0))), y_from + 1) 81 | x_step = max(int(view_limits.width / display_limits.width), 1) 82 | y_step = max(int(view_limits.height / display_limits.height), 1) 83 | # print "range = %d:%d:%d, %d:%d:%d" % (x_from, x_to, x_step, y_from, y_to, y_step) 84 | x_slice = slice(x_from, x_to, x_step) 85 | y_slice = slice(y_from, y_to, y_step) 86 | x_inds = list(range(*x_slice.indices(self.data.shape[1]))) 87 | y_inds = list(range(*y_slice.indices(self.data.shape[0]))) 88 | im_left = x_inds[0] / data_scale_x + data_limits.x0 89 | im_right = (x_inds[-1] + 1) / data_scale_x + data_limits.x0 90 | im_bottom = y_inds[0] / data_scale_y + data_limits.y0 91 | im_top = (y_inds[-1] + 1) / data_scale_y + data_limits.y0 92 | # print "im =", (im_left, im_right, im_bottom, im_top) 93 | before = time.time() 94 | # Load and update image data and make it fill the view 95 | data = self.extract(self.data, x_slice, y_slice) 96 | extract_time = time.time() - before 97 | size_bytes = data.size * np.dtype('complex64').itemsize 98 | print("Loaded %d visibilities - x %s y %s - in %.2f seconds (%g MB/s)" % 99 | (data.size, x_slice, y_slice, extract_time, size_bytes * 1e-6 / extract_time)) 100 | self.image.set_data(data) 101 | self.image._extent = (im_left, im_right, im_bottom, im_top) 102 | if 
self.autoscale: 103 | self.image.autoscale() 104 | else: 105 | # Keep the same normalisation as soon as the extreme data values are known 106 | self.image.norm.vmin = min(self.image.norm.vmin, data.min()) 107 | self.image.norm.vmax = max(self.image.norm.vmax, data.max()) 108 | self.ax.figure.canvas.draw_idle() 109 | 110 | 111 | parser = optparse.OptionParser(usage="%prog [options] [ ...]", 112 | description='Waterfall plot from HDF5 data file(s)') 113 | parser.add_option('-a', '--ant', 114 | help="Antenna to plot (e.g. 'ant1'), default is first antenna") 115 | parser.add_option('-p', '--pol', type='choice', choices=['H', 'V'], default='H', 116 | help="Polarisation term to use ('H' or 'V'), default is %default") 117 | parser.add_option('-s', '--autoscale', action='store_true', default=False, 118 | help="Renormalise colour scale after each zoom or resize, default is %default") 119 | (opts, args) = parser.parse_args() 120 | 121 | if len(args) == 0: 122 | print('Please specify at least one HDF5 file to load') 123 | else: 124 | d = katdal.open(args) 125 | ant = opts.ant if opts.ant is not None else d.ref_ant 126 | d.select(ants=ant, pol=opts.pol) 127 | 128 | plt.figure(1) 129 | plt.clf() 130 | ax = plt.subplot(1, 1, 1) 131 | im = ResampledImage(d.vis, extract=lambda data, x, y: np.abs(data[y, x, 0]), 132 | autoscale=opts.autoscale, ax=ax) 133 | ax.set_xlabel('Channel index') 134 | ax.set_ylabel('Dump index') 135 | ax.set_title(f'Spectrogram {d.name} {ant} {opts.pol}') 136 | plt.show() 137 | -------------------------------------------------------------------------------- /katdal/test/s3_utils.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2017,2020-2021,2023, National Research Foundation (SARAO) 3 | # 4 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 5 | # this file except in compliance with the License. You may obtain a copy 6 | # of the License at 7 | # 8 | # https://opensource.org/licenses/BSD-3-Clause 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | ################################################################################ 16 | 17 | """Test utilities for code that interacts with the S3 API. 18 | 19 | It provides a class for managing running an external S3 server (currently 20 | `MinIO`_). 21 | 22 | Versions of minio prior to 2018-08-25T01:56:38Z contain a `race condition`_ 23 | that can cause it to crash when queried at the wrong point during startup, so 24 | should not be used. 25 | 26 | .. _minio: https://github.com/minio/minio 27 | .. 
_race condition: https://github.com/minio/minio/issues/6324 28 | """ 29 | 30 | import contextlib 31 | import os 32 | import pathlib 33 | import subprocess 34 | import time 35 | import urllib.parse 36 | 37 | import requests 38 | 39 | 40 | class MissingProgram(RuntimeError): 41 | """An required executable program was not found.""" 42 | 43 | 44 | class ProgramFailed(RuntimeError): 45 | """An external program did not run successfully.""" 46 | 47 | 48 | class S3User: 49 | """Credentials for an S3 user.""" 50 | 51 | def __init__(self, access_key: str, secret_key: str) -> None: 52 | self.access_key = access_key 53 | self.secret_key = secret_key 54 | 55 | 56 | class S3Server: 57 | """Run and manage an external program to run an S3 server. 58 | 59 | This can be used as a context manager, to shut down the server when 60 | finished. 61 | 62 | Parameters 63 | ---------- 64 | host 65 | Host to bind to 66 | port 67 | Port to bind to 68 | path 69 | Directory in which objects and config will be stored. 70 | user 71 | Credentials for the default admin user. 72 | 73 | Attributes 74 | ---------- 75 | host 76 | Hostname for connecting to the server 77 | port 78 | Port for connecting to the server 79 | url 80 | Base URL for the server 81 | auth_url 82 | URL with the access_key and secret_key baked in 83 | path 84 | Path given to the constructor 85 | user 86 | User given to the constructor 87 | 88 | Raises 89 | ------ 90 | MissingProgram 91 | if the ``minio`` binary was not found. 92 | ProgramFailed 93 | if minio started but failed before it became healthy 94 | """ 95 | 96 | def __init__(self, host: str, port: int, path: pathlib.Path, user: S3User) -> None: 97 | self.host = host 98 | self.port = port 99 | self.path = path 100 | self.user = user 101 | self.url = f'http://{self.host}:{self.port}' 102 | self.auth_url = f'http://{user.access_key}:{user.secret_key}@{self.host}:{self.port}' 103 | self._process = None 104 | 105 | env = os.environ.copy() 106 | env['MINIO_BROWSER'] = 'off' 107 | env['MINIO_ROOT_USER'] = self.user.access_key 108 | env['MINIO_ROOT_PASSWORD'] = self.user.secret_key 109 | try: 110 | self._process = subprocess.Popen( 111 | [ 112 | 'minio', 'server', '--quiet', 113 | '--address', f'{self.host}:{self.port}', 114 | '-C', str(self.path / 'config'), 115 | str(self.path / 'data'), 116 | ], 117 | stdout=subprocess.DEVNULL, 118 | env=env 119 | ) 120 | except OSError as exc: 121 | raise MissingProgram(f'Could not run minio: {exc}') from exc 122 | 123 | with contextlib.ExitStack() as exit_stack: 124 | exit_stack.callback(self._process.terminate) 125 | health_url = urllib.parse.urljoin(self.url, '/minio/health/ready') 126 | for i in range(100): 127 | try: 128 | with requests.get(health_url) as resp: 129 | if ( 130 | # Server is up... 131 | resp.ok 132 | # and initialised, therefore ready for requests 133 | and resp.headers.get('X-Minio-Server-Status') != 'offline' 134 | ): 135 | break 136 | except requests.ConnectionError: 137 | pass 138 | if self._process.poll() is not None: 139 | raise ProgramFailed('Minio died before it became healthy') 140 | time.sleep(0.1) 141 | else: 142 | raise ProgramFailed('Timed out waiting for minio to be ready') 143 | exit_stack.pop_all() 144 | 145 | def wipe(self) -> None: 146 | """Remove all buckets and objects, but leave the server running. 147 | 148 | See :meth:`mc` for information about exceptions. 
149 | """ 150 | self.mc('rb', '--force', '--dangerous', 'minio') 151 | 152 | def close(self) -> None: 153 | """Shut down the server.""" 154 | if self._process: 155 | self._process.terminate() 156 | self._process.wait() 157 | self._process = None 158 | 159 | def __enter__(self) -> 'S3Server': 160 | return self 161 | 162 | def __exit__(self, exc_type, exc_value, exc_tb) -> None: 163 | self.close() 164 | 165 | def mc(self, *args) -> None: 166 | """Run a (minio) mc subcommand against the running server. 167 | 168 | The running server has the alias ``minio``. 169 | 170 | .. note:: 171 | 172 | The credentials will be exposed in the environment. This is only 173 | intended for unit testing, and hence not with sensitive 174 | credentials. 175 | 176 | Raises 177 | ------ 178 | MissingProgram 179 | if the ``mc`` command is not found on the path 180 | ProgramFailed 181 | if the command returned a non-zero exit status. The exception 182 | message will include the stderr output. 183 | """ 184 | env = os.environ.copy() 185 | env['MC_HOST_minio'] = self.auth_url 186 | # --config-dir is set just to prevent any config set by the user 187 | # from interfering with the test. 188 | try: 189 | subprocess.run( 190 | [ 191 | 'mc', '--quiet', '--no-color', f'--config-dir={self.path}', 192 | *args 193 | ], 194 | stdout=subprocess.DEVNULL, 195 | stderr=subprocess.PIPE, 196 | env=env, 197 | encoding='utf-8', 198 | errors='replace', 199 | check=True 200 | ) 201 | except OSError as exc: 202 | raise MissingProgram(f'mc could not be run: {exc}') from exc 203 | except subprocess.CalledProcessError as exc: 204 | raise ProgramFailed(exc.stderr) from exc 205 | -------------------------------------------------------------------------------- /katdal/averager.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2011,2016-2019,2021-2023, National Research Foundation (SARAO) 3 | # 4 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 5 | # this file except in compliance with the License. You may obtain a copy 6 | # of the License at 7 | # 8 | # https://opensource.org/licenses/BSD-3-Clause 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | ################################################################################ 16 | 17 | import numba 18 | import numpy as np 19 | 20 | 21 | @numba.jit(nopython=True, parallel=True) 22 | def _average_visibilities(vis, weight, flag, timeav, chanav, flagav): 23 | # Workaround for https://github.com/numba/numba/issues/2921 24 | flag_u8 = flag.view(np.uint8) 25 | 26 | # Compute shapes 27 | n_time, n_chans, n_bl = vis.shape 28 | av_n_time = n_time // timeav 29 | av_n_chans = n_chans // chanav 30 | av_shape = (av_n_time, av_n_chans, n_bl) 31 | 32 | # Allocate output buffers 33 | av_vis = np.empty(av_shape, vis.dtype) 34 | av_weight = np.empty(av_shape, weight.dtype) 35 | av_flag = np.empty(av_shape, flag.dtype) 36 | 37 | scale = weight.dtype.type(1.0 / (timeav * chanav)) 38 | wzero = weight.dtype.type(0) # Zero constant of correct type 39 | 40 | bl_step = 128 # Want a chunk to be multiple cache lines but into L1 41 | # We put channel as the outer loop just because it's more likely than 42 | # time to get parallel speedup with prange (since the time axis is often 43 | # short e.g. 1). 44 | for av_c in numba.prange(0, av_n_chans): 45 | cstart = av_c * chanav 46 | vis_sum = np.empty(bl_step, vis.dtype) 47 | vis_weight_sum = np.empty(bl_step, vis.dtype) 48 | weight_sum = np.empty(bl_step, weight.dtype) 49 | flag_any = np.empty(bl_step, dtype=np.bool_) 50 | flag_all = np.empty(bl_step, dtype=np.bool_) 51 | for av_t in range(0, av_n_time): 52 | tstart = av_t * timeav 53 | for bstart in range(0, n_bl, bl_step): 54 | bstop = min(n_bl, bstart + bl_step) 55 | vis_sum[:] = 0 56 | vis_weight_sum[:] = 0 57 | weight_sum[:] = 0 58 | flag_any[:] = False 59 | flag_all[:] = True 60 | for t in range(tstart, tstart + timeav): 61 | for c in range(cstart, cstart + chanav): 62 | for b in range(bstop - bstart): 63 | b1 = b + bstart 64 | v = vis[t, c, b1] 65 | w = weight[t, c, b1] 66 | f = (flag_u8[t, c, b1] != 0) 67 | if f: 68 | # Don't simply use 0 here: it causes numba's type 69 | # inference to upgrade w from float32 to float64. 70 | w = wzero 71 | flag_any[b] |= f 72 | flag_all[b] &= f 73 | vis_sum[b] += v 74 | vis_weight_sum[b] += w * v 75 | weight_sum[b] += w 76 | for b in range(bstop - bstart): 77 | b1 = b + bstart 78 | w = np.float32(weight_sum[b]) 79 | # If everything is flagged/zero-weighted, use an unweighted average 80 | if not w: 81 | v = vis_sum[b] * scale 82 | else: 83 | v = vis_weight_sum[b] / w 84 | f = flag_any[b] if flagav else flag_all[b] 85 | av_vis[av_t, av_c, b1] = v 86 | av_weight[av_t, av_c, b1] = w 87 | av_flag[av_t, av_c, b1] = f 88 | return av_vis, av_weight, av_flag 89 | 90 | 91 | def average_visibilities(vis, weight, flag, timestamps, channel_freqs, timeav=10, chanav=8, flagav=False): 92 | """Average visibilities, flags and weights. 93 | 94 | Visibilities are weight-averaged using the weights in the `weight` array 95 | with flagged data set to weight zero. The averaged weights are the sum of 96 | the input weights for each average block. An average flag is retained if 97 | all of the data in an averaging block is flagged (the averaged visibility 98 | in this case is the unweighted average of the input visibilities). In cases 99 | where the averaging size in channel or time does not evenly divide the size 100 | of the input data, the remaining channels or timestamps at the end of the 101 | array after averaging are discarded. Channels are averaged first and the 102 | timestamps are second. 
An array of timestamps and frequencies corresponding
103 | to each channel is also directly averaged and returned.
104 |
105 | Parameters
106 | ----------
107 | vis: array(numtimestamps,numchannels,numbaselines) of complex64.
108 | The input visibilities to be averaged.
109 | weight: array(numtimestamps,numchannels,numbaselines) of float32.
110 | The input weights (used for weighted averaging).
111 | flag: array(numtimestamps,numchannels,numbaselines) of boolean.
112 | Input flags (flagged data have weight zero before averaging).
113 | timestamps: array(numtimestamps) of int.
114 | The timestamps (in mjd seconds) corresponding to the input data.
115 | channel_freqs: array(numchannels) of int.
116 | The frequencies (in Hz) corresponding to the input channels.
117 | timeav: int.
118 | The desired averaging size in timestamps.
119 | chanav: int.
120 | The desired averaging size in channels.
121 | flagav: bool
122 | Flag averaged data when there is a single flag in the bin if true.
123 | Only flag averaged data when all data in the bin is flagged if false.
124 |
125 | Returns
126 | -------
127 | av_vis: array(int(numtimestamps/timeav),int(numchannels/chanav)) of complex64.
128 | av_weight: array(int(numtimestamps/timeav),int(numchannels/chanav)) of float32.
129 | av_flag: array(int(numtimestamps/timeav),int(numchannels/chanav)) of boolean.
130 | av_mjd: array(int(numtimestamps/timeav)) of int.
131 | av_freq: array(int(numchannels/chanav)) of int.
132 |
133 | """
134 | # Trim data to integer multiples of the averaging factors
135 | n_time, n_chans, n_bl = vis.shape
136 | timeav = min(timeav, n_time)
137 | chanav = min(chanav, n_chans)
138 | n_time = n_time // timeav * timeav
139 | n_chans = n_chans // chanav * chanav
140 |
141 | vis = vis[:n_time, :n_chans]
142 | weight = weight[:n_time, :n_chans]
143 | flag = flag[:n_time, :n_chans]
144 | timestamps = timestamps[:n_time]
145 | channel_freqs = channel_freqs[:n_chans]
146 |
147 | # Average the data (using a numba-accelerated function)
148 | av_vis, av_weight, av_flag = \
149 | _average_visibilities(vis, weight, flag, timeav, chanav, flagav)
150 |
151 | # Average the metadata
152 | av_freq = np.mean(channel_freqs.reshape(-1, chanav), axis=-1)
153 | av_timestamps = np.mean(timestamps.reshape(-1, timeav), axis=-1)
154 |
155 | return av_vis, av_weight, av_flag, av_timestamps, av_freq
156 | --------------------------------------------------------------------------------
/scripts/mvf_copy.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python3
2 |
3 | ################################################################################
4 | # Copyright (c) 2021-2024, National Research Foundation (SARAO)
5 | #
6 | # Licensed under the BSD 3-Clause License (the "License"); you may not use
7 | # this file except in compliance with the License. You may obtain a copy
8 | # of the License at
9 | #
10 | # https://opensource.org/licenses/BSD-3-Clause
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | ################################################################################
18 |
19 | #
20 | # Make a local copy of an MVF4 dataset, optionally filtering it.
21 | # 22 | # Ludwig Schwardt 23 | # 19 October 2021 24 | # 25 | 26 | import argparse 27 | import os 28 | from pathlib import Path, PurePosixPath 29 | from urllib.parse import urlparse 30 | 31 | import dask 32 | import dask.array as da 33 | from dask.diagnostics import ProgressBar 34 | import katsdptelstate 35 | from katsdptelstate.rdb_writer import RDBWriter 36 | import katdal 37 | from katdal.chunkstore_npy import NpyFileChunkStore 38 | from katdal.datasources import view_capture_stream 39 | from katdal.lazy_indexer import dask_getitem 40 | 41 | 42 | DESCRIPTION = """ 43 | Copy MVFv4 dataset (or a part of it) from S3/disk to disk using dask. 44 | 45 | Run the script like this: 46 | 47 | mvf_copy.py https://archive/1698676533/1698676533_sdp_l0.full.rdb?token=<> dest 48 | 49 | or: 50 | 51 | mvf_copy.py src_dir dest_dir 52 | 53 | Data will appear in three subdirectories in the specified output directory as 54 | 55 | dest/1698676533/... 56 | dest/1698676533-sdp-l0/... 57 | dest/1698676533-sdp-l1-flags/... 58 | 59 | Open the local dataset like this: 60 | 61 | d = katdal.open("dest/1698676533/1698676533_sdp_l0.full.rdb") 62 | 63 | BONUS: you can even copy just parts of the data by selecting a subset of 64 | correlation products. The --corrprods value is passed to DataSet.select(). 65 | 66 | While dask allows multiple retries while downloading chunks, it currently has 67 | no way to resume copying if the script crashes. For peace of mind, consider 68 | using the mvf_download.py script instead if you are just trying to download 69 | your dataset from the archive to disk. You are stuck with mvf_copy.py if you 70 | are copying from disk to disk or you want to cull some correlation products. 71 | 72 | Some examples: 73 | 74 | mvf_copy.py url directory --corrprods=auto 75 | mvf_copy.py url directory --corrprods=cross 76 | """ 77 | 78 | 79 | def parse_args(): 80 | """Parse script arguments.""" 81 | parser = argparse.ArgumentParser( 82 | usage='%(prog)s [-h] [--corrprods CORRPRODS] [--workers N] source dest', 83 | description=DESCRIPTION, 84 | formatter_class=argparse.RawDescriptionHelpFormatter, 85 | ) 86 | parser.add_argument('source', help='Dataset URL (or input RDB file path)') 87 | parser.add_argument('dest', type=Path, help='Output directory') 88 | parser.add_argument('--corrprods', 89 | help='Select correlation products (kwarg to ' 90 | 'katdal.DataSet.select). Keeps all corrprods by default.') 91 | parser.add_argument('--workers', type=int, default=8 * dask.system.CPU_COUNT, 92 | help='Number of dask workers for parallel I/O [%(default)s]') 93 | args = parser.parse_args() 94 | return args 95 | 96 | 97 | def extra_flag_streams(telstate, capture_block_id, stream_name): 98 | """Look for associated flag streams and return corresponding telstate views.""" 99 | # This is a simplified version of katdal.datasources._upgrade_flags 100 | telstate_extra_flags = [] 101 | for s in telstate.get('sdp_archived_streams', []): 102 | telstate_cs = view_capture_stream(telstate.root(), capture_block_id, s) 103 | if telstate_cs.get('stream_type') == 'sdp.flags' and \ 104 | stream_name in telstate_cs['src_streams']: 105 | telstate_extra_flags.append(telstate_cs) 106 | return telstate_extra_flags 107 | 108 | 109 | def stream_graphs(telstate, store, corrprod_mask, out_telstate, out_store): 110 | """Prepare Dask graphs to copy all chunked arrays of a capture stream. 111 | 112 | This returns a list of Dask graphs and also modifies `out_telstate` and 113 | `out_store`. 
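Parameters
----------
telstate : :class:`katsdptelstate.TelescopeState`
    Telescope state view of the capture stream, providing 'chunk_info'
store : :class:`katdal.chunkstore.ChunkStore`
    Chunk store containing the input arrays of the stream
corrprod_mask : :class:`numpy.ndarray` of bool
    Mask selecting the correlation products to keep
out_telstate : :class:`katsdptelstate.TelescopeState`
    Output telescope state that receives the updated 'chunk_info'
out_store : :class:`katdal.chunkstore.ChunkStore`
    Output chunk store that receives the copied (and filtered) chunks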
114 | """ 115 | out_n_baselines = corrprod_mask.sum() 116 | out_chunk_info = {} 117 | graphs = [] 118 | for array, info in telstate['chunk_info'].items(): 119 | array_name = store.join(info['prefix'], array) 120 | darray = store.get_dask_array(array_name, info['chunks'], info['dtype']) 121 | # Filter the correlation products if array has them 122 | if darray.ndim == 3: 123 | indices = (slice(None), slice(None), corrprod_mask) 124 | # Try to turn fancy indexing into slices (works for autocorrs) 125 | darray = dask_getitem(darray, indices) 126 | info['chunks'] = info['chunks'][:2] + ((out_n_baselines,),) 127 | info['shape'] = info['shape'][:2] + (out_n_baselines,) 128 | out_store.create_array(array_name) 129 | graphs.append(out_store.put_dask_array(array_name, darray)) 130 | out_chunk_info[array] = info 131 | out_telstate[telstate.prefixes[0] + 'chunk_info'] = out_chunk_info 132 | return graphs 133 | 134 | 135 | def main(): 136 | """Main routine of mvf_copy script.""" 137 | args = parse_args() 138 | 139 | d = katdal.open(args.source) 140 | # XXX Simplify this once corrprods can accept slices as advertised 141 | kwargs = {} 142 | if args.corrprods is not None: 143 | kwargs['corrprods'] = args.corrprods 144 | d.select(**kwargs) 145 | 146 | # Convenience variables 147 | cbid = d.source.capture_block_id 148 | stream = d.source.stream_name 149 | telstate = d.source.telstate 150 | # XXX Replace private member with public corrprod index member when it exists 151 | corrprod_mask = d._corrprod_keep 152 | rdb_filename = PurePosixPath(urlparse(args.source).path).name 153 | 154 | telstate_overrides = katsdptelstate.TelescopeState() 155 | # Override bls_ordering in telstate (in stream namespace) to match dataset selection 156 | telstate_overrides.view(stream)['bls_ordering'] = d.corr_products 157 | telstate_overrides.view(stream)['n_bls'] = len(d.corr_products) 158 | os.makedirs(args.dest / cbid, exist_ok=True) 159 | out_store = NpyFileChunkStore(args.dest) 160 | # Iterate over all stream views, setting up Dask graph for each chunked array 161 | graphs = [] 162 | for view in [telstate] + extra_flag_streams(telstate, cbid, stream): 163 | graphs.extend(stream_graphs(view, d.source.data.store, corrprod_mask, 164 | telstate_overrides, out_store)) 165 | 166 | # Save original telstate + overrides to new RDB file (without duplicate keys) 167 | unmodified_keys = set(telstate.keys()) - set(telstate_overrides.keys()) 168 | with RDBWriter(args.dest / cbid / rdb_filename) as rdbw: 169 | rdbw.save(telstate.backend, unmodified_keys) 170 | rdbw.save(telstate_overrides.backend) 171 | # Transfer chunks to final resting place, filtering them along the way 172 | with ProgressBar(): 173 | errors = da.compute(*graphs, num_workers=args.workers) 174 | # put_dask_array returns an array with an exception object per chunk 175 | for array_errors in errors: 176 | for chunk_error in array_errors.flat: 177 | if chunk_error is not None: 178 | raise chunk_error 179 | 180 | 181 | if __name__ == '__main__': 182 | main() 183 | -------------------------------------------------------------------------------- /katdal/spectral_window.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2011,2018,2021, National Research Foundation (SARAO) 3 | # 4 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 5 | # this file except in compliance with the License. 
You may obtain a copy 6 | # of the License at 7 | # 8 | # https://opensource.org/licenses/BSD-3-Clause 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | ################################################################################ 16 | 17 | import threading 18 | 19 | import numpy as np 20 | 21 | 22 | class SpectralWindow: 23 | """Spectral window specification. 24 | 25 | A spectral window is determined by the number of frequency channels produced 26 | by the correlator and their corresponding centre frequencies, as well as the 27 | channel width. The channels are assumed to be regularly spaced and to be the 28 | result of either lower-sideband downconversion (channel frequencies 29 | decreasing with channel index) or upper-sideband downconversion (frequencies 30 | increasing with index). For further information the receiver band and 31 | correlator product names are also available. 32 | 33 | .. warning:: 34 | 35 | Instances should be treated as immutable. Changing the attributes will 36 | lead to inconsistencies between them. 37 | 38 | Parameters 39 | ---------- 40 | centre_freq : float 41 | Centre frequency of spectral window, in Hz 42 | channel_width : float 43 | Bandwidth of each frequency channel, in Hz 44 | num_chans : int 45 | Number of frequency channels 46 | product : string, optional 47 | Name of data product / correlator mode 48 | sideband : {-1, +1}, optional 49 | Type of downconversion (-1 => lower sideband, +1 => upper sideband) 50 | band : {'L', 'UHF', 'S', 'X', 'Ku'}, optional 51 | Name of receiver / band 52 | bandwidth : float, optional 53 | The bandwidth of the whole spectral window, in Hz. If specified, 54 | `channel_width` is ignored and computed from the bandwidth. If not 55 | specified, bandwidth is computed from the channel width. Specifying 56 | this is a good idea if the channel width cannot be exactly represented 57 | in floating point. 58 | 59 | Attributes 60 | ---------- 61 | channel_freqs : array of float, shape (*F*,) 62 | Centre frequency of each frequency channel (assuming LSB mixing), in Hz 63 | """ 64 | 65 | def __init__(self, centre_freq, channel_width, num_chans, product=None, 66 | sideband=-1, band='L', bandwidth=None): 67 | if bandwidth is None: 68 | bandwidth = channel_width * num_chans 69 | else: 70 | channel_width = bandwidth / num_chans 71 | self.centre_freq = centre_freq 72 | self.channel_width = channel_width 73 | self.bandwidth = bandwidth 74 | self.num_chans = num_chans 75 | self.product = product if product is not None else '' 76 | self.sideband = sideband 77 | self.band = band 78 | # channel_freqs is computed on demand 79 | self._channel_freqs_lock = threading.Lock() 80 | self._channel_freqs = None 81 | 82 | @property 83 | def channel_freqs(self): 84 | with self._channel_freqs_lock: 85 | if self._channel_freqs is None: 86 | # Don't subtract half a channel width as channel 0 is centred on 0 Hz in baseband 87 | # We use self.bandwidth and self.num_chans to avoid rounding 88 | # errors that might accumulate if channel_width is inexact. 
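# In scalar terms, channel k is centred at
# centre_freq + sideband * (k - num_chans // 2) * (bandwidth / num_chans)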
89 | self._channel_freqs = self.centre_freq + self.sideband * self.bandwidth * ( 90 | np.arange(self.num_chans) - self.num_chans // 2) / self.num_chans 91 | return self._channel_freqs 92 | 93 | def __repr__(self): 94 | """Short human-friendly string representation of spectral window object.""" 95 | band = self.band if self.band else 'unknown' 96 | product = repr(self.product) if self.product else 'unknown' 97 | return (f"<katdal.SpectralWindow {band}-band product={product} " 98 | f"centre={self.centre_freq / 1e6:.3f} MHz bandwidth={self.bandwidth / 1e6:.3f} MHz " 99 | f"channels={self.num_chans} at {id(self):#x}>") 100 | 101 | @property 102 | def _description(self): 103 | """Complete hashable representation, used internally for comparisons.""" 104 | # Pick values that enable a sensible ordering of spectral windows 105 | # Using self.bandwidth is generally redundant but may play a role in 106 | # obscure rounding cases. 107 | return (self.centre_freq, 108 | -self.channel_width, self.num_chans, self.sideband, 109 | self.band, self.product, -self.bandwidth) 110 | 111 | def __eq__(self, other): 112 | """Equality comparison operator.""" 113 | return self._description == ( 114 | other._description if isinstance(other, SpectralWindow) else other) 115 | 116 | def __ne__(self, other): 117 | """Inequality comparison operator.""" 118 | return not (self == other) 119 | 120 | def __lt__(self, other): 121 | """Less-than comparison operator (needed for sorting and np.unique).""" 122 | return self._description < ( 123 | other._description if isinstance(other, SpectralWindow) else other) 124 | 125 | def __hash__(self): 126 | """Base hash on description tuple, just like equality operator.""" 127 | return hash(self._description) 128 | 129 | def subrange(self, first, last): 130 | """Get a new :class:`SpectralWindow` representing a subset of the channels. 131 | 132 | The returned :class:`SpectralWindow` covers the same frequencies as 133 | channels [first, last) of the original. 134 | 135 | Raises 136 | ------ 137 | IndexError 138 | If [first, last) is not a (non-empty) subinterval of the channels 139 | """ 140 | if not (0 <= first < last <= self.num_chans): 141 | raise IndexError('channel indices out of range') 142 | channel_shift = (first + last) // 2 - self.num_chans // 2 143 | num_chans = last - first 144 | # We use self.bandwidth and self.num_chans to avoid rounding errors 145 | # that might accumulate if channel_width is inexact. 146 | centre_freq = self.centre_freq \ 147 | + channel_shift * self.bandwidth * self.sideband / self.num_chans 148 | return SpectralWindow( 149 | centre_freq, self.channel_width, num_chans, 150 | self.product, self.sideband, self.band, 151 | bandwidth=self.bandwidth * num_chans / self.num_chans) 152 | 153 | def rechannelise(self, num_chans): 154 | """Get a new :class:`SpectralWindow` with a different number of channels. 155 | 156 | The returned :class:`SpectralWindow` covers the same frequencies as the 157 | original, but dividing the bandwidth into a different number of 158 | channels.
159 | """ 160 | if num_chans == self.num_chans: 161 | return self 162 | # Find the centre of the bandwidth (whereas centre_freq is the centre 163 | # of the middle channel) 164 | centre_freq = self.centre_freq 165 | if self.num_chans % 2 == 0: 166 | centre_freq -= self.sideband * 0.5 * self.channel_width 167 | channel_width = self.bandwidth / num_chans 168 | # Now convert to the centre of the new middle channel 169 | if num_chans % 2 == 0: 170 | centre_freq += self.sideband * 0.5 * channel_width 171 | return SpectralWindow( 172 | centre_freq, channel_width, num_chans, 173 | self.product, self.sideband, self.band, 174 | bandwidth=self.bandwidth) 175 | -------------------------------------------------------------------------------- /NEWS.rst: -------------------------------------------------------------------------------- 1 | History 2 | ======= 3 | 4 | 0.23 (2024-06-28) 5 | ----------------- 6 | * New `mvf_download` script (also promote `mvf_copy` and remove junk) (#380) 7 | * Select targets by their tags (#377) 8 | * Rename `np.product` to support numpy >= 2.0 and make unit tests more robust (#372) 9 | 10 | 0.22 (2023-11-28) 11 | ----------------- 12 | * Restore np.bool in Numba averaging function to prevent mvftoms crash (#370) 13 | * Replace underscores with dashes when loading old buckets from RDBs (#370) 14 | * Select multiple targets with same name to avoid dropped scans in MS (#369) 15 | * Support on-the-fly (OTF) scans in mvftoms (#366) 16 | 17 | 0.21 (2023-05-12) 18 | ----------------- 19 | * Fix support for numpy >= 1.24 and move unit tests from nose to pytest (#361) 20 | * Complete rewrite of S3ChunkStore retries for more robust archive downloads (#363) 21 | * Remove IMAGING_WEIGHT column full of zeroes from MS (#356) 22 | * Improve tests with ES256-encoded JWT tokens and more robust MinIO health check (#360) 23 | 24 | 0.20.1 (2022-04-29) 25 | ------------------- 26 | * Fix broken `dataset.vis[n]` due to DaskLazyIndexer / ChunkStore interaction (#355) 27 | 28 | 0.20 (2022-04-14) 29 | ----------------- 30 | * Fix support for dask >= 2022.01.1 in ChunkStore (#351) 31 | * Allow mvftoms to continue with partial MS after an interruption (#348) 32 | * New mvf_copy.py script that can be used to extract autocorrelations only (#349) 33 | * Treat Ceph 403 errors properly in S3ChunkStore (#352) 34 | 35 | 0.19 (2021-11-23) 36 | ----------------- 37 | * Support scans and non-radec targets like planets in mvftoms (#333) 38 | * Expose the raw flags of MVF4 datasets (#335) 39 | * Expose CBF F-engine sensors: applied delays, phases and gains (#338) 40 | * Verify that S3 bucket is not empty to detect datasets archived to tape (#344) 41 | * Populate SIGMA_SPECTRUM and redo SIGMA and WEIGHT in mvftoms (#347) 42 | * Have a sensible DataSet.name and also add a separate DataSet.url (#337) 43 | * Allow deselection of antennas using '~m0XX' (#340) 44 | * Allow nested DaskLazyIndexers (#336) 45 | * Fix mvftoms on macOS and Python 3.8+ (#339) 46 | 47 | 0.18 (2021-04-20) 48 | ----------------- 49 | * Switch to PyJWT 2 and Python 3.6, cleaning up Python 2 relics (#321 - #323) 50 | * Allow preselection of channels and dumps upon katdal.open() to save time and memory (#324) 51 | * Allow user to select fields, scans and antennas in mvftoms (#269) 52 | * Support h5py 3.0 string handling in MVF3 (#331) 53 | * Refactor requirement files to remove recursive dependencies (#329) 54 | 55 | 0.17 (2021-01-27) 56 | ----------------- 57 | * This is the last release that will support Python 3.5 58 | * Pin PyJWT version to 
1.x to avoid breaking API changes (#320) 59 | * Van Vleck correction! (autocorrelations only, though) (#316) 60 | * Expose excision, aka raw weights (#308) 61 | * Better unit testing of DataSource and S3ChunkStore in general (#319) 62 | * Support indexed telstate keys (the 1000th cut that killed Python 2) (#304) 63 | * Split out separate utility classes for Minio (#310) 64 | * Fix filtering of sensor events with invalid status (#306) 65 | 66 | 0.16 (2020-08-28) 67 | ----------------- 68 | * This is the last release that will support Python 2 (python2 maintenance branch) 69 | * New 'time_offset' sensor property that adjusts timestamps of any sensor (#307) 70 | * Fix calculation of cbf_dump_period for 'wide' / 'narrowN' instruments (#301) 71 | * Increase katstore search window by 600 seconds to find infrequent updates (#302) 72 | * Refactor SensorData to become a lazy abstract interface without caching (#292) 73 | * Refactor SensorCache to use MutableMapping (#300) 74 | * Fix rx_serial sensor use and file mode warning in MVFv3 files (#298, #299) 75 | 76 | 0.15 (2020-03-13) 77 | ----------------- 78 | * Improve S3 chunk store: check tokens, improve timeouts and retries (#272 - #277) 79 | * Retry truncated reads and 50x errors due to S3 server overload (#274) 80 | * Apply flux calibration if available (#278, #279) 81 | * Improve mvf_rechunk and mvf_read_benchmark scripts (#280, #281, #284) 82 | * Fix selection by target description (#271) 83 | * Mark Python 2 support as deprecated (#282) 84 | 85 | 0.14 (2019-10-02) 86 | ----------------- 87 | * Make L2 product by applying self-calibration corrections (#253 - #256) 88 | * Speed up uvw calculations (#252, #262) 89 | * Produce documentation on readthedocs.org (#244, #245, #247, #250, #261) 90 | * Clean up mvftoms and fix REST_FREQUENCY in SOURCE sub-table (#258) 91 | * Support katstore64 API (#265) 92 | * Improve chunk store: detect short reads, speed up handling of lost data (#259, #260) 93 | * Use katpoint 0.9 and dask 1.2.1 features (#262, #243) 94 | 95 | 0.13 (2019-05-09) 96 | ----------------- 97 | * Load RDB files straight from archive (#233, #241) 98 | * Retrieve raw sensor data from CAM katstore (#234) 99 | * Work around one-CBF-dump offset issue (#238) 100 | * Improved MS output: fixed RECEPTOR_ANGLE (#230), added WEIGHT_SPECTRUM (#231) 101 | * Various optimisations to applycal (#224), weights (#226), S3 reads (#229) 102 | * Use katsdptelstate 0.8 and dask 1.1 features (#228, #233, #240) 103 | 104 | 0.12 (2019-02-12) 105 | ----------------- 106 | * Optionally make L1 product by applying calibration corrections (#194 - #198) 107 | * Let default reference antenna in v4 datasets be "array" antenna (#202, #220) 108 | * Use katsdptelstate v0.7: generic encodings, memory backend (#196, #201, #212) 109 | * Prepare for multi-dump chunks (#213, #214, #216, #217, #219) 110 | * Allow L1 flags to be ignored (#209, #210) 111 | * Deal with deprecated dask features (#204, #215) 112 | * Remove RADOS chunk store (it's all via S3 from here on) 113 | 114 | 0.11 (2018-10-15) 115 | ----------------- 116 | * Python 3 support via python-future (finally!) 
117 | * Load L1 flags if available (#164) 118 | * Reduced memory usage (#165) and speedups (#155, #169, #170, #171, #182) 119 | * S3 chunk store now uses requests directly instead of via botocore (#166) 120 | * Let lazy indexer use oindex semantics like in the past (#180) 121 | * Fix concatenated data sets (#161) 122 | * Fix IPython / Jupyter tab completion for sensor cache (#176) 123 | 124 | 0.10.1 (2018-05-18) 125 | ------------------- 126 | * Restore NumPy 1.14 support (all data flagged otherwise) 127 | 128 | 0.10 (2018-05-17) 129 | ----------------- 130 | * Rally around the MeerKAT Visibility Format (MVF) 131 | * First optimised converter from MVF v4 to MS: mvftoms 132 | * Latest v4 fixes (synthetic timestamps, autodetection, NPY files in Ceph) 133 | * Flag and zero missing chunks 134 | * Now requires katsdptelstate (released), dask, h5py 2.3 and Python 2.7 135 | * Restore S3 unit tests and NumPy 1.11 (on Ubuntu 16.04) support 136 | 137 | 0.9.5 (2018-02-22) 138 | ------------------ 139 | * New HDF5 v3.9 file format in anticipation of v4 (affects obs_params) 140 | * Fix receiver serial numbers in recent MeerKAT data sets 141 | * Add dask support to ChunkStore 142 | * katdal.open() works on v4 RDB files 143 | 144 | 0.9 (2018-01-16) 145 | ---------------- 146 | * New ChunkStore and telstate-based parser for future v4 format 147 | * Use python-casacore (>=2.2.1) to create Measurement Sets instead of blank.ms 148 | * Read new-style noise diode sensor names, serial numbers and L0 stream metadata 149 | * Select multiple polarisations (useful for cross-pol) 150 | * Relax the "expected number of dumps" check to avoid spurious warnings 151 | * Fix NumPy 1.14 warnings 152 | 153 | 0.8 (2017-08-08) 154 | ---------------- 155 | * Fix upside-down MeerKAT images 156 | * SensorData rework to load gain solutions and access telstate efficiently 157 | * Improve mapping of sensor events onto dumps, especially for long (8 s) dumps 158 | * Fix NumPy 1.13 warnings and errors 159 | * Support UHF receivers 160 | 161 | 0.7.1 (2017-01-19) 162 | ------------------ 163 | 164 | * Fix MODEL_DATA / CORRECTED_DATA shapes in h5toms 165 | * Produce calibration solution tables in h5toms and improve error messages 166 | * Autodetect receiver band on older RTS files 167 | 168 | 0.7 (2016-12-14) 169 | ---------------- 170 | 171 | * Support weights in file and improve vis / weights / flags API 172 | * Support multiple receivers and improve centre frequency extraction 173 | * Speed up h5toms by ordering visibilities by time 174 | * Fix band selection and corr products for latest SDP (cam2telstate) 175 | * Allow explicit MS names in h5toms 176 | 177 | 0.6 (2016-09-16) 178 | ---------------- 179 | 180 | * Initial release of katdal 181 | -------------------------------------------------------------------------------- /katdal/test/test_sensordata.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2018-2022, National Research Foundation (SARAO) 3 | # 4 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 5 | # this file except in compliance with the License. 
You may obtain a copy 6 | # of the License at 7 | # 8 | # https://opensource.org/licenses/BSD-3-Clause 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | ################################################################################ 16 | 17 | """Tests for :py:mod:`katdal.sensordata`.""" 18 | 19 | from collections import OrderedDict 20 | from unittest import mock 21 | 22 | import numpy as np 23 | import pytest 24 | 25 | from katdal.sensordata import (SensorCache, SensorData, SimpleSensorGetter, 26 | remove_duplicates_and_invalid_values, 27 | telstate_decode, to_str) 28 | 29 | 30 | def assert_equal_typed(a, b): 31 | assert a == b 32 | assert type(a) == type(b) 33 | 34 | 35 | class TestToStr: 36 | def test_non_str(self): 37 | assert_equal_typed(to_str(3), 3) 38 | assert_equal_typed(to_str(None), None) 39 | 40 | def test_simple_str(self): 41 | assert_equal_typed(to_str(b'hello'), 'hello') 42 | assert_equal_typed(to_str('hello'), 'hello') 43 | 44 | def test_non_ascii(self): 45 | assert_equal_typed(to_str(b'caf\xc3\xa9'), 'café') 46 | assert_equal_typed(to_str('café'), 'café') 47 | 48 | def test_list(self): 49 | assert_equal_typed(to_str([b'hello', 'world']), ['hello', 'world']) 50 | 51 | def test_tuple(self): 52 | assert_equal_typed(to_str((b'hello', 'world')), ('hello', 'world')) 53 | 54 | def test_dict(self): 55 | assert_equal_typed(to_str({b'hello': b'world', 'abc': 'xyz'}), 56 | {'hello': 'world', 'abc': 'xyz'}) 57 | 58 | def test_custom_dict(self): 59 | assert_equal_typed(to_str(OrderedDict([(b'hello', b'world'), ('abc', 'xyz')])), 60 | OrderedDict([('hello', 'world'), ('abc', 'xyz')])) 61 | 62 | def test_numpy_str(self): 63 | a = np.array([[b'abc', b'def'], [b'ghi', b'jk']]) 64 | b = np.array([['abc', 'def'], ['ghi', 'jk']]) 65 | c = np.array([['abc', 'def'], ['ghi', 'jk']]) 66 | np.testing.assert_array_equal(to_str(a), c) 67 | np.testing.assert_array_equal(to_str(b), c) 68 | 69 | def test_numpy_object(self): 70 | a = np.array([b'abc', 'def', (b'xyz', 'uvw')], dtype='O') 71 | b = np.array(['abc', 'def', ('xyz', 'uvw')], dtype='O') 72 | np.testing.assert_array_equal(to_str(a), b) 73 | 74 | 75 | @mock.patch('katsdptelstate.encoding._allow_pickle', True) 76 | @mock.patch('katsdptelstate.encoding._warn_on_pickle', False) 77 | def test_telstate_decode(): 78 | raw = "S'1'\n." 
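    # "S'1'\n." is a protocol-0 pickle stream that unpickles to the string '1'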
79 | assert telstate_decode(raw) == '1' 80 | assert telstate_decode(raw.encode()) == '1' 81 | assert telstate_decode(np.void(raw.encode())) == '1' 82 | assert telstate_decode('l', no_decode=('l', 's', 'u', 'x')) == 'l' 83 | raw_np = ("cnumpy.core.multiarray\nscalar\np1\n(cnumpy\ndtype\np2\n(S'f8'\nI0\nI1\ntRp3\n" 84 | "(I3\nS'<'\nNNNI-1\nI-1\nI0\ntbS'8\\xdf\\xd4(\\x89\\xfc\\xef?'\ntRp4\n.") 85 | value_np = telstate_decode(raw_np) 86 | assert value_np == 0.9995771214953271 87 | assert isinstance(value_np, np.float64) 88 | 89 | 90 | class TestSensorCache: 91 | def _cache_data(self): 92 | sensors = [ 93 | ('foo', [4.0, 7.0], [3.0, 6.0]), 94 | ('cat', [2.0, 6.0], ['hello', 'world']) 95 | ] 96 | cache_data = {} 97 | for name, ts, values in sensors: 98 | sd = SimpleSensorGetter(name, np.asarray(ts), np.asarray(values)) 99 | cache_data[name] = sd 100 | return cache_data 101 | 102 | def setup_method(self): 103 | self.cache = SensorCache(self._cache_data(), timestamps=np.arange(10.), dump_period=1.0) 104 | 105 | def test_extract_float(self): 106 | data = self.cache.get('foo', extract=True) 107 | np.testing.assert_array_equal(data, [3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 5.0, 6.0, 6.0, 6.0]) 108 | 109 | def test_extract_categorical(self): 110 | data = self.cache.get('cat', extract=True) 111 | H = 'hello' 112 | W = 'world' 113 | np.testing.assert_array_equal(data[:], [H, H, H, H, H, H, W, W, W, W]) 114 | 115 | def test_alias(self): 116 | self.cache = SensorCache( 117 | self._cache_data(), timestamps=np.arange(10.), dump_period=1.0, 118 | aliases={'zz': 'at'}) 119 | # Check that adding the alias didn't lead to extraction 120 | assert isinstance(self.cache.get('czz', extract=False), SimpleSensorGetter) 121 | np.testing.assert_array_equal(self.cache['czz'], self.cache['cat']) 122 | 123 | def test_len(self): 124 | assert len(self.cache) == 2 125 | 126 | def test_keys(self): 127 | assert sorted(self.cache.keys()) == ['cat', 'foo'] 128 | 129 | def test_contains(self): 130 | assert 'cat' in self.cache 131 | assert 'foo' in self.cache 132 | assert 'dog' not in self.cache 133 | template = 'Antennas/{ant}/{param1}_{param2}' 134 | self.cache.virtual[template] = lambda x: None 135 | assert template not in self.cache 136 | 137 | def test_setitem_delitem(self): 138 | self.cache['bar'] = SimpleSensorGetter('bar', np.array([1.0]), np.array([0.0])) 139 | np.testing.assert_array_equal(self.cache['bar'], np.zeros(10)) 140 | del self.cache['bar'] 141 | assert 'bar' not in self.cache 142 | 143 | def test_sensor_time_offset(self): 144 | data = self.cache.get('foo', extract=True, time_offset=-1.0) 145 | np.testing.assert_array_equal(data, [3.0, 3.0, 3.0, 3.0, 4.0, 5.0, 6.0, 6.0, 6.0, 6.0]) 146 | 147 | def test_virtual_sensors(self): 148 | calculate_value = mock.Mock() 149 | 150 | def _check_sensor(cache, name, **kwargs): 151 | """Check that virtual sensor function gets the expected parameters.""" 152 | assert kwargs == params 153 | calculate_value() 154 | value = kwargs['param2'] 155 | cache[name] = value 156 | return value 157 | 158 | # Set up a virtual sensor and trigger it to get a value 159 | params = {'ant': 'm000', 'param1': 'one', 'param2': 'two'} 160 | template = 'Antennas/{ant}/{param1}_{param2}' 161 | self.cache.virtual[template] = _check_sensor 162 | value = self.cache.get(template.format(**params)) 163 | assert value == params['param2'] 164 | assert calculate_value.call_count == 1 165 | # Check that the value was taken from the cache the second time around 166 | value = self.cache.get(template.format(**params)) 167 | 
assert value == params['param2'] 168 | assert calculate_value.call_count == 1 169 | # If your parameter values contain underscores, don't use it as delimiter 170 | params = {'ant': 'm000', 'param1': 'one', 'param2': 'two_three'} 171 | with pytest.raises(AssertionError): 172 | self.cache.get(template.format(**params)) 173 | template = 'Antennas/{ant}/{param1}/{param2}' 174 | # The updated template has not yet been added to the cache 175 | with pytest.raises(KeyError): 176 | self.cache.get(template.format(**params)) 177 | self.cache.virtual[template] = _check_sensor 178 | value = self.cache.get(template.format(**params)) 179 | assert value == params['param2'] 180 | assert calculate_value.call_count == 2 181 | 182 | # TODO: more tests required: 183 | # - extract=False 184 | # - selection 185 | 186 | 187 | def test_sensor_cleanup(): 188 | # The first sensor event has a status of "unknown" and is therefore invalid. It happened 189 | # after the second (valid) event, though, and snuck through due to a bug (now fixed). 190 | # This mirrors the behaviour of the cbf_1_wide_input_labelling sensor in CBID 1588667937. 191 | timestamp = np.array([1.0, 0.0, 3.0, 3.0, 3.0, 3.0, 2.0]) 192 | value = np.array(['broke', 'a', 'c', 'c', 'c', 'd', 'b']) 193 | status = np.array(['unknown', 'nominal', 'nominal', 'nominal', 'warn', 'error', 'nominal']) 194 | dirty = SensorData('test', timestamp, value, status) 195 | clean = remove_duplicates_and_invalid_values(dirty) 196 | assert clean.status is None 197 | np.testing.assert_array_equal(clean.value, np.array(['a', 'b', 'd'])) 198 | np.testing.assert_array_equal(clean.timestamp, np.array([0.0, 2.0, 3.0])) 199 | -------------------------------------------------------------------------------- /katdal/ms_async.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2018-2019,2021-2023, National Research Foundation (SARAO) 3 | # 4 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 5 | # this file except in compliance with the License. You may obtain a copy 6 | # of the License at 7 | # 8 | # https://opensource.org/licenses/BSD-3-Clause 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | ################################################################################ 16 | 17 | """Write data to a Measurement Set asynchronously. 18 | 19 | This uses multiprocessing, a queue, and a circular buffer in shared memory to 20 | pass visibility data to a separate process that actually writes to the 21 | measurement set. 22 | 23 | This is largely an implementation detail of the mvftoms.py script, and might 24 | not be suited to other use cases. It is put into a separate module as a 25 | workaround for https://bugs.python.org/issue9914. 26 | """ 27 | 28 | import contextlib 29 | import multiprocessing 30 | import multiprocessing.sharedctypes 31 | from collections import namedtuple 32 | 33 | import katpoint 34 | import numpy as np 35 | 36 | from . 
import ms_extra 37 | 38 | 39 | class RawArray: 40 | """Shared memory array, in representation that can be passed through multiprocessing queue""" 41 | def __init__(self, shape, dtype): 42 | self.shape = shape 43 | self.dtype = np.dtype(dtype) 44 | size = self.dtype.itemsize * int(np.prod(shape)) 45 | self.storage = multiprocessing.sharedctypes.RawArray('c', size) 46 | 47 | def asarray(self): 48 | """Return numpy array representation""" 49 | return np.frombuffer(self.storage, self.dtype).reshape(self.shape) 50 | 51 | 52 | QueueItem = namedtuple('QueueItem', ['slot', 'target', 'time_utc', 'dump_time_width', 53 | 'field_id', 'state_id', 'scan_itr']) 54 | ScanResult = namedtuple('ScanResult', ['scan_size']) 55 | EndOfScan = namedtuple('EndOfScan', []) 56 | 57 | 58 | def ms_writer_process( 59 | work_queue, result_queue, options, antennas, cp_info, ms_name, 60 | raw_vis_data, raw_weight_data, raw_flag_data, start_row): 61 | """ 62 | Function to be run in a separate process for writing to a Measurement Set. 63 | The MS is assumed to have already been created with the appropriate 64 | columns. 65 | 66 | Incoming work is provided by submitting instances of :class:`QueueItem` 67 | to `work_queue`. The `slot` indexes the first dimension of the shared 68 | memory arrays. One may also submit an :class:`EndOfScan`, which will flush 69 | to disk and return a :class:`ScanResult` through the `result_queue` (these 70 | are not actually required to match katdal scans). 71 | 72 | To terminate the process, submit ``None`` to `work_queue`. 73 | 74 | If an exception occurs, it will be placed into `result_queue`, after which 75 | work_queue items will be fetched and discarded until ``None`` is received. 76 | When finished (either successfully or after an error), ``None`` is put in 77 | `result_queue`. 78 | 79 | Parameters 80 | ---------- 81 | work_queue : :class:`multiprocessing.Queue` 82 | Incoming work. Note that this function gives no explicit indication 83 | when it is done with a piece of work, so the queue capacity needs to 84 | be bounded to prevent data races. 85 | result_queue : :class:`multiprocessing.Queue` 86 | Information about progress (see :class:`ScanResult`) 87 | options : :class:`argparse.Namespace` 88 | Command-line options to mvftoms 89 | antennas : list of :class:`katpoint.Antenna` 90 | Antennas (used to compute UVW coordinates) 91 | cp_info : namedtuple 92 | Correlation product info (see mvftoms.py) 93 | ms_name : str 94 | Name of the Measurement Set to write 95 | raw_vis_data, raw_weight_data, raw_flag_data : :class:`RawArray` 96 | Circular buffers for the data, with shape 97 | (slots, time, baseline, channel, pol). 
98 | start_row : int 99 | Row in Measurement Set where output will start 100 | """ 101 | 102 | none_seen = False 103 | try: 104 | vis_arrays = raw_vis_data.asarray() 105 | weight_arrays = raw_weight_data.asarray() 106 | flag_arrays = raw_flag_data.asarray() 107 | scan_size = 0 108 | tdiff = vis_arrays.shape[1] 109 | nbl = vis_arrays.shape[2] 110 | 111 | main_table = ms_extra.open_table(ms_name, verbose=options.verbose) 112 | with contextlib.closing(main_table): 113 | array_centre = antennas[0].array_reference_antenna() 114 | while True: 115 | item = work_queue.get() 116 | if item is None: 117 | none_seen = True 118 | break 119 | elif isinstance(item, EndOfScan): 120 | main_table.flush() # Mostly to get realistic throughput stats 121 | result_queue.put(ScanResult(scan_size)) 122 | scan_size = 0 123 | else: 124 | # Extract the slot, and flatten time and baseline into a single axis 125 | new_shape = (-1, vis_arrays.shape[-2], vis_arrays.shape[-1]) 126 | vis_data = vis_arrays[item.slot].reshape(new_shape) 127 | weight_data = weight_arrays[item.slot].reshape(new_shape) 128 | flag_data = flag_arrays[item.slot].reshape(new_shape) 129 | 130 | # Iterate through baselines, computing UVW coordinates 131 | # for a chunk of timesteps. Note that we can't rely on the 132 | # u, v, w properties of the dataset because those 133 | # correspond to the original dumps, and we might be 134 | # averaging in time. 135 | uvw_ant = item.target.uvw(antennas, item.time_utc, array_centre) 136 | # Permute from axis, time, antenna to time, antenna, axis 137 | uvw_ant = np.transpose(uvw_ant, (1, 2, 0)) 138 | # Compute baseline UVW coordinates from per-antenna coordinates. 139 | # The sign convention matches `CASA`_, rather than the 140 | # Measurement Set `definition`_. 141 | # .. _CASA: https://casa.nrao.edu/Memos/CoordConvention.pdf 142 | # .. _definition: https://casa.nrao.edu/Memos/229.html#SECTION00064000000000000000 143 | uvw_coordinates = (np.take(uvw_ant, cp_info.ant1_index, axis=1) 144 | - np.take(uvw_ant, cp_info.ant2_index, axis=1)) 145 | # Flatten time and baseline axes together 146 | uvw_coordinates = uvw_coordinates.reshape(-1, 3) 147 | 148 | # Convert averaged UTC timestamps to MJD seconds. 
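                        # (Timestamp.to_mjd() returns days, hence the 24 * 60 * 60 factor to get seconds)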
149 | # Blow time up to (ntime*nbl,) 150 | out_mjd = np.asarray([katpoint.Timestamp(t).to_mjd() * 24 * 60 * 60 151 | for t in item.time_utc]) 152 | 153 | out_mjd = np.broadcast_to(out_mjd[:, np.newaxis], (tdiff, nbl)).ravel() 154 | 155 | # Repeat antenna indices to (ntime*nbl,) 156 | a1 = np.broadcast_to(cp_info.ant1_index[np.newaxis, :], (tdiff, nbl)).ravel() 157 | a2 = np.broadcast_to(cp_info.ant2_index[np.newaxis, :], (tdiff, nbl)).ravel() 158 | 159 | # Blow field ID up to (ntime*nbl,) 160 | big_field_id = np.full((tdiff * nbl,), item.field_id, dtype=np.int32) 161 | big_state_id = np.full((tdiff * nbl,), item.state_id, dtype=np.int32) 162 | big_scan_itr = np.full((tdiff * nbl,), item.scan_itr, dtype=np.int32) 163 | 164 | # Setup model_data and corrected_data if required 165 | model_data = None 166 | corrected_data = None 167 | 168 | if options.model_data: 169 | # unity intensity zero phase model data set, same shape as vis_data 170 | model_data = np.ones(vis_data.shape, dtype=np.complex64) 171 | # corrected data set copied from vis_data 172 | corrected_data = vis_data 173 | 174 | # Populate dictionary for write to MS 175 | main_dict = ms_extra.populate_main_dict( 176 | uvw_coordinates, vis_data, 177 | flag_data, weight_data, out_mjd, a1, a2, 178 | item.dump_time_width, big_field_id, big_state_id, 179 | big_scan_itr, model_data, corrected_data) 180 | 181 | # Write data to MS. 182 | nrows = ms_extra.write_rows(main_table, main_dict, 183 | options.verbose, start_row) 184 | start_row += nrows 185 | 186 | # Calculate bytes written from the summed arrays in the dict 187 | scan_size += sum(a.nbytes for a in main_dict.values() 188 | if isinstance(a, np.ndarray)) 189 | except Exception as error: 190 | result_queue.put(error) 191 | while not none_seen: 192 | item = work_queue.get() 193 | if item is None: 194 | none_seen = True 195 | finally: 196 | result_queue.put(None) 197 | -------------------------------------------------------------------------------- /scripts/mvf_download.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | 3 | ################################################################################ 4 | # Copyright (c) 2023-2024, National Research Foundation (SARAO) 5 | # 6 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 7 | # this file except in compliance with the License. You may obtain a copy 8 | # of the License at 9 | # 10 | # https://opensource.org/licenses/BSD-3-Clause 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | ################################################################################ 18 | 19 | # 20 | # Download an MVF4 dataset using rclone. 
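# Needs rclone >= 1.56 on the PATH (the script checks this before downloading).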
21 | # 22 | # Ludwig Schwardt 23 | # 16 May 2023 24 | # 25 | 26 | import argparse 27 | import json 28 | import os 29 | import shutil 30 | import subprocess 31 | import sys 32 | from collections import defaultdict 33 | from pathlib import Path, PurePosixPath 34 | from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse 35 | 36 | import dask 37 | import katdal 38 | from katdal.chunkstore import _blocks_ravel 39 | from katdal.lazy_indexer import dask_getitem 40 | from packaging import version 41 | 42 | # This version is good for file-less config, enabling --config "" and --files-from - 43 | MINIMUM_RCLONE_VERSION = version.Version('1.56') 44 | DESCRIPTION = """ 45 | Download MVFv4 dataset (or a subset of chunks) from S3 to disk using rclone. 46 | 47 | You need rclone (https://rclone.org/downloads/) if it is not on your system. 48 | It is a single executable file that you could download to your user account. 49 | Just ensure that it is on your PATH; no need to configure it any further. 50 | 51 | Run the script like this: 52 | 53 | mvf_download.py https://archive/1698676533/1698676533_sdp_l0.full.rdb?token=<> dest 54 | 55 | Data will appear in three subdirectories in the specified output directory as 56 | 57 | dest/1698676533/... 58 | dest/1698676533-sdp-l0/... 59 | dest/1698676533-sdp-l1-flags/... 60 | 61 | Open the local dataset like this: 62 | 63 | d = katdal.open("dest/1698676533/1698676533_sdp_l0.full.rdb") 64 | 65 | If the script crashes or you terminate it, you can just run it again and 66 | it will carry on, fixing any half-downloaded chunks along the way. If it 67 | completes, you can be sure that all your data is safely downloaded. 68 | 69 | BONUS: you can even copy just parts of the data (e.g. the tracks and not the 70 | slews). This works as long as your selection picks out a subset of the chunks 71 | but leaves the chunks themselves intact. It is well suited for time-based 72 | selections. 73 | 74 | Because MeerKAT data is chunked first in time and then in frequency, but not 75 | in correlation product, this won't help to select a subset of antennas or 76 | baselines or autocorrelations, as that would require breaking up chunks into 77 | smaller chunks. For that, consider using the mvf_copy.py script instead, which 78 | is also useful if you want to copy a subset of data from disk to disk. 79 | 80 | Note that you have to pass a JSON object (which resembles a Python dict) as a 81 | string to the --select argument. The "dict" contains keyword arguments meant 82 | for the DataSet.select() method. It's important to note that the strings in 83 | the dict need double quotes (") while the entire string has to be encapsulated 84 | in single quotes ('). Some examples: 85 | 86 | mvf_download.py url directory --select='{"scans": "track"}' 87 | mvf_download.py url directory --select='{"scans": 1}' 88 | mvf_download.py url directory --select='{"scans": [0, 1, 2]}' 89 | mvf_download.py url directory --select='{"targets": "J1939-6342"}' 90 | 91 | The chunks that are not copied will appear as "lost" data in the downloaded 92 | dataset, but that is fine. If you apply the same selection, you won't see it. 
93 | """ 94 | 95 | 96 | def parse_args(args=None, namespace=None): 97 | """Parse script arguments into script-specific ones and ones meant for rclone.""" 98 | parser = argparse.ArgumentParser( 99 | usage='%(prog)s [-h] [--select JSON] [--workers N] ' 100 | 'source dest [rclone options]', 101 | description=DESCRIPTION, 102 | epilog='Any extra script options are passed to rclone.', 103 | formatter_class=argparse.RawDescriptionHelpFormatter, 104 | ) 105 | parser.add_argument('source', help='Dataset URL (including token if needed)') 106 | parser.add_argument('dest', type=Path, help='Output directory') 107 | parser.add_argument('--select', type=json.loads, default={}, 108 | help='Kwargs for katdal.DataSet.select as a JSON object') 109 | parser.add_argument('--workers', type=int, default=16, 110 | help='Number of rclone threads for parallel I/O [%(default)s]') 111 | mvf_download_args, rclone_args = parser.parse_known_args(args, namespace) 112 | rclone_args = [ 113 | '--transfers', str(mvf_download_args.workers), 114 | '--checkers', str(mvf_download_args.workers + 4) 115 | ] + rclone_args 116 | return mvf_download_args, rclone_args 117 | 118 | 119 | def chunk_names(vfw, keep): 120 | """Names of chunks covered by selection `keep` in all storage arrays in `vfw`.""" 121 | all_chunks = defaultdict(list) 122 | for array, info in vfw.chunk_info.items(): 123 | darray = vfw.store.get_dask_array( 124 | array, 125 | info['chunks'], 126 | info['dtype'], 127 | index=vfw.preselect_index, 128 | errors='dryrun', 129 | ) 130 | kept_blocks = _blocks_ravel(dask_getitem(darray, keep[:darray.ndim])) 131 | chunks = sorted(chunk.name + '.npy' for chunk in dask.compute(*kept_blocks)) 132 | all_chunks[info['prefix']].extend(chunks) 133 | return all_chunks 134 | 135 | 136 | def has_recent_rclone(): 137 | """Check that rclone is installed and has an appropriate version.""" 138 | try: 139 | result = subprocess.run(['rclone', 'version'], capture_output=True, check=True) 140 | except FileNotFoundError: 141 | print('The rclone tool was not found. Please install at least version ' 142 | f'{MINIMUM_RCLONE_VERSION} (see rclone.org) or check the path.') 143 | else: 144 | installed_version = version.parse(result.stdout.split()[1].decode()) 145 | if installed_version >= MINIMUM_RCLONE_VERSION: 146 | return True 147 | print(f'Found rclone {installed_version} but the script needs version ' 148 | f'{MINIMUM_RCLONE_VERSION}. 
See rclone.org for installation options.') 149 | return False 150 | 151 | 152 | def rclone_fit_output_to_terminal(args): 153 | """Reduce rclone output to a single line if it won't fit on terminal.""" 154 | new_args = args.copy() 155 | # Find last instances of --transfers and --checkers flags (guaranteed one of each) 156 | parser = argparse.ArgumentParser() 157 | parser.add_argument('--transfers', action='append', type=int) 158 | parser.add_argument('--checkers', action='append', type=int) 159 | n, _ = parser.parse_known_args([str(arg) for arg in new_args]) 160 | if n.transfers[-1] + n.checkers[-1] + 6 > shutil.get_terminal_size().lines: 161 | new_args.append('--stats-one-line') 162 | return new_args 163 | 164 | 165 | def rclone_copy(endpoint, bucket, dest, args, token=None, files=None): 166 | """Run 'rclone copy' with appropriate arguments.""" 167 | env = os.environ.copy() 168 | # Ignore config file as we will configure rclone with environment variables instead 169 | env['RCLONE_CONFIG'] = '' 170 | env['RCLONE_CONFIG_ARCHIVE_TYPE'] = 's3' 171 | env['RCLONE_CONFIG_ARCHIVE_ENDPOINT'] = endpoint 172 | rclone_args = [ 173 | 'rclone', 'copy', f'archive:{bucket}', dest, 174 | '--s3-provider', 'Ceph', 175 | '--fast-list', 176 | '--checksum', 177 | '--progress', 178 | ] 179 | if token: 180 | rclone_args.extend(['--header', f'Authorization: Bearer {token}']) 181 | run_kwargs = dict(check=True, env=env) 182 | if files is not None: 183 | rclone_args.extend(['--files-from', '-']) 184 | run_kwargs.update(input='\n'.join(files), text=True) 185 | # User-supplied arguments can override any of the above args 186 | rclone_args.extend(args) 187 | rclone_args = rclone_fit_output_to_terminal(rclone_args) 188 | subprocess.run(rclone_args, **run_kwargs) # pylint: disable=subprocess-run-check 189 | 190 | 191 | def main(): 192 | """Main routine of mvf_download script.""" 193 | args, rclone_args = parse_args() 194 | if not has_recent_rclone(): 195 | return False 196 | url_parts = urlparse(args.source) 197 | *_, cbid, rdb_filename = PurePosixPath(url_parts.path).parts 198 | endpoint = urlunparse((url_parts.scheme, url_parts.netloc, '', '', '', '')) 199 | token = dict(parse_qsl(url_parts.query)).get('token') 200 | meta_path = args.dest / cbid 201 | print(f"\nDownloading metadata bucket ({cbid}) to {meta_path.absolute()} ...") 202 | rclone_copy(endpoint, cbid, meta_path, rclone_args, token) 203 | 204 | query_params = {'s3_endpoint_url': endpoint} 205 | if token: 206 | query_params['token'] = token 207 | query = urlencode(query_params) 208 | rdb_path = (meta_path / rdb_filename).absolute() 209 | local_rdb = urlunparse(('file', '', str(rdb_path), '', query, '')) 210 | print(f"Opening local RDB file: {local_rdb}") 211 | d = katdal.open(local_rdb) 212 | d.select(**args.select) 213 | # Collect names of chunks covered by selection in each chunked storage array 214 | chunks = chunk_names(d.source.data, d.vis.keep) 215 | for bucket, files in chunks.items(): 216 | bucket_path = args.dest / bucket 217 | n_chunks = len(files) 218 | if not args.select: 219 | n_chunks = f'all {n_chunks}' 220 | files = None 221 | print(f"\nDownloading {n_chunks} chunks from data bucket {bucket} " 222 | f"to {bucket_path.absolute()} ...") 223 | rclone_copy(endpoint, bucket, bucket_path, rclone_args, token, files) 224 | return True 225 | 226 | 227 | if __name__ == '__main__': 228 | if not main(): 229 | sys.exit(1) 230 | -------------------------------------------------------------------------------- /doc/intro.rst: 
-------------------------------------------------------------------------------- 1 | Introduction to katdal 2 | ====================== 3 | 4 | Data access library for data sets in the MeerKAT Visibility Format (MVF) 5 | 6 | Overview 7 | -------- 8 | 9 | This module serves as a data access library to interact with the chunk stores 10 | and HDF5 files produced by the MeerKAT radio telescope and its predecessors 11 | (KAT-7 and Fringe Finder). It uses memory carefully, allowing data sets to be 12 | inspected and partially loaded into memory. Data sets may be concatenated and 13 | split via a flexible selection mechanism. In addition, it provides a script to 14 | convert these data sets to CASA MeasurementSets. 15 | 16 | Quick Tutorial 17 | -------------- 18 | 19 | Open any data set through a single function to obtain a data set object:: 20 | 21 | import katdal 22 | d = katdal.open('1234567890.h5') 23 | 24 | This automatically determines the version and storage location of the data set. 25 | The versions roughly map to the various instruments:: 26 | 27 | - v1 : Fringe Finder (HDF5 file) 28 | - v2 : KAT-7 (HDF5 file) 29 | - v3 : MeerKAT (HDF5 file) 30 | - v4 : MeerKAT (chunk store based on objects in Ceph) 31 | 32 | Multiple data sets (even of different versions) may also be concatenated 33 | together (as long as they have the same dump rate):: 34 | 35 | d = katdal.open(['1234567890.h5', '1234567891.h5']) 36 | 37 | Inspect the contents of the data set by printing the object:: 38 | 39 | print d 40 | 41 | Here is a typical output:: 42 | 43 | =============================================================================== 44 | Name: 1313067732.h5 (version 2.0) 45 | =============================================================================== 46 | Observer: someone Experiment ID: 2118d346-c41a-11e0-b2df-a4badb44fe9f 47 | Description: 'Track on Hyd A,Vir A, 3C 286 and 3C 273' 48 | Observed from 2011-08-11 15:02:14.072 SAST to 2011-08-11 15:19:47.810 SAST 49 | Dump rate: 1.00025 Hz 50 | Subarrays: 1 51 | ID Antennas Inputs Corrprods 52 | 0 ant1,ant2,ant3,ant4,ant5,ant6,ant7 14 112 53 | Spectral Windows: 1 54 | ID CentreFreq(MHz) Bandwidth(MHz) Channels ChannelWidth(kHz) 55 | 0 1822.000 400.000 1024 390.625 56 | ------------------------------------------------------------------------------- 57 | Data selected according to the following criteria: 58 | subarray=0 59 | ants=['ant1', 'ant2', 'ant3', 'ant4', 'ant5', 'ant6', 'ant7'] 60 | spw=0 61 | ------------------------------------------------------------------------------- 62 | Shape: (1054 dumps, 1024 channels, 112 correlation products) => Size: 967.049 MB 63 | Antennas: *ant1,ant2,ant3,ant4,ant5,ant6,ant7 Inputs: 14 Autocorr: yes Crosscorr: yes 64 | Channels: 1024 (index 0 - 1023, 2021.805 MHz - 1622.195 MHz), each 390.625 kHz wide 65 | Targets: 4 selected out of 4 in catalogue 66 | ID Name Type RA(J2000) DEC(J2000) Tags Dumps ModelFlux(Jy) 67 | 0 Hyd A radec 9:18:05.28 -12:05:48.9 333 33.63 68 | 1 Vir A radec 12:30:49.42 12:23:28.0 251 166.50 69 | 2 3C 286 radec 13:31:08.29 30:30:33.0 230 12.97 70 | 3 3C 273 radec 12:29:06.70 2:03:08.6 240 39.96 71 | Scans: 8 selected out of 8 total Compscans: 1 selected out of 1 total 72 | Date Timerange(UTC) ScanState CompScanLabel Dumps Target 73 | 11-Aug-2011/13:02:14 - 13:04:26 0:slew 0: 133 0:Hyd A 74 | 13:04:27 - 13:07:46 1:track 0: 200 0:Hyd A 75 | 13:07:47 - 13:08:37 2:slew 0: 51 1:Vir A 76 | 13:08:38 - 13:11:57 3:track 0: 200 1:Vir A 77 | 13:11:58 - 13:12:27 4:slew 0: 30 2:3C 286 78 | 13:12:28 - 13:15:47 
5:track 0: 200 2:3C 286 79 | 13:15:48 - 13:16:27 6:slew 0: 40 3:3C 273 80 | 13:16:28 - 13:19:47 7:track 0: 200 3:3C 273 81 | 82 | The first segment of the printout displays the static information of the data 83 | set, including observer, dump rate and all the available subarrays and spectral 84 | windows in the data set. The second segment (between the dashed lines) highlights 85 | the active selection criteria. The last segment displays dynamic information 86 | that is influenced by the selection, including the overall visibility array 87 | shape, antennas, channel frequencies, targets and scan info. 88 | 89 | The data set is built around the concept of a three-dimensional visibility array 90 | with dimensions of time, frequency and correlation product. This is reflected in 91 | the *shape* of the dataset:: 92 | 93 | d.shape 94 | 95 | which returns (1054, 1024, 112), meaning 1054 dumps by 1024 channels by 112 96 | correlation products. 97 | 98 | Let's select a subset of the data set:: 99 | 100 | d.select(scans='track', channels=slice(200,300), ants='ant4') 101 | print d 102 | 103 | This results in the following printout:: 104 | 105 | =============================================================================== 106 | Name: /Users/schwardt/Downloads/1313067732.h5 (version 2.0) 107 | =============================================================================== 108 | Observer: siphelele Experiment ID: 2118d346-c41a-11e0-b2df-a4badb44fe9f 109 | Description: 'track on Hyd A,Vir A, 3C 286 and 3C 273 for Lud' 110 | Observed from 2011-08-11 15:02:14.072 SAST to 2011-08-11 15:19:47.810 SAST 111 | Dump rate: 1.00025 Hz 112 | Subarrays: 1 113 | ID Antennas Inputs Corrprods 114 | 0 ant1,ant2,ant3,ant4,ant5,ant6,ant7 14 112 115 | Spectral Windows: 1 116 | ID CentreFreq(MHz) Bandwidth(MHz) Channels ChannelWidth(kHz) 117 | 0 1822.000 400.000 1024 390.625 118 | ------------------------------------------------------------------------------- 119 | Data selected according to the following criteria: 120 | channels=slice(200, 300, None) 121 | subarray=0 122 | scans='track' 123 | ants='ant4' 124 | spw=0 125 | ------------------------------------------------------------------------------- 126 | Shape: (800 dumps, 100 channels, 4 correlation products) => Size: 2.560 MB 127 | Antennas: ant4 Inputs: 2 Autocorr: yes Crosscorr: no 128 | Channels: 100 (index 200 - 299, 1943.680 MHz - 1905.008 MHz), each 390.625 kHz wide 129 | Targets: 4 selected out of 4 in catalogue 130 | ID Name Type RA(J2000) DEC(J2000) Tags Dumps ModelFlux(Jy) 131 | 0 Hyd A radec 9:18:05.28 -12:05:48.9 200 31.83 132 | 1 Vir A radec 12:30:49.42 12:23:28.0 200 159.06 133 | 2 3C 286 radec 13:31:08.29 30:30:33.0 200 12.61 134 | 3 3C 273 radec 12:29:06.70 2:03:08.6 200 39.32 135 | Scans: 4 selected out of 8 total Compscans: 1 selected out of 1 total 136 | Date Timerange(UTC) ScanState CompScanLabel Dumps Target 137 | 11-Aug-2011/13:04:27 - 13:07:46 1:track 0: 200 0:Hyd A 138 | 13:08:38 - 13:11:57 3:track 0: 200 1:Vir A 139 | 13:12:28 - 13:15:47 5:track 0: 200 2:3C 286 140 | 13:16:28 - 13:19:47 7:track 0: 200 3:3C 273 141 | 142 | Compared to the first printout, the static information has remained the same 143 | while the dynamic information now reflects the selected subset. 
There are many 144 | possible selection criteria, as illustrated below:: 145 | 146 | d.select(timerange=('2011-08-11 13:10:00', '2011-08-11 13:15:00'), targets=[1, 2]) 147 | d.select(spw=0, subarray=0) 148 | d.select(ants='ant1,ant2', pol='H', scans=(0,1,2), freqrange=(1700e6, 1800e6)) 149 | 150 | See the docstring of :meth:`DataSet.select` for more detailed information (i.e. 151 | do `d.select?` in IPython). Take note that only one subarray and one spectral 152 | window must be selected. 153 | 154 | Once a subset of the data has been selected, you can access the data and 155 | timestamps on the data set object:: 156 | 157 | vis = d.vis[:] 158 | timestamps = d.timestamps[:] 159 | 160 | Note the `[:]` indexing, as the *vis* and *timestamps* properties are special 161 | :class:`LazyIndexer` objects that only give you the actual data when you use 162 | indexing, in order not to inadvertently load the entire array into memory. 163 | 164 | For the example dataset and no selection the *vis* array will have a shape of 165 | (1054, 1024, 112). The time dimension is labelled by `d.timestamps`, the 166 | frequency dimension by `d.channel_freqs` and the correlation product dimension 167 | by `d.corr_products`. 168 | 169 | Another key concept in the data set object is that of *sensors*. These are named 170 | time series of arbitrary data that are either loaded from the data set 171 | (*actual* sensors) or calculated on the fly (*virtual* sensors). Both variants 172 | are accessed through the *sensor cache* (available as `d.sensor`) and cached 173 | there after the first access. The data set object also provides convenient 174 | properties to expose commonly-used sensors, as shown in the plot example below:: 175 | 176 | import matplotlib.pyplot as plt 177 | plt.plot(d.az, d.el, 'o') 178 | plt.xlabel('Azimuth (degrees)') 179 | plt.ylabel('Elevation (degrees)') 180 | 181 | Other useful attributes include *ra*, *dec*, *lst*, *mjd*, *u*, *v*, *w*, 182 | *target_x* and *target_y*. These are all one-dimensional NumPy arrays that 183 | dynamically change length depending on the active selection. 184 | 185 | As in katdal's predecessor (scape) there is a :meth:`DataSet.scans` generator 186 | that allows you to step through the scans in the data set. It returns the 187 | scan index, scan state and target object on each iteration, and updates 188 | the active selection on the data set to include only the current scan. 189 | It is also possible to iterate through the compound scans with the 190 | :meth:`DataSet.compscans` generator, which yields the compound scan index, label 191 | and first target on each iteration for convenience. These two iterators may also 192 | be used together to traverse the data set structure:: 193 | 194 | for compscan, label, target in d.compscans(): 195 | plt.figure() 196 | for scan, state, target in d.scans(): 197 | if state in ('scan', 'track'): 198 | plt.plot(d.ra, d.dec, 'o') 199 | plt.xlabel('Right ascension (J2000 degrees)') 200 | plt.ylabel('Declination (J2000 degrees)') 201 | plt.title(target.name) 202 | 203 | Finally, all the targets (or fields) in the data set are stored in a catalogue 204 | available at `d.catalogue`, and the original HDF5 file is still accessible via 205 | a back door installed at `d.file` in the case of a single-file data set.
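
To tie these pieces together, here is a minimal sketch (reusing the hypothetical
file name and antenna from the examples above) that steps through the track scans
of a single antenna and prints the mean visibility amplitude of each one::

    import numpy as np
    import katdal

    d = katdal.open('1234567890.h5')
    d.select(ants='ant4', scans='track')
    for scan, state, target in d.scans():
        # The scan iterator narrows the selection to the current scan,
        # so d.vis[:] only loads one scan's worth of data at a time
        vis = d.vis[:]
        print(scan, target.name, np.abs(vis).mean())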
206 | 207 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | katdal 2 | ====== 3 | 4 | This package serves as a data access library to interact with the chunk stores 5 | and HDF5 files produced by the MeerKAT radio telescope and its predecessors 6 | (KAT-7 and Fringe Finder), which are collectively known as *MeerKAT Visibility 7 | Format (MVF)* data sets. It uses memory carefully, allowing data sets to be 8 | inspected and partially loaded into memory. Data sets may be concatenated and 9 | split via a flexible selection mechanism. In addition, it provides a script to 10 | convert these data sets to CASA MeasurementSets. 11 | 12 | Quick Tutorial 13 | -------------- 14 | 15 | Open any data set through a single function to obtain a data set object: 16 | 17 | .. code:: python 18 | 19 | import katdal 20 | d = katdal.open('1234567890.h5') 21 | 22 | The ``open`` function automatically determines the version and storage location 23 | of the data set. The versions roughly map to the various instruments:: 24 | 25 | - v1 : Fringe Finder (HDF5 file) 26 | - v2 : KAT-7 (HDF5 file) 27 | - v3 : MeerKAT (HDF5 file) 28 | - v4 : MeerKAT (RDB file + chunk store based on objects in Ceph) 29 | 30 | Each MVFv4 data set is split into a Redis dump (aka *RDB*) file containing the 31 | metadata in the form of a *telescope state* database, and a *chunk store* 32 | containing the visibility data split into many small blocks or chunks (typically 33 | served by a Ceph object store over the network). The RDB file is the main entry 34 | point to the data set and it can be accessed directly from the MeerKAT SDP 35 | archive if you have the appropriate permissions: 36 | 37 | .. code:: python 38 | 39 | # This is just for illustration - the real URL looks a bit different 40 | d = katdal.open('https://archive/1234567890/1234567890_sdp_l0.rdb?token=AsD3') 41 | 42 | Multiple data sets (even of different versions) may also be concatenated 43 | together (as long as they have the same dump rate): 44 | 45 | .. code:: python 46 | 47 | d = katdal.open(['1234567890.h5', '1234567891.h5']) 48 | 49 | Inspect the contents of the data set by printing the object: 50 | 51 | .. 
code:: python 52 | 53 | print(d) 54 | 55 | Here is a typical output:: 56 | 57 | =============================================================================== 58 | Name: 1313067732.h5 (version 2.0) 59 | =============================================================================== 60 | Observer: someone Experiment ID: 2118d346-c41a-11e0-b2df-a4badb44fe9f 61 | Description: 'Track on Hyd A,Vir A, 3C 286 and 3C 273' 62 | Observed from 2011-08-11 15:02:14.072 SAST to 2011-08-11 15:19:47.810 SAST 63 | Dump rate: 1.00025 Hz 64 | Subarrays: 1 65 | ID Antennas Inputs Corrprods 66 | 0 ant1,ant2,ant3,ant4,ant5,ant6,ant7 14 112 67 | Spectral Windows: 1 68 | ID CentreFreq(MHz) Bandwidth(MHz) Channels ChannelWidth(kHz) 69 | 0 1822.000 400.000 1024 390.625 70 | ------------------------------------------------------------------------------- 71 | Data selected according to the following criteria: 72 | subarray=0 73 | ants=['ant1', 'ant2', 'ant3', 'ant4', 'ant5', 'ant6', 'ant7'] 74 | spw=0 75 | ------------------------------------------------------------------------------- 76 | Shape: (1054 dumps, 1024 channels, 112 correlation products) => Size: 967.049 MB 77 | Antennas: *ant1,ant2,ant3,ant4,ant5,ant6,ant7 Inputs: 14 Autocorr: yes Crosscorr: yes 78 | Channels: 1024 (index 0 - 1023, 2021.805 MHz - 1622.195 MHz), each 390.625 kHz wide 79 | Targets: 4 selected out of 4 in catalogue 80 | ID Name Type RA(J2000) DEC(J2000) Tags Dumps ModelFlux(Jy) 81 | 0 Hyd A radec 9:18:05.28 -12:05:48.9 333 33.63 82 | 1 Vir A radec 12:30:49.42 12:23:28.0 251 166.50 83 | 2 3C 286 radec 13:31:08.29 30:30:33.0 230 12.97 84 | 3 3C 273 radec 12:29:06.70 2:03:08.6 240 39.96 85 | Scans: 8 selected out of 8 total Compscans: 1 selected out of 1 total 86 | Date Timerange(UTC) ScanState CompScanLabel Dumps Target 87 | 11-Aug-2011/13:02:14 - 13:04:26 0:slew 0: 133 0:Hyd A 88 | 13:04:27 - 13:07:46 1:track 0: 200 0:Hyd A 89 | 13:07:47 - 13:08:37 2:slew 0: 51 1:Vir A 90 | 13:08:38 - 13:11:57 3:track 0: 200 1:Vir A 91 | 13:11:58 - 13:12:27 4:slew 0: 30 2:3C 286 92 | 13:12:28 - 13:15:47 5:track 0: 200 2:3C 286 93 | 13:15:48 - 13:16:27 6:slew 0: 40 3:3C 273 94 | 13:16:28 - 13:19:47 7:track 0: 200 3:3C 273 95 | 96 | The first segment of the printout displays the static information of the data 97 | set, including observer, dump rate and all the available subarrays and spectral 98 | windows in the data set. The second segment (between the dashed lines) highlights 99 | the active selection criteria. The last segment displays dynamic information 100 | that is influenced by the selection, including the overall visibility array 101 | shape, antennas, channel frequencies, targets and scan info. 102 | 103 | The data set is built around the concept of a three-dimensional visibility array 104 | with dimensions of time, frequency and correlation product. This is reflected in 105 | the *shape* of the dataset: 106 | 107 | .. code:: python 108 | 109 | d.shape 110 | 111 | which returns ``(1054, 1024, 112)``, meaning 1054 dumps by 1024 channels by 112 112 | correlation products. 113 | 114 | Let's select a subset of the data set: 115 | 116 | .. 
code:: python 117 | 118 | d.select(scans='track', channels=slice(200, 300), ants='ant4') 119 | print(d) 120 | 121 | This results in the following printout:: 122 | 123 | =============================================================================== 124 | Name: /Users/schwardt/Downloads/1313067732.h5 (version 2.0) 125 | =============================================================================== 126 | Observer: siphelele Experiment ID: 2118d346-c41a-11e0-b2df-a4badb44fe9f 127 | Description: 'track on Hyd A,Vir A, 3C 286 and 3C 273 for Lud' 128 | Observed from 2011-08-11 15:02:14.072 SAST to 2011-08-11 15:19:47.810 SAST 129 | Dump rate: 1.00025 Hz 130 | Subarrays: 1 131 | ID Antennas Inputs Corrprods 132 | 0 ant1,ant2,ant3,ant4,ant5,ant6,ant7 14 112 133 | Spectral Windows: 1 134 | ID CentreFreq(MHz) Bandwidth(MHz) Channels ChannelWidth(kHz) 135 | 0 1822.000 400.000 1024 390.625 136 | ------------------------------------------------------------------------------- 137 | Data selected according to the following criteria: 138 | channels=slice(200, 300, None) 139 | subarray=0 140 | scans='track' 141 | ants='ant4' 142 | spw=0 143 | ------------------------------------------------------------------------------- 144 | Shape: (800 dumps, 100 channels, 4 correlation products) => Size: 2.560 MB 145 | Antennas: ant4 Inputs: 2 Autocorr: yes Crosscorr: no 146 | Channels: 100 (index 200 - 299, 1943.680 MHz - 1905.008 MHz), each 390.625 kHz wide 147 | Targets: 4 selected out of 4 in catalogue 148 | ID Name Type RA(J2000) DEC(J2000) Tags Dumps ModelFlux(Jy) 149 | 0 Hyd A radec 9:18:05.28 -12:05:48.9 200 31.83 150 | 1 Vir A radec 12:30:49.42 12:23:28.0 200 159.06 151 | 2 3C 286 radec 13:31:08.29 30:30:33.0 200 12.61 152 | 3 3C 273 radec 12:29:06.70 2:03:08.6 200 39.32 153 | Scans: 4 selected out of 8 total Compscans: 1 selected out of 1 total 154 | Date Timerange(UTC) ScanState CompScanLabel Dumps Target 155 | 11-Aug-2011/13:04:27 - 13:07:46 1:track 0: 200 0:Hyd A 156 | 13:08:38 - 13:11:57 3:track 0: 200 1:Vir A 157 | 13:12:28 - 13:15:47 5:track 0: 200 2:3C 286 158 | 13:16:28 - 13:19:47 7:track 0: 200 3:3C 273 159 | 160 | Compared to the first printout, the static information has remained the same 161 | while the dynamic information now reflects the selected subset. There are many 162 | possible selection criteria, as illustrated below: 163 | 164 | .. code:: python 165 | 166 | d.select(timerange=('2011-08-11 13:10:00', '2011-08-11 13:15:00'), targets=[1, 2]) 167 | d.select(spw=0, subarray=0) 168 | d.select(ants='ant1,ant2', pol='H', scans=(0,1,2), freqrange=(1700e6, 1800e6)) 169 | 170 | See the docstring of ``DataSet.select`` for more detailed information (i.e. 171 | do ``d.select?`` in IPython). Take note that only one subarray and one spectral 172 | window must be selected. 173 | 174 | Once a subset of the data has been selected, you can access the data and 175 | timestamps on the data set object: 176 | 177 | .. code:: python 178 | 179 | vis = d.vis[:] 180 | timestamps = d.timestamps[:] 181 | 182 | Note the ``[:]`` indexing, as the ``vis`` and ``timestamps`` properties are 183 | special ``LazyIndexer`` objects that only give you the actual data when 184 | you use indexing, in order not to inadvertently load the entire array into memory. 185 | 186 | For the example dataset and no selection the ``vis`` array will have a shape of 187 | ``(1054, 1024, 112)``. 
The time dimension is labelled by ``d.timestamps``, the 188 | frequency dimension by ``d.channel_freqs`` and the correlation product dimension 189 | by ``d.corr_products``. 190 | 191 | Another key concept in the data set object is that of *sensors*. These are named 192 | time series of arbitrary data that are either loaded from the data set 193 | (*actual* sensors) or calculated on the fly (*virtual* sensors). Both variants 194 | are accessed through the *sensor cache* (available as ``d.sensor``) and cached 195 | there after the first access. The data set object also provides convenient 196 | properties to expose commonly-used sensors, as shown in the plot example below: 197 | 198 | .. code:: python 199 | 200 | import matplotlib.pyplot as plt 201 | plt.plot(d.az, d.el, 'o') 202 | plt.xlabel('Azimuth (degrees)') 203 | plt.ylabel('Elevation (degrees)') 204 | 205 | Other useful attributes include ``ra``, ``dec``, ``lst``, ``mjd``, ``u``, 206 | ``v``, ``w``, ``target_x`` and ``target_y``. These are all one-dimensional 207 | NumPy arrays that dynamically change length depending on the active selection. 208 | 209 | As in katdal's predecessor (scape) there is a ``DataSet.scans`` generator 210 | that allows you to step through the scans in the data set. It returns the 211 | scan index, scan state and target object on each iteration, and updates 212 | the active selection on the data set to include only the current scan. 213 | It is also possible to iterate through the compound scans with the 214 | ``DataSet.compscans`` generator, which yields the compound scan index, label 215 | and first target on each iteration for convenience. These two iterators may also 216 | be used together to traverse the data set structure: 217 | 218 | .. code:: python 219 | 220 | for compscan, label, target in d.compscans(): 221 | plt.figure() 222 | for scan, state, target in d.scans(): 223 | if state in ('scan', 'track'): 224 | plt.plot(d.ra, d.dec, 'o') 225 | plt.xlabel('Right ascension (J2000 degrees)') 226 | plt.ylabel('Declination (J2000 degrees)') 227 | plt.title(target.name) 228 | 229 | Finally, all the targets (or fields) in the data set are stored in a catalogue 230 | available at ``d.catalogue``, and the original HDF5 file is still accessible via 231 | a back door installed at ``d.file`` in the case of a single-file data set (v3 232 | or older). On a v4 data set, ``d.source`` provides access to the underlying 233 | telstate for metadata and the chunk store for data. 234 | -------------------------------------------------------------------------------- /scripts/mvf_rechunk.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ################################################################################ 4 | # Copyright (c) 2019-2021, National Research Foundation (SARAO) 5 | # 6 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 7 | # this file except in compliance with the License. You may obtain a copy 8 | # of the License at 9 | # 10 | # https://opensource.org/licenses/BSD-3-Clause 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | ################################################################################ 18 | 19 | """Rechunk an existing MVF dataset""" 20 | 21 | import argparse 22 | import multiprocessing 23 | import os 24 | import re 25 | import sys 26 | import urllib.parse 27 | 28 | import dask 29 | import dask.array as da 30 | import numpy as np 31 | from katsdptelstate.rdb_writer import RDBWriter 32 | 33 | from katdal.chunkstore import ChunkStoreError 34 | from katdal.chunkstore_npy import NpyFileChunkStore 35 | from katdal.datasources import (TelstateDataSource, infer_chunk_store, 36 | view_capture_stream) 37 | from katdal.flags import DATA_LOST 38 | 39 | 40 | class RechunkSpec: 41 | def __init__(self, arg): 42 | match = re.match(r'^([A-Za-z0-9_.]+)/([A-Za-z0-9_]+):(\d+),(\d+)', arg) 43 | if not match: 44 | raise ValueError(f'Could not parse {arg!r}') 45 | self.stream = match.group(1) 46 | self.array = match.group(2) 47 | self.time = int(match.group(3)) 48 | self.freq = int(match.group(4)) 49 | if self.time <= 0 or self.freq <= 0: 50 | raise ValueError('Chunk sizes must be positive') 51 | 52 | 53 | def _fill_missing(data, default_value, block_info): 54 | if data is None: 55 | info = block_info[None] 56 | return np.full(info['chunk-shape'], default_value, info['dtype']) 57 | else: 58 | return data 59 | 60 | 61 | def _make_lost(data, block_info): 62 | info = block_info[None] 63 | if data is None: 64 | return np.full(info['chunk-shape'], DATA_LOST, np.uint8) 65 | else: 66 | return np.zeros(info['chunk-shape'], np.uint8) 67 | 68 | 69 | class Array: 70 | def __init__(self, stream_name, array_name, store, chunk_info): 71 | self.stream_name = stream_name 72 | self.array_name = array_name 73 | self.chunk_info = chunk_info 74 | self.store = store 75 | full_name = store.join(chunk_info['prefix'], array_name) 76 | chunks = chunk_info['chunks'] 77 | dtype = chunk_info['dtype'] 78 | raw_data = store.get_dask_array(full_name, chunks, dtype, errors='none') 79 | # raw_data has `None` objects instead of ndarrays for chunks with 80 | # missing data. That's not actually valid as a dask array, but we use 81 | # it to produce lost flags (similarly to datasources.py). 82 | default_value = DATA_LOST if array_name == 'flags' else 0 83 | self.data = da.map_blocks(_fill_missing, raw_data, default_value, dtype=raw_data.dtype) 84 | self.lost_flags = da.map_blocks(_make_lost, raw_data, dtype=np.uint8) 85 | 86 | 87 | def get_chunk_store(source, telstate, array): 88 | """A wrapper around katdal.datasources.infer_chunk_store. 89 | 90 | It has a simpler interface, taking an URL rather than url_parts and kwargs. 91 | """ 92 | url_parts = urllib.parse.urlparse(source, scheme='file') 93 | kwargs = dict(urllib.parse.parse_qsl(url_parts.query)) 94 | return infer_chunk_store(url_parts, telstate, array=array, **kwargs) 95 | 96 | 97 | def comma_list(value): 98 | return value.split(',') 99 | 100 | 101 | def parse_args(): 102 | parser = argparse.ArgumentParser( 103 | description='Rechunk a single capture block. For each array within each stream, ' 104 | 'a new chunking scheme may be specified. 
A chunking scheme is ' 105 | 'specified as the number of dumps and channels per chunk.') 106 | parser.add_argument('--workers', type=int, default=8*multiprocessing.cpu_count(), 107 | help='Number of dask workers for parallel I/O [%(default)s]') 108 | parser.add_argument('--streams', type=comma_list, metavar='STREAM,STREAM', 109 | help='Streams to copy [all]') 110 | parser.add_argument('--s3-endpoint-url', help='URL where rechunked data will be uploaded') 111 | parser.add_argument('--new-prefix', help='Replacement for capture block ID in output bucket names') 112 | parser.add_argument('source', help='Input .rdb file') 113 | parser.add_argument('dest', help='Output directory') 114 | parser.add_argument('spec', nargs='*', default=[], type=RechunkSpec, 115 | metavar='STREAM/ARRAY:TIME,FREQ', help='New chunk specification') 116 | args = parser.parse_args() 117 | return args 118 | 119 | 120 | def get_stream_type(telstate, stream): 121 | try: 122 | return telstate.view(stream)['stream_type'] 123 | except KeyError: 124 | try: 125 | base = telstate.view(stream)['inherit'] 126 | return get_stream_type(telstate, base) 127 | except KeyError: 128 | return None 129 | 130 | 131 | def get_streams(telstate, streams): 132 | """Determine streams to copy based on what the user asked for""" 133 | archived_streams = telstate.get('sdp_archived_streams', []) 134 | archived_streams = [ 135 | stream for stream in archived_streams 136 | if get_stream_type(telstate, stream) in {'sdp.vis', 'sdp.flags'}] 137 | if not archived_streams: 138 | raise RuntimeError('Source dataset does not contain any visibility streams') 139 | if streams is None: 140 | streams = archived_streams 141 | else: 142 | for stream in streams: 143 | if stream not in archived_streams: 144 | raise RuntimeError('Stream {!r} is not known (should be one of {})' 145 | .format(stream, ', '.join(archived_streams))) 146 | 147 | return streams 148 | 149 | 150 | def main(): 151 | args = parse_args() 152 | dask.config.set(num_workers=args.workers) 153 | 154 | # Lightweight open with no data - just to create telstate and identify the CBID 155 | ds = TelstateDataSource.from_url(args.source, upgrade_flags=False, chunk_store=None) 156 | # View the CBID, but not any specific stream 157 | cbid = ds.capture_block_id 158 | telstate = ds.telstate.root().view(cbid) 159 | streams = get_streams(telstate, args.streams) 160 | 161 | # Find all arrays in the selected streams, and also ensure we're not 162 | # trying to write things back on top of an existing dataset. 163 | arrays = {} 164 | for stream_name in streams: 165 | sts = view_capture_stream(telstate, cbid, stream_name) 166 | try: 167 | chunk_info = sts['chunk_info'] 168 | except KeyError as exc: 169 | raise RuntimeError(f'Could not get chunk info for {stream_name!r}: {exc}') 170 | for array_name, array_info in chunk_info.items(): 171 | if args.new_prefix is not None: 172 | array_info['prefix'] = args.new_prefix + '-' + stream_name.replace('_', '-') 173 | prefix = array_info['prefix'] 174 | path = os.path.join(args.dest, prefix) 175 | if os.path.exists(path): 176 | raise RuntimeError(f'Directory {path!r} already exists') 177 | store = get_chunk_store(args.source, sts, array_name) 178 | # Older files have dtype as an object that can't be encoded in msgpack 179 | dtype = np.dtype(array_info['dtype']) 180 | array_info['dtype'] = np.lib.format.dtype_to_descr(dtype) 181 | arrays[(stream_name, array_name)] = Array(stream_name, array_name, store, array_info) 182 | 183 | # Apply DATA_LOST bits to the flags arrays. 
This is a less efficient approach than 184 | # datasources.py, but much simpler. 185 | for stream_name in streams: 186 | flags_array = arrays.get((stream_name, 'flags')) 187 | if not flags_array: 188 | continue 189 | sources = [stream_name] 190 | sts = view_capture_stream(telstate, cbid, stream_name) 191 | sources += sts['src_streams'] 192 | for src_stream in sources: 193 | if src_stream not in streams: 194 | continue 195 | src_ts = view_capture_stream(telstate, cbid, src_stream) 196 | for array_name in src_ts['chunk_info']: 197 | if array_name == 'flags' and src_stream != stream_name: 198 | # Upgraded flags completely replace the source stream's 199 | # flags, rather than augmenting them. Thus, data lost in 200 | # the source stream has no effect. 201 | continue 202 | lost_flags = arrays[(src_stream, array_name)].lost_flags 203 | lost_flags = lost_flags.rechunk(flags_array.data.chunks[:lost_flags.ndim]) 204 | # weights_channel doesn't have a baseline axis 205 | while lost_flags.ndim < flags_array.data.ndim: 206 | lost_flags = lost_flags[..., np.newaxis] 207 | lost_flags = da.broadcast_to(lost_flags, flags_array.data.shape, 208 | chunks=flags_array.data.chunks) 209 | flags_array.data |= lost_flags 210 | 211 | # Apply the rechunking specs 212 | for spec in args.spec: 213 | key = (spec.stream, spec.array) 214 | if key not in arrays: 215 | raise RuntimeError(f'{spec.stream}/{spec.array} is not a known array') 216 | arrays[key].data = arrays[key].data.rechunk({0: spec.time, 1: spec.freq}) 217 | 218 | # Write out the new data 219 | dest_store = NpyFileChunkStore(args.dest) 220 | stores = [] 221 | for array in arrays.values(): 222 | full_name = dest_store.join(array.chunk_info['prefix'], array.array_name) 223 | dest_store.create_array(full_name) 224 | stores.append(dest_store.put_dask_array(full_name, array.data)) 225 | array.chunk_info['chunks'] = array.data.chunks 226 | stores = da.compute(*stores) 227 | # put_dask_array returns an array with an exception object per chunk 228 | for result_set in stores: 229 | for result in result_set.flat: 230 | if result is not None: 231 | raise result 232 | 233 | # Fix up chunk_info for new chunking 234 | for stream_name in streams: 235 | sts = view_capture_stream(telstate, cbid, stream_name) 236 | chunk_info = sts['chunk_info'] 237 | for array_name in chunk_info.keys(): 238 | chunk_info[array_name] = arrays[(stream_name, array_name)].chunk_info 239 | sts.wrapped.delete('chunk_info') 240 | sts.wrapped['chunk_info'] = chunk_info 241 | # s3_endpoint_url is for the old version of the data 242 | sts.wrapped.delete('s3_endpoint_url') 243 | if args.s3_endpoint_url is not None: 244 | sts.wrapped['s3_endpoint_url'] = args.s3_endpoint_url 245 | 246 | # Write updated RDB file 247 | url_parts = urllib.parse.urlparse(args.source, scheme='file') 248 | dest_file = os.path.join(args.dest, args.new_prefix or cbid, os.path.basename(url_parts.path)) 249 | os.makedirs(os.path.dirname(dest_file), exist_ok=True) 250 | with RDBWriter(dest_file) as writer: 251 | writer.save(telstate.backend) 252 | 253 | 254 | if __name__ == '__main__': 255 | try: 256 | main() 257 | except (RuntimeError, ChunkStoreError) as exc: 258 | print(exc, file=sys.stderr) 259 | sys.exit(1) 260 | -------------------------------------------------------------------------------- /katdal/test/test_lazy_indexer.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 
2018-2019,2021-2023,2025, National Research Foundation (SARAO) 3 | # 4 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 5 | # this file except in compliance with the License. You may obtain a copy 6 | # of the License at 7 | # 8 | # https://opensource.org/licenses/BSD-3-Clause 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | ################################################################################ 16 | 17 | """Tests for :py:mod:`katdal.lazy_indexer`.""" 18 | 19 | from functools import partial 20 | from numbers import Integral 21 | 22 | import dask 23 | import dask.array as da 24 | import numpy as np 25 | from packaging.version import Version 26 | import pytest 27 | 28 | from katdal.lazy_indexer import (DaskLazyIndexer, _dask_oindex, 29 | _range_to_slice, _simplify_index, 30 | dask_getitem) 31 | 32 | 33 | def slice_to_range(s, length): 34 | return range(*s.indices(length)) 35 | 36 | 37 | class TestRangeToSlice: 38 | """Test the :func:`~katdal.lazy_indexer._range_to_slice` function.""" 39 | @staticmethod 40 | def _check_slice(start, stop, step): 41 | s = slice(start, stop, step) 42 | length = max(start, 0 if stop is None else stop) + 1 43 | r = slice_to_range(s, length) 44 | assert _range_to_slice(r) == s 45 | 46 | def test_basic_slices(self): 47 | # For testing both `start` and `stop` need to be non-negative 48 | self._check_slice(0, 10, 1) # contiguous, ascending 49 | self._check_slice(0, 10, 2) # strided, ascending 50 | self._check_slice(10, 0, -1) # contiguous, descending 51 | self._check_slice(10, 0, -2) # strided, descending 52 | self._check_slice(10, None, -2) # strided, descending all the way to 0 53 | self._check_slice(0, 1, 1) # single element (treated as ascending) 54 | self._check_slice(0, 10, 5) # any two elements (has stop = 2 * step) 55 | 56 | def test_negative_elements(self): 57 | with pytest.raises(ValueError): 58 | _range_to_slice([-1, -2, -3, -4]) 59 | 60 | def test_zero_increments(self): 61 | with pytest.raises(ValueError): 62 | _range_to_slice([1, 1, 1, 1]) 63 | 64 | def test_uneven_increments(self): 65 | with pytest.raises(ValueError): 66 | _range_to_slice([1, 1, 2, 3, 5, 8, 13]) 67 | 68 | 69 | class TestSimplifyIndex: 70 | """Test the :func:`~katdal.lazy_indexer._simplify_index` function.""" 71 | def setup_method(self): 72 | self.shape = (3, 4, 5) 73 | self.data = np.arange(np.prod(self.shape)).reshape(self.shape) 74 | 75 | def _test_with(self, indices): 76 | expected = self.data[indices] 77 | simplified = _simplify_index(indices, self.data.shape) 78 | actual = self.data[simplified] 79 | np.testing.assert_array_equal(actual, expected) 80 | 81 | def _test_index_error(self, indices): 82 | with pytest.raises(IndexError): 83 | simplified = _simplify_index(indices, self.data.shape) 84 | self.data[simplified] 85 | with pytest.raises(IndexError): 86 | self.data[indices] 87 | 88 | def test_1d(self): 89 | self._test_with(np.s_[np.array([False, True, False])]) 90 | self._test_with(np.s_[[1]]) 91 | 92 | def test_contiguous(self): 93 | self._test_with(np.s_[:, np.array([False, True, True, False]), :]) 94 | self._test_with(np.s_[:, [1, 2], :]) 95 | 96 | def test_discontiguous_but_regular(self): 97 | self._test_with(np.s_[:, [False, True, False, 
True], :]) 98 | self._test_with(np.s_[:, [1, 3], :]) 99 | 100 | def test_discontiguous(self): 101 | self._test_with(np.s_[:, [True, True, False, True], :]) 102 | self._test_with(np.s_[:, [0, 1, 3], :]) 103 | 104 | def test_all_false(self): 105 | self._test_with(np.s_[:, np.array([False, False, False, False]), :]) 106 | 107 | def test_all_true(self): 108 | self._test_with(np.s_[:, np.array([True, True, True, True]), :]) 109 | 110 | def test_newaxis(self): 111 | self._test_with(np.s_[np.newaxis, np.array([True, True, False])]) 112 | 113 | def test_ellipsis(self): 114 | self._test_with(np.s_[..., np.array([True, False, True, False, True])]) 115 | 116 | def test_wrong_length(self): 117 | self._test_index_error(np.s_[:, np.array([True, False]), :]) 118 | 119 | def test_too_many_axes(self): 120 | self._test_index_error(np.s_[0, 0, 0, 0]) 121 | 122 | def test_bad_index_dtype(self): 123 | self._test_index_error(np.s_[:, np.array([1.2, 3.4])]) 124 | 125 | 126 | def ix_(keep, shape): 127 | r"""Extend numpy.ix\_ to accept slices and single ints as well.""" 128 | # Inspired by Zarr's indexing.py (https://github.com/zarr-developers/zarr) 129 | keep = [slice_to_range(k, s) if isinstance(k, slice) 130 | else [k] if isinstance(k, Integral) 131 | else k 132 | for k, s in zip(keep, shape)] 133 | return np.ix_(*keep) 134 | 135 | 136 | def numpy_oindex(x, keep): 137 | """Perform outer indexing on a NumPy array (inspired by Zarr). 138 | 139 | This is more onerous, but calls `x.__getitem__` only once. 140 | """ 141 | # Inspired by Zarr's indexing.py (https://github.com/zarr-developers/zarr) 142 | # Get rid of ellipsis 143 | keep = da.slicing.normalize_index(keep, x.shape) 144 | new_axes = tuple(n for n, k in enumerate(keep) if k is np.newaxis) 145 | drop_axes = tuple(n for n, k in enumerate(keep) if isinstance(k, Integral)) 146 | # Get rid of newaxis 147 | keep = tuple(k for k in keep if k is not np.newaxis) 148 | keep = ix_(keep, x.shape) 149 | result = x[keep] 150 | for ax in new_axes: 151 | result = np.expand_dims(result, ax) 152 | result = result.squeeze(axis=drop_axes) 153 | return result 154 | 155 | 156 | def numpy_oindex_lite(x, keep): 157 | """Perform outer indexing on a NumPy array (compact version). 158 | 159 | This is more compact, but calls `x.__getitem__` `x.ndim` times. 160 | 161 | It also assumes that `keep` contains no ellipsis to be as pure as possible. 
162 | """ 163 | if not isinstance(keep, tuple): 164 | keep = (keep,) 165 | dim = 0 166 | result = x 167 | for k in keep: 168 | cumulative_index = (slice(None),) * dim + (k,) 169 | result = result[cumulative_index] 170 | # Handle dropped dimensions 171 | if not isinstance(k, Integral): 172 | dim += 1 173 | return result 174 | 175 | 176 | UNEVEN = [False, True, True, True, False, False, True, True, False, True] 177 | DASK_SLICE_BUG = Version(dask.__version__) >= Version('2024.8.0') 178 | 179 | 180 | class TestDaskGetitem: 181 | """Test the :func:`~katdal.lazy_indexer.dask_getitem` function.""" 182 | def setup_method(self): 183 | shape = (10, 20, 30, 40) 184 | self.data = np.arange(np.prod(shape)).reshape(shape) 185 | self.data_dask = da.from_array(self.data, chunks=(2, 5, 2, 5)) 186 | 187 | def _test_with(self, indices, normalised_indices=None): 188 | npy = numpy_oindex(self.data, indices) 189 | if normalised_indices is None: 190 | normalised_indices = indices 191 | npy_lite = numpy_oindex_lite(self.data, normalised_indices) 192 | oindex = _dask_oindex(self.data_dask, normalised_indices).compute() 193 | getitem = dask_getitem(self.data_dask, indices).compute() 194 | np.testing.assert_array_equal(npy, npy_lite) 195 | np.testing.assert_array_equal(getitem, npy) 196 | np.testing.assert_array_equal(oindex, npy) 197 | 198 | def test_misc_indices(self): 199 | self._test_with(()) 200 | self._test_with(2, (2,)) 201 | self._test_with((2, 3, 4, 5)) 202 | 203 | def test_ellipsis(self): 204 | self._test_with(np.s_[[0], ...], np.s_[[0], :, :, :]) 205 | self._test_with(np.s_[:, [0], ...], np.s_[:, [0], :, :]) 206 | self._test_with(np.s_[[0], ..., [0]], np.s_[[0], :, :, [0]]) 207 | 208 | def test_evenly_spaced_ints(self): 209 | self._test_with(np.s_[:, [0], [0], :]) 210 | self._test_with(np.s_[:, [0], :, [0]]) 211 | self._test_with(np.s_[:, [0], [0, 1, 2, 3], :]) 212 | self._test_with(np.s_[[0], [-1, -2, -3, -4, -5], :, [8, 6, 4, 2, 0]]) 213 | 214 | def test_evenly_spaced_booleans(self): 215 | pick_one = np.zeros(40, dtype=bool) 216 | pick_one[6] = True 217 | self._test_with(np.s_[:, [True, False] * 10, pick_one[:30], :]) 218 | self._test_with(np.s_[:, [False, True] * 10, :, pick_one]) 219 | self._test_with(np.s_[4:9, [False, True] * 10, 220 | [True, False] * 15, pick_one]) 221 | 222 | def test_unevenly_spaced_fancy_indexing(self): 223 | self._test_with(np.s_[:, [0, 1, 3], [1, 2, 4], :]) 224 | self._test_with(np.s_[UNEVEN, 2 * UNEVEN, 3 * UNEVEN, 4 * UNEVEN]) 225 | 226 | def test_repeated_fancy_indexing(self): 227 | self._test_with(np.s_[:, [1, 1, 1], [6, 6, 6], :]) 228 | 229 | def test_slices(self): 230 | self._test_with(np.s_[0:2, 2:4, 4:6, 6:8]) 231 | self._test_with(np.s_[-8:-6, -4:-2, 3:10:2, -2:]) 232 | 233 | def test_single_ints(self): 234 | self._test_with(np.s_[:, [0], 0, :]) 235 | self._test_with(np.s_[:, [0], :, 0]) 236 | self._test_with(np.s_[:, [0], -1, :]) 237 | self._test_with(np.s_[:, [0], :, -1]) 238 | self._test_with(np.s_[:, 0, [0, 2], [1, 3, 5]]) 239 | 240 | @pytest.mark.skipif(DASK_SLICE_BUG, reason="Dask newaxis + mask slicing broken") 241 | def test_newaxis(self): 242 | self._test_with(np.s_[np.newaxis, :, 2 * UNEVEN, :, 0]) 243 | self._test_with(np.s_[:, 2 * UNEVEN, np.newaxis, 0, :]) 244 | self._test_with(np.s_[0, np.newaxis, 1, np.newaxis, 2, np.newaxis, 3]) 245 | 246 | @pytest.mark.skipif(DASK_SLICE_BUG, reason="Dask newaxis + mask slicing broken") 247 | def test_the_lot(self): 248 | self._test_with(np.s_[..., 0, 2:5, 3 * UNEVEN, np.newaxis, [4, 6]], 249 | np.s_[0, 2:5, 3 * 
UNEVEN, np.newaxis, [4, 6]]) 250 | 251 | 252 | class TestDaskLazyIndexer: 253 | """Test the :class:`~katdal.lazy_indexer.DaskLazyIndexer` class.""" 254 | def setup_method(self): 255 | shape = (10, 20, 30) 256 | self.data = np.arange(np.prod(shape)).reshape(shape) 257 | self.data_dask = da.from_array(self.data, chunks=(1, 4, 5), name='x') 258 | 259 | def test_str_repr(self): 260 | def transform1(x): 261 | return x 262 | transform2 = lambda x: x # noqa: E731 263 | class Transform3: # noqa: E306 264 | def __call__(self, x): 265 | return x 266 | transform3 = Transform3() 267 | transform4 = partial(transform1) 268 | transforms = [transform1, transform2, transform3, transform4] 269 | indexer = DaskLazyIndexer(self.data_dask, transforms=transforms) 270 | expected = 'x | transform1 | | Transform3 | transform1' 271 | expected += f' -> {indexer.shape} {indexer.dtype}' 272 | assert str(indexer) == expected 273 | # Simply exercise repr - no need to check result 274 | repr(indexer) 275 | 276 | def _test_with(self, stage1=(), stage2=()): 277 | npy1 = numpy_oindex(self.data, stage1) 278 | npy2 = numpy_oindex(npy1, stage2) 279 | indexer = DaskLazyIndexer(self.data_dask, stage1) 280 | np.testing.assert_array_equal(indexer[stage2], npy2) 281 | # Check nested indexers 282 | indexer2 = DaskLazyIndexer(indexer, stage2) 283 | np.testing.assert_array_equal(indexer2[()], npy2) 284 | 285 | def test_stage1_slices(self): 286 | self._test_with(np.s_[5:, :, 1::2]) 287 | 288 | def test_stage2_ints(self): 289 | self._test_with(np.s_[5:, :, 1::2], np.s_[1, 2, -1]) 290 | 291 | def test_stage1_multiple_fancy_indices(self): 292 | self._test_with(tuple([True] * d for d in self.data.shape)) 293 | self._test_with(tuple([True, False] * (d // 2) 294 | for d in self.data.shape)) 295 | self._test_with(np.s_[UNEVEN, 2 * UNEVEN, :24]) 296 | self._test_with(np.s_[:3, [1, 2, 3, 4, 6, 9], [8, 6, 4, 2, 0]]) 297 | 298 | def test_stage2_multiple_fancy_indices(self): 299 | stage1 = tuple([True] * d for d in self.data.shape) 300 | stage2 = tuple([True] * 4 + [False] * (d - 4) for d in self.data.shape) 301 | self._test_with(stage1, stage2) 302 | stage2 = tuple([True, False] * (d // 2) for d in self.data.shape) 303 | self._test_with(stage1, stage2) 304 | stage1 = np.s_[UNEVEN, 2 * UNEVEN, :24] 305 | stage2 = np.s_[:3, [1, 2, 3, 4, 6, 9], [8, 6, 4, 2, 0]] 306 | self._test_with(stage1, stage2) 307 | 308 | def test_transforms(self): 309 | # Add transform at initialisation 310 | indexer = DaskLazyIndexer(self.data_dask, transforms=[lambda x: 0 * x]) 311 | np.testing.assert_array_equal(indexer[:], np.zeros_like(indexer)) 312 | # Check nested indexers 313 | indexer = DaskLazyIndexer(self.data_dask) 314 | indexer2 = DaskLazyIndexer(indexer, transforms=[lambda x: 0 * x]) 315 | np.testing.assert_array_equal(indexer[:], self.data) 316 | np.testing.assert_array_equal(indexer2[:], np.zeros_like(indexer)) 317 | -------------------------------------------------------------------------------- /katdal/test/test_vis_flags_weights.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2018-2022,2024, National Research Foundation (SARAO) 3 | # 4 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 5 | # this file except in compliance with the License. 
You may obtain a copy 6 | # of the License at 7 | # 8 | # https://opensource.org/licenses/BSD-3-Clause 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | ################################################################################ 16 | 17 | """Tests for :py:mod:`katdal.vis_flags_weights`.""" 18 | 19 | import itertools 20 | import os 21 | import random 22 | import shutil 23 | import tempfile 24 | 25 | import dask.array as da 26 | import numpy as np 27 | from numpy.testing import assert_array_equal 28 | import pytest 29 | 30 | from katdal.chunkstore import generate_chunks 31 | from katdal.chunkstore_npy import NpyFileChunkStore 32 | from katdal.flags import DATA_LOST 33 | from katdal.van_vleck import autocorr_lookup_table 34 | from katdal.lazy_indexer import DaskLazyIndexer 35 | from katdal.vis_flags_weights import (ChunkStoreVisFlagsWeights, 36 | VisFlagsWeights, corrprod_to_autocorr) 37 | 38 | 39 | def test_vis_flags_weights(): 40 | with pytest.raises(ValueError): 41 | VisFlagsWeights(np.ones((1, 2, 3)), np.ones((1, 2, 3)), np.ones((1, 2, 4))) 42 | with pytest.raises(ValueError): 43 | VisFlagsWeights(np.ones((1, 2, 3)), np.ones((1, 2, 3)), np.ones((1, 2, 3)), np.ones((1, 2, 4))) 44 | 45 | 46 | def ramp(shape, offset=1.0, slope=1.0, dtype=np.float64): 47 | """Generate a multidimensional ramp of values of the given dtype.""" 48 | x = offset + slope * np.arange(np.prod(shape), dtype=np.float64) 49 | return x.astype(dtype).reshape(shape) 50 | 51 | 52 | def to_dask_array(x, chunks=None): 53 | """Turn ndarray `x` into dask array with the standard vis-like chunking.""" 54 | if chunks is None: 55 | itemsize = np.dtype('complex64').itemsize 56 | # Special case for 2-D weights_channel array ensures one chunk per dump 57 | n_corrprods = x.shape[2] if x.ndim >= 3 else x.shape[1] // itemsize 58 | # This contrives to have a vis array with 1 dump and 4 channels per chunk 59 | chunk_size = 4 * n_corrprods * itemsize 60 | chunks = generate_chunks(x.shape, x.dtype, chunk_size, 61 | dims_to_split=(0, 1), power_of_two=True) 62 | return da.from_array(x, chunks) 63 | 64 | 65 | def put_fake_dataset(store, prefix, shape, chunk_overrides=None, array_overrides=None, flags_only=False): 66 | """Write a fake dataset into the chunk store.""" 67 | if flags_only: 68 | data = {'flags': np.random.RandomState(1).randint(0, 7, shape, dtype=np.uint8)} 69 | else: 70 | data = {'correlator_data': ramp(shape, dtype=np.float32) * (1 - 1j), 71 | 'flags': np.random.RandomState(2).randint(0, 7, shape, dtype=np.uint8), 72 | 'weights': ramp(shape, slope=255. 
/ np.prod(shape), dtype=np.uint8), 73 | 'weights_channel': ramp(shape[:-1], dtype=np.float32)} 74 | if array_overrides is not None: 75 | for name in data: 76 | if name in array_overrides: 77 | data[name] = array_overrides[name] 78 | if chunk_overrides is None: 79 | chunk_overrides = {} 80 | ddata = {k: to_dask_array(array, chunk_overrides.get(k)) for k, array in data.items()} 81 | chunk_info = {k: {'prefix': prefix, 'chunks': darray.chunks, 82 | 'dtype': np.lib.format.dtype_to_descr(darray.dtype), 83 | 'shape': darray.shape} 84 | for k, darray in ddata.items()} 85 | for k, darray in ddata.items(): 86 | store.create_array(store.join(prefix, k)) 87 | push = [store.put_dask_array(store.join(prefix, k), darray) 88 | for k, darray in ddata.items()] 89 | da.compute(*push) 90 | return data, chunk_info 91 | 92 | 93 | class TestChunkStoreVisFlagsWeights: 94 | """Test the :class:`ChunkStoreVisFlagsWeights` dataset store.""" 95 | 96 | @classmethod 97 | def setup_class(cls): 98 | cls.tempdir = tempfile.mkdtemp() 99 | 100 | @classmethod 101 | def teardown_class(cls): 102 | shutil.rmtree(cls.tempdir) 103 | 104 | def _make_basic_dataset(self): 105 | store = NpyFileChunkStore(self.tempdir) 106 | prefix = 'cb1' 107 | shape = (10, 64, 30) 108 | data, chunk_info = put_fake_dataset(store, prefix, shape) 109 | weights = data['weights'] * data['weights_channel'][..., np.newaxis] 110 | return store, chunk_info, data, weights 111 | 112 | def test_construction(self): 113 | # Put fake dataset into chunk store 114 | store, chunk_info, data, weights = self._make_basic_dataset() 115 | # Check that data is as expected when accessed via VisFlagsWeights 116 | vfw = ChunkStoreVisFlagsWeights(store, chunk_info) 117 | assert vfw.shape == data['correlator_data'].shape 118 | assert_array_equal(vfw.vis.compute(), data['correlator_data']) 119 | assert_array_equal(vfw.flags.compute(), data['flags']) 120 | assert_array_equal(vfw.weights.compute(), weights) 121 | assert vfw.unscaled_weights is None 122 | 123 | def test_index(self): 124 | # Put fake dataset into chunk store 125 | store, chunk_info, data, weights = self._make_basic_dataset() 126 | index = np.s_[2:5, -20:] 127 | vfw = ChunkStoreVisFlagsWeights(store, chunk_info, preselect_index=index) 128 | assert_array_equal(vfw.vis.compute(), data['correlator_data'][index]) 129 | assert_array_equal(vfw.flags.compute(), data['flags'][index]) 130 | assert_array_equal(vfw.weights.compute(), weights[index]) 131 | 132 | def test_lazy_indexer_interaction(self): 133 | # Put fake dataset into chunk store 134 | store, chunk_info, data, weights = self._make_basic_dataset() 135 | vfw = ChunkStoreVisFlagsWeights(store, chunk_info) 136 | # Check that the combination of DaskLazyIndexer and VisFlagsWeights works 137 | vis_indexer = DaskLazyIndexer(vfw.vis) 138 | flags_indexer = DaskLazyIndexer(vfw.flags) 139 | weights_indexer = DaskLazyIndexer(vfw.weights) 140 | assert_array_equal(vis_indexer[:], data['correlator_data']) 141 | assert_array_equal(flags_indexer[:], data['flags']) 142 | assert_array_equal(weights_indexer[:], weights) 143 | # Probe the case where we select a small portion of the data, which 144 | # has a different code path and also represents what mvftoms does. 
145 | assert_array_equal(vis_indexer[0], data['correlator_data'][0]) 146 | assert_array_equal(flags_indexer[0], data['flags'][0]) 147 | assert_array_equal(weights_indexer[0], weights[0]) 148 | # Also check fancy indexing to complete the set 149 | dumps = np.ones(vfw.shape[0], dtype=bool) 150 | dumps[2:5] = False 151 | dumps[8:] = False 152 | assert_array_equal(vis_indexer[dumps], data['correlator_data'][dumps]) 153 | assert_array_equal(flags_indexer[dumps], data['flags'][dumps]) 154 | assert_array_equal(weights_indexer[dumps], weights[dumps]) 155 | 156 | def test_van_vleck(self): 157 | ants = 7 158 | index1, index2 = np.triu_indices(ants) 159 | inputs = [f'm{i:03}h' for i in range(ants)] 160 | corrprods = np.array([(inputs[a], inputs[b]) for (a, b) in zip(index1, index2)]) 161 | auto_indices, _, _ = corrprod_to_autocorr(corrprods) 162 | # Put fake dataset into chunk store 163 | store = NpyFileChunkStore(self.tempdir) 164 | prefix = 'cb1' 165 | shape = (10, 256, len(index1)) 166 | _, chunk_info = put_fake_dataset(store, prefix, shape, 167 | chunk_overrides={'correlator_data': (1, 4, shape[2] // 2)}) 168 | # Extract uncorrected visibilities and correct them manually 169 | vfw = ChunkStoreVisFlagsWeights(store, chunk_info, corrprods, van_vleck='off') 170 | raw_vis = vfw.vis.compute() 171 | # Yes, this is hard-coded for MeerKAT for now - only fix this once necessary 172 | levels = np.arange(-127., 128.) 173 | quantised_autocorr_table, true_autocorr_table = autocorr_lookup_table(levels) 174 | expected_vis = raw_vis.copy() 175 | expected_vis[..., auto_indices] = np.interp(raw_vis[..., auto_indices].real, 176 | quantised_autocorr_table, true_autocorr_table) 177 | # Now extract corrected visibilities via VisFlagsWeights and compare 178 | corrected_vfw = ChunkStoreVisFlagsWeights(store, chunk_info, corrprods, van_vleck='autocorr') 179 | assert_array_equal(corrected_vfw.vis.compute(), expected_vis) 180 | # Check parameter validation 181 | with pytest.raises(ValueError): 182 | ChunkStoreVisFlagsWeights(store, chunk_info, corrprods, van_vleck='blah') 183 | 184 | def test_weight_power_scale(self): 185 | ants = 7 186 | index1, index2 = np.triu_indices(ants) 187 | inputs = [f'm{i:03}h' for i in range(ants)] 188 | corrprods = np.array([(inputs[a], inputs[b]) for (a, b) in zip(index1, index2)]) 189 | # Put fake dataset into chunk store 190 | store = NpyFileChunkStore(self.tempdir) 191 | prefix = 'cb1' 192 | shape = (10, 64, len(index1)) 193 | 194 | # Make up some vis data where the expected scaling factors can be 195 | # computed by hand. Note: the autocorrs are all set to powers of 196 | # 2 so that we avoid any rounding errors. 
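        # (Reading off the expected values below: the scale factor for baseline (p, q)
        # is the reciprocal of the product of the two autocorrelations, e.g. the default
        # autocorr value of 2 gives a scale of 1 / (2 * 2) = 0.25.)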
197 | vis = np.full(shape, 2 + 3j, np.complex64) 198 | vis[:, :, index1 == index2] = 2 # Make all autocorrs real 199 | vis[3, :, index1 == index2] = 4 # Tests time indexing 200 | vis[:, 7, index1 == index2] = 4 # Tests frequency indexing 201 | vis[:, :, ants] *= 8 # The (1, 1) baseline 202 | vis[4, 5, 0] = 0 # The (0, 0) baseline 203 | expected_scale = np.full(shape, 0.25, np.float32) 204 | expected_scale[3, :, :] = 1 / 16 205 | expected_scale[:, 7, :] = 1 / 16 206 | expected_scale[:, :, index1 == 1] /= 8 207 | expected_scale[:, :, index2 == 1] /= 8 208 | expected_scale[4, 5, index1 == 0] = 2.0**-32 209 | expected_scale[4, 5, index2 == 0] = 2.0**-32 210 | # The inverse scaling effectively multiplies by the relevant autocorrs 211 | expected_inverse_scale = np.reciprocal(expected_scale) 212 | # The tiny "bad" weights are not inverted but zeroed instead, a la pseudo-inverse 213 | expected_inverse_scale[4, 5, index1 == 0] = 0 214 | expected_inverse_scale[4, 5, index2 == 0] = 0 215 | 216 | data, chunk_info = put_fake_dataset( 217 | store, prefix, shape, array_overrides={'correlator_data': vis}) 218 | stored_weights = data['weights'] * data['weights_channel'][..., np.newaxis] 219 | 220 | # Check that data is as expected when accessed via VisFlagsWeights 221 | vfw = ChunkStoreVisFlagsWeights(store, chunk_info, corrprods, 222 | stored_weights_are_scaled=False) 223 | assert vfw.shape == data['correlator_data'].shape 224 | assert_array_equal(vfw.vis.compute(), data['correlator_data']) 225 | assert_array_equal(vfw.flags.compute(), data['flags']) 226 | assert_array_equal(vfw.weights.compute(), stored_weights * expected_scale) 227 | assert_array_equal(vfw.unscaled_weights.compute(), stored_weights) 228 | 229 | # Check that scaled raw weights are also accepted 230 | vfw = ChunkStoreVisFlagsWeights(store, chunk_info, corrprods, 231 | stored_weights_are_scaled=True) 232 | assert vfw.shape == data['correlator_data'].shape 233 | assert_array_equal(vfw.vis.compute(), data['correlator_data']) 234 | assert_array_equal(vfw.flags.compute(), data['flags']) 235 | assert_array_equal(vfw.weights.compute(), stored_weights) 236 | assert_array_equal(vfw.unscaled_weights.compute(), 237 | stored_weights * expected_inverse_scale) 238 | 239 | def _test_missing_chunks(self, shape, chunk_overrides=None): 240 | # Put fake dataset into chunk store 241 | store = NpyFileChunkStore(self.tempdir) 242 | prefix = 'cb2' 243 | data, chunk_info = put_fake_dataset(store, prefix, shape, chunk_overrides) 244 | # Delete some random chunks in each array of the dataset 245 | missing_chunks = {} 246 | rs = random.Random(4) 247 | for array, info in chunk_info.items(): 248 | array_name = store.join(prefix, array) 249 | slices = da.core.slices_from_chunks(info['chunks']) 250 | culled_slices = rs.sample(slices, len(slices) // 10 + 1) 251 | missing_chunks[array] = culled_slices 252 | for culled_slice in culled_slices: 253 | chunk_name, shape = store.chunk_metadata(array_name, culled_slice) 254 | os.remove(os.path.join(store.path, chunk_name) + '.npy') 255 | vfw = ChunkStoreVisFlagsWeights(store, chunk_info) 256 | assert vfw.store == store 257 | assert vfw.vis_prefix == prefix 258 | # Check that (only) missing chunks have been replaced by zeros 259 | vis = data['correlator_data'] 260 | for culled_slice in missing_chunks['correlator_data']: 261 | vis[culled_slice] = 0. 
262 | assert_array_equal(vfw.vis, vis) 263 | weights = data['weights'] * data['weights_channel'][..., np.newaxis] 264 | for culled_slice in missing_chunks['weights'] + missing_chunks['weights_channel']: 265 | weights[culled_slice] = 0. 266 | assert_array_equal(vfw.weights, weights) 267 | # Check that (only) missing chunks have been flagged as 'data lost' 268 | flags = data['flags'] 269 | for culled_slice in missing_chunks['flags']: 270 | flags[culled_slice] = 0 271 | for culled_slice in itertools.chain(*missing_chunks.values()): 272 | flags[culled_slice] |= DATA_LOST 273 | assert_array_equal(vfw.flags, flags) 274 | 275 | def test_missing_chunks(self): 276 | self._test_missing_chunks((100, 256, 30)) 277 | 278 | def test_missing_chunks_uneven_chunking(self): 279 | self._test_missing_chunks( 280 | (20, 210, 30), 281 | { 282 | 'correlator_data': (1, 6, 30), 283 | 'weights': (5, 10, 15), 284 | 'weights_channel': (1, 7), 285 | 'flags': (4, 15, 30) 286 | }) 287 | --------------------------------------------------------------------------------