├── doc ├── _static │ └── .keep ├── _templates │ └── .keep ├── .gitignore ├── modules.rst ├── user.rst ├── mvf_v3.rst ├── index.rst ├── data_set_format.rst ├── Makefile ├── signs.rst ├── katdal.rst ├── mvf_v2.rst ├── tuning.rst ├── conf.py └── intro.rst ├── katdal ├── test │ ├── __init__.py │ ├── conftest.py │ ├── test_categorical.py │ ├── test_chunkstore_npy.py │ ├── test_chunkstore_dict.py │ ├── test_van_vleck.py │ ├── test_spectral_window.py │ ├── test_concatdata.py │ ├── s3_utils.py │ ├── test_sensordata.py │ ├── test_lazy_indexer.py │ └── test_vis_flags_weights.py ├── flags.py ├── chunkstore_dict.py ├── van_vleck.py ├── chunkstore_npy.py ├── __init__.py ├── averager.py ├── spectral_window.py └── ms_async.py ├── pyproject.toml ├── MANIFEST.in ├── pytest.ini ├── setup.cfg ├── Jenkinsfile ├── requirements.txt ├── test-requirements.txt ├── doc-requirements.txt ├── .gitignore ├── LICENSE.txt ├── setup.py ├── scripts ├── mvf_read_benchmark.py ├── spectrogram_plot_example.py ├── mvf_copy.py ├── mvf_download.py └── mvf_rechunk.py ├── NEWS.rst └── README.rst /doc/_static/.keep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /doc/_templates/.keep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /katdal/test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /doc/.gitignore: -------------------------------------------------------------------------------- 1 | _build 2 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel", "katversion"] 3 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE.txt README.rst NEWS.rst requirements.txt test-requirements.txt 2 | -------------------------------------------------------------------------------- /doc/modules.rst: -------------------------------------------------------------------------------- 1 | katdal 2 | ====== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | katdal 8 | -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | markers = 3 | expected_duration(seconds): The expected duration of a test, in seconds. 4 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | 4 | [darglint] 5 | docstring_style=numpy 6 | strictness=long 7 | -------------------------------------------------------------------------------- /doc/user.rst: -------------------------------------------------------------------------------- 1 | User guide 2 | ========== 3 | 4 | .. 
toctree:: 5 | :maxdepth: 2 6 | :caption: Contents: 7 | 8 | intro 9 | tuning 10 | signs 11 | -------------------------------------------------------------------------------- /Jenkinsfile: -------------------------------------------------------------------------------- 1 | #!groovy 2 | 3 | @Library('katsdpjenkins') _ 4 | katsdp.killOldJobs() 5 | katsdp.setDependencies([ 6 | 'ska-sa/katsdpdockerbase/master', 7 | 'ska-sa/katpoint/master', 8 | 'ska-sa/katsdptelstate/master']) 9 | katsdp.standardBuild() 10 | katsdp.mail('ludwig@ska.ac.za') 11 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -c https://raw.githubusercontent.com/ska-sa/katsdpdockerbase/master/docker-base-build/base-requirements.txt 2 | 3 | botocore 4 | cityhash 5 | dask[array] 6 | h5py 7 | numba 8 | numpy 9 | packaging 10 | pyjwt 11 | requests 12 | 13 | katpoint @ git+https://github.com/ska-sa/katpoint 14 | katsdptelstate[rdb] @ git+https://github.com/ska-sa/katsdptelstate 15 | -------------------------------------------------------------------------------- /doc/mvf_v3.rst: -------------------------------------------------------------------------------- 1 | MVF version 3 (early MeerKAT) 2 | ============================= 3 | 4 | The version 3 format is an evolution of the v2 format, and continues to 5 | use HDF5 as the underlying format. It was used for early engineering and 6 | commissioning of MeerKAT, but was replaced by v4 before science operations 7 | started. 8 | 9 | At present there is no detailed documentation. 10 | -------------------------------------------------------------------------------- /test-requirements.txt: -------------------------------------------------------------------------------- 1 | -c https://raw.githubusercontent.com/ska-sa/katsdpdockerbase/master/docker-base-build/base-requirements.txt 2 | 3 | cffi==1.15.1 # via cryptography 4 | coverage 5 | cryptography==38.0.3 6 | packaging 7 | pycparser==2.21 # via cffi 8 | pyparsing # via packaging 9 | pytest 10 | pytest-cov 11 | -------------------------------------------------------------------------------- /doc-requirements.txt: -------------------------------------------------------------------------------- 1 | alabaster 2 | babel 3 | certifi 4 | chardet 5 | docutils 6 | idna 7 | imagesize 8 | jinja2 9 | markupsafe 10 | pygments 11 | pytz 12 | requests 13 | snowballstemmer 14 | sphinx 15 | sphinx-rtd-theme 16 | sphinxcontrib-applehelp 17 | sphinxcontrib-devhelp 18 | sphinxcontrib-htmlhelp 19 | sphinxcontrib-jsmath 20 | sphinxcontrib-qthelp 21 | sphinxcontrib-serializinghtml 22 | sphinxcontrib-websupport 23 | typing; python_version<'3' 24 | urllib3 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.py[cod] 2 | 3 | # C extensions 4 | *.so 5 | 6 | # Packages 7 | *.egg 8 | *.egg-info 9 | dist 10 | build 11 | eggs 12 | .eggs 13 | parts 14 | bin 15 | var 16 | sdist 17 | develop-eggs 18 | .installed.cfg 19 | lib 20 | lib64 21 | __pycache__ 22 | 23 | # Installer logs 24 | pip-log.txt 25 | 26 | # Unit test / coverage reports 27 | .coverage 28 | .tox 29 | nosetests.xml 30 | 31 | # Developer tools 32 | *~ 33 | .ropeproject 34 | 35 | # Visual Studio Code settings 36 | .vscode 37 | 38 | -------------------------------------------------------------------------------- /doc/index.rst: 
-------------------------------------------------------------------------------- 1 | .. katdal documentation master file, created by 2 | sphinx-quickstart on Sun Jun 2 11:18:58 2019. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to katdal's documentation! 7 | ================================== 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | user 14 | data_set_format 15 | API reference <modules> 16 | 17 | 18 | Indices and tables 19 | ================== 20 | 21 | * :ref:`genindex` 22 | * :ref:`modindex` 23 | * :ref:`search` 24 | -------------------------------------------------------------------------------- /doc/data_set_format.rst: -------------------------------------------------------------------------------- 1 | Data set format reference 2 | ========================= 3 | 4 | In most cases users should not need to know the details of the data set formats, 5 | because katdal exists to hide these details and present a consistent, 6 | user-friendly view. It also contains workarounds for known issues in older data 7 | sets (which are not documented here). This is reference documentation useful to 8 | katdal developers and to power users who need to extract information not 9 | presented by the katdal interface. 10 | 11 | .. toctree:: 12 | :maxdepth: 1 13 | :caption: Contents: 14 | 15 | mvf_v1 16 | mvf_v2 17 | mvf_v3 18 | mvf_v4 19 | -------------------------------------------------------------------------------- /doc/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SOURCEDIR = . 8 | BUILDDIR = _build 9 | 10 | # Put it first so that "make" without argument is like "make help". 11 | help: 12 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 13 | 14 | apidoc: 15 | sphinx-apidoc -f -o . ../katdal ../katdal/test/*.py 16 | 17 | .PHONY: help apidoc Makefile 18 | 19 | # Catch-all target: route all unknown targets to Sphinx using the new 20 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 21 | %: Makefile 22 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 23 | -------------------------------------------------------------------------------- /doc/signs.rst: -------------------------------------------------------------------------------- 1 | Sign conventions 2 | ---------------- 3 | 4 | Visibilities 5 | ============ 6 | 7 | For a wave with frequency :math:`\omega` and wave number :math:`k`, the 8 | phasor is 9 | 10 | .. math:: e^{(\omega t - kz)i} 11 | 12 | Visibilities are then :math:`e_1 \overline{e_2}`. 13 | 14 | In KAT-7, the opposite sign convention is used in the HDF5 files, but katdal 15 | conjugates the visibilities to match MeerKAT. 16 | 17 | Baseline coordinates 18 | ==================== 19 | 20 | The UVW coordinates for the baseline (A, B) are 21 | :math:`(u, v, w)_A - (u, v, w)_B`. Combined with the above, this means 22 | that ideal visibilities (ignoring any effects apart from geometric 23 | delay) are 24 | 25 | .. math:: V(u, v, w) = \int \frac{I(l, m)}{n} e^{2\pi i(ul + vm + w(n - 1))}\ dl\ dm 26 | 27 | Polarisation 28 | ============ 29 | 30 | KAT-7 and MeerKAT are linear feed systems.
On MeerKAT, if one points 31 | one's right thumb in the direction of vertical polarisation and the 32 | right index finger in the direction of horizontal polarisation, then the 33 | right middle finger points from the antenna towards the source. 34 | 35 | When exporting to a Measurement Set, katdal maps H to (IEEE) x and V to 36 | y, and introduces a 90° offset to the parallactic angle rotation. 37 | 38 | KAT-7 has the opposite convention for polarisation (due to the lack of a 39 | sub-reflector). katdal does **not** make any effort to compensate for 40 | this. Measurement sets exported from KAT-7 data should thus not be used 41 | for polarimetry without further correction. 42 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2011-2024, National Research Foundation (SARAO) 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without modification, 5 | are permitted provided that the following conditions are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | 3. Neither the name of the copyright holder nor the names of its contributors 15 | may be used to endorse or promote products derived from this software 16 | without specific prior written permission. 17 | 18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 19 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, 20 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 | ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 22 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 | OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 25 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 26 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 27 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 28 | -------------------------------------------------------------------------------- /katdal/flags.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2011,2019, National Research Foundation (SARAO) 3 | # 4 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 5 | # this file except in compliance with the License. You may obtain a copy 6 | # of the License at 7 | # 8 | # https://opensource.org/licenses/BSD-3-Clause 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | ################################################################################ 16 | 17 | """Definitions of flag bits""" 18 | 19 | NAMES = ('reserved0', 'static', 'cam', 'data_lost', 20 | 'ingest_rfi', 'predicted_rfi', 'cal_rfi', 'postproc') 21 | DESCRIPTIONS = ('reserved - bit 0', 22 | 'predefined static flag list', 23 | 'flag based on live CAM information', 24 | 'no data was received', 25 | 'RFI detected in ingest', 26 | 'RFI predicted from space based pollutants', 27 | 'RFI detected in calibration', 28 | 'some correction/postprocessing step could not be applied') 29 | 30 | STATIC_BIT = 1 31 | CAM_BIT = 2 32 | DATA_LOST_BIT = 3 33 | INGEST_RFI_BIT = 4 34 | PREDICTED_RFI_BIT = 5 35 | CAL_RFI_BIT = 6 36 | POSTPROC_BIT = 7 37 | 38 | STATIC = 1 << STATIC_BIT 39 | CAM = 1 << CAM_BIT 40 | DATA_LOST = 1 << DATA_LOST_BIT 41 | INGEST_RFI = 1 << INGEST_RFI_BIT 42 | PREDICTED_RFI = 1 << PREDICTED_RFI_BIT 43 | CAL_RFI = 1 << CAL_RFI_BIT 44 | POSTPROC = 1 << POSTPROC_BIT 45 | -------------------------------------------------------------------------------- /katdal/test/conftest.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2023, National Research Foundation (SARAO) 3 | # 4 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 5 | # this file except in compliance with the License. You may obtain a copy 6 | # of the License at 7 | # 8 | # https://opensource.org/licenses/BSD-3-Clause 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | ################################################################################ 16 | 17 | import pytest 18 | 19 | TEST_DURATION_TOLERANCE = 0.1 20 | 21 | 22 | def pytest_addoption(parser): 23 | parser.addoption( 24 | "--check-durations", 25 | action="store_true", 26 | help="Verify how long some tests run (the ones with an `expected_duration` mark)", 27 | ) 28 | 29 | 30 | @pytest.hookimpl(hookwrapper=True) 31 | def pytest_runtest_makereport(item, call): 32 | """Optionally override pytest report creation to verify test duration.""" 33 | report = (yield).get_result() 34 | # Only continue if the user requests this and the test has an expected_duration mark 35 | check_durations = item.config.getoption("--check-durations", default=False) 36 | mark = item.get_closest_marker("expected_duration") 37 | if not check_durations or mark is None: 38 | return report 39 | # The test will take at least as long as the expected duration and probably a bit longer 40 | minimum = mark.args[0] 41 | maximum = minimum + TEST_DURATION_TOLERANCE 42 | # Only verify duration if the test itself passes (and we are in the 'call' phase of test) 43 | if ( 44 | report.when == 'call' 45 | and report.passed 46 | and not minimum <= report.duration <= maximum 47 | ): 48 | # Mark test as failed and report the timing discrepancy 49 | report.outcome = 'failed' 50 | report.longrepr = (f"\nTest took {report.duration:g} seconds, " 51 | f"which is outside the range [{minimum:g}, {maximum:g}]\n") 52 | return report 53 | -------------------------------------------------------------------------------- /katdal/test/test_categorical.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2017-2018,2021, National Research Foundation (SARAO) 3 | # 4 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 5 | # this file except in compliance with the License. You may obtain a copy 6 | # of the License at 7 | # 8 | # https://opensource.org/licenses/BSD-3-Clause 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | ################################################################################ 16 | 17 | """Tests for :py:mod:`katdal.categorical`.""" 18 | 19 | import numpy as np 20 | from numpy.testing import assert_array_equal 21 | 22 | from katdal.categorical import _single_event_per_dump, sensor_to_categorical 23 | 24 | 25 | def test_dump_to_event_parsing(): 26 | values = np.array(list('ABCDEFGH')) 27 | events = np.array([0, 0, 1, 3, 3, 4, 4, 6, 8]) 28 | greedy = np.array([1, 0, 0, 1, 1, 0, 0, 0]) 29 | cleaned = list(_single_event_per_dump(events, greedy)) 30 | new_values = values[cleaned] 31 | new_events = events[cleaned] 32 | assert_array_equal(cleaned, [0, 2, 4, 6, 7], 'Dump->event parser failed') 33 | assert_array_equal(new_values, list('ACEGH'), 'Dump->event parser failed') 34 | assert_array_equal(new_events, [0, 1, 3, 5, 6], 'Dump->event parser failed') 35 | 36 | 37 | def test_categorical_sensor_creation(): 38 | timestamps = [-363.784, 2.467, 8.839, 8.867, 15.924, 48.925, 54.897, 88.982] 39 | values = ['stop', 'slew', 'track', 'slew', 'track', 'slew', 'track', 'slew'] 40 | dump_period = 8. 41 | dump_times = np.arange(4., 100., dump_period) 42 | categ = sensor_to_categorical(timestamps, values, dump_times, dump_period, 43 | greedy_values=('slew', 'stop'), 44 | initial_value='slew') 45 | assert_array_equal(categ.unique_values, ['slew', 'track'], 46 | 'Sensor->categorical failed') 47 | assert_array_equal(categ.events, [0, 2, 6, 7, 11, 12], 48 | 'Sensor->categorical failed') 49 | assert_array_equal(categ.indices, [0, 1, 0, 1, 0], 50 | 'Sensor->categorical failed') 51 | -------------------------------------------------------------------------------- /katdal/test/test_chunkstore_npy.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2017-2018,2021-2022, National Research Foundation (SARAO) 3 | # 4 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 5 | # this file except in compliance with the License. You may obtain a copy 6 | # of the License at 7 | # 8 | # https://opensource.org/licenses/BSD-3-Clause 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | ################################################################################ 16 | 17 | """Tests for :py:mod:`katdal.chunkstore_npy`.""" 18 | 19 | import os 20 | import shutil 21 | import tempfile 22 | 23 | import pytest 24 | 25 | from katdal.chunkstore import StoreUnavailable 26 | from katdal.chunkstore_npy import NpyFileChunkStore 27 | from katdal.test.test_chunkstore import ChunkStoreTestBase, generate_arrays 28 | 29 | 30 | class TestNpyFileChunkStore(ChunkStoreTestBase): 31 | """Test NPY file functionality using a temporary directory.""" 32 | 33 | @classmethod 34 | def setup_class(cls): 35 | """Create temp dir to store NPY files and build ChunkStore on that.""" 36 | cls.arrays = generate_arrays() 37 | cls.tempdir = tempfile.mkdtemp() 38 | cls.store = NpyFileChunkStore(cls.tempdir) 39 | 40 | @classmethod 41 | def teardown_class(cls): 42 | shutil.rmtree(cls.tempdir) 43 | 44 | def setup_method(self): 45 | # Clean out data created by previous tests 46 | for entry in os.scandir(self.tempdir): 47 | if not entry.name.startswith('.') and entry.is_dir(): 48 | shutil.rmtree(entry.path) 49 | 50 | def test_store_unavailable(self): 51 | with pytest.raises(StoreUnavailable): 52 | NpyFileChunkStore('hahahahahaha') 53 | 54 | 55 | class TestNpyFileChunkStoreDirectWrite(TestNpyFileChunkStore): 56 | """Test NPY file functionality with O_DIRECT writes.""" 57 | 58 | @classmethod 59 | def setup_class(cls): 60 | """Create temp dir to store NPY files and build ChunkStore on that.""" 61 | cls.tempdir = tempfile.mkdtemp() 62 | try: 63 | cls.store = NpyFileChunkStore(cls.tempdir, direct_write=True) 64 | except StoreUnavailable as e: 65 | if 'not supported' in str(e): 66 | pytest.skip(str(e)) 67 | raise 68 | -------------------------------------------------------------------------------- /katdal/test/test_chunkstore_dict.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2017-2018,2021-2022, National Research Foundation (SARAO) 3 | # 4 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 5 | # this file except in compliance with the License. You may obtain a copy 6 | # of the License at 7 | # 8 | # https://opensource.org/licenses/BSD-3-Clause 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | ################################################################################ 16 | 17 | """Tests for :py:mod:`katdal.chunkstore_dict`.""" 18 | 19 | import time 20 | 21 | import numpy as np 22 | import dask.array as da 23 | 24 | from katdal.chunkstore_dict import DictChunkStore 25 | from katdal.test.test_chunkstore import ChunkStoreTestBase, generate_arrays 26 | 27 | 28 | class TestDictChunkStore(ChunkStoreTestBase): 29 | def setup_method(self): 30 | self.arrays = generate_arrays() 31 | self.store = DictChunkStore(**self.arrays) 32 | # This store is prepopulated so missing chunks can't be checked 33 | self.preloaded_chunks = True 34 | 35 | 36 | def test_basic_overheads(): 37 | """Check overheads of creating and transferring dask array between stores.""" 38 | # The array is about 1 GB in size 39 | shape = (100, 1000, 1000) 40 | x = np.ones(shape) 41 | y = np.zeros(shape) 42 | store1 = DictChunkStore(x=x) 43 | store2 = DictChunkStore(y=y) 44 | # We have 1000 chunks of about 1 MB each 45 | chunk_size = (1, 100, 1000) 46 | chunks = da.core.normalize_chunks(chunk_size, shape) 47 | # Check that the time to set up dask arrays is not grossly inflated 48 | start_time = time.process_time() 49 | dx = store1.get_dask_array('x', chunks, float) 50 | py = store2.put_dask_array('y', dx) 51 | setup_duration = time.process_time() - start_time 52 | assert setup_duration < 1.0 53 | # Use basic array copy as a reference 54 | start_time = time.process_time() 55 | y[:] = x 56 | copy_duration = time.process_time() - start_time 57 | # Check ChunkStore / dask overhead on top of basic memory copy 58 | start_time = time.process_time() 59 | success = py.compute() 60 | dask_duration = time.process_time() - start_time 61 | assert dask_duration < 10 * copy_duration 62 | np.testing.assert_equal(success, None) 63 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ################################################################################ 4 | # Copyright (c) 2011,2013,2016-2023, National Research Foundation (SARAO) 5 | # 6 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 7 | # this file except in compliance with the License. You may obtain a copy 8 | # of the License at 9 | # 10 | # https://opensource.org/licenses/BSD-3-Clause 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | ################################################################################ 18 | 19 | import os.path 20 | 21 | from setuptools import find_packages, setup 22 | 23 | here = os.path.dirname(__file__) 24 | readme = open(os.path.join(here, 'README.rst')).read() 25 | news = open(os.path.join(here, 'NEWS.rst')).read() 26 | long_description = readme + '\n\n' + news 27 | 28 | setup(name='katdal', 29 | description='Karoo Array Telescope data access library for interacting ' 30 | 'with data sets in the MeerKAT Visibility Format (MVF)', 31 | long_description=long_description, 32 | long_description_content_type='text/x-rst', 33 | author='Ludwig Schwardt', 34 | author_email='ludwig@ska.ac.za', 35 | packages=find_packages(), 36 | scripts=[ 37 | 'scripts/mvf_copy.py', 38 | 'scripts/mvf_download.py', 39 | 'scripts/mvftoms.py', 40 | ], 41 | url='https://github.com/ska-sa/katdal', 42 | license='Modified BSD', 43 | classifiers=[ 44 | 'Development Status :: 4 - Beta', 45 | 'Intended Audience :: Developers', 46 | 'License :: OSI Approved :: BSD License', 47 | 'Operating System :: OS Independent', 48 | 'Programming Language :: Python', 49 | 'Programming Language :: Python :: 3', 50 | 'Topic :: Software Development :: Libraries :: Python Modules', 51 | 'Topic :: Scientific/Engineering :: Astronomy'], 52 | platforms=['OS Independent'], 53 | keywords='meerkat ska', 54 | python_requires='>=3.6', 55 | setup_requires=['katversion'], 56 | use_katversion=True, 57 | install_requires=[ 58 | 'numpy >= 1.12.0', 59 | 'katpoint >= 0.9, < 1', 60 | 'h5py >= 2.3', 61 | 'numba', 62 | 'katsdptelstate[rdb] >= 0.10', 63 | 'dask[array] >= 2.7.0', 64 | 'requests >= 2.18.0', 65 | 'pyjwt >= 2', 66 | 'cityhash >= 0.2.2', 67 | 'packaging', 68 | ], 69 | extras_require={ 70 | 'ms': ['python-casacore >= 2.2.1'], 71 | 's3': [], 72 | 's3credentials': ['botocore'] 73 | }, 74 | tests_require=['cryptography', 'pytest']) 75 | -------------------------------------------------------------------------------- /katdal/chunkstore_dict.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2017-2018,2021, National Research Foundation (SARAO) 3 | # 4 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 5 | # this file except in compliance with the License. You may obtain a copy 6 | # of the License at 7 | # 8 | # https://opensource.org/licenses/BSD-3-Clause 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | ################################################################################ 16 | 17 | """A store of chunks (i.e. N-dimensional arrays) based on a dict of arrays.""" 18 | 19 | from .chunkstore import BadChunk, ChunkNotFound, ChunkStore 20 | 21 | 22 | class DictChunkStore(ChunkStore): 23 | """A store of chunks (i.e. N-dimensional arrays) based on a dict of arrays. 24 | 25 | This interprets all keyword arguments as NumPy arrays and stores them in 26 | an `arrays` dict. Each array is identified by its corresponding keyword. 
27 | New arrays cannot be added via :meth:`put` - they all need to be in place 28 | at store initialisation (or can be added afterwards via direct insertion 29 | into the `arrays` dict). The `put` method is only useful for in-place 30 | modification of existing arrays. 31 | """ 32 | 33 | def __init__(self, **kwargs): 34 | error_map = {KeyError: ChunkNotFound, IndexError: ChunkNotFound} 35 | super().__init__(error_map) 36 | self.arrays = kwargs 37 | 38 | def get_chunk(self, array_name, slices, dtype): 39 | """See the docstring of :meth:`ChunkStore.get_chunk`.""" 40 | chunk_name, shape = self.chunk_metadata(array_name, slices, dtype=dtype) 41 | with self._standard_errors(chunk_name): 42 | array = self.arrays[array_name] 43 | # Ensure that chunk is array (otherwise 0-dim array becomes number) 44 | chunk = array[slices] if slices != () else array 45 | if chunk.shape != shape or chunk.dtype != dtype: 46 | raise BadChunk(f'Chunk {chunk_name!r}: requested dtype {chunk.dtype} and/or shape ' 47 | f'{chunk.shape} differs from expected dtype {dtype} and shape {shape}') 48 | return chunk 49 | 50 | def create_array(self, array_name): 51 | if array_name not in self.arrays: 52 | raise NotImplementedError 53 | 54 | def put_chunk(self, array_name, slices, chunk): 55 | """See the docstring of :meth:`ChunkStore.put_chunk`.""" 56 | self.chunk_metadata(array_name, slices, chunk=chunk) 57 | self.get_chunk(array_name, slices, chunk.dtype)[()] = chunk 58 | 59 | get_chunk.__doc__ = ChunkStore.get_chunk.__doc__ 60 | put_chunk.__doc__ = ChunkStore.put_chunk.__doc__ 61 | -------------------------------------------------------------------------------- /scripts/mvf_read_benchmark.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ################################################################################ 4 | # Copyright (c) 2018-2021,2023, National Research Foundation (SARAO) 5 | # 6 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 7 | # this file except in compliance with the License. You may obtain a copy 8 | # of the License at 9 | # 10 | # https://opensource.org/licenses/BSD-3-Clause 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | ################################################################################ 18 | 19 | import argparse 20 | import logging 21 | import time 22 | 23 | import dask 24 | import numpy as np 25 | 26 | import katdal 27 | from katdal.lazy_indexer import DaskLazyIndexer 28 | 29 | parser = argparse.ArgumentParser() 30 | parser.add_argument('filename') 31 | parser.add_argument('--time', type=int, default=10, help='Number of times to read per batch') 32 | parser.add_argument('--channels', type=int, help='Number of channels to read') 33 | parser.add_argument('--dumps', type=int, help='Number of times to read') 34 | parser.add_argument('--joint', action='store_true', help='Load vis, weights, flags together') 35 | parser.add_argument('--applycal', help='Calibration solutions to apply') 36 | parser.add_argument('--workers', type=int, help='Number of dask workers') 37 | args = parser.parse_args() 38 | 39 | logging.basicConfig(level='INFO', format='%(asctime)s [%(levelname)s] %(message)s') 40 | if args.workers is not None: 41 | dask.config.set(num_workers=args.workers) 42 | logging.info('Starting') 43 | kwargs = {} 44 | if args.applycal is not None: 45 | kwargs['applycal'] = args.applycal 46 | f = katdal.open(args.filename, **kwargs) 47 | logging.info('File loaded, shape %s', f.shape) 48 | if args.channels: 49 | f.select(channels=np.s_[:args.channels]) 50 | if args.dumps: 51 | f.select(dumps=np.s_[:args.dumps]) 52 | # Trigger creation of the dask graphs, population of sensor cache for applycal etc 53 | _ = (f.vis[0, 0, 0], f.weights[0, 0, 0], f.flags[0, 0, 0]) 54 | logging.info('Selection complete') 55 | start = time.time() 56 | last_time = start 57 | for st in range(0, f.shape[0], args.time): 58 | et = st + args.time 59 | if args.joint: 60 | vis, weights, flags = DaskLazyIndexer.get([f.vis, f.weights, f.flags], np.s_[st:et]) 61 | else: 62 | vis = f.vis[st:et] 63 | weights = f.weights[st:et] 64 | flags = f.flags[st:et] 65 | current_time = time.time() 66 | elapsed = current_time - last_time 67 | last_time = current_time 68 | size = np.prod(vis.shape) * 10 69 | logging.info('Loaded %d dumps (%.3f MB/s)', vis.shape[0], size / elapsed / 1e6) 70 | size = np.prod(f.shape) * 10 71 | elapsed = time.time() - start 72 | logging.info('Loaded %d bytes in %.3f s (%.3f MB/s)', size, elapsed, size / elapsed / 1e6) 73 | -------------------------------------------------------------------------------- /katdal/test/test_van_vleck.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2020-2021, National Research Foundation (SARAO) 3 | # 4 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 5 | # this file except in compliance with the License. You may obtain a copy 6 | # of the License at 7 | # 8 | # https://opensource.org/licenses/BSD-3-Clause 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | ################################################################################ 16 | 17 | import numpy as np 18 | 19 | from katdal.van_vleck import autocorr_lookup_table, norm0_cdf 20 | 21 | 22 | def test_norm0_cdf(): 23 | scale = 2.0 24 | x = np.array([0.0, 2.0, 4.0, 6.0]) 25 | # Generated by scipy.stats.norm.cdf(x, scale=2.0) 26 | expected = np.array([0.5, 0.8413447460685429, 0.9772498680518208, 0.9986501019683699]) 27 | actual = norm0_cdf(x, scale) 28 | np.testing.assert_allclose(actual, expected, rtol=0., atol=np.finfo(float).eps) 29 | actual = norm0_cdf(-x, scale) 30 | np.testing.assert_allclose(actual, 1.0 - expected, rtol=0., atol=np.finfo(float).eps) 31 | actual = norm0_cdf(x[-1], scale) 32 | np.testing.assert_allclose(actual, expected[-1], rtol=0., atol=np.finfo(float).eps) 33 | 34 | 35 | def test_autocorr_correction(): 36 | # 15-level "4-bit" KAT-7 requantiser (contiguous ints for now) 37 | levels = np.arange(-7., 8.) 38 | quantised_ac_table, true_ac_table = autocorr_lookup_table(levels) 39 | N = 100000 40 | rs = np.random.RandomState(42) 41 | autocorrs = [0.06, 0.2, 1.0, 10.0, 100.0, 1000.0] 42 | # Excess above usual sample standard deviation due to loss of information caused by quantisation, 43 | # generated by Bayesian quantisation correction code (ask the author for details) 44 | rtol_factors = [2.5, 1.24, 1.16, 1.02, 1.5, 2.9] 45 | for true_ac, rtol_factor in zip(autocorrs, rtol_factors): 46 | # Generate complex random voltages with appropriate variance 47 | scale = np.sqrt(true_ac / 2.) 48 | x = rs.normal(scale=scale, size=N) + 1j * rs.normal(scale=scale, size=N) 49 | # Estimate power of the unquantised complex signal as a sanity check 50 | unquantised_sample_ac = x.dot(x.conj()).real / N 51 | # The standard deviation of sample variance of N complex normals of variance `var` 52 | # is var / sqrt(N). Use rtol since stdev is proportional to var and set it to 3 sigma. 53 | rtol = 3.0 / np.sqrt(N) 54 | np.testing.assert_allclose(unquantised_sample_ac, true_ac, rtol=rtol) 55 | # Quantise x to the nearest integer and clip (assumes levels are contiguous ints) 56 | xq = x.round() 57 | np.clip(xq.real, levels[0], levels[-1], out=xq.real) 58 | np.clip(xq.imag, levels[0], levels[-1], out=xq.imag) 59 | # Estimate power of the quantised signal and correct the effects of quantisation 60 | quantised_sample_ac = xq.dot(xq.conj()).real / N 61 | corrected_ac = np.interp(quantised_sample_ac, quantised_ac_table, true_ac_table) 62 | np.testing.assert_allclose(corrected_ac, true_ac, rtol=rtol_factor * rtol) 63 | np.testing.assert_allclose(corrected_ac, unquantised_sample_ac, rtol=rtol_factor * rtol) 64 | -------------------------------------------------------------------------------- /doc/katdal.rst: -------------------------------------------------------------------------------- 1 | katdal package 2 | ============== 3 | 4 | Submodules 5 | ---------- 6 | 7 | katdal.applycal module 8 | ---------------------- 9 | 10 | .. automodule:: katdal.applycal 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | katdal.averager module 16 | ---------------------- 17 | 18 | .. automodule:: katdal.averager 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | katdal.categorical module 24 | ------------------------- 25 | 26 | .. automodule:: katdal.categorical 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | katdal.chunkstore module 32 | ------------------------ 33 | 34 | .. 
automodule:: katdal.chunkstore 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | katdal.chunkstore\_dict module 40 | ------------------------------ 41 | 42 | .. automodule:: katdal.chunkstore_dict 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | katdal.chunkstore\_npy module 48 | ----------------------------- 49 | 50 | .. automodule:: katdal.chunkstore_npy 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | katdal.chunkstore\_s3 module 56 | ---------------------------- 57 | 58 | .. automodule:: katdal.chunkstore_s3 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | katdal.concatdata module 64 | ------------------------ 65 | 66 | .. automodule:: katdal.concatdata 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | 71 | katdal.dataset module 72 | --------------------- 73 | 74 | .. automodule:: katdal.dataset 75 | :members: 76 | :undoc-members: 77 | :show-inheritance: 78 | 79 | katdal.datasources module 80 | ------------------------- 81 | 82 | .. automodule:: katdal.datasources 83 | :members: 84 | :undoc-members: 85 | :show-inheritance: 86 | 87 | katdal.flags module 88 | ------------------- 89 | 90 | .. automodule:: katdal.flags 91 | :members: 92 | :undoc-members: 93 | :show-inheritance: 94 | 95 | katdal.h5datav1 module 96 | ---------------------- 97 | 98 | .. automodule:: katdal.h5datav1 99 | :members: 100 | :undoc-members: 101 | :show-inheritance: 102 | 103 | katdal.h5datav2 module 104 | ---------------------- 105 | 106 | .. automodule:: katdal.h5datav2 107 | :members: 108 | :undoc-members: 109 | :show-inheritance: 110 | 111 | katdal.h5datav3 module 112 | ---------------------- 113 | 114 | .. automodule:: katdal.h5datav3 115 | :members: 116 | :undoc-members: 117 | :show-inheritance: 118 | 119 | katdal.lazy\_indexer module 120 | --------------------------- 121 | 122 | .. automodule:: katdal.lazy_indexer 123 | :members: 124 | :undoc-members: 125 | :show-inheritance: 126 | 127 | katdal.ms\_async module 128 | ----------------------- 129 | 130 | .. automodule:: katdal.ms_async 131 | :members: 132 | :undoc-members: 133 | :show-inheritance: 134 | 135 | katdal.ms\_extra module 136 | ----------------------- 137 | 138 | .. automodule:: katdal.ms_extra 139 | :members: 140 | :undoc-members: 141 | :show-inheritance: 142 | 143 | katdal.sensordata module 144 | ------------------------ 145 | 146 | .. automodule:: katdal.sensordata 147 | :members: 148 | :undoc-members: 149 | :show-inheritance: 150 | 151 | katdal.spectral\_window module 152 | ------------------------------ 153 | 154 | .. automodule:: katdal.spectral_window 155 | :members: 156 | :undoc-members: 157 | :show-inheritance: 158 | 159 | katdal.visdatav4 module 160 | ----------------------- 161 | 162 | .. automodule:: katdal.visdatav4 163 | :members: 164 | :undoc-members: 165 | :show-inheritance: 166 | 167 | 168 | Module contents 169 | --------------- 170 | 171 | .. automodule:: katdal 172 | :members: 173 | :undoc-members: 174 | :show-inheritance: 175 | -------------------------------------------------------------------------------- /katdal/van_vleck.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2012,2020,2021, National Research Foundation (SARAO) 3 | # 4 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 5 | # this file except in compliance with the License. 
You may obtain a copy 6 | # of the License at 7 | # 8 | # https://opensource.org/licenses/BSD-3-Clause 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | ################################################################################ 16 | 17 | """Routines for performing quantisation (Van Vleck) correction.""" 18 | 19 | import math 20 | 21 | import numba 22 | import numpy as np 23 | 24 | 25 | @numba.vectorize(['f8(f8, f8)'], nopython=True, cache=True) 26 | def norm0_cdf(x, scale): 27 | """Fast zero-mean (loc=0) implementation of :meth:`scipy.stats.norm.cdf`.""" 28 | return 0.5 * (math.erf(np.sqrt(0.5) * x / scale) + 1.) 29 | 30 | 31 | def _quant_norm0_pmf(levels, var=1.0): 32 | """Probability mass function of quantised zero-mean normal variable.""" 33 | edges = np.r_[-np.inf, levels[:-1] + np.diff(levels) / 2., np.inf] 34 | return np.diff(norm0_cdf(edges, np.sqrt(var))) 35 | 36 | 37 | def _squared_quant_norm0_mean(levels, var=1.0): 38 | """Mean of squared quantised zero-mean normal variable (same shape as `var`).""" 39 | levels = np.asarray(levels) 40 | # Allow var and levels to be broadcast against each other, with levels as last dimension 41 | var = np.asarray(var)[..., np.newaxis] 42 | pmf = _quant_norm0_pmf(levels, var) 43 | return pmf.dot(levels * levels) 44 | 45 | 46 | def autocorr_lookup_table(levels, size=4000): 47 | """Lookup table that corrects complex autocorrelation quantisation effects. 48 | 49 | This maps the variance of a quantised complex voltage signal to the variance 50 | of the unquantised signal under the assumption that the signal is proper 51 | (circularly-symmetric) complex normally distributed. 52 | 53 | Parameters 54 | ---------- 55 | levels : sequence of float 56 | Quantisation levels for real and imaginary components of voltage signal 57 | size : int, optional 58 | Size of lookup table 59 | 60 | Returns 61 | ------- 62 | quantised_autocorr_table, true_autocorr_table : array of float, shape (`size`,) 63 | Lookup table associating quantised autocorrelations and unquantised 64 | autocorrelations (i.e. 
power/variance of complex signals) 65 | """ 66 | # Terminology: 67 | # x = Proper complex normal voltage signal (zero-mean) 68 | # rxx = Power (variance) *per* real/imag component of unquantised / true x 69 | # sxx = Power (variance) *per* real/imag component of quantised x 70 | abs_levels = np.abs(levels) 71 | sxx_min_nonzero = abs_levels[abs_levels > 0].min() ** 2 72 | sxx_max = abs_levels.max() ** 2 73 | # Sweep across range of true power values, placing more table entries at tricky lower end 74 | rxx_grid = np.r_[np.logspace(-2.4, 0, size // 2, endpoint=False), 75 | np.logspace(0, np.log10(sxx_max / sxx_min_nonzero) + 8, size - 2 - size // 2)] 76 | # Shift the table to place inflection point at minimum non-zero sxx 77 | rxx_grid *= sxx_min_nonzero 78 | # Map true power to expected quantised power 79 | sxx_mean = _squared_quant_norm0_mean(levels, rxx_grid) 80 | # Extend quantised power values to its maximum range 81 | sxx_table = np.r_[0., sxx_mean, sxx_max] 82 | # Replace asymptotic with linear decay at bottom end, and clip unbounded growth at top end 83 | rxx_table = np.r_[0., rxx_grid, rxx_grid[-1]] 84 | # The factor 2 converts power per real/imag component to power/variance of complex signal 85 | return 2. * sxx_table, 2. * rxx_table 86 | -------------------------------------------------------------------------------- /katdal/test/test_spectral_window.py: -------------------------------------------------------------------------------- 1 | ############################################################################### 2 | # Copyright (c) 2018,2021-2022, National Research Foundation (SARAO) 3 | # 4 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 5 | # this file except in compliance with the License. You may obtain a copy 6 | # of the License at 7 | # 8 | # https://opensource.org/licenses/BSD-3-Clause 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | ############################################################################### 16 | 17 | """Tests for :py:mod:`katdal.spectral_window`.""" 18 | 19 | import numpy as np 20 | from numpy.testing import assert_array_almost_equal, assert_array_equal 21 | 22 | from katdal.spectral_window import SpectralWindow 23 | 24 | 25 | class TestSpectralWindow: 26 | def setup_method(self): 27 | self.lsb = SpectralWindow(1000.0, 10.0, 6, sideband=-1, product='lsb') 28 | self.usb = SpectralWindow(1000.0, 10.0, 6, sideband=1, band='X') 29 | self.odd = SpectralWindow(1000.0, 10.0, 5, sideband=1) 30 | # channel_width will not be an exact float. The values have been 31 | # chosen so that bandwidth / num_chans * num_chans does not quite 32 | # equal bandwidth. 
33 | self.inexact = SpectralWindow(1000.0, None, 14, sideband=1, 34 | bandwidth=230.0) 35 | 36 | def test_width_properties(self): 37 | assert self.lsb.channel_width == 10.0 38 | assert self.lsb.bandwidth == 60.0 39 | assert self.inexact.channel_width == 230.0 / 14 40 | assert self.inexact.bandwidth == 230.0 41 | 42 | def test_channel_freqs(self): 43 | assert_array_equal(self.lsb.channel_freqs, 44 | [1030.0, 1020.0, 1010.0, 1000.0, 990.0, 980.0]) 45 | assert_array_equal(self.usb.channel_freqs, 46 | [970.0, 980.0, 990.0, 1000.0, 1010.0, 1020.0]) 47 | assert_array_equal(self.odd.channel_freqs, 48 | [980.0, 990.0, 1000.0, 1010.0, 1020.0]) 49 | assert_array_almost_equal(self.inexact.channel_freqs, 50 | np.arange(14) * 230.0 / 14 + 885.0) 51 | # Check that the exactly representable values are exact 52 | assert self.inexact.channel_freqs[0] == 885.0 53 | assert self.inexact.channel_freqs[7] == 1000.0 54 | 55 | def test_repr(self): 56 | # Just a smoke test to check that it doesn't crash 57 | repr(self.lsb) 58 | repr(self.usb) 59 | 60 | def test_subrange(self): 61 | lsb_sub = self.lsb.subrange(0, 3) 62 | assert_array_equal(lsb_sub.channel_freqs, [1030.0, 1020.0, 1010.0]) 63 | assert lsb_sub.product == self.lsb.product 64 | usb_sub = self.usb.subrange(2, 6) 65 | assert_array_equal(usb_sub.channel_freqs, 66 | [990.0, 1000.0, 1010.0, 1020.0]) 67 | assert usb_sub.band == self.usb.band 68 | # Check that updated bandwidth doesn't have rounding errors 69 | inexact_sub = self.inexact.subrange(0, 7) 70 | assert inexact_sub.bandwidth == 115.0 71 | 72 | def test_rechannelise_same(self): 73 | lsb = self.lsb.rechannelise(6) 74 | assert lsb == self.lsb 75 | 76 | def test_rechannelise_to_even(self): 77 | lsb = self.lsb.rechannelise(2) 78 | assert_array_equal(lsb.channel_freqs, [1020.0, 990.0]) 79 | usb = self.usb.rechannelise(2) 80 | assert_array_equal(usb.channel_freqs, [980.0, 1010.0]) 81 | 82 | def test_rechannelise_to_odd(self): 83 | lsb = self.lsb.rechannelise(3) 84 | assert_array_equal(lsb.channel_freqs, [1025.0, 1005.0, 985.0]) 85 | usb = self.usb.rechannelise(3) 86 | assert_array_equal(usb.channel_freqs, [975.0, 995.0, 1015.0]) 87 | odd = self.odd.rechannelise(1) 88 | assert_array_equal(odd.channel_freqs, [1000.0]) 89 | -------------------------------------------------------------------------------- /doc/mvf_v2.rst: -------------------------------------------------------------------------------- 1 | .. _hdf5_format_v2: 2 | 3 | MVF version 2 (KAT-7) 4 | ====================== 5 | 6 | .. sectionauthor:: Simon Ratcliffe, Ludwig Schwardt 7 | 8 | Introduction 9 | ------------ 10 | 11 | With the introduction of the KAT-7 correlator, we have taken the opportunity to revisit the correlator data storage format. This document describes the updated format. 12 | 13 | Basic Concept 14 | ------------- 15 | A single HDF5 file corresponds to a single observation (a contiguous telescope time segment for a specified subarray). 16 | 17 | At the highest level, the file is split into Data and MetaData. 18 | 19 | MetaData contains two distinct types of information: 20 | 21 | * Configuration is known a priori and is static for the duration of the observation. 22 | * Sensors contain dynamic information provided in the form of katcp sensors, which is typically only fully known post observation. 23 | 24 | Flags and History are special-case objects that get populated during run time but not from sensors. These are also the only groups that could get updated post augmentation.
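As an illustration of this split, the sketch below reads a few of the datasets and attributes listed in the HDF5 Format section further down, using nothing but ``h5py``. It is a minimal, hypothetical example (the filename is made up); in practice such files are opened via ``katdal.open``, which also applies the workarounds for known issues in older data sets.

.. code:: python

    import h5py

    # Hypothetical filename - any MVF v2 (KAT-7) HDF5 file would do
    with h5py.File('1234567890.h5', 'r') as f:
        print('Format version:', f.attrs['version'])
        # (Nt, Nf, Nbl, 2) float32 array of real/imag visibility components
        raw_vis = f['Data/correlator_data'][:]
        vis = raw_vis[..., 0] + 1j * raw_vis[..., 1]
        # (Nt) float64 timestamps (UT seconds since Unix epoch)
        timestamps = f['Data/timestamps'][:]
        # (Nt, Nf, Nbl) uint8 flags, with one bit per flag type
        flags = f['Markup/flags'][:]
        print(vis.shape, timestamps.shape, flags.shape)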
25 | 26 | Some datasets such as the noise_diode flags are synthesised from sensor information post capture. These base sensors could then be removed if space is a concern. 27 | 28 | A major/minor version number is included in the file. The major indicates the overall structural philosophy (this document describes version 2.x). The minor is used 29 | to identify the mandatory members of the MetaData and Markup groups included in the file. This allows addition of members (and modification of existing members) to the required list without wholesale changes to the file structure. The mandatory members are described in the following document: TBA. 30 | 31 | If used to store voltage data, both correlator_data and timestamps are omitted, as timing is synthesised on the fly. 32 | 33 | Nut - number of correlator timeslots in this observation 34 | Nt - number of averaged timeslots 35 | Nuf - number of correlator frequency channels 36 | Nf - number of averaged frequency channels 37 | Nbl - number of baselines 38 | Np - number of polarisation products 39 | Na - number of antennas in a given subarray 40 | AntennaK - first antenna in a given subarray 41 | AntennaN - last antenna in a given subarray 42 | 43 | HDF5 Format 44 | ----------- 45 | 46 | The structural format is shown below. 47 | 48 | Groups are named using CamelCase, while datasets are all lower case with underscores. 49 | Attributes are indicated next to a group in {}:: 50 | 51 | / {augment_ts} 52 | {experiment_id} 53 | {version} 54 | 55 | /Data/ {ts_of_first_timeslot} 56 | /correlator_data - (Nt,Nf,Nbl,2) array of float32 visibilities (real and imag components) 57 | /timestamps - (Nt) array of float64 timestamps (UT seconds since Unix epoch) 58 | /voltage_data - (optional) (Na, Nt, Nf) array of 8-bit voltage samples 59 | 60 | /MetaData/ 61 | /Configuration/ 62 | /Antennas/ {num_antennas, subarray_id} 63 | /AntennaK..N/ {description, delays, diameter, location, etc...} 64 | / beam_pattern 65 | / h_coupler_noise_diode_model 66 | / h_pin_noise_diode_model 67 | / v_coupler_noide_diode_model 68 | / v_pin_noise_diode_model 69 | /Correlator/ {num_channels, center_freq, channel_bw, etc...} 70 | /Observation/ {type, pi, contact, sw_build_versions, etc...} 71 | /PostProcessing/ {channel_averaging, rfi_threshold, etc...} 72 | /time_averaging - TBD detail of baseline dep time avg 73 | /Sensors/ 74 | /Antennas/ {num_antennas, subarray_id} 75 | /AntennaK..N/ 76 | /... - dataset per antenna and pedestal sensor 77 | /DBE/ 78 | /... - dataset per DBE sensor 79 | /Enviro/ 80 | /... - dataset per enviro sensor 81 | /Other/ 82 | /... - dataset per other sensor 83 | /RFE/ 84 | /... - dataset per RFE sensor 85 | /Source/ 86 | /phase_center 87 | /antenna_target - array of target sensors for each antenna 88 | 89 | /Markup/ 90 | /dropped_data - (optional) describes data dropped by receivers 91 | /flags - (Nt,Nf,Nbl) post-averaged uint8 flags - 1 bit per flag, packed 92 | /flags_description - (Nflags,3) index, name and description for each packed flag type 93 | /flags_full - (optional) (Nut,Nuf,Nbl) pre-averaged uint8 flags - 1 bit per flag, packed 94 | /labels - (optional) descriptions of intent of each observational phase (e.g. scan, slew, cal, etc..)
95 | /noise_diode - (Nt,Na) noise diode state during this averaged timeslot 96 | /noise_diode_full - (optional) (Nut,Na) noise diode state per correlator timeslot 97 | /weights - (Nt,Nf,Nbl,Nweights) weights for each sample 98 | 99 | /History/ 100 | /augment_log - Log output of augmentation process 101 | /script_log - Log output of observation script 102 | -------------------------------------------------------------------------------- /doc/tuning.rst: -------------------------------------------------------------------------------- 1 | Tuning your application 2 | ======================= 3 | It is possible to load data at high bandwidth using katdal: rates over 4 | 2.5 GB/s have been seen when loading from a local disk. However, it 5 | requires an understanding of the storage layout and choice of an 6 | appropriate access pattern. 7 | 8 | This chapter is aimed at loading :doc:`mvf_v4` data, as older versions 9 | typically contain far less data. Some of the advice is generic but some 10 | of the methods described here will not work on older data sets. 11 | 12 | Chunking 13 | -------- 14 | The most important thing to understand is that the data is split into 15 | chunks, each of which is stored as a file on disk or an object in an S3 16 | store. Retrieving any element of a chunk causes the entire chunk to be 17 | retrieved. Thus, aligning accesses to whole chunks will give the best 18 | performance, as data is not discarded. 19 | 20 | As an illustration, consider an application that has an outer loop over 21 | the baselines, and loads data for one baseline at a time. Chunks 22 | typically span all baselines, so each time one baseline is loaded, 23 | katdal will actually load the entire data set. If the application can 24 | be redesigned to fetch data for a small time range for all baselines it 25 | will perform much better. 26 | 27 | When using MVF v4, katdal uses `dask`_ to manage the chunking. After 28 | opening a data set, you can determine the chunking for a particular 29 | array by examining its ``dataset`` member: 30 | 31 | .. code:: python 32 | 33 | >>> d.vis.dataset 34 | dask.array<1556179171-sdp, shape=(38, 4096, 40), dtype=complex64, chunksize=(32, 1024, 40)> 35 | >>> d.vis.dataset.chunks 36 | ((32, 6), (1024, 1024, 1024, 1024), (40,)) 37 | 38 | .. _dask: https://docs.dask.org/ 39 | 40 | For this data set, it will be optimal to load visibilities in 32 × 1024 41 | × 40 element pieces. 42 | 43 | Note that the chunking scheme may be different for visibilities, flags 44 | and weights. 45 | 46 | Joint loading 47 | ------------- 48 | The values returned by katdal are not the raw values stored in the 49 | chunks: there is processing involved, such as application of calibration 50 | solutions and flagging of missing data. Some of this processing is 51 | common between visibilities, flags and weights. It's thus more efficient 52 | to load the visibilities, flags and weights as a single operation rather 53 | than as three separate operations. 54 | 55 | This can be achieved using :meth:`.DaskLazyIndexer.get`. For example, 56 | replace 57 | 58 | .. code:: python 59 | 60 | vis = d.vis[idx] 61 | flags = d.flags[idx] 62 | weights = d.weights[idx] 63 | 64 | with 65 | 66 | .. code:: python 67 | 68 | vis, flags, weights = DaskLazyIndexer.get([d.vis, d.flags, d.weights], idx) 69 | 70 | Parallelism 71 | ----------- 72 | Dask uses multiple worker threads. It defaults to one thread per CPU 73 | core, but for I/O-bound tasks this is often not enough to achieve 74 | maximum throughput. Refer to the dask `scheduler`_ documentation for 75 | details of how to configure the number of workers.
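For example, a load script could request more workers than CPU cores before touching the data. The worker count below is an arbitrary illustration; ``mvf_read_benchmark.py`` in this repository sets the same configuration value via its ``--workers`` option.

.. code:: python

    import dask

    # Chunk loading is largely I/O-bound, so oversubscribe the CPU cores
    dask.config.set(num_workers=16)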
Refer to the dask `scheduler`_ documentation for 75 | details of how to configure the number of workers. 76 | 77 | .. _scheduler: https://docs.dask.org/en/latest/scheduling.html 78 | 79 | More workers only helps if there is enough parallel work to be 80 | performed, which means there need to be at least as many chunks loaded 81 | at a time as there are workers (and preferably many more). It's thus 82 | advisable to load as much data at a time as possible without running out 83 | of memory. 84 | 85 | Selection 86 | --------- 87 | Using :meth:`DataSet.select` is relatively expensive. For the best 88 | performance, it should only be used occasionally (for example, to filter 89 | out unwanted data at the start), with array access notation or 90 | :meth:`.DaskLazyIndexer.get` used to break up large data sets into 91 | manageable pieces. 92 | 93 | Dask also performs better with selections that select contiguous data. 94 | You might be able to get a little more performance by using 95 | :meth:`.DataSet.scans` (which will yield a series of contiguous 96 | selections) rather than using :meth:`~.DataSet.select` with 97 | ``scans='track'``. 98 | 99 | When using MVF v4 one can also pass a `preselect` parameter to :meth:`katdal.open` 100 | which allows slicing a subset of the data (time and frequency). It is more 101 | limited than :meth:`DataSet.select` (it can only select contiguous ranges, and 102 | can only specify the selection in terms of channels and dumps), but if a script 103 | is only interested in working on a subset of data, this method can be more 104 | efficient and uses less memory. 105 | 106 | Network versus local disk 107 | ------------------------- 108 | When loading data from the network, latency is typically higher, and so 109 | more workers will be needed to achieve peak throughput. Network access 110 | is also more sensitive to access patterns that are mis-aligned with 111 | chunks, because chunks are not cached in memory by the operation system 112 | and hence must be re-fetched over the network if they are accessed 113 | again. 114 | 115 | Benchmarking 116 | ------------ 117 | To assist with testing out the effects of changing these tuning 118 | parameters, the katdal source code includes a script called 119 | ``mvf_read_benchmark.py`` that allows a data set to be loaded in 120 | various ways and reports the average throughput. The command-line 121 | options are somewhat limited so you may need to edit it yourself, for 122 | example, to add a custom selection. 123 | -------------------------------------------------------------------------------- /katdal/test/test_concatdata.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2020-2022, National Research Foundation (SARAO) 3 | # 4 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 5 | # this file except in compliance with the License. You may obtain a copy 6 | # of the License at 7 | # 8 | # https://opensource.org/licenses/BSD-3-Clause 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | ################################################################################ 16 | 17 | """Tests for :py:mod:`katdal.concatdata`.""" 18 | 19 | import numpy as np 20 | import pytest 21 | 22 | from katdal.categorical import CategoricalData 23 | from katdal.concatdata import ConcatenatedSensorCache 24 | from katdal.sensordata import SensorCache, SimpleSensorGetter 25 | 26 | 27 | class TestConcatenatedSensorCache: 28 | @staticmethod 29 | def _make_cache(timestamps, sensors): 30 | cache_data = {} 31 | for name, ts, values in sensors: 32 | sd = SimpleSensorGetter(name, np.asarray(ts), np.asarray(values)) 33 | cache_data[name] = sd 34 | return SensorCache(cache_data, timestamps, 2.0) 35 | 36 | def setup_method(self): 37 | self.timestamps1 = np.arange(100.0, 110.0, 2.0) 38 | self.timestamps2 = np.arange(1000.0, 1006.0, 2.0) 39 | sensors1 = [ 40 | ('foo', [104.0, 107.0], [3.0, 6.0]), 41 | ('cat', [102.0, 110.0], ['hello', 'world']), 42 | ('int_missing', [105.0], [42]) 43 | ] 44 | sensors2 = [ 45 | ('foo', [995.0, 1010.0], [10.0, 25.0]), 46 | ('cat', [1000.0, 1002.0, 1004.0, 1006.0], ['world', 'hello', 'again', 'hello']), 47 | ('float_missing', [995.0], [3.0]) 48 | ] 49 | self.cache1 = self._make_cache(self.timestamps1, sensors1) 50 | self.cache2 = self._make_cache(self.timestamps2, sensors2) 51 | self.keep = np.array([True, False, True, False, False, True, False, True]) 52 | self.cache = ConcatenatedSensorCache([self.cache1, self.cache2], keep=self.keep) 53 | 54 | def test_timestamps(self): 55 | np.testing.assert_array_equal( 56 | self.cache.timestamps, 57 | np.concatenate([self.timestamps1, self.timestamps2]) 58 | ) 59 | 60 | def test_float(self): 61 | data = self.cache.get('foo') 62 | np.testing.assert_allclose(data, [3.0, 3, 3, 5, 6, 15, 17, 19]) 63 | 64 | def test_categorical(self): 65 | data = self.cache.get('cat') 66 | assert data.unique_values == ['hello', 'world', 'again'] 67 | H = 'hello' 68 | W = 'world' 69 | A = 'again' 70 | np.testing.assert_array_equal(data[:], [H, H, H, H, H, W, H, A]) 71 | 72 | def test_float_missing(self): 73 | data = self.cache.get('float_missing') 74 | np.testing.assert_array_equal(data, [np.nan] * 5 + [3.0] * 3) 75 | 76 | def test_int_missing(self): 77 | data = self.cache.get('int_missing') 78 | np.testing.assert_array_equal(data[:], [42] * 5 + [-1] * 3) 79 | 80 | def test_missing_select(self): 81 | data = self.cache['int_missing'] 82 | np.testing.assert_array_equal(data[:], [42, 42, -1, -1]) 83 | 84 | def test_float_select(self): 85 | data = self.cache['foo'] 86 | np.testing.assert_allclose(data, [3.0, 3, 15, 19]) 87 | 88 | def test_categorical_select(self): 89 | data = self.cache['cat'] 90 | np.testing.assert_array_equal(data, ['hello', 'hello', 'world', 'again']) 91 | 92 | def test_no_extract(self): 93 | data = self.cache.get('foo', extract=False) 94 | values = data.get() 95 | np.testing.assert_array_equal(values.timestamp, [104.0, 107.0, 995.0, 1010.0]) 96 | np.testing.assert_array_equal(values.value, [3.0, 6.0, 10.0, 25.0]) 97 | 98 | def test_no_extract_missing(self): 99 | data = self.cache.get('float_missing', extract=False) 100 | values = data.get() 101 | np.testing.assert_array_equal(values.timestamp, [995.0]) 102 | np.testing.assert_array_equal(values.value, [3.0]) 103 | 104 | def test_missing_sensor(self): 105 | with pytest.raises(KeyError): 106 | self.cache['sir_not_appearing_in_this_cache'] 107 | 108 | def test_partially_extract(self): 109 | self.cache1['foo'] 110 | data = self.cache.get('foo', extract=False) 111 | 
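# cache1 already holds 'foo' as an extracted array; the concatenated cache
# should return the same values whether or not extraction is requested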
np.testing.assert_array_equal(data, self.cache.get('foo', extract=True)) 112 | 113 | def test_setitem_categorical(self): 114 | data = CategoricalData(['x', 'y', 'x'], [0, 2, 4, 8]) 115 | self.cache['dog'] = data 116 | ans = self.cache.get('dog') 117 | assert data.unique_values == ans.unique_values 118 | np.testing.assert_array_equal(data.events, ans.events) 119 | np.testing.assert_array_equal(data.indices, ans.indices) 120 | 121 | def test_setitem_array(self): 122 | data = np.array([1.0, 2, 3, 5, 8, 13, 21, 34]) 123 | self.cache['fib'] = data 124 | ans = self.cache.get('fib') 125 | np.testing.assert_array_equal(data, ans) 126 | 127 | def test_len(self): 128 | assert len(self.cache) == 4 129 | 130 | def test_keys(self): 131 | assert sorted(self.cache.keys()) == ['cat', 'float_missing', 'foo', 'int_missing'] 132 | 133 | def test_contains(self): 134 | assert 'cat' in self.cache 135 | assert 'float_missing' in self.cache 136 | assert 'int_missing' in self.cache 137 | assert 'dog' not in self.cache 138 | assert '' not in self.cache 139 | -------------------------------------------------------------------------------- /katdal/chunkstore_npy.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2017-2018,2020-2021, National Research Foundation (SARAO) 3 | # 4 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 5 | # this file except in compliance with the License. You may obtain a copy 6 | # of the License at 7 | # 8 | # https://opensource.org/licenses/BSD-3-Clause 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | ################################################################################ 16 | 17 | """A store of chunks (i.e. N-dimensional arrays) based on NPY files.""" 18 | 19 | import contextlib 20 | import errno 21 | import mmap 22 | import os 23 | 24 | import numpy as np 25 | 26 | from .chunkstore import (BadChunk, ChunkNotFound, ChunkStore, StoreUnavailable, 27 | npy_header_and_body) 28 | 29 | 30 | def _write_chunk(filename, chunk, direct_write): 31 | if not direct_write: 32 | return np.save(filename, chunk, allow_pickle=False) 33 | header, chunk = npy_header_and_body(chunk) 34 | size = len(header) + chunk.nbytes 35 | gran = mmap.ALLOCATIONGRANULARITY 36 | aligned_size = (size + gran - 1) // gran * gran 37 | with contextlib.closing(mmap.mmap(-1, aligned_size)) as aligned: 38 | aligned.write(header) 39 | aligned.write(chunk) 40 | aligned.seek(0) 41 | fd = os.open(filename, os.O_RDWR | os.O_CREAT | os.O_TRUNC | os.O_DIRECT, 0o666) 42 | try: 43 | os.write(fd, aligned) 44 | # We had to round the size up to a page, now correct back to exact size 45 | os.ftruncate(fd, size) 46 | finally: 47 | os.close(fd) 48 | 49 | 50 | class NpyFileChunkStore(ChunkStore): 51 | """A store of chunks (i.e. N-dimensional arrays) based on NPY files. 52 | 53 | Each chunk is stored in a separate binary file in NumPy ``.npy`` format. 
54 | The filename is constructed as 55 | 56 | "//.npy" 57 | 58 | where "" is the chunk store directory specified on construction, 59 | "" is the name of the parent array of the chunk and "" is 60 | the index string of each chunk (e.g. "00001_00512"). 61 | 62 | For a description of the ``.npy`` format, see :py:mod:`numpy.lib.format` 63 | or the relevant NumPy Enhancement Proposal 64 | `here `_. 65 | 66 | Parameters 67 | ---------- 68 | path : string 69 | Top-level directory that contains NPY files of chunk store 70 | direct_write : bool 71 | If true, use ``O_DIRECT`` when writing the file. This bypasses the 72 | OS page cache, which can be useful to avoid filling it up with 73 | files that won't be read again. 74 | 75 | Raises 76 | ------ 77 | :exc:`chunkstore.StoreUnavailable` 78 | If path does not exist / is not readable 79 | :exc:`chunkstore.StoreUnavailable` 80 | If `direct_write` was requested but is not available 81 | """ 82 | 83 | def __init__(self, path, direct_write=False): 84 | super().__init__({IOError: ChunkNotFound, ValueError: ChunkNotFound}) 85 | if not os.path.isdir(path): 86 | raise StoreUnavailable(f'Directory {path!r} does not exist') 87 | self.path = path 88 | self.direct_write = direct_write 89 | if direct_write and not hasattr(os, 'O_DIRECT'): 90 | raise StoreUnavailable('direct_write requested but not supported on this OS') 91 | 92 | def get_chunk(self, array_name, slices, dtype): 93 | """See the docstring of :meth:`ChunkStore.get_chunk`.""" 94 | chunk_name, shape = self.chunk_metadata(array_name, slices, dtype=dtype) 95 | filename = os.path.join(self.path, chunk_name) + '.npy' 96 | with self._standard_errors(chunk_name): 97 | chunk = np.load(filename, allow_pickle=False) 98 | if chunk.shape != shape or chunk.dtype != dtype: 99 | raise BadChunk(f'Chunk {chunk_name!r}: NPY file dtype {chunk.dtype} and/or shape ' 100 | f'{chunk.shape} differs from expected dtype {dtype} and shape {shape}') 101 | return chunk 102 | 103 | def create_array(self, array_name): 104 | """See the docstring of :meth:`ChunkStore.create_array`.""" 105 | # Ensure any subdirectories are in place 106 | array_dir = os.path.join(self.path, array_name) 107 | try: 108 | os.makedirs(array_dir) 109 | except OSError as e: 110 | # Be happy if someone already created the path 111 | if e.errno != errno.EEXIST: 112 | raise 113 | 114 | def put_chunk(self, array_name, slices, chunk): 115 | """See the docstring of :meth:`ChunkStore.put_chunk`.""" 116 | chunk_name, _ = self.chunk_metadata(array_name, slices, chunk=chunk) 117 | base_filename = os.path.join(self.path, chunk_name) 118 | with self._standard_errors(chunk_name): 119 | # Rename the file when done writing to make put_chunk() atomic 120 | temp_filename = base_filename + '.writing.npy' 121 | _write_chunk(temp_filename, chunk, self.direct_write) 122 | os.rename(temp_filename, base_filename + '.npy') 123 | 124 | def mark_complete(self, array_name): 125 | """See the docstring of :meth:`ChunkStore.mark_complete`.""" 126 | self.create_array(array_name) 127 | touch_file = os.path.join(self.path, array_name, 'complete') 128 | with open(touch_file, 'a'): 129 | os.utime(touch_file, None) 130 | 131 | def is_complete(self, array_name): 132 | """See the docstring of :meth:`ChunkStore.is_complete`.""" 133 | touch_file = os.path.join(self.path, array_name, 'complete') 134 | return os.path.isfile(touch_file) 135 | 136 | get_chunk.__doc__ = ChunkStore.get_chunk.__doc__ 137 | put_chunk.__doc__ = ChunkStore.put_chunk.__doc__ 138 | mark_complete.__doc__ = 
ChunkStore.mark_complete.__doc__ 139 | is_complete.__doc__ = ChunkStore.is_complete.__doc__ 140 | -------------------------------------------------------------------------------- /doc/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Configuration file for the Sphinx documentation builder. 4 | # 5 | # This file does only contain a selection of the most common options. For a 6 | # full list see the documentation: 7 | # http://www.sphinx-doc.org/en/master/config 8 | 9 | # -- Path setup -------------------------------------------------------------- 10 | 11 | # If extensions (or modules to document with autodoc) are in another directory, 12 | # add these directories to sys.path here. If the directory is relative to the 13 | # documentation root, use os.path.abspath to make it absolute, like shown here. 14 | # 15 | import os 16 | import sys 17 | 18 | sys.path.insert(0, os.path.abspath('..')) 19 | import katdal # noqa: E402 20 | 21 | # -- Project information ----------------------------------------------------- 22 | 23 | project = 'katdal' 24 | copyright = '2019, South African Radio Astronomy Observatory' 25 | author = 'South African Radio Astronomy Observatory' 26 | 27 | # The short X.Y version 28 | version = '.'.join(katdal.__version__.split('.')[:2]) 29 | # The full version, including alpha/beta/rc tags 30 | release = katdal.__version__ 31 | 32 | 33 | # -- General configuration --------------------------------------------------- 34 | 35 | # If your documentation needs a minimal Sphinx version, state it here. 36 | # 37 | # needs_sphinx = '1.0' 38 | 39 | # Add any Sphinx extension module names here, as strings. They can be 40 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 41 | # ones. 42 | extensions = [ 43 | 'sphinx.ext.autodoc', 44 | 'sphinx.ext.napoleon', 45 | 'sphinx.ext.mathjax', 46 | 'sphinx.ext.intersphinx' 47 | ] 48 | 49 | # Add any paths that contain templates here, relative to this directory. 50 | templates_path = ['_templates'] 51 | 52 | # The suffix(es) of source filenames. 53 | # You can specify multiple suffix as a list of string: 54 | # 55 | # source_suffix = ['.rst', '.md'] 56 | source_suffix = '.rst' 57 | 58 | # The master toctree document. 59 | master_doc = 'index' 60 | 61 | # The language for content autogenerated by Sphinx. Refer to documentation 62 | # for a list of supported languages. 63 | # 64 | # This is also used if you do content translation via gettext catalogs. 65 | # Usually you set "language" from the command line for these cases. 66 | language = None 67 | 68 | # List of patterns, relative to source directory, that match files and 69 | # directories to ignore when looking for source files. 70 | # This pattern also affects html_static_path and html_extra_path. 71 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 72 | 73 | # The name of the Pygments (syntax highlighting) style to use. 74 | pygments_style = None 75 | 76 | autodoc_member_order = 'bysource' 77 | 78 | intersphinx_mapping = { 79 | 'katsdptelstate': ('https://katsdptelstate.readthedocs.io/en/latest', None), 80 | 'katpoint': ('https://katpoint.readthedocs.io/en/latest', None) 81 | } 82 | 83 | 84 | # -- Options for HTML output ------------------------------------------------- 85 | 86 | # The theme to use for HTML and HTML Help pages. See the documentation for 87 | # a list of builtin themes. 
88 | # 89 | html_theme = 'sphinx_rtd_theme' 90 | 91 | # Theme options are theme-specific and customize the look and feel of a theme 92 | # further. For a list of options available for each theme, see the 93 | # documentation. 94 | # 95 | # html_theme_options = {} 96 | 97 | # Add any paths that contain custom static files (such as style sheets) here, 98 | # relative to this directory. They are copied after the builtin static files, 99 | # so a file named "default.css" will overwrite the builtin "default.css". 100 | html_static_path = ['_static'] 101 | 102 | # Custom sidebar templates, must be a dictionary that maps document names 103 | # to template names. 104 | # 105 | # The default sidebars (for documents that don't match any pattern) are 106 | # defined by theme itself. Builtin themes are using these templates by 107 | # default: ``['localtoc.html', 'relations.html', 'sourcelink.html', 108 | # 'searchbox.html']``. 109 | # 110 | # html_sidebars = {} 111 | 112 | 113 | # -- Options for HTMLHelp output --------------------------------------------- 114 | 115 | # Output file base name for HTML help builder. 116 | htmlhelp_basename = 'katdaldoc' 117 | 118 | 119 | # -- Options for LaTeX output ------------------------------------------------ 120 | 121 | latex_elements = { 122 | # The paper size ('letterpaper' or 'a4paper'). 123 | # 124 | # 'papersize': 'letterpaper', 125 | 126 | # The font size ('10pt', '11pt' or '12pt'). 127 | # 128 | # 'pointsize': '10pt', 129 | 130 | # Additional stuff for the LaTeX preamble. 131 | # 132 | # 'preamble': '', 133 | 134 | # Latex figure (float) alignment 135 | # 136 | # 'figure_align': 'htbp', 137 | } 138 | 139 | # Grouping the document tree into LaTeX files. List of tuples 140 | # (source start file, target name, title, 141 | # author, documentclass [howto, manual, or own class]). 142 | latex_documents = [ 143 | (master_doc, 'katdal.tex', 'katdal Documentation', 144 | 'Ludwig Schwardt', 'manual'), 145 | ] 146 | 147 | 148 | # -- Options for manual page output ------------------------------------------ 149 | 150 | # One entry per manual page. List of tuples 151 | # (source start file, name, description, authors, manual section). 152 | man_pages = [ 153 | (master_doc, 'katdal', 'katdal Documentation', 154 | [author], 1) 155 | ] 156 | 157 | 158 | # -- Options for Texinfo output ---------------------------------------------- 159 | 160 | # Grouping the document tree into Texinfo files. List of tuples 161 | # (source start file, target name, title, author, 162 | # dir menu entry, description, category) 163 | texinfo_documents = [ 164 | (master_doc, 'katdal', 'katdal Documentation', 165 | author, 'katdal', 'One line description of project.', 166 | 'Miscellaneous'), 167 | ] 168 | 169 | 170 | # -- Options for Epub output ------------------------------------------------- 171 | 172 | # Bibliographic Dublin Core info. 173 | epub_title = project 174 | 175 | # The unique identifier of the text. This can be a ISBN number 176 | # or the project homepage. 177 | # 178 | # epub_identifier = '' 179 | 180 | # A unique identification for the text. 181 | # 182 | # epub_uid = '' 183 | 184 | # A list of files that should not be packed into the epub file. 
185 | epub_exclude_files = ['search.html'] 186 | 187 | 188 | # -- Extension configuration ------------------------------------------------- 189 | -------------------------------------------------------------------------------- /katdal/__init__.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2011-2021, National Research Foundation (SARAO) 3 | # 4 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 5 | # this file except in compliance with the License. You may obtain a copy 6 | # of the License at 7 | # 8 | # https://opensource.org/licenses/BSD-3-Clause 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | ################################################################################ 16 | 17 | """Data access library for data sets in the MeerKAT Visibility Format (MVF).""" 18 | 19 | import logging as _logging 20 | import urllib.parse 21 | 22 | from .concatdata import ConcatenatedDataSet 23 | from .dataset import DataSet, WrongVersion # noqa: F401 24 | from .datasources import open_data_source 25 | from .h5datav1 import H5DataV1 26 | from .h5datav2 import H5DataV2 27 | from .h5datav3 import H5DataV3 28 | from .lazy_indexer import LazyTransform, dask_getitem # noqa: F401 29 | from .spectral_window import SpectralWindow # noqa: F401 30 | from .visdatav4 import VisibilityDataV4 31 | 32 | 33 | # Setup library logger and add a print-like handler used when no logging is configured 34 | class _NoConfigFilter(_logging.Filter): 35 | """Filter which only allows event if top-level logging is not configured.""" 36 | 37 | def filter(self, record): 38 | return 1 if not _logging.root.handlers else 0 39 | 40 | 41 | _no_config_handler = _logging.StreamHandler() 42 | _no_config_handler.setFormatter(_logging.Formatter(_logging.BASIC_FORMAT)) 43 | _no_config_handler.addFilter(_NoConfigFilter()) 44 | logger = _logging.getLogger(__name__) 45 | logger.addHandler(_no_config_handler) 46 | 47 | # BEGIN VERSION CHECK 48 | # Get package version when locally imported from repo or via -e develop install 49 | try: 50 | import katversion as _katversion 51 | except ImportError: 52 | import time as _time 53 | __version__ = "0.0+unknown.{}".format(_time.strftime('%Y%m%d%H%M')) 54 | else: 55 | __version__ = _katversion.get_version(__path__[0]) 56 | # END VERSION CHECK 57 | 58 | # ----------------------------------------------------------------------------- 59 | # -- Top-level functions passed on to the appropriate format handler 60 | # ----------------------------------------------------------------------------- 61 | 62 | formats = [H5DataV3, H5DataV2, H5DataV1] 63 | 64 | 65 | def _file_action(action, filename, *args, **kwargs): 66 | """Perform action on data file using the appropriate format class. 
67 | 68 | Parameters 69 | ---------- 70 | action : string 71 | Name of method to call on format class 72 | filename : string 73 | Data file name 74 | args, kwargs : extra parameters to method (optional) 75 | 76 | Returns 77 | ------- 78 | result : object 79 | Result of action 80 | 81 | """ 82 | for format in formats: 83 | try: 84 | result = getattr(format, action)(filename, *args, **kwargs) 85 | break 86 | except WrongVersion: 87 | continue 88 | else: 89 | raise WrongVersion(f"File '{filename}' has unknown data file format or version") 90 | return result 91 | 92 | 93 | def open(filename, ref_ant='', time_offset=0.0, **kwargs): 94 | """Open data file(s) with loader of the appropriate version. 95 | 96 | Parameters 97 | ---------- 98 | filename : string or sequence of strings 99 | Data file name or list of file names 100 | ref_ant : string, optional 101 | Name of reference antenna (default is first antenna in use) 102 | time_offset : float, optional 103 | Offset to add to all timestamps, in seconds 104 | kwargs : dict, optional 105 | Extra keyword arguments are passed on to underlying accessor class: 106 | 107 | mode (string, optional) 108 | [H5DataV*] File opening mode (e.g. 'r+' to open file in write mode) 109 | quicklook (bool) 110 | [H5DataV2] True if synthesised timestamps should be used to 111 | partition data set even if real timestamps are irregular, thereby 112 | avoiding the slow loading of real timestamps at the cost of 113 | slightly inaccurate label borders 114 | 115 | See the documentation of :class:`VisibilityDataV4` for the keywords 116 | it accepts. 117 | 118 | Returns 119 | ------- 120 | data : :class:`DataSet` object 121 | Object providing :class:`DataSet` interface to file(s) 122 | 123 | """ 124 | if isinstance(filename, str): 125 | filenames = [filename] 126 | else: 127 | unexpected = set(kwargs.get('preselect', {})) - {'channels'} 128 | if unexpected: 129 | raise IndexError(f'Unsupported preselect key(s) for ConcatenatedDataSet: {unexpected}') 130 | filenames = filename 131 | datasets = [] 132 | for f in filenames: 133 | # V4 RDB file or live telstate with optional URL-style query string 134 | parsed = urllib.parse.urlsplit(f) 135 | if parsed.path.endswith('.rdb') or parsed.scheme != '': 136 | dataset = VisibilityDataV4(open_data_source(f, **kwargs), 137 | ref_ant, time_offset, **kwargs) 138 | else: 139 | if 'preselect' in kwargs: 140 | raise TypeError('preselect is not supported for this format') 141 | dataset = _file_action('__call__', f, ref_ant, time_offset, **kwargs) 142 | datasets.append(dataset) 143 | return datasets[0] if isinstance(filename, str) else ConcatenatedDataSet(datasets) 144 | 145 | 146 | def get_ants(filename): 147 | """Quick look function to get the list of antennas in a data file. 148 | 149 | Parameters 150 | ---------- 151 | filename : string 152 | Data file name 153 | 154 | Returns 155 | ------- 156 | antennas : list of :class:`katpoint.Antenna` objects 157 | 158 | """ 159 | return _file_action('_get_ants', filename) 160 | 161 | 162 | def get_targets(filename): 163 | """Quick look function to get the list of targets in a data file. 
164 | 165 | Parameters 166 | ---------- 167 | filename : string 168 | Data file name 169 | 170 | Returns 171 | ------- 172 | targets : :class:`katpoint.Catalogue` object 173 | All targets in file 174 | 175 | """ 176 | return _file_action('_get_targets', filename) 177 | -------------------------------------------------------------------------------- /scripts/spectrogram_plot_example.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python 2 | 3 | ################################################################################ 4 | # Copyright (c) 2012-2016,2018,2021, National Research Foundation (SARAO) 5 | # 6 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 7 | # this file except in compliance with the License. You may obtain a copy 8 | # of the License at 9 | # 10 | # https://opensource.org/licenses/BSD-3-Clause 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | ################################################################################ 18 | 19 | # 20 | # Plot spectrogram of entire dataset in an efficient way that only loads 21 | # enough data that will fit onto the screen. 22 | # 23 | # Ludwig Schwardt 24 | # 26 June 2012 25 | # 26 | 27 | import optparse 28 | import time 29 | 30 | import matplotlib.pyplot as plt 31 | import numpy as np 32 | 33 | import katdal 34 | 35 | 36 | class ResampledImage: 37 | """Image that only loads enough data that will fit onto screen pixels. 
38 | 39 | Parameters 40 | ---------- 41 | data : array-like, shape at least (N, M) 42 | Data object with ndarray interface 43 | extract : function, signature ``xy_data = f(data, x, y)``, optional 44 | Function used to extract 2-D image array from data object given x and 45 | y indices, using getitem interface on data by default 46 | autoscale : {False, True}, optional 47 | True if image should be renormalised after each update or zoom 48 | ax : :class:`matplotlib.axes.Axes` object or None, optional 49 | Axes onto which to plot image 50 | kwargs : dict, optional 51 | Additional parameters are passed on to underlying imshow 52 | 53 | """ 54 | def __init__(self, data, extract=None, autoscale=False, ax=None, **kwargs): 55 | self.data = data 56 | self.extract = extract if extract is not None else lambda d, x, y: d[y, x] 57 | self.autoscale = autoscale 58 | self.ax = ax if ax is not None else plt.gca() 59 | kwargs.update({'aspect': 'auto', 'origin': 'lower', 'interpolation': 'nearest', 60 | 'extent': (-0.5, data.shape[1] - 0.5, -0.5, data.shape[0] - 0.5)}) 61 | self.image = self.ax.imshow([[0]], **kwargs) 62 | self.update() 63 | # Connect to all events that change the data limits or the number of pixels in image 64 | self.ax.callbacks.connect('xlim_changed', self.update) 65 | self.ax.callbacks.connect('ylim_changed', self.update) 66 | self.ax.figure.canvas.mpl_connect('resize_event', self.update) 67 | 68 | def update(self, param=None): 69 | """Load required data and update image.""" 70 | data_limits, view_limits = self.ax.dataLim, self.ax.viewLim 71 | display_limits = self.ax.get_window_extent() 72 | # print "data =", data_limits.extents[[0, 2, 1, 3]].tolist() 73 | # print "view =", view_limits.extents[[0, 2, 1, 3]].tolist() 74 | # print "display =", display_limits.extents[[0, 2, 1, 3]].tolist() 75 | data_scale_x = self.data.shape[1] / data_limits.width 76 | data_scale_y = self.data.shape[0] / data_limits.height 77 | x_from = max(int(np.floor(data_scale_x * (view_limits.x0 - data_limits.x0))), 0) 78 | y_from = max(int(np.floor(data_scale_y * (view_limits.y0 - data_limits.y0))), 0) 79 | x_to = max(int(np.ceil(data_scale_x * (view_limits.x1 - data_limits.x0))), x_from + 1) 80 | y_to = max(int(np.ceil(data_scale_y * (view_limits.y1 - data_limits.y0))), y_from + 1) 81 | x_step = max(int(view_limits.width / display_limits.width), 1) 82 | y_step = max(int(view_limits.height / display_limits.height), 1) 83 | # print "range = %d:%d:%d, %d:%d:%d" % (x_from, x_to, x_step, y_from, y_to, y_step) 84 | x_slice = slice(x_from, x_to, x_step) 85 | y_slice = slice(y_from, y_to, y_step) 86 | x_inds = list(range(*x_slice.indices(self.data.shape[1]))) 87 | y_inds = list(range(*y_slice.indices(self.data.shape[0]))) 88 | im_left = x_inds[0] / data_scale_x + data_limits.x0 89 | im_right = (x_inds[-1] + 1) / data_scale_x + data_limits.x0 90 | im_bottom = y_inds[0] / data_scale_y + data_limits.y0 91 | im_top = (y_inds[-1] + 1) / data_scale_y + data_limits.y0 92 | # print "im =", (im_left, im_right, im_bottom, im_top) 93 | before = time.time() 94 | # Load and update image data and make it fill the view 95 | data = self.extract(self.data, x_slice, y_slice) 96 | extract_time = time.time() - before 97 | size_bytes = data.size * np.dtype('complex64').itemsize 98 | print("Loaded %d visibilities - x %s y %s - in %.2f seconds (%g MB/s)" % 99 | (data.size, x_slice, y_slice, extract_time, size_bytes * 1e-6 / extract_time)) 100 | self.image.set_data(data) 101 | self.image._extent = (im_left, im_right, im_bottom, im_top) 102 | if 
self.autoscale: 103 | self.image.autoscale() 104 | else: 105 | # Keep the same normalisation as soon as the extreme data values are known 106 | self.image.norm.vmin = min(self.image.norm.vmin, data.min()) 107 | self.image.norm.vmax = max(self.image.norm.vmax, data.max()) 108 | self.ax.figure.canvas.draw_idle() 109 | 110 | 111 | parser = optparse.OptionParser(usage="%prog [options] [ ...]", 112 | description='Waterfall plot from HDF5 data file(s)') 113 | parser.add_option('-a', '--ant', 114 | help="Antenna to plot (e.g. 'ant1'), default is first antenna") 115 | parser.add_option('-p', '--pol', type='choice', choices=['H', 'V'], default='H', 116 | help="Polarisation term to use ('H' or 'V'), default is %default") 117 | parser.add_option('-s', '--autoscale', action='store_true', default=False, 118 | help="Renormalise colour scale after each zoom or resize, default is %default") 119 | (opts, args) = parser.parse_args() 120 | 121 | if len(args) == 0: 122 | print('Please specify at least one HDF5 file to load') 123 | else: 124 | d = katdal.open(args) 125 | ant = opts.ant if opts.ant is not None else d.ref_ant 126 | d.select(ants=ant, pol=opts.pol) 127 | 128 | plt.figure(1) 129 | plt.clf() 130 | ax = plt.subplot(1, 1, 1) 131 | im = ResampledImage(d.vis, extract=lambda data, x, y: np.abs(data[y, x, 0]), 132 | autoscale=opts.autoscale, ax=ax) 133 | ax.set_xlabel('Channel index') 134 | ax.set_ylabel('Dump index') 135 | ax.set_title(f'Spectrogram {d.name} {ant} {opts.pol}') 136 | plt.show() 137 | -------------------------------------------------------------------------------- /katdal/test/s3_utils.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2017,2020-2021,2023, National Research Foundation (SARAO) 3 | # 4 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 5 | # this file except in compliance with the License. You may obtain a copy 6 | # of the License at 7 | # 8 | # https://opensource.org/licenses/BSD-3-Clause 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | ################################################################################ 16 | 17 | """Test utilities for code that interacts with the S3 API. 18 | 19 | It provides a class for managing running an external S3 server (currently 20 | `MinIO`_). 21 | 22 | Versions of minio prior to 2018-08-25T01:56:38Z contain a `race condition`_ 23 | that can cause it to crash when queried at the wrong point during startup, so 24 | should not be used. 25 | 26 | .. _minio: https://github.com/minio/minio 27 | .. 
_race condition: https://github.com/minio/minio/issues/6324 28 | """ 29 | 30 | import contextlib 31 | import os 32 | import pathlib 33 | import subprocess 34 | import time 35 | import urllib.parse 36 | 37 | import requests 38 | 39 | 40 | class MissingProgram(RuntimeError): 41 | """An required executable program was not found.""" 42 | 43 | 44 | class ProgramFailed(RuntimeError): 45 | """An external program did not run successfully.""" 46 | 47 | 48 | class S3User: 49 | """Credentials for an S3 user.""" 50 | 51 | def __init__(self, access_key: str, secret_key: str) -> None: 52 | self.access_key = access_key 53 | self.secret_key = secret_key 54 | 55 | 56 | class S3Server: 57 | """Run and manage an external program to run an S3 server. 58 | 59 | This can be used as a context manager, to shut down the server when 60 | finished. 61 | 62 | Parameters 63 | ---------- 64 | host 65 | Host to bind to 66 | port 67 | Port to bind to 68 | path 69 | Directory in which objects and config will be stored. 70 | user 71 | Credentials for the default admin user. 72 | 73 | Attributes 74 | ---------- 75 | host 76 | Hostname for connecting to the server 77 | port 78 | Port for connecting to the server 79 | url 80 | Base URL for the server 81 | auth_url 82 | URL with the access_key and secret_key baked in 83 | path 84 | Path given to the constructor 85 | user 86 | User given to the constructor 87 | 88 | Raises 89 | ------ 90 | MissingProgram 91 | if the ``minio`` binary was not found. 92 | ProgramFailed 93 | if minio started but failed before it became healthy 94 | """ 95 | 96 | def __init__(self, host: str, port: int, path: pathlib.Path, user: S3User) -> None: 97 | self.host = host 98 | self.port = port 99 | self.path = path 100 | self.user = user 101 | self.url = f'http://{self.host}:{self.port}' 102 | self.auth_url = f'http://{user.access_key}:{user.secret_key}@{self.host}:{self.port}' 103 | self._process = None 104 | 105 | env = os.environ.copy() 106 | env['MINIO_BROWSER'] = 'off' 107 | env['MINIO_ROOT_USER'] = self.user.access_key 108 | env['MINIO_ROOT_PASSWORD'] = self.user.secret_key 109 | try: 110 | self._process = subprocess.Popen( 111 | [ 112 | 'minio', 'server', '--quiet', 113 | '--address', f'{self.host}:{self.port}', 114 | '-C', str(self.path / 'config'), 115 | str(self.path / 'data'), 116 | ], 117 | stdout=subprocess.DEVNULL, 118 | env=env 119 | ) 120 | except OSError as exc: 121 | raise MissingProgram(f'Could not run minio: {exc}') from exc 122 | 123 | with contextlib.ExitStack() as exit_stack: 124 | exit_stack.callback(self._process.terminate) 125 | health_url = urllib.parse.urljoin(self.url, '/minio/health/ready') 126 | for i in range(100): 127 | try: 128 | with requests.get(health_url) as resp: 129 | if ( 130 | # Server is up... 131 | resp.ok 132 | # and initialised, therefore ready for requests 133 | and resp.headers.get('X-Minio-Server-Status') != 'offline' 134 | ): 135 | break 136 | except requests.ConnectionError: 137 | pass 138 | if self._process.poll() is not None: 139 | raise ProgramFailed('Minio died before it became healthy') 140 | time.sleep(0.1) 141 | else: 142 | raise ProgramFailed('Timed out waiting for minio to be ready') 143 | exit_stack.pop_all() 144 | 145 | def wipe(self) -> None: 146 | """Remove all buckets and objects, but leave the server running. 147 | 148 | See :meth:`mc` for information about exceptions. 
149 | """ 150 | self.mc('rb', '--force', '--dangerous', 'minio') 151 | 152 | def close(self) -> None: 153 | """Shut down the server.""" 154 | if self._process: 155 | self._process.terminate() 156 | self._process.wait() 157 | self._process = None 158 | 159 | def __enter__(self) -> 'S3Server': 160 | return self 161 | 162 | def __exit__(self, exc_type, exc_value, exc_tb) -> None: 163 | self.close() 164 | 165 | def mc(self, *args) -> None: 166 | """Run a (minio) mc subcommand against the running server. 167 | 168 | The running server has the alias ``minio``. 169 | 170 | .. note:: 171 | 172 | The credentials will be exposed in the environment. This is only 173 | intended for unit testing, and hence not with sensitive 174 | credentials. 175 | 176 | Raises 177 | ------ 178 | MissingProgram 179 | if the ``mc`` command is not found on the path 180 | ProgramFailed 181 | if the command returned a non-zero exit status. The exception 182 | message will include the stderr output. 183 | """ 184 | env = os.environ.copy() 185 | env['MC_HOST_minio'] = self.auth_url 186 | # --config-dir is set just to prevent any config set by the user 187 | # from interfering with the test. 188 | try: 189 | subprocess.run( 190 | [ 191 | 'mc', '--quiet', '--no-color', f'--config-dir={self.path}', 192 | *args 193 | ], 194 | stdout=subprocess.DEVNULL, 195 | stderr=subprocess.PIPE, 196 | env=env, 197 | encoding='utf-8', 198 | errors='replace', 199 | check=True 200 | ) 201 | except OSError as exc: 202 | raise MissingProgram(f'mc could not be run: {exc}') from exc 203 | except subprocess.CalledProcessError as exc: 204 | raise ProgramFailed(exc.stderr) from exc 205 | -------------------------------------------------------------------------------- /katdal/averager.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2011,2016-2019,2021-2023, National Research Foundation (SARAO) 3 | # 4 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 5 | # this file except in compliance with the License. You may obtain a copy 6 | # of the License at 7 | # 8 | # https://opensource.org/licenses/BSD-3-Clause 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | ################################################################################ 16 | 17 | import numba 18 | import numpy as np 19 | 20 | 21 | @numba.jit(nopython=True, parallel=True) 22 | def _average_visibilities(vis, weight, flag, timeav, chanav, flagav): 23 | # Workaround for https://github.com/numba/numba/issues/2921 24 | flag_u8 = flag.view(np.uint8) 25 | 26 | # Compute shapes 27 | n_time, n_chans, n_bl = vis.shape 28 | av_n_time = n_time // timeav 29 | av_n_chans = n_chans // chanav 30 | av_shape = (av_n_time, av_n_chans, n_bl) 31 | 32 | # Allocate output buffers 33 | av_vis = np.empty(av_shape, vis.dtype) 34 | av_weight = np.empty(av_shape, weight.dtype) 35 | av_flag = np.empty(av_shape, flag.dtype) 36 | 37 | scale = weight.dtype.type(1.0 / (timeav * chanav)) 38 | wzero = weight.dtype.type(0) # Zero constant of correct type 39 | 40 | bl_step = 128 # Want a chunk to be multiple cache lines but into L1 41 | # We put channel as the outer loop just because it's more likely than 42 | # time to get parallel speedup with prange (since the time axis is often 43 | # short e.g. 1). 44 | for av_c in numba.prange(0, av_n_chans): 45 | cstart = av_c * chanav 46 | vis_sum = np.empty(bl_step, vis.dtype) 47 | vis_weight_sum = np.empty(bl_step, vis.dtype) 48 | weight_sum = np.empty(bl_step, weight.dtype) 49 | flag_any = np.empty(bl_step, dtype=np.bool_) 50 | flag_all = np.empty(bl_step, dtype=np.bool_) 51 | for av_t in range(0, av_n_time): 52 | tstart = av_t * timeav 53 | for bstart in range(0, n_bl, bl_step): 54 | bstop = min(n_bl, bstart + bl_step) 55 | vis_sum[:] = 0 56 | vis_weight_sum[:] = 0 57 | weight_sum[:] = 0 58 | flag_any[:] = False 59 | flag_all[:] = True 60 | for t in range(tstart, tstart + timeav): 61 | for c in range(cstart, cstart + chanav): 62 | for b in range(bstop - bstart): 63 | b1 = b + bstart 64 | v = vis[t, c, b1] 65 | w = weight[t, c, b1] 66 | f = (flag_u8[t, c, b1] != 0) 67 | if f: 68 | # Don't simply use 0 here: it causes numba's type 69 | # inference to upgrade w from float32 to float64. 70 | w = wzero 71 | flag_any[b] |= f 72 | flag_all[b] &= f 73 | vis_sum[b] += v 74 | vis_weight_sum[b] += w * v 75 | weight_sum[b] += w 76 | for b in range(bstop - bstart): 77 | b1 = b + bstart 78 | w = np.float32(weight_sum[b]) 79 | # If everything is flagged/zero-weighted, use an unweighted average 80 | if not w: 81 | v = vis_sum[b] * scale 82 | else: 83 | v = vis_weight_sum[b] / w 84 | f = flag_any[b] if flagav else flag_all[b] 85 | av_vis[av_t, av_c, b1] = v 86 | av_weight[av_t, av_c, b1] = w 87 | av_flag[av_t, av_c, b1] = f 88 | return av_vis, av_weight, av_flag 89 | 90 | 91 | def average_visibilities(vis, weight, flag, timestamps, channel_freqs, timeav=10, chanav=8, flagav=False): 92 | """Average visibilities, flags and weights. 93 | 94 | Visibilities are weight-averaged using the weights in the `weight` array 95 | with flagged data set to weight zero. The averaged weights are the sum of 96 | the input weights for each average block. An average flag is retained if 97 | all of the data in an averaging block is flagged (the averaged visibility 98 | in this case is the unweighted average of the input visibilities). In cases 99 | where the averaging size in channel or time does not evenly divide the size 100 | of the input data, the remaining channels or timestamps at the end of the 101 | array after averaging are discarded. Channels are averaged first and the 102 | timestamps are second. 
An array of timestamps and frequencies corresponding
103 | to each channel is also directly averaged and returned.
104 |
105 | Parameters
106 | ----------
107 | vis: array(numtimestamps,numchannels,numbaselines) of complex64.
108 | The input visibilities to be averaged.
109 | weight: array(numtimestamps,numchannels,numbaselines) of float32.
110 | The input weights (used for weighted averaging).
111 | flag: array(numtimestamps,numchannels,numbaselines) of boolean.
112 | Input flags (flagged data have weight zero before averaging).
113 | timestamps: array(numtimestamps) of int.
114 | The timestamps (in mjd seconds) corresponding to the input data.
115 | channel_freqs: array(numchannels) of int.
116 | The frequencies (in Hz) corresponding to the input channels.
117 | timeav: int.
118 | The desired averaging size in timestamps.
119 | chanav: int.
120 | The desired averaging size in channels.
121 | flagav: bool
122 | Flag averaged data when there is a single flag in the bin if true.
123 | Only flag averaged data when all data in the bin is flagged if false.
124 |
125 | Returns
126 | -------
127 | av_vis: array(int(numtimestamps/timeav),int(numchannels/chanav)) of complex64.
128 | av_weight: array(int(numtimestamps/timeav),int(numchannels/chanav)) of float32.
129 | av_flag: array(int(numtimestamps/timeav),int(numchannels/chanav)) of boolean.
130 | av_mjd: array(int(numtimestamps/timeav)) of int.
131 | av_freq: array(int(numchannels/chanav)) of int.
132 |
133 | """
134 | # Trim data to integer multiples of the averaging factors
135 | n_time, n_chans, n_bl = vis.shape
136 | timeav = min(timeav, n_time)
137 | chanav = min(chanav, n_chans)
138 | n_time = n_time // timeav * timeav
139 | n_chans = n_chans // chanav * chanav
140 |
141 | vis = vis[:n_time, :n_chans]
142 | weight = weight[:n_time, :n_chans]
143 | flag = flag[:n_time, :n_chans]
144 | timestamps = timestamps[:n_time]
145 | channel_freqs = channel_freqs[:n_chans]
146 |
147 | # Average the data (using a numba-accelerated function)
148 | av_vis, av_weight, av_flag = \
149 | _average_visibilities(vis, weight, flag, timeav, chanav, flagav)
150 |
151 | # Average the metadata
152 | av_freq = np.mean(channel_freqs.reshape(-1, chanav), axis=-1)
153 | av_timestamps = np.mean(timestamps.reshape(-1, timeav), axis=-1)
154 |
155 | return av_vis, av_weight, av_flag, av_timestamps, av_freq
156 | --------------------------------------------------------------------------------
/scripts/mvf_copy.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python3
2 |
3 | ################################################################################
4 | # Copyright (c) 2021-2024, National Research Foundation (SARAO)
5 | #
6 | # Licensed under the BSD 3-Clause License (the "License"); you may not use
7 | # this file except in compliance with the License. You may obtain a copy
8 | # of the License at
9 | #
10 | # https://opensource.org/licenses/BSD-3-Clause
11 | #
12 | # Unless required by applicable law or agreed to in writing, software
13 | # distributed under the License is distributed on an "AS IS" BASIS,
14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | # See the License for the specific language governing permissions and
16 | # limitations under the License.
17 | ################################################################################
18 |
19 | #
20 | # Make a local copy of an MVF4 dataset, optionally filtering it.
21 | # 22 | # Ludwig Schwardt 23 | # 19 October 2021 24 | # 25 | 26 | import argparse 27 | import os 28 | from pathlib import Path, PurePosixPath 29 | from urllib.parse import urlparse 30 | 31 | import dask 32 | import dask.array as da 33 | from dask.diagnostics import ProgressBar 34 | import katsdptelstate 35 | from katsdptelstate.rdb_writer import RDBWriter 36 | import katdal 37 | from katdal.chunkstore_npy import NpyFileChunkStore 38 | from katdal.datasources import view_capture_stream 39 | from katdal.lazy_indexer import dask_getitem 40 | 41 | 42 | DESCRIPTION = """ 43 | Copy MVFv4 dataset (or a part of it) from S3/disk to disk using dask. 44 | 45 | Run the script like this: 46 | 47 | mvf_copy.py https://archive/1698676533/1698676533_sdp_l0.full.rdb?token=<> dest 48 | 49 | or: 50 | 51 | mvf_copy.py src_dir dest_dir 52 | 53 | Data will appear in three subdirectories in the specified output directory as 54 | 55 | dest/1698676533/... 56 | dest/1698676533-sdp-l0/... 57 | dest/1698676533-sdp-l1-flags/... 58 | 59 | Open the local dataset like this: 60 | 61 | d = katdal.open("dest/1698676533/1698676533_sdp_l0.full.rdb") 62 | 63 | BONUS: you can even copy just parts of the data by selecting a subset of 64 | correlation products. The --corrprods value is passed to DataSet.select(). 65 | 66 | While dask allows multiple retries while downloading chunks, it currently has 67 | no way to resume copying if the script crashes. For peace of mind, consider 68 | using the mvf_download.py script instead if you are just trying to download 69 | your dataset from the archive to disk. You are stuck with mvf_copy.py if you 70 | are copying from disk to disk or you want to cull some correlation products. 71 | 72 | Some examples: 73 | 74 | mvf_copy.py url directory --corrprods=auto 75 | mvf_copy.py url directory --corrprods=cross 76 | """ 77 | 78 | 79 | def parse_args(): 80 | """Parse script arguments.""" 81 | parser = argparse.ArgumentParser( 82 | usage='%(prog)s [-h] [--corrprods CORRPRODS] [--workers N] source dest', 83 | description=DESCRIPTION, 84 | formatter_class=argparse.RawDescriptionHelpFormatter, 85 | ) 86 | parser.add_argument('source', help='Dataset URL (or input RDB file path)') 87 | parser.add_argument('dest', type=Path, help='Output directory') 88 | parser.add_argument('--corrprods', 89 | help='Select correlation products (kwarg to ' 90 | 'katdal.DataSet.select). Keeps all corrprods by default.') 91 | parser.add_argument('--workers', type=int, default=8 * dask.system.CPU_COUNT, 92 | help='Number of dask workers for parallel I/O [%(default)s]') 93 | args = parser.parse_args() 94 | return args 95 | 96 | 97 | def extra_flag_streams(telstate, capture_block_id, stream_name): 98 | """Look for associated flag streams and return corresponding telstate views.""" 99 | # This is a simplified version of katdal.datasources._upgrade_flags 100 | telstate_extra_flags = [] 101 | for s in telstate.get('sdp_archived_streams', []): 102 | telstate_cs = view_capture_stream(telstate.root(), capture_block_id, s) 103 | if telstate_cs.get('stream_type') == 'sdp.flags' and \ 104 | stream_name in telstate_cs['src_streams']: 105 | telstate_extra_flags.append(telstate_cs) 106 | return telstate_extra_flags 107 | 108 | 109 | def stream_graphs(telstate, store, corrprod_mask, out_telstate, out_store): 110 | """Prepare Dask graphs to copy all chunked arrays of a capture stream. 111 | 112 | This returns a list of Dask graphs and also modifies `out_telstate` and 113 | `out_store`. 
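Parameters
----------
telstate : :class:`katsdptelstate.TelescopeState`
    Telescope state view of the capture stream, providing 'chunk_info'
store : :class:`katdal.chunkstore.ChunkStore`
    Chunk store containing the input arrays of the stream
corrprod_mask : :class:`numpy.ndarray` of bool
    Mask selecting the correlation products to keep
out_telstate : :class:`katsdptelstate.TelescopeState`
    Output telescope state that receives the updated 'chunk_info'
out_store : :class:`katdal.chunkstore.ChunkStore`
    Output chunk store that receives the copied (and filtered) chunks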
114 | """ 115 | out_n_baselines = corrprod_mask.sum() 116 | out_chunk_info = {} 117 | graphs = [] 118 | for array, info in telstate['chunk_info'].items(): 119 | array_name = store.join(info['prefix'], array) 120 | darray = store.get_dask_array(array_name, info['chunks'], info['dtype']) 121 | # Filter the correlation products if array has them 122 | if darray.ndim == 3: 123 | indices = (slice(None), slice(None), corrprod_mask) 124 | # Try to turn fancy indexing into slices (works for autocorrs) 125 | darray = dask_getitem(darray, indices) 126 | info['chunks'] = info['chunks'][:2] + ((out_n_baselines,),) 127 | info['shape'] = info['shape'][:2] + (out_n_baselines,) 128 | out_store.create_array(array_name) 129 | graphs.append(out_store.put_dask_array(array_name, darray)) 130 | out_chunk_info[array] = info 131 | out_telstate[telstate.prefixes[0] + 'chunk_info'] = out_chunk_info 132 | return graphs 133 | 134 | 135 | def main(): 136 | """Main routine of mvf_copy script.""" 137 | args = parse_args() 138 | 139 | d = katdal.open(args.source) 140 | # XXX Simplify this once corrprods can accept slices as advertised 141 | kwargs = {} 142 | if args.corrprods is not None: 143 | kwargs['corrprods'] = args.corrprods 144 | d.select(**kwargs) 145 | 146 | # Convenience variables 147 | cbid = d.source.capture_block_id 148 | stream = d.source.stream_name 149 | telstate = d.source.telstate 150 | # XXX Replace private member with public corrprod index member when it exists 151 | corrprod_mask = d._corrprod_keep 152 | rdb_filename = PurePosixPath(urlparse(args.source).path).name 153 | 154 | telstate_overrides = katsdptelstate.TelescopeState() 155 | # Override bls_ordering in telstate (in stream namespace) to match dataset selection 156 | telstate_overrides.view(stream)['bls_ordering'] = d.corr_products 157 | telstate_overrides.view(stream)['n_bls'] = len(d.corr_products) 158 | os.makedirs(args.dest / cbid, exist_ok=True) 159 | out_store = NpyFileChunkStore(args.dest) 160 | # Iterate over all stream views, setting up Dask graph for each chunked array 161 | graphs = [] 162 | for view in [telstate] + extra_flag_streams(telstate, cbid, stream): 163 | graphs.extend(stream_graphs(view, d.source.data.store, corrprod_mask, 164 | telstate_overrides, out_store)) 165 | 166 | # Save original telstate + overrides to new RDB file (without duplicate keys) 167 | unmodified_keys = set(telstate.keys()) - set(telstate_overrides.keys()) 168 | with RDBWriter(args.dest / cbid / rdb_filename) as rdbw: 169 | rdbw.save(telstate.backend, unmodified_keys) 170 | rdbw.save(telstate_overrides.backend) 171 | # Transfer chunks to final resting place, filtering them along the way 172 | with ProgressBar(): 173 | errors = da.compute(*graphs, num_workers=args.workers) 174 | # put_dask_array returns an array with an exception object per chunk 175 | for array_errors in errors: 176 | for chunk_error in array_errors.flat: 177 | if chunk_error is not None: 178 | raise chunk_error 179 | 180 | 181 | if __name__ == '__main__': 182 | main() 183 | -------------------------------------------------------------------------------- /katdal/spectral_window.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2011,2018,2021, National Research Foundation (SARAO) 3 | # 4 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 5 | # this file except in compliance with the License. 
You may obtain a copy 6 | # of the License at 7 | # 8 | # https://opensource.org/licenses/BSD-3-Clause 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | ################################################################################ 16 | 17 | import threading 18 | 19 | import numpy as np 20 | 21 | 22 | class SpectralWindow: 23 | """Spectral window specification. 24 | 25 | A spectral window is determined by the number of frequency channels produced 26 | by the correlator and their corresponding centre frequencies, as well as the 27 | channel width. The channels are assumed to be regularly spaced and to be the 28 | result of either lower-sideband downconversion (channel frequencies 29 | decreasing with channel index) or upper-sideband downconversion (frequencies 30 | increasing with index). For further information the receiver band and 31 | correlator product names are also available. 32 | 33 | .. warning:: 34 | 35 | Instances should be treated as immutable. Changing the attributes will 36 | lead to inconsistencies between them. 37 | 38 | Parameters 39 | ---------- 40 | centre_freq : float 41 | Centre frequency of spectral window, in Hz 42 | channel_width : float 43 | Bandwidth of each frequency channel, in Hz 44 | num_chans : int 45 | Number of frequency channels 46 | product : string, optional 47 | Name of data product / correlator mode 48 | sideband : {-1, +1}, optional 49 | Type of downconversion (-1 => lower sideband, +1 => upper sideband) 50 | band : {'L', 'UHF', 'S', 'X', 'Ku'}, optional 51 | Name of receiver / band 52 | bandwidth : float, optional 53 | The bandwidth of the whole spectral window, in Hz. If specified, 54 | `channel_width` is ignored and computed from the bandwidth. If not 55 | specified, bandwidth is computed from the channel width. Specifying 56 | this is a good idea if the channel width cannot be exactly represented 57 | in floating point. 58 | 59 | Attributes 60 | ---------- 61 | channel_freqs : array of float, shape (*F*,) 62 | Centre frequency of each frequency channel (assuming LSB mixing), in Hz 63 | """ 64 | 65 | def __init__(self, centre_freq, channel_width, num_chans, product=None, 66 | sideband=-1, band='L', bandwidth=None): 67 | if bandwidth is None: 68 | bandwidth = channel_width * num_chans 69 | else: 70 | channel_width = bandwidth / num_chans 71 | self.centre_freq = centre_freq 72 | self.channel_width = channel_width 73 | self.bandwidth = bandwidth 74 | self.num_chans = num_chans 75 | self.product = product if product is not None else '' 76 | self.sideband = sideband 77 | self.band = band 78 | # channel_freqs is computed on demand 79 | self._channel_freqs_lock = threading.Lock() 80 | self._channel_freqs = None 81 | 82 | @property 83 | def channel_freqs(self): 84 | with self._channel_freqs_lock: 85 | if self._channel_freqs is None: 86 | # Don't subtract half a channel width as channel 0 is centred on 0 Hz in baseband 87 | # We use self.bandwidth and self.num_chans to avoid rounding 88 | # errors that might accumulate if channel_width is inexact. 
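# In scalar terms, channel k is centred at
# centre_freq + sideband * (k - num_chans // 2) * (bandwidth / num_chans)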
89 | self._channel_freqs = self.centre_freq + self.sideband * self.bandwidth * ( 90 | np.arange(self.num_chans) - self.num_chans // 2) / self.num_chans 91 | return self._channel_freqs 92 | 93 | def __repr__(self): 94 | """Short human-friendly string representation of spectral window object.""" 95 | band = self.band if self.band else 'unknown' 96 | product = repr(self.product) if self.product else 'unknown' 97 | return (f"<katdal.SpectralWindow {band}-band product={product} " 98 | f"centre={self.centre_freq / 1e6:.3f} MHz bandwidth={self.bandwidth / 1e6:.3f} MHz " 99 | f"channels={self.num_chans} at {id(self):#x}>") 100 | 101 | @property 102 | def _description(self): 103 | """Complete hashable representation, used internally for comparisons.""" 104 | # Pick values that enable a sensible ordering of spectral windows 105 | # Using self.bandwidth is generally redundant but may play a role in 106 | # obscure rounding cases. 107 | return (self.centre_freq, 108 | -self.channel_width, self.num_chans, self.sideband, 109 | self.band, self.product, -self.bandwidth) 110 | 111 | def __eq__(self, other): 112 | """Equality comparison operator.""" 113 | return self._description == ( 114 | other._description if isinstance(other, SpectralWindow) else other) 115 | 116 | def __ne__(self, other): 117 | """Inequality comparison operator.""" 118 | return not (self == other) 119 | 120 | def __lt__(self, other): 121 | """Less-than comparison operator (needed for sorting and np.unique).""" 122 | return self._description < ( 123 | other._description if isinstance(other, SpectralWindow) else other) 124 | 125 | def __hash__(self): 126 | """Base hash on description tuple, just like equality operator.""" 127 | return hash(self._description) 128 | 129 | def subrange(self, first, last): 130 | """Get a new :class:`SpectralWindow` representing a subset of the channels. 131 | 132 | The returned :class:`SpectralWindow` covers the same frequencies as 133 | channels [first, last) of the original. 134 | 135 | Raises 136 | ------ 137 | IndexError 138 | If [first, last) is not a (non-empty) subinterval of the channels 139 | """ 140 | if not (0 <= first < last <= self.num_chans): 141 | raise IndexError('channel indices out of range') 142 | channel_shift = (first + last) // 2 - self.num_chans // 2 143 | num_chans = last - first 144 | # We use self.bandwidth and self.num_chans to avoid rounding errors 145 | # that might accumulate if channel_width is inexact. 146 | centre_freq = self.centre_freq \ 147 | + channel_shift * self.bandwidth * self.sideband / self.num_chans 148 | return SpectralWindow( 149 | centre_freq, self.channel_width, num_chans, 150 | self.product, self.sideband, self.band, 151 | bandwidth=self.bandwidth * num_chans / self.num_chans) 152 | 153 | def rechannelise(self, num_chans): 154 | """Get a new :class:`SpectralWindow` with a different number of channels. 155 | 156 | The returned :class:`SpectralWindow` covers the same frequencies as the 157 | original, but dividing the bandwidth into a different number of 158 | channels.
159 | """ 160 | if num_chans == self.num_chans: 161 | return self 162 | # Find the centre of the bandwidth (whereas centre_freq is the centre 163 | # of the middle channel) 164 | centre_freq = self.centre_freq 165 | if self.num_chans % 2 == 0: 166 | centre_freq -= self.sideband * 0.5 * self.channel_width 167 | channel_width = self.bandwidth / num_chans 168 | # Now convert to the centre of the new middle channel 169 | if num_chans % 2 == 0: 170 | centre_freq += self.sideband * 0.5 * channel_width 171 | return SpectralWindow( 172 | centre_freq, channel_width, num_chans, 173 | self.product, self.sideband, self.band, 174 | bandwidth=self.bandwidth) 175 | -------------------------------------------------------------------------------- /NEWS.rst: -------------------------------------------------------------------------------- 1 | History 2 | ======= 3 | 4 | 0.23 (2024-06-28) 5 | ----------------- 6 | * New `mvf_download` script (also promote `mvf_copy` and remove junk) (#380) 7 | * Select targets by their tags (#377) 8 | * Rename `np.product` to support numpy >= 2.0 and make unit tests more robust (#372) 9 | 10 | 0.22 (2023-11-28) 11 | ----------------- 12 | * Restore np.bool in Numba averaging function to prevent mvftoms crash (#370) 13 | * Replace underscores with dashes when loading old buckets from RDBs (#370) 14 | * Select multiple targets with same name to avoid dropped scans in MS (#369) 15 | * Support on-the-fly (OTF) scans in mvftoms (#366) 16 | 17 | 0.21 (2023-05-12) 18 | ----------------- 19 | * Fix support for numpy >= 1.24 and move unit tests from nose to pytest (#361) 20 | * Complete rewrite of S3ChunkStore retries for more robust archive downloads (#363) 21 | * Remove IMAGING_WEIGHT column full of zeroes from MS (#356) 22 | * Improve tests with ES256-encoded JWT tokens and more robust MinIO health check (#360) 23 | 24 | 0.20.1 (2022-04-29) 25 | ------------------- 26 | * Fix broken `dataset.vis[n]` due to DaskLazyIndexer / ChunkStore interaction (#355) 27 | 28 | 0.20 (2022-04-14) 29 | ----------------- 30 | * Fix support for dask >= 2022.01.1 in ChunkStore (#351) 31 | * Allow mvftoms to continue with partial MS after an interruption (#348) 32 | * New mvf_copy.py script that can be used to extract autocorrelations only (#349) 33 | * Treat Ceph 403 errors properly in S3ChunkStore (#352) 34 | 35 | 0.19 (2021-11-23) 36 | ----------------- 37 | * Support scans and non-radec targets like planets in mvftoms (#333) 38 | * Expose the raw flags of MVF4 datasets (#335) 39 | * Expose CBF F-engine sensors: applied delays, phases and gains (#338) 40 | * Verify that S3 bucket is not empty to detect datasets archived to tape (#344) 41 | * Populate SIGMA_SPECTRUM and redo SIGMA and WEIGHT in mvftoms (#347) 42 | * Have a sensible DataSet.name and also add a separate DataSet.url (#337) 43 | * Allow deselection of antennas using '~m0XX' (#340) 44 | * Allow nested DaskLazyIndexers (#336) 45 | * Fix mvftoms on macOS and Python 3.8+ (#339) 46 | 47 | 0.18 (2021-04-20) 48 | ----------------- 49 | * Switch to PyJWT 2 and Python 3.6, cleaning up Python 2 relics (#321 - #323) 50 | * Allow preselection of channels and dumps upon katdal.open() to save time and memory (#324) 51 | * Allow user to select fields, scans and antennas in mvftoms (#269) 52 | * Support h5py 3.0 string handling in MVF3 (#331) 53 | * Refactor requirement files to remove recursive dependencies (#329) 54 | 55 | 0.17 (2021-01-27) 56 | ----------------- 57 | * This is the last release that will support Python 3.5 58 | * Pin PyJWT version to 
1.x to avoid breaking API changes (#320) 59 | * Van Vleck correction! (autocorrelations only, though) (#316) 60 | * Expose excision, aka raw weights (#308) 61 | * Better unit testing of DataSource and S3ChunkStore in general (#319) 62 | * Support indexed telstate keys (the 1000th cut that killed Python 2) (#304) 63 | * Split out separate utility classes for Minio (#310) 64 | * Fix filtering of sensor events with invalid status (#306) 65 | 66 | 0.16 (2020-08-28) 67 | ----------------- 68 | * This is the last release that will support Python 2 (python2 maintenance branch) 69 | * New 'time_offset' sensor property that adjusts timestamps of any sensor (#307) 70 | * Fix calculation of cbf_dump_period for 'wide' / 'narrowN' instruments (#301) 71 | * Increase katstore search window by 600 seconds to find infrequent updates (#302) 72 | * Refactor SensorData to become a lazy abstract interface without caching (#292) 73 | * Refactor SensorCache to use MutableMapping (#300) 74 | * Fix rx_serial sensor use and file mode warning in MVFv3 files (#298, #299) 75 | 76 | 0.15 (2020-03-13) 77 | ----------------- 78 | * Improve S3 chunk store: check tokens, improve timeouts and retries (#272 - #277) 79 | * Retry truncated reads and 50x errors due to S3 server overload (#274) 80 | * Apply flux calibration if available (#278, #279) 81 | * Improve mvf_rechunk and mvf_read_benchmark scripts (#280, #281, #284) 82 | * Fix selection by target description (#271) 83 | * Mark Python 2 support as deprecated (#282) 84 | 85 | 0.14 (2019-10-02) 86 | ----------------- 87 | * Make L2 product by applying self-calibration corrections (#253 - #256) 88 | * Speed up uvw calculations (#252, #262) 89 | * Produce documentation on readthedocs.org (#244, #245, #247, #250, #261) 90 | * Clean up mvftoms and fix REST_FREQUENCY in SOURCE sub-table (#258) 91 | * Support katstore64 API (#265) 92 | * Improve chunk store: detect short reads, speed up handling of lost data (#259, #260) 93 | * Use katpoint 0.9 and dask 1.2.1 features (#262, #243) 94 | 95 | 0.13 (2019-05-09) 96 | ----------------- 97 | * Load RDB files straight from archive (#233, #241) 98 | * Retrieve raw sensor data from CAM katstore (#234) 99 | * Work around one-CBF-dump offset issue (#238) 100 | * Improved MS output: fixed RECEPTOR_ANGLE (#230), added WEIGHT_SPECTRUM (#231) 101 | * Various optimisations to applycal (#224), weights (#226), S3 reads (#229) 102 | * Use katsdptelstate 0.8 and dask 1.1 features (#228, #233, #240) 103 | 104 | 0.12 (2019-02-12) 105 | ----------------- 106 | * Optionally make L1 product by applying calibration corrections (#194 - #198) 107 | * Let default reference antenna in v4 datasets be "array" antenna (#202, #220) 108 | * Use katsdptelstate v0.7: generic encodings, memory backend (#196, #201, #212) 109 | * Prepare for multi-dump chunks (#213, #214, #216, #217, #219) 110 | * Allow L1 flags to be ignored (#209, #210) 111 | * Deal with deprecated dask features (#204, #215) 112 | * Remove RADOS chunk store (it's all via S3 from here on) 113 | 114 | 0.11 (2018-10-15) 115 | ----------------- 116 | * Python 3 support via python-future (finally!) 
117 | * Load L1 flags if available (#164) 118 | * Reduced memory usage (#165) and speedups (#155, #169, #170, #171, #182) 119 | * S3 chunk store now uses requests directly instead of via botocore (#166) 120 | * Let lazy indexer use oindex semantics like in the past (#180) 121 | * Fix concatenated data sets (#161) 122 | * Fix IPython / Jupyter tab completion for sensor cache (#176) 123 | 124 | 0.10.1 (2018-05-18) 125 | ------------------- 126 | * Restore NumPy 1.14 support (all data flagged otherwise) 127 | 128 | 0.10 (2018-05-17) 129 | ----------------- 130 | * Rally around the MeerKAT Visibility Format (MVF) 131 | * First optimised converter from MVF v4 to MS: mvftoms 132 | * Latest v4 fixes (synthetic timestamps, autodetection, NPY files in Ceph) 133 | * Flag and zero missing chunks 134 | * Now requires katsdptelstate (released), dask, h5py 2.3 and Python 2.7 135 | * Restore S3 unit tests and NumPy 1.11 (on Ubuntu 16.04) support 136 | 137 | 0.9.5 (2018-02-22) 138 | ------------------ 139 | * New HDF5 v3.9 file format in anticipation of v4 (affects obs_params) 140 | * Fix receiver serial numbers in recent MeerKAT data sets 141 | * Add dask support to ChunkStore 142 | * katdal.open() works on v4 RDB files 143 | 144 | 0.9 (2018-01-16) 145 | ---------------- 146 | * New ChunkStore and telstate-based parser for future v4 format 147 | * Use python-casacore (>=2.2.1) to create Measurement Sets instead of blank.ms 148 | * Read new-style noise diode sensor names, serial numbers and L0 stream metadata 149 | * Select multiple polarisations (useful for cross-pol) 150 | * Relax the "expected number of dumps" check to avoid spurious warnings 151 | * Fix NumPy 1.14 warnings 152 | 153 | 0.8 (2017-08-08) 154 | ---------------- 155 | * Fix upside-down MeerKAT images 156 | * SensorData rework to load gain solutions and access telstate efficiently 157 | * Improve mapping of sensor events onto dumps, especially for long (8 s) dumps 158 | * Fix NumPy 1.13 warnings and errors 159 | * Support UHF receivers 160 | 161 | 0.7.1 (2017-01-19) 162 | ------------------ 163 | 164 | * Fix MODEL_DATA / CORRECTED_DATA shapes in h5toms 165 | * Produce calibration solution tables in h5toms and improve error messages 166 | * Autodetect receiver band on older RTS files 167 | 168 | 0.7 (2016-12-14) 169 | ---------------- 170 | 171 | * Support weights in file and improve vis / weights / flags API 172 | * Support multiple receivers and improve centre frequency extraction 173 | * Speed up h5toms by ordering visibilities by time 174 | * Fix band selection and corr products for latest SDP (cam2telstate) 175 | * Allow explicit MS names in h5toms 176 | 177 | 0.6 (2016-09-16) 178 | ---------------- 179 | 180 | * Initial release of katdal 181 | -------------------------------------------------------------------------------- /katdal/test/test_sensordata.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2018-2022, National Research Foundation (SARAO) 3 | # 4 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 5 | # this file except in compliance with the License. 
You may obtain a copy 6 | # of the License at 7 | # 8 | # https://opensource.org/licenses/BSD-3-Clause 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | ################################################################################ 16 | 17 | """Tests for :py:mod:`katdal.sensordata`.""" 18 | 19 | from collections import OrderedDict 20 | from unittest import mock 21 | 22 | import numpy as np 23 | import pytest 24 | 25 | from katdal.sensordata import (SensorCache, SensorData, SimpleSensorGetter, 26 | remove_duplicates_and_invalid_values, 27 | telstate_decode, to_str) 28 | 29 | 30 | def assert_equal_typed(a, b): 31 | assert a == b 32 | assert type(a) == type(b) 33 | 34 | 35 | class TestToStr: 36 | def test_non_str(self): 37 | assert_equal_typed(to_str(3), 3) 38 | assert_equal_typed(to_str(None), None) 39 | 40 | def test_simple_str(self): 41 | assert_equal_typed(to_str(b'hello'), 'hello') 42 | assert_equal_typed(to_str('hello'), 'hello') 43 | 44 | def test_non_ascii(self): 45 | assert_equal_typed(to_str(b'caf\xc3\xa9'), 'café') 46 | assert_equal_typed(to_str('café'), 'café') 47 | 48 | def test_list(self): 49 | assert_equal_typed(to_str([b'hello', 'world']), ['hello', 'world']) 50 | 51 | def test_tuple(self): 52 | assert_equal_typed(to_str((b'hello', 'world')), ('hello', 'world')) 53 | 54 | def test_dict(self): 55 | assert_equal_typed(to_str({b'hello': b'world', 'abc': 'xyz'}), 56 | {'hello': 'world', 'abc': 'xyz'}) 57 | 58 | def test_custom_dict(self): 59 | assert_equal_typed(to_str(OrderedDict([(b'hello', b'world'), ('abc', 'xyz')])), 60 | OrderedDict([('hello', 'world'), ('abc', 'xyz')])) 61 | 62 | def test_numpy_str(self): 63 | a = np.array([[b'abc', b'def'], [b'ghi', b'jk']]) 64 | b = np.array([['abc', 'def'], ['ghi', 'jk']]) 65 | c = np.array([['abc', 'def'], ['ghi', 'jk']]) 66 | np.testing.assert_array_equal(to_str(a), c) 67 | np.testing.assert_array_equal(to_str(b), c) 68 | 69 | def test_numpy_object(self): 70 | a = np.array([b'abc', 'def', (b'xyz', 'uvw')], dtype='O') 71 | b = np.array(['abc', 'def', ('xyz', 'uvw')], dtype='O') 72 | np.testing.assert_array_equal(to_str(a), b) 73 | 74 | 75 | @mock.patch('katsdptelstate.encoding._allow_pickle', True) 76 | @mock.patch('katsdptelstate.encoding._warn_on_pickle', False) 77 | def test_telstate_decode(): 78 | raw = "S'1'\n." 
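    # "S'1'\n." is a protocol-0 pickle stream that unpickles to the string '1'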
79 | assert telstate_decode(raw) == '1' 80 | assert telstate_decode(raw.encode()) == '1' 81 | assert telstate_decode(np.void(raw.encode())) == '1' 82 | assert telstate_decode('l', no_decode=('l', 's', 'u', 'x')) == 'l' 83 | raw_np = ("cnumpy.core.multiarray\nscalar\np1\n(cnumpy\ndtype\np2\n(S'f8'\nI0\nI1\ntRp3\n" 84 | "(I3\nS'<'\nNNNI-1\nI-1\nI0\ntbS'8\\xdf\\xd4(\\x89\\xfc\\xef?'\ntRp4\n.") 85 | value_np = telstate_decode(raw_np) 86 | assert value_np == 0.9995771214953271 87 | assert isinstance(value_np, np.float64) 88 | 89 | 90 | class TestSensorCache: 91 | def _cache_data(self): 92 | sensors = [ 93 | ('foo', [4.0, 7.0], [3.0, 6.0]), 94 | ('cat', [2.0, 6.0], ['hello', 'world']) 95 | ] 96 | cache_data = {} 97 | for name, ts, values in sensors: 98 | sd = SimpleSensorGetter(name, np.asarray(ts), np.asarray(values)) 99 | cache_data[name] = sd 100 | return cache_data 101 | 102 | def setup_method(self): 103 | self.cache = SensorCache(self._cache_data(), timestamps=np.arange(10.), dump_period=1.0) 104 | 105 | def test_extract_float(self): 106 | data = self.cache.get('foo', extract=True) 107 | np.testing.assert_array_equal(data, [3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 5.0, 6.0, 6.0, 6.0]) 108 | 109 | def test_extract_categorical(self): 110 | data = self.cache.get('cat', extract=True) 111 | H = 'hello' 112 | W = 'world' 113 | np.testing.assert_array_equal(data[:], [H, H, H, H, H, H, W, W, W, W]) 114 | 115 | def test_alias(self): 116 | self.cache = SensorCache( 117 | self._cache_data(), timestamps=np.arange(10.), dump_period=1.0, 118 | aliases={'zz': 'at'}) 119 | # Check that adding the alias didn't lead to extraction 120 | assert isinstance(self.cache.get('czz', extract=False), SimpleSensorGetter) 121 | np.testing.assert_array_equal(self.cache['czz'], self.cache['cat']) 122 | 123 | def test_len(self): 124 | assert len(self.cache) == 2 125 | 126 | def test_keys(self): 127 | assert sorted(self.cache.keys()) == ['cat', 'foo'] 128 | 129 | def test_contains(self): 130 | assert 'cat' in self.cache 131 | assert 'foo' in self.cache 132 | assert 'dog' not in self.cache 133 | template = 'Antennas/{ant}/{param1}_{param2}' 134 | self.cache.virtual[template] = lambda x: None 135 | assert template not in self.cache 136 | 137 | def test_setitem_delitem(self): 138 | self.cache['bar'] = SimpleSensorGetter('bar', np.array([1.0]), np.array([0.0])) 139 | np.testing.assert_array_equal(self.cache['bar'], np.zeros(10)) 140 | del self.cache['bar'] 141 | assert 'bar' not in self.cache 142 | 143 | def test_sensor_time_offset(self): 144 | data = self.cache.get('foo', extract=True, time_offset=-1.0) 145 | np.testing.assert_array_equal(data, [3.0, 3.0, 3.0, 3.0, 4.0, 5.0, 6.0, 6.0, 6.0, 6.0]) 146 | 147 | def test_virtual_sensors(self): 148 | calculate_value = mock.Mock() 149 | 150 | def _check_sensor(cache, name, **kwargs): 151 | """Check that virtual sensor function gets the expected parameters.""" 152 | assert kwargs == params 153 | calculate_value() 154 | value = kwargs['param2'] 155 | cache[name] = value 156 | return value 157 | 158 | # Set up a virtual sensor and trigger it to get a value 159 | params = {'ant': 'm000', 'param1': 'one', 'param2': 'two'} 160 | template = 'Antennas/{ant}/{param1}_{param2}' 161 | self.cache.virtual[template] = _check_sensor 162 | value = self.cache.get(template.format(**params)) 163 | assert value == params['param2'] 164 | assert calculate_value.call_count == 1 165 | # Check that the value was taken from the cache the second time around 166 | value = self.cache.get(template.format(**params)) 167 | 
assert value == params['param2'] 168 | assert calculate_value.call_count == 1 169 | # If your parameter values contain underscores, don't use it as delimiter 170 | params = {'ant': 'm000', 'param1': 'one', 'param2': 'two_three'} 171 | with pytest.raises(AssertionError): 172 | self.cache.get(template.format(**params)) 173 | template = 'Antennas/{ant}/{param1}/{param2}' 174 | # The updated template has not yet been added to the cache 175 | with pytest.raises(KeyError): 176 | self.cache.get(template.format(**params)) 177 | self.cache.virtual[template] = _check_sensor 178 | value = self.cache.get(template.format(**params)) 179 | assert value == params['param2'] 180 | assert calculate_value.call_count == 2 181 | 182 | # TODO: more tests required: 183 | # - extract=False 184 | # - selection 185 | 186 | 187 | def test_sensor_cleanup(): 188 | # The first sensor event has a status of "unknown" and is therefore invalid. It happened 189 | # after the second (valid) event, though, and snuck through due to a bug (now fixed). 190 | # This mirrors the behaviour of the cbf_1_wide_input_labelling sensor in CBID 1588667937. 191 | timestamp = np.array([1.0, 0.0, 3.0, 3.0, 3.0, 3.0, 2.0]) 192 | value = np.array(['broke', 'a', 'c', 'c', 'c', 'd', 'b']) 193 | status = np.array(['unknown', 'nominal', 'nominal', 'nominal', 'warn', 'error', 'nominal']) 194 | dirty = SensorData('test', timestamp, value, status) 195 | clean = remove_duplicates_and_invalid_values(dirty) 196 | assert clean.status is None 197 | np.testing.assert_array_equal(clean.value, np.array(['a', 'b', 'd'])) 198 | np.testing.assert_array_equal(clean.timestamp, np.array([0.0, 2.0, 3.0])) 199 | -------------------------------------------------------------------------------- /katdal/ms_async.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2018-2019,2021-2023, National Research Foundation (SARAO) 3 | # 4 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 5 | # this file except in compliance with the License. You may obtain a copy 6 | # of the License at 7 | # 8 | # https://opensource.org/licenses/BSD-3-Clause 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | ################################################################################ 16 | 17 | """Write data to a Measurement Set asynchronously. 18 | 19 | This uses multiprocessing, a queue, and a circular buffer in shared memory to 20 | pass visibility data to a separate process that actually writes to the 21 | measurement set. 22 | 23 | This is largely an implementation detail of the mvftoms.py script, and might 24 | not be suited to other use cases. It is put into a separate module as a 25 | workaround for https://bugs.python.org/issue9914. 26 | """ 27 | 28 | import contextlib 29 | import multiprocessing 30 | import multiprocessing.sharedctypes 31 | from collections import namedtuple 32 | 33 | import katpoint 34 | import numpy as np 35 | 36 | from . 
import ms_extra 37 | 38 | 39 | class RawArray: 40 | """Shared memory array, in representation that can be passed through multiprocessing queue""" 41 | def __init__(self, shape, dtype): 42 | self.shape = shape 43 | self.dtype = np.dtype(dtype) 44 | size = self.dtype.itemsize * int(np.prod(shape)) 45 | self.storage = multiprocessing.sharedctypes.RawArray('c', size) 46 | 47 | def asarray(self): 48 | """Return numpy array representation""" 49 | return np.frombuffer(self.storage, self.dtype).reshape(self.shape) 50 | 51 | 52 | QueueItem = namedtuple('QueueItem', ['slot', 'target', 'time_utc', 'dump_time_width', 53 | 'field_id', 'state_id', 'scan_itr']) 54 | ScanResult = namedtuple('ScanResult', ['scan_size']) 55 | EndOfScan = namedtuple('EndOfScan', []) 56 | 57 | 58 | def ms_writer_process( 59 | work_queue, result_queue, options, antennas, cp_info, ms_name, 60 | raw_vis_data, raw_weight_data, raw_flag_data, start_row): 61 | """ 62 | Function to be run in a separate process for writing to a Measurement Set. 63 | The MS is assumed to have already been created with the appropriate 64 | columns. 65 | 66 | Incoming work is provided by submitting instances of :class:`QueueItem` 67 | to `work_queue`. The `slot` indexes the first dimension of the shared 68 | memory arrays. One may also submit an :class:`EndOfScan`, which will flush 69 | to disk and return a :class:`ScanResult` through the `result_queue` (these 70 | are not actually required to match katdal scans). 71 | 72 | To terminate the process, submit ``None`` to `work_queue`. 73 | 74 | If an exception occurs, it will be placed into `result_queue`, after which 75 | work_queue items will be fetched and discarded until ``None`` is received. 76 | When finished (either successfully or after an error), ``None`` is put in 77 | `result_queue`. 78 | 79 | Parameters 80 | ---------- 81 | work_queue : :class:`multiprocessing.Queue` 82 | Incoming work. Note that this function gives no explicit indication 83 | when it is done with a piece of work, so the queue capacity needs to 84 | be bounded to prevent data races. 85 | result_queue : :class:`multiprocessing.Queue` 86 | Information about progress (see :class:`ScanResult`) 87 | options : :class:`argparse.Namespace` 88 | Command-line options to mvftoms 89 | antennas : list of :class:`katpoint.Antenna` 90 | Antennas (used to compute UVW coordinates) 91 | cp_info : namedtuple 92 | Correlation product info (see mvftoms.py) 93 | ms_name : str 94 | Name of the Measurement Set to write 95 | raw_vis_data, raw_weight_data, raw_flag_data : :class:`RawArray` 96 | Circular buffers for the data, with shape 97 | (slots, time, baseline, channel, pol). 
98 | start_row : int 99 | Row in Measurement Set where output will start 100 | """ 101 | 102 | none_seen = False 103 | try: 104 | vis_arrays = raw_vis_data.asarray() 105 | weight_arrays = raw_weight_data.asarray() 106 | flag_arrays = raw_flag_data.asarray() 107 | scan_size = 0 108 | tdiff = vis_arrays.shape[1] 109 | nbl = vis_arrays.shape[2] 110 | 111 | main_table = ms_extra.open_table(ms_name, verbose=options.verbose) 112 | with contextlib.closing(main_table): 113 | array_centre = antennas[0].array_reference_antenna() 114 | while True: 115 | item = work_queue.get() 116 | if item is None: 117 | none_seen = True 118 | break 119 | elif isinstance(item, EndOfScan): 120 | main_table.flush() # Mostly to get realistic throughput stats 121 | result_queue.put(ScanResult(scan_size)) 122 | scan_size = 0 123 | else: 124 | # Extract the slot, and flatten time and baseline into a single axis 125 | new_shape = (-1, vis_arrays.shape[-2], vis_arrays.shape[-1]) 126 | vis_data = vis_arrays[item.slot].reshape(new_shape) 127 | weight_data = weight_arrays[item.slot].reshape(new_shape) 128 | flag_data = flag_arrays[item.slot].reshape(new_shape) 129 | 130 | # Iterate through baselines, computing UVW coordinates 131 | # for a chunk of timesteps. Note that we can't rely on the 132 | # u, v, w properties of the dataset because those 133 | # correspond to the original dumps, and we might be 134 | # averaging in time. 135 | uvw_ant = item.target.uvw(antennas, item.time_utc, array_centre) 136 | # Permute from axis, time, antenna to time, antenna, axis 137 | uvw_ant = np.transpose(uvw_ant, (1, 2, 0)) 138 | # Compute baseline UVW coordinates from per-antenna coordinates. 139 | # The sign convention matches `CASA`_, rather than the 140 | # Measurement Set `definition`_. 141 | # .. _CASA: https://casa.nrao.edu/Memos/CoordConvention.pdf 142 | # .. _definition: https://casa.nrao.edu/Memos/229.html#SECTION00064000000000000000 143 | uvw_coordinates = (np.take(uvw_ant, cp_info.ant1_index, axis=1) 144 | - np.take(uvw_ant, cp_info.ant2_index, axis=1)) 145 | # Flatten time and baseline axes together 146 | uvw_coordinates = uvw_coordinates.reshape(-1, 3) 147 | 148 | # Convert averaged UTC timestamps to MJD seconds. 
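                        # (Timestamp.to_mjd() returns days, hence the 24 * 60 * 60 factor to get seconds)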
149 | # Blow time up to (ntime*nbl,) 150 | out_mjd = np.asarray([katpoint.Timestamp(t).to_mjd() * 24 * 60 * 60 151 | for t in item.time_utc]) 152 | 153 | out_mjd = np.broadcast_to(out_mjd[:, np.newaxis], (tdiff, nbl)).ravel() 154 | 155 | # Repeat antenna indices to (ntime*nbl,) 156 | a1 = np.broadcast_to(cp_info.ant1_index[np.newaxis, :], (tdiff, nbl)).ravel() 157 | a2 = np.broadcast_to(cp_info.ant2_index[np.newaxis, :], (tdiff, nbl)).ravel() 158 | 159 | # Blow field ID up to (ntime*nbl,) 160 | big_field_id = np.full((tdiff * nbl,), item.field_id, dtype=np.int32) 161 | big_state_id = np.full((tdiff * nbl,), item.state_id, dtype=np.int32) 162 | big_scan_itr = np.full((tdiff * nbl,), item.scan_itr, dtype=np.int32) 163 | 164 | # Setup model_data and corrected_data if required 165 | model_data = None 166 | corrected_data = None 167 | 168 | if options.model_data: 169 | # unity intensity zero phase model data set, same shape as vis_data 170 | model_data = np.ones(vis_data.shape, dtype=np.complex64) 171 | # corrected data set copied from vis_data 172 | corrected_data = vis_data 173 | 174 | # Populate dictionary for write to MS 175 | main_dict = ms_extra.populate_main_dict( 176 | uvw_coordinates, vis_data, 177 | flag_data, weight_data, out_mjd, a1, a2, 178 | item.dump_time_width, big_field_id, big_state_id, 179 | big_scan_itr, model_data, corrected_data) 180 | 181 | # Write data to MS. 182 | nrows = ms_extra.write_rows(main_table, main_dict, 183 | options.verbose, start_row) 184 | start_row += nrows 185 | 186 | # Calculate bytes written from the summed arrays in the dict 187 | scan_size += sum(a.nbytes for a in main_dict.values() 188 | if isinstance(a, np.ndarray)) 189 | except Exception as error: 190 | result_queue.put(error) 191 | while not none_seen: 192 | item = work_queue.get() 193 | if item is None: 194 | none_seen = True 195 | finally: 196 | result_queue.put(None) 197 | -------------------------------------------------------------------------------- /scripts/mvf_download.py: -------------------------------------------------------------------------------- 1 | #! /usr/bin/env python3 2 | 3 | ################################################################################ 4 | # Copyright (c) 2023-2024, National Research Foundation (SARAO) 5 | # 6 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 7 | # this file except in compliance with the License. You may obtain a copy 8 | # of the License at 9 | # 10 | # https://opensource.org/licenses/BSD-3-Clause 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | ################################################################################ 18 | 19 | # 20 | # Download an MVF4 dataset using rclone. 
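# Needs rclone >= 1.56 on the PATH (the script checks this before downloading).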
21 | # 22 | # Ludwig Schwardt 23 | # 16 May 2023 24 | # 25 | 26 | import argparse 27 | import json 28 | import os 29 | import shutil 30 | import subprocess 31 | import sys 32 | from collections import defaultdict 33 | from pathlib import Path, PurePosixPath 34 | from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse 35 | 36 | import dask 37 | import katdal 38 | from katdal.chunkstore import _blocks_ravel 39 | from katdal.lazy_indexer import dask_getitem 40 | from packaging import version 41 | 42 | # This version is good for file-less config, enabling --config "" and --files-from - 43 | MINIMUM_RCLONE_VERSION = version.Version('1.56') 44 | DESCRIPTION = """ 45 | Download MVFv4 dataset (or a subset of chunks) from S3 to disk using rclone. 46 | 47 | You need rclone (https://rclone.org/downloads/) if it is not on your system. 48 | It is a single executable file that you could download to your user account. 49 | Just ensure that it is on your PATH; no need to configure it any further. 50 | 51 | Run the script like this: 52 | 53 | mvf_download.py https://archive/1698676533/1698676533_sdp_l0.full.rdb?token=<> dest 54 | 55 | Data will appear in three subdirectories in the specified output directory as 56 | 57 | dest/1698676533/... 58 | dest/1698676533-sdp-l0/... 59 | dest/1698676533-sdp-l1-flags/... 60 | 61 | Open the local dataset like this: 62 | 63 | d = katdal.open("dest/1698676533/1698676533_sdp_l0.full.rdb") 64 | 65 | If the script crashes or you terminate it, you can just run it again and 66 | it will carry on, fixing any half-downloaded chunks along the way. If it 67 | completes, you can be sure that all your data is safely downloaded. 68 | 69 | BONUS: you can even copy just parts of the data (e.g. the tracks and not the 70 | slews). This works as long as your selection picks out a subset of the chunks 71 | but leaves the chunks themselves intact. It is well suited for time-based 72 | selections. 73 | 74 | Because MeerKAT data is chunked first in time and then in frequency, but not 75 | in correlation product, this won't help to select a subset of antennas or 76 | baselines or autocorrelations, as that would require breaking up chunks into 77 | smaller chunks. For that, consider using the mvf_copy.py script instead, which 78 | is also useful if you want to copy a subset of data from disk to disk. 79 | 80 | Note that you have to pass a JSON object (which resembles a Python dict) as a 81 | string to the --select argument. The "dict" contains keyword arguments meant 82 | for the DataSet.select() method. It's important to note that the strings in 83 | the dict need double quotes (") while the entire string has to be encapsulated 84 | in single quotes ('). Some examples: 85 | 86 | mvf_download.py url directory --select='{"scans": "track"}' 87 | mvf_download.py url directory --select='{"scans": 1}' 88 | mvf_download.py url directory --select='{"scans": [0, 1, 2]}' 89 | mvf_download.py url directory --select='{"targets": "J1939-6342"}' 90 | 91 | The chunks that are not copied will appear as "lost" data in the downloaded 92 | dataset, but that is fine. If you apply the same selection, you won't see it. 
93 | """ 94 | 95 | 96 | def parse_args(args=None, namespace=None): 97 | """Parse script arguments into script-specific ones and ones meant for rclone.""" 98 | parser = argparse.ArgumentParser( 99 | usage='%(prog)s [-h] [--select JSON] [--workers N] ' 100 | 'source dest [rclone options]', 101 | description=DESCRIPTION, 102 | epilog='Any extra script options are passed to rclone.', 103 | formatter_class=argparse.RawDescriptionHelpFormatter, 104 | ) 105 | parser.add_argument('source', help='Dataset URL (including token if needed)') 106 | parser.add_argument('dest', type=Path, help='Output directory') 107 | parser.add_argument('--select', type=json.loads, default={}, 108 | help='Kwargs for katdal.DataSet.select as a JSON object') 109 | parser.add_argument('--workers', type=int, default=16, 110 | help='Number of rclone threads for parallel I/O [%(default)s]') 111 | mvf_download_args, rclone_args = parser.parse_known_args(args, namespace) 112 | rclone_args = [ 113 | '--transfers', str(mvf_download_args.workers), 114 | '--checkers', str(mvf_download_args.workers + 4) 115 | ] + rclone_args 116 | return mvf_download_args, rclone_args 117 | 118 | 119 | def chunk_names(vfw, keep): 120 | """Names of chunks covered by selection `keep` in all storage arrays in `vfw`.""" 121 | all_chunks = defaultdict(list) 122 | for array, info in vfw.chunk_info.items(): 123 | darray = vfw.store.get_dask_array( 124 | array, 125 | info['chunks'], 126 | info['dtype'], 127 | index=vfw.preselect_index, 128 | errors='dryrun', 129 | ) 130 | kept_blocks = _blocks_ravel(dask_getitem(darray, keep[:darray.ndim])) 131 | chunks = sorted(chunk.name + '.npy' for chunk in dask.compute(*kept_blocks)) 132 | all_chunks[info['prefix']].extend(chunks) 133 | return all_chunks 134 | 135 | 136 | def has_recent_rclone(): 137 | """Check that rclone is installed and has an appropriate version.""" 138 | try: 139 | result = subprocess.run(['rclone', 'version'], capture_output=True, check=True) 140 | except FileNotFoundError: 141 | print('The rclone tool was not found. Please install at least version ' 142 | f'{MINIMUM_RCLONE_VERSION} (see rclone.org) or check the path.') 143 | else: 144 | installed_version = version.parse(result.stdout.split()[1].decode()) 145 | if installed_version >= MINIMUM_RCLONE_VERSION: 146 | return True 147 | print(f'Found rclone {installed_version} but the script needs version ' 148 | f'{MINIMUM_RCLONE_VERSION}. 
See rclone.org for installation options.') 149 | return False 150 | 151 | 152 | def rclone_fit_output_to_terminal(args): 153 | """Reduce rclone output to a single line if it won't fit on terminal.""" 154 | new_args = args.copy() 155 | # Find last instances of --transfers and --checkers flags (guaranteed one of each) 156 | parser = argparse.ArgumentParser() 157 | parser.add_argument('--transfers', action='append', type=int) 158 | parser.add_argument('--checkers', action='append', type=int) 159 | n, _ = parser.parse_known_args([str(arg) for arg in new_args]) 160 | if n.transfers[-1] + n.checkers[-1] + 6 > shutil.get_terminal_size().lines: 161 | new_args.append('--stats-one-line') 162 | return new_args 163 | 164 | 165 | def rclone_copy(endpoint, bucket, dest, args, token=None, files=None): 166 | """Run 'rclone copy' with appropriate arguments.""" 167 | env = os.environ.copy() 168 | # Ignore config file as we will configure rclone with environment variables instead 169 | env['RCLONE_CONFIG'] = '' 170 | env['RCLONE_CONFIG_ARCHIVE_TYPE'] = 's3' 171 | env['RCLONE_CONFIG_ARCHIVE_ENDPOINT'] = endpoint 172 | rclone_args = [ 173 | 'rclone', 'copy', f'archive:{bucket}', dest, 174 | '--s3-provider', 'Ceph', 175 | '--fast-list', 176 | '--checksum', 177 | '--progress', 178 | ] 179 | if token: 180 | rclone_args.extend(['--header', f'Authorization: Bearer {token}']) 181 | run_kwargs = dict(check=True, env=env) 182 | if files is not None: 183 | rclone_args.extend(['--files-from', '-']) 184 | run_kwargs.update(input='\n'.join(files), text=True) 185 | # User-supplied arguments can override any of the above args 186 | rclone_args.extend(args) 187 | rclone_args = rclone_fit_output_to_terminal(rclone_args) 188 | subprocess.run(rclone_args, **run_kwargs) # pylint: disable=subprocess-run-check 189 | 190 | 191 | def main(): 192 | """Main routine of mvf_download script.""" 193 | args, rclone_args = parse_args() 194 | if not has_recent_rclone(): 195 | return False 196 | url_parts = urlparse(args.source) 197 | *_, cbid, rdb_filename = PurePosixPath(url_parts.path).parts 198 | endpoint = urlunparse((url_parts.scheme, url_parts.netloc, '', '', '', '')) 199 | token = dict(parse_qsl(url_parts.query)).get('token') 200 | meta_path = args.dest / cbid 201 | print(f"\nDownloading metadata bucket ({cbid}) to {meta_path.absolute()} ...") 202 | rclone_copy(endpoint, cbid, meta_path, rclone_args, token) 203 | 204 | query_params = {'s3_endpoint_url': endpoint} 205 | if token: 206 | query_params['token'] = token 207 | query = urlencode(query_params) 208 | rdb_path = (meta_path / rdb_filename).absolute() 209 | local_rdb = urlunparse(('file', '', str(rdb_path), '', query, '')) 210 | print(f"Opening local RDB file: {local_rdb}") 211 | d = katdal.open(local_rdb) 212 | d.select(**args.select) 213 | # Collect names of chunks covered by selection in each chunked storage array 214 | chunks = chunk_names(d.source.data, d.vis.keep) 215 | for bucket, files in chunks.items(): 216 | bucket_path = args.dest / bucket 217 | n_chunks = len(files) 218 | if not args.select: 219 | n_chunks = f'all {n_chunks}' 220 | files = None 221 | print(f"\nDownloading {n_chunks} chunks from data bucket {bucket} " 222 | f"to {bucket_path.absolute()} ...") 223 | rclone_copy(endpoint, bucket, bucket_path, rclone_args, token, files) 224 | return True 225 | 226 | 227 | if __name__ == '__main__': 228 | if not main(): 229 | sys.exit(1) 230 | -------------------------------------------------------------------------------- /doc/intro.rst: 
-------------------------------------------------------------------------------- 1 | Introduction to katdal 2 | ====================== 3 | 4 | Data access library for data sets in the MeerKAT Visibility Format (MVF) 5 | 6 | Overview 7 | -------- 8 | 9 | This module serves as a data access library to interact with the chunk stores 10 | and HDF5 files produced by the MeerKAT radio telescope and its predecessors 11 | (KAT-7 and Fringe Finder). It uses memory carefully, allowing data sets to be 12 | inspected and partially loaded into memory. Data sets may be concatenated and 13 | split via a flexible selection mechanism. In addition, it provides a script to 14 | convert these data sets to CASA MeasurementSets. 15 | 16 | Quick Tutorial 17 | -------------- 18 | 19 | Open any data set through a single function to obtain a data set object:: 20 | 21 | import katdal 22 | d = katdal.open('1234567890.h5') 23 | 24 | This automatically determines the version and storage location of the data set. 25 | The versions roughly map to the various instruments:: 26 | 27 | - v1 : Fringe Finder (HDF5 file) 28 | - v2 : KAT-7 (HDF5 file) 29 | - v3 : MeerKAT (HDF5 file) 30 | - v4 : MeerKAT (chunk store based on objects in Ceph) 31 | 32 | Multiple data sets (even of different versions) may also be concatenated 33 | together (as long as they have the same dump rate):: 34 | 35 | d = katdal.open(['1234567890.h5', '1234567891.h5']) 36 | 37 | Inspect the contents of the data set by printing the object:: 38 | 39 | print d 40 | 41 | Here is a typical output:: 42 | 43 | =============================================================================== 44 | Name: 1313067732.h5 (version 2.0) 45 | =============================================================================== 46 | Observer: someone Experiment ID: 2118d346-c41a-11e0-b2df-a4badb44fe9f 47 | Description: 'Track on Hyd A,Vir A, 3C 286 and 3C 273' 48 | Observed from 2011-08-11 15:02:14.072 SAST to 2011-08-11 15:19:47.810 SAST 49 | Dump rate: 1.00025 Hz 50 | Subarrays: 1 51 | ID Antennas Inputs Corrprods 52 | 0 ant1,ant2,ant3,ant4,ant5,ant6,ant7 14 112 53 | Spectral Windows: 1 54 | ID CentreFreq(MHz) Bandwidth(MHz) Channels ChannelWidth(kHz) 55 | 0 1822.000 400.000 1024 390.625 56 | ------------------------------------------------------------------------------- 57 | Data selected according to the following criteria: 58 | subarray=0 59 | ants=['ant1', 'ant2', 'ant3', 'ant4', 'ant5', 'ant6', 'ant7'] 60 | spw=0 61 | ------------------------------------------------------------------------------- 62 | Shape: (1054 dumps, 1024 channels, 112 correlation products) => Size: 967.049 MB 63 | Antennas: *ant1,ant2,ant3,ant4,ant5,ant6,ant7 Inputs: 14 Autocorr: yes Crosscorr: yes 64 | Channels: 1024 (index 0 - 1023, 2021.805 MHz - 1622.195 MHz), each 390.625 kHz wide 65 | Targets: 4 selected out of 4 in catalogue 66 | ID Name Type RA(J2000) DEC(J2000) Tags Dumps ModelFlux(Jy) 67 | 0 Hyd A radec 9:18:05.28 -12:05:48.9 333 33.63 68 | 1 Vir A radec 12:30:49.42 12:23:28.0 251 166.50 69 | 2 3C 286 radec 13:31:08.29 30:30:33.0 230 12.97 70 | 3 3C 273 radec 12:29:06.70 2:03:08.6 240 39.96 71 | Scans: 8 selected out of 8 total Compscans: 1 selected out of 1 total 72 | Date Timerange(UTC) ScanState CompScanLabel Dumps Target 73 | 11-Aug-2011/13:02:14 - 13:04:26 0:slew 0: 133 0:Hyd A 74 | 13:04:27 - 13:07:46 1:track 0: 200 0:Hyd A 75 | 13:07:47 - 13:08:37 2:slew 0: 51 1:Vir A 76 | 13:08:38 - 13:11:57 3:track 0: 200 1:Vir A 77 | 13:11:58 - 13:12:27 4:slew 0: 30 2:3C 286 78 | 13:12:28 - 13:15:47 
5:track 0: 200 2:3C 286 79 | 13:15:48 - 13:16:27 6:slew 0: 40 3:3C 273 80 | 13:16:28 - 13:19:47 7:track 0: 200 3:3C 273 81 | 82 | The first segment of the printout displays the static information of the data 83 | set, including observer, dump rate and all the available subarrays and spectral 84 | windows in the data set. The second segment (between the dashed lines) highlights 85 | the active selection criteria. The last segment displays dynamic information 86 | that is influenced by the selection, including the overall visibility array 87 | shape, antennas, channel frequencies, targets and scan info. 88 | 89 | The data set is built around the concept of a three-dimensional visibility array 90 | with dimensions of time, frequency and correlation product. This is reflected in 91 | the *shape* of the dataset:: 92 | 93 | d.shape 94 | 95 | which returns (1054, 1024, 112), meaning 1054 dumps by 1024 channels by 112 96 | correlation products. 97 | 98 | Let's select a subset of the data set:: 99 | 100 | d.select(scans='track', channels=slice(200,300), ants='ant4') 101 | print d 102 | 103 | This results in the following printout:: 104 | 105 | =============================================================================== 106 | Name: /Users/schwardt/Downloads/1313067732.h5 (version 2.0) 107 | =============================================================================== 108 | Observer: siphelele Experiment ID: 2118d346-c41a-11e0-b2df-a4badb44fe9f 109 | Description: 'track on Hyd A,Vir A, 3C 286 and 3C 273 for Lud' 110 | Observed from 2011-08-11 15:02:14.072 SAST to 2011-08-11 15:19:47.810 SAST 111 | Dump rate: 1.00025 Hz 112 | Subarrays: 1 113 | ID Antennas Inputs Corrprods 114 | 0 ant1,ant2,ant3,ant4,ant5,ant6,ant7 14 112 115 | Spectral Windows: 1 116 | ID CentreFreq(MHz) Bandwidth(MHz) Channels ChannelWidth(kHz) 117 | 0 1822.000 400.000 1024 390.625 118 | ------------------------------------------------------------------------------- 119 | Data selected according to the following criteria: 120 | channels=slice(200, 300, None) 121 | subarray=0 122 | scans='track' 123 | ants='ant4' 124 | spw=0 125 | ------------------------------------------------------------------------------- 126 | Shape: (800 dumps, 100 channels, 4 correlation products) => Size: 2.560 MB 127 | Antennas: ant4 Inputs: 2 Autocorr: yes Crosscorr: no 128 | Channels: 100 (index 200 - 299, 1943.680 MHz - 1905.008 MHz), each 390.625 kHz wide 129 | Targets: 4 selected out of 4 in catalogue 130 | ID Name Type RA(J2000) DEC(J2000) Tags Dumps ModelFlux(Jy) 131 | 0 Hyd A radec 9:18:05.28 -12:05:48.9 200 31.83 132 | 1 Vir A radec 12:30:49.42 12:23:28.0 200 159.06 133 | 2 3C 286 radec 13:31:08.29 30:30:33.0 200 12.61 134 | 3 3C 273 radec 12:29:06.70 2:03:08.6 200 39.32 135 | Scans: 4 selected out of 8 total Compscans: 1 selected out of 1 total 136 | Date Timerange(UTC) ScanState CompScanLabel Dumps Target 137 | 11-Aug-2011/13:04:27 - 13:07:46 1:track 0: 200 0:Hyd A 138 | 13:08:38 - 13:11:57 3:track 0: 200 1:Vir A 139 | 13:12:28 - 13:15:47 5:track 0: 200 2:3C 286 140 | 13:16:28 - 13:19:47 7:track 0: 200 3:3C 273 141 | 142 | Compared to the first printout, the static information has remained the same 143 | while the dynamic information now reflects the selected subset. 
There are many 144 | possible selection criteria, as illustrated below:: 145 | 146 | d.select(timerange=('2011-08-11 13:10:00', '2011-08-11 13:15:00'), targets=[1, 2]) 147 | d.select(spw=0, subarray=0) 148 | d.select(ants='ant1,ant2', pol='H', scans=(0,1,2), freqrange=(1700e6, 1800e6)) 149 | 150 | See the docstring of :meth:`DataSet.select` for more detailed information (i.e. 151 | do `d.select?` in IPython). Take note that only one subarray and one spectral 152 | window must be selected. 153 | 154 | Once a subset of the data has been selected, you can access the data and 155 | timestamps on the data set object:: 156 | 157 | vis = d.vis[:] 158 | timestamps = d.timestamps[:] 159 | 160 | Note the `[:]` indexing, as the *vis* and *timestamps* properties are special 161 | :class:`LazyIndexer` objects that only give you the actual data when you use 162 | indexing, in order not to inadvertently load the entire array into memory. 163 | 164 | For the example dataset and no selection the *vis* array will have a shape of 165 | (1054, 1024, 112). The time dimension is labelled by `d.timestamps`, the 166 | frequency dimension by `d.channel_freqs` and the correlation product dimension 167 | by `d.corr_products`. 168 | 169 | Another key concept in the data set object is that of *sensors*. These are named 170 | time series of arbitrary data that are either loaded from the data set 171 | (*actual* sensors) or calculated on the fly (*virtual* sensors). Both variants 172 | are accessed through the *sensor cache* (available as `d.sensor`) and cached 173 | there after the first access. The data set object also provides convenient 174 | properties to expose commonly-used sensors, as shown in the plot example below:: 175 | 176 | import matplotlib.pyplot as plt 177 | plt.plot(d.az, d.el, 'o') 178 | plt.xlabel('Azimuth (degrees)') 179 | plt.ylabel('Elevation (degrees)') 180 | 181 | Other useful attributes include *ra*, *dec*, *lst*, *mjd*, *u*, *v*, *w*, 182 | *target_x* and *target_y*. These are all one-dimensional NumPy arrays that 183 | dynamically change length depending on the active selection. 184 | 185 | As in katdal's predecessor (scape) there is a :meth:`DataSet.scans` generator 186 | that allows you to step through the scans in the data set. It returns the 187 | scan index, scan state and target object on each iteration, and updates 188 | the active selection on the data set to include only the current scan. 189 | It is also possible to iterate through the compound scans with the 190 | :meth:`DataSet.compscans` generator, which yields the compound scan index, label 191 | and first target on each iteration for convenience. These two iterators may also 192 | be used together to traverse the data set structure:: 193 | 194 | for compscan, label, target in d.compscans(): 195 | plt.figure() 196 | for scan, state, target in d.scans(): 197 | if state in ('scan', 'track'): 198 | plt.plot(d.ra, d.dec, 'o') 199 | plt.xlabel('Right ascension (J2000 degrees)') 200 | plt.ylabel('Declination (J2000 degrees)') 201 | plt.title(target.name) 202 | 203 | Finally, all the targets (or fields) in the data set are stored in a catalogue 204 | available at `d.catalogue`, and the original HDF5 file is still accessible via 205 | a back door installed at `d.file` in the case of a single-file data set.
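
To tie these pieces together, here is a minimal sketch (reusing the hypothetical
file name and antenna from the examples above) that steps through the track scans
of a single antenna and prints the mean visibility amplitude of each one::

    import numpy as np
    import katdal

    d = katdal.open('1234567890.h5')
    d.select(ants='ant4', scans='track')
    for scan, state, target in d.scans():
        # The scan iterator narrows the selection to the current scan,
        # so d.vis[:] only loads one scan's worth of data at a time
        vis = d.vis[:]
        print(scan, target.name, np.abs(vis).mean())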
206 | 207 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | katdal 2 | ====== 3 | 4 | This package serves as a data access library to interact with the chunk stores 5 | and HDF5 files produced by the MeerKAT radio telescope and its predecessors 6 | (KAT-7 and Fringe Finder), which are collectively known as *MeerKAT Visibility 7 | Format (MVF)* data sets. It uses memory carefully, allowing data sets to be 8 | inspected and partially loaded into memory. Data sets may be concatenated and 9 | split via a flexible selection mechanism. In addition, it provides a script to 10 | convert these data sets to CASA MeasurementSets. 11 | 12 | Quick Tutorial 13 | -------------- 14 | 15 | Open any data set through a single function to obtain a data set object: 16 | 17 | .. code:: python 18 | 19 | import katdal 20 | d = katdal.open('1234567890.h5') 21 | 22 | The ``open`` function automatically determines the version and storage location 23 | of the data set. The versions roughly map to the various instruments:: 24 | 25 | - v1 : Fringe Finder (HDF5 file) 26 | - v2 : KAT-7 (HDF5 file) 27 | - v3 : MeerKAT (HDF5 file) 28 | - v4 : MeerKAT (RDB file + chunk store based on objects in Ceph) 29 | 30 | Each MVFv4 data set is split into a Redis dump (aka *RDB*) file containing the 31 | metadata in the form of a *telescope state* database, and a *chunk store* 32 | containing the visibility data split into many small blocks or chunks (typically 33 | served by a Ceph object store over the network). The RDB file is the main entry 34 | point to the data set and it can be accessed directly from the MeerKAT SDP 35 | archive if you have the appropriate permissions: 36 | 37 | .. code:: python 38 | 39 | # This is just for illustration - the real URL looks a bit different 40 | d = katdal.open('https://archive/1234567890/1234567890_sdp_l0.rdb?token=AsD3') 41 | 42 | Multiple data sets (even of different versions) may also be concatenated 43 | together (as long as they have the same dump rate): 44 | 45 | .. code:: python 46 | 47 | d = katdal.open(['1234567890.h5', '1234567891.h5']) 48 | 49 | Inspect the contents of the data set by printing the object: 50 | 51 | .. 
code:: python 52 | 53 | print(d) 54 | 55 | Here is a typical output:: 56 | 57 | =============================================================================== 58 | Name: 1313067732.h5 (version 2.0) 59 | =============================================================================== 60 | Observer: someone Experiment ID: 2118d346-c41a-11e0-b2df-a4badb44fe9f 61 | Description: 'Track on Hyd A,Vir A, 3C 286 and 3C 273' 62 | Observed from 2011-08-11 15:02:14.072 SAST to 2011-08-11 15:19:47.810 SAST 63 | Dump rate: 1.00025 Hz 64 | Subarrays: 1 65 | ID Antennas Inputs Corrprods 66 | 0 ant1,ant2,ant3,ant4,ant5,ant6,ant7 14 112 67 | Spectral Windows: 1 68 | ID CentreFreq(MHz) Bandwidth(MHz) Channels ChannelWidth(kHz) 69 | 0 1822.000 400.000 1024 390.625 70 | ------------------------------------------------------------------------------- 71 | Data selected according to the following criteria: 72 | subarray=0 73 | ants=['ant1', 'ant2', 'ant3', 'ant4', 'ant5', 'ant6', 'ant7'] 74 | spw=0 75 | ------------------------------------------------------------------------------- 76 | Shape: (1054 dumps, 1024 channels, 112 correlation products) => Size: 967.049 MB 77 | Antennas: *ant1,ant2,ant3,ant4,ant5,ant6,ant7 Inputs: 14 Autocorr: yes Crosscorr: yes 78 | Channels: 1024 (index 0 - 1023, 2021.805 MHz - 1622.195 MHz), each 390.625 kHz wide 79 | Targets: 4 selected out of 4 in catalogue 80 | ID Name Type RA(J2000) DEC(J2000) Tags Dumps ModelFlux(Jy) 81 | 0 Hyd A radec 9:18:05.28 -12:05:48.9 333 33.63 82 | 1 Vir A radec 12:30:49.42 12:23:28.0 251 166.50 83 | 2 3C 286 radec 13:31:08.29 30:30:33.0 230 12.97 84 | 3 3C 273 radec 12:29:06.70 2:03:08.6 240 39.96 85 | Scans: 8 selected out of 8 total Compscans: 1 selected out of 1 total 86 | Date Timerange(UTC) ScanState CompScanLabel Dumps Target 87 | 11-Aug-2011/13:02:14 - 13:04:26 0:slew 0: 133 0:Hyd A 88 | 13:04:27 - 13:07:46 1:track 0: 200 0:Hyd A 89 | 13:07:47 - 13:08:37 2:slew 0: 51 1:Vir A 90 | 13:08:38 - 13:11:57 3:track 0: 200 1:Vir A 91 | 13:11:58 - 13:12:27 4:slew 0: 30 2:3C 286 92 | 13:12:28 - 13:15:47 5:track 0: 200 2:3C 286 93 | 13:15:48 - 13:16:27 6:slew 0: 40 3:3C 273 94 | 13:16:28 - 13:19:47 7:track 0: 200 3:3C 273 95 | 96 | The first segment of the printout displays the static information of the data 97 | set, including observer, dump rate and all the available subarrays and spectral 98 | windows in the data set. The second segment (between the dashed lines) highlights 99 | the active selection criteria. The last segment displays dynamic information 100 | that is influenced by the selection, including the overall visibility array 101 | shape, antennas, channel frequencies, targets and scan info. 102 | 103 | The data set is built around the concept of a three-dimensional visibility array 104 | with dimensions of time, frequency and correlation product. This is reflected in 105 | the *shape* of the dataset: 106 | 107 | .. code:: python 108 | 109 | d.shape 110 | 111 | which returns ``(1054, 1024, 112)``, meaning 1054 dumps by 1024 channels by 112 112 | correlation products. 113 | 114 | Let's select a subset of the data set: 115 | 116 | .. 
code:: python 117 | 118 | d.select(scans='track', channels=slice(200, 300), ants='ant4') 119 | print(d) 120 | 121 | This results in the following printout:: 122 | 123 | =============================================================================== 124 | Name: /Users/schwardt/Downloads/1313067732.h5 (version 2.0) 125 | =============================================================================== 126 | Observer: siphelele Experiment ID: 2118d346-c41a-11e0-b2df-a4badb44fe9f 127 | Description: 'track on Hyd A,Vir A, 3C 286 and 3C 273 for Lud' 128 | Observed from 2011-08-11 15:02:14.072 SAST to 2011-08-11 15:19:47.810 SAST 129 | Dump rate: 1.00025 Hz 130 | Subarrays: 1 131 | ID Antennas Inputs Corrprods 132 | 0 ant1,ant2,ant3,ant4,ant5,ant6,ant7 14 112 133 | Spectral Windows: 1 134 | ID CentreFreq(MHz) Bandwidth(MHz) Channels ChannelWidth(kHz) 135 | 0 1822.000 400.000 1024 390.625 136 | ------------------------------------------------------------------------------- 137 | Data selected according to the following criteria: 138 | channels=slice(200, 300, None) 139 | subarray=0 140 | scans='track' 141 | ants='ant4' 142 | spw=0 143 | ------------------------------------------------------------------------------- 144 | Shape: (800 dumps, 100 channels, 4 correlation products) => Size: 2.560 MB 145 | Antennas: ant4 Inputs: 2 Autocorr: yes Crosscorr: no 146 | Channels: 100 (index 200 - 299, 1943.680 MHz - 1905.008 MHz), each 390.625 kHz wide 147 | Targets: 4 selected out of 4 in catalogue 148 | ID Name Type RA(J2000) DEC(J2000) Tags Dumps ModelFlux(Jy) 149 | 0 Hyd A radec 9:18:05.28 -12:05:48.9 200 31.83 150 | 1 Vir A radec 12:30:49.42 12:23:28.0 200 159.06 151 | 2 3C 286 radec 13:31:08.29 30:30:33.0 200 12.61 152 | 3 3C 273 radec 12:29:06.70 2:03:08.6 200 39.32 153 | Scans: 4 selected out of 8 total Compscans: 1 selected out of 1 total 154 | Date Timerange(UTC) ScanState CompScanLabel Dumps Target 155 | 11-Aug-2011/13:04:27 - 13:07:46 1:track 0: 200 0:Hyd A 156 | 13:08:38 - 13:11:57 3:track 0: 200 1:Vir A 157 | 13:12:28 - 13:15:47 5:track 0: 200 2:3C 286 158 | 13:16:28 - 13:19:47 7:track 0: 200 3:3C 273 159 | 160 | Compared to the first printout, the static information has remained the same 161 | while the dynamic information now reflects the selected subset. There are many 162 | possible selection criteria, as illustrated below: 163 | 164 | .. code:: python 165 | 166 | d.select(timerange=('2011-08-11 13:10:00', '2011-08-11 13:15:00'), targets=[1, 2]) 167 | d.select(spw=0, subarray=0) 168 | d.select(ants='ant1,ant2', pol='H', scans=(0,1,2), freqrange=(1700e6, 1800e6)) 169 | 170 | See the docstring of ``DataSet.select`` for more detailed information (i.e. 171 | do ``d.select?`` in IPython). Take note that only one subarray and one spectral 172 | window must be selected. 173 | 174 | Once a subset of the data has been selected, you can access the data and 175 | timestamps on the data set object: 176 | 177 | .. code:: python 178 | 179 | vis = d.vis[:] 180 | timestamps = d.timestamps[:] 181 | 182 | Note the ``[:]`` indexing, as the ``vis`` and ``timestamps`` properties are 183 | special ``LazyIndexer`` objects that only give you the actual data when 184 | you use indexing, in order not to inadvertently load the entire array into memory. 185 | 186 | For the example dataset and no selection the ``vis`` array will have a shape of 187 | ``(1054, 1024, 112)``. 
The time dimension is labelled by ``d.timestamps``, the 188 | frequency dimension by ``d.channel_freqs`` and the correlation product dimension 189 | by ``d.corr_products``. 190 | 191 | Another key concept in the data set object is that of *sensors*. These are named 192 | time series of arbitrary data that are either loaded from the data set 193 | (*actual* sensors) or calculated on the fly (*virtual* sensors). Both variants 194 | are accessed through the *sensor cache* (available as ``d.sensor``) and cached 195 | there after the first access. The data set object also provides convenient 196 | properties to expose commonly-used sensors, as shown in the plot example below: 197 | 198 | .. code:: python 199 | 200 | import matplotlib.pyplot as plt 201 | plt.plot(d.az, d.el, 'o') 202 | plt.xlabel('Azimuth (degrees)') 203 | plt.ylabel('Elevation (degrees)') 204 | 205 | Other useful attributes include ``ra``, ``dec``, ``lst``, ``mjd``, ``u``, 206 | ``v``, ``w``, ``target_x`` and ``target_y``. These are all one-dimensional 207 | NumPy arrays that dynamically change length depending on the active selection. 208 | 209 | As in katdal's predecessor (scape) there is a ``DataSet.scans`` generator 210 | that allows you to step through the scans in the data set. It returns the 211 | scan index, scan state and target object on each iteration, and updates 212 | the active selection on the data set to include only the current scan. 213 | It is also possible to iterate through the compound scans with the 214 | ``DataSet.compscans`` generator, which yields the compound scan index, label 215 | and first target on each iteration for convenience. These two iterators may also 216 | be used together to traverse the data set structure: 217 | 218 | .. code:: python 219 | 220 | for compscan, label, target in d.compscans(): 221 | plt.figure() 222 | for scan, state, target in d.scans(): 223 | if state in ('scan', 'track'): 224 | plt.plot(d.ra, d.dec, 'o') 225 | plt.xlabel('Right ascension (J2000 degrees)') 226 | plt.ylabel('Declination (J2000 degrees)') 227 | plt.title(target.name) 228 | 229 | Finally, all the targets (or fields) in the data set are stored in a catalogue 230 | available at ``d.catalogue``, and the original HDF5 file is still accessible via 231 | a back door installed at ``d.file`` in the case of a single-file data set (v3 232 | or older). On a v4 data set, ``d.source`` provides access to the underlying 233 | telstate for metadata and the chunk store for data. 234 | -------------------------------------------------------------------------------- /scripts/mvf_rechunk.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | ################################################################################ 4 | # Copyright (c) 2019-2021, National Research Foundation (SARAO) 5 | # 6 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 7 | # this file except in compliance with the License. You may obtain a copy 8 | # of the License at 9 | # 10 | # https://opensource.org/licenses/BSD-3-Clause 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 
17 | ################################################################################ 18 | 19 | """Rechunk an existing MVF dataset""" 20 | 21 | import argparse 22 | import multiprocessing 23 | import os 24 | import re 25 | import sys 26 | import urllib.parse 27 | 28 | import dask 29 | import dask.array as da 30 | import numpy as np 31 | from katsdptelstate.rdb_writer import RDBWriter 32 | 33 | from katdal.chunkstore import ChunkStoreError 34 | from katdal.chunkstore_npy import NpyFileChunkStore 35 | from katdal.datasources import (TelstateDataSource, infer_chunk_store, 36 | view_capture_stream) 37 | from katdal.flags import DATA_LOST 38 | 39 | 40 | class RechunkSpec: 41 | def __init__(self, arg): 42 | match = re.match(r'^([A-Za-z0-9_.]+)/([A-Za-z0-9_]+):(\d+),(\d+)', arg) 43 | if not match: 44 | raise ValueError(f'Could not parse {arg!r}') 45 | self.stream = match.group(1) 46 | self.array = match.group(2) 47 | self.time = int(match.group(3)) 48 | self.freq = int(match.group(4)) 49 | if self.time <= 0 or self.freq <= 0: 50 | raise ValueError('Chunk sizes must be positive') 51 | 52 | 53 | def _fill_missing(data, default_value, block_info): 54 | if data is None: 55 | info = block_info[None] 56 | return np.full(info['chunk-shape'], default_value, info['dtype']) 57 | else: 58 | return data 59 | 60 | 61 | def _make_lost(data, block_info): 62 | info = block_info[None] 63 | if data is None: 64 | return np.full(info['chunk-shape'], DATA_LOST, np.uint8) 65 | else: 66 | return np.zeros(info['chunk-shape'], np.uint8) 67 | 68 | 69 | class Array: 70 | def __init__(self, stream_name, array_name, store, chunk_info): 71 | self.stream_name = stream_name 72 | self.array_name = array_name 73 | self.chunk_info = chunk_info 74 | self.store = store 75 | full_name = store.join(chunk_info['prefix'], array_name) 76 | chunks = chunk_info['chunks'] 77 | dtype = chunk_info['dtype'] 78 | raw_data = store.get_dask_array(full_name, chunks, dtype, errors='none') 79 | # raw_data has `None` objects instead of ndarrays for chunks with 80 | # missing data. That's not actually valid as a dask array, but we use 81 | # it to produce lost flags (similarly to datasources.py). 82 | default_value = DATA_LOST if array_name == 'flags' else 0 83 | self.data = da.map_blocks(_fill_missing, raw_data, default_value, dtype=raw_data.dtype) 84 | self.lost_flags = da.map_blocks(_make_lost, raw_data, dtype=np.uint8) 85 | 86 | 87 | def get_chunk_store(source, telstate, array): 88 | """A wrapper around katdal.datasources.infer_chunk_store. 89 | 90 | It has a simpler interface, taking an URL rather than url_parts and kwargs. 91 | """ 92 | url_parts = urllib.parse.urlparse(source, scheme='file') 93 | kwargs = dict(urllib.parse.parse_qsl(url_parts.query)) 94 | return infer_chunk_store(url_parts, telstate, array=array, **kwargs) 95 | 96 | 97 | def comma_list(value): 98 | return value.split(',') 99 | 100 | 101 | def parse_args(): 102 | parser = argparse.ArgumentParser( 103 | description='Rechunk a single capture block. For each array within each stream, ' 104 | 'a new chunking scheme may be specified. 
A chunking scheme is ' 105 | 'specified as the number of dumps and channels per chunk.') 106 | parser.add_argument('--workers', type=int, default=8*multiprocessing.cpu_count(), 107 | help='Number of dask workers for parallel I/O [%(default)s]') 108 | parser.add_argument('--streams', type=comma_list, metavar='STREAM,STREAM', 109 | help='Streams to copy [all]') 110 | parser.add_argument('--s3-endpoint-url', help='URL where rechunked data will be uploaded') 111 | parser.add_argument('--new-prefix', help='Replacement for capture block ID in output bucket names') 112 | parser.add_argument('source', help='Input .rdb file') 113 | parser.add_argument('dest', help='Output directory') 114 | parser.add_argument('spec', nargs='*', default=[], type=RechunkSpec, 115 | metavar='STREAM/ARRAY:TIME,FREQ', help='New chunk specification') 116 | args = parser.parse_args() 117 | return args 118 | 119 | 120 | def get_stream_type(telstate, stream): 121 | try: 122 | return telstate.view(stream)['stream_type'] 123 | except KeyError: 124 | try: 125 | base = telstate.view(stream)['inherit'] 126 | return get_stream_type(telstate, base) 127 | except KeyError: 128 | return None 129 | 130 | 131 | def get_streams(telstate, streams): 132 | """Determine streams to copy based on what the user asked for""" 133 | archived_streams = telstate.get('sdp_archived_streams', []) 134 | archived_streams = [ 135 | stream for stream in archived_streams 136 | if get_stream_type(telstate, stream) in {'sdp.vis', 'sdp.flags'}] 137 | if not archived_streams: 138 | raise RuntimeError('Source dataset does not contain any visibility streams') 139 | if streams is None: 140 | streams = archived_streams 141 | else: 142 | for stream in streams: 143 | if stream not in archived_streams: 144 | raise RuntimeError('Stream {!r} is not known (should be one of {})' 145 | .format(stream, ', '.join(archived_streams))) 146 | 147 | return streams 148 | 149 | 150 | def main(): 151 | args = parse_args() 152 | dask.config.set(num_workers=args.workers) 153 | 154 | # Lightweight open with no data - just to create telstate and identify the CBID 155 | ds = TelstateDataSource.from_url(args.source, upgrade_flags=False, chunk_store=None) 156 | # View the CBID, but not any specific stream 157 | cbid = ds.capture_block_id 158 | telstate = ds.telstate.root().view(cbid) 159 | streams = get_streams(telstate, args.streams) 160 | 161 | # Find all arrays in the selected streams, and also ensure we're not 162 | # trying to write things back on top of an existing dataset. 163 | arrays = {} 164 | for stream_name in streams: 165 | sts = view_capture_stream(telstate, cbid, stream_name) 166 | try: 167 | chunk_info = sts['chunk_info'] 168 | except KeyError as exc: 169 | raise RuntimeError(f'Could not get chunk info for {stream_name!r}: {exc}') 170 | for array_name, array_info in chunk_info.items(): 171 | if args.new_prefix is not None: 172 | array_info['prefix'] = args.new_prefix + '-' + stream_name.replace('_', '-') 173 | prefix = array_info['prefix'] 174 | path = os.path.join(args.dest, prefix) 175 | if os.path.exists(path): 176 | raise RuntimeError(f'Directory {path!r} already exists') 177 | store = get_chunk_store(args.source, sts, array_name) 178 | # Older files have dtype as an object that can't be encoded in msgpack 179 | dtype = np.dtype(array_info['dtype']) 180 | array_info['dtype'] = np.lib.format.dtype_to_descr(dtype) 181 | arrays[(stream_name, array_name)] = Array(stream_name, array_name, store, array_info) 182 | 183 | # Apply DATA_LOST bits to the flags arrays. 
This is a less efficient approach than 184 | # datasources.py, but much simpler. 185 | for stream_name in streams: 186 | flags_array = arrays.get((stream_name, 'flags')) 187 | if not flags_array: 188 | continue 189 | sources = [stream_name] 190 | sts = view_capture_stream(telstate, cbid, stream_name) 191 | sources += sts['src_streams'] 192 | for src_stream in sources: 193 | if src_stream not in streams: 194 | continue 195 | src_ts = view_capture_stream(telstate, cbid, src_stream) 196 | for array_name in src_ts['chunk_info']: 197 | if array_name == 'flags' and src_stream != stream_name: 198 | # Upgraded flags completely replace the source stream's 199 | # flags, rather than augmenting them. Thus, data lost in 200 | # the source stream has no effect. 201 | continue 202 | lost_flags = arrays[(src_stream, array_name)].lost_flags 203 | lost_flags = lost_flags.rechunk(flags_array.data.chunks[:lost_flags.ndim]) 204 | # weights_channel doesn't have a baseline axis 205 | while lost_flags.ndim < flags_array.data.ndim: 206 | lost_flags = lost_flags[..., np.newaxis] 207 | lost_flags = da.broadcast_to(lost_flags, flags_array.data.shape, 208 | chunks=flags_array.data.chunks) 209 | flags_array.data |= lost_flags 210 | 211 | # Apply the rechunking specs 212 | for spec in args.spec: 213 | key = (spec.stream, spec.array) 214 | if key not in arrays: 215 | raise RuntimeError(f'{spec.stream}/{spec.array} is not a known array') 216 | arrays[key].data = arrays[key].data.rechunk({0: spec.time, 1: spec.freq}) 217 | 218 | # Write out the new data 219 | dest_store = NpyFileChunkStore(args.dest) 220 | stores = [] 221 | for array in arrays.values(): 222 | full_name = dest_store.join(array.chunk_info['prefix'], array.array_name) 223 | dest_store.create_array(full_name) 224 | stores.append(dest_store.put_dask_array(full_name, array.data)) 225 | array.chunk_info['chunks'] = array.data.chunks 226 | stores = da.compute(*stores) 227 | # put_dask_array returns an array with an exception object per chunk 228 | for result_set in stores: 229 | for result in result_set.flat: 230 | if result is not None: 231 | raise result 232 | 233 | # Fix up chunk_info for new chunking 234 | for stream_name in streams: 235 | sts = view_capture_stream(telstate, cbid, stream_name) 236 | chunk_info = sts['chunk_info'] 237 | for array_name in chunk_info.keys(): 238 | chunk_info[array_name] = arrays[(stream_name, array_name)].chunk_info 239 | sts.wrapped.delete('chunk_info') 240 | sts.wrapped['chunk_info'] = chunk_info 241 | # s3_endpoint_url is for the old version of the data 242 | sts.wrapped.delete('s3_endpoint_url') 243 | if args.s3_endpoint_url is not None: 244 | sts.wrapped['s3_endpoint_url'] = args.s3_endpoint_url 245 | 246 | # Write updated RDB file 247 | url_parts = urllib.parse.urlparse(args.source, scheme='file') 248 | dest_file = os.path.join(args.dest, args.new_prefix or cbid, os.path.basename(url_parts.path)) 249 | os.makedirs(os.path.dirname(dest_file), exist_ok=True) 250 | with RDBWriter(dest_file) as writer: 251 | writer.save(telstate.backend) 252 | 253 | 254 | if __name__ == '__main__': 255 | try: 256 | main() 257 | except (RuntimeError, ChunkStoreError) as exc: 258 | print(exc, file=sys.stderr) 259 | sys.exit(1) 260 | -------------------------------------------------------------------------------- /katdal/test/test_lazy_indexer.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 
2018-2019,2021-2023,2025, National Research Foundation (SARAO) 3 | # 4 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 5 | # this file except in compliance with the License. You may obtain a copy 6 | # of the License at 7 | # 8 | # https://opensource.org/licenses/BSD-3-Clause 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | ################################################################################ 16 | 17 | """Tests for :py:mod:`katdal.lazy_indexer`.""" 18 | 19 | from functools import partial 20 | from numbers import Integral 21 | 22 | import dask 23 | import dask.array as da 24 | import numpy as np 25 | from packaging.version import Version 26 | import pytest 27 | 28 | from katdal.lazy_indexer import (DaskLazyIndexer, _dask_oindex, 29 | _range_to_slice, _simplify_index, 30 | dask_getitem) 31 | 32 | 33 | def slice_to_range(s, length): 34 | return range(*s.indices(length)) 35 | 36 | 37 | class TestRangeToSlice: 38 | """Test the :func:`~katdal.lazy_indexer._range_to_slice` function.""" 39 | @staticmethod 40 | def _check_slice(start, stop, step): 41 | s = slice(start, stop, step) 42 | length = max(start, 0 if stop is None else stop) + 1 43 | r = slice_to_range(s, length) 44 | assert _range_to_slice(r) == s 45 | 46 | def test_basic_slices(self): 47 | # For testing both `start` and `stop` need to be non-negative 48 | self._check_slice(0, 10, 1) # contiguous, ascending 49 | self._check_slice(0, 10, 2) # strided, ascending 50 | self._check_slice(10, 0, -1) # contiguous, descending 51 | self._check_slice(10, 0, -2) # strided, descending 52 | self._check_slice(10, None, -2) # strided, descending all the way to 0 53 | self._check_slice(0, 1, 1) # single element (treated as ascending) 54 | self._check_slice(0, 10, 5) # any two elements (has stop = 2 * step) 55 | 56 | def test_negative_elements(self): 57 | with pytest.raises(ValueError): 58 | _range_to_slice([-1, -2, -3, -4]) 59 | 60 | def test_zero_increments(self): 61 | with pytest.raises(ValueError): 62 | _range_to_slice([1, 1, 1, 1]) 63 | 64 | def test_uneven_increments(self): 65 | with pytest.raises(ValueError): 66 | _range_to_slice([1, 1, 2, 3, 5, 8, 13]) 67 | 68 | 69 | class TestSimplifyIndex: 70 | """Test the :func:`~katdal.lazy_indexer._simplify_index` function.""" 71 | def setup_method(self): 72 | self.shape = (3, 4, 5) 73 | self.data = np.arange(np.prod(self.shape)).reshape(self.shape) 74 | 75 | def _test_with(self, indices): 76 | expected = self.data[indices] 77 | simplified = _simplify_index(indices, self.data.shape) 78 | actual = self.data[simplified] 79 | np.testing.assert_array_equal(actual, expected) 80 | 81 | def _test_index_error(self, indices): 82 | with pytest.raises(IndexError): 83 | simplified = _simplify_index(indices, self.data.shape) 84 | self.data[simplified] 85 | with pytest.raises(IndexError): 86 | self.data[indices] 87 | 88 | def test_1d(self): 89 | self._test_with(np.s_[np.array([False, True, False])]) 90 | self._test_with(np.s_[[1]]) 91 | 92 | def test_contiguous(self): 93 | self._test_with(np.s_[:, np.array([False, True, True, False]), :]) 94 | self._test_with(np.s_[:, [1, 2], :]) 95 | 96 | def test_discontiguous_but_regular(self): 97 | self._test_with(np.s_[:, [False, True, False, 
True], :]) 98 | self._test_with(np.s_[:, [1, 3], :]) 99 | 100 | def test_discontiguous(self): 101 | self._test_with(np.s_[:, [True, True, False, True], :]) 102 | self._test_with(np.s_[:, [0, 1, 3], :]) 103 | 104 | def test_all_false(self): 105 | self._test_with(np.s_[:, np.array([False, False, False, False]), :]) 106 | 107 | def test_all_true(self): 108 | self._test_with(np.s_[:, np.array([True, True, True, True]), :]) 109 | 110 | def test_newaxis(self): 111 | self._test_with(np.s_[np.newaxis, np.array([True, True, False])]) 112 | 113 | def test_ellipsis(self): 114 | self._test_with(np.s_[..., np.array([True, False, True, False, True])]) 115 | 116 | def test_wrong_length(self): 117 | self._test_index_error(np.s_[:, np.array([True, False]), :]) 118 | 119 | def test_too_many_axes(self): 120 | self._test_index_error(np.s_[0, 0, 0, 0]) 121 | 122 | def test_bad_index_dtype(self): 123 | self._test_index_error(np.s_[:, np.array([1.2, 3.4])]) 124 | 125 | 126 | def ix_(keep, shape): 127 | r"""Extend numpy.ix\_ to accept slices and single ints as well.""" 128 | # Inspired by Zarr's indexing.py (https://github.com/zarr-developers/zarr) 129 | keep = [slice_to_range(k, s) if isinstance(k, slice) 130 | else [k] if isinstance(k, Integral) 131 | else k 132 | for k, s in zip(keep, shape)] 133 | return np.ix_(*keep) 134 | 135 | 136 | def numpy_oindex(x, keep): 137 | """Perform outer indexing on a NumPy array (inspired by Zarr). 138 | 139 | This is more onerous, but calls `x.__getitem__` only once. 140 | """ 141 | # Inspired by Zarr's indexing.py (https://github.com/zarr-developers/zarr) 142 | # Get rid of ellipsis 143 | keep = da.slicing.normalize_index(keep, x.shape) 144 | new_axes = tuple(n for n, k in enumerate(keep) if k is np.newaxis) 145 | drop_axes = tuple(n for n, k in enumerate(keep) if isinstance(k, Integral)) 146 | # Get rid of newaxis 147 | keep = tuple(k for k in keep if k is not np.newaxis) 148 | keep = ix_(keep, x.shape) 149 | result = x[keep] 150 | for ax in new_axes: 151 | result = np.expand_dims(result, ax) 152 | result = result.squeeze(axis=drop_axes) 153 | return result 154 | 155 | 156 | def numpy_oindex_lite(x, keep): 157 | """Perform outer indexing on a NumPy array (compact version). 158 | 159 | This is more compact, but calls `x.__getitem__` `x.ndim` times. 160 | 161 | It also assumes that `keep` contains no ellipsis to be as pure as possible. 
162 | """ 163 | if not isinstance(keep, tuple): 164 | keep = (keep,) 165 | dim = 0 166 | result = x 167 | for k in keep: 168 | cumulative_index = (slice(None),) * dim + (k,) 169 | result = result[cumulative_index] 170 | # Handle dropped dimensions 171 | if not isinstance(k, Integral): 172 | dim += 1 173 | return result 174 | 175 | 176 | UNEVEN = [False, True, True, True, False, False, True, True, False, True] 177 | DASK_SLICE_BUG = Version(dask.__version__) >= Version('2024.8.0') 178 | 179 | 180 | class TestDaskGetitem: 181 | """Test the :func:`~katdal.lazy_indexer.dask_getitem` function.""" 182 | def setup_method(self): 183 | shape = (10, 20, 30, 40) 184 | self.data = np.arange(np.prod(shape)).reshape(shape) 185 | self.data_dask = da.from_array(self.data, chunks=(2, 5, 2, 5)) 186 | 187 | def _test_with(self, indices, normalised_indices=None): 188 | npy = numpy_oindex(self.data, indices) 189 | if normalised_indices is None: 190 | normalised_indices = indices 191 | npy_lite = numpy_oindex_lite(self.data, normalised_indices) 192 | oindex = _dask_oindex(self.data_dask, normalised_indices).compute() 193 | getitem = dask_getitem(self.data_dask, indices).compute() 194 | np.testing.assert_array_equal(npy, npy_lite) 195 | np.testing.assert_array_equal(getitem, npy) 196 | np.testing.assert_array_equal(oindex, npy) 197 | 198 | def test_misc_indices(self): 199 | self._test_with(()) 200 | self._test_with(2, (2,)) 201 | self._test_with((2, 3, 4, 5)) 202 | 203 | def test_ellipsis(self): 204 | self._test_with(np.s_[[0], ...], np.s_[[0], :, :, :]) 205 | self._test_with(np.s_[:, [0], ...], np.s_[:, [0], :, :]) 206 | self._test_with(np.s_[[0], ..., [0]], np.s_[[0], :, :, [0]]) 207 | 208 | def test_evenly_spaced_ints(self): 209 | self._test_with(np.s_[:, [0], [0], :]) 210 | self._test_with(np.s_[:, [0], :, [0]]) 211 | self._test_with(np.s_[:, [0], [0, 1, 2, 3], :]) 212 | self._test_with(np.s_[[0], [-1, -2, -3, -4, -5], :, [8, 6, 4, 2, 0]]) 213 | 214 | def test_evenly_spaced_booleans(self): 215 | pick_one = np.zeros(40, dtype=bool) 216 | pick_one[6] = True 217 | self._test_with(np.s_[:, [True, False] * 10, pick_one[:30], :]) 218 | self._test_with(np.s_[:, [False, True] * 10, :, pick_one]) 219 | self._test_with(np.s_[4:9, [False, True] * 10, 220 | [True, False] * 15, pick_one]) 221 | 222 | def test_unevenly_spaced_fancy_indexing(self): 223 | self._test_with(np.s_[:, [0, 1, 3], [1, 2, 4], :]) 224 | self._test_with(np.s_[UNEVEN, 2 * UNEVEN, 3 * UNEVEN, 4 * UNEVEN]) 225 | 226 | def test_repeated_fancy_indexing(self): 227 | self._test_with(np.s_[:, [1, 1, 1], [6, 6, 6], :]) 228 | 229 | def test_slices(self): 230 | self._test_with(np.s_[0:2, 2:4, 4:6, 6:8]) 231 | self._test_with(np.s_[-8:-6, -4:-2, 3:10:2, -2:]) 232 | 233 | def test_single_ints(self): 234 | self._test_with(np.s_[:, [0], 0, :]) 235 | self._test_with(np.s_[:, [0], :, 0]) 236 | self._test_with(np.s_[:, [0], -1, :]) 237 | self._test_with(np.s_[:, [0], :, -1]) 238 | self._test_with(np.s_[:, 0, [0, 2], [1, 3, 5]]) 239 | 240 | @pytest.mark.skipif(DASK_SLICE_BUG, reason="Dask newaxis + mask slicing broken") 241 | def test_newaxis(self): 242 | self._test_with(np.s_[np.newaxis, :, 2 * UNEVEN, :, 0]) 243 | self._test_with(np.s_[:, 2 * UNEVEN, np.newaxis, 0, :]) 244 | self._test_with(np.s_[0, np.newaxis, 1, np.newaxis, 2, np.newaxis, 3]) 245 | 246 | @pytest.mark.skipif(DASK_SLICE_BUG, reason="Dask newaxis + mask slicing broken") 247 | def test_the_lot(self): 248 | self._test_with(np.s_[..., 0, 2:5, 3 * UNEVEN, np.newaxis, [4, 6]], 249 | np.s_[0, 2:5, 3 * 
UNEVEN, np.newaxis, [4, 6]]) 250 | 251 | 252 | class TestDaskLazyIndexer: 253 | """Test the :class:`~katdal.lazy_indexer.DaskLazyIndexer` class.""" 254 | def setup_method(self): 255 | shape = (10, 20, 30) 256 | self.data = np.arange(np.prod(shape)).reshape(shape) 257 | self.data_dask = da.from_array(self.data, chunks=(1, 4, 5), name='x') 258 | 259 | def test_str_repr(self): 260 | def transform1(x): 261 | return x 262 | transform2 = lambda x: x # noqa: E731 263 | class Transform3: # noqa: E306 264 | def __call__(self, x): 265 | return x 266 | transform3 = Transform3() 267 | transform4 = partial(transform1) 268 | transforms = [transform1, transform2, transform3, transform4] 269 | indexer = DaskLazyIndexer(self.data_dask, transforms=transforms) 270 | expected = 'x | transform1 | | Transform3 | transform1' 271 | expected += f' -> {indexer.shape} {indexer.dtype}' 272 | assert str(indexer) == expected 273 | # Simply exercise repr - no need to check result 274 | repr(indexer) 275 | 276 | def _test_with(self, stage1=(), stage2=()): 277 | npy1 = numpy_oindex(self.data, stage1) 278 | npy2 = numpy_oindex(npy1, stage2) 279 | indexer = DaskLazyIndexer(self.data_dask, stage1) 280 | np.testing.assert_array_equal(indexer[stage2], npy2) 281 | # Check nested indexers 282 | indexer2 = DaskLazyIndexer(indexer, stage2) 283 | np.testing.assert_array_equal(indexer2[()], npy2) 284 | 285 | def test_stage1_slices(self): 286 | self._test_with(np.s_[5:, :, 1::2]) 287 | 288 | def test_stage2_ints(self): 289 | self._test_with(np.s_[5:, :, 1::2], np.s_[1, 2, -1]) 290 | 291 | def test_stage1_multiple_fancy_indices(self): 292 | self._test_with(tuple([True] * d for d in self.data.shape)) 293 | self._test_with(tuple([True, False] * (d // 2) 294 | for d in self.data.shape)) 295 | self._test_with(np.s_[UNEVEN, 2 * UNEVEN, :24]) 296 | self._test_with(np.s_[:3, [1, 2, 3, 4, 6, 9], [8, 6, 4, 2, 0]]) 297 | 298 | def test_stage2_multiple_fancy_indices(self): 299 | stage1 = tuple([True] * d for d in self.data.shape) 300 | stage2 = tuple([True] * 4 + [False] * (d - 4) for d in self.data.shape) 301 | self._test_with(stage1, stage2) 302 | stage2 = tuple([True, False] * (d // 2) for d in self.data.shape) 303 | self._test_with(stage1, stage2) 304 | stage1 = np.s_[UNEVEN, 2 * UNEVEN, :24] 305 | stage2 = np.s_[:3, [1, 2, 3, 4, 6, 9], [8, 6, 4, 2, 0]] 306 | self._test_with(stage1, stage2) 307 | 308 | def test_transforms(self): 309 | # Add transform at initialisation 310 | indexer = DaskLazyIndexer(self.data_dask, transforms=[lambda x: 0 * x]) 311 | np.testing.assert_array_equal(indexer[:], np.zeros_like(indexer)) 312 | # Check nested indexers 313 | indexer = DaskLazyIndexer(self.data_dask) 314 | indexer2 = DaskLazyIndexer(indexer, transforms=[lambda x: 0 * x]) 315 | np.testing.assert_array_equal(indexer[:], self.data) 316 | np.testing.assert_array_equal(indexer2[:], np.zeros_like(indexer)) 317 | -------------------------------------------------------------------------------- /katdal/test/test_vis_flags_weights.py: -------------------------------------------------------------------------------- 1 | ################################################################################ 2 | # Copyright (c) 2018-2022,2024, National Research Foundation (SARAO) 3 | # 4 | # Licensed under the BSD 3-Clause License (the "License"); you may not use 5 | # this file except in compliance with the License. 
You may obtain a copy 6 | # of the License at 7 | # 8 | # https://opensource.org/licenses/BSD-3-Clause 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | ################################################################################ 16 | 17 | """Tests for :py:mod:`katdal.vis_flags_weights`.""" 18 | 19 | import itertools 20 | import os 21 | import random 22 | import shutil 23 | import tempfile 24 | 25 | import dask.array as da 26 | import numpy as np 27 | from numpy.testing import assert_array_equal 28 | import pytest 29 | 30 | from katdal.chunkstore import generate_chunks 31 | from katdal.chunkstore_npy import NpyFileChunkStore 32 | from katdal.flags import DATA_LOST 33 | from katdal.van_vleck import autocorr_lookup_table 34 | from katdal.lazy_indexer import DaskLazyIndexer 35 | from katdal.vis_flags_weights import (ChunkStoreVisFlagsWeights, 36 | VisFlagsWeights, corrprod_to_autocorr) 37 | 38 | 39 | def test_vis_flags_weights(): 40 | with pytest.raises(ValueError): 41 | VisFlagsWeights(np.ones((1, 2, 3)), np.ones((1, 2, 3)), np.ones((1, 2, 4))) 42 | with pytest.raises(ValueError): 43 | VisFlagsWeights(np.ones((1, 2, 3)), np.ones((1, 2, 3)), np.ones((1, 2, 3)), np.ones((1, 2, 4))) 44 | 45 | 46 | def ramp(shape, offset=1.0, slope=1.0, dtype=np.float64): 47 | """Generate a multidimensional ramp of values of the given dtype.""" 48 | x = offset + slope * np.arange(np.prod(shape), dtype=np.float64) 49 | return x.astype(dtype).reshape(shape) 50 | 51 | 52 | def to_dask_array(x, chunks=None): 53 | """Turn ndarray `x` into dask array with the standard vis-like chunking.""" 54 | if chunks is None: 55 | itemsize = np.dtype('complex64').itemsize 56 | # Special case for 2-D weights_channel array ensures one chunk per dump 57 | n_corrprods = x.shape[2] if x.ndim >= 3 else x.shape[1] // itemsize 58 | # This contrives to have a vis array with 1 dump and 4 channels per chunk 59 | chunk_size = 4 * n_corrprods * itemsize 60 | chunks = generate_chunks(x.shape, x.dtype, chunk_size, 61 | dims_to_split=(0, 1), power_of_two=True) 62 | return da.from_array(x, chunks) 63 | 64 | 65 | def put_fake_dataset(store, prefix, shape, chunk_overrides=None, array_overrides=None, flags_only=False): 66 | """Write a fake dataset into the chunk store.""" 67 | if flags_only: 68 | data = {'flags': np.random.RandomState(1).randint(0, 7, shape, dtype=np.uint8)} 69 | else: 70 | data = {'correlator_data': ramp(shape, dtype=np.float32) * (1 - 1j), 71 | 'flags': np.random.RandomState(2).randint(0, 7, shape, dtype=np.uint8), 72 | 'weights': ramp(shape, slope=255. 
/ np.prod(shape), dtype=np.uint8), 73 | 'weights_channel': ramp(shape[:-1], dtype=np.float32)} 74 | if array_overrides is not None: 75 | for name in data: 76 | if name in array_overrides: 77 | data[name] = array_overrides[name] 78 | if chunk_overrides is None: 79 | chunk_overrides = {} 80 | ddata = {k: to_dask_array(array, chunk_overrides.get(k)) for k, array in data.items()} 81 | chunk_info = {k: {'prefix': prefix, 'chunks': darray.chunks, 82 | 'dtype': np.lib.format.dtype_to_descr(darray.dtype), 83 | 'shape': darray.shape} 84 | for k, darray in ddata.items()} 85 | for k, darray in ddata.items(): 86 | store.create_array(store.join(prefix, k)) 87 | push = [store.put_dask_array(store.join(prefix, k), darray) 88 | for k, darray in ddata.items()] 89 | da.compute(*push) 90 | return data, chunk_info 91 | 92 | 93 | class TestChunkStoreVisFlagsWeights: 94 | """Test the :class:`ChunkStoreVisFlagsWeights` dataset store.""" 95 | 96 | @classmethod 97 | def setup_class(cls): 98 | cls.tempdir = tempfile.mkdtemp() 99 | 100 | @classmethod 101 | def teardown_class(cls): 102 | shutil.rmtree(cls.tempdir) 103 | 104 | def _make_basic_dataset(self): 105 | store = NpyFileChunkStore(self.tempdir) 106 | prefix = 'cb1' 107 | shape = (10, 64, 30) 108 | data, chunk_info = put_fake_dataset(store, prefix, shape) 109 | weights = data['weights'] * data['weights_channel'][..., np.newaxis] 110 | return store, chunk_info, data, weights 111 | 112 | def test_construction(self): 113 | # Put fake dataset into chunk store 114 | store, chunk_info, data, weights = self._make_basic_dataset() 115 | # Check that data is as expected when accessed via VisFlagsWeights 116 | vfw = ChunkStoreVisFlagsWeights(store, chunk_info) 117 | assert vfw.shape == data['correlator_data'].shape 118 | assert_array_equal(vfw.vis.compute(), data['correlator_data']) 119 | assert_array_equal(vfw.flags.compute(), data['flags']) 120 | assert_array_equal(vfw.weights.compute(), weights) 121 | assert vfw.unscaled_weights is None 122 | 123 | def test_index(self): 124 | # Put fake dataset into chunk store 125 | store, chunk_info, data, weights = self._make_basic_dataset() 126 | index = np.s_[2:5, -20:] 127 | vfw = ChunkStoreVisFlagsWeights(store, chunk_info, preselect_index=index) 128 | assert_array_equal(vfw.vis.compute(), data['correlator_data'][index]) 129 | assert_array_equal(vfw.flags.compute(), data['flags'][index]) 130 | assert_array_equal(vfw.weights.compute(), weights[index]) 131 | 132 | def test_lazy_indexer_interaction(self): 133 | # Put fake dataset into chunk store 134 | store, chunk_info, data, weights = self._make_basic_dataset() 135 | vfw = ChunkStoreVisFlagsWeights(store, chunk_info) 136 | # Check that the combination of DaskLazyIndexer and VisFlagsWeights works 137 | vis_indexer = DaskLazyIndexer(vfw.vis) 138 | flags_indexer = DaskLazyIndexer(vfw.flags) 139 | weights_indexer = DaskLazyIndexer(vfw.weights) 140 | assert_array_equal(vis_indexer[:], data['correlator_data']) 141 | assert_array_equal(flags_indexer[:], data['flags']) 142 | assert_array_equal(weights_indexer[:], weights) 143 | # Probe the case where we select a small portion of the data, which 144 | # has a different code path and also represents what mvftoms does. 
145 | assert_array_equal(vis_indexer[0], data['correlator_data'][0]) 146 | assert_array_equal(flags_indexer[0], data['flags'][0]) 147 | assert_array_equal(weights_indexer[0], weights[0]) 148 | # Also check fancy indexing to complete the set 149 | dumps = np.ones(vfw.shape[0], dtype=bool) 150 | dumps[2:5] = False 151 | dumps[8:] = False 152 | assert_array_equal(vis_indexer[dumps], data['correlator_data'][dumps]) 153 | assert_array_equal(flags_indexer[dumps], data['flags'][dumps]) 154 | assert_array_equal(weights_indexer[dumps], weights[dumps]) 155 | 156 | def test_van_vleck(self): 157 | ants = 7 158 | index1, index2 = np.triu_indices(ants) 159 | inputs = [f'm{i:03}h' for i in range(ants)] 160 | corrprods = np.array([(inputs[a], inputs[b]) for (a, b) in zip(index1, index2)]) 161 | auto_indices, _, _ = corrprod_to_autocorr(corrprods) 162 | # Put fake dataset into chunk store 163 | store = NpyFileChunkStore(self.tempdir) 164 | prefix = 'cb1' 165 | shape = (10, 256, len(index1)) 166 | _, chunk_info = put_fake_dataset(store, prefix, shape, 167 | chunk_overrides={'correlator_data': (1, 4, shape[2] // 2)}) 168 | # Extract uncorrected visibilities and correct them manually 169 | vfw = ChunkStoreVisFlagsWeights(store, chunk_info, corrprods, van_vleck='off') 170 | raw_vis = vfw.vis.compute() 171 | # Yes, this is hard-coded for MeerKAT for now - only fix this once necessary 172 | levels = np.arange(-127., 128.) 173 | quantised_autocorr_table, true_autocorr_table = autocorr_lookup_table(levels) 174 | expected_vis = raw_vis.copy() 175 | expected_vis[..., auto_indices] = np.interp(raw_vis[..., auto_indices].real, 176 | quantised_autocorr_table, true_autocorr_table) 177 | # Now extract corrected visibilities via VisFlagsWeights and compare 178 | corrected_vfw = ChunkStoreVisFlagsWeights(store, chunk_info, corrprods, van_vleck='autocorr') 179 | assert_array_equal(corrected_vfw.vis.compute(), expected_vis) 180 | # Check parameter validation 181 | with pytest.raises(ValueError): 182 | ChunkStoreVisFlagsWeights(store, chunk_info, corrprods, van_vleck='blah') 183 | 184 | def test_weight_power_scale(self): 185 | ants = 7 186 | index1, index2 = np.triu_indices(ants) 187 | inputs = [f'm{i:03}h' for i in range(ants)] 188 | corrprods = np.array([(inputs[a], inputs[b]) for (a, b) in zip(index1, index2)]) 189 | # Put fake dataset into chunk store 190 | store = NpyFileChunkStore(self.tempdir) 191 | prefix = 'cb1' 192 | shape = (10, 64, len(index1)) 193 | 194 | # Make up some vis data where the expected scaling factors can be 195 | # computed by hand. Note: the autocorrs are all set to powers of 196 | # 2 so that we avoid any rounding errors. 
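        # (Reading off the expected values below: the scale factor for baseline (p, q)
        # is the reciprocal of the product of the two autocorrelations, e.g. the default
        # autocorr value of 2 gives a scale of 1 / (2 * 2) = 0.25.)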
197 | vis = np.full(shape, 2 + 3j, np.complex64) 198 | vis[:, :, index1 == index2] = 2 # Make all autocorrs real 199 | vis[3, :, index1 == index2] = 4 # Tests time indexing 200 | vis[:, 7, index1 == index2] = 4 # Tests frequency indexing 201 | vis[:, :, ants] *= 8 # The (1, 1) baseline 202 | vis[4, 5, 0] = 0 # The (0, 0) baseline 203 | expected_scale = np.full(shape, 0.25, np.float32) 204 | expected_scale[3, :, :] = 1 / 16 205 | expected_scale[:, 7, :] = 1 / 16 206 | expected_scale[:, :, index1 == 1] /= 8 207 | expected_scale[:, :, index2 == 1] /= 8 208 | expected_scale[4, 5, index1 == 0] = 2.0**-32 209 | expected_scale[4, 5, index2 == 0] = 2.0**-32 210 | # The inverse scaling effectively multiplies by the relevant autocorrs 211 | expected_inverse_scale = np.reciprocal(expected_scale) 212 | # The tiny "bad" weights are not inverted but zeroed instead, a la pseudo-inverse 213 | expected_inverse_scale[4, 5, index1 == 0] = 0 214 | expected_inverse_scale[4, 5, index2 == 0] = 0 215 | 216 | data, chunk_info = put_fake_dataset( 217 | store, prefix, shape, array_overrides={'correlator_data': vis}) 218 | stored_weights = data['weights'] * data['weights_channel'][..., np.newaxis] 219 | 220 | # Check that data is as expected when accessed via VisFlagsWeights 221 | vfw = ChunkStoreVisFlagsWeights(store, chunk_info, corrprods, 222 | stored_weights_are_scaled=False) 223 | assert vfw.shape == data['correlator_data'].shape 224 | assert_array_equal(vfw.vis.compute(), data['correlator_data']) 225 | assert_array_equal(vfw.flags.compute(), data['flags']) 226 | assert_array_equal(vfw.weights.compute(), stored_weights * expected_scale) 227 | assert_array_equal(vfw.unscaled_weights.compute(), stored_weights) 228 | 229 | # Check that scaled raw weights are also accepted 230 | vfw = ChunkStoreVisFlagsWeights(store, chunk_info, corrprods, 231 | stored_weights_are_scaled=True) 232 | assert vfw.shape == data['correlator_data'].shape 233 | assert_array_equal(vfw.vis.compute(), data['correlator_data']) 234 | assert_array_equal(vfw.flags.compute(), data['flags']) 235 | assert_array_equal(vfw.weights.compute(), stored_weights) 236 | assert_array_equal(vfw.unscaled_weights.compute(), 237 | stored_weights * expected_inverse_scale) 238 | 239 | def _test_missing_chunks(self, shape, chunk_overrides=None): 240 | # Put fake dataset into chunk store 241 | store = NpyFileChunkStore(self.tempdir) 242 | prefix = 'cb2' 243 | data, chunk_info = put_fake_dataset(store, prefix, shape, chunk_overrides) 244 | # Delete some random chunks in each array of the dataset 245 | missing_chunks = {} 246 | rs = random.Random(4) 247 | for array, info in chunk_info.items(): 248 | array_name = store.join(prefix, array) 249 | slices = da.core.slices_from_chunks(info['chunks']) 250 | culled_slices = rs.sample(slices, len(slices) // 10 + 1) 251 | missing_chunks[array] = culled_slices 252 | for culled_slice in culled_slices: 253 | chunk_name, shape = store.chunk_metadata(array_name, culled_slice) 254 | os.remove(os.path.join(store.path, chunk_name) + '.npy') 255 | vfw = ChunkStoreVisFlagsWeights(store, chunk_info) 256 | assert vfw.store == store 257 | assert vfw.vis_prefix == prefix 258 | # Check that (only) missing chunks have been replaced by zeros 259 | vis = data['correlator_data'] 260 | for culled_slice in missing_chunks['correlator_data']: 261 | vis[culled_slice] = 0. 
262 | assert_array_equal(vfw.vis, vis) 263 | weights = data['weights'] * data['weights_channel'][..., np.newaxis] 264 | for culled_slice in missing_chunks['weights'] + missing_chunks['weights_channel']: 265 | weights[culled_slice] = 0. 266 | assert_array_equal(vfw.weights, weights) 267 | # Check that (only) missing chunks have been flagged as 'data lost' 268 | flags = data['flags'] 269 | for culled_slice in missing_chunks['flags']: 270 | flags[culled_slice] = 0 271 | for culled_slice in itertools.chain(*missing_chunks.values()): 272 | flags[culled_slice] |= DATA_LOST 273 | assert_array_equal(vfw.flags, flags) 274 | 275 | def test_missing_chunks(self): 276 | self._test_missing_chunks((100, 256, 30)) 277 | 278 | def test_missing_chunks_uneven_chunking(self): 279 | self._test_missing_chunks( 280 | (20, 210, 30), 281 | { 282 | 'correlator_data': (1, 6, 30), 283 | 'weights': (5, 10, 15), 284 | 'weights_channel': (1, 7), 285 | 'flags': (4, 15, 30) 286 | }) 287 | --------------------------------------------------------------------------------