├── .gitignore ├── CHANGELOG.rst ├── INSTALL.rst ├── LICENSE.txt ├── README.rst ├── cuvarbase ├── __init__.py ├── bls.py ├── ce.py ├── core.py ├── cunfft.py ├── kernels │ ├── bls.cu │ ├── ce.cu │ ├── cunfft.cu │ ├── lomb.cu │ ├── pdm.cu │ └── wavelet.cu ├── lombscargle.py ├── pdm.py ├── tests │ ├── __init__.py │ ├── test_bls.py │ ├── test_ce.py │ ├── test_lombscargle.py │ ├── test_nfft.py │ └── test_pdm.py └── utils.py ├── docs ├── Makefile ├── requirements.txt └── source │ ├── bls.rst │ ├── ce.rst │ ├── conf.py │ ├── cuvarbase.rst │ ├── cuvarbase.tests.rst │ ├── index.rst │ ├── install.rst │ ├── logo.png │ ├── lomb.rst │ ├── modules.rst │ ├── plots │ ├── benchmarks.py │ ├── bls_example.py │ ├── bls_example_transit.py │ ├── bls_transit_diagram.py │ ├── ce_example.py │ ├── logo.py │ └── planet_transit_diagram.py │ └── whatsnew.rst ├── notebooks ├── Conditional entropy.ipynb ├── Lomb Scargle.ipynb ├── PDM2_bin.jpg ├── PDM2_binless_gauss.jpg ├── PDM2_binless_tophat.jpg ├── PDM_bin.jpg └── Phase Dispersion Minimization.ipynb ├── publish_docs.sh ├── requirements.txt ├── setup.cfg ├── setup.py └── test_python_versions.sh /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | .pytest_cache/ 46 | 47 | # Translations 48 | *.mo 49 | *.pot 50 | 51 | # Django stuff: 52 | *.log 53 | 54 | # Sphinx documentation 55 | docs/build/ 56 | 57 | # PyBuilder 58 | target/ 59 | 60 | # emacs backups 61 | *~ 62 | \#*\# 63 | 64 | .ipynb_checkpoints 65 | .idea/* 66 | tools/repos 67 | Untitled*.ipynb 68 | 69 | # vim backups 70 | *.swp 71 | 72 | # LaTeX 73 | *.aux 74 | *.pdf 75 | 76 | # misc 77 | scripts/saved_results 78 | .DS_Store 79 | work/ 80 | *.png 81 | *.gif 82 | *HAT*txt 83 | testing/* 84 | custom_test_ce.py 85 | -------------------------------------------------------------------------------- /CHANGELOG.rst: -------------------------------------------------------------------------------- 1 | What's new in cuvarbase 2 | *********************** 3 | * **0.2.5** 4 | * swap out pycuda.autoinit for pycuda.autoprimaryctx to handle "cuFuncSetBlockShape" error 5 | 6 | * **0.2.4** 7 | * bugfix for pytest (broke b/c of incorrect fixture usage) 8 | * added ``ignore_negative_delta_sols`` option to BLS to ignore inverted dips in the lightcurve 9 | 10 | * **0.2.1** 11 | * bugfix for memory leak in BLS 12 | * contact email changed in setup 13 | 14 | * **0.2.0** 15 | * Many more unit tests for BLS and CE. 16 | * BLS 17 | * Now several orders of magnitude faster! Use ``use_fast=True`` in ``eebls_transit_gpu`` or use ``eebls_gpu_fast``. 18 | * Bug-fix for boost-python error when calling ``eebls_gpu_fast``. 
19 |     * CE
20 |         * New ``use_fast`` parameter in ``ConditionalEntropyAsyncProcess``; if selected, will use a kernel that should be substantially more efficient and that requires no memory overhead. If selected, you should use the ``run`` function and not the ``large_run`` function. Currently the ``weighted`` option is not supported when ``use_fast`` is ``True``.
21 |         * Bug-fix for ``mag_overlap > 0``.
22 | 
23 | * **0.1.9**
24 |     * Added Sphinx documentation
25 |     * **Now Python 3 compatible!**
26 |     * Miscellaneous bug fixes
27 |     * CE
28 |         * Run functions for ``ConditionalEntropyAsyncProcess`` now allow a ``balanced_magbins`` argument that sets the magnitude bins to have widths that vary with the distribution of magnitude values. This is more robust to outliers, but performance comparisons with the usual CE algorithm indicate that you should use care.
29 |         * Added a ``precompute`` function to ``ConditionalEntropyAsyncProcess`` that allows you to speed up computations without resorting to the ``batched_run_constant_nfreq`` function. Currently it still assumes that the frequencies used will be the same for all lightcurves.
30 |     * GLS
31 |         * Added a ``precompute`` function to ``LombScargleAsyncProcess``.
32 |         * Avoids allocating GPU memory for the NFFT when ``use_fft`` is ``False``.
33 |         * ``LombScargleAsyncProcess.memory_requirement`` is now implemented.
34 |     * BLS
35 |         * ``eebls_gpu``, ``eebls_transit_gpu``, and ``eebls_custom_gpu`` now have a ``max_memory`` option that lets you automatically set the ``batch_size`` without worrying about memory allocation errors.
36 |         * ``eebls_transit_gpu`` now allows a ``freqs`` argument and a ``qvals`` argument for customizing the frequencies and the fiducial ``q`` values.
37 |         * Fixed a small bug in ``fmin_transit`` that miscalculated the minimum frequency.
38 | 
39 | * **0.1.8**
40 |     * Removed gamma function usage from the Baluev 2008 false alarm probability (``use_gamma=True`` will override this)
41 |     * Fixed a bug in the GLS notebook
42 | 
43 | * **0.1.6/0.1.7**
44 |     * Some bug fixes for GLS
45 |     * ``large_run`` function for the Conditional Entropy period finder allows large frequency grids
46 |       without raising memory allocation errors.
47 |     * More unit tests for conditional entropy
48 |     * Conditional entropy now supports double precision with the ``use_double`` argument
49 | 
50 | * **0.1.5**
51 |     * Conditional Entropy period finder now unit tested
52 |     * Weighted variant also implemented -- accounts for heteroskedasticity if
53 |       that's important
54 |     * BLS
55 |         * New unit tests
56 |         * A new transiting exoplanet BLS function: ``eebls_transit_gpu``
57 |             * Only searches the plausible parameter space for a Keplerian orbit
58 |     * GLS
59 |         * False alarm probability: ``fap_baluev``
60 |             * Implements the `Baluev 2008 `_ false alarm probability measure based on extreme value theory
61 | 
62 | 

--------------------------------------------------------------------------------
/INSTALL.rst:
--------------------------------------------------------------------------------
1 | Install instructions
2 | ********************
3 | 
4 | These installation instructions are for Linux/BSD-based systems (OS X/macOS, Ubuntu, etc.). Windows users, your suggestions and feedback are welcome if we can make your life easier!
5 | 
6 | Installing the Nvidia Toolkit
7 | -----------------------------
8 | 
9 | ``cuvarbase`` requires PyCUDA and scikit-cuda, both of which require the Nvidia toolkit for access to the Nvidia compiler, drivers, and runtime libraries.
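Before going further, it can help to know whether a CUDA toolkit is already installed and visible. A quick check (an illustrative snippet using only the Python standard library; it is not part of ``cuvarbase``):

.. code:: python

    import shutil

    # nvcc ships with the CUDA toolkit; None here means the toolkit is
    # either missing or not on your $PATH (see the warning below).
    print(shutil.which("nvcc"))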
10 | 
11 | Go to the `NVIDIA Download page `_ and select the distribution for your operating system. Everything has been developed and tested using **version 8.0**, so it may be best to stick with that version for now until we verify that later versions are OK.
12 | 
13 | .. warning::
14 | 
15 |     Make sure that your ``$PATH`` environment variable contains the location of the ``CUDA`` binaries. You can test this by trying
16 |     ``which nvcc`` from your terminal. If nothing is printed, you'll have to amend your ``~/.bashrc`` file:
17 | 
18 |     ``echo "export PATH=/usr/local/cuda/bin:${PATH}" >> ~/.bashrc && . ~/.bashrc``
19 | 
20 |     The ``>>`` is not a typo -- using one ``>`` will *overwrite* the ``~/.bashrc`` file. Make sure you change ``/usr/local/cuda`` to the appropriate location of your Nvidia install.
21 | 
22 |     **Also important**
23 | 
24 |     Make sure your ``$LD_LIBRARY_PATH`` and ``$DYLD_LIBRARY_PATH`` are also similarly modified to include the ``/lib`` directory of the CUDA install:
25 | 
26 |     ``echo "export LD_LIBRARY_PATH=/usr/local/cuda/lib:${LD_LIBRARY_PATH}" >> ~/.bashrc && . ~/.bashrc``
27 |     ``echo "export DYLD_LIBRARY_PATH=/usr/local/cuda/lib:${DYLD_LIBRARY_PATH}" >> ~/.bashrc && . ~/.bashrc``
28 | 
29 | 
30 | Using conda
31 | -----------
32 | 
33 | `Conda `_ is a great way to do this in a safe, isolated environment.
34 | 
35 | First create a new conda environment (named ``pycu`` here) that will use Python 2.7 (Python 2.7, 3.4, 3.5, and 3.6
36 | have been tested), with the numpy library installed.
37 | 
38 | .. code:: bash
39 | 
40 |     conda create -n pycu python=2.7 numpy
41 | 
42 | .. note::
43 | 
44 |     The numpy library *has* to be installed *before* PyCUDA is installed with pip.
45 |     The PyCUDA setup needs access to the numpy library in order to build against it. You can do this with
46 |     the above command, or alternatively just do ``pip install numpy && pip install cuvarbase``
47 | 
48 | Then activate the virtual environment
49 | 
50 | .. code:: bash
51 | 
52 |     source activate pycu
53 | 
54 | and then use ``pip`` to install ``cuvarbase``
55 | 
56 | .. code:: bash
57 | 
58 |     pip install cuvarbase
59 | 
60 | 
61 | Installing with just ``pip``
62 | ----------------------------
63 | 
64 | **If you don't want to use conda**, the following should work with just pip:
65 | 
66 | .. code:: bash
67 | 
68 |     pip install numpy
69 |     pip install cuvarbase
70 | 
71 | 
72 | Troubleshooting PyCUDA installation problems
73 | --------------------------------------------
74 | 
75 | The ``PyCUDA`` installation step may be a hiccup in this otherwise orderly process. If you run into problems installing ``PyCUDA`` with pip, you may have to install PyCUDA from source yourself. It's not too bad, but if you experience any problems, please submit an `Issue `_ at the ``cuvarbase`` Github page and I'll amend this documentation.
76 | 
77 | Below is a small bash script that (hopefully) automates the process of installing PyCUDA in the event of any problems you've encountered at this point.
78 | 
79 | .. code-block:: bash
80 | 
81 |     PYCUDA="pycuda-2017.1.1"
82 |     PYCUDA_URL="https://pypi.python.org/packages/b3/30/9e1c0a4c10e90b4c59ca7aa3c518e96f37aabcac73ffe6b5d9658f6ef843/pycuda-2017.1.1.tar.gz#md5=9e509f53a23e062b31049eb8220b2e3d"
83 |     CUDA_ROOT=/usr/local/cuda
84 | 
85 |     # Download
86 |     wget $PYCUDA_URL
87 | 
88 |     # Unpack
89 |     tar xvf ${PYCUDA}.tar.gz
90 |     cd $PYCUDA
91 | 
92 |     # Configure with the current python executable
93 |     ./configure.py --python-exe=`which python` --cuda-root=$CUDA_ROOT
94 |     python setup.py build
95 |     python setup.py install
96 | 
97 | If everything goes smoothly, you should now test whether ``pycuda`` is working correctly.
98 | 
99 | .. code:: bash
100 | 
101 |     python -c "import pycuda.autoinit; print('Hurray!')"
102 | 
103 | If everything has worked up to this point, you should be ready to install ``cuvarbase``:
104 | 
105 | .. code:: bash
106 | 
107 |     pip install cuvarbase
108 | 
109 | Installing from source
110 | ----------------------
111 | 
112 | You can also install directly from the repository. Clone the ``git`` repository on your machine:
113 | 
114 | .. code:: bash
115 | 
116 |     git clone https://github.com/johnh2o2/cuvarbase
117 | 
118 | Then install!
119 | 
120 | .. code:: bash
121 | 
122 |     cd cuvarbase
123 |     python setup.py install
124 | 
125 | The last command can also be done with pip:
126 | 
127 | .. code:: bash
128 | 
129 |     pip install -e .
130 | 
131 | 
132 | 
133 | Troubleshooting on a Mac
134 | ------------------------
135 | 
136 | Nvidia offers `CUDA for Mac OSX `_. After installing the
137 | package by downloading and running the ``.dmg`` file, you'll have to make a couple of edits to your
138 | ``~/.bash_profile``:
139 | 
140 | .. code:: sh
141 | 
142 |     export DYLD_LIBRARY_PATH="${DYLD_LIBRARY_PATH}:/usr/local/cuda/lib"
143 |     export PATH="/usr/local/cuda/bin:${PATH}"
144 | 
145 | and then source these changes in your current shell by running ``. ~/.bash_profile``.
146 | 
147 | Another important note: **nvcc (8.0.61) does not appear to support the latest clang compiler**. If this is
148 | the case, running ``python example.py`` should produce the following error:
149 | 
150 | .. code:: bash
151 | 
152 |     nvcc fatal : The version ('80100') of the host compiler ('Apple clang') is not supported
153 | 
154 | You can fix this problem by temporarily downgrading your clang compiler. To do this:
155 | 
156 | - `Download Xcode command line tools 7.3.1 `_
157 | - Install.
158 | - Run ``sudo xcode-select --switch /Library/Developer/CommandLineTools`` and verify that ``clang --version`` now reports ``7.3``.

--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | cuvarbase
2 | =========
3 | 
4 | .. image:: https://badge.fury.io/py/cuvarbase.svg
5 |     :target: https://badge.fury.io/py/cuvarbase
6 | 
7 | John Hoffman
8 | (c) 2017
9 | 
10 | ``cuvarbase`` is a Python library that uses `PyCUDA `_ to implement several time series tools used in astronomy on GPUs.
11 | 
12 | See the `documentation `_.
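To give a flavor of the API, here is a minimal Lomb-Scargle sketch. It follows the same ``run``/``finish`` pattern as the ``NFFTAsyncProcess`` docstring example later in this repository; treat the exact argument handling as illustrative and defer to the documentation:

.. code:: python

    import numpy as np
    from cuvarbase.lombscargle import LombScargleAsyncProcess

    # simulated data: a noisy 10-cycle sinusoid with uncertainties
    t = np.sort(np.random.rand(300))
    y = np.cos(2 * np.pi * 10 * t) + 0.1 * np.random.randn(len(t))
    dy = 0.1 * np.ones_like(y)

    proc = LombScargleAsyncProcess()
    results = proc.run([(t, y, dy)])
    proc.finish()

    freqs, powers = results[0]
    print(freqs[np.argmax(powers)])  # should be close to 10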
13 | 
14 | This project is under active development, and currently includes implementations of
15 | 
16 | - Generalized `Lomb Scargle `_ periodogram
17 | - Box-least squares (`BLS `_ )
18 | - Non-equispaced fast Fourier transform (adjoint operation) (`NFFT paper `_)
19 | - Conditional entropy period finder (`CE `_)
20 | - Phase dispersion minimization (`PDM2 `_)
21 |     - Currently operational, but with minimal unit testing and documentation so far
22 | 
23 | Hopefully future developments will include
24 | 
25 | - (Weighted) wavelet transforms
26 | - Spectrograms (for PDM and GLS)
27 | - Multiharmonic extensions for GLS
28 | 
29 | 
30 | Dependencies
31 | ------------
32 | 
33 | - `PyCUDA `_ **<-essential**
34 | - `scikit-cuda `_ **<-also essential**
35 |     - used for access to the CUDA FFT runtime library
36 | - `matplotlib `_ (for plotting utilities)
37 | - `nfft `_ (for unit testing)
38 | - `astropy `_ (for unit testing)
39 | 
40 | 
41 | Using multiple GPUs
42 | -------------------
43 | 
44 | If you have more than one GPU, you can choose which one to
45 | use in a given script by setting the ``CUDA_DEVICE`` environment
46 | variable:
47 | 
48 | .. code:: sh
49 | 
50 |     CUDA_DEVICE=1 python script.py
51 | 
52 | If anyone is interested in implementing a multi-device load-balancing
53 | solution, they are encouraged to do so! At some point this may
54 | become important, but for the time being manually splitting up the
55 | jobs across different GPUs will have to suffice.
56 | 

--------------------------------------------------------------------------------
/cuvarbase/__init__.py:
--------------------------------------------------------------------------------
1 | # import pycuda.autoinit causes problems when running e.g. FFT
2 | import pycuda.autoprimaryctx
3 | __version__ = "0.2.6"
4 | 

--------------------------------------------------------------------------------
/cuvarbase/core.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 | 
5 | from builtins import range
6 | from builtins import object
7 | import numpy as np
8 | from .utils import gaussian_window, tophat_window, get_autofreqs
9 | import pycuda.driver as cuda
10 | from pycuda.compiler import SourceModule
11 | 
12 | 
13 | class GPUAsyncProcess(object):
14 |     def __init__(self, *args, **kwargs):
15 |         self.reader = kwargs.get('reader', None)
16 |         self.nstreams = kwargs.get('nstreams', None)
17 |         self.function_kwargs = kwargs.get('function_kwargs', {})
18 |         self.device = kwargs.get('device', 0)
19 |         self.streams = []
20 |         self.gpu_data = []
21 |         self.results = []
22 |         self._adjust_nstreams = self.nstreams is None
23 |         if self.nstreams is not None:
24 |             self._create_streams(self.nstreams)
25 |         self.prepared_functions = {}
26 | 
27 |     def _create_streams(self, n):
28 |         for i in range(n):
29 |             self.streams.append(cuda.Stream())
30 | 
31 |     def _compile_and_prepare_functions(self):
32 |         raise NotImplementedError()
33 | 
34 |     def run(self, *args, **kwargs):
35 |         raise NotImplementedError()
36 | 
37 |     def finish(self):
38 |         """ synchronize all active streams """
39 |         for i, stream in enumerate(self.streams):
40 |             stream.synchronize()
41 | 
42 |     def batched_run(self, data, batch_size=10, **kwargs):
43 |         """ Run your data in batches (avoids memory problems) """
44 |         nsubmit = 0
45 |         results = []
46 |         while nsubmit < len(data):
47 |             batch = []
48 |             while len(batch) < batch_size and nsubmit < len(data):
49 |                 batch.append(data[nsubmit])
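                # each append consumes one item of `data`; nsubmit counts the
                # total consumed so both while loops eventually terminate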
50 | nsubmit += 1 51 | 52 | res = self.run(batch, **kwargs) 53 | self.finish() 54 | results.extend(res) 55 | 56 | return results 57 | -------------------------------------------------------------------------------- /cuvarbase/cunfft.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from __future__ import absolute_import 3 | from __future__ import division 4 | from __future__ import print_function 5 | 6 | from builtins import object 7 | 8 | import sys 9 | import resource 10 | import numpy as np 11 | 12 | import pycuda.driver as cuda 13 | import pycuda.gpuarray as gpuarray 14 | from pycuda.compiler import SourceModule 15 | # import pycuda.autoinit 16 | 17 | import skcuda.fft as cufft 18 | 19 | from .core import GPUAsyncProcess 20 | from .utils import find_kernel, _module_reader 21 | 22 | 23 | class NFFTMemory(object): 24 | def __init__(self, sigma, stream, m, use_double=False, 25 | precomp_psi=True, **kwargs): 26 | 27 | self.sigma = sigma 28 | self.stream = stream 29 | self.m = m 30 | self.use_double = use_double 31 | self.precomp_psi = precomp_psi 32 | 33 | # set datatypes 34 | self.real_type = np.float32 if not self.use_double \ 35 | else np.float64 36 | self.complex_type = np.complex64 if not self.use_double \ 37 | else np.complex128 38 | 39 | self.other_settings = {} 40 | self.other_settings.update(kwargs) 41 | 42 | self.t = kwargs.get('t', None) 43 | self.y = kwargs.get('y', None) 44 | self.f0 = kwargs.get('f0', 0.) 45 | self.n0 = kwargs.get('n0', None) 46 | self.nf = kwargs.get('nf', None) 47 | self.t_g = kwargs.get('t_g', None) 48 | self.y_g = kwargs.get('y_g', None) 49 | self.ghat_g = kwargs.get('ghat_g', None) 50 | self.ghat_c = kwargs.get('ghat_c', None) 51 | self.q1 = kwargs.get('q1', None) 52 | self.q2 = kwargs.get('q2', None) 53 | self.q3 = kwargs.get('q3', None) 54 | self.cu_plan = kwargs.get('cu_plan', None) 55 | 56 | D = (2 * self.sigma - 1) * np.pi 57 | self.b = float(2 * self.sigma * self.m) / D 58 | 59 | def allocate_data(self, **kwargs): 60 | self.n0 = kwargs.get('n0', self.n0) 61 | self.nf = kwargs.get('nf', self.nf) 62 | 63 | assert(self.n0 is not None) 64 | assert(self.nf is not None) 65 | 66 | self.t_g = gpuarray.zeros(self.n0, dtype=self.real_type) 67 | self.y_g = gpuarray.zeros(self.n0, dtype=self.real_type) 68 | 69 | return self 70 | 71 | def allocate_precomp_psi(self, **kwargs): 72 | self.n0 = kwargs.get('n0', self.n0) 73 | 74 | assert(self.n0 is not None) 75 | 76 | self.q1 = gpuarray.zeros(self.n0, dtype=self.real_type) 77 | self.q2 = gpuarray.zeros(self.n0, dtype=self.real_type) 78 | self.q3 = gpuarray.zeros(2 * self.m + 1, dtype=self.real_type) 79 | 80 | return self 81 | 82 | def allocate_grid(self, **kwargs): 83 | self.nf = kwargs.get('nf', self.nf) 84 | 85 | assert(self.nf is not None) 86 | 87 | self.n = int(self.sigma * self.nf) 88 | self.ghat_g = gpuarray.zeros(self.n, 89 | dtype=self.complex_type) 90 | self.cu_plan = cufft.Plan(self.n, self.complex_type, self.complex_type, 91 | stream=self.stream) 92 | return self 93 | 94 | def allocate_pinned_cpu(self, **kwargs): 95 | self.nf = kwargs.get('nf', self.nf) 96 | 97 | assert(self.nf is not None) 98 | self.ghat_c = cuda.aligned_zeros(shape=(self.nf,), 99 | dtype=self.complex_type, 100 | alignment=resource.getpagesize()) 101 | self.ghat_c = cuda.register_host_memory(self.ghat_c) 102 | 103 | return self 104 | 105 | def is_ready(self): 106 | assert(self.n0 == len(self.t_g)) 107 | assert(self.n0 == len(self.y_g)) 108 | assert(self.n == 
len(self.ghat_g)) 109 | 110 | if self.ghat_c is not None: 111 | assert(self.nf == len(self.ghat_c)) 112 | 113 | if self.precomp_psi: 114 | assert(self.n0 == len(self.q1)) 115 | assert(self.n0 == len(self.q2)) 116 | assert(2 * self.m + 1 == len(self.q3)) 117 | 118 | def allocate(self, **kwargs): 119 | self.n0 = kwargs.get('n0', self.n0) 120 | self.nf = kwargs.get('nf', self.nf) 121 | 122 | assert(self.n0 is not None) 123 | assert(self.nf is not None) 124 | self.n = int(self.sigma * self.nf) 125 | 126 | self.allocate_data(**kwargs) 127 | self.allocate_grid(**kwargs) 128 | self.allocate_pinned_cpu(**kwargs) 129 | if self.precomp_psi: 130 | self.allocate_precomp_psi(**kwargs) 131 | 132 | return self 133 | 134 | def transfer_data_to_gpu(self, **kwargs): 135 | t = kwargs.get('t', self.t) 136 | y = kwargs.get('y', self.y) 137 | 138 | assert(t is not None) 139 | assert(y is not None) 140 | 141 | self.t_g.set_async(t, stream=self.stream) 142 | self.y_g.set_async(y, stream=self.stream) 143 | 144 | def transfer_nfft_to_cpu(self, **kwargs): 145 | cuda.memcpy_dtoh_async(self.ghat_c, self.ghat_g.ptr, 146 | stream=self.stream) 147 | 148 | def fromdata(self, t, y, allocate=True, **kwargs): 149 | self.tmin = min(t) 150 | self.tmax = max(t) 151 | 152 | self.t = np.asarray(t).astype(self.real_type) 153 | self.y = np.asarray(y).astype(self.real_type) 154 | 155 | self.n0 = kwargs.get('n0', len(t)) 156 | self.nf = kwargs.get('nf', self.nf) 157 | 158 | if self.nf is not None and allocate: 159 | self.allocate(**kwargs) 160 | 161 | return self 162 | 163 | 164 | def nfft_adjoint_async(memory, functions, 165 | minimum_frequency=0., block_size=256, 166 | just_return_gridded_data=False, use_grid=None, 167 | fast_grid=True, transfer_to_device=True, 168 | transfer_to_host=True, precomp_psi=True, 169 | samples_per_peak=1, **kwargs): 170 | """ 171 | Asynchronous NFFT adjoint operation. 172 | 173 | Use the ``NFFTAsyncProcess`` class and related subroutines when possible. 174 | 175 | Parameters 176 | ---------- 177 | memory: ``NFFTMemory`` 178 | Allocated memory, must have data already set (see, e.g., 179 | ``NFFTAsyncProcess.allocate()``) 180 | functions: tuple, length 5 181 | Tuple of compiled functions from `SourceModule`. Must be prepared with 182 | their appropriate dtype. 183 | minimum_frequency: float, optional (default: 0) 184 | First frequency of transform 185 | block_size: int, optional 186 | Number of CUDA threads per block 187 | just_return_gridded_data: bool, optional 188 | If True, returns grid via `grid_g.get()` after gridding 189 | use_grid: ``GPUArray``, optional 190 | If specified, will skip gridding procedure and use the `GPUArray` 191 | provided 192 | fast_grid: bool, optional, default: True 193 | Whether or not to use the "fast" gridding procedure 194 | transfer_to_device: bool, optional, (default: True) 195 | If the data is already on the gpu, set as False 196 | transfer_to_host: bool, optional, (default: True) 197 | If False, will not transfer the resulting nfft to CPU memory 198 | precomp_psi: bool, optional, (default: True) 199 | Only relevant if ``fast`` is True. Will precompute values for the 200 | fast gridding procedure. 
201 | samples_per_peak: float, optional (default: 1) 202 | Frequency spacing is reduced by this factor, but number of frequencies 203 | is kept the same 204 | 205 | Returns 206 | ------- 207 | ghat_cpu: ``np.array`` 208 | The resulting NFFT 209 | """ 210 | 211 | precompute_psi, fast_gaussian_grid, slow_gaussian_grid, \ 212 | nfft_shift, normalize = functions 213 | 214 | stream = memory.stream 215 | 216 | block = (block_size, 1, 1) 217 | 218 | batch_size = 1 219 | 220 | def grid_size(nthreads): 221 | return int(np.ceil(float(nthreads) / block_size)) 222 | 223 | minimum_frequency = memory.real_type(minimum_frequency) 224 | 225 | # transfer data -> gpu 226 | if transfer_to_device: 227 | memory.transfer_data_to_gpu() 228 | 229 | # smooth data onto uniform grid 230 | if fast_grid: 231 | if memory.precomp_psi: 232 | grid = (grid_size(memory.n0 + 2 * memory.m + 1), 1) 233 | args = (grid, block, stream) 234 | args += (memory.t_g.ptr,) 235 | args += (memory.q1.ptr, memory.q2.ptr, memory.q3.ptr) 236 | args += (np.int32(memory.n0), np.int32(memory.n), 237 | np.int32(memory.m), memory.real_type(memory.b)) 238 | args += (memory.real_type(memory.tmin), 239 | memory.real_type(memory.tmax), 240 | memory.real_type(samples_per_peak)) 241 | precompute_psi.prepared_async_call(*args) 242 | 243 | grid = (grid_size(memory.n0), 1) 244 | args = (grid, block, stream) 245 | args += (memory.t_g.ptr, memory.y_g.ptr, memory.ghat_g.ptr) 246 | args += (memory.q1.ptr, memory.q2.ptr, memory.q3.ptr) 247 | args += (np.int32(memory.n0), np.int32(memory.n), 248 | np.int32(batch_size), np.int32(memory.m)) 249 | args += (memory.real_type(memory.tmin), 250 | memory.real_type(memory.tmax), 251 | memory.real_type(samples_per_peak)) 252 | fast_gaussian_grid.prepared_async_call(*args) 253 | 254 | else: 255 | grid = (grid_size(memory.n), 1) 256 | args = (grid, block, stream) 257 | args += (memory.t_g.ptr, memory.y_g.ptr, memory.ghat_g.ptr) 258 | args += (np.int32(memory.n0), np.int32(memory.n), 259 | np.int32(batch_size), np.int32(memory.m), 260 | memory.real_type(memory.b)) 261 | args += (memory.real_type(memory.tmin), 262 | memory.real_type(memory.tmax), 263 | memory.real_type(samples_per_peak)) 264 | slow_gaussian_grid.prepared_async_call(*args) 265 | 266 | # Stop if user wants the grid 267 | if just_return_gridded_data: 268 | stream.synchronize() 269 | return np.real(memory.ghat_g.get()) 270 | 271 | # Set the grid manually if the user wants to 272 | # (only for debugging) 273 | if use_grid is not None: 274 | memory.ghat_g.set(use_grid) 275 | 276 | # for a non-zero minimum frequency, do a shift 277 | if abs(minimum_frequency) > 1E-9: 278 | grid = (grid_size(memory.n), 1) 279 | args = (grid, block, stream) 280 | args += (memory.ghat_g.ptr, memory.ghat_g.ptr) 281 | args += (np.int32(memory.n), np.int32(batch_size)) 282 | args += (memory.real_type(memory.tmin), 283 | memory.real_type(memory.tmax), 284 | memory.real_type(samples_per_peak), 285 | memory.real_type(minimum_frequency)) 286 | nfft_shift.prepared_async_call(*args) 287 | 288 | # Run IFFT on grid 289 | cufft.ifft(memory.ghat_g, memory.ghat_g, memory.cu_plan) 290 | 291 | # Normalize result (deconvolve smoothing kernel) 292 | grid = (grid_size(memory.nf), 1) 293 | args = (grid, block, stream) 294 | args += (memory.ghat_g.ptr, memory.ghat_g.ptr) 295 | args += (np.int32(memory.n), 296 | np.int32(memory.nf), 297 | np.int32(batch_size), 298 | memory.real_type(memory.b)) 299 | args += (memory.real_type(memory.tmin), 300 | memory.real_type(memory.tmax), 301 | 
memory.real_type(samples_per_peak), 302 | memory.real_type(minimum_frequency)) 303 | normalize.prepared_async_call(*args) 304 | 305 | # Transfer result! 306 | if transfer_to_host: 307 | memory.transfer_nfft_to_cpu() 308 | 309 | return memory.ghat_c 310 | 311 | 312 | class NFFTAsyncProcess(GPUAsyncProcess): 313 | """ 314 | `GPUAsyncProcess` for the adjoint NFFT. 315 | 316 | Parameters 317 | ---------- 318 | sigma: float, optional (default: 2) 319 | Size of NFFT grid will be NFFT_SIZE * sigma 320 | m: int, optional (default: 8) 321 | Maximum radius for grid contributions (by default, 322 | this value will automatically be set based on a specified 323 | error tolerance) 324 | autoset_m: bool, optional (default: True) 325 | Automatically set the ``m`` parameter based on the 326 | error tolerance given by the ``m_tol`` parameter 327 | tol: float, optional (default: 1E-8) 328 | Error tolerance for the NFFT (used to auto set ``m``) 329 | block_size: int, optional (default: 256) 330 | CUDA block size. 331 | use_double: bool, optional (default: False) 332 | Use double precision. On non-Tesla cards this will 333 | make things ~24 times slower. 334 | use_fast_math: bool, optional (default: True) 335 | Compile kernel with the ``--use_fast_math`` option 336 | supplied to ``nvcc``. 337 | 338 | Example 339 | ------- 340 | 341 | >>> import numpy as np 342 | >>> t = np.random.rand(100) 343 | >>> y = np.cos(10 * t - 0.4) + 0.1 * np.random.randn(len(t)) 344 | >>> proc = NFFTAsyncProcess() 345 | >>> data = [(t, y, 2 * len(t))] 346 | >>> nfft_adjoint = proc.run(data) 347 | 348 | """ 349 | 350 | def __init__(self, *args, **kwargs): 351 | super(NFFTAsyncProcess, self).__init__(*args, **kwargs) 352 | 353 | self.sigma = kwargs.get('sigma', 4) 354 | self.m = kwargs.get('m', 8) 355 | self.autoset_m = kwargs.get('autoset_m', False) 356 | self.block_size = kwargs.get('block_size', 256) 357 | self.use_double = kwargs.get('use_double', False) 358 | self.m_tol = kwargs.get('tol', 1E-8) 359 | self.module_options = [] 360 | if kwargs.get('use_fast_math', True): 361 | self.module_options.append('--use_fast_math') 362 | 363 | self.real_type = np.float64 if self.use_double \ 364 | else np.float32 365 | self.complex_type = np.complex128 if self.use_double \ 366 | else np.complex64 367 | 368 | self._cpp_defs = dict(BLOCK_SIZE=self.block_size) 369 | if self.use_double: 370 | self._cpp_defs['DOUBLE_PRECISION'] = None 371 | 372 | self.function_names = ['precompute_psi', 373 | 'fast_gaussian_grid', 374 | 'slow_gaussian_grid', 'nfft_shift', 375 | 'normalize'] 376 | 377 | self.allocated_memory = [] 378 | 379 | def m_from_C(self, C, sigma): 380 | """ 381 | Returns an estimate for what ``m`` value to use from ``C``, 382 | where ``C`` is something like ``err_tolerance/N_freq``. 383 | 384 | Pulled from _ 385 | """ 386 | D = (np.pi * (1. - 1. / (2. * sigma - 1.))) 387 | return int(np.ceil(-np.log(0.25 * C) / D)) 388 | 389 | def estimate_m(self, N): 390 | """ 391 | Estimate ``m`` based on an error tolerance of ``self.tol``. 392 | 393 | Parameters 394 | ---------- 395 | N: int 396 | size of NFFT 397 | 398 | Returns 399 | ------- 400 | m: int 401 | Maximum grid radius 402 | 403 | Notes 404 | ----- 405 | Pulled from _. 406 | 407 | """ 408 | 409 | # TODO: this should be computed in terms of the L1-norm of the true 410 | # Fourier coefficients... see p. 
11 of 411 | # https://www-user.tu-chemnitz.de/~potts/nfft/guide/nfft3.pdf 412 | # Need to think about how to estimate the value of m more accurately 413 | return self.m_from_C(self.m_tol / N, self.sigma) 414 | 415 | def get_m(self, N=None): 416 | """ 417 | Returns the ``m`` value for ``N`` frequencies. 418 | 419 | Parameters 420 | ---------- 421 | N: int 422 | Number of frequencies, only needed if ``autoset_m`` is ``False``. 423 | 424 | Returns 425 | ------- 426 | m: int 427 | The filter radius (in grid points) 428 | """ 429 | if self.autoset_m: 430 | return self.estimate_m(N) 431 | else: 432 | return self.m 433 | 434 | def _compile_and_prepare_functions(self, **kwargs): 435 | module_txt = _module_reader(find_kernel('cunfft'), self._cpp_defs) 436 | 437 | self.module = SourceModule(module_txt, options=self.module_options) 438 | 439 | self.dtypes = dict( 440 | precompute_psi=[np.intp, np.intp, np.intp, np.intp, np.int32, 441 | np.int32, np.int32, self.real_type, 442 | self.real_type, self.real_type, self.real_type], 443 | 444 | fast_gaussian_grid=[np.intp, np.intp, np.intp, np.intp, 445 | np.intp, np.intp, np.int32, np.int32, 446 | np.int32, np.int32, self.real_type, 447 | self.real_type, self.real_type], 448 | 449 | slow_gaussian_grid=[np.intp, np.intp, np.intp, np.int32, 450 | np.int32, np.int32, np.int32, self.real_type, 451 | self.real_type, self.real_type, 452 | self.real_type], 453 | 454 | normalize=[np.intp, np.intp, np.int32, np.int32, np.int32, 455 | self.real_type, self.real_type, self.real_type, 456 | self.real_type, self.real_type], 457 | 458 | nfft_shift=[np.intp, np.intp, np.int32, np.int32, self.real_type, 459 | self.real_type, self.real_type, self.real_type] 460 | ) 461 | 462 | for function, dtype in self.dtypes.items(): 463 | func = self.module.get_function(function) 464 | self.prepared_functions[function] = func.prepare(dtype) 465 | 466 | self.function_tuple = tuple([self.prepared_functions[f] 467 | for f in self.function_names]) 468 | 469 | def allocate(self, data, **kwargs): 470 | """ 471 | Allocate GPU memory for NFFT-related computations 472 | 473 | Parameters 474 | ---------- 475 | data: list of (t, y, N) tuples 476 | List of data, ``[(t_1, y_1, N_1), ...]`` 477 | * ``t``: Observation times. 478 | * ``y``: Observations. 479 | * ``nf``: int, FFT size 480 | **kwargs 481 | 482 | Returns 483 | ------- 484 | allocated_memory: list of ``NFFTMemory`` objects 485 | List of allocated memory for each dataset 486 | 487 | """ 488 | 489 | # Purge any previously allocated memory 490 | allocated_memory = [] 491 | 492 | if len(data) > len(self.streams): 493 | self._create_streams(len(data) - len(self.streams)) 494 | 495 | for i, (t, y, nf) in enumerate(data): 496 | 497 | m = self.get_m(nf) 498 | 499 | mem = NFFTMemory(self.sigma, self.streams[i], m, 500 | use_double=self.use_double, **kwargs) 501 | 502 | allocated_memory.append(mem.fromdata(t, y, nf=nf, 503 | allocate=True, 504 | **kwargs)) 505 | 506 | return allocated_memory 507 | 508 | def run(self, data, memory=None, **kwargs): 509 | """ 510 | Run the adjoint NFFT on a batch of data 511 | 512 | Parameters 513 | ---------- 514 | data: list of tuples 515 | list of [(t, y, w), ...] 
containing 516 | * ``t``: observation times 517 | * ``y``: observations 518 | * ``nf``: int, size of NFFT 519 | memory: 520 | **kwargs 521 | 522 | Returns 523 | ------- 524 | powers: list of np.ndarrays 525 | List of adjoint NFFTs 526 | 527 | """ 528 | if not hasattr(self, 'prepared_functions') or \ 529 | not all([func in self.prepared_functions 530 | for func in self.function_names]): 531 | self._compile_and_prepare_functions(**kwargs) 532 | 533 | if memory is None: 534 | memory = self.allocate(data, **kwargs) 535 | 536 | nfft_kwargs = dict(block_size=self.block_size) 537 | nfft_kwargs.update(kwargs) 538 | 539 | results = [nfft_adjoint_async(mem, self.function_tuple, 540 | **nfft_kwargs) 541 | for mem in memory] 542 | 543 | return results 544 | -------------------------------------------------------------------------------- /cuvarbase/kernels/bls.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #define RESTRICT __restrict__ 3 | #define CONSTANT const 4 | #define MIN_W 1E-3 5 | //{CPP_DEFS} 6 | 7 | __device__ unsigned int get_id(){ 8 | return blockIdx.x * blockDim.x + threadIdx.x; 9 | } 10 | 11 | __device__ int mod(int a, int b){ 12 | int r = a % b; 13 | return (r < 0) ? r + b : r; 14 | } 15 | 16 | __device__ float mod1(float a){ 17 | return a - floorf(a); 18 | } 19 | 20 | __device__ float bls_value(float ybar, float w, unsigned int ignore_negative_delta_sols){ 21 | // if ignore negative delta sols is turned on, that means only solutions where 22 | // the mean amplitude within the transit is _lower_ than the mean amplitude of the source 23 | // are considered: it will ignore "inverted dips" 24 | float bls = (w > 1e-10 && w < 1.f - 1e-10) ? ybar * ybar / (w * (1.f - w)) : 0.f; 25 | return ((ignore_negative_delta_sols == 1) & (ybar > 0)) ? 0.f : bls; 26 | } 27 | 28 | __global__ void binned_bls_bst(float *yw, float *w, float *bls, unsigned int n, unsigned int ignore_negative_delta_sols){ 29 | unsigned int i = get_id(); 30 | 31 | if (i < n){ 32 | bls[i] = bls_value(yw[i], w[i], ignore_negative_delta_sols); 33 | } 34 | } 35 | 36 | 37 | __device__ unsigned int dnbins(unsigned int nbins, float dlogq){ 38 | 39 | if (dlogq < 0) 40 | return 1; 41 | 42 | unsigned int n = (unsigned int) floorf(dlogq * nbins); 43 | 44 | return (n == 0) ? 1 : n; 45 | } 46 | 47 | __device__ unsigned int nbins_iter(unsigned int i, unsigned int nb0, float dlogq){ 48 | 49 | 50 | if (i == 0) 51 | return nb0; 52 | 53 | unsigned int nb = nb0; 54 | for(int j = 0; j < i; j++) 55 | nb += dnbins(nb, dlogq); 56 | 57 | return nb; 58 | } 59 | 60 | __device__ unsigned int count_tot_nbins(unsigned int nbins0, unsigned int nbinsf, float dlogq){ 61 | unsigned int ntot = 0; 62 | 63 | for(int i = 0; nbins_iter(i, nbins0, dlogq) <= nbinsf; i++) 64 | ntot += nbins_iter(i, nbins0, dlogq); 65 | return ntot; 66 | } 67 | 68 | 69 | 70 | __global__ void store_best_sols_custom(unsigned int *argmaxes, float *best_phi, 71 | float *best_q, float *q_values, 72 | float *phi_values, unsigned int nq, unsigned int nphi, 73 | unsigned int nfreq, unsigned int freq_offset){ 74 | 75 | unsigned int i = get_id(); 76 | 77 | if (i < nfreq){ 78 | unsigned int imax = argmaxes[i + freq_offset]; 79 | 80 | best_phi[i + freq_offset] = phi_values[imax / nq]; 81 | best_q[i + freq_offset] = q_values[imax % nq]; 82 | } 83 | } 84 | 85 | 86 | __device__ int divrndup(int a, int b){ 87 | return (a % b > 0) ? 
a/b + 1 : a/b; 88 | } 89 | 90 | 91 | 92 | 93 | __global__ void store_best_sols(unsigned int *argmaxes, float *best_phi, 94 | float *best_q, 95 | unsigned int nbins0, unsigned int nbinsf, 96 | unsigned int noverlap, 97 | float dlogq, unsigned int nfreq, unsigned int freq_offset){ 98 | 99 | unsigned int i = get_id(); 100 | 101 | if (i < nfreq){ 102 | unsigned int imax = argmaxes[i + freq_offset]; 103 | float dphi = 1. / noverlap; 104 | 105 | unsigned int nb = nbins0; 106 | unsigned int bin_offset = 0; 107 | unsigned int i_iter = 0; 108 | while ((bin_offset + nb) * noverlap <= imax){ 109 | bin_offset += nb; 110 | nb = nbins_iter(++i_iter, nbins0, dlogq); 111 | } 112 | 113 | float q = 1. / nb; 114 | int s = (((int) imax) - ((int) (bin_offset * noverlap))) / nb; 115 | int jphi = (((int) imax) - ((int) (bin_offset * noverlap))) % nb; 116 | 117 | float phi = mod1((float) (((double) q) * (((double) jphi) + ((double) s) * ((double) dphi)))); 118 | 119 | best_phi[i + freq_offset] = phi; 120 | best_q[i + freq_offset] = q; 121 | } 122 | } 123 | 124 | // needs ndata * nfreq threads 125 | // noverlap -- number of overlapped bins (noverlap * (1 / q) total bins) 126 | // Note: this thread heavily utilizes global atomic operations, and could 127 | // likely be improved by 1-2 orders of magnitude for large Ndata (10^4) 128 | // if shared memory atomics were utilized. 129 | __global__ void bin_and_phase_fold_bst_multifreq( 130 | float *t, float *yw, float *w, 131 | float *yw_bin, float *w_bin, float *freqs, 132 | unsigned int ndata, unsigned int nfreq, unsigned int nbins0, unsigned int nbinsf, 133 | unsigned int freq_offset, unsigned int noverlap, float dlogq, 134 | unsigned int nbins_tot){ 135 | unsigned int i = get_id(); 136 | 137 | if (i < ndata * nfreq){ 138 | unsigned int i_data = i % ndata; 139 | unsigned int i_freq = i / ndata; 140 | 141 | unsigned int offset = i_freq * nbins_tot * noverlap; 142 | 143 | float W = w[i_data]; 144 | float YW = yw[i_data]; 145 | 146 | // get phase [0, 1) 147 | float phi = mod1(t[i_data] * freqs[i_freq + freq_offset]); 148 | 149 | float dphi = 1.f / noverlap; 150 | unsigned int nbtot = 0; 151 | unsigned int nb, b; 152 | 153 | // iterate through bins (logarithmically spaced) 154 | for(int j = 0; nbins_iter(j, nbins0, dlogq) <= nbinsf; j++){ 155 | nb = nbins_iter(j, nbins0, dlogq); 156 | 157 | // iterate through offsets [ 0, 1./sigma, ..., 158 | // (sigma - 1) / sigma ] 159 | for (int s = 0; s < noverlap; s++){ 160 | b = (unsigned int) mod((int) floorf(nb * phi - s * dphi), nb); 161 | b += offset + s * nb + noverlap * nbtot; 162 | 163 | atomicAdd(&(yw_bin[b]), YW); 164 | atomicAdd(&(w_bin[b]), W); 165 | } 166 | nbtot += nb; 167 | } 168 | } 169 | } 170 | 171 | 172 | __global__ void full_bls_no_sol( 173 | const float* __restrict__ t, 174 | const float* __restrict__ yw, 175 | const float* __restrict__ w, 176 | float* __restrict__ bls, 177 | const float* __restrict__ freqs, 178 | const unsigned int * __restrict__ nbins0, 179 | const unsigned int * __restrict__ nbinsf, 180 | unsigned int ndata, 181 | unsigned int nfreq, 182 | unsigned int freq_offset, 183 | unsigned int hist_size, 184 | unsigned int noverlap, 185 | float dlogq, 186 | float dphi, 187 | unsigned int ignore_negative_delta_sols){ 188 | unsigned int i = get_id(); 189 | 190 | extern __shared__ float sh[]; 191 | 192 | float *block_bins = sh; 193 | float *best_bls = (float *)&sh[2 * hist_size]; 194 | 195 | __shared__ float f0; 196 | __shared__ int nb0, nbf, max_bin_width; 197 | 198 | #ifdef USE_LOG_BIN_SPACING 199 | 
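    // total number of bins summed over all log-spaced bin widths for this
    // frequency; computed once per frequency via count_tot_nbins()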
__shared__ int tot_nbins; 200 | #endif 201 | 202 | unsigned int s; 203 | int b; 204 | float phi, bls1, bls2, thread_max_bls, thread_yw, thread_w; 205 | 206 | // this will be inefficient for block sizes >> number of bins per frequency 207 | unsigned int i_freq = blockIdx.x; 208 | while (i_freq < nfreq){ 209 | 210 | thread_max_bls = 0.f; 211 | 212 | if (threadIdx.x == 0){ 213 | // read frequency from global memory 214 | f0 = freqs[i_freq + freq_offset]; 215 | 216 | // read nbins from global memory 217 | nb0 = nbins0[i_freq + freq_offset]; 218 | nbf = nbinsf[i_freq + freq_offset]; 219 | 220 | max_bin_width = divrndup(nbf, nb0); 221 | 222 | #ifdef USE_LOG_BIN_SPACING 223 | tot_nbins = count_tot_nbins(nb0, nbf, dlogq); 224 | #endif 225 | } 226 | 227 | // wait for broadcasting to finish 228 | __syncthreads(); 229 | 230 | // intialize bins to 0 (synchronization is necessary here...) 231 | for(unsigned int k = threadIdx.x; k < nbf; k += blockDim.x){ 232 | block_bins[2 * k] = 0.f; 233 | block_bins[2 * k + 1] = 0.f; 234 | } 235 | 236 | // wait for initialization to finish 237 | __syncthreads(); 238 | 239 | // histogram the data 240 | for (unsigned int k = threadIdx.x; k < ndata; k += blockDim.x){ 241 | phi = mod1(t[k] * f0); 242 | 243 | b = mod((int) floorf(((float) nbf) * phi - dphi), (int) nbf); 244 | 245 | // shared memory atomics should (hopefully) be faster. 246 | atomicAdd(&(block_bins[2 * b]), yw[k]); 247 | atomicAdd(&(block_bins[2 * b + 1]), w[k]); 248 | } 249 | 250 | // wait for everyone to finish adding data to the histogram 251 | __syncthreads(); 252 | 253 | // get max bls for this THREAD 254 | #ifdef USE_LOG_BIN_SPACING 255 | for (unsigned int n = threadIdx.x; n < tot_nbins; n += blockDim.x){ 256 | 257 | unsigned int bin_offset = 0; 258 | unsigned int nb = nb0; 259 | while ((bin_offset + nb) * noverlap < n){ 260 | bin_offset += nb; 261 | nb += dnbins(nb, dlogq); 262 | } 263 | 264 | b = (((int) n) - ((int) (bin_offset * noverlap))) % nb; 265 | s = (((int) n) - ((int) (bin_offset * noverlap))) / nb; 266 | 267 | thread_yw = 0.f; 268 | thread_w = 0.f; 269 | unsigned int m0 = 0; 270 | 271 | for (unsigned int m = b; m < b + nb; m ++){ 272 | thread_yw += block_bins[2 * (m % nbf)]; 273 | thread_w += block_bins[2 * (m % nbf) + 1]; 274 | } 275 | 276 | bls1 = bls_value(thread_yw, thread_w, ignore_negative_delta_sols); 277 | if (bls1 > thread_max_bls) 278 | thread_max_bls = bls1; 279 | } 280 | 281 | #else 282 | for (unsigned int n = threadIdx.x; n < nbf; n += blockDim.x){ 283 | 284 | thread_yw = 0.f; 285 | thread_w = 0.f; 286 | unsigned int m0 = 0; 287 | 288 | for (unsigned int m = 1; m < max_bin_width; m += dnbins(m, dlogq)){ 289 | for (s = m0; s < m; s++){ 290 | thread_yw += block_bins[2 * ((n + s) % nbf)]; 291 | thread_w += block_bins[2 * ((n + s) % nbf) + 1]; 292 | } 293 | m0 = m; 294 | 295 | bls1 = bls_value(thread_yw, thread_w, ignore_negative_delta_sols); 296 | if (bls1 > thread_max_bls) 297 | thread_max_bls = bls1; 298 | } 299 | } 300 | #endif 301 | 302 | best_bls[threadIdx.x] = thread_max_bls; 303 | 304 | // wait for everyone to finish 305 | __syncthreads(); 306 | 307 | // get max bls for this BLOCK 308 | for(unsigned int k = (blockDim.x / 2); k > 0; k /= 2){ 309 | if(threadIdx.x < k){ 310 | bls1 = best_bls[threadIdx.x]; 311 | bls2 = best_bls[threadIdx.x + k]; 312 | 313 | best_bls[threadIdx.x] = (bls1 > bls2) ? 
bls1 : bls2; 314 | } 315 | __syncthreads(); 316 | } 317 | 318 | // store block max to global memory 319 | if (threadIdx.x == 0) 320 | bls[i_freq + freq_offset] = best_bls[0]; 321 | 322 | // increment frequency 323 | i_freq += gridDim.x; 324 | } 325 | } 326 | 327 | 328 | // needs ndata * nfreq threads 329 | // noverlap -- number of overlapped bins (noverlap * (1 / q) total bins) 330 | __global__ void bin_and_phase_fold_custom( 331 | float *t, float *yw, float *w, 332 | float *yw_bin, float *w_bin, float *freqs, 333 | float *q_values, float *phi_values, 334 | unsigned int nq, unsigned int nphi, unsigned int ndata, 335 | unsigned int nfreq, unsigned int freq_offset){ 336 | unsigned int i = get_id(); 337 | 338 | if (i < ndata * nfreq){ 339 | unsigned int i_data = i % ndata; 340 | unsigned int i_freq = i / ndata; 341 | 342 | unsigned int offset = i_freq * nq * nphi; 343 | 344 | float W = w[i_data]; 345 | float YW = yw[i_data]; 346 | 347 | // get phase [0, 1) 348 | float phi = mod1(t[i_data] * freqs[i_freq + freq_offset]); 349 | 350 | for(int pb = 0; pb < nphi; pb++){ 351 | float dphi = phi - phi_values[pb]; 352 | dphi -= floorf(dphi); 353 | 354 | for(int qb = 0; qb < nq; qb++){ 355 | if (dphi < q_values[qb]){ 356 | atomicAdd(&(yw_bin[pb * nq + qb + offset]), YW); 357 | atomicAdd(&(w_bin[pb * nq + qb + offset]), W); 358 | } 359 | } 360 | } 361 | } 362 | } 363 | 364 | 365 | 366 | 367 | __global__ void reduction_max(float *arr, unsigned int *arr_args, unsigned int nfreq, 368 | unsigned int nbins, unsigned int stride, 369 | float *block_max, unsigned int *block_arg_max, 370 | unsigned int offset, unsigned int init){ 371 | 372 | __shared__ float partial_max[BLOCK_SIZE]; 373 | __shared__ unsigned int partial_arg_max[BLOCK_SIZE]; 374 | 375 | unsigned int id = blockIdx.x * blockDim.x + threadIdx.x; 376 | 377 | unsigned int nblocks_per_freq = gridDim.x / nfreq; 378 | unsigned int nthreads_per_freq = blockDim.x * nblocks_per_freq; 379 | 380 | 381 | 382 | 383 | // freq_no / b 384 | // ----block 1 ----- ----- block N ------------------------ 385 | // 0 | 0 1 2 .. B - 1 | ... | (N - 1)B, ... , ndata, ..., N * B - 1| 386 | // 387 | // ---block N + 1--- ---- block 2N ------------------------ 388 | // 1 | 0 1 2 .. B - 1 | ... | (N - 1)B, ... , ndata, ..., N * B - 1| 389 | // ... 390 | // 391 | // ---(nf - 1)N ---- --- nf * N --- 392 | // nf - 1 | .. | ... | | 393 | 394 | unsigned int fno = id / nthreads_per_freq; 395 | unsigned int b = id % nthreads_per_freq; 396 | 397 | // read part of array from global memory into shared memory 398 | partial_max[threadIdx.x] = (fno < nfreq && b < nbins) ? 399 | arr[fno * stride + b] : -1.f; 400 | 401 | partial_arg_max[threadIdx.x] = (fno < nfreq && b < nbins) ? 402 | ( 403 | (init == 1) ? 404 | b : arr_args[fno * stride + b] 405 | ) : 0; 406 | 407 | __syncthreads(); 408 | 409 | float m1, m2; 410 | 411 | // reduce to find max of shared memory array 412 | for(int s = blockDim.x / 2; s > 0; s /= 2){ 413 | if(threadIdx.x < s){ 414 | m1 = partial_max[threadIdx.x]; 415 | m2 = partial_max[threadIdx.x + s]; 416 | 417 | partial_max[threadIdx.x] = (m1 > m2) ? m1 : m2; 418 | 419 | partial_arg_max[threadIdx.x] = (m1 > m2) ? 420 | partial_arg_max[threadIdx.x] : 421 | partial_arg_max[threadIdx.x + s]; 422 | } 423 | 424 | __syncthreads(); 425 | } 426 | 427 | // store partial max back into global memory 428 | if (threadIdx.x == 0 && fno < nfreq){ 429 | unsigned int i = (gridDim.x == nfreq) ? 
0 : 430 | fno * stride - fno * nblocks_per_freq; 431 | 432 | i += blockIdx.x + offset; 433 | 434 | block_max[i] = partial_max[0]; 435 | block_arg_max[i] = partial_arg_max[0]; 436 | } 437 | } 438 | -------------------------------------------------------------------------------- /cuvarbase/kernels/ce.cu: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | //{CPP_DEFS} 4 | 5 | #ifndef MAX_SHARED_MEM_SIZE 6 | #define MAX_SHARED_MEM_SIZE 48000 7 | #endif 8 | 9 | #ifdef DOUBLE_PRECISION 10 | #define ATOMIC_ADD atomicAddDouble 11 | #define FLT double 12 | #else 13 | #define ATOMIC_ADD atomicAdd 14 | #define FLT float 15 | #endif 16 | 17 | 18 | __device__ double atomicAddDouble(double* address, double val) 19 | { 20 | unsigned long long int* address_as_ull = 21 | (unsigned long long int*)address; 22 | unsigned long long int old = *address_as_ull, assumed; 23 | do { 24 | assumed = old; 25 | old = atomicCAS(address_as_ull, assumed, 26 | __double_as_longlong(val + 27 | __longlong_as_double(assumed))); 28 | } while (assumed != old); 29 | return __longlong_as_double(old); 30 | } 31 | 32 | 33 | __device__ FLT mod1(FLT x){ 34 | return x - floor(x); 35 | } 36 | 37 | __device__ int phase_ind(FLT ft){ 38 | int n = (int) (mod1(ft) * NPHASE); 39 | return n % NPHASE; 40 | } 41 | 42 | __device__ int posmod(int n, int N){ 43 | return (n < 0) ? n + N : n % N; 44 | } 45 | 46 | 47 | 48 | __global__ void histogram_data_weighted(FLT *t, FLT *y, FLT *dy, 49 | FLT *bin, FLT *freqs, 50 | unsigned int nfreq, unsigned int ndata, 51 | FLT max_phi){ 52 | 53 | unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; 54 | 55 | unsigned int i_freq = i / ndata; 56 | unsigned int j_data = i % ndata; 57 | 58 | if (i_freq < nfreq){ 59 | FLT Y = y[j_data]; 60 | FLT DY = dy[j_data]; 61 | 62 | int n0 = phase_ind(freqs[i_freq] * t[j_data]); 63 | unsigned int offset = i_freq * (NMAG * NPHASE); 64 | 65 | int m0 = (int) (Y * NMAG); 66 | 67 | for(int m = 0; m < NMAG; m++){ 68 | FLT z = (((FLT) m) / NMAG - Y); 69 | if (abs(z) > max_phi * DY && m != m0) 70 | continue; 71 | FLT zmax = z + (1 + MAG_OVERLAP) / ((FLT) NMAG); 72 | FLT wtot = normcdf(zmax / DY) - normcdf(z / DY); 73 | 74 | for(int n = n0; n >= n0 - PHASE_OVERLAP; n--) 75 | ATOMIC_ADD(&(bin[offset + posmod(n, NPHASE) * NMAG + m]), wtot); 76 | 77 | } 78 | } 79 | 80 | } 81 | 82 | __global__ void histogram_data_count(FLT *t, unsigned int *y, 83 | unsigned int *bin, 84 | FLT *freqs, unsigned int nfreq, 85 | unsigned int ndata){ 86 | 87 | unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; 88 | 89 | unsigned int i_freq = i / ndata; 90 | unsigned int j_data = i % ndata; 91 | if (i_freq < nfreq){ 92 | unsigned int offset = i_freq * (NMAG * NPHASE); 93 | unsigned int m0 = y[j_data]; 94 | int n0 = phase_ind(freqs[i_freq] * t[j_data]); 95 | 96 | for (int n = (int) n0; n >= (((int) n0) - PHASE_OVERLAP); n--){ 97 | for (int m = (int) m0; m >= 0 && m >= (((int) m0) - MAG_OVERLAP); m--) { 98 | atomicInc(&(bin[offset + posmod(n, NPHASE) * NMAG + m]), 99 | (PHASE_OVERLAP + 1) * (MAG_OVERLAP + 1) * ndata); 100 | } 101 | } 102 | } 103 | } 104 | 105 | __device__ unsigned int rnduppow2(unsigned int u){ 106 | unsigned int v = u; 107 | v--; 108 | v |= v >> 1; 109 | v |= v >> 2; 110 | v |= v >> 4; 111 | v |= v >> 8; 112 | v |= v >> 16; 113 | v++; 114 | 115 | return v; 116 | } 117 | 118 | 119 | 120 | 121 | __global__ void ce_classical_fast(const FLT * __restrict__ t, 122 | const unsigned int * __restrict__ y, 123 | const FLT * __restrict__ freqs, 124 | 
FLT * __restrict__ ce, 125 | unsigned int nfreq, 126 | unsigned int freq_offset, 127 | unsigned int ndata, 128 | unsigned int nphase, 129 | unsigned int nmag, 130 | unsigned int phase_overlap, 131 | unsigned int mag_overlap){ 132 | 133 | extern __shared__ unsigned int sh[]; 134 | 135 | // (unsigned int + FLT) * nmag * nphase + nphase * (unsigned int) 136 | //__shared__ float *t_sh = sh; 137 | //__shared__ unsigned int *y_sh = (unsigned int *)&t_sh[ndata]; 138 | //__shared__ unsigned int *bin = (unsigned int *)&y_sh[ndata]; 139 | 140 | unsigned int * block_bin = (unsigned int *)sh; 141 | unsigned int * block_bin_phi = (unsigned int *)&block_bin[nmag * nphase]; 142 | 143 | // align! 144 | unsigned int r = ((nmag * nphase + nphase) * sizeof(unsigned int)) % sizeof(FLT); 145 | FLT * Hc = (FLT *)&block_bin_phi[nphase + r]; 146 | __shared__ FLT f0; 147 | 148 | // each block works on a single frequency. 149 | unsigned int i_freq = blockIdx.x; 150 | 151 | unsigned int i, N, Nphi; 152 | unsigned int ntot_2 = rnduppow2(nmag * nphase); 153 | unsigned int nphase_2 = rnduppow2(nphase); 154 | int m, n, m0, n0; 155 | 156 | FLT dm0 = ((FLT) (mag_overlap + 1.f)) / nmag; 157 | FLT dm; 158 | while (i_freq < nfreq){ 159 | 160 | // read frequency from global data 161 | if (threadIdx.x == 0){ 162 | f0 = freqs[i_freq + freq_offset]; 163 | } 164 | 165 | // initialise blocks to zero 166 | for(i = threadIdx.x; i < nmag * nphase; i += blockDim.x){ 167 | if (i < nphase) 168 | block_bin_phi[i] = 0; 169 | 170 | block_bin[i] = 0; 171 | Hc[i] = 0.f; 172 | } 173 | 174 | __syncthreads(); 175 | 176 | // make 2d histogram 177 | for(i = threadIdx.x; i < ndata; i += blockDim.x){ 178 | m0 = (int) (y[i]); 179 | n0 = ((int) floor(nphase * mod1(t[i] * f0))) % nphase; 180 | 181 | for (n = n0; n >= (((int) n0) - ((int) phase_overlap)); n--){ 182 | for (m = m0; m >= 0 && m >= (((int) m0) - ((int) mag_overlap)); m--) 183 | atomicInc(&(block_bin[posmod(n, nphase) * nmag + m]), 184 | (phase_overlap + 1) * (mag_overlap + 1) * ndata); 185 | 186 | } 187 | } 188 | 189 | __syncthreads(); 190 | 191 | // Get the total number of data points across phi bins 192 | for(n=threadIdx.x; n < nmag * nphase; n+=blockDim.x) 193 | atomicAdd(&(block_bin_phi[n / nmag]), block_bin[n]); 194 | 195 | __syncthreads(); 196 | 197 | // Convert to dH 198 | for(n=threadIdx.x; n < nmag * nphase; n+=blockDim.x){ 199 | m0 = n % nmag; 200 | n0 = n / nmag; 201 | 202 | N = block_bin[n]; 203 | Nphi = block_bin_phi[n0]; 204 | 205 | if (Nphi * N == 0) 206 | continue; 207 | 208 | // adjust mag bin width for overlapping mag bins (phase bins are periodic) 209 | dm = (m0 + mag_overlap + 1 > nmag) ? 
(((int) nmag) - m0) * dm0 / (1.f + mag_overlap) : dm0; 210 | 211 | Hc[n] = ((FLT) N) * log((dm * ((FLT) Nphi)) / ((FLT) N)); 212 | } 213 | 214 | __syncthreads(); 215 | 216 | //add up contributions 217 | for(n = ntot_2 / 2; n > 0; n/=2){ 218 | for (m = threadIdx.x; m < n && m + n < nmag * nphase; m += blockDim.x) 219 | Hc[m] += Hc[m + n]; 220 | __syncthreads(); 221 | } 222 | 223 | // add up total bin counts 224 | for(n = nphase_2 / 2; n > 0; n/=2){ 225 | for (m = threadIdx.x; m < n && m + n < nphase; m += blockDim.x) 226 | block_bin_phi[m] += block_bin_phi[m + n]; 227 | __syncthreads(); 228 | } 229 | 230 | // write result to global memory 231 | if (threadIdx.x == 0) 232 | ce[i_freq + freq_offset] = Hc[0] / block_bin_phi[0]; 233 | 234 | i_freq += gridDim.x; 235 | } 236 | } 237 | 238 | 239 | 240 | 241 | __global__ void ce_classical_faster(const FLT * __restrict__ t, 242 | const unsigned int * __restrict__ y, 243 | const FLT * __restrict__ freqs, 244 | FLT * __restrict__ ce, 245 | unsigned int nfreq, 246 | unsigned int freq_offset, 247 | unsigned int ndata, 248 | unsigned int nphase, 249 | unsigned int nmag, 250 | unsigned int phase_overlap, 251 | unsigned int mag_overlap){ 252 | 253 | extern __shared__ unsigned int sh[]; 254 | 255 | // (unsigned int + FLT) * nmag * nphase + nphase * (unsigned int) 256 | unsigned int * block_bin = (unsigned int *)sh; 257 | unsigned int * block_bin_phi = (unsigned int *)&block_bin[nmag * nphase]; 258 | 259 | // align! 260 | unsigned int r = ((nmag * nphase + nphase) * sizeof(unsigned int)) % sizeof(FLT); 261 | FLT * Hc = (FLT *)&block_bin_phi[nphase + r]; 262 | FLT * t_sh = (FLT *)&Hc[nmag * nphase]; 263 | unsigned int * y_sh = (unsigned int *)&t_sh[ndata]; 264 | __shared__ FLT f0; 265 | 266 | unsigned int i, N, Nphi; 267 | // each block works on a single frequency. 
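    // blocks also stride through frequencies (i_freq += gridDim.x at the end
    // of the loop), so a grid smaller than nfreq still covers every frequency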
268 | unsigned int i_freq = blockIdx.x; 269 | unsigned int ntot_2 = rnduppow2(nmag * nphase); 270 | unsigned int nphase_2 = rnduppow2(nphase); 271 | int m, n, m0, n0; 272 | 273 | // load data into shared memory 274 | for (int i = threadIdx.x; i < ndata; i += blockDim.x){ 275 | t_sh[i] = t[i]; 276 | y_sh[i] = y[i]; 277 | } 278 | 279 | __syncthreads(); 280 | 281 | FLT dm0 = ((FLT) (mag_overlap + 1.f)) / nmag; 282 | FLT dm; 283 | while (i_freq < nfreq){ 284 | 285 | // read frequency from global data 286 | if (threadIdx.x == 0){ 287 | f0 = freqs[i_freq + freq_offset]; 288 | } 289 | 290 | 291 | // initialise blocks to zero 292 | for(i = threadIdx.x; i < nmag * nphase; i += blockDim.x){ 293 | if (i < nphase) 294 | block_bin_phi[i] = 0; 295 | 296 | block_bin[i] = 0; 297 | Hc[i] = 0.f; 298 | } 299 | 300 | __syncthreads(); 301 | 302 | // make 2d histogram 303 | for(i = threadIdx.x; i < ndata; i += blockDim.x){ 304 | m0 = (int) (y[i]); 305 | n0 = ((int) floor(nphase * mod1(t_sh[i] * f0))) % nphase; 306 | 307 | for (n = n0; n >= (((int) n0) - ((int) phase_overlap)); n--){ 308 | for (m = m0; m >= 0 && m >= (((int) m0) - ((int) mag_overlap)); m--) 309 | atomicInc(&(block_bin[posmod(n, nphase) * nmag + m]), 310 | (phase_overlap + 1) * (mag_overlap + 1) * ndata); 311 | } 312 | 313 | } 314 | 315 | __syncthreads(); 316 | 317 | // Get the total number of data points across phi bins 318 | for(n=threadIdx.x; n < nmag * nphase; n+=blockDim.x) 319 | atomicAdd(&(block_bin_phi[n / nmag]), block_bin[n]); 320 | 321 | __syncthreads(); 322 | 323 | // Convert to dH 324 | for(n=threadIdx.x; n < nmag * nphase; n+=blockDim.x){ 325 | m0 = n % nmag; 326 | n0 = n / nmag; 327 | 328 | Nphi = block_bin_phi[n0]; 329 | N = block_bin[n]; 330 | if (Nphi*N == 0) 331 | continue; 332 | 333 | // adjust mag bin width for overlapping mag bins (phase bins are periodic) 334 | dm = (m0 + mag_overlap + 1 > ((int) nmag)) ? 
(((int) nmag) - m0) * dm0 / (1.f + mag_overlap) : dm0; 335 | 336 | Hc[n] = ((FLT) N) * log((dm * ((FLT) Nphi)) / ((FLT) N)); 337 | } 338 | 339 | __syncthreads(); 340 | 341 | //add up contributions 342 | for(n = ntot_2 / 2; n > 0; n/=2){ 343 | for (m = threadIdx.x; (m < n) && ((m + n) < nmag * nphase); m += blockDim.x) 344 | Hc[m] += Hc[m + n]; 345 | __syncthreads(); 346 | } 347 | 348 | // add up total bin counts 349 | for(n = nphase_2 / 2; n > 0; n/=2){ 350 | for (m = threadIdx.x; (m < n) && ((m + n) < nphase); m += blockDim.x) 351 | block_bin_phi[m] += block_bin_phi[m + n]; 352 | __syncthreads(); 353 | } 354 | 355 | // write result to global memory 356 | if (threadIdx.x == 0) 357 | ce[i_freq + freq_offset] = Hc[0] / ((FLT) (block_bin_phi[0])); 358 | 359 | i_freq += gridDim.x; 360 | } 361 | } 362 | 363 | 364 | 365 | 366 | __global__ void weighted_ce(FLT *bins, unsigned int nfreq, FLT *ce){ 367 | unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; 368 | 369 | if (i < nfreq){ 370 | FLT Hc = 0.f; 371 | FLT bin_tot = 0.f; 372 | FLT dm = ((FLT)(MAG_OVERLAP + 1)) / NMAG; 373 | for(int n=0; n < NPHASE; n++){ 374 | unsigned int offset = i * (NMAG * NPHASE) + n * NMAG; 375 | 376 | FLT p_phi_n = 0.f; 377 | for (int m=0; m < NMAG; m++) 378 | p_phi_n += bins[offset + m]; 379 | 380 | for (int m=0; m < NMAG; m++){ 381 | FLT pmn = bins[offset + m]; 382 | bin_tot += pmn; 383 | 384 | if (pmn > 0.f && p_phi_n > 1E-10) 385 | Hc += pmn * log((dm * p_phi_n) / pmn); 386 | } 387 | } 388 | ce[i] = Hc / bin_tot; 389 | } 390 | } 391 | 392 | __global__ void standard_ce(unsigned int *bins, unsigned int nfreq, 393 | FLT *ce){ 394 | unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; 395 | FLT dm, dm0, Hc; 396 | unsigned int bin_tot, offset, Nphi, N; 397 | 398 | if (i < nfreq){ 399 | Hc = 0.f; 400 | dm0 = ((FLT)(MAG_OVERLAP + 1)) / NMAG; 401 | bin_tot = 0; 402 | for(int n=0; n < NPHASE; n++){ 403 | offset = i * (NMAG * NPHASE) + n * NMAG; 404 | 405 | Nphi = 0; 406 | for (int m=0; m < NMAG; m++) 407 | Nphi += bins[offset + m]; 408 | 409 | if (Nphi == 0) 410 | continue; 411 | 412 | for (int m=0; m < NMAG; m++){ 413 | N = bins[offset + m]; 414 | 415 | if (N == 0) 416 | continue; 417 | 418 | bin_tot += N; 419 | 420 | // adjust mag bin width for overlapping bins 421 | dm = (m + MAG_OVERLAP + 1 > NMAG) ? 
(((FLT) NMAG) - ((FLT) m)) * dm0 / (1.f + MAG_OVERLAP) : dm0;
422 |                 Hc += N * log((dm * Nphi) / N);
423 |             }
424 |         }
425 |
426 |         ce[i] = Hc / bin_tot;
427 |     }
428 | }
429 |
430 | __global__ void constdpdm_ce(unsigned int *bins, unsigned int nfreq,
431 |                              FLT *ce, FLT *mag_bwf){
432 |     unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
433 |
434 |     if (i < nfreq){
435 |         FLT Hc = 0.f;
436 |         unsigned int bin_tot = 0;
437 |         for(int n=0; n < NPHASE; n++){
438 |             unsigned int offset = i * (NMAG * NPHASE) + n * NMAG;
439 |
440 |             unsigned int Nphi = 0;
441 |             for (int m=0; m < NMAG; m++)
442 |                 Nphi += bins[offset + m];
443 |
444 |             if (Nphi == 0)
445 |                 continue;
446 |
447 |             for (int m=0; m < NMAG; m++){
448 |                 unsigned int N = bins[offset + m];
449 |
450 |                 if (N == 0)
451 |                     continue;
452 |
453 |                 bin_tot += N;
454 |                 Hc += N * log((mag_bwf[m] * Nphi) / N);
455 |             }
456 |         }
457 |
458 |         ce[i] = Hc / bin_tot;
459 |     }
460 | }
461 |
462 | __global__ void log_prob(unsigned int *bins, unsigned int nfreq,
463 |                          FLT *log_proba, FLT *mag_bin_fracs){
464 |     unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
465 |
466 |     if (i < nfreq){
467 |         FLT logP = 0.f;
468 |         for(int n=0; n < NPHASE; n++){
469 |             unsigned int offset = i * (NMAG * NPHASE) + n * NMAG;
470 |
471 |             unsigned int Nphi = 0;
472 |             for (int m=0; m < NMAG; m++)
473 |                 Nphi += bins[offset + m];
474 |
475 |             if (Nphi == 0)
476 |                 continue;
477 |
478 |             for (int m=0; m < NMAG; m++){
479 |                 FLT N = (FLT) (bins[offset + m]);
480 |
481 |                 FLT Nexp = Nphi * mag_bin_fracs[m];
482 |
483 |                 if (Nexp < 1e-9)
484 |                     continue;
485 |
486 |                 logP += N * log(Nexp) - Nexp - lgamma(N + 1.f);
487 |             }
488 |         }
489 |
490 |         log_proba[i] = logP / (PHASE_OVERLAP + 1.f);
491 |     }
492 | }
493 |
494 |
--------------------------------------------------------------------------------
/cuvarbase/kernels/cunfft.cu:
--------------------------------------------------------------------------------
1 | #include <pycuda-complex.hpp>
2 | #include <stdio.h>
3 |
4 | #define RESTRICT __restrict__
5 | #define CONSTANT const
6 | #define PI 3.14159265358979323846264338327950288f
7 | #define FILTER gauss_filter
8 | //{CPP_DEFS}
9 |
10 | #ifdef DOUBLE_PRECISION
11 | #define ATOMIC_ADD atomicAddDouble
12 | #define FLT double
13 |
14 | #else
15 | #define ATOMIC_ADD atomicAdd
16 | #define FLT float
17 | #endif
18 |
19 | #define CMPLX pycuda::complex<FLT>
20 |
21 | __device__ double atomicAddDouble(double* address, double val)
22 | {
23 |     unsigned long long int* address_as_ull =
24 |                             (unsigned long long int*)address;
25 |     unsigned long long int old = *address_as_ull, assumed;
26 |     do {
27 |         assumed = old;
28 |         old = atomicCAS(address_as_ull, assumed,
29 |                         __double_as_longlong(val +
30 |                         __longlong_as_double(assumed)));
31 |     } while (assumed != old);
32 |     return __longlong_as_double(old);
33 | }
34 |
35 |
36 | __device__ FLT gauss_filter(CONSTANT FLT x, CONSTANT FLT b) {
37 |     return exp(-(x*x) / b) / sqrt(PI * b);
38 | }
39 |
40 | __device__ int mod(CONSTANT int a, CONSTANT int b) {
41 |     int ret = a % b;
42 |     return (ret < 0) ?
ret + b : ret; 43 | } 44 | 45 | __device__ float modflt(CONSTANT FLT a, CONSTANT FLT b){ 46 | return a - floor(a / b) * b; 47 | } 48 | 49 | __device__ FLT diffmod(CONSTANT FLT a, CONSTANT FLT b, CONSTANT FLT M) { 50 | FLT ret = a - b; 51 | if (fabsf(ret) > M/2){ 52 | if (ret > 0) 53 | return ret - M; 54 | return M + ret; 55 | } 56 | return ret; 57 | } 58 | 59 | __global__ void nfft_shift( 60 | CMPLX *in, 61 | CMPLX *out, 62 | CONSTANT int ng, 63 | CONSTANT int nbatch, 64 | CONSTANT FLT x0, 65 | CONSTANT FLT xf, 66 | CONSTANT FLT spp, 67 | CONSTANT FLT f0){ 68 | 69 | int i = blockIdx.x *blockDim.x + threadIdx.x; 70 | 71 | int batch = i / ng; 72 | 73 | if (batch < nbatch) { 74 | FLT k0 = f0 * spp * (xf - x0); 75 | 76 | FLT phi = (2.f * PI * (i % ng) * k0) / ng; 77 | 78 | CMPLX shift = CMPLX(cos(phi), sin(phi)); 79 | 80 | out[i] = shift * in[i]; 81 | } 82 | } 83 | 84 | __global__ void precompute_psi( 85 | FLT *RESTRICT x, // observation times 86 | FLT * q1, // precomputed filter values (length n0) 87 | FLT * q2, // precomputed filter values (length n0) 88 | FLT * q3, // precomputed filter values (length 2 * m + 1) 89 | CONSTANT int n0, // data size 90 | CONSTANT int ng, // grid size 91 | CONSTANT int m, // max filter radius 92 | CONSTANT FLT b, // filter scaling 93 | CONSTANT FLT x0, // min(x) 94 | CONSTANT FLT xf, // max(x) 95 | CONSTANT FLT spp) // samples per peak 96 | { 97 | int i = blockIdx.x *blockDim.x + threadIdx.x; 98 | 99 | FLT binv = 1.f/b; 100 | if (i < n0){ 101 | FLT xg = (x[i] - x0) / (spp * (xf - x0)); 102 | 103 | xg = m + modflt(ng * xg, 1.f); 104 | 105 | q1[i] = exp(-xg * (xg * binv)) / sqrt(b * PI); 106 | q2[i] = exp( 2.f * xg * binv); 107 | 108 | } else if (i - n0 < 2 * m + 1) { 109 | int l = i - n0; 110 | q3[l] = exp(-l * l * binv); 111 | } 112 | 113 | } 114 | 115 | __global__ void fast_gaussian_grid( 116 | FLT *RESTRICT x, // data (observation times), length n0 117 | FLT *RESTRICT y, // data (observations), length (nbatch * n0) 118 | CMPLX * grid, // grid, length n * nbatch 119 | FLT *RESTRICT q1, // precomputed filter values 120 | FLT *RESTRICT q2, // precomputed filter values 121 | FLT *RESTRICT q3, // precomputed filter values 122 | CONSTANT int n0, // data size 123 | CONSTANT int ng, // grid size 124 | CONSTANT int nbatch, // number of grids/datasets 125 | CONSTANT int m, // max filter radius 126 | CONSTANT FLT x0, // min(x) 127 | CONSTANT FLT xf, // max(x) 128 | CONSTANT FLT spp) // samples per peak 129 | { 130 | int i = blockIdx.x * blockDim.x + threadIdx.x; 131 | 132 | int batch = i / n0; 133 | 134 | if (batch < nbatch){ 135 | 136 | // datapoint 137 | int di = i % n0; 138 | 139 | // scale 140 | FLT xval = (x[di] - x0) / (spp * (xf - x0)); 141 | 142 | // observation 143 | FLT yi = y[i]; 144 | 145 | // nearest gridpoint (rounding down) 146 | int u = (int) floorf(ng * xval - m); 147 | 148 | // precomputed filter values 149 | FLT Q = q1[di]; 150 | FLT Q2 = q2[di]; 151 | 152 | // add datapoint to grid 153 | for(int k = 0; k < 2 * m + 1; k++){ 154 | FLT dg = Q * q3[k] * yi; 155 | if (!(isnan(dg) || isinf(dg))) 156 | ATOMIC_ADD(&(grid[mod(k + u, ng) + batch * ng]._M_re), 157 | dg); 158 | else 159 | break; 160 | Q *= Q2; 161 | } 162 | } 163 | } 164 | 165 | 166 | 167 | __global__ void slow_gaussian_grid( 168 | FLT *RESTRICT x, // data (observation times) 169 | FLT *RESTRICT y, // data (observations) 170 | CMPLX * grid, // grid 171 | CONSTANT int n0, // data size 172 | CONSTANT int ng, // grid size 173 | CONSTANT int nbatch, // number of grids 174 | CONSTANT int m, // max 
filter radius
175 |     CONSTANT FLT b,       // filter scaling
176 |     CONSTANT FLT x0,      // min(x)
177 |     CONSTANT FLT xf,      // max(x)
178 |     CONSTANT FLT spp)     // samples per peak
179 | {
180 |     int i = blockIdx.x * blockDim.x + threadIdx.x;
181 |
182 |     int batch = i / ng;
183 |
184 |     if (batch < nbatch){
185 |         FLT dx, dgi;
186 |
187 |
188 |
189 |         // grid index for this thread
190 |         int grid_index = i - ng * batch;
191 |
192 |         // iterate through data
193 |         for(int di = 0; di < n0; di ++){
194 |
195 |             // scale
196 |             FLT xval = (x[di] - x0) / (spp * (xf - x0));
197 |
198 |             // grid index of datapoint (float)
199 |             dgi = ng * xval;
200 |
201 |             // "distance" between grid_index and datapoint
202 |             dx = diffmod(dgi, grid_index, ng);
203 |
204 |             // skip if datapoint too far away
205 |             if (dx > m)
206 |                 continue;
207 |
208 |             // add (weighted) datapoint to grid
209 |             grid[i] += FILTER(dx, b) * y[di + n0 * batch];
210 |         }
211 |     }
212 | }
213 |
214 | __global__ void normalize(
215 |     CMPLX *gin,
216 |     CMPLX *gout,
217 |     CONSTANT int ng,      // sigma * nf
218 |     CONSTANT int nf,      // number of desired frequency samples
219 |     CONSTANT int nbatch,  // number of transforms
220 |     CONSTANT FLT b,       // filter scaling
221 |     CONSTANT FLT x0,      // min(x)
222 |     CONSTANT FLT xf,      // max(x)
223 |     CONSTANT FLT spp,     // samples per peak
224 |     CONSTANT FLT f0)      // first frequency
225 | {
226 |     int i = blockIdx.x *blockDim.x + threadIdx.x;
227 |
228 |     int batch = i / nf;
229 |
230 |     if (batch < nbatch){
231 |         int k = i % nf;
232 |
233 |         FLT sT = spp * (xf - x0);
234 |         FLT n0 = (x0 / sT) * ng;
235 |         FLT k0 = f0 * sT;
236 |         CMPLX G = gin[batch * ng + k];
237 |
238 |         // *= exp(2pi i (k0 + k) * n0 / n)
239 |         FLT theta_k = (2.f * PI * n0 * (k0 + k)) / ng;
240 |
241 |         G *= CMPLX(cos(theta_k), sin(theta_k));
242 |
243 |         // normalization factor from gridding kernel (gaussian)
244 |         FLT khat = PI * (k0 + k) / ng;
245 |         gout[i] = G * exp(b * khat * khat);
246 |     }
247 |
248 | }
249 |
250 |
--------------------------------------------------------------------------------
/cuvarbase/kernels/lomb.cu:
--------------------------------------------------------------------------------
1 | #include <pycuda-complex.hpp>
2 | #include <stdio.h>
3 | //{CPP_DEFS}
4 |
5 | #define EPSILON 1E-8
6 | #define PI 3.141592653589793238462643383279502884f
7 | #ifdef DOUBLE_PRECISION
8 | #define FLT double
9 | #else
10 | #define FLT float
11 | #endif
12 |
13 | #define STANDARD 0
14 | #define FLOATING_MEAN 1
15 | #define WINDOW 2
16 |
17 |
18 |
19 | __device__ FLT cossum(FLT *t, FLT *y, int n, FLT freq){
20 |     FLT C = 0;
21 |     for(int i = 0; i < n; i++)
22 |         C += y[i] * cos((t[i] + 0.5f) * freq * 2.f * PI);
23 |
24 |     return C;
25 | }
26 |
27 |
28 | __device__ FLT sinsum(FLT *t, FLT *y, int n, FLT freq){
29 |     FLT S = 0;
30 |     for(int i = 0; i < n; i++)
31 |         S += y[i] * sin((t[i] + 0.5f) * freq * 2.f * PI);
32 |
33 |     return S;
34 | }
35 |
36 | __device__ FLT lspow_flmean(FLT C, FLT S,
37 |                             FLT C2, FLT S2,
38 |                             FLT YCh, FLT YSh,
39 |                             FLT YY, FLT Y,
40 |                             FLT *reg){
41 |     FLT r0 = 0.f, r1 = 0.f, r2 = 0.f;
42 |     if (reg != NULL){
43 |         r0 = reg[0];
44 |         r1 = reg[1];
45 |         r2 = reg[2];
46 |     }
47 |     FLT tan_2omega_tau = (S2 - 2 * S * C) / (C2 - (C * C - S * S));
48 |
49 |     FLT C2wInv2 = 1.f + tan_2omega_tau * tan_2omega_tau;
50 |
51 |     FLT C2w = 1.f / sqrt(C2wInv2);
52 |     FLT S2w = tan_2omega_tau * C2w;
53 |
54 |     FLT Cw = sqrt(0.5f * (1.f + C2w));
55 |     FLT Sw = sqrt(0.5f * (1.f - C2w));
56 |
57 |     if (S2w < 0.f)
58 |         Sw *= -1.f;
59 |
60 |     FLT Cshft = C * Cw + S * Sw;
61 |     FLT Sshft = S * Cw - C * Sw;
62 |
63 |     FLT CC = 0.5f * (1.f + C2 * C2w + S2 * S2w);
64 |     FLT SS = 0.5f * (1.f - C2 * C2w - S2 * S2w);
65 |
66 |     CC -= Cshft * Cshft;
67 |     SS -= Sshft * Sshft;
68 |
69 |     FLT xreg = r2 / (1.f + r2);
70 |
71 |     CC += Cshft * Cshft * xreg + r0;
72 |     SS += Sshft * Sshft * xreg + r1;
73 |
74 |     FLT YC = (YCh + Y * C * xreg) * Cw + (YSh + Y * S * xreg) * Sw;
75 |     FLT YS = (YSh + Y * S * xreg) * Cw - (YCh + Y * C * xreg) * Sw;
76 |
77 |     FLT P = ((YC * YC) / CC + (YS * YS) / SS) / YY;
78 |
79 |     if (isnan(P) || isinf(P) || P < 0.f)
80 |         P = -1.;
81 |
82 |     return P;
83 | }
84 |
85 | __device__ FLT lspow0(FLT C, FLT S,
86 |                       FLT C2, FLT S2,
87 |                       FLT YCh, FLT YSh,
88 |                       FLT YY, FLT Y,
89 |                       FLT *reg){
90 |
91 |     FLT tan_2omega_tau = S2 / C2;
92 |     FLT r0 = 0.f, r1 = 0.f;
93 |     if (reg != NULL){
94 |         r0 = reg[0];
95 |         r1 = reg[1];
96 |     }
97 |
98 |     FLT C2wInv2 = 1.f + tan_2omega_tau * tan_2omega_tau;
99 |
100 |     FLT C2w = 1.f / sqrt(C2wInv2);
101 |     FLT S2w = tan_2omega_tau * C2w;
102 |
103 |     FLT Cw = sqrt(0.5f * (1.f + C2w));
104 |     FLT Sw = sqrt(0.5f * (1.f - C2w));
105 |
106 |     if (S2w < 0)
107 |         Sw *= -1.f;
108 |
109 |     FLT YC = (YCh + Y * C) * Cw + (YSh + Y * S) * Sw;
110 |     FLT YS = (YSh + Y * S) * Cw - (YCh + Y * C) * Sw;
111 |
112 |     FLT CC = 0.5f * (1.f + C2 * C2w + S2 * S2w) + r0;
113 |     FLT SS = 0.5f * (1.f - C2 * C2w - S2 * S2w) + r1;
114 |
115 |     FLT P = ((YC * YC) / CC + (YS * YS) / SS) / (YY + Y * Y);
116 |
117 |     if (isnan(P) || isinf(P) || P < 0.f){
118 |         //printf("%e, %e, %e, %e, %e: %e\n", C, S, CC, SS, YY + Y*Y, P);
119 |         P = -1.f;
120 |     }
121 |
122 |     return P;
123 | }
124 |
125 |
126 | __device__ FLT lspow(FLT C, FLT S,
127 |                      FLT C2, FLT S2,
128 |                      FLT YCh, FLT YSh,
129 |                      FLT YY, FLT Y,
130 |                      FLT *reg, int mode){
131 |     switch(mode){
132 |         case STANDARD:
133 |             return lspow0(C, S, C2, S2, YCh, YSh, YY, Y, reg);
134 |         case FLOATING_MEAN:
135 |             return lspow_flmean(C, S, C2, S2, YCh, YSh, YY, Y, reg);
136 |         case WINDOW:
137 |             return lspow0(C, S, C2, S2, C, S, 0.f, 1.f, NULL);
138 |         default:
139 |             return -1.f;
140 |     }
141 | }
142 |
143 |
144 | __global__ void lomb_dirsum(FLT *t, FLT *yw, FLT *w,
145 |                             FLT *lsp, FLT *reg,
146 |                             int nfreq, int n, FLT YY, FLT Y, FLT df,
147 |                             FLT fmin, int mode){
148 |     int i = blockIdx.x * blockDim.x + threadIdx.x;
149 |     // reg = (lambda_a, lambda_b, lambda_c)
150 |     if (i < nfreq){
151 |
152 |         FLT frq = fmin + i * df;
153 |
154 |         FLT C = cossum(t, w, n, frq);
155 |         FLT S = sinsum(t, w, n, frq);
156 |
157 |         FLT C2 = cossum(t, w, n, 2.f * frq);
158 |         FLT S2 = sinsum(t, w, n, 2.f * frq);
159 |
160 |         FLT YCh = cossum(t, yw, n, frq);
161 |         FLT YSh = sinsum(t, yw, n, frq);
162 |
163 |         lsp[i] = lspow(C, S, C2, S2, YCh, YSh, YY, Y, reg, mode);
164 |     }
165 | }
166 |
167 | __global__ void lomb_dirsum_custom_frq(FLT *t, FLT *w, FLT *yw, FLT *freqs,
168 |                                        FLT *lsp, FLT *reg,
169 |                                        int nfreq, int n, FLT YY, FLT Y, int mode){
170 |     int i = blockIdx.x * blockDim.x + threadIdx.x;
171 |     // reg = (lambda_a, lambda_b, lambda_c)
172 |     if (i < nfreq){
173 |
174 |         FLT frq = freqs[i];
175 |
176 |         FLT C = cossum(t, w, n, frq);
177 |         FLT S = sinsum(t, w, n, frq);
178 |
179 |         FLT C2 = cossum(t, w, n, 2.f * frq);
180 |         FLT S2 = sinsum(t, w, n, 2.f * frq);
181 |
182 |         FLT YCh = cossum(t, yw, n, frq);
183 |         FLT YSh = sinsum(t, yw, n, frq);
184 |
185 |         lsp[i] = lspow(C, S, C2, S2, YCh, YSh, YY, Y, reg, mode);
186 |     }
187 | }
188 |
189 | __global__ void lomb(pycuda::complex<FLT> *sw,
190 |                      pycuda::complex<FLT> *syw,
191 |                      FLT *lsp,
192 |                      FLT *reg,
193 |                      int nfreq,
194 |                      FLT YY,
195 |                      FLT Y,
196 |                      int k0,
197 |                      int mode){
198 |
199 |     // least squares (lomb scargle with floating mean)
200 |
201 |     int i = blockIdx.x * blockDim.x + threadIdx.x;
202 |     // reg = (lambda_a, lambda_b, lambda_c)
203 |     if (i < nfreq){
204 |         pycuda::complex<FLT> SW, SW2, SYW;
205 |         SW = sw[i];
206 |         SW2 = sw[2 * i + k0];
207 |         SYW = syw[i];
208 |
209 |         FLT C = SW.real();
210 |         FLT S = SW.imag();
211 |
212 |         FLT C2 = SW2.real();
213 |         FLT S2 = SW2.imag();
214 |
215 |         FLT YCh = SYW.real();
216 |         FLT YSh = SYW.imag();
217 |
218 |         lsp[i] = lspow(C, S, C2, S2, YCh, YSh, YY, Y, reg, mode);
219 |     }
220 | }
221 |
222 |
223 | __global__ void lomb_mh(pycuda::complex<FLT> *sw,
224 |                         pycuda::complex<FLT> *syw,
225 |                         FLT *lsp,
226 |                         FLT *reg,
227 |                         int nfreq,
228 |                         int nharmonics,
229 |                         FLT YY,
230 |                         FLT Y,
231 |                         int k0,
232 |                         int mode){
233 |
234 |     // least squares (lomb scargle with floating mean)
235 |
236 |     unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
237 |     // reg = (lambda_a, lambda_b, lambda_c)
238 |     if (i < nfreq){
239 |         pycuda::complex<FLT> SW, SW2, SYW;
240 |         SW = sw[i];
241 |         SW2 = sw[2 * i + k0];
242 |         SYW = syw[i];
243 |
244 |         FLT C = SW.real();
245 |         FLT S = SW.imag();
246 |
247 |         FLT C2 = SW2.real();
248 |         FLT S2 = SW2.imag();
249 |
250 |         FLT YCh = SYW.real();
251 |         FLT YSh = SYW.imag();
252 |
253 |         lsp[i] = lspow(C, S, C2, S2, YCh, YSh, YY, Y, reg, mode);
254 |     }
255 | }
--------------------------------------------------------------------------------
/cuvarbase/kernels/pdm.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #define WEIGHT(k) (w==NULL ? 1.0f : w[k])
3 | #define GAUSSIAN(x) expf(-0.5f *x*x)
4 | #define WEIGHTED_LININTERP true
5 | #define SKIP_BIN(i) (bin_wtots[i] * NBINS < 0.01f)
6 | //INSERT_NBINS_HERE
7 | #define PHASE(x,f) (x * f - floorf(x * f))
8 |
9 | #define RESTRICT __restrict__
10 | #define CONSTANT const
11 |
12 |
13 | __device__ float phase_diff(
14 |         CONSTANT float dt,
15 |         CONSTANT float freq){
16 |     float dphi = dt * freq - floorf(dt * freq);
17 |     return ((dphi > 0.5f) ?
1.0f - dphi : dphi); 18 | } 19 | 20 | __device__ float var_step_function( 21 | float *RESTRICT t, 22 | float *RESTRICT y, 23 | float *RESTRICT w, 24 | CONSTANT float freq, 25 | CONSTANT int ndata){ 26 | float bin_means[NBINS]; 27 | float bin_wtots[NBINS]; 28 | int bin; 29 | float var_tot = 0.f; 30 | for (int i = 0; i < NBINS; i++){ 31 | bin_wtots[i] = 0.f; 32 | bin_means[i] = 0.f; 33 | } 34 | for(int i = 0; i < ndata; i++){ 35 | bin = (int) (PHASE(t[i], freq) * NBINS); 36 | bin = bin % NBINS; 37 | bin_wtots[bin] += w[i]; 38 | bin_means[bin] += y[i] * w[i]; 39 | } 40 | 41 | for(int i = 0; i < NBINS; i++){ 42 | if (bin_wtots[i] == 0.f) 43 | continue; 44 | bin_means[i] /= bin_wtots[i]; 45 | } 46 | 47 | for(int i = 0; i < ndata; i++){ 48 | bin = (int) (PHASE(t[i], freq) * NBINS); 49 | var_tot += w[i] * (y[i] - bin_means[bin]) * (y[i] - bin_means[bin]); 50 | } 51 | 52 | return var_tot; 53 | } 54 | 55 | __device__ float var_linear_interp( 56 | float *RESTRICT t, 57 | float *RESTRICT y, 58 | float *RESTRICT w, 59 | CONSTANT float freq, 60 | CONSTANT int ndata){ 61 | 62 | float bin_means[NBINS]; 63 | float bin_wtots[NBINS]; 64 | int bin, bin0, bin1; 65 | float var_tot = 0.f; 66 | float phase, y0, alpha; 67 | for(int i = 0; i < NBINS; i++){ 68 | bin_wtots[i] = 0.f; 69 | bin_means[i] = 0.f; 70 | } 71 | 72 | for(int i = 0; i < ndata; i++){ 73 | bin = (int) (PHASE(t[i], freq) * NBINS); 74 | bin = bin % NBINS; 75 | bin_wtots[bin] += w[i]; 76 | bin_means[bin] += w[i] * y[i]; 77 | } 78 | 79 | for (int i = 0; i < NBINS; i++){ 80 | if (bin_wtots[i] == 0.f) 81 | continue; 82 | bin_means[i] /= bin_wtots[i]; 83 | } 84 | 85 | 86 | for (int i = 0; i < ndata; i++){ 87 | phase = PHASE(t[i], freq); 88 | bin = (int) (phase * NBINS); 89 | bin = bin % NBINS; 90 | 91 | alpha = phase * NBINS - floorf(phase * NBINS) - 0.5f; 92 | bin0 = (alpha < 0) ? bin - 1 : bin; 93 | bin1 = (alpha < 0) ? bin : bin + 1; 94 | 95 | if (bin0 < 0) 96 | bin0 += NBINS; 97 | if (bin1 >= NBINS) 98 | bin1 -= NBINS; 99 | 100 | alpha += (alpha < 0) ? 1.f : 0.f; 101 | y0 = (1.f - alpha) * bin_means[bin0] + alpha * bin_means[bin1]; 102 | var_tot += w[i] * (y[i] - y0) * (y[i] - y0); 103 | } 104 | 105 | return var_tot; 106 | } 107 | 108 | 109 | __device__ float var_binless_tophat( 110 | float *RESTRICT t, 111 | float *RESTRICT y, 112 | float *RESTRICT w, 113 | CONSTANT float freq, 114 | CONSTANT int ndata, 115 | CONSTANT float dphi){ 116 | float mbar, tj, wtot, var; 117 | bool in_bin; 118 | var = 0.f; 119 | for(int j = 0; j < ndata; j++){ 120 | mbar = 0.f; 121 | wtot = 0.f; 122 | tj = t[j]; 123 | for(int k = 0; k < ndata; k++){ 124 | in_bin = phase_diff(fabsf(t[k] - tj), freq) < dphi; 125 | wtot += in_bin ? w[k] : 0.f; 126 | mbar += in_bin ? 
w[k] * y[k] : 0.f;
127 |         }
128 |         mbar /= wtot;
129 |         var += w[j] * (y[j] - mbar) * (y[j] - mbar);
130 |     }
131 |     return var;
132 | }
133 | __device__ float var_binless_gauss(
134 |         float *RESTRICT t,
135 |         float *RESTRICT y,
136 |         float *RESTRICT w,
137 |         CONSTANT float freq,
138 |         CONSTANT int ndata,
139 |         CONSTANT float dphi){
140 |     float mbar, tj, wtot, var, wgt;
141 |     var = 0.f;
142 |     for(int j = 0; j < ndata; j++){
143 |         mbar = 0.f;
144 |         wtot = 0.f;
145 |         tj = t[j];
146 |         for(int k = 0; k < ndata; k++){
147 |             float dphase = phase_diff(fabsf(t[k] - tj), freq);
148 |             wgt = w[k] * GAUSSIAN(dphase / dphi);
149 |             mbar += wgt * y[k];
150 |             wtot += wgt;
151 |         }
152 |         mbar /= wtot;
153 |         var += w[j] * (y[j] - mbar) * (y[j] - mbar);
154 |     }
155 |     return var;
156 | }
157 | __global__ void pdm_binless_tophat(
158 |         float *RESTRICT t,
159 |         float *RESTRICT y,
160 |         float *RESTRICT w,
161 |         float *RESTRICT freqs,
162 |         float *power,
163 |         CONSTANT int ndata,
164 |         CONSTANT int nfreqs,
165 |         CONSTANT float dphi,
166 |         CONSTANT float var){
167 |     int i = blockIdx.x * blockDim.x + threadIdx.x;
168 |     if (i < nfreqs){
169 |         power[i] = 1.f - var_binless_tophat(t, y, w, freqs[i], ndata, dphi) / var;
170 |     }
171 | }
172 |
173 | __global__ void pdm_binless_gauss(
174 |         float *RESTRICT t,
175 |         float *RESTRICT y,
176 |         float *RESTRICT w,
177 |         float *RESTRICT freqs,
178 |         float *power,
179 |         CONSTANT int ndata,
180 |         CONSTANT int nfreqs,
181 |         CONSTANT float dphi,
182 |         CONSTANT float var){
183 |     int i = blockIdx.x * blockDim.x + threadIdx.x;
184 |     if (i < nfreqs){
185 |         power[i] = 1.f - var_binless_gauss(t, y, w, freqs[i], ndata, dphi) / var;
186 |     }
187 | }
188 |
189 | __global__ void pdm_binned_linterp(
190 |         float *RESTRICT t,
191 |         float *RESTRICT y,
192 |         float *RESTRICT w,
193 |         float *RESTRICT freqs,
194 |         float *power,
195 |         CONSTANT int ndata,
196 |         CONSTANT int nfreqs,
197 |         CONSTANT float dphi,
198 |         CONSTANT float var){
199 |
200 |     int i = blockIdx.x * blockDim.x + threadIdx.x;
201 |     if (i < nfreqs){
202 |         power[i] = 1.f - var_linear_interp(t, y, w, freqs[i], ndata) / var;
203 |     }
204 | }
205 | __global__ void pdm_binned_step(
206 |         float *RESTRICT t,
207 |         float *RESTRICT y,
208 |         float *RESTRICT w,
209 |         float *RESTRICT freqs,
210 |         float *power,
211 |         CONSTANT int ndata,
212 |         CONSTANT int nfreqs,
213 |         CONSTANT float dphi,
214 |         CONSTANT float var){
215 |     int i = blockIdx.x * blockDim.x + threadIdx.x;
216 |     if (i < nfreqs){
217 |         power[i] = 1.f - var_step_function(t, y, w, freqs[i], ndata) / var;
218 |     }
219 | }
220 |
--------------------------------------------------------------------------------
/cuvarbase/kernels/wavelet.cu:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #define WEIGHT(k) (w==NULL ? 1.0f : w[k])
3 | #define GAUSSIAN(x) expf(-0.5f *x*x)
4 | #define WEIGHTED_LININTERP true
5 | #define SKIP_BIN(i) (bin_wtots[i] * NBINS < 0.01f)
6 | //INSERT_NBINS_HERE
7 | #define PHASE(x,f) (x * f - floorf(x * f))
8 | #define TWOPI 6.28318530718f
9 | #define RESTRICT __restrict__
10 | #define CONSTANT const
11 | #define MIN_NOBS 10
12 | #define wavelet full_wavelet
13 |
14 |
15 | __device__ float fast_wavelet(float dt, float sigma, float freq){
16 |     float a = fabs(TWOPI * sigma * freq * dt);
17 |
18 |     return a < 1.f ? 1.f - 3.f * a * a + 2.f * a * a * a : 0.f;
19 | }
20 |
21 | __device__ float full_wavelet(float dt, float sigma, float freq){
22 |     float a = fabs(TWOPI * sigma * freq * dt);
23 |
24 |     return expf(-a*a);
25 | }
26 |
27 | __device__ float cosine_wtransform(float *t, float *y, float *w, float freq, float tau, float sigma,
28 |                                    int imin, int imax){
29 |     float pow = 0.f;
30 |     float weight = 0.f;
31 |     float tot_weight = 0.f;
32 |     for(int i = imin; i <= imax; i++){
33 |         weight = wavelet(t[i] - tau, sigma, freq) * (w == NULL ? 1.f : w[i]);
34 |         tot_weight += weight;
35 |         pow += y[i] * weight * cos(TWOPI * freq * t[i]);
36 |     }
37 |     return pow / tot_weight;
38 | }
39 |
40 | __device__ float sine_wtransform(float *t, float *y, float *w, float freq, float tau, float sigma,
41 |                                  int imin, int imax){
42 |     float pow = 0.f;
43 |     float weight = 0.f;
44 |     float tot_weight = 0.f;
45 |     for(int i = imin; i <= imax; i++){
46 |         weight = wavelet(t[i] - tau, sigma, freq) * (w == NULL ? 1.f : w[i]);
47 |         tot_weight += weight;
48 |         pow += y[i] * weight * sin(TWOPI * freq * t[i]);
49 |     }
50 |     return pow / tot_weight;
51 | }
52 |
53 | __device__ float weighted_mean(float *t, float *y, float *w, float freq, float tau,
54 |                                float sigma, int imin, int imax){
55 |     float s = 0.f;
56 |     float weight = 0.f;
57 |     float total_weight = 0.f;
58 |     for(int i = imin; i <= imax; i++){
59 |         weight = wavelet(t[i] - tau, sigma, freq) * (w == NULL ? 1.f : w[i]);
60 |         s += y[i] * weight;
61 |         total_weight += weight;
62 |     }
63 |     return s / total_weight;
64 | }
65 |
66 | __device__ float weighted_var(float *t, float *y, float *w, float freq, float tau,
67 |                               float sigma, int imin, int imax){
68 |     float s = 0.f;
69 |     float weight = 0.f;
70 |     float total_weight = 0.f;
71 |     for(int i = imin; i <= imax; i++){
72 |         weight = wavelet(t[i] - tau, sigma, freq) * (w == NULL ? 1.f : w[i]);
73 |         s += y[i] * y[i] * weight;
74 |         total_weight += weight;
75 |     }
76 |     return s / total_weight;
77 | }
78 |
79 | __device__ float power(float *t, float *y, float *w, float freq, float tau,
80 |                        float prec, float sigma, int nobs){
81 |
82 |     // least squares (lomb scargle with floating mean)
83 |
84 |     int imin = 0;
85 |     int imax = nobs - 1;
86 |
87 |     float wmin = pow(10.f, -prec);
88 |
89 |     while( imin < nobs && wavelet(t[imin] - tau, sigma, freq) < wmin) imin ++;
90 |     while( imax > 0 && wavelet(t[imax] - tau, sigma, freq) < wmin) imax --;
91 |
92 |     if (imax - imin < MIN_NOBS) return 0.f;
93 |
94 |     float Y = weighted_mean(t, y, w, freq, tau, sigma, imin, imax);
95 |     float YY = weighted_var(t, y, w, freq, tau, sigma, imin, imax) - Y*Y;
96 |
97 |     float C = cosine_wtransform(t, w, NULL, freq, tau, sigma, imin, imax);
98 |     float S = sine_wtransform(t, w, NULL, freq, tau, sigma, imin, imax);
99 |
100 |     float C2 = cosine_wtransform(t, w, NULL, 2 * freq, tau, sigma, imin, imax);
101 |     float S2 = sine_wtransform(t, w, NULL, 2 * freq, tau, sigma, imin, imax);
102 |
103 |     float YC = cosine_wtransform(t, y, w, freq, tau, sigma, imin, imax) - Y * C;
104 |     float YS = sine_wtransform(t, y, w, freq, tau, sigma, imin, imax) - Y * S;
105 |
106 |     float CC = 0.5f * ( 1.f + C2 ) - C * C;
107 |     float CS = 0.5f * S2 - C * S;
108 |     float SS = 0.5f * ( 1.f - C2 ) - S * S;
109 |
110 |     float D = CC * SS - CS * CS;
111 |
112 |     float p = (SS * YC * YC + CC * YS * YS - 2 * CS * YC * YS) / (YY * D);
113 |
114 |     // force 0 < p < 1
115 |     return p < 0.f ? 0.f : (p > 1.f ? 0.f : p);
116 | }
117 |
118 |
119 | __device__ int sumint(int *arr, int len){
120 |     int s = 0;
121 |     for(int i = 0; i < len; i++)
122 |         s += arr[i];
123 |     return s;
124 | }
125 |
126 |
127 | __global__ void wavelet_spectrogram(float *t, float *y, float *w, float *spectrogram,
128 |                                     float *freqs, float *taus, int *ntaus, int nfreqs,
129 |                                     int nobs, float sigma, float prec){
130 |
131 |     int i = blockIdx.x * blockDim.x + threadIdx.x;
132 |
133 |     int tot_ntaus = sumint(ntaus, nfreqs);
134 |     if (i < tot_ntaus){
135 |         int fno = 0;
136 |         int s = ntaus[0];
137 |         while(s <= i){
138 |             fno ++;
139 |             s += ntaus[fno];
140 |         }
141 |
142 |         float tau = taus[i];
143 |         float freq = freqs[fno];
144 |
145 |         spectrogram[i] = power(t, y, w, freq, tau, prec, sigma, nobs);
146 |     }
147 | }
--------------------------------------------------------------------------------
/cuvarbase/pdm.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | from builtins import zip
6 | from builtins import range
7 |
8 | import numpy as np
9 | import resource
10 | import warnings
11 |
12 | import pycuda.driver as cuda
13 | import pycuda.gpuarray as gpuarray
14 | from pycuda.compiler import SourceModule
15 | # import pycuda.autoinit
16 |
17 | from .core import GPUAsyncProcess
18 | from .utils import weights, find_kernel, dphase
19 |
20 | def var_tophat(t, y, w, freq, dphi):
21 |     var = 0.
22 |     for i, (T, Y, W) in enumerate(zip(t, y, w)):
23 |         mbar = 0.
24 |         wtot = 0.
25 |         for j, (T2, Y2, W2) in enumerate(zip(t, y, w)):
26 |             dph = dphase(abs(T2 - T), freq)
27 |             if dph < dphi:
28 |                 mbar += W2 * Y2
29 |                 wtot += W2
30 |
31 |         var += W * (Y - mbar / wtot)**2
32 |
33 |     return var
34 |
35 | def var_gauss(t, y, w, freq, dphi):
36 |     gaussian = lambda x: np.exp(-0.5 *x**2)
37 |     var = 0.
38 |     for i, (T, Y, W) in enumerate(zip(t, y, w)):
39 |         mbar = 0.
40 |         wtot = 0.
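        # Same structure as var_tophat above, but each neighbor enters the
        # local mean with a smooth weight W2 * exp(-0.5 * (dph / dphi)**2)
        # rather than a hard phase-window cut.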
41 | 42 | for j, (T2, Y2, W2) in enumerate(zip(t, y, w)): 43 | dph = dphase(abs(T2 - T), freq) 44 | wgt = W2 * gaussian(dph / dphi) 45 | mbar += wgt * Y2 46 | wtot += wgt 47 | 48 | var += W * (Y - mbar / wtot)**2 49 | 50 | return var 51 | 52 | def binned_pdm_model(t, y, w, freq, nbins, linterp=True): 53 | 54 | if len(t) == 0: 55 | return lambda p, **kwargs: np.zeros_like(p) 56 | 57 | bin_means = np.zeros(nbins) 58 | phase = (t * freq) % 1.0 59 | bins = [int(p * nbins) % nbins for p in phase] 60 | 61 | for i in range(nbins): 62 | wtot = max([sum([W for j, W in enumerate(w) if bins[j] == i]), 1E-10]) 63 | bin_means[i] = sum([W * Y for j, (Y, W) in enumerate(zip(y, w)) 64 | if bins[j] == i]) / wtot 65 | 66 | def pred_y(p, nbins=nbins, linterp=linterp, bin_means=bin_means): 67 | bs = np.array([int(P * nbins) % nbins for P in p]) 68 | if not linterp: 69 | return bin_means[bs] 70 | alphas = p * nbins - np.floor(p * nbins) - 0.5 71 | di = np.floor(alphas).astype(np.int32) 72 | bins0 = bs + di 73 | bins1 = bins0 + 1 74 | 75 | alphas[alphas < 0] += 1 76 | bins0[bins0 < 0] += nbins 77 | bins1[bins1 >= nbins] -= nbins 78 | 79 | return (1 - alphas) * bin_means[bins0] + alphas * bin_means[bins1] 80 | 81 | return pred_y 82 | 83 | 84 | def var_binned(t, y, w, freq, nbins, linterp=True): 85 | ypred = binned_pdm_model(t, y, w, freq, nbins, linterp=linterp)((t * freq) % 1.0) 86 | return np.dot(w, np.power(y - ypred, 2)) 87 | 88 | 89 | def binless_pdm_cpu(t, y, w, freqs, dphi=0.05, tophat=True): 90 | # Prepare data 91 | t -= np.mean(t) 92 | y -= np.mean(y) 93 | 94 | ybar = np.dot(w, y) 95 | var = np.dot(w, np.power(y - ybar, 2)) 96 | if tophat: 97 | return [1 - var_tophat(t, y, w, freq, dphi) / var for freq in freqs] 98 | else: 99 | return [1 - var_gauss(t, y, w, freq, dphi) / var for freq in freqs] 100 | 101 | def pdm2_cpu(t, y, w, freqs, nbins=30, linterp=True): 102 | # Prepare data 103 | t -= np.mean(t) 104 | y -= np.mean(y) 105 | 106 | ybar = np.dot(w, y) 107 | var = np.dot(w, np.power(y - ybar, 2)) 108 | return [1 - var_binned(t, y, w, freq, 109 | nbins=nbins, linterp=linterp) / var 110 | for freq in freqs] 111 | 112 | 113 | def pdm2_single_freq(t, y, w, freq, nbins=30, linterp=True): 114 | # Prepare data 115 | t -= np.mean(t) 116 | y -= np.mean(y) 117 | 118 | ybar = np.dot(w, y) 119 | var = np.dot(w, np.power(y - ybar, 2)) 120 | return 1 - var_binned(t, y, w, freq, nbins=nbins, linterp=linterp) / var 121 | 122 | 123 | def pdm_async(stream, data_cpu, data_gpu, pow_cpu, function, 124 | dphi=0.05, block_size=256): 125 | t, y, w, freqs = data_cpu 126 | t_g, y_g, w_g, freqs_g, pow_g = data_gpu 127 | 128 | if t_g is None: 129 | return pow_cpu 130 | 131 | # constants 132 | nfreqs = np.int32(len(freqs)) 133 | ndata = np.int32(len(t)) 134 | dphi = np.float32(dphi) 135 | 136 | # kernel size 137 | grid_size = int(np.ceil(float(nfreqs) / block_size)) 138 | grid = (grid_size, 1) 139 | block = (block_size, 1, 1) 140 | 141 | # weights + weighted variance 142 | ybar = np.dot(w, y) 143 | var = np.float32(np.dot(w, np.power(y - ybar, 2))) 144 | 145 | # transfer data 146 | w_g.set_async(np.asarray(w).astype(np.float32), stream=stream) 147 | t_g.set_async(np.asarray(t).astype(np.float32), stream=stream) 148 | y_g.set_async(np.asarray(y).astype(np.float32), stream=stream) 149 | 150 | function.prepared_async_call(grid, block, stream, 151 | t_g.ptr, y_g.ptr, w_g.ptr, 152 | freqs_g.ptr, pow_g.ptr, 153 | ndata, nfreqs, dphi, var) 154 | 155 | pow_g.get_async(stream=stream, ary=pow_cpu) 156 | 157 | return pow_cpu 158 | 159 | 160 
| class PDMAsyncProcess(GPUAsyncProcess): 161 | 162 | def __init__(self, *args, **kwargs): 163 | super(PDMAsyncProcess, self).__init__(*args, **kwargs) 164 | warnings.warn("PDM is experimental at this point. " 165 | "Use with great caution.") 166 | 167 | def _compile_and_prepare_functions(self, nbins=10): 168 | pdm2_txt = open(find_kernel('pdm'), 'r').read() 169 | pdm2_txt = pdm2_txt.replace('//INSERT_NBINS_HERE', 170 | '#define NBINS %d' % (nbins)) 171 | 172 | self.module = SourceModule(pdm2_txt, options=['--use_fast_math']) 173 | 174 | self.dtypes = [np.intp, np.intp, np.intp, np.intp, np.intp, 175 | np.int32, np.int32, np.float32, np.float32] 176 | for function in ['pdm_binless_tophat', 'pdm_binless_gauss', 177 | 'pdm_binned_linterp_%dbins' % (nbins), 178 | 'pdm_binned_step_%dbins' % (nbins)]: 179 | func = function.replace('_%dbins' % (nbins), '') 180 | func = self.module.get_function(func).prepare(self.dtypes) 181 | self.prepared_functions[function] = func 182 | 183 | def allocate(self, data): 184 | if len(data) > len(self.streams): 185 | self._create_streams(len(data) - len(self.streams)) 186 | 187 | gpu_data, pow_cpus = [], [] 188 | 189 | for t, y, w, freqs in data: 190 | 191 | pow_cpu = cuda.aligned_zeros(shape=(len(freqs),), 192 | dtype=np.float32, 193 | alignment=resource.getpagesize()) 194 | 195 | pow_cpu = cuda.register_host_memory(pow_cpu) 196 | 197 | t_g, y_g, w_g = None, None, None 198 | if len(t) > 0: 199 | t_g, y_g, w_g = tuple([gpuarray.zeros(len(t), dtype=np.float32) 200 | for i in range(3)]) 201 | 202 | pow_g = gpuarray.zeros(len(pow_cpu), dtype=pow_cpu.dtype) 203 | freqs_g = gpuarray.to_gpu(np.asarray(freqs).astype(np.float32)) 204 | 205 | gpu_data.append((t_g, y_g, w_g, freqs_g, pow_g)) 206 | pow_cpus.append(pow_cpu) 207 | return gpu_data, pow_cpus 208 | 209 | def run(self, data, gpu_data=None, pow_cpus=None, 210 | kind='binned_linterp', nbins=10, dphi=0.05, **pdm_kwargs): 211 | 212 | if kind in ['binless_tophat', 'binless_gauss']: 213 | function = 'pdm_%s' % (kind) 214 | elif kind in ['binned_linterp','binned_step']: 215 | function = 'pdm_%s_%dbins' % (kind, nbins) 216 | else: 217 | raise KeyError('Function not available. 
Please use one of the following: ' + \
218 |                            'binless_tophat, binless_gauss, binned_linterp, binned_step')
219 |
220 |         if function not in self.prepared_functions:
221 |             self._compile_and_prepare_functions(nbins=nbins)
222 |
223 |         # Prepare data
224 |         for i,(t, y, w, freqs) in enumerate(data):
225 |             t, y, w, freqs = t.copy(), y.copy(), w.copy(), freqs.copy()
226 |             t -= np.mean(t)
227 |             y -= np.mean(y)
228 |             data[i] = t, y, w, freqs
229 |
230 |         if pow_cpus is None or gpu_data is None:
231 |             gpu_data, pow_cpus = self.allocate(data)
232 |         streams = [s for i, s in enumerate(self.streams) if i < len(data)]
233 |         func = self.prepared_functions[function]
234 |         results = [pdm_async(stream, cdat, gdat, pcpu, func, dphi=dphi, **pdm_kwargs)
235 |                    for stream, cdat, gdat, pcpu in
236 |                    zip(streams, data, gpu_data, pow_cpus)]
237 |
238 |         return results
239 |
--------------------------------------------------------------------------------
/cuvarbase/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/johnh2o2/cuvarbase/0d97ae11bea01fdfb71cfbe15059979ebfe37373/cuvarbase/tests/__init__.py
--------------------------------------------------------------------------------
/cuvarbase/tests/test_bls.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from __future__ import division
3 | from __future__ import print_function
4 |
5 | from builtins import zip
6 | from builtins import range
7 | from builtins import object
8 | from itertools import product
9 | import pytest
10 | import numpy as np
11 | from numpy.testing import assert_allclose
12 | from pycuda.tools import mark_cuda_test
13 | from ..bls import eebls_gpu, eebls_transit_gpu, \
14 |     q_transit, compile_bls, hone_solution,\
15 |     single_bls, eebls_gpu_custom, eebls_gpu_fast
16 |
17 |
18 | def transit_model(phi0, q, delta, q1=0.):
19 |     def model(t, freq, q=q, phi0=phi0, delta=delta):
20 |
21 |         phi = t * freq - phi0
22 |         phi -= np.floor(phi)
23 |
24 |         if not hasattr(t, '__iter__'):
25 |             return -delta if np.absolute(phi) < q else 0
26 |         y = np.zeros(len(t))
27 |         y[np.absolute(phi) < q] -= delta
28 |
29 |         return y
30 |     return model
31 |
32 |
33 | def plot_bls_sol(t, y, dy, freq, q, phi0):
34 |
35 |     w = np.power(dy, -2)
36 |     w /= sum(w)
37 |
38 |     phi_plot = np.linspace(0, 1, int(50. / q))
39 |
40 |     phi = (t * freq)
41 |     phi -= np.floor(phi)
42 |
43 |     dphi = phi - phi0 - np.floor(phi - phi0)
44 |     mask = dphi < q
45 |
46 |     ybt = np.dot(w[mask], y[mask]) / sum(w[mask])
47 |     yb0 = np.dot(w[~mask], y[~mask]) / sum(w[~mask])
48 |
49 |     delta = yb0 - ybt
50 |
51 |     model = transit_model(phi0, q, delta)
52 |
53 |     ym = model(phi_plot, 1.) + yb0
54 |
55 |     import matplotlib.pyplot as plt
56 |
57 |     f, ax = plt.subplots()
58 |
59 |     ax.scatter(phi[~mask], y[~mask], c='k', s=1, alpha=0.1)
60 |     ax.scatter(phi[mask], y[mask], c='g', s=1, alpha=0.8)
61 |     ax.plot(phi_plot, ym, color='r')
62 |     ax.axvline(phi0, color='k', ls=':')
63 |     ax.axvline(phi0 + q, color='k', ls=':')
64 |
65 |     plt.show()
66 |
67 |
68 | def data(seed=100, sigma=0.1, ybar=12., snr=10, ndata=200, freq=10.,
69 |          q=0.01, phi0=None, baseline=1., negative_delta=False):
70 |
71 |     rand = np.random.RandomState(seed)
72 |
73 |     if phi0 is None:
74 |         phi0 = rand.rand()
75 |
76 |     delta = snr * sigma / np.sqrt(ndata * q * (1 - q))
77 |
78 |     if negative_delta:
79 |         delta *= -1
80 |
81 |     model = transit_model(phi0, q, delta)
82 |
83 |     t = baseline * np.sort(rand.rand(ndata))
84 |     y = model(t, freq) + sigma * rand.randn(len(t))
85 |     y += ybar - np.mean(y)
86 |     err = sigma * np.ones_like(y)
87 |
88 |     return t, y, err
89 |
90 |
91 | def get_total_nbins(nbins0, nbinsf, dlogq):
92 |     nbins_tot, x = 0, 1.
93 |     while (int(x * nbins0) <= nbinsf):
94 |         nb = int(x * nbins0)
95 |         x *= 1 + dlogq
96 |
97 |         nbins_tot += nb
98 |
99 |     return nbins_tot
100 |
101 |
102 | def mod1(x):
103 |     return x - np.floor(x)
104 |
105 |
106 | def manual_binning(t, y, dy, freqs, nbins0, nbinsf, dlogq,
107 |                    phi_min, phi_max, noverlap):
108 |     """
109 |     for possible tests of the binning procedure. this
110 |     method has *not* been tested!
111 |     """
112 |
113 |     w = np.power(dy, -2)
114 |     w /= sum(w)
115 |
116 |     yw = np.multiply(y, w)
117 |
118 |     nbins_tot = get_total_nbins(nbins0, nbinsf, dlogq)
119 |
120 |     yw_bins = np.zeros(nbins_tot * len(freqs) * noverlap)
121 |     w_bins = np.zeros(nbins_tot * len(freqs) * noverlap)
122 |
123 |     dphi = 1. / noverlap
124 |     for i, freq in enumerate(freqs):
125 |         nb = nbins0
126 |         nbtot = 0
127 |         x = 1.
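        # Bin counts grow geometrically below: each pass of the while-loop
        # uses nb = int(x * nbins0) bins, inflating x by (1 + dlogq) until
        # nb would exceed nbinsf (mirroring get_total_nbins above).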
128 |         while (int(x * nbins0) <= nbinsf):
129 |             nb = int(x * nbins0)
130 |             x *= 1 + dlogq
131 |
132 |             q = 1./nb
133 |
134 |             for s in range(noverlap):
135 |                 phi = t * freq
136 |                 bf = np.floor(nb * mod1(phi - s * q * dphi))
137 |                 mask = (mod1(phi) >= phi_min) & (mod1(phi) < phi_max)  # assumed; mask was undefined
138 |                 bf += i * nbins_tot * noverlap + s * nb + noverlap * nbtot
139 |                 for b, YW, W in zip(bf[mask].astype(int), yw[mask], w[mask]):
140 |                     yw_bins[b] += YW
141 |                     w_bins[b] += W
142 |
143 |             nbtot += nb
144 |     return yw_bins, w_bins
145 |
146 |
147 | class TestBLS(object):
148 |     seed = 100
149 |     rand = np.random.RandomState(seed)
150 |     plot = False
151 |     rtol = 1e-3
152 |     atol = 1e-5
153 |
154 |     # TODO: tests that have specific bls values; test single_bls function returns
155 |     # what you expect it to for several example problems
156 |     class SolutionParams(object):
157 |         def __init__(self, freq, phi0, q, baseline, ybar, snr, negative_delta):
158 |             self.freq = freq
159 |             self.phi0 = phi0
160 |             self.q = q
161 |             self.baseline = baseline
162 |             self.ybar = ybar
163 |             self.snr = snr
164 |             self.negative_delta = negative_delta
165 |
166 |     @pytest.mark.parametrize("args", [(
167 |         SolutionParams(freq=0.3, phi0=0.5, q=0.2, baseline=365., ybar=0., snr=50.,
168 |                        negative_delta=True),
169 |         {'bls0': 0.8902446483898836, 'bls_ignore': 0}
170 |     )
171 |     ])
172 |     def test_ignore_positive_sols(self, args):
173 |         solution, bls_values = args
174 |         t, y_neg, dy = data(snr=solution.snr,
175 |                             q=solution.q,
176 |                             phi0=solution.phi0,
177 |                             freq=solution.freq,
178 |                             baseline=solution.baseline,
179 |                             ybar=solution.ybar,
180 |                             negative_delta=solution.negative_delta)
181 |
182 |         freq, q, phi0 = solution.freq, solution.q, solution.phi0
183 |
184 |         bls_default = single_bls(t, y_neg, dy, freq, q, phi0)
185 |         bls0 = single_bls(t, y_neg, dy, freq, q, phi0, ignore_negative_delta_sols=False)
186 |         bls_ignore = single_bls(t, y_neg, dy, freq, q, phi0,
187 |                                 ignore_negative_delta_sols=True)
188 |         assert np.allclose(bls_values['bls0'], bls0)
189 |         assert bls_values['bls_ignore'] == bls_ignore
190 |         assert (bls0 == bls_default)
191 |
192 |     @pytest.mark.parametrize("freq", [0.3])
193 |     @pytest.mark.parametrize("phi0", [0.0, 0.5])
194 |     @pytest.mark.parametrize("dlogq", [0.2, -1])
195 |     @pytest.mark.parametrize("nstreams", [1, 3])
196 |     @pytest.mark.parametrize("freq_batch_size", [1, 3, None])
197 |     @pytest.mark.parametrize("ignore_negative_delta_sols", [True, False])
198 |     def test_transit_parameter_consistency(self, freq, phi0, dlogq, nstreams,
199 |                                            freq_batch_size, ignore_negative_delta_sols):
200 |         q = q_transit(freq)
201 |
202 |         t, y, dy = data(snr=30, q=q, phi0=phi0, freq=freq, baseline=365.)
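        # Consistency check: re-evaluating the best (q, phi0) solution the GPU
        # returns for each frequency with the CPU single_bls should reproduce
        # the GPU power to within the tolerances below.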
203 | 204 | freqs, power, sols = eebls_transit_gpu(t, y, dy, 205 | samples_per_peak=2, 206 | freq_batch_size=freq_batch_size, 207 | nstreams=nstreams, 208 | dlogq=dlogq, 209 | ignore_negative_delta_sols=ignore_negative_delta_sols, 210 | fmin=freq * 0.99, 211 | fmax=freq * 1.01) 212 | pcpu = [single_bls(t, y, dy, x[0], *x[1], ignore_negative_delta_sols=ignore_negative_delta_sols) 213 | for x in zip(freqs, sols)] 214 | pcpu = np.asarray(pcpu) 215 | 216 | if self.plot: 217 | import matplotlib.pyplot as plt 218 | f, ax = plt.subplots() 219 | ax.plot(freqs, pcpu) 220 | ax.plot(freqs, power) 221 | plt.show() 222 | 223 | sorted_results = sorted(zip(pcpu, power, freqs, sols), 224 | key=lambda x: -abs(x[1] - x[0])) 225 | 226 | for i, (pcs, pgs, freq, (qs, phs)) in enumerate(sorted_results): 227 | if i > 10: 228 | break 229 | print(pcs, pgs, (qs, phs)) 230 | if self.plot: 231 | plot_bls_sol(t, y, dy, freq, qs, phs) 232 | 233 | pows, diffs = list(zip(*sorted(zip(pcpu, 234 | np.absolute(power - pcpu)), 235 | key=lambda x: -x[1]))) 236 | 237 | upper_bound = self.rtol * np.array(pows) + self.atol 238 | mostly_ok = sum(np.array(diffs) > upper_bound) / len(pows) < 1e-2 239 | not_too_bad = max(diffs) < 1e-1 240 | 241 | print(max(diffs)) 242 | assert mostly_ok and not_too_bad 243 | 244 | @pytest.mark.parametrize("freq", [1.0]) 245 | @pytest.mark.parametrize("phi_index", [0, 10]) 246 | @pytest.mark.parametrize("q_index", [0, 5]) 247 | @pytest.mark.parametrize("nstreams", [1, 3]) 248 | @pytest.mark.parametrize("freq_batch_size", [1, 3, None]) 249 | @pytest.mark.parametrize("ignore_negative_delta_sols", [True, False]) 250 | def test_custom(self, freq, q_index, phi_index, freq_batch_size, nstreams, 251 | ignore_negative_delta_sols): 252 | q_values = np.logspace(-1.1, -0.8, num=10) 253 | phi_values = np.linspace(0, 1, int(np.ceil(2./min(q_values)))) 254 | 255 | q = q_values[q_index] 256 | phi = phi_values[phi_index] 257 | 258 | t, y, dy = data(snr=10, q=q, phi0=phi, freq=freq, 259 | baseline=365., ndata=500) 260 | 261 | df = min(q_values) / (10 * (max(t) - min(t))) 262 | freqs = np.linspace(freq - 10 * df, freq + 10 * df, 20) 263 | 264 | power, gsols = eebls_gpu_custom(t, y, dy, freqs, 265 | q_values, phi_values, 266 | ignore_negative_delta_sols=ignore_negative_delta_sols, 267 | freq_batch_size=freq_batch_size, 268 | nstreams=nstreams) 269 | 270 | for freq, (qg, phg), gpower in zip(freqs, gsols, power): 271 | q_and_phis = product(q_values, phi_values) 272 | 273 | best_q, best_phi, best_p = None, None, None 274 | for Q, PHI in q_and_phis: 275 | p = single_bls(t, y, dy, freq, Q, PHI, 276 | ignore_negative_delta_sols=ignore_negative_delta_sols) 277 | if best_p is None or p > best_p: 278 | best_p = p 279 | best_q = Q 280 | best_phi = PHI 281 | 282 | assert np.abs(best_p - gpower) < 1e-5 283 | 284 | @pytest.mark.parametrize("freq", [1.0]) 285 | @pytest.mark.parametrize("phi_index", [0, 10, -1]) 286 | @pytest.mark.parametrize("q_index", [0, 5, -1]) 287 | @pytest.mark.parametrize("nstreams", [1, 3]) 288 | @pytest.mark.parametrize("freq_batch_size", [1, 3, None]) 289 | @pytest.mark.parametrize("ignore_negative_delta_sols", [True, False]) 290 | def test_standard(self, freq, q_index, phi_index, nstreams, freq_batch_size, 291 | ignore_negative_delta_sols): 292 | 293 | q_values = np.logspace(-1.5, np.log10(0.1), num=100) 294 | phi_values = np.linspace(0, 1, int(np.ceil(2./min(q_values)))) 295 | 296 | q = q_values[q_index] 297 | phi = phi_values[phi_index] 298 | 299 | t, y, dy = data(snr=10, q=q, phi0=phi, freq=freq, 300 | 
baseline=365.) 301 | 302 | df = min(q_values) / (10 * (max(t) - min(t))) 303 | 304 | delta_f = 5 * df / freq 305 | freqs = np.linspace(freq * (1 - delta_f), 306 | (1 + delta_f) * freq, 307 | int(5. * 2 * delta_f * freq / df)) 308 | power, gsols = eebls_gpu(t, y, dy, freqs, 309 | qmin=0.1 * q, qmax=2.0 * q, 310 | nstreams=nstreams, noverlap=2, dlogq=0.5, 311 | freq_batch_size=freq_batch_size, 312 | ignore_negative_delta_sols=ignore_negative_delta_sols) 313 | 314 | bls_c = [single_bls(t, y, dy, x[0], *x[1], 315 | ignore_negative_delta_sols=ignore_negative_delta_sols) 316 | for x in zip(freqs, gsols)] 317 | if self.plot: 318 | import matplotlib.pyplot as plt 319 | f, ax = plt.subplots() 320 | 321 | ax.plot(freqs, bls_c) 322 | ax.plot(freqs, power) 323 | 324 | plt.show() 325 | 326 | inds = sorted(np.arange(len(power)), 327 | key=lambda i: -abs(power[i] - bls_c[i])) 328 | 329 | all_qs, all_phis = zip(*gsols) 330 | 331 | for i in inds[:100]: 332 | qs, phis = gsols[i] 333 | print(power[i], bls_c[i], abs(power[i] - bls_c[i]), 334 | qs, phis) 335 | #plot_bls_sol(t, y, dy, freqs[i], qs, phis) 336 | 337 | pows, diffs = list(zip(*sorted(zip(bls_c, np.absolute(power - bls_c)), 338 | key=lambda x: -x[1]))) 339 | 340 | upper_bound = self.rtol * np.array(pows) + self.atol 341 | mostly_ok = sum(np.array(diffs) > upper_bound) / len(pows) < 1e-2 342 | not_too_bad = max(diffs) < 1e-1 343 | 344 | print(diffs[0], pows[0]) 345 | assert mostly_ok and not_too_bad 346 | # assert_allclose(bls_c, power, rtol=1e-3, atol=1e-5) 347 | 348 | @pytest.mark.parametrize("freq", [1.0]) 349 | @pytest.mark.parametrize("dlogq", [0.5, -1.0]) 350 | @pytest.mark.parametrize("freq_batch_size", [1, 10, None]) 351 | @pytest.mark.parametrize("phi0", [0.0]) 352 | @pytest.mark.parametrize("use_fast", [True, False]) 353 | @pytest.mark.parametrize("nstreams", [1, 4]) 354 | @pytest.mark.parametrize("ignore_negative_delta_sols", [True, False]) 355 | def test_transit(self, freq, use_fast, freq_batch_size, nstreams, phi0, dlogq, 356 | ignore_negative_delta_sols): 357 | q = q_transit(freq) 358 | samples_per_peak = 2 359 | noverlap = 2 360 | 361 | t, y, err = data(snr=10, q=q, phi0=phi0, freq=freq, 362 | baseline=365.) 
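        # With use_fast=True, eebls_transit_gpu returns only (freqs, power), so
        # the fast kernel is validated by comparing its peak frequency against
        # the slow path rather than against per-frequency CPU values.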
363 | 364 | kw = dict(samples_per_peak=samples_per_peak, 365 | freq_batch_size=freq_batch_size, dlogq=dlogq, 366 | ignore_negative_delta_sols=ignore_negative_delta_sols, 367 | nstreams=nstreams, noverlap=noverlap, 368 | fmin=0.9 * freq, fmax=1.1 * freq, 369 | use_fast=use_fast) 370 | 371 | if use_fast: 372 | freqs, power = eebls_transit_gpu(t, y, err, **kw) 373 | 374 | kw['use_fast'] = False 375 | freqs, power_slow, sols = eebls_transit_gpu(t, y, err, **kw) 376 | kw['use_fast'] = True 377 | dfsol = freqs[np.argmax(power)] - freqs[np.argmax(power_slow)] 378 | close_enough = abs(dfsol) * (max(t) - min(t)) / q < 3 379 | if not close_enough and self.plot: 380 | import matplotlib.pyplot as plt 381 | plt.plot(freqs, power, alpha=0.5) 382 | plt.plot(freqs, power_slow, alpha=0.5) 383 | plt.show() 384 | 385 | assert(close_enough) 386 | return 387 | 388 | freqs, power, sols = eebls_transit_gpu(t, y, err, **kw) 389 | power_cpu = np.array([single_bls(t, y, err, x[0], *x[1], 390 | ignore_negative_delta_sols=ignore_negative_delta_sols) 391 | for x in zip(freqs, sols)]) 392 | 393 | if self.plot: 394 | import matplotlib.pyplot as plt 395 | f, ax = plt.subplots() 396 | 397 | ax.plot(freqs, power_cpu) 398 | ax.plot(freqs, power) 399 | 400 | pows, diffs = list(zip(*sorted(zip(power_cpu, power - power_cpu), 401 | key=lambda x: -abs(x[1])))) 402 | print(list(zip(pows[:10], diffs[:10]))) 403 | plt.show() 404 | 405 | diffs = np.absolute(power - power_cpu) 406 | upper_bound = 1e-3 * np.array(power_cpu) + 1e-5 407 | mostly_ok = sum(np.array(diffs) > upper_bound) / len(diffs) < 1e-2 408 | not_too_bad = max(diffs) < 1e-1 409 | 410 | print(max(diffs)) 411 | assert mostly_ok and not_too_bad 412 | 413 | @pytest.mark.parametrize("freq", [1.0]) 414 | @pytest.mark.parametrize("q", [0.1]) 415 | @pytest.mark.parametrize("phi0", [0.0]) 416 | @pytest.mark.parametrize("dphi", [0.0, 1.0]) 417 | @pytest.mark.parametrize("freq_batch_size", [None, 100]) 418 | @pytest.mark.parametrize("dlogq", [0.5, -1.0]) 419 | @pytest.mark.parametrize("ignore_negative_delta_sols", [True, False]) 420 | def test_fast_eebls(self, freq, q, phi0, freq_batch_size, dlogq, dphi, 421 | ignore_negative_delta_sols, **kwargs): 422 | t, y, err = data(snr=50, q=q, phi0=phi0, freq=freq, 423 | baseline=365.) 424 | 425 | df = 0.25 * q / (max(t) - min(t)) 426 | fmin = 0.9 * freq 427 | fmax = 1.1 * freq 428 | nf = int(np.ceil((fmax - fmin) / df)) 429 | freqs = fmin + df * np.arange(nf) 430 | 431 | kw = dict(qmin=1e-2, qmax=0.5, dphi=dphi, 432 | ignore_negative_delta_sols=ignore_negative_delta_sols, 433 | freq_batch_size=freq_batch_size, dlogq=dlogq) 434 | 435 | kw.update(kwargs) 436 | 437 | power = eebls_gpu_fast(t, y, err, freqs, **kw) 438 | 439 | power0, sols = eebls_gpu(t, y, err, freqs, **kw) 440 | if self.plot: 441 | import matplotlib.pyplot as plt 442 | f, ax = plt.subplots() 443 | ax.plot(freqs, power, alpha=0.5) 444 | ax.axvline(freq, ls=':', color='k') 445 | ax.plot(freqs, power0, alpha=0.5) 446 | ax.set_yscale('log') 447 | plt.show() 448 | 449 | # this is janky. 
Need better test 450 | # to ensure we're getting the best results, 451 | # but no apples-to-apples comparison is 452 | # possible for eebls_gpu and eebls_gpu_fast 453 | fmax_fast = freqs[np.argmax(power)] 454 | fmax_regular = freqs[np.argmax(power0)] 455 | assert(abs(fmax_fast - fmax_regular) * (max(t) - min(t)) / q < 3) 456 | -------------------------------------------------------------------------------- /cuvarbase/tests/test_ce.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | from builtins import zip 6 | from builtins import range 7 | from builtins import object 8 | import pytest 9 | from pycuda.tools import mark_cuda_test 10 | import numpy as np 11 | from numpy.testing import assert_allclose 12 | from ..ce import ConditionalEntropyAsyncProcess 13 | lsrtol = 1E-2 14 | lsatol = 1E-5 15 | seed = 100 16 | 17 | rand = np.random.RandomState(seed) 18 | 19 | 20 | def data(sigma=0.1, ndata=500, freq=3., snr=1000, t0=0.): 21 | 22 | t = np.sort(rand.rand(ndata)) + t0 23 | y = snr * sigma * np.cos(2 * np.pi * freq * t) / np.sqrt(len(t)) 24 | 25 | y += sigma * rand.randn(len(t)) 26 | 27 | err = sigma * np.ones_like(y) 28 | 29 | return t, y, err 30 | 31 | 32 | def assert_similar(pdg0, pdg, top=5): 33 | inds = (np.argsort(pdg0)[::-1])[:top] 34 | 35 | p0 = np.asarray(pdg0)[inds] 36 | p = np.asarray(pdg)[inds] 37 | diff = np.absolute(p - p0) 38 | 39 | assert(all(diff < lsrtol * 0.5 * (p + p0) + lsatol)) 40 | 41 | 42 | class TestCE(object): 43 | plot = False 44 | 45 | @pytest.mark.parametrize('ndatas', [1, 5, 10]) 46 | def test_multiple_datasets(self, ndatas, **kwargs): 47 | datas = [data() for i in range(ndatas)] 48 | proc = ConditionalEntropyAsyncProcess(**kwargs) 49 | 50 | df = 0.02 51 | max_freq = 1.1 52 | min_freq = 0.9 53 | nf = int((max_freq - min_freq) / df) 54 | freqs = min_freq + df * np.arange(nf) 55 | 56 | mult_results = proc.run(datas, freqs=freqs) 57 | proc.finish() 58 | 59 | sing_results = [] 60 | 61 | for d in datas: 62 | sing_results.extend(proc.run([d], freqs=freqs)) 63 | proc.finish() 64 | 65 | for rb, rnb in zip(mult_results, sing_results): 66 | fb, pb = rb 67 | fnb, pnb = rnb 68 | 69 | assert(not any(np.isnan(pb))) 70 | assert(not any(np.isnan(pnb))) 71 | 72 | assert_allclose(pnb, pb, rtol=lsrtol, atol=lsatol) 73 | assert_allclose(fnb, fb, rtol=lsrtol, atol=lsatol) 74 | 75 | @pytest.mark.parametrize('ndatas', [1, 7]) 76 | @pytest.mark.parametrize('batch_size', [1, 3]) 77 | @pytest.mark.parametrize('use_double', [True, False]) 78 | @pytest.mark.parametrize('use_fast,weighted,shmem_lc,freq_batch_size', 79 | [(True, False, False, 1), 80 | (True, False, True, None), 81 | (False, True, False, None), 82 | (False, False, False, None)]) 83 | @pytest.mark.parametrize('phase_bins,phase_overlap', 84 | [(10, 1)]) 85 | @pytest.mark.parametrize('mag_bins,mag_overlap', 86 | [(5, 0)]) 87 | def test_batched_run(self, ndatas, batch_size, use_double, 88 | mag_bins, phase_bins, mag_overlap, 89 | phase_overlap, use_fast, 90 | shmem_lc, weighted, 91 | freq_batch_size): 92 | 93 | datas = [data(ndata=rand.randint(50, 100)) 94 | for i in range(ndatas)] 95 | kwargs = dict(use_double=use_double, 96 | mag_bins=mag_bins, 97 | phase_bins=phase_bins, 98 | phase_overlap=phase_overlap, 99 | mag_overlap=mag_overlap, 100 | use_fast=use_fast, 101 | weighted=weighted) 102 | proc = ConditionalEntropyAsyncProcess(**kwargs) 103 | df = 0.02 104 | max_freq = 1.1 
105 | min_freq = 0.9 106 | nf = int((max_freq - min_freq) / df) 107 | freqs = min_freq + df * np.arange(nf) 108 | 109 | run_kw = dict(shmem_lc=shmem_lc, freqs=freqs, 110 | freq_batch_size=freq_batch_size) 111 | batched_results = proc.batched_run(datas, **run_kw) 112 | proc.finish() 113 | 114 | non_batched_results = [] 115 | for d in datas: 116 | r = proc.run([d], freqs=freqs) 117 | proc.finish() 118 | non_batched_results.extend(r) 119 | 120 | for rb, rnb in zip(batched_results, non_batched_results): 121 | fb, pb = rb 122 | fnb, pnb = rnb 123 | 124 | assert(not any(np.isnan(pb))) 125 | assert(not any(np.isnan(pnb))) 126 | 127 | assert_allclose(pnb, pb, rtol=lsrtol, atol=lsatol) 128 | assert_allclose(fnb, fb, rtol=lsrtol, atol=lsatol) 129 | 130 | @pytest.mark.parametrize('ndatas', [1, 7]) 131 | @pytest.mark.parametrize('batch_size', [1, 3]) 132 | @pytest.mark.parametrize('use_double', [True, False]) 133 | @pytest.mark.parametrize('use_fast,weighted,shmem_lc,freq_batch_size', 134 | [(True, False, False, 1), 135 | (True, False, True, None), 136 | (False, True, False, None), 137 | (False, False, False, None)]) 138 | @pytest.mark.parametrize('phase_bins,phase_overlap', 139 | [(10, 1)]) 140 | @pytest.mark.parametrize('mag_bins,mag_overlap', 141 | [(5, 0)]) 142 | def test_batched_run_const_nfreq(self, ndatas, batch_size, use_double, 143 | mag_bins, phase_bins, mag_overlap, 144 | phase_overlap, use_fast, weighted, 145 | shmem_lc, freq_batch_size): 146 | frequencies = np.sort(10 + rand.rand(ndatas) * 100.) 147 | datas = [data(ndata=rand.randint(50, 100), 148 | freq=freq) 149 | for i, freq in enumerate(frequencies)] 150 | 151 | kwargs = dict(use_double=use_double, 152 | mag_bins=mag_bins, 153 | phase_bins=phase_bins, 154 | phase_overlap=phase_overlap, 155 | mag_overlap=mag_overlap, 156 | use_fast=use_fast) 157 | proc = ConditionalEntropyAsyncProcess(**kwargs) 158 | 159 | df = 0.02 160 | max_freq = 1.1 161 | min_freq = 0.9 162 | nf = int((max_freq - min_freq) / df) 163 | freqs = min_freq + df * np.arange(nf) 164 | 165 | run_kw = dict(shmem_lc=shmem_lc, freqs=freqs, 166 | freq_batch_size=freq_batch_size) 167 | batched_results = proc.batched_run_const_nfreq(datas, **run_kw) 168 | proc.finish() 169 | 170 | procnb = ConditionalEntropyAsyncProcess(**kwargs) 171 | 172 | non_batched_results = [] 173 | for d, (frq, p) in zip(datas, batched_results): 174 | r = procnb.run([d], **run_kw) 175 | procnb.finish() 176 | non_batched_results.extend(r) 177 | 178 | for f0, (fb, pb), (fnb, pnb) in zip(frequencies, batched_results, 179 | non_batched_results): 180 | 181 | if self.plot: 182 | import matplotlib.pyplot as plt 183 | plt.plot(fnb, pnb, color='k', lw=3) 184 | plt.plot(fb, pb, color='r') 185 | plt.axvline(f0) 186 | plt.show() 187 | assert(not any(np.isnan(pb))) 188 | assert(not any(np.isnan(pnb))) 189 | 190 | assert_allclose(pnb, pb, rtol=lsrtol, atol=lsatol) 191 | assert_allclose(fnb, fb, rtol=lsrtol, atol=lsatol) 192 | 193 | @pytest.mark.parametrize('use_double', [True, False]) 194 | @pytest.mark.parametrize('use_fast,weighted,shmem_lc,freq_batch_size', 195 | [(True, False, False, 1), 196 | (True, False, True, None), 197 | (False, True, False, None), 198 | (False, False, False, None)]) 199 | @pytest.mark.parametrize('phase_bins,phase_overlap', 200 | [(10, 1)]) 201 | @pytest.mark.parametrize('mag_bins,mag_overlap', 202 | [(5, 0)]) 203 | @pytest.mark.parametrize('freq', [10.0]) 204 | @pytest.mark.parametrize('t0', [0.0]) 205 | @pytest.mark.parametrize('balanced_magbins', [True, False]) 206 | def 
test_inject_and_recover(self, freq, 207 | use_double, mag_bins, phase_bins, mag_overlap, 208 | phase_overlap, use_fast, t0, balanced_magbins, 209 | weighted, shmem_lc, freq_batch_size): 210 | 211 | kwargs = dict(use_double=use_double, 212 | mag_bins=mag_bins, 213 | phase_bins=phase_bins, 214 | phase_overlap=phase_overlap, 215 | mag_overlap=mag_overlap, 216 | use_fast=use_fast, 217 | balanced_magbins=balanced_magbins, 218 | weighted=weighted) 219 | proc = ConditionalEntropyAsyncProcess(**kwargs) 220 | t, y, err = data(freq=freq, t0=t0) 221 | 222 | df = 1. / (max(t) - min(t)) / 10 223 | max_freq = 1.1 * freq 224 | min_freq = 0.9 * freq 225 | nf = int((max_freq - min_freq) / df) 226 | freqs = min_freq + df * np.arange(nf) 227 | 228 | run_kw = dict(shmem_lc=shmem_lc, freq_batch_size=freq_batch_size) 229 | results = proc.large_run([(t, y, err)], 230 | freqs=freqs, **run_kw) 231 | proc.finish() 232 | frq, p = results[0] 233 | best_freq = frq[np.argmin(p)] 234 | 235 | if self.plot: 236 | import matplotlib.pyplot as plt 237 | f, ax = plt.subplots() 238 | ax.plot(frq, p) 239 | ax.axvline(freq, ls='-', color='k') 240 | ax.axvline(best_freq, ls=':', color='r') 241 | plt.show() 242 | 243 | # print best_freq, freq, abs(best_freq - freq) / freq 244 | assert(not any(np.isnan(p))) 245 | assert(abs(best_freq - freq) / freq < 3E-2) 246 | 247 | def test_large_run(self, make_plot=False, **kwargs): 248 | proc = ConditionalEntropyAsyncProcess(**kwargs) 249 | t, y, dy = data(sigma=0.01, ndata=100, freq=4.) 250 | df = 0.001 251 | max_freq = 100. 252 | min_freq = df 253 | nf = int((max_freq - min_freq) / df) 254 | freqs = min_freq + df * np.arange(nf) 255 | 256 | r0 = proc.run([(t, y, dy)], freqs=freqs) 257 | r1 = proc.large_run([(t, y, dy)], freqs=freqs, max_memory=1e7) 258 | 259 | f0, p0 = r0[0] 260 | f1, p1 = r1[0] 261 | 262 | rel_err = max(np.absolute(p0 - p1)) / np.median(np.absolute(p0)) 263 | print(max(np.absolute(p0 - p1)), rel_err) 264 | assert_allclose(p0, p1, rtol=1e-4, atol=1e-2) 265 | 266 | @pytest.mark.parametrize('use_double', [True, False]) 267 | @pytest.mark.parametrize('use_fast,weighted,shmem_lc,freq_batch_size', 268 | [(True, False, False, 1)]) 269 | @pytest.mark.parametrize('phase_bins,phase_overlap', 270 | [(10, 1)]) 271 | @pytest.mark.parametrize('mag_bins,mag_overlap', 272 | [(5, 0)]) 273 | @pytest.mark.parametrize('freq', [10.0]) 274 | @pytest.mark.parametrize('balanced_magbins', [True, False]) 275 | def test_time_shift_invariance(self, freq, 276 | use_double, mag_bins, phase_bins, 277 | mag_overlap, phase_overlap, use_fast, 278 | balanced_magbins, weighted, 279 | shmem_lc, freq_batch_size): 280 | 281 | kwargs = dict(use_double=use_double, 282 | mag_bins=mag_bins, 283 | phase_bins=phase_bins, 284 | phase_overlap=phase_overlap, 285 | mag_overlap=mag_overlap, 286 | use_fast=use_fast, 287 | balanced_magbins=balanced_magbins, 288 | weighted=weighted) 289 | proc = ConditionalEntropyAsyncProcess(**kwargs) 290 | 291 | run_kw = dict(shmem_lc=shmem_lc, freq_batch_size=freq_batch_size) 292 | for t0 in [-1e4, 1e4]: 293 | t, y, err = data(freq=freq) 294 | 295 | df = 1. 
/ (max(t) - min(t)) / 10 296 | max_freq = 1.1 * freq 297 | min_freq = 0.9 * freq 298 | nf = int((max_freq - min_freq) / df) 299 | 300 | freqs = min_freq + df * np.arange(nf) 301 | 302 | results = proc.run([(t, y, err)], freqs=freqs, **run_kw) 303 | proc.finish() 304 | frq, p = results[0] 305 | 306 | results_shift = proc.run([(t + t0, y, err)], freqs=freqs, **run_kw) 307 | frq_shft, p_shft = results_shift[0] 308 | 309 | best_freq = frq[np.argmin(p)] 310 | best_freq_shft = frq_shft[np.argmin(p_shft)] 311 | 312 | if self.plot: 313 | import matplotlib.pyplot as plt 314 | f, ax = plt.subplots() 315 | ax.plot(frq, p) 316 | ax.plot(frq_shft, p_shft) 317 | ax.axvline(freq, ls='-', color='k') 318 | ax.axvline(best_freq, ls=':', color='r') 319 | plt.show() 320 | 321 | assert(not any(np.isnan(p))) 322 | assert(not any(np.isnan(p_shft))) 323 | 324 | baseline = max(t) - min(t) 325 | delta_f = abs(best_freq - best_freq_shft) 326 | top_freq_is_close = delta_f * baseline < 1 327 | 328 | diffs = np.absolute(p - p_shft) 329 | atol, rtol = 1e-1 * max(np.absolute(p)), 2e-1 330 | upper_limit = atol + rtol * np.absolute(p) 331 | 332 | pct_out_of_bounds = sum(diffs > upper_limit) / len(diffs) 333 | 334 | print(pct_out_of_bounds, delta_f * baseline) 335 | assert(top_freq_is_close and pct_out_of_bounds < 5e-2) 336 | 337 | @pytest.mark.parametrize('use_double', [True, False]) 338 | @pytest.mark.parametrize('shmem_lc', [True, False]) 339 | @pytest.mark.parametrize('freq_batch_size', [1, None]) 340 | @pytest.mark.parametrize('phase_bins,phase_overlap,mag_bins,mag_overlap', 341 | [(10, 0, 5, 0), (10, 1, 5, 1)]) 342 | @pytest.mark.parametrize('freq', [12.0]) 343 | @pytest.mark.parametrize('t0', [0.0]) 344 | #@pytest.mark.parametrize('balanced_magbins', [True, False]) 345 | @pytest.mark.parametrize('balanced_magbins', [False]) 346 | @pytest.mark.parametrize('weighted', [False]) 347 | @pytest.mark.parametrize('force_nblocks', [1, None]) 348 | @pytest.mark.parametrize('ndata', [300]) 349 | def test_fast(self, freq, use_double, mag_bins, phase_bins, mag_overlap, 350 | phase_overlap, t0, balanced_magbins, weighted, 351 | shmem_lc, freq_batch_size, force_nblocks, ndata): 352 | 353 | kwargs = dict(use_double=use_double, 354 | mag_bins=mag_bins, 355 | phase_bins=phase_bins, 356 | phase_overlap=phase_overlap, 357 | mag_overlap=mag_overlap, 358 | balanced_magbins=balanced_magbins, 359 | weighted=weighted) 360 | proc_fast = ConditionalEntropyAsyncProcess(use_fast=True, **kwargs) 361 | proc_slow = ConditionalEntropyAsyncProcess(use_fast=False, **kwargs) 362 | t, y, err = data(freq=freq, t0=t0, ndata=ndata) 363 | 364 | df = 1. 
/ (max(t) - min(t)) / 10 365 | max_freq = 1.1 * freq 366 | min_freq = 0.9 * freq 367 | nf = int((max_freq - min_freq) / df) 368 | freqs = min_freq + df * np.arange(nf) 369 | 370 | run_kw = dict(shmem_lc=shmem_lc, 371 | freq_batch_size=freq_batch_size, 372 | force_nblocks=force_nblocks) 373 | results_fast = proc_fast.run([(t + t0, y, err)], freqs=freqs, 374 | **run_kw) 375 | proc_fast.finish() 376 | frq_fast, p_fast = results_fast[0] 377 | 378 | results_slow = proc_slow.run([(t + t0, y, err)], freqs=freqs) 379 | proc_slow.finish() 380 | frq_slow, p_slow = results_slow[0] 381 | 382 | max_diff = 2e-2 * max(np.absolute(p_slow)) 383 | if self.plot and \ 384 | not all(np.absolute(p_slow - p_fast) < max_diff): 385 | import matplotlib.pyplot as plt 386 | 387 | f, ax = plt.subplots() 388 | ax.plot(frq_slow, p_slow, alpha=0.5) 389 | ax.plot(frq_fast, p_fast, alpha=0.5) 390 | ax.axvline(freq, ls='-', color='k') 391 | plt.show() 392 | 393 | f, ax = plt.subplots() 394 | ax.plot(frq_slow, (p_slow - p_fast) / max(np.absolute(p_slow))) 395 | ax.axvline(freq, ls='-', color='k') 396 | plt.show() 397 | # print best_freq, freq, abs(best_freq - freq) / freq 398 | assert(not any(np.isnan(p_slow))) 399 | assert(not any(np.isnan(p_fast))) 400 | assert_allclose(p_slow, p_fast, atol=2e-2 * max(np.absolute(p_slow))) 401 | -------------------------------------------------------------------------------- /cuvarbase/tests/test_lombscargle.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | from builtins import zip 6 | from builtins import range 7 | from builtins import object 8 | import numpy as np 9 | import pytest 10 | 11 | from numpy.testing import assert_allclose 12 | from astropy.timeseries import LombScargle 13 | 14 | from ..lombscargle import LombScargleAsyncProcess 15 | from pycuda.tools import mark_cuda_test 16 | #import pycuda.autoinit 17 | import pycuda.autoprimaryctx 18 | spp = 3 19 | nfac = 3 20 | lsrtol = 1E-2 21 | lsatol = 1E-2 22 | nfft_sigma = 5 23 | 24 | rand = np.random.RandomState(100) 25 | 26 | 27 | def data(seed=100, sigma=0.1, ndata=100, freq=3.): 28 | t = np.sort(rand.rand(ndata)) 29 | y = np.cos(2 * np.pi * freq * t) 30 | 31 | y += sigma * rand.randn(len(t)) 32 | 33 | err = sigma * np.ones_like(y) 34 | 35 | return t, y, err 36 | 37 | 38 | def assert_similar(pdg0, pdg, top=5): 39 | inds = (np.argsort(pdg0)[::-1])[:top] 40 | 41 | p0 = np.asarray(pdg0)[inds] 42 | p = np.asarray(pdg)[inds] 43 | diff = np.absolute(p - p0) 44 | 45 | res = sorted(zip(p0, p, diff), key=lambda x: -x[2]) 46 | 47 | for p0v, pv, dv in res: 48 | if dv > 1e-3: 49 | print(p0v, pv, dv) 50 | 51 | assert_allclose(p, p0, atol=lsatol, rtol=lsrtol) 52 | assert(all(diff < lsrtol * 0.5 * (p + p0) + lsatol)) 53 | 54 | 55 | class TestLombScargle(object): 56 | def test_against_astropy_double(self): 57 | t, y, err = data() 58 | ls_proc = LombScargleAsyncProcess(use_double=True, 59 | sigma=nfft_sigma) 60 | 61 | results = ls_proc.run([(t, y, err)], nyquist_factor=nfac, 62 | use_fft=True, 63 | samples_per_peak=spp) 64 | ls_proc.finish() 65 | 66 | fgpu, pgpu = results[0] 67 | 68 | power = LombScargle(t, y, err).power(fgpu) 69 | 70 | assert_similar(power, pgpu) 71 | 72 | def test_against_astropy_single(self): 73 | t, y, err = data() 74 | ls_proc = LombScargleAsyncProcess(use_double=False, 75 | sigma=nfft_sigma) 76 | 77 | results = ls_proc.run([(t, y, err)], nyquist_factor=nfac, 78 | 
samples_per_peak=spp) 79 | ls_proc.finish() 80 | fgpu, pgpu = results[0] 81 | 82 | power = LombScargle(t, y, err).power(fgpu) 83 | 84 | assert_similar(power, pgpu) 85 | 86 | def test_ls_kernel(self): 87 | t, y, err = data() 88 | ls_proc = LombScargleAsyncProcess(use_double=False, 89 | sigma=nfft_sigma) 90 | 91 | results = ls_proc.run([(t, y, err)], nyquist_factor=nfac, 92 | samples_per_peak=spp) 93 | ls_proc.finish() 94 | fgpu, pgpu = results[0] 95 | 96 | ls = LombScargle(t, y, err, fit_mean=True, center_data=False) 97 | power = ls.power(fgpu) 98 | 99 | assert_similar(power, pgpu) 100 | 101 | def test_ls_kernel_direct_sums(self): 102 | t, y, err = data() 103 | ls_proc = LombScargleAsyncProcess(use_double=True, 104 | sigma=nfft_sigma) 105 | 106 | results = ls_proc.run([(t, y, err)], nyquist_factor=nfac, 107 | samples_per_peak=spp, use_fft=False) 108 | ls_proc.finish() 109 | fgpu, pgpu = results[0] 110 | 111 | ls = LombScargle(t, y, err, fit_mean=True, center_data=True) 112 | power = ls.power(fgpu) 113 | 114 | assert_similar(power, pgpu) 115 | 116 | def test_ls_kernel_direct_sums_is_consistent(self): 117 | t, y, err = data() 118 | ls_proc = LombScargleAsyncProcess(use_double=False, 119 | sigma=nfft_sigma) 120 | 121 | results_ds = ls_proc.run([(t, y, err)], nyquist_factor=nfac, 122 | samples_per_peak=spp, use_fft=False) 123 | ls_proc.finish() 124 | 125 | fgpu_ds, pgpu_ds = results_ds[0] 126 | 127 | results_reg = ls_proc.run([(t, y, err)], nyquist_factor=nfac, 128 | samples_per_peak=spp, use_cpu_nfft=True) 129 | ls_proc.finish() 130 | 131 | fgpu_reg, pgpu_reg = results_reg[0] 132 | 133 | assert_similar(pgpu_reg, pgpu_ds) 134 | 135 | def test_ls_kernel_direct_sums_against_python(self): 136 | 137 | t, y, err = data() 138 | ls_proc = LombScargleAsyncProcess(use_double=False, sigma=nfft_sigma) 139 | 140 | result_ds = ls_proc.run([(t, y, err)], nyquist_factor=nfac, 141 | samples_per_peak=spp, use_fft=False) 142 | ls_proc.finish() 143 | 144 | fgpu_ds, pgpu_ds = result_ds[0] 145 | 146 | result_reg = ls_proc.run([(t, y, err)], nyquist_factor=nfac, 147 | samples_per_peak=spp, 148 | use_fft=False, 149 | python_dir_sums=True) 150 | ls_proc.finish() 151 | fgpu_reg, pgpu_reg = result_reg[0] 152 | 153 | assert_similar(pgpu_reg, pgpu_ds) 154 | 155 | def test_multiple_datasets(self, ndatas=5): 156 | datas = [data() for i in range(ndatas)] 157 | ls_proc = LombScargleAsyncProcess(sigma=nfft_sigma) 158 | 159 | mult_results = ls_proc.run(datas, nyquist_factor=nfac, 160 | samples_per_peak=spp) 161 | ls_proc.finish() 162 | 163 | sing_results = [] 164 | 165 | for d in datas: 166 | sing_results.extend(ls_proc.run([d], nyquist_factor=nfac, 167 | samples_per_peak=spp)) 168 | ls_proc.finish() 169 | 170 | for rb, rnb in zip(mult_results, sing_results): 171 | fb, pb = rb 172 | fnb, pnb = rnb 173 | 174 | assert_allclose(pnb, pb, rtol=lsrtol, atol=lsatol) 175 | assert_allclose(fnb, fb, rtol=lsrtol, atol=lsatol) 176 | 177 | def test_batched_run(self, ndatas=5, batch_size=5, sigma=nfft_sigma, 178 | samples_per_peak=spp, nyquist_factor=nfac, 179 | **kwargs): 180 | 181 | datas = [data(ndata=rand.randint(50, 100)) 182 | for i in range(ndatas)] 183 | ls_proc = LombScargleAsyncProcess(sigma=sigma, **kwargs) 184 | 185 | kw = dict(nyquist_factor=nyquist_factor, 186 | samples_per_peak=samples_per_peak) 187 | 188 | batched_results = ls_proc.batched_run(datas, **kw) 189 | ls_proc.finish() 190 | 191 | non_batched_results = [] 192 | for d in datas: 193 | r = ls_proc.run([d], nyquist_factor=nyquist_factor, 194 | 
samples_per_peak=samples_per_peak) 195 | ls_proc.finish() 196 | non_batched_results.extend(r) 197 | 198 | for rb, rnb in zip(batched_results, non_batched_results): 199 | fb, pb = rb 200 | fnb, pnb = rnb 201 | 202 | assert_allclose(pnb, pb, rtol=lsrtol, atol=lsatol) 203 | assert_allclose(fnb, fb, rtol=lsrtol, atol=lsatol) 204 | 205 | def test_batched_run_const_nfreq(self, make_plot=False, ndatas=27, 206 | batch_size=5, sigma=nfft_sigma, 207 | samples_per_peak=spp, 208 | nyquist_factor=nfac, 209 | **kwargs): 210 | 211 | frequencies = 10 + rand.rand(ndatas) * 100. 212 | datas = [data(ndata=rand.randint(50, 100), 213 | freq=freq) 214 | for i, freq in enumerate(frequencies)] 215 | ls_proc = LombScargleAsyncProcess(sigma=sigma, **kwargs) 216 | 217 | kw = dict(samples_per_peak=spp, 218 | batch_size=batch_size) 219 | kw.update(kwargs) 220 | batched_results = ls_proc.batched_run_const_nfreq(datas, **kw) 221 | ls_proc.finish() 222 | 223 | ls_procnb = LombScargleAsyncProcess(sigma=nfft_sigma, 224 | use_double=False, **kwargs) 225 | 226 | non_batched_results = [] 227 | for d, (frq, p) in zip(datas, batched_results): 228 | r = ls_procnb.run([d], freqs=frq, **kwargs) 229 | ls_procnb.finish() 230 | non_batched_results.extend(r) 231 | 232 | # for f0, (fb, pb), (fnb, pnb) in zip(frequencies, batched_results, 233 | # non_batched_results): 234 | # print f0, fb[np.argmax(pb)], fnb[np.argmax(pnb)] 235 | 236 | for f0, (fb, pb), (fnb, pnb) in zip(frequencies, batched_results, 237 | non_batched_results): 238 | 239 | if make_plot: 240 | import matplotlib.pyplot as plt 241 | plt.plot(fnb, pnb, color='k', lw=3) 242 | plt.plot(fb, pb, color='r') 243 | plt.axvline(f0) 244 | plt.show() 245 | 246 | assert_allclose(pnb, pb, rtol=lsrtol, atol=lsatol) 247 | assert_allclose(fnb, fb, rtol=lsrtol, atol=lsatol) 248 | -------------------------------------------------------------------------------- /cuvarbase/tests/test_nfft.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | from builtins import zip 6 | from builtins import range 7 | from builtins import object 8 | import pytest 9 | import numpy as np 10 | from numpy.testing import assert_allclose 11 | from scipy import fftpack 12 | 13 | from pycuda.tools import mark_cuda_test 14 | from pycuda import gpuarray 15 | 16 | import skcuda.fft as cufft 17 | 18 | from nfft import nfft_adjoint as nfft_adjoint_cpu 19 | from nfft.utils import nfft_matrix 20 | from nfft.kernels import KERNELS 21 | 22 | from ..cunfft import NFFTAsyncProcess 23 | 24 | nfft_sigma = 5 25 | nfft_m = 8 26 | nfft_rtol = 5E-3 27 | nfft_atol = 5E-3 28 | spp = 1 29 | 30 | 31 | def direct_sums(t, y, freqs): 32 | def sfunc(func): 33 | return [np.sum(y * func(2 * np.pi * t * f)) for f in freqs] 34 | return np.asarray(sfunc(np.cos)) + 1j * np.asarray(sfunc(np.sin)) 35 | 36 | 37 | def scale_time(t, samples_per_peak): 38 | return (t - min(t)) / (samples_per_peak * (max(t) - min(t))) - 0.5 39 | 40 | 41 | def data(seed=100, sigma=0.1, ndata=100, samples_per_peak=spp): 42 | 43 | rand = np.random.RandomState(seed) 44 | 45 | t = np.sort(rand.rand(ndata)) 46 | y = np.cos(2 * np.pi * (3./(max(t) - min(t))) * t) 47 | 48 | tscl = scale_time(t, samples_per_peak=samples_per_peak) 49 | 50 | y += sigma * rand.randn(len(t)) 51 | 52 | err = sigma * np.ones_like(y) 53 | 54 | return t, tscl, y, err 55 | 56 | 57 | def get_b(sigma, m): 58 | return (2. 
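            # b is the shape parameter of the Gaussian gridding kernel;
            # this is the standard choice for oversampling factor ``sigma``
            # and kernel half-width ``m`` in the Gaussian-kernel NFFT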
* sigma * m) / ((2 * sigma - 1) * np.pi) 59 | 60 | 61 | def precomp_psi(t, b, n, m): 62 | xg = m + n * t - np.floor(n * t) 63 | 64 | q1 = np.exp(-xg ** 2 / b) / np.sqrt(np.pi * b) 65 | q2 = np.exp(2 * xg / b) 66 | q3 = np.exp(-np.arange(2 * m + 1) ** 2 / b) 67 | 68 | return q1, q2, q3 69 | 70 | 71 | def gpu_grid_scalar(t, y, sigma, m, N): 72 | b = get_b(sigma, m) 73 | 74 | n = int(sigma * N) 75 | 76 | q1, q2, q3 = precomp_psi(t, b, n, m) 77 | 78 | u = (np.floor(n * (t + 0.5) - m)).astype(np.int) 79 | 80 | grid = np.zeros(n) 81 | 82 | inds = np.arange(2 * m + 1) 83 | for i, (U, Y) in enumerate(zip(u, y)): 84 | q2vals = np.array([pow(q2[i], j) for j in inds]) 85 | grid[(U + inds) % len(grid)] += Y * q1[i] * q2vals * q3 86 | 87 | return grid 88 | 89 | 90 | def simple_gpu_nfft(t, y, nf, sigma=nfft_sigma, use_double=False, 91 | m=nfft_m, samples_per_peak=spp, **kwargs): 92 | proc = NFFTAsyncProcess(sigma=sigma, m=m, autoset_m=False, 93 | use_double=use_double) 94 | 95 | for stream in proc.streams: 96 | stream.synchronize() 97 | 98 | nfft_kwargs = dict(samples_per_peak=samples_per_peak) 99 | nfft_kwargs.update(kwargs) 100 | results = proc.run([(t, y, nf)], **nfft_kwargs) 101 | 102 | proc.finish() 103 | return results[0] 104 | 105 | 106 | def get_cpu_grid(t, y, nf, sigma=nfft_sigma, m=nfft_m): 107 | kernel = KERNELS.get('gaussian', 'gaussian') 108 | mat = nfft_matrix(t, int(nf * sigma), m, sigma, kernel, truncated=True) 109 | return mat.T.dot(y) 110 | 111 | 112 | #@mark_cuda_test 113 | class TestNFFT(object): 114 | 115 | def test_fast_gridding_with_jvdp_nfft(self): 116 | t, tsc, y, err = data() 117 | 118 | nf = int(nfft_sigma * len(t)) 119 | gpu_grid = simple_gpu_nfft(t, y, nf, sigma=nfft_sigma, m=nfft_m, 120 | just_return_gridded_data=True, 121 | fast_grid=True, 122 | minimum_frequency=-int(nf/2), 123 | samples_per_peak=spp) 124 | 125 | # get CPU grid 126 | cpu_grid = get_cpu_grid(tsc, y, nf, sigma=nfft_sigma, m=nfft_m) 127 | 128 | assert_allclose(gpu_grid, cpu_grid, atol=1E-4, rtol=0) 129 | 130 | def test_fast_gridding_against_scalar_version(self): 131 | t, tsc, y, err = data() 132 | 133 | nf = int(nfft_sigma * len(t)) 134 | gpu_grid = simple_gpu_nfft(t, y, nf, sigma=nfft_sigma, m=nfft_m, 135 | just_return_gridded_data=True, 136 | fast_grid=True, 137 | minimum_frequency=-int(nf/2), 138 | samples_per_peak=spp) 139 | 140 | # get python version of gpu grid calculation 141 | cpu_grid = gpu_grid_scalar(tsc, y, nfft_sigma, nfft_m, nf) 142 | 143 | tols = dict(rtol=nfft_rtol, atol=nfft_atol) 144 | assert_allclose(gpu_grid, cpu_grid, **tols) 145 | 146 | def test_slow_gridding_against_scalar_fast_gridding(self): 147 | t, tsc, y, err = data() 148 | 149 | nf = int(nfft_sigma * len(t)) 150 | gpu_grid = simple_gpu_nfft(t, y, nf, sigma=nfft_sigma, m=nfft_m, 151 | just_return_gridded_data=True, 152 | fast_grid=False, 153 | minimum_frequency=-int(nf/2), 154 | samples_per_peak=spp) 155 | 156 | # get python version of gpu grid calculation 157 | cpu_grid = gpu_grid_scalar(tsc, y, nfft_sigma, nfft_m, nf) 158 | 159 | tols = dict(rtol=nfft_rtol, atol=nfft_atol) 160 | assert_allclose(gpu_grid, cpu_grid, **tols) 161 | 162 | def test_slow_gridding_against_jvdp_nfft(self): 163 | t, tsc, y, err = data() 164 | 165 | nf = int(nfft_sigma * len(t)) 166 | gpu_grid = simple_gpu_nfft(t, y, nf, sigma=nfft_sigma, m=nfft_m, 167 | just_return_gridded_data=True, 168 | fast_grid=False, 169 | minimum_frequency=-int(nf/2), 170 | samples_per_peak=spp) 171 | 172 | # get CPU grid 173 | cpu_grid = get_cpu_grid(tsc, y, nf, sigma=nfft_sigma, 
m=nfft_m) 174 | 175 | diffs = np.absolute(gpu_grid - cpu_grid) 176 | inds = (np.argsort(diffs)[::-1])[:10] 177 | 178 | for i, gpug, cpug, d in zip(inds, gpu_grid[inds], 179 | cpu_grid[inds], 180 | diffs[inds]): 181 | print(i, gpug, cpug, d) 182 | 183 | tols = dict(rtol=nfft_rtol, atol=nfft_atol) 184 | assert_allclose(gpu_grid, cpu_grid, **tols) 185 | 186 | def test_ffts(self): 187 | t, tsc, y, err = data() 188 | 189 | yhat = np.empty(len(y)) 190 | 191 | yg = gpuarray.to_gpu(y.astype(np.complex128)) 192 | yghat = gpuarray.to_gpu(yhat.astype(np.complex128)) 193 | 194 | plan = cufft.Plan(len(y), np.complex128, np.complex128) 195 | cufft.ifft(yg, yghat, plan) 196 | 197 | yhat = fftpack.ifft(y) * len(y) 198 | 199 | tols = dict(rtol=nfft_rtol, atol=nfft_atol) 200 | assert_allclose(yhat, yghat.get(), **tols) 201 | 202 | def nfft_against_direct_sums(self, samples_per_peak=spp, 203 | f0=None, scaled=True): 204 | t, tsc, y, err = data(samples_per_peak=samples_per_peak) 205 | 206 | nf = int(nfft_sigma * len(t)) 207 | 208 | df = 1./(samples_per_peak * (max(t) - min(t))) 209 | if f0 is None: 210 | f0 = -0.5 * nf * df 211 | k0 = int(f0 / df) 212 | 213 | f0 = k0 if scaled else k0 * df 214 | tg = tsc if scaled else t 215 | sppg = samples_per_peak 216 | 217 | gpu_nfft = simple_gpu_nfft(tg, y, nf, sigma=nfft_sigma, m=nfft_m, 218 | minimum_frequency=f0, 219 | samples_per_peak=sppg) 220 | 221 | freqs = (float(k0) + np.arange(nf)) 222 | if not scaled: 223 | freqs *= df 224 | direct_dft = direct_sums(tg, y, freqs) 225 | 226 | tols = dict(rtol=nfft_rtol, atol=nfft_atol) 227 | 228 | def dsort(arr0, arr): 229 | d = np.absolute(arr0 - arr) 230 | return np.argsort(-d) 231 | 232 | inds = dsort(np.real(direct_dft), np.real(gpu_nfft)) 233 | 234 | npr = 5 235 | q = list(zip(inds[:npr], direct_dft[inds[:npr]], gpu_nfft[inds[:npr]])) 236 | for i, dft, gnfft in q: 237 | print(i, dft, gnfft) 238 | assert_allclose(np.real(direct_dft), np.real(gpu_nfft), **tols) 239 | assert_allclose(np.imag(direct_dft), np.imag(gpu_nfft), **tols) 240 | 241 | def test_nfft_against_existing_impl_scaled_centered_spp1(self): 242 | self.nfft_against_direct_sums(samples_per_peak=1, scaled=True, f0=None) 243 | 244 | def test_nfft_against_existing_impl_scaled_centered_spp5(self): 245 | self.nfft_against_direct_sums(samples_per_peak=5, scaled=True, f0=None) 246 | 247 | def test_nfft_against_existing_impl_scaled_uncentered_spp1(self): 248 | self.nfft_against_direct_sums(samples_per_peak=1, scaled=True, f0=10.) 249 | 250 | def test_nfft_against_existing_impl_unscaled_centered_spp1(self): 251 | self.nfft_against_direct_sums(samples_per_peak=1, scaled=False, 252 | f0=None) 253 | 254 | def test_nfft_against_existing_impl_unscaled_uncentered_spp5(self): 255 | self.nfft_against_direct_sums(samples_per_peak=5, scaled=False, f0=0.) 
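
    # Note: the adjoint NFFT evaluates sum_j y_j exp(2*pi*i*f_k*t_j) on a
    # uniform grid of trial frequencies f_k; ``direct_sums`` at the top of
    # this module computes the same sums naively and serves as the reference.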
256 | 257 | def test_nfft_adjoint_async(self, f0=0., ndata=10, 258 | batch_size=3, use_double=False): 259 | datas = [] 260 | for i in range(ndata): 261 | t, tsc, y, err = data() 262 | nf = int(nfft_sigma * len(t)) 263 | 264 | datas.append((t, y, nf)) 265 | 266 | kwargs = dict(minimum_frequency=f0, samples_per_peak=spp) 267 | 268 | proc = NFFTAsyncProcess(sigma=nfft_sigma, m=nfft_m, autoset_m=False, 269 | use_double=use_double) 270 | 271 | single_nffts = [] 272 | for t, y, nf in datas: 273 | nfft = simple_gpu_nfft(t, y, nf, sigma=nfft_sigma, m=nfft_m, 274 | use_double=use_double, **kwargs) 275 | single_nffts.append(nfft) 276 | 277 | multi_nffts = proc.run(datas, **kwargs) 278 | 279 | batch_nffts = proc.batched_run(datas, batch_size=batch_size, **kwargs) 280 | proc.finish() 281 | 282 | tols = dict(rtol=nfft_rtol, atol=nfft_atol) 283 | for ghat_m, ghat_s, ghat_b in zip(multi_nffts, single_nffts, 284 | batch_nffts): 285 | assert_allclose(ghat_s.real, ghat_m.real, **tols) 286 | assert_allclose(ghat_s.imag, ghat_m.imag, **tols) 287 | 288 | assert_allclose(ghat_s.real, ghat_b.real, **tols) 289 | assert_allclose(ghat_s.imag, ghat_b.imag, **tols) 290 | -------------------------------------------------------------------------------- /cuvarbase/tests/test_pdm.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import numpy as np 6 | from numpy.testing import assert_allclose 7 | import pytest 8 | from ..utils import weights 9 | from ..pdm import pdm2_cpu, binless_pdm_cpu, PDMAsyncProcess 10 | from pycuda.tools import mark_cuda_test 11 | 12 | pytest.nbins = 10 13 | pytest.seed = 100 14 | pytest.nfreqs = 100 15 | pytest.ndata = 10 16 | pytest.sigma = 0.1 17 | 18 | @pytest.fixture(scope="function") 19 | def pow_cpu(request): 20 | rand = np.random.RandomState(pytest.seed) 21 | 22 | t = np.sort(rand.rand(pytest.ndata)) 23 | y = np.cos(2 * np.pi * (10./(max(t) - min(t))) * t) 24 | 25 | y += pytest.sigma * rand.randn(len(t)) 26 | 27 | err = pytest.sigma * np.ones_like(y) 28 | 29 | w = weights(err) 30 | freqs = np.linspace(0, 100./(max(t) - min(t)), pytest.nfreqs) 31 | freqs += 0.5 * (freqs[1] - freqs[0]) 32 | 33 | pow_cpu = pdm2_cpu(t, y, w, freqs, 34 | linterp=(request.param == 'binned_linterp'), 35 | nbins=pytest.nbins) 36 | 37 | return pow_cpu 38 | 39 | @pytest.fixture(scope="function") 40 | def binless_pow_cpu(request): 41 | rand = np.random.RandomState(pytest.seed) 42 | 43 | t = np.sort(rand.rand(pytest.ndata)) 44 | y = np.cos(2 * np.pi * (10./(max(t) - min(t))) * t) 45 | 46 | y += pytest.sigma * rand.randn(len(t)) 47 | 48 | err = pytest.sigma * np.ones_like(y) 49 | 50 | w = weights(err) 51 | freqs = np.linspace(0, 100./(max(t) - min(t)), pytest.nfreqs) 52 | freqs += 0.5 * (freqs[1] - freqs[0]) 53 | 54 | pow_cpu = binless_pdm_cpu(t, y, w, freqs, tophat=(request.param == 'binless_tophat')) 55 | 56 | return pow_cpu 57 | 58 | @pytest.fixture(scope="function") 59 | def pow_gpu(request): 60 | rand = np.random.RandomState(pytest.seed) 61 | 62 | t = np.sort(rand.rand(pytest.ndata)) 63 | y = np.cos(2 * np.pi * (10./(max(t) - min(t))) * t) 64 | 65 | y += pytest.sigma * rand.randn(len(t)) 66 | 67 | err = pytest.sigma * np.ones_like(y) 68 | 69 | w = weights(err) 70 | freqs = np.linspace(0, 100./(max(t) - min(t)), pytest.nfreqs) 71 | freqs += 0.5 * (freqs[1] - freqs[0]) 72 | 73 | pdm_proc = PDMAsyncProcess() 74 | results = pdm_proc.run([(t, y, w, freqs)], 
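                           # ``kind`` selects the PDM variant; the tests below
                           # exercise 'binned_linterp', 'binned_step',
                           # 'binless_gauss' and 'binless_tophat'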
kind=request.param, nbins=pytest.nbins) 75 | pdm_proc.finish() 76 | 77 | return results[0] 78 | 79 | @pytest.mark.parametrize(["pow_cpu","pow_gpu"], [("binned_linterp","binned_linterp")], indirect=True) 80 | def test_cuda_pdm_binned_linterp(pow_cpu,pow_gpu): 81 | assert_allclose(pow_cpu, pow_gpu, atol=1E-2, rtol=0) 82 | 83 | @pytest.mark.parametrize(["pow_cpu","pow_gpu"], [("binned_step","binned_step")], indirect=True) 84 | def test_cuda_pdm_binned_step(pow_cpu,pow_gpu): 85 | assert_allclose(pow_cpu, pow_gpu, atol=1E-2, rtol=0) 86 | 87 | 88 | @pytest.mark.parametrize(["binless_pow_cpu","pow_gpu"], [("binless_gauss","binless_gauss")], indirect=True) 89 | def test_cuda_pdm_binless_gauss(binless_pow_cpu,pow_gpu): 90 | assert_allclose(binless_pow_cpu, pow_gpu, atol=1E-2, rtol=0) 91 | 92 | 93 | @pytest.mark.parametrize(["binless_pow_cpu","pow_gpu"], [("binless_tophat","binless_tophat")], indirect=True) 94 | def test_cuda_pdm_binless_tophat(binless_pow_cpu,pow_gpu): 95 | assert_allclose(binless_pow_cpu, pow_gpu, atol=1E-2, rtol=0) 96 | -------------------------------------------------------------------------------- /cuvarbase/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | from __future__ import print_function 4 | 5 | import numpy as np 6 | import pkg_resources 7 | 8 | 9 | def weights(err): 10 | """ generate observation weights from uncertainties """ 11 | w = np.power(err, -2) 12 | return w/sum(w) 13 | 14 | 15 | def find_kernel(name): 16 | return pkg_resources.resource_filename('cuvarbase', 17 | 'kernels/%s.cu' % (name)) 18 | 19 | 20 | def _module_reader(fname, cpp_defs=None): 21 | txt = open(fname, 'r').read() 22 | 23 | if cpp_defs is None: 24 | return txt 25 | 26 | preamble = ['#define {key} {value}'.format(key=key, 27 | value=('' if value is None 28 | else value)) 29 | for key, value in cpp_defs.items()] 30 | txt = txt.replace('//{CPP_DEFS}', '\n'.join(preamble)) 31 | 32 | return txt 33 | 34 | 35 | def tophat_window(t, t0, d): 36 | w_window = np.zeros_like(t) 37 | w_window[np.absolute(t - t0) < d] += 1. 38 | return w_window / max(w_window) 39 | 40 | 41 | def gaussian_window(t, t0, d): 42 | w_window = np.exp(-0.5 * np.power(t - t0, 2) / (d * d)) 43 | return w_window / (1. if len(w_window) == 0 else max(w_window)) 44 | 45 | 46 | def autofrequency(t, nyquist_factor=5, samples_per_peak=5, 47 | minimum_frequency=None, 48 | maximum_frequency=None, **kwargs): 49 | """ 50 | Determine a suitable frequency grid for data. 51 | 52 | Note that this assumes the peak width is driven by the observational 53 | baseline, which is generally a good assumption when the baseline is 54 | much larger than the oscillation period. 55 | If you are searching for periods longer than the baseline of your 56 | observations, this may not perform well. 57 | 58 | Even with a large baseline, be aware that the maximum frequency 59 | returned is based on the concept of "average Nyquist frequency", which 60 | may not be useful for irregularly-sampled data. The maximum frequency 61 | can be adjusted via the nyquist_factor argument, or through the 62 | maximum_frequency argument. 
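
    The returned grid is uniform: ``df * (nf0 + arange(Nf))``, with
    ``df = 1 / (samples_per_peak * baseline)``, as in the implementation
    below.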
63 | 64 | Parameters 65 | ---------- 66 | samples_per_peak : float (optional, default=5) 67 | The approximate number of desired samples across the typical peak 68 | nyquist_factor : float (optional, default=5) 69 | The multiple of the average nyquist frequency used to choose the 70 | maximum frequency if maximum_frequency is not provided. 71 | minimum_frequency : float (optional) 72 | If specified, then use this minimum frequency rather than one 73 | chosen based on the size of the baseline. 74 | maximum_frequency : float (optional) 75 | If specified, then use this maximum frequency rather than one 76 | chosen based on the average nyquist frequency. 77 | 78 | Returns 79 | ------- 80 | frequency : ndarray or Quantity 81 | The heuristically-determined optimal frequency bin 82 | """ 83 | baseline = max(t) - min(t) 84 | n_samples = len(t) 85 | 86 | df = 1. / (baseline * samples_per_peak) 87 | 88 | nf0 = 1 89 | if minimum_frequency is not None: 90 | nf0 = max([nf0, int(minimum_frequency / df)]) 91 | 92 | if maximum_frequency is not None: 93 | Nf = int(maximum_frequency / df) - nf0 94 | else: 95 | Nf = int(0.5 * samples_per_peak * nyquist_factor * n_samples) 96 | 97 | return df * (nf0 + np.arange(Nf)) 98 | 99 | 100 | def dphase(dt, freq): 101 | dph = dt * freq - np.floor(dt * freq) 102 | dph_final = dph if dph < 0.5 else 1 - dph 103 | return dph_final 104 | 105 | 106 | def get_autofreqs(t, **kwargs): 107 | autofreqs_kwargs = {var: value for var, value in kwargs.items() 108 | if var in ['minimum_frequency', 'maximum_frequency', 109 | 'nyquist_factor', 'samples_per_peak']} 110 | return autofrequency(t, **autofreqs_kwargs) 111 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = -E -a 6 | SPHINXBUILD = python -msphinx 7 | SPHINXPROJ = cuvarbase 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | sphinx 2 | astropy 3 | astrobase 4 | numpy 5 | matplotlib -------------------------------------------------------------------------------- /docs/source/bls.rst: -------------------------------------------------------------------------------- 1 | Box least squares (BLS) periodogram 2 | *********************************** 3 | 4 | The box-least squares periodogram [BLS]_ searches for the periodic dips in brightness that occur when, e.g., a planet passes in front of its host star. The algorithm fits 5 | a `boxcar function `_ to the data. 
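
In symbols, the fitted model is a piecewise-constant (boxcar) function; a sketch of its form, using the parameter names defined just below, is

.. math::

    \hat{y}(t\,|\,q, \phi_0, \delta, y_0) =
    \begin{cases}
        y_0 - \delta, & \phi_0 \leq \phi(t) < \phi_0 + q\\
        y_0, & {\rm otherwise,}
    \end{cases}

where :math:`\phi(t) \in [0, 1)` is the phase of observation time :math:`t` at the trial period.
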
The parameters used are
6 | 
7 | - ``q``: the transit duration as a fraction of the period :math:`t_{\rm trans} / P`
8 | - ``phi0``: the phase offset of the transit (from 0)
9 | - ``delta``: the difference between the out-of-transit brightness and the brightness during transit
10 | - ``y0``: the out-of-transit brightness
11 | 
12 | 
13 | .. plot:: plots/bls_transit_diagram.py
14 | 
15 | 
16 | Using ``cuvarbase`` BLS
17 | -----------------------
18 | 
19 | 
20 | .. plot:: plots/bls_example.py
21 |     :include-source:
22 | 
23 | 
24 | A shortcut: assuming orbital mechanics
25 | --------------------------------------
26 | 
27 | If you assume :math:`R_p\ll R_{\star}`, :math:`M_p\ll M_{\star}`, :math:`L_p\ll L_{\star}`, and :math:`e\ll 1`, where :math:`e` is the ellipticity of the planetary orbit, :math:`L` is the luminosity, :math:`R` is the radius, and :math:`M` is the mass, you can eliminate a free parameter.
28 | 
29 | This is because the orbital period obeys `Kepler's third law `_,
30 | 
31 | .. math::
32 |     P^2 \approx \frac{4\pi^2a^3}{G(M_p + M_{\star})}
33 | 
34 | .. plot:: plots/planet_transit_diagram.py
35 | 
36 | 
37 | The angle of the transit is
38 | 
39 | .. math::
40 | 
41 |     \theta = 2{\rm arcsin}\left(\frac{R_p + R_{\star}}{a}\right)
42 | 
43 | and :math:`q` is therefore :math:`\theta / (2\pi)`. Thus we have a relation between :math:`q` and the period :math:`P`
44 | 
45 | .. math::
46 | 
47 |     \sin{\pi q} = (R_p + R_{\star})\left(\frac{4\pi^2}{P^2 G(M_p + M_{\star})}\right)^{1/3}
48 | 
49 | By incorporating the fact that
50 | 
51 | .. math::
52 | 
53 |     R_{\star} = \left(\frac{3}{4\pi\rho_{\star}}\right)^{1/3}M_{\star}^{1/3}
54 | 
55 | where :math:`\rho_{\star}` is the average stellar density of the host star, we can write
56 | 
57 | .. math::
58 | 
59 |     \sin{\pi q} = \frac{(1 + r)}{(1 + m)^{1/3}} \left(\frac{3\pi}{G\rho_{\star}}\right)^{1/3} P^{-2/3}
60 | 
61 | where :math:`r = R_p / R_{\star}` and :math:`m = M_p / M_{\star}`. We can get rid of the constant factors and convert this to more intuitive units to obtain
62 | 
63 | .. math::
64 | 
65 |     \sin{\pi q} \approx 0.238 \left(1 + r - \frac{m}{3} + \dots{}\right) \left(\frac{\rho_{\star}}{\rho_{\odot}}\right)^{-1/3} \left(\frac{P}{\rm day}\right)^{-2/3}
66 | 
67 | where here we've expanded :math:`(1 + r) / (1 + m)^{1/3}` to first order in :math:`r` and :math:`m`.
68 | 
69 | 
70 | Using the Keplerian assumption in ``cuvarbase``
71 | -----------------------------------------------
72 | 
73 | .. plot:: plots/bls_example_transit.py
74 |     :include-source:
75 | 
76 | 
77 | Period spacing considerations
78 | -----------------------------
79 | 
80 | The frequency spacing :math:`\delta f` needed to resolve a BLS signal with width :math:`q` is
81 | 
82 | .. math::
83 |     \delta f \lesssim \frac{q}{T}
84 | 
85 | where :math:`T` is the baseline of the observations (:math:`T = {\rm max}(t) - {\rm min}(t)`). This can be especially problematic if no assumptions are made about the nature of the signal (e.g., a Keplerian assumption). If you want to resolve a transit signal with a few observations, the minimum :math:`q` value that you would need to search is :math:`\propto 1/N`, where :math:`N` is the number of observations.
86 | 
87 | For a typical Lomb-Scargle periodogram, the frequency spacing is :math:`\delta f \lesssim 1/T`, so running a BLS spectrum with an adequate frequency spacing over the same frequency range requires a factor of :math:`\mathcal{O}(N)` more trial frequencies, each of which requires :math:`\mathcal{O}(N)` computations to estimate the best-fit BLS parameters. 
That means that BLS scales as :math:`\mathcal{O}(N^2N_f)`, while Lomb-Scargle only scales as :math:`\mathcal{O}(N_f\log N_f)`.
88 | 
89 | However, if you assume that the dip is caused by the edge-on transit of a planet on a circular Keplerian orbit, you not only eliminate a degree of freedom, but also obtain (assuming :math:`\sin{\pi q}\approx \pi q`)
90 | 
91 | .. math::
92 | 
93 |     \delta f \propto q \propto f^{2/3}
94 | 
95 | The minimum frequency at which you could hope to measure a transit period is :math:`f_{\rm min} \approx 2/T`, and the maximum frequency is set by the requirement :math:`\sin{\pi q} < 1`, which implies
96 | 
97 | .. math::
98 | 
99 |     f_{\rm max} = 8.612~{\rm c/day}~\times \left(1 - \frac{3r}{2} + \frac{m}{2} -\dots{}\right) \sqrt{\frac{\rho_{\star}}{\rho_{\odot}}}
100 | 
101 | 
102 | For a 10 year baseline, this translates to :math:`2.7\times 10^5` trial frequencies. The number of trial frequencies needed to perform Lomb-Scargle over the same frequency range is only about :math:`3.1\times 10^4`, roughly 8-10 times fewer. However, if we were to search the *entire* range of possible :math:`q` values at each trial frequency instead of making a Keplerian assumption, we would instead require :math:`5.35\times 10^8` trial frequencies, so the Keplerian assumption reduces the number of trial frequencies by a factor of more than 1,000.
103 | 
104 | 
105 | .. [BLS] `Kovacs et al. 2002 `_
--------------------------------------------------------------------------------
/docs/source/ce.rst:
--------------------------------------------------------------------------------
1 | Conditional Entropy
2 | ===================
3 | 
4 | The conditional entropy period finder [G2013]_ phase-folds the data at each trial frequency and estimates
5 | the conditional entropy :math:`H(m|\phi)` of the folded data. The idea is that the fold with the least entropy (intuitively: the greatest "structure" or "non-randomness") should correspond to the correct frequency of a stationary signal.
6 | 
7 | Here,
8 | 
9 | .. math::
10 |     H(m|\phi) = H(m, \phi) - H(\phi) = \sum_{m,\phi}p(m, \phi)\log\left(\frac{p(\phi)}{p(m, \phi)}\right)
11 | 
12 | 
13 | where :math:`p(m, \phi)` is the density of points that fall within the bin located at phase :math:`\phi` and magnitude :math:`m`, and :math:`p(\phi) = \sum_m p(m, \phi)` is the density of points that fall within the corresponding phase bin, summed over all magnitude bins.
14 | 
15 | .. plot:: plots/ce_example.py
16 | 
17 | 
18 | An example with ``cuvarbase``
19 | -----------------------------
20 | 
21 | .. code-block:: python
22 | 
23 |     import cuvarbase.ce as ce
24 |     import numpy as np
25 | 
26 |     # make some fake data
27 |     t = np.sort(np.random.rand(100))
28 |     y = np.cos(2 * np.pi * 10 * t)
29 |     y += np.random.randn(len(t))
30 |     dy = np.ones_like(t)
31 | 
32 |     # start a conditional entropy process
33 |     proc = ce.ConditionalEntropyAsyncProcess(phase_bins=10, mag_bins=5)
34 | 
35 |     # format your data as a list of lightcurves (t, y, dy)
36 |     data = [(t, y, dy)]
37 | 
38 |     # run the CE process with your data
39 |     results = proc.run(data)
40 | 
41 |     # finish the process (probably not necessary but ensures
42 |     # all data has been transferred)
43 |     proc.finish()
44 | 
45 |     # Results is a list of [(freqs, CE), ...] for each lightcurve
46 |     # in ``data``.
47 |     freqs, ce_spectrum = results[0]
48 | 
49 | 
50 | If you want to run CE on large datasets, you can do
51 | 
52 | .. code-block:: python
53 | 
54 |     proc.large_run(data, max_memory=1e9)
55 | 
56 | instead of ``run``, which will ensure that the memory limit (1 GB in this case) is not exceeded on the GPU (unless of course you have other processes running). 
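
To make the statistic itself concrete, here is a minimal CPU sketch of the conditional entropy computation in plain NumPy. This is an illustration of the formula above, not the GPU kernel: it ignores the bin-overlap options, the ``weighted`` variant, and ``balanced_magbins``.

.. code-block:: python

    import numpy as np

    def conditional_entropy(t, y, freq, phase_bins=10, mag_bins=5):
        phi = (t * freq) % 1.0                   # phase-fold the times
        m = (y - y.min()) / (y.max() - y.min())  # scale mags to [0, 1]

        # joint bin occupation p(m, phi) from a 2D histogram
        p_mphi, _, _ = np.histogram2d(phi, m, bins=[phase_bins, mag_bins],
                                      range=[[0, 1], [0, 1]])
        p_mphi /= len(t)

        # marginal p(phi), broadcast back over the magnitude bins
        p_phi = p_mphi.sum(axis=1, keepdims=True) * np.ones_like(p_mphi)

        # H(m|phi) = sum_{m,phi} p(m, phi) * log(p(phi) / p(m, phi))
        nz = p_mphi > 0
        return np.sum(p_mphi[nz] * np.log(p_phi[nz] / p_mphi[nz]))

The best candidate frequency is the one that *minimizes* this quantity over the trial grid (e.g., ``freqs[np.argmin(ce_spectrum)]``).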
57 | 58 | 59 | .. [G2013] `Graham et al. 2013 `_ 60 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # cuvarbase documentation build configuration file, created by 4 | # sphinx-quickstart on Fri Sep 22 21:34:29 2017. 5 | # 6 | # This file is execfile()d with the current directory set to its 7 | # containing dir. 8 | # 9 | # Note that not all possible configuration values are present in this 10 | # autogenerated file. 11 | # 12 | # All configuration values have a default; values that are commented out 13 | # serve to show the default. 14 | 15 | # If extensions (or modules to document with autodoc) are in another directory, 16 | # add these directories to sys.path here. If the directory is relative to the 17 | # documentation root, use os.path.abspath to make it absolute, like shown here. 18 | # 19 | import os 20 | import sys 21 | import ctypes 22 | import io 23 | import re 24 | 25 | cuda_dir = "/Developer/NVIDIA/CUDA-8.0/lib/" 26 | sys.path.insert(0, os.path.abspath('../..')) 27 | sys.path.insert(0, cuda_dir) 28 | 29 | # Set DYLD and LD library paths 30 | dyld_lpath = os.environ.get('DYLD_LIBRARY_PATH', '') 31 | ld_lpath = os.environ.get('LD_LIBRARY_PATH', '') 32 | 33 | 34 | def lpath_insert(p, lpath): 35 | return '%s:%s' % (p, lpath) 36 | 37 | dyld_lpath = lpath_insert(cuda_dir, dyld_lpath) 38 | ld_lpath = lpath_insert(cuda_dir, ld_lpath) 39 | 40 | 41 | os.environ['DYLD_LIBRARY_PATH'] = dyld_lpath 42 | os.environ['LD_LIBRARY_PATH'] = ld_lpath 43 | 44 | 45 | def read(path, encoding='utf-8'): 46 | path = os.path.join(os.path.dirname(__file__), path) 47 | with io.open(path, encoding=encoding) as fp: 48 | return fp.read() 49 | 50 | 51 | def version(path): 52 | """Obtain the packge version from a python file e.g. pkg/__init__.py 53 | 54 | See . 55 | """ 56 | version_file = read(path) 57 | version_match = re.search(r"""^__version__ = ['"]([^'"]*)['"]""", 58 | version_file, re.M) 59 | if version_match: 60 | return version_match.group(1) 61 | raise RuntimeError("Unable to find version string.") 62 | 63 | 64 | VERSION = version('../../cuvarbase/__init__.py') 65 | 66 | 67 | # -- General configuration ------------------------------------------------ 68 | 69 | # If your documentation needs a minimal Sphinx version, state it here. 70 | # 71 | # needs_sphinx = '1.0' 72 | 73 | # Add any Sphinx extension module names here, as strings. They can be 74 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 75 | # ones. 76 | extensions = ['sphinx.ext.autodoc', 77 | 'sphinx.ext.doctest', 78 | 'sphinx.ext.todo', 79 | 'sphinx.ext.coverage', 80 | 'sphinx.ext.mathjax', 81 | 'sphinx.ext.ifconfig', 82 | 'sphinx.ext.viewcode', 83 | 'sphinx.ext.githubpages', 84 | 'sphinx.ext.napoleon', 85 | 'matplotlib.sphinxext.only_directives', 86 | 'matplotlib.sphinxext.plot_directive'] 87 | 88 | # Add any paths that contain templates here, relative to this directory. 89 | templates_path = ['.templates'] 90 | 91 | # The suffix(es) of source filenames. 92 | # You can specify multiple suffix as a list of string: 93 | # 94 | # source_suffix = ['.rst', '.md'] 95 | source_suffix = '.rst' 96 | 97 | # The master toctree document. 98 | master_doc = 'index' 99 | 100 | # General information about the project. 
101 | project = u'cuvarbase' 102 | copyright = u'2017, John Hoffman' 103 | author = u'John Hoffman' 104 | 105 | # The version info for the project you're documenting, acts as replacement for 106 | # |version| and |release|, also used in various other places throughout the 107 | # built documents. 108 | # 109 | # The short X.Y version. 110 | version = VERSION 111 | # The full version, including alpha/beta/rc tags. 112 | release = VERSION 113 | 114 | # The language for content autogenerated by Sphinx. Refer to documentation 115 | # for a list of supported languages. 116 | # 117 | # This is also used if you do content translation via gettext catalogs. 118 | # Usually you set "language" from the command line for these cases. 119 | language = None 120 | 121 | # List of patterns, relative to source directory, that match files and 122 | # directories to ignore when looking for source files. 123 | # This patterns also effect to html_static_path and html_extra_path 124 | exclude_patterns = [] 125 | 126 | # The name of the Pygments (syntax highlighting) style to use. 127 | pygments_style = 'sphinx' 128 | 129 | # If true, `todo` and `todoList` produce output, else they produce nothing. 130 | todo_include_todos = True 131 | 132 | 133 | # -- Options for HTML output ---------------------------------------------- 134 | 135 | # The theme to use for HTML and HTML Help pages. See the documentation for 136 | # a list of builtin themes. 137 | # 138 | html_theme = 'alabaster' 139 | 140 | html_logo = './logo.png' 141 | # Theme options are theme-specific and customize the look and feel of a theme 142 | # further. For a list of options available for each theme, see the 143 | # documentation. 144 | # 145 | # html_theme_options = {} 146 | 147 | # Add any paths that contain custom static files (such as style sheets) here, 148 | # relative to this directory. They are copied after the builtin static files, 149 | # so a file named "default.css" will overwrite the builtin "default.css". 150 | html_static_path = ['.static'] 151 | 152 | # Custom sidebar templates, must be a dictionary that maps document names 153 | # to template names. 154 | # 155 | # This is required for the alabaster theme 156 | # refs: http://alabaster.readthedocs.io/en/latest/installation.html#sidebars 157 | html_sidebars = { 158 | '**': [ 159 | 'about.html', 160 | 'navigation.html', 161 | 'relations.html', # needs 'show_related': True theme option to display 162 | 'searchbox.html', 163 | 'donate.html', 164 | ] 165 | } 166 | 167 | 168 | # -- Options for HTMLHelp output ------------------------------------------ 169 | 170 | # Output file base name for HTML help builder. 171 | htmlhelp_basename = 'cuvarbasedoc' 172 | 173 | 174 | # -- Options for LaTeX output --------------------------------------------- 175 | 176 | latex_elements = { 177 | # The paper size ('letterpaper' or 'a4paper'). 178 | # 179 | # 'papersize': 'letterpaper', 180 | 181 | # The font size ('10pt', '11pt' or '12pt'). 182 | # 183 | # 'pointsize': '10pt', 184 | 185 | # Additional stuff for the LaTeX preamble. 186 | # 187 | # 'preamble': '', 188 | 189 | # Latex figure (float) alignment 190 | # 191 | # 'figure_align': 'htbp', 192 | } 193 | 194 | # Grouping the document tree into LaTeX files. List of tuples 195 | # (source start file, target name, title, 196 | # author, documentclass [howto, manual, or own class]). 
197 | latex_documents = [ 198 | (master_doc, 'cuvarbase.tex', u'cuvarbase Documentation', 199 | u'John Hoffman', 'manual'), 200 | ] 201 | 202 | 203 | # -- Options for manual page output --------------------------------------- 204 | 205 | # One entry per manual page. List of tuples 206 | # (source start file, name, description, authors, manual section). 207 | man_pages = [ 208 | (master_doc, 'cuvarbase', u'cuvarbase Documentation', 209 | [author], 1) 210 | ] 211 | 212 | 213 | # -- Options for Texinfo output ------------------------------------------- 214 | 215 | # Grouping the document tree into Texinfo files. List of tuples 216 | # (source start file, target name, title, author, 217 | # dir menu entry, description, category) 218 | texinfo_documents = [ 219 | (master_doc, 'cuvarbase', u'cuvarbase Documentation', 220 | author, 'cuvarbase', 'One line description of project.', 221 | 'Miscellaneous'), 222 | ] 223 | -------------------------------------------------------------------------------- /docs/source/cuvarbase.rst: -------------------------------------------------------------------------------- 1 | cuvarbase package 2 | ================= 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | 9 | cuvarbase.tests 10 | 11 | Submodules 12 | ---------- 13 | 14 | cuvarbase\.bls module 15 | --------------------- 16 | 17 | .. automodule:: cuvarbase.bls 18 | :members: 19 | :undoc-members: 20 | :show-inheritance: 21 | 22 | cuvarbase\.ce module 23 | -------------------- 24 | 25 | .. automodule:: cuvarbase.ce 26 | :members: 27 | :undoc-members: 28 | :show-inheritance: 29 | 30 | cuvarbase\.core module 31 | ---------------------- 32 | 33 | .. automodule:: cuvarbase.core 34 | :members: 35 | :undoc-members: 36 | :show-inheritance: 37 | 38 | cuvarbase\.cunfft module 39 | ------------------------ 40 | 41 | .. automodule:: cuvarbase.cunfft 42 | :members: 43 | :undoc-members: 44 | :show-inheritance: 45 | 46 | cuvarbase\.lombscargle module 47 | ----------------------------- 48 | 49 | .. automodule:: cuvarbase.lombscargle 50 | :members: 51 | :undoc-members: 52 | :show-inheritance: 53 | 54 | cuvarbase\.pdm module 55 | --------------------- 56 | 57 | .. automodule:: cuvarbase.pdm 58 | :members: 59 | :undoc-members: 60 | :show-inheritance: 61 | 62 | 63 | cuvarbase\.utils module 64 | ----------------------- 65 | 66 | .. automodule:: cuvarbase.utils 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | 71 | 72 | Module contents 73 | --------------- 74 | 75 | .. automodule:: cuvarbase 76 | :members: 77 | :undoc-members: 78 | :show-inheritance: 79 | -------------------------------------------------------------------------------- /docs/source/cuvarbase.tests.rst: -------------------------------------------------------------------------------- 1 | cuvarbase\.tests package 2 | ======================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | cuvarbase\.tests\.test\_bls module 8 | ---------------------------------- 9 | 10 | .. automodule:: cuvarbase.tests.test_bls 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | cuvarbase\.tests\.test\_ce module 16 | --------------------------------- 17 | 18 | .. automodule:: cuvarbase.tests.test_ce 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | cuvarbase\.tests\.test\_lombscargle module 24 | ------------------------------------------ 25 | 26 | .. 
automodule:: cuvarbase.tests.test_lombscargle
27 |     :members:
28 |     :undoc-members:
29 |     :show-inheritance:
30 | 
31 | cuvarbase\.tests\.test\_nfft module
32 | -----------------------------------
33 | 
34 | .. automodule:: cuvarbase.tests.test_nfft
35 |     :members:
36 |     :undoc-members:
37 |     :show-inheritance:
38 | 
39 | cuvarbase\.tests\.test\_pdm module
40 | ----------------------------------
41 | 
42 | .. automodule:: cuvarbase.tests.test_pdm
43 |     :members:
44 |     :undoc-members:
45 |     :show-inheritance:
46 | 
47 | 
48 | Module contents
49 | ---------------
50 | 
51 | .. automodule:: cuvarbase.tests
52 |     :members:
53 |     :undoc-members:
54 |     :show-inheritance:
55 | 
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | .. cuvarbase documentation master file, created by
2 |    sphinx-quickstart on Fri Sep 22 21:34:29 2017.
3 |    You can adapt this file completely to your liking, but it should at least
4 |    contain the root `toctree` directive.
5 | 
6 | 
7 | 
8 | .. include:: ../../README.rst
9 | 
10 | 
11 | .. toctree::
12 |     :maxdepth: 2
13 |     :caption: Contents:
14 | 
15 |     whatsnew
16 |     install
17 |     ce
18 |     lomb
19 |     bls
20 |     modules
21 | 
22 | Indices and tables
23 | ==================
24 | 
25 | * :ref:`genindex`
26 | * :ref:`modindex`
27 | * :ref:`search`
28 | 
--------------------------------------------------------------------------------
/docs/source/install.rst:
--------------------------------------------------------------------------------
1 | .. include:: ../../INSTALL.rst
--------------------------------------------------------------------------------
/docs/source/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/johnh2o2/cuvarbase/0d97ae11bea01fdfb71cfbe15059979ebfe37373/docs/source/logo.png
--------------------------------------------------------------------------------
/docs/source/lomb.rst:
--------------------------------------------------------------------------------
1 | Lomb-Scargle periodogram
2 | ************************
3 | 
4 | The Lomb-Scargle periodogram ([Barning1963]_, [Vanicek1969]_, [Scargle1982]_, [Lomb1976]_) is one of the best known and most popular period finding algorithms used in astronomy. If you would like to learn more about least-squares methods for periodic signals, see the review article by [VanderPlas2017]_.
5 | 
6 | The LS periodogram is a least-squares estimator for the following model
7 | 
8 | .. math::
9 | 
10 |     \hat{y}(t|\omega, \theta) = \theta_1\cos{\omega t} + \theta_2\sin{\omega t}
11 | 
12 | and it is equivalent to the Discrete Fourier Transform in the regularly-sampled limit. For irregularly sampled data, LS is a maximum likelihood estimator for the parameters :math:`\theta` in the case where the noise is Gaussian. The periodogram has many normalizations in the literature, but ``cuvarbase`` adopts
13 | 
14 | .. math::
15 | 
16 |     P(\omega) = \frac{\chi^2_0 - \chi^2(\omega)}{\chi^2_0}
17 | 
18 | where
19 | 
20 | .. math::
21 | 
22 |     \chi^2(\omega) = \sum_i \left(\frac{y_i - \hat{y}(t_i|\omega, \theta)}{\sigma_i}\right)^2
23 | 
24 | is the goodness-of-fit statistic for the optimal parameters :math:`\theta` and
25 | 
26 | .. math::
27 | 
28 |     \chi^2_0 = \sum_i \left(\frac{y_i - \bar{y}}{\sigma_i}\right)^2
29 | 
30 | is the goodness-of-fit statistic for a constant fit, and :math:`\bar{y}` is the weighted mean,
31 | 
32 | 
33 | ..
math::
34 | 
35 |     \bar{y} = \sum_i w_i y_i
36 | 
37 | where :math:`w_i \propto 1/\sigma_i^2` and :math:`\sum_i w_i = 1`.
38 | 
39 | The closed form of the periodogram is given by
40 | 
41 | .. math::
42 | 
43 |     P(\omega) = \frac{1}{\chi^2_0}\left(\frac{YC_{\tau}^2}{CC_{\tau}} + \frac{YS_{\tau}^2}{SS_{\tau}}\right)
44 | 
45 | where
46 | 
47 | .. math::
48 | 
49 |     YC_{\tau} &= \sum_i w_iy_i\cos{\omega (t_i - \tau)}\\
50 | 
51 |     YS_{\tau} &= \sum_i w_iy_i\sin{\omega (t_i - \tau)}\\
52 | 
53 |     CC_{\tau} &= \sum_i w_i\cos^2{\omega (t_i - \tau)}\\
54 | 
55 |     SS_{\tau} &= \sum_i w_i\sin^2{\omega (t_i - \tau)}\\
56 | 
57 |     \tan{2\omega\tau} &= \frac{\sum_i w_i \sin{2\omega t_i}}{\sum_i w_i \cos{2\omega t_i}}
58 | 
59 | These are the sums as they appear in the original formulation of the Lomb-Scargle periodogram, i.e. without the constant offset term.
60 | 
61 | Adding a constant offset
62 | ------------------------
63 | 
64 | Lomb-Scargle can be extended in many ways, most commonly to include a constant offset [ZK2009]_.
65 | 
66 | .. math::
67 | 
68 |     \hat{y}^{\rm GLS}(t|\omega, \theta) = \theta_1\cos{\omega t} + \theta_2\sin{\omega t} + \theta_3
69 | 
70 | This protects against cases where the mean of the data does not correspond with the mean of the underlying
71 | signal, as is usually the case with sparsely sampled data or for signals with large amplitudes that become
72 | too bright or dim to be observed during part of the signal phase.
73 | 
74 | With the constant offset term, the closed-form solution for :math:`P(\omega)` is the same, but the terms
75 | are slightly different. Derivations are given in [ZK2009]_.
76 | 
77 | Getting :math:`\mathcal{O}(N\log N)` performance
78 | ------------------------------------------------
79 | 
80 | The secret to Lomb-Scargle's speed lies in the fact that computing it requires evaluating sums that, for regularly-spaced data, can be evaluated with the fast Fourier transform (FFT), which scales as :math:`\mathcal{O}(N_f\log N_f)`, where :math:`N_f` is the number of frequencies. For *irregularly* spaced data, however, we can employ tricks to achieve the same scaling:
81 | 
82 | 1. We can "extirpolate" the data with Legendre polynomials onto a regular grid and then perform the FFT [PressRybicki1989]_, or
83 | 2. We can use the non-equispaced fast Fourier transform (NFFT) [DuttRokhlin1993]_, which is tailor-made for this exact problem.
84 | 
85 | The latter was shown by [Leroy2012]_ to give roughly an order-of-magnitude speed improvement over the [PressRybicki1989]_ method, with the added benefit that the NFFT is a rigorous extension of the FFT and has proven error bounds.
86 | 
87 | It is worth mentioning the [Townsend2010]_ CUDA implementation of Lomb-Scargle; however, it uses the "naive" :math:`\mathcal{O}(N_{\rm obs}N_f)` implementation
88 | of LS without any FFTs.
89 | 
90 | Estimating significance
91 | -----------------------
92 | 
93 | See [Baluev2008]_ for more information (TODO).
94 | 
95 | 
96 | Example: Basic
97 | --------------
98 | 
99 | .. plot::
100 |     :include-source:
101 | 
102 |     import skcuda.fft
103 |     import cuvarbase.lombscargle as gls
104 |     import numpy as np
105 |     import matplotlib.pyplot as plt
106 | 
107 | 
108 |     t = np.sort(np.random.rand(300))
109 |     y = 1 + np.cos(2 * np.pi * 100 * t - 0.1)
110 |     dy = 0.1 * np.ones_like(y)
111 |     y += dy * np.random.randn(len(t))
112 | 
113 |     # Set up LombScargleAsyncProcess (compilation, etc.)
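    # (instantiating the process object compiles the CUDA kernels and sets
    #  up the streams, so it pays to reuse one instance for many lightcurves)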
114 | proc = gls.LombScargleAsyncProcess() 115 | 116 | # Run on single lightcurve 117 | result = proc.run([(t, y, dy)]) 118 | 119 | # Synchronize all cuda streams 120 | proc.finish() 121 | 122 | # Read result! 123 | freqs, ls_power = result[0] 124 | 125 | ############ 126 | # Plotting # 127 | ############ 128 | 129 | f, ax = plt.subplots() 130 | ax.set_xscale('log') 131 | 132 | ax.plot(freqs, ls_power) 133 | ax.set_xlabel('Frequency') 134 | ax.set_ylabel('Lomb-Scargle') 135 | plt.show() 136 | 137 | Example: Batches of lightcurves 138 | ------------------------------- 139 | 140 | 141 | .. plot:: 142 | :include-source: 143 | 144 | import skcuda.fft 145 | import cuvarbase.lombscargle as gls 146 | import numpy as np 147 | import matplotlib.pyplot as plt 148 | 149 | nlcs = 9 150 | 151 | def lightcurve(freq=100, ndata=300): 152 | t = np.sort(np.random.rand(ndata)) 153 | y = 1 + np.cos(2 * np.pi * freq * t - 0.1) 154 | dy = 0.1 * np.ones_like(y) 155 | y += dy * np.random.randn(len(t)) 156 | return t, y, dy 157 | 158 | freqs = 200 * np.random.rand(nlcs) 159 | data = [lightcurve(freq=freq) for freq in freqs] 160 | 161 | # Set up LombScargleAsyncProcess (compilation, etc.) 162 | proc = gls.LombScargleAsyncProcess() 163 | 164 | # Run on batch of lightcurves 165 | results = proc.batched_run_const_nfreq(data) 166 | 167 | # Synchronize all cuda streams 168 | proc.finish() 169 | 170 | ############ 171 | # Plotting # 172 | ############ 173 | max_n_cols = 4 174 | ncols = max([1, min([int(np.sqrt(nlcs)), max_n_cols])]) 175 | nrows = int(np.ceil(float(nlcs) / ncols)) 176 | f, axes = plt.subplots(nrows, ncols, 177 | figsize=(3 * ncols, 3 * nrows)) 178 | 179 | for (frqs, ls_power), ax, freq in zip(results, 180 | np.ravel(axes), 181 | freqs): 182 | ax.set_xscale('log') 183 | ax.plot(frqs, ls_power) 184 | ax.axvline(freq, ls=':', color='r') 185 | 186 | f.text(0.05, 0.5, "Lomb-Scargle", rotation=90, 187 | va='center', ha='right', fontsize=20) 188 | f.text(0.5, 0.05, "Frequency", 189 | va='top', ha='center', fontsize=20) 190 | 191 | 192 | for i, ax in enumerate(np.ravel(axes)): 193 | if i >= nlcs: 194 | ax.axis('off') 195 | f.tight_layout() 196 | f.subplots_adjust(left=0.1, bottom=0.1) 197 | plt.show() 198 | 199 | 200 | .. [DuttRokhlin1993] `Dutt, A., & Rokhlin, V. 1993, SIAM J. Sci. Comput., 14(6), 1368–1393. `_ 201 | .. [PressRybicki1989] `Press, W. H., & Rybicki, G. B. 1989, ApJ, 338, 277 `_ 202 | .. [Baluev2008] `Baluev, R. V. 2008, MNRAS, 385, 1279 `_ 203 | .. [ZK2009] `Zechmeister, M., & Kürster, M. 2009, AAP, 496, 577 `_ 204 | .. [VanderPlas2017] `VanderPlas, J. T. 2017, arXiv:1703.09824 `_ 205 | .. [Leroy2012] `Leroy, B. 2012, AAP, 545, A50 `_ 206 | .. [Townsend2010] `Townsend, R. H. D. 2010, ApJS, 191, 247 `_ 207 | .. [Barning1963] `Barning, F. J. M. 1963, BAN, 17, 22 `_ 208 | .. [Vanicek1969] `Vaníček, P. 1969, APSS, 4, 387 `_ 209 | .. [Scargle1982] `Scargle, J. D. 1982, ApJ, 263, 835 `_ 210 | .. [Lomb1976] `Lomb, N. R. 1976, APSS, 39, 447 `_ -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | API documentation 2 | ================= 3 | 4 | .. 
toctree:: 5 | :maxdepth: 4 6 | 7 | cuvarbase 8 | -------------------------------------------------------------------------------- /docs/source/plots/benchmarks.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | from __future__ import print_function 4 | 5 | import sys 6 | import numpy as np 7 | from time import time 8 | import copy 9 | import matplotlib 10 | matplotlib.use('Agg') 11 | import matplotlib.pyplot as plt 12 | import pycuda.autoinit 13 | import pycuda.driver as cuda 14 | 15 | import cuvarbase.bls as bls 16 | import cuvarbase.ce as ce 17 | import cuvarbase.lombscargle as ls 18 | from astrobase.periodbase.kbls import _bls_runner as astrobase_bls 19 | from astropy.stats.lombscargle import LombScargle as AstropyLombScargle 20 | from tqdm import tqdm 21 | 22 | 23 | def get_freqs(baseline=5 * 365., fmin=None, 24 | fmax=(24 * 60.) / 30., samples_per_peak=5): 25 | 26 | df = 1. / baseline / samples_per_peak 27 | if fmin is None: 28 | fmin = 2./baseline 29 | 30 | nf = int(np.ceil((fmax - fmin) / df)) 31 | 32 | return fmin + df * np.arange(nf) 33 | 34 | 35 | def data(ndata, baseline=5 * 365.): 36 | t = baseline * np.sort(np.random.rand(ndata)) 37 | y = np.cos(2 * np.pi * t) 38 | dy = 0.1 * np.ones_like(t) 39 | 40 | y += dy * np.random.randn(len(t)) 41 | 42 | return t, y, dy 43 | 44 | def profile(func): 45 | def profiled_func(*args, **kwargs): 46 | cuda.start_profiler() 47 | func(*args, **kwargs) 48 | cuda.stop_profiler() 49 | #pycuda.autoinit.context.detach() 50 | sys.exit() 51 | return profiled_func 52 | 53 | def function_timer(func, nreps=3): 54 | def timed_func(*args, **kwargs): 55 | dts = [] 56 | for n in range(nreps): 57 | t0 = time() 58 | func(*args, **kwargs) 59 | dt = time() - t0 60 | dts.append(dt) 61 | return min(dts) 62 | 63 | return timed_func 64 | 65 | 66 | eebls_gpu = function_timer(bls.eebls_gpu) 67 | eebls_transit_gpu = function_timer(bls.eebls_transit_gpu) 68 | eebls_gpu_fast = function_timer(bls.eebls_gpu_fast) 69 | astrobase_bls = function_timer(astrobase_bls) 70 | 71 | _eebls_defaults = dict(qmin_fac=0.5, qmax_fac=2.0, dlogq=0.25, 72 | samples_per_peak=4, noverlap=2) 73 | 74 | 75 | def profile_cuvarbase_ce(t, y, dy, freqs, **kwargs): 76 | 77 | proc = ce.ConditionalEntropyAsyncProcess(**kwargs) 78 | proc.preallocate(len(t), freqs, **kwargs) 79 | run = profile(proc.run) 80 | 81 | run([(t, y, None)], freqs=freqs, **kwargs) 82 | 83 | return True 84 | 85 | def time_cuvarbase_ce_run(t, y, dy, freqs, **kwargs): 86 | proc = ce.ConditionalEntropyAsyncProcess(**kwargs) 87 | proc.preallocate(len(t), freqs, **kwargs) 88 | run = function_timer(proc.run) 89 | 90 | return run([(t, y, None)], freqs=freqs, **kwargs) 91 | 92 | 93 | def time_cuvarbase_bls(t, y, dy, freqs, qmin=1e-2, qmax=0.5, 94 | memory=None, pre_transfer=False, transit=False, 95 | use_fast=True, **kwargs): 96 | 97 | kw = copy.deepcopy(_eebls_defaults) 98 | kw.update(kwargs) 99 | kw['use_fast'] = use_fast 100 | 101 | if memory is None and not transit: 102 | memory = bls.BLSMemory.fromdata(t, y, dy, freqs=freqs, 103 | transfer=pre_transfer, 104 | qmin=qmin, qmax=qmax) 105 | 106 | if not transit and use_fast: 107 | return eebls_gpu_fast(t, y, dy, freqs, memory=memory, 108 | qmin=qmin, qmax=qmax, 109 | transfer_to_device=(not pre_transfer), 110 | **kw) 111 | if not transit: 112 | return eebls_gpu(t, y, dy, freqs, qmin=qmin, qmax=qmax, 113 | **kw) 114 | 115 | qvals = kwargs.get('qvals', None) 116 | if freqs is None: 117 | freqs, qvals = bls.transit_autofreq(t, 
**kw)
118 |     elif qvals is None:
119 |         qvals = bls.q_transit(freqs, **kw)
120 | 
121 |     return eebls_transit_gpu(t, y, dy, freqs=freqs, qvals=qvals, **kw)
122 | 
123 | 
124 | def time_astrobase_bls(t, y, dy, freqs, qmin=1e-2, qmax=0.5,
125 |                        **kwargs):
126 | 
127 |     nfreqs = len(freqs)
128 |     minfreq = min(freqs)
129 |     stepsize = freqs[1] - freqs[0]
130 |     nphasebins = int(np.ceil(1./qmin))
131 | 
132 |     args = (t, y)
133 |     args += (nfreqs, minfreq, stepsize, nphasebins, qmin, qmax)
134 |     return astrobase_bls(*args)
135 | 
136 | 
137 | def subset_data(t, y, dy, ndata):
138 |     inds = np.arange(1, len(t) - 1)
139 |     np.random.shuffle(inds)
140 | 
141 |     # keep both endpoints plus a random, time-ordered subset of the interior
142 |     subinds = np.concatenate(([0], np.sort(inds[:ndata - 2]), [len(t) - 1]))
143 |     return (arr[subinds] for arr in (t, y, dy))
144 | 
145 | 
146 | def time_group(task_dict, group_func, values):
147 |     times = {}
148 |     for name in task_dict.keys():
149 |         print(name)
150 |         dts = []
151 |         for v in tqdm(values):
152 |             dts.append((v, group_func(task_dict[name], v)))
153 |         times[name] = dts
154 |     return times
155 | 
156 | n0 = 1000
157 | ndatas = np.floor(np.logspace(1, 4.5, num=8)).astype(int)
158 | #nblocks = np.arange(1, 25)
159 | #nblocks = np.concatenate((nblocks, np.arange(nblocks[-1], 3000, 50)))
160 | nblocks = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 50, 100, 200, 500, 1000, 2000, 5000]
161 | freq_batch_sizes = [1, 5, 10, 50, 100, 500, 1000, 2000, 5000]
162 | 
163 | t, y, dy = data(max(ndatas), baseline=10. * 365)
164 | freqs_t, qvals_t = bls.transit_autofreq(t, fmin=0.01, **_eebls_defaults)
165 | t0, y0, dy0 = subset_data(t, y, dy, n0)
166 | 
167 | qmin = min(qvals_t)
168 | qmax = max(qvals_t)
169 | freqs = get_freqs(baseline=(max(t) - min(t)), samples_per_peak=4, fmin=0.01)
170 | 
171 | print(qmin, qmax, len(freqs_t), len(freqs))
172 | # profile_cuvarbase_ce(t0, y0, dy0, freqs=freqs, use_fast=True, force_nblocks=200)
173 | 
174 | 
175 | tasks = {
176 |     'BLS: cuvarbase (0.2.0)': lambda T, Y, DY, FREQS=freqs,
177 |     force_nblocks=1000, **kwargs:
178 |     time_cuvarbase_bls(T, Y, DY, FREQS, use_fast=True,
179 |                        force_nblocks=force_nblocks, **kwargs),
180 | 
181 |     'BLS: cuvarbase (0.2.0) -- transit': lambda T, Y, DY, FREQS=freqs,
182 |     force_nblocks=1000, **kwargs:
183 |     time_cuvarbase_bls(T, Y, DY, None, use_fast=True,
184 |                        force_nblocks=force_nblocks, transit=True,
185 |                        **kwargs),
186 | 
187 |     'BLS: cuvarbase (0.1.9)': lambda T, Y, DY, FREQS=freqs, **kwargs:
188 |     time_cuvarbase_bls(T, Y, DY, FREQS, use_fast=False, **kwargs),
189 | 
190 |     'BLS: astrobase': lambda T, Y, DY, FREQS=freqs, **kwargs:
191 |     time_astrobase_bls(T, Y, DY, FREQS, **kwargs),
192 | 
193 |     'CE: cuvarbase (0.1.9) 25-2-10-1': lambda T, Y, DY, FREQS=freqs,
194 |     use_fast=False, phase_bins=25, phase_overlap=2, mag_bins=10,
195 |     mag_overlap=1, use_double=False, **kwargs:
196 |     time_cuvarbase_ce_run(T, Y, DY, FREQS, use_fast=use_fast, **kwargs),
197 | 
198 |     'CE: cuvarbase (0.2.0) 25-2-10-1': lambda T, Y, DY, FREQS=freqs,
199 |     use_fast=True, phase_bins=25, phase_overlap=2, mag_bins=10,
200 |     mag_overlap=1, use_double=False, **kwargs:
201 |     time_cuvarbase_ce_run(T, Y, DY, FREQS, use_fast=use_fast, **kwargs)
202 | 
203 | 
204 | }
205 | 
206 | 
207 | 
208 | tasks_nblocks = {name: tasks[name] for name in ['BLS: cuvarbase (0.2.0)',
209 |                                                 'CE: cuvarbase (0.2.0) '
210 |                                                 '25-2-10-1']}
211 | 
212 | 
213 | def nblock_group_func(func, nblock):
214 |     return func(t0, y0, dy0, freqs, force_nblocks=nblock)
215 | 
216 | 
217 | def ndata_group_func(func, ndata):
218 |     T, Y, DY = subset_data(t, y, dy, ndata)
219 |     return func(T, Y, DY, freqs)
220 | 
221 | 
222 | def freq_batch_size_group_func(func, fbs):
223 |     return func(t0, y0, dy0, freqs, freq_batch_size=fbs)
224 | 
225 | 
226 | groups = {
227 |     'N observations': (tasks, ndata_group_func, ndatas),
228 |     'Grid size': (tasks_nblocks, nblock_group_func, nblocks),
229 |     'Frequencies per kernel call': (tasks_nblocks,
230 |                                     freq_batch_size_group_func,
231 |                                     freq_batch_sizes)
232 | }
233 | 
234 | dev = pycuda.autoinit.device
235 | attrs = dev.get_attributes()
236 | device_name = dev.name()
237 | 
238 | print(device_name)
239 | #print(len(freqs))
240 | #for attr in attrs.keys():
241 | #    print("{attr}: {value}".format(attr=attr, value=attrs[attr]))
242 | 
243 | group_times = {}
244 | for group in groups.keys():
245 |     print("="*len(group))
246 |     print(group)
247 |     print("="*len(group))
248 |     group_times[group] = time_group(*groups[group])
249 | 
250 | for group in group_times:
251 |     times = group_times[group]
252 | 
253 |     f, ax = plt.subplots()
254 |     for taskname in sorted(list(times.keys())):
255 |         values, dts = zip(*times[taskname])
256 |         ax.plot(values, dts, label=taskname)
257 | 
258 |     f.suptitle(device_name)
259 |     ax.set_xlabel(group)
260 |     ax.legend(loc='best')
261 |     ax.set_yscale('log')
262 |     ax.set_xscale('log')
263 | 
264 |     dev_str = device_name.replace(' ', '_')
265 |     group_str = group.replace(' ', '_')
266 |     fname = '{dev}-{group}.png'.format(dev=dev_str,
267 |                                        group=group_str)
268 | 
269 |     f.savefig(fname)
270 | 
271 | # plt.show()
272 | 
--------------------------------------------------------------------------------
/docs/source/plots/bls_example.py:
--------------------------------------------------------------------------------
1 | import cuvarbase.bls as bls
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | 
5 | 
6 | def phase(t, freq, phi0=0.):
7 |     phi = (t * freq - phi0)
8 |     phi -= np.floor(phi)
9 | 
10 |     return phi
11 | 
12 | 
13 | def transit_model(t, freq, y0=0.0, delta=1., q=0.01, phi0=0.5):
14 |     phi = phase(t, freq, phi0=phi0)
15 |     transit = phi < q
16 | 
17 |     y = y0 * np.ones_like(t)
18 |     y[transit] -= delta
19 |     return y
20 | 
21 | 
22 | def data(ndata=100, baseline=1, freq=10, sigma=1., **kwargs):
23 |     t = baseline * np.sort(np.random.rand(ndata))
24 |     y = transit_model(t, freq, **kwargs)
25 |     dy = sigma * np.ones_like(t)
26 | 
27 |     y += dy * np.random.randn(len(t))
28 | 
29 |     return t, y, dy
30 | 
31 | 
32 | def plot_bls_model(ax, y0, delta, q, phi0, **kwargs):
33 |     phi_plot = np.linspace(0, 1, int(np.ceil(50. / q)))  # num must be an integer
34 |     y_plot = transit_model(phi_plot, 1., y0=y0,
35 |                            delta=delta, q=q, phi0=phi0)
36 | 
37 |     ax.plot(phi_plot, y_plot, **kwargs)
38 | 
39 | 
40 | def plot_bls_sol(ax, t, y, dy, freq, q, phi0, **kwargs):
41 |     w = np.power(dy, -2)
42 |     w /= sum(w)
43 | 
44 |     phi = phase(t, freq, phi0=phi0)
45 |     transit = phi < q
46 | 
47 |     def ybar(mask):
48 |         return np.dot(w[mask], y[mask]) / sum(w[mask])
49 | 
50 |     y0 = ybar(~transit)
51 |     delta = y0 - ybar(transit)
52 | 
53 |     ax.scatter((phi[~transit] + phi0) % 1.0, y[~transit],
54 |                c='k', s=1, alpha=0.5)
55 |     ax.scatter((phi[transit] + phi0) % 1.0, y[transit],
56 |                c='r', s=1, alpha=0.5)
57 |     plot_bls_model(ax, y0, delta, q, phi0, **kwargs)
58 | 
59 |     ax.set_xlim(0, 1)
60 |     ax.set_xlabel('$\\phi$ ($f = %.3f$)' % (freq))
61 |     ax.set_ylabel('$y$')
62 | 
63 | # set the transit parameters
64 | transit_kwargs = dict(freq=0.1,
65 |                       q=0.1,
66 |                       y0=10.,
67 |                       sigma=0.002,
68 |                       delta=0.05,
69 |                       phi0=0.5)
70 | 
71 | # generate data with a transit
72 | t, y, dy = data(ndata=300,
73 |                 baseline=365.,
74 |                 **transit_kwargs)
75 | 
76 | # set up search parameters
77 | search_params = dict(qmin=1e-2,
78 |                      qmax=0.5,
79 | 
80 |                      # The logarithmic spacing of q
81 |                      dlogq=0.1,
82 | 
83 |                      # Number of overlapping phase bins
84 |                      # to use for finding the best phi0
85 |                      noverlap=3)
86 | 
87 | # derive baseline from the data for consistency
88 | baseline = max(t) - min(t)
89 | 
90 | # df ~ qmin / baseline
91 | df = search_params['qmin'] / baseline
92 | fmin = 2. / baseline
93 | fmax = 2.
94 | 
95 | nf = int(np.ceil((fmax - fmin) / df))
96 | freqs = fmin + df * np.arange(nf)
97 | 
98 | bls_power, sols = bls.eebls_gpu(t, y, dy, freqs,
99 |                                 **search_params)
100 | 
101 | # best BLS fit
102 | q_best, phi0_best = sols[np.argmax(bls_power)]
103 | f_best = freqs[np.argmax(bls_power)]
104 | 
105 | # Plot results
106 | f, (ax_bls, ax_true, ax_best) = plt.subplots(1, 3, figsize=(9, 3))
107 | 
108 | # Periodogram
109 | ax_bls.plot(freqs, bls_power)
110 | ax_bls.axvline(transit_kwargs['freq'],
111 |                ls=':', color='k', label="$f_0$")
112 | ax_bls.axvline(f_best, ls=':', color='r',
113 |                label='BLS $f_{\\rm best}$')
114 | ax_bls.set_xlabel('freq.')
115 | ax_bls.set_ylabel('BLS power')
116 | 
117 | # True solution
118 | plot_bls_sol(ax_true, t, y, dy,
119 |              transit_kwargs['freq'],
120 |              transit_kwargs['q'],
121 |              transit_kwargs['phi0'])
122 | 
123 | # Best-fit solution
124 | plot_bls_sol(ax_best, t, y, dy,
125 |              f_best, q_best, phi0_best)
126 | 
127 | 
128 | ax_true.set_title("True parameters")
129 | ax_best.set_title("Best BLS parameters")
130 | 
131 | f.tight_layout()
132 | plt.show()
133 | 
--------------------------------------------------------------------------------
/docs/source/plots/bls_example_transit.py:
--------------------------------------------------------------------------------
1 | import cuvarbase.bls as bls
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | 
5 | 
6 | def phase(t, freq, phi0=0.):
7 |     phi = (t * freq - phi0)
8 |     phi -= np.floor(phi)
9 | 
10 |     return phi
11 | 
12 | 
13 | def transit_model(t, freq, y0=0.0, delta=1., q=0.01, phi0=0.5):
14 |     phi = phase(t, freq, phi0=phi0)
15 |     transit = phi < q
16 | 
17 |     y = y0 * np.ones_like(t)
18 |     y[transit] -= delta
19 |     return y
20 | 
21 | 
22 | def data(ndata=100, baseline=1, freq=10, sigma=1., **kwargs):
23 |     t = baseline * np.sort(np.random.rand(ndata))
24 |     y = transit_model(t, freq, **kwargs)
25 |     dy = sigma * np.ones_like(t)
26 | 
27 |     y += dy * np.random.randn(len(t))
28 | 
29 |     return t, y, dy
30 | 
31 | 
32 | def plot_bls_model(ax, y0, delta, q, phi0, **kwargs):
33 |     phi_plot = np.linspace(0, 1, int(np.ceil(50. / q)))  # num must be an integer
34 |     y_plot = transit_model(phi_plot, 1., y0=y0,
35 |                            delta=delta, q=q, phi0=phi0)
36 | 
37 |     ax.plot(phi_plot, y_plot, **kwargs)
38 | 
39 | 
40 | def plot_bls_sol(ax, t, y, dy, freq, q, phi0, **kwargs):
41 |     w = np.power(dy, -2)
42 |     w /= sum(w)
43 | 
44 |     phi = phase(t, freq, phi0=phi0)
45 |     transit = phi < q
46 | 
47 |     def ybar(mask):
48 |         return np.dot(w[mask], y[mask]) / sum(w[mask])
49 | 
50 |     y0 = ybar(~transit)
51 |     delta = y0 - ybar(transit)
52 | 
53 |     ax.scatter((phi[~transit] + phi0) % 1.0, y[~transit],
54 |                c='k', s=1, alpha=0.5)
55 |     ax.scatter((phi[transit] + phi0) % 1.0, y[transit],
56 |                c='r', s=1, alpha=0.5)
57 |     plot_bls_model(ax, y0, delta, q, phi0, **kwargs)
58 | 
59 |     ax.set_xlim(0, 1)
60 |     ax.set_xlabel('$\\phi$ ($f = %.3f$)' % (freq))
61 |     ax.set_ylabel('$y$')
62 | 
63 | # the mean density of the host star in solar units
64 | # i.e. rho = rho_star / rho_sun
65 | rho = 1.
66 | 
67 | # set the transit parameters
68 | transit_kwargs = dict(freq=2.,
69 |                       q=bls.q_transit(2., rho=rho),
70 |                       y0=10.,
71 |                       sigma=0.005,
72 |                       delta=0.01,
73 |                       phi0=0.5)
74 | 
75 | # generate data with a transit
76 | t, y, dy = data(ndata=300,
77 |                 baseline=365.,
78 |                 **transit_kwargs)
79 | 
80 | # set up search parameters
81 | search_params = dict(
82 |     # Searches q values in the range
83 |     # (q0 * qmin_fac, q0 * qmax_fac)
84 |     # where q0 = q0(f, rho) is the fiducial
85 |     # q value for Keplerian transit around
86 |     # star with mean density rho
87 |     qmin_fac=0.5,
88 |     qmax_fac=2.0,
89 | 
90 |     # Assumed mean stellar density
91 |     rho=1.0,
92 | 
93 |     # The min/max frequencies as a fraction
94 |     # of their autoset values
95 |     fmin_fac=1.0,
96 |     fmax_fac=1.5,
97 | 
98 |     # oversampling factor; frequency spacing
99 |     # is multiplied by 1/samples_per_peak
100 |     samples_per_peak=2,
101 | 
102 |     # The logarithmic spacing of q
103 |     dlogq=0.1,
104 | 
105 |     # Number of overlapping phase bins
106 |     # to use for finding the best phi0
107 |     noverlap=3)
108 | 
109 | # Run Keplerian BLS; frequencies are automatically set!
110 | freqs, bls_power, sols = bls.eebls_transit_gpu(t, y, dy,
111 |                                                **search_params)
112 | 
113 | # best BLS fit
114 | q_best, phi0_best = sols[np.argmax(bls_power)]
115 | f_best = freqs[np.argmax(bls_power)]
116 | 
117 | # Plot results
118 | f, (ax_bls, ax_true, ax_best) = plt.subplots(1, 3, figsize=(9, 3))
119 | 
120 | # Periodogram
121 | ax_bls.plot(freqs, bls_power)
122 | ax_bls.axvline(transit_kwargs['freq'],
123 |                ls=':', color='k', label="$f_0$")
124 | ax_bls.axvline(f_best, ls=':', color='r',
125 |                label='BLS $f_{\\rm best}$')
126 | ax_bls.set_xlabel('freq.')
127 | ax_bls.set_ylabel('BLS power')
128 | ax_bls.set_xscale('log')
129 | 
130 | # True solution
131 | label_true = '$q=%.3f$, ' % (transit_kwargs['q'])
132 | label_true += '$\\phi_0=%.3f$' % (transit_kwargs['phi0'])
133 | plot_bls_sol(ax_true, t, y, dy,
134 |              transit_kwargs['freq'],
135 |              transit_kwargs['q'],
136 |              transit_kwargs['phi0'],
137 |              label=label_true)
138 | ax_true.legend(loc='best')
139 | 
140 | label_best = '$q=%.3f$, ' % (q_best)
141 | label_best += '$\\phi_0=%.3f$' % (phi0_best)
142 | # Best-fit solution
143 | plot_bls_sol(ax_best, t, y, dy,
144 |              f_best, q_best, phi0_best,
145 |              label=label_best)
146 | ax_best.legend(loc='best')
147 | 
148 | ax_true.set_title("True parameters")
149 | ax_best.set_title("Best BLS parameters")
150 | 
151 | f.tight_layout()
152 | plt.show()
153 | 
--------------------------------------------------------------------------------
/docs/source/plots/bls_transit_diagram.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | import numpy as np
3 | import cuvarbase.bls as bls
4 | 
5 | 
6 | def transit_model(phi0, q, delta, q1=0.):
7 |     def model(t, freq, q=q, phi0=phi0, delta=delta):
8 | 
9 |         phi = t * freq - phi0
10 |         phi -= np.floor(phi)
11 | 
12 |         if not hasattr(t, '__iter__'):
13 |             return -delta if np.absolute(phi) < q else 0
14 |         y = np.zeros(len(t))
15 |         y[np.absolute(phi) < q] -= delta
16 | 
17 |         return y
18 |     return model
19 | 
20 | 
21 | def plot_bls_sol(t, y, dy, freq, q, phi0):
22 | 
23 |     w = np.power(dy, -2)
24 |     w /= sum(w)
25 | 
26 |     phi_plot = np.linspace(0, 1, int(np.ceil(50. / q)))  # num must be an integer
27 | 
28 |     phi = (t * freq)
29 |     phi -= np.floor(phi)
30 | 
31 |     dphi = phi - phi0 - np.floor(phi - phi0)
32 |     mask = dphi < q
33 | 
34 |     ybt = np.dot(w[mask], y[mask]) / sum(w[mask])
35 |     yb0 = np.dot(w[~mask], y[~mask]) / sum(w[~mask])
36 | 
37 |     delta = yb0 - ybt
38 | 
39 |     model = transit_model(phi0, q, delta)
40 | 
41 |     ym = model(phi_plot, 1.) + yb0
42 | 
43 |     f, ax = plt.subplots()
44 | 
45 |     ax.scatter(phi[~mask], y[~mask], c='k', s=1, alpha=0.4)
46 |     ax.scatter(phi[mask], y[mask], c='g', s=1, alpha=0.8)
47 |     ax.plot(phi_plot, ym, color='r')
48 |     ax.axvline(phi0, color='k', ls=':')
49 |     # ax.axvline(phi0 + q, color='k', ls=':')
50 | 
51 |     ax.axis('off')
52 | 
53 |     ax.annotate('$\\delta$', xy=(phi0 - 0.03, -0.5 * delta), xytext=(-5, 0),
54 |                 textcoords='offset points', ha='right', va='center',
55 |                 fontsize=20)
56 | 
57 |     ax.plot([phi0 - 0.03, phi0 - 0.03], [-delta, -0.03 * delta], ls='--',
58 |             color='k')
59 | 
60 |     ax.plot([phi0, phi0 + q], [-1.03 * delta, -1.03 * delta], ls='--',
61 |             color='k')
62 |     ax.annotate('$q$', xy=(phi0 + 0.5 * q, -1.03 * delta), xytext=(0, -5),
63 |                 textcoords='offset points', ha='center', va='top',
64 |                 fontsize=20, transform=ax.transData)
65 | 
66 |     ax.annotate('$\\phi_0$', xy=(phi0, 0), xytext=(5, 5),
67 |                 textcoords='offset points', ha='left', va='bottom',
68 |                 fontsize=20, transform=ax.transData)
69 | 
70 |     ax.annotate('$y_0$', xy=(0.05, 0), xytext=(5, 5),
71 |                 textcoords='offset points', ha='left', va='bottom',
72 |                 fontsize=20, transform=ax.transData)
73 |     plt.show()
74 | 
75 | model = transit_model(0.5, 0.1, 0.1)
76 | t = np.sort(np.random.rand(200))
77 | y = model(t, 10.)
78 | dy = 0.01 * np.ones_like(y)
79 | 
80 | y += dy * np.random.randn(len(t))
81 | 
82 | plot_bls_sol(t, y, dy, 10., 0.1, 0.5)
83 | 
--------------------------------------------------------------------------------
/docs/source/plots/ce_example.py:
--------------------------------------------------------------------------------
1 | import cuvarbase.ce as ce
2 | import numpy as np
3 | import matplotlib.pyplot as plt
4 | from copy import copy
5 | 
6 | 
7 | def phase(t, freq, phi0=0.):
8 |     phi = (t * freq - phi0)
9 |     phi -= np.floor(phi)
10 | 
11 |     return phi
12 | 
13 | 
14 | def sine_model(t, freq, amp=1., y0=0.0, phi0=0.5):
15 |     return y0 + amp * np.sin((t * freq - phi0) * 2 * np.pi)
16 | 
17 | 
18 | def transit_model(t, freq, y0=0.0, delta=1., q=0.01, phi0=0.5):
19 |     phi = phase(t, freq, phi0=phi0)
20 |     transit = phi < q
21 | 
22 |     y = y0 * np.ones_like(t)
23 |     y[transit] -= delta
24 |     return y
25 | 
26 | 
27 | def data(ndata=100, baseline=1, freq=10, sigma=1.,
28 |          model=transit_model, **kwargs):
29 |     t = baseline * np.sort(np.random.rand(ndata))
30 |     y = model(t, freq, **kwargs)
31 |     dy = sigma * np.ones_like(t)
32 | 
33 |     y += dy * np.random.randn(len(t))
34 | 
35 |     return t, y, dy
36 | 
37 | 
38 | def plot_ce_bins(ax, t, y, dy, freq, ce_proc):
39 |     ax.set_xlim(0, 1)
40 | 
41 |     y0 = min(y)
42 |     yrange = max(y) - y0
43 | 
44 |     # Phase-fold the data at the trial frequency
45 |     phi = phase(t, freq)
46 | 
47 |     # Bin the data
48 |     phi_bins = np.floor(phi * ce_proc.phase_bins).astype(int)
49 | 
50 |     yi = ce_proc.mag_bins * (y - y0) / yrange
51 |     mag_bins = np.floor(yi).astype(int)
52 | 
53 |     bins = [[sum((phi_bins == i) & (mag_bins == j))
54 |              for j in range(ce_proc.mag_bins)]
55 |             for i in range(ce_proc.phase_bins)]
56 |     bins = np.array(bins).astype(float)
57 | 
58 |     # Convert to N(bin) / Ntotal
59 |     bins /= np.sum(bins.ravel())
60 | 
61 |     # The fraction of data that fall within a given phase bin
62 |     p_phi = [np.sum(bins[i]) for i in range(ce_proc.phase_bins)]
63 | 
64 |     # fractional width of the (magnitude) bins
65 |     dm = float(1 + ce_proc.mag_overlap) / ce_proc.mag_bins
66 |     dphi = float(1 + ce_proc.phase_overlap) / ce_proc.phase_bins
67 |     dY = yrange * dm
68 | 
69 |     # Compute conditional entropy contribution from each of the bins
70 |     dH = [[bins[i][j] * np.log(dm * p_phi[i] / bins[i][j])
71 |            if bins[i][j] > 0 else 0.
72 |            for j in range(ce_proc.mag_bins)]
73 |           for i in range(ce_proc.phase_bins)]
74 | 
75 |     dH = np.array(dH)
76 | 
77 |     extent = [0, 1, min(y), max(y)]
78 | 
79 |     # Mask out the unoccupied bins
80 |     dH = np.ma.masked_where(dH == 0, dH)
81 | 
82 |     palette = copy(plt.cm.GnBu_r)
83 |     palette.set_bad('w', 0.)
84 | 
85 |     # Draw gridlines
86 |     for i in range(ce_proc.phase_bins + 1):
87 |         ax.axvline(0 + i * dphi, ls=':', color='k',
88 |                    alpha=0.5, zorder=95)
89 | 
90 |     for i in range(ce_proc.mag_bins + 1):
91 |         ax.axhline(min(y) + i * dY, ls=':', color='k',
92 |                    alpha=0.5, zorder=95)
93 | 
94 |     # Plot the conditional entropy
95 |     cplot = ax.imshow(dH.T, cmap=palette, extent=extent,
96 |                       aspect='auto', origin='lower',
97 |                       alpha=0.5, zorder=90)
98 | 
99 |     # Plot the data
100 |     ax.scatter(phi, y, c='k', s=1, alpha=1, zorder=100)
101 | 
102 |     return cplot
103 | 
104 | # Set up the signal parameters
105 | freq = 0.1
106 | signal_params = dict(y0=10.,
107 |                      freq=freq,
108 |                      sigma=0.01,
109 |                      ndata=100,
110 |                      baseline=365.,
111 |                      amp=0.1,
112 |                      phi0=0.,
113 |                      model=sine_model)
114 | 
115 | # Generate data
116 | t, y, dy = data(**signal_params)
117 | 
118 | # Start GPU process for conditional entropy
119 | # (this does things like compiling the kernel,
120 | # setting parameter values, etc.)
121 | proc = ce.ConditionalEntropyAsyncProcess()
122 | 
123 | # Set frequencies
124 | df = 1. / (2 * signal_params['baseline'])
125 | fmin = 2. / signal_params['baseline']
126 | fmax = 50 * len(t) * df
127 | 
128 | nf = int(np.ceil((fmax - fmin) / df))
129 | freqs = fmin + df * np.arange(nf)
130 | 
131 | ####################
132 | # Run the process! #
133 | ####################
134 | 
135 | # Data is sent in list of tuples (in case we want
136 | # to run CE on more than one lightcurve)
137 | data = [(t, y, dy)]
138 | 
139 | # The large_run function is an alternative to the run
140 | # function if the frequency grid & binning array is too
141 | # large to fit in GPU memory.
142 | try:
143 |     results = proc.run(data, freqs=freqs)
144 | except Exception:  # e.g. a GPU memory allocation error
145 |     results = proc.large_run(data, freqs=freqs, max_memory=1e8)
146 | 
147 | proc.finish()
148 | 
149 | # The results come back as [(freqs, CE), ...] for
150 | # each element of the data list. In this case, there is only
151 | # one lightcurve.
152 | frq, p = results[0]
153 | 
154 | # Find the best frequency (that *minimizes* the conditional entropy)
155 | f_best = frq[np.argmin(p)]
156 | 
157 | 
158 | #####################
159 | # Plot the results!
# 160 | ##################### 161 | 162 | f, (ax_ce, ax_bin) = plt.subplots(1, 2, figsize=(8, 4)) 163 | ax_ce.plot(frq, p) 164 | ax_ce.set_xlabel('freq.', fontsize=15) 165 | ax_ce.set_ylabel('Conditional Entropy ($H(f)$)', fontsize=15) 166 | ax_ce.set_xscale('log') 167 | ax_ce.axvline(freq, color='k', ls=':') 168 | ax_ce.axvline(f_best, color='r', ls=':') 169 | 170 | cplot = plot_ce_bins(ax_bin, t, y, dy, freq, proc) 171 | cbar = f.colorbar(cplot) 172 | cbar.ax.set_title('$H(\\phi, m)$') 173 | ax_bin.set_xlabel('$\\phi$', fontsize=15) 174 | ax_bin.set_ylabel('$m$', fontsize=15) 175 | ax_bin.set_title('$f = {\\rm argmin}_{f}(H(f))$') 176 | 177 | f.tight_layout() 178 | plt.show() 179 | -------------------------------------------------------------------------------- /docs/source/plots/logo.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import cuvarbase.lombscargle as ls 4 | 5 | rand = np.random.RandomState(100) 6 | freq = 40 7 | def data(ndata=100, freq=freq, sigma=0.4): 8 | t = np.sort(rand.rand(ndata)) 9 | y = sum([np.cos(2 * np.pi * n * freq * t - n) / np.sqrt(abs(n - 2) + 1) for n in range(4)]) 10 | dy = sigma * np.ones_like(t) 11 | 12 | y += dy * rand.randn(ndata) 13 | 14 | return t, y, dy 15 | 16 | t, y, dy = data() 17 | data = [(t, y, dy)] 18 | proc = ls.LombScargleAsyncProcess() 19 | result = proc.run(data, minimum_frequency=10, maximum_frequency=150) 20 | proc.finish() 21 | 22 | frq, p = result[0] 23 | 24 | mask = np.absolute(frq - freq) / freq < 0.02 25 | 26 | f, ax = plt.subplots(figsize=(3, 3)) 27 | 28 | phi = (t * freq) % 2.0 29 | 30 | #ax.plot(frq[~mask], p[~mask], color='k', lw=2, zorder=10) 31 | #ax.plot(frq[mask], p[mask], color='r', lw=2, zorder=11) 32 | ax.plot(frq, p, color='0.6', lw=2) 33 | 34 | for n in range(1, 4): 35 | mask = np.absolute(frq - n * freq) / freq < 1e-1 36 | ax.plot(frq[mask], p[mask]) 37 | 38 | ax.set_xlim(min(frq), max(frq)) 39 | xmin, xmax = ax.get_xlim() 40 | ymin, ymax = ax.get_ylim() 41 | yrange = max(y) - min(y) 42 | ys = (ymax - ymin) * (y - min(y)) / yrange 43 | 44 | #ax.scatter(0.5 * phi * (xmax - xmin), ys, s=2, c='k', alpha=0.2) 45 | ax.axis('off') 46 | #ax.axvline(freq, ls=':', color='r') 47 | f.subplots_adjust(left=0, top=1, bottom=0, right=1) 48 | f.savefig('../logo.png') 49 | #plt.show() 50 | -------------------------------------------------------------------------------- /docs/source/plots/planet_transit_diagram.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import matplotlib.patches as mpatches 3 | import numpy as np 4 | 5 | 6 | def theta_entry(a_rs=3., rp_rs=0.5): 7 | return -np.arcsin((1 + rp_rs) / a_rs) 8 | 9 | 10 | def circ_pos(center, theta, r): 11 | x, y = center 12 | y1 = y - r * np.cos(theta) 13 | x1 = x + r * np.sin(theta) 14 | 15 | return (x1, y1) 16 | 17 | 18 | def draw_system(ax, a_rs=3., rp_rs=0.5, theta=None, 19 | rs=0.1, xs=(0.5, 0.5), label_radii=True, 20 | draw_planet=True, draw_sun=True, 21 | planet_circle_kwargs=dict(color='0.7'), 22 | sun_circle_kwargs=dict(color='r', alpha=0.5), 23 | draw_a=True, a_kwargs=dict(color='k', ls='-'), 24 | label_a=True): 25 | if theta is None: 26 | theta = theta_entry(a_rs=a_rs, rp_rs=rp_rs) 27 | 28 | xp = circ_pos(xs, theta, a_rs * rs) 29 | 30 | if draw_planet: 31 | planet = plt.Circle(xp, rp_rs * rs, **planet_circle_kwargs) 32 | ax.add_artist(planet) 33 | if draw_sun: 34 | star = plt.Circle(xs, rs, 
**sun_circle_kwargs) 35 | ax.add_artist(star) 36 | 37 | ax.plot([xs[0] - rs, xs[0] - rs], [0, xs[1]], color='k', ls=':') 38 | ax.plot([xs[0] + rs, xs[0] + rs], [0, xs[1]], color='k', ls=':') 39 | 40 | if draw_a: 41 | ax.plot(*zip(xs, xp), **a_kwargs) 42 | 43 | if draw_a and label_a: 44 | atext_xy = tuple(0.5 * (np.array(xs) + np.array(xp))) 45 | 46 | acoords = (-5 * np.cos(theta), -5 * np.sin(theta)) 47 | ax.annotate("$a$", xy=atext_xy, xytext=acoords, 48 | textcoords='offset points', xycoords='data', 49 | ha='right', va='bottom' if theta < 0 else 'top', 50 | fontsize=20) 51 | 52 | if label_radii: 53 | ax.plot([xs[0], xs[0] + rs], [xs[1], xs[1]], ls='--', color='k') 54 | ax.annotate("$R_{\\star}$", xy=(xs[0] + 0.5 * rs, xs[1]), 55 | xytext=(0, 3), 56 | textcoords='offset points', xycoords='data', 57 | ha='center', va='bottom', fontsize=20) 58 | 59 | ax.plot([xp[0], xp[0] - rs * rp_rs], 60 | [xp[1], xp[1]], ls='--', color='k') 61 | ax.annotate("$R_p$", xy=(xp[0] - 0.5 * rs * rp_rs, xp[1]), 62 | xytext=(-5, -5), 63 | textcoords='offset points', xycoords='data', 64 | ha='right', va='top', fontsize=20) 65 | 66 | f, ax = plt.subplots() 67 | 68 | x0 = (0.5, 0.8) 69 | theta = -theta_entry() 70 | rs = 0.2 71 | 72 | draw_system(ax, theta=-theta, rs=rs, xs=x0) 73 | 74 | draw_system(ax, theta=theta, rs=rs, xs=x0, 75 | draw_sun=False, label_radii=False, label_a=False) 76 | 77 | arc_rad = 1.2 * rs 78 | arc = mpatches.Arc(x0, 2 * arc_rad, 2 * arc_rad, 79 | theta1=np.degrees(-np.pi/2 - theta), 80 | theta2=np.degrees(-np.pi/2 + theta)) 81 | 82 | arc2 = mpatches.Arc(x0, 6 * rs, 6 * rs, 83 | theta1=np.degrees(-np.pi/2 - 2 * theta), 84 | theta2=np.degrees(-np.pi/2 + 2 * theta), 85 | ls='--', color='k') 86 | 87 | ax.add_patch(arc) 88 | ax.add_patch(arc2) 89 | ax.annotate('$\\theta$', xy=(x0[0], x0[1] - arc_rad), xytext=(0, -5), 90 | textcoords='offset points', fontsize=20, va='top', ha='center') 91 | 92 | 93 | ax.axis('off') 94 | ax.set_aspect('equal', 'datalim') 95 | 96 | ax.set_xlim(0, 1) 97 | ax.set_ylim(0, 1) 98 | plt.show() 99 | -------------------------------------------------------------------------------- /docs/source/whatsnew.rst: -------------------------------------------------------------------------------- 1 | .. 
include:: ../../CHANGELOG.rst 2 | -------------------------------------------------------------------------------- /notebooks/Conditional entropy.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Conditional Entropy period finder\n", 8 | "\n" 9 | ] 10 | } 11 | ], 12 | "metadata": { 13 | "kernelspec": { 14 | "display_name": "Python 2", 15 | "language": "python", 16 | "name": "python2" 17 | }, 18 | "language_info": { 19 | "codemirror_mode": { 20 | "name": "ipython", 21 | "version": 2 22 | }, 23 | "file_extension": ".py", 24 | "mimetype": "text/x-python", 25 | "name": "python", 26 | "nbconvert_exporter": "python", 27 | "pygments_lexer": "ipython2", 28 | "version": "2.7.13" 29 | } 30 | }, 31 | "nbformat": 4, 32 | "nbformat_minor": 2 33 | } 34 | -------------------------------------------------------------------------------- /notebooks/PDM2_bin.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnh2o2/cuvarbase/0d97ae11bea01fdfb71cfbe15059979ebfe37373/notebooks/PDM2_bin.jpg -------------------------------------------------------------------------------- /notebooks/PDM2_binless_gauss.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnh2o2/cuvarbase/0d97ae11bea01fdfb71cfbe15059979ebfe37373/notebooks/PDM2_binless_gauss.jpg -------------------------------------------------------------------------------- /notebooks/PDM2_binless_tophat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnh2o2/cuvarbase/0d97ae11bea01fdfb71cfbe15059979ebfe37373/notebooks/PDM2_binless_tophat.jpg -------------------------------------------------------------------------------- /notebooks/PDM_bin.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/johnh2o2/cuvarbase/0d97ae11bea01fdfb71cfbe15059979ebfe37373/notebooks/PDM_bin.jpg -------------------------------------------------------------------------------- /publish_docs.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # A hack-ish way to automate the document publishing process for 4 | # github pages. 5 | # 6 | # This won't work if you're not @johnh2o2 on Github. 7 | # 8 | # To build docs locally 9 | # --------------------- 10 | # Just ``cd docs && make html``. Then open docs/build/html/index.html. 11 | set -x 12 | 13 | DOC_BRANCH=master 14 | NEEDED="cuvarbase docs/Makefile docs/source README.rst INSTALL.rst CHANGELOG.rst" 15 | 16 | # We need to grab hidden files with mv... 17 | shopt -s dotglob nullglob 18 | 19 | # Create gh-pages branch if one doesn't already exist. 20 | HAS_GH_BRANCH=`git branch | grep gh-pages` 21 | if [ "$HAS_GH_BRANCH" == "" ]; then 22 | echo "Did not detect gh-pages branch. Creating now." 23 | git checkout -b gh-pages || exit 1 24 | else 25 | git checkout gh-pages || exit 1 26 | fi 27 | 28 | # update 29 | git pull origin gh-pages 30 | 31 | # clean out 32 | git rm -rf . 33 | 34 | # checkout the files we need for the documentation 35 | git checkout $DOC_BRANCH $NEEDED 36 | git reset HEAD 37 | 38 | # make docs 39 | cd docs 40 | make html || exit 1 41 | cd .. 
42 | 
43 | # move content to parent directory
44 | mv docs/build/html/* ./
45 | 
46 | # remove unneeded files
47 | rm -rf $NEEDED docs
48 | 
49 | # update the repo
50 | git add --all
51 | git commit -m "Updating docs"
52 | git push -u origin gh-pages
53 | 
54 | # go home
55 | git checkout $DOC_BRANCH
56 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | future
2 | numpy >= 1.6
3 | scipy
4 | pycuda >= 2017.1.1, != 2024.1.2
5 | scikit-cuda
6 | -e .
7 | 
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [bdist_wheel]
2 | universal=1
3 | 
4 | [aliases]
5 | test=pytest
6 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | 
3 | import io
4 | import os
5 | import re
6 | 
7 | try:
8 |     from setuptools import setup
9 | except ImportError:
10 |     from distutils.core import setup
11 | 
12 | 
13 | def read(path, encoding='utf-8'):
14 |     path = os.path.join(os.path.dirname(__file__), path)
15 |     with io.open(path, encoding=encoding) as fp:
16 |         return fp.read()
17 | 
18 | 
19 | def version(path):
20 |     """Obtain the package version from a Python file, e.g. pkg/__init__.py
21 | 
22 |     See .
23 |     """
24 |     version_file = read(path)
25 |     version_match = re.search(r"""^__version__ = ['"]([^'"]*)['"]""",
26 |                               version_file, re.M)
27 |     if version_match:
28 |         return version_match.group(1)
29 |     raise RuntimeError("Unable to find version string.")
30 | 
31 | 
32 | VERSION = version('cuvarbase/__init__.py')
33 | 
34 | setup(name='cuvarbase',
35 |       version=VERSION,
36 |       description="Period-finding and variability on the GPU",
37 |       author='John Hoffman',
38 |       author_email='johnh2o2@gmail.com',
39 |       packages=['cuvarbase',
40 |                 'cuvarbase.tests'],
41 |       package_data={'cuvarbase': ['kernels/*cu']},
42 |       url='https://github.com/johnh2o2/cuvarbase',
43 |       setup_requires=['pytest-runner', 'future'],
44 |       install_requires=['future',
45 |                         'numpy>=1.6',
46 |                         'scipy',
47 |                         'pycuda>=2017.1.1,!=2024.1.2',
48 |                         'scikit-cuda'],
49 |       tests_require=['pytest',
50 |                      'future',
51 |                      'nfft',
52 |                      'matplotlib',
53 |                      'astropy'],
54 |       classifiers=[
55 |           'Development Status :: 4 - Beta',
56 |           'Environment :: Console',
57 |           'Intended Audience :: Science/Research',
58 |           'License :: OSI Approved :: GNU General Public License v3 (GPLv3)',
59 |           'Natural Language :: English',
60 |           'Programming Language :: Python :: 2.7',
61 |           'Programming Language :: Python :: 3.4',
62 |           'Programming Language :: Python :: 3.5',
63 |           'Programming Language :: Python :: 3.6',
64 |           'Programming Language :: C',
65 |           'Programming Language :: C++'])
66 | 
--------------------------------------------------------------------------------
/test_python_versions.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # Very rough script for testing cuvarbase compatibility across python
4 | # versions
5 | #
6 | # (c) John Hoffman
7 | #
8 | # Run this from the top-level cuvarbase directory
9 | 
10 | 
11 | # Print everything you do.
12 | set -x 13 | 14 | # Decide which python version to test 15 | PYTHON_VERSION=2.7 16 | 17 | # Put your cuda installation directory here 18 | export CUDA_ROOT=/usr/local/cuda 19 | 20 | ######################################################################## 21 | CONDA_ENVIRONMENT_NAME=cuvar 22 | CUVARBASE_DIR=$PWD 23 | 24 | # Export the library paths 25 | export LD_LIBRARY_PATH="${CUDA_ROOT}/lib:${LD_LIBRARY_PATH}" 26 | export DYLD_LIBRARY_PATH="${CUDA_ROOT}/lib:${DYLD_LIBRARY_PATH}" 27 | export PATH="${CUDA_ROOT}/bin:${PATH}" 28 | 29 | # Erase the testing conda environment if it already exists 30 | test_str=`conda info --envs | grep ${CONDA_ENVIRONMENT_NAME}` 31 | if [ "$test_str" != "" ]; then 32 | echo "removing conda environment ${CONDA_ENVIRONMENT_NAME}" 33 | conda remove -y --name ${CONDA_ENVIRONMENT_NAME} --all 34 | fi 35 | 36 | # Create the conda environment for testing with the right Python version 37 | conda create -y -n $CONDA_ENVIRONMENT_NAME python=$PYTHON_VERSION numpy 38 | 39 | # Activate the conda environment 40 | source activate $CONDA_ENVIRONMENT_NAME 41 | 42 | cd $CUVARBASE_DIR 43 | 44 | # Install from the present directory, ignoring caches 45 | pip install --no-cache-dir -e . 46 | 47 | # test 48 | python setup.py test 49 | 50 | # (optionally) clean up conda environment 51 | #source deactivate 52 | #conda remove -y --name $CONDA_ENVIRONMENT_NAME --all 53 | --------------------------------------------------------------------------------