├── .gitignore
├── .readthedocs.yaml
├── LICENSE.md
├── README.md
├── TARDIS.svg
├── data
│   ├── Pinched_torus.txt.gz
│   └── Wedged_spheres_2D.txt.gz
├── docs
│   ├── .gitignore
│   ├── Makefile
│   ├── make.bat
│   └── source
│       ├── api.rst
│       ├── conf.py
│       ├── data.rst
│       ├── index.rst
│       └── utils.rst
├── examples
│   └── simple_numpy.py
├── output
│   ├── Gudhi_Computational_Effort.txt
│   └── Ripser_Computational_Effort.txt
├── poetry.lock
├── pyproject.toml
├── scripts
│   ├── es.sh
│   ├── ipsc.sh
│   ├── pinched_torus.sh
│   ├── vision.sh
│   └── wedged_spheres.sh
└── tardis
    ├── __init__.py
    ├── analyse_euclidicity.py
    ├── api.py
    ├── cli.py
    ├── data.py
    ├── euclidicity.py
    ├── make_pinched_torus.py
    ├── make_wedged_spheres.py
    ├── make_wedged_spheres_varying_dim.py
    ├── mat_to_npy.py
    ├── persistent_homology.py
    ├── shapes.py
    ├── utils.py
    └── visualise_data.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *~
2 | *.swp
3 |
4 | data/
5 |
6 | # Using this for `pyenv` and related tools. It can be ignored since we
7 | # do not enforce a specific version in practice.
8 | .python-version
9 |
10 | __pycache__/
11 |
--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | build:
4 |   # Required to get access to more recent Python versions.
5 |   image: testing
6 |
7 | sphinx:
8 |   configuration: docs/source/conf.py
9 |
10 | python:
11 |   version: 3.9
12 |   install:
13 |     - method: pip
14 |       path: .
15 |       extra_requirements:
16 |         - docs
17 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | Copyright (c) 2022 Julius von Rohrscheidt and Bastian Rieck
2 |
3 | Redistribution and use in source and binary forms, with or without
4 | modification, are permitted provided that the following conditions are
5 | met:
6 |
7 | 1. Redistributions of source code must retain the above copyright
8 | notice, this list of conditions and the following disclaimer.
9 |
10 | 2. Redistributions in binary form must reproduce the above copyright
11 | notice, this list of conditions and the following disclaimer in the
12 | documentation and/or other materials provided with the distribution.
13 |
14 | 3. Neither the name of the copyright holder nor the names of its
15 | contributors may be used to endorse or promote products derived from
16 | this software without specific prior written permission.
17 |
18 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
19 | IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
20 | TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
24 | TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
25 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
26 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
27 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # TARDIS: Topological Algorithms for Robust DIscovery of Singularities
2 |
3 | [arXiv](https://arxiv.org/abs/2210.00069) [Maintainability](https://codeclimate.com/github/aidos-lab/TARDIS/maintainability)
4 |
5 | ![TARDIS logo](./TARDIS.svg)
6 |
7 | This is the code for our [ICML paper on topology-driven singularity analysis](https://proceedings.mlr.press/v202/von-rohrscheidt23a.html):
8 |
9 | ```bibtex
10 | @inproceedings{vonRohrscheidt23a,
11 |   title     = {Topological Singularity Detection at Multiple Scales},
12 |   author    = {von Rohrscheidt, Julius and Rieck, Bastian},
13 |   year      = 2023,
14 |   booktitle = {Proceedings of the 40th International Conference on Machine Learning},
15 |   publisher = {PMLR},
16 |   series    = {Proceedings of Machine Learning Research},
17 |   number    = 202,
18 |   pages     = {35175--35197},
19 |   editor    = {Krause, Andreas and Brunskill, Emma and Cho, Kyunghyun and Engelhardt, Barbara and Sabato, Sivan and Scarlett, Jonathan},
20 |   abstract  = {The manifold hypothesis, which assumes that data lies on or close to an unknown manifold of low intrinsic dimension, is a staple of modern machine learning research. However, recent work has shown that real-world data exhibits distinct non-manifold structures, i.e. singularities, that can lead to erroneous findings. Detecting such singularities is therefore crucial as a precursor to interpolation and inference tasks. We address this issue by developing a topological framework that (i) quantifies the local intrinsic dimension, and (ii) yields a Euclidicity score for assessing the `manifoldness' of a point along multiple scales. Our approach identifies singularities of complex spaces, while also capturing singular structures and local geometric complexity in image data.}
21 | }
22 | ```
23 |
24 | ## Installation
25 |
26 | Our code has been tested with Python 3.8 and Python 3.9 under Mac OS
27 | X and Linux. Other Python versions *may* not support all dependencies.
28 |
29 | The recommended way to install the project is via [`poetry`](https://python-poetry.org/).
30 | If this is available, installation should work very quickly:
31 |
32 |     $ poetry install
33 |
34 | Recent versions of `pip` should also be capable of installing the
35 | project directly:
36 |
37 |     $ pip install .
38 |
39 | ## Experiments
40 |
41 | To reproduce the main experiments in our paper, we ship synthetic data
42 | sets in the repository and offer the automated capability to download
43 | the computer vision data sets (`MNIST` and `FashionMNIST`). For
44 | simplicity, we suggest reproducing the experiments with synthetic
45 | point clouds first, as they run quickly even on a standard desktop
46 | computer.
47 |
48 | All experiments make use of the script `cli.py`, which provides
49 | a command-line interface to our framework. Given input parameters for
50 | the local annuli, this script will calculate Euclidicity values as
51 | described in the paper. For simplicity, all output is written to
52 | `stdout`, i.e. the standard output of your terminal, and needs to be
53 | redirected to a file for subsequent analysis.
54 |
55 | The following sections provide the precise commands to reproduce the
56 | experiments; readers are invited to take a look at the code in `cli.py`
57 | or to call `python cli.py --help` to see what additional options are
58 | available for processing data.
59 |
60 | ### Pinched torus
61 |
62 | Run the following commands from the root directory of the repository:
63 |
64 |     $ cd tardis
65 |     $ python cli.py ../data/Pinched_torus.txt.gz -q 500 -r 0.05 -R 0.45 -s 0.2 -S 0.6 > ../output/Pinched_torus.txt
66 |
67 | This will calculate Euclidicity values for 500 query points; each output
68 | row contains the $x, y, z$ coordinates of a point, followed by its Euclidicity score.
69 |
70 | ### Wedged spheres (with automated parameter selection)
71 |
72 | **Warning**: this example might require a long runtime on an ordinary
73 | machine. We ran this on our cluster (see also the [`scripts`](https://github.com/aidos-lab/TARDIS/tree/main/scripts)
74 | folder in the root directory).
75 |
76 | Run the following commands from the root directory of the repository:
77 |
78 |     $ cd tardis
79 |     $ python cli.py -k 100 -q 2000 -d 2 --num-steps 20 ../data/Wedged_spheres_2D.txt.gz > ../output/Wedged_spheres_2D.txt
80 |
81 | This will make use of the automated parameter selection procedure based
82 | on nearest neighbours. Notice that this example uses more query
83 | points; it is of course possible to adjust this parameter.
84 |
85 | ## API & examples
86 |
87 | Check out the [examples folder](https://github.com/aidos-lab/TARDIS/tree/main/examples) for some code snippets that
88 | demonstrate how to use TARDIS in your own code. They all make use of the
89 | [preliminary API](https://github.com/aidos-lab/TARDIS/blob/main/tardis/api.py).
90 |
91 | ## License
92 |
93 | Our code is released under a BSD-3-Clause license. This license
94 | essentially permits you to freely use our code as desired, integrate it
95 | into your projects, and much more---provided you acknowledge the
96 | original authors. Please refer to [LICENSE.md](./LICENSE.md) for more
97 | information.
98 |
99 | ## Issues
100 |
101 | This project is maintained by members of the [AIDOS Lab](https://github.com/aidos-lab).
102 | Please open an [issue](https://github.com/aidos-lab/TARDIS/issues) in
103 | case you encounter any problems.
104 |
--------------------------------------------------------------------------------
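Since `cli.py` (below) writes its results as CSV, output files produced by redirecting the commands in the README can be inspected directly with `pandas`. A minimal sketch, assuming the pinched-torus run from the README; the column names (`X0`, `X1`, ..., `euclidicity`, `persistent_intrinsic_dimension`) are taken from the `DataFrame` construction in `cli.py`:

```python
# Minimal sketch: inspect Euclidicity results written by, e.g.,
#   python cli.py ../data/Pinched_torus.txt.gz -q 500 ... > ../output/Pinched_torus.txt
import pandas as pd

df = pd.read_csv("../output/Pinched_torus.txt")

# Points with high Euclidicity deviate most from a Euclidean model
# space and are therefore candidate singularities.
print(df.sort_values("euclidicity", ascending=False).head())
```
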
/TARDIS.svg:
--------------------------------------------------------------------------------
[SVG logo; 625 lines of image markup omitted]
--------------------------------------------------------------------------------
/data/Pinched_torus.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aidos-lab/TARDIS/df0ef5d9567efb0b33616452aa9b6b439f16ae0d/data/Pinched_torus.txt.gz
--------------------------------------------------------------------------------
/data/Wedged_spheres_2D.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aidos-lab/TARDIS/df0ef5d9567efb0b33616452aa9b6b439f16ae0d/data/Wedged_spheres_2D.txt.gz
--------------------------------------------------------------------------------
/docs/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 |
13 | %SPHINXBUILD% >NUL 2>NUL
14 | if errorlevel 9009 (
15 | echo.
16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17 | echo.installed, then set the SPHINXBUILD environment variable to point
18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
19 | echo.may add the Sphinx directory to PATH.
20 | echo.
21 | echo.If you don't have Sphinx installed, grab it from
22 | echo.https://www.sphinx-doc.org/
23 | exit /b 1
24 | )
25 |
26 | if "%1" == "" goto help
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/source/api.rst:
--------------------------------------------------------------------------------
1 | tardis.api
2 | ==========
3 |
4 | .. automodule:: tardis.api
5 |    :members:
6 |
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
1 | project = "TARDIS"
2 | copyright = "2023, Julius von Rohrscheidt and Bastian Rieck"
3 | author = "Julius von Rohrscheidt and Bastian Rieck"
4 |
5 | extensions = [
6 |     "sphinx.ext.autodoc",
7 |     "sphinx.ext.napoleon",
8 |     "sphinx.ext.linkcode",
9 | ]
10 |
11 | # Ensure that member functions are documented. These are sane defaults.
12 | autodoc_default_options = {
13 |     "members": True,
14 |     "member-order": "bysource",
15 |     "special-members": "__init__",
16 |     "undoc-members": True,
17 |     "exclude-members": "__weakref__",
18 | }
19 |
20 | templates_path = ["_templates"]
21 | exclude_patterns = []
22 |
23 | # Tries to assign some semantic meaning to arguments provided with
24 | # single backticks, such as `x`. This way, we can ignore `func` and
25 | # `class` targets etc. (They still work, though!)
26 | default_role = "obj"
27 |
28 | html_theme = "furo"
29 | html_logo = "../../TARDIS.svg"
30 | html_static_path = ["_static"]
31 |
32 | # Ensures that modules are sorted correctly. Since they all pertain to
33 | # the same package, the prefix itself can be ignored.
34 | modindex_common_prefix = ["tardis."]
35 |
36 | html_theme_options = {
37 |     "source_repository": "https://github.com/aidos-lab/TARDIS/",
38 |     "source_branch": "main",
39 |     "source_directory": "docs/source/",
40 | }
41 |
42 |
43 | # Specifies how to actually find the sources of the modules. Ensures
44 | # that people can jump to files in the repository directly.
45 | def linkcode_resolve(domain, info):
46 |     # Let's frown on global imports and do everything locally as much as
47 |     # we can.
48 |     import sys
49 |     import tardis
50 |
51 |     if domain != "py":
52 |         return None
53 |     if not info["module"]:
54 |         return None
55 |
56 |     # Attempt to identify the source file belonging to an `info` object.
57 |     # This code is adapted from the Sphinx configuration of `numpy`; see
58 |     # https://github.com/numpy/numpy/blob/main/doc/source/conf.py.
59 |     def find_source_file(module):
60 |         obj = sys.modules[module]
61 |
62 |         for part in info["fullname"].split("."):
63 |             obj = getattr(obj, part)
64 |
65 |         import inspect
66 |         import os
67 |
68 |         fn = inspect.getsourcefile(obj)
69 |         fn = os.path.relpath(fn, start=os.path.dirname(tardis.__file__))
70 |
71 |         source, lineno = inspect.getsourcelines(obj)
72 |         return fn, lineno, lineno + len(source) - 1
73 |
74 |     try:
75 |         module = info["module"]
76 |         source = find_source_file(module)
77 |     except Exception:
78 |         source = None
79 |
80 |     root = f"https://github.com/aidos-lab/TARDIS/tree/main/{project.lower()}/"
81 |
82 |     if source is not None:
83 |         fn, start, end = source
84 |         return root + f"{fn}#L{start}-L{end}"
85 |     else:
86 |         return None
87 |
--------------------------------------------------------------------------------
/docs/source/data.rst:
--------------------------------------------------------------------------------
1 | tardis.data
2 | ===========
3 |
4 | .. automodule:: tardis.data
5 |    :members:
6 |
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | TARDIS: Topological Algorithms for Robust DIscovery of Singularities
2 | ====================================================================
3 |
4 | The manifold hypothesis drives most of modern machine learning research,
5 | but what if you are **not** dealing with a manifold but a more complicated space?
6 | TARDIS uses a topology-driven approach to identify singularities in
7 | high-dimensional data sets at multiple scales, giving you a better
8 | overview of what is in your data.
9 |
10 | How can TARDIS help you?
11 | ------------------------
12 |
13 | * Find out whether your data set contains singular regions, i.e. regions
14 | that are not adequately described by Euclidean space.
15 |
16 | * Discover whether dimensionality reduction algorithms are embedding
17 | your data correctly or resulting in distortion.
18 |
19 | * Assess the overall complexity of your data set in an unsupervised
20 | fashion.
21 |
22 | Interested?
23 | -----------
24 |
25 | Read more about TARDIS in our `ICML paper <https://proceedings.mlr.press/v202/von-rohrscheidt23a.html>`_
26 | and consider citing us:
27 |
28 | .. code-block:: bibtex
29 |
30 |    @inproceedings{vonRohrscheidt23a,
31 |      title     = {Topological Singularity Detection at Multiple Scales},
32 |      author    = {von Rohrscheidt, Julius and Rieck, Bastian},
33 |      year      = 2023,
34 |      booktitle = {Proceedings of the 40th International Conference on Machine Learning},
35 |      publisher = {PMLR},
36 |      series    = {Proceedings of Machine Learning Research},
37 |      number    = 202,
38 |      pages     = {35175--35197},
39 |      editor    = {Krause, Andreas and Brunskill, Emma and Cho, Kyunghyun and Engelhardt, Barbara and Sabato, Sivan and Scarlett, Jonathan},
40 |      abstract  = {The manifold hypothesis, which assumes that data lies on or close to an unknown manifold of low intrinsic dimension, is a staple of modern machine learning research. However, recent work has shown that real-world data exhibits distinct non-manifold structures, i.e. singularities, that can lead to erroneous findings. Detecting such singularities is therefore crucial as a precursor to interpolation and inference tasks. We address this issue by developing a topological framework that (i) quantifies the local intrinsic dimension, and (ii) yields a Euclidicity score for assessing the `manifoldness' of a point along multiple scales. Our approach identifies singularities of complex spaces, while also capturing singular structures and local geometric complexity in image data.}
41 |    }
42 |
43 | Documentation
44 | -------------
45 |
46 | Please find the API documentation and the module documentation below. As
47 | with a lot of academic code, TARDIS is a constant work in progress. Your
48 | contributions are more than welcome!
49 |
50 | .. toctree::
51 |    :maxdepth: 2
52 |    :caption: Contents:
53 |
54 | .. toctree::
55 |    :maxdepth: 2
56 |    :caption: Modules
57 |
58 |    api
59 |    data
60 |    utils
61 |
62 | Indices and tables
63 | ==================
64 |
65 | * :ref:`genindex`
66 | * :ref:`modindex`
67 | * :ref:`search`
68 |
--------------------------------------------------------------------------------
/docs/source/utils.rst:
--------------------------------------------------------------------------------
1 | tardis.utils
2 | ============
3 |
4 | .. automodule:: tardis.utils
5 |    :members:
6 |
--------------------------------------------------------------------------------
/examples/simple_numpy.py:
--------------------------------------------------------------------------------
1 | """Simple example of integrating TARDIS and ``numpy``."""
2 |
3 |
4 | import numpy as np
5 |
6 | from tardis import calculate_euclidicity
7 |
8 |
9 | if __name__ == "__main__":
10 | rng = np.random.default_rng(42)
11 |
12 | # This is the same data set that will also be used for the
13 | # estimation of Euclidicity later on.
14 | X = rng.normal(size=(100, 3))
15 |
16 | # Only get Euclidicity values. By default, no dimensions will be
17 | # returned (they are always computed, though). Use `n_steps` for
18 | # controlling the scale traversal.
19 | euclidicity = calculate_euclidicity(
20 | X, r=0.01, R=0.25, s=0.05, S=0.5, max_dim=3, n_steps=5
21 | )
22 |
23 | # Get both Euclidicity and the persistent intrinsic dimension (PID)
24 | # of each data point.
25 | euclidicity, persistent_intrinsic_dimension = calculate_euclidicity(
26 | X,
27 | r=0.01,
28 | R=0.25,
29 | s=0.05,
30 | S=0.5,
31 | max_dim=3,
32 | n_steps=5,
33 | return_dimensions=True,
34 | )
35 |
36 | # We can also just specify a number of neighbours to use for scale
37 | # estimation. Note that most of the parameters specified below can
38 | # be considered optional since we provide useful defaults.
39 | euclidicity = calculate_euclidicity(
40 | X,
41 | max_dim=3,
42 | n_steps=5,
43 | k=10,
44 | )
45 |
46 | # Finally, let's calculate Euclidicity with respect to *another*
47 | # data set.
48 | Y = rng.normal(size=(10, 3))
49 | euclidicity = calculate_euclidicity(
50 | X, Y, r=0.01, R=0.25, s=0.05, S=0.5, max_dim=3
51 | )
52 |
--------------------------------------------------------------------------------
/output/Gudhi_Computational_Effort.txt:
--------------------------------------------------------------------------------
1 | --- Gudhi Rips Complex --- 10 samples --- dimension 1 --- 0.0020189285278320312 seconds ---
2 | --- Gudhi Rips Complex --- 20 samples --- dimension 1 --- 0.0025501251220703125 seconds ---
3 | --- Gudhi Rips Complex --- 30 samples --- dimension 1 --- 0.0033788681030273438 seconds ---
4 | --- Gudhi Rips Complex --- 40 samples --- dimension 1 --- 0.007442951202392578 seconds ---
5 | --- Gudhi Rips Complex --- 50 samples --- dimension 1 --- 0.006020069122314453 seconds ---
6 | --- Gudhi Rips Complex --- 60 samples --- dimension 1 --- 0.010352849960327148 seconds ---
7 | --- Gudhi Rips Complex --- 10 samples --- dimension 2 --- 0.0004889965057373047 seconds ---
8 | --- Gudhi Rips Complex --- 20 samples --- dimension 2 --- 0.002521038055419922 seconds ---
9 | --- Gudhi Rips Complex --- 30 samples --- dimension 2 --- 0.011078834533691406 seconds ---
10 | --- Gudhi Rips Complex --- 40 samples --- dimension 2 --- 0.033782005310058594 seconds ---
11 | --- Gudhi Rips Complex --- 50 samples --- dimension 2 --- 0.08360719680786133 seconds ---
12 | --- Gudhi Rips Complex --- 60 samples --- dimension 2 --- 0.18638396263122559 seconds ---
13 | --- Gudhi Rips Complex --- 10 samples --- dimension 3 --- 0.0005481243133544922 seconds ---
14 | --- Gudhi Rips Complex --- 20 samples --- dimension 3 --- 0.009124040603637695 seconds ---
15 | --- Gudhi Rips Complex --- 30 samples --- dimension 3 --- 0.07537388801574707 seconds ---
16 | --- Gudhi Rips Complex --- 40 samples --- dimension 3 --- 0.31710100173950195 seconds ---
17 | --- Gudhi Rips Complex --- 50 samples --- dimension 3 --- 1.0923190116882324 seconds ---
18 | --- Gudhi Rips Complex --- 60 samples --- dimension 3 --- 3.1666109561920166 seconds ---
19 | --- Gudhi Rips Complex --- 10 samples --- dimension 4 --- 0.0009720325469970703 seconds ---
20 | --- Gudhi Rips Complex --- 20 samples --- dimension 4 --- 0.028330087661743164 seconds ---
21 | --- Gudhi Rips Complex --- 30 samples --- dimension 4 --- 0.3949770927429199 seconds ---
22 | --- Gudhi Rips Complex --- 40 samples --- dimension 4 --- 2.8597021102905273 seconds ---
23 | --- Gudhi Rips Complex --- 50 samples --- dimension 4 --- 14.362553834915161 seconds ---
24 | --- Gudhi Rips Complex --- 60 samples --- dimension 4 --- 61.31955695152283 seconds ---
25 | --- Gudhi Rips Complex --- 10 samples --- dimension 5 --- 0.0011510848999023438 seconds ---
26 | --- Gudhi Rips Complex --- 20 samples --- dimension 5 --- 0.07502388954162598 seconds ---
27 | --- Gudhi Rips Complex --- 30 samples --- dimension 5 --- 2.104665994644165 seconds ---
28 | --- Gudhi Rips Complex --- 40 samples --- dimension 5 --- 26.206981897354126 seconds ---
29 | --- Gudhi Rips Complex --- 50 samples --- dimension 5 --- 192.25888419151306 seconds ---
30 | --- Gudhi Rips Complex --- 60 samples --- dimension 5 --- 4449.7761080265045 seconds ---
31 |
--------------------------------------------------------------------------------
/output/Ripser_Computational_Effort.txt:
--------------------------------------------------------------------------------
1 | --- Ripser Rips Complex w/ Edge Collapse --- 10 samples --- dimension 1 --- 0.0019309520721435547 seconds ---
2 | --- Ripser Rips Complex w/ Edge Collapse --- 20 samples --- dimension 1 --- 0.002407073974609375 seconds ---
3 | --- Ripser Rips Complex w/ Edge Collapse --- 30 samples --- dimension 1 --- 0.00831294059753418 seconds ---
4 | --- Ripser Rips Complex w/ Edge Collapse --- 40 samples --- dimension 1 --- 0.007644176483154297 seconds ---
5 | --- Ripser Rips Complex w/ Edge Collapse --- 50 samples --- dimension 1 --- 0.021908998489379883 seconds ---
6 | --- Ripser Rips Complex w/ Edge Collapse --- 60 samples --- dimension 1 --- 0.030781030654907227 seconds ---
7 | --- Ripser Rips Complex w/ Edge Collapse --- 10 samples --- dimension 2 --- 0.0009131431579589844 seconds ---
8 | --- Ripser Rips Complex w/ Edge Collapse --- 20 samples --- dimension 2 --- 0.0012538433074951172 seconds ---
9 | --- Ripser Rips Complex w/ Edge Collapse --- 30 samples --- dimension 2 --- 0.0011789798736572266 seconds ---
10 | --- Ripser Rips Complex w/ Edge Collapse --- 40 samples --- dimension 2 --- 0.016334056854248047 seconds ---
11 | --- Ripser Rips Complex w/ Edge Collapse --- 50 samples --- dimension 2 --- 0.03236794471740723 seconds ---
12 | --- Ripser Rips Complex w/ Edge Collapse --- 60 samples --- dimension 2 --- 0.05398416519165039 seconds ---
13 | --- Ripser Rips Complex w/ Edge Collapse --- 10 samples --- dimension 3 --- 0.0006580352783203125 seconds ---
14 | --- Ripser Rips Complex w/ Edge Collapse --- 20 samples --- dimension 3 --- 0.002480030059814453 seconds ---
15 | --- Ripser Rips Complex w/ Edge Collapse --- 30 samples --- dimension 3 --- 0.00840306282043457 seconds ---
16 | --- Ripser Rips Complex w/ Edge Collapse --- 40 samples --- dimension 3 --- 0.03389477729797363 seconds ---
17 | --- Ripser Rips Complex w/ Edge Collapse --- 50 samples --- dimension 3 --- 0.17017388343811035 seconds ---
18 | --- Ripser Rips Complex w/ Edge Collapse --- 60 samples --- dimension 3 --- 0.1088259220123291 seconds ---
19 | --- Ripser Rips Complex w/ Edge Collapse --- 10 samples --- dimension 4 --- 0.0008890628814697266 seconds ---
20 | --- Ripser Rips Complex w/ Edge Collapse --- 20 samples --- dimension 4 --- 0.0017061233520507812 seconds ---
21 | --- Ripser Rips Complex w/ Edge Collapse --- 30 samples --- dimension 4 --- 0.0023801326751708984 seconds ---
22 | --- Ripser Rips Complex w/ Edge Collapse --- 40 samples --- dimension 4 --- 0.060665130615234375 seconds ---
23 | --- Ripser Rips Complex w/ Edge Collapse --- 50 samples --- dimension 4 --- 0.6652989387512207 seconds ---
24 | --- Ripser Rips Complex w/ Edge Collapse --- 60 samples --- dimension 4 --- 2.826575994491577 seconds ---
25 | --- Ripser Rips Complex w/ Edge Collapse --- 10 samples --- dimension 5 --- 0.0010030269622802734 seconds ---
26 | --- Ripser Rips Complex w/ Edge Collapse --- 20 samples --- dimension 5 --- 0.001631021499633789 seconds ---
27 | --- Ripser Rips Complex w/ Edge Collapse --- 30 samples --- dimension 5 --- 0.03177905082702637 seconds ---
28 | --- Ripser Rips Complex w/ Edge Collapse --- 40 samples --- dimension 5 --- 0.18613076210021973 seconds ---
29 | --- Ripser Rips Complex w/ Edge Collapse --- 50 samples --- dimension 5 --- 2.020855188369751 seconds ---
30 | --- Ripser Rips Complex w/ Edge Collapse --- 60 samples --- dimension 5 --- 14.891887187957764 seconds ---
31 | --- Ripser Rips Complex w/ Edge Collapse --- 10 samples --- dimension 6 --- 0.0009930133819580078 seconds ---
32 | --- Ripser Rips Complex w/ Edge Collapse --- 20 samples --- dimension 6 --- 0.0017299652099609375 seconds ---
33 | --- Ripser Rips Complex w/ Edge Collapse --- 30 samples --- dimension 6 --- 0.038159847259521484 seconds ---
34 | --- Ripser Rips Complex w/ Edge Collapse --- 40 samples --- dimension 6 --- 1.1811788082122803 seconds ---
35 | --- Ripser Rips Complex w/ Edge Collapse --- 50 samples --- dimension 6 --- 5.688662052154541 seconds ---
36 | --- Ripser Rips Complex w/ Edge Collapse --- 60 samples --- dimension 6 --- 24.082331895828247 seconds ---
37 | --- Ripser Rips Complex w/ Edge Collapse --- 10 samples --- dimension 7 --- 0.00086212158203125 seconds ---
38 | --- Ripser Rips Complex w/ Edge Collapse --- 20 samples --- dimension 7 --- 0.001519918441772461 seconds ---
39 | --- Ripser Rips Complex w/ Edge Collapse --- 30 samples --- dimension 7 --- 0.00822901725769043 seconds ---
40 | --- Ripser Rips Complex w/ Edge Collapse --- 40 samples --- dimension 7 --- 2.1794939041137695 seconds ---
41 | --- Ripser Rips Complex w/ Edge Collapse --- 50 samples --- dimension 7 --- 72.07182788848877 seconds ---
42 | --- Ripser Rips Complex w/ Edge Collapse --- 60 samples --- dimension 7 --- 561.868953704834 seconds ---
43 | --- Ripser Rips Complex w/ Edge Collapse --- 10 samples --- dimension 8 --- 0.0025250911712646484 seconds ---
44 | --- Ripser Rips Complex w/ Edge Collapse --- 20 samples --- dimension 8 --- 0.0015439987182617188 seconds ---
45 | --- Ripser Rips Complex w/ Edge Collapse --- 30 samples --- dimension 8 --- 0.0831460952758789 seconds ---
46 | --- Ripser Rips Complex w/ Edge Collapse --- 40 samples --- dimension 8 --- 0.9906179904937744 seconds ---
47 | --- Ripser Rips Complex w/ Edge Collapse --- 50 samples --- dimension 8 --- 10.50025987625122 seconds ---
48 | --- Ripser Rips Complex w/ Edge Collapse --- 60 samples --- dimension 8 --- 916.2965040206909 seconds ---
49 | --- Ripser Rips Complex w/ Edge Collapse --- 10 samples --- dimension 9 --- 0.0034601688385009766 seconds ---
50 | --- Ripser Rips Complex w/ Edge Collapse --- 20 samples --- dimension 9 --- 0.0012841224670410156 seconds ---
51 | --- Ripser Rips Complex w/ Edge Collapse --- 30 samples --- dimension 9 --- 0.09065103530883789 seconds ---
52 | --- Ripser Rips Complex w/ Edge Collapse --- 40 samples --- dimension 9 --- 0.7031102180480957 seconds ---
53 | --- Ripser Rips Complex w/ Edge Collapse --- 50 samples --- dimension 9 --- 157.24873232841492 seconds ---
54 | --- Ripser Rips Complex w/ Edge Collapse --- 60 samples --- dimension 9 --- 661.64568400383 seconds ---
55 | --- Ripser Rips Complex w/ Edge Collapse --- 10 samples --- dimension 10 --- 0.004229068756103516 seconds ---
56 | --- Ripser Rips Complex w/ Edge Collapse --- 20 samples --- dimension 10 --- 0.002311229705810547 seconds ---
57 | --- Ripser Rips Complex w/ Edge Collapse --- 30 samples --- dimension 10 --- 0.03548407554626465 seconds ---
58 | --- Ripser Rips Complex w/ Edge Collapse --- 40 samples --- dimension 10 --- 5.440964937210083 seconds ---
59 | --- Ripser Rips Complex w/ Edge Collapse --- 50 samples --- dimension 10 --- 25.291701078414917 seconds ---
60 | --- Ripser Rips Complex w/ Edge Collapse --- 60 samples --- dimension 10 --- 791.482873916626 seconds ---
61 | --- Ripser Rips Complex w/ Edge Collapse --- 10 samples --- dimension 11 --- 0.0065288543701171875 seconds ---
62 | --- Ripser Rips Complex w/ Edge Collapse --- 20 samples --- dimension 11 --- 0.002384185791015625 seconds ---
63 | --- Ripser Rips Complex w/ Edge Collapse --- 30 samples --- dimension 11 --- 0.0969080924987793 seconds ---
64 | --- Ripser Rips Complex w/ Edge Collapse --- 40 samples --- dimension 11 --- 1.355767011642456 seconds ---
65 | --- Ripser Rips Complex w/ Edge Collapse --- 50 samples --- dimension 11 --- 430.8150792121887 seconds ---
66 | --- Ripser Rips Complex w/ Edge Collapse --- 60 samples --- dimension 11 --- 3043.677969932556 seconds ---
67 | --- Ripser Rips Complex w/ Edge Collapse --- 10 samples --- dimension 12 --- 0.010491132736206055 seconds ---
68 | --- Ripser Rips Complex w/ Edge Collapse --- 20 samples --- dimension 12 --- 0.002953052520751953 seconds ---
69 | --- Ripser Rips Complex w/ Edge Collapse --- 30 samples --- dimension 12 --- 0.025474071502685547 seconds ---
70 | --- Ripser Rips Complex w/ Edge Collapse --- 40 samples --- dimension 12 --- 13.660001039505005 seconds ---
71 |
--------------------------------------------------------------------------------
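The two timing logs above share a fixed line format, so they can be turned into a tidy table for comparison or plotting. A minimal sketch (the file path assumes the repository root as working directory):

```python
# Sketch: parse a "Computational_Effort" log into a samples-by-dimension
# table of runtimes. Assumes the
# "--- name --- N samples --- dimension d --- t seconds ---" line format.
import re

import pandas as pd

pattern = re.compile(
    r"--- (?P<method>.+?) --- (?P<samples>\d+) samples "
    r"--- dimension (?P<dim>\d+) --- (?P<seconds>[\d.]+) seconds ---"
)

rows = []
with open("output/Gudhi_Computational_Effort.txt") as f:
    for line in f:
        m = pattern.search(line)
        if m:
            rows.append(m.groupdict())

df = pd.DataFrame(rows).astype({"samples": int, "dim": int, "seconds": float})
print(df.pivot(index="samples", columns="dim", values="seconds"))
```
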
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "tardis"
3 | version = "0.1.0"
4 | description = "TARDIS: Topological Algorithms for Robust DIscovery of Singularities"
5 | authors = ["Julius von Rohrscheidt ", "Bastian Rieck "]
6 | readme = "README.md"
7 |
8 | [tool.poetry.dependencies]
9 | python = ">=3.9,<3.12"
10 | gudhi = "^3.6.0"
11 | numpy = "^1.23.2"
12 | scikit-learn = "^1.1.2"
13 | tqdm = "^4.64.1"
14 | joblib = "^1.2.0"
15 | colorlog = "^6.7.0"
16 | pandas = "^1.5.0"
17 | matplotlib = "^3.6.0"
18 | seaborn = "^0.12.0"
19 | torch = "^1.12.1"
20 | torchvision = "^0.13.1"
21 | scipy = "^1.9.1"
22 | phate = "^1.0.9"
23 | pot = "^0.8.2"
24 | giotto-ph = "^0.2.2"
25 | sphinx = "^7.0.1"
26 | furo = "^2023.5.20"
27 |
28 | [tool.black]
29 | line-length = 79
30 |
31 | [build-system]
32 | requires = ["poetry-core"]
33 | build-backend = "poetry.core.masonry.api"
34 |
--------------------------------------------------------------------------------
/scripts/es.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #
3 | # es.sh: easy submit --- submit a job quickly to the cluster
4 | #
5 | # This script tries to provide a 'fire and forget' solution for
6 | # submitting CPU jobs to the cluster. The parameters may not be
7 | # perfect, but it's probably sufficient for most cases.
8 |
9 | NAME="$1"
10 | CMD="$2"
11 |
12 | if [ -z "$2" ]; then
13 | NAME="easy-submit"
14 | CMD=$1
15 | fi
16 |
17 | if [ -z "$CMD" ]; then
18 | echo "Usage: $0 [NAME] COMMAND"
19 | echo " Specify at least the command to run."
20 | exit -1
21 | fi
22 |
23 | MAIL_USER=${USER}@helmholtz-muenchen.de
24 |
25 | sbatch -p cpu_p \
26 | -J ${NAME} \
27 | -o "${NAME}_%j.out" \
28 | --cpus-per-task=32 \
29 | --mem=16G \
30 | --mail-type=END,FAIL \
31 | --mail-user=${MAIL_USER} \
32 | --nice=10000 \
33 | --wrap "${CMD}"
34 |
--------------------------------------------------------------------------------
/scripts/ipsc.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | for POINTS in 1000 2500 5000 10000; do
4 |   ./es.sh IPSC-${POINTS} "poetry run python ../tardis/cli.py ../data/ipsc.npz --seed 42 -q ${POINTS} -d 16 -o ../output/ipsc_d16_q${POINTS}_seed42.csv"
5 | done
6 |
--------------------------------------------------------------------------------
/scripts/pinched_torus.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | for S in 0.5 0.55 0.60 0.65 0.75; do
4 |   ./es.sh PINCHED_TORUS "poetry run python ../tardis/cli.py ../data/Pinched_torus.txt -r 0.05 -R 0.45 -s 0.2 -S ${S} > ../output/Pinched_torus_S${S}.txt"
5 | done
6 |
--------------------------------------------------------------------------------
/scripts/vision.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | for NUM in $(seq 1 5); do
4 |   ./es.sh MNIST "poetry run python ../tardis/cli.py -d 10 --num-steps 20 MNIST > ../output/MNIST_${NUM}.txt"
5 |   ./es.sh FashionMNIST "poetry run python ../tardis/cli.py -d 10 --num-steps 20 FashionMNIST > ../output/FashionMNIST_${NUM}.txt"
6 | done
7 |
8 |
--------------------------------------------------------------------------------
/scripts/wedged_spheres.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 |
3 | ./es.sh WEDGED-SPHERES-2 "poetry run python ../tardis/cli.py -r 0.05 -R 0.25 -s 0.1 -S 0.5 -d 2 --num-steps 20 ../data/Wedged_spheres_2D.txt > ../output/Wedged_spheres_2D.txt"
4 |
--------------------------------------------------------------------------------
/tardis/__init__.py:
--------------------------------------------------------------------------------
1 | from .api import calculate_euclidicity
2 | from .utils import estimate_scales
3 |
--------------------------------------------------------------------------------
/tardis/analyse_euclidicity.py:
--------------------------------------------------------------------------------
1 | """Basic statistical analysis of Euclidicity scores.
2 |
3 | This is a helper script for analysing Euclidicity scores. It generates
4 | plots of the summary statistics and performs Tukey's range test.
5 | """
6 |
7 | import argparse
8 | import os
9 |
10 | import numpy as np
11 | import pandas as pd
12 |
13 | import matplotlib.pyplot as plt
14 | import seaborn as sns
15 |
16 | from scipy.stats import tukey_hsd
17 |
18 |
19 | def detect_outliers(data):
20 |     """Detect outliers based on IQR criterion."""
21 |     # Simple outlier detection: clip everything that is larger than
22 |     # q3 + 1.5 * IQR.
23 |     iqr = np.subtract(*np.percentile(data, [75, 25]))
24 |     q1 = np.percentile(data, 25)
25 |     q3 = np.percentile(data, 75)
26 |
27 |     print(f"Q1 = {q1:.2f}, Q3 = {q3:.2f}, IQR = {iqr:.2f}")
28 |
29 |     upper = data > q3 + 1.5 * iqr
30 |     lower = data < q1 - 1.5 * iqr
31 |
32 |     print("- Found", upper.sum(), "upper outliers")
33 |     print("- Found", lower.sum(), "lower outliers")
34 |
35 |
36 | def print_summary_statistics(data):
37 |     """Print some summary statistics."""
38 |     print(
39 |         f"max = {np.max(data):.2f}, "
40 |         f"mean = {np.mean(data):.2f}, "
41 |         f"min = {np.min(data):.2f}",
42 |     )
43 |
44 |
45 | if __name__ == "__main__":
46 |     parser = argparse.ArgumentParser()
47 |     parser.add_argument("FILE", nargs="+", help="Input filename(s)")
48 |
49 |     args = parser.parse_args()
50 |
51 |     n_files = len(args.FILE)
52 |     fig, axes = plt.subplots(nrows=2, ncols=n_files, squeeze=False)
53 |
54 |     distributions = []
55 |
56 |     for (
57 |         col,
58 |         filename,
59 |     ) in enumerate(args.FILE):
60 |         print(f"Processing {filename}")
61 |
62 |         if filename.endswith(".csv"):
63 |             df = pd.read_csv(filename)
64 |             df = df.drop("persistent_intrinsic_dimension", axis="columns")
65 |             X = df.to_numpy()
66 |         elif filename.endswith(".npz"):
67 |             X = np.load(filename)["arr_0"]
68 |         else:
69 |             X = np.loadtxt(filename)
70 |
71 |         # Skip empty files because they lead to problems in the
72 |         # downstream analysis.
73 |         if len(X) == 0:
74 |             continue
75 |
76 |         euclidicity = X[:, -1].flatten()
77 |
78 |         distributions.append(np.asarray(euclidicity))
79 |
80 |         detect_outliers(euclidicity)
81 |         print_summary_statistics(euclidicity)
82 |
83 |         axes[0, col].set_title(os.path.splitext(os.path.basename(filename))[0])
84 |
85 |         sns.histplot(data=euclidicity, kde=True, ax=axes[0, col])
86 |         sns.violinplot(data=euclidicity, ax=axes[1, col])
87 |         sns.stripplot(data=euclidicity, ax=axes[1, col], color="black", size=1)
88 |
89 |     # We can only do this with more than one distribution, but even for
90 |     # a single distribution, we can show the respective plot.
91 |     if len(distributions) > 1:
92 |         print(tukey_hsd(*distributions))
93 |
94 |     plt.show()
95 |
--------------------------------------------------------------------------------
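The helpers above can also be reused programmatically on a single result file. A minimal sketch that mirrors the script's own CSV branch; `scores.csv` is a hypothetical result file written via `cli.py -o scores.csv`:

```python
# Sketch: reuse the analysis helpers on one CSV result file.
import pandas as pd

from tardis.analyse_euclidicity import (
    detect_outliers,
    print_summary_statistics,
)

df = pd.read_csv("scores.csv")  # hypothetical output of `cli.py -o scores.csv`
df = df.drop("persistent_intrinsic_dimension", axis="columns")

# The Euclidicity scores live in the last remaining column.
euclidicity = df.to_numpy()[:, -1].flatten()

detect_outliers(euclidicity)
print_summary_statistics(euclidicity)
```
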
/tardis/api.py:
--------------------------------------------------------------------------------
1 | """Main entry point for API calls to TARDIS.
2 |
3 | This module collects API calls to TARDIS. Each exported function should
4 | facilitate using TARDIS for data analysis. Users that need fine-grained
5 | control are encouraged to build their own functions.
6 | """
7 |
8 | import joblib
9 |
10 | import numpy as np
11 |
12 | from tardis.euclidicity import Euclidicity
13 | from tardis.utils import estimate_scales
14 |
15 |
16 | def calculate_euclidicity(
17 |     X,
18 |     Y=None,
19 |     max_dim=2,
20 |     n_steps=10,
21 |     r=None,
22 |     R=None,
23 |     s=None,
24 |     S=None,
25 |     k=20,
26 |     n_jobs=1,
27 |     return_dimensions=False,
28 | ):
29 |     """Convenience function for calculating Euclidicity of a point cloud.
30 |
31 |     This function provides the most convenient interface for calculating
32 |     Euclidicity of a point cloud. Internally, this function will use the
33 |     best and fastest Euclidicity calculation, but this comes at the cost
34 |     of configurability.
35 |
36 |     TODO: Document me :-)
37 |     """
38 |     r_, R_, s_, S_ = r, R, s, S
39 |     query_points = X if Y is None else Y
40 |
41 |     if all([x is not None for x in [r_, R_, s_, S_]]):
42 |         scales = [dict()] * len(query_points)
43 |     else:
44 |         scales = estimate_scales(X, query_points, k)
45 |
46 |     euclidicity = Euclidicity(
47 |         max_dim=max_dim,
48 |         n_steps=n_steps,
49 |         r=r_,
50 |         R=R_,
51 |         s=s_,
52 |         S=S_,
53 |         method="ripser",
54 |         data=X,
55 |     )
56 |
57 |     def _process(x, scale=None):
58 |         scores, dimensions = euclidicity(X, x, **scale)
59 |
60 |         score = np.mean(np.nan_to_num(scores))
61 |         dimension = np.mean(dimensions)
62 |
63 |         return score, dimension
64 |
65 |     output = joblib.Parallel(n_jobs=n_jobs)(
66 |         joblib.delayed(_process)(x, scale)
67 |         for x, scale in zip(query_points, scales)
68 |     )
69 |
70 |     euclidicity = np.asarray([e for (e, _) in output])
71 |     persistent_intrinsic_dimension = np.asarray([d for (_, d) in output])
72 |
73 |     if return_dimensions:
74 |         return euclidicity, persistent_intrinsic_dimension
75 |     else:
76 |         return euclidicity
77 |
--------------------------------------------------------------------------------
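The scale-handling branch in `calculate_euclidicity` mirrors the one in `cli.py`: if any of `r`, `R`, `s`, `S` is missing, per-point scales are estimated from the `k` nearest neighbours and later unpacked via `**scale`. A minimal sketch; note that the exact keys of the returned dictionaries are an assumption, inferred from the `euclidicity(X, x, **scale)` call:

```python
# Sketch of the automated scale selection used when no global annulus
# radii are given. NOTE: the dictionary keys are an assumption, inferred
# from how each `scale` is unpacked as keyword arguments.
import numpy as np

from tardis.utils import estimate_scales

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 3))
query_points = X[:10]

scales = estimate_scales(X, query_points, 20)
print(scales[0])  # expected: per-point values for r, R, s, and S
```
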
/tardis/cli.py:
--------------------------------------------------------------------------------
1 | """Command-line interface for Euclidicity calculations.
2 |
3 | This script is the main command-line interface for our Euclidicity
4 | calculations. It supports loading various input formats, for which
5 | it will calculate Euclidicity scores.
6 | """
7 |
8 | import argparse
9 | import colorlog
10 | import functools
11 | import joblib
12 | import os
13 |
14 | import numpy as np
15 | import pandas as pd
16 |
17 | from tardis.euclidicity import Euclidicity
18 |
19 | from tardis.shapes import sample_from_annulus
20 | from tardis.shapes import sample_from_constant_curvature_annulus
21 |
22 | from tardis.utils import load_data
23 | from tardis.utils import estimate_scales
24 |
25 |
26 | def setup():
27 |     """Perform logging and argument parsing setup.
28 |
29 |     Sets up the command-line interface for subsequent usage so that we
30 |     do not clutter up the actual Euclidicity calculations.
31 |
32 |     Returns
33 |     -------
34 |     Tuple of logger and parsed arguments
35 |     """
36 |     handler = colorlog.StreamHandler()
37 |     handler.setFormatter(
38 |         colorlog.ColoredFormatter("%(log_color)s%(levelname)-.1s: %(message)s")
39 |     )
40 |
41 |     logger = colorlog.getLogger()
42 |     logger.addHandler(handler)
43 |     logger.setLevel(colorlog.INFO)
44 |
45 |     parser = argparse.ArgumentParser()
46 |     parser.add_argument(
47 |         "INPUT",
48 |         type=str,
49 |         help="Input point cloud or name of data set to load. If this points "
50 |         "to an existing file, the file is loaded. Else the input is treated "
51 |         "as the name of a (vision) data set.",
52 |     )
53 |     parser.add_argument(
54 |         "-o",
55 |         "--output",
56 |         type=str,
57 |         help="Output file (optional). If not set, data will be printed to "
58 |         "standard output. If set, will guess the output format based "
59 |         "on the file extension.",
60 |     )
61 |
62 |     euclidicity_group = parser.add_argument_group("Euclidicity calculations")
63 |
64 |     euclidicity_group.add_argument(
65 |         "-k",
66 |         "--num-neighbours",
67 |         default=50,
68 |         type=int,
69 |         help="Number of neighbours for parameter estimation",
70 |     )
71 |     euclidicity_group.add_argument(
72 |         "-d",
73 |         "--dimension",
74 |         default=2,
75 |         type=int,
76 |         help="Known or estimated intrinsic dimension",
77 |     )
78 |     euclidicity_group.add_argument(
79 |         "-r",
80 |         type=float,
81 |         help="Minimum inner radius of annulus",
82 |     )
83 |     euclidicity_group.add_argument(
84 |         "-R",
85 |         type=float,
86 |         help="Maximum inner radius of annulus",
87 |     )
88 |     euclidicity_group.add_argument(
89 |         "-s",
90 |         type=float,
91 |         help="Minimum outer radius of annulus",
92 |     )
93 |     euclidicity_group.add_argument(
94 |         "-S",
95 |         type=float,
96 |         help="Maximum outer radius of annulus",
97 |     )
98 |     euclidicity_group.add_argument(
99 |         "--num-steps",
100 |         default=10,
101 |         type=int,
102 |         help="Number of steps for annulus sampling",
103 |     )
104 |     parser.add_argument(
105 |         "-f",
106 |         "--fixed-annulus",
107 |         action="store_true",
108 |         help="If set, compare to fixed annulus (disables Euclidean sampling)",
109 |     )
110 |
111 |     sampling_group = parser.add_argument_group("Sampling")
112 |
113 |     sampling_group.add_argument(
114 |         "-b",
115 |         "--batch-size",
116 |         default=10000,
117 |         type=int,
118 |         help="Number of points to sample from input data",
119 |     )
120 |     sampling_group.add_argument(
121 |         "-q",
122 |         "--num-query-points",
123 |         default=1000,
124 |         type=int,
125 |         help="Number of query points for Euclidicity calculations",
126 |     )
127 |     sampling_group.add_argument(
128 |         "--seed",
129 |         type=int,
130 |         help="Random number generator seed for reproducible experiments",
131 |     )
132 |
133 |     experimental_group = parser.add_argument_group("Experimental")
134 |
135 |     experimental_group.add_argument(
136 |         "--curvature",
137 |         "-K",
138 |         type=float,
139 |         default=None,
140 |         help="If set, change model space from Euclidean annulus to 2D disk of "
141 |         "constant curvature.",
142 |     )
143 |
144 |     # TODO: Check for compatibility of different settings. We cannot
145 |     # sample from different spaces if we also use a fixed annulus.
146 |     args = parser.parse_args()
147 |     return logger, args
148 |
149 |
150 | if __name__ == "__main__":
151 | logger, args = setup()
152 |
153 | if args.seed is not None:
154 | logger.info(f"Using pre-defined seed {args.seed}")
155 |
156 | rng = np.random.default_rng(args.seed)
157 |
158 | X, query_points = load_data(
159 | args.INPUT,
160 | args.batch_size,
161 | args.num_query_points,
162 | seed=rng,
163 | )
164 |
165 | r, R, s, S = args.r, args.R, args.s, args.S
166 | k = args.num_neighbours
167 |
168 | # Check whether we have to perform scale estimation on a per-point
169 | # basis. If not, we just supply an empty dict.
170 | if all([x is not None for x in [r, R, s, S]]):
171 | logger.info(
172 | f"Using global scales r = {r:.2f}, R = {R:.2f}, "
173 | f"s = {s:.2f}, S = {S:.2f}"
174 | )
175 |
176 | scales = [dict()] * len(query_points)
177 | else:
178 | logger.info(
179 | f"Performing scale estimation with k = {k} since no "
180 | f"parameters have been provided by the client."
181 | )
182 |
183 | scales = estimate_scales(X, query_points, k)
184 |
185 | max_dim = args.dimension
186 | n_steps = args.num_steps
187 |
188 | logger.info(f"Maximum dimension: {max_dim}")
189 | logger.info(f"Number of steps for local sampling: {n_steps}")
190 |
191 | # Choose a sampling procedure for the inner comparison of sampled
192 | # annuli from the data space with model spaces.
193 | if args.fixed_annulus:
194 | logger.info("Using fixed annulus comparison")
195 | model_sample_fn = None
196 | elif args.curvature is not None:
197 | logger.info("Using constant-curvature model space")
198 | model_sample_fn = functools.partial(
199 | sample_from_constant_curvature_annulus, K=args.curvature
200 | )
201 | else:
202 | logger.info("Using Euclidean annulus model space")
203 | model_sample_fn = sample_from_annulus
204 |
205 | euclidicity = Euclidicity(
206 | max_dim=max_dim,
207 | n_steps=n_steps,
208 | r=args.r,
209 | R=args.R,
210 | s=args.s,
211 | S=args.S,
212 | method="ripser",
213 | data=X,
214 | model_sample_fn=model_sample_fn,
215 | )
216 |
217 | def _process(x, scale=None):
218 | scores, dimensions = euclidicity(X, x, **scale)
219 |
220 | # Aggregate over all scores that we find. We could pick
221 | # a different aggregation here!
222 | score = np.mean(np.nan_to_num(scores))
223 | dimension = np.mean(dimensions)
224 |
225 | return score, dimension
226 |
227 | output = joblib.Parallel(n_jobs=-1)(
228 | joblib.delayed(_process)(x, scale)
229 | for x, scale in zip(query_points, scales)
230 | )
231 |
232 | df = pd.DataFrame(
233 | output, columns=["euclidicity", "persistent_intrinsic_dimension"]
234 | )
235 |
236 | df = pd.concat([pd.DataFrame(query_points).add_prefix("X"), df], axis=1)
237 |
238 | if args.output is None:
239 | print(df.to_csv(index=False))
240 | else:
241 | extension = os.path.splitext(args.output)[1]
242 | if extension == ".tsv":
243 | df.to_csv(args.output, index=False, sep="\t")
244 | elif extension == ".csv":
245 | df.to_csv(args.output, index=False)
246 | elif extension == ".npy":
247 | np.save(args.output, df)
248 | elif extension == ".npz":
249 | np.savez(args.output, df)
250 |
--------------------------------------------------------------------------------
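Results written with `-o` can be read back depending on the extension; for `.npz` files, `analyse_euclidicity.py` above already shows the pattern. A minimal sketch (`scores.npz` is a hypothetical output file):

```python
# Sketch: read back results written by cli.py with "-o scores.npz".
# The array columns are the query-point coordinates, followed by the
# "euclidicity" and "persistent_intrinsic_dimension" values
# (cf. the DataFrame construction in cli.py).
import numpy as np

X = np.load("scores.npz")["arr_0"]

euclidicity = X[:, -2]
persistent_intrinsic_dimension = X[:, -1]
```
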
/tardis/data.py:
--------------------------------------------------------------------------------
1 | """Provides samples of more complicated data sets."""
2 |
3 | from torchvision import datasets
4 | from torchvision import transforms
5 |
6 | from torch.utils.data import DataLoader
7 |
8 |
9 | def sample_vision_data_set(name, n_samples):
10 |     """Sample vision data set.
11 |
12 |     Parameters
13 |     ----------
14 |     name : str
15 |         Name of the data set. Currently, only "MNIST" and "FashionMNIST"
16 |         are supported here.
17 |
18 |     n_samples : int
19 |         Number of samples to retrieve.
20 |
21 |     Returns
22 |     -------
23 |     np.array
24 |         Sampled data points
25 |     """
26 |     assert name in ["MNIST", "FashionMNIST"]
27 |
28 |     transform = transforms.Compose(
29 |         [transforms.ToTensor(), transforms.Normalize((0.5), (0.5))]
30 |     )
31 |
32 |     transform = transforms.ToTensor()
33 |
34 |     if name == "MNIST":
35 |         cls = datasets.MNIST
36 |     elif name == "FashionMNIST":
37 |         cls = datasets.FashionMNIST
38 |
39 |     data = cls(root="../data", train=True, download=True, transform=transform)
40 |
41 |     data_loader = DataLoader(dataset=data, batch_size=n_samples, shuffle=True)
42 |
43 |     X, _ = next(iter(data_loader))
44 |     X = X.reshape(n_samples, -1)
45 |     X = X.numpy()
46 |
47 |     return X
48 |
--------------------------------------------------------------------------------
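A minimal usage sketch for the sampler above; note that it downloads the data set into `../data` on first use:

```python
# Sketch: draw 1000 MNIST images as a point cloud of flattened vectors.
from tardis.data import sample_vision_data_set

X = sample_vision_data_set("MNIST", n_samples=1000)
print(X.shape)  # (1000, 784), one flattened 28x28 image per row
```
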
/tardis/euclidicity.py:
--------------------------------------------------------------------------------
1 | """Euclidicity example implementation."""
2 |
3 | import numpy as np
4 |
5 | from sklearn.neighbors import KDTree
6 |
7 | from tardis.persistent_homology import GUDHI
8 | from tardis.persistent_homology import Ripser
9 |
10 |
11 | class Euclidicity:
12 |     """Functor for calculating Euclidicity of a point cloud."""
13 |
14 |     def __init__(
15 |         self,
16 |         max_dim,
17 |         r=None,
18 |         R=None,
19 |         s=None,
20 |         S=None,
21 |         n_steps=10,
22 |         data=None,
23 |         method="gudhi",
24 |         model_sample_fn=None,
25 |     ):
26 |         """Initialise new instance of functor.
27 |
28 |         Sets up a new instance of the Euclidicity functor and stores
29 |         shared parameters that will be used for the calculation. The
30 |         client has the choice of either providing global parameters,
31 |         or adjusting them on a per-point basis.
32 |
33 |         Parameters
34 |         ----------
35 |         max_dim : int
36 |             Maximum dimension for persistent homology approximations.
37 |             This is the *only* required parameter.
38 |
39 |         r : float, optional
40 |             Minimum inner radius of annulus
41 |
42 |         R : float, optional
43 |             Maximum inner radius of annulus
44 |
45 |         s : float, optional
46 |             Minimum outer radius of annulus
47 |
48 |         S : float, optional
49 |             Maximum outer radius of annulus
50 |
51 |         n_steps : int, optional
52 |             Number of steps for the radius parameter grid of the
53 |             annulus. Note that the complexity of the function is
54 |             quadratic in the number of steps.
55 |
56 |         data : np.array or None
57 |             If set, prepares a tree for nearest-neighbour and radius
58 |             queries on the input data set. This can lead to substantial
59 |             speed improvements in practice.
60 |
61 |         method : str
62 |             Persistent homology calculation method. At the moment, only
63 |             "gudhi" and "ripser" are supported. "gudhi" is better for a
64 |             small, low-dimensional data set, while "ripser" scales well
65 |             to larger, high-dimensional point clouds.
66 |
67 |         model_sample_fn : callable
68 |             Function to be called for sampling from a comparison space.
69 |             The function is supplied with the number of samples,
70 |             the radii of the annulus, and the intrinsic dimension. Its
71 |             output must be a point cloud representing the annulus. If no
72 |             sample function is provided, the class will default to
73 |             comparing the topological features with those of a fixed
74 |             Euclidean annulus.
75 |         """
76 |         self.r = r
77 |         self.R = R
78 |         self.s = s
79 |         self.S = S
80 |
81 |         self.n_steps = n_steps
82 |         self.max_dim = max_dim
83 |
84 |         self.model_sample_fn = model_sample_fn
85 |
86 |         if method == "gudhi":
87 |             self.vr = GUDHI()
88 |         elif method == "ripser":
89 |             self.vr = Ripser()
90 |         else:
91 |             raise RuntimeError("No persistent homology calculation selected.")
92 |
93 |         # Prepare KD tree to speed up annulus calculations. We make this
94 |         # configurable to permit both types of workflows.
95 |         if data is not None:
96 |             self.tree = KDTree(data)
97 |         else:
98 |             self.tree = None
99 |
100 |     def __call__(self, X, x, **kwargs):
101 |         """Calculate Euclidicity of a specific point.
102 |
103 |         Parameters
104 |         ----------
105 |         X : np.array or tensor of shape ``(N, d)``
106 |             Input data set. Must be compatible with the persistent
107 |             homology calculations.
108 |
109 |         x : np.array, tensor, or iterable of shape ``(d, )``
110 |             Input point.
111 |
112 |         Other Parameters
113 |         ----------------
114 |         r : float, optional
115 |             Minimum inner radius of annulus. Will default to global `r`
116 |             parameter if not set.
117 |
118 |         R : float, optional
119 |             Maximum inner radius of annulus. Will default to global `R`
120 |             parameter if not set.
121 |
122 |         s : float, optional
123 |             Minimum outer radius of annulus. Will default to global `s`
124 |             parameter if not set.
125 |
126 |         S : float, optional
127 |             Maximum outer radius of annulus. Will default to global `S`
128 |             parameter if not set.
129 |
130 |         Returns
131 |         -------
132 |         Tuple of np.array, np.array
133 |             1D array containing Euclidicity estimates. The length of the
134 |             array depends on the number of scales. The second array will
135 |             contain the persistent intrinsic dimension (PID) values.
136 |         """
137 |         r = kwargs.get("r", self.r)
138 |         R = kwargs.get("R", self.R)
139 |         s = kwargs.get("s", self.s)
140 |         S = kwargs.get("S", self.S)
141 |
142 |         bottleneck_distances = []
143 |         dimensions = []
144 |
145 |         for r in np.linspace(r, R, self.n_steps):
146 |             for s in np.linspace(s, S, self.n_steps):
147 |                 if r < s:
148 |                     dist, dim = self._calculate_euclidicity(
149 |                         r, s, X, x, self.max_dim
150 |                     )
151 |
152 |                     bottleneck_distances.append(dist)
153 |                     dimensions.append(dim)
154 |
155 |         return np.asarray(bottleneck_distances), np.asarray(dimensions)
156 |
157 |     # Auxiliary method for performing the 'heavy lifting' when it comes
158 |     # to Euclidicity calculations.
159 |     def _calculate_euclidicity(self, r, s, X, x, d):
160 |         if self.tree is not None:
161 |             inner_indices = self.tree.query_radius(x.reshape(1, -1), r)[0]
162 |             outer_indices = self.tree.query_radius(x.reshape(1, -1), s)[0]
163 |
164 |             annulus_indices = np.setdiff1d(outer_indices, inner_indices)
165 |             annulus = X[annulus_indices]
166 |         else:
167 |             annulus = np.asarray(
168 |                 [
169 |                     np.asarray(p)
170 |                     for p in X
171 |                     if np.linalg.norm(x - p) <= s
172 |                     and np.linalg.norm(x - p) >= r
173 |                 ]
174 |             )
175 |
176 |         barcodes, max_dim = self.vr(annulus, d)
177 |
178 |         if max_dim < 0:
179 |             return np.nan, max_dim
180 |
181 |         if self.model_sample_fn is not None:
182 |             euclidean_annulus = self.model_sample_fn(
183 |                 n=len(annulus), r=r, R=s, d=d
184 |             )
185 |             barcodes_euclidean, _ = self.vr(euclidean_annulus, d)
186 |
187 |         # No sampling function has been specified. Compare to a fixed
188 |         # annulus with known persistent homology.
189 |         #
190 |         # TODO: Technically, the single feature should be put into
191 |         # a persistence diagram of the right dimension. Let us not
192 |         # do that for now (since we stack diagrams anyway).
193 |         else:
194 |             barcodes_euclidean = np.asarray([[0, np.inf], [r, s]])
195 |
196 |         if barcodes_euclidean is None:
197 |             return np.nan, max_dim
198 |
199 |         dist = self.vr.distance(barcodes, barcodes_euclidean)
200 |         return dist, max_dim
201 |
--------------------------------------------------------------------------------
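For fine-grained control, the functor above can also be driven directly, without the API wrapper. A minimal sketch using the fixed-annulus comparison (no `model_sample_fn`); the data set and the annulus radii are stand-ins, not recommended settings:

```python
# Sketch: direct use of the Euclidicity functor on a single query point.
import numpy as np

from tardis.euclidicity import Euclidicity

rng = np.random.default_rng(42)
X = rng.normal(size=(200, 3))  # stand-in point cloud

euclidicity = Euclidicity(
    max_dim=2,
    r=0.05,
    R=0.45,
    s=0.2,
    S=0.6,
    n_steps=10,
    data=X,  # enables the KD tree for fast annulus queries
    method="ripser",
)

# One Euclidicity estimate per admissible (r, s) pair on the scale grid.
scores, dims = euclidicity(X, X[0])
print(np.mean(np.nan_to_num(scores)), np.mean(dims))
```
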
/tardis/make_pinched_torus.py:
--------------------------------------------------------------------------------
1 | """Create "pinched torus" data set.
2 |
3 | Usage:
4 | python make_pinched_torus.py > Pinched_torus.csv
5 | """
6 |
7 | import sys
8 |
9 | from math import pi
10 | from math import cos
11 | from math import sin
12 |
13 | import numpy as np
14 |
15 | n = 4096
16 | m = 512
17 | o = 512
18 | R = 10
19 | r = 1
20 | k = 0.5
21 |
22 | # Gap size in angular coordinates. This is to be seen as the radius for
23 | # which the 'pinch' is relevant.
24 | gap_size = pi / 180.0 * 90
25 |
26 | X = list()
27 | Y = list()
28 | Z = list()
29 |
30 | for i in range(m):
31 |     for j in range(o):
32 |         phi = 2 * pi * i / (m - 1)
33 |         theta = 2 * pi * j / (o - 1)
34 |
35 |         r_ = r
36 |
37 |         x = (R + r_ * cos(theta) * cos(k * phi)) * cos(phi)
38 |         y = (R + r_ * cos(theta) * cos(k * phi)) * sin(phi)
39 |         z = r_ * sin(theta) * cos(k * phi)
40 |
41 |         X.append(x)
42 |         Y.append(y)
43 |         Z.append(z)
44 |
45 | X = np.vstack((X, Y, Z)).T
46 | np.savetxt(sys.stdout, X)
47 |
--------------------------------------------------------------------------------
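
For readers who prefer a vectorised view, the nested loops above admit a direct NumPy translation. This is a sketch with the same parameters, not a drop-in replacement for the script:

    import numpy as np

    m = o = 512
    R, r, k = 10, 1, 0.5

    phi, theta = np.meshgrid(
        np.linspace(0, 2 * np.pi, m),
        np.linspace(0, 2 * np.pi, o),
        indexing="ij",
    )

    # The tube radius is modulated by cos(k * phi), creating the pinch.
    rho = R + r * np.cos(theta) * np.cos(k * phi)
    X = np.column_stack(
        (
            (rho * np.cos(phi)).ravel(),
            (rho * np.sin(phi)).ravel(),
            (r * np.sin(theta) * np.cos(k * phi)).ravel(),
        )
    )
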
/tardis/make_wedged_spheres.py:
--------------------------------------------------------------------------------
1 | """Create "wedged spheres" data set.
2 |
3 | Usage:
4 |     python make_wedged_spheres.py -d 2 > Wedged_spheres_2.txt
5 | """
6 |
7 | import argparse
8 | import sys
9 |
10 | import numpy as np
11 |
12 | from tardis.shapes import sample_from_wedged_spheres
13 |
14 |
15 | if __name__ == "__main__":
16 | parser = argparse.ArgumentParser()
17 |
18 | parser.add_argument(
19 | "-d", "--dimension", default=2, type=int, help="Intrinsic dimension"
20 | )
21 | parser.add_argument(
22 | "-n",
23 | "--num-samples",
24 | default=10000,
25 | type=int,
26 | help="Number of samples",
27 | )
28 |
29 | args = parser.parse_args()
30 |
31 | X = sample_from_wedged_spheres(args.num_samples, args.dimension)
32 | np.savetxt(sys.stdout, X)
33 |
--------------------------------------------------------------------------------
/tardis/make_wedged_spheres_varying_dim.py:
--------------------------------------------------------------------------------
1 | """Create "wedged spheres of possibly different dimensions" data set.
2 |
3 | Usage:
4 |     python make_wedged_spheres_varying_dim.py > Wedged_spheres_varying_dim.txt
5 | """
6 |
7 | import argparse
8 | import sys
9 |
10 | import numpy as np
11 |
12 | from tardis.shapes import sample_from_wedged_sphere_varying_dim
13 |
14 |
15 | if __name__ == "__main__":
16 | parser = argparse.ArgumentParser()
17 |
18 | parser.add_argument(
19 | "-d1",
20 | "--dimension1",
21 | default=1,
22 | type=int,
23 | help="Intrinsic dimension of first sphere",
24 | )
25 | parser.add_argument(
26 | "-d2",
27 | "--dimension2",
28 | default=2,
29 | type=int,
30 | help="Intrinsic dimension of second sphere",
31 | )
32 | parser.add_argument(
33 | "-n",
34 | "--num-samples",
35 | default=10000,
36 | type=int,
37 | help="Number of samples",
38 | )
39 |
40 | args = parser.parse_args()
41 |
42 | X = sample_from_wedged_sphere_varying_dim(
43 | args.num_samples, args.dimension1, args.dimension2
44 | )
45 | np.savetxt(sys.stdout, X)
46 |
--------------------------------------------------------------------------------
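
Both generator scripts are thin wrappers around `tardis.shapes`, so the samplers can also be called directly. A minimal sketch (the sample size is arbitrary):

    from tardis.shapes import sample_from_wedged_sphere_varying_dim

    # Wedge a circle (d1 = 1) with a 2-sphere (d2 = 2); the result lives
    # in three ambient dimensions, with `n` points drawn per sphere.
    X = sample_from_wedged_sphere_varying_dim(n=2500, d1=1, d2=2)
    print(X.shape)  # (5000, 3)
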
/tardis/mat_to_npy.py:
--------------------------------------------------------------------------------
1 | """Convert .mat to .npz file(s).
2 |
3 | Usage:
4 | python mat_to_npy.py INPUT
5 | """
6 |
7 | import os
8 | import sys
9 |
10 | import numpy as np
11 | import scipy.io as sio
12 |
13 | fname = sys.argv[1]
14 | X = sio.loadmat(fname)
15 | X = X["data"]
16 |
17 | fname = os.path.splitext(fname)[0]
18 |
19 | if not os.path.exists(fname + ".npz"):  # np.savez appends the suffix
20 | np.savez(fname, data=X)
21 |
--------------------------------------------------------------------------------
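
Since the script stores the converted array under the key "data", its output pairs up with the `.npz` branch of `tardis.utils.load_data`. A quick round-trip check (the file name is a placeholder for whatever the script produced):

    import numpy as np

    X = np.load("INPUT.npz")["data"]
    print(X.shape)
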
/tardis/persistent_homology.py:
--------------------------------------------------------------------------------
1 | """Wrappers for persistent homology calculations.
2 |
3 | The purpose of this module is to provide wrappers for the persistent
4 | homology calculations. This is to ensure that the returned shapes of
5 | barcodes etc. are always consistent regardless of any implementation
6 | details.
7 | """
8 |
9 | import gudhi as gd
10 | import numpy as np
11 |
12 | from gph import ripser_parallel
13 |
14 |
15 | class GUDHI:
16 | """Wrapper for GUDHI persistent homology calculations."""
17 |
18 | def __call__(self, X, max_dim):
19 | """Calculate persistent homology.
20 |
21 | Parameters
22 | ----------
23 | X : np.array of shape ``(N, d)``
24 | Input data set.
25 |
26 | max_dim : int
27 |             Maximum dimension for calculations.
28 |
29 | Returns
30 | -------
31 |         Tuple of np.array, int
32 |             Barcode of the data set and its maximum feature dimension.
33 | """
34 | barcodes = (
35 | gd.RipsComplex(points=X)
36 | .create_simplex_tree(max_dimension=max_dim)
37 | .persistence()
38 | )
39 |
40 | if len(barcodes) == 0:
41 | return None, -1
42 |
43 | # TODO: Check whether this is *always* a feature of non-zero
44 | # persistence.
45 | max_dim = np.max([d for d, _ in barcodes])
46 |
47 | # TODO: We are throwing away dimensionality information; it is
48 | # thus possible that we are matching across different dimensions
49 | # in any distance calculation.
50 | barcodes = np.asarray([np.array(x) for _, x in barcodes])
51 |
52 | return barcodes, max_dim
53 |
54 | def distance(self, D1, D2):
55 | """Calculate Bottleneck distance between two persistence diagrams."""
56 | return gd.bottleneck_distance(D1, D2)
57 |
58 |
59 | class Ripser:
60 | def __init__(self, stack_diagrams=True):
61 | self.stack_diagrams = stack_diagrams
62 |
63 | if self.stack_diagrams:
64 | def distance_fn(D1, D2):
65 | return gd.bottleneck_distance(D1, D2)
66 | else:
67 | def distance_fn(diagrams1, diagrams2):
68 | values = [
69 | gd.bottleneck_distance(D1, D2)
70 | for D1, D2 in zip(diagrams1, diagrams2)
71 | ]
72 | return np.max(values)
73 |
74 | self.distance = distance_fn
75 |
76 | def __call__(self, X, max_dim):
77 | if len(X) == 0:
78 | return [], -1
79 |
80 | diagrams = ripser_parallel(
81 | X, maxdim=max_dim, collapse_edges=True
82 | )
83 |
84 | diagrams = diagrams["dgms"]
85 | max_dim = np.max([d for d, D in enumerate(diagrams) if len(D) > 0])
86 |
87 | if self.stack_diagrams:
88 |             diagrams = np.vstack(diagrams)  # row_stack is a deprecated alias
89 |
90 | return diagrams, max_dim
91 |
--------------------------------------------------------------------------------
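
A minimal sketch of how the `Ripser` wrapper is meant to be used, assuming the package and its dependencies are installed; with stacked diagrams, `distance` reduces to a single bottleneck distance across all homology dimensions:

    from tardis.persistent_homology import Ripser
    from tardis.shapes import sample_from_annulus

    vr = Ripser()

    # Persistent homology (up to dimension 1) of two annulus samples.
    D1, dim1 = vr(sample_from_annulus(200, 0.5, 1.0, seed=0), 1)
    D2, dim2 = vr(sample_from_annulus(200, 0.5, 1.0, seed=1), 1)

    print(dim1, dim2, vr.distance(D1, D2))
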
/tardis/shapes.py:
--------------------------------------------------------------------------------
1 | """Shape sampling methods."""
2 |
3 | import numpy as np
4 |
5 |
6 | def sample_from_annulus(n, r, R, d=2, seed=None):
7 | """Sample points from an annulus.
8 |
9 |     This function samples `n` points from an annulus with inner radius `r`
10 | and outer radius `R`.
11 |
12 | Parameters
13 | ----------
14 | n : int
15 | Number of points to sample
16 |
17 | r : float
18 | Inner radius of annulus
19 |
20 | R : float
21 | Outer radius of annulus
22 |
23 | d : int
24 | Dimension of the annulus. Technically, for higher dimensions, we
25 | should call the resulting space a "hyperspherical shell." Notice
26 | that the algorithm for sampling points in higher dimensions uses
27 | rejection sampling, so its efficiency decreases as the dimension
28 | increases.
29 |
30 | seed : int, instance of `np.random.Generator`, or `None`
31 | Seed for the random number generator, or an instance of such
32 | a generator. If set to `None`, the default random number
33 | generator will be used.
34 |
35 | Returns
36 | -------
37 |     np.array of shape `(n, d)`
38 | Array containing sampled coordinates.
39 | """
40 | if r >= R:
41 | raise RuntimeError(
42 |             "Inner radius must be strictly less than outer radius"
43 | )
44 |
45 | rng = np.random.default_rng(seed)
46 |
47 | if d == 2:
48 | thetas = rng.uniform(0, 2 * np.pi, n)
49 |
50 | # Need to sample based on squared radii to account for density
51 | # differences.
52 | radii = np.sqrt(rng.uniform(r**2, R**2, n))
53 |
54 | X = np.column_stack((radii * np.cos(thetas), radii * np.sin(thetas)))
55 | else:
56 | X = np.empty((0, d))
57 |
58 | while True:
59 | sample = sample_from_ball(n, d, r=R, seed=rng)
60 | norms = np.sqrt(np.sum(np.abs(sample) ** 2, axis=-1))
61 |
62 |             X = np.vstack((X, sample[norms >= r]))
63 |
64 | if len(X) >= n:
65 | X = X[:n, :]
66 | break
67 |
68 | return X
69 |
70 |
71 | def sample_from_ball(n=100, d=2, r=1, seed=None):
72 | """Sample `n` data points from a `d`-ball in `d` dimensions.
73 |
74 | Parameters
75 |     ----------
76 | n : int
77 | Number of data points in ball.
78 |
79 | d : int
80 | Dimension of the ball. Notice that there is an inherent shift in
81 | dimension if you compare a ball to a sphere.
82 |
83 | r : float
84 | Radius of ball.
85 |
86 | seed : int, instance of `np.random.Generator`, or `None`
87 | Seed for the random number generator, or an instance of such
88 | a generator. If set to `None`, the default random number
89 | generator will be used.
90 |
91 | Returns
92 | -------
93 | np.array of shape `(n, d)`
94 | Array of sampled coordinates.
95 |
96 | References
97 | ----------
98 |     .. [Voelker2017] A. Voelker et al., Efficiently sampling vectors and
99 |        coordinates from the $n$-sphere and $n$-ball, Technical Report,
100 | 2017. http://compneuro.uwaterloo.ca/files/publications/voelker.2017.pdf
101 | """
102 | rng = np.random.default_rng(seed)
103 |
104 | # This algorithm was originally described in the following blog
105 | # post:
106 | #
107 | # http://extremelearning.com.au/how-to-generate-uniformly-random-points
108 | # -on-n-spheres-and-n-balls/
109 | #
110 | # It's mind-boggling that this works but it's true!
111 | U = rng.normal(size=(n, d + 2))
112 | norms = np.sqrt(np.sum(np.abs(U) ** 2, axis=-1))
113 | U = r * U / norms[:, np.newaxis]
114 | X = U[:, 0:d]
115 |
116 | return np.asarray(X)
117 |
118 |
119 | def sample_from_sphere(n=100, d=2, r=1, noise=None, seed=None):
120 | """Sample `n` data points from a `d`-sphere in `d + 1` dimensions.
121 |
122 | Parameters
123 |     ----------
124 | n : int
125 | Number of data points in shape.
126 |
127 | d : int
128 | Dimension of the sphere.
129 |
130 | r : float
131 | Radius of sphere.
132 |
133 | noise : float or None
134 | Optional noise factor. If set, data coordinates will be
135 | perturbed by a standard normal distribution, scaled by
136 | `noise`.
137 |
138 | seed : int, instance of `np.random.Generator`, or `None`
139 | Seed for the random number generator, or an instance of such
140 | a generator. If set to `None`, the default random number
141 | generator will be used.
142 |
143 | Returns
144 | -------
145 | np.array of shape `(n, d + 1)`
146 | Array of sampled coordinates.
147 |
148 | Notes
149 | -----
150 | This function was originally authored by Nathaniel Saul as part of
151 | the `tadasets` package. [tadasets]_
152 |
153 | References
154 | ----------
155 | .. [tadasets] https://github.com/scikit-tda/tadasets
156 | """
157 | rng = np.random.default_rng(seed)
158 | data = rng.standard_normal((n, d + 1))
159 |
160 | # Normalize points to the sphere
161 | data = r * data / np.sqrt(np.sum(data**2, 1)[:, None])
162 |
163 | if noise:
164 | data += noise * rng.standard_normal(data.shape)
165 |
166 | return np.asarray(data)
167 |
168 |
169 | def sample_from_wedged_spheres(n=100, d=2, r=1, noise=None, seed=None):
170 | """Sample points from two wedged spheres.
171 |
172 | Parameters
173 | ----------
174 | n : int
175 | Number of points to sample
176 |
177 | d : int
178 | Intrinsic dimension of spheres. The ambient dimension will be
179 | ``d + 1``.
180 |
181 | r : float
182 | Radius of spheres
183 |
184 | noise : float or None
185 | If set, will be used as a scale factor for random perturbations
186 | of the positions of points, following a standard normal
187 | distribution.
188 |
189 | seed : int, instance of `np.random.Generator`, or `None`
190 | Seed for the random number generator, or an instance of such
191 | a generator. If set to `None`, the default random number
192 | generator will be used.
193 | """
194 | rng = np.random.default_rng(seed)
195 |
196 | data1 = rng.standard_normal((n, d + 1))
197 | data1 = r * data1 / np.sqrt(np.sum(data1**2, 1)[:, None])
198 |
199 | data2 = rng.standard_normal((n, d + 1))
200 | data2 = (
201 | r * data2 / np.sqrt(np.sum(data2**2, 1)[:, None])
202 | ) + np.concatenate((np.array([2 * r]), np.zeros(data2.shape[1] - 1)))
203 |
204 | X = np.concatenate((data1, data2))
205 |
206 | if noise:
207 | X += noise * rng.standard_normal(X.shape)
208 |
209 | return X
210 |
211 |
212 | def sample_from_wedged_sphere_varying_dim(n=100, d1=1, d2=2, r=1, noise=None):
213 | """Sample points from two wedged spheres of possibly different dimensions.
214 |
215 | This function permits sampling from wedged spheres of different
216 | dimensions, thus making it possible to, for instance, combine a
217 | circle with an ordinary 2D sphere.
218 |
219 | Parameters
220 | ----------
221 | n : int
222 | Number of points to sample
223 |
224 | d1 : int
225 | Intrinsic dimension of first sphere. The ambient dimension will be
226 | ``d1 + 1``.
227 |
228 | d2 : int
229 |         Intrinsic dimension of second sphere. The ambient dimension will be
230 | ``d2 + 1``.
231 |
232 | r : float
233 | Radius of spheres
234 |
235 | noise : float or None
236 | If set, will be used as a scale factor for random perturbations
237 | of the positions of points, following a standard normal
238 | distribution.
239 | """
240 | data1 = np.random.randn(n, d1 + 1)
241 | data1 = r * data1 / np.sqrt(np.sum(data1**2, 1)[:, None])
242 |     zeros = np.zeros((len(data1), d2 - d1))  # requires d2 >= d1
243 | data1 = np.concatenate((data1, zeros), axis=1)
244 |
245 | data2 = np.random.randn(n, d2 + 1)
246 | data2 = (
247 | r * data2 / np.sqrt(np.sum(data2**2, 1)[:, None])
248 | ) + np.concatenate((np.array([2 * r]), np.zeros(data2.shape[1] - 1)))
249 |
250 | data = np.concatenate((data1, data2))
251 | if noise:
252 | data += noise * np.random.randn(*data.shape)
253 |
254 | return data
255 |
256 |
257 | def sample_from_constant_curvature_annulus(n, K, r, R, seed=None, **kwargs):
258 | rng = np.random.default_rng(seed)
259 | X = np.empty((0, 2))
260 |
261 | while True:
262 | sample = sample_from_constant_curvature_disk(n, K=K, r=R, seed=rng)
263 | norms = np.sqrt(np.sum(np.abs(sample) ** 2, axis=-1))
264 |
265 |         X = np.vstack((X, sample[norms >= r]))
266 |
267 | if len(X) >= n:
268 | X = X[:n, :]
269 | break
270 |
271 | return X
272 |
273 |
274 | def sample_from_constant_curvature_disk(n, K=0.0, r=1.0, seed=None):
275 | """Sample from a disk of constant curvature.
276 |
277 | Parameters
278 | ----------
279 | n : int
280 | Number of points to sample
281 |
282 | K : float
283 | Curvature of the respective disk. When positive, must be less
284 | than or equal to 2.
285 |
286 | seed : int, instance of `np.random.Generator`, or `None`
287 | Seed for the random number generator, or an instance of such
288 | a generator. If set to `None`, the default random number
289 | generator will be used.
290 |
291 | Returns
292 | -------
293 | np.array of shape `(n, 2)`
294 | Array containing sampled coordinates.
295 | """
296 | rng = np.random.default_rng(seed)
297 |
298 | theta = rng.uniform(0, 2 * np.pi, n)
299 | u = rng.uniform(0, 1, n)
300 |
301 | # Sample from Euclidean disk; we could also get this result with
302 | # other routines from this module, but implementing this here is
303 | # making everything more self-contained.
304 | if K == 0.0:
305 | radii = np.sqrt(u)
306 |
307 | # Hyperbolic case (negative curvature)
308 | elif K < 0.0:
309 | radii = np.multiply(np.sqrt(u), np.sinh(np.sqrt(-K) / 2.0))
310 | radii = np.multiply(2.0 / np.sqrt(-K), np.arcsinh(radii))
311 |
312 | # Spherical case (positive curvature)
313 | else:
314 | assert K <= 2
315 |
316 | radii = np.multiply(np.sqrt(u), np.sin(np.sqrt(K) / 2.0))
317 | radii = np.multiply(2.0 / np.sqrt(K), np.arcsin(radii))
318 |
319 | x = np.multiply(r * radii, np.cos(theta))
320 | y = np.multiply(r * radii, np.sin(theta))
321 | return np.vstack([x, y]).T
322 |
--------------------------------------------------------------------------------
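
The geometric invariants of the samplers above are easy to spot-check. A short sketch, using small tolerances to absorb floating-point error:

    import numpy as np

    from tardis.shapes import sample_from_annulus, sample_from_ball

    A = sample_from_annulus(500, r=1.0, R=2.0, seed=42)
    B = sample_from_ball(500, d=3, r=1.0, seed=42)

    # Annulus points have norms in [r, R]; ball points have norms <= r.
    assert np.all(np.linalg.norm(A, axis=1) >= 1.0 - 1e-9)
    assert np.all(np.linalg.norm(A, axis=1) <= 2.0 + 1e-9)
    assert np.all(np.linalg.norm(B, axis=1) <= 1.0 + 1e-9)
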
/tardis/utils.py:
--------------------------------------------------------------------------------
1 | """Utilities module.
2 |
3 | This module collects some utility functions, making them accessible to
4 | a wider number of modules.
5 | """
6 |
7 | import logging
8 | import os
9 |
10 | import numpy as np
11 |
12 | from sklearn.neighbors import KDTree
13 |
14 | from tardis.data import sample_vision_data_set
15 |
16 |
17 | def load_data(filename, batch_size, n_query_points, seed=None):
18 | """Load data from filename, depending on input type.
19 |
20 | Parameters
21 | ----------
22 | filename : str
23 | If this points to a file name, the function will attempt to load
24 | said file and parse it. Else, the function will consider this as
25 | the name of a data set to load.
26 |
27 | batch_size : int
28 | Number of points to sample from data set.
29 |
30 | n_query_points : int
31 | Number of points to use for the subsequent Euclidicity
32 | calculations. It is possible to use the full data set.
33 |
34 | seed : int, instance of `np.random.Generator`, or `None`
35 | Seed for the random number generator, or an instance of such
36 | a generator. If set to `None`, the default random number
37 | generator will be used.
38 |
39 | Returns
40 | -------
41 | Tuple of np.array, np.array
42 | The (subsampled) data set along with its query points is
43 | returned.
44 | """
45 |     if os.path.exists(filename):
46 |         ext = os.path.splitext(filename)[1]
47 |         if ext == ".txt" or ext == ".gz":
48 |             X = np.loadtxt(filename)
49 |         elif ext == ".npz":
50 |             X = np.load(filename)["data"]
51 |         else:
52 |             X = None
53 |     else:
54 |         X = sample_vision_data_set(filename, batch_size)
55 |     if X is None:
56 |         raise RuntimeError(f"Unable to handle input file {filename}")
57 |
58 | logger = logging.getLogger()
59 |
60 | logger.info(f"Sampling a batch of {batch_size} points")
61 | logger.info(f"Using {n_query_points} query points")
62 |
63 | rng = np.random.default_rng(seed)
64 |
65 | X = X[rng.choice(X.shape[0], batch_size, replace=False)]
66 | query_points = X[rng.choice(X.shape[0], n_query_points, replace=False)]
67 |
68 | return X, query_points
69 |
70 |
71 | def estimate_scales(X, query_points, k_max):
72 | """Perform simple scale estimation of the data set.
73 |
74 |     Parameters
75 |     ----------
76 |     X : np.array
77 |         Input data set.
78 |
79 |     query_points : np.array
80 |         Query points for which to estimate local scales.
81 |
82 |     k_max : int
83 |         Maximum number of neighbours to consider for the local scale
84 |         estimation.
85 |
86 |     Returns
87 |     -------
88 |     List of dict
89 |         A list of dictionaries containing the minimum and maximum
90 |         inner and outer radii, respectively, one per query point.
91 |     """
92 |     tree = KDTree(X)
93 |     distances, _ = tree.query(query_points, k=k_max, return_distance=True)
94 |
95 |     # Ignore the distance of each point to itself; we know it is zero.
96 |     distances = distances[:, 1:]
97 |
98 |     scales = [
99 |         {
100 |             "r": dist[0],
101 |             "R": dist[round(k_max / 3)],
102 |             "s": dist[round(k_max / 3)],
103 |             "S": dist[-1],
104 |         }
105 |         for dist in distances
106 |     ]
107 |
108 |     return scales
109 |
--------------------------------------------------------------------------------
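
A short sketch of `estimate_scales` in action; the sphere sample and the parameter choices are arbitrary:

    from tardis.shapes import sample_from_sphere
    from tardis.utils import estimate_scales

    X = sample_from_sphere(n=500, d=2, seed=0)
    query_points = X[:10]

    # One dict of locally-estimated radii (r, R, s, S) per query point,
    # derived from nearest-neighbour distances.
    scales = estimate_scales(X, query_points, k_max=50)
    print(scales[0])
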
/tardis/visualise_data.py:
--------------------------------------------------------------------------------
1 | """Basic visualisation of Euclidicity.
2 |
3 | This is a helper script for visualising Euclidicity scores of
4 | high-dimensional point clouds.
5 | """
6 |
7 | import argparse
8 | import os
9 |
10 | import numpy as np
11 | import pandas as pd
12 |
13 | import phate
14 |
15 | import matplotlib.pyplot as plt
16 | import seaborn as sns
17 |
18 |
19 | if __name__ == "__main__":
20 | parser = argparse.ArgumentParser()
21 |
22 | parser.add_argument("FILE", nargs="+", type=str, help="Input filename(s)")
23 | parser.add_argument(
24 | "-o",
25 | "--output",
26 | help="Output directory. If set, will store embedded point clouds.",
27 | type=str,
28 | )
29 |
30 | args = parser.parse_args()
31 |
32 | n_files = len(args.FILE)
33 |
34 | sns.set_theme(style="darkgrid")
35 | fig, axes = plt.subplots(ncols=n_files)
36 |
37 | if n_files == 1:
38 | axes = [axes]
39 |
40 | # Following the parameters of the original PHATE publication. We set
41 | # a random state to ensure that the output remains reproducible.
42 | emb = phate.PHATE(decay=10, t=50, random_state=42)
43 |
44 | for filename, ax in zip(args.FILE, axes):
45 | if (ext := os.path.splitext(filename)[1]) == ".csv":
46 | df = pd.read_csv(filename)
47 | df = df.drop("persistent_intrinsic_dimension", axis="columns")
48 | X = df.to_numpy()
49 | elif ext == ".npz":
50 | X = np.load(filename)["arr_0"]
51 | else:
52 | X = np.loadtxt(filename)
53 |
54 | y = X[:, -1].flatten()
55 |
56 | iqr = np.subtract(*np.percentile(y, [75, 25]))
57 | q3 = np.percentile(y, 75)
58 |
59 | # Remove Euclidicity scores. Our implementation adds them to the
60 | # last column of the data.
61 | X = X[:, :-1]
62 |
63 | X_emb = emb.fit_transform(X)
64 |
65 | scatter = ax.scatter(
66 | x=X_emb[:, 0],
67 | y=X_emb[:, 1],
68 | c=y,
69 | alpha=0.5,
70 | s=1.0,
71 | # Try to highlight outliers a little bit better.
72 | vmax=q3 + 1.5 * iqr,
73 | )
74 | fig.colorbar(scatter, ax=ax)
75 |
76 | if args.output is not None:
77 | out_filename = os.path.basename(filename)
78 | out_filename = os.path.splitext(out_filename)[0] + ".csv"
79 | out_filename = os.path.join(args.output, out_filename)
80 |
81 | X_out = np.hstack((X_emb, y.reshape(-1, 1)))
82 |
83 | np.savetxt(
84 | out_filename,
85 | X_out,
86 | fmt="%.4f",
87 | delimiter=",",
88 | header="x,y,euclidicity",
89 | )
90 |
91 | plt.show()
92 |
--------------------------------------------------------------------------------
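
Usage sketch for the visualisation script, mirroring its argument parser (file names are placeholders):

    python visualise_data.py euclidicity_scores.csv -o embeddings/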