├── .gitignore
├── LICENSE
├── README.md
├── assets
│   ├── distance_correlations.png
│   ├── distance_correlations_4separation.png
│   ├── heatmap_and_scatters.png
│   ├── heatmap_and_scatters_4separation.png
│   ├── intrinsic_dims_increase_with_noise.png
│   ├── intrinsic_dims_increase_with_noise_4separation.png
│   ├── true_dims_with_noise_vs_dim_reduction.png
│   └── true_dims_with_noise_vs_dim_reduction_4separation.png
├── experiment.py
└── requirements.txt

/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2023 Scott Tyler

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Do t-SNE and UMAP overfit their intrinsic dimensionality?
## (The answer, unfortunately: Yes)

This repository presents an analysis of the intrinsic dimensionality of data that has true dimensions and additional noise dimensions. The primary goal is to explore how various noise levels influence the perceived dimensionality and how different dimensionality reduction methods represent the data.

## Intention

The main intention behind this analysis is to:
1. Generate synthetic data with known "true dimensions" and introduce redundant noisy dimensions.
2. Observe how different dimension reduction techniques, namely t-SNE and UMAP, represent the data with increasing noise.
3. Estimate the intrinsic dimensionality as noise levels increase and analyze its implications.

## Methods Walkthrough

1. **Data Generation**:
   - We started by creating synthetic data with known dimensions using a random number generator.
   - For each true dimension, redundant dimensions were added. These dimensions were seeded from the true data but had noise introduced, which was controlled by a specified standard deviation ratio (`sd_ratio`). A minimal sketch of this step follows the list below.

2. **Dimension Reduction**:
   - For each noise level, dimensionality was reduced using some popular linear & non-linear methods: PCA, NMF, t-SNE, UMAP, and SOM.
   - The reduced dimensions were then visualized against the true dimensions to observe the effects of noise.

3. **Intrinsic Dimensionality Estimation**:
   - At each noise level, the intrinsic dimensionality of the data was estimated using the DANCo method.
   - This helped us understand how perceived dimensionality changes as noise increases.
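To make the setup concrete, here is a minimal sketch of the data-generation idea (a simplified version of `generate_data` in `experiment.py`; the sizes and the `rng` seed here are just illustrative):

```python
import numpy as np

rng = np.random.default_rng(0)
n_obs, true_dims, n_redundant_per_true, sd_ratio = 1000, 2, 100, 0.25

# The "true" dimensions: pure Gaussian, so there is no real structure.
main_mat = rng.standard_normal((n_obs, true_dims))

# Each true dimension spawns many redundant copies with added Gaussian noise,
# scaled relative to that dimension's standard deviation.
redundant_dims = []
for i in range(true_dims):
    dim_std = np.std(main_mat[:, i])
    for _ in range(n_redundant_per_true):
        noise = rng.standard_normal(n_obs)
        redundant = main_mat[:, i] + noise * sd_ratio * dim_std
        redundant_dims.append(redundant / np.std(redundant))  # standardize

obs_data = np.column_stack(redundant_dims)
print(obs_data.shape)  # (1000, 200): what actually gets fed to the algorithms
```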
## Results:

### 2 "real" dimensions, no structure in observations

Okay - so folks have griped that the "false creation of structure from nothingness" might only happen when you "reduce" from 2 dimensions to 2 dimensions. Of course, that's not the point of dimension reduction - the point is to reduce dimensions.
https://twitter.com/slavov_n/status/1683785160825643008

So - what if the "intrinsic" dimensionality is 2, but there are lots of redundant dimensions? In this situation, we have 2 "real dimensions" that could explain most of the variation in the data. We'll first simulate those 2 "real dimensions" (left hand column in the plots below), then we'll create 100 redundant dimensions per real dimension for our 1000 observations (+variable amounts of noise for the redundant dimensions: rows). So here, the input fed into the algorithms is actually 1000 rows (observations) with 200 features (columns). But they are generated from 2 main features + variable amounts of noise. Do we still see structure from nothing when we are actually performing dimensionality reduction from 200 features to 2, knowing that the underlying 2 main features are unrelated? Yes.

![True Dimensions vs Dimension Reduction](assets/true_dims_with_noise_vs_dim_reduction.png)

From the above figure, we can see that PCA looks similar to the true dimensions. NMF (w/ data shifted up to be non-negative) honestly surprised me here... I don't have a good intuition for why adding noise creates that elongated shape... ¯\\_(ツ)_/¯. Ideally it wouldn't - it might give the impression that there is some sort of "trajectory", but we see here that just noise can create that illusion.

When we look at t-SNE and UMAP (default params), we find that they create the appearance of structure from random Gaussian distributions, even when doing a 100-fold dimension reduction, knowing what the true dimensions are. We also see that as noise increases, this structure gets blurrier (unsurprising; we'll circle back to that). If you're not familiar with SOM, it might look strange, but it just places observations inside of a 2D grid, so that's why it looks uniform; overall it visually looks like it made something Gaussian-ish, but the grid pattern makes it a bit harder to interpret. A minimal sketch of that grid mapping is shown below.
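For intuition on the SOM panels, here's roughly how observations get mapped onto the grid (this mirrors `som_wrapper` in `experiment.py`; the 50x50 grid and iteration count are the script's choices, and the random data here is just a stand-in):

```python
import numpy as np
from minisom import MiniSom

rng = np.random.default_rng(0)
data = rng.standard_normal((1000, 200))

# A SOM is a 2D grid of units, each with a weight vector in feature space.
x_size, y_size = 50, 50
som = MiniSom(x_size, y_size, data.shape[1])
som.train_random(data, 5000)

# Each observation is placed at the integer grid coordinates of its
# best-matching unit - which is why the SOM panels look like points
# snapped to a uniform lattice.
positions = np.array([som.winner(d) for d in data])
print(positions.shape)  # (1000, 2)
```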
You might be balking, thinking that this is still the 2-dims-to-2-dims example. It's not. Below, you'll see the heatmaps of exactly what the input data was. It's clear from the below that we really have our 2 main sources of variation, with varying levels of noise. The scatter plots on the right show the correlation between the main source variable and an example of one of its 100 redundant features. Note however that, as we increase the spread of the data around this correlation, we really are adding another dimension (imagine an orthogonal line that would cut across, which we'd need to explain this noise). We can see this also if we try to estimate the "intrinsic dimensionality", as noted above the heatmaps.

![Heatmap and Scatters](assets/heatmap_and_scatters.png)

This plot highlights an interesting phenomenon. As noise levels (or the `sd_ratio`) increase, the estimated intrinsic dimensionality also rises. This resonates with the notion that added noise in one dimension is perceived as adding its own dimension. We can actually explicitly test what the _apparent_ "intrinsic dimensionality" is. We know that there were 2 "main variables", and what we added was noise around those 2 main variables. So what does the intrinsic dimensionality look like when estimated using the [DANCo method](https://doi.org/10.48550/arXiv.1206.3881)?

![Intrinsic Dimensions Increase with Noise](assets/intrinsic_dims_increase_with_noise.png)

The challenge lies in deciphering which dimensions are "meaningful" and which are mere noise. But that's the thing... Noise is a dimension. It's just bespoke to each individual variable. (A minimal sketch of the estimation step is shown below.)
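Here's a small sketch of that estimation step (the `skdim.id.DANCo` call is the one `experiment.py` actually uses; the `make_obs_data` helper is a hypothetical stand-in for the data generation sketched earlier, and the fits can take a little while on a 1000x200 matrix):

```python
import numpy as np
import skdim

rng = np.random.default_rng(0)

def make_obs_data(sd_ratio, n_obs=1000, true_dims=2, n_redundant=100):
    # Hypothetical helper: true dims + noisy redundant copies, as sketched above.
    main = rng.standard_normal((n_obs, true_dims))
    cols = []
    for i in range(true_dims):
        for _ in range(n_redundant):
            col = main[:, i] + rng.standard_normal(n_obs) * sd_ratio * np.std(main[:, i])
            cols.append(col / np.std(col))
    return np.column_stack(cols)

# The estimate should climb with the noise ratio, even though only
# 2 "main" variables seeded the whole matrix.
for sd_ratio in [0.01, 0.25, 1.0]:
    danco = skdim.id.DANCo().fit(make_obs_data(sd_ratio))
    print(sd_ratio, danco.dimension_)
```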
How well do they recapitulate the original observation:observation distances? This is an important question, but one to be interpreted with some degree of caution. In this case we're using random inputs, so the distances should be 1-to-1. But in situations where you have curved space, if your dimension reduction 'flattens' that space, you wouldn't expect it to be 1-to-1; however, you would still expect it to be monotonic, with low variation around the monotonic curve. So what do the results look like? Have a look below (X-axis: True distances (based on 'main' dimensions), Y-axis: Dim-reduced distances):

![Distance Correlations](assets/distance_correlations.png)

Unsurprisingly in this case, PCA is bang on, followed by NMF, which has a heteroscedastic pattern, meaning that it preserves very nearby structure a bit better than global structure. t-SNE and UMAP have similar strange patterns and small areas (that likely correspond to the 'cluster'-looking structures), which decrease the quality of the conservation of distances. SOM was interesting here, because just looking at the 2D projection, it seemed like it somewhat captured the Gaussian-like structure. But when we look at the actual distances, we see that which point ended up where within that Gaussian-like grid pattern didn't necessarily match up, as it had the least conservation of distances. (The comparison itself is sketched below.)
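For reference, a minimal sketch of how this comparison works (the same logic as `plot_distance_correlations` in `experiment.py`; here `embedding` is just a noisy stand-in for any method's 2D output):

```python
import numpy as np
from scipy.spatial import distance_matrix
from scipy.stats import pearsonr

rng = np.random.default_rng(0)
true_dim_data = rng.standard_normal((1000, 2))  # the 2 "main" dimensions
embedding = true_dim_data + rng.standard_normal((1000, 2)) * 0.1  # stand-in output

# Pairwise distances in ground-truth space vs. in the embedding.
true_d = distance_matrix(true_dim_data, true_dim_data).flatten()
emb_d = distance_matrix(embedding, embedding).flatten()

# A faithful embedding should give a tight, monotonic relationship.
r_val, p_val = pearsonr(true_d, emb_d)
print(f"r={r_val:.2f}, p={p_val:.2e}")
```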
### 2 "real" dimensions, with structure (2 clusters) in one of them

What about when we _do_ have some structure? We hypothesized that the "real" structure would indeed be observed, but that within each cluster, we may see a similar "craggly" pattern of overfitting.

First, let's look at the input data, so we understand the input:

![Heatmap and Scatters clust](assets/heatmap_and_scatters_4separation.png)

There are again only 2 'real' dimensions, but one includes a gap for half of the observations, creating 2 clusters. Again, this also comes with increasing noise added (which is equivalent to adding a unique dimension of varying magnitude within each feature). A sketch of how the clusters are made is shown below.
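This is roughly how the clustered ground truth is built (a simplified version of `get_main_mat` in `experiment.py`; `separation=4` is the value used for these figures):

```python
import numpy as np

rng = np.random.default_rng(0)
n_obs, true_dims, separation = 1000, 2, 4

# Two Gaussian halves; the second is shifted along dim 0 by a multiple of its SD.
half = n_obs // 2
cluster_1 = rng.standard_normal((half, true_dims))
cluster_2 = rng.standard_normal((n_obs - half, true_dims))
cluster_2[:, 0] += separation * np.std(cluster_1[:, 0])

# Stacking the halves gives 2 "real" dimensions with a 2-cluster gap in dim 0.
main_mat = np.vstack([cluster_1, cluster_2])
print(main_mat.shape)  # (1000, 2)
```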
Now, let's see how they all actually look:

![True Dimensions vs Dimension Reduction clust](assets/true_dims_with_noise_vs_dim_reduction_4separation.png)

We do of course see the two clusters in the dim reductions (except for SOM, which appears to struggle here). PCA and NMF seem to do a good job regardless of the noise regime. t-SNE and UMAP have the same neighbor-overfitting issues as before, until the noise becomes larger than the signal.

The reason that this seems to help is that the noise dimensions are all orthogonal to each other, so the 'neighbor signal' coming from the noisy orthogonal dimensions points in random, unrelated directions. So, by the central limit theorem, we can think of this as adding a smoothing function: if the 'neighbor signal' is receiving noise in many random directions, then ultimately the overfitting in the 'real' dimensions gets washed out by the unique & orthogonal overfitting in the 'noise' dimensions. In fact, for t-SNE & UMAP, it's not until you hit a 25% noise ratio that the 'real' clusters become quite clear (but still with apparent overfit local structures within the clusters). A small demo of that washing-out intuition follows.
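To see why many orthogonal noise dimensions act like a smoother, here's a small, self-contained demo (my own illustration, not code from `experiment.py`): as independent noise dimensions are appended, the pairwise distances concentrate around a common value, so no single direction of overfit can dominate.

```python
import numpy as np
from scipy.spatial import distance_matrix

rng = np.random.default_rng(0)
base = rng.standard_normal((200, 2))  # 2 "real" dims

for n_noise_dims in [0, 10, 100, 1000]:
    noise = rng.standard_normal((200, n_noise_dims))
    data = np.hstack([base, noise])
    d = distance_matrix(data, data)
    d = d[np.triu_indices_from(d, k=1)]  # unique pairwise distances
    # The relative spread of distances (std/mean) shrinks as noise dims are
    # added: distances concentrate, per the central-limit-theorem intuition.
    print(n_noise_dims, round(d.std() / d.mean(), 3))
```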
Does this mean that I should just add some noise to my data & then it's fine? No. It's important to remember here that what we're calling noise and signal in simulations is somewhat arbitrary. If we keep adding more and more noise, all of the apparent structure will get progressively more drowned out. But we don't have an objective function to know what is 'real' signal & what is 'noise' signal, or technical variability in our measures, etc. To tackle those problems, you'll have to know what the noise sources are, and try to correct for them explicitly.

So how well did they recapitulate the original distances in 'real' dimension space?

![Distance Correlations clust](assets/distance_correlations_4separation.png)

Overall, the results seem fairly similar to the negative control. PCA does fairly well, but becomes more heteroscedastic in the relationship between the real and observed distances, somewhat more like NMF was originally. t-SNE and UMAP are relatively monotonic, but the added local structures can be seen as the jagged edges and wisps added into the central correlation pattern, & SOM still seems to struggle.


## Conclusion

### With strict negative controls:

So why, then, when we simulate _only_ orthogonal dimensions, does it come out as blobs?
https://twitter.com/ChenxinLi2/status/1683818705296461830
https://twitter.com/willmacnair/status/1684905102576889856

Well - really - we need to change how we think about dimensionality. Noise _is_ a dimension. It's not an interesting one, though... So when you have 100, or 10,000, completely random inputs, the true dimensionality is 100 or 10,000 dimensions. So if you have an algorithm that will overfit every one of them, but in a unique and random way, then, thinking about the concept of the central limit theorem, those errors, in different random directions, will end up collapsing it back down into a hairball. Now that doesn't mean the overfitting went away - it's just washed out.

We see this in our own example above as well. As you increase the amount of noise added on top of the "real dimensions", what you're actually doing is adding new, orthogonal (but uninteresting) dimensions. That's why we end up seeing the structure get progressively blurrier. We're actually increasing the intrinsic dimensionality, which can't be captured in a 2D display.

In conclusion: yes - these dimension reduction algorithms overfit their intrinsic dimensionality. But we have now also seen that noise is essentially its own dimension, & adding N-observation orthogonal noise dimensions blurs out the overfitting. My interpretation of this is that it's conceptually just the central limit theorem - adding in many, many sources of noise in random directions causes the overfitting to seem to go away, but it's still there, just getting washed out by all of the other sources of overfitting layered on top.

### With a true source of structure present

Overall, even when one of the dimensions encodes a separation of points between them, we see the same thing as above. Bearing in mind that this simulation was with Gaussian distributed data, PCA did the best job, and not far behind was NMF. But t-SNE and UMAP had the same issues as in the negative control, synthesizing extra local structure until sufficiently large noise was added, decreasing the overall neighbor overfitting. SOM struggles a bit - perhaps this is not the best use-case...

Dmitry Kobak has mentioned that it always seems like it's in simulations rather than real-world datasets that this overfitting pattern emerges. I think this simulation actually directly gets to the bottom of that. In real-world data - there are many noise dimensions! In the single-cell -omics domain, every feature comes with both systematic technical sources of variation (which may be correlated with one another via some mediator variable), and there's also the noise of Poisson sampling.

Ultimately, the challenge will be unraveling what's a noise dimension & what's a 'real' dimension. Answering this question will of course be _exceptionally_ domain specific, so I can't proffer any advice here... That being said, I have some ideas in the single cell space ;-)

## Final remarks

I'm also happy to be wrong on this - but it's just what the data seems to indicate. The data is the data, as they say...

That's also not to say t-SNE/UMAP are completely unrelated to the underlying data! Of course they are related. I've used them too. But we _do_ need to be very aware of algorithm assumptions and limitations, and perform negative controls with any method that we use. Especially if it's used for analysis rather than just visualization.
## References

- [Visualizing Data Using t-SNE. van der Maaten & Hinton, 2008](https://jmlr.org/papers/v9/vandermaaten08a.html)
- [UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction. McInnes, Healy, & Melville, 2018](https://doi.org/10.48550/arXiv.1802.03426)
- [DANCo: Dimensionality from Angle and Norm Concentration. Ceruti et al., 2012](https://doi.org/10.48550/arXiv.1206.3881)

TODO: need to cite the others

--------------------------------------------------------------------------------
/assets/distance_correlations.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scottyler89/umap_tsne_experiment/0e6ce4ff31aa34a7d6dc0ef56c3275daf864a684/assets/distance_correlations.png
--------------------------------------------------------------------------------
/assets/distance_correlations_4separation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scottyler89/umap_tsne_experiment/0e6ce4ff31aa34a7d6dc0ef56c3275daf864a684/assets/distance_correlations_4separation.png
--------------------------------------------------------------------------------
/assets/heatmap_and_scatters.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scottyler89/umap_tsne_experiment/0e6ce4ff31aa34a7d6dc0ef56c3275daf864a684/assets/heatmap_and_scatters.png
--------------------------------------------------------------------------------
/assets/heatmap_and_scatters_4separation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scottyler89/umap_tsne_experiment/0e6ce4ff31aa34a7d6dc0ef56c3275daf864a684/assets/heatmap_and_scatters_4separation.png
--------------------------------------------------------------------------------
/assets/intrinsic_dims_increase_with_noise.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scottyler89/umap_tsne_experiment/0e6ce4ff31aa34a7d6dc0ef56c3275daf864a684/assets/intrinsic_dims_increase_with_noise.png
--------------------------------------------------------------------------------
/assets/intrinsic_dims_increase_with_noise_4separation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scottyler89/umap_tsne_experiment/0e6ce4ff31aa34a7d6dc0ef56c3275daf864a684/assets/intrinsic_dims_increase_with_noise_4separation.png
--------------------------------------------------------------------------------
/assets/true_dims_with_noise_vs_dim_reduction.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scottyler89/umap_tsne_experiment/0e6ce4ff31aa34a7d6dc0ef56c3275daf864a684/assets/true_dims_with_noise_vs_dim_reduction.png
--------------------------------------------------------------------------------
/assets/true_dims_with_noise_vs_dim_reduction_4separation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scottyler89/umap_tsne_experiment/0e6ce4ff31aa34a7d6dc0ef56c3275daf864a684/assets/true_dims_with_noise_vs_dim_reduction_4separation.png
--------------------------------------------------------------------------------
/experiment.py:
--------------------------------------------------------------------------------
from scipy.stats import pearsonr
from scipy.spatial import distance_matrix
from sklearn.decomposition import NMF, PCA
from sklearn.manifold import TSNE
from minisom import MiniSom
import skdim
import seaborn as sns
import matplotlib.pyplot as plt
import umap
import numpy as np


def get_main_mat(true_gen_func, n_obs, true_dims, separation=0):
    """
    Generate the main matrix with an optional separation between two clusters.

    Args:
    - true_gen_func (function): Function to generate the main matrix
    - n_obs (int): Number of observations
    - true_dims (int): Number of true dimensions
    - separation (float): Multiple of standard deviation for separating clusters

    Returns:
    - main_mat (numpy array): Generated main matrix
    """
    if separation == 0:
        return true_gen_func(n_obs, true_dims)

    # Generate half of the observations
    half_obs = n_obs // 2
    cluster_1 = true_gen_func(half_obs, true_dims)

    # Generate the other half with an offset in the first dimension
    cluster_2 = true_gen_func(int(n_obs - half_obs), true_dims)
    offset = separation * np.std(cluster_1[:, 0])
    cluster_2[:, 0] += offset

    # Concatenate the two clusters vertically
    main_mat = np.vstack([cluster_1, cluster_2])
    return main_mat
def generate_data(n_obs, true_dims, n_redundant_per_true, true_gen_func, redundant_gen_noise_func, sd_ratio, separation=0):
    """
    Generates data matrix with true dimensions and redundant dimensions.

    Args:
    - n_obs (int): Number of observations
    - true_dims (int): Number of true dimensions
    - n_redundant_per_true (int): Number of redundant dimensions per true dimension
    - true_gen_func (function): Function to generate the main matrix
    - redundant_gen_noise_func (function): Function to generate noise for redundant dimensions
    - sd_ratio (float): Ratio for scaling noise
    - separation (float): Multiple of standard deviation for separating clusters

    Returns:
    - main_mat (numpy matrix): Matrix of true dimensions
    - redundant_mat (numpy matrix): Matrix of redundant dimensions
    """
    # Generate the main matrix
    if separation == 0.:
        main_mat = true_gen_func(n_obs, true_dims)
    else:
        main_mat = get_main_mat(true_gen_func, n_obs, true_dims, separation=separation)
    # Placeholder for the redundant dimensions
    redundant_dims = []
    for i in range(true_dims):
        # Standard deviation for this dimension in main_mat
        dim_std = np.std(main_mat[:, i])
        # Create n_redundant_per_true redundant dimensions seeded at main_mat[:, i] values
        for _ in range(n_redundant_per_true):
            noise = redundant_gen_noise_func(n_obs, 1)
            redundant_dim = main_mat[:, i][:, np.newaxis] + noise * sd_ratio * dim_std
            # Standardize the redundant dimension
            redundant_dim = redundant_dim / np.std(redundant_dim)
            redundant_dims.append(redundant_dim)
    # Stack all redundant dimensions horizontally
    redundant_mat = np.hstack(redundant_dims)
    return main_mat, redundant_mat


def dim_reduction(in_mat, dim_red_func_list, dim_red_names, final_dims):
    """
    Reduces the dimensions of the input matrix using the specified functions.

    Args:
    - in_mat (numpy matrix): Input data matrix
    - dim_red_func_list (list): List of dimension reduction functions
    - dim_red_names (list): Names corresponding to each dimension reduction function
    - final_dims (int): Number of dimensions after reduction

    Returns:
    - results (dict): Dictionary of dim_red_names and their results
    """
    # Placeholder for the results from each dimension reduction function
    results = {}
    for func, name in zip(dim_red_func_list, dim_red_names):
        results[name] = func(in_mat, final_dims)
    return results


####################################################
def tsne_wrapper(data, n_components):
    tsne = TSNE(n_components=n_components)
    return tsne.fit_transform(data)


def umap_wrapper(data, n_components):
    reducer = umap.UMAP(n_components=n_components)
    return reducer.fit_transform(data)


def pca_wrapper(data, n_components):
    pca = PCA(n_components=n_components)
    return pca.fit_transform(data)


def nmf_wrapper(data, n_components, epsilon=1e-8):
    """
    Use NMF for dimensionality reduction.

    Parameters:
    - data: input data
    - n_components: number of components for the reduced dimension
    - epsilon: small constant to keep the shifted data strictly positive

    Returns:
    - transformed_data: numpy array of shape (n_samples, n_components)
    """
    # Shift the data up to be non-negative (required by NMF). Use a copy rather
    # than in-place ops so the caller's matrix is left untouched for the
    # methods that run after NMF.
    data = data - np.min(data) + epsilon
    nmf = NMF(n_components=n_components, init='random', random_state=0)
    transformed_data = nmf.fit_transform(data)
    return transformed_data


def som_wrapper(data, n_components=2):
    """
    Use MiniSom for SOM.
    Note: For SOM, n_components is expected to be 2 since we are using a 2D grid.

    Parameters:
    - data: input data
    - n_components: dimensions of the output (expected to be 2 for a 2D grid)

    Returns:
    - positions: numpy array of shape (n_samples, n_components) representing positions on the grid
    """
    assert n_components == 2, "For SOM, n_components should be 2."

    x_size, y_size = 50, 50  # You can adjust these values based on your needs
    som = MiniSom(x_size, y_size, data.shape[1])
    som.train_random(data, 5000)

    # Each observation is mapped to the grid coordinates of its best-matching unit
    positions = np.array([som.winner(d) for d in data])
    return positions


#####################################################
def true_gen_func(n_obs, true_dims):
    """
    Example function to generate the main matrix.

    Args:
    - n_obs (int): Number of observations
    - true_dims (int): Number of true dimensions

    Returns:
    - Main matrix (numpy array)
    """
    return np.random.randn(n_obs, true_dims)


def redundant_gen_noise_func(n_obs, true_dims):
    """
    Example function to generate noise for redundant dimensions.

    Args:
    - n_obs (int): Number of observations
    - true_dims (int): Number of true dimensions

    Returns:
    - Noise matrix (numpy array)
    """
    return np.random.randn(n_obs, true_dims)


#########################################

def plot_dim_reductions(true_dim_dict, results_dict, sep_str):
    """
    Plots scatter plots of true dimensions and results of dimension reduction methods.

    Args:
    - true_dim_dict (dict): Dictionary of true dimensions for each sd_ratio
    - results_dict (dict): Dictionary of results for each sd_ratio and each dimension reduction method
    - sep_str (str): Suffix (e.g. "_4separation") appended to the output file name
    """
    # Number of rows is the number of sd_ratios
    n_rows = len(results_dict)

    # Number of columns is 1 (for true dimensions) + number of dimension reduction methods
    n_cols = 1 + len(next(iter(results_dict.values())))

    # Create a figure with subplots
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows))

    # Adjust the spacing between subplots as needed for desired spacing
    fig.subplots_adjust(wspace=0.3, hspace=0.3)

    # Loop through each sd_ratio and plot
    for i, sd_ratio in enumerate(results_dict):
        # Row title for sd_ratio
        if n_cols > 1:
            axes[i, 0].set_ylabel(sd_ratio, fontsize=28,
                                  rotation=90, labelpad=50, va="center")
        else:
            axes.set_ylabel(sd_ratio, fontsize=28, rotation=90,
                            labelpad=50, va="center")

        # Plot true dimensions
        axes[i, 0].scatter(true_dim_dict[sd_ratio][:, 0],
                           true_dim_dict[sd_ratio][:, 1], alpha=0.6, s=5)

        # Loop through each dimension reduction method and plot
        for j, method in enumerate(results_dict[sd_ratio]):
            axes[i, j + 1].scatter(results_dict[sd_ratio][method][:, 0],
                                   results_dict[sd_ratio][method][:, 1], alpha=0.6, s=5)

    # Add column titles
    col_titles = ['Intrinsic Dimensions that\ncreated input vals+noise'] + \
        list(next(iter(results_dict.values())).keys())
    for ax, col in zip(axes[0], col_titles):
        ax.annotate(col, (0.5, 1.15), xycoords='axes fraction', ha='center',
                    va='center', fontsize=28, textcoords='offset points')
    for ax_row in axes:
        for ax in ax_row:
            for spine in ax.spines.values():
                spine.set_linewidth(2)
    plt.savefig("assets/true_dims_with_noise_vs_dim_reduction" + sep_str + ".png", dpi=300)
def plot_obs_data_heatmap(gt_data_dict, obs_data_dict, danco_dict, sep_str):
    """
    Plots heatmaps of obs_data for each sd_ratio.

    Args:
    - gt_data_dict (dict): Dictionary of ground truth data matrices for each sd_ratio
    - obs_data_dict (dict): Dictionary of obs_data matrices for each sd_ratio
    - danco_dict (dict): Dictionary of DANCo intrinsic-dimension estimates for each sd_ratio
    - sep_str (str): Suffix appended to the output file name
    """
    # Number of rows is the number of sd_ratios
    n_rows = len(obs_data_dict)

    # Create a figure with subplots
    fig, axes = plt.subplots(n_rows, 2, figsize=(10, 5 * n_rows))
    fig.subplots_adjust(wspace=0.35, hspace=0.38)

    if n_rows == 1:
        axes = [axes]

    # Loop through each sd_ratio and plot
    for i, sd_ratio in enumerate(obs_data_dict.keys()):
        # Extract ground truth and observation data for scatter plot
        temp_gt_data = gt_data_dict[sd_ratio]
        temp_obs_data = obs_data_dict[sd_ratio]
        estimated_dims = danco_dict[sd_ratio]

        # Plot the heatmap
        sns.heatmap(temp_obs_data, ax=axes[i, 0], cmap="YlGnBu", cbar=False)
        axes[i, 0].set_title(f"DANCo dim\nestimate {estimated_dims:.2f}", fontsize=23)
        axes[i, 0].set_xticks([])
        axes[i, 0].set_yticks([])
        axes[i, 0].set_ylabel(sd_ratio, fontsize=26, rotation=90,
                              labelpad=50, va="center")

        # Scatter plot (spine widths for all panels are set once, below)
        axes[i, 1].scatter(temp_obs_data[:, 0], temp_gt_data[:, 0], alpha=0.5)
        axes[i, 1].set_xlabel("Dim-1 Redundant + noise", fontsize=23)
        axes[i, 1].set_ylabel("Ground-truth Dim-1", fontsize=23)

        # Add correlation line
        m, b = np.polyfit(temp_obs_data[:, 0], temp_gt_data[:, 0], 1)
        axes[i, 1].plot(temp_obs_data[:, 0], m *
                        temp_obs_data[:, 0] + b, color='red', linewidth=2)
    for ax_row in axes:
        for ax in ax_row:
            for spine in ax.spines.values():
                spine.set_linewidth(2)
    plt.savefig("assets/heatmap_and_scatters" + sep_str + ".png", dpi=300)


#########


def plot_distance_correlations(true_dim_dict, results_dict, sep_str):
    """
    Plots scatter plots of true pairwise distances vs pairwise distances from dimension reduction methods.

    Args:
    - true_dim_dict (dict): Dictionary of true dimensions for each sd_ratio
    - results_dict (dict): Dictionary of results for each sd_ratio and each dimension reduction method
    - sep_str (str): Suffix appended to the output file name
    """
    # Number of rows is the number of sd_ratios
    n_rows = len(results_dict)

    # Number of columns is 1 (for true dimensions) + number of dimension reduction methods
    n_cols = 1 + len(next(iter(results_dict.values())))

    # Create a figure with subplots
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 5 * n_rows))

    # Adjust the spacing between subplots
    fig.subplots_adjust(wspace=0.3, hspace=0.3)

    # Loop through each sd_ratio and plot
    for i, sd_ratio in enumerate(results_dict):
        # Compute true pairwise distances and flatten
        true_distances = distance_matrix(
            true_dim_dict[sd_ratio], true_dim_dict[sd_ratio])
        true_distances_flat = true_distances.flatten()

        # Plot true distances against themselves (trivially r=1; a visual reference)
        axes[i, 0].scatter(true_distances_flat,
                           true_distances_flat, alpha=0.6, s=5)
        r_val, p_val = pearsonr(true_distances_flat, true_distances_flat)
        axes[i, 0].set_title(f"r={r_val:.2f}, p={p_val:.2e}", fontsize=22)

        # Loop through each dimension reduction method and plot
        for j, method in enumerate(results_dict[sd_ratio]):
            # Compute pairwise distances for reduced data and flatten
            reduced_distances = distance_matrix(
                results_dict[sd_ratio][method], results_dict[sd_ratio][method])
            reduced_distances_flat = reduced_distances.flatten()

            # Plot true distances against reduced distances
            axes[i, j + 1].scatter(true_distances_flat,
                                   reduced_distances_flat, alpha=0.025, s=5)

            # Compute correlation
            r_val, p_val = pearsonr(
                true_distances_flat, reduced_distances_flat)
            axes[i, j + 1].set_title(f"r={r_val:.2f}, p={p_val:.2e}", fontsize=22)

    # Add column titles
    col_titles = ['True Distances'] + \
        list(next(iter(results_dict.values())).keys())
    for ax, col in zip(axes[0], col_titles):
        ax.annotate(col, (0.5, 1.15), xycoords='axes fraction', ha='center',
                    va='center', fontsize=26, textcoords='offset points')

    for ax_row in axes:
        for ax in ax_row:
            for spine in ax.spines.values():
                spine.set_linewidth(2)

    # Save the figure
    plt.savefig("assets/distance_correlations" + sep_str + ".png", dpi=300)


#########


def plot_intrinsic_dimensionality(sd_lookup, intrinsic_dim_estimate_dict, sep_str):
    """
    Plots the estimated intrinsic dimensionality against noise levels.

    Args:
    - sd_lookup (dict): Dictionary mapping sd_ratio names to their respective values
    - intrinsic_dim_estimate_dict (dict): Dictionary of estimated intrinsic dimensionality for each sd_ratio
    - sep_str (str): Suffix appended to the output file name
    """
    # Extract data
    sd_values = [sd_lookup[key] for key in intrinsic_dim_estimate_dict.keys()]
    dim_estimates = list(intrinsic_dim_estimate_dict.values())

    # Create a scatter plot with a lowess fit curve
    plt.figure(figsize=(10, 6))
    sns.regplot(x=sd_values, y=dim_estimates, lowess=True, scatter_kws={
                's': 100, 'alpha': 0.6}, line_kws={'color': 'red', 'lw': 2})
    plt.xlabel("Noise Level (SD Ratio)")
    plt.ylabel("Estimated Intrinsic Dimensionality")
    plt.title("Intrinsic Dimensionality vs. Noise Level")
    plt.grid(True, which='both', linestyle='--', linewidth=0.5)
    plt.tight_layout()
    plt.savefig("assets/intrinsic_dims_increase_with_noise" + sep_str + ".png", dpi=300)


#########
#########################################
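# ----------------------------------------------------------------------------
# Driver below: for each cluster separation (0 = negative control, 4 = two
# clusters) and each noise ratio, it generates the data, estimates the
# intrinsic dimensionality with DANCo, runs all five dimension reduction
# methods, and regenerates the figures in assets/. The DANCo fits and the
# t-SNE/UMAP runs are the slow steps. To reproduce:
# `pip install -r requirements.txt`, then `python experiment.py`.
# ----------------------------------------------------------------------------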
Noise Level") 397 | plt.grid(True, which='both', linestyle='--', linewidth=0.5) 398 | plt.tight_layout() 399 | plt.savefig("assets/intrinsic_dims_increase_with_noise"+sep_str+".png", dpi=300) 400 | 401 | 402 | ######### 403 | ######################################### 404 | 405 | # Parameters for the experiment 406 | np.random.seed(123456) 407 | n_obs = 1000 408 | true_dims = 2 409 | n_redundant_per_true = 100 410 | sd_ratios = [0.01, 0.05, 0.25, 0.5, 1.] 411 | separation_vect = [0, 4] 412 | sep_dict = {} 413 | intrinsic_dim_estimate_dict = {} 414 | true_dim_dict = {} 415 | obs_data_dict = {} 416 | results_dict = {} 417 | sd_lookup = {} 418 | for sep in separation_vect: 419 | sep_name = "Clust Sep:"+str(sep) 420 | 421 | for sd_ratio in sd_ratios: 422 | sd_name = "SD ratio:"+str(sd_ratio) 423 | sd_lookup[sd_name] = sd_ratio 424 | final_dims = true_dims # This is just an example; adjust as needed 425 | # Generate data 426 | true_dim_data, obs_data = generate_data(n_obs, true_dims, n_redundant_per_true, true_gen_func, redundant_gen_noise_func, sd_ratio, separation=sep) 427 | 428 | # Estimates of intrinsic dimensionality. 429 | # Interesting note here, but it actually identifies 430 | # that as noise dimensions are added, and the size of the noise relative to 431 | # dims are 'real dims.' This fits with the model of it finding 432 | # that added noise in one dimension is actually adding its own dimension, even if the 'real' 433 | # variation was already accounted for by prior dims. It's not like this is incorrect or anything... 434 | # It's just that noise is a dimension. The hard part is figuring out which dims are "meaningful"! 435 | ## https: // doi.org/10.48550/arXiv.1206.3881 436 | danco = skdim.id.DANCo().fit(obs_data) 437 | print(danco.dimension_) 438 | intrinsic_dim_estimate_dict[sd_name] = danco.dimension_ 439 | 440 | # log the data 441 | true_dim_dict[sd_name] = true_dim_data 442 | obs_data_dict[sd_name] = obs_data 443 | 444 | # Perform dimension reduction 445 | dim_red_funcs = [pca_wrapper, nmf_wrapper, tsne_wrapper, umap_wrapper, som_wrapper] 446 | dim_red_names = ["PCA", "NMF", "tSNE", "UMAP", "SOM"] 447 | results_dict[sd_name] = dim_reduction(obs_data, dim_red_funcs, dim_red_names, final_dims) 448 | if sep==0: 449 | sep_str = "" 450 | else: 451 | sep_str = "_"+str(sep)+"separation" 452 | # Call the plotting functions 453 | plot_dim_reductions(true_dim_dict, results_dict, sep_str) 454 | plot_obs_data_heatmap(true_dim_dict, obs_data_dict, 455 | intrinsic_dim_estimate_dict, sep_str) 456 | plot_intrinsic_dimensionality(sd_lookup, intrinsic_dim_estimate_dict, sep_str) 457 | plot_distance_correlations(true_dim_dict, results_dict, sep_str) 458 | sep_dict[sep_name] = results_dict 459 | 460 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | seaborn 2 | umap-learn 3 | numpy 4 | matplotlib 5 | scikit-learn 6 | scikit-dimension 7 | minisom --------------------------------------------------------------------------------