├── .gitignore
├── LICENSE
├── README.md
├── assets
    ├── distance_correlations.png
    ├── distance_correlations_4separation.png
    ├── heatmap_and_scatters.png
    ├── heatmap_and_scatters_4separation.png
    ├── intrinsic_dims_increase_with_noise.png
    ├── intrinsic_dims_increase_with_noise_4separation.png
    ├── true_dims_with_noise_vs_dim_reduction.png
    └── true_dims_with_noise_vs_dim_reduction_4separation.png
├── experiment.py
└── requirements.txt


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 | 
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 
155 | # PyCharm
156 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
159 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 Scott Tyler
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Do tSNE and UMAP overfit their intrinsic dimensionality
  2 | ## (The answer, unfortuantely: Yes)
  3 | 
  4 | This repository presents an analysis of the intrinsic dimensionality of data that has true dimensions and additional noise dimensions. The primary goal is to explore how various noise levels influence the perceived dimensionality and how different dimensionality reduction methods represent the data.
  5 | 
  6 | ## Intention
  7 | 
  8 | The main intention behind this analysis is to:
  9 | 1. Generate synthetic data with known "true dimensions" and introduce redundant noisy dimensions.
 10 | 2. Observe how different dimension reduction techniques, namely t-SNE and UMAP, represent the data with increasing noise.
 11 | 3. Estimate the intrinsic dimensionality as noise levels increase and analyze its implications.
 12 | 
 13 | ## Methods Walkthrough
 14 | 
 15 | 1. **Data Generation**: 
 16 |    - We started by creating synthetic data with known dimensions using a random number generator.
 17 |    - For each true dimension, redundant dimensions were added. These dimensions were seeded from the true data but had noise introduced, which was controlled by a specified standard deviation ratio (`sd_ratio`).
 18 | 
 19 | 2. **Dimension Reduction**:
 20 |    - For each noise level, dimensionality was reduced using some popular linear & non-linear methods: PCA, NMF, t-SNE, UMAP, and SOM.
 21 |    - The reduced dimensions were then visualized against the true dimensions to observe the effects of noise.
 22 | 
 23 | 3. **Intrinsic Dimensionality Estimation**:
 24 |    - At each noise level, the intrinsic dimensionality of the data was estimated using the DANCo method.
 25 |    - This helped us understand how perceived dimensionality changes as noise increases.
 26 | 
 27 | ## Results: 
 28 | 
 29 | ### 2 "real" dimensions, no structure in observations
 30 | 
 31 | Okay - so folks have griped that the "false creation of struction from nothingness" might only happen when you "reduce" from 2 dimentions to 2 dimentions. Of course, that's not the point of dimension reduction - the point is to reduce dimensions.
 32 | https://twitter.com/slavov_n/status/1683785160825643008
 33 | 
 34 | So - what if the "intrinsic" dimentionality is 2, but there are lots of redundant dimentions? In this situation, we have 2 "real dimentions" that could explain most of the variation in the data. We'll first simulate those 2 "real dimentions" (left hand column in the plots below), then we'll create 100 redundant dimentions per real dimention for our 1000 observations (+variable amounts of noise for the redundant dimentions: rows). So here, the input fed into the algs are actually 1000 rows (observations), with 200 features (columns). But they are generated from 2 main features + variable amounts of noise. Do we still see structure from nothing when we are actually performing dimensionality reduction from 200 features to 2, knowing that the underlying 2 main features are unrelated? Yes. 
 35 | 
 36 | ![True Dimensions vs Dimension Reduction](assets/true_dims_with_noise_vs_dim_reduction.png)
 37 | 
 38 | From the above figure, we can see that PCA looks similar to the true dimensions. NMF (w/ data shifted up to be non-negative) honestly surprised me here... I don't have a good intuition for why adding noise, creates that elongated shape... ¯\\_ (ツ)_/¯. Ideally it wouldn't - it might give the impression that there is some sort of "trajectory" but we see here that just noise can create that illusion.
 39 | 
 40 | When we look at the t-SNE and UMAP (default params), we find that they create the appearance of structure from random Gaussian distributions, even when doing a 100 fold dimension reduction, knowing what the true dimensions are. We also see that as noise increases, this structure gets blurier (usurprising, we'll circle back to that). If you're not familiar with SOM, it might look strange, but it just places observations inside of a 2D grid, so that's why it looks uniform; overall it visually looks like it made something Gaussian-ish, but the grid pattern makes it a bit harder to interpret.
 41 | 
 42 | You might be balking, thinking that this is still the 2 dims to 2 dims examples. It's not. Below, you'll see the heatmaps of exactly what the input data was. It's clear from the below that we really have our 2 main sources of variation, with varying levels of noise. The scatter plots on the right show the correlation between the main source variable, and an example of one of its 100 redundant features. Note however that, as we increase the spread of the data around this correlation, we really are adding another dimension (imgine an orthogonal line that would cut across, that we'd need to explain this noise). We can see this also if we try to estimate the "intrinsic dimensionality" as noted above the heatmaps.
 43 | 
 44 | ![Heatmap and Scatters](assets/heatmap_and_scatters.png)
 45 | 
 46 | This plot highlights an interesting phenomenon. As noise levels (or the `sd_ratio`) increase, the estimated intrinsic dimensionality also rises. This resonates with the notion that added noise in one dimension is perceived as adding its own dimension. We can actually explicitly test what the _apparent_ "intrinsic dimensionality" is. We know that there were 2 "main variables", and what we added was noise around those 2 main variables. So what does the intrnisic dimensionality look like when estimated using the [DANCo method](https://doi.org/10.48550/arXiv.1206.3881)?
 47 | 
 48 | ![Intrinsic Dimensions Increase with Noise](assets/intrinsic_dims_increase_with_noise.png)
 49 | 
 50 | The challenge lies in deciphering which dimensions are "meaningful" and which are mere noise. But that's the thing... Noise is a dimension. It's just bespoke to each individual variable.
 51 | 
 52 | How well do they recapitulate the original observation:observation distances? This is an important question, but one to be interpreted with some degree of caution. In this case we're using random inputs, so the distances should be 1-to-1. But in situations where you have curved space, if your dimension reduction 'flattens' that space, you wouldn't expect it to be 1-to-1; however, you would still expect it to be monotonic, with low variation around the monotonic curve. So what do the results look like? Have a look below (X-axis: True distances (based on 'main' dimensions), Y-axis: Dim-reduced distances):
 53 | 
 54 | ![Distance Correlations](assets/distance_correlations.png)
 55 | 
 56 | Unsurprisingly in this case, PCA is bang on, followed by NMF, which has a heteroscedastic pattern, meaning that it preserves very nearby structure a bit better than global. tSNE and UMAP have similar strange patterns and small areas (that likely correspond to the 'cluster' looking structures), which decrease the quality of the conservation of distances. SOM was interesting here, because just looking at the 2D projection, it seemed like it somewhat captured the Gaussian like structure. But when we look at the actual distances, we see that which point ended up where within that Gaussian like grid pattern didn't necessarily match up, as it had the least conservation of distances.
 57 | 
 58 | ### 2 "real" dimensions structure (2-clusters) in one of them
 59 | 
 60 | What about when we _do_ have a some structure? We hypothesized that the "real" structure would indeed be observed, but that within each cluster, we may see a similar "craggly" pattern of overfitting.
 61 | 
 62 | First, let's look at the input data, so we understand the input:
 63 | 
 64 | ![Heatmap and Scatters clust](assets/heatmap_and_scatters_4separation.png)
 65 | 
 66 | There are again only 2 'real' dimensions, but one includes a gap for half of the observations, creating 2 clusers. Again, also with increasing noise added (which is equivalent to adding a unique dimension of varing magnitude within each feature):
 67 | 
 68 | Now, let's see how they all actually look:
 69 | 
 70 | ![True Dimensions vs Dimension Reduction clust](assets/true_dims_with_noise_vs_dim_reduction_4separation.png)
 71 | 
 72 | We do of course see the two clusters in the dim reductions (except for SOM, which appears to struggle here). PCA and NMF seem to do a good job regardless of the noise regime. tSNE and UMAP have the same neighbor overfitting issues as before until the noise becomes larger than the signal. 
 73 | 
 74 | The reason that this seems to help is that the noise dimensions are all orthogonal to each other, so that the 'neighbor signal' coming from the noisy orthogonal dimensions are all in random unrelated directions. So by the central limit theorum, we can think of this as adding a smoothing function because, if the 'neighbor signal' is receiving noise in many random directions, ultimately, the overfitting in the 'real' dimensions, gets washed out by the unique & orthogonal overfitting in the 'noise' dimensions. In fact for tSNE & UMAP, it's not until you hit a 25% noise ratio that the 'real' clusters become quite clear (but still with apparent overfit local structures within the clusters).
 75 | 
 76 | Does this mean that I should just add some noise to my data & then it's fine? No. It's important to remember here that what we're calling noise and signal in simulations is somewhat arbitrary. If we keep adding more and more noise, all of the apparent structure will get progressively more drowned out. But we don't have an objective function to know that is 'real' signal & what is 'noise' signal, or technical variability in our measures, etc. To tackle those problems, you'll have to know what the noise sources are, and try to correct for them explicitly.
 77 | 
 78 | So how well did they recapitualte the original distances in 'real' dimension space?
 79 | 
 80 | ![Distance Correlations clust](assets/distance_correlations_4separation.png)
 81 | 
 82 | Overall, the results seem fairly similar to the negative control. PCA does fairly well, but becomes more heteroscedastic in the relationship of the real and observed distances, somewhat more like NMF was originally. tSNE and UMAP are relatively monotonic, but the added local structures can be seen as the jagged edges and whisps added into the central correlation pattern & SOM still seems to struggle.
 83 | 
 84 | 
 85 | ## Conclusion
 86 | 
 87 | ### With strict negative controls:
 88 | 
 89 | So why then when simulate _only_ orthogonal dimensions, it comes out as blobs?
 90 | https://twitter.com/ChenxinLi2/status/1683818705296461830
 91 | https://twitter.com/willmacnair/status/1684905102576889856
 92 | 
 93 | Well - really - we need to change what we think about dimensionality. Noise _is_ a dimension. It's not an interesting one though... So when you have 100, or 10,000 completely random inputs. The true dimensionality is 100 or 10,000 dimentions. So if you have an algorithm that will overfit every one of them, but in a unique and random way, if we think about the concept of the central limit theorum, those errors, in different random directions, will end up collapsing it back down into a hairball. Now that doesn't mean 
 94 | 
 95 | We see this in our own example above as well. As you increase the amount of noise added on top of the "real dimentions" what you're actually doing is adding new, orthogonal (but uninteresting) dimentions. That's why we end up seeing the structure get progressively blurrier. We're actually increasing the intrinsic dimensionality, that can't be captured in a 2D display.
 96 | 
 97 | In conclusion: yes - these dimension reduction algorithms overfit their intrinsic dimensionality. But we have now also seen that noise is essentially its own dimension & adding N-observation orthogonal noise dimentions blurs out the overfitting. My interpreation of this is that it's conceptually just the central limit theorum - that adding in many many sources of noise in random directions, causes the overfitting to seem to go away, but it's still there, just getting washed out by all of the other sources of overfitting layered on top.
 98 | 
 99 | ### With a true source of structure present
100 | 
101 | Overall even when one of the dimensions encodes a separation of points between them, we see the same thing as above. Bearing in mind that this simulation was with Gaussian distributed data, PCA did the best job, and not far behind was NMF. But tSNE and UMAP had the same issues as in the negative control, synthesizing extra local structure until sufficiently large noise was added, deceasing the overall neighbor overfitting. SOM struggles a bit - perhaps this is not the best use-case...
102 | 
103 | Dmitry Kobak has mentioned that it always seems like it's in simulation rather than real world datasets that this overfitting pattern emerges. I think this simulation actually directly gets to the bottom of that. In real world data - there are many noise dimensions! In the single-cell -omics domain, every feature comes with both systematic technical sources of variation (which may be correlated between them based on some mediator variable), and there's also the noise of Poisson sampling.
104 | 
105 | Ultimately, the challenge will be unraveling what's a noise dimension & what's a 'real' dimension. Answering this question will of course be _exceptionally_ domain specific, so I can't proffer any advice here... That being said, I have some ideas in the single cell space ;-)
106 | 
107 | ## Final remarks
108 | 
109 | I'm also happy to be wrong on this - but it's just what the data seems to indicate. The data is the data, as they say...
110 | 
111 | That's also not to say tSNE/UMAP are completely unrelated to the underlying data! Of course they are related. I've used them too. But we _do_ need to be very aware of algorithm assumptions, limitations, and perform negative controls with any method that we use. Especially if it's used for analysis rather than just visualization.
112 | 
113 | ## References
114 | 
115 | - [Visualizing Data Using t-SNE](https://jmlr.org/papers/v9/vandermaaten08a.html)
116 | - [UMAP: Uniform Manifold Approximation and Projection for Dimension Reduction](https://doi.org/10.48550/arXiv.1802.03426)
117 | - [DANCo: Dimensionality from Angle and Norm Concentration. Camastra & Vinciarelli, 2012](https://doi.org/10.48550/arXiv.1206.3881)
118 | TODO: need to cite the others
119 | 


--------------------------------------------------------------------------------
/assets/distance_correlations.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scottyler89/umap_tsne_experiment/0e6ce4ff31aa34a7d6dc0ef56c3275daf864a684/assets/distance_correlations.png


--------------------------------------------------------------------------------
/assets/distance_correlations_4separation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scottyler89/umap_tsne_experiment/0e6ce4ff31aa34a7d6dc0ef56c3275daf864a684/assets/distance_correlations_4separation.png


--------------------------------------------------------------------------------
/assets/heatmap_and_scatters.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scottyler89/umap_tsne_experiment/0e6ce4ff31aa34a7d6dc0ef56c3275daf864a684/assets/heatmap_and_scatters.png


--------------------------------------------------------------------------------
/assets/heatmap_and_scatters_4separation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scottyler89/umap_tsne_experiment/0e6ce4ff31aa34a7d6dc0ef56c3275daf864a684/assets/heatmap_and_scatters_4separation.png


--------------------------------------------------------------------------------
/assets/intrinsic_dims_increase_with_noise.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scottyler89/umap_tsne_experiment/0e6ce4ff31aa34a7d6dc0ef56c3275daf864a684/assets/intrinsic_dims_increase_with_noise.png


--------------------------------------------------------------------------------
/assets/intrinsic_dims_increase_with_noise_4separation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scottyler89/umap_tsne_experiment/0e6ce4ff31aa34a7d6dc0ef56c3275daf864a684/assets/intrinsic_dims_increase_with_noise_4separation.png


--------------------------------------------------------------------------------
/assets/true_dims_with_noise_vs_dim_reduction.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scottyler89/umap_tsne_experiment/0e6ce4ff31aa34a7d6dc0ef56c3275daf864a684/assets/true_dims_with_noise_vs_dim_reduction.png


--------------------------------------------------------------------------------
/assets/true_dims_with_noise_vs_dim_reduction_4separation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/scottyler89/umap_tsne_experiment/0e6ce4ff31aa34a7d6dc0ef56c3275daf864a684/assets/true_dims_with_noise_vs_dim_reduction_4separation.png


--------------------------------------------------------------------------------
/experiment.py:
--------------------------------------------------------------------------------
  1 | from scipy.stats import pearsonr
  2 | from scipy.spatial import distance_matrix
  3 | from sklearn.decomposition import NMF
  4 | from minisom import MiniSom
  5 | import skdim
  6 | import seaborn as sns
  7 | import matplotlib.pyplot as plt
  8 | import umap
  9 | import numpy as np
 10 | from matplotlib import pyplot as plt
 11 | from sklearn.manifold import TSNE
 12 | from sklearn.decomposition import PCA
 13 | 
 14 | 
 15 | def get_main_mat(true_gen_func, n_obs, true_dims, separation=0):
 16 |     """
 17 |     Generate the main matrix with an optional separation between two clusters.
 18 |     
 19 |     Args:
 20 |     - true_gen_func (function): Function to generate the main matrix
 21 |     - n_obs (int): Number of observations
 22 |     - true_dims (int): Number of true dimensions
 23 |     - separation (float): Multiple of standard deviation for separating clusters
 24 |     
 25 |     Returns:
 26 |     - main_mat (numpy array): Generated main matrix
 27 |     """
 28 |     if separation == 0:
 29 |         return true_gen_func(n_obs, true_dims)
 30 | 
 31 |     # Generate half of the observations
 32 |     half_obs = n_obs // 2
 33 |     cluster_1 = true_gen_func(half_obs, true_dims)
 34 | 
 35 |     # Generate the other half with an offset in the first dimension
 36 |     cluster_2 = true_gen_func(int(n_obs - half_obs), true_dims)
 37 |     offset = separation * np.std(cluster_1[:, 0])
 38 |     cluster_2[:, 0] += offset
 39 |     # Concatenate the two clusters vertically
 40 |     main_mat = np.vstack([cluster_1, cluster_2])
 41 | 
 42 |     return main_mat
 43 | 
 44 | 
 45 | 
 46 | 
 47 | def generate_data(n_obs, true_dims, n_redundant_per_true, true_gen_func, redundant_gen_noise_func, sd_ratio, separation=0):
 48 |     """
 49 |     Generates data matrix with true dimensions and redundant dimensions.
 50 |     
 51 |     Args:
 52 |     - n_obs (int): Number of observations
 53 |     - true_dims (int): Number of true dimensions
 54 |     - n_redundant_per_true (int): Number of redundant dimensions per true dimension
 55 |     - true_gen_func (function): Function to generate the main matrix
 56 |     - redundant_gen_noise_func (function): Function to generate noise for redundant dimensions
 57 |     - sd_ratio (float): Ratio for scaling noise
 58 |     
 59 |     Returns:
 60 |     - main_mat (numpy matrix): Matrix of true dimensions
 61 |     - redundant_mat (numpy matrix): Matrix of redundant dimensions
 62 |     """
 63 |     # Generate the main matrix
 64 |     if separation == 0.:
 65 |         main_mat = true_gen_func(n_obs, true_dims)
 66 |     else:
 67 |         main_mat = get_main_mat(true_gen_func, n_obs, true_dims, separation=separation)
 68 |     # Placeholder for the redundant dimensions
 69 |     redundant_dims = []
 70 |     for i in range(true_dims):
 71 |         # Standard deviation for this dimension in main_mat
 72 |         dim_std = np.std(main_mat[:, i])
 73 |         # Create n_redundant_per_true redundant dimensions seeded at main_mat[:, i] values
 74 |         for _ in range(n_redundant_per_true):
 75 |             noise = redundant_gen_noise_func(n_obs, 1)
 76 |             redundant_dim = main_mat[:, i][:,
 77 |                                            np.newaxis] + noise * sd_ratio * dim_std
 78 |             # Standardize the redundant dimension
 79 |             redundant_dim = redundant_dim / np.std(redundant_dim)
 80 |             redundant_dims.append(redundant_dim)
 81 |     # Stack all redundant dimensions horizontally
 82 |     redundant_mat = np.hstack(redundant_dims)
 83 |     return main_mat, redundant_mat
 84 | 
 85 | 
 86 | def dim_reduction(in_mat, dim_red_func_list, dim_red_names, final_dims):
 87 |     """
 88 |     Reduces the dimensions of the input matrix using specified functions.
 89 |     
 90 |     Args:
 91 |     - in_mat (numpy matrix): Input data matrix
 92 |     - dim_red_func_list (list): List of dimension reduction functions
 93 |     - final_dims (int): Number of dimensions after reduction
 94 |     
 95 |     Returns:
 96 |     - results (dict): dictionary of dim_red_names and their results
 97 |     """
 98 |     # Placeholder for the results from each dimension reduction function
 99 |     results = {}
100 |     for func, name in zip(dim_red_func_list, dim_red_names):
101 |         result = func(in_mat, final_dims)
102 |         results[name]=result
103 |     return results
104 | 
105 | 
106 | ####################################################
107 | def tsne_wrapper(data, n_components):
108 |     tsne = TSNE(n_components=n_components)
109 |     return tsne.fit_transform(data)
110 | 
111 | 
112 | def umap_wrapper(data, n_components):
113 |     reducer = umap.UMAP(n_components=n_components)
114 |     return reducer.fit_transform(data)
115 | 
116 | 
117 | def pca_wrapper(data, n_components):
118 |     pca = PCA(n_components=n_components)
119 |     return pca.fit_transform(data)
120 | 
121 | 
122 | def nmf_wrapper(data, n_components, epsilon = 1e-8):
123 |     """
124 |     Use NMF for dimensionality reduction.
125 |     
126 |     Parameters:
127 |     - data: input data
128 |     - n_components: number of components for the reduced dimension
129 |     
130 |     Returns:
131 |     - transformed_data: numpy array of shape (n_samples, n_components)
132 |     """
133 |     data -= np.min(data)
134 |     data += epsilon
135 |     nmf = NMF(n_components=n_components, init='random', random_state=0)
136 |     transformed_data = nmf.fit_transform(data)
137 |     return transformed_data
138 | 
139 | 
140 | def som_wrapper(data, n_components=2):
141 |     """
142 |     Use MiniSom for SOM.
143 |     Note: For SOM, n_components is expected to be 2 since we are using a 2D grid.
144 |     
145 |     Parameters:
146 |     - data: input data
147 |     - n_components: dimensions of the output (expected to be 2 for a 2D grid)
148 |     
149 |     Returns:
150 |     - positions: numpy array of shape (n_samples, n_components) representing positions on the grid
151 |     """
152 |     assert n_components == 2, "For SOM, n_components should be 2."
153 | 
154 |     x_size, y_size = 50, 50  # You can adjust these values based on your needs
155 |     som = MiniSom(x_size, y_size, data.shape[1])
156 |     som.train_random(data, 5000)
157 | 
158 |     positions = np.array([som.winner(d) for d in data])
159 |     return positions
160 | 
161 | 
162 | #####################################################
163 | 
164 | def true_gen_func(n_obs, true_dims):
165 |     """
166 |     Example function to generate the main matrix.
167 |     
168 |     Args:
169 |     - n_obs (int): Number of observations
170 |     - true_dims (int): Number of true dimensions
171 |     
172 |     Returns:
173 |     - Main matrix (numpy array)
174 |     """
175 |     return np.random.randn(n_obs, true_dims)
176 | 
177 | 
178 | def redundant_gen_noise_func(n_obs, true_dims):
179 |     """
180 |     Example function to generate noise for redundant dimensions.
181 |     
182 |     Args:
183 |     - n_obs (int): Number of observations
184 |     - true_dims (int): Number of true dimensions
185 |     
186 |     Returns:
187 |     - Noise matrix (numpy array)
188 |     """
189 |     return np.random.randn(n_obs, true_dims)
190 | 
191 | 
192 | #########################################
193 | 
194 | def plot_dim_reductions(true_dim_dict, results_dict, sep_str):
195 |     """
196 |     Plots scatter plots of true dimensions and results of dimension reduction methods.
197 |     
198 |     Args:
199 |     - true_dim_dict (dict): Dictionary of true dimensions for each sd_ratio
200 |     - results_dict (dict): Dictionary of results for each sd_ratio and each dimension reduction method
201 |     
202 |     """
203 |     # Number of rows is the number of sd_ratios
204 |     n_rows = len(results_dict)
205 | 
206 |     # Number of columns is 1 (for true dimensions) + number of dimension reduction methods
207 |     n_cols = 1 + len(next(iter(results_dict.values())))
208 | 
209 |     # Create a figure with subplots
210 |     fig, axes = plt.subplots(n_rows, n_cols, figsize=(5*n_cols, 5 * n_rows))
211 | 
212 |     # Adjust the spacing between subplots
213 |     # Adjust these values as needed for desired spacing
214 |     fig.subplots_adjust(wspace=0.3, hspace=0.3)
215 | 
216 |     # Loop through each sd_ratio and plot
217 |     for i, sd_ratio in enumerate(results_dict):
218 |         # Row title for sd_ratio
219 |         if n_cols > 1:
220 |             axes[i, 0].set_ylabel(sd_ratio, fontsize=28,
221 |                                   rotation=90, labelpad=50, va="center")
222 |         else:
223 |             axes.set_ylabel(sd_ratio, fontsize=28, rotation=90,
224 |                             labelpad=50, va="center")
225 | 
226 |         # Plot true dimensions
227 |         axes[i, 0].scatter(true_dim_dict[sd_ratio][:, 0],
228 |                            true_dim_dict[sd_ratio][:, 1], alpha=0.6, s=5)
229 |         #axes[i, 0].set_title(f"True Dimensions", fontsize=12)
230 | 
231 |         # Loop through each dimension reduction method and plot
232 |         for j, method in enumerate(results_dict[sd_ratio]):
233 |             axes[i, j+1].scatter(results_dict[sd_ratio][method][:, 0],
234 |                                  results_dict[sd_ratio][method][:, 1], alpha=0.6, s=5)
235 |             #axes[i, j+1].set_title(f"{method}", fontsize=12)
236 | 
237 |     # Add column titles
238 |     col_titles = ['Intrinsic Dimensions that\ncreated input vals+noise'] + \
239 |         list(next(iter(results_dict.values())).keys())
240 |     for ax, col in zip(axes[0], col_titles):
241 |         ax.annotate(col, (0.5, 1.15), xycoords='axes fraction', ha='center',
242 |                     va='center', fontsize=28, textcoords='offset points')
243 |     for ax_row in axes:
244 |         for ax in ax_row:
245 |             for spine in ax.spines.values():
246 |                 spine.set_linewidth(2)
247 |     plt.savefig("assets/true_dims_with_noise_vs_dim_reduction"+sep_str+".png", dpi=300)
248 | 
249 | 
250 | 
251 | def plot_obs_data_heatmap(gt_data_dict, obs_data_dict, danco_dict, sep_str):
252 |     """
253 |     Plots heatmaps of obs_data for each sd_ratio.
254 |     
255 |     Args:
256 |     - gt_data_dict (dict): Dictionary of ground truth data matrices for each sd_ratio
257 |     - obs_data_dict (dict): Dictionary of obs_data matrices for each sd_ratio
258 |     
259 |     """
260 |     # Number of rows is the number of sd_ratios
261 |     n_rows = len(obs_data_dict)
262 | 
263 |     # Create a figure with subplots
264 |     fig, axes = plt.subplots(n_rows, 2, figsize=(10, 5 * n_rows))
265 |     fig.subplots_adjust(wspace=0.35, hspace=0.38)
266 | 
267 |     if n_rows == 1:
268 |         axes = [axes]
269 | 
270 |     # Loop through each sd_ratio and plot
271 |     for i, sd_ratio in enumerate(obs_data_dict.keys()):
272 |         # Extract ground truth and observation data for scatter plot
273 |         temp_gt_data = gt_data_dict[sd_ratio]
274 |         temp_obs_data = obs_data_dict[sd_ratio]
275 |         estimated_dims = danco_dict[sd_ratio]
276 | 
277 |         # Plot the heatmap
278 |         sns.heatmap(temp_obs_data, ax=axes[i, 0], cmap="YlGnBu", cbar=False)
279 |         axes[i, 0].set_title(f"DANCo dim\nestimate {estimated_dims:.2f}", fontsize=23)
280 |         axes[i, 0].set_xticks([])
281 |         axes[i, 0].set_yticks([])
282 |         axes[i, 0].set_ylabel(sd_ratio, fontsize=26, rotation=90,
283 |                               labelpad=50, va="center")
284 | 
285 |         # Scatter plot
286 |         axes[i, 1].scatter(temp_obs_data[:, 0], temp_gt_data[:, 0], alpha=0.5)
287 |         #axes[i, 1].set_title(f"Scatter for {sd_ratio}", fontsize=14)
288 |         axes[i, 1].set_xlabel("Dim-1 Redundant + noise", fontsize=23)
289 |         axes[i, 1].set_ylabel("Ground-truth Dim-1", fontsize=23)
290 |         for spine in plt.gca().spines.values():
291 |             spine.set_linewidth(2)
292 | 
293 |         # Add correlation line
294 |         m, b = np.polyfit(temp_obs_data[:, 0], temp_gt_data[:, 0], 1)
295 |         axes[i, 1].plot(temp_obs_data[:, 0], m *
296 |                         temp_obs_data[:, 0] + b, color='red', linewidth=2)
297 |     for ax_row in axes:
298 |         for ax in ax_row:
299 |             for spine in ax.spines.values():
300 |                 spine.set_linewidth(2)
301 |     plt.savefig("assets/heatmap_and_scatters"+sep_str+".png", dpi=300)
302 | 
303 | 
304 | #########
305 | 
306 | 
307 | def plot_distance_correlations(true_dim_dict, results_dict, sep_str):
308 |     """
309 |     Plots scatter plots of true pairwise distances and pairwise distances from dimension reduction methods.
310 |     
311 |     Args:
312 |     - true_dim_dict (dict): Dictionary of true dimensions for each sd_ratio
313 |     - results_dict (dict): Dictionary of results for each sd_ratio and each dimension reduction method
314 |     
315 |     """
316 |     # Number of rows is the number of sd_ratios
317 |     n_rows = len(results_dict)
318 | 
319 |     # Number of columns is 1 (for true dimensions) + number of dimension reduction methods
320 |     n_cols = 1 + len(next(iter(results_dict.values())))
321 | 
322 |     # Create a figure with subplots
323 |     fig, axes = plt.subplots(n_rows, n_cols, figsize=(5*n_cols, 5 * n_rows))
324 | 
325 |     # Adjust the spacing between subplots
326 |     fig.subplots_adjust(wspace=0.3, hspace=0.3)
327 | 
328 |     # Loop through each sd_ratio and plot
329 |     for i, sd_ratio in enumerate(results_dict):
330 |         # Compute true pairwise distances and flatten
331 |         true_distances = distance_matrix(
332 |             true_dim_dict[sd_ratio], true_dim_dict[sd_ratio])
333 |         true_distances_flat = true_distances.flatten()
334 | 
335 |         # Plot true distances against themselves
336 |         axes[i, 0].scatter(true_distances_flat,
337 |                            true_distances_flat, alpha=0.6, s=5)
338 |         r_val, p_val = pearsonr(true_distances_flat, true_distances_flat)
339 |         axes[i, 0].set_title(f"r={r_val:.2f}, p={p_val:.2e}", fontsize=22)
340 | 
341 |         # Loop through each dimension reduction method and plot
342 |         for j, method in enumerate(results_dict[sd_ratio]):
343 |             # Compute pairwise distances for reduced data and flatten
344 |             reduced_distances = distance_matrix(
345 |                 results_dict[sd_ratio][method], results_dict[sd_ratio][method])
346 |             reduced_distances_flat = reduced_distances.flatten()
347 | 
348 |             # Plot true distances against reduced distances
349 |             axes[i, j+1].scatter(true_distances_flat,
350 |                                  reduced_distances_flat, alpha=0.025, s=5)
351 | 
352 |             # Compute correlation
353 |             r_val, p_val = pearsonr(
354 |                 true_distances_flat, reduced_distances_flat)
355 |             axes[i, j +
356 |                  1].set_title(f"r={r_val:.2f}, p={p_val:.2e}", fontsize=22)
357 | 
358 |     # Add column titles
359 |     col_titles = ['True Distances'] + \
360 |         list(next(iter(results_dict.values())).keys())
361 |     for ax, col in zip(axes[0], col_titles):
362 |         ax.annotate(col, (0.5, 1.15), xycoords='axes fraction', ha='center',
363 |                     va='center', fontsize=26, textcoords='offset points')
364 | 
365 |     for ax_row in axes:
366 |         for ax in ax_row:
367 |             for spine in ax.spines.values():
368 |                 spine.set_linewidth(2)
369 | 
370 |     # Save the figure
371 |     plt.savefig("assets/distance_correlations"+sep_str+".png", dpi=300)
372 | 
373 | 
374 | 
375 | #########
376 | 
377 | 
378 | def plot_intrinsic_dimensionality(sd_lookup, intrinsic_dim_estimate_dict, sep_str):
379 |     """
380 |     Plots the estimated intrinsic dimensionality against noise levels.
381 |     
382 |     Args:
383 |     - sd_lookup (dict): Dictionary mapping sd_ratio names to their respective values
384 |     - intrinsic_dim_estimate_dict (dict): Dictionary of estimated intrinsic dimensionality for each sd_ratio
385 |     """
386 |     # Extract data
387 |     sd_values = [sd_lookup[key] for key in intrinsic_dim_estimate_dict.keys()]
388 |     dim_estimates = list(intrinsic_dim_estimate_dict.values())
389 | 
390 |     # Create a scatter plot with loess fit curve
391 |     plt.figure(figsize=(10, 6))
392 |     sns.regplot(x=sd_values, y=dim_estimates, lowess=True, scatter_kws={
393 |                 's': 100, 'alpha': 0.6}, line_kws={'color': 'red', 'lw': 2})
394 |     plt.xlabel("Noise Level (SD Ratio)")
395 |     plt.ylabel("Estimated Intrinsic Dimensionality")
396 |     plt.title("Intrinsic Dimensionality vs. Noise Level")
397 |     plt.grid(True, which='both', linestyle='--', linewidth=0.5)
398 |     plt.tight_layout()
399 |     plt.savefig("assets/intrinsic_dims_increase_with_noise"+sep_str+".png", dpi=300)
400 | 
401 | 
402 | #########
403 | #########################################
404 | 
405 | # Parameters for the experiment
406 | np.random.seed(123456)
407 | n_obs = 1000
408 | true_dims = 2
409 | n_redundant_per_true = 100
410 | sd_ratios = [0.01, 0.05, 0.25, 0.5, 1.]
411 | separation_vect = [0, 4]
412 | sep_dict = {}
413 | intrinsic_dim_estimate_dict = {}
414 | true_dim_dict = {}
415 | obs_data_dict = {}
416 | results_dict = {}
417 | sd_lookup = {}
418 | for sep in separation_vect:
419 |     sep_name = "Clust Sep:"+str(sep)
420 |     
421 |     for sd_ratio in sd_ratios:
422 |         sd_name = "SD ratio:"+str(sd_ratio)
423 |         sd_lookup[sd_name] = sd_ratio
424 |         final_dims = true_dims  # This is just an example; adjust as needed
425 |         # Generate data
426 |         true_dim_data, obs_data = generate_data(n_obs, true_dims, n_redundant_per_true, true_gen_func, redundant_gen_noise_func, sd_ratio, separation=sep)
427 | 
428 |         # Estimates of intrinsic dimensionality.
429 |         # Interesting note here, but it actually identifies
430 |         # that as noise dimensions are added, and the size of the noise relative to
431 |         # dims are 'real dims.' This fits with the model of it finding 
432 |         # that added noise in one dimension is actually adding its own dimension, even if the 'real'
433 |         # variation was already accounted for by prior dims. It's not like this is incorrect or anything...
434 |         # It's just that noise is a dimension. The hard part is figuring out which dims are "meaningful"!
435 |         ## https: // doi.org/10.48550/arXiv.1206.3881
436 |         danco = skdim.id.DANCo().fit(obs_data)
437 |         print(danco.dimension_)
438 |         intrinsic_dim_estimate_dict[sd_name] = danco.dimension_
439 | 
440 |         # log the data
441 |         true_dim_dict[sd_name] = true_dim_data
442 |         obs_data_dict[sd_name] = obs_data
443 | 
444 |         # Perform dimension reduction
445 |         dim_red_funcs = [pca_wrapper, nmf_wrapper, tsne_wrapper, umap_wrapper, som_wrapper]
446 |         dim_red_names = ["PCA", "NMF", "tSNE", "UMAP", "SOM"]
447 |         results_dict[sd_name] = dim_reduction(obs_data, dim_red_funcs, dim_red_names, final_dims)
448 |     if sep==0:
449 |         sep_str = ""
450 |     else:
451 |         sep_str = "_"+str(sep)+"separation"
452 |     # Call the plotting functions
453 |     plot_dim_reductions(true_dim_dict, results_dict, sep_str)
454 |     plot_obs_data_heatmap(true_dim_dict, obs_data_dict,
455 |                         intrinsic_dim_estimate_dict, sep_str)
456 |     plot_intrinsic_dimensionality(sd_lookup, intrinsic_dim_estimate_dict, sep_str)
457 |     plot_distance_correlations(true_dim_dict, results_dict, sep_str)
458 |     sep_dict[sep_name] = results_dict
459 | 
460 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | seaborn
2 | umap-learn
3 | numpy
4 | matplotlib
5 | scikit-learn
6 | scikit-dimension
7 | minisom


--------------------------------------------------------------------------------