├── .gitattributes ├── .gitignore ├── Bagging.ipynb ├── LICENSE.txt ├── MANIFEST ├── README.md ├── collinearity.egg-info ├── PKG-INFO ├── SOURCES.txt ├── dependency_links.txt ├── requires.txt └── top_level.txt ├── collinearity ├── SelectNonCollinear.py └── __init__.py ├── dist └── collinearity-0.6.tar.gz ├── setup.cfg └── setup.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.bat -------------------------------------------------------------------------------- /Bagging.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "Bagging.ipynb", 7 | "provenance": [], 8 | "authorship_tag": "ABX9TyNaaTq/6YMpzCw6an8wM0jO" 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "code", 21 | "metadata": { 22 | "id": "PfYP9a4pgPjP" 23 | }, 24 | "source": [ 25 | "from sklearn.datasets import load_breast_cancer,load_diabetes" 26 | ], 27 | "execution_count": 1, 28 | "outputs": [] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "metadata": { 33 | "id": "-jH5nMn4gVAp" 34 | }, 35 | "source": [ 36 | "from sklearn.linear_model import LinearRegression\n", 37 | "from sklearn.naive_bayes import GaussianNB" 38 | ], 39 | "execution_count": 2, 40 | "outputs": [] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "metadata": { 45 | "id": "uDpqIJ9ygdT4" 46 | }, 47 | "source": [ 48 | "from sklearn.ensemble import BaggingClassifier,BaggingRegressor\n", 49 | "\n", 50 | "\n", 51 | "from sklearn.model_selection import 
cross_val_score" 52 | ], 53 | "execution_count": 3, 54 | "outputs": [] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": { 59 | "id": "kip8j-dogkXl" 60 | }, 61 | "source": [ 62 | "# Classification" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "metadata": { 68 | "id": "Zz0yBcg3gjh5" 69 | }, 70 | "source": [ 71 | "X,y = load_breast_cancer(return_X_y=True)" 72 | ], 73 | "execution_count": 4, 74 | "outputs": [] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "metadata": { 79 | "colab": { 80 | "base_uri": "https://localhost:8080/" 81 | }, 82 | "id": "AAcr6L2ehB2I", 83 | "outputId": "381e5f6d-a996-4ba5-b2d7-211e2ca46faa" 84 | }, 85 | "source": [ 86 | "nb = GaussianNB()\n", 87 | "\n", 88 | "cross_val_score(nb,X,y,scoring=\"balanced_accuracy\",cv=10).var()" 89 | ], 90 | "execution_count": 5, 91 | "outputs": [ 92 | { 93 | "output_type": "execute_result", 94 | "data": { 95 | "text/plain": [ 96 | "0.0011182285777794419" 97 | ] 98 | }, 99 | "metadata": {}, 100 | "execution_count": 5 101 | } 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "metadata": { 107 | "id": "73OUAh18gn0w" 108 | }, 109 | "source": [ 110 | "model = BaggingClassifier(GaussianNB(),n_estimators = 10, max_features = 0.5,random_state = 0, n_jobs = -1)" 111 | ], 112 | "execution_count": 6, 113 | "outputs": [] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "metadata": { 118 | "colab": { 119 | "base_uri": "https://localhost:8080/" 120 | }, 121 | "id": "IyJCe1pIg5Kg", 122 | "outputId": "06119176-f24f-437e-b3ce-e44cc6416710" 123 | }, 124 | "source": [ 125 | "cross_val_score(model,X,y,scoring=\"balanced_accuracy\",cv=10).var()" 126 | ], 127 | "execution_count": 7, 128 | "outputs": [ 129 | { 130 | "output_type": "execute_result", 131 | "data": { 132 | "text/plain": [ 133 | "0.000944202642795715" 134 | ] 135 | }, 136 | "metadata": {}, 137 | "execution_count": 7 138 | } 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": { 144 | "id": "_9Jq8iGJh4lI" 145 | }, 146 | 
"source": [ 147 | "# Regression" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "metadata": { 153 | "id": "g3sCLbR9hbWQ" 154 | }, 155 | "source": [ 156 | "X,y = load_diabetes(return_X_y=True)" 157 | ], 158 | "execution_count": 8, 159 | "outputs": [] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "metadata": { 164 | "colab": { 165 | "base_uri": "https://localhost:8080/" 166 | }, 167 | "id": "kUAjPZOniDrJ", 168 | "outputId": "7053de4b-de9f-475c-9f79-5ef39d396a00" 169 | }, 170 | "source": [ 171 | "lr = LinearRegression()\n", 172 | "\n", 173 | "cross_val_score(lr,X,y,scoring=\"r2\",cv=10).var()" 174 | ], 175 | "execution_count": 10, 176 | "outputs": [ 177 | { 178 | "output_type": "execute_result", 179 | "data": { 180 | "text/plain": [ 181 | "0.021605440351612316" 182 | ] 183 | }, 184 | "metadata": {}, 185 | "execution_count": 10 186 | } 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "metadata": { 192 | "id": "WDBYrDjoiJNj" 193 | }, 194 | "source": [ 195 | "model = BaggingRegressor(LinearRegression(),n_estimators = 10, max_features = 0.5,random_state = 0, n_jobs = -1)" 196 | ], 197 | "execution_count": 11, 198 | "outputs": [] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "metadata": { 203 | "colab": { 204 | "base_uri": "https://localhost:8080/" 205 | }, 206 | "id": "6lAhspYEkcJq", 207 | "outputId": "4e069618-4b42-4c5b-f0f9-0aef74f37e0f" 208 | }, 209 | "source": [ 210 | "cross_val_score(model,X,y,scoring=\"r2\",cv=10).var()" 211 | ], 212 | "execution_count": 12, 213 | "outputs": [ 214 | { 215 | "output_type": "execute_result", 216 | "data": { 217 | "text/plain": [ 218 | "0.013136832268767986" 219 | ] 220 | }, 221 | "metadata": {}, 222 | "execution_count": 12 223 | } 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "metadata": { 229 | "id": "w1zOyd_s-yF-" 230 | }, 231 | "source": [ 232 | "" 233 | ], 234 | "execution_count": null, 235 | "outputs": [] 236 | } 237 | ] 238 | } 
-------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) [year] [fullname] 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /MANIFEST: -------------------------------------------------------------------------------- 1 | # file GENERATED by distutils, do NOT edit 2 | setup.cfg 3 | setup.py 4 | collinearity\SelectNonCollinear.py 5 | collinearity\__init__.py 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Introduction 2 | 3 | This library implements some functions for removing collinearity from a dataset of features. 
It can be used both for supervised and for unsupervised machine learning problems. 4 | 5 | Collinearity is evaluated by calculating __Pearson's linear correlation coefficient__ between the features. The user sets a __threshold__, which is the maximum absolute value allowed for the correlation coefficients in the correlation matrix. 6 | 7 | For __unsupervised problems__, the algorithm selects only those features that produce a correlation matrix whose off-diagonal elements are, in absolute value, less than the threshold. 8 | 9 | For __supervised problems__, the importance of the features with respect to the target variable is calculated using a univariate approach. Then, the features are added with the same unsupervised approach, starting from the most important ones. 10 | 11 | # Objects 12 | 13 | The main object is __SelectNonCollinear__. It can be imported this way: 14 | 15 | ```python 16 | from collinearity import SelectNonCollinear 17 | ``` 18 | 19 | > collinearity.__SelectNonCollinear__(_correlation_threshold=0.5, scoring=f_classif_) 20 | 21 | Parameters: 22 | 23 | __correlation_threshold : _float (between 0 and 1), default = 0.5___ 24 | 25 | Only those features that produce a correlation matrix with off-diagonal elements that are, in absolute value, less than this threshold will be chosen. 26 | 27 | __scoring : _callable, default=f_classif___ 28 | 29 | The scoring function for supervised problems. It must be a score function of the kind accepted by [sklearn.feature_selection.SelectKBest](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html). 30 | 31 | # Methods 32 | 33 | This object supports the main methods of scikit-learn Estimators: 34 | 35 | > fit(X,y=None) 36 | 37 | Identifies the features to consider. For supervised problems, _y_ is the target array and the algorithm is: 38 | - Sort the features by score in descending order 39 | - Take the most important feature (i.e. 
the first feature) 40 | - Take the next feature if it shows a linear correlation coefficient with the already selected feature that is, in absolute value, lower than the threshold 41 | - Keep adding features as long as the correlation constraint holds 42 | 43 | For unsupervised problems, we have `y = None` and the algorithm is: 44 | - Take the couple of features that have the lowest absolute value of the linear correlation coefficient. 45 | - If it's lower than the threshold, consider these features 46 | - Keep adding features as long as the correlation matrix doesn't show off-diagonal elements whose absolute value is greater than the threshold. 47 | 48 | > transform(X) 49 | 50 | Selects the features according to the result of _fit_. It must be called after _fit_. 51 | 52 | > fit_transform(X,y=None) 53 | 54 | Calls _fit_ and then _transform_ 55 | 56 | > get_support() 57 | 58 | Returns an array of _True_ and _False_ of size X.shape[1]. A feature is selected if the value on this array corresponding to its index is _True_, otherwise it's not selected. 59 | 60 | # Examples 61 | 62 | The following examples explain how the main objects work. The code to run in advance for initializing the environment is: 63 | 64 | ```python 65 | from collinearity import SelectNonCollinear 66 | from sklearn.feature_selection import f_regression 67 | import numpy as np 68 | from sklearn.datasets import load_diabetes 69 | 70 | X,y = load_diabetes(return_X_y=True) 71 | ``` 72 | 73 | ## Unsupervised problems 74 | 75 | 76 | This example shows how to perform selection according to minimum collinearity in unsupervised problems. 77 | 78 | Let's consider, for this example, a threshold equal to 0.3. 79 | 80 | ```python 81 | selector = SelectNonCollinear(0.3) 82 | ``` 83 | 84 | If we apply the selection to the features and calculate the correlation matrix, we have: 85 | 86 | ```python 87 | np.corrcoef(selector.fit_transform(X),rowvar=False) 88 | 89 | # array([[1. 
, 0.1737371 , 0.18508467, 0.26006082], 90 | # [0.1737371 , 1. , 0.0881614 , 0.03527682], 91 | # [0.18508467, 0.0881614 , 1. , 0.24977742], 92 | # [0.26006082, 0.03527682, 0.24977742, 1. ]]) 93 | 94 | ``` 95 | As we can see, no off-diagonal element is greater than the threshold. 96 | 97 | ## Supervised problems 98 | 99 | For this problem, we must set the value of the `scoring` argument in the constructor. 100 | 101 | Let's consider a threshold equal to 0.4 and a scoring equal to `f_regression`. 102 | 103 | ```python 104 | selector = SelectNonCollinear(correlation_threshold=0.4,scoring=f_regression) 105 | 106 | selector.fit(X,y) 107 | ``` 108 | 109 | The correlation matrix is: 110 | ```python 111 | np.corrcoef(selector.transform(X),rowvar=False) 112 | 113 | # array([[ 1. , 0.1737371 , 0.18508467, 0.33542671, 0.26006082, 114 | # -0.07518097, 0.30173101], 115 | # [ 0.1737371 , 1. , 0.0881614 , 0.24101317, 0.03527682, 116 | # -0.37908963, 0.20813322], 117 | # [ 0.18508467, 0.0881614 , 1. , 0.39541532, 0.24977742, 118 | # -0.36681098, 0.38867999], 119 | # [ 0.33542671, 0.24101317, 0.39541532, 1. , 0.24246971, 120 | # -0.17876121, 0.39042938], 121 | # [ 0.26006082, 0.03527682, 0.24977742, 0.24246971, 1. , 122 | # 0.05151936, 0.32571675], 123 | # [-0.07518097, -0.37908963, -0.36681098, -0.17876121, 0.05151936, 124 | # 1. , -0.2736973 ], 125 | # [ 0.30173101, 0.20813322, 0.38867999, 0.39042938, 0.32571675, 126 | # -0.2736973 , 1. ]]) 127 | ``` 128 | 129 | Again, no off-diagonal element is greater than the threshold in absolute value. 130 | 131 | ## Use in pipelines 132 | 133 | It's possible to use `SelectNonCollinear` inside a pipeline, if necessary. 
134 | 135 | ```python 136 | pipeline = make_pipeline(SelectNonCollinear(correlation_threshold=0.4, scoring=f_regression), LinearRegression()) 137 | ``` 138 | # Contact the author 139 | 140 | For any questions, you can contact me at gianluca.malato@gmail.com 141 | -------------------------------------------------------------------------------- /collinearity.egg-info/PKG-INFO: -------------------------------------------------------------------------------- 1 | Metadata-Version: 2.1 2 | Name: collinearity 3 | Version: 0.6 4 | Summary: A Python library for removing collinearity in machine learning datasets 5 | Home-page: https://github.com/gianlucamalato/collinearity 6 | Author: Gianluca Malato 7 | Author-email: gianluca.malato@gmail.com 8 | License: MIT 9 | Download-URL: https://github.com/gianlucamalato/collinearity/archive/refs/tags/0.6.tar.gz 10 | Description: # Introduction 11 | 12 | This library implements some functionf for removing collinearity from a dataset of features. It can be used both for supervised and for unsupervised machine learning problems. 13 | 14 | Collinearity is evaluated calculating __Pearson's linear correlation coefficient__ between the features. The user sets a __threshold__, which is the maximum absolute value allowed for the correlation coefficients in the correlation matrix. 15 | 16 | For __unsupervised problems__, the algorithm selects only those features that produce a correlation matrix whose off-diagonal elements are, in absolute value, less than the threshold. 17 | 18 | For __supervised problems__, the importance of the features with respect to the target variable is calculated using a univariate approach. Then, the features are added with the same unsupervised approach, starting from the most important ones. 19 | 20 | # Objects 21 | 22 | The main object is __SelectNonCollinear__. 
It can be imported this way: 23 | 24 | ```python 25 | from collinearity import SelectNonCollinear 26 | ``` 27 | 28 | > collinearity.__SelectNonCollinear__(_correlation_threshold=0.4, scoring=f_classif_) 29 | 30 | Parameters: 31 | 32 | __correlation_threshold : _float (between 0 and 1), default = 0.4___ 33 | 34 | Only those features that produce a correlation matrix with off-diagonal elements that are, in absolute value, less than this threshold will be chosen. 35 | 36 | __scoring : _callable, default=f_classif___ 37 | 38 | The scoring function for supervised problems. It must be the same accepted by [sklearn.feature_selection.SelectKBest](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html). 39 | 40 | # Methods 41 | 42 | This object supports the main methods of scikit-learn Estimators: 43 | 44 | > fit(X,y=None) 45 | 46 | Identifies the features to consider. For supervised problems, _y_ is the target array and the algorithm is: 47 | - Sort the features by scoring descending 48 | - Take the most important feature (i.e. the first feature) 49 | - Take the next feature if it shows a linear correlation coefficient with the already selected feature that is, in absolute value, lower than the threshold 50 | - Keep adding features as long as the correlation constraint holds 51 | 52 | For unsupervised problems, we have `y = None` and the algorithm is: 53 | - Take the couple of features that have the lowest absolute value of the linear correlation coefficient. 54 | - If it's lower than the threshold, consider these features 55 | - Keep adding features as long as the correlation matrix doesn't show off-diagonal elements whose absolute value is greater than the threshold. 56 | 57 | > transform(X) 58 | 59 | Selects the features according to the result of _fit_. It must be called after _fit_. 
60 | 61 | > fit_transform(X,y=None) 62 | 63 | Calls _fit_ and then _transform_ 64 | 65 | > get_support() 66 | 67 | Returns an array of _True_ and _False_ of size X.shape[1]. A feature is selected if the value on this array corresponding to its index is _True_, otherwise it's not selected. 68 | 69 | # Examples 70 | 71 | The following examples explain how the main objects work. The code to run in advance for initializing the environment is: 72 | 73 | ```python 74 | from collinearity import SelectNonCollinear 75 | from sklearn.feature_selection import f_regression 76 | import numpy as np 77 | from sklearn.datasets import load_diabetes 78 | 79 | X,y = load_diabetes(return_X_y=True) 80 | ``` 81 | 82 | ## Unsupervised problems 83 | 84 | 85 | This example shows how to perform selection according to minimum collinearity in unsupervised problems. 86 | 87 | Let's consider, for this example, a threshold equal to 0.3. 88 | 89 | ```python 90 | selector = SelectNonCollinear(0.3) 91 | ``` 92 | 93 | If we apply the selection to the features and calculate the correlation matrix, we have: 94 | 95 | ```python 96 | np.corrcoef(selector.fit_transform(X),rowvar=False) 97 | 98 | # array([[1. , 0.1737371 , 0.18508467, 0.26006082], 99 | # [0.1737371 , 1. , 0.0881614 , 0.03527682], 100 | # [0.18508467, 0.0881614 , 1. , 0.24977742], 101 | # [0.26006082, 0.03527682, 0.24977742, 1. ]]) 102 | 103 | ``` 104 | As we can see, no off-diagonal element is greater than the threshold. 105 | 106 | ## Supervised problems 107 | 108 | For this problem, we must set the value of the `scoring` argument in the constructor. 109 | 110 | Let's consider a threshold equal to 0.4 and a scoring equal to `f_regression`. 111 | 112 | ```python 113 | selector = SelectNonCollinear(correlation_threshold=0.4,scoring=f_regression) 114 | 115 | selector.fit(X,y) 116 | ``` 117 | 118 | The correlation matrix is: 119 | ```python 120 | np.corrcoef(selector.transform(X),rowvar=False) 121 | 122 | # array([[ 1. 
, 0.1737371 , 0.18508467, 0.33542671, 0.26006082, 123 | # -0.07518097, 0.30173101], 124 | # [ 0.1737371 , 1. , 0.0881614 , 0.24101317, 0.03527682, 125 | # -0.37908963, 0.20813322], 126 | # [ 0.18508467, 0.0881614 , 1. , 0.39541532, 0.24977742, 127 | # -0.36681098, 0.38867999], 128 | # [ 0.33542671, 0.24101317, 0.39541532, 1. , 0.24246971, 129 | # -0.17876121, 0.39042938], 130 | # [ 0.26006082, 0.03527682, 0.24977742, 0.24246971, 1. , 131 | # 0.05151936, 0.32571675], 132 | # [-0.07518097, -0.37908963, -0.36681098, -0.17876121, 0.05151936, 133 | # 1. , -0.2736973 ], 134 | # [ 0.30173101, 0.20813322, 0.38867999, 0.39042938, 0.32571675, 135 | # -0.2736973 , 1. ]]) 136 | ``` 137 | 138 | Again, no off-diagonal element is greater than the threshold in absolute value. 139 | 140 | ## Use in pipelines 141 | 142 | It's possible to use `SelectNonCollinear` inside a pipeline, if necessary. 143 | 144 | ```python 145 | pipeline = make_pipeline(SelectNonCollinear(correlation_threshold=0.4, scoring=f_regression), LinearRegression()) 146 | ``` 147 | # Contact the author 148 | 149 | For any questions, you can contact me at gianluca.malato@gmail.com 150 | 151 | Keywords: machine learning,collinearity,supervised models 152 | Platform: UNKNOWN 153 | Classifier: Development Status :: 3 - Alpha 154 | Classifier: Intended Audience :: Developers 155 | Classifier: Topic :: Software Development :: Build Tools 156 | Classifier: License :: OSI Approved :: MIT License 157 | Classifier: Programming Language :: Python :: 3 158 | Classifier: Programming Language :: Python :: 3.4 159 | Classifier: Programming Language :: Python :: 3.5 160 | Classifier: Programming Language :: Python :: 3.6 161 | Description-Content-Type: text/markdown 162 | -------------------------------------------------------------------------------- /collinearity.egg-info/SOURCES.txt: -------------------------------------------------------------------------------- 1 | README.md 2 | setup.cfg 3 | setup.py 4 | 
"""Feature selection that keeps pairwise linear correlation below a threshold.

Two greedy strategies are provided as plain functions, plus the
``SelectNonCollinear`` class that wraps them behind ``fit`` / ``transform``.
"""
from sklearn.feature_selection import SelectKBest, f_classif, f_regression
import numpy as np


def _max_abs_offdiag_corr(X, columns):
    """Return the largest absolute off-diagonal Pearson correlation.

    Only the given ``columns`` of ``X`` are considered (at least two are
    expected). The result is NaN if any pairwise correlation is NaN, e.g.
    when one of the columns is constant.
    """
    corr = np.corrcoef(X[:, columns], rowvar=False)
    # Subtracting the identity zeroes the diagonal so that it never wins.
    return np.max(np.abs(corr - np.identity(len(columns))))


def remove_collinearity_unsupervised(X, corr_threshold=0.4):
    """Select features whose pairwise absolute correlations stay low.

    The two least-correlated features seed the selection; every other
    feature is then visited in column order and kept if, together with the
    features kept so far, no absolute off-diagonal correlation reaches
    ``corr_threshold``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
    corr_threshold : float, default=0.4
        Upper bound on the absolute correlation. The seed pair is accepted
        when its correlation is ``<=`` the threshold; later features must
        be strictly below it.

    Returns
    -------
    list of bool
        One entry per column of ``X``; ``True`` marks a selected feature.
        All ``False`` when even the best pair exceeds the threshold (or no
        pair has a usable correlation below 1).
    """
    X = np.asarray(X)
    n = X.shape[1]
    if n < 2:
        # With fewer than two features there is no pair to correlate, so
        # there is nothing collinear to drop (previously this crashed).
        return [True] * n

    corr_matrix = np.abs(np.corrcoef(X, rowvar=False))

    # Upper-triangle pairs in row-major order, i.e. (0,1), (0,2), ..., so
    # argmin below returns the same pair a nested i<j scan would find first.
    rows, cols = np.triu_indices(n, k=1)
    pair_corr = corr_matrix[rows, cols]
    # NaN correlations (constant columns) and values >= 1 are never seeds.
    candidates = np.flatnonzero(pair_corr < 1)
    if candidates.size == 0:
        return [False] * n

    best = candidates[np.argmin(pair_corr[candidates])]
    if pair_corr[best] > corr_threshold:
        return [False] * n

    chosen_indices = [int(rows[best]), int(cols[best])]

    # A single pass is enough: the chosen set only grows, so a feature that
    # is rejected once would be rejected again on any later pass.
    for i in range(n):
        if i in chosen_indices:
            continue
        if _max_abs_offdiag_corr(X, chosen_indices + [i]) < corr_threshold:
            chosen_indices.append(i)

    chosen = set(chosen_indices)
    return [i in chosen for i in range(n)]


def remove_collinearity_supervised(X, y, scoring=f_classif, corr_threshold=0.4):
    """Select features by univariate score while limiting collinearity.

    Features are ranked by ``scoring`` (highest first). The top feature is
    always kept; each following feature is kept only if, together with the
    features kept so far, every absolute off-diagonal correlation is
    strictly below ``corr_threshold``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
    y : array-like of shape (n_samples,)
        Target passed to the scoring function.
    scoring : callable, default=f_classif
        Score function handed to ``SelectKBest``.
    corr_threshold : float, default=0.4

    Returns
    -------
    list of bool
        One entry per column of ``X``; ``True`` marks a selected feature.
    """
    X = np.asarray(X)
    scorer = SelectKBest(scoring, k="all")
    scorer.fit(X, y)

    # Stable descending sort: ties keep their column order and NaN scores
    # are placed last instead of corrupting the ordering.
    ranking = np.argsort(-np.asarray(scorer.scores_), kind="stable")

    selected_features = []
    for feature in ranking:
        feature = int(feature)
        if not selected_features:
            # The best-scoring feature is kept unconditionally.
            selected_features.append(feature)
        elif _max_abs_offdiag_corr(X, selected_features + [feature]) < corr_threshold:
            selected_features.append(feature)

    selected = set(selected_features)
    return [i in selected for i in range(X.shape[1])]


class SelectNonCollinear:
    """Transformer-style wrapper around the two selection functions.

    Parameters
    ----------
    correlation_threshold : float, default=0.5
        Maximum absolute correlation allowed between selected features.
        Note that the module-level functions default to 0.4 instead.
    scoring : callable, default=f_classif
        Score function used when ``fit`` receives a target ``y``.

    Attributes
    ----------
    correlation_threshold_, scoring_
        The constructor arguments, stored with a trailing underscore.
    mask_ : list of bool
        Set by ``fit``; ``True`` marks a selected column.
    """

    def __init__(self, correlation_threshold=0.5, scoring=f_classif):
        self.correlation_threshold_ = correlation_threshold
        self.scoring_ = scoring

    def fit(self, X, y=None):
        """Compute the selection mask and return ``self``.

        Uses the unsupervised strategy when ``y`` is ``None`` and the
        supervised one otherwise.
        """
        if y is None:
            self.mask_ = remove_collinearity_unsupervised(
                X, self.correlation_threshold_)
        else:
            self.mask_ = remove_collinearity_supervised(
                X, y, self.scoring_, self.correlation_threshold_)
        # Returning self allows ``selector.fit(X, y).transform(X)``.
        return self

    def transform(self, X):
        """Return the columns of ``X`` selected by the last ``fit`` call."""
        return np.asarray(X)[:, np.asarray(self.mask_, dtype=bool)]

    def fit_transform(self, X, y=None):
        """Fit on ``X`` (and ``y``) and return the selected columns of ``X``."""
        return self.fit(X, y).transform(X)

    def set_params(self, correlation_threshold=None, scoring=None):
        """Update the stored parameters and return ``self``.

        An argument left as ``None`` keeps its current value. ``fit`` must
        be called again for the change to affect the selection mask.
        """
        if correlation_threshold is not None:
            self.correlation_threshold_ = correlation_threshold
        if scoring is not None:
            self.scoring_ = scoring
        return self

    def get_support(self):
        """Return the boolean mask computed by the last ``fit`` call."""
        return self.mask_
"""Packaging script for the ``collinearity`` distribution."""
from setuptools import setup, Extension  # NOTE: Extension is not used below

# Read the README explicitly as UTF-8 so the build result does not depend on
# the platform's default locale encoding.
with open('README.md', encoding='utf-8') as f:
    long_description = f.read()


setup(
    name = 'collinearity',         # Distribution name; matches the package folder
    packages = ['collinearity'],   # The only package shipped
    version = '0.6.1',             # Keep in sync with the tag used in download_url
    license='MIT',
    description = 'A Python library for removing collinearity in machine learning datasets',
    long_description=long_description,
    long_description_content_type='text/markdown',  # README.md is Markdown

    author = 'Gianluca Malato',
    author_email = 'gianluca.malato@gmail.com',
    url = 'https://github.com/gianlucamalato/collinearity',
    download_url = 'https://github.com/gianlucamalato/collinearity/archive/refs/tags/0.6.1.tar.gz',
    keywords = ['machine learning', 'collinearity', 'supervised models'],
    install_requires=[             # Runtime dependencies
        'numpy',
        'scikit-learn',
    ],
    classifiers=[
        'Development Status :: 3 - Alpha',
        'Intended Audience :: Developers',
        'Topic :: Software Development :: Build Tools',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
    ],
)