├── .gitattributes
├── .gitignore
├── Bagging.ipynb
├── LICENSE.txt
├── MANIFEST
├── README.md
├── collinearity.egg-info
    ├── PKG-INFO
    ├── SOURCES.txt
    ├── dependency_links.txt
    ├── requires.txt
    └── top_level.txt
├── collinearity
    ├── SelectNonCollinear.py
    └── __init__.py
├── dist
    └── collinearity-0.6.tar.gz
├── setup.cfg
└── setup.py


/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | *.bat


--------------------------------------------------------------------------------
/Bagging.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "nbformat": 4,
  3 |   "nbformat_minor": 0,
  4 |   "metadata": {
  5 |     "colab": {
  6 |       "name": "Bagging.ipynb",
  7 |       "provenance": [],
  8 |       "authorship_tag": "ABX9TyNaaTq/6YMpzCw6an8wM0jO"
  9 |     },
 10 |     "kernelspec": {
 11 |       "name": "python3",
 12 |       "display_name": "Python 3"
 13 |     },
 14 |     "language_info": {
 15 |       "name": "python"
 16 |     }
 17 |   },
 18 |   "cells": [
 19 |     {
 20 |       "cell_type": "code",
 21 |       "metadata": {
 22 |         "id": "PfYP9a4pgPjP"
 23 |       },
 24 |       "source": [
 25 |         "from sklearn.datasets import load_breast_cancer,load_diabetes"
 26 |       ],
 27 |       "execution_count": 1,
 28 |       "outputs": []
 29 |     },
 30 |     {
 31 |       "cell_type": "code",
 32 |       "metadata": {
 33 |         "id": "-jH5nMn4gVAp"
 34 |       },
 35 |       "source": [
 36 |         "from sklearn.linear_model import LinearRegression\n",
 37 |         "from sklearn.naive_bayes import GaussianNB"
 38 |       ],
 39 |       "execution_count": 2,
 40 |       "outputs": []
 41 |     },
 42 |     {
 43 |       "cell_type": "code",
 44 |       "metadata": {
 45 |         "id": "uDpqIJ9ygdT4"
 46 |       },
 47 |       "source": [
 48 |         "from sklearn.ensemble import BaggingClassifier,BaggingRegressor\n",
 49 |         "\n",
 50 |         "\n",
 51 |         "from sklearn.model_selection import cross_val_score"
 52 |       ],
 53 |       "execution_count": 3,
 54 |       "outputs": []
 55 |     },
 56 |     {
 57 |       "cell_type": "markdown",
 58 |       "metadata": {
 59 |         "id": "kip8j-dogkXl"
 60 |       },
 61 |       "source": [
 62 |         "# Classification"
 63 |       ]
 64 |     },
 65 |     {
 66 |       "cell_type": "code",
 67 |       "metadata": {
 68 |         "id": "Zz0yBcg3gjh5"
 69 |       },
 70 |       "source": [
 71 |         "X,y = load_breast_cancer(return_X_y=True)"
 72 |       ],
 73 |       "execution_count": 4,
 74 |       "outputs": []
 75 |     },
 76 |     {
 77 |       "cell_type": "code",
 78 |       "metadata": {
 79 |         "colab": {
 80 |           "base_uri": "https://localhost:8080/"
 81 |         },
 82 |         "id": "AAcr6L2ehB2I",
 83 |         "outputId": "381e5f6d-a996-4ba5-b2d7-211e2ca46faa"
 84 |       },
 85 |       "source": [
 86 |         "nb = GaussianNB()\n",
 87 |         "\n",
 88 |         "cross_val_score(nb,X,y,scoring=\"balanced_accuracy\",cv=10).var()"
 89 |       ],
 90 |       "execution_count": 5,
 91 |       "outputs": [
 92 |         {
 93 |           "output_type": "execute_result",
 94 |           "data": {
 95 |             "text/plain": [
 96 |               "0.0011182285777794419"
 97 |             ]
 98 |           },
 99 |           "metadata": {},
100 |           "execution_count": 5
101 |         }
102 |       ]
103 |     },
104 |     {
105 |       "cell_type": "code",
106 |       "metadata": {
107 |         "id": "73OUAh18gn0w"
108 |       },
109 |       "source": [
110 |         "model = BaggingClassifier(GaussianNB(),n_estimators = 10, max_features = 0.5,random_state = 0, n_jobs = -1)"
111 |       ],
112 |       "execution_count": 6,
113 |       "outputs": []
114 |     },
115 |     {
116 |       "cell_type": "code",
117 |       "metadata": {
118 |         "colab": {
119 |           "base_uri": "https://localhost:8080/"
120 |         },
121 |         "id": "IyJCe1pIg5Kg",
122 |         "outputId": "06119176-f24f-437e-b3ce-e44cc6416710"
123 |       },
124 |       "source": [
125 |         "cross_val_score(model,X,y,scoring=\"balanced_accuracy\",cv=10).var()"
126 |       ],
127 |       "execution_count": 7,
128 |       "outputs": [
129 |         {
130 |           "output_type": "execute_result",
131 |           "data": {
132 |             "text/plain": [
133 |               "0.000944202642795715"
134 |             ]
135 |           },
136 |           "metadata": {},
137 |           "execution_count": 7
138 |         }
139 |       ]
140 |     },
141 |     {
142 |       "cell_type": "markdown",
143 |       "metadata": {
144 |         "id": "_9Jq8iGJh4lI"
145 |       },
146 |       "source": [
147 |         "# Regression"
148 |       ]
149 |     },
150 |     {
151 |       "cell_type": "code",
152 |       "metadata": {
153 |         "id": "g3sCLbR9hbWQ"
154 |       },
155 |       "source": [
156 |         "X,y = load_diabetes(return_X_y=True)"
157 |       ],
158 |       "execution_count": 8,
159 |       "outputs": []
160 |     },
161 |     {
162 |       "cell_type": "code",
163 |       "metadata": {
164 |         "colab": {
165 |           "base_uri": "https://localhost:8080/"
166 |         },
167 |         "id": "kUAjPZOniDrJ",
168 |         "outputId": "7053de4b-de9f-475c-9f79-5ef39d396a00"
169 |       },
170 |       "source": [
171 |         "lr = LinearRegression()\n",
172 |         "\n",
173 |         "cross_val_score(lr,X,y,scoring=\"r2\",cv=10).var()"
174 |       ],
175 |       "execution_count": 10,
176 |       "outputs": [
177 |         {
178 |           "output_type": "execute_result",
179 |           "data": {
180 |             "text/plain": [
181 |               "0.021605440351612316"
182 |             ]
183 |           },
184 |           "metadata": {},
185 |           "execution_count": 10
186 |         }
187 |       ]
188 |     },
189 |     {
190 |       "cell_type": "code",
191 |       "metadata": {
192 |         "id": "WDBYrDjoiJNj"
193 |       },
194 |       "source": [
195 |         "model = BaggingRegressor(LinearRegression(),n_estimators = 10, max_features = 0.5,random_state = 0, n_jobs = -1)"
196 |       ],
197 |       "execution_count": 11,
198 |       "outputs": []
199 |     },
200 |     {
201 |       "cell_type": "code",
202 |       "metadata": {
203 |         "colab": {
204 |           "base_uri": "https://localhost:8080/"
205 |         },
206 |         "id": "6lAhspYEkcJq",
207 |         "outputId": "4e069618-4b42-4c5b-f0f9-0aef74f37e0f"
208 |       },
209 |       "source": [
210 |         "cross_val_score(model,X,y,scoring=\"r2\",cv=10).var()"
211 |       ],
212 |       "execution_count": 12,
213 |       "outputs": [
214 |         {
215 |           "output_type": "execute_result",
216 |           "data": {
217 |             "text/plain": [
218 |               "0.013136832268767986"
219 |             ]
220 |           },
221 |           "metadata": {},
222 |           "execution_count": 12
223 |         }
224 |       ]
225 |     },
226 |     {
227 |       "cell_type": "code",
228 |       "metadata": {
229 |         "id": "w1zOyd_s-yF-"
230 |       },
231 |       "source": [
232 |         ""
233 |       ],
234 |       "execution_count": null,
235 |       "outputs": []
236 |     }
237 |   ]
238 | }


--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) [year] [fullname]
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.


--------------------------------------------------------------------------------
/MANIFEST:
--------------------------------------------------------------------------------
1 | # file GENERATED by distutils, do NOT edit
2 | setup.cfg
3 | setup.py
4 | collinearity\SelectNonCollinear.py
5 | collinearity\__init__.py
6 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Introduction
  2 | 
  3 | This library implements some functions for removing collinearity from a dataset of features. It can be used both for supervised and for unsupervised machine learning problems.
  4 | 
  5 | Collinearity is evaluated calculating __Pearson's linear correlation coefficient__ between the features. The user sets a __threshold__, which is the maximum absolute value allowed for the correlation coefficients in the correlation matrix. 
  6 | 
  7 | For __unsupervised problems__, the algorithm selects only those features that produce a correlation matrix whose off-diagonal elements are, in absolute value, less than the threshold. 
  8 | 
  9 | For __supervised problems__, the importance of the features with respect to the target variable is calculated using a univariate approach. Then, the features are added with the same unsupervised approach, starting from the most important ones.
 10 | 
 11 | # Objects
 12 | 
 13 | The main object is __SelectNonCollinear__. It can be imported this way:
 14 | 
 15 | ```python
 16 | from collinearity import SelectNonCollinear
 17 | ```
 18 | 
 19 | > collinearity.__SelectNonCollinear__(_correlation_threshold=0.5, scoring=f_classif_)
 20 | 
 21 | Parameters:
 22 | 
 23 | __correlation_threshold : _float (between 0 and 1), default = 0.5___
 24 | 
 25 | Only those features that produce a correlation matrix with off-diagonal elements that are, in absolute value, less than this threshold will be chosen.
 26 | 
 27 | __scoring : _callable, default=f_classif___
 28 | 
 29 | The scoring function for supervised problems. It must be the same accepted by [sklearn.feature_selection.SelectKBest](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html).
 30 | 
 31 | # Methods
 32 | 
 33 | This object supports the main methods of scikit-learn Estimators:
 34 | 
 35 | > fit(X,y=None)
 36 | 
 37 | Identifies the features to consider. For supervised problems, _y_ is the target array and the algorithm is:
 38 | - Sort the features by scoring descending
 39 | - Take the most important feature (i.e. the first feature)
 40 | - Take the next feature if its linear correlation coefficient with every already selected feature is, in absolute value, lower than the threshold
 41 | - Keep adding features as long as the correlation constraint holds
 42 | 
 43 | For unsupervised problems, we have `y = None` and the algorithm is:
 44 | - Take the couple of features that have the lowest absolute value of the linear correlation coefficient.
 45 | - If it's lower than the threshold, consider these features
 46 | - Keep adding features as long as the correlation matrix doesn't show off-diagonal elements whose absolute value is greater than the threshold. 
 47 | 
 48 | > transform(X)
 49 | 
 50 | Selects the features according to the result of _fit_. It must be called after _fit_.
 51 | 
 52 | > fit_transform(X,y=None)
 53 | 
 54 | Calls _fit_ and then _transform_
 55 | 
 56 | > get_support()
 57 | 
 58 | Returns an array of _True_ and _False_ of size X.shape[1]. A feature is selected if the value on this array corresponding to its index is _True_, otherwise it's not selected.
 59 | 
 60 | # Examples
 61 | 
 62 | The following examples explain how the main objects work. The code to run in advance for initializing the environment is:
 63 | 
 64 | ```python
 65 | from collinearity import SelectNonCollinear
 66 | from sklearn.feature_selection import f_regression
 67 | import numpy as np
 68 | from sklearn.datasets import load_diabetes
 69 | 
 70 | X,y = load_diabetes(return_X_y=True)
 71 | ```
 72 | 
 73 | ## Unsupervised problems
 74 | 
 75 | 
 76 | This example shows how to perform selection according to minimum collinearity in unsupervised problems. 
 77 | 
 78 | Let's consider, for this example, a threshold equal to 0.3.
 79 | 
 80 | ```python
 81 | selector = SelectNonCollinear(0.3)
 82 | ```
 83 | 
 84 | If we apply the selection to the features and calculate the correlation matrix, we have:
 85 | 
 86 | ```python
 87 | np.corrcoef(selector.fit_transform(X),rowvar=False)
 88 | 
 89 | # array([[1.       , 0.1737371 , 0.18508467, 0.26006082],
 90 | #       [0.1737371 , 1.        , 0.0881614 , 0.03527682],
 91 | #       [0.18508467, 0.0881614 , 1.        , 0.24977742],
 92 | #       [0.26006082, 0.03527682, 0.24977742, 1.        ]])
 93 | 
 94 | ```
 95 | As we can see, no off-diagonal element is greater than the threshold.
 96 | 
 97 | ## Supervised problems
 98 | 
 99 | For this problem, we must set the value of the `scoring` argument in the constructor. 
100 | 
101 | Let's consider a threshold equal to 0.4 and a scoring equal to `f_regression`.
102 | 
103 | ```python
104 | selector = SelectNonCollinear(correlation_threshold=0.4,scoring=f_regression)
105 | 
106 | selector.fit(X,y)
107 | ```
108 | 
109 | The correlation matrix is:
110 | ```python
111 | np.corrcoef(selector.transform(X),rowvar=False)
112 | 
113 | # array([[ 1.       ,  0.1737371 ,  0.18508467,  0.33542671,  0.26006082,
114 | #        -0.07518097,  0.30173101],
115 | #       [ 0.1737371 ,  1.        ,  0.0881614 ,  0.24101317,  0.03527682,
116 | #        -0.37908963,  0.20813322],
117 | #       [ 0.18508467,  0.0881614 ,  1.        ,  0.39541532,  0.24977742,
118 | #        -0.36681098,  0.38867999],
119 | #       [ 0.33542671,  0.24101317,  0.39541532,  1.        ,  0.24246971,
120 | #        -0.17876121,  0.39042938],
121 | #       [ 0.26006082,  0.03527682,  0.24977742,  0.24246971,  1.        ,
122 | #         0.05151936,  0.32571675],
123 | #       [-0.07518097, -0.37908963, -0.36681098, -0.17876121,  0.05151936,
124 | #         1.        , -0.2736973 ],
125 | #       [ 0.30173101,  0.20813322,  0.38867999,  0.39042938,  0.32571675,
126 | #        -0.2736973 ,  1.        ]])
127 | ```
128 | 
129 | Again, no off-diagonal element is greater than the threshold in absolute value.
130 | 
131 | ## Use in pipelines
132 | 
133 | It's possible to use `SelectNonCollinear` inside a pipeline, if necessary.
134 | 
135 | ```python
136 | pipeline = make_pipeline(SelectNonCollinear(correlation_threshold=0.4, scoring=f_regression), LinearRegression())
137 | ```
138 | # Contact the author
139 | 
140 | For any questions, you can contact me at gianluca.malato@gmail.com
141 | 


--------------------------------------------------------------------------------
/collinearity.egg-info/PKG-INFO:
--------------------------------------------------------------------------------
  1 | Metadata-Version: 2.1
  2 | Name: collinearity
  3 | Version: 0.6
  4 | Summary: A Python library for removing collinearity in machine learning datasets
  5 | Home-page: https://github.com/gianlucamalato/collinearity
  6 | Author: Gianluca Malato
  7 | Author-email: gianluca.malato@gmail.com
  8 | License: MIT
  9 | Download-URL: https://github.com/gianlucamalato/collinearity/archive/refs/tags/0.6.tar.gz
 10 | Description: # Introduction
 11 |         
 12 |         This library implements some functionf for removing collinearity from a dataset of features. It can be used both for supervised and for unsupervised machine learning problems.
 13 |         
 14 |         Collinearity is evaluated calculating __Pearson's linear correlation coefficient__ between the features. The user sets a __threshold__, which is the maximum absolute value allowed for the correlation coefficients in the correlation matrix. 
 15 |         
 16 |         For __unsupervised problems__, the algorithm selects only those features that produce a correlation matrix whose off-diagonal elements are, in absolute value, less than the threshold. 
 17 |         
 18 |         For __supervised problems__, the importance of the features with respect to the target variable is calculated using a univariate approach. Then, the features are added with the same unsupervised approach, starting from the most important ones.
 19 |         
 20 |         # Objects
 21 |         
 22 |         The main object is __SelectNonCollinear__. It can be imported this way:
 23 |         
 24 |         ```python
 25 |         from collinearity import SelectNonCollinear
 26 |         ```
 27 |         
 28 |         > collinearity.__SelectNonCollinear__(_correlation_threshold=0.4, scoring=f_classif_)
 29 |         
 30 |         Parameters:
 31 |         
 32 |         __correlation_threshold : _float (between 0 and 1), default = 0.4___
 33 |         
 34 |         Only those features that produce a correlation matrix with off-diagonal elements that are, in absolute value, less than this threshold will be chosen.
 35 |         
 36 |         __scoring : _callable, default=f_classif___
 37 |         
 38 |         The scoring function for supervised problems. It must be the same accepted by [sklearn.feature_selection.SelectKBest](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html).
 39 |         
 40 |         # Methods
 41 |         
 42 |         This object supports the main methods of scikit-learn Estimators:
 43 |         
 44 |         > fit(X,y=None)
 45 |         
 46 |         Identifies the features to consider. For supervised problems, _y_ is the target array and the algorithm is:
 47 |         - Sort the features by scoring descending
 48 |         - Take the most important feature (i.e. the first feature)
 49 |         - Take the next feature if it shows a linear correlation coefficient with the already selected feature that is, in absolute value, lower than the threshold
 50 |         - Keep adding features as long as the correlation constraint holds
 51 |         
 52 |         For unsupervised problems, we have `y = None` and the algorithm is:
 53 |         - Take the couple of features that have the lowest absolute value of the linear correlation coefficient.
 54 |         - If it's lower than the threshold, consider these features
 55 |         - Keep adding features as long as the correlation matrix doesn't show off-diagonal elements whose absolute value is greater than the threshold. 
 56 |         
 57 |         > transform(X)
 58 |         
 59 |         Selects the features according to the result of _fit_. It must be called after _fit_.
 60 |         
 61 |         > fit_transform(X,y=None)
 62 |         
 63 |         Calls _fit_ and then _transform_
 64 |         
 65 |         > get_support()
 66 |         
 67 |         Returns an array of _True_ and _False_ of size X.shape[1]. A feature is selected if the value on this array corresponding to its index is _True_, otherwise it's not selected.
 68 |         
 69 |         # Examples
 70 |         
 71 |         The following examples explain how the main objects work. The code to run in advance for initializing the environment is:
 72 |         
 73 |         ```python
 74 |         from collinearity import SelectNonCollinear
 75 |         from sklearn.feature_selection import f_regression
 76 |         import numpy as np
 77 |         from sklearn.datasets import load_diabetes
 78 |         
 79 |         X,y = load_diabetes(return_X_y=True)
 80 |         ```
 81 |         
 82 |         ## Unsupervised problems
 83 |         
 84 |         
 85 |         This example shows how to perform selection according to minimum collinearity in unsupervised problems. 
 86 |         
 87 |         Let's consider, for this example, a threshold equal to 0.3.
 88 |         
 89 |         ```python
 90 |         selector = SelectNonCollinear(0.3)
 91 |         ```
 92 |         
 93 |         If we apply the selection to the features and calculate the correlation matrix, we have:
 94 |         
 95 |         ```python
 96 |         np.corrcoef(selector.fit_transform(X),rowvar=False)
 97 |         
 98 |         # array([[1.       , 0.1737371 , 0.18508467, 0.26006082],
 99 |         #       [0.1737371 , 1.        , 0.0881614 , 0.03527682],
100 |         #       [0.18508467, 0.0881614 , 1.        , 0.24977742],
101 |         #       [0.26006082, 0.03527682, 0.24977742, 1.        ]])
102 |         
103 |         ```
104 |         As we can see, no off-diagonal element is greater than the threshold.
105 |         
106 |         ## Supervised problems
107 |         
108 |         For this problem, we must set the value of the `scoring` argument in the constructor. 
109 |         
110 |         Let's consider a threshold equal to 0.4 and a scoring equal to `f_regression`.
111 |         
112 |         ```python
113 |         selector = SelectNonCollinear(correlation_threshold=0.4,scoring=f_regression)
114 |         
115 |         selector.fit(X,y)
116 |         ```
117 |         
118 |         The correlation matrix is:
119 |         ```python
120 |         np.corrcoef(selector.transform(X),rowvar=False)
121 |         
122 |         # array([[ 1.       ,  0.1737371 ,  0.18508467,  0.33542671,  0.26006082,
123 |         #        -0.07518097,  0.30173101],
124 |         #       [ 0.1737371 ,  1.        ,  0.0881614 ,  0.24101317,  0.03527682,
125 |         #        -0.37908963,  0.20813322],
126 |         #       [ 0.18508467,  0.0881614 ,  1.        ,  0.39541532,  0.24977742,
127 |         #        -0.36681098,  0.38867999],
128 |         #       [ 0.33542671,  0.24101317,  0.39541532,  1.        ,  0.24246971,
129 |         #        -0.17876121,  0.39042938],
130 |         #       [ 0.26006082,  0.03527682,  0.24977742,  0.24246971,  1.        ,
131 |         #         0.05151936,  0.32571675],
132 |         #       [-0.07518097, -0.37908963, -0.36681098, -0.17876121,  0.05151936,
133 |         #         1.        , -0.2736973 ],
134 |         #       [ 0.30173101,  0.20813322,  0.38867999,  0.39042938,  0.32571675,
135 |         #        -0.2736973 ,  1.        ]])
136 |         ```
137 |         
138 |         Again, no off-diagonal element is greater than the threshold in absolute value.
139 |         
140 |         ## Use in pipelines
141 |         
142 |         It's possible to use `SelectNonCollinear` inside a pipeline, if necessary.
143 |         
144 |         ```python
145 |         pipeline = make_pipeline(SelectNonCollinear(correlation_threshold=0.4, scoring=f_regression), LinearRegression())
146 |         ```
147 |         # Contact the author
148 |         
149 |         For any questions, you can contact me at gianluca.malato@gmail.com
150 |         
151 | Keywords: machine learning,collinearity,supervised models
152 | Platform: UNKNOWN
153 | Classifier: Development Status :: 3 - Alpha
154 | Classifier: Intended Audience :: Developers
155 | Classifier: Topic :: Software Development :: Build Tools
156 | Classifier: License :: OSI Approved :: MIT License
157 | Classifier: Programming Language :: Python :: 3
158 | Classifier: Programming Language :: Python :: 3.4
159 | Classifier: Programming Language :: Python :: 3.5
160 | Classifier: Programming Language :: Python :: 3.6
161 | Description-Content-Type: text/markdown
162 | 


--------------------------------------------------------------------------------
/collinearity.egg-info/SOURCES.txt:
--------------------------------------------------------------------------------
 1 | README.md
 2 | setup.cfg
 3 | setup.py
 4 | collinearity/SelectNonCollinear.py
 5 | collinearity/__init__.py
 6 | collinearity.egg-info/PKG-INFO
 7 | collinearity.egg-info/SOURCES.txt
 8 | collinearity.egg-info/dependency_links.txt
 9 | collinearity.egg-info/requires.txt
10 | collinearity.egg-info/top_level.txt


--------------------------------------------------------------------------------
/collinearity.egg-info/dependency_links.txt:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/collinearity.egg-info/requires.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | scikit-learn
3 | 


--------------------------------------------------------------------------------
/collinearity.egg-info/top_level.txt:
--------------------------------------------------------------------------------
1 | collinearity
2 | 


--------------------------------------------------------------------------------
/collinearity/SelectNonCollinear.py:
--------------------------------------------------------------------------------
 1 | from sklearn.feature_selection import SelectKBest,f_classif,f_regression
 2 | import numpy as np
 3 | 
 4 | 
 5 | def remove_collinearity_unsupervised(X,corr_threshold=0.4):
 6 |   n = X.shape[1]
 7 |   corr_matrix = np.abs(np.corrcoef(X,rowvar=False))
 8 |   chosen_indices = []
 9 | 
10 |   min_corr = 1
11 |   best_couple = None
12 |   for i in range(0,corr_matrix.shape[0]):
13 |     for j in range(i+1,corr_matrix.shape[1]):
14 |       if corr_matrix[i,j] < min_corr:
15 |         best_couple = [i,j]
16 |         min_corr = corr_matrix[i,j]
17 |   
18 |   if min_corr > corr_threshold:
19 |     return [False]*n
20 | 
21 |   chosen_indices.extend(best_couple)
22 | 
23 |   stop = False
24 | 
25 |   while not stop:
26 |     stop = True
27 |     for i in range(X.shape[1]):
28 |       if i in chosen_indices:
29 |         continue
30 |       else:
31 |         max_corr =np.max(np.abs(np.corrcoef(X[:,chosen_indices + [i]],rowvar=False) - np.identity(len(chosen_indices)+1)))
32 |         if max_corr < corr_threshold:
33 |           chosen_indices.append(i)
34 |           stop = False
35 |   mask = [i in chosen_indices for i in range(X.shape[1])]
36 | 
37 |   return mask
38 | 
39 | 
40 |   
41 | 
42 | 
def remove_collinearity_supervised(X,y,scoring=f_classif,corr_threshold=0.4):
  """Pick columns of X by univariate score while keeping pairwise correlation below a threshold.

  The columns are ranked by the scores that `SelectKBest(scoring, k="all")`
  computes on (X, y), highest first. The best column is always kept; each
  following column is kept only if its absolute Pearson correlation with
  every column kept so far is strictly below `corr_threshold`.

  Parameters
  ----------
  X : 2-D array with features on the columns.
  y : target passed to `SelectKBest.fit`.
  scoring : callable, default f_classif
      Score function handed to `SelectKBest`.
  corr_threshold : float, default 0.4
      Exclusive upper bound on the absolute correlation between any two
      selected columns.

  Returns
  -------
  list of bool, of length `X.shape[1]`
      True where the column is selected.
  """
  s = SelectKBest(scoring,k="all")
  s.fit(X,y)

  # Descending, stable ranking. Sorting a Python list with NaN keys yields an
  # arbitrary order; argsort places NaN scores last and keeps ties in index order.
  scores = np.asarray(s.scores_,dtype=float)
  ranking = np.argsort(-scores,kind="stable")

  # Computed once instead of once per candidate; atleast_2d covers the
  # single-column case where np.corrcoef returns a 0-d value.
  corr_matrix = np.atleast_2d(np.abs(np.corrcoef(X,rowvar=False)))

  selected_features = []
  for f in ranking:
    f = int(f)
    # The top-ranked column is taken unconditionally. Later columns only need
    # checking against the selected ones, which are already mutually below
    # the threshold. A NaN correlation fails `<` and the column is skipped.
    if not selected_features or np.max(corr_matrix[f,selected_features]) < corr_threshold:
      selected_features.append(f)

  chosen = set(selected_features)
  return [i in chosen for i in range(X.shape[1])]
61 | 
62 | 
class SelectNonCollinear:
  """Feature selector that keeps columns whose pairwise correlation is below a threshold.

  Parameters
  ----------
  correlation_threshold : float, default 0.5
      Threshold forwarded to the selection routines.
  scoring : callable, default f_classif
      Score function used only when `fit` receives a target `y`.

  Attributes
  ----------
  correlation_threshold_ : the threshold given to the constructor or `set_params`.
  scoring_ : the score function given to the constructor or `set_params`.
  mask_ : list of bool, set by `fit`; True marks a selected column.
  """

  def __init__(self,correlation_threshold=0.5, scoring=f_classif):
    self.correlation_threshold_ = correlation_threshold
    self.scoring_ = scoring

  def fit(self,X,y=None):
    """Compute the selection mask; unsupervised when `y` is None, supervised otherwise.

    Returns `self` so that calls can be chained.
    """
    if y is None:
      self.mask_ = remove_collinearity_unsupervised(X,self.correlation_threshold_)
    else:
      self.mask_ = remove_collinearity_supervised(X,y,self.scoring_,self.correlation_threshold_)
    return self

  def transform(self,X):
    """Return the columns of X selected by the last `fit` (requires `fit` to have run)."""
    return X[:,self.mask_]

  def fit_transform(self,X,y=None):
    """Run `fit(X, y)` and return `transform(X)`."""
    self.fit(X,y)
    return self.transform(X)

  def get_params(self,deep=True):
    """Return the constructor arguments as a dict keyed by their constructor names.

    `deep` is accepted for call compatibility and is not used.
    """
    return {
      "correlation_threshold": self.correlation_threshold_,
      "scoring": self.scoring_,
    }

  def set_params(self,correlation_threshold=None,scoring=None):
    """Update the stored parameters; arguments left as None are not changed.

    Returns `self` so that calls can be chained.
    """
    if correlation_threshold is not None:
      self.correlation_threshold_ = correlation_threshold
    if scoring is not None:
      self.scoring_ = scoring
    return self

  def get_support(self):
    """Return the boolean mask computed by the last `fit`."""
    return self.mask_


--------------------------------------------------------------------------------
/collinearity/__init__.py:
--------------------------------------------------------------------------------
1 | from collinearity.SelectNonCollinear import SelectNonCollinear


--------------------------------------------------------------------------------
/dist/collinearity-0.6.tar.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/gianlucamalato/collinearity/ad78f1c4344f7c7cd812717080377317af17fcb2/dist/collinearity-0.6.tar.gz


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | # Inside of setup.cfg
2 | [metadata]
3 | description-file = README.md


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #from distutils.core import setup
 2 | from setuptools import setup, Extension
 3 | 
 4 | with open('README.md') as f:
 5 |     long_description = f.read()
 6 | 
 7 | 
 8 | setup(
 9 |   name = 'collinearity',         # How you named your package folder (MyLib)
10 |   packages = ['collinearity'],   # Chose the same as "name"
11 |   version = '0.6.1',      # Start with a small number and increase it with every change you make
12 |   license='MIT',        # Chose a license from here: https://help.github.com/articles/licensing-a-repository
13 |   description = 'A Python library for removing collinearity in machine learning datasets',   # Give a short description about your library
14 |   long_description=long_description,
15 |   long_description_content_type='text/markdown',  # This is important!
16 |   
17 |   author = 'Gianluca Malato',                   # Type in your name
18 |   author_email = 'gianluca.malato@gmail.com',      # Type in your E-Mail
19 |   url = 'https://github.com/gianlucamalato/collinearity',   # Provide either the link to your github or to your website
20 |   download_url = 'https://github.com/gianlucamalato/collinearity/archive/refs/tags/0.6.1.tar.gz',    # I explain this later on
21 |   keywords = ['machine learning', 'collinearity', 'supervised models'],   # Keywords that define your package best
22 |   install_requires=[            # I get to this in a second
23 |           'numpy',
24 |           'scikit-learn',
25 |       ],
26 |   classifiers=[
27 |     'Development Status :: 3 - Alpha',      # Chose either "3 - Alpha", "4 - Beta" or "5 - Production/Stable" as the current state of your package
28 |     'Intended Audience :: Developers',      # Define that your audience are developers
29 |     'Topic :: Software Development :: Build Tools',
30 |     'License :: OSI Approved :: MIT License',   # Again, pick a license
31 |     'Programming Language :: Python :: 3',      #Specify which pyhton versions that you want to support
32 |     'Programming Language :: Python :: 3.4',
33 |     'Programming Language :: Python :: 3.5',
34 |     'Programming Language :: Python :: 3.6',
35 |   ],
36 | )


--------------------------------------------------------------------------------