├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── gower ├── __init__.py └── gower_dist.py ├── pyproject.toml ├── requirements.txt ├── setup.cfg ├── setup.py └── tests ├── __pycache__ └── matrix_test.cpython-37-pytest-5.0.1.pyc └── matrix_test.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | *.csv 3 | *.json 4 | gower/.ipynb_checkpoints/ 5 | gower/__pycache__/ 6 | .ipynb_checkpoints/ 7 | gower.egg-info/ 8 | dist/ 9 | .idea/ 10 | build/ 11 | *.ipynb 12 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" # current default Python on Travis CI 4 | - "3.7" 5 | - "3.8" 6 | 7 | install: 8 | - pip install -r requirements.txt 9 | - pip install . 10 | # command to run tests 11 | script: pytest -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | Copyright (c) 2019 Michael Yan 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | The above copyright notice and this permission notice shall be included in all 10 | copies or substantial portions of the Software. 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 13 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 14 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 15 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 16 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 17 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | [![Build Status](https://travis-ci.com/wwwjk366/gower.svg?branch=master)](https://travis-ci.com/wwwjk366/gower) 3 | [![PyPI version](https://badge.fury.io/py/gower.svg)](https://pypi.org/project/gower/) 4 | [![Downloads](https://pepy.tech/badge/gower/month)](https://pepy.tech/project/gower/month) 5 | 6 | 7 | # Introduction 8 | 9 | Gower's distance calculation in Python. Gower Distance is a distance measure that can be used to calculate distance between two entity whose attribute has a mixed of categorical and numerical values. [Gower (1971) A general coefficient of similarity and some of its properties. Biometrics 27 857–874.](https://www.jstor.org/stable/2528823?seq=1) 10 | 11 | More details and examples can be found on my personal website here:(https://www.thinkdatascience.com/post/2019-12-16-introducing-python-package-gower/) 12 | 13 | Core functions are wrote by [Marcelo Beckmann](https://sourceforge.net/projects/gower-distance-4python/files/). 14 | 15 | # Examples 16 | 17 | ## Installation 18 | 19 | ``` 20 | pip install gower 21 | ``` 22 | 23 | ## Generate some data 24 | 25 | ```python 26 | import numpy as np 27 | import pandas as pd 28 | import gower 29 | 30 | Xd=pd.DataFrame({'age':[21,21,19, 30,21,21,19,30,None], 31 | 'gender':['M','M','N','M','F','F','F','F',None], 32 | 'civil_status':['MARRIED','SINGLE','SINGLE','SINGLE','MARRIED','SINGLE','WIDOW','DIVORCED',None], 33 | 'salary':[3000.0,1200.0 ,32000.0,1800.0 ,2900.0 ,1100.0 ,10000.0,1500.0,None], 34 | 'has_children':[1,0,1,1,1,0,0,1,None], 35 | 'available_credit':[2200,100,22000,1100,2000,100,6000,2200,None]}) 36 | Yd = Xd.iloc[1:3,:] 37 | X = np.asarray(Xd) 38 | Y = np.asarray(Yd) 39 | 40 | ``` 41 | 42 | ## Find the distance matrix 43 | 44 | ```python 45 | gower.gower_matrix(X) 46 | ``` 47 | 48 | 49 | 50 | 51 | array([[0. , 0.3590238 , 0.6707398 , 0.31787416, 0.16872811, 52 | 0.52622986, 0.59697855, 0.47778758, nan], 53 | [0.3590238 , 0. , 0.6964303 , 0.3138769 , 0.523629 , 54 | 0.16720603, 0.45600235, 0.6539635 , nan], 55 | [0.6707398 , 0.6964303 , 0. , 0.6552807 , 0.6728013 , 56 | 0.6969697 , 0.740428 , 0.8151941 , nan], 57 | [0.31787416, 0.3138769 , 0.6552807 , 0. , 0.4824794 , 58 | 0.48108295, 0.74818605, 0.34332284, nan], 59 | [0.16872811, 0.523629 , 0.6728013 , 0.4824794 , 0. , 60 | 0.35750175, 0.43237334, 0.3121036 , nan], 61 | [0.52622986, 0.16720603, 0.6969697 , 0.48108295, 0.35750175, 62 | 0. , 0.2898751 , 0.4878362 , nan], 63 | [0.59697855, 0.45600235, 0.740428 , 0.74818605, 0.43237334, 64 | 0.2898751 , 0. , 0.57476616, nan], 65 | [0.47778758, 0.6539635 , 0.8151941 , 0.34332284, 0.3121036 , 66 | 0.4878362 , 0.57476616, 0. , nan], 67 | [ nan, nan, nan, nan, nan, 68 | nan, nan, nan, nan]], dtype=float32) 69 | 70 | 71 | ## Find Top n results 72 | 73 | ```python 74 | gower.gower_topn(Xd.iloc[0:2,:], Xd.iloc[:,], n = 5) 75 | ``` 76 | 77 | 78 | 79 | 80 | {'index': array([4, 3, 1, 7, 5]), 81 | 'values': array([0.16872811, 0.31787416, 0.3590238 , 0.47778758, 0.52622986], 82 | dtype=float32)} 83 | 84 | 85 | -------------------------------------------------------------------------------- /gower/__init__.py: -------------------------------------------------------------------------------- 1 | from .gower_dist import gower_matrix, gower_topn -------------------------------------------------------------------------------- /gower/gower_dist.py: -------------------------------------------------------------------------------- 1 | from scipy.sparse import issparse 2 | import numpy as np 3 | import pandas as pd 4 | 5 | def gower_matrix(data_x, data_y=None, weight=None, cat_features=None): 6 | 7 | # function checks 8 | X = data_x 9 | if data_y is None: Y = data_x 10 | else: Y = data_y 11 | if not isinstance(X, np.ndarray): 12 | if not np.array_equal(X.columns, Y.columns): raise TypeError("X and Y must have same columns!") 13 | else: 14 | if not X.shape[1] == Y.shape[1]: raise TypeError("X and Y must have same y-dim!") 15 | 16 | if issparse(X) or issparse(Y): raise TypeError("Sparse matrices are not supported!") 17 | 18 | x_n_rows, x_n_cols = X.shape 19 | y_n_rows, y_n_cols = Y.shape 20 | 21 | if cat_features is None: 22 | if not isinstance(X, np.ndarray): 23 | is_number = np.vectorize(lambda x: not np.issubdtype(x, np.number)) 24 | cat_features = is_number(X.dtypes) 25 | else: 26 | cat_features = np.zeros(x_n_cols, dtype=bool) 27 | for col in range(x_n_cols): 28 | if not np.issubdtype(type(X[0, col]), np.number): 29 | cat_features[col]=True 30 | else: 31 | cat_features = np.array(cat_features) 32 | 33 | # print(cat_features) 34 | 35 | if not isinstance(X, np.ndarray): X = np.asarray(X) 36 | if not isinstance(Y, np.ndarray): Y = np.asarray(Y) 37 | 38 | Z = np.concatenate((X,Y)) 39 | 40 | x_index = range(0,x_n_rows) 41 | y_index = range(x_n_rows,x_n_rows+y_n_rows) 42 | 43 | Z_num = Z[:,np.logical_not(cat_features)] 44 | 45 | num_cols = Z_num.shape[1] 46 | num_ranges = np.zeros(num_cols) 47 | num_max = np.zeros(num_cols) 48 | 49 | for col in range(num_cols): 50 | col_array = Z_num[:, col].astype(np.float32) 51 | max = np.nanmax(col_array) 52 | min = np.nanmin(col_array) 53 | 54 | if np.isnan(max): 55 | max = 0.0 56 | if np.isnan(min): 57 | min = 0.0 58 | num_max[col] = max 59 | num_ranges[col] = np.abs(1 - min / max) if (max != 0) else 0.0 60 | 61 | # This is to normalize the numeric values between 0 and 1. 62 | Z_num = np.divide(Z_num ,num_max,out=np.zeros_like(Z_num), where=num_max!=0) 63 | Z_cat = Z[:,cat_features] 64 | 65 | if weight is None: 66 | weight = np.ones(Z.shape[1]) 67 | 68 | #print(weight) 69 | 70 | weight_cat=weight[cat_features] 71 | weight_num=weight[np.logical_not(cat_features)] 72 | 73 | out = np.zeros((x_n_rows, y_n_rows), dtype=np.float32) 74 | 75 | weight_sum = weight.sum() 76 | 77 | X_cat = Z_cat[x_index,] 78 | X_num = Z_num[x_index,] 79 | Y_cat = Z_cat[y_index,] 80 | Y_num = Z_num[y_index,] 81 | 82 | # print(X_cat,X_num,Y_cat,Y_num) 83 | 84 | for i in range(x_n_rows): 85 | j_start= i 86 | if x_n_rows != y_n_rows: 87 | j_start = 0 88 | # call the main function 89 | res = gower_get(X_cat[i,:], 90 | X_num[i,:], 91 | Y_cat[j_start:y_n_rows,:], 92 | Y_num[j_start:y_n_rows,:], 93 | weight_cat, 94 | weight_num, 95 | weight_sum, 96 | cat_features, 97 | num_ranges, 98 | num_max) 99 | #print(res) 100 | out[i,j_start:]=res 101 | if x_n_rows == y_n_rows: out[i:,j_start]=res 102 | 103 | return out 104 | 105 | 106 | def gower_get(xi_cat,xi_num,xj_cat,xj_num,feature_weight_cat, 107 | feature_weight_num,feature_weight_sum,categorical_features, 108 | ranges_of_numeric,max_of_numeric ): 109 | 110 | # categorical columns 111 | sij_cat = np.where(xi_cat == xj_cat,np.zeros_like(xi_cat),np.ones_like(xi_cat)) 112 | sum_cat = np.multiply(feature_weight_cat,sij_cat).sum(axis=1) 113 | 114 | # numerical columns 115 | abs_delta=np.absolute(xi_num-xj_num) 116 | sij_num=np.divide(abs_delta, ranges_of_numeric, out=np.zeros_like(abs_delta), where=ranges_of_numeric!=0) 117 | 118 | sum_num = np.multiply(feature_weight_num,sij_num).sum(axis=1) 119 | sums= np.add(sum_cat,sum_num) 120 | sum_sij = np.divide(sums,feature_weight_sum) 121 | 122 | return sum_sij 123 | 124 | def smallest_indices(ary, n): 125 | """Returns the n largest indices from a numpy array.""" 126 | #n += 1 127 | flat = np.nan_to_num(ary.flatten(), nan=999) 128 | indices = np.argpartition(-flat, -n)[-n:] 129 | indices = indices[np.argsort(flat[indices])] 130 | #indices = np.delete(indices,0,0) 131 | values = flat[indices] 132 | return {'index': indices, 'values': values} 133 | 134 | def gower_topn(data_x, data_y=None, weight=None, cat_features=None, n = 5): 135 | 136 | if data_x.shape[0] >= 2: TypeError("Only support `data_x` of 1 row. ") 137 | dm = gower_matrix(data_x, data_y, weight, cat_features) 138 | 139 | return smallest_indices(np.nan_to_num(dm[0], nan=1),n) 140 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "gower" 7 | version = "0.1.2" 8 | authors = [ 9 | { name="Michael Yan", email="author@example.com" }, 10 | { name="Dominic D"} 11 | ] 12 | description = "Python implementation of Gowers distance, pairwise between records in two data sets" 13 | readme = "README.md" 14 | requires-python = ">=2.7" 15 | keywords=['gower', 'distance', 'matrix'] 16 | classifiers = [ 17 | "Programming Language :: Python :: 3", 18 | "License :: OSI Approved :: MIT License", 19 | "Operating System :: OS Independent", 20 | ] 21 | 22 | [project.urls] 23 | "Homepage" = "https://github.com/wwwjk366/gower" -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | numpy 2 | scipy 3 | pandas -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | from os import path 4 | this_directory = path.abspath(path.dirname(__file__)) 5 | with open(path.join(this_directory, 'README.md'), encoding='utf-8') as f: 6 | long_description = f.read() 7 | 8 | setup( 9 | name='gower', 10 | version='0.1.0', 11 | description='Python implementation of Gowers distance, pairwise between records in two data sets', 12 | long_description=long_description, 13 | long_description_content_type="text/markdown", 14 | keywords=['gower', 'distance', 'matrix'], 15 | url='https://github.com/wwwjk366/gower', 16 | author='Michael Yan', 17 | author_email='tanbingy@gmail.com', 18 | license='MIT', 19 | packages=find_packages(), 20 | install_requires=['numpy', 'scipy'], 21 | include_package_data=True, 22 | zip_safe=False, 23 | classifiers=[ 24 | 'Development Status :: 3 - Alpha', # Chose either "3 - Alpha", "4 - Beta" or "5 - Production/Stable" as the current state of your package 25 | 'Intended Audience :: Developers', # Define that your audience are developers 26 | 'Topic :: Scientific/Engineering :: Mathematics', 27 | 'License :: OSI Approved :: MIT License', # Again, pick a license 28 | 'Programming Language :: Python :: 3.7', 29 | ], 30 | package_data={ 31 | # If any package contains *.txt files, include them: 32 | # '': ['*.sav'], 33 | # And include any *.dat files found in the 'data' subdirectory 34 | # of the 'mypkg' package, also: 35 | #'customer_models': ['model_objs/*.sav'], 36 | }) -------------------------------------------------------------------------------- /tests/__pycache__/matrix_test.cpython-37-pytest-5.0.1.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wwwjk366/gower/d92f2145e65efe305143e9e30cc69b722d1c52e3/tests/__pycache__/matrix_test.cpython-37-pytest-5.0.1.pyc -------------------------------------------------------------------------------- /tests/matrix_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | import numpy as np 4 | import pandas as pd 5 | import gower 6 | 7 | def test_answer(): 8 | Xd=pd.DataFrame({'age':[21,21,19, 30,21,21,19,30,None], 9 | 'gender':['M','M','N','M','F','F','F','F',None], 10 | 'civil_status':['MARRIED','SINGLE','SINGLE','SINGLE','MARRIED','SINGLE','WIDOW','DIVORCED',None], 11 | 'salary':[3000.0,1200.0 ,32000.0,1800.0 ,2900.0 ,1100.0 ,10000.0,1500.0,None], 12 | 'has_children':[1,0,1,1,1,0,0,1,None], 13 | 'available_credit':[2200,100,22000,1100,2000,100,6000,2200,None]}) 14 | Yd = Xd.iloc[1:3,:] 15 | X = np.asarray(Xd) 16 | Y = np.asarray(Yd) 17 | aaa = gower.gower_matrix(X) 18 | assert aaa[0][1] == pytest.approx(0.3590238, 0.001) --------------------------------------------------------------------------------