├── artml
│   ├── metrics
│   │   ├── scores.py
│   │   ├── accuracy_score.py
│   │   └── plots.py
│   ├── explore
│   │   ├── Ztest.py
│   │   ├── Ttest.py
│   │   ├── covariance.py
│   │   ├── correlation.py
│   │   ├── chi2.py
│   │   ├── univariate.py
│   │   └── stats.py
│   ├── feature_selection
│   │   └── mahalanobis_features.py
│   └── bet.py
├── python
│   └── artml
│       ├── metrics
│       │   ├── scores.py
│       │   ├── accuracy_score.py
│       │   └── plots.py
│       ├── explore
│       │   ├── Ztest.py
│       │   ├── Ttest.py
│       │   ├── covariance.py
│       │   ├── correlation.py
│       │   ├── chi2.py
│       │   ├── univariate.py
│       │   └── stats.py
│       ├── feature_selection
│       │   └── mahalanobis_features.py
│       └── bet.py
├── README.md
├── examples
│   └── datasets
│       └── iris.csv
├── module.py
└── util
    └── module.py

/artml/metrics/scores.py:
--------------------------------------------------------------------------------

# coding: utf-8

def accuracy(y_true, y_pred):
    # Fraction of positions where the prediction matches the truth, as a percentage.
    y_true = list(y_true)
    y_pred = list(y_pred)
    matches = []
    for i in range(len(y_true)):
        if y_true[i] == y_pred[i]:
            matches.append(1)
    return (sum(matches) / len(y_true)) * 100

--------------------------------------------------------------------------------
/artml/metrics/accuracy_score.py:
--------------------------------------------------------------------------------

# coding: utf-8

def accuracy_score(y_true, y_pred):
    # Fraction of positions where the prediction matches the truth, as a percentage.
    y_true = list(y_true)
    y_pred = list(y_pred)
    matches = []
    for i in range(len(y_true)):
        if y_true[i] == y_pred[i]:
            matches.append(1)
    return (sum(matches) / len(y_true)) * 100

--------------------------------------------------------------------------------
/artml/explore/Ztest.py:
--------------------------------------------------------------------------------

# coding: utf-8

import numpy as np
import pandas as pd
from scipy import stats  # required for stats.norm.cdf below
import warnings
warnings.filterwarnings('ignore')

def Ztest(BET, col1, col2):

    BET.reset_index(drop=True, inplace=True)
    x = BET.to_dict(orient='list')
    keys = list(x.keys())

    # BET cell elements (per the other explore functions): [0] count,
    # [1] sum x, [2] sum x^2, [6] sum y, [10] sum x*y, [11] sum x^2*y.
    # With a binary col2, these give the stats of col1 where col2 == 1:
    count = x[col2][keys.index(col1)][6]
    sumx = x[col2][keys.index(col1)][10]
    sumx2 = x[col2][keys.index(col1)][11]
    Mean = sumx / count
    Variance = (sumx2 - ((sumx ** 2) / count)) / count

    # ... and, by subtraction from the overall sums, where col2 == 0.
    # Note: x[col1][keys.index(col1)][10] is the diagonal cell's sum(x*x),
    # i.e. the overall sum of squares of col1.
    count_0 = x[col2][keys.index(col1)][0] - x[col2][keys.index(col1)][6]
    sumx_0 = x[col2][keys.index(col1)][1] - x[col2][keys.index(col1)][10]
    sumx2_0 = x[col1][keys.index(col1)][10] - x[col2][keys.index(col1)][11]
    Mean_0 = sumx_0 / count_0
    Variance_0 = (sumx2_0 - ((sumx_0 ** 2) / count_0)) / count_0

    # Two-sample z test of the two group means; return the two-sided p-value.
    zscore = (Mean_0 - Mean) / (np.sqrt((Variance_0 / count_0) + (Variance / count)))
    prob = 1 - stats.norm.cdf(zscore)
    return 2 * prob

--------------------------------------------------------------------------------
/artml/explore/Ttest.py:
--------------------------------------------------------------------------------

# coding: utf-8

import numpy as np
import pandas as pd
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

def Ttest(BET, col1, col2):

    BET.reset_index(drop=True, inplace=True)
    x = BET.to_dict(orient='list')
    keys = list(x.keys())

    # Stats of col1 in the group where the binary col2 == 1 (see Ztest.py
    # for the BET cell layout) ...
    count = x[col2][keys.index(col1)][6]
    sumx = x[col2][keys.index(col1)][10]
    sumx2 = x[col2][keys.index(col1)][11]
    Mean = sumx / count
    Variance = (sumx2 - ((sumx ** 2) / count)) / count

    # ... and in the complementary group where col2 == 0.
    count_0 = x[col2][keys.index(col1)][0] - x[col2][keys.index(col1)][6]
    sumx_0 = x[col2][keys.index(col1)][1] - x[col2][keys.index(col1)][10]
    sumx2_0 = x[col1][keys.index(col1)][10] - x[col2][keys.index(col1)][11]
    Mean_0 = sumx_0 / count_0
    Variance_0 = (sumx2_0 - ((sumx_0 ** 2) / count_0)) / count_0

    # Pooled variance, two-sample t statistic, and the two-sided p-value.
    var = (((count_0 - 1) * Variance_0) + ((count - 1) * Variance)) / (count_0 + count - 2)
    tscore = (Mean_0 - Mean) / (np.sqrt(var * ((1 / count_0) + (1 / count))))
    df = (count + count_0 - 2)

    prob = 1 - stats.t.cdf(tscore, df)
    return 2 * prob

--------------------------------------------------------------------------------
/artml/explore/covariance.py:
--------------------------------------------------------------------------------

# coding: utf-8

import math
import numpy as np
import pandas as pd
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

def covariance(BET):

    """
    This function computes the pairwise covariance of all features in BET. Covariance
    describes the linear relationship between two features.

    Examples
    --------
    covariance(Basic_Element_Table)

    The above call generates the pairwise covariance for all the features in the Basic_Element_Table.
27 | 28 | function returns Covariance as Pandas Dataframe. 29 | 30 | """ 31 | 32 | l =(len(BET)) 33 | BET.reset_index(drop = True, inplace = True) 34 | x = BET.to_dict(orient='list') 35 | keys =list(x.keys()) 36 | covar = {} 37 | 38 | for i in range(len(BET)): 39 | covar[i] = [] 40 | for j in range(len(BET)): 41 | m = keys[i] 42 | try: 43 | cov = (x[m][j][10]-(((x[m][j][1])*(x[m][j][6]))/(x[m][j][0])))/(x[m][j][0]) 44 | covar[i].append(cov) 45 | except: 46 | covar[i].append('NaN') 47 | 48 | result = pd.DataFrame(covar, index=keys) 49 | result.columns = keys 50 | return(result) 51 | 52 | 53 | # In[ ]: 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /artml/explore/correlation.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[9]: 5 | 6 | import math 7 | import numpy as np 8 | import pandas as pd 9 | from scipy import stats 10 | import warnings 11 | warnings.filterwarnings('ignore') 12 | 13 | 14 | # In[11]: 15 | 16 | def correlation(BET): 17 | 18 | """ 19 | This function computes pairwise correlations of all features in BET. correlation measures 20 | how strong a relationship is between two variables. 21 | 22 | Examples 23 | -------- 24 | correlation(Basic_Element_Table) 25 | 26 | The above function generates pairwise correlations for all the features in the Basic_Element_Table. 27 | 28 | function returns correlations as Pandas Dataframe. 29 | 30 | """ 31 | 32 | l =(len(BET)) 33 | BET.reset_index(drop = True, inplace = True) 34 | x = BET.to_dict(orient='list') 35 | keys =list(x.keys()) 36 | corr = {} 37 | 38 | for i in range(len(BET)): 39 | corr[i] = [] 40 | for j in range(len(BET)): 41 | m = keys[i] 42 | count1 = x[m][j][0] 43 | count2 = x[m][j][5] 44 | try: 45 | var1 = ((x[m][j][2])-(((x[m][j][1])**2)/count1))/count1 46 | var2 = ((x[m][j][7])-(((x[m][j][6])**2)/count2))/count2 47 | corrl = ((x[m][j][10]-(((x[m][j][1])*(x[m][j][6]))/(x[m][j][0])))/(x[m][j][0]))/(math.sqrt(var1*var2)) 48 | corr[i].append(corrl) 49 | except: 50 | corr[i].append('NaN') 51 | 52 | result = pd.DataFrame(corr, index=keys) 53 | result.columns = keys 54 | return(result) 55 | 56 | 57 | # In[ ]: 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /python/artml/explore/correlation.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[9]: 5 | 6 | import math 7 | import numpy as np 8 | import pandas as pd 9 | from scipy import stats 10 | import warnings 11 | warnings.filterwarnings('ignore') 12 | 13 | 14 | # In[11]: 15 | 16 | def correlation(BET): 17 | 18 | """ 19 | This function computes pairwise correlations of all features in BET. correlation measures 20 | how strong a relationship is between two variables. 21 | 22 | Examples 23 | -------- 24 | correlation(Basic_Element_Table) 25 | 26 | The above function generates pairwise correlations for all the features in the Basic_Element_Table. 27 | 28 | function returns correlations as Pandas Dataframe. 
29 | 30 | """ 31 | 32 | l =(len(BET)) 33 | BET.reset_index(drop = True, inplace = True) 34 | x = BET.to_dict(orient='list') 35 | keys =list(x.keys()) 36 | corr = {} 37 | 38 | for i in range(len(BET)): 39 | corr[i] = [] 40 | for j in range(len(BET)): 41 | m = keys[i] 42 | count1 = x[m][j][0] 43 | count2 = x[m][j][5] 44 | try: 45 | var1 = ((x[m][j][2])-(((x[m][j][1])**2)/count1))/count1 46 | var2 = ((x[m][j][7])-(((x[m][j][6])**2)/count2))/count2 47 | corrl = ((x[m][j][10]-(((x[m][j][1])*(x[m][j][6]))/(x[m][j][0])))/(x[m][j][0]))/(math.sqrt(var1*var2)) 48 | corr[i].append(corrl) 49 | except: 50 | corr[i].append('NaN') 51 | 52 | result = pd.DataFrame(corr, index=keys) 53 | result.columns = keys 54 | return(result) 55 | 56 | 57 | # In[ ]: 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /artml/explore/chi2.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[9]: 5 | 6 | import math 7 | import numpy as np 8 | import pandas as pd 9 | from scipy import stats 10 | from scipy.stats import chisqprob 11 | import warnings 12 | warnings.filterwarnings('ignore') 13 | 14 | 15 | # In[14]: 16 | 17 | def chi2(BET, feature_1 , feature_2): 18 | 19 | l =(len(BET)) 20 | BET.reset_index(drop = True, inplace = True) 21 | x = BET.to_dict(orient='list') 22 | keys =list(x.keys()) 23 | obs_freq = {} 24 | exp_freq = {} 25 | sum_exp_freq_vertical = np.zeros(len(feature_2)) 26 | chi2 = 0 27 | 28 | for i in range(len(feature_1)): 29 | obs_freq[feature_1[i]] = [] 30 | 31 | for j in range(len(feature_2)): 32 | col1 = (feature_1[i]) 33 | col2 = (feature_2[j]) 34 | sumx = x[col1][keys.index(col2)][10] 35 | obs_freq[feature_1[i]].append(sumx) 36 | 37 | sum_exp_freq_vertical = sum_exp_freq_vertical + np.array(obs_freq[feature_1[i]]) 38 | total_in_contingency = sum(sum_exp_freq_vertical) 39 | 40 | for i in range(len(feature_1)): 41 | exp_freq[feature_1[i]] = [] 42 | sum_exp_freq_horizontal = sum(obs_freq[feature_1[i]]) 43 | for j in range(len(feature_2)): 44 | e = (sum_exp_freq_horizontal*sum_exp_freq_vertical[j])/total_in_contingency 45 | exp_freq[feature_1[i]].append(e) 46 | 47 | for i in range(len(feature_1)): 48 | for j in range(len(feature_2)): 49 | chi2 = chi2 + ((obs_freq[feature_1[i]][j] - exp_freq[feature_1[i]][j])**2)/exp_freq[feature_1[i]][j] 50 | 51 | 52 | df = (len(feature_1) - 1)*(len(feature_2)-1) 53 | 54 | print('chi2: ' + str(chi2)) 55 | print('df: ' + str(df)) 56 | print('chisqprob: ' + str(chisqprob(chi2, df))) 57 | return(chisqprob(chi2, df)) 58 | 59 | 60 | # In[ ]: 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /python/artml/explore/chi2.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[9]: 5 | 6 | import math 7 | import numpy as np 8 | import pandas as pd 9 | from scipy import stats 10 | from scipy.stats import chisqprob 11 | import warnings 12 | warnings.filterwarnings('ignore') 13 | 14 | 15 | # In[14]: 16 | 17 | def chi2(BET, feature_1 , feature_2): 18 | 19 | l =(len(BET)) 20 | BET.reset_index(drop = True, inplace = True) 21 | x = BET.to_dict(orient='list') 22 | keys =list(x.keys()) 23 | obs_freq = {} 24 | exp_freq = {} 25 | sum_exp_freq_vertical = np.zeros(len(feature_2)) 26 | chi2 = 0 27 | 28 | for i in range(len(feature_1)): 29 | obs_freq[feature_1[i]] = [] 30 | 31 | for j in range(len(feature_2)): 32 | col1 = (feature_1[i]) 33 | col2 = (feature_2[j]) 34 | 
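            # Element [10] of the BET cell for (col1, col2) is the running sum of
            # col1*col2; for one-hot indicator columns this is exactly the observed
            # co-occurrence frequency used as a contingency-table entry.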
            sumx = x[col1][keys.index(col2)][10]
            obs_freq[feature_1[i]].append(sumx)

        sum_exp_freq_vertical = sum_exp_freq_vertical + np.array(obs_freq[feature_1[i]])
    total_in_contingency = sum(sum_exp_freq_vertical)

    # Expected frequencies under independence: row total * column total / grand total.
    for i in range(len(feature_1)):
        exp_freq[feature_1[i]] = []
        sum_exp_freq_horizontal = sum(obs_freq[feature_1[i]])
        for j in range(len(feature_2)):
            e = (sum_exp_freq_horizontal * sum_exp_freq_vertical[j]) / total_in_contingency
            exp_freq[feature_1[i]].append(e)

    # Pearson's chi-squared statistic accumulated over all cells.
    for i in range(len(feature_1)):
        for j in range(len(feature_2)):
            chi2 = chi2 + ((obs_freq[feature_1[i]][j] - exp_freq[feature_1[i]][j]) ** 2) / exp_freq[feature_1[i]][j]

    df = (len(feature_1) - 1) * (len(feature_2) - 1)

    print('chi2: ' + str(chi2))
    print('df: ' + str(df))
    print('chisqprob: ' + str(chisqprob(chi2, df)))
    return chisqprob(chi2, df)

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
[![made-with-python](https://img.shields.io/badge/Made%20with-Python-1f425f.svg)](https://www.python.org/) [![PyPI license](https://img.shields.io/pypi/l/ansicolortags.svg)](https://github.com/angular/angular.js/blob/master/LICENSE) [![PyPI status](https://img.shields.io/pypi/status/ansicolortags.svg)](https://github.com/AdaptiveMachineLearning) [![Documentation Status](https://readthedocs.org/projects/ansicolortags/badge/?version=latest)](https://adaptivemachinelearning.github.io/) [![GitHub release](https://img.shields.io/github/release/Naereen/StrapDown.js.svg)](https://github.com/AdaptiveMachineLearning/artml/tree/master/python/artml)

# Fork the World of Real Time Learning

ARTML is a high-level machine learning API, written in Python, for building and running linear models. It was developed with a focus on enabling continuous, real-time learning in distributed environments. The current hype is about deep learning, but the future is deep with real learning. Welcome to the world of real learning!

Read the documentation at [adaptivemachinelearning.io](https://adaptivemachinelearning.github.io/)

## Adaptive Real Time Machine Learning (ART-ML)

The term "Real Time" describes how well predictive modeling algorithms can accommodate an ever-increasing data load instantaneously. Such problems clash with the fact that conventional data mining algorithms operate in batch mode, where having all of the relevant data at once is a requirement. Real Time Machine Learning is therefore defined here as having all of the following characteristics, independent of the amount of data involved:

![ARTML6](https://user-images.githubusercontent.com/36970153/55763008-58bb1b80-5a33-11e9-8255-ab4068373eef.JPG)

**Incremental learning (Learn)**: Immediately updating a model with each new observation, without the necessity of pooling new data with old data.

**Decremental learning (Forget)**: Immediately updating a model by excluding observations identified as adversely affecting model performance, without forming a new dataset that omits this data and returning to the model formulation step.

**Variable addition (Grow)**: Adding a new attribute (variable) on the fly, without the necessity of pooling new data with old data. A toy sketch of the Learn and Forget updates follows below.
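As a minimal illustration (this class is not part of the library), Learn and Forget reduce to adding and subtracting running sums. ARTML's Basic Element Table stores exactly this kind of per-feature element (count, Σx, Σx², ...), as the formulas in `artml/explore` show:

```python
class RunningStats:
    """Toy sufficient statistics supporting Learn (add) and Forget (remove)."""

    def __init__(self):
        self.n = 0       # count
        self.sx = 0.0    # running sum of x
        self.sx2 = 0.0   # running sum of x**2

    def learn(self, x):
        # Incremental learning: fold one new observation into the sums.
        self.n += 1
        self.sx += x
        self.sx2 += x * x

    def forget(self, x):
        # Decremental learning: remove a previously learned observation.
        self.n -= 1
        self.sx -= x
        self.sx2 -= x * x

    def mean(self):
        return self.sx / self.n

    def variance(self):
        # Same (sum_x2 - sum_x**2 / n) / n form used throughout artml.explore.
        return (self.sx2 - self.sx ** 2 / self.n) / self.n
```

No refit is ever needed: the mean and variance are always recomputable from the current sums.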
**Variable deletion (Shrink)**: Immediately discontinuing use of an attribute identified as adversely affecting model performance.

**Distributed processing**: Separately processing distributed data or segments of large data (which may be located in diverse geographic locations) and recombining the results to obtain a single model.

**Parallel processing**: Carrying out processing extremely rapidly across multiple conventional processing units (multi-threads, multi-processors or a specialized chip).

Project in PROGRESS...

### The ARTML models section is not open source as of now. It will be published soon!

Have any questions? Shoot me an email and I shall get back to you asap!

**Email Id**: sundeep.pothula@mail.utoronto.ca

Happy Continual Learning!

--------------------------------------------------------------------------------
/artml/explore/univariate.py:
--------------------------------------------------------------------------------

# coding: utf-8

import math
import numpy as np
import pandas as pd
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

def univariate(BET):

    """
    Univariate analysis explores variables (attributes) one by one by summarizing each attribute
    using statistical techniques. This summarizes the central tendency, dispersion and shape of
    a dataset's distribution, excluding NaN values.

    Univariate stats calculated are: ['count', 'Mean', 'Variance', 'Standard_deviation', 'coeff_of_variation', 'skewness', 'Kurtosis']

    Examples
    --------
    univariate(Basic_Element_Table)

    The above call generates univariate statistics for all the features in the Basic_Element_Table.

    The function returns the univariate stats as a Pandas Dataframe.
32 | 33 | """ 34 | 35 | l =(len(BET)) 36 | BET.reset_index(drop = True, inplace = True) 37 | x = BET.to_dict(orient='list') # convert BET to dictionary 38 | keys =list(x.keys()) 39 | describe = {} 40 | 41 | for i in range(l): 42 | describe[i] = [] 43 | m = keys[i] 44 | 45 | try: 46 | count = x[m][i][0] 47 | describe[i].append(count) 48 | except: 49 | describe[i].append('NaN') 50 | try: 51 | Mean = (x[m][i][1])/count 52 | describe[i].append(Mean) 53 | except: 54 | describe[i].append('NaN') 55 | 56 | try: 57 | Variance = ((x[m][i][2])-(((x[m][i][1])**2)/count))/count 58 | describe[i].append(Variance) 59 | except: 60 | describe[i].append('NaN') 61 | try: 62 | Standard_deviation = math.sqrt(Variance) 63 | describe[i].append(Standard_deviation) 64 | except: 65 | describe[i].append('NaN') 66 | try: 67 | coeff_of_variation = (Standard_deviation/Mean)*100 68 | describe[i].append(coeff_of_variation) 69 | except: 70 | describe[i].append('NaN') 71 | 72 | try: 73 | skewness = (count/((count-1)*(count-2)))*((x[m][i][3])-(3*Mean*x[m][i][2])+(3*(Mean**2)*x[m][i][1])-(count*(Mean**3)))/(Standard_deviation**3) 74 | describe[i].append(skewness) 75 | except: 76 | describe[i].append('NaN') 77 | try: 78 | Kurtosis = (((((count)*(count+1))/((count-1)*(count-2)*(count-3)))*((1/Standard_deviation**4)*((x[m][i][4])-(4*Mean*(x[m][i][3]))+(6*(Mean**2)*(x[m][i][2]))-(4*(Mean**3)*(x[m][i][1]))+(count*(Mean**4)))))-((3*(count-1)**2)/((count-2)*(count-3)))) 79 | describe[i].append(Kurtosis) 80 | except: 81 | describe[i].append('NaN') 82 | 83 | names =['count','Mean','Variance','Standard_deviation','coeff_of_variation','skewness','Kurtosis'] 84 | result = pd.DataFrame(describe, index=names) 85 | result.columns = keys 86 | return(result) 87 | 88 | -------------------------------------------------------------------------------- /python/artml/explore/univariate.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | import math 7 | import numpy as np 8 | import pandas as pd 9 | from scipy import stats 10 | import warnings 11 | warnings.filterwarnings('ignore') 12 | 13 | 14 | # In[2]: 15 | 16 | def univariate(BET): 17 | 18 | """ 19 | Univariate analysis explores variables (attributes) one by one by summarizing each attribute 20 | using statistical techniques. This summarizes the central tendency, dispersion and shape of 21 | a dataset’s distribution, excluding NaN values. 22 | 23 | univariate Stats calculated are: ['count','Mean','Variance','Standard_deviation','coeff_of_variation','skewness','Kurtosis'] 24 | 25 | Examples 26 | -------- 27 | univariate(Basic_Element_Table) 28 | 29 | The above function generates Univariate statistics for all the features in the Basic_Element_Table. 30 | 31 | function returns univariate stats as Pandas Dataframe. 
32 | 33 | """ 34 | 35 | l =(len(BET)) 36 | BET.reset_index(drop = True, inplace = True) 37 | x = BET.to_dict(orient='list') # convert BET to dictionary 38 | keys =list(x.keys()) 39 | describe = {} 40 | 41 | for i in range(l): 42 | describe[i] = [] 43 | m = keys[i] 44 | 45 | try: 46 | count = x[m][i][0] 47 | describe[i].append(count) 48 | except: 49 | describe[i].append('NaN') 50 | try: 51 | Mean = (x[m][i][1])/count 52 | describe[i].append(Mean) 53 | except: 54 | describe[i].append('NaN') 55 | 56 | try: 57 | Variance = ((x[m][i][2])-(((x[m][i][1])**2)/count))/count 58 | describe[i].append(Variance) 59 | except: 60 | describe[i].append('NaN') 61 | try: 62 | Standard_deviation = math.sqrt(Variance) 63 | describe[i].append(Standard_deviation) 64 | except: 65 | describe[i].append('NaN') 66 | try: 67 | coeff_of_variation = (Standard_deviation/Mean)*100 68 | describe[i].append(coeff_of_variation) 69 | except: 70 | describe[i].append('NaN') 71 | 72 | try: 73 | skewness = (count/((count-1)*(count-2)))*((x[m][i][3])-(3*Mean*x[m][i][2])+(3*(Mean**2)*x[m][i][1])-(count*(Mean**3)))/(Standard_deviation**3) 74 | describe[i].append(skewness) 75 | except: 76 | describe[i].append('NaN') 77 | try: 78 | Kurtosis = (((((count)*(count+1))/((count-1)*(count-2)*(count-3)))*((1/Standard_deviation**4)*((x[m][i][4])-(4*Mean*(x[m][i][3]))+(6*(Mean**2)*(x[m][i][2]))-(4*(Mean**3)*(x[m][i][1]))+(count*(Mean**4)))))-((3*(count-1)**2)/((count-2)*(count-3)))) 79 | describe[i].append(Kurtosis) 80 | except: 81 | describe[i].append('NaN') 82 | 83 | names =['count','Mean','Variance','Standard_deviation','coeff_of_variation','skewness','Kurtosis'] 84 | result = pd.DataFrame(describe, index=names) 85 | result.columns = keys 86 | return(result) 87 | 88 | -------------------------------------------------------------------------------- /artml/metrics/plots.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | import pandas as pd 7 | import numpy as np 8 | from sklearn import metrics 9 | import matplotlib.pyplot as plt 10 | try: 11 | import scikitplot as skplt 12 | except: 13 | get_ipython().system('pip install scikitplot') 14 | import scikitplot as skplt 15 | from sklearn.metrics import precision_recall_curve 16 | 17 | 18 | # In[2]: 19 | 20 | def roc_curve(y_test, predicted_probas): 21 | skplt.metrics.plot_roc_curve(y_test, predicted_probas) 22 | print(plt.show()) 23 | 24 | 25 | # In[3]: 26 | 27 | def cumulative_gain(y_test, predicted_probas): 28 | skplt.metrics.plot_cumulative_gain(y_test, predicted_probas) 29 | print(plt.show()) 30 | 31 | 32 | # In[4]: 33 | 34 | def precision_recall_vs_threshold(y_test, predicted_probas): 35 | precisions, recalls, thresholds = precision_recall_curve(y_test, predicted_probas) 36 | """ 37 | Modified from: 38 | Hands-On Machine learning with Scikit-Learn 39 | and TensorFlow; p.89 40 | """ 41 | plt.figure(figsize=(8, 8)) 42 | plt.title("Precision and Recall Scores as a function of the decision threshold") 43 | plt.plot(thresholds, precisions[:-1], "b--", label="Precision") 44 | plt.plot(thresholds, recalls[:-1], "g-", label="Recall") 45 | plt.ylabel("Score") 46 | plt.xlabel("Decision Threshold") 47 | plt.legend(loc='best') 48 | 49 | print(plt.show()) 50 | 51 | 52 | # In[5]: 53 | 54 | def adjusted_classes(y_scores, t): 55 | """ 56 | This function adjusts class predictions based on the prediction threshold (t). 57 | Will only work for binary classification problems. 
58 | """ 59 | return [1 if y >= t else 0 for y in y_scores] 60 | 61 | def precision_recall_threshold(y_test, predicted_probas, t=0.5): 62 | """ 63 | plots the precision recall curve and shows the current value for each 64 | by identifying the classifier's threshold (t). 65 | """ 66 | 67 | # generate new class predictions based on the adjusted_classes 68 | # function above and view the resulting confusion matrix. 69 | p, r, thresholds = precision_recall_curve(y_test, predicted_probas) 70 | y_pred_adj = adjusted_classes(predicted_probas, t) 71 | #print(confusion_matrix(y_test, y_pred_adj)) 72 | 73 | # plot the curve 74 | plt.figure(figsize=(8,8)) 75 | plt.title("Precision and Recall curve ^ = current threshold") 76 | plt.step(r, p, color='b', alpha=0.2, 77 | where='post') 78 | plt.fill_between(r, p, step='post', alpha=0.2, 79 | color='b') 80 | plt.ylim([0, 1.01]); 81 | plt.xlim([0, 1.01]); 82 | plt.xlabel('Recall'); 83 | plt.ylabel('Precision'); 84 | 85 | # plot the current threshold on the line 86 | close_default_clf = np.argmin(np.abs(thresholds - t)) 87 | plt.plot(r[close_default_clf], p[close_default_clf], '^', c='k', 88 | markersize=15) 89 | plt.show() 90 | 91 | 92 | # In[6]: 93 | 94 | def confusion_matrix(y_test, y_pred): 95 | skplt.metrics.plot_confusion_matrix(y_test, y_pred, normalize=True) 96 | print(plt.show()) 97 | 98 | 99 | # In[7]: 100 | 101 | def precision_recall(y_test, predicted_probas): 102 | skplt.metrics.plot_precision_recall_curve(y_test, predicted_probas) 103 | print(plt.show()) 104 | 105 | -------------------------------------------------------------------------------- /python/artml/metrics/plots.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | import pandas as pd 7 | import numpy as np 8 | from sklearn import metrics 9 | import matplotlib.pyplot as plt 10 | try: 11 | import scikitplot as skplt 12 | except: 13 | get_ipython().system('pip install scikitplot') 14 | import scikitplot as skplt 15 | from sklearn.metrics import precision_recall_curve 16 | 17 | 18 | # In[2]: 19 | 20 | def roc_curve(y_test, predicted_probas): 21 | skplt.metrics.plot_roc_curve(y_test, predicted_probas) 22 | print(plt.show()) 23 | 24 | 25 | # In[3]: 26 | 27 | def cumulative_gain(y_test, predicted_probas): 28 | skplt.metrics.plot_cumulative_gain(y_test, predicted_probas) 29 | print(plt.show()) 30 | 31 | 32 | # In[4]: 33 | 34 | def precision_recall_vs_threshold(y_test, predicted_probas): 35 | precisions, recalls, thresholds = precision_recall_curve(y_test, predicted_probas) 36 | """ 37 | Modified from: 38 | Hands-On Machine learning with Scikit-Learn 39 | and TensorFlow; p.89 40 | """ 41 | plt.figure(figsize=(8, 8)) 42 | plt.title("Precision and Recall Scores as a function of the decision threshold") 43 | plt.plot(thresholds, precisions[:-1], "b--", label="Precision") 44 | plt.plot(thresholds, recalls[:-1], "g-", label="Recall") 45 | plt.ylabel("Score") 46 | plt.xlabel("Decision Threshold") 47 | plt.legend(loc='best') 48 | 49 | print(plt.show()) 50 | 51 | 52 | # In[5]: 53 | 54 | def adjusted_classes(y_scores, t): 55 | """ 56 | This function adjusts class predictions based on the prediction threshold (t). 57 | Will only work for binary classification problems. 
58 | """ 59 | return [1 if y >= t else 0 for y in y_scores] 60 | 61 | def precision_recall_threshold(y_test, predicted_probas, t=0.5): 62 | """ 63 | plots the precision recall curve and shows the current value for each 64 | by identifying the classifier's threshold (t). 65 | """ 66 | 67 | # generate new class predictions based on the adjusted_classes 68 | # function above and view the resulting confusion matrix. 69 | p, r, thresholds = precision_recall_curve(y_test, predicted_probas) 70 | y_pred_adj = adjusted_classes(predicted_probas, t) 71 | #print(confusion_matrix(y_test, y_pred_adj)) 72 | 73 | # plot the curve 74 | plt.figure(figsize=(8,8)) 75 | plt.title("Precision and Recall curve ^ = current threshold") 76 | plt.step(r, p, color='b', alpha=0.2, 77 | where='post') 78 | plt.fill_between(r, p, step='post', alpha=0.2, 79 | color='b') 80 | plt.ylim([0, 1.01]); 81 | plt.xlim([0, 1.01]); 82 | plt.xlabel('Recall'); 83 | plt.ylabel('Precision'); 84 | 85 | # plot the current threshold on the line 86 | close_default_clf = np.argmin(np.abs(thresholds - t)) 87 | plt.plot(r[close_default_clf], p[close_default_clf], '^', c='k', 88 | markersize=15) 89 | plt.show() 90 | 91 | 92 | # In[6]: 93 | 94 | def confusion_matrix(y_test, y_pred): 95 | skplt.metrics.plot_confusion_matrix(y_test, y_pred, normalize=True) 96 | print(plt.show()) 97 | 98 | 99 | # In[7]: 100 | 101 | def precision_recall(y_test, predicted_probas): 102 | skplt.metrics.plot_precision_recall_curve(y_test, predicted_probas) 103 | print(plt.show()) 104 | 105 | -------------------------------------------------------------------------------- /examples/datasets/iris.csv: -------------------------------------------------------------------------------- 1 | sepal length,sepal width,petal length,petal width,iris 2 | 5.1,3.5,1.4,0.2,Iris-setosa 3 | 4.9,3,1.4,0.2,Iris-setosa 4 | 4.7,3.2,1.3,0.2,Iris-setosa 5 | 4.6,3.1,1.5,0.2,Iris-setosa 6 | 5,3.6,1.4,0.2,Iris-setosa 7 | 5.4,3.9,1.7,0.4,Iris-setosa 8 | 4.6,3.4,1.4,0.3,Iris-setosa 9 | 5,3.4,1.5,0.2,Iris-setosa 10 | 4.4,2.9,1.4,0.2,Iris-setosa 11 | 4.9,3.1,1.5,0.1,Iris-setosa 12 | 5.4,3.7,1.5,0.2,Iris-setosa 13 | 4.8,3.4,1.6,0.2,Iris-setosa 14 | 4.8,3,1.4,0.1,Iris-setosa 15 | 4.3,3,1.1,0.1,Iris-setosa 16 | 5.8,4,1.2,0.2,Iris-setosa 17 | 5.7,4.4,1.5,0.4,Iris-setosa 18 | 5.4,3.9,1.3,0.4,Iris-setosa 19 | 5.1,3.5,1.4,0.3,Iris-setosa 20 | 5.7,3.8,1.7,0.3,Iris-setosa 21 | 5.1,3.8,1.5,0.3,Iris-setosa 22 | 5.4,3.4,1.7,0.2,Iris-setosa 23 | 5.1,3.7,1.5,0.4,Iris-setosa 24 | 4.6,3.6,1,0.2,Iris-setosa 25 | 5.1,3.3,1.7,0.5,Iris-setosa 26 | 4.8,3.4,1.9,0.2,Iris-setosa 27 | 5,3,1.6,0.2,Iris-setosa 28 | 5,3.4,1.6,0.4,Iris-setosa 29 | 5.2,3.5,1.5,0.2,Iris-setosa 30 | 5.2,3.4,1.4,0.2,Iris-setosa 31 | 4.7,3.2,1.6,0.2,Iris-setosa 32 | 4.8,3.1,1.6,0.2,Iris-setosa 33 | 5.4,3.4,1.5,0.4,Iris-setosa 34 | 5.2,4.1,1.5,0.1,Iris-setosa 35 | 5.5,4.2,1.4,0.2,Iris-setosa 36 | 4.9,3.1,1.5,0.1,Iris-setosa 37 | 5,3.2,1.2,0.2,Iris-setosa 38 | 5.5,3.5,1.3,0.2,Iris-setosa 39 | 4.9,3.1,1.5,0.1,Iris-setosa 40 | 4.4,3,1.3,0.2,Iris-setosa 41 | 5.1,3.4,1.5,0.2,Iris-setosa 42 | 5,3.5,1.3,0.3,Iris-setosa 43 | 4.5,2.3,1.3,0.3,Iris-setosa 44 | 4.4,3.2,1.3,0.2,Iris-setosa 45 | 5,3.5,1.6,0.6,Iris-setosa 46 | 5.1,3.8,1.9,0.4,Iris-setosa 47 | 4.8,3,1.4,0.3,Iris-setosa 48 | 5.1,3.8,1.6,0.2,Iris-setosa 49 | 4.6,3.2,1.4,0.2,Iris-setosa 50 | 5.3,3.7,1.5,0.2,Iris-setosa 51 | 5,3.3,1.4,0.2,Iris-setosa 52 | 7,3.2,4.7,1.4,Iris-versicolor 53 | 6.4,3.2,4.5,1.5,Iris-versicolor 54 | 6.9,3.1,4.9,1.5,Iris-versicolor 55 | 5.5,2.3,4,1.3,Iris-versicolor 56 | 
6.5,2.8,4.6,1.5,Iris-versicolor 57 | 5.7,2.8,4.5,1.3,Iris-versicolor 58 | 6.3,3.3,4.7,1.6,Iris-versicolor 59 | 4.9,2.4,3.3,1,Iris-versicolor 60 | 6.6,2.9,4.6,1.3,Iris-versicolor 61 | 5.2,2.7,3.9,1.4,Iris-versicolor 62 | 5,2,3.5,1,Iris-versicolor 63 | 5.9,3,4.2,1.5,Iris-versicolor 64 | 6,2.2,4,1,Iris-versicolor 65 | 6.1,2.9,4.7,1.4,Iris-versicolor 66 | 5.6,2.9,3.6,1.3,Iris-versicolor 67 | 6.7,3.1,4.4,1.4,Iris-versicolor 68 | 5.6,3,4.5,1.5,Iris-versicolor 69 | 5.8,2.7,4.1,1,Iris-versicolor 70 | 6.2,2.2,4.5,1.5,Iris-versicolor 71 | 5.6,2.5,3.9,1.1,Iris-versicolor 72 | 5.9,3.2,4.8,1.8,Iris-versicolor 73 | 6.1,2.8,4,1.3,Iris-versicolor 74 | 6.3,2.5,4.9,1.5,Iris-versicolor 75 | 6.1,2.8,4.7,1.2,Iris-versicolor 76 | 6.4,2.9,4.3,1.3,Iris-versicolor 77 | 6.6,3,4.4,1.4,Iris-versicolor 78 | 6.8,2.8,4.8,1.4,Iris-versicolor 79 | 6.7,3,5,1.7,Iris-versicolor 80 | 6,2.9,4.5,1.5,Iris-versicolor 81 | 5.7,2.6,3.5,1,Iris-versicolor 82 | 5.5,2.4,3.8,1.1,Iris-versicolor 83 | 5.5,2.4,3.7,1,Iris-versicolor 84 | 5.8,2.7,3.9,1.2,Iris-versicolor 85 | 6,2.7,5.1,1.6,Iris-versicolor 86 | 5.4,3,4.5,1.5,Iris-versicolor 87 | 6,3.4,4.5,1.6,Iris-versicolor 88 | 6.7,3.1,4.7,1.5,Iris-versicolor 89 | 6.3,2.3,4.4,1.3,Iris-versicolor 90 | 5.6,3,4.1,1.3,Iris-versicolor 91 | 5.5,2.5,4,1.3,Iris-versicolor 92 | 5.5,2.6,4.4,1.2,Iris-versicolor 93 | 6.1,3,4.6,1.4,Iris-versicolor 94 | 5.8,2.6,4,1.2,Iris-versicolor 95 | 5,2.3,3.3,1,Iris-versicolor 96 | 5.6,2.7,4.2,1.3,Iris-versicolor 97 | 5.7,3,4.2,1.2,Iris-versicolor 98 | 5.7,2.9,4.2,1.3,Iris-versicolor 99 | 6.2,2.9,4.3,1.3,Iris-versicolor 100 | 5.1,2.5,3,1.1,Iris-versicolor 101 | 5.7,2.8,4.1,1.3,Iris-versicolor 102 | 6.3,3.3,6,2.5,Iris-virginica 103 | 5.8,2.7,5.1,1.9,Iris-virginica 104 | 7.1,3,5.9,2.1,Iris-virginica 105 | 6.3,2.9,5.6,1.8,Iris-virginica 106 | 6.5,3,5.8,2.2,Iris-virginica 107 | 7.6,3,6.6,2.1,Iris-virginica 108 | 4.9,2.5,4.5,1.7,Iris-virginica 109 | 7.3,2.9,6.3,1.8,Iris-virginica 110 | 6.7,2.5,5.8,1.8,Iris-virginica 111 | 7.2,3.6,6.1,2.5,Iris-virginica 112 | 6.5,3.2,5.1,2,Iris-virginica 113 | 6.4,2.7,5.3,1.9,Iris-virginica 114 | 6.8,3,5.5,2.1,Iris-virginica 115 | 5.7,2.5,5,2,Iris-virginica 116 | 5.8,2.8,5.1,2.4,Iris-virginica 117 | 6.4,3.2,5.3,2.3,Iris-virginica 118 | 6.5,3,5.5,1.8,Iris-virginica 119 | 7.7,3.8,6.7,2.2,Iris-virginica 120 | 7.7,2.6,6.9,2.3,Iris-virginica 121 | 6,2.2,5,1.5,Iris-virginica 122 | 6.9,3.2,5.7,2.3,Iris-virginica 123 | 5.6,2.8,4.9,2,Iris-virginica 124 | 7.7,2.8,6.7,2,Iris-virginica 125 | 6.3,2.7,4.9,1.8,Iris-virginica 126 | 6.7,3.3,5.7,2.1,Iris-virginica 127 | 7.2,3.2,6,1.8,Iris-virginica 128 | 6.2,2.8,4.8,1.8,Iris-virginica 129 | 6.1,3,4.9,1.8,Iris-virginica 130 | 6.4,2.8,5.6,2.1,Iris-virginica 131 | 7.2,3,5.8,1.6,Iris-virginica 132 | 7.4,2.8,6.1,1.9,Iris-virginica 133 | 7.9,3.8,6.4,2,Iris-virginica 134 | 6.4,2.8,5.6,2.2,Iris-virginica 135 | 6.3,2.8,5.1,1.5,Iris-virginica 136 | 6.1,2.6,5.6,1.4,Iris-virginica 137 | 7.7,3,6.1,2.3,Iris-virginica 138 | 6.3,3.4,5.6,2.4,Iris-virginica 139 | 6.4,3.1,5.5,1.8,Iris-virginica 140 | 6,3,4.8,1.8,Iris-virginica 141 | 6.9,3.1,5.4,2.1,Iris-virginica 142 | 6.7,3.1,5.6,2.4,Iris-virginica 143 | 6.9,3.1,5.1,2.3,Iris-virginica 144 | 5.8,2.7,5.1,1.9,Iris-virginica 145 | 6.8,3.2,5.9,2.3,Iris-virginica 146 | 6.7,3.3,5.7,2.5,Iris-virginica 147 | 6.7,3,5.2,2.3,Iris-virginica 148 | 6.3,2.5,5,1.9,Iris-virginica 149 | 6.5,3,5.2,2,Iris-virginica 150 | 6.2,3.4,5.4,2.3,Iris-virginica 151 | 5.9,3,5.1,1.8,Iris-virginica 152 | -------------------------------------------------------------------------------- 
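The bundled `examples/datasets/iris.csv` can be used to exercise the explore functions. The sketch below is illustrative only (the real BET constructor lives in `artml/bet.py`, which this dump does not show); it computes the kind of per-feature running sums a Basic Element Table is built from:

```python
import pandas as pd

# Load the bundled example dataset.
df = pd.read_csv('examples/datasets/iris.csv')
num = df.drop(columns=['iris'])  # keep the four numeric measurements

# Per-feature running elements of the kind a Basic Element Table carries:
bet_like = pd.DataFrame({
    'count': num.count(),
    'sum_x': num.sum(),
    'sum_x2': (num ** 2).sum(),
})

# Same population mean/variance formulas used in artml/explore/univariate.py:
bet_like['mean'] = bet_like['sum_x'] / bet_like['count']
bet_like['variance'] = (bet_like['sum_x2'] - bet_like['sum_x'] ** 2 / bet_like['count']) / bet_like['count']
print(bet_like)
```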
/artml/feature_selection/mahalanobis_features.py:
--------------------------------------------------------------------------------

# Importing all the required libraries
import os
import math
from numpy import *
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

'''
mahalanobis_selection.forward_selection is a feature selection function that selects the features
contributing most to classifier performance. It is a real-time forward selection technique that
begins with no variables in the LDA model. For each candidate variable, the forward method
calculates a Δ² (Mahalanobis distance) statistic that reflects the variable's contribution to
the model if it is included.

Parameters
----------

BET_file: Input BET table. (Make sure that the index of the BET is the same as its column names.)

target: Target variable of the classification.

alpha: Hyperparameter of the feature selection technique; it dictates how many features are
returned. Default value is 1.01.
'''

class mahalanobis_selection():

    def find_best_feature(self, BET_best, BET_file, master_keys, target, benchmark, alpha):

        best_feature = []
        for col in BET_file.columns:
            columns = []
            BET_target = BET_file[[target]]
            BET_col = BET_file[[col]]
            columns = list(BET_best.columns)
            columns.append(col)
            columns.append(target)
            # Selecting the BET for the particular columns & target
            result = pd.concat([BET_best, BET_col, BET_target], axis=1)
            selected_rows = columns
            result = result.loc[selected_rows]
            result.index = list(result.columns)

            try:
                Delta = self.mahalanobis(result, target)
            except:
                Delta = 0
            # Accept the candidate only if it improves Δ² over the current
            # benchmark by at least a factor of alpha.
            if Delta / benchmark > alpha:
                best_feature = col
                benchmark = Delta

        return best_feature

    def mahalanobis(self, result, target):

        (mean1, mean2, Beta) = self.LDA_fit_transform(result, target)

        z = np.array(mean1) - np.array(mean2)
        Delta = np.matmul(Beta.T, z)
        return Delta

    def LDA_fit_transform(self, BET, target):

        l = (len(BET.columns))
        count_1 = (BET.loc[(target), target][0]) - (BET.loc[(target), target][1])
        count_2 = BET.loc[(target), target][1]

        mean1 = []
        mean2 = []
        c = []

        # Per-class means of every feature, derived from the BET sums.
        for i in range(len(BET.columns)):
            if BET.columns[i] != target:
                mean1.append((BET.loc[BET.columns[i], (target)][1] - BET.loc[BET.columns[i], (target)][10]) / (BET.loc[BET.columns[i], (target)][0] - BET.loc[BET.columns[i], (target)][6]))
                mean2.append((BET.loc[BET.columns[i], (target)][10]) / BET.loc[BET.columns[i], (target)][6])

        # Pooled within-class covariance matrix, again built from the BET sums.
        for i in range(len(BET.columns)):
            if BET.columns[i] != target:
                for j in range(len(BET.columns)):
                    if BET.columns[j] != target:
                        cal1 = (((BET.loc[BET.columns[i], (target)][1] - BET.loc[BET.columns[i], (target)][10]) * (BET.loc[BET.columns[j], (target)][1] - BET.loc[BET.columns[j], (target)][10])) / count_1)
                        cal2 = (BET.loc[BET.columns[i], (target)][10] * BET.loc[BET.columns[j], (target)][10]) / count_2
                        c.append((BET.loc[BET.columns[i], (BET.columns[j])][10] - cal1 - cal2) / (count_1 + count_2 - 2))
        c = np.array(c)
        n = (len(BET.columns) - 1)
        c = np.reshape(c, (n, n))

        try:
            inverse = np.linalg.inv(c)
        except:
            print('Handling zero-determinant exception with dummies!')
            dummies_ = np.random.random((l - 1, l - 1)) / 10000000
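            # c can be singular (zero determinant), e.g. when two selected
            # features are collinear; a tiny random jitter makes it invertible
            # so the discriminant direction can still be estimated.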
100 | inverse = np.linalg.inv(c + dummies_) 101 | 102 | z = np.array(mean1)-np.array(mean2) 103 | Beta = np.matmul(inverse, z.T) 104 | return (mean1,mean2,Beta) 105 | 106 | 107 | 108 | def forward_selection(self, BET_file, target, alpha=1.01): 109 | BET_best = pd.DataFrame() 110 | best_features = [] 111 | already_selected = [] 112 | benchmark = 0.0001 113 | master_keys = BET_file.columns 114 | for i in range(len(BET_file.columns)): 115 | best_feature = self.find_best_feature(BET_best,BET_file,master_keys,target,benchmark,alpha) 116 | if best_feature != []: 117 | best_features.append(best_feature) 118 | if best_feature == []: 119 | break 120 | BET_best = pd.concat([BET_best, BET_file[[best_feature]]], axis=1) 121 | BET_for_new_benchmark = pd.concat([BET_best, BET_file[[target]]], axis=1) 122 | 123 | 124 | selected_rows = list(BET_for_new_benchmark.columns) 125 | BET_for_new_benchmark= BET_for_new_benchmark.loc[selected_rows] 126 | BET_for_new_benchmark.index = list(BET_for_new_benchmark.columns) 127 | 128 | benchmark = self.mahalanobis(BET_for_new_benchmark,target) 129 | already_selected = [best_feature] 130 | 131 | BET_file = BET_file.drop(already_selected, axis=1) 132 | return best_features 133 | -------------------------------------------------------------------------------- /python/artml/feature_selection/mahalanobis_features.py: -------------------------------------------------------------------------------- 1 | 2 | # Importing all the required libraries 3 | import os 4 | import math 5 | from numpy import * 6 | import numpy as np 7 | import pandas as pd 8 | import warnings 9 | warnings.filterwarnings('ignore') 10 | 11 | ''' 12 | 13 | mahalanobis_selection.forward_selection is a feature selection function to select the best features that contributes to the 14 | classifier performance. This is a real time forward selection technique which begins with no variables in the LDA model. 15 | For each variable, the forward method calculates Δ2 (mahalanobis distance) statistics that reflect the variable's 16 | contribution to the model if it is included. 17 | 18 | Parameters 19 | ---------- 20 | 21 | BET_file: Input BET table. (Make sure that the index of BET is same as column names) 22 | 23 | Target: Target variable of the classification 24 | 25 | alpha: It is the hyperparameter for the feature selection technique. This dictates the output number of features. 
Default 26 | value is 1.01 27 | 28 | 29 | ''' 30 | class mahalanobis_selection(): 31 | 32 | def find_best_feature(self, BET_best,BET_file,master_keys,target,benchmark,alpha): 33 | 34 | best_feature = [] 35 | for col in BET_file.columns: 36 | columns = [] 37 | BET_target = BET_file[[target]] 38 | BET_col = BET_file[[col]] 39 | columns = list(BET_best.columns) 40 | columns.append(col) 41 | columns.append(target) 42 | #Selecting the BET for particular columns & Target 43 | result = pd.concat([BET_best, BET_col, BET_target], axis=1) 44 | selected_rows = columns 45 | result = result.loc[selected_rows] 46 | result.index = list(result.columns) 47 | 48 | try: 49 | Delta = self.mahalanobis(result,target) 50 | except: 51 | Delta = 0 52 | if Delta/benchmark > alpha: 53 | best_feature = col 54 | benchmark = Delta 55 | 56 | return best_feature 57 | 58 | 59 | def mahalanobis(self, result, target): 60 | 61 | (mean1,mean2,Beta) = self.LDA_fit_transform(result, target) 62 | 63 | z = np.array(mean1)-np.array(mean2) 64 | Delta = np.matmul(Beta.T, z) 65 | return Delta 66 | 67 | 68 | def LDA_fit_transform(self, BET, target): 69 | 70 | l =(len(BET.columns)) 71 | count_1 = (BET.loc[(target), target][0]) - (BET.loc[(target), target][1]) 72 | count_2 = BET.loc[(target), target][1] 73 | 74 | mean1 = [] 75 | mean2 = [] 76 | c = [] 77 | 78 | for i in range(len(BET.columns)): 79 | if BET.columns[i] != target: 80 | 81 | mean1.append((BET.loc[BET.columns[i], (target)][1] - BET.loc[BET.columns[i], (target)][10])/(BET.loc[BET.columns[i], (target)][0]-BET.loc[BET.columns[i], (target)][6])) 82 | mean2.append((BET.loc[BET.columns[i], (target)][10])/BET.loc[BET.columns[i], (target)][6]) 83 | 84 | for i in range(len(BET.columns)): 85 | if BET.columns[i] != target: 86 | for j in range(len(BET.columns)): 87 | if BET.columns[j] != target: 88 | cal1 = (((BET.loc[BET.columns[i], (target)][1] - BET.loc[BET.columns[i], (target)][10])*(BET.loc[BET.columns[j], (target)][1]- BET.loc[BET.columns[j], (target)][10]))/count_1) 89 | cal2 = (BET.loc[BET.columns[i], (target)][10]*BET.loc[BET.columns[j], (target)][10])/count_2 90 | c.append((BET.loc[BET.columns[i],(BET.columns[j])][10] -cal1 - cal2)/(count_1+count_2-2)) 91 | c = np.array(c) 92 | n = (len(BET.columns)-1) 93 | c = np.reshape(c,(n,n)) 94 | 95 | try: 96 | inverse = np.linalg.inv(c) 97 | except: 98 | print('Handling zero determinent Exception with dummies!') 99 | dummies_ = np.random.random((l-1,l-1))/10000000 100 | inverse = np.linalg.inv(c + dummies_) 101 | 102 | z = np.array(mean1)-np.array(mean2) 103 | Beta = np.matmul(inverse, z.T) 104 | return (mean1,mean2,Beta) 105 | 106 | 107 | 108 | def forward_selection(self, BET_file, target, alpha=1.01): 109 | BET_best = pd.DataFrame() 110 | best_features = [] 111 | already_selected = [] 112 | benchmark = 0.0001 113 | master_keys = BET_file.columns 114 | for i in range(len(BET_file.columns)): 115 | best_feature = self.find_best_feature(BET_best,BET_file,master_keys,target,benchmark,alpha) 116 | if best_feature != []: 117 | best_features.append(best_feature) 118 | if best_feature == []: 119 | break 120 | BET_best = pd.concat([BET_best, BET_file[[best_feature]]], axis=1) 121 | BET_for_new_benchmark = pd.concat([BET_best, BET_file[[target]]], axis=1) 122 | 123 | 124 | selected_rows = list(BET_for_new_benchmark.columns) 125 | BET_for_new_benchmark= BET_for_new_benchmark.loc[selected_rows] 126 | BET_for_new_benchmark.index = list(BET_for_new_benchmark.columns) 127 | 128 | benchmark = self.mahalanobis(BET_for_new_benchmark,target) 129 | 
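            # The Δ² of the accepted feature set becomes the new benchmark that
            # the next candidate must beat by a factor of alpha.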
            already_selected = [best_feature]

            BET_file = BET_file.drop(already_selected, axis=1)
        return best_features

--------------------------------------------------------------------------------
/artml/explore/stats.py:
--------------------------------------------------------------------------------

# coding: utf-8

import math
import numpy as np
import pandas as pd
from scipy import stats
# scipy.stats.chisqprob has been removed from recent SciPy releases;
# recreate it from the chi-squared survival function so chi2() below runs.
chisqprob = lambda chi2, df: stats.chi2.sf(chi2, df)
import warnings
warnings.filterwarnings('ignore')

def univariate(BET):

    """
    Univariate analysis explores variables (attributes) one by one by summarizing each attribute
    using statistical techniques. This summarizes the central tendency, dispersion and shape of
    a dataset's distribution, excluding NaN values.

    Univariate stats calculated are: ['count', 'Mean', 'Variance', 'Standard_deviation', 'coeff_of_variation', 'skewness', 'Kurtosis']

    Examples
    --------
    univariate(Basic_Element_Table)

    The above call generates univariate statistics for all the features in the Basic_Element_Table.

    The function returns the univariate stats as a Pandas Dataframe.
    """

    l = (len(BET))
    BET.reset_index(drop=True, inplace=True)
    x = BET.to_dict(orient='list')  # convert BET to dictionary
    keys = list(x.keys())
    describe = {}

    for i in range(l):
        describe[i] = []
        m = keys[i]

        # BET diagonal cell elements: [0] count, [1] sum x, [2] sum x^2,
        # [3] sum x^3, [4] sum x^4 -- enough for moments up to kurtosis.
        try:
            count = x[m][i][0]
            describe[i].append(count)
        except:
            describe[i].append('NaN')
        try:
            Mean = (x[m][i][1]) / count
            describe[i].append(Mean)
        except:
            describe[i].append('NaN')
        try:
            Variance = ((x[m][i][2]) - (((x[m][i][1]) ** 2) / count)) / count
            describe[i].append(Variance)
        except:
            describe[i].append('NaN')
        try:
            Standard_deviation = math.sqrt(Variance)
            describe[i].append(Standard_deviation)
        except:
            describe[i].append('NaN')
        try:
            coeff_of_variation = (Standard_deviation / Mean) * 100
            describe[i].append(coeff_of_variation)
        except:
            describe[i].append('NaN')
        try:
            skewness = (count / ((count - 1) * (count - 2))) * ((x[m][i][3]) - (3 * Mean * x[m][i][2]) + (3 * (Mean ** 2) * x[m][i][1]) - (count * (Mean ** 3))) / (Standard_deviation ** 3)
            describe[i].append(skewness)
        except:
            describe[i].append('NaN')
        try:
            Kurtosis = (((((count) * (count + 1)) / ((count - 1) * (count - 2) * (count - 3))) * ((1 / Standard_deviation ** 4) * ((x[m][i][4]) - (4 * Mean * (x[m][i][3])) + (6 * (Mean ** 2) * (x[m][i][2])) - (4 * (Mean ** 3) * (x[m][i][1])) + (count * (Mean ** 4))))) - ((3 * (count - 1) ** 2) / ((count - 2) * (count - 3))))
            describe[i].append(Kurtosis)
        except:
            describe[i].append('NaN')

    names = ['count', 'Mean', 'Variance', 'Standard_deviation', 'coeff_of_variation', 'skewness', 'Kurtosis']
    result = pd.DataFrame(describe, index=names)
    result.columns = keys
    return (result)

def covariance(BET):

    """
    This function computes the pairwise covariance of all features in BET. Covariance
    describes the linear relationship between two features.

    Examples
    --------
    covariance(Basic_Element_Table)

    The above call generates the pairwise covariance for all the features in the Basic_Element_Table.

    The function returns the covariance matrix as a Pandas Dataframe.
105 | 106 | """ 107 | 108 | l =(len(BET)) 109 | BET.reset_index(drop = True, inplace = True) 110 | x = BET.to_dict(orient='list') 111 | keys =list(x.keys()) 112 | covar = {} 113 | 114 | for i in range(len(BET)): 115 | covar[i] = [] 116 | for j in range(len(BET)): 117 | m = keys[i] 118 | try: 119 | cov = (x[m][j][10]-(((x[m][j][1])*(x[m][j][6]))/(x[m][j][0])))/(x[m][j][0]) 120 | covar[i].append(cov) 121 | except: 122 | covar[i].append('NaN') 123 | 124 | result = pd.DataFrame(covar, index=keys) 125 | result.columns = keys 126 | return(result) 127 | 128 | 129 | # In[17]: 130 | 131 | def correlation(BET): 132 | 133 | """ 134 | This function computes pairwise correlations of all features in BET. correlation measures 135 | how strong a relationship is between two variables. 136 | 137 | Examples 138 | -------- 139 | correlation(Basic_Element_Table) 140 | 141 | The above function generates pairwise correlations for all the features in the Basic_Element_Table. 142 | 143 | function returns correlations as Pandas Dataframe. 144 | 145 | """ 146 | 147 | l =(len(BET)) 148 | BET.reset_index(drop = True, inplace = True) 149 | x = BET.to_dict(orient='list') 150 | keys =list(x.keys()) 151 | corr = {} 152 | 153 | for i in range(len(BET)): 154 | corr[i] = [] 155 | for j in range(len(BET)): 156 | m = keys[i] 157 | count1 = x[m][j][0] 158 | count2 = x[m][j][5] 159 | try: 160 | var1 = ((x[m][j][2])-(((x[m][j][1])**2)/count1))/count1 161 | var2 = ((x[m][j][7])-(((x[m][j][6])**2)/count2))/count2 162 | corrl = ((x[m][j][10]-(((x[m][j][1])*(x[m][j][6]))/(x[m][j][0])))/(x[m][j][0]))/(math.sqrt(var1*var2)) 163 | corr[i].append(corrl) 164 | except: 165 | corr[i].append('NaN') 166 | 167 | result = pd.DataFrame(corr, index=keys) 168 | result.columns = keys 169 | return(result) 170 | 171 | 172 | # In[18]: 173 | 174 | def Ztest(BET, col1, col2): 175 | 176 | l =(len(BET)) 177 | BET.reset_index(drop = True, inplace = True) 178 | x = BET.to_dict(orient='list') 179 | keys =list(x.keys()) 180 | 181 | count = x[col2][keys.index(col1)][6] 182 | sumx = x[col2][keys.index(col1)][10] 183 | sumx2 = x[col2][keys.index(col1)][11] 184 | Mean = sumx/count 185 | Variance = (sumx2 - (((sumx)**2)/count))/count 186 | 187 | count_0 = x[col2][keys.index(col1)][0] - x[col2][keys.index(col1)][6] 188 | sumx_0 = x[col2][keys.index(col1)][1] - x[col2][keys.index(col1)][10] 189 | sumx2_0 = x[col1][keys.index(col1)][10] -x[col2][keys.index(col1)][11] 190 | Mean_0 = sumx_0/count_0 191 | Variance_0 = (sumx2_0 - (((sumx_0)**2)/count_0))/count_0 192 | 193 | zscore = (Mean_0 - Mean)/(np.sqrt((Variance_0/count_0)+(Variance/count))) 194 | prob = 1 - stats.norm.cdf(zscore) 195 | return 2*prob 196 | 197 | 198 | 199 | # In[19]: 200 | 201 | def Ttest(BET, col1, col2): 202 | 203 | l =(len(BET)) 204 | BET.reset_index(drop = True, inplace = True) 205 | x = BET.to_dict(orient='list') 206 | keys =list(x.keys()) 207 | 208 | count = x[col2][keys.index(col1)][6] 209 | sumx = x[col2][keys.index(col1)][10] 210 | sumx2 = x[col2][keys.index(col1)][11] 211 | Mean = sumx/count 212 | Variance = (sumx2 - (((sumx)**2)/count))/count 213 | 214 | count_0 = x[col2][keys.index(col1)][0] - x[col2][keys.index(col1)][6] 215 | sumx_0 = x[col2][keys.index(col1)][1] - x[col2][keys.index(col1)][10] 216 | sumx2_0 = x[col1][keys.index(col1)][10] -x[col2][keys.index(col1)][11] 217 | Mean_0 = sumx_0/count_0 218 | Variance_0 = (sumx2_0 - (((sumx_0)**2)/count_0))/count_0 219 | 220 | var = (((count_0-1)*Variance_0) + ((count-1)*Variance))/(count_0 + count - 2) 221 | 222 | tscore = (Mean_0 - 
Mean)/(np.sqrt(var*((1/count_0)+(1/count)))) 223 | 224 | df = (count + count_0 - 2) 225 | 226 | prob = (1-stats.t.cdf(tscore, df)) 227 | return 2*prob 228 | 229 | 230 | 231 | # In[14]: 232 | 233 | def chi2(BET, feature_1 , feature_2): 234 | 235 | l =(len(BET)) 236 | BET.reset_index(drop = True, inplace = True) 237 | x = BET.to_dict(orient='list') 238 | keys =list(x.keys()) 239 | obs_freq = {} 240 | exp_freq = {} 241 | sum_exp_freq_vertical = np.zeros(len(feature_2)) 242 | chi2 = 0 243 | 244 | for i in range(len(feature_1)): 245 | obs_freq[feature_1[i]] = [] 246 | 247 | for j in range(len(feature_2)): 248 | col1 = (feature_1[i]) 249 | col2 = (feature_2[j]) 250 | sumx = x[col1][keys.index(col2)][10] 251 | obs_freq[feature_1[i]].append(sumx) 252 | 253 | sum_exp_freq_vertical = sum_exp_freq_vertical + np.array(obs_freq[feature_1[i]]) 254 | total_in_contingency = sum(sum_exp_freq_vertical) 255 | 256 | for i in range(len(feature_1)): 257 | exp_freq[feature_1[i]] = [] 258 | sum_exp_freq_horizontal = sum(obs_freq[feature_1[i]]) 259 | for j in range(len(feature_2)): 260 | e = (sum_exp_freq_horizontal*sum_exp_freq_vertical[j])/total_in_contingency 261 | exp_freq[feature_1[i]].append(e) 262 | 263 | for i in range(len(feature_1)): 264 | for j in range(len(feature_2)): 265 | chi2 = chi2 + ((obs_freq[feature_1[i]][j] - exp_freq[feature_1[i]][j])**2)/exp_freq[feature_1[i]][j] 266 | 267 | 268 | df = (len(feature_1) - 1)*(len(feature_2)-1) 269 | 270 | print('chi2: ' + str(chi2)) 271 | print('df: ' + str(df)) 272 | print('chisqprob: ' + str(chisqprob(chi2, df))) 273 | return(chisqprob(chi2, df)) 274 | 275 | 276 | # In[ ]: 277 | 278 | 279 | 280 | -------------------------------------------------------------------------------- /python/artml/explore/stats.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[9]: 5 | 6 | import math 7 | import numpy as np 8 | import pandas as pd 9 | from scipy import stats 10 | # from scipy.stats import chisqprob 11 | import warnings 12 | warnings.filterwarnings('ignore') 13 | 14 | 15 | # In[15]: 16 | 17 | def univariate(BET): 18 | 19 | """ 20 | Univariate analysis explores variables (attributes) one by one by summarizing each attribute 21 | using statistical techniques. This summarizes the central tendency, dispersion and shape of 22 | a dataset’s distribution, excluding NaN values. 23 | 24 | univariate Stats calculated are: ['count','Mean','Variance','Standard_deviation','coeff_of_variation','skewness','Kurtosis'] 25 | 26 | Examples 27 | -------- 28 | univariate(Basic_Element_Table) 29 | 30 | The above function generates Univariate statistics for all the features in the Basic_Element_Table. 31 | 32 | function returns univariate stats as Pandas Dataframe. 
33 | 34 | """ 35 | 36 | l =(len(BET)) 37 | BET.reset_index(drop = True, inplace = True) 38 | x = BET.to_dict(orient='list') # convert BET to dictionary 39 | keys =list(x.keys()) 40 | describe = {} 41 | 42 | for i in range(l): 43 | describe[i] = [] 44 | m = keys[i] 45 | 46 | try: 47 | count = x[m][i][0] 48 | describe[i].append(count) 49 | except: 50 | describe[i].append('NaN') 51 | try: 52 | Mean = (x[m][i][1])/count 53 | describe[i].append(Mean) 54 | except: 55 | describe[i].append('NaN') 56 | 57 | try: 58 | Variance = ((x[m][i][2])-(((x[m][i][1])**2)/count))/count 59 | describe[i].append(Variance) 60 | except: 61 | describe[i].append('NaN') 62 | try: 63 | Standard_deviation = math.sqrt(Variance) 64 | describe[i].append(Standard_deviation) 65 | except: 66 | describe[i].append('NaN') 67 | try: 68 | coeff_of_variation = (Standard_deviation/Mean)*100 69 | describe[i].append(coeff_of_variation) 70 | except: 71 | describe[i].append('NaN') 72 | 73 | try: 74 | skewness = (count/((count-1)*(count-2)))*((x[m][i][3])-(3*Mean*x[m][i][2])+(3*(Mean**2)*x[m][i][1])-(count*(Mean**3)))/(Standard_deviation**3) 75 | describe[i].append(skewness) 76 | except: 77 | describe[i].append('NaN') 78 | try: 79 | Kurtosis = (((((count)*(count+1))/((count-1)*(count-2)*(count-3)))*((1/Standard_deviation**4)*((x[m][i][4])-(4*Mean*(x[m][i][3]))+(6*(Mean**2)*(x[m][i][2]))-(4*(Mean**3)*(x[m][i][1]))+(count*(Mean**4)))))-((3*(count-1)**2)/((count-2)*(count-3)))) 80 | describe[i].append(Kurtosis) 81 | except: 82 | describe[i].append('NaN') 83 | 84 | names =['count','Mean','Variance','Standard_deviation','coeff_of_variation','skewness','Kurtosis'] 85 | result = pd.DataFrame(describe, index=names) 86 | result.columns = keys 87 | return(result) 88 | 89 | 90 | # In[16]: 91 | 92 | def covariance(BET): 93 | 94 | """ 95 | This function computes pairwise covariance of all features in BET. Covariance describes 96 | the linear relationship between two features. 97 | 98 | Examples 99 | -------- 100 | Covariance(Basic_Element_Table) 101 | 102 | The above function generates pairwise Covariance for all the features in the Basic_Element_Table. 103 | 104 | function returns Covariance as Pandas Dataframe. 105 | 106 | """ 107 | 108 | l =(len(BET)) 109 | BET.reset_index(drop = True, inplace = True) 110 | x = BET.to_dict(orient='list') 111 | keys =list(x.keys()) 112 | covar = {} 113 | 114 | for i in range(len(BET)): 115 | covar[i] = [] 116 | for j in range(len(BET)): 117 | m = keys[i] 118 | try: 119 | cov = (x[m][j][10]-(((x[m][j][1])*(x[m][j][6]))/(x[m][j][0])))/(x[m][j][0]) 120 | covar[i].append(cov) 121 | except: 122 | covar[i].append('NaN') 123 | 124 | result = pd.DataFrame(covar, index=keys) 125 | result.columns = keys 126 | return(result) 127 | 128 | 129 | # In[17]: 130 | 131 | def correlation(BET): 132 | 133 | """ 134 | This function computes pairwise correlations of all features in BET. correlation measures 135 | how strong a relationship is between two variables. 136 | 137 | Examples 138 | -------- 139 | correlation(Basic_Element_Table) 140 | 141 | The above function generates pairwise correlations for all the features in the Basic_Element_Table. 142 | 143 | function returns correlations as Pandas Dataframe. 
144 | 145 | """ 146 | 147 | l =(len(BET)) 148 | BET.reset_index(drop = True, inplace = True) 149 | x = BET.to_dict(orient='list') 150 | keys =list(x.keys()) 151 | corr = {} 152 | 153 | for i in range(len(BET)): 154 | corr[i] = [] 155 | for j in range(len(BET)): 156 | m = keys[i] 157 | count1 = x[m][j][0] 158 | count2 = x[m][j][5] 159 | try: 160 | var1 = ((x[m][j][2])-(((x[m][j][1])**2)/count1))/count1 161 | var2 = ((x[m][j][7])-(((x[m][j][6])**2)/count2))/count2 162 | corrl = ((x[m][j][10]-(((x[m][j][1])*(x[m][j][6]))/(x[m][j][0])))/(x[m][j][0]))/(math.sqrt(var1*var2)) 163 | corr[i].append(corrl) 164 | except: 165 | corr[i].append('NaN') 166 | 167 | result = pd.DataFrame(corr, index=keys) 168 | result.columns = keys 169 | return(result) 170 | 171 | 172 | # In[18]: 173 | 174 | def Ztest(BET, col1, col2): 175 | 176 | l =(len(BET)) 177 | BET.reset_index(drop = True, inplace = True) 178 | x = BET.to_dict(orient='list') 179 | keys =list(x.keys()) 180 | 181 | count = x[col2][keys.index(col1)][6] 182 | sumx = x[col2][keys.index(col1)][10] 183 | sumx2 = x[col2][keys.index(col1)][11] 184 | Mean = sumx/count 185 | Variance = (sumx2 - (((sumx)**2)/count))/count 186 | 187 | count_0 = x[col2][keys.index(col1)][0] - x[col2][keys.index(col1)][6] 188 | sumx_0 = x[col2][keys.index(col1)][1] - x[col2][keys.index(col1)][10] 189 | sumx2_0 = x[col1][keys.index(col1)][10] -x[col2][keys.index(col1)][11] 190 | Mean_0 = sumx_0/count_0 191 | Variance_0 = (sumx2_0 - (((sumx_0)**2)/count_0))/count_0 192 | 193 | zscore = (Mean_0 - Mean)/(np.sqrt((Variance_0/count_0)+(Variance/count))) 194 | prob = 1 - stats.norm.cdf(zscore) 195 | return 2*prob 196 | 197 | 198 | 199 | # In[19]: 200 | 201 | def Ttest(BET, col1, col2): 202 | 203 | l =(len(BET)) 204 | BET.reset_index(drop = True, inplace = True) 205 | x = BET.to_dict(orient='list') 206 | keys =list(x.keys()) 207 | 208 | count = x[col2][keys.index(col1)][6] 209 | sumx = x[col2][keys.index(col1)][10] 210 | sumx2 = x[col2][keys.index(col1)][11] 211 | Mean = sumx/count 212 | Variance = (sumx2 - (((sumx)**2)/count))/count 213 | 214 | count_0 = x[col2][keys.index(col1)][0] - x[col2][keys.index(col1)][6] 215 | sumx_0 = x[col2][keys.index(col1)][1] - x[col2][keys.index(col1)][10] 216 | sumx2_0 = x[col1][keys.index(col1)][10] -x[col2][keys.index(col1)][11] 217 | Mean_0 = sumx_0/count_0 218 | Variance_0 = (sumx2_0 - (((sumx_0)**2)/count_0))/count_0 219 | 220 | var = (((count_0-1)*Variance_0) + ((count-1)*Variance))/(count_0 + count - 2) 221 | 222 | tscore = (Mean_0 - Mean)/(np.sqrt(var*((1/count_0)+(1/count)))) 223 | 224 | df = (count + count_0 - 2) 225 | 226 | prob = (1-stats.t.cdf(tscore, df)) 227 | return 2*prob 228 | 229 | 230 | 231 | # In[14]: 232 | 233 | def chi2(BET, feature_1 , feature_2): 234 | 235 | l =(len(BET)) 236 | BET.reset_index(drop = True, inplace = True) 237 | x = BET.to_dict(orient='list') 238 | keys =list(x.keys()) 239 | obs_freq = {} 240 | exp_freq = {} 241 | sum_exp_freq_vertical = np.zeros(len(feature_2)) 242 | chi2 = 0 243 | 244 | for i in range(len(feature_1)): 245 | obs_freq[feature_1[i]] = [] 246 | 247 | for j in range(len(feature_2)): 248 | col1 = (feature_1[i]) 249 | col2 = (feature_2[j]) 250 | sumx = x[col1][keys.index(col2)][10] 251 | obs_freq[feature_1[i]].append(sumx) 252 | 253 | sum_exp_freq_vertical = sum_exp_freq_vertical + np.array(obs_freq[feature_1[i]]) 254 | total_in_contingency = sum(sum_exp_freq_vertical) 255 | 256 | for i in range(len(feature_1)): 257 | exp_freq[feature_1[i]] = [] 258 | sum_exp_freq_horizontal = 
sum(obs_freq[feature_1[i]]) 259 | for j in range(len(feature_2)): 260 | e = (sum_exp_freq_horizontal*sum_exp_freq_vertical[j])/total_in_contingency 261 | exp_freq[feature_1[i]].append(e) 262 | 263 | for i in range(len(feature_1)): 264 | for j in range(len(feature_2)): 265 | chi2 = chi2 + ((obs_freq[feature_1[i]][j] - exp_freq[feature_1[i]][j])**2)/exp_freq[feature_1[i]][j] 266 | 267 | 268 | df = (len(feature_1) - 1)*(len(feature_2)-1) 269 | 270 | print('chi2: ' + str(chi2)) 271 | print('df: ' + str(df)) 272 | print('chisqprob: ' + str(stats.chi2.sf(chi2, df))) 273 | return(stats.chi2.sf(chi2, df)) 274 | 275 | 276 | # In[ ]: 277 | 278 | 279 | 280 | -------------------------------------------------------------------------------- /artml/bet.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[23]: 5 | 6 | import math 7 | import numpy as np 8 | import pandas as pd 9 | from scipy import stats 10 | from tqdm import tqdm 11 | import warnings 12 | warnings.filterwarnings('ignore') 13 | 14 | 15 | # In[24]: 16 | 17 | def create_bet(df): 18 | 19 | """ BET function constructs the Basic Element Table for the Dataframe. BET is the key step for ARTML and 20 | it can be updated with the new data. 21 | 22 | BET function returns basic element table as Pandas Dataframe 23 | 24 | Notes: 25 | ----- 26 | see 'Real Time Data Mining' by Prof. Sayad 27 | 28 | (https://www.researchgate.net/publication/265619432_Real_Time_Data_Mining) 29 | 30 | """ 31 | col = df.columns.tolist() 32 | df_matrix = df.values 33 | l = len(col) 34 | 35 | idx = np.array([5,6,7,8,9,0,1,2,3,4,10,11]) 36 | bet={} 37 | x = np.array([[np.zeros(12) for x in range(l)] for y in range(l)]) 38 | for i in tqdm(range(l)): 39 | bet[i] = [] 40 | 41 | for j in range(i,l): 42 | y= np.array(df_matrix[:,j]) 43 | z= np.array(df_matrix[:,i]) 44 | 45 | """ 46 | This code makes calculations for all the basic elements in the table. They are appended to 47 | a lists of a dictionary.
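For reference, the 12 basic elements stored in each cell of the table are, in order:

    [count_x, sum_x, sum_x2, sum_x3, sum_x4,
     count_y, sum_y, sum_y2, sum_y3, sum_y4,
     sum_xy, sum_xy2]

where x is the row feature and y is the column feature; the idx permutation defined above swaps the x-block and the y-block so the mirrored cell (j, i) can be filled without recomputation.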
48 | """ 49 | 50 | x[i,j] = np.array([len(z), z.sum(), (z**2).sum(), (z**3).sum(), (z**4).sum(), 51 | len(y), y.sum(), (y**2).sum(), (y**3).sum(), (y**4).sum(), (z*y).sum(), ((z*y)**2).sum()]) 52 | 53 | x[j,i] = x[i,j][idx] 54 | 55 | for j in range(l): 56 | bet[i].append(x[j,i]) 57 | 58 | result = pd.DataFrame(bet, index=col) 59 | result.columns = col 60 | return(result) 61 | 62 | 63 | 64 | 65 | # In[25]: 66 | 67 | def calculate_basic_elements1(x,key,e,c,const): 68 | 69 | """ This is an inner function used in learn_by_index & grow_by_index functions for making 70 | calculations to update the BET 71 | 72 | This takes (BET_dictionary, feature_name, feature_index, values_list, i, +1/-1 (const)) as arguments 73 | for making the calculations 74 | """ 75 | 76 | array = np.array(x[key][e]) 77 | 78 | array = array + const*(np.array([1,c, c**2,c**3,c**4,1,c, c**2,c**3,c**4,c**2,c**4])) 79 | 80 | x[key][e] = array 81 | 82 | return x[key][e] 83 | 84 | 85 | # In[26]: 86 | 87 | def calculate_basic_elements2(x,key,k,b,c,i,m,const): 88 | 89 | 90 | """ This is an inner function used in learn_by_index & grow_by_index functions for making 91 | calculations to update the BET 92 | 93 | This takes (BET_dictionary, feature_name, feature_index,feature_names_list, values_list, i, m, +1/-1 (const)) as arguments 94 | for making the calculations 95 | """ 96 | 97 | array = np.array(x[key][k]) 98 | 99 | array = array + const*(np.array([1,c[b.index(m)], (c[b.index(m)])**2,(c[b.index(m)])**3,(c[b.index(m)])**4,1,c[i], c[i]**2,c[i]**3,c[i]**4, c[i]*(c[b.index(m)]),(c[i]*(c[b.index(m)])**2)])) 100 | 101 | x[key][k] = array 102 | 103 | return x[key][k] 104 | 105 | # In[27]: 106 | 107 | def learnbyindex(BET, *args): 108 | 109 | """ This function takes Basic Element Table and feature_names & values as arguments to update the 110 | given list of feature column & rows in the BET by corresponding values. 111 | 112 | Examples 113 | -------- 114 | learnbyindex(Basic_Element_Table, 'feature_1','feature_2', 1, 2 ) 115 | 116 | The above function updates feature_1, feature_2 in the BET by values 1 and 2 respectively. 117 | 118 | """ 119 | 120 | BET.reset_index(drop = True, inplace = True) # convert BET to dictionary 121 | x = BET.to_dict(orient='list') 122 | keys = list(x.keys()) 123 | arguments_list = [item for item in args] 124 | n_features = int(len(arguments_list)/2) # no of features given as input for updating BET 125 | 126 | if (len(arguments_list))%2 != 0: 127 | print("Error: Give correct set of Feature_names & corresponding parameters") 128 | 129 | else: 130 | feature_names = arguments_list[0:n_features] 131 | values= arguments_list[n_features::] 132 | 133 | for i in range(len(feature_names)): 134 | key = feature_names[i] 135 | e = keys.index(key) 136 | calculate_basic_elements1(x,key,e,values[i],1) # function for updating elements BET 137 | 138 | for m in feature_names: 139 | if m != feature_names[i]: 140 | k = keys.index(m) 141 | calculate_basic_elements2(x,key,k,feature_names,values,i,m,1) # function for updating elements BET 142 | 143 | df = pd.DataFrame(x) 144 | df.index = keys 145 | df = df[keys] 146 | return df 147 | 148 | 149 | # In[28]: 150 | 151 | def forgetbyindex(BET, *args): 152 | 153 | """ This function takes Basic Element Table and feature name & values as arguments to update the 154 | given list of features in the BET by corresponding values (deleting effect of those values from BET). 
155 | 156 | Examples 157 | -------- 158 | forgetbyindex(Basic_Element_Table, 'feature_1','feature_2', 1, 2 ) 159 | 160 | The above function reduces feature_1, feature_2 in the BET by values 1 and 2 respectively. 161 | 162 | """ 163 | 164 | BET.reset_index(drop = True, inplace = True) 165 | x = BET.to_dict(orient='list') # convert BET to dictionary 166 | keys = list(x.keys()) 167 | arguments_list = [item for item in args] 168 | n_features = int(len(arguments_list)/2) 169 | 170 | if (len(arguments_list))%2 != 0: # no of features given as input for updating BET 171 | print("Give correct set of Index & parameters for function") 172 | else: 173 | feature_names = arguments_list[0 : n_features] 174 | values= arguments_list[n_features: :] 175 | for i in range(n_features): 176 | key = feature_names[i] 177 | e = keys.index(key) 178 | calculate_basic_elements1(x,key,e,values[i],-1) # function for updating elements BET 179 | 180 | for m in feature_names: 181 | if m != feature_names[i]: 182 | k = keys.index(m) 183 | calculate_basic_elements2(x,key,k,feature_names,values,i,m,-1) 184 | 185 | df = pd.DataFrame(x) 186 | df = df[keys] 187 | df.index = keys 188 | return df 189 | 190 | 191 | 192 | # In[29]: 193 | 194 | 195 | def growbyindex(BET, *args): 196 | 197 | """ This function takes Basic Element Table and feature name & values as arguments to update the 198 | BET with new features and corresponding values. 199 | 200 | Examples 201 | -------- 202 | growbyindex(Basic_Element_Table, 'new_feature_1','new_feature_2', 1, 2 ) 203 | 204 | The above function adds new_feature_1, new_feature_2 in the BET with values 1 and 2 respectively. 205 | 206 | """ 207 | 208 | main_list = list(BET.columns) 209 | arguments_list = [item for item in args] # convert BET to dictionary 210 | n_features = int(len(arguments_list)/2) 211 | if (len(arguments_list))%2 != 0: 212 | print("Give correct set of Index & parameters for function") 213 | else: 214 | feature_names = arguments_list[0:n_features] 215 | values = arguments_list[n_features::] 216 | 217 | for i in range(n_features): 218 | 219 | elements = [[0]*12]*len(BET) #Creating null basic elements lists 220 | BET[feature_names[i]] = elements 221 | 222 | new_list = [] 223 | for j in range(len(BET.columns)): 224 | new_list.append(list(np.array([0]*12))) 225 | 226 | new_row = pd.DataFrame([new_list],columns= list(BET.columns),index = [feature_names[i]]) 227 | BET = pd.concat([BET,new_row]) 228 | 229 | BET.reset_index(drop = True, inplace = True) 230 | x = BET.to_dict(orient='list') 231 | keys = list(x.keys()) 232 | 233 | for i in range(n_features): 234 | key = feature_names[i] 235 | if key in main_list: 236 | print('feature already exists! Use Learn function') 237 | else: 238 | e = keys.index(key) 239 | calculate_basic_elements1(x,key,e,values[i],1) 240 | 241 | df = pd.DataFrame(BET) 242 | df.index = keys 243 | df = df[keys] 244 | return df 245 | 246 | 247 | # In[30]: 248 | 249 | def learn(BET, df): 250 | 251 | """ This function takes Basic Element Table and dataframe as inputs to update the 252 | BET with new data in the dataframe. (Incremental Learning of BET with new dataframe as input) 253 | 254 | Examples 255 | -------- 256 | learn(Basic_Element_Table, data_frame) 257 | 258 | The above function updates Basic_Element_Table with values in the new dataframe.
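A minimal sketch of the incremental update (illustrative; new_rows is assumed to have exactly the same columns as the data the BET was built from):

    bet = create_bet(df)
    bet = learn(bet, new_rows)   # bet now summarizes df plus new_rows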
259 | 260 | """ 261 | 262 | col = list(df.columns) 263 | for index, row in df.iterrows(): 264 | row1 = [] 265 | for e in col: 266 | row1.append(row[e]) 267 | arguments = col + row1 268 | BET = learnbyindex(BET, *arguments) 269 | return BET 270 | 271 | 272 | # In[31]: 273 | 274 | def forget(BET, df): 275 | 276 | """ This function takes Basic Element Table and dataframe as inputs to change and remove the 277 | effect of that data in the BET. (Decremental Learning of BET with dataframe as input) 278 | 279 | Examples 280 | -------- 281 | forget(Basic_Element_Table, data_frame) 282 | 283 | The above function updates Basic_Element_Table with values in the new dataframe. 284 | 285 | """ 286 | 287 | col = list(df.columns) 288 | for index, row in df.iterrows(): 289 | row1 = [] 290 | for e in col: 291 | row1.append(row[e]) 292 | arguments = col + row1 293 | BET = forgetbyindex(BET, *arguments) 294 | return BET 295 | 296 | 297 | -------------------------------------------------------------------------------- /python/artml/bet.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[23]: 5 | 6 | import math 7 | import numpy as np 8 | import pandas as pd 9 | from scipy import stats 10 | from tqdm import tqdm 11 | import warnings 12 | warnings.filterwarnings('ignore') 13 | 14 | 15 | # In[24]: 16 | 17 | def create_bet(df): 18 | 19 | """ BET function constructs the Basic Element Table for the Dataframe. BET is the key step for ARTML and 20 | it can be updated with the new data. 21 | 22 | BET function returns basic element table as Pandas Dataframe 23 | 24 | Notes: 25 | ----- 26 | see 'Real Time Data Mining' by Prof. Sayad 27 | 28 | (https://www.researchgate.net/publication/265619432_Real_Time_Data_Mining) 29 | 30 | """ 31 | col = df.columns.tolist() 32 | df_matrix = df.values 33 | l = len(col) 34 | 35 | idx = np.array([5,6,7,8,9,0,1,2,3,4,10,11]) 36 | bet={} 37 | x = np.array([[np.zeros(12) for x in range(l)] for y in range(l)]) 38 | for i in tqdm(range(l)): 39 | bet[i] = [] 40 | 41 | for j in range(i,l): 42 | y= np.array(df_matrix[:,j]) 43 | z= np.array(df_matrix[:,i]) 44 | 45 | """ 46 | This code makes calculations for all the basic elements in the table. They are appended to 47 | a lists of a dictionary. 
48 | """ 49 | 50 | x[i,j] = np.array([len(z), z.sum(), (z**2).sum(), (z**3).sum(), (z**4).sum(), 51 | len(y), y.sum(), (y**2).sum(), (y**3).sum(), (y**4).sum(), (z*y).sum(), ((z*y)**2).sum()]) 52 | 53 | x[j,i] = x[i,j][idx] 54 | 55 | for j in range(l): 56 | bet[i].append(x[j,i]) 57 | 58 | result = pd.DataFrame(bet, index=col) 59 | result.columns = col 60 | return(result) 61 | 62 | 63 | 64 | 65 | # In[25]: 66 | 67 | def calculate_basic_elements1(x,key,e,c,const): 68 | 69 | """ This is an inner function used in learn_by_index & grow_by_index functions for making 70 | calculations to update the BET 71 | 72 | This takes (BET_dictionary, feature_name, feature_index, values_list, i, +1/-1 (const)) as arguments 73 | for making the calculations 74 | """ 75 | 76 | array = np.array(x[key][e]) 77 | 78 | array = array + const*(np.array([1,c, c**2,c**3,c**4,1,c, c**2,c**3,c**4,c**2,c**4])) 79 | 80 | x[key][e] = array 81 | 82 | return x[key][e] 83 | 84 | 85 | # In[26]: 86 | 87 | def calculate_basic_elements2(x,key,k,b,c,i,m,const): 88 | 89 | 90 | """ This is an inner function used in learn_by_index & grow_by_index functions for making 91 | calculations to update the BET 92 | 93 | This takes (BET_dictionary, feature_name, feature_index,feature_names_list, values_list, i, m, +1/-1 (const)) as arguments 94 | for making the calculations 95 | """ 96 | 97 | array = np.array(x[key][k]) 98 | 99 | array = array + const*(np.array([1,c[b.index(m)], (c[b.index(m)])**2,(c[b.index(m)])**3,(c[b.index(m)])**4,1,c[i], c[i]**2,c[i]**3,c[i]**4, c[i]*(c[b.index(m)]),(c[i]*(c[b.index(m)])**2)])) 100 | 101 | x[key][k] = array 102 | 103 | return x[key][k] 104 | 105 | # In[27]: 106 | 107 | def learnbyindex(BET, *args): 108 | 109 | """ This function takes Basic Element Table and feature_names & values as arguments to update the 110 | given list of feature column & rows in the BET by corresponding values. 111 | 112 | Examples 113 | -------- 114 | learnbyindex(Basic_Element_Table, 'feature_1','feature_2', 1, 2 ) 115 | 116 | The above function updates feature_1, feature_2 in the BET by values 1 and 2 respectively. 117 | 118 | """ 119 | 120 | BET.reset_index(drop = True, inplace = True) # convert BET to dictionary 121 | x = BET.to_dict(orient='list') 122 | keys = list(x.keys()) 123 | arguments_list = [item for item in args] 124 | n_features = int(len(arguments_list)/2) # no of features given as input for updating BET 125 | 126 | if (len(arguments_list))%2 != 0: 127 | print("Error: Give correct set of Feature_names & corresponding parameters") 128 | 129 | else: 130 | feature_names = arguments_list[0:n_features] 131 | values= arguments_list[n_features::] 132 | 133 | for i in range(len(feature_names)): 134 | key = feature_names[i] 135 | e = keys.index(key) 136 | calculate_basic_elements1(x,key,e,values[i],1) # function for updating elements BET 137 | 138 | for m in feature_names: 139 | if m != feature_names[i]: 140 | k = keys.index(m) 141 | calculate_basic_elements2(x,key,k,feature_names,values,i,m,1) # function for updating elements BET 142 | 143 | df = pd.DataFrame(x) 144 | df.index = keys 145 | df = df[keys] 146 | return df 147 | 148 | 149 | # In[28]: 150 | 151 | def forgetbyindex(BET, *args): 152 | 153 | """ This function takes Basic Element Table and feature name & values as arguments to update the 154 | given list of features in the BET by corresponding values (deleting effect of those values from BET). 
155 | 156 | Examples 157 | -------- 158 | forgetbyindex(Basic_Element_Table, 'feature_1','feature_2', 1, 2 ) 159 | 160 | The above function reduces feature_1, feature_2 in the BET by values 1 and 2 respectively. 161 | 162 | """ 163 | 164 | BET.reset_index(drop = True, inplace = True) 165 | x = BET.to_dict(orient='list') # convert BET to dictionary 166 | keys = list(x.keys()) 167 | arguments_list = [item for item in args] 168 | n_features = int(len(arguments_list)/2) 169 | 170 | if (len(arguments_list))%2 != 0: # no of features given as input for updating BET 171 | print("Give correct set of Index & parameters for function") 172 | else: 173 | feature_names = arguments_list[0 : n_features] 174 | values= arguments_list[n_features: :] 175 | for i in range(n_features): 176 | key = feature_names[i] 177 | e = keys.index(key) 178 | calculate_basic_elements1(x,key,e,values,i,-1) # function for updating elements BET 179 | 180 | for m in feature_names: 181 | if m != feature_names[i]: 182 | k = keys.index(m) 183 | calculate_basic_elements2(x,key,k,feature_names,values,i,m,-1) 184 | 185 | df = pd.DataFrame(x) 186 | df = df[keys] 187 | df.index = keys 188 | return df 189 | 190 | 191 | 192 | # In[29]: 193 | 194 | 195 | def growbyindex(BET, *args): 196 | 197 | """ This function takes Basic Element Table and feature name & values as arguments to update the 198 | BET with new features and corresponding values. 199 | 200 | Examples 201 | -------- 202 | growbyindex(Basic_Element_Table, 'new_feature_1','new_feature_2', 1, 2 ) 203 | 204 | The above function adds new_feature_1, new_feature_2 in the BET with values 1 and 2 respectively. 205 | 206 | """ 207 | 208 | main_list = list(BET.columns) 209 | arguments_list = [item for item in args] # convert BET to dictionary 210 | n_features = int(len(arguments_list)/2) 211 | if (len(arguments_list))%2 != 0: 212 | print("Give correct set of Index & parameters for function") 213 | else: 214 | feature_names = arguments_list[0:n_features] 215 | values = arguments_list[n_features::] 216 | 217 | for i in range(n_features): 218 | 219 | elements = [[0]*12]*len(BET) #Creating null basic elements lists 220 | BET[feature_names[i]] = elements 221 | 222 | new_list = [] 223 | for j in range(len(BET.columns)): 224 | new_list.append(list(np.array([0]*12))) 225 | 226 | new_row = pd.DataFrame([new_list],columns= list(BET.columns),index = [feature_names[i]]) 227 | BET = pd.concat([BET,new_row]) 228 | 229 | BET.reset_index(drop = True, inplace = True) 230 | x = BET.to_dict(orient='list') 231 | keys = list(x.keys()) 232 | 233 | for i in range(n_features): 234 | key = feature_names[i] 235 | if key in main_list: 236 | print('feature already exsists! Use Learn function') 237 | else: 238 | e = keys.index(key) 239 | calculate_basic_elements1(x,key,e,c,i,1) 240 | 241 | df = pd.DataFrame(BET) 242 | df.index = keys 243 | df = df[keys] 244 | return df 245 | 246 | 247 | # In[30]: 248 | 249 | def learn(BET, df): 250 | 251 | """ This function takes Basic Element Table and dataframe as inputs to update the 252 | BET with new data in the dataframe. (Incremental Learning of BET with new dataframe as input) 253 | 254 | Examples 255 | -------- 256 | learn(Basic_Element_Table, data_frame) 257 | 258 | The above function updates Basic_Element_Table with values in the new dataframe. 
259 | 260 | """ 261 | 262 | col = list(df.columns) 263 | for index, row in df.iterrows(): 264 | row1 = [] 265 | for e in col: 266 | row1.append(row[e]) 267 | arguments = col + row1 268 | BET = learnbyindex(BET, *arguments) 269 | return BET 270 | 271 | 272 | # In[31]: 273 | 274 | def forget(BET, df): 275 | 276 | """ This function takes Basic Element Table and dataframe as inputs to change and remove the 277 | effect of that data in the BET. (Decremental Learning of BET with dataframe as input) 278 | 279 | Examples 280 | -------- 281 | forget(Basic_Element_Table, data_frame) 282 | 283 | The above function updates Basic_Element_Table with values in the new dataframe. 284 | 285 | """ 286 | 287 | col = list(df.columns) 288 | for index, row in df.iterrows(): 289 | row1 = [] 290 | for e in col: 291 | row1.append(row[e]) 292 | arguments = col + row1 293 | BET = forgetbyindex(BET, *arguments) 294 | return BET 295 | 296 | 297 | -------------------------------------------------------------------------------- /module.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | from numpy import * 4 | import numpy as np 5 | import pandas as pd 6 | from scipy import stats 7 | from scipy.stats import norm 8 | from scipy.stats import chisqprob 9 | import warnings 10 | import matplotlib.pyplot as plt 11 | warnings.filterwarnings('ignore') 12 | 13 | def BET(df): 14 | 15 | """ BET function constructs the Basic Element Table for the Dataframe. BET is the key step for ARTML and 16 | it can be updated with the new data. 17 | 18 | BET function returns basic element table as Pandas Dataframe 19 | 20 | Notes: 21 | ----- 22 | see 'Real Time Data Mining' by Prof. Sayad 23 | 24 | (https://www.researchgate.net/publication/265619432_Real_Time_Data_Mining) 25 | 26 | """ 27 | col = df.columns.tolist() 28 | l = len(col) 29 | x ={} # Creating empty dictionary 30 | for m in range(l): 31 | for n in range(l): 32 | x[m,n] = [] # Creating keys in dictionary with empty lists 33 | 34 | for i in range(l): 35 | for j in range(l): 36 | y=col[j] 37 | z=col[i] 38 | 39 | """ 40 | This code makes calculations for all the basic elements in the table. They are appended to 41 | a lists of a dictionary. 
42 | 43 | """ 44 | count_x = len(df[col[i]]) # count in particular X column 45 | x[i,j].append(count_x) 46 | 47 | sum_x = df[col[i]].sum() # Sum of elemensts in y 48 | x[i,j].append(sum_x) 49 | 50 | sum_x2 = (df[z]*df[z]).sum() # Sum of elemensts in x2 51 | x[i,j].append(sum_x2) 52 | 53 | sum_x3 = (df[col[i]]*df[col[i]]*df[col[i]]).sum() # Sum of elemensts in x3 54 | x[i,j].append(sum_x3) 55 | 56 | sum_x4 = (df[col[i]]*df[col[i]]*df[col[i]]*df[col[i]]).sum() # Sum of elemensts in x4 57 | x[i,j].append(sum_x4) 58 | 59 | count_y = len(df[col[j]]) # count in particular Y column 60 | x[i,j].append(count_y) 61 | 62 | sum_y = df[col[j]].sum() # Sum of elemensts in y 63 | x[i,j].append(sum_y) 64 | 65 | sum_y2 = (df[col[j]]*df[col[j]]).sum() # Sum of elemensts in y2 66 | x[i,j].append(sum_y2) 67 | 68 | sum_y3 = (df[col[j]]*df[col[j]]*df[col[j]]).sum() # Sum of elemensts in y3 69 | x[i,j].append(sum_y3) 70 | 71 | sum_y4 = (df[col[j]]*df[col[j]]*df[col[j]]*df[col[j]]).sum() # Sum of elemensts in y4 72 | x[i,j].append(sum_y4) 73 | 74 | sum_xy = (df[col[i]]*df[col[j]]).sum() # Sum of elemensts in xy 75 | x[i,j].append(sum_xy) 76 | 77 | sum_xy2 = (df[col[i]]*df[col[j]]*df[col[i]]*df[col[j]]).sum() # Sum of elemensts in (xy)2 78 | x[i,j].append(sum_xy2) 79 | 80 | z={} 81 | for m in range(l): # converting the dictionary to DataFrame 82 | z[m] = [] 83 | for i in range(l): 84 | for j in range(l): 85 | z[i].append(x[j,i]) 86 | result = pd.DataFrame(z, index=col) 87 | result.columns = col 88 | return(result) 89 | 90 | def calculate_basic_elements1(x,key,e,c,i,const): 91 | 92 | """ This is an inner function used in learn_by_index & grow_by_index functions for making 93 | calculations to update the BET 94 | 95 | This takes (BET_dictionary, feature_name, feature_index, values_list, i, +1/-1 (const)) as arguments 96 | for making the calculations 97 | """ 98 | 99 | x[key][e][0] = (x[key][e][0]+(const*1)) 100 | 101 | x[key][e][1] = (x[key][e][1]+(const*c[i])) 102 | 103 | x[key][e][2] = (x[key][e][2]+(const*(c[i]*c[i]))) 104 | 105 | x[key][e][3] = (x[key][e][3]+(const*(c[i]*c[i]*c[i]))) 106 | 107 | x[key][e][4] = (x[key][e][4]+(const*(c[i]*c[i]*c[i]*c[i]))) 108 | 109 | x[key][e][5] = (x[key][e][5]+(const*1)) 110 | 111 | x[key][e][6] = (x[key][e][6]+(const*c[i])) 112 | 113 | x[key][e][7] = (x[key][e][7]+(const*(c[i]*c[i]))) 114 | 115 | x[key][e][8] = (x[key][e][8]+(const*(c[i]*c[i]*c[i]))) 116 | 117 | x[key][e][9] = (x[key][e][9]+(const*(c[i]*c[i]*c[i]*c[i]))) 118 | 119 | x[key][e][10] = (x[key][e][10]+(const*(c[i]*c[i]))) 120 | 121 | x[key][e][11] = (x[key][e][11]+(const*(c[i]*c[i]*c[i]*c[i]))) 122 | 123 | return x[key][e] 124 | 125 | 126 | # In[9]: 127 | 128 | def calculate_basic_elements2(x,key,k,b,c,i,m,const): 129 | 130 | """ This is an inner function used in learn_by_index & grow_by_index functions for making 131 | calculations to update the BET 132 | 133 | This takes (BET_dictionary, feature_name, feature_index,feature_names_list, values_list, i, m, +1/-1 (const)) as arguments 134 | for making the calculations 135 | """ 136 | 137 | x[key][k][0] = (x[key][k][0]+(const*1)) 138 | 139 | x[key][k][1] = (x[key][k][1]+(const*c[b.index(m)])) 140 | 141 | x[key][k][2] = (x[key][k][2]+(const*(c[b.index(m)]*c[b.index(m)]))) 142 | 143 | x[key][k][3] = (x[key][k][3]+(const*(c[b.index(m)]*c[b.index(m)]*c[b.index(m)]))) 144 | 145 | x[key][k][4] = (x[key][k][4]+(const*(c[b.index(m)]*c[b.index(m)]*c[b.index(m)]*c[b.index(m)]))) 146 | 147 | x[key][k][5] = (x[key][k][5]+(const*1)) 148 | 149 | x[key][k][6] = 
(x[key][k][6]+(const*c[i])) 150 | 151 | x[key][k][7] = (x[key][k][7]+(const*(c[i]*c[i]))) 152 | 153 | x[key][k][8] = (x[key][k][8]+(const*(c[i]*c[i]*c[i]))) 154 | 155 | x[key][k][9] = (x[key][k][9]+(const*(c[i]*c[i]*c[i]*c[i]))) 156 | 157 | x[key][k][10] = (x[key][k][10]+(const*(c[i]*c[b.index(m)]))) 158 | 159 | x[key][k][11] = (x[key][k][11]+(const*(c[i]*c[b.index(m)]*c[i]*c[b.index(m)]))) 160 | 161 | return x[key][k] 162 | 163 | # In[21]: 164 | 165 | def learnbyindex(BET, *args): 166 | 167 | """ This function takes Basic Element Table and feature_names & values as arguments to update the 168 | given list of feature column & rows in the BET by corresponding values. 169 | 170 | Examples 171 | -------- 172 | learnbyindex(Basic_Element_Table, 'feature_1','feature_2', 1, 2 ) 173 | 174 | The above function updates feature_1, feature_2 in the BET by values 1 and 2 respectively. 175 | 176 | """ 177 | 178 | BET.reset_index(drop = True, inplace = True) # convert BET to dictionary 179 | x = BET.to_dict(orient='list') 180 | keys = list(x.keys()) 181 | arguments_list = [item for item in args] 182 | n_features = int(len(arguments_list)/2) # no of features given as input for updating BET 183 | 184 | if (len(arguments_list))%2 != 0: 185 | print("Error: Give correct set of Feature_names & corresponding parameters") 186 | 187 | else: 188 | feature_names = arguments_list[0:n_features] 189 | values= arguments_list[n_features::] 190 | 191 | for i in range(len(feature_names)): 192 | key = feature_names[i] 193 | e = keys.index(key) 194 | calculate_basic_elements1(x,key,e,values,i,1) # function for updating elements BET 195 | 196 | for m in feature_names: 197 | if m != feature_names[i]: 198 | k = keys.index(m) 199 | calculate_basic_elements2(x,key,k,feature_names,values,i,m,1) # function for updating elements BET 200 | 201 | df = pd.DataFrame(x) 202 | df.index = keys 203 | df = df[keys] 204 | return df 205 | 206 | 207 | 208 | # In[22]: 209 | 210 | def forgetbyindex(BET, *args): 211 | 212 | """ This function takes Basic Element Table and feature name & values as arguments to update the 213 | given list of features in the BET by corresponding values (deleting effect of those values from BET). 214 | 215 | Examples 216 | -------- 217 | forgetbyindex(Basic_Element_Table, 'feature_1','feature_2', 1, 2 ) 218 | 219 | The above function reduces feature_1, feature_2 in the BET by values 1 and 2 respectively.
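Because forgetting applies the same element updates with the opposite sign, learning a record and then forgetting it should leave the BET unchanged, which gives a cheap sanity check (illustrative feature names and values):

    bet2 = learnbyindex(bet, 'x', 'y', 1.0, 2.0)
    bet3 = forgetbyindex(bet2, 'x', 'y', 1.0, 2.0)
    # bet3 should equal bet up to floating point error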
220 | 221 | """ 222 | 223 | BET.reset_index(drop = True, inplace = True) 224 | x = BET.to_dict(orient='list') # convert BET to dictionary 225 | keys = list(x.keys()) 226 | arguments_list = [item for item in args] 227 | n_features = int(len(arguments_list)/2) 228 | 229 | if (len(arguments_list))%2 != 0: # no of features given as input for updating BET 230 | print("Give correct set of Index & parameters for function") 231 | else: 232 | feature_names = arguments_list[0 : n_features] 233 | values= arguments_list[n_features: :] 234 | for i in range(n_features): 235 | key = feature_names[i] 236 | e = keys.index(key) 237 | basic_elements1(x,key,e,values,i,-1) # function for updating elements BET 238 | 239 | for m in feature_names: 240 | if m != feature_names[i]: 241 | k = keys.index(m) 242 | basic_elements2(x,key,k,feature_names,values,i,m,-1) 243 | 244 | df = pd.DataFrame(x) 245 | df = df[keys] 246 | df.index = keys 247 | return df 248 | 249 | 250 | # In[12]: 251 | 252 | def growbyindex(BET, *args): 253 | 254 | """ This function takes Basic Element Table and feature name & values as arguments to update the 255 | BET with new features and corresponding values. 256 | 257 | Examples 258 | -------- 259 | growbyindex(Basic_Element_Table, 'new_feature_1','new_feature_2', 1, 2 ) 260 | 261 | The above function adds new_feature_1, new_feature_2 in the BET with values 1 and 2 respectively. 262 | 263 | """ 264 | 265 | main_list = list(BET.columns) 266 | arguments_list = [item for item in args] # convert BET to dictionary 267 | n_features = int(len(arguments_list)/2) 268 | if (len(arguments_list))%2 != 0: 269 | print("Give correct set of Index & parameters for function") 270 | else: 271 | feature_names = arguments_list[0:n_features] 272 | values = arguments_list[n_features::] 273 | 274 | for i in range(n_features): 275 | 276 | elements = [[0]*12]*len(BET) #Creating null basic elements lists 277 | BET[feature_names[i]] = elements 278 | 279 | new_list = [] 280 | for j in range(len(BET.columns)): 281 | new_list.append(list(np.array([0]*12))) 282 | 283 | new_row = pd.DataFrame([new_list],columns= list(BET.columns),index = [feature_names[i]]) 284 | BET = pd.concat([BET,new_row]) 285 | 286 | BET.reset_index(drop = True, inplace = True) 287 | x = BET.to_dict(orient='list') 288 | keys = list(x.keys()) 289 | 290 | for i in range(n_features): 291 | key = feature_names[i] 292 | if key in main_list: 293 | print('feature already exsists! Use Learn function') 294 | else: 295 | e = keys.index(key) 296 | calculate_basic_elements1(x,key,e,c,i,1) 297 | 298 | df = pd.DataFrame(BET) 299 | df.index = keys 300 | df = df[keys] 301 | return df 302 | 303 | # In[14]: 304 | 305 | def learn(BET, df): 306 | 307 | """ This function takes Basic Element Table and dataframe as inputs to update the 308 | BET with new data in the dataframe. (Incremental Learning of BET with new dataframe as input) 309 | 310 | Examples 311 | -------- 312 | learn(Basic_Element_Table, data_frame) 313 | 314 | The above function updates Basic_Element_Table with values in the new dataframe. 315 | 316 | """ 317 | 318 | col = list(df.columns) 319 | for index, row in df.iterrows(): 320 | row1 = [] 321 | for e in col: 322 | row1.append(row[e]) 323 | arguments = col + row1 324 | BET = learnbyindex(BET, *arguments) 325 | return BET 326 | 327 | # In[16]: 328 | 329 | def forget(BET, df): 330 | 331 | """ This function takes Basic Element Table and dataframe as inputs to change and remove the 332 | effect of that data in the BET. 
(Decremental Learning of BET with dataframe as input) 333 | 334 | Examples 335 | -------- 336 | forget(Basic_Element_Table, data_frame) 337 | 338 | The above function updates Basic_Element_Table with values in the new dataframe. 339 | 340 | """ 341 | 342 | col = list(df.columns) 343 | for index, row in df.iterrows(): 344 | row1 = [] 345 | for e in col: 346 | row1.append(row[e]) 347 | arguments = col + row1 348 | BET = forgetbyindex(BET, *arguments) 349 | return BET 350 | 351 | 352 | # In[18]: 353 | 354 | def univariate(BET): 355 | 356 | """ 357 | Univariate analysis explores variables (attributes) one by one by summarizing each attribute 358 | using statistical techniques. This summarizes the central tendency, dispersion and shape of 359 | a dataset’s distribution, excluding NaN values. 360 | 361 | univariate Stats calculated are: ['count','Mean','Variance','Standard_deviation','coeff_of_variation','skewness','Kurtosis'] 362 | 363 | Examples 364 | -------- 365 | univariate(Basic_Element_Table) 366 | 367 | The above function generates Univariate statistics for all the features in the Basic_Element_Table. 368 | 369 | function returns univariate stats as Pandas Dataframe. 370 | 371 | """ 372 | 373 | l =(len(BET)) 374 | BET.reset_index(drop = True, inplace = True) 375 | x = BET.to_dict(orient='list') # convert BET to dictionary 376 | keys =list(x.keys()) 377 | describe = {} 378 | 379 | for i in range(l): 380 | describe[i] = [] 381 | m = keys[i] 382 | 383 | try: 384 | count = x[m][i][0] 385 | describe[i].append(count) 386 | except: 387 | describe[i].append('NaN') 388 | try: 389 | Mean = (x[m][i][1])/count 390 | describe[i].append(Mean) 391 | except: 392 | describe[i].append('NaN') 393 | 394 | try: 395 | Variance = ((x[m][i][2])-(((x[m][i][1])**2)/count))/count 396 | describe[i].append(Variance) 397 | except: 398 | describe[i].append('NaN') 399 | try: 400 | Standard_deviation = math.sqrt(Variance) 401 | describe[i].append(Standard_deviation) 402 | except: 403 | describe[i].append('NaN') 404 | try: 405 | coeff_of_variation = (Standard_deviation/Mean)*100 406 | describe[i].append(coeff_of_variation) 407 | except: 408 | describe[i].append('NaN') 409 | 410 | try: 411 | skewness = (count/((count-1)*(count-2)))*((x[m][i][3])-(3*Mean*x[m][i][2])+(3*(Mean**2)*x[m][i][1])-(count*(Mean**3)))/(Standard_deviation**3) 412 | describe[i].append(skewness) 413 | except: 414 | describe[i].append('NaN') 415 | try: 416 | Kurtosis = (((((count)*(count+1))/((count-1)*(count-2)*(count-3)))*((1/Standard_deviation**4)*((x[m][i][4])-(4*Mean*(x[m][i][3]))+(6*(Mean**2)*(x[m][i][2]))-(4*(Mean**3)*(x[m][i][1]))+(count*(Mean**4)))))-((3*(count-1)**2)/((count-2)*(count-3)))) 417 | describe[i].append(Kurtosis) 418 | except: 419 | describe[i].append('NaN') 420 | 421 | names =['count','Mean','Variance','Standard_deviation','coeff_of_variation','skewness','Kurtosis'] 422 | result = pd.DataFrame(describe, index=names) 423 | result.columns = keys 424 | return(result) 425 | 426 | # In[19]: 427 | 428 | def Covariance(BET): 429 | 430 | """ 431 | This function computes pairwise covariance of all features in BET. Covariance describes 432 | the linear relationship between two features. 433 | 434 | Examples 435 | -------- 436 | Covariance(Basic_Element_Table) 437 | 438 | The above function generates pairwise Covariance for all the features in the Basic_Element_Table. 439 | 440 | function returns Covariance as Pandas Dataframe. 
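Note that the division is by n, so this is the population covariance; a check against pandas therefore needs ddof=0 (illustrative, with bet built from df):

    Covariance(bet)   # should agree with df.cov(ddof=0) up to floating point error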
441 | 442 | """ 443 | 444 | l =(len(BET)) 445 | BET.reset_index(drop = True, inplace = True) 446 | x = BET.to_dict(orient='list') 447 | keys =list(x.keys()) 448 | covar = {} 449 | 450 | for i in range(len(BET)): 451 | covar[i] = [] 452 | for j in range(len(BET)): 453 | m = keys[i] 454 | try: 455 | cov = (x[m][j][10]-(((x[m][j][1])*(x[m][j][6]))/(x[m][j][0])))/(x[m][j][0]) 456 | covar[i].append(cov) 457 | except: 458 | covar[i].append('NaN') 459 | 460 | result = pd.DataFrame(covar, index=keys) 461 | result.columns = keys 462 | return(result) 463 | 464 | 465 | def correlation(BET): 466 | 467 | """ 468 | This function computes pairwise correlations of all features in BET. correlation measures 469 | how strong a relationship is between two variables. 470 | 471 | Examples 472 | -------- 473 | correlation(Basic_Element_Table) 474 | 475 | The above function generates pairwise correlations for all the features in the Basic_Element_Table. 476 | 477 | function returns correlations as Pandas Dataframe. 478 | 479 | """ 480 | 481 | l =(len(BET)) 482 | BET.reset_index(drop = True, inplace = True) 483 | x = BET.to_dict(orient='list') 484 | keys =list(x.keys()) 485 | corr = {} 486 | 487 | for i in range(len(BET)): 488 | corr[i] = [] 489 | for j in range(len(BET)): 490 | m = keys[i] 491 | count1 = x[m][j][0] 492 | count2 = x[m][j][5] 493 | try: 494 | var1 = ((x[m][j][2])-(((x[m][j][1])**2)/count1))/count1 495 | var2 = ((x[m][j][7])-(((x[m][j][6])**2)/count2))/count2 496 | corrl = ((x[m][j][10]-(((x[m][j][1])*(x[m][j][6]))/(x[m][j][0])))/(x[m][j][0]))/(math.sqrt(var1*var2)) 497 | corr[i].append(corrl) 498 | except: 499 | corr[i].append('NaN') 500 | 501 | result = pd.DataFrame(corr, index=keys) 502 | result.columns = keys 503 | return(result) 504 | 505 | def Ztest(BET, col1, col2): 506 | 507 | l =(len(BET)) 508 | BET.reset_index(drop = True, inplace = True) 509 | x = BET.to_dict(orient='list') 510 | keys =list(x.keys()) 511 | 512 | count = x[col2][keys.index(col1)][6] 513 | sumx = x[col2][keys.index(col1)][10] 514 | sumx2 = x[col2][keys.index(col1)][11] 515 | Mean = sumx/count 516 | Variance = (sumx2 - (((sumx)**2)/count))/count 517 | 518 | count_0 = x[col2][keys.index(col1)][0] - x[col2][keys.index(col1)][6] 519 | sumx_0 = x[col2][keys.index(col1)][1] - x[col2][keys.index(col1)][10] 520 | sumx2_0 = x[col1][keys.index(col1)][10] -x[col2][keys.index(col1)][11] 521 | Mean_0 = sumx_0/count_0 522 | Variance_0 = (sumx2_0 - (((sumx_0)**2)/count_0))/count_0 523 | 524 | zscore = (Mean_0 - Mean)/(np.sqrt((Variance_0/count_0)+(Variance/count))) 525 | prob = 1 - stats.norm.cdf(zscore) 526 | return 2*prob 527 | 528 | 529 | def Ttest(BET, col1, col2): 530 | 531 | l =(len(BET)) 532 | BET.reset_index(drop = True, inplace = True) 533 | x = BET.to_dict(orient='list') 534 | keys =list(x.keys()) 535 | 536 | count = x[col2][keys.index(col1)][6] 537 | sumx = x[col2][keys.index(col1)][10] 538 | sumx2 = x[col2][keys.index(col1)][11] 539 | Mean = sumx/count 540 | Variance = (sumx2 - (((sumx)**2)/count))/count 541 | 542 | count_0 = x[col2][keys.index(col1)][0] - x[col2][keys.index(col1)][6] 543 | sumx_0 = x[col2][keys.index(col1)][1] - x[col2][keys.index(col1)][10] 544 | sumx2_0 = x[col1][keys.index(col1)][10] -x[col2][keys.index(col1)][11] 545 | Mean_0 = sumx_0/count_0 546 | Variance_0 = (sumx2_0 - (((sumx_0)**2)/count_0))/count_0 547 | 548 | var = (((count_0-1)*Variance_0) + ((count-1)*Variance))/(count_0 + count - 2) 549 | 550 | tscore = (Mean_0 - Mean)/(np.sqrt(var*((1/count_0)+(1/count)))) 551 | 552 | df = (count + count_0 - 2) 
553 | 554 | prob = (1-stats.t.cdf(tscore, df)) 555 | return 2*prob 556 | 557 | 558 | 559 | def chi2(BET, feature_1 , feature_2): 560 | 561 | l =(len(BET)) 562 | BET.reset_index(drop = True, inplace = True) 563 | x = BET.to_dict(orient='list') 564 | keys =list(x.keys()) 565 | obs_freq = {} 566 | exp_freq = {} 567 | sum_exp_freq_vertical = np.zeros(len(feature_2)) 568 | chi2 = 0 569 | 570 | for i in range(len(feature_1)): 571 | obs_freq[feature_1[i]] = [] 572 | 573 | for j in range(len(feature_2)): 574 | col1 = (feature_1[i]) 575 | col2 = (feature_2[j]) 576 | sumx = x[col1][keys.index(col2)][10] 577 | obs_freq[feature_1[i]].append(sumx) 578 | 579 | sum_exp_freq_vertical = sum_exp_freq_vertical + np.array(obs_freq[feature_1[i]]) 580 | total_in_contingency = sum(sum_exp_freq_vertical) 581 | 582 | for i in range(len(feature_1)): 583 | exp_freq[feature_1[i]] = [] 584 | sum_exp_freq_horizontal = sum(obs_freq[feature_1[i]]) 585 | for j in range(len(feature_2)): 586 | e = (sum_exp_freq_horizontal*sum_exp_freq_vertical[j])/total_in_contingency 587 | exp_freq[feature_1[i]].append(e) 588 | 589 | for i in range(len(feature_1)): 590 | for j in range(len(feature_2)): 591 | chi2 = chi2 + ((obs_freq[feature_1[i]][j] - exp_freq[feature_1[i]][j])**2)/exp_freq[feature_1[i]][j] 592 | 593 | 594 | df = (len(feature_1) - 1)*(len(feature_2)-1) 595 | 596 | print('chi2: ' + str(chi2)) 597 | print('df: ' + str(df)) 598 | print('chisqprob: ' + str(chisqprob(chi2, df))) 599 | return(chisqprob(chi2, df)) 600 | 601 | 602 | #Models: 603 | 604 | def LDA_fit(BET, target): 605 | 606 | """ 607 | Linear Discriminant Analysis (LDA) is a classification method searching for a linear combination 608 | of variables (predictors) that best separates the classes (targets). 609 | 610 | It basically performs the supervised dimensionality reduction, by projecting the input data to a 611 | linear subspace consisting of the directions which maximize the separation between classes (Maximizing the difference 612 | between the means of groups and reducing Std. deviation within groups) 613 | 614 | Examples 615 | -------- 616 | LDA_fit(Basic_Element_Table, Target) 617 | 618 | where 'Basic_Element_Table' is found from BET function for the data and 'Target' is the feature that needs to be 619 | predicted. 620 | 621 | The function returns (mean1,mean2,Beta, prob) which are Mean vectors of the groups, Linear Model coefficients and 622 | class probability respectively. 
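A minimal sketch (illustrative; assumes the BET contains a binary 0/1 target column named 'label' and that X_test holds only the feature columns):

    mean1, mean2, Beta, prob = LDA_fit(bet, 'label')
    predictions = LDA_predict(bet, X_test, 'label')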
623 | 624 | """ 625 | l =(len(BET)) 626 | BET1 = BET 627 | BET1.reset_index(drop = True, inplace = True) 628 | x = BET1.to_dict(orient='list') 629 | keys =list(x.keys()) 630 | k = keys.index(target) 631 | count_1 = BET[target][k][0] - BET[target][k][1] 632 | count_2 = BET[target][k][1] 633 | mean1 = [] 634 | mean2 = [] 635 | c = [] 636 | for i in range(len(BET)): 637 | if i != keys.index(target): 638 | mean1.append((BET[target][i][1] - BET[target][i][10])/(BET[target][i][0]-BET[target][i][6])) 639 | mean2.append((BET[target][i][10])/BET[target][i][6]) 640 | 641 | for i in range(len(BET)): 642 | if i != keys.index(target): 643 | for j in range(len(BET)): 644 | if j != keys.index(target): 645 | m = keys[i] 646 | n = keys[j] 647 | cal1 = (((x[m][k][6] - x[m][k][10])*(x[n][k][6] - x[n][k][10]))/count_1) 648 | cal2 = (x[m][k][10]*x[n][k][10])/count_2 649 | c.append((x[m][j][10]-cal1 - cal2)/(count_1+count_2-2)) 650 | 651 | c = np.array(c) 652 | n = (len(BET)-1) 653 | c = reshape(c,(n,n)) 654 | inverse = np.linalg.inv(c) 655 | z = np.array(mean1)-np.array(mean2) 656 | Beta = np.matmul(inverse, z.T) 657 | prob = (-math.log(count_1/count_2)) 658 | 659 | return (mean1,mean2,Beta, prob) 660 | 661 | 662 | def LDA_predict(BET, X, target): 663 | """ 664 | To predict the target values for the given data using LDA paramters calculated from the training dataset. 665 | Returns the predictions using LDA model. 666 | 667 | Examples 668 | -------- 669 | LDA_predict(Basic_Element_Table, Testing_data, Target) 670 | 671 | BET table and testing data should be given as inputs 672 | """ 673 | (mean1,mean2,Beta, prob) = LDA_fit(BET, target) 674 | numpy_matrix = X.as_matrix() 675 | q=[] 676 | for i in range(len(numpy_matrix)): 677 | z = numpy_matrix[i] - (0.5*(np.array(mean1) - np.array(mean2))) 678 | if np.matmul(Beta.T, z) > prob: 679 | q.append(0) 680 | else: 681 | q.append(1) 682 | return q 683 | 684 | def accuracy(y, y_pred): 685 | y = list(y) 686 | y_pred =list(y_pred) 687 | matches = [] 688 | for i in range(len(y)): 689 | if y[i] == y_pred[i]: 690 | matches.append(1) 691 | return (sum(matches)/len(y))*100 692 | 693 | 694 | def PCA(BET): 695 | """ 696 | Principal component analysis (PCA) is a classical statistical method that uses an orthogonal transformation 697 | to convert a set of observations of possibly correlated variables into a set of values of linearly uncorrelated variables 698 | called principal components. 699 | 700 | Real time Principal components for datasets can be extracted from the ART-M covariance matrix equations. 701 | 702 | Examples 703 | -------- 704 | PCA(Basic_Element_Table) 705 | 706 | This function returns eigen values & eigen vectors for the features in the Basic element table. 
707 | """ 708 | 709 | cov = Covariance(BET) 710 | cov_mat = cov.values 711 | eig_vals, eig_vecs = np.linalg.eig(cov_mat) 712 | 713 | print('Eigenvectors: \n%s' %eig_vecs) 714 | print('\nEigenvalues: \n%s' %eig_vals) 715 | 716 | # Make a list of (eigenvalue, eigenvector) tuples 717 | eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:,i]) for i in range(len(eig_vals))] 718 | 719 | # Sort the (eigenvalue, eigenvector) tuples from high to low 720 | eig_pairs.sort(key=lambda x: x[0], reverse=True) 721 | 722 | # Visually confirm that the list is correctly sorted by decreasing eigenvalues 723 | print('\nEigenvalues in descending order:') 724 | for i in eig_pairs: 725 | print(i[0]) 726 | 727 | def MLR(BET,target): 728 | 729 | row_indexes = list(BET.index) 730 | target_index = row_indexes.index(target) 731 | BET_features = BET.drop(target, axis =1) 732 | BET_features = BET_features.drop(target, axis =0) 733 | cov_features = Covariance(BET_features).values 734 | cov_target = Covariance(BET).values 735 | cov_target = cov_target[target_index] 736 | cov_target = np.delete(cov_target, target_index) 737 | inverse = np.linalg.inv(cov_features) 738 | Beta_array = np.matmul(inverse, cov_target) 739 | 740 | l =(len(BET)) 741 | BET.reset_index(drop = True, inplace = True) 742 | x = BET.to_dict(orient='list') 743 | keys =list(x.keys()) 744 | 745 | mean_target = (BET[target][keys.index(target)][1])/BET[target][keys.index(target)][0] 746 | mean_X = [] 747 | 748 | for i in range(len(BET_features)): 749 | if i != keys.index(target): 750 | mean_X.append((BET[target][i][1])/BET[target][i][0]) 751 | 752 | b0 = mean_target - np.matmul(Beta_array, mean_X) 753 | 754 | print(b0) 755 | return Beta_array 756 | 757 | 758 | def gaussian_NB(BET, X ,target): 759 | 760 | l =(len(BET)) 761 | BET.reset_index(drop = True, inplace = True) 762 | x = BET.to_dict(orient='list') 763 | keys =list(x.keys()) 764 | 765 | probability = [] 766 | likelihood = 1 767 | att_prior_prob = 1 768 | class_prior_prob = 1 769 | for i in range(len(BET)): 770 | if keys[i] != target: 771 | count = x[target][i][6] 772 | sumxy = x[target][i][10] 773 | sumxy2 = x[target][i][11] 774 | Mean = sumxy/count 775 | Variance = (sumxy2 - (((sumxy)**2)/count))/count 776 | value = X[i] 777 | likelihood = likelihood*(1/math.sqrt(2*np.pi*Variance))*(np.e**(-(value-Mean)/(2*Variance))) 778 | 779 | class_prior_prob = (count/x[target][i][5]) 780 | 781 | count_att = x[target][i][0] 782 | sumxy_att = x[target][i][1] 783 | sumxy2_att = x[target][i][2] 784 | Mean_att = sumxy_att/count_att 785 | Variance_att = (sumxy2_att - (((sumxy_att)**2)/count_att))/count_att 786 | 787 | att_prior_prob = att_prior_prob*(1/math.sqrt(2*np.pi*Variance_att))*(np.e**(-(value-Mean_att)/(2*Variance_att))) 788 | 789 | post_prob = (class_prior_prob * likelihood)/att_prior_prob 790 | 791 | return post_prob 792 | 793 | 794 | def Multinomial_NB(BET, X ,target): 795 | 796 | l =(len(BET)) 797 | BET.reset_index(drop = True, inplace = True) 798 | x = BET.to_dict(orient='list') 799 | keys =list(x.keys()) 800 | 801 | probability = [] 802 | likelihood = 1 803 | att_prior_prob = 1 804 | class_prior_prob = 1 805 | for i in range(len(BET)): 806 | if keys[i] != target: 807 | sumx = x[target][i][6] 808 | sumxy = x[target][i][10] 809 | likelihood = likelihood*(sumxy/sumx) 810 | 811 | class_prior_prob = (x[target][i][6]/x[target][i][5]) 812 | 813 | count_att = x[target][i][0] 814 | sumxy_att = x[target][i][1] 815 | att_prior_prob = att_prior_prob*(sumxy_att/count_att) 816 | 817 | post_prob = (class_prior_prob * 
likelihood)/att_prior_prob 818 | 819 | return post_prob 820 | 821 | 822 | def SVM_fit(BET, target): 823 | l =(len(BET)) 824 | BET1 = BET 825 | BET1.reset_index(drop = True, inplace = True) 826 | x = BET1.to_dict(orient='list') 827 | keys =list(x.keys()) 828 | k = keys.index(target) 829 | EE = [] 830 | last_row =[] 831 | Ede = [] 832 | count = BET[target][k][0] 833 | for i in range(len(BET)): 834 | if i != keys.index(target): 835 | for j in range(len(BET)): 836 | if j != keys.index(target): 837 | m = keys[i] 838 | n = keys[j] 839 | EE.append(x[m][j][10]) 840 | if j == keys.index(target): 841 | Ede.append(2*(x[m][j][10]) -x[m][i][6]) 842 | EE.append(-x[m][i][6]) 843 | last_row.append(-x[m][i][6]) 844 | final = EE+last_row 845 | final.pop() 846 | final.append(count) 847 | final = np.array(final) 848 | n = (len(BET)) 849 | final = reshape(final,(n,n)) 850 | 851 | Ede.append((count-2*(BET[target][k][1]))) 852 | 853 | I = np.identity(n) 854 | const = (((I/count)+ final)) 855 | 856 | inverse = np.linalg.inv(const) 857 | Beta = np.dot(inverse, np.array(Ede)) 858 | 859 | return(Beta) 860 | 861 | 862 | def SVM_Reg_fit(BET, target,tuning_parameter): 863 | l =(len(BET)) 864 | BET1 = BET 865 | BET1.reset_index(drop = True, inplace = True) 866 | x = BET1.to_dict(orient='list') 867 | keys =list(x.keys()) 868 | k = keys.index(target) 869 | EE = [] 870 | last_row =[] 871 | Ede = [] 872 | count = BET[target][k][0] 873 | for i in range(len(BET)): 874 | if i != keys.index(target): 875 | for j in range(len(BET)): 876 | if j != keys.index(target): 877 | m = keys[i] 878 | n = keys[j] 879 | EE.append(x[m][j][10]) 880 | if j == keys.index(target): 881 | Ede.append(x[m][j][10]) 882 | EE.append(-x[m][i][6]) 883 | last_row.append(-x[m][i][6]) 884 | final = EE+last_row 885 | final.pop() 886 | final.append(count) 887 | final = np.array(final) 888 | n = (len(BET)) 889 | final = reshape(final,(n,n)) 890 | 891 | Ede.append(-(BET[target][k][1])) 892 | print(Ede) 893 | I = np.identity(n) 894 | const = (((I/tuning_parameter)+ final)) 895 | 896 | inverse = np.linalg.inv(const) 897 | Beta = np.dot(inverse, np.array(Ede)) 898 | 899 | return(Beta) 900 | -------------------------------------------------------------------------------- /util/module.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | from numpy import * 4 | import numpy as np 5 | import pandas as pd 6 | from scipy import stats 7 | from scipy.stats import norm 8 | from scipy.stats import chisqprob 9 | import warnings 10 | import matplotlib.pyplot as plt 11 | warnings.filterwarnings('ignore') 12 | 13 | def BET(df): 14 | 15 | """ BET function constructs the Basic Element Table for the Dataframe. BET is the key step for ARTML and 16 | it can be updated with the new data. 17 | 18 | BET function returns basic element table as Pandas Dataframe 19 | 20 | Notes: 21 | ----- 22 | see 'Real Time Data Mining' by Prof. Sayad 23 | 24 | (https://www.researchgate.net/publication/265619432_Real_Time_Data_Mining) 25 | 26 | """ 27 | col = df.columns.tolist() 28 | l = len(col) 29 | x ={} # Creating empty dictionary 30 | for m in range(l): 31 | for n in range(l): 32 | x[m,n] = [] # Creating keys in dictionary with empty lists 33 | 34 | for i in range(l): 35 | for j in range(l): 36 | y=col[j] 37 | z=col[i] 38 | 39 | """ 40 | This code makes calculations for all the basic elements in the table. They are appended to 41 | a lists of a dictionary. 
42 | 43 | """ 44 | count_x = len(df[col[i]]) # count in particular X column 45 | x[i,j].append(count_x) 46 | 47 | sum_x = df[col[i]].sum() # Sum of elemensts in y 48 | x[i,j].append(sum_x) 49 | 50 | sum_x2 = (df[z]*df[z]).sum() # Sum of elemensts in x2 51 | x[i,j].append(sum_x2) 52 | 53 | sum_x3 = (df[col[i]]*df[col[i]]*df[col[i]]).sum() # Sum of elemensts in x3 54 | x[i,j].append(sum_x3) 55 | 56 | sum_x4 = (df[col[i]]*df[col[i]]*df[col[i]]*df[col[i]]).sum() # Sum of elemensts in x4 57 | x[i,j].append(sum_x4) 58 | 59 | count_y = len(df[col[j]]) # count in particular Y column 60 | x[i,j].append(count_y) 61 | 62 | sum_y = df[col[j]].sum() # Sum of elemensts in y 63 | x[i,j].append(sum_y) 64 | 65 | sum_y2 = (df[col[j]]*df[col[j]]).sum() # Sum of elemensts in y2 66 | x[i,j].append(sum_y2) 67 | 68 | sum_y3 = (df[col[j]]*df[col[j]]*df[col[j]]).sum() # Sum of elemensts in y3 69 | x[i,j].append(sum_y3) 70 | 71 | sum_y4 = (df[col[j]]*df[col[j]]*df[col[j]]*df[col[j]]).sum() # Sum of elemensts in y4 72 | x[i,j].append(sum_y4) 73 | 74 | sum_xy = (df[col[i]]*df[col[j]]).sum() # Sum of elemensts in xy 75 | x[i,j].append(sum_xy) 76 | 77 | sum_xy2 = (df[col[i]]*df[col[j]]*df[col[i]]*df[col[j]]).sum() # Sum of elemensts in (xy)2 78 | x[i,j].append(sum_xy2) 79 | 80 | z={} 81 | for m in range(l): # converting the dictionary to DataFrame 82 | z[m] = [] 83 | for i in range(l): 84 | for j in range(l): 85 | z[i].append(x[j,i]) 86 | result = pd.DataFrame(z, index=col) 87 | result.columns = col 88 | return(result) 89 | 90 | def calculate_basic_elements1(x,key,e,c,i,const): 91 | 92 | """ This is an inner function used in learn_by_index & grow_by_index functions for making 93 | calculations to update the BET 94 | 95 | This takes (BET_dictionary, feature_name, feature_index, values_list, i, +1/-1 (const)) as arguments 96 | for making the calculations 97 | """ 98 | 99 | x[key][e][0] = (x[key][e][0]+(const*1)) 100 | 101 | x[key][e][1] = (x[key][e][1]+(const*c[i])) 102 | 103 | x[key][e][2] = (x[key][e][2]+(const*(c[i]*c[i]))) 104 | 105 | x[key][e][3] = (x[key][e][3]+(const*(c[i]*c[i]*c[i]))) 106 | 107 | x[key][e][4] = (x[key][e][4]+(const*(c[i]*c[i]*c[i]*c[i]))) 108 | 109 | x[key][e][5] = (x[key][e][5]+(const*1)) 110 | 111 | x[key][e][6] = (x[key][e][6]+(const*c[i])) 112 | 113 | x[key][e][7] = (x[key][e][7]+(const*(c[i]*c[i]))) 114 | 115 | x[key][e][8] = (x[key][e][8]+(const*(c[i]*c[i]*c[i]))) 116 | 117 | x[key][e][9] = (x[key][e][9]+(const*(c[i]*c[i]*c[i]*c[i]))) 118 | 119 | x[key][e][10] = (x[key][e][10]+(const*(c[i]*c[i]))) 120 | 121 | x[key][e][11] = (x[key][e][11]+(const*(c[i]*c[i]*c[i]*c[i]))) 122 | 123 | return x[key][e] 124 | 125 | 126 | # In[9]: 127 | 128 | def calculate_basic_elements2(x,key,k,b,c,i,m,const): 129 | 130 | """ This is an inner function used in learn_by_index & grow_by_index functions for making 131 | calculations to update the BET 132 | 133 | This takes (BET_dictionary, feature_name, feature_index,feature_names_list, values_list, i, m, +1/-1 (const)) as arguments 134 | for making the calculations 135 | """ 136 | 137 | x[key][k][0] = (x[key][k][0]+(const*1)) 138 | 139 | x[key][k][1] = (x[key][k][1]+(const*c[b.index(m)])) 140 | 141 | x[key][k][2] = (x[key][k][2]+(const*(c[b.index(m)]*c[b.index(m)]))) 142 | 143 | x[key][k][3] = (x[key][k][3]+(const*(c[b.index(m)]*c[b.index(m)]*c[b.index(m)]))) 144 | 145 | x[key][k][4] = (x[key][k][4]+(const*(c[b.index(m)]*c[b.index(m)]*c[b.index(m)]*c[b.index(m)]))) 146 | 147 | x[key][k][5] = (x[key][k][5]+(const*1)) 148 | 149 | x[key][k][6] = 

    x[key][k][7] = x[key][k][7] + (const * (c[i] * c[i]))

    x[key][k][8] = x[key][k][8] + (const * (c[i] * c[i] * c[i]))

    x[key][k][9] = x[key][k][9] + (const * (c[i] * c[i] * c[i] * c[i]))

    x[key][k][10] = x[key][k][10] + (const * (c[i] * c[b.index(m)]))

    x[key][k][11] = x[key][k][11] + (const * (c[i] * c[b.index(m)] * c[i] * c[b.index(m)]))

    return x[key][k]


def learnbyindex(BET, *args):

    """ This function takes the Basic Element Table and feature names & values as arguments and
    updates the corresponding feature columns & rows of the BET with those values.

    Examples
    --------
    learnbyindex(Basic_Element_Table, 'feature_1', 'feature_2', 1, 2)

    The above call updates feature_1 and feature_2 in the BET by the values 1 and 2 respectively.

    """

    BET.reset_index(drop=True, inplace=True)
    x = BET.to_dict(orient='list')           # convert BET to dictionary
    keys = list(x.keys())
    arguments_list = list(args)
    n_features = int(len(arguments_list) / 2)   # number of features given as input for updating the BET

    if len(arguments_list) % 2 != 0:
        print("Error: give a matching set of feature names & corresponding values")

    else:
        feature_names = arguments_list[0:n_features]
        values = arguments_list[n_features:]

        for i in range(len(feature_names)):
            key = feature_names[i]
            e = keys.index(key)
            calculate_basic_elements1(x, key, e, values, i, 1)     # update the diagonal elements of the BET

            for m in feature_names:
                if m != feature_names[i]:
                    k = keys.index(m)
                    calculate_basic_elements2(x, key, k, feature_names, values, i, m, 1)   # update the cross elements

    df = pd.DataFrame(x)
    df.index = keys
    df = df[keys]
    return df


def forgetbyindex(BET, *args):

    """ This function takes the Basic Element Table and feature names & values as arguments and
    updates the given features of the BET by removing the effect of those values.

    Examples
    --------
    forgetbyindex(Basic_Element_Table, 'feature_1', 'feature_2', 1, 2)

    The above call reduces feature_1 and feature_2 in the BET by the values 1 and 2 respectively.
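
    Internally this applies exactly the same element updates as learnbyindex, but with
    const = -1, so forgetting a record is the precise inverse of learning it.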

    """

    BET.reset_index(drop=True, inplace=True)
    x = BET.to_dict(orient='list')           # convert BET to dictionary
    keys = list(x.keys())
    arguments_list = list(args)
    n_features = int(len(arguments_list) / 2)   # number of features given as input for updating the BET

    if len(arguments_list) % 2 != 0:
        print("Error: give a matching set of feature names & corresponding values")
    else:
        feature_names = arguments_list[0:n_features]
        values = arguments_list[n_features:]
        for i in range(n_features):
            key = feature_names[i]
            e = keys.index(key)
            calculate_basic_elements1(x, key, e, values, i, -1)    # remove the values from the diagonal elements

            for m in feature_names:
                if m != feature_names[i]:
                    k = keys.index(m)
                    calculate_basic_elements2(x, key, k, feature_names, values, i, m, -1)   # ...and from the cross elements

    df = pd.DataFrame(x)
    df.index = keys
    df = df[keys]
    return df


def growbyindex(BET, *args):

    """ This function takes the Basic Element Table and feature names & values as arguments and
    grows the BET with new features and their corresponding values.

    Examples
    --------
    growbyindex(Basic_Element_Table, 'new_feature_1', 'new_feature_2', 1, 2)

    The above call adds new_feature_1 and new_feature_2 to the BET with the values 1 and 2
    respectively.

    """

    main_list = list(BET.columns)
    arguments_list = list(args)
    n_features = int(len(arguments_list) / 2)
    if len(arguments_list) % 2 != 0:
        print("Error: give a matching set of feature names & corresponding values")
        return BET

    feature_names = arguments_list[0:n_features]
    values = arguments_list[n_features:]

    for i in range(n_features):
        if feature_names[i] in main_list:
            print('feature already exists! Use the learn functions instead')
            continue

        # one fresh list per cell: [[0]*12]*len(BET) would alias a single list across all rows
        elements = [[0] * 12 for _ in range(len(BET))]
        BET[feature_names[i]] = elements

        new_list = [[0] * 12 for _ in range(len(BET.columns))]
        new_row = pd.DataFrame([new_list], columns=list(BET.columns), index=[feature_names[i]])
        BET = pd.concat([BET, new_row])

    BET.reset_index(drop=True, inplace=True)
    x = BET.to_dict(orient='list')
    keys = list(x.keys())

    for i in range(n_features):
        key = feature_names[i]
        if key not in main_list:
            e = keys.index(key)
            calculate_basic_elements1(x, key, e, values, i, 1)   # seed the new diagonal cell with the given value

    df = pd.DataFrame(BET)
    df.index = keys
    df = df[keys]
    return df


def learn(BET, df):

    """ This function takes the Basic Element Table and a dataframe as inputs and updates the
    BET with the new data in the dataframe (incremental learning of the BET from a dataframe).

    Examples
    --------
    learn(Basic_Element_Table, data_frame)

    The above call updates the Basic_Element_Table with the values in the new dataframe.

    """

    col = list(df.columns)
    for index, row in df.iterrows():
        row1 = []
        for e in col:
            row1.append(row[e])
        arguments = col + row1
        BET = learnbyindex(BET, *arguments)
    return BET


def forget(BET, df):

    """ This function takes the Basic Element Table and a dataframe as inputs and removes the
    effect of that data from the BET (decremental learning of the BET from a dataframe).
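
    Because every basic element is a plain running sum, learn followed by forget on the same
    rows restores the BET to its previous state exactly.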

    Examples
    --------
    forget(Basic_Element_Table, data_frame)

    The above call removes the effect of the values in the given dataframe from the
    Basic_Element_Table.

    """

    col = list(df.columns)
    for index, row in df.iterrows():
        row1 = []
        for e in col:
            row1.append(row[e])
        arguments = col + row1
        BET = forgetbyindex(BET, *arguments)
    return BET


def univariate(BET):

    """
    Univariate analysis explores the variables (attributes) one by one, summarizing each
    attribute with statistical techniques. This summarizes the central tendency, dispersion
    and shape of a dataset's distribution, excluding NaN values.

    The univariate stats calculated are: ['count', 'Mean', 'Variance', 'Standard_deviation',
    'coeff_of_variation', 'skewness', 'Kurtosis']

    Examples
    --------
    univariate(Basic_Element_Table)

    The above call generates the univariate statistics for all the features in the
    Basic_Element_Table and returns them as a Pandas Dataframe.

    """

    l = len(BET)
    BET.reset_index(drop=True, inplace=True)
    x = BET.to_dict(orient='list')           # convert BET to dictionary
    keys = list(x.keys())
    describe = {}

    for i in range(l):
        describe[i] = []
        m = keys[i]

        try:
            count = x[m][i][0]
            describe[i].append(count)
        except Exception:
            describe[i].append(np.nan)
        try:
            Mean = (x[m][i][1]) / count
            describe[i].append(Mean)
        except Exception:
            describe[i].append(np.nan)

        try:
            Variance = ((x[m][i][2]) - (((x[m][i][1]) ** 2) / count)) / count
            describe[i].append(Variance)
        except Exception:
            describe[i].append(np.nan)
        try:
            Standard_deviation = math.sqrt(Variance)
            describe[i].append(Standard_deviation)
        except Exception:
            describe[i].append(np.nan)
        try:
            coeff_of_variation = (Standard_deviation / Mean) * 100
            describe[i].append(coeff_of_variation)
        except Exception:
            describe[i].append(np.nan)

        try:
            skewness = (count / ((count - 1) * (count - 2))) * ((x[m][i][3]) - (3 * Mean * x[m][i][2]) + (3 * (Mean ** 2) * x[m][i][1]) - (count * (Mean ** 3))) / (Standard_deviation ** 3)
            describe[i].append(skewness)
        except Exception:
            describe[i].append(np.nan)
        try:
            Kurtosis = ((((count * (count + 1)) / ((count - 1) * (count - 2) * (count - 3))) * ((1 / Standard_deviation ** 4) * ((x[m][i][4]) - (4 * Mean * (x[m][i][3])) + (6 * (Mean ** 2) * (x[m][i][2])) - (4 * (Mean ** 3) * (x[m][i][1])) + (count * (Mean ** 4))))) - ((3 * (count - 1) ** 2) / ((count - 2) * (count - 3))))
            describe[i].append(Kurtosis)
        except Exception:
            describe[i].append(np.nan)

    names = ['count', 'Mean', 'Variance', 'Standard_deviation', 'coeff_of_variation', 'skewness', 'Kurtosis']
    result = pd.DataFrame(describe, index=names)
    result.columns = keys
    return result


def Covariance(BET):

    """
    This function computes the pairwise covariance of all the features in the BET. Covariance
    describes the direction of the linear relationship between two features.

    Examples
    --------
    Covariance(Basic_Element_Table)

    The above call generates the pairwise covariances for all the features in the
    Basic_Element_Table and returns them as a Pandas Dataframe.
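
    Each entry is computed directly from the basic elements as
    cov(x, y) = (sum(xy) - sum(x) * sum(y) / n) / n.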

    """

    BET.reset_index(drop=True, inplace=True)
    x = BET.to_dict(orient='list')
    keys = list(x.keys())
    covar = {}

    for i in range(len(BET)):
        covar[i] = []
        for j in range(len(BET)):
            m = keys[i]
            try:
                cov = (x[m][j][10] - (((x[m][j][1]) * (x[m][j][6])) / (x[m][j][0]))) / (x[m][j][0])
                covar[i].append(cov)
            except Exception:
                covar[i].append(np.nan)

    result = pd.DataFrame(covar, index=keys)
    result.columns = keys
    return result


def correlation(BET):

    """
    This function computes the pairwise correlations of all the features in the BET. Correlation
    measures how strong the relationship between two variables is.

    Examples
    --------
    correlation(Basic_Element_Table)

    The above call generates the pairwise correlations for all the features in the
    Basic_Element_Table and returns them as a Pandas Dataframe.

    """

    BET.reset_index(drop=True, inplace=True)
    x = BET.to_dict(orient='list')
    keys = list(x.keys())
    corr = {}

    for i in range(len(BET)):
        corr[i] = []
        for j in range(len(BET)):
            m = keys[i]
            try:
                count1 = x[m][j][0]
                count2 = x[m][j][5]
                var1 = ((x[m][j][2]) - (((x[m][j][1]) ** 2) / count1)) / count1
                var2 = ((x[m][j][7]) - (((x[m][j][6]) ** 2) / count2)) / count2
                corrl = ((x[m][j][10] - (((x[m][j][1]) * (x[m][j][6])) / (x[m][j][0]))) / (x[m][j][0])) / (math.sqrt(var1 * var2))
                corr[i].append(corrl)
            except Exception:
                corr[i].append(np.nan)

    result = pd.DataFrame(corr, index=keys)
    result.columns = keys
    return result


def Ztest(BET, col1, col2):

    """ Two-sample Z test on the mean of the numeric feature col1 across the two groups
    defined by the binary (0/1) feature col2. Returns the two-tailed p-value.
    """

    BET.reset_index(drop=True, inplace=True)
    x = BET.to_dict(orient='list')
    keys = list(x.keys())

    # class-1 group: sums of col1 restricted to the rows where col2 == 1
    count = x[col2][keys.index(col1)][6]       # sum(col2), i.e. the number of ones
    sumx = x[col2][keys.index(col1)][10]       # sum(col1 * col2)
    sumx2 = x[col2][keys.index(col1)][11]      # sum((col1 * col2)^2)
    Mean = sumx / count
    Variance = (sumx2 - (((sumx) ** 2) / count)) / count

    # class-0 group: the overall totals minus the class-1 portion
    count_0 = x[col2][keys.index(col1)][0] - x[col2][keys.index(col1)][6]
    sumx_0 = x[col2][keys.index(col1)][1] - x[col2][keys.index(col1)][10]
    sumx2_0 = x[col2][keys.index(col1)][2] - x[col2][keys.index(col1)][11]
    Mean_0 = sumx_0 / count_0
    Variance_0 = (sumx2_0 - (((sumx_0) ** 2) / count_0)) / count_0

    zscore = (Mean_0 - Mean) / (np.sqrt((Variance_0 / count_0) + (Variance / count)))
    prob = 1 - stats.norm.cdf(zscore)
    return 2 * prob


def Ttest(BET, col1, col2):

    """ Two-sample pooled-variance t test on the mean of the numeric feature col1 across the
    two groups defined by the binary (0/1) feature col2. Returns the two-tailed p-value.
    """

    BET.reset_index(drop=True, inplace=True)
    x = BET.to_dict(orient='list')
    keys = list(x.keys())

    # class-1 group: sums of col1 restricted to the rows where col2 == 1
    count = x[col2][keys.index(col1)][6]
    sumx = x[col2][keys.index(col1)][10]
    sumx2 = x[col2][keys.index(col1)][11]
    Mean = sumx / count
    Variance = (sumx2 - (((sumx) ** 2) / count)) / count

    # class-0 group: the overall totals minus the class-1 portion
    count_0 = x[col2][keys.index(col1)][0] - x[col2][keys.index(col1)][6]
    sumx_0 = x[col2][keys.index(col1)][1] - x[col2][keys.index(col1)][10]
    sumx2_0 = x[col2][keys.index(col1)][2] - x[col2][keys.index(col1)][11]
    Mean_0 = sumx_0 / count_0
    Variance_0 = (sumx2_0 - (((sumx_0) ** 2) / count_0)) / count_0

    # pooled within-group variance
    var = (((count_0 - 1) * Variance_0) + ((count - 1) * Variance)) / (count_0 + count - 2)

    tscore = (Mean_0 - Mean) / (np.sqrt(var * ((1 / count_0) + (1 / count))))

    df = count + count_0 - 2                  # degrees of freedom
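
    # two-tailed p-value: twice the upper-tail probability of Student's t
    # with (count + count_0 - 2) degrees of freedom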
    prob = 1 - stats.t.cdf(tscore, df)
    return 2 * prob


def chi2(BET, feature_1, feature_2):

    """ Chi-square test of independence between two categorical features, each given as a list
    of its one-hot (dummy) column names in the BET. Prints the chi-square statistic and the
    degrees of freedom, and returns the p-value.
    """

    BET.reset_index(drop=True, inplace=True)
    x = BET.to_dict(orient='list')
    keys = list(x.keys())
    obs_freq = {}
    exp_freq = {}
    sum_exp_freq_vertical = np.zeros(len(feature_2))
    chi2 = 0

    # observed frequencies: the co-occurrence counts come straight from sum(xy)
    for i in range(len(feature_1)):
        obs_freq[feature_1[i]] = []

        for j in range(len(feature_2)):
            col1 = feature_1[i]
            col2 = feature_2[j]
            sumx = x[col1][keys.index(col2)][10]
            obs_freq[feature_1[i]].append(sumx)

        sum_exp_freq_vertical = sum_exp_freq_vertical + np.array(obs_freq[feature_1[i]])
    total_in_contingency = sum(sum_exp_freq_vertical)

    # expected frequencies under independence: row total * column total / grand total
    for i in range(len(feature_1)):
        exp_freq[feature_1[i]] = []
        sum_exp_freq_horizontal = sum(obs_freq[feature_1[i]])
        for j in range(len(feature_2)):
            e = (sum_exp_freq_horizontal * sum_exp_freq_vertical[j]) / total_in_contingency
            exp_freq[feature_1[i]].append(e)

    for i in range(len(feature_1)):
        for j in range(len(feature_2)):
            chi2 = chi2 + ((obs_freq[feature_1[i]][j] - exp_freq[feature_1[i]][j]) ** 2) / exp_freq[feature_1[i]][j]

    df = (len(feature_1) - 1) * (len(feature_2) - 1)

    p_value = stats.chi2.sf(chi2, df)         # survival function; scipy.stats.chisqprob was removed from SciPy
    print('chi2: ' + str(chi2))
    print('df: ' + str(df))
    print('p-value: ' + str(p_value))
    return p_value


# Models:

def LDA_fit(BET, target):

    """
    Linear Discriminant Analysis (LDA) is a classification method that searches for the linear
    combination of variables (predictors) that best separates the classes (targets).

    It essentially performs supervised dimensionality reduction by projecting the input data
    onto a linear subspace consisting of the directions which maximize the separation between
    classes (maximizing the difference between the group means while reducing the standard
    deviation within the groups).

    Examples
    --------
    LDA_fit(Basic_Element_Table, Target)

    where 'Basic_Element_Table' is obtained from the BET function for the data and 'Target' is
    the binary feature that needs to be predicted.

    The function returns (mean1, mean2, Beta, prob): the mean vectors of the two groups, the
    linear model coefficients and the class-probability threshold respectively.
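
    In terms of the basic elements, the coefficients solve Beta = inv(S_w) * (mean1 - mean2),
    where S_w is the pooled within-class covariance matrix assembled below and mean1/mean2 are
    the class-wise feature means; prob is the threshold -log(count_1/count_2) that LDA_predict
    compares the projections against.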

    """
    BET1 = BET
    BET1.reset_index(drop=True, inplace=True)
    x = BET1.to_dict(orient='list')
    keys = list(x.keys())
    k = keys.index(target)
    count_1 = BET[target][k][0] - BET[target][k][1]   # class-0 count: N - sum(target)
    count_2 = BET[target][k][1]                       # class-1 count: sum(target)
    mean1 = []
    mean2 = []
    c = []
    for i in range(len(BET)):
        if i != keys.index(target):
            mean1.append((BET[target][i][1] - BET[target][i][10]) / (BET[target][i][0] - BET[target][i][6]))
            mean2.append((BET[target][i][10]) / BET[target][i][6])

    for i in range(len(BET)):
        if i != keys.index(target):
            for j in range(len(BET)):
                if j != keys.index(target):
                    m = keys[i]
                    n = keys[j]
                    cal1 = (((x[m][k][6] - x[m][k][10]) * (x[n][k][6] - x[n][k][10])) / count_1)
                    cal2 = (x[m][k][10] * x[n][k][10]) / count_2
                    c.append((x[m][j][10] - cal1 - cal2) / (count_1 + count_2 - 2))

    n = len(BET) - 1
    c = np.reshape(np.array(c), (n, n))
    inverse = np.linalg.inv(c)
    z = np.array(mean1) - np.array(mean2)
    Beta = np.matmul(inverse, z.T)
    prob = -math.log(count_1 / count_2)

    return (mean1, mean2, Beta, prob)


def LDA_predict(BET, X, target):
    """
    Predicts the target values for the given data using the LDA parameters calculated from the
    training dataset. Returns the predictions of the LDA model.

    Examples
    --------
    LDA_predict(Basic_Element_Table, Testing_data, Target)

    The BET table and the testing data should be given as inputs.
    """
    (mean1, mean2, Beta, prob) = LDA_fit(BET, target)
    numpy_matrix = X.values                   # DataFrame.as_matrix() was removed from pandas
    q = []
    for i in range(len(numpy_matrix)):
        z = numpy_matrix[i] - (0.5 * (np.array(mean1) - np.array(mean2)))
        if np.matmul(Beta.T, z) > prob:
            q.append(0)
        else:
            q.append(1)
    return q


def accuracy(y, y_pred):
    y = list(y)
    y_pred = list(y_pred)
    matches = []
    for i in range(len(y)):
        if y[i] == y_pred[i]:
            matches.append(1)
    return (sum(matches) / len(y)) * 100


def PCA(BET):
    """
    Principal component analysis (PCA) is a classical statistical method that uses an orthogonal
    transformation to convert a set of observations of possibly correlated variables into a set
    of values of linearly uncorrelated variables called principal components.

    Real-time principal components for a dataset can be extracted from the ART-M covariance
    matrix equations.

    Examples
    --------
    PCA(Basic_Element_Table)

    This function returns the eigenvalues & eigenvectors for the features in the basic element
    table.
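
    Concretely, the components are the eigenvectors v solving C v = lambda v, where
    C = Covariance(BET); since the BET (and hence C) is updated incrementally, the components
    can be recomputed at any time without revisiting the raw data.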
    """

    cov = Covariance(BET)
    cov_mat = cov.values
    eig_vals, eig_vecs = np.linalg.eig(cov_mat)

    print('Eigenvectors: \n%s' % eig_vecs)
    print('\nEigenvalues: \n%s' % eig_vals)

    # Make a list of (eigenvalue, eigenvector) tuples
    eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i]) for i in range(len(eig_vals))]

    # Sort the (eigenvalue, eigenvector) tuples from high to low
    eig_pairs.sort(key=lambda x: x[0], reverse=True)

    # Visually confirm that the list is correctly sorted by decreasing eigenvalues
    print('\nEigenvalues in descending order:')
    for i in eig_pairs:
        print(i[0])

    return eig_vals, eig_vecs


def MLR(BET, target):

    """ Multiple linear regression from the BET: the coefficients solve
    inv(Cov(features)) * Cov(features, target). Prints the intercept b0 and returns the
    coefficient array.
    """

    row_indexes = list(BET.index)
    target_index = row_indexes.index(target)
    BET_features = BET.drop(target, axis=1)
    BET_features = BET_features.drop(target, axis=0)
    cov_features = Covariance(BET_features).values
    cov_target = Covariance(BET).values
    cov_target = cov_target[target_index]
    cov_target = np.delete(cov_target, target_index)
    inverse = np.linalg.inv(cov_features)
    Beta_array = np.matmul(inverse, cov_target)

    BET.reset_index(drop=True, inplace=True)
    x = BET.to_dict(orient='list')
    keys = list(x.keys())

    mean_target = (BET[target][keys.index(target)][1]) / BET[target][keys.index(target)][0]
    mean_X = []

    for i in range(len(BET)):                 # means of every feature except the target
        if i != keys.index(target):
            mean_X.append((BET[target][i][1]) / BET[target][i][0])

    b0 = mean_target - np.matmul(Beta_array, mean_X)

    print('Intercept b0: ' + str(b0))
    return Beta_array


def gaussian_NB(BET, X, target):

    """ Gaussian Naive Bayes from the BET: returns the posterior probability of the positive
    class for a single record X, whose values must be ordered like the BET columns (the entry
    at the target position is unused).
    """

    BET.reset_index(drop=True, inplace=True)
    x = BET.to_dict(orient='list')
    keys = list(x.keys())

    likelihood = 1
    att_prior_prob = 1
    class_prior_prob = 1
    for i in range(len(BET)):
        if keys[i] != target:
            count = x[target][i][6]
            sumxy = x[target][i][10]
            sumxy2 = x[target][i][11]
            Mean = sumxy / count
            Variance = (sumxy2 - (((sumxy) ** 2) / count)) / count
            value = X[i]
            # Gaussian density of the value under the class-1 distribution
            likelihood = likelihood * (1 / math.sqrt(2 * np.pi * Variance)) * np.exp(-((value - Mean) ** 2) / (2 * Variance))

            class_prior_prob = count / x[target][i][5]

            count_att = x[target][i][0]
            sumxy_att = x[target][i][1]
            sumxy2_att = x[target][i][2]
            Mean_att = sumxy_att / count_att
            Variance_att = (sumxy2_att - (((sumxy_att) ** 2) / count_att)) / count_att

            # Gaussian density of the value under the overall attribute distribution
            att_prior_prob = att_prior_prob * (1 / math.sqrt(2 * np.pi * Variance_att)) * np.exp(-((value - Mean_att) ** 2) / (2 * Variance_att))

    post_prob = (class_prior_prob * likelihood) / att_prior_prob

    return post_prob


def Multinomial_NB(BET, X, target):

    """ Multinomial Naive Bayes from the BET: returns the posterior probability of the positive
    class built from the stored sums (note that the record X is not yet used in this
    implementation).
    """

    BET.reset_index(drop=True, inplace=True)
    x = BET.to_dict(orient='list')
    keys = list(x.keys())

    likelihood = 1
    att_prior_prob = 1
    class_prior_prob = 1
    for i in range(len(BET)):
        if keys[i] != target:
            sumx = x[target][i][6]
            sumxy = x[target][i][10]
            likelihood = likelihood * (sumxy / sumx)

            class_prior_prob = x[target][i][6] / x[target][i][5]

            count_att = x[target][i][0]
            sumxy_att = x[target][i][1]
            att_prior_prob = att_prior_prob * (sumxy_att / count_att)

    post_prob = (class_prior_prob * likelihood) / att_prior_prob

    return post_prob


def SVM_fit(BET, target):

    """ Fits a linear SVM-style classifier from the Basic Element Table by solving a
    regularised linear system assembled from the basic elements (a least-squares
    formulation; see 'Real Time Data Mining' by Prof. Sayad). Returns the coefficient
    vector Beta.
    """

    BET1 = BET
    BET1.reset_index(drop=True, inplace=True)
    x = BET1.to_dict(orient='list')
    keys = list(x.keys())
    k = keys.index(target)
    EE = []
    last_row = []
    Ede = []
    count = BET[target][k][0]
    for i in range(len(BET)):
        if i != keys.index(target):
            m = keys[i]
            for j in range(len(BET)):
                if j != keys.index(target):
                    EE.append(x[m][j][10])
                if j == keys.index(target):
                    Ede.append(2 * (x[m][j][10]) - x[m][i][6])
                    EE.append(-x[m][i][6])
                    last_row.append(-x[m][i][6])
    # bottom row: the negated per-feature sums followed by the total count;
    # the assembled system needs exactly n*n entries for the reshape below
    final = EE + last_row
    final.append(count)
    n = len(BET)
    final = np.reshape(np.array(final), (n, n))

    Ede.append(count - 2 * (BET[target][k][1]))

    I = np.identity(n)
    const = (I / count) + final

    inverse = np.linalg.inv(const)
    Beta = np.dot(inverse, np.array(Ede))

    return Beta


def SVM_Reg_fit(BET, target, tuning_parameter):

    """ Fits a linear SVM-style regressor from the Basic Element Table; tuning_parameter
    controls the regularisation term added to the diagonal of the assembled system.
    Returns the coefficient vector Beta.
    """

    BET1 = BET
    BET1.reset_index(drop=True, inplace=True)
    x = BET1.to_dict(orient='list')
    keys = list(x.keys())
    k = keys.index(target)
    EE = []
    last_row = []
    Ede = []
    count = BET[target][k][0]
    for i in range(len(BET)):
        if i != keys.index(target):
            m = keys[i]
            for j in range(len(BET)):
                if j != keys.index(target):
                    EE.append(x[m][j][10])
                if j == keys.index(target):
                    Ede.append(x[m][j][10])
                    EE.append(-x[m][i][6])
                    last_row.append(-x[m][i][6])
    # bottom row: the negated per-feature sums followed by the total count
    final = EE + last_row
    final.append(count)
    n = len(BET)
    final = np.reshape(np.array(final), (n, n))

    Ede.append(-(BET[target][k][1]))

    I = np.identity(n)
    const = (I / tuning_parameter) + final

    inverse = np.linalg.inv(const)
    Beta = np.dot(inverse, np.array(Ede))

    return Beta
--------------------------------------------------------------------------------
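
# A minimal end-to-end usage sketch of the incremental workflow in util/module.py, assuming
# util is importable as a package; the synthetic dataframe and column names below are
# illustrative, not part of the library.
import numpy as np
import pandas as pd
from util.module import BET, learn, forget, univariate, correlation, Ttest

np.random.seed(0)
df = pd.DataFrame({'f1': np.random.rand(100),
                   'f2': np.random.rand(100),
                   'target': np.random.randint(0, 2, 100)})

bet = BET(df)                      # build the Basic Element Table once
bet = learn(bet, df.tail(5))       # incrementally learn five extra rows
bet = forget(bet, df.tail(5))      # ...then remove their effect again
print(univariate(bet))             # summary statistics from the BET alone
print(correlation(bet))            # pairwise correlations without the raw data
print(Ttest(bet, 'f1', 'target'))  # two-tailed p-value for f1 across the target groups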