├── artml
│   ├── metrics
│   │   ├── scores.py
│   │   ├── accuracy_score.py
│   │   └── plots.py
│   ├── explore
│   │   ├── Ztest.py
│   │   ├── Ttest.py
│   │   ├── covariance.py
│   │   ├── correlation.py
│   │   ├── chi2.py
│   │   ├── univariate.py
│   │   └── stats.py
│   ├── feature_selection
│   │   └── mahalanobis_features.py
│   └── bet.py
├── python
│   └── artml
│       ├── metrics
│       │   ├── scores.py
│       │   ├── accuracy_score.py
│       │   └── plots.py
│       ├── explore
│       │   ├── Ztest.py
│       │   ├── Ttest.py
│       │   ├── covariance.py
│       │   ├── correlation.py
│       │   ├── chi2.py
│       │   ├── univariate.py
│       │   └── stats.py
│       ├── feature_selection
│       │   └── mahalanobis_features.py
│       └── bet.py
├── README.md
├── examples
│   └── datasets
│       └── iris.csv
├── module.py
└── util
    └── module.py

/artml/metrics/scores.py:
--------------------------------------------------------------------------------

# coding: utf-8

def accuracy(y_true, y_pred):
    # Fraction of positions where the prediction matches the truth, as a percentage.
    y_true = list(y_true)
    y_pred = list(y_pred)
    matches = []
    for i in range(len(y_true)):
        if y_true[i] == y_pred[i]:
            matches.append(1)
    return (sum(matches) / len(y_true)) * 100

--------------------------------------------------------------------------------
/artml/metrics/accuracy_score.py:
--------------------------------------------------------------------------------

# coding: utf-8

def accuracy_score(y_true, y_pred):
    # Fraction of positions where the prediction matches the truth, as a percentage.
    y_true = list(y_true)
    y_pred = list(y_pred)
    matches = []
    for i in range(len(y_true)):
        if y_true[i] == y_pred[i]:
            matches.append(1)
    return (sum(matches) / len(y_true)) * 100

--------------------------------------------------------------------------------
/artml/explore/Ztest.py:
--------------------------------------------------------------------------------

# coding: utf-8

import numpy as np
import pandas as pd
from scipy import stats  # required for stats.norm.cdf below
import warnings
warnings.filterwarnings('ignore')

def Ztest(BET, col1, col2):

    BET.reset_index(drop=True, inplace=True)
    x = BET.to_dict(orient='list')
    keys = list(x.keys())

    # BET cell elements (per the other explore functions): [0] count,
    # [1] sum x, [2] sum x^2, [6] sum y, [10] sum x*y, [11] sum x^2*y.
    # With a binary col2, these give the stats of col1 where col2 == 1:
    count = x[col2][keys.index(col1)][6]
    sumx = x[col2][keys.index(col1)][10]
    sumx2 = x[col2][keys.index(col1)][11]
    Mean = sumx / count
    Variance = (sumx2 - ((sumx ** 2) / count)) / count

    # ... and, by subtraction from the overall sums, where col2 == 0.
    # Note: x[col1][keys.index(col1)][10] is the diagonal cell's sum(x*x),
    # i.e. the overall sum of squares of col1.
    count_0 = x[col2][keys.index(col1)][0] - x[col2][keys.index(col1)][6]
    sumx_0 = x[col2][keys.index(col1)][1] - x[col2][keys.index(col1)][10]
    sumx2_0 = x[col1][keys.index(col1)][10] - x[col2][keys.index(col1)][11]
    Mean_0 = sumx_0 / count_0
    Variance_0 = (sumx2_0 - ((sumx_0 ** 2) / count_0)) / count_0

    # Two-sample z test of the two group means; return the two-sided p-value.
    zscore = (Mean_0 - Mean) / (np.sqrt((Variance_0 / count_0) + (Variance / count)))
    prob = 1 - stats.norm.cdf(zscore)
    return 2 * prob

--------------------------------------------------------------------------------
/artml/explore/Ttest.py:
--------------------------------------------------------------------------------

# coding: utf-8

import numpy as np
import pandas as pd
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

def Ttest(BET, col1, col2):

    BET.reset_index(drop=True, inplace=True)
    x = BET.to_dict(orient='list')
    keys = list(x.keys())

    # Stats of col1 in the group where the binary col2 == 1 (see Ztest.py
    # for the BET cell layout) ...
    count = x[col2][keys.index(col1)][6]
    sumx = x[col2][keys.index(col1)][10]
    sumx2 = x[col2][keys.index(col1)][11]
    Mean = sumx / count
    Variance = (sumx2 - ((sumx ** 2) / count)) / count

    # ... and in the complementary group where col2 == 0.
    count_0 = x[col2][keys.index(col1)][0] - x[col2][keys.index(col1)][6]
    sumx_0 = x[col2][keys.index(col1)][1] - x[col2][keys.index(col1)][10]
    sumx2_0 = x[col1][keys.index(col1)][10] - x[col2][keys.index(col1)][11]
    Mean_0 = sumx_0 / count_0
    Variance_0 = (sumx2_0 - ((sumx_0 ** 2) / count_0)) / count_0

    # Pooled variance, two-sample t statistic, and the two-sided p-value.
    var = (((count_0 - 1) * Variance_0) + ((count - 1) * Variance)) / (count_0 + count - 2)
    tscore = (Mean_0 - Mean) / (np.sqrt(var * ((1 / count_0) + (1 / count))))
    df = (count + count_0 - 2)

    prob = 1 - stats.t.cdf(tscore, df)
    return 2 * prob

--------------------------------------------------------------------------------
/artml/explore/covariance.py:
--------------------------------------------------------------------------------

# coding: utf-8

import math
import numpy as np
import pandas as pd
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

def covariance(BET):

    """
    This function computes the pairwise covariance of all features in BET. Covariance
    describes the linear relationship between two features.

    Examples
    --------
    covariance(Basic_Element_Table)

    The above call generates the pairwise covariance for all the features in the Basic_Element_Table.
27 | 28 | function returns Covariance as Pandas Dataframe. 29 | 30 | """ 31 | 32 | l =(len(BET)) 33 | BET.reset_index(drop = True, inplace = True) 34 | x = BET.to_dict(orient='list') 35 | keys =list(x.keys()) 36 | covar = {} 37 | 38 | for i in range(len(BET)): 39 | covar[i] = [] 40 | for j in range(len(BET)): 41 | m = keys[i] 42 | try: 43 | cov = (x[m][j][10]-(((x[m][j][1])*(x[m][j][6]))/(x[m][j][0])))/(x[m][j][0]) 44 | covar[i].append(cov) 45 | except: 46 | covar[i].append('NaN') 47 | 48 | result = pd.DataFrame(covar, index=keys) 49 | result.columns = keys 50 | return(result) 51 | 52 | 53 | # In[ ]: 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /artml/explore/correlation.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[9]: 5 | 6 | import math 7 | import numpy as np 8 | import pandas as pd 9 | from scipy import stats 10 | import warnings 11 | warnings.filterwarnings('ignore') 12 | 13 | 14 | # In[11]: 15 | 16 | def correlation(BET): 17 | 18 | """ 19 | This function computes pairwise correlations of all features in BET. correlation measures 20 | how strong a relationship is between two variables. 21 | 22 | Examples 23 | -------- 24 | correlation(Basic_Element_Table) 25 | 26 | The above function generates pairwise correlations for all the features in the Basic_Element_Table. 27 | 28 | function returns correlations as Pandas Dataframe. 29 | 30 | """ 31 | 32 | l =(len(BET)) 33 | BET.reset_index(drop = True, inplace = True) 34 | x = BET.to_dict(orient='list') 35 | keys =list(x.keys()) 36 | corr = {} 37 | 38 | for i in range(len(BET)): 39 | corr[i] = [] 40 | for j in range(len(BET)): 41 | m = keys[i] 42 | count1 = x[m][j][0] 43 | count2 = x[m][j][5] 44 | try: 45 | var1 = ((x[m][j][2])-(((x[m][j][1])**2)/count1))/count1 46 | var2 = ((x[m][j][7])-(((x[m][j][6])**2)/count2))/count2 47 | corrl = ((x[m][j][10]-(((x[m][j][1])*(x[m][j][6]))/(x[m][j][0])))/(x[m][j][0]))/(math.sqrt(var1*var2)) 48 | corr[i].append(corrl) 49 | except: 50 | corr[i].append('NaN') 51 | 52 | result = pd.DataFrame(corr, index=keys) 53 | result.columns = keys 54 | return(result) 55 | 56 | 57 | # In[ ]: 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /python/artml/explore/correlation.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[9]: 5 | 6 | import math 7 | import numpy as np 8 | import pandas as pd 9 | from scipy import stats 10 | import warnings 11 | warnings.filterwarnings('ignore') 12 | 13 | 14 | # In[11]: 15 | 16 | def correlation(BET): 17 | 18 | """ 19 | This function computes pairwise correlations of all features in BET. correlation measures 20 | how strong a relationship is between two variables. 21 | 22 | Examples 23 | -------- 24 | correlation(Basic_Element_Table) 25 | 26 | The above function generates pairwise correlations for all the features in the Basic_Element_Table. 27 | 28 | function returns correlations as Pandas Dataframe. 
29 | 30 | """ 31 | 32 | l =(len(BET)) 33 | BET.reset_index(drop = True, inplace = True) 34 | x = BET.to_dict(orient='list') 35 | keys =list(x.keys()) 36 | corr = {} 37 | 38 | for i in range(len(BET)): 39 | corr[i] = [] 40 | for j in range(len(BET)): 41 | m = keys[i] 42 | count1 = x[m][j][0] 43 | count2 = x[m][j][5] 44 | try: 45 | var1 = ((x[m][j][2])-(((x[m][j][1])**2)/count1))/count1 46 | var2 = ((x[m][j][7])-(((x[m][j][6])**2)/count2))/count2 47 | corrl = ((x[m][j][10]-(((x[m][j][1])*(x[m][j][6]))/(x[m][j][0])))/(x[m][j][0]))/(math.sqrt(var1*var2)) 48 | corr[i].append(corrl) 49 | except: 50 | corr[i].append('NaN') 51 | 52 | result = pd.DataFrame(corr, index=keys) 53 | result.columns = keys 54 | return(result) 55 | 56 | 57 | # In[ ]: 58 | 59 | 60 | 61 | -------------------------------------------------------------------------------- /artml/explore/chi2.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[9]: 5 | 6 | import math 7 | import numpy as np 8 | import pandas as pd 9 | from scipy import stats 10 | from scipy.stats import chisqprob 11 | import warnings 12 | warnings.filterwarnings('ignore') 13 | 14 | 15 | # In[14]: 16 | 17 | def chi2(BET, feature_1 , feature_2): 18 | 19 | l =(len(BET)) 20 | BET.reset_index(drop = True, inplace = True) 21 | x = BET.to_dict(orient='list') 22 | keys =list(x.keys()) 23 | obs_freq = {} 24 | exp_freq = {} 25 | sum_exp_freq_vertical = np.zeros(len(feature_2)) 26 | chi2 = 0 27 | 28 | for i in range(len(feature_1)): 29 | obs_freq[feature_1[i]] = [] 30 | 31 | for j in range(len(feature_2)): 32 | col1 = (feature_1[i]) 33 | col2 = (feature_2[j]) 34 | sumx = x[col1][keys.index(col2)][10] 35 | obs_freq[feature_1[i]].append(sumx) 36 | 37 | sum_exp_freq_vertical = sum_exp_freq_vertical + np.array(obs_freq[feature_1[i]]) 38 | total_in_contingency = sum(sum_exp_freq_vertical) 39 | 40 | for i in range(len(feature_1)): 41 | exp_freq[feature_1[i]] = [] 42 | sum_exp_freq_horizontal = sum(obs_freq[feature_1[i]]) 43 | for j in range(len(feature_2)): 44 | e = (sum_exp_freq_horizontal*sum_exp_freq_vertical[j])/total_in_contingency 45 | exp_freq[feature_1[i]].append(e) 46 | 47 | for i in range(len(feature_1)): 48 | for j in range(len(feature_2)): 49 | chi2 = chi2 + ((obs_freq[feature_1[i]][j] - exp_freq[feature_1[i]][j])**2)/exp_freq[feature_1[i]][j] 50 | 51 | 52 | df = (len(feature_1) - 1)*(len(feature_2)-1) 53 | 54 | print('chi2: ' + str(chi2)) 55 | print('df: ' + str(df)) 56 | print('chisqprob: ' + str(chisqprob(chi2, df))) 57 | return(chisqprob(chi2, df)) 58 | 59 | 60 | # In[ ]: 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /python/artml/explore/chi2.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[9]: 5 | 6 | import math 7 | import numpy as np 8 | import pandas as pd 9 | from scipy import stats 10 | from scipy.stats import chisqprob 11 | import warnings 12 | warnings.filterwarnings('ignore') 13 | 14 | 15 | # In[14]: 16 | 17 | def chi2(BET, feature_1 , feature_2): 18 | 19 | l =(len(BET)) 20 | BET.reset_index(drop = True, inplace = True) 21 | x = BET.to_dict(orient='list') 22 | keys =list(x.keys()) 23 | obs_freq = {} 24 | exp_freq = {} 25 | sum_exp_freq_vertical = np.zeros(len(feature_2)) 26 | chi2 = 0 27 | 28 | for i in range(len(feature_1)): 29 | obs_freq[feature_1[i]] = [] 30 | 31 | for j in range(len(feature_2)): 32 | col1 = (feature_1[i]) 33 | col2 = (feature_2[j]) 34 | 
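            # Element [10] of the BET cell for (col1, col2) is the running sum of
            # col1*col2; for one-hot indicator columns this is exactly the observed
            # co-occurrence frequency used as a contingency-table entry.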
            sumx = x[col1][keys.index(col2)][10]
            obs_freq[feature_1[i]].append(sumx)

        sum_exp_freq_vertical = sum_exp_freq_vertical + np.array(obs_freq[feature_1[i]])
    total_in_contingency = sum(sum_exp_freq_vertical)

    # Expected frequencies under independence: row total * column total / grand total.
    for i in range(len(feature_1)):
        exp_freq[feature_1[i]] = []
        sum_exp_freq_horizontal = sum(obs_freq[feature_1[i]])
        for j in range(len(feature_2)):
            e = (sum_exp_freq_horizontal * sum_exp_freq_vertical[j]) / total_in_contingency
            exp_freq[feature_1[i]].append(e)

    # Pearson's chi-squared statistic accumulated over all cells.
    for i in range(len(feature_1)):
        for j in range(len(feature_2)):
            chi2 = chi2 + ((obs_freq[feature_1[i]][j] - exp_freq[feature_1[i]][j]) ** 2) / exp_freq[feature_1[i]][j]

    df = (len(feature_1) - 1) * (len(feature_2) - 1)

    print('chi2: ' + str(chi2))
    print('df: ' + str(df))
    print('chisqprob: ' + str(chisqprob(chi2, df)))
    return chisqprob(chi2, df)

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
[![made-with-python](https://img.shields.io/badge/Made%20with-Python-1f425f.svg)](https://www.python.org/) [![PyPI license](https://img.shields.io/pypi/l/ansicolortags.svg)](https://github.com/angular/angular.js/blob/master/LICENSE) [![PyPI status](https://img.shields.io/pypi/status/ansicolortags.svg)](https://github.com/AdaptiveMachineLearning) [![Documentation Status](https://readthedocs.org/projects/ansicolortags/badge/?version=latest)](https://adaptivemachinelearning.github.io/) [![GitHub release](https://img.shields.io/github/release/Naereen/StrapDown.js.svg)](https://github.com/AdaptiveMachineLearning/artml/tree/master/python/artml)

# Fork the World of Real Time Learning

ARTML is a high-level machine learning API, written in Python, for building and running linear models. It was developed with a focus on enabling continuous, real-time learning in distributed environments. The current hype is about deep learning, but the future is deep with real learning. Welcome to the world of real learning!

Read the documentation at [adaptivemachinelearning.io](https://adaptivemachinelearning.github.io/)

## Adaptive Real Time Machine Learning (ART-ML)

The term "Real Time" describes how well predictive modeling algorithms can accommodate an ever-increasing data load instantaneously. Such problems clash with the fact that conventional data mining algorithms operate in batch mode, where having all of the relevant data at once is a requirement. Real Time Machine Learning is therefore defined here as having all of the following characteristics, independent of the amount of data involved:

![ARTML6](https://user-images.githubusercontent.com/36970153/55763008-58bb1b80-5a33-11e9-8255-ab4068373eef.JPG)

**Incremental learning (Learn)**: Immediately updating a model with each new observation, without the necessity of pooling new data with old data.

**Decremental learning (Forget)**: Immediately updating a model by excluding observations identified as adversely affecting model performance, without forming a new dataset that omits this data and returning to the model formulation step.

**Variable addition (Grow)**: Adding a new attribute (variable) on the fly, without the necessity of pooling new data with old data. A toy sketch of the Learn and Forget updates follows below.
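As a minimal illustration (this class is not part of the library), Learn and Forget reduce to adding and subtracting running sums. ARTML's Basic Element Table stores exactly this kind of per-feature element (count, Σx, Σx², ...), as the formulas in `artml/explore` show:

```python
class RunningStats:
    """Toy sufficient statistics supporting Learn (add) and Forget (remove)."""

    def __init__(self):
        self.n = 0       # count
        self.sx = 0.0    # running sum of x
        self.sx2 = 0.0   # running sum of x**2

    def learn(self, x):
        # Incremental learning: fold one new observation into the sums.
        self.n += 1
        self.sx += x
        self.sx2 += x * x

    def forget(self, x):
        # Decremental learning: remove a previously learned observation.
        self.n -= 1
        self.sx -= x
        self.sx2 -= x * x

    def mean(self):
        return self.sx / self.n

    def variance(self):
        # Same (sum_x2 - sum_x**2 / n) / n form used throughout artml.explore.
        return (self.sx2 - self.sx ** 2 / self.n) / self.n
```

No refit is ever needed: the mean and variance are always recomputable from the current sums.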
**Variable deletion (Shrink)**: Immediately discontinuing use of an attribute identified as adversely affecting model performance.

**Distributed processing**: Separately processing distributed data or segments of large data (which may be located in diverse geographic locations) and recombining the results to obtain a single model.

**Parallel processing**: Carrying out processing extremely rapidly across multiple conventional processing units (multi-threads, multi-processors or a specialized chip).

Project in PROGRESS...

### The ARTML models section is not open source as of now. It will be published soon!

Have any questions? Shoot me an email and I shall get back to you asap!

**Email Id**: sundeep.pothula@mail.utoronto.ca

Happy Continual Learning!

--------------------------------------------------------------------------------
/artml/explore/univariate.py:
--------------------------------------------------------------------------------

# coding: utf-8

import math
import numpy as np
import pandas as pd
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

def univariate(BET):

    """
    Univariate analysis explores variables (attributes) one by one by summarizing each attribute
    using statistical techniques. This summarizes the central tendency, dispersion and shape of
    a dataset's distribution, excluding NaN values.

    Univariate stats calculated are: ['count', 'Mean', 'Variance', 'Standard_deviation', 'coeff_of_variation', 'skewness', 'Kurtosis']

    Examples
    --------
    univariate(Basic_Element_Table)

    The above call generates univariate statistics for all the features in the Basic_Element_Table.

    The function returns the univariate stats as a Pandas Dataframe.
32 | 33 | """ 34 | 35 | l =(len(BET)) 36 | BET.reset_index(drop = True, inplace = True) 37 | x = BET.to_dict(orient='list') # convert BET to dictionary 38 | keys =list(x.keys()) 39 | describe = {} 40 | 41 | for i in range(l): 42 | describe[i] = [] 43 | m = keys[i] 44 | 45 | try: 46 | count = x[m][i][0] 47 | describe[i].append(count) 48 | except: 49 | describe[i].append('NaN') 50 | try: 51 | Mean = (x[m][i][1])/count 52 | describe[i].append(Mean) 53 | except: 54 | describe[i].append('NaN') 55 | 56 | try: 57 | Variance = ((x[m][i][2])-(((x[m][i][1])**2)/count))/count 58 | describe[i].append(Variance) 59 | except: 60 | describe[i].append('NaN') 61 | try: 62 | Standard_deviation = math.sqrt(Variance) 63 | describe[i].append(Standard_deviation) 64 | except: 65 | describe[i].append('NaN') 66 | try: 67 | coeff_of_variation = (Standard_deviation/Mean)*100 68 | describe[i].append(coeff_of_variation) 69 | except: 70 | describe[i].append('NaN') 71 | 72 | try: 73 | skewness = (count/((count-1)*(count-2)))*((x[m][i][3])-(3*Mean*x[m][i][2])+(3*(Mean**2)*x[m][i][1])-(count*(Mean**3)))/(Standard_deviation**3) 74 | describe[i].append(skewness) 75 | except: 76 | describe[i].append('NaN') 77 | try: 78 | Kurtosis = (((((count)*(count+1))/((count-1)*(count-2)*(count-3)))*((1/Standard_deviation**4)*((x[m][i][4])-(4*Mean*(x[m][i][3]))+(6*(Mean**2)*(x[m][i][2]))-(4*(Mean**3)*(x[m][i][1]))+(count*(Mean**4)))))-((3*(count-1)**2)/((count-2)*(count-3)))) 79 | describe[i].append(Kurtosis) 80 | except: 81 | describe[i].append('NaN') 82 | 83 | names =['count','Mean','Variance','Standard_deviation','coeff_of_variation','skewness','Kurtosis'] 84 | result = pd.DataFrame(describe, index=names) 85 | result.columns = keys 86 | return(result) 87 | 88 | -------------------------------------------------------------------------------- /python/artml/explore/univariate.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | import math 7 | import numpy as np 8 | import pandas as pd 9 | from scipy import stats 10 | import warnings 11 | warnings.filterwarnings('ignore') 12 | 13 | 14 | # In[2]: 15 | 16 | def univariate(BET): 17 | 18 | """ 19 | Univariate analysis explores variables (attributes) one by one by summarizing each attribute 20 | using statistical techniques. This summarizes the central tendency, dispersion and shape of 21 | a dataset’s distribution, excluding NaN values. 22 | 23 | univariate Stats calculated are: ['count','Mean','Variance','Standard_deviation','coeff_of_variation','skewness','Kurtosis'] 24 | 25 | Examples 26 | -------- 27 | univariate(Basic_Element_Table) 28 | 29 | The above function generates Univariate statistics for all the features in the Basic_Element_Table. 30 | 31 | function returns univariate stats as Pandas Dataframe. 
32 | 33 | """ 34 | 35 | l =(len(BET)) 36 | BET.reset_index(drop = True, inplace = True) 37 | x = BET.to_dict(orient='list') # convert BET to dictionary 38 | keys =list(x.keys()) 39 | describe = {} 40 | 41 | for i in range(l): 42 | describe[i] = [] 43 | m = keys[i] 44 | 45 | try: 46 | count = x[m][i][0] 47 | describe[i].append(count) 48 | except: 49 | describe[i].append('NaN') 50 | try: 51 | Mean = (x[m][i][1])/count 52 | describe[i].append(Mean) 53 | except: 54 | describe[i].append('NaN') 55 | 56 | try: 57 | Variance = ((x[m][i][2])-(((x[m][i][1])**2)/count))/count 58 | describe[i].append(Variance) 59 | except: 60 | describe[i].append('NaN') 61 | try: 62 | Standard_deviation = math.sqrt(Variance) 63 | describe[i].append(Standard_deviation) 64 | except: 65 | describe[i].append('NaN') 66 | try: 67 | coeff_of_variation = (Standard_deviation/Mean)*100 68 | describe[i].append(coeff_of_variation) 69 | except: 70 | describe[i].append('NaN') 71 | 72 | try: 73 | skewness = (count/((count-1)*(count-2)))*((x[m][i][3])-(3*Mean*x[m][i][2])+(3*(Mean**2)*x[m][i][1])-(count*(Mean**3)))/(Standard_deviation**3) 74 | describe[i].append(skewness) 75 | except: 76 | describe[i].append('NaN') 77 | try: 78 | Kurtosis = (((((count)*(count+1))/((count-1)*(count-2)*(count-3)))*((1/Standard_deviation**4)*((x[m][i][4])-(4*Mean*(x[m][i][3]))+(6*(Mean**2)*(x[m][i][2]))-(4*(Mean**3)*(x[m][i][1]))+(count*(Mean**4)))))-((3*(count-1)**2)/((count-2)*(count-3)))) 79 | describe[i].append(Kurtosis) 80 | except: 81 | describe[i].append('NaN') 82 | 83 | names =['count','Mean','Variance','Standard_deviation','coeff_of_variation','skewness','Kurtosis'] 84 | result = pd.DataFrame(describe, index=names) 85 | result.columns = keys 86 | return(result) 87 | 88 | -------------------------------------------------------------------------------- /artml/metrics/plots.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | import pandas as pd 7 | import numpy as np 8 | from sklearn import metrics 9 | import matplotlib.pyplot as plt 10 | try: 11 | import scikitplot as skplt 12 | except: 13 | get_ipython().system('pip install scikitplot') 14 | import scikitplot as skplt 15 | from sklearn.metrics import precision_recall_curve 16 | 17 | 18 | # In[2]: 19 | 20 | def roc_curve(y_test, predicted_probas): 21 | skplt.metrics.plot_roc_curve(y_test, predicted_probas) 22 | print(plt.show()) 23 | 24 | 25 | # In[3]: 26 | 27 | def cumulative_gain(y_test, predicted_probas): 28 | skplt.metrics.plot_cumulative_gain(y_test, predicted_probas) 29 | print(plt.show()) 30 | 31 | 32 | # In[4]: 33 | 34 | def precision_recall_vs_threshold(y_test, predicted_probas): 35 | precisions, recalls, thresholds = precision_recall_curve(y_test, predicted_probas) 36 | """ 37 | Modified from: 38 | Hands-On Machine learning with Scikit-Learn 39 | and TensorFlow; p.89 40 | """ 41 | plt.figure(figsize=(8, 8)) 42 | plt.title("Precision and Recall Scores as a function of the decision threshold") 43 | plt.plot(thresholds, precisions[:-1], "b--", label="Precision") 44 | plt.plot(thresholds, recalls[:-1], "g-", label="Recall") 45 | plt.ylabel("Score") 46 | plt.xlabel("Decision Threshold") 47 | plt.legend(loc='best') 48 | 49 | print(plt.show()) 50 | 51 | 52 | # In[5]: 53 | 54 | def adjusted_classes(y_scores, t): 55 | """ 56 | This function adjusts class predictions based on the prediction threshold (t). 57 | Will only work for binary classification problems. 
58 | """ 59 | return [1 if y >= t else 0 for y in y_scores] 60 | 61 | def precision_recall_threshold(y_test, predicted_probas, t=0.5): 62 | """ 63 | plots the precision recall curve and shows the current value for each 64 | by identifying the classifier's threshold (t). 65 | """ 66 | 67 | # generate new class predictions based on the adjusted_classes 68 | # function above and view the resulting confusion matrix. 69 | p, r, thresholds = precision_recall_curve(y_test, predicted_probas) 70 | y_pred_adj = adjusted_classes(predicted_probas, t) 71 | #print(confusion_matrix(y_test, y_pred_adj)) 72 | 73 | # plot the curve 74 | plt.figure(figsize=(8,8)) 75 | plt.title("Precision and Recall curve ^ = current threshold") 76 | plt.step(r, p, color='b', alpha=0.2, 77 | where='post') 78 | plt.fill_between(r, p, step='post', alpha=0.2, 79 | color='b') 80 | plt.ylim([0, 1.01]); 81 | plt.xlim([0, 1.01]); 82 | plt.xlabel('Recall'); 83 | plt.ylabel('Precision'); 84 | 85 | # plot the current threshold on the line 86 | close_default_clf = np.argmin(np.abs(thresholds - t)) 87 | plt.plot(r[close_default_clf], p[close_default_clf], '^', c='k', 88 | markersize=15) 89 | plt.show() 90 | 91 | 92 | # In[6]: 93 | 94 | def confusion_matrix(y_test, y_pred): 95 | skplt.metrics.plot_confusion_matrix(y_test, y_pred, normalize=True) 96 | print(plt.show()) 97 | 98 | 99 | # In[7]: 100 | 101 | def precision_recall(y_test, predicted_probas): 102 | skplt.metrics.plot_precision_recall_curve(y_test, predicted_probas) 103 | print(plt.show()) 104 | 105 | -------------------------------------------------------------------------------- /python/artml/metrics/plots.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | import pandas as pd 7 | import numpy as np 8 | from sklearn import metrics 9 | import matplotlib.pyplot as plt 10 | try: 11 | import scikitplot as skplt 12 | except: 13 | get_ipython().system('pip install scikitplot') 14 | import scikitplot as skplt 15 | from sklearn.metrics import precision_recall_curve 16 | 17 | 18 | # In[2]: 19 | 20 | def roc_curve(y_test, predicted_probas): 21 | skplt.metrics.plot_roc_curve(y_test, predicted_probas) 22 | print(plt.show()) 23 | 24 | 25 | # In[3]: 26 | 27 | def cumulative_gain(y_test, predicted_probas): 28 | skplt.metrics.plot_cumulative_gain(y_test, predicted_probas) 29 | print(plt.show()) 30 | 31 | 32 | # In[4]: 33 | 34 | def precision_recall_vs_threshold(y_test, predicted_probas): 35 | precisions, recalls, thresholds = precision_recall_curve(y_test, predicted_probas) 36 | """ 37 | Modified from: 38 | Hands-On Machine learning with Scikit-Learn 39 | and TensorFlow; p.89 40 | """ 41 | plt.figure(figsize=(8, 8)) 42 | plt.title("Precision and Recall Scores as a function of the decision threshold") 43 | plt.plot(thresholds, precisions[:-1], "b--", label="Precision") 44 | plt.plot(thresholds, recalls[:-1], "g-", label="Recall") 45 | plt.ylabel("Score") 46 | plt.xlabel("Decision Threshold") 47 | plt.legend(loc='best') 48 | 49 | print(plt.show()) 50 | 51 | 52 | # In[5]: 53 | 54 | def adjusted_classes(y_scores, t): 55 | """ 56 | This function adjusts class predictions based on the prediction threshold (t). 57 | Will only work for binary classification problems. 
58 | """ 59 | return [1 if y >= t else 0 for y in y_scores] 60 | 61 | def precision_recall_threshold(y_test, predicted_probas, t=0.5): 62 | """ 63 | plots the precision recall curve and shows the current value for each 64 | by identifying the classifier's threshold (t). 65 | """ 66 | 67 | # generate new class predictions based on the adjusted_classes 68 | # function above and view the resulting confusion matrix. 69 | p, r, thresholds = precision_recall_curve(y_test, predicted_probas) 70 | y_pred_adj = adjusted_classes(predicted_probas, t) 71 | #print(confusion_matrix(y_test, y_pred_adj)) 72 | 73 | # plot the curve 74 | plt.figure(figsize=(8,8)) 75 | plt.title("Precision and Recall curve ^ = current threshold") 76 | plt.step(r, p, color='b', alpha=0.2, 77 | where='post') 78 | plt.fill_between(r, p, step='post', alpha=0.2, 79 | color='b') 80 | plt.ylim([0, 1.01]); 81 | plt.xlim([0, 1.01]); 82 | plt.xlabel('Recall'); 83 | plt.ylabel('Precision'); 84 | 85 | # plot the current threshold on the line 86 | close_default_clf = np.argmin(np.abs(thresholds - t)) 87 | plt.plot(r[close_default_clf], p[close_default_clf], '^', c='k', 88 | markersize=15) 89 | plt.show() 90 | 91 | 92 | # In[6]: 93 | 94 | def confusion_matrix(y_test, y_pred): 95 | skplt.metrics.plot_confusion_matrix(y_test, y_pred, normalize=True) 96 | print(plt.show()) 97 | 98 | 99 | # In[7]: 100 | 101 | def precision_recall(y_test, predicted_probas): 102 | skplt.metrics.plot_precision_recall_curve(y_test, predicted_probas) 103 | print(plt.show()) 104 | 105 | -------------------------------------------------------------------------------- /examples/datasets/iris.csv: -------------------------------------------------------------------------------- 1 | sepal length,sepal width,petal length,petal width,iris 2 | 5.1,3.5,1.4,0.2,Iris-setosa 3 | 4.9,3,1.4,0.2,Iris-setosa 4 | 4.7,3.2,1.3,0.2,Iris-setosa 5 | 4.6,3.1,1.5,0.2,Iris-setosa 6 | 5,3.6,1.4,0.2,Iris-setosa 7 | 5.4,3.9,1.7,0.4,Iris-setosa 8 | 4.6,3.4,1.4,0.3,Iris-setosa 9 | 5,3.4,1.5,0.2,Iris-setosa 10 | 4.4,2.9,1.4,0.2,Iris-setosa 11 | 4.9,3.1,1.5,0.1,Iris-setosa 12 | 5.4,3.7,1.5,0.2,Iris-setosa 13 | 4.8,3.4,1.6,0.2,Iris-setosa 14 | 4.8,3,1.4,0.1,Iris-setosa 15 | 4.3,3,1.1,0.1,Iris-setosa 16 | 5.8,4,1.2,0.2,Iris-setosa 17 | 5.7,4.4,1.5,0.4,Iris-setosa 18 | 5.4,3.9,1.3,0.4,Iris-setosa 19 | 5.1,3.5,1.4,0.3,Iris-setosa 20 | 5.7,3.8,1.7,0.3,Iris-setosa 21 | 5.1,3.8,1.5,0.3,Iris-setosa 22 | 5.4,3.4,1.7,0.2,Iris-setosa 23 | 5.1,3.7,1.5,0.4,Iris-setosa 24 | 4.6,3.6,1,0.2,Iris-setosa 25 | 5.1,3.3,1.7,0.5,Iris-setosa 26 | 4.8,3.4,1.9,0.2,Iris-setosa 27 | 5,3,1.6,0.2,Iris-setosa 28 | 5,3.4,1.6,0.4,Iris-setosa 29 | 5.2,3.5,1.5,0.2,Iris-setosa 30 | 5.2,3.4,1.4,0.2,Iris-setosa 31 | 4.7,3.2,1.6,0.2,Iris-setosa 32 | 4.8,3.1,1.6,0.2,Iris-setosa 33 | 5.4,3.4,1.5,0.4,Iris-setosa 34 | 5.2,4.1,1.5,0.1,Iris-setosa 35 | 5.5,4.2,1.4,0.2,Iris-setosa 36 | 4.9,3.1,1.5,0.1,Iris-setosa 37 | 5,3.2,1.2,0.2,Iris-setosa 38 | 5.5,3.5,1.3,0.2,Iris-setosa 39 | 4.9,3.1,1.5,0.1,Iris-setosa 40 | 4.4,3,1.3,0.2,Iris-setosa 41 | 5.1,3.4,1.5,0.2,Iris-setosa 42 | 5,3.5,1.3,0.3,Iris-setosa 43 | 4.5,2.3,1.3,0.3,Iris-setosa 44 | 4.4,3.2,1.3,0.2,Iris-setosa 45 | 5,3.5,1.6,0.6,Iris-setosa 46 | 5.1,3.8,1.9,0.4,Iris-setosa 47 | 4.8,3,1.4,0.3,Iris-setosa 48 | 5.1,3.8,1.6,0.2,Iris-setosa 49 | 4.6,3.2,1.4,0.2,Iris-setosa 50 | 5.3,3.7,1.5,0.2,Iris-setosa 51 | 5,3.3,1.4,0.2,Iris-setosa 52 | 7,3.2,4.7,1.4,Iris-versicolor 53 | 6.4,3.2,4.5,1.5,Iris-versicolor 54 | 6.9,3.1,4.9,1.5,Iris-versicolor 55 | 5.5,2.3,4,1.3,Iris-versicolor 56 | 
6.5,2.8,4.6,1.5,Iris-versicolor 57 | 5.7,2.8,4.5,1.3,Iris-versicolor 58 | 6.3,3.3,4.7,1.6,Iris-versicolor 59 | 4.9,2.4,3.3,1,Iris-versicolor 60 | 6.6,2.9,4.6,1.3,Iris-versicolor 61 | 5.2,2.7,3.9,1.4,Iris-versicolor 62 | 5,2,3.5,1,Iris-versicolor 63 | 5.9,3,4.2,1.5,Iris-versicolor 64 | 6,2.2,4,1,Iris-versicolor 65 | 6.1,2.9,4.7,1.4,Iris-versicolor 66 | 5.6,2.9,3.6,1.3,Iris-versicolor 67 | 6.7,3.1,4.4,1.4,Iris-versicolor 68 | 5.6,3,4.5,1.5,Iris-versicolor 69 | 5.8,2.7,4.1,1,Iris-versicolor 70 | 6.2,2.2,4.5,1.5,Iris-versicolor 71 | 5.6,2.5,3.9,1.1,Iris-versicolor 72 | 5.9,3.2,4.8,1.8,Iris-versicolor 73 | 6.1,2.8,4,1.3,Iris-versicolor 74 | 6.3,2.5,4.9,1.5,Iris-versicolor 75 | 6.1,2.8,4.7,1.2,Iris-versicolor 76 | 6.4,2.9,4.3,1.3,Iris-versicolor 77 | 6.6,3,4.4,1.4,Iris-versicolor 78 | 6.8,2.8,4.8,1.4,Iris-versicolor 79 | 6.7,3,5,1.7,Iris-versicolor 80 | 6,2.9,4.5,1.5,Iris-versicolor 81 | 5.7,2.6,3.5,1,Iris-versicolor 82 | 5.5,2.4,3.8,1.1,Iris-versicolor 83 | 5.5,2.4,3.7,1,Iris-versicolor 84 | 5.8,2.7,3.9,1.2,Iris-versicolor 85 | 6,2.7,5.1,1.6,Iris-versicolor 86 | 5.4,3,4.5,1.5,Iris-versicolor 87 | 6,3.4,4.5,1.6,Iris-versicolor 88 | 6.7,3.1,4.7,1.5,Iris-versicolor 89 | 6.3,2.3,4.4,1.3,Iris-versicolor 90 | 5.6,3,4.1,1.3,Iris-versicolor 91 | 5.5,2.5,4,1.3,Iris-versicolor 92 | 5.5,2.6,4.4,1.2,Iris-versicolor 93 | 6.1,3,4.6,1.4,Iris-versicolor 94 | 5.8,2.6,4,1.2,Iris-versicolor 95 | 5,2.3,3.3,1,Iris-versicolor 96 | 5.6,2.7,4.2,1.3,Iris-versicolor 97 | 5.7,3,4.2,1.2,Iris-versicolor 98 | 5.7,2.9,4.2,1.3,Iris-versicolor 99 | 6.2,2.9,4.3,1.3,Iris-versicolor 100 | 5.1,2.5,3,1.1,Iris-versicolor 101 | 5.7,2.8,4.1,1.3,Iris-versicolor 102 | 6.3,3.3,6,2.5,Iris-virginica 103 | 5.8,2.7,5.1,1.9,Iris-virginica 104 | 7.1,3,5.9,2.1,Iris-virginica 105 | 6.3,2.9,5.6,1.8,Iris-virginica 106 | 6.5,3,5.8,2.2,Iris-virginica 107 | 7.6,3,6.6,2.1,Iris-virginica 108 | 4.9,2.5,4.5,1.7,Iris-virginica 109 | 7.3,2.9,6.3,1.8,Iris-virginica 110 | 6.7,2.5,5.8,1.8,Iris-virginica 111 | 7.2,3.6,6.1,2.5,Iris-virginica 112 | 6.5,3.2,5.1,2,Iris-virginica 113 | 6.4,2.7,5.3,1.9,Iris-virginica 114 | 6.8,3,5.5,2.1,Iris-virginica 115 | 5.7,2.5,5,2,Iris-virginica 116 | 5.8,2.8,5.1,2.4,Iris-virginica 117 | 6.4,3.2,5.3,2.3,Iris-virginica 118 | 6.5,3,5.5,1.8,Iris-virginica 119 | 7.7,3.8,6.7,2.2,Iris-virginica 120 | 7.7,2.6,6.9,2.3,Iris-virginica 121 | 6,2.2,5,1.5,Iris-virginica 122 | 6.9,3.2,5.7,2.3,Iris-virginica 123 | 5.6,2.8,4.9,2,Iris-virginica 124 | 7.7,2.8,6.7,2,Iris-virginica 125 | 6.3,2.7,4.9,1.8,Iris-virginica 126 | 6.7,3.3,5.7,2.1,Iris-virginica 127 | 7.2,3.2,6,1.8,Iris-virginica 128 | 6.2,2.8,4.8,1.8,Iris-virginica 129 | 6.1,3,4.9,1.8,Iris-virginica 130 | 6.4,2.8,5.6,2.1,Iris-virginica 131 | 7.2,3,5.8,1.6,Iris-virginica 132 | 7.4,2.8,6.1,1.9,Iris-virginica 133 | 7.9,3.8,6.4,2,Iris-virginica 134 | 6.4,2.8,5.6,2.2,Iris-virginica 135 | 6.3,2.8,5.1,1.5,Iris-virginica 136 | 6.1,2.6,5.6,1.4,Iris-virginica 137 | 7.7,3,6.1,2.3,Iris-virginica 138 | 6.3,3.4,5.6,2.4,Iris-virginica 139 | 6.4,3.1,5.5,1.8,Iris-virginica 140 | 6,3,4.8,1.8,Iris-virginica 141 | 6.9,3.1,5.4,2.1,Iris-virginica 142 | 6.7,3.1,5.6,2.4,Iris-virginica 143 | 6.9,3.1,5.1,2.3,Iris-virginica 144 | 5.8,2.7,5.1,1.9,Iris-virginica 145 | 6.8,3.2,5.9,2.3,Iris-virginica 146 | 6.7,3.3,5.7,2.5,Iris-virginica 147 | 6.7,3,5.2,2.3,Iris-virginica 148 | 6.3,2.5,5,1.9,Iris-virginica 149 | 6.5,3,5.2,2,Iris-virginica 150 | 6.2,3.4,5.4,2.3,Iris-virginica 151 | 5.9,3,5.1,1.8,Iris-virginica 152 | -------------------------------------------------------------------------------- 
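The bundled `examples/datasets/iris.csv` can be used to exercise the explore functions. The sketch below is illustrative only (the real BET constructor lives in `artml/bet.py`, which this dump does not show); it computes the kind of per-feature running sums a Basic Element Table is built from:

```python
import pandas as pd

# Load the bundled example dataset.
df = pd.read_csv('examples/datasets/iris.csv')
num = df.drop(columns=['iris'])  # keep the four numeric measurements

# Per-feature running elements of the kind a Basic Element Table carries:
bet_like = pd.DataFrame({
    'count': num.count(),
    'sum_x': num.sum(),
    'sum_x2': (num ** 2).sum(),
})

# Same population mean/variance formulas used in artml/explore/univariate.py:
bet_like['mean'] = bet_like['sum_x'] / bet_like['count']
bet_like['variance'] = (bet_like['sum_x2'] - bet_like['sum_x'] ** 2 / bet_like['count']) / bet_like['count']
print(bet_like)
```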
/artml/feature_selection/mahalanobis_features.py:
--------------------------------------------------------------------------------

# Importing all the required libraries
import os
import math
from numpy import *
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

'''
mahalanobis_selection.forward_selection is a feature selection function that selects the features
contributing most to classifier performance. It is a real-time forward selection technique that
begins with no variables in the LDA model. For each candidate variable, the forward method
calculates a Δ² (Mahalanobis distance) statistic that reflects the variable's contribution to
the model if it is included.

Parameters
----------

BET_file: Input BET table. (Make sure that the index of the BET is the same as its column names.)

target: Target variable of the classification.

alpha: Hyperparameter of the feature selection technique; it dictates how many features are
returned. Default value is 1.01.
'''

class mahalanobis_selection():

    def find_best_feature(self, BET_best, BET_file, master_keys, target, benchmark, alpha):

        best_feature = []
        for col in BET_file.columns:
            columns = []
            BET_target = BET_file[[target]]
            BET_col = BET_file[[col]]
            columns = list(BET_best.columns)
            columns.append(col)
            columns.append(target)
            # Selecting the BET for the particular columns & target
            result = pd.concat([BET_best, BET_col, BET_target], axis=1)
            selected_rows = columns
            result = result.loc[selected_rows]
            result.index = list(result.columns)

            try:
                Delta = self.mahalanobis(result, target)
            except:
                Delta = 0
            # Accept the candidate only if it improves Δ² over the current
            # benchmark by at least a factor of alpha.
            if Delta / benchmark > alpha:
                best_feature = col
                benchmark = Delta

        return best_feature

    def mahalanobis(self, result, target):

        (mean1, mean2, Beta) = self.LDA_fit_transform(result, target)

        z = np.array(mean1) - np.array(mean2)
        Delta = np.matmul(Beta.T, z)
        return Delta

    def LDA_fit_transform(self, BET, target):

        l = (len(BET.columns))
        count_1 = (BET.loc[(target), target][0]) - (BET.loc[(target), target][1])
        count_2 = BET.loc[(target), target][1]

        mean1 = []
        mean2 = []
        c = []

        # Per-class means of every feature, derived from the BET sums.
        for i in range(len(BET.columns)):
            if BET.columns[i] != target:
                mean1.append((BET.loc[BET.columns[i], (target)][1] - BET.loc[BET.columns[i], (target)][10]) / (BET.loc[BET.columns[i], (target)][0] - BET.loc[BET.columns[i], (target)][6]))
                mean2.append((BET.loc[BET.columns[i], (target)][10]) / BET.loc[BET.columns[i], (target)][6])

        # Pooled within-class covariance matrix, again built from the BET sums.
        for i in range(len(BET.columns)):
            if BET.columns[i] != target:
                for j in range(len(BET.columns)):
                    if BET.columns[j] != target:
                        cal1 = (((BET.loc[BET.columns[i], (target)][1] - BET.loc[BET.columns[i], (target)][10]) * (BET.loc[BET.columns[j], (target)][1] - BET.loc[BET.columns[j], (target)][10])) / count_1)
                        cal2 = (BET.loc[BET.columns[i], (target)][10] * BET.loc[BET.columns[j], (target)][10]) / count_2
                        c.append((BET.loc[BET.columns[i], (BET.columns[j])][10] - cal1 - cal2) / (count_1 + count_2 - 2))
        c = np.array(c)
        n = (len(BET.columns) - 1)
        c = np.reshape(c, (n, n))

        try:
            inverse = np.linalg.inv(c)
        except:
            print('Handling zero-determinant exception with dummies!')
            dummies_ = np.random.random((l - 1, l - 1)) / 10000000
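            # c can be singular (zero determinant), e.g. when two selected
            # features are collinear; a tiny random jitter makes it invertible
            # so the discriminant direction can still be estimated.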
100 | inverse = np.linalg.inv(c + dummies_) 101 | 102 | z = np.array(mean1)-np.array(mean2) 103 | Beta = np.matmul(inverse, z.T) 104 | return (mean1,mean2,Beta) 105 | 106 | 107 | 108 | def forward_selection(self, BET_file, target, alpha=1.01): 109 | BET_best = pd.DataFrame() 110 | best_features = [] 111 | already_selected = [] 112 | benchmark = 0.0001 113 | master_keys = BET_file.columns 114 | for i in range(len(BET_file.columns)): 115 | best_feature = self.find_best_feature(BET_best,BET_file,master_keys,target,benchmark,alpha) 116 | if best_feature != []: 117 | best_features.append(best_feature) 118 | if best_feature == []: 119 | break 120 | BET_best = pd.concat([BET_best, BET_file[[best_feature]]], axis=1) 121 | BET_for_new_benchmark = pd.concat([BET_best, BET_file[[target]]], axis=1) 122 | 123 | 124 | selected_rows = list(BET_for_new_benchmark.columns) 125 | BET_for_new_benchmark= BET_for_new_benchmark.loc[selected_rows] 126 | BET_for_new_benchmark.index = list(BET_for_new_benchmark.columns) 127 | 128 | benchmark = self.mahalanobis(BET_for_new_benchmark,target) 129 | already_selected = [best_feature] 130 | 131 | BET_file = BET_file.drop(already_selected, axis=1) 132 | return best_features 133 | -------------------------------------------------------------------------------- /python/artml/feature_selection/mahalanobis_features.py: -------------------------------------------------------------------------------- 1 | 2 | # Importing all the required libraries 3 | import os 4 | import math 5 | from numpy import * 6 | import numpy as np 7 | import pandas as pd 8 | import warnings 9 | warnings.filterwarnings('ignore') 10 | 11 | ''' 12 | 13 | mahalanobis_selection.forward_selection is a feature selection function to select the best features that contributes to the 14 | classifier performance. This is a real time forward selection technique which begins with no variables in the LDA model. 15 | For each variable, the forward method calculates Δ2 (mahalanobis distance) statistics that reflect the variable's 16 | contribution to the model if it is included. 17 | 18 | Parameters 19 | ---------- 20 | 21 | BET_file: Input BET table. (Make sure that the index of BET is same as column names) 22 | 23 | Target: Target variable of the classification 24 | 25 | alpha: It is the hyperparameter for the feature selection technique. This dictates the output number of features. 
Default 26 | value is 1.01 27 | 28 | 29 | ''' 30 | class mahalanobis_selection(): 31 | 32 | def find_best_feature(self, BET_best,BET_file,master_keys,target,benchmark,alpha): 33 | 34 | best_feature = [] 35 | for col in BET_file.columns: 36 | columns = [] 37 | BET_target = BET_file[[target]] 38 | BET_col = BET_file[[col]] 39 | columns = list(BET_best.columns) 40 | columns.append(col) 41 | columns.append(target) 42 | #Selecting the BET for particular columns & Target 43 | result = pd.concat([BET_best, BET_col, BET_target], axis=1) 44 | selected_rows = columns 45 | result = result.loc[selected_rows] 46 | result.index = list(result.columns) 47 | 48 | try: 49 | Delta = self.mahalanobis(result,target) 50 | except: 51 | Delta = 0 52 | if Delta/benchmark > alpha: 53 | best_feature = col 54 | benchmark = Delta 55 | 56 | return best_feature 57 | 58 | 59 | def mahalanobis(self, result, target): 60 | 61 | (mean1,mean2,Beta) = self.LDA_fit_transform(result, target) 62 | 63 | z = np.array(mean1)-np.array(mean2) 64 | Delta = np.matmul(Beta.T, z) 65 | return Delta 66 | 67 | 68 | def LDA_fit_transform(self, BET, target): 69 | 70 | l =(len(BET.columns)) 71 | count_1 = (BET.loc[(target), target][0]) - (BET.loc[(target), target][1]) 72 | count_2 = BET.loc[(target), target][1] 73 | 74 | mean1 = [] 75 | mean2 = [] 76 | c = [] 77 | 78 | for i in range(len(BET.columns)): 79 | if BET.columns[i] != target: 80 | 81 | mean1.append((BET.loc[BET.columns[i], (target)][1] - BET.loc[BET.columns[i], (target)][10])/(BET.loc[BET.columns[i], (target)][0]-BET.loc[BET.columns[i], (target)][6])) 82 | mean2.append((BET.loc[BET.columns[i], (target)][10])/BET.loc[BET.columns[i], (target)][6]) 83 | 84 | for i in range(len(BET.columns)): 85 | if BET.columns[i] != target: 86 | for j in range(len(BET.columns)): 87 | if BET.columns[j] != target: 88 | cal1 = (((BET.loc[BET.columns[i], (target)][1] - BET.loc[BET.columns[i], (target)][10])*(BET.loc[BET.columns[j], (target)][1]- BET.loc[BET.columns[j], (target)][10]))/count_1) 89 | cal2 = (BET.loc[BET.columns[i], (target)][10]*BET.loc[BET.columns[j], (target)][10])/count_2 90 | c.append((BET.loc[BET.columns[i],(BET.columns[j])][10] -cal1 - cal2)/(count_1+count_2-2)) 91 | c = np.array(c) 92 | n = (len(BET.columns)-1) 93 | c = np.reshape(c,(n,n)) 94 | 95 | try: 96 | inverse = np.linalg.inv(c) 97 | except: 98 | print('Handling zero determinent Exception with dummies!') 99 | dummies_ = np.random.random((l-1,l-1))/10000000 100 | inverse = np.linalg.inv(c + dummies_) 101 | 102 | z = np.array(mean1)-np.array(mean2) 103 | Beta = np.matmul(inverse, z.T) 104 | return (mean1,mean2,Beta) 105 | 106 | 107 | 108 | def forward_selection(self, BET_file, target, alpha=1.01): 109 | BET_best = pd.DataFrame() 110 | best_features = [] 111 | already_selected = [] 112 | benchmark = 0.0001 113 | master_keys = BET_file.columns 114 | for i in range(len(BET_file.columns)): 115 | best_feature = self.find_best_feature(BET_best,BET_file,master_keys,target,benchmark,alpha) 116 | if best_feature != []: 117 | best_features.append(best_feature) 118 | if best_feature == []: 119 | break 120 | BET_best = pd.concat([BET_best, BET_file[[best_feature]]], axis=1) 121 | BET_for_new_benchmark = pd.concat([BET_best, BET_file[[target]]], axis=1) 122 | 123 | 124 | selected_rows = list(BET_for_new_benchmark.columns) 125 | BET_for_new_benchmark= BET_for_new_benchmark.loc[selected_rows] 126 | BET_for_new_benchmark.index = list(BET_for_new_benchmark.columns) 127 | 128 | benchmark = self.mahalanobis(BET_for_new_benchmark,target) 129 | 
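            # The Δ² of the accepted feature set becomes the new benchmark that
            # the next candidate must beat by a factor of alpha.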
            already_selected = [best_feature]

            BET_file = BET_file.drop(already_selected, axis=1)
        return best_features

--------------------------------------------------------------------------------
/artml/explore/stats.py:
--------------------------------------------------------------------------------

# coding: utf-8

import math
import numpy as np
import pandas as pd
from scipy import stats
# scipy.stats.chisqprob has been removed from recent SciPy releases;
# recreate it from the chi-squared survival function so chi2() below runs.
chisqprob = lambda chi2, df: stats.chi2.sf(chi2, df)
import warnings
warnings.filterwarnings('ignore')

def univariate(BET):

    """
    Univariate analysis explores variables (attributes) one by one by summarizing each attribute
    using statistical techniques. This summarizes the central tendency, dispersion and shape of
    a dataset's distribution, excluding NaN values.

    Univariate stats calculated are: ['count', 'Mean', 'Variance', 'Standard_deviation', 'coeff_of_variation', 'skewness', 'Kurtosis']

    Examples
    --------
    univariate(Basic_Element_Table)

    The above call generates univariate statistics for all the features in the Basic_Element_Table.

    The function returns the univariate stats as a Pandas Dataframe.
    """

    l = (len(BET))
    BET.reset_index(drop=True, inplace=True)
    x = BET.to_dict(orient='list')  # convert BET to dictionary
    keys = list(x.keys())
    describe = {}

    for i in range(l):
        describe[i] = []
        m = keys[i]

        # BET diagonal cell elements: [0] count, [1] sum x, [2] sum x^2,
        # [3] sum x^3, [4] sum x^4 -- enough for moments up to kurtosis.
        try:
            count = x[m][i][0]
            describe[i].append(count)
        except:
            describe[i].append('NaN')
        try:
            Mean = (x[m][i][1]) / count
            describe[i].append(Mean)
        except:
            describe[i].append('NaN')
        try:
            Variance = ((x[m][i][2]) - (((x[m][i][1]) ** 2) / count)) / count
            describe[i].append(Variance)
        except:
            describe[i].append('NaN')
        try:
            Standard_deviation = math.sqrt(Variance)
            describe[i].append(Standard_deviation)
        except:
            describe[i].append('NaN')
        try:
            coeff_of_variation = (Standard_deviation / Mean) * 100
            describe[i].append(coeff_of_variation)
        except:
            describe[i].append('NaN')
        try:
            skewness = (count / ((count - 1) * (count - 2))) * ((x[m][i][3]) - (3 * Mean * x[m][i][2]) + (3 * (Mean ** 2) * x[m][i][1]) - (count * (Mean ** 3))) / (Standard_deviation ** 3)
            describe[i].append(skewness)
        except:
            describe[i].append('NaN')
        try:
            Kurtosis = (((((count) * (count + 1)) / ((count - 1) * (count - 2) * (count - 3))) * ((1 / Standard_deviation ** 4) * ((x[m][i][4]) - (4 * Mean * (x[m][i][3])) + (6 * (Mean ** 2) * (x[m][i][2])) - (4 * (Mean ** 3) * (x[m][i][1])) + (count * (Mean ** 4))))) - ((3 * (count - 1) ** 2) / ((count - 2) * (count - 3))))
            describe[i].append(Kurtosis)
        except:
            describe[i].append('NaN')

    names = ['count', 'Mean', 'Variance', 'Standard_deviation', 'coeff_of_variation', 'skewness', 'Kurtosis']
    result = pd.DataFrame(describe, index=names)
    result.columns = keys
    return (result)

def covariance(BET):

    """
    This function computes the pairwise covariance of all features in BET. Covariance
    describes the linear relationship between two features.

    Examples
    --------
    covariance(Basic_Element_Table)

    The above call generates the pairwise covariance for all the features in the Basic_Element_Table.

    The function returns the covariance matrix as a Pandas Dataframe.
105 | 106 | """ 107 | 108 | l =(len(BET)) 109 | BET.reset_index(drop = True, inplace = True) 110 | x = BET.to_dict(orient='list') 111 | keys =list(x.keys()) 112 | covar = {} 113 | 114 | for i in range(len(BET)): 115 | covar[i] = [] 116 | for j in range(len(BET)): 117 | m = keys[i] 118 | try: 119 | cov = (x[m][j][10]-(((x[m][j][1])*(x[m][j][6]))/(x[m][j][0])))/(x[m][j][0]) 120 | covar[i].append(cov) 121 | except: 122 | covar[i].append('NaN') 123 | 124 | result = pd.DataFrame(covar, index=keys) 125 | result.columns = keys 126 | return(result) 127 | 128 | 129 | # In[17]: 130 | 131 | def correlation(BET): 132 | 133 | """ 134 | This function computes pairwise correlations of all features in BET. correlation measures 135 | how strong a relationship is between two variables. 136 | 137 | Examples 138 | -------- 139 | correlation(Basic_Element_Table) 140 | 141 | The above function generates pairwise correlations for all the features in the Basic_Element_Table. 142 | 143 | function returns correlations as Pandas Dataframe. 144 | 145 | """ 146 | 147 | l =(len(BET)) 148 | BET.reset_index(drop = True, inplace = True) 149 | x = BET.to_dict(orient='list') 150 | keys =list(x.keys()) 151 | corr = {} 152 | 153 | for i in range(len(BET)): 154 | corr[i] = [] 155 | for j in range(len(BET)): 156 | m = keys[i] 157 | count1 = x[m][j][0] 158 | count2 = x[m][j][5] 159 | try: 160 | var1 = ((x[m][j][2])-(((x[m][j][1])**2)/count1))/count1 161 | var2 = ((x[m][j][7])-(((x[m][j][6])**2)/count2))/count2 162 | corrl = ((x[m][j][10]-(((x[m][j][1])*(x[m][j][6]))/(x[m][j][0])))/(x[m][j][0]))/(math.sqrt(var1*var2)) 163 | corr[i].append(corrl) 164 | except: 165 | corr[i].append('NaN') 166 | 167 | result = pd.DataFrame(corr, index=keys) 168 | result.columns = keys 169 | return(result) 170 | 171 | 172 | # In[18]: 173 | 174 | def Ztest(BET, col1, col2): 175 | 176 | l =(len(BET)) 177 | BET.reset_index(drop = True, inplace = True) 178 | x = BET.to_dict(orient='list') 179 | keys =list(x.keys()) 180 | 181 | count = x[col2][keys.index(col1)][6] 182 | sumx = x[col2][keys.index(col1)][10] 183 | sumx2 = x[col2][keys.index(col1)][11] 184 | Mean = sumx/count 185 | Variance = (sumx2 - (((sumx)**2)/count))/count 186 | 187 | count_0 = x[col2][keys.index(col1)][0] - x[col2][keys.index(col1)][6] 188 | sumx_0 = x[col2][keys.index(col1)][1] - x[col2][keys.index(col1)][10] 189 | sumx2_0 = x[col1][keys.index(col1)][10] -x[col2][keys.index(col1)][11] 190 | Mean_0 = sumx_0/count_0 191 | Variance_0 = (sumx2_0 - (((sumx_0)**2)/count_0))/count_0 192 | 193 | zscore = (Mean_0 - Mean)/(np.sqrt((Variance_0/count_0)+(Variance/count))) 194 | prob = 1 - stats.norm.cdf(zscore) 195 | return 2*prob 196 | 197 | 198 | 199 | # In[19]: 200 | 201 | def Ttest(BET, col1, col2): 202 | 203 | l =(len(BET)) 204 | BET.reset_index(drop = True, inplace = True) 205 | x = BET.to_dict(orient='list') 206 | keys =list(x.keys()) 207 | 208 | count = x[col2][keys.index(col1)][6] 209 | sumx = x[col2][keys.index(col1)][10] 210 | sumx2 = x[col2][keys.index(col1)][11] 211 | Mean = sumx/count 212 | Variance = (sumx2 - (((sumx)**2)/count))/count 213 | 214 | count_0 = x[col2][keys.index(col1)][0] - x[col2][keys.index(col1)][6] 215 | sumx_0 = x[col2][keys.index(col1)][1] - x[col2][keys.index(col1)][10] 216 | sumx2_0 = x[col1][keys.index(col1)][10] -x[col2][keys.index(col1)][11] 217 | Mean_0 = sumx_0/count_0 218 | Variance_0 = (sumx2_0 - (((sumx_0)**2)/count_0))/count_0 219 | 220 | var = (((count_0-1)*Variance_0) + ((count-1)*Variance))/(count_0 + count - 2) 221 | 222 | tscore = (Mean_0 - 
Mean)/(np.sqrt(var*((1/count_0)+(1/count)))) 223 | 224 | df = (count + count_0 - 2) 225 | 226 | prob = (1-stats.t.cdf(tscore, df)) 227 | return 2*prob 228 | 229 | 230 | 231 | # In[14]: 232 | 233 | def chi2(BET, feature_1 , feature_2): 234 | 235 | l =(len(BET)) 236 | BET.reset_index(drop = True, inplace = True) 237 | x = BET.to_dict(orient='list') 238 | keys =list(x.keys()) 239 | obs_freq = {} 240 | exp_freq = {} 241 | sum_exp_freq_vertical = np.zeros(len(feature_2)) 242 | chi2 = 0 243 | 244 | for i in range(len(feature_1)): 245 | obs_freq[feature_1[i]] = [] 246 | 247 | for j in range(len(feature_2)): 248 | col1 = (feature_1[i]) 249 | col2 = (feature_2[j]) 250 | sumx = x[col1][keys.index(col2)][10] 251 | obs_freq[feature_1[i]].append(sumx) 252 | 253 | sum_exp_freq_vertical = sum_exp_freq_vertical + np.array(obs_freq[feature_1[i]]) 254 | total_in_contingency = sum(sum_exp_freq_vertical) 255 | 256 | for i in range(len(feature_1)): 257 | exp_freq[feature_1[i]] = [] 258 | sum_exp_freq_horizontal = sum(obs_freq[feature_1[i]]) 259 | for j in range(len(feature_2)): 260 | e = (sum_exp_freq_horizontal*sum_exp_freq_vertical[j])/total_in_contingency 261 | exp_freq[feature_1[i]].append(e) 262 | 263 | for i in range(len(feature_1)): 264 | for j in range(len(feature_2)): 265 | chi2 = chi2 + ((obs_freq[feature_1[i]][j] - exp_freq[feature_1[i]][j])**2)/exp_freq[feature_1[i]][j] 266 | 267 | 268 | df = (len(feature_1) - 1)*(len(feature_2)-1) 269 | 270 | print('chi2: ' + str(chi2)) 271 | print('df: ' + str(df)) 272 | print('chisqprob: ' + str(chisqprob(chi2, df))) 273 | return(chisqprob(chi2, df)) 274 | 275 | 276 | # In[ ]: 277 | 278 | 279 | 280 | -------------------------------------------------------------------------------- /python/artml/explore/stats.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[9]: 5 | 6 | import math 7 | import numpy as np 8 | import pandas as pd 9 | from scipy import stats 10 | # from scipy.stats import chisqprob 11 | import warnings 12 | warnings.filterwarnings('ignore') 13 | 14 | 15 | # In[15]: 16 | 17 | def univariate(BET): 18 | 19 | """ 20 | Univariate analysis explores variables (attributes) one by one by summarizing each attribute 21 | using statistical techniques. This summarizes the central tendency, dispersion and shape of 22 | a dataset’s distribution, excluding NaN values. 23 | 24 | univariate Stats calculated are: ['count','Mean','Variance','Standard_deviation','coeff_of_variation','skewness','Kurtosis'] 25 | 26 | Examples 27 | -------- 28 | univariate(Basic_Element_Table) 29 | 30 | The above function generates Univariate statistics for all the features in the Basic_Element_Table. 31 | 32 | function returns univariate stats as Pandas Dataframe. 
33 | 34 | """ 35 | 36 | l =(len(BET)) 37 | BET.reset_index(drop = True, inplace = True) 38 | x = BET.to_dict(orient='list') # convert BET to dictionary 39 | keys =list(x.keys()) 40 | describe = {} 41 | 42 | for i in range(l): 43 | describe[i] = [] 44 | m = keys[i] 45 | 46 | try: 47 | count = x[m][i][0] 48 | describe[i].append(count) 49 | except: 50 | describe[i].append('NaN') 51 | try: 52 | Mean = (x[m][i][1])/count 53 | describe[i].append(Mean) 54 | except: 55 | describe[i].append('NaN') 56 | 57 | try: 58 | Variance = ((x[m][i][2])-(((x[m][i][1])**2)/count))/count 59 | describe[i].append(Variance) 60 | except: 61 | describe[i].append('NaN') 62 | try: 63 | Standard_deviation = math.sqrt(Variance) 64 | describe[i].append(Standard_deviation) 65 | except: 66 | describe[i].append('NaN') 67 | try: 68 | coeff_of_variation = (Standard_deviation/Mean)*100 69 | describe[i].append(coeff_of_variation) 70 | except: 71 | describe[i].append('NaN') 72 | 73 | try: 74 | skewness = (count/((count-1)*(count-2)))*((x[m][i][3])-(3*Mean*x[m][i][2])+(3*(Mean**2)*x[m][i][1])-(count*(Mean**3)))/(Standard_deviation**3) 75 | describe[i].append(skewness) 76 | except: 77 | describe[i].append('NaN') 78 | try: 79 | Kurtosis = (((((count)*(count+1))/((count-1)*(count-2)*(count-3)))*((1/Standard_deviation**4)*((x[m][i][4])-(4*Mean*(x[m][i][3]))+(6*(Mean**2)*(x[m][i][2]))-(4*(Mean**3)*(x[m][i][1]))+(count*(Mean**4)))))-((3*(count-1)**2)/((count-2)*(count-3)))) 80 | describe[i].append(Kurtosis) 81 | except: 82 | describe[i].append('NaN') 83 | 84 | names =['count','Mean','Variance','Standard_deviation','coeff_of_variation','skewness','Kurtosis'] 85 | result = pd.DataFrame(describe, index=names) 86 | result.columns = keys 87 | return(result) 88 | 89 | 90 | # In[16]: 91 | 92 | def covariance(BET): 93 | 94 | """ 95 | This function computes pairwise covariance of all features in BET. Covariance describes 96 | the linear relationship between two features. 97 | 98 | Examples 99 | -------- 100 | Covariance(Basic_Element_Table) 101 | 102 | The above function generates pairwise Covariance for all the features in the Basic_Element_Table. 103 | 104 | function returns Covariance as Pandas Dataframe. 105 | 106 | """ 107 | 108 | l =(len(BET)) 109 | BET.reset_index(drop = True, inplace = True) 110 | x = BET.to_dict(orient='list') 111 | keys =list(x.keys()) 112 | covar = {} 113 | 114 | for i in range(len(BET)): 115 | covar[i] = [] 116 | for j in range(len(BET)): 117 | m = keys[i] 118 | try: 119 | cov = (x[m][j][10]-(((x[m][j][1])*(x[m][j][6]))/(x[m][j][0])))/(x[m][j][0]) 120 | covar[i].append(cov) 121 | except: 122 | covar[i].append('NaN') 123 | 124 | result = pd.DataFrame(covar, index=keys) 125 | result.columns = keys 126 | return(result) 127 | 128 | 129 | # In[17]: 130 | 131 | def correlation(BET): 132 | 133 | """ 134 | This function computes pairwise correlations of all features in BET. correlation measures 135 | how strong a relationship is between two variables. 136 | 137 | Examples 138 | -------- 139 | correlation(Basic_Element_Table) 140 | 141 | The above function generates pairwise correlations for all the features in the Basic_Element_Table. 142 | 143 | function returns correlations as Pandas Dataframe. 
144 | 145 | """ 146 | 147 | l =(len(BET)) 148 | BET.reset_index(drop = True, inplace = True) 149 | x = BET.to_dict(orient='list') 150 | keys =list(x.keys()) 151 | corr = {} 152 | 153 | for i in range(len(BET)): 154 | corr[i] = [] 155 | for j in range(len(BET)): 156 | m = keys[i] 157 | count1 = x[m][j][0] 158 | count2 = x[m][j][5] 159 | try: 160 | var1 = ((x[m][j][2])-(((x[m][j][1])**2)/count1))/count1 161 | var2 = ((x[m][j][7])-(((x[m][j][6])**2)/count2))/count2 162 | corrl = ((x[m][j][10]-(((x[m][j][1])*(x[m][j][6]))/(x[m][j][0])))/(x[m][j][0]))/(math.sqrt(var1*var2)) 163 | corr[i].append(corrl) 164 | except: 165 | corr[i].append('NaN') 166 | 167 | result = pd.DataFrame(corr, index=keys) 168 | result.columns = keys 169 | return(result) 170 | 171 | 172 | # In[18]: 173 | 174 | def Ztest(BET, col1, col2): 175 | 176 | l =(len(BET)) 177 | BET.reset_index(drop = True, inplace = True) 178 | x = BET.to_dict(orient='list') 179 | keys =list(x.keys()) 180 | 181 | count = x[col2][keys.index(col1)][6] 182 | sumx = x[col2][keys.index(col1)][10] 183 | sumx2 = x[col2][keys.index(col1)][11] 184 | Mean = sumx/count 185 | Variance = (sumx2 - (((sumx)**2)/count))/count 186 | 187 | count_0 = x[col2][keys.index(col1)][0] - x[col2][keys.index(col1)][6] 188 | sumx_0 = x[col2][keys.index(col1)][1] - x[col2][keys.index(col1)][10] 189 | sumx2_0 = x[col1][keys.index(col1)][10] -x[col2][keys.index(col1)][11] 190 | Mean_0 = sumx_0/count_0 191 | Variance_0 = (sumx2_0 - (((sumx_0)**2)/count_0))/count_0 192 | 193 | zscore = (Mean_0 - Mean)/(np.sqrt((Variance_0/count_0)+(Variance/count))) 194 | prob = 1 - stats.norm.cdf(zscore) 195 | return 2*prob 196 | 197 | 198 | 199 | # In[19]: 200 | 201 | def Ttest(BET, col1, col2): 202 | 203 | l =(len(BET)) 204 | BET.reset_index(drop = True, inplace = True) 205 | x = BET.to_dict(orient='list') 206 | keys =list(x.keys()) 207 | 208 | count = x[col2][keys.index(col1)][6] 209 | sumx = x[col2][keys.index(col1)][10] 210 | sumx2 = x[col2][keys.index(col1)][11] 211 | Mean = sumx/count 212 | Variance = (sumx2 - (((sumx)**2)/count))/count 213 | 214 | count_0 = x[col2][keys.index(col1)][0] - x[col2][keys.index(col1)][6] 215 | sumx_0 = x[col2][keys.index(col1)][1] - x[col2][keys.index(col1)][10] 216 | sumx2_0 = x[col1][keys.index(col1)][10] -x[col2][keys.index(col1)][11] 217 | Mean_0 = sumx_0/count_0 218 | Variance_0 = (sumx2_0 - (((sumx_0)**2)/count_0))/count_0 219 | 220 | var = (((count_0-1)*Variance_0) + ((count-1)*Variance))/(count_0 + count - 2) 221 | 222 | tscore = (Mean_0 - Mean)/(np.sqrt(var*((1/count_0)+(1/count)))) 223 | 224 | df = (count + count_0 - 2) 225 | 226 | prob = (1-stats.t.cdf(tscore, df)) 227 | return 2*prob 228 | 229 | 230 | 231 | # In[14]: 232 | 233 | def chi2(BET, feature_1 , feature_2): 234 | 235 | l =(len(BET)) 236 | BET.reset_index(drop = True, inplace = True) 237 | x = BET.to_dict(orient='list') 238 | keys =list(x.keys()) 239 | obs_freq = {} 240 | exp_freq = {} 241 | sum_exp_freq_vertical = np.zeros(len(feature_2)) 242 | chi2 = 0 243 | 244 | for i in range(len(feature_1)): 245 | obs_freq[feature_1[i]] = [] 246 | 247 | for j in range(len(feature_2)): 248 | col1 = (feature_1[i]) 249 | col2 = (feature_2[j]) 250 | sumx = x[col1][keys.index(col2)][10] 251 | obs_freq[feature_1[i]].append(sumx) 252 | 253 | sum_exp_freq_vertical = sum_exp_freq_vertical + np.array(obs_freq[feature_1[i]]) 254 | total_in_contingency = sum(sum_exp_freq_vertical) 255 | 256 | for i in range(len(feature_1)): 257 | exp_freq[feature_1[i]] = [] 258 | sum_exp_freq_horizontal = 
sum(obs_freq[feature_1[i]]) 259 | for j in range(len(feature_2)): 260 | e = (sum_exp_freq_horizontal*sum_exp_freq_vertical[j])/total_in_contingency 261 | exp_freq[feature_1[i]].append(e) 262 | 263 | for i in range(len(feature_1)): 264 | for j in range(len(feature_2)): 265 | chi2 = chi2 + ((obs_freq[feature_1[i]][j] - exp_freq[feature_1[i]][j])**2)/exp_freq[feature_1[i]][j] 266 | 267 | 268 | df = (len(feature_1) - 1)*(len(feature_2)-1) 269 | 270 | print('chi2: ' + str(chi2)) 271 | print('df: ' + str(df)) 272 | print('chisqprob: ' + str(stats.chi2.sf(chi2, df))) 273 | return(stats.chi2.sf(chi2, df)) 274 | 275 | 276 | # In[ ]: 277 | 278 | 279 | 280 | -------------------------------------------------------------------------------- /artml/bet.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[23]: 5 | 6 | import math 7 | import numpy as np 8 | import pandas as pd 9 | from scipy import stats 10 | from tqdm import tqdm 11 | import warnings 12 | warnings.filterwarnings('ignore') 13 | 14 | 15 | # In[24]: 16 | 17 | def create_bet(df): 18 | 19 | """ BET function constructs the Basic Element Table for the Dataframe. BET is the key step for ARTML and 20 | it can be updated with the new data. 21 | 22 | BET function returns basic element table as Pandas Dataframe 23 | 24 | Notes: 25 | ----- 26 | see 'Real Time Data Mining' by Prof. Sayad 27 | 28 | (https://www.researchgate.net/publication/265619432_Real_Time_Data_Mining) 29 | 30 | """ 31 | col = df.columns.tolist() 32 | df_matrix = df.values 33 | l = len(col) 34 | 35 | idx = np.array([5,6,7,8,9,0,1,2,3,4,10,11]) 36 | bet={} 37 | x = np.array([[np.zeros(12) for x in range(l)] for y in range(l)]) 38 | for i in tqdm(range(l)): 39 | bet[i] = [] 40 | 41 | for j in range(i,l): 42 | y= np.array(df_matrix[:,j]) 43 | z= np.array(df_matrix[:,i]) 44 | 45 | """ 46 | This code makes calculations for all the basic elements in the table. They are appended to 47 | a lists of a dictionary.
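For reference, the 12 basic elements stored in each cell of the table are, in order:

    [count_x, sum_x, sum_x2, sum_x3, sum_x4,
     count_y, sum_y, sum_y2, sum_y3, sum_y4,
     sum_xy, sum_xy2]

where x is the row feature and y is the column feature; the idx permutation defined above swaps the x-block and the y-block so the mirrored cell (j, i) can be filled without recomputation.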
48 | """ 49 | 50 | x[i,j] = np.array([len(z), z.sum(), (z**2).sum(), (z**3).sum(), (z**4).sum(), 51 | len(y), y.sum(), (y**2).sum(), (y**3).sum(), (y**4).sum(), (z*y).sum(), ((z*y)**2).sum()]) 52 | 53 | x[j,i] = x[i,j][idx] 54 | 55 | for j in range(l): 56 | bet[i].append(x[j,i]) 57 | 58 | result = pd.DataFrame(bet, index=col) 59 | result.columns = col 60 | return(result) 61 | 62 | 63 | 64 | 65 | # In[25]: 66 | 67 | def calculate_basic_elements1(x,key,e,c,const): 68 | 69 | """ This is an inner function used in learn_by_index & grow_by_index functions for making 70 | calculations to update the BET 71 | 72 | This takes (BET_dictionary, feature_name, feature_index, values_list, i, +1/-1 (const)) as arguments 73 | for making the calculations 74 | """ 75 | 76 | array = np.array(x[key][e]) 77 | 78 | array = array + const*(np.array([1,c, c**2,c**3,c**4,1,c, c**2,c**3,c**4,c**2,c**4])) 79 | 80 | x[key][e] = array 81 | 82 | return x[key][e] 83 | 84 | 85 | # In[26]: 86 | 87 | def calculate_basic_elements2(x,key,k,b,c,i,m,const): 88 | 89 | 90 | """ This is an inner function used in learn_by_index & grow_by_index functions for making 91 | calculations to update the BET 92 | 93 | This takes (BET_dictionary, feature_name, feature_index,feature_names_list, values_list, i, m, +1/-1 (const)) as arguments 94 | for making the calculations 95 | """ 96 | 97 | array = np.array(x[key][k]) 98 | 99 | array = array + const*(np.array([1,c[b.index(m)], (c[b.index(m)])**2,(c[b.index(m)])**3,(c[b.index(m)])**4,1,c[i], c[i]**2,c[i]**3,c[i]**4, c[i]*(c[b.index(m)]),(c[i]*(c[b.index(m)])**2)])) 100 | 101 | x[key][k] = array 102 | 103 | return x[key][k] 104 | 105 | # In[27]: 106 | 107 | def learnbyindex(BET, *args): 108 | 109 | """ This function takes Basic Element Table and feature_names & values as arguments to update the 110 | given list of feature column & rows in the BET by corresponding values. 111 | 112 | Examples 113 | -------- 114 | learnbyindex(Basic_Element_Table, 'feature_1','feature_2', 1, 2 ) 115 | 116 | The above function updates feature_1, feature_2 in the BET by values 1 and 2 respectively. 117 | 118 | """ 119 | 120 | BET.reset_index(drop = True, inplace = True) # convert BET to dictionary 121 | x = BET.to_dict(orient='list') 122 | keys = list(x.keys()) 123 | arguments_list = [item for item in args] 124 | n_features = int(len(arguments_list)/2) # no of features given as input for updating BET 125 | 126 | if (len(arguments_list))%2 != 0: 127 | print("Error: Give correct set of Feature_names & corresponding parameters") 128 | 129 | else: 130 | feature_names = arguments_list[0:n_features] 131 | values= arguments_list[n_features::] 132 | 133 | for i in range(len(feature_names)): 134 | key = feature_names[i] 135 | e = keys.index(key) 136 | calculate_basic_elements1(x,key,e,values[i],1) # function for updating elements BET 137 | 138 | for m in feature_names: 139 | if m != feature_names[i]: 140 | k = keys.index(m) 141 | calculate_basic_elements2(x,key,k,feature_names,values,i,m,1) # function for updating elements BET 142 | 143 | df = pd.DataFrame(x) 144 | df.index = keys 145 | df = df[keys] 146 | return df 147 | 148 | 149 | # In[28]: 150 | 151 | def forgetbyindex(BET, *args): 152 | 153 | """ This function takes Basic Element Table and feature name & values as arguments to update the 154 | given list of features in the BET by corresponding values (deleting effect of those values from BET). 
155 | 156 | Examples 157 | -------- 158 | forgetbyindex(Basic_Element_Table, 'feature_1','feature_2', 1, 2 ) 159 | 160 | The above function reduces feature_1, feature_2 in the BET by values 1 and 2 respectively. 161 | 162 | """ 163 | 164 | BET.reset_index(drop = True, inplace = True) 165 | x = BET.to_dict(orient='list') # convert BET to dictionary 166 | keys = list(x.keys()) 167 | arguments_list = [item for item in args] 168 | n_features = int(len(arguments_list)/2) 169 | 170 | if (len(arguments_list))%2 != 0: # no of features given as input for updating BET 171 | print("Give correct set of Index & parameters for function") 172 | else: 173 | feature_names = arguments_list[0 : n_features] 174 | values= arguments_list[n_features: :] 175 | for i in range(n_features): 176 | key = feature_names[i] 177 | e = keys.index(key) 178 | calculate_basic_elements1(x,key,e,values[i],-1) # function for updating elements BET 179 | 180 | for m in feature_names: 181 | if m != feature_names[i]: 182 | k = keys.index(m) 183 | calculate_basic_elements2(x,key,k,feature_names,values,i,m,-1) 184 | 185 | df = pd.DataFrame(x) 186 | df = df[keys] 187 | df.index = keys 188 | return df 189 | 190 | 191 | 192 | # In[29]: 193 | 194 | 195 | def growbyindex(BET, *args): 196 | 197 | """ This function takes Basic Element Table and feature name & values as arguments to update the 198 | BET with new features and corresponding values. 199 | 200 | Examples 201 | -------- 202 | growbyindex(Basic_Element_Table, 'new_feature_1','new_feature_2', 1, 2 ) 203 | 204 | The above function adds new_feature_1, new_feature_2 in the BET with values 1 and 2 respectively. 205 | 206 | """ 207 | 208 | main_list = list(BET.columns) 209 | arguments_list = [item for item in args] # convert BET to dictionary 210 | n_features = int(len(arguments_list)/2) 211 | if (len(arguments_list))%2 != 0: 212 | print("Give correct set of Index & parameters for function") 213 | else: 214 | feature_names = arguments_list[0:n_features] 215 | values = arguments_list[n_features::] 216 | 217 | for i in range(n_features): 218 | 219 | elements = [[0]*12]*len(BET) #Creating null basic elements lists 220 | BET[feature_names[i]] = elements 221 | 222 | new_list = [] 223 | for j in range(len(BET.columns)): 224 | new_list.append(list(np.array([0]*12))) 225 | 226 | new_row = pd.DataFrame([new_list],columns= list(BET.columns),index = [feature_names[i]]) 227 | BET = pd.concat([BET,new_row]) 228 | 229 | BET.reset_index(drop = True, inplace = True) 230 | x = BET.to_dict(orient='list') 231 | keys = list(x.keys()) 232 | 233 | for i in range(n_features): 234 | key = feature_names[i] 235 | if key in main_list: 236 | print('feature already exists! Use Learn function') 237 | else: 238 | e = keys.index(key) 239 | calculate_basic_elements1(x,key,e,values[i],1) 240 | 241 | df = pd.DataFrame(BET) 242 | df.index = keys 243 | df = df[keys] 244 | return df 245 | 246 | 247 | # In[30]: 248 | 249 | def learn(BET, df): 250 | 251 | """ This function takes Basic Element Table and dataframe as inputs to update the 252 | BET with new data in the dataframe. (Incremental Learning of BET with new dataframe as input) 253 | 254 | Examples 255 | -------- 256 | learn(Basic_Element_Table, data_frame) 257 | 258 | The above function updates Basic_Element_Table with values in the new dataframe.
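A minimal sketch of the incremental update (illustrative; new_rows is assumed to have exactly the same columns as the data the BET was built from):

    bet = create_bet(df)
    bet = learn(bet, new_rows)   # bet now summarizes df plus new_rows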
259 | 260 | """ 261 | 262 | col = list(df.columns) 263 | for index, row in df.iterrows(): 264 | row1 = [] 265 | for e in col: 266 | row1.append(row[e]) 267 | arguments = col + row1 268 | BET = learnbyindex(BET, *arguments) 269 | return BET 270 | 271 | 272 | # In[31]: 273 | 274 | def forget(BET, df): 275 | 276 | """ This function takes Basic Element Table and dataframe as inputs to change and remove the 277 | effect of that data in the BET. (Decremental Learning of BET with dataframe as input) 278 | 279 | Examples 280 | -------- 281 | forget(Basic_Element_Table, data_frame) 282 | 283 | The above function updates Basic_Element_Table with values in the new dataframe. 284 | 285 | """ 286 | 287 | col = list(df.columns) 288 | for index, row in df.iterrows(): 289 | row1 = [] 290 | for e in col: 291 | row1.append(row[e]) 292 | arguments = col + row1 293 | BET = forgetbyindex(BET, *arguments) 294 | return BET 295 | 296 | 297 | -------------------------------------------------------------------------------- /python/artml/bet.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[23]: 5 | 6 | import math 7 | import numpy as np 8 | import pandas as pd 9 | from scipy import stats 10 | from tqdm import tqdm 11 | import warnings 12 | warnings.filterwarnings('ignore') 13 | 14 | 15 | # In[24]: 16 | 17 | def create_bet(df): 18 | 19 | """ BET function constructs the Basic Element Table for the Dataframe. BET is the key step for ARTML and 20 | it can be updated with the new data. 21 | 22 | BET function returns basic element table as Pandas Dataframe 23 | 24 | Notes: 25 | ----- 26 | see 'Real Time Data Mining' by Prof. Sayad 27 | 28 | (https://www.researchgate.net/publication/265619432_Real_Time_Data_Mining) 29 | 30 | """ 31 | col = df.columns.tolist() 32 | df_matrix = df.values 33 | l = len(col) 34 | 35 | idx = np.array([5,6,7,8,9,0,1,2,3,4,10,11]) 36 | bet={} 37 | x = np.array([[np.zeros(12) for x in range(l)] for y in range(l)]) 38 | for i in tqdm(range(l)): 39 | bet[i] = [] 40 | 41 | for j in range(i,l): 42 | y= np.array(df_matrix[:,j]) 43 | z= np.array(df_matrix[:,i]) 44 | 45 | """ 46 | This code makes calculations for all the basic elements in the table. They are appended to 47 | a lists of a dictionary. 
48 | """ 49 | 50 | x[i,j] = np.array([len(z), z.sum(), (z**2).sum(), (z**3).sum(), (z**4).sum(), 51 | len(y), y.sum(), (y**2).sum(), (y**3).sum(), (y**4).sum(), (z*y).sum(), ((z*y)**2).sum()]) 52 | 53 | x[j,i] = x[i,j][idx] 54 | 55 | for j in range(l): 56 | bet[i].append(x[j,i]) 57 | 58 | result = pd.DataFrame(bet, index=col) 59 | result.columns = col 60 | return(result) 61 | 62 | 63 | 64 | 65 | # In[25]: 66 | 67 | def calculate_basic_elements1(x,key,e,c,const): 68 | 69 | """ This is an inner function used in learn_by_index & grow_by_index functions for making 70 | calculations to update the BET 71 | 72 | This takes (BET_dictionary, feature_name, feature_index, values_list, i, +1/-1 (const)) as arguments 73 | for making the calculations 74 | """ 75 | 76 | array = np.array(x[key][e]) 77 | 78 | array = array + const*(np.array([1,c, c**2,c**3,c**4,1,c, c**2,c**3,c**4,c**2,c**4])) 79 | 80 | x[key][e] = array 81 | 82 | return x[key][e] 83 | 84 | 85 | # In[26]: 86 | 87 | def calculate_basic_elements2(x,key,k,b,c,i,m,const): 88 | 89 | 90 | """ This is an inner function used in learn_by_index & grow_by_index functions for making 91 | calculations to update the BET 92 | 93 | This takes (BET_dictionary, feature_name, feature_index,feature_names_list, values_list, i, m, +1/-1 (const)) as arguments 94 | for making the calculations 95 | """ 96 | 97 | array = np.array(x[key][k]) 98 | 99 | array = array + const*(np.array([1,c[b.index(m)], (c[b.index(m)])**2,(c[b.index(m)])**3,(c[b.index(m)])**4,1,c[i], c[i]**2,c[i]**3,c[i]**4, c[i]*(c[b.index(m)]),(c[i]*(c[b.index(m)])**2)])) 100 | 101 | x[key][k] = array 102 | 103 | return x[key][k] 104 | 105 | # In[27]: 106 | 107 | def learnbyindex(BET, *args): 108 | 109 | """ This function takes Basic Element Table and feature_names & values as arguments to update the 110 | given list of feature column & rows in the BET by corresponding values. 111 | 112 | Examples 113 | -------- 114 | learnbyindex(Basic_Element_Table, 'feature_1','feature_2', 1, 2 ) 115 | 116 | The above function updates feature_1, feature_2 in the BET by values 1 and 2 respectively. 117 | 118 | """ 119 | 120 | BET.reset_index(drop = True, inplace = True) # convert BET to dictionary 121 | x = BET.to_dict(orient='list') 122 | keys = list(x.keys()) 123 | arguments_list = [item for item in args] 124 | n_features = int(len(arguments_list)/2) # no of features given as input for updating BET 125 | 126 | if (len(arguments_list))%2 != 0: 127 | print("Error: Give correct set of Feature_names & corresponding parameters") 128 | 129 | else: 130 | feature_names = arguments_list[0:n_features] 131 | values= arguments_list[n_features::] 132 | 133 | for i in range(len(feature_names)): 134 | key = feature_names[i] 135 | e = keys.index(key) 136 | calculate_basic_elements1(x,key,e,values[i],1) # function for updating elements BET 137 | 138 | for m in feature_names: 139 | if m != feature_names[i]: 140 | k = keys.index(m) 141 | calculate_basic_elements2(x,key,k,feature_names,values,i,m,1) # function for updating elements BET 142 | 143 | df = pd.DataFrame(x) 144 | df.index = keys 145 | df = df[keys] 146 | return df 147 | 148 | 149 | # In[28]: 150 | 151 | def forgetbyindex(BET, *args): 152 | 153 | """ This function takes Basic Element Table and feature name & values as arguments to update the 154 | given list of features in the BET by corresponding values (deleting effect of those values from BET). 
155 | 156 | Examples 157 | -------- 158 | forgetbyindex(Basic_Element_Table, 'feature_1','feature_2', 1, 2 ) 159 | 160 | The above function reduces feature_1, feature_2 in the BET by values 1 and 2 respectively. 161 | 162 | """ 163 | 164 | BET.reset_index(drop = True, inplace = True) 165 | x = BET.to_dict(orient='list') # convert BET to dictionary 166 | keys = list(x.keys()) 167 | arguments_list = [item for item in args] 168 | n_features = int(len(arguments_list)/2) 169 | 170 | if (len(arguments_list))%2 != 0: # no of features given as input for updating BET 171 | print("Give correct set of Index & parameters for function") 172 | else: 173 | feature_names = arguments_list[0 : n_features] 174 | values= arguments_list[n_features: :] 175 | for i in range(n_features): 176 | key = feature_names[i] 177 | e = keys.index(key) 178 | calculate_basic_elements1(x,key,e,values,i,-1) # function for updating elements BET 179 | 180 | for m in feature_names: 181 | if m != feature_names[i]: 182 | k = keys.index(m) 183 | calculate_basic_elements2(x,key,k,feature_names,values,i,m,-1) 184 | 185 | df = pd.DataFrame(x) 186 | df = df[keys] 187 | df.index = keys 188 | return df 189 | 190 | 191 | 192 | # In[29]: 193 | 194 | 195 | def growbyindex(BET, *args): 196 | 197 | """ This function takes Basic Element Table and feature name & values as arguments to update the 198 | BET with new features and corresponding values. 199 | 200 | Examples 201 | -------- 202 | growbyindex(Basic_Element_Table, 'new_feature_1','new_feature_2', 1, 2 ) 203 | 204 | The above function adds new_feature_1, new_feature_2 in the BET with values 1 and 2 respectively. 205 | 206 | """ 207 | 208 | main_list = list(BET.columns) 209 | arguments_list = [item for item in args] # convert BET to dictionary 210 | n_features = int(len(arguments_list)/2) 211 | if (len(arguments_list))%2 != 0: 212 | print("Give correct set of Index & parameters for function") 213 | else: 214 | feature_names = arguments_list[0:n_features] 215 | values = arguments_list[n_features::] 216 | 217 | for i in range(n_features): 218 | 219 | elements = [[0]*12]*len(BET) #Creating null basic elements lists 220 | BET[feature_names[i]] = elements 221 | 222 | new_list = [] 223 | for j in range(len(BET.columns)): 224 | new_list.append(list(np.array([0]*12))) 225 | 226 | new_row = pd.DataFrame([new_list],columns= list(BET.columns),index = [feature_names[i]]) 227 | BET = pd.concat([BET,new_row]) 228 | 229 | BET.reset_index(drop = True, inplace = True) 230 | x = BET.to_dict(orient='list') 231 | keys = list(x.keys()) 232 | 233 | for i in range(n_features): 234 | key = feature_names[i] 235 | if key in main_list: 236 | print('feature already exsists! Use Learn function') 237 | else: 238 | e = keys.index(key) 239 | calculate_basic_elements1(x,key,e,c,i,1) 240 | 241 | df = pd.DataFrame(BET) 242 | df.index = keys 243 | df = df[keys] 244 | return df 245 | 246 | 247 | # In[30]: 248 | 249 | def learn(BET, df): 250 | 251 | """ This function takes Basic Element Table and dataframe as inputs to update the 252 | BET with new data in the dataframe. (Incremental Learning of BET with new dataframe as input) 253 | 254 | Examples 255 | -------- 256 | learn(Basic_Element_Table, data_frame) 257 | 258 | The above function updates Basic_Element_Table with values in the new dataframe. 
259 | 260 | """ 261 | 262 | col = list(df.columns) 263 | for index, row in df.iterrows(): 264 | row1 = [] 265 | for e in col: 266 | row1.append(row[e]) 267 | arguments = col + row1 268 | BET = learnbyindex(BET, *arguments) 269 | return BET 270 | 271 | 272 | # In[31]: 273 | 274 | def forget(BET, df): 275 | 276 | """ This function takes Basic Element Table and dataframe as inputs to change and remove the 277 | effect of that data in the BET. (Decremental Learning of BET with dataframe as input) 278 | 279 | Examples 280 | -------- 281 | forget(Basic_Element_Table, data_frame) 282 | 283 | The above function updates Basic_Element_Table with values in the new dataframe. 284 | 285 | """ 286 | 287 | col = list(df.columns) 288 | for index, row in df.iterrows(): 289 | row1 = [] 290 | for e in col: 291 | row1.append(row[e]) 292 | arguments = col + row1 293 | BET = forgetbyindex(BET, *arguments) 294 | return BET 295 | 296 | 297 | -------------------------------------------------------------------------------- /module.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | from numpy import * 4 | import numpy as np 5 | import pandas as pd 6 | from scipy import stats 7 | from scipy.stats import norm 8 | from scipy.stats import chisqprob 9 | import warnings 10 | import matplotlib.pyplot as plt 11 | warnings.filterwarnings('ignore') 12 | 13 | def BET(df): 14 | 15 | """ BET function constructs the Basic Element Table for the Dataframe. BET is the key step for ARTML and 16 | it can be updated with the new data. 17 | 18 | BET function returns basic element table as Pandas Dataframe 19 | 20 | Notes: 21 | ----- 22 | see 'Real Time Data Mining' by Prof. Sayad 23 | 24 | (https://www.researchgate.net/publication/265619432_Real_Time_Data_Mining) 25 | 26 | """ 27 | col = df.columns.tolist() 28 | l = len(col) 29 | x ={} # Creating empty dictionary 30 | for m in range(l): 31 | for n in range(l): 32 | x[m,n] = [] # Creating keys in dictionary with empty lists 33 | 34 | for i in range(l): 35 | for j in range(l): 36 | y=col[j] 37 | z=col[i] 38 | 39 | """ 40 | This code makes calculations for all the basic elements in the table. They are appended to 41 | a lists of a dictionary. 
42 | 43 | """ 44 | count_x = len(df[col[i]]) # count in particular X column 45 | x[i,j].append(count_x) 46 | 47 | sum_x = df[col[i]].sum() # Sum of elemensts in y 48 | x[i,j].append(sum_x) 49 | 50 | sum_x2 = (df[z]*df[z]).sum() # Sum of elemensts in x2 51 | x[i,j].append(sum_x2) 52 | 53 | sum_x3 = (df[col[i]]*df[col[i]]*df[col[i]]).sum() # Sum of elemensts in x3 54 | x[i,j].append(sum_x3) 55 | 56 | sum_x4 = (df[col[i]]*df[col[i]]*df[col[i]]*df[col[i]]).sum() # Sum of elemensts in x4 57 | x[i,j].append(sum_x4) 58 | 59 | count_y = len(df[col[j]]) # count in particular Y column 60 | x[i,j].append(count_y) 61 | 62 | sum_y = df[col[j]].sum() # Sum of elemensts in y 63 | x[i,j].append(sum_y) 64 | 65 | sum_y2 = (df[col[j]]*df[col[j]]).sum() # Sum of elemensts in y2 66 | x[i,j].append(sum_y2) 67 | 68 | sum_y3 = (df[col[j]]*df[col[j]]*df[col[j]]).sum() # Sum of elemensts in y3 69 | x[i,j].append(sum_y3) 70 | 71 | sum_y4 = (df[col[j]]*df[col[j]]*df[col[j]]*df[col[j]]).sum() # Sum of elemensts in y4 72 | x[i,j].append(sum_y4) 73 | 74 | sum_xy = (df[col[i]]*df[col[j]]).sum() # Sum of elemensts in xy 75 | x[i,j].append(sum_xy) 76 | 77 | sum_xy2 = (df[col[i]]*df[col[j]]*df[col[i]]*df[col[j]]).sum() # Sum of elemensts in (xy)2 78 | x[i,j].append(sum_xy2) 79 | 80 | z={} 81 | for m in range(l): # converting the dictionary to DataFrame 82 | z[m] = [] 83 | for i in range(l): 84 | for j in range(l): 85 | z[i].append(x[j,i]) 86 | result = pd.DataFrame(z, index=col) 87 | result.columns = col 88 | return(result) 89 | 90 | def calculate_basic_elements1(x,key,e,c,i,const): 91 | 92 | """ This is an inner function used in learn_by_index & grow_by_index functions for making 93 | calculations to update the BET 94 | 95 | This takes (BET_dictionary, feature_name, feature_index, values_list, i, +1/-1 (const)) as arguments 96 | for making the calculations 97 | """ 98 | 99 | x[key][e][0] = (x[key][e][0]+(const*1)) 100 | 101 | x[key][e][1] = (x[key][e][1]+(const*c[i])) 102 | 103 | x[key][e][2] = (x[key][e][2]+(const*(c[i]*c[i]))) 104 | 105 | x[key][e][3] = (x[key][e][3]+(const*(c[i]*c[i]*c[i]))) 106 | 107 | x[key][e][4] = (x[key][e][4]+(const*(c[i]*c[i]*c[i]*c[i]))) 108 | 109 | x[key][e][5] = (x[key][e][5]+(const*1)) 110 | 111 | x[key][e][6] = (x[key][e][6]+(const*c[i])) 112 | 113 | x[key][e][7] = (x[key][e][7]+(const*(c[i]*c[i]))) 114 | 115 | x[key][e][8] = (x[key][e][8]+(const*(c[i]*c[i]*c[i]))) 116 | 117 | x[key][e][9] = (x[key][e][9]+(const*(c[i]*c[i]*c[i]*c[i]))) 118 | 119 | x[key][e][10] = (x[key][e][10]+(const*(c[i]*c[i]))) 120 | 121 | x[key][e][11] = (x[key][e][11]+(const*(c[i]*c[i]*c[i]*c[i]))) 122 | 123 | return x[key][e] 124 | 125 | 126 | # In[9]: 127 | 128 | def calculate_basic_elements2(x,key,k,b,c,i,m,const): 129 | 130 | """ This is an inner function used in learn_by_index & grow_by_index functions for making 131 | calculations to update the BET 132 | 133 | This takes (BET_dictionary, feature_name, feature_index,feature_names_list, values_list, i, m, +1/-1 (const)) as arguments 134 | for making the calculations 135 | """ 136 | 137 | x[key][k][0] = (x[key][k][0]+(const*1)) 138 | 139 | x[key][k][1] = (x[key][k][1]+(const*c[b.index(m)])) 140 | 141 | x[key][k][2] = (x[key][k][2]+(const*(c[b.index(m)]*c[b.index(m)]))) 142 | 143 | x[key][k][3] = (x[key][k][3]+(const*(c[b.index(m)]*c[b.index(m)]*c[b.index(m)]))) 144 | 145 | x[key][k][4] = (x[key][k][4]+(const*(c[b.index(m)]*c[b.index(m)]*c[b.index(m)]*c[b.index(m)]))) 146 | 147 | x[key][k][5] = (x[key][k][5]+(const*1)) 148 | 149 | x[key][k][6] = 
(x[key][k][6]+(const*c[i])) 150 | 151 | x[key][k][7] = (x[key][k][7]+(const*(c[i]*c[i]))) 152 | 153 | x[key][k][8] = (x[key][k][8]+(const*(c[i]*c[i]*c[i]))) 154 | 155 | x[key][k][9] = (x[key][k][9]+(const*(c[i]*c[i]*c[i]*c[i]))) 156 | 157 | x[key][k][10] = (x[key][k][10]+(const*(c[i]*c[b.index(m)]))) 158 | 159 | x[key][k][11] = (x[key][k][11]+(const*(c[i]*c[b.index(m)]*c[i]*c[b.index(m)]))) 160 | 161 | return x[key][k] 162 | 163 | # In[21]: 164 | 165 | def learnbyindex(BET, *args): 166 | 167 | """ This function takes Basic Element Table and feature_names & values as arguments to update the 168 | given list of feature column & rows in the BET by corresponding values. 169 | 170 | Examples 171 | -------- 172 | learnbyindex(Basic_Element_Table, 'feature_1','feature_2', 1, 2 ) 173 | 174 | The above function updates feature_1, feature_2 in the BET by values 1 and 2 respectively. 175 | 176 | """ 177 | 178 | BET.reset_index(drop = True, inplace = True) # convert BET to dictionary 179 | x = BET.to_dict(orient='list') 180 | keys = list(x.keys()) 181 | arguments_list = [item for item in args] 182 | n_features = int(len(arguments_list)/2) # no of features given as input for updating BET 183 | 184 | if (len(arguments_list))%2 != 0: 185 | print("Error: Give correct set of Feature_names & corresponding parameters") 186 | 187 | else: 188 | feature_names = arguments_list[0:n_features] 189 | values= arguments_list[n_features::] 190 | 191 | for i in range(len(feature_names)): 192 | key = feature_names[i] 193 | e = keys.index(key) 194 | calculate_basic_elements1(x,key,e,values,i,1) # function for updating elements BET 195 | 196 | for m in feature_names: 197 | if m != feature_names[i]: 198 | k = keys.index(m) 199 | calculate_basic_elements2(x,key,k,feature_names,values,i,m,1) # function for updating elements BET 200 | 201 | df = pd.DataFrame(x) 202 | df.index = keys 203 | df = df[keys] 204 | return df 205 | 206 | 207 | 208 | # In[22]: 209 | 210 | def forgetbyindex(BET, *args): 211 | 212 | """ This function takes Basic Element Table and feature name & values as arguments to update the 213 | given list of features in the BET by corresponding values (deleting effect of those values from BET). 214 | 215 | Examples 216 | -------- 217 | forgetbyindex(Basic_Element_Table, 'feature_1','feature_2', 1, 2 ) 218 | 219 | The above function reduces feature_1, feature_2 in the BET by values 1 and 2 respectively.
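Because forgetting applies the same element updates with the opposite sign, learning a record and then forgetting it should leave the BET unchanged, which gives a cheap sanity check (illustrative feature names and values):

    bet2 = learnbyindex(bet, 'x', 'y', 1.0, 2.0)
    bet3 = forgetbyindex(bet2, 'x', 'y', 1.0, 2.0)
    # bet3 should equal bet up to floating point error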
220 | 221 | """ 222 | 223 | BET.reset_index(drop = True, inplace = True) 224 | x = BET.to_dict(orient='list') # convert BET to dictionary 225 | keys = list(x.keys()) 226 | arguments_list = [item for item in args] 227 | n_features = int(len(arguments_list)/2) 228 | 229 | if (len(arguments_list))%2 != 0: # no of features given as input for updating BET 230 | print("Give correct set of Index & parameters for function") 231 | else: 232 | feature_names = arguments_list[0 : n_features] 233 | values= arguments_list[n_features: :] 234 | for i in range(n_features): 235 | key = feature_names[i] 236 | e = keys.index(key) 237 | basic_elements1(x,key,e,values,i,-1) # function for updating elements BET 238 | 239 | for m in feature_names: 240 | if m != feature_names[i]: 241 | k = keys.index(m) 242 | basic_elements2(x,key,k,feature_names,values,i,m,-1) 243 | 244 | df = pd.DataFrame(x) 245 | df = df[keys] 246 | df.index = keys 247 | return df 248 | 249 | 250 | # In[12]: 251 | 252 | def growbyindex(BET, *args): 253 | 254 | """ This function takes Basic Element Table and feature name & values as arguments to update the 255 | BET with new features and corresponding values. 256 | 257 | Examples 258 | -------- 259 | growbyindex(Basic_Element_Table, 'new_feature_1','new_feature_2', 1, 2 ) 260 | 261 | The above function adds new_feature_1, new_feature_2 in the BET with values 1 and 2 respectively. 262 | 263 | """ 264 | 265 | main_list = list(BET.columns) 266 | arguments_list = [item for item in args] # convert BET to dictionary 267 | n_features = int(len(arguments_list)/2) 268 | if (len(arguments_list))%2 != 0: 269 | print("Give correct set of Index & parameters for function") 270 | else: 271 | feature_names = arguments_list[0:n_features] 272 | values = arguments_list[n_features::] 273 | 274 | for i in range(n_features): 275 | 276 | elements = [[0]*12]*len(BET) #Creating null basic elements lists 277 | BET[feature_names[i]] = elements 278 | 279 | new_list = [] 280 | for j in range(len(BET.columns)): 281 | new_list.append(list(np.array([0]*12))) 282 | 283 | new_row = pd.DataFrame([new_list],columns= list(BET.columns),index = [feature_names[i]]) 284 | BET = pd.concat([BET,new_row]) 285 | 286 | BET.reset_index(drop = True, inplace = True) 287 | x = BET.to_dict(orient='list') 288 | keys = list(x.keys()) 289 | 290 | for i in range(n_features): 291 | key = feature_names[i] 292 | if key in main_list: 293 | print('feature already exsists! Use Learn function') 294 | else: 295 | e = keys.index(key) 296 | calculate_basic_elements1(x,key,e,c,i,1) 297 | 298 | df = pd.DataFrame(BET) 299 | df.index = keys 300 | df = df[keys] 301 | return df 302 | 303 | # In[14]: 304 | 305 | def learn(BET, df): 306 | 307 | """ This function takes Basic Element Table and dataframe as inputs to update the 308 | BET with new data in the dataframe. (Incremental Learning of BET with new dataframe as input) 309 | 310 | Examples 311 | -------- 312 | learn(Basic_Element_Table, data_frame) 313 | 314 | The above function updates Basic_Element_Table with values in the new dataframe. 315 | 316 | """ 317 | 318 | col = list(df.columns) 319 | for index, row in df.iterrows(): 320 | row1 = [] 321 | for e in col: 322 | row1.append(row[e]) 323 | arguments = col + row1 324 | BET = learnbyindex(BET, *arguments) 325 | return BET 326 | 327 | # In[16]: 328 | 329 | def forget(BET, df): 330 | 331 | """ This function takes Basic Element Table and dataframe as inputs to change and remove the 332 | effect of that data in the BET. 
(Decremental Learning of BET with dataframe as input) 333 | 334 | Examples 335 | -------- 336 | forget(Basic_Element_Table, data_frame) 337 | 338 | The above function updates Basic_Element_Table with values in the new dataframe. 339 | 340 | """ 341 | 342 | col = list(df.columns) 343 | for index, row in df.iterrows(): 344 | row1 = [] 345 | for e in col: 346 | row1.append(row[e]) 347 | arguments = col + row1 348 | BET = forgetbyindex(BET, *arguments) 349 | return BET 350 | 351 | 352 | # In[18]: 353 | 354 | def univariate(BET): 355 | 356 | """ 357 | Univariate analysis explores variables (attributes) one by one by summarizing each attribute 358 | using statistical techniques. This summarizes the central tendency, dispersion and shape of 359 | a dataset’s distribution, excluding NaN values. 360 | 361 | univariate Stats calculated are: ['count','Mean','Variance','Standard_deviation','coeff_of_variation','skewness','Kurtosis'] 362 | 363 | Examples 364 | -------- 365 | univariate(Basic_Element_Table) 366 | 367 | The above function generates Univariate statistics for all the features in the Basic_Element_Table. 368 | 369 | function returns univariate stats as Pandas Dataframe. 370 | 371 | """ 372 | 373 | l =(len(BET)) 374 | BET.reset_index(drop = True, inplace = True) 375 | x = BET.to_dict(orient='list') # convert BET to dictionary 376 | keys =list(x.keys()) 377 | describe = {} 378 | 379 | for i in range(l): 380 | describe[i] = [] 381 | m = keys[i] 382 | 383 | try: 384 | count = x[m][i][0] 385 | describe[i].append(count) 386 | except: 387 | describe[i].append('NaN') 388 | try: 389 | Mean = (x[m][i][1])/count 390 | describe[i].append(Mean) 391 | except: 392 | describe[i].append('NaN') 393 | 394 | try: 395 | Variance = ((x[m][i][2])-(((x[m][i][1])**2)/count))/count 396 | describe[i].append(Variance) 397 | except: 398 | describe[i].append('NaN') 399 | try: 400 | Standard_deviation = math.sqrt(Variance) 401 | describe[i].append(Standard_deviation) 402 | except: 403 | describe[i].append('NaN') 404 | try: 405 | coeff_of_variation = (Standard_deviation/Mean)*100 406 | describe[i].append(coeff_of_variation) 407 | except: 408 | describe[i].append('NaN') 409 | 410 | try: 411 | skewness = (count/((count-1)*(count-2)))*((x[m][i][3])-(3*Mean*x[m][i][2])+(3*(Mean**2)*x[m][i][1])-(count*(Mean**3)))/(Standard_deviation**3) 412 | describe[i].append(skewness) 413 | except: 414 | describe[i].append('NaN') 415 | try: 416 | Kurtosis = (((((count)*(count+1))/((count-1)*(count-2)*(count-3)))*((1/Standard_deviation**4)*((x[m][i][4])-(4*Mean*(x[m][i][3]))+(6*(Mean**2)*(x[m][i][2]))-(4*(Mean**3)*(x[m][i][1]))+(count*(Mean**4)))))-((3*(count-1)**2)/((count-2)*(count-3)))) 417 | describe[i].append(Kurtosis) 418 | except: 419 | describe[i].append('NaN') 420 | 421 | names =['count','Mean','Variance','Standard_deviation','coeff_of_variation','skewness','Kurtosis'] 422 | result = pd.DataFrame(describe, index=names) 423 | result.columns = keys 424 | return(result) 425 | 426 | # In[19]: 427 | 428 | def Covariance(BET): 429 | 430 | """ 431 | This function computes pairwise covariance of all features in BET. Covariance describes 432 | the linear relationship between two features. 433 | 434 | Examples 435 | -------- 436 | Covariance(Basic_Element_Table) 437 | 438 | The above function generates pairwise Covariance for all the features in the Basic_Element_Table. 439 | 440 | function returns Covariance as Pandas Dataframe. 
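Note that the division is by n, so this is the population covariance; a check against pandas therefore needs ddof=0 (illustrative, with bet built from df):

    Covariance(bet)   # should agree with df.cov(ddof=0) up to floating point error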
441 | 442 | """ 443 | 444 | l =(len(BET)) 445 | BET.reset_index(drop = True, inplace = True) 446 | x = BET.to_dict(orient='list') 447 | keys =list(x.keys()) 448 | covar = {} 449 | 450 | for i in range(len(BET)): 451 | covar[i] = [] 452 | for j in range(len(BET)): 453 | m = keys[i] 454 | try: 455 | cov = (x[m][j][10]-(((x[m][j][1])*(x[m][j][6]))/(x[m][j][0])))/(x[m][j][0]) 456 | covar[i].append(cov) 457 | except: 458 | covar[i].append('NaN') 459 | 460 | result = pd.DataFrame(covar, index=keys) 461 | result.columns = keys 462 | return(result) 463 | 464 | 465 | def correlation(BET): 466 | 467 | """ 468 | This function computes pairwise correlations of all features in BET. correlation measures 469 | how strong a relationship is between two variables. 470 | 471 | Examples 472 | -------- 473 | correlation(Basic_Element_Table) 474 | 475 | The above function generates pairwise correlations for all the features in the Basic_Element_Table. 476 | 477 | function returns correlations as Pandas Dataframe. 478 | 479 | """ 480 | 481 | l =(len(BET)) 482 | BET.reset_index(drop = True, inplace = True) 483 | x = BET.to_dict(orient='list') 484 | keys =list(x.keys()) 485 | corr = {} 486 | 487 | for i in range(len(BET)): 488 | corr[i] = [] 489 | for j in range(len(BET)): 490 | m = keys[i] 491 | count1 = x[m][j][0] 492 | count2 = x[m][j][5] 493 | try: 494 | var1 = ((x[m][j][2])-(((x[m][j][1])**2)/count1))/count1 495 | var2 = ((x[m][j][7])-(((x[m][j][6])**2)/count2))/count2 496 | corrl = ((x[m][j][10]-(((x[m][j][1])*(x[m][j][6]))/(x[m][j][0])))/(x[m][j][0]))/(math.sqrt(var1*var2)) 497 | corr[i].append(corrl) 498 | except: 499 | corr[i].append('NaN') 500 | 501 | result = pd.DataFrame(corr, index=keys) 502 | result.columns = keys 503 | return(result) 504 | 505 | def Ztest(BET, col1, col2): 506 | 507 | l =(len(BET)) 508 | BET.reset_index(drop = True, inplace = True) 509 | x = BET.to_dict(orient='list') 510 | keys =list(x.keys()) 511 | 512 | count = x[col2][keys.index(col1)][6] 513 | sumx = x[col2][keys.index(col1)][10] 514 | sumx2 = x[col2][keys.index(col1)][11] 515 | Mean = sumx/count 516 | Variance = (sumx2 - (((sumx)**2)/count))/count 517 | 518 | count_0 = x[col2][keys.index(col1)][0] - x[col2][keys.index(col1)][6] 519 | sumx_0 = x[col2][keys.index(col1)][1] - x[col2][keys.index(col1)][10] 520 | sumx2_0 = x[col1][keys.index(col1)][10] -x[col2][keys.index(col1)][11] 521 | Mean_0 = sumx_0/count_0 522 | Variance_0 = (sumx2_0 - (((sumx_0)**2)/count_0))/count_0 523 | 524 | zscore = (Mean_0 - Mean)/(np.sqrt((Variance_0/count_0)+(Variance/count))) 525 | prob = 1 - stats.norm.cdf(zscore) 526 | return 2*prob 527 | 528 | 529 | def Ttest(BET, col1, col2): 530 | 531 | l =(len(BET)) 532 | BET.reset_index(drop = True, inplace = True) 533 | x = BET.to_dict(orient='list') 534 | keys =list(x.keys()) 535 | 536 | count = x[col2][keys.index(col1)][6] 537 | sumx = x[col2][keys.index(col1)][10] 538 | sumx2 = x[col2][keys.index(col1)][11] 539 | Mean = sumx/count 540 | Variance = (sumx2 - (((sumx)**2)/count))/count 541 | 542 | count_0 = x[col2][keys.index(col1)][0] - x[col2][keys.index(col1)][6] 543 | sumx_0 = x[col2][keys.index(col1)][1] - x[col2][keys.index(col1)][10] 544 | sumx2_0 = x[col1][keys.index(col1)][10] -x[col2][keys.index(col1)][11] 545 | Mean_0 = sumx_0/count_0 546 | Variance_0 = (sumx2_0 - (((sumx_0)**2)/count_0))/count_0 547 | 548 | var = (((count_0-1)*Variance_0) + ((count-1)*Variance))/(count_0 + count - 2) 549 | 550 | tscore = (Mean_0 - Mean)/(np.sqrt(var*((1/count_0)+(1/count)))) 551 | 552 | df = (count + count_0 - 2) 
553 | 554 | prob = (1-stats.t.cdf(tscore, df)) 555 | return 2*prob 556 | 557 | 558 | 559 | def chi2(BET, feature_1 , feature_2): 560 | 561 | l =(len(BET)) 562 | BET.reset_index(drop = True, inplace = True) 563 | x = BET.to_dict(orient='list') 564 | keys =list(x.keys()) 565 | obs_freq = {} 566 | exp_freq = {} 567 | sum_exp_freq_vertical = np.zeros(len(feature_2)) 568 | chi2 = 0 569 | 570 | for i in range(len(feature_1)): 571 | obs_freq[feature_1[i]] = [] 572 | 573 | for j in range(len(feature_2)): 574 | col1 = (feature_1[i]) 575 | col2 = (feature_2[j]) 576 | sumx = x[col1][keys.index(col2)][10] 577 | obs_freq[feature_1[i]].append(sumx) 578 | 579 | sum_exp_freq_vertical = sum_exp_freq_vertical + np.array(obs_freq[feature_1[i]]) 580 | total_in_contingency = sum(sum_exp_freq_vertical) 581 | 582 | for i in range(len(feature_1)): 583 | exp_freq[feature_1[i]] = [] 584 | sum_exp_freq_horizontal = sum(obs_freq[feature_1[i]]) 585 | for j in range(len(feature_2)): 586 | e = (sum_exp_freq_horizontal*sum_exp_freq_vertical[j])/total_in_contingency 587 | exp_freq[feature_1[i]].append(e) 588 | 589 | for i in range(len(feature_1)): 590 | for j in range(len(feature_2)): 591 | chi2 = chi2 + ((obs_freq[feature_1[i]][j] - exp_freq[feature_1[i]][j])**2)/exp_freq[feature_1[i]][j] 592 | 593 | 594 | df = (len(feature_1) - 1)*(len(feature_2)-1) 595 | 596 | print('chi2: ' + str(chi2)) 597 | print('df: ' + str(df)) 598 | print('chisqprob: ' + str(chisqprob(chi2, df))) 599 | return(chisqprob(chi2, df)) 600 | 601 | 602 | #Models: 603 | 604 | def LDA_fit(BET, target): 605 | 606 | """ 607 | Linear Discriminant Analysis (LDA) is a classification method searching for a linear combination 608 | of variables (predictors) that best separates the classes (targets). 609 | 610 | It basically performs the supervised dimensionality reduction, by projecting the input data to a 611 | linear subspace consisting of the directions which maximize the separation between classes (Maximizing the difference 612 | between the means of groups and reducing Std. deviation within groups) 613 | 614 | Examples 615 | -------- 616 | LDA_fit(Basic_Element_Table, Target) 617 | 618 | where 'Basic_Element_Table' is found from BET function for the data and 'Target' is the feature that needs to be 619 | predicted. 620 | 621 | The function returns (mean1,mean2,Beta, prob) which are Mean vectors of the groups, Linear Model coefficients and 622 | class probability respectively. 
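A minimal sketch (illustrative; assumes the BET contains a binary 0/1 target column named 'label' and that X_test holds only the feature columns):

    mean1, mean2, Beta, prob = LDA_fit(bet, 'label')
    predictions = LDA_predict(bet, X_test, 'label')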
623 | 624 | """ 625 | l =(len(BET)) 626 | BET1 = BET 627 | BET1.reset_index(drop = True, inplace = True) 628 | x = BET1.to_dict(orient='list') 629 | keys =list(x.keys()) 630 | k = keys.index(target) 631 | count_1 = BET[target][k][0] - BET[target][k][1] 632 | count_2 = BET[target][k][1] 633 | mean1 = [] 634 | mean2 = [] 635 | c = [] 636 | for i in range(len(BET)): 637 | if i != keys.index(target): 638 | mean1.append((BET[target][i][1] - BET[target][i][10])/(BET[target][i][0]-BET[target][i][6])) 639 | mean2.append((BET[target][i][10])/BET[target][i][6]) 640 | 641 | for i in range(len(BET)): 642 | if i != keys.index(target): 643 | for j in range(len(BET)): 644 | if j != keys.index(target): 645 | m = keys[i] 646 | n = keys[j] 647 | cal1 = (((x[m][k][6] - x[m][k][10])*(x[n][k][6] - x[n][k][10]))/count_1) 648 | cal2 = (x[m][k][10]*x[n][k][10])/count_2 649 | c.append((x[m][j][10]-cal1 - cal2)/(count_1+count_2-2)) 650 | 651 | c = np.array(c) 652 | n = (len(BET)-1) 653 | c = reshape(c,(n,n)) 654 | inverse = np.linalg.inv(c) 655 | z = np.array(mean1)-np.array(mean2) 656 | Beta = np.matmul(inverse, z.T) 657 | prob = (-math.log(count_1/count_2)) 658 | 659 | return (mean1,mean2,Beta, prob) 660 | 661 | 662 | def LDA_predict(BET, X, target): 663 | """ 664 | To predict the target values for the given data using LDA paramters calculated from the training dataset. 665 | Returns the predictions using LDA model. 666 | 667 | Examples 668 | -------- 669 | LDA_predict(Basic_Element_Table, Testing_data, Target) 670 | 671 | BET table and testing data should be given as inputs 672 | """ 673 | (mean1,mean2,Beta, prob) = LDA_fit(BET, target) 674 | numpy_matrix = X.as_matrix() 675 | q=[] 676 | for i in range(len(numpy_matrix)): 677 | z = numpy_matrix[i] - (0.5*(np.array(mean1) - np.array(mean2))) 678 | if np.matmul(Beta.T, z) > prob: 679 | q.append(0) 680 | else: 681 | q.append(1) 682 | return q 683 | 684 | def accuracy(y, y_pred): 685 | y = list(y) 686 | y_pred =list(y_pred) 687 | matches = [] 688 | for i in range(len(y)): 689 | if y[i] == y_pred[i]: 690 | matches.append(1) 691 | return (sum(matches)/len(y))*100 692 | 693 | 694 | def PCA(BET): 695 | """ 696 | Principal component analysis (PCA) is a classical statistical method that uses an orthogonal transformation 697 | to convert a set of observations of possibly correlated variables into a set of values of linearly uncorrelated variables 698 | called principal components. 699 | 700 | Real time Principal components for datasets can be extracted from the ART-M covariance matrix equations. 701 | 702 | Examples 703 | -------- 704 | PCA(Basic_Element_Table) 705 | 706 | This function returns eigen values & eigen vectors for the features in the Basic element table. 
707 | """ 708 | 709 | cov = Covariance(BET) 710 | cov_mat = cov.values 711 | eig_vals, eig_vecs = np.linalg.eig(cov_mat) 712 | 713 | print('Eigenvectors: \n%s' %eig_vecs) 714 | print('\nEigenvalues: \n%s' %eig_vals) 715 | 716 | # Make a list of (eigenvalue, eigenvector) tuples 717 | eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:,i]) for i in range(len(eig_vals))] 718 | 719 | # Sort the (eigenvalue, eigenvector) tuples from high to low 720 | eig_pairs.sort(key=lambda x: x[0], reverse=True) 721 | 722 | # Visually confirm that the list is correctly sorted by decreasing eigenvalues 723 | print('\nEigenvalues in descending order:') 724 | for i in eig_pairs: 725 | print(i[0]) 726 | 727 | def MLR(BET,target): 728 | 729 | row_indexes = list(BET.index) 730 | target_index = row_indexes.index(target) 731 | BET_features = BET.drop(target, axis =1) 732 | BET_features = BET_features.drop(target, axis =0) 733 | cov_features = Covariance(BET_features).values 734 | cov_target = Covariance(BET).values 735 | cov_target = cov_target[target_index] 736 | cov_target = np.delete(cov_target, target_index) 737 | inverse = np.linalg.inv(cov_features) 738 | Beta_array = np.matmul(inverse, cov_target) 739 | 740 | l =(len(BET)) 741 | BET.reset_index(drop = True, inplace = True) 742 | x = BET.to_dict(orient='list') 743 | keys =list(x.keys()) 744 | 745 | mean_target = (BET[target][keys.index(target)][1])/BET[target][keys.index(target)][0] 746 | mean_X = [] 747 | 748 | for i in range(len(BET_features)): 749 | if i != keys.index(target): 750 | mean_X.append((BET[target][i][1])/BET[target][i][0]) 751 | 752 | b0 = mean_target - np.matmul(Beta_array, mean_X) 753 | 754 | print(b0) 755 | return Beta_array 756 | 757 | 758 | def gaussian_NB(BET, X ,target): 759 | 760 | l =(len(BET)) 761 | BET.reset_index(drop = True, inplace = True) 762 | x = BET.to_dict(orient='list') 763 | keys =list(x.keys()) 764 | 765 | probability = [] 766 | likelihood = 1 767 | att_prior_prob = 1 768 | class_prior_prob = 1 769 | for i in range(len(BET)): 770 | if keys[i] != target: 771 | count = x[target][i][6] 772 | sumxy = x[target][i][10] 773 | sumxy2 = x[target][i][11] 774 | Mean = sumxy/count 775 | Variance = (sumxy2 - (((sumxy)**2)/count))/count 776 | value = X[i] 777 | likelihood = likelihood*(1/math.sqrt(2*np.pi*Variance))*(np.e**(-(value-Mean)/(2*Variance))) 778 | 779 | class_prior_prob = (count/x[target][i][5]) 780 | 781 | count_att = x[target][i][0] 782 | sumxy_att = x[target][i][1] 783 | sumxy2_att = x[target][i][2] 784 | Mean_att = sumxy_att/count_att 785 | Variance_att = (sumxy2_att - (((sumxy_att)**2)/count_att))/count_att 786 | 787 | att_prior_prob = att_prior_prob*(1/math.sqrt(2*np.pi*Variance_att))*(np.e**(-(value-Mean_att)/(2*Variance_att))) 788 | 789 | post_prob = (class_prior_prob * likelihood)/att_prior_prob 790 | 791 | return post_prob 792 | 793 | 794 | def Multinomial_NB(BET, X ,target): 795 | 796 | l =(len(BET)) 797 | BET.reset_index(drop = True, inplace = True) 798 | x = BET.to_dict(orient='list') 799 | keys =list(x.keys()) 800 | 801 | probability = [] 802 | likelihood = 1 803 | att_prior_prob = 1 804 | class_prior_prob = 1 805 | for i in range(len(BET)): 806 | if keys[i] != target: 807 | sumx = x[target][i][6] 808 | sumxy = x[target][i][10] 809 | likelihood = likelihood*(sumxy/sumx) 810 | 811 | class_prior_prob = (x[target][i][6]/x[target][i][5]) 812 | 813 | count_att = x[target][i][0] 814 | sumxy_att = x[target][i][1] 815 | att_prior_prob = att_prior_prob*(sumxy_att/count_att) 816 | 817 | post_prob = (class_prior_prob * 
likelihood)/att_prior_prob 818 | 819 | return post_prob 820 | 821 | 822 | def SVM_fit(BET, target): 823 | l =(len(BET)) 824 | BET1 = BET 825 | BET1.reset_index(drop = True, inplace = True) 826 | x = BET1.to_dict(orient='list') 827 | keys =list(x.keys()) 828 | k = keys.index(target) 829 | EE = [] 830 | last_row =[] 831 | Ede = [] 832 | count = BET[target][k][0] 833 | for i in range(len(BET)): 834 | if i != keys.index(target): 835 | for j in range(len(BET)): 836 | if j != keys.index(target): 837 | m = keys[i] 838 | n = keys[j] 839 | EE.append(x[m][j][10]) 840 | if j == keys.index(target): 841 | Ede.append(2*(x[m][j][10]) -x[m][i][6]) 842 | EE.append(-x[m][i][6]) 843 | last_row.append(-x[m][i][6]) 844 | final = EE+last_row 845 | final.pop() 846 | final.append(count) 847 | final = np.array(final) 848 | n = (len(BET)) 849 | final = reshape(final,(n,n)) 850 | 851 | Ede.append((count-2*(BET[target][k][1]))) 852 | 853 | I = np.identity(n) 854 | const = (((I/count)+ final)) 855 | 856 | inverse = np.linalg.inv(const) 857 | Beta = np.dot(inverse, np.array(Ede)) 858 | 859 | return(Beta) 860 | 861 | 862 | def SVM_Reg_fit(BET, target,tuning_parameter): 863 | l =(len(BET)) 864 | BET1 = BET 865 | BET1.reset_index(drop = True, inplace = True) 866 | x = BET1.to_dict(orient='list') 867 | keys =list(x.keys()) 868 | k = keys.index(target) 869 | EE = [] 870 | last_row =[] 871 | Ede = [] 872 | count = BET[target][k][0] 873 | for i in range(len(BET)): 874 | if i != keys.index(target): 875 | for j in range(len(BET)): 876 | if j != keys.index(target): 877 | m = keys[i] 878 | n = keys[j] 879 | EE.append(x[m][j][10]) 880 | if j == keys.index(target): 881 | Ede.append(x[m][j][10]) 882 | EE.append(-x[m][i][6]) 883 | last_row.append(-x[m][i][6]) 884 | final = EE+last_row 885 | final.pop() 886 | final.append(count) 887 | final = np.array(final) 888 | n = (len(BET)) 889 | final = reshape(final,(n,n)) 890 | 891 | Ede.append(-(BET[target][k][1])) 892 | print(Ede) 893 | I = np.identity(n) 894 | const = (((I/tuning_parameter)+ final)) 895 | 896 | inverse = np.linalg.inv(const) 897 | Beta = np.dot(inverse, np.array(Ede)) 898 | 899 | return(Beta) 900 | -------------------------------------------------------------------------------- /util/module.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | from numpy import * 4 | import numpy as np 5 | import pandas as pd 6 | from scipy import stats 7 | from scipy.stats import norm 8 | from scipy.stats import chisqprob 9 | import warnings 10 | import matplotlib.pyplot as plt 11 | warnings.filterwarnings('ignore') 12 | 13 | def BET(df): 14 | 15 | """ BET function constructs the Basic Element Table for the Dataframe. BET is the key step for ARTML and 16 | it can be updated with the new data. 17 | 18 | BET function returns basic element table as Pandas Dataframe 19 | 20 | Notes: 21 | ----- 22 | see 'Real Time Data Mining' by Prof. Sayad 23 | 24 | (https://www.researchgate.net/publication/265619432_Real_Time_Data_Mining) 25 | 26 | """ 27 | col = df.columns.tolist() 28 | l = len(col) 29 | x ={} # Creating empty dictionary 30 | for m in range(l): 31 | for n in range(l): 32 | x[m,n] = [] # Creating keys in dictionary with empty lists 33 | 34 | for i in range(l): 35 | for j in range(l): 36 | y=col[j] 37 | z=col[i] 38 | 39 | """ 40 | This code makes calculations for all the basic elements in the table. They are appended to 41 | a lists of a dictionary. 
42 | 43 | """ 44 | count_x = len(df[col[i]]) # count in particular X column 45 | x[i,j].append(count_x) 46 | 47 | sum_x = df[col[i]].sum() # Sum of elemensts in y 48 | x[i,j].append(sum_x) 49 | 50 | sum_x2 = (df[z]*df[z]).sum() # Sum of elemensts in x2 51 | x[i,j].append(sum_x2) 52 | 53 | sum_x3 = (df[col[i]]*df[col[i]]*df[col[i]]).sum() # Sum of elemensts in x3 54 | x[i,j].append(sum_x3) 55 | 56 | sum_x4 = (df[col[i]]*df[col[i]]*df[col[i]]*df[col[i]]).sum() # Sum of elemensts in x4 57 | x[i,j].append(sum_x4) 58 | 59 | count_y = len(df[col[j]]) # count in particular Y column 60 | x[i,j].append(count_y) 61 | 62 | sum_y = df[col[j]].sum() # Sum of elemensts in y 63 | x[i,j].append(sum_y) 64 | 65 | sum_y2 = (df[col[j]]*df[col[j]]).sum() # Sum of elemensts in y2 66 | x[i,j].append(sum_y2) 67 | 68 | sum_y3 = (df[col[j]]*df[col[j]]*df[col[j]]).sum() # Sum of elemensts in y3 69 | x[i,j].append(sum_y3) 70 | 71 | sum_y4 = (df[col[j]]*df[col[j]]*df[col[j]]*df[col[j]]).sum() # Sum of elemensts in y4 72 | x[i,j].append(sum_y4) 73 | 74 | sum_xy = (df[col[i]]*df[col[j]]).sum() # Sum of elemensts in xy 75 | x[i,j].append(sum_xy) 76 | 77 | sum_xy2 = (df[col[i]]*df[col[j]]*df[col[i]]*df[col[j]]).sum() # Sum of elemensts in (xy)2 78 | x[i,j].append(sum_xy2) 79 | 80 | z={} 81 | for m in range(l): # converting the dictionary to DataFrame 82 | z[m] = [] 83 | for i in range(l): 84 | for j in range(l): 85 | z[i].append(x[j,i]) 86 | result = pd.DataFrame(z, index=col) 87 | result.columns = col 88 | return(result) 89 | 90 | def calculate_basic_elements1(x,key,e,c,i,const): 91 | 92 | """ This is an inner function used in learn_by_index & grow_by_index functions for making 93 | calculations to update the BET 94 | 95 | This takes (BET_dictionary, feature_name, feature_index, values_list, i, +1/-1 (const)) as arguments 96 | for making the calculations 97 | """ 98 | 99 | x[key][e][0] = (x[key][e][0]+(const*1)) 100 | 101 | x[key][e][1] = (x[key][e][1]+(const*c[i])) 102 | 103 | x[key][e][2] = (x[key][e][2]+(const*(c[i]*c[i]))) 104 | 105 | x[key][e][3] = (x[key][e][3]+(const*(c[i]*c[i]*c[i]))) 106 | 107 | x[key][e][4] = (x[key][e][4]+(const*(c[i]*c[i]*c[i]*c[i]))) 108 | 109 | x[key][e][5] = (x[key][e][5]+(const*1)) 110 | 111 | x[key][e][6] = (x[key][e][6]+(const*c[i])) 112 | 113 | x[key][e][7] = (x[key][e][7]+(const*(c[i]*c[i]))) 114 | 115 | x[key][e][8] = (x[key][e][8]+(const*(c[i]*c[i]*c[i]))) 116 | 117 | x[key][e][9] = (x[key][e][9]+(const*(c[i]*c[i]*c[i]*c[i]))) 118 | 119 | x[key][e][10] = (x[key][e][10]+(const*(c[i]*c[i]))) 120 | 121 | x[key][e][11] = (x[key][e][11]+(const*(c[i]*c[i]*c[i]*c[i]))) 122 | 123 | return x[key][e] 124 | 125 | 126 | # In[9]: 127 | 128 | def calculate_basic_elements2(x,key,k,b,c,i,m,const): 129 | 130 | """ This is an inner function used in learn_by_index & grow_by_index functions for making 131 | calculations to update the BET 132 | 133 | This takes (BET_dictionary, feature_name, feature_index,feature_names_list, values_list, i, m, +1/-1 (const)) as arguments 134 | for making the calculations 135 | """ 136 | 137 | x[key][k][0] = (x[key][k][0]+(const*1)) 138 | 139 | x[key][k][1] = (x[key][k][1]+(const*c[b.index(m)])) 140 | 141 | x[key][k][2] = (x[key][k][2]+(const*(c[b.index(m)]*c[b.index(m)]))) 142 | 143 | x[key][k][3] = (x[key][k][3]+(const*(c[b.index(m)]*c[b.index(m)]*c[b.index(m)]))) 144 | 145 | x[key][k][4] = (x[key][k][4]+(const*(c[b.index(m)]*c[b.index(m)]*c[b.index(m)]*c[b.index(m)]))) 146 | 147 | x[key][k][5] = (x[key][k][5]+(const*1)) 148 | 149 | x[key][k][6] = 

    x[key][k][7] = x[key][k][7] + (const * (c[i] * c[i]))

    x[key][k][8] = x[key][k][8] + (const * (c[i] * c[i] * c[i]))

    x[key][k][9] = x[key][k][9] + (const * (c[i] * c[i] * c[i] * c[i]))

    x[key][k][10] = x[key][k][10] + (const * (c[i] * c[b.index(m)]))

    x[key][k][11] = x[key][k][11] + (const * (c[i] * c[b.index(m)] * c[i] * c[b.index(m)]))

    return x[key][k]


def learnbyindex(BET, *args):

    """ This function takes the Basic Element Table and feature names & values as arguments and
    updates the corresponding feature columns & rows of the BET with those values.

    Examples
    --------
    learnbyindex(Basic_Element_Table, 'feature_1', 'feature_2', 1, 2)

    The above call updates feature_1 and feature_2 in the BET by the values 1 and 2 respectively.

    """

    BET.reset_index(drop=True, inplace=True)
    x = BET.to_dict(orient='list')           # convert BET to dictionary
    keys = list(x.keys())
    arguments_list = list(args)
    n_features = int(len(arguments_list) / 2)   # number of features given as input for updating the BET

    if len(arguments_list) % 2 != 0:
        print("Error: give a matching set of feature names & corresponding values")

    else:
        feature_names = arguments_list[0:n_features]
        values = arguments_list[n_features:]

        for i in range(len(feature_names)):
            key = feature_names[i]
            e = keys.index(key)
            calculate_basic_elements1(x, key, e, values, i, 1)     # update the diagonal elements of the BET

            for m in feature_names:
                if m != feature_names[i]:
                    k = keys.index(m)
                    calculate_basic_elements2(x, key, k, feature_names, values, i, m, 1)   # update the cross elements

    df = pd.DataFrame(x)
    df.index = keys
    df = df[keys]
    return df


def forgetbyindex(BET, *args):

    """ This function takes the Basic Element Table and feature names & values as arguments and
    updates the given features of the BET by removing the effect of those values.

    Examples
    --------
    forgetbyindex(Basic_Element_Table, 'feature_1', 'feature_2', 1, 2)

    The above call reduces feature_1 and feature_2 in the BET by the values 1 and 2 respectively.
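
    Internally this applies exactly the same element updates as learnbyindex, but with
    const = -1, so forgetting a record is the precise inverse of learning it.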

    """

    BET.reset_index(drop=True, inplace=True)
    x = BET.to_dict(orient='list')           # convert BET to dictionary
    keys = list(x.keys())
    arguments_list = list(args)
    n_features = int(len(arguments_list) / 2)   # number of features given as input for updating the BET

    if len(arguments_list) % 2 != 0:
        print("Error: give a matching set of feature names & corresponding values")
    else:
        feature_names = arguments_list[0:n_features]
        values = arguments_list[n_features:]
        for i in range(n_features):
            key = feature_names[i]
            e = keys.index(key)
            calculate_basic_elements1(x, key, e, values, i, -1)    # remove the values from the diagonal elements

            for m in feature_names:
                if m != feature_names[i]:
                    k = keys.index(m)
                    calculate_basic_elements2(x, key, k, feature_names, values, i, m, -1)   # ...and from the cross elements

    df = pd.DataFrame(x)
    df.index = keys
    df = df[keys]
    return df


def growbyindex(BET, *args):

    """ This function takes the Basic Element Table and feature names & values as arguments and
    grows the BET with new features and their corresponding values.

    Examples
    --------
    growbyindex(Basic_Element_Table, 'new_feature_1', 'new_feature_2', 1, 2)

    The above call adds new_feature_1 and new_feature_2 to the BET with the values 1 and 2
    respectively.

    """

    main_list = list(BET.columns)
    arguments_list = list(args)
    n_features = int(len(arguments_list) / 2)
    if len(arguments_list) % 2 != 0:
        print("Error: give a matching set of feature names & corresponding values")
        return BET

    feature_names = arguments_list[0:n_features]
    values = arguments_list[n_features:]

    for i in range(n_features):
        if feature_names[i] in main_list:
            print('feature already exists! Use the learn functions instead')
            continue

        # one fresh list per cell: [[0]*12]*len(BET) would alias a single list across all rows
        elements = [[0] * 12 for _ in range(len(BET))]
        BET[feature_names[i]] = elements

        new_list = [[0] * 12 for _ in range(len(BET.columns))]
        new_row = pd.DataFrame([new_list], columns=list(BET.columns), index=[feature_names[i]])
        BET = pd.concat([BET, new_row])

    BET.reset_index(drop=True, inplace=True)
    x = BET.to_dict(orient='list')
    keys = list(x.keys())

    for i in range(n_features):
        key = feature_names[i]
        if key not in main_list:
            e = keys.index(key)
            calculate_basic_elements1(x, key, e, values, i, 1)   # seed the new diagonal cell with the given value

    df = pd.DataFrame(BET)
    df.index = keys
    df = df[keys]
    return df


def learn(BET, df):

    """ This function takes the Basic Element Table and a dataframe as inputs and updates the
    BET with the new data in the dataframe (incremental learning of the BET from a dataframe).

    Examples
    --------
    learn(Basic_Element_Table, data_frame)

    The above call updates the Basic_Element_Table with the values in the new dataframe.

    """

    col = list(df.columns)
    for index, row in df.iterrows():
        row1 = []
        for e in col:
            row1.append(row[e])
        arguments = col + row1
        BET = learnbyindex(BET, *arguments)
    return BET


def forget(BET, df):

    """ This function takes the Basic Element Table and a dataframe as inputs and removes the
    effect of that data from the BET (decremental learning of the BET from a dataframe).
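
    Because every basic element is a plain running sum, learn followed by forget on the same
    rows restores the BET to its previous state exactly.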

    Examples
    --------
    forget(Basic_Element_Table, data_frame)

    The above call removes the effect of the values in the given dataframe from the
    Basic_Element_Table.

    """

    col = list(df.columns)
    for index, row in df.iterrows():
        row1 = []
        for e in col:
            row1.append(row[e])
        arguments = col + row1
        BET = forgetbyindex(BET, *arguments)
    return BET


def univariate(BET):

    """
    Univariate analysis explores the variables (attributes) one by one, summarizing each
    attribute with statistical techniques. This summarizes the central tendency, dispersion
    and shape of a dataset's distribution, excluding NaN values.

    The univariate stats calculated are: ['count', 'Mean', 'Variance', 'Standard_deviation',
    'coeff_of_variation', 'skewness', 'Kurtosis']

    Examples
    --------
    univariate(Basic_Element_Table)

    The above call generates the univariate statistics for all the features in the
    Basic_Element_Table and returns them as a Pandas Dataframe.

    """

    l = len(BET)
    BET.reset_index(drop=True, inplace=True)
    x = BET.to_dict(orient='list')           # convert BET to dictionary
    keys = list(x.keys())
    describe = {}

    for i in range(l):
        describe[i] = []
        m = keys[i]

        try:
            count = x[m][i][0]
            describe[i].append(count)
        except Exception:
            describe[i].append(np.nan)
        try:
            Mean = (x[m][i][1]) / count
            describe[i].append(Mean)
        except Exception:
            describe[i].append(np.nan)

        try:
            Variance = ((x[m][i][2]) - (((x[m][i][1]) ** 2) / count)) / count
            describe[i].append(Variance)
        except Exception:
            describe[i].append(np.nan)
        try:
            Standard_deviation = math.sqrt(Variance)
            describe[i].append(Standard_deviation)
        except Exception:
            describe[i].append(np.nan)
        try:
            coeff_of_variation = (Standard_deviation / Mean) * 100
            describe[i].append(coeff_of_variation)
        except Exception:
            describe[i].append(np.nan)

        try:
            skewness = (count / ((count - 1) * (count - 2))) * ((x[m][i][3]) - (3 * Mean * x[m][i][2]) + (3 * (Mean ** 2) * x[m][i][1]) - (count * (Mean ** 3))) / (Standard_deviation ** 3)
            describe[i].append(skewness)
        except Exception:
            describe[i].append(np.nan)
        try:
            Kurtosis = ((((count * (count + 1)) / ((count - 1) * (count - 2) * (count - 3))) * ((1 / Standard_deviation ** 4) * ((x[m][i][4]) - (4 * Mean * (x[m][i][3])) + (6 * (Mean ** 2) * (x[m][i][2])) - (4 * (Mean ** 3) * (x[m][i][1])) + (count * (Mean ** 4))))) - ((3 * (count - 1) ** 2) / ((count - 2) * (count - 3))))
            describe[i].append(Kurtosis)
        except Exception:
            describe[i].append(np.nan)

    names = ['count', 'Mean', 'Variance', 'Standard_deviation', 'coeff_of_variation', 'skewness', 'Kurtosis']
    result = pd.DataFrame(describe, index=names)
    result.columns = keys
    return result


def Covariance(BET):

    """
    This function computes the pairwise covariance of all the features in the BET. Covariance
    describes the direction of the linear relationship between two features.

    Examples
    --------
    Covariance(Basic_Element_Table)

    The above call generates the pairwise covariances for all the features in the
    Basic_Element_Table and returns them as a Pandas Dataframe.
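
    Each entry is computed directly from the basic elements as
    cov(x, y) = (sum(xy) - sum(x) * sum(y) / n) / n.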

    """

    BET.reset_index(drop=True, inplace=True)
    x = BET.to_dict(orient='list')
    keys = list(x.keys())
    covar = {}

    for i in range(len(BET)):
        covar[i] = []
        for j in range(len(BET)):
            m = keys[i]
            try:
                cov = (x[m][j][10] - (((x[m][j][1]) * (x[m][j][6])) / (x[m][j][0]))) / (x[m][j][0])
                covar[i].append(cov)
            except Exception:
                covar[i].append(np.nan)

    result = pd.DataFrame(covar, index=keys)
    result.columns = keys
    return result


def correlation(BET):

    """
    This function computes the pairwise correlations of all the features in the BET. Correlation
    measures how strong the relationship between two variables is.

    Examples
    --------
    correlation(Basic_Element_Table)

    The above call generates the pairwise correlations for all the features in the
    Basic_Element_Table and returns them as a Pandas Dataframe.

    """

    BET.reset_index(drop=True, inplace=True)
    x = BET.to_dict(orient='list')
    keys = list(x.keys())
    corr = {}

    for i in range(len(BET)):
        corr[i] = []
        for j in range(len(BET)):
            m = keys[i]
            try:
                count1 = x[m][j][0]
                count2 = x[m][j][5]
                var1 = ((x[m][j][2]) - (((x[m][j][1]) ** 2) / count1)) / count1
                var2 = ((x[m][j][7]) - (((x[m][j][6]) ** 2) / count2)) / count2
                corrl = ((x[m][j][10] - (((x[m][j][1]) * (x[m][j][6])) / (x[m][j][0]))) / (x[m][j][0])) / (math.sqrt(var1 * var2))
                corr[i].append(corrl)
            except Exception:
                corr[i].append(np.nan)

    result = pd.DataFrame(corr, index=keys)
    result.columns = keys
    return result


def Ztest(BET, col1, col2):

    """ Two-sample Z test on the mean of the numeric feature col1 across the two groups
    defined by the binary (0/1) feature col2. Returns the two-tailed p-value.
    """

    BET.reset_index(drop=True, inplace=True)
    x = BET.to_dict(orient='list')
    keys = list(x.keys())

    # class-1 group: sums of col1 restricted to the rows where col2 == 1
    count = x[col2][keys.index(col1)][6]       # sum(col2), i.e. the number of ones
    sumx = x[col2][keys.index(col1)][10]       # sum(col1 * col2)
    sumx2 = x[col2][keys.index(col1)][11]      # sum((col1 * col2)^2)
    Mean = sumx / count
    Variance = (sumx2 - (((sumx) ** 2) / count)) / count

    # class-0 group: the overall totals minus the class-1 portion
    count_0 = x[col2][keys.index(col1)][0] - x[col2][keys.index(col1)][6]
    sumx_0 = x[col2][keys.index(col1)][1] - x[col2][keys.index(col1)][10]
    sumx2_0 = x[col2][keys.index(col1)][2] - x[col2][keys.index(col1)][11]
    Mean_0 = sumx_0 / count_0
    Variance_0 = (sumx2_0 - (((sumx_0) ** 2) / count_0)) / count_0

    zscore = (Mean_0 - Mean) / (np.sqrt((Variance_0 / count_0) + (Variance / count)))
    prob = 1 - stats.norm.cdf(zscore)
    return 2 * prob


def Ttest(BET, col1, col2):

    """ Two-sample pooled-variance t test on the mean of the numeric feature col1 across the
    two groups defined by the binary (0/1) feature col2. Returns the two-tailed p-value.
    """

    BET.reset_index(drop=True, inplace=True)
    x = BET.to_dict(orient='list')
    keys = list(x.keys())

    # class-1 group: sums of col1 restricted to the rows where col2 == 1
    count = x[col2][keys.index(col1)][6]
    sumx = x[col2][keys.index(col1)][10]
    sumx2 = x[col2][keys.index(col1)][11]
    Mean = sumx / count
    Variance = (sumx2 - (((sumx) ** 2) / count)) / count

    # class-0 group: the overall totals minus the class-1 portion
    count_0 = x[col2][keys.index(col1)][0] - x[col2][keys.index(col1)][6]
    sumx_0 = x[col2][keys.index(col1)][1] - x[col2][keys.index(col1)][10]
    sumx2_0 = x[col2][keys.index(col1)][2] - x[col2][keys.index(col1)][11]
    Mean_0 = sumx_0 / count_0
    Variance_0 = (sumx2_0 - (((sumx_0) ** 2) / count_0)) / count_0

    # pooled within-group variance
    var = (((count_0 - 1) * Variance_0) + ((count - 1) * Variance)) / (count_0 + count - 2)

    tscore = (Mean_0 - Mean) / (np.sqrt(var * ((1 / count_0) + (1 / count))))

    df = count + count_0 - 2                  # degrees of freedom
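
    # two-tailed p-value: twice the upper-tail probability of Student's t
    # with (count + count_0 - 2) degrees of freedom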
    prob = 1 - stats.t.cdf(tscore, df)
    return 2 * prob


def chi2(BET, feature_1, feature_2):

    """ Chi-square test of independence between two categorical features, each given as a list
    of its one-hot (dummy) column names in the BET. Prints the chi-square statistic and the
    degrees of freedom, and returns the p-value.
    """

    BET.reset_index(drop=True, inplace=True)
    x = BET.to_dict(orient='list')
    keys = list(x.keys())
    obs_freq = {}
    exp_freq = {}
    sum_exp_freq_vertical = np.zeros(len(feature_2))
    chi2 = 0

    # observed frequencies: the co-occurrence counts come straight from sum(xy)
    for i in range(len(feature_1)):
        obs_freq[feature_1[i]] = []

        for j in range(len(feature_2)):
            col1 = feature_1[i]
            col2 = feature_2[j]
            sumx = x[col1][keys.index(col2)][10]
            obs_freq[feature_1[i]].append(sumx)

        sum_exp_freq_vertical = sum_exp_freq_vertical + np.array(obs_freq[feature_1[i]])
    total_in_contingency = sum(sum_exp_freq_vertical)

    # expected frequencies under independence: row total * column total / grand total
    for i in range(len(feature_1)):
        exp_freq[feature_1[i]] = []
        sum_exp_freq_horizontal = sum(obs_freq[feature_1[i]])
        for j in range(len(feature_2)):
            e = (sum_exp_freq_horizontal * sum_exp_freq_vertical[j]) / total_in_contingency
            exp_freq[feature_1[i]].append(e)

    for i in range(len(feature_1)):
        for j in range(len(feature_2)):
            chi2 = chi2 + ((obs_freq[feature_1[i]][j] - exp_freq[feature_1[i]][j]) ** 2) / exp_freq[feature_1[i]][j]

    df = (len(feature_1) - 1) * (len(feature_2) - 1)

    p_value = stats.chi2.sf(chi2, df)         # survival function; scipy.stats.chisqprob was removed from SciPy
    print('chi2: ' + str(chi2))
    print('df: ' + str(df))
    print('p-value: ' + str(p_value))
    return p_value


# Models:

def LDA_fit(BET, target):

    """
    Linear Discriminant Analysis (LDA) is a classification method that searches for the linear
    combination of variables (predictors) that best separates the classes (targets).

    It essentially performs supervised dimensionality reduction by projecting the input data
    onto a linear subspace consisting of the directions which maximize the separation between
    classes (maximizing the difference between the group means while reducing the standard
    deviation within the groups).

    Examples
    --------
    LDA_fit(Basic_Element_Table, Target)

    where 'Basic_Element_Table' is obtained from the BET function for the data and 'Target' is
    the binary feature that needs to be predicted.

    The function returns (mean1, mean2, Beta, prob): the mean vectors of the two groups, the
    linear model coefficients and the class-probability threshold respectively.
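
    In terms of the basic elements, the coefficients solve Beta = inv(S_w) * (mean1 - mean2),
    where S_w is the pooled within-class covariance matrix assembled below and mean1/mean2 are
    the class-wise feature means; prob is the threshold -log(count_1/count_2) that LDA_predict
    compares the projections against.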

    """
    BET1 = BET
    BET1.reset_index(drop=True, inplace=True)
    x = BET1.to_dict(orient='list')
    keys = list(x.keys())
    k = keys.index(target)
    count_1 = BET[target][k][0] - BET[target][k][1]   # class-0 count: N - sum(target)
    count_2 = BET[target][k][1]                       # class-1 count: sum(target)
    mean1 = []
    mean2 = []
    c = []
    for i in range(len(BET)):
        if i != keys.index(target):
            mean1.append((BET[target][i][1] - BET[target][i][10]) / (BET[target][i][0] - BET[target][i][6]))
            mean2.append((BET[target][i][10]) / BET[target][i][6])

    for i in range(len(BET)):
        if i != keys.index(target):
            for j in range(len(BET)):
                if j != keys.index(target):
                    m = keys[i]
                    n = keys[j]
                    cal1 = (((x[m][k][6] - x[m][k][10]) * (x[n][k][6] - x[n][k][10])) / count_1)
                    cal2 = (x[m][k][10] * x[n][k][10]) / count_2
                    c.append((x[m][j][10] - cal1 - cal2) / (count_1 + count_2 - 2))

    n = len(BET) - 1
    c = np.reshape(np.array(c), (n, n))
    inverse = np.linalg.inv(c)
    z = np.array(mean1) - np.array(mean2)
    Beta = np.matmul(inverse, z.T)
    prob = -math.log(count_1 / count_2)

    return (mean1, mean2, Beta, prob)


def LDA_predict(BET, X, target):
    """
    Predicts the target values for the given data using the LDA parameters calculated from the
    training dataset. Returns the predictions of the LDA model.

    Examples
    --------
    LDA_predict(Basic_Element_Table, Testing_data, Target)

    The BET table and the testing data should be given as inputs.
    """
    (mean1, mean2, Beta, prob) = LDA_fit(BET, target)
    numpy_matrix = X.values                   # DataFrame.as_matrix() was removed from pandas
    q = []
    for i in range(len(numpy_matrix)):
        z = numpy_matrix[i] - (0.5 * (np.array(mean1) - np.array(mean2)))
        if np.matmul(Beta.T, z) > prob:
            q.append(0)
        else:
            q.append(1)
    return q


def accuracy(y, y_pred):
    y = list(y)
    y_pred = list(y_pred)
    matches = []
    for i in range(len(y)):
        if y[i] == y_pred[i]:
            matches.append(1)
    return (sum(matches) / len(y)) * 100


def PCA(BET):
    """
    Principal component analysis (PCA) is a classical statistical method that uses an orthogonal
    transformation to convert a set of observations of possibly correlated variables into a set
    of values of linearly uncorrelated variables called principal components.

    Real-time principal components for a dataset can be extracted from the ART-M covariance
    matrix equations.

    Examples
    --------
    PCA(Basic_Element_Table)

    This function returns the eigenvalues & eigenvectors for the features in the basic element
    table.
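
    Concretely, the components are the eigenvectors v solving C v = lambda v, where
    C = Covariance(BET); since the BET (and hence C) is updated incrementally, the components
    can be recomputed at any time without revisiting the raw data.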
    """

    cov = Covariance(BET)
    cov_mat = cov.values
    eig_vals, eig_vecs = np.linalg.eig(cov_mat)

    print('Eigenvectors: \n%s' % eig_vecs)
    print('\nEigenvalues: \n%s' % eig_vals)

    # Make a list of (eigenvalue, eigenvector) tuples
    eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i]) for i in range(len(eig_vals))]

    # Sort the (eigenvalue, eigenvector) tuples from high to low
    eig_pairs.sort(key=lambda x: x[0], reverse=True)

    # Visually confirm that the list is correctly sorted by decreasing eigenvalues
    print('\nEigenvalues in descending order:')
    for i in eig_pairs:
        print(i[0])

    return eig_vals, eig_vecs


def MLR(BET, target):

    """ Multiple linear regression from the BET: the coefficients solve
    inv(Cov(features)) * Cov(features, target). Prints the intercept b0 and returns the
    coefficient array.
    """

    row_indexes = list(BET.index)
    target_index = row_indexes.index(target)
    BET_features = BET.drop(target, axis=1)
    BET_features = BET_features.drop(target, axis=0)
    cov_features = Covariance(BET_features).values
    cov_target = Covariance(BET).values
    cov_target = cov_target[target_index]
    cov_target = np.delete(cov_target, target_index)
    inverse = np.linalg.inv(cov_features)
    Beta_array = np.matmul(inverse, cov_target)

    BET.reset_index(drop=True, inplace=True)
    x = BET.to_dict(orient='list')
    keys = list(x.keys())

    mean_target = (BET[target][keys.index(target)][1]) / BET[target][keys.index(target)][0]
    mean_X = []

    for i in range(len(BET)):                 # means of every feature except the target
        if i != keys.index(target):
            mean_X.append((BET[target][i][1]) / BET[target][i][0])

    b0 = mean_target - np.matmul(Beta_array, mean_X)

    print('Intercept b0: ' + str(b0))
    return Beta_array


def gaussian_NB(BET, X, target):

    """ Gaussian Naive Bayes from the BET: returns the posterior probability of the positive
    class for a single record X, whose values must be ordered like the BET columns (the entry
    at the target position is unused).
    """

    BET.reset_index(drop=True, inplace=True)
    x = BET.to_dict(orient='list')
    keys = list(x.keys())

    likelihood = 1
    att_prior_prob = 1
    class_prior_prob = 1
    for i in range(len(BET)):
        if keys[i] != target:
            count = x[target][i][6]
            sumxy = x[target][i][10]
            sumxy2 = x[target][i][11]
            Mean = sumxy / count
            Variance = (sumxy2 - (((sumxy) ** 2) / count)) / count
            value = X[i]
            # Gaussian density of the value under the class-1 distribution
            likelihood = likelihood * (1 / math.sqrt(2 * np.pi * Variance)) * np.exp(-((value - Mean) ** 2) / (2 * Variance))

            class_prior_prob = count / x[target][i][5]

            count_att = x[target][i][0]
            sumxy_att = x[target][i][1]
            sumxy2_att = x[target][i][2]
            Mean_att = sumxy_att / count_att
            Variance_att = (sumxy2_att - (((sumxy_att) ** 2) / count_att)) / count_att

            # Gaussian density of the value under the overall attribute distribution
            att_prior_prob = att_prior_prob * (1 / math.sqrt(2 * np.pi * Variance_att)) * np.exp(-((value - Mean_att) ** 2) / (2 * Variance_att))

    post_prob = (class_prior_prob * likelihood) / att_prior_prob

    return post_prob


def Multinomial_NB(BET, X, target):

    """ Multinomial Naive Bayes from the BET: returns the posterior probability of the positive
    class built from the stored sums (note that the record X is not yet used in this
    implementation).
    """

    BET.reset_index(drop=True, inplace=True)
    x = BET.to_dict(orient='list')
    keys = list(x.keys())

    likelihood = 1
    att_prior_prob = 1
    class_prior_prob = 1
    for i in range(len(BET)):
        if keys[i] != target:
            sumx = x[target][i][6]
            sumxy = x[target][i][10]
            likelihood = likelihood * (sumxy / sumx)

            class_prior_prob = x[target][i][6] / x[target][i][5]

            count_att = x[target][i][0]
            sumxy_att = x[target][i][1]
            att_prior_prob = att_prior_prob * (sumxy_att / count_att)

    post_prob = (class_prior_prob * likelihood) / att_prior_prob

    return post_prob


def SVM_fit(BET, target):

    """ Fits a linear SVM-style classifier from the Basic Element Table by solving a
    regularised linear system assembled from the basic elements (a least-squares
    formulation; see 'Real Time Data Mining' by Prof. Sayad). Returns the coefficient
    vector Beta.
    """

    BET1 = BET
    BET1.reset_index(drop=True, inplace=True)
    x = BET1.to_dict(orient='list')
    keys = list(x.keys())
    k = keys.index(target)
    EE = []
    last_row = []
    Ede = []
    count = BET[target][k][0]
    for i in range(len(BET)):
        if i != keys.index(target):
            m = keys[i]
            for j in range(len(BET)):
                if j != keys.index(target):
                    EE.append(x[m][j][10])
                if j == keys.index(target):
                    Ede.append(2 * (x[m][j][10]) - x[m][i][6])
                    EE.append(-x[m][i][6])
                    last_row.append(-x[m][i][6])
    # bottom row: the negated per-feature sums followed by the total count;
    # the assembled system needs exactly n*n entries for the reshape below
    final = EE + last_row
    final.append(count)
    n = len(BET)
    final = np.reshape(np.array(final), (n, n))

    Ede.append(count - 2 * (BET[target][k][1]))

    I = np.identity(n)
    const = (I / count) + final

    inverse = np.linalg.inv(const)
    Beta = np.dot(inverse, np.array(Ede))

    return Beta


def SVM_Reg_fit(BET, target, tuning_parameter):

    """ Fits a linear SVM-style regressor from the Basic Element Table; tuning_parameter
    controls the regularisation term added to the diagonal of the assembled system.
    Returns the coefficient vector Beta.
    """

    BET1 = BET
    BET1.reset_index(drop=True, inplace=True)
    x = BET1.to_dict(orient='list')
    keys = list(x.keys())
    k = keys.index(target)
    EE = []
    last_row = []
    Ede = []
    count = BET[target][k][0]
    for i in range(len(BET)):
        if i != keys.index(target):
            m = keys[i]
            for j in range(len(BET)):
                if j != keys.index(target):
                    EE.append(x[m][j][10])
                if j == keys.index(target):
                    Ede.append(x[m][j][10])
                    EE.append(-x[m][i][6])
                    last_row.append(-x[m][i][6])
    # bottom row: the negated per-feature sums followed by the total count
    final = EE + last_row
    final.append(count)
    n = len(BET)
    final = np.reshape(np.array(final), (n, n))

    Ede.append(-(BET[target][k][1]))

    I = np.identity(n)
    const = (I / tuning_parameter) + final

    inverse = np.linalg.inv(const)
    Beta = np.dot(inverse, np.array(Ede))

    return Beta
--------------------------------------------------------------------------------
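
# A minimal end-to-end usage sketch of the incremental workflow in util/module.py, assuming
# util is importable as a package; the synthetic dataframe and column names below are
# illustrative, not part of the library.
import numpy as np
import pandas as pd
from util.module import BET, learn, forget, univariate, correlation, Ttest

np.random.seed(0)
df = pd.DataFrame({'f1': np.random.rand(100),
                   'f2': np.random.rand(100),
                   'target': np.random.randint(0, 2, 100)})

bet = BET(df)                      # build the Basic Element Table once
bet = learn(bet, df.tail(5))       # incrementally learn five extra rows
bet = forget(bet, df.tail(5))      # ...then remove their effect again
print(univariate(bet))             # summary statistics from the BET alone
print(correlation(bet))            # pairwise correlations without the raw data
print(Ttest(bet, 'f1', 'target'))  # two-tailed p-value for f1 across the target groups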