├── AFTERFAC.CPP ├── ARCING.CPP ├── ARCING_M.CPP ├── BILINEAR.CPP ├── BOOT_C_1.CPP ├── BOOT_C_2.CPP ├── BOOT_P_1.CPP ├── BOOT_P_2.CPP ├── BOOT_P_3.CPP ├── BOOT_P_4.CPP ├── BOOT_P_5.CPP ├── CODE_DESCRIPTION.TXT ├── CONFCONF.CPP ├── DEP_BOOT.CPP ├── ENTROPY.CPP ├── GRNN.CPP ├── GRNN.H ├── GRNNGATE.CPP ├── INFO.H ├── INTEGRAT.CPP ├── LICENSE.txt ├── LINREG.CPP ├── LINREG.H ├── LOGISTIC.CPP ├── LOGISTIC.H ├── MC_TRAIN.CPP ├── MEM.CPP ├── MINIMIZE.CPP ├── MINIMIZE.H ├── MI_BIN.CPP ├── MI_CONT.CPP ├── MI_DISC.CPP ├── MI_ONLY.CPP ├── MLFN.CPP ├── MLFN.H ├── MULTCLAS.CPP ├── MULTPRED.CPP ├── MUTINF_B.CPP ├── MUTINF_C.CPP ├── MUTINF_D.CPP ├── PART.CPP ├── PARZDENS.CPP ├── QSORTD.CPP ├── RAND32.CPP ├── READFILE.CPP ├── README.md ├── SPLINE.CPP ├── STATS.CPP ├── TEST_CON.CPP ├── TEST_DIS.CPP ├── TRANSFER.CPP ├── TRANS_ENT.CPP ├── contributing.md ├── spearman.cpp └── svdcmp.h /BILINEAR.CPP: -------------------------------------------------------------------------------- 1 | /******************************************************************************/ 2 | /* */ 3 | /* BILINEAR - Bilinear class for two-dimensional interpolation */ 4 | /* */ 5 | /******************************************************************************/ 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "info.h" 13 | 14 | Bilinear::Bilinear ( // Uses input points (x,y,z) where z=f(x,y) 15 | int nxin , // Number of x points 16 | double *xin , // They are here, sorted ascending 17 | int nyin , // Number of y points 18 | double *yin , // They are here, sorted ascending 19 | double *zin , // Corresponding function values, y changing fastest 20 | int extra // If nonzero, use 3x3 block with quadratic interpolation 21 | ) 22 | { 23 | 24 | quadratic = extra ; 25 | nx = nxin ; 26 | ny = nyin ; 27 | MEMTEXT ( "Bilinear constructor" ) ; 28 | x = (double *) MALLOC ( nx * sizeof(double) ) ; 29 | y = (double *) MALLOC ( ny * sizeof(double) ) ; 30 | z = (double *) MALLOC ( nx * ny * sizeof(double) ) ; 31 | assert ( x != NULL ) ; 32 | assert ( y != NULL ) ; 33 | assert ( z != NULL ) ; 34 | 35 | memcpy ( x , xin , nx * sizeof(double) ) ; 36 | memcpy ( y , yin , ny * sizeof(double) ) ; 37 | memcpy ( z , zin , nx * ny * sizeof(double) ) ; 38 | } 39 | 40 | Bilinear::~Bilinear () 41 | { 42 | MEMTEXT ( "Bilinear destructor" ) ; 43 | FREE ( x ) ; 44 | FREE ( y ) ; 45 | FREE ( z ) ; 46 | } 47 | 48 | double Bilinear::evaluate ( double xpt , double ypt ) 49 | { 50 | int k, kxlo, kxmid, kxhi, kylo, kymid, kyhi ; 51 | double t, u, val, clo, cmid, chi, zlo, zmid, zhi ; 52 | double dlo, dmid, dhi, lo_mid, lo_hi, mid_hi ; 53 | 54 | /* 55 | Bound outlying inputs 56 | */ 57 | 58 | if (xpt < x[0]) 59 | xpt = x[0] ; 60 | if (xpt > x[nx-1]) 61 | xpt = x[nx-1] ; 62 | if (ypt < y[0]) 63 | ypt = y[0] ; 64 | if (ypt > y[ny-1]) 65 | ypt = y[ny-1] ; 66 | 67 | /* 68 | Find the pair of x coordinates that bound the input 69 | */ 70 | 71 | kxlo = 0 ; 72 | kxhi = nx - 1 ; 73 | while (kxhi > kxlo+1) { 74 | k = (kxhi + kxlo) / 2 ; 75 | if (xpt < x[k]) 76 | kxhi = k ; 77 | else 78 | kxlo = k ; 79 | } 80 | 81 | /* 82 | Find the pair of y coordinates that bound the input 83 | */ 84 | 85 | kylo = 0 ; 86 | kyhi = ny - 1 ; 87 | while (kyhi > kylo+1) { 88 | k = (kyhi + kylo) / 2 ; 89 | if (ypt < y[k]) 90 | kyhi = k ; 91 | else 92 | kylo = k ; 93 | } 94 | 95 | /* 96 | 3x3 with quadratic interpolation? 
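       For reference, the quadratic option below is ordinary three-point Lagrange
       interpolation, applied first across the chosen x triple within each of the
       three y rows (giving zlo, zmid, zhi) and then across y.  For abscissas
       x_lo, x_mid, x_hi the weights are

          c_lo  = (x - x_mid)(x - x_hi) / ((x_lo - x_mid)(x_lo - x_hi))
          c_mid = (x - x_lo )(x - x_hi) / ((x_mid - x_lo)(x_mid - x_hi))
          c_hi  = (x - x_lo )(x - x_mid) / ((x_hi - x_lo )(x_hi - x_mid))

       which is exactly what the clo/cmid/chi computations implement, with the
       sign changes folded into the difference terms.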
97 | */ 98 | 99 | if (quadratic) { 100 | // Choose which way to go for the third x point 101 | if (kxlo == 0) { 102 | kxmid = kxhi ; 103 | ++kxhi ; 104 | } 105 | else if (kxhi == nx-1) { 106 | kxmid = kxlo ; 107 | --kxlo ; 108 | } 109 | else if (xpt-x[kxlo] < x[kxhi]-xpt) { 110 | kxmid = kxlo ; 111 | --kxlo ; 112 | } 113 | else { 114 | kxmid = kxhi ; 115 | ++kxhi ; 116 | } 117 | 118 | // Choose which way to go for the third y point 119 | if (kylo == 0) { 120 | kymid = kyhi ; 121 | ++kyhi ; 122 | } 123 | else if (kyhi == ny-1) { 124 | kymid = kylo ; 125 | --kylo ; 126 | } 127 | else if (ypt-y[kylo] < y[kyhi]-ypt) { 128 | kymid = kylo ; 129 | --kylo ; 130 | } 131 | else { 132 | kymid = kyhi ; 133 | ++kyhi ; 134 | } 135 | 136 | dlo = xpt - x[kxlo] ; 137 | dmid = xpt - x[kxmid] ; 138 | dhi = xpt - x[kxhi] ; 139 | lo_mid = x[kxlo] - x[kxmid] ; 140 | lo_hi = x[kxlo] - x[kxhi] ; 141 | mid_hi = x[kxmid] - x[kxhi] ; 142 | clo = dmid * dhi / (lo_mid * lo_hi) ; 143 | cmid = dlo * dhi / (-lo_mid * mid_hi) ; 144 | chi = dlo * dmid / (lo_hi * mid_hi) ; 145 | 146 | zlo = clo * z[kxlo*ny+kylo] + cmid * z[kxmid*ny+kylo] + chi * z[kxhi*ny+kylo] ; 147 | zmid = clo * z[kxlo*ny+kymid] + cmid * z[kxmid*ny+kymid] + chi * z[kxhi*ny+kymid] ; 148 | zhi = clo * z[kxlo*ny+kyhi] + cmid * z[kxmid*ny+kyhi] + chi * z[kxhi*ny+kyhi] ; 149 | 150 | dlo = ypt - y[kylo] ; 151 | dmid = ypt - y[kymid] ; 152 | dhi = ypt - y[kyhi] ; 153 | lo_mid = y[kylo] - y[kymid] ; 154 | lo_hi = y[kylo] - y[kyhi] ; 155 | mid_hi = y[kymid] - y[kyhi] ; 156 | clo = dmid * dhi / (lo_mid * lo_hi) ; 157 | cmid = dlo * dhi / (-lo_mid * mid_hi) ; 158 | chi = dlo * dmid / (lo_hi * mid_hi) ; 159 | 160 | return clo * zlo + cmid * zmid + chi * zhi ; 161 | } // If quadratic 162 | 163 | /* 164 | Ordinary 2x2 bilinear 165 | */ 166 | 167 | else { 168 | t = (xpt - x[kxlo]) / (x[kxhi] - x[kxlo]) ; 169 | u = (ypt - y[kylo]) / (y[kyhi] - y[kylo]) ; 170 | 171 | val = (1.0 - t) * (1.0 - u) * z[kxlo*ny+kylo] ; 172 | val += t * (1.0 - u) * z[kxhi*ny+kylo] ; 173 | val += t * u * z[kxhi*ny+kyhi] ; 174 | val += (1.0 - t) * u * z[kxlo*ny+kyhi] ; 175 | return val ; 176 | } 177 | } 178 | -------------------------------------------------------------------------------- /BOOT_P_1.CPP: -------------------------------------------------------------------------------- 1 | /******************************************************************************/ 2 | /* */ 3 | /* BOOT_P_1 - Bootstrap estimate of bias and variance when s != t */ 4 | /* */ 5 | /******************************************************************************/ 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | double unifrand () ; 15 | double normal () ; 16 | void qsortd ( int istart , int istop , double *x ) ; 17 | 18 | 19 | /* 20 | -------------------------------------------------------------------------------- 21 | 22 | Compute the parameter 23 | 24 | -------------------------------------------------------------------------------- 25 | */ 26 | 27 | double param_mean ( int n , double *x ) 28 | { 29 | int i ; 30 | double mean ; 31 | 32 | mean = 0.0 ; 33 | for (i=0 ; i= n) // Should never happen, but be prepared 80 | k = n - 1 ; 81 | work[i] = data[k] ; // Put bootstrap sample in work 82 | } 83 | 84 | stat = user_s ( n , work ) ; // Evaluate estimator for this boot rep 85 | work2[rep] = stat ; // Enables more accurate variance 86 | mean += stat ; // Cumulate theta-hat star dot 87 | } 88 | 89 | mean /= nboot ; 90 | variance = 0.0 ; 91 | for (rep=0 ; rep 8 | #include 9 | 
#include 10 | #include 11 | #include 12 | #include 13 | 14 | double unifrand () ; 15 | double normal () ; 16 | void qsortd ( int istart , int istop , double *x ) ; 17 | void qsortds ( int istart , int istop , double *x , double *s ) ; 18 | 19 | 20 | /* 21 | -------------------------------------------------------------------------------- 22 | 23 | Compute the parameter 24 | 25 | -------------------------------------------------------------------------------- 26 | */ 27 | 28 | double param_mean ( int n , double *x , double *freq ) 29 | { 30 | int i ; 31 | double mean ; 32 | 33 | mean = 0.0 ; 34 | 35 | if (freq == NULL) { 36 | for (i=0 ; i= 0.5) 70 | return x[0] ; 71 | else if (freq[n-1] >= 0.5) 72 | return x[n-1] ; 73 | 74 | /* 75 | Usual case. Keep it primitive and simple: no interolation. 76 | */ 77 | 78 | sum = 0.0 ; 79 | for (i=0 ; i= 0.5) 82 | break ; 83 | } 84 | 85 | return 0.5 * (x[i] + x[i-1]) ; 86 | } 87 | 88 | /* 89 | -------------------------------------------------------------------------------- 90 | 91 | boot_bias_var - Compute bias and variance of parameter (plug-in case, s=t) 92 | 93 | -------------------------------------------------------------------------------- 94 | */ 95 | 96 | void boot_bias_var ( 97 | int n , // Number of cases in sample 98 | double *data , // The sample 99 | double (*user_s) (int , double * , double * ) , // Compute param 100 | int nboot , // Number of bootstrap replications 101 | double *rawstat , // Raw statistic of sample, theta-hat 102 | double *bias , // Output of bias estimate 103 | double *var , // Output of variance estimate 104 | double *work , // Work area n long 105 | double *work2 , // Work area nboot long 106 | double *freq // Work area n long 107 | ) 108 | { 109 | int i, rep, k ; 110 | double stat, mean, variance, diff ; 111 | 112 | mean = 0.0 ; 113 | 114 | for (i=0 ; i= n) // Should never happen, but be prepared 122 | k = n - 1 ; 123 | work[i] = data[k] ; // Put bootstrap sample in work 124 | ++freq[k] ; // Tally for mean frequency 125 | } 126 | 127 | stat = user_s ( n , work , NULL ) ; // Evaluate estimator for this rep 128 | work2[rep] = stat ; // Enables more accurate variance 129 | mean += stat ; // Cumulate theta-hat star dot 130 | } 131 | 132 | mean /= nboot ; 133 | variance = 0.0 ; 134 | for (rep=0 ; rep 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | double unifrand () ; 15 | double normal () ; 16 | 17 | 18 | /* 19 | -------------------------------------------------------------------------------- 20 | 21 | Compute the parameter 22 | 23 | -------------------------------------------------------------------------------- 24 | */ 25 | 26 | double param_beta ( int n , double *x , double *y ) 27 | { 28 | int i ; 29 | double xmean, ymean, xdif, ydif, xvar, covar ; 30 | 31 | xmean = ymean = 0.0 ; 32 | for (i=0 ; i= n) // Should never happen, but be prepared 85 | k = n - 1 ; 86 | xwork[i] = x[k] ; // Put bootstrap sample in work 87 | ywork[i] = y[k] ; 88 | } 89 | 90 | stat = user_t ( n , xwork , ywork ) ; // Evaluate estimator for this rep 91 | work2[rep] = stat ; // Enables more accurate variance 92 | mean += stat ; // Cumulate theta-hat star dot 93 | } 94 | 95 | mean /= nboot ; 96 | variance = 0.0 ; 97 | for (rep=0 ; rep 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | double unifrand () ; 15 | double normal () ; 16 | 17 | /* 18 | -------------------------------------------------------------------------------- 19 | 20 | Compute the parameter 21 | 22 | 
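     In this example the parameter is the profit factor: the sum of the positive
     returns divided by the absolute sum of the negative returns.  When a
     frequency vector is supplied, each return is weighted by freq[i] before
     summing.  The tiny 1.e-5 starting values merely guard against division by
     zero when a sample happens to contain no losses.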
-------------------------------------------------------------------------------- 23 | */ 24 | 25 | double param_pf ( int n , double *x , double *freq ) 26 | { 27 | int i ; 28 | double sum_win, sum_loss ; 29 | 30 | sum_win = sum_loss = 1.e-5 ; // Really zero, but prevent division by 0 31 | 32 | if (freq == NULL) { 33 | for (i=0 ; i 0.0) 35 | sum_win += x[i] ; 36 | else 37 | sum_loss -= x[i] ; 38 | } 39 | return sum_win / sum_loss ; 40 | } 41 | 42 | for (i=0 ; i 0.0) 44 | sum_win += freq[i] * x[i] ; 45 | else 46 | sum_loss -= freq[i] * x[i] ; 47 | } 48 | return sum_win / sum_loss ; 49 | } 50 | 51 | /* 52 | -------------------------------------------------------------------------------- 53 | 54 | boot_bias_var - Compute bias and variance of parameter (plug-in case, s=t) 55 | 56 | -------------------------------------------------------------------------------- 57 | */ 58 | 59 | void boot_bias_var ( 60 | int n , // Number of cases in sample 61 | double *data , // The sample 62 | double (*user_t) (int , double * , double * ) , // Compute param 63 | int nboot , // Number of bootstrap replications 64 | double *rawstat , // Raw statistic of sample, theta-hat 65 | double *bias , // Output of bias estimate 66 | double *var , // Output of variance estimate 67 | double *work , // Work area n long 68 | double *work2 , // Work area nboot long 69 | double *freq // Work area n long 70 | ) 71 | { 72 | int i, rep, k ; 73 | double stat, mean, variance, diff ; 74 | 75 | mean = 0.0 ; 76 | 77 | for (i=0 ; i= n) // Should never happen, but be prepared 85 | k = n - 1 ; 86 | work[i] = data[k] ; // Put bootstrap sample in work 87 | ++freq[k] ; // Tally for mean frequency 88 | } 89 | 90 | stat = user_t ( n , work , NULL ) ; // Evaluate estimator for this rep 91 | work2[rep] = stat ; // Enables more accurate variance 92 | mean += stat ; // Cumulate theta-hat star dot 93 | } 94 | 95 | mean /= nboot ; 96 | variance = 0.0 ; 97 | for (rep=0 ; rep 0.0) // Cumulate so we know the true value 230 | grand_wins += x[i] ; 231 | else 232 | grand_losses -= x[i] ; 233 | } 234 | 235 | boot_bias_var ( nsamps , x , param_pf , nboot , 236 | &computed_param_1[itry] , &computed_bias_1[itry] , 237 | &computed_var_1[itry] , work , work2 , freq ) ; 238 | 239 | jack_bias_var ( nsamps , x , param_pf , 240 | &computed_param_2[itry] , &computed_bias_2[itry] , 241 | &computed_var_2[itry] , work ) ; 242 | 243 | if (((itry % divisor) == 1) 244 | || (itry == ntries-1) ) { // Don't do this every try! Too slow. 
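         // The block below runs only every 'divisor' tries (and on the last try).
         // It summarizes all tries completed so far: the true profit factor
         // computed from every return generated so far, then the mean and
         // variance across tries of the bootstrap and jackknife estimates of
         // the parameter, its bias, and its variance.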
245 | ndone = itry + 1 ; // This many tries done (and in arrays) 246 | printf ( "\n\n\nTry %d True value = %lf", itry, 247 | grand_wins / grand_losses ) ; 248 | 249 | /* 250 | Process test 1 of 2 251 | */ 252 | 253 | mean_computed_param = 0.0 ; 254 | mean_computed_bias = 0.0 ; 255 | mean_computed_var = 0.0 ; 256 | var_computed_param = 0.0 ; 257 | var_computed_bias = 0.0 ; 258 | for (i=0 ; i 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | double unifrand () ; 15 | void qsortd ( int istart , int istop , double *x ) ; 16 | double quantile_conf ( int n , int m , double conf ) ; 17 | double inverse_ks ( int n , double cdf ) ; 18 | double ks_test ( int n , double *x , double *D_plus , double *D_minus ) ; 19 | 20 | int main ( 21 | int argc , // Number of command line arguments (includes prog name) 22 | char *argv[] // Arguments (prog name is argv[0]) 23 | ) 24 | { 25 | int i, ncases, irep, nreps, m, n_lower, n_upper, n_ks2, n_ks_null, n_ks_alt ; 26 | double *x, pval, conf, pessimistic_lower, pessimistic_upper ; 27 | double ks_two, ks_one, D, Dp, Dm ; 28 | 29 | if (argc != 5) { 30 | printf ( "\nUsage: ConfConf ncases pval conf nreps" ) ; 31 | printf ( "\n ncases - Number of cases in the sample" ) ; 32 | printf ( "\n pval - Probability value (<0.5) for quantile test" ) ; 33 | printf ( "\n conf - Desired confidence value (<0.5) for both tests" ) ; 34 | printf ( "\n nreps - Number of replications" ) ; 35 | exit ( 1 ) ; 36 | } 37 | 38 | ncases = atoi ( argv[1] ) ; 39 | pval = atof ( argv[2] ) ; 40 | conf = atof ( argv[3] ) ; 41 | nreps = atoi ( argv[4] ) ; 42 | 43 | if (ncases < 10) { 44 | printf ( "\nERROR.. Must have at least 10 cases" ) ; 45 | exit ( 1 ) ; 46 | } 47 | 48 | if (pval * ncases < 1.0 || pval >= 0.5) { 49 | printf ( "\nERROR.. Pval too small or too large" ) ; 50 | exit ( 1 ) ; 51 | } 52 | 53 | if (conf <= 0.0 || conf >= 0.5) { 54 | printf ( "\nERROR.. Conf must be greater than 0 and less than 0.5" ) ; 55 | exit ( 1 ) ; 56 | } 57 | 58 | if (nreps < 1) { 59 | printf ( "\nERROR.. Must have at least 1 replication" ) ; 60 | exit ( 1 ) ; 61 | } 62 | 63 | 64 | /* 65 | Allocate memory and initialize 66 | */ 67 | 68 | x = (double *) malloc ( ncases * sizeof(double) ) ; 69 | 70 | m = (int) (pval * ncases) ; // Conservative order statistic for bound 71 | pessimistic_lower = quantile_conf ( ncases , m , conf ) ; 72 | pessimistic_upper = 1.0 - pessimistic_lower ; 73 | ks_two = inverse_ks ( ncases , 1.0 - conf ) ; // Two-tailed test 74 | ks_one = inverse_ks ( ncases , 1.0 - 2.0 * conf ) ; // One-tailed test 75 | 76 | printf ( "\nSuppose the model predicts values near 0 for the null hypothesis" ) ; 77 | printf ( "\nand values near 1 for the alternative hypothesis." 
) ; 78 | 79 | printf ( "\n\nIf the dataset represents the null hypothesis, the threshold" ) ; 80 | printf ( "\nfor rejecting the null at p=%.4lf is given by the %d'th order statistic.", 81 | pval, ncases - m + 1 ) ; 82 | printf ( "\nThis is a conservative estimate of the %.4lf quantile", 1.0-pval ) ; 83 | printf ( "\nThere is only a %.4lf chance that it will really be the %.4lf quantile or worse.", 84 | conf, pessimistic_upper ) ; 85 | 86 | printf ( "\n\nIf the dataset represents the alternative hypothesis, the threshold" ) ; 87 | printf ( "\nfor rejecting the alt at p=%.4lf is given by the %d'th order statistic.", 88 | pval, m ) ; 89 | printf ( "\nThis is a conservative estimate of the %.4lf quantile", pval ) ; 90 | printf ( "\nThere is only a %.4lf chance that it will really be the %.4lf quantile or worse.", 91 | conf, pessimistic_lower) ; 92 | 93 | printf ( "\n\nKS thresholds: two-tailed KS = %.4lf one-tailed KS = %.4lf", 94 | ks_two, ks_one ) ; 95 | 96 | /* 97 | Now generate nreps samples. Verify that our required confidence level 98 | is observed. Note that the fact that this test uses a uniform distribution 99 | does not in any way limit its applicability to uniform distributions. 100 | If one were to generate cases from any other reasonable distribtion, 101 | the pessimistic quantile bounds would have to be transformed similarly. 102 | The result is that the inequalities below would pass or fail identically. 103 | We count the number of times 'disaster' happens. 104 | Disaster is when the order statistic used for the threshold is toward the 105 | inside (center) of the distribution, meaning that if this order statistic 106 | had been used as a threshold, more of the distribution would be outside 107 | the threshold than the user expected. We expect disaster to happen with 108 | probability equal to the specified conf parameter. 109 | 110 | For the two-tailed Kolmogorov-Smirnov test, disaster is when the empirical 111 | CDF deviates (above or below) from the correct value by more than the 112 | conf-inspired value. For the one-tailed test in which the dataset is from 113 | the NULL distribution, disaster is when the empirical CDF exceeds the true 114 | CDF, a situation that would encourage false rejection of the null hypothesis. 115 | This is measured by D+. For the one-tailed test in which the dataset is from 116 | the ALT distribution, disaster is when the empirical CDF is less than the 117 | true CDF, a situation that would encourage false rejection of the alternative 118 | hypothesis. This is measured by D-. 
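   For reference, with F the true cumulative distribution (here the uniform, so
   F(x) = x) and Fn the empirical CDF of the sample, the quantities returned by
   ks_test() are

      D+ = max over the sample of ( Fn(x) - F(x) )
      D- = max over the sample of ( F(x) - Fn(x) )
      D  = max ( D+ , D- )

   The one-tailed threshold is taken from inverse_ks() at 1 - 2*conf because a
   single tail of the two-sided statistic carries roughly half of its rejection
   probability.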
119 | */ 120 | 121 | n_lower = n_upper = n_ks2 = n_ks_null = n_ks_alt = 0 ; 122 | 123 | for (irep=0 ; irep pessimistic_lower) 130 | ++n_lower ; 131 | 132 | if (x[ncases-m] < pessimistic_upper) 133 | ++n_upper ; 134 | 135 | D = ks_test ( ncases , x , &Dp , &Dm ) ; 136 | if (D > ks_two) 137 | ++n_ks2 ; 138 | if (Dp > ks_one) 139 | ++n_ks_null ; 140 | if (Dm > ks_one) 141 | ++n_ks_alt ; 142 | } 143 | 144 | printf ( "\nPoint failure (expected=%.4lf) Lower=%.4lf Upper=%.4lf", 145 | conf, (double) n_lower / nreps, (double) n_upper / nreps) ; 146 | printf ( "\nKS failure: two-tailed = %.4lf NULL = %.4lf ALT = %.4lf", 147 | (double) n_ks2 / nreps, (double) n_ks_null / nreps, 148 | (double) n_ks_alt / nreps) ; 149 | 150 | free ( x ) ; 151 | return ( 0 ) ; 152 | } 153 | -------------------------------------------------------------------------------- /ENTROPY.CPP: -------------------------------------------------------------------------------- 1 | /******************************************************************************/ 2 | /* */ 3 | /* ENTROPY - Compute the entropy of each of a set of variables */ 4 | /* */ 5 | /******************************************************************************/ 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "..\info.h" 15 | 16 | /* 17 | These are defined in MEM.CPP 18 | */ 19 | 20 | extern int mem_keep_log ; // Keep a log file? 21 | extern char mem_file_name[] ; // Log file name 22 | extern int mem_max_used ; // Maximum memory ever in use 23 | 24 | int main ( 25 | int argc , // Number of command line arguments (includes prog name) 26 | char *argv[] // Arguments (prog name is argv[0]) 27 | ) 28 | 29 | { 30 | int i, k, nbins, itype, nvars, ncases, ivar, *counts, ilow, ihigh, nb ; 31 | int istart, istop, ibest, *sortwork, n_indep_vars ; 32 | double *data, *work, *entropies, *proportional, p, max_entropy, low, high ; 33 | double dist, best_dist, factor, entropy ; 34 | short int *bins ; 35 | char filename[256], **names ; 36 | FILE *fp ; 37 | 38 | /* 39 | Process command line parameters 40 | */ 41 | 42 | #if 1 43 | if (argc != 5) { 44 | printf ( "\nUsage: ENTROPY datafile nvars nbins type" ) ; 45 | printf ( "\n datafile - name of the text file containing the data" ) ; 46 | printf ( "\n The first line is variable names" ) ; 47 | printf ( "\n Subsequent lines are the data" ) ; 48 | printf ( "\n Delimiters can be space, comma, or tab" ) ; 49 | printf ( "\n nvars - This many variables, starting with the first, will be tested" ) ; 50 | printf ( "\n nbins - If the data is discrete, this must be at least the" ) ; 51 | printf ( "\n number of bins. It will be automatically reduced" ) ; 52 | printf ( "\n to the exact number of bins." ) ; 53 | printf ( "\n If the data is continuous, it specifies the number of bins" ) ; 54 | printf ( "\n computed by linearly scaling the interior/exterior range." ) ; 55 | printf ( "\n About 10 bins is generally good for continuous data," ) ; 56 | printf ( "\n although values as high as 15 or, rarely, even 20, may be" ) ; 57 | printf ( "\n appropriate if there are tens of thousands of cases." 
) ; 58 | printf ( "\n type - Type of data processing:" ) ; 59 | printf ( "\n 1 - The data is discrete" ) ; 60 | printf ( "\n 2 - The data is continuous, and the entire range is to be tested" ) ; 61 | printf ( "\n 3 - The data is continuous, and the extremes are to be truncated" ) ; 62 | exit ( 1 ) ; 63 | } 64 | 65 | strcpy ( filename , argv[1] ) ; 66 | n_indep_vars = atoi ( argv[2] ) ; 67 | nbins = atoi ( argv[3] ) ; 68 | itype = atoi ( argv[4] ) ; 69 | #else 70 | strcpy ( filename , "..\\VARS.TXT" ) ; 71 | n_indep_vars = 8 ; 72 | nbins = 10 ; 73 | itype = 2 ; 74 | #endif 75 | 76 | if (itype < 1 || itype > 3) { 77 | printf ( "\nERROR... type illegal" ) ; 78 | return EXIT_FAILURE ; 79 | } 80 | 81 | if (nbins < 2 || (itype > 1 && nbins < 3)) { 82 | printf ( "\nERROR... nbins illegal" ) ; 83 | return EXIT_FAILURE ; 84 | } 85 | 86 | /* 87 | These are used by MEM.CPP for runtime memory validation 88 | */ 89 | 90 | _fullpath ( mem_file_name , "MEM.LOG" , 256 ) ; 91 | fp = fopen ( mem_file_name , "wt" ) ; 92 | if (fp == NULL) { // Should never happen 93 | printf ( "\nCannot open MEM.LOG file for writing!" ) ; 94 | return EXIT_FAILURE ; 95 | } 96 | fclose ( fp ) ; 97 | mem_keep_log = 1 ; 98 | mem_max_used = 0 ; 99 | 100 | /* 101 | Open the text file to which results will be written 102 | */ 103 | 104 | fp = fopen ( "ENTROPY.LOG" , "wt" ) ; 105 | if (fp == NULL) { // Should never happen 106 | printf ( "\nCannot open ENTROPY.LOG file for writing!" ) ; 107 | return EXIT_FAILURE ; 108 | } 109 | 110 | /* 111 | Read the file 112 | */ 113 | 114 | if (readfile ( filename , &nvars , &names , &ncases , &data )) 115 | return EXIT_FAILURE ; 116 | 117 | /* 118 | Allocate scratch memory 119 | 120 | bins - Bin ids for all variables 121 | counts - Count of cases in each bin 122 | entropies - Entropy of each variable 123 | proportional - Proportional entropy of each variable 124 | work - Temporary use for extracting a variable from the dataset 125 | sortwork - Temporary use for printing variable's information sorted 126 | */ 127 | 128 | MEMTEXT ( "ENTROPY 6 allocs" ) ; 129 | bins = (short int *) MALLOC ( ncases * sizeof(short int) ) ; 130 | assert ( bins != NULL ) ; 131 | counts = (int *) MALLOC ( nbins * sizeof(int) ) ; 132 | assert ( counts != NULL ) ; 133 | entropies = (double *) MALLOC ( n_indep_vars * sizeof(double) ) ; 134 | assert ( entropies != NULL ) ; 135 | proportional = (double *) MALLOC ( n_indep_vars * sizeof(double) ) ; 136 | assert ( proportional != NULL ) ; 137 | work = (double *) MALLOC ( ncases * sizeof(double) ) ; 138 | assert ( work != NULL ) ; 139 | sortwork = (int *) MALLOC ( n_indep_vars * sizeof(int) ) ; 140 | assert ( sortwork != NULL ) ; 141 | 142 | /* 143 | If splitting a continuous variable across interior range, 144 | compute things that will be needed. 145 | */ 146 | 147 | if (itype > 1) { 148 | nb = nbins ; // Always needed 149 | ilow = (ncases + 1) / nb - 1 ; // Needed only if itype==3 150 | if (ilow < 0) 151 | ilow = 0 ; 152 | ihigh = ncases - 1 - ilow ; 153 | } 154 | 155 | /* 156 | If splitting a discrete variable, warn the user if the variable is continuous 157 | */ 158 | 159 | else { 160 | for (ivar=0 ; ivar work[i-1]) 167 | ++k ; 168 | } 169 | if (k > nbins) 170 | fprintf ( fp, 171 | "\nWARNING... %s has %d distinct values, not %d. Results will be incorrect.", 172 | names[ivar], k, nbins ) ; 173 | } 174 | } 175 | 176 | /* 177 | Compute and save the entropy for each variable. 178 | Print the results, sort them, and print them again, this time sorted. 
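   Each entropy is the usual H = -SUM( p[i] * log(p[i]) ) over the variable's bins,
   where p[i] is the fraction of cases falling in bin i and empty bins are skipped.
   The 'Proportional' column divides H by the maximum attainable entropy (the log
   of the number of bins), so a value of 1.0 means the cases are spread uniformly
   across the bins.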
179 | */ 180 | 181 | fprintf ( fp , "\n Variable Entropy Proportional" ) ; 182 | 183 | for (ivar=0 ; ivar high) 201 | high = work[i] ; 202 | if (work[i] < low) 203 | low = work[i] ; 204 | } 205 | for (i=0 ; i 0) { 254 | p = (double) counts[i] / (double) ncases ; 255 | entropy -= p * log(p) ; 256 | } 257 | } 258 | 259 | sortwork[ivar] = ivar ; 260 | entropies[ivar] = entropy ; 261 | proportional[ivar] = entropy / max_entropy ; 262 | 263 | fprintf ( fp , "\n%31s %11.5lf %11.5lf", 264 | names[ivar], entropy, proportional[ivar] ) ; 265 | } 266 | 267 | fprintf ( fp , "\n" ) ; 268 | fprintf ( fp , "\n" ) ; 269 | fprintf ( fp , "\nEntropies, in decreasing order" ) ; 270 | fprintf ( fp , "\n" ) ; 271 | fprintf ( fp , "\n Variable Entropy Relative" ) ; 272 | 273 | qsortdsi ( 0 , n_indep_vars-1 , entropies , sortwork ) ; 274 | for (ivar=0 ; ivar 31 | #include 32 | #include 33 | #include 34 | #include "grnn.h" 35 | 36 | double normal () ; 37 | #define EPS1 1.e-180 38 | 39 | /* 40 | -------------------------------------------------------------------------------- 41 | 42 | Constructor, destructor, reset(), add_case() 43 | 44 | -------------------------------------------------------------------------------- 45 | */ 46 | 47 | GRNN::GRNN ( int ncase , int nin , int nout ) 48 | { 49 | ncases = ncase ; 50 | ninputs = nin ; 51 | noutputs = nout ; 52 | tset = (double *) malloc ( ncases * (ninputs + noutputs) * sizeof(double) ) ; 53 | sigma = (double *) malloc ( ninputs * sizeof(double) ) ; 54 | outwork = (double *) malloc ( noutputs * sizeof(double) ) ; 55 | reset () ; 56 | } 57 | 58 | 59 | GRNN::~GRNN () 60 | { 61 | if (tset != NULL) 62 | free ( tset ) ; 63 | if (sigma != NULL) 64 | free ( sigma ) ; 65 | if (outwork != NULL) 66 | free ( outwork ) ; 67 | } 68 | 69 | /* 70 | This discards any existing training data. 71 | It does not need to be called after construction, but it must 72 | be called if the user wants to reuse the GRNN object for a new dataset. 73 | */ 74 | 75 | void GRNN::reset () 76 | { 77 | nrows = 0 ; // No rows (via add_case()) yet present 78 | trained = 0 ; // Training not done yet 79 | } 80 | 81 | /* 82 | Build the training set one case at a time. 83 | The user must call this member EXACTLY ncases times after construction 84 | or a call to reset(), and before a call to train(). 85 | */ 86 | 87 | void GRNN::add_case ( double *newcase ) 88 | { 89 | if (nrows >= ncases) // Careful user never lets this happen 90 | return ; // But cheap insurance 91 | 92 | memcpy ( tset + nrows * (ninputs + noutputs) , newcase , 93 | (ninputs + noutputs) * sizeof(double) ) ; 94 | ++nrows ; 95 | } 96 | 97 | 98 | /* 99 | -------------------------------------------------------------------------------- 100 | 101 | predict() - Given an input vector, compute output using trained model 102 | 103 | -------------------------------------------------------------------------------- 104 | */ 105 | 106 | void GRNN::predict ( 107 | double *input , // Input vector 108 | double *output // Returned output 109 | ) 110 | { 111 | int icase, iout, ivar ; 112 | double *dptr, diff, dist, psum ; 113 | 114 | for (iout=0 ; iout 8 | #include "info.h" 9 | 10 | #define INTBUF 100 /* Incredibly conservative! (divisions 2^(-100) are tiny!) 
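   The adaptive scheme below pops an interval from this stack, compares the
   one-panel trapezoid estimate (lowres) with the two-panel estimate (hires), and
   accepts hires when the interval is narrow enough and the two estimates agree
   within tolerance; otherwise the two halves are pushed back for refinement.
   Because the error of hires is roughly (lowres - hires) / 3, fac is set to
   3 * tol.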
*/ 11 | 12 | double integrate ( 13 | double low , // Lower limit for definite integral 14 | double high , // Upper limit 15 | double min_width , // Demand subdivision this small or smaller 16 | double acc , // Relative interval width limit 17 | double tol , // Relative error tolerance 18 | double (*criter) (double) // Criterion function 19 | ) 20 | { 21 | int istack ; 22 | double sum, a, b, mid, fa, fb, fmid, lowres, hires, fac ; 23 | 24 | struct IntStack { 25 | double x0 ; 26 | double x1 ; 27 | double f0 ; 28 | double f1 ; 29 | } stack[INTBUF] ; 30 | 31 | fac = 3.0 * tol ; // Error is about (lowres-hires) / 3 32 | 33 | /* 34 | Start by initializing the stack to be the entire interval 35 | and the integral so far to be zero 36 | */ 37 | 38 | stack[0].x0 = low ; 39 | stack[0].f0 = criter ( low ) ; 40 | stack[0].x1 = high ; 41 | stack[0].f1 = criter ( high ) ; 42 | istack = 1 ; 43 | sum = 0.0 ; 44 | 45 | /* 46 | Main algorithm starts here. Pop interval off stack and test its quality. 47 | */ 48 | 49 | while (istack > 0) { // While there is still at least one interval on stack 50 | --istack ; // Pop this interval 51 | a = stack[istack].x0 ; 52 | b = stack[istack].x1 ; 53 | fa = stack[istack].f0 ; 54 | fb = stack[istack].f1 ; 55 | mid = 0.5 * (a + b) ; 56 | fmid = criter ( mid ) ; 57 | lowres = 0.5 * (b - a) * (fa + fb) ; // Trapezoidal rule 58 | hires = 0.25 * (b - a) * (fa + 2.0 * fmid + fb) ; // And refined value 59 | // If the interval is ridiculously narrow, no point in continuing 60 | // If it gets this far, chances are the integrand is discontinuous 61 | if (b - a <= acc * (1.0 + fabs(a) + fabs(b))) 62 | sum += hires ; // Quit trying to refine 63 | else if ((b - a) <= min_width && fabs(lowres-hires) < fac * (b - a)) 64 | sum += hires ; // Normal convergence flag 65 | else { 66 | stack[istack].x0 = a ; 67 | stack[istack].f0 = fa ; 68 | stack[istack].x1 = mid ; 69 | stack[istack].f1 = fmid ; 70 | ++istack ; 71 | if (istack < INTBUF) { // Insurance against catastrophe only 72 | stack[istack].x0 = mid ; // Should ALWAYS be true (easily!) 73 | stack[istack].f0 = fmid ; // If this if() fails, the answer will 74 | stack[istack].x1 = b ; // of course be wrong, but only due to 75 | stack[istack].f1 = fb ; // a horrendous underlying problem 76 | ++istack ; // like a singularity in the function 77 | } 78 | else { 79 | --istack ; // Error condition, so undo push 80 | sum += hires ; // And go with this best estimiate 81 | } 82 | } 83 | } 84 | return sum ; 85 | } 86 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/assessing-and-improving-prediction-and-classification/28736ace4e23f260aa4a19dbab092f668a96480a/LICENSE.txt -------------------------------------------------------------------------------- /LINREG.CPP: -------------------------------------------------------------------------------- 1 | /******************************************************************************/ 2 | /* */ 3 | /* LINREG - Linear regression */ 4 | /* */ 5 | /* To use this class: */ 6 | /* 1) Construct a new instance of the class */ 7 | /* 2) Call add_case() exactly ncases times, each time providing the */ 8 | /* npred=ncols vector of predictors. 
*/ 9 | /* 3) Call solve() as many times as desired with various right hand sides */ 10 | /* 4) Optionally, call reset() and go to step 2 */ 11 | /* */ 12 | /* This does not include any checks for insufficient memory. */ 13 | /* It also assumes that the user calls add_case exactly ncases times */ 14 | /* and does not check for failure to do so. */ 15 | /* */ 16 | /******************************************************************************/ 17 | 18 | #include 19 | #include 20 | #include 21 | #include "linreg.h" 22 | 23 | LinReg::LinReg ( int ncase , int ncol ) 24 | { 25 | ncases = ncase ; 26 | ncols = ncol ; 27 | svd = new SingularValueDecomp ( ncase , ncol ) ; 28 | reset () ; 29 | } 30 | 31 | 32 | LinReg::~LinReg () 33 | { 34 | if (svd != NULL) 35 | delete svd ; 36 | } 37 | 38 | /* 39 | This discards any existing design matrix. 40 | It does not need to be called after construction, but it must 41 | be called if the user wants to reuse the LinReg object for a new 42 | design matrix. 43 | */ 44 | 45 | void LinReg::reset () 46 | { 47 | nrows = 0 ; // No rows (via add_case()) yet present 48 | decomp = 0 ; // Decomposition not done yet 49 | } 50 | 51 | /* 52 | Build the design matrix one case at a time. 53 | The user must call this member EXACTLY ncases times after construction 54 | or a call to reset(), and before a call to solve(). 55 | */ 56 | 57 | void LinReg::add_case ( double *newcase ) 58 | { 59 | if (nrows >= ncases) // Careful user never lets this happen 60 | return ; // But cheap insurance 61 | 62 | memcpy ( svd->a + nrows * ncols , newcase , ncols * sizeof(double) ) ; 63 | ++nrows ; 64 | } 65 | 66 | /* 67 | After add_case has been called exactly ncases times, this may be called 68 | as many times as desired to solve a system. 69 | */ 70 | 71 | void LinReg::solve ( 72 | double eps , // Singularity limit, typically 1.e-8 or so 73 | double *rhs , // Right hand side, ncases long 74 | double *b // Output of solution, npred=ncols long 75 | ) 76 | { 77 | int i ; 78 | 79 | if (nrows != ncases) { // Careful user never lets this happen 80 | for (i=0 ; isvdcmp () ; // Do it now 87 | decomp = 1 ; // And flag that it has been done 88 | } 89 | 90 | memcpy ( svd->b , rhs , ncases * sizeof(double) ) ; 91 | svd->backsub ( eps , b ) ; 92 | } 93 | -------------------------------------------------------------------------------- /LINREG.H: -------------------------------------------------------------------------------- 1 | #ifndef SVD 2 | #define SVD 3 | class SingularValueDecomp { 4 | 5 | public: 6 | 7 | SingularValueDecomp ( int nrows , int ncols , int save_a=0 ) ; 8 | ~SingularValueDecomp () ; 9 | void svdcmp () ; 10 | void backsub ( double limit , double *soln ) ; 11 | 12 | int ok ; // Was everything legal and allocs successful? 13 | 14 | /* 15 | These are made public to allow access if desired. 16 | Normally, only 'a' (the design matrix) and 'b' (the right-hand-side) 17 | are written by the user. If 'save_a' is nonzero, 'a' is kept intact. 
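    A minimal usage sketch, using the members declared just below (error
    handling omitted; 'coefs' is a caller-supplied array ncols long):

       SingularValueDecomp svd ( nrows , ncols ) ;  // then check svd.ok
       // ... fill svd.a with the nrows by ncols design matrix, row by row ...
       svd.svdcmp () ;                    // decompose once
       // ... fill svd.b with an nrows right-hand side ...
       svd.backsub ( 1.e-8 , coefs ) ;    // coefs receives the ncols solution
       // backsub() may be called again with new b vectors without redoing svdcmp()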
18 | */ 19 | 20 | double *a ; // nrows by ncols input of design, output of U 21 | double *u ; // unless save_a nonzero, in which case U output in 'u' 22 | double *w ; // Unsorted ncols vector of singular values 23 | double *v ; // Ncols by ncols output of 'v' 24 | double *b ; // Nrows right-hand-side for backsub 25 | 26 | 27 | private: 28 | 29 | void bidiag ( double *matrix ) ; 30 | double bid1 ( int col , double *matrix , double scale ) ; 31 | double bid2 ( int col , double *matrix , double scale ) ; 32 | void right ( double *matrix ) ; 33 | void left ( double *matrix ) ; 34 | void cancel ( int low , int high , double *matrix ) ; 35 | void qr ( int low , int high , double *matrix ) ; 36 | void qr_mrot ( int col , double sine , double cosine , double *matrix ) ; 37 | void qr_vrot ( int col , double sine , double cosine ) ; 38 | 39 | int rows ; // Nrows preserved here 40 | int cols ; // And ncols 41 | double *work ; // Scratch vector ncols long 42 | double norm ; // Norm of 'a' matrix 43 | } ; 44 | #endif 45 | 46 | class LinReg { 47 | 48 | public: 49 | 50 | LinReg ( int ncase , int ncol ) ; 51 | ~LinReg () ; 52 | void reset () ; 53 | void add_case ( double *newcase ) ; 54 | void solve ( double eps , double *rhs , double *b ) ; 55 | 56 | 57 | private: 58 | SingularValueDecomp *svd ; 59 | int ncases ; // Number of cases 60 | int ncols ; // Number of columns 61 | int nrows ; // How many times has add_case() been called? 62 | int decomp ; // Has the decomposition been done yet? 63 | } ; 64 | -------------------------------------------------------------------------------- /LOGISTIC.CPP: -------------------------------------------------------------------------------- 1 | /******************************************************************************/ 2 | /* */ 3 | /* LOGISTIC - Logistic regression */ 4 | /* */ 5 | /* Unlike most implementations (which use iterative gradient ascent) */ 6 | /* this uses simulated annealing. This is considerably slower than */ 7 | /* the traditional method, but more likely to find the global optimum. */ 8 | /* It is also more numerically stable. */ 9 | /* */ 10 | /* To use this class: */ 11 | /* 1) Construct a new instance of the class */ 12 | /* 2) Call add_case() exactly ncases times, each time providing the */ 13 | /* nin+1 vector of inputs and output. */ 14 | /* 3) Call train() */ 15 | /* 4) Call predict() as many times as desired */ 16 | /* 5) Optionally, call reset() and go to step 2 */ 17 | /* */ 18 | /* This does not include any checks for insufficient memory. */ 19 | /* It also assumes that the user calls add_case exactly ncases times */ 20 | /* and does not check for failure to do so. 
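   The fitted linear combination b0 + b'x is the log odds, log(p/(1-p)), so the
   probability itself is p = 1 / (1 + exp(-(b0 + b'x))).  Note that predict()
   returns the log odds, not p.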
*/ 21 | /* */ 22 | /******************************************************************************/ 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include "logistic.h" 29 | #include "minimize.h" 30 | 31 | double normal () ; 32 | 33 | static double max_exp = log ( 1.e190 ) ; 34 | inline double safe_exp ( double x ) 35 | { 36 | if (x <= max_exp) 37 | return exp ( x ) ; 38 | 39 | return 1.e190 ; 40 | } 41 | 42 | /* 43 | -------------------------------------------------------------------------------- 44 | 45 | Constructor, destructor, reset(), add_case() 46 | 47 | -------------------------------------------------------------------------------- 48 | */ 49 | 50 | Logistic::Logistic ( int ncase , int nin ) 51 | { 52 | ncases = ncase ; 53 | ninputs = nin ; 54 | coefs = (double *) malloc ( (ninputs +1) * sizeof(double) ) ; 55 | tset = (double *) malloc ( ncases * (ninputs + 1) * sizeof(double) ) ; 56 | reset () ; 57 | } 58 | 59 | 60 | Logistic::~Logistic () 61 | { 62 | if (tset != NULL) 63 | free ( tset ) ; 64 | if (coefs != NULL) 65 | free ( coefs ) ; 66 | } 67 | 68 | /* 69 | This discards any existing training data. 70 | It does not need to be called after construction, but it must 71 | be called if the user wants to reuse the Logistic object for a new dataset. 72 | */ 73 | 74 | void Logistic::reset () 75 | { 76 | nrows = 0 ; // No rows (via add_case()) yet present 77 | trained = 0 ; // Training not done yet 78 | } 79 | 80 | /* 81 | Build the training set one case at a time. 82 | The user must call this member EXACTLY ncases times after construction 83 | or a call to reset(), and before a call to train(). 84 | */ 85 | 86 | void Logistic::add_case ( double *newcase ) 87 | { 88 | if (nrows >= ncases) // Careful user never lets this happen 89 | return ; // But cheap insurance 90 | 91 | #if 0 92 | printf ( "\n---> " ) ; 93 | for (int i=0 ; i<=ninputs ; i++) 94 | printf ( " %.2lf", newcase[i] ) ; 95 | #endif 96 | memcpy ( tset + nrows * (ninputs + 1) , newcase , 97 | (ninputs + 1) * sizeof(double) ) ; 98 | ++nrows ; 99 | } 100 | 101 | 102 | /* 103 | -------------------------------------------------------------------------------- 104 | 105 | predict() - Given an input vector, compute output using trained model 106 | The output is the linear combination, the log odds ratio 107 | 108 | -------------------------------------------------------------------------------- 109 | */ 110 | 111 | void Logistic::predict ( 112 | double *input , // Input vector 113 | double *output // Returned output 114 | ) 115 | { 116 | int i ; 117 | 118 | *output = coefs[ninputs] ; // Constant term 119 | for (i=0 ; i best_y)) { 196 | first = 0 ; 197 | best_y = y ; 198 | memcpy ( best_wts , test_wts , ninputs * sizeof(double) ) ; 199 | } 200 | 201 | } // For inner loop iterations 202 | memcpy ( center , best_wts , ninputs * sizeof(double) ) ; 203 | std *= 0.7 ; 204 | } // For outer loop iterations 205 | 206 | logit_crit ( best_wts ) ; // Needed to set coefs correctly 207 | trained = 1 ; // Training complete 208 | 209 | #if 0 210 | printf ( "\n" ) ; 211 | for (i=0 ; i<=ninputs ; i++) 212 | printf ( " %.3lf", coefs[i] ) ; 213 | // getch () ; 214 | #endif 215 | 216 | 217 | free ( test_wts ) ; 218 | free ( best_wts ) ; 219 | free ( center ) ; 220 | } 221 | 222 | static double logit_crit ( double *x ) 223 | { 224 | int i ; 225 | double x1, y1, x2, y2, x3, y3 ; 226 | 227 | for (i=0 ; ininputs ; i++) 228 | local_logistic->coefs[i] = safe_exp ( x[i] ) ; 229 | 230 | glob_min ( -20.0 , 20.0 , 5 , 0 , -1.e160 , 
logit_unicrit , 231 | &x1 , &y1 , &x2 , &y2 , &x3 , &y3 ) ; 232 | 233 | y2 = brentmin ( 50 , -1.e160 , 1.e-10 , 1.e-10 , logit_unicrit , 234 | &x1 , &x2 , &x3 , y2 ) ; 235 | 236 | local_logistic->coefs[local_logistic->ninputs] = x2 ; 237 | return -y2 ; 238 | } 239 | 240 | static double logit_unicrit ( double t ) 241 | { 242 | double penalty ; 243 | 244 | penalty = 0.0 ; 245 | if (fabs ( t ) > 20.0) // Rare pathological event 246 | penalty = 1.e10 * (fabs ( t ) - 20.0) ; 247 | 248 | local_logistic->coefs[local_logistic->ninputs] = t ; 249 | return penalty - local_logistic->execute () ; 250 | } 251 | -------------------------------------------------------------------------------- /LOGISTIC.H: -------------------------------------------------------------------------------- 1 | class Logistic { 2 | 3 | public: 4 | 5 | Logistic ( int ncase , int nin ) ; 6 | ~Logistic () ; 7 | void reset () ; 8 | void add_case ( double *newcase ) ; 9 | void train () ; 10 | void predict ( double *input , double *output ) ; 11 | 12 | double execute () ; 13 | 14 | int ncases ; // Number of cases 15 | int ninputs ; // Number of inputs 16 | int nrows ; // How many times has add_case() been called? 17 | int trained ; // Has it been trained yet? 18 | double *tset ; // Ncases by (ninputs+1) matrix of training data 19 | double *coefs ; // Trained coefficient vector ninputs+1 long 20 | } ; 21 | 22 |  -------------------------------------------------------------------------------- /MINIMIZE.H: -------------------------------------------------------------------------------- 1 | extern int glob_min ( double low , double high , int npts , int log_space , 2 | double critlim , double (*criter) (double) , 3 | double *x1, double *y1 , double *x2, double *y2 , double *x3, double *y3 ) ; 4 | 5 | extern double brentmin ( int itmax , double critlim , double eps , 6 | double tol , double (*criter) (double) , 7 | double *x1 , double *x2 , double *x3 , double y ) ; 8 | 9 | extern double powell ( int maxits , double critlim , double tol , 10 | double (*criter) ( double * ) , int n , double *x , double ystart , 11 | double *base , double *p0 , double *direc ) ; 12 | -------------------------------------------------------------------------------- /MI_ONLY.CPP: -------------------------------------------------------------------------------- 1 | /******************************************************************************/ 2 | /* */ 3 | /* MI_ONLY - Mutual information ONLY for continuous predicted and predictors */ 4 | /* */ 5 | /******************************************************************************/ 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "..\info.h" 15 | 16 | /* 17 | These are defined in MEM.CPP 18 | */ 19 | 20 | extern int mem_keep_log ; // Keep a log file? 
21 | extern char mem_file_name[] ; // Log file name 22 | extern int mem_max_used ; // Maximum memory ever in use 23 | 24 | int main ( 25 | int argc , // Number of command line arguments (includes prog name) 26 | char *argv[] // Arguments (prog name is argv[0]) 27 | ) 28 | 29 | { 30 | int i, j, k, nvars, ncases, irep, nreps, ivar, nties, ties ; 31 | int n_indep_vars, idep, icand, *index, *mcpt_max_counts, *mcpt_same_counts, *mcpt_solo_counts ; 32 | double *data, *work, dtemp, *save_info, criterion, *crits ; 33 | char filename[256], **names, depname[256] ; 34 | FILE *fp ; 35 | MutualInformationAdaptive *mi_adapt ; 36 | 37 | /* 38 | Process command line parameters 39 | */ 40 | 41 | #if 1 42 | if (argc != 5) { 43 | printf ( "\nUsage: MI_ONLY datafile n_indep depname nreps" ) ; 44 | printf ( "\n datafile - name of the text file containing the data" ) ; 45 | printf ( "\n The first line is variable names" ) ; 46 | printf ( "\n Subsequent lines are the data." ) ; 47 | printf ( "\n Delimiters can be space, comma, or tab" ) ; 48 | printf ( "\n n_indep - Number of independent vars, starting with the first" ) ; 49 | printf ( "\n depname - Name of the 'dependent' variable" ) ; 50 | printf ( "\n It must be AFTER the first n_indep variables" ) ; 51 | printf ( "\n nreps - Number of Monte-Carlo permutations, including unpermuted" ) ; 52 | exit ( 1 ) ; 53 | } 54 | 55 | strcpy ( filename , argv[1] ) ; 56 | n_indep_vars = atoi ( argv[2] ) ; 57 | strcpy ( depname , argv[3] ) ; 58 | nreps = atoi ( argv[4] ) ; 59 | #else 60 | strcpy ( filename , "..\\SYNTH.TXT" ) ; 61 | n_indep_vars = 7 ; 62 | strcpy ( depname , "SUM1234" ) ; 63 | nreps = 100 ; 64 | #endif 65 | 66 | _strupr ( depname ) ; 67 | 68 | /* 69 | These are used by MEM.CPP for runtime memory validation 70 | */ 71 | 72 | _fullpath ( mem_file_name , "MEM.LOG" , 256 ) ; 73 | fp = fopen ( mem_file_name , "wt" ) ; 74 | if (fp == NULL) { // Should never happen 75 | printf ( "\nCannot open MEM.LOG file for writing!" ) ; 76 | return EXIT_FAILURE ; 77 | } 78 | fclose ( fp ) ; 79 | mem_keep_log = 0 ; // Change this to 1 to keep a memory use log (slows execution!) 80 | mem_max_used = 0 ; 81 | 82 | /* 83 | Open the text file to which results will be written 84 | */ 85 | 86 | fp = fopen ( "MI_ONLY.LOG" , "wt" ) ; 87 | if (fp == NULL) { // Should never happen 88 | printf ( "\nCannot open MI_ONLY.LOG file for writing!" ) ; 89 | return EXIT_FAILURE ; 90 | } 91 | 92 | /* 93 | Read the file and locate the index of the dependent variable 94 | */ 95 | 96 | if (readfile ( filename , &nvars , &names , &ncases , &data )) 97 | return EXIT_FAILURE ; 98 | 99 | for (idep=0 ; idep n_indep_vars && ivar != idep) 129 | continue ; // Check only the variables selected by the user 130 | for (i=0 ; i 0.05) { 139 | ++ties ; 140 | fprintf ( fp , "\nWARNING... 
%s has %.2lf percent ties!", 141 | names[ivar], 100.0 * nties / (double) ncases ) ; 142 | } 143 | } // For all variables 144 | if (ties) { 145 | fprintf ( fp , "\nThe presence of ties will seriously degrade" ) ; 146 | fprintf ( fp , "\nperformance of the adaptive partitioning algorithm\n\n" ) ; 147 | } 148 | 149 | /* 150 | Allocate scratch memory and create the MutualInformation object using the 151 | dependent variable 152 | 153 | crits - Mutual information criterion 154 | index - Indices that sort the criterion 155 | save_info - Ditto, this is univariate information, to be sorted 156 | mi_adapt - The MutualInformation object, constructed with the 'dependent' variable 157 | */ 158 | 159 | MEMTEXT ( "MI_ONLY work allocs plus MutualInformation" ) ; 160 | crits = (double *) MALLOC ( n_indep_vars * sizeof(double) ) ; 161 | assert ( crits != NULL ) ; 162 | index = (int *) MALLOC ( n_indep_vars * sizeof(int) ) ; 163 | assert ( index != NULL ) ; 164 | mcpt_max_counts = (int *) MALLOC ( n_indep_vars * sizeof(int) ) ; 165 | assert ( mcpt_max_counts != NULL ) ; 166 | mcpt_same_counts = (int *) MALLOC ( n_indep_vars * sizeof(int) ) ; 167 | assert ( mcpt_same_counts != NULL ) ; 168 | mcpt_solo_counts = (int *) MALLOC ( n_indep_vars * sizeof(int) ) ; 169 | assert ( mcpt_solo_counts != NULL ) ; 170 | save_info = (double *) MALLOC ( n_indep_vars * sizeof(double) ) ; 171 | assert ( save_info != NULL ) ; 172 | 173 | for (irep=0 ; irep0) 179 | 180 | if (irep) { // If doing permuted runs, shuffle 181 | i = ncases ; // Number remaining to be shuffled 182 | while (i > 1) { // While at least 2 left to shuffle 183 | j = (int) (unifrand () * i) ; 184 | if (j >= i) 185 | j = i - 1 ; 186 | dtemp = work[--i] ; 187 | work[i] = work[j] ; 188 | work[j] = dtemp ; 189 | } 190 | } 191 | 192 | // Here we use a tiny split theshold (instead of the usual 6.0) so that it picks up 193 | // small amounts of mutual information (perhaps including noise). 194 | // If we used 6.0, nearly all permutations of any reasonably sized dataset 195 | // would have a computed mutual information of zero. It's safe picking up 196 | // some noise because the permutation test will account for this. 197 | 198 | mi_adapt = new MutualInformationAdaptive ( ncases , work , 1 , 0.1 ) ; // Deliberately tiny for low information 199 | assert ( mi_adapt != NULL ) ; 200 | 201 | /* 202 | Compute and save the mutual information for the dependent variable 203 | with each individual independent variable candidate. 
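    The criterion is the adaptive-partitioning estimate of the mutual information
       I(X;Y) = SUM over x,y of  p(x,y) * log( p(x,y) / (p(x) * p(y)) )
    between the (possibly permuted) dependent variable and each candidate.  The
    permuted replications (irep > 0) shuffle the dependent variable, destroying
    any true association, so the counts accumulated in the loops below give
    Monte-Carlo permutation p-values for each candidate's information.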
204 | */ 205 | 206 | for (icand=0 ; icandmut_inf ( work , 1 ) ; 211 | 212 | save_info[icand] = criterion ; // We will sort this when all candidates are done 213 | 214 | if (irep == 0) { // If doing original (unpermuted), save criterion 215 | index[icand] = icand ; // Will need original indices when criteria are sorted 216 | crits[icand] = criterion ; 217 | mcpt_max_counts[icand] = mcpt_same_counts[icand] = mcpt_solo_counts[icand] = 1 ; // This is >= itself so count it now 218 | } 219 | else { 220 | if (criterion >= crits[icand]) 221 | ++mcpt_solo_counts[icand] ; 222 | } 223 | } // Initial list of all candidates 224 | 225 | delete mi_adapt ; 226 | mi_adapt = NULL ; 227 | 228 | if (irep == 0) // Find the indices that sort the candidates per criterion 229 | qsortdsi ( 0 , n_indep_vars-1 , save_info , index ) ; 230 | 231 | else { 232 | qsortd ( 0 , n_indep_vars-1 , save_info ) ; 233 | for (icand=0 ; icand= crits[index[icand]]) 235 | ++mcpt_same_counts[index[icand]] ; 236 | if (save_info[n_indep_vars-1] >= crits[index[icand]]) // Valid only for largest 237 | ++mcpt_max_counts[index[icand]] ; 238 | } 239 | } 240 | 241 | } // For all reps 242 | 243 | fprintf ( fp , "\nAdaptive partitioning mutual information of %s", depname); 244 | 245 | fprintf ( fp , "\n" ) ; 246 | fprintf ( fp , "\n" ) ; 247 | fprintf ( fp , "\nPredictors, in order of decreasing mutual information" ) ; 248 | fprintf ( fp , "\n" ) ; 249 | fprintf ( fp , "\n Variable Information Solo pval Min pval Max pval" ) ; 250 | 251 | for (icand=0 ; icand