├── 9781484233146.png ├── AN_COHERENCE.TXT ├── AN_CVARS.TXT ├── AN_EIGEN.TXT ├── AN_FACTOR.TXT ├── AN_ROTATE.TXT ├── BILINEAR.CPP ├── BRENTMIN.CPP ├── Contributing.md ├── DATAMINE_Manual.pdf ├── DENSITY_PLOTS.TXT ├── DataMine.exe ├── EVEC_RS.CPP ├── FREL.TXT ├── GLOB_MIN.CPP ├── HORNS_METHOD.TXT ├── INTEGRAT.CPP ├── INVERT.CPP ├── LICENSE.txt ├── MI_BIN.CPP ├── MUTINF_B.CPP ├── MUTINF_C.CPP ├── MUTINF_D.CPP ├── PART.CPP ├── PARZDENS.CPP ├── POWELL.CPP ├── QSORTD.CPP ├── RANDOM.CPP ├── README.md ├── SCREEN_BIVAR.CPP ├── SCREEN_RR.CPP ├── SCREEN_UNIVAR.CPP ├── SPEARMAN.CPP ├── SPLINE.CPP ├── STATS.CPP ├── SVDCMP.CPP ├── TEST_CON.CPP ├── TEST_DIS.CPP ├── TRANS_ENT.CPP └── errata.md /9781484233146.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/data-mining-algorithms-cpp/bf18dda7f2361534423c56076fd60999b1becd86/9781484233146.png -------------------------------------------------------------------------------- /AN_COHERENCE.TXT: -------------------------------------------------------------------------------- 1 | /******************************************************************************/ 2 | /* */ 3 | /* AN_COHERENCE - Coherence analysis and plot of values */ 4 | /* */ 5 | /******************************************************************************/ 6 | 7 | class AnalyzeCoherenceChild { 8 | 9 | public: 10 | AnalyzeCoherenceChild ( int npreds , int *preds , int n_dim , int nonpar ) ; 11 | ~AnalyzeCoherenceChild () ; 12 | 13 | int lookback ; 14 | int npred ; 15 | int preds[MAX_VARS] ; 16 | int nonpar ; 17 | int n ; // Length of displayed series of coherences 18 | double *val ; // Coherences for display 19 | // Work areas 20 | double *mean ; 21 | double *covar ; 22 | } ; 23 | 24 | 25 | /* 26 | -------------------------------------------------------------------------------- 27 | 28 | Constructor and Destructor 29 | 30 | -------------------------------------------------------------------------------- 31 | */ 32 | 33 | AnalyzeCoherenceChild::AnalyzeCoherenceChild ( int np , int *p , int lb , int nonp ) 34 | { 35 | int icase, i, j, k ; 36 | double *dptr, *means, *evals, *evects, *workv, minval, maxval, meanval ; 37 | double sum, total, diff, diff2, *nonpar_work, factor ; 38 | char msg[512], line[1024] ; 39 | FILE *fp ; 40 | 41 | MEMTEXT ( "AN_COHERENCE AnalyzeCoherenceChild constructor" ) ; 42 | npred = np ; 43 | lookback = lb ; 44 | nonpar = nonp ; 45 | for (i=0 ; i maxval) 178 | maxval = val[icase-lookback+1] ; 179 | if (val[icase-lookback+1] < minval) 180 | minval = val[icase-lookback+1] ; 181 | meanval += val[icase-lookback+1] ; 182 | 183 | } // For all cases 184 | 185 | meanval /= n_cases - lookback + 1 ; 186 | 187 | 188 | /* 189 | Print summary 190 | */ 191 | 192 | audit ( "" ) ; 193 | sprintf_s ( msg, "Mean coherence = %.5lf", meanval ) ; 194 | audit ( msg ) ; 195 | sprintf_s ( msg, "Min = %.5lf", minval ) ; 196 | audit ( msg ) ; 197 | sprintf_s ( msg, "Max = %.5lf", maxval ) ; 198 | audit ( msg ) ; 199 | audit ( "" ) ; 200 | sprintf_s ( msg, "Coherence values have been written to %s", coherence_log ) ; 201 | audit ( msg ) ; 202 | 203 | MEMTEXT ( "AN_COHERENCE: free means, covar, evals, evects, workv (,nonpar_work)" ) ; 204 | free ( means ) ; 205 | free ( covar ) ; 206 | free ( evals ) ; 207 | free ( evects ) ; 208 | free ( workv ) ; 209 | if (nonpar_work != NULL) 210 | free ( nonpar_work ) ; 211 | } 212 | 213 | AnalyzeCoherenceChild::~AnalyzeCoherenceChild () 214 | { 215 | MEMTEXT ( "AN_COHERENCE.CPP AnalyzeCoherenceChild destructor" ) ; 216 | if (val != NULL) 217 | free ( val ) ; 218 | } 219 | -------------------------------------------------------------------------------- /AN_CVARS.TXT: -------------------------------------------------------------------------------- 1 | /******************************************************************************/ 2 | /* */ 3 | /* AN_CVARS - AnalyzeClusterVars operations */ 4 | /* */ 5 | /******************************************************************************/ 6 | 7 | int an_cvars ( 8 | int n_dim , // Number of initial dimensions to consider 9 | int ngrp_to_print , // Start printing when n of groups drops this low 10 | int type // Centroid versus leader method 11 | ) 12 | { 13 | int i, j, nvars, icand1, icand2, ibest1, ibest2, n_groups ; 14 | int *group_id, *n_in_group ; 15 | double x, dotprod, length, best_dotprod, *centroids ; 16 | char msg[256], msg2[256] ; 17 | 18 | n_groups = npred ; // Number of groups; initially, every variable is its own group 19 | nvars = npred ; // This name just makes things more clear; no other reason 20 | 21 | /* 22 | Allocate memory 23 | */ 24 | 25 | group_id = (int *) malloc ( nvars * sizeof(int) ) ; 26 | n_in_group = (int *) malloc ( nvars * sizeof(int) ) ; 27 | centroids = (double *) malloc ( nvars * n_dim * sizeof(double) ) ; 28 | 29 | /* 30 | Initialize; For each variable, make the length of the vector one 31 | */ 32 | 33 | for (i=0 ; i 1) { 68 | best_dotprod = -1.0 ; 69 | 70 | // Try every pair of groups (icand1 and icand2) 71 | for (icand1=0 ; icand1 best_dotprod) { // Keep track of the pair with best criterion 83 | best_dotprod = dotprod ; 84 | ibest1 = icand1 ; 85 | ibest2 = icand2 ; 86 | } 87 | 88 | } // For icand2 89 | } // For icand1 90 | 91 | // We just found the closest pair. Merge larger index into smaller. 92 | 93 | if (best_dotprod > 1.0) // Should never happen, but handle tiny fpt errors 94 | best_dotprod = 1.0 ; 95 | 96 | sprintf_s ( msg , "Merged groups %d and %d separated by %.2lf degrees; now have %d groups", 97 | ibest1+1, ibest2+1, acos(best_dotprod)*180.0/PI, n_groups-1 ) ; 98 | audit ( msg ) ; 99 | 100 | if (type) { // Did the user request centroid method? 101 | // Recompute the (approximate) centroid of the absorbing (smaller id) group 102 | length = 0.0 ; 103 | for (j=0 ; j ibest2) // Groups above absorbed group 122 | --group_id[i] ; // Now have to fill in the hole below them 123 | } 124 | 125 | for (i=ibest2+1 ; i 1) { 138 | audit ( "Group membership..." ) ; 139 | for (i=0 ; i 1) 152 | 153 | FINISH: 154 | free ( group_id ) ; 155 | free ( n_in_group ) ; 156 | free ( centroids ) ; 157 | 158 | return 0 ; 159 | } -------------------------------------------------------------------------------- /AN_EIGEN.TXT: -------------------------------------------------------------------------------- 1 | class AnalyzeEigenChild { 2 | 3 | public: 4 | AnalyzeEigenChild ( int npreds , int *preds , int nonpar ) ; 5 | ~AnalyzeEigenChild () ; 6 | 7 | int npred ; 8 | int preds[MAX_VARS] ; 9 | int nonpar ; 10 | int n ; // Size of matrix (number of eigenvalues) 11 | double *val ; // Eigenvalues for display 12 | } ; 13 | 14 | 15 | /* 16 | Allocate memory 17 | */ 18 | 19 | cumulative = (double *) malloc ( npred * sizeof(double) ) ; 20 | covar = (double *) malloc ( npred * npred * sizeof(double) ) ; 21 | evals = (double *) malloc ( npred * sizeof(double) ) ; 22 | structure = (double *) malloc ( npred * npred * sizeof(double) ) ; 23 | means = (double *) malloc ( npred * sizeof(double) ) ; 24 | stddev = (double *) malloc ( npred * sizeof(double) ) ; 25 | 26 | /* 27 | Compute means (means) and standard deviations (stddev) 28 | */ 29 | 30 | for (i=0 ; i 1.0) 109 | structure[i*npred+j] = 1.0 ; 110 | } 111 | } 112 | 113 | free ( covar ) ; 114 | free ( means ) ; 115 | free ( stddev ) ; 116 | } 117 | -------------------------------------------------------------------------------- /AN_ROTATE.TXT: -------------------------------------------------------------------------------- 1 | /* 2 | Compute (square root) communalities 3 | This assumes that structure contains all npred columns (factors) 4 | and we are rotating the first n_kept of them. 5 | */ 6 | 7 | for (i=0 ; i 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | class Bilinear { 14 | 15 | public: 16 | Bilinear ( int nxin , double *xin , int nyin , double *yin , double *zin , 17 | int extra ) ; 18 | ~Bilinear () ; 19 | double evaluate ( double x , double y ) ; 20 | 21 | private: 22 | int quadratic ; 23 | int nx ; 24 | int ny ; 25 | double *x ; 26 | double *y ; 27 | double *z ; 28 | } ; 29 | 30 | 31 | Bilinear::Bilinear ( // Uses input points (x,y,z) where z=f(x,y) 32 | int nxin , // Number of x points 33 | double *xin , // They are here, sorted ascending 34 | int nyin , // Number of y points 35 | double *yin , // They are here, sorted ascending 36 | double *zin , // Corresponding function values, y changing fastest 37 | int extra // If nonzero, use 3x3 block with quadratic interpolation 38 | ) 39 | { 40 | 41 | quadratic = extra ; 42 | nx = nxin ; 43 | ny = nyin ; 44 | x = (double *) malloc ( nx * sizeof(double) ) ; 45 | y = (double *) malloc ( ny * sizeof(double) ) ; 46 | z = (double *) malloc ( nx * ny * sizeof(double) ) ; 47 | assert ( x != NULL ) ; 48 | assert ( y != NULL ) ; 49 | assert ( z != NULL ) ; 50 | 51 | memcpy ( x , xin , nx * sizeof(double) ) ; 52 | memcpy ( y , yin , ny * sizeof(double) ) ; 53 | memcpy ( z , zin , nx * ny * sizeof(double) ) ; 54 | } 55 | 56 | Bilinear::~Bilinear () 57 | { 58 | free ( x ) ; 59 | free ( y ) ; 60 | free ( z ) ; 61 | } 62 | 63 | double Bilinear::evaluate ( double xpt , double ypt ) 64 | { 65 | int k, kxlo, kxmid, kxhi, kylo, kymid, kyhi ; 66 | double t, u, val, clo, cmid, chi, zlo, zmid, zhi ; 67 | double dlo, dmid, dhi, lo_mid, lo_hi, mid_hi ; 68 | 69 | /* 70 | Bound outlying inputs 71 | */ 72 | 73 | if (xpt < x[0]) 74 | xpt = x[0] ; 75 | if (xpt > x[nx-1]) 76 | xpt = x[nx-1] ; 77 | if (ypt < y[0]) 78 | ypt = y[0] ; 79 | if (ypt > y[ny-1]) 80 | ypt = y[ny-1] ; 81 | 82 | /* 83 | Find the pair of x coordinates that bound the input 84 | */ 85 | 86 | kxlo = 0 ; 87 | kxhi = nx - 1 ; 88 | while (kxhi > kxlo+1) { 89 | k = (kxhi + kxlo) / 2 ; 90 | if (xpt < x[k]) 91 | kxhi = k ; 92 | else 93 | kxlo = k ; 94 | } 95 | 96 | /* 97 | Find the pair of y coordinates that bound the input 98 | */ 99 | 100 | kylo = 0 ; 101 | kyhi = ny - 1 ; 102 | while (kyhi > kylo+1) { 103 | k = (kyhi + kylo) / 2 ; 104 | if (ypt < y[k]) 105 | kyhi = k ; 106 | else 107 | kylo = k ; 108 | } 109 | 110 | /* 111 | 3x3 with quadratic interpolation? 112 | */ 113 | 114 | if (quadratic) { 115 | // Choose which way to go for the third x point 116 | if (kxlo == 0) { 117 | kxmid = kxhi ; 118 | ++kxhi ; 119 | } 120 | else if (kxhi == nx-1) { 121 | kxmid = kxlo ; 122 | --kxlo ; 123 | } 124 | else if (xpt-x[kxlo] < x[kxhi]-xpt) { 125 | kxmid = kxlo ; 126 | --kxlo ; 127 | } 128 | else { 129 | kxmid = kxhi ; 130 | ++kxhi ; 131 | } 132 | 133 | // Choose which way to go for the third y point 134 | if (kylo == 0) { 135 | kymid = kyhi ; 136 | ++kyhi ; 137 | } 138 | else if (kyhi == ny-1) { 139 | kymid = kylo ; 140 | --kylo ; 141 | } 142 | else if (ypt-y[kylo] < y[kyhi]-ypt) { 143 | kymid = kylo ; 144 | --kylo ; 145 | } 146 | else { 147 | kymid = kyhi ; 148 | ++kyhi ; 149 | } 150 | 151 | dlo = xpt - x[kxlo] ; 152 | dmid = xpt - x[kxmid] ; 153 | dhi = xpt - x[kxhi] ; 154 | lo_mid = x[kxlo] - x[kxmid] ; 155 | lo_hi = x[kxlo] - x[kxhi] ; 156 | mid_hi = x[kxmid] - x[kxhi] ; 157 | clo = dmid * dhi / (lo_mid * lo_hi) ; 158 | cmid = dlo * dhi / (-lo_mid * mid_hi) ; 159 | chi = dlo * dmid / (lo_hi * mid_hi) ; 160 | 161 | zlo = clo * z[kxlo*ny+kylo] + cmid * z[kxmid*ny+kylo] + chi * z[kxhi*ny+kylo] ; 162 | zmid = clo * z[kxlo*ny+kymid] + cmid * z[kxmid*ny+kymid] + chi * z[kxhi*ny+kymid] ; 163 | zhi = clo * z[kxlo*ny+kyhi] + cmid * z[kxmid*ny+kyhi] + chi * z[kxhi*ny+kyhi] ; 164 | 165 | dlo = ypt - y[kylo] ; 166 | dmid = ypt - y[kymid] ; 167 | dhi = ypt - y[kyhi] ; 168 | lo_mid = y[kylo] - y[kymid] ; 169 | lo_hi = y[kylo] - y[kyhi] ; 170 | mid_hi = y[kymid] - y[kyhi] ; 171 | clo = dmid * dhi / (lo_mid * lo_hi) ; 172 | cmid = dlo * dhi / (-lo_mid * mid_hi) ; 173 | chi = dlo * dmid / (lo_hi * mid_hi) ; 174 | 175 | return clo * zlo + cmid * zmid + chi * zhi ; 176 | } // If quadratic 177 | 178 | /* 179 | Ordinary 2x2 bilinear 180 | */ 181 | 182 | else { 183 | t = (xpt - x[kxlo]) / (x[kxhi] - x[kxlo]) ; 184 | u = (ypt - y[kylo]) / (y[kyhi] - y[kylo]) ; 185 | 186 | val = (1.0 - t) * (1.0 - u) * z[kxlo*ny+kylo] ; 187 | val += t * (1.0 - u) * z[kxhi*ny+kylo] ; 188 | val += t * u * z[kxhi*ny+kyhi] ; 189 | val += (1.0 - t) * u * z[kxlo*ny+kyhi] ; 190 | return val ; 191 | } 192 | } 193 | -------------------------------------------------------------------------------- /BRENTMIN.CPP: -------------------------------------------------------------------------------- 1 | /******************************************************************************/ 2 | /* */ 3 | /* BRENTMIN - Use Brent's method to find a local minimum of a */ 4 | /* univariate function. */ 5 | /* */ 6 | /* This is given three points such that the center has lesser function */ 7 | /* value than its neighbors. It iteratively refines the interval. */ 8 | /* If the criterion function drops to critlim or smaller, execution will */ 9 | /* terminate. */ 10 | /* */ 11 | /******************************************************************************/ 12 | 13 | #include 14 | 15 | int user_pressed_escape() ; 16 | 17 | int brentmin ( 18 | int itmax , // Iteration limit 19 | double critlim , // Quit if crit drops this low 20 | double eps , // Function convergence tolerance 21 | double tol , // X convergence tolerance 22 | int (*criter) (double , double *) , // Criterion function 23 | double *xa , // Lower X value, input and output 24 | double *xb , // Middle (best), input and output 25 | double *xc , // And upper, input and output 26 | double *y // Function value at xb, input and output 27 | ) 28 | { 29 | int iter, user_quit ; 30 | double x0, x1, x2, y0, y1, y2, xleft, xmid, xright, movement, trial ; 31 | double small_step, small_dist, numer, denom, temp1, temp2 ; 32 | double testdist, this_x, this_y ; 33 | 34 | /* 35 | Initialize 36 | */ 37 | 38 | x0 = x1 = x2 = *xb ; 39 | xleft = *xa ; 40 | xright = *xc ; 41 | 42 | y0 = y1 = y2 = *y ; 43 | 44 | /* 45 | We want a golden-section search the first iteration. Force this by setting 46 | movement equal to zero. 47 | */ 48 | 49 | movement = trial = 0.0 ; 50 | user_quit = 0 ; 51 | 52 | /* 53 | Main loop. 54 | */ 55 | 56 | for (iter=0 ; iter= 4) && ((fabs(y2 - y0) / (fabs(y0) + 1.0)) < eps)) 85 | break ; 86 | 87 | if (fabs ( movement ) > small_step) { // Try parabolic only if moving 88 | temp1 = (x0 - x2) * (y0 - y1) ; 89 | temp2 = (x0 - x1) * (y0 - y2) ; 90 | numer = (x0 - x1) * temp2 - (x0 - x2) * temp1 ; 91 | denom = 2. * (temp1 - temp2) ; 92 | testdist = movement ; // Intervals must get smaller 93 | movement = trial ; 94 | if (fabs(denom) > 1.e-40) 95 | trial = numer / denom ; // Parabolic estimate of minimum 96 | else 97 | trial = 1.e40 ; 98 | 99 | temp1 = trial + x0 ; 100 | if ((2.0 * fabs ( trial ) < fabs ( testdist ))// If shrinking 101 | && (temp1 > xleft) && (temp1 < xright)) { // And safely in bounds 102 | this_x = temp1 ; // Use parabolic estimate 103 | if ((this_x - xleft < small_dist) || // Cannot get too close 104 | (xright - this_x < small_dist)) // to the endpoints 105 | trial = (x0 < xmid) ? small_step : -small_step ; 106 | } 107 | else { // Punt via golden section because cannot use parabolic 108 | movement = (xmid > x0) ? xright - x0 : xleft - x0 ; 109 | trial = .3819660 * movement ; 110 | } 111 | } 112 | else { // Must use golden section due to insufficient movement 113 | movement = (xmid > x0) ? xright - x0 : xleft - x0 ; 114 | trial = .3819660 * movement ; 115 | } 116 | 117 | if (fabs (trial) >= small_step) // Make sure we move a good distance 118 | this_x = x0 + trial ; 119 | else 120 | this_x = (trial > 0.0) ? x0 + small_step : x0 - small_step ; 121 | 122 | /* 123 | Evaluate the function here. 124 | */ 125 | 126 | user_quit = criter ( this_x , &this_y ) ; 127 | if (user_quit) 128 | break ; 129 | 130 | /* 131 | Insert this new point in the correct position in the 'best' hierarchy 132 | */ 133 | 134 | if (this_y <= y0) { // Improvement 135 | if (this_x < x0) 136 | xright = x0 ; 137 | else 138 | xleft = x0 ; 139 | x2 = x1 ; 140 | x1 = x0 ; 141 | x0 = this_x ; 142 | y2 = y1 ; 143 | y1 = y0 ; 144 | y0 = this_y ; 145 | } 146 | 147 | else { // No improvement 148 | if (this_x >= x0) 149 | xright = this_x ; 150 | else 151 | xleft = this_x ; 152 | 153 | if ((this_y <= y1) || (x1 == x0)) { 154 | x2 = x1 ; 155 | x1 = this_x ; 156 | y2 = y1 ; 157 | y1 = this_y ; 158 | } 159 | else if ((this_y <= y2) || (x2 == x0) || (x2 == x1)) { 160 | x2 = this_x ; 161 | y2 = this_y ; 162 | } 163 | } 164 | } 165 | 166 | *xa = xleft ; 167 | *xb = x0 ; 168 | *xc = xright ; 169 | *y = y0 ; 170 | 171 | return user_quit ; 172 | } 173 | -------------------------------------------------------------------------------- /Contributing.md: -------------------------------------------------------------------------------- 1 | # Contributing to Apress Source Code 2 | 3 | Copyright for Apress source code belongs to the author(s). However, under fair use you are encouraged to fork and contribute minor corrections and updates for the benefit of the author(s) and other readers. 4 | 5 | ## How to Contribute 6 | 7 | 1. Make sure you have a GitHub account. 8 | 2. Fork the repository for the relevant book. 9 | 3. Create a new branch on which to make your change, e.g. 10 | `git checkout -b my_code_contribution` 11 | 4. Commit your change. Include a commit message describing the correction. Please note that if your commit message is not clear, the correction will not be accepted. 12 | 5. Submit a pull request. 13 | 14 | Thank you for your contribution! -------------------------------------------------------------------------------- /DATAMINE_Manual.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/data-mining-algorithms-cpp/bf18dda7f2361534423c56076fd60999b1becd86/DATAMINE_Manual.pdf -------------------------------------------------------------------------------- /DENSITY_PLOTS.TXT: -------------------------------------------------------------------------------- 1 | This is a set of code fragments that illustrate computation of a density plot. 2 | 3 | The following variables are especially important here 4 | database n_cases (rows) by n_vars (columns) dataset containing all data 5 | grid res by res displayable image which we compute 6 | val1 Horizontal variables, which we extract from the database 7 | val2 And vertical variable 8 | keys Work area, needed only for histogram equalization 9 | 10 | The user-specified parameters are as follows: 11 | varnum1 Column in the database of horizontal variable 12 | varnum2 Column in the database of vertical variable 13 | use_lowlim1 Flag: limit the lower range of the horizontal variable? 14 | lowlim_val1 Lower limit if specified by user 15 | Similarly variables for upper limits and vertical variable 16 | res Vertical and horizontal resolution of the square image generated 17 | width Fraction of standard deviation used for Parzen window width 18 | shift Amount to shift displayed tone for better display 19 | spread Amount to expand displayed tone range for better display 20 | type Type of display 21 | TYPE_DENSITY Actual density 22 | TYPE_MARGINAL Marginal density product, shows 'no relationship' pattern 23 | TYPE_INCONSISTENCY Marginal inconsistency 24 | TYPE_MI Mutual information contribution 25 | hist Apply histogram normalization? 26 | sharpen Sharpen display range to clarify boundary? 27 | 28 | 29 | /* 30 | These are memory allocations, with 'keys', 'val1', and 'val2' being work areas 31 | */ 32 | 33 | grid = (double *) malloc ( 2 * res * res * sizeof(double) ) ; 34 | keys = (int *) malloc ( res * res * sizeof(int) ) ; 35 | val1 = (double *) malloc ( n_cases * sizeof(double) ) ; 36 | val2 = (double *) malloc ( n_cases * sizeof(double) ) ; 37 | 38 | 39 | /* 40 | Get the data from the database, with 'n_vars' being the number of columns in the database, 41 | and 'n_cases' being the number of rows. 42 | */ 43 | 44 | for (i=0 ; i largest) 60 | largest = val1[i] ; 61 | } 62 | 63 | if (use_lowlim1) 64 | smallest = lowlim_val1 ; 65 | 66 | if (use_highlim1) 67 | largest = highlim_val1 ; 68 | 69 | if (largest <= smallest) { // Should never happen, but user may be careless 70 | largest = smallest + 0.1 ; 71 | smallest = largest - 0.2 ; 72 | } 73 | 74 | // Use these ranges to set up plot things, such as labels 75 | // This code is omitted, as it is specific to the desired interface system 76 | // We let xmin and xmax be the actual display range, which may equal or be outside (smallest, largest). 77 | // Now we do the same thing for the vertical variable 78 | 79 | smallest = largest = val2[0] ; 80 | for (i=1 ; i largest) 84 | largest = val2[i] ; 85 | } 86 | 87 | if (use_lowlim2) 88 | smallest = lowlim_val2 ; 89 | 90 | if (use_highlim2) 91 | largest = highlim_val2 ; 92 | 93 | if (largest <= smallest) { 94 | largest = smallest + 0.1 ; 95 | smallest = largest - 0.2 ; 96 | } 97 | 98 | 99 | /* 100 | Compute the scale factors for the Parzen windows 101 | */ 102 | 103 | scale1 = scale2 = mean1 = mean2 = 0.0 ; 104 | 105 | for (i=0 ; i highlim_val1) 110 | x = highlim_val1 ; 111 | mean1 += x ; 112 | x = val2[i] ; 113 | if (use_lowlim2 && x < lowlim_val2) 114 | x = lowlim_val2 ; 115 | if (use_highlim2 && x > highlim_val2) 116 | x = highlim_val2 ; 117 | mean2 += x ; 118 | } 119 | 120 | mean1 /= n_cases ; 121 | mean2 /= n_cases ; 122 | 123 | for (i=0 ; i highlim_val1) 128 | x = highlim_val1 ; 129 | diff = x - mean1 ; 130 | scale1 += diff * diff ; 131 | x = val2[i] ; 132 | if (use_lowlim2 && x < lowlim_val2) 133 | x = lowlim_val2 ; 134 | if (use_highlim2 && x > highlim_val2) 135 | x = highlim_val2 ; 136 | diff = x - mean2 ; 137 | scale2 += diff * diff ; 138 | } 139 | 140 | scale1 = width * sqrt ( scale1 / n_cases ) ; 141 | scale2 = width * sqrt ( scale2 / n_cases ) ; 142 | 143 | if (scale1 < 1.e-30) // Should never happen, but user may be careless 144 | scale1 = 1.e-30 ; 145 | 146 | if (scale2 < 1.e-30) 147 | scale2 = 1.e-30 ; 148 | 149 | 150 | /* 151 | Compute the raw display grid, before any transformations 152 | */ 153 | 154 | for (i=0 ; i maxMI) { 212 | maxMI = grid[vert*res+horz] ; 213 | maxMIx = x ; 214 | maxMIy = y ; 215 | } 216 | } 217 | } 218 | if (totalMI > 0.0) 219 | maxMI *= res * res / totalMI ; 220 | else 221 | maxMI = 0.0 ; 222 | } 223 | 224 | if (type == TYPE_INCONSISTENCY) { // If user wants inconsistency 225 | max_pos = max_neg = 1.e-20 ; 226 | for (i=0 ; i 0.0 && grid[i] > max_pos) 228 | max_pos = grid[i] ; 229 | if (grid[i] < 0.0 && (-grid[i]) > max_neg) 230 | max_neg = -grid[i] ; 231 | } 232 | for (i=0 ; i 0.0) 234 | grid[i] /= max_pos ; 235 | if (grid[i] < 0.0) 236 | grid[i] /= -max_neg ; 237 | } 238 | } 239 | 240 | if (hist) { 241 | /* 242 | Sort the grid entries so we can compute fractiles. 243 | Recall that we allocated twice the needed space to allow for sorted array. 244 | Then convert each grid entry into its fractile. 245 | */ 246 | 247 | for (i=0 ; i 1.0) 274 | grid[i] = 1.0 ; 275 | if (grid[i] < 0.0) 276 | grid[i] = 0.0 ; 277 | if (sharpen) 278 | grid[i] = grid[i] * grid[i] * grid[i] ; 279 | } 280 | } // No histogram equalization 281 | 282 | 283 | 284 | /* 285 | Apply the user's visual transform 286 | */ 287 | 288 | if (spread >= 0.0) 289 | mult = spread + 1.0 ; 290 | else 291 | mult = 1.0 / (1.0 - spread) ; 292 | 293 | for (i=0 ; i 1.0 - 1.e-12) // Ditto 298 | grid[i] = 1.0 - 1.e-12 ; 299 | 300 | if (grid[i] <= 0.5) 301 | grid[i] = 0.5 * exp ( mult * log ( 2.0 * grid[i] )) ; 302 | else 303 | grid[i] = 1.0 - 0.5 * exp ( mult * log ( 2.0 * (1.0 - grid[i]) )) ; 304 | } 305 | } -------------------------------------------------------------------------------- /DataMine.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Apress/data-mining-algorithms-cpp/bf18dda7f2361534423c56076fd60999b1becd86/DataMine.exe -------------------------------------------------------------------------------- /EVEC_RS.CPP: -------------------------------------------------------------------------------- 1 | /****************************************************************************/ 2 | /* */ 3 | /* EVEC_RS */ 4 | /* */ 5 | /* Compute eigenvalues and vectors of real symmetric matrix */ 6 | /* */ 7 | /****************************************************************************/ 8 | /* */ 9 | 10 | #include 11 | 12 | /* 13 | The input matrix is mat_in. It is not touched. The upper minor triangle 14 | of it is ignored, and hence may be garbage. Its column dimension is n. 15 | The eigenvectors are output in vect, which has column dimension n. 16 | The calling program may use the same matrix for mat_in and vect, 17 | in which case the input is simply replaced. 18 | The eigenvalues are output in eval. Workv is a double work vector n long. 19 | This returns the number of eigenvalues which could not be computed, 20 | which is virtually always 0. I've exhaustively tested this routine and 21 | never seen it return a nonzero value. 22 | */ 23 | 24 | int evec_rs ( double *mat_in , int n , int find_vec , double *vect , double *eval , double *workv ) 25 | { 26 | int i, im1, j, k, irow, irowm1, ival, ivalp1, iercnt, msplit, ibig ; 27 | double b, f, g, h, hh, p, r, x, scale, shift, sine, cosine, big, *vptr ; 28 | 29 | // Compzero is an accuracy versus speed tradeoff. The algorithm is most accurate when compzero=0. 30 | // But by letting 'zero' be a very small positive number, we can take some early loop exits 31 | // with very little penalty, insignificant most of the time. 32 | double compzero = 1.e-16 ; 33 | 34 | // Eps is used only for splitting a large matrix into two smaller matrices at a 'zero' diagonal, 35 | // greatly speeding operation. But if the diagonal is not quite zero, this does introduce a tiny, 36 | // usually insignificant, error. 37 | // The algorithm is most accurate when eps=0, but very small values are fine for most work. 38 | double eps = 1.e-12 ; 39 | 40 | /* copy lower triangle of input to output. */ 41 | for (i=0 ; i0 ; irow--) { 55 | irowm1 = irow - 1 ; 56 | h = 0.0 ; 57 | /* We can improve computational accuracy by scaling the row. */ 58 | for (scale=0.0 , i=0 ; i<=irowm1 ; i++) /* do left of diag only */ 59 | scale += fabs ( vect[irow*n+i] ) ; 60 | /* Avoid a lot of work if this row already tri-diagonal */ 61 | if (scale < compzero || irow == 1) 62 | workv[irow] = vect[irow*n+irowm1] ; 63 | else { 64 | /* Do actual scaling (left of diag only). Cumulate sum squares */ 65 | for (i=0 ; i<=irowm1 ; i++) { 66 | x = vect[irow*n+i] / scale ; 67 | vect[irow*n+i] = x ; 68 | h += x * x ; 69 | } 70 | /* The 'U' vector of the literature is the row vector except that 71 | its first element (f) has the length of the vector (sqrt(h)) 72 | either added or subtracted (g), whichever gives the largest 73 | absolute value. */ 74 | f = vect[irow*n+irowm1] ; 75 | g = ( f > 0 ) ? -sqrt (h) : sqrt (h) ; 76 | workv[irow] = g * scale ; /* subdiagonal compensated for scaling */ 77 | 78 | h -= f * g ; 79 | vect[irow*n+irowm1] = f - g ; 80 | 81 | /* Prepare to reduce vect. Use upper triangle for storage. */ 82 | 83 | for (f=0.0 , j=0 ; j<=irowm1 ; j++) { 84 | if (find_vec) 85 | vect[j*n+irow] = vect[irow*n+j] / h ; 86 | /* Form element of A * U */ 87 | for (g=0.0 , k=0 ; k<=j ; k++) 88 | g += vect[j*n+k] * vect[irow*n+k] ; 89 | if (j < irowm1) 90 | for (k=j+1 ; k<=irowm1 ; k++) 91 | g += vect[k*n+j] * vect[irow*n+k] ; 92 | /* Compute an element of P. Use the positions in workv below 93 | those already determined subdiagonals as work areas. */ 94 | workv[j] = g / h ; 95 | f += workv[j] * vect[irow*n+j] ; 96 | } /* for f=0.0 j=0 */ 97 | 98 | /* Reduce A such that all elements of row irow are zero except the 99 | diagonal and the element to its left (ignoring symmetric 100 | elements). Naturally we need not compute those zeroes. Just 101 | modify the rows above irow. */ 102 | hh = f / (h + h) ; 103 | for (j=0 ; j<=irowm1 ; j++) { 104 | f = vect[irow*n+j] ; 105 | g = workv[j] - hh * f ; 106 | workv[j] = g ; 107 | for (k=0 ; k<=j ; k++) 108 | vect[j*n+k] -= f * workv[k] + g * vect[irow*n+k] ; 109 | } 110 | } /* else scale compzero) { 132 | for (j=0 ; j<=irowm1 ; j++) { 133 | for (g=0.0 , k=0 ; k<=irowm1 ; k++) 134 | g += vect[irow*n+k] * vect[k*n+j] ; 135 | for (k=0 ; k<=irowm1 ; k++) 136 | vect[k*n+j] -= g * vect[k*n+irow] ; 137 | } 138 | } 139 | /* Recover diagonal and zero matrix elements which are truly zero 140 | but were not computed. */ 141 | eval[irow] = vect[irow*n+irow] ; 142 | vect[irow*n+irow] = 1. ; 143 | for (j=0 ; j<=irowm1 ; j++) { 144 | vect[irow*n+j] = 0.0 ; 145 | vect[j*n+irow] = 0.0 ; 146 | } 147 | } /* for irow=0 */ 148 | } // If find_vec 149 | 150 | else { 151 | for (irow=0 ; irow compzero) ? h : compzero ; /* needed in some cases */ 185 | b = (b > h) ? b : h ; 186 | /* Recall we set workv[n-1]=0.0 This loop at least finds that. */ 187 | for (msplit=ival ; msplit ival) { 195 | do { 196 | if (iercnt++ > 100) /* avoid useless repetition */ 197 | return (n - ival) ; 198 | /* Before transforming we shift all eigenvalues by a constant to 199 | accelerate convergence. Now shift by an additional h for 200 | this one. */ 201 | ivalp1 = ival + 1 ; 202 | g = eval[ival] ; 203 | p = ( eval[ivalp1] - g ) / (2. * workv[ival]);/* tricky denom */ 204 | r = sqrt ( p * p + 1.0 ) ; 205 | eval[ival] = workv[ival] / ( p + ( (p>0) ? r : -r ) ) ; 206 | 207 | h = g - eval[ival] ; 208 | /* We just shifted ival'th. Do same for others. */ 209 | for (i=ivalp1 ; i= ival ; i--) { 218 | g = cosine * workv[i] ; 219 | h = cosine * p ; 220 | if (fabs (p) >= fabs (workv[i])) { 221 | cosine = workv[i] / p ; 222 | r = sqrt ( cosine * cosine + 1.0 ) ; 223 | workv[i+1] = sine * p * r ; 224 | sine = cosine / r ; 225 | cosine = 1.0 / r ; 226 | } 227 | else { 228 | cosine = p / workv[i] ; 229 | r = sqrt ( cosine * cosine + 1.0 ) ; 230 | workv[i+1] = sine * workv[i] * r ; 231 | sine = 1.0 / r ; 232 | cosine = cosine * sine ; 233 | } 234 | p = cosine * eval[i] - sine * g ; 235 | eval[i+1] = h + sine * (cosine * g + sine * eval[i]) ; 236 | /* now we must transform vect the same way, so that we get 237 | the eigenvector of the original matrix. Note that 238 | previous vectors are untouched. */ 239 | if (find_vec) { 240 | for (k=0 ; k b ) ; 254 | } /* if msplit > ival */ 255 | /* We have an eigenvalue. Compensate for shifting. */ 256 | eval[ival] += shift ; 257 | 258 | } /* for ival=0 */ 259 | /* 260 | ------------------------------------------------------------------------------ 261 | 262 | This is it. We are all done. However, many programs prefer for the 263 | eigenvalues (and corresponding vectors!) to be sorted in decreasing 264 | order. Do this now. Then flip signs in any column which has more 265 | negatives than positives. This is appreciated during interpretation. 266 | 267 | ------------------------------------------------------------------------------ 268 | */ 269 | 270 | for (i=1 ; i big) { 278 | big = x ; 279 | ibig = j ; 280 | } 281 | } 282 | if (ibig != im1) { 283 | /* swap */ 284 | eval[ibig] = eval[im1] ; 285 | eval[im1] = big ; 286 | if (find_vec) { 287 | for (j=0 ; j n) 303 | for (j=0 ; j 27 | 28 | int user_pressed_escape() ; 29 | 30 | int glob_min ( 31 | double low , // Lower limit for search 32 | double high , // Upper limit 33 | int npts , // Number of points to try 34 | int log_space , // Space by log? 35 | double critlim , // Quit global if crit drops this low 36 | int (*criter) (double , double *) , // Criterion function 37 | double *x1 , 38 | double *y1 , // Lower X value and function there 39 | double *x2 , 40 | double *y2 , // Middle (best) 41 | double *x3 , 42 | double *y3 // And upper 43 | ) 44 | { 45 | int i, ibest, turned_up, know_first_point, user_quit ; 46 | double x, y, rate, previous ; 47 | 48 | user_quit = 0 ; 49 | 50 | if (npts < 0) { 51 | npts = -npts ; 52 | know_first_point = 1 ; 53 | } 54 | else 55 | know_first_point = 0 ; 56 | 57 | if (log_space) 58 | rate = exp ( log (high / low) / (npts - 1) ) ; 59 | else 60 | rate = (high - low) / (npts - 1) ; 61 | 62 | x = low ; 63 | 64 | previous = 0.0 ; // Avoids "use before set" compiler warnings 65 | ibest = -1 ; // For proper critlim escape 66 | turned_up = 0 ; // Must know if function increased after min 67 | 68 | for (i=0 ; i 0) && turned_up) 94 | break ; // Done if (abort or good enough) and both neighbors found 95 | 96 | if (user_quit) // Alas, both neighbors not found 97 | return user_quit ; // Flag that the other 2 pts not there 98 | 99 | if (log_space) 100 | x *= rate ; 101 | else 102 | x += rate ; 103 | } 104 | 105 | /* 106 | At this point we have a minimum (within low,high) at (x2,y2). 107 | Compute x1 and x3, its neighbors. 108 | We already know y1 and y3 (unless the minimum is at an endpoint!). 109 | */ 110 | 111 | if (log_space) { 112 | *x1 = *x2 / rate ; 113 | *x3 = *x2 * rate ; 114 | } 115 | else { 116 | *x1 = *x2 - rate ; 117 | *x3 = *x2 + rate ; 118 | } 119 | 120 | /* 121 | Normally we would now be done. However, the careless user may have 122 | given us a bad x range (low,high) for the global search. 123 | If the function was still decreasing at an endpoint, bail out the 124 | user by continuing the search. 125 | */ 126 | 127 | if (! turned_up) { // Must extend to the right (larger x) 128 | for (;;) { // Endless loop goes as long as necessary 129 | 130 | user_quit = user_pressed_escape () ; 131 | 132 | if (! user_quit) 133 | user_quit = criter ( *x3 , y3 ) ; 134 | 135 | if (user_quit) // Alas, both neighbors not found 136 | return user_quit ; // Flag that the other 2 pts not there 137 | 138 | if (*y3 > *y2) // If function increased we are done 139 | break ; 140 | if ((*y1 == *y2) && (*y2 == *y3)) // Give up if flat 141 | break ; 142 | 143 | *x1 = *x2 ; // Shift all points 144 | *y1 = *y2 ; 145 | *x2 = *x3 ; 146 | *y2 = *y3 ; 147 | 148 | rate *= 3.0 ; // Step further each time 149 | if (log_space) // And advance to new frontier 150 | *x3 *= rate ; 151 | else 152 | *x3 += rate ; 153 | } 154 | } 155 | 156 | else if (ibest == 0) { // Must extend to the left (smaller x) 157 | for (;;) { // Endless loop goes as long as necessary 158 | 159 | user_quit = user_pressed_escape () ; 160 | 161 | if (! user_quit) 162 | user_quit = criter ( *x1 , y1 ) ; 163 | 164 | if (user_quit) // Alas, both neighbors not found 165 | return user_quit ; // Flag that the other 2 pts not there 166 | 167 | if (*y1 > *y2) // If function increased we are done 168 | break ; 169 | if ((*y1 == *y2) && (*y2 == *y3)) // Give up if flat 170 | break ; 171 | 172 | *x3 = *x2 ; // Shift all points 173 | *y3 = *y2 ; 174 | *x2 = *x1 ; 175 | *y2 = *y1 ; 176 | 177 | rate *= 3.0 ; // Step further each time 178 | if (log_space) // And advance to new frontier 179 | *x1 /= rate ; 180 | else 181 | *x1 -= rate ; 182 | } 183 | } 184 | 185 | return 0 ; 186 | } 187 | -------------------------------------------------------------------------------- /HORNS_METHOD.TXT: -------------------------------------------------------------------------------- 1 | typedef struct { 2 | int nc ; // Number of cases 3 | int nv ; // Number of variables 4 | double *covar ; // Scratch for covariance matrix 5 | double *evals ; // Computed eigenvalues 6 | double *workv ; // Scratch vector for evec_rs() 7 | int ieval ; // Needed for placing result in all_evals 8 | } MC_EVALS_PARAMS ; 9 | 10 | static unsigned int __stdcall evals_threaded ( LPVOID dp ) 11 | { 12 | int i, j, icase, n_cases, n_vars ; 13 | double *xvec, *sums, *covar, xtemp, *evals, *workv ; 14 | 15 | n_cases = ((MC_EVALS_PARAMS *) dp)->nc ; 16 | n_vars = ((MC_EVALS_PARAMS *) dp)->nv ; 17 | covar = ((MC_EVALS_PARAMS *) dp)->covar ; 18 | xvec = evals = ((MC_EVALS_PARAMS *) dp)->evals ; // We borrow this for computing covar 19 | sums = workv = ((MC_EVALS_PARAMS *) dp)->workv ; // Ditto 20 | 21 | /* 22 | Compute the lower-left triangle of the covariance matrix of a 23 | standardized, uncorrelated normal random variable. 24 | The upper-right triangle is ignored by the evec_rs() routine. 25 | */ 26 | 27 | for (i=0 ; i mc_reps) 96 | max_threads = mc_reps ; 97 | 98 | /* 99 | Allocate memory 100 | */ 101 | 102 | covar = (double *) malloc ( nv * nv * max_threads * sizeof(double) ) ; 103 | evals = (double *) malloc ( nv * max_threads * sizeof(double) ) ; 104 | workv = (double *) malloc ( nv * max_threads * sizeof(double) ) ; 105 | all_evals = (double *) malloc ( nv * mc_reps * sizeof(double) ) ; 106 | 107 | /* 108 | -------------------------------------------------------------------------------- 109 | 110 | Outer-most loop does threaded MC replications 111 | Initialize those thread parameters which are constant for all threads. 112 | 113 | -------------------------------------------------------------------------------- 114 | */ 115 | 116 | for (ithread=0 ; ithread= n_threads) { 187 | ret_val = ERROR_INSUFFICIENT_MEMORY ; 188 | goto FINISH ; 189 | } 190 | 191 | k = mc_evals_params[ret_val].ieval ; 192 | for (i=0 ; i= n_threads) { 208 | ret_val = ERROR_INSUFFICIENT_MEMORY ; 209 | goto FINISH ; 210 | } 211 | for (i=0 ; i= mc_reps) 234 | k = mc_reps - 1 ; 235 | 236 | for (i=0 ; i 8 | 9 | #define INTBUF 100 /* Incredibly conservative! (divisions 2^(-100) are tiny!) */ 10 | 11 | double integrate ( 12 | double low , // Lower limit for definite integral 13 | double high , // Upper limit 14 | double min_width , // Demand subdivision this small or smaller 15 | double acc , // Relative interval width limit 16 | double tol , // Relative error tolerance 17 | double (*criter) (double) // Criterion function 18 | ) 19 | { 20 | int istack ; 21 | double sum, a, b, mid, fa, fb, fmid, lowres, hires, fac ; 22 | 23 | struct IntStack { 24 | double x0 ; 25 | double x1 ; 26 | double f0 ; 27 | double f1 ; 28 | } stack[INTBUF] ; 29 | 30 | fac = 3.0 * tol ; // Error is about (lowres-hires) / 3 31 | 32 | /* 33 | Start by initializing the stack to be the entire interval 34 | and the integral so far to be zero 35 | */ 36 | 37 | stack[0].x0 = low ; 38 | stack[0].f0 = criter ( low ) ; 39 | stack[0].x1 = high ; 40 | stack[0].f1 = criter ( high ) ; 41 | istack = 1 ; 42 | sum = 0.0 ; 43 | 44 | /* 45 | Main algorithm starts here. Pop interval off stack and test its quality. 46 | */ 47 | 48 | while (istack > 0) { // While there is still at least one interval on stack 49 | --istack ; // Pop this interval 50 | a = stack[istack].x0 ; 51 | b = stack[istack].x1 ; 52 | fa = stack[istack].f0 ; 53 | fb = stack[istack].f1 ; 54 | mid = 0.5 * (a + b) ; 55 | fmid = criter ( mid ) ; 56 | lowres = 0.5 * (b - a) * (fa + fb) ; // Trapezoidal rule 57 | hires = 0.25 * (b - a) * (fa + 2.0 * fmid + fb) ; // And refined value 58 | // If the interval is ridiculously narrow, no point in continuing 59 | // If it gets this far, chances are the integrand is discontinuous 60 | if (b - a <= acc * (1.0 + fabs(a) + fabs(b))) 61 | sum += hires ; // Quit trying to refine 62 | else if ((b - a) <= min_width && fabs(lowres-hires) < fac * (b - a)) 63 | sum += hires ; // Normal convergence flag 64 | else { 65 | stack[istack].x0 = a ; 66 | stack[istack].f0 = fa ; 67 | stack[istack].x1 = mid ; 68 | stack[istack].f1 = fmid ; 69 | ++istack ; 70 | if (istack < INTBUF) { // Insurance against catastrophe only 71 | stack[istack].x0 = mid ; // Should ALWAYS be true (easily!) 72 | stack[istack].f0 = fmid ; // If this if() fails, the answer will 73 | stack[istack].x1 = b ; // of course be wrong, but only due to 74 | stack[istack].f1 = fb ; // a horrendous underlying problem 75 | ++istack ; // like a singularity in the function 76 | } 77 | else { 78 | --istack ; // Error condition, so undo push 79 | sum += hires ; // And go with this best estimiate 80 | } 81 | } 82 | } 83 | return sum ; 84 | } 85 | -------------------------------------------------------------------------------- /INVERT.CPP: -------------------------------------------------------------------------------- 1 | //---------------------------------------------------------------------------- 2 | /* LUDECOMP */ 3 | /* */ 4 | /* Compute the LU decomposition via Crout algorithm */ 5 | 6 | #include 7 | #include 8 | 9 | int LUdecomp ( 10 | int n , // Order of the input matrix 11 | double *mat_in , // Input matrix in standard (row major) order 12 | double *mat_out , // Output of LU decomposition 13 | int dim , // Their column dimension in the calling routine 14 | int digits , // If > 0 mat_in is assumed to be accurate to digits figs 15 | double *det , // Determinant 16 | int *pivot , // Output of permutation used for pivot optimization 17 | double *equil ) // Output of smallest 1 / abs (mat_in[i]) for each 18 | 19 | /* It returns 1 if accuracy has not been maintained, 2 if singular. */ 20 | 21 | { 22 | int row, col, inner, i, rmax ; 23 | double sum, fptemp, rn, wrel, big, biggest, *ptr1, *ptr2, *lurc ; 24 | double ai, wi, wa, p, q, test ; 25 | 26 | rmax = 0 ; // Not needed. Shuts up LINT. 27 | 28 | /* 29 | Initialize. Copy input matrix to output. 30 | */ 31 | 32 | rn = (double) n ; 33 | wrel = 0.0 ; 34 | biggest = 0.0 ; 35 | *det = 1.0 ; 36 | 37 | for (row=0 ; row big) 45 | big = fptemp ; 46 | } 47 | if (big < 1.0e-90) 48 | goto SINGULAR ; 49 | if (big > biggest) 50 | biggest = big ; 51 | equil[row] = 1.0 / big ; 52 | } 53 | 54 | /* 55 | ------------------------------------------------------------------------------- 56 | 57 | This is the main loop which does all columns 58 | 59 | ------------------------------------------------------------------------------- 60 | */ 61 | 62 | for (col=0 ; col wrel) 92 | wrel = test ; 93 | } /* if digits (accuracy test) */ 94 | 95 | else { /* No accuracy test */ 96 | if (row) { 97 | ptr1 = mat_out + row * dim ; 98 | ptr2 = mat_out + col ; 99 | inner = row ; 100 | while (inner--) { 101 | sum -= *ptr1++ * *ptr2 ; 102 | ptr2 += dim ; 103 | } 104 | *lurc = sum ; 105 | } 106 | } /* No accuracy test */ 107 | } /* for row */ 108 | 109 | /* 110 | Now compute the diagonal of U and the elements of L below the diagonal 111 | */ 112 | 113 | p = 0.0 ; 114 | 115 | for (row=col ; row wrel) 137 | wrel = test ; 138 | } /* if digits (accuracy test) */ 139 | 140 | else { /* No accuracy test */ 141 | if (col) { 142 | ptr1 = mat_out + row * dim ; 143 | ptr2 = mat_out + col ; 144 | inner = col ; 145 | while (inner--) { 146 | sum -= *ptr1++ * *ptr2 ; 147 | ptr2 = ptr2 + dim ; 148 | } 149 | *lurc = sum ; 150 | } 151 | } /* no accuracy test */ 152 | 153 | q = equil[row] * fabs ( sum ) ; 154 | if (q > p) { 155 | p = q ; 156 | rmax = row ; 157 | } 158 | } /* for row */ 159 | 160 | if ((rn + p) == rn) /* No longer can tell them apart? */ 161 | goto SINGULAR ; 162 | 163 | /* 164 | If this row is not the best pivot, interchange for stability 165 | */ 166 | 167 | if (rmax != col) { 168 | *det = - *det ; 169 | ptr1 = mat_out + rmax * dim ; 170 | ptr2 = mat_out + col * dim ; 171 | inner = n ; 172 | while (inner--) { 173 | fptemp = *ptr1 ; 174 | *ptr1++ = *ptr2 ; 175 | *ptr2++ = fptemp ; 176 | } 177 | equil[rmax] = equil[col] ; 178 | } 179 | 180 | /* 181 | Final loop housekeeping. Divide by pivot. 182 | */ 183 | 184 | pivot[col] = rmax ; 185 | *det *= (fptemp = mat_out[col * dim + col]) ; 186 | 187 | ptr1 = mat_out + (col+1) * dim + col ; 188 | inner = n - col ; 189 | while (--inner) { 190 | *ptr1 /= fptemp ; 191 | ptr1 += dim ; 192 | } 193 | 194 | } /* for col */ 195 | 196 | /* 197 | All done. Do final accuracy test. 198 | */ 199 | 200 | if (digits) { 201 | p = (double) (3 * n + 3) ; 202 | wa = p * wrel ; 203 | if (wa + pow ( 10.0 , (double) -digits ) == wa) 204 | return 1 ; 205 | } 206 | return 0 ; 207 | 208 | SINGULAR: 209 | *det = 0.0 ; 210 | return 2 ; 211 | } 212 | 213 | //---------------------------------------------------------------------------- 214 | /* ELIM */ 215 | /* */ 216 | /* Elimination part of equation solution (follows LUdecomp) */ 217 | 218 | void elim ( 219 | int n , // Order of system 220 | double *lu , // LU output of LUdecomp 221 | int dim , // Column dimension of LU in calling program 222 | double *rhs , // Right hand side of system 223 | int *pivot, // Pivot output of LUdecomp 224 | double *x ) // Solution 225 | { 226 | int ip, row, col, iw ; 227 | double sum, *ptr1, *ptr2 ; 228 | 229 | 230 | memcpy ( x , rhs , n * sizeof(double) ) ; // Copy right hand side to x 231 | 232 | /* 233 | Solve LY = RHS for Y 234 | */ 235 | 236 | iw = -1 ; 237 | for (row=0 ; row= 0) { 242 | ptr1 = lu + row * dim + iw ; 243 | ptr2 = x + iw ; 244 | col = row - iw ; 245 | while (col--) 246 | sum -= *ptr1++ * *ptr2++ ; 247 | } 248 | else if (fabs ( sum ) > 1.e-90) 249 | iw = row ; 250 | x[row] = sum ; 251 | } 252 | 253 | /* 254 | Now solve UX = Y for X 255 | */ 256 | 257 | for (row=n-1 ; row>=0 ; row--) { 258 | sum = x[row] ; 259 | ptr1 = lu + row * dim + row + 1 ; 260 | ptr2 = x + row + 1 ; 261 | col = n - row ; 262 | while (--col) 263 | sum -= *ptr1++ * *ptr2++ ; 264 | x[row] = sum / lu[row * dim + row] ; 265 | } 266 | } 267 | 268 | //---------------------------------------------------------------------------- 269 | 270 | int invert ( 271 | int n , // Size of matrix 272 | double *x , // Matrix to be inverted, not changed 273 | double *xinv , // Output of its inverse 274 | double *det , // Determinant 275 | double *rwork , // Work vector n*n + 2*n long 276 | int *iwork ) // Work vector n long 277 | { 278 | int i, j, ret_val ; 279 | double *lu, *equil, *soln ; 280 | 281 | lu = rwork ; 282 | equil = lu + n * n ; 283 | soln = equil + n ; 284 | 285 | ret_val = LUdecomp ( n , x , lu , n , 0 , det , iwork , equil ) ; 286 | 287 | if (ret_val) 288 | return 1 ; 289 | 290 | for (i=0 ; i 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | extern void free_data ( int nvars , char **names , double *data ) ; 16 | extern double mutinf_b ( int n , short int *y , short int *x , short int *z ) ; 17 | extern int readfile ( char *name , int *nvars , char ***names , 18 | int *ncases , double **data ) ; 19 | extern void partition ( int n , double *data , int *npart , 20 | double *bnds , short int *bins ) ; 21 | extern void qsortdsi ( int first , int last , double *data , int *slave ) ; 22 | 23 | int main ( 24 | int argc , // Number of command line arguments (includes prog name) 25 | char *argv[] // Arguments (prog name is argv[0]) 26 | ) 27 | 28 | { 29 | int i, j, k, depzero, indepzero, nvars, ncases, maxkept, ivar, *kept ; 30 | int n_indep_vars, idep, icand, iz, ibest, *sortwork, nkept, *last_indices ; 31 | double *data, *work, temp, p, error_entropy ; 32 | double *save_info, bestcrit ; 33 | double criterion, entropy, bound, *crits, *scores ; 34 | short int *bins_dep, *bins_indep, *xbins ; 35 | char filename[256], **names, depname[256] ; 36 | char trial_name[256] ; 37 | FILE *fp ; 38 | 39 | /* 40 | Process command line parameters 41 | */ 42 | 43 | #if 1 44 | if (argc != 7) { 45 | printf ( "\nUsage: MI_BIN datafile n_indep depname depzero indepzero maxkept" ) ; 46 | printf ( "\n datafile - name of the text file containing the data" ) ; 47 | printf ( "\n The first line is variable names" ) ; 48 | printf ( "\n Subsequent lines are the data." ) ; 49 | printf ( "\n Delimiters can be space, comma, or tab" ) ; 50 | printf ( "\n n_indep - Number of independent vars, starting with the first" ) ; 51 | printf ( "\n depname - Name of the 'dependent' variable" ) ; 52 | printf ( "\n It must be AFTER the first n_indep variables" ) ; 53 | printf ( "\n depzero - If nonzero, dependent variable is split >0 vs <=0" ) ; 54 | printf ( "\n Else split is by optimal partition" ) ; 55 | printf ( "\n indepzero - Ditto, for independent variables" ) ; 56 | printf ( "\n maxkept - Stepwise will allow at most this many predictors" ) ; 57 | return EXIT_FAILURE ; 58 | } 59 | 60 | strcpy ( filename , argv[1] ) ; 61 | n_indep_vars = atoi ( argv[2] ) ; 62 | strcpy ( depname , argv[3] ) ; 63 | depzero = atoi ( argv[4] ) ; 64 | indepzero = atoi ( argv[5] ) ; 65 | maxkept = atoi ( argv[6] ) ; 66 | #else 67 | strcpy ( filename , "..\\VARS.TXT" ) ; 68 | strcpy ( depname , "DAY_RETURN" ) ; 69 | n_indep_vars = 8 ; 70 | depzero = 1 ; 71 | indepzero = 1 ; 72 | maxkept = 99 ; 73 | #endif 74 | 75 | _strupr ( depname ) ; 76 | 77 | 78 | /* 79 | Open the text file to which results will be written 80 | */ 81 | 82 | fp = fopen ( "MI_BIN.LOG" , "wt" ) ; 83 | if (fp == NULL) { // Should never happen 84 | printf ( "\nCannot open MI_BIN.LOG file for writing!" ) ; 85 | return EXIT_FAILURE ; 86 | } 87 | 88 | /* 89 | Read the file and locate the index of the 'dependent' variable 90 | */ 91 | 92 | if (readfile ( filename , &nvars , &names , &ncases , &data )) 93 | return EXIT_FAILURE ; 94 | 95 | for (idep=0 ; idep0 as the definition of bin membership. 147 | Otherwise we use partition() to do the split. 148 | */ 149 | 150 | if (depzero) { // The dependent variable is split at zero 151 | for (i=0 ; i 0.0) 153 | bins_dep[i] = (short int) 1 ; 154 | else 155 | bins_dep[i] = (short int) 0 ; 156 | } 157 | fprintf ( fp , "\n%s has been split at zero", names[idep] ) ; 158 | } 159 | else { // The dependent variable is to be partitioned 160 | for (i=0 ; i 0.0) 172 | bins_indep[ivar*ncases+i] = (short int) 1 ; 173 | else 174 | bins_indep[ivar*ncases+i] = (short int) 0 ; 175 | } 176 | } 177 | } 178 | else { 179 | fprintf ( fp , "\nIndependent variables have been given an optimal split"); 180 | for (ivar=0 ; ivar 0 && k < ncases) { 218 | p = (double) k / (double) ncases ; 219 | error_entropy = -p * log(p) - (1.0 - p) * log(1.0-p) ; 220 | } 221 | else 222 | error_entropy = 0.0 ; 223 | 224 | criterion = mutinf_b ( ncases , bins_dep , xbins , NULL ) ; 225 | bound = (entropy - criterion - error_entropy) / log ( 2.0 ) ; 226 | if (bound < 0.0) 227 | bound = 0.0 ; 228 | printf ( "\n%s = %.5lf (%.5lf)", names[icand], criterion, bound ) ; 229 | fprintf ( fp , "\n%31s %11.5lf %13.5lf", names[icand], criterion, bound ) ; 230 | sortwork[icand] = icand ; 231 | scores[icand] = save_info[icand] = criterion ; 232 | last_indices[icand] = -1 ; 233 | } // Initial list of all candidates 234 | 235 | 236 | fprintf ( fp , "\n" ) ; 237 | fprintf ( fp , "\nInitial candidates, in order of decreasing mutual information" ) ; 238 | fprintf ( fp , "\n" ) ; 239 | fprintf ( fp , "\n Variable Information" ) ; 240 | 241 | qsortdsi ( 0 , n_indep_vars-1 , save_info , sortwork ) ; 242 | for (icand=0 ; icand n_indep_vars) // Guard against silly user 278 | maxkept = n_indep_vars ; 279 | 280 | while (nkept < maxkept) { 281 | 282 | printf ( "\n\nLatest candidate: %s", names[kept[nkept-1]] ) ; 283 | 284 | fprintf ( fp , "\n" ) ; 285 | fprintf ( fp , "\nVariables so far Criterion" ) ; 286 | for (i=0 ; i bestcrit) { // Did we just set a new record? 327 | bestcrit = criterion ; // If so, update the record 328 | ibest = icand ; // Keep track of the winning candidate 329 | } 330 | 331 | } // For all candidates 332 | 333 | // We now have the best candidate 334 | if (bestcrit <= 0.0) 335 | break ; 336 | kept[nkept] = ibest ; 337 | crits[nkept] = bestcrit ; 338 | printf ( "\nAdded %s = %.5lf", names[ibest], bestcrit ) ; 339 | ++nkept ; 340 | } // While adding new variables 341 | 342 | fprintf ( fp , "\n" ) ; 343 | fprintf ( fp , "\nFinal set Criterion" ) ; 344 | for (i=0 ; i 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | double mutinf_b ( 14 | int n , // Number of cases 15 | short int *y , // The 'dependent' variable 16 | short int *x , // The 'independent' variable; NULL to compute H(Y) 17 | short int *z ) // NULL to compute I(X;Y), z to compute I(X;Y|Z) 18 | { 19 | int i, nx0, nx1, ny0, ny1, nz0, nz1, n00, n01, n10, n11 ; 20 | int n000, n010, n100, n110, n001, n011, n101, n111 ; 21 | double p, HX, HY, HZ, HXY, HYZ, HXZ, HXYZ ; 22 | 23 | /* 24 | -------------------------------------------------------------------------------- 25 | 26 | Compute the entropy of Y 27 | 28 | -------------------------------------------------------------------------------- 29 | */ 30 | 31 | if (x == NULL) { 32 | ny1 = 0 ; 33 | for (i=0 ; i 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | 14 | class MutualInformationDiscrete { 15 | 16 | public: 17 | MutualInformationDiscrete ( int nc , short int *bins ) ; 18 | ~MutualInformationDiscrete () ; 19 | double entropy () ; 20 | double mut_inf ( short int *bins ) ; 21 | double conditional ( short int *bins ) ; 22 | double conditional_error ( short int *bins ) ; 23 | double HYe ( short int *bins ) ; 24 | double hPe ( short int *bins ) ; 25 | 26 | private: 27 | int ncases ; // Number of cases 28 | short int *bins_y ; // They are here 29 | int nbins_y ; // Number of bins 30 | int *marginal_y ; // Marginal distribution 31 | } ; 32 | 33 | 34 | /* 35 | -------------------------------------------------------------------------------- 36 | 37 | MutualInformationDiscrete - Constructor and destructor 38 | 39 | -------------------------------------------------------------------------------- 40 | */ 41 | 42 | 43 | MutualInformationDiscrete::MutualInformationDiscrete ( 44 | int nc , // Number of cases 45 | short int *bins ) // They are here (y, the 'dependent' variable) 46 | { 47 | int i ; 48 | 49 | /* 50 | Keep a local copy of the bins 51 | */ 52 | 53 | ncases = nc ; 54 | 55 | bins_y = (short int *) malloc ( ncases * sizeof(short int) ) ; 56 | 57 | memcpy ( bins_y , bins , ncases * sizeof(short int) ) ; 58 | 59 | /* 60 | Compute the number of bins, and then compute and save the marginal distribution 61 | */ 62 | 63 | nbins_y = 0 ; 64 | for (i=0 ; i nbins_y) 66 | nbins_y = bins_y[i] ; 67 | } 68 | ++nbins_y ; // Number of bins is one greater than max bin because org=0 69 | 70 | marginal_y = (int *) malloc ( nbins_y * sizeof(int) ) ; 71 | assert (marginal_y != NULL) ; 72 | 73 | for (i=0 ; i 0) { 102 | p = (double) marginal_y[i] / ncases ; 103 | ent += p * log ( p ) ; 104 | } 105 | } 106 | return -ent ; 107 | } 108 | 109 | /* 110 | -------------------------------------------------------------------------------- 111 | 112 | conditional ( bins_x ) - Compute the conditional entropy of Y given X 113 | 114 | -------------------------------------------------------------------------------- 115 | */ 116 | 117 | double MutualInformationDiscrete::conditional ( short int *bins_x ) 118 | { 119 | int i, ix, iy, nbins_x, *grid, *marginal_x ; 120 | double CI, pyx, cix ; 121 | 122 | /* 123 | Compute the number of bins 124 | */ 125 | 126 | nbins_x = 0 ; 127 | for (i=0 ; i nbins_x) 129 | nbins_x = bins_x[i] ; 130 | } 131 | ++nbins_x ; // Number of bins is one greater than max bin because org=0 132 | 133 | /* 134 | Compute the marginal of x and the counts in the nbins_x by nbins_y grid 135 | */ 136 | 137 | marginal_x = (int *) malloc ( nbins_x * sizeof(int) ) ; 138 | 139 | grid = (int *) malloc ( nbins_x * nbins_y * sizeof(int) ) ; 140 | 141 | for (ix=0 ; ix 0) { 160 | cix = 0.0 ; 161 | for (iy=0 ; iy 0.0) 164 | cix += pyx * log ( pyx ) ; 165 | } 166 | } 167 | CI += cix * marginal_x[ix] / ncases ; 168 | } 169 | 170 | free ( marginal_x ) ; 171 | free ( grid ) ; 172 | 173 | return -CI ; 174 | } 175 | 176 | /* 177 | -------------------------------------------------------------------------------- 178 | 179 | mut_inf ( bins_x ) - Compute the mutual information I(X;Y) 180 | 181 | -------------------------------------------------------------------------------- 182 | */ 183 | 184 | double MutualInformationDiscrete::mut_inf ( short int *bins_x ) 185 | { 186 | int i, j, ix, nbins_x, *grid, *marginal_x ; 187 | double MI, px, py, pxy ; 188 | 189 | /* 190 | Compute the number of bins 191 | */ 192 | 193 | nbins_x = 0 ; 194 | for (i=0 ; i nbins_x) 196 | nbins_x = bins_x[i] ; 197 | } 198 | ++nbins_x ; // Number of bins is one greater than max bin because org=0 199 | 200 | /* 201 | Compute the marginal of x and the counts in the nbins_x by nbins_y grid 202 | */ 203 | 204 | marginal_x = (int *) malloc ( nbins_x * sizeof(int) ) ; 205 | assert (marginal_x != NULL) ; 206 | 207 | grid = (int *) malloc ( nbins_x * nbins_y * sizeof(int) ) ; 208 | assert ( grid != NULL ) ; 209 | 210 | for (i=0 ; i 0.0) 233 | MI += pxy * log ( pxy / (px * py) ) ; 234 | } 235 | } 236 | 237 | free ( marginal_x ) ; 238 | free ( grid ) ; 239 | 240 | return MI ; 241 | } 242 | 243 | /* 244 | -------------------------------------------------------------------------------- 245 | 246 | hPe ( bins_x ) - Compute the Shannon entropy of the probability of error 247 | This only makes sense if X and Y have the same number of 248 | bins, and the bin of X is a prediction of the bin of Y. 249 | 250 | -------------------------------------------------------------------------------- 251 | */ 252 | 253 | double MutualInformationDiscrete::hPe ( short int *bins_x ) 254 | { 255 | int i, err ; 256 | double p ; 257 | 258 | err = 0 ; 259 | for (i=0 ; i nbins_x) 291 | nbins_x = bins_x[i] ; 292 | } 293 | ++nbins_x ; // Number of bins is one greater than max bin because org=0 294 | 295 | /* 296 | Compute the marginal of x and the error counts 297 | */ 298 | 299 | marginal_x = (int *) malloc ( nbins_x * sizeof(int) ) ; 300 | assert (marginal_x != NULL) ; 301 | 302 | error_count = (int *) malloc ( nbins_x * sizeof(int) ) ; 303 | assert ( error_count != NULL ) ; 304 | 305 | for (ix=0 ; ix 0 && error_count[ix] < marginal_x[ix]) { 324 | pyx = (double) error_count[ix] / (double) marginal_x[ix] ; 325 | CI += (pyx * log(pyx) + (1.0-pyx) * log(1.0-pyx)) * marginal_x[ix] / ncases ; 326 | } 327 | } 328 | 329 | free ( marginal_x ) ; 330 | free ( error_count ) ; 331 | 332 | return -CI ; 333 | } 334 | 335 | /* 336 | -------------------------------------------------------------------------------- 337 | 338 | HYe ( bins_x ) - Compute the minimum (over bins of X) conditional entropy 339 | H(Y|error,X). In other words, for each X bin,compute the 340 | conditional entropy of Y given that this X is an incorrect 341 | decision. Return the minimum of this value across X bins. 342 | This only makes sense if X and Y have the same number of 343 | bins, and the bin of X is a prediction of the bin of Y. 344 | 345 | -------------------------------------------------------------------------------- 346 | */ 347 | 348 | double MutualInformationDiscrete::HYe ( short int *bins_x ) 349 | { 350 | int i, ix, iy, nbins_x, nerr, *grid, *marginal_x ; 351 | double minCI, pyx, cix ; 352 | 353 | /* 354 | Compute the number of bins 355 | */ 356 | 357 | nbins_x = 0 ; 358 | for (i=0 ; i nbins_x) 360 | nbins_x = bins_x[i] ; 361 | } 362 | ++nbins_x ; // Number of bins is one greater than max bin because org=0 363 | 364 | /* 365 | This algorithm makes sense only if nbins_x equals nbins_y. 366 | Return an error flag that will get the user's attention if this is violated. 367 | */ 368 | 369 | if (nbins_x != nbins_y) 370 | return -1.e60 ; 371 | 372 | /* 373 | Compute the marginal of x and the counts in the nbins_x by nbins_y grid 374 | */ 375 | 376 | marginal_x = (int *) malloc ( nbins_x * sizeof(int) ) ; 377 | assert (marginal_x != NULL) ; 378 | 379 | grid = (int *) malloc ( nbins_x * nbins_y * sizeof(int) ) ; 380 | assert ( grid != NULL ) ; 381 | 382 | for (ix=0 ; ix 0) { 407 | cix = 0.0 ; 408 | for (iy=0 ; iy 0.0) 413 | cix -= pyx * log ( pyx ) ; 414 | } 415 | if (cix < minCI) 416 | minCI = cix ; 417 | } 418 | } 419 | 420 | free ( marginal_x ) ; 421 | free ( grid ) ; 422 | 423 | return minCI ; 424 | } 425 | -------------------------------------------------------------------------------- /PART.CPP: -------------------------------------------------------------------------------- 1 | /******************************************************************************/ 2 | /* */ 3 | /* PART - Partition an array into roughly equal size bins, avoiding ties */ 4 | /* */ 5 | /* I make no special claims of optimality for this algorithm, largely */ 6 | /* because there is no single optimality criterion! All algorithms involve */ 7 | /* tradeoffs. However, I am reasonably certain that this algorithm has two */ 8 | /* valuable properties: */ 9 | /* */ 10 | /* 1) If the user inputs npart at least as large as the number of distinct */ 11 | /* values in the dataset, npart will be returned equal to the number of */ 12 | /* distinct values in the dataset, and each bin will correspond exactly */ 13 | /* to a distinct value. */ 14 | /* */ 15 | /* 2) If the data has few or no ties, and the user inputs npart much less */ 16 | /* than n, the dataset will be partitioned into npart bins, all of which */ 17 | /* have equal or very nearly equal size. */ 18 | /* */ 19 | /******************************************************************************/ 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | extern void qsortdsi ( int first , int last , double *data , int *slave ) ; 28 | 29 | void partition ( 30 | int n , // Input: Number of cases in the data array 31 | double *data , // Input: The data array 32 | int *npart , // Input/Output: Number of partitions to find; Returned as 33 | // actual number of partitions, which happens if massive ties 34 | double *bnds , // Output: Upper bound (inclusive) of each partition 35 | // If the user inputs this NULL, bounds are not returned 36 | short int *bins // Output: Bin id (0 through npart-1) for each case 37 | ) 38 | { 39 | int i, j, k, np, *ix, *indices, *bin_end, ibound, tie_found ; 40 | int istart, istop, nleft, nright, nbest, ibound_best, isplit_best ; 41 | double *x ; 42 | 43 | if (*npart > n) // Defend against a careless user 44 | *npart = n ; 45 | 46 | np = *npart ; // Will be number of partitions 47 | 48 | x = (double *) malloc ( n * sizeof(double) ) ; 49 | ix = (int *) malloc ( n * sizeof(int) ) ; 50 | indices = (int *) malloc ( n * sizeof(int) ) ; 51 | bin_end = (int *) malloc ( np * sizeof(int) ) ; 52 | 53 | /* 54 | Sort the data and compute an integer rank array that identifies ties. 55 | We could use the x array, but the code later will run faster if it can 56 | work with integers instead of reals. 57 | Also keep the indices of the original data points, as we will need this 58 | information at the end of this code to assign cases to bins. 59 | */ 60 | 61 | for (i=0 ; i= 1.e-12 * (1.0 + fabs(x[i]) + fabs(x[i-1]))) 71 | ++k ; // If not a tie, advance the counter of unique values 72 | ix[i] = k ; 73 | } 74 | 75 | /* 76 | Compute initial bounds based strictly on equal number of cases in each bin. 77 | Ignore ties for now. 78 | */ 79 | 80 | k = 0 ; // Will be start of next bin up 81 | for (i=0 ; i nbest) { 131 | nbest = nleft ; 132 | ibound_best = ibound ; 133 | isplit_best = i ; 134 | } 135 | } 136 | else { 137 | if (nright > nbest) { 138 | nbest = nright ; 139 | ibound_best = ibound ; 140 | isplit_best = i ; 141 | } 142 | } 143 | } 144 | istart = istop + 1 ; 145 | } // For all bounds, looking for the best bin to split 146 | 147 | // The search is done. It may (rarely) be the case that no further 148 | // splits are possible. This will happen if the user requests more 149 | // partitions than there are unique values in the dataset. 150 | // We know that this has happened if nbest is still -1. In this case 151 | // we (obviously) cannot do a split to make up for the one lost above. 152 | 153 | if (nbest < 0) 154 | continue ; 155 | 156 | // We get here when the best split of an existing partition has been 157 | // found. Save it. The bin that we are splitting is ibound_best, 158 | // and the split for a new bound is at isplit_best. 159 | 160 | for (ibound=np-1 ; ibound>=ibound_best ; ibound--) 161 | bin_end[ibound+1] = bin_end[ibound] ; 162 | bin_end[ibound_best] = isplit_best ; 163 | ++np ; 164 | 165 | } // Endless search loop 166 | 167 | /* 168 | The partition bounds are found. 169 | Return them to the user if requested. 170 | */ 171 | 172 | *npart = np ; // Return the final number of partitions 173 | 174 | if (bnds != NULL) { // Does the user want the boundary values? 175 | for (ibound=0 ; ibound 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | #if ! defined ( PI ) 20 | #define PI 3.141592653589793 21 | #endif 22 | 23 | extern double inverse_normal_cdf ( double p ) ; 24 | extern void qsortdsi ( int first , int last , double *data , int *slave ) ; 25 | 26 | class CubicSpline { 27 | 28 | public: 29 | CubicSpline ( int n , double *xin , double *yin ) ; 30 | ~CubicSpline () ; 31 | double evaluate ( double x ) ; 32 | 33 | private: 34 | int n ; 35 | double *x ; 36 | double *y ; 37 | double *y2 ; 38 | } ; 39 | 40 | class Bilinear { 41 | 42 | public: 43 | Bilinear ( int nxin , double *xin , int nyin , double *yin , double *zin , 44 | int extra ) ; 45 | ~Bilinear () ; 46 | double evaluate ( double x , double y ) ; 47 | 48 | private: 49 | int quadratic ; 50 | int nx ; 51 | int ny ; 52 | double *x ; 53 | double *y ; 54 | double *z ; 55 | } ; 56 | 57 | /* 58 | -------------------------------------------------------------------------------- 59 | 60 | ParzenDensities ParzDens_? used for continuous mutual information 61 | 62 | -------------------------------------------------------------------------------- 63 | */ 64 | 65 | class ParzDens_1 { 66 | 67 | public: 68 | ParzDens_1 ( int n_tset , double *tset , int n_div ) ; 69 | ~ParzDens_1 () ; 70 | double density ( double x ) ; 71 | double low ; // Lowest value with significant density 72 | double high ; // And highest 73 | 74 | private: 75 | int nd ; // Number of points in array below 76 | double *d ; // The data on which the density is based 77 | double var ; // Presumed variance 78 | double factor ; // Normalizing factor to make it a density 79 | CubicSpline *spline ; // Used only if interpolation 80 | } ; 81 | 82 | class ParzDens_2 { 83 | 84 | public: 85 | ParzDens_2 ( int n_tset , double *tset0 , double *tset1 , int n_div ) ; 86 | ~ParzDens_2 () ; 87 | double density ( double x0 , double x1 ) ; 88 | 89 | private: 90 | int nd ; // Number of points in arrays below 91 | double *d0 ; // The data on which the density is based; first variable 92 | double *d1 ; // And second 93 | double var0 ; // Presumed variance of first variable 94 | double var1 ; // And second 95 | double factor ; // Normalizing factor to make it a density 96 | Bilinear *bilin ; // Used only for bilinear interpolation 97 | } ; 98 | 99 | class ParzDens_3 { 100 | 101 | public: 102 | ParzDens_3 ( int n_tset , double *tset0 , double *tset1 , double *tset2 , int n_div ) ; 103 | ~ParzDens_3 () ; 104 | double density ( double x0 , double x1 , double x2 ) ; 105 | 106 | private: 107 | int nd ; // Number of points in arrays below 108 | double *d0 ; // The data on which the density is based; first variable 109 | double *d1 ; // And second 110 | double *d2 ; // And third 111 | double var0 ; // Presumed variance of first variable 112 | double var1 ; // And second 113 | double var2 ; // And third 114 | double factor ; // Normalizing factor to make it a density 115 | } ; 116 | 117 | /* 118 | -------------------------------------------------------------------------------- 119 | 120 | MutualInformation 121 | 122 | -------------------------------------------------------------------------------- 123 | */ 124 | 125 | class MutualInformationParzen { // Parzen window method 126 | 127 | public: 128 | MutualInformationParzen ( int nn , double *dep_vals , int ndiv ) ; 129 | ~MutualInformationParzen () ; 130 | double mut_inf ( double *x ) ; 131 | 132 | private: 133 | int n ; // Number of cases 134 | int n_div ; // Number of divisions of range, typically 5-10 135 | double *depvals ; // 'Dependent' variable 136 | ParzDens_1 *dens_dep ; // Marginal density of 'dependent' variable 137 | } ; 138 | 139 | class MutualInformationAdaptive { // Adaptive partitioning method 140 | 141 | public: 142 | MutualInformationAdaptive ( int nn , double *dep_vals , 143 | int respect_ties , double crit ) ; 144 | ~MutualInformationAdaptive () ; 145 | double mut_inf ( double *x , int respect_ties ) ; 146 | 147 | private: 148 | int n ; // Number of cases 149 | int *y ; // 'Dependent' variable ranks 150 | int *y_tied ; // tied[i] != 0 if case with rank i == case with rank i+1 151 | double chi_crit ; // Chi-square test criterion 152 | } ; 153 | 154 | 155 | /* 156 | -------------------------------------------------------------------------------- 157 | 158 | ParzDens_1 - Parzen density of a single variable 159 | 160 | -------------------------------------------------------------------------------- 161 | */ 162 | 163 | ParzDens_1::ParzDens_1 ( int n_tset , double *tset , int n_div ) 164 | { 165 | int i, j, *indices ; 166 | double std, *x, *y, xbot, xinc, diff, sum ; 167 | 168 | nd = n_tset ; 169 | spline = NULL ; 170 | 171 | d = (double *) malloc ( nd * sizeof(double) ) ; 172 | 173 | indices = (int *) malloc ( nd * sizeof(int) ) ; 174 | 175 | /* 176 | Convert the data to a normal distribution 177 | */ 178 | 179 | for (i=0 ; ievaluate ( x ) ; 247 | 248 | sum = 0.0 ; 249 | for (i=0 ; ievaluate ( x0 , x1 ) ; 397 | 398 | sum = 0.0 ; 399 | for (i=0 ; i 9 | 10 | extern int brentmin ( int itmax , double critlim , double eps , 11 | double tol , int (*criter) (double , double *) , 12 | double *x1 , double *x2 , double *x3 , double *y ) ; 13 | extern int glob_min ( double low , double high , int npts , int log_space , 14 | double critlim , int (*criter) (double , double *) , 15 | double *x1, double *y1 , double *x2, double *y2 , 16 | double *x3, double *y3 ) ; 17 | extern int user_pressed_escape () ; 18 | 19 | /* 20 | This routine uses the general univariate minimizers 'glob_min' and 21 | 'brentmin' to minimize along the gradient line. So we must have a local 22 | function for them to call, and it must have access to the relevant data. 23 | These statics handle that. 24 | */ 25 | 26 | static int univar_crit ( double t , double *fval ) ; // Local univariate criterion 27 | static double *local_x, *local_base, *local_direc ; // It uses these 28 | static int local_n ; 29 | static int (*local_criter) ( double *xvec , double *fval ) ; 30 | 31 | int powell ( 32 | double scale , // Used for size of initial search by glob_min() 33 | int maxits , // Iteration limit (0 for no limit) 34 | double critlim , // Quit if crit drops this low 35 | double tol , // Convergence tolerance 36 | int (*criter) ( double * , double * ) , // Criterion func 37 | int n , // Number of variables 38 | double *x , // In/out of independent variable 39 | double *y , // In/out of function value 40 | double *base , // Work vector n long 41 | double *p0 , // Work vector n long 42 | double *direc , // Work vector n*n long 43 | int update_progress // Call setpos_progress_message() to update progress bar? 44 | ) 45 | { 46 | int i, j, idir, iter, user_quit, convergence_counter, idelta, replaced ; 47 | double fval, fbest, f0, test, t1, t2, t3, y1, y2, y3 ; 48 | double prev_best, toler, delta, len, ftest, mult ; 49 | 50 | t2 = 0.0 ; // Not needed. Shuts up LINT. 51 | 52 | /* 53 | Initialize for the local univariate criterion which may be called by 54 | 'glob_min' and 'brentmin' to minimize along the search direction. 55 | */ 56 | 57 | 58 | local_x = x ; 59 | local_base = base ; 60 | local_n = n ; 61 | local_criter = criter ; 62 | 63 | /* 64 | Initialize the direction matrix to be an identity. 65 | */ 66 | 67 | for (i=0 ; i= maxits) && (maxits > 0)) 84 | break ; 85 | 86 | if (fbest < critlim) // Do we satisfy user yet? 87 | break ; 88 | 89 | /* 90 | Convergence check 91 | */ 92 | 93 | if (fabs(prev_best) <= 1.0) // If the function is small 94 | toler = tol ; // Work on absolutes 95 | else // But if it is large 96 | toler = tol * fabs(prev_best) ; // Keep things relative 97 | 98 | if ((prev_best - fbest) <= toler) { // If little improvement 99 | if (++convergence_counter >= 2) // Then count how many 100 | break ; // And quit if too many 101 | } 102 | else // But a good iteration 103 | convergence_counter = 0 ; // Resets this counter 104 | 105 | if (fbest < prev_best) // Always true if well behaved 106 | prev_best = fbest ; 107 | 108 | /* 109 | Does the user want to quit? 110 | */ 111 | 112 | if ((user_quit = user_pressed_escape ()) != 0) 113 | break ; 114 | 115 | /* 116 | Loop over all search directions, minimizing in each. 117 | Keep track of the direction that gave the most improvement. 118 | For efficiency, we keep 'replaced' as the vector that just got replaced 119 | (or -1 if no replacement was done). Skip the first direction if that 120 | was the one just replaced! 121 | */ 122 | 123 | for (i=0 ; i1) && ! idir && ! replaced) // If we just replaced the 131 | continue ; // first vector, avoid waste 132 | for (i=0 ; i delta) { // Keep track of best direction 175 | delta = fbest - fval ; 176 | idelta = idir ; 177 | } 178 | fbest = fval ; // This is always the best so far 179 | } // For all directions 180 | 181 | /* 182 | Before looping through all n directions, we stood at point p0 with f=f0. 183 | We now stand at point x with f=fbest. 184 | It is quite possible that the average direction of motion points right 185 | along a ravine. Thus, it behooves us to step out in that direction. 186 | Try it. We might luck out. 187 | */ 188 | 189 | for (i=0 ; i 8 | 9 | void qsortd ( int first , int last , double *data ) 10 | { 11 | int lower, upper ; 12 | double ftemp, split ; 13 | 14 | split = data[(first+last)/2] ; 15 | lower = first ; 16 | upper = last ; 17 | 18 | do { 19 | while ( split > data[lower] ) 20 | ++lower ; 21 | while ( split < data[upper] ) 22 | --upper ; 23 | if (lower == upper) { 24 | ++lower ; 25 | --upper ; 26 | } 27 | else if (lower < upper) { 28 | ftemp = data[lower] ; 29 | data[lower++] = data[upper] ; 30 | data[upper--] = ftemp ; 31 | } 32 | } while ( lower <= upper ) ; 33 | 34 | if (first < upper) 35 | qsortd ( first , upper , data ) ; 36 | if (lower < last) 37 | qsortd ( lower , last , data ) ; 38 | } 39 | 40 | void qsortds ( int first , int last , double *data , double *slave ) 41 | { 42 | int lower, upper ; 43 | double ftemp, split ; 44 | 45 | split = data[(first+last)/2] ; 46 | lower = first ; 47 | upper = last ; 48 | 49 | do { 50 | while ( split > data[lower] ) 51 | ++lower ; 52 | while ( split < data[upper] ) 53 | --upper ; 54 | if (lower == upper) { 55 | ++lower ; 56 | --upper ; 57 | } 58 | else if (lower < upper) { 59 | ftemp = slave[lower] ; 60 | slave[lower] = slave[upper] ; 61 | slave[upper] = ftemp ; 62 | ftemp = data[lower] ; 63 | data[lower++] = data[upper] ; 64 | data[upper--] = ftemp ; 65 | } 66 | } while ( lower <= upper ) ; 67 | 68 | if (first < upper) 69 | qsortds ( first , upper , data , slave ) ; 70 | if (lower < last) 71 | qsortds ( lower , last , data , slave ) ; 72 | } 73 | 74 | void qsortdsi ( int first , int last , double *data , int *slave ) 75 | { 76 | int lower, upper, itemp ; 77 | double ftemp, split ; 78 | 79 | split = data[(first+last)/2] ; 80 | lower = first ; 81 | upper = last ; 82 | 83 | do { 84 | while ( split > data[lower] ) 85 | ++lower ; 86 | while ( split < data[upper] ) 87 | --upper ; 88 | if (lower == upper) { 89 | ++lower ; 90 | --upper ; 91 | } 92 | else if (lower < upper) { 93 | itemp = slave[lower] ; 94 | slave[lower] = slave[upper] ; 95 | slave[upper] = itemp ; 96 | ftemp = data[lower] ; 97 | data[lower++] = data[upper] ; 98 | data[upper--] = ftemp ; 99 | } 100 | } while ( lower <= upper ) ; 101 | 102 | if (first < upper) 103 | qsortdsi ( first , upper , data , slave ) ; 104 | if (lower < last) 105 | qsortdsi ( lower , last , data , slave ) ; 106 | } -------------------------------------------------------------------------------- /RANDOM.CPP: -------------------------------------------------------------------------------- 1 | /******************************************************************************/ 2 | /* */ 3 | /* RANDOM - Assorted non-uniform random number generators. */ 4 | /* They all call an external uniform generator, unifrand(). */ 5 | /* */ 6 | /* normal () - Normal (mean zero, unit variance) */ 7 | /* normal_pair ( double *x1 , double *x2 ) - Pair of standard normals */ 8 | /* beta ( int v1 , int v2 ) - Beta with parameters v1 / 2 and v2 / v2 */ 9 | /* rand_sphere ( int nvars , double *x ) - Uniform on unit sphere surface */ 10 | /* cauchy ( int n , double scale , double *x ) - Multivariate Cauchy */ 11 | /* */ 12 | /******************************************************************************/ 13 | 14 | #include 15 | 16 | #if ! defined ( PI ) 17 | #define PI 3.141592653589793 18 | #endif 19 | 20 | extern double unifrand () ; 21 | 22 | /* 23 | -------------------------------------------------------------------------------- 24 | 25 | Generate a standard normal random variable or a pair of them 26 | using the Box-Muller method. 27 | 28 | -------------------------------------------------------------------------------- 29 | */ 30 | 31 | double normal () 32 | { 33 | double x1, x2 ; 34 | 35 | for (;;) { 36 | x1 = unifrand () ; 37 | if (x1 <= 0.0) // Safety: log(0) is undefined 38 | continue ; 39 | x1 = sqrt ( -2.0 * log ( x1 )) ; 40 | x2 = cos ( 2.0 * PI * unifrand () ) ; 41 | return x1 * x2 ; 42 | } 43 | } 44 | 45 | void normal_pair ( double *x1 , double *x2 ) 46 | { 47 | double u1, u2 ; 48 | 49 | for (;;) { 50 | u1 = unifrand () ; 51 | if (u1 <= 0.0) // Safety: log(0) is undefined 52 | continue ; 53 | u1 = sqrt ( -2.0 * log ( u1 )) ; 54 | u2 = 2.0 * PI * unifrand () ; 55 | *x1 = u1 * sin ( u2 ) ; 56 | *x2 = u1 * cos ( u2 ) ; 57 | return ; 58 | } 59 | } 60 | 61 | /* 62 | -------------------------------------------------------------------------------- 63 | 64 | Generate a Gamma random variable having parameter v/2 65 | 66 | -------------------------------------------------------------------------------- 67 | */ 68 | 69 | double gamma ( int v ) 70 | { 71 | double x, y, z, vm1, root ; 72 | 73 | switch (v) { 74 | 75 | case 1: // Chi-square with 1 df is 2 gamma(.5) 76 | x = normal () ; 77 | return 0.5 * x * x ; 78 | 79 | case 2: // Gamma(1) is exponential(1) 80 | for (;;) { 81 | x = unifrand () ; 82 | if (x > 0.0) 83 | return -log ( x ) ; 84 | } 85 | 86 | default: // Valid for all real a>1 (a=v/2) 87 | vm1 = 0.5 * v - 1.0 ; 88 | root = sqrt ( v - 1.0 ) ; 89 | 90 | for (;;) { 91 | y = tan ( PI * unifrand () ) ; 92 | x = root * y + vm1 ; 93 | if (x <= 0.0) 94 | continue ; 95 | z = (1.0 + y * y) * exp ( vm1 * log(x/vm1) - root * y ) ; 96 | if (unifrand () <= z) 97 | return x ; 98 | } 99 | } 100 | } 101 | 102 | 103 | /* 104 | -------------------------------------------------------------------------------- 105 | 106 | Generate a beta random variable with parameters v1 / 2 and v2 / 2. 107 | 108 | -------------------------------------------------------------------------------- 109 | */ 110 | 111 | double beta ( int v1 , int v2 ) 112 | { 113 | double x1, x2 ; 114 | 115 | x1 = gamma ( v1 ) ; 116 | x2 = gamma ( v2 ) ; 117 | 118 | return x1 / (x1 + x2) ; 119 | } 120 | 121 | /* 122 | -------------------------------------------------------------------------------- 123 | 124 | Generate a random point on an n-sphere. 125 | 126 | -------------------------------------------------------------------------------- 127 | */ 128 | 129 | void rand_sphere ( int nvars , double *x ) 130 | { 131 | int i ; 132 | double length ; 133 | 134 | length = 0.0 ; 135 | for (i=0 ; i 8 | #include 9 | 10 | extern void qsortds ( int first , int last , double *data , double *slave ) ; 11 | 12 | double spearman ( // Returns rho in range -1 to 1 13 | int n , // Input: Number of cases 14 | double *var1 , // Input: One variable 15 | double *var2 , // Input: Other variable 16 | double *x , // Work vector n long 17 | double *y // Work vector n long 18 | ) 19 | { 20 | int j, k, ntied ; 21 | double val, x_tie_correc, y_tie_correc ; 22 | double dn, ssx, ssy, rank, diff, rankerr, rho ; 23 | 24 | // We need to rearrange input vectors, so copy them to work vectors 25 | // To avoid disturbing the caller 26 | if (x != var1) 27 | memcpy ( x , var1 , n * sizeof(double) ) ; 28 | if (y != var2) 29 | memcpy ( y , var2 , n * sizeof(double) ) ; 30 | 31 | // Compute ties in x, compute correction as SUM ( ties**3 - ties ) 32 | // The following routine sorts x ascending and simultaneously moves y 33 | qsortds ( 0 , n-1 , x , y ) ; 34 | x_tie_correc = 0.0 ; 35 | for (j=0 ; j val) 39 | break ; 40 | } 41 | ntied = k - j ; 42 | x_tie_correc += (double) ntied * ntied * ntied - ntied ; 43 | rank = 0.5 * ((double) j + (double) k + 1.0) ; 44 | while (j < k) 45 | x[j++] = rank ; 46 | } // For each case in sorted x array 47 | 48 | // Now do same for y 49 | qsortds ( 0 , n-1 , y , x ) ; 50 | y_tie_correc = 0.0 ; 51 | for (j=0 ; j val) 55 | break ; 56 | } 57 | ntied = k - j ; 58 | y_tie_correc += (double) ntied * ntied * ntied - ntied ; 59 | rank = 0.5 * ((double) j + (double) k + 1.0) ; 60 | while (j < k) 61 | y[j++] = rank ; 62 | } // For each case in sorted y array 63 | 64 | // Final computations 65 | dn = n ; 66 | ssx = (dn * dn * dn - dn - x_tie_correc) / 12.0 ; 67 | ssy = (dn * dn * dn - dn - y_tie_correc) / 12.0 ; 68 | rankerr = 0.0 ; 69 | for (j=0 ; j 8 | #include 9 | #include 10 | #include 11 | 12 | extern void qsortds ( int first , int last , double *x , double *y ) ; 13 | 14 | class CubicSpline { 15 | 16 | public: 17 | CubicSpline ( int n , double *xin , double *yin ) ; 18 | ~CubicSpline () ; 19 | double evaluate ( double x ) ; 20 | 21 | private: 22 | int n ; 23 | double *x ; 24 | double *y ; 25 | double *y2 ; 26 | } ; 27 | 28 | 29 | CubicSpline::CubicSpline ( 30 | int nin , // Number of input points 31 | double *xin , // They are here, not necessarily sorted 32 | double *yin 33 | ) 34 | { 35 | int i ; 36 | double temp, p, *c ; 37 | 38 | n = nin ; 39 | c = (double *) malloc ( n * sizeof(double) ) ; 40 | x = (double *) malloc ( n * sizeof(double) ) ; 41 | y = (double *) malloc ( n * sizeof(double) ) ; 42 | y2 = (double *) malloc ( n * sizeof(double) ) ; 43 | 44 | assert ( c != NULL ) ; 45 | assert ( x != NULL ) ; 46 | assert ( y != NULL ) ; 47 | assert ( y2 != NULL ) ; 48 | 49 | memcpy ( x , xin , n * sizeof(double) ) ; 50 | memcpy ( y , yin , n * sizeof(double) ) ; 51 | qsortds ( 0 , n-1 , x , y ) ; 52 | 53 | y2[0] = c[0] = 0.0 ; 54 | 55 | for (i=1 ; i=0 ; i--) 66 | y2[i] = y2[i] * y2[i+1] + c[i] ; 67 | 68 | free ( c ) ; 69 | } 70 | 71 | CubicSpline::~CubicSpline () 72 | { 73 | free ( x ) ; 74 | free ( y ) ; 75 | free ( y2 ) ; 76 | } 77 | 78 | double CubicSpline::evaluate ( double xpt ) 79 | { 80 | int k, klo, khi ; 81 | double dist, a, b, aa, bb, val ; 82 | 83 | if (xpt < x[0]) 84 | return y[0] ; 85 | 86 | if (xpt > x[n-1]) 87 | return y[n-1] ; 88 | 89 | klo = 0 ; 90 | khi = n - 1 ; 91 | 92 | while (khi > klo+1) { 93 | k = (khi + klo) / 2 ; 94 | if (xpt < x[k]) 95 | khi = k ; 96 | else 97 | klo = k ; 98 | } 99 | 100 | dist = x[khi] - x[klo] + 1.e-60 ; 101 | a = (x[khi] - xpt) / dist ; 102 | b = (xpt - x[klo]) / dist ; 103 | aa = a * (a * a - 1.0) ; 104 | bb = b * (b * b - 1.0) ; 105 | 106 | val = (aa * y2[klo] + bb * y2[khi]) * dist * dist / 6.0 ; 107 | return a * y[klo] + b * y[khi] + val ; 108 | } 109 | -------------------------------------------------------------------------------- /SVDCMP.CPP: -------------------------------------------------------------------------------- 1 | /******************************************************************************/ 2 | /* */ 3 | /* SVDCMP - SingularValueDecomp class for computing the singular value */ 4 | /* decomposition of a rectangular matrix having at least as many */ 5 | /* rows as columns. */ 6 | /* This also includes a back-substitution routine for computing */ 7 | /* solutions to linear systems. */ 8 | /* */ 9 | /* This is based on the implementation in Press "Numerical Recipes" */ 10 | /* with several bug fixes. */ 11 | /* */ 12 | /******************************************************************************/ 13 | 14 | #include 15 | #include 16 | #include 17 | #include 18 | 19 | /* 20 | -------------------------------------------------------------------------------- 21 | 22 | SingularValueDecomp - Singular value decomposition 23 | 24 | The following steps are needed to compute a least-squares solution 25 | to a (possibly overdetermined) linear system: 26 | 1) Create a SingularValueDecomp object. The constructor will allocate 27 | memory for the design matrix 'a', the right-hand-side 'b', and all 28 | scratch memory that it needs. Optionally, the user can flag the 29 | constructor to preserve 'a' and return the decomposition in 'u'. 30 | Normally, 'a' is overwritten. 31 | 2) The design matrix must be placed in 'a' and svdcmp called. 32 | 3) Place the right-hand-side in 'b' 33 | 4) Allocate a vector where the solution is to be placed. 34 | Call backsub with a pointer to this vector. 35 | 36 | -------------------------------------------------------------------------------- 37 | */ 38 | 39 | class SingularValueDecomp { 40 | 41 | public: 42 | 43 | SingularValueDecomp ( int nrows , int ncols , int save_a=0 ) ; 44 | ~SingularValueDecomp () ; 45 | void svdcmp () ; 46 | void backsub ( double limit , double *soln ) ; 47 | 48 | int ok ; // Was everything legal and allocs successful? 49 | 50 | /* 51 | These are made public to allow access if desired. 52 | Normally, only 'a' (the design matrix) and 'b' (the right-hand-side) 53 | are written by the user. If 'save_a' is nonzero, 'a' is kept intact. 54 | */ 55 | 56 | double *a ; // nrows by ncols input of design, output of U 57 | double *u ; // unless save_a nonzero, in which case U output in 'u' 58 | double *w ; // Unsorted ncols vector of singular values 59 | double *v ; // Ncols by ncols output of 'v' 60 | double *b ; // Nrows right-hand-side for backsub 61 | 62 | 63 | private: 64 | 65 | void bidiag ( double *matrix ) ; 66 | double bid1 ( int col , double *matrix , double scale ) ; 67 | double bid2 ( int col , double *matrix , double scale ) ; 68 | void right ( double *matrix ) ; 69 | void left ( double *matrix ) ; 70 | void cancel ( int low , int high , double *matrix ) ; 71 | void qr ( int low , int high , double *matrix ) ; 72 | void qr_mrot ( int col , double sine , double cosine , double *matrix ) ; 73 | void qr_vrot ( int col , double sine , double cosine ) ; 74 | 75 | int rows ; // Nrows preserved here 76 | int cols ; // And ncols 77 | double *work ; // Scratch vector ncols long 78 | double norm ; // Norm of 'a' matrix 79 | } ; 80 | 81 | inline double root_ss ( double x , double y ) 82 | { 83 | double ratio ; 84 | if (x < 0.0) 85 | x = -x ; 86 | if (y < 0.0) 87 | y = -y ; 88 | // if (x >= y) { 89 | if (x > y) { // Bug fix 7/26/2012 90 | ratio = y / x ; 91 | return x * sqrt ( ratio * ratio + 1.0 ) ; 92 | } 93 | else if (y == 0.0) 94 | return 0.0 ; 95 | else { 96 | ratio = x / y ; 97 | return y * sqrt ( ratio * ratio + 1.0 ) ; 98 | } 99 | } 100 | 101 | /* 102 | -------------------------------------------------------------------------------- 103 | 104 | Constructor - Allocate input/output and scratch memory. 105 | Normally, this returns ok=1. If not, the user called it with 106 | more columns than rows, or there was insufficient memory. 107 | 108 | -------------------------------------------------------------------------------- 109 | */ 110 | 111 | SingularValueDecomp::SingularValueDecomp ( int nr , int nc , int save_a ) 112 | { 113 | if (nc > nr) { // Illegal 114 | rows = cols = ok = 0 ; 115 | return ; 116 | } 117 | 118 | a = (double *) malloc ( nr * nc * sizeof(double) ) ; 119 | w = (double *) malloc ( nc * sizeof(double) ) ; 120 | v = (double *) malloc ( nc * nc * sizeof(double) ) ; 121 | b = (double *) malloc ( nr * sizeof(double) ) ; 122 | work = (double *) malloc ( nc * sizeof(double) ) ; 123 | if (save_a) 124 | u = (double *) malloc ( nr * nc * sizeof(double) ) ; 125 | else 126 | u = NULL ; 127 | 128 | if ((a == NULL) || (w == NULL) || (v == NULL) || (b == NULL) || 129 | (work == NULL) || (save_a && (u == NULL))) { 130 | if (a != NULL) 131 | free ( a ) ; 132 | if (w != NULL) 133 | free ( w ) ; 134 | if (v != NULL) 135 | free ( v ) ; 136 | if (b != NULL) 137 | free ( b ) ; 138 | if (work != NULL) 139 | free ( work ) ; 140 | if (u != NULL) 141 | free ( u ) ; 142 | rows = cols = ok = 0 ; 143 | return ; 144 | } 145 | 146 | ok = 1 ; // Flag to user that all went well 147 | rows = nr ; 148 | cols = nc ; 149 | } 150 | 151 | /* 152 | -------------------------------------------------------------------------------- 153 | 154 | Destructor - Free memory 155 | 156 | -------------------------------------------------------------------------------- 157 | */ 158 | 159 | SingularValueDecomp::~SingularValueDecomp () 160 | { 161 | if (! ok) // If constructor's mallocs failed 162 | return ; // there is nothing to free 163 | 164 | free ( a ) ; 165 | free ( w ) ; 166 | free ( v ) ; 167 | free ( b ) ; 168 | free ( work ) ; 169 | if (u != NULL) 170 | free ( u ) ; 171 | } 172 | 173 | 174 | /* 175 | -------------------------------------------------------------------------------- 176 | 177 | svdcmp - Singular value decomposition of 'a' 178 | 179 | -------------------------------------------------------------------------------- 180 | */ 181 | 182 | void SingularValueDecomp::svdcmp () 183 | { 184 | int i, sval, split, iter_limit ; 185 | double *matrix ; 186 | 187 | if (u != NULL) { // Must we keep 'a' intact? 188 | memcpy ( u , a , rows * cols * sizeof(double) ) ; // If so, copy it 189 | matrix = u ; // And work on copy 190 | } 191 | else // If not, operate directly on 'a' 192 | matrix = a ; 193 | 194 | bidiag ( matrix ) ; // Reduce to bidiagonal 195 | right ( matrix ) ; // Accumulate right transforms 196 | left ( matrix ) ; // And left 197 | 198 | sval = cols ; 199 | while (sval--) { // Loop over the singular values in reverse order 200 | iter_limit = 50 ; 201 | while (iter_limit--) { // Avoid nearly endless loop (very rare!) 202 | split = sval + 1 ; 203 | while (--split) { // Keep splitting as long as possible 204 | if (norm + fabs (work[split]) == norm) { 205 | break ; 206 | } 207 | if (norm + fabs (w[split-1]) == norm) { 208 | cancel ( split , sval , matrix ) ; 209 | break ; 210 | } 211 | } 212 | if (split == sval) { // Converged? 213 | if (w[sval] < 0.0) { // Keep them nonnegative 214 | w[sval] = -w[sval] ; 215 | for (i=0 ; i 0.0) 249 | w[col] = scale * bid1 ( col , matrix , scale ) ; 250 | else 251 | w[col] = 0.0 ; 252 | 253 | scale = 0.0 ; 254 | for (k=col+1 ; k 0.0) 258 | temp = bid2 ( col , matrix , scale ) ; 259 | else 260 | temp = 0.0 ; 261 | 262 | testnorm = fabs (w[col]) + fabs (work[col]) ; 263 | if (testnorm > norm) 264 | norm = testnorm ; 265 | } 266 | } 267 | 268 | double SingularValueDecomp::bid1 ( int col , double *matrix , double scale ) 269 | { 270 | int i, j ; 271 | double diag, rv, fac, sum ; 272 | 273 | sum = 0.0 ; 274 | for (i=col ; i 0.0) 281 | rv = -rv ; 282 | fac = 1.0 / (diag * rv - sum) ; 283 | matrix[col*cols+col] = diag - rv ; 284 | 285 | for (j=col+1 ; j 0.0) 314 | rv = -rv ; 315 | 316 | matrix[col*cols+col+1] = diag - rv ; 317 | fac = 1.0 / (diag * rv - sum) ; 318 | for (i=col+1 ; i wmax)) 572 | wmax = w[i] ; 573 | } 574 | 575 | limit = limit * wmax + 1.e-60 ; 576 | 577 | /* 578 | Find U'b 579 | */ 580 | 581 | for (i=0 ; i limit) { 584 | for (j=0 ; j 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | extern double normal () ; 15 | extern double unifrand () ; 16 | 17 | class CubicSpline { 18 | 19 | public: 20 | CubicSpline ( int n , double *xin , double *yin ) ; 21 | ~CubicSpline () ; 22 | double evaluate ( double x ) ; 23 | 24 | private: 25 | int n ; 26 | double *x ; 27 | double *y ; 28 | double *y2 ; 29 | } ; 30 | 31 | class ParzDens_1 { 32 | 33 | public: 34 | ParzDens_1 ( int n_tset , double *tset , int n_div ) ; 35 | ~ParzDens_1 () ; 36 | double density ( double x ) ; 37 | double low ; // Lowest value with significant density 38 | double high ; // And highest 39 | 40 | private: 41 | int nd ; // Number of points in array below 42 | double *d ; // The data on which the density is based 43 | double var ; // Presumed variance 44 | double factor ; // Normalizing factor to make it a density 45 | CubicSpline *spline ; // Used only if interpolation 46 | } ; 47 | 48 | class MutualInformationParzen { // Parzen window method 49 | 50 | public: 51 | MutualInformationParzen ( int nn , double *dep_vals , int ndiv ) ; 52 | ~MutualInformationParzen () ; 53 | double mut_inf ( double *x ) ; 54 | 55 | private: 56 | int n ; // Number of cases 57 | int n_div ; // Number of divisions of range, typically 5-10 58 | double *depvals ; // 'Dependent' variable 59 | ParzDens_1 *dens_dep ; // Marginal density of 'dependent' variable 60 | } ; 61 | 62 | class MutualInformationAdaptive { // Adaptive partitioning method 63 | 64 | public: 65 | MutualInformationAdaptive ( int nn , double *dep_vals , 66 | int respect_ties , double crit ) ; 67 | ~MutualInformationAdaptive () ; 68 | double mut_inf ( double *x , int respect_ties ) ; 69 | 70 | private: 71 | int n ; // Number of cases 72 | int *y ; // 'Dependent' variable ranks 73 | int *y_tied ; // tied[i] != 0 if case with rank i == case with rank i+1 74 | double chi_crit ; // Chi-square test criterion 75 | } ; 76 | 77 | 78 | int main ( 79 | int argc , // Number of command line arguments (includes prog name) 80 | char *argv[] // Arguments (prog name is argv[0]) 81 | ) 82 | 83 | { 84 | int i, nsamps, ntries, ndiv, divisor, itry, respect_ties ; 85 | double corr, correct, ptie, *x, *y, x1, x2, result, prior_x1 ; 86 | double total_parzen, bias_parzen, stderr_parzen ; 87 | double total_adapt, bias_adapt, stderr_adapt ; 88 | double chi_test ; 89 | MutualInformationParzen *mi_parzen ; 90 | MutualInformationAdaptive *mi_adapt ; 91 | 92 | /* 93 | Process command line parameters 94 | */ 95 | 96 | #if 1 97 | if (argc != 8) { 98 | printf ( 99 | "\nUsage: TEST_CON nsamples ntries correlation ptie respect_ties ndiv chi_test" ) ; 100 | exit ( 1 ) ; 101 | } 102 | 103 | nsamps = atoi ( argv[1] ) ; 104 | ntries = atoi ( argv[2] ) ; 105 | corr = atof ( argv[3] ) ; 106 | ptie = atof ( argv[4] ) ; 107 | respect_ties = atoi ( argv[5] ) ; 108 | ndiv = atoi ( argv[6] ) ; 109 | chi_test = atof ( argv[7] ) ; 110 | #else 111 | nsamps = 101 ; 112 | ntries = 10 ; 113 | corr = 0.9 ; 114 | ptie = 0.0 ; 115 | respect_ties = 0 ; 116 | ndiv = 5 ; 117 | chi_test = 6.0 ; 118 | #endif 119 | 120 | if ((nsamps <= 0) || (ntries <= 0) || (corr < -1.0) || (corr > 1.0) 121 | || (ptie < 0.0) || (ptie > 1.0) || (ndiv < 2) || (chi_test < 0.0)) { 122 | printf ( 123 | "\nUsage: TEST_CON nsamples ntries correlation ptie respect_ties ndiv chi_test" ) ; 124 | exit ( 1 ) ; 125 | } 126 | 127 | 128 | /* 129 | Allocate memory and initialize 130 | */ 131 | 132 | divisor = ntries / 100 ; // This is for progress reports only 133 | if (divisor < 1) 134 | divisor = 1 ; 135 | 136 | x = (double *) malloc ( nsamps * sizeof(double) ) ; 137 | y = (double *) malloc ( nsamps * sizeof(double) ) ; 138 | 139 | 140 | /* 141 | Main outer loop does all tries 142 | */ 143 | 144 | correct = -0.5 * log ( 1.0 - corr * corr ) ; 145 | total_parzen = bias_parzen = stderr_parzen = 0.0 ; 146 | total_adapt = bias_adapt = stderr_adapt = 0.0 ; 147 | 148 | for (itry=1 ; itry<=ntries ; itry++) { 149 | 150 | if (((itry-1) % divisor) == 0) 151 | printf ( "\n\n\nTry %d of %d", itry, ntries ) ; 152 | 153 | prior_x1 = 0.5 ; // Arbitrary 154 | for (i=0 ; imut_inf ( x , respect_ties ) ; 174 | delete mi_adapt ; 175 | total_adapt += result ; 176 | bias_adapt += result - correct ; 177 | stderr_adapt += (result - correct) * (result - correct) ; 178 | 179 | mi_parzen = new MutualInformationParzen ( nsamps , y , ndiv ) ; 180 | result = mi_parzen->mut_inf ( x ) ; 181 | delete mi_parzen ; 182 | total_parzen += result ; 183 | bias_parzen += result - correct ; 184 | stderr_parzen += (result - correct) * (result - correct) ; 185 | 186 | if ((((itry-1) % divisor) == 0) 187 | || (itry == ntries) ) { // Don't do this every try! Too slow. 188 | printf ( "\nParzen Mean = %.5lf Bias = %.5lf StdErr = %.5lf", 189 | total_parzen/itry, bias_parzen/itry, sqrt ( stderr_parzen/itry )) ; 190 | printf ( "\nAdapt Mean = %.5lf Bias = %.5lf StdErr = %.5lf", 191 | total_adapt/itry, bias_adapt/itry, sqrt ( stderr_adapt/itry )) ; 192 | } 193 | 194 | if (_kbhit ()) { // Has the user pressed a key? 195 | if (_getch() == 27) // The ESCape key? 196 | break ; 197 | } 198 | 199 | } // For all tries 200 | 201 | free ( x ) ; 202 | free ( y ) ; 203 | return EXIT_SUCCESS ; 204 | } 205 | -------------------------------------------------------------------------------- /TRANS_ENT.CPP: -------------------------------------------------------------------------------- 1 | /******************************************************************************/ 2 | /* */ 3 | /* TRANS_ENT - Schreiber's transfer entropy (information transfer) */ 4 | /* */ 5 | /******************************************************************************/ 6 | 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | 13 | /* 14 | -------------------------------------------------------------------------------- 15 | 16 | We are given two series, x and y, each having n cases. 17 | It is assumed that p(y[i]) is a function of y[i-1], y[i-2], ..., y[i-yhist]. 18 | But does x[i-xlag], x[i-xlag-1], ..., x[i-xlag-xhist+1] influence the y transition 19 | probabilities? This function measures the extent to which this occurs. 20 | 21 | The traditional version has xlag=1, meaning that the value of x concurrent 22 | with y is not allowed to participate in influencing y. 23 | Many models want the historical x influence to come up to y, 24 | allowing concurrent influence. For this, xlag=0. 25 | This happens, for example, in developing model-based market trading systems 26 | in which the indicator/target data is such that indicators are computed 27 | based strictly on the past and targets strictly on the future. 28 | So the data already has X inherently lagged to Y, and you would not want 29 | to lag it still further. 30 | 31 | Note that we have nbins_x ^ xhist * nbins_y ^ (yhist+1) bins. 32 | In order to get decent probability estimates, these bins must contain 33 | a decent number of cases. The number of bins will blow up fast as 34 | xhist and yhist grow! Keep them small unless n is gigantic. 35 | 36 | Suppose 'a' represents the current y, 'b' represents y history, and 37 | 'c' represents x history. Then the information transfer is: 38 | 39 | SUM [ p(a,b,c) log ( p(a|b,c) / p(a|b) ] 40 | 41 | So it's a sum of logs, weighted by the probability of each possible outcome. 42 | The log term is the ratio of the conditional probability of the current y 43 | given both its history and x history, over the conditional given just 44 | its own (y) history. If c, the x history, has no impact, this ratio will 45 | be 1, and its log will be zero. 46 | 47 | Note that p(a|b,c) = p(a,b,c) / p(b,c) and p(a|b) = p(a,b) / p(b) 48 | 49 | To speed calculations, after cumulating p(a,b,c) we compute and save 50 | the marginals p(b,c), p(a,b), and p(b). 51 | 52 | Four work vectors must be supplied. 53 | Let nx = nbins_x ^ xhist and ny = nbins_y ^ yhist. The lengths are: 54 | counts = nx * ny * nbins_y 55 | ab = nbins_y * ny 56 | bc = nx * ny 57 | b = ny 58 | 59 | -------------------------------------------------------------------------------- 60 | */ 61 | 62 | double trans_ent ( 63 | int n , // Length of x and y 64 | int nbins_x , // Number of x bins. Beware if greater than 2. 65 | int nbins_y , // Ditto y 66 | short int *x , // Independent variable, which impacts y transitions 67 | short int *y , // Dependent variable 68 | int xlag , // Lag of most recent predictive x: 1 for traditional, 0 for concurrent 69 | int xhist , // Length of x history. At least 1; Beware if greater than 1. 70 | int yhist , // Ditto y 71 | int *counts , // Work vector (see comment above) 72 | double *ab , // Ditto 73 | double *bc , // Ditto 74 | double *b // Ditto 75 | ) 76 | { 77 | int i, j, nx, ny, nxy, istart, ix, iy, ia, total ; 78 | double p, trans, numer, denom ; 79 | 80 | /* 81 | Compute key constants. 82 | */ 83 | 84 | nx = nbins_x ; 85 | for (i=1 ; i istart) 104 | istart = yhist ; 105 | 106 | for (i=istart ; i