├── README.md
├── 4_OX129_ML_data_prep
├── 8_OX129_decision_curve_analysis
├── Licence.txt
├── 1_OX129_imputation
├── 2_OX129_Cox_models
├── 7_OX129_ML_model_evaluation
├── 3_OX129_Competing_risks_regression
├── 5_OX129_XGBoost
└── 6_OX129_neural_networks


/README.md:
--------------------------------------------------------------------------------
 1 | **  Please review the file 'Licence.txt' regarding the academic use license ** 
 2 | 
 3 | - 👋 Hi, I’m Ash Darnell-Fish @AshDF91
 4 | - 👀 Clinical Research Fellow (DPhil) at Uni of Oxford
 5 | - 🌱 Interested in clinical prediction modelling using statistical learning and machine learning
 6 | - 🐱‍🏍 Competing risks models, pseudovalues, model calibration, Bayesian optimisation, adapting ML models for time-to-event/competing risks data, assessing stability/transportability of models using meta-analytical approaches    
 7 | 
 8 | <!---
 9 | AshDF91/AshDF91 is a ✨ special ✨ repository because its `README.md` (this file) appears on your GitHub profile.
10 | You can click the Preview link to take a look at your changes.
11 | --->
12 | 


--------------------------------------------------------------------------------
/4_OX129_ML_data_prep:
--------------------------------------------------------------------------------
 1 | ## Preparing multiply imputed data for machine learning modelling ##
 2 | 
 3 | ## This is undertaken in R, so need to prepare Stata dataset to be usable ##
 4 | 
 5 | ## The approach is centered on stacked imputations - a stack of all 50 imputations is formed, 
 6 | ## to generate 1 long dataset, with each individual assigned a weight of 0.02 (1/50) ##
 7 | 
 8 | ## The outcome variables are jack-knife pseudovalues, so these need to be generated too ##
 9 | 
10 | 
11 | *****************************************************    
12 | *	Prepare for ML modelling – calculate pseudovalues * 
13 | *****************************************************
14 | use "\final_datasets\OX129_endpoint3_pseudovaluescif.dta", clear           // data used in competing risks modelling 
15 | 
16 | * convert to long form - m=0 at top, then stacked imputations m=1,2,3...50 below in sequence *   // m=0 is the complete case dataset 
17 | mi convert flong 
18 | 
19 | * drop the m=0, i.e. non-imputed data *           // in flong format, _mi_m variable is generated to denote imputed set. E.g. _mi_m==10 is the 10th imputation
20 | tab _mi_m  
21 | * This is the variable to denote which imputation each row 'belongs' to * 
22 | drop if _mi_m==0
23 | 
24 | * Stack imputations* 
25 | * For purposes of later modelling in R - sort by sha1 (geographical region) then by patid, then by imputation number * 
26 | * This will enable the predictions to be matched to each individual when IECV runs on the stacked dataset * 
27 | sort sha1 patid _mi_m
28 | 
29 | * Remove the mi setting - this makes a 'clean' simple stacked dataset * 
30 | mi unset, asis 
31 | rename _mi_m imputation                // To track numbers if needed 
32 | 
33 | * Assign weight to each bservation for use in DL/XGBoost in R - each individual 
34 | * has a weight of 1/50, so that all imputations together 'equal' 1 patient * 
35 | gen imp_weight = 1/50 
36 | summ imp_weight, det 
37 | 
38 | ********************
39 | * CIF pseudovalues *
40 | ********************
41 | * Already calculated in the competing risks regression approach - 
42 | * summarise here for the record * 
43 | summ pseudo, det                // whole cohort 
44 | summ period1_pseudo, det        // used for model fitting 
45 | summ period2_pseudo, det        // used for model evaluation in the IECV process 
46 | 
47 | * Retransform FP term to get 'final' or imputed BMI variable * 
48 | gen bmi_x = 1/(Idiag__1+0.135374792)
49 | gen bmi = 10*(sqrt(bmi_x))
50 | summ bmi, det                     // Unscaled values 
51 | 
52 | summ townsend, det                // Already pre-generated in competing risks regression approach 
53 | 
54 | * Slim down and keep salient variables (predictors, etc.) * 
55 | keep patid sha1 study_practice cancer_route ethriskid smoke_cat cancer_grade cancer_stage // 
56 | progesterone_status HER2_status vasculitis radiotherapy chemotherapy mastectomy other_surgery //
57 | tca hrt anti_psychotic age_at_diagnosis bmi townsend period pseudo period1_pseudo period2_pseudo //
58 | timesplit endpoint exit3date d_breastca fu_start fu_end imputation imp_weight iecv_event endpoint prob_cens ipcw 
59 | 
60 | save "\final_datasets\ML\OX129_endpoint3_stacked50_pseudovalues.dta", replace
61 | 
62 | ** Form separate datasets for period 1 and period 2 - this will aid IECV in R ** 
63 | use  "\final_datasets\ML\OX129_endpoint3_stacked50_pseudovalues.dta", clear 
64 | keep if period==1 
65 | save "\final_datasets\ML\OX129_endpoint3_stacked50_period1_pseudovalues.dta", replace
66 | clear 
67 | 
68 | ********************************************************************************
69 | ** Again, use previously generated dataset to make data_period2 versions * 
70 | use  "\final_datasets\ML\OX129_endpoint3_stacked50_pseudovalues.dta", clear 
71 | keep if period==2 
72 | save "\final_datasets\ML\OX129_endpoint3_stacked50_period2_pseudovalues.dta", replace
73 | clear 
74 | 
75 | ********************************************************************************
76 | ********************************************************************************
77 | 


--------------------------------------------------------------------------------
/8_OX129_decision_curve_analysis:
--------------------------------------------------------------------------------
 1 | ## USING DECISION CURVE ANALYSIS TO COMPARE CLINICAL UTILITY OF MODELS ##
 2 | 
 3 | ## We use the saved individual-level predictions generated with IECV to evaluate this ##
 4 | ## Overall, we load in the average prediction (average over 50 imputations, as per Rubin's rule for point estimates), 
 5 | ## merge with the survival/time-to-event data in the full dataset, then run the dca package ##
 6 | 
 7 | ***********************************************************************************************
 8 | 
 9 | ** Step 1: Obtain mean predictions for each individual from each model, save as mini file with patid and mean_prediction ** 
10 | 
11 | ** Cox model 
12 | use "\final_datasets\OX129_endpoint3_imputed_IECV.dta", clear 
13 | mi convert wide 
14 | keep if period==2 
15 | gen mean_prediction_cox = 1 - period1_baseline_surv_cox^exp(iecv_xb)
16 | rename mean_prediction_cox cox                                                                   * Rename prediction for each model simply, so can keep track/evaluate 
17 | keep patid cox 
18 | summ cox, det 
19 | save "\estimates\ML_IECV_predictions\DCA_endpoint3_cox_iecv_predictions_formatted.dta", replace
20 | clear 
21 | 
22 | ** Competing risks regression 
23 | use "\final_datasets\OX129_endpoint3_competingrisk_IECV.dta", clear 
24 | mi convert wide 
25 | keep if period==2
26 | rename iecv_probabilities crr                                                                   * Rename predictions crr = competing risks regression, you get the idea
27 | keep patid crr  
28 | summ crr, det 
29 | save "\estimates\ML_IECV_predictions\DCA_endpoint3_crr_iecv_predictions_formatted.dta", replace
30 | clear 
31 | 
32 | ** XGBoost 
33 | use "\estimates\ML_IECV_predictions\endpoint3_xgboost_iecv_predictions_formatted.dta", clear 
34 | keep patid mean_prediction
35 | rename mean_prediction xgb
36 | summ xgb, det 
37 | save "\estimates\ML_IECV_predictions\DCA_endpoint3_xgboost_iecv_predictions_formatted.dta", replace
38 | clear 
39 | 
40 | 
41 | ** Neural network 
42 | use "\estimates\ML_IECV_predictions\endpoint3_nnet_iecv_predictions_formatted.dta", clear 
43 | keep patid mean_prediction
44 | rename mean_prediction nnet 
45 | summ nnet, det 
46 | save "\estimates\ML_IECV_predictions\DCA_endpoint3_nnet_iecv_predictions_formatted.dta", replace
47 | clear 
48 | 
49 | 
50 | ** Import, merge and analyse ** 
51 | use "\final_datasets\OX129_endpoint3_pseudovaluescif.dta", clear                              * This is the key dataset generated during regression modelling steps 
52 | mi convert wide                                                                               * Don't need stacked imputations - quicker to run and makes sense in wide format 
53 | keep if period==2                                                                             * Only have predictions generated in Period 2 data after IECV
54 | drop _merge 
55 | 
56 | foreach x in cox crr xgboost nnet {                                                           * Merge in the predictions from each model with their simple/handy variable names 
57 | 
58 | merge 1:1 patid using "\estimates\ML_IECV_predictions\DCA_endpoint3_`x'_iecv_predictions_formatted.dta"
59 | drop _merge 
60 | }
61 | 
62 |  mi stset follow_up, failure(iecv_event==1) exit(time 10)                                       * Stset the data so that DCA can run 
63 |  
64 |  cd "\ado\plus\"                                                                                * Path for calling in non-routine packages
65 |  
66 |  stdca cox crr xgb nnet, timepoint(10) prob(yes yes yes yes) smooth lpattern(solid solid dash dash dash dash)
67 |  graph save "Graph" "\graphs\EP3_decisioncurve_all_models.gph", replace 
68 |  
69 |  * Now, run version that takes u=into account competing risks - key one ! 
70 |  stdca cox crr xgb nnet, timepoint(10) prob(yes yes yes yes) smooth lpattern(solid solid dash dash dash dash) compet1(2)    // competing event is iecv_event=2 (=other death) 
71 |  graph save "Graph" "\graphs\EP3_decisioncurve_all_models.gph", replace 
72 |  
73 |  clear 
74 |  
75 |  *******************************************************************************
76 |  *******************************************************************************
77 |  ** END **
78 | 
79 | 


--------------------------------------------------------------------------------
/Licence.txt:
--------------------------------------------------------------------------------
 1 | Academic Use Licence
 2 | 
 3 | These licence terms apply to all licences granted by THE CHANCELLOR, MASTERS AND SCHOLARS OF THE UNIVERSITY OF OXFORD 
 4 | whose administrative offices are at University Offices, Wellington Square, Oxford OX1 2JD, United Kingdom (the “University”) 
 5 | for use of breast cancer prognosis and risk prediction tools (“the Software”) through this website 
 6 | https://github.com/AshDF91/Breast-cancer-prognosis  (the ”Website”). 
 7 | 
 8 | PLEASE READ THESE LICENCE TERMS CAREFULLY BEFORE DOWNLOADING THE SOFTWARE THROUGH THIS WEBSITE.  IF YOU DO NOT AGREE TO 
 9 | THESE LICENCE TERMS YOU SHOULD NOT DOWNLOAD THE SOFTWARE.
10 | 
11 | THE SOFTWARE IS INTENDED FOR USE BY ACADEMICS CARRYING OUT RESEARCH AND NOT FOR USE BY CONSUMERS OR COMMERCIAL BUSINESSES.
12 | 
13 | 1.	Academic Use Licence
14 | 1.1	The Licensee is granted a limited non-exclusive and non-transferable royalty free licence to download and use the Software 
15 | provided that the Licensee will:
16 | (a)	limit their use of the Software to their own internal academic non-commercial research which is undertaken for the purposes 
17 | of education or other scholarly use; 
18 | (b)	not use the Software for or on behalf of any third party or to provide a service or integrate all or part of the Software 
19 | into a product for sale or license to third parties;
20 | (c)	use the Software in accordance with the prevailing instructions and guidance for use given on the Website and comply with 
21 | procedures on the Website for user identification, authentication and access;
22 | (d)	comply with all applicable laws and regulations with respect to their use of the Software; and 
23 | (e)	ensure that the Copyright Notice “Copyright © 2021, University of Oxford” appears prominently wherever the Software is 
24 | reproduced and is referenced or cited with the Copyright Notice when the Software is described in any research publication or 
25 | on any documents or other material created using the Software.
26 | 1.2	The Licensee may only reproduce, modify, transmit or transfer the Software where:
27 | (a)	such reproduction, modification, transmission or transfer is for academic, research or other scholarly use;
28 | (b)	the conditions of this Licence are imposed upon the receiver of the Software or any modified Software;
29 | (c)	all original and modified Source Code is included in any transmitted software program; and
30 | (d)	the Licensee grants the University an irrevocable, indefinite, royalty free, non-exclusive unlimited licence to use and 
31 | sub-licence any modified Source Code as part of the Software.
32 | 
33 | 1.3	The University reserves the right at any time and without liability or prior notice to the Licensee to revise, modify 
34 | and replace the functionality and performance of the access to and operation of the Software. 
35 | 1.4	The Licensee acknowledges and agrees that the University owns all intellectual property rights in the Software.  
36 | The Licensee shall not have any right, title or interest in the Software.
37 | 1.5	This Licence will terminate immediately and the Licensee will no longer have any right to use the Software or exercise 
38 | any of the rights granted to the Licensee upon any breach of the conditions in Section 1 of this Licence.
39 | 
40 | 2.	Indemnity and Liability 
41 | 2.1	The Licensee shall defend, indemnify and hold harmless the University against any claims, actions, proceedings, losses, 
42 | damages, expenses and costs (including without limitation court costs and reasonable legal fees) arising out of or in connection 
43 | with the Licensee's possession or use of the Software, or any breach of these terms by the Licensee. 
44 | 2.2	The Software is provided on an ‘as is’ basis and the Licensee uses the Software at their own risk. No representations, 
45 | conditions, warranties or other terms of any kind are given in respect of the the Software and all statutory warranties and 
46 | conditions are excluded to the fullest extent permitted by law. Without affecting the generality of the previous sentences, 
47 | the University gives no implied or express warranty and makes no representation that the Software or any part of the Software: 
48 | (a) will enable specific results to be obtained;
49 | or (b) meets a particular specification or is comprehensive within its field or that it is error free or will operate without interruption; 
50 | or (c) is suitable for any particular, or the Licensee's specific purposes. 
51 | 2.3	Except in relation to fraud, death or personal injury, the University's liability to the Licensee for any use of the Software, 
52 | in negligence or arising in any other way out of the subject matter of these licence terms, will not extend to any incidental or 
53 | consequential damages or losses, or any loss of profits, loss of revenue, loss of data, loss of contracts or opportunity, whether direct or indirect.
54 | 2.4	The Licensee hereby irrevocably undertakes to the University not to make any claim against any employee, student, researcher or 
55 | other individual engaged by the University, being a claim which seeks to enforce against any of them any liability whatsoever in 
56 | connection with these licence terms or their subject-matter. 
57 | 
58 | 3.	General 
59 | 3.1	Severability - If any provision (or part of a provision) of these licence terms is found by any court or administrative body 
60 | of competent jurisdiction to be invalid, unenforceable or illegal, the other provisions shall remain in force.
61 | 3.2	Entire Agreement - These licence terms constitute the whole agreement between the parties and supersede any previous arrangement, 
62 | understanding or agreement between them relating to the Software. 
63 | 3.3	Law and Jurisdiction - These licence terms and any disputes or claims arising out of or in connection with them shall be governed by,
64 | and construed in accordance with, the law of England. The Licensee irrevocably submits to the exclusive jurisdiction of the English courts
65 | for any dispute or claim that arises out of or in connection with these licence terms.
66 | 
67 | If you are interested in using the Software commercially, please contact Oxford University Innovation Limited to negotiate a licence. Contact details are enquiries@innovation.ox.ac.uk quoting reference OUI 21633.
68 | 
69 | 


--------------------------------------------------------------------------------
/1_OX129_imputation:
--------------------------------------------------------------------------------
  1 | 
  2 | ******************************************************************************** 
  3 | *                           1) Variable descriptions                           *
  4 | *                          2) Multiple imputation run                          *
  5 | ********************************************************************************
  6 | 
  7 | ********************************************************************************
  8 | *                           1) Variable descriptions                           *
  9 | ********************************************************************************
 10 | 
 11 | * All variables are latest recorded prior to/at date of diagnosis, unless specified* 
 12 | 
 13 | * age = continuous
 14 | * diagnosis_bmi = body mass index, continuous 
 15 | * town_int = Townsend deprivation score, continuous 
 16 | * alcohol_cat = alcohol intake group, based on units/day coding in EMIS, ordinal 
 17 | * smoke_cat = smoking status, as per EMIS coding (non-smoker, ex-smoker, etc.) ordinal  
 18 | * cancer_stage = stage at diagnosis, stge I, II, III, IV, ordinal 
 19 | * cancer_grade = differentiation/grade of tumour, ordinal  
 20 | * er_status = Oestrogen receptor status, binary (+ve, -ve)
 21 | * pr_status = progesterone receptor status, binary (+ve, -ve) 
 22 | * her2_status = HER2 receptor status, binary (+ve, -ve) 
 23 | * cancer_route = route to cancer diagnosis, from PHE. E.g. screen-detected, emergency admission, categorical 
 24 | * ethriskid = self-reported ethnicity, informed by ONS classes, categorical 
 25 | * fh_breastca = recorded family history of breast cancer, binary 
 26 | * radio = radiotherapy within 1 year of breast cancer diagnosis, binary 
 27 | * chemo = chemotherapy within 1 year of breast cancer diagnosis, binary 
 28 | * mastectomy = mastectomy within 1 year of breast cancer diagnosis, binary 
 29 | * other_surg = other surgery (e.g. wide local excision) within 1 year of breast cancer diagnosis, binary 
 30 | * ckd = chronic kidney disease, ordinal 
 31 | * cirrhosis = chronic liver disease or cirrhosis, binary 
 32 | * htn = hypertension, binary 
 33 | * ihd = history of ischaemic heart disease, binary 
 34 | * t1dm = type 1 diabetes mellitus, binary 
 35 | * t2dm = type 2 diabetes mellitus, binary 
 36 | * vasculitis = vasculitis, binary 
 37 | * lupus = systemic lupus erythematosus, binary 
 38 | * thiazide = use of thiazide within 6months pre-diagnosis, binary 
 39 | * betablocker = use of beta-blocker med within 6mo pre-diagnosis, binary 
 40 | * acei = use of ACE inhibitor med within 6mo pre-diagnosis, binary 
 41 | * tca = use of tricyclic antidepressant within 6mo pre-diagnosis, binary 
 42 | * ssri = use of selective serotonion reuptake inhibitor within 6mo pre-diagnosis, binary 
 43 | * maoi = use of monoamine oxidase inhibitor within 6mo pre-diagnosis 
 44 | * otherantidepress = use of other (not above) antidepressant within 6mo pre-diagnosis 
 45 | * ocp = use of oral contraceptive pill within 6mo pre-diagnosis 
 46 | * hrt = use of hormone replacement therapy within 6mo pre-diagnosis 
 47 | * anti_psycho = use of anti-psychotic medication within 6mo pre-diagnosis 
 48 | * d_breastca = death with breast cancer on death certificate during follow-up
 49 | * outcomedate6 = date of breast cancer diagnosis (earliest of GP/HES/CancerRegistry)  
 50 | * exit3date = date of death, date of censoring (right censor date) 
 51 | 
 52 | 
 53 | ******************************************************************************** 
 54 | *                1) MULTIPLE IMPUTATION PREPARATION AND RUN                    *  
 55 | ******************************************************************************** 
 56 | 
 57 | use "\data\final_datasets\Endpoint_3\OX129_endpoint3.dta" 
 58 | 
 59 | * Visualise distributions of continuous variables *  
 60 | 
 61 | histogram age, percent normal  
 62 | histogram diagnosis_bmi, percent normal  
 63 | 
 64 | * Assess natural log transform - ? better normality of distribution for multiple imputation?*  
 65 | gen ln_bmi = log(diagnosis_bmi) 
 66 | histogram ln_bmi, percent normal 
 67 | * Chekc for Townsend score *
 68 | histogram town_int, percent normal    // looks fine 
 69 | 
 70 |  
 71 | * Stset the data so that we can identify optimal FP terms in turn for the aformentioned continuous variables *  
 72 | * Tabulate the breast cancer death variable * 
 73 | tab d_breastcancer 
 74 | 
 75 | * Stset the data, restrict to 10 years follow-up (prediction horizon) *  
 76 | stset exit3date, origin(outcomedate6) fail(d_breastca==1) scale(365.25) exit(time outcomedate6 + 3652.5) 
 77 | 
 78 | * Estimate Nelson-Aalen cumulative failure function for inclusion in imputation model *  
 79 | sts gen nelson_aalen = na 
 80 | summ nelson_aalen, det  
 81 | 
 82 | * Quantify degree of missingness for key variables to be considered *  
 83 | tab ethriskid, m  
 84 | tab smoke_cat, m  
 85 | tab alcohol_cat, m  
 86 | tab er_status, m  
 87 | tab her2_status, m  
 88 | tab pr_status, m  
 89 | tab cancer_grade, m  
 90 | tab cancer_route, m  
 91 | tab cancer_stage, m  
 92 | 
 93 | count if ln_bmi==.  
 94 | count if town_int==.  
 95 | 
 96 | * Assess ranges for continuous variables, so that we can perform truncated regression in multiple imputation *  
 97 | summ ln_bmi, det  
 98 | summ town_int, det  
 99 | 
100 | * Use period (decade) of cohort entry as an auxiliary variable *  
101 | gen timesplit = td(31Dec2009)
102 | format timesplit %td  
103 | gen period = 1  
104 | replace period = 2 if outcomedate6>timesplit  
105 | tab period  
106 |  
107 | ** Set up the data for multiple imputation, register variables to be imputed, and run it **  
108 | mi set wide  
109 | 
110 | mi register imputed ln_bmi town_int smoke_cat alcohol_cat cancer_grade cancer_stage // 
111 | cancer_route er_status pr_status her2_status ethriskid  
112 | 
113 | timer on 1  
114 | 
115 | * nelson-aalen cumulative hazard estimate, period of cohort entry as auxiliary variables * 
116 | * Include the endpoint * 
117 | 
118 | mi impute chained (truncreg, ll(2.70805) ul(3.850157)) ln_bmi (truncreg, ll(-7) ul(10))           // 
119 | town_int (ologit, ascontinuous) alcohol_cat smoke_cat cancer_stage cancer_grade                   // 
120 | (mlogit, ascontinuous) er_status pr_status her2_status cancer_route ethriskid = age i.fh_breastca //
121 | i.radio i.chemo i.ckd i.cirrhosis i.htn i.ihd i.t1dm i.t2dm i.vasculitis i.lupus i.thiazide i.betablocker  //
122 | i.acei i.raa i.tca i.ssri i.maoi i.otherantidepress i.ocp i.hrt i.anti_psycho i.mastect i.other_surg //
123 | i._d nelson_aalen i.period, add(50) augment chaindots rseed(1066)  
124 | 
125 | timer off 1  
126 | timer list  
127 | timer clear  
128 |  
129 | * Convert ln(BMI) back to original scale for later use in FPs and models *  
130 | mi passive: gen bmi = exp(ln_bmi) 
131 | summ _1_bmi, det  
132 | summ _50_bmi, det  
133 | 
134 | * Save imputed datasats for use next - FP selection, model fitting, evaluation *  
135 | save "\data\final_datasets\Endpoint_3\OX129_endpoint3_imputed.dta", replace  
136 | 
137 | clear  
138 | log close  
139 | ******************************************************************************** 
140 | ******************************************************************************** 
141 | 
142 |  
143 | 


--------------------------------------------------------------------------------
/2_OX129_Cox_models:
--------------------------------------------------------------------------------
  1 | ## Cox proportional hazards model ##
  2 | 
  3 | 
  4 | ## Multiple imputation already performed  ##
  5 | 
  6 | ****************************************************************************
  7 | *              1) FRACTIONAL POLYNOMIAL TERM SELECTION                     *
  8 | ****************************************************************************
  9 | use "\final_datasets\OX129_endpoint3.dta"              // complete case data 
 10 | 
 11 | summ age, det 
 12 | summ diagnosis_bmi, det 
 13 | summ town_int, det 
 14 | 
 15 | * outcomedate6 = date of breast cancer diagnosis (start of follow-up) 
 16 | * d_breastcancer = binary variable denoting breast cancer death (=1) or not (=0)
 17 | * exit3date = date left cohort (died, or censored) 
 18 | * exit3status = categorical (0=censored, 1 = died) 
 19 | * outcomedate6 = date of breast cancer diagnosis 
 20 | 
 21 | stset exit3date, origin(outcomedate6) fail(d_breastca==1) scale(365.25) exit(time outcomedate6 + 3652.5)
 22 | 
 23 | * Run mfp package to find FP terms * 
 24 | mfp stcox age diagnosis_bmi town_int, all 
 25 | 
 26 | * Age = 0.5, 2 
 27 | * BMI = 2, 2 
 28 | * Townsend score = 1 (keep linear) 
 29 | clear 
 30 | ********************************************************************************
 31 | 
 32 | ********************************************************************************
 33 | *          2) FIT FULL MODEL, PERFORM PREDICTOR VARIABLE SELECTION             *
 34 | ********************************************************************************
 35 | 
 36 | use "\final_datasets\OX129_endpoint3_imputed.dta"     // imputed data, ready for modelling 
 37 | 
 38 | ** Generate FP terms - this was generated in a previous step above ** 
 39 | mi passive: gen double bmi_x = bmi/10
 40 | mi passive: gen bmi_fp1 = (bmi_x^-2) - 0.1353749689
 41 | mi passive: gen bmi_fp2 = (bmi_x^-2)*ln(bmi_x) - 0.1353551232
 42 | 
 43 | fp gen double ge_at_diagnosis^(.5 2), scale(0 10) center(63.1168366)
 44 | rename age_at_diagnosis1 age_fp1 
 45 | rename age_at_diagnosis age_fp2 
 46 | 
 47 | * register as imputed variables to Stata knows how to handle them across mi commands * 
 48 | mi register imputed bmi_fp1 bmi_fp2     // imputed variables from passive 
 49 | mi register regular age_fp1 age_fp2     // age was complete for all, so just declare as regular 
 50 | 
 51 | * Tabulate events - d_breastcancer = breast cancer death (of interest), 
 52 | * exit3status = contains all cause death (use to define competing event)
 53 | tab d_breastcancer 
 54 | tab exit3status
 55 | 
 56 | gen time = exit3date - outcomedate6 
 57 | replace time = time/365.25
 58 | 
 59 | ************************************************************************
 60 | * Stset the data - 10 year exit date as this is the prediction horizon *
 61 | ************************************************************************ 
 62 | mi stset time, failure(d_breastca==1) exit(time 10)
 63 | 
 64 | 
 65 | ********************************************
 66 | ** Fit full model, including interactions **
 67 | ******************************************** 
 68 | * Form global for covariates, then use this in the model * 
 69 | global covariates = "age_fp* bmi_fp* townsend i.ethriskid i.alcohol_cat i.smoke_cat // 
 70 | ib6.cancer_route ib2.pr_status, ib2.her2_status, i.fh_breastca i.radio i.chemo i.mastectomy //
 71 | i.other_surg i.cancer_stage i.cancer_grade i.ckd i.cirrhosis i.htn i.ihd i.t1dm i.t2dm i.vasculitis //
 72 | i.lupus i.thiazide i.betablocker i.acei i.raa i.tca i.ssri i.maoi i.otherantidepress i.ocp i.hrt // 
 73 | i.anti_psycho c.age_fp1##i.chemo c.age_fp2##i.chemo c.age_fp1##i.mastec c.age_fp2##i.mastec  // 
 74 | c.age_fp1##i.other_surg c.age_fp2##i.other_surg c.age_fp1##i.fh_breastca c.age_fp2##i.fh_breastca i.ethriskid##c.age_fp1 //
 75 | i.ethriskid##c.age_fp2 i.ethriskid##i.other_surg i.ethriskid##i.mastectomy i.ethriskid##i.chemotherapy i.ethriskid##i.radiotherapy"
 76 | 
 77 | mi estimate, dots eform errorok: stcox $covariates, vce(cluster study_practice) 
 78 | parmest, list(,) saving(cox_endpoint3_parmest_fullmodel_preselection.dta, replace) label eform for(estimate min95 max95) 
 79 | 
 80 | ********************************************************************************
 81 | 
 82 | 
 83 | ********************************************************************************
 84 | *                             3) FIT FINAL MODEL                               *
 85 | ********************************************************************************
 86 | 
 87 | ************************************************************
 88 | ** Refit model, with variables meeting inclusion criteria **
 89 | ************************************************************
 90 | * Keep if HR<0.9 or HR>1.1 and p<0.01 * 
 91 | * Keep if interaction significant at p<0.01 * 
 92 | 
 93 | global selected_covariates = "age_fp* bmi_fp* i.smoke_cat ib6.cancer_route ib2.pr_status ib2.er_status //
 94 | ib2.her2_status i.fh_breastca i.radio i.chemo i.mastect i.other_surg i.cancer_stage i.cancer_grade i.ckd //
 95 | i.cirrhosis i.acei i.tca i.ssri i.otherantidepress i.hrt i.anti_psycho c.age_fp1##i.mastec c.age_fp2##i.mastec //
 96 | c.age_fp1##i.other_surg c.age_fp2##i.other_surg"
 97 | 
 98 | ************************
 99 | * Save final Cox model *
100 | ************************ 
101 | mi estimate, dots eform errorok saving(cox_ep3, replace): stcox $selected_covariates, vce(cluster study_practice) 
102 | parmest, list(,) saving(cox_endpoint3_parmest.dta, replace) label eform for(estimate min95 max95) 
103 | 
104 | * Calculate baseline function after Cox model (rather than KM estimate) * 
105 | * Use mi xeq function to estimate baseline survival function in each imputation, then pool (average) *  
106 | mi query 
107 | local M = r(M)
108 | scalar base_surv = 0 
109 | mi xeq 1/`M': stcox $selected_covariates ; predict double basesurv, basesurv ; summ basesurv if _t<10; scalar base_surv = base_surv + r(min)
110 | scalar base_surv = base_surv/`M' 
111 | disp as txt "Pooled baseline survivor function over imputations = " as res base_surv 
112 | gen baseline_surv_cox = base_surv
113 | summ baseline_surv_cox
114 | 
115 | 
116 | ********************************************************************************
117 | 
118 | 
119 | ********************************************************************************
120 | *                 	4) INTERNAL-EXTERNAL CROSS-VALIDATION RUN                  *
121 | ********************************************************************************
122 | 
123 | **********************
124 | * Preparing for IECV * 
125 | **********************
126 | 
127 | * Need to generate a measure of FU time from entry to earliest of: event/cens- * 
128 | * oring/end of period 1/end of study * 
129 | * We want 2 temporally distinct sub-cohorts, with no overlap. Therefore, follow-up 
130 | * ends at the earliest of event, censoring date, or the end of the decade * 
131 | 
132 | * outcomedate6 is date of breast cancer diagnosis * 
133 | gen fu_start = outcomedate6 
134 | format fu_start %td 
135 | gen fu_end = . 
136 | 
137 | * Generate variable denoting cut off point for the two temp. distinct sub-cohorts * 
138 | gen timesplit = mdy(01,01,2010)
139 | format timesplit %td
140 | tab period, m 
141 | 
142 | * People in Period1 - truncate follow-up as appropriate * 
143 | replace fu_end = min(exit3date, timesplit) if period==1 
144 | replace fu_end = exit3date if period==2 
145 | format fu_end %td 
146 | 
147 | * Calculate follow-up time using IECV-suitable dates * 
148 | gen follow_up = fu_end-fu_start
149 | summ follow_up, det 
150 | replace follow_up = follow_up/365.25
151 | 
152 | * Overall event counts, any time in study period * 
153 | tab d_breastca 
154 | 
155 | * Generate new event indicator for IECV - event only within period of entry * 
156 | gen iecv_event = 0 
157 | replace iecv_event = 1 if period==1 & d_breastca==1 & exit3date<timesplit  
158 | replace iecv_event = 1 if period==2 & d_breastca==1 
159 | 
160 | tab period 
161 | tab iecv_event 
162 | tab period iecv_event 
163 | 
164 | * quick  checks * 
165 | tab iecv_event if period==1 
166 | tab iecv_event if period==2
167 | 
168 | * Estimate baseline survival function in period 1 data- used for later computation
169 | * of predicted risks in Period 2 by combining with linear predictors from IECV * 
170 | mi stset follow_up if period==1, failure(iecv_event==1) exit(time 10)
171 |  
172 | mi query 
173 | local M = r(M)
174 | scalar period1_base_surv = 0 
175 | mi xeq 1/`M': qui stcox $selected_covariates ; predict double basesurv, basesurv ; summ basesurv if _t<10; scalar period1_base_surv = period1_base_surv + r(min)
176 | scalar period1_base_surv = period1_base_surv/`M' 
177 | disp as txt "Pooled baseline survivor function over imputations = " as res period1_base_surv 
178 | gen period1_baseline_surv_cox = period1_base_surv
179 | summ period1_baseline_surv_cox  
180 | 
181 | * stset the data for the IECV process proper - take into account the event and time * 
182 | mi stset follow_up, failure(iecv_event==1) exit(time 10)
183 | 
184 | ************
185 | * Run IECV *
186 | ************ 
187 | save "\final_datasets\OX129_endpoint3_imputed_IECV.dta", replace
188 | * Convert to long format so that all predictions (i.e. for each imputation) are saved * 
189 | mi convert flong 
190 | 
191 | * Generate new variables in which to store held out predictions* 
192 | * Want to keep linear predictor (can be converted into HR later) *
193 | * Then, we can use them for the pooled overall metrics, but also to calculate centre-specific estimates *  
194 | gen iecv_xb = .  
195 | 
196 | * Loop over each region, fit model, estimate metrics on held out region data. Later, we present individual practice results * 
197 | cd "\estimates\"
198 | 
199 |  merge m:1 patid using "\final_datasets\OX129_endpoint3_sha1.dta"
200 | 
201 | forval x = 1(1)10 {                                                                                                 \\ 10 geograhical regions 
202 | 	mi estimate, dots errorok saving(myestiecv_ep3, replace): stcox $selected_covariates if (sha1!=`x' & period==1)   \\ fit model, save temporarily
203 | 	mi predict double xb if (sha1==`x' & period==2) using myestiecv_ep3, xb storecompleted                            \\ double precision using the model just fitted 
204 | 	replace iecv_xb = xb if iecv_xb==.                                                                                 
205 | 	drop xb                                                                                                           \\ clear xb so loop can run on next region
206 | 	display `x'
207 | }
208 | 
209 | summ iecv_xb, det 
210 | count if iecv_xb==.                                                      \\ Show those that do not have IECV-yielded prediction (should equal number in Period 1 )
211 | tab period 
212 | 
213 | * Save updated dataset: save predictions, and then can use for the model evaluation* 
214 | save "\final_datasets\OX129_endpoint3_imputed_IECV.dta", replace
215 | 
216 | clear 
217 | 
218 | ********************************************************************************
219 | 
220 | 
221 | ********************************************************************************
222 | *	          5) PERFORMANCE ASSESSMENT, INC. HETEROGENEITY                    *
223 | ********************************************************************************
224 | 
225 | use "\final_datasets\OX129_endpoint3_imputed_IECV.dta", clear 
226 | 
227 | ************************************************
228 | * Pooled metrics - those without meta-analysis *
229 | ************************************************
230 | 
231 | * Calibration slope overall in period 2 data * 
232 | * Slope is estimated using a second Cox model - regress linear predictor * 
233 | mi stset follow_up, failure(iecv_event==1) exit(time 10)
234 | mi estimate, dots: stcox iecv_xb if period==2, vce(cluster study_practice)
235 | 
236 | * Calibration-in-the-large in period 2 data * 
237 | mi estimate, dots: stcox iecv_xb if period==2, offset(iecv_xb) vce(cluster study_practice) 
238 | 
239 | * Smoothed calibration plot * 
240 | * Save before converting to wide format to avoid Stata error * 
241 | * Coverting to wide takes the average imputation value (e.g. average prediction - fits with Rubin's rules) * 
242 | save "\final_datasets\OX129_endpoint3_imputed_IECV.dta", replace 
243 | mi convert wide 
244 | * Directory to where ado files are stored, so that command can run * 
245 | cd "\ado\plus\"        
246 | * Pseudovalues for failure function in Period 2 data - 'observed' event probs * 
247 | stpsurv if period==2, failure at(10) gen(period2_pseudo)
248 | summ period2_pseudo, det 
249 | * Calculated 'predicted' probs - use IECV linear predictors and the baseline 
250 | * 10yr survival function estimated in the whole cohort * 
251 | * Use the average value of iecv_xb across all imputations * 
252 | gen iecv_risk10yr = 1 - period1_baseline_surv_cox^exp(iecv_xb) 
253 | summ iecv_risk10yr, det 
254 | running period2_pseudo iecv_risk10yr, ci leg(off) nopts addplot(function y=x) ysc(range(0 1)) xsc(range(0 1))  //
255 | ylab(0(.2)1) xlab(0(.2)1) title("Smoothed calibration plot - Cox proportional hazards model", size(small)) //
256 | xtitle("Predicted event probability") ytitle("Observed event probability") graphregion(color(white)) aspect(1)
257 | 
258 | graph save "Graph" "\graphs\ep3_cox\EP3_cox_smoothed_calibration.gph", replace 
259 | 
260 | * Risk groups calibration plot * 
261 | pmcalplot iecv_risk10yr if period==2, ci surv t(10) bin(20) nospike //
262 | xtitle("Predicted event probability", size(small)) ytitle("Observed event probability", size(small)) graphregion(color(white))//
263 | title("Calibration plot - Cox proportional hazards model", size(small))
264 | 
265 | graph save "Graph" "\graphs\ep3_cox\EP3_cox_riskgroups_calibration.gph", replace
266 | 
267 | clear 
268 | 
269 | * Reopen the long format dataset so that all imputations can be used * 
270 | use "\final_datasets\OX129_endpoint3_imputed_IECV.dta", clear 
271 | 
272 | * Discrimination * 
273 | * somersd command assumes higher score = better survival * 
274 | * Exponentiate in case any predictions equal 0 * 
275 | gen iecv_hr = exp(iecv_xb) 
276 | * Then take the inverse * 
277 | gen iecv_invhr = 1/iecv_hr  
278 | * Censoring indicator made * 
279 | gen censind = 1-_d if _st==1
280 | mi estimate, cmdok dots: somersd _t iecv_invhr if _st==1, cenind(censind) tdist transf(c)
281 | 
282 | 
283 | *******************************
284 | * Performance by ethnic group * 
285 | *******************************
286 | * Discrimination in different ethnic groups * 
287 | * As ethnicity was imputed, ethnicity per person may vary across imputations - capture this * 
288 | forval x = 1(1)9 {
289 | 	mi estimate, cmdok dots esampvaryok: somersd _t iecv_invhr if (_st==1 & period==2 & ethriskid==`x'), cenind(censind) tdist transf(c)
290 |   }
291 | 
292 | * Calibration slope * 
293 | forval x = 1(1)9 { 
294 | 	mi estimate, dots esampvaryok: stcox iecv_xb if ethriskid==`x', vce(cluster study_practice)
295 | }
296 | 
297 | * Calibration in the large * 
298 | forval x = 1(1)9 { 
299 | 	mi estimate, dots esampvaryok: stcox iecv_xb if ethriskid==`x', offset(iecv_xb) vce(cluster study_practice)
300 | }
301 | 
302 | 
303 | **************************************************
304 | **     Region-level heterogeneity and results   **
305 | **************************************************
306 | * Form a small dataset file for each metric - this will contain the region-level estimates of kery metrics * 
307 | * These are then used later for the random effects meta-analysis pooling * 
308 | 
309 | cd "\estimates\" 
310 | 
311 | ***************************
312 | * GT IECV for Harrell's C *
313 | 
314 | capture postutil clear   
315 | tempname C_EP3regioncox
316 | postfile `C_EP3regioncox' beta st_err val_size using C_EP3regioncox.dta , replace 
317 |   
318 |   forval x = 1(1)10 {
319 |   mi estimate, cmdok dots: somersd _t iecv_invhr if (_st==1 & sha1==`x' & period==2), cenind(censind) tdist transf(c) 
320 |   local beta = r(table)[1,1]
321 |   local st_err = r(table)[2,1]
322 |   local val_size = e(N)
323 |   post `C_EP3regioncox' (`beta') (`st_err') (`val_size')
324 |   }
325 |   
326 |   postclose `C_EP3regioncox' 
327 |   
328 | *********************************
329 | * GT IECV for calibration slope *
330 | 
331 | capture postutil clear   
332 | tempname slope_EP3regioncox
333 | postfile `slope_EP3regioncox' slope slope_se val_size using slope_EP3regioncox.dta , replace 
334 |   
335 |   forval x = 1(1)10 {
336 |   mi estimate, dots: stcox iecv_xb if sha1==`x', vce(cluster study_practice)
337 |   local slope = r(table)[1,1]
338 |   local slope_se = r(table)[2,1]
339 |   local val_size = e(N)
340 |   post `slope_EP3regioncox' (`slope') (`slope_se') (`val_size')
341 |   }
342 |   
343 |   postclose `slope_EP3regioncox' 
344 |   
345 | ****************************************
346 | * GT IECV for calibration-in-the-large *
347 | 
348 | capture postutil clear   
349 | tempname citl_EP3regioncox
350 | postfile `citl_EP3regioncox' citl citl_se val_size using citl_EP3regioncox.dta , replace 
351 |   
352 |   forval x = 1(1)10 { 
353 |   mi estimate, dots: stcox iecv_xb if sha1==`x', offset(iecv_xb) vce(cluster study_practice)
354 |   local citl = r(table)[1,1]
355 |   local citl_se = r(table)[2,1]
356 |   local val_size = e(N)
357 |   post `citl_EP3regioncox' (`citl') (`citl_se') (`val_size')
358 |   }
359 |   
360 |   postclose `citl_EP3regioncox'
361 |  
362 | save "\final_datasets\OX129_endpoint3_IECV.dta", replace 
363 | 
364 | * For events/denominators in meta-analysis graphs * 
365 | tab sha1 _d if period==2 & _mi_m==1       \\ tabulate region, number of events in period 2 data, in a single imputation 
366 | 
367 | clear 
368 | 
369 | ********************************************************************************
370 | 
371 | 
372 | *****************
373 | * Meta-analyses *
374 | ***************** 
375 | 
376 | ** Random effects meta-analysis pooled performance metrics * 
377 | use "\estimates\C_EP3regioncox.dta", clear 
378 | 
379 | * Input new variable with region name, number of events, number of total individuals * 
380 | 
381 | input str80 region
382 | "East Midlands (464/3,337)"
383 | "East of England (543/4,826)"
384 | "London (1,270/13,287)"
385 | "North East (387/2,784)"
386 | "North West (1,664/15,419)"
387 | "South Central (1,062/11,272)"
388 | "South East (848/8,666)"
389 | "South West (1,054/10,165)"
390 | "West Midlands (1,032/8,678)"
391 | "Yorkshire & Humber (484/3,946)"
392 | end
393 | meta set beta st_err, studylab(region)
394 | meta summarize, random(sjonkman) se(khartung) predinterval(95)
395 | meta forestplot, random(sjonkman) se(khartung) predinterval(95) 
396 | graph save "Graph" "\graphs\EP3_harrellsC_cox.gph", replace 
397 | clear 
398 | 
399 | use "\estimates\slope_EP3regioncox.dta" , clear 
400 | input str80 region
401 | "East Midlands (464/3,337)"
402 | "East of England (543/4,826)"
403 | "London (1,270/13,287)"
404 | "North East (387/2,784)"
405 | "North West (1,664/15,419)"
406 | "South Central (1,062/11,272)"
407 | "South East (848/8,666)"
408 | "South West (1,054/10,165)"
409 | "West Midlands (1,032/8,678)"
410 | "Yorkshire & Humber (484/3,946)"
411 | end
412 | meta set slope slope_se, studylab(region)
413 | meta summarize, random(sjonkman) se(khartung) predinterval(95)
414 | meta forestplot, random(sjonkman) se(khartung) predinterval(95) xline(1)
415 | graph save "Graph" "\graphs\EP3_slope_cox.gph", replace 
416 | clear
417 | 
418 | use "\estimates\citl_EP3regioncox.dta" , clear 
419 | input str80 region
420 | "East Midlands (464/3,337)"
421 | "East of England (543/4,826)"
422 | "London (1,270/13,287)"
423 | "North East (387/2,784)"
424 | "North West (1,664/15,419)"
425 | "South Central (1,062/11,272)"
426 | "South East (848/8,666)"
427 | "South West (1,054/10,165)"
428 | "West Midlands (1,032/8,678)"
429 | "Yorkshire & Humber (484/3,946)"
430 | end
431 | meta set citl citl_se, studylab(region)
432 | meta summarize, random(sjonkman) se(khartung) predinterval(95)
433 | meta forestplot, random(sjonkman) se(khartung) predinterval(95) xline(0)
434 | graph save "Graph" "\graphs\EP3_citl_cox.gph", replace 
435 | clear
436 | 
437 | *********************
438 | ** Meta-regression **
439 | *********************
440 | 
441 | * Form collapsed dataset which contains the region-level factors (period 2 data) to be explored in meta-reg *
442 | 
443 | use "\final_datasets\OX129_endpoint3_imputed_IECV.dta" , clear 
444 | keep if period==2 
445 | gen nonwhite = 0
446 | replace nonwhite=1 if (ethriskid!=. & ethriskid>1)
447 | gen age1 = age_at_diagnosis
448 | gen baseline_bmi1 = diagnosis_bmi
449 | gen town_int1 = town_int
450 | 
451 | collapse (mean)age_at_diagnosis diagnosis_bmi town_int (sd)age1 baseline_bmi1 town_int1 (percent)nonwhite, by(sha1)
452 | 
453 | tab age_at_diagnosis 
454 | tab diagnosis_bmi 
455 | tab town_int 
456 | tab age1
457 | tab baseline_bmi1 
458 | tab town_int1 
459 | tab nonwhite
460 | 
461 | save "\final_datasets\OX129_regionalperiod2_EP3.dta", replace 
462 | clear 
463 | 
464 | 
465 | **********************************************************************************
466 | * Examine effect of age, BMI, deprivation and non-white ethnicity on perf hetero *
467 | ********************************************************************************** 
468 | 
469 | *****************
470 | ** Harrell's C **  
471 | use "\estimates\C_EP3regioncox.dta"
472 | gen sha1 = _n 
473 | merge 1:1 sha1 using "\final_datasets\OX129_regionalperiod2_EP3.dta"
474 | 
475 | meta set beta st_err, studylab(sha1)
476 | 
477 | meta regress age_at_diagnosis diagnosis_bmi town_int nonwhite, random(sjonkman) se(khartung)
478 | meta regress age1 baseline_bmi1 town_int nonwhite, random(sjonkman) se(khartung)
479 | 
480 | meta regress age_at_diagnosis, random(sjonkman)
481 | estat bubbleplot 
482 | graph save "Graph" "\ep3_cox\bubbleplot_EP3_C_age_cox.gph", replace 
483 | 
484 | meta regress age1, random(sjonkman)
485 | estat bubbleplot 
486 | graph save "Graph" "\ep3_cox\bubbleplot_EP3_C_ageSD_cox.gph", replace 
487 | 
488 | meta regress town_int, random(sjonkman)
489 | estat bubbleplot 
490 | graph save "Graph" "\ep3_cox\bubbleplot_EP3_C_townsend_cox.gph", replace 
491 | 
492 | meta regress nonwhite, random(sjonkman)
493 | estat bubbleplot 
494 | graph save "Graph" "\ep3_cox\bubbleplot_EP3_C_ethnic_cox.gph", replace 
495 | 
496 | meta regress diagnosis_bmi, random(sjonkman)
497 | estat bubbleplot 
498 | graph save "Graph" "\ep3_cox\bubbleplot_EP3_C_bmi_cox.gph", replace 
499 | 
500 | meta regress baseline_bmi, random(sjonkman)
501 | estat bubbleplot 
502 | graph save "Graph" "\ep3_cox\bubbleplot_EP3_C_bmiSD_cox.gph", replace 
503 | 
504 | clear 
505 | 
506 | 
507 | ***********************
508 | ** Calibration slope ** 
509 | use "\estimates\slope_EP3regioncox.dta"
510 | gen sha1 = _n 
511 | merge 1:1 sha1 using "\final_datasets\OX129_regionalperiod2_EP3.dta"
512 | 
513 | meta set slope slope_se, studylab(sha1)
514 | 
515 | meta regress age_at_diagnosis diagnosis_bmi town_int nonwhite, random(sjonkman) se(khartung)
516 | meta regress age1 baseline_bmi1 town_int nonwhite, random(sjonkman) se(khartung)
517 | 
518 | meta regress age_at_diagnosis, random(sjonkman)
519 | estat bubbleplot 
520 | graph save "Graph" "\ep3_cox\bubbleplot_EP3_slope_age_cox.gph", replace 
521 | 
522 | meta regress age1, random(sjonkman)
523 | estat bubbleplot 
524 | graph save "Graph" "\ep3_cox\bubbleplot_EP3_slope_ageSD_cox.gph", replace 
525 | 
526 | meta regress town_int, random(sjonkman)
527 | estat bubbleplot 
528 | graph save "Graph" "\ep3_cox\bubbleplot_EP3_slope_townsend_cox.gph", replace 
529 | 
530 | meta regress nonwhite, random(sjonkman)
531 | estat bubbleplot 
532 | graph save "Graph" "\ep3_cox\bubbleplot_EP3_slope_ethnic_cox.gph", replace 
533 | 
534 | meta regress diagnosis_bmi, random(sjonkman)
535 | estat bubbleplot 
536 | graph save "Graph" "\ep3_cox\bubbleplot_EP3_slope_bmi_cox.gph", replace 
537 | 
538 | meta regress baseline_bmi, random(sjonkman)
539 | estat bubbleplot 
540 | graph save "Graph" "\ep3_cox\bubbleplot_EP3_slope_bmiSD_cox.gph", replace 
541 | 
542 | clear 
543 | 
544 | 
545 | ******************************
546 | ** Calibration in the large ** 
547 | 
548 | use "\estimates\citl_EP3regioncox.dta"
549 | gen sha1 = _n 
550 | merge 1:1 sha1 using "\final_datasets\OX129_regionalperiod2_EP3.dta"
551 | 
552 | meta set citl citl_se, studylab(sha1)
553 | 
554 | meta regress age_at_diagnosis diagnosis_bmi town_int nonwhite, random(sjonkman) se(khartung)
555 | meta regress age1 baseline_bmi1 town_int nonwhite, random(sjonkman) se(khartung)
556 | 
557 | meta regress age_at_diagnosis, random(sjonkman)
558 | estat bubbleplot 
559 | graph save "Graph" "\ep3_cox\bubbleplot_EP3_citl_age_cox.gph", replace 
560 | 
561 | meta regress age1, random(sjonkman)
562 | estat bubbleplot 
563 | graph save "Graph" "\ep3_cox\bubbleplot_EP3_citl_ageSD_cox.gph", replace 
564 | 
565 | meta regress town_int, random(sjonkman)
566 | estat bubbleplot 
567 | graph save "Graph" "\ep3_cox\bubbleplot_EP3_citl_townsend_cox.gph", replace 
568 | 
569 | meta regress nonwhite, random(sjonkman)
570 | estat bubbleplot 
571 | graph save "Graph" "\ep3_cox\bubbleplot_EP3_citl_ethnic_cox.gph", replace 
572 | 
573 | meta regress diagnosis_bmi, random(sjonkman)
574 | estat bubbleplot 
575 | graph save "Graph" "\ep3_cox\bubbleplot_EP3_citl_bmi_cox.gph", replace 
576 | 
577 | meta regress baseline_bmi, random(sjonkman)
578 | estat bubbleplot 
579 | graph save "Graph" "\ep3_cox\bubbleplot_EP3_citl_bmiSD_cox.gph", replace 
580 | 
581 | clear 
582 | 
583 | 
584 | ********************************************************************************
585 | ********************************************************************************
586 | ******************************************************************************** 
587 | 
588 | 


--------------------------------------------------------------------------------
/7_OX129_ML_model_evaluation:
--------------------------------------------------------------------------------
  1 | ## Evaluating machine learning models ##
  2 | 
  3 | ## After fitting the models, then using IECV to generate individual-level predictions, the predictions 
  4 | ## (which were stored in csv files) are imported to Stata so that the same metrics can be calculated ##
  5 | 
  6 | 
  7 | * OPEN STATA *
  8 | 
  9 | *************************
 10 | * NEURAL NETWORK MODELS *
 11 | *************************
 12 | 
 13 | ********************************************************************************
 14 | ** Performance of neural network based on 'pooled predictions' and IECV  ** 
 15 | ***************************************************************************
 16 | 
 17 | * Open saved csv file in Stata from R * 
 18 | import delimited "\estimates\ML_IECV_predictions\endpoint3_nnet_iecv_predictions.csv", asdouble 
 19 | 
 20 | drop v1                                                                                            // v1 is just an observation number - not useful 
 21 | 
 22 | * Destring the predictions, and rename variables for congeniality with Stata files* 
 23 | destring nnet_iecv_predictions, dpcomma replace
 24 | summ nnet_iecv_predictions, det
 25 | rename nnet_iecv_predictions predictions                                                           // Renaming for consistency across both models
 26 | rename data_period2patid patid
 27 | bys patid: gen m = _n                                                                              // Sort by patient ID, generate imputation number (each patient will be present 50 times due to m=50 imputations) 
 28 |                                                                                                    // This variable 'm' will be used to denote imputation number 
 29 | * Clip the predictions to be between 0 and 1 - they are probabilities * 
 30 | count if predictions<0
 31 | count if predictions>1
 32 | replace predictions = 0.0000001 if predictions<0      ## Miniscule risk rather than zero, will use cloglog transform later!
 33 | replace predictions = 0.9999999 if predictions>1      ## Rather than guaranteed risk 
 34 | 
 35 | * Save this as interim dataset * 
 36 | save "\ML_IECV_predictions\endpoint3_nnet_iecv_predictions_formatted.dta", replace
 37 | 
 38 | * We want to perform model evaluation in accordance with Rubin's rules with 
 39 | * multiple imputed datasets. For this, Stata 'needs' an m=0 (i.e. complete case)
 40 | * copy, and then the separate imputations below it. We need to trick Stata into handling this * 
 41 | 
 42 | * First, keep first imputation, and say it is the complete case, m=0 
 43 | keep if m==1 
 44 | replace m=0 
 45 | 
 46 | * Generate mi style missing indicator - (variable '1') to say that predictions are missing, then 
 47 | * later we can use Stata's mi machinery to handle the 50 predictions for each 
 48 | * patid as separate imputations' predictions 
 49 | gen i_predictions = 1 
 50 | tab m 
 51 | replace predictions = . if i_predictions==1 
 52 | 
 53 | * Now that the predictions in m=0 are missing, append the dataset with the 50 
 54 | * true values of the predictions, 1 for each patid in each imputation 
 55 | append using  "\estimates\ML_IECV_predictions\endpoint3_nnet_iecv_predictions_formatted.dta"
 56 | 
 57 | * Save interim data file 
 58 | save  "\estimates\ML_IECV_predictions\endpoint3_nnet_iecv_predictions_formatted.dta", replace 
 59 | 
 60 | * Format as multiply imputed dataset - now, m=0 will have no predictions, whereas 
 61 | * the m=1 to m=50 will have their respective predictions in 
 62 | mi import flong, m(m) id(patid) imputed(predictions)                                                 // Importing imputations into this dataset 
 63 | 
 64 | * Convert to wide style for the convenience of dataset size, and for merging later 
 65 | mi convert wide 
 66 | mi describe 
 67 | 
 68 | * Save interim datafile after mi conversion * 
 69 | save  "\estimates\ML_IECV_predictions\endpoint3_nnet_iecv_predictions_formatted.dta", replace
 70 | 
 71 | * For the smoothed calibration plot, we want each individual's average prediction, so that 
 72 | * this can be plotted against the observed pseudovalues * 
 73 | egen mean_prediction = rowmean(_1_predict - _50_predict) 
 74 | summ mean_prediction, det 
 75 | 
 76 | * Save and clear * 
 77 | save  "\estimates\ML_IECV_predictions\endpoint3_nnet_iecv_predictions_formatted.dta", replace
 78 | clear 
 79 | 
 80 | * Open up the key dataset - this contains psuedovalues, period1, period2, etc. *
 81 | * This is in 'wide' format * 
 82 | use "\final_datasets\OX129_endpoint3_pseudovaluescif.dta", clear 
 83 | keep if period==2 
 84 | 
 85 | * Merge in the formatted predictions from the IECV process for the neural net * 
 86 | mi merge 1:1 patid using "\estimates\ML_IECV_predictions\endpoint3_nnet_iecv_predictions_formatted.dta"
 87 | 
 88 | mi describe 
 89 | 
 90 | ************************
 91 | ** Pooled predictions **
 92 | ************************
 93 | 
 94 | * Plot the smoothed calibration plot - save out * 
 95 | * Running smoother between predicted probabilities and observed pseudo-prabilities - both on same scale * 
 96 | running period2_pseudo mean_prediction if _mi_m==1, ci leg(off) nopts xscale(range(0 1)) yscale(range(0 1)) aspect(1) //
 97 | title("Smoothed calibration plot - neural network", size(small)) xtitle("Predicted event probability", size(small)) // 
 98 | ytitle("Observed event probability", size(small)) addplot(function y=x) graphregion(color(white)) ylabel(0(.2)1) xlabel(0(.2)1)
 99 | 
100 | graph save "\graphs\ep3_competing\calibration_neuralnet_iecv.gph", replace
101 | 
102 | * Use the cloglog transform of predictions for overall calibration metrics * 
103 | mi passive: gen cll_pred = log(-log(1-prediction)) 
104 | 
105 | * Calibration of neural network - pooled predictions * 
106 | * 'Observed' (pseudo)probability, regress on cloglog of prediction, model on cloglog scale * 
107 | mi estimate, dots: glm period2_pseudo cll_pred, link(cloglog) vce(robust) noconstant irls 
108 | mi estimate, dots: glm period2_pseudo cll_pred, link(cloglog) vce(robust) noconstant irls offset(cll_pred)   
109 |  
110 | * Discrimination - pooled predictions * 
111 | mi register regular ipcw                                                                              // ipc weights - one for each patient
112 | mi stset fu_end, origin(fu_start) failure(iecv_event==1) scale(365.25) exit(time fu_start+3652.5)     // Use follow-up and event indicators previously made in regression steps 
113 | drop hr 
114 | mi passive: gen hr = exp(prediction)                                                                  // Exponentiate to avoid any predictions equalling zero - this preserves the ordering!
115 | mi passive: gen invhr = 1/hr                                                                          // somersd expects higher score = better survival, so invert 
116 | gen censind = 1 - _d if _st==1
117 | mi estimate, dots cmdok: somersd _t invhr if (_st==1) [iweight=ipcw], cenind(censind) tdist transf(c) // Run it 
118 | 
119 | 
120 | ****************************************************************
121 | ** Pooled predictions: heterogeneity by ethnicity and age grp **
122 | ****************************************************************
123 | 
124 | * Discrimination and calibration in different ethnic groups * 
125 | forval x = 1(1)9 {
126 | 	mi estimate, dots esampvaryok: somersd _t invhr if (_st==1 & ethriskid==`x') [iweight=ipcw], cenind(censind) tdist transf(c)    // Again, ethnic group may vary across imputations, so permit this to be captured when making full use of imputed data 
127 |   } 
128 | 
129 | * Calibration slope * 
130 | forval x = 1(1)9 {
131 | 	mi estimate, dots esampvaryok: glm period2_pseudo cll_pred if ethriskid==`x', link(cloglog) noconstant irls vce(robust)      // Estimate per-ethnic gorup result in each imputation, then pool 
132 | }
133 | 
134 | * Calibration in the large * 
135 | forval x = 1(1)9 {
136 | 	mi estimate, dots esampvaryok: glm period2_pseudo cll_pred if ethriskid==`x', link(cloglog) noconstant irls vce(robust) offset(cll_pred)      
137 | }
138 | 
139 | 
140 | **************
141 | * Age groups *
142 | **************
143 | gen agegroup = 1 
144 | replace agegroup = 2 if age_at_diagnosis>=30 
145 | replace agegroup = 3 if age_at_diagnosis>=40 
146 | replace agegroup = 4 if age_at_diagnosis>=50 
147 | replace agegroup = 5 if age_at_diagnosis>=60 
148 | replace agegroup = 6 if age_at_diagnosis>=70 
149 | replace agegroup = 7 if age_at_diagnosis>=80 
150 | tab agegroup 
151 | 
152 | * Discrimination and calibration in different age bands * 
153 | forval x = 1(1)7 {
154 | 	mi estimate, dots: somersd _t invhr if (_st==1 & agegroup==`x') [iweight=ipcw], cenind(censind) tdist transf(c)    
155 |   } 
156 | 
157 | * Calibration slope * 
158 | forval x = 1(1)7 {
159 | 	mi estimate, dots: glm period2_pseudo cll_pred if agegroup==`x', link(cloglog) noconstant irls vce(robust)     
160 | }
161 | 
162 | * Calibration in the large * 
163 | forval x = 1(1)7 {
164 | 	mi estimate, dots: glm period2_pseudo cll_pred if agegroup==`x', link(cloglog) noconstant irls vce(robust) offset(cll_pred)      
165 | }
166 | 
167 | 
168 | ************************************************
169 | ** IECV predictions and performance estimates **
170 | ************************************************
171 | 
172 | **************************************************
173 | **     Region-level heterogeneity and results   **
174 | **************************************************
175 | cd "\\estimates\" 
176 | 
177 | ***************************
178 | * GT IECV for Harrell's C *                                        // Loop to estimate model's Harrell's C for each region, save into separate dataset, then use later to pool with meta-analysis
179 | 
180 | capture postutil clear   
181 | tempname C_EP3regionnnet
182 | postfile `C_EP3regionnnet' beta st_err val_size using C_EP3regionnnet.dta , replace 
183 |   
184 |   forval x = 1(1)10 {
185 |   mi estimate, dots: somersd _t invhr if (_st==1 & sha1==`x') [iweight=ipcw], cenind(censind) tdist transf(c)
186 |   local beta = r(table)[1,1]
187 |   local st_err = r(table)[2,1]
188 |   local val_size = e(N)
189 |   post `C_EP3regionnnet' (`beta') (`st_err') (`val_size')
190 |   }
191 |   
192 |   postclose `C_EP3regionnnet' 
193 |   
194 | *********************************
195 | * GT IECV for calibration slope *                                   
196 | 
197 | capture postutil clear   
198 | tempname slope_EP3regionnnet
199 | postfile `slope_EP3regionnnet' slope slope_se val_size using slope_EP3regionnnet.dta , replace 
200 |   
201 |   forval x = 1(1)10 {
202 |   mi estimate, dots: glm period2_pseudo cll_pred if sha1==`x', link(cloglog) noconstant irls vce(robust) 
203 |   local slope = r(table)[1,1]
204 |   local slope_se = r(table)[2,1]
205 |   local val_size = e(N)
206 |   post `slope_EP3regionnnet' (`slope') (`slope_se') (`val_size')
207 |   }
208 |   
209 |   postclose `slope_EP3regionnnet'  
210 |   
211 | ****************************************
212 | * GT IECV for calibration-in-the-large *                           // CITL is the intercept term when constraining slope to 1 
213 | 
214 | capture postutil clear   
215 | tempname citl_EP3regionnnet
216 | postfile `citl_EP3regionnnet' citl citl_se val_size using citl_EP3regionnnet.dta , replace 
217 |   
218 |   forval x = 1(1)10 { 
219 |   mi estimate, dots: glm period2_pseudo cll_pred if sha1==`x', link(cloglog) noconstant irls vce(robust) offset(cll_pred)    
220 |   local citl = r(table)[1,1]
221 |   local citl_se = r(table)[2,1]
222 |   local val_size = e(N)
223 |   post `citl_EP3regionnnet' (`citl') (`citl_se') (`val_size')
224 |   }
225 |   
226 |   postclose `citl_EP3regionnnet'
227 |  
228 | * Event/denominator counts for meta-analysis plots below * 
229 | tab sha1 _d if period==2                                        // Number of events in each region during Period 2, and total in each region 
230 |  
231 | clear 
232 | 
233 | *****************
234 | * Meta-analyses *
235 | ***************** 
236 | 
237 | ** Random effects meta-analysis pooled performance metrics * 
238 | use "\estimates\C_EP3regionnnet.dta", clear 
239 | input str50 region
240 | "East Midlands (464/3,337)"
241 | "East of England (543/4,826)"
242 | "London (1,270/13,287)"
243 | "North East (387/2,784)"
244 | "North West (1,664/15,419)"
245 | "South Central (1,062/11,272)"
246 | "South East (848/8,666)"
247 | "South West (1,054/10,165)"
248 | "West Midlands (1,032/8,678)"
249 | "Yorkshire & Humber (484/3,946)"
250 | end
251 | meta set beta st_err, studylab(region) eslabel("Harrell's C") 
252 | meta summarize, random(sjonkman) se(khartung) predinterval(95)
253 | meta forestplot, random(sjonkman) se(khartung) predinterval(95) 
254 | graph save "Graph" "\graphs\EP3_harrellsC_nnet.gph", replace 
255 | clear 
256 | 
257 | use "\estimates\slope_EP3regionnnet.dta" , clear 
258 | input str50 region
259 | "East Midlands (464/3,337)"
260 | "East of England (543/4,826)"
261 | "London (1,270/13,287)"
262 | "North East (387/2,784)"
263 | "North West (1,664/15,419)"
264 | "South Central (1,062/11,272)"
265 | "South East (848/8,666)"
266 | "South West (1,054/10,165)"
267 | "West Midlands (1,032/8,678)"
268 | "Yorkshire & Humber (484/3,946)"
269 | end
270 | meta set slope slope_se, studylab(region) eslabel("Calibration slope") 
271 | meta summarize, random(sjonkman) se(khartung) predinterval(95)
272 | meta forestplot, random(sjonkman) se(khartung) predinterval(95) xline(1) 
273 | graph save "Graph" "\graphs\EP3_slope_nnet.gph", replace 
274 | clear
275 | 
276 | use "\estimates\citl_EP3regionnnet.dta" , clear 
277 | input str50 region
278 | "East Midlands (464/3,337)"
279 | "East of England (543/4,826)"
280 | "London (1,270/13,287)"
281 | "North East (387/2,784)"
282 | "North West (1,664/15,419)"
283 | "South Central (1,062/11,272)"
284 | "South East (848/8,666)"
285 | "South West (1,054/10,165)"
286 | "West Midlands (1,032/8,678)"
287 | "Yorkshire & Humber (484/3,946)"
288 | end
289 | meta set citl citl_se, studylab(region) eslabel("Calibration-in-the-large") 
290 | meta summarize, random(sjonkman) se(khartung) predinterval(95)
291 | meta forestplot, random(sjonkman) se(khartung) predinterval(95) xline(0)
292 | graph save "Graph" "\graphs\EP3_citl_nnet.gph", replace 
293 | clear
294 | 
295 | ********************************************************************************
296 | ********************************************************************************
297 | 
298 | * END FOR NEURAL NETWORK EVALUATION * 
299 | 
300 | 
301 | ********************************************************************************
302 | ********************************************************************************
303 | 
304 | ******************
305 | * XGBoost Models *
306 | ******************
307 | 
308 | *********************************************************************
309 | ** Performance of XGBoost based on 'pooled predictions' and IECV  ** 
310 | *********************************************************************
311 | * The below, initial processing steps are the same from above for the XGBoost model * 
312 | 
313 | * Open saved csv file from R * 
314 | import delimited "\estimates\ML_IECV_predictions\endpoint3_xgboost_iecv_predictions.csv", asdouble 
315 | drop v1 
316 | 
317 | * Destring the predictions, and rename variables for congeniality with Stata files* 
318 | destring xgboost_iecv_predictions, dpcomma replace
319 | summ xgboost_iecv_predictions, det
320 | rename xgboost_iecv_predictions predictions 
321 | rename data_period2patid patid
322 | bys patid: gen m = _n 
323 | 
324 | * Clip the predictions to be between 0 and 1 - they are probabilities * 
325 | count if predictions<0
326 | count if predictions>1
327 | replace predictions = 0.0000001 if predictions<0      ## Miniscule risk rather than zero, will use cloglog transform later!
328 | replace predictions = 0.9999999 if predictions>1  
329 | 
330 | * Save this as interim dataset * 
331 | save "\estimates\ML_IECV_predictions\endpoint3_xgboost_iecv_predictions_formatted.dta", replace
332 | 
333 | * We want to perform model evaluation in accordance with Rubin's rules with 
334 | * multiple imputed datasets. For this, Stata 'needs' an m=0 (i.e. complete case)
335 | * copy, and then the separate imputations. We need to trick Stata into this
336 | * First, keep first imputation, and say it is the complete case, m=0 
337 | keep if m==1 
338 | replace m=0 
339 | 
340 | * Generate mi style missing indicator - say that predictions are missing, then 
341 | * later we can use Stata's mi machinery to handle the 50 predictions for each 
342 | * patid as a separate imputations' predition 
343 | gen i_predictions = 1 
344 | tab m 
345 | replace predictions = . if i_predictions==1 
346 | 
347 | * Now that the predictions in m=0 are missing, append the dataset with the 50 
348 | * true values of the predictions, 1 for each pati in each imputation 
349 | append using  "\estimates\ML_IECV_predictions\endpoint3_xgboost_iecv_predictions_formatted.dta"
350 | 
351 | * Save interim data file 
352 | save  "\estimates\ML_IECV_predictions\endpoint3_xgboost_iecv_predictions_formatted.dta", replace 
353 | 
354 | * Format as multiply imputed dataset - now, m=0 will have no predictions, whereas 
355 | * the m=1 to m=50 will have their respective predictions in 
356 | mi import flong, m(m) id(patid) imputed(predictions)
357 | 
358 | * Convert to wide style for the convenience of dataset size, and for merging later 
359 | mi convert wide 
360 | mi describe 
361 | 
362 | * Save interim datafile after mi conversion * 
363 | save  "\estimates\ML_IECV_predictions\endpoint3_xgboost_iecv_predictions_formatted.dta", replace
364 | 
365 | * For the smoothed calibration plot, we want the average prediction, so that 
366 | * this can be plotted against the observed pseudovalues * 
367 | egen mean_prediction = rowmean(_1_predict - _50_predict) 
368 | 
369 | * Save and clear * 
370 | save  "\estimates\ML_IECV_predictions\endpoint3_xgboost_iecv_predictions_formatted.dta", replace
371 | clear 
372 | 
373 | * Open up the key dataset - this contains psuedovalues, period1, period2, etc. *
374 | use "\final_datasets\OX129_endpoint3_pseudovaluescif.dta", clear 
375 | keep if period==2 
376 | 
377 | * Merge in the formatted predictions from the IECV process for the neural net * 
378 | mi merge 1:1 patid using "\estimates\ML_IECV_predictions\endpoint3_xgboost_iecv_predictions_formatted.dta"
379 | 
380 | 
381 | ************************
382 | ** Pooled predictions **
383 | ************************
384 | 
385 | * Plot the smoothed calibration plot - save out *  
386 | running period2_pseudo mean_prediction if _mi_m==1, ci leg(off) nopts xscale(range(0 1)) yscale(range(0 1)) aspect(1) //
387 | title("Smoothed calibration plot - XGBoost", size(small)) xtitle("Predicted event probability", size(small)) // 
388 | ytitle("Observed event probability", size(small)) addplot(function y=x) graphregion(color(white)) ylabel(0(.2)1) xlabel(0(.2)1)
389 | 
390 | graph save "\graphs\ep3_competing\calibration_xgboost_iecv.gph", replace
391 | 
392 | * Calculate the cloglog transforms * 
393 | mi passive: gen cll_pred = log(-log(-prediction)) 
394 | 
395 | * Calibration of neural network - pooled predictions * 
396 | mi estimate, dots: glm period2_pseudo cll_pred, link(cloglog) vce(robust) noconstant irls
397 | mi estimate, dots: glm period2_pseudo cll_pred, link(cloglog) vce(robust) noconstant irls offset(cll_pred) 
398 |  
399 | * Discrimination - pooled predictions * 
400 | mi register regular ipcw 
401 | mi stset fu_end, origin(fu_start) failure(iecv_event==1) scale(365.25) exit(time fu_start+3652.5)
402 | mi passive: gen hr = exp(prediction)
403 | mi passive: gen invhr = 1/hr 
404 | gen censind = 1 - _d if _st==1
405 | mi estimate, dots cmdok: somersd _t invhr if (_st==1) [iweight=ipcw], cenind(censind) tdist transf(c)
406 | 
407 | 
408 | ****************************************************************
409 | ** Pooled predictions: heterogeneity by ethnicity and age grp **
410 | ****************************************************************
411 | 
412 | * Discrimination in different ethnic groups * 
413 | forval x = 1(1)9 {
414 | 	mi estimate, dots esampvaryok: somersd _t invhr if (_st==1 & ethriskid==`x') [iweight=ipcw], cenind(censind) tdist transf(c)
415 |   } 
416 | 
417 | * Calibration slope and calibration-in-the-large * 
418 | forval x = 1(1)9 { 
419 | 	mi estimate, dots esampvaryok: glm period2_pseudo cll_pred if ethriskid==`x', link(cloglog) vce(robust) noconstant irls  
420 | }
421 | 
422 | forval x = 1(1)9 { 
423 | 	mi estimate, dots esampvaryok: glm period2_pseudo cll_pred if ethriskid==`x', link(cloglog) vce(robust) noconstant irls offset(cll_pred)  
424 | }
425 | 
426 | 
427 | **************
428 | * Age groups *
429 | **************
430 | gen agegroup = 1 
431 | replace agegroup = 2 if age_at_diagnosis>=30 
432 | replace agegroup = 3 if age_at_diagnosis>=40 
433 | replace agegroup = 4 if age_at_diagnosis>=50 
434 | replace agegroup = 5 if age_at_diagnosis>=60 
435 | replace agegroup = 6 if age_at_diagnosis>=70 
436 | replace agegroup = 7 if age_at_diagnosis>=80 
437 | tab agegroup 
438 | 
439 | * Discrimination and calibration in different age bands * 
440 | forval x = 1(1)7 {
441 | 	mi estimate, dots: somersd _t invhr if (_st==1 & agegroup==`x') [iweight=ipcw], cenind(censind) tdist transf(c)    
442 |   } 
443 | 
444 | * Calibration slope * 
445 | forval x = 1(1)7 {
446 | 	mi estimate, dots: glm period2_pseudo cll_pred if agegroup==`x', link(cloglog) noconstant irls vce(robust)     
447 | }
448 | 
449 | * Calibration in the large * 
450 | forval x = 1(1)7 {
451 | 	mi estimate, dots: glm period2_pseudo cll_pred if agegroup==`x', link(cloglog) noconstant irls vce(robust) offset(cll_pred)      
452 | }
453 | 
454 | 
455 | ************************************************
456 | ** IECV predictions and performance estimates **
457 | ************************************************
458 | 
459 | ** Numbers of events and denominators for the forest plots ** 
460 | * Estimated using neural network evaluation steps above * 
461 | 
462 | **************************************************
463 | **     Region-level heterogeneity and results   **
464 | **************************************************
465 | cd "\estimates\" 
466 | 
467 | ***************************
468 | * GT IECV for Harrell's C *
469 | 
470 | capture postutil clear   
471 | tempname C_EP3regionxgboost
472 | postfile `C_EP3regionxgboost' beta st_err val_size using C_EP3regionxgboost.dta , replace 
473 |   
474 |   forval x = 1(1)10 {
475 |   mi estimate, dots: somersd _t invhr if (_st==1 & sha1==`x') [iweight=ipcw], cenind(censind) tdist transf(c)
476 |   local beta = r(table)[1,1]
477 |   local st_err = r(table)[2,1]
478 |   local val_size = e(N)
479 |   post `C_EP3regionxgboost' (`beta') (`st_err') (`val_size')
480 |   }
481 |   
482 |   postclose `C_EP3regionxgboost' 
483 |   
484 | *********************************
485 | * GT IECV for calibration slope *
486 | 
487 | capture postutil clear   
488 | tempname slope_EP3regionxgboost
489 | postfile `slope_EP3regionxgboost' slope slope_se val_size using slope_EP3regionxgboost.dta , replace 
490 |   
491 |   forval x = 1(1)10 {
492 |   mi estimate, dots: glm period2_pseudo cll_pred if sha1==`x', link(cloglog) vce(robust) irls noconstant 
493 |   local slope = r(table)[1,1]
494 |   local slope_se = r(table)[2,1]
495 |   local val_size = e(N)
496 |   post `slope_EP3regionxgboost' (`slope') (`slope_se') (`val_size')
497 |   }
498 |   
499 |   postclose `slope_EP3regionxgboost'  
500 |   
501 | ****************************************
502 | * GT IECV for calibration-in-the-large *
503 | 
504 | capture postutil clear   
505 | tempname citl_EP3regionxgboost
506 | postfile `citl_EP3regionxgboost' citl citl_se val_size using citl_EP3regionxgboost.dta , replace 
507 |   
508 |   forval x = 1(1)10 { 
509 |   mi estimate, dots: glm period2_pseudo cll_pred if sha1==`x', link(cloglog) vce(robust) irls noconstant offset(cll_pred) 
510 |   local citl = r(table)[1,1]
511 |   local citl_se = r(table)[2,1]
512 |   local val_size = e(N)
513 |   post `citl_EP3regionxgboost' (`citl') (`citl_se') (`val_size')
514 |   }
515 |   
516 |   postclose `citl_EP3regionxgboost'
517 |  
518 | 
519 | clear 
520 | 
521 | *****************
522 | * Meta-analyses *
523 | ***************** 
524 | 
525 | ** Random effects meta-analysis pooled performance metrics * 
526 | use "\estimates\C_EP3regionxgboost.dta", clear 
527 | input str50 region
528 | "East Midlands (464/3,337)"
529 | "East of England (543/4,826)"
530 | "London (1,270/13,287)"
531 | "North East (387/2,784)"
532 | "North West (1,664/15,419)"
533 | "South Central (1,062/11,272)"
534 | "South East (848/8,666)"
535 | "South West (1,054/10,165)"
536 | "West Midlands (1,032/8,678)"
537 | "Yorkshire & Humber (484/3,946)"
538 | end
539 | meta set beta st_err, studylab(region) eslabel("Harrell's C")
540 | meta summarize, random(sjonkman) se(khartung) predinterval(95)
541 | meta forestplot, random(sjonkman) se(khartung) predinterval(95) 
542 | graph save "Graph" "\graphs\EP3_harrellsC_xgboost.gph", replace 
543 | clear 
544 | 
545 | use "\estimates\slope_EP3regionxgboost.dta" , clear 
546 | input str50 region
547 | "East Midlands (464/3,337)"
548 | "East of England (543/4,826)"
549 | "London (1,270/13,287)"
550 | "North East (387/2,784)"
551 | "North West (1,664/15,419)"
552 | "South Central (1,062/11,272)"
553 | "South East (848/8,666)"
554 | "South West (1,054/10,165)"
555 | "West Midlands (1,032/8,678)"
556 | "Yorkshire & Humber (484/3,946)"
557 | end
558 | meta set slope slope_se, studylab(region) eslabel("Calibration slope")
559 | meta summarize, random(sjonkman) se(khartung) predinterval(95)
560 | meta forestplot, random(sjonkman) se(khartung) predinterval(95) xline(1) 
561 | graph save "Graph" "\graphs\EP3_slope_xgboost.gph", replace 
562 | clear
563 | 
564 | use "\estimates\citl_EP3regionxgboost.dta" , clear 
565 | input str50 Region
566 | "East Midlands (464/3,337)"
567 | "East of England (543/4,826)"
568 | "London (1,270/13,287)"
569 | "North East (387/2,784)"
570 | "North West (1,664/15,419)"
571 | "South Central (1,062/11,272)"
572 | "South East (848/8,666)"
573 | "South West (1,054/10,165)"
574 | "West Midlands (1,032/8,678)"
575 | "Yorkshire & Humber (484/3,946)"
576 | end
577 | meta set citl citl_se, studylabel("Region") eslabel("Calibration-in-the-large") 
578 | meta summarize, random(sjonkman) se(khartung) predinterval(95)
579 | meta forestplot, random(sjonkman) se(khartung) predinterval(95) xline(0)
580 | graph save "Graph" "\\graphs\EP3_citl_xgboost.gph", replace 
581 | clear
582 | 
583 | ********************************************************************************
584 | ********************************************************************************
585 | 
586 | 
587 | 
588 | *********************************
589 | * META-REGRESSION FOR ML MODELS *
590 | *********************************
591 | 
592 | * Use the pre-made datasets for region-level performance metrics, and the region-level summary demographic characteristics data file * 
593 | 
594 | * XGboost * 
595 | 
596 | ***************
597 | * Harrell's C * 
598 | use "\estimates\C_EP3regionxgboost.dta"
599 | gen sha1 = _n 
600 | merge 1:1 sha1 using "\final_datasets\OX129_regionalperiod2_EP3.dta"
601 | 
602 | meta set beta st_err, studylab(sha1) 
603 | 
604 | meta regress age_at_diagnosis diagnosis_bmi town_int nonwhite, random(sjonkman) se(khartung)
605 | meta regress age1 baseline_bmi1 town_int nonwhite, random(sjonkman) se(khartung) 
606 | 
607 | meta regress age1, random(sjonkman) 
608 | estat bubbleplot 
609 | 
610 | meta regess town_int, random(sjonkman) 
611 | estat bubbleplot 
612 | 
613 | meta regress non_white, random(sjonkman) 
614 | estat bubbleplot 
615 | 
616 | meta regress baseline_bmi1, random(sjonkman) 
617 | estat bubbleplot 
618 | clear 
619 | 
620 | *********************
621 | * Calibration slope *
622 | use "\estimates\slope_EP3regionxgboost.dta"
623 | gen sha1 = _n 
624 | merge 1:1 sha1 using "\final_datasets\OX129_regionalperiod2_EP3.dta"
625 | 
626 | meta set slope slope_se, studylab(sha1) 
627 | 
628 | meta regress age_at_diagnosis diagnosis_bmi town_int nonwhite, random(sjonkman) se(khartung)
629 | meta regress age1 baseline_bmi1 town_int nonwhite, random(sjonkman) se(khartung) 
630 | 
631 | meta regress age1, random(sjonkman) 
632 | estat bubbleplot 
633 | 
634 | meta regess town_int, random(sjonkman) 
635 | estat bubbleplot 
636 | 
637 | meta regress non_white, random(sjonkman) 
638 | estat bubbleplot 
639 | 
640 | meta regress baseline_bmi1, random(sjonkman) 
641 | estat bubbleplot 
642 | clear 
643 | 
644 | 
645 | 
646 | * Neural networks * 
647 | 
648 | ***************
649 | * Harrell's C * 
650 | use "\estimates\C_EP3regionnnet.dta"
651 | gen sha1 = _n 
652 | merge 1:1 sha1 using "\final_datasets\OX129_regionalperiod2_EP3.dta"
653 | 
654 | meta set beta st_err, studylab(sha1) 
655 | 
656 | meta regress age_at_diagnosis diagnosis_bmi town_int nonwhite, random(sjonkman) se(khartung)
657 | meta regress age1 baseline_bmi1 town_int nonwhite, random(sjonkman) se(khartung) 
658 | 
659 | meta regress age1, random(sjonkman) 
660 | estat bubbleplot 
661 | 
662 | meta regess town_int, random(sjonkman) 
663 | estat bubbleplot 
664 | 
665 | meta regress non_white, random(sjonkman) 
666 | estat bubbleplot 
667 | 
668 | meta regress baseline_bmi1, random(sjonkman) 
669 | estat bubbleplot 
670 | clear 
671 | 
672 | *********************
673 | * Calibration slope *
674 | use "\estimates\slope_EP3regionnnet.dta"
675 | gen sha1 = _n 
676 | merge 1:1 sha1 using "\final_datasets\OX129_regionalperiod2_EP3.dta"
677 | 
678 | meta set slope slope_se, studylab(sha1) 
679 | 
680 | meta regress age_at_diagnosis diagnosis_bmi town_int nonwhite, random(sjonkman) se(khartung)
681 | meta regress age1 baseline_bmi1 town_int nonwhite, random(sjonkman) se(khartung) 
682 | 
683 | meta regress age1, random(sjonkman) 
684 | estat bubbleplot 
685 | 
686 | meta regess town_int, random(sjonkman) 
687 | estat bubbleplot 
688 | 
689 | meta regress non_white, random(sjonkman) 
690 | estat bubbleplot 
691 | 
692 | meta regress baseline_bmi1, random(sjonkman) 
693 | estat bubbleplot 
694 | clear 
695 | 
696 | ********************************************************************************
697 | ********************************************************************************
698 | 
699 | ** END ** 
700 | 
701 | 


--------------------------------------------------------------------------------
/3_OX129_Competing_risks_regression:
--------------------------------------------------------------------------------
  1 | ## Competing risks regression modelling ##
  2 | 
  3 | 
  4 | ## Multiple imputation has already been carried out in accordance with principles
  5 | ## for Cox models (e.g. including endpoint, Aalen-Johanssen cumulative hazard function ##
  6 | 
  7 | ## Strategy here is, using the same set of 50 imputations, to permit and identify optimal FP terms 
  8 | ## for age/BMI/Townsend score for a pseudo-observations-based competing risks regression model, 
  9 | ## then conduct the model fitting and IECV thereafter ## 
 10 | 
 11 | 
 12 | ********************************************************************************
 13 | *              1) Fractional polynomials, predictor selection                  *
 14 | ********************************************************************************
 15 | 
 16 | * As candidate predictors may have different associations with the outcome in a 
 17 | * competing risks, pseudovalues-based model, we start with the 'raw data' and 
 18 | * repeat the processing steps to: 
 19 | * Find the optimal FP terms for age, BMI, Townsend
 20 | 
 21 | * Start with the original dataset (complete case, not yet imputed) * 
 22 | use "\final_datasets\OX129_endpoint3.dta", clear 
 23 | count 
 24 | 
 25 | * Stset the data, estimate pseudovalues for the cumulative incidence function *  
 26 | * exit3status = contains all cause death (use to define competing event)
 27 | tab d_breastcancer                                                                \\ 1=breast cancer-related death, 0 = no
 28 | tab exit3status                                                                   \\ 1 = death from any cause, 0 = censored/left alive
 29 | la de endpoints 0 "Censored" 1 "Breast cancer death" 2 "Competing death"
 30 | gen endpoint = 0 
 31 | replace endpoint = 1 if d_breastca==1                                              \\ Died, and breast cancer was on death certificate 
 32 | replace endpoint = 2 if exit3status==1 & d_breastca!=1                             \\ Died, but not from breast cancer
 33 | la val endpoint endpoints 
 34 | tab endpoint 
 35 | 
 36 | * Make time variable - unit is years * 
 37 | gen time = exit3date - outcomedate6                                               \\ Date of exit/death, minus date of breast cancer diagnosis 
 38 | replace time = time/365.25
 39 | summ time, det 
 40 | 
 41 | * Stset the data - unrestricted timeframe. Will use this to define competing
 42 | * events, breast cancer deaths, and compete the CIF pseudovalues at 10years  
 43 | stset time, failure(endpoint==1) exit(time 10)                                    \\ stpci command for pseudovalues automatically assumes 
 44 |                                                                                   \\ the other ategory of endpoint is the competing one 
 45 | cd "\ado\plus\"
 46 | stpci, at(10)                                                                      \\ estimate jack-knife pseudovalues for Aalen-Johanssen CIF at 10 years 
 47 | summ pseudo, det 
 48 | 
 49 | * Fractional polynomials for age, BMI at diagnosis, and Townsend score * 
 50 | mfp, all: glm pseudo age_at_diagnosis diagnosis_bmi town_int                       \\ In complete case data, as was done for Cox model 
 51 |                                                                                    \\ Sometimes there is non-convergence with the cloglog - use the linear version for FPs 
 52 | 
 53 | * Review FP terms returned - age and BMI have non-linear terms, Townsend = 1 * 
 54 | * Age = 1, 2  (different than Cox model)
 55 | * BMI at diagnosis = -2, -2 (same as Cox model)
 56 | * Townsend = 1, so keep linear. Let's 'undo' the scaling, and drop Itown 
 57 | 
 58 | * Plotting FP terms for the competing risks models * 
 59 | glm pseudo Iage*, link(cloglog) irls                                              \\ Model on cloglog scale, iteratively weighted least squares for maximisation
 60 | predict age_fp, xb                                                                \\ predict model linear predictor 
 61 | replace age_fp = 1 - exp(-exp(age_fp))                                            \\ complementar log-log transform of linear predictor = probabililty 
 62 | twoway fpfitci age_fp age_at_diagnosis, ytitle("Predicted event probability") xtitle("Age at diagnosis")
 63 | graph save "Graph" "\graphs\ep3_competing\FP_terms_age.gph", replace 
 64 | 
 65 | glm pseudo Idiag*, link(cloglog) irls                                             \\ Repeat this, but for BMI with its FP terms 
 66 | predict bmi_fp, xb 
 67 | replace bmi_fp = 1 - exp(-exp(bmi_fp))
 68 | twoway fpfitci bmi_fp diagnosis_bmi, ytitle("Predicted event probability") xtitle("Body mass index at diagnosis")
 69 | graph save "Graph" "\graphs\ep3_competing\FP_terms_bmi.gph", replace 
 70 | 
 71 | 
 72 | * Now we know the functional forms, and that they are very similar (one FP term
 73 | * for age is different, and easily coded), we can generate this new term in the 
 74 | * imputed dataset that we have previously generated. 
 75 | 
 76 | * close this dataset, then move on to core dataset for endpoint 3 *
 77 | clear 
 78 | 
 79 | ********************************************************************************
 80 | 
 81 | use "\final_datasets\OX129_endpoint3_imputed.dta", clear                     \\ Same set of 50 imputations used for Cox models 
 82 | count 
 83 | 
 84 | * Generate the new FP term for age - we will have FP terms (1,2) for CR model* 
 85 | * Detail on scale, etc. from mfp command above * 
 86 | gen double age_fp1 = (age_at_diagnosis/10) - 6.301833523
 87 | gen double age_fp2 = (age_at_diagnosis/10)^2 - 39.71310575
 88 | summ age_fp1, det 
 89 | summ age_fp2, det 
 90 | 
 91 | * Generate the BMI terms * 
 92 | mi passive: gen double bmi_x = bmi/10 
 93 | mi passive: gen double bmi_fp1 = (bmi_x^-2) - 0.1353749689
 94 | mi passive: gen double bmi_fp2 = (bmi_x^-2)*ln(bmi_x) = 0.1353551232
 95 | 
 96 | * Stset the data, estimate pseudovalues for the cumulative incidence function *  
 97 | * exit3status = contains all cause death (use to define competing event)
 98 | tab d_breastcancer 
 99 | tab exit3status
100 | la de endpoints 0 "Censored" 1 "Breast cancer death" 2 "Competing death"
101 | gen endpoint = 0 
102 | replace endpoint = 1 if d_breastca==1                                            \\ Same as above
103 | replace endpoint = 2 if exit3status==1 & d_breastca!=1 
104 | la val endpoint endpoints 
105 | tab endpoint 
106 | 
107 | * Make time variable * 
108 | gen time = exit3date - outcomedate6
109 | replace time = time/365.25
110 | 
111 | * Stset the data - unrestricted timeframe. Will use this to define competing
112 | * events, breast cancer deaths, and compete the CIF pseudovalues at 10years  
113 | mi stset time, failure(endpoint==1) exit(time 10)
114 | 
115 | cd "\ado\plus\"
116 | stpci, at(10)                                                                \\ Aalen-Johanssen cumulative incidence function at 10 years
117 | summ pseudo, det 
118 | 
119 | * Perform predictor selection * 
120 | * Fit the 'full' model - generalised linear model with cloglog link functon to approximate Fine-Gray; iteratively rewighted least squares for maximisation 
121 | * Robust standard errors in view of non-independence of pseudovalues * 
122 | 
123 | global covariates_crr = "age_fp* bmi_fp* townsend i.ethriskid i.alcohol_cat i.smoke_cat ib6.cancer_route ib2.pr_status, ib2.her2_status,                         //
124 | i.fh_breastca i.radio i.chemo i.mastectomy i.other_surg i.cancer_stage i.cancer_grade i.ckd i.cirrhosis i.htn i.ihd i.t1dm i.t2dm i.vasculitis                   //
125 | i.lupus i.thiazide i.betablocker i.acei i.raa i.tca i.ssri i.maoi i.otherantidepress i.ocp i.hrt i.anti_psycho c.age_fp1##i.chemo c.age_fp2##i.chemo             //
126 | c.age_fp1##i.mastec c.age_fp2##i.mastec c.age_fp1##i.other_surg c.age_fp2##i.other_surg c.age_fp1##i.fh_breastca c.age_fp2##i.fh_breastca i.ethriskid##c.age_fp1 //
127 | i.ethriskid##c.age_fp2 i.ethriskid##i.other_surg i.ethriskid##i.mastectomy i.ethriskid##i.chemotherapy i.ethriskid##i.radiotherapy"
128 | 
129 | timer on 1 
130 | mi estimate, dots eform: glm pseudo $covariates_crr, link(cloglog) vce(robust) irls                         
131 | timer off 1 
132 | timer list 
133 | timer clear 
134 | 
135 | * Save exponentaited coefficients for pre-selection model * 
136 | cd "\models\Endpoint_3\"
137 | parmest, list(,) saving(CR_endpoint3_parmest_preselection.dta, replace) label eform for(estimate min95 max95) 
138 | 
139 | * Selected covariates - SHR<0.9 or >1.1, with p<0.01; interactions if p<0.01 *
140 | * For categories, if >1 of the sub-groups is sig, and (if relevant) 'dose' trend seend, retain variable * 
141 | global select_covariates_crr = "age_fp* bmi_fp* i.smoke_cat i.ib6.cancer_route ib2.pr_status ib2.her2_status ib2.er_status i.radio i.mastectomy i.other_surg i.cancer_stage i.cancer_grade i.hrt c.age_fp1##i.other_surg" 
142 | 
143 | * Fit the final model * 
144 | mi estimate, dots eform: glm pseudo $select_covariates_crr, link(cloglog) vce(robust) irls 
145 | 
146 | * Save coefficients and constant term for later plotting/use * 
147 | parmest, list(,) saving(CR_endpoint3_parmest.dta, replace) label eform for(estimate min95 max95)      // Save the model as 'SHRs'
148 | parmest, list(,) saving(CR_endpoint3_parmest_coefs.dta, replace) label for(estimate min95 max95)      // Save the model as coefficients 
149 | 
150 | 
151 | ********************************************************************************
152 | *                              2) Estimate IPCWs                               *
153 | ********************************************************************************
154 | 
155 | * Use Nelson-Aalen cumulative hazard - using same covariates as final model * 
156 | * Same covariate used as those selected for the prediction model * 
157 | mi stset time if period==2, failure(endpoint==0) exit(time 10)                                      // Code inspired by ado file for Stata command 'stbrier'
158 | sts gen basechazard = na 
159 | gen base_ipcw = . 
160 | summ basechazard if _t==10                                                                          // Nelson-Aalen cumulative hazard for censoring at 10 years 
161 | replace base_ipcw = r(mean) if period==2 
162 | 
163 | cd "\models\Endpoint_3\"
164 | mi estimate, dots saving(ipcw, replace): stcox $select_covariates_crr if period==2, estimate efron      // Fit model for censoring times, rather than death times
165 | mi predict xbC using ipcw if period==2, xb                                                              // Linear predictor (xb) from this censoring model 
166 | summ xbC, det 
167 | 
168 | gen prob_cens = (exp(-base_ipcw)^exp(xbC))                                                              // Combine baseline 'hazard' with linear predictor for censoring  
169 | summ prob_cens, det 
170 | gen ipcw = 1/prob_cens                                                                                  // Inverse probability of being censored 
171 | summ ipcw, det 
172 | 
173 | *Ensure data adequately re-stset - endpoint is breast cancer death * 
174 | mi stset time, failure(endpoint==1) exit(time 10)                                                       // Re-stset so that we go back to dealing with deaths
175 | 
176 | 
177 | save "\final_datasets\OX129_endpoint3_pseudovaluescif.dta", replace                                     // Save as new, standalone dataset 
178 | clear 
179 | 
180 | ********************************************************************************
181 | ********************************************************************************
182 | 
183 | 
184 | 
185 | ********************************************************************************
186 | *                 3) Internal-external cross-validation run                    *
187 | ********************************************************************************
188 | 
189 | **********************
190 | * Preparing for IECV * 
191 | **********************
192 | 
193 | * By using pseudovalues for the endpoint, and need to have separate time periods 
194 | * for IECV, we will use two 'sub-datasets' - one for period 1, another for 
195 | * period 2.  
196 | * Period 1 data will be used to fit model, linear predictor and probabilities will be 
197 | * calculated in iteratively held-out Period 2 samples * 
198 | 
199 | use "/final_datasets\OX129_endpoint3_pseudovaluescif.dta", clear                                  // The Competing Risks modelling dataset made above 
200 | drop fu_start fu_end follow_up time                                                               // Keep clean so that we can be sure on dates/times 
201 |  
202 | * Need to generate a measure of FU time from entry to earliest of: event/competing event/censoring/end of period 1/end of study * 
203 | * Outcomedate6 = date of breast cancer diagnosis * 
204 | gen fu_start = outcomedate6 
205 | format fu_start %td 
206 | gen fu_end = . 
207 | 
208 | * Timesplit is the cut off between the two periods * 
209 | gen timesplit = mdy(01,01,2010)
210 | format timesplit %td
211 | tab period, m 
212 | 
213 | * Follow-up must end at the timesplit if entered in period 1 *
214 | * Take the earliest of the timesplit, or other recorded exit date from cohort * 
215 | replace fu_end = min(exit3date, timesplit) if period==1 
216 | replace fu_end = exit3date if period==2 
217 | format fu_end %td 
218 | 
219 | * Simple calc of follow-up in the IECV framework * 
220 | gen follow_up = fu_end-fu_start
221 | summ follow_up, det
222 | replace follow_up = follow_up/365.25                               // Scale is years 
223 | summ follow_up, det 
224 | 
225 | * Overall event counts, any time in study period * 
226 | tab endpoint 
227 | 
228 | * Generate new event indicator for IECV - event only within period * 
229 | gen iecv_event = 0 
230 | replace iecv_event = 1 if period==1 & endpoint==1 & exit3date<timesplit    // IECV endpoint = 1 if breast cancer death within Period  
231 | replace iecv_event = 1 if period==2 & endpoint==1 
232 | 
233 | replace iecv_event = 2 if period==1 & endpoint==2 & exit3date<timesplit    // IECV endpoint = 2 if competing cause of death within Period 
234 | replace iecv_event = 2 if period==2 & endpoint==2 
235 | 
236 | tab period 
237 | tab iecv_event 
238 | tab period iecv_event 
239 | 
240 | * quick quality checks * 
241 | tab endpoint if period==1 
242 | tab iecv_event if period==1 
243 | tab iecv_event if period==2
244 | 
245 | *set directory to call in ado files * 
246 | cd "\ado\plus\"
247 | 
248 | * Estimate pseudovalues for CIF at 10 years in Period 1 data - this will be the 
249 | * 'target' for the models in IECV * 
250 | mi stset follow_up if period==1, failure(iecv_event==1) exit(time 10)          
251 | tab endpoint iecv_event if period==1  
252 | stpci if period==1, at(10) gen(period1_pseudo)                        // period1_pseudo = pseudovalues calculated in Period 1 data - endpoint for models in IECV
253 | summ period1_pseudo, det 
254 | 
255 | mi stset follow_up if period==2, failure(iecv_event==1) exit(time 10)
256 | stpci if period==2, at(10) gen(period2_pseudo)
257 | summ period2_pseudo, det 
258 | 
259 | save "\final_datasets\OX129_endpoint3_pseudovaluescif.dta", replace 
260 | 
261 | * Convert to flong style so that all predictions are stored (they may vary by 
262 | * individual, as they may have varying imputed values). Want to use the MI data 
263 | * to calculate the performance metrics * 
264 | mi convert flong                                                     // Takes a minute and expands RAM usage
265 | 
266 | ************
267 | * Run IECV *
268 | ************ 
269 | 
270 | * Generate new variables in which to store held out predictions* 
271 | * Want to keep linear predictor (can be converted into event probability later) *
272 | * Then, we can use them for the pooled overall metrics, but also to calculate centre-specific estimates *  
273 | gen iecv_xb = .  
274 | 
275 | global select_covariates_crr = "age_fp* bmi_fp* i.smoke_cat i.ib6.cancer_route ib2.pr_status ib2.her2_status ib2.er_status i.radio i.mastectomy i.other_surg i.cancer_stage i.cancer_grade i.hrt c.age_fp1##i.other_surg" 
276 | 
277 | 
278 | ** Imputation numbers 7 and 29 cause intermittent issues with model not converging on them * 
279 | * This leads to missing predictions when using mi predict, which affects downstream code * 
280 | * Therefore, remove these 2 imputations (48/50) and use these throughout the 
281 | * evaluation steps * 
282 | mi set m -= (7 29)                                      // In Stata, an error on a model fitted to 1 imputation throws off the entire set of predictions...
283 | mi describe                                             // Therefore, better to keep 96% of data and make full use of IECV process 
284 | 
285 | * Loop over each region, fit model, estimate metrics on held out region data. Later, we present individual practice results * 
286 | cd "\estimates\"
287 | 
288 | forval x = 1(1)10 {                                      // 10 geographical regions
289 | 	mi estimate, dots errorok notable saving(myestiecv, replace): glm period1_pseudo $select_covariates_crr if (sha1!=`x' & period==1), link(cloglog) irls vce(robust)
290 | 	mi predict double xb if (sha1==`x' & period==2) using myestiecv, xb storecompleted 
291 | 	replace iecv_xb = xb if iecv_xb==. 
292 | 	drop xb 
293 | 	display `x'                                             // Keep track of how far through process you are 
294 | }
295 | 
296 | summ iecv_xb, det 
297 | count if iecv_xb==. 
298 | tab period 
299 | 
300 | * Generate predicted risks - as probabilities, by direct transformation (cloglog) of model linear predictor * 
301 | gen iecv_probabilities = 1 - exp(-exp(iecv_xb))
302 | summ iecv_probabilities, det 
303 | 
304 | histogram iecv_prob, percent 
305 | graph save "Graph" "\graphs\ep3_competing\Histogram_EP3_competingrisks_predicted_risks_50imp.gph", replace 
306 | 
307 | * Save updated dataset: save predictions, and then can use for the model evaluation* 
308 | save "\\final_datasets\OX129_endpoint3_competingrisk_IECV.dta" , replace
309 | 
310 | 
311 | ****************************************************************
312 | ** Calculate pooled metrics from predictions on held-out data **
313 | ****************************************************************
314 | 
315 | * Calibration plot - running smoother between predicted probs, and individual observed pseudo-probabilities, following stcoxcal approach of Royston * 
316 | * Use average prediction across all imputed datasets - mi convert wide * 
317 | mi convert wide 
318 | cd "\ado\plus\"
319 | running period2_pseudo iecv_probabilities, ci leg(off) nopts addplot(function y=x) ysc(range(0 1)) xsc(range(0 1))  //
320 | ylab(0(.2)1) xlab(0(.2)1) title("Smoothed calibration plot - Competing risks regression model", size(small))  //
321 | xtitle("Predicted event probability") ytitle("Observed event probability") graphregion(color(white)) aspect(1)
322 | 
323 | graph save "Graph" "\graphs\ep3_competing\EP3_competing_smoothed_calibration.gph", replace
324 | 
325 | * Predicted risks distribution - average prediction across all imputations * 
326 | histogram iecv_prob, percent 
327 | graph save "Graph" "\graphs\ep3_competing\Histogram_EP3_competingrisks_predicted_risks_average.gph", replace 
328 | 
329 | * clear this without saving, then re-open the database so all imputation's predictions are  
330 | * used for other aspects of evaluation * 
331 | clear 
332 | 
333 | use "\final_datasets\OX129_endpoint3_competingrisk_IECV.dta" , clear 
334 | 
335 | * Calibration slope and calibration-in-the-large * 
336 | * Use the cloglog transform of predicted probabilities, and model on cloglog scale * 
337 | gen cll_pred = log(-log(1-iecv_prob))                              // data is mi stset flong 
338 | mi estimate, dots: glm period2_pseudo cll_pred, link(cloglog) vce(robust) noconstant irls   
339 | mi estimate, dots: glm period2_pseudo cll_pred, link(cloglog) vce(robust) noconstant itls offset(cll_pred)  
340 | 
341 | * Harrell's C - inverse probability of censoring weighted *
342 | mi register regular ipcw                                                 // Tell Stata that each individual has this in every imputation 
343 | gen invprob = 1/iecv_probabilities                                       // somersd assumes that higher score = favourable survival. Need to invert this!
344 | mi stset follow_up if period==2, failure(endpoint==1) exit(time 10)
345 | gen censind = 1-_d if _st==1 & period==2                                 // need censoring indicator 
346 | mi estimate, cmdok dots: somersd _t invprob if (_st==1 & period==2) [iweight=ipcw], cenind(censind) tdist transf(c)    // Weight each contribution by IPCensoring 
347 | 
348 | *************************************************************
349 | * Discrimination and calibration in different ethnic groups *
350 | *************************************************************
351 | * Using esampvarok means that we can use imputed data - ethnicity may vary across 
352 | * imputation if was originally missing for that individual * 
353 | 
354 | tab ethriskid _d if period==2 & _mi_m==0, m                              // Counts in complete case data as this is stationary 
355 | 
356 | forval x = 1(1)9 {
357 | 	mi estimate, cmdok dots esampvaryok: somersd _t invprob if (_st==1 & period==2 & ethriskid==`x') [iweight=ipcw], cenind(censind) tdist transf(c)
358 |   }
359 | 
360 | * Calibration slope * 
361 | forval x = 1(1)9 { 
362 | 	mi estimate, dots esampvaryok: glm period2_pseudo cll_pred if ethriskid==`x', link(cloglog) vce(robust) noconstant irls 
363 | }
364 | 
365 | * Calibration-in-the-large * 
366 | forval x = 1(1)9 { 
367 | 	mi estimate, dots esampvaryok: glm period2_pseudo cll_pred if ethriskid==`x', link(cloglog) vce(robust) noconstant irls offset(cll_pred)  
368 | }
369 | 
370 | *****************************
371 | * Performance in age groups * 
372 | *****************************
373 | gen agegroup = 1 
374 | replace agegroup = 2 if age_at_diagnosis>=30 
375 | replace agegroup = 3 if age_at_diagnosis>=40 
376 | replace agegroup = 4 if age_at_diagnosis>=50 
377 | replace agegroup = 5 if age_at_diagnosis>=60 
378 | replace agegroup = 6 if age_at_diagnosis>=70 
379 | replace agegroup = 7 if age_at_diagnosis>-80 
380 | 
381 | forval x = 1(1)7 {
382 | 	mi estimate, cmdok dots esampvaryok: somersd _t invprob if (_st==1 & period==2 & agegroup==`x') [iweight=ipcw], cenind(censind) tdist transf(c)
383 |   }
384 | 
385 | * Calibration slope * 
386 | forval x = 1(1)7 { 
387 | 	mi estimate, dots esampvaryok: glm period2_pseudo cll_pred if agegroup==`x', link(cloglog) vce(robust) noconstant irls 
388 | }
389 | 
390 | * Calibration-in-the-large * 
391 | forval x = 1(1)7 { 
392 | 	mi estimate, dots esampvaryok: glm period2_pseudo cll_pred if agegroup==`x', link(cloglog) vce(robust) noconstant irls offset(cll_pred)  
393 | }
394 | 
395 | ********************************************************************************
396 | ********************************************************************************
397 | 
398 | 
399 | 
400 | **************************************************
401 | **  4) Region-level heterogeneity and results   **
402 | **************************************************
403 | 
404 | cd "\estimates\" 
405 | 
406 | ***************************
407 | * GT IECV for Harrell's C *                        // As for Cox models, run loop estimating performance for each region, save as own dataset, which used later 
408 | 
409 | capture postutil clear   
410 | tempname C_EP3regioncompeting
411 | postfile `C_EP3regioncompeting' beta st_err val_size using C_EP3regioncompeting.dta , replace 
412 |   
413 |   forval x = 1(1)10 {
414 |   mi estimate, cmdok dots: somersd _t invprob if (_st==1 & sha1==`x' & period==2) [iweight=ipcw], cenind(censind) tdist transf(c) 
415 |   local beta = r(table)[1,1]
416 |   local st_err = r(table)[2,1]
417 |   local val_size = e(N)
418 |   post `C_EP3regioncompeting' (`beta') (`st_err') (`val_size')                      // beta = point estimate, st_err = standard error thereof 
419 |   }
420 |   
421 |   postclose `C_EP3regioncompeting' 
422 |   
423 | *********************************
424 | * GT IECV for calibration slope *
425 | 
426 | capture postutil clear   
427 | tempname slope_EP3regioncompeting
428 | postfile `slope_EP3regioncompeting' slope slope_se val_size using slope_EP3regioncompeting.dta , replace          // linear regression between predicted and observed probs 
429 |   
430 |   forval x = 1(1)10 {
431 |   mi estimate, dots: glm period2_pseudo cll_pred if (sha1==`x' & period==2), link(cloglog) vce(robust) irls noconstant  
432 |   local slope = r(table)[1,1]             
433 |   local slope_se = r(table)[2,1]
434 |   local val_size = e(N)
435 |   post `slope_EP3regioncompeting' (`slope') (`slope_se') (`val_size')
436 |   }
437 |   
438 |   postclose `slope_EP3regioncompeting' 
439 |   
440 | ****************************************
441 | * GT IECV for calibration-in-the-large *
442 | 
443 | capture postutil clear   
444 | tempname citl_EP3regioncompeting
445 | postfile `citl_EP3regioncompeting' citl citl_se val_size using citl_EP3regioncompeting.dta , replace               // offset used to constrain slope to 1, take the intercept
446 |   
447 |   forval x = 1(1)10 { 
448 |   mi estimate, dots: glm period2_pseudo cll_pred if (sha1==`x' & period==2), link(cloglog) vce(robust) irls noconstant offset(cll_pred)  
449 |   local citl = r(table)[1,1]
450 |   local citl_se = r(table)[2,1]
451 |   local val_size = e(N)
452 |   post `citl_EP3regioncompeting' (`citl') (`citl_se') (`val_size')
453 |   }
454 |   
455 |   postclose `citl_EP3regioncompeting'
456 |  
457 |  save "\final_datasets\OX129_endpoint3_competingrisk_IECV.dta" , replace 
458 |  
459 |  * numbers of events per region in data - for forest plots below * 
460 |  tab sha1 _d if period==2 & _mi_m==0 
461 |  
462 | clear 
463 | 
464 | *****************
465 | * Meta-analyses *
466 | ***************** 
467 | 
468 | 
469 | ** Random effects meta-analysis pooled performance metrics * 
470 | use "\estimates\C_EP3regioncompeting.dta", clear 
471 | input str60 region                                                     // Make new variable containing Region name, event counts, and total count for reference 
472 | "East Midlands (464/3,337)"
473 | "East of England (543/4,826)"
474 | "London (1,270/13,287)"
475 | "North East (387/2,784)"
476 | "North West (1,664/15,419)"
477 | "South Central (1,062/11,272)"
478 | "South East (848/8,666)"
479 | "South West (1,054/10,165)"
480 | "West Midlands (1,032/8,678)"
481 | "Yorkshire & Humber (484/3,946)"
482 | end
483 | meta set beta st_err, studylab(region) eslabel("Harrell's C")
484 | meta summarize, random(sjonkman) se(khartung) predinterval(95)
485 | meta forestplot, random(sjonkman) se(khartung) predinterval(95) 
486 | graph save "Graph" "\graphs\EP3_harrellsC_competing.gph", replace 
487 | clear 
488 | 
489 | use "\estimates\slope_EP3regioncompeting.dta" , clear 
490 | input str60 region
491 | "East Midlands (464/3,337)"
492 | "East of England (543/4,826)"
493 | "London (1,270/13,287)"
494 | "North East (387/2,784)"
495 | "North West (1,664/15,419)"
496 | "South Central (1,062/11,272)"
497 | "South East (848/8,666)"
498 | "South West (1,054/10,165)"
499 | "West Midlands (1,032/8,678)"
500 | "Yorkshire & Humber (484/3,946)"
501 | end
502 | meta set slope slope_se, studylab(region) eslabel("Calibration slope") 
503 | meta summarize, random(sjonkman) se(khartung) predinterval(95)
504 | meta forestplot, random(sjonkman) se(khartung) predinterval(95) xline(1)
505 | graph save "Graph" "\graphs\EP3_slope_competing.gph", replace 
506 | clear
507 | 
508 | use "\estimates\citl_EP3regioncompeting.dta" , clear 
509 | input str60 region
510 | "East Midlands (464/3,337)"
511 | "East of England (543/4,826)"
512 | "London (1,270/13,287)"
513 | "North East (387/2,784)"
514 | "North West (1,664/15,419)"
515 | "South Central (1,062/11,272)"
516 | "South East (848/8,666)"
517 | "South West (1,054/10,165)"
518 | "West Midlands (1,032/8,678)"
519 | "Yorkshire & Humber (484/3,946)"
520 | end
521 | meta set citl citl_se, studylab(region) eslabel("Calibration-in-the-large") 
522 | meta summarize, random(sjonkman) se(khartung) predinterval(95)
523 | meta forestplot, random(sjonkman) se(khartung) predinterval(95) xline(0)
524 | graph save "Graph" "\graphs\EP3_citl_competing.gph", replace 
525 | clear
526 | 
527 | 
528 | log close 
529 | 
530 | 
531 | ********************************************************************************
532 | ********************************************************************************
533 | 
534 | 
535 | ************************
536 | ** 5) Meta-regression **
537 | ************************
538 | 
539 | * Form collapsed dataset which contains the region-level factors (period 2 data) to be explored in meta-reg *        // Same as for Cox models 
540 | use "\\final_datasets\OX129_endpoint3_imputed_IECV.dta" , clear                                                      // Repeated here for clarity 
541 | mi convert wide 
542 | keep if period==2 
543 | gen nonwhite = 0
544 | replace nonwhite=1 if (ethriskid!=. & ethriskid>1)
545 | gen age1 = age_at_diagnosis
546 | gen baseline_bmi1 = diagnosis_bmi
547 | gen town_int1 = town_int
548 | collapse (mean)age_at_diagnosis diagnosis_bmi town_int (sd)age1 baseline_bmi1 town_int1 (percent)nonwhite, by(sha1)
549 | tab age_at_diagnosis 
550 | tab diagnosis_bmi 
551 | tab town_int 
552 | tab age1
553 | tab baseline_bmi1 
554 | tab town_int1 
555 | tab nonwhite
556 | save "\\final_datasets\OX129_regionalperiod2_EP3.dta", replace 
557 | clear 
558 |  
559 | **********************************************************************************
560 | * Examine effect of age, BMI, deprivation and non-white ethnicity on perf hetero *
561 | ********************************************************************************** 
562 | 
563 | *****************
564 | ** Harrell's C **  
565 | use "\estimates\C_EP3regioncompeting.dta"
566 | gen sha1 = _n 
567 | merge 1:1 sha1 using "\final_datasets\OX129_regionalperiod2_EP3.dta"
568 | 
569 | meta set beta st_err, studylab(sha1)
570 | 
571 | meta regress age_at_diagnosis diagnosis_bmi town_int nonwhite, random(sjonkman) se(khartung)
572 | meta regress age1 baseline_bmi1 town_int nonwhite, random(sjonkman) se(khartung)
573 | 
574 | meta regress age_at_diagnosis, random(sjonkman)
575 | estat bubbleplot 
576 | graph save "Graph" "\graphs\ep3_competing\bubbleplot_EP3_C_age_competing.gph", replace 
577 | 
578 | meta regress age1, random(sjonkman)
579 | estat bubbleplot 
580 | graph save "Graph" "\graphs\ep3_competing\bubbleplot_EP3_C_ageSD_competing.gph", replace 
581 | 
582 | meta regress town_int, random(sjonkman)
583 | estat bubbleplot 
584 | graph save "Graph" "\graphs\ep3_competing\bubbleplot_EP3_C_townsend_competing.gph", replace 
585 | 
586 | meta regress nonwhite, random(sjonkman)
587 | estat bubbleplot 
588 | graph save "Graph" "\graphs\ep3_competing\bubbleplot_EP3_C_ethnic_competing.gph", replace 
589 | 
590 | meta regress diagnosis_bmi, random(sjonkman)
591 | estat bubbleplot 
592 | graph save "Graph" "\graphs\ep3_competing\bubbleplot_EP3_C_bmi_competing.gph", replace 
593 | 
594 | meta regress baseline_bmi, random(sjonkman)
595 | estat bubbleplot 
596 | graph save "Graph" "\graphs\ep3_competing\bubbleplot_EP3_C_bmiSD_competing.gph", replace 
597 | 
598 | clear 
599 | 
600 | 
601 | ***********************
602 | ** Calibration slope ** 
603 | use "\estimates\slope_EP3regioncompeting.dta"
604 | gen sha1 = _n 
605 | merge 1:1 sha1 using "\final_datasets\OX129_regionalperiod2_EP3.dta"
606 | 
607 | meta set slope slope_se, studylab(sha1)
608 | 
609 | meta regress age_at_diagnosis diagnosis_bmi town_int nonwhite, random(sjonkman) se(khartung)
610 | meta regress age1 baseline_bmi1 town_int nonwhite, random(sjonkman) se(khartung)
611 | 
612 | meta regress age_at_diagnosis, random(sjonkman)
613 | estat bubbleplot 
614 | graph save "Graph" "\graphs\ep3_competing\bubbleplot_EP3_slope_age_competing.gph", replace 
615 | 
616 | meta regress age1, random(sjonkman)
617 | estat bubbleplot 
618 | graph save "Graph" "\graphs\ep3_competing\bubbleplot_EP3_slope_ageSD_competing.gph", replace 
619 | 
620 | meta regress town_int, random(sjonkman)
621 | estat bubbleplot 
622 | graph save "Graph" "\graphs\ep3_competing\bubbleplot_EP3_slope_townsend_competing.gph", replace 
623 | 
624 | meta regress nonwhite, random(sjonkman)
625 | estat bubbleplot 
626 | graph save "Graph" "\graphs\ep3_competing\bubbleplot_EP3_slope_ethnic_competing.gph", replace 
627 | 
628 | meta regress diagnosis_bmi, random(sjonkman)
629 | estat bubbleplot 
630 | graph save "Graph" "\graphs\ep3_competing\bubbleplot_EP3_slope_bmi_competing.gph", replace 
631 | 
632 | meta regress baseline_bmi, random(sjonkman)
633 | estat bubbleplot 
634 | graph save "Graph" "\graphs\ep3_competing\bubbleplot_EP3_slope_bmiSD_competing.gph", replace 
635 | 
636 | clear 
637 | 
638 | 
639 | ******************************
640 | ** Calibration in the large ** 
641 | use "\estimates\citl_EP3regioncompeting.dta"
642 | gen sha1 = _n 
643 | merge 1:1 sha1 using "\final_datasets\OX129_regionalperiod2_EP3.dta"
644 | 
645 | meta set citl citl_se, studylab(sha1)
646 | 
647 | meta regress age_at_diagnosis diagnosis_bmi town_int nonwhite, random(sjonkman) se(khartung)
648 | meta regress age1 baseline_bmi1 town_int nonwhite, random(sjonkman) se(khartung)
649 | 
650 | meta regress age_at_diagnosis, random(sjonkman)
651 | estat bubbleplot 
652 | graph save "Graph" "\graphs\ep3_competing\bubbleplot_EP3_citl_age_competing.gph", replace 
653 | 
654 | meta regress age1, random(sjonkman)
655 | estat bubbleplot 
656 | graph save "Graph" "\graphs\ep3_competing\bubbleplot_EP3_citl_ageSD_competing.gph", replace 
657 | 
658 | meta regress town_int, random(sjonkman)
659 | estat bubbleplot 
660 | graph save "Graph" "\graphs\ep3_competing\bubbleplot_EP3_citl_townsend_competing.gph", replace 
661 | 
662 | meta regress nonwhite, random(sjonkman)
663 | estat bubbleplot 
664 | graph save "Graph" "\graphs\ep3_competing\bubbleplot_EP3_citl_ethnic_competing.gph", replace 
665 | 
666 | meta regress diagnosis_bmi, random(sjonkman)
667 | estat bubbleplot 
668 | graph save "Graph" "\graphs\ep3_competing\bubbleplot_EP3_citl_bmi_competing.gph", replace 
669 | 
670 | meta regress baseline_bmi, random(sjonkman)
671 | estat bubbleplot 
672 | graph save "Graph" "\graphs\ep3_competing\bubbleplot_EP3_citl_bmiSD_competing.gph", replace 
673 | 
674 | clear 
675 | 
676 | ********************************************************************************
677 | ********************************************************************************
678 | 
679 | ## END ## 
680 | 


--------------------------------------------------------------------------------
/5_OX129_XGBoost:
--------------------------------------------------------------------------------
  1 | ## XGBoost model fitting and evaluation with internal-external cross-validation ##
  2 | 
  3 | ## This is conducted in R, after data imputation, formatting, etc. in Stata ##
  4 | 
  5 | ## This stacked imputations approach enables full use of data ##
  6 | ## 'Full' data for entire cohort is used to fit the full model, with Bayesian Optimisation used to 
  7 | ## identify optimal hyperparameters via 5-fold cross-validation ##
  8 | 
  9 | ############################################################################
 10 | ############################################################################
 11 | 
 12 | ## MODEL FITTING ##
 13 | 
 14 | #############################
 15 | ## Load in packages needed ##
 16 | #############################
 17 | library(survival)
 18 | library(readr)
 19 | library(stringr)
 20 | library(haven)
 21 | library(caret) 
 22 | library(ParBayesianOptimization) 
 23 | library(xgboost) 
 24 | 
 25 | 
 26 | ########################################################
 27 | ## Check and set working directory for importing data ##
 28 | ########################################################
 29 | getwd() 
 30 | setwd("/final_datasets/ML/")
 31 | 
 32 | ##################
 33 | ## Load in data ##
 34 | ##################
 35 | data <- read_dta("OX129_endpoint3_stacked50_pseudovalues.dta")     ## Stacked dataset for entire cohort 
 36 | str(data)
 37 | 
 38 | #########################
 39 | ## Variable formatting ##
 40 | #########################
 41 | ## Categorical variables need to be converted into dummy variables ##
 42 | ## First, reformat relevant parameters as factors, then convert to dummies ##
 43 | 
 44 | data$smoke_cat <- factor(data$smoke_cat)                         # Smoking status 
 45 | data$cancer_route <- factor(data$cancer_route)                   # Route to diagnosis
 46 | data$pr_status2 <- factor(data$pr_status2)                       # PR +/-
 47 | data$her2_status2 <- factor(data$her2_status2)                   # HER2 +/-
 48 | data$radiotherapy <- factor(data$radiotherapy)                   # Use of RTx in 1st year 
 49 | data$mastectomy <- factor(data$mastectomy)                       # Mastectomy in 1st year  
 50 | data$other_surgery <- factor(data$other_surgery)                 # Other surgery in 1st year 
 51 | data$cancer_stage <- factor(data$cancer_stage)                   # Stage (I-IV)
 52 | data$cancer_grade <- factor(data$cancer_grade)                   # Grade (differentiation)
 53 | data$hrt <- factor(data$hrt)                                     # Hormone replacement therapy use
 54 | data$sha1 <- factor(data$sha1)                                   # Region - used for IECV
 55 | 
 56 | ## Now, start converting these factors to dummies ##
 57 | ## Set up list of parameters that need dummies ##
 58 | dummy_parameters <- c('radiotherapy', 'hrt', 'cancer_stage',
 59 |                       'cancer_grade', 'pr_status', 'er_status',  
 60 |                       'her2_status', 'cancer_route')
 61 | 
 62 | ## The others will be hamdled as binary ##
 63 | ## Generate the dummies ##
 64 | dummies <- dummyVars(~ smoke_cat + radiotherapy + mastectomy + 
 65 |                        other_surgery + hrt + cancer_stage + 
 66 |                        cancer_grade + pr_status2 + her2_status2 + er_status2 +
 67 |                        cancer_route, data = data)
 68 | 
 69 | ## Package the dummy variables into its own dataset, ready to bind with the ##
 70 | ## other, non-dummy parameters ##
 71 | dummied <- as.data.frame(predict(dummies, newdata=data))
 72 | 
 73 | ## Form the ready-to-process dataset by binding the new dummies with the other 
 74 | #numeric parameters ##
 75 | data_for_model <- cbind(data[, -c(which(colnames(data) %in% dummy_parameters))], 
 76 |                         dummied)
 77 | 
 78 | ## Continuous variables (age, BMI) - no scaling needed for XGB ##
 79 | ## Name variables so that we can track these in variable importance plots, etc. ##
 80 | colnames(data_for_model)[colnames(data_for_model) == 'smoke_cat.0']     <- 'Non_smoker'
 81 | colnames(data_for_model)[colnames(data_for_model) == 'smoke_cat.1']     <- 'Ex_smoker'
 82 | colnames(data_for_model)[colnames(data_for_model) == 'smoke_cat.2']     <- 'Light_smoker'
 83 | colnames(data_for_model)[colnames(data_for_model) == 'smoke_cat.3']     <- 'Moderate_smoker'
 84 | colnames(data_for_model)[colnames(data_for_model) == 'smoke_cat.4']     <- 'Heavy_smoker'
 85 | colnames(data_for_model)[colnames(data_for_model) == 'radiotherapy.1']  <- 'Radiotherapy'
 86 | colnames(data_for_model)[colnames(data_for_model) == 'mastectomy.1']    <- 'Mastectomy'
 87 | colnames(data_for_model)[colnames(data_for_model) == 'other_surgery.1'] <- 'Other_surgery'
 88 | colnames(data_for_model)[colnames(data_for_model) == 'hrt.1']           <- 'HRT_use'
 89 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_stage.1']  <- 'Stage1'
 90 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_stage.2']  <- 'Stage2'
 91 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_stage.3']  <- 'Stage3'
 92 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_stage.4']  <- 'Stage4'
 93 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_grade.1']  <- 'Well_differentiated'
 94 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_grade.2']  <- 'Moderately_differentiated'
 95 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_grade.3']  <- 'Poorly_differentiated'
 96 | colnames(data_for_model)[colnames(data_for_model) == 'pr_status2.3']    <- 'PR_positive'
 97 | colnames(data_for_model)[colnames(data_for_model) == 'her2_status2.3']  <- 'HER2_positive'
 98 | colnames(data_for_model)[colnames(data_for_model) == 'er_status2.3']    <- 'ER_positive' 
 99 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_route.2']  <- 'Emergency_presentation'
100 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_route.3']  <- 'GP_referral'
101 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_route.4']  <- 'Inpatient_elective'
102 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_route.5']  <- 'Other_outpatient_pathway'
103 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_route.6']  <- 'Screening_detected'
104 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_route.7']  <- 'Two_week_wait'
105 | 
106 | ## 'pseudo' contains the jack-knife pseudo-values for the Aalen-Johanssen CIF at 10 years ##
107 | x_cols <-  c('age_at_diagnosis', 'bmi', 'Non_smoker', 'Ex_smoker', 'Light_smoker', 
108 |              'Moderate_smoker', 'Heavy_smoker', 'Radiotherapy', 'Mastectomy', 
109 |              'Other_surgery', 'HRT_use', 'Stage1', 'Stage2', 'Stage3', 'Stage4', 'Well_differentiated', 
110 |              'Moderately_differentiated', 'Poorly_differentiated', 'PR_positive', 
111 |              'HER2_positive', 'ER_positive', 'Emergency_presentation', 'GP_referral', 'Inpatient_elective', 
112 |              'Other_outpatient_pathway', 'Screening_detected', 'Two_week_wait')
113 | y_cols <- c('pseudo')
114 | 
115 | ## Clean up environment to avoid issues with memory ##
116 | rm(dummied) 
117 | rm(data) 
118 | 
119 | ########################
120 | ## Dataset formatting ##
121 | ########################
122 | 
123 | ## Change dataset to a matrix  ##
124 | ## x_train = predictor parameters ##
125 | x_train <- as.matrix(data_for_model[, x_cols])
126 | 
127 | ## Labels = the target  - the pseudovalues ##
128 | label_train <- as.matrix(data_for_model[, y_cols])
129 | 
130 | ## Form an XGB dense matrix for the development data - name it dtrain ##
131 | dtrain <- xgb.DMatrix(data=x_train, label=label_train)
132 | 
133 | 
134 | ###########################################
135 | ## Bayesian optimisation for full model  ##
136 | ###########################################
137 | 
138 | ## Fit full model to entire available dataset - this is the model 
139 | ## that will be evaluated using IECV ## 
140 | ## We need to run cross-validation for hyperparameter tuning ##
141 | 
142 | ## Bayesian optimsation package requires you to first: specify the function 
143 | ## one seeks to optimimse. Here, it is to find the optimal hyperparameters for the 
144 | ## XGBoost model such that the root mean squared error (precicted and observed pseudovalues is minimised ##
145 | 
146 | ## Here, we set up this 'scoring function' so that we find the best values of max_depth, 
147 | ## min_child_weight, eta, subsample, and number of boosting rounds, etc.  #
148 | ## In order to assess the performance of each combination, 5-fold cross-validation is used.
149 | ## We use the inbuilt xgb.cv function for this. 
150 | ## 5-fold CV used to estimates the 'average' performance with each hyperparam selection. The smallest value of the mse desired
151 | ## is found from the evaluation log, and these values used ##
152 | 
153 | scorefunction <- function(max_depth, eta, subsample, number, alpha, gamma, 
154 |                           lambda, colsampletree, colsamplelevel, minchild) {
155 |   
156 |   set.seed(1066)
157 |   
158 |   dtrain <- xgb.DMatrix(data=x_train, label=label_train, weight=weights)
159 |   
160 |   pars <- list(                                   ## Define hyperparameters to eb fed to XGBoost model 
161 |     tree_method = "gpu_hist",                     ## Use GPU support
162 |     sampling_method = "gradient_based",           ## Gradient-based sampling
163 |     objective = "reg:squarederror",               ## Regression task 
164 |     eval_metric = "rmse",                         ## How model 'fit' evaluated
165 |     maximize = FALSE,                             ## Want to minimise this
166 |     max_depth = max_depth,                        ## How deep each tree goes
167 |     eta = eta,                                    ## Learning rate
168 |     subsample = subsample,                        ## Proportion of observations taken in sub-sample  
169 |     alpha = alpha,                                ## alpha, gamma, lambda = regularisation parameters
170 |     gamma = gamma, 
171 |     lambda = lambda, 
172 |     colsample_bytree = colsampletree,             ## Variable columns samples for building each tree
173 |     colsample_bylevel = colsamplelevel            ## Variable columns sampled for adding branches to tree
174 |   )
175 |   
176 |   xgbcv <- xgb.cv(                                ## Use xgboost package's inbuilt cv function 
177 |     params = pars,                                ## Which parameter sets to use 
178 |     data = dtrain, 
179 |     nround = number, 
180 |     nfold = 5, 
181 |     early_stopping_rounds = 10, 
182 |     maximize = FALSE,                               
183 |     verbose = 1
184 |   )
185 |   
186 |   result <- min(xgbcv$evaluation_log$test_rmse_mean)  ## Extract the rmse 
187 |   converted_result <- -1*result                       ## so that BO can maximise it
188 |   
189 |   return(list(Score = converted_result)               ## Return the score to the function
190 |   )
191 |   
192 | }
193 | 
194 | ## Once scoring function set up, need to define the hyperparameter search space #
195 | 
196 | bounds <- list(
197 |   max_depth = c(1L, 6L), 
198 |   eta = c(0.0001, 0.1), 
199 |   subsample = c(0.1, 0.5), 
200 |   number = c(1L, 500L), 
201 |   alpha = c(0L, 20L), 
202 |   gamma = c(0L, 20L), 
203 |   lambda = c(0L, 20L), 
204 |   colsampletree = c(0.1, 0.8), 
205 |   colsamplelevel = c(0.1, 0.8)
206 | )
207 | 
208 | ## Finally, set up the Bayesian optimisation, and run it with the above functions/details ##
209 | 
210 | start <- Sys.time()                              ## Like to time things 
211 | 
212 | set.seed(1066)                                   ## Reproducibility before setting off the BO function 
213 | 
214 | bayesian_boost <- bayesOpt(
215 |   FUN = scorefunction, 
216 |   bounds = bounds,                               ## Tells function which search space to use
217 |   initPoints = 25,                               ## 25 trials first, followed by 
218 |   iters.n = 25,                                  ## 25 runs of BO thereafter. Total of 50 iterations. 
219 |   iters.k = 1, 
220 |   parallel = FALSE, 
221 |   verbose = 2, 
222 |   acq = "ei",                                    ## Expected improvement acquisition function 
223 |   plotProgress = FALSE                           ## Plot this later, rather than sequentially each iter. Graph overload... 
224 | )
225 | 
226 | end <- Sys.time() 
227 | end-start 
228 | 
229 | bayesian_boost$scoreSummary                      ## Get summary of models fitted 
230 | bestpars <- getBestPars(bayesian_boost)
231 | bestpars                                         ## List the optimal hyperparameter configuration 
232 | 
233 | # Store these best values to plug into a model fitted to the full development data #
234 | opt_maxdepth        = bestpars[1]
235 | opt_eta             = bestpars[2]
236 | opt_subsamp         = bestpars[3]
237 | opt_number          = bestpars[4]
238 | opt_alpha           = bestpars[5]
239 | opt_gamma           = bestpars[6]
240 | opt_lambda          = bestpars[7]
241 | opt_colsampletree   = bestpars[8]
242 | opt_colsamplelevel  = bestpars[9] 
243 | 
244 | # set these as the parameters for the model # 
245 | 
246 | parameters <- list(tree_method = "gpu_hist", 
247 |                    sampling_method = "gradient_based", 
248 |                    objective = "reg:squarederror", 
249 |                    eval_metric = "rmse", 
250 |                    maximize = FALSE, 
251 |                    max_depth = opt_maxdepth, 
252 |                    eta = opt_eta, 
253 |                    subsample = opt_subsamp, 
254 |                    alpha = opt_alpha, 
255 |                    gamma = opt_gamma, 
256 |                    lambda = opt_lambda, 
257 |                    colsample_bytree = opt_colsampletree, 
258 |                    colsample_bylevel = opt_colsamplelevel
259 |                    )
260 | 
261 | xgboost_model_endpoint3 <- xgb.train(data = dtrain, param = parameters, 
262 |                              nrounds = opt_number$number, verbose =2)
263 | 
264 | xgb.save(xgboost_model_endpoint3, fname = "xgboost_model_endpoint3")   ## Save this final model 
265 | 
266 | ## Variable importance - all predictors ##
267 | importance.matrix <- xgb.importance(colnames(x_train), model=xgboost_model_endpoint3)
268 | xgb.plot.importance(importance.matrix, rel_to_first=TRUE, xlab="Relative predictor importance", col="darkblue", cex=0.4) 
269 | 
270 | ## Variable importance - top 10 predictors ##
271 | importance.matrix <- xgb.importance(colnames(x_train), model=xgboost_model_endpoint3)
272 | xgb.plot.importance(importance.matrix, rel_to_first=TRUE, xlab="Relative predictor importance", col="darkblue", cex=0.8, top_n=10)
273 | 
274 | ## Same as first, but print all values in a Table so can interpret more easily ##
275 | importance.matrix <- xgb.importance(colnames(x_train), model=xgboost_model_endpoint3)
276 | table <- xgb.plot.importance(importance.matrix, plot=FALSE)
277 | table 
278 | 
279 | 
280 | # Clear everything, so we can now run the internal-external cross-validation ##
281 | rm(list = ls())
282 | 
283 | ################################################################################
284 | ################################################################################
285 | 
286 | ## MODEL EVALUATION ##
287 | 
288 | #################
289 | # Load packages #
290 | #################
291 | library(survival)
292 | library(xgboost)
293 | library(readr)
294 | library(haven)
295 | library(stringr)
296 | library(caret)
297 | library(ParBayesianOptimization)
298 | 
299 | #####################################
300 | # Open and format datasets for IECV #
301 | #####################################
302 | # Note - in these, rows are sorted by region, and then by patid, to allow safe merging in Stata #
303 | getwd() 
304 | setwd("/final_datasets/ML/")
305 | 
306 | ## Import Period 1 data first - this is used for iterative model fitting ##
307 | data_period1 <- read_dta("OX129_endpoint3_stacked50_period1_pseudovalues.dta")
308 | str(data_period1)
309 | data_period1$smoke_cat <- factor(data_period1$smoke_cat)                         # Smoking status 
310 | data_period1$cancer_route <- factor(data_period1$cancer_route)                   # Route to diagnosis
311 | data_period1$pr_status2 <- factor(data_period1$pr_status2)                       # PR +/-
312 | data_period1$her2_status2 <- factor(data_period1$her2_status2)                   # HER2 +/-
313 | data_period1$radiotherapy <- factor(data_period1$radiotherapy)                   # Use of RTx in 1st year 
314 | data_period1$mastectomy <- factor(data_period1$mastectomy)                       # Mastectomy in 1st year  
315 | data_period1$other_surgery <- factor(data_period1$other_surgery)                 # Other surgery in 1st year 
316 | data_period1$cancer_stage <- factor(data_period1$cancer_stage)                   # Stage (I-IV)
317 | data_period1$cancer_grade <- factor(data_period1$cancer_grade)                   # Grade (differentiation)
318 | data_period1$hrt <- factor(data_period1$hrt)                                     # Hormone replacement therapy use
319 | data_period1$sha1 <- factor(data_period1$sha1)                                   # Region - used for IECV
320 | 
321 | ## Now, start converting these factors to dummies ##
322 | ## Set up list of parameters that need dummies ##
323 | dummy_parameters <- c('radiotherapy', 'hrt', 'cancer_stage',
324 |                       'cancer_grade', 'pr_status', 'er_status',  
325 |                       'her2_status', 'cancer_route')
326 | 
327 | ## The others will be hamdled as binary ##
328 | ## Generate the dummies ##
329 | dummies <- dummyVars(~ smoke_cat + radiotherapy + mastectomy + 
330 |                        other_surgery + hrt + cancer_stage + 
331 |                        cancer_grade + pr_status2 + her2_status2 + er_status2 +
332 |                        cancer_route, data = data_period1)
333 | 
334 | # Package the dummy variables into its own dataset, ready to bind with the 
335 | # other, non-dummy parameters #
336 | dummied <- as.data.frame(predict(dummies, newdata=data_period1))
337 | 
338 | # Form the ready-to-process dataset by binding the new dummies with the other 
339 | #numeric parameters #
340 | data_period1 <- cbind(data_period1[, -c(which(colnames(data_period1) %in% dummy_parameters))], 
341 |                         dummied)
342 | 
343 | ## Continuous variables (age, BMI, Townsend score) - no scaling needed for XGB ##
344 | 
345 | ## Set up the predictors ('x cols') and outcome ('y cols') ##
346 | ## Rename variables (inc. new dummies) where relevant for interpretation of 
347 | ## variable importance and other graphs later ##
348 | colnames(data_period1)[colnames(data_period1) == 'smoke_cat.0'] <- 'Non_smoker'
349 | colnames(data_period1)[colnames(data_period1) == 'smoke_cat.1'] <- 'Ex_smoker'
350 | colnames(data_period1)[colnames(data_period1) == 'smoke_cat.2'] <- 'Light_smoker'
351 | colnames(data_period1)[colnames(data_period1) == 'smoke_cat.3'] <- 'Moderate_smoker'
352 | colnames(data_period1)[colnames(data_period1) == 'smoke_cat.4'] <- 'Heavy_smoker'
353 | colnames(data_period1)[colnames(data_period1) == 'radiotherapy.1'] <- 'Radiotherapy'
354 | colnames(data_period1)[colnames(data_period1) == 'mastectomy.1'] <- 'Mastectomy'
355 | colnames(data_period1)[colnames(data_period1) == 'other_surgery.1'] <- 'Other_surgery'
356 | colnames(data_period1)[colnames(data_period1) == 'hrt.1'] <- 'HRT_use'
357 | colnames(data_period1)[colnames(data_period1) == 'cancer_stage.1'] <- 'Stage1'
358 | colnames(data_period1)[colnames(data_period1) == 'cancer_stage.2'] <- 'Stage2'
359 | colnames(data_period1)[colnames(data_period1) == 'cancer_stage.3'] <- 'Stage3'
360 | colnames(data_period1)[colnames(data_period1) == 'cancer_stage.4'] <- 'Stage4'
361 | colnames(data_period1)[colnames(data_period1) == 'cancer_grade.1'] <- 'Well_differentiated'
362 | colnames(data_period1)[colnames(data_period1) == 'cancer_grade.2'] <- 'Moderately_differentiated'
363 | colnames(data_period1)[colnames(data_period1) == 'cancer_grade.3'] <- 'Poorly_differentiated'
364 | colnames(data_period1)[colnames(data_period1) == 'pr_status2.3'] <- 'PR_positive'
365 | colnames(data_period1)[colnames(data_period1) == 'her2_status2.3'] <- 'HER2_positive'
366 | colnames(data_period1)[colnames(data_period1) == 'er_status2.3'] <- 'ER_positive'
367 | colnames(data_period1)[colnames(data_period1) == 'cancer_route.2'] <- 'Emergency_presentation'
368 | colnames(data_period1)[colnames(data_period1) == 'cancer_route.3'] <- 'GP_referral'
369 | colnames(data_period1)[colnames(data_period1) == 'cancer_route.4'] <- 'Inpatient_elective'
370 | colnames(data_period1)[colnames(data_period1) == 'cancer_route.5'] <- 'Other_outpatient_pathway'
371 | colnames(data_period1)[colnames(data_period1) == 'cancer_route.6'] <- 'Screening_detected'
372 | colnames(data_period1)[colnames(data_period1) == 'cancer_route.7'] <- 'Two_week_wait'
373 | 
374 | ## 'pseudo' are the jack-knife pseudo-values for the Aalen-Johanssen CIF at 10 years ##
375 | x_cols <-  c('age_at_diagnosis', 'bmi', 'Non_smoker', 'Ex_smoker', 'Light_smoker', 
376 |              'Moderate_smoker', 'Heavy_smoker', 'Radiotherapy', 'Mastectomy', 
377 |              'Other_surgery', 'HRT_use', 'Stage1', 'Stage2', 'Stage3', 'Stage4', 'Well_differentiated', 
378 |              'Moderately_differentiated', 'Poorly_differentiated', 'PR_positive', 
379 |              'HER2_positive', 'ER_positive', 'Emergency_presentation', 'GP_referral', 'Inpatient_elective', 
380 |              'Other_outpatient_pathway', 'Screening_detected', 'Two_week_wait')
381 |              
382 | ## Target for model fit is Period 1 pseudovalues ##
383 | ## Target for the evaluation are the Period 2 pseudovalues ##
384 | y_cols <- c('period1_pseudo')
385 | 
386 | ######################################
387 | ## NOW, REPEAT FOR PERIOD 2 DATASET ##
388 | ######################################
389 | 
390 | setwd("/final_datasets/ML/")
391 | data_period2 <- read_dta("OX129_endpoint3_stacked50_period2_pseudovalues.dta")
392 | str(data_period2)
393 | 
394 | ## Need to reformat categorical variables ##
395 | ## Categorical variables need to be converted into dummy variables ##
396 | ## First, reformat relevant parameters as factors, then convert to dummies #
397 | data_period2$smoke_cat <- factor(data_period2$smoke_cat)                         # Smoking status 
398 | data_period2$cancer_route <- factor(data_period2$cancer_route)                   # Route to diagnosis
399 | data_period2$pr_status2 <- factor(data_period2$pr_status2)                       # PR +/-
400 | data_period2$her2_status2 <- factor(data_period2$her2_status2)                   # HER2 +/-
401 | data_period2$radiotherapy <- factor(data_period2$radiotherapy)                   # Use of RTx in 1st year 
402 | data_period2$mastectomy <- factor(data_period2$mastectomy)                       # Mastectomy in 1st year  
403 | data_period2$other_surgery <- factor(data_period2$other_surgery)                 # Other surgery in 1st year 
404 | data_period2$cancer_stage <- factor(data_period2$cancer_stage)                   # Stage (I-IV)
405 | data_period2$cancer_grade <- factor(data_period2$cancer_grade)                   # Grade (differentiation)
406 | data_period2$hrt <- factor(data_period2$hrt)                                     # Hormone replacement therapy use
407 | data_period2$sha1 <- factor(data_period2$sha1)                                   # Region - used for IECV
408 | 
409 | # Generate the dummies #
410 | dummies2 <- dummyVars(~ smoke_cat + radiotherapy + mastectomy + 
411 |                        other_surgery + hrt + cancer_stage + 
412 |                        cancer_grade + pr_status2 + her2_status2 + er_status2 +
413 |                        cancer_route, data = data_period2)
414 | 
415 | # Package the dummy variables into its own dataset, ready to bind with the
416 | # other, non-dummy parameters #
417 | dummied2 <- as.data.frame(predict(dummies2, newdata=data_period2))
418 | 
419 | # Form the ready-to-process dataset by binding the new dummies with the other 
420 | # numeric parameters #
421 | data_period2 <- cbind(data_period2[, -c(which(colnames(data_period2) %in% 
422 |                                                 dummy_parameters))], dummied2)
423 | 
424 | ## rename variables for understanding ##
425 | colnames(data_period2)[colnames(data_period2) == 'smoke_cat.0'] <- 'Non_smoker'
426 | colnames(data_period2)[colnames(data_period2) == 'smoke_cat.1'] <- 'Ex_smoker'
427 | colnames(data_period2)[colnames(data_period2) == 'smoke_cat.2'] <- 'Light_smoker'
428 | colnames(data_period2)[colnames(data_period2) == 'smoke_cat.3'] <- 'Moderate_smoker'
429 | colnames(data_period2)[colnames(data_period2) == 'smoke_cat.4'] <- 'Heavy_smoker'
430 | colnames(data_period2)[colnames(data_period2) == 'radiotherapy.1'] <- 'Radiotherapy'
431 | colnames(data_period2)[colnames(data_period2) == 'mastectomy.1'] <- 'Mastectomy'
432 | colnames(data_period2)[colnames(data_period2) == 'chemotherapy.1'] <- 'Chemotherapy'
433 | colnames(data_period2)[colnames(data_period2) == 'other_surgery.1'] <- 'Other_surgery'
434 | colnames(data_period2)[colnames(data_period2) == 'hrt.1'] <- 'HRT_use'
435 | colnames(data_period2)[colnames(data_period2) == 'cancer_stage.1'] <- 'Stage1'
436 | colnames(data_period2)[colnames(data_period2) == 'cancer_stage.2'] <- 'Stage2'
437 | colnames(data_period2)[colnames(data_period2) == 'cancer_stage.3'] <- 'Stage3'
438 | colnames(data_period2)[colnames(data_period2) == 'cancer_stage.4'] <- 'Stage4'
439 | colnames(data_period2)[colnames(data_period2) == 'cancer_grade.1'] <- 'Well_differentiated'
440 | colnames(data_period2)[colnames(data_period2) == 'cancer_grade.2'] <- 'Moderately_differentiated'
441 | colnames(data_period2)[colnames(data_period2) == 'cancer_grade.3'] <- 'Poorly_differentiated'
442 | colnames(data_period2)[colnames(data_period2) == 'pr_status2.3'] <- 'PR_positive'
443 | colnames(data_period2)[colnames(data_period2) == 'her2_status2.3'] <- 'HER2_positive'
444 | colnames(data_period2)[colnames(data_period2) == 'er_status2.3'] <- 'ER_positive'
445 | colnames(data_period2)[colnames(data_period2) == 'cancer_route.2'] <- 'Emergency_presentation'
446 | colnames(data_period2)[colnames(data_period2) == 'cancer_route.3'] <- 'GP_referral'
447 | colnames(data_period2)[colnames(data_period2) == 'cancer_route.4'] <- 'Inpatient_elective'
448 | colnames(data_period2)[colnames(data_period2) == 'cancer_route.5'] <- 'Other_outpatient_pathway'
449 | colnames(data_period2)[colnames(data_period2) == 'cancer_route.6'] <- 'Screening_detected'
450 | colnames(data_period2)[colnames(data_period2) == 'cancer_route.7'] <- 'Two_week_wait'
451 | 
452 | z_cols <- c('period2_pseudo')            ## To form xgb.DMatrix for validation data, need to provide labels 
453 | 
454 | ## Check dimensions of two sub-sets ##
455 | dim(data_period1)
456 | dim(data_period2)
457 | 
458 | ## Clean up memory by removing the dummied interim datasets 
459 | rm(dummied) 
460 | rm(dummied2) 
461 | 
462 | ################################
463 | # Bayesian optimisation set-up #
464 | ################################
465 | 
466 | # There are 10 regions (sha1 variable), so set up loop to go over each of them #
467 | # Development data = period 1, region 'all except i', validation data = period 2, region i #
468 | 
469 | # Set up empty list for storing individual predictions # 
470 | iecv_predictions = list()
471 | 
472 | # Bounds for the search space for BO # 
473 | bounds <- list(
474 |   max_depth = c(1L, 6L), 
475 |   eta = c(0.0001, 0.1), 
476 |   subsample = c(0.1, 0.5), 
477 |   number = c(1L, 500L), 
478 |   alpha = c(0L, 20L), 
479 |   gamma = c(0L, 20L), 
480 |   lambda = c(0L, 20L),                       ## Same as for model fitting steps abive 
481 |   colsampletree = c(0.1, 0.8), 
482 |   colsamplelevel = c(0.1, 0.8)
483 | )
484 | 
485 | start <- Sys.time()                            ## Like timing
486 | 
487 | ## LOOP STARTS HERE ##
488 | 
489 | for(a in seq(1:10)) {                          ## Cycle through each value of region(sha1) from 1 to 10 
490 |   
491 | set.seed(1066)  
492 | 
493 |   data_dev = data_period1[ -which (data_period1$sha1 == levels(data_period1$sha1)[a]), ]  ## Development data = all except region 'a' in period 1
494 |   data_val = data_period2[ which (data_period2$sha1 == levels(data_period2$sha1)[a]), ]   ## Validation data = region 'a' in period 2 
495 |   
496 |   x_dev <- as.matrix(data_dev[, x_cols])                      ## Form matrices for these dev/val sub-sets 
497 |   label_dev <- as.matrix(data_dev[, y_cols])                  ## And their labels (pseudovalues)
498 |   x_val <- as.matrix(data_val[, x_cols])                      ## These are replaced with each loop iteration         
499 |   label_val <- as.matrix(data_val[, z_cols])
500 |   
501 |   ddev <- xgb.DMatrix(data=x_dev, label=label_dev)             ## join these together to hand to XGBoost  
502 |   dval <- xgb.DMatrix(data=x_val, label=label_val)             ## Same, but for the validation data for this loop 
503 |  
504 |   scorefunction <- function(max_depth, eta, subsample, number, alpha, gamma,      ## Nested cross-validation loop for hyperparameter tuning 
505 |                           lambda, colsampletree, colsamplelevel)  {               ## same style objective function for Bayesian optimisation to minimise
506 |                                  
507 |     pars <- list(                                                       ## Model hyperparamaters as above 
508 |     tree_method = "gpu_hist", 
509 |     sampling_method = "gradient_based", 
510 |     objective = "reg:squarederror", 
511 |     eval_metric = "rmse", 
512 |     maximize = FALSE,
513 |     max_depth = max_depth, 
514 |     eta = eta, 
515 |     subsample = subsample, 
516 |     alpha = alpha, 
517 |     gamma = gamma, 
518 |     lambda = lambda, 
519 |     colsample_bytree = colsampletree, 
520 |     colsample_bylevel = colsamplelevel
521 |   )
522 |     
523 |     xgbcv <- xgb.cv(                                                    ## Run nested 5-fold cross-validation to test the hyperparameter combo
524 |       params = pars, 
525 |       data = ddev, 
526 |       nround = number, 
527 |       nfold = 5, 
528 |       early_stopping_rounds = 10, 
529 |       maximize = FALSE, 
530 |       verbose = 1
531 |     )
532 |     
533 |     result <- min(xgbcv$evaluation_log$test_rmse_mean)                  ## Extract the result 
534 |     converted_result <- -1*result                                       ## We want to minimise RMSE, so maximimse the -ve RMSE
535 |     
536 |     return(list(Score = converted_result)                               ## Pass these 'scores' to the Bayesian Optim package 
537 |     )
538 |     
539 |   }
540 | 
541 |   ## Finally, having set up the Bayesian optimisation, run it with the above functions/details ##
542 |   
543 |   set.seed(1066) 
544 |   
545 |   bayesian_boost <- bayesOpt(                                                
546 |   FUN = scorefunction, 
547 |   bounds = bounds, 
548 |   initPoints = 25, 
549 |   iters.n = 25,
550 |   iters.k = 1, 
551 |   parallel = FALSE, 
552 |   verbose = 1, 
553 |   acq = "ei", 
554 |   plotProgress = FALSE, 
555 |   otherHalting = list(timeLimit=5400)                                   ## Stop if >1.5hrs taken on one cycle of loop (never reached on this) 
556 | )
557 |   
558 |   ## Evaluate the output of the Bayesian Optimisation process ## 
559 |   # Extract the best hyperparameters combination found #
560 |   bayesian_boost$scoreSummary 
561 |   bestpars <- getBestPars(bayesian_boost)
562 |   
563 |   # Store these best values to plug into a model fitted to this loop's development data #
564 |   
565 |   opt_maxdepth        = bestpars[1]
566 |   opt_eta             = bestpars[2]
567 |   opt_subsamp         = bestpars[3]
568 |   opt_number          = bestpars[4]
569 |   opt_alpha           = bestpars[5]
570 |   opt_gamma           = bestpars[6]
571 |   opt_lambda          = bestpars[7]
572 |   opt_colsampletree   = bestpars[8]
573 |   opt_colsamplelevel  = bestpars[9] 
574 |   
575 |   # set these as the parameters for the loop's model # 
576 |   
577 |   parameters <- list(tree_method = "gpu_hist", 
578 |                    sampling_method = "gradient_based", 
579 |                    objective = "reg:squarederror", 
580 |                    eval_metric = "rmse", 
581 |                    maximize = FALSE,
582 |                    max_depth = opt_maxdepth, 
583 |                    eta = opt_eta, 
584 |                    subsample = opt_subsamp, 
585 |                    alpha = opt_alpha, 
586 |                    gamma = opt_gamma, 
587 |                    lambda = opt_lambda, 
588 |                    colsample_bytree = opt_colsampletree, 
589 |                    colsample_bylevel = opt_colsamplelevel
590 |                    )
591 |   
592 |   xgboost_iecv_loop <- xgb.train(data = ddev, param = parameters,               ## Fit the model with optimal hyperparameter config identified from this loop 
593 |                                        nrounds = opt_number$number, verbose=1)
594 |   
595 |   
596 |   predictions.test <- predict(xgboost_iecv_loop, dval)                          ## Make predictions on held out period 2 data for region 'a'
597 |   iecv_predictions[[a]] = predictions.test                                      ## Store these predictions in the pre-made repository for access later
598 | }    ## Then repeat so that all regions are cycled through 
599 | 
600 | ## LOOP ENDS ##
601 | 
602 | end <-Sys.time() 
603 | end-start            ## Takes approx. 4.41 hours with GPU support on my server 
604 | 
605 | #####################################
606 | # POOLING THE PREDICTIONS FROM IECV #
607 | #####################################
608 | 
609 | # Combine the predictions made on the held out data from each cycle #
610 | xgboost_iecv_predictions <- c(iecv_predictions[[1]], iecv_predictions[[2]], iecv_predictions[[3]],
611 |                               iecv_predictions[[4]], iecv_predictions[[5]], iecv_predictions[[6]],
612 |                               iecv_predictions[[7]], iecv_predictions[[8]], iecv_predictions[[9]],
613 |                               iecv_predictions[[10]])
614 | # Check sensible - no overt error! #
615 | summary(xgboost_iecv_predictions) 
616 | 
617 | # Extract the patient ID - need this to merge together later for calculating performance metrics in Stata #
618 | patid <- as.data.frame(data_period2$patid)
619 | export_predictions <- cbind(patid, xgboost_iecv_predictions)    ## bind these columns together 
620 | head(export_predictions)
621 | 
622 | # Save the predictions for later use - will be used to evaluate performance of the model fitted to the entire data #
623 | setwd("/estimates/ML_IECV_predictions/")
624 | # Save as csv for easy importing into Stata #
625 | write.csv(export_predictions, file="endpoint3_xgboost_iecv_predictions.csv")
626 | 
627 | 
628 | ###########################################################################################
629 | ###########################################################################################
630 | 


--------------------------------------------------------------------------------
/6_OX129_neural_networks:
--------------------------------------------------------------------------------
  1 | ## Neural network (competing risks) modelling ##
  2 | 
  3 | ## Same overall process as XGBoost - use stacked imputed datasets, fit model to entire data, 
  4 | ## evaluate using internal-external cross-validation that recapitulates hyperparameter tuning (nested)
  5 | 
  6 | 
  7 | #############################
  8 | ## Load in packages needed ##
  9 | #############################
 10 | library(survival)
 11 | library(readr)
 12 | library(stringr)
 13 | library(caret)
 14 | library(haven)
 15 | library(ParBayesianOptimization) 
 16 | library(keras)
 17 | library(tensorflow)
 18 | 
 19 | ########################################################
 20 | ## Check and set working directory for importing data ##
 21 | ########################################################
 22 | getwd() 
 23 | setwd("/final_datasets/ML/")
 24 | 
 25 | ##################
 26 | ## Load in data ##
 27 | ##################
 28 | data <- read_dta("OX129_endpoint3_stacked50_pseudovalues.dta")
 29 | str(data)
 30 | 
 31 | #########################
 32 | ## Variable formatting ##
 33 | #########################
 34 | ## Categorical variables need to be converted into dummy variables ##
 35 | ## First, reformat relevant parameters as factors, then convert to dummies ##
 36 | 
 37 | data$smoke_cat <- factor(data$smoke_cat)                         # Smoking status 
 38 | data$cancer_route <- factor(data$cancer_route)                   # Route to diagnosis
 39 | data$pr_status2 <- factor(data$pr_status2)                       # PR +/-
 40 | data$her2_status2 <- factor(data$her2_status2)                   # HER2 +/-
 41 | data$er_status2 <- factor(data$er_status2)                       # ER +/- 
 42 | data$radiotherapy <- factor(data$radiotherapy)                   # Use of RTx in 1st year 
 43 | data$mastectomy <- factor(data$mastectomy)                       # Mastectomy in 1st year  
 44 | data$other_surgery <- factor(data$other_surgery)                 # Other surgery in 1st year 
 45 | data$cancer_stage <- factor(data$cancer_stage)                   # Stage (I-IV)
 46 | data$cancer_grade <- factor(data$cancer_grade)                   # Grade (differentiation)
 47 | data$hrt <- factor(data$hrt)                                     # Hormone replacement therapy use
 48 | data$sha1 <- factor(data$sha1)                                   # Region - used for IECV
 49 | 
 50 | ## Now, start converting these factors to dummies ##
 51 | ## Set up list of parameters that need dummies ##
 52 | dummy_parameters <- c('radiotherapy', 'hrt', 'cancer_stage',
 53 |                       'cancer_grade', 'pr_status', 'er_status',  
 54 |                       'her2_status', 'cancer_route')
 55 | 
 56 | ## The others will be hamdled as binary ##
 57 | ## Generate the dummies ##
 58 | dummies <- dummyVars(~ smoke_cat + radiotherapy + mastectomy + 
 59 |                        other_surgery + hrt + cancer_stage + 
 60 |                        cancer_grade + pr_status2 + her2_status2 + er_status2 +
 61 |                        cancer_route, data = data)
 62 | 
 63 | ## Package the dummy variables into its own dataset, ready to bind with the ##
 64 | ## other, non-dummy parameters ##
 65 | dummied <- as.data.frame(predict(dummies, newdata=data))
 66 | 
 67 | ## Form the ready-to-process dataset by binding the new dummies with the other 
 68 | #numeric parameters ##
 69 | data_for_model <- cbind(data[, -c(which(colnames(data) %in% dummy_parameters))], 
 70 |                         dummied)
 71 | 
 72 | ## Create normalise function - continuous variables are to be normalised/scaled 
 73 | ## to be between 0 and 1 ##
 74 | 
 75 | normalise <- function(x) {
 76 |   return ((x - min(x)) / (max(x) - min(x)))
 77 | }
 78 | 
 79 | data_for_model$age_at_diagnosis  <- normalise(data_for_model$age)
 80 | data_for_model$bmi               <- normalise(data_for_model$bmi)
 81 | 
 82 | ## Set up the predictors ('x cols') and outcome ('y cols') ##
 83 | ## Rename variables (inc. new dummies) where relevant for interpretation ##
 84 | colnames(data_for_model)[colnames(data_for_model) == 'smoke_cat.0']     <- 'Non_smoker'
 85 | colnames(data_for_model)[colnames(data_for_model) == 'smoke_cat.1']     <- 'Ex_smoker'
 86 | colnames(data_for_model)[colnames(data_for_model) == 'smoke_cat.2']     <- 'Light_smoker'
 87 | colnames(data_for_model)[colnames(data_for_model) == 'smoke_cat.3']     <- 'Moderate_smoker'
 88 | colnames(data_for_model)[colnames(data_for_model) == 'smoke_cat.4']     <- 'Heavy_smoker'
 89 | colnames(data_for_model)[colnames(data_for_model) == 'radiotherapy.1']  <- 'Radiotherapy'
 90 | colnames(data_for_model)[colnames(data_for_model) == 'mastectomy.1']    <- 'Mastectomy'
 91 | colnames(data_for_model)[colnames(data_for_model) == 'other_surgery.1'] <- 'Other_surgery'
 92 | colnames(data_for_model)[colnames(data_for_model) == 'hrt.1']           <- 'HRT_use'
 93 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_stage.1']  <- 'Stage1'
 94 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_stage.2']  <- 'Stage2'
 95 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_stage.3']  <- 'Stage3'
 96 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_stage.4']  <- 'Stage4'
 97 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_grade.1']  <- 'Well_differentiated'
 98 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_grade.2']  <- 'Moderately_differentiated'
 99 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_grade.3']  <- 'Poorly_differentiated'
100 | colnames(data_for_model)[colnames(data_for_model) == 'pr_status2.3']    <- 'PR_positive'
101 | colnames(data_for_model)[colnames(data_for_model) == 'her2_status2.3']  <- 'HER2_positive'
102 | colnames(data_for_model)[colnames(data_for_model) == 'er_status2.3']    <- 'ER_positive' 
103 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_route.2']  <- 'Emergency_presentation'
104 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_route.3']  <- 'GP_referral'
105 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_route.4']  <- 'Inpatient_elective'
106 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_route.5']  <- 'Other_outpatient_pathway'
107 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_route.6']  <- 'Screening_detected'
108 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_route.7']  <- 'Two_week_wait'
109 | 
110 | ## 'pseudo' are the jack-knife pseudo-values for the A-J CIF ##
111 | x_cols <-  c('age_at_diagnosis', 'bmi', 'Non_smoker', 'Ex_smoker', 'Light_smoker', 
112 |              'Moderate_smoker', 'Heavy_smoker', 'Radiotherapy', 'Mastectomy', 
113 |              'Other_surgery', 'HRT_use', 'Stage1', 'Stage2', 'Stage3', 'Stage4', 'Well_differentiated', 
114 |              'Moderately_differentiated', 'Poorly_differentiated', 'PR_positive', 
115 |              'HER2_positive', 'ER_positive', 'Emergency_presentation', 'GP_referral', 'Inpatient_elective', 
116 |              'Other_outpatient_pathway', 'Screening_detected', 'Two_week_wait')
117 | y_cols <- c('pseudo')
118 | 
119 | ## Clean up environment to avoid issues with memory ##
120 | rm(dummied) 
121 | rm(data) 
122 | 
123 | 
124 | ########################
125 | ## Dataset formatting ##
126 | ########################
127 | 
128 | ## Change dataset to a matrix  ##
129 | ## x_train = predictor parameters ##
130 | x_train <- as.matrix(data_for_model[, x_cols])
131 | 
132 | ## Labels = the target for the neural net - the pseudovalues ##
133 | label_train <- as.matrix(data_for_model[, y_cols])
134 | 
135 | ## Number of input neurons (predictor parameters) 
136 | dim(x_train) 
137 | 
138 | 
139 | ###############################################
140 | ## Define custom loss function for the model ##
141 | ###############################################
142 | 
143 | ## Define root mean squared error loss function ##
144 | ## This is used to fit the neural network model ##
145 | 
146 | rmse <- function(y_true, y_pred){
147 |   K <- backend()
148 |   loss <- K$sqrt(K$mean((y_true - y_pred)^2))
149 |   return(loss)
150 | }
151 | 
152 | ############################################################################
153 | ## FULL MODEL: Setting up Bayesian optimisation for hyperparameter tuning ##
154 | ############################################################################
155 | 
156 | ## First, we fit neural network to entirety of data; Bayesian optimisation ## 
157 | ## used to identify optimal hyperparameters (with nested cross-val) ##
158 | 
159 | ## Set up callbacks - use early stopping to stop training if models gets into ##
160 | ## a rut and there is no improvement. Combine with drop in learning rate if 
161 | ## hits plateau.
162 | callbacks_list <- list(
163 |          callback_reduce_lr_on_plateau(monitor = "loss", factor = 0.5, patience = 3),
164 |          callback_early_stopping(monitor = "loss", min_delta = 0.0001, patience = 5)
165 |          )
166 | 
167 | 
168 | ## seed for reproducibility ##
169 | set.seed(2066)
170 | 
171 | ## The Bayesian Optimisation package requires a 'scoring function' which ##
172 | ## it uses to assess the performance of the different configurations ##
173 | 
174 | ## Herein, we specify that we want to tune the number of epochs,  ##
175 | ## number of units in each hidden layer, and number of ##
176 | ## hidden layers ##
177 | ## To estimate the performance of each model configuration, we use 5-fold ##
178 | ## cross-validation, so we nest a loop inside the scoring function so that ##
179 | ## the 'score' returned is a cross-validated estimate ##
180 | 
181 | scorefunction <- function(epochs, units, layers, learnrate) {
182 |   
183 |   set.seed(1066)
184 |   
185 |   k <- 5                                                                      ## 5-fold cross-validation 
186 |   indices <- sample(1:nrow(x_train))                                          ## Used to randomly assign to folds 
187 |   folds <- cut(1:length(indices), breaks = k, labels = FALSE)                 ## folds 
188 |   
189 |   cv_score <- c()                                                             ## empty list to store 'scores' over each CV iteration 
190 |   
191 |   for (i in 1:k) {                                                            ## Nested loop to perform CV
192 |     
193 |     cv_val_indices     <- which(folds==i)                                     ## validation set 1/5 folds 
194 |     cv_val_data        <- x_train[cv_val_indices, ]                           ## Take from x_train 
195 |     cv_val_labels      <- label_train[cv_val_indices]                         ## Take from labels 
196 |     
197 |     cv_train_data      <- x_train[-cv_val_indices, ]                          ## Train data = 4/5 folds
198 |     cv_train_labels    <- label_train[-cv_val_indices]                        ## Train labels 4/5 
199 |     
200 |     cv_model <- keras_model_sequential()                                      ## Instantiate keras model 
201 |     for(m in 1:layers){                                                       ## loop added to vary number of hidden layers
202 |       cv_model %>% 
203 |         layer_dense(units = units, activation = 'relu', 
204 |                     input_shape=c(27))
205 |     }
206 |     cv_model %>% layer_dense(units = 1, activation = "linear") %>% 
207 |       compile(
208 |         optimizer = optimizer_adam(lr=learnrate),  
209 |         loss = rmse,                                                          ## Use the above defined custom loss function
210 |         metrics = "mse"                                                       ## Reported as model fits, but not a loss function 
211 |       )
212 |     ## Fit the model- batch size= 1024, weights applied for loss function ##
213 |     cv_model %>% fit(cv_train_data, cv_train_labels, epochs=epochs, 
214 |                      batch_size=1024, verbose=2, callbacks = callbacks_list, 
215 |                      validation_data = list(cv_val_data, cv_val_labels))      ## Use heldout data to plot training dynamics   
216 |     
217 |     ## Generate predictions on held-out fold ##
218 |     cv_predictions <- predict(cv_model, cv_val_data)
219 |     
220 |     ## BO wants to find the maximum, but lower RMSE = better ##
221 |     ## -1 x RMSE: lower RMSE = higher value, so use this to score the config ##
222 |     negative_rmse <- -1*sqrt((mean((cv_predictions-cv_val_labels)^2)))
223 |     
224 |     # Return score (or clipped score, if gradient exploded) to bayesian_neural 
225 |     cv_score <- c(cv_score, negative_rmse)
226 |     
227 |     ## Clear out the model at the end of loop to avoid merging ##
228 |     rm(cv_model) 
229 |   }
230 |   ## BO needs score to be returned to it ##
231 |   return(list(Score = mean(cv_score)))
232 | }
233 | 
234 | 
235 | ## Define hyperparameter search space - 'bounds' to search ##
236 | 
237 | bounds <- list(
238 |   epochs = c(1L, 50L),                                                        ## Number of times model gets fed the data
239 |   units = c(27L, 50L),                                                        ## Number of nodes in each layer 
240 |   layers = c(1L, 5L),                                                         ## Between 1 and 5 hidden layers
241 |   learnrate = c(0.001, 0.1)  
242 | )
243 | 
244 | ###############################################################################
245 | ## FULL MODEL: Running Bayesian optimisation to find optimal hyperparameters ##
246 | ###############################################################################
247 | 
248 | ## Define Bayesian Optimisation function and run it ##
249 | ## Also keep track of time ##
250 | start <- Sys.time() 
251 | 
252 | ## Function takes in the scoring function we defined above, the bounds to ##
253 | ## search within, number of 'initialisations' to run first, then the number ## 
254 | ## of further iterations to try find the global optimum ##
255 | 
256 | bayesian_neural <- bayesOpt(
257 |   FUN = scorefunction, 
258 |   bounds = bounds, 
259 |   initPoints = 25, 
260 |   iters.n = 25, 
261 |   iters.k = 1, 
262 |   parallel = FALSE, 
263 |   plotProgress = FALSE, 
264 |   verbose = 2, 
265 |   errorHandling = "continue", 
266 |   acq = "ei"
267 | )
268 | 
269 | end <- Sys.time() 
270 | end-start 
271 | 
272 | ## Summarise Bayesian Optimisation run ##
273 | bayesian_neural$scoreSummary 
274 | 
275 | plot(bayesian_neural) 
276 | 
277 | bestpars <- getBestPars(bayesian_neural)
278 | bestpars 
279 | 
280 | # Extract optimal hyperparameter config to fit final model on whole dataset # 
281 | best_epochs <- bestpars[1]
282 | best_units <- bestpars[2]
283 | best_layers <- toString(bestpars[3])
284 | best_learnrate <- as.numeric(bestpars[4]) 
285 | 
286 | callbacks_list <- list(
287 |          callback_reduce_lr_on_plateau(monitor = "loss", factor = 0.5, patience = 3),
288 |          callback_early_stopping(monitor = "loss", min_delta = 0.0001, patience = 5)
289 |          )
290 | 
291 | ## Use same format as above - define the 'final_model' and fit it ##
292 | final_model <- keras_model_sequential() 
293 | for(i in 1:best_layers){
294 |   final_model %>% 
295 |     layer_dense(units = best_units, activation = 'relu',
296 |                 input_shape=c(27))
297 | }
298 | final_model %>% layer_dense(units = 1, activation = "linear") %>% 
299 |   compile(
300 |     optimizer = optimizer_adam(lr=best_learnrate), 
301 |     loss = rmse, 
302 |     metrics = "mae"
303 |   )
304 | 
305 | ## Basic characteristics of final model, e.g. number of layers, parameters ##
306 | summary(final_model) 
307 | 
308 | ############################
309 | # Fit final neural network #
310 | ############################
311 | 
312 | ## Fit the model with said config to whole dataset, and save it ##
313 | final_model %>% fit(x_train, label_train, epochs=best_epochs, 
314 |                     batch_size=1024, verbose=2, callbacks=callbacks_list)
315 |                     
316 | ## Save the model ##
317 | setwd("/models/Endpoint_3/")    
318 | save_model_hdf5(final_model, "ep3_neuralnetwork_final.h5")
319 | 
320 | ###############################################################################################
321 | ###############################################################################################
322 | 
323 | ## Clear out the environment to start fresh for evaluation ##
324 | rm = ls() 
325 | 
326 | 
327 | ## MODEL EVALUATION STRATEGY ## 
328 | 
329 | #############################
330 | ## Load in packages needed ##
331 | #############################
332 | library(survival)
333 | library(readr)
334 | library(stringr)
335 | library(caret)
336 | library(haven)
337 | library(ParBayesianOptimization) 
338 | library(keras)
339 | library(tensorflow)
340 | 
341 | ########################################################
342 | ## Check and set working directory for importing data ##
343 | ########################################################
344 | ## Here, we need to separately import and format the Period 1 and Period 2 data ##
345 | getwd() 
346 | setwd("/final_datasets/ML/")
347 | 
348 | ## Importing Period 1 sub-cohort (model fitting) separately from Period 2 sub-cohort (model evaluation) ##
349 | data_period1 <- read_dta("OX129_endpoint3_stacked50_period1_pseudovalues.dta")
350 | str(data_period1)
351 | 
352 | ## Need to reformat categorical variables ##
353 | ## Categorical variables need to be converted into dummy variables ##
354 | ## First, reformat relevant parameters as factors, then convert to dummies #
355 | 
356 | data_period1$smoke_cat <- factor(data_period1$smoke_cat)                         # Smoking status 
357 | data_period1$cancer_route <- factor(data_period1$cancer_route)                   # Route to diagnosis
358 | data_period1$pr_status2 <- factor(data_period1$pr_status2)                       # PR +/-
359 | data_period1$her2_status2 <- factor(data_period1$her2_status2)                   # HER2 +/-
360 | data_period1$er_status2 <- factor(data_period1$er_status2)                       # ER +/- 
361 | data_period1$radiotherapy <- factor(data_period1$radiotherapy)                   # Use of RTx in 1st year 
362 | data_period1$mastectomy <- factor(data_period1$mastectomy)                       # Mastectomy in 1st year  
363 | data_period1$other_surgery <- factor(data_period1$other_surgery)                 # Other surgery in 1st year 
364 | data_period1$cancer_stage <- factor(data_period1$cancer_stage)                   # Stage (I-IV)
365 | data_period1$cancer_grade <- factor(data_period1$cancer_grade)                   # Grade (differentiation)
366 | data_period1$hrt <- factor(data_period1$hrt)                                     # Hormone replacement therapy use
367 | data_period1$sha1 <- factor(data_period1$sha1)                                   # Region - used for IECV
368 | 
369 | ## Now, start converting these factors to dummies ##
370 | ## Set up list of parameters that need dummies ##
371 | dummy_parameters <- c('radiotherapy', 'hrt', 'cancer_stage',
372 |                       'cancer_grade', 'pr_status', 'er_status',  
373 |                       'her2_status', 'cancer_route')
374 | 
375 | ## The others will be hamdled as binary ##
376 | ## Generate the dummies ##
377 | dummies <- dummyVars(~ smoke_cat + radiotherapy + mastectomy + 
378 |                        other_surgery + hrt + cancer_stage + 
379 |                        cancer_grade + pr_status2 + her2_status2 + er_status2 +
380 |                        cancer_route, data = data_period1)
381 | 
382 | # Package the dummy variables into its own dataset, ready to bind with the 
383 | # other, non-dummy parameters #
384 | dummied <- as.data.frame(predict(dummies, newdata=data_period1))
385 | 
386 | # Form the ready-to-process dataset by binding the new dummies with the other 
387 | #numeric parameters #
388 | data_period1 <- cbind(data_period1[, -c(which(colnames(data_period1) %in% dummy_parameters))], 
389 |                         dummied)
390 | 
391 | # Use normalise function - continuous variables are to be normalised/scaled to 
392 | # be between 0 and 1
393 | 
394 | normalise <- function(x) {
395 |   return ((x - min(x)) / (max(x) - min(x)))
396 | }
397 | 
398 | data_period1$age_at_diagnosis  <- normalise(data_period1$age_at_diagnosis)
399 | data_period1$bmi               <- normalise(data_period1$bmi)
400 | 
401 | 
402 | ## Set up the predictors ('x cols') and outcome ('y cols') ##
403 | ## Rename variables (inc. new dummies) where relevant for interpretation of 
404 | ## variable importance and other graphs later ##
405 | colnames(data_period1)[colnames(data_period1) == 'smoke_cat.0']     <- 'Non_smoker'
406 | colnames(data_period1)[colnames(data_period1) == 'smoke_cat.1']     <- 'Ex_smoker'
407 | colnames(data_period1)[colnames(data_period1) == 'smoke_cat.2']     <- 'Light_smoker'
408 | colnames(data_period1)[colnames(data_period1) == 'smoke_cat.3']     <- 'Moderate_smoker'
409 | colnames(data_period1)[colnames(data_period1) == 'smoke_cat.4']     <- 'Heavy_smoker'
410 | colnames(data_period1)[colnames(data_period1) == 'radiotherapy.1']  <- 'Radiotherapy'
411 | colnames(data_period1)[colnames(data_period1) == 'mastectomy.1']    <- 'Mastectomy'
412 | colnames(data_period1)[colnames(data_period1) == 'other_surgery.1'] <- 'Other_surgery'
413 | colnames(data_period1)[colnames(data_period1) == 'hrt.1']           <- 'HRT_use'
414 | colnames(data_period1)[colnames(data_period1) == 'cancer_stage.1']  <- 'Stage1'
415 | colnames(data_period1)[colnames(data_period1) == 'cancer_stage.2']  <- 'Stage2'
416 | colnames(data_period1)[colnames(data_period1) == 'cancer_stage.3']  <- 'Stage3'
417 | colnames(data_period1)[colnames(data_period1) == 'cancer_stage.4']  <- 'Stage4'
418 | colnames(data_period1)[colnames(data_period1) == 'cancer_grade.1']  <- 'Well_differentiated'
419 | colnames(data_period1)[colnames(data_period1) == 'cancer_grade.2']  <- 'Moderately_differentiated'
420 | colnames(data_period1)[colnames(data_period1) == 'cancer_grade.3']  <- 'Poorly_differentiated'
421 | colnames(data_period1)[colnames(data_period1) == 'pr_status2.3']    <- 'PR_positive'
422 | colnames(data_period1)[colnames(data_period1) == 'her2_status2.3']  <- 'HER2_positive'
423 | colnames(data_period1)[colnames(data_period1) == 'er_status2.3']    <- 'ER_positive' 
424 | colnames(data_period1)[colnames(data_period1) == 'cancer_route.2']  <- 'Emergency_presentation'
425 | colnames(data_period1)[colnames(data_period1) == 'cancer_route.3']  <- 'GP_referral'
426 | colnames(data_period1)[colnames(data_period1) == 'cancer_route.4']  <- 'Inpatient_elective'
427 | colnames(data_period1)[colnames(data_period1) == 'cancer_route.5']  <- 'Other_outpatient_pathway'
428 | colnames(data_period1)[colnames(data_period1) == 'cancer_route.6']  <- 'Screening_detected'
429 | colnames(data_period1)[colnames(data_period1) == 'cancer_route.7']  <- 'Two_week_wait'
430 | 
431 | ## 'pseudo' are the jack-knife pseudo-values for the Aalen-Johanssen CIF at 10 years ##
432 | x_cols <-  c('age_at_diagnosis', 'bmi', 'Non_smoker', 'Ex_smoker', 'Light_smoker', 
433 |              'Moderate_smoker', 'Heavy_smoker', 'Radiotherapy', 'Mastectomy', 
434 |              'Other_surgery', 'HRT_use', 'Stage1', 'Stage2', 'Stage3', 'Stage4', 'Well_differentiated', 
435 |              'Moderately_differentiated', 'Poorly_differentiated', 'PR_positive', 
436 |              'HER2_positive', 'ER_positive', 'Emergency_presentation', 'GP_referral', 'Inpatient_elective', 
437 |              'Other_outpatient_pathway', 'Screening_detected', 'Two_week_wait')
438 |              
439 | ## Target for model fit is Period 1 pseudovalues ##
440 | ## Target for the evaluation are the Period 2 pseudovalues ##
441 | y_cols <- c('period1_pseudo')
442 | 
443 | ######################################
444 | ## NOW, REPEAT FOR PERIOD 2 DATASET ##
445 | ######################################
446 | 
447 | data_period2 <- read_dta("OX129_endpoint3_stacked50_period2_pseudovalues.dta")
448 | str(data_period2)
449 | ## Categorical variables need to be converted into dummy variables ##
450 | ## First, reformat relevant parameters as factors, then convert to dummies #
451 | data_period2$smoke_cat <- factor(data_period2$smoke_cat)                         # Smoking status 
452 | data_period2$cancer_route <- factor(data_period2$cancer_route)                   # Route to diagnosis
453 | data_period2$pr_status2 <- factor(data_period2$pr_status2)                       # PR +/-
454 | data_period2$her2_status2 <- factor(data_period2$her2_status2)                   # HER2 +/-
455 | data_period2$er_status2 <- factor(data_period2$er_status2)                       # ER +/- 
456 | data_period2$radiotherapy <- factor(data_period2$radiotherapy)                   # Use of RTx in 1st year 
457 | data_period2$mastectomy <- factor(data_period2$mastectomy)                       # Mastectomy in 1st year  
458 | data_period2$other_surgery <- factor(data_period2$other_surgery)                 # Other surgery in 1st year 
459 | data_period2$cancer_stage <- factor(data_period2$cancer_stage)                   # Stage (I-IV)
460 | data_period2$cancer_grade <- factor(data_period2$cancer_grade)                   # Grade (differentiation)
461 | data_period2$hrt <- factor(data_period2$hrt)                                     # Hormone replacement therapy use
462 | data_period2$sha1 <- factor(data_period2$sha1)                                   # Region - used for IECV
463 | 
464 | # Generate the dummies #
465 | dummies2 <- dummyVars(~ smoke_cat + radiotherapy + mastectomy + 
466 |                        other_surgery + hrt + cancer_stage + 
467 |                        cancer_grade + pr_status2 + her2_status2 + er_status2 +
468 |                        cancer_route, data = data_period2)
469 | 
470 | # Package the dummy variables into its own dataset, ready to bind with the
471 | # other, non-dummy parameters #
472 | dummied2 <- as.data.frame(predict(dummies2, newdata=data_period2))
473 | 
474 | # Form the ready-to-process dataset by binding the new dummies with the other 
475 | # numeric parameters #
476 | data_period2 <- cbind(data_period2[, -c(which(colnames(data_period2) %in% 
477 |                                                 dummy_parameters))], dummied2)
478 | 
479 | data_period2$age_at_diagnosis  <- normalise(data_period2$age_at_diagnosis)
480 | data_period2$bmi               <- normalise(data_period2$bmi)
481 | 
482 | colnames(data_period2)[colnames(data_period2) == 'smoke_cat.0']     <- 'Non_smoker'
483 | colnames(data_period2)[colnames(data_period2) == 'smoke_cat.1']     <- 'Ex_smoker'
484 | colnames(data_period2)[colnames(data_period2) == 'smoke_cat.2']     <- 'Light_smoker'
485 | colnames(data_period2)[colnames(data_period2) == 'smoke_cat.3']     <- 'Moderate_smoker'
486 | colnames(data_period2)[colnames(data_period2) == 'smoke_cat.4']     <- 'Heavy_smoker'
487 | colnames(data_period2)[colnames(data_period2) == 'radiotherapy.1']  <- 'Radiotherapy'
488 | colnames(data_period2)[colnames(data_period2) == 'mastectomy.1']    <- 'Mastectomy'
489 | colnames(data_period2)[colnames(data_period2) == 'other_surgery.1'] <- 'Other_surgery'
490 | colnames(data_period2)[colnames(data_period2) == 'hrt.1']           <- 'HRT_use'
491 | colnames(data_period2)[colnames(data_period2) == 'cancer_stage.1']  <- 'Stage1'
492 | colnames(data_period2)[colnames(data_period2) == 'cancer_stage.2']  <- 'Stage2'
493 | colnames(data_period2)[colnames(data_period2) == 'cancer_stage.3']  <- 'Stage3'
494 | colnames(data_period2)[colnames(data_period2) == 'cancer_stage.4']  <- 'Stage4'
495 | colnames(data_period2)[colnames(data_period2) == 'cancer_grade.1']  <- 'Well_differentiated'
496 | colnames(data_period2)[colnames(data_period2) == 'cancer_grade.2']  <- 'Moderately_differentiated'
497 | colnames(data_period2)[colnames(data_period2) == 'cancer_grade.3']  <- 'Poorly_differentiated'
498 | colnames(data_period2)[colnames(data_period2) == 'pr_status2.3']    <- 'PR_positive'
499 | colnames(data_period2)[colnames(data_period2) == 'her2_status2.3']  <- 'HER2_positive'
500 | colnames(data_period2)[colnames(data_period2) == 'er_status2.3']    <- 'ER_positive' 
501 | colnames(data_period2)[colnames(data_period2) == 'cancer_route.2']  <- 'Emergency_presentation'
502 | colnames(data_period2)[colnames(data_period2) == 'cancer_route.3']  <- 'GP_referral'
503 | colnames(data_period2)[colnames(data_period2) == 'cancer_route.4']  <- 'Inpatient_elective'
504 | colnames(data_period2)[colnames(data_period2) == 'cancer_route.5']  <- 'Other_outpatient_pathway'
505 | colnames(data_period2)[colnames(data_period2) == 'cancer_route.6']  <- 'Screening_detected'
506 | colnames(data_period2)[colnames(data_period2) == 'cancer_route.7']  <- 'Two_week_wait'
507 | 
508 | ## Check dimensions of two sub-sets ##
509 | dim(data_period1)
510 | dim(data_period2) 
511 | 
512 | ## Clean up memory by removing the dummied interim datasets 
513 | rm(dummied) 
514 | rm(dummied2) 
515 | 
516 | ###############################################
517 | ## Define custom loss function for the model ##
518 | ###############################################
519 | 
520 | ## Define root mean squared error loss function ##
521 | ## This is used to fit the neural network model ##
522 | 
523 | rmse <- function(y_true, y_pred){
524 |   K <- backend()
525 |   loss <- K$sqrt(K$mean((y_true - y_pred)^2))
526 |   return(loss)
527 | }
528 | 
529 | ## Set up callbacks ##
530 | callbacks_list <- list(
531 |          callback_reduce_lr_on_plateau(monitor = "loss", factor = 0.5, patience = 3),
532 |          callback_early_stopping(monitor = "loss", min_delta = 0.0001, patience = 5)
533 |          )
534 | 
535 | ##############################################################
536 | ## Define scoring function for nested cross-validation/IECV ##
537 | ##############################################################
538 | 
539 | ## Need to use the period 1 data for tuning with 5-fold cross-validation ## 
540 | ## Use the period 2 data as the completely held-out test set(s) ## 
541 | ## Below, we set up a loop to iterate this over each region for IECV, but here, 
542 | # first we set up the Bayesian Optimisation scoring function that will be embedded in this loop ##
543 | set.seed(2066) 
544 |   
545 | scorefunction <- function(epochs, units, layers, learnrate){
546 |   
547 |   set.seed(2066)
548 |   
549 |   k <- 5 
550 |   indices <- sample(1:nrow(data_fit))                               ## 'data_fit' will be generated from data_period1 subset 
551 |   folds <- cut(1:length(indices), breaks = k, labels = FALSE)
552 |   
553 |   iecv_score <- c() 
554 |   
555 |   for (a in 1:k) {
556 |     
557 |     iecv_val_indices     <- which(folds==a)
558 |     
559 |     iecv_val_data        <- data_fit[iecv_val_indices, ]           ## validation is the heldout 1/5th
560 |     iecv_val_labels      <- label_fit[iecv_val_indices]            ## for CV loop for hyperparam tuning 
561 |     
562 |     iecv_fit_data        <- data_fit[-iecv_val_indices, ]
563 |     iecv_fit_labels      <- label_fit[-iecv_val_indices]
564 |     
565 |     bayes_model <- keras_model_sequential()                        ## Instantiate keras model 
566 |     for(b in 1:layers){                                            ## loop added to vary number of hidden layers
567 |       bayes_model %>% 
568 |         layer_dense(units = units, activation = 'relu', 
569 |                     input_shape=c(27))
570 |     }
571 |     bayes_model %>% layer_dense(units = 1, activation = "linear") %>% 
572 |       compile(
573 |         optimizer = optimizer_adam(lr=learnrate),  
574 |         loss = rmse,                                                ## Use the above defined custom loss function
575 |         metrics = "mse"                                             ## Reported as model fits, but not a loss function 
576 |       )
577 |     ## Fit the model- batch size= 1024, weights applied for loss function ##
578 |     bayes_model %>% fit(iecv_fit_data, iecv_fit_labels, epochs=epochs, 
579 |                      batch_size=1024, verbose=0, callbacks = callbacks_list, 
580 |                      validation_data = list(iecv_val_data, iecv_val_labels)) 
581 |     
582 |     iecv_predictions <- predict(bayes_model, iecv_val_data)
583 |     negative_rmse <- -1*sqrt(mean((iecv_predictions-iecv_val_labels)^2))
584 |     iecv_score <- c(iecv_score, negative_rmse)
585 |     
586 |     rm(bayes_model) 
587 |    }
588 |   
589 |   return(list(Score = mean(iecv_score)))                           ## Return the score (-ve RMSE obtained with each combination to Bayes Optim algorithm)
590 |   
591 |  }
592 | 
593 | #############################################################
594 | ## Define hyperparameter search space - 'bounds' to search ##
595 | #############################################################
596 | 
597 | bounds <- list(
598 |   epochs = c(1L, 20L),                                  ## Number of times model gets fed the data
599 |   units = c(27L, 50L),                                  ## Number of nodes in each layer 
600 |   layers = c(1L, 5L),                                   ## Between 1 and 5 hidden layers
601 |   learnrate = c(0.001, 0.1)  
602 | )
603 | 
604 | ######################################################
605 | ## Define Bayesian Optimisation function and run it ##
606 | ######################################################
607 | 
608 | start <- Sys.time()                           ## Time taken is of interest 
609 | 
610 | iecv_predictions <- c() 
611 | 
612 | for(d in seq(1:10)) {                        ## 10 regions (sha1) - loop over each of them
613 |   
614 |   set.seed(1066)  
615 |   
616 |   data_1 <- data_period1[ -which (data_period1$sha1 == 
617 |                                      levels(data_period1$sha1)[d]), ]          ## Fit data is all regions except region 'a'
618 |   data_fit       <- as.matrix(data_1[, x_cols])                                ## Form this into matrix of predictors 
619 |   label_fit      <- as.matrix(data_1[, y_cols])                                ## Form matrix of their pseudovalues
620 |   
621 |   data_2 <- data_period2[ which (data_period2$sha1 == 
622 |                                     levels(data_period2$sha1)[d]), ]           ## Validation data is region 'a'
623 |   data_val       <- as.matrix(data_2[, x_cols])                                ## Package matrix of predictors - no need for 'target' pseudovalues here
624 |   
625 |   bayesian_neural <- bayesOpt(                                                 ## Bayesian Optimisation command, will run scoring function above on this cross-val loop
626 |   FUN = scorefunction, 
627 |   bounds = bounds, 
628 |   initPoints = 25, 
629 |   iters.n = 25, 
630 |   iters.k = 1, 
631 |   parallel = FALSE, 
632 |   plotProgress = FALSE, 
633 |   verbose = 2, 
634 |   errorHandling = "continue", 
635 |   acq = "ei"
636 |   ) 
637 |  
638 |   ## Summarise Bayesian Optimisation run ##
639 |   bayesian_neural$scoreSummary 
640 |   plot(bayesian_neural) 
641 |   bestpars <- getBestPars(bayesian_neural)                                      ## Best parameter combination found in this iteration
642 |   bestpars 
643 | 
644 |   # Extract optimal hyperparameter config to fit final model on whole 'fit' dataset # 
645 |   best_epochs <- bestpars[1]
646 |   best_units <- bestpars[2]
647 |   best_layers <- toString(bestpars[3])
648 |   best_learnrate <- as.numeric(bestpars[4]) 
649 | 
650 |   ## Use same format as above - define the 'final_model' and fit it ##
651 |   iecv_model <- keras_model_sequential() 
652 |   for(e in 1:best_layers){
653 |     iecv_model %>% 
654 |       layer_dense(units = best_units, activation = 'relu',
655 |                 input_shape=c(27))
656 |   }
657 |   iecv_model %>% layer_dense(units = 1, activation = "linear") %>% 
658 |   compile(
659 |     optimizer_adam(lr=best_learnrate), 
660 |     loss = rmse, 
661 |     metrics = "mse"
662 |   )
663 | 
664 |   iecv_model %>% fit(data_fit, label_fit, epochs=best_epochs, 
665 |                     batch_size=1024, verbose=0)                        ## Fit model to period 1 data from all regions except 'a'
666 | 
667 | predictions <- predict(iecv_model, data_val)                                         ## Test model on period 2 data for region 'a'
668 | iecv_predictions[[d]] = predictions                                                  ## Store the predictions as loop iterates 
669 | 
670 | rm(iecv_model)                                                                       ## Clear out so that model can be refitted on next loop iteration without stacking networks...
671 | 
672 | }
673 | 
674 | end <- Sys.time() 
675 | end-start                                                                            ## Took 2.08 days on my set-upwith GPU 
676 | 
677 | 
678 | ###########################################
679 | # Combine predictions generated from IECV #
680 | ###########################################
681 | nnet_iecv_predictions <-    c(iecv_predictions[[1]], iecv_predictions[[2]], 
682 |                               iecv_predictions[[3]], iecv_predictions[[4]], 
683 |                               iecv_predictions[[5]], iecv_predictions[[6]],
684 |                               iecv_predictions[[7]], iecv_predictions[[8]], 
685 |                               iecv_predictions[[9]], iecv_predictions[[10]])
686 | 
687 | summary(nnet_iecv_predictions)
688 | 
689 | ## Combine with individual ID numbers so those can be linked up later ##
690 | 
691 | patid <- as.data.frame(data_period2$patid)
692 | export_predictions <- cbind(patid, nnet_iecv_predictions)
693 | head(export_predictions)
694 | 
695 | ## Save these for use in Stata for evaluation ##
696 | 
697 | setwd("/estimates/ML_IECV_predictions/")
698 | write.csv(export_predictions, "endpoint3_nnet_iecv_predictions.csv")
699 | 
700 | 
701 | ###############################################################################################
702 | ###############################################################################################
703 | ###############################################################################################
704 | 
705 | 


--------------------------------------------------------------------------------