├── README.md ├── 4_OX129_ML_data_prep ├── 8_OX129_decision_curve_analysis ├── Licence.txt ├── 1_OX129_imputation ├── 2_OX129_Cox_models ├── 7_OX129_ML_model_evaluation ├── 3_OX129_Competing_risks_regression ├── 5_OX129_XGBoost └── 6_OX129_neural_networks /README.md: -------------------------------------------------------------------------------- 1 | ** Please review the file 'Licence.txt' regarding the academic use licence ** 2 | 3 | - 👋 Hi, I’m Ash Darnell-Fish @AshDF91 4 | - 👀 Clinical Research Fellow (DPhil) at the University of Oxford 5 | - 🌱 Interested in clinical prediction modelling using statistical learning and machine learning 6 | - 🐱‍🏍 Competing risks models, pseudovalues, model calibration, Bayesian optimisation, adapting ML models for time-to-event/competing risks data, assessing stability/transportability of models using meta-analytical approaches 7 | 8 | -------------------------------------------------------------------------------- /4_OX129_ML_data_prep: -------------------------------------------------------------------------------- 1 | ## Preparing multiply imputed data for machine learning modelling ## 2 | 3 | ## The ML modelling is undertaken in R, so the Stata dataset needs to be prepared in a form R can use ## 4 | 5 | ## The approach is centred on stacked imputations - a stack of all 50 imputations is formed 6 | ## to generate one long dataset, with each row assigned a weight of 0.02 (1/50) ## 7 | 8 | ## The outcome variables are jack-knife pseudovalues, so these need to be generated too ## 9 | 10 |
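* For reference: the jack-knife pseudovalue for individual i at the 10-year
* horizon is pseudo_i = n*F(10) - (n-1)*F_minus_i(10), where F(10) is the
* Aalen-Johansen estimate of the cumulative incidence function (CIF) for
* breast cancer death (with other-cause death as the competing event), and
* F_minus_i(10) is the same estimate with individual i left out. The
* pseudovalues average back to F(10) by construction, which gives a quick
* sanity check once the dataset below is loaded (plain Stata; the
* pseudovalues themselves were generated in the competing risks step):
summ pseudo
display "Mean pseudovalue (approx. the 10-year CIF): " r(mean)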
11 | ***************************************************** 12 | * Prepare for ML modelling – calculate pseudovalues * 13 | ***************************************************** 14 | use "\final_datasets\OX129_endpoint3_pseudovaluescif.dta", clear // data used in competing risks modelling 15 | 16 | * convert to long form - m=0 at top, then stacked imputations m=1,2,3...50 below in sequence * // m=0 is the complete case dataset 17 | mi convert flong 18 | 19 | * drop the m=0, i.e. non-imputed, data * // in flong format, the _mi_m variable is generated to denote the imputed set, e.g. _mi_m==10 is the 10th imputation 20 | tab _mi_m 21 | * This is the variable denoting which imputation each row 'belongs' to * 22 | drop if _mi_m==0 23 | 24 | * Stack imputations * 25 | * For purposes of later modelling in R - sort by sha1 (geographical region), then by patid, then by imputation number * 26 | * This will enable the predictions to be matched to each individual when IECV runs on the stacked dataset * 27 | sort sha1 patid _mi_m 28 | 29 | * Remove the mi setting - this makes a 'clean' simple stacked dataset * 30 | mi unset, asis 31 | rename _mi_m imputation // To track imputation numbers if needed 32 | 33 | * Assign a weight to each observation for use in DL/XGBoost in R - each row 34 | * has a weight of 1/50, so that all imputations together 'equal' 1 patient * 35 | gen imp_weight = 1/50 36 | summ imp_weight, det 37 | 38 | ******************** 39 | * CIF pseudovalues * 40 | ******************** 41 | * Already calculated in the competing risks regression approach - 42 | * summarise here for the record * 43 | summ pseudo, det // whole cohort 44 | summ period1_pseudo, det // used for model fitting 45 | summ period2_pseudo, det // used for model evaluation in the IECV process 46 | 47 | * Retransform FP term to get the 'final' imputed BMI variable * 48 | gen bmi_x = 1/(Idiag__1+0.135374792) 49 | gen bmi = 10*(sqrt(bmi_x)) 50 | summ bmi, det // Unscaled values 51 | 52 | summ townsend, det // Already pre-generated in competing risks regression approach 53 | 54 | * Slim down and keep salient variables (predictors, etc.) * 55 | keep patid sha1 study_practice cancer_route ethriskid smoke_cat cancer_grade cancer_stage /// 56 | progesterone_status HER2_status vasculitis radiotherapy chemotherapy mastectomy other_surgery /// 57 | tca hrt anti_psychotic age_at_diagnosis bmi townsend period pseudo period1_pseudo period2_pseudo /// 58 | timesplit endpoint exit3date d_breastca fu_start fu_end imputation imp_weight iecv_event prob_cens ipcw 59 | 60 | save "\final_datasets\ML\OX129_endpoint3_stacked50_pseudovalues.dta", replace 61 | 62 | ** Form separate datasets for period 1 and period 2 - this will aid IECV in R ** 63 | use "\final_datasets\ML\OX129_endpoint3_stacked50_pseudovalues.dta", clear 64 | keep if period==1 65 | save "\final_datasets\ML\OX129_endpoint3_stacked50_period1_pseudovalues.dta", replace 66 | clear 67 | 68 | ******************************************************************************** 69 | ** Again, use the previously generated dataset to make the period 2 version ** 70 | use "\final_datasets\ML\OX129_endpoint3_stacked50_pseudovalues.dta", clear 71 | keep if period==2 72 | save "\final_datasets\ML\OX129_endpoint3_stacked50_period2_pseudovalues.dta", replace 73 | clear 74 | 75 | ******************************************************************************** 76 | ******************************************************************************** -------------------------------------------------------------------------------- /8_OX129_decision_curve_analysis: -------------------------------------------------------------------------------- 1 | ## USING DECISION CURVE ANALYSIS TO COMPARE CLINICAL UTILITY OF MODELS ## 2 | 3 | ## We use the saved individual-level predictions generated with IECV to evaluate this ## 4 | ## Overall, we load in the average prediction (average over 50 imputations, as per Rubin's rule for point estimates), 5 | ## merge with the survival/time-to-event data in the full dataset, then run decision curve analysis with the user-written stdca command ## 6 |
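* A minimal sketch of the 'average prediction' step described above,
* assuming a long file with one row per patid per imputation and a
* variable 'predictions' holding that imputation's predicted 10-year
* risk (names follow the ML evaluation files):
bysort patid: egen mean_prediction = mean(predictions) // Rubin's rules point estimate
keep patid mean_prediction
duplicates drop // one row per patient, ready to merge on patid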
7 | *********************************************************************************************** 8 | 9 | ** Step 1: Obtain mean predictions for each individual from each model, save as mini file with patid and mean_prediction ** 10 | 11 | ** Cox model 12 | use "\final_datasets\OX129_endpoint3_imputed_IECV.dta", clear 13 | mi convert wide 14 | keep if period==2 15 | gen mean_prediction_cox = 1 - period1_baseline_surv_cox^exp(iecv_xb) 16 | rename mean_prediction_cox cox // Rename each model's prediction simply, so we can keep track/evaluate 17 | keep patid cox 18 | summ cox, det 19 | save "\estimates\ML_IECV_predictions\DCA_endpoint3_cox_iecv_predictions_formatted.dta", replace 20 | clear 21 | 22 | ** Competing risks regression 23 | use "\final_datasets\OX129_endpoint3_competingrisk_IECV.dta", clear 24 | mi convert wide 25 | keep if period==2 26 | rename iecv_probabilities crr // crr = competing risks regression, and likewise below 27 | keep patid crr 28 | summ crr, det 29 | save "\estimates\ML_IECV_predictions\DCA_endpoint3_crr_iecv_predictions_formatted.dta", replace 30 | clear 31 | 32 | ** XGBoost 33 | use "\estimates\ML_IECV_predictions\endpoint3_xgboost_iecv_predictions_formatted.dta", clear 34 | keep patid mean_prediction 35 | rename mean_prediction xgb 36 | summ xgb, det 37 | save "\estimates\ML_IECV_predictions\DCA_endpoint3_xgboost_iecv_predictions_formatted.dta", replace 38 | clear 39 | 40 | 41 | ** Neural network 42 | use "\estimates\ML_IECV_predictions\endpoint3_nnet_iecv_predictions_formatted.dta", clear 43 | keep patid mean_prediction 44 | rename mean_prediction nnet 45 | summ nnet, det 46 | save "\estimates\ML_IECV_predictions\DCA_endpoint3_nnet_iecv_predictions_formatted.dta", replace 47 | clear 48 | 49 | 50 | ** Import, merge and analyse ** 51 | use "\final_datasets\OX129_endpoint3_pseudovaluescif.dta", clear // The key dataset generated during the regression modelling steps 52 | mi convert wide // Don't need stacked imputations - quicker to run, and wide format makes sense here 53 | keep if period==2 // We only have predictions generated in Period 2 data after IECV 54 | drop _merge 55 | 56 | foreach x in cox crr xgboost nnet { // Merge in the predictions from each model, with their short variable names 57 | 58 | merge 1:1 patid using "\estimates\ML_IECV_predictions\DCA_endpoint3_`x'_iecv_predictions_formatted.dta" 59 | drop _merge 60 | } 61 | 62 | mi stset follow_up, failure(iecv_event==1) exit(time 10) // Stset the data so that DCA can run 63 | 64 | cd "\ado\plus\" // Path for calling in non-routine (user-written) packages 65 | 66 | stdca cox crr xgb nnet, timepoint(10) prob(yes yes yes yes) smooth lpattern(solid solid dash dash dash dash) // patterns cover the four models plus the 'all'/'none' reference curves 67 | graph save "Graph" "\graphs\EP3_decisioncurve_all_models.gph", replace 68 |
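* For reference (standard decision curve analysis, not specific to this
* code): at a threshold probability pt, a model's net benefit is
*     NB(pt) = TP/N - (FP/N) * pt/(1 - pt)
* where TP and FP count the true and false positives obtained by treating
* everyone whose predicted 10-year risk exceeds pt. The odds factor
* pt/(1-pt) weights false positives - at pt = 0.10, one true positive is
* valued at nine false positives. The model with the highest net benefit
* across clinically sensible thresholds has the greatest clinical
* utility; stdca computes this for time-to-event data.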
69 | * Now, run the version that takes into account competing risks - the key one! * 70 | stdca cox crr xgb nnet, timepoint(10) prob(yes yes yes yes) smooth lpattern(solid solid dash dash dash dash) compet1(2) // competing event is iecv_event==2 (= other death) 71 | graph save "Graph" "\graphs\EP3_decisioncurve_all_models_competing.gph", replace // distinct filename so the first decision curve is not overwritten 72 | 73 | clear 74 | 75 | ******************************************************************************* 76 | ******************************************************************************* 77 | ** END ** 78 | 79 | -------------------------------------------------------------------------------- /Licence.txt: -------------------------------------------------------------------------------- 1 | Academic Use Licence 2 | 3 | These licence terms apply to all licences granted by THE CHANCELLOR, MASTERS AND SCHOLARS OF THE UNIVERSITY OF OXFORD 4 | whose administrative offices are at University Offices, Wellington Square, Oxford OX1 2JD, United Kingdom (the “University”) 5 | for use of breast cancer prognosis and risk prediction tools (“the Software”) through this website 6 | https://github.com/AshDF91/Breast-cancer-prognosis (the “Website”). 7 | 8 | PLEASE READ THESE LICENCE TERMS CAREFULLY BEFORE DOWNLOADING THE SOFTWARE THROUGH THIS WEBSITE. IF YOU DO NOT AGREE TO 9 | THESE LICENCE TERMS YOU SHOULD NOT DOWNLOAD THE SOFTWARE. 10 | 11 | THE SOFTWARE IS INTENDED FOR USE BY ACADEMICS CARRYING OUT RESEARCH AND NOT FOR USE BY CONSUMERS OR COMMERCIAL BUSINESSES. 12 | 13 | 1. Academic Use Licence 14 | 1.1 The Licensee is granted a limited non-exclusive and non-transferable royalty free licence to download and use the Software 15 | provided that the Licensee will: 16 | (a) limit their use of the Software to their own internal academic non-commercial research which is undertaken for the purposes 17 | of education or other scholarly use; 18 | (b) not use the Software for or on behalf of any third party or to provide a service or integrate all or part of the Software 19 | into a product for sale or license to third parties; 20 | (c) use the Software in accordance with the prevailing instructions and guidance for use given on the Website and comply with 21 | procedures on the Website for user identification, authentication and access; 22 | (d) comply with all applicable laws and regulations with respect to their use of the Software; and 23 | (e) ensure that the Copyright Notice “Copyright © 2021, University of Oxford” appears prominently wherever the Software is 24 | reproduced and is referenced or cited with the Copyright Notice when the Software is described in any research publication or 25 | on any documents or other material created using the Software. 26 | 1.2 The Licensee may only reproduce, modify, transmit or transfer the Software where: 27 | (a) such reproduction, modification, transmission or transfer is for academic, research or other scholarly use; 28 | (b) the conditions of this Licence are imposed upon the receiver of the Software or any modified Software; 29 | (c) all original and modified Source Code is included in any transmitted software program; and 30 | (d) the Licensee grants the University an irrevocable, indefinite, royalty free, non-exclusive unlimited licence to use and 31 | sub-licence any modified Source Code as part of the Software. 32 | 33 | 1.3 The University reserves the right at any time and without liability or prior notice to the Licensee to revise, modify 34 | and replace the functionality and performance of the access to and operation of the Software.
35 | 1.4 The Licensee acknowledges and agrees that the University owns all intellectual property rights in the Software. 36 | The Licensee shall not have any right, title or interest in the Software. 37 | 1.5 This Licence will terminate immediately and the Licensee will no longer have any right to use the Software or exercise 38 | any of the rights granted to the Licensee upon any breach of the conditions in Section 1 of this Licence. 39 | 40 | 2. Indemnity and Liability 41 | 2.1 The Licensee shall defend, indemnify and hold harmless the University against any claims, actions, proceedings, losses, 42 | damages, expenses and costs (including without limitation court costs and reasonable legal fees) arising out of or in connection 43 | with the Licensee's possession or use of the Software, or any breach of these terms by the Licensee. 44 | 2.2 The Software is provided on an ‘as is’ basis and the Licensee uses the Software at their own risk. No representations, 45 | conditions, warranties or other terms of any kind are given in respect of the Software and all statutory warranties and 46 | conditions are excluded to the fullest extent permitted by law. Without affecting the generality of the previous sentences, 47 | the University gives no implied or express warranty and makes no representation that the Software or any part of the Software: 48 | (a) will enable specific results to be obtained; 49 | or (b) meets a particular specification or is comprehensive within its field or that it is error free or will operate without interruption; 50 | or (c) is suitable for any particular, or the Licensee's specific purposes. 51 | 2.3 Except in relation to fraud, death or personal injury, the University's liability to the Licensee for any use of the Software, 52 | in negligence or arising in any other way out of the subject matter of these licence terms, will not extend to any incidental or 53 | consequential damages or losses, or any loss of profits, loss of revenue, loss of data, loss of contracts or opportunity, whether direct or indirect. 54 | 2.4 The Licensee hereby irrevocably undertakes to the University not to make any claim against any employee, student, researcher or 55 | other individual engaged by the University, being a claim which seeks to enforce against any of them any liability whatsoever in 56 | connection with these licence terms or their subject-matter. 57 | 58 | 3. General 59 | 3.1 Severability - If any provision (or part of a provision) of these licence terms is found by any court or administrative body 60 | of competent jurisdiction to be invalid, unenforceable or illegal, the other provisions shall remain in force. 61 | 3.2 Entire Agreement - These licence terms constitute the whole agreement between the parties and supersede any previous arrangement, 62 | understanding or agreement between them relating to the Software. 63 | 3.3 Law and Jurisdiction - These licence terms and any disputes or claims arising out of or in connection with them shall be governed by, 64 | and construed in accordance with, the law of England. The Licensee irrevocably submits to the exclusive jurisdiction of the English courts 65 | for any dispute or claim that arises out of or in connection with these licence terms. 66 | 67 | If you are interested in using the Software commercially, please contact Oxford University Innovation Limited to negotiate a licence. Contact details are enquiries@innovation.ox.ac.uk quoting reference OUI 21633.
68 | 69 | -------------------------------------------------------------------------------- /1_OX129_imputation: -------------------------------------------------------------------------------- 1 | 2 | ******************************************************************************** 3 | * 1) Variable descriptions * 4 | * 2) Multiple imputation run * 5 | ******************************************************************************** 6 | 7 | ******************************************************************************** 8 | * 1) Variable descriptions * 9 | ******************************************************************************** 10 | 11 | * All variables are the latest recorded prior to/at the date of diagnosis, unless specified * 12 | 13 | * age = continuous 14 | * diagnosis_bmi = body mass index, continuous 15 | * town_int = Townsend deprivation score, continuous 16 | * alcohol_cat = alcohol intake group, based on units/day coding in EMIS, ordinal 17 | * smoke_cat = smoking status, as per EMIS coding (non-smoker, ex-smoker, etc.), ordinal 18 | * cancer_stage = stage at diagnosis, stage I, II, III, IV, ordinal 19 | * cancer_grade = differentiation/grade of tumour, ordinal 20 | * er_status = Oestrogen receptor status, binary (+ve, -ve) 21 | * pr_status = progesterone receptor status, binary (+ve, -ve) 22 | * her2_status = HER2 receptor status, binary (+ve, -ve) 23 | * cancer_route = route to cancer diagnosis, from PHE, e.g. screen-detected, emergency admission, categorical 24 | * ethriskid = self-reported ethnicity, informed by ONS classes, categorical 25 | * fh_breastca = recorded family history of breast cancer, binary 26 | * radio = radiotherapy within 1 year of breast cancer diagnosis, binary 27 | * chemo = chemotherapy within 1 year of breast cancer diagnosis, binary 28 | * mastectomy = mastectomy within 1 year of breast cancer diagnosis, binary 29 | * other_surg = other surgery (e.g.
wide local excision) within 1 year of breast cancer diagnosis, binary 30 | * ckd = chronic kidney disease, ordinal 31 | * cirrhosis = chronic liver disease or cirrhosis, binary 32 | * htn = hypertension, binary 33 | * ihd = history of ischaemic heart disease, binary 34 | * t1dm = type 1 diabetes mellitus, binary 35 | * t2dm = type 2 diabetes mellitus, binary 36 | * vasculitis = vasculitis, binary 37 | * lupus = systemic lupus erythematosus, binary 38 | * thiazide = use of thiazide within 6 months pre-diagnosis, binary 39 | * betablocker = use of beta-blocker med within 6mo pre-diagnosis, binary 40 | * acei = use of ACE inhibitor med within 6mo pre-diagnosis, binary 41 | * tca = use of tricyclic antidepressant within 6mo pre-diagnosis, binary 42 | * ssri = use of selective serotonin reuptake inhibitor within 6mo pre-diagnosis, binary 43 | * maoi = use of monoamine oxidase inhibitor within 6mo pre-diagnosis, binary 44 | * otherantidepress = use of other (not above) antidepressant within 6mo pre-diagnosis, binary 45 | * ocp = use of oral contraceptive pill within 6mo pre-diagnosis, binary 46 | * hrt = use of hormone replacement therapy within 6mo pre-diagnosis, binary 47 | * anti_psycho = use of anti-psychotic medication within 6mo pre-diagnosis, binary 48 | * d_breastca = death with breast cancer on death certificate during follow-up 49 | * outcomedate6 = date of breast cancer diagnosis (earliest of GP/HES/CancerRegistry) 50 | * exit3date = date of death, or date of censoring (right censor date) 51 | 52 | 53 | ******************************************************************************** 54 | * 2) MULTIPLE IMPUTATION PREPARATION AND RUN * 55 | ******************************************************************************** 56 | 57 | use "\data\final_datasets\Endpoint_3\OX129_endpoint3.dta" 58 | 59 | * Visualise distributions of continuous variables * 60 | 61 | histogram age, percent normal 62 | histogram diagnosis_bmi, percent normal 63 | 64 | * Assess natural log transform - does it give a more normal distribution for multiple imputation? * 65 | gen ln_bmi = log(diagnosis_bmi) 66 | histogram ln_bmi, percent normal 67 | * Check the Townsend score too * 68 | histogram town_int, percent normal // looks fine 69 | 70 | 71 | * Stset the data ahead of generating the Nelson-Aalen estimate for the imputation model * 72 | * Tabulate the breast cancer death variable * 73 | tab d_breastcancer 74 | 75 | * Stset the data, restrict to 10 years follow-up (prediction horizon) * 76 | stset exit3date, origin(outcomedate6) fail(d_breastca==1) scale(365.25) exit(time outcomedate6 + 3652.5) 77 | 78 | * Estimate Nelson-Aalen cumulative failure function for inclusion in the imputation model * 79 | sts gen nelson_aalen = na 80 | summ nelson_aalen, det 81 | 82 | * Quantify degree of missingness for key variables to be considered * 83 | tab ethriskid, m 84 | tab smoke_cat, m 85 | tab alcohol_cat, m 86 | tab er_status, m 87 | tab her2_status, m 88 | tab pr_status, m 89 | tab cancer_grade, m 90 | tab cancer_route, m 91 | tab cancer_stage, m 92 | 93 | count if ln_bmi==. 94 | count if town_int==.
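* A compact built-in alternative to the tabulations above (standard
* Stata; shown for convenience, not part of the original pipeline):
misstable summarize ethriskid smoke_cat alcohol_cat er_status her2_status pr_status cancer_grade cancer_route cancer_stage ln_bmi town_int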
95 | 96 | * Assess ranges for continuous variables, so that we can perform truncated regression in multiple imputation * 97 | summ ln_bmi, det 98 | summ town_int, det 99 | 100 | * Use period (decade) of cohort entry as an auxiliary variable * 101 | gen timesplit = td(31Dec2009) 102 | format timesplit %td 103 | gen period = 1 104 | replace period = 2 if outcomedate6>timesplit 105 | tab period 106 | 107 | ** Set up the data for multiple imputation, register variables to be imputed, and run it ** 108 | mi set wide 109 | 110 | mi register imputed ln_bmi town_int smoke_cat alcohol_cat cancer_grade cancer_stage /// 111 | cancer_route er_status pr_status her2_status ethriskid 112 | 113 | timer on 1 114 | 115 | * Nelson-Aalen cumulative hazard estimate and period of cohort entry as auxiliary variables * 116 | * Include the endpoint indicator too * 117 | 118 | mi impute chained (truncreg, ll(2.70805) ul(3.850157)) ln_bmi (truncreg, ll(-7) ul(10)) /// 119 | town_int (ologit, ascontinuous) alcohol_cat smoke_cat cancer_stage cancer_grade /// 120 | (mlogit, ascontinuous) er_status pr_status her2_status cancer_route ethriskid = age i.fh_breastca /// 121 | i.radio i.chemo i.ckd i.cirrhosis i.htn i.ihd i.t1dm i.t2dm i.vasculitis i.lupus i.thiazide i.betablocker /// 122 | i.acei i.raa i.tca i.ssri i.maoi i.otherantidepress i.ocp i.hrt i.anti_psycho i.mastect i.other_surg /// 123 | i._d nelson_aalen i.period, add(50) augment chaindots rseed(1066) 124 | 125 | timer off 1 126 | timer list 127 | timer clear
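* Worked check of the truncation bounds above (arithmetic added for
* orientation): the truncreg limits for ln_bmi back-transform to
* plausible BMI limits of roughly 15 and 47 kg/m2, while ll(-7)/ul(10)
* bracket the observed Townsend range summarised earlier.
display exp(2.70805)  // = 15.0 (lower BMI bound)
display exp(3.850157) // = 47.0 (upper BMI bound)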
128 | 129 | * Convert ln(BMI) back to the original scale for later use in FPs and models * 130 | mi passive: gen bmi = exp(ln_bmi) 131 | summ _1_bmi, det 132 | summ _50_bmi, det 133 | 134 | * Save imputed datasets for use in the next steps - FP selection, model fitting, evaluation * 135 | save "\data\final_datasets\Endpoint_3\OX129_endpoint3_imputed.dta", replace 136 | 137 | clear 138 | log close // assumes a log was opened earlier in the session 139 | ******************************************************************************** 140 | ******************************************************************************** 141 | 142 | 143 | -------------------------------------------------------------------------------- /2_OX129_Cox_models: -------------------------------------------------------------------------------- 1 | ## Cox proportional hazards model ## 2 | 3 | 4 | ## Multiple imputation already performed ## 5 | 6 | **************************************************************************** 7 | * 1) FRACTIONAL POLYNOMIAL TERM SELECTION * 8 | **************************************************************************** 9 | use "\final_datasets\OX129_endpoint3.dta" // complete case data 10 | 11 | summ age, det 12 | summ diagnosis_bmi, det 13 | summ town_int, det 14 | 15 | * outcomedate6 = date of breast cancer diagnosis (start of follow-up) 16 | * d_breastcancer = binary variable denoting breast cancer death (=1) or not (=0) 17 | * exit3date = date left cohort (died, or censored) 18 | * exit3status = categorical (0 = censored, 1 = died) 19 | 20 | 21 | stset exit3date, origin(outcomedate6) fail(d_breastca==1) scale(365.25) exit(time outcomedate6 + 3652.5) 22 | 23 | * Run the mfp package to find FP terms * 24 | mfp stcox age diagnosis_bmi town_int, all 25 | 26 | * Age = 0.5, 2 27 | * BMI = -2, -2 (as implemented below) 28 | * Townsend score = 1 (keep linear) 29 | clear 30 | ******************************************************************************** 31 | 32 | ******************************************************************************** 33 | * 2) FIT FULL MODEL, PERFORM PREDICTOR VARIABLE SELECTION * 34 | ******************************************************************************** 35 | 36 | use "\final_datasets\OX129_endpoint3_imputed.dta" // imputed data, ready for modelling 37 | 38 | ** Generate FP terms - the powers were selected in the previous step above ** 39 | mi passive: gen double bmi_x = bmi/10 40 | mi passive: gen bmi_fp1 = (bmi_x^-2) - 0.1353749689 41 | mi passive: gen bmi_fp2 = (bmi_x^-2)*ln(bmi_x) - 0.1353551232 42 | 43 | fp gen double age_at_diagnosis^(.5 2), scale(0 10) center(63.1168366) 44 | rename age_at_diagnosis1 age_fp1 45 | rename age_at_diagnosis2 age_fp2 46 | 47 | * Register as imputed variables so that Stata knows how to handle them across mi commands * 48 | mi register imputed bmi_fp1 bmi_fp2 // imputed variables generated passively 49 | mi register regular age_fp1 age_fp2 // age was complete for all, so just declare as regular 50 | 51 | * Tabulate events - d_breastcancer = breast cancer death (of interest), 52 | * exit3status = contains all-cause death (used to define the competing event) 53 | tab d_breastcancer 54 | tab exit3status 55 | 56 | gen time = exit3date - outcomedate6 57 | replace time = time/365.25 58 | 59 | ************************************************************************ 60 | * Stset the data - 10 year exit date as this is the prediction horizon * 61 | ************************************************************************ 62 | mi stset time, failure(d_breastca==1) exit(time 10) 63 | 64 | 65 | ******************************************** 66 | ** Fit full model, including interactions ** 67 | ******************************************** 68 | * Form a global for the covariates, then use this in the model * 69 | global covariates = "age_fp* bmi_fp* townsend i.ethriskid i.alcohol_cat i.smoke_cat /// 70 | ib6.cancer_route ib2.pr_status ib2.her2_status i.fh_breastca i.radio i.chemo i.mastectomy /// 71 | i.other_surg i.cancer_stage i.cancer_grade i.ckd i.cirrhosis i.htn i.ihd i.t1dm i.t2dm i.vasculitis /// 72 | i.lupus i.thiazide i.betablocker i.acei i.raa i.tca i.ssri i.maoi i.otherantidepress i.ocp i.hrt /// 73 | i.anti_psycho c.age_fp1##i.chemo c.age_fp2##i.chemo c.age_fp1##i.mastec c.age_fp2##i.mastec /// 74 | c.age_fp1##i.other_surg c.age_fp2##i.other_surg c.age_fp1##i.fh_breastca c.age_fp2##i.fh_breastca i.ethriskid##c.age_fp1 /// 75 | i.ethriskid##c.age_fp2 i.ethriskid##i.other_surg i.ethriskid##i.mastectomy i.ethriskid##i.chemotherapy i.ethriskid##i.radiotherapy" 76 | 77 | mi estimate, dots eform errorok: stcox $covariates, vce(cluster study_practice) 78 | parmest, list(,) saving(cox_endpoint3_parmest_fullmodel_preselection.dta, replace) label eform for(estimate min95 max95) 79 | 80 | ******************************************************************************** 81 | 82 | 83 | ******************************************************************************** 84 | * 3) FIT FINAL MODEL * 85 | ******************************************************************************** 86 | 87 | ************************************************************ 88 | ** Refit model, with variables meeting inclusion criteria ** 89 | ************************************************************ 90 | * Keep if HR<0.9 or HR>1.1 and p<0.01 * 91 | * Keep if interaction significant at p<0.01 * 92 | 93 | global selected_covariates = "age_fp* bmi_fp* i.smoke_cat ib6.cancer_route ib2.pr_status ib2.er_status /// 94 | ib2.her2_status i.fh_breastca i.radio i.chemo i.mastect i.other_surg i.cancer_stage i.cancer_grade i.ckd /// 95 | i.cirrhosis i.acei i.tca i.ssri
i.otherantidepress i.hrt i.anti_psycho c.age_fp1##i.mastec c.age_fp2##i.mastec /// 96 | c.age_fp1##i.other_surg c.age_fp2##i.other_surg" 97 | 98 | ************************ 99 | * Save final Cox model * 100 | ************************ 101 | mi estimate, dots eform errorok saving(cox_ep3, replace): stcox $selected_covariates, vce(cluster study_practice) 102 | parmest, list(,) saving(cox_endpoint3_parmest.dta, replace) label eform for(estimate min95 max95) 103 | 104 | * Calculate the baseline function after the Cox model (rather than a KM estimate) * 105 | * Use mi xeq to estimate the baseline survival function in each imputation, then pool (average) * 106 | mi query 107 | local M = r(M) 108 | scalar base_surv = 0 109 | mi xeq 1/`M': stcox $selected_covariates ; predict double basesurv, basesurv ; summ basesurv if _t<10 ; scalar base_surv = base_surv + r(min) ; drop basesurv // drop so that predict can recreate basesurv in the next imputation 110 | scalar base_surv = base_surv/`M' 111 | disp as txt "Pooled baseline survivor function over imputations = " as res base_surv 112 | gen baseline_surv_cox = base_surv 113 | summ baseline_surv_cox 114 | 115 | 116 | ******************************************************************************** 117 | 118 | 119 | ******************************************************************************** 120 | * 4) INTERNAL-EXTERNAL CROSS-VALIDATION RUN * 121 | ******************************************************************************** 122 | 123 | ********************** 124 | * Preparing for IECV * 125 | ********************** 126 | 127 | * Need to generate a measure of follow-up time from entry to the earliest of: * 128 | * event, censoring, end of period 1, or end of study * 129 | * We want 2 temporally distinct sub-cohorts, with no overlap. Therefore, follow-up 130 | * ends at the earliest of event, censoring date, or the end of the decade * 131 | 132 | * outcomedate6 is the date of breast cancer diagnosis * 133 | gen fu_start = outcomedate6 134 | format fu_start %td 135 | gen fu_end = . 136 | 137 | * Generate variable denoting the cut-off point for the two temporally
distinct sub-cohorts * 138 | gen timesplit = mdy(01,01,2010) 139 | format timesplit %td 140 | tab period, m 141 | 142 | * People in Period 1 - truncate follow-up as appropriate * 143 | replace fu_end = min(exit3date, timesplit) if period==1 144 | replace fu_end = exit3date if period==2 145 | format fu_end %td 146 | 147 | * Calculate follow-up time using the IECV-suitable dates * 148 | gen follow_up = fu_end-fu_start 149 | summ follow_up, det 150 | 151 | replace follow_up = follow_up/365.25 152 | 153 | * Overall event counts, any time in study period * 154 | tab d_breastca 155 | 156 | * Generate new event indicator for IECV - event only within period of entry * 157 | gen iecv_event = 0 
 replace iecv_event = 1 if period==1 & d_breastca==1 & exit3date<timesplit 446 | gen nonwhite = (ethriskid>1) // ethnic groups other than White (group 1) 447 | gen age1 = age_at_diagnosis 448 | gen baseline_bmi1 = diagnosis_bmi 449 | gen town_int1 = town_int 450 | 451 | collapse (mean)age_at_diagnosis diagnosis_bmi town_int (sd)age1 baseline_bmi1 town_int1 (percent)nonwhite, by(sha1) 452 | 453 | tab age_at_diagnosis 454 | tab diagnosis_bmi 455 | tab town_int 456 | tab age1 457 | tab baseline_bmi1 458 | tab town_int1 459 | tab nonwhite 460 | 461 | save "\final_datasets\OX129_regionalperiod2_EP3.dta", replace 462 | clear 463 | 464 | 465 | ********************************************************************************** 466 | * Examine effect of age, BMI, deprivation and non-white ethnicity on performance heterogeneity * 467 | ********************************************************************************** 468 | 469 | ***************** 470 | ** Harrell's C ** 471 | use "\estimates\C_EP3regioncox.dta" 472 | gen sha1 = _n 473 | merge 1:1 sha1 using "\final_datasets\OX129_regionalperiod2_EP3.dta" 474 | 475 | meta set beta st_err, studylab(sha1) 476 | 477 | meta regress age_at_diagnosis diagnosis_bmi town_int nonwhite, random(sjonkman) se(khartung) 478 | meta regress age1 baseline_bmi1 town_int nonwhite, random(sjonkman) se(khartung) 479 | 480 | meta regress age_at_diagnosis, random(sjonkman) 481 | estat bubbleplot 482 | graph save "Graph" "\ep3_cox\bubbleplot_EP3_C_age_cox.gph", replace 483 | 484 | meta regress age1, random(sjonkman) 485 | estat bubbleplot 486 | graph save "Graph" "\ep3_cox\bubbleplot_EP3_C_ageSD_cox.gph", replace 487 | 488 | meta regress town_int, random(sjonkman) 489 | estat bubbleplot 490 | graph save "Graph" "\ep3_cox\bubbleplot_EP3_C_townsend_cox.gph", replace 491 | 492 | meta regress nonwhite, random(sjonkman) 493 | estat bubbleplot 494 | graph save "Graph" "\ep3_cox\bubbleplot_EP3_C_ethnic_cox.gph", replace 495 | 496 | meta regress diagnosis_bmi, random(sjonkman) 497 | estat bubbleplot 498 | graph save "Graph" "\ep3_cox\bubbleplot_EP3_C_bmi_cox.gph", replace 499 | 500 | meta regress baseline_bmi1, random(sjonkman) 501 | estat bubbleplot 502 | graph save "Graph" "\ep3_cox\bubbleplot_EP3_C_bmiSD_cox.gph", replace 503 | 504 | clear 505 | 506 | 507 | *********************** 508 | ** Calibration slope ** 509 | use "\estimates\slope_EP3regioncox.dta" 510 | gen sha1 = _n 511 | merge 1:1 sha1 using "\final_datasets\OX129_regionalperiod2_EP3.dta" 512 | 513 | meta set slope slope_se, studylab(sha1) 514 | 515 | meta regress age_at_diagnosis diagnosis_bmi town_int nonwhite, random(sjonkman) se(khartung) 516 | meta regress age1 baseline_bmi1 town_int nonwhite, random(sjonkman) se(khartung) 517 | 518 | meta regress age_at_diagnosis, random(sjonkman) 519 | estat bubbleplot 520 | graph save "Graph" "\ep3_cox\bubbleplot_EP3_slope_age_cox.gph", replace 521 | 522 | meta regress age1, random(sjonkman) 523 | estat
bubbleplot 524 | graph save "Graph" "\ep3_cox\bubbleplot_EP3_slope_ageSD_cox.gph", replace 525 | 526 | meta regress town_int, random(sjonkman) 527 | estat bubbleplot 528 | graph save "Graph" "\ep3_cox\bubbleplot_EP3_slope_townsend_cox.gph", replace 529 | 530 | meta regress nonwhite, random(sjonkman) 531 | estat bubbleplot 532 | graph save "Graph" "\ep3_cox\bubbleplot_EP3_slope_ethnic_cox.gph", replace 533 | 534 | meta regress diagnosis_bmi, random(sjonkman) 535 | estat bubbleplot 536 | graph save "Graph" "\ep3_cox\bubbleplot_EP3_slope_bmi_cox.gph", replace 537 | 538 | meta regress baseline_bmi1, random(sjonkman) 539 | estat bubbleplot 540 | graph save "Graph" "\ep3_cox\bubbleplot_EP3_slope_bmiSD_cox.gph", replace 541 | 542 | clear 543 | 544 | 545 | ****************************** 546 | ** Calibration in the large ** 547 | 548 | use "\estimates\citl_EP3regioncox.dta" 549 | gen sha1 = _n 550 | merge 1:1 sha1 using "\final_datasets\OX129_regionalperiod2_EP3.dta" 551 | 552 | meta set citl citl_se, studylab(sha1) 553 | 554 | meta regress age_at_diagnosis diagnosis_bmi town_int nonwhite, random(sjonkman) se(khartung) 555 | meta regress age1 baseline_bmi1 town_int nonwhite, random(sjonkman) se(khartung) 556 | 557 | meta regress age_at_diagnosis, random(sjonkman) 558 | estat bubbleplot 559 | graph save "Graph" "\ep3_cox\bubbleplot_EP3_citl_age_cox.gph", replace 560 | 561 | meta regress age1, random(sjonkman) 562 | estat bubbleplot 563 | graph save "Graph" "\ep3_cox\bubbleplot_EP3_citl_ageSD_cox.gph", replace 564 | 565 | meta regress town_int, random(sjonkman) 566 | estat bubbleplot 567 | graph save "Graph" "\ep3_cox\bubbleplot_EP3_citl_townsend_cox.gph", replace 568 | 569 | meta regress nonwhite, random(sjonkman) 570 | estat bubbleplot 571 | graph save "Graph" "\ep3_cox\bubbleplot_EP3_citl_ethnic_cox.gph", replace 572 | 573 | meta regress diagnosis_bmi, random(sjonkman) 574 | estat bubbleplot 575 | graph save "Graph" "\ep3_cox\bubbleplot_EP3_citl_bmi_cox.gph", replace 576 | 577 | meta regress baseline_bmi1, random(sjonkman) 578 | estat bubbleplot 579 | graph save "Graph" "\ep3_cox\bubbleplot_EP3_citl_bmiSD_cox.gph", replace 580 | 581 | clear 582 | 583 | 584 | ******************************************************************************** 585 | ******************************************************************************** 586 | ******************************************************************************** 587 | 588 | -------------------------------------------------------------------------------- /7_OX129_ML_model_evaluation: -------------------------------------------------------------------------------- 1 | ## Evaluating machine learning models ## 2 | 3 | ## After fitting the models, then using IECV to generate individual-level predictions, the predictions 4 | ## (which were stored in csv files) are imported into Stata so that the same metrics can be calculated ## 5 | 6 | 7 | * OPEN STATA * 8 | 9 | ************************* 10 | * NEURAL NETWORK MODELS * 11 | ************************* 12 | 13 | ******************************************************************************** 14 | ** Performance of neural network based on 'pooled predictions' and IECV ** 15 | ******************************************************************************** 16 | 17 | * Open saved csv file (from R) in Stata * 18 | import delimited "\estimates\ML_IECV_predictions\endpoint3_nnet_iecv_predictions.csv", asdouble 19 | 20 | drop v1 // v1 is just an observation number - not useful 21 | 22 | * Destring the predictions, and rename
variables for congeniality with the Stata files * 23 | destring nnet_iecv_predictions, dpcomma replace 24 | summ nnet_iecv_predictions, det 25 | rename nnet_iecv_predictions predictions // Renaming for consistency across both ML models 26 | rename data_period2patid patid 27 | bys patid: gen m = _n // Sort by patient ID and generate an imputation number (each patient is present 50 times, due to m=50 imputations) 28 | // This variable 'm' will be used to denote the imputation number 29 | * Clip the predictions to be between 0 and 1 - they are probabilities * 30 | count if predictions<0 31 | count if predictions>1 32 | replace predictions = 0.0000001 if predictions<0 // Minuscule risk rather than zero - we use the cloglog transform later! 33 | replace predictions = 0.9999999 if predictions>1 // Just under 1, rather than guaranteed risk 34 | 35 | * Save this as an interim dataset * 36 | save "\estimates\ML_IECV_predictions\endpoint3_nnet_iecv_predictions_formatted.dta", replace 37 | 38 | * We want to perform model evaluation in accordance with Rubin's rules on 39 | * multiply imputed datasets. For this, Stata 'needs' an m=0 (i.e. complete case) 40 | * copy, and then the separate imputations below it. We need to trick Stata into handling this * 41 | 42 | * First, keep the first imputation, and declare it to be the complete case, m=0 43 | keep if m==1 44 | replace m=0 45 | 46 | * Generate an mi-style missingness indicator (set to 1) to flag the predictions as missing, so that 47 | * later we can use Stata's mi machinery to handle the 50 predictions for each 48 | * patid as separate imputations' predictions 49 | gen i_predictions = 1 50 | tab m 51 | replace predictions = . if i_predictions==1 52 | 53 | * Now that the predictions in m=0 are missing, append the dataset with the 50 54 | * true values of the predictions, 1 for each patid in each imputation 55 | append using "\estimates\ML_IECV_predictions\endpoint3_nnet_iecv_predictions_formatted.dta" 56 | 57 | * Save interim data file 58 | save "\estimates\ML_IECV_predictions\endpoint3_nnet_iecv_predictions_formatted.dta", replace 59 | 60 | * Format as a multiply imputed dataset - now, m=0 has no predictions, whereas 61 | * m=1 to m=50 each hold their respective predictions 62 | mi import flong, m(m) id(patid) imputed(predictions) // Importing imputations into this dataset 63 | 64 | * Convert to wide style for the convenience of dataset size, and for merging later 65 | mi convert wide 66 | mi describe 67 | 68 | * Save interim datafile after mi conversion * 69 | save "\estimates\ML_IECV_predictions\endpoint3_nnet_iecv_predictions_formatted.dta", replace 70 | 71 | * For the smoothed calibration plot, we want each individual's average prediction, so that 72 | * this can be plotted against the observed pseudovalues * 73 | egen mean_prediction = rowmean(_1_predictions - _50_predictions) 74 | summ mean_prediction, det 75 | 76 | * Save and clear * 77 | save "\estimates\ML_IECV_predictions\endpoint3_nnet_iecv_predictions_formatted.dta", replace 78 | clear 79 |
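* For orientation, the pooling that -mi estimate- performs in the
* evaluation below follows Rubin's rules: with M = 50 imputations and a
* statistic theta estimated once per imputation,
*     pooled estimate: theta_bar = (1/M) * sum(theta_m)
*     within-imputation variance: W = (1/M) * sum(SE_m^2)
*     between-imputation variance: B = (1/(M-1)) * sum((theta_m - theta_bar)^2)
*     total variance: T = W + (1 + 1/M)*B, with pooled SE = sqrt(T)
* This is why the 50 predictions per patient are arranged as 50
* 'imputations' of one variable - it lets the standard mi machinery
* produce these pooled estimates and standard errors.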
80 | * Open up the key dataset - this contains the pseudovalues, period 1, period 2, etc. * 81 | * This is in 'wide' format * 82 | use "\final_datasets\OX129_endpoint3_pseudovaluescif.dta", clear 83 | keep if period==2 84 | 85 | * Merge in the formatted predictions from the IECV process for the neural network * 86 | mi merge 1:1 patid using "\estimates\ML_IECV_predictions\endpoint3_nnet_iecv_predictions_formatted.dta" 87 | 88 | mi describe 89 | 90 | ************************ 91 | ** Pooled predictions ** 92 | ************************ 93 | 94 | * Plot the smoothed calibration plot - save out * 95 | * Running smoother between predicted probabilities and observed pseudo-probabilities - both on the same scale * 96 | running period2_pseudo mean_prediction if _mi_m==1, ci leg(off) nopts xscale(range(0 1)) yscale(range(0 1)) aspect(1) /// 97 | title("Smoothed calibration plot - neural network", size(small)) xtitle("Predicted event probability", size(small)) /// 98 | ytitle("Observed event probability", size(small)) addplot(function y=x) graphregion(color(white)) ylabel(0(.2)1) xlabel(0(.2)1) 99 | 100 | graph save "\graphs\ep3_competing\calibration_neuralnet_iecv.gph", replace 101 | 102 | * Use the cloglog transform of the predictions for overall calibration metrics * 103 | mi passive: gen cll_pred = log(-log(1-predictions)) 104 | 105 | * Calibration of neural network - pooled predictions * 106 | * Regress the 'observed' (pseudo)probability on the cloglog of the prediction, modelling on the cloglog scale * 107 | mi estimate, dots: glm period2_pseudo cll_pred, link(cloglog) vce(robust) noconstant irls 108 | mi estimate, dots: glm period2_pseudo cll_pred, link(cloglog) vce(robust) noconstant irls offset(cll_pred) 109 | 110 | * Discrimination - pooled predictions * 111 | mi register regular ipcw // inverse probability of censoring weights - one for each patient 112 | mi stset fu_end, origin(fu_start) failure(iecv_event==1) scale(365.25) exit(time fu_start+3652.5) // Use the follow-up and event indicators previously made in the regression steps 113 | capture drop hr // in case a variable of this name was carried over 114 | mi passive: gen hr = exp(predictions) // Exponentiate to avoid any predictions equalling zero - this preserves the ordering! 115 | mi passive: gen invhr = 1/hr // somersd expects higher score = better survival, so invert 116 | gen censind = 1 - _d if _st==1 117 | mi estimate, dots cmdok: somersd _t invhr if (_st==1) [iweight=ipcw], cenind(censind) tdist transf(c) // Run it 118 | 119 |
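* Background on the discrimination estimate above: somersd is Roger
* Newson's user-written package; if it is not installed:
ssc install somersd
* With cenind(censind) and [iweight=ipcw], somersd estimates Somers' D
* between survival time and the inverted risk score among comparable
* pairs, weighting subjects by the inverse probability of remaining
* uncensored so that pairs lost to censoring are represented;
* transf(c) reports the result on the C-statistic scale, C = (D+1)/2.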
120 | **************************************************************** 121 | ** Pooled predictions: heterogeneity by ethnicity and age grp ** 122 | **************************************************************** 123 | 124 | * Discrimination and calibration in different ethnic groups * 125 | forval x = 1(1)9 { 126 | mi estimate, dots esampvaryok: somersd _t invhr if (_st==1 & ethriskid==`x') [iweight=ipcw], cenind(censind) tdist transf(c) // Ethnic group may vary across imputations, so permit this to be captured when making full use of the imputed data 127 | } 128 | 129 | * Calibration slope * 130 | forval x = 1(1)9 { 131 | mi estimate, dots esampvaryok: glm period2_pseudo cll_pred if ethriskid==`x', link(cloglog) noconstant irls vce(robust) // Estimate the per-ethnic-group result in each imputation, then pool 132 | } 133 | 134 | * Calibration in the large * 135 | forval x = 1(1)9 { 136 | mi estimate, dots esampvaryok: glm period2_pseudo cll_pred if ethriskid==`x', link(cloglog) noconstant irls vce(robust) offset(cll_pred) 137 | } 138 | 139 | 140 | ************** 141 | * Age groups * 142 | ************** 143 | gen agegroup = 1 144 | replace agegroup = 2 if age_at_diagnosis>=30 145 | replace agegroup = 3 if age_at_diagnosis>=40 146 | replace agegroup = 4 if age_at_diagnosis>=50 147 | replace agegroup = 5 if age_at_diagnosis>=60 148 | replace agegroup = 6 if age_at_diagnosis>=70 149 | replace agegroup = 7 if age_at_diagnosis>=80 150 | tab agegroup 151 | 152 | * Discrimination and calibration in different age bands * 153 | forval x = 1(1)7 { 154 | mi estimate, dots: somersd _t invhr if (_st==1 & agegroup==`x') [iweight=ipcw], cenind(censind) tdist transf(c) 155 | } 156 | 157 | * Calibration slope * 158 | forval x = 1(1)7 { 159 | mi estimate, dots: glm period2_pseudo cll_pred if agegroup==`x', link(cloglog) noconstant irls vce(robust) 160 | } 161 | 162 | * Calibration in the large * 163 | forval x = 1(1)7 { 164 | mi estimate, dots: glm period2_pseudo cll_pred if agegroup==`x', link(cloglog) noconstant irls vce(robust) offset(cll_pred) 165 | } 166 | 167 | 168 | ************************************************ 169 | ** IECV predictions and performance estimates ** 170 | ************************************************ 171 | 172 | ************************************************** 173 | ** Region-level heterogeneity and results ** 174 | ************************************************** 175 | cd "\estimates\" 176 | 177 | *************************** 178 | * GT (geographical-temporal) IECV for Harrell's C * // Loop to estimate the model's Harrell's C for each region, save into a separate dataset, then pool later via meta-analysis 179 | 180 | capture postutil clear 181 | tempname C_EP3regionnnet 182 | postfile `C_EP3regionnnet' beta st_err val_size using C_EP3regionnnet.dta , replace 183 | 184 | forval x = 1(1)10 { 185 | mi estimate, dots: somersd _t invhr if (_st==1 & sha1==`x') [iweight=ipcw], cenind(censind) tdist transf(c) 186 | local beta = r(table)[1,1] 187 | local st_err = r(table)[2,1] 188 | local val_size = e(N) 189 | post `C_EP3regionnnet' (`beta') (`st_err') (`val_size') 190 | } 191 | 192 | postclose `C_EP3regionnnet' 193 |
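* The postfile/post/postclose pattern above is standard Stata for
* collecting loop results into a new dataset: -postfile- opens a buffer
* attached to a tempname handle and declares the columns, each -post-
* writes one row (here: a region's estimate, standard error and sample
* size), and -postclose- saves the .dta file used in the meta-analyses
* below. A generic sketch of the shape (hypothetical names and values):
tempname handle
postfile `handle' est se using results.dta, replace
forval x = 1/10 {
    local est = `x'   // placeholder - the group-specific estimate goes here
    local se = 0.1    // placeholder - and its standard error
    post `handle' (`est') (`se')
}
postclose `handle'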
194 | ********************************* 195 | * GT IECV for calibration slope * 196 | 197 | capture postutil clear 198 | tempname slope_EP3regionnnet 199 | postfile `slope_EP3regionnnet' slope slope_se val_size using slope_EP3regionnnet.dta , replace 200 | 201 | forval x = 1(1)10 { 202 | mi estimate, dots: glm period2_pseudo cll_pred if sha1==`x', link(cloglog) noconstant irls vce(robust) 203 | local slope = r(table)[1,1] 204 | local slope_se = r(table)[2,1] 205 | local val_size = e(N) 206 | post `slope_EP3regionnnet' (`slope') (`slope_se') (`val_size') 207 | } 208 | 209 | postclose `slope_EP3regionnnet' 210 | 211 | **************************************** 212 | * GT IECV for calibration-in-the-large * // CITL is the intercept term when constraining the slope to 1 213 | 214 | capture postutil clear 215 | tempname citl_EP3regionnnet 216 | postfile `citl_EP3regionnnet' citl citl_se val_size using citl_EP3regionnnet.dta , replace 217 | 218 | forval x = 1(1)10 { 219 | mi estimate, dots: glm period2_pseudo cll_pred if sha1==`x', link(cloglog) noconstant irls vce(robust) offset(cll_pred) 220 | local citl = r(table)[1,1] 221 | local citl_se = r(table)[2,1] 222 | local val_size = e(N) 223 | post `citl_EP3regionnnet' (`citl') (`citl_se') (`val_size') 224 | } 225 | 226 | postclose `citl_EP3regionnnet' 227 | 228 | * Event/denominator counts for the meta-analysis plots below * 229 | tab sha1 _d if period==2 // Number of events in each region during Period 2, and the total in each region 230 | 231 | clear 232 | 233 | ***************** 234 | * Meta-analyses * 235 | ***************** 236 | 237 | ** Random-effects meta-analysis of pooled performance metrics ** 238 | use "\estimates\C_EP3regionnnet.dta", clear 239 | input str50 region 240 | "East Midlands (464/3,337)" 241 | "East of England (543/4,826)" 242 | "London (1,270/13,287)" 243 | "North East (387/2,784)" 244 | "North West (1,664/15,419)" 245 | "South Central (1,062/11,272)" 246 | "South East (848/8,666)" 247 | "South West (1,054/10,165)" 248 | "West Midlands (1,032/8,678)" 249 | "Yorkshire & Humber (484/3,946)" 250 | end 251 | meta set beta st_err, studylab(region) eslabel("Harrell's C") 252 | meta summarize, random(sjonkman) se(khartung) predinterval(95) 253 | meta forestplot, random(sjonkman) se(khartung) predinterval(95) 254 | graph save "Graph" "\graphs\EP3_harrellsC_nnet.gph", replace 255 | clear 256 | 257 | use "\estimates\slope_EP3regionnnet.dta" , clear 258 | input str50 region 259 | "East Midlands (464/3,337)" 260 | "East of England (543/4,826)" 261 | "London (1,270/13,287)" 262 | "North East (387/2,784)" 263 | "North West (1,664/15,419)" 264 | "South Central (1,062/11,272)" 265 | "South East (848/8,666)" 266 | "South West (1,054/10,165)" 267 | "West Midlands (1,032/8,678)" 268 | "Yorkshire & Humber (484/3,946)" 269 | end 270 | meta set slope slope_se, studylab(region) eslabel("Calibration slope") 271 | meta summarize, random(sjonkman) se(khartung) predinterval(95) 272 | meta forestplot, random(sjonkman) se(khartung) predinterval(95) xline(1) 273 | graph save "Graph" "\graphs\EP3_slope_nnet.gph", replace 274 | clear 275 | 276 | use "\estimates\citl_EP3regionnnet.dta" , clear 277 | input str50 region 278 | "East Midlands (464/3,337)" 279 | "East of England (543/4,826)" 280 | "London (1,270/13,287)" 281 | "North East (387/2,784)" 282 | "North West (1,664/15,419)" 283 | "South Central (1,062/11,272)" 284 | "South East (848/8,666)" 285 | "South West (1,054/10,165)" 286 | "West Midlands (1,032/8,678)" 287 | "Yorkshire & Humber (484/3,946)" 288 | end 289 | meta set citl citl_se, studylab(region) eslabel("Calibration-in-the-large") 290 | meta summarize, random(sjonkman) se(khartung)
predinterval(95) 291 | meta forestplot, random(sjonkman) se(khartung) predinterval(95) xline(0) 292 | graph save "Graph" "\graphs\EP3_citl_nnet.gph", replace 293 | clear 294 | 295 | ******************************************************************************** 296 | ******************************************************************************** 297 | 298 | * END FOR NEURAL NETWORK EVALUATION * 299 | 300 | 301 | ******************************************************************************** 302 | ******************************************************************************** 303 | 304 | ****************** 305 | * XGBoost Models * 306 | ****************** 307 | 308 | ********************************************************************* 309 | ** Performance of XGBoost based on 'pooled predictions' and IECV ** 310 | ********************************************************************* 311 | * The initial processing steps below are the same as those above, applied to the XGBoost model * 312 | 313 | * Open saved csv file from R * 314 | import delimited "\estimates\ML_IECV_predictions\endpoint3_xgboost_iecv_predictions.csv", asdouble 315 | drop v1 316 | 317 | * Destring the predictions, and rename variables for congeniality with the Stata files * 318 | destring xgboost_iecv_predictions, dpcomma replace 319 | summ xgboost_iecv_predictions, det 320 | rename xgboost_iecv_predictions predictions 321 | rename data_period2patid patid 322 | bys patid: gen m = _n 323 | 324 | * Clip the predictions to be between 0 and 1 - they are probabilities * 325 | count if predictions<0 326 | count if predictions>1 327 | replace predictions = 0.0000001 if predictions<0 // Minuscule risk rather than zero - we use the cloglog transform later! 328 | replace predictions = 0.9999999 if predictions>1 // Just under 1, rather than guaranteed risk 329 | 330 | * Save this as an interim dataset * 331 | save "\estimates\ML_IECV_predictions\endpoint3_xgboost_iecv_predictions_formatted.dta", replace 332 |
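* The reason for the clipping above (standard numerics, stated for
* clarity): the complementary log-log transform used in the calibration
* models, cloglog(p) = ln(-ln(1-p)), is only finite for 0 < p < 1, so
* out-of-range predictions are nudged just inside the interval:
display log(-log(1-0.0000001)) // approx -16.1: extreme but finite
display log(-log(1-0.9999999)) // approx +2.8: extreme but finite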
333 | * We want to perform model evaluation in accordance with Rubin's rules on 334 | * multiply imputed datasets. For this, Stata 'needs' an m=0 (i.e. complete case) 335 | * copy, and then the separate imputations. We need to trick Stata into this 336 | * First, keep the first imputation, and declare it to be the complete case, m=0 337 | keep if m==1 338 | replace m=0 339 | 340 | * Generate an mi-style missingness indicator to flag the predictions as missing, so that 341 | * later we can use Stata's mi machinery to handle the 50 predictions for each 342 | * patid as separate imputations' predictions 343 | gen i_predictions = 1 344 | tab m 345 | replace predictions = . if i_predictions==1 346 | 347 | * Now that the predictions in m=0 are missing, append the dataset with the 50 348 | * true values of the predictions, 1 for each patid in each imputation 349 | append using "\estimates\ML_IECV_predictions\endpoint3_xgboost_iecv_predictions_formatted.dta" 350 | 351 | * Save interim data file 352 | save "\estimates\ML_IECV_predictions\endpoint3_xgboost_iecv_predictions_formatted.dta", replace 353 | 354 | * Format as a multiply imputed dataset - now, m=0 has no predictions, whereas 355 | * m=1 to m=50 each hold their respective predictions 356 | mi import flong, m(m) id(patid) imputed(predictions) 357 | 358 | * Convert to wide style for the convenience of dataset size, and for merging later 359 | mi convert wide 360 | mi describe 361 | 362 | * Save interim datafile after mi conversion * 363 | save "\estimates\ML_IECV_predictions\endpoint3_xgboost_iecv_predictions_formatted.dta", replace 364 | 365 | * For the smoothed calibration plot, we want the average prediction, so that 366 | * this can be plotted against the observed pseudovalues * 367 | egen mean_prediction = rowmean(_1_predictions - _50_predictions) 368 | 369 | * Save and clear * 370 | save "\estimates\ML_IECV_predictions\endpoint3_xgboost_iecv_predictions_formatted.dta", replace 371 | clear 372 | 373 | * Open up the key dataset - this contains the pseudovalues, period 1, period 2, etc. * 374 | use "\final_datasets\OX129_endpoint3_pseudovaluescif.dta", clear 375 | keep if period==2 376 | 377 | * Merge in the formatted predictions from the IECV process for the XGBoost model * 378 | mi merge 1:1 patid using "\estimates\ML_IECV_predictions\endpoint3_xgboost_iecv_predictions_formatted.dta" 379 | 380 | 381 | ************************ 382 | ** Pooled predictions ** 383 | ************************ 384 | 385 | * Plot the smoothed calibration plot - save out * 386 | running period2_pseudo mean_prediction if _mi_m==1, ci leg(off) nopts xscale(range(0 1)) yscale(range(0 1)) aspect(1) /// 387 | title("Smoothed calibration plot - XGBoost", size(small)) xtitle("Predicted event probability", size(small)) /// 388 | ytitle("Observed event probability", size(small)) addplot(function y=x) graphregion(color(white)) ylabel(0(.2)1) xlabel(0(.2)1) 389 | 390 | graph save "\graphs\ep3_competing\calibration_xgboost_iecv.gph", replace 391 | 392 | * Calculate the cloglog transform * 393 | mi passive: gen cll_pred = log(-log(1-predictions)) 394 | 395 | * Calibration of XGBoost - pooled predictions * 396 | mi estimate, dots: glm period2_pseudo cll_pred, link(cloglog) vce(robust) noconstant irls 397 | mi estimate, dots: glm period2_pseudo cll_pred, link(cloglog) vce(robust) noconstant irls offset(cll_pred) 398 | 399 | * Discrimination - pooled predictions * 400 | mi register regular ipcw 401 | mi stset fu_end, origin(fu_start) failure(iecv_event==1) scale(365.25) exit(time fu_start+3652.5) 402 | mi passive: gen hr = exp(predictions) 403 | mi passive: gen invhr = 1/hr 404 | gen censind = 1 - _d if _st==1 405 | mi estimate, dots cmdok: somersd _t invhr if (_st==1) [iweight=ipcw], cenind(censind) tdist transf(c) 406 | 407 | 408 | **************************************************************** 409 | ** Pooled predictions: heterogeneity by ethnicity and age grp ** 410 | **************************************************************** 411 | 412 | * Discrimination in different ethnic groups * 413 | forval x = 1(1)9 { 414 | mi estimate, dots esampvaryok: somersd _t invhr if (_st==1 & ethriskid==`x') [iweight=ipcw], cenind(censind) tdist transf(c)
415 | } 416 | 417 | * Calibration slope and calibration-in-the-large * 418 | forval x = 1(1)9 { 419 | mi estimate, dots esampvaryok: glm period2_pseudo cll_pred if ethriskid==`x', link(cloglog) vce(robust) noconstant irls 420 | } 421 | 422 | forval x = 1(1)9 { 423 | mi estimate, dots esampvaryok: glm period2_pseudo cll_pred if ethriskid==`x', link(cloglog) vce(robust) noconstant irls offset(cll_pred) 424 | } 425 | 426 | 427 | ************** 428 | * Age groups * 429 | ************** 430 | gen agegroup = 1 431 | replace agegroup = 2 if age_at_diagnosis>=30 432 | replace agegroup = 3 if age_at_diagnosis>=40 433 | replace agegroup = 4 if age_at_diagnosis>=50 434 | replace agegroup = 5 if age_at_diagnosis>=60 435 | replace agegroup = 6 if age_at_diagnosis>=70 436 | replace agegroup = 7 if age_at_diagnosis>=80 437 | tab agegroup 438 | 439 | * Discrimination and calibration in different age bands * 440 | forval x = 1(1)7 { 441 | mi estimate, dots: somersd _t invhr if (_st==1 & agegroup==`x') [iweight=ipcw], cenind(censind) tdist transf(c) 442 | } 443 | 444 | * Calibration slope * 445 | forval x = 1(1)7 { 446 | mi estimate, dots: glm period2_pseudo cll_pred if agegroup==`x', link(cloglog) noconstant irls vce(robust) 447 | } 448 | 449 | * Calibration in the large * 450 | forval x = 1(1)7 { 451 | mi estimate, dots: glm period2_pseudo cll_pred if agegroup==`x', link(cloglog) noconstant irls vce(robust) offset(cll_pred) 452 | } 453 | 454 | 455 | ************************************************ 456 | ** IECV predictions and performance estimates ** 457 | ************************************************ 458 | 459 | ** Numbers of events and denominators for the forest plots ** 460 | * Estimated using neural network evaluation steps above * 461 | 462 | ************************************************** 463 | ** Region-level heterogeneity and results ** 464 | ************************************************** 465 | cd "\estimates\" 466 | 467 | *************************** 468 | * GT IECV for Harrell's C * 469 | 470 | capture postutil clear 471 | tempname C_EP3regionxgboost 472 | postfile `C_EP3regionxgboost' beta st_err val_size using C_EP3regionxgboost.dta , replace 473 | 474 | forval x = 1(1)10 { 475 | mi estimate, dots: somersd _t invhr if (_st==1 & sha1==`x') [iweight=ipcw], cenind(censind) tdist transf(c) 476 | local beta = r(table)[1,1] 477 | local st_err = r(table)[2,1] 478 | local val_size = e(N) 479 | post `C_EP3regionxgboost' (`beta') (`st_err') (`val_size') 480 | } 481 | 482 | postclose `C_EP3regionxgboost' 483 | 484 | ********************************* 485 | * GT IECV for calibration slope * 486 | 487 | capture postutil clear 488 | tempname slope_EP3regionxgboost 489 | postfile `slope_EP3regionxgboost' slope slope_se val_size using slope_EP3regionxgboost.dta , replace 490 | 491 | forval x = 1(1)10 { 492 | mi estimate, dots: glm period2_pseudo cll_pred if sha1==`x', link(cloglog) vce(robust) irls noconstant 493 | local slope = r(table)[1,1] 494 | local slope_se = r(table)[2,1] 495 | local val_size = e(N) 496 | post `slope_EP3regionxgboost' (`slope') (`slope_se') (`val_size') 497 | } 498 | 499 | postclose `slope_EP3regionxgboost' 500 | 501 | **************************************** 502 | * GT IECV for calibration-in-the-large * 503 | 504 | capture postutil clear 505 | tempname citl_EP3regionxgboost 506 | postfile `citl_EP3regionxgboost' citl citl_se val_size using citl_EP3regionxgboost.dta , replace 507 | 508 | forval x = 1(1)10 { 509 | mi estimate, dots: glm period2_pseudo cll_pred if sha1==`x', 
link(cloglog) vce(robust) irls noconstant offset(cll_pred)
510 | local citl = r(table)[1,1]
511 | local citl_se = r(table)[2,1]
512 | local val_size = e(N)
513 | post `citl_EP3regionxgboost' (`citl') (`citl_se') (`val_size')
514 | }
515 | 
516 | postclose `citl_EP3regionxgboost'
517 | 
518 | 
519 | clear
520 | 
521 | *****************
522 | * Meta-analyses *
523 | *****************
524 | 
525 | ** Random effects meta-analysis pooled performance metrics *
526 | use "\estimates\C_EP3regionxgboost.dta", clear
527 | input str50 region
528 | "East Midlands (464/3,337)"
529 | "East of England (543/4,826)"
530 | "London (1,270/13,287)"
531 | "North East (387/2,784)"
532 | "North West (1,664/15,419)"
533 | "South Central (1,062/11,272)"
534 | "South East (848/8,666)"
535 | "South West (1,054/10,165)"
536 | "West Midlands (1,032/8,678)"
537 | "Yorkshire & Humber (484/3,946)"
538 | end
539 | meta set beta st_err, studylab(region) eslabel("Harrell's C")
540 | meta summarize, random(sjonkman) se(khartung) predinterval(95)
541 | meta forestplot, random(sjonkman) se(khartung) predinterval(95)
542 | graph save "Graph" "\graphs\EP3_harrellsC_xgboost.gph", replace
543 | clear
544 | 
545 | use "\estimates\slope_EP3regionxgboost.dta" , clear
546 | input str50 region
547 | "East Midlands (464/3,337)"
548 | "East of England (543/4,826)"
549 | "London (1,270/13,287)"
550 | "North East (387/2,784)"
551 | "North West (1,664/15,419)"
552 | "South Central (1,062/11,272)"
553 | "South East (848/8,666)"
554 | "South West (1,054/10,165)"
555 | "West Midlands (1,032/8,678)"
556 | "Yorkshire & Humber (484/3,946)"
557 | end
558 | meta set slope slope_se, studylab(region) eslabel("Calibration slope")
559 | meta summarize, random(sjonkman) se(khartung) predinterval(95)
560 | meta forestplot, random(sjonkman) se(khartung) predinterval(95) xline(1)
561 | graph save "Graph" "\graphs\EP3_slope_xgboost.gph", replace
562 | clear
563 | 
564 | use "\estimates\citl_EP3regionxgboost.dta" , clear
565 | input str50 region
566 | "East Midlands (464/3,337)"
567 | "East of England (543/4,826)"
568 | "London (1,270/13,287)"
569 | "North East (387/2,784)"
570 | "North West (1,664/15,419)"
571 | "South Central (1,062/11,272)"
572 | "South East (848/8,666)"
573 | "South West (1,054/10,165)"
574 | "West Midlands (1,032/8,678)"
575 | "Yorkshire & Humber (484/3,946)"
576 | end
577 | meta set citl citl_se, studylab(region) eslabel("Calibration-in-the-large")
578 | meta summarize, random(sjonkman) se(khartung) predinterval(95)
579 | meta forestplot, random(sjonkman) se(khartung) predinterval(95) xline(0)
580 | graph save "Graph" "\graphs\EP3_citl_xgboost.gph", replace
581 | clear
582 | 
583 | ********************************************************************************
584 | ********************************************************************************
585 | 
586 | 
587 | 
588 | *********************************
589 | * META-REGRESSION FOR ML MODELS *
590 | *********************************
591 | 
592 | * Use the pre-made datasets for region-level performance metrics, and the region-level summary demographic characteristics data file *
593 | 
594 | * XGBoost *
595 | 
596 | ***************
597 | * Harrell's C *
598 | use "\estimates\C_EP3regionxgboost.dta"
599 | gen sha1 = _n
600 | merge 1:1 sha1 using "\final_datasets\OX129_regionalperiod2_EP3.dta"
601 | 
602 | meta set beta st_err, studylab(sha1)
603 | 
604 | meta regress age_at_diagnosis diagnosis_bmi town_int nonwhite, random(sjonkman) se(khartung)
605 | meta regress age1 baseline_bmi1 town_int
nonwhite, random(sjonkman) se(khartung)
606 | 
607 | meta regress age1, random(sjonkman)
608 | estat bubbleplot
609 | 
610 | meta regress town_int, random(sjonkman)
611 | estat bubbleplot
612 | 
613 | meta regress nonwhite, random(sjonkman)
614 | estat bubbleplot
615 | 
616 | meta regress baseline_bmi1, random(sjonkman)
617 | estat bubbleplot
618 | clear
619 | 
620 | *********************
621 | * Calibration slope *
622 | use "\estimates\slope_EP3regionxgboost.dta"
623 | gen sha1 = _n
624 | merge 1:1 sha1 using "\final_datasets\OX129_regionalperiod2_EP3.dta"
625 | 
626 | meta set slope slope_se, studylab(sha1)
627 | 
628 | meta regress age_at_diagnosis diagnosis_bmi town_int nonwhite, random(sjonkman) se(khartung)
629 | meta regress age1 baseline_bmi1 town_int nonwhite, random(sjonkman) se(khartung)
630 | 
631 | meta regress age1, random(sjonkman)
632 | estat bubbleplot
633 | 
634 | meta regress town_int, random(sjonkman)
635 | estat bubbleplot
636 | 
637 | meta regress nonwhite, random(sjonkman)
638 | estat bubbleplot
639 | 
640 | meta regress baseline_bmi1, random(sjonkman)
641 | estat bubbleplot
642 | clear
643 | 
644 | 
645 | 
646 | * Neural networks *
647 | 
648 | ***************
649 | * Harrell's C *
650 | use "\estimates\C_EP3regionnnet.dta"
651 | gen sha1 = _n
652 | merge 1:1 sha1 using "\final_datasets\OX129_regionalperiod2_EP3.dta"
653 | 
654 | meta set beta st_err, studylab(sha1)
655 | 
656 | meta regress age_at_diagnosis diagnosis_bmi town_int nonwhite, random(sjonkman) se(khartung)
657 | meta regress age1 baseline_bmi1 town_int nonwhite, random(sjonkman) se(khartung)
658 | 
659 | meta regress age1, random(sjonkman)
660 | estat bubbleplot
661 | 
662 | meta regress town_int, random(sjonkman)
663 | estat bubbleplot
664 | 
665 | meta regress nonwhite, random(sjonkman)
666 | estat bubbleplot
667 | 
668 | meta regress baseline_bmi1, random(sjonkman)
669 | estat bubbleplot
670 | clear
671 | 
672 | *********************
673 | * Calibration slope *
674 | use "\estimates\slope_EP3regionnnet.dta"
675 | gen sha1 = _n
676 | merge 1:1 sha1 using "\final_datasets\OX129_regionalperiod2_EP3.dta"
677 | 
678 | meta set slope slope_se, studylab(sha1)
679 | 
680 | meta regress age_at_diagnosis diagnosis_bmi town_int nonwhite, random(sjonkman) se(khartung)
681 | meta regress age1 baseline_bmi1 town_int nonwhite, random(sjonkman) se(khartung)
682 | 
683 | meta regress age1, random(sjonkman)
684 | estat bubbleplot
685 | 
686 | meta regress town_int, random(sjonkman)
687 | estat bubbleplot
688 | 
689 | meta regress nonwhite, random(sjonkman)
690 | estat bubbleplot
691 | 
692 | meta regress baseline_bmi1, random(sjonkman)
693 | estat bubbleplot
694 | clear
695 | 
696 | ********************************************************************************
697 | ********************************************************************************
698 | 
699 | ** END **
700 | 
701 | 
--------------------------------------------------------------------------------
/3_OX129_Competing_risks_regression:
--------------------------------------------------------------------------------
1 | ## Competing risks regression modelling ##
2 | 
3 | 
4 | ## Multiple imputation has already been carried out in accordance with principles
5 | ## for Cox models (e.g.
including the endpoint indicator and the Aalen-Johansen cumulative hazard function) ##
6 | 
7 | ## Strategy here is, using the same set of 50 imputations, to identify optimal FP terms
8 | ## for age/BMI/Townsend score for a pseudo-observations-based competing risks regression model,
9 | ## then conduct the model fitting and IECV thereafter ##
10 | 
11 | 
12 | ********************************************************************************
13 | * 1) Fractional polynomials, predictor selection *
14 | ********************************************************************************
15 | 
16 | * As candidate predictors may have different associations with the outcome in a
17 | * competing risks, pseudovalues-based model, we start with the 'raw data' and
18 | * repeat the processing steps to:
19 | * Find the optimal FP terms for age, BMI, Townsend
20 | 
21 | * Start with the original dataset (complete case, not yet imputed) *
22 | use "\final_datasets\OX129_endpoint3.dta", clear
23 | count
24 | 
25 | * Stset the data, estimate pseudovalues for the cumulative incidence function *
26 | * exit3status = contains all cause death (use to define competing event)
27 | tab d_breastcancer // 1 = breast cancer-related death, 0 = no
28 | tab exit3status // 1 = death from any cause, 0 = censored/left alive
29 | la de endpoints 0 "Censored" 1 "Breast cancer death" 2 "Competing death"
30 | gen endpoint = 0
31 | replace endpoint = 1 if d_breastca==1 // Died, and breast cancer was on death certificate
32 | replace endpoint = 2 if exit3status==1 & d_breastca!=1 // Died, but not from breast cancer
33 | la val endpoint endpoints
34 | tab endpoint
35 | 
36 | * Make time variable - unit is years *
37 | gen time = exit3date - outcomedate6 // Date of exit/death, minus date of breast cancer diagnosis
38 | replace time = time/365.25
39 | summ time, det
40 | 
41 | * Stset the data - unrestricted timeframe. Will use this to define competing
42 | * events, breast cancer deaths, and compute the CIF pseudovalues at 10 years
43 | stset time, failure(endpoint==1) exit(time 10) // stpci command for pseudovalues automatically assumes
44 | // the other category of endpoint is the competing one
45 | cd "\ado\plus\"
46 | stpci, at(10) // estimate jack-knife pseudovalues for the Aalen-Johansen CIF at 10 years
47 | summ pseudo, det
48 | 
49 | * Fractional polynomials for age, BMI at diagnosis, and Townsend score *
50 | mfp, all: glm pseudo age_at_diagnosis diagnosis_bmi town_int // In complete case data, as was done for Cox model
51 | // Sometimes there is non-convergence with the cloglog - use the linear version for FPs
52 | 
53 | * Review FP terms returned - age and BMI have non-linear terms, Townsend = 1 *
54 | * Age = 1, 2 (different from the Cox model)
55 | * BMI at diagnosis = -2, -2 (same as Cox model)
56 | * Townsend = 1, so keep linear.
Let's 'undo' the scaling, and drop Itown
57 | 
58 | * Plotting FP terms for the competing risks models *
59 | glm pseudo Iage*, link(cloglog) irls // Model on cloglog scale, iteratively reweighted least squares for maximisation
60 | predict age_fp, xb // predict model linear predictor
61 | replace age_fp = 1 - exp(-exp(age_fp)) // complementary log-log transform of linear predictor = probability
62 | twoway fpfitci age_fp age_at_diagnosis, ytitle("Predicted event probability") xtitle("Age at diagnosis")
63 | graph save "Graph" "\graphs\ep3_competing\FP_terms_age.gph", replace
64 | 
65 | glm pseudo Idiag*, link(cloglog) irls // Repeat this, but for BMI with its FP terms
66 | predict bmi_fp, xb
67 | replace bmi_fp = 1 - exp(-exp(bmi_fp))
68 | twoway fpfitci bmi_fp diagnosis_bmi, ytitle("Predicted event probability") xtitle("Body mass index at diagnosis")
69 | graph save "Graph" "\graphs\ep3_competing\FP_terms_bmi.gph", replace
70 | 
71 | 
72 | * Now that we know the functional forms, and that they are very similar (one FP term
73 | * for age is different, and easily coded), we can generate this new term in the
74 | * imputed dataset that we have previously generated.
75 | 
76 | * Close this dataset, then move on to core dataset for endpoint 3 *
77 | clear
78 | 
79 | ********************************************************************************
80 | 
81 | use "\final_datasets\OX129_endpoint3_imputed.dta", clear // Same set of 50 imputations used for Cox models
82 | count
83 | 
84 | * Generate the new FP term for age - we will have FP terms (1,2) for CR model *
85 | * Detail on scale, etc. from mfp command above *
86 | gen double age_fp1 = (age_at_diagnosis/10) - 6.301833523
87 | gen double age_fp2 = (age_at_diagnosis/10)^2 - 39.71310575
88 | summ age_fp1, det
89 | summ age_fp2, det
90 | 
91 | * Generate the BMI terms *
92 | mi passive: gen double bmi_x = bmi/10
93 | mi passive: gen double bmi_fp1 = (bmi_x^-2) - 0.1353749689
94 | mi passive: gen double bmi_fp2 = (bmi_x^-2)*ln(bmi_x) - 0.1353551232
95 | 
96 | * Stset the data, estimate pseudovalues for the cumulative incidence function *
97 | * exit3status = contains all cause death (use to define competing event)
98 | tab d_breastcancer
99 | tab exit3status
100 | la de endpoints 0 "Censored" 1 "Breast cancer death" 2 "Competing death"
101 | gen endpoint = 0
102 | replace endpoint = 1 if d_breastca==1 // Same as above
103 | replace endpoint = 2 if exit3status==1 & d_breastca!=1
104 | la val endpoint endpoints
105 | tab endpoint
106 | 
107 | * Make time variable *
108 | gen time = exit3date - outcomedate6
109 | replace time = time/365.25
110 | 
111 | * Stset the data - unrestricted timeframe.
Will use this to define competing
112 | * events, breast cancer deaths, and compute the CIF pseudovalues at 10 years
113 | mi stset time, failure(endpoint==1) exit(time 10)
114 | 
115 | cd "\ado\plus\"
116 | stpci, at(10) // Aalen-Johansen cumulative incidence function at 10 years
117 | summ pseudo, det
118 | 
119 | * Perform predictor selection *
120 | * Fit the 'full' model - generalised linear model with cloglog link function to approximate Fine-Gray; iteratively reweighted least squares for maximisation
121 | * Robust standard errors in view of non-independence of pseudovalues *
122 | 
123 | global covariates_crr = "age_fp* bmi_fp* townsend i.ethriskid i.alcohol_cat i.smoke_cat ib6.cancer_route ib2.pr_status ib2.her2_status //
124 | i.fh_breastca i.radio i.chemo i.mastectomy i.other_surg i.cancer_stage i.cancer_grade i.ckd i.cirrhosis i.htn i.ihd i.t1dm i.t2dm i.vasculitis //
125 | i.lupus i.thiazide i.betablocker i.acei i.raa i.tca i.ssri i.maoi i.otherantidepress i.ocp i.hrt i.anti_psycho c.age_fp1##i.chemo c.age_fp2##i.chemo //
126 | c.age_fp1##i.mastec c.age_fp2##i.mastec c.age_fp1##i.other_surg c.age_fp2##i.other_surg c.age_fp1##i.fh_breastca c.age_fp2##i.fh_breastca i.ethriskid##c.age_fp1 //
127 | i.ethriskid##c.age_fp2 i.ethriskid##i.other_surg i.ethriskid##i.mastectomy i.ethriskid##i.chemotherapy i.ethriskid##i.radiotherapy"
128 | 
129 | timer on 1
130 | mi estimate, dots eform: glm pseudo $covariates_crr, link(cloglog) vce(robust) irls
131 | timer off 1
132 | timer list
133 | timer clear
134 | 
135 | * Save exponentiated coefficients for pre-selection model *
136 | cd "\models\Endpoint_3\"
137 | parmest, list(,) saving(CR_endpoint3_parmest_preselection.dta, replace) label eform for(estimate min95 max95)
138 | 
139 | * Selected covariates - SHR<0.9 or >1.1, with p<0.01; interactions if p<0.01 *
140 | * For categories, if >1 of the sub-groups is sig, and (if relevant) a 'dose' trend is seen, retain variable *
141 | global select_covariates_crr = "age_fp* bmi_fp* i.smoke_cat ib6.cancer_route ib2.pr_status ib2.her2_status ib2.er_status i.radio i.mastectomy i.other_surg i.cancer_stage i.cancer_grade i.hrt c.age_fp1##i.other_surg"
142 | 
143 | * Fit the final model *
144 | mi estimate, dots eform: glm pseudo $select_covariates_crr, link(cloglog) vce(robust) irls
145 | 
146 | * Save coefficients and constant term for later plotting/use *
147 | parmest, list(,) saving(CR_endpoint3_parmest.dta, replace) label eform for(estimate min95 max95) // Save the model as 'SHRs'
148 | parmest, list(,) saving(CR_endpoint3_parmest_coefs.dta, replace) label for(estimate min95 max95) // Save the model as coefficients
149 | 
150 | 
151 | ********************************************************************************
152 | * 2) Estimate IPCWs *
153 | ********************************************************************************
154 | 
155 | * Use Nelson-Aalen cumulative hazard - using same covariates as final model *
156 | * Same covariates used as those selected for the prediction model *
157 | mi stset time if period==2, failure(endpoint==0) exit(time 10) // Code inspired by ado file for Stata command 'stbrier'
158 | sts gen basechazard = na
159 | gen base_ipcw = .
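* (Added note, derived from the code below: how the IPCW is assembled) *
* 1) basechazard = Nelson-Aalen cumulative hazard for *censoring* at 10 years - censoring is the 'event' in the stset above *
* 2) xbC = linear predictor from a Cox model fitted to the censoring times *
* 3) prob_cens = exp(-basechazard)^exp(xbC) = probability of remaining uncensored to 10 years, i.e. S_cens(10|x) *
* 4) ipcw = 1/prob_cens - individuals who resemble those censored early are up-weighted when somersd is run *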
160 | summ basechazard if _t==10 // Nelson-Aalen cumulative hazard for censoring at 10 years
161 | replace base_ipcw = r(mean) if period==2
162 | 
163 | cd "\models\Endpoint_3\"
164 | mi estimate, dots saving(ipcw, replace): stcox $select_covariates_crr if period==2, estimate efron // Fit model for censoring times, rather than death times
165 | mi predict xbC using ipcw if period==2, xb // Linear predictor (xb) from this censoring model
166 | summ xbC, det
167 | 
168 | gen prob_cens = (exp(-base_ipcw)^exp(xbC)) // Combine baseline 'hazard' with linear predictor for censoring
169 | summ prob_cens, det
170 | gen ipcw = 1/prob_cens // Inverse probability of censoring weight (1 / probability of remaining uncensored)
171 | summ ipcw, det
172 | 
173 | * Ensure data adequately re-stset - endpoint is breast cancer death *
174 | mi stset time, failure(endpoint==1) exit(time 10) // Re-stset so that we go back to dealing with deaths
175 | 
176 | 
177 | save "\final_datasets\OX129_endpoint3_pseudovaluescif.dta", replace // Save as new, standalone dataset
178 | clear
179 | 
180 | ********************************************************************************
181 | ********************************************************************************
182 | 
183 | 
184 | 
185 | ********************************************************************************
186 | * 3) Internal-external cross-validation run *
187 | ********************************************************************************
188 | 
189 | **********************
190 | * Preparing for IECV *
191 | **********************
192 | 
193 | * Because we use pseudovalues for the endpoint, and need separate time periods
194 | * for IECV, we will use two 'sub-datasets' - one for period 1, another for
195 | * period 2.
196 | * Period 1 data will be used to fit the model; linear predictors and probabilities will be
197 | * calculated in the iteratively held-out Period 2 samples *
198 | 
199 | use "\final_datasets\OX129_endpoint3_pseudovaluescif.dta", clear // The competing risks modelling dataset made above
200 | drop fu_start fu_end follow_up time // Keep clean so that we can be sure on dates/times
201 | 
202 | * Need to generate a measure of FU time from entry to earliest of: event/competing event/censoring/end of period 1/end of study *
203 | * Outcomedate6 = date of breast cancer diagnosis *
204 | gen fu_start = outcomedate6
205 | format fu_start %td
206 | gen fu_end = .
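* (Added note - a worked example of the period handling below, using illustrative dates only) *
* Assuming 'period' indexes the diagnosis era (period 1 = diagnosed before the 01jan2010 timesplit), a patient *
* diagnosed in 2008 who dies in 2012 has fu_end capped at the timesplit, so her death is not a period-1 event; *
* a patient diagnosed in 2011 sits in period 2 and is followed to her recorded exit date. *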
207 | 
208 | * Timesplit is the cut-off between the two periods *
209 | gen timesplit = mdy(01,01,2010)
210 | format timesplit %td
211 | tab period, m
212 | 
213 | * Follow-up must end at the timesplit if entered in period 1 *
214 | * Take the earliest of the timesplit, or other recorded exit date from cohort *
215 | replace fu_end = min(exit3date, timesplit) if period==1
216 | replace fu_end = exit3date if period==2
217 | format fu_end %td
218 | 
219 | * Simple calculation of follow-up in the IECV framework *
220 | gen follow_up = fu_end-fu_start
221 | summ follow_up, det
222 | replace follow_up = follow_up/365.25 // Scale is years
223 | summ follow_up, det
224 | 
225 | * Overall event counts, any time in study period *
226 | tab endpoint
227 | 
228 | * Generate new event indicator for IECV - event only within period *
229 | gen iecv_event = 0
230 | replace iecv_event = 1 if period==1 & endpoint==1 & exit3date<timesplit
374 | replace agegroup = 2 if age_at_diagnosis>=30
375 | replace agegroup = 3 if age_at_diagnosis>=40
376 | replace agegroup = 4 if age_at_diagnosis>=50
377 | replace agegroup = 5 if age_at_diagnosis>=60
378 | replace agegroup = 6 if age_at_diagnosis>=70
379 | replace agegroup = 7 if age_at_diagnosis>=80
380 | 
381 | forval x = 1(1)7 {
382 | mi estimate, cmdok dots esampvaryok: somersd _t invprob if (_st==1 & period==2 & agegroup==`x') [iweight=ipcw], cenind(censind) tdist transf(c)
383 | }
384 | 
385 | * Calibration slope *
386 | forval x = 1(1)7 {
387 | mi estimate, dots esampvaryok: glm period2_pseudo cll_pred if agegroup==`x', link(cloglog) vce(robust) noconstant irls
388 | }
389 | 
390 | * Calibration-in-the-large *
391 | forval x = 1(1)7 {
392 | mi estimate, dots esampvaryok: glm period2_pseudo cll_pred if agegroup==`x', link(cloglog) vce(robust) noconstant irls offset(cll_pred)
393 | }
394 | 
395 | ********************************************************************************
396 | ********************************************************************************
397 | 
398 | 
399 | 
400 | **************************************************
401 | ** 4) Region-level heterogeneity and results **
402 | **************************************************
403 | 
404 | cd "\estimates\"
405 | 
406 | ***************************
407 | * GT IECV for Harrell's C * // As for Cox models, run loop estimating performance for each region, saved as its own dataset, which is used later
408 | 
409 | capture postutil clear
410 | tempname C_EP3regioncompeting
411 | postfile `C_EP3regioncompeting' beta st_err val_size using C_EP3regioncompeting.dta , replace
412 | 
413 | forval x = 1(1)10 {
414 | mi estimate, cmdok dots: somersd _t invprob if (_st==1 & sha1==`x' & period==2) [iweight=ipcw], cenind(censind) tdist transf(c)
415 | local beta = r(table)[1,1]
416 | local st_err = r(table)[2,1]
417 | local val_size = e(N)
418 | post `C_EP3regioncompeting' (`beta') (`st_err') (`val_size') // beta = point estimate, st_err = standard error thereof
419 | }
420 | 
421 | postclose `C_EP3regioncompeting'
422 | 
423 | *********************************
424 | * GT IECV for calibration slope *
425 | 
426 | capture postutil clear
427 | tempname slope_EP3regioncompeting
428 | postfile `slope_EP3regioncompeting' slope slope_se val_size using slope_EP3regioncompeting.dta , replace // cloglog regression of observed pseudovalues on transformed predictions
429 | 
430 | forval x = 1(1)10 {
431 | mi estimate, dots: glm period2_pseudo cll_pred if (sha1==`x' & period==2), link(cloglog) vce(robust) irls noconstant
432 | local slope = r(table)[1,1]
433 | local slope_se = r(table)[2,1]
434 | local val_size = e(N)
435 | post
`slope_EP3regioncompeting' (`slope') (`slope_se') (`val_size') 436 | } 437 | 438 | postclose `slope_EP3regioncompeting' 439 | 440 | **************************************** 441 | * GT IECV for calibration-in-the-large * 442 | 443 | capture postutil clear 444 | tempname citl_EP3regioncompeting 445 | postfile `citl_EP3regioncompeting' citl citl_se val_size using citl_EP3regioncompeting.dta , replace // offset used to constrain slope to 1, take the intercept 446 | 447 | forval x = 1(1)10 { 448 | mi estimate, dots: glm period2_pseudo cll_pred if (sha1==`x' & period==2), link(cloglog) vce(robust) irls noconstant offset(cll_pred) 449 | local citl = r(table)[1,1] 450 | local citl_se = r(table)[2,1] 451 | local val_size = e(N) 452 | post `citl_EP3regioncompeting' (`citl') (`citl_se') (`val_size') 453 | } 454 | 455 | postclose `citl_EP3regioncompeting' 456 | 457 | save "\final_datasets\OX129_endpoint3_competingrisk_IECV.dta" , replace 458 | 459 | * numbers of events per region in data - for forest plots below * 460 | tab sha1 _d if period==2 & _mi_m==0 461 | 462 | clear 463 | 464 | ***************** 465 | * Meta-analyses * 466 | ***************** 467 | 468 | 469 | ** Random effects meta-analysis pooled performance metrics * 470 | use "\estimates\C_EP3regioncompeting.dta", clear 471 | input str60 region // Make new variable containing Region name, event counts, and total count for reference 472 | "East Midlands (464/3,337)" 473 | "East of England (543/4,826)" 474 | "London (1,270/13,287)" 475 | "North East (387/2,784)" 476 | "North West (1,664/15,419)" 477 | "South Central (1,062/11,272)" 478 | "South East (848/8,666)" 479 | "South West (1,054/10,165)" 480 | "West Midlands (1,032/8,678)" 481 | "Yorkshire & Humber (484/3,946)" 482 | end 483 | meta set beta st_err, studylab(region) eslabel("Harrell's C") 484 | meta summarize, random(sjonkman) se(khartung) predinterval(95) 485 | meta forestplot, random(sjonkman) se(khartung) predinterval(95) 486 | graph save "Graph" "\graphs\EP3_harrellsC_competing.gph", replace 487 | clear 488 | 489 | use "\estimates\slope_EP3regioncompeting.dta" , clear 490 | input str60 region 491 | "East Midlands (464/3,337)" 492 | "East of England (543/4,826)" 493 | "London (1,270/13,287)" 494 | "North East (387/2,784)" 495 | "North West (1,664/15,419)" 496 | "South Central (1,062/11,272)" 497 | "South East (848/8,666)" 498 | "South West (1,054/10,165)" 499 | "West Midlands (1,032/8,678)" 500 | "Yorkshire & Humber (484/3,946)" 501 | end 502 | meta set slope slope_se, studylab(region) eslabel("Calibration slope") 503 | meta summarize, random(sjonkman) se(khartung) predinterval(95) 504 | meta forestplot, random(sjonkman) se(khartung) predinterval(95) xline(1) 505 | graph save "Graph" "\graphs\EP3_slope_competing.gph", replace 506 | clear 507 | 508 | use "\estimates\citl_EP3regioncompeting.dta" , clear 509 | input str60 region 510 | "East Midlands (464/3,337)" 511 | "East of England (543/4,826)" 512 | "London (1,270/13,287)" 513 | "North East (387/2,784)" 514 | "North West (1,664/15,419)" 515 | "South Central (1,062/11,272)" 516 | "South East (848/8,666)" 517 | "South West (1,054/10,165)" 518 | "West Midlands (1,032/8,678)" 519 | "Yorkshire & Humber (484/3,946)" 520 | end 521 | meta set citl citl_se, studylab(region) eslabel("Calibration-in-the-large") 522 | meta summarize, random(sjonkman) se(khartung) predinterval(95) 523 | meta forestplot, random(sjonkman) se(khartung) predinterval(95) xline(0) 524 | graph save "Graph" "\graphs\EP3_citl_competing.gph", replace 525 | clear 526 | 
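* (Added note) The 'input strXX region' blocks above attach labels by row position, so they rely on the *
* postfile loops having posted one row per region in sha1 order (1-10). A minimal sanity check along these *
* lines could be run before labelling - an illustrative sketch, not part of the original pipeline: *
* use "\estimates\C_EP3regioncompeting.dta", clear
* assert _N==10 // one row per region, posted in sha1 order 1..10
* gen sha1 = _n // same convention as the meta-regression merge in section 5
* list sha1 beta st_err val_size // eyeball against the regional event counts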
527 | 
528 | log close
529 | 
530 | 
531 | ********************************************************************************
532 | ********************************************************************************
533 | 
534 | 
535 | ************************
536 | ** 5) Meta-regression **
537 | ************************
538 | 
539 | * Form collapsed dataset which contains the region-level factors (period 2 data) to be explored in meta-regression * // Same as for Cox models
540 | use "\final_datasets\OX129_endpoint3_imputed_IECV.dta" , clear // Repeated here for clarity
541 | mi convert wide
542 | keep if period==2
543 | gen nonwhite = 0
544 | replace nonwhite=1 if (ethriskid!=. & ethriskid>1)
545 | gen age1 = age_at_diagnosis
546 | gen baseline_bmi1 = diagnosis_bmi
547 | gen town_int1 = town_int
548 | collapse (mean)age_at_diagnosis diagnosis_bmi town_int (sd)age1 baseline_bmi1 town_int1 (percent)nonwhite, by(sha1)
549 | tab age_at_diagnosis
550 | tab diagnosis_bmi
551 | tab town_int
552 | tab age1
553 | tab baseline_bmi1
554 | tab town_int1
555 | tab nonwhite
556 | save "\final_datasets\OX129_regionalperiod2_EP3.dta", replace
557 | clear
558 | 
559 | **********************************************************************************
560 | * Examine effect of age, BMI, deprivation and non-white ethnicity on performance heterogeneity *
561 | **********************************************************************************
562 | 
563 | *****************
564 | ** Harrell's C **
565 | use "\estimates\C_EP3regioncompeting.dta"
566 | gen sha1 = _n
567 | merge 1:1 sha1 using "\final_datasets\OX129_regionalperiod2_EP3.dta"
568 | 
569 | meta set beta st_err, studylab(sha1)
570 | 
571 | meta regress age_at_diagnosis diagnosis_bmi town_int nonwhite, random(sjonkman) se(khartung)
572 | meta regress age1 baseline_bmi1 town_int nonwhite, random(sjonkman) se(khartung)
573 | 
574 | meta regress age_at_diagnosis, random(sjonkman)
575 | estat bubbleplot
576 | graph save "Graph" "\graphs\ep3_competing\bubbleplot_EP3_C_age_competing.gph", replace
577 | 
578 | meta regress age1, random(sjonkman)
579 | estat bubbleplot
580 | graph save "Graph" "\graphs\ep3_competing\bubbleplot_EP3_C_ageSD_competing.gph", replace
581 | 
582 | meta regress town_int, random(sjonkman)
583 | estat bubbleplot
584 | graph save "Graph" "\graphs\ep3_competing\bubbleplot_EP3_C_townsend_competing.gph", replace
585 | 
586 | meta regress nonwhite, random(sjonkman)
587 | estat bubbleplot
588 | graph save "Graph" "\graphs\ep3_competing\bubbleplot_EP3_C_ethnic_competing.gph", replace
589 | 
590 | meta regress diagnosis_bmi, random(sjonkman)
591 | estat bubbleplot
592 | graph save "Graph" "\graphs\ep3_competing\bubbleplot_EP3_C_bmi_competing.gph", replace
593 | 
594 | meta regress baseline_bmi1, random(sjonkman)
595 | estat bubbleplot
596 | graph save "Graph" "\graphs\ep3_competing\bubbleplot_EP3_C_bmiSD_competing.gph", replace
597 | 
598 | clear
599 | 
600 | 
601 | ***********************
602 | ** Calibration slope **
603 | use "\estimates\slope_EP3regioncompeting.dta"
604 | gen sha1 = _n
605 | merge 1:1 sha1 using "\final_datasets\OX129_regionalperiod2_EP3.dta"
606 | 
607 | meta set slope slope_se, studylab(sha1)
608 | 
609 | meta regress age_at_diagnosis diagnosis_bmi town_int nonwhite, random(sjonkman) se(khartung)
610 | meta regress age1 baseline_bmi1 town_int nonwhite, random(sjonkman) se(khartung)
611 | 
612 | meta regress age_at_diagnosis, random(sjonkman)
613 | estat bubbleplot
614 | graph save "Graph" "\graphs\ep3_competing\bubbleplot_EP3_slope_age_competing.gph", replace
615 | 
616 | meta regress age1, random(sjonkman)
617 | estat bubbleplot
618 | graph save "Graph" "\graphs\ep3_competing\bubbleplot_EP3_slope_ageSD_competing.gph", replace
619 | 
620 | meta regress town_int, random(sjonkman)
621 | estat bubbleplot
622 | graph save "Graph" "\graphs\ep3_competing\bubbleplot_EP3_slope_townsend_competing.gph", replace
623 | 
624 | meta regress nonwhite, random(sjonkman)
625 | estat bubbleplot
626 | graph save "Graph" "\graphs\ep3_competing\bubbleplot_EP3_slope_ethnic_competing.gph", replace
627 | 
628 | meta regress diagnosis_bmi, random(sjonkman)
629 | estat bubbleplot
630 | graph save "Graph" "\graphs\ep3_competing\bubbleplot_EP3_slope_bmi_competing.gph", replace
631 | 
632 | meta regress baseline_bmi1, random(sjonkman)
633 | estat bubbleplot
634 | graph save "Graph" "\graphs\ep3_competing\bubbleplot_EP3_slope_bmiSD_competing.gph", replace
635 | 
636 | clear
637 | 
638 | 
639 | ******************************
640 | ** Calibration in the large **
641 | use "\estimates\citl_EP3regioncompeting.dta"
642 | gen sha1 = _n
643 | merge 1:1 sha1 using "\final_datasets\OX129_regionalperiod2_EP3.dta"
644 | 
645 | meta set citl citl_se, studylab(sha1)
646 | 
647 | meta regress age_at_diagnosis diagnosis_bmi town_int nonwhite, random(sjonkman) se(khartung)
648 | meta regress age1 baseline_bmi1 town_int nonwhite, random(sjonkman) se(khartung)
649 | 
650 | meta regress age_at_diagnosis, random(sjonkman)
651 | estat bubbleplot
652 | graph save "Graph" "\graphs\ep3_competing\bubbleplot_EP3_citl_age_competing.gph", replace
653 | 
654 | meta regress age1, random(sjonkman)
655 | estat bubbleplot
656 | graph save "Graph" "\graphs\ep3_competing\bubbleplot_EP3_citl_ageSD_competing.gph", replace
657 | 
658 | meta regress town_int, random(sjonkman)
659 | estat bubbleplot
660 | graph save "Graph" "\graphs\ep3_competing\bubbleplot_EP3_citl_townsend_competing.gph", replace
661 | 
662 | meta regress nonwhite, random(sjonkman)
663 | estat bubbleplot
664 | graph save "Graph" "\graphs\ep3_competing\bubbleplot_EP3_citl_ethnic_competing.gph", replace
665 | 
666 | meta regress diagnosis_bmi, random(sjonkman)
667 | estat bubbleplot
668 | graph save "Graph" "\graphs\ep3_competing\bubbleplot_EP3_citl_bmi_competing.gph", replace
669 | 
670 | meta regress baseline_bmi1, random(sjonkman)
671 | estat bubbleplot
672 | graph save "Graph" "\graphs\ep3_competing\bubbleplot_EP3_citl_bmiSD_competing.gph", replace
673 | 
674 | clear
675 | 
676 | ********************************************************************************
677 | ********************************************************************************
678 | 
679 | ## END ##
680 | 
--------------------------------------------------------------------------------
/5_OX129_XGBoost:
--------------------------------------------------------------------------------
1 | ## XGBoost model fitting and evaluation with internal-external cross-validation ##
2 | 
3 | ## This is conducted in R, after data imputation, formatting, etc.
in Stata ##
4 | 
5 | ## This stacked imputations approach enables full use of data ##
6 | ## 'Full' data for entire cohort is used to fit the full model, with Bayesian optimisation used to
7 | ## identify optimal hyperparameters via 5-fold cross-validation ##
8 | 
9 | ############################################################################
10 | ############################################################################
11 | 
12 | ## MODEL FITTING ##
13 | 
14 | #############################
15 | ## Load in packages needed ##
16 | #############################
17 | library(survival)
18 | library(readr)
19 | library(stringr)
20 | library(haven)
21 | library(caret)
22 | library(ParBayesianOptimization)
23 | library(xgboost)
24 | 
25 | 
26 | ########################################################
27 | ## Check and set working directory for importing data ##
28 | ########################################################
29 | getwd()
30 | setwd("/final_datasets/ML/")
31 | 
32 | ##################
33 | ## Load in data ##
34 | ##################
35 | data <- read_dta("OX129_endpoint3_stacked50_pseudovalues.dta") ## Stacked dataset for entire cohort
36 | str(data)
37 | 
38 | #########################
39 | ## Variable formatting ##
40 | #########################
41 | ## Categorical variables need to be converted into dummy variables ##
42 | ## First, reformat relevant parameters as factors, then convert to dummies ##
43 | 
44 | data$smoke_cat <- factor(data$smoke_cat) # Smoking status
45 | data$cancer_route <- factor(data$cancer_route) # Route to diagnosis
46 | data$pr_status2 <- factor(data$pr_status2) # PR +/-
47 | data$her2_status2 <- factor(data$her2_status2); data$er_status2 <- factor(data$er_status2) # HER2 +/- ; ER +/- (er_status2 is needed for the dummies below)
48 | data$radiotherapy <- factor(data$radiotherapy) # Use of RTx in 1st year
49 | data$mastectomy <- factor(data$mastectomy) # Mastectomy in 1st year
50 | data$other_surgery <- factor(data$other_surgery) # Other surgery in 1st year
51 | data$cancer_stage <- factor(data$cancer_stage) # Stage (I-IV)
52 | data$cancer_grade <- factor(data$cancer_grade) # Grade (differentiation)
53 | data$hrt <- factor(data$hrt) # Hormone replacement therapy use
54 | data$sha1 <- factor(data$sha1) # Region - used for IECV
55 | 
56 | ## Now, start converting these factors to dummies ##
57 | ## Set up list of parameters that need dummies - names must match the factor variables above ##
58 | dummy_parameters <- c('smoke_cat', 'radiotherapy', 'mastectomy', 'other_surgery', 'hrt',
59 | 'cancer_stage', 'cancer_grade', 'pr_status2', 'er_status2',
60 | 'her2_status2', 'cancer_route')
61 | 
62 | ## The others will be handled as binary ##
63 | ## Generate the dummies ##
64 | dummies <- dummyVars(~ smoke_cat + radiotherapy + mastectomy +
65 | other_surgery + hrt + cancer_stage +
66 | cancer_grade + pr_status2 + her2_status2 + er_status2 +
67 | cancer_route, data = data)
68 | 
69 | ## Package the dummy variables into its own dataset, ready to bind with the ##
70 | ## other, non-dummy parameters ##
71 | dummied <- as.data.frame(predict(dummies, newdata=data))
72 | 
73 | ## Form the ready-to-process dataset by binding the new dummies with the other
74 | ## numeric parameters ##
75 | data_for_model <- cbind(data[, -c(which(colnames(data) %in% dummy_parameters))],
76 | dummied)
77 | 
78 | ## Continuous variables (age, BMI) - no scaling needed for XGB ##
79 | ## Name variables so that we can track these in variable importance plots, etc.
##
80 | colnames(data_for_model)[colnames(data_for_model) == 'smoke_cat.0'] <- 'Non_smoker'
81 | colnames(data_for_model)[colnames(data_for_model) == 'smoke_cat.1'] <- 'Ex_smoker'
82 | colnames(data_for_model)[colnames(data_for_model) == 'smoke_cat.2'] <- 'Light_smoker'
83 | colnames(data_for_model)[colnames(data_for_model) == 'smoke_cat.3'] <- 'Moderate_smoker'
84 | colnames(data_for_model)[colnames(data_for_model) == 'smoke_cat.4'] <- 'Heavy_smoker'
85 | colnames(data_for_model)[colnames(data_for_model) == 'radiotherapy.1'] <- 'Radiotherapy'
86 | colnames(data_for_model)[colnames(data_for_model) == 'mastectomy.1'] <- 'Mastectomy'
87 | colnames(data_for_model)[colnames(data_for_model) == 'other_surgery.1'] <- 'Other_surgery'
88 | colnames(data_for_model)[colnames(data_for_model) == 'hrt.1'] <- 'HRT_use'
89 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_stage.1'] <- 'Stage1'
90 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_stage.2'] <- 'Stage2'
91 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_stage.3'] <- 'Stage3'
92 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_stage.4'] <- 'Stage4'
93 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_grade.1'] <- 'Well_differentiated'
94 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_grade.2'] <- 'Moderately_differentiated'
95 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_grade.3'] <- 'Poorly_differentiated'
96 | colnames(data_for_model)[colnames(data_for_model) == 'pr_status2.3'] <- 'PR_positive'
97 | colnames(data_for_model)[colnames(data_for_model) == 'her2_status2.3'] <- 'HER2_positive'
98 | colnames(data_for_model)[colnames(data_for_model) == 'er_status2.3'] <- 'ER_positive'
99 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_route.2'] <- 'Emergency_presentation'
100 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_route.3'] <- 'GP_referral'
101 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_route.4'] <- 'Inpatient_elective'
102 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_route.5'] <- 'Other_outpatient_pathway'
103 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_route.6'] <- 'Screening_detected'
104 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_route.7'] <- 'Two_week_wait'
105 | 
106 | ## 'pseudo' contains the jack-knife pseudo-values for the Aalen-Johansen CIF at 10 years ##
107 | x_cols <- c('age_at_diagnosis', 'bmi', 'Non_smoker', 'Ex_smoker', 'Light_smoker',
108 | 'Moderate_smoker', 'Heavy_smoker', 'Radiotherapy', 'Mastectomy',
109 | 'Other_surgery', 'HRT_use', 'Stage1', 'Stage2', 'Stage3', 'Stage4', 'Well_differentiated',
110 | 'Moderately_differentiated', 'Poorly_differentiated', 'PR_positive',
111 | 'HER2_positive', 'ER_positive', 'Emergency_presentation', 'GP_referral', 'Inpatient_elective',
112 | 'Other_outpatient_pathway', 'Screening_detected', 'Two_week_wait')
113 | y_cols <- c('pseudo')
114 | 
115 | ## Clean up environment to avoid issues with memory ##
116 | rm(dummied)
117 | rm(data)
118 | 
119 | ########################
120 | ## Dataset formatting ##
121 | ########################
122 | 
123 | ## Change dataset to a matrix ##
124 | ## x_train = predictor parameters ##
125 | x_train <- as.matrix(data_for_model[, x_cols])
126 | 
127 | ## Labels = the target - the pseudovalues ##
128 | label_train <- as.matrix(data_for_model[, y_cols])
129 | weights <- data_for_model$imp_weight ## Stacked-imputation weights (1/50 per row) generated in the Stata data prep
130 | ## Form an XGB dense matrix for the development data - name it dtrain ##
131 | dtrain <-
xgb.DMatrix(data=x_train, label=label_train, weight=weights) ## Weight each row so the 50 stacked imputations together 'equal' one patient
132 | 
133 | 
134 | ###########################################
135 | ## Bayesian optimisation for full model ##
136 | ###########################################
137 | 
138 | ## Fit full model to entire available dataset - this is the model
139 | ## that will be evaluated using IECV ##
140 | ## We need to run cross-validation for hyperparameter tuning ##
141 | 
142 | ## The Bayesian optimisation package requires you to first specify the function
143 | ## one seeks to optimise. Here, it is to find the optimal hyperparameters for the
144 | ## XGBoost model such that the root mean squared error (between predicted and observed pseudovalues) is minimised ##
145 | 
146 | ## Here, we set up this 'scoring function' so that we find the best values of max_depth,
147 | ## eta, subsample, the regularisation terms, and the number of boosting rounds #
148 | ## In order to assess the performance of each combination, 5-fold cross-validation is used.
149 | ## We use the inbuilt xgb.cv function for this.
150 | ## 5-fold CV is used to estimate the 'average' performance of each hyperparameter selection. The smallest value of the RMSE
151 | ## is found from the evaluation log, and these values are used ##
152 | 
153 | scorefunction <- function(max_depth, eta, subsample, number, alpha, gamma,
154 | lambda, colsampletree, colsamplelevel) {
155 | 
156 | set.seed(1066)
157 | 
158 | dtrain <- xgb.DMatrix(data=x_train, label=label_train, weight=weights)
159 | 
160 | pars <- list( ## Define hyperparameters to be fed to XGBoost model
161 | tree_method = "gpu_hist", ## Use GPU support
162 | sampling_method = "gradient_based", ## Gradient-based sampling
163 | objective = "reg:squarederror", ## Regression task
164 | eval_metric = "rmse", ## How model 'fit' evaluated
165 | maximize = FALSE, ## Want to minimise this
166 | max_depth = max_depth, ## How deep each tree goes
167 | eta = eta, ## Learning rate
168 | subsample = subsample, ## Proportion of observations taken in sub-sample
169 | alpha = alpha, ## alpha, gamma, lambda = regularisation parameters
170 | gamma = gamma,
171 | lambda = lambda,
172 | colsample_bytree = colsampletree, ## Variable columns sampled for building each tree
173 | colsample_bylevel = colsamplelevel ## Variable columns sampled for adding branches to tree
174 | )
175 | 
176 | xgbcv <- xgb.cv( ## Use xgboost package's inbuilt cv function
177 | params = pars, ## Which parameter sets to use
178 | data = dtrain,
179 | nround = number,
180 | nfold = 5,
181 | early_stopping_rounds = 10,
182 | maximize = FALSE,
183 | verbose = 1
184 | )
185 | 
186 | result <- min(xgbcv$evaluation_log$test_rmse_mean) ## Extract the rmse
187 | converted_result <- -1*result ## so that BO can maximise it
188 | 
189 | return(list(Score = converted_result) ## Return the score to the function
190 | )
191 | 
192 | }
193 | 
194 | ## Once the scoring function is set up, we need to define the hyperparameter search space #
195 | 
196 | bounds <- list(
197 | max_depth = c(1L, 6L),
198 | eta = c(0.0001, 0.1),
199 | subsample = c(0.1, 0.5),
200 | number = c(1L, 500L),
201 | alpha = c(0L, 20L),
202 | gamma = c(0L, 20L),
203 | lambda = c(0L, 20L),
204 | colsampletree = c(0.1, 0.8),
205 | colsamplelevel = c(0.1, 0.8)
206 | )
207 | 
208 | ## Finally, set up the Bayesian optimisation, and run it with the above functions/details ##
209 | 
210 | start <- Sys.time() ## Like to time things
211 | 
212 | set.seed(1066) ## Reproducibility before setting off the BO function
213 | 
214 | bayesian_boost <- bayesOpt(
215 | FUN = scorefunction,
216 | bounds = bounds, ## Tells function which search space to use 217 | initPoints = 25, ## 25 trials first, followed by 218 | iters.n = 25, ## 25 runs of BO thereafter. Total of 50 iterations. 219 | iters.k = 1, 220 | parallel = FALSE, 221 | verbose = 2, 222 | acq = "ei", ## Expected improvement acquisition function 223 | plotProgress = FALSE ## Plot this later, rather than sequentially each iter. Graph overload... 224 | ) 225 | 226 | end <- Sys.time() 227 | end-start 228 | 229 | bayesian_boost$scoreSummary ## Get summary of models fitted 230 | bestpars <- getBestPars(bayesian_boost) 231 | bestpars ## List the optimal hyperparameter configuration 232 | 233 | # Store these best values to plug into a model fitted to the full development data # 234 | opt_maxdepth = bestpars[1] 235 | opt_eta = bestpars[2] 236 | opt_subsamp = bestpars[3] 237 | opt_number = bestpars[4] 238 | opt_alpha = bestpars[5] 239 | opt_gamma = bestpars[6] 240 | opt_lambda = bestpars[7] 241 | opt_colsampletree = bestpars[8] 242 | opt_colsamplelevel = bestpars[9] 243 | 244 | # set these as the parameters for the model # 245 | 246 | parameters <- list(tree_method = "gpu_hist", 247 | sampling_method = "gradient_based", 248 | objective = "reg:squarederror", 249 | eval_metric = "rmse", 250 | maximize = FALSE, 251 | max_depth = opt_maxdepth, 252 | eta = opt_eta, 253 | subsample = opt_subsamp, 254 | alpha = opt_alpha, 255 | gamma = opt_gamma, 256 | lambda = opt_lambda, 257 | colsample_bytree = opt_colsampletree, 258 | colsample_bylevel = opt_colsamplelevel 259 | ) 260 | 261 | xgboost_model_endpoint3 <- xgb.train(data = dtrain, param = parameters, 262 | nrounds = opt_number$number, verbose =2) 263 | 264 | xgb.save(xgboost_model_endpoint3, fname = "xgboost_model_endpoint3") ## Save this final model 265 | 266 | ## Variable importance - all predictors ## 267 | importance.matrix <- xgb.importance(colnames(x_train), model=xgboost_model_endpoint3) 268 | xgb.plot.importance(importance.matrix, rel_to_first=TRUE, xlab="Relative predictor importance", col="darkblue", cex=0.4) 269 | 270 | ## Variable importance - top 10 predictors ## 271 | importance.matrix <- xgb.importance(colnames(x_train), model=xgboost_model_endpoint3) 272 | xgb.plot.importance(importance.matrix, rel_to_first=TRUE, xlab="Relative predictor importance", col="darkblue", cex=0.8, top_n=10) 273 | 274 | ## Same as first, but print all values in a Table so can interpret more easily ## 275 | importance.matrix <- xgb.importance(colnames(x_train), model=xgboost_model_endpoint3) 276 | table <- xgb.plot.importance(importance.matrix, plot=FALSE) 277 | table 278 | 279 | 280 | # Clear everything, so we can now run the internal-external cross-validation ## 281 | rm(list = ls()) 282 | 283 | ################################################################################ 284 | ################################################################################ 285 | 286 | ## MODEL EVALUATION ## 287 | 288 | ################# 289 | # Load packages # 290 | ################# 291 | library(survival) 292 | library(xgboost) 293 | library(readr) 294 | library(haven) 295 | library(stringr) 296 | library(caret) 297 | library(ParBayesianOptimization) 298 | 299 | ##################################### 300 | # Open and format datasets for IECV # 301 | ##################################### 302 | # Note - in these, rows are sorted by region, and then by patid, to allow safe merging in Stata # 303 | getwd() 304 | setwd("/final_datasets/ML/") 305 | 306 | ## Import Period 1 data first - this is used for 
iterative model fitting ##
307 | data_period1 <- read_dta("OX129_endpoint3_stacked50_period1_pseudovalues.dta")
308 | str(data_period1)
309 | data_period1$smoke_cat <- factor(data_period1$smoke_cat) # Smoking status
310 | data_period1$cancer_route <- factor(data_period1$cancer_route) # Route to diagnosis
311 | data_period1$pr_status2 <- factor(data_period1$pr_status2) # PR +/-
312 | data_period1$her2_status2 <- factor(data_period1$her2_status2); data_period1$er_status2 <- factor(data_period1$er_status2) # HER2 +/- ; ER +/- (needed for the dummies below)
313 | data_period1$radiotherapy <- factor(data_period1$radiotherapy) # Use of RTx in 1st year
314 | data_period1$mastectomy <- factor(data_period1$mastectomy) # Mastectomy in 1st year
315 | data_period1$other_surgery <- factor(data_period1$other_surgery) # Other surgery in 1st year
316 | data_period1$cancer_stage <- factor(data_period1$cancer_stage) # Stage (I-IV)
317 | data_period1$cancer_grade <- factor(data_period1$cancer_grade) # Grade (differentiation)
318 | data_period1$hrt <- factor(data_period1$hrt) # Hormone replacement therapy use
319 | data_period1$sha1 <- factor(data_period1$sha1) # Region - used for IECV
320 | 
321 | ## Now, start converting these factors to dummies ##
322 | ## Set up list of parameters that need dummies - names must match the factor variables above ##
323 | dummy_parameters <- c('smoke_cat', 'radiotherapy', 'mastectomy', 'other_surgery', 'hrt',
324 | 'cancer_stage', 'cancer_grade', 'pr_status2', 'er_status2',
325 | 'her2_status2', 'cancer_route')
326 | 
327 | ## The others will be handled as binary ##
328 | ## Generate the dummies ##
329 | dummies <- dummyVars(~ smoke_cat + radiotherapy + mastectomy +
330 | other_surgery + hrt + cancer_stage +
331 | cancer_grade + pr_status2 + her2_status2 + er_status2 +
332 | cancer_route, data = data_period1)
333 | 
334 | # Package the dummy variables into its own dataset, ready to bind with the
335 | # other, non-dummy parameters #
336 | dummied <- as.data.frame(predict(dummies, newdata=data_period1))
337 | 
338 | # Form the ready-to-process dataset by binding the new dummies with the other
339 | # numeric parameters #
340 | data_period1 <- cbind(data_period1[, -c(which(colnames(data_period1) %in% dummy_parameters))],
341 | dummied)
342 | 
343 | ## Continuous variables (age, BMI, Townsend score) - no scaling needed for XGB ##
344 | 
345 | ## Set up the predictors ('x cols') and outcome ('y cols') ##
346 | ## Rename variables (inc.
new dummies) where relevant for interpretation of
347 | ## variable importance and other graphs later ##
348 | colnames(data_period1)[colnames(data_period1) == 'smoke_cat.0'] <- 'Non_smoker'
349 | colnames(data_period1)[colnames(data_period1) == 'smoke_cat.1'] <- 'Ex_smoker'
350 | colnames(data_period1)[colnames(data_period1) == 'smoke_cat.2'] <- 'Light_smoker'
351 | colnames(data_period1)[colnames(data_period1) == 'smoke_cat.3'] <- 'Moderate_smoker'
352 | colnames(data_period1)[colnames(data_period1) == 'smoke_cat.4'] <- 'Heavy_smoker'
353 | colnames(data_period1)[colnames(data_period1) == 'radiotherapy.1'] <- 'Radiotherapy'
354 | colnames(data_period1)[colnames(data_period1) == 'mastectomy.1'] <- 'Mastectomy'
355 | colnames(data_period1)[colnames(data_period1) == 'other_surgery.1'] <- 'Other_surgery'
356 | colnames(data_period1)[colnames(data_period1) == 'hrt.1'] <- 'HRT_use'
357 | colnames(data_period1)[colnames(data_period1) == 'cancer_stage.1'] <- 'Stage1'
358 | colnames(data_period1)[colnames(data_period1) == 'cancer_stage.2'] <- 'Stage2'
359 | colnames(data_period1)[colnames(data_period1) == 'cancer_stage.3'] <- 'Stage3'
360 | colnames(data_period1)[colnames(data_period1) == 'cancer_stage.4'] <- 'Stage4'
361 | colnames(data_period1)[colnames(data_period1) == 'cancer_grade.1'] <- 'Well_differentiated'
362 | colnames(data_period1)[colnames(data_period1) == 'cancer_grade.2'] <- 'Moderately_differentiated'
363 | colnames(data_period1)[colnames(data_period1) == 'cancer_grade.3'] <- 'Poorly_differentiated'
364 | colnames(data_period1)[colnames(data_period1) == 'pr_status2.3'] <- 'PR_positive'
365 | colnames(data_period1)[colnames(data_period1) == 'her2_status2.3'] <- 'HER2_positive'
366 | colnames(data_period1)[colnames(data_period1) == 'er_status2.3'] <- 'ER_positive'
367 | colnames(data_period1)[colnames(data_period1) == 'cancer_route.2'] <- 'Emergency_presentation'
368 | colnames(data_period1)[colnames(data_period1) == 'cancer_route.3'] <- 'GP_referral'
369 | colnames(data_period1)[colnames(data_period1) == 'cancer_route.4'] <- 'Inpatient_elective'
370 | colnames(data_period1)[colnames(data_period1) == 'cancer_route.5'] <- 'Other_outpatient_pathway'
371 | colnames(data_period1)[colnames(data_period1) == 'cancer_route.6'] <- 'Screening_detected'
372 | colnames(data_period1)[colnames(data_period1) == 'cancer_route.7'] <- 'Two_week_wait'
373 | 
374 | ## 'pseudo' contains the jack-knife pseudo-values for the Aalen-Johansen CIF at 10 years ##
375 | x_cols <- c('age_at_diagnosis', 'bmi', 'Non_smoker', 'Ex_smoker', 'Light_smoker',
376 | 'Moderate_smoker', 'Heavy_smoker', 'Radiotherapy', 'Mastectomy',
377 | 'Other_surgery', 'HRT_use', 'Stage1', 'Stage2', 'Stage3', 'Stage4', 'Well_differentiated',
378 | 'Moderately_differentiated', 'Poorly_differentiated', 'PR_positive',
379 | 'HER2_positive', 'ER_positive', 'Emergency_presentation', 'GP_referral', 'Inpatient_elective',
380 | 'Other_outpatient_pathway', 'Screening_detected', 'Two_week_wait')
381 | 
382 | ## Target for model fitting is the Period 1 pseudovalues ##
383 | ## Target for the evaluation is the Period 2 pseudovalues ##
384 | y_cols <- c('period1_pseudo')
385 | 
386 | ######################################
387 | ## NOW, REPEAT FOR PERIOD 2 DATASET ##
388 | ######################################
389 | 
390 | setwd("/final_datasets/ML/")
391 | data_period2 <- read_dta("OX129_endpoint3_stacked50_period2_pseudovalues.dta")
392 | str(data_period2)
393 | 
394 | ## Need to reformat categorical variables ##
395 | ## Categorical variables need to be converted
into dummy variables ##
396 | ## First, reformat relevant parameters as factors, then convert to dummies #
397 | data_period2$smoke_cat <- factor(data_period2$smoke_cat) # Smoking status
398 | data_period2$cancer_route <- factor(data_period2$cancer_route) # Route to diagnosis
399 | data_period2$pr_status2 <- factor(data_period2$pr_status2) # PR +/-
400 | data_period2$her2_status2 <- factor(data_period2$her2_status2); data_period2$er_status2 <- factor(data_period2$er_status2) # HER2 +/- ; ER +/- (needed for the dummies below)
401 | data_period2$radiotherapy <- factor(data_period2$radiotherapy) # Use of RTx in 1st year
402 | data_period2$mastectomy <- factor(data_period2$mastectomy) # Mastectomy in 1st year
403 | data_period2$other_surgery <- factor(data_period2$other_surgery) # Other surgery in 1st year
404 | data_period2$cancer_stage <- factor(data_period2$cancer_stage) # Stage (I-IV)
405 | data_period2$cancer_grade <- factor(data_period2$cancer_grade) # Grade (differentiation)
406 | data_period2$hrt <- factor(data_period2$hrt) # Hormone replacement therapy use
407 | data_period2$sha1 <- factor(data_period2$sha1) # Region - used for IECV
408 | 
409 | # Generate the dummies #
410 | dummies2 <- dummyVars(~ smoke_cat + radiotherapy + mastectomy +
411 | other_surgery + hrt + cancer_stage +
412 | cancer_grade + pr_status2 + her2_status2 + er_status2 +
413 | cancer_route, data = data_period2)
414 | 
415 | # Package the dummy variables into its own dataset, ready to bind with the
416 | # other, non-dummy parameters #
417 | dummied2 <- as.data.frame(predict(dummies2, newdata=data_period2))
418 | 
419 | # Form the ready-to-process dataset by binding the new dummies with the other
420 | # numeric parameters #
421 | data_period2 <- cbind(data_period2[, -c(which(colnames(data_period2) %in%
422 | dummy_parameters))], dummied2)
423 | 
424 | ## Rename variables for interpretability ##
425 | colnames(data_period2)[colnames(data_period2) == 'smoke_cat.0'] <- 'Non_smoker'
426 | colnames(data_period2)[colnames(data_period2) == 'smoke_cat.1'] <- 'Ex_smoker'
427 | colnames(data_period2)[colnames(data_period2) == 'smoke_cat.2'] <- 'Light_smoker'
428 | colnames(data_period2)[colnames(data_period2) == 'smoke_cat.3'] <- 'Moderate_smoker'
429 | colnames(data_period2)[colnames(data_period2) == 'smoke_cat.4'] <- 'Heavy_smoker'
430 | colnames(data_period2)[colnames(data_period2) == 'radiotherapy.1'] <- 'Radiotherapy'
431 | colnames(data_period2)[colnames(data_period2) == 'mastectomy.1'] <- 'Mastectomy'
432 | colnames(data_period2)[colnames(data_period2) == 'chemotherapy.1'] <- 'Chemotherapy' # (no-op here - chemotherapy is not among the dummied variables)
433 | colnames(data_period2)[colnames(data_period2) == 'other_surgery.1'] <- 'Other_surgery'
434 | colnames(data_period2)[colnames(data_period2) == 'hrt.1'] <- 'HRT_use'
435 | colnames(data_period2)[colnames(data_period2) == 'cancer_stage.1'] <- 'Stage1'
436 | colnames(data_period2)[colnames(data_period2) == 'cancer_stage.2'] <- 'Stage2'
437 | colnames(data_period2)[colnames(data_period2) == 'cancer_stage.3'] <- 'Stage3'
438 | colnames(data_period2)[colnames(data_period2) == 'cancer_stage.4'] <- 'Stage4'
439 | colnames(data_period2)[colnames(data_period2) == 'cancer_grade.1'] <- 'Well_differentiated'
440 | colnames(data_period2)[colnames(data_period2) == 'cancer_grade.2'] <- 'Moderately_differentiated'
441 | colnames(data_period2)[colnames(data_period2) == 'cancer_grade.3'] <- 'Poorly_differentiated'
442 | colnames(data_period2)[colnames(data_period2) == 'pr_status2.3'] <- 'PR_positive'
443 | colnames(data_period2)[colnames(data_period2) == 'her2_status2.3'] <- 'HER2_positive'
444 | colnames(data_period2)[colnames(data_period2) ==
'er_status2.3'] <- 'ER_positive'
445 | colnames(data_period2)[colnames(data_period2) == 'cancer_route.2'] <- 'Emergency_presentation'
446 | colnames(data_period2)[colnames(data_period2) == 'cancer_route.3'] <- 'GP_referral'
447 | colnames(data_period2)[colnames(data_period2) == 'cancer_route.4'] <- 'Inpatient_elective'
448 | colnames(data_period2)[colnames(data_period2) == 'cancer_route.5'] <- 'Other_outpatient_pathway'
449 | colnames(data_period2)[colnames(data_period2) == 'cancer_route.6'] <- 'Screening_detected'
450 | colnames(data_period2)[colnames(data_period2) == 'cancer_route.7'] <- 'Two_week_wait'
451 | 
452 | z_cols <- c('period2_pseudo') ## To form xgb.DMatrix for validation data, need to provide labels
453 | 
454 | ## Check dimensions of two sub-sets ##
455 | dim(data_period1)
456 | dim(data_period2)
457 | 
458 | ## Clean up memory by removing the dummied interim datasets
459 | rm(dummied)
460 | rm(dummied2)
461 | 
462 | ################################
463 | # Bayesian optimisation set-up #
464 | ################################
465 | 
466 | # There are 10 regions (sha1 variable), so set up loop to go over each of them #
467 | # Development data = period 1, region 'all except i'; validation data = period 2, region i #
468 | 
469 | # Set up empty list for storing individual predictions #
470 | iecv_predictions = list()
471 | 
472 | # Bounds for the search space for BO #
473 | bounds <- list(
474 | max_depth = c(1L, 6L),
475 | eta = c(0.0001, 0.1),
476 | subsample = c(0.1, 0.5),
477 | number = c(1L, 500L),
478 | alpha = c(0L, 20L),
479 | gamma = c(0L, 20L),
480 | lambda = c(0L, 20L), ## Same as for model fitting steps above
481 | colsampletree = c(0.1, 0.8),
482 | colsamplelevel = c(0.1, 0.8)
483 | )
484 | 
485 | start <- Sys.time() ## Like timing
486 | 
487 | ## LOOP STARTS HERE ##
488 | 
489 | for (a in 1:10) { ## Cycle through each value of region (sha1) from 1 to 10
490 | 
491 | set.seed(1066)
492 | 
493 | data_dev = data_period1[ -which (data_period1$sha1 == levels(data_period1$sha1)[a]), ] ## Development data = all except region 'a' in period 1
494 | data_val = data_period2[ which (data_period2$sha1 == levels(data_period2$sha1)[a]), ] ## Validation data = region 'a' in period 2
495 | 
496 | x_dev <- as.matrix(data_dev[, x_cols]) ## Form matrices for these dev/val sub-sets
497 | label_dev <- as.matrix(data_dev[, y_cols]) ## And their labels (pseudovalues)
498 | x_val <- as.matrix(data_val[, x_cols]) ## These are replaced with each loop iteration
499 | label_val <- as.matrix(data_val[, z_cols])
500 | 
501 | ddev <- xgb.DMatrix(data=x_dev, label=label_dev) ## Join these together to hand to XGBoost
502 | dval <- xgb.DMatrix(data=x_val, label=label_val) ## Same, but for the validation data for this loop
503 | 
504 | scorefunction <- function(max_depth, eta, subsample, number, alpha, gamma, ## Nested cross-validation loop for hyperparameter tuning
505 | lambda, colsampletree, colsamplelevel) { ## Same style of objective function for Bayesian optimisation to minimise
506 | 
507 | pars <- list( ## Model hyperparameters as above
508 | tree_method = "gpu_hist",
509 | sampling_method = "gradient_based",
510 | objective = "reg:squarederror",
511 | eval_metric = "rmse",
512 | maximize = FALSE,
513 | max_depth = max_depth,
514 | eta = eta,
515 | subsample = subsample,
516 | alpha = alpha,
517 | gamma = gamma,
518 | lambda = lambda,
519 | colsample_bytree = colsampletree,
520 | colsample_bylevel = colsamplelevel
521 | )
522 | 
523 | xgbcv <- xgb.cv( ## Run nested 5-fold cross-validation to test the
524 |       params = pars,
525 |       data = ddev,
526 |       nrounds = number,
527 |       nfold = 5,
528 |       early_stopping_rounds = 10,
529 |       maximize = FALSE,
530 |       verbose = 1
531 |     )
532 | 
533 |     result <- min(xgbcv$evaluation_log$test_rmse_mean)  ## Extract the best (lowest) mean test RMSE
534 |     converted_result <- -1*result  ## We want to minimise RMSE, so we maximise the negative RMSE
535 | 
536 |     return(list(Score = converted_result))  ## Pass this 'score' back to the Bayesian optimisation package
537 | 
538 |   }
539 | 
540 | 
541 |   ## Finally, having set up the Bayesian optimisation, run it with the above functions/details ##
542 | 
543 |   set.seed(1066)
544 | 
545 |   bayesian_boost <- bayesOpt(
546 |     FUN = scorefunction,
547 |     bounds = bounds,
548 |     initPoints = 25,
549 |     iters.n = 25,
550 |     iters.k = 1,
551 |     parallel = FALSE,
552 |     verbose = 1,
553 |     acq = "ei",
554 |     plotProgress = FALSE,
555 |     otherHalting = list(timeLimit=5400)  ## Stop if >1.5hrs taken on one cycle of the loop (never reached in this run)
556 |   )
557 | 
558 |   ## Evaluate the output of the Bayesian Optimisation process ##
559 |   # Extract the best hyperparameter combination found #
560 |   bayesian_boost$scoreSummary
561 |   bestpars <- getBestPars(bayesian_boost)
562 | 
563 |   # Store these best values to plug into a model fitted to this loop's development data #
  # Use [[ ]] indexing so each value is extracted as a scalar rather than a one-element list #
564 | 
565 |   opt_maxdepth = bestpars[[1]]
566 |   opt_eta = bestpars[[2]]
567 |   opt_subsamp = bestpars[[3]]
568 |   opt_number = bestpars[[4]]
569 |   opt_alpha = bestpars[[5]]
570 |   opt_gamma = bestpars[[6]]
571 |   opt_lambda = bestpars[[7]]
572 |   opt_colsampletree = bestpars[[8]]
573 |   opt_colsamplelevel = bestpars[[9]]
574 | 
575 |   # Set these as the parameters for the loop's model #
576 | 
577 |   parameters <- list(tree_method = "gpu_hist",
578 |                      sampling_method = "gradient_based",
579 |                      objective = "reg:squarederror",
580 |                      eval_metric = "rmse",
581 |                      maximize = FALSE,
582 |                      max_depth = opt_maxdepth,
583 |                      eta = opt_eta,
584 |                      subsample = opt_subsamp,
585 |                      alpha = opt_alpha,
586 |                      gamma = opt_gamma,
587 |                      lambda = opt_lambda,
588 |                      colsample_bytree = opt_colsampletree,
589 |                      colsample_bylevel = opt_colsamplelevel
590 |   )
591 | 
592 |   xgboost_iecv_loop <- xgb.train(data = ddev, param = parameters,  ## Fit the model with the optimal hyperparameter config identified in this loop
593 |                                  nrounds = opt_number, verbose=1)
594 | 
595 | 
596 |   predictions.test <- predict(xgboost_iecv_loop, dval)  ## Make predictions on held-out period 2 data for region 'a'
597 |   iecv_predictions[[a]] = predictions.test  ## Store these predictions in the pre-made repository for access later
598 | }  ## Then repeat so that all regions are cycled through
599 | 
600 | ## LOOP ENDS ##
601 | 
602 | end <- Sys.time()
603 | end-start  ## Takes approx. 4.41 hours with GPU support on my server
604 | 
605 | #####################################
606 | # POOLING THE PREDICTIONS FROM IECV #
607 | #####################################
608 | 
609 | # Combine the predictions made on the held-out data from each cycle #
610 | xgboost_iecv_predictions <- c(iecv_predictions[[1]], iecv_predictions[[2]], iecv_predictions[[3]],
611 |                               iecv_predictions[[4]], iecv_predictions[[5]], iecv_predictions[[6]],
612 |                               iecv_predictions[[7]], iecv_predictions[[8]], iecv_predictions[[9]],
613 |                               iecv_predictions[[10]])
614 | # Check these look sensible - no overt errors! #
615 | summary(xgboost_iecv_predictions)
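## Each patient appears 50 times in the stacked data (once per imputation), so the
## per-patient prediction used downstream is the average over imputations - the
## point-estimate analogue of Rubin's rules. A minimal sketch, assuming the averaging
## is wanted here in R rather than after export to Stata, and assuming (as the cbind
## below also does) that the pooled predictions align with the row order of data_period2:
# mean_predictions <- aggregate(xgboost_iecv_predictions,
#                               by = list(patid = data_period2$patid), FUN = mean)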
616 | 
617 | # Extract the patient IDs - needed later to merge the predictions back for calculating performance metrics in Stata #
618 | patid <- as.data.frame(data_period2$patid)
619 | export_predictions <- cbind(patid, xgboost_iecv_predictions)  ## Bind these columns together
620 | head(export_predictions)
621 | 
622 | # Save the predictions for later use - they will be used to evaluate performance of the model fitted to the entire data #
623 | setwd("/estimates/ML_IECV_predictions/")
624 | # Save as csv for easy importing into Stata #
625 | write.csv(export_predictions, file="endpoint3_xgboost_iecv_predictions.csv")
626 | 
627 | 
628 | ###########################################################################################
629 | ###########################################################################################
--------------------------------------------------------------------------------
/6_OX129_neural_networks:
--------------------------------------------------------------------------------
1 | ## Neural network (competing risks) modelling ##
2 | 
3 | ## Same overall process as XGBoost - use the stacked imputed datasets, fit a model to the entire data,
4 | ## and evaluate using internal-external cross-validation that recapitulates the (nested) hyperparameter tuning ##
5 | 
6 | 
7 | #############################
8 | ## Load in packages needed ##
9 | #############################
10 | library(survival)
11 | library(readr)
12 | library(stringr)
13 | library(caret)
14 | library(haven)
15 | library(ParBayesianOptimization)
16 | library(keras)
17 | library(tensorflow)
18 | 
19 | ########################################################
20 | ## Check and set working directory for importing data ##
21 | ########################################################
22 | getwd()
23 | setwd("/final_datasets/ML/")
24 | 
25 | ##################
26 | ## Load in data ##
27 | ##################
28 | data <- read_dta("OX129_endpoint3_stacked50_pseudovalues.dta")
29 | str(data)
30 | 
31 | #########################
32 | ## Variable formatting ##
33 | #########################
34 | ## Categorical variables need to be converted into dummy variables ##
35 | ## First, reformat relevant parameters as factors, then convert to dummies ##
36 | 
37 | data$smoke_cat <- factor(data$smoke_cat)          # Smoking status
38 | data$cancer_route <- factor(data$cancer_route)    # Route to diagnosis
39 | data$pr_status2 <- factor(data$pr_status2)        # PR +/-
40 | data$her2_status2 <- factor(data$her2_status2)    # HER2 +/-
41 | data$er_status2 <- factor(data$er_status2)        # ER +/-
42 | data$radiotherapy <- factor(data$radiotherapy)    # Use of RTx in 1st year
43 | data$mastectomy <- factor(data$mastectomy)        # Mastectomy in 1st year
44 | data$other_surgery <- factor(data$other_surgery)  # Other surgery in 1st year
45 | data$cancer_stage <- factor(data$cancer_stage)    # Stage (I-IV)
46 | data$cancer_grade <- factor(data$cancer_grade)    # Grade (differentiation)
47 | data$hrt <- factor(data$hrt)                      # Hormone replacement therapy use
48 | data$sha1 <- factor(data$sha1)                    # Region - used for IECV
49 | 
50 | ## Now, start converting these factors to dummies ##
51 | ## Set up list of the factor columns that are replaced by dummies ##
52 | dummy_parameters <- c('smoke_cat', 'radiotherapy', 'mastectomy', 'other_surgery',
53 |                       'hrt', 'cancer_stage', 'cancer_grade', 'pr_status2',
54 |                       'er_status2', 'her2_status2', 'cancer_route')
55 | 
56 | ## The other (binary) predictors are handled as they are ##
57 | ## Generate the dummies ##
58 | dummies <- dummyVars(~ smoke_cat + radiotherapy + mastectomy +
59 |                        other_surgery + hrt + cancer_stage +
60 |                        cancer_grade + pr_status2 + her2_status2 + er_status2 +
61 |                        cancer_route, data = data)
62 | 
63 | ## Package the dummy variables into their own dataset, ready to bind with the ##
64 | ## other, non-dummy parameters ##
65 | dummied <- as.data.frame(predict(dummies, newdata=data))
66 | 
67 | ## Form the ready-to-process dataset by binding the new dummies with the other ##
68 | ## numeric parameters ##
69 | data_for_model <- cbind(data[, -c(which(colnames(data) %in% dummy_parameters))],
70 |                         dummied)
71 | 
72 | ## Create normalise function - continuous variables are to be normalised/scaled ##
73 | ## to be between 0 and 1 ##
74 | 
75 | normalise <- function(x) {
76 |   return ((x - min(x)) / (max(x) - min(x)))
77 | }
78 | 
79 | data_for_model$age_at_diagnosis <- normalise(data_for_model$age_at_diagnosis)
80 | data_for_model$bmi <- normalise(data_for_model$bmi)
81 | 
82 | ## Set up the predictors ('x cols') and outcome ('y cols') ##
83 | ## Rename variables (inc. new dummies) where relevant for interpretation ##
84 | colnames(data_for_model)[colnames(data_for_model) == 'smoke_cat.0'] <- 'Non_smoker'
85 | colnames(data_for_model)[colnames(data_for_model) == 'smoke_cat.1'] <- 'Ex_smoker'
86 | colnames(data_for_model)[colnames(data_for_model) == 'smoke_cat.2'] <- 'Light_smoker'
87 | colnames(data_for_model)[colnames(data_for_model) == 'smoke_cat.3'] <- 'Moderate_smoker'
88 | colnames(data_for_model)[colnames(data_for_model) == 'smoke_cat.4'] <- 'Heavy_smoker'
89 | colnames(data_for_model)[colnames(data_for_model) == 'radiotherapy.1'] <- 'Radiotherapy'
90 | colnames(data_for_model)[colnames(data_for_model) == 'mastectomy.1'] <- 'Mastectomy'
91 | colnames(data_for_model)[colnames(data_for_model) == 'other_surgery.1'] <- 'Other_surgery'
92 | colnames(data_for_model)[colnames(data_for_model) == 'hrt.1'] <- 'HRT_use'
93 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_stage.1'] <- 'Stage1'
94 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_stage.2'] <- 'Stage2'
95 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_stage.3'] <- 'Stage3'
96 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_stage.4'] <- 'Stage4'
97 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_grade.1'] <- 'Well_differentiated'
98 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_grade.2'] <- 'Moderately_differentiated'
99 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_grade.3'] <- 'Poorly_differentiated'
100 | colnames(data_for_model)[colnames(data_for_model) == 'pr_status2.3'] <- 'PR_positive'
101 | colnames(data_for_model)[colnames(data_for_model) == 'her2_status2.3'] <- 'HER2_positive'
102 | colnames(data_for_model)[colnames(data_for_model) == 'er_status2.3'] <- 'ER_positive'
103 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_route.2'] <- 'Emergency_presentation'
104 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_route.3'] <- 'GP_referral'
105 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_route.4'] <- 'Inpatient_elective'
106 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_route.5'] <- 'Other_outpatient_pathway'
107 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_route.6'] <- 'Screening_detected'
108 | colnames(data_for_model)[colnames(data_for_model) == 'cancer_route.7'] <- 'Two_week_wait'
109 | 
110 | ## 'pseudo' are the jack-knife pseudo-values for the Aalen-Johansen CIF ##
111 | x_cols <- c('age_at_diagnosis', 'bmi', 'Non_smoker', 'Ex_smoker', 'Light_smoker',
112 |             'Moderate_smoker',
'Heavy_smoker', 'Radiotherapy', 'Mastectomy',
113 |             'Other_surgery', 'HRT_use', 'Stage1', 'Stage2', 'Stage3', 'Stage4', 'Well_differentiated',
114 |             'Moderately_differentiated', 'Poorly_differentiated', 'PR_positive',
115 |             'HER2_positive', 'ER_positive', 'Emergency_presentation', 'GP_referral', 'Inpatient_elective',
116 |             'Other_outpatient_pathway', 'Screening_detected', 'Two_week_wait')
117 | y_cols <- c('pseudo')
118 | 
119 | ## Clean up environment to avoid issues with memory ##
120 | rm(dummied)
121 | rm(data)
122 | 
123 | 
124 | ########################
125 | ## Dataset formatting ##
126 | ########################
127 | 
128 | ## Change dataset to a matrix ##
129 | ## x_train = predictor parameters ##
130 | x_train <- as.matrix(data_for_model[, x_cols])
131 | 
132 | ## Labels = the target for the neural net - the pseudovalues ##
133 | label_train <- as.matrix(data_for_model[, y_cols])
134 | 
135 | ## Number of input neurons (predictor parameters)
136 | dim(x_train)
137 | 
138 | 
139 | ###############################################
140 | ## Define custom loss function for the model ##
141 | ###############################################
142 | 
143 | ## Define root mean squared error loss function ##
144 | ## This is used to fit the neural network model ##
145 | 
146 | rmse <- function(y_true, y_pred){
147 |   K <- backend()  ## Access the Keras backend for tensor operations
148 |   loss <- K$sqrt(K$mean((y_true - y_pred)^2))
149 |   return(loss)
150 | }
151 | 
152 | ############################################################################
153 | ## FULL MODEL: Setting up Bayesian optimisation for hyperparameter tuning ##
154 | ############################################################################
155 | 
156 | ## First, we fit the neural network to the entire dataset; Bayesian optimisation ##
157 | ## is used to identify the optimal hyperparameters (with nested cross-validation) ##
158 | 
159 | ## Set up callbacks - use early stopping to halt training if the model stops ##
160 | ## improving, combined with a reduction in the learning rate if the loss ##
161 | ## plateaus. ##
162 | callbacks_list <- list(
163 |   callback_reduce_lr_on_plateau(monitor = "loss", factor = 0.5, patience = 3),
164 |   callback_early_stopping(monitor = "loss", min_delta = 0.0001, patience = 5)
165 | )
166 | 
167 | 
168 | ## Seed for reproducibility ##
169 | set.seed(2066)
170 | 
171 | ## The Bayesian Optimisation package requires a 'scoring function' which ##
172 | ## it uses to assess the performance of the different configurations ##
173 | 
174 | ## Herein, we specify that we want to tune the number of epochs, the ##
175 | ## number of units in each hidden layer, the number of hidden layers, ##
176 | ## and the learning rate ##
177 | ## To estimate the performance of each model configuration, we use 5-fold ##
178 | ## cross-validation, so we nest a loop inside the scoring function so that ##
179 | ## the 'score' returned is a cross-validated estimate ##
180 | 
181 | scorefunction <- function(epochs, units, layers, learnrate) {
182 | 
183 |   set.seed(1066)
184 | 
185 |   k <- 5                                                    ## 5-fold cross-validation
186 |   indices <- sample(1:nrow(x_train))                        ## Random permutation of the row indices
187 |   folds <- cut(1:length(indices), breaks = k, labels = FALSE)  ## Fold labels along the permuted order
188 | 
189 |   cv_score <- c()  ## Empty vector to store 'scores' over each CV iteration
190 | 
191 |   for (i in 1:k) {  ## Nested loop to perform CV
192 | 
193 |     cv_val_indices <- indices[which(folds==i)]  ## Validation set = a random 1/5 of rows (indexing via 'indices' keeps the folds random rather than contiguous blocks of the sorted data)
194 |     cv_val_data <- x_train[cv_val_indices, ]    ## Take from x_train
195 |     cv_val_labels <- label_train[cv_val_indices]  ## Take from labels
196 | 
197 |     cv_train_data <- x_train[-cv_val_indices, ]     ## Train data = 4/5 folds
198 |     cv_train_labels <- label_train[-cv_val_indices]  ## Train labels 4/5
199 | 
200 |     cv_model <- keras_model_sequential()  ## Instantiate keras model
201 |     for(m in 1:layers){                   ## Loop added to vary the number of hidden layers
202 |       cv_model %>%
203 |         layer_dense(units = units, activation = 'relu',
204 |                     input_shape=c(27))
205 |     }
206 |     cv_model %>% layer_dense(units = 1, activation = "linear") %>%
207 |       compile(
208 |         optimizer = optimizer_adam(lr=learnrate),
209 |         loss = rmse,    ## Use the custom loss function defined above
210 |         metrics = "mse"  ## Reported as the model fits, but not the loss function
211 |       )
212 |     ## Fit the model - batch size = 1024 ##
213 |     cv_model %>% fit(cv_train_data, cv_train_labels, epochs=epochs,
214 |                      batch_size=1024, verbose=2, callbacks = callbacks_list,
215 |                      validation_data = list(cv_val_data, cv_val_labels))  ## Use held-out data to plot training dynamics
216 | 
217 |     ## Generate predictions on the held-out fold ##
218 |     cv_predictions <- predict(cv_model, cv_val_data)
219 | 
220 |     ## BO wants to find the maximum, but lower RMSE = better ##
221 |     ## -1 x RMSE: lower RMSE = higher value, so use this to score the config ##
222 |     negative_rmse <- -1*sqrt((mean((cv_predictions-cv_val_labels)^2)))
223 | 
224 |     # Append this fold's score; the mean over folds is returned to bayesOpt below
225 |     cv_score <- c(cv_score, negative_rmse)
226 | 
227 |     ## Clear out the model at the end of each fold so the next one starts fresh ##
228 |     rm(cv_model)
229 |   }
230 |   ## BO needs the score to be returned to it ##
231 |   return(list(Score = mean(cv_score)))
232 | }
233 | 
234 | 
235 | ## Define hyperparameter search space - 'bounds' to search ##
236 | 
237 | bounds <- list(
238 |   epochs = c(1L, 50L),   ## Number of times the model gets fed the data
239 |   units = c(27L, 50L),   ## Number of nodes in each layer
240 |   layers = c(1L, 5L),    ## Between 1 and 5 hidden layers
241 |   learnrate = c(0.001, 0.1)
242 | )
243 | 
244 | 
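## Before launching the full run, it can be worth a quick sanity check that the
## scoring function executes end-to-end and returns what bayesOpt expects (a list
## with a 'Score' element). A minimal sketch - the hyperparameter values here are
## arbitrary test values, not tuned choices:
# test_run <- scorefunction(epochs = 2L, units = 30L, layers = 1L, learnrate = 0.01)
# str(test_run)  ## Should show a list with a single (negative RMSE) 'Score' element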
###############################################################################
245 | ## FULL MODEL: Running Bayesian optimisation to find optimal hyperparameters ##
246 | ###############################################################################
247 | 
248 | ## Define the Bayesian Optimisation function and run it ##
249 | ## Also keep track of time ##
250 | start <- Sys.time()
251 | 
252 | ## The function takes in the scoring function we defined above, the bounds to ##
253 | ## search within, the number of 'initialisations' to run first, then the number ##
254 | ## of further iterations used to try to find the global optimum ##
255 | 
256 | bayesian_neural <- bayesOpt(
257 |   FUN = scorefunction,
258 |   bounds = bounds,
259 |   initPoints = 25,
260 |   iters.n = 25,
261 |   iters.k = 1,
262 |   parallel = FALSE,
263 |   plotProgress = FALSE,
264 |   verbose = 2,
265 |   errorHandling = "continue",
266 |   acq = "ei"
267 | )
268 | 
269 | end <- Sys.time()
270 | end-start
271 | 
272 | ## Summarise the Bayesian Optimisation run ##
273 | bayesian_neural$scoreSummary
274 | 
275 | plot(bayesian_neural)
276 | 
277 | bestpars <- getBestPars(bayesian_neural)
278 | bestpars
279 | 
280 | # Extract the optimal hyperparameter config to fit the final model on the whole dataset #
# Use [[ ]] indexing so each value comes out as a scalar rather than a one-element list #
281 | best_epochs <- as.integer(bestpars[[1]])
282 | best_units <- as.integer(bestpars[[2]])
283 | best_layers <- as.integer(bestpars[[3]])
284 | best_learnrate <- as.numeric(bestpars[[4]])
285 | 
286 | callbacks_list <- list(
287 |   callback_reduce_lr_on_plateau(monitor = "loss", factor = 0.5, patience = 3),
288 |   callback_early_stopping(monitor = "loss", min_delta = 0.0001, patience = 5)
289 | )
290 | 
291 | ## Use the same format as above - define the 'final_model' and fit it ##
292 | final_model <- keras_model_sequential()
293 | for(i in 1:best_layers){
294 |   final_model %>%
295 |     layer_dense(units = best_units, activation = 'relu',
296 |                 input_shape=c(27))
297 | }
298 | final_model %>% layer_dense(units = 1, activation = "linear") %>%
299 |   compile(
300 |     optimizer = optimizer_adam(lr=best_learnrate),
301 |     loss = rmse,
302 |     metrics = "mae"
303 |   )
304 | 
305 | ## Basic characteristics of the final model, e.g. number of layers and parameters ##
306 | summary(final_model)
307 | 
308 | ############################
309 | # Fit final neural network #
310 | ############################
311 | 
312 | ## Fit the model with said config to the whole dataset, and save it ##
313 | final_model %>% fit(x_train, label_train, epochs=best_epochs,
314 |                     batch_size=1024, verbose=2, callbacks=callbacks_list)
315 | 
316 | ## Save the model ##
317 | setwd("/models/Endpoint_3/")
318 | save_model_hdf5(final_model, "ep3_neuralnetwork_final.h5")
319 | 
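## Note: the model was compiled with a custom loss, so reloading the saved .h5 file
## later needs that loss supplied via custom_objects (or compile = FALSE if only
## predictions are needed). A minimal sketch, assuming the same rmse function is
## defined in the session doing the loading:
# final_model <- load_model_hdf5("ep3_neuralnetwork_final.h5",
#                                custom_objects = list(rmse = rmse))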
320 | ###############################################################################################
321 | ###############################################################################################
322 | 
323 | ## Clear out the environment to start fresh for evaluation ##
324 | rm(list = ls())
325 | 
326 | 
327 | ## MODEL EVALUATION STRATEGY ##
328 | 
329 | #############################
330 | ## Load in packages needed ##
331 | #############################
332 | library(survival)
333 | library(readr)
334 | library(stringr)
335 | library(caret)
336 | library(haven)
337 | library(ParBayesianOptimization)
338 | library(keras)
339 | library(tensorflow)
340 | 
341 | ########################################################
342 | ## Check and set working directory for importing data ##
343 | ########################################################
344 | ## Here, we need to separately import and format the Period 1 and Period 2 data ##
345 | getwd()
346 | setwd("/final_datasets/ML/")
347 | 
348 | ## Importing the Period 1 sub-cohort (model fitting) separately from the Period 2 sub-cohort (model evaluation) ##
349 | data_period1 <- read_dta("OX129_endpoint3_stacked50_period1_pseudovalues.dta")
350 | str(data_period1)
351 | 
352 | ## Need to reformat categorical variables ##
353 | ## Categorical variables need to be converted into dummy variables ##
354 | ## First, reformat relevant parameters as factors, then convert to dummies ##
355 | 
356 | data_period1$smoke_cat <- factor(data_period1$smoke_cat)          # Smoking status
357 | data_period1$cancer_route <- factor(data_period1$cancer_route)    # Route to diagnosis
358 | data_period1$pr_status2 <- factor(data_period1$pr_status2)        # PR +/-
359 | data_period1$her2_status2 <- factor(data_period1$her2_status2)    # HER2 +/-
360 | data_period1$er_status2 <- factor(data_period1$er_status2)        # ER +/-
361 | data_period1$radiotherapy <- factor(data_period1$radiotherapy)    # Use of RTx in 1st year
362 | data_period1$mastectomy <- factor(data_period1$mastectomy)        # Mastectomy in 1st year
363 | data_period1$other_surgery <- factor(data_period1$other_surgery)  # Other surgery in 1st year
364 | data_period1$cancer_stage <- factor(data_period1$cancer_stage)    # Stage (I-IV)
365 | data_period1$cancer_grade <- factor(data_period1$cancer_grade)    # Grade (differentiation)
366 | data_period1$hrt <- factor(data_period1$hrt)                      # Hormone replacement therapy use
367 | data_period1$sha1 <- factor(data_period1$sha1)                    # Region - used for IECV
368 | 
369 | ## Now, start converting these factors to dummies ##
370 | ## Set up list of the factor columns that are replaced by dummies ##
371 | dummy_parameters <- c('smoke_cat', 'radiotherapy', 'mastectomy', 'other_surgery',
372 |                       'hrt', 'cancer_stage', 'cancer_grade', 'pr_status2',
373 |                       'er_status2', 'her2_status2', 'cancer_route')
374 | 
375 | ## The other (binary) predictors are handled as they are ##
376 | ## Generate the dummies ##
377 | dummies <- dummyVars(~ smoke_cat + radiotherapy + mastectomy +
378 |                        other_surgery + hrt + cancer_stage +
379 |                        cancer_grade + pr_status2 + her2_status2 + er_status2 +
380 |                        cancer_route, data = data_period1)
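## Note: dummyVars() defaults to full one-hot encoding (a column for every factor
## level), which the renaming steps below rely on. A minimal sketch of the
## alternative, assuming a reference level should be dropped (e.g. for models
## sensitive to collinearity) - not used in this pipeline:
# dummies_fullrank <- dummyVars(~ smoke_cat + cancer_route, data = data_period1,
#                               fullRank = TRUE)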
381 | 
382 | # Package the dummy variables into their own dataset, ready to bind with the
383 | # other, non-dummy parameters #
384 | dummied <- as.data.frame(predict(dummies, newdata=data_period1))
385 | 
386 | # Form the ready-to-process dataset by binding the new dummies with the other
387 | # numeric parameters #
388 | data_period1 <- cbind(data_period1[, -c(which(colnames(data_period1) %in% dummy_parameters))],
389 |                       dummied)
390 | 
391 | # Use the normalise function - continuous variables are to be normalised/scaled to
392 | # be between 0 and 1
393 | 
394 | normalise <- function(x) {
395 |   return ((x - min(x)) / (max(x) - min(x)))
396 | }
397 | 
398 | data_period1$age_at_diagnosis <- normalise(data_period1$age_at_diagnosis)
399 | data_period1$bmi <- normalise(data_period1$bmi)
## Note: the min/max used here come from the period 1 data alone; the period 2 data ##
## below are rescaled with their own min/max, so the two scalings are assumed to be ##
## comparable rather than shared across periods. ##
400 | 
401 | 
402 | ## Set up the predictors ('x cols') and outcome ('y cols') ##
403 | ## Rename variables (inc. new dummies) where relevant for interpretation of
404 | ## variable importance and other graphs later ##
405 | colnames(data_period1)[colnames(data_period1) == 'smoke_cat.0'] <- 'Non_smoker'
406 | colnames(data_period1)[colnames(data_period1) == 'smoke_cat.1'] <- 'Ex_smoker'
407 | colnames(data_period1)[colnames(data_period1) == 'smoke_cat.2'] <- 'Light_smoker'
408 | colnames(data_period1)[colnames(data_period1) == 'smoke_cat.3'] <- 'Moderate_smoker'
409 | colnames(data_period1)[colnames(data_period1) == 'smoke_cat.4'] <- 'Heavy_smoker'
410 | colnames(data_period1)[colnames(data_period1) == 'radiotherapy.1'] <- 'Radiotherapy'
411 | colnames(data_period1)[colnames(data_period1) == 'mastectomy.1'] <- 'Mastectomy'
412 | colnames(data_period1)[colnames(data_period1) == 'other_surgery.1'] <- 'Other_surgery'
413 | colnames(data_period1)[colnames(data_period1) == 'hrt.1'] <- 'HRT_use'
414 | colnames(data_period1)[colnames(data_period1) == 'cancer_stage.1'] <- 'Stage1'
415 | colnames(data_period1)[colnames(data_period1) == 'cancer_stage.2'] <- 'Stage2'
416 | colnames(data_period1)[colnames(data_period1) == 'cancer_stage.3'] <- 'Stage3'
417 | colnames(data_period1)[colnames(data_period1) == 'cancer_stage.4'] <- 'Stage4'
418 | colnames(data_period1)[colnames(data_period1) == 'cancer_grade.1'] <- 'Well_differentiated'
419 | colnames(data_period1)[colnames(data_period1) == 'cancer_grade.2'] <- 'Moderately_differentiated'
420 | colnames(data_period1)[colnames(data_period1) == 'cancer_grade.3'] <- 'Poorly_differentiated'
421 | colnames(data_period1)[colnames(data_period1) == 'pr_status2.3'] <- 'PR_positive'
422 | colnames(data_period1)[colnames(data_period1) == 'her2_status2.3'] <- 'HER2_positive'
423 | colnames(data_period1)[colnames(data_period1) == 'er_status2.3'] <- 'ER_positive'
424 | colnames(data_period1)[colnames(data_period1) == 'cancer_route.2'] <- 'Emergency_presentation'
425 | colnames(data_period1)[colnames(data_period1) == 'cancer_route.3'] <- 'GP_referral'
426 | colnames(data_period1)[colnames(data_period1) == 'cancer_route.4'] <- 'Inpatient_elective'
427 | colnames(data_period1)[colnames(data_period1) == 'cancer_route.5'] <- 'Other_outpatient_pathway'
428 | colnames(data_period1)[colnames(data_period1) == 'cancer_route.6'] <- 'Screening_detected'
429 | colnames(data_period1)[colnames(data_period1) == 'cancer_route.7'] <- 'Two_week_wait'
430 | 
431 | ## 'pseudo' are the jack-knife pseudo-values for the Aalen-Johansen CIF at 10 years ##
432 | x_cols <- c('age_at_diagnosis', 'bmi', 'Non_smoker', 'Ex_smoker', 'Light_smoker',
433 |             'Moderate_smoker', 'Heavy_smoker', 'Radiotherapy', 'Mastectomy',
434 | 
'Other_surgery', 'HRT_use', 'Stage1', 'Stage2', 'Stage3', 'Stage4', 'Well_differentiated', 435 | 'Moderately_differentiated', 'Poorly_differentiated', 'PR_positive', 436 | 'HER2_positive', 'ER_positive', 'Emergency_presentation', 'GP_referral', 'Inpatient_elective', 437 | 'Other_outpatient_pathway', 'Screening_detected', 'Two_week_wait') 438 | 439 | ## Target for model fit is Period 1 pseudovalues ## 440 | ## Target for the evaluation are the Period 2 pseudovalues ## 441 | y_cols <- c('period1_pseudo') 442 | 443 | ###################################### 444 | ## NOW, REPEAT FOR PERIOD 2 DATASET ## 445 | ###################################### 446 | 447 | data_period2 <- read_dta("OX129_endpoint3_stacked50_period2_pseudovalues.dta") 448 | str(data_period2) 449 | ## Categorical variables need to be converted into dummy variables ## 450 | ## First, reformat relevant parameters as factors, then convert to dummies # 451 | data_period2$smoke_cat <- factor(data_period2$smoke_cat) # Smoking status 452 | data_period2$cancer_route <- factor(data_period2$cancer_route) # Route to diagnosis 453 | data_period2$pr_status2 <- factor(data_period2$pr_status2) # PR +/- 454 | data_period2$her2_status2 <- factor(data_period2$her2_status2) # HER2 +/- 455 | data_period2$er_status2 <- factor(data_period2$er_status2) # ER +/- 456 | data_period2$radiotherapy <- factor(data_period2$radiotherapy) # Use of RTx in 1st year 457 | data_period2$mastectomy <- factor(data_period2$mastectomy) # Mastectomy in 1st year 458 | data_period2$other_surgery <- factor(data_period2$other_surgery) # Other surgery in 1st year 459 | data_period2$cancer_stage <- factor(data_period2$cancer_stage) # Stage (I-IV) 460 | data_period2$cancer_grade <- factor(data_period2$cancer_grade) # Grade (differentiation) 461 | data_period2$hrt <- factor(data_period2$hrt) # Hormone replacement therapy use 462 | data_period2$sha1 <- factor(data_period2$sha1) # Region - used for IECV 463 | 464 | # Generate the dummies # 465 | dummies2 <- dummyVars(~ smoke_cat + radiotherapy + mastectomy + 466 | other_surgery + hrt + cancer_stage + 467 | cancer_grade + pr_status2 + her2_status2 + er_status2 + 468 | cancer_route, data = data_period2) 469 | 470 | # Package the dummy variables into its own dataset, ready to bind with the 471 | # other, non-dummy parameters # 472 | dummied2 <- as.data.frame(predict(dummies2, newdata=data_period2)) 473 | 474 | # Form the ready-to-process dataset by binding the new dummies with the other 475 | # numeric parameters # 476 | data_period2 <- cbind(data_period2[, -c(which(colnames(data_period2) %in% 477 | dummy_parameters))], dummied2) 478 | 479 | data_period2$age_at_diagnosis <- normalise(data_period2$age_at_diagnosis) 480 | data_period2$bmi <- normalise(data_period2$bmi) 481 | 482 | colnames(data_period2)[colnames(data_period2) == 'smoke_cat.0'] <- 'Non_smoker' 483 | colnames(data_period2)[colnames(data_period2) == 'smoke_cat.1'] <- 'Ex_smoker' 484 | colnames(data_period2)[colnames(data_period2) == 'smoke_cat.2'] <- 'Light_smoker' 485 | colnames(data_period2)[colnames(data_period2) == 'smoke_cat.3'] <- 'Moderate_smoker' 486 | colnames(data_period2)[colnames(data_period2) == 'smoke_cat.4'] <- 'Heavy_smoker' 487 | colnames(data_period2)[colnames(data_period2) == 'radiotherapy.1'] <- 'Radiotherapy' 488 | colnames(data_period2)[colnames(data_period2) == 'mastectomy.1'] <- 'Mastectomy' 489 | colnames(data_period2)[colnames(data_period2) == 'other_surgery.1'] <- 'Other_surgery' 490 | colnames(data_period2)[colnames(data_period2) == 'hrt.1'] <- 
'HRT_use'
491 | colnames(data_period2)[colnames(data_period2) == 'cancer_stage.1'] <- 'Stage1'
492 | colnames(data_period2)[colnames(data_period2) == 'cancer_stage.2'] <- 'Stage2'
493 | colnames(data_period2)[colnames(data_period2) == 'cancer_stage.3'] <- 'Stage3'
494 | colnames(data_period2)[colnames(data_period2) == 'cancer_stage.4'] <- 'Stage4'
495 | colnames(data_period2)[colnames(data_period2) == 'cancer_grade.1'] <- 'Well_differentiated'
496 | colnames(data_period2)[colnames(data_period2) == 'cancer_grade.2'] <- 'Moderately_differentiated'
497 | colnames(data_period2)[colnames(data_period2) == 'cancer_grade.3'] <- 'Poorly_differentiated'
498 | colnames(data_period2)[colnames(data_period2) == 'pr_status2.3'] <- 'PR_positive'
499 | colnames(data_period2)[colnames(data_period2) == 'her2_status2.3'] <- 'HER2_positive'
500 | colnames(data_period2)[colnames(data_period2) == 'er_status2.3'] <- 'ER_positive'
501 | colnames(data_period2)[colnames(data_period2) == 'cancer_route.2'] <- 'Emergency_presentation'
502 | colnames(data_period2)[colnames(data_period2) == 'cancer_route.3'] <- 'GP_referral'
503 | colnames(data_period2)[colnames(data_period2) == 'cancer_route.4'] <- 'Inpatient_elective'
504 | colnames(data_period2)[colnames(data_period2) == 'cancer_route.5'] <- 'Other_outpatient_pathway'
505 | colnames(data_period2)[colnames(data_period2) == 'cancer_route.6'] <- 'Screening_detected'
506 | colnames(data_period2)[colnames(data_period2) == 'cancer_route.7'] <- 'Two_week_wait'
507 | 
508 | ## Check dimensions of two sub-sets ##
509 | dim(data_period1)
510 | dim(data_period2)
511 | 
512 | ## Clean up memory by removing the dummied interim datasets
513 | rm(dummied)
514 | rm(dummied2)
515 | 
516 | ###############################################
517 | ## Define custom loss function for the model ##
518 | ###############################################
519 | 
520 | ## Define root mean squared error loss function ##
521 | ## This is used to fit the neural network model ##
522 | 
523 | rmse <- function(y_true, y_pred){
524 |   K <- backend()  ## Access the Keras backend for tensor operations
525 |   loss <- K$sqrt(K$mean((y_true - y_pred)^2))
526 |   return(loss)
527 | }
528 | 
529 | ## Set up callbacks ##
530 | callbacks_list <- list(
531 |   callback_reduce_lr_on_plateau(monitor = "loss", factor = 0.5, patience = 3),
532 |   callback_early_stopping(monitor = "loss", min_delta = 0.0001, patience = 5)
533 | )
534 | 
535 | ##############################################################
536 | ## Define scoring function for nested cross-validation/IECV ##
537 | ##############################################################
538 | 
539 | ## Need to use the period 1 data for tuning with 5-fold cross-validation ##
540 | ## Use the period 2 data as the completely held-out test set(s) ##
541 | ## Below, we set up a loop that iterates over each region for IECV; first, we ##
542 | ## define the Bayesian Optimisation scoring function that is embedded in that loop ##
543 | set.seed(2066)
544 | 
545 | scorefunction <- function(epochs, units, layers, learnrate){
546 | 
547 |   set.seed(2066)
548 | 
549 |   k <- 5
550 |   indices <- sample(1:nrow(data_fit))  ## 'data_fit' will be generated from the data_period1 subset; random permutation of its rows
551 |   folds <- cut(1:length(indices), breaks = k, labels = FALSE)
552 | 
553 |   iecv_score <- c()
554 | 
555 |   for (a in 1:k) {
556 | 
557 |     iecv_val_indices <- indices[which(folds==a)]  ## Index via 'indices' so the folds are random rather than contiguous blocks
558 | 
559 |     iecv_val_data <- data_fit[iecv_val_indices, ]    ## Validation is the held-out 1/5th
560 |     iecv_val_labels <- label_fit[iecv_val_indices]   ## for the CV loop for hyperparameter tuning
561 | 
562 | 
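    ## The remaining 4/5 of the rows form the fitting folds for this CV iteration ##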
    iecv_fit_data <- data_fit[-iecv_val_indices, ]
563 |     iecv_fit_labels <- label_fit[-iecv_val_indices]
564 | 
565 |     bayes_model <- keras_model_sequential()  ## Instantiate keras model
566 |     for(b in 1:layers){                      ## Loop added to vary the number of hidden layers
567 |       bayes_model %>%
568 |         layer_dense(units = units, activation = 'relu',
569 |                     input_shape=c(27))
570 |     }
571 |     bayes_model %>% layer_dense(units = 1, activation = "linear") %>%
572 |       compile(
573 |         optimizer = optimizer_adam(lr=learnrate),
574 |         loss = rmse,     ## Use the custom loss function defined above
575 |         metrics = "mse"  ## Reported as the model fits, but not the loss function
576 |       )
577 |     ## Fit the model - batch size = 1024 ##
578 |     bayes_model %>% fit(iecv_fit_data, iecv_fit_labels, epochs=epochs,
579 |                         batch_size=1024, verbose=0, callbacks = callbacks_list,
580 |                         validation_data = list(iecv_val_data, iecv_val_labels))
581 | 
582 |     fold_predictions <- predict(bayes_model, iecv_val_data)  ## Local name, to avoid clashing with the outer iecv_predictions store
583 |     negative_rmse <- -1*sqrt(mean((fold_predictions-iecv_val_labels)^2))
584 |     iecv_score <- c(iecv_score, negative_rmse)
585 | 
586 |     rm(bayes_model)
587 |   }
588 | 
589 |   return(list(Score = mean(iecv_score)))  ## Return the score (negative RMSE obtained with each combination) to the Bayes Optim algorithm
590 | 
591 | }
592 | 
593 | #############################################################
594 | ## Define hyperparameter search space - 'bounds' to search ##
595 | #############################################################
596 | 
597 | bounds <- list(
598 |   epochs = c(1L, 20L),   ## Number of times the model gets fed the data
599 |   units = c(27L, 50L),   ## Number of nodes in each layer
600 |   layers = c(1L, 5L),    ## Between 1 and 5 hidden layers
601 |   learnrate = c(0.001, 0.1)
602 | )
603 | 
604 | ######################################################
605 | ## Define Bayesian Optimisation function and run it ##
606 | ######################################################
607 | 
608 | start <- Sys.time()  ## Time taken is of interest
609 | 
610 | iecv_predictions <- list()  ## List to store each region's predictions
611 | 
612 | for(d in 1:10) {  ## 10 regions (sha1) - loop over each of them
613 | 
614 |   set.seed(1066)
615 | 
616 |   data_1 <- data_period1[ -which (data_period1$sha1 ==
617 |                                     levels(data_period1$sha1)[d]), ]  ## Fit data is all regions except region 'd'
618 |   data_fit <- as.matrix(data_1[, x_cols])    ## Form this into a matrix of predictors
619 |   label_fit <- as.matrix(data_1[, y_cols])   ## Form a matrix of their pseudovalues
620 | 
621 |   data_2 <- data_period2[ which (data_period2$sha1 ==
622 |                                    levels(data_period2$sha1)[d]), ]  ## Validation data is region 'd'
623 |   data_val <- as.matrix(data_2[, x_cols])  ## Package the matrix of predictors - no need for 'target' pseudovalues here
624 | 
625 |   bayesian_neural <- bayesOpt(  ## Bayesian Optimisation command; runs the scoring function above within this cross-validation loop
626 |     FUN = scorefunction,
627 |     bounds = bounds,
628 |     initPoints = 25,
629 |     iters.n = 25,
630 |     iters.k = 1,
631 |     parallel = FALSE,
632 |     plotProgress = FALSE,
633 |     verbose = 2,
634 |     errorHandling = "continue",
635 |     acq = "ei"
636 |   )
637 | 
638 |   ## Summarise the Bayesian Optimisation run ##
639 |   bayesian_neural$scoreSummary
640 |   plot(bayesian_neural)
641 |   bestpars <- getBestPars(bayesian_neural)  ## Best parameter combination found in this iteration
642 |   bestpars
643 | 
644 |   # Extract the optimal hyperparameter config to fit the final model on the whole 'fit' dataset #
  # Use [[ ]] indexing so each value comes out as a scalar rather than a one-element list #
645 |   best_epochs <- as.integer(bestpars[[1]])
646 |   best_units <- as.integer(bestpars[[2]])
647 |   best_layers <- as.integer(bestpars[[3]])
648 | 
  best_learnrate <- as.numeric(bestpars[[4]])
649 | 
650 |   ## Use the same format as above - define this region's IECV model and fit it ##
651 |   iecv_model <- keras_model_sequential()
652 |   for(e in 1:best_layers){
653 |     iecv_model %>%
654 |       layer_dense(units = best_units, activation = 'relu',
655 |                   input_shape=c(27))
656 |   }
657 |   iecv_model %>% layer_dense(units = 1, activation = "linear") %>%
658 |     compile(
659 |       optimizer = optimizer_adam(lr=best_learnrate),
660 |       loss = rmse,
661 |       metrics = "mse"
662 |     )
663 | 
664 |   iecv_model %>% fit(data_fit, label_fit, epochs=best_epochs,
665 |                      batch_size=1024, verbose=0)  ## Fit the model to period 1 data from all regions except 'd'
666 | 
667 |   predictions <- predict(iecv_model, data_val)  ## Test the model on period 2 data for region 'd'
668 |   iecv_predictions[[d]] = predictions  ## Store the predictions as the loop iterates
669 | 
670 |   rm(iecv_model)  ## Clear out so that the model can be refitted on the next loop iteration without stacking networks...
671 | 
672 | }
673 | 
674 | end <- Sys.time()
675 | end-start  ## Took 2.08 days on my set-up with GPU
676 | 
677 | 
678 | ###########################################
679 | # Combine predictions generated from IECV #
680 | ###########################################
681 | nnet_iecv_predictions <- c(iecv_predictions[[1]], iecv_predictions[[2]],
682 |                            iecv_predictions[[3]], iecv_predictions[[4]],
683 |                            iecv_predictions[[5]], iecv_predictions[[6]],
684 |                            iecv_predictions[[7]], iecv_predictions[[8]],
685 |                            iecv_predictions[[9]], iecv_predictions[[10]])
686 | 
687 | summary(nnet_iecv_predictions)
688 | 
689 | ## Combine with individual ID numbers so they can be linked up later ##
690 | 
691 | patid <- as.data.frame(data_period2$patid)
692 | export_predictions <- cbind(patid, nnet_iecv_predictions)
693 | head(export_predictions)
694 | 
695 | ## Save these for use in Stata for evaluation ##
696 | 
697 | setwd("/estimates/ML_IECV_predictions/")
698 | write.csv(export_predictions, "endpoint3_nnet_iecv_predictions.csv")
699 | 
700 | 
701 | ###############################################################################################
702 | ###############################################################################################
703 | ###############################################################################################
704 | 
705 | 
--------------------------------------------------------------------------------