├── .gitignore ├── requirements.txt ├── datamap.py ├── README.md ├── basic_statistics.py ├── test-demog-cl.R ├── dataset.py ├── pt_embedding.py ├── features.py ├── clustering.py ├── behavioral_phenotyping_pipeline.ipynb └── visualization.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .*/ 3 | *.csv 4 | *.pkl 5 | data/ 6 | logs/ 7 | tmp/ 8 | utils.py 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | sqlalchemy 2 | pandas 3 | numpy 4 | scikit-learn 5 | scipy 6 | glove 7 | jupyter 8 | matplotlib 9 | bokeh 10 | umap-learn 11 | dataclasses 12 | google-api-python-client 13 | gsheets 14 | oauth2client 15 | httplib2 16 | seaborn 17 | torch -------------------------------------------------------------------------------- /datamap.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | from utils import select_clm 4 | import numpy as np 5 | 6 | flags = None 7 | logger = logging.getLogger('datamap') 8 | 9 | 10 | def levels_datamap(table_dict): 11 | """ Returns a dataframe with a boolean vector per 12 | instrument per level to select the columns correspondent 13 | to the desired level 14 | 15 | Parameters 16 | ---------- 17 | table_dict: dict 18 | dictionary of tables (df) from the database already 19 | filtered 20 | Returns 21 | ------- 22 | dataframe 23 | instruments x levels, each element is a boolean vector 24 | """ 25 | cselect_dict = {} 26 | insname_list = [] 27 | for table, df in table_dict.items(): 28 | insname_list.append(table) 29 | for lev in range(1, 5): 30 | if table in select_clm[lev]: 31 | clm_list = _col_select(lev, table, 32 | df.columns) 33 | cselect_dict.setdefault(lev, list()).append(clm_list) 34 | else: 35 | logger.info("Not considered table {0}".format(table)) 36 | selectcol_df = pd.DataFrame(cselect_dict, 37 | index=insname_list).sort_index() 38 | return selectcol_df 39 | 40 | 41 | """ 42 | Private Functions 43 | """ 44 | 45 | 46 | def _col_select(lev, instrument, 47 | clm_names): 48 | """ Given a table and a depth level, it returns a boolean array 49 | storing the columns to select. 50 | 51 | Parameters 52 | ---------- 53 | lev: int 54 | Level depth 55 | instrument: str 56 | Instrument name 57 | clm_names: Index object 58 | 59 | Returns 60 | ------- 61 | array 62 | Array of integers with the columns to select 63 | """ 64 | cselect_list = np.array([1, 1, 1, 1], dtype=int) 65 | for col in clm_names[4:]: 66 | if col in select_clm[lev][instrument]: 67 | cselect_list = np.append(cselect_list, [int(1)]) 68 | else: 69 | cselect_list = np.append(cselect_list, [int(0)]) 70 | return cselect_list 71 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Behavioral phenotyping project 2 | 3 | Behavioral data embeddings for the stratification of individuals 4 | with neurodevelopmental conditions. 5 | 6 | Designed for observational measurements of cognition and behavior of individuals with 7 | Autism Spectrum Conditions (ASCs). 8 | 9 | #### TODO: Abstract 10 | 11 | ### Technical Requirements 12 | 13 | ``` 14 | Python 3.6+ 15 | 16 | R 3.4+ 17 | ``` 18 | 19 | The full list of required Python Packages is available in `requrirements.txt` file. 
It is possible 20 | to install all the dependencies by running: 21 | 22 | ```bash 23 | $ pip install -r requirements.txt 24 | ``` 25 | 26 | ## Behavioural Phenotyping Pipeline (TLDR ;)) 27 | 28 | A complete example of the _Behavioural Phenotype Stratification_ is available 29 | as a Jupyter notebook: 30 | 31 | ``` 32 | jupyter notebook behavioral_phenotyping_pipeline.ipynb 33 | ``` 34 | 35 | ### Documentation (at a glance) 36 | 37 | The code is structured into multiple modules (`.py` files), including algorithms and methods 38 | for the multiple steps of the pipeline: 39 | 40 | * `dataset.py`: Connects to the database and dumps data 41 | * `features.py`: Returns vocabulary and dictionary of behavioral *EHRs* for each of the 4 possible depth levels. 42 | It also returns a dataset with quantitative scores for level 4 features 43 | * `pt_embedding.py`: Performs TFIDF for patient embeddings; GloVe embeddings on words, averaged to obtain 44 | subject embeddings; Word2vec embeddings on words, which are then averaged to output individual representations 45 | * `clustering.py`: Performs Hierarchical Clustering/k-means on embeddings, and on quantitative 4th-level features 46 | * `visualization.py`: Visualizes results (e.g., _scatterplot & dendrogram_) for sub-cluster visualization; 47 | _Heatmap_ for inspection of quantitative scores between sub-clusters 48 | * `basic_statistics.py`: Returns basic demographic statistics for dataset description 49 | * `test-demog-cl.R`: Runs multiple pairwise comparisons between subgroups 50 | to check for confounders and support clinical validation 51 | 52 | 53 | #### TODO: Paper, Poster, Conference Reference 54 | 55 | #### TODO: Credits and Acknowledgements 56 | 57 | 58 | -------------------------------------------------------------------------------- /basic_statistics.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import utils as ut 3 | import os 4 | import logging 5 | from datetime import datetime 6 | import matplotlib.pyplot as plt 7 | 8 | # Create a custom logger, logging to file 9 | logger = logging.getLogger('descriptive_statistics') 10 | 11 | # Create handlers 12 | c_handler = logging.FileHandler('./logs/descriptive_statistics.log', 13 | mode='w') 14 | c_handler.setLevel(logging.INFO) 15 | 16 | # Create formatters and add it to handlers 17 | c_format = logging.Formatter('%(message)s') 18 | c_handler.setFormatter(c_format) 19 | 20 | # Add handlers to the logger 21 | logger.addHandler(c_handler) 22 | 23 | 24 | class DataStatistics: 25 | """Class for data statistics computation.""" 26 | 27 | def compute(self, data_dir): 28 | """Compute basic statistics and save output to log file.
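Reads person-demographics.csv and person-encounters.csv from the given data folder, logs subject counts, sex distribution, instrument and assessment summaries and the covered time span, and saves a per-subject encounter histogram (n-encounter.png).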
29 | 30 | Parameter 31 | --------- 32 | data_dir: str 33 | directory name where to save log file 34 | """ 35 | pd.set_option('float_format', '{:.3f}'.format) 36 | 37 | dem = pd.read_csv(os.path.join(ut.DATA_FOLDER_PATH, data_dir, 38 | 'person-demographics.csv'), 39 | sep=',', 40 | header=0) 41 | enc = pd.read_csv(os.path.join(ut.DATA_FOLDER_PATH, data_dir, 42 | 'person-encounters.csv'), 43 | sep=',', 44 | header=0) 45 | dem['AGE'] = list(map(lambda x: self.__age(x), dem.DOB.tolist())) 46 | 47 | logger.info('N of subjects: %d\n', len(dem.ID_SUBJ.unique())) 48 | logger.info('%s\n', pd.crosstab(dem.SEX, columns='count')) 49 | logger.info('%s\n', 50 | dem.describe()) 51 | 52 | logger.info("Instrument list:") 53 | for ins in sorted(enc.INSTRUMENT.unique()): 54 | logger.info('%s', ins) 55 | logger.info('\n%s\n', 56 | enc.describe()) 57 | # Consider assessment as number of administered instruments 58 | ass_dict = {} 59 | for _, row in enc.iterrows(): 60 | ass_dict.setdefault(row.ID_SUBJ, list()).append(row.INSTRUMENT) 61 | count_ass = {'pid': list(ass_dict.keys()), 62 | 'ass_count': [len(ass_dict[pid]) for pid in ass_dict]} 63 | logger.info("Assessment (i.e., administered instrument counts) statistics:") 64 | logger.info('%s\n', pd.DataFrame(count_ass).describe()) 65 | 66 | # return period span 67 | doa_vec = [list(map(int, el.split('/'))) for el in enc.DOA.tolist()] 68 | doa_min = min(doa_vec, key=lambda x: (x[-1], x[1])) 69 | doa_max = max(doa_vec, key=lambda x: (x[-1], x[1])) 70 | logger.info(f'Period span: {doa_min} -- {doa_max}\n') 71 | 72 | # plot histogram with number of encounters 73 | plt.figure(figsize=(40, 20)) 74 | plt.bar(dem.ID_SUBJ, dem.N_ENC) 75 | plt.tick_params(axis='x', rotation=90) 76 | plt.tick_params(axis='y', labelsize=30) 77 | plt.savefig(os.path.join(ut.DATA_FOLDER_PATH, 78 | data_dir, 79 | 'n-encounter.png')) 80 | plt.close() 81 | 82 | @staticmethod 83 | def __age(dob): 84 | """ 85 | Parameters 86 | ---------- 87 | dob: str 88 | date of birth in format %d/%m/%Y 89 | 90 | Return 91 | ------ 92 | float 93 | age from birth date 94 | """ 95 | days_in_year = 365.2425 96 | dt_dob = datetime.strptime(dob, '%d/%m/%Y') 97 | current_age = (datetime.today() - dt_dob).days / days_in_year 98 | return current_age 99 | -------------------------------------------------------------------------------- /test-demog-cl.R: -------------------------------------------------------------------------------- 1 | # Post-hoc analyses: 2 | # - Check confounders; 3 | # - Compare variable scores; 4 | # - Run external validation (TBD). 5 | 6 | # LIBRARIES 7 | require(eeptools) 8 | require(reshape2) 9 | require(ggplot2) 10 | require(GGally) 11 | require(plyr) 12 | require(tidyr) 13 | 14 | # FUNCTIONS 15 | # Pairwise chi-square test function 16 | pairwise.chisq.test <- function(x, g, p.adjust.method = p.adjust.methods, ...) 
{ 17 | DNAME <- paste(deparse(substitute(x)), "and", deparse(substitute(g))) 18 | g <- factor(g) 19 | p.adjust.method <- match.arg(p.adjust.method) 20 | 21 | compare.levels <- function(i, j) { 22 | idx <- which(as.integer(g) == i | as.integer(g) == j) 23 | xij <- x[idx] 24 | gij <- as.character(g[idx]) 25 | gij <- as.factor(gij) 26 | print(table(xij, gij)) 27 | chisq.test(xij, gij, ...)$p.value 28 | } 29 | PVAL <- pairwise.table(compare.levels, levels(g), p.adjust.method) 30 | ans <- list(method = "chi-squared test", 31 | data.name = DNAME, 32 | p.value = PVAL, 33 | p.adjust.method = p.adjust.method) 34 | class(ans) <- "pairwise.htest" 35 | ans 36 | } 37 | 38 | # DATA PATH AND FILE 39 | DATA_PATH <- '~/Documents/behavioral_phenotyping/data' 40 | FILE_NAME <- 'df_w2vemb_level4.csv' 41 | PLOT_NAME <- 'feat_dist_hc_w2v_level4.pdf' 42 | 43 | # RUN ANALYSES 44 | # Read table 45 | df <- read.table(file.path(DATA_PATH, FILE_NAME), 46 | sep = ',', 47 | header = TRUE, 48 | as.is = TRUE) 49 | df <- subset(df, select = c(clpid, sex, bdate, aoa, 50 | n_enc, feat, score_sc, score)) 51 | # Add current age column to dataframe 52 | df$cage <- age_calc(as.Date(df$bdate, "%d/%m/%Y"), 53 | units = 'years') 54 | df <- df[order(df$clpid),] 55 | 56 | # Inspect confounders 57 | df_conf <- unique(data.frame(pid = apply(t(df$clpid), 2, 58 | function(x) strsplit(x, '-')[[1]][2]), 59 | cluster = as.factor(apply(t(df$clpid), 2, 60 | function(x) strsplit(x, '-')[[1]][1])), 61 | cage = df$cage, 62 | sex = df$sex, 63 | n_enc = df$n_enc), by = 'pid') 64 | # Add to confounder df the behr length for each subject 65 | lenbehr <- ddply(df, .(clpid), nrow)$V1 66 | df_conf$lenbehr <- lenbehr 67 | 68 | # Tests: 69 | # (1) age mean differences between clusters (pairwise t-test with Bonferroni correction); 70 | # (2) average number of encounters between clusters (pairwise t-test with Bonferroni correction); 71 | # (3) sex counts via chi-squared test with Bonferroni correction. 72 | 73 | print("AGE per cluster (M, SD):") 74 | tapply(df_conf$cage, df_conf$cluster, function(x) c(mean(x), sd(x))) 75 | pairwise.t.test(df_conf$cage, df_conf$cluster, p.adjust.method = 'bonferroni') 76 | 77 | print("N_ENCOUNTERS per cluster (M, SD):") 78 | tapply(df_conf$n_enc, df_conf$cluster, function(x) c(mean(x), sd(x))) 79 | pairwise.t.test(df_conf$n_enc, df_conf$cluster, p.adjust.method = 'bonferroni') 80 | 81 | print("SEX counts pairwise chi-square between clusters") 82 | tab <- table(df_conf$sex, df_conf$cluster) 83 | tab 84 | pairwise.chisq.test(df_conf$sex, df_conf$cluster, 85 | p.adjust.method = 'bonferroni') 86 | 87 | print("AGE OF ASSESSMENT per cluster (M, SD):") 88 | df$cluster <- as.factor(apply(t(df$clpid), 2, 89 | function(x) strsplit(x, '-')[[1]][1])) 90 | tapply(df$aoa, df$cluster, function(x) c(mean(x), sd(x))) 91 | pairwise.t.test(df$aoa, df$cluster, p.adjust.method = 'bonferroni') 92 | 93 | print("Length BEHR per cluster (M, SD):") 94 | tapply(df_conf$lenbehr, df_conf$cluster, function(x) c(mean(x), sd(x))) 95 | pairwise.t.test(df_conf$lenbehr, df_conf$cluster, p.adjust.method = 'bonferroni') 96 | 97 | # Summary statistics feature raw scores. 
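# (Optional: uncomment the block below to compute mean and SD of raw feature scores for each cluster-feature pair.)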
98 | # df$feat_cl <- paste(df$cluster, 99 | # df$feat, 100 | # sep = '-') 101 | # print("Summary statistics feature scores foe each cluster.") 102 | # tapply(df$score, 103 | # df$feat_cl, 104 | # function(x) c(mean(x), sd(x))) 105 | 106 | ################################################################## 107 | 108 | # Multiple pairwise comparisons between groups 109 | df_wide <- subset(df, select = c(clpid, feat, score_sc)) 110 | df_wide <- dcast(df_wide, clpid ~ feat, 111 | value.var = 'score_sc', 112 | drop = FALSE, fun.aggregate = mean) 113 | df_wide$pid <- apply(t(df_wide$clpid), 2, function(x) strsplit(x, '-')[[1]][2]) 114 | df_wide$cluster <- apply(t(df_wide$clpid), 2, function(x) strsplit(x, '-')[[1]][1]) 115 | df_wide <- subset(df_wide, select = - clpid) 116 | df_wide$cluster <- as.factor(df_wide$cluster) 117 | 118 | print("Percentage of missing data for each cluster") 119 | na_cl <- c() 120 | na_count <- c() 121 | for (cl in levels(df_wide$cluster)){ 122 | tmp <- df_wide[df_wide$cluster==cl, 1:(ncol(df_wide)-2)] 123 | ttab <- table(is.na(tmp))/(nrow(tmp)*ncol(tmp)) 124 | print(ttab) 125 | na_cl <- c(na_cl, rep(cl,nrow(tmp)*ncol(tmp))) 126 | na_count <- c(na_count, rep('notmiss', table(is.na(tmp))[1]), 127 | rep('miss', table(is.na(tmp))[2])) 128 | } 129 | na_cl <- as.factor(na_cl) 130 | na_count <- as.factor(na_count) 131 | pairwise.chisq.test(na_count, na_cl) 132 | 133 | # Run pairwise t-test or t-test for score comparisons. 134 | # for (n in names(df_wide)[1 : (ncol(df_wide) - 2)]) { 135 | # # Drop missing values. 136 | # df_tmp <- drop_na(subset(df_wide, select = c(n, 'cluster'))) 137 | # check_tab <- table(df_tmp$cluster) > 1 138 | # cat("\n", "Testing variable", n, "\n\n") 139 | # if (length(check_tab[check_tab == FALSE]) >= 1) { 140 | # idxs <- which(df_tmp$cluster == which(check_tab == FALSE) - 1) 141 | # if (length(idxs) > 0) { 142 | # df_tmp <- df_tmp[- which(df_tmp$cluster == which(check_tab == FALSE) - 1),]} 143 | # try(print(t.test(df_tmp[, 1] ~ df_tmp[, 2])))} else { 144 | # try(print(pairwise.t.test(df_tmp[, 1], df_tmp[, 2], 145 | # p.adjust.method = 'bonferroni'))) 146 | # #pt <- pairwise.t.test(df_tmp[, 1], df_tmp[, 2], 147 | # # p.adjust.method = 'bonferroni') 148 | # #print(pt)} 149 | # } 150 | # } 151 | 152 | # Feature distibution plot 153 | # pdf(file = file.path(DATA_PATH, PLOT_NAME)) 154 | ggpairs(subset(df_wide, select = c(grep('ados|psi', names(df_wide)), cluster)), label.pos = 3) 155 | # ggpairs(subset(df_wide, select = c(grep('griffiths', names(df_wide)), cluster)), 156 | # columnLabels = c("gmds::GQ", "gmds::q_A", 157 | # "gmds::q_B", "gmds::q_C", 158 | # "gmds::q_D", "gmds::q_E", 159 | # "gmds::q_F", "cluster")) 160 | # ggpairs(subset(df_wide, select = c(grep('wechsler', names(df_wide)), cluster))) 161 | # ggpairs(subset(df_wide, select = c(grep('vineland', names(df_wide)), cluster))) 162 | # ggpairs(subset(df_wide, select = c(grep('srs', names(df_wide)), cluster))) 163 | # ggpairs(subset(df_wide, select = c(grep('psi', names(df_wide)), cluster))) 164 | # ggpairs(subset(df_wide, select = c(grep('leiter', names(df_wide)), cluster))) 165 | # dev.off() 166 | 167 | 168 | 169 | -------------------------------------------------------------------------------- /dataset.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import create_engine, MetaData 2 | import datetime 3 | from datetime import datetime 4 | import csv 5 | import os 6 | import utils as ut 7 | import pandas as pd 8 | import logging 9 | from 
dataclasses import dataclass 10 | from basic_statistics import DataStatistics 11 | import numpy as np 12 | 13 | 14 | # Dataclasses to store patient demographics, 15 | # and patient info on encounters. 16 | @dataclass 17 | class Pinfo: 18 | sex: str 19 | dob: str 20 | n_enc: int = 0 21 | 22 | 23 | @dataclass 24 | class Penc: 25 | sex: str 26 | dob: str 27 | doa_instrument: list() 28 | 29 | def count_enc(self): 30 | yr_enc = list(map(lambda x: x[0].split('/')[2], 31 | self.doa_instrument)) 32 | return len(set(yr_enc)) 33 | 34 | 35 | # Configure the logging, logging to file. 36 | logging.basicConfig(level=logging.INFO, 37 | filename='./logs/pipeline.log', 38 | filemode='w') 39 | 40 | # Create new directory or point to an existing one to store data. 41 | data_dir = 'odf-data' 42 | data_path = os.path.join(ut.DATA_FOLDER_PATH, data_dir) 43 | os.makedirs(data_path, exist_ok=True) 44 | runtime_date = datetime.now().strftime('%Y-%m-%d-%H-%M-%S') 45 | logging.info(f'{runtime_date} created ../data/odf-data folder for returned objects') 46 | 47 | 48 | def access_db(): 49 | """ Access the database and dump tables. 50 | 51 | Returns 52 | ------- 53 | 54 | dictionary 55 | {key=table_name, value=pandas dataframe} 56 | """ 57 | # connect to the database 58 | engine = create_engine(ut.SQLALCHEMY_CONN_STRING) 59 | conn = engine.connect() 60 | logging.info('Connection to DB established') 61 | # inspect the tables in the database 62 | metadata = MetaData(engine, reflect=True) 63 | 64 | logging.info('Dumping all tables') 65 | df_tables = {} 66 | for table_name in metadata.tables: 67 | # ADDED THIS TO STOP IMPORTING NEW DATA FOR NOW 68 | df_tables[table_name] = pd.read_sql_table(table_name, 69 | con=conn, 70 | parse_dates=['date_birth', 'date_ass'], 71 | index_col='id').query( 72 | 'form_info <= datetime(2019, 10, 5)').drop('form_info', axis=1) 73 | return df_tables 74 | 75 | 76 | def data_wrangling(tables_dict): 77 | """ Drop excluded subjects and tables 78 | 79 | Parameters 80 | ---------- 81 | tables_dict: dictionary 82 | dictionary with dumped tables from DB 83 | 84 | Returns 85 | ------- 86 | dictionary 87 | reduced dictionary without excluded tables and subjects (rows) 88 | """ 89 | adult_subj = tables_dict['ados-2modulo4'].id_subj.unique() 90 | # added lab1680 on the 1st of October 2019, new entry with only WISC-IV 91 | # added also lab1353/lab1152, only psi-sf/srs available 92 | adult_subj = np.append(adult_subj, ['lab1680', 'lab1353', 'lab1152']) 93 | logging.info(f'Dropped {len(adult_subj)} subjects') 94 | 95 | # names of the tables to drop from the dictionary 96 | tb_drop = ['ados-2modulo4', 97 | 'emotionalavailabilityscales'] 98 | 99 | tb_dict_rid = {} 100 | for tb_name, df in tables_dict.items(): 101 | if tb_name not in tb_drop: 102 | row_drop = ~(df['id_subj'].isin(adult_subj)) 103 | tb_dict_rid[tb_name] = df.loc[row_drop] 104 | 105 | return tb_dict_rid 106 | 107 | 108 | def cohort_info(tables_dict): 109 | """Store instances of Pinfo and Penc classes in dictionaries 110 | 111 | Parameters 112 | ---------- 113 | tables_dict: dictionary 114 | dictionary with data tables 115 | 116 | Returns 117 | ------- 118 | dictionary 119 | {keys=pid, values=Pinfo instances} 120 | dictionary 121 | {keys=pid, values=Penc instances} 122 | """ 123 | demog_dict = {} 124 | enc_dict = {} 125 | for tn, df in tables_dict.items(): 126 | for _, row in df.iterrows(): 127 | ass_date = __correct_datetime(row.date_ass) 128 | birth_date = __correct_datetime(row.date_birth) 129 | if row.id_subj in enc_dict: 130 | 
enc_dict[row.id_subj].doa_instrument.append((ass_date, tn)) 131 | else: 132 | enc_dict[row.id_subj] = Penc(sex=row.sex, 133 | dob=birth_date, 134 | doa_instrument=[(ass_date, 135 | tn)]) 136 | demog_dict[row.id_subj] = Pinfo(sex=row.sex, 137 | dob=birth_date) 138 | for pid in demog_dict: 139 | demog_dict[pid].n_enc = enc_dict[pid].count_enc() 140 | # dump info to csv files 141 | _dump_info(demog_dict, enc_dict) 142 | # save log with statistics 143 | logging.info('\nComputing basics statistics (DataStatistics module)\n') 144 | DataStatistics().compute(data_dir) 145 | return demog_dict, enc_dict 146 | 147 | 148 | """ 149 | Functions 150 | """ 151 | 152 | 153 | def age_ass(dob, doa): 154 | """ 155 | Parameters 156 | ---------- 157 | dob: str 158 | date of birth 159 | doa: str 160 | date of assessment 161 | 162 | Return 163 | ------ 164 | float 165 | age of assessment 166 | """ 167 | # dob = pd.Timestamp(year=int(dob.split('/')[2]), 168 | # month=int(dob.split('/')[1]), 169 | # day=int(dob.split('/')[0])) 170 | # doa = pd.Timestamp(year=int(doa.split('/')[2]), 171 | # month=int(doa.split('/')[1]), 172 | # day=int(doa.split('/')[0])) 173 | dob = pd.Timestamp(dob) 174 | doa = pd.Timestamp(doa) 175 | days_in_year = 365.2425 176 | aoa = (doa - dob).days / days_in_year 177 | return aoa 178 | 179 | 180 | def __correct_datetime(date_ts): 181 | """ 182 | Parameters 183 | ---------- 184 | date_ts: pandas Timestamp 185 | 186 | Returns 187 | ------- 188 | str 189 | strftime %d/%m/%Y 190 | """ 191 | # correct wrong dates 192 | today = datetime.today() 193 | try: 194 | if date_ts.year == today.year and date_ts.month >= today.month: 195 | corrected_date = pd.Timestamp(year=date_ts.year, 196 | month=date_ts.day, 197 | day=date_ts.month) 198 | else: 199 | corrected_date = date_ts 200 | 201 | return corrected_date.strftime("%d/%m/%Y") 202 | except AttributeError: 203 | return date_ts 204 | 205 | 206 | def _dump_info(demog_info, enc_info): 207 | """Save csv file with demographic and encounter info 208 | 209 | Parameters 210 | ---------- 211 | demog_info: dictionary 212 | {keys=pid, values=Pinfo instances} 213 | enc_info: dictionary 214 | {keys=pid, values=Penc instances} 215 | """ 216 | logging.info("Saving csv files on subject info and subject encounters") 217 | with open(os.path.join(ut.DATA_FOLDER_PATH, data_dir, 218 | 'person-encounters.csv'), 'w') as f: 219 | wr = csv.writer(f, delimiter=',', quoting=csv.QUOTE_MINIMAL) 220 | wr.writerow(['ID_SUBJ', 'SEX', 'DOB', 'DOA', 'AOA', 'INSTRUMENT']) 221 | for pid in sorted(enc_info.keys()): 222 | enc_info[pid].doa_instrument.sort(key=lambda x: (x[0].split('/')[2], 223 | x[0].split('/')[1], 224 | x[1])) 225 | for tup in enc_info[pid].doa_instrument: 226 | wr.writerow([pid, enc_info[pid].sex, 227 | enc_info[pid].dob, tup[0], 228 | age_ass(enc_info[pid].dob, tup[0]), 229 | tup[1]]) 230 | with open(os.path.join(ut.DATA_FOLDER_PATH, data_dir, 231 | 'person-demographics.csv'), 'w') as f: 232 | wr = csv.writer(f, delimiter=',', quoting=csv.QUOTE_MINIMAL) 233 | wr.writerow(['ID_SUBJ', 'SEX', 'DOB', 'N_ENC']) 234 | for pid in sorted(demog_info.keys()): 235 | wr.writerow([pid, demog_info[pid].sex, 236 | demog_info[pid].dob, 237 | demog_info[pid].n_enc]) 238 | -------------------------------------------------------------------------------- /pt_embedding.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import TfidfVectorizer 2 | from sklearn.decomposition import TruncatedSVD 3 | import glove 4 | 
import numpy as np 5 | import utils as ut 6 | import torch 7 | import torch.nn.functional as F 8 | import logging 9 | 10 | 11 | class Pembeddings: 12 | def __init__(self, behr, vocab): 13 | """ Range of possible embeddings to perform on behavioral data 14 | TFIDF, GLOVE, WORD2VEC 15 | 16 | Parameters 17 | ---------- 18 | behr 19 | dictionary {pid: trm sequence} 20 | vocab 21 | dictionary, needed btm_to_idx 22 | """ 23 | self.behr = behr 24 | self.vocab = vocab 25 | 26 | def tfidf(self): 27 | """performs TFIDF 28 | 29 | Return 30 | ------ 31 | list 32 | pids list 33 | list 34 | svd matrix 35 | """ 36 | # create document list 37 | doc_list = [] 38 | for tupl_list in self.behr.values(): 39 | sentence = [] 40 | for tm_vect in tupl_list: 41 | sentence.extend(tm_vect[2:]) 42 | doc_list.append(' '.join(list(map(lambda x: str(x), sentence)))) 43 | pid_list = [pid for pid in self.behr] 44 | 45 | vectorizer = TfidfVectorizer(norm='l2') 46 | tfidf_mtx = vectorizer.fit_transform(doc_list) 47 | 48 | logging.info("Performing SVD on the TF-IDF matrix...") 49 | reducer = TruncatedSVD(n_components=ut.n_dim_tfidf, random_state=123) 50 | svd_mtx = reducer.fit_transform(tfidf_mtx) 51 | 52 | return pid_list, svd_mtx 53 | 54 | def word2vec_emb(self): 55 | """Skip-gram word2vec 56 | 57 | Returns 58 | ------- 59 | list 60 | pids list 61 | list 62 | matrix of patient embeddings 63 | numpy array: 64 | first layer weight matrix (vocab size, embedding dim) 65 | numpy array: 66 | second layer weight matrix (vocab size, embedding dim) 67 | 68 | """ 69 | corpus = self.__build_corpus() 70 | idx_pairs = self.__get_idx_pairs(corpus, window_size=10) 71 | 72 | torch.manual_seed(1234) 73 | W1 = torch.randn(ut.n_dim_w2v, len(self.vocab), 74 | dtype=torch.float32, 75 | requires_grad=True) 76 | W2 = torch.randn(len(self.vocab), ut.n_dim_w2v, 77 | dtype=torch.float32, 78 | requires_grad=True) 79 | 80 | for epoch in range(ut.n_epoch_w2v): 81 | loss_val = 0 82 | for data, target in idx_pairs: 83 | x = self.__get_input_layer(data).float() 84 | y_true = torch.from_numpy(np.array([target])).long() 85 | 86 | z1 = torch.matmul(W1, x) 87 | z2 = torch.matmul(W2, z1) 88 | 89 | log_softmax = F.log_softmax(z2, dim=0) 90 | 91 | loss = F.nll_loss(log_softmax.view(1, -1), y_true) 92 | loss_val += loss.item() 93 | loss.backward() 94 | w1 = W1.detach() 95 | w2 = W2.detach() 96 | w1 -= ut.learning_rate_w2v * W1.grad 97 | w2 -= ut.learning_rate_w2v * W2.grad 98 | 99 | W1.grad.zero_() 100 | W2.grad.zero_() 101 | 102 | if epoch % 10 == 0: 103 | logging.info(f'Loss at epoch {epoch}: {loss_val/len(idx_pairs)}') 104 | logging.info(f'Loss at epoch {epoch}: {loss_val/len(idx_pairs)}') 105 | 106 | p_emb = [] 107 | pid_list = [] 108 | for pid, term in corpus.items(): 109 | if len(term) != 0: 110 | pid_list.append(pid) 111 | p_emb.append(np.mean([W1[:, int(t)].tolist() for t in term], 112 | axis=0).tolist()) 113 | 114 | return pid_list, p_emb, w1.numpy(), w2.numpy() 115 | 116 | def glove_pemb(self): 117 | """Computes Glove embeddings from co-occurrence matrix 118 | and returns patient embeddings 119 | 120 | Return 121 | ------ 122 | list 123 | pids list 124 | list 125 | matrix of patient embeddings 126 | array 127 | word embeddings 128 | """ 129 | 130 | corpus = self.__build_corpus() 131 | coocc_dict = self.__build_cooccur(corpus, window_size=10) 132 | model = glove.Glove(coocc_dict, alpha=0.75, x_max=10.0, d=ut.n_dim_glove, seed=1234) 133 | logging.info("\nTraining Glove embeddings...") 134 | for epoch in range(ut.n_epoch_glove): 135 | err = 
model.train(batch_size=ut.batch_size_glove, step_size=ut.learning_rate_glove) 136 | if epoch % 10 == 0: 137 | logging.info("epoch %d, error %.3f" % (epoch, err)) 138 | logging.info("epoch %d, error %.3f" % (epoch, err)) 139 | 140 | wemb = model.W + model.ContextW # as suggested in Pennington et al. 141 | p_emb = [] 142 | pid_list = [] 143 | for pid, term in corpus.items(): 144 | if len(term) != 0: 145 | pid_list.append(pid) 146 | p_emb.append(np.mean([wemb[int(t)].tolist() for t in term], 147 | axis=0).tolist()) 148 | 149 | return pid_list, p_emb, wemb 150 | 151 | @staticmethod 152 | def __age_tf(age): 153 | """ convert age to time slot string 154 | 155 | Parameter 156 | --------- 157 | age 158 | float 159 | Return 160 | ------ 161 | str 162 | """ 163 | if 0 < age <= 2.5: 164 | return 'F1' 165 | elif 2.5 < age <= 6.0: 166 | return 'F2' 167 | elif 6.0 < age <= 13.0: 168 | return 'F3' 169 | elif 13.0 < age < 17.0: 170 | return 'F4' 171 | else: 172 | return 'F5' 173 | 174 | def __build_corpus(self): 175 | """random shuffle terms in time slots 176 | 177 | Return 178 | ------ 179 | dictionary 180 | {pid: term list set and shuffles wrt to time slots F1-F5} 181 | """ 182 | # set seed 183 | np.random.seed(0) # 1234 (3 ns subtypes); 47 (7 ns subtypes) 184 | # We structure behrs wrt timeframes to learn word embeddings. 185 | # Structure of bvect = [Penc, aoa, tokens]. 186 | behr_tf = {} 187 | for pid, bvect in self.behr.items(): 188 | for el in bvect: 189 | if pid not in behr_tf: 190 | behr_tf[pid] = {self.__age_tf(el[1]): list(map(lambda x: int(self.vocab[x]), 191 | el[2:]))} 192 | else: 193 | behr_tf[pid].setdefault(self.__age_tf(el[1]), 194 | list()).extend(list(map(lambda x: int(self.vocab[x]), 195 | el[2:]))) 196 | corpus = {} 197 | for pid, tf_dict in behr_tf.items(): 198 | for tf in sorted(tf_dict.keys()): 199 | np.random.shuffle(behr_tf[pid][tf]) 200 | corpus.setdefault(pid, 201 | list()).extend(behr_tf[pid][tf]) 202 | return corpus 203 | 204 | @staticmethod 205 | def __get_idx_pairs(corpus, window_size): 206 | """Creates the center-context vectors for Word2vec predictions 207 | 208 | Parameters 209 | ---------- 210 | corpus: dictionary 211 | {pid: behr} 212 | window_size: int 213 | size of the context 214 | Returns 215 | ------- 216 | numpy array 217 | """ 218 | idx_pairs = [] 219 | # for each sentence 220 | for sentence in corpus.values(): 221 | # for each word, treated as center word 222 | for center_word_pos in range(len(sentence)): 223 | # for each window position 224 | for w in range(-window_size, window_size + 1): 225 | context_word_pos = center_word_pos + w 226 | # make sure not jump out sentence 227 | if context_word_pos < 0 or context_word_pos >= len(sentence) or center_word_pos == context_word_pos: 228 | continue 229 | context_word_idx = sentence[context_word_pos] 230 | idx_pairs.append((sentence[center_word_pos], context_word_idx)) 231 | 232 | return np.array(idx_pairs) 233 | 234 | def __get_input_layer(self, word_idx): 235 | """Transforms a token into a one-hot encoded representation 236 | 237 | Parameters 238 | ---------- 239 | word_idx: int 240 | word token 241 | Returns 242 | ------- 243 | torch tensor 244 | """ 245 | x = torch.zeros(len(self.vocab), dtype=torch.float32) 246 | x[word_idx] = 1.0 247 | return x 248 | 249 | def __build_cooccur(self, corpus, window_size=10): 250 | """Build a word co-occurrence dictionary for the given corpus. 
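Only the left context window of each center word is scanned; each pair is counted symmetrically and weighted by the inverse of the distance between the two words, as in Pennington et al. (2014).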
251 | 252 | Parameters 253 | ---------- 254 | corpus 255 | behr dictionary as returned by __build_corpus 256 | window_size 257 | int, size of the context window 258 | 259 | Return 260 | ------ 261 | dictionary 262 | {i_main: {i_context: cooccurrence}} 263 | see Pennington et al., (2014). 264 | """ 265 | 266 | # Collect cooccurrences internally as a sparse matrix for passable 267 | # indexing speed; we'll convert into a list later 268 | cooccurrences = {k: {} for k in self.vocab.values()} 269 | 270 | for pid, sentence in corpus.items(): 271 | 272 | for center_i, center_id in enumerate(sentence): 273 | # Collect all word IDs in left window of center word 274 | context_ids = sentence[max(0, center_i - window_size): center_i] 275 | contexts_len = len(context_ids) 276 | 277 | for left_i, left_id in enumerate(context_ids): 278 | # Distance from center word 279 | distance = contexts_len - left_i 280 | 281 | # Weight by inverse of distance between words 282 | increment = 1.0 / float(distance) 283 | # Build co-occurrence matrix symmetrically (pretend we 284 | # are calculating right contexts as well) 285 | if left_id in cooccurrences[center_id]: 286 | cooccurrences[center_id][left_id] += increment 287 | cooccurrences[left_id][center_id] += increment 288 | else: 289 | cooccurrences[center_id][left_id] = increment 290 | cooccurrences[left_id][center_id] = increment 291 | return cooccurrences 292 | -------------------------------------------------------------------------------- /features.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | import re 4 | import pandas as pd 5 | from dataset import Penc, age_ass 6 | from datamap import levels_datamap 7 | from sklearn.preprocessing import StandardScaler 8 | import logging 9 | import utils as ut 10 | import seaborn as sns 11 | import matplotlib.pyplot as plt 12 | import numpy as np 13 | 14 | 15 | # Configure the logging, logging to file. 16 | # logging.basicConfig(level=logging.INFO) 17 | 18 | 19 | class DataFeatures: 20 | """ Each instance is initialized with the desired level 21 | and the dictionary with the instrument tables, as dataframes, 22 | from the database. A dataframe stores the datamap 23 | for feature selection. 24 | """ 25 | 26 | def __init__(self, level, df_dict): 27 | self.level = level 28 | self.df_dict = df_dict 29 | dm_df = levels_datamap(df_dict) # class variable 30 | lev_dict = {} 31 | for ins, df in df_dict.items(): 32 | lev_dict[ins] = df[df.columns[pd.Series(dm_df.loc[ins, 33 | level], 34 | dtype='bool')]] 35 | self.lev_dict = lev_dict 36 | 37 | def create_level_tokens(self): 38 | """Transforms instrument values into words joining instrument name, 39 | scale/subscale and score. Returns a dictionary of token dataframes 40 | per instrument according to level. It also returns 41 | the correspondent vocabulary of terms. 42 | 43 | Returns 44 | ------- 45 | dictionary 46 | {key: instrument, value: list of token lists} 47 | dictionary 48 | {key: word, value: int} 49 | """ 50 | logging.info(f"Building token dataframes and vocabulary for level {self.level}.") 51 | 52 | # Create token strings to populate behr dictionary and vocabulary 53 | behr_tkns = {} 54 | lev_vocab = set() 55 | for ins, df in self.lev_dict.items(): 56 | # for _, row in df.iterrows(): 57 | # # The first two positions of each vector of tokens store a 58 | # # Penc dataclass and the assessment age. 
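# (Legacy version kept commented out for reference; the active loop below reads birth and assessment dates directly from the dataframe rows.)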
59 | # token = [Penc(sex=row.sex, 60 | # dob=penc[row.id_subj].dob, 61 | # doa_instrument=[(correct_datetime(row.date_ass), ins)]), 62 | # age_ass(penc[row.id_subj].dob, 63 | # correct_datetime(row.date_ass))] 64 | for _, row in df.iterrows(): 65 | # The first two positions of each vector of tokens store a 66 | # Penc dataclass and the assessment age. 67 | token = [Penc(sex=row.sex, 68 | dob=row.date_birth.strftime("%d/%m/%Y"), 69 | doa_instrument=[(row.date_ass.strftime("%d/%m/%Y"), 70 | ins)]), 71 | age_ass(row.date_birth, row.date_ass)] 72 | for c in df.columns[4:]: 73 | try: 74 | if row[c] != '' and pd.notna(row[c]): 75 | sig = self.__create_token(row, ins, c) 76 | token.append('::'.join([sig, str(int(row[c]))])) 77 | lev_vocab.update(['::'.join([sig, str(int(row[c]))])]) 78 | else: 79 | pass 80 | except ValueError: 81 | pass 82 | behr_tkns.setdefault(row['id_subj'], list()).append(token) 83 | bt_to_idx = {trm: idx for idx, trm in enumerate(sorted(list(lev_vocab)))} 84 | idx_to_bt = {idx: trm for idx, trm in enumerate(sorted(list(lev_vocab)))} 85 | behr = {} 86 | for p_id, vect in behr_tkns.items(): 87 | vect.sort(key=lambda x: x[1]) 88 | for v in vect: 89 | behr.setdefault(p_id, list()).append(v) 90 | logging.info(f'Vocabulary size:{len(bt_to_idx)}') 91 | self.__save_vocab_behr(behr, bt_to_idx) 92 | 93 | return behr, (bt_to_idx, idx_to_bt) 94 | 95 | def create_level_features(self, missing_data_plot=False): 96 | """ If level is not 4 it returns an Error. For level 4 it returns 97 | a dataframe with patient ids as index and time-ordered features as columns. 98 | Missing values are NaN. Dataframe and vocabulary are saved to csv file. 99 | 100 | Returns 101 | ------- 102 | dataframe 103 | Table with instrument scores at level 4 (at different times F1-F5) 104 | per subject. 105 | dataframe Scaled feature set with mean imputed missing values. 
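The raw feature table is also dumped to ./data/level-4/feature_data.csv, and an optional heatmap of missing-value percentages can be saved by passing missing_data_plot=True.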
106 | """ 107 | if self.level != 4: 108 | logging.error("create_level_features() is only available for level 4.") 109 | raise ValueError("create_level_features() attribute is only available for level 4.") 110 | else: 111 | # Create token strings as features 112 | feat_set = set() 113 | feat_tkns = {} 114 | for ins, df in self.lev_dict.items(): 115 | for _, row in df.iterrows(): 116 | for c in df.columns[4:]: 117 | try: 118 | # if row[c] != '' and pd.notna(row[c]): 119 | # sig = self.__create_token(row, ins, c) 120 | # feat_tkns.setdefault(row['id_subj'], list()).append( 121 | # '::'.join([self.__aoa_to_tf(age_ass(penc[row.id_subj].dob, 122 | # correct_datetime(row.date_ass))), 123 | # sig, 124 | # str(int(row[c]))])) 125 | # feat_set.update(['::'.join([self.__aoa_to_tf(age_ass(penc[row.id_subj].dob, 126 | # correct_datetime(row.date_ass))), 127 | # sig])]) 128 | if row[c] != '' and pd.notna(row[c]): 129 | sig = self.__create_token(row, ins, c) 130 | feat_tkns.setdefault(row['id_subj'], list()).append( 131 | '::'.join([self.__aoa_to_tf(age_ass(row.date_birth, 132 | row.date_ass)), 133 | sig, 134 | str(int(row[c]))])) 135 | feat_set.update(['::'.join([self.__aoa_to_tf(age_ass(row.date_birth, 136 | row.date_ass)), 137 | sig])]) 138 | else: 139 | pass 140 | except ValueError: 141 | pass 142 | feat_df = pd.DataFrame(columns=sorted(list(feat_set)), 143 | index=sorted(list(feat_tkns.keys()))) 144 | for p_id, vect in feat_tkns.items(): 145 | for tkn_val in vect: 146 | tkn = tkn_val.split('::') 147 | feat_df.loc[p_id, ['::'.join(tkn[:-1])]] = int(tkn[-1]) 148 | feat_df.to_csv('./data/level-4/feature_data.csv') # dump dataframe 149 | 150 | scaler = StandardScaler() 151 | feat_df_scaled = feat_df.fillna(feat_df.mean(), inplace=False) 152 | feat_df_scaled = pd.DataFrame(scaler.fit_transform(feat_df_scaled), 153 | columns=feat_df.columns, 154 | index=feat_df.index) 155 | missing_data = feat_df.isna().mean() * 100 156 | logging.info(f'Percentages of missing values for columns of feature data:\n{missing_data}') 157 | 158 | if missing_data_plot: 159 | rid_list = {} 160 | ins = set() 161 | for k, v in zip(missing_data.keys(), 162 | missing_data): 163 | ins.add(ut.shorten_names[k.split('::')[1]]) 164 | rid_list.setdefault(k.split('::')[0], 165 | dict()).setdefault(ut.shorten_names[k.split('::')[1]], 166 | list()).append(v) 167 | df_dict = {} 168 | ins = list(ins) 169 | for i in sorted(ins): 170 | df_dict[i] = [] 171 | for t in rid_list.keys(): 172 | try: 173 | df_dict[i].append(np.mean(rid_list[t][i])) 174 | except KeyError: 175 | df_dict[i].append(np.nan) 176 | df = pd.DataFrame(df_dict, index=sorted(list(rid_list.keys()))) 177 | logging.info(f'Mean percentages over items of missing values for feature data\n{df}') 178 | mask = df.isnull() 179 | fig, ax = plt.subplots(figsize=(6, 4)) 180 | sns.heatmap(df, mask=mask, cmap='GnBu') 181 | ax.xaxis.tick_top() 182 | ax.xaxis.set_label_position('top') 183 | ax.tick_params(length=0) 184 | plt.xticks(rotation=90) 185 | plt.savefig('./data/level-4/missing_feature_data.eps', format='eps', 186 | dpi=200, bbox_inches='tight') 187 | 188 | return feat_df, feat_df_scaled 189 | 190 | def __save_vocab_behr(self, behr, bt_to_idx): 191 | """Saves behavioral EHRs and vocabulary of terms at the level specified 192 | to .csv file in a new data folder according to level. 
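Two files are written under ./data/level-<level>/: cohort-behr.csv, with one row per assessment listing the subject id, age of assessment and the integer indices of its behavioral terms, and bt_to_idx.csv with the term-to-index vocabulary.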
193 | 194 | Parameters 195 | ---------- 196 | behr: dictionary 197 | {key:pid, value:list(list of terms for each assessment)} 198 | bt_to_idx: dictionary 199 | Dictionary with behavioral terms as keys and idx as values 200 | """ 201 | os.makedirs('./data/level-{0}'.format(self.level), 202 | exist_ok=True) 203 | with open(os.path.join('./data/level-{0}'.format(self.level), 204 | 'cohort-behr.csv'), 'w') as f: 205 | wr = csv.writer(f) 206 | wr.writerow(['ID_SUBJ', 'AOA', 'TERM']) 207 | for pid, seq in behr.items(): 208 | for s in seq: 209 | wr.writerow([pid, s[1]] + [bt_to_idx[s[idx]] 210 | for idx in range(2, len(s))]) 211 | with open(os.path.join('./data/level-{0}'.format(self.level), 212 | 'bt_to_idx.csv'), 'w') as f: 213 | wr = csv.writer(f) 214 | wr.writerow(["TERM", "LABEL"]) 215 | for bt, idx in bt_to_idx.items(): 216 | wr.writerow([bt, idx]) 217 | 218 | @staticmethod 219 | def __create_token(row, ins, c): 220 | """Private custom-based function to modify and uniform dataset features. 221 | Must be modified when changing dataset. Returns token string and value. These 222 | objects must be joined for NLP behavioral embedding and kept separate for 223 | feature dataset. 224 | 225 | Parameters 226 | ---------- 227 | row: pandas Series 228 | Row corresponding to a patient assessment 229 | ins: str 230 | Instrument considered 231 | c: str 232 | Name of the instrument item considered 233 | 234 | Returns 235 | ------- 236 | str 237 | String of the form instrument::item 238 | """ 239 | if bool(re.match('ados', ins)): 240 | if bool(re.search("\.d1|\.d2|\.b1|d1|d2|b1|" 241 | "comparison_score|" 242 | "sa_tot|rrb_tot|sarrb_tot|" 243 | "\.sa_tot|\.rrb_tot", 244 | c)): 245 | if len(c.split('.')) > 1: 246 | token = '::'.join(['ados', 247 | c.split('.')[1]]) 248 | else: 249 | token = '::'.join(['ados', 250 | c]) 251 | else: 252 | if len(c.split('.')) > 1: 253 | token = '::'.join([ins, 254 | c.split('.')[1]]) 255 | else: 256 | token = '::'.join([ins, 257 | c]) 258 | elif bool(re.match('psi', ins)): 259 | token = '::'.join([ins, 260 | row['parent'].lower(), 261 | c]) 262 | elif bool(re.match('vinel|srs', ins)): 263 | token = '::'.join([ins, 'caretaker', c]) 264 | elif bool(re.match('wa|wi|wp', ins)): 265 | token = '::'.join(['wechsler', c]) 266 | else: 267 | token = '::'.join([ins, 268 | c]) 269 | return token 270 | 271 | @staticmethod 272 | def __aoa_to_tf(aoa): 273 | """Returns the time period from the age of assessment 274 | 275 | Parameters 276 | ---------- 277 | aoa: float 278 | age of assessment 279 | 280 | Return 281 | ------ 282 | str 283 | time period string (F1-F5) 284 | """ 285 | 286 | if 0 < float(aoa) <= 2.5: 287 | return 'F1' 288 | elif 2.5 < float(aoa) <= 6.0: 289 | return 'F2' 290 | elif 6.0 < float(aoa) <= 13.0: 291 | return 'F3' 292 | elif 13.0 < float(aoa) < 17.0: 293 | return 'F4' 294 | else: 295 | return 'F5' 296 | -------------------------------------------------------------------------------- /clustering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.cluster import AgglomerativeClustering, KMeans 3 | from sklearn.metrics import silhouette_score, adjusted_mutual_info_score, fowlkes_mallows_score 4 | from sklearn.preprocessing import MinMaxScaler 5 | from scipy.cluster.hierarchy import linkage 6 | import logging 7 | 8 | 9 | class HclustEmbeddings: 10 | """ Performs hierarchical clustering on patient embeddings""" 11 | 12 | def __init__(self, min_cl, max_cl, linkage, affinity): 13 | self.min_cl = min_cl 14 
| self.max_cl = max_cl 15 | self.linkage = linkage 16 | self.affinity = affinity 17 | 18 | def find_best_nclu(self, 19 | mtx, 20 | n_iter, 21 | subsampl): 22 | """Iterate clustering of subsets and find best number of clusters 23 | 24 | Parameters 25 | ---------- 26 | mtx: list 27 | List of embeddings as returned by pt_embedding module 28 | n_iter: int 29 | number of iteration to select the best number of clusters 30 | subsampl: float 31 | Fraction of data to consider for clustering 32 | 33 | Returns 34 | ------- 35 | int 36 | Best number of clusters 37 | """ 38 | n_cl_selected = [] 39 | for it in range(n_iter): 40 | idx = np.random.randint(0, len(mtx), int(len(mtx) * subsampl)) 41 | sub_data = [mtx[i] for i in idx] 42 | best_n_clu = self.elbow_method(sub_data) 43 | # for n_clu in range(self.min_cl, self.max_cl): 44 | # hclu = AgglomerativeClustering(n_clusters=n_clu, 45 | # linkage=self.linkage, 46 | # affinity=self.affinity) 47 | # lab_cl = hclu.fit_predict(sub_data) 48 | # tmp_silh = silhouette_score(sub_data, lab_cl) 49 | # if tmp_silh > best_silh: 50 | # best_silh = tmp_silh 51 | # best_n_clu = n_clu 52 | # print("(*) Iter {0} -- N clusters {1}".format(it, best_n_clu)) 53 | n_cl_selected.append(best_n_clu) 54 | unique, counts = np.unique(n_cl_selected, return_counts=True) 55 | logging.info("Counts of N clusters:") 56 | logging.info("N clusters -- Count") 57 | for un, ct in dict(zip(unique, counts)).items(): 58 | logging.info(un, ct) 59 | best_n_clu = unique[np.argmax(counts)] 60 | logging.info("\nBest N cluster:{0}".format(best_n_clu)) 61 | return best_n_clu 62 | 63 | def elbow_method(self, 64 | mtx): 65 | """Select the best number of clusters via elbow method. 66 | 67 | Parameters 68 | ---------- 69 | mtx list: 70 | List of embeddings as returned by pt_embedding module 71 | 72 | Returns 73 | ------- 74 | int: 75 | Best number of clusters 76 | """ 77 | # Scale data. 
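# The elbow is read off the hierarchy itself: take the merge distances of the last
# `max_cl` agglomeration steps from the linkage matrix, compute their second
# difference (acceleration), and pick the step where it peaks; reversing the array
# makes index 0 correspond to 2 clusters, hence the `+ 2` offset below.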
78 | scaler = MinMaxScaler() 79 | mtx = scaler.fit_transform(mtx) 80 | 81 | Z = linkage(mtx, self.linkage) 82 | last = Z[-self.max_cl:, 2] 83 | 84 | acceleration = np.diff(last, 2) # 2nd derivative of the distances 85 | acceleration_rev = acceleration[::-1] 86 | 87 | k = acceleration_rev.argmax() + 2 # if idx 0 is the max of this we want 2 clusters 88 | 89 | return k 90 | 91 | @staticmethod 92 | def fit(mtx, pid_list, n_clu): 93 | """ Perform HC on patient embeddings 94 | 95 | Parameters 96 | ---------- 97 | mtx: list 98 | Embeddings list 99 | pid_list: list 100 | List of subjects id ordered as in mtx 101 | n_clu: int 102 | Number of clusters 103 | 104 | Returns 105 | ------- 106 | dictionary 107 | Dictionary with cluster label per subject id 108 | {pid: cl} 109 | """ 110 | # Scale data matrix 111 | scaler = MinMaxScaler() 112 | mtx = scaler.fit_transform(mtx) 113 | 114 | hclu = AgglomerativeClustering(n_clusters=n_clu) 115 | lab_cl = hclu.fit_predict(mtx) 116 | silh = silhouette_score(mtx, lab_cl) 117 | logging.info('(*) Number of clusters %d -- Silhouette score %.2f' % (n_clu, silh)) 118 | 119 | num_count = np.unique(lab_cl, return_counts=True)[1] 120 | for idx, nc in enumerate(num_count): 121 | logging.info("Cluster {0} -- Numerosity {1}".format(idx, nc)) 122 | 123 | return {pid: cl for pid, cl in zip(pid_list, lab_cl)} 124 | 125 | 126 | class HclustFeatures: 127 | """ Performs Hierarchical clustering on feature data""" 128 | 129 | def __init__(self, min_cl, max_cl, linkage, affinity): 130 | self.min_cl = min_cl 131 | self.max_cl = max_cl 132 | self.linkage = linkage 133 | self.affinity = affinity 134 | 135 | def find_best_nclu(self, 136 | df_scaled, 137 | n_iter, 138 | subsampl): 139 | """ Find the best number of clusters iterating over subset of data 140 | 141 | Parameters 142 | ---------- 143 | df_scaled: dataframe 144 | Scaled feature data with patient ids as index 145 | n_iter: int 146 | Number of iterations to perform 147 | subsampl: float 148 | Fraction of data to consider in the subset 149 | at each iteration 150 | 151 | Returns 152 | ------- 153 | int 154 | best number of clusters 155 | """ 156 | n_cl_selected = [] 157 | for it in range(n_iter): 158 | idx = np.random.randint(0, len(df_scaled), int(len(df_scaled) * subsampl)) 159 | sub_df = df_scaled.iloc[[i for i in idx], :] 160 | best_n_clu = self.elbow_method(sub_df) 161 | # for n_clu in range(self.min_cl, self.max_cl): 162 | # hclu = AgglomerativeClustering(n_clusters=n_clu) 163 | # lab_cl = hclu.fit_predict(sub_df) 164 | # tmp_silh = silhouette_score(sub_df, lab_cl) 165 | # if tmp_silh > best_silh: 166 | # best_silh = tmp_silh 167 | # best_n_clu = n_clu 168 | # print("(*) Iter {0} -- N clusters {1}".format(it, 169 | # best_n_clu)) 170 | n_cl_selected.append(best_n_clu) 171 | unique, counts = np.unique(n_cl_selected, return_counts=True) 172 | logging.info("Counts of N clusters:") 173 | logging.info("N clusters -- Count") 174 | for un, ct in dict(zip(unique, counts)).items(): 175 | logging.info(un, ct) 176 | best_n_clu = unique[np.argmax(counts)] 177 | logging.info("\nBest N cluster:{0}".format(best_n_clu)) 178 | return best_n_clu 179 | 180 | def elbow_method(self, 181 | df_scaled): 182 | """Select the best number of clusters via elbow method. 
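The elbow is located from the second difference of the last `max_cl` merge distances of a hierarchical linkage built on the feature matrix.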
183 | 184 | Parameters 185 | ---------- 186 | df_scaled dataframe: 187 | Scaled feature data with patient ids as index 188 | 189 | Returns 190 | ------- 191 | int: 192 | Best number of clusters 193 | """ 194 | 195 | data = df_scaled.to_numpy() 196 | 197 | Z = linkage(data, self.linkage) 198 | last = Z[-self.max_cl:, 2] 199 | 200 | acceleration = np.diff(last, 2) # 2nd derivative of the distances 201 | acceleration_rev = acceleration[::-1] 202 | k = acceleration_rev.argmax() + 2 # if idx 0 is the max of this we want 2 clusters 203 | 204 | return k 205 | 206 | @staticmethod 207 | def fit(df_scaled, n_clu): 208 | """Fit HC on patient feature data 209 | 210 | Parameters 211 | ---------- 212 | df_scaled: dataframe 213 | Dataframe of scaled feature data 214 | n_clu: int 215 | Number of clusters 216 | Returns 217 | ------- 218 | dictionary 219 | Dictionary of patient ids and correspondent 220 | clusters {pid: cl} 221 | """ 222 | hclu = AgglomerativeClustering(n_clusters=n_clu) 223 | lab_cl = hclu.fit_predict(df_scaled) 224 | silh = silhouette_score(df_scaled, lab_cl) 225 | logging.info('(*) Number of clusters %d -- Silhouette score %.2f' % (n_clu, silh)) 226 | 227 | num_count = np.unique(lab_cl, return_counts=True)[1] 228 | for idx, nc in enumerate(num_count): 229 | logging.info("Cluster {0} -- Numerosity {1}".format(idx, nc)) 230 | 231 | return {pid: cl for pid, cl in zip(df_scaled.index, lab_cl)} 232 | 233 | 234 | class KMeansEmbeddings: 235 | """ Performs KMeans on patient embeddings""" 236 | 237 | def __init__(self, min_cl, max_cl): 238 | self.min_cl = min_cl 239 | self.max_cl = max_cl 240 | 241 | def find_best_nclu(self, 242 | mtx, 243 | n_iter, 244 | subsampl): 245 | """Iterate clustering of subsets anf find best number of clusters 246 | 247 | Parameters 248 | ---------- 249 | mtx: list 250 | List of embeddings as returned by pt_embedding module 251 | n_iter: int 252 | number of iteration to select the best number of clusters 253 | subsampl: float 254 | Fraction of data to consider for clustering 255 | 256 | Returns 257 | ------- 258 | int 259 | Best number of clusters 260 | """ 261 | n_cl_selected = [] 262 | for it in range(n_iter): 263 | idx = np.random.randint(0, len(mtx), int(len(mtx) * subsampl)) 264 | sub_data = [mtx[i] for i in idx] 265 | best_n_clu = self.elbow_method(sub_data) 266 | # for n_clu in range(self.min_cl, self.max_cl): 267 | # hclu = AgglomerativeClustering(n_clusters=n_clu, 268 | # linkage=self.linkage, 269 | # affinity=self.affinity) 270 | # lab_cl = hclu.fit_predict(sub_data) 271 | # tmp_silh = silhouette_score(sub_data, lab_cl) 272 | # if tmp_silh > best_silh: 273 | # best_silh = tmp_silh 274 | # best_n_clu = n_clu 275 | # print("(*) Iter {0} -- N clusters {1}".format(it, best_n_clu)) 276 | n_cl_selected.append(best_n_clu) 277 | unique, counts = np.unique(n_cl_selected, return_counts=True) 278 | logging.info("Counts of N clusters:") 279 | logging.info("N clusters -- Count") 280 | for un, ct in dict(zip(unique, counts)).items(): 281 | logging.info(un, ct) 282 | best_n_clu = unique[np.argmax(counts)] 283 | logging.info("\nBest N cluster:{0}".format(best_n_clu)) 284 | return best_n_clu 285 | 286 | def elbow_method(self, 287 | mtx): 288 | """Select the best number of clusters via elbow method. 289 | 290 | Parameters 291 | ---------- 292 | mtx list: 293 | List of embeddings as returned by pt_embedding module 294 | 295 | Returns 296 | ------- 297 | int: 298 | Best number of clusters 299 | """ 300 | # Scale data. 
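# Unlike the hierarchical variants, the elbow here is based on k-means inertia:
# KMeans is fit for k = 1 .. max_cl - 1, the within-cluster sum of squares is
# recorded for each k, and the k at which the second difference of the inertia
# curve peaks (plus the offset of 2) is returned.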
301 | scaler = MinMaxScaler() 302 | mtx = scaler.fit_transform(mtx) 303 | 304 | inertia = [] # Sum of square differences of samples from cluster centers 305 | K = range(1, self.max_cl) 306 | 307 | for k in K: 308 | kmean_model = KMeans(n_clusters=k).fit(mtx) 309 | inertia.append(kmean_model.inertia_) 310 | 311 | acceleration = np.diff(inertia, 2) 312 | 313 | k = acceleration.argmax() + 2 # If idx 0 is the max of this we want 2 clusters 314 | 315 | return k 316 | 317 | @staticmethod 318 | def fit(mtx, pid_list, n_clu): 319 | """ Perform HC on patient embeddings 320 | 321 | Parameters 322 | ---------- 323 | mtx: list 324 | Embeddings list 325 | pid_list: list 326 | List of subjects id ordered as in mtx 327 | n_clu: int 328 | Number of clusters 329 | 330 | Returns 331 | ------- 332 | dictionary 333 | Dictionary with cluster label per subject id 334 | {pid: cl} 335 | """ 336 | # Scale data matrix 337 | scaler = MinMaxScaler() 338 | mtx = scaler.fit_transform(mtx) 339 | 340 | kmclu = KMeans(n_clusters=n_clu) 341 | lab_cl = kmclu.fit_predict(mtx) 342 | silh = silhouette_score(mtx, lab_cl) 343 | logging.info('(*) Number of clusters %d -- Silhouette score %.2f' % (n_clu, silh)) 344 | 345 | num_count = np.unique(lab_cl, return_counts=True)[1] 346 | for idx, nc in enumerate(num_count): 347 | logging.info("Cluster {0} -- Numerosity {1}".format(idx, nc)) 348 | 349 | return {pid: cl for pid, cl in zip(pid_list, lab_cl)} 350 | 351 | 352 | class KMeansFeatures: 353 | """ Performs Hierarchical clustering on feature data""" 354 | 355 | def __init__(self, min_cl, max_cl): 356 | self.min_cl = min_cl 357 | self.max_cl = max_cl 358 | 359 | def find_best_nclu(self, 360 | df_scaled, 361 | n_iter, 362 | subsampl): 363 | """ Find the best number of clusters iterating over subset of data 364 | 365 | Parameters 366 | ---------- 367 | df_scaled: dataframe 368 | Scaled feature data with patient ids as index 369 | n_iter: int 370 | Number of iterations to perform 371 | subsampl: float 372 | Fraction of data to consider in the subset 373 | at each iteration 374 | 375 | Returns 376 | ------- 377 | int 378 | best number of clusters 379 | """ 380 | n_cl_selected = [] 381 | for it in range(n_iter): 382 | idx = np.random.randint(0, len(df_scaled), int(len(df_scaled) * subsampl)) 383 | sub_df = df_scaled.iloc[[i for i in idx], :] 384 | best_n_clu = self.elbow_method(sub_df) 385 | # for n_clu in range(self.min_cl, self.max_cl): 386 | # hclu = AgglomerativeClustering(n_clusters=n_clu) 387 | # lab_cl = hclu.fit_predict(sub_df) 388 | # tmp_silh = silhouette_score(sub_df, lab_cl) 389 | # if tmp_silh > best_silh: 390 | # best_silh = tmp_silh 391 | # best_n_clu = n_clu 392 | # print("(*) Iter {0} -- N clusters {1}".format(it, 393 | # best_n_clu)) 394 | n_cl_selected.append(best_n_clu) 395 | unique, counts = np.unique(n_cl_selected, return_counts=True) 396 | logging.info("Counts of N clusters:") 397 | logging.info("N clusters -- Count") 398 | for un, ct in dict(zip(unique, counts)).items(): 399 | logging.info(un, ct) 400 | best_n_clu = unique[np.argmax(counts)] 401 | logging.info("\nBest N cluster:{0}".format(best_n_clu)) 402 | return best_n_clu 403 | 404 | def elbow_method(self, 405 | df_scaled): 406 | """Select the best number of clusters via elbow method. 
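KMeans is fit on the feature matrix for an increasing number of clusters and the elbow of the resulting inertia curve, located via its second difference, is returned.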
407 | 408 | Parameters 409 | ---------- 410 | df_scaled dataframe: 411 | Scaled feature data with patient ids as index 412 | 413 | Returns 414 | ------- 415 | int: 416 | Best number of clusters 417 | """ 418 | 419 | data = df_scaled.to_numpy() 420 | 421 | inertia = [] # Sum of square differences of samples from cluster centers 422 | K = range(1, self.max_cl) 423 | 424 | for k in K: 425 | kmean_model = KMeans(n_clusters=k).fit(data) 426 | inertia.append(kmean_model.inertia_) 427 | 428 | acceleration = np.diff(inertia, 2) 429 | 430 | k = acceleration.argmax() + 2 # If idx 0 is the max of this we want 2 clusters 431 | 432 | return k 433 | 434 | @staticmethod 435 | def fit(df_scaled, n_clu): 436 | """Fit HC on patient feature data 437 | 438 | Parameters 439 | ---------- 440 | df_scaled: dataframe 441 | Dataframe of scaled feature data 442 | n_clu: int 443 | Number of clusters 444 | Returns 445 | ------- 446 | dictionary 447 | Dictionary of patient ids and correspondent 448 | clusters {pid: cl} 449 | """ 450 | kmclu = KMeans(n_clusters=n_clu) 451 | lab_cl = kmclu.fit_predict(df_scaled) 452 | silh = silhouette_score(df_scaled, lab_cl) 453 | logging.info('(*) Number of clusters %d -- Silhouette score %.2f' % (n_clu, silh)) 454 | 455 | num_count = np.unique(lab_cl, return_counts=True)[1] 456 | for idx, nc in enumerate(num_count): 457 | logging.info("Cluster {0} -- Numerosity {1}".format(idx, nc)) 458 | 459 | return {pid: cl for pid, cl in zip(df_scaled.index, lab_cl)} 460 | 461 | 462 | def compare_clustering(cl1, cl2, method): 463 | """Compute cluster comparison score (compare favorite cluster to other clustering techniques), 464 | either Adjusted Mutual Information Score (AMI), or Fowlkes - Mallows Score (FM) 465 | 466 | Parameters 467 | ---------- 468 | cl1: list, array 469 | first clustering labels 470 | cl2: list, array 471 | second clustering labels 472 | method: str 473 | either 'AMI' or 'FM' 474 | Returns 475 | ------- 476 | float 477 | desired score 478 | """ 479 | if method == 'AMI': 480 | return adjusted_mutual_info_score(cl1, cl2, 481 | average_method='arithmetic') 482 | else: 483 | return fowlkes_mallows_score(cl1, cl2) 484 | -------------------------------------------------------------------------------- /behavioral_phenotyping_pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Behavioral Profile Stratification via Unsupervised learning" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "> Behavioral data embeddings for the stratification of individuals\n", 15 | "with neurodevelopmental conditions.\n", 16 | "\n", 17 | "> Designed for observational measurements of cognition and behavior of individuals with \n", 18 | "Autism Spectrum Conditions (ASCs).\n", 19 | "\n", 20 | "* `dataset.py`: Connects to the database and dump data\n", 21 | "* `features.py`: Returns vocabulary and dictionary of behavioral *EHRs* for each of the 4 possible depth levels. 
\n", 22 | "It also returns a dataset with quantitative scores for level 4 features\n", 23 | "* `pt_embedding.py`: Performs TFIDF for patient embeddings; Glove embeddings on words and average them out for \n", 24 | "subject embeddings; Word2vec embeddings on words, that are then averaged to output individual representations\n", 25 | "* `clustering.py`: Performs Hierarchical Clustering/k-means on embeddings, and quantitative 4th level features\n", 26 | "* `visualization.py`: Visualizes results (e.g. _scatterplot & dendrogram_)for sub-cluster visualization; \n", 27 | "_Heatmap_ for inspection of quantitative scores between sub-clusters" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "---\n", 35 | "*Run the cell below to enable logging display in notebook. Otherwise the log info are written to `pipeline.log` file in `./log` folder.*" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "from importlib import reload # Not needed in Python 2\n", 45 | "import logging\n", 46 | "reload(logging)\n", 47 | "logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', \n", 48 | " level=logging.INFO, datefmt='%I:%M:%S')" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "---" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "## Step 1: Data Loading\n", 63 | "\n", 64 | "> The `dataset` module access the database and dumps all the available tables. Information for Data Accessibility should be provided in the `utils.py` file. Then, subject (e.g., adults) and tables (e.g., ados-2 module 4) that need to be excluded are filtered out and dictionaries of subject demographics and encounter information are provided and saved to _.csv_ file. " 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "from dataset import access_db, data_wrangling, cohort_info" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "# # it returns a dictionary of pandas dataframes storing tables from the db\n", 83 | "tables = access_db()" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "# # reduced dictionary (it excludes tables and subjects that are not required, e.g., ados-2modulo4, eas)\n", 93 | "rid_tables = data_wrangling(tables)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "# # it returns dictionary of subjects info and encounters\n", 103 | "pinfo, penc = cohort_info(rid_tables)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "## Step 2: Feature Processing\n", 111 | "\n", 112 | "> Class `DataFeatures`is initialized with the depth level desired. The depth level can range from 1 to 4, where levels 1-3 are sistematically derived from instrument item structures, and level 4 is empirically derived in accordance with clinical experts. According to the levels, _behavioral EHRs_ (bEHRs) and vocabulary of terms are created. 
For each subject, each item score $N$ is considered as a word of the form `instrument_name::item::N`, the sequence of \"words\" chronologically ordered becomes the bEHR for each individual. Moreover, all the behavioral terms obtained are collected into a vocabulary. \n", 113 | "\n", 114 | "> The `create_level_features` method is only available for level 4, due to noise and missingness of data. It represents each subject as a vector of quantitative scores to tests ordered according to 5 timeframes (F1-F5), clinically selected. Missing values are imputed with mean." 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "from features import DataFeatures" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "datafeatures = DataFeatures(level=4, df_dict=rid_tables)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "behr, (bt_to_idx, idx_to_bt) = datafeatures.create_level_tokens()" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "feat_df, feat_df_scaled = datafeatures.create_level_features()" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "## Step 3: Embeddings\n", 158 | "\n", 159 | "> `Pembeddings` class consits of three methods: `tfidf` that outputs patient embeddings from SVD transform of word co-occurrence counts; `word2vec_emb` that computes word embeddings for each behavioral term learned via _continuous Skip-gram model_ (Mikolov et al., 2013) and outputs patient representations averaging out the behavioral terms of their sequence; `glove_pemb` that learns word embeddings via GloVe algorithm (Pennington et al., 2014) and averages out behavioral terms returning patient encodings." 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "from pt_embedding import Pembeddings" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "model = Pembeddings(behr, bt_to_idx)" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "svd_pid_list, svd_mtx = model.tfidf()" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "glove_pid_list, glove_emb, word_emb = model.glove_pemb()" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "w2v_pid_list, w2v_emb, w2v_word_emb, _ = model.word2vec_emb()" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "## Step 4: Clustering\n", 212 | "\n", 213 | "> This module performs _hierarchical clustering_ or _k-means clustering_ techniques on either subject embeddings or feature data. The best number of clusters is chosen via the Elbow Method." 
214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "from clustering import HclustEmbeddings, HclustFeatures, KMeansEmbeddings, KMeansFeatures, compare_clustering\n", 223 | "import utils as ut" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "hclust_emb = HclustEmbeddings(min_cl=ut.min_cl, max_cl=ut.max_cl, \n", 233 | " affinity='euclidean', linkage='ward')\n", 234 | "\n", 235 | "kmclust_emb = KMeansEmbeddings(min_cl=ut.min_cl, max_cl=ut.max_cl)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "### `TF-IDF` Embedding" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": {}, 249 | "outputs": [], 250 | "source": [ 251 | "# TFIDF EMBEDDING\n", 252 | "# tfidf_best_cl = hclust_emb.find_best_nclu(svd_mtx, n_iter=ut.n_iter, \n", 253 | "# subsampl=ut.subsampl)\n", 254 | "tfidf_best_hccl = hclust_emb.elbow_method(svd_mtx)\n", 255 | "tfidf_hcsubc = hclust_emb.fit(svd_mtx, svd_pid_list, tfidf_best_hccl)" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "# # KMeans clustering\n", 265 | "tfidf_best_kmcl = kmclust_emb.elbow_method(svd_mtx)\n", 266 | "tfidf_kmsubc = kmclust_emb.fit(svd_mtx, svd_pid_list, tfidf_best_kmcl)" 267 | ] 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "metadata": {}, 272 | "source": [ 273 | "### `Glove` Embedding" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "# GLOVE EMBEDDING\n", 283 | "# glv_best_cl = hclust_emb.find_best_nclu(glove_emb, n_iter=ut.n_iter, subsampl=ut.subsampl)\n", 284 | "glv_best_hccl = hclust_emb.elbow_method(glove_emb)\n", 285 | "glv_hcsubc = hclust_emb.fit(glove_emb, glove_pid_list, glv_best_hccl)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "glv_best_kmcl = kmclust_emb.elbow_method(glove_emb)\n", 295 | "glv_kmsubc = kmclust_emb.fit(glove_emb, glove_pid_list, glv_best_kmcl)" 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "metadata": {}, 301 | "source": [ 302 | "### `Word2Vec` Embedding" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": null, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "w2v_best_hccl = hclust_emb.elbow_method(w2v_emb)\n", 312 | "w2v_hcsubc = hclust_emb.fit(w2v_emb, w2v_pid_list, w2v_best_hccl)" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "w2v_best_kmcl = kmclust_emb.elbow_method(w2v_emb)\n", 322 | "w2v_kmsubc = kmclust_emb.fit(w2v_emb, w2v_pid_list, w2v_best_kmcl)" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": {}, 328 | "source": [ 329 | "### Feature clustering" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [ 338 | "hclust_feat = HclustFeatures(min_cl=ut.min_cl, max_cl=ut.max_cl, \n", 339 | " affinity='euclidean', linkage='ward')\n", 340 | "kmclust_feat = KMeansFeatures(min_cl=ut.min_cl, max_cl=ut.max_cl)" 341 | ] 342 | }, 343 | { 344 | 
"cell_type": "code", 345 | "execution_count": null, 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "# FEATURES REPRESENTATION\n", 350 | "# feat_best_cl = hclust_feat.find_best_nclu(feat_df_scaled, n_iter=ut.n_iter, subsampl=ut.subsampl)\n", 351 | "feat_best_hccl = hclust_feat.elbow_method(feat_df_scaled)\n", 352 | "feat_hcsubc = hclust_feat.fit(feat_df_scaled, feat_best_hccl)" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": null, 358 | "metadata": {}, 359 | "outputs": [], 360 | "source": [ 361 | "feat_best_kmcl = kmclust_feat.elbow_method(feat_df_scaled)\n", 362 | "feat_kmsubc = kmclust_feat.fit(feat_df_scaled, feat_best_kmcl)" 363 | ] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "metadata": {}, 368 | "source": [ 369 | "## Step 5: Clustering II (Visualization) \n", 370 | "\n", 371 | "> The second clustering module (`visualization`) enables the visualization of dendrogram, and Elbow Method curve for number of clusters selection. Moreover, it allows the visualization of the identified subtypes with scatterplots (UMAP projection visualization technique) and heatmaps for phenotyping (quantitative scores of selected items are highlighted). All these plots are available for both patient embeddings and feature data." 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": null, 377 | "metadata": {}, 378 | "outputs": [], 379 | "source": [ 380 | "from visualization import Visualization" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": null, 386 | "metadata": {}, 387 | "outputs": [], 388 | "source": [ 389 | "viz = Visualization(pinfo, ut.col_dict, ut.c_out)" 390 | ] 391 | }, 392 | { 393 | "cell_type": "markdown", 394 | "metadata": {}, 395 | "source": [ 396 | "### Tf-idf " 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": null, 402 | "metadata": {}, 403 | "outputs": [], 404 | "source": [ 405 | "# # Example of visualization for tfidf embeddings\n", 406 | "# # Prepare data for umap and dendrogram\n", 407 | "# umap_mtx, pid_subc_list = viz.data_scatter_dendrogram(svd_mtx, tfidf_hcsubc, svd_pid_list, random_state=42,\n", 408 | "# n_neighbors = 100,\n", 409 | "# min_dist=0.0)" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "# viz.scatterplot_dendrogram(svd_mtx, umap_mtx, pid_subc_list, 15, 10)" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "metadata": {}, 425 | "outputs": [], 426 | "source": [ 427 | "# # Prepare data for heatmap\n", 428 | "# emb_scaled = viz.data_heatmap_emb(behr, bt_to_idx, tfidf_hcsubc, \n", 429 | "# save_df='df_tfidfemb_level4.csv')" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": null, 435 | "metadata": {}, 436 | "outputs": [], 437 | "source": [ 438 | "# viz.heatmap_emb(emb_scaled, 500, 2000, save_html='tfidf_heatmap_level-4')" 439 | ] 440 | }, 441 | { 442 | "cell_type": "markdown", 443 | "metadata": {}, 444 | "source": [ 445 | "### `GloVe`" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": null, 451 | "metadata": {}, 452 | "outputs": [], 453 | "source": [ 454 | "# Visualization for GloVe embeddings\n", 455 | "# Prepare data for umap and dendrogram\n", 456 | "umap_mtx, pid_subc_list = viz.data_scatter_dendrogram(glove_emb, glv_hcsubc, glove_pid_list, random_state=42,\n", 457 | " n_neighbors = 5,\n", 458 | " min_dist=0.0)" 459 | ] 460 | }, 461 | { 462 | 
"cell_type": "code", 463 | "execution_count": null, 464 | "metadata": {}, 465 | "outputs": [], 466 | "source": [ 467 | "viz.scatterplot_dendrogram(glove_emb, umap_mtx, pid_subc_list, 15, 10, save_fig=None)" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": null, 473 | "metadata": {}, 474 | "outputs": [], 475 | "source": [ 476 | "# Plot UMAP projection of word embeddings via GloVe\n", 477 | "viz.plot_word_embedding(word_emb, idx_to_bt, 800, \n", 478 | " 800,\n", 479 | " n_neighbors = 10,\n", 480 | " min_dist=0.0)" 481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": null, 486 | "metadata": {}, 487 | "outputs": [], 488 | "source": [ 489 | "# Prepare data for heatmap\n", 490 | "emb_scaled = viz.data_heatmap_emb(behr, bt_to_idx, glv_hcsubc, \n", 491 | " save_df=None)" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": null, 497 | "metadata": {}, 498 | "outputs": [], 499 | "source": [ 500 | "viz.heatmap_emb(emb_scaled, 500, 1800, save_html=None)" 501 | ] 502 | }, 503 | { 504 | "cell_type": "markdown", 505 | "metadata": {}, 506 | "source": [ 507 | "### `Word2vec`" 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": null, 513 | "metadata": {}, 514 | "outputs": [], 515 | "source": [ 516 | "# Scatterplot and dendrogram of UMAP projections\n", 517 | "umap_mtx, pid_subc_list = viz.data_scatter_dendrogram(w2v_emb, w2v_hcsubc, w2v_pid_list, random_state=42,\n", 518 | " n_neighbors = 5,\n", 519 | " min_dist=0.0)\n", 520 | "viz.scatterplot_dendrogram(w2v_emb, umap_mtx, pid_subc_list, 15, 10)" 521 | ] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": null, 526 | "metadata": {}, 527 | "outputs": [], 528 | "source": [ 529 | "# Plot UMAP projection of word embeddings via Word2Vec\n", 530 | "viz.plot_word_embedding(w2v_word_emb.transpose(), \n", 531 | " idx_to_bt, \n", 532 | " 800, \n", 533 | " 800,\n", 534 | " n_neighbors =10,\n", 535 | " min_dist=0.0)" 536 | ] 537 | }, 538 | { 539 | "cell_type": "code", 540 | "execution_count": null, 541 | "metadata": {}, 542 | "outputs": [], 543 | "source": [ 544 | "# Prepare data for heatmap\n", 545 | "emb_scaled = viz.data_heatmap_emb(behr, bt_to_idx, w2v_hcsubc, \n", 546 | " save_df=None)\n", 547 | "viz.heatmap_emb(emb_scaled, 500, 1800)" 548 | ] 549 | }, 550 | { 551 | "cell_type": "markdown", 552 | "metadata": {}, 553 | "source": [ 554 | "### Features" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": null, 560 | "metadata": {}, 561 | "outputs": [], 562 | "source": [ 563 | "# Feature data visualization\n", 564 | "# Prepare data for umap and dendrogram\n", 565 | "umap_mtx, pid_subc_list = viz.data_scatter_dendrogram(feat_df_scaled, feat_hcsubc, random_state=42,\n", 566 | " n_neighbors = 10,\n", 567 | " min_dist=0.0)" 568 | ] 569 | }, 570 | { 571 | "cell_type": "code", 572 | "execution_count": null, 573 | "metadata": {}, 574 | "outputs": [], 575 | "source": [ 576 | "viz.scatterplot_dendrogram(feat_df_scaled, umap_mtx, pid_subc_list, 15, 10, save_fig=None)" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": null, 582 | "metadata": {}, 583 | "outputs": [], 584 | "source": [ 585 | "# Prepare data for heatmap\n", 586 | "emb_scaled = viz.data_heatmap_feat(feat_df, feat_df_scaled, feat_hcsubc, \n", 587 | " save_df=None)" 588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": null, 593 | "metadata": {}, 594 | "outputs": [], 595 | "source": [ 596 | "viz.heatmap_feat(emb_scaled, 1000, 
2000, save_html=None)" 597 | ] 598 | }, 599 | { 600 | "cell_type": "markdown", 601 | "metadata": {}, 602 | "source": [ 603 | "---\n" 604 | ] 605 | } 606 | ], 607 | "metadata": { 608 | "kernelspec": { 609 | "display_name": "Python 3", 610 | "language": "python", 611 | "name": "python3" 612 | }, 613 | "language_info": { 614 | "codemirror_mode": { 615 | "name": "ipython", 616 | "version": 3 617 | }, 618 | "file_extension": ".py", 619 | "mimetype": "text/x-python", 620 | "name": "python", 621 | "nbconvert_exporter": "python", 622 | "pygments_lexer": "ipython3", 623 | "version": "3.6.8" 624 | } 625 | }, 626 | "nbformat": 4, 627 | "nbformat_minor": 2 628 | } 629 | -------------------------------------------------------------------------------- /visualization.py: -------------------------------------------------------------------------------- 1 | import umap 2 | from matplotlib import pyplot as plt 3 | from scipy.cluster.hierarchy import dendrogram, linkage 4 | from sklearn.cluster import KMeans 5 | import pandas as pd 6 | from bokeh.models import LinearColorMapper, BasicTicker, PrintfTickFormatter, \ 7 | ColorBar, HoverTool, ColumnDataSource, CategoricalColorMapper 8 | from bokeh.plotting import figure, show, output_notebook, output_file, save 9 | from bokeh.io import export_svgs 10 | import numpy as np 11 | from sklearn.preprocessing import MinMaxScaler 12 | from math import pi 13 | import utils as ut 14 | # Eliminate verbose warnings from Numba 15 | import warnings 16 | 17 | warnings.filterwarnings('ignore') 18 | 19 | 20 | class Visualization: 21 | """Class for the visualization of data and results. It returns: 22 | scatter plot, dendrogram, heatmap, plot Glove embeddings. 23 | """ 24 | 25 | def __init__(self, subject_info, col_dict, c_out): 26 | """ 27 | Parameters 28 | ---------- 29 | subject_info: dictionary 30 | Dictionary with subject demographics (Pinfo dataclass) 31 | as returned by cohort_info method in dataset module 32 | """ 33 | self.c_out = c_out # List of colors to exclude 34 | self.col_dict = col_dict # Dictionary of colors from matplotlib 35 | # colormap = [c for c in self.col_dict if c not in self.c_out] 36 | colormap = ut.colormap 37 | self.colormap = colormap 38 | self.subject_info = subject_info 39 | 40 | @staticmethod 41 | def data_scatter_dendrogram(X, 42 | subc_dict, 43 | pid_list=None, 44 | **kwargs): 45 | """ Prepare the data to be visualized in umap scatterplot and 46 | dendrogram 47 | 48 | Parameters 49 | ---------- 50 | X: array, dataframe 51 | either an array (as returned by patient embedding functions) 52 | or a dataframe (feature dataset) 53 | subc_dict: dictionary 54 | dictionary of pids and subcluster labels 55 | pid_list: list 56 | list of pids as ordered in X 57 | kwargs: kewyword arguments to be passed to UMAP 58 | 59 | Returns 60 | ------- 61 | numpy array 62 | umap projection 63 | list 64 | list of tuple with pid and subcluster label 65 | """ 66 | if isinstance(X, pd.DataFrame): 67 | pid_list = list(X.index) 68 | X = X.to_numpy() 69 | else: 70 | scaler = MinMaxScaler() 71 | X = scaler.fit_transform(X) 72 | 73 | umap_mtx = umap.UMAP(**kwargs).fit_transform(X) 74 | 75 | return umap_mtx, [(pid, subc_dict[pid]) for pid in pid_list] 76 | 77 | @staticmethod 78 | def plot_word_embedding(wemb_mtx, 79 | vocab, 80 | fig_width, 81 | fig_height, 82 | **kwargs): 83 | """ Function plotting word embeddings from Glove/Word2vec after UMAP transformation 84 | 85 | Parameters 86 | ---------- 87 | wemb_mtx: numpy array 88 | word embeddings as stored in vocabulary 89 | vocab: 
dictionary 90 | idx_to_bt dictionary 91 | **kwargs: n_neighbors, min_dist for UMAP module 92 | """ 93 | scaler = MinMaxScaler() 94 | wemb_mtx = scaler.fit_transform((wemb_mtx)) 95 | 96 | umap_mtx = umap.UMAP(**kwargs).fit_transform(wemb_mtx) 97 | 98 | source = ColumnDataSource(data=dict(x=umap_mtx[:, 0], 99 | y=umap_mtx[:, 1], 100 | words=list(vocab.values()))) 101 | 102 | TOOLTIPS = [('word', '@words')] 103 | 104 | plotTools = 'box_zoom, wheel_zoom, pan, crosshair, reset, save' 105 | 106 | p = figure(plot_width=fig_width, 107 | plot_height=fig_height, 108 | tools=plotTools) 109 | p.add_tools(HoverTool(tooltips=TOOLTIPS)) 110 | p.scatter(x='x', y='y', size=8, source=source) 111 | 112 | show(p) 113 | 114 | def data_heatmap_feat(self, X, X_scaled, subc_dict, save_df=None): 115 | """ Prepare data as input to heatmap feature. 116 | 117 | Parameters 118 | ---------- 119 | X: dataframe 120 | Dataframe with raw feature values 121 | X_scaled: dataframe 122 | Dataframe with scaled feature values 123 | subc_dict: dictionary 124 | Dictionary with subject ids and subcluster labels 125 | save_df: str 126 | if not None it stores the file name for csv dump 127 | 128 | Returns 129 | ------- 130 | dataframe 131 | Object with both scaled and raw values. A column with subcluster and 132 | subject id is added. 133 | """ 134 | label = {'0': 'SI', 135 | '1': 'SII', 136 | '2': 'SIII', 137 | '3': 'SIV', 138 | '4': 'SV', 139 | '5': 'SVI', 140 | '6': 'SVII', 141 | '7': 'SVIII', 142 | '8': 'SIX', 143 | '9': 'SX', 144 | '10': 'SXI', 145 | '11': 'SXII', 146 | '12': 'SXIII', 147 | '13': 'SXIV'} 148 | 149 | X_scaled = pd.DataFrame(X_scaled.sort_index().stack(), 150 | columns=['score_sc']).reset_index() 151 | X_scaled.columns = ['clpid', 'feat', 'score_sc'] 152 | X_scaled['clpid'] = ['-'.join([label[str(subc_dict[pid])], str(pid)]) 153 | for pid in X_scaled.clpid] 154 | X_scaled = X_scaled.sort_values(by='feat') 155 | 156 | X = pd.DataFrame(X.sort_index().stack(), columns=['score']).reset_index() 157 | X.columns = ['pid', 'feat', 'score'] 158 | X = X.sort_values(by='feat') 159 | 160 | X_scaled['score'] = X['score'] 161 | 162 | X_scaled = self._modify_df(X_scaled) 163 | 164 | if save_df is not None: 165 | X_scaled.to_csv(f'./data/{save_df}', 166 | index=False) 167 | return X_scaled 168 | 169 | def data_heatmap_emb(self, X, vocab, subc_dict, save_df=None): 170 | """ Prepare data as input to heatmap embeddings. 171 | 172 | Parameters 173 | ---------- 174 | X: dictionary 175 | BEHR dictionary 176 | vocab: dictionary 177 | bt_to_idx vocabulary 178 | subc_dict: dictionary 179 | Dictionary with pid and subcluster labels 180 | save_df: str 181 | if not None, it stores the name for csv dump file 182 | 183 | Returns 184 | ------- 185 | dataframe 186 | Dataframe with raw scores and scaled scores for subclusters. 187 | clpid columns with joined subcluster label and pid. 
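            Multiple scores per assessment are averaged, missing values are
            mean-imputed, and scores are min-max scaled before stacking.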
188 | """ 189 | label = {'0': 'SI', 190 | '1': 'SII', 191 | '2': 'SIII', 192 | '3': 'SIV', 193 | '4': 'SV', 194 | '5': 'SVI', 195 | '6': 'SVII', 196 | '7': 'SVIII'} 197 | 198 | # Build feature list 199 | c_lab = sorted(set(['::'.join(lab.split('::')[:-1]) 200 | for lab in vocab.keys()])) 201 | 202 | dict_age = {} 203 | for p, behr in X.items(): 204 | for vect in behr: 205 | if (p, vect[1]) not in dict_age: 206 | dict_age[(p, vect[1])] = {} 207 | for t in vect[2:]: 208 | ss = t.split('::') 209 | dict_age[(p, vect[1])].setdefault('::'.join(ss[:-1]), 210 | list()).append(int(ss[-1])) 211 | # Create dataframe with cl-pid as index 212 | val_dict = {} 213 | indx = [] 214 | for vect in sorted(list(dict_age.keys())): 215 | for f in c_lab: 216 | try: 217 | if len(dict_age[vect][f]) == 1: 218 | val_dict.setdefault(f, list()).extend(dict_age[vect][f]) 219 | else: # Mean of scores if multiple score per assessment 220 | val_dict.setdefault(f, list()).append(np.mean(dict_age[vect][f])) 221 | except KeyError: 222 | val_dict.setdefault(f, list()).append(None) 223 | indx.append(('-'.join([label[str(subc_dict[vect[0]])], vect[0]]), vect[1])) 224 | 225 | # create dataframe with cl-pi as index 226 | emb_df = pd.DataFrame(val_dict, index=indx) 227 | emb_df_imp = emb_df.fillna(emb_df.mean(), inplace=False) 228 | 229 | scaler = MinMaxScaler() 230 | emb_df_scaled = scaler.fit_transform(emb_df_imp.values) 231 | emb_df_scaled = pd.DataFrame(emb_df_scaled, index=indx, 232 | columns=emb_df.columns) 233 | 234 | emb_df = pd.DataFrame(emb_df.stack(dropna=False), 235 | columns=['score']).reset_index() 236 | emb_df_scaled = pd.DataFrame(emb_df_scaled.stack(), 237 | columns=['score_sc']).reset_index() 238 | emb_df_scaled['score'] = emb_df['score'] 239 | emb_df_scaled.columns = ['cllab_aoa', 'feat', 'score_sc', 'score'] 240 | 241 | emb_df_scaled = self._modify_df(emb_df_scaled) 242 | 243 | emb_df_scaled['clpid'] = [tup[0] for tup in emb_df_scaled['cllab_aoa']] 244 | emb_df_scaled['aoa'] = [tup[1] for tup in emb_df_scaled['cllab_aoa']] 245 | 246 | emb_df_scaled = emb_df_scaled.dropna() 247 | 248 | if save_df is not None: 249 | emb_df_scaled.to_csv(f'./data/{save_df}', 250 | index=False) 251 | 252 | return emb_df_scaled 253 | 254 | def scatterplot_dendrogram(self, 255 | X, 256 | umap_mtx, 257 | pid_subc_list, 258 | fig_height, 259 | fig_width, 260 | save_fig=None): 261 | """Scatterplot and dendrogram for clustering. The elbow method plot is also displayed. 
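        Dendrogram links are colored by subcluster when both children share the
        same label (gray otherwise); the elbow plot shows the last merge
        distances of the linkage together with their second differences.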
262 | 263 | Parameters 264 | ---------- 265 | X: numpy array or dataframe 266 | dendrogram input 267 | umap_mtx: np array 268 | Umap projections of patients 269 | pid_subc_list: list of tuples 270 | list of pid and subclusters tuples as ordered in X 271 | fig_height, fig_width: int 272 | save_fig: str name of the figure (only method and level) 273 | """ 274 | 275 | # Scale embedding data matrix 276 | if not isinstance(X, pd.DataFrame): 277 | scaler = MinMaxScaler() 278 | X = scaler.fit_transform(X) 279 | 280 | subc_list = [el[1] for el in pid_subc_list] 281 | label = {'0': 'Subgroup I', 282 | '1': 'Subgroup II', 283 | '2': 'Subgroup III', 284 | '3': 'Subgroup IV', 285 | '4': 'Subgroup V', 286 | '5': 'Subgroup VI', 287 | '6': 'Subgroup VII', 288 | '7': 'Subgroup VIII', 289 | '8': 'Subgroup IX', 290 | '9': 'Subgroup X', 291 | '10': 'Subgroup XI', 292 | '11': 'Subgroup XII', 293 | '12': 'Subgroup XIII', 294 | '13': 'Subgroup XIV'} 295 | colors = [self.colormap[cl] for cl in sorted(list(set(subc_list)))] 296 | # Bokeh scatterplot 297 | self._scatter_plot(umap_mtx, pid_subc_list, colors, fig_width, fig_height, label, save_fig) 298 | 299 | # Dendrogram 300 | linked = linkage(X, 'ward') 301 | # Color mapping 302 | dflt_col = "#808080" # Unclustered gray 303 | # * rows in Z correspond to "inverted U" links that connect clusters 304 | # * rows are ordered by increasing distance 305 | # * if the colors of the connected clusters match, use that color for link 306 | link_cols = {} 307 | for idx, lidx in enumerate(linked[:, :2].astype(int)): 308 | c1, c2 = (link_cols[x] if x > len(linked) else colors[subc_list[x]] 309 | for x in lidx) 310 | link_cols[idx + 1 + len(linked)] = c1 if c1 == c2 else dflt_col 311 | 312 | plt.figure(figsize=(5, 5)) 313 | dendrogram(Z=linked, 314 | # labels=np.array([str(int(i) + 1) for i in subc_list]), 315 | labels=np.array([''] * len(subc_list)), 316 | color_threshold=None, 317 | leaf_font_size=5, leaf_rotation=0, 318 | link_color_func=lambda x: link_cols[x]) 319 | if save_fig is None: 320 | plt.show() 321 | else: 322 | plt.savefig(f'./data/{save_fig}-dendrogram.eps') 323 | plt.close() 324 | 325 | # Elbow method with clusters ranging from 2 to 15 326 | plt.figure(figsize=(5, 5)) 327 | last = linked[-15:, 2] 328 | last_rev = last[::-1] 329 | idxs = np.arange(1, len(last) + 1, dtype=int) 330 | plt.plot(idxs, last_rev) 331 | 332 | acceleration = np.diff(last, 2) # 2nd derivative of the distances 333 | acceleration_rev = acceleration[::-1] 334 | plt.plot(idxs[:-2] + 1, acceleration_rev) 335 | plt.xticks(idxs) 336 | if save_fig is None: 337 | plt.show() 338 | else: 339 | plt.savefig(f'./data/{save_fig}-elbow.eps') 340 | plt.close() 341 | 342 | def scatterplot_kmeans(self, 343 | X, 344 | umap_mtx, 345 | pid_subc_list, 346 | fig_height, 347 | fig_width): 348 | """Scatterplot and elbow method for KMeans clustering. 
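        K-means inertia is computed for k = 1, ..., 14 and plotted together
        with its second differences (acceleration) to support the choice of k.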
349 | 350 | Parameters 351 | ---------- 352 | X: numpy array or dataframe 353 | dendrogram input 354 | umap_mtx: np array 355 | Umap projections of patients 356 | pid_subc_list: list of tuples 357 | list of pid and subclusters tuples as ordered in X 358 | fig_height, fig_width: int 359 | """ 360 | 361 | # Scale embedding data matrix 362 | if not isinstance(X, pd.DataFrame): 363 | scaler = MinMaxScaler() 364 | X = scaler.fit_transform(X) 365 | else: 366 | X = X.to_numpy() 367 | 368 | subc_list = [el[1] for el in pid_subc_list] 369 | 370 | colors = [self.colormap[cl] for cl in sorted(list(set(subc_list)))] 371 | # Bokeh scatterplot 372 | self._scatter_plot(umap_mtx, pid_subc_list, colors, fig_width, fig_height) 373 | 374 | # Elbow method with clusters ranging from 2 to 15 375 | inertia = [] # Sum of square differences of samples from cluster centers 376 | K = np.arange(1, 15, dtype=int) 377 | 378 | for k in K: 379 | kmean_model = KMeans(n_clusters=k).fit(X) 380 | inertia.append(kmean_model.inertia_) 381 | 382 | plt.plot(K, inertia) 383 | 384 | acceleration = np.diff(inertia, 2) # 2nd derivative of the distances 385 | plt.plot(K[:-2] + 1, acceleration) 386 | plt.xticks(K) 387 | plt.show() 388 | 389 | @staticmethod 390 | def heatmap_feat(X_scaled, 391 | fig_height, 392 | fig_width, 393 | save_html=None, 394 | save_svg=None): 395 | """ Bokeh heatmap for the visualization of scaled scores in the 396 | different subclusters. Hovertool displaying subject info and raw 397 | scores. 398 | 399 | Parameters 400 | ---------- 401 | X_scaled: dataframe 402 | Feature scaled scores 403 | fig_height, fig_width: int 404 | save_html: str file name 405 | save_svg: str svg file name 406 | """ 407 | X_scaled = X_scaled.replace({'F1::psi-sf::padre::raw_ts': 'F1::psi-sf::caretakerm::raw_ts', 408 | 'F1::psi-sf::madre::raw_ts': 'F1::psi-sf::caretakerf::raw_ts', 409 | 'F2::psi-sf::padre::raw_ts': 'F2::psi-sf::caretakerm::raw_ts', 410 | 'F2::psi-sf::madre::raw_ts': 'F2::psi-sf::caretakerf::raw_ts', 411 | 'F3::psi-sf::padre::raw_ts': 'F3::psi-sf::caretakerm::raw_ts', 412 | 'F3::psi-sf::madre::raw_ts': 'F3::psi-sf::caretakerf::raw_ts', 413 | 'F4::psi-sf::padre::raw_ts': 'F4::psi-sf::caretakerm::raw_ts', 414 | 'F4::psi-sf::madre::raw_ts': 'F4::psi-sf::caretakerf::raw_ts', 415 | 'F5::psi-sf::padre::raw_ts': 'F5::psi-sf::caretakerm::raw_ts', 416 | 'F5::psi-sf::madre::raw_ts': 'F5::psi-sf::caretakerf::raw_ts' 417 | }) 418 | 419 | colors = ["#75968f", "#a5bab7", "#c9d9d3", "#e2e2e2", 420 | "#dfccce", "#ddb7b1", "#cc7878", "#933b41", 421 | "#550b1d"] 422 | 423 | mapper = LinearColorMapper(palette=colors, 424 | low=X_scaled.score_sc.min(), 425 | high=X_scaled.score_sc.max()) 426 | output_notebook() 427 | p = figure(x_range=sorted(list(set(X_scaled['clpid']))), 428 | y_range=sorted(list(set(X_scaled['feat']))), 429 | x_axis_location="above", 430 | plot_width=fig_width, 431 | plot_height=fig_height, 432 | toolbar_location='below') 433 | 434 | TOOLTIPS = [('clpid', '@clpid'), 435 | ('sex', '@sex'), 436 | ('bdate', '@bdate'), 437 | ('feat', '@feat'), 438 | ('score', '@score'), 439 | ('n_enc', '@n_enc')] 440 | 441 | p.add_tools(HoverTool(tooltips=TOOLTIPS)) 442 | 443 | p.grid.grid_line_color = None 444 | p.axis.axis_line_color = None 445 | p.axis.major_tick_line_color = None 446 | p.xaxis.major_label_text_font_size = "7pt" 447 | p.yaxis.major_label_text_font_size = "7pt" 448 | p.axis.major_label_standoff = 0 449 | p.xaxis.major_label_orientation = pi / 2 450 | 451 | p.rect(x="clpid", y="feat", 452 | width=1, height=1, 453 | 
source=X_scaled, 454 | fill_color={'field': 'score_sc', 455 | 'transform': mapper}, 456 | line_color=None) 457 | 458 | color_bar = ColorBar(color_mapper=mapper, major_label_text_font_size="8pt", 459 | ticker=BasicTicker(desired_num_ticks=len(colors)), 460 | formatter=PrintfTickFormatter(format="%.2f"), 461 | label_standoff=6, border_line_color=None, location=(0, 0)) 462 | p.add_layout(color_bar, 'right') 463 | if save_html is not None: 464 | output_file(f'./data/{save_html}.html') 465 | save(p) 466 | elif save_svg is not None: 467 | p.output_backend = 'svg' 468 | export_svgs(p, f'./data/{save_svg}.svg') 469 | else: 470 | show(p) 471 | 472 | @staticmethod 473 | def heatmap_emb(emb_df_scaled, 474 | fig_height, 475 | fig_width, 476 | save_html=None, 477 | save_svg=None): 478 | """ Bokeh heatmap of scaled scores for patient embedding subclusters. 479 | Hovertool with subject info and subject raw scores. 480 | 481 | Parameters 482 | ---------- 483 | emb_df_scaled: dataframe 484 | output of data_heatmap_emb 485 | fig_height, fig_width: int 486 | save_html: str file name 487 | save_svg: str file name 488 | """ 489 | 490 | emb_df_scaled = emb_df_scaled.replace({'psi-sf::padre::raw_ts': 'psi-sf::caretakerm::raw_ts', 491 | 'psi-sf::madre::raw_ts': 'psi-sf::caretakerf::raw_ts'}) 492 | colors = ["#75968f", "#a5bab7", "#c9d9d3", "#e2e2e2", 493 | "#dfccce", "#ddb7b1", "#cc7878", "#933b41", 494 | "#550b1d"] 495 | 496 | mapper = LinearColorMapper(palette=colors, 497 | low=emb_df_scaled.score_sc.min(), 498 | high=emb_df_scaled.score_sc.max()) 499 | 500 | # output_notebook() 501 | p = figure(x_range=sorted(list(set(emb_df_scaled['clpid']))), 502 | y_range=sorted(list(set(emb_df_scaled['feat']))), 503 | x_axis_location="above", 504 | plot_width=fig_width, 505 | plot_height=fig_height, 506 | toolbar_location='below') 507 | 508 | TOOLTIPS = [('clpid', '@clpid'), 509 | ('sex', '@sex'), 510 | ('bdate', '@bdate'), 511 | ('aoa', '@aoa'), 512 | ('feat', '@feat'), 513 | ('score', '@score'), 514 | ('n_enc', '@n_enc')] 515 | 516 | p.add_tools(HoverTool(tooltips=TOOLTIPS)) 517 | 518 | p.grid.grid_line_color = None 519 | p.axis.axis_line_color = None 520 | p.axis.major_tick_line_color = None 521 | p.xaxis.major_label_text_font_size = "7pt" 522 | p.yaxis.major_label_text_font_size = "7pt" 523 | p.axis.major_label_standoff = 0 524 | p.xaxis.major_label_orientation = pi / 2 525 | 526 | p.rect(x="clpid", y="feat", 527 | width=1, height=1, 528 | source=emb_df_scaled, 529 | fill_color={'field': 'score_sc', 530 | 'transform': mapper}, 531 | line_color=None) 532 | 533 | color_bar = ColorBar(color_mapper=mapper, major_label_text_font_size="8pt", 534 | ticker=BasicTicker(desired_num_ticks=len(colors)), 535 | formatter=PrintfTickFormatter(format="%.2f"), 536 | label_standoff=6, border_line_color=None, location=(0, 0)) 537 | p.add_layout(color_bar, 'right') 538 | if save_html is not None: 539 | output_file(f'./data/{save_html}.html') 540 | save(p) 541 | elif save_svg is not None: 542 | p.output_backend = 'svg' 543 | export_svgs(p, f'./data/{save_svg}.svg') 544 | else: 545 | show(p) 546 | 547 | def _scatter_plot(self, 548 | umap_mtx, 549 | pid_subc_list, 550 | colors, 551 | fig_height, 552 | fig_width, 553 | label, 554 | save_fig): 555 | """Bokeh scatterplot to visualize in jupyter clusters and subject info. 
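        Points are UMAP projections colored by subcluster; the hover tool shows
        subject id, subgroup, sex, birth date and number of encounters. If
        save_fig is not None the plot is exported to SVG instead of shown.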
556 | 557 | Parameters 558 | ---------- 559 | umap_mtx: np array 560 | Array with UMAP projections 561 | pid_subc_list: list of tuples 562 | list of pids ordered as in umap_mtx and subcluster labels 563 | colors: list 564 | Color list 565 | fig_height, fig_width: int 566 | Figure dimensions 567 | label: dict dictionary of class numbers and subtype labels 568 | save_fig: str file name 569 | """ 570 | 571 | pid_list = list(map(lambda x: x[0], pid_subc_list)) 572 | subc_list = list(map(lambda x: x[1], pid_subc_list)) 573 | df_dict = {'x': umap_mtx[:, 0].tolist(), 574 | 'y': umap_mtx[:, 1].tolist(), 575 | 'pid_list': pid_list, 576 | 'subc_list': subc_list} 577 | 578 | df = pd.DataFrame(df_dict).sort_values('subc_list') 579 | 580 | source = ColumnDataSource(dict( 581 | x=df['x'].tolist(), 582 | y=df['y'].tolist(), 583 | pid=df['pid_list'].tolist(), 584 | subc=list(map(lambda x: label[str(x)], df['subc_list'].tolist())), 585 | col_class=[str(i) for i in df['subc_list'].tolist()], 586 | bdate=[self.subject_info[pid].dob for pid in df['pid_list'].tolist()], 587 | sex=[self.subject_info[pid].sex for pid in df['pid_list'].tolist()], 588 | n_enc=[self.subject_info[pid].n_enc for pid in df['pid_list'].tolist()])) 589 | 590 | labels = [str(i) for i in df['subc_list']] 591 | cmap = CategoricalColorMapper(factors=sorted(pd.unique(labels)), 592 | palette=colors) 593 | TOOLTIPS = [('pid', '@pid'), 594 | ('subc', '@subc'), 595 | ('sex', '@sex'), 596 | ('bdate', '@bdate'), 597 | ('n_enc', '@n_enc')] 598 | 599 | plotTools = 'box_zoom, wheel_zoom, pan, crosshair, reset, save' 600 | 601 | output_notebook() 602 | p = figure(plot_width=fig_width * 50, plot_height=fig_height * 50, 603 | tools=plotTools, title='Quantitative features') 604 | p.add_tools(HoverTool(tooltips=TOOLTIPS)) 605 | p.circle('x', 'y', legend='subc', source=source, 606 | color={'field': 'col_class', 607 | # "field": 'subc', 608 | "transform": cmap}, size=8) 609 | p.xaxis.major_tick_line_color = None 610 | p.xaxis.minor_tick_line_color = None 611 | p.yaxis.major_tick_line_color = None 612 | p.yaxis.minor_tick_line_color = None 613 | p.xaxis.major_label_text_color = None 614 | p.yaxis.major_label_text_color = None 615 | p.grid.grid_line_color = None 616 | p.legend.location = 'bottom_right' 617 | if save_fig is None: 618 | show(p) 619 | else: 620 | p.output_backend = 'svg' 621 | export_svgs(p, f'./data/{save_fig}-scatterplot.svg') 622 | 623 | def _modify_df(self, df): 624 | """ Adds subject info to dataframe for heatmaps 625 | 626 | Parameters 627 | ---------- 628 | df: dataframe 629 | Stacked scaled dataframe with cl-pid column 630 | 631 | Returns 632 | ------- 633 | dataframe 634 | Dataframe with subject demographic info and number 635 | of encounters 636 | """ 637 | 638 | sex_vect = [] 639 | bdate_vect = [] 640 | n_enc_vect = [] 641 | for pid in df.iloc[:, 0]: 642 | if isinstance(pid, str): 643 | slab = pid.split('-')[1] 644 | sex_vect.append(self.subject_info[slab].sex) 645 | bdate_vect.append(self.subject_info[slab].dob) 646 | n_enc_vect.append(self.subject_info[slab].n_enc) 647 | else: 648 | slab = pid[0].split('-')[1] 649 | sex_vect.append(self.subject_info[slab].sex) 650 | bdate_vect.append(self.subject_info[slab].dob) 651 | n_enc_vect.append(self.subject_info[slab].n_enc) 652 | 653 | df['sex'] = sex_vect 654 | df['bdate'] = bdate_vect 655 | df['n_enc'] = n_enc_vect 656 | 657 | return df 658 | --------------------------------------------------------------------------------
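To make the cluster-number selection and cluster-comparison logic used in `clustering.py` concrete, below is a minimal, self-contained sketch. It relies only on standard scikit-learn calls; the synthetic `make_blobs` data standing in for a patient-embedding matrix, the variable names, and the chosen parameters are illustrative assumptions, not part of the repository.

```python
# Illustrative sketch (not a repository module): elbow-based selection of k via
# the second difference ("acceleration") of the k-means inertia curve, followed
# by a clustering comparison, mirroring compare_clustering(cl1, cl2, method).
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import adjusted_mutual_info_score, fowlkes_mallows_score
from sklearn.preprocessing import MinMaxScaler

# Synthetic stand-in for a (n_subjects x n_dims) patient-embedding matrix
X, true_labels = make_blobs(n_samples=200, centers=4, n_features=10,
                            random_state=42)
X = MinMaxScaler().fit_transform(X)

# Inertia (within-cluster sum of squares) for k = 1, ..., max_cl - 1
max_cl = 10
inertia = [KMeans(n_clusters=k, random_state=42).fit(X).inertia_
           for k in range(1, max_cl)]

# Elbow = k at which the acceleration of the inertia curve peaks;
# index 0 of the second difference corresponds to k = 2, hence the "+ 2"
best_k = int(np.diff(inertia, 2).argmax()) + 2
print(f"Selected number of clusters: {best_k}")

# Cluster with the selected k and compare the labeling against the ground truth
pred_labels = KMeans(n_clusters=best_k, random_state=42).fit_predict(X)
print("AMI:", adjusted_mutual_info_score(true_labels, pred_labels,
                                         average_method='arithmetic'))
print("FM:", fowlkes_mallows_score(true_labels, pred_labels))
```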