├── .gitignore ├── requirements.txt ├── datamap.py ├── README.md ├── basic_statistics.py ├── test-demog-cl.R ├── dataset.py ├── pt_embedding.py ├── features.py ├── clustering.py ├── behavioral_phenotyping_pipeline.ipynb └── visualization.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | .*/ 3 | *.csv 4 | *.pkl 5 | data/ 6 | logs/ 7 | tmp/ 8 | utils.py 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | sqlalchemy 2 | pandas 3 | numpy 4 | scikit-learn 5 | scipy 6 | glove 7 | jupyter 8 | matplotlib 9 | bokeh 10 | umap-learn 11 | dataclasses 12 | google-api-python-client 13 | gsheets 14 | oauth2client 15 | httplib2 16 | seaborn 17 | torch -------------------------------------------------------------------------------- /datamap.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import pandas as pd 3 | from utils import select_clm 4 | import numpy as np 5 | 6 | flags = None 7 | logger = logging.getLogger('datamap') 8 | 9 | 10 | def levels_datamap(table_dict): 11 | """ Returns a dataframe with a boolean vector per 12 | instrument per level to select the columns correspondent 13 | to the desired level 14 | 15 | Parameters 16 | ---------- 17 | table_dict: dict 18 | dictionary of tables (df) from the database already 19 | filtered 20 | Returns 21 | ------- 22 | dataframe 23 | instruments x levels, each element is a boolean vector 24 | """ 25 | cselect_dict = {} 26 | insname_list = [] 27 | for table, df in table_dict.items(): 28 | insname_list.append(table) 29 | for lev in range(1, 5): 30 | if table in select_clm[lev]: 31 | clm_list = _col_select(lev, table, 32 | df.columns) 33 | cselect_dict.setdefault(lev, list()).append(clm_list) 34 | else: 35 | logger.info("Not considered table {0}".format(table)) 36 | selectcol_df = pd.DataFrame(cselect_dict, 37 | index=insname_list).sort_index() 38 | return selectcol_df 39 | 40 | 41 | """ 42 | Private Functions 43 | """ 44 | 45 | 46 | def _col_select(lev, instrument, 47 | clm_names): 48 | """ Given a table and a depth level, it returns a boolean array 49 | storing the columns to select. 50 | 51 | Parameters 52 | ---------- 53 | lev: int 54 | Level depth 55 | instrument: str 56 | Instrument name 57 | clm_names: Index object 58 | 59 | Returns 60 | ------- 61 | array 62 | Array of integers with the columns to select 63 | """ 64 | cselect_list = np.array([1, 1, 1, 1], dtype=int) 65 | for col in clm_names[4:]: 66 | if col in select_clm[lev][instrument]: 67 | cselect_list = np.append(cselect_list, [int(1)]) 68 | else: 69 | cselect_list = np.append(cselect_list, [int(0)]) 70 | return cselect_list 71 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Behavioral phenotyping project 2 | 3 | Behavioral data embeddings for the stratification of individuals 4 | with neurodevelopmental conditions. 5 | 6 | Designed for observational measurements of cognition and behavior of individuals with 7 | Autism Spectrum Conditions (ASCs). 8 | 9 | #### TODO: Abstract 10 | 11 | ### Technical Requirements 12 | 13 | ``` 14 | Python 3.6+ 15 | 16 | R 3.4+ 17 | ``` 18 | 19 | The full list of required Python Packages is available in `requrirements.txt` file. 
It is possible 20 | to install all the dependencies by running: 21 | 22 | ```bash 23 | $ pip install -r requirements.txt 24 | ``` 25 | 26 | ## Behavioural Phenotyping Pipeline (TLDR ;)) 27 | 28 | A complete example of the _Behavioural Phenotype Stratification_ is available 29 | as a Jupyter notebook: 30 | 31 | ``` 32 | jupyter notebook behavioral_phenotyping_pipeline.ipynb 33 | ``` 34 | 35 | ### Documentation (at a glance) 36 | 37 | The code is structured into multiple modules (`.py` files), including algorithms and methods 38 | for the multiple steps of the pipeline: 39 | 40 | * `dataset.py`: Connects to the database and dumps data 41 | * `features.py`: Returns vocabulary and dictionary of behavioral *EHRs* for each of the 4 possible depth levels. 42 | It also returns a dataset with quantitative scores for level 4 features 43 | * `pt_embedding.py`: Performs TFIDF for patient embeddings; GloVe embeddings on words, averaged to obtain 44 | subject embeddings; Word2vec embeddings on words, which are then averaged to output individual representations 45 | * `clustering.py`: Performs Hierarchical Clustering/k-means on embeddings, and on quantitative 4th-level features 46 | * `visualization.py`: Visualizes results (e.g., _scatterplot & dendrogram_) for sub-cluster visualization; 47 | _Heatmap_ for inspection of quantitative scores between sub-clusters 48 | * `basic_statistics.py`: Returns basic demographic statistics for dataset description 49 | * `test-demog-cl.R`: Runs multiple pairwise comparisons between subgroups 50 | to check for confounders and support clinical validation 51 | 52 | 53 | #### TODO: Paper, Poster, Conference Reference 54 | 55 | #### TODO: Credits and Acknowledgements 56 | 57 | 58 | -------------------------------------------------------------------------------- /basic_statistics.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import utils as ut 3 | import os 4 | import logging 5 | from datetime import datetime 6 | import matplotlib.pyplot as plt 7 | 8 | # Create a custom logger, logging to file 9 | logger = logging.getLogger('descriptive_statistics') 10 | 11 | # Create handlers 12 | c_handler = logging.FileHandler('./logs/descriptive_statistics.log', 13 | mode='w') 14 | c_handler.setLevel(logging.INFO) 15 | 16 | # Create formatters and add it to handlers 17 | c_format = logging.Formatter('%(message)s') 18 | c_handler.setFormatter(c_format) 19 | 20 | # Add handlers to the logger 21 | logger.addHandler(c_handler) 22 | 23 | 24 | class DataStatistics: 25 | """Class for data statistics computation.""" 26 | 27 | def compute(self, data_dir): 28 | """Compute basic statistics and save output to log file.
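Reads person-demographics.csv and person-encounters.csv from the given data folder, logs subject counts, sex distribution, instrument and assessment summaries and the covered time span, and saves a per-subject encounter histogram (n-encounter.png).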
29 | 30 | Parameter 31 | --------- 32 | data_dir: str 33 | directory name where to save log file 34 | """ 35 | pd.set_option('float_format', '{:.3f}'.format) 36 | 37 | dem = pd.read_csv(os.path.join(ut.DATA_FOLDER_PATH, data_dir, 38 | 'person-demographics.csv'), 39 | sep=',', 40 | header=0) 41 | enc = pd.read_csv(os.path.join(ut.DATA_FOLDER_PATH, data_dir, 42 | 'person-encounters.csv'), 43 | sep=',', 44 | header=0) 45 | dem['AGE'] = list(map(lambda x: self.__age(x), dem.DOB.tolist())) 46 | 47 | logger.info('N of subjects: %d\n', len(dem.ID_SUBJ.unique())) 48 | logger.info('%s\n', pd.crosstab(dem.SEX, columns='count')) 49 | logger.info('%s\n', 50 | dem.describe()) 51 | 52 | logger.info("Instrument list:") 53 | for ins in sorted(enc.INSTRUMENT.unique()): 54 | logger.info('%s', ins) 55 | logger.info('\n%s\n', 56 | enc.describe()) 57 | # Consider assessment as number of administered instruments 58 | ass_dict = {} 59 | for _, row in enc.iterrows(): 60 | ass_dict.setdefault(row.ID_SUBJ, list()).append(row.INSTRUMENT) 61 | count_ass = {'pid': list(ass_dict.keys()), 62 | 'ass_count': [len(ass_dict[pid]) for pid in ass_dict]} 63 | logger.info("Assessment (i.e., administered instrument counts) statistics:") 64 | logger.info('%s\n', pd.DataFrame(count_ass).describe()) 65 | 66 | # return period span 67 | doa_vec = [list(map(int, el.split('/'))) for el in enc.DOA.tolist()] 68 | doa_min = min(doa_vec, key=lambda x: (x[-1], x[1])) 69 | doa_max = max(doa_vec, key=lambda x: (x[-1], x[1])) 70 | logger.info(f'Period span: {doa_min} -- {doa_max}\n') 71 | 72 | # plot histogram with number of encounters 73 | plt.figure(figsize=(40, 20)) 74 | plt.bar(dem.ID_SUBJ, dem.N_ENC) 75 | plt.tick_params(axis='x', rotation=90) 76 | plt.tick_params(axis='y', labelsize=30) 77 | plt.savefig(os.path.join(ut.DATA_FOLDER_PATH, 78 | data_dir, 79 | 'n-encounter.png')) 80 | plt.close() 81 | 82 | @staticmethod 83 | def __age(dob): 84 | """ 85 | Parameters 86 | ---------- 87 | dob: str 88 | date of birth in format %d/%m/%Y 89 | 90 | Return 91 | ------ 92 | float 93 | age from birth date 94 | """ 95 | days_in_year = 365.2425 96 | dt_dob = datetime.strptime(dob, '%d/%m/%Y') 97 | current_age = (datetime.today() - dt_dob).days / days_in_year 98 | return current_age 99 | -------------------------------------------------------------------------------- /test-demog-cl.R: -------------------------------------------------------------------------------- 1 | # Post-hoc analyses: 2 | # - Check confounders; 3 | # - Compare variable scores; 4 | # - Run external validation (TBD). 5 | 6 | # LIBRARIES 7 | require(eeptools) 8 | require(reshape2) 9 | require(ggplot2) 10 | require(GGally) 11 | require(plyr) 12 | require(tidyr) 13 | 14 | # FUNCTIONS 15 | # Pairwise chi-square test function 16 | pairwise.chisq.test <- function(x, g, p.adjust.method = p.adjust.methods, ...) 
{ 17 | DNAME <- paste(deparse(substitute(x)), "and", deparse(substitute(g))) 18 | g <- factor(g) 19 | p.adjust.method <- match.arg(p.adjust.method) 20 | 21 | compare.levels <- function(i, j) { 22 | idx <- which(as.integer(g) == i | as.integer(g) == j) 23 | xij <- x[idx] 24 | gij <- as.character(g[idx]) 25 | gij <- as.factor(gij) 26 | print(table(xij, gij)) 27 | chisq.test(xij, gij, ...)$p.value 28 | } 29 | PVAL <- pairwise.table(compare.levels, levels(g), p.adjust.method) 30 | ans <- list(method = "chi-squared test", 31 | data.name = DNAME, 32 | p.value = PVAL, 33 | p.adjust.method = p.adjust.method) 34 | class(ans) <- "pairwise.htest" 35 | ans 36 | } 37 | 38 | # DATA PATH AND FILE 39 | DATA_PATH <- '~/Documents/behavioral_phenotyping/data' 40 | FILE_NAME <- 'df_w2vemb_level4.csv' 41 | PLOT_NAME <- 'feat_dist_hc_w2v_level4.pdf' 42 | 43 | # RUN ANALYSES 44 | # Read table 45 | df <- read.table(file.path(DATA_PATH, FILE_NAME), 46 | sep = ',', 47 | header = TRUE, 48 | as.is = TRUE) 49 | df <- subset(df, select = c(clpid, sex, bdate, aoa, 50 | n_enc, feat, score_sc, score)) 51 | # Add current age column to dataframe 52 | df$cage <- age_calc(as.Date(df$bdate, "%d/%m/%Y"), 53 | units = 'years') 54 | df <- df[order(df$clpid),] 55 | 56 | # Inspect confounders 57 | df_conf <- unique(data.frame(pid = apply(t(df$clpid), 2, 58 | function(x) strsplit(x, '-')[[1]][2]), 59 | cluster = as.factor(apply(t(df$clpid), 2, 60 | function(x) strsplit(x, '-')[[1]][1])), 61 | cage = df$cage, 62 | sex = df$sex, 63 | n_enc = df$n_enc), by = 'pid') 64 | # Add to confounder df the behr length for each subject 65 | lenbehr <- ddply(df, .(clpid), nrow)$V1 66 | df_conf$lenbehr <- lenbehr 67 | 68 | # Tests: 69 | # (1) age mean differences between clusters (pairwise t-test with Bonferroni correction); 70 | # (2) average number of encounters between clusters (pairwise t-test with Bonferroni correction); 71 | # (3) sex counts via chi-squared test with Bonferroni correction. 72 | 73 | print("AGE per cluster (M, SD):") 74 | tapply(df_conf$cage, df_conf$cluster, function(x) c(mean(x), sd(x))) 75 | pairwise.t.test(df_conf$cage, df_conf$cluster, p.adjust.method = 'bonferroni') 76 | 77 | print("N_ENCOUNTERS per cluster (M, SD):") 78 | tapply(df_conf$n_enc, df_conf$cluster, function(x) c(mean(x), sd(x))) 79 | pairwise.t.test(df_conf$n_enc, df_conf$cluster, p.adjust.method = 'bonferroni') 80 | 81 | print("SEX counts pairwise chi-square between clusters") 82 | tab <- table(df_conf$sex, df_conf$cluster) 83 | tab 84 | pairwise.chisq.test(df_conf$sex, df_conf$cluster, 85 | p.adjust.method = 'bonferroni') 86 | 87 | print("AGE OF ASSESSMENT per cluster (M, SD):") 88 | df$cluster <- as.factor(apply(t(df$clpid), 2, 89 | function(x) strsplit(x, '-')[[1]][1])) 90 | tapply(df$aoa, df$cluster, function(x) c(mean(x), sd(x))) 91 | pairwise.t.test(df$aoa, df$cluster, p.adjust.method = 'bonferroni') 92 | 93 | print("Length BEHR per cluster (M, SD):") 94 | tapply(df_conf$lenbehr, df_conf$cluster, function(x) c(mean(x), sd(x))) 95 | pairwise.t.test(df_conf$lenbehr, df_conf$cluster, p.adjust.method = 'bonferroni') 96 | 97 | # Summary statistics feature raw scores. 
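# (Optional: uncomment the block below to compute mean and SD of raw feature scores for each cluster-feature pair.)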
98 | # df$feat_cl <- paste(df$cluster, 99 | # df$feat, 100 | # sep = '-') 101 | # print("Summary statistics feature scores foe each cluster.") 102 | # tapply(df$score, 103 | # df$feat_cl, 104 | # function(x) c(mean(x), sd(x))) 105 | 106 | ################################################################## 107 | 108 | # Multiple pairwise comparisons between groups 109 | df_wide <- subset(df, select = c(clpid, feat, score_sc)) 110 | df_wide <- dcast(df_wide, clpid ~ feat, 111 | value.var = 'score_sc', 112 | drop = FALSE, fun.aggregate = mean) 113 | df_wide$pid <- apply(t(df_wide$clpid), 2, function(x) strsplit(x, '-')[[1]][2]) 114 | df_wide$cluster <- apply(t(df_wide$clpid), 2, function(x) strsplit(x, '-')[[1]][1]) 115 | df_wide <- subset(df_wide, select = - clpid) 116 | df_wide$cluster <- as.factor(df_wide$cluster) 117 | 118 | print("Percentage of missing data for each cluster") 119 | na_cl <- c() 120 | na_count <- c() 121 | for (cl in levels(df_wide$cluster)){ 122 | tmp <- df_wide[df_wide$cluster==cl, 1:(ncol(df_wide)-2)] 123 | ttab <- table(is.na(tmp))/(nrow(tmp)*ncol(tmp)) 124 | print(ttab) 125 | na_cl <- c(na_cl, rep(cl,nrow(tmp)*ncol(tmp))) 126 | na_count <- c(na_count, rep('notmiss', table(is.na(tmp))[1]), 127 | rep('miss', table(is.na(tmp))[2])) 128 | } 129 | na_cl <- as.factor(na_cl) 130 | na_count <- as.factor(na_count) 131 | pairwise.chisq.test(na_count, na_cl) 132 | 133 | # Run pairwise t-test or t-test for score comparisons. 134 | # for (n in names(df_wide)[1 : (ncol(df_wide) - 2)]) { 135 | # # Drop missing values. 136 | # df_tmp <- drop_na(subset(df_wide, select = c(n, 'cluster'))) 137 | # check_tab <- table(df_tmp$cluster) > 1 138 | # cat("\n", "Testing variable", n, "\n\n") 139 | # if (length(check_tab[check_tab == FALSE]) >= 1) { 140 | # idxs <- which(df_tmp$cluster == which(check_tab == FALSE) - 1) 141 | # if (length(idxs) > 0) { 142 | # df_tmp <- df_tmp[- which(df_tmp$cluster == which(check_tab == FALSE) - 1),]} 143 | # try(print(t.test(df_tmp[, 1] ~ df_tmp[, 2])))} else { 144 | # try(print(pairwise.t.test(df_tmp[, 1], df_tmp[, 2], 145 | # p.adjust.method = 'bonferroni'))) 146 | # #pt <- pairwise.t.test(df_tmp[, 1], df_tmp[, 2], 147 | # # p.adjust.method = 'bonferroni') 148 | # #print(pt)} 149 | # } 150 | # } 151 | 152 | # Feature distibution plot 153 | # pdf(file = file.path(DATA_PATH, PLOT_NAME)) 154 | ggpairs(subset(df_wide, select = c(grep('ados|psi', names(df_wide)), cluster)), label.pos = 3) 155 | # ggpairs(subset(df_wide, select = c(grep('griffiths', names(df_wide)), cluster)), 156 | # columnLabels = c("gmds::GQ", "gmds::q_A", 157 | # "gmds::q_B", "gmds::q_C", 158 | # "gmds::q_D", "gmds::q_E", 159 | # "gmds::q_F", "cluster")) 160 | # ggpairs(subset(df_wide, select = c(grep('wechsler', names(df_wide)), cluster))) 161 | # ggpairs(subset(df_wide, select = c(grep('vineland', names(df_wide)), cluster))) 162 | # ggpairs(subset(df_wide, select = c(grep('srs', names(df_wide)), cluster))) 163 | # ggpairs(subset(df_wide, select = c(grep('psi', names(df_wide)), cluster))) 164 | # ggpairs(subset(df_wide, select = c(grep('leiter', names(df_wide)), cluster))) 165 | # dev.off() 166 | 167 | 168 | 169 | -------------------------------------------------------------------------------- /dataset.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import create_engine, MetaData 2 | import datetime 3 | from datetime import datetime 4 | import csv 5 | import os 6 | import utils as ut 7 | import pandas as pd 8 | import logging 9 | from 
dataclasses import dataclass 10 | from basic_statistics import DataStatistics 11 | import numpy as np 12 | 13 | 14 | # Dataclasses to store patient demographics, 15 | # and patient info on encounters. 16 | @dataclass 17 | class Pinfo: 18 | sex: str 19 | dob: str 20 | n_enc: int = 0 21 | 22 | 23 | @dataclass 24 | class Penc: 25 | sex: str 26 | dob: str 27 | doa_instrument: list() 28 | 29 | def count_enc(self): 30 | yr_enc = list(map(lambda x: x[0].split('/')[2], 31 | self.doa_instrument)) 32 | return len(set(yr_enc)) 33 | 34 | 35 | # Configure the logging, logging to file. 36 | logging.basicConfig(level=logging.INFO, 37 | filename='./logs/pipeline.log', 38 | filemode='w') 39 | 40 | # Create new directory or point to an existing one to store data. 41 | data_dir = 'odf-data' 42 | data_path = os.path.join(ut.DATA_FOLDER_PATH, data_dir) 43 | os.makedirs(data_path, exist_ok=True) 44 | runtime_date = datetime.now().strftime('%Y-%m-%d-%H-%M-%S') 45 | logging.info(f'{runtime_date} created ../data/odf-data folder for returned objects') 46 | 47 | 48 | def access_db(): 49 | """ Access the database and dump tables. 50 | 51 | Returns 52 | ------- 53 | 54 | dictionary 55 | {key=table_name, value=pandas dataframe} 56 | """ 57 | # connect to the database 58 | engine = create_engine(ut.SQLALCHEMY_CONN_STRING) 59 | conn = engine.connect() 60 | logging.info('Connection to DB established') 61 | # inspect the tables in the database 62 | metadata = MetaData(engine, reflect=True) 63 | 64 | logging.info('Dumping all tables') 65 | df_tables = {} 66 | for table_name in metadata.tables: 67 | # ADDED THIS TO STOP IMPORTING NEW DATA FOR NOW 68 | df_tables[table_name] = pd.read_sql_table(table_name, 69 | con=conn, 70 | parse_dates=['date_birth', 'date_ass'], 71 | index_col='id').query( 72 | 'form_info <= datetime(2019, 10, 5)').drop('form_info', axis=1) 73 | return df_tables 74 | 75 | 76 | def data_wrangling(tables_dict): 77 | """ Drop excluded subjects and tables 78 | 79 | Parameters 80 | ---------- 81 | tables_dict: dictionary 82 | dictionary with dumped tables from DB 83 | 84 | Returns 85 | ------- 86 | dictionary 87 | reduced dictionary without excluded tables and subjects (rows) 88 | """ 89 | adult_subj = tables_dict['ados-2modulo4'].id_subj.unique() 90 | # added lab1680 on the 1st of October 2019, new entry with only WISC-IV 91 | # added also lab1353/lab1152, only psi-sf/srs available 92 | adult_subj = np.append(adult_subj, ['lab1680', 'lab1353', 'lab1152']) 93 | logging.info(f'Dropped {len(adult_subj)} subjects') 94 | 95 | # names of the tables to drop from the dictionary 96 | tb_drop = ['ados-2modulo4', 97 | 'emotionalavailabilityscales'] 98 | 99 | tb_dict_rid = {} 100 | for tb_name, df in tables_dict.items(): 101 | if tb_name not in tb_drop: 102 | row_drop = ~(df['id_subj'].isin(adult_subj)) 103 | tb_dict_rid[tb_name] = df.loc[row_drop] 104 | 105 | return tb_dict_rid 106 | 107 | 108 | def cohort_info(tables_dict): 109 | """Store instances of Pinfo and Penc classes in dictionaries 110 | 111 | Parameters 112 | ---------- 113 | tables_dict: dictionary 114 | dictionary with data tables 115 | 116 | Returns 117 | ------- 118 | dictionary 119 | {keys=pid, values=Pinfo instances} 120 | dictionary 121 | {keys=pid, values=Penc instances} 122 | """ 123 | demog_dict = {} 124 | enc_dict = {} 125 | for tn, df in tables_dict.items(): 126 | for _, row in df.iterrows(): 127 | ass_date = __correct_datetime(row.date_ass) 128 | birth_date = __correct_datetime(row.date_birth) 129 | if row.id_subj in enc_dict: 130 | 
enc_dict[row.id_subj].doa_instrument.append((ass_date, tn)) 131 | else: 132 | enc_dict[row.id_subj] = Penc(sex=row.sex, 133 | dob=birth_date, 134 | doa_instrument=[(ass_date, 135 | tn)]) 136 | demog_dict[row.id_subj] = Pinfo(sex=row.sex, 137 | dob=birth_date) 138 | for pid in demog_dict: 139 | demog_dict[pid].n_enc = enc_dict[pid].count_enc() 140 | # dump info to csv files 141 | _dump_info(demog_dict, enc_dict) 142 | # save log with statistics 143 | logging.info('\nComputing basics statistics (DataStatistics module)\n') 144 | DataStatistics().compute(data_dir) 145 | return demog_dict, enc_dict 146 | 147 | 148 | """ 149 | Functions 150 | """ 151 | 152 | 153 | def age_ass(dob, doa): 154 | """ 155 | Parameters 156 | ---------- 157 | dob: str 158 | date of birth 159 | doa: str 160 | date of assessment 161 | 162 | Return 163 | ------ 164 | float 165 | age of assessment 166 | """ 167 | # dob = pd.Timestamp(year=int(dob.split('/')[2]), 168 | # month=int(dob.split('/')[1]), 169 | # day=int(dob.split('/')[0])) 170 | # doa = pd.Timestamp(year=int(doa.split('/')[2]), 171 | # month=int(doa.split('/')[1]), 172 | # day=int(doa.split('/')[0])) 173 | dob = pd.Timestamp(dob) 174 | doa = pd.Timestamp(doa) 175 | days_in_year = 365.2425 176 | aoa = (doa - dob).days / days_in_year 177 | return aoa 178 | 179 | 180 | def __correct_datetime(date_ts): 181 | """ 182 | Parameters 183 | ---------- 184 | date_ts: pandas Timestamp 185 | 186 | Returns 187 | ------- 188 | str 189 | strftime %d/%m/%Y 190 | """ 191 | # correct wrong dates 192 | today = datetime.today() 193 | try: 194 | if date_ts.year == today.year and date_ts.month >= today.month: 195 | corrected_date = pd.Timestamp(year=date_ts.year, 196 | month=date_ts.day, 197 | day=date_ts.month) 198 | else: 199 | corrected_date = date_ts 200 | 201 | return corrected_date.strftime("%d/%m/%Y") 202 | except AttributeError: 203 | return date_ts 204 | 205 | 206 | def _dump_info(demog_info, enc_info): 207 | """Save csv file with demographic and encounter info 208 | 209 | Parameters 210 | ---------- 211 | demog_info: dictionary 212 | {keys=pid, values=Pinfo instances} 213 | enc_info: dictionary 214 | {keys=pid, values=Penc instances} 215 | """ 216 | logging.info("Saving csv files on subject info and subject encounters") 217 | with open(os.path.join(ut.DATA_FOLDER_PATH, data_dir, 218 | 'person-encounters.csv'), 'w') as f: 219 | wr = csv.writer(f, delimiter=',', quoting=csv.QUOTE_MINIMAL) 220 | wr.writerow(['ID_SUBJ', 'SEX', 'DOB', 'DOA', 'AOA', 'INSTRUMENT']) 221 | for pid in sorted(enc_info.keys()): 222 | enc_info[pid].doa_instrument.sort(key=lambda x: (x[0].split('/')[2], 223 | x[0].split('/')[1], 224 | x[1])) 225 | for tup in enc_info[pid].doa_instrument: 226 | wr.writerow([pid, enc_info[pid].sex, 227 | enc_info[pid].dob, tup[0], 228 | age_ass(enc_info[pid].dob, tup[0]), 229 | tup[1]]) 230 | with open(os.path.join(ut.DATA_FOLDER_PATH, data_dir, 231 | 'person-demographics.csv'), 'w') as f: 232 | wr = csv.writer(f, delimiter=',', quoting=csv.QUOTE_MINIMAL) 233 | wr.writerow(['ID_SUBJ', 'SEX', 'DOB', 'N_ENC']) 234 | for pid in sorted(demog_info.keys()): 235 | wr.writerow([pid, demog_info[pid].sex, 236 | demog_info[pid].dob, 237 | demog_info[pid].n_enc]) 238 | -------------------------------------------------------------------------------- /pt_embedding.py: -------------------------------------------------------------------------------- 1 | from sklearn.feature_extraction.text import TfidfVectorizer 2 | from sklearn.decomposition import TruncatedSVD 3 | import glove 4 | 
import numpy as np 5 | import utils as ut 6 | import torch 7 | import torch.nn.functional as F 8 | import logging 9 | 10 | 11 | class Pembeddings: 12 | def __init__(self, behr, vocab): 13 | """ Range of possible embeddings to perform on behavioral data 14 | TFIDF, GLOVE, WORD2VEC 15 | 16 | Parameters 17 | ---------- 18 | behr 19 | dictionary {pid: trm sequence} 20 | vocab 21 | dictionary, needed btm_to_idx 22 | """ 23 | self.behr = behr 24 | self.vocab = vocab 25 | 26 | def tfidf(self): 27 | """performs TFIDF 28 | 29 | Return 30 | ------ 31 | list 32 | pids list 33 | list 34 | svd matrix 35 | """ 36 | # create document list 37 | doc_list = [] 38 | for tupl_list in self.behr.values(): 39 | sentence = [] 40 | for tm_vect in tupl_list: 41 | sentence.extend(tm_vect[2:]) 42 | doc_list.append(' '.join(list(map(lambda x: str(x), sentence)))) 43 | pid_list = [pid for pid in self.behr] 44 | 45 | vectorizer = TfidfVectorizer(norm='l2') 46 | tfidf_mtx = vectorizer.fit_transform(doc_list) 47 | 48 | logging.info("Performing SVD on the TF-IDF matrix...") 49 | reducer = TruncatedSVD(n_components=ut.n_dim_tfidf, random_state=123) 50 | svd_mtx = reducer.fit_transform(tfidf_mtx) 51 | 52 | return pid_list, svd_mtx 53 | 54 | def word2vec_emb(self): 55 | """Skip-gram word2vec 56 | 57 | Returns 58 | ------- 59 | list 60 | pids list 61 | list 62 | matrix of patient embeddings 63 | numpy array: 64 | first layer weight matrix (vocab size, embedding dim) 65 | numpy array: 66 | second layer weight matrix (vocab size, embedding dim) 67 | 68 | """ 69 | corpus = self.__build_corpus() 70 | idx_pairs = self.__get_idx_pairs(corpus, window_size=10) 71 | 72 | torch.manual_seed(1234) 73 | W1 = torch.randn(ut.n_dim_w2v, len(self.vocab), 74 | dtype=torch.float32, 75 | requires_grad=True) 76 | W2 = torch.randn(len(self.vocab), ut.n_dim_w2v, 77 | dtype=torch.float32, 78 | requires_grad=True) 79 | 80 | for epoch in range(ut.n_epoch_w2v): 81 | loss_val = 0 82 | for data, target in idx_pairs: 83 | x = self.__get_input_layer(data).float() 84 | y_true = torch.from_numpy(np.array([target])).long() 85 | 86 | z1 = torch.matmul(W1, x) 87 | z2 = torch.matmul(W2, z1) 88 | 89 | log_softmax = F.log_softmax(z2, dim=0) 90 | 91 | loss = F.nll_loss(log_softmax.view(1, -1), y_true) 92 | loss_val += loss.item() 93 | loss.backward() 94 | w1 = W1.detach() 95 | w2 = W2.detach() 96 | w1 -= ut.learning_rate_w2v * W1.grad 97 | w2 -= ut.learning_rate_w2v * W2.grad 98 | 99 | W1.grad.zero_() 100 | W2.grad.zero_() 101 | 102 | if epoch % 10 == 0: 103 | logging.info(f'Loss at epoch {epoch}: {loss_val/len(idx_pairs)}') 104 | logging.info(f'Loss at epoch {epoch}: {loss_val/len(idx_pairs)}') 105 | 106 | p_emb = [] 107 | pid_list = [] 108 | for pid, term in corpus.items(): 109 | if len(term) != 0: 110 | pid_list.append(pid) 111 | p_emb.append(np.mean([W1[:, int(t)].tolist() for t in term], 112 | axis=0).tolist()) 113 | 114 | return pid_list, p_emb, w1.numpy(), w2.numpy() 115 | 116 | def glove_pemb(self): 117 | """Computes Glove embeddings from co-occurrence matrix 118 | and returns patient embeddings 119 | 120 | Return 121 | ------ 122 | list 123 | pids list 124 | list 125 | matrix of patient embeddings 126 | array 127 | word embeddings 128 | """ 129 | 130 | corpus = self.__build_corpus() 131 | coocc_dict = self.__build_cooccur(corpus, window_size=10) 132 | model = glove.Glove(coocc_dict, alpha=0.75, x_max=10.0, d=ut.n_dim_glove, seed=1234) 133 | logging.info("\nTraining Glove embeddings...") 134 | for epoch in range(ut.n_epoch_glove): 135 | err = 
model.train(batch_size=ut.batch_size_glove, step_size=ut.learning_rate_glove) 136 | if epoch % 10 == 0: 137 | logging.info("epoch %d, error %.3f" % (epoch, err)) 138 | logging.info("epoch %d, error %.3f" % (epoch, err)) 139 | 140 | wemb = model.W + model.ContextW # as suggested in Pennington et al. 141 | p_emb = [] 142 | pid_list = [] 143 | for pid, term in corpus.items(): 144 | if len(term) != 0: 145 | pid_list.append(pid) 146 | p_emb.append(np.mean([wemb[int(t)].tolist() for t in term], 147 | axis=0).tolist()) 148 | 149 | return pid_list, p_emb, wemb 150 | 151 | @staticmethod 152 | def __age_tf(age): 153 | """ convert age to time slot string 154 | 155 | Parameter 156 | --------- 157 | age 158 | float 159 | Return 160 | ------ 161 | str 162 | """ 163 | if 0 < age <= 2.5: 164 | return 'F1' 165 | elif 2.5 < age <= 6.0: 166 | return 'F2' 167 | elif 6.0 < age <= 13.0: 168 | return 'F3' 169 | elif 13.0 < age < 17.0: 170 | return 'F4' 171 | else: 172 | return 'F5' 173 | 174 | def __build_corpus(self): 175 | """random shuffle terms in time slots 176 | 177 | Return 178 | ------ 179 | dictionary 180 | {pid: term list set and shuffles wrt to time slots F1-F5} 181 | """ 182 | # set seed 183 | np.random.seed(0) # 1234 (3 ns subtypes); 47 (7 ns subtypes) 184 | # We structure behrs wrt timeframes to learn word embeddings. 185 | # Structure of bvect = [Penc, aoa, tokens]. 186 | behr_tf = {} 187 | for pid, bvect in self.behr.items(): 188 | for el in bvect: 189 | if pid not in behr_tf: 190 | behr_tf[pid] = {self.__age_tf(el[1]): list(map(lambda x: int(self.vocab[x]), 191 | el[2:]))} 192 | else: 193 | behr_tf[pid].setdefault(self.__age_tf(el[1]), 194 | list()).extend(list(map(lambda x: int(self.vocab[x]), 195 | el[2:]))) 196 | corpus = {} 197 | for pid, tf_dict in behr_tf.items(): 198 | for tf in sorted(tf_dict.keys()): 199 | np.random.shuffle(behr_tf[pid][tf]) 200 | corpus.setdefault(pid, 201 | list()).extend(behr_tf[pid][tf]) 202 | return corpus 203 | 204 | @staticmethod 205 | def __get_idx_pairs(corpus, window_size): 206 | """Creates the center-context vectors for Word2vec predictions 207 | 208 | Parameters 209 | ---------- 210 | corpus: dictionary 211 | {pid: behr} 212 | window_size: int 213 | size of the context 214 | Returns 215 | ------- 216 | numpy array 217 | """ 218 | idx_pairs = [] 219 | # for each sentence 220 | for sentence in corpus.values(): 221 | # for each word, treated as center word 222 | for center_word_pos in range(len(sentence)): 223 | # for each window position 224 | for w in range(-window_size, window_size + 1): 225 | context_word_pos = center_word_pos + w 226 | # make sure not jump out sentence 227 | if context_word_pos < 0 or context_word_pos >= len(sentence) or center_word_pos == context_word_pos: 228 | continue 229 | context_word_idx = sentence[context_word_pos] 230 | idx_pairs.append((sentence[center_word_pos], context_word_idx)) 231 | 232 | return np.array(idx_pairs) 233 | 234 | def __get_input_layer(self, word_idx): 235 | """Transforms a token into a one-hot encoded representation 236 | 237 | Parameters 238 | ---------- 239 | word_idx: int 240 | word token 241 | Returns 242 | ------- 243 | torch tensor 244 | """ 245 | x = torch.zeros(len(self.vocab), dtype=torch.float32) 246 | x[word_idx] = 1.0 247 | return x 248 | 249 | def __build_cooccur(self, corpus, window_size=10): 250 | """Build a word co-occurrence dictionary for the given corpus. 
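Only the left context window of each center word is scanned; each pair is counted symmetrically and weighted by the inverse of the distance between the two words, as in Pennington et al. (2014).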
251 | 252 | Parameters 253 | ---------- 254 | corpus 255 | behr dictionary as returned by __build_corpus 256 | window_size 257 | int, size of the context window 258 | 259 | Return 260 | ------ 261 | dictionary 262 | {i_main: {i_context: cooccurrence}} 263 | see Pennington et al., (2014). 264 | """ 265 | 266 | # Collect cooccurrences internally as a sparse matrix for passable 267 | # indexing speed; we'll convert into a list later 268 | cooccurrences = {k: {} for k in self.vocab.values()} 269 | 270 | for pid, sentence in corpus.items(): 271 | 272 | for center_i, center_id in enumerate(sentence): 273 | # Collect all word IDs in left window of center word 274 | context_ids = sentence[max(0, center_i - window_size): center_i] 275 | contexts_len = len(context_ids) 276 | 277 | for left_i, left_id in enumerate(context_ids): 278 | # Distance from center word 279 | distance = contexts_len - left_i 280 | 281 | # Weight by inverse of distance between words 282 | increment = 1.0 / float(distance) 283 | # Build co-occurrence matrix symmetrically (pretend we 284 | # are calculating right contexts as well) 285 | if left_id in cooccurrences[center_id]: 286 | cooccurrences[center_id][left_id] += increment 287 | cooccurrences[left_id][center_id] += increment 288 | else: 289 | cooccurrences[center_id][left_id] = increment 290 | cooccurrences[left_id][center_id] = increment 291 | return cooccurrences 292 | -------------------------------------------------------------------------------- /features.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import os 3 | import re 4 | import pandas as pd 5 | from dataset import Penc, age_ass 6 | from datamap import levels_datamap 7 | from sklearn.preprocessing import StandardScaler 8 | import logging 9 | import utils as ut 10 | import seaborn as sns 11 | import matplotlib.pyplot as plt 12 | import numpy as np 13 | 14 | 15 | # Configure the logging, logging to file. 16 | # logging.basicConfig(level=logging.INFO) 17 | 18 | 19 | class DataFeatures: 20 | """ Each instance is initialized with the desired level 21 | and the dictionary with the instrument tables, as dataframes, 22 | from the database. A dataframe stores the datamap 23 | for feature selection. 24 | """ 25 | 26 | def __init__(self, level, df_dict): 27 | self.level = level 28 | self.df_dict = df_dict 29 | dm_df = levels_datamap(df_dict) # class variable 30 | lev_dict = {} 31 | for ins, df in df_dict.items(): 32 | lev_dict[ins] = df[df.columns[pd.Series(dm_df.loc[ins, 33 | level], 34 | dtype='bool')]] 35 | self.lev_dict = lev_dict 36 | 37 | def create_level_tokens(self): 38 | """Transforms instrument values into words joining instrument name, 39 | scale/subscale and score. Returns a dictionary of token dataframes 40 | per instrument according to level. It also returns 41 | the correspondent vocabulary of terms. 42 | 43 | Returns 44 | ------- 45 | dictionary 46 | {key: instrument, value: list of token lists} 47 | dictionary 48 | {key: word, value: int} 49 | """ 50 | logging.info(f"Building token dataframes and vocabulary for level {self.level}.") 51 | 52 | # Create token strings to populate behr dictionary and vocabulary 53 | behr_tkns = {} 54 | lev_vocab = set() 55 | for ins, df in self.lev_dict.items(): 56 | # for _, row in df.iterrows(): 57 | # # The first two positions of each vector of tokens store a 58 | # # Penc dataclass and the assessment age. 
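# (Legacy version kept commented out for reference; the active loop below reads birth and assessment dates directly from the dataframe rows.)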
59 | # token = [Penc(sex=row.sex, 60 | # dob=penc[row.id_subj].dob, 61 | # doa_instrument=[(correct_datetime(row.date_ass), ins)]), 62 | # age_ass(penc[row.id_subj].dob, 63 | # correct_datetime(row.date_ass))] 64 | for _, row in df.iterrows(): 65 | # The first two positions of each vector of tokens store a 66 | # Penc dataclass and the assessment age. 67 | token = [Penc(sex=row.sex, 68 | dob=row.date_birth.strftime("%d/%m/%Y"), 69 | doa_instrument=[(row.date_ass.strftime("%d/%m/%Y"), 70 | ins)]), 71 | age_ass(row.date_birth, row.date_ass)] 72 | for c in df.columns[4:]: 73 | try: 74 | if row[c] != '' and pd.notna(row[c]): 75 | sig = self.__create_token(row, ins, c) 76 | token.append('::'.join([sig, str(int(row[c]))])) 77 | lev_vocab.update(['::'.join([sig, str(int(row[c]))])]) 78 | else: 79 | pass 80 | except ValueError: 81 | pass 82 | behr_tkns.setdefault(row['id_subj'], list()).append(token) 83 | bt_to_idx = {trm: idx for idx, trm in enumerate(sorted(list(lev_vocab)))} 84 | idx_to_bt = {idx: trm for idx, trm in enumerate(sorted(list(lev_vocab)))} 85 | behr = {} 86 | for p_id, vect in behr_tkns.items(): 87 | vect.sort(key=lambda x: x[1]) 88 | for v in vect: 89 | behr.setdefault(p_id, list()).append(v) 90 | logging.info(f'Vocabulary size:{len(bt_to_idx)}') 91 | self.__save_vocab_behr(behr, bt_to_idx) 92 | 93 | return behr, (bt_to_idx, idx_to_bt) 94 | 95 | def create_level_features(self, missing_data_plot=False): 96 | """ If level is not 4 it returns an Error. For level 4 it returns 97 | a dataframe with patient ids as index and time-ordered features as columns. 98 | Missing values are NaN. Dataframe and vocabulary are saved to csv file. 99 | 100 | Returns 101 | ------- 102 | dataframe 103 | Table with instrument scores at level 4 (at different times F1-F5) 104 | per subject. 105 | dataframe Scaled feature set with mean imputed missing values. 
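The raw feature table is also dumped to ./data/level-4/feature_data.csv, and an optional heatmap of missing-value percentages can be saved by passing missing_data_plot=True.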
106 | """ 107 | if self.level != 4: 108 | logging.error("create_level_features() is only available for level 4.") 109 | raise ValueError("create_level_features() attribute is only available for level 4.") 110 | else: 111 | # Create token strings as features 112 | feat_set = set() 113 | feat_tkns = {} 114 | for ins, df in self.lev_dict.items(): 115 | for _, row in df.iterrows(): 116 | for c in df.columns[4:]: 117 | try: 118 | # if row[c] != '' and pd.notna(row[c]): 119 | # sig = self.__create_token(row, ins, c) 120 | # feat_tkns.setdefault(row['id_subj'], list()).append( 121 | # '::'.join([self.__aoa_to_tf(age_ass(penc[row.id_subj].dob, 122 | # correct_datetime(row.date_ass))), 123 | # sig, 124 | # str(int(row[c]))])) 125 | # feat_set.update(['::'.join([self.__aoa_to_tf(age_ass(penc[row.id_subj].dob, 126 | # correct_datetime(row.date_ass))), 127 | # sig])]) 128 | if row[c] != '' and pd.notna(row[c]): 129 | sig = self.__create_token(row, ins, c) 130 | feat_tkns.setdefault(row['id_subj'], list()).append( 131 | '::'.join([self.__aoa_to_tf(age_ass(row.date_birth, 132 | row.date_ass)), 133 | sig, 134 | str(int(row[c]))])) 135 | feat_set.update(['::'.join([self.__aoa_to_tf(age_ass(row.date_birth, 136 | row.date_ass)), 137 | sig])]) 138 | else: 139 | pass 140 | except ValueError: 141 | pass 142 | feat_df = pd.DataFrame(columns=sorted(list(feat_set)), 143 | index=sorted(list(feat_tkns.keys()))) 144 | for p_id, vect in feat_tkns.items(): 145 | for tkn_val in vect: 146 | tkn = tkn_val.split('::') 147 | feat_df.loc[p_id, ['::'.join(tkn[:-1])]] = int(tkn[-1]) 148 | feat_df.to_csv('./data/level-4/feature_data.csv') # dump dataframe 149 | 150 | scaler = StandardScaler() 151 | feat_df_scaled = feat_df.fillna(feat_df.mean(), inplace=False) 152 | feat_df_scaled = pd.DataFrame(scaler.fit_transform(feat_df_scaled), 153 | columns=feat_df.columns, 154 | index=feat_df.index) 155 | missing_data = feat_df.isna().mean() * 100 156 | logging.info(f'Percentages of missing values for columns of feature data:\n{missing_data}') 157 | 158 | if missing_data_plot: 159 | rid_list = {} 160 | ins = set() 161 | for k, v in zip(missing_data.keys(), 162 | missing_data): 163 | ins.add(ut.shorten_names[k.split('::')[1]]) 164 | rid_list.setdefault(k.split('::')[0], 165 | dict()).setdefault(ut.shorten_names[k.split('::')[1]], 166 | list()).append(v) 167 | df_dict = {} 168 | ins = list(ins) 169 | for i in sorted(ins): 170 | df_dict[i] = [] 171 | for t in rid_list.keys(): 172 | try: 173 | df_dict[i].append(np.mean(rid_list[t][i])) 174 | except KeyError: 175 | df_dict[i].append(np.nan) 176 | df = pd.DataFrame(df_dict, index=sorted(list(rid_list.keys()))) 177 | logging.info(f'Mean percentages over items of missing values for feature data\n{df}') 178 | mask = df.isnull() 179 | fig, ax = plt.subplots(figsize=(6, 4)) 180 | sns.heatmap(df, mask=mask, cmap='GnBu') 181 | ax.xaxis.tick_top() 182 | ax.xaxis.set_label_position('top') 183 | ax.tick_params(length=0) 184 | plt.xticks(rotation=90) 185 | plt.savefig('./data/level-4/missing_feature_data.eps', format='eps', 186 | dpi=200, bbox_inches='tight') 187 | 188 | return feat_df, feat_df_scaled 189 | 190 | def __save_vocab_behr(self, behr, bt_to_idx): 191 | """Saves behavioral EHRs and vocabulary of terms at the level specified 192 | to .csv file in a new data folder according to level. 
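Two files are written under ./data/level-<level>/: cohort-behr.csv, with one row per assessment listing the subject id, age of assessment and the integer indices of its behavioral terms, and bt_to_idx.csv with the term-to-index vocabulary.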
193 | 194 | Parameters 195 | ---------- 196 | behr: dictionary 197 | {key:pid, value:list(list of terms for each assessment)} 198 | bt_to_idx: dictionary 199 | Dictionary with behavioral terms as keys and idx as values 200 | """ 201 | os.makedirs('./data/level-{0}'.format(self.level), 202 | exist_ok=True) 203 | with open(os.path.join('./data/level-{0}'.format(self.level), 204 | 'cohort-behr.csv'), 'w') as f: 205 | wr = csv.writer(f) 206 | wr.writerow(['ID_SUBJ', 'AOA', 'TERM']) 207 | for pid, seq in behr.items(): 208 | for s in seq: 209 | wr.writerow([pid, s[1]] + [bt_to_idx[s[idx]] 210 | for idx in range(2, len(s))]) 211 | with open(os.path.join('./data/level-{0}'.format(self.level), 212 | 'bt_to_idx.csv'), 'w') as f: 213 | wr = csv.writer(f) 214 | wr.writerow(["TERM", "LABEL"]) 215 | for bt, idx in bt_to_idx.items(): 216 | wr.writerow([bt, idx]) 217 | 218 | @staticmethod 219 | def __create_token(row, ins, c): 220 | """Private custom-based function to modify and uniform dataset features. 221 | Must be modified when changing dataset. Returns token string and value. These 222 | objects must be joined for NLP behavioral embedding and kept separate for 223 | feature dataset. 224 | 225 | Parameters 226 | ---------- 227 | row: pandas Series 228 | Row corresponding to a patient assessment 229 | ins: str 230 | Instrument considered 231 | c: str 232 | Name of the instrument item considered 233 | 234 | Returns 235 | ------- 236 | str 237 | String of the form instrument::item 238 | """ 239 | if bool(re.match('ados', ins)): 240 | if bool(re.search("\.d1|\.d2|\.b1|d1|d2|b1|" 241 | "comparison_score|" 242 | "sa_tot|rrb_tot|sarrb_tot|" 243 | "\.sa_tot|\.rrb_tot", 244 | c)): 245 | if len(c.split('.')) > 1: 246 | token = '::'.join(['ados', 247 | c.split('.')[1]]) 248 | else: 249 | token = '::'.join(['ados', 250 | c]) 251 | else: 252 | if len(c.split('.')) > 1: 253 | token = '::'.join([ins, 254 | c.split('.')[1]]) 255 | else: 256 | token = '::'.join([ins, 257 | c]) 258 | elif bool(re.match('psi', ins)): 259 | token = '::'.join([ins, 260 | row['parent'].lower(), 261 | c]) 262 | elif bool(re.match('vinel|srs', ins)): 263 | token = '::'.join([ins, 'caretaker', c]) 264 | elif bool(re.match('wa|wi|wp', ins)): 265 | token = '::'.join(['wechsler', c]) 266 | else: 267 | token = '::'.join([ins, 268 | c]) 269 | return token 270 | 271 | @staticmethod 272 | def __aoa_to_tf(aoa): 273 | """Returns the time period from the age of assessment 274 | 275 | Parameters 276 | ---------- 277 | aoa: float 278 | age of assessment 279 | 280 | Return 281 | ------ 282 | str 283 | time period string (F1-F5) 284 | """ 285 | 286 | if 0 < float(aoa) <= 2.5: 287 | return 'F1' 288 | elif 2.5 < float(aoa) <= 6.0: 289 | return 'F2' 290 | elif 6.0 < float(aoa) <= 13.0: 291 | return 'F3' 292 | elif 13.0 < float(aoa) < 17.0: 293 | return 'F4' 294 | else: 295 | return 'F5' 296 | -------------------------------------------------------------------------------- /clustering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.cluster import AgglomerativeClustering, KMeans 3 | from sklearn.metrics import silhouette_score, adjusted_mutual_info_score, fowlkes_mallows_score 4 | from sklearn.preprocessing import MinMaxScaler 5 | from scipy.cluster.hierarchy import linkage 6 | import logging 7 | 8 | 9 | class HclustEmbeddings: 10 | """ Performs hierarchical clustering on patient embeddings""" 11 | 12 | def __init__(self, min_cl, max_cl, linkage, affinity): 13 | self.min_cl = min_cl 14 
| self.max_cl = max_cl 15 | self.linkage = linkage 16 | self.affinity = affinity 17 | 18 | def find_best_nclu(self, 19 | mtx, 20 | n_iter, 21 | subsampl): 22 | """Iterate clustering of subsets and find best number of clusters 23 | 24 | Parameters 25 | ---------- 26 | mtx: list 27 | List of embeddings as returned by pt_embedding module 28 | n_iter: int 29 | number of iteration to select the best number of clusters 30 | subsampl: float 31 | Fraction of data to consider for clustering 32 | 33 | Returns 34 | ------- 35 | int 36 | Best number of clusters 37 | """ 38 | n_cl_selected = [] 39 | for it in range(n_iter): 40 | idx = np.random.randint(0, len(mtx), int(len(mtx) * subsampl)) 41 | sub_data = [mtx[i] for i in idx] 42 | best_n_clu = self.elbow_method(sub_data) 43 | # for n_clu in range(self.min_cl, self.max_cl): 44 | # hclu = AgglomerativeClustering(n_clusters=n_clu, 45 | # linkage=self.linkage, 46 | # affinity=self.affinity) 47 | # lab_cl = hclu.fit_predict(sub_data) 48 | # tmp_silh = silhouette_score(sub_data, lab_cl) 49 | # if tmp_silh > best_silh: 50 | # best_silh = tmp_silh 51 | # best_n_clu = n_clu 52 | # print("(*) Iter {0} -- N clusters {1}".format(it, best_n_clu)) 53 | n_cl_selected.append(best_n_clu) 54 | unique, counts = np.unique(n_cl_selected, return_counts=True) 55 | logging.info("Counts of N clusters:") 56 | logging.info("N clusters -- Count") 57 | for un, ct in dict(zip(unique, counts)).items(): 58 | logging.info(un, ct) 59 | best_n_clu = unique[np.argmax(counts)] 60 | logging.info("\nBest N cluster:{0}".format(best_n_clu)) 61 | return best_n_clu 62 | 63 | def elbow_method(self, 64 | mtx): 65 | """Select the best number of clusters via elbow method. 66 | 67 | Parameters 68 | ---------- 69 | mtx list: 70 | List of embeddings as returned by pt_embedding module 71 | 72 | Returns 73 | ------- 74 | int: 75 | Best number of clusters 76 | """ 77 | # Scale data. 
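# The elbow is read off the hierarchy itself: take the merge distances of the last
# `max_cl` agglomeration steps from the linkage matrix, compute their second
# difference (acceleration), and pick the step where it peaks; reversing the array
# makes index 0 correspond to 2 clusters, hence the `+ 2` offset below.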
78 | scaler = MinMaxScaler() 79 | mtx = scaler.fit_transform(mtx) 80 | 81 | Z = linkage(mtx, self.linkage) 82 | last = Z[-self.max_cl:, 2] 83 | 84 | acceleration = np.diff(last, 2) # 2nd derivative of the distances 85 | acceleration_rev = acceleration[::-1] 86 | 87 | k = acceleration_rev.argmax() + 2 # if idx 0 is the max of this we want 2 clusters 88 | 89 | return k 90 | 91 | @staticmethod 92 | def fit(mtx, pid_list, n_clu): 93 | """ Perform HC on patient embeddings 94 | 95 | Parameters 96 | ---------- 97 | mtx: list 98 | Embeddings list 99 | pid_list: list 100 | List of subjects id ordered as in mtx 101 | n_clu: int 102 | Number of clusters 103 | 104 | Returns 105 | ------- 106 | dictionary 107 | Dictionary with cluster label per subject id 108 | {pid: cl} 109 | """ 110 | # Scale data matrix 111 | scaler = MinMaxScaler() 112 | mtx = scaler.fit_transform(mtx) 113 | 114 | hclu = AgglomerativeClustering(n_clusters=n_clu) 115 | lab_cl = hclu.fit_predict(mtx) 116 | silh = silhouette_score(mtx, lab_cl) 117 | logging.info('(*) Number of clusters %d -- Silhouette score %.2f' % (n_clu, silh)) 118 | 119 | num_count = np.unique(lab_cl, return_counts=True)[1] 120 | for idx, nc in enumerate(num_count): 121 | logging.info("Cluster {0} -- Numerosity {1}".format(idx, nc)) 122 | 123 | return {pid: cl for pid, cl in zip(pid_list, lab_cl)} 124 | 125 | 126 | class HclustFeatures: 127 | """ Performs Hierarchical clustering on feature data""" 128 | 129 | def __init__(self, min_cl, max_cl, linkage, affinity): 130 | self.min_cl = min_cl 131 | self.max_cl = max_cl 132 | self.linkage = linkage 133 | self.affinity = affinity 134 | 135 | def find_best_nclu(self, 136 | df_scaled, 137 | n_iter, 138 | subsampl): 139 | """ Find the best number of clusters iterating over subset of data 140 | 141 | Parameters 142 | ---------- 143 | df_scaled: dataframe 144 | Scaled feature data with patient ids as index 145 | n_iter: int 146 | Number of iterations to perform 147 | subsampl: float 148 | Fraction of data to consider in the subset 149 | at each iteration 150 | 151 | Returns 152 | ------- 153 | int 154 | best number of clusters 155 | """ 156 | n_cl_selected = [] 157 | for it in range(n_iter): 158 | idx = np.random.randint(0, len(df_scaled), int(len(df_scaled) * subsampl)) 159 | sub_df = df_scaled.iloc[[i for i in idx], :] 160 | best_n_clu = self.elbow_method(sub_df) 161 | # for n_clu in range(self.min_cl, self.max_cl): 162 | # hclu = AgglomerativeClustering(n_clusters=n_clu) 163 | # lab_cl = hclu.fit_predict(sub_df) 164 | # tmp_silh = silhouette_score(sub_df, lab_cl) 165 | # if tmp_silh > best_silh: 166 | # best_silh = tmp_silh 167 | # best_n_clu = n_clu 168 | # print("(*) Iter {0} -- N clusters {1}".format(it, 169 | # best_n_clu)) 170 | n_cl_selected.append(best_n_clu) 171 | unique, counts = np.unique(n_cl_selected, return_counts=True) 172 | logging.info("Counts of N clusters:") 173 | logging.info("N clusters -- Count") 174 | for un, ct in dict(zip(unique, counts)).items(): 175 | logging.info(un, ct) 176 | best_n_clu = unique[np.argmax(counts)] 177 | logging.info("\nBest N cluster:{0}".format(best_n_clu)) 178 | return best_n_clu 179 | 180 | def elbow_method(self, 181 | df_scaled): 182 | """Select the best number of clusters via elbow method. 
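The elbow is located from the second difference of the last `max_cl` merge distances of a hierarchical linkage built on the feature matrix.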
183 | 184 | Parameters 185 | ---------- 186 | df_scaled dataframe: 187 | Scaled feature data with patient ids as index 188 | 189 | Returns 190 | ------- 191 | int: 192 | Best number of clusters 193 | """ 194 | 195 | data = df_scaled.to_numpy() 196 | 197 | Z = linkage(data, self.linkage) 198 | last = Z[-self.max_cl:, 2] 199 | 200 | acceleration = np.diff(last, 2) # 2nd derivative of the distances 201 | acceleration_rev = acceleration[::-1] 202 | k = acceleration_rev.argmax() + 2 # if idx 0 is the max of this we want 2 clusters 203 | 204 | return k 205 | 206 | @staticmethod 207 | def fit(df_scaled, n_clu): 208 | """Fit HC on patient feature data 209 | 210 | Parameters 211 | ---------- 212 | df_scaled: dataframe 213 | Dataframe of scaled feature data 214 | n_clu: int 215 | Number of clusters 216 | Returns 217 | ------- 218 | dictionary 219 | Dictionary of patient ids and correspondent 220 | clusters {pid: cl} 221 | """ 222 | hclu = AgglomerativeClustering(n_clusters=n_clu) 223 | lab_cl = hclu.fit_predict(df_scaled) 224 | silh = silhouette_score(df_scaled, lab_cl) 225 | logging.info('(*) Number of clusters %d -- Silhouette score %.2f' % (n_clu, silh)) 226 | 227 | num_count = np.unique(lab_cl, return_counts=True)[1] 228 | for idx, nc in enumerate(num_count): 229 | logging.info("Cluster {0} -- Numerosity {1}".format(idx, nc)) 230 | 231 | return {pid: cl for pid, cl in zip(df_scaled.index, lab_cl)} 232 | 233 | 234 | class KMeansEmbeddings: 235 | """ Performs KMeans on patient embeddings""" 236 | 237 | def __init__(self, min_cl, max_cl): 238 | self.min_cl = min_cl 239 | self.max_cl = max_cl 240 | 241 | def find_best_nclu(self, 242 | mtx, 243 | n_iter, 244 | subsampl): 245 | """Iterate clustering of subsets anf find best number of clusters 246 | 247 | Parameters 248 | ---------- 249 | mtx: list 250 | List of embeddings as returned by pt_embedding module 251 | n_iter: int 252 | number of iteration to select the best number of clusters 253 | subsampl: float 254 | Fraction of data to consider for clustering 255 | 256 | Returns 257 | ------- 258 | int 259 | Best number of clusters 260 | """ 261 | n_cl_selected = [] 262 | for it in range(n_iter): 263 | idx = np.random.randint(0, len(mtx), int(len(mtx) * subsampl)) 264 | sub_data = [mtx[i] for i in idx] 265 | best_n_clu = self.elbow_method(sub_data) 266 | # for n_clu in range(self.min_cl, self.max_cl): 267 | # hclu = AgglomerativeClustering(n_clusters=n_clu, 268 | # linkage=self.linkage, 269 | # affinity=self.affinity) 270 | # lab_cl = hclu.fit_predict(sub_data) 271 | # tmp_silh = silhouette_score(sub_data, lab_cl) 272 | # if tmp_silh > best_silh: 273 | # best_silh = tmp_silh 274 | # best_n_clu = n_clu 275 | # print("(*) Iter {0} -- N clusters {1}".format(it, best_n_clu)) 276 | n_cl_selected.append(best_n_clu) 277 | unique, counts = np.unique(n_cl_selected, return_counts=True) 278 | logging.info("Counts of N clusters:") 279 | logging.info("N clusters -- Count") 280 | for un, ct in dict(zip(unique, counts)).items(): 281 | logging.info(un, ct) 282 | best_n_clu = unique[np.argmax(counts)] 283 | logging.info("\nBest N cluster:{0}".format(best_n_clu)) 284 | return best_n_clu 285 | 286 | def elbow_method(self, 287 | mtx): 288 | """Select the best number of clusters via elbow method. 289 | 290 | Parameters 291 | ---------- 292 | mtx list: 293 | List of embeddings as returned by pt_embedding module 294 | 295 | Returns 296 | ------- 297 | int: 298 | Best number of clusters 299 | """ 300 | # Scale data. 
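# Unlike the hierarchical variants, the elbow here is based on k-means inertia:
# KMeans is fit for k = 1 .. max_cl - 1, the within-cluster sum of squares is
# recorded for each k, and the k at which the second difference of the inertia
# curve peaks (plus the offset of 2) is returned.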
301 | scaler = MinMaxScaler() 302 | mtx = scaler.fit_transform(mtx) 303 | 304 | inertia = [] # Sum of square differences of samples from cluster centers 305 | K = range(1, self.max_cl) 306 | 307 | for k in K: 308 | kmean_model = KMeans(n_clusters=k).fit(mtx) 309 | inertia.append(kmean_model.inertia_) 310 | 311 | acceleration = np.diff(inertia, 2) 312 | 313 | k = acceleration.argmax() + 2 # If idx 0 is the max of this we want 2 clusters 314 | 315 | return k 316 | 317 | @staticmethod 318 | def fit(mtx, pid_list, n_clu): 319 | """ Perform HC on patient embeddings 320 | 321 | Parameters 322 | ---------- 323 | mtx: list 324 | Embeddings list 325 | pid_list: list 326 | List of subjects id ordered as in mtx 327 | n_clu: int 328 | Number of clusters 329 | 330 | Returns 331 | ------- 332 | dictionary 333 | Dictionary with cluster label per subject id 334 | {pid: cl} 335 | """ 336 | # Scale data matrix 337 | scaler = MinMaxScaler() 338 | mtx = scaler.fit_transform(mtx) 339 | 340 | kmclu = KMeans(n_clusters=n_clu) 341 | lab_cl = kmclu.fit_predict(mtx) 342 | silh = silhouette_score(mtx, lab_cl) 343 | logging.info('(*) Number of clusters %d -- Silhouette score %.2f' % (n_clu, silh)) 344 | 345 | num_count = np.unique(lab_cl, return_counts=True)[1] 346 | for idx, nc in enumerate(num_count): 347 | logging.info("Cluster {0} -- Numerosity {1}".format(idx, nc)) 348 | 349 | return {pid: cl for pid, cl in zip(pid_list, lab_cl)} 350 | 351 | 352 | class KMeansFeatures: 353 | """ Performs Hierarchical clustering on feature data""" 354 | 355 | def __init__(self, min_cl, max_cl): 356 | self.min_cl = min_cl 357 | self.max_cl = max_cl 358 | 359 | def find_best_nclu(self, 360 | df_scaled, 361 | n_iter, 362 | subsampl): 363 | """ Find the best number of clusters iterating over subset of data 364 | 365 | Parameters 366 | ---------- 367 | df_scaled: dataframe 368 | Scaled feature data with patient ids as index 369 | n_iter: int 370 | Number of iterations to perform 371 | subsampl: float 372 | Fraction of data to consider in the subset 373 | at each iteration 374 | 375 | Returns 376 | ------- 377 | int 378 | best number of clusters 379 | """ 380 | n_cl_selected = [] 381 | for it in range(n_iter): 382 | idx = np.random.randint(0, len(df_scaled), int(len(df_scaled) * subsampl)) 383 | sub_df = df_scaled.iloc[[i for i in idx], :] 384 | best_n_clu = self.elbow_method(sub_df) 385 | # for n_clu in range(self.min_cl, self.max_cl): 386 | # hclu = AgglomerativeClustering(n_clusters=n_clu) 387 | # lab_cl = hclu.fit_predict(sub_df) 388 | # tmp_silh = silhouette_score(sub_df, lab_cl) 389 | # if tmp_silh > best_silh: 390 | # best_silh = tmp_silh 391 | # best_n_clu = n_clu 392 | # print("(*) Iter {0} -- N clusters {1}".format(it, 393 | # best_n_clu)) 394 | n_cl_selected.append(best_n_clu) 395 | unique, counts = np.unique(n_cl_selected, return_counts=True) 396 | logging.info("Counts of N clusters:") 397 | logging.info("N clusters -- Count") 398 | for un, ct in dict(zip(unique, counts)).items(): 399 | logging.info(un, ct) 400 | best_n_clu = unique[np.argmax(counts)] 401 | logging.info("\nBest N cluster:{0}".format(best_n_clu)) 402 | return best_n_clu 403 | 404 | def elbow_method(self, 405 | df_scaled): 406 | """Select the best number of clusters via elbow method. 
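KMeans is fit on the feature matrix for an increasing number of clusters and the elbow of the resulting inertia curve, located via its second difference, is returned.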
407 | 408 | Parameters 409 | ---------- 410 | df_scaled dataframe: 411 | Scaled feature data with patient ids as index 412 | 413 | Returns 414 | ------- 415 | int: 416 | Best number of clusters 417 | """ 418 | 419 | data = df_scaled.to_numpy() 420 | 421 | inertia = [] # Sum of square differences of samples from cluster centers 422 | K = range(1, self.max_cl) 423 | 424 | for k in K: 425 | kmean_model = KMeans(n_clusters=k).fit(data) 426 | inertia.append(kmean_model.inertia_) 427 | 428 | acceleration = np.diff(inertia, 2) 429 | 430 | k = acceleration.argmax() + 2 # If idx 0 is the max of this we want 2 clusters 431 | 432 | return k 433 | 434 | @staticmethod 435 | def fit(df_scaled, n_clu): 436 | """Fit HC on patient feature data 437 | 438 | Parameters 439 | ---------- 440 | df_scaled: dataframe 441 | Dataframe of scaled feature data 442 | n_clu: int 443 | Number of clusters 444 | Returns 445 | ------- 446 | dictionary 447 | Dictionary of patient ids and correspondent 448 | clusters {pid: cl} 449 | """ 450 | kmclu = KMeans(n_clusters=n_clu) 451 | lab_cl = kmclu.fit_predict(df_scaled) 452 | silh = silhouette_score(df_scaled, lab_cl) 453 | logging.info('(*) Number of clusters %d -- Silhouette score %.2f' % (n_clu, silh)) 454 | 455 | num_count = np.unique(lab_cl, return_counts=True)[1] 456 | for idx, nc in enumerate(num_count): 457 | logging.info("Cluster {0} -- Numerosity {1}".format(idx, nc)) 458 | 459 | return {pid: cl for pid, cl in zip(df_scaled.index, lab_cl)} 460 | 461 | 462 | def compare_clustering(cl1, cl2, method): 463 | """Compute cluster comparison score (compare favorite cluster to other clustering techniques), 464 | either Adjusted Mutual Information Score (AMI), or Fowlkes - Mallows Score (FM) 465 | 466 | Parameters 467 | ---------- 468 | cl1: list, array 469 | first clustering labels 470 | cl2: list, array 471 | second clustering labels 472 | method: str 473 | either 'AMI' or 'FM' 474 | Returns 475 | ------- 476 | float 477 | desired score 478 | """ 479 | if method == 'AMI': 480 | return adjusted_mutual_info_score(cl1, cl2, 481 | average_method='arithmetic') 482 | else: 483 | return fowlkes_mallows_score(cl1, cl2) 484 | -------------------------------------------------------------------------------- /behavioral_phenotyping_pipeline.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Behavioral Profile Stratification via Unsupervised learning" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "> Behavioral data embeddings for the stratification of individuals\n", 15 | "with neurodevelopmental conditions.\n", 16 | "\n", 17 | "> Designed for observational measurements of cognition and behavior of individuals with \n", 18 | "Autism Spectrum Conditions (ASCs).\n", 19 | "\n", 20 | "* `dataset.py`: Connects to the database and dump data\n", 21 | "* `features.py`: Returns vocabulary and dictionary of behavioral *EHRs* for each of the 4 possible depth levels. 
\n", 22 | "It also returns a dataset with quantitative scores for level 4 features\n", 23 | "* `pt_embedding.py`: Performs TFIDF for patient embeddings; Glove embeddings on words and average them out for \n", 24 | "subject embeddings; Word2vec embeddings on words, that are then averaged to output individual representations\n", 25 | "* `clustering.py`: Performs Hierarchical Clustering/k-means on embeddings, and quantitative 4th level features\n", 26 | "* `visualization.py`: Visualizes results (e.g. _scatterplot & dendrogram_)for sub-cluster visualization; \n", 27 | "_Heatmap_ for inspection of quantitative scores between sub-clusters" 28 | ] 29 | }, 30 | { 31 | "cell_type": "markdown", 32 | "metadata": {}, 33 | "source": [ 34 | "---\n", 35 | "*Run the cell below to enable logging display in notebook. Otherwise the log info are written to `pipeline.log` file in `./log` folder.*" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "from importlib import reload # Not needed in Python 2\n", 45 | "import logging\n", 46 | "reload(logging)\n", 47 | "logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', \n", 48 | " level=logging.INFO, datefmt='%I:%M:%S')" 49 | ] 50 | }, 51 | { 52 | "cell_type": "markdown", 53 | "metadata": {}, 54 | "source": [ 55 | "---" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "## Step 1: Data Loading\n", 63 | "\n", 64 | "> The `dataset` module access the database and dumps all the available tables. Information for Data Accessibility should be provided in the `utils.py` file. Then, subject (e.g., adults) and tables (e.g., ados-2 module 4) that need to be excluded are filtered out and dictionaries of subject demographics and encounter information are provided and saved to _.csv_ file. " 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "from dataset import access_db, data_wrangling, cohort_info" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "# # it returns a dictionary of pandas dataframes storing tables from the db\n", 83 | "tables = access_db()" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "# # reduced dictionary (it excludes tables and subjects that are not required, e.g., ados-2modulo4, eas)\n", 93 | "rid_tables = data_wrangling(tables)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "# # it returns dictionary of subjects info and encounters\n", 103 | "pinfo, penc = cohort_info(rid_tables)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "## Step 2: Feature Processing\n", 111 | "\n", 112 | "> Class `DataFeatures`is initialized with the depth level desired. The depth level can range from 1 to 4, where levels 1-3 are sistematically derived from instrument item structures, and level 4 is empirically derived in accordance with clinical experts. According to the levels, _behavioral EHRs_ (bEHRs) and vocabulary of terms are created. 
For each subject, each item score $N$ is considered as a word of the form `instrument_name::item::N`, the sequence of \"words\" chronologically ordered becomes the bEHR for each individual. Moreover, all the behavioral terms obtained are collected into a vocabulary. \n", 113 | "\n", 114 | "> The `create_level_features` method is only available for level 4, due to noise and missingness of data. It represents each subject as a vector of quantitative scores to tests ordered according to 5 timeframes (F1-F5), clinically selected. Missing values are imputed with mean." 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "from features import DataFeatures" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "datafeatures = DataFeatures(level=4, df_dict=rid_tables)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "behr, (bt_to_idx, idx_to_bt) = datafeatures.create_level_tokens()" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "feat_df, feat_df_scaled = datafeatures.create_level_features()" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "## Step 3: Embeddings\n", 158 | "\n", 159 | "> `Pembeddings` class consits of three methods: `tfidf` that outputs patient embeddings from SVD transform of word co-occurrence counts; `word2vec_emb` that computes word embeddings for each behavioral term learned via _continuous Skip-gram model_ (Mikolov et al., 2013) and outputs patient representations averaging out the behavioral terms of their sequence; `glove_pemb` that learns word embeddings via GloVe algorithm (Pennington et al., 2014) and averages out behavioral terms returning patient encodings." 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "from pt_embedding import Pembeddings" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "model = Pembeddings(behr, bt_to_idx)" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "svd_pid_list, svd_mtx = model.tfidf()" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "glove_pid_list, glove_emb, word_emb = model.glove_pemb()" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "w2v_pid_list, w2v_emb, w2v_word_emb, _ = model.word2vec_emb()" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "## Step 4: Clustering\n", 212 | "\n", 213 | "> This module performs _hierarchical clustering_ or _k-means clustering_ techniques on either subject embeddings or feature data. The best number of clusters is chosen via the Elbow Method." 
214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "from clustering import HclustEmbeddings, HclustFeatures, KMeansEmbeddings, KMeansFeatures, compare_clustering\n", 223 | "import utils as ut" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "hclust_emb = HclustEmbeddings(min_cl=ut.min_cl, max_cl=ut.max_cl, \n", 233 | " affinity='euclidean', linkage='ward')\n", 234 | "\n", 235 | "kmclust_emb = KMeansEmbeddings(min_cl=ut.min_cl, max_cl=ut.max_cl)" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "### `TF-IDF` Embedding" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": {}, 249 | "outputs": [], 250 | "source": [ 251 | "# TFIDF EMBEDDING\n", 252 | "# tfidf_best_cl = hclust_emb.find_best_nclu(svd_mtx, n_iter=ut.n_iter, \n", 253 | "# subsampl=ut.subsampl)\n", 254 | "tfidf_best_hccl = hclust_emb.elbow_method(svd_mtx)\n", 255 | "tfidf_hcsubc = hclust_emb.fit(svd_mtx, svd_pid_list, tfidf_best_hccl)" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "# # KMeans clustering\n", 265 | "tfidf_best_kmcl = kmclust_emb.elbow_method(svd_mtx)\n", 266 | "tfidf_kmsubc = kmclust_emb.fit(svd_mtx, svd_pid_list, tfidf_best_kmcl)" 267 | ] 268 | }, 269 | { 270 | "cell_type": "markdown", 271 | "metadata": {}, 272 | "source": [ 273 | "### `Glove` Embedding" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": null, 279 | "metadata": {}, 280 | "outputs": [], 281 | "source": [ 282 | "# GLOVE EMBEDDING\n", 283 | "# glv_best_cl = hclust_emb.find_best_nclu(glove_emb, n_iter=ut.n_iter, subsampl=ut.subsampl)\n", 284 | "glv_best_hccl = hclust_emb.elbow_method(glove_emb)\n", 285 | "glv_hcsubc = hclust_emb.fit(glove_emb, glove_pid_list, glv_best_hccl)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "glv_best_kmcl = kmclust_emb.elbow_method(glove_emb)\n", 295 | "glv_kmsubc = kmclust_emb.fit(glove_emb, glove_pid_list, glv_best_kmcl)" 296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "metadata": {}, 301 | "source": [ 302 | "### `Word2Vec` Embedding" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": null, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "w2v_best_hccl = hclust_emb.elbow_method(w2v_emb)\n", 312 | "w2v_hcsubc = hclust_emb.fit(w2v_emb, w2v_pid_list, w2v_best_hccl)" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "w2v_best_kmcl = kmclust_emb.elbow_method(w2v_emb)\n", 322 | "w2v_kmsubc = kmclust_emb.fit(w2v_emb, w2v_pid_list, w2v_best_kmcl)" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": {}, 328 | "source": [ 329 | "### Feature clustering" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": null, 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [ 338 | "hclust_feat = HclustFeatures(min_cl=ut.min_cl, max_cl=ut.max_cl, \n", 339 | " affinity='euclidean', linkage='ward')\n", 340 | "kmclust_feat = KMeansFeatures(min_cl=ut.min_cl, max_cl=ut.max_cl)" 341 | ] 342 | }, 343 | { 344 | 
"cell_type": "code", 345 | "execution_count": null, 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "# FEATURES REPRESENTATION\n", 350 | "# feat_best_cl = hclust_feat.find_best_nclu(feat_df_scaled, n_iter=ut.n_iter, subsampl=ut.subsampl)\n", 351 | "feat_best_hccl = hclust_feat.elbow_method(feat_df_scaled)\n", 352 | "feat_hcsubc = hclust_feat.fit(feat_df_scaled, feat_best_hccl)" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": null, 358 | "metadata": {}, 359 | "outputs": [], 360 | "source": [ 361 | "feat_best_kmcl = kmclust_feat.elbow_method(feat_df_scaled)\n", 362 | "feat_kmsubc = kmclust_feat.fit(feat_df_scaled, feat_best_kmcl)" 363 | ] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "metadata": {}, 368 | "source": [ 369 | "## Step 5: Clustering II (Visualization) \n", 370 | "\n", 371 | "> The second clustering module (`visualization`) enables the visualization of dendrogram, and Elbow Method curve for number of clusters selection. Moreover, it allows the visualization of the identified subtypes with scatterplots (UMAP projection visualization technique) and heatmaps for phenotyping (quantitative scores of selected items are highlighted). All these plots are available for both patient embeddings and feature data." 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": null, 377 | "metadata": {}, 378 | "outputs": [], 379 | "source": [ 380 | "from visualization import Visualization" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": null, 386 | "metadata": {}, 387 | "outputs": [], 388 | "source": [ 389 | "viz = Visualization(pinfo, ut.col_dict, ut.c_out)" 390 | ] 391 | }, 392 | { 393 | "cell_type": "markdown", 394 | "metadata": {}, 395 | "source": [ 396 | "### Tf-idf " 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": null, 402 | "metadata": {}, 403 | "outputs": [], 404 | "source": [ 405 | "# # Example of visualization for tfidf embeddings\n", 406 | "# # Prepare data for umap and dendrogram\n", 407 | "# umap_mtx, pid_subc_list = viz.data_scatter_dendrogram(svd_mtx, tfidf_hcsubc, svd_pid_list, random_state=42,\n", 408 | "# n_neighbors = 100,\n", 409 | "# min_dist=0.0)" 410 | ] 411 | }, 412 | { 413 | "cell_type": "code", 414 | "execution_count": null, 415 | "metadata": {}, 416 | "outputs": [], 417 | "source": [ 418 | "# viz.scatterplot_dendrogram(svd_mtx, umap_mtx, pid_subc_list, 15, 10)" 419 | ] 420 | }, 421 | { 422 | "cell_type": "code", 423 | "execution_count": null, 424 | "metadata": {}, 425 | "outputs": [], 426 | "source": [ 427 | "# # Prepare data for heatmap\n", 428 | "# emb_scaled = viz.data_heatmap_emb(behr, bt_to_idx, tfidf_hcsubc, \n", 429 | "# save_df='df_tfidfemb_level4.csv')" 430 | ] 431 | }, 432 | { 433 | "cell_type": "code", 434 | "execution_count": null, 435 | "metadata": {}, 436 | "outputs": [], 437 | "source": [ 438 | "# viz.heatmap_emb(emb_scaled, 500, 2000, save_html='tfidf_heatmap_level-4')" 439 | ] 440 | }, 441 | { 442 | "cell_type": "markdown", 443 | "metadata": {}, 444 | "source": [ 445 | "### `GloVe`" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": null, 451 | "metadata": {}, 452 | "outputs": [], 453 | "source": [ 454 | "# Visualization for GloVe embeddings\n", 455 | "# Prepare data for umap and dendrogram\n", 456 | "umap_mtx, pid_subc_list = viz.data_scatter_dendrogram(glove_emb, glv_hcsubc, glove_pid_list, random_state=42,\n", 457 | " n_neighbors = 5,\n", 458 | " min_dist=0.0)" 459 | ] 460 | }, 461 | { 462 | 
"cell_type": "code", 463 | "execution_count": null, 464 | "metadata": {}, 465 | "outputs": [], 466 | "source": [ 467 | "viz.scatterplot_dendrogram(glove_emb, umap_mtx, pid_subc_list, 15, 10, save_fig=None)" 468 | ] 469 | }, 470 | { 471 | "cell_type": "code", 472 | "execution_count": null, 473 | "metadata": {}, 474 | "outputs": [], 475 | "source": [ 476 | "# Plot UMAP projection of word embeddings via GloVe\n", 477 | "viz.plot_word_embedding(word_emb, idx_to_bt, 800, \n", 478 | " 800,\n", 479 | " n_neighbors = 10,\n", 480 | " min_dist=0.0)" 481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": null, 486 | "metadata": {}, 487 | "outputs": [], 488 | "source": [ 489 | "# Prepare data for heatmap\n", 490 | "emb_scaled = viz.data_heatmap_emb(behr, bt_to_idx, glv_hcsubc, \n", 491 | " save_df=None)" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": null, 497 | "metadata": {}, 498 | "outputs": [], 499 | "source": [ 500 | "viz.heatmap_emb(emb_scaled, 500, 1800, save_html=None)" 501 | ] 502 | }, 503 | { 504 | "cell_type": "markdown", 505 | "metadata": {}, 506 | "source": [ 507 | "### `Word2vec`" 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": null, 513 | "metadata": {}, 514 | "outputs": [], 515 | "source": [ 516 | "# Scatterplot and dendrogram of UMAP projections\n", 517 | "umap_mtx, pid_subc_list = viz.data_scatter_dendrogram(w2v_emb, w2v_hcsubc, w2v_pid_list, random_state=42,\n", 518 | " n_neighbors = 5,\n", 519 | " min_dist=0.0)\n", 520 | "viz.scatterplot_dendrogram(w2v_emb, umap_mtx, pid_subc_list, 15, 10)" 521 | ] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": null, 526 | "metadata": {}, 527 | "outputs": [], 528 | "source": [ 529 | "# Plot UMAP projection of word embeddings via Word2Vec\n", 530 | "viz.plot_word_embedding(w2v_word_emb.transpose(), \n", 531 | " idx_to_bt, \n", 532 | " 800, \n", 533 | " 800,\n", 534 | " n_neighbors =10,\n", 535 | " min_dist=0.0)" 536 | ] 537 | }, 538 | { 539 | "cell_type": "code", 540 | "execution_count": null, 541 | "metadata": {}, 542 | "outputs": [], 543 | "source": [ 544 | "# Prepare data for heatmap\n", 545 | "emb_scaled = viz.data_heatmap_emb(behr, bt_to_idx, w2v_hcsubc, \n", 546 | " save_df=None)\n", 547 | "viz.heatmap_emb(emb_scaled, 500, 1800)" 548 | ] 549 | }, 550 | { 551 | "cell_type": "markdown", 552 | "metadata": {}, 553 | "source": [ 554 | "### Features" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": null, 560 | "metadata": {}, 561 | "outputs": [], 562 | "source": [ 563 | "# Feature data visualization\n", 564 | "# Prepare data for umap and dendrogram\n", 565 | "umap_mtx, pid_subc_list = viz.data_scatter_dendrogram(feat_df_scaled, feat_hcsubc, random_state=42,\n", 566 | " n_neighbors = 10,\n", 567 | " min_dist=0.0)" 568 | ] 569 | }, 570 | { 571 | "cell_type": "code", 572 | "execution_count": null, 573 | "metadata": {}, 574 | "outputs": [], 575 | "source": [ 576 | "viz.scatterplot_dendrogram(feat_df_scaled, umap_mtx, pid_subc_list, 15, 10, save_fig=None)" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": null, 582 | "metadata": {}, 583 | "outputs": [], 584 | "source": [ 585 | "# Prepare data for heatmap\n", 586 | "emb_scaled = viz.data_heatmap_feat(feat_df, feat_df_scaled, feat_hcsubc, \n", 587 | " save_df=None)" 588 | ] 589 | }, 590 | { 591 | "cell_type": "code", 592 | "execution_count": null, 593 | "metadata": {}, 594 | "outputs": [], 595 | "source": [ 596 | "viz.heatmap_feat(emb_scaled, 1000, 
2000, save_html=None)" 597 | ] 598 | }, 599 | { 600 | "cell_type": "markdown", 601 | "metadata": {}, 602 | "source": [ 603 | "---\n" 604 | ] 605 | } 606 | ], 607 | "metadata": { 608 | "kernelspec": { 609 | "display_name": "Python 3", 610 | "language": "python", 611 | "name": "python3" 612 | }, 613 | "language_info": { 614 | "codemirror_mode": { 615 | "name": "ipython", 616 | "version": 3 617 | }, 618 | "file_extension": ".py", 619 | "mimetype": "text/x-python", 620 | "name": "python", 621 | "nbconvert_exporter": "python", 622 | "pygments_lexer": "ipython3", 623 | "version": "3.6.8" 624 | } 625 | }, 626 | "nbformat": 4, 627 | "nbformat_minor": 2 628 | } 629 | -------------------------------------------------------------------------------- /visualization.py: -------------------------------------------------------------------------------- 1 | import umap 2 | from matplotlib import pyplot as plt 3 | from scipy.cluster.hierarchy import dendrogram, linkage 4 | from sklearn.cluster import KMeans 5 | import pandas as pd 6 | from bokeh.models import LinearColorMapper, BasicTicker, PrintfTickFormatter, \ 7 | ColorBar, HoverTool, ColumnDataSource, CategoricalColorMapper 8 | from bokeh.plotting import figure, show, output_notebook, output_file, save 9 | from bokeh.io import export_svgs 10 | import numpy as np 11 | from sklearn.preprocessing import MinMaxScaler 12 | from math import pi 13 | import utils as ut 14 | # Eliminate verbose warnings from Numba 15 | import warnings 16 | 17 | warnings.filterwarnings('ignore') 18 | 19 | 20 | class Visualization: 21 | """Class for the visualization of data and results. It returns: 22 | scatter plot, dendrogram, heatmap, plot Glove embeddings. 23 | """ 24 | 25 | def __init__(self, subject_info, col_dict, c_out): 26 | """ 27 | Parameters 28 | ---------- 29 | subject_info: dictionary 30 | Dictionary with subject demographics (Pinfo dataclass) 31 | as returned by cohort_info method in dataset module 32 | """ 33 | self.c_out = c_out # List of colors to exclude 34 | self.col_dict = col_dict # Dictionary of colors from matplotlib 35 | # colormap = [c for c in self.col_dict if c not in self.c_out] 36 | colormap = ut.colormap 37 | self.colormap = colormap 38 | self.subject_info = subject_info 39 | 40 | @staticmethod 41 | def data_scatter_dendrogram(X, 42 | subc_dict, 43 | pid_list=None, 44 | **kwargs): 45 | """ Prepare the data to be visualized in umap scatterplot and 46 | dendrogram 47 | 48 | Parameters 49 | ---------- 50 | X: array, dataframe 51 | either an array (as returned by patient embedding functions) 52 | or a dataframe (feature dataset) 53 | subc_dict: dictionary 54 | dictionary of pids and subcluster labels 55 | pid_list: list 56 | list of pids as ordered in X 57 | kwargs: kewyword arguments to be passed to UMAP 58 | 59 | Returns 60 | ------- 61 | numpy array 62 | umap projection 63 | list 64 | list of tuple with pid and subcluster label 65 | """ 66 | if isinstance(X, pd.DataFrame): 67 | pid_list = list(X.index) 68 | X = X.to_numpy() 69 | else: 70 | scaler = MinMaxScaler() 71 | X = scaler.fit_transform(X) 72 | 73 | umap_mtx = umap.UMAP(**kwargs).fit_transform(X) 74 | 75 | return umap_mtx, [(pid, subc_dict[pid]) for pid in pid_list] 76 | 77 | @staticmethod 78 | def plot_word_embedding(wemb_mtx, 79 | vocab, 80 | fig_width, 81 | fig_height, 82 | **kwargs): 83 | """ Function plotting word embeddings from Glove/Word2vec after UMAP transformation 84 | 85 | Parameters 86 | ---------- 87 | wemb_mtx: numpy array 88 | word embeddings as stored in vocabulary 89 | vocab: 
dictionary 90 | idx_to_bt dictionary 91 | **kwargs: n_neighbors, min_dist for UMAP module 92 | """ 93 | scaler = MinMaxScaler() 94 | wemb_mtx = scaler.fit_transform((wemb_mtx)) 95 | 96 | umap_mtx = umap.UMAP(**kwargs).fit_transform(wemb_mtx) 97 | 98 | source = ColumnDataSource(data=dict(x=umap_mtx[:, 0], 99 | y=umap_mtx[:, 1], 100 | words=list(vocab.values()))) 101 | 102 | TOOLTIPS = [('word', '@words')] 103 | 104 | plotTools = 'box_zoom, wheel_zoom, pan, crosshair, reset, save' 105 | 106 | p = figure(plot_width=fig_width, 107 | plot_height=fig_height, 108 | tools=plotTools) 109 | p.add_tools(HoverTool(tooltips=TOOLTIPS)) 110 | p.scatter(x='x', y='y', size=8, source=source) 111 | 112 | show(p) 113 | 114 | def data_heatmap_feat(self, X, X_scaled, subc_dict, save_df=None): 115 | """ Prepare data as input to heatmap feature. 116 | 117 | Parameters 118 | ---------- 119 | X: dataframe 120 | Dataframe with raw feature values 121 | X_scaled: dataframe 122 | Dataframe with scaled feature values 123 | subc_dict: dictionary 124 | Dictionary with subject ids and subcluster labels 125 | save_df: str 126 | if not None it stores the file name for csv dump 127 | 128 | Returns 129 | ------- 130 | dataframe 131 | Object with both scaled and raw values. A column with subcluster and 132 | subject id is added. 133 | """ 134 | label = {'0': 'SI', 135 | '1': 'SII', 136 | '2': 'SIII', 137 | '3': 'SIV', 138 | '4': 'SV', 139 | '5': 'SVI', 140 | '6': 'SVII', 141 | '7': 'SVIII', 142 | '8': 'SIX', 143 | '9': 'SX', 144 | '10': 'SXI', 145 | '11': 'SXII', 146 | '12': 'SXIII', 147 | '13': 'SXIV'} 148 | 149 | X_scaled = pd.DataFrame(X_scaled.sort_index().stack(), 150 | columns=['score_sc']).reset_index() 151 | X_scaled.columns = ['clpid', 'feat', 'score_sc'] 152 | X_scaled['clpid'] = ['-'.join([label[str(subc_dict[pid])], str(pid)]) 153 | for pid in X_scaled.clpid] 154 | X_scaled = X_scaled.sort_values(by='feat') 155 | 156 | X = pd.DataFrame(X.sort_index().stack(), columns=['score']).reset_index() 157 | X.columns = ['pid', 'feat', 'score'] 158 | X = X.sort_values(by='feat') 159 | 160 | X_scaled['score'] = X['score'] 161 | 162 | X_scaled = self._modify_df(X_scaled) 163 | 164 | if save_df is not None: 165 | X_scaled.to_csv(f'./data/{save_df}', 166 | index=False) 167 | return X_scaled 168 | 169 | def data_heatmap_emb(self, X, vocab, subc_dict, save_df=None): 170 | """ Prepare data as input to heatmap embeddings. 171 | 172 | Parameters 173 | ---------- 174 | X: dictionary 175 | BEHR dictionary 176 | vocab: dictionary 177 | bt_to_idx vocabulary 178 | subc_dict: dictionary 179 | Dictionary with pid and subcluster labels 180 | save_df: str 181 | if not None, it stores the name for csv dump file 182 | 183 | Returns 184 | ------- 185 | dataframe 186 | Dataframe with raw scores and scaled scores for subclusters. 187 | clpid columns with joined subcluster label and pid. 
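            Multiple scores per assessment are averaged, missing values are
            mean-imputed, and scores are min-max scaled before stacking.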
188 | """ 189 | label = {'0': 'SI', 190 | '1': 'SII', 191 | '2': 'SIII', 192 | '3': 'SIV', 193 | '4': 'SV', 194 | '5': 'SVI', 195 | '6': 'SVII', 196 | '7': 'SVIII'} 197 | 198 | # Build feature list 199 | c_lab = sorted(set(['::'.join(lab.split('::')[:-1]) 200 | for lab in vocab.keys()])) 201 | 202 | dict_age = {} 203 | for p, behr in X.items(): 204 | for vect in behr: 205 | if (p, vect[1]) not in dict_age: 206 | dict_age[(p, vect[1])] = {} 207 | for t in vect[2:]: 208 | ss = t.split('::') 209 | dict_age[(p, vect[1])].setdefault('::'.join(ss[:-1]), 210 | list()).append(int(ss[-1])) 211 | # Create dataframe with cl-pid as index 212 | val_dict = {} 213 | indx = [] 214 | for vect in sorted(list(dict_age.keys())): 215 | for f in c_lab: 216 | try: 217 | if len(dict_age[vect][f]) == 1: 218 | val_dict.setdefault(f, list()).extend(dict_age[vect][f]) 219 | else: # Mean of scores if multiple score per assessment 220 | val_dict.setdefault(f, list()).append(np.mean(dict_age[vect][f])) 221 | except KeyError: 222 | val_dict.setdefault(f, list()).append(None) 223 | indx.append(('-'.join([label[str(subc_dict[vect[0]])], vect[0]]), vect[1])) 224 | 225 | # create dataframe with cl-pi as index 226 | emb_df = pd.DataFrame(val_dict, index=indx) 227 | emb_df_imp = emb_df.fillna(emb_df.mean(), inplace=False) 228 | 229 | scaler = MinMaxScaler() 230 | emb_df_scaled = scaler.fit_transform(emb_df_imp.values) 231 | emb_df_scaled = pd.DataFrame(emb_df_scaled, index=indx, 232 | columns=emb_df.columns) 233 | 234 | emb_df = pd.DataFrame(emb_df.stack(dropna=False), 235 | columns=['score']).reset_index() 236 | emb_df_scaled = pd.DataFrame(emb_df_scaled.stack(), 237 | columns=['score_sc']).reset_index() 238 | emb_df_scaled['score'] = emb_df['score'] 239 | emb_df_scaled.columns = ['cllab_aoa', 'feat', 'score_sc', 'score'] 240 | 241 | emb_df_scaled = self._modify_df(emb_df_scaled) 242 | 243 | emb_df_scaled['clpid'] = [tup[0] for tup in emb_df_scaled['cllab_aoa']] 244 | emb_df_scaled['aoa'] = [tup[1] for tup in emb_df_scaled['cllab_aoa']] 245 | 246 | emb_df_scaled = emb_df_scaled.dropna() 247 | 248 | if save_df is not None: 249 | emb_df_scaled.to_csv(f'./data/{save_df}', 250 | index=False) 251 | 252 | return emb_df_scaled 253 | 254 | def scatterplot_dendrogram(self, 255 | X, 256 | umap_mtx, 257 | pid_subc_list, 258 | fig_height, 259 | fig_width, 260 | save_fig=None): 261 | """Scatterplot and dendrogram for clustering. The elbow method plot is also displayed. 
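        Dendrogram links are colored by subcluster when both children share the
        same label (gray otherwise); the elbow plot shows the last merge
        distances of the linkage together with their second differences.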
262 | 263 | Parameters 264 | ---------- 265 | X: numpy array or dataframe 266 | dendrogram input 267 | umap_mtx: np array 268 | Umap projections of patients 269 | pid_subc_list: list of tuples 270 | list of pid and subclusters tuples as ordered in X 271 | fig_height, fig_width: int 272 | save_fig: str name of the figure (only method and level) 273 | """ 274 | 275 | # Scale embedding data matrix 276 | if not isinstance(X, pd.DataFrame): 277 | scaler = MinMaxScaler() 278 | X = scaler.fit_transform(X) 279 | 280 | subc_list = [el[1] for el in pid_subc_list] 281 | label = {'0': 'Subgroup I', 282 | '1': 'Subgroup II', 283 | '2': 'Subgroup III', 284 | '3': 'Subgroup IV', 285 | '4': 'Subgroup V', 286 | '5': 'Subgroup VI', 287 | '6': 'Subgroup VII', 288 | '7': 'Subgroup VIII', 289 | '8': 'Subgroup IX', 290 | '9': 'Subgroup X', 291 | '10': 'Subgroup XI', 292 | '11': 'Subgroup XII', 293 | '12': 'Subgroup XIII', 294 | '13': 'Subgroup XIV'} 295 | colors = [self.colormap[cl] for cl in sorted(list(set(subc_list)))] 296 | # Bokeh scatterplot 297 | self._scatter_plot(umap_mtx, pid_subc_list, colors, fig_width, fig_height, label, save_fig) 298 | 299 | # Dendrogram 300 | linked = linkage(X, 'ward') 301 | # Color mapping 302 | dflt_col = "#808080" # Unclustered gray 303 | # * rows in Z correspond to "inverted U" links that connect clusters 304 | # * rows are ordered by increasing distance 305 | # * if the colors of the connected clusters match, use that color for link 306 | link_cols = {} 307 | for idx, lidx in enumerate(linked[:, :2].astype(int)): 308 | c1, c2 = (link_cols[x] if x > len(linked) else colors[subc_list[x]] 309 | for x in lidx) 310 | link_cols[idx + 1 + len(linked)] = c1 if c1 == c2 else dflt_col 311 | 312 | plt.figure(figsize=(5, 5)) 313 | dendrogram(Z=linked, 314 | # labels=np.array([str(int(i) + 1) for i in subc_list]), 315 | labels=np.array([''] * len(subc_list)), 316 | color_threshold=None, 317 | leaf_font_size=5, leaf_rotation=0, 318 | link_color_func=lambda x: link_cols[x]) 319 | if save_fig is None: 320 | plt.show() 321 | else: 322 | plt.savefig(f'./data/{save_fig}-dendrogram.eps') 323 | plt.close() 324 | 325 | # Elbow method with clusters ranging from 2 to 15 326 | plt.figure(figsize=(5, 5)) 327 | last = linked[-15:, 2] 328 | last_rev = last[::-1] 329 | idxs = np.arange(1, len(last) + 1, dtype=int) 330 | plt.plot(idxs, last_rev) 331 | 332 | acceleration = np.diff(last, 2) # 2nd derivative of the distances 333 | acceleration_rev = acceleration[::-1] 334 | plt.plot(idxs[:-2] + 1, acceleration_rev) 335 | plt.xticks(idxs) 336 | if save_fig is None: 337 | plt.show() 338 | else: 339 | plt.savefig(f'./data/{save_fig}-elbow.eps') 340 | plt.close() 341 | 342 | def scatterplot_kmeans(self, 343 | X, 344 | umap_mtx, 345 | pid_subc_list, 346 | fig_height, 347 | fig_width): 348 | """Scatterplot and elbow method for KMeans clustering. 
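        K-means inertia is computed for k = 1, ..., 14 and plotted together
        with its second differences (acceleration) to support the choice of k.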
349 | 350 | Parameters 351 | ---------- 352 | X: numpy array or dataframe 353 | dendrogram input 354 | umap_mtx: np array 355 | Umap projections of patients 356 | pid_subc_list: list of tuples 357 | list of pid and subclusters tuples as ordered in X 358 | fig_height, fig_width: int 359 | """ 360 | 361 | # Scale embedding data matrix 362 | if not isinstance(X, pd.DataFrame): 363 | scaler = MinMaxScaler() 364 | X = scaler.fit_transform(X) 365 | else: 366 | X = X.to_numpy() 367 | 368 | subc_list = [el[1] for el in pid_subc_list] 369 | 370 | colors = [self.colormap[cl] for cl in sorted(list(set(subc_list)))] 371 | # Bokeh scatterplot 372 | self._scatter_plot(umap_mtx, pid_subc_list, colors, fig_width, fig_height) 373 | 374 | # Elbow method with clusters ranging from 2 to 15 375 | inertia = [] # Sum of square differences of samples from cluster centers 376 | K = np.arange(1, 15, dtype=int) 377 | 378 | for k in K: 379 | kmean_model = KMeans(n_clusters=k).fit(X) 380 | inertia.append(kmean_model.inertia_) 381 | 382 | plt.plot(K, inertia) 383 | 384 | acceleration = np.diff(inertia, 2) # 2nd derivative of the distances 385 | plt.plot(K[:-2] + 1, acceleration) 386 | plt.xticks(K) 387 | plt.show() 388 | 389 | @staticmethod 390 | def heatmap_feat(X_scaled, 391 | fig_height, 392 | fig_width, 393 | save_html=None, 394 | save_svg=None): 395 | """ Bokeh heatmap for the visualization of scaled scores in the 396 | different subclusters. Hovertool displaying subject info and raw 397 | scores. 398 | 399 | Parameters 400 | ---------- 401 | X_scaled: dataframe 402 | Feature scaled scores 403 | fig_height, fig_width: int 404 | save_html: str file name 405 | save_svg: str svg file name 406 | """ 407 | X_scaled = X_scaled.replace({'F1::psi-sf::padre::raw_ts': 'F1::psi-sf::caretakerm::raw_ts', 408 | 'F1::psi-sf::madre::raw_ts': 'F1::psi-sf::caretakerf::raw_ts', 409 | 'F2::psi-sf::padre::raw_ts': 'F2::psi-sf::caretakerm::raw_ts', 410 | 'F2::psi-sf::madre::raw_ts': 'F2::psi-sf::caretakerf::raw_ts', 411 | 'F3::psi-sf::padre::raw_ts': 'F3::psi-sf::caretakerm::raw_ts', 412 | 'F3::psi-sf::madre::raw_ts': 'F3::psi-sf::caretakerf::raw_ts', 413 | 'F4::psi-sf::padre::raw_ts': 'F4::psi-sf::caretakerm::raw_ts', 414 | 'F4::psi-sf::madre::raw_ts': 'F4::psi-sf::caretakerf::raw_ts', 415 | 'F5::psi-sf::padre::raw_ts': 'F5::psi-sf::caretakerm::raw_ts', 416 | 'F5::psi-sf::madre::raw_ts': 'F5::psi-sf::caretakerf::raw_ts' 417 | }) 418 | 419 | colors = ["#75968f", "#a5bab7", "#c9d9d3", "#e2e2e2", 420 | "#dfccce", "#ddb7b1", "#cc7878", "#933b41", 421 | "#550b1d"] 422 | 423 | mapper = LinearColorMapper(palette=colors, 424 | low=X_scaled.score_sc.min(), 425 | high=X_scaled.score_sc.max()) 426 | output_notebook() 427 | p = figure(x_range=sorted(list(set(X_scaled['clpid']))), 428 | y_range=sorted(list(set(X_scaled['feat']))), 429 | x_axis_location="above", 430 | plot_width=fig_width, 431 | plot_height=fig_height, 432 | toolbar_location='below') 433 | 434 | TOOLTIPS = [('clpid', '@clpid'), 435 | ('sex', '@sex'), 436 | ('bdate', '@bdate'), 437 | ('feat', '@feat'), 438 | ('score', '@score'), 439 | ('n_enc', '@n_enc')] 440 | 441 | p.add_tools(HoverTool(tooltips=TOOLTIPS)) 442 | 443 | p.grid.grid_line_color = None 444 | p.axis.axis_line_color = None 445 | p.axis.major_tick_line_color = None 446 | p.xaxis.major_label_text_font_size = "7pt" 447 | p.yaxis.major_label_text_font_size = "7pt" 448 | p.axis.major_label_standoff = 0 449 | p.xaxis.major_label_orientation = pi / 2 450 | 451 | p.rect(x="clpid", y="feat", 452 | width=1, height=1, 453 | 
source=X_scaled, 454 | fill_color={'field': 'score_sc', 455 | 'transform': mapper}, 456 | line_color=None) 457 | 458 | color_bar = ColorBar(color_mapper=mapper, major_label_text_font_size="8pt", 459 | ticker=BasicTicker(desired_num_ticks=len(colors)), 460 | formatter=PrintfTickFormatter(format="%.2f"), 461 | label_standoff=6, border_line_color=None, location=(0, 0)) 462 | p.add_layout(color_bar, 'right') 463 | if save_html is not None: 464 | output_file(f'./data/{save_html}.html') 465 | save(p) 466 | elif save_svg is not None: 467 | p.output_backend = 'svg' 468 | export_svgs(p, f'./data/{save_svg}.svg') 469 | else: 470 | show(p) 471 | 472 | @staticmethod 473 | def heatmap_emb(emb_df_scaled, 474 | fig_height, 475 | fig_width, 476 | save_html=None, 477 | save_svg=None): 478 | """ Bokeh heatmap of scaled scores for patient embedding subclusters. 479 | Hovertool with subject info and subject raw scores. 480 | 481 | Parameters 482 | ---------- 483 | emb_df_scaled: dataframe 484 | output of data_heatmap_emb 485 | fig_height, fig_width: int 486 | save_html: str file name 487 | save_svg: str file name 488 | """ 489 | 490 | emb_df_scaled = emb_df_scaled.replace({'psi-sf::padre::raw_ts': 'psi-sf::caretakerm::raw_ts', 491 | 'psi-sf::madre::raw_ts': 'psi-sf::caretakerf::raw_ts'}) 492 | colors = ["#75968f", "#a5bab7", "#c9d9d3", "#e2e2e2", 493 | "#dfccce", "#ddb7b1", "#cc7878", "#933b41", 494 | "#550b1d"] 495 | 496 | mapper = LinearColorMapper(palette=colors, 497 | low=emb_df_scaled.score_sc.min(), 498 | high=emb_df_scaled.score_sc.max()) 499 | 500 | # output_notebook() 501 | p = figure(x_range=sorted(list(set(emb_df_scaled['clpid']))), 502 | y_range=sorted(list(set(emb_df_scaled['feat']))), 503 | x_axis_location="above", 504 | plot_width=fig_width, 505 | plot_height=fig_height, 506 | toolbar_location='below') 507 | 508 | TOOLTIPS = [('clpid', '@clpid'), 509 | ('sex', '@sex'), 510 | ('bdate', '@bdate'), 511 | ('aoa', '@aoa'), 512 | ('feat', '@feat'), 513 | ('score', '@score'), 514 | ('n_enc', '@n_enc')] 515 | 516 | p.add_tools(HoverTool(tooltips=TOOLTIPS)) 517 | 518 | p.grid.grid_line_color = None 519 | p.axis.axis_line_color = None 520 | p.axis.major_tick_line_color = None 521 | p.xaxis.major_label_text_font_size = "7pt" 522 | p.yaxis.major_label_text_font_size = "7pt" 523 | p.axis.major_label_standoff = 0 524 | p.xaxis.major_label_orientation = pi / 2 525 | 526 | p.rect(x="clpid", y="feat", 527 | width=1, height=1, 528 | source=emb_df_scaled, 529 | fill_color={'field': 'score_sc', 530 | 'transform': mapper}, 531 | line_color=None) 532 | 533 | color_bar = ColorBar(color_mapper=mapper, major_label_text_font_size="8pt", 534 | ticker=BasicTicker(desired_num_ticks=len(colors)), 535 | formatter=PrintfTickFormatter(format="%.2f"), 536 | label_standoff=6, border_line_color=None, location=(0, 0)) 537 | p.add_layout(color_bar, 'right') 538 | if save_html is not None: 539 | output_file(f'./data/{save_html}.html') 540 | save(p) 541 | elif save_svg is not None: 542 | p.output_backend = 'svg' 543 | export_svgs(p, f'./data/{save_svg}.svg') 544 | else: 545 | show(p) 546 | 547 | def _scatter_plot(self, 548 | umap_mtx, 549 | pid_subc_list, 550 | colors, 551 | fig_height, 552 | fig_width, 553 | label, 554 | save_fig): 555 | """Bokeh scatterplot to visualize in jupyter clusters and subject info. 
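        Points are UMAP projections colored by subcluster; the hover tool shows
        subject id, subgroup, sex, birth date and number of encounters. If
        save_fig is not None the plot is exported to SVG instead of shown.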
556 | 557 | Parameters 558 | ---------- 559 | umap_mtx: np array 560 | Array with UMAP projections 561 | pid_subc_list: list of tuples 562 | list of pids ordered as in umap_mtx and subcluster labels 563 | colors: list 564 | Color list 565 | fig_height, fig_width: int 566 | Figure dimensions 567 | label: dict dictionary of class numbers and subtype labels 568 | save_fig: str file name 569 | """ 570 | 571 | pid_list = list(map(lambda x: x[0], pid_subc_list)) 572 | subc_list = list(map(lambda x: x[1], pid_subc_list)) 573 | df_dict = {'x': umap_mtx[:, 0].tolist(), 574 | 'y': umap_mtx[:, 1].tolist(), 575 | 'pid_list': pid_list, 576 | 'subc_list': subc_list} 577 | 578 | df = pd.DataFrame(df_dict).sort_values('subc_list') 579 | 580 | source = ColumnDataSource(dict( 581 | x=df['x'].tolist(), 582 | y=df['y'].tolist(), 583 | pid=df['pid_list'].tolist(), 584 | subc=list(map(lambda x: label[str(x)], df['subc_list'].tolist())), 585 | col_class=[str(i) for i in df['subc_list'].tolist()], 586 | bdate=[self.subject_info[pid].dob for pid in df['pid_list'].tolist()], 587 | sex=[self.subject_info[pid].sex for pid in df['pid_list'].tolist()], 588 | n_enc=[self.subject_info[pid].n_enc for pid in df['pid_list'].tolist()])) 589 | 590 | labels = [str(i) for i in df['subc_list']] 591 | cmap = CategoricalColorMapper(factors=sorted(pd.unique(labels)), 592 | palette=colors) 593 | TOOLTIPS = [('pid', '@pid'), 594 | ('subc', '@subc'), 595 | ('sex', '@sex'), 596 | ('bdate', '@bdate'), 597 | ('n_enc', '@n_enc')] 598 | 599 | plotTools = 'box_zoom, wheel_zoom, pan, crosshair, reset, save' 600 | 601 | output_notebook() 602 | p = figure(plot_width=fig_width * 50, plot_height=fig_height * 50, 603 | tools=plotTools, title='Quantitative features') 604 | p.add_tools(HoverTool(tooltips=TOOLTIPS)) 605 | p.circle('x', 'y', legend='subc', source=source, 606 | color={'field': 'col_class', 607 | # "field": 'subc', 608 | "transform": cmap}, size=8) 609 | p.xaxis.major_tick_line_color = None 610 | p.xaxis.minor_tick_line_color = None 611 | p.yaxis.major_tick_line_color = None 612 | p.yaxis.minor_tick_line_color = None 613 | p.xaxis.major_label_text_color = None 614 | p.yaxis.major_label_text_color = None 615 | p.grid.grid_line_color = None 616 | p.legend.location = 'bottom_right' 617 | if save_fig is None: 618 | show(p) 619 | else: 620 | p.output_backend = 'svg' 621 | export_svgs(p, f'./data/{save_fig}-scatterplot.svg') 622 | 623 | def _modify_df(self, df): 624 | """ Adds subject info to dataframe for heatmaps 625 | 626 | Parameters 627 | ---------- 628 | df: dataframe 629 | Stacked scaled dataframe with cl-pid column 630 | 631 | Returns 632 | ------- 633 | dataframe 634 | Dataframe with subject demographic info and number 635 | of encounters 636 | """ 637 | 638 | sex_vect = [] 639 | bdate_vect = [] 640 | n_enc_vect = [] 641 | for pid in df.iloc[:, 0]: 642 | if isinstance(pid, str): 643 | slab = pid.split('-')[1] 644 | sex_vect.append(self.subject_info[slab].sex) 645 | bdate_vect.append(self.subject_info[slab].dob) 646 | n_enc_vect.append(self.subject_info[slab].n_enc) 647 | else: 648 | slab = pid[0].split('-')[1] 649 | sex_vect.append(self.subject_info[slab].sex) 650 | bdate_vect.append(self.subject_info[slab].dob) 651 | n_enc_vect.append(self.subject_info[slab].n_enc) 652 | 653 | df['sex'] = sex_vect 654 | df['bdate'] = bdate_vect 655 | df['n_enc'] = n_enc_vect 656 | 657 | return df 658 | --------------------------------------------------------------------------------
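To make the cluster-number selection and cluster-comparison logic used in `clustering.py` concrete, below is a minimal, self-contained sketch. It relies only on standard scikit-learn calls; the synthetic `make_blobs` data standing in for a patient-embedding matrix, the variable names, and the chosen parameters are illustrative assumptions, not part of the repository.

```python
# Illustrative sketch (not a repository module): elbow-based selection of k via
# the second difference ("acceleration") of the k-means inertia curve, followed
# by a clustering comparison, mirroring compare_clustering(cl1, cl2, method).
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import adjusted_mutual_info_score, fowlkes_mallows_score
from sklearn.preprocessing import MinMaxScaler

# Synthetic stand-in for a (n_subjects x n_dims) patient-embedding matrix
X, true_labels = make_blobs(n_samples=200, centers=4, n_features=10,
                            random_state=42)
X = MinMaxScaler().fit_transform(X)

# Inertia (within-cluster sum of squares) for k = 1, ..., max_cl - 1
max_cl = 10
inertia = [KMeans(n_clusters=k, random_state=42).fit(X).inertia_
           for k in range(1, max_cl)]

# Elbow = k at which the acceleration of the inertia curve peaks;
# index 0 of the second difference corresponds to k = 2, hence the "+ 2"
best_k = int(np.diff(inertia, 2).argmax()) + 2
print(f"Selected number of clusters: {best_k}")

# Cluster with the selected k and compare the labeling against the ground truth
pred_labels = KMeans(n_clusters=best_k, random_state=42).fit_predict(X)
print("AMI:", adjusted_mutual_info_score(true_labels, pred_labels,
                                         average_method='arithmetic'))
print("FM:", fowlkes_mallows_score(true_labels, pred_labels))
```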