├── .DS_Store ├── .gitignore ├── LICENSE ├── README.md ├── signal.xlsx └── src ├── convert_sc_data.py ├── convert_seurat.R ├── download.py └── run.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data2intelligence/CytoSig_prediction/8bf4d45215a9e1869b7c3f0e71b54dfec6ddee13/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | build 3 | dist 4 | git_* 5 | .*project 6 | .settings 7 | *.egg-info 8 | __pycache__ 9 | *.pyc 10 | *~ 11 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Users can use CytoSig under the GNU General Public License version 3: https://www.gnu.org/licenses/gpl-3.0.en.html 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Example code of run CytoSig analysis and reproduce prediction results on bulk and single-cell cohorts 2 | 3 | Please CD into the src folder for the following steps. 4 | 5 | # Stage 1: download datasets 6 | run "./download.py" 7 | 8 | The starting data will be available in the folder data, named with bulk, single_cell, and output 9 | 10 | # Stage 2: convert single-cell data to CytoSig input format 11 | **This step is optional if you are only interested in bulk data analysis in Figure 4. Just jump to Stage 3.** 12 | 13 | run "./convert_sc_data.py" 14 | **Note**: This step needs a CPU with large memory of 64G as the dataset EGAS00001004571 contains many single cells. 15 | 16 | We included two single-cell datasets that are neither in CytoSig input format or CellRanger format. GSE145926 is released as H5 files. EGAS00001004571 is released as Seurat object. 17 | This program will convert both datasets to python pickles of dense matrices as CytoSig input. 18 | 19 | # Stage 3: predict CytoSig signaling activity and generate figures 20 | run "./run.py" 21 | If you converted single-cell data in step 2, this step will need a CPU with large memory of 64G as the dataset EGAS00001004571 contains many single cells. 22 | The relevant figure numbers are labeled above each function. 23 | 24 | **Task 1**: bulk data from tumor and inflammatory disease studies 25 | 26 | **Task 2**: single-cell data from COVID19 studies (only triggered if you have run the optional stage 2) 27 | -------------------------------------------------------------------------------- /signal.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/data2intelligence/CytoSig_prediction/8bf4d45215a9e1869b7c3f0e71b54dfec6ddee13/signal.xlsx -------------------------------------------------------------------------------- /src/convert_sc_data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os, pathlib, tables, pandas, numpy, re 4 | import scipy.sparse as sp_sparse 5 | 6 | from scipy import io 7 | from glob import glob 8 | 9 | base_path = pathlib.Path(__file__).parent.absolute() 10 | base_path = os.path.dirname(base_path) 11 | 12 | src_path = os.path.join(base_path, 'src') 13 | data_path = os.path.join(base_path, 'data') 14 | 15 | sc_path = os.path.join(data_path, 'single_cell') 16 | 17 | 18 | def filter_matrix(matrix): 19 | # jump empty barcodes 20 | matrix = matrix.loc[~matrix.index.isnull(), matrix.sum() > 0] 21 | 22 | # jump all zero genes 23 | matrix = matrix.loc[(matrix == 0).mean(axis=1) < 1] 24 | 25 | # jump ambiguous genes, if any 26 | cnt_map = matrix.index.value_counts() 27 | if cnt_map.max() > 1: 28 | matrix = matrix.loc[cnt_map.loc[matrix.index] == 1] 29 | 30 | return matrix 31 | 32 | 33 | 34 | def get_matrix_from_h5(filename): 35 | fin = tables.open_file(filename, 'r') 36 | 37 | mat_group = fin.get_node(fin.root, 'matrix') 38 | barcodes = fin.get_node(mat_group, 'barcodes').read() 39 | data = getattr(mat_group, 'data').read() 40 | indices = getattr(mat_group, 'indices').read() 41 | indptr = getattr(mat_group, 'indptr').read() 42 | shape = getattr(mat_group, 'shape').read() 43 | matrix = sp_sparse.csc_matrix((data, indices, indptr), shape=shape) 44 | 45 | feature_ref = {} 46 | feature_group = fin.get_node(mat_group, 'features') 47 | feature_ids = getattr(feature_group, 'id').read() 48 | feature_names = getattr(feature_group, 'name').read() 49 | feature_types = getattr(feature_group, 'feature_type').read() 50 | feature_ref['id'] = feature_ids 51 | feature_ref['name'] = feature_names 52 | feature_ref['feature_type'] = feature_types 53 | 54 | tag_keys = getattr(feature_group, '_all_tag_keys').read() 55 | 56 | for key in tag_keys: 57 | feature_ref[key] = getattr(feature_group, key.decode("utf-8")).read() 58 | 59 | matrix = pandas.DataFrame.sparse.from_spmatrix(matrix) 60 | matrix.index = map(lambda v: v.decode("utf-8"), feature_ref['name']) 61 | matrix.columns = map(lambda v: v.decode("utf-8"), barcodes) 62 | 63 | fin.close() 64 | 65 | return filter_matrix(matrix) 66 | 67 | 68 | 69 | 70 | def load_COVID19_GSE145926(): 71 | """ 72 | Load a COVID19 single-cell RNASeq data from patient lavage samples 73 | Final column names as cell_type.patient.barcode 74 | """ 75 | 76 | fprefix = os.path.join(sc_path, 'GSE145926') 77 | 78 | info = pandas.read_excel(os.path.join(fprefix, 'patient_group.xlsx'), index_col=0)['Group'] 79 | 80 | # the initial h5 files are downloaded from GEO 81 | file_list = glob(os.path.join(fprefix, '*.h5')) 82 | 83 | merge = [] 84 | 85 | for f in file_list: 86 | title = os.path.basename(f).split('_')[1] 87 | print(title) 88 | 89 | data = get_matrix_from_h5(f) 90 | 91 | cat = info.loc[title] 92 | 93 | data.columns = cat + '_' + title + '.' + data.columns 94 | merge.append(data) 95 | 96 | data = pandas.concat(merge, axis=1, join='inner') 97 | data = data.loc[(data == 0).mean(axis=1) < 1] 98 | 99 | # down-sizing by 10 100 | ratio = 1E5/data.sum() 101 | data *= ratio 102 | 103 | data = numpy.log2(data + 1) 104 | 105 | # add cell types 106 | ID_map = { 107 | 'S1': 'severe_C143', 108 | 'S2': 'severe_C145', 109 | 'S3': 'severe_C146', 110 | 'S4': 'severe_C148', 111 | 'S5': 'severe_C149', 112 | 'S6': 'severe_C152', 113 | 114 | 'M1': 'mild_C141', 115 | 'M2': 'mild_C142', 116 | 'M3': 'mild_C144', 117 | 118 | 'H1': 'healthy_C51', 119 | 'H2': 'healthy_C52', 120 | 'H3': 'healthy_C100', 121 | } 122 | 123 | # general cell map, these annotations are labeled by ourselves 124 | info = pandas.read_csv(os.path.join(fprefix, 'cluster_results.tsv'), sep='\t', index_col=0) 125 | info = info.loc[:, 'clusterID'] 126 | 127 | cluster = pandas.read_csv(os.path.join(fprefix, 'cluster_annotation.txt'), sep='\t', index_col=0) 128 | cluster = cluster.loc[:, 'cellTypeSub'] 129 | 130 | cell_type = info.apply(lambda v: cluster.loc[v]) 131 | cell_type.index = [ID_map[v.split('_')[0]] + '.' + v.split('_')[1] for v in info.index] 132 | 133 | # T cell map 134 | info = pandas.read_csv(os.path.join(fprefix, 'cluster_T_results.tsv'), sep='\t', index_col=0) 135 | info = info.loc[:, 'clusterID'] 136 | 137 | cluster = pandas.read_csv(os.path.join(fprefix, 'cluster_T_annotation.txt'), sep='\t', index_col=0) 138 | cluster = cluster.loc[:, 'cellTypeSub'] 139 | 140 | Tcell_type = info.apply(lambda v: cluster.loc[v]) 141 | Tcell_type.index = [ID_map[v.split('_')[0]] + '.' + v.split('_')[1] for v in info.index] 142 | 143 | cntmap = cell_type.loc[Tcell_type.index].value_counts() 144 | assert cntmap.shape[0] == 1 145 | 146 | cell_type.drop(Tcell_type.index, inplace=True) 147 | 148 | # put in cell type to data columns 149 | data_1 = data.loc[:, cell_type.index] 150 | data_1.columns = cell_type.loc[data_1.columns] + '.' + data_1.columns 151 | 152 | data_2 = data.loc[:, Tcell_type.index] 153 | data_2.columns = Tcell_type.loc[data_2.columns] + '.' + data_2.columns 154 | 155 | data = pandas.concat([data_1, data_2], axis=1, join='inner') 156 | 157 | print(data.shape) 158 | data.sparse.to_dense().to_pickle(fprefix + '.pickle.gz', compression='gzip') 159 | 160 | 161 | 162 | def load_mtx_to_dataframe(data, gene, barcode): 163 | data = io.mmread(data) 164 | data = pandas.DataFrame.sparse.from_spmatrix(data) 165 | 166 | gene = pandas.read_csv(gene, sep='\t', header=None).iloc[:,0] 167 | barcode = pandas.read_csv(barcode, sep='\t', header=None).iloc[:,0] 168 | 169 | data.index = gene 170 | data.columns = barcode 171 | 172 | return filter_matrix(data) 173 | 174 | 175 | 176 | def load_COVID19_EGAS00001004571(): 177 | fpath = os.path.join(sc_path, 'EGAS00001004571') 178 | 179 | # step 1: Seurat object to mtx matrix with normalized data 180 | # The initial rds file is downloaded from https://beta.fastgenomics.org/datasets/detail-dataset-952687f71ef34322a850553c4a24e82e#Files 181 | os.system(' '.join([ 182 | os.path.join(src_path, 'convert_seurat.R'), 183 | os.path.join(fpath, 'seurat_COVID19_PBMC_cohort1_10x_jonas_FG_2020-08-15.rds'), 184 | os.path.join(fpath, 'convert') 185 | ])) 186 | 187 | # step 2: sparse matrix to python pickle 188 | metadata = os.path.join(fpath, 'convert', 'metadata') 189 | metadata = pandas.read_csv(metadata, sep='\t', index_col=0, low_memory=False) 190 | 191 | postfix = re.compile('_[0-9]+$') 192 | 193 | cell_type = metadata['cluster_labels_res.0.4'].apply(lambda v: re.sub(postfix, '', v).replace('.', '-')) 194 | info = cell_type + '.' + metadata['group_per_sample'] + '_' + metadata['sampleID'].apply(lambda v: v.replace('.', '-')) 195 | 196 | # these data are seurat normalized already, no need to transform again 197 | data = load_mtx_to_dataframe( 198 | os.path.join(fpath, 'convert', 'matrix.mtx'), 199 | os.path.join(fpath, 'convert', 'genes.tsv'), 200 | os.path.join(fpath, 'convert', 'barcodes.tsv') 201 | ) 202 | 203 | data.columns = info.loc[data.columns] + '.' + data.columns 204 | 205 | data.sparse.to_dense().to_pickle(os.path.join(sc_path, 'EGAS00001004571_cohort1.pickle.gz')) 206 | 207 | 208 | 209 | 210 | def main(): 211 | load_COVID19_GSE145926() 212 | load_COVID19_EGAS00001004571() 213 | 214 | return 0 215 | 216 | if __name__ == '__main__': main() 217 | -------------------------------------------------------------------------------- /src/convert_seurat.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | suppressPackageStartupMessages(library(Seurat)) 4 | suppressPackageStartupMessages(library(DropletUtils)) 5 | 6 | commands = commandArgs(trailingOnly=T) 7 | 8 | input = commands[1] 9 | output_path = commands[2] 10 | 11 | data = readRDS(input) 12 | write10xCounts(x=data@assays$RNA@data, path=output_path) 13 | write.table(data@meta.data, file.path(output_path, 'metadata'), sep='\t', quote=F) 14 | -------------------------------------------------------------------------------- /src/download.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os, pathlib, tarfile 4 | import urllib.request 5 | import ssl 6 | ssl._create_default_https_context = ssl._create_unverified_context 7 | 8 | base_path = pathlib.Path(__file__).parent.absolute() 9 | base_path = os.path.dirname(base_path) 10 | 11 | for url in ['https://hpc.nih.gov/~Jiang_Lab/CytoSig/CytoSig_prediction/data.tar.gz']: 12 | f = os.path.basename(url.rstrip('/')) 13 | 14 | out = os.path.join(base_path, f) 15 | urllib.request.urlretrieve(url, out) 16 | 17 | if url.find('.tar.gz') > 0: 18 | my_tar = tarfile.open(out) 19 | my_tar.extractall(base_path) 20 | my_tar.close() 21 | 22 | os.remove(out) 23 | -------------------------------------------------------------------------------- /src/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import CytoSig 4 | import pandas 5 | import os, sys 6 | import re 7 | import numpy 8 | import pathlib 9 | import seaborn 10 | import matplotlib.pyplot as plt 11 | 12 | from scipy import stats 13 | from glob import glob 14 | from lifelines.fitters.coxph_fitter import CoxPHFitter 15 | from lifelines import KaplanMeierFitter 16 | from statsmodels.stats.multitest import multipletests 17 | from scipy.cluster import hierarchy as hc 18 | from matplotlib.backends.backend_pdf import PdfPages 19 | 20 | 21 | ############################################################### 22 | # define paths 23 | base_path = pathlib.Path(__file__).parent.absolute() 24 | base_path = os.path.dirname(base_path) 25 | 26 | data_path = os.path.join(base_path, 'data') 27 | bulk_path = os.path.join(data_path, 'bulk') 28 | sc_path = os.path.join(data_path, 'single_cell') 29 | output_path = os.path.join(data_path, 'output') 30 | 31 | ############################################################### 32 | # define parameters 33 | alpha = 1E4 34 | nrand = 1000 35 | alternative='two-sided' 36 | verbose_flag = False 37 | 38 | figure_width = 7 39 | font_size = 30 40 | 41 | plt.rcParams.update({'font.size': font_size}) 42 | 43 | # default colors array 44 | colors_default = plt.rcParams['axes.prop_cycle'].by_key()['color'] 45 | 46 | ############################################################### 47 | # load CytoSig signature 48 | signature = os.path.join(sys.prefix, 'bin', 'signature.centroid') 49 | signature = pandas.read_csv(signature, sep='\t', index_col=0) 50 | 51 | 52 | 53 | ############################################################### 54 | # define functions 55 | 56 | def category_background(expression, sep): 57 | """ 58 | Create the background vector for single-cell analysis 59 | """ 60 | 61 | if sep is None: 62 | background = expression.mean(axis=1) 63 | else: 64 | category = expression.groupby([v.split(sep)[0] for v in expression.columns], axis=1).apply(lambda v: v.mean(axis=1)) 65 | background = category.mean(axis=1) 66 | 67 | background.name = 'background' 68 | return background 69 | 70 | 71 | 72 | def heatmap(data, output, top_value=None, shrink=1, metric_type='correlation', flag_cluster=True): 73 | if type(data) == str: data = pandas.read_csv(data, sep='\t', index_col=0) 74 | 75 | if flag_cluster: 76 | data.fillna(0, inplace=True) 77 | z = hc.linkage(data, method='average', metric= metric_type) 78 | 79 | dendro = hc.dendrogram(z, labels=data.index.tolist(), no_plot=True) 80 | order_index = dendro['ivl'] 81 | 82 | z = hc.linkage(data.transpose(), method='average', metric=metric_type) 83 | dendro = hc.dendrogram(z, labels= data.columns.tolist(), no_plot=True) 84 | order_columns = dendro['ivl'] 85 | 86 | data = data.loc[order_index, order_columns] 87 | 88 | fig = plt.figure(figsize=data.shape, frameon=False) 89 | 90 | if top_value is None: 91 | g = seaborn.heatmap(data.transpose(), cmap="coolwarm", center=0, square=True, xticklabels=True, yticklabels=True, cbar_kws={"shrink": shrink}) 92 | else: 93 | g = seaborn.heatmap(data.transpose(), cmap="coolwarm", center=0, vmax=top_value, vmin=-top_value, square=True, xticklabels=True, yticklabels=True, cbar_kws={"shrink": shrink}) 94 | 95 | g.set_xticklabels(g.get_xmajorticklabels(), rotation=90)#, fontsize=font_size_label) 96 | g.set_yticklabels(g.get_ymajorticklabels(), rotation=0)#, fontsize=font_size_label) 97 | 98 | fig.savefig(output + '.pdf', bbox_inches='tight', transparent=True) 99 | #fig.savefig(output + '.svg', bbox_inches='tight', transparent=True) 100 | plt.close(fig) 101 | 102 | return data 103 | 104 | 105 | def analyze_three_level_significance(out, levels, top_value = 4, cnt_thres = 5, qthres=0.05, vthres=1, merged_compare=True): 106 | """ 107 | Statistical difference analysis between severe, mild, and healthy patient groups 108 | out: patient scores, such as CytoSig signalig activity or gene expression levels. 109 | top_value: the highest absolute z-score levels in plotting 110 | cnt_thres: minimum number of cells for each plotted sample 111 | qthres: threshold for significant corrected p-values 112 | vthres: threshold for minimum absolute z-scores to claim a significance event 113 | merged_compare: in the second comparison, whether merge severe and mild patients as one group against healthy controls 114 | alternatively, just compare severe and healthy controls 115 | """ 116 | 117 | result = pandas.read_csv(out, sep='\t', index_col=0) 118 | assert result.columns.value_counts().max() == 1 119 | 120 | # two types of comparison in analysis later 121 | label_comparison1 = '%s-%s' % (levels[0], levels[1]) 122 | 123 | if merged_compare: 124 | label_comparison2 = 'disease-%s' % levels[2] 125 | else: 126 | label_comparison2 = '%s-%s' % (levels[0], levels[2]) 127 | 128 | # (patient, cell type) group merge, merge on cell level 129 | flag = ['.'.join(v.split('.')[:2]) for v in result.columns] 130 | cntmap = pandas.Series(flag).value_counts() 131 | 132 | result = result.groupby(flag, axis=1).apply(lambda v: v.mean(axis=1)) 133 | 134 | # only keep entities with sufficent number of cells 135 | result = result.loc[:, cntmap.index[cntmap >= cnt_thres]] 136 | 137 | result_group = result.groupby([v.split('.')[0] for v in result.columns], axis=1) 138 | 139 | writer = pandas.ExcelWriter(out + '.stat.xlsx', engine='xlsxwriter') 140 | pdf = PdfPages(out + '.heatmap.pdf') 141 | 142 | merge_signal_score = [] 143 | 144 | mat_map_sig = {} 145 | cnt_map_sig = {} 146 | 147 | merge_severe_mild = [] 148 | 149 | for cell_type, result in result_group: 150 | result.columns = [v.split('.')[1] for v in result.columns] 151 | result = result.loc[(result == 0).mean(axis=1) < 1] 152 | result = result.loc[result.std(axis=1) > 1e-3] 153 | 154 | # dataset specific fix 155 | if out.find('GSE145926') >= 0: 156 | result.columns = [v.replace('_C', '_') for v in result.columns] 157 | 158 | elif out.find('EGAS00001004571') >= 0: 159 | result.columns = [v.replace('_C19-CB-', '_') for v in result.columns] 160 | 161 | 162 | patient_group = result.groupby([v.split('_')[0] for v in result.columns], axis=1) 163 | 164 | if len( set(levels).intersection(patient_group.groups.keys())) != len(levels): 165 | # some patient group is missing for the current cell type 166 | print('jump %s by lacking of patient group' % cell_type) 167 | continue 168 | 169 | # order signal names 170 | z = hc.linkage(result, method='average', metric='correlation') 171 | dendro = hc.dendrogram(z, labels=result.index.tolist(), no_plot=True) 172 | order_index = dendro['ivl'] 173 | 174 | order_columns = [] 175 | 176 | # order patients within each level group 177 | for level in levels: 178 | result_sub = patient_group.get_group(level) 179 | 180 | if result_sub.shape[1] > 1: # with something to order 181 | z = hc.linkage(result_sub.transpose(), method='average', metric='correlation') 182 | dendro = hc.dendrogram(z, labels= result_sub.columns.tolist(), no_plot=True) 183 | order_columns.extend(dendro['ivl']) 184 | else: 185 | order_columns.extend(result_sub.columns) 186 | 187 | result = result.loc[order_index, order_columns] 188 | 189 | fig = plt.figure(figsize=result.shape, frameon=False) 190 | g = seaborn.heatmap(result.transpose(), cmap="coolwarm", center=0, vmax=top_value, vmin=-top_value, square=True, xticklabels=True, yticklabels=True, cbar_kws={"shrink": 0.6}) 191 | g.set_xticklabels(g.get_xmajorticklabels(), rotation=90)#, fontsize=font_size_label) 192 | g.set_yticklabels(g.get_ymajorticklabels(), rotation=0)#, fontsize=font_size_label) 193 | plt.title(cell_type, fontsize=font_size) 194 | 195 | pdf.savefig(fig, bbox_inches='tight', transparent=True) 196 | plt.close(fig) 197 | 198 | # differential analysis 199 | result_lst = [] 200 | for level in levels: result_lst.append(patient_group.get_group(level)) 201 | 202 | merge = [] 203 | 204 | for gid in result_lst[0].index: 205 | arr_severe = result_lst[0].loc[gid] 206 | arr_mild = result_lst[1].loc[gid] 207 | arr_healthy = result_lst[2].loc[gid] 208 | 209 | if merged_compare: 210 | arr_disease = pandas.concat([arr_severe, arr_mild]) 211 | else: 212 | arr_disease = arr_severe 213 | 214 | z_severe_mild, p_severe_mild = stats.ranksums(arr_severe, arr_mild) 215 | z_disease_healthy, p_disease_healthy = stats.ranksums(arr_disease, arr_healthy) 216 | 217 | merge.append( 218 | pandas.Series( 219 | [arr_severe.median(), arr_mild.median(), arr_healthy.median(), 220 | z_severe_mild, z_disease_healthy, 221 | p_severe_mild, p_disease_healthy 222 | ], name=gid, 223 | index=['med.%s' % levels[0], 'med.%s' % levels[1], 'med.%s' % levels[2], 'z.%s' % label_comparison1, 'z.%s' % label_comparison2, 'p.%s' % label_comparison1, 'p.%s' % label_comparison2]) 224 | ) 225 | 226 | result_stat = pandas.concat(merge, axis=1, join='inner').transpose() 227 | 228 | signal_score = [] 229 | 230 | for label_comparison in [label_comparison1, label_comparison2]: 231 | arr = result_stat['FDR.%s' % label_comparison] = multipletests(result_stat['p.%s' %label_comparison], method='fdr_bh')[1] 232 | signal_score.append(arr.mean()) 233 | signal_score.append(arr.min()) 234 | 235 | signal_score = pandas.Series(signal_score, index=[label_comparison1 + '.mean', label_comparison1 + '.min', label_comparison2 + '.mean', label_comparison2 + '.min'], name=cell_type) 236 | merge_signal_score.append(signal_score) 237 | 238 | result_stat.sort_values('p.%s' % label_comparison1, inplace=True) 239 | result_stat.to_excel(writer, sheet_name=cell_type[:31]) 240 | 241 | # test for significance 242 | flag = (result_stat['FDR.%s' % label_comparison1] < qthres) & (result_stat['FDR.%s' % label_comparison2] < qthres) 243 | flag &= ( 244 | (result_stat['med.%s' % levels[0]].abs() > vthres) | (result_stat['med.%s' % levels[1]].abs() > vthres) | (result_stat['med.%s' % levels[2]].abs() > vthres) 245 | ) 246 | 247 | if sum(flag) > 0: 248 | print(cell_type) 249 | 250 | arr = result_stat.loc[:, 'z.%s' % label_comparison1] 251 | arr.name = cell_type 252 | merge_severe_mild.append(arr) 253 | 254 | result_stat = result_stat.loc[flag] 255 | result_stat.drop(['FDR.%s' % label_comparison1, 'FDR.%s' % label_comparison2], axis=1, inplace=True) 256 | result_stat.sort_values('z.%s' % label_comparison1, ascending=False).to_excel(writer, sheet_name= 'sig.' + cell_type) 257 | 258 | result_sub = result.loc[flag] 259 | result_sub.to_csv(out + '.' + cell_type, sep='\t', index_label=False) 260 | 261 | heatmap(result_sub, out + '.' + cell_type, top_value=top_value, shrink=0.5, flag_cluster=False) 262 | 263 | for gid in result_stat.index: 264 | prev = cnt_map_sig.get(gid, 0) 265 | cnt_map_sig[gid] = prev + 1 266 | 267 | mat_map_sig[cell_type] = result.groupby([v.split('_')[0] for v in result.columns], axis=1).median() 268 | 269 | pdf.close() 270 | 271 | 272 | # triplet pattern merge output 273 | if len(mat_map_sig) > 0: 274 | merge_severe_mild = pandas.concat(merge_severe_mild, axis=1, join='inner') 275 | 276 | if merge_severe_mild.shape[1] > 1: 277 | z = hc.linkage(merge_severe_mild.transpose(), method='average', metric='correlation') 278 | dendro = hc.dendrogram(z, labels= merge_severe_mild.columns.tolist(), no_plot=True) 279 | merge_severe_mild = merge_severe_mild.loc[:, dendro['ivl']] 280 | 281 | pandas.Series(cnt_map_sig).to_csv(out + '.cnt_map', sep='\t', index_label=False, header=False) 282 | 283 | merge = [] 284 | 285 | for cell_type in merge_severe_mild.columns: 286 | triplet = mat_map_sig[cell_type] 287 | triplet.columns = cell_type + '@' + triplet.columns 288 | merge.append(triplet.iloc[:, ::-1]) 289 | 290 | pandas.concat(merge, axis=1, join='inner').to_csv(out + '.triplet', sep='\t', index_label=False) 291 | 292 | # output test statistics 293 | merge_signal_score = pandas.concat(merge_signal_score, axis=1, join='inner').transpose() 294 | merge_signal_score.sort_values(merge_signal_score.columns[0], inplace=True, ascending=False) 295 | merge_signal_score.to_csv(out + '.FDR', sep='\t', index_label=False) 296 | 297 | format_align = writer.book.add_format({'align': 'center'}) 298 | format_number = writer.book.add_format({'num_format': '#,##0.000', 'align': 'center'}) 299 | format_stat = writer.book.add_format({'num_format': '0.00E+00', 'align': 'center'}) 300 | 301 | width = max(len('FDR.%s' % label_comparison1), len('FDR.%s' % label_comparison2)) 302 | 303 | for worksheet in writer.sheets: 304 | worksheet = writer.sheets[worksheet] 305 | 306 | worksheet.set_column(0, result.shape[1], None, format_align) 307 | worksheet.set_column(1, 5, width, format_number) 308 | worksheet.set_column(6, 7, width, format_stat) 309 | worksheet.set_column(8, 9, width, format_number) 310 | 311 | worksheet.set_zoom(150) 312 | 313 | writer.close() 314 | 315 | 316 | 317 | def survival_best_separation(data, pivot, z_continuous): 318 | """ 319 | Get the signal cutoff for KM plot through the best-separation criteria. 320 | However, all statistical test will be based on continuous value without cutoffs.. 321 | """ 322 | cf = CoxPHFitter() 323 | 324 | arr = data[pivot] 325 | arr_rank = arr.sort_values(ascending=False) 326 | N = arr.shape[0] 327 | 328 | vthres_max = z_max = None 329 | 330 | margin = max(5, int(0.1*N)) 331 | 332 | for i in range(margin-1, N-margin): 333 | vthres = arr_rank.iloc[i] 334 | 335 | data.loc[:, pivot] = (arr >= vthres) 336 | 337 | try: 338 | cf.fit(data, data.columns[0], event_col=data.columns[1]) 339 | except: 340 | continue 341 | 342 | z = cf.summary.loc[pivot, 'z'] 343 | 344 | if vthres_max is None or z * z_continuous > z_max * z_continuous: 345 | vthres_max = vthres 346 | z_max = z 347 | 348 | # recover the value 349 | data.loc[:, pivot] = arr 350 | 351 | return vthres_max, z_max 352 | 353 | 354 | def Survival_Analysis(clinical, data, signal, title, survival_type, output): 355 | cf = CoxPHFitter() 356 | 357 | data = pandas.read_csv(data, sep='\t', index_col=0) 358 | 359 | beta, se, zscore, pvalue = CytoSig.ridge_significance_test(signature, data, alpha, alternative, nrand, 1, True, False, verbose_flag) 360 | 361 | survival = pandas.read_csv(clinical, sep='\t', index_col=0) 362 | survival.index = survival.index.astype(str) # in case patient ID is integer 363 | 364 | mat = pandas.concat([survival, zscore.loc[signal]], axis=1, join='inner') 365 | 366 | cf.fit(mat, survival.columns[0], event_col=survival.columns[1]) 367 | 368 | # wald test z-score 369 | z_Wald = cf.summary.loc[signal, 'z'] 370 | 371 | # the CoxPH regression output two-sided p-values by default, we divide them by 2 to get one-sided 372 | p_oneside = cf.summary.loc[signal, 'p']/2 373 | 374 | # find the best separation cutoff only for KM plot, but not statistical test above 375 | thres, _ = survival_best_separation(mat, signal, z_Wald) 376 | 377 | flag = (mat[signal] > thres) 378 | 379 | fig = plt.figure(figsize=(figure_width, figure_width), frameon=False) 380 | kmf = KaplanMeierFitter() 381 | 382 | kmf.fit(mat.iloc[:,0].loc[flag], mat.iloc[:,1].loc[flag], label= 'High (n=%d)' % (sum(flag))) 383 | a1 = kmf.plot(ci_show=False, show_censors=True, color='red', linewidth=2) 384 | 385 | kmf.fit(mat.iloc[:,0].loc[~flag], mat.iloc[:,1].loc[~flag], label='Low (n=%d)' % (sum(~flag))) 386 | kmf.plot(ax=a1, ci_show=False, show_censors=True, color='blue', linestyle='--', linewidth=2) 387 | 388 | plt.xlabel(survival_type) 389 | plt.ylabel('Fraction') 390 | plt.legend(frameon=False) 391 | plt.title('p = %.2e' % p_oneside, fontsize=font_size) 392 | 393 | fig.savefig(output + '.pdf', bbox_inches='tight', transparent=True) 394 | plt.close(fig) 395 | 396 | 397 | def Survival_Analysis_Tumor(): 398 | """ 399 | Analyze across tumor datasets 400 | """ 401 | 402 | # data 1, Sunitinib multi-kinase inhibitor, targetting VEGF receptor, in metastatic clear-cell renal cell carcinoma (CCRCC) 403 | signal = 'VEGFA' 404 | title = 'Sunitinib CCRCC' 405 | survival_type = 'Progression-free survival (month)' 406 | 407 | fprefix = os.path.join(bulk_path, 'tumor', 'E-MTAB-3267') 408 | output = os.path.join(output_path, 'E-MTAB-3267.%s_%s' % (signal, title.replace(' ', '_'))) 409 | 410 | clinical = fprefix + '.PFS' 411 | data = fprefix + '.norm_subtract.gz' 412 | 413 | Survival_Analysis(clinical, data, signal, title, survival_type, output) 414 | 415 | # data 2, Bevacizumab mono-therapy treatment in Glioblastoma (GBM) 416 | signal = 'VEGFA' 417 | title = 'Bevacizumab GBM' 418 | survival_type = 'Overall survival (month)' 419 | 420 | fprefix = os.path.join(bulk_path, 'tumor', 'GSE72951') 421 | output = os.path.join(output_path, 'GSE72951.%s_%s' % (signal, title.replace(' ', '_'))) 422 | 423 | clinical = fprefix + '.OS.Bevacizumab' 424 | data = fprefix + '.self_subtract.gz' 425 | 426 | Survival_Analysis(clinical, data, signal, title, survival_type, output) 427 | 428 | # data 3, Atezolizumab anti-PDL1, requiring pre-existing interferon gamma signaling (IFNG), in metastatic urothelial carcinoma 429 | signal = 'IFNG' 430 | title = 'Atezolizumab Urothelial' 431 | survival_type = 'Overall survival (month)' 432 | 433 | fprefix = os.path.join(bulk_path, 'tumor', 'EGAS00001002556') 434 | output = os.path.join(output_path, 'EGAS00001002556.%s_%s' % (signal, title.replace(' ', '_'))) 435 | 436 | clinical = fprefix + '.OS' 437 | data = fprefix + '.self_subtract.gz' 438 | 439 | Survival_Analysis(clinical, data, signal, title, survival_type, output) 440 | 441 | 442 | 443 | def CytoSig_run_Inflam(): 444 | """ 445 | inflammatory diseases 446 | """ 447 | 448 | # this is an output folder from the FDC framework, processed result files should look like *.diff.1 449 | fpath = os.path.join(bulk_path, 'inflam') 450 | 451 | # run CytoSig for all datasets 452 | diff_lst = glob(os.path.join(fpath, '*.diff.1')) 453 | 454 | merge = [] 455 | 456 | for fprefix in diff_lst: 457 | title = os.path.basename(fprefix) 458 | print(title) 459 | 460 | data = pandas.read_csv(fprefix, sep='\t', index_col=0) 461 | beta, se, zscore, pvalue = CytoSig.ridge_significance_test(signature, data, alpha, alternative, nrand, 1, True, False, verbose_flag) 462 | zscore.to_csv(fprefix + '.signal', sep='\t', index_label=False) 463 | 464 | # if a dataset has individual sample information 465 | if os.path.exists(fprefix + '.sep.gz'): 466 | data = pandas.read_csv(fprefix + '.sep.gz', sep='\t', index_col=0) 467 | 468 | beta, se, zscore, pvalue = CytoSig.ridge_significance_test(signature, data, alpha, alternative, nrand, 1, True, False, verbose_flag) 469 | zscore.to_csv(fprefix + '.sep.signal', sep='\t', index_label=False) 470 | 471 | # merge results 472 | result = pandas.read_csv(fprefix + '.signal', sep='\t', index_col=0) 473 | result.columns = title.split('.')[0] + '_' + result.columns 474 | merge.append(result) 475 | 476 | merge = pandas.concat(merge, axis=1) 477 | assert merge.columns.value_counts().max() == 1 478 | 479 | merge.to_csv(os.path.join(fpath, 'merge'), sep='\t', index_label=False) 480 | 481 | 482 | 483 | 484 | def violinplot_one(handle, arr, i, col, alpha=0.6, flag_dot=False, alpha_dot=None, col_dot=None): 485 | parts = handle.violinplot(arr, [i], showmeans=False, showmedians=True, showextrema=False, widths=0.6) 486 | 487 | for pc in parts['bodies']: 488 | pc.set_facecolor(col) 489 | pc.set_edgecolor('black') 490 | pc.set_alpha(alpha) 491 | 492 | if 'cmeans' in parts: 493 | pc = parts['cmeans'] 494 | elif 'cmedians' in parts: 495 | pc = parts['cmedians'] 496 | else: 497 | pc = None 498 | 499 | if pc is not None: 500 | pc.set_linewidth(5) 501 | pc.set_edgecolor('black') 502 | pc.set_alpha(1) 503 | 504 | if flag_dot: 505 | if col_dot is None: col_dot = col 506 | if alpha_dot is None: alpha_dot = alpha 507 | 508 | x = numpy.random.normal(i, 0.1, size=arr.shape[0]) 509 | handle.plot(x, arr, color=col_dot, marker='o', linestyle='none', markersize=5, alpha=alpha_dot) 510 | 511 | 512 | 513 | def plot_canakinumab_GSE80060_response_activity(): 514 | """ 515 | systemic juvenile idiopathic arthritis (SJIA) 516 | """ 517 | 518 | output = os.path.join(output_path, 'GSE80060.IL1B_Canakinumab_SJIA') 519 | 520 | conditions = ['100', '90', '70', '50', '30', '0', 'Placebo'] 521 | 522 | # section 1: post versus pre activity 523 | result = os.path.join(bulk_path, 'inflam', 'GSE80060.MicroArray.GPL570.diff.1.sep.signal') 524 | result = pandas.read_csv(result, sep='\t', index_col=0) 525 | result = result.loc['IL1B'] 526 | 527 | # standardize sample names 528 | info = pandas.DataFrame([v.split('@')[1].split('&')[0].split('_') for v in result.index], index=result.index) 529 | info.columns = ['Treatment', 'Response'] 530 | 531 | flag_Canakinumab = (info.iloc[:,0] == 'Canakinumab') & (info.iloc[:,1] != 'nan') 532 | flag_Placebo = (info.iloc[:,0] == 'Placebo') & (info.iloc[:,1] == 'nan') 533 | info = info.loc[flag_Canakinumab | flag_Placebo] 534 | 535 | info.loc[flag_Canakinumab, 'Response'] = info.loc[flag_Canakinumab, 'Response'].apply(lambda v: v.replace('.0', '')) 536 | info = pandas.concat([info.loc[flag_Canakinumab, 'Response'], info.loc[flag_Placebo, 'Treatment']]) 537 | 538 | result = result.loc[info.index] 539 | result_group = result.groupby(info) 540 | 541 | x = [] 542 | y = [] 543 | 544 | fig = plt.figure(figsize=(figure_width, figure_width), frameon=False) 545 | 546 | for i, conc in enumerate(conditions): 547 | result = result_group.get_group(conc) 548 | violinplot_one(plt, result, i, colors_default[0], flag_dot=True, alpha=0.5, alpha_dot=0.6) 549 | 550 | if conc == 'Placebo': 551 | conc = 0 552 | else: 553 | conc = int(conc) 554 | 555 | x.append(conc) 556 | y.append(result.median()) 557 | 558 | r, p = stats.spearmanr(x, y) 559 | 560 | plt.title('r = %.2f, p = %.1e' % (r, p), fontsize=font_size) 561 | plt.axhline(0, linestyle='--', color='grey', lw=1) 562 | plt.xticks(range(len(conditions)), conditions, rotation=90) 563 | plt.tick_params(pad=10) 564 | 565 | plt.xlabel('Response (%)') 566 | plt.ylabel('IL1B activity diff day3') 567 | 568 | fig.savefig(output + '.pdf', bbox_inches='tight', transparent=True) 569 | plt.close(fig) 570 | 571 | 572 | def plot_IFNK_GSE72754_response_activity(): 573 | """ 574 | systemic lupus erythematosus 575 | """ 576 | 577 | output = os.path.join(output_path, 'GSE72754.IFN1_IFNK_SLE') 578 | 579 | # section 1: post versus pre activity 580 | result = os.path.join(bulk_path, 'inflam', 'GSE72754.MicroArray.HG-U133_Plus_2.diff.1.sep.signal') 581 | result = pandas.read_csv(result, sep='\t', index_col=0) 582 | result = result.loc['IFN1'] 583 | 584 | info = pandas.DataFrame([v.split('&')[1].split()[0].split('_') for v in result.index], index=result.index, columns=['titer', 'month']).astype(int) 585 | 586 | r, p = stats.spearmanr(info['titer'], result) 587 | 588 | fig = plt.figure(figsize=(figure_width, figure_width), frameon=False) 589 | 590 | plt.plot(info['titer'], result, 'o', markersize=10) 591 | plt.title('r = %.2f, p = %.1e' % (r, p), fontsize=font_size) 592 | plt.xlabel('anti-IFNA Titer') 593 | plt.ylabel('IFN1 activity diff') 594 | 595 | plt.axhline(0, linestyle='--', color='grey') 596 | plt.axvline(100, linestyle='--', color='grey') 597 | 598 | plt.gca().ticklabel_format(axis="x", style="sci", scilimits=(0,0)) 599 | 600 | fig.savefig(output + '.pdf', bbox_inches='tight', transparent=True) 601 | plt.close(fig) 602 | 603 | 604 | 605 | def CytoSig_run_COVID19_singlecell(): 606 | run_lst = [ 607 | ['GSE145926', ['severe', 'mild', 'healthy']], 608 | ['EGAS00001004571_cohort1', ['severe', 'mild', 'control']], 609 | ] 610 | 611 | for dataset, levels in run_lst: 612 | data = os.path.join(sc_path, dataset + '.pickle.gz') 613 | output = os.path.join(output_path, dataset + '.signal') 614 | 615 | if not os.path.exists(data): 616 | sys.stderr.write('Cannot find converted pickle for %s\n' % dataset) 617 | continue 618 | 619 | data = pandas.read_pickle(data) 620 | 621 | # normalize by health controls 622 | background = data.loc[:, [v.find(levels[2]) >= 0 for v in data.columns]] 623 | background = category_background(background, '.') 624 | data = data.subtract(background, axis=0) 625 | 626 | beta, se, zscore, pvalue = CytoSig.ridge_significance_test(signature, data, alpha, alternative, nrand, 1, True, False, verbose_flag) 627 | zscore.to_csv(output, sep='\t', index_label=False) 628 | 629 | qthres = 0.05 630 | # please see Methods section of our paper for neutrophils 631 | if dataset == 'EGAS00001004571_cohort1': qthres = 0.052 632 | 633 | analyze_three_level_significance(output, levels, qthres=qthres) 634 | 635 | 636 | 637 | 638 | def analyze_COVID19_SC_Severity_joint_heatmap(top_value = 4): 639 | datasets = ['GSE145926', 'EGAS00001004571_cohort1'] 640 | 641 | merge = [] 642 | 643 | included = set() 644 | 645 | for dataset in datasets: 646 | output = os.path.join(output_path, dataset + '.signal') 647 | 648 | if not os.path.exists(output + '.triplet'): 649 | sys.stderr.write('Cannot find result for %s\n' % dataset) 650 | continue 651 | 652 | mat = pandas.read_csv(output + '.triplet', sep='\t', index_col=0) 653 | merge.append(mat) 654 | 655 | N = len(set([v.split('@',1)[0] for v in mat.columns])) 656 | 657 | cnt_map = pandas.read_csv(output + '.cnt_map', sep='\t', index_col=0, header=None) 658 | cnt_map = cnt_map.iloc[:, 0] 659 | cnt_map = cnt_map.loc[cnt_map >= N/3.0] 660 | 661 | included.update(cnt_map.index) 662 | 663 | if len(merge) == 0: 664 | sys.stderr.write('Nothing to merge\n') 665 | return 666 | 667 | merge = pandas.concat(merge, axis=1, join='inner') 668 | merge = merge.loc[included].transpose() 669 | 670 | post_s = re.compile('s$') 671 | 672 | lst = [] 673 | for i, v in enumerate(merge.index): 674 | if (i-1) % 3 == 0: 675 | v = re.sub(post_s, '', v.split('@')[0]) 676 | else: 677 | v = '' 678 | 679 | lst.append(v) 680 | 681 | merge.index = lst 682 | 683 | g = seaborn.clustermap(merge, cmap="coolwarm", row_cluster=False, metric='correlation', vmax=top_value, vmin=-top_value, center=0, xticklabels=True, yticklabels=True, cbar_kws={'label': 'Median Activity', "shrink": .5}, 684 | figsize=(1.1*merge.shape[1], 0.5*merge.shape[0]))#, dendrogram_ratio=(0.5, 0.1)) 685 | 686 | plt.tick_params(pad=10) 687 | 688 | xlabel_lst = g.ax_heatmap.get_xticklabels() 689 | 690 | plt.setp(xlabel_lst, rotation=90) 691 | plt.setp(g.ax_heatmap.get_yticklabels(), rotation=0) 692 | 693 | merge = merge.loc[:, [v.get_text() for v in xlabel_lst]] 694 | 695 | for a in g.ax_row_dendrogram.collections: a.set_linewidth(2) 696 | for a in g.ax_col_dendrogram.collections: a.set_linewidth(2) 697 | 698 | out = os.path.join(output_path, 'COVID19.signal.triplet') 699 | 700 | merge.to_csv(out + '.sourcedata.csv') 701 | 702 | plt.savefig(out + '.significant.pdf', bbox_inches='tight', transparent=True) 703 | 704 | 705 | 706 | def get_map_lst(vmap, key, flag_map=False): 707 | lst = vmap.get(key) 708 | 709 | if lst is None: 710 | if flag_map: 711 | lst = vmap[key] = {} 712 | else: 713 | lst = vmap[key] = [] 714 | 715 | return lst 716 | 717 | 718 | def plot_blockade_activity_diff(): 719 | output = os.path.join(output_path, 'blockade_diff') 720 | 721 | info = pandas.read_excel(os.path.join(base_path, 'signal.xlsx'), engine='openpyxl') 722 | info.drop(['Comment'], axis=1, inplace=True) 723 | info.dropna(inplace=True) 724 | 725 | # VEGFA has clinical response data available, thus ignore its Xenograft studies 726 | info = info.loc[(info['Disease'] != 'Xenograft') | ((info['Disease'] == 'Xenograft') & (info['Target'] != 'VEGFA'))] 727 | 728 | # load previous CytoSig scores 729 | result = pandas.read_csv(os.path.join(bulk_path, 'inflam', 'merge'), sep='\t', index_col=0) 730 | 731 | dataset_map = {} 732 | 733 | for _, fields in info.iterrows(): 734 | dataset = fields['Dataset'] 735 | condition = fields['Condition'] 736 | targets = fields['Target'].split(',') 737 | targets = [v.strip() for v in targets] 738 | #treatment = fields['Treatment'] 739 | #disease = fields['Disease'] 740 | 741 | response = fields['Response'] 742 | 743 | for target in targets: 744 | if target not in result.index: continue 745 | v = result.loc[target, dataset + '_' + condition] 746 | 747 | # for all responder cases, plot their diff values 748 | if response != 'No': 749 | response_map = get_map_lst(dataset_map, target + '\t' + dataset, flag_map=True) 750 | lst = get_map_lst(response_map, response) 751 | lst.append(v) 752 | 753 | target_map = {} 754 | 755 | for dataset, response_map in dataset_map.items(): 756 | target, dataset = dataset.split('\t') 757 | 758 | if len(response_map) > 1: 759 | assert 'Yes' in response_map 760 | lst = response_map['Yes'] 761 | else: 762 | assert ('Yes' in response_map) or ('Unclear' in response_map) 763 | lst = list(response_map.values())[0] 764 | 765 | # dataset-level median 766 | v = numpy.median(lst) 767 | 768 | lst = target_map.get(target) 769 | if lst is None: lst = target_map[target] = [] 770 | lst.append(v) 771 | 772 | score_map = {} 773 | for target, lst in target_map.items(): score_map[target] = numpy.median(lst) 774 | score_map = pandas.Series(score_map).sort_values() 775 | 776 | p = stats.wilcoxon(score_map)[1] 777 | 778 | # create target merge orders 779 | accuracy_vthres = -1 780 | fig = plt.figure(figsize=(1.5*figure_width, figure_width), frameon=False) 781 | 782 | for i, target in enumerate(score_map.index): 783 | lst = target_map[target] 784 | 785 | lst = [max(v, -10) for v in lst] 786 | 787 | x = numpy.random.normal(i, 0.1, size= len(lst)) 788 | plt.plot(x, lst, marker='o', linestyle='none', markersize=10, color=colors_default[0]) 789 | plt.axvline(i+0.5, linestyle='--', color='grey', lw=1) 790 | 791 | plt.axhline(0, color='grey', lw=1) 792 | plt.axhline(accuracy_vthres, linestyle='--') 793 | plt.text(0, accuracy_vthres - 1, 'v = %d' % accuracy_vthres) 794 | 795 | plt.xticks(range(score_map.shape[0]), score_map.index, rotation=90) 796 | plt.tick_params(pad=10) 797 | plt.ylabel('Activity Diff') 798 | 799 | plt.title('accuracy = %.2f , p = %.1e' % ((score_map < accuracy_vthres).mean(), p), fontsize=font_size) 800 | 801 | fig.savefig(output + '.pdf', bbox_inches='tight', transparent=True) 802 | plt.close(fig) 803 | 804 | 805 | 806 | 807 | 808 | 809 | def main(): 810 | # Figure 4e & Extended Data Figure 4 811 | Survival_Analysis_Tumor() 812 | 813 | # prepare for Figure 4a-c 814 | CytoSig_run_Inflam() 815 | 816 | # Figure 4a 817 | plot_canakinumab_GSE80060_response_activity() 818 | 819 | # Figure 4b 820 | plot_IFNK_GSE72754_response_activity() 821 | 822 | # Figure 4c 823 | plot_blockade_activity_diff() 824 | 825 | # prepare for Figure 6 and generate Figure 6f 826 | CytoSig_run_COVID19_singlecell() 827 | 828 | # Figure 6g 829 | analyze_COVID19_SC_Severity_joint_heatmap() 830 | 831 | return 0 832 | 833 | if __name__ == '__main__': main() 834 | --------------------------------------------------------------------------------