def read_cell_names1(pathseq_bam_file, write_bac, threads=36):
    """Dump every YP-tagged read of a PathSeq BAM to a tab-separated file.

    Each alignment that carries a ``YP`` tag is written to *write_bac* as
    ``query_name<TAB>YP value<TAB>mapping_quality``.  Two summary counts
    (all reads, reads with a YP tag) are printed when the scan finishes.

    Args:
        pathseq_bam_file: Path of the PathSeq output BAM.
        write_bac: Path of the text file to create.
        threads: Decompression threads handed to pysam.  Defaults to 36,
            the value that used to be hard-coded.
    """
    total_pathseq_reads = 0
    total_YP_reads = 0
    # Both handles are closed (and the text file flushed) on exit, so the
    # next pipeline step can safely re-read ``write_bac``.
    with pysam.AlignmentFile(pathseq_bam_file, "rb", threads=threads) as seqbam:
        with open(write_bac, 'w') as read_name_pathseq:
            for read in seqbam:
                total_pathseq_reads += 1
                if read.has_tag('YP'):
                    total_YP_reads += 1
                    read_name_pathseq.write(
                        read.query_name + '\t' + read.get_tag('YP') + '\t'
                        + str(read.mapping_quality) + '\n')
    print('Total reads in pathseq bam = ', total_pathseq_reads)
    print('Total reads in pathseq bam with YP tag = ', total_YP_reads)
    return


def read_readnames(readname_file):
    """Load the read-name table written by ``read_cell_names1``.

    Args:
        readname_file: Tab-separated file with at least three columns:
            read name, pathogen id string, mapping score.

    Returns:
        A ``(set_of_read_names, dict_name)`` tuple where ``dict_name`` maps
        each read name to ``{"pathogen": <col 2>, "mapping_score": <col 3>}``.
        Both values are kept as strings.  When a read name occurs more than
        once the last row wins.
    """
    set_for_readnames = set()
    dict_name = {}
    with open(readname_file, 'r') as handle:
        for raw_line in handle:
            raw_line = raw_line.rstrip('\n')
            if not raw_line:
                # Tolerate blank lines instead of failing with IndexError.
                continue
            fields = raw_line.split('\t')
            read_name = fields[0]
            set_for_readnames.add(read_name)
            dict_name[read_name] = {
                "pathogen": fields[1],
                "mapping_score": fields[2],
            }
    return set_for_readnames, dict_name
def read_pathseq_report_and_create_dict(pathseq_report_csv):
    """Map PathSeq taxonomy ids to a genus-level (or 'unclassified') name.

    The report is read as tab-separated text.  Column 0 is used as the id
    and column 1 as a ``|``-separated taxonomy path.  The path is walked
    from its last component backwards and the first component that either
    contains no underscore or contains ``unclassified`` is taken as the name.

    Rows whose taxonomy column has no ``|`` (e.g. the header row) are
    skipped.  Rows where no component qualifies are skipped as well;
    previously such a row silently reused the name of the preceding row, or
    raised NameError if it came first.

    NOTE(review): indentation was lost in this copy of the file; the
    "record the id" step is reconstructed as applying only to rows whose
    taxonomy contains ``|`` - confirm against the repository.

    Args:
        pathseq_report_csv: Path of the PathSeq scores file.

    Returns:
        dict mapping id (str) to the selected taxonomy name (str).
    """
    dict_for_genus = {}
    with open(pathseq_report_csv, 'r') as pathseq_report:
        for each_line in pathseq_report:
            fields = each_line.rstrip('\n').split('\t')
            if len(fields) < 2 or '|' not in fields[1]:
                continue
            name = None
            for part in reversed(fields[1].split('|')):
                if '_' not in part or 'unclassified' in part:
                    name = part
                    break
            if name is not None:
                dict_for_genus[fields[0]] = name
    print ("len(dict_for_genus) = ",len(dict_for_genus))
    return dict_for_genus
def read_cell_names2(set_of_readnames, dict_name, dict_for_genus, original_bam_file,
                     unmap_cbub_bam_file, unmap_cbub_fasta_file, out_cell_list,
                     out_readname_cell_path, barcode_whitelist_file, threads=36):
    """Join PathSeq hits back onto barcoded reads of the original BAM.

    For every read of *original_bam_file* that has both ``CB`` and ``UB``
    tags, a whitelisted ``CB`` and is unmapped:

    * the read is written to *unmap_cbub_fasta_file* (FASTA) and to
      *unmap_cbub_bam_file* (BAM, header copied from the input), and
    * if its name is in *set_of_readnames*, one tab-separated row
      ``read, CB, UB, id string, mapping score, genus list`` is written to
      *out_readname_cell_path*.

    The distinct barcodes with at least one such hit are written to
    *out_cell_list*, one per line.  Four running totals are printed.

    NOTE(review): indentation was lost in this copy of the file; the
    nesting (whitelist check -> unmapped check -> PathSeq lookup) is
    reconstructed from the counter names and print labels - confirm
    against the repository.

    Args:
        set_of_readnames: Read names that PathSeq assigned a YP tag to.
        dict_name: read name -> {"pathogen": id string, "mapping_score": str}.
        dict_for_genus: id -> genus name.
        original_bam_file: Barcoded BAM to scan.
        unmap_cbub_bam_file: Output BAM of unmapped barcoded reads.
        unmap_cbub_fasta_file: Output FASTA of the same reads.
        out_cell_list: Output list of barcodes with pathogen hits.
        out_readname_cell_path: Output per-read table.
        barcode_whitelist_file: gzip text file, one barcode per line.
        threads: Decompression threads for pysam (default 36, as before).
    """
    with gzip.open(barcode_whitelist_file, 'rt') as white_list:
        white_list_set = {line.rstrip('\n') for line in white_list}

    set_for_infect_cells = set()
    total_cellranger_bam_reads = 0
    total_cellranger_reads_UB_CB_tags = 0
    total_cellranger_reads_UB_CB_unmap = 0
    total_cellranger_reads_UB_CB_unmap_Aligned_to_Pathseq_YP_reads = 0

    # All four handles are released even if the scan raises part-way.
    with pysam.AlignmentFile(original_bam_file, "rb", threads=threads) as seqbam, \
            open(out_readname_cell_path, 'w') as readname_cell_path, \
            open(unmap_cbub_fasta_file, 'w') as unmap_cbub_fasta, \
            pysam.AlignmentFile(unmap_cbub_bam_file, "wb", seqbam) as unmap_cbub_bam:
        for read in seqbam:
            total_cellranger_bam_reads += 1
            if not (read.has_tag('CB') and read.has_tag('UB')):
                continue
            cellname = read.get_tag('CB')
            if cellname not in white_list_set:
                continue
            total_cellranger_reads_UB_CB_tags += 1
            if not read.is_unmapped:
                continue
            total_cellranger_reads_UB_CB_unmap += 1

            # added 102721: output a fasta file for kraken
            readname = read.query_name
            unmap_cbub_fasta.write('>')
            unmap_cbub_fasta.write(readname)
            unmap_cbub_fasta.write('\n')
            unmap_cbub_fasta.write(read.query_sequence)
            unmap_cbub_fasta.write('\n')
            unmap_cbub_bam.write(read)

            if readname not in set_of_readnames:
                continue
            set_for_infect_cells.add(cellname)
            umi = read.get_tag('UB')
            path = dict_name[readname]["pathogen"]
            genus_set = set()
            for each_id in path.split(','):
                if each_id in dict_for_genus:
                    genus_set.add(dict_for_genus[each_id])
                else:
                    print(each_id," not found!")
            genus_list_string = ','.join(sorted(genus_set))
            mapping_score = dict_name[readname]["mapping_score"]
            readname_cell_path.write(
                readname + '\t' + cellname + '\t' + umi + '\t' + path + '\t'
                + mapping_score + '\t' + genus_list_string + '\n')
            total_cellranger_reads_UB_CB_unmap_Aligned_to_Pathseq_YP_reads += 1

    print('total cellranger bam reads = ',total_cellranger_bam_reads)
    print('total cellranger bam reads with UB CB tags (in-cell) = ',total_cellranger_reads_UB_CB_tags)
    print('total UNMAPPED cellranger bam reads with UB CB tags (in-cell) = ',total_cellranger_reads_UB_CB_unmap)
    print('total cellranger reads with UB_CB_unmap Aligned to Pathseq reads with YP tags = (in-cell)',total_cellranger_reads_UB_CB_unmap_Aligned_to_Pathseq_YP_reads)

    with open(out_cell_list, 'w') as cell_list:
        for each_cell in set_for_infect_cells:
            cell_list.write(each_cell)
            cell_list.write('\n')
    return
def generate_barcode_UMI_dict(out_readname_cell_path):
    """Collapse the per-read table to one best record per barcode+UMI.

    The input is the six-column tab-separated table written by
    ``read_cell_names2`` (read, barcode, UMI, id string, mapping score,
    genus string).  Reads sharing ``barcode+UMI`` are reduced to the read
    with the highest integer mapping score; on a tie the first read seen
    is kept.

    Args:
        out_readname_cell_path: Path of the per-read table.

    Returns:
        dict mapping ``"<barcode>+<UMI>"`` to a dict with keys
        ``"id_string"`` (list of ids), ``"mapping_score"`` (int) and
        ``"genus_string"`` (str).
    """
    barcode_UMI_dict = {}
    with open(out_readname_cell_path, 'r') as cell_path_file:
        for each_line in cell_path_file:
            each_line = each_line.rstrip('\n')
            if not each_line:
                continue
            fields = each_line.split('\t')
            barcode_UMI = fields[1] + '+' + fields[2]
            mapping_score = int(fields[4])
            best = barcode_UMI_dict.get(barcode_UMI)
            # Strictly greater: an equal score never replaces the first read.
            if best is None or mapping_score > best["mapping_score"]:
                barcode_UMI_dict[barcode_UMI] = {
                    "id_string": fields[3].split(','),
                    "mapping_score": mapping_score,
                    "genus_string": fields[5],
                }
    return barcode_UMI_dict
def output_cells_genus_list(barcode_UMI_dict, dict_for_genus):
    """Build per-barcode genus metadata from unambiguous UMIs.

    A UMI is kept only when (a) its stored ``genus_string`` contains no
    comma and (b) its ids, looked up in *dict_for_genus*, resolve to exactly
    one distinct genus.  Ids missing from *dict_for_genus* are ignored.

    Args:
        barcode_UMI_dict: Output of ``generate_barcode_UMI_dict``.
        dict_for_genus: id -> genus name.

    Returns:
        dict mapping barcode to a dict with keys
        ``'genus'`` (list with one genus per kept UMI),
        ``'barcode_UMI'`` (``"<barcode>+<UMI>"`` -> genus) and
        ``'pathogen_count'`` (genus -> number of kept UMIs).
    """
    unambigious_UMI = {}
    for barcode_UMI, record in barcode_UMI_dict.items():
        if ',' in record["genus_string"]:
            continue
        genera = {dict_for_genus[each_id]
                  for each_id in record["id_string"]
                  if each_id in dict_for_genus}
        if len(genera) == 1:  # only keep unambigious UMI
            unambigious_UMI[barcode_UMI] = next(iter(genera))
    print('Total unambigious UMI = ',len(unambigious_UMI))

    cell_metadata_dict = {}
    for barcode_UMI, genus in unambigious_UMI.items():
        barcode = barcode_UMI.split('+')[0]
        entry = cell_metadata_dict.setdefault(
            barcode, {'genus': [], 'barcode_UMI': {}, 'pathogen_count': {}})
        entry['genus'].append(genus)
        entry['barcode_UMI'][barcode_UMI] = genus
        entry['pathogen_count'][genus] = entry['pathogen_count'].get(genus, 0) + 1
    return cell_metadata_dict
def output_cell_metadata(cell_metadata_dict, out_genus_file, sample_ident, barcode_whitelist_file):
    """Write one summary row per whitelisted barcode.

    Output columns: ``cell_name,pathogen,UMI_count,pathogen_count``.

    * ``cell_name`` is ``<sample_ident>_<barcode>``, or just the barcode
      when *sample_ident* is the empty string.
    * ``pathogen`` is the genus with the highest UMI count, or ``MULTI``
      when several genera tie for the highest count.
    * ``UMI_count`` is that highest count (for ``MULTI``: the sum over the
      tied genera).
    * ``pathogen_count`` is a sorted, ``;``-joined list of ``genus:count``.

    NOTE(review): indentation was lost in this copy of the file, so it is
    not visible whether the ``MULTI`` sum runs over the tied genera only or
    over every genus of the barcode.  It is reconstructed here as "tied
    genera only" - confirm against the repository.

    Args:
        cell_metadata_dict: Output of ``output_cells_genus_list``.
        out_genus_file: Path of the CSV to create.
        sample_ident: Optional prefix for cell names ('' for none).
        barcode_whitelist_file: gzip text file, one barcode per line.
    """
    print('total pathogen-associated gems = ', len(cell_metadata_dict))
    with gzip.open(barcode_whitelist_file, 'rt') as white_list:
        white_list_set = {line.rstrip('\n') for line in white_list}
    cell_metadata_dict = {barcode: metadata
                          for barcode, metadata in cell_metadata_dict.items()
                          if barcode in white_list_set}
    print("total filtered pathogen-associated cells = ", len(cell_metadata_dict))

    with open(out_genus_file, 'w') as genus_file:
        genus_file.write('cell_name,pathogen,UMI_count,pathogen_count\n')
        for barcode, metadata in cell_metadata_dict.items():
            if not sample_ident == '':
                cell_name = sample_ident + '_' + barcode
            else:
                cell_name = barcode

            pathogen_counts = metadata['pathogen_count']
            pathogen_count_str = ';'.join(sorted(
                genus + ':' + str(count) for genus, count in pathogen_counts.items()))

            max_count = max(pathogen_counts.values())
            top_genera = [genus for genus, count in pathogen_counts.items()
                          if count == max_count]
            if len(top_genera) > 1:
                genus = 'MULTI'
                UMI_count = max_count * len(top_genera)
            else:
                genus = top_genera[0]
                UMI_count = max_count

            if UMI_count >= 1:
                genus_file.write(
                    ','.join([cell_name, genus, str(UMI_count), pathogen_count_str]) + '\n')
    return
def UMI_table_output(cell_metadata_dict, barcode_whitelist_file, sample_ident,
                     output_UMI_table_csv, output_UMI_validate_table_csv):
    """Write the barcode x genus UMI matrix and the UMI validation list.

    Only barcodes present in the whitelist are written.

    * *output_UMI_validate_table_csv* gets one ``<barcode>+<UMI>,<genus>``
      line per kept UMI (no header, no sample prefix).
    * *output_UMI_table_csv* gets a header ``barcode,<genus...>`` (genera
      sorted) and one row of UMI counts per barcode; genera absent from a
      barcode are written as 0.  Row names are ``<sample_ident>_<barcode>``
      unless *sample_ident* is the empty string.

    Args:
        cell_metadata_dict: Output of ``output_cells_genus_list``.
        barcode_whitelist_file: gzip text file, one barcode per line.
        sample_ident: Optional prefix for row names ('' for none).
        output_UMI_table_csv: Path of the count matrix to create.
        output_UMI_validate_table_csv: Path of the validation list to create.
    """
    with gzip.open(barcode_whitelist_file, 'rt') as white_list:
        white_list_set = {line.rstrip('\n') for line in white_list}
    print("total number of cells = ", len(white_list_set))
    cell_metadata_dict = {barcode: metadata
                          for barcode, metadata in cell_metadata_dict.items()
                          if barcode in white_list_set}

    with open(output_UMI_validate_table_csv, 'w') as output_UMI_validate_table:
        for metadata in cell_metadata_dict.values():
            for barcode_UMI, pathogen in metadata['barcode_UMI'].items():
                output_UMI_validate_table.write(barcode_UMI + ',' + pathogen + '\n')

    genera_list = sorted({pathogen
                          for metadata in cell_metadata_dict.values()
                          for pathogen in metadata['pathogen_count']})

    with open(output_UMI_table_csv, 'w') as output_UMI_table:
        output_UMI_table.write(','.join(['barcode'] + genera_list))
        output_UMI_table.write('\n')
        for barcode, metadata in cell_metadata_dict.items():
            if not sample_ident == '':
                cell_name = sample_ident + '_' + barcode
            else:
                cell_name = barcode
            counts = metadata['pathogen_count']
            row = [cell_name] + [str(counts.get(genus, 0)) for genus in genera_list]
            output_UMI_table.write(','.join(row))
            output_UMI_table.write('\n')
    return
if __name__ == "__main__":
    # Positional command-line arguments, in the order they must be given.
    ARGUMENT_NAMES = (
        'cellranger_bam_file',
        'sample_ident',
        'barcode_whitelist_file',
        'pathseq_bam_file',
        'pathseq_report_csv',
        'read_name_pathseq',
        'unmap_cbub_bam_file',
        'unmap_cbub_fasta_file',
        'out_cell_list',
        'out_readname_cell_path',
        'out_genus_file',
        'output_UMI_table_csv',
        'output_UMI_validate_table_csv',
    )
    if len(sys.argv) - 1 != len(ARGUMENT_NAMES):
        # Fail with a usage message instead of an opaque tuple-unpacking error.
        sys.exit('usage: ' + sys.argv[0] + ' ' + ' '.join(ARGUMENT_NAMES)
                 + '\n(expected ' + str(len(ARGUMENT_NAMES)) + ' arguments, got '
                 + str(len(sys.argv) - 1) + ')')

    (cellranger_bam_file, sample_ident, barcode_whitelist_file, pathseq_bam_file,
     pathseq_report_csv, read_name_pathseq, unmap_cbub_bam_file, unmap_cbub_fasta_file,
     out_cell_list, out_readname_cell_path, out_genus_file, output_UMI_table_csv,
     output_UMI_validate_table_csv) = sys.argv[1:]

    # 1. id -> genus lookup from the PathSeq report.
    dict_for_genus = read_pathseq_report_and_create_dict(pathseq_report_csv)
    # 2. Dump YP-tagged PathSeq reads, then load them back as set + dict.
    read_cell_names1(pathseq_bam_file, read_name_pathseq)
    set_of_readnames, dict_name = read_readnames(read_name_pathseq)
    # 3. Join those reads onto the barcoded BAM (writes the per-read table).
    read_cell_names2(set_of_readnames, dict_name, dict_for_genus, cellranger_bam_file,
                     unmap_cbub_bam_file, unmap_cbub_fasta_file, out_cell_list,
                     out_readname_cell_path, barcode_whitelist_file)
    # 4. Collapse to the best read per barcode+UMI and keep unambiguous UMIs.
    barcode_UMI_dict = generate_barcode_UMI_dict(out_readname_cell_path)
    cell_metadata_dict = output_cells_genus_list(barcode_UMI_dict, dict_for_genus)
    # 5. Write the per-barcode summary and the UMI matrix / validation list.
    output_cell_metadata(cell_metadata_dict, out_genus_file, sample_ident, barcode_whitelist_file)
    UMI_table_output(cell_metadata_dict, barcode_whitelist_file, sample_ident,
                     output_UMI_table_csv, output_UMI_validate_table_csv)
# Visium.R: build Seurat objects for every Visium sample, attach the
# bacterial UMI matrix produced by UMI_annotator.py as spot metadata, and
# save one filtered and one raw .rds per sample.

# The functions used below come from these packages; attaching them here
# makes the script runnable in a fresh R session.
library(Seurat)
library(ggplot2)

# add read count for V11A07-022_A1:
# NOTE(review): the paths below are relative and are resolved after setwd();
# confirm they match the directory layout on the machine running the script.
setwd("Visium/data_processing")
path <- 'Visium/raw_data/count'
sample_pattern = 'V'
samples <- sort(list.files(path, pattern=sample_pattern, full.names = TRUE))
sample.names <- basename(samples)
processed_folder = 'Visium/data_processing/python'
samples_folder = 'Visium/raw_data/count'
output_path = 'Visium/data_processing/rds'

# a loop for generating rds files
for (each_sample in sample.names){
  print(each_sample)
  data_path = paste0(samples_folder,'/',each_sample,'/outs')
  metadata_file = paste0(processed_folder,'/',each_sample,'.visium.raw_matrix.genus.csv')
  output_file_filtered = paste0(output_path,'/',each_sample,'.filtered_matrix.rds')
  output_file_raw = paste0(output_path,'/',each_sample,'.raw_matrix.rds')

  # processing: filtered matrix
  print('processing filtered matrix')
  # Bare expressions inside a for loop are evaluated but not auto-printed.
  list.files(samples_folder) # Should show filtered_feature_bc_matrix.h5
  tissue_sample<-Load10X_Spatial(data.dir = data_path,filename = "filtered_feature_bc_matrix.h5")
  plot1 <- VlnPlot(tissue_sample, features = "nCount_Spatial", pt.size = 0.1) + NoLegend()
  plot2 <- SpatialFeaturePlot(tissue_sample, features = "nCount_Spatial") + theme(legend.position = "right")
  plot1+plot2
  tissue_sample <- subset(tissue_sample, subset = nCount_Spatial > 3 & nFeature_Spatial > 3)
  SpatialFeaturePlot(tissue_sample, features = "nCount_Spatial") + theme(legend.position = "right")
  tissue_sample <- SCTransform(tissue_sample, assay = "Spatial")
  # delete original matrix to reduce size
  tissue_sample@assays$Spatial=NULL
  CellsMeta = tissue_sample@meta.data
  head(CellsMeta)

  # Bacterial UMI matrix: one row per barcode, one column per genus.
  umi_table_csv = metadata_file
  umi_table<-read.csv(umi_table_csv,sep=',',header=TRUE,row.names = 1)
  umi_table$Total <- rowSums(umi_table)
  # turn 0 to NA (done after Total is computed, so Total itself can become NA)
  umi_table[umi_table==0] <- NA
  tissue_sample<-AddMetaData(tissue_sample, umi_table)

  p1 = SpatialFeaturePlot(tissue_sample,features = c('Total'),pt.size.factor = 1.52) +
    ggtitle(paste(each_sample,' ','Total Pathogen'," nUMI Filtered", sep = "")) +
    theme(legend.position = "right",plot.title = element_text(hjust = 0.5))
  print(p1)

  # added 011322: cluster
  tissue_sample <- RunPCA(tissue_sample, assay = "SCT", verbose = FALSE)
  tissue_sample <- FindNeighbors(tissue_sample, reduction = "pca", dims = 1:20)
  tissue_sample <- FindClusters(tissue_sample, verbose = FALSE)
  set.seed(123)
  tissue_sample <- RunUMAP(tissue_sample, reduction = "pca", dims = 1:20)
  # find Spatially Variable Features
  tissue_sample <- FindSpatiallyVariableFeatures(tissue_sample, assay = "SCT", features = VariableFeatures(tissue_sample)[1:1000],selection.method = "markvariogram")

  # save filtered rds
  saveRDS(tissue_sample, file = output_file_filtered)

  # processing: raw matrix (mapping only)
  print('processing raw matrix')
  tissue_sample_raw<-Load10X_Spatial(data.dir = data_path,filename = "raw_feature_bc_matrix.h5")
  tissue_sample_raw<-AddMetaData(tissue_sample_raw, umi_table)

  p2 = SpatialFeaturePlot(tissue_sample_raw,features = c('Total'),pt.size.factor = 1.52) +
    ggtitle(paste(each_sample,' ','Total Pathogen'," nUMI RAW", sep = "")) +
    theme(legend.position = "right",plot.title = element_text(hjust = 0.5))

  print(p2)

  saveRDS(tissue_sample_raw, file = output_file_raw)

}


# Attach the genus matrices to the two named sample objects.
# NOTE(review): CRC_16.visium and OSCC_2.visium are not created in this
# script; they are assumed to already exist in the session - confirm.
read_table_csv = 'Visium/data_processing/python/CRC_16.visium.raw_matrix.genus.csv'
read_table<-read.csv(read_table_csv,sep=',',header=TRUE,row.names = 1)
CRC_16.visium<-AddMetaData(CRC_16.visium, read_table)

read_table_csv = 'Visium/data_processing/python/OSCC_2.visium.raw_matrix.genus.csv'
read_table<-read.csv(read_table_csv,sep=',',header=TRUE,row.names = 1)
# Fixed: the OSCC_2 metadata used to be added to CRC_16.visium, so
# OSCC_2.visium became a copy of the CRC sample.
OSCC_2.visium<-AddMetaData(OSCC_2.visium, read_table)
# The preprocessing pipeline for 10X Visium data, input of this pipeline is the output of spaceranger count
ml GATK/4.1.3.0-GCCcore-8.3.0-Java-1.8
ml Python
ml Pysam

# Required environment variables. They used to be written as back-ticked
# placeholders, which the shell tried to execute as commands; now the script
# stops with a clear message if one of them is missing.
: "${raw_data_folder:?set raw_data_folder to the folder containing Spaceranger output folders}"
: "${root:?set root to the working directory}"
: "${pathseqdb:?set pathseqdb to the PathSeq database folder}"

# UMI_annotator.py sits next to this script; resolve it before changing
# directory so the call below does not depend on the current directory.
script_dir=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)

cd "${root}" || exit 1
cd "${raw_data_folder}" || exit 1

# PathSeq pipeline
outpath=${root}/pathseq
mkdir -p "${outpath}"
# PathSeq process # Please adjust "-Xmx750g" based on the memory you want to use. Adjust --min-score-identity and --min-clipped-read-length based on your samples
#
for folder in *
do
  # Only Spaceranger output folders are expected here; skip stray files.
  [ -d "${folder}" ] || continue
  folder_name=${folder##*/}
  file=${folder}/outs/possorted_genome_bam.bam
  samplename=${folder_name}
  echo "${samplename}"
  gatk --java-options "-Xmx750g" PathSeqPipelineSpark \
    --input "${file}" \
    --filter-bwa-image "${pathseqdb}/pathseq_host.fa.img" \
    --kmer-file "${pathseqdb}/pathseq_host.bfi" \
    --min-clipped-read-length 60 \
    --microbe-fasta "${pathseqdb}/pathseq_microbe.fa" \
    --microbe-bwa-image "${pathseqdb}/pathseq_microbe.fa.img" \
    --taxonomy-file "${pathseqdb}/pathseq_taxonomy.db" \
    --output "${outpath}/${samplename}.pathseq.complete.bam" \
    --scores-output "${outpath}/${samplename}.pathseq.complete.csv" \
    --is-host-aligned false \
    --filter-duplicates false \
    --min-score-identity .7
done

# Python script to generate bacteria matrix
bam_path=${raw_data_folder}
pathseq_path=${root}/pathseq
out_path=${root}/python
mkdir -p "${out_path}"
cd "${bam_path}" || exit 1

for each_sample in *
do
  [ -d "${each_sample}" ] || continue
  echo "${each_sample}"
  # The second argument (sample prefix) is deliberately empty for Visium.
  python "${script_dir}/UMI_annotator.py" \
    "${bam_path}/${each_sample}/outs/possorted_genome_bam.bam" \
    '' \
    "${bam_path}/${each_sample}/outs/raw_feature_bc_matrix/barcodes.tsv.gz" \
    "${pathseq_path}/${each_sample}.pathseq.complete.bam" \
    "${pathseq_path}/${each_sample}.pathseq.complete.csv" \
    "${out_path}/${each_sample}.visium.raw_matrix.readname" \
    "${out_path}/${each_sample}.visium.raw_matrix.unmap_cbub.bam" \
    "${out_path}/${each_sample}.visium.raw_matrix.unmap_cbub.fasta" \
    "${out_path}/${each_sample}.visium.raw_matrix.list" \
    "${out_path}/${each_sample}.visium.raw.raw_matrix.readnamepath" \
    "${out_path}/${each_sample}.visium.raw_matrix.genus.cell" \
    "${out_path}/${each_sample}.visium.raw_matrix.genus.csv" \
    "${out_path}/${each_sample}.visium.raw_matrix.validate.csv"
done
def extract_bac_pos_cells(metadata_file, orig_ident):
    """Return the barcodes of bacteria-positive cells in a metadata CSV.

    The first line of *metadata_file* is the header and must contain a
    column named exactly ``Total``.  A data row is selected when
    *orig_ident* is a substring of its first column (always true for the
    empty string) and its ``Total`` value is an integer greater than zero.
    ``NA`` and empty ``Total`` values count as zero.  The barcode is the
    part of the first column after the last underscore.

    Args:
        metadata_file: Path of the comma-separated metadata file.
        orig_ident: Sample identifier to filter on, or '' for no filter.

    Returns:
        set of barcode strings.

    Raises:
        ValueError: If the header has no ``Total`` column (previously this
            surfaced as a NameError on the first data row).
    """
    print(orig_ident)
    cell_names_set = set()
    position = None
    with open(metadata_file, 'r') as metadata:
        for n, each_line in enumerate(metadata):
            each_line = each_line.rstrip('\n')
            if n > 0 and not each_line:
                continue
            each_line_list = each_line.split(',')
            if n == 0:
                # If "Total" occurs more than once the last occurrence wins.
                for k, each_item in enumerate(each_line_list):
                    if each_item == "Total":
                        position = k
                continue
            if position is None:
                raise ValueError('no "Total" column in header of ' + str(metadata_file))
            if orig_ident not in each_line_list[0]:
                continue
            total = each_line_list[position]
            if total in ('NA', ''):
                continue
            if int(total) > 0:
                cell_names_set.add(each_line_list[0].split('_')[-1])
    print('len(cell_names_set) = ',len(cell_names_set))
    return cell_names_set
def extract_UMI(validate_file, cell_names_set):
    """Load the cell_UMI -> genus table for the selected cells.

    Each line of *validate_file* is ``<cell>+<UMI>,<genus>``.  Only lines
    whose cell part is in *cell_names_set* are kept.

    Args:
        validate_file: Path of the validation CSV.
        cell_names_set: Barcodes to keep.

    Returns:
        ``(validate_dict, genus_set)``: ``validate_dict`` maps
        ``"<cell>+<UMI>"`` to genus and ``genus_set`` holds the genera seen
        among the kept lines.
    """
    validate_dict = {}
    genus_set = set()
    with open(validate_file, 'r') as validate:
        for each_line in validate:
            each_line = each_line.rstrip('\n')
            if not each_line:
                continue
            cell_name = each_line.split('+')[0]
            if cell_name not in cell_names_set:
                continue
            fields = each_line.split(',')
            validate_dict[fields[0]] = fields[1]
            genus_set.add(fields[1])
    return validate_dict, genus_set


def count_read(cellsmeta_file, validate_dict):
    """Collect the read names that support each validated cell_UMI.

    *cellsmeta_file* is the tab-separated per-read table (read, cell, UMI,
    ..., genus string).  Rows whose last column contains a comma (more than
    one genus) are ignored, as are rows whose ``cell+UMI`` is not a key of
    *validate_dict*.

    Args:
        cellsmeta_file: Path of the per-read table.
        validate_dict: cell_UMI -> genus, as returned by ``extract_UMI``.

    Returns:
        dict mapping ``"<cell>+<UMI>"`` to a list of unique read names, in
        first-seen order.
    """
    reads_by_cell_UMI = {}
    with open(cellsmeta_file, 'r') as cellsmeta:
        for each_line in cellsmeta:
            each_line_list = each_line.rstrip('\n').split('\t')
            if ',' in each_line_list[-1] or len(each_line_list) < 3:
                continue
            cell_barcode = each_line_list[1] + '+' + each_line_list[2]
            if cell_barcode in validate_dict:
                # A dict acts as an ordered set: de-duplicates, keeps order.
                reads_by_cell_UMI.setdefault(cell_barcode, {})[each_line_list[0]] = None
    return {cell_barcode: list(reads)
            for cell_barcode, reads in reads_by_cell_UMI.items()}
def summarize_read(sum_dict, validate_dict, genus_set):
    """Group cells, UMIs and read names by genus.

    Args:
        sum_dict: cell_UMI -> list of read names (from ``count_read``).
        validate_dict: cell_UMI -> genus (from ``extract_UMI``).
        genus_set: Genera to report; entries of *validate_dict* with any
            other genus are ignored.

    Returns:
        dict mapping genus to ``{'cell_list': [...], 'UMI_list': [...],
        'reads_list': [...]}``.  The lists may contain duplicates; they are
        de-duplicated when counted.  A cell_UMI missing from *sum_dict*
        contributes its cell and UMI but no reads (it used to raise
        KeyError).
    """
    genus_sum_dict = {}
    # One pass over the UMIs instead of one pass per genus.
    for each_cell_UMI, each_genus in validate_dict.items():
        if each_genus not in genus_set:
            continue
        entry = genus_sum_dict.setdefault(
            each_genus, {'cell_list': [], 'UMI_list': [], 'reads_list': []})
        entry['cell_list'].append(each_cell_UMI.split('+')[0])
        entry['UMI_list'].append(each_cell_UMI)
        entry['reads_list'] = entry['reads_list'] + sum_dict.get(each_cell_UMI, [])
    return genus_sum_dict


# this function is not used in visium analysis
def add_dicts(genus_sum_dict_1, genus_sum_dict_2):
    """Merge two per-genus summaries and return the per-genus counts.

    *genus_sum_dict_1* is extended in place with the content of
    *genus_sum_dict_2*, so repeated calls accumulate several samples into
    the first dict.  *genus_sum_dict_2* is never modified: lists taken over
    from it are copied.  Previously they were shared, so a later merge into
    the first dict also appended to the second one.

    Args:
        genus_sum_dict_1: Accumulator, as returned by ``summarize_read``.
        genus_sum_dict_2: Summary to merge in.

    Returns:
        dict mapping genus to ``{'cell': n, 'UMI': n, 'reads': n}`` with the
        numbers of distinct cells, UMIs and reads after merging.
    """
    list_keys = ('cell_list', 'UMI_list', 'reads_list')
    for each_genus, incoming in genus_sum_dict_2.items():
        if each_genus in genus_sum_dict_1:
            for key in list_keys:
                genus_sum_dict_1[each_genus][key] += incoming[key]
        else:
            genus_sum_dict_1[each_genus] = {key: list(incoming[key]) for key in list_keys}
    # then convert it to count dict
    return {
        each_genus: {
            'cell': len(set(lists['cell_list'])),
            'UMI': len(set(lists['UMI_list'])),
            'reads': len(set(lists['reads_list'])),
        }
        for each_genus, lists in genus_sum_dict_1.items()
    }
# 061322 update:
# add a function to output readnames for each genera! (maybe: cb/ub are included in the header)
def output_readnames(genus_sum_dict, output_path):
    """Write the read names of every genus to ``<output_path>/<genus>.csv``.

    One file is created per genus, with one read name per line, in the
    order of the genus' ``'reads_list'``.  *output_path* must already exist.

    Args:
        genus_sum_dict: genus -> {'reads_list': [...], ...}, as returned by
            ``summarize_read``.
        output_path: Directory that receives the files.
    """
    # there are multiple genera, so we create one file for each
    for each_genus, lists in genus_sum_dict.items():
        file_name = each_genus+'.csv'
        # Close each file before moving on, so handles do not pile up.
        with open(output_path+'/'+file_name,'w') as write_reads:
            for each_readname in lists['reads_list']:
                write_reads.write(each_readname+'\n')
    return


# instead of add_dicts, for visium I use a count_dicts instead
def count_dicts(genus_sum_dict):
    """Turn per-genus lists into per-genus distinct counts.

    Args:
        genus_sum_dict: genus -> {'cell_list': [...], 'UMI_list': [...],
            'reads_list': [...]}.

    Returns:
        dict mapping genus to ``{'cell': n, 'UMI': n, 'reads': n}``, where
        each ``n`` is the number of distinct entries in the matching list.
    """
    return {
        each_genus: {
            'cell': len(set(lists['cell_list'])),
            'UMI': len(set(lists['UMI_list'])),
            'reads': len(set(lists['reads_list'])),
        }
        for each_genus, lists in genus_sum_dict.items()
    }
def output_read(output_file_name, genus_count_dict):
    """Write the per-genus counts as a CSV file.

    Columns: ``Genus,Number_of_Cells,Number_of_UMI,Number_of_reads``; one
    row per genus, in the iteration order of *genus_count_dict*.

    Args:
        output_file_name: Path of the CSV to create.
        genus_count_dict: genus -> {'cell': n, 'UMI': n, 'reads': n}, as
            returned by ``count_dicts``.
    """
    with open(output_file_name,'w') as output_file:
        output_file.write('Genus,Number_of_Cells,Number_of_UMI,Number_of_reads\n')
        for each_genus, counts in genus_count_dict.items():
            output_file.write(
                each_genus + ',' + str(counts['cell']) + ',' + str(counts['UMI'])
                + ',' + str(counts['reads']) + '\n')
    return


def _summarize_sample(sample_name, orig_ident=''):
    """Run the whole counting workflow for one sample (paths are fixed)."""
    print(sample_name)
    counting_dir = 'data_processing/selected_samples_for_counting/'
    python_dir = 'data_processing/python/'
    metadata_file = counting_dir + 'sample.visium_' + sample_name + '_metadata.csv'
    cell_names_set = extract_bac_pos_cells(metadata_file, orig_ident)
    validate_csv = python_dir + sample_name + '.visium.raw_matrix.validate.csv'
    validate_dict,genus_set = extract_UMI(validate_csv, cell_names_set)
    readnamepath_csv = python_dir + sample_name + '.visium.raw.raw_matrix.readnamepath'
    sum_dict = count_read(readnamepath_csv,validate_dict)
    genus_sum_dict = summarize_read(sum_dict,validate_dict,genus_set)
    # One read-name file per genus goes into the sample's own folder.
    output_readnames(genus_sum_dict, counting_dir + sample_name)
    genus_count_dict = count_dicts(genus_sum_dict)
    output_read(counting_dir + sample_name + '.sum.csv', genus_count_dict)


def main():
    """Summarize the two Visium samples analysed in the manuscript."""
    print('start processing')
    # CRC_16
    _summarize_sample('CRC_16')
    # OSCC_02
    # NOTE(review): elsewhere in the repository this sample's files are
    # named "OSCC_2..."; confirm that the "OSCC_02" inputs exist.
    _summarize_sample('OSCC_02')


# Guarded so that importing this module no longer runs the analysis.
if __name__ == "__main__":
    main()
= extract_UMI(validate_csv, cell_names_set) 212 | readnamepath_csv = 'data_processing/python/OSCC_02.visium.raw.raw_matrix.readnamepath' 213 | sum_dict = count_read(readnamepath_csv,validate_dict) 214 | genus_sum_dict = summarize_read(sum_dict,validate_dict,genus_set) 215 | readname_path = 'data_processing/selected_samples_for_counting/OSCC_02' 216 | output_readnames(genus_sum_dict,readname_path) 217 | output_file = 'data_processing/selected_samples_for_counting/'+sample_name+'.sum.csv' 218 | 219 | genus_count_dict = count_dicts(genus_sum_dict) 220 | output_read(output_file, genus_count_dict) 221 | 222 | 223 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Fred Hutchinson Cancer Center 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Galeano-Nino-Bullman-Intratumoral-Microbiota-2022 2 | 3 | 4 | [![DOI](https://zenodo.org/badge/530442339.svg)](https://zenodo.org/badge/latestdoi/530442339) 5 | 6 | 7 | Analysis code used in Galeano Nino et al., Effect of the intratumoral microbiota on spatial and cellular heterogeneity in cancer. 2022 8 | 9 | The code in this repository is organized to reflect the description in the Methods 10 | section of Galeano Nino et al., Effect of the intratumoral microbiota on spatial and cellular heterogeneity in cancer. 2022. 11 | ## 10X Visium Scans for CRC and OSCC samples 12 | 10X Visium Scans associated with manuscript submission are uploaded to AWS and Zenodo. 13 | 14 | Tiff files can be accessed via: https://fh-pi-bullman-s-eco-public.s3.us-west-2.amazonaws.com/DataTransfer/Galeano_Nino_et_al_visium_scans/CRC_OSCC_visium_tiff.tar.gz and https://doi.org/10.5281/zenodo.7419806 15 | 16 | Please note for sample `CRC_16`, the slide id is `V10S15-020` and area code is `D1`; for sample `OSCC_2`, the slide id is `V11A07-022` and area code is `A1`. 17 | 18 | We also uploaded fastq files to AWS for your convenience: https://fh-pi-bullman-s-eco-public.s3.us-west-2.amazonaws.com/DataTransfer/Galeano_Nino_et_al_visium_scans/CRC_OSCC_visium_fastq.tar.gz 19 | 20 | ## Environment and Reference Data 21 | 22 | ### Environment 23 | 24 | All of the analysis code documented in this repository was run on the shared computing cluster 25 | maintained at the Fred Hutchinson Cancer Research Center between May 2020 and August 2022. 26 | The software dependencies used by these scripts are provided using the EasyBuild installation 27 | maintained by the Fred Hutch Scientific Computing group. 28 | Those software dependencies are loaded into the environment with the `ml` command (e.g. `ml CellRanger/6.1.1`). 
29 | 30 | ### Reference Data 31 | 32 | Prior to running the analysis scripts, reference databases were downloaded for PathSeq (December 2020) 33 | and CellRanger (January 2022). 34 | The location of those reference databases is provided to the analysis scripts using the environment variables `pathseqdb` and `cellrangerdb`. 35 | 36 | # Overview of the Computational Pipeline for Bacteria-associated Spots/Cells Annotation 37 | 38 | ## Part 1: 10x Visium spatial transcriptomic data 39 | 1. Identification of microbial reads within 10x Visium spatial transcriptomic data generated by 10x Space Ranger Count (`Visium_pipeline.sh`) 40 | 2. Bioinformatic analysis of 10x Visium spatial transcriptomic data (`Visium.R`) 41 | 3. Summarize the numbers of bacterial reads and UMIs in 10X Visium data (`validate_and_count.py`). The folder used as output by the previous steps should be provided as an argument to the `Visium_pipeline.sh` script. 42 | ### Output Data: 43 | - `CRC_16.visium.raw_matrix.genus.csv` and `OSCC_2.visium.raw_matrix.genus.csv` contain the bacterial UMI count matrices that can be used as metadata when processing the Visium data 44 | - `CRC_16.visium.raw_matrix.validate.csv` and `OSCC_2.visium.raw_matrix.validate.csv` contain validation data that can be used as the input of `validate_and_count.py` 45 | 46 | ## Part 2: 10x Single cell data (For cell culture samples and patient samples) 47 | ### Input Data: 48 | - All of the input data for this analysis is provided in FASTQ format generated by the CellRanger `mkfastq` command 49 | - The folder containing those FASTQ files is set to the environment variable `raw_data_folder` 50 | ### Preprocess: 51 | 1. Identification of microbial reads within single-cell GEX libraries (`patient_samples_GEX_pipeline.sh` and `cell_culture_samples_GEX_pipeline.sh`) 52 | 2. INVADEseq bacterial 16S rRNA gene libraries (`patient_samples_16s_pipeline.sh` and `cell_culture_16s_pipeline.sh`).
The variable `gex_bam_path` should be set to the output folder from the `patient_samples_GEX_pipeline.sh` and `cell_culture_samples_GEX_pipeline.sh` scripts. 53 | 3. Combination and deduplication of microbial metadata from steps 1 & 2 (`merge_metadata.py` and `metadata_dedup.py`). The folder used as output by the previous steps should be provided as an argument to the `merge_metadata.py` script. 54 | ### Output Data: 55 | - `headneck_gex_16s_mix_dedup.csv`, `HT_29_gex_16s_mix_dedup.csv` and `HCT_116_gex_16s_mix_dedup.csv` contain the bacterial UMI count matrices that can be used as Seurat object metadata when processing the single-cell data. 56 | 57 | ### Processing of single cell data 58 | 1. Seurat data processing, Harmony integration, SingleR annotation and copyKAT prediction (`patient_samples_Seurat.r` and `cell_culture_Seurat.r`) 59 | 2. Differential expression analysis and GSEA (`DE.r`) 60 | -------------------------------------------------------------------------------- /cell_culture_samples/DE.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | library(clusterProfiler) 3 | library(org.Hs.eg.db) 4 | library(msigdbr) 5 | 6 | m_df <- msigdbr(species = "Homo sapiens") 7 | 8 | m_H <- msigdbr(species = "Homo sapiens", category = "H") %>% 9 | dplyr::select(gs_name, gene_symbol) 10 | 11 | 12 | DE_GSEA <- function(seurat_object, 13 | ident_1, 14 | ident_2, 15 | group_by, 16 | seurat_object.markers_filename, 17 | seurat_object.markers_filtered_filename, 18 | seurat_object.markers_gsea_filename){ 19 | seurat_object.markers <- FindMarkers(seurat_object, 20 | ident.1 = ident_1, 21 | ident.2 = ident_2, 22 | group.by = group_by, 23 | logfc.threshold = -Inf, 24 | min.pct = 0.1) 25 | 26 | write.csv(seurat_object.markers,seurat_object.markers_filename, row.names = TRUE) 27 | 28 | #seurat_object.markers = filter(seurat_object.markers, p_val_adj <= 0.05) 29 | seurat_object.markers=
def read_cell_names1(pathseq_bam_file, write_bac):
    """Write one TSV row per read of a BAM file that carries a ``YP`` tag.

    Each output row is ``query_name <TAB> YP tag value <TAB> mapping quality``.
    Reads without a ``YP`` tag are counted but not written.

    Args:
        pathseq_bam_file: Path of the BAM file to scan (presumably the
            PathSeq output BAM -- confirm against the calling pipeline).
        write_bac: Path of the TSV file to create; an existing file is
            overwritten.

    Returns:
        None. Two summary counts are printed to stdout.
    """
    total_pathseq_reads = 0
    total_YP_reads = 0
    # Both handles are closed on leaving the ``with`` block, so the TSV is
    # fully flushed before any later step opens it for reading.
    with pysam.AlignmentFile(pathseq_bam_file, "rb", threads=36) as seqbam, \
            open(write_bac, 'w') as read_name_pathseq:
        for each_line in seqbam:
            total_pathseq_reads += 1
            if each_line.has_tag('YP'):
                total_YP_reads += 1
                outline = each_line.query_name + '\t' + each_line.get_tag('YP') + '\t' + str(each_line.mapping_quality) + '\n'
                read_name_pathseq.write(outline)
    print('Total reads in pathseq bam = ', total_pathseq_reads)
    print('Total reads in pathseq bam with YP tag = ', total_YP_reads)
    return
def read_pathseq_report_and_create_dict(pathseq_report_csv):
    """Map the first column of a tab-separated report to a lineage name.

    For every row whose second column is a ``|``-separated lineage string,
    the lineage is walked from its last segment backwards. The first segment
    that either contains no underscore or contains ``unclassified`` becomes
    the name stored under the row's first column (presumably the taxonomy
    id -- confirm against the PathSeq score-file layout).

    Rows are skipped when they are blank, have fewer than two columns, have
    no ``|`` in the second column (e.g. a header row), or have no qualifying
    lineage segment. Skipping the last case avoids silently reusing the name
    found for the previous row.

    Args:
        pathseq_report_csv: Path of the tab-separated report.

    Returns:
        dict mapping first-column value (str) to the selected segment (str).
    """
    dict_for_genus = {}
    with open(pathseq_report_csv, 'r') as pathseq_report:
        for each_line in pathseq_report:
            each_line_list = each_line.rstrip('\n').split('\t')
            if len(each_line_list) < 2 or '|' not in each_line_list[1]:
                continue
            name = None
            # Walk from the most specific segment towards the root.
            for candidate in reversed(each_line_list[1].split('|')):
                if '_' not in candidate or 'unclassified' in candidate:
                    name = candidate
                    break
            if name is None:
                continue
            dict_for_genus[each_line_list[0]] = name
    print("len(dict_for_genus) = ", len(dict_for_genus))
    return dict_for_genus
total_potential_UMI_including_ambigious_reads = set() 77 | for each_line in seqbam: 78 | total_cellranger_bam_reads+=1 79 | if each_line.has_tag('CB') and each_line.has_tag('UB'): 80 | if each_line.get_tag('CB') in white_list_set: 81 | total_cellranger_reads_UB_CB_tags+=1 82 | if each_line.is_unmapped: 83 | total_cellranger_reads_UB_CB_unmap+=1 84 | # added 102721: output a fasta file for kraken 85 | query_name_in_cellranger_bam = each_line.query_name 86 | seq_in_cellranger_bam = each_line.query_sequence 87 | unmap_cbub_fasta.write('>') 88 | unmap_cbub_fasta.write(query_name_in_cellranger_bam) 89 | unmap_cbub_fasta.write('\n') 90 | unmap_cbub_fasta.write(seq_in_cellranger_bam) 91 | unmap_cbub_fasta.write('\n') 92 | unmap_cbub_bam.write(each_line) 93 | if each_line.query_name in set_of_readnames: 94 | set_for_infect_cells.add(each_line.get_tag('CB')) 95 | readname = each_line.query_name 96 | cellname = each_line.get_tag('CB') 97 | umi = each_line.get_tag('UB') 98 | path = dict_name[readname]["pathogen"] 99 | id_string_list = path.split(',') 100 | genus_list = [] 101 | for each_id in id_string_list: 102 | if each_id in dict_for_genus: 103 | genus = dict_for_genus[each_id] 104 | genus_list.append(genus) 105 | else: 106 | print(each_id," not found!") 107 | genus_list = list(set(genus_list)) 108 | genus_list.sort() 109 | genus_list_string = ','.join(genus_list) 110 | mapping_score = dict_name[readname]["mapping_score"] 111 | outline = readname+'\t'+cellname+'\t'+umi+'\t'+path+'\t'+mapping_score+'\t'+genus_list_string+'\n' 112 | readname_cell_path.write(outline) 113 | total_potential_UMI_including_ambigious_reads.add(umi) 114 | total_cellranger_reads_UB_CB_unmap_Aligned_to_Pathseq_YP_reads+=1 115 | print('total cellranger bam reads = ',total_cellranger_bam_reads) 116 | print('total cellranger bam reads with UB CB tags (in-cell) = ',total_cellranger_reads_UB_CB_tags) 117 | print('total UNMAPPED cellranger bam reads with UB CB tags (in-cell) = 
def generate_barcode_UMI_dict(out_readname_cell_path):
    """Keep the best-scoring record for every cell-barcode/UMI pair.

    The input is a tab-separated file with six columns per row: read name,
    cell barcode, UMI, comma-separated id string, mapping score and genus
    string. Rows are grouped by ``barcode + '+' + UMI``. Within a group the
    row with the highest integer mapping score wins; on a tie the first row
    seen is kept.

    Args:
        out_readname_cell_path: Path of the tab-separated input file.

    Returns:
        dict mapping ``"<barcode>+<UMI>"`` to a dict with the keys
        ``"id_string"`` (list of str), ``"mapping_score"`` (int) and
        ``"genus_string"`` (str).
    """
    barcode_UMI_dict = {}
    with open(out_readname_cell_path, 'r') as cell_path_file:
        for each_line in cell_path_file:
            each_line_list = each_line.rstrip('\n').split('\t')
            cell_barcode = each_line_list[1]
            UMI = each_line_list[2]
            id_string = each_line_list[3]
            mapping_score = int(each_line_list[4])
            genus_string = each_line_list[5]
            barcode_UMI = cell_barcode + '+' + UMI
            best = barcode_UMI_dict.get(barcode_UMI)
            # Strict '>' so that the first of several equally scored rows wins.
            if best is None or mapping_score > best["mapping_score"]:
                barcode_UMI_dict[barcode_UMI] = {
                    "id_string": id_string.split(','),
                    "mapping_score": mapping_score,
                    "genus_string": genus_string,
                }
    return barcode_UMI_dict
barcode_UMI_dict[barcode_UMI]["genus_string"]: 163 | UMI_id_dict[barcode_UMI] = barcode_UMI_dict[barcode_UMI]["id_string"] 164 | unambigious_UMI = {} 165 | for barcode_UMI in UMI_id_dict: 166 | id_list = UMI_id_dict[barcode_UMI] 167 | genus_list = [] 168 | for each_id in id_list: 169 | if each_id in dict_for_genus: 170 | genus = dict_for_genus[each_id] 171 | genus_list.append(genus) 172 | genus_list = list(set(genus_list)) 173 | if len(genus_list) == 1:#only keep unambigious UMI 174 | unambigious_UMI[barcode_UMI] = genus_list[0] 175 | print('Total unambigious UMI = ',len(unambigious_UMI)) 176 | cell_metadata_dict = {} 177 | for barcode_UMI in unambigious_UMI: 178 | barcode = barcode_UMI.split('+')[0] 179 | UMI = barcode_UMI.split('+')[1] 180 | genus = unambigious_UMI[barcode_UMI] 181 | 182 | if not barcode in cell_metadata_dict: 183 | cell_metadata_dict[barcode] = {} 184 | cell_metadata_dict[barcode]['genus'] = [] 185 | cell_metadata_dict[barcode]['genus'].append(genus) 186 | cell_metadata_dict[barcode]['barcode_UMI']={} 187 | cell_metadata_dict[barcode]['barcode_UMI'][barcode_UMI] = genus 188 | cell_metadata_dict[barcode]['pathogen_count']={} 189 | else: 190 | cell_metadata_dict[barcode]['genus'].append(genus) 191 | cell_metadata_dict[barcode]['barcode_UMI'][barcode_UMI] = genus 192 | 193 | if not genus in cell_metadata_dict[barcode]['pathogen_count']: 194 | cell_metadata_dict[barcode]['pathogen_count'][genus] = 1 195 | else: 196 | cell_metadata_dict[barcode]['pathogen_count'][genus] += 1 197 | return cell_metadata_dict 198 | 199 | def output_cell_metadata(cell_metadata_dict,out_genus_file,sample_ident,barcode_whitelist_file): 200 | print('total pathogen-associated gems = ', len(cell_metadata_dict)) 201 | white_list_set = set() 202 | white_list_dict = {} 203 | white_list = gzip.open(barcode_whitelist_file, 'rt') 204 | for each_line in white_list: 205 | each_line = each_line.rstrip('\n') 206 | white_list_set.add(each_line) 207 | for barcode in cell_metadata_dict: 
208 | if barcode in white_list_set: 209 | white_list_dict[barcode]= cell_metadata_dict[barcode] 210 | cell_metadata_dict = white_list_dict 211 | print("total filtered pathogen-associated cells = ", len(cell_metadata_dict)) 212 | genus_file = open(out_genus_file,'w') 213 | header = 'cell_name,pathogen,UMI_count,pathogen_count\n' 214 | genus_file.write(header) 215 | 216 | for barcode in cell_metadata_dict: 217 | if not sample_ident == '': 218 | cell_name = sample_ident+'_'+barcode 219 | else: 220 | cell_name = barcode 221 | genus_list = [] 222 | for barcode_UMI in cell_metadata_dict[barcode]['barcode_UMI']: 223 | genus_list.append(cell_metadata_dict[barcode]['barcode_UMI'][barcode_UMI]) 224 | sorted_genus_list = list(set(genus_list)) 225 | sorted_genus_list.sort() 226 | genus = '+'.join(sorted_genus_list) 227 | UMI_count = len(cell_metadata_dict[barcode]['barcode_UMI']) 228 | pathogen_count_list = [] 229 | for each_pathogen in cell_metadata_dict[barcode]['pathogen_count']: 230 | pathogen_count=each_pathogen 231 | pathogen_count+=':' 232 | pathogen_count+=str(cell_metadata_dict[barcode]['pathogen_count'][each_pathogen]) 233 | pathogen_count_list.append(pathogen_count) 234 | pathogen_count_list.sort() 235 | pathogen_count_str = ';'.join(pathogen_count_list) 236 | 237 | Periority_pathogen = 'Fusobacterium' 238 | pathogen_count_mini_dict = cell_metadata_dict[barcode]['pathogen_count'] 239 | temp_max_list = [] 240 | UMI_count_sum = 0 241 | max_count = max(pathogen_count_mini_dict.values()) 242 | for key,value in pathogen_count_mini_dict.items(): 243 | if value == max_count: 244 | temp_max_list.append(key) 245 | max_UMI = value 246 | UMI_count_sum += value 247 | 248 | UMI_count = UMI_count_sum 249 | if len(set(temp_max_list)) > 1: 250 | genus = 'MULTI' 251 | UMI_count = UMI_count_sum 252 | else: 253 | genus = temp_max_list[0] 254 | UMI_count = max_UMI 255 | output_line = ','.join([cell_name,genus,str(UMI_count),pathogen_count_str])+'\n' 256 | if UMI_count >= 1: 257 | 
def UMI_table_output(cell_metadata_dict,barcode_whitelist_file,sample_ident,output_UMI_table_csv,output_UMI_validate_table_csv):
    """Write the per-cell genus count table and the per-UMI validation table.

    Only barcodes listed in the gzip-compressed whitelist are kept.

    * The validation table has one ``<barcode+UMI>,<genus>`` row per entry of
      each kept cell's ``'barcode_UMI'`` mapping. It has no header and the
      sample identifier is not added.
    * The count table has a ``barcode`` column followed by one column per
      genus, sorted alphabetically. Each kept cell gets one row holding its
      ``'pathogen_count'`` values, with 0 where the genus is absent for that
      cell.

    Args:
        cell_metadata_dict: dict keyed by cell barcode; every value must
            provide the ``'barcode_UMI'`` and ``'pathogen_count'`` mappings.
        barcode_whitelist_file: Path of a gzip text file with one barcode
            per line.
        sample_ident: Prefix joined to each barcode with ``_`` in the count
            table; an empty string leaves barcodes unprefixed.
        output_UMI_table_csv: Path of the count table to create.
        output_UMI_validate_table_csv: Path of the validation table to create.

    Returns:
        None.
    """
    with gzip.open(barcode_whitelist_file, 'rt') as white_list:
        white_list_set = {each_line.rstrip('\n') for each_line in white_list}
    print("total number of cells = ", len(white_list_set))

    # Rebinds the local name only; the caller's dict is left untouched.
    cell_metadata_dict = {
        barcode: metadata
        for barcode, metadata in cell_metadata_dict.items()
        if barcode in white_list_set
    }

    with open(output_UMI_validate_table_csv, 'w') as output_UMI_validate_table:
        for metadata in cell_metadata_dict.values():
            for barcode_UMI, pathogen in metadata['barcode_UMI'].items():
                output_UMI_validate_table.write(barcode_UMI + ',' + pathogen + '\n')

    genera_list = sorted({
        pathogen
        for metadata in cell_metadata_dict.values()
        for pathogen in metadata['pathogen_count']
    })

    with open(output_UMI_table_csv, 'w') as output_UMI_table:
        output_UMI_table.write(','.join(['barcode'] + genera_list))
        output_UMI_table.write('\n')
        for barcode, metadata in cell_metadata_dict.items():
            if not sample_ident == '':
                cell_name = sample_ident + '_' + barcode
            else:
                cell_name = barcode
            pathogen_count = metadata['pathogen_count']
            genera_count_list = [str(pathogen_count.get(each_genus, 0)) for each_genus in genera_list]
            output_UMI_table.write(','.join([cell_name] + genera_count_list))
            output_UMI_table.write('\n')
    return
GATK/4.1.3.0-GCCcore-8.3.0-Java-1.8 11 | 12 | ml Python 13 | ml Pysam 14 | 15 | `raw_data_folder` # the folder containing Cellranger mkfastq output folders 16 | `root` # working directory 17 | `pathseqdb` # Pathseq database 18 | `cellrangerdb` # Cellranger database 19 | `gex_bam_path` # barcodes.tsv.gz from GEX is used as a 'whitelist' for real cells 20 | 21 | root=${workdir} 22 | # Run cellranger count 23 | cd ${workdir} 24 | mkdir cellranger_count 25 | cd cellranger_count 26 | for folder in ${raw_data_folder}/* 27 | do 28 | folder_name=${folder##*/} 29 | path=${folder} 30 | cellranger count \ 31 | --id=${folder_name} \ 32 | --transcriptome=${cellrangerdb} \ 33 | --fastqs=${path} \ 34 | --sample=${folder_name} 35 | done 36 | 37 | cd ${workdir} 38 | mkdir split_reads 39 | cd split_reads 40 | 41 | # convert cellranger bam file to fastqs 42 | for folder in ${workdir}/cellranger_count/* 43 | do 44 | folder_name=${folder##*/} 45 | file=${folder}/outs/possorted_genome_bam.bam 46 | echo ${file} 47 | 48 | samplename=${folder_name} 49 | bedtools bamtofastq -i ${folder}/outs/possorted_genome_bam.bam \ 50 | -fq ${samplename}.r1.fq \ 51 | -fq2 ${samplename}.r2.fq 52 | done 53 | 54 | # run fastqc before trimmomatic 55 | mkdir ${root}/preqc 56 | fastqc \ 57 | -o ${root}/preqc \ 58 | ${root}/split_reads/*.fq 59 | 60 | # run trimmomatic on R1 61 | cd ${root}/split_reads 62 | mkdir trim 63 | for str in *r1.fq 64 | do 65 | # adjust -threads to number of cores you would like to use 66 | java -jar $EBROOTTRIMMOMATIC/trimmomatic-0.39.jar SE \ 67 | -threads 36 \ 68 | ${str} \ 69 | trim/${str}.SE_trim.fq \ 70 | ILLUMINACLIP:$EBROOTTRIMMOMATIC/adapters/TruSeq3-PE-2.fa:2:30:10 \ 71 | LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36 HEADCROP:15 72 | done 73 | 74 | # run fastqc after trimmomatic 75 | cd trim 76 | mkdir ${root}/postqc 77 | fastqc \ 78 | -o ${root}/postqc \ 79 | *.SE_trim.fq 80 | 81 | mkdir ${workdir}/ubams_r1 82 | cd ${root}/split_reads/trim 83 | 84 | # convert R1 to ubam 
file in order to run Pathseq 85 | for file in *SE_trim.fq 86 | do 87 | java -Xmx700G -jar $EBROOTPICARD/picard.jar FastqToSam \ 88 | FASTQ=${file} \ 89 | OUTPUT=${file}.bam \ 90 | READ_GROUP_NAME=16s \ 91 | SAMPLE_NAME=16s 92 | 93 | # move and rename generated ubam files 94 | mv ${file}.bam ${workdir}/ubams_r1 95 | done 96 | 97 | ubam_folder=${workdir}/ubams_r1 98 | outpath=${workdir}/pathseq_r1 99 | mkdir ${outpath} 100 | 101 | cd ${ubam_folder} 102 | 103 | # Pathseq to identify pathogen-associated cells 104 | for each_file in *.bam 105 | do 106 | echo ${each_file} 107 | filename="${each_file%.*}" 108 | filename="${filename%.*}" 109 | filename="${filename%.*}" 110 | samplename=${filename} 111 | echo ${samplename} 112 | # PathSeq process # Please adjust "-Xmx750g" based on the memory you want to use. Adjust --min-score-identity and --min-clipped-read-length based on your samples 113 | gatk --java-options "-Xmx750g" PathSeqPipelineSpark \ 114 | --input ${each_file} \ 115 | --filter-bwa-image ${pathseqdb}/pathseq_host.fa.img \ 116 | --kmer-file ${pathseqdb}/pathseq_host.bfi \ 117 | --min-clipped-read-length 60 \ 118 | --microbe-fasta ${pathseqdb}/pathseq_microbe.fa \ 119 | --microbe-bwa-image ${pathseqdb}/pathseq_microbe.fa.img \ 120 | --taxonomy-file ${pathseqdb}/pathseq_taxonomy.db \ 121 | --output ${outpath}/${samplename}.pathseq.complete.bam \ 122 | --scores-output ${outpath}/${samplename}.pathseq.complete.txt.csv \ 123 | --is-host-aligned false \ 124 | --filter-duplicates false \ 125 | --min-score-identity .7 126 | done 127 | 128 | # Python script to produce a bacteria UMI matrix (based on valid GEX cell) 129 | bam_path=${workdir}/cellranger_count 130 | pathseq_path=${workdir}/pathseq_r1 131 | out_path=${root}/python 132 | mkdir ${out_path} 133 | cd ${bam_path} 134 | # barcodes.tsv.gz from GEX is used as a 'whitelist' for real cells. 
#!/usr/bin/env Rscript
# Seurat workflow for the three cell culture GEX samples: load the CellRanger
# matrices, merge, normalize, run PCA, integrate with Harmony, cluster, and
# attach the bacterial UMI table as cell metadata.
library(harmony)
library(Seurat)
library(ggplot2)
library(SingleR)
library(celldex)
library(msigdbr)
library(cowplot)
library(dplyr)
hpca.se <- celldex::HumanPrimaryCellAtlasData()
library("enrichplot")
library(ggupset)
library(gridExtra)
library(pheatmap)
options(bitmapType = 'cairo')
knitr::opts_chunk$set(dev="CairoPNG")
library(org.Hs.eg.db)

# Load each sample and record its mitochondrial read percentage.
sample_7_MOI_500_data = Read10X(data.dir = "/raw_data/cellranger/count/7_MOI_500_GEX/outs/filtered_feature_bc_matrix")
sample_7_MOI_500 = CreateSeuratObject(counts = sample_7_MOI_500_data, project = "Sample_7_MOI_500", min.cells = 3, min.features = 200)
sample_7_MOI_500[["percent.mt"]] <- PercentageFeatureSet(sample_7_MOI_500, pattern = "^MT-")

sample_6_MOI_100_data = Read10X(data.dir = "/raw_data/cellranger/count/6_MOI_100_GEX/outs/filtered_feature_bc_matrix")
sample_6_MOI_100 = CreateSeuratObject(counts = sample_6_MOI_100_data, project = "Sample_6_MOI_100", min.cells = 3, min.features = 200)
sample_6_MOI_100[["percent.mt"]] <- PercentageFeatureSet(sample_6_MOI_100, pattern = "^MT-")

sample_5_HCT_116_data = Read10X(data.dir = "/raw_data/cellranger/count/5_HCT_116_GEX/outs/filtered_feature_bc_matrix")
sample_5_HCT_116 = CreateSeuratObject(counts = sample_5_HCT_116_data, project = "Sample_5_HCT_116", min.cells = 3, min.features = 200)
sample_5_HCT_116[["percent.mt"]] <- PercentageFeatureSet(sample_5_HCT_116, pattern = "^MT-")

# The cell-id prefixes must match the sample identifiers used as barcode
# prefixes in the bacterial UMI table added at the end of this script.
sample.combine <- merge(sample_5_HCT_116, y = c(sample_6_MOI_100,sample_7_MOI_500), add.cell.ids = c("5_HCT_116_GEX","6_MOI_100_GEX",'7_MOI_500_GEX'), project = "SAMPLE.INTEGRATED")

VlnPlot(sample.combine, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
sample.combine <- NormalizeData(object = sample.combine, normalization.method = "LogNormalize", scale.factor = 10000)
GetAssay(sample.combine,assay = "RNA")
sample.combine <- FindVariableFeatures(object = sample.combine, selection.method = "vst", nfeatures = 5000)
top20 <- head(x = VariableFeatures(object = sample.combine), 20)
plot1 <- VariableFeaturePlot(object = sample.combine)
# Label the top variable features on plot1 (the original passed the
# not-yet-defined plot2 to LabelPoints).
plot2 <- LabelPoints(plot = plot1, points = top20, repel = TRUE)
plot1+plot2
all.genes <- rownames(sample.combine)
sample.combine<- ScaleData(object = sample.combine,features = all.genes)
# 'features' is the current argument name; 'pc.genes' is the Seurat v2 name.
sample.combine <- RunPCA(object = sample.combine,features = VariableFeatures(sample.combine))
ElbowPlot(sample.combine)

# Harmony integration across samples, then UMAP and clustering on the
# Harmony embedding.
seuratObj <- RunHarmony(sample.combine, group.by.vars="orig.ident",assay.use='RNA')
names(seuratObj@reductions)
seuratObj <- RunUMAP(seuratObj, dims = 1:20,
                     reduction = "harmony",seed.use=111)
sce=seuratObj
sce <- FindNeighbors(sce, reduction = "harmony",dims = 1:20)
sce <- FindClusters(sce, resolution = 0.5)
seuratObj=sce
sample.combine = seuratObj

# add pathogen UMI metadata
umi_table_csv = 'csv_novami_mix_dedup.csv'
umi_table<-read.csv(umi_table_csv,sep=',',header=TRUE,row.names = 1)
umi_table[is.na(umi_table)] <- 0
umi_table$Total <- rowSums(umi_table)
# Zero counts become NA in the metadata.
umi_table[umi_table==0] <- NA
# Attach to the object built above (the original referenced sample.headneck,
# which is never defined in this script).
sample.combine<-AddMetaData(sample.combine, umi_table)
Adjust --min-score-identity and --min-clipped-read-length based on your samples 36 | # 37 | for folder in * 38 | do 39 | folder_name=${folder##*/} 40 | file=${folder}/outs/possorted_genome_bam.bam 41 | samplename=${folder_name} 42 | echo ${samplename} 43 | gatk --java-options "-Xmx750g" PathSeqPipelineSpark \ 44 | --input ${file} \ 45 | --filter-bwa-image ${pathseqdb}/pathseq_host.fa.img \ 46 | --kmer-file ${pathseqdb}/pathseq_host.bfi \ 47 | --min-clipped-read-length 60 \ 48 | --microbe-fasta ${pathseqdb}/pathseq_microbe.fa \ 49 | --microbe-bwa-image ${pathseqdb}/pathseq_microbe.fa.img \ 50 | --taxonomy-file ${pathseqdb}/pathseq_taxonomy.db \ 51 | --output ${outpath}/${samplename}.pathseq.complete.bam \ 52 | --scores-output ${outpath}/${samplename}.pathseq.complete.csv \ 53 | --is-host-aligned false \ 54 | --filter-duplicates false \ 55 | --min-score-identity .7 56 | done 57 | 58 | # Python script to generate bacteria matrix 59 | bam_path=${root}/cellranger_count 60 | pathseq_path=${root}/pathseq 61 | out_path=${root}/python 62 | mkdir ${out_path} 63 | cd ${bam_path} 64 | 65 | for each_sample in * 66 | do 67 | echo ${each_sample} 68 | python INVADEseq.py \ 69 | ${bam_path}/${each_sample}/outs/possorted_genome_bam.bam \ 70 | ${each_sample} \ 71 | ${bam_path}/${each_sample}/outs/filtered_feature_bc_matrix/barcodes.tsv.gz \ 72 | ${pathseq_path}/${each_sample}.pathseq.complete.bam \ 73 | ${pathseq_path}/${each_sample}.pathseq.complete.csv \ 74 | ${out_path}/${each_sample}.gex.filtered_matrix.readname \ 75 | ${out_path}/${each_sample}.gex.filtered_matrix.unmap_cbub.bam \ 76 | ${out_path}/${each_sample}.gex.filtered_matrix.unmap_cbub.fasta \ 77 | ${out_path}/${each_sample}.gex.filtered_matrix.list \ 78 | ${out_path}/${each_sample}.gex.raw.filtered_matrix.readnamepath \ 79 | ${out_path}/${each_sample}.gex.filtered_matrix.genus.cell \ 80 | ${out_path}/${each_sample}.gex.filtered_matrix.genus.csv \ 81 | ${out_path}/${each_sample}.gex.filtered_matrix.validate.csv 82 | done 
83 | 84 | -------------------------------------------------------------------------------- /cell_culture_samples/merge_metadata.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pandas as pd 3 | import numpy as np 4 | import os 5 | import sys 6 | pd.set_option('precision', 0) 7 | 8 | def merge_cellsmeta2(df1,df2): 9 | df_merged = pd.concat([df1, df2], sort=False) 10 | df_merged = df_merged.round() 11 | return df_merged 12 | 13 | def feed_csvs(path):#this will return a list of csvs in your path 14 | file_list = os.listdir(path) 15 | csv_list = [] 16 | for each_file in file_list: 17 | if each_file.endswith('genus.csv'): 18 | #if each_file.endswith('merged.csv'): 19 | csv_list.append(path+'/'+each_file) 20 | return csv_list 21 | 22 | if __name__ == "__main__": 23 | path = sys.argv[1] 24 | csv_merged = path+'/csv_novami.csv' 25 | csv_list = feed_csvs(path) 26 | 27 | csv1 = csv_list[0] 28 | df1 = pd.read_csv(csv1,header = 0,sep = ',') 29 | 30 | for each_csv in csv_list[1:]: 31 | print(each_csv) 32 | df2 = pd.read_csv(each_csv,header = 0,sep = ',') 33 | df1 = merge_cellsmeta2(df1,df2) 34 | 35 | #print(df1) 36 | df1 = df1.fillna(0) 37 | df1.to_csv(csv_merged,sep=',',index=False) 38 | 39 | -------------------------------------------------------------------------------- /cell_culture_samples/metadata_dedup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pandas as pd 3 | from collections import Counter 4 | import os 5 | import sys 6 | """ 7 | # The purpose of this script is deduplication of the merged metadata. Since there are replicated cell names from GEX libraries and 16s libraries, it is necessary to add UMI count from both techniques together into unique cell names. 
8 | # usage: 9 | # python metadata_dedup.py \ 10 | GEX_pathogen_UMI_matrix_output_folder(with validation csvs) \ 11 | 16s_pathogen_UMI_matrix_output_folder(with validation csvs) \ 12 | Merged_csv_matrix_from_previous_step \ 13 | Dedup_csv_matrix 14 | 15 | # Note: Merged_csv_matrix_from_previous_step is a csv file conting 3 cell culture samples from our study 16 | # otherwise please modify the sample names in the script. 17 | """ 18 | 19 | def read_and_mkdic(validate_csv_file, sample_name): 20 | UMI_bac_list = [] 21 | validate_csv = open(validate_csv_file,'r') 22 | for each_line in validate_csv: 23 | each_line = each_line.rstrip('\n') 24 | each_line = sample_name+'_'+each_line 25 | UMI_bac_list.append(each_line) 26 | return UMI_bac_list 27 | 28 | #add lists together 29 | # then count 30 | def count_elements(merged_UMI_bac_list): 31 | count_dict = Counter(merged_UMI_bac_list) 32 | return count_dict 33 | 34 | #add dics together: ndic = list(dict(dic0.items()) + list(dic1.items())) 35 | #generate a dataframe, that will be a minux matrix 36 | #df.values[rows, cols] = np.nan 37 | def minux_matrix(count_dict,nova_mi_merged_csv_file,nova_mi_merged_csv_file_dedup): 38 | nova_mi_merged = pd.read_csv(nova_mi_merged_csv_file,header = 0,sep = ',',index_col='barcode') 39 | print('before merge = ',len(nova_mi_merged)) 40 | nova_mi_merged = nova_mi_merged.groupby(nova_mi_merged.index).sum() 41 | print('after merge = ',len(nova_mi_merged)) 42 | n=0 43 | rowNamesArr = list(nova_mi_merged.index.values) 44 | #print(rowNamesArr[:10]) 45 | columnsNamesArr = list(nova_mi_merged.columns.values) 46 | for each_cell_UMI in count_dict: 47 | #print(each_cell_UMI) 48 | n+=1 49 | if n%1000 == 0: 50 | print('now working on: ',n/len(count_dict)*100,'%') 51 | cell = each_cell_UMI.split('+')[0] 52 | pathogen = each_cell_UMI.split(',')[1] 53 | count = count_dict[each_cell_UMI]-1 54 | colindex = columnsNamesArr.index(pathogen) 55 | rowindex = rowNamesArr.index(cell) 56 | prev = 
int(nova_mi_merged.loc[cell,pathogen]) 57 | nova_mi_merged.loc[cell,pathogen]=prev-count 58 | after = int(nova_mi_merged.loc[cell,pathogen]) 59 | nova_mi_merged.to_csv(nova_mi_merged_csv_file_dedup,sep=',',index=True) 60 | return 61 | 62 | 63 | if __name__ == "__main__": 64 | UMI_bac_list=[] 65 | 66 | validate_csv_file_nova = [ 67 | '5_HCT_116', 68 | '6_MOI_100', 69 | '7_MOI_500' 70 | ] 71 | sample_name_nova = [ 72 | '5_HCT_116', 73 | '6_MOI_100', 74 | '7_MOI_500' 75 | ] 76 | 77 | validate_csv_file_mi = [ 78 | '5_HCT_116', 79 | '6_MOI_100', 80 | '7_MOI_500' 81 | ] 82 | sample_name_mi = [ 83 | '5_HCT_116', 84 | '6_MOI_100', 85 | '7_MOI_500' 86 | ] 87 | 88 | path_nova = argv[1] 89 | path_mi = argv[2] 90 | nova_mi_merged_csv_file = argv[3] 91 | nova_mi_merged_csv_file_dedup = argv[4] 92 | 93 | #also modified the cell names in sub merged csvs! 94 | 95 | for n in range(0,len(validate_csv_file_nova)): 96 | UMI_bac_list = UMI_bac_list+read_and_mkdic(path_nova + '/' + validate_csv_file_nova[n]+'.gex.filtered_matrix.validate.csv', 'sample_'+sample_name_nova[n]) 97 | 98 | for n in range(0,len(validate_csv_file_mi)): 99 | UMI_bac_list = UMI_bac_list+read_and_mkdic(path_mi + '/' + validate_csv_file_mi[n]+'.16s.filtered_matrix.validate.csv', 'sample_'+sample_name_mi[n]) 100 | 101 | count_dict = count_elements(UMI_bac_list) 102 | minux_matrix(count_dict,nova_mi_merged_csv_file,nova_mi_merged_csv_file_dedup) 103 | 104 | 105 | -------------------------------------------------------------------------------- /patient_samples/DE.r: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | library(clusterProfiler) 3 | library(org.Hs.eg.db) 4 | library(msigdbr) 5 | 6 | m_df <- msigdbr(species = "Homo sapiens") 7 | 8 | m_H <- msigdbr(species = "Homo sapiens", category = "H") %>% 9 | dplyr::select(gs_name, gene_symbol) 10 | 11 | 12 | DE_GSEA <- function(seurat_object, 13 | ident_1, 14 | ident_2, 15 | group_by, 16 | 
seurat_object.markers_filename, 17 | seurat_object.markers_filtered_filename, 18 | seurat_object.markers_gsea_filename){ 19 | seurat_object.markers <- FindMarkers(seurat_object, 20 | ident.1 = ident_1, 21 | ident.2 = ident_2, 22 | group.by = group_by, 23 | logfc.threshold = -Inf, 24 | min.pct = 0.1) 25 | 26 | write.csv(seurat_object.markers,seurat_object.markers_filename, row.names = TRUE) 27 | 28 | #seurat_object.markers = filter(seurat_object.markers, p_val_adj <= 0.05) 29 | seurat_object.markers= seurat_object.markers[order(-seurat_object.markers$avg_log2FC),] 30 | seurat_object.markers_filename = seurat_object.markers_filtered_filename 31 | write.csv(seurat_object.markers,seurat_object.markers_filename, row.names = TRUE) 32 | 33 | markers_seurat_object <- seurat_object.markers[,c("avg_log2FC")] 34 | names(markers_seurat_object) = as.character(rownames(seurat_object.markers)) 35 | markers_seurat_object = sort(markers_seurat_object, decreasing = TRUE) 36 | length(markers_seurat_object) 37 | 38 | markers_seurat_object.em2 <- GSEA(markers_seurat_object, 39 | TERM2GENE = m_H, 40 | eps=0.0, 41 | by = "fgsea") 42 | 43 | write.csv(markers_seurat_object.em2,seurat_object.markers_gsea_filename, row.names = FALSE) 44 | } 45 | 46 | -------------------------------------------------------------------------------- /patient_samples/INVADEseq.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pysam 3 | import sys 4 | import gzip 5 | 6 | def read_cell_names1(pathseq_bam_file, write_bac): 7 | seqbam = pysam.AlignmentFile(pathseq_bam_file, "rb",threads=36) 8 | read_name_pathseq = open(write_bac,'w') 9 | total_pathseq_reads=0 10 | total_YP_reads=0 11 | for each_line in seqbam: 12 | total_pathseq_reads+=1 13 | if each_line.has_tag('YP'): 14 | total_YP_reads+=1 15 | outline = each_line.query_name + '\t' + each_line.get_tag('YP') + '\t' + str(each_line.get_tag('AS')) + '\n' 16 | read_name_pathseq.write(outline) 17 
| print('Total reads in pathseq bam = ',total_pathseq_reads) 18 | print('Total reads in pathseq bam with YP tag = ',total_YP_reads) 19 | return 20 | 21 | def read_readnames(readname_file): 22 | set_for_readnames = set() 23 | dict_name = {} 24 | with open (readname_file,'r') as r: 25 | for each_line in r: 26 | each_line = each_line.rstrip('\n') 27 | each_line_list = each_line.split('\t') 28 | set_for_readnames.add(each_line_list[0]) 29 | dict_name[each_line_list[0]] = {} 30 | dict_name[each_line_list[0]]["pathogen"] = each_line_list[1] 31 | dict_name[each_line_list[0]]["mapping_score"] = each_line_list[2] 32 | return set_for_readnames, dict_name 33 | 34 | def read_pathseq_report_and_create_dict(pathseq_report_csv): 35 | pathseq_report = open(pathseq_report_csv,'r') 36 | dict_for_genus = {} 37 | set_for_genera = set() 38 | for each_line in pathseq_report: 39 | each_line = each_line.rstrip('\n') 40 | each_line_list = each_line.split('\t') 41 | level = each_line_list[2] 42 | tax = each_line_list[3] 43 | if level == 'genus': 44 | set_for_genera.add(tax) 45 | if '|' in each_line_list[1]: 46 | name_string_list = each_line_list[1].split('|') 47 | for n in range(len(name_string_list)): 48 | pointer = -n-1 49 | if not '_' in name_string_list[pointer]: 50 | name = name_string_list[pointer] 51 | break 52 | if 'unclassified' in name_string_list[pointer]: 53 | name = name_string_list[pointer] 54 | break 55 | id = each_line_list[0] 56 | dict_for_genus[id] = name 57 | print ("len(dict_for_genus) = ",len(dict_for_genus)) 58 | return dict_for_genus 59 | def read_cell_names2(set_of_readnames, dict_name, dict_for_genus,original_bam_file,unmap_cbub_bam_file,unmap_cbub_fasta_file, out_cell_list,out_readname_cell_path,barcode_whitelist_file): 60 | white_list_set = set() 61 | white_list = gzip.open(barcode_whitelist_file, 'rt') 62 | for each_line in white_list: 63 | each_line = each_line.rstrip('\n') 64 | white_list_set.add(each_line) 65 | 66 | seqbam = 
pysam.AlignmentFile(original_bam_file, "rb",threads=36) 67 | readname_cell_path = open(out_readname_cell_path,'w') 68 | unmap_cbub_fasta = open(unmap_cbub_fasta_file,'w') 69 | unmap_cbub_bam = pysam.AlignmentFile(unmap_cbub_bam_file, "wb", seqbam) 70 | 71 | set_for_infect_cells=set() 72 | total_cellranger_bam_reads = 0 73 | total_cellranger_reads_UB_CB_tags = 0 74 | total_cellranger_reads_UB_CB_unmap = 0 75 | total_cellranger_reads_UB_CB_unmap_Aligned_to_Pathseq_YP_reads = 0 76 | total_potential_UMI_including_ambigious_reads = set() 77 | for each_line in seqbam: 78 | total_cellranger_bam_reads+=1 79 | if each_line.has_tag('CB') and each_line.has_tag('UB'): 80 | if each_line.get_tag('CB') in white_list_set: 81 | total_cellranger_reads_UB_CB_tags+=1 82 | if each_line.is_unmapped: 83 | total_cellranger_reads_UB_CB_unmap+=1 84 | # added 102721: output a fasta file for kraken 85 | query_name_in_cellranger_bam = each_line.query_name 86 | seq_in_cellranger_bam = each_line.query_sequence 87 | unmap_cbub_fasta.write('>') 88 | unmap_cbub_fasta.write(query_name_in_cellranger_bam) 89 | unmap_cbub_fasta.write('\n') 90 | unmap_cbub_fasta.write(seq_in_cellranger_bam) 91 | unmap_cbub_fasta.write('\n') 92 | unmap_cbub_bam.write(each_line) 93 | if each_line.query_name in set_of_readnames: 94 | set_for_infect_cells.add(each_line.get_tag('CB')) 95 | readname = each_line.query_name 96 | cellname = each_line.get_tag('CB') 97 | umi = each_line.get_tag('UB') 98 | path = dict_name[readname]["pathogen"] 99 | id_string_list = path.split(',') 100 | genus_list = [] 101 | for each_id in id_string_list: 102 | if each_id in dict_for_genus: 103 | genus = dict_for_genus[each_id] 104 | genus_list.append(genus) 105 | else: 106 | print(each_id," not found!") 107 | genus_list = list(set(genus_list)) 108 | genus_list.sort() 109 | genus_list_string = ','.join(genus_list) 110 | mapping_score = dict_name[readname]["mapping_score"] 111 | outline = 
readname+'\t'+cellname+'\t'+umi+'\t'+path+'\t'+mapping_score+'\t'+genus_list_string+'\n' 112 | readname_cell_path.write(outline) 113 | total_potential_UMI_including_ambigious_reads.add(umi) 114 | total_cellranger_reads_UB_CB_unmap_Aligned_to_Pathseq_YP_reads+=1 115 | print('total cellranger bam reads = ',total_cellranger_bam_reads) 116 | print('total cellranger bam reads with UB CB tags (in-cell) = ',total_cellranger_reads_UB_CB_tags) 117 | print('total UNMAPPED cellranger bam reads with UB CB tags (in-cell) = ',total_cellranger_reads_UB_CB_unmap) 118 | print('total cellranger reads with UB_CB_unmap Aligned to Pathseq reads with YP tags = (in-cell)',total_cellranger_reads_UB_CB_unmap_Aligned_to_Pathseq_YP_reads) 119 | cell_list = open(out_cell_list,'w') 120 | for each_cell in set_for_infect_cells: 121 | cell_list.write(each_cell) 122 | cell_list.write('\n') 123 | return 124 | 125 | 126 | def generate_barcode_UMI_dict(out_readname_cell_path): 127 | cell_path_file = open(out_readname_cell_path,'r') 128 | barcode_UMI_dict = {} 129 | for each_line in cell_path_file: 130 | each_line = each_line.rstrip('\n') 131 | each_line_list = each_line.split('\t') 132 | read_name = each_line_list[0] 133 | cell_barcode = each_line_list[1] 134 | UMI = each_line_list[2] 135 | id_string = each_line_list[3] 136 | id_string_list = id_string.split(',') 137 | barcode_UMI = cell_barcode+'+'+UMI 138 | mapping_score = each_line_list[4] 139 | genus_string = each_line_list[5] 140 | if not barcode_UMI in barcode_UMI_dict: 141 | barcode_UMI_dict[barcode_UMI]={} 142 | barcode_UMI_dict[barcode_UMI]["id_string"] = id_string_list 143 | barcode_UMI_dict[barcode_UMI]["mapping_score"] = int(mapping_score) 144 | barcode_UMI_dict[barcode_UMI]["genus_string"] = genus_string 145 | elif int(mapping_score) > barcode_UMI_dict[barcode_UMI]["mapping_score"]: 146 | barcode_UMI_dict[barcode_UMI]["id_string"] = id_string_list 147 | barcode_UMI_dict[barcode_UMI]["mapping_score"] = int(mapping_score) 148 | 
barcode_UMI_dict[barcode_UMI]["genus_string"] = genus_string 149 | return barcode_UMI_dict 150 | 151 | def output_cells_genus_list(barcode_UMI_dict,dict_for_genus): 152 | cells_dict = {} 153 | for barcode_UMI in barcode_UMI_dict: 154 | cell = barcode_UMI.split('+')[0] 155 | if not cell in cells_dict: 156 | cells_dict[cell]=[] 157 | cells_dict[cell].append(barcode_UMI) 158 | else: 159 | cells_dict[cell].append(barcode_UMI) 160 | UMI_id_dict = {} 161 | for barcode_UMI in barcode_UMI_dict: 162 | if not ',' in barcode_UMI_dict[barcode_UMI]["genus_string"]: 163 | UMI_id_dict[barcode_UMI] = barcode_UMI_dict[barcode_UMI]["id_string"] 164 | unambigious_UMI = {} 165 | for barcode_UMI in UMI_id_dict: 166 | id_list = UMI_id_dict[barcode_UMI] 167 | genus_list = [] 168 | for each_id in id_list: 169 | if each_id in dict_for_genus: 170 | genus = dict_for_genus[each_id] 171 | genus_list.append(genus) 172 | genus_list = list(set(genus_list)) 173 | if len(genus_list) == 1:#only keep unambigious UMI 174 | unambigious_UMI[barcode_UMI] = genus_list[0] 175 | print('Total unambigious UMI = ',len(unambigious_UMI)) 176 | cell_metadata_dict = {} 177 | for barcode_UMI in unambigious_UMI: 178 | barcode = barcode_UMI.split('+')[0] 179 | UMI = barcode_UMI.split('+')[1] 180 | genus = unambigious_UMI[barcode_UMI] 181 | 182 | if not barcode in cell_metadata_dict: 183 | cell_metadata_dict[barcode] = {} 184 | cell_metadata_dict[barcode]['genus'] = [] 185 | cell_metadata_dict[barcode]['genus'].append(genus) 186 | cell_metadata_dict[barcode]['barcode_UMI']={} 187 | cell_metadata_dict[barcode]['barcode_UMI'][barcode_UMI] = genus 188 | cell_metadata_dict[barcode]['pathogen_count']={} 189 | else: 190 | cell_metadata_dict[barcode]['genus'].append(genus) 191 | cell_metadata_dict[barcode]['barcode_UMI'][barcode_UMI] = genus 192 | 193 | if not genus in cell_metadata_dict[barcode]['pathogen_count']: 194 | cell_metadata_dict[barcode]['pathogen_count'][genus] = 1 195 | else: 196 | 
cell_metadata_dict[barcode]['pathogen_count'][genus] += 1 197 | return cell_metadata_dict 198 | 199 | def output_cell_metadata(cell_metadata_dict,out_genus_file,sample_ident,barcode_whitelist_file): 200 | print('total pathogen-associated gems = ', len(cell_metadata_dict)) 201 | white_list_set = set() 202 | white_list_dict = {} 203 | white_list = gzip.open(barcode_whitelist_file, 'rt') 204 | for each_line in white_list: 205 | each_line = each_line.rstrip('\n') 206 | white_list_set.add(each_line) 207 | for barcode in cell_metadata_dict: 208 | if barcode in white_list_set: 209 | white_list_dict[barcode]= cell_metadata_dict[barcode] 210 | cell_metadata_dict = white_list_dict 211 | print("total filtered pathogen-associated cells = ", len(cell_metadata_dict)) 212 | genus_file = open(out_genus_file,'w') 213 | header = 'cell_name,pathogen,UMI_count,pathogen_count\n' 214 | genus_file.write(header) 215 | 216 | for barcode in cell_metadata_dict: 217 | if not sample_ident == '': 218 | cell_name = sample_ident+'_'+barcode 219 | else: 220 | cell_name = barcode 221 | genus_list = [] 222 | for barcode_UMI in cell_metadata_dict[barcode]['barcode_UMI']: 223 | genus_list.append(cell_metadata_dict[barcode]['barcode_UMI'][barcode_UMI]) 224 | sorted_genus_list = list(set(genus_list)) 225 | sorted_genus_list.sort() 226 | genus = '+'.join(sorted_genus_list) 227 | UMI_count = len(cell_metadata_dict[barcode]['barcode_UMI']) 228 | pathogen_count_list = [] 229 | for each_pathogen in cell_metadata_dict[barcode]['pathogen_count']: 230 | pathogen_count=each_pathogen 231 | pathogen_count+=':' 232 | pathogen_count+=str(cell_metadata_dict[barcode]['pathogen_count'][each_pathogen]) 233 | pathogen_count_list.append(pathogen_count) 234 | pathogen_count_list.sort() 235 | pathogen_count_str = ';'.join(pathogen_count_list) 236 | 237 | Periority_pathogen = 'Fusobacterium' 238 | pathogen_count_mini_dict = cell_metadata_dict[barcode]['pathogen_count'] 239 | temp_max_list = [] 240 | UMI_count_sum = 0 241 | 
max_count = max(pathogen_count_mini_dict.values()) 242 | for key,value in pathogen_count_mini_dict.items(): 243 | if value == max_count: 244 | temp_max_list.append(key) 245 | max_UMI = value 246 | UMI_count_sum += value 247 | 248 | UMI_count = UMI_count_sum 249 | if len(set(temp_max_list)) > 1: 250 | genus = 'MULTI' 251 | UMI_count = UMI_count_sum 252 | else: 253 | genus = temp_max_list[0] 254 | UMI_count = max_UMI 255 | output_line = ','.join([cell_name,genus,str(UMI_count),pathogen_count_str])+'\n' 256 | if UMI_count >= 1: 257 | genus_file.write(output_line) 258 | return 259 | 260 | 261 | def UMI_table_output(cell_metadata_dict,barcode_whitelist_file,sample_ident,output_UMI_table_csv,output_UMI_validate_table_csv): 262 | white_list_set = set() 263 | white_list_dict = {} 264 | white_list = gzip.open(barcode_whitelist_file, 'rt') 265 | for each_line in white_list: 266 | each_line = each_line.rstrip('\n') 267 | white_list_set.add(each_line) 268 | print("total number of cells = ", len(white_list_set)) 269 | for barcode in cell_metadata_dict: 270 | if barcode in white_list_set: 271 | white_list_dict[barcode]= cell_metadata_dict[barcode] 272 | cell_metadata_dict = white_list_dict 273 | output_UMI_validate_table = open(output_UMI_validate_table_csv,'w') 274 | for each_cell in cell_metadata_dict: 275 | for each_UMI in cell_metadata_dict[each_cell]['barcode_UMI']: 276 | UMI = each_UMI 277 | pathogen = cell_metadata_dict[each_cell]['barcode_UMI'][UMI] 278 | output_UMI_validate_table.write(UMI+','+pathogen+'\n') 279 | 280 | output_UMI_table = open(output_UMI_table_csv,'w') 281 | genera_list_set = set() 282 | for barcode in cell_metadata_dict: 283 | for pathogen in cell_metadata_dict[barcode]['pathogen_count']: 284 | genera_list_set.add(pathogen) 285 | 286 | genera_list = sorted(list(genera_list_set)) 287 | header = ['barcode']+genera_list 288 | header_out = ','.join(header) 289 | output_UMI_table.write(header_out) 290 | output_UMI_table.write('\n') 291 | for barcode in 
cell_metadata_dict: 292 | if not sample_ident == '': 293 | cell_name = sample_ident+'_'+barcode 294 | else: 295 | cell_name = barcode 296 | genera_count_list = [] 297 | for each_genus in genera_list: 298 | if each_genus in cell_metadata_dict[barcode]['pathogen_count']: 299 | genus_count = cell_metadata_dict[barcode]['pathogen_count'][each_genus] 300 | else: 301 | genus_count = 0 302 | genera_count_list.append(str(genus_count)) 303 | output_line = [cell_name]+genera_count_list 304 | output_line_out = ','.join(output_line) 305 | output_UMI_table.write(output_line_out) 306 | output_UMI_table.write('\n') 307 | return 308 | 309 | if __name__ == "__main__": 310 | cellranger_bam_file,sample_ident,barcode_whitelist_file,pathseq_bam_file,pathseq_report_csv,read_name_pathseq,unmap_cbub_bam_file,unmap_cbub_fasta_file,out_cell_list,out_readname_cell_path,out_genus_file,output_UMI_table_csv,output_UMI_validate_table_csv=sys.argv[1:] 311 | dict_for_genus = read_pathseq_report_and_create_dict(pathseq_report_csv) 312 | step1 = read_cell_names1(pathseq_bam_file, read_name_pathseq) 313 | step2 = read_readnames(read_name_pathseq) 314 | step3 = read_cell_names2(step2[0], step2[1], dict_for_genus,cellranger_bam_file,unmap_cbub_bam_file,unmap_cbub_fasta_file, out_cell_list,out_readname_cell_path,barcode_whitelist_file) 315 | step4 = generate_barcode_UMI_dict(out_readname_cell_path) 316 | step5 = output_cells_genus_list(step4,dict_for_genus) 317 | 318 | output_cell_metadata(step5,out_genus_file,sample_ident,barcode_whitelist_file) 319 | cell_metadata_dict = step5 320 | UMI_table_output(cell_metadata_dict,barcode_whitelist_file,sample_ident,output_UMI_table_csv,output_UMI_validate_table_csv) 321 | 322 | # cellranger_bam_file, 323 | # sample_ident, 324 | # barcode_whitelist_file, 325 | # pathseq_bam_file, 326 | # pathseq_report_csv, 327 | # read_name_pathseq, 328 | # unmap_cbub_bam_file, 329 | # unmap_cbub_fasta_file, 330 | # out_cell_list, 331 | # out_readname_cell_path, 332 | # 
out_genus_file, 333 | # output_UMI_table_csv, 334 | # output_UMI_validate_table_csv=sys.argv[1:] 335 | -------------------------------------------------------------------------------- /patient_samples/merge_metadata.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pandas as pd 3 | import numpy as np 4 | import os 5 | import sys 6 | pd.set_option('precision', 0) 7 | 8 | def merge_cellsmeta2(df1,df2): 9 | df_merged = pd.concat([df1, df2], sort=False) 10 | df_merged = df_merged.round() 11 | return df_merged 12 | 13 | def feed_csvs(path):#this will return a list of csvs in your path 14 | file_list = os.listdir(path) 15 | csv_list = [] 16 | for each_file in file_list: 17 | if each_file.endswith('genus.csv'): 18 | #if each_file.endswith('merged.csv'): 19 | csv_list.append(path+'/'+each_file) 20 | return csv_list 21 | 22 | if __name__ == "__main__": 23 | path = sys.argv[1] 24 | csv_merged = path+'/csv_novami.csv' 25 | csv_list = feed_csvs(path) 26 | 27 | csv1 = csv_list[0] 28 | df1 = pd.read_csv(csv1,header = 0,sep = ',') 29 | 30 | for each_csv in csv_list[1:]: 31 | print(each_csv) 32 | df2 = pd.read_csv(each_csv,header = 0,sep = ',') 33 | df1 = merge_cellsmeta2(df1,df2) 34 | 35 | #print(df1) 36 | df1 = df1.fillna(0) 37 | df1.to_csv(csv_merged,sep=',',index=False) 38 | 39 | -------------------------------------------------------------------------------- /patient_samples/metadata_dedup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import pandas as pd 3 | from collections import Counter 4 | import os 5 | import sys 6 | """ 7 | # The purpose of this script is deduplication of the merged metadata. Since there are replicated cell names from GEX libraries and 16s libraries, it is necessary to add UMI count from both techniques together into unique cell names. 
8 | # usage: 9 | # python metadata_dedup.py \ 10 | GEX_pathogen_UMI_matrix_output_folder(with validation csvs) \ 11 | 16s_pathogen_UMI_matrix_output_folder(with validation csvs) \ 12 | Merged_csv_matrix_from_previous_step \ 13 | Dedup_csv_matrix 14 | 15 | # Note: Merged_csv_matrix_from_previous_step is a csv file conting 7 clinical samples from our study 16 | # otherwise please modify the sample names in the script. 17 | # 'nova' refer to GEX data, 'mi' refer to 16S data 18 | """ 19 | 20 | def read_and_mkdic(validate_csv_file, sample_name): 21 | UMI_bac_list = [] 22 | validate_csv = open(validate_csv_file,'r') 23 | for each_line in validate_csv: 24 | each_line = each_line.rstrip('\n') 25 | each_line = sample_name+'_'+each_line 26 | UMI_bac_list.append(each_line) 27 | return UMI_bac_list 28 | 29 | #add lists together 30 | # then count 31 | def count_elements(merged_UMI_bac_list): 32 | count_dict = Counter(merged_UMI_bac_list) 33 | return count_dict 34 | 35 | #add dics together: ndic = list(dict(dic0.items()) + list(dic1.items())) 36 | #generate a dataframe, that will be a minux matrix 37 | #df.values[rows, cols] = np.nan 38 | def minux_matrix(count_dict,nova_mi_merged_csv_file,nova_mi_merged_csv_file_dedup): 39 | nova_mi_merged = pd.read_csv(nova_mi_merged_csv_file,header = 0,sep = ',',index_col='barcode') 40 | print('before merge = ',len(nova_mi_merged)) 41 | nova_mi_merged = nova_mi_merged.groupby(nova_mi_merged.index).sum() 42 | print('after merge = ',len(nova_mi_merged)) 43 | n=0 44 | rowNamesArr = list(nova_mi_merged.index.values) 45 | #print(rowNamesArr[:10]) 46 | columnsNamesArr = list(nova_mi_merged.columns.values) 47 | for each_cell_UMI in count_dict: 48 | #print(each_cell_UMI) 49 | n+=1 50 | if n%1000 == 0: 51 | print('now working on: ',n/len(count_dict)*100,'%') 52 | cell = each_cell_UMI.split('+')[0] 53 | pathogen = each_cell_UMI.split(',')[1] 54 | count = count_dict[each_cell_UMI]-1 55 | colindex = columnsNamesArr.index(pathogen) 56 | rowindex = 
rowNamesArr.index(cell) 57 | prev = int(nova_mi_merged.loc[cell,pathogen]) 58 | nova_mi_merged.loc[cell,pathogen]=prev-count 59 | after = int(nova_mi_merged.loc[cell,pathogen]) 60 | nova_mi_merged.to_csv(nova_mi_merged_csv_file_dedup,sep=',',index=True) 61 | return 62 | 63 | 64 | if __name__ == "__main__": 65 | UMI_bac_list=[] 66 | 67 | validate_csv_file_nova = [ 68 | 'OSCC_15', 69 | 'OSCC_13', 70 | 'OSCC_14', 71 | 'OSCC_17', 72 | 'OSCC_12', 73 | 'OSCC_16', 74 | 'OSCC_11' 75 | ] 76 | sample_name_nova = [ 77 | 'OSCC_15', 78 | 'OSCC_13', 79 | 'OSCC_14', 80 | 'OSCC_17', 81 | 'OSCC_12', 82 | 'OSCC_16', 83 | 'OSCC_11' 84 | ] 85 | 86 | validate_csv_file_mi = [ 87 | 'OSCC_15', 88 | 'OSCC_13', 89 | 'OSCC_14', 90 | 'OSCC_17', 91 | 'OSCC_12', 92 | 'OSCC_16', 93 | 'OSCC_11' 94 | ] 95 | sample_name_mi = [ 96 | 'OSCC_15', 97 | 'OSCC_13', 98 | 'OSCC_14', 99 | 'OSCC_17', 100 | 'OSCC_12', 101 | 'OSCC_16', 102 | 'OSCC_11' 103 | ] 104 | 105 | path_nova = argv[1] 106 | path_mi = argv[2] 107 | nova_mi_merged_csv_file = argv[3] 108 | nova_mi_merged_csv_file_dedup = argv[4] 109 | 110 | #also modified the cell names in sub merged csvs! 
111 | 112 | for n in range(0,len(validate_csv_file_nova)): 113 | UMI_bac_list = UMI_bac_list+read_and_mkdic(path_nova + '/' + validate_csv_file_nova[n]+'.gex.filtered_matrix.validate.csv', 'sample_'+sample_name_nova[n]) 114 | 115 | for n in range(0,len(validate_csv_file_mi)): 116 | UMI_bac_list = UMI_bac_list+read_and_mkdic(path_mi + '/' + validate_csv_file_mi[n]+'.16s.filtered_matrix.validate.csv', 'sample_'+sample_name_mi[n]) 117 | 118 | count_dict = count_elements(UMI_bac_list) 119 | minux_matrix(count_dict,nova_mi_merged_csv_file,nova_mi_merged_csv_file_dedup) 120 | 121 | 122 | -------------------------------------------------------------------------------- /patient_samples/patient_samples_16s_pipeline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ml CellRanger/6.1.1 3 | 4 | ml BEDTools/2.29.2-GCC-9.3.0 5 | ml SAMtools/1.16.1-GCC-11.2.0 6 | ml FastQC/0.11.9-Java-11 7 | ml Trimmomatic/0.39-Java-11 8 | ml picard/2.21.6-Java-11 9 | 10 | 11 | ml GATK/4.1.3.0-GCCcore-8.3.0-Java-1.8 12 | 13 | ml Python 14 | ml Pysam 15 | 16 | `raw_data_folder` # the folder containing Cellranger mkfastq output folders 17 | `root` # working directory 18 | `pathseqdb` # Pathseq database 19 | `cellrangerdb` # Cellranger database 20 | `gex_bam_path` # barcodes.tsv.gz from GEX is used as a 'whitelist' for real cells 21 | 22 | root=${workdir} 23 | # Run cellranger count 24 | cd ${workdir} 25 | mkdir cellranger_count 26 | cd cellranger_count 27 | for folder in ${raw_data_folder}/* 28 | do 29 | folder_name=${folder##*/} 30 | path=${folder} 31 | cellranger count \ 32 | --id=${folder_name} \ 33 | --transcriptome=${cellrangerdb} \ 34 | --fastqs=${path} \ 35 | --sample=${folder_name} 36 | done 37 | 38 | cd ${workdir} 39 | mkdir split_reads 40 | cd split_reads 41 | 42 | # convert cellranger bam file to fastqs 43 | for folder in ${workdir}/cellranger_count/* 44 | do 45 | folder_name=${folder##*/} 46 | 
file=${folder}/outs/possorted_genome_bam.bam
echo "${file}"

samplename=${folder_name}
# NOTE(review): bedtools documents that paired output (-fq2) expects a BAM
# grouped by read name; the input here is position-sorted - confirm.
bedtools bamtofastq -i "${file}" \
-fq "${samplename}.r1.fq" \
-fq2 "${samplename}.r2.fq"
done

# run fastqc before trimmomatic
mkdir -p "${root}/preqc"
fastqc \
    -o "${root}/preqc" \
    "${root}"/split_reads/*.fq

# run trimmomatic on R1
cd "${root}/split_reads"
mkdir -p trim
for str in *r1.fq
do
    # adjust -threads to number of cores you would like to use
    java -jar "$EBROOTTRIMMOMATIC/trimmomatic-0.39.jar" SE \
        -threads 36 \
        "${str}" \
        "trim/${str}.SE_trim.fq" \
        "ILLUMINACLIP:$EBROOTTRIMMOMATIC/adapters/TruSeq3-PE-2.fa:2:30:10" \
        LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36 HEADCROP:15
done

# run fastqc after trimmomatic
cd trim
mkdir -p "${root}/postqc"
fastqc \
    -o "${root}/postqc" \
    *.SE_trim.fq

mkdir -p "${workdir}/ubams_r1"
cd "${root}/split_reads/trim"

# convert R1 to ubam file in order to run Pathseq
for file in *SE_trim.fq
do
    java -Xmx700G -jar "$EBROOTPICARD/picard.jar" FastqToSam \
        FASTQ="${file}" \
        OUTPUT="${file}.bam" \
        READ_GROUP_NAME=16s \
        SAMPLE_NAME=16s

    # move the generated ubam file (the file name is kept)
    mv "${file}.bam" "${workdir}/ubams_r1"
done

ubam_folder=${workdir}/ubams_r1
outpath=${workdir}/pathseq_r1
mkdir -p "${outpath}"

cd "${ubam_folder}"

# Pathseq to identify pathogen-associated cells
for each_file in *.bam
do
    echo "${each_file}"
    # Strip the last three extensions (.bam, .fq, .SE_trim) so that
    # <sample>.r1.fq.SE_trim.fq.bam becomes <sample>.r1.fq
    filename="${each_file%.*}"
    filename="${filename%.*}"
    filename="${filename%.*}"
    samplename=${filename}
    echo "${samplename}"
    # PathSeq process
    # Please adjust "-Xmx750g" based on the memory you want to use.
    # Adjust --min-score-identity and --min-clipped-read-length based on your samples
    gatk --java-options "-Xmx750g" PathSeqPipelineSpark \
        --input "${each_file}" \
        --filter-bwa-image "${pathseqdb}/pathseq_host.fa.img" \
        --kmer-file "${pathseqdb}/pathseq_host.bfi" \
        --min-clipped-read-length 60 \
        --microbe-fasta "${pathseqdb}/pathseq_microbe.fa" \
        --microbe-bwa-image "${pathseqdb}/pathseq_microbe.fa.img" \
        --taxonomy-file "${pathseqdb}/pathseq_taxonomy.db" \
        --output "${outpath}/${samplename}.pathseq.complete.bam" \
        --scores-output "${outpath}/${samplename}.pathseq.complete.txt.csv" \
        --is-host-aligned false \
        --filter-duplicates false \
        --min-score-identity .7
done

# Python script to produce a bacteria UMI matrix (based on valid GEX cell)
bam_path=${workdir}/cellranger_count
pathseq_path=${workdir}/pathseq_r1
out_path=${root}/python
mkdir -p "${out_path}"
cd "${bam_path}"
# barcodes.tsv.gz from GEX is used as a 'whitelist' for real cells.
for each_sample in *
do
    echo "${each_sample}"
    echo "${gex_bam_path}"
    python INVADEseq.py \
        "${bam_path}/${each_sample}/outs/possorted_genome_bam.bam" \
        "${each_sample}" \
        "${gex_bam_path}/${each_sample}/outs/filtered_feature_bc_matrix/barcodes.tsv.gz" \
        "${pathseq_path}/${each_sample}.r1.fq.pathseq.complete.bam" \
        "${pathseq_path}/${each_sample}.r1.fq.pathseq.complete.txt.csv" \
        "${out_path}/${each_sample}.16s.filtered_matrix.readname" \
        "${out_path}/${each_sample}.16s.filtered_matrix.unmap_cbub.bam" \
        "${out_path}/${each_sample}.16s.filtered_matrix.unmap_cbub.fasta" \
        "${out_path}/${each_sample}.16s.filtered_matrix.list" \
        "${out_path}/${each_sample}.16s.raw.filtered_matrix.readnamepath" \
        "${out_path}/${each_sample}.16s.filtered_matrix.genus.cell" \
        "${out_path}/${each_sample}.16s.filtered_matrix.genus.csv" \
        "${out_path}/${each_sample}.16s.filtered_matrix.validate.csv"
done



--------------------------------------------------------------------------------
/patient_samples/patient_samples_GEX_pipeline.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# The preprocessing pipeline for Patient samples single-cell GEX data
ml CellRanger/6.1.1
ml GATK/4.1.3.0-GCCcore-8.3.0-Java-1.8

ml Python
ml Pysam

# Required settings: define these variables before running the script.
# (They used to be written in backticks, which made bash try to execute
# the variable names as commands.)
: "${raw_data_folder:?set raw_data_folder to the folder containing Cellranger mkfastq output folders}"
: "${root:?set root to the working directory}"
: "${pathseqdb:?set pathseqdb to the Pathseq database}"
: "${cellrangerdb:?set cellrangerdb to the Cellranger database}"

cd "${root}"
mkdir -p cellranger_count
cd cellranger_count

# Cellranger count processing
for folder in "${raw_data_folder}"/*
do
    folder_name=${folder##*/}
    path=${folder}
    echo "${path}"
    cellranger count \
        --id="${folder_name}" \
        --transcriptome="${cellrangerdb}" \
        --fastqs="${path}" \
        --sample="${folder_name}"
done

# PathSeq pipeline
outpath=${root}/pathseq
mkdir -p "${outpath}"
# PathSeq process
# Please adjust "-Xmx750g" based on the memory you want to use.
# Adjust --min-score-identity and --min-clipped-read-length based on your samples
#
# The current directory is still cellranger_count, so every entry is one sample.
for folder in *
do
    folder_name=${folder##*/}
    file=${folder}/outs/possorted_genome_bam.bam
    samplename=${folder_name}
    echo "${samplename}"
    gatk --java-options "-Xmx750g" PathSeqPipelineSpark \
        --input "${file}" \
        --filter-bwa-image "${pathseqdb}/pathseq_host.fa.img" \
        --kmer-file "${pathseqdb}/pathseq_host.bfi" \
        --min-clipped-read-length 60 \
        --microbe-fasta "${pathseqdb}/pathseq_microbe.fa" \
        --microbe-bwa-image "${pathseqdb}/pathseq_microbe.fa.img" \
        --taxonomy-file "${pathseqdb}/pathseq_taxonomy.db" \
        --output "${outpath}/${samplename}.pathseq.complete.bam" \
        --scores-output "${outpath}/${samplename}.pathseq.complete.csv" \
        --is-host-aligned false \
        --filter-duplicates false \
        --min-score-identity .7
done

# Python script to generate bacteria matrix
bam_path=${root}/cellranger_count
pathseq_path=${root}/pathseq
out_path=${root}/python
mkdir -p "${out_path}"
cd "${bam_path}"

for each_sample in *
do
    echo "${each_sample}"
    python INVADEseq.py \
        "${bam_path}/${each_sample}/outs/possorted_genome_bam.bam" \
        "${each_sample}" \
        "${bam_path}/${each_sample}/outs/filtered_feature_bc_matrix/barcodes.tsv.gz" \
        "${pathseq_path}/${each_sample}.pathseq.complete.bam" \
        "${pathseq_path}/${each_sample}.pathseq.complete.csv" \
        "${out_path}/${each_sample}.gex.filtered_matrix.readname" \
        "${out_path}/${each_sample}.gex.filtered_matrix.unmap_cbub.bam" \
        "${out_path}/${each_sample}.gex.filtered_matrix.unmap_cbub.fasta" \
        "${out_path}/${each_sample}.gex.filtered_matrix.list" \
        "${out_path}/${each_sample}.gex.raw.filtered_matrix.readnamepath" \
        "${out_path}/${each_sample}.gex.filtered_matrix.genus.cell" \
        "${out_path}/${each_sample}.gex.filtered_matrix.genus.csv" \
        "${out_path}/${each_sample}.gex.filtered_matrix.validate.csv"
done

--------------------------------------------------------------------------------
/patient_samples/patient_samples_Seurat.r:
--------------------------------------------------------------------------------
#!/usr/bin/env Rscript
library(harmony)
library(Seurat)
library(ggplot2)
library(SingleR)
library(celldex)
library(msigdbr)
library(cowplot)
library(dplyr)
hpca.se <- celldex::HumanPrimaryCellAtlasData()
library("enrichplot")
library(ggupset)
library(gridExtra)
library(pheatmap)
options(bitmapType = 'cairo')
knitr::opts_chunk$set(dev="CairoPNG")
library(org.Hs.eg.db)

sample_OSCC_17.data<-Read10X(data.dir = "OSCC_17/outs/filtered_feature_bc_matrix")
sample_OSCC_17 = CreateSeuratObject(counts = sample_OSCC_17.data, project = "Sample_OSCC_17", min.cells = 3, min.features = 200)
sample_OSCC_17[["percent.mt"]] <- PercentageFeatureSet(sample_OSCC_17, pattern = "^MT-")
sample_OSCC_12.data<-Read10X(data.dir = "OSCC_12/outs/filtered_feature_bc_matrix")
sample_OSCC_12 = CreateSeuratObject(counts = sample_OSCC_12.data, project = "Sample_OSCC_12", min.cells = 3, min.features = 200)
sample_OSCC_12[["percent.mt"]] <- PercentageFeatureSet(sample_OSCC_12, pattern = "^MT-")
sample_OSCC_13.data<-Read10X(data.dir = "OSCC_13/outs/filtered_feature_bc_matrix")
sample_OSCC_13 = CreateSeuratObject(counts = sample_OSCC_13.data, project = "Sample_OSCC_13", min.cells = 3, min.features = 200)
sample_OSCC_13[["percent.mt"]] <- PercentageFeatureSet(sample_OSCC_13, pattern = "^MT-")
sample_OSCC_14.data<-Read10X(data.dir = "OSCC_14/outs/filtered_feature_bc_matrix")
sample_OSCC_14 = CreateSeuratObject(counts = sample_OSCC_14.data, project = "Sample_OSCC_14",
min.cells = 3, min.features = 200) 30 | sample_OSCC_14[["percent.mt"]] <- PercentageFeatureSet(sample_OSCC_14, pattern = "^MT-") 31 | sample_OSCC_15.data<-Read10X(data.dir = "OSCC_15/outs/filtered_feature_bc_matrix") 32 | sample_OSCC_15 = CreateSeuratObject(counts = sample_OSCC_15.data, project = "Sample_OSCC_15", min.cells = 3, min.features = 200) 33 | sample_OSCC_15[["percent.mt"]] <- PercentageFeatureSet(sample_OSCC_15, pattern = "^MT-") 34 | sample_OSCC_11.data<-Read10X(data.dir = "OSCC_11/outs/filtered_feature_bc_matrix") 35 | sample_OSCC_11 = CreateSeuratObject(counts = sample_OSCC_11.data, project = "Sample_OSCC_11", min.cells = 3, min.features = 200) 36 | sample_OSCC_11[["percent.mt"]] <- PercentageFeatureSet(sample_OSCC_11, pattern = "^MT-") 37 | sample_OSCC_16.data<-Read10X(data.dir = "OSCC_16/outs/filtered_feature_bc_matrix") 38 | sample_OSCC_16 = CreateSeuratObject(counts = sample_OSCC_16.data, project = "Sample_OSCC_16", min.cells = 3, min.features = 200) 39 | sample_OSCC_16[["percent.mt"]] <- PercentageFeatureSet(sample_OSCC_16, pattern = "^MT-") 40 | # merge, cluster and Harmony integration 41 | sample.headneck <- merge(sample_OSCC_17, y = c(sample_OSCC_12,sample_OSCC_13,sample_OSCC_14,sample_OSCC_15,sample_OSCC_11,sample_OSCC_16), add.cell.ids = c('Sample_OSCC_17','Sample_OSCC_12','Sample_OSCC_13_T','Sample_OSCC_14_T','Sample_OSCC_15_T','Sample_OSCC_11',"Sample_OSCC_16"), project = "SAMPLE.INTEGRATED") 42 | 43 | VlnPlot(sample.headneck, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3) 44 | sample.headneck <- NormalizeData(object = sample.headneck, normalization.method = "LogNormalize", scale.factor = 10000) 45 | GetAssay(sample.headneck,assay = "RNA") 46 | sample.headneck <- FindVariableFeatures(object = sample.headneck, selection.method = "vst", nfeatures = 5000) 47 | top20 <- head(x = VariableFeatures(object = sample.headneck), 20) 48 | plot1 <- VariableFeaturePlot(object = sample.headneck) 49 | plot2 <- LabelPoints(plot = 
plot1, points = top20, repel = TRUE) 50 | plot1+plot2 51 | all.genes <- rownames(sample.headneck) 52 | sample.headneck<- ScaleData(object = sample.headneck,features = all.genes) 53 | sample.headneck <- RunPCA(object = sample.headneck,pc.genes = VariableFeatures(sample.headneck)) 54 | ElbowPlot(sample.headneck) 55 | sample.headneck <- RunHarmony(sample.headneck, group.by.vars="orig.ident",assay.use='RNA') 56 | names(sample.headneck@reductions) 57 | sample.headneck <- RunUMAP(sample.headneck, dims = 1:20, 58 | reduction = "harmony",seed.use=111) 59 | DimPlot(sample.headneck,reduction = "umap",label=T ) 60 | DimPlot(sample.headneck,reduction = "umap",label=F ) + ggtitle("Integrated")+theme(plot.title = element_text(hjust = 0.5)) 61 | 62 | sample.headneck <- FindNeighbors(sample.headneck, reduction = "harmony",dims = 1:20) 63 | sample.headneck <- FindClusters(sample.headneck, resolution = 0.5) 64 | table(sample.headneck@meta.data$seurat_clusters) 65 | DimPlot(sample.headneck,reduction = "umap",label=T) 66 | DimPlot(sample.headneck,reduction = "umap",label=T, 67 | group.by = 'orig.ident') 68 | 69 | # SingleR annotation 70 | ref <- HumanPrimaryCellAtlasData() 71 | seuratObj_annot <- as.SingleCellExperiment(sample.headneck) 72 | library(SingleR) 73 | pred <- SingleR(test=seuratObj_annot, ref=ref, labels=ref$label.fine) 74 | head(pred) 75 | plotScoreHeatmap(pred) 76 | tab <- table(Assigned=pred$pruned.labels, Cluster=seuratObj_annot@colData$seurat_clusters) 77 | # Adding a pseudo-count of 10 to avoid strong color jumps with just 1 cell. 
# Heatmap of the label-by-cluster table `tab` on a log2 scale (pseudo-count 10).
pheatmap(log2(tab+10), color=colorRampPalette(c("white", "blue"))(101))

# Cluster-level SingleR prediction: one label per Seurat cluster.
# `clusters` is spelled out in full instead of relying on partial argument
# matching of `cluster`.
pred2 <- SingleR(test=seuratObj_annot, ref=ref, clusters=seuratObj_annot@colData$seurat_clusters, labels=ref$label.fine)
sample.headneck.backup = sample.headneck
sample.headneck@meta.data$cell.type.fine = sample.headneck@meta.data$seurat_clusters
# Map each cell's cluster id to that cluster's predicted label.
sample.headneck[["SingleR.cluster.labels"]] <-
  pred2$labels[match(sample.headneck[[]][["seurat_clusters"]], rownames(pred2))]

Idents(sample.headneck) <- "SingleR.cluster.labels"
# Collapse the tissue-specific epithelial labels into one identity.
sample.headneck <- RenameIdents(sample.headneck,
                                'Epithelial_cells:bronchial' = "Epithelial_cells",
                                'Epithelial_cells:bladder' = "Epithelial_cells"
)

# CopyKAT prediction
# copykat() is called below, so the package has to be attached first.
library(copykat)
sample.headneck.exp.rawdata <- as.matrix(sample.headneck@assays$RNA@counts)
# output.seg was previously misspelled "FLASE"; "FALSE" is the intended value.
sample.headneck.copykat.test <- copykat(rawmat=sample.headneck.exp.rawdata, id.type="S", ngene.chr=3, win.size=25, KS.cut=0.1, sam.name="sample.headneck", distance="euclidean", norm.cell.names="", n.cores=34, output.seg="FALSE")
# Add CopyKAT metadata
# NOTE(review): the file name presumably matches what copykat writes for
# sam.name="sample.headneck" - confirm it exists in the working directory.
sample.headneck_copykat_prediction_csv = 'sample.headneck_copykat_prediction.txt'
sample.headneck_copykat_prediction <- read.csv(sample.headneck_copykat_prediction_csv, sep='\t', header=TRUE, row.names=1)
sample.headneck <- AddMetaData(sample.headneck, sample.headneck_copykat_prediction)

# add pathogen UMI metadata
umi_table_csv = 'csv_novami_mix_dedup_rename.csv'
umi_table <- read.csv(umi_table_csv, sep=',', header=TRUE, row.names = 1)
# Treat missing counts as 0 so the per-row total can be computed, then turn
# every 0 back into NA before attaching the table as metadata.
umi_table[is.na(umi_table)] <- 0
umi_table$Total <- rowSums(umi_table)
umi_table[umi_table==0] <- NA
sample.headneck <- AddMetaData(sample.headneck, umi_table)
# saveRDS(sample.headneck, file = "sample.headneck.rds")
--------------------------------------------------------------------------------