├── 10X_Visium_samples
    ├── UMI_annotator.py
    ├── Visium.R
    ├── Visium_pipeline.sh
    ├── output
    │   ├── CRC_16.visium.raw_matrix.genus.csv
    │   ├── CRC_16.visium.raw_matrix.validate.csv
    │   ├── OSCC_2.visium.raw_matrix.genus.csv
    │   └── OSCC_2.visium.raw_matrix.validate.csv
    └── validate_and_count.py
├── LICENSE
├── README.md
├── cell_culture_samples
    ├── DE.r
    ├── INVADEseq.py
    ├── cell_culture_16s_pipeline.sh
    ├── cell_culture_Seurat.r
    ├── cell_culture_samples_GEX_pipeline.sh
    ├── merge_metadata.py
    ├── metadata_dedup.py
    └── output
    │   ├── HCT_116_gex_16s_mix_dedup.csv
    │   └── HT_29_gex_16s_mix_dedup.csv
└── patient_samples
    ├── DE.r
    ├── INVADEseq.py
    ├── merge_metadata.py
    ├── metadata_dedup.py
    ├── out
        └── headneck_gex_16s_mix_dedup_updated.csv
    ├── patient_samples_16s_pipeline.sh
    ├── patient_samples_GEX_pipeline.sh
    └── patient_samples_Seurat.r


/10X_Visium_samples/UMI_annotator.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | import pysam
  3 | import sys
  4 | import gzip
  5 | 
  6 | def read_cell_names1(pathseq_bam_file, write_bac):
  7 |     seqbam = pysam.AlignmentFile(pathseq_bam_file, "rb",threads=36)
  8 |     read_name_pathseq = open(write_bac,'w')
  9 |     total_pathseq_reads=0
 10 |     total_YP_reads=0
 11 |     for each_line in seqbam:
 12 |         total_pathseq_reads+=1
 13 |         if each_line.has_tag('YP'):
 14 |             total_YP_reads+=1
 15 |             outline = each_line.query_name + '\t' + each_line.get_tag('YP') + '\t' + str(each_line.mapping_quality) + '\n'
 16 |             read_name_pathseq.write(outline)
 17 |     print('Total reads in pathseq bam = ',total_pathseq_reads)
 18 |     print('Total reads in pathseq bam with YP tag  = ',total_YP_reads)
 19 |     return
 20 | 
 21 | def read_readnames(readname_file):
 22 |     set_for_readnames = set()
 23 |     dict_name = {}
 24 |     with open (readname_file,'r') as r:
 25 |         for each_line in r:
 26 |             each_line = each_line.rstrip('\n')
 27 |             each_line_list = each_line.split('\t')
 28 |             set_for_readnames.add(each_line_list[0])
 29 |             dict_name[each_line_list[0]] = {}
 30 |             dict_name[each_line_list[0]]["pathogen"] = each_line_list[1]
 31 |             dict_name[each_line_list[0]]["mapping_score"] = each_line_list[2]
 32 |     return set_for_readnames, dict_name
 33 | 
 34 | def read_pathseq_report_and_create_dict(pathseq_report_csv):
 35 |     pathseq_report = open(pathseq_report_csv,'r')
 36 |     dict_for_genus = {}
 37 |     set_for_genera = set()
 38 |     for each_line in pathseq_report:
 39 |         each_line = each_line.rstrip('\n')
 40 |         each_line_list = each_line.split('\t')
 41 |         level = each_line_list[2]
 42 |         tax = each_line_list[3]
 43 |         if level == 'genus':
 44 |             set_for_genera.add(tax)
 45 |         if '|' in each_line_list[1]:
 46 |             name_string_list = each_line_list[1].split('|')
 47 |             for n in range(len(name_string_list)):
 48 |                 pointer = -n-1           
 49 |                 if not '_' in name_string_list[pointer]:
 50 |                     name = name_string_list[pointer]
 51 |                     break
 52 |                 if 'unclassified' in name_string_list[pointer]:
 53 |                     name = name_string_list[pointer]
 54 |                     break         
 55 |             id = each_line_list[0]
 56 |             dict_for_genus[id] = name
 57 |     print ("len(dict_for_genus) = ",len(dict_for_genus))
 58 |     return dict_for_genus
 59 | def read_cell_names2(set_of_readnames, dict_name, dict_for_genus,original_bam_file,unmap_cbub_bam_file,unmap_cbub_fasta_file, out_cell_list,out_readname_cell_path,barcode_whitelist_file):
 60 |     white_list_set = set()
 61 |     white_list = gzip.open(barcode_whitelist_file, 'rt')
 62 |     for each_line in white_list:
 63 |         each_line = each_line.rstrip('\n')
 64 |         white_list_set.add(each_line)
 65 | 
 66 |     seqbam = pysam.AlignmentFile(original_bam_file, "rb",threads=36)
 67 |     readname_cell_path = open(out_readname_cell_path,'w')
 68 |     unmap_cbub_fasta = open(unmap_cbub_fasta_file,'w')
 69 |     unmap_cbub_bam = pysam.AlignmentFile(unmap_cbub_bam_file, "wb", seqbam)
 70 | 
 71 |     set_for_infect_cells=set()
 72 |     total_cellranger_bam_reads = 0
 73 |     total_cellranger_reads_UB_CB_tags = 0
 74 |     total_cellranger_reads_UB_CB_unmap = 0
 75 |     total_cellranger_reads_UB_CB_unmap_Aligned_to_Pathseq_YP_reads = 0
 76 |     total_potential_UMI_including_ambigious_reads = set()
 77 |     for each_line in seqbam:
 78 |         total_cellranger_bam_reads+=1
 79 |         if each_line.has_tag('CB') and each_line.has_tag('UB'):
 80 |             if each_line.get_tag('CB') in white_list_set:
 81 |                 total_cellranger_reads_UB_CB_tags+=1
 82 |                 if each_line.is_unmapped:
 83 |                     total_cellranger_reads_UB_CB_unmap+=1
 84 |                     # added 102721: output a fasta file for kraken
 85 |                     query_name_in_cellranger_bam = each_line.query_name
 86 |                     seq_in_cellranger_bam = each_line.query_sequence
 87 |                     unmap_cbub_fasta.write('>')
 88 |                     unmap_cbub_fasta.write(query_name_in_cellranger_bam)
 89 |                     unmap_cbub_fasta.write('\n')
 90 |                     unmap_cbub_fasta.write(seq_in_cellranger_bam)
 91 |                     unmap_cbub_fasta.write('\n')
 92 |                     unmap_cbub_bam.write(each_line)
 93 |                     if each_line.query_name in set_of_readnames:
 94 |                         set_for_infect_cells.add(each_line.get_tag('CB'))
 95 |                         readname = each_line.query_name
 96 |                         cellname = each_line.get_tag('CB')
 97 |                         umi = each_line.get_tag('UB')
 98 |                         path = dict_name[readname]["pathogen"]
 99 |                         id_string_list = path.split(',')
100 |                         genus_list = []
101 |                         for each_id in id_string_list:
102 |                             if each_id in dict_for_genus:
103 |                                 genus = dict_for_genus[each_id]
104 |                                 genus_list.append(genus)
105 |                             else:
106 |                                 print(each_id,"  not found!")
107 |                         genus_list = list(set(genus_list))
108 |                         genus_list.sort()
109 |                         genus_list_string = ','.join(genus_list)           
110 |                         mapping_score = dict_name[readname]["mapping_score"]
111 |                         outline = readname+'\t'+cellname+'\t'+umi+'\t'+path+'\t'+mapping_score+'\t'+genus_list_string+'\n'
112 |                         readname_cell_path.write(outline)
113 |                         total_potential_UMI_including_ambigious_reads.add(umi)
114 |                         total_cellranger_reads_UB_CB_unmap_Aligned_to_Pathseq_YP_reads+=1
115 |     print('total cellranger bam reads = ',total_cellranger_bam_reads)
116 |     print('total cellranger bam reads with UB CB tags (in-cell) = ',total_cellranger_reads_UB_CB_tags)
117 |     print('total UNMAPPED cellranger bam reads with UB CB tags (in-cell) = ',total_cellranger_reads_UB_CB_unmap)
118 |     print('total cellranger reads with UB_CB_unmap Aligned to Pathseq reads with YP tags = (in-cell)',total_cellranger_reads_UB_CB_unmap_Aligned_to_Pathseq_YP_reads)
119 |     cell_list = open(out_cell_list,'w')
120 |     for each_cell in set_for_infect_cells:
121 |         cell_list.write(each_cell)
122 |         cell_list.write('\n')
123 |     return 
124 | 
125 | 
def generate_barcode_UMI_dict(out_readname_cell_path):
    """Collapse the per-read table to one record per ``barcode+UMI``.

    Each input line has six tab-separated columns: read name, cell barcode,
    UMI, comma-separated ids, mapping score, genus string. For every
    ``barcode+UMI`` key the record of the read with the highest mapping score
    is kept; on a tie the first read seen wins.

    Args:
        out_readname_cell_path: Path of the tab-separated per-read table.

    Returns:
        dict mapping ``"<barcode>+<UMI>"`` to
        ``{"id_string": list of ids, "mapping_score": int, "genus_string": str}``.
    """
    barcode_UMI_dict = {}
    with open(out_readname_cell_path, 'r') as cell_path_file:
        for each_line in cell_path_file:
            each_line_list = each_line.rstrip('\n').split('\t')
            barcode_UMI = each_line_list[1] + '+' + each_line_list[2]
            id_string_list = each_line_list[3].split(',')
            mapping_score = int(each_line_list[4])
            genus_string = each_line_list[5]
            current = barcode_UMI_dict.get(barcode_UMI)
            # Strictly greater: an equal score never replaces the stored read.
            if current is None or mapping_score > current["mapping_score"]:
                barcode_UMI_dict[barcode_UMI] = {
                    "id_string": id_string_list,
                    "mapping_score": mapping_score,
                    "genus_string": genus_string,
                }
    return barcode_UMI_dict
150 | 
def output_cells_genus_list(barcode_UMI_dict,dict_for_genus):
    """Group genus-unambiguous UMIs by cell barcode.

    A ``barcode+UMI`` record is kept only if its ``"genus_string"`` contains
    no comma *and* its ids resolve (through ``dict_for_genus``) to exactly one
    distinct genus. Ids missing from ``dict_for_genus`` are ignored.

    Args:
        barcode_UMI_dict: ``"<barcode>+<UMI>" -> {"id_string": list of ids,
            "genus_string": str, ...}``.
        dict_for_genus: ``id -> genus name``.

    Returns:
        dict mapping barcode to::

            {"genus": [genus per kept UMI, ...],
             "barcode_UMI": {"<barcode>+<UMI>": genus, ...},
             "pathogen_count": {genus: number of kept UMIs, ...}}

        The number of kept UMIs is printed to stdout.
    """
    unambigious_UMI = {}
    for barcode_UMI, record in barcode_UMI_dict.items():
        if ',' in record["genus_string"]:
            continue
        genera = {
            dict_for_genus[each_id]
            for each_id in record["id_string"]
            if each_id in dict_for_genus
        }
        if len(genera) == 1:  # only keep unambiguous UMIs
            unambigious_UMI[barcode_UMI] = next(iter(genera))
    print('Total unambigious UMI = ',len(unambigious_UMI))

    cell_metadata_dict = {}
    for barcode_UMI, genus in unambigious_UMI.items():
        barcode = barcode_UMI.split('+')[0]
        cell = cell_metadata_dict.setdefault(
            barcode, {'genus': [], 'barcode_UMI': {}, 'pathogen_count': {}}
        )
        cell['genus'].append(genus)
        cell['barcode_UMI'][barcode_UMI] = genus
        cell['pathogen_count'][genus] = cell['pathogen_count'].get(genus, 0) + 1
    return cell_metadata_dict
198 | 
def output_cell_metadata(cell_metadata_dict,out_genus_file,sample_ident,barcode_whitelist_file):
    """Write one CSV row per whitelisted barcode naming its dominant genus.

    Columns: ``cell_name,pathogen,UMI_count,pathogen_count``.

    * ``cell_name`` is ``<sample_ident>_<barcode>``, or the bare barcode when
      ``sample_ident`` is the empty string.
    * ``pathogen`` is the genus with the highest count; when several genera
      tie for the highest count it is the literal ``MULTI``.
    * ``UMI_count`` is that highest count, or the sum over all genera for
      ``MULTI`` rows.
    * ``pathogen_count`` is the sorted ``genus:count`` pairs joined by ``;``.

    Args:
        cell_metadata_dict: ``barcode -> {"pathogen_count": {genus: int}, ...}``.
        out_genus_file: Output CSV path (overwritten).
        sample_ident: Prefix for the cell names, or ``''`` for none.
        barcode_whitelist_file: gzip-compressed text file, one barcode per
            line; barcodes not listed there are dropped.

    Returns:
        None. The barcode counts before and after whitelist filtering are
        printed to stdout.
    """
    print('total pathogen-associated gems = ', len(cell_metadata_dict))
    with gzip.open(barcode_whitelist_file, 'rt') as white_list:
        white_list_set = {each_line.rstrip('\n') for each_line in white_list}
    cell_metadata_dict = {
        barcode: meta
        for barcode, meta in cell_metadata_dict.items()
        if barcode in white_list_set
    }
    print("total filtered pathogen-associated cells = ", len(cell_metadata_dict))

    with open(out_genus_file, 'w') as genus_file:
        genus_file.write('cell_name,pathogen,UMI_count,pathogen_count\n')
        for barcode, meta in cell_metadata_dict.items():
            if sample_ident != '':
                cell_name = sample_ident + '_' + barcode
            else:
                cell_name = barcode

            pathogen_counts = meta['pathogen_count']
            pathogen_count_str = ';'.join(sorted(
                each_pathogen + ':' + str(count)
                for each_pathogen, count in pathogen_counts.items()
            ))

            max_count = max(pathogen_counts.values())
            top_genera = [
                each_pathogen
                for each_pathogen, count in pathogen_counts.items()
                if count == max_count
            ]
            if len(top_genera) > 1:
                # Tie for first place: report the cell as mixed, with all UMIs.
                genus = 'MULTI'
                UMI_count = sum(pathogen_counts.values())
            else:
                genus = top_genera[0]
                UMI_count = max_count

            if UMI_count >= 1:
                genus_file.write(','.join([cell_name, genus, str(UMI_count), pathogen_count_str]) + '\n')
    return
259 | 
260 | 
def UMI_table_output(cell_metadata_dict,barcode_whitelist_file,sample_ident,output_UMI_table_csv,output_UMI_validate_table_csv):
    """Write the barcode x genus count matrix and the per-UMI validation table.

    Two files are produced, both restricted to whitelisted barcodes:

    * ``output_UMI_validate_table_csv``: one ``<barcode>+<UMI>,<genus>`` line
      per entry of each cell's ``"barcode_UMI"`` dict (no header, no sample
      prefix).
    * ``output_UMI_table_csv``: header ``barcode,<genus 1>,<genus 2>,...``
      (genera sorted), then one row per barcode with its count for every
      genus, ``0`` where the cell has none.

    Args:
        cell_metadata_dict: ``barcode -> {"barcode_UMI": {key: genus},
            "pathogen_count": {genus: int}, ...}``.
        barcode_whitelist_file: gzip-compressed text file, one barcode per line.
        sample_ident: Prefix (``<sample_ident>_``) for the row names of the
            count matrix, or ``''`` for none.
        output_UMI_table_csv: Output path of the count matrix.
        output_UMI_validate_table_csv: Output path of the validation table.

    Returns:
        None. The number of whitelisted barcodes is printed to stdout.
    """
    with gzip.open(barcode_whitelist_file, 'rt') as white_list:
        white_list_set = {each_line.rstrip('\n') for each_line in white_list}
    print("total number of cells = ", len(white_list_set))
    cell_metadata_dict = {
        barcode: meta
        for barcode, meta in cell_metadata_dict.items()
        if barcode in white_list_set
    }

    with open(output_UMI_validate_table_csv, 'w') as output_UMI_validate_table:
        for meta in cell_metadata_dict.values():
            for barcode_UMI, pathogen in meta['barcode_UMI'].items():
                output_UMI_validate_table.write(barcode_UMI + ',' + pathogen + '\n')

    # Column order of the matrix: every genus seen in any kept cell, sorted.
    genera_list = sorted({
        pathogen
        for meta in cell_metadata_dict.values()
        for pathogen in meta['pathogen_count']
    })

    with open(output_UMI_table_csv, 'w') as output_UMI_table:
        output_UMI_table.write(','.join(['barcode'] + genera_list) + '\n')
        for barcode, meta in cell_metadata_dict.items():
            if sample_ident != '':
                cell_name = sample_ident + '_' + barcode
            else:
                cell_name = barcode
            genera_count_list = [
                str(meta['pathogen_count'].get(each_genus, 0))
                for each_genus in genera_list
            ]
            output_UMI_table.write(','.join([cell_name] + genera_count_list) + '\n')
    return
308 | 
if __name__ == "__main__":
    # Exactly thirteen positional command-line arguments, in this order.
    (
        cellranger_bam_file,
        sample_ident,
        barcode_whitelist_file,
        pathseq_bam_file,
        pathseq_report_csv,
        read_name_pathseq,
        unmap_cbub_bam_file,
        unmap_cbub_fasta_file,
        out_cell_list,
        out_readname_cell_path,
        out_genus_file,
        output_UMI_table_csv,
        output_UMI_validate_table_csv,
    ) = sys.argv[1:]

    # id -> genus name lookup from the report
    dict_for_genus = read_pathseq_report_and_create_dict(pathseq_report_csv)

    # dump the YP-tagged reads to a table, then load that table back
    read_cell_names1(pathseq_bam_file, read_name_pathseq)
    set_of_readnames, dict_name = read_readnames(read_name_pathseq)

    # join those reads with the CB/UB-tagged unmapped reads of the main BAM
    read_cell_names2(
        set_of_readnames,
        dict_name,
        dict_for_genus,
        cellranger_bam_file,
        unmap_cbub_bam_file,
        unmap_cbub_fasta_file,
        out_cell_list,
        out_readname_cell_path,
        barcode_whitelist_file,
    )

    # collapse reads to UMIs, then UMIs to per-barcode genus counts
    barcode_UMI_dict = generate_barcode_UMI_dict(out_readname_cell_path)
    cell_metadata_dict = output_cells_genus_list(barcode_UMI_dict, dict_for_genus)

    # write the per-barcode summary and the count / validation tables
    output_cell_metadata(cell_metadata_dict, out_genus_file, sample_ident, barcode_whitelist_file)
    UMI_table_output(
        cell_metadata_dict,
        barcode_whitelist_file,
        sample_ident,
        output_UMI_table_csv,
        output_UMI_validate_table_csv,
    )
321 | 
322 | # cellranger_bam_file,
323 | # sample_ident,
324 | # barcode_whitelist_file,
325 | # pathseq_bam_file,
326 | # pathseq_report_csv,
327 | # read_name_pathseq,
328 | # unmap_cbub_bam_file,
329 | # unmap_cbub_fasta_file,
330 | # out_cell_list,
331 | # out_readname_cell_path,
332 | # out_genus_file,
333 | # output_UMI_table_csv,
334 | # output_UMI_validate_table_csv=sys.argv[1:]
335 | 


--------------------------------------------------------------------------------
/10X_Visium_samples/Visium.R:
--------------------------------------------------------------------------------
 1 | # add read count for V11A07-022_A1:
 2 | setwd("Visium/data_processing")
 3 | path <- 'Visium/raw_data/count'
 4 | sample_pattern = 'V'
 5 | samples <- sort(list.files(path, pattern=sample_pattern, full.names = TRUE))
 6 | sample.names <- basename(samples)
 7 | processed_foler = 'Visium/data_processing/python'
 8 | samples_folder = 'Visium/raw_data/count'
 9 | output_path = 'Visium/data_processing/rds'
10 | 
11 | # a loop for generating rds files
12 | for (each_sample in sample.names){
13 |     print(each_sample)
14 |     data_path = paste0(samples_folder,'/',each_sample,'/outs')
15 |     metadata_file = paste0(processed_foler,'/',each_sample,'.visium.raw_matrix.genus.csv')
16 |     output_file_filtered = paste0(output_path,'/',each_sample,'.filtered_matrix.rds')
17 |     output_file_raw = paste0(output_path,'/',each_sample,'.raw_matrix.rds')
18 | 
19 | # processing: filtered matrix
20 |     print('processing filtered matrix')
21 |     list.files(samples_folder) # Should show filtered_feature_bc_matrix.h5
22 |     tissue_sample<-Load10X_Spatial(data.dir = data_path,filename = "filtered_feature_bc_matrix.h5")
23 |     plot1 <- VlnPlot(tissue_sample, features = "nCount_Spatial", pt.size = 0.1) + NoLegend()
24 |     plot2 <- SpatialFeaturePlot(tissue_sample, features = "nCount_Spatial") + theme(legend.position = "right")
25 |     plot1+plot2
26 |     tissue_sample <- subset(tissue_sample, subset = nCount_Spatial > 3 & nFeature_Spatial > 3)
27 |     SpatialFeaturePlot(tissue_sample, features = "nCount_Spatial") + theme(legend.position = "right")
28 |     tissue_sample <- SCTransform(tissue_sample, assay = "Spatial")
29 | # delete original matrix to reduce size
30 |     tissue_sample@assays$Spatial=NULL
31 |     CellsMeta = tissue_sample@meta.data 
32 |     head(CellsMeta)
33 |     umi_table_csv = metadata_file
34 |     umi_table<-read.csv(umi_table_csv,sep=',',header=TRUE,row.names = 1)
35 |     umi_table$Total <- rowSums(umi_table)
36 |     umi_table[umi_table==0] <- NA
37 | # turn 0 to NA
38 |     tissue_sample<-AddMetaData(tissue_sample, umi_table)
39 | 
40 |     p1 = SpatialFeaturePlot(tissue_sample,features = c('Total'),pt.size.factor = 1.52) +
41 |                 ggtitle(paste(each_sample,' ','Total Pathogen'," nUMI Filtered", sep = "")) + 
42 |                 theme(legend.position = "right",plot.title = element_text(hjust = 0.5))
43 |     print(p1)
44 | # added 011322: cluster
45 | 
46 |     tissue_sample <- RunPCA(tissue_sample, assay = "SCT", verbose = FALSE)
47 |     tissue_sample <- FindNeighbors(tissue_sample, reduction = "pca", dims = 1:20)
48 |     tissue_sample <- FindClusters(tissue_sample, verbose = FALSE)
49 |     set.seed(123)
50 |     tissue_sample <- RunUMAP(tissue_sample, reduction = "pca", dims = 1:20) 
51 |     # find Spatially Variable Features
52 |     tissue_sample <- FindSpatiallyVariableFeatures(tissue_sample, assay = "SCT", features = VariableFeatures(tissue_sample)[1:1000],selection.method = "markvariogram")
53 | 
54 | # save filtered rds
55 |     saveRDS(tissue_sample, file = output_file_filtered)
56 | 
57 | # processing: raw matrix (mapping only)
58 |     print('processing raw matrix')
59 |     tissue_sample_raw<-Load10X_Spatial(data.dir = data_path,filename = "raw_feature_bc_matrix.h5")
60 |     tissue_sample_raw<-AddMetaData(tissue_sample_raw, umi_table)
61 | 
62 |     p2 = SpatialFeaturePlot(tissue_sample_raw,features = c('Total'),pt.size.factor = 1.52) +
63 |                 ggtitle(paste(each_sample,' ','Total Pathogen'," nUMI RAW", sep = "")) + 
64 |                 theme(legend.position = "right",plot.title = element_text(hjust = 0.5))
65 | 
66 |     print(p2)
67 | 
68 |     saveRDS(tissue_sample_raw, file = output_file_raw)
69 | 
70 | }
71 | 
72 | 
# Attach the per-barcode genus count tables to the two sample objects.
# NOTE(review): CRC_16.visium and OSCC_2.visium are not created anywhere in
# this script; they must already exist in the session (presumably loaded from
# the saved .rds files) - confirm before running.
read_table_csv = 'Visium/data_processing/python/CRC_16.visium.raw_matrix.genus.csv'
read_table<-read.csv(read_table_csv,sep=',',header=TRUE,row.names = 1)
CRC_16.visium<-AddMetaData(CRC_16.visium, read_table)

read_table_csv = 'Visium/data_processing/python/OSCC_2.visium.raw_matrix.genus.csv'
read_table<-read.csv(read_table_csv,sep=',',header=TRUE,row.names = 1)
# The OSCC_2 table belongs on the OSCC_2 object; the original attached it to
# CRC_16.visium, which overwrote OSCC_2.visium with a copy of the CRC sample.
OSCC_2.visium<-AddMetaData(OSCC_2.visium, read_table)
80 | 
81 | 


--------------------------------------------------------------------------------
/10X_Visium_samples/Visium_pipeline.sh:
--------------------------------------------------------------------------------
 1 | # The preprocessing pipeline for 10X Visium data, input of this pipeline is the output of spaceranger count
 2 | ml GATK/4.1.3.0-GCCcore-8.3.0-Java-1.8
 3 | ml Python
 4 | ml Pysam
 5 | 
 6 | `raw_data_folder` # the folder containing Spaceranger output folders 
 7 | `root` # working directory
 8 | `pathseqdb` # Pathseq database
 9 | 
10 | cd ${root}
11 | cd ${raw_data_folder}
12 | 
13 | # PathSeq pipeline
14 | outpath=${root}/pathseq
15 | mkdir ${outpath}
16 | # PathSeq process # Please adjust "-Xmx750g" based on the memory you want to use. Adjust --min-score-identity and --min-clipped-read-length based on your samples
17 | # 
18 | for folder in *
19 | do
20 | folder_name=${folder##*/}
21 | file=${folder}/outs/possorted_genome_bam.bam
22 | samplename=${folder_name}
23 | echo ${samplename}
24 | gatk --java-options "-Xmx750g" PathSeqPipelineSpark \
25 |     --input ${file} \
26 |     --filter-bwa-image ${pathseqdb}/pathseq_host.fa.img \
27 |     --kmer-file ${pathseqdb}/pathseq_host.bfi \
28 |     --min-clipped-read-length 60 \
29 |     --microbe-fasta ${pathseqdb}/pathseq_microbe.fa \
30 |     --microbe-bwa-image ${pathseqdb}/pathseq_microbe.fa.img \
31 |     --taxonomy-file ${pathseqdb}/pathseq_taxonomy.db \
32 |     --output ${outpath}/${samplename}.pathseq.complete.bam \
33 |     --scores-output ${outpath}/${samplename}.pathseq.complete.csv \
34 |     --is-host-aligned false \
35 |     --filter-duplicates false \
36 |     --min-score-identity .7
37 | done
38 | 
39 | # Python script to generate bacteria matrix
40 | bam_path=${raw_data_folder}
41 | pathseq_path=${root}/pathseq
42 | out_path=${root}/python
43 | mkdir ${out_path}
44 | cd ${bam_path}
45 | 
46 | for each_sample in *
47 | do
48 | echo ${each_sample}
49 | python UMI_annotator.py \
50 | ${bam_path}/${each_sample}/outs/possorted_genome_bam.bam \
51 | '' \
52 | ${bam_path}/${each_sample}/outs/raw_feature_bc_matrix/barcodes.tsv.gz \
53 | ${pathseq_path}/${each_sample}.pathseq.complete.bam \
54 | ${pathseq_path}/${each_sample}.pathseq.complete.csv \
55 | ${out_path}/${each_sample}.visium.raw_matrix.readname \
56 | ${out_path}/${each_sample}.visium.raw_matrix.unmap_cbub.bam \
57 | ${out_path}/${each_sample}.visium.raw_matrix.unmap_cbub.fasta \
58 | ${out_path}/${each_sample}.visium.raw_matrix.list \
59 | ${out_path}/${each_sample}.visium.raw.raw_matrix.readnamepath \
60 | ${out_path}/${each_sample}.visium.raw_matrix.genus.cell \
61 | ${out_path}/${each_sample}.visium.raw_matrix.genus.csv \
62 | ${out_path}/${each_sample}.visium.raw_matrix.validate.csv
63 | done
64 | 
65 | 


--------------------------------------------------------------------------------
/10X_Visium_samples/validate_and_count.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | # what we want to know:
  4 | #    1. sample name
  5 | #    2. genera list
  6 | #    2.5 number of this type of cells
  7 | #    3. UMI for each genus for this sample
  8 | #    4. read number for each genus for this sample
  9 | # First, for target sample, extract the Bacteria Positive cells
 10 | # Then using Validate csv, extract and count UMIs
 11 | # Then use CellsMeta file, count reads
 12 | 
 13 | def extract_bac_pos_cells(metadata_file, orig_ident):
 14 |     print(orig_ident)
 15 |     cell_names_set = set()
 16 |     metadata = open(metadata_file,'r')
 17 |     n=0
 18 |     # read in metadata
 19 |     for each_line in metadata:
 20 |         each_line = each_line.rstrip('\n')
 21 |         each_line_list = each_line.split(',')
 22 |         if not orig_ident == '':
 23 |             sample_name = each_line_list[0]
 24 |         else:
 25 |             sample_name = ''
 26 |         if n == 0:
 27 |             k=0
 28 |             for each_item in each_line_list:
 29 |                 if each_item == "Total":
 30 |                     position = k
 31 |                 k+=1
 32 |             #print ('found k : ',k)
 33 |             #print ('found position : ',position)
 34 |         if n > 0 :
 35 |             if orig_ident in sample_name:
 36 |                 if int(each_line_list[position]) > 0:
 37 |                     cell_names = each_line_list[0].split('_')[-1]
 38 |                     cell_names_set.add(cell_names)
 39 |         n+=1
 40 |     print('len(cell_names_set) = ',len(cell_names_set))
 41 |     return cell_names_set
 42 | 
 43 | # validate_dict is the dict for cell_UMI -> genus
 44 | def extract_UMI(validate_file, cell_names_set):
 45 |     #print(cell_names_set)
 46 |     validate_dict = {}
 47 |     genus_set = set()
 48 |     validate = open(validate_file,'r')
 49 |     cell_name_set = set()
 50 |     for each_line in validate:
 51 |         each_line = each_line.rstrip('\n')
 52 |         cell_name = each_line.split('+')[0]
 53 |         cell_name_set.add(cell_name)
 54 |         cell_name_barcode = each_line.split(',')[0]
 55 |         #print(cell_name_barcode)
 56 |         genus = each_line.split(',')[1]
 57 |         if cell_name in cell_names_set:
 58 |             validate_dict[cell_name_barcode] = genus
 59 |             genus_set.add(genus)
 60 |     #print(cell_name_set)
 61 |     return validate_dict,genus_set
 62 | 
 63 | # sum dict is the dict for cell_UMI -> list of readnames
 64 | def count_read(cellsmeta_file,validate_dict):
 65 |     cellsmeta = open(cellsmeta_file,'r')
 66 |     sum_dict = {}
 67 |     for each_line in cellsmeta:
 68 |         each_line = each_line.rstrip('\n')
 69 |         each_line_list = each_line.split('\t')
 70 |         if not ',' in each_line_list[-1]:
 71 |             read_name = each_line_list[0]
 72 |             cell_name = each_line_list[1]
 73 |             barcode = each_line_list[2]
 74 |             cell_barcode = cell_name + '+' + barcode
 75 |             if cell_barcode in validate_dict:
 76 |                 if not cell_barcode in sum_dict:
 77 |                     sum_dict[cell_barcode] = []
 78 |                     sum_dict[cell_barcode].append(read_name)
 79 |                 else:
 80 |                     sum_dict[cell_barcode].append(read_name)
 81 |     for each_cell_barcode in sum_dict:
 82 |         sum_dict[each_cell_barcode] = list(set(sum_dict[each_cell_barcode]))
 83 |     return sum_dict
 84 | 
 85 | # first, loop the genus names within each sample
 86 | # for each genus names, extract and count cell_UMI, cell
 87 | # for each extracted cell_UMI, count number of *unique readnames, add together
 88 | # will be in a dict: genus[number of cell, number of UMI, number of reads]
 89 | # 
 90 | # validate_dict is the dict for cell_UMI -> genus
 91 | # sum dict is the dict for cell_UMI -> readnames
 92 | def summarize_read(sum_dict,validate_dict,genus_set):
 93 |     genus_sum_dict = {}
 94 |     for each_genus in genus_set:
 95 |         for each_cell_UMI in validate_dict:
 96 |             if validate_dict[each_cell_UMI] == each_genus:
 97 |                 cell_barcode = each_cell_UMI.split('+')[0]
 98 |                 #print(cell_barcode)
 99 |                 #print(each_cell_UMI)
100 |                 read_list = sum_dict[each_cell_UMI]
101 |                 if not each_genus in genus_sum_dict:
102 |                     genus_sum_dict[each_genus] = {}
103 |                     genus_sum_dict[each_genus]['cell_list']=[]
104 |                     genus_sum_dict[each_genus]['UMI_list']=[]
105 |                     genus_sum_dict[each_genus]['reads_list']=[]
106 |                 genus_sum_dict[each_genus]['cell_list'].append(cell_barcode)
107 |                 genus_sum_dict[each_genus]['UMI_list'].append(each_cell_UMI)
108 |                 genus_sum_dict[each_genus]['reads_list'] = genus_sum_dict[each_genus]['reads_list'] + read_list 
109 |     # then convert it to count dict
110 |     return genus_sum_dict #genus_count_dict
111 | 
112 | # this function is not used in visium analysis 
def add_dicts(genus_sum_dict_1,genus_sum_dict_2):
    """Merge two per-genus summaries and return the distinct counts.

    ``genus_sum_dict_1`` is updated IN PLACE: for a genus present in both
    inputs the three lists of the second input are appended to those of the
    first; for a genus only in the second input a new entry is created that
    references the second input's list objects (no copy is made).

    Args:
        genus_sum_dict_1: ``genus -> {"cell_list", "UMI_list", "reads_list"}``;
            receives the merged data.
        genus_sum_dict_2: Same structure; merged into the first.

    Returns:
        dict mapping genus to ``{"cell": n, "UMI": n, "reads": n}``, the
        number of distinct values in each merged list.
    """
    list_keys = ('cell_list', 'UMI_list', 'reads_list')
    for genus, incoming in genus_sum_dict_2.items():
        if genus in genus_sum_dict_1:
            merged = genus_sum_dict_1[genus]
            for key in list_keys:
                merged[key] += incoming[key]
        else:
            merged = genus_sum_dict_1[genus] = {}
            for key in list_keys:
                merged[key] = incoming[key]

    # then convert it to count dict
    genus_count_dict = {}
    for genus, merged in genus_sum_dict_1.items():
        distinct_cells = len(set(merged['cell_list']))
        distinct_umis = len(set(merged['UMI_list']))
        distinct_reads = len(set(merged['reads_list']))
        genus_count_dict[genus] = {
            'cell': distinct_cells,
            'UMI': distinct_umis,
            'reads': distinct_reads,
        }
    return genus_count_dict
143 | # 061322 update:
144 | # add a function to output readnames for each genera! (maybe: cb/ub are included in the header)
def output_readnames(genus_sum_dict,output_path):
    """Write the read names collected for each genus to its own file.

    For every genus key in ``genus_sum_dict`` a file named
    ``<genus>.csv`` is created inside ``output_path`` (which must already
    exist) containing one entry of that genus' 'reads_list' per line.

    Returns None.
    """
    # there are multiple genera, so we create one file for each
    for each_genus in genus_sum_dict:
        file_name = each_genus+'.csv'
        # the with-block closes (and flushes) each file before the next
        # one is opened, instead of leaving that to garbage collection
        with open(output_path+'/'+file_name,'w') as write_reads:
            for each_readname in genus_sum_dict[each_genus]['reads_list']:
                write_reads.write(each_readname+'\n')
    return
155 | 
156 | # instead of add_dicts, for visium I use a count_dicts instead
def count_dicts(genus_sum_dict):
    """Collapse a per-genus summary into per-genus unique counts.

    ``genus_sum_dict`` maps a genus to a dict with the lists 'cell_list',
    'UMI_list' and 'reads_list'.  The result maps each genus to a dict
    with the keys 'cell', 'UMI' and 'reads', each holding the number of
    distinct entries in the corresponding list.
    """
    genus_count_dict = {}
    for genus, summary in genus_sum_dict.items():
        genus_count_dict[genus] = {
            'cell': len(set(summary['cell_list'])),
            'UMI': len(set(summary['UMI_list'])),
            'reads': len(set(summary['reads_list'])),
        }
    return genus_count_dict
173 | 
def output_read(output_file_name, genus_count_dict):
    """Write the per-genus counts to a CSV file.

    ``genus_count_dict`` maps a genus to a dict with the keys 'cell',
    'UMI' and 'reads'.  The file gets a header line followed by one
    ``genus,cells,UMIs,reads`` line per genus, in dict iteration order.

    Returns None.
    """
    # the with-block guarantees the file is flushed and closed on return
    with open(output_file_name,'w') as output_file:
        output_file.write('Genus,Number_of_Cells,Number_of_UMI,Number_of_reads\n')
        for each_genus, counts in genus_count_dict.items():
            output_line = ','.join([
                each_genus,
                str(counts['cell']),
                str(counts['UMI']),
                str(counts['reads']),
            ]) + '\n'
            output_file.write(output_line)
    return
185 | 
print('start processing')
# CRC_16 and OSCC_02 go through exactly the same steps; only the sample
# name embedded in the input/output paths differs.
for sample_name in ('CRC_16', 'OSCC_02'):
    print(sample_name)
    # spots/cells to keep, taken from the sample metadata
    metadata_file = 'data_processing/selected_samples_for_counting/sample.visium_' + sample_name + '_metadata.csv'
    orig_ident = ''
    cell_names_set = extract_bac_pos_cells(metadata_file, orig_ident)
    # barcode+UMI -> genus assignments restricted to those cells
    validate_csv = 'data_processing/python/' + sample_name + '.visium.raw_matrix.validate.csv'
    validate_dict,genus_set = extract_UMI(validate_csv, cell_names_set)
    # reads behind every retained barcode+UMI
    readnamepath_csv = 'data_processing/python/' + sample_name + '.visium.raw.raw_matrix.readnamepath'
    sum_dict = count_read(readnamepath_csv,validate_dict)
    genus_sum_dict = summarize_read(sum_dict,validate_dict,genus_set)
    # one read-name file per genus, then the per-genus count table
    readname_path = 'data_processing/selected_samples_for_counting/' + sample_name
    output_readnames(genus_sum_dict,readname_path)
    output_file = 'data_processing/selected_samples_for_counting/'+sample_name+'.sum.csv'
    genus_count_dict = count_dicts(genus_sum_dict)
    output_read(output_file, genus_count_dict)
221 | 
222 |  
223 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 Fred Hutchinson Cancer Center
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Galeano-Nino-Bullman-Intratumoral-Microbiota-2022
 2 | 
 3 | 
 4 | [![DOI](https://zenodo.org/badge/530442339.svg)](https://zenodo.org/badge/latestdoi/530442339)
 5 | 
 6 | 
 7 | Analysis code used in Galeano Nino et al., Effect of the intratumoral microbiota on spatial and cellular heterogeneity in cancer. 2022
 8 | 
 9 | The code in this repository is organized to reflect the description in the Methods
10 | section of Galeano Nino et al., Effect of the intratumoral microbiota on spatial and cellular heterogeneity in cancer. 2022.
11 | ## 10X Visium Scans for CRC and OSCC samples
12 | 10X Visium Scans associated with manuscript submission are uploaded to AWS and Zenodo. 
13 | 
14 | Tiff files can be accessed via: https://fh-pi-bullman-s-eco-public.s3.us-west-2.amazonaws.com/DataTransfer/Galeano_Nino_et_al_visium_scans/CRC_OSCC_visium_tiff.tar.gz and https://doi.org/10.5281/zenodo.7419806
15 | 
16 | Please note for sample `CRC_16`, the slide id is `V10S15-020` and area code is `D1`; for sample `OSCC_2`, the slide id is `V11A07-022` and area code is `A1`.
17 | 
18 | We also uploaded fastq files to AWS for your convenience: https://fh-pi-bullman-s-eco-public.s3.us-west-2.amazonaws.com/DataTransfer/Galeano_Nino_et_al_visium_scans/CRC_OSCC_visium_fastq.tar.gz
19 | 
20 | ## Environment and Reference Data
21 | 
22 | ### Environment
23 | 
24 | All of the analysis code documented in this repository was run on the shared computing cluster
25 | maintained at the Fred Hutchinson Cancer Research Center between May 2020 and August 2022.
26 | The software dependencies used by these scripts are provided using the EasyBuild installation
27 | maintained by the Fred Hutch Scientific Computing group.
28 | Those software dependencies are loaded into the environment with the `ml` command (e.g. `ml CellRanger/6.1.1`).
29 | 
30 | ### Reference Data
31 | 
32 | Prior to running the analysis scripts, reference databases were downloaded for PathSeq (December 2020)
33 | and CellRanger (January 2022).
34 | The location of those reference databases is provided to the analysis scripts using the environment variables `pathseqdb` and `cellrangerdb`.
35 | 
36 | # Overview of the Computational Pipeline for Bacteria-associated Spots/Cells Annotation
37 | 
38 | ## Part 1: 10x Visium spatial transcriptomic data
39 |    1. Identification of microbial reads within 10x Visium spatial transcriptomic data generated by 10x Space Ranger Count (`Visium_pipeline.sh`)
40 |    2. Bioinformatic analysis of 10x Visium spatial transcriptomic data (`Visium.R`)
41 |    3. Summarize the numbers of bacterial reads and UMIs in the 10X Visium data (`validate_and_count.py`). The folder used as outputs from the previous steps should be provided as an argument to the `Visium_pipeline.sh` script.
42 | ###   Output Data:
43 |    - `CRC_16.visium.raw_matrix.genus.csv` and `OSCC_2.visium.raw_matrix.genus.csv` contain the bacterial UMI count matrices that can be used as metadata when processing the Visium data
44 |    - `CRC_16.visium.raw_matrix.validate.csv` and `OSCC_2.visium.raw_matrix.validate.csv` contain validation data that can be used as the input of `validate_and_count.py`
45 | 
46 | ## Part 2: 10x Single cell data (For cell culture samples and patient samples)
47 | ###   Input Data:
48 |    - All of the input data for this analysis is provided in FASTQ format generated by the CellRanger `mkfastq` command
49 |    - The folder containing those FASTQ files is set to the environment variable `raw_data_folder`
50 | ###   Preprocess:
51 |    1. Identification of microbial reads within single cells GEX libraries (`patient_samples_GEX_pipeline.sh` and `cell_culture_samples_GEX_pipeline.sh`)
52 |    2. INVADEseq bacterial 16S rRNA gene libraries (`patient_samples_16s_pipeline.sh` and `cell_culture_16s_pipeline.sh`). The variable `gex_bam_path` should be set to the output folder from the `patient_samples_GEX_pipeline.sh` and `cell_culture_samples_GEX_pipeline.sh` script.
53 |    3. Combination and deduplication of microbial metadata from steps 1 & 2 (`merge_metadata.py` and `metadata_dedup.py`). The folder used as outputs from the previous steps should be provided as an argument to the `merge_metadata.py` script.
54 | ###   Output Data:
55 |    - `headneck_gex_16s_mix_dedup_updated.csv`, `HT_29_gex_16s_mix_dedup.csv` and `HCT_116_gex_16s_mix_dedup.csv` contain the bacterial UMI count matrices that can be used as Seurat object metadata when processing the single-cell data.
56 | 
57 | ###   Processing of single cell data
58 |    1. Seurat data processing, Harmony integration, SingleR annotation and copyKAT prediction (`patient_samples_Seurat.r` and `cell_culture_Seurat.r`)
59 |    2. Differential expression analysis and GSEA (`DE.r`)
60 | 


--------------------------------------------------------------------------------
/cell_culture_samples/DE.r:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | library(clusterProfiler)
 3 | library(org.Hs.eg.db)
 4 | library(msigdbr)
 5 | 
# Full msigdbr() table for human; not used elsewhere in this script.
m_df <- msigdbr(species = "Homo sapiens")

# Category "H" gene sets reduced to the two columns (gene set name, gene
# symbol); this table is passed as TERM2GENE to GSEA() inside DE_GSEA().
m_H <- msigdbr(species = "Homo sapiens", category = "H") %>% 
  dplyr::select(gs_name, gene_symbol)
10 | 
11 | 
DE_GSEA <- function(seurat_object,
                    ident_1,
                    ident_2,
                    group_by,
                    seurat_object.markers_filename,
                    seurat_object.markers_filtered_filename,
                    seurat_object.markers_gsea_filename){
    # Differential expression between two groups followed by GSEA.
    #
    # seurat_object                            object handed to FindMarkers()
    # ident_1, ident_2                         the two groups to compare
    # group_by                                 grouping passed as group.by
    # seurat_object.markers_filename           CSV for the FindMarkers() table
    # seurat_object.markers_filtered_filename  CSV for the same table sorted by
    #                                          decreasing avg_log2FC (no rows
    #                                          are removed)
    # seurat_object.markers_gsea_filename      CSV for the GSEA() result
    #
    # Relies on the global gene-set table m_H as TERM2GENE.

    # Step 1: test every gene (no log-fold-change cut-off) and save the table.
    de_table <- FindMarkers(seurat_object,
                            ident.1 = ident_1,
                            ident.2 = ident_2,
                            group.by = group_by,
                            logfc.threshold = -Inf,
                            min.pct = 0.1)
    write.csv(de_table, seurat_object.markers_filename, row.names = TRUE)

    # Step 2: order rows from highest to lowest avg_log2FC and save again.
    ranked_table <- de_table[order(-de_table$avg_log2FC), ]
    write.csv(ranked_table, seurat_object.markers_filtered_filename, row.names = TRUE)

    # Step 3: named, decreasing avg_log2FC vector (names are the row names).
    ranked_lfc <- ranked_table[, "avg_log2FC"]
    names(ranked_lfc) <- as.character(rownames(ranked_table))
    ranked_lfc <- sort(ranked_lfc, decreasing = TRUE)

    # Step 4: GSEA against m_H and save the result.
    gsea_result <- GSEA(ranked_lfc,
                        TERM2GENE = m_H,
                        eps = 0.0,
                        by = "fgsea")
    write.csv(gsea_result, seurat_object.markers_gsea_filename, row.names = FALSE)
}
45 | 
46 | 


--------------------------------------------------------------------------------
/cell_culture_samples/INVADEseq.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | import pysam
  3 | import sys
  4 | import gzip
  5 | 
  6 | def read_cell_names1(pathseq_bam_file, write_bac):
  7 |     seqbam = pysam.AlignmentFile(pathseq_bam_file, "rb",threads=36)
  8 |     read_name_pathseq = open(write_bac,'w')
  9 |     total_pathseq_reads=0
 10 |     total_YP_reads=0
 11 |     for each_line in seqbam:
 12 |         total_pathseq_reads+=1
 13 |         if each_line.has_tag('YP'):
 14 |             total_YP_reads+=1
 15 |             outline = each_line.query_name + '\t' + each_line.get_tag('YP') + '\t' + str(each_line.mapping_quality) + '\n'
 16 |             read_name_pathseq.write(outline)
 17 |     print('Total reads in pathseq bam = ',total_pathseq_reads)
 18 |     print('Total reads in pathseq bam with YP tag  = ',total_YP_reads)
 19 |     return
 20 | 
 21 | def read_readnames(readname_file):
 22 |     set_for_readnames = set()
 23 |     dict_name = {}
 24 |     with open (readname_file,'r') as r:
 25 |         for each_line in r:
 26 |             each_line = each_line.rstrip('\n')
 27 |             each_line_list = each_line.split('\t')
 28 |             set_for_readnames.add(each_line_list[0])
 29 |             dict_name[each_line_list[0]] = {}
 30 |             dict_name[each_line_list[0]]["pathogen"] = each_line_list[1]
 31 |             dict_name[each_line_list[0]]["mapping_score"] = each_line_list[2]
 32 |     return set_for_readnames, dict_name
 33 | 
 34 | def read_pathseq_report_and_create_dict(pathseq_report_csv):
 35 |     pathseq_report = open(pathseq_report_csv,'r')
 36 |     dict_for_genus = {}
 37 |     set_for_genera = set()
 38 |     for each_line in pathseq_report:
 39 |         each_line = each_line.rstrip('\n')
 40 |         each_line_list = each_line.split('\t')
 41 |         level = each_line_list[2]
 42 |         tax = each_line_list[3]
 43 |         if level == 'genus':
 44 |             set_for_genera.add(tax)
 45 |         if '|' in each_line_list[1]:
 46 |             name_string_list = each_line_list[1].split('|')
 47 |             for n in range(len(name_string_list)):
 48 |                 pointer = -n-1           
 49 |                 if not '_' in name_string_list[pointer]:
 50 |                     name = name_string_list[pointer]
 51 |                     break
 52 |                 if 'unclassified' in name_string_list[pointer]:
 53 |                     name = name_string_list[pointer]
 54 |                     break         
 55 |             id = each_line_list[0]
 56 |             dict_for_genus[id] = name
 57 |     print ("len(dict_for_genus) = ",len(dict_for_genus))
 58 |     return dict_for_genus
 59 | def read_cell_names2(set_of_readnames, dict_name, dict_for_genus,original_bam_file,unmap_cbub_bam_file,unmap_cbub_fasta_file, out_cell_list,out_readname_cell_path,barcode_whitelist_file):
 60 |     white_list_set = set()
 61 |     white_list = gzip.open(barcode_whitelist_file, 'rt')
 62 |     for each_line in white_list:
 63 |         each_line = each_line.rstrip('\n')
 64 |         white_list_set.add(each_line)
 65 | 
 66 |     seqbam = pysam.AlignmentFile(original_bam_file, "rb",threads=36)
 67 |     readname_cell_path = open(out_readname_cell_path,'w')
 68 |     unmap_cbub_fasta = open(unmap_cbub_fasta_file,'w')
 69 |     unmap_cbub_bam = pysam.AlignmentFile(unmap_cbub_bam_file, "wb", seqbam)
 70 | 
 71 |     set_for_infect_cells=set()
 72 |     total_cellranger_bam_reads = 0
 73 |     total_cellranger_reads_UB_CB_tags = 0
 74 |     total_cellranger_reads_UB_CB_unmap = 0
 75 |     total_cellranger_reads_UB_CB_unmap_Aligned_to_Pathseq_YP_reads = 0
 76 |     total_potential_UMI_including_ambigious_reads = set()
 77 |     for each_line in seqbam:
 78 |         total_cellranger_bam_reads+=1
 79 |         if each_line.has_tag('CB') and each_line.has_tag('UB'):
 80 |             if each_line.get_tag('CB') in white_list_set:
 81 |                 total_cellranger_reads_UB_CB_tags+=1
 82 |                 if each_line.is_unmapped:
 83 |                     total_cellranger_reads_UB_CB_unmap+=1
 84 |                     # added 102721: output a fasta file for kraken
 85 |                     query_name_in_cellranger_bam = each_line.query_name
 86 |                     seq_in_cellranger_bam = each_line.query_sequence
 87 |                     unmap_cbub_fasta.write('>')
 88 |                     unmap_cbub_fasta.write(query_name_in_cellranger_bam)
 89 |                     unmap_cbub_fasta.write('\n')
 90 |                     unmap_cbub_fasta.write(seq_in_cellranger_bam)
 91 |                     unmap_cbub_fasta.write('\n')
 92 |                     unmap_cbub_bam.write(each_line)
 93 |                     if each_line.query_name in set_of_readnames:
 94 |                         set_for_infect_cells.add(each_line.get_tag('CB'))
 95 |                         readname = each_line.query_name
 96 |                         cellname = each_line.get_tag('CB')
 97 |                         umi = each_line.get_tag('UB')
 98 |                         path = dict_name[readname]["pathogen"]
 99 |                         id_string_list = path.split(',')
100 |                         genus_list = []
101 |                         for each_id in id_string_list:
102 |                             if each_id in dict_for_genus:
103 |                                 genus = dict_for_genus[each_id]
104 |                                 genus_list.append(genus)
105 |                             else:
106 |                                 print(each_id,"  not found!")
107 |                         genus_list = list(set(genus_list))
108 |                         genus_list.sort()
109 |                         genus_list_string = ','.join(genus_list)           
110 |                         mapping_score = dict_name[readname]["mapping_score"]
111 |                         outline = readname+'\t'+cellname+'\t'+umi+'\t'+path+'\t'+mapping_score+'\t'+genus_list_string+'\n'
112 |                         readname_cell_path.write(outline)
113 |                         total_potential_UMI_including_ambigious_reads.add(umi)
114 |                         total_cellranger_reads_UB_CB_unmap_Aligned_to_Pathseq_YP_reads+=1
115 |     print('total cellranger bam reads = ',total_cellranger_bam_reads)
116 |     print('total cellranger bam reads with UB CB tags (in-cell) = ',total_cellranger_reads_UB_CB_tags)
117 |     print('total UNMAPPED cellranger bam reads with UB CB tags (in-cell) = ',total_cellranger_reads_UB_CB_unmap)
118 |     print('total cellranger reads with UB_CB_unmap Aligned to Pathseq reads with YP tags = (in-cell)',total_cellranger_reads_UB_CB_unmap_Aligned_to_Pathseq_YP_reads)
119 |     cell_list = open(out_cell_list,'w')
120 |     for each_cell in set_for_infect_cells:
121 |         cell_list.write(each_cell)
122 |         cell_list.write('\n')
123 |     return 
124 | 
125 | 
def generate_barcode_UMI_dict(out_readname_cell_path):
    """Keep the best-scoring read for every barcode+UMI combination.

    Reads the tab-separated file ``out_readname_cell_path`` whose columns
    are: read name, cell barcode, UMI, comma-separated ids, mapping score
    and genus string.  Reads are grouped under the key ``barcode+'+'+UMI``
    and, per key, the read with the highest integer mapping score wins
    (on a tie the first one seen is kept).

    Returns a dict mapping each key to
    ``{"id_string": [ids], "mapping_score": int, "genus_string": str}``.
    """
    barcode_UMI_dict = {}
    with open(out_readname_cell_path,'r') as cell_path_file:
        for each_line in cell_path_file:
            each_line_list = each_line.rstrip('\n').split('\t')
            barcode_UMI = each_line_list[1]+'+'+each_line_list[2]
            id_string_list = each_line_list[3].split(',')
            mapping_score = int(each_line_list[4])
            genus_string = each_line_list[5]
            current_best = barcode_UMI_dict.get(barcode_UMI)
            # strictly greater: an equal score never displaces the stored read
            if current_best is None or mapping_score > current_best["mapping_score"]:
                barcode_UMI_dict[barcode_UMI] = {
                    "id_string": id_string_list,
                    "mapping_score": mapping_score,
                    "genus_string": genus_string,
                }
    return barcode_UMI_dict
150 | 
def output_cells_genus_list(barcode_UMI_dict,dict_for_genus):
    """Build per-cell metadata from the unambiguous barcode+UMI entries.

    An entry of ``barcode_UMI_dict`` is kept only when its "genus_string"
    has no comma and the ids in its "id_string" that are known to
    ``dict_for_genus`` resolve to exactly one distinct name.  The number
    of kept entries is printed.

    Returns a dict keyed by cell barcode (the part of the key before
    '+') with, per cell:
      'genus'          - list with one name per kept UMI,
      'barcode_UMI'    - dict barcode+UMI -> name,
      'pathogen_count' - dict name -> number of kept UMIs.
    """
    # Resolve every single-genus UMI to its one name, dropping the rest.
    genus_by_umi = {}
    for umi_key, record in barcode_UMI_dict.items():
        if ',' in record["genus_string"]:
            continue
        resolved = {
            dict_for_genus[tax_id]
            for tax_id in record["id_string"]
            if tax_id in dict_for_genus
        }
        if len(resolved) == 1:#only keep unambigious UMI
            genus_by_umi[umi_key] = next(iter(resolved))
    print('Total unambigious UMI = ',len(genus_by_umi))

    # Group the kept UMIs by cell barcode.
    per_cell = {}
    for umi_key, genus in genus_by_umi.items():
        pieces = umi_key.split('+')
        barcode, _umi = pieces[0], pieces[1]
        entry = per_cell.setdefault(
            barcode, {'genus': [], 'barcode_UMI': {}, 'pathogen_count': {}})
        entry['genus'].append(genus)
        entry['barcode_UMI'][umi_key] = genus
        entry['pathogen_count'][genus] = entry['pathogen_count'].get(genus, 0) + 1
    return per_cell
198 | 
def output_cell_metadata(cell_metadata_dict,out_genus_file,sample_ident,barcode_whitelist_file):
    """Write one summary line per whitelisted cell to a CSV file.

    ``cell_metadata_dict`` maps a barcode to a dict whose 'pathogen_count'
    entry maps a genus to its UMI count.  Only barcodes listed in the
    gzip-compressed ``barcode_whitelist_file`` (one per line) are written.
    The cell name is ``sample_ident + '_' + barcode``, or the bare barcode
    when ``sample_ident`` is the empty string.

    Columns: cell_name, pathogen, UMI_count, pathogen_count.
      * When a single genus has the highest count, ``pathogen`` is that
        genus and ``UMI_count`` is its count.
      * When several genera tie for the highest count, ``pathogen`` is
        'MULTI' and ``UMI_count`` is the sum over all genera.
      * ``pathogen_count`` lists every ``genus:count`` pair, sorted and
        joined with ';'.

    The number of cells before and after whitelist filtering is printed.
    Returns None.
    """
    print('total pathogen-associated gems = ', len(cell_metadata_dict))
    with gzip.open(barcode_whitelist_file, 'rt') as white_list:
        white_list_set = {each_line.rstrip('\n') for each_line in white_list}
    cell_metadata_dict = {
        barcode: metadata
        for barcode, metadata in cell_metadata_dict.items()
        if barcode in white_list_set
    }
    print("total filtered pathogen-associated cells = ", len(cell_metadata_dict))

    with open(out_genus_file,'w') as genus_file:
        genus_file.write('cell_name,pathogen,UMI_count,pathogen_count\n')
        for barcode, metadata in cell_metadata_dict.items():
            if not sample_ident == '':
                cell_name = sample_ident+'_'+barcode
            else:
                cell_name = barcode
            pathogen_counts = metadata['pathogen_count']
            pathogen_count_str = ';'.join(sorted(
                each_pathogen+':'+str(count)
                for each_pathogen, count in pathogen_counts.items()))

            max_count = max(pathogen_counts.values())
            top_genera = [key for key, value in pathogen_counts.items() if value == max_count]
            if len(top_genera) > 1:
                # tie for the top spot: no single genus can be assigned
                genus = 'MULTI'
                UMI_count = sum(pathogen_counts.values())
            else:
                genus = top_genera[0]
                UMI_count = max_count
            if UMI_count >= 1:
                genus_file.write(','.join([cell_name,genus,str(UMI_count),pathogen_count_str])+'\n')
    return
259 | 
260 | 
def UMI_table_output(cell_metadata_dict,barcode_whitelist_file,sample_ident,output_UMI_table_csv,output_UMI_validate_table_csv):
    """Write the per-cell genus count matrix and the UMI validation table.

    Only barcodes listed in the gzip-compressed ``barcode_whitelist_file``
    (one per line) are kept; the size of the whitelist is printed.

    ``output_UMI_validate_table_csv`` receives one ``barcode+UMI,genus``
    line for every entry of each kept cell's 'barcode_UMI' dict.

    ``output_UMI_table_csv`` receives a header ``barcode,<genera>`` with
    the genera sorted alphabetically, followed by one line per kept cell
    holding its 'pathogen_count' value for each genus (0 when absent).
    The cell name is ``sample_ident + '_' + barcode``, or the bare barcode
    when ``sample_ident`` is the empty string.

    Returns None.
    """
    with gzip.open(barcode_whitelist_file, 'rt') as white_list:
        white_list_set = {each_line.rstrip('\n') for each_line in white_list}
    print("total number of cells = ", len(white_list_set))
    cell_metadata_dict = {
        barcode: metadata
        for barcode, metadata in cell_metadata_dict.items()
        if barcode in white_list_set
    }

    with open(output_UMI_validate_table_csv,'w') as output_UMI_validate_table:
        for each_cell in cell_metadata_dict:
            for UMI, pathogen in cell_metadata_dict[each_cell]['barcode_UMI'].items():
                output_UMI_validate_table.write(UMI+','+pathogen+'\n')

    # the columns are the union of all genera seen in the kept cells
    genera_list = sorted({
        pathogen
        for metadata in cell_metadata_dict.values()
        for pathogen in metadata['pathogen_count']
    })
    with open(output_UMI_table_csv,'w') as output_UMI_table:
        output_UMI_table.write(','.join(['barcode']+genera_list))
        output_UMI_table.write('\n')
        for barcode, metadata in cell_metadata_dict.items():
            if not sample_ident == '':
                cell_name = sample_ident+'_'+barcode
            else:
                cell_name = barcode
            pathogen_counts = metadata['pathogen_count']
            genera_count_list = [str(pathogen_counts.get(each_genus, 0)) for each_genus in genera_list]
            output_UMI_table.write(','.join([cell_name]+genera_count_list))
            output_UMI_table.write('\n')
    return
308 | 
if __name__ == "__main__":
    # Exactly 13 positional command-line arguments are expected, in the
    # order of the names unpacked below.
    (
        cellranger_bam_file,
        sample_ident,
        barcode_whitelist_file,
        pathseq_bam_file,
        pathseq_report_csv,
        read_name_pathseq,
        unmap_cbub_bam_file,
        unmap_cbub_fasta_file,
        out_cell_list,
        out_readname_cell_path,
        out_genus_file,
        output_UMI_table_csv,
        output_UMI_validate_table_csv,
    ) = sys.argv[1:]

    # id -> taxon name lookup from the report
    dict_for_genus = read_pathseq_report_and_create_dict(pathseq_report_csv)
    # dump the tagged reads, then load them back as a set plus a lookup dict
    read_cell_names1(pathseq_bam_file, read_name_pathseq)
    set_of_readnames, dict_name = read_readnames(read_name_pathseq)
    # attach cell barcode and UMI to those reads
    read_cell_names2(set_of_readnames, dict_name, dict_for_genus,cellranger_bam_file,unmap_cbub_bam_file,unmap_cbub_fasta_file, out_cell_list,out_readname_cell_path,barcode_whitelist_file)
    # collapse to one record per barcode+UMI, then to per-cell metadata
    barcode_UMI_dict = generate_barcode_UMI_dict(out_readname_cell_path)
    cell_metadata_dict = output_cells_genus_list(barcode_UMI_dict,dict_for_genus)

    # write the per-cell summary and the two UMI tables
    output_cell_metadata(cell_metadata_dict,out_genus_file,sample_ident,barcode_whitelist_file)
    UMI_table_output(cell_metadata_dict,barcode_whitelist_file,sample_ident,output_UMI_table_csv,output_UMI_validate_table_csv)
321 | 
322 | # cellranger_bam_file,
323 | # sample_ident,
324 | # barcode_whitelist_file,
325 | # pathseq_bam_file,
326 | # pathseq_report_csv,
327 | # read_name_pathseq,
328 | # unmap_cbub_bam_file,
329 | # unmap_cbub_fasta_file,
330 | # out_cell_list,
331 | # out_readname_cell_path,
332 | # out_genus_file,
333 | # output_UMI_table_csv,
334 | # output_UMI_validate_table_csv=sys.argv[1:]
335 | 


--------------------------------------------------------------------------------
/cell_culture_samples/cell_culture_16s_pipeline.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | ml CellRanger/6.1.1
  3 | 
  4 | ml BEDTools/2.29.2-GCC-9.3.0
  5 | ml SAMtools/1.16.1-GCC-11.2.0
  6 | ml FastQC/0.11.9-Java-11
  7 | ml Trimmomatic/0.39-Java-11
  8 | ml picard/2.21.6-Java-11
  9 | 
 10 | ml GATK/4.1.3.0-GCCcore-8.3.0-Java-1.8
 11 | 
 12 | ml Python
 13 | ml Pysam
 14 | 
 15 | `raw_data_folder` # the folder containing Cellranger mkfastq output folders
 16 | `root` # working directory
 17 | `pathseqdb` # Pathseq database
 18 | `cellrangerdb` # Cellranger database
 19 | `gex_bam_path` # barcodes.tsv.gz from GEX is used as a 'whitelist' for real cells
 20 | 
 21 | root=${workdir}
 22 | # Run cellranger count
 23 | cd ${workdir}
 24 | mkdir cellranger_count
 25 | cd cellranger_count
 26 | for folder in ${raw_data_folder}/*
 27 | do
 28 | folder_name=${folder##*/}
 29 | path=${folder}
 30 | cellranger count \
 31 | --id=${folder_name} \
 32 | --transcriptome=${cellrangerdb} \
 33 | --fastqs=${path} \
 34 | --sample=${folder_name}
 35 | done
 36 | 
 37 | cd ${workdir}
 38 | mkdir split_reads
 39 | cd split_reads
 40 | 
 41 | # convert cellranger bam file to fastqs
 42 | for folder in ${workdir}/cellranger_count/*
 43 | do
 44 | folder_name=${folder##*/}
 45 | file=${folder}/outs/possorted_genome_bam.bam
 46 | echo ${file}
 47 | 
 48 | samplename=${folder_name}
 49 | bedtools bamtofastq -i ${folder}/outs/possorted_genome_bam.bam \
 50 |                       -fq ${samplename}.r1.fq \
 51 |                       -fq2 ${samplename}.r2.fq
 52 | done
 53 | 
 54 | # run fastqc before trimmomatic
 55 | mkdir ${root}/preqc
 56 | fastqc \
 57 | -o ${root}/preqc \
 58 | ${root}/split_reads/*.fq
 59 | 
 60 | # run trimmomatic on R1 
 61 | cd ${root}/split_reads
 62 | mkdir trim
 63 | for str in *r1.fq
 64 | do
 65 | # adjust -threads to number of cores you would like to use
 66 | java -jar $EBROOTTRIMMOMATIC/trimmomatic-0.39.jar SE \
 67 | -threads 36 \
 68 | ${str} \
 69 | trim/${str}.SE_trim.fq \
 70 | ILLUMINACLIP:$EBROOTTRIMMOMATIC/adapters/TruSeq3-PE-2.fa:2:30:10 \
 71 | LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36 HEADCROP:15
 72 | done
 73 | 
 74 | # run fastqc after trimmomatic 
 75 | cd trim
 76 | mkdir ${root}/postqc
 77 | fastqc \
 78 | -o ${root}/postqc \
 79 | *.SE_trim.fq
 80 | 
 81 | mkdir ${workdir}/ubams_r1
 82 | cd ${root}/split_reads/trim
 83 | 
 84 | # convert R1 to ubam file in order to run Pathseq
 85 | for file in *SE_trim.fq
 86 | do
 87 | java -Xmx700G -jar $EBROOTPICARD/picard.jar FastqToSam \
 88 |     FASTQ=${file} \
 89 |     OUTPUT=${file}.bam \
 90 |     READ_GROUP_NAME=16s \
 91 |     SAMPLE_NAME=16s
 92 | 
 93 | # move and rename generated ubam files
 94 | mv ${file}.bam ${workdir}/ubams_r1
 95 | done
 96 | 
 97 | ubam_folder=${workdir}/ubams_r1
 98 | outpath=${workdir}/pathseq_r1
 99 | mkdir ${outpath}
100 | 
101 | cd ${ubam_folder}
102 | 
103 | # Pathseq to identify pathogen-associated cells
104 | for each_file in *.bam
105 | do
106 | echo ${each_file}
107 | filename="${each_file%.*}"
108 | filename="${filename%.*}"
109 | filename="${filename%.*}"
110 | samplename=${filename}
111 | echo ${samplename}
112 | # PathSeq process # Please adjust "-Xmx750g" based on the memory you want to use. Adjust --min-score-identity and --min-clipped-read-length based on your samples
113 | gatk --java-options "-Xmx750g" PathSeqPipelineSpark \
114 |     --input ${each_file} \
115 |     --filter-bwa-image ${pathseqdb}/pathseq_host.fa.img \
116 |     --kmer-file ${pathseqdb}/pathseq_host.bfi \
117 |     --min-clipped-read-length 60 \
118 |     --microbe-fasta ${pathseqdb}/pathseq_microbe.fa \
119 |     --microbe-bwa-image ${pathseqdb}/pathseq_microbe.fa.img \
120 |     --taxonomy-file ${pathseqdb}/pathseq_taxonomy.db \
121 |     --output ${outpath}/${samplename}.pathseq.complete.bam \
122 |     --scores-output ${outpath}/${samplename}.pathseq.complete.txt.csv \
123 |     --is-host-aligned false \
124 |     --filter-duplicates false \
125 |     --min-score-identity .7
126 | done
127 | 
# Python script to produce a bacteria UMI matrix (based on valid GEX cell)
bam_path=${workdir}/cellranger_count
pathseq_path=${workdir}/pathseq_r1
out_path=${root}/python
mkdir -p "${out_path}"
cd "${bam_path}" || exit 1
# barcodes.tsv.gz from GEX is used as a 'whitelist' for real cells.
# NOTE(review): INVADEseq.py is resolved relative to ${bam_path} after the cd above;
# confirm the script is reachable from there or call it by absolute path.
for each_sample in *
do
echo "${each_sample}"
echo "${gex_bam_path}"
python INVADEseq.py \
"${bam_path}/${each_sample}/outs/possorted_genome_bam.bam" \
"${each_sample}" \
"${gex_bam_path}/${each_sample}/outs/filtered_feature_bc_matrix/barcodes.tsv.gz" \
"${pathseq_path}/${each_sample}.r1.fq.pathseq.complete.bam" \
"${pathseq_path}/${each_sample}.r1.fq.pathseq.complete.txt.csv" \
"${out_path}/${each_sample}.16s.filtered_matrix.readname" \
"${out_path}/${each_sample}.16s.filtered_matrix.unmap_cbub.bam" \
"${out_path}/${each_sample}.16s.filtered_matrix.unmap_cbub.fasta" \
"${out_path}/${each_sample}.16s.filtered_matrix.list" \
"${out_path}/${each_sample}.16s.raw.filtered_matrix.readnamepath" \
"${out_path}/${each_sample}.16s.filtered_matrix.genus.cell" \
"${out_path}/${each_sample}.16s.filtered_matrix.genus.csv" \
"${out_path}/${each_sample}.16s.filtered_matrix.validate.csv"
done
154 | 
155 | 
156 | 
157 | 
158 | 


--------------------------------------------------------------------------------
/cell_culture_samples/cell_culture_Seurat.r:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | library(harmony)
 3 | library(Seurat)
 4 | library(ggplot2)
 5 | library(SingleR)
 6 | library(celldex)
 7 | library(msigdbr)
 8 | library(cowplot)
 9 | library(dplyr)
10 | hpca.se <- celldex::HumanPrimaryCellAtlasData()
11 | library("enrichplot")
12 | library(ggupset)
13 | library(gridExtra)
14 | library(pheatmap)
15 | options(bitmapType = 'cairo')
16 | knitr::opts_chunk$set(dev="CairoPNG")
17 | library(org.Hs.eg.db)
18 | 
# For each sample: read the filtered Cell Ranger matrix, build a Seurat object,
# and store the percentage of counts from features matching "^MT-" as percent.mt.
sample_7_MOI_500_data <- Read10X(
  data.dir = "/raw_data/cellranger/count/7_MOI_500_GEX/outs/filtered_feature_bc_matrix"
)
sample_7_MOI_500 <- CreateSeuratObject(
  counts = sample_7_MOI_500_data,
  project = "Sample_7_MOI_500",
  min.cells = 3,
  min.features = 200
)
sample_7_MOI_500[["percent.mt"]] <- PercentageFeatureSet(sample_7_MOI_500, pattern = "^MT-")

sample_6_MOI_100_data <- Read10X(
  data.dir = "/raw_data/cellranger/count/6_MOI_100_GEX/outs/filtered_feature_bc_matrix"
)
sample_6_MOI_100 <- CreateSeuratObject(
  counts = sample_6_MOI_100_data,
  project = "Sample_6_MOI_100",
  min.cells = 3,
  min.features = 200
)
sample_6_MOI_100[["percent.mt"]] <- PercentageFeatureSet(sample_6_MOI_100, pattern = "^MT-")

sample_5_HCT_116_data <- Read10X(
  data.dir = "/raw_data/cellranger/count/5_HCT_116_GEX/outs/filtered_feature_bc_matrix"
)
sample_5_HCT_116 <- CreateSeuratObject(
  counts = sample_5_HCT_116_data,
  project = "Sample_5_HCT_116",
  min.cells = 3,
  min.features = 200
)
sample_5_HCT_116[["percent.mt"]] <- PercentageFeatureSet(sample_5_HCT_116, pattern = "^MT-")

# Merge the three objects; cell names are prefixed with the ids given in add.cell.ids.
sample.combine <- merge(
  sample_5_HCT_116,
  y = c(sample_6_MOI_100, sample_7_MOI_500),
  add.cell.ids = c("5_HCT_116_GEX", "6_MOI_100_GEX", '7_MOI_500_GEX'),
  project = "SAMPLE.INTEGRATED"
)
32 | 
# QC violin plots, normalisation, variable-feature selection, scaling and PCA
# on the merged object, followed by Harmony, UMAP and clustering.
VlnPlot(sample.combine, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
sample.combine <- NormalizeData(object = sample.combine, normalization.method = "LogNormalize", scale.factor = 10000)
GetAssay(sample.combine,assay = "RNA")
sample.combine <- FindVariableFeatures(object = sample.combine, selection.method = "vst", nfeatures = 5000)
top20 <- head(x = VariableFeatures(object = sample.combine), 20)
plot1 <- VariableFeaturePlot(object = sample.combine)
# Label the top 20 variable features on plot1; plot2 does not exist before this line.
plot2 <- LabelPoints(plot = plot1, points = top20, repel = TRUE)
plot1+plot2
all.genes <- rownames(sample.combine)
sample.combine<- ScaleData(object = sample.combine,features = all.genes)
# RunPCA takes the feature list through `features`.
sample.combine <- RunPCA(object = sample.combine,features = VariableFeatures(sample.combine))
ElbowPlot(sample.combine)
# Harmony is run with orig.ident as the grouping variable.
seuratObj <- RunHarmony(sample.combine, group.by.vars="orig.ident",assay.use='RNA')
names(seuratObj@reductions)
seuratObj <- RunUMAP(seuratObj,  dims = 1:20, 
                     reduction = "harmony",seed.use=111)
sce=seuratObj
sce <- FindNeighbors(sce, reduction = "harmony",dims = 1:20)
sce <- FindClusters(sce, resolution = 0.5)
seuratObj=sce
sample.combine = seuratObj
54 | 
# add pathogen UMI metadata
umi_table_csv = 'csv_novami_mix_dedup.csv'
umi_table<-read.csv(umi_table_csv,sep=',',header=TRUE,row.names = 1)
# Treat missing counts as 0 so the per-cell total can be summed,
# then turn every 0 back into NA before attaching the table.
umi_table[is.na(umi_table)] <- 0
umi_table$Total <- rowSums(umi_table)
umi_table[umi_table==0] <- NA
# Attach to the object built in this script (sample.headneck is not defined here).
sample.combine<-AddMetaData(sample.combine, umi_table)
62 | 
63 | 


--------------------------------------------------------------------------------
/cell_culture_samples/cell_culture_samples_GEX_pipeline.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # The preprocessing pipeline for Cell Culture samples single-cell GEX data 
 4 | ml CellRanger/6.1.1
 5 | ml GATK/4.1.3.0-GCCcore-8.3.0-Java-1.8
 6 | ml Python
 7 | ml Pysam
 8 | 
 9 | `raw_data_folder` # the folder containing Cellranger mkfastq output folders for cell culture samples
10 | `root` # working directory
11 | `pathseqdb` # Pathseq database
12 | `cellrangerdb` # Cellranger database
13 | 
# Stop if the working directory cannot be entered, so output never lands elsewhere.
cd "${root}" || exit 1
mkdir -p cellranger_count
cd cellranger_count || exit 1

# Cellranger count processing
for folder in "${raw_data_folder}"/*
do
# The last path component is used as both the run id and the sample name.
folder_name=${folder##*/}
path=${folder}
echo "${path}"
cellranger count \
--id="${folder_name}" \
--transcriptome="${cellrangerdb}" \
--fastqs="${path}" \
--sample="${folder_name}"
done
31 | 
# PathSeq pipeline
outpath=${root}/pathseq
mkdir -p "${outpath}"
# PathSeq process # Please adjust "-Xmx750g" based on the memory you want to use. Adjust --min-score-identity and --min-clipped-read-length based on your samples
#
# The loop iterates over the entries of the current directory, one per sample.
for folder in *
do
folder_name=${folder##*/}
file=${folder}/outs/possorted_genome_bam.bam
samplename=${folder_name}
echo "${samplename}"
gatk --java-options "-Xmx750g" PathSeqPipelineSpark \
    --input "${file}" \
    --filter-bwa-image "${pathseqdb}/pathseq_host.fa.img" \
    --kmer-file "${pathseqdb}/pathseq_host.bfi" \
    --min-clipped-read-length 60 \
    --microbe-fasta "${pathseqdb}/pathseq_microbe.fa" \
    --microbe-bwa-image "${pathseqdb}/pathseq_microbe.fa.img" \
    --taxonomy-file "${pathseqdb}/pathseq_taxonomy.db" \
    --output "${outpath}/${samplename}.pathseq.complete.bam" \
    --scores-output "${outpath}/${samplename}.pathseq.complete.csv" \
    --is-host-aligned false \
    --filter-duplicates false \
    --min-score-identity .7
done
57 | 
# Python script to generate bacteria matrix
bam_path=${root}/cellranger_count
pathseq_path=${root}/pathseq
out_path=${root}/python
mkdir -p "${out_path}"
cd "${bam_path}" || exit 1

# NOTE(review): INVADEseq.py is resolved relative to ${bam_path} after the cd above;
# confirm the script is reachable from there or call it by absolute path.
for each_sample in *
do
echo "${each_sample}"
python INVADEseq.py \
"${bam_path}/${each_sample}/outs/possorted_genome_bam.bam" \
"${each_sample}" \
"${bam_path}/${each_sample}/outs/filtered_feature_bc_matrix/barcodes.tsv.gz" \
"${pathseq_path}/${each_sample}.pathseq.complete.bam" \
"${pathseq_path}/${each_sample}.pathseq.complete.csv" \
"${out_path}/${each_sample}.gex.filtered_matrix.readname" \
"${out_path}/${each_sample}.gex.filtered_matrix.unmap_cbub.bam" \
"${out_path}/${each_sample}.gex.filtered_matrix.unmap_cbub.fasta" \
"${out_path}/${each_sample}.gex.filtered_matrix.list" \
"${out_path}/${each_sample}.gex.raw.filtered_matrix.readnamepath" \
"${out_path}/${each_sample}.gex.filtered_matrix.genus.cell" \
"${out_path}/${each_sample}.gex.filtered_matrix.genus.csv" \
"${out_path}/${each_sample}.gex.filtered_matrix.validate.csv"
done
83 | 
84 | 


--------------------------------------------------------------------------------
/cell_culture_samples/merge_metadata.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | import pandas as pd
 3 | import numpy as np
 4 | import os
 5 | import sys
 6 | pd.set_option('precision', 0)
 7 | 
 8 | def merge_cellsmeta2(df1,df2):
 9 |     df_merged = pd.concat([df1, df2], sort=False)
10 |     df_merged = df_merged.round()
11 |     return df_merged
12 | 
def feed_csvs(path):
    """Return the paths of all files in *path* whose name ends with 'genus.csv'.

    The names are sorted so the result, and therefore the merge order used by
    the caller, does not depend on the arbitrary order of ``os.listdir``.
    Each entry is built as ``path + '/' + file_name``.
    """
    return [
        path + '/' + each_file
        for each_file in sorted(os.listdir(path))
        if each_file.endswith('genus.csv')
    ]
21 | 
if __name__ == "__main__":
    # argv[1] is the folder holding the per-sample '*genus.csv' tables;
    # the merged table is written into the same folder.
    path = sys.argv[1]
    csv_merged = path+'/csv_novami.csv'
    csv_list = feed_csvs(path)
    # Without this check an empty folder fails with a bare IndexError below.
    if not csv_list:
        sys.exit('No *genus.csv files found in ' + path)

    df1 = pd.read_csv(csv_list[0], header=0, sep=',')

    # Append the remaining tables one after another.
    for each_csv in csv_list[1:]:
        print(each_csv)
        df2 = pd.read_csv(each_csv, header=0, sep=',')
        df1 = merge_cellsmeta2(df1, df2)

    # Cells missing from a table are written as 0.
    df1 = df1.fillna(0)
    df1.to_csv(csv_merged, sep=',', index=False)
38 | 
39 | 


--------------------------------------------------------------------------------
/cell_culture_samples/metadata_dedup.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | import pandas as pd
  3 | from collections import Counter
  4 | import os
  5 | import sys
  6 | """
  7 | # The purpose of this script is deduplication of the merged metadata. Since there are replicated cell names from GEX libraries and 16s libraries, it is necessary to add UMI count from both techniques together into unique cell names.
  8 | # usage: 
  9 | # python metadata_dedup.py \
 10 |     GEX_pathogen_UMI_matrix_output_folder(with validation csvs) \
 11 |     16s_pathogen_UMI_matrix_output_folder(with validation csvs) \
 12 |     Merged_csv_matrix_from_previous_step \
 13 |     Dedup_csv_matrix
 14 | 
 15 | # Note: Merged_csv_matrix_from_previous_step is a csv file containing 3 cell culture samples from our study
 16 | #       otherwise please modify the sample names in the script.
 17 | """
 18 | 
 19 | def read_and_mkdic(validate_csv_file, sample_name):
 20 |     UMI_bac_list = []
 21 |     validate_csv = open(validate_csv_file,'r')
 22 |     for each_line in validate_csv:
 23 |         each_line = each_line.rstrip('\n')
 24 |         each_line = sample_name+'_'+each_line
 25 |         UMI_bac_list.append(each_line)
 26 |     return UMI_bac_list
 27 | 
 28 | #add lists together
 29 | # then count
 30 | def count_elements(merged_UMI_bac_list):
 31 |     count_dict = Counter(merged_UMI_bac_list)
 32 |     return count_dict
 33 | 
 34 | #add dics together: ndic = list(dict(dic0.items()) + list(dic1.items()))
 35 | #generate a dataframe, that will be a minux matrix
 36 | #df.values[rows, cols] = np.nan
 37 | def minux_matrix(count_dict,nova_mi_merged_csv_file,nova_mi_merged_csv_file_dedup):
 38 |     nova_mi_merged = pd.read_csv(nova_mi_merged_csv_file,header = 0,sep = ',',index_col='barcode')
 39 |     print('before merge = ',len(nova_mi_merged))
 40 |     nova_mi_merged = nova_mi_merged.groupby(nova_mi_merged.index).sum()
 41 |     print('after merge = ',len(nova_mi_merged))
 42 |     n=0
 43 |     rowNamesArr = list(nova_mi_merged.index.values)
 44 |     #print(rowNamesArr[:10])
 45 |     columnsNamesArr = list(nova_mi_merged.columns.values)
 46 |     for each_cell_UMI in count_dict:
 47 |         #print(each_cell_UMI)
 48 |         n+=1
 49 |         if n%1000 == 0:
 50 |             print('now working on: ',n/len(count_dict)*100,'%')
 51 |         cell = each_cell_UMI.split('+')[0]
 52 |         pathogen = each_cell_UMI.split(',')[1]
 53 |         count = count_dict[each_cell_UMI]-1
 54 |         colindex = columnsNamesArr.index(pathogen)
 55 |         rowindex = rowNamesArr.index(cell)
 56 |         prev = int(nova_mi_merged.loc[cell,pathogen])
 57 |         nova_mi_merged.loc[cell,pathogen]=prev-count
 58 |         after = int(nova_mi_merged.loc[cell,pathogen])
 59 |     nova_mi_merged.to_csv(nova_mi_merged_csv_file_dedup,sep=',',index=True)
 60 |     return
 61 | 
 62 | 
 63 | if __name__ == "__main__":
 64 |     UMI_bac_list=[]
 65 |     
 66 |     validate_csv_file_nova = [
 67 |         '5_HCT_116',
 68 |         '6_MOI_100',
 69 |         '7_MOI_500'
 70 |     ]
 71 |     sample_name_nova = [
 72 |         '5_HCT_116',
 73 |         '6_MOI_100',
 74 |         '7_MOI_500'
 75 |     ]
 76 | 
 77 |     validate_csv_file_mi = [
 78 |         '5_HCT_116',
 79 |         '6_MOI_100',
 80 |         '7_MOI_500'
 81 |     ]
 82 |     sample_name_mi = [
 83 |         '5_HCT_116',
 84 |         '6_MOI_100',
 85 |         '7_MOI_500'
 86 |     ]
 87 | 
 88 |     path_nova = argv[1]
 89 |     path_mi = argv[2]
 90 |     nova_mi_merged_csv_file = argv[3]
 91 |     nova_mi_merged_csv_file_dedup = argv[4]
 92 | 
 93 |     #also modified the cell names in sub merged csvs!
 94 | 
 95 |     for n in range(0,len(validate_csv_file_nova)):
 96 |         UMI_bac_list = UMI_bac_list+read_and_mkdic(path_nova + '/' + validate_csv_file_nova[n]+'.gex.filtered_matrix.validate.csv', 'sample_'+sample_name_nova[n])
 97 |     
 98 |     for n in range(0,len(validate_csv_file_mi)):
 99 |         UMI_bac_list = UMI_bac_list+read_and_mkdic(path_mi + '/' + validate_csv_file_mi[n]+'.16s.filtered_matrix.validate.csv', 'sample_'+sample_name_mi[n])
100 |     
101 |     count_dict = count_elements(UMI_bac_list)
102 |     minux_matrix(count_dict,nova_mi_merged_csv_file,nova_mi_merged_csv_file_dedup)
103 | 
104 | 
105 | 


--------------------------------------------------------------------------------
/patient_samples/DE.r:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env Rscript
 2 | library(clusterProfiler)
 3 | library(org.Hs.eg.db)
 4 | library(msigdbr)
 5 | 
 6 | m_df <- msigdbr(species = "Homo sapiens")
 7 | 
 8 | m_H <- msigdbr(species = "Homo sapiens", category = "H") %>% 
 9 |   dplyr::select(gs_name, gene_symbol)
10 | 
11 | 
# Run FindMarkers between ident_1 and ident_2 (grouped by group_by), write the
# marker table twice (as returned, then ordered by decreasing avg_log2FC), and
# write the GSEA result computed from the ranked avg_log2FC values against m_H.
DE_GSEA <- function(seurat_object,
                    ident_1,
                    ident_2,
                    group_by,
                    seurat_object.markers_filename,
                    seurat_object.markers_filtered_filename,
                    seurat_object.markers_gsea_filename){
    # logfc.threshold = -Inf keeps every gene that passes min.pct.
    de_table <- FindMarkers(seurat_object,
                            ident.1 = ident_1,
                            ident.2 = ident_2,
                            group.by = group_by,
                            logfc.threshold = -Inf,
                            min.pct = 0.1)
    write.csv(de_table, seurat_object.markers_filename, row.names = TRUE)

    # No p-value filter is applied here; the second file is the same table, reordered.
    de_ranked <- de_table[order(-de_table$avg_log2FC), ]
    write.csv(de_ranked, seurat_object.markers_filtered_filename, row.names = TRUE)

    # Named numeric vector: gene name -> avg_log2FC, in decreasing order.
    ranked_fc <- de_ranked[, c("avg_log2FC")]
    names(ranked_fc) <- as.character(rownames(de_ranked))
    ranked_fc <- sort(ranked_fc, decreasing = TRUE)

    gsea_result <- GSEA(ranked_fc,
                        TERM2GENE = m_H,
                        eps = 0.0,
                        by = "fgsea")

    write.csv(gsea_result, seurat_object.markers_gsea_filename, row.names = FALSE)
}
45 | 
46 | 


--------------------------------------------------------------------------------
/patient_samples/INVADEseq.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | import pysam
  3 | import sys
  4 | import gzip
  5 | 
  6 | def read_cell_names1(pathseq_bam_file, write_bac):
  7 |     seqbam = pysam.AlignmentFile(pathseq_bam_file, "rb",threads=36)
  8 |     read_name_pathseq = open(write_bac,'w')
  9 |     total_pathseq_reads=0
 10 |     total_YP_reads=0
 11 |     for each_line in seqbam:
 12 |         total_pathseq_reads+=1
 13 |         if each_line.has_tag('YP'):
 14 |             total_YP_reads+=1
 15 |             outline = each_line.query_name + '\t' + each_line.get_tag('YP') + '\t' + str(each_line.get_tag('AS')) + '\n'
 16 |             read_name_pathseq.write(outline)
 17 |     print('Total reads in pathseq bam = ',total_pathseq_reads)
 18 |     print('Total reads in pathseq bam with YP tag  = ',total_YP_reads)
 19 |     return
 20 | 
 21 | def read_readnames(readname_file):
 22 |     set_for_readnames = set()
 23 |     dict_name = {}
 24 |     with open (readname_file,'r') as r:
 25 |         for each_line in r:
 26 |             each_line = each_line.rstrip('\n')
 27 |             each_line_list = each_line.split('\t')
 28 |             set_for_readnames.add(each_line_list[0])
 29 |             dict_name[each_line_list[0]] = {}
 30 |             dict_name[each_line_list[0]]["pathogen"] = each_line_list[1]
 31 |             dict_name[each_line_list[0]]["mapping_score"] = each_line_list[2]
 32 |     return set_for_readnames, dict_name
 33 | 
 34 | def read_pathseq_report_and_create_dict(pathseq_report_csv):
 35 |     pathseq_report = open(pathseq_report_csv,'r')
 36 |     dict_for_genus = {}
 37 |     set_for_genera = set()
 38 |     for each_line in pathseq_report:
 39 |         each_line = each_line.rstrip('\n')
 40 |         each_line_list = each_line.split('\t')
 41 |         level = each_line_list[2]
 42 |         tax = each_line_list[3]
 43 |         if level == 'genus':
 44 |             set_for_genera.add(tax)
 45 |         if '|' in each_line_list[1]:
 46 |             name_string_list = each_line_list[1].split('|')
 47 |             for n in range(len(name_string_list)):
 48 |                 pointer = -n-1           
 49 |                 if not '_' in name_string_list[pointer]:
 50 |                     name = name_string_list[pointer]
 51 |                     break
 52 |                 if 'unclassified' in name_string_list[pointer]:
 53 |                     name = name_string_list[pointer]
 54 |                     break         
 55 |             id = each_line_list[0]
 56 |             dict_for_genus[id] = name
 57 |     print ("len(dict_for_genus) = ",len(dict_for_genus))
 58 |     return dict_for_genus
 59 | def read_cell_names2(set_of_readnames, dict_name, dict_for_genus,original_bam_file,unmap_cbub_bam_file,unmap_cbub_fasta_file, out_cell_list,out_readname_cell_path,barcode_whitelist_file):
 60 |     white_list_set = set()
 61 |     white_list = gzip.open(barcode_whitelist_file, 'rt')
 62 |     for each_line in white_list:
 63 |         each_line = each_line.rstrip('\n')
 64 |         white_list_set.add(each_line)
 65 | 
 66 |     seqbam = pysam.AlignmentFile(original_bam_file, "rb",threads=36)
 67 |     readname_cell_path = open(out_readname_cell_path,'w')
 68 |     unmap_cbub_fasta = open(unmap_cbub_fasta_file,'w')
 69 |     unmap_cbub_bam = pysam.AlignmentFile(unmap_cbub_bam_file, "wb", seqbam)
 70 | 
 71 |     set_for_infect_cells=set()
 72 |     total_cellranger_bam_reads = 0
 73 |     total_cellranger_reads_UB_CB_tags = 0
 74 |     total_cellranger_reads_UB_CB_unmap = 0
 75 |     total_cellranger_reads_UB_CB_unmap_Aligned_to_Pathseq_YP_reads = 0
 76 |     total_potential_UMI_including_ambigious_reads = set()
 77 |     for each_line in seqbam:
 78 |         total_cellranger_bam_reads+=1
 79 |         if each_line.has_tag('CB') and each_line.has_tag('UB'):
 80 |             if each_line.get_tag('CB') in white_list_set:
 81 |                 total_cellranger_reads_UB_CB_tags+=1
 82 |                 if each_line.is_unmapped:
 83 |                     total_cellranger_reads_UB_CB_unmap+=1
 84 |                     # added 102721: output a fasta file for kraken
 85 |                     query_name_in_cellranger_bam = each_line.query_name
 86 |                     seq_in_cellranger_bam = each_line.query_sequence
 87 |                     unmap_cbub_fasta.write('>')
 88 |                     unmap_cbub_fasta.write(query_name_in_cellranger_bam)
 89 |                     unmap_cbub_fasta.write('\n')
 90 |                     unmap_cbub_fasta.write(seq_in_cellranger_bam)
 91 |                     unmap_cbub_fasta.write('\n')
 92 |                     unmap_cbub_bam.write(each_line)
 93 |                     if each_line.query_name in set_of_readnames:
 94 |                         set_for_infect_cells.add(each_line.get_tag('CB'))
 95 |                         readname = each_line.query_name
 96 |                         cellname = each_line.get_tag('CB')
 97 |                         umi = each_line.get_tag('UB')
 98 |                         path = dict_name[readname]["pathogen"]
 99 |                         id_string_list = path.split(',')
100 |                         genus_list = []
101 |                         for each_id in id_string_list:
102 |                             if each_id in dict_for_genus:
103 |                                 genus = dict_for_genus[each_id]
104 |                                 genus_list.append(genus)
105 |                             else:
106 |                                 print(each_id,"  not found!")
107 |                         genus_list = list(set(genus_list))
108 |                         genus_list.sort()
109 |                         genus_list_string = ','.join(genus_list)           
110 |                         mapping_score = dict_name[readname]["mapping_score"]
111 |                         outline = readname+'\t'+cellname+'\t'+umi+'\t'+path+'\t'+mapping_score+'\t'+genus_list_string+'\n'
112 |                         readname_cell_path.write(outline)
113 |                         total_potential_UMI_including_ambigious_reads.add(umi)
114 |                         total_cellranger_reads_UB_CB_unmap_Aligned_to_Pathseq_YP_reads+=1
115 |     print('total cellranger bam reads = ',total_cellranger_bam_reads)
116 |     print('total cellranger bam reads with UB CB tags (in-cell) = ',total_cellranger_reads_UB_CB_tags)
117 |     print('total UNMAPPED cellranger bam reads with UB CB tags (in-cell) = ',total_cellranger_reads_UB_CB_unmap)
118 |     print('total cellranger reads with UB_CB_unmap Aligned to Pathseq reads with YP tags = (in-cell)',total_cellranger_reads_UB_CB_unmap_Aligned_to_Pathseq_YP_reads)
119 |     cell_list = open(out_cell_list,'w')
120 |     for each_cell in set_for_infect_cells:
121 |         cell_list.write(each_cell)
122 |         cell_list.write('\n')
123 |     return 
124 | 
125 | 
def generate_barcode_UMI_dict(out_readname_cell_path):
    """Keep, per cell barcode + UMI, the annotation with the highest mapping score.

    Reads the tab-separated file with columns: read name, cell barcode, UMI,
    comma-separated id string, mapping score, genus string. The key is
    ``barcode + '+' + UMI``. A later line replaces the stored entry only when
    its score is strictly greater, so the first line wins on ties.

    Returns:
        dict mapping the key to ``{"id_string": list of ids,
        "mapping_score": int, "genus_string": str}``.
    """
    barcode_UMI_dict = {}
    with open(out_readname_cell_path, 'r') as cell_path_file:
        for each_line in cell_path_file:
            each_line_list = each_line.rstrip('\n').split('\t')
            barcode_UMI = each_line_list[1] + '+' + each_line_list[2]
            id_string_list = each_line_list[3].split(',')
            mapping_score = int(each_line_list[4])
            genus_string = each_line_list[5]
            current = barcode_UMI_dict.get(barcode_UMI)
            if current is None or mapping_score > current["mapping_score"]:
                barcode_UMI_dict[barcode_UMI] = {
                    "id_string": id_string_list,
                    "mapping_score": mapping_score,
                    "genus_string": genus_string,
                }
    return barcode_UMI_dict
150 | 
def output_cells_genus_list(barcode_UMI_dict,dict_for_genus):
    """Group unambiguous UMIs by cell barcode.

    A 'barcode+UMI' entry is kept only when its genus string contains no ','
    and its ids resolve, through *dict_for_genus*, to exactly one distinct
    name (ids missing from the dict are ignored).

    Returns:
        dict mapping barcode to a dict with 'genus' (one name per kept UMI, in
        input order), 'barcode_UMI' (key -> name) and 'pathogen_count'
        (name -> number of kept UMIs).
    """
    unambigious_UMI = {}
    for tag, record in barcode_UMI_dict.items():
        if ',' in record["genus_string"]:
            continue
        resolved = {dict_for_genus[tax_id]
                    for tax_id in record["id_string"]
                    if tax_id in dict_for_genus}
        if len(resolved) == 1:  # only keep unambigious UMI
            unambigious_UMI[tag] = next(iter(resolved))
    print('Total unambigious UMI = ',len(unambigious_UMI))

    cell_metadata_dict = {}
    for tag, genus in unambigious_UMI.items():
        pieces = tag.split('+')
        barcode = pieces[0]
        UMI = pieces[1]
        if barcode not in cell_metadata_dict:
            cell_metadata_dict[barcode] = {
                'genus': [],
                'barcode_UMI': {},
                'pathogen_count': {},
            }
        entry = cell_metadata_dict[barcode]
        entry['genus'].append(genus)
        entry['barcode_UMI'][tag] = genus
        entry['pathogen_count'][genus] = entry['pathogen_count'].get(genus, 0) + 1
    return cell_metadata_dict
198 | 
def output_cell_metadata(cell_metadata_dict,out_genus_file,sample_ident,barcode_whitelist_file):
    """Write a per-barcode pathogen summary CSV for whitelisted barcodes.

    Parameters:
        cell_metadata_dict: mapping barcode -> dict; only the
            'pathogen_count' entry (genus -> UMI count) is read here.
        out_genus_file: path of the CSV to write.
        sample_ident: prefix joined to the barcode with '_'; an empty string
            leaves the barcode unchanged.
        barcode_whitelist_file: gzip-compressed text file, one barcode per line.

    Each output row is ``cell_name,pathogen,UMI_count,pathogen_count``:
    * when one genus has the strictly highest count, ``pathogen`` is that
      genus and ``UMI_count`` is its count;
    * when several genera tie for the highest count, ``pathogen`` is
      'MULTI' and ``UMI_count`` is the sum over all genera;
    * ``pathogen_count`` lists every ``genus:count`` pair, sorted and
      joined with ';'.

    Raises:
        ValueError: if a whitelisted barcode has an empty 'pathogen_count'.
    """
    print('total pathogen-associated gems = ', len(cell_metadata_dict))

    # Context managers guarantee both files are closed (and flushed) even if
    # an error interrupts processing.
    with gzip.open(barcode_whitelist_file, 'rt') as white_list:
        white_list_set = {each_line.rstrip('\n') for each_line in white_list}

    # Keep only barcodes present in the whitelist, preserving input order.
    cell_metadata_dict = {
        barcode: metadata
        for barcode, metadata in cell_metadata_dict.items()
        if barcode in white_list_set
    }
    print("total filtered pathogen-associated cells = ", len(cell_metadata_dict))

    with open(out_genus_file, 'w') as genus_file:
        genus_file.write('cell_name,pathogen,UMI_count,pathogen_count\n')

        for barcode, metadata in cell_metadata_dict.items():
            cell_name = barcode if sample_ident == '' else sample_ident+'_'+barcode
            pathogen_counts = metadata['pathogen_count']

            pathogen_count_str = ';'.join(sorted(
                each_pathogen+':'+str(count)
                for each_pathogen, count in pathogen_counts.items()
            ))

            max_count = max(pathogen_counts.values())
            top_genera = [
                each_pathogen
                for each_pathogen, count in pathogen_counts.items()
                if count == max_count
            ]
            if len(top_genera) > 1:
                # Tie for the top count: no single genus can be assigned.
                genus = 'MULTI'
                UMI_count = sum(pathogen_counts.values())
            else:
                genus = top_genera[0]
                UMI_count = max_count

            if UMI_count >= 1:
                genus_file.write(','.join([cell_name,genus,str(UMI_count),pathogen_count_str])+'\n')
    return
259 | 
260 | 
def UMI_table_output(cell_metadata_dict,barcode_whitelist_file,sample_ident,output_UMI_table_csv,output_UMI_validate_table_csv):
    """Write the barcode-by-genus UMI matrix and the per-UMI validation table.

    Parameters:
        cell_metadata_dict: mapping barcode -> dict with the entries
            'barcode_UMI' (barcode+UMI key -> genus) and
            'pathogen_count' (genus -> UMI count).
        barcode_whitelist_file: gzip-compressed text file, one barcode per
            line; barcodes not listed there are dropped.
        sample_ident: prefix joined to the barcode with '_' in the matrix;
            an empty string leaves the barcode unchanged.
        output_UMI_table_csv: path of the matrix CSV. The header is
            'barcode' followed by the sorted genera; each row holds the
            per-genus counts for one barcode, 0 where a genus is absent.
        output_UMI_validate_table_csv: path of the validation CSV, one
            ``barcode_UMI,genus`` line per retained UMI (no header, no
            sample prefix).
    """
    # Context managers guarantee every handle is closed (and flushed) even if
    # an error interrupts processing.
    with gzip.open(barcode_whitelist_file, 'rt') as white_list:
        white_list_set = {each_line.rstrip('\n') for each_line in white_list}
    print("total number of cells = ", len(white_list_set))

    # Keep only barcodes present in the whitelist, preserving input order.
    cell_metadata_dict = {
        barcode: metadata
        for barcode, metadata in cell_metadata_dict.items()
        if barcode in white_list_set
    }

    with open(output_UMI_validate_table_csv,'w') as output_UMI_validate_table:
        for metadata in cell_metadata_dict.values():
            for barcode_UMI, pathogen in metadata['barcode_UMI'].items():
                output_UMI_validate_table.write(barcode_UMI+','+pathogen+'\n')

    # Column set is the union of genera seen in any retained barcode.
    genera_list = sorted({
        pathogen
        for metadata in cell_metadata_dict.values()
        for pathogen in metadata['pathogen_count']
    })

    with open(output_UMI_table_csv,'w') as output_UMI_table:
        output_UMI_table.write(','.join(['barcode']+genera_list))
        output_UMI_table.write('\n')
        for barcode, metadata in cell_metadata_dict.items():
            cell_name = barcode if sample_ident == '' else sample_ident+'_'+barcode
            pathogen_counts = metadata['pathogen_count']
            genera_count_list = [str(pathogen_counts.get(each_genus, 0)) for each_genus in genera_list]
            output_UMI_table.write(','.join([cell_name]+genera_count_list))
            output_UMI_table.write('\n')
    return
308 | 
if __name__ == "__main__":
    # Exactly thirteen positional arguments are expected, in this order.
    (
        cellranger_bam_file,
        sample_ident,
        barcode_whitelist_file,
        pathseq_bam_file,
        pathseq_report_csv,
        read_name_pathseq,
        unmap_cbub_bam_file,
        unmap_cbub_fasta_file,
        out_cell_list,
        out_readname_cell_path,
        out_genus_file,
        output_UMI_table_csv,
        output_UMI_validate_table_csv,
    ) = sys.argv[1:]

    # Each step consumes the file(s) or values produced by the previous one.
    dict_for_genus = read_pathseq_report_and_create_dict(pathseq_report_csv)
    step1 = read_cell_names1(pathseq_bam_file, read_name_pathseq)
    step2 = read_readnames(read_name_pathseq)
    step3 = read_cell_names2(
        step2[0],
        step2[1],
        dict_for_genus,
        cellranger_bam_file,
        unmap_cbub_bam_file,
        unmap_cbub_fasta_file,
        out_cell_list,
        out_readname_cell_path,
        barcode_whitelist_file,
    )
    step4 = generate_barcode_UMI_dict(out_readname_cell_path)
    step5 = output_cells_genus_list(step4, dict_for_genus)

    # The same per-barcode metadata feeds both the summary CSV and the UMI tables.
    cell_metadata_dict = step5
    output_cell_metadata(
        cell_metadata_dict,
        out_genus_file,
        sample_ident,
        barcode_whitelist_file,
    )
    UMI_table_output(
        cell_metadata_dict,
        barcode_whitelist_file,
        sample_ident,
        output_UMI_table_csv,
        output_UMI_validate_table_csv,
    )
321 | 
322 | # cellranger_bam_file,
323 | # sample_ident,
324 | # barcode_whitelist_file,
325 | # pathseq_bam_file,
326 | # pathseq_report_csv,
327 | # read_name_pathseq,
328 | # unmap_cbub_bam_file,
329 | # unmap_cbub_fasta_file,
330 | # out_cell_list,
331 | # out_readname_cell_path,
332 | # out_genus_file,
333 | # output_UMI_table_csv,
334 | # output_UMI_validate_table_csv=sys.argv[1:]
335 | 


--------------------------------------------------------------------------------
/patient_samples/merge_metadata.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | import pandas as pd
 3 | import numpy as np
 4 | import os
 5 | import sys
 6 | pd.set_option('precision', 0)
 7 | 
 8 | def merge_cellsmeta2(df1,df2):
 9 |     df_merged = pd.concat([df1, df2], sort=False)
10 |     df_merged = df_merged.round()
11 |     return df_merged
12 | 
def feed_csvs(path):
    """Return path+'/'+name for every entry of `path` whose name ends with 'genus.csv'.

    Entries come back in the order os.listdir() reports them.
    """
    # A previous variant of this filter matched the suffix 'merged.csv' instead.
    return [
        path+'/'+entry
        for entry in os.listdir(path)
        if entry.endswith('genus.csv')
    ]
21 | 
if __name__ == "__main__":
    # Usage: merge_metadata.py <folder containing *genus.csv files>
    # Writes the merged table to <folder>/csv_novami.csv.
    path = sys.argv[1]
    csv_merged = path+'/csv_novami.csv'
    csv_list = feed_csvs(path)

    # Fail with an explicit message rather than an IndexError when the
    # folder holds no input tables.
    if not csv_list:
        sys.exit('No *genus.csv files found in ' + path)

    csv1 = csv_list[0]
    df1 = pd.read_csv(csv1,header = 0,sep = ',')

    # Fold the remaining tables into the first one, one file at a time.
    for each_csv in csv_list[1:]:
        print(each_csv)
        df2 = pd.read_csv(each_csv,header = 0,sep = ',')
        df1 = merge_cellsmeta2(df1,df2)

    # Columns missing from a given input table are NaN after the merge;
    # write them out as 0.
    df1 = df1.fillna(0)
    df1.to_csv(csv_merged,sep=',',index=False)
38 | 
39 | 


--------------------------------------------------------------------------------
/patient_samples/metadata_dedup.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | import pandas as pd
  3 | from collections import Counter
  4 | import os
  5 | import sys
  6 | """
  7 | # The purpose of this script is deduplication of the merged metadata. Since there are replicated cell names from GEX libraries and 16s libraries, it is necessary to add UMI count from both techniques together into unique cell names.
  8 | # usage: 
  9 | # python metadata_dedup.py \
 10 |     GEX_pathogen_UMI_matrix_output_folder(with validation csvs) \
 11 |     16s_pathogen_UMI_matrix_output_folder(with validation csvs) \
 12 |     Merged_csv_matrix_from_previous_step \
 13 |     Dedup_csv_matrix
 14 | 
 15 | # Note: Merged_csv_matrix_from_previous_step is a csv file containing 7 clinical samples from our study
 16 | #       otherwise please modify the sample names in the script.
 17 | #       'nova' refer to GEX data, 'mi' refer to 16S data
 18 | """
 19 | 
 20 | def read_and_mkdic(validate_csv_file, sample_name):
 21 |     UMI_bac_list = []
 22 |     validate_csv = open(validate_csv_file,'r')
 23 |     for each_line in validate_csv:
 24 |         each_line = each_line.rstrip('\n')
 25 |         each_line = sample_name+'_'+each_line
 26 |         UMI_bac_list.append(each_line)
 27 |     return UMI_bac_list
 28 | 
 29 | #add lists together
 30 | # then count
 31 | def count_elements(merged_UMI_bac_list):
 32 |     count_dict = Counter(merged_UMI_bac_list)
 33 |     return count_dict
 34 | 
 35 | #add dics together: ndic = list(dict(dic0.items()) + list(dic1.items()))
 36 | #generate a dataframe, that will be a minux matrix
 37 | #df.values[rows, cols] = np.nan
 38 | def minux_matrix(count_dict,nova_mi_merged_csv_file,nova_mi_merged_csv_file_dedup):
 39 |     nova_mi_merged = pd.read_csv(nova_mi_merged_csv_file,header = 0,sep = ',',index_col='barcode')
 40 |     print('before merge = ',len(nova_mi_merged))
 41 |     nova_mi_merged = nova_mi_merged.groupby(nova_mi_merged.index).sum()
 42 |     print('after merge = ',len(nova_mi_merged))
 43 |     n=0
 44 |     rowNamesArr = list(nova_mi_merged.index.values)
 45 |     #print(rowNamesArr[:10])
 46 |     columnsNamesArr = list(nova_mi_merged.columns.values)
 47 |     for each_cell_UMI in count_dict:
 48 |         #print(each_cell_UMI)
 49 |         n+=1
 50 |         if n%1000 == 0:
 51 |             print('now working on: ',n/len(count_dict)*100,'%')
 52 |         cell = each_cell_UMI.split('+')[0]
 53 |         pathogen = each_cell_UMI.split(',')[1]
 54 |         count = count_dict[each_cell_UMI]-1
 55 |         colindex = columnsNamesArr.index(pathogen)
 56 |         rowindex = rowNamesArr.index(cell)
 57 |         prev = int(nova_mi_merged.loc[cell,pathogen])
 58 |         nova_mi_merged.loc[cell,pathogen]=prev-count
 59 |         after = int(nova_mi_merged.loc[cell,pathogen])
 60 |     nova_mi_merged.to_csv(nova_mi_merged_csv_file_dedup,sep=',',index=True)
 61 |     return
 62 | 
 63 | 
 64 | if __name__ == "__main__":
 65 |     UMI_bac_list=[]
 66 |     
 67 |     validate_csv_file_nova = [
 68 |         'OSCC_15',
 69 |         'OSCC_13',
 70 |         'OSCC_14',
 71 |         'OSCC_17',
 72 |         'OSCC_12',
 73 |         'OSCC_16',
 74 |         'OSCC_11'
 75 |     ]
 76 |     sample_name_nova = [
 77 |         'OSCC_15',
 78 |         'OSCC_13',
 79 |         'OSCC_14',
 80 |         'OSCC_17',
 81 |         'OSCC_12',
 82 |         'OSCC_16',
 83 |         'OSCC_11'
 84 |     ]
 85 | 
 86 |     validate_csv_file_mi = [
 87 |         'OSCC_15',
 88 |         'OSCC_13',
 89 |         'OSCC_14',
 90 |         'OSCC_17',
 91 |         'OSCC_12',
 92 |         'OSCC_16',
 93 |         'OSCC_11'
 94 |     ]
 95 |     sample_name_mi = [
 96 |         'OSCC_15',
 97 |         'OSCC_13',
 98 |         'OSCC_14',
 99 |         'OSCC_17',
100 |         'OSCC_12',
101 |         'OSCC_16',
102 |         'OSCC_11'
103 |     ]
104 | 
105 |     path_nova = argv[1]
106 |     path_mi = argv[2]
107 |     nova_mi_merged_csv_file = argv[3]
108 |     nova_mi_merged_csv_file_dedup = argv[4]
109 | 
110 |     #also modified the cell names in sub merged csvs!
111 | 
112 |     for n in range(0,len(validate_csv_file_nova)):
113 |         UMI_bac_list = UMI_bac_list+read_and_mkdic(path_nova + '/' + validate_csv_file_nova[n]+'.gex.filtered_matrix.validate.csv', 'sample_'+sample_name_nova[n])
114 |     
115 |     for n in range(0,len(validate_csv_file_mi)):
116 |         UMI_bac_list = UMI_bac_list+read_and_mkdic(path_mi + '/' + validate_csv_file_mi[n]+'.16s.filtered_matrix.validate.csv', 'sample_'+sample_name_mi[n])
117 |     
118 |     count_dict = count_elements(UMI_bac_list)
119 |     minux_matrix(count_dict,nova_mi_merged_csv_file,nova_mi_merged_csv_file_dedup)
120 | 
121 | 
122 | 


--------------------------------------------------------------------------------
/patient_samples/patient_samples_16s_pipeline.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | ml CellRanger/6.1.1
  3 | 
  4 | ml BEDTools/2.29.2-GCC-9.3.0
  5 | ml SAMtools/1.16.1-GCC-11.2.0
  6 | ml FastQC/0.11.9-Java-11
  7 | ml Trimmomatic/0.39-Java-11
  8 | ml picard/2.21.6-Java-11
  9 | 
 10 | 
 11 | ml GATK/4.1.3.0-GCCcore-8.3.0-Java-1.8
 12 | 
 13 | ml Python
 14 | ml Pysam
 15 | 
 16 | `raw_data_folder` # the folder containing Cellranger mkfastq output folders
 17 | `root` # working directory
 18 | `pathseqdb` # Pathseq database
 19 | `cellrangerdb` # Cellranger database
 20 | `gex_bam_path` # barcodes.tsv.gz from GEX is used as a 'whitelist' for real cells
 21 | 
 22 | root=${workdir}
 23 | # Run cellranger count
 24 | cd ${workdir}
 25 | mkdir cellranger_count
 26 | cd cellranger_count
 27 | for folder in ${raw_data_folder}/*
 28 | do
 29 | folder_name=${folder##*/}
 30 | path=${folder}
 31 | cellranger count \
 32 | --id=${folder_name} \
 33 | --transcriptome=${cellrangerdb} \
 34 | --fastqs=${path} \
 35 | --sample=${folder_name}
 36 | done
 37 | 
 38 | cd ${workdir}
 39 | mkdir split_reads
 40 | cd split_reads
 41 | 
 42 | # convert cellranger bam file to fastqs
 43 | for folder in ${workdir}/cellranger_count/*
 44 | do
 45 | folder_name=${folder##*/}
 46 | file=${folder}/outs/possorted_genome_bam.bam
 47 | echo ${file}
 48 | 
 49 | samplename=${folder_name}
 50 | bedtools bamtofastq -i ${folder}/outs/possorted_genome_bam.bam \
 51 |                       -fq ${samplename}.r1.fq \
 52 |                       -fq2 ${samplename}.r2.fq
 53 | done
 54 | 
 55 | # run fastqc before trimmomatic
 56 | mkdir ${root}/preqc
 57 | fastqc \
 58 | -o ${root}/preqc \
 59 | ${root}/split_reads/*.fq
 60 | 
 61 | # run trimmomatic on R1 
 62 | cd ${root}/split_reads
 63 | mkdir trim
 64 | for str in *r1.fq
 65 | do
 66 | # adjust -threads to number of cores you would like to use
 67 | java -jar $EBROOTTRIMMOMATIC/trimmomatic-0.39.jar SE \
 68 | -threads 36 \
 69 | ${str} \
 70 | trim/${str}.SE_trim.fq \
 71 | ILLUMINACLIP:$EBROOTTRIMMOMATIC/adapters/TruSeq3-PE-2.fa:2:30:10 \
 72 | LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36 HEADCROP:15
 73 | done
 74 | 
 75 | # run fastqc after trimmomatic 
 76 | cd trim
 77 | mkdir ${root}/postqc
 78 | fastqc \
 79 | -o ${root}/postqc \
 80 | *.SE_trim.fq
 81 | 
 82 | mkdir ${workdir}/ubams_r1
 83 | cd ${root}/split_reads/trim
 84 | 
 85 | # convert R1 to ubam file in order to run Pathseq
 86 | for file in *SE_trim.fq
 87 | do
 88 | java -Xmx700G -jar $EBROOTPICARD/picard.jar FastqToSam \
 89 |     FASTQ=${file} \
 90 |     OUTPUT=${file}.bam \
 91 |     READ_GROUP_NAME=16s \
 92 |     SAMPLE_NAME=16s
 93 | 
 94 | # move and rename generated ubam files
 95 | mv ${file}.bam ${workdir}/ubams_r1
 96 | done
 97 | 
 98 | ubam_folder=${workdir}/ubams_r1
 99 | outpath=${workdir}/pathseq_r1
100 | mkdir ${outpath}
101 | 
102 | cd ${ubam_folder}
103 | 
104 | # Pathseq to identify pathogen-associated cells
105 | for each_file in *.bam
106 | do
107 | echo ${each_file}
108 | filename="${each_file%.*}"
109 | filename="${filename%.*}"
110 | filename="${filename%.*}"
111 | samplename=${filename}
112 | echo ${samplename}
113 | # PathSeq process # Please adjust "-Xmx750g" based on the memory you want to use. Adjust --min-score-identity and --min-clipped-read-length based on your samples
114 | gatk --java-options "-Xmx750g" PathSeqPipelineSpark \
115 |     --input ${each_file} \
116 |     --filter-bwa-image ${pathseqdb}/pathseq_host.fa.img \
117 |     --kmer-file ${pathseqdb}/pathseq_host.bfi \
118 |     --min-clipped-read-length 60 \
119 |     --microbe-fasta ${pathseqdb}/pathseq_microbe.fa \
120 |     --microbe-bwa-image ${pathseqdb}/pathseq_microbe.fa.img \
121 |     --taxonomy-file ${pathseqdb}/pathseq_taxonomy.db \
122 |     --output ${outpath}/${samplename}.pathseq.complete.bam \
123 |     --scores-output ${outpath}/${samplename}.pathseq.complete.txt.csv \
124 |     --is-host-aligned false \
125 |     --filter-duplicates false \
126 |     --min-score-identity .7
127 | done
128 | 
129 | # Python script to produce a bacteria UMI matrix (based on valid GEX cell)
130 | bam_path=${workdir}/cellranger_count
131 | pathseq_path=${workdir}/pathseq_r1
132 | out_path=${root}/python
133 | mkdir ${out_path}
134 | cd ${bam_path}
135 | # barcodes.tsv.gz from GEX is used as a 'whitelist' for real cells. 
136 | for each_sample in *
137 | do
138 | echo ${each_sample}
139 | echo ${gex_bam_path}
140 | python INVADEseq.py \
141 | ${bam_path}/${each_sample}/outs/possorted_genome_bam.bam \
142 | ${each_sample} \
143 | ${gex_bam_path}/${each_sample}/outs/filtered_feature_bc_matrix/barcodes.tsv.gz \
144 | ${pathseq_path}/${each_sample}.r1.fq.pathseq.complete.bam \
145 | ${pathseq_path}/${each_sample}.r1.fq.pathseq.complete.txt.csv \
146 | ${out_path}/${each_sample}.16s.filtered_matrix.readname \
147 | ${out_path}/${each_sample}.16s.filtered_matrix.unmap_cbub.bam \
148 | ${out_path}/${each_sample}.16s.filtered_matrix.unmap_cbub.fasta \
149 | ${out_path}/${each_sample}.16s.filtered_matrix.list \
150 | ${out_path}/${each_sample}.16s.raw.filtered_matrix.readnamepath \
151 | ${out_path}/${each_sample}.16s.filtered_matrix.genus.cell \
152 | ${out_path}/${each_sample}.16s.filtered_matrix.genus.csv \
153 | ${out_path}/${each_sample}.16s.filtered_matrix.validate.csv
154 | done
155 | 
156 | 
157 | 
158 | 


--------------------------------------------------------------------------------
/patient_samples/patient_samples_GEX_pipeline.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # The preprocessing pipeline for Patient samples single-cell GEX data 
 4 | ml CellRanger/6.1.1
 5 | ml GATK/4.1.3.0-GCCcore-8.3.0-Java-1.8
 6 | 
 7 | ml Python
 8 | ml Pysam
 9 | 
10 | `raw_data_folder` # the folder containing Cellranger mkfastq output folders
11 | `root` # working directory
12 | `pathseqdb` # Pathseq database
13 | `cellrangerdb` # Cellranger database
14 | 
15 | cd ${root}
16 | mkdir cellranger_count
17 | cd cellranger_count
18 | 
19 | # Cellranger count processing
20 | for folder in ${raw_data_folder}/*
21 | do
22 | folder=${folder}
23 | folder_name=${folder##*/}
24 | path=${folder}
25 | echo ${path}
26 | cellranger count \
27 | --id=${folder_name} \
28 | --transcriptome=${cellrangerdb} \
29 | --fastqs=${path} \
30 | --sample=${folder_name}
31 | done
32 | 
33 | # PathSeq pipeline
34 | outpath=${root}/pathseq
35 | mkdir ${outpath}
36 | # PathSeq process # Please adjust "-Xmx750g" based on the memory you want to use. Adjust --min-score-identity and --min-clipped-read-length based on your samples
37 | # 
38 | for folder in *
39 | do
40 | folder_name=${folder##*/}
41 | file=${folder}/outs/possorted_genome_bam.bam
42 | samplename=${folder_name}
43 | echo ${samplename}
44 | gatk --java-options "-Xmx750g" PathSeqPipelineSpark \
45 |     --input ${file} \
46 |     --filter-bwa-image ${pathseqdb}/pathseq_host.fa.img \
47 |     --kmer-file ${pathseqdb}/pathseq_host.bfi \
48 |     --min-clipped-read-length 60 \
49 |     --microbe-fasta ${pathseqdb}/pathseq_microbe.fa \
50 |     --microbe-bwa-image ${pathseqdb}/pathseq_microbe.fa.img \
51 |     --taxonomy-file ${pathseqdb}/pathseq_taxonomy.db \
52 |     --output ${outpath}/${samplename}.pathseq.complete.bam \
53 |     --scores-output ${outpath}/${samplename}.pathseq.complete.csv \
54 |     --is-host-aligned false \
55 |     --filter-duplicates false \
56 |     --min-score-identity .7
57 | done
58 | 
59 | # Python script to generate bacteria matrix
60 | bam_path=${root}/cellranger_count
61 | pathseq_path=${root}/pathseq
62 | out_path=${root}/python
63 | mkdir ${out_path}
64 | cd ${bam_path}
65 | 
66 | for each_sample in *
67 | do
68 | echo ${each_sample}
69 | python INVADEseq.py \
70 | ${bam_path}/${each_sample}/outs/possorted_genome_bam.bam \
71 | ${each_sample} \
72 | ${bam_path}/${each_sample}/outs/filtered_feature_bc_matrix/barcodes.tsv.gz \
73 | ${pathseq_path}/${each_sample}.pathseq.complete.bam \
74 | ${pathseq_path}/${each_sample}.pathseq.complete.csv \
75 | ${out_path}/${each_sample}.gex.filtered_matrix.readname \
76 | ${out_path}/${each_sample}.gex.filtered_matrix.unmap_cbub.bam \
77 | ${out_path}/${each_sample}.gex.filtered_matrix.unmap_cbub.fasta \
78 | ${out_path}/${each_sample}.gex.filtered_matrix.list \
79 | ${out_path}/${each_sample}.gex.raw.filtered_matrix.readnamepath \
80 | ${out_path}/${each_sample}.gex.filtered_matrix.genus.cell \
81 | ${out_path}/${each_sample}.gex.filtered_matrix.genus.csv \
82 | ${out_path}/${each_sample}.gex.filtered_matrix.validate.csv
83 | done
84 | 
85 | 


--------------------------------------------------------------------------------
/patient_samples/patient_samples_Seurat.r:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env Rscript
  2 | library(harmony)
  3 | library(Seurat)
  4 | library(ggplot2)
  5 | library(SingleR)
  6 | library(celldex)
  7 | library(msigdbr)
  8 | library(cowplot)
  9 | library(dplyr)
 10 | hpca.se <- celldex::HumanPrimaryCellAtlasData()
 11 | library("enrichplot")
 12 | library(ggupset)
 13 | library(gridExtra)
 14 | library(pheatmap)
 15 | options(bitmapType = 'cairo')
 16 | knitr::opts_chunk$set(dev="CairoPNG")
 17 | library(org.Hs.eg.db)
 18 | 
 19 | sample_OSCC_17.data<-Read10X(data.dir = "OSCC_17/outs/filtered_feature_bc_matrix")
 20 | sample_OSCC_17 = CreateSeuratObject(counts = sample_OSCC_17.data, project = "Sample_OSCC_17", min.cells = 3, min.features = 200)
 21 | sample_OSCC_17[["percent.mt"]] <- PercentageFeatureSet(sample_OSCC_17, pattern = "^MT-")
 22 | sample_OSCC_12.data<-Read10X(data.dir = "OSCC_12/outs/filtered_feature_bc_matrix")
 23 | sample_OSCC_12 = CreateSeuratObject(counts = sample_OSCC_12.data, project = "Sample_OSCC_12", min.cells = 3, min.features = 200)
 24 | sample_OSCC_12[["percent.mt"]] <- PercentageFeatureSet(sample_OSCC_12, pattern = "^MT-")
 25 | sample_OSCC_13.data<-Read10X(data.dir = "OSCC_13/outs/filtered_feature_bc_matrix")
 26 | sample_OSCC_13 = CreateSeuratObject(counts = sample_OSCC_13.data, project = "Sample_OSCC_13", min.cells = 3, min.features = 200)
 27 | sample_OSCC_13[["percent.mt"]] <- PercentageFeatureSet(sample_OSCC_13, pattern = "^MT-")
 28 | sample_OSCC_14.data<-Read10X(data.dir = "OSCC_14/outs/filtered_feature_bc_matrix")
 29 | sample_OSCC_14 = CreateSeuratObject(counts = sample_OSCC_14.data, project = "Sample_OSCC_14", min.cells = 3, min.features = 200)
 30 | sample_OSCC_14[["percent.mt"]] <- PercentageFeatureSet(sample_OSCC_14, pattern = "^MT-")
 31 | sample_OSCC_15.data<-Read10X(data.dir = "OSCC_15/outs/filtered_feature_bc_matrix")
 32 | sample_OSCC_15 = CreateSeuratObject(counts = sample_OSCC_15.data, project = "Sample_OSCC_15", min.cells = 3, min.features = 200)
 33 | sample_OSCC_15[["percent.mt"]] <- PercentageFeatureSet(sample_OSCC_15, pattern = "^MT-")
 34 | sample_OSCC_11.data<-Read10X(data.dir = "OSCC_11/outs/filtered_feature_bc_matrix")
 35 | sample_OSCC_11 = CreateSeuratObject(counts = sample_OSCC_11.data, project = "Sample_OSCC_11", min.cells = 3, min.features = 200)
 36 | sample_OSCC_11[["percent.mt"]] <- PercentageFeatureSet(sample_OSCC_11, pattern = "^MT-")
 37 | sample_OSCC_16.data<-Read10X(data.dir = "OSCC_16/outs/filtered_feature_bc_matrix")
 38 | sample_OSCC_16 = CreateSeuratObject(counts = sample_OSCC_16.data, project = "Sample_OSCC_16", min.cells = 3, min.features = 200)
 39 | sample_OSCC_16[["percent.mt"]] <- PercentageFeatureSet(sample_OSCC_16, pattern = "^MT-")
 40 | # merge, cluster and Harmony integration
 41 | sample.headneck <- merge(sample_OSCC_17, y = c(sample_OSCC_12,sample_OSCC_13,sample_OSCC_14,sample_OSCC_15,sample_OSCC_11,sample_OSCC_16), add.cell.ids = c('Sample_OSCC_17','Sample_OSCC_12','Sample_OSCC_13_T','Sample_OSCC_14_T','Sample_OSCC_15_T','Sample_OSCC_11',"Sample_OSCC_16"), project = "SAMPLE.INTEGRATED")
 42 | 
 43 | VlnPlot(sample.headneck, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
 44 | sample.headneck <- NormalizeData(object = sample.headneck, normalization.method = "LogNormalize", scale.factor = 10000)
 45 | GetAssay(sample.headneck,assay = "RNA")
 46 | sample.headneck <- FindVariableFeatures(object = sample.headneck, selection.method = "vst", nfeatures = 5000)
 47 | top20 <- head(x = VariableFeatures(object = sample.headneck), 20)
 48 | plot1 <- VariableFeaturePlot(object = sample.headneck)
 49 | plot2 <- LabelPoints(plot = plot1, points = top20, repel = TRUE)
 50 | plot1+plot2
 51 | all.genes <- rownames(sample.headneck)
 52 | sample.headneck<- ScaleData(object = sample.headneck,features = all.genes)
 53 | sample.headneck <- RunPCA(object = sample.headneck,pc.genes = VariableFeatures(sample.headneck))
 54 | ElbowPlot(sample.headneck)
 55 | sample.headneck <- RunHarmony(sample.headneck, group.by.vars="orig.ident",assay.use='RNA')
 56 | names(sample.headneck@reductions)
 57 | sample.headneck <- RunUMAP(sample.headneck,  dims = 1:20, 
 58 |                      reduction = "harmony",seed.use=111)
 59 | DimPlot(sample.headneck,reduction = "umap",label=T ) 
 60 | DimPlot(sample.headneck,reduction = "umap",label=F ) + ggtitle("Integrated")+theme(plot.title = element_text(hjust = 0.5))
 61 | 
 62 | sample.headneck <- FindNeighbors(sample.headneck, reduction = "harmony",dims = 1:20)
 63 | sample.headneck <- FindClusters(sample.headneck, resolution = 0.5)
 64 | table(sample.headneck@meta.data$seurat_clusters)
 65 | DimPlot(sample.headneck,reduction = "umap",label=T)  
 66 | DimPlot(sample.headneck,reduction = "umap",label=T,
 67 |         group.by = 'orig.ident') 
 68 | 
 69 | # SingleR annotation
 70 | ref <- HumanPrimaryCellAtlasData()
 71 | seuratObj_annot <- as.SingleCellExperiment(sample.headneck)
 72 | library(SingleR)
 73 | pred <- SingleR(test=seuratObj_annot, ref=ref, labels=ref$label.fine)
 74 | head(pred)
 75 | plotScoreHeatmap(pred)
 76 | tab <- table(Assigned=pred$pruned.labels, Cluster=seuratObj_annot@colData$seurat_clusters)
 77 | # Adding a pseudo-count of 10 to avoid strong color jumps with just 1 cell.
 78 | 
 79 | pheatmap(log2(tab+10), color=colorRampPalette(c("white", "blue"))(101))
 80 | pred2 <- SingleR(test=seuratObj_annot, ref=ref, cluster=seuratObj_annot@colData$seurat_clusters, labels=ref$label.fine)
 81 | sample.headneck.backup = sample.headneck
 82 | sample.headneck@meta.data$cell.type.fine = sample.headneck@meta.data$seurat_clusters
 83 | sample.headneck[["SingleR.cluster.labels"]] <- 
 84 |         pred2$labels[match(sample.headneck[[]][["seurat_clusters"]], rownames(pred2))]
 85 | 
 86 | Idents(sample.headneck) <- "SingleR.cluster.labels"
 87 | sample.headneck <- RenameIdents(sample.headneck, 
 88 |     'Epithelial_cells:bronchial' = "Epithelial_cells",
 89 |     'Epithelial_cells:bladder' = "Epithelial_cells"
 90 |     )
 91 | 
 92 | # CopyKAT prediction
 93 | sample.headneck.exp.rawdata <- as.matrix(sample.headneck@assays$RNA@counts)
 94 | sample.headneck.copykat.test <- copykat(rawmat=sample.headneck.exp.rawdata, id.type="S", ngene.chr=3, win.size=25, KS.cut=0.1, sam.name="sample.headneck", distance="euclidean", norm.cell.names="", n.cores=34,output.seg="FLASE")
 95 | # Add CopyKAT metadata
 96 | sample.headneck_copykat_prediction_csv = 'sample.headneck_copykat_prediction.txt'
 97 | sample.headneck_copykat_prediction<-read.csv(sample.headneck_copykat_prediction_csv,sep='\t',header=TRUE,row.names=1)
 98 | sample.headneck<-AddMetaData(sample.headneck, sample.headneck_copykat_prediction)
 99 | 
100 | # add pathogen UMI metadata
101 | umi_table_csv = 'csv_novami_mix_dedup_rename.csv'
102 | umi_table<-read.csv(umi_table_csv,sep=',',header=TRUE,row.names = 1)
103 | umi_table[is.na(umi_table)] <- 0
104 | umi_table$Total <- rowSums(umi_table)
105 | umi_table[umi_table==0] <- NA
106 | sample.headneck<-AddMetaData(sample.headneck, umi_table)
107 | #saveRDS(sample.headneck, file = sample.headneck.rds)
108 | 


--------------------------------------------------------------------------------