├── .gitignore ├── Base_edit_2 ├── Additional_BaseEdit_process_list.tsv ├── All_final_result_summation.py ├── BaseEdit_freq_crispresso.py ├── Each_base_summary.py ├── Indel_contr_dict_making.py ├── Indel_frequency_calculator.py ├── Kill_jobs.sh ├── MakeUserFolder.sh ├── Make_user_folder.sh ├── README.md ├── Run_BaseEdit_freq.py ├── Run_cmd.sh ├── Run_each_base_summary.sh ├── Run_sequence_freq.sh ├── Sequence_freq.py ├── Sequence_freq_add.py ├── Split_file.py ├── Sum_all_alt_freq.py ├── Summary_all_trim.py └── __init__.py ├── Core ├── CoreSystem.py ├── CoreSystem.pyc ├── __init__.py └── __init__.pyc ├── EDNAFULL ├── Indel_searcher_2 ├── BaseEdit_input_converter.py ├── Debugger.py ├── Flash_pair_read_merge.py ├── Indel_frequency_calculator.py ├── Indel_normalization.py ├── Indel_normalization.pyc ├── Indel_searcher_crispresso_hash.py ├── Input │ └── JaeWoo │ │ ├── FASTQ │ │ └── JaeWoo_test_samples │ │ │ └── Test_sample │ │ │ └── eCas9_rep1_D4.ext.fastq │ │ └── Reference │ │ └── JaeWoo_test_samples │ │ └── Cas9D7 │ │ ├── Barcode.txt │ │ ├── Reference_sequence.txt │ │ └── Target_region.txt ├── Kill_jobs.sh ├── Make_user_folder.sh ├── README.md ├── Run_cmd.sh ├── Run_converter.sh ├── Run_flash.sh ├── Run_indel_searcher.py ├── Run_random_barcode.sh ├── Summary_Random_barcode.py ├── Summary_all_trim.py └── User │ └── JaeWoo │ └── JaeWoo_test_samples.txt ├── LICENSE ├── PythonPath.txt └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # 
before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /Base_edit_2/Additional_BaseEdit_process_list.tsv: -------------------------------------------------------------------------------- 1 | #ABE_Rep1_1_2_TF4 A,T ABE_Rep1_1_2_TF4_AtoG_Summary.txt 2 | #ABE_Rep1_1_2_TF4 A,C ABE_Rep1_1_2_TF4_AtoG_Summary.txt 3 | ABE_Rep1_1_2_TF4 A,G ABE_Rep1_1_2_TF4_AtoG_Summary.txt 4 | -------------------------------------------------------------------------------- /Base_edit_2/All_final_result_summation.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import os, sys 4 | import pandas as pd 5 | 6 | from pdb import set_trace 7 | 8 | strProjectList = sys.argv[1] 9 | #strProjectList = 
def Summation_all_final_result(strProjectListPath=None):
    """Sum the merged target-result tables of every sample in a project list.

    For each sample named in the project list, the single file found in
    ``./Output/<sample>/Summary/Merge_target_result/`` is loaded.  All tables
    are concatenated, cells holding a single space are treated as 0, and the
    count columns (4th column onwards) are summed per (Sample, Barcode, Ref).
    The result is written tab-separated to
    ``./Output/Summation_<file name of the project list>``.

    Args:
        strProjectListPath: path of the project list, one sample name per
            line.  Defaults to the module-level ``strProjectList``.

    Exits with status 1 when a sample directory holds more than one file.
    """
    if strProjectListPath is None:
        strProjectListPath = strProjectList

    with open(strProjectListPath) as Input:
        # Blank lines would otherwise resolve to './Output//Summary/...'.
        listSample = [strLine.strip() for strLine in Input if strLine.strip()]

    listdfResult = []
    for strSample in listSample:
        strFinalResultDir = './Output/%s/Summary/Merge_target_result/' % strSample
        listFinalResultFile = os.listdir(strFinalResultDir)

        if len(listFinalResultFile) > 1:
            print('I expected one file, but there are more. check the target base change file')
            sys.exit(1)

        for strFinalResultFile in listFinalResultFile:
            print(strFinalResultFile)
            strFinalResultPath = os.path.join(strFinalResultDir, strFinalResultFile)
            listdfResult.append(pd.read_table(strFinalResultPath, low_memory=False))

    # ignore_index: every per-sample table starts at row label 0, and the
    # column-wise concat below must not have to align duplicate labels.
    dfAll = pd.concat(listdfResult, ignore_index=True)
    dfForw = dfAll.iloc[:, 0:3]
    dfReve = dfAll.iloc[:, 3:].replace(' ', '0').astype('int64')
    dfAllResult = pd.concat([dfForw, dfReve], axis=1).groupby(['Sample', 'Barcode', 'Ref']).sum()
    dfAllResult.reset_index(inplace=True)

    # basename: the list may be given with a directory component, which must
    # not leak into the output file name.
    dfAllResult.to_csv('./Output/Summation_' + os.path.basename(strProjectListPath), sep='\t')
def SummationSubIndel(strProjectListPath=None):
    """Sum the indel counts of all '*sub*' files over every sample in the list.

    Every file whose name contains 'sub' under ``./Output/<sample>/result`` is
    read as tab-separated rows of ``name, barcode, count, patterns`` where
    ``patterns`` is a comma-separated list.  Counts are summed per
    (name, barcode, unique pattern set) and written to
    ``./Output/Summation_<list name without .txt>_sub_indel.txt``.

    Args:
        strProjectListPath: path of the project list, one sample name per
            line.  Defaults to the module-level ``strProjectList``.
    """
    if strProjectListPath is None:
        strProjectListPath = strProjectList

    with open(strProjectListPath) as Input:
        listSample = [strLine.strip() for strLine in Input if strLine.strip()]

    dictResult = {}
    for strSample in listSample:
        print(strSample)
        strSubIndelDir = './Output/%s/result' % strSample

        for strSubIndelFile in os.listdir(strSubIndelDir):
            if 'sub' not in strSubIndelFile:
                continue

            with open(os.path.join(strSubIndelDir, strSubIndelFile)) as SubIndel:
                for strRow in SubIndel:
                    listCol = strRow.replace('\n', '').split('\t')
                    # sorted(): set order is arbitrary, so the joined pattern
                    # (and therefore the grouping key) must not depend on it.
                    strIndelPattern = ''.join(sorted(set(listCol[3].split(','))))
                    # Tuple key instead of a '-'-joined string: names or
                    # barcodes containing '-' used to be split into extra columns.
                    tupKey = (listCol[0], listCol[1], strIndelPattern)
                    dictResult[tupKey] = dictResult.get(tupKey, 0) + int(listCol[2])

    strOutputPath = ('./Output/Summation_'
                     + os.path.basename(strProjectListPath).replace('.txt', '')
                     + '_sub_indel.txt')

    with open(strOutputPath, 'w') as Output:
        for tupKey, intCount in dictResult.items():
            Output.write('\t'.join(tupKey) + '\t' + str(intCount) + '\n')


def ConfirmValidation(strProjectListPath=None):
    """Debugging aid: print the column totals over the first two samples.

    Loads every file in ``./Output/<sample>/Summary/Merge_target_result/`` for
    the first two samples of the project list, sums the count columns per
    (Sample, Barcode, Ref) and prints the grand total of each count column.

    Args:
        strProjectListPath: path of the project list.  Defaults to the
            module-level ``strProjectList``.
    """
    if strProjectListPath is None:
        strProjectListPath = strProjectList

    with open(strProjectListPath) as Input:
        listSample = [strLine.strip() for strLine in Input if strLine.strip()][:2]

    listdfResult = []
    for strSample in listSample:
        print(strSample)
        strFinalResultDir = './Output/%s/Summary/Merge_target_result/' % strSample

        for strFinalResultFile in os.listdir(strFinalResultDir):
            print(strFinalResultFile)
            strFinalResultPath = os.path.join(strFinalResultDir, strFinalResultFile)
            listdfResult.append(pd.read_table(strFinalResultPath, low_memory=False))

    dfAll = pd.concat(listdfResult, ignore_index=True)
    dfForw = dfAll.iloc[:, 0:3]
    dfReve = dfAll.iloc[:, 3:].replace(' ', '0').astype('int64')
    dfAllResult = pd.concat([dfForw, dfReve], axis=1).groupby(['Sample', 'Barcode', 'Ref']).sum()
    dfAllResult.reset_index(inplace=True)
    print(dfAllResult.iloc[:, 3:].sum().values.tolist())


def Main():
    """Write both summation tables for the project list given on the command line."""
    Summation_all_final_result()
    SummationSubIndel()
    # ConfirmValidation() is a manual debugging aid and is not run here.


# Guarded so that importing this module does not start the summation.
if __name__ == '__main__':
    Main()
class clsParameter(object):

    """
    Command-line parameters of BaseEdit_freq_crispresso.py.

    ./BaseEdit_freq_crispresso.py {forw} {GapO} {GapE} {barcode} {ref} {target_window} {indel_check_pos}
                                  {target_ref_alt} {outdir} {file_name} {PAM_seq} {PAM_pos} {Guide_pos} {log}

    Ranges ({target_window}, {indel_check_pos}, {PAM_pos}, {Guide_pos}) are
    given as 'start-end' and kept as two-element lists of strings;
    {target_ref_alt} is 'ref,alt'.  The reference is cut so that it starts at
    the barcode.  The EDNAFULL matrix path is fixed to '../EDNAFULL' relative
    to the working directory.

    NOTE(review): the historical usage line listed '{ednafull} {log}' as the
    last two arguments, but the code reads the log path from sys.argv[14] --
    confirm against the launcher script.
    """

    # sys.argv[0] plus the 14 positional arguments read in __init__.
    intNumOfArgv = 15

    strManual = """
        Usage:

        python ./BaseEdit_freq_crispresso.py forw GapO GapE barcode ref target_window indel_check_pos
               target_ref_alt outdir file_name PAM_seq PAM_pos Guide_pos log

        target_window, indel_check_pos, PAM_pos, Guide_pos : start-end
        target_ref_alt                                     : ref,alt  (e.g. A,G)
        """

    def __init__(self):

        if len(sys.argv) < self.intNumOfArgv:
            print(self.strManual)
            # No argument at all is a plain help request (exit status 0, as
            # before); an incomplete argument list is an error.
            if len(sys.argv) <= 1:
                sys.exit()
            sys.exit(1)

        self.strForwPath = sys.argv[1]
        self.floOg = float(sys.argv[2])
        self.floOe = float(sys.argv[3])
        self.strBarcode = sys.argv[4]

        strRef = sys.argv[5]
        iBarcodePos = strRef.find(self.strBarcode)
        if iBarcodePos < 0:
            raise ValueError('Barcode %s was not found in the reference sequence.' % self.strBarcode)
        self.strRef = strRef[iBarcodePos:]  ## 'ACTG'ACGACACACGCAT, leftside bases are redundant.

        self.listTargetWindow = sys.argv[6].split('-')
        self.listIndelCheckPos = sys.argv[7].split('-')
        self.listTargetRefAlt = sys.argv[8].split(',')
        self.strOutputDir = sys.argv[9]
        self.strFileName = sys.argv[10]
        self.strPamSeq = sys.argv[11]
        self.listPamPos = sys.argv[12].split('-')
        self.listGuidePos = sys.argv[13].split('-')
        self.strEDNAFULL = os.path.abspath('../EDNAFULL')
        self.strLogPath = sys.argv[14]
class clsBaseEditParser():
    """Align barcode-carrying reads to the reference and tally indels and base edits."""

    # Symbols counted as real bases; everything else ('-', 'N', ...) is not.
    tupNucleotide = ('A', 'C', 'G', 'T')

    def __init__(self, InstParameter):
        self.strForwPath = InstParameter.strForwPath
        self.strRef = InstParameter.strRef
        self.strBarcode = InstParameter.strBarcode
        self.strEDNAFULL = InstParameter.strEDNAFULL
        self.floOg = InstParameter.floOg
        self.floOe = InstParameter.floOe
        self.listIndelCheckPos = InstParameter.listIndelCheckPos
        self.listTargetWindow = InstParameter.listTargetWindow

    def OpenSequenceFiles(self):
        """Return every line of the input file, upper-cased and without its newline."""
        with open(self.strForwPath) as fa_1:
            return [sRow.replace('\n', '').upper() for sRow in fa_1]

    @classmethod
    def _TrimAlignment(cls, sRef_needle_ori, sQuery_needle_ori):
        """Cut both aligned strings to the span between the first and last reference base.

        e.g. ref           ------AAAGGCTACGATCTGCG------
             query         AAAAAAAAATCGCTCTCGCTCTCCGATCT
             trimmed ref   AAAGGCTACGATCTGCG
             trimmed query AAATCGCTCTCGCTCTC
        """
        iStart = 0
        iEnd = len(sRef_needle_ori)

        for i, sRef_nucle in enumerate(sRef_needle_ori):
            if sRef_nucle in cls.tupNucleotide:
                iStart = i
                break

        for i, sRef_nucle in enumerate(reversed(sRef_needle_ori)):
            if sRef_nucle in cls.tupNucleotide:
                # forward 0 1 2, len 3 ; reverse index 2 -> 3 - (2 + 1) = 0
                iEnd = len(sRef_needle_ori) - (i + 1)
                break

        # The query is cut with the same offsets as the reference so that the
        # two strings stay column-aligned (the query used to be cut from 0).
        return sRef_needle_ori[iStart:iEnd + 1], sQuery_needle_ori[iStart:iEnd + 1]

    @classmethod
    def _FindIndels(cls, sRef_needle, sQuery_needle):
        """Return ([[pos, length], ...] insertions, [[pos, length], ...] deletions).

        An insertion (gap run in the reference) is reported at the number of
        reference bases seen before it; a deletion (gap run in the query) at
        the query position counter, which is advanced by the deleted length.
        A gap run that reaches the end of the alignment is not reported.
        """
        iNeedle_match_pos_ref = 0
        iNeedle_match_pos_query = 0
        iNeedle_insertion = 0
        iNeedle_deletion = 0
        lInsertion_in_read = []  # e.g. [[100, 1], [119, 13]]
        lDeletion_in_read = []   # e.g. [[97, 1], [102, 3]]

        for sRef_nucle, sQuery_nucle in zip(sRef_needle, sQuery_needle):
            if sRef_nucle == '-':
                iNeedle_insertion += 1

            if sQuery_nucle == '-':
                iNeedle_deletion += 1

            if sRef_nucle in cls.tupNucleotide:
                if iNeedle_insertion:
                    lInsertion_in_read.append([iNeedle_match_pos_ref, iNeedle_insertion])
                    iNeedle_insertion = 0
                iNeedle_match_pos_ref += 1

            if sQuery_nucle in cls.tupNucleotide:
                if iNeedle_deletion:
                    lDeletion_in_read.append([iNeedle_match_pos_query, iNeedle_deletion])
                    iNeedle_match_pos_query += iNeedle_deletion
                    iNeedle_deletion = 0
                iNeedle_match_pos_query += 1

        return lInsertion_in_read, lDeletion_in_read

    @staticmethod
    def _CountBaseEdits(sRef_needle, sQuery_needle, iTarget_start, iTarget_end):
        """Count substituted query bases per position of the target window.

        Returns (lBaseEdit, iBase_edit_event): one A/C/G/T counter per window
        position, and 1 when at least one substitution was counted.  Gap
        columns and query symbols other than A/C/G/T (e.g. 'N') are skipped.
        """
        lBaseEdit = [OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)])
                     for _ in range(iTarget_end - iTarget_start)]
        iBase_edit_event = 0

        for i, (sRef_base, sQuery_base) in enumerate(zip(sRef_needle[iTarget_start:iTarget_end],
                                                         sQuery_needle[iTarget_start:iTarget_end])):
            if sRef_base == '-' or sQuery_base == '-':
                continue

            # Membership test instead of "!= 'N'": any other ambiguity code
            # used to raise an uncaught KeyError and abort the whole file.
            if sRef_base != sQuery_base and sQuery_base in lBaseEdit[i]:
                iBase_edit_event = 1
                lBaseEdit[i][sQuery_base] += 1

        return lBaseEdit, iBase_edit_event

    def CalculateBaseEditFreq(self, lQuery_seq=None):
        """Align every read containing the barcode and collect indel / base-edit data.

        Args:
            lQuery_seq: list of read sequences (strings).

        Returns:
            {barcode: [# total, # ins, # del, # complex, [], [], [], [],
                       indel info, per-read base-edit counters, base-edited reads]}
            Index 8 holds [ref, raw read, ['20M2I', ...], '', aligned ref,
            aligned read] for every aligned read, index 9 one list of
            per-position A/C/G/T counters for every indel-free read, and
            index 10 the reads with at least one counted substitution.
            Indices 4-7 are never filled by this method.

        Reads without the barcode are reported with print() and skipped;
        reads whose alignment raises are logged and skipped.
        """
        if lQuery_seq is None:
            lQuery_seq = []

        lBarcode_result = [0, 0, 0, 0, [], [], [], [], [], [], []]
        dResult = {self.strBarcode: lBarcode_result}

        # Parsed once up front so that a malformed window fails immediately
        # instead of being reported (and swallowed) once per read.
        iCleavage_window_start = int(self.listIndelCheckPos[0])
        iCleavage_window_end = int(self.listIndelCheckPos[1]) - 1
        iTarget_start = int(self.listTargetWindow[0]) - 1
        iTarget_end = int(self.listTargetWindow[1])

        InstGotoh = CoreGotoh(strEDNAFULL=self.strEDNAFULL, floOg=self.floOg, floOe=self.floOe)

        for sQuery_seq_raw in lQuery_seq:

            sQuery_seq_raw = sQuery_seq_raw.replace('\r', '')

            try:
                iBarcode_start_pos = sQuery_seq_raw.index(self.strBarcode)
            except ValueError as e:
                print(e)
                continue

            ## this is not after barcode seq. including barcode
            sQuery_seq_with_barcode = sQuery_seq_raw[iBarcode_start_pos:]

            npGapIncentive = InstGotoh.GapIncentive(self.strRef)

            try:
                lResult = InstGotoh.RunCRISPResso2(sQuery_seq_with_barcode.upper(), self.strRef.upper(), npGapIncentive)
            except Exception as e:
                logging.error(e, exc_info=True)
                continue

            sQuery_needle_ori = lResult[0]
            sRef_needle_ori = lResult[1]

            sRef_needle, sQuery_needle = self._TrimAlignment(sRef_needle_ori, sQuery_needle_ori)
            lInsertion_in_read, lDeletion_in_read = self._FindIndels(sRef_needle, sQuery_needle)

            iInsert_count = 0
            iDelete_count = 0
            iComplex_count = 0
            lTarget_indel_result = []  # ['20M2I', '23M3D' ...]

            # An insertion counts when it sits inside the cleavage window.
            for iMatch_pos, iInsertion_len in lInsertion_in_read:
                if iCleavage_window_start <= iMatch_pos <= iCleavage_window_end:
                    iInsert_count = 1
                    lTarget_indel_result.append(str(iMatch_pos) + 'M' + str(iInsertion_len) + 'I')

            # A deletion counts when its span overlaps the cleavage window.
            for iMatch_pos, iDeletion_len in lDeletion_in_read:
                if iMatch_pos <= iCleavage_window_end and iCleavage_window_start <= (iMatch_pos + iDeletion_len):
                    iDelete_count = 1
                    lTarget_indel_result.append(str(iMatch_pos) + 'M' + str(iDeletion_len) + 'D')

            if iInsert_count == 1 and iDelete_count == 1:
                iComplex_count = 1
                iInsert_count = 0
                iDelete_count = 0

            ## 8: indel info ("" -> target seq, but this is not used this project.)
            lBarcode_result[8].append(
                [self.strRef, sQuery_seq_raw, lTarget_indel_result,
                 "", sRef_needle_ori, sQuery_needle_ori])

            # total matched reads, insertion, deletion, complex
            lBarcode_result[0] += 1
            lBarcode_result[1] += iInsert_count
            lBarcode_result[2] += iDelete_count
            lBarcode_result[3] += iComplex_count

            ## No indel reads only
            if iInsert_count == 0 and iDelete_count == 0 and iComplex_count == 0:
                lBaseEdit, iBase_edit_event = self._CountBaseEdits(sRef_needle, sQuery_needle,
                                                                   iTarget_start, iTarget_end)
                lBarcode_result[9].append(lBaseEdit)

                ## Processed indel filtering and store aligned alt mut read.
                if iBase_edit_event == 1:
                    lBarcode_result[10].append(
                        [self.strRef, sQuery_seq_raw, lTarget_indel_result,
                         [list(orderedDict.values()) for orderedDict in lBaseEdit],
                         sRef_needle_ori, sQuery_needle_ori])

        return dResult
class clsOutputMaker():
    """Write the alignment dumps and the per-position base-edit summaries of one input file."""

    def __init__(self, InstParameter):

        self.strForwPath = InstParameter.strForwPath
        self.strRef = InstParameter.strRef
        self.strFileName = InstParameter.strFileName
        self.strOutputDir = InstParameter.strOutputDir
        self.listTargetRefAlt = InstParameter.listTargetRefAlt
        self.listTargetWindow = InstParameter.listTargetWindow
        self.strPamSeq = InstParameter.strPamSeq
        self.listPamPos = InstParameter.listPamPos
        self.listGuidePos = InstParameter.listGuidePos

        # index name, constant variable.
        self.intNumOfTotal = 0
        self.intNumOfIns = 1
        self.intNumOfDel = 2
        self.intNumOfCom = 3
        self.intTotalFastq = 4
        self.intInsFastq = 5
        self.intDelFastq = 6
        self.intComFastq = 7
        self.intIndelInfo = 8

    @staticmethod
    def _EnsureDir(strDir):
        """Create strDir (and parents) unless it already exists."""
        try:
            os.makedirs(strDir)
        except OSError:
            # Another process may have created it in the meantime; anything
            # else (permissions, a file in the way) is a real error.
            if not os.path.isdir(strDir):
                raise

    def MakeOutput(self, dResult):
        """Write all output files for the result of CalculateBaseEditFreq.

        Args:
            dResult: {barcode: [# total, # ins, # del, # complex, [], [], [], [],
                                indel info, per-read base-edit counters,
                                base-edited reads]}

        Files written below ``<outdir>/Tmp``:
            Alignment/<file_name>_filtered_indel.txt   reads with an indel in the window
            Alignment/<file_name>_aligned_BaseEdit.txt reads with a counted substitution
            All/<file_name>_Summary.txt                header + one count row per base (A, C, G, T);
                                                       header only when no read was counted
            Target/<file_name>_<ref>to<alt>_Summary.txt counts of the alt base, blank (' ')
                                                       where the reference base is not <ref>
        """
        strAlignDir = '{outdir}/Tmp/Alignment'.format(outdir=self.strOutputDir)
        strAllDir = '{outdir}/Tmp/All'.format(outdir=self.strOutputDir)
        strTargetDir = '{outdir}/Tmp/Target'.format(outdir=self.strOutputDir)

        for strDir in (strAlignDir, strAllDir, strTargetDir):
            self._EnsureDir(strDir)

        with open('{dir}/{file_name}_filtered_indel.txt'.format(dir=strAlignDir, file_name=self.strFileName), 'w') as Filtered, \
                open('{dir}/{file_name}_aligned_BaseEdit.txt'.format(dir=strAlignDir, file_name=self.strFileName), 'w') as Ref_Alt_edit:

            for sBarcode in dResult:
                for lAligned_indel_result in dResult[sBarcode][self.intIndelInfo]:
                    if lAligned_indel_result[2]:
                        Filtered.write('\t'.join(map(str, lAligned_indel_result)) + '\n')

                for lAligned_alt_result in dResult[sBarcode][10]:  # 10 : alt base list
                    if lAligned_alt_result:
                        Ref_Alt_edit.write('\t'.join(map(str, lAligned_alt_result)) + '\n')

        dSelect_base = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

        sTarget_ref = self.listTargetRefAlt[0]
        sTarget_alt = self.listTargetRefAlt[1]
        iTarget_base = dSelect_base[sTarget_alt]

        strFormat = "{sample}\t{bar}\t{ref}\t{NumTot}\t{NumIns}\t{NumDel}\t{NumCom}\t{BaseEditCount}\n"
        strHeader = "Sample\tBarcode\tRef\t# of Total\t# of Insertion\t# of Deletion\t# of Combination\t{refseq}\n"

        for sBarcode, lValue in dResult.items():

            iBarcode_start_pos = self.strRef.index(sBarcode)
            sRef_seq_without_barcode = self.strRef[iBarcode_start_pos + len(sBarcode):]

            # lSum[position] = [A, C, G, T] totals over all indel-free reads.
            lSum = []
            for lBaseEdit in lValue[9]:
                if not lSum:
                    lSum = [[0, 0, 0, 0] for _ in lBaseEdit]
                for j, dBase_count in enumerate(lBaseEdit):
                    for k, iCount in enumerate(dBase_count.values()):
                        lSum[j][k] += iCount

            # One tuple per base (A, C, G, T); list() because zip() returns a
            # non-subscriptable iterator on Python 3.  Empty for a null query.
            lBase_row = list(zip(*lSum))

            ## This Ref has barcode.
            sRef_target = self.strRef[int(self.listTargetWindow[0]) - 1:int(self.listTargetWindow[1])]

            iPAM_end = int(self.listPamPos[1])
            iGuide_start = int(self.listGuidePos[0]) - 1
            iGuide_end = int(self.listGuidePos[1])
            iGuide_len = iGuide_end - iGuide_start
            iBarcode_len = len(sBarcode)

            """
            barcode Guide st,ed
            <----><----------> NGG
            ACGTACGTACGTACGTACGTGGACG
            """
            # Position labels: negative before the guide, 1.. inside the guide,
            # the PAM letters themselves, then 1.. after the PAM.
            iWithout_target_len = iGuide_start - iBarcode_len
            lWithout_target_pos = [-(i + 1) for i in range(iWithout_target_len)][::-1]
            lWith_target_pos = [i + 1 for i in range(iGuide_len)]
            lAfter_PAM_pos = [i + 1 for i in range(len(self.strRef) - iPAM_end + 1)]

            lPos_num = lWithout_target_pos + lWith_target_pos + list(self.strPamSeq) + lAfter_PAM_pos
            ## ['A.-7', 'C.-6', ..., 'A.-1', 'T.1', 'G.2', ...]
            lPos_annotated_ref = [str(i) + '.' + str(j) for i, j in zip(sRef_target, lPos_num)]

            ## ' ' ' ' ' ' A.1 ' ' ... : only the positions whose reference base is sTarget_ref
            lMasked_pos_annotated_ref_target = [
                sBase_pos if sBase_pos.split('.')[0] == sTarget_ref else ' '
                for sBase_pos in lPos_annotated_ref]

            def fnInfoRow(lCount):
                return strFormat.format(sample=self.strFileName, bar=sBarcode, ref=sRef_seq_without_barcode,
                                        NumTot=lValue[self.intNumOfTotal], NumIns=lValue[self.intNumOfIns],
                                        NumDel=lValue[self.intNumOfDel], NumCom=lValue[self.intNumOfCom],
                                        BaseEditCount='\t'.join(map(str, lCount)))

            with open('{dir}/{file_name}_Summary.txt'.format(dir=strAllDir, file_name=self.strFileName), 'w') as Summary, \
                    open('{dir}/{file_name}_{target}_Summary.txt'.format(dir=strTargetDir, file_name=self.strFileName,
                                                                         target=sTarget_ref + 'to' + sTarget_alt), 'w') as Target_summary:

                ## Making a header
                Summary.write(strHeader.format(refseq='\t'.join(lPos_annotated_ref)))
                Target_summary.write(strHeader.format(refseq='\t'.join(lMasked_pos_annotated_ref_target)))

                # Only the first (A) row carries the sample information.
                for i, lBase_count in enumerate(lBase_row):
                    if i == 0:
                        Summary.write(fnInfoRow(lBase_count))
                    else:
                        Summary.write("\t\t\t\t\t\t\t{BaseEditCount}\n".format(BaseEditCount='\t'.join(map(str, lBase_count))))

                if lBase_row:
                    lMasked_target_base_count = [  ## ' ' 20 ' ' 30 ' ' ...
                        ' ' if sMasked_ref == ' ' else iCount
                        for sMasked_ref, iCount in zip(lMasked_pos_annotated_ref_target, lBase_row[iTarget_base])]
                    Target_summary.write(fnInfoRow(lMasked_target_base_count))
                else:
                    print('Null query: %s' % self.strForwPath)
                    ## Null query base count is all zero.
                    Target_summary.write(fnInfoRow(['0'] * len(lPos_annotated_ref)))


def Main():
    """Parse the command line, count base edits of one input file and write the outputs."""
    InstParameter = clsParameter()
    logging.basicConfig(format='%(process)d %(levelname)s %(asctime)s : %(message)s',
                        level=logging.DEBUG,
                        filename=InstParameter.strLogPath,
                        filemode='a')

    # Mirror the log file on stdout.
    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))

    logging.info('Program start : %s' % InstParameter.strFileName)

    InstParser = clsBaseEditParser(InstParameter)
    logging.info('File Open : %s' % InstParameter.strFileName)
    listSequenceForward = InstParser.OpenSequenceFiles()

    logging.info('Calculate base edit frequency : %s' % InstParameter.strFileName)
    dictResultForward = InstParser.CalculateBaseEditFreq(listSequenceForward)

    logging.info('Make output forward : %s' % InstParameter.strFileName)
    InstOutput = clsOutputMaker(InstParameter)
    InstOutput.MakeOutput(dictResultForward)

    logging.info('Program end : %s' % InstParameter.strFileName)
# end: def Main


if __name__ == '__main__':
    Main()
TAGAATATACACG.insertion.fastq 503 | 504 | logging.info('Program start : %s' % InstParameter.strFileName) 505 | 506 | InstParser = clsBaseEditParser(InstParameter) 507 | logging.info('File Open : %s' % InstParameter.strFileName) 508 | listSequenceForward = InstParser.OpenSequenceFiles() 509 | 510 | logging.info('Calculate base edit frequency : %s' % InstParameter.strFileName) 511 | dictResultForward = InstParser.CalculateBaseEditFreq(listSequenceForward) 512 | 513 | logging.info('Make output forward : %s' % InstParameter.strFileName) 514 | InstOutput = clsOutputMaker(InstParameter) 515 | InstOutput.MakeOutput(dictResultForward) 516 | 517 | logging.info('Program end : %s' % InstParameter.strFileName) 518 | # end: def Main 519 | 520 | 521 | if __name__ == '__main__': 522 | Main() 523 | 524 | 525 | -------------------------------------------------------------------------------- /Base_edit_2/Each_base_summary.py: -------------------------------------------------------------------------------- 1 | #!/home/hkimlab/anaconda2/bin/python2.7 2 | 3 | import os, sys 4 | from pdb import set_trace 5 | 6 | try: 7 | strUser = sys.argv[1] 8 | strProject = sys.argv[2] 9 | 10 | except IndexError: 11 | print('\n') 12 | print('usage : ./Each_base_summary.py user_name project_name\n') 13 | print('example : ./Each_base_summary.py SH p53_screening\n') 14 | sys.exit() 15 | 16 | 17 | def Make_target_ref_alt_summary(strSample='', strRef='', strAlt='', strFirstOutput=''): 18 | 19 | """ row 0: header, 1: A and info, 2: C, 3: G, 4: T 20 | Sample Barcode Ref # of Total # of Insertion # of Deletion # of Combination C.-7 T.-6 C.-5 T.-4 G.-3 G.-2 G.-1 G.1 T.2 C.3 A.4 G.5 G.6 G.7 A.8 C.9 A.10 G.11 T.12 G.13 G.14 A.15 C.16 T.17 C.18 G.19 A.20 A.N G.G G.G A.1 G.2 A.3 21 | Doench2014_1000 ACTAGCTATCGCTCA CTCTGGGGTCAGGGACAGTGGACTCGAAGGAGAAGCTTGGCGTAACTAGATCTCTACTCTACCACTTGTACTTCAGCGGTCAGCTTACTCGACTTAA 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 00 0 22 | 0 0 0 0 0 0 0 0 0 0 0 0 0 
def Make_target_ref_alt_summary(strSample='', strRef='', strAlt='', strFirstOutput='',
                                strUserName=None, strProjectName=None):
    """Write an additional <ref>to<alt> summary for one sample.

    Every file in ``<sample dir>/Tmp/All`` is read as a table whose row 0 is
    the header and whose rows 1-4 hold the A, C, G and T counts (the sample
    information is only present on row 1):

        Sample  Barcode  Ref  # of Total  # of Insertion  # of Deletion  # of Combination  C.-7  T.-6 ...
        <info columns of the A row>                                                        0     0    ...
                                                                                           0     0    ...   (C)
                                                                                           0     0    ...   (G)
                                                                                           0     0    ...   (T)

    From each file the row of ``strAlt`` is taken, the columns whose reference
    base is not ``strRef`` are blanked (' '), and the sample information of
    row 1 is copied in.  The rows of ``strFirstOutput`` are then re-emitted in
    order, replaced by the new row where one exists for the same sample name,
    into ``<sample>_<ref>to<alt>_Summary_addition.txt`` in the
    Merge_target_result directory.

    Args:
        strSample: sample directory name below ./Output/<user>/<project>/.
        strRef: reference base of interest ('A', 'C', 'G' or 'T').
        strAlt: alternative base whose count row is extracted.
        strFirstOutput: file name of the first merged summary.
        strUserName: defaults to the module-level ``strUser``.
        strProjectName: defaults to the module-level ``strProject``.
    """
    if strUserName is None:
        strUserName = strUser
    if strProjectName is None:
        strProjectName = strProject

    iInfo_col = 7  # Sample .. # of Combination
    dAlt = {'A': 1, 'C': 2, 'G': 3, 'T': 4}
    iAlt_row = dAlt[strAlt]
    lHeader = []
    llResult = []

    strSampleDir = './Output/{user}/{project}/{sample}'.format(user=strUserName,
                                                               project=strProjectName,
                                                               sample=strSample)
    strMergeTargetDir = os.path.join(strSampleDir, 'Result', 'Merge_target_result')
    strSummaryAllDir = os.path.join(strSampleDir, 'Tmp/All')

    lSummary_file = os.listdir(strSummaryAllDir)

    for sFile in lSummary_file:
        with open(os.path.join(strSummaryAllDir, sFile)) as Input:
            lNone_alt_col = []
            lBaseEdit_Info = []

            for i, sRow in enumerate(Input):
                lCol = sRow.replace('\n', '').split('\t')

                if i == 0:
                    for j in range(iInfo_col, len(lCol)):
                        # Compare the base part only: a substring test also
                        # matched the position label (e.g. 'G' in 'A.G').
                        if lCol[j].split('.')[0] != strRef:
                            lNone_alt_col.append(j)
                            lCol[j] = ' '

                    if not lHeader:
                        lHeader = lCol
                    else:
                        # Merge headers: fill blanks, and insist that labels
                        # present in both files agree.
                        for iHeader_col, (sHeader_current, sHeader_update) in enumerate(
                                zip(lHeader[iInfo_col:], lCol[iInfo_col:])):
                            if sHeader_update == ' ':
                                continue
                            if sHeader_current == ' ':
                                lHeader[iHeader_col + iInfo_col] = sHeader_update
                            elif sHeader_current != sHeader_update:
                                raise AssertionError('Check header %s %s' % (repr(sHeader_current), repr(sHeader_update)))
                    continue

                if i == 1:
                    lBaseEdit_Info = lCol[:iInfo_col]

                # Not 'elif': the A row is row 1 and must also be usable as
                # the alt row (it used to be skipped for strAlt == 'A').
                if i == iAlt_row:
                    for iNon_col in lNone_alt_col:
                        lCol[iNon_col] = ' '
                    lCol[:iInfo_col] = lBaseEdit_Info
                    llResult.append(lCol)

    print('Total_files: %d' % len(lSummary_file))

    """
    A file in the All folder has only a header if it has no counts of
    alternative alleles, so it yields no row here.  Hence the first merged
    summary is walked, and its own row is kept for every sample that has no
    additional output.
    """
    dAdditional_output = {}
    for lResult in llResult:
        dAdditional_output[lResult[0]] = '\t'.join(lResult) + '\n'

    strOutputName = '{sample}_{ref}to{alt}_Summary_addition.txt'.format(sample=strSample, ref=strRef, alt=strAlt)

    with open(os.path.join(strMergeTargetDir, strFirstOutput)) as Fisrt_output, \
            open(os.path.join(strMergeTargetDir, strOutputName), 'w') as Output:

        Output.write('\t'.join(lHeader) + '\n')

        for i, sRow in enumerate(Fisrt_output):
            if i == 0:
                continue  ## header skip
            sSample = sRow.replace('\n', '').split('\t')[0]
            Output.write(dAdditional_output.get(sSample, sRow))


def Main():
    """Run Make_target_ref_alt_summary for every active row of the user's process list.

    Rows of ./User/<user>/Additional_BaseEdit_process_list.tsv are
    ``sample  ref,alt  first_output_file`` (tab- or whitespace-separated);
    rows starting with '#' and blank rows are skipped.
    """
    with open('./User/{user}/Additional_BaseEdit_process_list.tsv'.format(user=strUser)) as Input:
        for sRow in Input:
            if sRow.startswith('#'):
                continue

            lCol = sRow.replace('\n', '').replace('\r', '').split('\t')
            if len(lCol) == 1:
                lCol = lCol[0].split()
            if not lCol:
                continue  # blank row
            print(lCol)

            strSample = lCol[0]
            listRefAlt = lCol[1].split(',')
            strRef = listRefAlt[0]
            strAlt = listRefAlt[1]
            strFirstOutput = lCol[2]

            Make_target_ref_alt_summary(strSample=strSample,
                                        strRef=strRef,
                                        strAlt=strAlt,
                                        strFirstOutput=strFirstOutput)


# Guarded so that importing this module does not start the processing.
if __name__ == '__main__':
    Main()
#!/extdata1/JaeWoo/Tools/Python/miniconda2/bin/python2.7
"""Collect the indels seen in the control samples and count them per barcode."""

import os, sys
from pdb import set_trace

try:
    import cPickle              ## Python 2
except ImportError:
    import pickle as cPickle    ## Python 3 offers the same dump() call

#strSampleFolder = sys.argv[1]

## Control samples processed when the caller does not name any.
listDefaultSampleFolder = ['18K_D0_1', '18K_D0_2', '18K_D0_3']


def _Make_indel_pos_seq(listCol):
    """Return [indel_position + '_' + indel_sequence, ref_seq, query_seq] for one row.

    Column 2 holds the indel such as "['30M3I']", column 4 the reference and
    column 5 the query sequence.  The inserted bases are sliced from the query,
    the deleted bases from the reference.  None is returned when the indel is
    neither an insertion ('I') nor a deletion ('D').
    """
    strIndelPos = listCol[2].replace("['", '').replace("']", '')
    listIndelPos = strIndelPos.split('M')
    intMatch = int(listIndelPos[0])
    strRefseq = listCol[4]
    strQueryseq = listCol[5]

    if 'I' in strIndelPos:      ## insertion
        intInsertion = int(listIndelPos[1].replace('I', ''))
        strIndelSeq = strQueryseq[intMatch:intMatch + intInsertion]
    elif 'D' in strIndelPos:    ## deletion
        intDeletion = int(listIndelPos[1].replace('D', ''))
        strIndelSeq = strRefseq[intMatch:intMatch + intDeletion]
    else:
        return None

    return [strIndelPos + '_' + strIndelSeq, strRefseq, strQueryseq]


def MakeIndelContrDict(listSampleFolder=None, strOutputDir='./Output'):
    """Write '<sample>_IndelSubtarction.txt' for every sample folder.

    Every file in '<strOutputDir>/<sample>/result' whose name contains
    'filtered' is read; each insertion or deletion row becomes one output row
    '<barcode>:<pos_seq>\\t<pos_seq>\\t<ref>\\t<query>'.

    listSampleFolder : sample folder names; defaults to the three 18K_D0 controls
    strOutputDir     : folder that contains the sample folders
    """
    if listSampleFolder is None:
        listSampleFolder = listDefaultSampleFolder

    for strSampleFolder in listSampleFolder:
        strSampleDir = os.path.join(strOutputDir, strSampleFolder)
        strResultDir = os.path.join(strSampleDir, 'result')

        with open(os.path.join(strSampleDir, '%s_IndelSubtarction.txt' % strSampleFolder), 'w') as Output:

            dictSub = {}

            for strFile in os.listdir(strResultDir):
                if 'filtered' not in strFile: continue

                strBarcodeName = strFile.replace('_filtered_indel.txt', '')

                with open(os.path.join(strResultDir, strFile)) as Input:
                    for strRow in Input:
                        if not strRow.strip(): continue     ## a blank row carries no indel
                        listIndelPosSeq = _Make_indel_pos_seq(strRow.replace('\n', '').split('\t'))
                        if listIndelPosSeq is None: continue

                        strKey = strBarcodeName + ':' + listIndelPosSeq[0]
                        dictSub.setdefault(strKey, []).append(listIndelPosSeq)

            for strKey, list2IndelPosSeq in dictSub.items():
                for listIndelPosSeq in list2IndelPosSeq:
                    Output.write('\t'.join([strKey] + listIndelPosSeq) + '\n')


def ConcatContrDict(listSampleFolder=None, strOutputDir='./Output'):
    """Count every '<barcode>:<pos_seq>' key over all samples and pickle the counts.

    Reads the '<sample>_IndelSubtarction.txt' files written by
    MakeIndelContrDict, stores the resulting plain dict in
    '<strOutputDir>/DictSubNoDup.pickle' and also returns it.
    """
    if listSampleFolder is None:
        listSampleFolder = listDefaultSampleFolder

    DictSubNoDup = {}

    for strSampleFolder in listSampleFolder:
        strInputFile = os.path.join(strOutputDir, strSampleFolder, '%s_IndelSubtarction.txt' % strSampleFolder)
        with open(strInputFile) as Input:

            for strRow in Input:
                if not strRow.strip(): continue
                strKey = strRow.replace('\n', '').split('\t')[0]
                DictSubNoDup[strKey] = DictSubNoDup.get(strKey, 0) + 1

    #print(DictSubNoDup)
    with open(os.path.join(strOutputDir, 'DictSubNoDup.pickle'), 'wb') as PickleObj:
        cPickle.dump(DictSubNoDup, PickleObj)

    return DictSubNoDup


def Main():
    #MakeIndelContrDict()
    ConcatContrDict()


if __name__ == '__main__':
    Main()
import os
import sys
from datetime import datetime
from collections import namedtuple as nt
from collections import OrderedDict

## One aligned read of a freq table; freq is filled in once all reads are counted.
Seq_indel = nt('Seq_indel', ['seq', 'indel', 'freq', 'ref_needle', 'query_needle'])

## Written by Make_indel_summary into the folder it reads, so it must never be an input.
sSummary_file = 'Indel_summary.txt'


def Calculate_indel_freq(sOutdir=None):
    """Add the relative frequency of its indel to every read of each freq table.

    Every regular file in '<outdir>/result/freq' is copied to
    '<outdir>/result/freq/freq_result' with the third column replaced by
    (reads with the same indel) / (all reads of the file).  The two leading
    rows (reference and '-----' delimiter) are copied unchanged.  Reads are
    written by descending frequency, ties by indel name.  Rows with fewer
    than five columns are reported on stdout and skipped.

    sOutdir : sample output folder; defaults to the module-level sOutput_dir
    """
    if sOutdir is None:
        sOutdir = sOutput_dir

    sFreq_dir = '{outdir}/result/freq'.format(outdir=sOutdir)
    sResult_dir = '{outdir}/result/freq/freq_result'.format(outdir=sOutdir)

    if not os.path.isdir(sResult_dir): os.mkdir(sResult_dir)

    for sFile in os.listdir(sFreq_dir):
        sInput_path = os.path.join(sFreq_dir, sFile)
        if not os.path.isfile(sInput_path): continue

        with open(sInput_path) as Input_freq,\
             open(os.path.join(sResult_dir, sFile), 'w') as Output_freq:

            sRef = Input_freq.readline()        # first row is ref.
            sDelemiter = Input_freq.readline()  # second row is '-------' delemiter.
            Output_freq.write(sRef + sDelemiter)

            lSeq_indel = []     # [Seq_indel('TGCA', '30M3I', None, ...), ...]
            dFreq_count = {}    # {'30M3I': 2, ...}

            for sRow in Input_freq:
                if sRow == sRef: continue
                if sRow[0] == '-': continue

                lCol = sRow.replace('\n', '').split('\t')
                if len(lCol) < 5:
                    print('%s %s' % (sFile, lCol))
                    continue

                lSeq_indel.append(Seq_indel(lCol[0], lCol[1], None, lCol[3], lCol[4]))
                dFreq_count[lCol[1]] = dFreq_count.get(lCol[1], 0) + 1
            #end: for sRow

            iTotal = len(lSeq_indel)
            lResult = [Row._replace(freq=float(dFreq_count[Row.indel]) / iTotal) for Row in lSeq_indel]

            ## Reads of one indel share one frequency, so this single stable sort equals
            ## sorting by indel first and by descending frequency afterwards.
            lResult.sort(key=lambda x: (-x.freq, x.indel))

            for Row in lResult:
                Output_freq.write('\t'.join(map(str, [Row.seq, Row.indel, Row.freq, Row.ref_needle, Row.query_needle])) + '\n')
        #end: with open
    #end: sFile


def Make_indel_summary(sOutdir=None):
    """Write 'Indel_summary.txt': one row per freq_result table with its indel counts.

    A row is '<file>\\t<read count>\\t<indel>:<count>:<percent>, ...', indels by
    descending count and rows by descending read count.  The summary of an
    earlier run lives in the same folder and is excluded from the inputs;
    rows without an indel column are ignored.

    sOutdir : sample output folder; defaults to the module-level sOutput_dir
    """
    if sOutdir is None:
        sOutdir = sOutput_dir

    sResult_dir = '{outdir}/result/freq/freq_result'.format(outdir=sOutdir)
    lOutput = []

    for sFile in os.listdir(sResult_dir):
        if sFile == sSummary_file: continue
        sInput_path = os.path.join(sResult_dir, sFile)
        if not os.path.isfile(sInput_path): continue

        with open(sInput_path) as Input_freq:

            Input_freq.readline()   # first row is ref.
            Input_freq.readline()   # second row is '-------' delemiter.

            lTable = [sRow.replace('\n', '').split('\t') for sRow in Input_freq]
            lTable = [lCol for lCol in lTable if len(lCol) > 1]
            iTotal = len(lTable)

            dINDEL = OrderedDict()
            for lCol in lTable:
                dINDEL[lCol[1]] = dINDEL.get(lCol[1], 0) + 1

            llINDEL = [[sKey, iValue, round(iValue / float(iTotal), 3) * 100]
                       for sKey, iValue in sorted(dINDEL.items(), key=lambda t: t[1], reverse=True)]
            sINDEL_result = ', '.join(':'.join(map(str, lINDEL)) for lINDEL in llINDEL)

            lOutput.append([sFile, iTotal, sINDEL_result])

    lOutput = sorted(lOutput, key=lambda x: x[1], reverse=True)

    with open(os.path.join(sResult_dir, sSummary_file), 'w') as Output_freq:
        for lCol in lOutput:
            Output_freq.write('\t'.join(map(str, lCol)) + '\n')


if __name__ == '__main__':
    print('%s %s' % ('Indel frequency calculator start: ', datetime.now()))
    Calculate_indel_freq(sys.argv[1])
    Make_indel_summary(sys.argv[1])
    print('%s %s' % ('Indel frequency calculator end: ', datetime.now()))
4 | # ps aux | grep hkim | grep BaseEdit_freq_ver1.0.py | less 5 | 6 | kill -9 $(ps aux | grep hkim | grep Run_BaseEdit_freq.py | awk '{print$2}') 7 | kill -9 $(ps aux | grep hkim | grep BaseEdit_freq_crispresso.py | awk '{print$2}') 8 | -------------------------------------------------------------------------------- /Base_edit_2/MakeUserFolder.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | user=SH 4 | project=24K_screening 5 | 6 | 7 | [ ! -d ./Input ] && { `mkdir ./Input`; } 8 | [ ! -d ./User ] && { `mkdir ./User`; } 9 | [ ! -d ./Output ] && { `mkdir ./Output`; } 10 | 11 | [ ! -d ./Input/${user} ] && { `mkdir ./Input/${user}`; } 12 | [ ! -d ./Input/${user}/Query ] && { `mkdir ./Input/${user}/Query`; } 13 | [ ! -d ./Input/${user}/Query/${project} ] && { `mkdir ./Input/${user}/Query/${project}`; } 14 | [ ! -d ./Input/${user}/Reference ] && { `mkdir ./Input/${user}/Reference`; } 15 | [ ! -d ./Input/${user}/Reference/${project} ] && { `mkdir ./Input/${user}/Reference/${project}`; } 16 | 17 | [ ! -d ./User/${user} ] && { `mkdir ./User/${user}`; } 18 | > ./User/${user}/${project}.txt 19 | -------------------------------------------------------------------------------- /Base_edit_2/Make_user_folder.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | user=JaeWoo 4 | project=JaeWoo_test_samples4 5 | 6 | 7 | [ ! -d ./Input ] && { `mkdir ./Input`; } 8 | [ ! -d ./User ] && { `mkdir ./User`; } 9 | [ ! -d ./Output ] && { `mkdir ./Output`; } 10 | 11 | [ ! -d ./Input/${user} ] && { `mkdir ./Input/${user}`; } 12 | [ ! -d ./Input/${user}/Query ] && { `mkdir ./Input/${user}/Query`; } 13 | [ ! -d ./Input/${user}/Query/${project} ] && { `mkdir ./Input/${user}/Query/${project}`; } 14 | [ ! -d ./Input/${user}/Reference ] && { `mkdir ./Input/${user}/Reference`; } 15 | [ ! 
import os, re, sys, pdb, math, logging

import subprocess as sp

from pdb import set_trace
from datetime import datetime
from optparse import OptionParser

sys.path.insert(0, os.path.dirname(os.getcwd()))
from Core.CoreSystem import InitialFolder, UserFolderAdmin, Helper, RunMulticore, CheckProcessedFiles


class clsBaseEditRunner(UserFolderAdmin):
    """Prepare and run the base-edit frequency analysis of one sample.

    The attributes strUser, strProject, strRefDir, strPython, strOutSampleDir and
    strLogPath as well as MakeSampleFolder() are not defined here; they are
    presumably provided by UserFolderAdmin (verify against Core/CoreSystem.py).
    """

    def __init__(self, strSample, strRef, options, InstInitFolder):
        """Store the command line options and (re)create the sample output folders.

        strSample      : sample name (folder below the project)
        strRef         : reference name handed to UserFolderAdmin
        options        : parsed OptionParser values of Main()
        InstInitFolder : InitialFolder instance; only its strLogPath is used here
        """
        UserFolderAdmin.__init__(self, strSample, strRef, options, InstInitFolder.strLogPath)

        self.strSample = strSample
        self._RemoveTmpBeforStart()
        self.MakeSampleFolder() ## inheritance

        self.strRef = strRef
        self.intCore = options.multicore
        self.strGapOpen = options.gap_open
        self.strGapExtend = options.gap_extend
        self.strTargetWindow = options.target_window
        self.strIndelCheckPos = options.indel_check_pos
        self.strTargetRefAlt = options.target_ref_alt

        self.strBarcodeFile = os.path.join(self.strRefDir, 'Barcode.txt')
        self.strReferenceSeqFile = os.path.join(self.strRefDir, 'Reference.txt')
        self.strRefFile = os.path.join(self.strRefDir, 'Reference.fa')

        self.strPamSeq = options.PAM_seq
        self.strPamPos = options.PAM_pos
        self.strGuidePos = options.Guide_pos

        Helper.MakeFolderIfNot('./Output/{user}/{project}/{sample}/Tmp/Alignment'.format(user=self.strUser,
                                                                                         project=self.strProject,
                                                                                         sample=self.strSample))

    def MakeReference(self):
        """Join Barcode.txt and Reference.txt ('name:sequence' rows) into Reference.fa.

        Reference.fa receives one 'name<TAB>barcode<TAB>reference' row for every
        reference whose name also occurs in the barcode file; a reference
        without a barcode is logged as an error and left out.
        """
        with open(self.strBarcodeFile) as Barcode, \
             open(self.strReferenceSeqFile) as Ref, \
             open(self.strRefFile, 'w') as Output:

            listBarcode = Helper.RemoveNullAndBadKeyword(Barcode)
            listRef = Helper.RemoveNullAndBadKeyword(Ref)

            ## defensive
            assert len(listBarcode) == len(listRef), 'Barcode and Reference must be a same row number.'

            dictBarcode = {}

            for strBarcodeRow in listBarcode:
                strBarcodeRow = strBarcodeRow.replace('\n', '').replace('\r', '').upper()
                Helper.CheckIntegrity(self.strBarcodeFile, strBarcodeRow) ## defensive
                listBarcodeCol = strBarcodeRow.split(':')
                dictBarcode[listBarcodeCol[0]] = listBarcodeCol[1]

            for strRefRow in listRef:
                strRefRow = strRefRow.replace('\n', '').replace('\r', '').upper()
                ## The row comes from Reference.txt, so that is the file to name in the check.
                ## NOTE(review): assumes the first argument is only used to report the file - confirm in Helper.
                Helper.CheckIntegrity(self.strReferenceSeqFile, strRefRow) ## defensive
                listRefCol = strRefRow.split(':')
                strRefSample = listRefCol[0]
                strRefSeq = listRefCol[1]

                try:
                    sBarcode = dictBarcode[strRefSample]
                    Output.write('%s\t%s\t%s\n' % (strRefSample, sBarcode, strRefSeq))
                except KeyError:
                    logging.error('no matching')
                    ## Both values are data, not a format string with one argument.
                    logging.error('%s %s', strRefSample, strRefSeq)

    def MakeIndelSearcherCmd(self):
        """Return one BaseEdit_freq_crispresso.py command line per row of Reference.fa."""
        listCmd = []

        with open(self.strRefFile) as BarcodeRef:

            for strBarcodeRef in BarcodeRef:
                listBarcodeRef = strBarcodeRef.replace('\n', '').replace('\r', '').split('\t')
                strFileName = listBarcodeRef[0]
                strBarcode = listBarcodeRef[1]
                strRef = listBarcodeRef[2]

                self._CheckOptionsCorrect(strBarcode) ## defensive

                strForwardQueryFile = './Input/{user}/Query/{project}/{sample}/{file_name}.txt'.format(user=self.strUser,
                                                                                                       project=self.strProject,
                                                                                                       sample=self.strSample,
                                                                                                       file_name=strFileName)

                strCmd = ('{python} ./BaseEdit_freq_crispresso.py {forw} {GapO} {GapE} {barcode} {ref} {target_window} {indel_check_pos}'
                          ' {target_ref_alt} {outdir} {file_name} {PAM_seq} {PAM_pos} {guide_pos} {log}').format(
                    python=self.strPython, forw=strForwardQueryFile, GapO=self.strGapOpen, GapE=self.strGapExtend,
                    barcode=strBarcode, ref=strRef, target_window=self.strTargetWindow, indel_check_pos=self.strIndelCheckPos,
                    target_ref_alt=self.strTargetRefAlt, outdir=self.strOutSampleDir, file_name=strFileName,
                    PAM_seq=self.strPamSeq, PAM_pos=self.strPamPos, guide_pos=self.strGuidePos, log=self.strLogPath)
                listCmd.append(strCmd)

        return listCmd

    def MakeMergeTarget(self):
        """Run Summary_all_trim.py for this sample."""
        strCmd = '{python} ./Summary_all_trim.py {output} {sample} {ref_alt}'.format(python=self.strPython, output=self.strOutSampleDir,
                                                                                     sample=self.strSample, ref_alt=self.strTargetRefAlt)
        sp.call(strCmd, shell=True)

    def CopyToAllResultFolder(self):
        """Copy every '*_Summary.txt' of the project's merge folders into All_results."""
        ## The command relies on $(find ...) and globbing, so it has to go through a shell.
        sp.call('cp $(find ./Output/{user}/{project}/*/Result/*Merge* -name "*_Summary.txt") ./Output/{user}/{project}/All_results'.format(
            user=self.strUser, project=self.strProject), shell=True)

    def _RemoveTmpBeforStart(self):
        """Delete the output folder of an earlier run of this sample, if there is one."""
        strFolderPath = './Output/{user}/{project}/{sample}'.format(user=self.strUser,
                                                                    project=self.strProject,
                                                                    sample=self.strSample)

        if os.path.isdir(strFolderPath):
            strCmd = 'rm -r %s' % strFolderPath

            Helper.PreventFromRmMistake(strCmd) ## defensive

            logging.info('Delete the %s folder before starting if these were existed.' % self.strSample)
            ## Argument list instead of a shell string: the path is never re-parsed by
            ## a shell, and braces in a name can no longer break a second format() pass.
            sp.call(['rm', '-r', strFolderPath])

    @staticmethod
    def _SplitRange(strRange):
        """Return (start, end) as integers for a 'start-end' option value."""
        listRange = strRange.split('-')
        return int(listRange[0]), int(listRange[1])

    ## defensive
    def _CheckOptionsCorrect(self, strBarcode):
        """Raise Exception when the position options contradict each other or the barcode."""
        intBarcodeLen = len(strBarcode)
        intTargetStart, intTargetEnd = self._SplitRange(self.strTargetWindow)
        intIndelStart, intIndelEnd = self._SplitRange(self.strIndelCheckPos)
        intGuideStart, intGuideEnd = self._SplitRange(self.strGuidePos)
        intPamStart, intPamEnd = self._SplitRange(self.strPamPos)
        intPamLen = len(self.strPamSeq)

        if intBarcodeLen >= intTargetStart:
            strError = 'Target window start position must be larger than barcode length'
            logging.error(strError)
            logging.error('Barcode length: %s, Window start: %s' % (intBarcodeLen, intTargetStart))
            raise Exception(strError)

        if intTargetStart > intGuideStart or intTargetEnd < intGuideEnd:
            strError = 'Target window start, end range must be larger than guide range'
            logging.error(strError)
            logging.error('Target window: %s, Guide window: %s' % (self.strTargetWindow, self.strGuidePos))
            raise Exception(strError)

        if intIndelStart >= intGuideEnd or intIndelEnd >= intGuideEnd:
            strError = 'Guide end position must be larger than Indel position'
            logging.error(strError)
            logging.error('Guide end position: %s, Indel position: %s' % (intGuideEnd, self.strIndelCheckPos))
            raise Exception(strError)

        if intPamStart <= intGuideEnd or intPamEnd <= intGuideEnd:
            strError = 'PAM position must be larger than Guide end pos'
            logging.error(strError)
            logging.error('PAM position: %s, Guide end position: %s, ' % (self.strPamPos, intGuideEnd))
            raise Exception(strError)

        if (intPamEnd - intPamStart + 1) != intPamLen:
            strError = 'PAM size and PAM seq must be same length.'
            logging.error(strError)
            logging.error('PAM pos: %s, PAM seq: %s, ' % (self.strPamPos, self.strPamSeq))
            raise Exception(strError)


def Main():
    """Parse the command line and run the base-edit pipeline for every sample of the project."""
    print('BaseEdit program start: %s' % datetime.now())

    sCmd = ("BaseEdit frequency analyzer\n\n./Run_BaseEdit_freq.py -t 15 -w 16-48 --indel_check_pos 39-40 --target_ref_alt A,T --PAM_seq NGG --PAM_pos 43-45 --Guide_pos 23-42"
            " --gap_open -10 --gap_extend 1\n\n"
            "The sequence position is the one base position (start:1)\n"
            "1: Barcode\n"
            "2: Base target window (end pos = PAM pos +3)\n"
            "3: Indel check pos\n"
            "4: PAM pos\n"
            "5: Guide pos (without PAM)\n\n"
            "TATCTCTATCAGCACACAAGCATGCAATCACCTTGGGTCCAAAGGTCC\n"
            "<------1------><----------------2--------------->\n"
            " <3> <4> \n"
            " <---------5--------> \n\n")

    parser = OptionParser(sCmd)

    parser.add_option("-t", "--thread", default="1", type="int", dest="multicore", help="multiprocessing number")
    parser.add_option('--gap_open', default='-10', type='float', dest='gap_open', help='gap open: -100~0')
    parser.add_option('--gap_extend', default='1', type='float', dest='gap_extend', help='gap extend: 1~100')
    parser.add_option("-w", "--target_window", type="str", dest="target_window", help="a window size for target sequence : 20-48")
    parser.add_option("--indel_check_pos", type="str", dest="indel_check_pos", help="indel check position to filter : 39-40; insertion 39, deletion 39 & 40")
    parser.add_option("--target_ref_alt", type="str", dest="target_ref_alt", help="Ref 'A' is changed to Alt 'T': A,T")
    parser.add_option("--PAM_seq", type="str", dest="PAM_seq", help="PAM sequence: NGG, NGC ...")
    parser.add_option("--PAM_pos", type="str", dest="PAM_pos", help="PAM position range in the reference seqeunce : 43-45")
    parser.add_option("--Guide_pos", type="str", dest="Guide_pos", help="Guide position range in the reference seqeunce : 23-42")
    parser.add_option('--python', dest='python', help='The python path including the CRISPResso2')
    parser.add_option('--user', dest='user_name', help='The user name with no space')
    parser.add_option('--project', dest='project_name', help='The project name with no space')

    options, args = parser.parse_args()

    InstInitFolder = InitialFolder(options.user_name, options.project_name, os.path.basename(__file__))
    InstInitFolder.MakeDefaultFolder()
    InstInitFolder.MakeInputFolder()
    InstInitFolder.MakeOutputFolder()

    logging.basicConfig(format='%(process)d %(levelname)s %(asctime)s : %(message)s',
                        level=logging.DEBUG,
                        filename=InstInitFolder.strLogPath,
                        filemode='a')
    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))

    logging.info('Program start')
    if options.multicore > 15:
        logging.warning('Optimal threads <= 15')
    logging.info(str(options))

    with open(InstInitFolder.strProjectFile) as Sample_list:

        listSamples = Helper.RemoveNullAndBadKeyword(Sample_list)

        strInputProject = './Input/{user}/Query/{project}'.format(user=options.user_name, project=options.project_name)

        @CheckProcessedFiles
        def RunPipeline(**kwargs):

            InstBaseEdit = None

            for strSample in listSamples:
                if strSample[0] == '#': continue

                tupSampleInfo = Helper.SplitSampleInfo(strSample)
                if not tupSampleInfo: continue
                strSample, strRef, strExpCtrl = tupSampleInfo

                InstBaseEdit = clsBaseEditRunner(strSample, strRef, options, InstInitFolder)
                InstBaseEdit.MakeReference()

                listCmd = InstBaseEdit.MakeIndelSearcherCmd()
                RunMulticore(listCmd, options.multicore) ## from CoreSystem.py

                InstBaseEdit.MakeMergeTarget()

            ## The copy collects the summaries of every sample of the project, so one call
            ## after the loop is enough; it is skipped when no sample was processed.
            if InstBaseEdit is not None:
                InstBaseEdit.CopyToAllResultFolder()

        RunPipeline(InstInitFolder=InstInitFolder,
                    strInputProject=strInputProject,
                    listSamples=listSamples,
                    logging=logging)

    print('BaseEdit program end: %s' % datetime.now())


if __name__ == '__main__':
    Main()
-d ./Output/${user}/${project}/Log ] && { `mkdir ./Output/${user}/${project}/Log`; } 30 | 31 | nohup $python ./Run_BaseEdit_freq.py --python $python --user $user --project $project -w $target_window --indel_check_pos $indel_check_pos \ 32 | --target_ref_alt $target_ref_alt --PAM_seq $PAM_seq --PAM_pos $PAM_pos --Guide_pos $Guide_pos \ 33 | --gap_open $gap_open --gap_extend $gap_extend -t $thread > ./Output/${user}/${project}/Log/log.txt 2>&1 & 34 | -------------------------------------------------------------------------------- /Base_edit_2/Run_each_base_summary.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | #################### 4 | ## User parameter ## 5 | ################################### 6 | 7 | user=SH 8 | project=24K_screening 9 | 10 | ################################### 11 | 12 | 13 | 14 | while read python_path;do 15 | python=$python_path 16 | done < ../PythonPath.txt 17 | 18 | nohup $python ./Each_base_summary.py $user $project > ./Output/${user}/${project}/Log/Each_base_summary_log.txt 2>&1 & 19 | -------------------------------------------------------------------------------- /Base_edit_2/Run_sequence_freq.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | #################### 5 | ## User parameter ## 6 | ################################### 7 | 8 | user=SH 9 | project=24K_screening 10 | window=25-34 11 | thread=4 12 | 13 | ################################### 14 | 15 | 16 | 17 | while read python_path;do 18 | python=$python_path 19 | done < ../PythonPath.txt 20 | 21 | nohup $python ./Sequence_freq.py $user $project $window $thread > ./Output/${user}/${project}/Log/Sequence_freq_log.txt 2>&1 & 22 | -------------------------------------------------------------------------------- /Base_edit_2/Sequence_freq.py: -------------------------------------------------------------------------------- 1 | #!/home/hkim/anaconda2/bin/python2.7 2 | 3 
import os,sys
import numpy as np
from collections import Counter
from collections import OrderedDict
import multiprocessing as mp

import logging
from pdb import set_trace
logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S', level=logging.DEBUG)

sys.path.insert(0, os.path.dirname(os.getcwd()))
from Core.CoreSystem import Helper


try:
    strUser = sys.argv[1]
    strProject = sys.argv[2]
    lWindow = sys.argv[3].split('-')
    iWinStart = int(lWindow[0])
    iWinEnd = int(lWindow[1])
    iCore = int(sys.argv[4])


except IndexError:
    print('\nUsage: ./Sequence_freq.py SH 24K_screening 25-33 10\n'
          ' ./Sequence_freq.py user_name project_name window_range thread\n')
    sys.exit()


def _Read_total_wo_indel(Total_readcnt):
    """Return total reads minus indel reads from the first data row of a summary file.

    Column 3 is the read count of the matched barcode, columns 4 to 6 are the
    insertion, deletion and combination counts.  0 is returned when the file
    has no data row.
    """
    for i, sRow in enumerate(Total_readcnt):
        if i == 0: continue     ## header
        lCol = sRow.replace('\n', '').split('\t')
        iTotal_read = int(lCol[3])
        iIndel_read = int(lCol[4]) + int(lCol[5]) + int(lCol[6])
        return iTotal_read - iIndel_read    ## only the second row is the target
    return 0


def _Write_seq_row(Output, sFilename, dSeq_wt_extend, sSeq, iCnt, iTotal, sKind):
    """Write one Seq_freq.txt row; a row that cannot be built is logged and skipped."""
    try:
        if sKind == 'ref_from_input':
            Proportion = iCnt   ## the count is 0 here and is written as is
        else:
            Proportion = round(iCnt / float(iTotal), 4)
        Output.write('\t'.join(map(str, [sFilename, dSeq_wt_extend[sSeq], sSeq, iCnt, iTotal, Proportion, sKind])) + '\n')
    except (KeyError, ZeroDivisionError) as e:
        ## This runs in batch jobs, so report and go on instead of opening a debugger.
        logging.error('Seq_freq row skipped: %s %s %s %s %s', sFilename, sKind, iCnt, iTotal, repr(e))


def Count_seq_freq(lPara):
    """Count the window motifs of every aligned file of a sample into Result/Seq_freq.txt.

    lPara : [sample name, folder with the '*aligned*' files,
             folder with the '<file>_Summary.txt' read counts,
             {file name: [reference from barcode, reference window]} from Make_ref_dict]

    For each aligned file the window iWinStart..iWinEnd (1-based, inclusive)
    of column 5 is counted per row.  Reads that are neither indel reads nor
    listed in the aligned file are counted as reference reads, as are listed
    reads whose window equals the reference window.  Rows are written as
    file, reference with the motif inserted, motif, count, total without
    indels, proportion and kind ('ref_from_result', 'ref_from_input' or 'alt').
    Any error stops the sample and is printed; nothing is raised.
    """
    strSample = lPara[0]
    sFile_path = lPara[1]
    sTotal_readcnt_path = lPara[2]
    dInput_fa = lPara[3]
    print ("Count_seq_freq: ", strSample, sFile_path, sTotal_readcnt_path)

    ## Bound up front so that the error report below cannot fail on an unset name.
    sFilename = ''
    sTotal_readcnt_file = ''

    try:
        with open('./Output/{user}/{project}/{sample}/Result/Seq_freq.txt'.format(user=strUser,
                                                                                  project=strProject,
                                                                                  sample=strSample), 'w') as Output:

            Output.write('Filename\tSeq\tMotif\tCount\tTotal_cnt\tProportion\tSubstitution\n')

            ## A project has many files. The total read count is taken per file.
            for sFile in os.listdir(sFile_path):
                if 'aligned' not in sFile: continue

                sFilename = sFile.replace('_aligned_BaseEdit', '').split('.')[:-1][0]   ## To search filename in other folder.
                sTotal_readcnt_file = sFilename + '_Summary.txt'                        ## holds the total read count.

                with open(sFile_path + '/' + sFile) as aligned_BaseEdit,\
                     open(sTotal_readcnt_path + '/' + sTotal_readcnt_file) as Total_readcnt:

                    iTotal_wo_indel = _Read_total_wo_indel(Total_readcnt)

                    lTarget_seq = []
                    sRef_seq = ''
                    dSeq_wt_extend = {}     ## WT + motif(target sequence) + WT

                    for sRow in aligned_BaseEdit:
                        lCol = sRow.replace('\n', '').split('\t')
                        sQuery_motif = lCol[5][iWinStart - 1: iWinEnd]

                        if sRef_seq == '':  ## Reference is same in the file, so store once.
                            sRef_seq = lCol[0]
                            dSeq_wt_extend[sRef_seq[iWinStart - 1: iWinEnd]] = sRef_seq

                        lRef_seq_with_motif = list(sRef_seq)
                        lRef_seq_with_motif[iWinStart - 1: iWinEnd] = list(sQuery_motif)

                        dSeq_wt_extend[sQuery_motif] = ''.join(lRef_seq_with_motif)
                        lTarget_seq.append(sQuery_motif)

                    sRef_motif = sRef_seq[iWinStart - 1: iWinEnd]
                    dSeq_cnt = Counter(lTarget_seq)

                    ## Reads listed only for a substitution outside of the window are normal reads.
                    iNormal = iTotal_wo_indel - len(lTarget_seq) + dSeq_cnt.pop(sRef_motif, 0)

                    if iNormal > 0:
                        if sRef_motif == '':    ## the aligned file can be empty, so take the reference from the input.
                            sRef_motif = dInput_fa[sFilename][1]    ## [0] : full ref, [1] : target ref
                            dSeq_wt_extend[sRef_motif] = dInput_fa[sFilename][0]
                        _Write_seq_row(Output, sFilename, dSeq_wt_extend, sRef_motif, iNormal, iTotal_wo_indel, 'ref_from_result')

                    elif iNormal == 0:          ## no normal read at all: the reference comes from the input.
                        sRef_motif = dInput_fa[sFilename][1]
                        dSeq_wt_extend[sRef_motif] = dInput_fa[sFilename][0]
                        _Write_seq_row(Output, sFilename, dSeq_wt_extend, sRef_motif, iNormal, iTotal_wo_indel, 'ref_from_input')

                    for sSeq, iCnt in dSeq_cnt.most_common():
                        _Write_seq_row(Output, sFilename, dSeq_wt_extend, sSeq, iCnt, iTotal_wo_indel, 'alt')
                #END: with
            #END: for
        #END: with
    except Exception as e:
        print(e)
        print("Error in the input: ", strSample, sFilename, sTotal_readcnt_file)
#END: def


def Make_ref_dict(strRef):
    """Return {file name: [reference from the barcode on, its iWinStart..iWinEnd window]}.

    Reads ./Input/<user>/Reference/<project>/<strRef>/Reference.fa, whose rows
    are 'file name<TAB>barcode<TAB>reference' (one barcode per file).  Blank
    rows are skipped; ValueError is raised when a barcode is not part of its
    reference.
    """
    dInput_fa = {}

    with open('./Input/{user}/Reference/{project}/{ref}/Reference.fa'.format(user=strUser,
                                                                             project=strProject,
                                                                             ref=strRef)) as Input_ref:
        for sRow in Input_ref:
            if not sRow.strip(): continue
            lCol = sRow.replace('\n', '').split('\t')

            sInputFile = lCol[0]
            sBarcode = lCol[1]
            sInputRef = lCol[2]

            sBarcode_start_ref = sInputRef[sInputRef.index(sBarcode):]
            dInput_fa[sInputFile] = [sBarcode_start_ref, sBarcode_start_ref[iWinStart - 1: iWinEnd]]

    return dInput_fa


def Count_group():

    """
    Filename Seq Count Total_cnt Proportion Substitution
    Doench2014_1000 AGGGACA 13 14 0.9286 ref_from_result
    Doench2014_1000 AG----- 1 14 0.0714 alt
    Doench2014_1001 GGCGCCA 17 26 0.6538 ref_from_result
    Doench2014_1001 GGTGCCA 5 26 0.1923 alt
    Doench2014_1001 GGAGCCA 2 26 0.0769 alt
    Doench2014_1001 GGCGCTA 1 26 0.0385 alt
    """

    sHeader = ''
    dTotal_cnt = {}

    ## Make dictionary to sum the total reads count of the group. The total reads count is always same in their group.
188 | with open('Group_list.txt') as Group_list: 189 | for sGroupname in Group_list: 190 | if sGroupname[0] == "#": continue 191 | sGroupname = sGroupname.replace('\n', '').strip() 192 | if not os.path.isdir('./Output/Group_result'): os.mkdir('./Output/Group_result') 193 | 194 | for sFile in os.listdir('./Output'): 195 | if sGroupname in sFile: ## matched group names -> Sum the counts 196 | with open('./Output/%s/Summary/Seq_freq.txt' % sFile) as SeqFreq: 197 | 198 | sHeader = SeqFreq.readline() 199 | dSelect_one_total_cnt = {} 200 | for sRow in SeqFreq: 201 | lCol = sRow.replace('\n', '').split('\t') 202 | sFilename = lCol[0] 203 | try: 204 | iTotal_read_cnt = int(lCol[4]) 205 | except IndexError: 206 | set_trace() 207 | dSelect_one_total_cnt[sFilename] = iTotal_read_cnt 208 | 209 | for sFilename, iTotal_read_cnt in dSelect_one_total_cnt.items(): 210 | try: 211 | dTotal_cnt[sGroupname + '_' + sFilename] += iTotal_read_cnt 212 | except KeyError: 213 | dTotal_cnt[sGroupname + '_' + sFilename] = iTotal_read_cnt 214 | 215 | with open('Group_list.txt') as Group_list: 216 | for sGroupname in Group_list: 217 | if sGroupname[0] == "#": continue 218 | sGroupname = sGroupname.replace('\n', '').strip() 219 | dSeq_freq = OrderedDict() ## ('GECKO_6367_GATCTGCTC', ['GECKO_6367', 'GATCTGCTC', 2, 156, '0.0128']), 220 | ## Unique key, only one list. 
221 | if not os.path.isdir('./Output/Group_result'): os.mkdir('./Output/Group_result') 222 | 223 | for sFile in os.listdir('./Output'): 224 | if sGroupname in sFile: ## matched group names -> Sum the counts 225 | with open('./Output/%s/Summary/Seq_freq.txt' % sFile) as SeqFreq: 226 | 227 | sHeader = SeqFreq.readline() 228 | 229 | for sRow in SeqFreq: 230 | lCol = sRow.replace('\n', '').split('\t') 231 | sFilename = lCol[0] 232 | sSeq_wt_extend = lCol[1] 233 | sFile_seq = lCol[0] + '_' + lCol[2] ## Unique name : Doench2014_1000_CTCTGGGGT 234 | iCount = int(lCol[3]) 235 | iTotal_read_cnt = dTotal_cnt[sGroupname + '_' + sFilename] 236 | 237 | lCol[3] = iCount 238 | lCol[4] = iTotal_read_cnt 239 | 240 | try: 241 | _ = dSeq_freq[sFile_seq] 242 | 243 | dSeq_freq[sFile_seq][3] += iCount 244 | #dSeq_freq[sFile_seq][4] = iTotal_read_cnt 245 | 246 | except KeyError: 247 | dSeq_freq[sFile_seq] = lCol ## initial assignment 248 | 249 | ## x[0] : key, x[1] : value, int(x[1][5]) : proportion, x[1][6]: alt, wt category, x[1][0]: filename, 250 | llSeq_freq = sorted(sorted(dSeq_freq.items(), key=lambda x:x[1][6], reverse=True), key=lambda x:x[1][0]) 251 | if not os.path.isdir('./Output/Group_result/%s' % sGroupname): os.mkdir('./Output/Group_result/%s' % sGroupname) 252 | with open('./Output/Group_result/%s/Seq_freq.txt' % sGroupname, 'w') as Output: 253 | 254 | Output.write(sHeader) 255 | 256 | for sFile_seq, lCol in llSeq_freq: 257 | try: 258 | try: 259 | lCol[5] = round(float(lCol[3])/lCol[4], 4) ## proportion calculation, previous proportion is not correct. 260 | except ZeroDivisionError: 261 | lCol[5] = 0 262 | except Exception: 263 | set_trace() 264 | Output.write('\t'.join(map(str, lCol)).replace('ref_from_result', 'wt').replace('ref_from_input', 'wt')+'\n') 265 | #END: for 266 | #END: with 267 | 268 | 269 | def Trim_data(): 270 | 271 | """ 272 | Remove gap seqs (e.g. 
AC---) 273 | """ 274 | with open('Group_list.txt') as Group_list: 275 | for sGroupname in Group_list: 276 | if sGroupname[0] == "#": continue 277 | sGroupname = sGroupname.replace('\n', '').strip() 278 | dSeq_freq = OrderedDict() 279 | 280 | with open('./Output/Group_result/%s/Seq_freq.txt' % sGroupname) as Group_result,\ 281 | open('./Output/Group_result/%s/Trimmed_seq_freq.txt' % sGroupname, 'w') as Trimmed_result: 282 | 283 | sHeader = '' 284 | 285 | for i, sRow in enumerate(Group_result): 286 | 287 | if i == 0: 288 | sHeader = sRow 289 | continue 290 | 291 | lCol = sRow.replace('\n', '').split('\t') 292 | sFilename = lCol[0] ## Doench2014_1000 293 | 294 | try: 295 | dSeq_freq[sFilename].append(lCol) 296 | except KeyError: 297 | dSeq_freq[sFilename] = [lCol] 298 | 299 | for sFilename in dSeq_freq: 300 | llFilename = dSeq_freq[sFilename] ## [[Doench2014_1000,ACAGCAGCGAAC...,ACGCATC, 12,30,0.4][],[]... 301 | ## A Same file name chunk in the group file. 302 | iRecal_total = 0 ## sub the gap seq cnt 303 | #lDele_key = [] 304 | llPre_recal_total = [] 305 | llRecal_total = [] 306 | 307 | for i, lFilename in enumerate(llFilename): 308 | sMotif = lFilename[2] 309 | iMotif_cnt = int(lFilename[3]) 310 | iTotal_read_cnt = int(lFilename[4]) 311 | 312 | if lFilename[6] == 'wt': 313 | iRecal_total = iTotal_read_cnt 314 | llPre_recal_total.append(lFilename) 315 | 316 | elif '-' in sMotif: 317 | iRecal_total -= iMotif_cnt 318 | continue 319 | else: 320 | llPre_recal_total.append(lFilename) ## store AC----- row key 321 | 322 | for lPre_recal_total in llPre_recal_total: 323 | lPre_recal_total[4] = iRecal_total 324 | try: 325 | lPre_recal_total[5] = round(float(lPre_recal_total[3])/iRecal_total,4) ## recal proportion because of sub. 
def Main():
    """Build one work item per sample and run Count_seq_freq on a process pool.

    Reads ``./User/<strUser>/<strProject>.txt`` (module-level ``strUser`` /
    ``strProject``), skips comment rows (leading ``#``) and blank rows, and
    for each remaining sample queues
    ``[sample, alignment dir, total-read-count dir, reference dict]``.
    The items are mapped over ``Count_seq_freq`` with ``iCore`` workers.

    Count_group() and Trim_data() are not invoked from here.
    """
    logging.info('Program Start')

    logging.info('Make commands for a multiple processing')
    lPara = []
    with open('./User/{user}/{project}.txt'.format(user=strUser, project=strProject)) as Project_list:

        for strSample in Project_list:
            ## A blank row would otherwise reach SplitSampleInfo and raise.
            if strSample.startswith('#') or not strSample.strip():
                continue

            tupSampleInfo = Helper.SplitSampleInfo(strSample)
            if not tupSampleInfo:
                continue
            strSample, strRef, strExpCtrl = tupSampleInfo

            strSample = strSample.replace('\n', '').replace('\r', '')
            sFile_path = './Output/{user}/{project}/{sample}/Tmp/Alignment'.format(user=strUser,
                                                                                   project=strProject,
                                                                                   sample=strSample)
            sTotal_readcnt_path = './Output/{user}/{project}/{sample}/Tmp/All'.format(user=strUser,
                                                                                     project=strProject,
                                                                                     sample=strSample)
            dInput_fa = Make_ref_dict(strRef)

            lPara.append([strSample, sFile_path, sTotal_readcnt_path, dInput_fa])

    logging.info('Multiple processing Start')
    p = mp.Pool(iCore)
    try:
        p.map_async(Count_seq_freq, lPara).get()
    finally:
        ## Release the worker processes even when a task raised.
        p.close()
        p.join()
    logging.info('Multiple processing End')

    logging.info('Program End')
def Add_info_result(strSub, strTotal, strFreq, strResult):
    """Re-base a Seq_freq table on D0-corrected totals and write it out.

    Args:
        strSub: path of the D0 indel list (no header). Column 0 is the file
            name and column 3 the count; counts are summed per file name.
        strTotal: path of the summation table (one header row). Column 1 is
            the file name, column 4 the total read count and columns 5-7
            the insertion, deletion and combination counts.
        strFreq: path of the Seq_freq table (one header row). Column 0 is
            the file name, 3 the count, 4 the total, 5 the proportion and
            6 the category; the first row of each file is treated as the
            wild-type row.
        strResult: path of the output file (overwritten).

    For every file name in ``strFreq`` (in input order) the rows are written
    with the count/total/proportion re-computed against
    ``total - D0 indel count``, followed by two extra columns
    (``Total(D0)``, ``D0_indel``) and one closing ``Indel`` row carrying the
    insertion + deletion + combination sum.  A message is printed when the
    re-computed counts of a file do not add up to its corrected total.

    Raises:
        KeyError: a file name of ``strFreq`` is missing from ``strTotal``.
    """
    from collections import OrderedDict  ## local import: not among the module's imports

    with open(strSub) as Sub,\
            open(strTotal) as Total,\
            open(strFreq) as Freq,\
            open(strResult, 'w') as Result:

        ## The same file name can occur with several indel patterns
        ## (e.g. Doench2016_1948:39M1D_G and Doench2016_1948:39M1I_G),
        ## so the counts are merged by file name.
        dictSubCnt = {}
        for strRow in Sub:
            if not strRow.strip():
                continue
            listCol = strRow.replace('\n', '').split('\t')
            strFile = listCol[0]
            dictSubCnt[strFile] = dictSubCnt.get(strFile, 0) + int(listCol[3])

        dictTotalAppliedSub = {}
        dictIndelSum = {}
        for i, strRow in enumerate(Total):
            if i == 0 or not strRow.strip():
                continue  ## header or blank row
            listCol = strRow.replace('\n', '').split('\t')
            strFile = listCol[1]
            intTotal = int(listCol[4])
            intIns = int(listCol[5])
            intDel = int(listCol[6])
            intCom = int(listCol[7])

            ## The total count is not subtracted by the D0 indel yet, so apply it.
            dictTotalAppliedSub[strFile] = intTotal - dictSubCnt.get(strFile, 0)
            dictIndelSum[strFile] = intIns + intDel + intCom

        ## {'GECKO_346': [[Filename, Seq, Motif, Count, Total_cnt, Proportion, Substitution], ...]}
        ## Ordered so that the output follows the input order.
        dictFreq = OrderedDict()
        strHeader = ''
        for i, strRow in enumerate(Freq):
            if i == 0:
                strHeader = strRow
                continue
            if not strRow.strip():
                continue
            listCol = strRow.replace('\n', '').split('\t')
            listCol[3] = int(listCol[3])
            listCol[4] = int(listCol[4])
            listCol[5] = float(listCol[5])
            dictFreq.setdefault(listCol[0], []).append(listCol)

        Result.write(strHeader.replace('\n', '') + '\tTotal(D0)\tD0_indel\n')

        for strFile, list2Col in dictFreq.items():

            ## Keep the first row in place, sort the rest by proportion.
            list2Col[1:] = sorted(list2Col[1:], key=lambda x: x[5], reverse=True)
            intAltAllCnt = sum(listAlt[3] for listAlt in list2Col[1:])

            intTotalAppliedSub = dictTotalAppliedSub[strFile]
            intIndelSum = dictIndelSum[strFile]  ## intIns + intDel + intCom

            ## for validation
            listCountCheck = []
            intTotalCheck = 0

            for i, listCol in enumerate(list2Col):
                ## The Seq_freq total has the indel sum removed, so add it back.
                intTotalD0 = listCol[4] + intIndelSum
                intD0IndelCount = intTotalD0 - intTotalAppliedSub

                if listCol[6] == 'wt':
                    ## wt count = corrected total - alt counts - indel sum
                    intModiCount = intTotalAppliedSub - intAltAllCnt - intIndelSum
                    if intModiCount < 0:
                        print('minus value error, this integer is positive.')
                    listCol[3] = intModiCount

                listCountCheck.append(listCol[3])

                listCol[4] = intTotalAppliedSub
                try:
                    listCol[5] = round(float(listCol[3]) / listCol[4], 4)
                except ZeroDivisionError:
                    listCol[5] = 0

                Result.write('\t'.join(map(str, listCol + [intTotalD0, intD0IndelCount])) + '\n')

                if i == 0:
                    listCountCheck.append(intIndelSum)
                    intTotalCheck = listCol[4]

            ## Closing row of the file: only name, indel sum and category are filled.
            listResultCol = len(listCol) * ['~'] + ['~', '~']
            listResultCol[0] = strFile
            listResultCol[6] = 'Indel'
            listResultCol[3] = intIndelSum

            Result.write('\t'.join(map(str, listResultCol)) + '\n')

            intCountCheckTotal = sum(listCountCheck)
            if intCountCheckTotal != intTotalCheck:
                print('Count total is diffrent. result:%s, file:%s, CountCheckTotal:%s, TotalCheck:%s' % (strResult, strFile, intCountCheckTotal, intTotalCheck))
def Sum_all_freq():
    """Sum the per-file base-substitution tables of one project.

    Reads every file in ``./Output/<sProject>/Summary/All`` (``sProject`` is
    a module-level name defined outside this function).  Each file has one
    header row followed by data rows, e.g.::

        Sample  Barcode  Ref  # of Total  # of Insertion  # of Deletion  # of Combination  T.-7  A.-6  G.-5
        Doench2014_1001  ATACATAGCTACATG  CAGCGG...  60  0  0  0  0  0  0

    Columns 3-6 of the first data row are added to a running total/indel
    vector; columns 7 onwards of all data rows of a file are added to a
    running 4-row matrix, so each file is expected to hold four data rows.
    The sums are printed and written to
    ``./Output/<sProject>/Summary/Alt_freq.txt`` with the rows labelled
    A, C, G, T in that order.
    """
    sFile_path = './Output/%s/Summary/All' % sProject
    sHeader = ''

    ## None means "not sized yet"; an ndarray must not be compared with []
    ## because that is an element-wise comparison, not an emptiness test.
    lSum_total_and_indel_data = None
    lSum_target_data = None

    for sFile in sorted(os.listdir(sFile_path)):
        with open(sFile_path + '/' + sFile) as Input:
            lSum_target = []

            for i, sRow in enumerate(Input):
                if i == 0:
                    sHeader = sRow
                    continue
                if not sRow.strip():
                    continue  ## blank row, e.g. a trailing empty line

                lCol = sRow.replace('\n', '').split('\t')

                if i == 1:  ## This data is in the second row
                    lTotal_and_indel_col = [int(sCol) for sCol in lCol[3:7]]
                    if lSum_total_and_indel_data is None:
                        lSum_total_and_indel_data = np.zeros((len(lTotal_and_indel_col)), int)
                    lSum_total_and_indel_data += lTotal_and_indel_col

                lTarget_col = [int(sCol) for sCol in lCol[7:]]
                if lSum_target_data is None:
                    lSum_target_data = np.zeros((4, len(lTarget_col)), int)

                lSum_target.append(lTarget_col)

            if lSum_target:
                lSum_target_data += lSum_target

    ## Nothing was read: fall back to empty sequences so the output is header-only.
    if lSum_total_and_indel_data is None:
        lSum_total_and_indel_data = []
    if lSum_target_data is None:
        lSum_target_data = []

    print(lSum_target_data)

    with open('./Output/%s/Summary/Alt_freq.txt' % sProject, 'w') as Output:

        lHeader = sHeader.split('\t')
        ## 'T.-7' -> '-7': keep only the part after the dot.
        lHeader[7:] = [sCol.split('.')[1] for sCol in lHeader[7:]]
        Output.write('Alt_base\t' + '\t'.join(lHeader[3:]))

        for cnt, (sBase, lSum) in enumerate(zip(['A','C','G','T'], lSum_target_data)):
            if cnt == 0:
                ## Only the first row carries the total/indel sums.
                Output.write(sBase + '\t' + '\t'.join(map(str, lSum_total_and_indel_data)) + '\t' + '\t'.join(map(str, lSum)) + '\n')
            else:
                Output.write(sBase + '\t\t\t\t\t' + '\t'.join(map(str, lSum)) + '\n')
class Helper(object):
    """Stateless helper functions shared by the pipeline scripts."""

    @staticmethod
    def MakeFolderIfNot(strDir):
        """Create ``strDir`` (with missing parents) unless it is already a directory."""
        if not os.path.isdir(strDir):
            os.makedirs(strDir)

    @staticmethod
    def RemoveNullAndBadKeyword(Sample_list):
        """Return the rows of the open file ``Sample_list`` without empty rows.

        A row is dropped only when it is exactly an empty string, a bare
        line ending, or a pair of quote characters.
        """
        tupBadRows = ("''", '', '""', '\n', '\r', '\r\n')
        return [strRow for strRow in Sample_list.readlines() if strRow not in tupBadRows]

    @staticmethod
    def _ListVisibleEntries(strDir):
        """Return the sorted non-hidden entry names of ``strDir``.

        Replaces a string-built ``ls`` shell call; names starting with a
        dot are skipped, as plain ``ls`` does.  Raises OSError when the
        directory cannot be listed.
        """
        return sorted(strName for strName in os.listdir(strDir) if not strName.startswith('.'))

    @staticmethod  ## defensive
    def CheckSameNum(strInputProject, listSamples):
        """Compare the project-list rows with the entries of the input folder.

        Raises AssertionError when the check fails.
        """
        listProjectNumInInput = Helper._ListVisibleEntries(strInputProject)

        setSamples = set(listSamples)
        setProjectNumInInput = set(listProjectNumInInput)

        intProjectNumInTxt = len(listSamples)
        intProjectNumInInput = len(listProjectNumInInput)

        ## NOTE(review): this compares the number of rows with the number of
        ## distinct rows that are NOT folder entries, so it only fails when the
        ## list has duplicate rows or a row equals a folder entry verbatim.
        ## A membership check was presumably intended; the comparison is kept
        ## as it is because callers currently rely on it passing - confirm
        ## before changing it.
        if intProjectNumInTxt != len(setSamples - setProjectNumInInput):
            logging.warning('The number of samples in the input folder and in the project list does not matched.')
            logging.warning('Input folder: %s, Project list samples: %s' % (intProjectNumInInput, intProjectNumInTxt))
            raise AssertionError
        else:
            logging.info('The file list is correct, pass\n')

    @staticmethod  ## defensive
    def CheckAllDone(strOutputProject, listSamples):
        """Log whether the output folder holds one entry per sample.

        The entries ``All_results`` and ``Log`` are not counted.  Only logs;
        never raises because of a mismatch.
        """
        intProjectNumInOutput = len([strName for strName in Helper._ListVisibleEntries(strOutputProject)
                                     if strName not in ['All_results', 'Log', '']])

        if intProjectNumInOutput != len(listSamples):
            logging.warning('The number of samples in the output folder and in the project list does not matched.')
            logging.warning('Output folder: %s, Project list samples: %s\n' % (intProjectNumInOutput, len(listSamples)))
        else:
            logging.info('All output folders have been created.\n')

    @staticmethod
    def SplitSampleInfo(strSample):
        """Split one project-list row into ``(sample, reference, group)``.

        Line endings and spaces are removed before splitting on tabs.

        Returns:
            False for a comment row (leading ``#``); otherwise a 3-tuple.
            The group is ``''`` for a two-column row and upper-cased for a
            three-column row.

        Raises:
            Exception: the row has neither two nor three columns.
        """
        if strSample.startswith('#'):
            return False
        logging.info('Processing sample : %s' % strSample)
        lSampleRef = strSample.replace('\n', '').replace('\r', '').replace(' ', '').split('\t')

        if len(lSampleRef) == 2:
            return (lSampleRef[0], lSampleRef[1], '')

        if len(lSampleRef) == 3:
            return (lSampleRef[0], lSampleRef[1], lSampleRef[2].upper())

        logging.error('Confirm the file format is correct. -> Sample name\tReference name\tGroup')
        logging.error('Sample list input : %s\n' % lSampleRef)
        raise Exception('Unexpected number of columns in the sample list row: %s' % lSampleRef)

    @staticmethod
    def CheckIntegrity(strBarcodeFile, strSeq):  ## defensive
        """Exit with status 1 unless ``strSeq`` consists of A, C, G, T, N only.

        When ``strSeq`` contains ``:`` only the part after the first colon
        (up to the next one) is checked.  An empty sequence passes.
        """
        if ':' in strSeq:
            strSeq = strSeq.split(':')[1]

        ## A character class must not contain '|': '[A|C|G|T|N]' also accepts
        ## a literal pipe character.
        if not re.match(r'[ACGTN]*\Z', strSeq):
            logging.error('This sequence is not suitable, check A,C,G,T,N are used only : %s' % strBarcodeFile)
            sys.exit(1)

    @staticmethod
    def PreventFromRmMistake(strCmd):
        """Raise Exception when ``strCmd`` looks like a dangerous ``rm -r`` call."""
        rec = re.compile(r'rm.+-rf*.+(\.$|\/$|\*$|User$|Input$|Output$)')  ## This reg can prevent . / * ./User User ...
        if rec.search(strCmd):
            raise Exception('%s is critical mistake! never do like this.' % strCmd)
class UserFolderAdmin(object):
    """Per-sample folder settings.

    InitialFolder is used outside the sample loop and UserFolderAdmin inside
    it, so the two classes are kept apart.
    """

    def __init__(self, strSample, strRef, options, strLogPath):
        """Store the sample identity and copy the run options.

        ``options`` must provide ``user_name``, ``project_name``,
        ``multicore``, ``gap_open``, ``gap_extend`` and ``python``.
        The three directory attributes stay empty until MakeSampleFolder().
        """
        self.strSample = strSample
        self.strRef = strRef
        self.strLogPath = strLogPath

        self.strUser = options.user_name
        self.strProject = options.project_name

        self.intCore = options.multicore
        ## CRISPresso aligner options
        self.strGapOpen = options.gap_open
        self.strGapExtend = options.gap_extend
        self.strPython = options.python

        self.strOutProjectDir = self.strOutSampleDir = self.strRefDir = ''

    def MakeSampleFolder(self):
        """Create the output folders of this sample and set the directory attributes."""
        ## e.g. './Output/Jaewoo/Test_samples'
        strProjectDir = './Output/{user}/{project}'.format(user=self.strUser, project=self.strProject)
        self.strOutProjectDir = strProjectDir

        ## e.g. './Output/Jaewoo/Test_samples/Sample_1'
        strSampleDir = os.path.join(strProjectDir, self.strSample)
        self.strOutSampleDir = strSampleDir
        Helper.MakeFolderIfNot(strSampleDir)

        ## <sample>/Tmp, <sample>/Tmp/Pickle, <sample>/Result
        for strSubDir in ('Tmp', 'Tmp/Pickle', 'Result'):
            Helper.MakeFolderIfNot(os.path.join(strSampleDir, strSubDir))

        ## <project>/All_results
        Helper.MakeFolderIfNot(os.path.join(strProjectDir, 'All_results'))

        self.strRefDir = './Input/{user}/Reference/{project}/{ref}'.format(user=self.strUser,
                                                                           project=self.strProject,
                                                                           ref=self.strRef)
class CoreGotoh(object):
    """Thin wrapper around the CRISPResso2 global aligner."""

    def __init__(self, strEDNAFULL='', floOg='', floOe=''):
        """Load the substitution matrix file and keep the two gap penalties."""
        self.npAlnMatrix = CRISPResso2Align.read_matrix(strEDNAFULL)
        self.floOg = floOg
        self.floOe = floOe

    def GapIncentive(self, strRefSeqAfterBarcode):
        """Return an all-zero integer array of length ``len(reference) + 1``.

        Zero everywhere means no gap incentive is applied.
        """
        intAmpLen = len(strRefSeqAfterBarcode)
        ## The builtin int is what the removed alias np.int referred to.
        npGapIncentive = np.zeros(intAmpLen + 1, dtype=int)
        return npGapIncentive

    def RunCRISPResso2(self, strQuerySeqAfterBarcode, strRefSeqAfterBarcode, npGapIncentive):
        """Align the upper-cased query against the upper-cased reference.

        Returns whatever ``CRISPResso2Align.global_align`` returns.
        """
        listResult = CRISPResso2Align.global_align(strQuerySeqAfterBarcode.upper(), strRefSeqAfterBarcode.upper(),
                                                   matrix=self.npAlnMatrix, gap_open=self.floOg, gap_extend=self.floOe,
                                                   gap_incentive=npGapIncentive)
        return listResult


def CheckProcessedFiles(Func):
    """Decorator running the folder checks before and after ``Func``.

    The wrapped function is called with keyword arguments only and must
    receive ``InstInitFolder``, ``strInputProject``, ``listSamples`` and
    ``logging``.  Helper.CheckSameNum runs first, Helper.CheckAllDone runs
    after ``Func``, and the result of ``Func`` is returned.
    """
    import functools  ## local import: not among the module's top-level imports

    @functools.wraps(Func)
    def Wrapped_func(**kwargs):

        InstInitFolder = kwargs['InstInitFolder']
        strInputProject = kwargs['strInputProject']
        listSamples = kwargs['listSamples']
        logging = kwargs['logging']  ## the caller's logger, shadows the module here

        logging.info('File num check: input folder and project list')
        Helper.CheckSameNum(strInputProject, listSamples)

        varResult = Func(**kwargs)

        logging.info('Check that all folder are well created.')
        Helper.CheckAllDone(InstInitFolder.strOutputProjectDir, listSamples)

        return varResult

    return Wrapped_func


def AttachSeqToIndel(strSample, strBarcodeName, strIndelPos,
                     strRefseq, strQueryseq, dictSub):
    """Count one indel, keyed by its position pattern plus its sequence.

    ``strIndelPos`` has the form ``<match length>M<indel length>I`` or
    ``...D`` (e.g. ``39M1D``).  The inserted bases are taken from the query,
    the deleted bases from the reference, both starting right after the
    matched prefix.  The counter is stored in place at
    ``dictSub[sample][barcode]['<barcode>:<pattern>_<bases>']['IndelCount']``;
    missing sample or barcode levels are created.

    Raises:
        Exception: ``strIndelPos`` contains neither ``I`` nor ``D``.
    """
    listIndelPos = strIndelPos.split('M')
    intMatch = int(listIndelPos[0])

    if 'I' in strIndelPos:
        intInsertion = int(listIndelPos[1].replace('I', ''))
        strInDelSeq = strQueryseq[intMatch:intMatch + intInsertion]

    elif 'D' in strIndelPos:
        intDeletion = int(listIndelPos[1].replace('D', ''))
        strInDelSeq = strRefseq[intMatch:intMatch + intDeletion]

    else:
        logging.info('strIndelClass is included I or D. This variable is %s' % strIndelPos)
        raise Exception('Indel pattern without I or D: %s' % strIndelPos)

    strInDelPosSeq = strIndelPos + '_' + strInDelSeq
    strKey = strBarcodeName + ':' + strInDelPosSeq

    dictBarcode = dictSub.setdefault(strSample, {}).setdefault(strBarcodeName, {})

    try:
        dictBarcode[strKey]['IndelCount'] += 1
    except KeyError:
        dictBarcode[strKey] = {'IndelCount': 1}


def RunProgram(sCmd):
    """Run one command line through the shell and wait for it."""
    sp.call(sCmd, shell=True)


def RunMulticore(lCmd, iCore):
    """Print every command, then run them on a pool of ``iCore`` processes."""
    for sCmd in lCmd:
        print(sCmd)

    p = mp.Pool(iCore)
    try:
        p.map_async(RunProgram, lCmd).get()
    finally:
        ## Release the worker processes even when a task raised.
        p.close()
        p.join()
-------------------------------------------------------------------------------- 1 | # 2 | # This matrix was created by Todd Lowe 12/10/92 3 | # 4 | # Uses ambiguous nucleotide codes, probabilities rounded to 5 | # nearest integer 6 | # 7 | # Lowest score = -4, Highest score = 5 8 | # 9 | A T G C S W R Y K M B V H D N U 10 | A 5 -4 -4 -4 -4 1 1 -4 -4 1 -4 -1 -1 -1 -2 -4 11 | T -4 5 -4 -4 -4 1 -4 1 1 -4 -1 -4 -1 -1 -2 5 12 | G -4 -4 5 -4 1 -4 1 -4 1 -4 -1 -1 -4 -1 -2 -4 13 | C -4 -4 -4 5 1 -4 -4 1 -4 1 -1 -1 -1 -4 -2 -4 14 | S -4 -4 1 1 -1 -4 -2 -2 -2 -2 -1 -1 -3 -3 -1 -4 15 | W 1 1 -4 -4 -4 -1 -2 -2 -2 -2 -3 -3 -1 -1 -1 1 16 | R 1 -4 1 -4 -2 -2 -1 -4 -2 -2 -3 -1 -3 -1 -1 -4 17 | Y -4 1 -4 1 -2 -2 -4 -1 -2 -2 -1 -3 -1 -3 -1 1 18 | K -4 1 1 -4 -2 -2 -2 -2 -1 -4 -1 -3 -3 -1 -1 1 19 | M 1 -4 -4 1 -2 -2 -2 -2 -4 -1 -3 -1 -1 -3 -1 -4 20 | B -4 -1 -1 -1 -1 -3 -3 -1 -1 -3 -1 -2 -2 -2 -1 -1 21 | V -1 -4 -1 -1 -1 -3 -1 -3 -3 -1 -2 -1 -2 -2 -1 -4 22 | H -1 -1 -4 -1 -3 -1 -3 -1 -3 -1 -2 -2 -1 -2 -1 -1 23 | D -1 -1 -1 -4 -3 -1 -1 -3 -1 -3 -2 -2 -2 -1 -1 -1 24 | N -2 -2 -2 -2 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -2 25 | U -4 5 -4 -4 -4 1 -4 1 1 -4 -1 -4 -1 -1 -2 5 26 | -------------------------------------------------------------------------------- /Indel_searcher_2/BaseEdit_input_converter.py: -------------------------------------------------------------------------------- 1 | import os,sys 2 | from pdb import set_trace 3 | 4 | import multiprocessing as mp 5 | 6 | sys.path.insert(0, os.path.dirname(os.getcwd())) 7 | from Core.CoreSystem import Helper 8 | 9 | strUser = sys.argv[1] 10 | strProject = sys.argv[2] 11 | 12 | print('Usage : python ./BaseEdit_input_converter.py user_name project_name') 13 | print('Usage : python ./BaseEdit_input_converter.py JaeWoo Test_samples') 14 | 15 | 16 | """ 17 | --> Conversion format 18 | Barcode.txt 19 | ACACACACACACAGCTCATA:ACACACACACACAGCTCATA 20 | Reference.txt 21 | 
def Convert_Indelsearcher_output(strSampleRefGroup):
    """Convert one sample's Indel_searcher files into Base_edit_2 input files.

    strSampleRefGroup is one tab-separated row of the project sample list:
    column 0 is the sample name, column 1 the reference name.

    Writes, under ../Base_edit_2/Input/<user>/:
      * Reference/<project>/<ref>/Barcode.txt   - '<barcode>:<barcode>' per line
      * Reference/<project>/<ref>/Reference.txt - '<barcode>:<reference>' per line
      * Query/<project>/<sample>/<barcode>.txt  - the read sequences of that barcode

    Reads the module-level strUser / strProject values.
    """
    listSampleRefGroup = strSampleRefGroup.replace('\n', '').replace('\r', '').split('\t')

    strSample = listSampleRefGroup[0]
    strRef = listSampleRefGroup[1]

    print('Processing: %s, %s' % (strSample, strRef))

    strIndelRefFolder = './Input/{user}/Reference/{project}/{ref}'.format(user=strUser,
                                                                          project=strProject,
                                                                          ref=strRef)
    strBaseEditRefFolder = '../Base_edit_2/Input/{user}/Reference/{project}/{ref}'.format(user=strUser,
                                                                                          project=strProject,
                                                                                          ref=strRef)
    strBaseEditQueryFolder = '../Base_edit_2/Input/{user}/Query/{project}/{sample}'.format(user=strUser,
                                                                                           project=strProject,
                                                                                           sample=strSample)

    # One try per folder: a failure on the first must not skip the second.
    for strFolder in (strBaseEditRefFolder, strBaseEditQueryFolder):
        try:
            Helper.MakeFolderIfNot(strFolder)
        except OSError as e:
            print(e)

    dictBarcodeSeq = {}

    ## BaseEdit refer format : filename, barcode, reference
    # All four handles are context-managed so the written files are flushed and
    # closed even on error (the Barcode.txt output used to be left open).
    with open(strIndelRefFolder + '/Reference_sequence.txt') as ReferenceFile_in_IndelSearcher, \
            open(strIndelRefFolder + '/Barcode.txt') as BarcodeFile_in_IndelSearcher, \
            open(strBaseEditRefFolder + '/Barcode.txt', 'w') as BarcodeFile_for_BaseEdit, \
            open(strBaseEditRefFolder + '/Reference.txt', 'w') as Reference_for_BaseEdit:

        for strBarcodeIndelSearcher, strReferenceIndelSearcher in zip(BarcodeFile_in_IndelSearcher,
                                                                      ReferenceFile_in_IndelSearcher):

            strBarcodeIndelSearcher = strBarcodeIndelSearcher.replace('\n', '').strip()
            strReferenceIndelSearcher = strReferenceIndelSearcher.replace('\n', '').strip()

            dictBarcodeSeq[strBarcodeIndelSearcher] = []
            ## first is filename, second is barcode. BaseEdit barcode format
            BarcodeFile_for_BaseEdit.write(strBarcodeIndelSearcher + ':' + strBarcodeIndelSearcher + '\n')
            ## conversion target to barcode:refseq
            Reference_for_BaseEdit.write(strBarcodeIndelSearcher + ':' + strReferenceIndelSearcher + '\n')

    strFastqPath = './Output/{user}/{project}/{sample}/Tmp/{sample}_Classified_Indel_barcode.fastq'.format(
        user=strUser, project=strProject, sample=strSample)

    with open(strFastqPath) as Total_result_file:  ## for query reads
        strBarcode = None
        for i, strRow in enumerate(Total_result_file):
            intLineInRecord = i % 4

            if intLineInRecord == 0:
                # Header line: the barcode sits between 'Barcode_' and the next ':'.
                strBarcode = strRow.split('Barcode_')[1].split(':')[0]
            elif intLineInRecord == 1:
                # Sequence line (newline kept); a barcode missing from
                # Barcode.txt raises KeyError, as before.
                dictBarcodeSeq[strBarcode].append(strRow)

    for strBarcode, listSeq in dictBarcodeSeq.items():
        with open('../Base_edit_2/Input/{user}/Query/{project}/{sample}/{barcode}.txt'.format(
                user=strUser, project=strProject, sample=strSample, barcode=strBarcode), 'w') as Output:
            Output.write(''.join(listSeq))


def Main():
    """Convert every non-comment row of ./User/<user>/<project>.txt on two workers."""
    print('Program Start')

    with open('./User/{user}/{project}.txt'.format(user=strUser, project=strProject)) as SampleList:
        # Skip '#' comment rows and blank rows (a blank row has no reference column).
        listSampleRefGroup = [strSampleRefGroup for strSampleRefGroup in SampleList
                              if strSampleRefGroup.strip() and strSampleRefGroup[0] != '#']

    p = mp.Pool(2)
    try:
        p.map_async(Convert_Indelsearcher_output, listSampleRefGroup).get()
    finally:
        p.close()
        p.join()

    print('Program End')


if __name__ == '__main__':
    Main()
32 | listFastq = [] 33 | 34 | for i, strRow in enumerate(Input): 35 | listFastq.append(strRow.replace('\n', '')) 36 | if i % 4 == 3: 37 | #print(listFastq) 38 | if strBarcode in listFastq[1]: 39 | Output.write('\n'.join(listFastq)+'\n') 40 | listFastq = [] 41 | 42 | 43 | def LoadPickle(): 44 | 45 | with open('Output/Test1/Pickle/Test1.fastq_1.fq.pickle', 'rb') as Input: 46 | obj = pickle.load(Input) 47 | set_trace() 48 | 49 | 50 | def CheckSearch(): 51 | dRef = {} 52 | dResult = {} 53 | 54 | with open(sRef_fa) as Ref: 55 | 56 | iCount = 0 57 | sBarcode = "" 58 | sTarget_region = "" 59 | 60 | for sRow in Ref: 61 | iCount += 1 62 | 63 | if iCount % 2 != 0: 64 | # barcode target region 65 | # >CGCTCTACGTAGACA:CTCTATTACTCGCCCCACCTCCCCCAGCCC 66 | sBarcode_indel_seq = sRow.strip().replace('\n', '').replace('\r', '').split(':') 67 | sBarcode = sBarcode_indel_seq[0].replace('>', '') 68 | sTarget_region = sBarcode_indel_seq[1] 69 | 70 | ## Reverse the sentence. If it is done, all methods are same before work. 71 | if sBarcode_PAM_pos == 'Reverse': 72 | sBarcode = sBarcode[::-1] 73 | sTarget_region = sTarget_region[::-1] 74 | 75 | elif iCount % 2 == 0: 76 | ## Reverse 77 | sRef_seq = sRow.strip().replace('\n', '').replace('\r', '') 78 | 79 | if sBarcode_PAM_pos == 'Reverse': 80 | sRef_seq = sRef_seq[::-1] 81 | 82 | Seq_matcher = re.compile(r'(?=(%s))' % sTarget_region) 83 | # iIndel_start_pos = sRef_seq.index(sTarget_region) # There is possible to exist two indel. 
84 | iIndel_start_pos = Seq_matcher.finditer(sRef_seq) 85 | 86 | for i, match in enumerate(iIndel_start_pos): 87 | iIndel_start_pos = match.start() 88 | # print iIndel_start_pos 89 | # print len(sTarget_region) 90 | # print sRef_seq 91 | iIndel_end_pos = iIndel_start_pos + len(sTarget_region) - 1 92 | 93 | try: 94 | iBarcode_start_pos = sRef_seq.index(sBarcode) 95 | 96 | #if iIndel_start_pos <= iBarcode_start_pos: 97 | # print(iIndel_start_pos, iBarcode_start_pos) 98 | # raise IndexError('indel is before barcode') 99 | 100 | iBarcode_end_pos = iBarcode_start_pos + len(sBarcode) - 1 101 | sRef_seq_after_barcode = sRef_seq[iBarcode_end_pos + 1:] 102 | 103 | # modified. to -1 104 | iIndel_end_next_pos_from_barcode_end = iIndel_end_pos - iBarcode_end_pos - 1 105 | 106 | iIndel_start_next_pos_from_barcode_end = iIndel_start_pos - iBarcode_end_pos - 1 107 | 108 | # "barcode"-------------*(N) that distance. 109 | # ^ ^ ^ 110 | # *NNNN*NNNN 111 | # ^ ^ indel pos, the sequence matcher selects indel event pos front of it. 112 | 113 | dRef[sBarcode] = (sRef_seq, sTarget_region, sRef_seq_after_barcode, iIndel_start_next_pos_from_barcode_end, 114 | iIndel_end_next_pos_from_barcode_end, iIndel_start_pos,iIndel_end_pos) # total matched reads, insertion, deletion, complex 115 | dResult[sBarcode] = [0, 0, 0, 0, [], [], [], [], []] 116 | except ValueError: 117 | continue 118 | 119 | with open('test.seq') as Input: 120 | 121 | iBarcode_matched = 0 122 | 123 | for sSeq in Input: 124 | sSeq = sSeq.replace('\n','') 125 | 126 | listSeqWindow = [sSeq[i:i + 26] for i in range(len(sSeq))[:-25]] 127 | 128 | iNeedle_matched = 0 129 | iInsert_count = 0 130 | iDelete_count = 0 131 | iComplex_count = 0 132 | 133 | intFirstBarcode = 0 ## check whether a barcode is one in a sequence. 134 | 135 | for strSeqWindow in listSeqWindow: 136 | 137 | if intFirstBarcode == 1: break ## A second barcode in a sequence is not considerable. 
138 | 139 | try: 140 | lCol_ref = dRef[strSeqWindow] 141 | sBarcode = strSeqWindow 142 | intFirstBarcode = 1 143 | except KeyError: 144 | continue 145 | 146 | iBarcode_matched += 1 147 | 148 | print(iBarcode_matched) 149 | 150 | 151 | def CheckNeedle(): 152 | 153 | sBarcode = 'TTTGACTAGTCATCACTATAGCATAA' 154 | sRef_seq_after_barcode = 'TACAGTGTTTTTTTTTTTTCAGAGGAAGCTTGGCGTAACTAGATCT' 155 | sQuery_seq_after_barcode = 'TACAGTGTTTTTTTTTTTCAGAGGAAGCTTGGCGTAACTAGATCTCTACTCTACCACTTGTACTTCAGCGGTCAGCTTACTCGACTTAATA' 156 | 157 | sRef_seq = r'<(echo -e ">{name}\n{seq}")'.format(name='Ref', seq=sRef_seq_after_barcode) 158 | sQuery_seq = r'<(echo -e ">{name}\n{seq}")'.format(name='Query', seq=sQuery_seq_after_barcode) 159 | 160 | sNeedle_cmd = r"/bin/bash -c 'needle -filter {0} {1} -outfile stdout -gapopen {2} -gapextend {3} -endweight Y -endopen {4} -endextend {5}'".format(sRef_seq, sQuery_seq, '20', '1', '20', '1') 161 | 162 | Needle_result = sp.Popen(sNeedle_cmd, stdout=sp.PIPE, stderr=sp.PIPE, universal_newlines=True,shell=True) 163 | lResult = [Instance.seq._data for Instance in AlignIO.read(Needle_result.stdout, "emboss")] 164 | print(lResult) 165 | 166 | 167 | def LoggingTest(): 168 | import logging 169 | logging.basicConfig(format='%(process)d %(levelname)s %(asctime)s : %(message)s', 170 | level=logging.DEBUG, filename='test.log', filemode='a' 171 | ) 172 | logging.getLogger().addHandler(logging.StreamHandler(sys.stdout)) 173 | 174 | logging.info('test') 175 | a = a * 10 176 | 177 | 178 | def Main(): 179 | #CountBar() 180 | #ExtractFastq() 181 | #LoadPickle() 182 | #CheckSearch() 183 | #CheckNeedle() 184 | LoggingTest() 185 | 186 | Main() 187 | -------------------------------------------------------------------------------- /Indel_searcher_2/Flash_pair_read_merge.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import subprocess as sp 3 | 4 | strUser = sys.argv[1] 5 | strProject = sys.argv[2] 6 | strFlash = 
def RunFlash():
    """Merge the paired-end reads of every sample directory with FLASH.

    For each sub-directory of ./Input/<user>/FASTQ/<project> that contains a
    '*_1.fastq.gz' / '*_2.fastq.gz' pair, runs
        ../<flash>/flash -m 10 -M 400 -O -o <prefix> -t <thread> <r1> <r2>
    with output redirected to ./Output/<user>/<project>/Log/flash.log, then
    moves the *.fastq.gz inputs up to the project folder and deletes the
    '*hist*' and '*notCombined*' files FLASH left in the sample folder.

    Reads the module-level strUser / strProject / strFlash / strThread values.
    """
    # Function-scope imports: SOURCE shows only os, sys and subprocess imported
    # at file level.
    import glob
    import shutil

    strFlashDir = '../{flash}'.format(flash=strFlash)
    strProjectDir = './Input/{user}/FASTQ/{project}'.format(user=strUser, project=strProject)
    strLog = './Output/{user}/{project}/Log'.format(user=strUser, project=strProject)

    if not os.path.isdir(strLog):
        os.makedirs(strLog)

    for strSampleDir in sorted(os.listdir(strProjectDir)):
        strSamplePath = os.path.join(strProjectDir, strSampleDir)

        if not os.path.isdir(strSamplePath):
            continue

        # Pick R1 / R2 by name: os.listdir order is arbitrary, so the first
        # two matches are not guaranteed to be (forward, reverse).
        listFiles = sorted(os.listdir(strSamplePath))
        listForward = [strFile for strFile in listFiles if '_1.fastq.gz' in strFile]
        listReverse = [strFile for strFile in listFiles if '_2.fastq.gz' in strFile]

        if not listForward or not listReverse:
            # Typical on a re-run: the inputs were already moved to the project folder.
            print('skip, no _1.fastq.gz / _2.fastq.gz pair in {sample_path}'.format(sample_path=strSamplePath))
            continue

        strForwardFile = listForward[0]
        strReverseFile = listReverse[0]

        strForward = os.path.join(strSamplePath, strForwardFile)
        strReverse = os.path.join(strSamplePath, strReverseFile)
        strOutput = os.path.join(strSamplePath, strForwardFile.replace('_1.fastq.gz', ''))

        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters are passed through untouched.
        listCmd = [os.path.join(strFlashDir, 'flash'), '-m', '10', '-M', '400', '-O',
                   '-o', strOutput, '-t', str(strThread), strForward, strReverse]

        print(' '.join(listCmd))
        # NOTE(review): 'w' keeps the original '>' semantics, so flash.log holds
        # only the most recent sample's output.
        with open(os.path.join(strLog, 'flash.log'), 'w') as LogFile:
            intReturn = sp.call(listCmd, stdout=LogFile, stderr=sp.STDOUT)

        if intReturn != 0:
            # Leave the inputs in place so the sample can be retried.
            print('flash failed with exit status {code} for {sample_path}, see {log}/flash.log'.format(
                code=intReturn, sample_path=strSamplePath, log=strLog))
            continue

        print('complete, {fow} {rev} are moved to project folder'.format(fow=strForwardFile, rev=strReverseFile))

        for strPath in glob.glob(os.path.join(strSamplePath, '*.fastq.gz')):
            shutil.move(strPath, os.path.join(strProjectDir, os.path.basename(strPath)))

        # The old cleanup also passed the project directory to `rm`; that call
        # failed and stopped the '&&' chain before '*notCombined*' was removed.
        for strPattern in ('*hist*', '*notCombined*'):
            for strPath in glob.glob(os.path.join(strSamplePath, strPattern)):
                os.remove(strPath)


def Main():
    """Script entry point."""
    RunFlash()


if __name__ == '__main__':
    Main()
def MakeIndelSummary():
    """Summarise <sample>_Indel_freq.txt into per-barcode indel frequencies.

    Input  ({outdir}/Tmp/{sample}_Indel_freq.txt), tab-separated columns:
        0: barcode
        1: target region
        2: indel pos (e.g. 17M1I)
        3: total freq
        4: ref seq
        5: query seq
    Only columns 0 and 2 are used.

    Output ({outdir}/Tmp/{sample}_Indel_summary.txt), one row per barcode:
        <barcode>\t<row count>\t<indel>:<count>:<percent>, <indel>:<count>:<percent>, ...
    e.g.
        TTTGTCTCGTACACTCGTATGCTGCA  2  18M2D:1:50.0, 24M1I:1:50.0,
        TTTGACATCTACAGTGTCTCTCCACA  2  22M1I:2:100.0,
    Barcodes are ordered by row count and indels by count, both descending;
    ties keep first-seen order.

    Reads the module-level strOutputDir / strSample values.
    """
    strInputPath = '{outdir}/Tmp/{sample}_Indel_freq.txt'.format(sample=strSample, outdir=strOutputDir)
    strOutputPath = '{outdir}/Tmp/{sample}_Indel_summary.txt'.format(sample=strSample, outdir=strOutputDir)

    # {'TTTGACATCTACAGTGTCTCTCCACA': {'22M1I': 2, ...}}, filled in first-seen
    # order so the stable sorts below give a reproducible tie order.
    dictINDEL = OrderedDict()

    with open(strInputPath) as InputFreq:
        for strRow in InputFreq:
            listCol = strRow.replace('\n', '').split('\t')
            if len(listCol) < 3:
                continue  # blank or truncated row: no indel column to count

            dictCount = dictINDEL.setdefault(listCol[0], OrderedDict())
            dictCount[listCol[2]] = dictCount.get(listCol[2], 0) + 1

    list2Result = []
    for strBarcode, dictCount in dictINDEL.items():
        intTotal = sum(dictCount.values())
        list2INDEL = sorted(dictCount.items(), key=lambda x: x[1], reverse=True)
        # Scale before rounding so values such as 33.3 are not printed as
        # 33.300000000000004.
        strIndelResult = ''.join([':'.join(map(str, [strIndel, intCount,
                                                     round(100.0 * intCount / intTotal, 1)])) + ', '
                                  for strIndel, intCount in list2INDEL])
        list2Result.append([strBarcode, intTotal, strIndelResult])

    with open(strOutputPath, 'w') as OutputFreq:
        for listResult in sorted(list2Result, key=lambda x: x[1], reverse=True):
            OutputFreq.write('\t'.join(map(str, listResult)) + '\n')


if __name__ == '__main__':
    logging.info('Indel frequency calculator start: %s' % str(datetime.now()))
    MakeIndelSummary()
    logging.info('Indel frequency calculator end: %s' % str(datetime.now()))
def MakeTmp(dictD0Indel, dictExpIndel):
    """Write '<barcode>:<indel>_<seq>\\t<count>' rows for every sample.

    Both arguments map sample -> barcode -> {'Total': int, indel_key: {'IndelCount': int}}.
    One file, ./Output/<user>/<project>/<sample>/Tmp/<sample>_indel_seq_count.txt,
    is written per sample; the 'Total' entry is not written.

    Reads the module-level strUserName / strProjectName values.
    """
    for dictIndel in [dictD0Indel, dictExpIndel]:
        for strSample, dictBarcode in dictIndel.items():
            strTmpDir = './Output/{user}/{project}/{sample}/Tmp'.format(user=strUserName,
                                                                        project=strProjectName,
                                                                        sample=strSample)
            with open(os.path.join(strTmpDir, strSample + '_indel_seq_count.txt'), 'w') as Output:
                for strBarcode, dictCountTotalAndIndel in dictBarcode.items():
                    for strIndelSeq, dictCount in dictCountTotalAndIndel.items():
                        if strIndelSeq == 'Total':
                            continue
                        Output.write('\t'.join([strIndelSeq, str(dictCount['IndelCount'])]) + '\n')


def MergeD0SampleResults(dictD0Indel):
    """Sum the control (D0) samples into one per-barcode table.

    dictD0Indel:
        {'sample_*': {'ACGATCGAT': {'Total': 300, 'ACGATCGAT:30M2I_AG': {'IndelCount': 3}}}}

    -> sum 'Total' and every 'IndelCount' over all samples

    returns:
        {'ACGATCGAT': {'Total': 600, 'ACGATCGAT:30M2I_AG': {'IndelCount': 5}}}

    The input is not modified.  A barcode entry without 'Total' raises KeyError.
    """
    dictD0IndelMerge = {}

    for dictBarcode in dictD0Indel.values():
        for strBarcode, dictCountTotalAndIndel in dictBarcode.items():

            # Explicit membership test instead of a broad try/except KeyError,
            # so a missing 'Total' in the input is reported where it occurs.
            intTotal = dictCountTotalAndIndel['Total']
            if strBarcode not in dictD0IndelMerge:
                dictD0IndelMerge[strBarcode] = {'Total': 0}
            dictMerged = dictD0IndelMerge[strBarcode]
            dictMerged['Total'] += intTotal

            for strIndelSeq, dictCount in dictCountTotalAndIndel.items():
                if strIndelSeq == 'Total':
                    continue

                intCount = dictCount['IndelCount']
                dictMergedCount = dictMerged.setdefault(strIndelSeq, {'IndelCount': 0})
                dictMergedCount['IndelCount'] += intCount

    return dictD0IndelMerge


def SubtractIndelWithD0(dictD0IndelMerge, dictExpIndel):
    """Write, per experiment sample, each indel's proportion minus its D0 proportion.

    Output: ./Output/<user>/<project>/All_results/D0SubResult/<sample>_D0SubResult.txt
    with columns
        Barcode_indel_seq, D0_total, D0_indel_prop, Exp_total, Exp_indel_prop, D0_sub_indel_prop
    Proportions are count / total rounded to 6 decimals.  The subtracted value
    is floored at 0.  When the barcode or the indel is absent from
    dictD0IndelMerge the two D0 columns are 'None' and the last column equals
    the experiment proportion.

    Reads the module-level strUserName / strProjectName values.
    """
    strD0SubResultDir = './Output/{user}/{project}/All_results/D0SubResult'.format(user=strUserName,
                                                                                   project=strProjectName)
    Helper.MakeFolderIfNot(strD0SubResultDir)

    for strSample, dictBarcode in dictExpIndel.items():
        # Format only the file name: formatting the joined path would choke on
        # braces in the user or project name.
        strOutputPath = os.path.join(strD0SubResultDir,
                                     '{sample}_D0SubResult.txt'.format(sample=strSample))

        with open(strOutputPath, 'w') as Output:
            Output.write('Barcode_indel_seq\tD0_total\tD0_indel_prop\tExp_total\tExp_indel_prop\tD0_sub_indel_prop\n')

            for strBarcode, dictCountTotalAndIndel in dictBarcode.items():

                intExpTotal = dictCountTotalAndIndel['Total']

                for strIndelSeq, dictCount in dictCountTotalAndIndel.items():
                    if strIndelSeq == 'Total':
                        continue

                    intExpCount = dictCount['IndelCount']
                    floExpProp = round(intExpCount / float(intExpTotal), 6)

                    # The try covers the D0 lookups only, so a KeyError here
                    # can only mean "not seen in D0".
                    try:
                        intD0Total = dictD0IndelMerge[strBarcode]['Total']
                        intD0Count = dictD0IndelMerge[strBarcode][strIndelSeq]['IndelCount']
                    except KeyError:
                        listRow = [strIndelSeq, 'None', 'None',
                                   intExpTotal, floExpProp, floExpProp]
                    else:
                        floD0Prop = round(intD0Count / float(intD0Total), 6)

                        floSubExpIndel = floExpProp - floD0Prop
                        if floSubExpIndel < 0:
                            floSubExpIndel = 0

                        listRow = [strIndelSeq, intD0Total, floD0Prop,
                                   intExpTotal, floExpProp, floSubExpIndel]

                    Output.write('\t'.join(map(str, listRow)) + '\n')
class clsParameter(object):
    """Command-line parameters of the indel searcher, read from sys.argv[1:14]."""

    # Number of sys.argv entries needed (program name + 13 positional arguments).
    intRequiredArgc = 14

    def __init__(self):

        if len(sys.argv) >= self.intRequiredArgc:
            self.strForwardFqPath = sys.argv[1]
            self.strReverseFqPath = sys.argv[2]
            self.strRefFa = sys.argv[3]
            self.strPair = sys.argv[4]
            self.floOg = float(sys.argv[5])
            self.floOe = float(sys.argv[6])
            self.intInsertionWin = int(sys.argv[7])
            self.intDeletionWin = int(sys.argv[8])
            self.strPamType = sys.argv[9].upper()  ## Cpf1, Cas9
            self.strBarcodePamPos = sys.argv[10]   ## PAM - BARCODE type (reverse) or BARCODE - PAM type (forward)
            self.intQualCutoff = int(sys.argv[11])
            self.strOutputdir = sys.argv[12]
            self.strLogPath = sys.argv[13]
            self.strEDNAFULL = os.path.abspath('../EDNAFULL')

        else:
            sManual = """
            Usage:

            python2.7 ./indel_search_ver1.0.py splitted_input_1.fq splitted_input_2.fq reference.fa

            splitted_input_1.fq : forward
            splitted_input_2.fq : reverse

            Total FASTQ(fq) lines / 4 = remainder 0.
            """
            print(sManual)
            if len(sys.argv) > 1:
                # Some, but not all, arguments were given: show the manual and
                # exit non-zero instead of failing with an IndexError.
                sys.exit(1)
            sys.exit()


class clsFastqOpener(object):
    """Load the forward / reverse FASTQ files as lists of (name, sequence, qualities)."""

    # Complement table used to reverse-complement the reverse reads; any other
    # base raises KeyError.
    dictRev = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}

    def __init__(self, InstParameter):

        self.strForwardFqPath = InstParameter.strForwardFqPath
        self.strReverseFqPath = InstParameter.strReverseFqPath

    def OpenFastqForward(self):
        """Return [(NAME, SEQ, [qual, ...]), ...] for the forward FASTQ file."""
        return self._ReadFastq(self.strForwardFqPath, False)

    def OpenFastqReverse(self):
        """Return the reverse FASTQ records, reverse-complemented.

        Sequences are reverse-complemented and the quality lists reversed, so
        both are in forward-strand orientation.
        """
        return self._ReadFastq(self.strReverseFqPath, True)

    def _ReadFastq(self, strFqPath, boolReverseComplement):
        """Parse 4-line FASTQ records from strFqPath.

        Name and sequence are upper-cased.  Qualities are decoded as
        ord(char) - 33.  A trailing incomplete record is dropped.
        """
        listFastq = []
        listStore = []

        with open(strFqPath) as Fastq:

            for intLine, strRow in enumerate(Fastq, 1):
                # Strip '\r' as well, so CRLF files do not leak it into fields.
                strRow = strRow.rstrip('\r\n')
                intLineInRecord = intLine % 4

                if intLineInRecord == 1:
                    listStore.append(strRow.upper())

                elif intLineInRecord == 2:
                    strSeq = strRow.upper()
                    if boolReverseComplement:
                        strSeq = ''.join([self.dictRev[strNucle] for strNucle in reversed(strSeq)])
                    listStore.append(strSeq)

                elif intLineInRecord == 0:
                    # Decode the quality line as written: upper-casing it first
                    # would remap lowercase quality characters (Q >= 64).
                    listQual = [ord(strChar) - 33 for strChar in strRow]
                    if boolReverseComplement:
                        listQual.reverse()
                    listStore.append(listQual)
                    listFastq.append(tuple(listStore))
                    listStore = []

        return listFastq
111 | self.intNumOfTotal = 0 112 | self.intNumOfIns = 1 113 | self.intNumOfDel = 2 114 | self.intNumofCom = 3 115 | self.intTotalFastq = 4 116 | self.intInsFastq = 5 117 | self.intDelFastq = 6 118 | self.intComFastq = 7 119 | self.intIndelInfo = 8 120 | 121 | self.strRefFa = InstParameter.strRefFa 122 | self.floOg = InstParameter.floOg 123 | self.floOe = InstParameter.floOe 124 | self.strEDNAFULL = InstParameter.strEDNAFULL 125 | self.strPamType = InstParameter.strPamType 126 | self.intInsertionWin = InstParameter.intInsertionWin 127 | self.intDeletionWin = InstParameter.intDeletionWin 128 | self.intQualCutoff = InstParameter.intQualCutoff 129 | 130 | def SearchBarcodeIndelPosition(self, sBarcode_PAM_pos): 131 | 132 | dRef = {} 133 | dResult = {} 134 | 135 | with open(self.strRefFa) as Ref: 136 | 137 | sBarcode = "" 138 | sTarget_region = "" 139 | intBarcodeLen = 0 140 | 141 | for i, sRow in enumerate(Ref): 142 | 143 | if i % 2 == 0: ## >CGCTCTACGTAGACA:CTCTATTACTCGCCCCACCTCCCCCAGCCC 144 | sBarcode, sTarget_region, intBarcodeLen = self._SeperateFaHeader(sRow, sBarcode, sTarget_region, 145 | intBarcodeLen, sBarcode_PAM_pos) 146 | 147 | elif i % 2 != 0: ## AGCATCGATCAGCTACGATCGATCGATCACTAGCTACGATCGATCA 148 | sRef_seq, iIndel_start_pos, iIndel_end_pos = self._SearchIndelPos(sRow, sBarcode_PAM_pos, sTarget_region) 149 | 150 | try: 151 | self._MakeRefAndResultTemplate(sRef_seq, sBarcode, iIndel_start_pos, iIndel_end_pos, 152 | sTarget_region, dRef, dResult) 153 | except ValueError: 154 | continue 155 | 156 | assert len(dRef.keys()) == len(dResult.keys()) 157 | 158 | return dRef, dResult 159 | # end1: return 160 | 161 | def _SeperateFaHeader(self, sRow, sBarcode, sTarget_region, intBarcodeLen, sBarcode_PAM_pos): 162 | 163 | # barcode target region 164 | # >CGCTCTACGTAGACA:CTCTATTACTCGCCCCACCTCCCCCAGCCC 165 | sBarcode_indel_seq = sRow.strip().replace('\n', '').replace('\r', '').split(':') 166 | sBarcode = sBarcode_indel_seq[0].replace('>', '') 167 | 168 | if intBarcodeLen 
> 0: 169 | assert intBarcodeLen == len(sBarcode), 'All of the barcode lengths must be same.' 170 | intBarcodeLen = len(sBarcode) 171 | 172 | sTarget_region = sBarcode_indel_seq[1] 173 | 174 | ## Reverse the sentence. If it is done, all methods are same before work. 175 | if sBarcode_PAM_pos == 'Reverse': 176 | sBarcode = sBarcode[::-1] 177 | sTarget_region = sTarget_region[::-1] 178 | 179 | return (sBarcode, sTarget_region, intBarcodeLen) 180 | 181 | def _SearchIndelPos(self, sRow, sBarcode_PAM_pos, sTarget_region): 182 | 183 | sRef_seq = sRow.strip().replace('\n', '').replace('\r', '') 184 | 185 | if sBarcode_PAM_pos == 'Reverse': 186 | sRef_seq = sRef_seq[::-1] 187 | 188 | Seq_matcher = re.compile(r'(?=(%s))' % sTarget_region) 189 | # iIndel_start_pos = sRef_seq.index(sTarget_region) # There is possible to exist two indel. 190 | iIndel_start_pos = Seq_matcher.finditer(sRef_seq) 191 | 192 | for i, match in enumerate(iIndel_start_pos): 193 | iIndel_start_pos = match.start() 194 | # print iIndel_start_pos 195 | # print len(sTarget_region) 196 | # print sRef_seq 197 | iIndel_end_pos = iIndel_start_pos + len(sTarget_region) - 1 198 | 199 | return (sRef_seq, iIndel_start_pos, iIndel_end_pos) 200 | 201 | def _MakeRefAndResultTemplate(self, sRef_seq, sBarcode, iIndel_start_pos, 202 | iIndel_end_pos, sTarget_region, dRef, dResult): 203 | iBarcode_start_pos = sRef_seq.index(sBarcode) 204 | 205 | # if iIndel_start_pos <= iBarcode_start_pos: 206 | # print(iIndel_start_pos, iBarcode_start_pos) 207 | # raise IndexError('indel is before barcode') 208 | 209 | iBarcode_end_pos = iBarcode_start_pos + len(sBarcode) - 1 210 | sRef_seq_after_barcode = sRef_seq[iBarcode_end_pos + 1:] 211 | 212 | # modified. to -1 213 | iIndel_end_next_pos_from_barcode_end = iIndel_end_pos - iBarcode_end_pos - 1 214 | iIndel_start_next_pos_from_barcode_end = iIndel_start_pos - iBarcode_end_pos - 1 215 | 216 | # "barcode"-------------*(N) that distance. 
217 | # ^ ^ ^ 218 | # *NNNN*NNNN 219 | # ^ ^ indel pos, the sequence matcher selects indel event pos front of it. 220 | 221 | ## Result 222 | dRef[sBarcode] = (sRef_seq, sTarget_region, sRef_seq_after_barcode, iIndel_start_next_pos_from_barcode_end, 223 | iIndel_end_next_pos_from_barcode_end, iIndel_start_pos, iIndel_end_pos) # total matched reads, insertion, deletion, complex 224 | dResult[sBarcode] = [0, 0, 0, 0, [], [], [], [], []] 225 | 226 | 227 | def SearchIndel(self, lFASTQ=[], dRef = {}, dResult={}, sBarcode_PAM_pos=""): 228 | 229 | # lFASTQ : [(seq, qual),(seq, qual)] 230 | # lRef : [(ref_seq, ref_seq_after_barcode, barcode, barcode end pos, indel end pos, indel from barcode),(...)] 231 | # dResult = [# of total, # of ins, # of del, # of com, [total FASTQ], [ins FASTQ], [del FASTQ], [com FASTQ]] 232 | iCount = 0 233 | intBarcodeLen = len(dRef.keys()[0]) 234 | #print('intBarcodeLen', intBarcodeLen) 235 | 236 | InstGotoh = CoreGotoh(strEDNAFULL=self.strEDNAFULL, floOg=self.floOg, floOe=self.floOe) 237 | 238 | for lCol_FASTQ in lFASTQ: 239 | sName = lCol_FASTQ[0] 240 | if sBarcode_PAM_pos == 'Reverse': 241 | sSeq = lCol_FASTQ[1][::-1] 242 | lQual = lCol_FASTQ[2][::-1] 243 | else: 244 | sSeq = lCol_FASTQ[1] 245 | lQual = lCol_FASTQ[2] 246 | 247 | assert isinstance(sName, str) and isinstance(sSeq, str) and isinstance(lQual, list) 248 | 249 | listSeqWindow = CoreHash.MakeHashTable(sSeq, intBarcodeLen) 250 | 251 | iBarcode_matched = 0 252 | iInsert_count = 0 253 | iDelete_count = 0 254 | iComplex_count = 0 255 | 256 | intFirstBarcode = 0 ## check whether a barcode is one in a sequence. 257 | 258 | for strSeqWindow in listSeqWindow: 259 | 260 | if intFirstBarcode == 1: break ## A second barcode in a sequence is not considerable. 
261 | 262 | try: 263 | lCol_ref, sBarcode, intFirstBarcode = CoreHash.IndexHashTable(dRef, strSeqWindow, intFirstBarcode) 264 | except KeyError: 265 | continue 266 | 267 | sRef_seq = lCol_ref[0] 268 | sTarget_region = lCol_ref[1] 269 | iIndel_seq_len = len(sTarget_region) 270 | sRef_seq_after_barcode = lCol_ref[2] 271 | iIndel_start_from_barcode_pos = lCol_ref[3] 272 | iIndel_end_from_barcode_pos = lCol_ref[4] 273 | try: 274 | if self.strPamType == 'CAS9': 275 | iKbp_front_Indel_end = iIndel_end_from_barcode_pos - 6 ## cas9:-6, cpf1:-4 276 | elif self.strPamType == 'CAF1': 277 | iKbp_front_Indel_end = iIndel_end_from_barcode_pos - 4 ## NN(N)*NNN(N)*NNNN 278 | except Exception: 279 | set_trace() 280 | 281 | """ 282 | * ^ : iIndel_end_from_barcode_pos 283 | GGCG TCGCTCATGTACCTCCCGT 284 | TATAGTCTGTCATGCGATGGCG---TCGCTCATGTACCTCCCGTTACAGCCACAAAGCAGGA 285 | * 286 | GGCGTC GCTCATGTACCTCCCGT 287 | 6 17 288 | """ 289 | 290 | ## bug fix 291 | if sBarcode == "": continue 292 | 293 | (sSeq, iBarcode_matched, sQuery_seq_after_barcode, lQuery_qual_after_barcode) = \ 294 | self._CheckBarcodePosAndRemove(sSeq, sBarcode, iBarcode_matched, lQual) 295 | 296 | ## Alignment Seq to Ref 297 | npGapIncentive = InstGotoh.GapIncentive(sRef_seq_after_barcode) 298 | 299 | try: 300 | lResult = InstGotoh.RunCRISPResso2(sQuery_seq_after_barcode.upper(), 301 | sRef_seq_after_barcode.upper(), 302 | npGapIncentive) 303 | except Exception as e: 304 | logging.error(e, exc_info=True) 305 | continue 306 | 307 | sQuery_needle_ori = lResult[0] 308 | sRef_needle_ori = lResult[1] 309 | 310 | sRef_needle, sQuery_needle = self._TrimRedundantSideAlignment(sRef_needle_ori, sQuery_needle_ori) 311 | lInsertion_in_read, lDeletion_in_read = self._MakeIndelPosInfo(sRef_needle, sQuery_needle) 312 | 313 | # print 'sQuery_needle', sQuery_needle 314 | # print 'lInsertion_in_read: onebase', lInsertion_in_read 315 | # print 'lDeletion_in_read: onebase', lDeletion_in_read 316 | # print 'i5bp_front_Indel_end', 
i5bp_front_Indel_end 317 | # print 'iIndel_end_from_barcode_pos', iIndel_end_from_barcode_pos 318 | 319 | lTarget_indel_result = [] # ['20M2I', '23M3D' ...] 320 | 321 | iInsert_count = self._TakeInsertionFromAlignment(lInsertion_in_read, iKbp_front_Indel_end, lTarget_indel_result, 322 | iIndel_end_from_barcode_pos, iInsert_count) 323 | 324 | iDelete_count = self._TakeDeletionFromAlignment(lDeletion_in_read, iKbp_front_Indel_end, lTarget_indel_result, 325 | iIndel_end_from_barcode_pos, iDelete_count) 326 | 327 | if iInsert_count == 1 and iDelete_count == 1: 328 | iComplex_count = 1 329 | iInsert_count = 0 330 | iDelete_count = 0 331 | 332 | # """ test set 333 | # print 'sBarcode', sBarcode 334 | # print 'sTarget_region', sTarget_region 335 | # print 'sRef_seq_after_barcode', sRef_seq_after_barcode 336 | # print 'sSeq_after_barcode', sQuery_seq 337 | # print 'iIndel_start_from_barcode_pos', iIndel_start_from_barcode_pos 338 | # print 'iIndel_end_from_barcode_pos', iIndel_end_from_barcode_pos 339 | # """ 340 | 341 | listResultFASTQ = self._MakeAndStoreQuality(sName, sSeq, lQual, dResult, sBarcode) 342 | 343 | """ 344 | iQual_end_pos + 1 is not correct, because the position is like this. 345 | *NNNN*(N) 346 | So, '+ 1' is removed. 347 | Howerver, seqeunce inspects until (N) position. indel is detected front of *(N). 348 | """ 349 | ################################################################ 350 | #print(lTarget_indel_result) 351 | #set_trace() 352 | # len(sQuery_seq_after_barcode) == len(lQuery_qual_after_barcode) 353 | if np.mean(lQuery_qual_after_barcode[iIndel_start_from_barcode_pos : iIndel_end_from_barcode_pos + 1]) >= self.intQualCutoff: ## Quality cutoff 354 | 355 | """ 356 | 23M3I 357 | 23M is included junk_seq after barcode, 358 | 359 | barcorde junk targetseq others 360 | *********ACCCT-------------ACACACACC 361 | so should select target region. 362 | If junk seq is removed by target region seq index pos. 
363 | """ 364 | # filter start, 365 | iTarget_start_from_barcode = sRef_seq_after_barcode.index(sTarget_region) 366 | lTrimmed_target_indel_result = self._FixPos(lTarget_indel_result, iTarget_start_from_barcode) 367 | 368 | # print 'Check' 369 | # print sRef_seq_after_barcode 370 | # print sQuery_seq_after_barcode 371 | # print lTrimmed_target_indel_result 372 | # print('Trimmed', lTrimmed_target_indel_result) 373 | 374 | sRef_seq_after_barcode, sQuery_seq_after_barcode = self._StoreToDictResult(sRef_seq_after_barcode, sQuery_seq_after_barcode, iTarget_start_from_barcode, 375 | dResult, sBarcode, lTrimmed_target_indel_result, sTarget_region, sRef_needle_ori, 376 | sQuery_needle_ori, iInsert_count, iDelete_count, iComplex_count, listResultFASTQ) 377 | else: 378 | iInsert_count = 0 379 | iDelete_count = 0 380 | iComplex_count = 0 381 | 382 | # total matched reads, insertion, deletion, complex 383 | dResult[sBarcode][self.intNumOfTotal] += iBarcode_matched 384 | dResult[sBarcode][self.intNumOfIns] += iInsert_count 385 | dResult[sBarcode][self.intNumOfDel] += iDelete_count 386 | dResult[sBarcode][self.intNumofCom] += iComplex_count 387 | 388 | iBarcode_matched = 0 389 | iInsert_count = 0 390 | iDelete_count = 0 391 | iComplex_count = 0 392 | 393 | #End:for 394 | #END:for 395 | return dResult 396 | 397 | def _CheckBarcodePosAndRemove(self, sSeq, sBarcode, iBarcode_matched, lQual): 398 | 399 | # Check the barcode pos and remove it. 400 | sSeq = sSeq.replace('\r', '') 401 | iBarcode_start_pos_FASTQ = sSeq.index(sBarcode) 402 | iBarcode_matched += 1 403 | iBarcode_end_pos_FASTQ = iBarcode_start_pos_FASTQ + len(sBarcode) - 1 404 | 405 | """ 406 | junk seq target region 407 | ref: AGGAG AGAGAGAGAGA 408 | que: AGGAG AGAGAGAGAGA 409 | But, It doesnt know where is the target region because of existed indels. 410 | So, There is no way not to include it. 411 | """ 412 | # Use this. 
413 | sQuery_seq_after_barcode = sSeq[iBarcode_end_pos_FASTQ + 1:] 414 | lQuery_qual_after_barcode = lQual[iBarcode_end_pos_FASTQ:] 415 | 416 | return (sSeq, iBarcode_matched, sQuery_seq_after_barcode, lQuery_qual_after_barcode) 417 | 418 | def _TrimRedundantSideAlignment(self, sRef_needle_ori, sQuery_needle_ori): 419 | 420 | # detach forward ---, backward --- 421 | # e.g. ref ------AAAGGCTACGATCTGCG------ 422 | # query AAAAAAAAATCGCTCTCGCTCTCCGATCT 423 | # trimmed ref AAAGGCTACGATCTGCG 424 | # trimmed qeury AAATCGCTCTCGCTCTC 425 | iReal_ref_needle_start = 0 426 | iReal_ref_needle_end = len(sRef_needle_ori) 427 | iRef_needle_len = len(sRef_needle_ori) 428 | 429 | for i, sRef_nucle in enumerate(sRef_needle_ori): 430 | if sRef_nucle in ['A', 'C', 'G', 'T']: 431 | iReal_ref_needle_start = i 432 | break 433 | 434 | for i, sRef_nucle in enumerate(sRef_needle_ori[::-1]): 435 | if sRef_nucle in ['A', 'C', 'G', 'T']: 436 | iReal_ref_needle_end = iRef_needle_len - (i + 1) 437 | # forward 0 1 2 len : 3 438 | # reverse 2 1 0, len - (2 + 1) = 0 439 | break 440 | 441 | sRef_needle = sRef_needle_ori[iReal_ref_needle_start:iReal_ref_needle_end + 1] 442 | if iReal_ref_needle_start: 443 | sQuery_needle = sQuery_needle_ori[:iReal_ref_needle_end] 444 | sQuery_needle = sQuery_needle_ori[:len(sRef_needle)] 445 | # detaching completion 446 | return (sRef_needle, sQuery_needle) 447 | 448 | def _MakeIndelPosInfo(self, sRef_needle, sQuery_needle): 449 | 450 | # indel info making. 
451 | iNeedle_match_pos_ref = 0 452 | iNeedle_match_pos_query = 0 453 | iNeedle_insertion = 0 454 | iNeedle_deletion = 0 455 | 456 | lInsertion_in_read = [] # insertion result [[100, 1], [119, 13]] 457 | lDeletion_in_read = [] # deletion result [[97, 1], [102, 3]] 458 | 459 | # print 'sRef_needle', sRef_needle 460 | # print 'sQuery_needle', sQuery_needle 461 | for i, (sRef_nucle, sQuery_nucle) in enumerate(zip(sRef_needle, sQuery_needle)): 462 | 463 | if sRef_nucle == '-': 464 | iNeedle_insertion += 1 465 | 466 | if sQuery_nucle == '-': 467 | iNeedle_deletion += 1 468 | 469 | if sRef_nucle in ['A', 'C', 'G', 'T']: 470 | if iNeedle_insertion: 471 | lInsertion_in_read.append([iNeedle_match_pos_ref, iNeedle_insertion]) 472 | iNeedle_insertion = 0 473 | iNeedle_match_pos_ref += 1 474 | 475 | if sQuery_nucle in ['A', 'C', 'G', 'T']: 476 | if iNeedle_deletion: 477 | lDeletion_in_read.append([iNeedle_match_pos_query, iNeedle_deletion]) 478 | iNeedle_match_pos_query += iNeedle_deletion 479 | iNeedle_deletion = 0 480 | iNeedle_match_pos_query += 1 481 | # print 'sRef_needle', sRef_needle 482 | 483 | return (lInsertion_in_read, lDeletion_in_read) 484 | 485 | 486 | def _TakeInsertionFromAlignment(self, lInsertion_in_read, iKbp_front_Indel_end, lTarget_indel_result, 487 | iIndel_end_from_barcode_pos, iInsert_count): 488 | """ 489 | ins case 490 | ...............................NNNNNNNNNNNNNN....NNNNNNNNNNNNNNNNNNN*NNNNNAGCTT 491 | """ 492 | for iMatch_pos, iInsertion_pos in lInsertion_in_read: 493 | if self.strPamType == 'CAS9': 494 | # if i5bp_front_Indel_end == iMatch_pos -1 or iIndel_end_from_barcode_pos == iMatch_pos -1: # iMatch_pos is one base # original ver 495 | if iKbp_front_Indel_end - self.intInsertionWin <= iMatch_pos - 1 <= iKbp_front_Indel_end + self.intInsertionWin: # iMatch_pos is one base 496 | iInsert_count = 1 497 | lTarget_indel_result.append(str(iMatch_pos) + 'M' + str(iInsertion_pos) + 'I') 498 | 499 | elif self.strPamType == 'CPF1': 500 | if 
iKbp_front_Indel_end - self.intInsertionWin <= iMatch_pos - 1 <= iKbp_front_Indel_end + self.intInsertionWin or \ 501 | iIndel_end_from_barcode_pos - self.intInsertionWin <= iMatch_pos - 1 <= iIndel_end_from_barcode_pos + self.intInsertionWin: # iMatch_pos is one base 502 | iInsert_count = 1 503 | lTarget_indel_result.append(str(iMatch_pos) + 'M' + str(iInsertion_pos) + 'I') 504 | 505 | return iInsert_count 506 | 507 | def _TakeDeletionFromAlignment(self, lDeletion_in_read, iKbp_front_Indel_end, lTarget_indel_result, 508 | iIndel_end_from_barcode_pos, iDelete_count): 509 | 510 | """ 511 | del case 1 512 | ...............................NNNNNNNNNNNNNN....NNNNNNNNNNNNNNNNNNNNN**NNNAGCTT 513 | del case 2 514 | ...............................NNNNNNNNNNNNNN....NNNNNNNNNNNNNNNNNNNNN**NNNNNCTT 515 | """ 516 | for iMatch_pos, iDeletion_pos in lDeletion_in_read: 517 | """ 518 | Insertion: 30M3I 519 | ^ 520 | ACGT---ACGT 521 | ACGTTTTACGT -> check this seq 522 | Insertion just check two position 523 | 524 | Deletion: 30M3D 525 | ^ 526 | ACGTTTTACGT 527 | ACGT---ACGT -> check this seq 528 | But deletion has to includes overlap deletion. 
529 | """ 530 | if self.strPamType == 'CAS9': 531 | if (iMatch_pos - self.intDeletionWin - 1 <= iKbp_front_Indel_end and iKbp_front_Indel_end < (iMatch_pos + iDeletion_pos + self.intDeletionWin - 1)): 532 | iDelete_count = 1 533 | lTarget_indel_result.append(str(iMatch_pos) + 'M' + str(iDeletion_pos) + 'D') 534 | elif self.strPamType == 'CPF1': 535 | if (iMatch_pos - self.intDeletionWin - 1 <= iKbp_front_Indel_end and iKbp_front_Indel_end < (iMatch_pos + iDeletion_pos + self.intDeletionWin - 1)) or \ 536 | (iMatch_pos - self.intDeletionWin - 1 <= iIndel_end_from_barcode_pos and iIndel_end_from_barcode_pos < (iMatch_pos + iDeletion_pos + self.intDeletionWin - 1)): 537 | iDelete_count = 1 538 | lTarget_indel_result.append(str(iMatch_pos) + 'M' + str(iDeletion_pos) + 'D') 539 | 540 | return iDelete_count 541 | 542 | def _MakeAndStoreQuality(self, sName, sSeq, lQual, dResult, sBarcode): 543 | listResultFASTQ = [sName, sSeq, '+', ''.join(chr(i + 33) for i in lQual)] 544 | dResult[sBarcode][self.intTotalFastq].append(listResultFASTQ) 545 | return listResultFASTQ 546 | 547 | def _FixPos(self, lTarget_indel_result, iTarget_start_from_barcode): 548 | 549 | lTrimmed_target_indel_result = [] 550 | 551 | for sINDEL in lTarget_indel_result: 552 | # B - A is not included B position, so +1 553 | iMatch_target_start = int(sINDEL.split('M')[0]) - iTarget_start_from_barcode 554 | """ This part determines a deletion range. 555 | ^ current match pos 556 | AGCTACGATCAGCATCTGACTTACTTC[barcode] 557 | 558 | 559 | ^ fix the match start at here. (target region) 560 | AGCTACGATCAGCATC TGACTTACTTC[barcode] 561 | 562 | if iMatch_target_start < 0: 563 | sContinue = 1 564 | 565 | But, this method has some problems. 566 | 567 | ^ barcode start 568 | AGCTACGATCAGCAT*********C[barcode] 569 | Like this pattern doesn't seleted. because, deletion checking is begun the target region start position. 570 | Thus, I have fixed this problem. 
571 | """ 572 | 573 | if iMatch_target_start <= -(iTarget_start_from_barcode): 574 | # print(iMatch_target_start, iTarget_start_from_barcode) 575 | continue 576 | 577 | lTrimmed_target_indel_result.append(str(iMatch_target_start) + 'M' + sINDEL.split('M')[1]) 578 | # filter end 579 | return lTrimmed_target_indel_result 580 | 581 | def _StoreToDictResult(self, sRef_seq_after_barcode, sQuery_seq_after_barcode, iTarget_start_from_barcode, 582 | dResult, sBarcode, lTrimmed_target_indel_result, sTarget_region, sRef_needle_ori, sQuery_needle_ori, 583 | iInsert_count, iDelete_count, iComplex_count, listResultFASTQ): 584 | 585 | sRef_seq_after_barcode = sRef_seq_after_barcode[iTarget_start_from_barcode:] 586 | sQuery_seq_after_barcode = sQuery_seq_after_barcode[iTarget_start_from_barcode:] 587 | 588 | dResult[sBarcode][self.intIndelInfo].append([sRef_seq_after_barcode, sQuery_seq_after_barcode, lTrimmed_target_indel_result, 589 | sTarget_region, sRef_needle_ori, sQuery_needle_ori]) 590 | if iInsert_count: 591 | dResult[sBarcode][self.intInsFastq].append(listResultFASTQ) 592 | elif iDelete_count: 593 | dResult[sBarcode][self.intDelFastq].append(listResultFASTQ) 594 | elif iComplex_count: 595 | dResult[sBarcode][self.intComFastq].append(listResultFASTQ) 596 | 597 | return (sRef_seq_after_barcode, sQuery_seq_after_barcode) 598 | 599 | def CalculateIndelFrequency(self, dResult): 600 | dResult_INDEL_freq = {} 601 | 602 | for sBarcode, lValue in dResult.items(): # lValue[gINDEL_info] : [[sRef_seq_after_barcode, sQuery_seq_after_barcode, lTarget_indel_result, sTarget_region], ..]) 603 | sRef_seq_loop = '' 604 | llINDEL_store = [] # ['ACAGACAGA', ['20M2I', '23M3D']] 605 | dINDEL_freq = {} 606 | 607 | if lValue[self.intIndelInfo]: 608 | for sRef_seq_loop, sQuery_seq, lINDEL, sTarget_region, sRef_needle, sQuery_needle in lValue[self.intIndelInfo]: # llINDEL : [['20M2I', '23M3D'], ...] 
class clsOutputMaker(object):
    """Writes the search result of one FASTQ chunk to a pickle file."""

    def __init__(self, InstParameter):
        # Only the output directory and the forward FASTQ path are needed.
        self.strOutputdir = InstParameter.strOutputdir
        self.strForwardFqPath = InstParameter.strForwardFqPath

    def MakePickleOutput(self, dictResult, dictResultIndelFreq, strBarcodePamPos=''):
        """Dump both result tables and the barcode/PAM orientation.

        The file is '<output dir>/Tmp/Pickle/<FASTQ file name>.pickle';
        the directory has to exist already.
        """
        strFastqName = os.path.basename(self.strForwardFqPath)
        strPicklePath = '{outdir}/Tmp/Pickle/{fq}.pickle'.format(outdir=self.strOutputdir, fq=strFastqName)

        dictOutput = {}
        dictOutput['dictResult'] = dictResult
        dictOutput['dictResultIndelFreq'] = dictResultIndelFreq
        dictOutput['strBarcodePamPos'] = strBarcodePamPos

        with open(strPicklePath, 'wb') as PickleFile:
            pickle.dump(dictOutput, PickleFile)
def Main():
    """Run the indel search for one FASTQ chunk and pickle the result.

    Steps: read the parameters, set up logging (log file plus stdout),
    load the reads, build the reference table, search the reads for
    indels, compute the indel frequencies and write the pickle output.
    Only the PAM types 'CPF1' and 'CAS9' trigger a search; any other value
    just logs program start and end.
    """

    InstParameter = clsParameter()
    logging.basicConfig(format='%(process)d %(levelname)s %(asctime)s : %(message)s',
                        level=logging.DEBUG,
                        filename=InstParameter.strLogPath,
                        filemode='a')

    # Mirror every log record to stdout in addition to the log file.
    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))

    logging.info('Program start : %s' % InstParameter.strForwardFqPath)

    logging.info('File Open')
    InstFileOpen = clsFastqOpener(InstParameter)
    listFastqForward = InstFileOpen.OpenFastqForward()
    # strPair is compared as the string 'True', not as a boolean.
    if InstParameter.strPair == 'True':
        listFastqReverse = InstFileOpen.OpenFastqReverse()

    InstIndelSearch = clsIndelSearchParser(InstParameter)

    InstOutput = clsOutputMaker(InstParameter)

    if InstParameter.strPamType == 'CPF1':
        logging.info('Search barcode INDEL pos')
        dRef, dResult = InstIndelSearch.SearchBarcodeIndelPosition(InstParameter.strBarcodePamPos) # ref check.

        # NOTE(review): unlike the CAS9 branch, strBarcodePamPos is passed neither
        # to SearchIndel nor to MakePickleOutput here - confirm this is intended.
        logging.info('Search INDEL forward')
        dResultForward = InstIndelSearch.SearchIndel(listFastqForward, dRef, dResult)

        if InstParameter.strPair == 'True':
            # The reverse reads are accumulated into the table filled by the forward reads.
            logging.info('Search INDEL reverse')
            dResultReverse = InstIndelSearch.SearchIndel(listFastqReverse, dRef, dResultForward)

            logging.info('Calculate INDEL frequency')
            dictResultIndelFreq = InstIndelSearch.CalculateIndelFrequency(dResultReverse)

            logging.info('Make pickle output forward')
            InstOutput.MakePickleOutput(dResultReverse, dictResultIndelFreq)

        else:
            logging.info('Calculate INDEL frequency')
            dictResultIndelFreq = InstIndelSearch.CalculateIndelFrequency(dResultForward)

            logging.info('Make pickle output forward')
            InstOutput.MakePickleOutput(dResultForward, dictResultIndelFreq)

    elif InstParameter.strPamType == 'CAS9':
        # Only the forward reads are searched in this branch.
        logging.info('Search barcode INDEL pos')
        dRef, dResult = InstIndelSearch.SearchBarcodeIndelPosition(InstParameter.strBarcodePamPos)
        logging.info('Search INDEL')
        dResult_forward = InstIndelSearch.SearchIndel(listFastqForward, dRef, dResult, InstParameter.strBarcodePamPos)
        logging.info('Calculate INDEL frequency')
        dResult_INDEL_freq = InstIndelSearch.CalculateIndelFrequency(dResult_forward)

        logging.info('Make pickle output forward')
        InstOutput.MakePickleOutput(dResult_forward, dResult_INDEL_freq, InstParameter.strBarcodePamPos)

    logging.info('Program end : %s' % InstParameter.strForwardFqPath)
#END:def


if __name__ == '__main__':
    Main()
#!/bin/bash

# Create the Input / User / Output folder skeleton for one user and project.
# Modify the user name and project name before running.

user=JaeWoo
project=JaeWoo_test_samples

# 'mkdir -p' creates every missing parent and succeeds when the folder
# already exists. The former '[ ! -d X ] && { `mkdir X`; }' form ran mkdir
# inside a command substitution, i.e. it tried to execute mkdir's output.
mkdir -p "./Input/${user}/FASTQ/${project}"
mkdir -p "./Input/${user}/Reference/${project}"
mkdir -p "./User/${user}"
mkdir -p ./Output

# Create the project list file; as before, an existing file is emptied.
: > "./User/${user}/${project}.txt"
#!/bin/bash

####################
## User parameter ##
###################################

user=JaeWoo
project=JaeWoo_test_samples
pam_type=Cas9
pam_pos=Forward
thread=15

# NOTE(review): gap_open / gap_extend are defined here but are not passed
# to Run_indel_searcher.py below - confirm the intended option names.
gap_open=-10 ## default
gap_extend=1 ## default

###################################

# Take the interpreter from the last non-empty line of PythonPath.txt.
# '|| [ -n "$python_path" ]' keeps a final line that lacks a trailing
# newline; a plain 'while read' silently drops it.
while read -r python_path || [ -n "$python_path" ]; do
    [ -n "$python_path" ] && python=$python_path
done < ../PythonPath.txt

# Creates ./Output/<user>/<project>/Log and every missing parent folder.
mkdir -p "./Output/${user}/${project}/Log"

nohup $python ./Run_indel_searcher.py --python $python --user $user --project $project --pam_type $pam_type --pam_pos $pam_pos -t $thread > ./Output/${user}/${project}/Log/log.txt 2>&1 &
datetime 8 | from optparse import OptionParser 9 | 10 | sys.path.insert(0, os.path.dirname(os.getcwd())) 11 | from Core.CoreSystem import InitialFolder, UserFolderAdmin, Helper, RunMulticore, CheckProcessedFiles 12 | 13 | 14 | class clsIndelSearcherRunner(UserFolderAdmin): 15 | 16 | """ 17 | self.strOutputDir is inherited variable. 18 | 19 | """ 20 | def __init__(self, strSample, strRef, options, InstInitFolder): 21 | UserFolderAdmin.__init__(self, strSample, strRef, options, InstInitFolder.strLogPath) 22 | self.MakeSampleFolder() 23 | 24 | self.strProjectFile = InstInitFolder.strProjectFile 25 | self.intChunkSize = options.chunk_number 26 | self.strQualCutoff = options.base_quality 27 | self.intInsertionWin = options.insertion_window # Insertion window 0,1,2,3,4 28 | self.intDeletionWin = options.deletion_window # Deletion window 0,1,2,3,4 29 | self.strPamType = options.pam_type # CRISPR type : Cpf1(2 cleavages), Cas9(1 cleavage) 30 | self.strPamPos = options.pam_pos # Barcode target position : Forward (barcode + target), Reverse (target + barcode) 31 | self.strPickle = options.pickle 32 | self.strClassFASTQ = options.class_fastq 33 | self.strSplit = options.split 34 | self.strLogPath = InstInitFolder.strLogPath 35 | 36 | self.strBarcodeFile = os.path.join(self.strRefDir, 'Barcode.txt') 37 | self.strReferenceSeqFile = os.path.join(self.strRefDir, 'Reference_sequence.txt') 38 | self.strTargetSeqFile = os.path.join(self.strRefDir, 'Target_region.txt') 39 | self.strRefFile = os.path.join(self.strRefDir, 'Reference.fa') 40 | 41 | ## The file name required for the user is 'B'arcode.txt but it may be written as 'b'arcode.txt by mistake. 42 | ## This part is to fix the situation as mentioned above. 
43 | if not os.path.isfile(self.strBarcodeFile): 44 | if os.path.isfile(self.strRefDir + 'barcode.txt'): 45 | self.strBarcodeFile = self.strRefDir + 'barcode.txt' 46 | else: 47 | logging.error('Barcode path is not correct, please make sure the path correctly.') 48 | if not os.path.isfile(self.strReferenceSeqFile): 49 | if os.path.isfile(self.strRefDir + 'reference_sequence.txt'): 50 | self.strReferenceSeqFile = self.strRefDir + 'reference_sequence.txt' 51 | else: 52 | logging.error('Reference path is not correct, please make sure the path correctly.') 53 | if not os.path.isfile(self.strTargetSeqFile): 54 | if os.path.isfile(self.strRefDir + 'target_region.txt'): 55 | self.strTargetSeqFile = self.strRefDir + 'target_region.txt' 56 | else: 57 | logging.error('Target path is not correct, please make sure the path correctly.') 58 | 59 | 60 | self.strFastqDir = './Input/{user}/FASTQ/{project}'.format(user=self.strUser, 61 | project=self.strProject) 62 | ## './Input/JaeWoo/FASTQ/Test_samples/Sample_1' 63 | self.strSampleDir = os.path.join(self.strFastqDir, self.strSample) 64 | 65 | self.strFastq_name = '' 66 | for strFile in os.listdir(self.strSampleDir): 67 | if os.path.isfile(self.strSampleDir + '/' + strFile) and strFile.split('.')[-1] == 'fastq': 68 | self.strFastq_name = '.'.join(strFile.split('.')[:-1]) 69 | logging.info('File name : %s' % self.strFastq_name) 70 | 71 | ## './Input/JaeWoo/FASTQ/Test_samples/Sample_1/Fastq_file.fastq' 72 | self.strInputFile = os.path.join(self.strSampleDir, self.strFastq_name+'.fastq') 73 | ## './Input/JaeWoo/FASTQ/Test_samples/Sample_1/Fastq_file.txt' 74 | self.strInputList = os.path.join(self.strSampleDir, self.strFastq_name+'.txt') 75 | 76 | ## './Input/JaeWoo/FASTQ/Test_samples/Sample_1/Split_files' 77 | self.strSplitPath = os.path.join(self.strSampleDir, 'Split_files') 78 | Helper.MakeFolderIfNot(self.strSplitPath) 79 | 80 | self.strPair = 'False' # FASTQ pair: True, False 81 | 82 | def SplitFile(self): 83 | 84 | ### Defensive : 
original fastq wc == split fastq wc 85 | #intTotalLines = len(open(self.strInputFile).readlines()) 86 | intTotalLines = int(sp.check_output('wc -l {input_file}'.format(input_file=self.strInputFile), shell=True).split()[0]) 87 | intSplitNum = int(math.ceil(intTotalLines/float(self.intChunkSize))) ## e.g. 15.4 -> 16 88 | 89 | if intSplitNum == 0: intSplitNum = 1 90 | logging.info('Total lines:%s, Chunk size:%s, Split number:%s' % (intTotalLines, self.intChunkSize, intSplitNum)) 91 | 92 | with open(self.strInputFile) as fq, \ 93 | open(self.strInputList, 'w') as OutList: 94 | 95 | for intNum in range(1, intSplitNum + 1): 96 | 97 | strSplitFile = self.strSplitPath + '/{sample}_{num}.fq'.format(sample=os.path.basename(self.strInputFile), 98 | num=intNum) 99 | with open(strSplitFile, 'w') as out: 100 | OutList.write(os.path.basename(strSplitFile) + '\n') 101 | intCount = 0 102 | 103 | for strRow in fq: 104 | intCount += 1 105 | out.write(strRow) 106 | 107 | if intCount == self.intChunkSize: 108 | break 109 | 110 | ## defensive 111 | #strOriginal = sp.check_output('wc -l {input_file}'.format(input_file=self.strInputFile), shell=True) 112 | strSplited = sp.check_output('cat {splited}/*.fq | wc -l'.format(splited=self.strSplitPath), shell=True) 113 | #strOrigianlWc = strOriginal.split()[0] 114 | intSplitedWc = int(strSplited.replace('\n','')) 115 | 116 | if intTotalLines != intSplitedWc: 117 | logging.error('The number of total lines of splited file is not corresponded to origial fastq.') 118 | logging.error('Original FASTQ line number : %s, Splited FASTQ line number : %s' % (intTotalLines, strSplited)) 119 | sys.exit(1) 120 | 121 | def MakeReference(self): 122 | 123 | if not os.path.isfile(self.strRefFile): 124 | with open(self.strBarcodeFile) as Barcode, \ 125 | open(self.strTargetSeqFile) as Target, \ 126 | open(self.strReferenceSeqFile) as Ref, \ 127 | open(self.strRefFile, 'w') as Output: 128 | 129 | listBarcode = Helper.RemoveNullAndBadKeyword(Barcode) 130 | 
listTarget = Helper.RemoveNullAndBadKeyword(Target) 131 | listRef = Helper.RemoveNullAndBadKeyword(Ref) 132 | 133 | ## defensive 134 | assert len(listBarcode) == len(listTarget) == len(listRef), 'Barcode, Target and Reference must be a same row number.' 135 | 136 | listName = [] 137 | for strBar, strTar in zip(listBarcode, listTarget): 138 | strBar = strBar.replace('\n', '').replace('\r', '').strip().upper() 139 | strTar = strTar.replace('\n', '').replace('\r', '').strip().upper() 140 | 141 | Helper.CheckIntegrity(self.strBarcodeFile, strBar) ## defensive 142 | Helper.CheckIntegrity(self.strBarcodeFile, strTar) ## defensive 143 | 144 | listName.append(strBar + ':' + strTar + '\n') 145 | 146 | for i, strRow in enumerate(listRef): 147 | strRow = strRow.replace('\r', '').strip().upper() 148 | Output.write('>' + listName[i] + strRow + '\n') 149 | 150 | def MakeIndelSearcherCmd(self): 151 | 152 | listCmd = [] 153 | strReverse = 'None' 154 | 155 | with open(self.strInputList) as Input: 156 | for strFile in Input: 157 | listFile = strFile.replace('\n', '').split(' ') 158 | strForward = self.strSplitPath + '/' + listFile[0] 159 | 160 | #if self.strPair == 'True': 161 | # strReverse = self.strSplitPath + '/' + listFile[1] 162 | 163 | listCmd.append(('{python} Indel_searcher_crispresso_hash.py {forw} {reve} {ref} {pair} {GapO} {GapE}' 164 | ' {Insertion_win} {Deletion_win} {PAM_type} {PAM_pos} {Qual} {outdir} {logpath}').format( 165 | python=self.strPython, 166 | forw=strForward, reve=strReverse, ref=self.strRefFile, pair=self.strPair, 167 | GapO=self.strGapOpen, GapE=self.strGapExtend, 168 | Insertion_win=self.intInsertionWin, Deletion_win=self.intDeletionWin, 169 | PAM_type=self.strPamType, PAM_pos=self.strPamPos, Qual=self.strQualCutoff, 170 | outdir=self.strOutSampleDir, logpath=self.strLogPath)) 171 | return listCmd 172 | 173 | def RunIndelFreqCalculator(self): 174 | sp.call('{python} Indel_frequency_calculator.py {outdir} {sample} 
def RunIndelFreqCalculator(self):
    """Run the per-sample post-processing scripts and collect the result tables.

    Calls, in order, ``Indel_frequency_calculator.py`` and
    ``Summary_all_trim.py`` with (output dir, sample, log path), then copies
    every ``*.tsv`` found under the project output folder into its
    ``All_results`` sub-folder.  Exit codes of the child processes are not
    checked.
    """
    for strScript in ['Indel_frequency_calculator.py', 'Summary_all_trim.py']:
        sp.call('{python} {script} {outdir} {sample} {logpath}'.format(python=self.strPython,
                                                                       script=strScript,
                                                                       outdir=self.strOutSampleDir,
                                                                       sample=self.strSample,
                                                                       logpath=self.strLogPath), shell=True)

    sp.call('cp $(find ./Output/{user}/{project} -name "*.tsv") ./Output/{user}/{project}/All_results'.format(user=self.strUser,
                                                                                                                project=self.strProject), shell=True)


def IndelNormalization(self):
    """Run ``Indel_normalization.py`` for the project (project file, user, project)."""
    sp.call('{python} Indel_normalization.py {project_file} {user} {project}'.format(python=self.strPython,
                                                                                     project_file=self.strProjectFile,
                                                                                     user=self.strUser,
                                                                                     project=self.strProject), shell=True)


def MakeOutput(self):
    """Merge the per-chunk pickles of one sample into its ``Tmp`` text outputs.

    Every file in ``<outdir>/Tmp/Pickle`` is a dict with the keys

    dictResult
        {'TTTGTAGTCATACATCGCAATGTCAA': [0, 0, 0, 0, [], [], [], [], []]}
        -> [total, insertion, deletion, complex counts,
            total / insertion / deletion / complex FASTQ records, indel info]
    dictResultIndelFreq
        {barcode: [[ref_seq, [query, ...], indel, freq, target_region,
                    [ref_needle, ...], [query_needle, ...]], ...]}
    strBarcodePamPos
        'Forward' or 'Reverse'; for 'Reverse' the barcode, query and needle
        strings are written reversed.

    Written files: ``<sample>_Summary.txt``,
    ``<sample>_Classified_Indel_barcode.fastq`` (only filled when
    ``self.strClassFASTQ == 'True'``) and ``<sample>_Indel_freq.txt``.

    Fixes over the previous version:
    * the indel-frequency loop reversed the barcode and then used the
      reversed string as the dictionary key, raising KeyError for every
      non-palindromic barcode of a 'Reverse' run;
    * split files were only removed when the pickles were kept (``elif``);
      the two clean-ups are now independent, matching the ``--pickle`` and
      ``--split`` options;
    * the clean-up no longer goes through ``rm`` with unquoted shell paths.
    """
    # index name, constant variable.
    intTotal = 0
    intNumIns = 1
    intNumDel = 2
    intNumCom = 3
    intTotalFastq = 4
    intInsFastq = 5
    intDelFastq = 6
    intComFastq = 7

    listFastqCategory = [('total', intTotalFastq), ('insertion', intInsFastq),
                         ('deletion', intDelFastq), ('complex', intComFastq)]

    strTmpDir = '{outdir}/Tmp'.format(outdir=self.strOutSampleDir)
    strPickleDir = '{outdir}/Tmp/Pickle'.format(outdir=self.strOutSampleDir)

    with open('{tmp}/{sample}_Summary.txt'.format(tmp=strTmpDir, sample=self.strSample), 'w') as Summary, \
            open('{tmp}/{sample}_Classified_Indel_barcode.fastq'.format(tmp=strTmpDir, sample=self.strSample), 'w') as FastqOut, \
            open('{tmp}/{sample}_Indel_freq.txt'.format(tmp=strTmpDir, sample=self.strSample), 'w') as FreqOut:

        for binPickle in os.listdir(strPickleDir):
            with open('{pickle_dir}/{pickle}'.format(pickle_dir=strPickleDir, pickle=binPickle), 'rb') as PickleResult:
                ## NOTE: pickle.load executes whatever the file encodes; only
                ## pickles written by this pipeline belong in this folder.
                dictPickleResult = pickle.load(PickleResult)

            dictResult = dictPickleResult['dictResult']
            dictResultIndelFreq = dictPickleResult['dictResultIndelFreq']
            bReverse = dictPickleResult['strBarcodePamPos'] == 'Reverse'

            for strBarcode, listValue in dictResult.items():
                if bReverse:
                    strBarcode = strBarcode[::-1]

                Summary.write("{Bar}\t{NumTot}\t{NumIns}\t{NumDel}\t{NumCom}\n".format(
                    Bar=strBarcode, NumTot=listValue[intTotal], NumIns=listValue[intNumIns],
                    NumDel=listValue[intNumDel], NumCom=listValue[intNumCom]))

                if self.strClassFASTQ == 'True':
                    for strJudge, intFastqKind in listFastqCategory:
                        for listFastq in listValue[intFastqKind]:  ## category
                            ## tag the read header with its barcode and class
                            listFastqAddClass = [listFastq[0] + ':Barcode_%s:%s' % (strBarcode, strJudge)]
                            FastqOut.write('\n'.join(listFastqAddClass + listFastq[1:]) + '\n')

            ## Look the records up with the original key; reverse only the
            ## copy that is written out.
            for strBarcodeKey, listIndelInfo in dictResultIndelFreq.items():
                strBarcode = strBarcodeKey[::-1] if bReverse else strBarcodeKey

                for strRefSeq, listQuery, strINDEL, floFreq, strTargetRegion, listRefNeedle, listQueryNeedle in \
                        sorted(listIndelInfo, key=lambda x: x[3], reverse=True):
                    for strQuery, strRefNeedle, strQueryNeedle in zip(listQuery, listRefNeedle, listQueryNeedle):

                        if bReverse:
                            strQuery = strQuery[::-1]
                            strRefNeedle = strRefNeedle[::-1]
                            strQueryNeedle = strQueryNeedle[::-1]

                        FreqOut.write('\t'.join([strBarcode, strQuery, strINDEL, str(round(floFreq, 4)),
                                                 strRefNeedle, strQueryNeedle]) + '\n')

    if self.strPickle == 'False':
        logging.info('Delete tmp pickles')
        for strName in os.listdir(strPickleDir):
            if strName.endswith('.pickle') and not strName.startswith('.'):
                os.remove(os.path.join(strPickleDir, strName))

    if self.strSplit == 'False':
        logging.info('Delete splited input files')
        for strName in os.listdir(self.strSplitPath):
            if strName.endswith('.fq') and not strName.startswith('.'):
                os.remove(os.path.join(self.strSplitPath, strName))
'Reverse': 242 | strBarcode = strBarcode[::-1] 243 | 244 | for strRefSeq, listQuery, strINDEL, floFreq, strTargetRegion, listRefNeedle, listQueryNeedle in sorted(dictResultIndelFreq[strBarcode], key=lambda x: x[3], reverse=True): 245 | for strQuery, strRefNeedle, strQueryNeedle in zip(listQuery, listRefNeedle, listQueryNeedle): 246 | 247 | if strBarcodePamPos == 'Reverse': 248 | strQuery = strQuery[::-1] 249 | strRefNeedle = strRefNeedle[::-1] 250 | strQueryNeedle = strQueryNeedle[::-1] 251 | 252 | FreqOut.write('\t'.join([strBarcode, strQuery, strINDEL, str(round(floFreq, 4)), strRefNeedle, strQueryNeedle])+'\n') 253 | #END:for 254 | #END:with 255 | #END:for 256 | 257 | if self.strPickle == 'False': 258 | logging.info('Delete tmp pickles') 259 | sp.call('rm {outdir}/Tmp/Pickle/*.pickle'.format(outdir=self.strOutSampleDir), shell=True) 260 | 261 | elif self.strSplit == 'False': 262 | logging.info('Delete splited input files') 263 | sp.call('rm {split_path}/*.fq'.format(split_path=self.strSplitPath), shell=True) 264 | 265 | #END:with 266 | #END:def 267 | #END:cls 268 | 269 | 270 | def Main(): 271 | parser = OptionParser('Indel search program for CRISPR CAS9 & CPF1\n python2.7 Run_indel_searcher.py --pam_type Cas9 --pam_pos Forward') 272 | 273 | parser.add_option('-t', '--thread', default='1', type='int', dest='multicore', help='multiprocessing number, recommendation:t<16') 274 | parser.add_option('-c', '--chunk_number', default='400000', type='int', dest='chunk_number', 275 | help='split FASTQ, must be multiples of 4. 
def Main():
    """Command-line entry point of the indel searcher.

    Parses the options, prepares the user/project folders through
    ``InitialFolder``, configures logging (log file plus stdout) and then, for
    every sample listed in the project file, runs: SplitFile -> MakeReference
    -> MakeIndelSearcherCmd -> RunMulticore -> MakeOutput ->
    RunIndelFreqCalculator.  When the samples carry both the EXP and the CTRL
    group, the indel normalisation is run afterwards.

    ``--user`` and ``--project`` are now checked right after parsing.  Without
    them the run silently went on with ``./Input/None/FASTQ/None`` style paths;
    it now stops with the usual optparse usage error (exit status 2).

    Raises:
        Exception: if the group column holds anything but EXP / CTRL / empty.
    """
    parser = OptionParser('Indel search program for CRISPR CAS9 & CPF1\n python2.7 Run_indel_searcher.py --pam_type Cas9 --pam_pos Forward')

    parser.add_option('-t', '--thread', default='1', type='int', dest='multicore', help='multiprocessing number, recommendation:t<16')
    parser.add_option('-c', '--chunk_number', default='400000', type='int', dest='chunk_number',
                      help='split FASTQ, must be multiples of 4. file size < 1G recommendation:40000, size > 1G recommendation:400000')
    parser.add_option('-q', '--base_quality', default='20', dest='base_quality', help='NGS read base quality')
    parser.add_option('--gap_open', default='-10', type='float', dest='gap_open', help='gap open: -100~0')
    parser.add_option('--gap_extend', default='1', type='float', dest='gap_extend', help='gap extend: 1~100')
    parser.add_option('-i', '--insertion_window', default='4', type='int', dest='insertion_window', help='a window size for insertions')
    parser.add_option('-d', '--deletion_window', default='4', type='int', dest='deletion_window', help='a window size for deletions')
    parser.add_option('--pam_type', dest='pam_type', help='PAM type: Cas9 Cpf1')
    parser.add_option('--pam_pos', dest='pam_pos', help='PAM position: Forward Reverse')
    parser.add_option('--python', dest='python', help='The python path including the CRISPResso2')
    parser.add_option('--user', dest='user_name', help='The user name with no space')
    parser.add_option('--project', dest='project_name', help='The project name with no space')
    parser.add_option('--pickle', dest='pickle', default='False', help='Dont remove the pickles in the tmp folder : True, False')
    parser.add_option('--split', dest='split', default='False', help='Dont remove the split files in the input folder : True, False')
    parser.add_option('--classfied_FASTQ', dest='class_fastq', default='True', help='Dont remove the ClassfiedFASTQ in the tmp folder : True, False')
    parser.add_option('--ednafull', dest='ednafull', help='The nucleotide alignment matrix')

    options, args = parser.parse_args()

    ## Every input/output path is derived from these two values.
    if not options.user_name or not options.project_name:
        parser.error('--user and --project are required')

    InstInitFolder = InitialFolder(options.user_name, options.project_name, os.path.basename(__file__))
    InstInitFolder.MakeDefaultFolder()
    InstInitFolder.MakeInputFolder()
    InstInitFolder.MakeOutputFolder()

    logging.basicConfig(format='%(process)d %(levelname)s %(asctime)s : %(message)s',
                        level=logging.DEBUG,
                        filename=InstInitFolder.strLogPath,
                        filemode='a')
    ## mirror the log file on stdout
    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))

    logging.info('Program start')
    if options.multicore > 15:
        logging.warning('Optimal treads <= 15')
    logging.info(str(options))

    with open(InstInitFolder.strProjectFile) as Sample_list:

        listSamples = Helper.RemoveNullAndBadKeyword(Sample_list)
        intProjectNumInTxt = len(listSamples)

        strInputProject = './Input/{user}/FASTQ/{project}'.format(user=options.user_name, project=options.project_name)

        @CheckProcessedFiles
        def RunPipeline(**kwargs):

            setGroup = set()
            for strSample in listSamples:

                tupSampleInfo = Helper.SplitSampleInfo(strSample)
                if not tupSampleInfo:
                    continue
                strSample, strRef, strExpCtrl = tupSampleInfo
                setGroup.add(strExpCtrl)

                InstRunner = clsIndelSearcherRunner(strSample, strRef, options, InstInitFolder)

                logging.info('SplitFile')
                InstRunner.SplitFile()
                logging.info('MakeReference')
                InstRunner.MakeReference()
                logging.info('MakeIndelSearcherCmd')
                listCmd = InstRunner.MakeIndelSearcherCmd()
                logging.info('RunMulticore')
                RunMulticore(listCmd, options.multicore)  ## from CoreSystem.py
                logging.info('MakeOutput')
                InstRunner.MakeOutput()
                logging.info('RunIndelFreqCalculator')
                InstRunner.RunIndelFreqCalculator()

            if setGroup == {'EXP', 'CTRL'}:
                ## setGroup is non-empty here, so InstRunner is bound to the
                ## runner of the last processed sample.
                InstRunner.IndelNormalization()
            elif setGroup in [set(), set(['']), set([' '])]:
                pass
            else:
                logging.error('The group category is not appropriate. : %s' % setGroup)
                logging.error('Please make sure your project file is correct.')
                logging.error('The group category must be Exp or Ctrl')
                raise Exception('The group category is not appropriate. : %s' % setGroup)

        RunPipeline(InstInitFolder=InstInitFolder,
                    strInputProject=strInputProject,
                    intProjectNumInTxt=intProjectNumInTxt,
                    listSamples=listSamples,
                    logging=logging)

    logging.info('Program end')
: %s' % setGroup) 349 | logging.error('Please make sure your project file is correct.') 350 | logging.error('The group category must be Exp or Ctrl') 351 | raise Exception 352 | #""" 353 | 354 | RunPipeline(InstInitFolder=InstInitFolder, 355 | strInputProject=strInputProject, 356 | intProjectNumInTxt=intProjectNumInTxt, 357 | listSamples=listSamples, 358 | logging=logging) 359 | 360 | logging.info('Program end') 361 | #END:def 362 | 363 | 364 | if __name__ == '__main__': 365 | Main() 366 | -------------------------------------------------------------------------------- /Indel_searcher_2/Run_random_barcode.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | #################### 5 | ## User parameter ## 6 | #################################### 7 | 8 | user=SH 9 | project=p53_screening 10 | thread=2 11 | 12 | 13 | #################################### 14 | 15 | 16 | 17 | 18 | 19 | while read python_path;do 20 | python=$python_path 21 | done < ../PythonPath.txt 22 | 23 | 24 | nohup $python ./Summary_Random_barcode.py -u $user -p $project -t $thread > ./Output/${user}/${project}/Log/Random_barcode_log.txt 2>&1 & 25 | -------------------------------------------------------------------------------- /Indel_searcher_2/Summary_Random_barcode.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import logging 3 | import multiprocessing as mp 4 | 5 | from argparse import ArgumentParser 6 | from collections import OrderedDict 7 | 8 | sys.path.insert(0, os.path.dirname(os.getcwd())) 9 | from Core.CoreSystem import Helper 10 | 11 | 12 | class clsParameters(): 13 | 14 | def __init__(self, options): 15 | self.strUser = options.user_name 16 | self.strProject = options.project_name.replace('.txt', '') ## A user can be confused the input. So I prevented from it using 'replace'. 
class clsParameters():
    """Run parameters taken from the parsed command-line options."""

    def __init__(self, options):
        self.strUser = options.user_name
        ## A user may pass the project file name ('Project.txt') instead of the
        ## project name, so the extension is dropped.
        self.strProject = options.project_name.replace('.txt', '')
        self.strGroup = options.group
        self.intCore = options.thread

        ## Built from the cleaned name: with the raw option, 'Project.txt'
        ## used to end up as 'User/<user>/Project.txt.txt'.
        self.strSampleList = 'User/{user}/{project}.txt'.format(user=self.strUser, project=self.strProject)


def SummaryRandomBarcode(sFile_path, intRandomBarcodeLen=8):

    """
    Count the random barcodes that precede each sorting barcode in the
    classified FASTQ of one sample.

    /Tmp
    190819_Nahye_24k_2_D0_2-24kLib_Classified_Indel_barcode.fastq* -> process target
    190819_Nahye_24k_2_D0_2-24kLib_Indel_freq.txt*
    190819_Nahye_24k_2_D0_2-24kLib_Indel_summary.txt*
    190819_Nahye_24k_2_D0_2-24kLib_Summary.txt*
    Pickle

    dictBarcodeCnt = {'ACGTACTC_sorting_barcode': {'ACATACAC_random': 5, 'CGTGTTGA_random': 3, ...}

    Only reads whose header class is 'total' are used.  The random barcode is
    the intRandomBarcodeLen bases directly upstream of the sorting barcode in
    the read.  Reads in which the sorting barcode is absent, or has fewer than
    intRandomBarcodeLen bases in front of it, are skipped; before, such reads
    were counted under a wrong or empty random barcode, because str.find()
    returning -1 (or a small offset) produced a negative slice start.

    Writes <sample>_all_random_barcode.txt and
    <sample>_Unique_RandomBarcodeNumber_In_SortingBarcode.txt into
    <sFile_path>/Summary_Random_barcode.
    """
    dictBarcodeCnt = {}
    strClassCheck = ''
    strBarcode = ''

    strSample = sFile_path.split('/')[-1]
    logging.info('Summary_random_barcode start : %s, %s' % (sFile_path, strSample))

    for sFile in os.listdir(sFile_path+'/Tmp/'):
        if '.fastq' not in sFile:
            continue

        with open(sFile_path+'/Tmp/'+sFile) as Input:
            for i, strRow in enumerate(Input):

                # @D00235:683:CE1P6ANXX:6:1114:2135:5231 1:N:0:CTGAAGCT+CCTATCCT:Barcode_TTTGCTATCTCGACGTATGGACAGTG:total
                if i % 4 == 0:
                    listBarClass = strRow.replace('\n','').split('Barcode_')[1].split(':')
                    strBarcode = listBarClass[0]
                    strClass = listBarClass[1]

                    if strClass == 'total':
                        strClassCheck = 'total'

                if i % 4 == 1 and strClassCheck == 'total':
                    strRow = strRow.replace('\n','').upper()
                    intBarcodeStart = strRow.find(strBarcode)

                    ## also rejects -1 (sorting barcode not in the read)
                    if intBarcodeStart >= intRandomBarcodeLen:
                        strRandom_barcode = strRow[intBarcodeStart-intRandomBarcodeLen:intBarcodeStart]
                        dictRandomCnt = dictBarcodeCnt.setdefault(strBarcode, {})
                        dictRandomCnt[strRandom_barcode] = dictRandomCnt.get(strRandom_barcode, 0) + 1

                    ## the flag only covers the sequence line of this record
                    strClassCheck = ''

    if not os.path.isdir(sFile_path + '/Summary_Random_barcode'): os.mkdir(sFile_path + '/Summary_Random_barcode')
    with open(sFile_path + '/Summary_Random_barcode/%s_all_random_barcode.txt' % strSample, 'w') as All_random,\
        open(sFile_path + '/Summary_Random_barcode/%s_Unique_RandomBarcodeNumber_In_SortingBarcode.txt' % strSample, 'w') as Random_sorting:

        All_random.write('Sorting_barcode\tUnique_RandomBarcodeNumber_In_SortingBarcode\tRandomBarcode\tEach_RandomBarcode_read_count\n')
        Random_sorting.write('Sorting_barcode\tUnique_RandomBarcodeNumber_In_SortingBarcode\n')

        for sBarcode, dRandom_barcode_cnt in dictBarcodeCnt.items():
            iRandom_barcode_num = len(dRandom_barcode_cnt)
            Random_sorting.write('\t'.join(map(str, [sBarcode, iRandom_barcode_num]))+'\n')

            for sRandom_barcode, iCnt in dRandom_barcode_cnt.items():
                All_random.write('\t'.join(map(str, [sBarcode, iRandom_barcode_num, sRandom_barcode, iCnt]))+'\n')

    logging.info('Summary_random_barcode end: %s' % sFile_path)
'/Summary_Random_barcode') 73 | with open(sFile_path + '/Summary_Random_barcode/%s_all_random_barcode.txt' % strSample, 'w') as All_random,\ 74 | open(sFile_path + '/Summary_Random_barcode/%s_Unique_RandomBarcodeNumber_In_SortingBarcode.txt' % strSample, 'w') as Random_sorting: 75 | 76 | All_random.write('Sorting_barcode\tUnique_RandomBarcodeNumber_In_SortingBarcode\tRandomBarcode\tEach_RandomBarcode_read_count\n') 77 | Random_sorting.write('Sorting_barcode\tUnique_RandomBarcodeNumber_In_SortingBarcode\n') 78 | 79 | for sBarcode, dRandom_barcode_cnt in dictBarcodeCnt.items(): 80 | iRandom_barcode_num = len(dRandom_barcode_cnt.keys()) 81 | Random_sorting.write('\t'.join(map(str, [sBarcode, iRandom_barcode_num]))+'\n') 82 | 83 | for sRandom_barcode, iCnt in dRandom_barcode_cnt.items(): 84 | All_random.write('\t'.join(map(str, [sBarcode, iRandom_barcode_num, sRandom_barcode, iCnt]))+'\n') 85 | 86 | logging.info('Summary_random_barcode end: %s' % sFile_path) 87 | 88 | ## on going 89 | def CountGroup(InstParameters): 90 | """ 91 | Sorting_barcode Unique_RandomBarcodeNumber_In_SortingBarcode RandomBarcode Each_RandomBarcode_read_count 92 | TATATCATAGCGTACTCATC 8 TGCGTTTG 3 93 | TATATCATAGCGTACTCATC 8 CGCGTTTG 3 94 | TATATCATAGCGTACTCATC 8 TAGTTTTG 1 95 | TATATCATAGCGTACTCATC 8 ATAGTTTG 1 96 | """ 97 | 98 | sHeader = '' 99 | 100 | with open(InstParameters.strSampleList) as Sample: ## tmp input 101 | 102 | listSample = Sample.readlines() 103 | 104 | setGroup = set([strRow.replace('\n', '').split('\t')[2].upper() for strRow in listSample]) 105 | 106 | for strGroup in setGroup: 107 | if strGroup == 'CTRL': continue 108 | 109 | for strRow in listSample: 110 | if strGroup == strGroupOfSample: ## matched group names -> Sum the counts 111 | listCol = strRow.replace('\n', '').split('\t') 112 | strSample = listCol[0] 113 | strRef = listCol[1] 114 | strGroupOfSample = listCol[2] 115 | 116 | strProjectDir = './Output/{user}/{project}'.format(user=InstParameters.strUser, 117 | 
## on going
def CountGroup(InstParameters):
    """
    Sum the random-barcode read counts of all samples that share a group and
    write one summary per group (the CTRL group is skipped).

    Input rows (one <sample>_all_random_barcode.txt per sample):

    Sorting_barcode Unique_RandomBarcodeNumber_In_SortingBarcode RandomBarcode Each_RandomBarcode_read_count
    TATATCATAGCGTACTCATC 8 TGCGTTTG 3
    TATATCATAGCGTACTCATC 8 CGCGTTTG 3
    TATATCATAGCGTACTCATC 8 TAGTTTTG 1
    TATATCATAGCGTACTCATC 8 ATAGTTTG 1

    Fixes over the previous draft:
    * the group of a sample was compared before it had been read from the
      row (NameError on the first sample);
    * the sample group is upper-cased like the group set it is compared with;
    * comment, blank and group-less rows of the sample list are skipped
      instead of raising IndexError;
    * counts are accumulated over every sample of a group and written once
      per group, as the in-code comments describe (NOTE(review): the nesting
      of the draft was not recoverable with certainty - confirm the intent);
    * the sorting barcode is taken from its own column, not from
      split('_')[0] of the combined key, which truncated names such as
      'GECKO_6367'.

    NOTE(review): the per-sample input path and the './Output/Group_result'
    output path are kept as in the draft.  SummaryRandomBarcode writes its
    tables to <sample dir>/Summary_Random_barcode/, so the paths probably
    still have to be aligned before this function is enabled.
    """
    strProjectDir = './Output/{user}/{project}'.format(user=InstParameters.strUser,
                                                       project=InstParameters.strProject)
    strGroupDir = os.path.join(strProjectDir, 'Group_result')

    listSampleGroup = []  ## [(sample name, GROUP), ...]
    with open(InstParameters.strSampleList) as Sample:  ## tmp input
        for strRow in Sample:
            strRow = strRow.replace('\n', '').replace('\r', '')
            if not strRow.strip() or strRow[0] == '#':
                continue
            listCol = strRow.split('\t')
            if len(listCol) < 3:  ## no group column
                continue
            listSampleGroup.append((listCol[0], listCol[2].upper()))

    setGroup = set([strGroupOfSample for strSample, strGroupOfSample in listSampleGroup])

    for strGroup in setGroup:
        if strGroup == 'CTRL': continue

        Helper.MakeFolderIfNot(strGroupDir)
        sHeader = ''

        ## key  : '<sorting barcode>_<random barcode>', unique per pair
        ## value: [sorting barcode, unique random barcode number, random barcode, read count]
        dTotal_RandomBarcode_cnt_in_SortingBarcode = OrderedDict()

        for strSample, strGroupOfSample in listSampleGroup:
            if strGroup != strGroupOfSample:
                continue  ## only matched group names are summed

            with open('{project_dir}/{sample}_all_random_barcode.txt'.format(project_dir=strProjectDir,
                                                                             sample=strSample)) as RandomBarcode_SeqFreq:
                sHeader = RandomBarcode_SeqFreq.readline()

                for sRow in RandomBarcode_SeqFreq:
                    lCol = sRow.replace('\n', '').split('\t')

                    sSorting_and_Random_barcode_seq = lCol[0] + '_' + lCol[2]  ## Unique name : Doench2014_1000_CTCTGGGGT
                    iRandomBarcode_count = int(lCol[3])
                    lCol[3] = iRandomBarcode_count

                    if sSorting_and_Random_barcode_seq in dTotal_RandomBarcode_cnt_in_SortingBarcode:
                        dTotal_RandomBarcode_cnt_in_SortingBarcode[sSorting_and_Random_barcode_seq][3] += iRandomBarcode_count
                    else:
                        dTotal_RandomBarcode_cnt_in_SortingBarcode[sSorting_and_Random_barcode_seq] = lCol  ## initial assignment

        ## regroup the merged rows per sorting barcode
        dRecal_total_kind_of_RandomBarcode = OrderedDict()
        for lCol in dTotal_RandomBarcode_cnt_in_SortingBarcode.values():
            dRecal_total_kind_of_RandomBarcode.setdefault(lCol[0], []).append(lCol)

        for sKey in list(dRecal_total_kind_of_RandomBarcode):
            ## sKey: TATATCATAGCGTACTCATC, llValue : [[TATATCATAGCGTACTCATC, 8, TGCGTTTG, 3], ...]
            llValue = dRecal_total_kind_of_RandomBarcode[sKey]
            iKind_of_RandomBarcode = len(llValue)
            for lValue in llValue:
                lValue[1] = iKind_of_RandomBarcode  ## Recal using group total cnt.

            dRecal_total_kind_of_RandomBarcode[sKey] = sorted(llValue, key=lambda x: x[3], reverse=True)

        strEachGroup = './Output/Group_result/%s' % strGroup
        Helper.MakeFolderIfNot(strEachGroup)

        with open(os.path.join(strEachGroup, 'Summary_all_random_barcode_in_group.txt'), 'w') as Sort_Random_cnt,\
            open(os.path.join(strEachGroup, 'Summary_Unique_RandomBarcodeNumber_in_group.txt'), 'w') as Uniq_random_cnt:

            Sort_Random_cnt.write(sHeader)
            Uniq_random_cnt.write('Sorting_barcode\tUnique_RandomBarcodeNumber_In_SortingBarcode\n')

            for sSortBarcode, llCol in dRecal_total_kind_of_RandomBarcode.items():
                Uniq_random_cnt.write('\t'.join(map(str, [sSortBarcode, len(llCol)]))+'\n')
                for lCol in llCol:
                    Sort_Random_cnt.write('\t'.join(map(str, lCol))+'\n')
156 | iKind_of_RandomBarcode = len(llValue) ################## why do I make like this ????? 157 | for lValue in llValue: 158 | lValue[1] = iKind_of_RandomBarcode ## Recal using group total cnt. 159 | 160 | llValue = sorted(llValue, key=lambda x:x[3], reverse=True) 161 | dRecal_total_kind_of_RandomBarcode[sKey] = llValue 162 | 163 | strEachGroup = './Output/Group_result/%s' % strGroup 164 | Helper.MakeFolderIfNot(strEachGroup) 165 | 166 | with open(os.path.join(strEachGroup, 'Summary_all_random_barcode_in_group.txt'), 'w') as Sort_Random_cnt,\ 167 | open(os.path.join(strEachGroup, 'Summary_Unique_RandomBarcodeNumber_in_group.txt'), 'w') as Uniq_random_cnt: 168 | 169 | Sort_Random_cnt.write(sHeader) 170 | Uniq_random_cnt.write('Sorting_barcode\tUnique_RandomBarcodeNumber_In_SortingBarcode\n') 171 | 172 | for sSortBarcode, llCol in dRecal_total_kind_of_RandomBarcode.items(): 173 | Uniq_random_cnt.write('\t'.join(map(str, [sSortBarcode, len(llCol)]))+'\n') 174 | for lCol in llCol: 175 | Sort_Random_cnt.write('\t'.join(map(str, lCol))+'\n') 176 | #END: for 177 | #END: with 178 | 179 | 180 | def Main(): 181 | 182 | logging.info('Program Start') 183 | logging.info('Make commands for a multiple processing') 184 | 185 | parser = ArgumentParser(description='Script for counting the random barcodes') 186 | 187 | parser.add_argument('-u', '--user_name', type=str, dest='user_name', help='The user name in the /user subdir') 188 | parser.add_argument('-p', '--project_name', type=str, dest='project_name', help='The project name in the /user/user_name/ subdir') 189 | parser.add_argument('-g', '--group', type=str, dest='group', default='false', help='The group sum run of the barcodes, default: false') 190 | parser.add_argument('-t', '--thread', type=int, dest='thread', default='15', help='The multicore number 1~15') 191 | options = parser.parse_args() 192 | 193 | InstParameters = clsParameters(options) 194 | 195 | lPara = [] 196 | 197 | with open(InstParameters.strSampleList) as 
SampleList: 198 | 199 | for strSample in SampleList: 200 | if strSample[0] == '#' or strSample[0] in ['', ' ', '\r', '\n', '\r\n']: continue 201 | strSample = strSample.replace('\n', '').replace('\r', '').split('\t')[0] 202 | sFile_path = './Output/{user}/{project}/{sample}'.format(user=options.user_name, 203 | project=options.project_name, 204 | sample=strSample) 205 | #print('sFile_path', sFile_path) 206 | lPara.append(sFile_path) 207 | 208 | ## single_test 209 | #Summary_random_barcode(lPara[0]) 210 | 211 | logging.info('Multiple processing Start') 212 | p = mp.Pool(options.thread) 213 | p.map_async(SummaryRandomBarcode, lPara).get() 214 | logging.info('Multiple processing End') 215 | 216 | #logging.info('Count group Start') 217 | #CountGroup(InstParameters) 218 | #logging.info('Count group End') 219 | 220 | #logging.info('Program End') 221 | 222 | Main() 223 | -------------------------------------------------------------------------------- /Indel_searcher_2/Summary_all_trim.py: -------------------------------------------------------------------------------- 1 | import os, sys, logging 2 | import pandas as pd 3 | import subprocess as sp 4 | from pdb import set_trace 5 | 6 | sOutput_dir = sys.argv[1] 7 | strSample = sys.argv[2] 8 | strLogPath = sys.argv[3] 9 | 10 | logging.basicConfig(format='%(process)d %(levelname)s %(asctime)s : %(message)s', 11 | level=logging.DEBUG, 12 | filename=strLogPath, 13 | filemode='a') 14 | logging.getLogger().addHandler(logging.StreamHandler(sys.stdout)) 15 | 16 | 17 | def Parsing_summary(): 18 | 19 | dfSummary = pd.read_table('{outdir}/Tmp/{sample}_Summary.txt'.format(sample=strSample, outdir=sOutput_dir), header=None) 20 | dfSummary.columns = ['Barcode', 'Total', 'Insertion', 'Deletion', 'Complex'] 21 | dfSummary = dfSummary.groupby(['Barcode']).sum() 22 | dfSummary['Total_indel'] = dfSummary['Insertion'] + dfSummary['Deletion'] + dfSummary['Complex'] 23 | dfSummary['IND/TOT'] = dfSummary['Total_indel'] / dfSummary['Total'] 24 | 
dfSummary['IND/TOT'].fillna(0, inplace=True) 25 | dfSummary.to_csv('{outdir}/Result/{sample}_Summary_result.tsv'.format(sample=strSample, outdir=sOutput_dir), sep='\t') 26 | 27 | def Annotate_final_result(): 28 | 29 | dfCount_INDEL = pd.read_table('{outdir}/Tmp/{sample}_Indel_summary.txt'.format(sample=strSample, outdir=sOutput_dir), header=None) 30 | dfSummary = pd.read_table('{outdir}/Result/{sample}_Summary_result.tsv'.format(sample=strSample, outdir=sOutput_dir), index_col='Barcode') 31 | 32 | dfCount_INDEL.set_index(0, inplace=True) 33 | dfConcat_result = pd.concat([dfCount_INDEL, dfSummary.loc[:,['Total_indel', 'Total', 'IND/TOT']]],axis=1) 34 | dfConcat_result.dropna(inplace=True) 35 | dfConcat_result = dfConcat_result.reset_index() 36 | dfConcat_result = dfConcat_result.loc[:,['index','Total_indel', 'Total', 'IND/TOT', 1,2]] 37 | dfConcat_result.columns = ['Barcode', 'Total_indel', 'Total', 'IND/TOT', 'Match','Info'] 38 | dfConcat_result = dfConcat_result.round(2) 39 | dfConcat_result.to_csv('{outdir}/Result/{sample}_Final_indel_result.tsv'.format(sample=strSample, outdir=sOutput_dir), sep='\t', index=False) 40 | 41 | if __name__ == '__main__': 42 | logging.info('Make a summary result.') 43 | Parsing_summary() 44 | Annotate_final_result() 45 | logging.info('The summary result has been completed.\n\n') 46 | -------------------------------------------------------------------------------- /Indel_searcher_2/User/JaeWoo/JaeWoo_test_samples.txt: -------------------------------------------------------------------------------- 1 | #190819_Nahye_12K_D4_eCas9_Rep1-Cas9D7 Cas9D7 2 | #190819_Nahye_12K_D4_eCas9_Rep2-Cas9D7 Cas9D7 3 | Test_sample Cas9D7 4 | #190819_Nahye_12K_D4_D0_1-Cas9D7 Cas9D7 5 | #190819_Nahye_12K_D7_2_D0_1-Cas9D7 Cas9D7 6 | #190819_Nahye_12K_D7_D0_1-Cas9D7 Cas9D7 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT 
License 2 | 3 | Copyright (c) 2019 Jae Woo Choi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /PythonPath.txt: -------------------------------------------------------------------------------- 1 | /media/hkim/Pipeline/Indel_searcher_2/miniconda2/bin/python 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CRISPR_toolkit 2 | CRISPR Indel, base edit analysis 3 | --------------------------------------------------------------------------------