├── .gitignore
├── Base_edit_2
    ├── Additional_BaseEdit_process_list.tsv
    ├── All_final_result_summation.py
    ├── BaseEdit_freq_crispresso.py
    ├── Each_base_summary.py
    ├── Indel_contr_dict_making.py
    ├── Indel_frequency_calculator.py
    ├── Kill_jobs.sh
    ├── MakeUserFolder.sh
    ├── Make_user_folder.sh
    ├── README.md
    ├── Run_BaseEdit_freq.py
    ├── Run_cmd.sh
    ├── Run_each_base_summary.sh
    ├── Run_sequence_freq.sh
    ├── Sequence_freq.py
    ├── Sequence_freq_add.py
    ├── Split_file.py
    ├── Sum_all_alt_freq.py
    ├── Summary_all_trim.py
    └── __init__.py
├── Core
    ├── CoreSystem.py
    ├── CoreSystem.pyc
    ├── __init__.py
    └── __init__.pyc
├── EDNAFULL
├── Indel_searcher_2
    ├── BaseEdit_input_converter.py
    ├── Debugger.py
    ├── Flash_pair_read_merge.py
    ├── Indel_frequency_calculator.py
    ├── Indel_normalization.py
    ├── Indel_normalization.pyc
    ├── Indel_searcher_crispresso_hash.py
    ├── Input
    │   └── JaeWoo
    │   │   ├── FASTQ
    │   │       └── JaeWoo_test_samples
    │   │       │   └── Test_sample
    │   │       │       └── eCas9_rep1_D4.ext.fastq
    │   │   └── Reference
    │   │       └── JaeWoo_test_samples
    │   │           └── Cas9D7
    │   │               ├── Barcode.txt
    │   │               ├── Reference_sequence.txt
    │   │               └── Target_region.txt
    ├── Kill_jobs.sh
    ├── Make_user_folder.sh
    ├── README.md
    ├── Run_cmd.sh
    ├── Run_converter.sh
    ├── Run_flash.sh
    ├── Run_indel_searcher.py
    ├── Run_random_barcode.sh
    ├── Summary_Random_barcode.py
    ├── Summary_all_trim.py
    └── User
    │   └── JaeWoo
    │       └── JaeWoo_test_samples.txt
├── LICENSE
├── PythonPath.txt
└── README.md


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | *.egg-info/
 24 | .installed.cfg
 25 | *.egg
 26 | MANIFEST
 27 | 
 28 | # PyInstaller
 29 | #  Usually these files are written by a python script from a template
 30 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 31 | *.manifest
 32 | *.spec
 33 | 
 34 | # Installer logs
 35 | pip-log.txt
 36 | pip-delete-this-directory.txt
 37 | 
 38 | # Unit test / coverage reports
 39 | htmlcov/
 40 | .tox/
 41 | .coverage
 42 | .coverage.*
 43 | .cache
 44 | nosetests.xml
 45 | coverage.xml
 46 | *.cover
 47 | .hypothesis/
 48 | .pytest_cache/
 49 | 
 50 | # Translations
 51 | *.mo
 52 | *.pot
 53 | 
 54 | # Django stuff:
 55 | *.log
 56 | local_settings.py
 57 | db.sqlite3
 58 | 
 59 | # Flask stuff:
 60 | instance/
 61 | .webassets-cache
 62 | 
 63 | # Scrapy stuff:
 64 | .scrapy
 65 | 
 66 | # Sphinx documentation
 67 | docs/_build/
 68 | 
 69 | # PyBuilder
 70 | target/
 71 | 
 72 | # Jupyter Notebook
 73 | .ipynb_checkpoints
 74 | 
 75 | # pyenv
 76 | .python-version
 77 | 
 78 | # celery beat schedule file
 79 | celerybeat-schedule
 80 | 
 81 | # SageMath parsed files
 82 | *.sage.py
 83 | 
 84 | # Environments
 85 | .env
 86 | .venv
 87 | env/
 88 | venv/
 89 | ENV/
 90 | env.bak/
 91 | venv.bak/
 92 | 
 93 | # Spyder project settings
 94 | .spyderproject
 95 | .spyproject
 96 | 
 97 | # Rope project settings
 98 | .ropeproject
 99 | 
100 | # mkdocs documentation
101 | /site
102 | 
103 | # mypy
104 | .mypy_cache/
105 | 


--------------------------------------------------------------------------------
/Base_edit_2/Additional_BaseEdit_process_list.tsv:
--------------------------------------------------------------------------------
1 | #ABE_Rep1_1_2_TF4	A,T	ABE_Rep1_1_2_TF4_AtoG_Summary.txt
2 | #ABE_Rep1_1_2_TF4	A,C	ABE_Rep1_1_2_TF4_AtoG_Summary.txt
3 | ABE_Rep1_1_2_TF4	A,G	ABE_Rep1_1_2_TF4_AtoG_Summary.txt
4 | 


--------------------------------------------------------------------------------
/Base_edit_2/All_final_result_summation.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import os, sys
  4 | import pandas as pd
  5 | 
  6 | from pdb import set_trace
  7 | 
# Path of the project list file, taken from the first command-line argument.
# The functions below read it as a global and treat each of its lines as one
# sample name.
strProjectList = sys.argv[1]
#strProjectList = 'Project_list2.txt'
 10 | 
 11 | 
 12 | def Summation_all_final_result():
 13 | 
 14 |     with open(strProjectList) as Input:
 15 | 
 16 |         listdfResult = []
 17 |         for i, strSample in enumerate(Input):
 18 |             #print(strSample)
 19 |             #if i == 2: break
 20 |             strSample = strSample.replace('\n','').replace('\r','').strip()
 21 |             strFinalResultDir = './Output/%s/Summary/Merge_target_result/' % strSample
 22 | 
 23 |             for j, strFinalResultFile in enumerate(os.listdir(strFinalResultDir)):
 24 |                 if j > 0:
 25 |                     print('I expected one file, but there are more. check the target base change file')
 26 |                     sys.exit(1)
 27 |                     
 28 |                 print(strFinalResultFile)
 29 |                 strFinalResultPath = './Output/%s/Summary/Merge_target_result/%s' % (strSample, strFinalResultFile)
 30 | 
 31 |                 listdfResult.append(pd.read_table(strFinalResultPath, low_memory=False))
 32 | 
 33 |         dfAll        = pd.concat(listdfResult)
 34 |         dfForw       = dfAll.iloc[:,0:3]
 35 |         dfReve       = dfAll.iloc[:,3:].replace(' ', '0').astype('int64')
 36 |         dfAllResult  = pd.concat([dfForw, dfReve], axis=1).groupby(['Sample','Barcode','Ref']).sum()
 37 |         dfAllResult.reset_index(inplace=True)
 38 | 
 39 |         dfAllResult.to_csv('./Output/Summation_'+strProjectList, sep='\t')
 40 | 
 41 |         #with open('./Output/%s/Summary/Merge_target_result/%s' % (strSample, strFinalResultFile)) as FinalResult:
 42 |         """
 43 |             for strRow in FinalResult:
 44 |                 listCol       = strRow.replace('\n','').split('\t')
 45 |                 listSamBarRef = listCol[:3]
 46 |                 = listCol[3:]
 47 |         """
 48 | 
 49 | 
 50 | def SummationSubIndel():
 51 | 
 52 |     with open(strProjectList) as Input,\
 53 |         open('./Output/Summation_' + strProjectList.replace('.txt','') + '_sub_indel.txt', 'w') as Output:
 54 | 
 55 |         dictResult = {}
 56 | 
 57 |         for i, strSample in enumerate(Input):
 58 |             print(strSample)
 59 |             #if i == 2: break
 60 |             strSample = strSample.replace('\n','').replace('\r','').strip()
 61 |             strSubIndelDir = './Output/%s/result' % strSample
 62 | 
 63 |             for strSubIndelFile in os.listdir(strSubIndelDir):
 64 |                 if 'sub' in strSubIndelFile:
 65 |                     with open(strSubIndelDir + '/' + strSubIndelFile) as SubIndel:
 66 |                         for strRow in SubIndel:
 67 |                             listCol         = strRow.replace('\n','').split('\t')
 68 |                             setIndelPattern = set(listCol[3].split(','))
 69 |                             intCount        = int(listCol[2])
 70 |                             strNameBarcodePattern  = '-'.join(listCol[0:2])+'-'+''.join(setIndelPattern)
 71 | 
 72 |                             try:
 73 |                                 dictResult[strNameBarcodePattern] += intCount
 74 |                             except KeyError:
 75 |                                 dictResult[strNameBarcodePattern] = intCount
 76 | 
 77 |         for strNameBarcodePattern, intCount in dictResult.items():
 78 |             Output.write('\t'.join(strNameBarcodePattern.split('-')) + '\t' + str(intCount) + '\n')
 79 | 
 80 | 
 81 | def ConfirmValidation():
 82 | 
 83 |     with open(strProjectList) as Input:
 84 | 
 85 |         listdfResult = []
 86 |         for i, strSample in enumerate(Input):
 87 |             if i == 2: break
 88 |             print(strSample)
 89 |             strSample = strSample.replace('\n','').replace('\r','').strip()
 90 |             strFinalResultDir = './Output/%s/Summary/Merge_target_result/' % strSample
 91 | 
 92 |             for strFinalResultFile in os.listdir(strFinalResultDir):
 93 |                 print(strFinalResultFile)
 94 |                 strFinalResultPath = './Output/%s/Summary/Merge_target_result/%s' % (strSample, strFinalResultFile)
 95 | 
 96 |                 listdfResult.append(pd.read_table(strFinalResultPath, low_memory=False))
 97 | 
 98 |         dfAll        = pd.concat(listdfResult)
 99 |         dfForw       = dfAll.iloc[:,0:3]
100 |         dfReve       = dfAll.iloc[:,3:].replace(' ', '0').astype('int64')
101 |         dfAllResult  = pd.concat([dfForw, dfReve], axis=1).groupby(['Sample','Barcode','Ref']).sum()
102 |         dfAllResult.reset_index(inplace=True)
103 |         print(dfAllResult.iloc[:, 3:].sum().values.tolist())
104 | 
105 | 
def Main():
    """Run both summation steps for the project list given on the command line."""
    Summation_all_final_result()
    SummationSubIndel()
    #ConfirmValidation()


# Only run the pipeline when executed as a script, so that importing this
# module does not start reading and writing ./Output as a side effect.
if __name__ == '__main__':
    Main()
113 | 


--------------------------------------------------------------------------------
/Base_edit_2/BaseEdit_freq_crispresso.py:
--------------------------------------------------------------------------------
  1 | import os, re, sys, logging
  2 | 
  3 | import numpy as np
  4 | import subprocess as sp
  5 | import cPickle as pickle
  6 | 
  7 | from pdb import set_trace
  8 | from datetime import datetime
  9 | from collections import OrderedDict
 10 | 
# Put the parent of the current working directory first on the import path so
# that the ``Core`` package can be imported.  NOTE(review): this depends on the
# working directory, not on this file's location - presumably the script is
# always started from inside its own folder; confirm against the launcher.
sys.path.insert(0, os.path.dirname(os.getcwd()))
from Core.CoreSystem import CoreGotoh
 13 | 
 14 | 
 15 | class clsParameter(object):
 16 | 
 17 |     """
 18 |     ./BaseEdit_freq_crispresso.py {forw} {GapO} {GapE} {barcode} {ref} {target_window} {indel_check_pos}
 19 |      {target_ref_alt} {outdir} {file_name} {PAM_seq} {PAM_pos} {Guide_pos} {ednafull} {log}
 20 |     """
 21 |     def __init__(self):
 22 | 
 23 |         if len(sys.argv) > 1:
 24 |             self.strForwPath        = sys.argv[1]
 25 |             self.floOg              = float(sys.argv[2])
 26 |             self.floOe              = float(sys.argv[3])
 27 |             self.strBarcode         = sys.argv[4]
 28 |             strRef                  = sys.argv[5]
 29 |             self.strRef             = strRef[strRef.index(self.strBarcode):]  ## 'ACTG'<barcode>ACGACACACGCAT, leftside bases are redundant.
 30 |             self.listTargetWindow   = sys.argv[6].split('-')
 31 |             self.listIndelCheckPos  = sys.argv[7].split('-')
 32 |             self.listTargetRefAlt   = sys.argv[8].split(',')
 33 |             self.strOutputDir       = sys.argv[9]
 34 |             self.strFileName        = sys.argv[10]
 35 |             self.strPamSeq          = sys.argv[11]
 36 |             self.listPamPos         = sys.argv[12].split('-')
 37 |             self.listGuidePos       = sys.argv[13].split('-')
 38 |             self.strEDNAFULL        = os.path.abspath('../EDNAFULL')
 39 |             self.strLogPath         = sys.argv[14]
 40 | 
 41 |         else:
 42 |             sManual = """
 43 |             Usage:
 44 | 
 45 |             python2.7 ./indel_search_ver1.0.py splitted_input_1.fq splitted_input_2.fq reference.fa
 46 | 
 47 |             splitted_input_1.fq : forward
 48 |             splitted_input_2.fq : reverse
 49 | 
 50 |             Total FASTQ(fq) lines / 4 = remainder 0.
 51 |             """
 52 |             print sManual
 53 |             sys.exit()
 54 | 
 55 | 
 56 | class clsBaseEditParser():
 57 | 
 58 |     def __init__(self, InstParameter):
 59 |         self.strForwPath       = InstParameter.strForwPath
 60 |         self.strRef            = InstParameter.strRef
 61 |         self.strBarcode        = InstParameter.strBarcode
 62 |         self.strEDNAFULL       = InstParameter.strEDNAFULL
 63 |         self.floOg             = InstParameter.floOg
 64 |         self.floOe             = InstParameter.floOe
 65 |         self.listIndelCheckPos = InstParameter.listIndelCheckPos
 66 |         self.listTargetWindow  = InstParameter.listTargetWindow
 67 | 
 68 |     def OpenSequenceFiles(self):
 69 |         lSequence_forward = []
 70 |         with open(self.strForwPath) as fa_1:
 71 |             lSequence_forward = [sRow.replace('\n', '').upper() for sRow in fa_1]
 72 |         return lSequence_forward
 73 | 
    def CalculateBaseEditFreq(self, lQuery_seq=[]):
        """Align every read to the reference and collect indel / base-edit data.

        For each read in lQuery_seq:
          1. locate self.strBarcode and keep the read from the barcode onward
             (reads without the barcode are printed as ValueError and skipped);
          2. align that part against self.strRef with CoreGotoh.RunCRISPResso2
             (a failing alignment is logged and the read is skipped);
          3. cut the alignment to the span of the reference and list the
             insertions and deletions found in it;
          4. flag the read as insertion / deletion / complex when such an
             event touches the window given by self.listIndelCheckPos;
          5. for reads without a flagged indel, record the substituted bases
             inside self.listTargetWindow.

        NOTE(review): the default argument is a shared list; it is only read
        here, never modified.

        Returns:
            dict {barcode: list} where the list holds
              [0] number of reads that were aligned
              [1] number of insertion reads
              [2] number of deletion reads
              [3] number of complex (insertion and deletion) reads
              [4]-[7] empty lists, never filled by this method
              [8] one entry per aligned read:
                  [ref, raw read, indel labels, "", aligned ref, aligned query]
              [9] one list of per-position OrderedDict base counters for every
                  read without a flagged indel
              [10] same layout as [8], with the counter values in place of "",
                   only for reads with at least one base edit
        """

        dRef    = {}
        dResult = {}

        dRef[self.strBarcode]    = (self.strRef)  # NOTE(review): dRef is filled here but not read again in this method
        dResult[self.strBarcode] = [0, 0, 0, 0, [], [], [], [], [], [], []]

        # lRef   : [(ref_seq, ref_seq_after_barcode, barcode, barcode end pos, indel end pos, indel from barcode),(...)]
        # dResult = [# of total, # of ins, # of del, # of com, [total FASTQ], [ins FASTQ], [del FASTQ], [com FASTQ], info]
        iCount = 0

        InstGotoh = CoreGotoh(strEDNAFULL=self.strEDNAFULL, floOg=self.floOg, floOe=self.floOe)

        for sQuery_seq_raw in lQuery_seq:

            # Per-read flags; each stays 0 or 1 and is added to the totals below.
            iBarcode_matched = 0
            iNeedle_matched  = 0
            iInsert_count    = 0
            iDelete_count    = 0
            iComplex_count   = 0

            try:
                # Check the barcode pos and remove it.
                # str.index raises ValueError when the barcode is absent; that is
                # caught at the end of this try block and the read is skipped.
                sQuery_seq_raw = sQuery_seq_raw.replace('\r', '')
                iBarcode_start_pos = sQuery_seq_raw.index(self.strBarcode)
                iBarcode_matched += 1

                sQuery_seq_with_barcode = sQuery_seq_raw[iBarcode_start_pos:]  ## this is not after barcode seq. including barcode

                # NOTE(review): depends only on self.strRef, yet is recomputed for every read.
                npGapIncentive = InstGotoh.GapIncentive(self.strRef)

                try:
                    lResult = InstGotoh.RunCRISPResso2(sQuery_seq_with_barcode.upper(), self.strRef.upper(), npGapIncentive)
                except Exception as e:
                    # Alignment failed: log it and drop the read (it is not counted at all).
                    logging.error(e, exc_info=True)
                    continue

                # lResult[0] / lResult[1] are used as the gapped query / reference strings.
                sQuery_needle_ori = lResult[0]
                sRef_needle_ori   = lResult[1]

                # if _check == 1:
                #     print(sRef_needle_ori)
                #     print(sQuery_needle_ori)
                #     set_trace()

                # detach forward ---, backward ---
                # e.g.    ref   ------AAAGGCTACGATCTGCG------
                #         query AAAAAAAAATCGCTCTCGCTCTCCGATCT
                # trimmed ref         AAAGGCTACGATCTGCG
                # trimmed query       AAATCGCTCTCGCTCTC
                iReal_ref_needle_start = 0
                iReal_ref_needle_end   = len(sRef_needle_ori)
                iRef_needle_len        = len(sRef_needle_ori)

                # Index of the first reference base in the alignment.
                for i, sRef_nucle in enumerate(sRef_needle_ori):
                    if sRef_nucle in ['A', 'C', 'G', 'T']:
                        iReal_ref_needle_start = i
                        break

                # Index of the last reference base in the alignment.
                for i, sRef_nucle in enumerate(sRef_needle_ori[::-1]):
                    if sRef_nucle in ['A', 'C', 'G', 'T']:
                        iReal_ref_needle_end = iRef_needle_len - (i + 1)
                        # forward 0 1 2  len : 3
                        # reverse 2 1 0,  len - (2 + 1) = 0
                        break

                sRef_needle = sRef_needle_ori[iReal_ref_needle_start:iReal_ref_needle_end + 1]
                # NOTE(review): the assignment inside this 'if' is always overwritten by
                # the next statement, and the query is cut from index 0 rather than from
                # iReal_ref_needle_start. The two strings only stay in register when the
                # aligned reference has no leading gap - confirm this is intended.
                if iReal_ref_needle_start:
                    sQuery_needle = sQuery_needle_ori[:iReal_ref_needle_end]
                sQuery_needle = sQuery_needle_ori[:len(sRef_needle)]
                # detaching completion

                # indel info making.
                iNeedle_match_pos_ref   = 0   # reference bases seen so far
                iNeedle_match_pos_query = 0   # query bases seen so far, plus deleted bases already recorded
                iNeedle_insertion       = 0   # length of the gap run currently open in the reference
                iNeedle_deletion        = 0   # length of the gap run currently open in the query

                lInsertion_in_read = []  # insertion result [[100, 1], [119, 13]]
                lDeletion_in_read  = []  # deletion result  [[97, 1], [102, 3]]

                # print 'sRef_needle', sRef_needle
                # print 'sQuery_needle', sQuery_needle
                # Walk the alignment column by column; a gap run is recorded as
                # [position, length] when the next real base closes it, so a run that
                # reaches the end of the trimmed alignment is not recorded.
                for i, (sRef_nucle, sQuery_nucle) in enumerate(zip(sRef_needle, sQuery_needle)):

                    if sRef_nucle == '-':
                        iNeedle_insertion += 1

                    if sQuery_nucle == '-':
                        iNeedle_deletion += 1

                    if sRef_nucle in ['A', 'C', 'G', 'T']:
                        if iNeedle_insertion:
                            lInsertion_in_read.append([iNeedle_match_pos_ref, iNeedle_insertion])
                            iNeedle_insertion = 0
                        iNeedle_match_pos_ref += 1

                    if sQuery_nucle in ['A', 'C', 'G', 'T']:
                        if iNeedle_deletion:
                            lDeletion_in_read.append([iNeedle_match_pos_query, iNeedle_deletion])
                            iNeedle_match_pos_query += iNeedle_deletion
                            iNeedle_deletion = 0
                        iNeedle_match_pos_query += 1
                        # print 'sRef_needle', sRef_needle

                # print 'sQuery_needle', sQuery_needle
                # print 'lInsertion_in_read: onebase', lInsertion_in_read
                # print 'lDeletion_in_read: onebase', lDeletion_in_read
                # print 'i5bp_front_Indel_end', i5bp_front_Indel_end
                # print 'iIndel_end_from_barcode_pos', iIndel_end_from_barcode_pos

                lTarget_indel_result = []  # ['20M2I', '23M3D' ...]

                """
                ins case
                ...............................NNNNNNNNNNNNNN....NNNNNNNNNNNNNNNNNNN*NNNNNAGCTT
                """

                # Indel check window, taken from listIndelCheckPos ("start-end").
                iCleavage_window_start = int(self.listIndelCheckPos[0])
                iCleavage_window_end = int(self.listIndelCheckPos[1]) - 1

                # An insertion counts when its position lies inside the window.
                for iMatch_pos, iInsertion_pos in lInsertion_in_read:
                    if iCleavage_window_start <= iMatch_pos <= iCleavage_window_end:  # iMatch_pos is one base
                        iInsert_count = 1
                        lTarget_indel_result.append(str(iMatch_pos) + 'M' + str(iInsertion_pos) + 'I')
                """
                del case 1
                ...............................NNNNNNNNNNNNNN....NNNNNNNNNNNNNNNNNNNNN**NNNAGCTT
                del case 2
                ...............................NNNNNNNNNNNNNN....NNNNNNNNNNNNNNNNNNNNN**NNNNNCTT
                """
                # A deletion counts when the deleted span overlaps the window.
                for iMatch_pos, iDeletion_pos in lDeletion_in_read:

                    """
                    Insertion: 30M3I
                           ^
                    ACGT---ACGT
                    ACGTTTTACGT -> check this seq
                    Insertion just check two position

                    Deletion: 30M3D
                         ^
                    ACGTTTTACGT
                    ACGT---ACGT -> check this seq
                    But deletion has to includes overlap deletion.
                    """

                    if iMatch_pos <= iCleavage_window_end and iCleavage_window_start <= (iMatch_pos + iDeletion_pos):
                        iDelete_count = 1
                        lTarget_indel_result.append(str(iMatch_pos) + 'M' + str(iDeletion_pos) + 'D')

                # A read with both an insertion and a deletion is counted once, as complex.
                if iInsert_count == 1 and iDelete_count == 1:
                    iComplex_count = 1
                    iInsert_count = 0
                    iDelete_count = 0

                    # """ test set
                    # print 'sBarcode', sBarcode
                    # print 'sTarget_region', sTarget_region
                    # print 'sRef_seq_after_barcode', sRef_seq_after_barcode
                    # print 'sSeq_after_barcode', sQuery_seq
                    # print 'iIndel_start_from_barcode_pos', iIndel_start_from_barcode_pos
                    # print 'iIndel_end_from_barcode_pos', iIndel_end_from_barcode_pos
                    # """

                    """
                    23M3I
                    23M is included junk_seq after barcode,

                    barcorde  junk   targetseq   others
                    *********ACCCT-------------ACACACACC
                    so should select target region.
                    If junk seq is removed by target region seq index pos.
                    """

                ## 8: indel info
                dResult[self.strBarcode][8].append(
                    [self.strRef, sQuery_seq_raw, lTarget_indel_result,
                     "", sRef_needle_ori, sQuery_needle_ori])  ## "" -> target seq, but this is not used this project.

            # end: try
            except ValueError as e:
                # Raised by str.index above when the read does not contain the barcode.
                print(e)
                continue

            # total matched reads, insertion, deletion, complex
            dResult[self.strBarcode][0] += iBarcode_matched
            dResult[self.strBarcode][1] += iInsert_count
            dResult[self.strBarcode][2] += iDelete_count
            dResult[self.strBarcode][3] += iComplex_count

            ## base editing frequency
            """
                   BaseEditPos : 0                                                    1                                  2
            [OrderedDict([('A',0),('C',0),('G',0),('T',0)]), OrderedDict([('A',0),('C',0),('G',0),('T',0)]), ...

            and sum the counts each position
            """

            ## No indel reads only
            if iInsert_count == 0 and iDelete_count == 0 and iComplex_count == 0:

                # One A/C/G/T counter per position of the target window.
                lBaseEdit = []
                iTarget_len = int(self.listTargetWindow[1]) - int(self.listTargetWindow[0]) + 1

                for i in range(iTarget_len):
                    lBaseEdit.append(OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)]))

                # listTargetWindow is "start-end"; converted here to a slice [start-1, end).
                iTarget_start = int(self.listTargetWindow[0]) - 1
                iTarget_end = int(self.listTargetWindow[1])

                """
                                       cleavage window start
                                        ^
                [barcode]ACGACGTACGACGT[cleavage]
                [barcode]ACGACGTACGACGT[cleavage]
                """

                iBase_edit_event = 0

                # The slice is applied to the gapped alignment strings, so gap columns
                # before the window shift the compared positions.
                # NOTE(review): a query character other than A/C/G/T/N/'-' would raise an
                # uncaught KeyError in the counter update below.
                for i, tRef_Query_base in enumerate(zip(sRef_needle[iTarget_start: iTarget_end], sQuery_needle[iTarget_start: iTarget_end])):
                    sRef_base   = tRef_Query_base[0]
                    sQuery_base = tRef_Query_base[1]

                    if sRef_base == '-' or sQuery_base == '-': continue

                    if sRef_base != sQuery_base and sQuery_base != 'N':
                        iBase_edit_event = 1
                        lBaseEdit[i][sQuery_base] += 1
                        # print(sQuery_needle)

                dResult[self.strBarcode][9].append(lBaseEdit)
                ## Processed indel filtering and store aligned alt mut read.
                if iBase_edit_event == 1:
                    dResult[self.strBarcode][10].append([self.strRef, sQuery_seq_raw, lTarget_indel_result, [list(orderedDict.values()) for orderedDict in lBaseEdit], sRef_needle_ori, sQuery_needle_ori])
                # dResult[sBarcode] = [0, 0, 0, 0, [], [], [], [], [], [BaseEdit_freq_data]]

            # Reset the per-read flags (they are also re-initialised at the top of the loop).
            iBarcode_matched = 0
            iInsert_count = 0
            iDelete_count = 0
            iComplex_count = 0
            # end: for sBarcode, lCol_ref
        # end: for lCol_FASTQ
        return dResult
319 | 
320 | 
321 | class clsOutputMaker():
322 | 
323 |     def __init__(self, InstParameter):
324 | 
325 |         self.strForwPath      = InstParameter.strForwPath
326 |         self.strRef           = InstParameter.strRef
327 |         self.strFileName      = InstParameter.strFileName
328 |         self.strOutputDir     = InstParameter.strOutputDir
329 |         self.listTargetRefAlt = InstParameter.listTargetRefAlt
330 |         self.listTargetWindow = InstParameter.listTargetWindow
331 |         self.strPamSeq        = InstParameter.strPamSeq
332 |         self.listPamPos       = InstParameter.listPamPos
333 |         self.listGuidePos     = InstParameter.listGuidePos
334 | 
335 |         # index name, constant variable.
336 |         self.intNumOfTotal = 0
337 |         self.intNumOfIns   = 1
338 |         self.intNumOfDel   = 2
339 |         self.intNumOfCom   = 3
340 |         self.intTotalFastq = 4
341 |         self.intInsFastq   = 5
342 |         self.intDelFastq   = 6
343 |         self.intComFastq   = 7
344 |         self.intIndelInfo  = 8
345 | 
346 |     def MakeOutput(self, dResult):
347 |         """
348 |        {'TTTGGTGCACACACATATA': [6, 2, 2, 0, [], [], [], [], [['TATCTCTA..ref', 'GAGTCGGTG...query', [13M5D], '',
349 |        'TTTGGTGCACACACATATAACTGGAACACAAAGCATAGACTGCGGGGCG------------------------------------------------------------',
350 |        'TTTGGTGCACACACATATAACTGGAACACAAAGCATAGA-TGCGGGGCGTGGCGTAACTAGATCTCTACTCTACCACTTGTACTTCAGCGGTCAGCTTACTCGACTTAA'],
351 |        ['TTTGGTGCACACACATATAACTGGAACACAAAGCATAGACTGCGGGGCG', '', '', '',
352 |        'TTTGGTGCACACACATATAACTGGAACACAAAGCATAGACTGCGGGGCG------------------------------------------------------------', ...
353 |        [[OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)]), OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)]), OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)]), OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)]), OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)]), OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 1)]), OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)]), OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)]), OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)]), OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)]), OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)]), OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)]), OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)]), OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)]), OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)]), OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)]), OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)]), OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)]), OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)])],
354 |        [OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)]), OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)]), OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)]), OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)]), OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)]), OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)]), OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)]), OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)]), OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)]), OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)]), OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)]), OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 1)]), OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)]), OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)]), OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)]), OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)]), OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)]), OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)]), OrderedDict([('A', 0), ('C', 0), ('G', 0), ('T', 0)])]]]}
355 |         """
356 | 
357 |         with open('{outdir}/Tmp/Alignment/{file_name}_filtered_indel.txt'.format(outdir=self.strOutputDir, file_name=self.strFileName), 'w') as Filtered,\
358 |             open('{outdir}/Tmp/Alignment/{file_name}_aligned_BaseEdit.txt'.format(outdir=self.strOutputDir, file_name=self.strFileName), 'w') as Ref_Alt_edit:
359 | 
360 |             for sBarcode in dResult:
361 |                 for lAligned_indel_result in dResult[sBarcode][8]:  # 8 : indel list
362 |                     if lAligned_indel_result[2]:
363 |                         Filtered.write('\t'.join(map(str, lAligned_indel_result)) + '\n')
364 | 
365 |                 for lAligned_alt_result in dResult[sBarcode][10]:  # 10 : alt base list
366 |                     if lAligned_alt_result:
367 |                         lAligned_alt_result[2] = str(lAligned_alt_result[2])
368 |                         try:
369 |                             Ref_Alt_edit.write('\t'.join(map(str, lAligned_alt_result)) + '\n')
370 |                         except Exception:
371 |                             set_trace()
372 | 
373 |             """
374 |             lAligned_result
375 |             ['TATCTCTATCAGCACACAAGCATGCAATCACCTTGGGTCCAAAGGTCC', 'TCTTGAAAAAGTGGCACCGAGTCGGTGCTTTTTTGTATCTCTATCAGCACACAAGCATGCAATCACCTTGGGTCAAAGGTCCAGCTTGGCGTAACTAGATCTCTACTCTACCACTTGTACTTCAGCGGTCAGCTTACTCGACTTAAT\r',
376 |             ['38M1D'], '', 'TATCTCTATCAGCACACAAGCATGCAATCACCTTGGGTCCAAAGGTCC-----------------------------------------------------------------', 'TATCTCTATCAGCACACAAGCATGCAATCACCTTGGGT-CAAAGGTCCAGCTTGGCGTAACTAGATCTCTACTCTACCACTTGTACTTCAGCGGTCAGCTTACTCGACTTAAT']
377 |             """
378 | 
379 |         dSelect_base = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
380 | 
381 |         sTarget_ref = self.listTargetRefAlt[0]
382 |         sTarget_alt = self.listTargetRefAlt[1]
383 | 
384 |         iTarget_base = dSelect_base[sTarget_alt]
385 | 
386 |         try:
387 |             if not os.path.isdir('{outdir}/Tmp/All'.format(outdir=self.strOutputDir)):
388 |                 os.mkdir('{outdir}/Tmp/All'.format(outdir=self.strOutputDir))
389 |             if not os.path.isdir('{outdir}/Tmp/Target'.format(outdir=self.strOutputDir)):
390 |                 os.mkdir('{outdir}/Tmp/Target'.format(outdir=self.strOutputDir))
391 |         except OSError:
392 |             pass
393 | 
394 |         for sBarcode, lValue in dResult.items():
395 | 
396 |             iBarcode_start_pos       = self.strRef.index(sBarcode)
397 |             sRef_seq_without_barcode = self.strRef[iBarcode_start_pos+len(sBarcode):]
398 | 
399 |             llBaseEdit = lValue[9]
400 |             lSum = []
401 | 
402 |             for i, lBaseEdit in enumerate(llBaseEdit):
403 | 
404 |                 if not lSum:
405 |                     lSum = [[0, 0, 0, 0] for iQuery in range(len(lBaseEdit))]
406 | 
407 |                 for j in range(len(lBaseEdit)):
408 |                     for k, iCount in enumerate(list(llBaseEdit[i][j].values())):
409 |                         lSum[j][k] += iCount
410 | 
411 |             with open('{outdir}/Tmp/All/{file_name}_Summary.txt'.format(outdir=self.strOutputDir, file_name=self.strFileName), 'w') as Summary, \
412 |                 open('{outdir}/Tmp/Target/{file_name}_{target}_Summary.txt'.format(outdir=self.strOutputDir, file_name=self.strFileName, target=sTarget_ref + 'to' + sTarget_alt), 'w') as Target_summary:
413 | 
414 |                 ## This Ref has barcode.
415 |                 sRef_target = self.strRef[int(self.listTargetWindow[0]) - 1:int(self.listTargetWindow[1])]
416 | 
417 |                 iPAM_start    = int(self.listPamPos[0]) - 1
418 |                 iPAM_end      = int(self.listPamPos[1])
419 |                 iGuide_start  = int(self.listGuidePos[0]) - 1
420 |                 iGuide_end    = int(self.listGuidePos[1])
421 |                 iGuide_len    = iGuide_end - iGuide_start
422 |                 iBarcode_len  = len(sBarcode)
423 | 
424 |                 """
425 |                 barcode Guide st,ed 
426 |                 <----><----------> NGG
427 |                 ACGTACGTACGTACGTACGTGGACG
428 |                 """
429 | 
430 |                 #sRef_target[iPAM_start:iPAM_end] = sPAM_seq
431 |                 ## iWithout_target_len = len(sRef_target[iBarcode_len:iGuide_start]) -> weird part.
432 |                 ## So I corrected it.
433 |                 iWithout_target_len = iGuide_start - iBarcode_len
434 |                 lWithout_target_pos = [-(i+1) for i in range(iWithout_target_len)][::-1]
435 | 
436 |                 lWith_target_pos = [i + 1 for i in range(iGuide_len)]
437 |                 lAfter_PAM_pos   = [i + 1 for i in range(len(self.strRef) - iPAM_end + 1)]
438 | 
439 |                 lPos_num           = lWithout_target_pos + lWith_target_pos + list(self.strPamSeq) + lAfter_PAM_pos
440 |                 lPos_annotated_ref = [str(i)+'.'+str(j) for i,j in zip(sRef_target, lPos_num)]
441 |                 ## ['A.-7', 'C.-6', 'A.-5', 'A.-4', 'G.-3', 'C.-2', 'A.-1', 'T.1', 'G.2', 'C.3', 'A.4', 'A.5', 'T.6', 'C.7', 'A.8', 'C.9', 'C.10', 'T.11', 'T.12', 'G.13', 'G.14',
442 | 
443 |                 lMasked_pos_annotated_ref_target = []   ## '' '' '' A '' '' '' A A '' ''
444 | 
445 |                 for sBase_pos in lPos_annotated_ref:
446 |                     sBase_only = sBase_pos.split('.')[0]
447 |                     if sBase_only != sTarget_ref:
448 |                         lMasked_pos_annotated_ref_target.append(' ')
449 |                     else:
450 |                         lMasked_pos_annotated_ref_target.append(sBase_pos)
451 | 
452 |                 #set_trace()
453 | 
454 |                 strFormat = "{sample}\t{bar}\t{ref}\t{NumTot}\t{NumIns}\t{NumDel}\t{NumCom}\t{BaseEditCount}\n"
455 |                 ## Making a header
456 |                 Summary.write("Sample\tBarcode\tRef\t# of Total\t# of Insertion\t# of Deletion\t# of Combination\t{refseq}\n".format(refseq='\t'.join(lPos_annotated_ref)))
457 |                 Target_summary.write("Sample\tBarcode\tRef\t# of Total\t# of Insertion\t# of Deletion\t# of Combination\t{refseq}\n".format(refseq='\t'.join(lMasked_pos_annotated_ref_target)))
458 | 
459 |                 for i, lBase_count in enumerate(zip(*lSum)):  ## lBase_count [(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), (0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0)]
460 | 
461 |                     if i == 0:
462 |                         Summary.write(strFormat.format(sample=self.strFileName, bar=sBarcode, ref=sRef_seq_without_barcode, NumTot=lValue[self.intNumOfTotal], NumIns=lValue[self.intNumOfIns], NumDel=lValue[self.intNumOfDel], NumCom=lValue[self.intNumOfCom],
463 |                             BaseEditCount='\t'.join(map(str, lBase_count))))
464 |                     else:
465 |                         Summary.write("\t\t\t\t\t\t\t{BaseEditCount}\n".format(BaseEditCount='\t'.join(map(str, lBase_count))))
466 | 
467 |                 try:
468 |                     lTarget_base_count = zip(*lSum)[iTarget_base]
469 |                     lMasked_target_base_count = []  ## '' 20 '' 30 '' '' '' '' 20 ''
470 | 
471 |                     for sMasked_ref, fCount in zip(lMasked_pos_annotated_ref_target, lTarget_base_count):
472 | 
473 |                         if sMasked_ref == ' ':
474 |                             lMasked_target_base_count.append(' ')
475 |                         else:
476 |                             lMasked_target_base_count.append(fCount)
477 | 
478 |                     Target_summary.write((strFormat.format(sample=self.strFileName, bar=sBarcode, ref=sRef_seq_without_barcode, NumTot=lValue[self.intNumOfTotal],
479 |                                                            NumIns=lValue[self.intNumOfIns], NumDel=lValue[self.intNumOfDel], NumCom=lValue[self.intNumOfCom],
480 |                         BaseEditCount='\t'.join(map(str, lMasked_target_base_count)))))
481 | 
482 |                 except IndexError:
483 |                     print('Null query: ', self.strForwPath)
484 |                     ## Null query base count is all zero.
485 |                     Target_summary.write(
486 |                         (strFormat.format(sample=self.strFileName, bar=sBarcode, ref=sRef_seq_without_barcode, NumTot=lValue[self.intNumOfTotal],
487 |                             NumIns=lValue[self.intNumOfIns], NumDel=lValue[self.intNumOfDel], NumCom=lValue[self.intNumOfCom],
488 |                             BaseEditCount='\t'.join(['0'] * len(lPos_annotated_ref)))))
489 | 
490 | 
def Main():
    """Run the base-edit frequency pipeline for one input file.

    Steps, in order: build the parameter object, configure logging (appending
    to the parameter object's log file and echoing to stdout), open the
    sequence file, calculate the base-edit frequencies and write the output.

    Outputs mentioned by the original author:
      1. Count information of matched barcode e.g. TACGATCTA\t# total\tins\t# del\t# com
      2. classify FASTQ.    e.g. TAGAATATACACG.insertion.fastq
    """
    objParam = clsParameter()

    dictLogSetup = {
        'format':   '%(process)d %(levelname)s %(asctime)s : %(message)s',
        'level':    logging.DEBUG,
        'filename': objParam.strLogPath,
        'filemode': 'a',
    }
    logging.basicConfig(**dictLogSetup)

    # Mirror every record to stdout in addition to the log file.
    objRootLogger = logging.getLogger()
    objRootLogger.addHandler(logging.StreamHandler(sys.stdout))

    def _LogStep(strTemplate):
        # Every progress message carries the name of the file being processed.
        logging.info(strTemplate % objParam.strFileName)

    _LogStep('Program start : %s')

    objParser = clsBaseEditParser(objParam)
    _LogStep('File Open : %s')
    listForwardSeq = objParser.OpenSequenceFiles()

    _LogStep('Calculate base edit frequency : %s')
    dictForwardResult = objParser.CalculateBaseEditFreq(listForwardSeq)

    _LogStep('Make output forward : %s')
    objOutput = clsOutputMaker(objParam)
    objOutput.MakeOutput(dictForwardResult)

    _LogStep('Program end : %s')
# end: def Main
519 | 
520 | 
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    Main()
523 | 
524 | 
525 | 


--------------------------------------------------------------------------------
/Base_edit_2/Each_base_summary.py:
--------------------------------------------------------------------------------
  1 | #!/home/hkimlab/anaconda2/bin/python2.7
  2 | 
  3 | import os, sys
  4 | from pdb import set_trace
  5 | 
# Read the two required positional arguments. strUser and strProject are
# module-level names that the functions of this script read directly.
try:
    strUser        = sys.argv[1]
    strProject     = sys.argv[2]

except IndexError:
    # Fewer than two arguments were given: print the usage text and stop.
    print('\n')
    print('usage   : ./Each_base_summary.py user_name project_name\n')
    print('example : ./Each_base_summary.py SH p53_screening\n')
    sys.exit()
 15 | 
 16 | 
 17 | def Make_target_ref_alt_summary(strSample='', strRef='', strAlt='', strFirstOutput=''):
 18 | 
 19 |     """ row 0: header, 1: A and info, 2: C, 3: G, 4: T
 20 |     Sample          Barcode         Ref                                                                                          # of Total # of Insertion # of Deletion  # of Combination  C.-7    T.-6    C.-5    T.-4    G.-3    G.-2    G.-1    G.1     T.2     C.3     A.4     G.5     G.6     G.7      A.8     C.9     A.10    G.11    T.12    G.13    G.14    A.15    C.16    T.17    C.18    G.19    A.20    A.N     G.G     G.G     A.1     G.2     A.3
 21 |     Doench2014_1000 ACTAGCTATCGCTCA CTCTGGGGTCAGGGACAGTGGACTCGAAGGAGAAGCTTGGCGTAACTAGATCTCTACTCTACCACTTGTACTTCAGCGGTCAGCTTACTCGACTTAA       5       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       00       0
 22 |                                                                                                                                             0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       00       0       0       0       0       0       0       0       0       0       0       0       0       0
 23 |                                                                                                                                             0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       0       00       0       0       0       0       0       0       0       0       0       0       0       0       0
 24 |                                                                                                                                             0       0       0       0       0       0       0       0       0       0       0       1       0       0       0       0       0       0       00       0       0       0       0       0       0       0       0       0       0       0       0       0
 25 |     """
 26 |     dAlt = {'A' : 1, 'C' : 2, 'G' : 3, 'T' : 4}
 27 |     lHeader = []
 28 |     llResult = []
 29 | 
 30 |     strSampleDir  = './Output/{user}/{project}/{sample}'.format(user=strUser,
 31 |                                                                 project=strProject,
 32 |                                                                 sample=strSample)
 33 |     strSummaryDir = os.path.join(strSampleDir, 'Result')
 34 |     strMergeTargetDir = os.path.join(strSummaryDir, 'Merge_target_result')
 35 | 
 36 |     with open(os.path.join(strMergeTargetDir,strFirstOutput)) as Fisrt_output,\
 37 |         open(os.path.join(strMergeTargetDir, '{sample}_{ref}to{alt}_Summary_addition.txt'.format(sample=strSample,
 38 |                                                                                                  ref=strRef,
 39 |                                                                                                  alt=strAlt)), 'w') as Output:
 40 | 
 41 |         strSummaryAllDir = os.path.join(strSampleDir,'Tmp/All')
 42 |         for iFile_cnt, sFile in enumerate(os.listdir(strSummaryAllDir)):
 43 | 
 44 |             with open(os.path.join(strSummaryAllDir, sFile)) as Input:
 45 |                 lNone_alt_col  = []
 46 |                 lBaseEdit_Info = []
 47 | 
 48 |                 for i, sRow in enumerate(Input):
 49 |                     lCol = sRow.replace('\n', '').split('\t')
 50 | 
 51 |                     if i == 0:
 52 |                         for j, sCol_name in enumerate(lCol[7:]):
 53 |                             if strRef not in sCol_name:
 54 |                                 lNone_alt_col.append(7+j)
 55 |                                 lCol[7+j] = ' '
 56 | 
 57 |                         if lHeader == []:
 58 |                             lHeader = lCol
 59 |                         elif lHeader:
 60 |                             for iHeader_col, tHeader in enumerate(zip(lHeader[7:], lCol[7:])):
 61 |                                 sHeader_current, sHeader_update = tHeader
 62 | 
 63 |                                 if sHeader_update == ' ': continue
 64 | 
 65 |                                 if sHeader_current == ' ':
 66 |                                     lHeader[iHeader_col+7] = sHeader_update
 67 | 
 68 |                                 else:
 69 |                                     assert  sHeader_current == sHeader_update, 'Check header %s %s' % (repr(sHeader_current), repr(sHeader_update))
 70 | 
 71 |                     elif i == 1:
 72 |                         lBaseEdit_Info = lCol[:7]
 73 | 
 74 |                     elif i == dAlt[strAlt]:
 75 |                         for iNon_col in lNone_alt_col:
 76 |                             lCol[iNon_col] = ' '
 77 |                         lCol[:7] = lBaseEdit_Info
 78 |                         #print(i, lCol)
 79 |                         #(3, ['Doench2014_1000', 'ACTAGCTATCGCTCA', 'CTCTGGGGTCAGGGACAGTGGACTCGAAGGAGAAGCTTGGCGTAACTAGATCTCTACTCTACCACTTGTACTTCAGCGGTCAGCTTACTCGACTTAA', '5', '0', '0', '0', '', '', '', '', '', '', '', '', '', '', '0', '', '', '', '0', '', '0', '', '', '', '', '0', '', '', '', '', '0', '0', '', '', '0', '', '0'])
 80 |                         llResult.append(lCol)
 81 | 
 82 | 
 83 |         print('Total_files: ', iFile_cnt + 1)
 84 |         Output.write('\t'.join(lHeader) + '\n')
 85 | 
 86 |         """
 87 |         All folder doesn't able to have any indel information if it hasn't any counts of alterantive alleles.
 88 |         That file has only a header.
 89 |         Hence, I check the first merged summary output data, then extract it doesn't have current additional output.   
 90 |         """
 91 | 
 92 |         dAdditional_output = {} ## dictionary to check for only header files in the 'all' folder.
 93 | 
 94 |         for lResult in llResult:
 95 |             sSample = lResult[0]
 96 |             dAdditional_output[sSample] = '\t'.join(lResult) + '\n'
 97 | 
 98 |         for i, sRow in enumerate(Fisrt_output):
 99 |             if i == 0: continue ## header skip
100 |             lCol    = sRow.replace('\n', '').split('\t')
101 |             sSample = lCol[0]
102 | 
103 |             try:
104 |                 Output.write(dAdditional_output[sSample])
105 |             except KeyError:     ## Exclusive possession
106 |                 Output.write(sRow)
107 | 
108 | 
def Main():
    """Build one additional ref-to-alt summary per row of the process list.

    Reads ``./User/<user>/Additional_BaseEdit_process_list.tsv``. Rows that
    start with '#' and blank rows are skipped. Every other row needs three
    fields, separated by tabs or, when the row holds no tab, by whitespace::

        <sample>    <ref>,<alt>    <first output file name>
    """
    with open('./User/{user}/Additional_BaseEdit_process_list.tsv'.format(user=strUser)) as Input:
        for sRow in Input:
            # A blank row would otherwise end up as an empty field list and
            # raise IndexError on lCol[0] below.
            if not sRow.strip(): continue
            if sRow[0] == '#': continue

            lCol = sRow.replace('\n', '').replace('\r', '').split('\t')
            if len(lCol) == 1:
                # No tab in the row: fall back to splitting on any whitespace.
                lCol = lCol[0].split()
            print(lCol)

            strSample      = lCol[0]
            listRefAlt     = lCol[1].split(',')
            strRef         = listRefAlt[0]
            strAlt         = listRefAlt[1]
            strFirstOutput = lCol[2]

            Make_target_ref_alt_summary(strSample=strSample,
                                        strRef=strRef,
                                        strAlt=strAlt,
                                        strFirstOutput=strFirstOutput)
129 | 
130 | Main()
131 | 


--------------------------------------------------------------------------------
/Base_edit_2/Indel_contr_dict_making.py:
--------------------------------------------------------------------------------
 1 | #!/extdata1/JaeWoo/Tools/Python/miniconda2/bin/python2.7
 2 | 
 3 | import os, sys
 4 | from pdb import set_trace
 5 | 
 6 | import cPickle
 7 | 
 8 | #strSampleFolder = sys.argv[1]
 9 | 
10 | 
def MakeIndelContrDict():
    """Collect the indels of the three control samples into one file each.

    For every sample folder, each file in ./Output/<sample>/result whose name
    contains 'filtered' is read as a tab-separated table. Column 2 holds the
    indel pattern written like ['38M1D'], column 4 the aligned reference and
    column 5 the aligned query. The inserted bases (taken from the query) or
    the deleted bases (taken from the reference) are appended to the pattern,
    and every row is stored under the key '<barcode>:<pattern>_<bases>'.

    The result is written to ./Output/<sample>/<sample>_IndelSubtarction.txt,
    one line per stored row: key, '<pattern>_<bases>', reference, query.
    """
    for strFolder in ['18K_D0_1','18K_D0_2','18K_D0_3']:
        with open('./Output/%s/%s_IndelSubtarction.txt' % (strFolder, strFolder), 'w') as Output:

            dictIndel = {}

            for strName in os.listdir('./Output/%s/result' % strFolder):
                if 'filtered' not in strName:
                    continue

                with open('./Output/%s/result/%s' % (strFolder, strName)) as Input:
                    strBarcode = strName.replace('_filtered_indel.txt','')

                    for strLine in Input:
                        listField = strLine.replace('\n','').split('\t')
                        strPattern = listField[2].replace("['",'').replace("']",'')
                        listPart = strPattern.split('M')
                        intMatch = int(listPart[0])
                        strRefseq = listField[4]
                        strQueryseq = listField[5]

                        # Insertions are read from the query, deletions from
                        # the reference; rows with neither are ignored.
                        if 'I' in strPattern:
                            strOp, strSource = 'I', strQueryseq
                        elif 'D' in strPattern:
                            strOp, strSource = 'D', strRefseq
                        else:
                            continue

                        intLength = int(listPart[1].replace(strOp, ''))
                        strPosSeq = strPattern + '_' + strSource[intMatch:intMatch + intLength]
                        strKey = strBarcode + ':' + strPosSeq
                        dictIndel.setdefault(strKey, []).append([strPosSeq, strRefseq, strQueryseq])

            Output.writelines('\t'.join([strKey] + listEntry) + '\n'
                              for strKey, listEntries in dictIndel.items()
                              for listEntry in listEntries)
56 | 
57 | 
def ConcatContrDict():
    """Count the keys of the three control-sample files and pickle the counts.

    Column 0 of every row in ./Output/<sample>/<sample>_IndelSubtarction.txt
    is used as the key. The dictionary {key: number of rows} over all three
    samples is dumped to ./Output/DictSubNoDup.pickle.
    """
    dictKeyCount = {}

    for strFolder in ['18K_D0_1', '18K_D0_2', '18K_D0_3']:
        with open('./Output/%s/%s_IndelSubtarction.txt' % (strFolder, strFolder)) as Input:
            for strLine in Input:
                strKey = strLine.replace('\n', '').split('\t')[0]
                dictKeyCount[strKey] = dictKeyCount.get(strKey, 0) + 1

    with open('./Output/DictSubNoDup.pickle', 'wb') as PickleObj:
        cPickle.dump(dictKeyCount, PickleObj)
75 | 
76 | 
def Main():
    """Run the counting step only.

    MakeIndelContrDict() is left commented out, so the
    '*_IndelSubtarction.txt' files that ConcatContrDict() reads must exist
    already.
    """
    #MakeIndelContrDict()
    ConcatContrDict()


# Entry point. There is no __main__ guard, so this also runs on import.
Main()


--------------------------------------------------------------------------------
/Base_edit_2/Indel_frequency_calculator.py:
--------------------------------------------------------------------------------
  1 | #!/home/hkimlab/anaconda2/bin/python2.7
  2 | 
  3 | import os
  4 | import sys
  5 | import pdb
  6 | from datetime import datetime
  7 | from collections import namedtuple as nt
  8 | from collections import OrderedDict
  9 | 
# Output directory of the run, taken from the first command-line argument.
# The functions of this script work under '<sOutput_dir>/result/freq'.
sOutput_dir = sys.argv[1]
 11 | 
 12 | def Calculate_indel_freq():
 13 | 
 14 |     if not os.path.isdir('{outdir}/result/freq/freq_result'.format(outdir=sOutput_dir)): os.mkdir('{outdir}/result/freq/freq_result'.format(outdir=sOutput_dir))
 15 | 
 16 |     for sFile in os.listdir('{outdir}/result/freq'.format(outdir=sOutput_dir)):
 17 |         #print sFile
 18 |         if os.path.isfile(os.path.join('{outdir}/result/freq'.format(outdir=sOutput_dir), sFile)):
 19 |             with open(os.path.join('{outdir}/result/freq'.format(outdir=sOutput_dir), sFile)) as Input_freq,\
 20 |                 open(os.path.join('{outdir}/result/freq/freq_result'.format(outdir=sOutput_dir), sFile), 'w') as Output_freq:
 21 | 
 22 |                 sRef       = Input_freq.readline()  # first row is ref.
 23 |                 sDelemiter = Input_freq.readline()  # second row is '-------' delemiter.
 24 |                 Output_freq.write(sRef+sDelemiter)
 25 | 
 26 |                 lSeq_indel = []   # [namedtuple1(['TGCA', '30M3I']) namedtuple2 ...
 27 |                 dFreq_count = {}   # {'30M3I':2 ... }
 28 | 
 29 |                 for sRow in Input_freq:
 30 |                     Seq_indel = nt('Seq_indel', ['seq', 'indel', 'freq', 'ref_needle', 'query_needle'])
 31 | 
 32 |                     if sRow == sRef: continue
 33 |                     if sRow[0] == '-': continue
 34 |                     
 35 |                     try:
 36 |                         lCol   = sRow.replace('\n', '').split('\t')
 37 |                         Seq_indel.seq          = lCol[0]
 38 |                         Seq_indel.indel        = lCol[1]
 39 |                         Seq_indel.ref_needle   = lCol[3]
 40 |                         Seq_indel.query_needle = lCol[4]
 41 |                         lSeq_indel.append(Seq_indel)
 42 |                     except IndexError:
 43 |                         print sFile, lCol                        
 44 |                         continue
 45 |                         
 46 |                     try:
 47 |                         dFreq_count[Seq_indel.indel] += 1
 48 |                     except KeyError:
 49 |                         dFreq_count[Seq_indel.indel] = 1
 50 |                 #end: for sRow
 51 | 
 52 |                 # Add freq infomation pre-result data.
 53 |                 lResult = []
 54 |                 iTotal = len(lSeq_indel)
 55 | 
 56 |                 #print 'dFreq_count', dFreq_count
 57 |                 #print 'lSeq_indel', lSeq_indel
 58 | 
 59 |                 for Seq_indel in lSeq_indel:
 60 |                     iCount = dFreq_count[Seq_indel.indel]
 61 |                     Seq_indel.freq  = float(iCount) / iTotal
 62 |                     lResult.append(Seq_indel)
 63 | 
 64 |                 lResult.sort(key=lambda x: x.indel)
 65 |                 lResult.sort(key=lambda x: x.freq, reverse=True)
 66 | 
 67 |                 #print 'lResult', lResult
 68 | 
 69 |                 for Seq_indel in lResult:
 70 |                     #print Seq_indel.__dict__
 71 |                     Output_freq.write('\t'.join(map(str, [Seq_indel.seq, Seq_indel.indel, Seq_indel.freq, Seq_indel.ref_needle, Seq_indel.query_needle]))+'\n')
 72 |             #end: with open
 73 |         #end: if os.path
 74 |     #end: sFile
 75 | 
 76 | 
 77 | def Make_indel_summary():
 78 | 
 79 |     lOutput = []
 80 | 
 81 |     for sFile in os.listdir('{outdir}/result/freq/freq_result'.format(outdir=sOutput_dir)):
 82 |         if os.path.isfile(os.path.join('{outdir}/result/freq/freq_result'.format(outdir=sOutput_dir), sFile)):
 83 |             with open(os.path.join('{outdir}/result/freq/freq_result'.format(outdir=sOutput_dir), sFile)) as Input_freq:
 84 | 
 85 |                 sRef       = Input_freq.readline()  # first row is ref.
 86 |                 sDelemiter = Input_freq.readline()  # second row is '-------' delemiter.
 87 | 
 88 |                 dINDEL = OrderedDict()
 89 | 
 90 |                 lTable = [sRow.replace('\n', '').split('\t') for sRow in Input_freq]
 91 |                 iTotal = len(lTable)
 92 | 
 93 |                 for lCol in lTable:
 94 |                     sINDEL = lCol[1]
 95 |                     try:
 96 |                         dINDEL[sINDEL] += 1
 97 |                     except KeyError:
 98 |                         dINDEL[sINDEL] = 1
 99 | 
100 |                 dINDEL = OrderedDict(sorted(dINDEL.items(), key=lambda t: t[1], reverse=True))
101 |                 
102 |                 llINDEL = [[sKey, iValue, round(iValue/float(iTotal),3)*100] for sKey, iValue in dINDEL.items()]
103 |                 sINDEL_result = ''.join([':'.join(map(str, lINDEL))+', ' for lINDEL in llINDEL])[:-2]
104 | 
105 |                 lOutput.append([sFile, iTotal, sINDEL_result])
106 |                 #Output_freq.write('\t'.join([sFile, sTotal, sINDEL_result]) + '\n')
107 | 
108 |     lOutput = sorted(lOutput, key=lambda x: x[1], reverse=True)
109 | 
110 |     with open('{outdir}/result/freq/freq_result/Indel_summary.txt'.format(outdir=sOutput_dir), 'w') as Output_freq:
111 |         for lCol in lOutput:
112 |             Output_freq.write('\t'.join(map(str, lCol)) + '\n')
113 | 
114 | 
# Run both stages in order: Make_indel_summary() reads the files that
# Calculate_indel_freq() writes to '<outdir>/result/freq/freq_result'.
# The start and end times are printed around the run.
if __name__ == '__main__':
    print 'Indel frequency calculator start: ', datetime.now()
    Calculate_indel_freq()
    Make_indel_summary()
    print 'Indel frequency calculator end: ', datetime.now()
120 | 


--------------------------------------------------------------------------------
/Base_edit_2/Kill_jobs.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Force-kill (SIGKILL) every running base-edit job whose `ps aux` line
# contains "hkim".
#
# To review the jobs first:
#   ps aux | grep hkim | grep BaseEdit_freq_crispresso.py | less

for script in Run_BaseEdit_freq.py BaseEdit_freq_crispresso.py; do
    # "grep -v grep" drops the grep processes of this very pipeline, whose
    # command lines contain the script name as well.
    pids=$(ps aux | grep hkim | grep "${script}" | grep -v grep | awk '{print $2}')

    # Call kill only when something matched; "kill -9" without a PID is a
    # usage error.
    if [ -n "${pids}" ]; then
        # Unquoted on purpose: one argument per PID.
        kill -9 ${pids}
    fi
done
8 | 


--------------------------------------------------------------------------------
/Base_edit_2/MakeUserFolder.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | user=SH
 4 | project=24K_screening
 5 | 
 6 | 
 7 | [ ! -d ./Input ] && { `mkdir ./Input`; }
 8 | [ ! -d ./User ] && { `mkdir ./User`; }
 9 | [ ! -d ./Output ] && { `mkdir ./Output`; }
10 | 
11 | [ ! -d ./Input/${user} ] && { `mkdir ./Input/${user}`; }
12 | [ ! -d ./Input/${user}/Query ] && { `mkdir ./Input/${user}/Query`; }
13 | [ ! -d ./Input/${user}/Query/${project} ] && { `mkdir ./Input/${user}/Query/${project}`; }
14 | [ ! -d ./Input/${user}/Reference ] && { `mkdir ./Input/${user}/Reference`; }
15 | [ ! -d ./Input/${user}/Reference/${project} ] && { `mkdir ./Input/${user}/Reference/${project}`; }
16 | 
17 | [ ! -d ./User/${user} ] && { `mkdir ./User/${user}`; }
18 | > ./User/${user}/${project}.txt
19 | 


--------------------------------------------------------------------------------
/Base_edit_2/Make_user_folder.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | user=JaeWoo
 4 | project=JaeWoo_test_samples4
 5 | 
 6 | 
 7 | [ ! -d ./Input ] && { `mkdir ./Input`; }
 8 | [ ! -d ./User ] && { `mkdir ./User`; }
 9 | [ ! -d ./Output ] && { `mkdir ./Output`; }
10 | 
11 | [ ! -d ./Input/${user} ] && { `mkdir ./Input/${user}`; }
12 | [ ! -d ./Input/${user}/Query ] && { `mkdir ./Input/${user}/Query`; }
13 | [ ! -d ./Input/${user}/Query/${project} ] && { `mkdir ./Input/${user}/Query/${project}`; }
14 | [ ! -d ./Input/${user}/Reference ] && { `mkdir ./Input/${user}/Reference`; }
15 | [ ! -d ./Input/${user}/Reference/${project} ] && { `mkdir ./Input/${user}/Reference/${project}`; }
16 | 
17 | [ ! -d ./User/${user} ] && { `mkdir ./User/${user}`; }
18 | > ./User/${user}/${project}.txt
19 | 


--------------------------------------------------------------------------------
/Base_edit_2/README.md:
--------------------------------------------------------------------------------
1 | # Base_edit_2
2 | Fast CRISPR base edit count tool
3 | 
4 | 
5 | ### Detailed options
6 |     ./Run_BaseEdit_freq.py -h
7 | 


--------------------------------------------------------------------------------
/Base_edit_2/Run_BaseEdit_freq.py:
--------------------------------------------------------------------------------
  1 | import os, re, sys, pdb, math, logging
  2 | 
  3 | import subprocess as sp
  4 | 
  5 | from pdb import set_trace
  6 | from datetime import datetime
  7 | from optparse import OptionParser
  8 | 
  9 | sys.path.insert(0, os.path.dirname(os.getcwd()))
 10 | from Core.CoreSystem import InitialFolder, UserFolderAdmin, Helper, RunMulticore, CheckProcessedFiles
 11 | 
 12 | 
 13 | class clsBaseEditRunner(UserFolderAdmin):
 14 | 
    def __init__(self, strSample, strRef, options, InstInitFolder):
        """Store the run options of one sample and prepare its folders.

        strSample      -- sample name; also part of the Tmp/Alignment path.
        strRef         -- reference name, stored as self.strRef.
        options        -- option object; the attributes read here are
                          multicore, gap_open, gap_extend, target_window,
                          indel_check_pos, target_ref_alt, PAM_seq, PAM_pos
                          and Guide_pos.
        InstInitFolder -- object whose strLogPath is passed on to
                          UserFolderAdmin.__init__.
        """
        UserFolderAdmin.__init__(self, strSample, strRef, options, InstInitFolder.strLogPath)

        # strSample is assigned before the two calls below.
        # NOTE(review): presumably because they read it - confirm.
        self.strSample        = strSample
        self._RemoveTmpBeforStart()
        self.MakeSampleFolder() ## inheritance

        self.strRef           = strRef
        self.intCore          = options.multicore
        self.strGapOpen       = options.gap_open
        self.strGapExtend     = options.gap_extend
        self.strTargetWindow  = options.target_window
        self.strIndelCheckPos = options.indel_check_pos
        self.strTargetRefAlt  = options.target_ref_alt

        # NOTE(review): strRefDir (and strUser / strProject below) are read but
        # not assigned in this method - presumably set by
        # UserFolderAdmin.__init__; confirm.
        self.strBarcodeFile      = os.path.join(self.strRefDir, 'Barcode.txt')
        self.strReferenceSeqFile = os.path.join(self.strRefDir, 'Reference.txt')
        self.strRefFile          = os.path.join(self.strRefDir, 'Reference.fa')

        self.strPamSeq   = options.PAM_seq
        self.strPamPos   = options.PAM_pos
        self.strGuidePos = options.Guide_pos

        Helper.MakeFolderIfNot('./Output/{user}/{project}/{sample}/Tmp/Alignment'.format(user=self.strUser,
                                                                                         project=self.strProject,
                                                                                         sample=self.strSample))
 41 | 
 42 |     def MakeReference(self):
 43 | 
 44 |         with open(self.strBarcodeFile) as Barcode, \
 45 |             open(self.strReferenceSeqFile) as Ref, \
 46 |             open(self.strRefFile, 'w') as Output:
 47 | 
 48 |             listBarcode = Helper.RemoveNullAndBadKeyword(Barcode)
 49 |             listRef     = Helper.RemoveNullAndBadKeyword(Ref)
 50 | 
 51 |             ## defensive
 52 |             assert len(listBarcode) == len(listRef), 'Barcode and Reference must be a same row number.'
 53 | 
 54 |             dictBarcode = {}
 55 | 
 56 |             for strBarcode in listBarcode:
 57 |                 strBarcode = strBarcode.replace('\n','').replace('\r','').upper()
 58 |                 Helper.CheckIntegrity(self.strBarcodeFile, strBarcode) ## defensive
 59 |                 listBarcode   = strBarcode.split(':')
 60 |                 strBarSample  = listBarcode[0]
 61 |                 strBarcode    = listBarcode[1]
 62 |                 dictBarcode[strBarSample] = strBarcode
 63 | 
 64 |             for strRef in listRef:
 65 |                 strRef = strRef.replace('\n','').replace('\r','').upper()
 66 |                 Helper.CheckIntegrity(self.strBarcodeFile, strRef) ## defensive
 67 |                 listRef      = strRef.split(':')
 68 |                 strRefSample = listRef[0]
 69 |                 strRef       = listRef[1]
 70 | 
 71 |                 try:
 72 |                     sBarcode = dictBarcode[strRefSample]
 73 |                     Output.write('%s\t%s\t%s\n' % (strRefSample, sBarcode, strRef))
 74 |                 except KeyError:
 75 |                     logging.error('no matching')
 76 |                     logging.error(strRefSample,strRef)
 77 | 
 78 |     def MakeIndelSearcherCmd(self):
 79 | 
 80 |         listCmd = []
 81 | 
 82 |         with open(self.strRefFile) as BarcodeRef:
 83 | 
 84 |             for strBarcodeRef in BarcodeRef:
 85 |                 listBarcodeRef = strBarcodeRef.replace('\n', '').replace('\r','').split('\t')
 86 |                 strFileName    = listBarcodeRef[0]
 87 |                 strBarcode     = listBarcodeRef[1]
 88 |                 strRef         = listBarcodeRef[2]
 89 | 
 90 |                 self._CheckOptionsCorrect(strBarcode) ## defensive
 91 | 
 92 |                 strForwardQueryFile = './Input/{user}/Query/{project}/{sample}/{file_name}.txt'.format (user=self.strUser,
 93 |                                                                                                        project=self.strProject,
 94 |                                                                                                        sample=self.strSample,
 95 |                                                                                                        file_name=strFileName)
 96 | 
 97 |                 strCmd = ('{python} ./BaseEdit_freq_crispresso.py {forw} {GapO} {GapE} {barcode} {ref} {target_window} {indel_check_pos}'
 98 |                           ' {target_ref_alt} {outdir} {file_name} {PAM_seq} {PAM_pos} {guide_pos} {log}').format(
 99 |                         python=self.strPython, forw=strForwardQueryFile, GapO=self.strGapOpen, GapE=self.strGapExtend,
100 |                         barcode=strBarcode, ref=strRef, target_window=self.strTargetWindow, indel_check_pos=self.strIndelCheckPos,
101 |                         target_ref_alt=self.strTargetRefAlt, outdir=self.strOutSampleDir, file_name=strFileName,
102 |                         PAM_seq=self.strPamSeq, PAM_pos=self.strPamPos, guide_pos=self.strGuidePos, log=self.strLogPath)
103 |                 listCmd.append(strCmd)
104 | 
105 |         return listCmd
106 | 
107 |     def MakeMergeTarget(self):
108 |         strCmd = '{python} ./Summary_all_trim.py {output} {sample} {ref_alt}'.format(python=self.strPython, output=self.strOutSampleDir,
109 |                                                                                      sample=self.strSample, ref_alt=self.strTargetRefAlt)
110 |         sp.call(strCmd, shell=True)
111 | 
112 |     def CopyToAllResultFolder(self):
113 | 
114 |         sp.call('cp $(find ./Output/{user}/{project}/*/Result/*Merge* -name "*_Summary.txt") ./Output/{user}/{project}/All_results'.format(
115 |             user=self.strUser, project=self.strProject), shell=True)
116 | 
117 |     def _RemoveTmpBeforStart(self):
118 |         strFolderPath = './Output/{user}/{project}/{sample}'.format(user=self.strUser,
119 |                                                                     project=self.strProject,
120 |                                                                     sample=self.strSample)
121 | 
122 |         if os.path.isdir(strFolderPath):
123 |             strCmd = 'rm -r %s' % strFolderPath
124 | 
125 |             Helper.PreventFromRmMistake(strCmd) ## defensive
126 | 
127 |             logging.info('Delete the %s folder before starting if these were existed.' % self.strSample)
128 |             sp.call(strCmd.format(user=self.strUser,
129 |                                   project=self.strProject,
130 |                                   sample=self.strSample), shell=True)
131 | 
132 |     ## defensive
133 |     def _CheckOptionsCorrect(self, strBarcode):
134 |         intBarcodeLen  = len(strBarcode)
135 |         intTargetStart = int(self.strTargetWindow.split('-')[0])
136 |         intTargetEnd   = int(self.strTargetWindow.split('-')[1])
137 |         intIndelStart  = int(self.strIndelCheckPos.split('-')[0])
138 |         intIndelEnd    = int(self.strIndelCheckPos.split('-')[1])
139 | 
140 |         intGuideStart  = int(self.strGuidePos.split('-')[0])
141 |         intGuideEnd    = int(self.strGuidePos.split('-')[1])
142 | 
143 |         intPamStart    = int(self.strPamPos.split('-')[0])
144 |         intPamEnd      = int(self.strPamPos.split('-')[1])
145 | 
146 |         intPamLen      = len(self.strPamSeq)
147 | 
148 |         if intBarcodeLen >= intTargetStart:
149 |             logging.error('Target window start position must be larger than barcode length')
150 |             logging.error('Barcode length: %s, Window start: %s' % (intBarcodeLen, intTargetStart))
151 |             raise Exception
152 | 
153 |         if intTargetStart > intGuideStart or intTargetEnd < intGuideEnd:
154 |             logging.error('Target window start, end range must be larger than guide range')
155 |             logging.error('Target window: %s, Guide window: %s' % (self.strTargetWindow, self.strGuidePos))
156 |             raise Exception
157 | 
158 |         if intIndelStart >= intGuideEnd or intIndelEnd >= intGuideEnd:
159 |             logging.error('Guide end position must be larger than Indel position')
160 |             logging.error('Guide end position: %s, Indel position: %s' % (intGuideEnd, self.strIndelCheckPos))
161 |             raise Exception
162 | 
163 |         if intPamStart <= intGuideEnd or intPamEnd <= intGuideEnd:
164 |             logging.error('PAM position must be larger than Guide end pos')
165 |             logging.error('PAM position: %s, Guide end position: %s, ' % (self.strPamPos, intGuideEnd))
166 |             raise Exception
167 | 
168 |         if (intPamEnd - intPamStart + 1) != intPamLen:
169 |             logging.error('PAM size and PAM seq must be same length.')
170 |             logging.error('PAM pos: %s, PAM seq: %s, ' % (self.strPamPos, self.strPamSeq))
171 |             raise Exception
172 | 
173 | 
def Main():
    """Command-line entry point of the BaseEdit frequency analyzer.

    Parses the options, prepares the user/project folders and logging, then
    for every sample listed in the project file: builds the reference file,
    runs the per-barcode commands with RunMulticore and merges the results.
    After the loop the merged summaries are copied to the All_results folder.
    """
    print('BaseEdit program start: %s' % datetime.now())

    sCmd = ("BaseEdit frequency analyzer\n\n./Run_BaseEdit_freq.py -t 15 -w 16-48 --indel_check_pos 39-40 --target_ref_alt A,T --PAM_seq NGG --PAM_pos 43-45 --Guide_pos 23-42"
            " --gap_open -10 --gap_extend 1\n\n"
            "The sequence position is the one base position (start:1)\n"
            "1: Barcode\n"
            "2: Base target window (end pos = PAM pos +3)\n"
            "3: Indel check pos\n"
            "4: PAM pos\n"
            "5: Guide pos (without PAM)\n\n"
            "TATCTCTATCAGCACACAAGCATGCAATCACCTTGGGTCCAAAGGTCC\n"
            "<------1------><----------------2--------------->\n"
            "                                     <3>  <4>   \n"
            "                      <---------5-------->      \n\n")

    parser = OptionParser(sCmd)

    parser.add_option("-t", "--thread", default="1", type="int", dest="multicore", help="multiprocessing number")
    parser.add_option('--gap_open', default='-10', type='float', dest='gap_open', help='gap open: -100~0')
    parser.add_option('--gap_extend', default='1', type='float', dest='gap_extend', help='gap extend: 1~100')
    parser.add_option("-w", "--target_window", type="str", dest="target_window", help="a window size for target sequence : 20-48")
    parser.add_option("--indel_check_pos", type="str", dest="indel_check_pos", help="indel check position to filter : 39-40; insertion 39, deletion 39 & 40")
    parser.add_option("--target_ref_alt", type="str", dest="target_ref_alt", help="Ref 'A' is changed to Alt 'T': A,T")
    parser.add_option("--PAM_seq", type="str", dest="PAM_seq", help="PAM sequence: NGG, NGC ...")
    parser.add_option("--PAM_pos", type="str", dest="PAM_pos", help="PAM position range in the reference seqeunce : 43-45")
    parser.add_option("--Guide_pos", type="str", dest="Guide_pos", help="Guide position range in the reference seqeunce : 23-42")
    parser.add_option('--python', dest='python', help='The python path including the CRISPResso2')
    parser.add_option('--user', dest='user_name', help='The user name with no space')
    parser.add_option('--project', dest='project_name', help='The project name with no space')

    options, args = parser.parse_args()

    InstInitFolder = InitialFolder(options.user_name, options.project_name, os.path.basename(__file__))
    InstInitFolder.MakeDefaultFolder()
    InstInitFolder.MakeInputFolder()
    InstInitFolder.MakeOutputFolder()

    ## Log to the project log file and echo the same records to stdout.
    logging.basicConfig(format='%(process)d %(levelname)s %(asctime)s : %(message)s',
                        level=logging.DEBUG,
                        filename=InstInitFolder.strLogPath,
                        filemode='a')
    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))

    logging.info('Program start')
    if options.multicore > 15:
        logging.warning('Optimal threads <= 15')
    logging.info(str(options))

    with open(InstInitFolder.strProjectFile) as Sample_list:

        listSamples = Helper.RemoveNullAndBadKeyword(Sample_list)

        strInputProject = './Input/{user}/Query/{project}'.format(user=options.user_name, project=options.project_name)

        @CheckProcessedFiles
        def RunPipeline(**kwargs):

            ## Stays None when every line is a comment or cannot be split,
            ## so the final copy step can be skipped instead of failing
            ## on an unbound name.
            InstBaseEdit = None

            for strSample in listSamples:
                if strSample[0] == '#': continue

                tupSampleInfo = Helper.SplitSampleInfo(strSample)
                if not tupSampleInfo: continue
                strSample, strRef, strExpCtrl = tupSampleInfo

                InstBaseEdit = clsBaseEditRunner(strSample, strRef, options, InstInitFolder)
                InstBaseEdit.MakeReference()

                listCmd = InstBaseEdit.MakeIndelSearcherCmd()
                ###print(lCmd[:5])
                RunMulticore(listCmd, options.multicore)  ## from CoreSystem.py

                InstBaseEdit.MakeMergeTarget()

            if InstBaseEdit is None:
                logging.warning('No sample was processed; nothing to copy to All_results.')
            else:
                InstBaseEdit.CopyToAllResultFolder()

        RunPipeline(InstInitFolder=InstInitFolder,
                    strInputProject=strInputProject,
                    listSamples=listSamples,
                    logging=logging)

    print('BaseEdit program end: %s' % datetime.now())
256 | 
257 | 
## Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    Main()
260 | 


--------------------------------------------------------------------------------
/Base_edit_2/Run_cmd.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | ####################
 4 | ## User parameter ##
 5 | ###################################
 6 | 
 7 | user=JaeWoo
 8 | project=JaeWoo_test_samples
 9 | target_window=20-59
10 | indel_check_pos=50-51
11 | target_ref_alt=A,G
12 | PAM_seq=NGG
13 | PAM_pos=54-56
14 | Guide_pos=23-53
15 | 
16 | thread=15
17 | 
18 | gap_open=-10 ## default
19 | gap_extend=1 ## default
20 | 
21 | ###################################
22 | 
23 | while read python_path;do
24 |     python=$python_path
25 | done < ../PythonPath.txt
26 | 
27 | [ ! -d ./Output/${user} ] && { `mkdir ./Output/${user}`; }
28 | [ ! -d ./Output/${user}/${project} ] && { `mkdir ./Output/${user}/${project}`; }
29 | [ ! -d ./Output/${user}/${project}/Log ] && { `mkdir ./Output/${user}/${project}/Log`; }
30 | 
31 | nohup $python ./Run_BaseEdit_freq.py --python $python --user $user --project $project -w $target_window --indel_check_pos $indel_check_pos \
32 |                                --target_ref_alt $target_ref_alt --PAM_seq $PAM_seq --PAM_pos $PAM_pos --Guide_pos $Guide_pos \
33 |                                --gap_open $gap_open --gap_extend $gap_extend -t $thread > ./Output/${user}/${project}/Log/log.txt 2>&1 &
34 | 


--------------------------------------------------------------------------------
/Base_edit_2/Run_each_base_summary.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | ####################
 4 | ## User parameter ##
 5 | ###################################
 6 | 
 7 | user=SH
 8 | project=24K_screening
 9 | 
10 | ###################################
11 | 
12 | 
13 | 
14 | while read python_path;do
15 |     python=$python_path
16 | done < ../PythonPath.txt
17 | 
18 | nohup $python ./Each_base_summary.py $user $project > ./Output/${user}/${project}/Log/Each_base_summary_log.txt 2>&1 & 
19 | 


--------------------------------------------------------------------------------
/Base_edit_2/Run_sequence_freq.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | 
 4 | ####################
 5 | ## User parameter ##
 6 | ###################################
 7 | 
 8 | user=SH
 9 | project=24K_screening
10 | window=25-34
11 | thread=4
12 | 
13 | ###################################
14 | 
15 | 
16 | 
17 | while read python_path;do
18 |     python=$python_path
19 | done < ../PythonPath.txt
20 | 
21 | nohup $python ./Sequence_freq.py $user $project $window $thread > ./Output/${user}/${project}/Log/Sequence_freq_log.txt 2>&1 &
22 | 


--------------------------------------------------------------------------------
/Base_edit_2/Sequence_freq.py:
--------------------------------------------------------------------------------
  1 | #!/home/hkim/anaconda2/bin/python2.7
  2 | 
  3 | import os,sys
  4 | import numpy as np
  5 | from collections import Counter
  6 | from collections import OrderedDict
  7 | import multiprocessing as mp
  8 | 
  9 | import logging
 10 | from pdb import set_trace
 11 | logging.basicConfig(format='%(asctime)s %(message)s', datefmt='%m/%d/%Y %I:%M:%S', level=logging.DEBUG)
 12 | 
 13 | sys.path.insert(0, os.path.dirname(os.getcwd()))
 14 | from Core.CoreSystem import Helper
 15 | 
 16 | 
 17 | try:
 18 |     strUser    = sys.argv[1]
 19 |     strProject = sys.argv[2]
 20 |     lWindow    = sys.argv[3].split('-')
 21 |     iWinStart  = int(lWindow[0])
 22 |     iWinEnd    = int(lWindow[1])
 23 |     iCore      = int(sys.argv[4])
 24 | 
 25 | 
 26 | except IndexError:
 27 |     print('\nUsage: ./Sequence_freq.py SH 24K_screening 25-33 10\n'
 28 |           '         ./Sequence_freq.py user_name project_name window_range thread\n')
 29 |     sys.exit()
 30 | 
 31 | 
 32 | def Count_seq_freq(lPara):
 33 | 
 34 |     """ aligned_BaseEdit.txt
 35 |     ACTAGCTATCGCTCACTCTGGGGTCAGGGACAGTGGACTCGAAGGAGAAGCTTGGCGTAACTAGATCTCTACTCTACCACTTGTACTTCAGCGGTCAGCTTACTCGACTTAA        CGCTTGAAAAAGTGGCACCGAGTCGGTGCTTTTTTGACTAGCTATCGCTCACTCTGGGGTCAGGGGCAGTGGACTCGAAGGAGAAGCTTGGCGTAACTAGATCTCTACTCTACCACTTGTACTTCAGCGGTCAGCTTACTCGACTTAATA  []
 36 |     [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]]
 37 |     ACTAGCTATCGCTCACTCTGGGGTCAGGGACAGTGGACTCGAAGGAGAAGCTTGGCGTAACTAGATCTCTACTCTACCACTTGTACTTCAGCGGTCAGCTTACTCGACTTA--A      ACTAGCTATCGCTCACTCTGGGGTCAGGGGCAGTGGACTCGAAGGAGAAGCTTGGCGTAACTAGATCTCTACTCTACCACTTGTACTTCAGCGGTCAGCTTACTCGACTTAATA
 38 |     """
 39 | 
 40 |     strSample           = lPara[0]
 41 |     sFile_path          = lPara[1]
 42 |     sTotal_readcnt_path = lPara[2]
 43 |     dInput_fa           = lPara[3]
 44 |     print ("Count_seq_freq: ", strSample, sFile_path, sTotal_readcnt_path)
 45 | 
 46 |     try:
 47 |         with open('./Output/{user}/{project}/{sample}/Result/Seq_freq.txt'.format(user=strUser,
 48 |                                                                                    project=strProject,
 49 |                                                                                    sample=strSample), 'w') as Output:
 50 | 
 51 |             Output.write('Filename\tSeq\tMotif\tCount\tTotal_cnt\tProportion\tSubstitution\n')
 52 | 
 53 |             ## A project has many file. The total read count is summation in the each file.
 54 |             for iFile_num, sFile in enumerate(os.listdir(sFile_path)):
 55 |                 #set_trace()
 56 |                 if 'aligned' in sFile:
 57 |                     # print(iFile_num)
 58 |                     sFilename = sFile.replace('_aligned_BaseEdit', '').split('.')[:-1][0] ## To search filename in other folder.
 59 |                     sTotal_readcnt_file = sFilename + '_Summary.txt'                      ## extract totral read count for the sequence frequency.
 60 | 
 61 |                     with open(sFile_path + '/' + sFile) as aligned_BaseEdit,\
 62 |                         open(sTotal_readcnt_path + '/' + sTotal_readcnt_file) as Total_readcnt:
 63 |                         #print(sFile_path + '/' + sFile)
 64 | 
 65 |                         iTotal_wo_indel = 0
 66 | 
 67 |                         for i, sRow in enumerate(Total_readcnt):
 68 |                             if i == 0: continue
 69 |                             lCol = sRow.replace('\n', '').split('\t')
 70 |                             iTotal_read = int(lCol[3])                   ## This is read counts of a matched barcode.
 71 |                             iIndel_read = int(lCol[4]) + int(lCol[5]) + int(lCol[6])
 72 |                             iTotal_wo_indel = iTotal_read - iIndel_read  ## Total read is without indel reads
 73 |                             break                                        ## 2 row is target, over 3 is none
 74 | 
 75 |                         lTarget_seq        = []
 76 |                         sRef_seq           = ''
 77 |                         dSeq_wt_extend     = {}  ## WT + motif(target sequence) + WT
 78 | 
 79 |                         for i, sRow in enumerate(aligned_BaseEdit):
 80 | 
 81 |                             lCol       = sRow.replace('\n', '').split('\t')
 82 |                             sQuery_seq = lCol[5]
 83 | 
 84 |                             if sRef_seq == '':   ## Reference is same in the file, so store once.
 85 |                                 sRef_seq = lCol[0]
 86 |                                 dSeq_wt_extend[sRef_seq[iWinStart - 1: iWinEnd]] = sRef_seq
 87 | 
 88 |                             lRef_seq_with_motif = list(sRef_seq)
 89 |                             lRef_seq_with_motif[ iWinStart-1 : iWinEnd ] = list(sQuery_seq[ iWinStart-1 : iWinEnd ])
 90 |                             sRef_seq_with_motif = ''.join(lRef_seq_with_motif)
 91 | 
 92 |                             dSeq_wt_extend[sQuery_seq[ iWinStart-1 : iWinEnd ]] = sRef_seq_with_motif
 93 |                             lTarget_seq.append(sQuery_seq[ iWinStart-1 : iWinEnd ])
 94 | 
 95 |                         iNormal  = iTotal_wo_indel - len(lTarget_seq)
 96 |                         sRef_seq = sRef_seq[ iWinStart-1 : iWinEnd ]
 97 |                         dSeq_cnt = Counter(lTarget_seq)
 98 | 
 99 |                         try:
100 |                             iRef_cnt_in_aligned = dSeq_cnt[sRef_seq]  ## check normal sequence because substitution exists outside of window size.
101 |                             iNormal = iNormal + iRef_cnt_in_aligned
102 |                             del dSeq_cnt[sRef_seq]
103 |                         except KeyError:
104 |                             pass
105 | 
106 |                         if iNormal > 0:
107 |                             if sRef_seq == '':                     ## aligned result file can be none result file. So extract from input file.
108 |                                 sRef_seq = dInput_fa[sFilename][1] ## dInput_fa[0] : full ref, dInput_fa[1] : target ref
109 |                                 dSeq_wt_extend[sRef_seq] = dInput_fa[sFilename][0]
110 |                             try:
111 |                                 Output.write('\t'.join(map(str, [sFilename, dSeq_wt_extend[sRef_seq], sRef_seq, iNormal, iTotal_wo_indel, round(iNormal/float(iTotal_wo_indel),4), 'ref_from_result']))+'\n')
112 |                             except Exception as e:
113 |                                 print(e, 'line150')
114 |                                 set_trace()
115 | 
116 |                         elif iNormal == 0:  ## if iNormal = 0, that means no result generation. because aligned_BaseEdit file is not contained non-read file.
117 |                             sRef_seq = dInput_fa[sFilename][1]
118 |                             dSeq_wt_extend[sRef_seq] = dInput_fa[sFilename][0]
119 |                             try:
120 |                                 Output.write('\t'.join(map(str, [sFilename, dSeq_wt_extend[sRef_seq], sRef_seq, iNormal, iTotal_wo_indel, iNormal, 'ref_from_input'])) + '\n')
121 |                             except Exception as e:
122 |                                 print(e, 'line158')
123 |                                 set_trace()
124 | 
125 |                         for sSeq, iCnt in dSeq_cnt.most_common():
126 |                             try:
127 |                                 Output.write('\t'.join(map(str, [sFilename, dSeq_wt_extend[sSeq], sSeq, iCnt, iTotal_wo_indel, round(iCnt/float(iTotal_wo_indel),4), 'alt']))+'\n')
128 |                             except Exception as e:
129 |                                 print(lPara[0], sFilename)
130 |                                 print(iCnt, iTotal_wo_indel)
131 |                                 print(e, 'line175')
132 |                                 #pass
133 |                                 set_trace()
134 |                     #END: for
135 |                 #END: with
136 |             #END: for
137 |         #END: with
138 |     except Exception as e:
139 |         print(e)
140 |         print("Error in the input: ", strSample, sFilename, sTotal_readcnt_file)
141 |         pass
142 | #END: def
143 | 
144 | 
def Make_ref_dict(strRef):
    """Load ./Input/<user>/Reference/<project>/<strRef>/Reference.fa into a dict.

    Each row is tab-separated: file name, barcode, reference sequence, e.g.
        YSKim_0525+01614_98_repeat1    TATACACGCATGTAT    TTTGTATACACGCATGTATGCATCC...
    (1 file has 1 barcode.)

    Returns:
        {file name: [reference from the barcode start onwards,
                     the iWinStart..iWinEnd (1-based) window of that sequence]}
    """
    strRefPath = './Input/{user}/Reference/{project}/{ref}/Reference.fa'.format(user=strUser,
                                                                                project=strProject,
                                                                                ref=strRef)
    dRefByFile = {}

    with open(strRefPath) as RefTable:
        for strLine in RefTable:
            lFields  = strLine.replace('\n', '').split('\t')
            sName    = lFields[0]
            sBar     = lFields[1]
            sFullRef = lFields[2]

            ## Drop everything in front of the barcode.
            sFromBarcode     = sFullRef[sFullRef.index(sBar):]
            dRefByFile[sName] = [sFromBarcode, sFromBarcode[iWinStart - 1: iWinEnd]]

    return dRefByFile
170 | 
171 | 
def Count_group():
    """Merge the Seq_freq.txt tables of every output folder that belongs to a group.

    Group names are read from Group_list.txt (one per line; lines starting
    with '#' and blank lines are ignored).  A folder under ./Output belongs
    to a group when the group name is a substring of the folder name; the
    Group_result folder itself is never treated as an input.

    Input rows (./Output/<folder>/Summary/Seq_freq.txt), tab-separated:
        Filename  Seq  Motif  Count  Total_cnt  Proportion  Substitution
        Doench2014_1000  ...  AGGGACA  13  14  0.9286  ref_from_result
        Doench2014_1000  ...  AG-----   1  14  0.0714  alt

    For each group, ./Output/Group_result/<group>/Seq_freq.txt is written with
      * Count summed over the folders for every (Filename, Motif) pair,
      * Total_cnt summed over the folders for every Filename,
      * Proportion recomputed as Count / Total_cnt (0 when the total is 0),
      * 'ref_from_result' / 'ref_from_input' replaced by 'wt'.
    Rows are ordered by Filename, reference rows before 'alt' rows.

    Raises:
        ValueError: when an input row has fewer than the seven columns above.
    """

    sHeader      = ''
    dTotal_cnt   = {}
    sGroup_dir   = './Output/Group_result'

    with open('Group_list.txt') as Group_list:
        lGroupnames = [sLine.replace('\n', '').strip() for sLine in Group_list if sLine[0] != "#"]
    ## A blank name would be a substring of every folder name, so drop it.
    lGroupnames = [sGroupname for sGroupname in lGroupnames if sGroupname]

    ## Input tables of every group, looked up once.
    dSeq_freq_paths = {}
    for sGroupname in lGroupnames:
        if not os.path.isdir(sGroup_dir): os.mkdir(sGroup_dir)
        dSeq_freq_paths[sGroupname] = ['./Output/%s/Summary/Seq_freq.txt' % sFile
                                       for sFile in os.listdir('./Output')
                                       if sGroupname in sFile and sFile != 'Group_result']

    ## Pass 1: sum the total reads count of the group.
    ## The total reads count is always same within one file name of one table.
    for sGroupname in lGroupnames:
        for sPath in dSeq_freq_paths[sGroupname]:
            with open(sPath) as SeqFreq:

                sHeader = SeqFreq.readline()
                dSelect_one_total_cnt = {}
                for sRow in SeqFreq:
                    lCol = sRow.replace('\n', '').split('\t')
                    if len(lCol) < 7:
                        raise ValueError('Malformed row in %s: %r' % (sPath, sRow))
                    dSelect_one_total_cnt[lCol[0]] = int(lCol[4])

            for sFilename, iTotal_read_cnt in dSelect_one_total_cnt.items():
                sTotal_key = sGroupname + '_' + sFilename
                dTotal_cnt[sTotal_key] = dTotal_cnt.get(sTotal_key, 0) + iTotal_read_cnt

    ## Pass 2: sum the counts per (file name, motif) and write the group table.
    for sGroupname in lGroupnames:
        dSeq_freq = OrderedDict()   ## ('GECKO_6367_GATCTGCTC', ['GECKO_6367', <seq>, 'GATCTGCTC', 2, 156, '0.0128', 'alt'])
                                    ## Unique key, only one list.

        for sPath in dSeq_freq_paths[sGroupname]:
            with open(sPath) as SeqFreq:

                sHeader = SeqFreq.readline()

                for sRow in SeqFreq:
                    lCol      = sRow.replace('\n', '').split('\t')
                    sFilename = lCol[0]
                    sFile_seq = lCol[0] + '_' + lCol[2]  ## Unique name : Doench2014_1000_CTCTGGGGT

                    lCol[3] = int(lCol[3])
                    lCol[4] = dTotal_cnt[sGroupname + '_' + sFilename]

                    if sFile_seq in dSeq_freq:
                        dSeq_freq[sFile_seq][3] += lCol[3]
                    else:
                        dSeq_freq[sFile_seq] = lCol  ## initial assignment

        ## x[0] : key, x[1] : value, x[1][6]: alt, wt category, x[1][0]: filename.
        ## Both sorts are stable: category first (descending), then file name.
        llSeq_freq = sorted(sorted(dSeq_freq.items(), key=lambda x:x[1][6], reverse=True), key=lambda x:x[1][0])

        sOut_dir = './Output/Group_result/%s' % sGroupname
        if not os.path.isdir(sOut_dir): os.mkdir(sOut_dir)

        with open('./Output/Group_result/%s/Seq_freq.txt' % sGroupname, 'w') as Output:

            Output.write(sHeader)

            for sFile_seq, lCol in llSeq_freq:
                try:
                    lCol[5] = round(float(lCol[3])/lCol[4], 4)  ## proportion calculation, previous proportion is not correct.
                except ZeroDivisionError:
                    lCol[5] = 0
                Output.write('\t'.join(map(str, lCol)).replace('ref_from_result', 'wt').replace('ref_from_input', 'wt')+'\n')
267 | 
268 | 
def Trim_data():
    """Remove gap motifs (e.g. AC---) from every group table and renormalise.

    For each group named in Group_list.txt (lines starting with '#' and blank
    lines are ignored), ./Output/Group_result/<group>/Seq_freq.txt is read
    and ./Output/Group_result/<group>/Trimmed_seq_freq.txt is written.

    Per file name:
      * the 'wt' row sets the running total to its Total_cnt,
      * every following non-wt row whose motif contains '-' is dropped and
        its count is subtracted from that total,
      * the remaining rows get the reduced total and a recomputed proportion
        (the old proportion is kept when the reduced total is 0),
      * the 'wt' row is written first, the other rows by descending proportion.

    Exits the program with status 1 (after a critical log message) when a
    file name has no 'wt' row to put first.
    """
    with open('Group_list.txt') as Group_list:
        for sGroupname in Group_list:
            if sGroupname[0] == "#": continue
            sGroupname = sGroupname.replace('\n', '').strip()
            if not sGroupname: continue  ## blank line: there is no such group folder

            dSeq_freq  = OrderedDict()   ## file name -> list of its rows, in file order

            with open('./Output/Group_result/%s/Seq_freq.txt' % sGroupname) as Group_result,\
                open('./Output/Group_result/%s/Trimmed_seq_freq.txt' % sGroupname, 'w') as Trimmed_result:

                sHeader = Group_result.readline()

                for sRow in Group_result:
                    lCol = sRow.replace('\n', '').split('\t')
                    dSeq_freq.setdefault(lCol[0], []).append(lCol)  ## lCol[0]: Doench2014_1000

                for sFilename in list(dSeq_freq):
                    iRecal_total = 0  ## sub the gap seq cnt
                    llKept       = []

                    for lRow in dSeq_freq[sFilename]:
                        sMotif          = lRow[2]
                        iMotif_cnt      = int(lRow[3])
                        iTotal_read_cnt = int(lRow[4])

                        if lRow[6] == 'wt':
                            iRecal_total = iTotal_read_cnt
                            llKept.append(lRow)
                        elif '-' in sMotif:
                            iRecal_total -= iMotif_cnt
                        else:
                            llKept.append(lRow)

                    for lRow in llKept:
                        lRow[4] = iRecal_total
                        try:
                            lRow[5] = round(float(lRow[3])/iRecal_total, 4)  ## recal proportion because of sub.
                        except ZeroDivisionError:
                            pass  ## keep the previous proportion

                    dSeq_freq[sFilename] = llKept ## reassign the total cnt
                #END for

                for sKey in sorted(dSeq_freq): ## key is a filename
                    llCol = sorted(dSeq_freq[sKey], key=lambda x: x[6], reverse=True) ## wild type category first

                    ## An empty list (every row was a gap motif) has no wild type either.
                    if not llCol or llCol[0][6] != 'wt':
                        logging.critical('error, wildtype must be fisrt row. If you see this error message, please contact the developer.')
                        logging.critical('This program will be terminated.')
                        sys.exit(1)

                    if len(llCol) > 1:  ## It has alt. only a wt file does not necessary.
                        llCol[1:] = sorted(llCol[1:], key=lambda x: float(x[5]), reverse=True)
                        dSeq_freq[sKey] = llCol

                Trimmed_result.write(sHeader)
                for llRows in dSeq_freq.values():
                    for lRow in llRows:
                        Trimmed_result.write('\t'.join(map(str, lRow))+'\n')
            #END with
        #END for
    #END with
355 | 
356 | 
def Main():
    """Build one parameter set per sample and run Count_seq_freq on a process pool.

    The sample list is read from ./User/<user>/<project>.txt. Rows starting
    with '#' are skipped; every other row is parsed by Helper.SplitSampleInfo.
    For each sample the parameter list holds:
    [sample name, alignment tmp dir, total-read-count tmp dir, reference dict].
    """

    logging.info('Program Start')

    logging.info('Make commands for a multiple processing')
    lPara = []
    with open('./User/{user}/{project}.txt'.format(user=strUser, project=strProject)) as Project_list:

        for strSample in Project_list:
            if strSample[0] == '#': continue

            tupSampleInfo = Helper.SplitSampleInfo(strSample)
            if not tupSampleInfo: continue
            strSample, strRef, strExpCtrl = tupSampleInfo

            strSample           = strSample.replace('\n', '').replace('\r', '')
            sFile_path          = './Output/{user}/{project}/{sample}/Tmp/Alignment'.format(user=strUser, project=strProject, sample=strSample)
            sTotal_readcnt_path = './Output/{user}/{project}/{sample}/Tmp/All'.format(user=strUser, project=strProject, sample=strSample)
            dInput_fa           = Make_ref_dict(strRef)

            lPara.append([strSample, sFile_path, sTotal_readcnt_path, dInput_fa])

    logging.info('Multiple processing Start')
    p = mp.Pool(iCore)
    try:
        p.map_async(Count_seq_freq, lPara).get()
    except BaseException:
        ## A failed or interrupted run must not leave worker processes behind.
        p.terminate()
        raise
    else:
        p.close()
    finally:
        p.join()
    logging.info('Multiple processing End')

    ## The Count_group() and Trim_data() steps are currently disabled.
    #logging.info('Count group Start')
    #Count_group()
    #logging.info('Count group End')

    #logging.info('Trim data Start')
    #Trim_data()
    #logging.info('Trim data End')

    logging.info('Program End')


## Guarded entry point: worker processes that re-import this module must not start Main() again.
if __name__ == '__main__':
    Main()
396 | 


--------------------------------------------------------------------------------
/Base_edit_2/Sequence_freq_add.py:
--------------------------------------------------------------------------------
  1 | #!/home/hkim/anaconda2/bin/python2.7
  2 | 
  3 | from pdb import set_trace
  4 | 
  5 | ## D0 Sub list
  6 | """
  7 | Euchromatin_206_repeat5 TCTATCGTACATCGC Euchromatin_206_repeat5:39M2D_AC        1
  8 | ExtremeGC_811   CTACATCGTCATACA ExtremeGC_811:39M1D_G   1
  9 | """
## Previous location of the sub-indel files (kept for reference only).
#strSubHiseq1 = './Sub_indel_result/Summation_Project_list_sub_indel.txt'  ## total indel cnt : 8929
#strSubHiseq2 = './Sub_indel_result/Summation_Project_list2_sub_indel.txt' ## total indel cnt : 8367
#strSubNeon1  = './Sub_indel_result/Summation_Project_list3_sub_indel.txt' ## total indel cnt : 9396
#strSubNeon2  = './Sub_indel_result/Summation_Project_list4_sub_indel.txt' ## total indel cnt : 8321
## Sub-indel files passed to Add_info_result as strSub.
## Add_info_result reads column 0 as the file name and column 3 as the count.
strSubHiseq1 = './Output/Summation_Project_list_sub_indel.txt'  ## total indel cnt : 8929
strSubHiseq2 = './Output/Summation_Project_list2_sub_indel.txt' ## total indel cnt : 8367
strSubNeon1  = './Output/Summation_Project_list3_sub_indel.txt' ## total indel cnt : 9396
strSubNeon2  = './Output/Summation_Project_list4_sub_indel.txt' ## total indel cnt : 8321
 18 | 
 19 | ## Total sum file
 20 | """
 21 |        Sample  Barcode Ref     # of Total      # of Insertion  # of Deletion   # of Combination        A.-7  
 22 | 0       Doench2014_1    CGCATATCATCATCA TAGATTGAAGAGAGACAGTACATGCCCTGGGAGAGCTTGGCGTAACTAGATCTCTACTCTACCACTTGTACTTCAGCGGTCAGCTTACTCGACTTAA       322     0       0       0       0      
 23 | """
## Summation files passed to Add_info_result as strTotal.
## The first row is skipped as a header; column 1 is the file name and
## columns 4-7 are read as total, insertion, deletion and combination counts.
strTotalHiseq1 = './Output/Summation_Project_list.txt'
strTotalHiseq2 = './Output/Summation_Project_list2.txt'
strTotalNeon1  = './Output/Summation_Project_list3.txt'
strTotalNeon2  = './Output/Summation_Project_list4.txt'
 28 | 
 29 | ## Freq result file
 30 | """
 31 | Filename        Seq     Motif   Count   Total_cnt       Proportion      Substitution
 32 | Doench2014_1    CGCATATCATCATCATAGATTGAAGAGAGACAGTACATGCCCTGGGAGAGCTTGGCGTAACTAGATCTCTACTCTACCACTTGTACTTCAGCGGTCAGCTTACTCGACTTAA
 33 |         AGAGAGACA       246     257     0.9572  wt
 34 | """
## Seq_freq tables passed to Add_info_result as strFreq.
## The first row is reused as the output header; columns 3, 4 and 5 are parsed
## as count, total count and proportion, column 6 is the substitution label.
strFreqHiseq1 = './Output/Group_result/180903_split_hiseq_R1/Seq_freq.txt'
strFreqHiseq2 = './Output/Group_result/180903_split_hiseq_R2/Seq_freq.txt'
strFreqNeon1  = './Output/Group_result/190311_Neon_splitBE4_R1/Seq_freq.txt'
strFreqNeon2  = './Output/Group_result/190311_Neon_splitBE4_R2/Seq_freq.txt'
 39 | 
 40 | ## Result
 41 | """
 42 | Filename        Seq     Motif   Count   Total_cnt       Proportion      Substitution
 43 | Doench2014_1    CGCATATCATCATCATAGATTGAAGAGAGACAGTACATGCCCTGGGAGAGCTTGGCGTAACTAGATCTCTACTCTACCACTTGTACTTCAGCGGTCAGCTTACTCGACTTAA
 44 |         AGAGAGACA       246     257-(D0 indel count)     0.9572  wt
 45 | -> next line
 46 |   + indelcompelex sum count        
 47 |         
 48 | """
## Output files written by Add_info_result (opened with mode 'w', so existing files are overwritten).
strResultHiseq1 = './Output/Seq_freq_add_info_Hiseq1.txt'
strResultHiseq2 = './Output/Seq_freq_add_info_Hiseq2.txt'
strResultNeon1  = './Output/Seq_freq_add_info_Neon1.txt'
strResultNeon2  = './Output/Seq_freq_add_info_Neon2.txt'
 53 | 
 54 | 
 55 | def Add_info_result(strSub, strTotal, strFreq, strResult):
 56 | 
 57 |     with open(strSub) as Sub,\
 58 |         open(strTotal) as Total,\
 59 |         open(strFreq) as Freq,\
 60 |         open(strResult, 'w') as Result:
 61 | 
 62 |         dictSubCnt = {}  ## Doench2016_1948:39M1D_G, Doench2016_1948:39M1I_G -> Neon: two file name is same but the pattern is different.
 63 |                          ## I should merge these pattern based on the file name.
 64 |         dictTotalAppliedSub = {}
 65 |         dictIndelSum        = {}
 66 | 
 67 |         for strRow in Sub:
 68 |             listCol  = strRow.replace('\n', '').split('\t')
 69 |             strFile  = listCol[0]
 70 |             intCount = int(listCol[3])
 71 | 
 72 |             try:
 73 |                 dictSubCnt[strFile] += intCount
 74 |             except KeyError:
 75 |                 dictSubCnt[strFile] = intCount
 76 | 
 77 |         # intSubIndelAllCnt = sum([v for k,v in dictSubCnt.items()])
 78 |         # print('%s sub indel all count : %s' % (strSub, intSubIndelAllCnt)) ## checked all count is correct.
 79 | 
 80 |         #"""
 81 |         for i, strRow in enumerate(Total):
 82 |             if i == 0: continue ## header skip.
 83 |             listCol  = strRow.replace('\n', '').split('\t')
 84 |             strFile  = listCol[1]
 85 |             intTotal = int(listCol[4])
 86 |             intIns   = int(listCol[5])
 87 |             intDel   = int(listCol[6])
 88 |             intCom   = int(listCol[7])
 89 | 
 90 |             try:
 91 |                 intSub  = dictSubCnt[strFile]
 92 |             except KeyError:
 93 |                 intSub = 0
 94 | 
 95 |             intTotalAppliedSub = intTotal - intSub            ## The total count is not subtracted by DOindel, so apply it.
 96 | 
 97 |             dictTotalAppliedSub[strFile] = intTotalAppliedSub
 98 |             dictIndelSum[strFile] = intIns + intDel + intCom  ## each file row indel complex count sum
 99 | 
100 |         dictFreq = {}  ## {'GECKO_346': [[Filename	Seq	Motif	Count	Total_cnt	Proportion	Substitution],[],[],[]]}
101 |         strHeader = ''
102 | 
103 |         for i, strRow in enumerate(Freq):                        ## Freq total was removed by crispr indel.
104 |             if i == 0:
105 |                 strHeader = strRow
106 |                 continue             ## header skip.
107 |             listCol = strRow.replace('\n', '').split('\t')
108 |             strFile  = listCol[0]
109 |             intCount = int(listCol[3])
110 |             intTotal = int(listCol[4])
111 |             floProp  = float(listCol[5])
112 |             listCol[3] = intCount
113 |             listCol[4] = intTotal
114 |             listCol[5] = floProp
115 | 
116 |             try:
117 |                 dictFreq[strFile].append(listCol)
118 |             except KeyError:
119 |                 dictFreq[strFile] = [listCol]
120 | 
121 |         Result.write(strHeader.replace('\n','')+'\tTotal(D0)\tD0_indel\n')
122 | 
123 |         for strFile, list2Col in dictFreq.items():
124 | 
125 |             list2Col[1:] = sorted(list2Col[1:], key=lambda x: x[5], reverse=True) ## sort by proportion
126 | 
127 |             intAltAllCnt = sum([listAlt[3] for listAlt in list2Col[1:]])
128 | 
129 |             ## for validation
130 |             listCountCheck = []
131 |             intTotalCheck  = 0
132 | 
133 |             for i, listCol in enumerate(list2Col):
134 |                 strSubstitution = listCol[6]
135 |                 intTotal        = listCol[4]
136 | 
137 |                 intTotalAppliedSub = dictTotalAppliedSub[strFile]
138 | 
139 |                 intIndelSum = dictIndelSum[strFile]   ## intIns + intDel + intCom
140 |                 intTotalD0  = intTotal + intIndelSum  ## freq total are substrated by indel sum, so add it again
141 | 
142 |                 intD0IndelCount = intTotalD0 - intTotalAppliedSub
143 | 
144 |                 if strSubstitution == 'wt':  ## modify WT count. Total - alt count = wt count
145 |                     intModiCount = intTotalAppliedSub - intAltAllCnt - intIndelSum
146 |                     if intModiCount < 0:
147 |                         print('minus value error, this integer is positive.')
148 |                         set_trace()
149 | 
150 |                     listCol[3]   = intModiCount
151 |                     #if listCol[0] == 'GECKO_7232': Neon1, 2761
152 |                     #    set_trace()
153 |                 listCountCheck.append(listCol[3])  ## for validation
154 | 
155 |                 listCol[4] = intTotalAppliedSub
156 |                 try:
157 |                     listCol[5] = round(float(listCol[3]) / listCol[4], 4)
158 |                 except Exception:
159 |                     listCol[5] = 0
160 | 
161 |                 Result.write('\t'.join(map(str, listCol + [intTotalD0, intD0IndelCount]))+'\n')
162 | 
163 |                 if i == 0:
164 |                     listCountCheck.append(intIndelSum)
165 |                     intTotalCheck = listCol[4] ## for validation
166 | 
167 |                     listResultCol = len(listCol) * ['~'] + ['~', '~']
168 |                     listResultCol[0] = strFile
169 |                     listResultCol[6] = 'Indel'
170 |                     listResultCol[3] = intIndelSum
171 | 
172 |                     Result.write('\t'.join(map(str, listResultCol))+'\n')
173 | 
174 |             #if strFile == 'GECKO_7232':
175 |             #    set_trace()
176 |             intCountCheckTotal = sum(listCountCheck)
177 |             if intCountCheckTotal != intTotalCheck:
178 |                 print('Count total is diffrent. result:%s, file:%s, CountCheckTotal:%s, TotalCheck:%s' % (strResult, strFile, intCountCheckTotal, intTotalCheck))
179 |         #"""
180 | 
181 | 
def Main():
    """Run Add_info_result once for each of the four hard-coded data sets."""

    listDataSet = [[strSubHiseq1, strTotalHiseq1, strFreqHiseq1, strResultHiseq1],
                   [strSubHiseq2, strTotalHiseq2, strFreqHiseq2, strResultHiseq2],
                   [strSubNeon1, strTotalNeon1, strFreqNeon1, strResultNeon1],
                   [strSubNeon2, strTotalNeon2, strFreqNeon2, strResultNeon2]]

    for strSub, strTotal, strFreq, strResult in listDataSet:
        Add_info_result(strSub, strTotal, strFreq, strResult)


## Guarded so that importing this module does not start the processing.
if __name__ == '__main__':
    Main()
192 | 
193 | """
194 | ## deprecated
195 | def Merge_sub_indel_and_dict(strInput1, strInput2):
196 |     dictSubIndel = {}
197 | 
198 |     with open(strInput1) as Input1, \
199 |             open(strInput2) as Input2:
200 | 
201 |         for strRow in Input1:
202 |             listCol = strRow.replace('\n', '').split('\t')
203 |             strFile = listCol[0]
204 |             strBarcode = listCol[1]
205 |             strPattern = listCol[2]
206 |             intCount = int(listCol[3])
207 | 
208 |             dictSubIndel[strFile] = [strBarcode, strPattern, intCount]
209 | 
210 |         for strRow in Input2:
211 |             listCol = strRow.replace('\n', '').split('\t')
212 |             strFile = listCol[0]
213 |             intCount = int(listCol[3])
214 | 
215 |             dictSubIndel[strFile][2] += intCount
216 | """


--------------------------------------------------------------------------------
/Base_edit_2/Split_file.py:
--------------------------------------------------------------------------------
 1 | #!/home/hkimlab/anaconda2/bin/python2.7
 2 | 
 3 | 
 4 | 
 5 | import sys
 6 | import subprocess as sp
 7 | 
 8 | 
## Command-line arguments, read without validation:
##   argv[1] -> input file path (Split() also uses it as the prefix of the chunk files)
##   argv[2] -> number of lines written per chunk
##   argv[3] -> number of chunk files
sFile_path = sys.argv[1]

iSplit_line = int(sys.argv[2]) #400000
iSplit_num  = int(sys.argv[3]) #11
13 | 
def Split():
    """Copy the input file into iSplit_num numbered chunk files.

    Chunk files are named '<sFile_path>_<n>.fq' with n starting at 1. Each chunk
    receives consecutive lines from the single open input handle and is closed
    once it holds iSplit_line lines or the input is exhausted.
    """
    with open(sFile_path) as fq:
        iChunk = 1
        while iChunk <= iSplit_num:
            sChunk_path = '%s_%s.fq' % (sFile_path, iChunk)
            with open(sChunk_path, 'w') as out:
                ## enumerate() continues on the shared handle, so the next chunk
                ## starts right after the last line written here.
                for iLine_no, sLine in enumerate(fq, 1):
                    out.write(sLine)
                    if iLine_no == iSplit_line:
                        break
            iChunk += 1
26 | 
27 | 
def Make_filelist():
    """Write the names of all '.fq' entries of the current directory, one per line,
    to ./LongGuide_Synthetic_2nd.txt (the file is overwritten).
    """
    import os  ## local import: this script imports only sys and subprocess at module level

    with open('./LongGuide_Synthetic_2nd.txt', 'w') as filelist:

        ## os.listdir replaces the former shell `ls`: no subprocess, and no bytes/str
        ## mismatch on Python 3. Names are sorted because listdir order is arbitrary.
        for sFilename in sorted(os.listdir('.')):
            if sFilename.startswith('.'):
                continue  ## plain `ls` did not list hidden entries either

            lFilename = sFilename.split('.')
            if lFilename[-1] == 'fq':
                filelist.write(sFilename+'\n')
38 | 
39 | 
## Script entry: only the file-list step is enabled; un-comment Split() to run the split step as well.
#Split()
Make_filelist()
42 | 


--------------------------------------------------------------------------------
/Base_edit_2/Sum_all_alt_freq.py:
--------------------------------------------------------------------------------
 1 | #!/media/hkim/7434A5B334A5792E/bin/Python/Python2/bin/python2
 2 | 
 3 | import os,sys
 4 | import numpy as np
 5 | from pdb import set_trace
 6 | 
 7 | 
## Project name from the command line; Sum_all_freq() reads and writes below ./Output/<sProject>/Summary.
sProject = sys.argv[1]
 9 | 
10 | 
def Sum_all_freq():
    """Sum the per-file count tables in ./Output/<project>/Summary/All.

    Every input file is tab separated with one header row. From its first data
    row, columns 3-6 are added to a running 4-value total; from every data row,
    columns 7.. are collected, and the collected rows of a file are added
    element-wise to a 4 x N matrix (so a file is expected to have four data
    rows). The result is written to ./Output/<project>/Summary/Alt_freq.txt
    with one row per base in the order A, C, G, T; the 4-value total is written
    on the 'A' row only. The header of the last file read is reused.
    """

    sFile_path = './Output/%s/Summary/All' % sProject
    sHeader    = ''

    ## None marks "not allocated yet". Comparing a numpy array with [] is an
    ## element-wise comparison and is not a reliable emptiness test.
    lSum_total_and_indel_data = None
    lSum_target_data          = None

    for sFile in os.listdir(sFile_path):
        with open(sFile_path + '/' + sFile) as Input:
            lSum_target = []

            for i, sRow in enumerate(Input):
                if i == 0:
                    sHeader = sRow
                    continue

                lCol = sRow.replace('\n','').split('\t')

                if i == 1: ## This data is in the second row
                    lTotal_and_indel_col = [int(sCol) for sCol in lCol[3:7]]
                    if lSum_total_and_indel_data is None:
                        lSum_total_and_indel_data = np.zeros((len(lTotal_and_indel_col)), int)
                    lSum_total_and_indel_data += lTotal_and_indel_col

                ## Real lists (not map objects) so len() also works on Python 3.
                lTarget_col = [int(sCol) for sCol in lCol[7:]]
                if lSum_target_data is None:
                    lSum_target_data = np.zeros((4, len(lTarget_col)), int)

                lSum_target.append(lTarget_col)

            if lSum_target:
                lSum_target_data += lSum_target

    ## Nothing read at all: fall back to empty lists so the output step still runs.
    if lSum_total_and_indel_data is None:
        lSum_total_and_indel_data = []
    if lSum_target_data is None:
        lSum_target_data = []

    print(lSum_target_data)

    with open('./Output/%s/Summary/Alt_freq.txt' % sProject, 'w') as Output:

        lHeader = sHeader.split('\t')
        ## Keep only the part after the '.' of every target column name.
        lHeader[7:] = [sCol.split('.')[1] for sCol in lHeader[7:]]
        Output.write('Alt_base\t' + '\t'.join(lHeader[3:]))

        for cnt, (sBase, lSum) in enumerate(zip(['A','C','G','T'], lSum_target_data)):
            if cnt == 0:
                Output.write(sBase + '\t' + '\t'.join(map(str, lSum_total_and_indel_data)) + '\t' + '\t'.join(map(str, lSum)) + '\n')
            else:
                Output.write(sBase + '\t\t\t\t\t' + '\t'.join(map(str, lSum)) + '\n')
70 | 
71 | 
def Main():
    """Script entry point: run the summation for the project given on the command line."""
    Sum_all_freq()


## Guarded so that importing this module does not start the processing.
if __name__ == '__main__':
    Main()
77 | 


--------------------------------------------------------------------------------
/Base_edit_2/Summary_all_trim.py:
--------------------------------------------------------------------------------
 1 | #!/media/hkim/7434A5B334A5792E/bin/Python/Python2/bin/python2
 2 | 
 3 | import os, sys
 4 | import subprocess as sp
 5 | from pdb import set_trace
 6 | 
## Command-line arguments, read without validation:
##   argv[1] -> output directory of one sample
##   argv[2] -> sample name, used in the merged file name
##   argv[3] -> comma separated pair; Concat_summary() uses item 0 as ref and item 1 as alt
sOutput_dir = sys.argv[1]
sSample     = sys.argv[2]
lRef_alt    = sys.argv[3].split(',')
10 | 
11 | 
def Concat_summary():
    """Merge all '<ref>to<alt>' target files of one sample into a single summary.

    Files are taken from <outdir>/Tmp/Target. The header of the first matching
    file is kept; for later files only header cells from column 7 on that are
    still "" or " " are filled in. All data rows are appended unchanged. The
    result goes to <outdir>/Result/Merge_target_result/<sample>_<ref>to<alt>_Summary.txt.
    """

    sRef, sAlt  = lRef_alt[0], lRef_alt[1]
    sTarget_dir = "{outdir}/Tmp/Target".format(outdir=sOutput_dir)
    sWanted     = sRef + 'to' + sAlt
    lHeader     = []
    lData       = []

    for sFile in os.listdir(sTarget_dir):
        if sWanted not in sFile:
            continue

        with open(sTarget_dir + '/' + sFile) as Input:
            for i, sRow in enumerate(Input):
                if i > 0:
                    lData.append(sRow)
                    continue

                lCol = sRow.replace('\n', '').split('\t')
                if not lHeader:
                    lHeader = lCol
                    continue

                ## Complete header cells that earlier files left blank.
                for iCol_num in range(7, len(lHeader)):
                    if lHeader[iCol_num] in ("", " "):
                        lHeader[iCol_num] = lCol[iCol_num]

    sMerge_dir = '{outdir}/Result/Merge_target_result'.format(outdir=sOutput_dir)
    if not os.path.isdir(sMerge_dir):
        os.mkdir(sMerge_dir)

    sMerge_file = '{outdir}/Result/Merge_target_result/{sample}_{ref}to{alt}_Summary.txt'.format(
        outdir=sOutput_dir, sample=sSample, ref=sRef, alt=sAlt)

    with open(sMerge_file, 'w') as Output:
        Output.write('\t'.join(lHeader) +'\n')
        Output.writelines(lData)


if __name__ == '__main__':
    Concat_summary()
56 | 


--------------------------------------------------------------------------------
/Base_edit_2/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CRISPRJWCHOI/CRISPR_toolkit/e903e4476dad29b9b0a1301019ec023308355eb5/Base_edit_2/__init__.py


--------------------------------------------------------------------------------
/Core/CoreSystem.py:
--------------------------------------------------------------------------------
  1 | import os, re, sys, logging
  2 | import subprocess as sp
  3 | import multiprocessing as mp
  4 | 
  5 | from pdb import set_trace
  6 | from datetime import datetime
  7 | 
  8 | import numpy as np
  9 | 
 10 | from CRISPResso2 import CRISPResso2Align
 11 | 
 12 | 
 13 | class Helper(object):
 14 | 
 15 |     @staticmethod
 16 |     def MakeFolderIfNot(strDir):
 17 |         if not os.path.isdir(strDir): os.makedirs(strDir)
 18 | 
 19 |     @staticmethod
 20 |     def RemoveNullAndBadKeyword(Sample_list):
 21 |         listSamples = [strRow for strRow in Sample_list.readlines() if strRow not in ["''", '', '""', '\n', '\r', '\r\n']]
 22 |         return listSamples
 23 | 
 24 |     @staticmethod ## defensive
 25 |     def CheckSameNum(strInputProject, listSamples):
 26 | 
 27 |         listProjectNumInInput = [i for i in sp.check_output('ls %s' % strInputProject, shell=True).split('\n') if i != '']
 28 | 
 29 |         setSamples           = set(listSamples)
 30 |         setProjectNumInInput = set(listProjectNumInInput)
 31 | 
 32 |         intProjectNumInTxt    = len(listSamples)
 33 |         intProjectNumInInput  = len(listProjectNumInInput)
 34 | 
 35 |         if intProjectNumInTxt != len(setSamples - setProjectNumInInput):
 36 |             logging.warning('The number of samples in the input folder and in the project list does not matched.')
 37 |             logging.warning('Input folder: %s, Project list samples: %s' % (intProjectNumInInput, intProjectNumInTxt))
 38 |             raise AssertionError
 39 |         else:
 40 |             logging.info('The file list is correct, pass\n')
 41 | 
 42 |     @staticmethod ## defensive
 43 |     def CheckAllDone(strOutputProject, listSamples):
 44 |         intProjectNumInOutput = len([i for i in sp.check_output('ls %s' % strOutputProject, shell=True).split('\n') if i not in ['All_results', 'Log', '']])
 45 | 
 46 |         if intProjectNumInOutput != len(listSamples):
 47 |             logging.warning('The number of samples in the output folder and in the project list does not matched.')
 48 |             logging.warning('Output folder: %s, Project list samples: %s\n' % (intProjectNumInOutput, len(listSamples)))
 49 |         else:
 50 |             logging.info('All output folders have been created.\n')
 51 | 
 52 |     @staticmethod
 53 |     def SplitSampleInfo(strSample):
 54 | 
 55 |         if strSample[0] == '#': return False
 56 |         logging.info('Processing sample : %s' % strSample)
 57 |         lSampleRef = strSample.replace('\n', '').replace('\r', '').replace(' ', '').split('\t')
 58 | 
 59 |         if len(lSampleRef) == 2:
 60 |             strSample = lSampleRef[0]
 61 |             strRef = lSampleRef[1]
 62 |             return (strSample, strRef, '')
 63 | 
 64 |         elif len(lSampleRef) == 3:
 65 |             strSample = lSampleRef[0]
 66 |             strRef = lSampleRef[1]
 67 |             strExpCtrl = lSampleRef[2].upper()
 68 |             return (strSample, strRef, strExpCtrl)
 69 | 
 70 |         else:
 71 |             logging.error('Confirm the file format is correct. -> Sample name\tReference name\tGroup')
 72 |             logging.error('Sample list input : %s\n' % lSampleRef)
 73 |             raise Exception
 74 | 
 75 |     @staticmethod
 76 |     def CheckIntegrity(strBarcodeFile, strSeq): ## defensive
 77 |         rec = re.compile(r'[A|C|G|T|N]')
 78 | 
 79 |         if ':' in strSeq:
 80 |             strSeq = strSeq.split(':')[1]
 81 | 
 82 |         strNucle = re.findall(rec, strSeq)
 83 |         if len(strNucle) != len(strSeq):
 84 |             logging.error('This sequence is not suitable, check A,C,G,T,N are used only : %s' % strBarcodeFile)
 85 |             set_trace()
 86 |             sys.exit(1)
 87 | 
 88 |     @staticmethod
 89 |     def PreventFromRmMistake(strCmd):
 90 |         rec = re.compile(r'rm.+-rf*.+(\.$|\/$|\*$|User$|Input$|Output$)') ## This reg can prevent . / * ./User User ...
 91 |         if re.findall(rec, strCmd):
 92 |             raise Exception('%s is critical mistake! never do like this.' % strCmd)
 93 | 
 94 | 
 95 | class InitialFolder(object):
 96 | 
 97 |     def __init__(self, strUser, strProject, strProgram):
 98 |         self.strUser    = strUser
 99 |         self.strProject = strProject
100 |         self.strProgram = strProgram
101 | 
102 |     def MakeDefaultFolder(self):
103 |         Helper.MakeFolderIfNot('Input')
104 |         Helper.MakeFolderIfNot('Output')
105 |         Helper.MakeFolderIfNot('User')
106 | 
107 |     def MakeInputFolder(self):
108 |         ## './Input/JaeWoo'
109 |         strUserInputDir = './Input/{user}'.format(user=self.strUser)
110 |         Helper.MakeFolderIfNot(strUserInputDir)
111 | 
112 |         if self.strProgram == 'Run_indel_searcher.py':
113 |             ## './Input/JaeWoo/FASTQ'
114 |             strUserFastqDir = os.path.join(strUserInputDir, 'FASTQ')
115 |             Helper.MakeFolderIfNot(strUserFastqDir)
116 |         elif self.strProgram == 'Run_BaseEdit_freq.py':
117 |             ## './Input/JaeWoo/Query'
118 |             strUserFastqDir = os.path.join(strUserInputDir, 'Query')
119 |             Helper.MakeFolderIfNot(strUserFastqDir)
120 |         else:
121 |             print('CoreSystem.py -> CoreSystem error, check the script.')
122 |             raise Exception
123 | 
124 |         ## './Input/JaeWoo/FASTQ/Test_samples'
125 |         strUserProjectDir = os.path.join(strUserFastqDir, self.strProject)
126 |         Helper.MakeFolderIfNot(strUserProjectDir)
127 | 
128 |         ## './Input/JaeWoo/Reference'
129 |         strUserReference = os.path.join(strUserInputDir, 'Reference')
130 |         Helper.MakeFolderIfNot(strUserReference)
131 | 
132 |         ## './Input/JaeWoo/Reference/Test_samples'
133 |         strUserRefProject = os.path.join(strUserReference, self.strProject)
134 |         Helper.MakeFolderIfNot(strUserRefProject)
135 | 
136 |         ## './User/JaeWoo'
137 |         strUserDir = './User/{user}'.format(user=self.strUser)
138 |         Helper.MakeFolderIfNot(strUserDir)
139 | 
140 |         ## '> ./User/JaeWoo/Test_samples.txt'
141 |         self.strProjectFile = os.path.join(strUserDir, self.strProject+'.txt')
142 |         if not os.path.isfile(self.strProjectFile):
143 |             sp.call('> ' + self.strProjectFile, shell=True)
144 | 
145 |     def MakeOutputFolder(self):
146 | 
147 |         ## './Output/JaeWoo'
148 |         strOutputUserDir = './Output/{user}'.format(user=self.strUser)
149 |         Helper.MakeFolderIfNot(strOutputUserDir)
150 | 
151 |         ## './Output/JaeWoo/Test_samples'
152 |         self.strOutputProjectDir = os.path.join(strOutputUserDir, self.strProject)
153 |         Helper.MakeFolderIfNot(self.strOutputProjectDir)
154 | 
155 |         ## './Output/JaeWoo/Test_samples/Log'
156 |         strOutputLog = os.path.join(self.strOutputProjectDir, 'Log')
157 |         Helper.MakeFolderIfNot(strOutputLog)
158 | 
159 |         strLogName = str(datetime.now()).replace('-', '_').replace(':', '_').replace(' ', '_').split('.')[0]
160 |         self.strLogPath = os.path.join(self.strOutputProjectDir, 'Log/{logname}_log.txt'.format(logname=strLogName))
161 | 
162 | 
class UserFolderAdmin(object):

    """
    Per-sample folder handling.

    InitialFolder : out of the loop
    UserFolderAdmin : in the loop

    So InitialFolder and UserFolderAdmin must be distinguished.
    """

    def __init__(self, strSample, strRef, options, strLogPath):
        ## Sample specific values.
        self.strSample, self.strRef, self.strLogPath = strSample, strRef, strLogPath

        ## Values copied from the parsed options object.
        self.strUser      = options.user_name
        self.strProject   = options.project_name
        self.intCore      = options.multicore
        self.strGapOpen   = options.gap_open    # CRISPresso aligner option
        self.strGapExtend = options.gap_extend  #
        self.strPython    = options.python

        ## Filled in by MakeSampleFolder().
        self.strOutProjectDir = self.strOutSampleDir = self.strRefDir = ''

    def MakeSampleFolder(self):
        """Create the output folders of this sample and set the three directory attributes."""

        ## './Output/Jaewoo/Test_samples'
        self.strOutProjectDir = './Output/{user}/{project}'.format(user=self.strUser, project=self.strProject)

        ## './Output/Jaewoo/Test_samples/Sample_1'
        self.strOutSampleDir = os.path.join(self.strOutProjectDir, self.strSample)
        Helper.MakeFolderIfNot(self.strOutSampleDir)

        ## '.../Sample_1/Tmp', '.../Sample_1/Tmp/Pickle', '.../Sample_1/Result'
        for strSubDir in ('Tmp', 'Tmp/Pickle', 'Result'):
            Helper.MakeFolderIfNot(os.path.join(self.strOutSampleDir, strSubDir))

        ## './Output/Jaewoo/Test_samples/All_results'
        Helper.MakeFolderIfNot(os.path.join(self.strOutProjectDir, 'All_results'))

        self.strRefDir = './Input/{user}/Reference/{project}/{ref}'.format(
            user=self.strUser, project=self.strProject, ref=self.strRef)
215 | 
216 | 
class CoreHash(object):
    """Helpers for the barcode lookup by fixed-length sequence windows."""

    @staticmethod
    def MakeHashTable(strSeq, intBarcodeLen):
        """Return the windows of length intBarcodeLen for the start positions
        range(len(strSeq))[:-intBarcodeLen - 1], in order.
        """
        listSeqWindow = []
        for intStart in range(len(strSeq))[:-intBarcodeLen - 1]:
            intEnd = intStart + intBarcodeLen
            listSeqWindow.append(strSeq[intStart:intEnd])
        return listSeqWindow

    @staticmethod
    def IndexHashTable(dictRef, strSeqWindow, intFirstBarcode):
        """Look up strSeqWindow in dictRef (KeyError when absent).

        Returns (reference entry, the window itself as barcode, 1); the incoming
        intFirstBarcode value is not used.
        """
        return (dictRef[strSeqWindow], strSeqWindow, 1)
231 | 
232 | 
class CoreGotoh(object):
    """Thin wrapper around the CRISPResso2Align global aligner."""

    def __init__(self, strEDNAFULL='', floOg='', floOe=''):
        """Load the substitution matrix from strEDNAFULL and keep the gap penalties.

        floOg / floOe are passed to the aligner as gap_open / gap_extend.
        """
        self.npAlnMatrix = CRISPResso2Align.read_matrix(strEDNAFULL)
        self.floOg       = floOg
        self.floOe       = floOe

    def GapIncentive(self, strRefSeqAfterBarcode):
        """Return an all-zero integer array of length len(reference) + 1."""
        ## cripsress no incentive == gotoh
        intAmpLen = len(strRefSeqAfterBarcode)
        ## The np.int alias was removed from NumPy; the builtin int selects the same dtype.
        npGapIncentive = np.zeros(intAmpLen + 1, dtype=int)
        return npGapIncentive

    def RunCRISPResso2(self, strQuerySeqAfterBarcode, strRefSeqAfterBarcode, npGapIncentive):
        """Align the upper-cased query against the upper-cased reference.

        Returns whatever CRISPResso2Align.global_align returns.
        """
        listResult = CRISPResso2Align.global_align(strQuerySeqAfterBarcode.upper(), strRefSeqAfterBarcode.upper(),
                                                  matrix=self.npAlnMatrix, gap_open=self.floOg, gap_extend=self.floOe,
                                                  gap_incentive=npGapIncentive)
        return listResult
251 | 
252 | 
def CheckProcessedFiles(Func):
    """Decorator: check the file list before Func and the output folders after it.

    The wrapped function must be called with keyword arguments only and needs at
    least 'InstInitFolder', 'strInputProject', 'listSamples' and 'logging'.
    Helper.CheckSameNum runs first (it may raise AssertionError, in which case
    Func is not called); Helper.CheckAllDone runs after Func. The return value of
    Func is passed through.
    """
    import functools  ## local import: functools is not imported at module level

    @functools.wraps(Func)  ## keep name and docstring of the wrapped function
    def Wrapped_func(**kwargs):

        InstInitFolder     = kwargs['InstInitFolder']
        strInputProject    = kwargs['strInputProject']
        listSamples        = kwargs['listSamples']
        logging            = kwargs['logging']

        logging.info('File num check: input folder and project list')
        Helper.CheckSameNum(strInputProject, listSamples)

        result = Func(**kwargs)

        logging.info('Check that all folder are well created.')
        Helper.CheckAllDone(InstInitFolder.strOutputProjectDir, listSamples)

        return result

    return Wrapped_func
270 | 
271 | 
def AttachSeqToIndel(strSample, strBarcodeName, strIndelPos,
                     strRefseq, strQueryseq, dictSub):
    """Count one indel event, keyed by barcode, indel position and indel bases.

    strIndelPos looks like '17M1I' or '18M2D': the integer before 'M' is used
    as the slice offset and the integer before 'I'/'D' as the slice length.
    Inserted bases are sliced from strQueryseq, deleted bases from strRefseq.

    The counter lives at
        dictSub[strSample][strBarcodeName]['<barcode>:<indel pos>_<bases>']['IndelCount']
    and is created on first use. dictSub is modified in place; nothing is returned.

    Raises:
        KeyError: strSample is not yet a key of dictSub.
        ValueError: strIndelPos contains neither 'I' nor 'D'.
    """
    listIndelPos = strIndelPos.split('M')
    intMatch     = int(listIndelPos[0])

    if 'I' in strIndelPos:
        intInsertion = int(listIndelPos[1].replace('I', ''))
        strInDelSeq  = strQueryseq[intMatch:intMatch + intInsertion]

    elif 'D' in strIndelPos:
        intDeletion = int(listIndelPos[1].replace('D', ''))
        strInDelSeq = strRefseq[intMatch:intMatch + intDeletion]

    else:
        # Raise with the offending value so the failure can be diagnosed
        # without the log file (ValueError is still caught by `except Exception`).
        strMessage = 'strIndelPos must include I or D. This variable is %s' % strIndelPos
        logging.info(strMessage)
        raise ValueError(strMessage)

    strInDelPosSeq = strIndelPos + '_' + strInDelSeq

    dictBarcode = dictSub[strSample].setdefault(strBarcodeName, {})
    dictCount   = dictBarcode.setdefault(strBarcodeName + ':' + strInDelPosSeq, {})
    dictCount['IndelCount'] = dictCount.get('IndelCount', 0) + 1
301 | 
302 | 
303 | 
def RunProgram(sCmd):
    """Run the command string sCmd through the shell and wait for it to finish.

    The exit status returned by sp.call is discarded, so a failing command
    is not reported to the caller.
    """
    sp.call(sCmd, shell=True)
306 | 
def RunMulticore(lCmd, iCore):
    """Print every command in lCmd, then run them on a pool of iCore workers.

    Blocks until all commands have been handed to RunProgram and finished.
    The worker pool is always shut down and joined, also when a task raises
    or the run is interrupted.
    """
    for sCmd in lCmd:
        print(sCmd)

    p = mp.Pool(iCore)
    try:
        p.map_async(RunProgram, lCmd).get()
    except BaseException:
        # Do not leave worker processes behind when the run fails or is interrupted.
        p.terminate()
        raise
    else:
        p.close()
    finally:
        p.join()


--------------------------------------------------------------------------------
/Core/CoreSystem.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CRISPRJWCHOI/CRISPR_toolkit/e903e4476dad29b9b0a1301019ec023308355eb5/Core/CoreSystem.pyc


--------------------------------------------------------------------------------
/Core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CRISPRJWCHOI/CRISPR_toolkit/e903e4476dad29b9b0a1301019ec023308355eb5/Core/__init__.py


--------------------------------------------------------------------------------
/Core/__init__.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CRISPRJWCHOI/CRISPR_toolkit/e903e4476dad29b9b0a1301019ec023308355eb5/Core/__init__.pyc


--------------------------------------------------------------------------------
/EDNAFULL:
--------------------------------------------------------------------------------
 1 | #
 2 | # This matrix was created by Todd Lowe   12/10/92
 3 | #
 4 | # Uses ambiguous nucleotide codes, probabilities rounded to
 5 | #  nearest integer
 6 | #
 7 | # Lowest score = -4, Highest score = 5
 8 | #
 9 |     A   T   G   C   S   W   R   Y   K   M   B   V   H   D   N   U
10 | A   5  -4  -4  -4  -4   1   1  -4  -4   1  -4  -1  -1  -1  -2  -4
11 | T  -4   5  -4  -4  -4   1  -4   1   1  -4  -1  -4  -1  -1  -2   5
12 | G  -4  -4   5  -4   1  -4   1  -4   1  -4  -1  -1  -4  -1  -2  -4
13 | C  -4  -4  -4   5   1  -4  -4   1  -4   1  -1  -1  -1  -4  -2  -4
14 | S  -4  -4   1   1  -1  -4  -2  -2  -2  -2  -1  -1  -3  -3  -1  -4
15 | W   1   1  -4  -4  -4  -1  -2  -2  -2  -2  -3  -3  -1  -1  -1   1
16 | R   1  -4   1  -4  -2  -2  -1  -4  -2  -2  -3  -1  -3  -1  -1  -4
17 | Y  -4   1  -4   1  -2  -2  -4  -1  -2  -2  -1  -3  -1  -3  -1   1
18 | K  -4   1   1  -4  -2  -2  -2  -2  -1  -4  -1  -3  -3  -1  -1   1
19 | M   1  -4  -4   1  -2  -2  -2  -2  -4  -1  -3  -1  -1  -3  -1  -4
20 | B  -4  -1  -1  -1  -1  -3  -3  -1  -1  -3  -1  -2  -2  -2  -1  -1
21 | V  -1  -4  -1  -1  -1  -3  -1  -3  -3  -1  -2  -1  -2  -2  -1  -4
22 | H  -1  -1  -4  -1  -3  -1  -3  -1  -3  -1  -2  -2  -1  -2  -1  -1
23 | D  -1  -1  -1  -4  -3  -1  -1  -3  -1  -3  -2  -2  -2  -1  -1  -1
24 | N  -2  -2  -2  -2  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -1  -2
25 | U  -4   5  -4  -4  -4   1  -4   1   1  -4  -1  -4  -1  -1  -2   5
26 | 


--------------------------------------------------------------------------------
/Indel_searcher_2/BaseEdit_input_converter.py:
--------------------------------------------------------------------------------
  1 | import os,sys
  2 | from pdb import set_trace
  3 | 
  4 | import multiprocessing as mp
  5 | 
  6 | sys.path.insert(0, os.path.dirname(os.getcwd()))
  7 | from Core.CoreSystem import Helper
  8 | 
  9 | strUser    = sys.argv[1]
 10 | strProject = sys.argv[2]
 11 | 
 12 | print('Usage : python ./BaseEdit_input_converter.py user_name project_name')
 13 | print('Usage : python ./BaseEdit_input_converter.py JaeWoo Test_samples')
 14 | 
 15 | 
 16 | """
 17 | --> Conversion format
 18 | Barcode.txt
 19 | ACACACACACACAGCTCATA:ACACACACACACAGCTCATA
 20 | Reference.txt
 21 | ACACACACACACAGCTCATA:TTTGTATACACGCATGTATGCATCCTGCAGGTCTCGCTCTGACATGTGGGAAAGCTTGGCGTAACTAGATCTCTACTCTACCACTTGTACTTCAGCGGTCAGCTTACTCGACTTAA
 22 | Query reads
 23 | ACACACACACACAGCTCATA.txt
 24 | 
 25 | BaseEdit output
 26 | Barcode.txt
 27 | YSKim_0525+01614_98_repeat1:TATACACGCATGTAT
 28 | ...
 29 | Reference.txt
 30 | YSKim_0525+01614_98_repeat1:TTTGTATACACGCATGTAT GCATCCTGCAGGTCTCGCTCTGACATGTGGGAAAGCTTGGCGTAACTAGATCTCTACTCTACCACTTGTACTTCAGCGGTCAGCTTACTCGACTTAA
 31 | ...
 32 | Read
 33 | YSKim_0525+01614_98_repeat1.txt
 34 | """
 35 | 
 36 | def Convert_Indelsearcher_output(strSampleRefGroup):
 37 | 
 38 |     listSampleRefGroup = strSampleRefGroup.replace('\n', '').replace('\r', '').split('\t')
 39 | 
 40 |     strSample = listSampleRefGroup[0]
 41 |     strRef    = listSampleRefGroup[1]
 42 | 
 43 |     print('Processing: %s, %s' % (strSample, strRef))
 44 | 
 45 |     strBaseEditRefFolder   = '../Base_edit_2/Input/{user}/Reference/{project}/{ref}'.format(user=strUser,
 46 |                                                                                             project=strProject,
 47 |                                                                                             ref=strRef)
 48 |     strBaseEditQueryFolder = '../Base_edit_2/Input/{user}/Query/{project}/{sample}'.format(user=strUser,
 49 |                                                                                            project=strProject,
 50 |                                                                                            sample=strSample)
 51 |     try:
 52 |         Helper.MakeFolderIfNot(strBaseEditRefFolder)
 53 |         Helper.MakeFolderIfNot(strBaseEditQueryFolder)
 54 |     except OSError as e:
 55 |         print(e)
 56 |         pass
 57 | 
 58 |     ## BaseEdit refer format : filename, barcode, reference
 59 |     ReferenceFile_in_IndelSearcher = open('./Input/{user}/Reference/{project}/{ref}/Reference_sequence.txt'.format(user=strUser,
 60 |                                                                                                                    project=strProject,
 61 |                                                                                                                    ref=strRef))
 62 |     BarcodeFile_in_IndelSearcher   = open('./Input/{user}/Reference/{project}/{ref}/Barcode.txt'.format(user=strUser,
 63 |                                                                                                         project=strProject,
 64 |                                                                                                         ref=strRef))
 65 |     BarcodeFile_for_BaseEdit       = open('../Base_edit_2/Input/{user}/Reference/{project}/{ref}/Barcode.txt'.format(user=strUser,
 66 |                                                                                                                      project=strProject,
 67 |                                                                                                                      ref=strRef), 'w')
 68 |     Reference_for_BaseEdit         = open('../Base_edit_2/Input/{user}/Reference/{project}/{ref}/Reference.txt'.format(user=strUser,
 69 |                                                                                                                        ref=strRef,
 70 |                                                                                                                        project=strProject), 'w') ## conversion target to barcode:refseq
 71 | 
 72 |     dictBarcodeSeq = {}
 73 | 
 74 |     for strBarcodeIndelSearcher, strReferenceIndelSearcher in zip(BarcodeFile_in_IndelSearcher, ReferenceFile_in_IndelSearcher):
 75 | 
 76 |         strBarcodeIndelSearcher   = strBarcodeIndelSearcher.replace('\n', '').strip()
 77 |         strReferenceIndelSearcher = strReferenceIndelSearcher.replace('\n', '').strip()
 78 | 
 79 |         dictBarcodeSeq[strBarcodeIndelSearcher] = []
 80 |         BarcodeFile_for_BaseEdit.write(strBarcodeIndelSearcher + ':' + strBarcodeIndelSearcher + '\n') ## first is filename, second is barcode. BaseEdit barcode format
 81 |         Reference_for_BaseEdit.write(strBarcodeIndelSearcher + ':' + strReferenceIndelSearcher + '\n')
 82 | 
 83 |     ReferenceFile_in_IndelSearcher.close()
 84 |     BarcodeFile_in_IndelSearcher.close()
 85 |     Reference_for_BaseEdit.close()
 86 | 
 87 |     Total_result_file = open('./Output/{user}/{project}/{sample}/Tmp/{sample}_Classified_Indel_barcode.fastq'.format(user=strUser,
 88 |                                                                                                                      project=strProject,
 89 |                                                                                                                      sample=strSample))
 90 | 
 91 |     intCheckTotLine = 0
 92 |     intOneLineMore  = 0
 93 | 
 94 |     for i, strRow in enumerate(Total_result_file):  ## for query reads
 95 | 
 96 |         if intOneLineMore == 1:
 97 |             intCheckTotLine = 0
 98 |             intOneLineMore  = 0
 99 | 
100 |         if i % 4 == 0: ## Classified_Indel_barcode has all total sequence.
101 |             strBarcode = strRow.split('Barcode_')[1].split(':')[0]
102 |             intCheckTotLine = 1
103 | 
104 |         elif intCheckTotLine == 1:
105 |             dictBarcodeSeq[strBarcode].append(strRow)
106 |             intOneLineMore = 1
107 | 
108 |     for strBarcode, listSeq in dictBarcodeSeq.items():
109 |         with open('../Base_edit_2/Input/{user}/Query/{project}/{sample}/{barcode}.txt'.format(
110 |                 user=strUser, project=strProject, sample=strSample, barcode=strBarcode), 'w') as Output:
111 |             Output.write(''.join(listSeq))
112 | 
113 |     Total_result_file.close()
114 | 
115 | 
def Main():
    """Convert every sample listed in ./User/{user}/{project}.txt on two worker processes.

    Lines starting with '#' and blank lines are skipped.
    """
    print('Program Start')

    with open('./User/{user}/{project}.txt'.format(user=strUser, project=strProject)) as SampleList:
        # A blank line has no reference column and would raise IndexError inside a worker.
        listSampleRefGroup = [strSampleRefGroup for strSampleRefGroup in SampleList
                              if strSampleRefGroup.strip() and strSampleRefGroup[0] != '#']

    # The pool is created only after the list was read, and is always cleaned up.
    p = mp.Pool(2)
    try:
        p.map_async(Convert_Indelsearcher_output, listSampleRefGroup).get()
    except BaseException:
        p.terminate()
        raise
    else:
        p.close()
    finally:
        p.join()

    print('Program End')
127 | 
128 | 
129 | Main()
130 | 


--------------------------------------------------------------------------------
/Indel_searcher_2/Debugger.py:
--------------------------------------------------------------------------------
  1 | #!/media/hkim/Pipeline/Indel_searcher_2/miniconda2/bin/python2.7
  2 | 
  3 | import os, re, sys, pickle
  4 | import subprocess as sp
  5 | from Bio import AlignIO
  6 | from pdb import set_trace
  7 | 
  8 | 
  9 | strFastq='/media/hkim/Pipeline/CRISPR_Indel_searcher/Input/FASTQ/190807_Nahye_24k_NG_rep1-24kLib/NG_rep1.extendedFrags.fastq'
 10 | strBarcode='TTTGGTGATCTCACTCTCGACAACTC'
 11 | 
 12 | sRef_fa = './Input/Reference/190807_Nahye_24k_NG_rep1-24kLib/Reference.fa'
 13 | sBarcode_PAM_pos='Foward'
 14 | 
 15 | 
 16 | def CountBar():
 17 |     with open(strFastq) as Input:
 18 |         
 19 |         intCnt=0
 20 | 
 21 |         for strRow in Input:
 22 |             if strBarcode in strRow:
 23 |                 intCnt+=1
 24 | 
 25 |         print(intCnt)
 26 | 
 27 | def ExtractFastq():
 28 | 
 29 |     with open(strFastq) as Input,\
 30 |         open('./Input/FASTQ/Test1/Test1.fastq_target', 'w') as Output:
 31 | 
 32 |         listFastq  = []
 33 | 
 34 |         for i, strRow in enumerate(Input):
 35 |             listFastq.append(strRow.replace('\n', ''))
 36 |             if i % 4 == 3:
 37 |                 #print(listFastq)
 38 |                 if strBarcode in listFastq[1]:
 39 |                     Output.write('\n'.join(listFastq)+'\n')
 40 |                 listFastq = []
 41 | 
 42 |             
def LoadPickle():
    """Load a pickle from a hard-coded path and drop into pdb to inspect it.

    The loaded object is bound to the local name `obj`, which is what is
    available at the debugger prompt.
    """
    # NOTE(review): pickle.load executes arbitrary code from the file; only use on trusted output.
    with open('Output/Test1/Pickle/Test1.fastq_1.fq.pickle', 'rb') as Input:
        obj = pickle.load(Input)
        set_trace()
 48 | 
 49 | 
def CheckSearch():
    """Build a barcode lookup from the reference file, then count reads in 'test.seq' with a known barcode.

    The file at sRef_fa is read as line pairs: an odd line '>barcode:target_region'
    followed by an even line holding the reference sequence. Every 26-base window
    of each read in 'test.seq' is looked up in the resulting dict; at most one
    hit per read is counted, and the total is printed.
    """
    dRef = {}
    dResult = {}

    with open(sRef_fa) as Ref:

        iCount         = 0
        sBarcode       = ""
        sTarget_region = ""

        for sRow in Ref:
            iCount += 1

            if iCount % 2 != 0:
                #      barcode               target region
                # >CGCTCTACGTAGACA:CTCTATTACTCGCCCCACCTCCCCCAGCCC
                sBarcode_indel_seq = sRow.strip().replace('\n', '').replace('\r', '').split(':')
                sBarcode = sBarcode_indel_seq[0].replace('>', '')
                sTarget_region = sBarcode_indel_seq[1]

                ## Reverse the sentence. If it is done, all methods are same before work.
                if sBarcode_PAM_pos == 'Reverse':
                    sBarcode = sBarcode[::-1]
                    sTarget_region = sTarget_region[::-1]

            elif iCount % 2 == 0:
                ## Reverse
                sRef_seq = sRow.strip().replace('\n', '').replace('\r', '')

                if sBarcode_PAM_pos == 'Reverse':
                    sRef_seq = sRef_seq[::-1]

                # The lookahead pattern also finds overlapping occurrences of the target region.
                Seq_matcher = re.compile(r'(?=(%s))' % sTarget_region)
                # iIndel_start_pos       = sRef_seq.index(sTarget_region)               # There is possible to exist two indel.
                iIndel_start_pos = Seq_matcher.finditer(sRef_seq)

                # Each iteration overwrites the value, so the start of the LAST match is kept.
                # NOTE(review): with no match at all the name still holds the iterator and the
                # addition below raises TypeError -- confirm whether that case can occur.
                for i, match in enumerate(iIndel_start_pos):
                    iIndel_start_pos = match.start()
                # print iIndel_start_pos
                # print len(sTarget_region)
                # print sRef_seq
                iIndel_end_pos = iIndel_start_pos + len(sTarget_region) - 1

                try:
                    iBarcode_start_pos = sRef_seq.index(sBarcode)

                    #if iIndel_start_pos <= iBarcode_start_pos:
                    #    print(iIndel_start_pos, iBarcode_start_pos)
                    #    raise IndexError('indel is before barcode')

                    iBarcode_end_pos = iBarcode_start_pos + len(sBarcode) - 1
                    sRef_seq_after_barcode = sRef_seq[iBarcode_end_pos + 1:]

                    # modified. to -1
                    iIndel_end_next_pos_from_barcode_end = iIndel_end_pos - iBarcode_end_pos - 1

                    iIndel_start_next_pos_from_barcode_end = iIndel_start_pos - iBarcode_end_pos - 1

                    #  "barcode"-------------*(N) that distance.
                    #          ^  ^            ^
                    #   *NNNN*NNNN
                    #    ^    ^     indel pos, the sequence matcher selects indel event pos front of it.

                    dRef[sBarcode] = (sRef_seq, sTarget_region, sRef_seq_after_barcode, iIndel_start_next_pos_from_barcode_end,
                                      iIndel_end_next_pos_from_barcode_end, iIndel_start_pos,iIndel_end_pos)  # total matched reads, insertion, deletion, complex
                    dResult[sBarcode] = [0, 0, 0, 0, [], [], [], [], []]
                except ValueError:
                    # Barcode not present in the reference sequence: skip this entry.
                    continue

    with open('test.seq') as Input:
        
        iBarcode_matched = 0

        for sSeq in Input:
            sSeq = sSeq.replace('\n','') 

            # Every 26-base window of the read (26 is hard-coded here as the window length).
            listSeqWindow = [sSeq[i:i + 26] for i in range(len(sSeq))[:-25]]

            iNeedle_matched  = 0
            iInsert_count    = 0
            iDelete_count    = 0
            iComplex_count   = 0

            intFirstBarcode  = 0 ## check whether a barcode is one in a sequence.

            for strSeqWindow in listSeqWindow:

                if intFirstBarcode == 1: break ## A second barcode in a sequence is not considerable.

                try:
                    lCol_ref = dRef[strSeqWindow]
                    sBarcode = strSeqWindow
                    intFirstBarcode = 1
                except KeyError:
                    continue

                # Reached only for the first window that is a known barcode.
                iBarcode_matched += 1

        print(iBarcode_matched)
149 | 
150 | 
def CheckNeedle():
    """Align two hard-coded sequences with the external `needle` program and print the result.

    Both sequences are fed to needle as FASTA through bash process
    substitution; the output is parsed with AlignIO in 'emboss' format.
    """
    strRefAfterBarcode   = 'TACAGTGTTTTTTTTTTTTCAGAGGAAGCTTGGCGTAACTAGATCT'
    strQueryAfterBarcode = 'TACAGTGTTTTTTTTTTTCAGAGGAAGCTTGGCGTAACTAGATCTCTACTCTACCACTTGTACTTCAGCGGTCAGCTTACTCGACTTAATA'

    strRefInput   = r'<(echo -e ">{name}\n{seq}")'.format(name='Ref', seq=strRefAfterBarcode)
    strQueryInput = r'<(echo -e ">{name}\n{seq}")'.format(name='Query', seq=strQueryAfterBarcode)

    listNeedleArgs = [strRefInput, strQueryInput, '20', '1', '20', '1']
    strNeedleCmd   = r"/bin/bash -c 'needle -filter {0} {1} -outfile stdout -gapopen {2} -gapextend {3} -endweight Y -endopen {4} -endextend {5}'".format(*listNeedleArgs)

    NeedleProc = sp.Popen(strNeedleCmd, stdout=sp.PIPE, stderr=sp.PIPE, universal_newlines=True,shell=True)

    lResult = []
    for Record in AlignIO.read(NeedleProc.stdout, "emboss"):
        lResult.append(Record.seq._data)
    print(lResult)
165 | 
166 | 
def LoggingTest():
    """Configure logging to 'test.log' (append) and stdout, then emit one INFO record."""
    import logging
    logging.basicConfig(format='%(process)d %(levelname)s %(asctime)s : %(message)s',
                    level=logging.DEBUG, filename='test.log', filemode='a'
                    )
    # Mirror every record to stdout in addition to the log file.
    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))

    logging.info('test')
    # `a` is read before it is ever bound, so this line raises UnboundLocalError.
    # NOTE(review): presumably deliberate, to see how an uncaught error shows up -- confirm.
    a = a * 10
176 | 
177 | 
def Main():
    """Run the currently selected debugging helper; uncomment another call to switch."""
    #CountBar()
    #ExtractFastq()
    #LoadPickle()
    #CheckSearch()
    #CheckNeedle()
    LoggingTest()
185 | 
186 | Main()
187 | 


--------------------------------------------------------------------------------
/Indel_searcher_2/Flash_pair_read_merge.py:
--------------------------------------------------------------------------------
 1 | import os, sys
 2 | import subprocess as sp
 3 | 
 4 | strUser    = sys.argv[1]
 5 | strProject = sys.argv[2]
 6 | strFlash   = sys.argv[3]
 7 | strThread  = sys.argv[4]
 8 | 
 9 | 
def RunFlash():
    """Merge the paired-end FASTQ files of every sample folder with FLASH.

    For each sub-folder of ./Input/{user}/FASTQ/{project} the files whose
    names contain '_1.fastq.gz' / '_2.fastq.gz' are merged; FLASH output goes
    to ./Output/{user}/{project}/Log/flash.log. Afterwards the *.fastq.gz
    inputs are moved up to the project folder and the *hist* and
    *notCombined* by-products are deleted from the sample folder.

    Sample folders that do not contain a read pair are skipped with a message.
    """
    strFlashDir   = '../{flash}'.format(flash=strFlash)
    strProjectDir = './Input/{user}/FASTQ/{project}'.format(user=strUser, project=strProject)

    for strSampleDir in os.listdir(strProjectDir):
        strSamplePath = os.path.join(strProjectDir, strSampleDir)

        if not os.path.isdir(strSamplePath):
            continue

        # os.listdir order is arbitrary; sorting puts the '_1' file before its '_2' mate.
        listPairFiles = sorted(strFile for strFile in os.listdir(strSamplePath)
                               if '_1.fastq.gz' in strFile or '_2.fastq.gz' in strFile)

        if len(listPairFiles) < 2:
            # e.g. a sample that was merged before: its inputs were already moved away.
            print('skip {sample_path}: no *_1.fastq.gz / *_2.fastq.gz pair found'.format(sample_path=strSamplePath))
            continue

        strForward = os.path.join(strSamplePath, listPairFiles[0])
        strReverse = os.path.join(strSamplePath, listPairFiles[1])
        strOutput  = os.path.join(strSamplePath, listPairFiles[0].replace('_1.fastq.gz', ''))

        strLog = './Output/{user}/{project}/Log'.format(user=strUser,
                                                        project=strProject)

        if not os.path.isdir(strLog): os.makedirs(strLog)

        strCmd = '{flash_dir}/flash -m 10 -M 400 -O -o {output} -t {thread} {r1} {r2} >{log}/flash.log 2>&1 '.format(
            flash_dir=strFlashDir,
            output=strOutput,
            thread=strThread,
            r1=strForward,
            r2=strReverse,
            log=strLog)

        print(strCmd)
        sp.call(strCmd, shell=True)
        print('complete, {fow} {rev} are moved to project folder'.format(fow=listPairFiles[0], rev=listPairFiles[1]))
        # Only `mv` takes the project folder as destination. Passing it to `rm`
        # made rm fail on the directory, so the && chain stopped before the
        # notCombined files were removed.
        sp.call('mv {sample_path}/*.fastq.gz {project_dir} &&'
                ' rm {sample_path}/*hist* &&'
                ' rm {sample_path}/*notCombined*'.format(sample_path=strSamplePath,
                                                         project_dir=strProjectDir), shell=True)
50 | 
51 | 
def Main():
    """Script entry point: run the FLASH merge."""
    RunFlash()
54 | 
55 | 
56 | Main()
57 | 


--------------------------------------------------------------------------------
/Indel_searcher_2/Indel_frequency_calculator.py:
--------------------------------------------------------------------------------
 1 | import os, sys, logging
 2 | 
 3 | from pdb import set_trace
 4 | from datetime import datetime
 5 | from collections import OrderedDict
 6 | from collections import namedtuple as nt
 7 | 
 8 | strOutputDir = sys.argv[1]
 9 | strSample    = sys.argv[2]
10 | strLogPath   = sys.argv[3]
11 | 
12 | logging.basicConfig(format='%(process)d %(levelname)s %(asctime)s : %(message)s',
13 |                     level=logging.DEBUG,
14 |                     filename=strLogPath,
15 |                     filemode='a')
16 | logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
17 | 
18 | 
def MakeIndelSummary():

    """
    Summarise {outdir}/Tmp/{sample}_Indel_freq.txt per barcode into
    {outdir}/Tmp/{sample}_Indel_summary.txt.

    Input
    TTTGCAGAGTATATCACACCATATCA AGTCAGACAAGGAGCACCACACGGTGGAGCTTGGCGTAACTAGATCTCTACTCTACCACTTGTACTTCAGCGGTCAGCTTACTCGACTTAA 17M1I 0.134 AGTCAGACAAGGAGCAC-ACACGGTGGAGCTTGGCGTAACTAGATCTCTACTCTACCACTTGTACTTCAGCGGTCAGCTTACTC------- AGTCAGACAAGGAGCACCACACGGTGGAGCTTGGCGTAACTAGATCTCTACTCTACCACTTGTACTTCAGCGGTCAGCTTACTCGACTTAA

    0: barcode
    1: target region
    2: indel pos
    3: total freq
    4: ref seq
    5: query seq

    Output (tab separated: barcode, row count, 'indel:count:percent, ' per indel)
    TTTGTCTCGTACACTCGTATGCTGCA 2 18M2D:1:50.0, 24M1I:1:50.0
    TTTGACATCTACAGTGTCTCTCCACA 2 22M1I:2:100.0

    Barcodes are written by descending row count, indels by descending count;
    ties keep the order of first appearance in the input.
    """

    with open('{outdir}/Tmp/{sample}_Indel_freq.txt'.format(sample=strSample, outdir=strOutputDir)) as InputFreq,\
        open('{outdir}/Tmp/{sample}_Indel_summary.txt'.format(sample=strSample, outdir=strOutputDir), 'w') as OutputFreq:

        dictINDEL = OrderedDict()  ## {'TTTGACATCTACAGTGTCTCTCCACA': {'22M1I': 2, ...}}

        for strRow in InputFreq:
            listCol = strRow.replace('\n', '').split('\t')
            if len(listCol) < 3:
                continue  ## blank or truncated row: there is no indel column to count

            strBarcode = listCol[0]
            strIndel   = listCol[2]

            dictCount = dictINDEL.setdefault(strBarcode, OrderedDict())
            dictCount[strIndel] = dictCount.get(strIndel, 0) + 1

        list2Result = []
        for strBarcode, dictCount in dictINDEL.items():
            intTotal = sum(dictCount.values())
            ## Round the percentage itself: round(p, 3) * 100 printed float noise such as 28.899999999999995
            list2INDEL = [[strIndel, intCount, round(100.0 * intCount / intTotal, 1)]
                          for strIndel, intCount in dictCount.items()]
            list2INDEL.sort(key=lambda x: x[1], reverse=True)
            strIndelResult = ''.join([':'.join(map(str, listINDEL)) + ', ' for listINDEL in list2INDEL])
            list2Result.append([strBarcode, intTotal, strIndelResult])

        for listResult in sorted(list2Result, key=lambda x: x[1], reverse=True):
            OutputFreq.write('\t'.join(map(str, listResult)) + '\n')
71 | 
72 | 
73 | if __name__ == '__main__':
74 |     logging.info('Indel frequency calculator start: %s' % str(datetime.now()))
75 |     MakeIndelSummary()
76 |     logging.info('Indel frequency calculator end: %s' % str(datetime.now()))
77 | 


--------------------------------------------------------------------------------
/Indel_searcher_2/Indel_normalization.py:
--------------------------------------------------------------------------------
  1 | import os, sys, logging
  2 | 
  3 | from pdb import set_trace
  4 | 
  5 | import pandas as pd
  6 | 
  7 | sys.path.insert(0, os.path.dirname(os.getcwd()))
  8 | from Core.CoreSystem import SplitSampleInfo, AttachSeqToIndel, Helper
  9 | 
 10 | logging.basicConfig(format='%(process)d %(levelname)s %(asctime)s : %(message)s',
 11 |                     level=logging.INFO)
 12 | 
 13 | 
 14 | strProjectFile    = sys.argv[1]
 15 | strUserName       = sys.argv[2]
 16 | strProjectName    = sys.argv[3]
 17 | 
 18 | """
 19 | /media/hkim/Pipeline/Indel_searcher_2/miniconda2/bin/python ./Indel_normalization.py User/JaeWoo/JaeWoo_test_samples.txt JaeWoo JaeWoo_test_samples
 20 | """
 21 | 
 22 | def MakeIndelSeqDict():
 23 | 
 24 |     """
 25 |     dictD0Indel:
 26 |     {'sample_*': {'ACGATCGAT': {'Total': 300, {'ACGATCGAT_30M2I_AG': {'IndelCount': 3}}}}}}}
 27 | 
 28 |     validation
 29 |     ./Output/JaeWoo/JaeWoo_test_samples/190819_Nahye_12K_D7_2_D0_1-Cas9D7/Tmp
 30 |     grep TTTGGATCGTCTATCGTCG 190819_Nahye_12K_D7_2_D0_1-Cas9D7_Indel_freq.txt | grep 18M16D | wc -l
 31 |     -> Indel count
 32 |     """
 33 | 
 34 |     dictD0Indel  = {}
 35 |     dictExpIndel = {}
 36 | 
 37 |     with open(strProjectFile) as SampleList:
 38 | 
 39 |         for strSample in SampleList:
 40 |             print(strSample)
 41 | 
 42 |             tupSampleInfo = SplitSampleInfo(strSample)
 43 |             if not tupSampleInfo: continue
 44 |             strSample, strRef, strExpCtrl = tupSampleInfo
 45 | 
 46 |             if strExpCtrl == 'CTRL':
 47 |                 dictD0Indel[strSample] = {}
 48 |             elif strExpCtrl == 'EXP':
 49 |                 dictExpIndel[strSample] = {}
 50 | 
 51 |             with open('./Output/{user}/{project}/{sample}/Tmp/{sample}_Indel_freq.txt'.format(
 52 |                       user=strUserName, project=strProjectName, sample=strSample)) as IndelFreq,\
 53 |                 open('./Output/{user}/{project}/{sample}/Result/{sample}_Summary_result.tsv'.format(
 54 |                       user=strUserName, project=strProjectName, sample=strSample)) as TotalResult:
 55 | 
 56 |                 for strRow in IndelFreq:
 57 |                     listCol      = strRow.replace('\n','').split('\t')
 58 |                     strBarcode   = listCol[0]
 59 |                     strIndelPos  = listCol[2]
 60 |                     strRefseq    = listCol[4]
 61 |                     strQueryseq  = listCol[5]
 62 | 
 63 |                     if strExpCtrl == 'CTRL':
 64 |                         AttachSeqToIndel(strSample, strBarcode, strIndelPos, strRefseq, strQueryseq, dictD0Indel)
 65 |                     elif strExpCtrl == 'EXP':
 66 |                         AttachSeqToIndel(strSample, strBarcode, strIndelPos, strRefseq, strQueryseq, dictExpIndel)
 67 | 
 68 |                 TotalResult.readline() ## skip header
 69 |                 for strRow in TotalResult:
 70 |                     listCol    = strRow.replace('\n', '').split('\t')
 71 |                     strBarcode = listCol[0]
 72 |                     intTotal   = int(listCol[1])
 73 | 
 74 |                     try:
 75 |                         dictD0Indel[strSample][strBarcode]['Total'] = intTotal
 76 |                     except KeyError:
 77 |                         pass
 78 | 
 79 |                     try:
 80 |                         dictExpIndel[strSample][strBarcode]['Total'] = intTotal
 81 |                     except KeyError:
 82 |                         pass
 83 | 
 84 |         #set_trace()
 85 |         #print(dictSub.items())#
 86 |     return (dictD0Indel, dictExpIndel)
 87 | 
 88 | 
 89 | def MakeTmp(dictD0Indel, dictExpIndel):
 90 | 
 91 |     for dictIndel in [dictD0Indel, dictExpIndel]:
 92 |         for strSample, dictBarcode in dictIndel.items():
 93 |             strTmpDir = './Output/{user}/{project}/{sample}/Tmp'.format(user=strUserName,
 94 |                                                                         project=strProjectName,
 95 |                                                                         sample=strSample)
 96 |             with open(os.path.join(strTmpDir, strSample+'_indel_seq_count.txt'), 'w') as Output:
 97 |                 for strBarcode, dictCountTotalAndIndel in dictBarcode.items():
 98 |                     for strIndelSeq, dictCount in dictCountTotalAndIndel.items():
 99 |                         if strIndelSeq == 'Total': continue
100 |                         Output.write('\t'.join([strIndelSeq, str(dictCount['IndelCount'])])+'\n')
101 | 
102 | 
def MergeD0SampleResults(dictD0Indel):

    """
    Sum the control samples into one dict keyed by barcode.

    dictD0Indel:
    {'sample_*': {'ACGATCGAT': {'Total': 300, 'ACGATCGAT:30M2I_AG': {'IndelCount': 3}}}}

    -> 'Total' and every 'IndelCount' are added up over all samples

    returns:
    {'ACGATCGAT': {'Total': 600, 'ACGATCGAT:30M2I_AG': {'IndelCount': 5}}}

    The input dicts are not modified. Raises KeyError when a barcode entry
    has no 'Total'.
    """

    dictMerged = {}

    for dictSample in dictD0Indel.values():
        for strBarcode in dictSample:
            dictEntry = dictSample[strBarcode]

            if strBarcode in dictMerged:
                dictMerged[strBarcode]['Total'] += dictEntry['Total']
            else:
                dictMerged[strBarcode] = {'Total': dictEntry['Total']}

            dictTarget = dictMerged[strBarcode]

            for strIndelSeq in dictEntry:  ## e.g. 'TTTGAGCATATCACACGAT:33M1D_T' -> {'IndelCount': 0}
                if strIndelSeq == 'Total': continue

                intCount = dictEntry[strIndelSeq]['IndelCount']

                if strIndelSeq in dictTarget:
                    dictTarget[strIndelSeq]['IndelCount'] += intCount
                else:
                    dictTarget[strIndelSeq] = {'IndelCount': intCount}

    return dictMerged
136 | 
137 | 
def SubtractIndelWithD0(dictD0IndelMerge, dictExpIndel):

    """
    Write, per experiment sample, each indel proportion minus the merged control proportion.

    Output: ./Output/{user}/{project}/All_results/D0SubResult/{sample}_D0SubResult.txt
    Columns: Barcode_indel_seq, D0_total, D0_indel_prop, Exp_total, Exp_indel_prop, D0_sub_indel_prop

    Proportions are count / total rounded to 6 decimals. A negative difference
    is written as 0. When the control has no entry for the barcode or the
    indel, the D0 columns are 'None' and the experiment proportion is written
    unchanged.
    """
    strD0SubResultDir = './Output/{user}/{project}/All_results/D0SubResult'.format(user=strUserName, project=strProjectName)
    Helper.MakeFolderIfNot(strD0SubResultDir)

    for strSample, dictBarcode in dictExpIndel.items():
        # Format only the file name: formatting the joined path would also
        # interpret any '{' or '}' that is part of the directory.
        strOutputPath = os.path.join(strD0SubResultDir, '{sample}_D0SubResult.txt'.format(sample=strSample))

        with open(strOutputPath, 'w') as Output:
            Output.write('Barcode_indel_seq\tD0_total\tD0_indel_prop\tExp_total\tExp_indel_prop\tD0_sub_indel_prop\n')

            for strBarcode, dictCountTotalAndIndel in dictBarcode.items():

                intExpTotal = dictCountTotalAndIndel['Total']

                for strIndelSeq, dictCount in dictCountTotalAndIndel.items():
                    if strIndelSeq == 'Total': continue

                    intExpCount = dictCount['IndelCount']
                    floExpProp  = round(intExpCount / float(intExpTotal), 6)

                    # Only the control lookup may legitimately miss.
                    try:
                        intD0Total = dictD0IndelMerge[strBarcode]['Total']
                        intD0Count = dictD0IndelMerge[strBarcode][strIndelSeq]['IndelCount']
                    except KeyError:
                        Output.write('\t'.join(map(str, [strIndelSeq, 'None', 'None',
                                                         intExpTotal, floExpProp, floExpProp]))+'\n')
                        continue

                    floD0Prop = round(intD0Count / float(intD0Total), 6)

                    floSubExpIndel = floExpProp - floD0Prop
                    if floSubExpIndel < 0:
                        floSubExpIndel = 0

                    Output.write('\t'.join(map(str, [strIndelSeq,intD0Total, floD0Prop,
                                                     intExpTotal, floExpProp, floSubExpIndel]))+'\n')
178 | 
179 | 
def Main():
    """Run the indel normalization steps in order, logging the name of each step."""
    logging.info("Indel normalization Start")

    logging.info("MakeIndelSeqDict")
    dictD0Indel, dictExpIndel = MakeIndelSeqDict()

    logging.info("MakeTmp")
    MakeTmp(dictD0Indel, dictExpIndel)

    logging.info("MergeD0SampleResults")
    dictMergedD0 = MergeD0SampleResults(dictD0Indel)

    logging.info("SubtractIndelWithD0")
    SubtractIndelWithD0(dictMergedD0, dictExpIndel)

    logging.info("Indel normalization End")
191 | 
192 | 
193 | Main()
194 | 
195 | 


--------------------------------------------------------------------------------
/Indel_searcher_2/Indel_normalization.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/CRISPRJWCHOI/CRISPR_toolkit/e903e4476dad29b9b0a1301019ec023308355eb5/Indel_searcher_2/Indel_normalization.pyc


--------------------------------------------------------------------------------
/Indel_searcher_2/Indel_searcher_crispresso_hash.py:
--------------------------------------------------------------------------------
  1 | import os, re, sys, logging
  2 | 
  3 | import numpy as np
  4 | import subprocess as sp
  5 | import cPickle as pickle
  6 | 
  7 | from pdb import set_trace
  8 | 
  9 | sys.path.insert(0, os.path.dirname(os.getcwd()))
 10 | from Core.CoreSystem import CoreHash, CoreGotoh
 11 | 
 12 | 
 13 | class clsParameter(object):
 14 | 
 15 |     def __init__(self):
 16 | 
 17 |         if len(sys.argv) > 1:
 18 |             self.strForwardFqPath = sys.argv[1]
 19 |             self.strReverseFqPath = sys.argv[2]
 20 |             self.strRefFa         = sys.argv[3]
 21 |             self.strPair          = sys.argv[4]
 22 |             self.floOg            = float(sys.argv[5])
 23 |             self.floOe            = float(sys.argv[6])
 24 |             self.intInsertionWin  = int(sys.argv[7])
 25 |             self.intDeletionWin   = int(sys.argv[8])
 26 |             self.strPamType       = sys.argv[9].upper()  ## Cpf1, Cas9
 27 |             self.strBarcodePamPos = sys.argv[10]  ## PAM - BARCODE type (reverse) or BARCODE - PAM type (forward)
 28 |             self.intQualCutoff    = int(sys.argv[11])
 29 |             self.strOutputdir     = sys.argv[12]
 30 |             self.strLogPath       = sys.argv[13]
 31 |             self.strEDNAFULL      = os.path.abspath('../EDNAFULL')
 32 | 
 33 |         else:
 34 |             sManual = """
 35 |             Usage:
 36 | 
 37 |             python2.7 ./indel_search_ver1.0.py splitted_input_1.fq splitted_input_2.fq reference.fa
 38 | 
 39 |             splitted_input_1.fq : forward
 40 |             splitted_input_2.fq : reverse
 41 | 
 42 |             Total FASTQ(fq) lines / 4 = remainder 0.
 43 |             """
 44 |             print(sManual)
 45 |             sys.exit()
 46 | 
 47 | 
 48 | class clsFastqOpener(object):
 49 | 
 50 |     def __init__(self, InstParameter):
 51 | 
 52 |         self.strForwardFqPath = InstParameter.strForwardFqPath
 53 |         self.strReverseFqPath = InstParameter.strReverseFqPath
 54 | 
 55 |     def OpenFastqForward(self):
 56 | 
 57 |         listFastqForward = []
 58 |         listStore        = []
 59 | 
 60 |         with open(self.strForwardFqPath) as Fastq1:
 61 | 
 62 |             for i, strRow in enumerate(Fastq1):
 63 | 
 64 |                 i = i + 1
 65 |                 strRow = strRow.replace('\n', '').upper()
 66 | 
 67 |                 if i % 4 == 1 or i % 4 == 2:
 68 |                     listStore.append(strRow)
 69 |                 elif i % 4 == 0:
 70 |                     listQual = [ord(i) - 33 for i in strRow]
 71 |                     listStore.append(listQual)
 72 |                     listFastqForward.append(tuple(listStore))
 73 |                     listStore = []
 74 | 
 75 |         return listFastqForward
 76 | 
 77 |     def OpenFastqReverse(self):
 78 | 
 79 |         listFastqReverse = []
 80 |         listStore        = []
 81 | 
 82 |         dictRev = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'}
 83 | 
 84 |         #with open('./6_AsD0_2_small_test.fq') as fa_2:
 85 |         with open(self.strReverseFqPath) as Fastq2:
 86 | 
 87 |             for i, strRow in enumerate(Fastq2):
 88 |                 i = i + 1
 89 |                 strRow = strRow.replace('\n', '').upper()
 90 | 
 91 |                 if i % 4 == 1:
 92 |                     listStore.append(strRow)
 93 |                 elif i % 4 == 2:
 94 |                     listStore.append(''.join([dictRev[strNucle] for strNucle in strRow[::-1]]))
 95 |                 elif i % 4 == 0:
 96 |                     listQual = [ord(i) - 33 for i in strRow][::-1]
 97 |                     listStore.append(listQual)
 98 |                     listFastqReverse.append(tuple(listStore))
 99 |                     listStore = []
100 | 
101 |         return listFastqReverse
102 |         #end1: return
103 |     #end: def
104 | 
105 | 
106 | class clsIndelSearchParser(object):
107 | 
108 |     def __init__(self, InstParameter):
109 | 
110 |         # index name, constant variable.
111 |         self.intNumOfTotal = 0
112 |         self.intNumOfIns   = 1
113 |         self.intNumOfDel   = 2
114 |         self.intNumofCom   = 3
115 |         self.intTotalFastq = 4
116 |         self.intInsFastq   = 5
117 |         self.intDelFastq   = 6
118 |         self.intComFastq   = 7
119 |         self.intIndelInfo  = 8
120 | 
121 |         self.strRefFa        = InstParameter.strRefFa
122 |         self.floOg           = InstParameter.floOg
123 |         self.floOe           = InstParameter.floOe
124 |         self.strEDNAFULL     = InstParameter.strEDNAFULL
125 |         self.strPamType      = InstParameter.strPamType
126 |         self.intInsertionWin = InstParameter.intInsertionWin
127 |         self.intDeletionWin  = InstParameter.intDeletionWin
128 |         self.intQualCutoff   = InstParameter.intQualCutoff
129 | 
130 |     def SearchBarcodeIndelPosition(self, sBarcode_PAM_pos):
131 | 
132 |         dRef    = {}
133 |         dResult = {}
134 | 
135 |         with open(self.strRefFa) as Ref:
136 | 
137 |             sBarcode       = ""
138 |             sTarget_region = ""
139 |             intBarcodeLen  = 0
140 | 
141 |             for i, sRow in enumerate(Ref):
142 | 
143 |                 if i % 2 == 0: ## >CGCTCTACGTAGACA:CTCTATTACTCGCCCCACCTCCCCCAGCCC
144 |                     sBarcode, sTarget_region, intBarcodeLen = self._SeperateFaHeader(sRow, sBarcode, sTarget_region,
145 |                                                                                     intBarcodeLen, sBarcode_PAM_pos)
146 | 
147 |                 elif i % 2 != 0: ## AGCATCGATCAGCTACGATCGATCGATCACTAGCTACGATCGATCA
148 |                     sRef_seq, iIndel_start_pos, iIndel_end_pos = self._SearchIndelPos(sRow, sBarcode_PAM_pos, sTarget_region)
149 | 
150 |                     try:
151 |                         self._MakeRefAndResultTemplate(sRef_seq, sBarcode, iIndel_start_pos, iIndel_end_pos,
152 |                                                        sTarget_region, dRef, dResult)
153 |                     except ValueError:
154 |                         continue
155 | 
156 |         assert len(dRef.keys()) == len(dResult.keys())
157 | 
158 |         return dRef, dResult
159 |         # end1: return
160 | 
161 |     def _SeperateFaHeader(self, sRow, sBarcode, sTarget_region, intBarcodeLen, sBarcode_PAM_pos):
162 | 
163 |         #      barcode               target region
164 |         # >CGCTCTACGTAGACA:CTCTATTACTCGCCCCACCTCCCCCAGCCC
165 |         sBarcode_indel_seq = sRow.strip().replace('\n', '').replace('\r', '').split(':')
166 |         sBarcode           = sBarcode_indel_seq[0].replace('>', '')
167 | 
168 |         if intBarcodeLen > 0:
169 |             assert intBarcodeLen == len(sBarcode), 'All of the barcode lengths must be same.'
170 |         intBarcodeLen = len(sBarcode)
171 | 
172 |         sTarget_region = sBarcode_indel_seq[1]
173 | 
174 |         ## Reverse the sentence. If it is done, all methods are same before work.
175 |         if sBarcode_PAM_pos == 'Reverse':
176 |             sBarcode = sBarcode[::-1]
177 |             sTarget_region = sTarget_region[::-1]
178 | 
179 |         return (sBarcode, sTarget_region, intBarcodeLen)
180 | 
181 |     def _SearchIndelPos(self, sRow, sBarcode_PAM_pos, sTarget_region):
182 | 
183 |         sRef_seq = sRow.strip().replace('\n', '').replace('\r', '')
184 | 
185 |         if sBarcode_PAM_pos == 'Reverse':
186 |             sRef_seq = sRef_seq[::-1]
187 | 
188 |         Seq_matcher = re.compile(r'(?=(%s))' % sTarget_region)
189 |         # iIndel_start_pos       = sRef_seq.index(sTarget_region)               # There is possible to exist two indel.
190 |         iIndel_start_pos = Seq_matcher.finditer(sRef_seq)
191 | 
192 |         for i, match in enumerate(iIndel_start_pos):
193 |             iIndel_start_pos = match.start()
194 |         # print iIndel_start_pos
195 |         # print len(sTarget_region)
196 |         # print sRef_seq
197 |         iIndel_end_pos = iIndel_start_pos + len(sTarget_region) - 1
198 | 
199 |         return (sRef_seq, iIndel_start_pos, iIndel_end_pos)
200 | 
201 |     def _MakeRefAndResultTemplate(self, sRef_seq, sBarcode, iIndel_start_pos,
202 |                                  iIndel_end_pos, sTarget_region, dRef, dResult):
203 |         iBarcode_start_pos = sRef_seq.index(sBarcode)
204 | 
205 |         # if iIndel_start_pos <= iBarcode_start_pos:
206 |         #    print(iIndel_start_pos, iBarcode_start_pos)
207 |         #    raise IndexError('indel is before barcode')
208 | 
209 |         iBarcode_end_pos       = iBarcode_start_pos + len(sBarcode) - 1
210 |         sRef_seq_after_barcode = sRef_seq[iBarcode_end_pos + 1:]
211 | 
212 |         # modified. to -1
213 |         iIndel_end_next_pos_from_barcode_end   = iIndel_end_pos - iBarcode_end_pos - 1
214 |         iIndel_start_next_pos_from_barcode_end = iIndel_start_pos - iBarcode_end_pos - 1
215 | 
216 |         #  "barcode"-------------*(N) that distance.
217 |         #          ^  ^            ^
218 |         #   *NNNN*NNNN
219 |         #    ^    ^     indel pos, the sequence matcher selects indel event pos front of it.
220 | 
221 |         ## Result
222 |         dRef[sBarcode] = (sRef_seq, sTarget_region, sRef_seq_after_barcode, iIndel_start_next_pos_from_barcode_end,
223 |                           iIndel_end_next_pos_from_barcode_end, iIndel_start_pos, iIndel_end_pos)  # total matched reads, insertion, deletion, complex
224 |         dResult[sBarcode] = [0, 0, 0, 0, [], [], [], [], []]
225 | 
226 | 
227 |     def SearchIndel(self, lFASTQ=[], dRef = {}, dResult={}, sBarcode_PAM_pos=""):
228 | 
229 |         # lFASTQ : [(seq, qual),(seq, qual)]
230 |         # lRef   : [(ref_seq, ref_seq_after_barcode, barcode, barcode end pos, indel end pos, indel from barcode),(...)]
231 |         # dResult = [# of total, # of ins, # of del, # of com, [total FASTQ], [ins FASTQ], [del FASTQ], [com FASTQ]]
232 |         iCount = 0
233 |         intBarcodeLen = len(dRef.keys()[0])
234 |         #print('intBarcodeLen', intBarcodeLen)
235 | 
236 |         InstGotoh = CoreGotoh(strEDNAFULL=self.strEDNAFULL, floOg=self.floOg, floOe=self.floOe)
237 | 
238 |         for lCol_FASTQ in lFASTQ:
239 |             sName = lCol_FASTQ[0]
240 |             if sBarcode_PAM_pos == 'Reverse':
241 |                 sSeq  = lCol_FASTQ[1][::-1]
242 |                 lQual = lCol_FASTQ[2][::-1]
243 |             else:
244 |                 sSeq  = lCol_FASTQ[1]
245 |                 lQual = lCol_FASTQ[2]
246 | 
247 |             assert isinstance(sName, str) and isinstance(sSeq, str) and isinstance(lQual, list)
248 | 
249 |             listSeqWindow = CoreHash.MakeHashTable(sSeq, intBarcodeLen)
250 | 
251 |             iBarcode_matched = 0
252 |             iInsert_count    = 0
253 |             iDelete_count    = 0
254 |             iComplex_count   = 0
255 | 
256 |             intFirstBarcode  = 0 ## check whether a barcode is one in a sequence.
257 | 
258 |             for strSeqWindow in listSeqWindow:
259 | 
260 |                 if intFirstBarcode == 1: break ## A second barcode in a sequence is not considerable.
261 | 
262 |                 try:
263 |                     lCol_ref, sBarcode, intFirstBarcode = CoreHash.IndexHashTable(dRef, strSeqWindow, intFirstBarcode)
264 |                 except KeyError:
265 |                     continue
266 | 
267 |                 sRef_seq                      = lCol_ref[0]
268 |                 sTarget_region                = lCol_ref[1]
269 |                 iIndel_seq_len                = len(sTarget_region)
270 |                 sRef_seq_after_barcode        = lCol_ref[2]
271 |                 iIndel_start_from_barcode_pos = lCol_ref[3]
272 |                 iIndel_end_from_barcode_pos   = lCol_ref[4]
273 |                 try:
274 |                     if self.strPamType == 'CAS9':
275 |                         iKbp_front_Indel_end = iIndel_end_from_barcode_pos - 6  ## cas9:-6, cpf1:-4
276 |                     elif self.strPamType == 'CAF1':
277 |                         iKbp_front_Indel_end = iIndel_end_from_barcode_pos - 4  ## NN(N)*NNN(N)*NNNN
278 |                 except Exception:
279 |                     set_trace()
280 | 
281 |                 """
282 |                                                      *     ^ : iIndel_end_from_barcode_pos
283 |                                   GGCG   TCGCTCATGTACCTCCCGT
284 |                 TATAGTCTGTCATGCGATGGCG---TCGCTCATGTACCTCCCGTTACAGCCACAAAGCAGGA
285 |                      *
286 |                 GGCGTC GCTCATGTACCTCCCGT
287 |                   6          17 
288 |                 """
289 | 
290 |                 ## bug fix
291 |                 if sBarcode == "": continue
292 | 
293 |                 (sSeq, iBarcode_matched, sQuery_seq_after_barcode, lQuery_qual_after_barcode) = \
294 |                     self._CheckBarcodePosAndRemove(sSeq, sBarcode, iBarcode_matched, lQual)
295 | 
296 |                 ## Alignment Seq to Ref
297 |                 npGapIncentive = InstGotoh.GapIncentive(sRef_seq_after_barcode)
298 | 
299 |                 try:
300 |                     lResult = InstGotoh.RunCRISPResso2(sQuery_seq_after_barcode.upper(),
301 |                                                        sRef_seq_after_barcode.upper(),
302 |                                                        npGapIncentive)
303 |                 except Exception as e:
304 |                     logging.error(e, exc_info=True)
305 |                     continue
306 | 
307 |                 sQuery_needle_ori = lResult[0]
308 |                 sRef_needle_ori   = lResult[1]
309 | 
310 |                 sRef_needle, sQuery_needle            = self._TrimRedundantSideAlignment(sRef_needle_ori, sQuery_needle_ori)
311 |                 lInsertion_in_read, lDeletion_in_read = self._MakeIndelPosInfo(sRef_needle, sQuery_needle)
312 | 
313 |                 # print 'sQuery_needle', sQuery_needle
314 |                 # print 'lInsertion_in_read: onebase', lInsertion_in_read
315 |                 # print 'lDeletion_in_read: onebase', lDeletion_in_read
316 |                 # print 'i5bp_front_Indel_end', i5bp_front_Indel_end
317 |                 # print 'iIndel_end_from_barcode_pos', iIndel_end_from_barcode_pos
318 | 
319 |                 lTarget_indel_result = []  # ['20M2I', '23M3D' ...]
320 | 
321 |                 iInsert_count = self._TakeInsertionFromAlignment(lInsertion_in_read, iKbp_front_Indel_end, lTarget_indel_result,
322 |                                                                  iIndel_end_from_barcode_pos, iInsert_count)
323 | 
324 |                 iDelete_count = self._TakeDeletionFromAlignment(lDeletion_in_read, iKbp_front_Indel_end, lTarget_indel_result,
325 |                                                                 iIndel_end_from_barcode_pos, iDelete_count)
326 | 
327 |                 if iInsert_count == 1 and iDelete_count == 1:
328 |                     iComplex_count = 1
329 |                     iInsert_count = 0
330 |                     iDelete_count = 0
331 | 
332 |                 # """ test set
333 |                 # print 'sBarcode', sBarcode
334 |                 # print 'sTarget_region', sTarget_region
335 |                 # print 'sRef_seq_after_barcode', sRef_seq_after_barcode
336 |                 # print 'sSeq_after_barcode', sQuery_seq
337 |                 # print 'iIndel_start_from_barcode_pos', iIndel_start_from_barcode_pos
338 |                 # print 'iIndel_end_from_barcode_pos', iIndel_end_from_barcode_pos
339 |                 # """
340 | 
341 |                 listResultFASTQ = self._MakeAndStoreQuality(sName, sSeq, lQual, dResult, sBarcode)
342 | 
343 |                 """
344 |                 iQual_end_pos + 1 is not correct, because the position is like this.
345 |                 *NNNN*(N)
346 |                 So, '+ 1' is removed.
347 |                 Howerver, seqeunce inspects until (N) position. indel is detected front of *(N).
348 |                 """
349 |                 ################################################################
350 |                 #print(lTarget_indel_result)
351 |                 #set_trace()
352 |                 # len(sQuery_seq_after_barcode) == len(lQuery_qual_after_barcode)
353 |                 if np.mean(lQuery_qual_after_barcode[iIndel_start_from_barcode_pos : iIndel_end_from_barcode_pos + 1]) >= self.intQualCutoff: ## Quality cutoff
354 | 
355 |                     """
356 |                     23M3I
357 |                     23M is included junk_seq after barcode,
358 | 
359 |                     barcorde  junk   targetseq   others
360 |                     *********ACCCT-------------ACACACACC
361 |                     so should select target region.
362 |                     If junk seq is removed by target region seq index pos.
363 |                     """
364 |                     # filter start,
365 |                     iTarget_start_from_barcode   = sRef_seq_after_barcode.index(sTarget_region)
366 |                     lTrimmed_target_indel_result = self._FixPos(lTarget_indel_result, iTarget_start_from_barcode)
367 | 
368 |                     # print 'Check'
369 |                     # print sRef_seq_after_barcode
370 |                     # print sQuery_seq_after_barcode
371 |                     # print lTrimmed_target_indel_result
372 |                     # print('Trimmed', lTrimmed_target_indel_result)
373 | 
374 |                     sRef_seq_after_barcode, sQuery_seq_after_barcode = self._StoreToDictResult(sRef_seq_after_barcode, sQuery_seq_after_barcode, iTarget_start_from_barcode,
375 |                                                                        dResult, sBarcode, lTrimmed_target_indel_result, sTarget_region, sRef_needle_ori,
376 |                                                                        sQuery_needle_ori, iInsert_count, iDelete_count, iComplex_count, listResultFASTQ)
377 |                 else:
378 |                     iInsert_count  = 0
379 |                     iDelete_count  = 0
380 |                     iComplex_count = 0
381 | 
382 |                 # total matched reads, insertion, deletion, complex
383 |                 dResult[sBarcode][self.intNumOfTotal] += iBarcode_matched
384 |                 dResult[sBarcode][self.intNumOfIns] += iInsert_count
385 |                 dResult[sBarcode][self.intNumOfDel] += iDelete_count
386 |                 dResult[sBarcode][self.intNumofCom] += iComplex_count
387 | 
388 |                 iBarcode_matched = 0
389 |                 iInsert_count    = 0
390 |                 iDelete_count    = 0
391 |                 iComplex_count   = 0
392 | 
393 |             #End:for
394 |         #END:for
395 |         return dResult
396 | 
397 |     def _CheckBarcodePosAndRemove(self, sSeq, sBarcode, iBarcode_matched, lQual):
398 | 
399 |         # Check the barcode pos and remove it.
400 |         sSeq = sSeq.replace('\r', '')
401 |         iBarcode_start_pos_FASTQ = sSeq.index(sBarcode)
402 |         iBarcode_matched += 1
403 |         iBarcode_end_pos_FASTQ = iBarcode_start_pos_FASTQ + len(sBarcode) - 1
404 | 
405 |         """
406 |             junk seq  target region
407 |         ref: AGGAG    AGAGAGAGAGA
408 |         que: AGGAG    AGAGAGAGAGA
409 |         But, It doesnt know where is the target region because of existed indels.
410 |         So, There is no way not to include it.
411 |         """
412 |         # Use this.
413 |         sQuery_seq_after_barcode = sSeq[iBarcode_end_pos_FASTQ + 1:]
414 |         lQuery_qual_after_barcode = lQual[iBarcode_end_pos_FASTQ:]
415 | 
416 |         return (sSeq, iBarcode_matched, sQuery_seq_after_barcode, lQuery_qual_after_barcode)
417 | 
418 |     def _TrimRedundantSideAlignment(self, sRef_needle_ori, sQuery_needle_ori):
419 | 
420 |         # detach forward ---, backward ---
421 |         # e.g.    ref   ------AAAGGCTACGATCTGCG------
422 |         #         query AAAAAAAAATCGCTCTCGCTCTCCGATCT
423 |         # trimmed ref         AAAGGCTACGATCTGCG
424 |         # trimmed qeury       AAATCGCTCTCGCTCTC
425 |         iReal_ref_needle_start = 0
426 |         iReal_ref_needle_end = len(sRef_needle_ori)
427 |         iRef_needle_len = len(sRef_needle_ori)
428 | 
429 |         for i, sRef_nucle in enumerate(sRef_needle_ori):
430 |             if sRef_nucle in ['A', 'C', 'G', 'T']:
431 |                 iReal_ref_needle_start = i
432 |                 break
433 | 
434 |         for i, sRef_nucle in enumerate(sRef_needle_ori[::-1]):
435 |             if sRef_nucle in ['A', 'C', 'G', 'T']:
436 |                 iReal_ref_needle_end = iRef_needle_len - (i + 1)
437 |                 # forward 0 1 2  len : 3
438 |                 # reverse 2 1 0,  len - (2 + 1) = 0
439 |                 break
440 | 
441 |         sRef_needle = sRef_needle_ori[iReal_ref_needle_start:iReal_ref_needle_end + 1]
442 |         if iReal_ref_needle_start:
443 |             sQuery_needle = sQuery_needle_ori[:iReal_ref_needle_end]
444 |         sQuery_needle = sQuery_needle_ori[:len(sRef_needle)]
445 |         # detaching completion
446 |         return (sRef_needle, sQuery_needle)
447 | 
448 |     def _MakeIndelPosInfo(self, sRef_needle, sQuery_needle):
449 | 
450 |         # indel info making.
451 |         iNeedle_match_pos_ref   = 0
452 |         iNeedle_match_pos_query = 0
453 |         iNeedle_insertion       = 0
454 |         iNeedle_deletion        = 0
455 | 
456 |         lInsertion_in_read = []  # insertion result [[100, 1], [119, 13]]
457 |         lDeletion_in_read  = []  # deletion result  [[97, 1], [102, 3]]
458 | 
459 |         # print 'sRef_needle', sRef_needle
460 |         # print 'sQuery_needle', sQuery_needle
461 |         for i, (sRef_nucle, sQuery_nucle) in enumerate(zip(sRef_needle, sQuery_needle)):
462 | 
463 |             if sRef_nucle == '-':
464 |                 iNeedle_insertion += 1
465 | 
466 |             if sQuery_nucle == '-':
467 |                 iNeedle_deletion += 1
468 | 
469 |             if sRef_nucle in ['A', 'C', 'G', 'T']:
470 |                 if iNeedle_insertion:
471 |                     lInsertion_in_read.append([iNeedle_match_pos_ref, iNeedle_insertion])
472 |                     iNeedle_insertion = 0
473 |                 iNeedle_match_pos_ref += 1
474 | 
475 |             if sQuery_nucle in ['A', 'C', 'G', 'T']:
476 |                 if iNeedle_deletion:
477 |                     lDeletion_in_read.append([iNeedle_match_pos_query, iNeedle_deletion])
478 |                     iNeedle_match_pos_query += iNeedle_deletion
479 |                     iNeedle_deletion = 0
480 |                 iNeedle_match_pos_query += 1
481 |                 # print 'sRef_needle', sRef_needle
482 | 
483 |         return (lInsertion_in_read, lDeletion_in_read)
484 | 
485 | 
486 |     def _TakeInsertionFromAlignment(self, lInsertion_in_read, iKbp_front_Indel_end, lTarget_indel_result,
487 |                                     iIndel_end_from_barcode_pos, iInsert_count):
488 |         """
489 |         ins case
490 |         ...............................NNNNNNNNNNNNNN....NNNNNNNNNNNNNNNNNNN*NNNNNAGCTT
491 |         """
492 |         for iMatch_pos, iInsertion_pos in lInsertion_in_read:
493 |             if self.strPamType == 'CAS9':
494 |                 # if i5bp_front_Indel_end == iMatch_pos -1 or iIndel_end_from_barcode_pos == iMatch_pos -1: # iMatch_pos is one base # original ver
495 |                 if iKbp_front_Indel_end - self.intInsertionWin <= iMatch_pos - 1 <= iKbp_front_Indel_end + self.intInsertionWin:  # iMatch_pos is one base
496 |                     iInsert_count = 1
497 |                     lTarget_indel_result.append(str(iMatch_pos) + 'M' + str(iInsertion_pos) + 'I')
498 | 
499 |             elif self.strPamType == 'CPF1':
500 |                 if iKbp_front_Indel_end - self.intInsertionWin <= iMatch_pos - 1 <= iKbp_front_Indel_end + self.intInsertionWin or \
501 |                         iIndel_end_from_barcode_pos - self.intInsertionWin <= iMatch_pos - 1 <= iIndel_end_from_barcode_pos + self.intInsertionWin:  # iMatch_pos is one base
502 |                     iInsert_count = 1
503 |                     lTarget_indel_result.append(str(iMatch_pos) + 'M' + str(iInsertion_pos) + 'I')
504 | 
505 |         return iInsert_count
506 | 
507 |     def _TakeDeletionFromAlignment(self, lDeletion_in_read, iKbp_front_Indel_end, lTarget_indel_result,
508 |                                    iIndel_end_from_barcode_pos, iDelete_count):
509 | 
510 |         """
511 |         del case 1
512 |         ...............................NNNNNNNNNNNNNN....NNNNNNNNNNNNNNNNNNNNN**NNNAGCTT
513 |         del case 2
514 |         ...............................NNNNNNNNNNNNNN....NNNNNNNNNNNNNNNNNNNNN**NNNNNCTT
515 |         """
516 |         for iMatch_pos, iDeletion_pos in lDeletion_in_read:
517 |             """
518 |             Insertion: 30M3I
519 |                    ^
520 |             ACGT---ACGT
521 |             ACGTTTTACGT -> check this seq
522 |             Insertion just check two position
523 | 
524 |             Deletion: 30M3D
525 |                  ^
526 |             ACGTTTTACGT
527 |             ACGT---ACGT -> check this seq
528 |             But deletion has to includes overlap deletion.
529 |             """
530 |             if self.strPamType == 'CAS9':
531 |                 if (iMatch_pos - self.intDeletionWin - 1 <= iKbp_front_Indel_end and iKbp_front_Indel_end < (iMatch_pos + iDeletion_pos + self.intDeletionWin - 1)):
532 |                     iDelete_count = 1
533 |                     lTarget_indel_result.append(str(iMatch_pos) + 'M' + str(iDeletion_pos) + 'D')
534 |             elif self.strPamType == 'CPF1':
535 |                 if (iMatch_pos - self.intDeletionWin - 1 <= iKbp_front_Indel_end and iKbp_front_Indel_end < (iMatch_pos + iDeletion_pos + self.intDeletionWin - 1)) or \
536 |                    (iMatch_pos - self.intDeletionWin - 1 <= iIndel_end_from_barcode_pos and iIndel_end_from_barcode_pos < (iMatch_pos + iDeletion_pos + self.intDeletionWin - 1)):
537 |                     iDelete_count = 1
538 |                     lTarget_indel_result.append(str(iMatch_pos) + 'M' + str(iDeletion_pos) + 'D')
539 | 
540 |         return iDelete_count
541 | 
542 |     def _MakeAndStoreQuality(self, sName, sSeq, lQual, dResult, sBarcode):
543 |         listResultFASTQ = [sName, sSeq, '+', ''.join(chr(i + 33) for i in lQual)]
544 |         dResult[sBarcode][self.intTotalFastq].append(listResultFASTQ)
545 |         return listResultFASTQ
546 | 
547 |     def _FixPos(self, lTarget_indel_result, iTarget_start_from_barcode):
548 | 
549 |         lTrimmed_target_indel_result = []
550 | 
551 |         for sINDEL in lTarget_indel_result:
552 |             # B - A is not included B position, so +1
553 |             iMatch_target_start = int(sINDEL.split('M')[0]) - iTarget_start_from_barcode
554 |             """ This part determines a deletion range.
555 |                                       ^ current match pos                                           
556 |             AGCTACGATCAGCATCTGACTTACTTC[barcode]
557 | 
558 | 
559 |                            ^ fix the match start at here. (target region)                                           
560 |             AGCTACGATCAGCATC TGACTTACTTC[barcode]
561 | 
562 |             if iMatch_target_start < 0:
563 |                 sContinue = 1
564 | 
565 |             But, this method has some problems.
566 | 
567 |                            ^ barcode start
568 |             AGCTACGATCAGCAT*********C[barcode]
569 |             Like this pattern doesn't seleted. because, deletion checking is begun the target region start position. 
570 |             Thus, I have fixed this problem.
571 |             """
572 | 
573 |             if iMatch_target_start <= -(iTarget_start_from_barcode):
574 |                 # print(iMatch_target_start, iTarget_start_from_barcode)
575 |                 continue
576 | 
577 |             lTrimmed_target_indel_result.append(str(iMatch_target_start) + 'M' + sINDEL.split('M')[1])
578 |         # filter end
579 |         return lTrimmed_target_indel_result
580 | 
581 |     def _StoreToDictResult(self, sRef_seq_after_barcode, sQuery_seq_after_barcode, iTarget_start_from_barcode,
582 |                            dResult, sBarcode, lTrimmed_target_indel_result, sTarget_region, sRef_needle_ori, sQuery_needle_ori,
583 |                            iInsert_count, iDelete_count, iComplex_count, listResultFASTQ):
584 | 
585 |         sRef_seq_after_barcode   = sRef_seq_after_barcode[iTarget_start_from_barcode:]
586 |         sQuery_seq_after_barcode = sQuery_seq_after_barcode[iTarget_start_from_barcode:]
587 | 
588 |         dResult[sBarcode][self.intIndelInfo].append([sRef_seq_after_barcode, sQuery_seq_after_barcode, lTrimmed_target_indel_result,
589 |                                                      sTarget_region, sRef_needle_ori, sQuery_needle_ori])
590 |         if iInsert_count:
591 |             dResult[sBarcode][self.intInsFastq].append(listResultFASTQ)
592 |         elif iDelete_count:
593 |             dResult[sBarcode][self.intDelFastq].append(listResultFASTQ)
594 |         elif iComplex_count:
595 |             dResult[sBarcode][self.intComFastq].append(listResultFASTQ)
596 | 
597 |         return (sRef_seq_after_barcode, sQuery_seq_after_barcode)
598 | 
599 |     def CalculateIndelFrequency(self, dResult):
600 |         dResult_INDEL_freq = {}
601 | 
602 |         for sBarcode, lValue in dResult.items():  # lValue[gINDEL_info] : [[sRef_seq_after_barcode, sQuery_seq_after_barcode, lTarget_indel_result, sTarget_region], ..])
603 |             sRef_seq_loop = ''
604 |             llINDEL_store = []  # ['ACAGACAGA', ['20M2I', '23M3D']]
605 |             dINDEL_freq   = {}
606 | 
607 |             if lValue[self.intIndelInfo]:
608 |                 for sRef_seq_loop, sQuery_seq, lINDEL, sTarget_region, sRef_needle, sQuery_needle in lValue[self.intIndelInfo]: # llINDEL : [['20M2I', '23M3D'], ...]
609 |                     # print 'lINDEL', lINDEL
610 |                     for sINDEL in lINDEL:
611 |                         llINDEL_store.append([sQuery_seq, sINDEL, sRef_needle, sQuery_needle])
612 | 
613 |                 iTotal = len([lINDEL for sQuery_seq, lINDEL, sRef_needle, sQuery_needle in llINDEL_store])
614 | 
615 |                 for sQuery_seq, sINDEL, sRef_needle, sQuery_needle in llINDEL_store:
616 |                     dINDEL_freq[sINDEL] = [[], 0, [], []]
617 | 
618 |                 for sQuery_seq, sINDEL, sRef_needle, sQuery_needle in llINDEL_store:
619 |                     dINDEL_freq[sINDEL][1] += 1
620 |                     dINDEL_freq[sINDEL][0].append(sQuery_seq)
621 |                     dINDEL_freq[sINDEL][2].append(sRef_needle)
622 |                     dINDEL_freq[sINDEL][3].append(sQuery_needle)
623 | 
624 |                 for sINDEL in dINDEL_freq:
625 |                     lQuery        = dINDEL_freq[sINDEL][0]
626 |                     iFreq         = dINDEL_freq[sINDEL][1]
627 |                     lRef_needle   = dINDEL_freq[sINDEL][2]
628 |                     lQuery_needle = dINDEL_freq[sINDEL][3]
629 | 
630 |                     try:
631 |                         dResult_INDEL_freq[sBarcode].append([sRef_seq_loop, lQuery, sINDEL, float(iFreq) / iTotal,
632 |                                                              sTarget_region, lRef_needle, lQuery_needle])
633 |                     except (KeyError, TypeError, AttributeError) as e:
634 |                         dResult_INDEL_freq[sBarcode] = []
635 |                         dResult_INDEL_freq[sBarcode].append([sRef_seq_loop, lQuery, sINDEL, float(iFreq) / iTotal,
636 |                                                              sTarget_region, lRef_needle, lQuery_needle])
637 |             # end: if lValue[gINDEL_info]
638 |         # end: for sBarcode, lValue
639 |         return dResult_INDEL_freq
640 |         # end1: return
641 |     # end: def
642 | #END:class
643 | 
644 | 
class clsOutputMaker(object):
    """Stores the search result of one FASTQ file as a pickle file."""

    def __init__(self, InstParameter):
        # Only the output directory and the forward FASTQ path are kept.
        self.strOutputdir     = InstParameter.strOutputdir
        self.strForwardFqPath = InstParameter.strForwardFqPath

    def MakePickleOutput(self, dictResult, dictResultIndelFreq, strBarcodePamPos=''):
        """Dump the three results into <outdir>/Tmp/Pickle/<fastq file name>.pickle.

        The Tmp/Pickle directory is not created here; it must already exist.
        """
        strFqName     = os.path.basename(self.strForwardFqPath)
        strPicklePath = '{outdir}/Tmp/Pickle/{fq}.pickle'.format(outdir=self.strOutputdir, fq=strFqName)

        dictOutput = {
            'dictResult': dictResult,
            'dictResultIndelFreq': dictResultIndelFreq,
            'strBarcodePamPos': strBarcodePamPos,
        }

        with open(strPicklePath, 'wb') as PickleOut:
            pickle.dump(dictOutput, PickleOut)
660 | 
661 | 
def Main():
    """Search indels in one FASTQ file and store the result as a pickle.

    Steps: build the parameter object, send log records to the log file and to
    stdout, open the FASTQ input(s), run the search that belongs to the PAM
    type ('CPF1' or 'CAS9'), calculate the indel frequencies and write the
    pickle through clsOutputMaker. Any other PAM type is reported as an error,
    because no pickle is written in that case.
    """
    InstParameter = clsParameter()
    logging.basicConfig(format='%(process)d %(levelname)s %(asctime)s : %(message)s',
                        level=logging.DEBUG,
                        filename=InstParameter.strLogPath,
                        filemode='a')

    logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))

    logging.info('Program start : %s' % InstParameter.strForwardFqPath)

    logging.info('File Open')
    InstFileOpen     = clsFastqOpener(InstParameter)
    listFastqForward = InstFileOpen.OpenFastqForward()
    boolPaired       = InstParameter.strPair == 'True'
    if boolPaired:
        listFastqReverse = InstFileOpen.OpenFastqReverse()

    InstIndelSearch = clsIndelSearchParser(InstParameter)

    InstOutput = clsOutputMaker(InstParameter)

    if InstParameter.strPamType == 'CPF1':
        logging.info('Search barcode INDEL pos')
        dRef, dResult = InstIndelSearch.SearchBarcodeIndelPosition(InstParameter.strBarcodePamPos)  # ref check.

        logging.info('Search INDEL forward')
        dResultFinal = InstIndelSearch.SearchIndel(listFastqForward, dRef, dResult)

        if boolPaired:
            # The reverse reads are searched on top of the forward result.
            logging.info('Search INDEL reverse')
            dResultFinal = InstIndelSearch.SearchIndel(listFastqReverse, dRef, dResultFinal)

        logging.info('Calculate INDEL frequency')
        dictResultIndelFreq = InstIndelSearch.CalculateIndelFrequency(dResultFinal)

        logging.info('Make pickle output forward')
        InstOutput.MakePickleOutput(dResultFinal, dictResultIndelFreq)

    elif InstParameter.strPamType == 'CAS9':
        logging.info('Search barcode INDEL pos')
        dRef, dResult   = InstIndelSearch.SearchBarcodeIndelPosition(InstParameter.strBarcodePamPos)
        logging.info('Search INDEL')
        dResult_forward = InstIndelSearch.SearchIndel(listFastqForward, dRef, dResult, InstParameter.strBarcodePamPos)
        logging.info('Calculate INDEL frequency')
        dResult_INDEL_freq = InstIndelSearch.CalculateIndelFrequency(dResult_forward)

        logging.info('Make pickle output forward')
        InstOutput.MakePickleOutput(dResult_forward, dResult_INDEL_freq, InstParameter.strBarcodePamPos)

    else:
        # Without this branch an unknown PAM type ends "successfully" with no output at all.
        logging.error('Unsupported PAM type, no pickle output was made : %s' % InstParameter.strPamType)

    logging.info('Program end : %s' % InstParameter.strForwardFqPath)
#END:def
721 | 
722 | 
# Run the search only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    Main()
725 | 
726 | 
727 | 


--------------------------------------------------------------------------------
/Indel_searcher_2/Kill_jobs.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Force-kill (SIGKILL) the running indel searcher jobs.
#
# Confirm the jobs first with:
# ps aux | grep hkim | grep Run_indel_searcher | grep -v grep | less

# kill_matching PATTERN
# Kills every process whose "ps aux" line contains both "hkim" and PATTERN.
kill_matching() {
    local pids
    # "grep -v grep" drops the grep helpers of this pipeline; their own command
    # line contains PATTERN, so they would otherwise be listed (and be gone
    # already by the time kill runs).
    pids=$(ps aux | grep hkim | grep "$1" | grep -v grep | awk '{print $2}')

    # With no match, "kill -9" would be called without a PID and only print a usage error.
    if [ -n "$pids" ]; then
        kill -9 $pids
    fi
}

kill_matching Run_indel_searcher
kill_matching Indel_searcher_crispresso_hash
8 | 


--------------------------------------------------------------------------------
/Indel_searcher_2/Make_user_folder.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | user=JaeWoo
 4 | project=JaeWoo_test_samples
 5 | 
 6 | [ ! -d ./Input ] && { `mkdir ./Input`; }
 7 | [ ! -d ./User ] && { `mkdir ./User`; }
 8 | [ ! -d ./Output ] && { `mkdir ./Output`; }
 9 | 
10 | [ ! -d ./Input/${user} ] && { `mkdir ./Input/${user}`; }
11 | [ ! -d ./Input/${user}/FASTQ ] && { `mkdir ./Input/${user}/FASTQ`; }
12 | [ ! -d ./Input/${user}/FASTQ/${project} ] && { `mkdir ./Input/${user}/FASTQ/${project}`; }
13 | [ ! -d ./Input/${user}/Reference ] && { `mkdir ./Input/${user}/Reference`; }
14 | [ ! -d ./Input/${user}/Reference/${project} ] && { `mkdir ./Input/${user}/Reference/${project}`; }
15 | 
16 | [ ! -d ./User/${user} ] && { `mkdir ./User/${user}`; }
17 | > ./User/${user}/${project}.txt
18 | 


--------------------------------------------------------------------------------
/Indel_searcher_2/README.md:
--------------------------------------------------------------------------------
 1 | # Indel_searcher_2
 2 | Fast CRISPR indel search tool
 3 | 
 4 | ### Prerequisites to run
 5 | ```
 6 |  # install the miniconda2.
 7 |  https://repo.anaconda.com/miniconda/Miniconda2-latest-Linux-x86_64.sh
 8 | 
 9 |  # Run the conda package manager.
10 |  conda config --add channels defaults
11 |  conda config --add channels bioconda
12 |  conda config --add channels conda-forge
13 |  conda install CRISPResso2
14 |  
15 |  vi ~/.bashrc
16 |  export PATH=$PATH:/path/to/miniconda2/bin
17 | 
18 |  vi Make_user_folder.sh
19 |  # Modify the user name and project name.
20 |  user=JaeWoo
21 |  project=JaeWoo_test_samples
22 |  ./Make_user_folder.sh
23 | 
24 |  vi Run_cmd.sh
25 |  # Modify the parameters. The user and project name must be the same as that used in the 'Make_user_folder.sh'.
26 |  user=JaeWoo
27 |  project=JaeWoo_test_samples
28 |  pam_type=Cas9
29 |  pam_pos=Forward
30 |  thread=15
31 |  ./Run_cmd.sh
32 | ```
33 | 


--------------------------------------------------------------------------------
/Indel_searcher_2/Run_cmd.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | ####################
 4 | ## User parameter ##
 5 | ###################################
 6 | 
 7 | user=JaeWoo
 8 | project=JaeWoo_test_samples
 9 | pam_type=Cas9
10 | pam_pos=Forward
11 | thread=15
12 | 
13 | gap_open=-10 ## default
14 | gap_extend=1 ## default
15 | 
16 | ###################################
17 | 
18 | while read python_path;do
19 |     python=$python_path
20 | done < ../PythonPath.txt
21 | 
22 | [ ! -d ./Output/${user} ] && { `mkdir ./Output/${user}`; }
23 | [ ! -d ./Output/${user}/${project} ] && { `mkdir ./Output/${user}/${project}`; }
24 | [ ! -d ./Output/${user}/${project}/Log ] && { `mkdir ./Output/${user}/${project}/Log`; }
25 | 
26 | nohup $python ./Run_indel_searcher.py --python $python --user $user --project $project --pam_type $pam_type --pam_pos $pam_pos -t $thread > ./Output/${user}/${project}/Log/log.txt 2>&1 &
27 | 


--------------------------------------------------------------------------------
/Indel_searcher_2/Run_converter.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | 
 4 | ####################
 5 | ## User parameter ##
 6 | ####################################
 7 | 
 8 | user=JaeWoo
 9 | project=JaeWoo_test_samples
10 | 
11 | 
12 | ####################################
13 | 
14 | 
15 | 
16 | 
17 | 
18 | while read python_path;do
19 |     python=$python_path
20 | done < ../PythonPath.txt
21 | 
22 | nohup $python ./BaseEdit_input_converter.py $user $project > ./Output/${user}/${project}/Log/Converter_log.txt 2>&1 &
23 | 


--------------------------------------------------------------------------------
/Indel_searcher_2/Run_flash.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | 
 4 | ####################
 5 | ## User parameter ##
 6 | ####################################
 7 | 
 8 | user=SH
 9 | project=p53_screening
10 | flash=FLASH-1.2.11-Linux-x86_64
11 | thread=4
12 | 
13 | ####################################
14 | 
15 | 
16 | while read python_path;do
17 |     python=$python_path
18 | done < ../PythonPath.txt
19 | 
20 | nohup $python ./Flash_pair_read_merge.py $user $project $flash $thread > ./Output/${user}/${project}/Log/flash_log.txt 2>&1 &
21 | 


--------------------------------------------------------------------------------
/Indel_searcher_2/Run_indel_searcher.py:
--------------------------------------------------------------------------------
  1 | import os, re, sys, math, logging
  2 | 
  3 | import cPickle as pickle
  4 | import subprocess as sp
  5 | 
  6 | from pdb import set_trace
  7 | from datetime import datetime
  8 | from optparse import OptionParser
  9 | 
 10 | sys.path.insert(0, os.path.dirname(os.getcwd()))
 11 | from Core.CoreSystem import InitialFolder, UserFolderAdmin, Helper, RunMulticore, CheckProcessedFiles
 12 | 
 13 | 
 14 | class clsIndelSearcherRunner(UserFolderAdmin):
 15 | 
 16 |     """
 17 |     self.strOutputDir is inherited variable.
 18 | 
 19 |     """
 20 |     def __init__(self, strSample, strRef, options, InstInitFolder):
 21 |         UserFolderAdmin.__init__(self, strSample, strRef, options, InstInitFolder.strLogPath)
 22 |         self.MakeSampleFolder()
 23 | 
 24 |         self.strProjectFile    = InstInitFolder.strProjectFile
 25 |         self.intChunkSize      = options.chunk_number
 26 |         self.strQualCutoff     = options.base_quality
 27 |         self.intInsertionWin   = options.insertion_window  # Insertion window 0,1,2,3,4
 28 |         self.intDeletionWin    = options.deletion_window  # Deletion window 0,1,2,3,4
 29 |         self.strPamType        = options.pam_type  # CRISPR type : Cpf1(2 cleavages), Cas9(1 cleavage)
 30 |         self.strPamPos         = options.pam_pos  # Barcode target position : Forward (barcode + target), Reverse (target + barcode)
 31 |         self.strPickle         = options.pickle
 32 |         self.strClassFASTQ     = options.class_fastq
 33 |         self.strSplit          = options.split
 34 |         self.strLogPath        = InstInitFolder.strLogPath
 35 | 
 36 |         self.strBarcodeFile      = os.path.join(self.strRefDir, 'Barcode.txt')
 37 |         self.strReferenceSeqFile = os.path.join(self.strRefDir, 'Reference_sequence.txt')
 38 |         self.strTargetSeqFile    = os.path.join(self.strRefDir, 'Target_region.txt')
 39 |         self.strRefFile          = os.path.join(self.strRefDir, 'Reference.fa')
 40 | 
 41 |         ## The file name required for the user is 'B'arcode.txt but it may be written as 'b'arcode.txt by mistake.
 42 |         ## This part is to fix the situation as mentioned above.
 43 |         if not os.path.isfile(self.strBarcodeFile):
 44 |             if os.path.isfile(self.strRefDir + 'barcode.txt'):
 45 |                 self.strBarcodeFile = self.strRefDir + 'barcode.txt'
 46 |             else:
 47 |                 logging.error('Barcode path is not correct, please make sure the path correctly.')
 48 |         if not os.path.isfile(self.strReferenceSeqFile):
 49 |             if os.path.isfile(self.strRefDir + 'reference_sequence.txt'):
 50 |                 self.strReferenceSeqFile = self.strRefDir + 'reference_sequence.txt'
 51 |             else:
 52 |                 logging.error('Reference path is not correct, please make sure the path correctly.')
 53 |         if not os.path.isfile(self.strTargetSeqFile):
 54 |             if os.path.isfile(self.strRefDir + 'target_region.txt'):
 55 |                 self.strTargetSeqFile = self.strRefDir + 'target_region.txt'
 56 |             else:
 57 |                 logging.error('Target path is not correct, please make sure the path correctly.')
 58 | 
 59 | 
 60 |         self.strFastqDir = './Input/{user}/FASTQ/{project}'.format(user=self.strUser,
 61 |                                                                      project=self.strProject)
 62 |         ## './Input/JaeWoo/FASTQ/Test_samples/Sample_1'
 63 |         self.strSampleDir  = os.path.join(self.strFastqDir, self.strSample)
 64 | 
 65 |         self.strFastq_name = ''
 66 |         for strFile in os.listdir(self.strSampleDir):
 67 |             if os.path.isfile(self.strSampleDir + '/' + strFile) and strFile.split('.')[-1] == 'fastq':
 68 |                 self.strFastq_name = '.'.join(strFile.split('.')[:-1])
 69 |         logging.info('File name : %s' % self.strFastq_name)
 70 | 
 71 |         ## './Input/JaeWoo/FASTQ/Test_samples/Sample_1/Fastq_file.fastq'
 72 |         self.strInputFile = os.path.join(self.strSampleDir, self.strFastq_name+'.fastq')
 73 |         ## './Input/JaeWoo/FASTQ/Test_samples/Sample_1/Fastq_file.txt'
 74 |         self.strInputList = os.path.join(self.strSampleDir, self.strFastq_name+'.txt')
 75 | 
 76 |         ## './Input/JaeWoo/FASTQ/Test_samples/Sample_1/Split_files'
 77 |         self.strSplitPath = os.path.join(self.strSampleDir, 'Split_files')
 78 |         Helper.MakeFolderIfNot(self.strSplitPath)
 79 | 
 80 |         self.strPair = 'False'  # FASTQ pair: True, False
 81 | 
 82 |     def SplitFile(self):
 83 |         
 84 |         ### Defensive : original fastq wc == split fastq wc
 85 |         #intTotalLines = len(open(self.strInputFile).readlines())
 86 |         intTotalLines = int(sp.check_output('wc -l {input_file}'.format(input_file=self.strInputFile), shell=True).split()[0])
 87 |         intSplitNum   = int(math.ceil(intTotalLines/float(self.intChunkSize)))  ## e.g. 15.4 -> 16
 88 | 
 89 |         if intSplitNum == 0: intSplitNum = 1
 90 |         logging.info('Total lines:%s, Chunk size:%s, Split number:%s'  % (intTotalLines, self.intChunkSize, intSplitNum))
 91 |         
 92 |         with open(self.strInputFile) as fq, \
 93 |             open(self.strInputList, 'w') as OutList:
 94 | 
 95 |             for intNum in range(1, intSplitNum + 1):
 96 | 
 97 |                 strSplitFile = self.strSplitPath + '/{sample}_{num}.fq'.format(sample=os.path.basename(self.strInputFile),
 98 |                                                                                num=intNum)
 99 |                 with open(strSplitFile, 'w') as out:
100 |                     OutList.write(os.path.basename(strSplitFile) + '\n')
101 |                     intCount = 0
102 | 
103 |                     for strRow in fq:
104 |                         intCount += 1
105 |                         out.write(strRow)
106 | 
107 |                         if intCount == self.intChunkSize:
108 |                             break
109 | 
110 |         ## defensive
111 |         #strOriginal   = sp.check_output('wc -l {input_file}'.format(input_file=self.strInputFile), shell=True)
112 |         strSplited    = sp.check_output('cat {splited}/*.fq | wc -l'.format(splited=self.strSplitPath), shell=True)
113 |         #strOrigianlWc = strOriginal.split()[0]
114 |         intSplitedWc  = int(strSplited.replace('\n',''))
115 | 
116 |         if intTotalLines != intSplitedWc:
117 |             logging.error('The number of total lines of splited file is not corresponded to origial fastq.')
118 |             logging.error('Original FASTQ line number : %s, Splited FASTQ line number : %s' % (intTotalLines, strSplited))
119 |             sys.exit(1)
120 | 
121 |     def MakeReference(self):
122 | 
123 |         if not os.path.isfile(self.strRefFile):
124 |             with open(self.strBarcodeFile) as Barcode, \
125 |                 open(self.strTargetSeqFile) as Target, \
126 |                 open(self.strReferenceSeqFile) as Ref, \
127 |                 open(self.strRefFile, 'w') as Output:
128 | 
129 |                 listBarcode = Helper.RemoveNullAndBadKeyword(Barcode)
130 |                 listTarget  = Helper.RemoveNullAndBadKeyword(Target)
131 |                 listRef     = Helper.RemoveNullAndBadKeyword(Ref)
132 | 
133 |                 ## defensive
134 |                 assert len(listBarcode) == len(listTarget) == len(listRef), 'Barcode, Target and Reference must be a same row number.'
135 | 
136 |                 listName = []
137 |                 for strBar, strTar in zip(listBarcode, listTarget):
138 |                     strBar = strBar.replace('\n', '').replace('\r', '').strip().upper()
139 |                     strTar = strTar.replace('\n', '').replace('\r', '').strip().upper()
140 | 
141 |                     Helper.CheckIntegrity(self.strBarcodeFile, strBar) ## defensive
142 |                     Helper.CheckIntegrity(self.strBarcodeFile, strTar) ## defensive
143 | 
144 |                     listName.append(strBar + ':' + strTar + '\n')
145 |                 
146 |                 for i, strRow in enumerate(listRef):
147 |                     strRow = strRow.replace('\r', '').strip().upper()
148 |                     Output.write('>' + listName[i] + strRow + '\n')
149 | 
150 |     def MakeIndelSearcherCmd(self):
151 | 
152 |         listCmd    = []
153 |         strReverse = 'None'
154 | 
155 |         with open(self.strInputList) as Input:
156 |             for strFile in Input:
157 |                 listFile   = strFile.replace('\n', '').split(' ')
158 |                 strForward = self.strSplitPath + '/' + listFile[0]
159 | 
160 |                 #if self.strPair == 'True':
161 |                 #    strReverse = self.strSplitPath + '/' + listFile[1]
162 | 
163 |                 listCmd.append(('{python} Indel_searcher_crispresso_hash.py {forw} {reve} {ref} {pair} {GapO} {GapE}' 
164 |                              ' {Insertion_win} {Deletion_win} {PAM_type} {PAM_pos} {Qual} {outdir} {logpath}').format(
165 |                         python=self.strPython,
166 |                         forw=strForward, reve=strReverse, ref=self.strRefFile, pair=self.strPair,
167 |                         GapO=self.strGapOpen, GapE=self.strGapExtend,
168 |                         Insertion_win=self.intInsertionWin, Deletion_win=self.intDeletionWin,
169 |                         PAM_type=self.strPamType, PAM_pos=self.strPamPos, Qual=self.strQualCutoff,
170 |                         outdir=self.strOutSampleDir, logpath=self.strLogPath))
171 |         return listCmd
172 | 
173 |     def RunIndelFreqCalculator(self):
174 |         sp.call('{python} Indel_frequency_calculator.py {outdir} {sample} {logpath}'.format(python=self.strPython,
175 |                                                                                             outdir=self.strOutSampleDir,
176 |                                                                                             sample=self.strSample,
177 |                                                                                             logpath=self.strLogPath), shell=True)
178 |         sp.call('{python} Summary_all_trim.py {outdir} {sample} {logpath}'.format(python=self.strPython,
179 |                                                                                   outdir=self.strOutSampleDir,
180 |                                                                                   sample=self.strSample,
181 |                                                                                   logpath=self.strLogPath), shell=True)
182 |         sp.call('cp $(find ./Output/{user}/{project} -name "*.tsv") ./Output/{user}/{project}/All_results'.format(user=self.strUser,
183 |                                                                                                                   project=self.strProject), shell=True)
184 | 
185 |     def IndelNormalization(self):
186 | 
187 |         sp.call('{python} Indel_normalization.py {project_file} {user} {project}'.format(python=self.strPython,
188 |                                                                                          project_file=self.strProjectFile,
189 |                                                                                          user=self.strUser,
190 |                                                                                          project=self.strProject), shell=True)
191 | 
192 |     def MakeOutput(self):
193 |         """
194 |         dictResult
195 |         {'TTTGTAGTCATACATCGCAATGTCAA': [0, 0, 0, 0, [], [], [], [], []]}
196 |         dictResultIndelFreq
197 |         {'TTTGCTCAGTCACACGTCACGAGCTG': [['TCATCGACTTGCAGGACATTAGGCGAAGCTTGGCGTAACTAGATCTCTACTCTACCACTTGTACTTCAGCGGTCAGCTTACTC',
198 |         ['TCATCGACTTGCAGGACGAAGCTTGGCGAAGCTTGGCGTAACTAGATCTCTACTCTACCACTTGTACTTCAGCGGTCAGCTTACTCGACTTAATA'], '19M3I', 1.0,
199 |         'TCATCGACTTGCAGGACATTAGGCGA', ['TCATCGACTTGCAGGACAT---TAGGCGAAGCTTGGCGTAACTAGATCTCTACTCTACCACTTGTACTTCAGCGGTCAGCTTACTC---------'], ['TCATCGACTTGCAGGACGAAGCTTGGCGAAGCTTGGCGTAACTAGATCTCTACTCTACCACTTGTACTTCAGCGGTCAGCTTACTCGACTTAATA']]]}
200 |         strBarcodePamPos
201 |         Foward
202 |         """
203 |         # index name, constant variable.
204 |         intTotal      = 0
205 |         intNumIns     = 1
206 |         intNumDel     = 2
207 |         intNumCom     = 3
208 |         intTotalFastq = 4
209 |         intInsFastq   = 5
210 |         intDelFastq   = 6
211 |         intComFastq   = 7
212 |         intIndelInfo  = 8
213 | 
214 |         with open('{outdir}/Tmp/{sample}_Summary.txt'.format(outdir=self.strOutSampleDir, sample=self.strSample), 'w') as Summary, \
215 |             open('{outdir}/Tmp/{sample}_Classified_Indel_barcode.fastq'.format(outdir=self.strOutSampleDir, sample=self.strSample), 'w') as FastqOut, \
216 |             open('{outdir}/Tmp/{sample}_Indel_freq.txt'.format(outdir=self.strOutSampleDir, sample=self.strSample), 'w') as FreqOut:
217 | 
218 |             for binPickle in os.listdir('{outdir}/Tmp/Pickle'.format(outdir=self.strOutSampleDir)):
219 |                 with open('{outdir}/Tmp/Pickle/{pickle}'.format(outdir=self.strOutSampleDir, pickle=binPickle), 'rb') as PickleResult:
220 | 
221 |                     dictPickleResult    = pickle.load(PickleResult)
222 |                     dictResult          = dictPickleResult['dictResult']
223 |                     dictResultIndelFreq = dictPickleResult['dictResultIndelFreq']
224 |                     strBarcodePamPos    = dictPickleResult['strBarcodePamPos']
225 | 
226 |                     for strBarcode, listValue in dictResult.items():
227 |                         if strBarcodePamPos == 'Reverse':
228 |                             strBarcode = strBarcode[::-1]
229 | 
230 |                         Summary.write("{Bar}\t{NumTot}\t{NumIns}\t{NumDel}\t{NumCom}\n".format(
231 |                             Bar=strBarcode, NumTot=listValue[intTotal], NumIns=listValue[intNumIns], NumDel=listValue[intNumDel], NumCom=listValue[intNumCom]))
232 | 
233 |                         if self.strClassFASTQ == 'True':
234 |                             for strJudge, intFastqKind in [('total', intTotalFastq), ('insertion', intInsFastq), ('deletion', intDelFastq), ('complex', intComFastq)]:
235 |                                 for listFastq in listValue[intFastqKind]: ## category
236 |                                     listFastqAddClass = [listFastq[0]+':Barcode_%s:%s' % (strBarcode, strJudge)]
237 |                                     FastqOut.write('\n'.join(listFastqAddClass + listFastq[1:]) + '\n')
238 | 
239 |                     for strBarcode in dictResultIndelFreq: # dictResultIndelFreq [sRef_seq, lQuery, float(iFreq)/iTotal, sTarget_region]
240 | 
241 |                         if strBarcodePamPos == 'Reverse':
242 |                             strBarcode = strBarcode[::-1]
243 | 
244 |                         for strRefSeq, listQuery, strINDEL, floFreq, strTargetRegion, listRefNeedle, listQueryNeedle in sorted(dictResultIndelFreq[strBarcode], key=lambda x: x[3], reverse=True):
245 |                             for strQuery, strRefNeedle, strQueryNeedle in zip(listQuery, listRefNeedle, listQueryNeedle):
246 | 
247 |                                 if strBarcodePamPos == 'Reverse':
248 |                                     strQuery       = strQuery[::-1]
249 |                                     strRefNeedle   = strRefNeedle[::-1]
250 |                                     strQueryNeedle = strQueryNeedle[::-1]
251 | 
252 |                                 FreqOut.write('\t'.join([strBarcode, strQuery, strINDEL, str(round(floFreq, 4)), strRefNeedle, strQueryNeedle])+'\n')
253 |                     #END:for
254 |                 #END:with
255 |             #END:for
256 | 
257 |             if self.strPickle == 'False':
258 |                 logging.info('Delete tmp pickles')
259 |                 sp.call('rm {outdir}/Tmp/Pickle/*.pickle'.format(outdir=self.strOutSampleDir), shell=True)
260 | 
261 |             elif self.strSplit == 'False':
262 |                 logging.info('Delete splited input files')
263 |                 sp.call('rm {split_path}/*.fq'.format(split_path=self.strSplitPath), shell=True)
264 | 
265 |         #END:with
266 |     #END:def
267 | #END:cls
268 | 
269 | 
270 | def Main():
271 |     parser = OptionParser('Indel search program for CRISPR CAS9 & CPF1\n<All default option> python2.7 Run_indel_searcher.py --pam_type Cas9 --pam_pos Forward')
272 | 
273 |     parser.add_option('-t', '--thread', default='1', type='int', dest='multicore', help='multiprocessing number, recommendation:t<16')
274 |     parser.add_option('-c', '--chunk_number', default='400000', type='int', dest='chunk_number',
275 |                       help='split FASTQ, must be multiples of 4. file size < 1G recommendation:40000, size > 1G recommendation:400000')
276 |     parser.add_option('-q', '--base_quality', default='20', dest='base_quality', help='NGS read base quality')
277 |     parser.add_option('--gap_open', default='-10', type='float', dest='gap_open', help='gap open: -100~0')
278 |     parser.add_option('--gap_extend', default='1', type='float', dest='gap_extend', help='gap extend: 1~100')
279 |     parser.add_option('-i', '--insertion_window', default='4', type='int', dest='insertion_window', help='a window size for insertions')
280 |     parser.add_option('-d', '--deletion_window', default='4', type='int', dest='deletion_window', help='a window size for deletions')
281 |     parser.add_option('--pam_type', dest='pam_type', help='PAM type: Cas9 Cpf1')
282 |     parser.add_option('--pam_pos', dest='pam_pos', help='PAM position: Forward Reverse')
283 |     parser.add_option('--python', dest='python', help='The python path including the CRISPResso2')
284 |     parser.add_option('--user', dest='user_name', help='The user name with no space')
285 |     parser.add_option('--project', dest='project_name', help='The project name with no space')
286 |     parser.add_option('--pickle', dest='pickle', default='False', help='Dont remove the pickles in the tmp folder : True, False')
287 |     parser.add_option('--split', dest='split', default='False', help='Dont remove the split files in the input folder : True, False')
288 |     parser.add_option('--classfied_FASTQ', dest='class_fastq', default='True', help='Dont remove the ClassfiedFASTQ in the tmp folder : True, False')
289 |     parser.add_option('--ednafull', dest='ednafull', help='The nucleotide alignment matrix')
290 | 
291 |     options, args = parser.parse_args()
292 | 
293 |     InstInitFolder = InitialFolder(options.user_name, options.project_name, os.path.basename(__file__))
294 |     InstInitFolder.MakeDefaultFolder()
295 |     InstInitFolder.MakeInputFolder()
296 |     InstInitFolder.MakeOutputFolder()
297 | 
298 |     logging.basicConfig(format='%(process)d %(levelname)s %(asctime)s : %(message)s',
299 |                         level=logging.DEBUG,
300 |                         filename=InstInitFolder.strLogPath,
301 |                         filemode='a')
302 |     logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
303 | 
304 |     logging.info('Program start')
305 |     if options.multicore > 15:
306 |         logging.warning('Optimal treads <= 15')
307 |     logging.info(str(options))
308 | 
309 |     with open(InstInitFolder.strProjectFile) as Sample_list:
310 | 
311 |         listSamples        = Helper.RemoveNullAndBadKeyword(Sample_list)
312 |         intProjectNumInTxt = len(listSamples)
313 | 
314 |         strInputProject = './Input/{user}/FASTQ/{project}'.format(user=options.user_name, project=options.project_name)
315 | 
316 |         @CheckProcessedFiles
317 |         def RunPipeline(**kwargs):
318 | 
319 |             setGroup = set()
320 |             for strSample in listSamples:
321 | 
322 |                 tupSampleInfo = Helper.SplitSampleInfo(strSample)
323 |                 if not tupSampleInfo: continue
324 |                 strSample, strRef, strExpCtrl = tupSampleInfo
325 |                 setGroup.add(strExpCtrl)
326 | 
327 |                 InstRunner = clsIndelSearcherRunner(strSample, strRef, options, InstInitFolder)
328 |                 #"""
329 |                 logging.info('SplitFile')
330 |                 InstRunner.SplitFile()
331 |                 logging.info('MakeReference')
332 |                 InstRunner.MakeReference()
333 |                 logging.info('MakeIndelSearcherCmd')
334 |                 listCmd = InstRunner.MakeIndelSearcherCmd()
335 |                 logging.info('RunMulticore')
336 |                 RunMulticore(listCmd, options.multicore)  ## from CoreSystem.py
337 |                 logging.info('MakeOutput')
338 |                 InstRunner.MakeOutput()
339 |                 logging.info('RunIndelFreqCalculator')
340 |                 InstRunner.RunIndelFreqCalculator()
341 |                 #"""
342 | 
343 |             if setGroup == {'EXP', 'CTRL'}:
344 |                 InstRunner.IndelNormalization()
345 |             elif setGroup in [set(), set([]), set(['']), set([' '])]:
346 |                 pass
347 |             else:
348 |                 logging.error('The group category is not appropriate. : %s' % setGroup)
349 |                 logging.error('Please make sure your project file is correct.')
350 |                 logging.error('The group category must be Exp or Ctrl')
351 |                 raise Exception
352 |             #"""
353 | 
354 |         RunPipeline(InstInitFolder=InstInitFolder,
355 |                     strInputProject=strInputProject,
356 |                     intProjectNumInTxt=intProjectNumInTxt,
357 |                     listSamples=listSamples,
358 |                     logging=logging)
359 | 
360 |     logging.info('Program end')
361 | #END:def
362 | 
363 | 
## Run the pipeline only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    Main()
366 | 


--------------------------------------------------------------------------------
/Indel_searcher_2/Run_random_barcode.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | 
 4 | ####################
 5 | ## User parameter ##
 6 | ####################################
 7 | 
 8 | user=SH
 9 | project=p53_screening
10 | thread=2
11 | 
12 | 
13 | ####################################
14 | 
15 | 
16 | 
17 | 
18 | 
19 | while read python_path;do
20 |     python=$python_path
21 | done < ../PythonPath.txt
22 | 
23 | 
24 | nohup $python ./Summary_Random_barcode.py -u $user -p $project -t $thread > ./Output/${user}/${project}/Log/Random_barcode_log.txt 2>&1 &
25 | 


--------------------------------------------------------------------------------
/Indel_searcher_2/Summary_Random_barcode.py:
--------------------------------------------------------------------------------
  1 | import os, sys
  2 | import logging
  3 | import multiprocessing as mp
  4 | 
  5 | from argparse import ArgumentParser
  6 | from collections import OrderedDict
  7 | 
  8 | sys.path.insert(0, os.path.dirname(os.getcwd()))
  9 | from Core.CoreSystem import  Helper
 10 | 
 11 | 
 12 | class clsParameters():
 13 | 
 14 |     def __init__(self, options):
 15 |         self.strUser    = options.user_name
 16 |         self.strProject = options.project_name.replace('.txt', '') ## A user can be confused the input. So I prevented from it using 'replace'.
 17 |         self.strGroup   = options.group
 18 |         self.intCore    = options.thread
 19 | 
 20 |         self.strSampleList = 'User/{user}/{project}.txt'.format(user=options.user_name, project=options.project_name)
 21 | 
 22 | 
 23 | def SummaryRandomBarcode(sFile_path):
 24 | 
 25 |     """
 26 |     /Tmp
 27 |     190819_Nahye_24k_2_D0_2-24kLib_Classified_Indel_barcode.fastq* -> process target
 28 |     190819_Nahye_24k_2_D0_2-24kLib_Indel_freq.txt*
 29 |     190819_Nahye_24k_2_D0_2-24kLib_Indel_summary.txt*
 30 |     190819_Nahye_24k_2_D0_2-24kLib_Summary.txt*
 31 |     Pickle
 32 | 
 33 |     dBarcode_cnt = {'ACGTACTC_sorting_barcode': {'ACATACAC_random': 5, 'CGTGTTGA_random': 3, ...}
 34 |     """
 35 |     dictBarcodeCnt = {}
 36 |     strClassCheck  = ''
 37 | 
 38 |     strSample = sFile_path.split('/')[-1]
 39 |     logging.info('Summary_random_barcode start : %s, %s' % (sFile_path, strSample))
 40 | 
 41 |     for sFile in os.listdir(sFile_path+'/Tmp/'):
 42 |         if '.fastq' in sFile:
 43 |             with open(sFile_path+'/Tmp/'+sFile) as Input:
 44 |                 for i, strRow in enumerate(Input):
 45 | 
 46 |                     # @D00235:683:CE1P6ANXX:6:1114:2135:5231 1:N:0:CTGAAGCT+CCTATCCT:Barcode_TTTGCTATCTCGACGTATGGACAGTG:total
 47 |                     if i % 4 == 0:
 48 |                         listBarClass = strRow.replace('\n','').split('Barcode_')[1].split(':')
 49 |                         strBarcode   = listBarClass[0]
 50 |                         strClass     = listBarClass[1]
 51 | 
 52 |                         if strClass == 'total':
 53 |                             strClassCheck = 'total'
 54 | 
 55 |                     if i % 4 == 1 and strClassCheck == 'total':
 56 |                         strRow = strRow.replace('\n','').upper()
 57 |                         intBarcodeStart   = strRow.find(strBarcode)
 58 |                         strRandom_barcode = strRow[intBarcodeStart-8:intBarcodeStart]
 59 | 
 60 |                         try:
 61 |                             _ = dictBarcodeCnt[strBarcode]
 62 |                         except KeyError:
 63 |                             dictBarcodeCnt[strBarcode] = {}
 64 |                         try:
 65 |                             dictBarcodeCnt[strBarcode][strRandom_barcode] += 1
 66 |                         except KeyError:
 67 |                             dictBarcodeCnt[strBarcode][strRandom_barcode] = 1
 68 |                         #print(sBarcode, sRandom_barcode, iBarcode_start, sRow)
 69 | 
 70 |                         strClassCheck = ''
 71 | 
 72 |     if not os.path.isdir(sFile_path + '/Summary_Random_barcode'): os.mkdir(sFile_path + '/Summary_Random_barcode')
 73 |     with open(sFile_path + '/Summary_Random_barcode/%s_all_random_barcode.txt' % strSample, 'w') as All_random,\
 74 |         open(sFile_path + '/Summary_Random_barcode/%s_Unique_RandomBarcodeNumber_In_SortingBarcode.txt' % strSample, 'w') as Random_sorting:
 75 | 
 76 |         All_random.write('Sorting_barcode\tUnique_RandomBarcodeNumber_In_SortingBarcode\tRandomBarcode\tEach_RandomBarcode_read_count\n')
 77 |         Random_sorting.write('Sorting_barcode\tUnique_RandomBarcodeNumber_In_SortingBarcode\n')
 78 | 
 79 |         for sBarcode, dRandom_barcode_cnt in dictBarcodeCnt.items():
 80 |             iRandom_barcode_num = len(dRandom_barcode_cnt.keys())
 81 |             Random_sorting.write('\t'.join(map(str, [sBarcode, iRandom_barcode_num]))+'\n')
 82 | 
 83 |             for sRandom_barcode, iCnt in dRandom_barcode_cnt.items():
 84 |                 All_random.write('\t'.join(map(str, [sBarcode, iRandom_barcode_num, sRandom_barcode, iCnt]))+'\n')
 85 | 
 86 |     logging.info('Summary_random_barcode end: %s' % sFile_path)
 87 | 
 88 | ## on going
 89 | def CountGroup(InstParameters):
 90 |     """
 91 |     Sorting_barcode Unique_RandomBarcodeNumber_In_SortingBarcode    RandomBarcode   Each_RandomBarcode_read_count
 92 |     TATATCATAGCGTACTCATC    8       TGCGTTTG        3
 93 |     TATATCATAGCGTACTCATC    8       CGCGTTTG        3
 94 |     TATATCATAGCGTACTCATC    8       TAGTTTTG        1
 95 |     TATATCATAGCGTACTCATC    8       ATAGTTTG        1
 96 |     """
 97 | 
 98 |     sHeader = ''
 99 | 
100 |     with open(InstParameters.strSampleList) as Sample: ## tmp input
101 | 
102 |         listSample = Sample.readlines()
103 | 
104 |         setGroup = set([strRow.replace('\n', '').split('\t')[2].upper() for strRow in listSample])
105 | 
106 |         for strGroup in setGroup:
107 |             if strGroup == 'CTRL': continue
108 | 
109 |             for strRow in listSample:
110 |                 if strGroup == strGroupOfSample:  ## matched group names -> Sum the counts
111 |                     listCol          = strRow.replace('\n', '').split('\t')
112 |                     strSample        = listCol[0]
113 |                     strRef           = listCol[1]
114 |                     strGroupOfSample = listCol[2]
115 | 
116 |                     strProjectDir = './Output/{user}/{project}'.format(user=InstParameters.strUser,
117 |                                                                        project=InstParameters.strProject)
118 |                     strGroupDir = os.path.join(strProjectDir, 'Group_result')
119 |                     Helper.MakeFolderIfNot(strGroupDir)
120 | 
121 |                     dTotal_RandomBarcode_cnt_in_SortingBarcode = OrderedDict() ## ('GECKO_6367_GATCTGCTC', ['GECKO_6367', 'GATCTGCTC', 2, 156, '0.0128']),
122 |                                                                                ## Unique key, only one list.
123 | 
124 |                     with open('{project_dir}/{sample}_all_random_barcode.txt'.format(project_dir=strProjectDir,
125 |                                                                                      sample=strSample)) as RandomBarcode_SeqFreq:
126 |                         sHeader = RandomBarcode_SeqFreq.readline()
127 | 
128 |                         for sRow in RandomBarcode_SeqFreq:
129 |                             lCol = sRow.replace('\n', '').split('\t')
130 | 
131 |                             sSortingBarcode                             = lCol[0]
132 |                             #iTotal_RandomBarcode_cnt_in_SortingBarcode  = int(lCol[1])
133 |                             sSorting_and_Random_barcode_seq             = lCol[0] + '_' + lCol[2]  ## Unique name : Doench2014_1000_CTCTGGGGT
134 |                             iRandomBarcode_count                        = int(lCol[3])
135 | 
136 |                             lCol[3] = iRandomBarcode_count
137 | 
138 |                             try:
139 |                                 _ = dTotal_RandomBarcode_cnt_in_SortingBarcode[sSorting_and_Random_barcode_seq]
140 | 
141 |                                 dTotal_RandomBarcode_cnt_in_SortingBarcode[sSorting_and_Random_barcode_seq][3] += iRandomBarcode_count
142 | 
143 |                             except KeyError:
144 |                                 dTotal_RandomBarcode_cnt_in_SortingBarcode[sSorting_and_Random_barcode_seq] = lCol  ## initial assignment
145 |                     #END for
146 |                     dRecal_total_kind_of_RandomBarcode = OrderedDict()
147 |                     for sSort_Rand_seq in dTotal_RandomBarcode_cnt_in_SortingBarcode:  ## sSorting_and_Random_barcode_seq
148 |                         sSortBarcode = sSort_Rand_seq.split('_')[0]
149 |                         try:
150 |                             dRecal_total_kind_of_RandomBarcode[sSortBarcode].append(dTotal_RandomBarcode_cnt_in_SortingBarcode[sSort_Rand_seq])
151 |                         except KeyError:
152 |                             dRecal_total_kind_of_RandomBarcode[sSortBarcode] = [dTotal_RandomBarcode_cnt_in_SortingBarcode[sSort_Rand_seq]]
153 | 
154 |                     for sKey, llValue in dRecal_total_kind_of_RandomBarcode.items():
155 |                         ## sKey: TATATCATAGCGTACTCATC, llValue : [[TATATCATAGCGTACTCATC, 8, TGCGTTTG, 3],[],[] ...
156 |                         iKind_of_RandomBarcode = len(llValue)  ################## why do I make like this ?????
157 |                         for lValue in llValue:
158 |                             lValue[1] = iKind_of_RandomBarcode ## Recal using group total cnt.
159 | 
160 |                         llValue = sorted(llValue, key=lambda x:x[3], reverse=True)
161 |                         dRecal_total_kind_of_RandomBarcode[sKey] = llValue
162 | 
163 |                     strEachGroup = './Output/Group_result/%s' % strGroup
164 |                     Helper.MakeFolderIfNot(strEachGroup)
165 | 
166 |                     with open(os.path.join(strEachGroup, 'Summary_all_random_barcode_in_group.txt'), 'w') as Sort_Random_cnt,\
167 |                         open(os.path.join(strEachGroup, 'Summary_Unique_RandomBarcodeNumber_in_group.txt'), 'w') as Uniq_random_cnt:
168 | 
169 |                         Sort_Random_cnt.write(sHeader)
170 |                         Uniq_random_cnt.write('Sorting_barcode\tUnique_RandomBarcodeNumber_In_SortingBarcode\n')
171 | 
172 |                         for sSortBarcode, llCol in dRecal_total_kind_of_RandomBarcode.items():
173 |                             Uniq_random_cnt.write('\t'.join(map(str, [sSortBarcode, len(llCol)]))+'\n')
174 |                             for lCol in llCol:
175 |                                 Sort_Random_cnt.write('\t'.join(map(str, lCol))+'\n')
176 |         #END: for
177 |     #END: with
178 | 
179 | 
def Main():
    """
    Parse the command line and summarise the random barcodes of every sample in the project.

    The sample list 'User/<user>/<project>.txt' is read; lines starting with '#', a blank or a
    line break are skipped, and the first tab-separated column is taken as the sample name.
    SummaryRandomBarcode is then run on './Output/<user>/<project>/<sample>' for each sample
    in a multiprocessing pool of --thread workers.
    """

    ## Without a configured handler the root logger drops every INFO record.
    ## The records go to stderr, which Run_random_barcode.sh redirects into the log file.
    logging.basicConfig(format='%(process)d %(levelname)s %(asctime)s : %(message)s',
                        level=logging.INFO)

    logging.info('Program Start')
    logging.info('Make commands for a multiple processing')

    parser = ArgumentParser(description='Script for counting the random barcodes')

    ## -u and -p are required: the paths below cannot be built without them.
    parser.add_argument('-u', '--user_name', type=str, dest='user_name', required=True, help='The user name in the /user subdir')
    parser.add_argument('-p', '--project_name', type=str, dest='project_name', required=True, help='The project name in the /user/user_name/ subdir')
    parser.add_argument('-g', '--group',  type=str, dest='group', default='false', help='The group sum run of the barcodes, default: false')
    parser.add_argument('-t', '--thread', type=int, dest='thread', default=15, help='The multicore number 1~15')
    options = parser.parse_args()

    InstParameters = clsParameters(options)

    lPara = []

    with open(InstParameters.strSampleList) as SampleList:

        for strSample in SampleList:
            if strSample[0] in ('#', ' ', '\r', '\n'): continue
            strSample  = strSample.replace('\n', '').replace('\r', '').split('\t')[0]
            ## Use the normalised project name, so '-p project.txt' points at the same
            ## output folder as '-p project'.
            sFile_path = './Output/{user}/{project}/{sample}'.format(user=InstParameters.strUser,
                                                                     project=InstParameters.strProject,
                                                                     sample=strSample)
            lPara.append(sFile_path)

    logging.info('Multiple processing Start')
    p = mp.Pool(InstParameters.intCore)
    try:
        ## get() re-raises an exception that happened in a worker.
        p.map_async(SummaryRandomBarcode, lPara).get()
    finally:
        ## Release the worker processes whether or not a worker failed.
        p.close()
        p.join()
    logging.info('Multiple processing End')

    ## The group summation (CountGroup) is still on going and is not run here.

    logging.info('Program End')
221 | 
## Guarded entry point: multiprocessing may import this module again in its worker
## processes, and an unguarded call would start a new run on every import.
if __name__ == '__main__':
    Main()
223 | 


--------------------------------------------------------------------------------
/Indel_searcher_2/Summary_all_trim.py:
--------------------------------------------------------------------------------
 1 | import os, sys, logging
 2 | import pandas as pd
 3 | import subprocess as sp
 4 | from pdb import set_trace
 5 | 
## Positional command-line arguments, read at import time.
## sOutput_dir : folder that holds the 'Tmp' and 'Result' sub-folders used by the functions below.
## strSample   : sample name used as the prefix of the input and output file names.
## strLogPath  : log file that the logging records are appended to.
sOutput_dir = sys.argv[1]
strSample  = sys.argv[2]
strLogPath  = sys.argv[3]

## Append the records to the log file and also echo them to stdout.
logging.basicConfig(format='%(process)d %(levelname)s %(asctime)s : %(message)s',
                    level=logging.DEBUG,
                    filename=strLogPath,
                    filemode='a')
logging.getLogger().addHandler(logging.StreamHandler(sys.stdout))
15 | 
16 | 
def Parsing_summary():
    """
    Sum the indel counts per barcode and write '<outdir>/Result/<sample>_Summary_result.tsv'.

    Input : '<outdir>/Tmp/<sample>_Summary.txt', tab-separated without a header line.
            Its columns are named Barcode, Total, Insertion, Deletion, Complex.
    Output: one row per barcode with the summed columns plus
            Total_indel = Insertion + Deletion + Complex
            IND/TOT     = Total_indel / Total   (0 where the ratio is undefined, i.e. 0 / 0)

    The paths are built from the module-level sOutput_dir and strSample.
    """

    strInput  = '{outdir}/Tmp/{sample}_Summary.txt'.format(sample=strSample, outdir=sOutput_dir)
    strOutput = '{outdir}/Result/{sample}_Summary_result.tsv'.format(sample=strSample, outdir=sOutput_dir)

    dfSummary                = pd.read_table(strInput, header=None)
    dfSummary.columns        = ['Barcode', 'Total', 'Insertion', 'Deletion', 'Complex']
    dfSummary                = dfSummary.groupby(['Barcode']).sum()
    dfSummary['Total_indel'] = dfSummary['Insertion'] + dfSummary['Deletion'] + dfSummary['Complex']
    ## Assign the filled column back instead of calling fillna(inplace=True) on the selected
    ## column: under pandas Copy-on-Write the in-place call works on a temporary object and
    ## leaves the NaN values in the frame.
    dfSummary['IND/TOT']     = (dfSummary['Total_indel'] / dfSummary['Total']).fillna(0)
    dfSummary.to_csv(strOutput, sep='\t')
26 | 
def Annotate_final_result():
    """
    Join the indel annotation with the per-barcode summary and write
    '<outdir>/Result/<sample>_Final_indel_result.tsv'.

    The header-less '<outdir>/Tmp/<sample>_Indel_summary.txt' (first column = barcode) is combined
    column-wise with Total_indel, Total and IND/TOT from '<outdir>/Result/<sample>_Summary_result.tsv'.
    Rows with a missing value are dropped and the numbers are rounded to two decimals.
    The paths are built from the module-level sOutput_dir and strSample.
    """

    strIndelPath   = '{outdir}/Tmp/{sample}_Indel_summary.txt'.format(sample=strSample, outdir=sOutput_dir)
    strSummaryPath = '{outdir}/Result/{sample}_Summary_result.tsv'.format(sample=strSample, outdir=sOutput_dir)
    strFinalPath   = '{outdir}/Result/{sample}_Final_indel_result.tsv'.format(sample=strSample, outdir=sOutput_dir)

    ## Both tables are indexed by the barcode so that concat aligns them row by row.
    dfIndel    = pd.read_table(strIndelPath, header=None).set_index(0)
    dfCounts   = pd.read_table(strSummaryPath, index_col='Barcode').loc[:,['Total_indel', 'Total', 'IND/TOT']]

    dfFinal = pd.concat([dfIndel, dfCounts], axis=1).dropna().reset_index()

    ## Put the barcode and the counts in front of the two annotation columns, then name them.
    dfFinal = dfFinal.loc[:,['index','Total_indel', 'Total', 'IND/TOT', 1,2]]
    dfFinal.columns = ['Barcode', 'Total_indel', 'Total', 'IND/TOT', 'Match','Info']

    dfFinal.round(2).to_csv(strFinalPath, sep='\t', index=False)
40 | 
## Script entry point. Parsing_summary runs first because Annotate_final_result reads the
## '<sample>_Summary_result.tsv' file that Parsing_summary writes.
if __name__ == '__main__':
    logging.info('Make a summary result.')
    Parsing_summary()
    Annotate_final_result()
    logging.info('The summary result has been completed.\n\n')
46 | 


--------------------------------------------------------------------------------
/Indel_searcher_2/User/JaeWoo/JaeWoo_test_samples.txt:
--------------------------------------------------------------------------------
1 | #190819_Nahye_12K_D4_eCas9_Rep1-Cas9D7	Cas9D7
2 | #190819_Nahye_12K_D4_eCas9_Rep2-Cas9D7	Cas9D7
3 | Test_sample	Cas9D7
4 | #190819_Nahye_12K_D4_D0_1-Cas9D7	Cas9D7
5 | #190819_Nahye_12K_D7_2_D0_1-Cas9D7	Cas9D7
6 | #190819_Nahye_12K_D7_D0_1-Cas9D7	Cas9D7	
7 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 Jae Woo Choi
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/PythonPath.txt:
--------------------------------------------------------------------------------
1 | /media/hkim/Pipeline/Indel_searcher_2/miniconda2/bin/python
2 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # CRISPR_toolkit
2 | CRISPR Indel, base edit analysis
3 | 


--------------------------------------------------------------------------------