├── Modules
│   ├── BLAST.py
│   ├── __init__.py
│   ├── GATK.pyc
│   ├── HTseq.pyc
│   ├── Homer.pyc
│   ├── Lumpy.pyc
│   ├── Picard.pyc
│   ├── Aligner.pyc
│   ├── CNVnator.pyc
│   ├── GeneMark.pyc
│   ├── PBhoney.pyc
│   ├── SVTyper.pyc
│   ├── Samtools.pyc
│   ├── Sniffles.pyc
│   ├── StringTie.pyc
│   ├── __init__.pyc
│   ├── Trimmomatic.pyc
│   ├── f01_file_process.pyc
│   ├── __pycache__
│   │   ├── HTseq.cpython-34.pyc
│   │   ├── Aligner.cpython-34.pyc
│   │   ├── Samtools.cpython-34.pyc
│   │   ├── __init__.cpython-34.pyc
│   │   ├── __init__.cpython-35.pyc
│   │   ├── Trimmomatic.cpython-34.pyc
│   │   ├── f01_file_process.cpython-34.pyc
│   │   └── f01_file_process.cpython-35.pyc
│   ├── Sniffles.py
│   ├── SVTyper.py
│   ├── GeneMark.py
│   ├── Lumpy.py
│   ├── StringTie.py
│   ├── HTseq.py
│   ├── Samtools.py
│   ├── CNVnator.py
│   ├── Picard.py
│   ├── PBhoney.py
│   ├── Trimmomatic.py
│   ├── f02_parse_gff.py
│   ├── Homer.py
│   ├── Aligner.py
│   ├── f01_file_process.py
│   └── GATK.py
├── Parameters
│   ├── SV_Pacbio_Sniffle.yaml
│   ├── SV_Pacbio_PBHoney.yaml
│   ├── CNVnator.yaml
│   ├── StringTie_quant.yaml
│   ├── RibosomeProfiling.yaml
│   ├── GRO_Seq_Cap.yaml
│   ├── STAR_get_bam.yaml
│   ├── SV_Illumina_lumpy.yaml
│   ├── GATK_RNA_CHO.yaml
│   └── GATK_DNA_CHO.yaml
├── README.md
├── SV_Pacbio_Sniffle.py
├── mapped_BAM_to_fastq.ipynb
├── STAR_get_bam.py
├── SV_Pacbio_PBHoney.py
├── Salmon_quant.py
├── StringTie_quant.py
├── CNV_CNVnator.py
├── SV_Illumina_Lumpy.py
├── RNAseq_count.py
├── RibosomeProfiling.py
├── RNAseq_STARpipeline.sh
├── GRO_Seq_Cap.py
├── VCF_snpEff_annotation.py
├── GATK_RNA_CHO.py
├── GATK_DNA_CHO.py
├── Eukaryote_genome_annotation.py
└── Genome_Annotation.py

--------------------------------------------------------------------------------
/Modules/BLAST.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/Modules/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/Parameters/SV_Pacbio_Sniffle.yaml:
--------------------------------------------------------------------------------
contact: 'user@gmail.com' # can be email address or phone_number@txt.att.net
RawDataPath: '/data/shangzhong/Pacbio/sniffle'
thread: 16
# database parameters
ref_fa: '/data/genome/hamster/picr/picr.fa'
bwa_db: '/data/genome/hamster/new_pacbio_assemble/bwaDb'
aligner: 'ngmlr'

--------------------------------------------------------------------------------
/Modules/Sniffles.py:
--------------------------------------------------------------------------------
import sarge,sys

def sniffle(bam,outVCF,otherParameters=['']):
    """run Sniffles to detect structural variants from PacBio alignments"""
    cmd = ('sniffles -m {bam} -v {outVCF} ').format(bam=bam,outVCF=outVCF)
    if otherParameters != ['']:
        cmd = cmd + ' '.join(otherParameters)
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)

--------------------------------------------------------------------------------
/Parameters/SV_Pacbio_PBHoney.yaml:
--------------------------------------------------------------------------------
contact: 'user@gmail.com' # can be email address or phone_number@txt.att.net
RawDataPath: '/data/shangzhong/Pacbio/fa'
thread: 24
# database parameters
ref_fa: '/data/genome/hamster/multi_pacbio_assemble/picr.fa'
sa_index: '/data/genome/hamster/multi_pacbio_assemble/picr.fa.sa'
# tool specific parameters
blasr_jobs_per_batch: 2
sam_sort_jobs_per_batch: 6
--------------------------------------------------------------------------------
/Parameters/CNVnator.yaml:
--------------------------------------------------------------------------------
contact: 'user@gmail.com' # can be email address or phone_number@txt.att.net
RawDataPath: '/data/shangzhong/Pacbio/CHOS_illu_DNA/cnv'
thread: 12
# database parameters
ref_fa: '/data/genome/hamster/ncbi_refseq/hamster.fa'
# tool specific parameters
trim_reads: False
trimmomatic_path: '/home/shangzhong/Installation/Trimmomatic-0.32/Trimmomatic-0.33/trimmomatic-0.33.jar'
trim_jobs_per_batch: 6
adapter: ''

bwa_jobs_per_batch: 2
bwa_Db: '/data/genome/hamster/ncbi_refseq/bwa_Db' # should be a folder

bin_win: 100
chrom: ['-chrom NW_006887432.1']
--------------------------------------------------------------------------------
/Parameters/StringTie_quant.yaml:
--------------------------------------------------------------------------------
contact: 'user@gmail.com' # can be email address or phone_number@txt.att.net
RawDataPath: '/data/shangzhong/Proteogenomics/fq'
thread: 12
QC: False
# database parameters
ref_fa: '/data/genome/hamster/picr/picr.fa'
gff: '/data/genome/hamster/picr/updated_final.gff3'
# tool specific parameters
trim_reads: False
trimmomatic_path: '/home/shangzhong/Installation/Trimmomatic-0.32/Trimmomatic-0.33/trimmomatic-0.33.jar'
trim_jobs_per_batch: 6
adapter: ''

star_jobs_per_batch: 1
STAR_index_path: '/data/genome/hamster/picr/picr_STAR_Db'
--------------------------------------------------------------------------------
/Modules/SVTyper.py:
--------------------------------------------------------------------------------
import sarge,sys
def svtyper(in_vcf,out_vcf,bam):
    '''
    run svtyper to add genotypes to an SV vcf
    '''
    # 1. generate the library json file and index the bam
    json = bam[:-3] + 'json'
    cmd = ('svtyper -B {bam} -l {j} && samtools index {bam}').format(bam=bam,j=json)
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)
    # 2. plot the library statistics
    sarge.run('lib_stats.R {j} {j}.pdf'.format(j=json))
    # 3. run svtyper on the input vcf
    cmd = ('svtyper -B {bam} -i {invcf} -l {j} -o {out}').format(
        bam=bam,invcf=in_vcf,j=json,out=out_vcf)
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)
--------------------------------------------------------------------------------
/Parameters/RibosomeProfiling.yaml:
--------------------------------------------------------------------------------
contact: 'user@gmail.com' # can be email address or phone_number@txt.att.net
RawDataPath: '/data/shangzhong/DE/tissue'
thread: 12
QC: False
# database parameters
rRNA_fa: '/data/shangzhong/DE/tissue/rtRNA.fa'
ref_fa: '/data/genome/hamster/picr/picr.fa'
gff: '/data/genome/hamster/picr/updated_final.gff3'
# tool specific parameters
trim_reads: False
trim_jobs_per_batch: 6
adapter: ''

hisat2_jobs_per_batch: 2
hisat2_rrna_index: '/data/shangzhong/DE/tissue/rRNA_Db' # should be a path
hisat2_target_index: '/data/genome/hamster/picr/hisat2_rRNA_Db'

other: ['']
--------------------------------------------------------------------------------
/Modules/GeneMark.py:
--------------------------------------------------------------------------------
import pandas as pd
import sarge,sys
def geneMark_ES(ref_fa,other_params=['']):
    '''run GeneMark-ES'''
    cmd = ('gmes_petap.pl --ES {other} --sequence {fa}').format(fa=ref_fa,
                    other=' '.join(other_params))
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)
    sarge.run('genemark_gtf2gff3 genemark.gtf > genemark.gff') # this script is distributed with MAKER
    df = pd.read_csv('genemark.gff',sep='\t',comment='#',header=None)
    df[0] = df[0].map(lambda x: x.split(' ')[0])
    df.to_csv('genemark.gff3',sep='\t',index=False,header=None)
    #return os.getcwd() +'/genemark.gff'
--------------------------------------------------------------------------------
/Modules/Lumpy.py:
--------------------------------------------------------------------------------
import sarge,sys,os
def lumpyexpress(in_bams,out_vcf,others=['']):
    '''This function runs lumpyexpress
    * in_bams: sorted bams'''
    bams = ','.join(in_bams)
    splits = ','.join([b[:-3]+'split.bam' for b in in_bams])
    discs = ','.join([b[:-3]+'disc.bam' for b in in_bams])
    cmd = ('lumpyexpress -B {bams} -S {splits} -D {discs} {other} '
           '-k -o {out}').format(
        bams=bams,splits=splits,discs=discs,out=out_vcf,other=' '.join(others))
    with open('cmd.sh','w') as f:
        f.write('#!/bin/bash\n' + cmd)
    print(cmd);sys.stdout.flush()
    sarge.run('chmod 777 cmd.sh && ./cmd.sh && rm cmd.sh')
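SV_Illumina_Lumpy.py (not shown in this excerpt) presumably chains the two modules above: lumpyexpress for raw calls, then svtyper for genotypes. A minimal sketch, assuming sorted BAMs with matching `.split.bam`/`.disc.bam` files already exist; the file names are hypothetical:

```python
# Minimal sketch of chaining Lumpy and SVTyper; inputs are hypothetical.
from Modules.Lumpy import lumpyexpress
from Modules.SVTyper import svtyper

sorted_bams = ['lane1.sort.bam', 'lane2.sort.bam']  # hypothetical sorted bams
lumpyexpress(sorted_bams, 'chos.vcf')               # raw SV calls
svtyper('chos.vcf', 'chos.gt.vcf', sorted_bams[0])  # add genotypes (first bam shown)
```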
--------------------------------------------------------------------------------
/Parameters/GRO_Seq_Cap.yaml:
--------------------------------------------------------------------------------
contact: 'user@gmail.com' # can be email address or phone_number@txt.att.net
RawDataPath: '/data/shangzhong/TSS/fq'
thread: 6
QC: False
# database parameters
ref_fa: '/data/genome/hamster/picr/picr.fa'
gff: '/data/genome/hamster/picr/picr.gff3'
# tool specific parameters
trim_reads: False
trimmomatic_path: '/home/shangzhong/Installation/Trimmomatic-0.32/Trimmomatic-0.33/trimmomatic-0.33.jar'
trim_jobs_per_batch: 6
adapter: '/data/shangzhong/TSS/fq/Gro_adapter.txt'

star_jobs_per_batch: 1
STAR_index_path: '/data/genome/hamster/picr/picr_STAR_Db' #'/opt/genome/cho/STAR_Db' # aligner index
--------------------------------------------------------------------------------
/Parameters/STAR_get_bam.yaml:
--------------------------------------------------------------------------------
contact: 'user@gmail.com' # can be email address or phone_number@txt.att.net
RawDataPath: '/data/shangzhong/DE/helene'
thread: 8
# database parameters
ref_fa: '/data/genome/hamster/picr/picr.fa'
gff: ''
# tool specific parameters
trim_reads: True
trimmomatic_path: '/home/shangzhong/Installation/Trimmomatic-0.32/Trimmomatic-0.33/trimmomatic-0.33.jar'
trim_jobs_per_batch: 6
adapter: ''

picard: '/home/shangzhong/Installation/picard-tools-1.141/picard.jar'

star_jobs_per_batch: 1 # at most 2
star_index: '/data/genome/hamster/multi_pacbio_assemble/picr_STAR_Db'
star_pass: 2

star_params: [''] # should be a list of strings
--------------------------------------------------------------------------------
/Parameters/SV_Illumina_lumpy.yaml:
--------------------------------------------------------------------------------
contact: 'user@gmail.com' # can be email address or phone_number@txt.att.net
RawDataPath: '/data/shangzhong/Pacbio/CHOS_illu_DNA'
thread: 16
# database parameters
ref_fa: '/data/genome/hamster/new_pacbio_assemble/ch_illumina_pbj.fasta'
# tool specific parameters
trim_reads: True
trimmomatic_path: '/home/shangzhong/Installation/Trimmomatic-0.32/Trimmomatic-0.33/trimmomatic-0.33.jar'
trim_jobs_per_batch: 6
adapter: ''

bwa_jobs_per_batch: 2
bwa_index: '/data/genome/hamster/new_pacbio_assemble/bwaDb'
read_groups: ['@RG\tID:lane1\tSM:CHOS','@RG\tID:lane2\tSM:CHOS',
              '@RG\tID:lane3\tSM:CHOS','@RG\tID:lane4\tSM:CHOS',
              '@RG\tID:lane5\tSM:CHOS','@RG\tID:lane6\tSM:CHOS']
--------------------------------------------------------------------------------
/Parameters/GATK_RNA_CHO.yaml:
--------------------------------------------------------------------------------
contact: 'user@gmail.com' # can be email address or phone_number@txt.att.net
RawDataPath: '/data/shangzhong/Proteogenomics/test'
thread: 9
# database parameters
ref_fa: '/data/genome/hamster/ncbi_refseq/hamster.fa'
# tool specific parameters
trim_reads: True
trimmomatic_path: '/home/shangzhong/Installation/Trimmomatic-0.32/Trimmomatic-0.33/trimmomatic-0.33.jar'
trim_jobs_per_batch: 6
adapter: ''

picard: '/home/shangzhong/Installation/picard-tools-1.141/picard.jar'
gatk: '/home/shangzhong/Installation/GenomeAnalysisTK-3.5/GenomeAnalysisTK.jar'

star_jobs_per_batch: 1
star_index: '/data/genome/hamster/ncbi_refseq/hamster_STAR_Db'

sample_name: 'hamster'
read_groups: ['@RG\tID:CellLine8_2\tSM:CellLine8_2','@RG\tID:CellLine8_3\tSM:CellLine8_3']
--------------------------------------------------------------------------------
/Parameters/RNAseq_count.yaml:
--------------------------------------------------------------------------------
contact: 'user@gmail.com' # can be email address or phone_number@txt.att.net
RawDataPath: '/data/shangzhong/DE/mouse/fq'
thread: 12
QC: False
# database parameters
ref_fa: '/data/genome/cho/chok1.fa'
gff: '/data/genome/cho/chok1.gff'
# tool specific parameters
trim_reads: False
trimmomatic_path: '/home/shangzhong/Installation/Trimmomatic-0.32/Trimmomatic-0.33/trimmomatic-0.33.jar'
trim_jobs_per_batch: 6
adapter: ''

star_jobs_per_batch: 1 # at most 2
STAR_index_path: '/data/genome/cho/cho_STAR_Db' #'/opt/genome/cho/STAR_Db' # aligner index

htseq_anno_source: 'ncbi' # alternative values: 'ncbi', 'ensembl' or leave it empty.
strand_specific: 'no' # 'yes', 'no' or 'reverse'
id_name: 'id' # set 'id' for gene ids in the count result, 'name' for gene names
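All of these parameter files are consumed the same way: each pipeline script reads the YAML and wraps the resulting dict with dic2obj from Modules/f01_file_process.py, so keys become attributes (p.ref_fa, p.thread, ...). That module is not part of this excerpt; a minimal sketch of what dic2obj presumably does, for orientation only:

```python
# Hypothetical reconstruction of dic2obj; the real implementation lives in
# Modules/f01_file_process.py, which is not shown in this excerpt.
class dic2obj(object):
    def __init__(self, **entries):
        # expose YAML keys as attributes: p.ref_fa, p.thread, ...
        self.__dict__.update(entries)

# usage, as in the pipeline scripts:
# with open(parameter_file) as f:
#     p = dic2obj(**yaml.load(f))
```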
--------------------------------------------------------------------------------
/Parameters/GATK_DNA_CHO.yaml:
--------------------------------------------------------------------------------
contact: 'user@gmail.com' # can be email address or phone_number@txt.att.net
RawDataPath: '/data/shangzhong/Proteogenomics/test'
thread: 9
# database parameters
ref_fa: '/data/genome/hamster/ncbi_refseq/hamster.fa'
gff: '/data/genome/hamster/ncbi_refseq/hamster.gff'
# tool specific parameters
trim_reads: True
trimmomatic_path: '/home/shangzhong/Installation/Trimmomatic-0.32/Trimmomatic-0.33/trimmomatic-0.33.jar'
trim_jobs_per_batch: 6
adapter: ''


QC: True
picard: '/home/shangzhong/Installation/picard-tools-1.141/picard.jar'
gatk: '/home/shangzhong/Installation/GenomeAnalysisTK-3.5/GenomeAnalysisTK.jar'

bwa_jobs_per_batch: 2
bwa_db: '/data/genome/hamster/ncbi_refseq/bwa_Db'

sample_name: 'hamster'
read_groups: ['@RG\\tID:CellLine8_2\\tSM:CellLine8_2','@RG\\tID:CellLine8_3\\tSM:CellLine8_3']
--------------------------------------------------------------------------------
/Modules/StringTie.py:
--------------------------------------------------------------------------------
import sarge,sys,glob
import pandas as pd
from natsort import natsorted

def stringtie(in_bam,out_gtf,thread,annotation):
    '''
    run StringTie on one bam; besides the assembled gtf it writes per-gene
    abundances (.abund.tab) and the fully covered reference transcripts (.cov_ref.gtf)
    '''
    quant = out_gtf[:-3] + 'abund.tab'
    cov_ref = out_gtf[:-3] + 'cov_ref.gtf'
    cmd = ('stringtie {bam} -o {gtf} -p {t} -G {gff} -A {q} \
    -C {cov}').format(bam=in_bam,gtf=out_gtf,t=str(thread),
                      gff=annotation,q=quant,cov=cov_ref)
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)


def merge_stringtie_tpm(path):
    """This function merges per-sample TPM results into one dataframe.
    Each input .abund.tab file contributes one column [gene name, tpm].
    """
    files = natsorted(glob.glob(path + '/*.tab'))
    dfs = []
    for f in files:
        sp = f.split('/')[-1].split('.')[0]
        df = pd.read_csv(f,sep='\t',header=0,usecols=[1,8],names=['name',sp],index_col=0)
        df = df[df.index.values !='-']
        df = df.groupby('name').sum()
        dfs.append(df)
    res_df = pd.concat(dfs,axis=1)
    return res_df
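Once StringTie_quant.py has produced one .abund.tab per sample, the merge helper above can be used like this (a minimal sketch; the output file name is hypothetical):

```python
# Build a gene-by-sample TPM matrix from a directory of StringTie abundance
# tables. 'stringtie' is the output folder the StringTie_quant.py pipeline
# creates; 'tpm_matrix.csv' is a hypothetical output name.
from Modules.StringTie import merge_stringtie_tpm

tpm = merge_stringtie_tpm('stringtie')  # columns are sample names
tpm.to_csv('tpm_matrix.csv')
```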
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# NewPipeline
Pipelines to process NGS or Pacbio data
---------------------------------------

## Method to run these pipelines
#### Paired-end files should end with _1.fq.gz, _2.fq.gz or _1.fastq.gz, _2.fastq.gz. Single-end files should end with _1.fq.gz
#### STAR takes a lot of memory (30-50 GB) per run, so don't run more than 2 STAR jobs in parallel in each batch.
1. Define all parameters in the corresponding parameter file in the Parameters folder.
2. In a bash terminal, run the following command:
    * nohup python pipeline.py parameter.yaml > log.txt &

   Or, if you are running in screen, try the following command:
    * python pipeline.py parameter.yaml 2>&1 | tee log.txt
3. Press enter

* Finished Pipelines
    * RNAseq_count: quantify the number of reads mapping to each gene
    * GATK_RNA_CHO: call variants from RNAseq data
    * SV_Pacbio_PBHoney: call structural variants from Pacbio data using PBHoney
    * SV_Pacbio_Sniffle: call structural variants from Pacbio data using Sniffles


## Pipeline specific notes

### RNAseq_STARpipeline.sh

* Be aware: this is a bash pipeline and does not manage flow and reruns like ruffus.
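The naming convention above is what the shared helper list_fq_files (in Modules/f01_file_process.py, not included in this excerpt) relies on to pair reads. A hedged sketch of that pairing logic, for illustration only:

```python
# Hypothetical sketch of how paired-end fastq files could be grouped under
# the naming convention above; the real implementation is list_fq_files in
# Modules/f01_file_process.py, which this excerpt does not include.
import os, re

def pair_fastqs(path):
    fqs = sorted(f for f in os.listdir(path) if re.search(r'\.f(ast)?q\.gz$', f))
    pairs = {}
    for f in fqs:
        key = re.sub(r'_[12]\.f(ast)?q\.gz$', '', f)  # strip the _1/_2 suffix
        pairs.setdefault(key, []).append(f)
    # each value is [sample_1.fq.gz, sample_2.fq.gz] or [sample_1.fq.gz]
    return list(pairs.values())
```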
--------------------------------------------------------------------------------
/Modules/HTseq.py:
--------------------------------------------------------------------------------
import os,sys
import sarge

def htseq_count(sortedBam,countFile,annotation,strand,annotationSource):
    """This function runs htseq-count to count reads per feature in a bam file
    * sortedBam: str. bam file name
    * countFile: str. output file name
    * annotation: str. annotation file
    * strand: str. 'yes', 'no' or 'reverse'
    * annotationSource: str. 'ncbi', 'ensembl', 'genedb' or 'plasmodium'
    """
    # 1. choose feature type and id attribute for the annotation source
    if annotationSource == 'ncbi':
        seqType = 'exon'
        id_attr = 'gene'
    elif annotationSource == 'ensembl':
        seqType = 'exon'
        id_attr = 'gene_id'
    elif annotationSource == 'genedb':
        seqType = 'CDS'
        id_attr = 'Parent'
    elif annotationSource == 'plasmodium':
        seqType = 'exon'
        id_attr = 'Parent'
    else:
        raise ValueError('unknown annotation source: ' + annotationSource)
    # 2. run htseq-count
    cmd = ('htseq-count -f bam -s {strand} -t {type} -i {gene} {bam} {annotation} > {output}').format(strand=strand,
            type=seqType,gene=id_attr,bam=sortedBam,annotation=annotation,output=countFile)
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)


def Message(string,email):
    """
    This function sends a message to email when it runs.
    Used to mark when a pipeline starts, finishes or fails.
    """
    cmd = ('echo {quote}|mailx -s "{string}" {email}').format(quote="",string=string,email=email)
    sarge.run(cmd)
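htseq_count just shells out to htseq-count; with the NCBI settings, the call below (file names hypothetical) would build the command shown in the comment:

```python
# File names are hypothetical; with annotationSource='ncbi' this builds:
# htseq-count -f bam -s no -t exon -i gene sample.sort.bam hamster.gff > sample.txt
from Modules.HTseq import htseq_count

htseq_count('sample.sort.bam', 'sample.txt', 'hamster.gff',
            strand='no', annotationSource='ncbi')
```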
--------------------------------------------------------------------------------
/Modules/Samtools.py:
--------------------------------------------------------------------------------
import sarge
import sys

def sortBam(bamFile,sortedBamFile,thread=1,sortType=''):
    """
    This function sorts a bam file (by coordinate, or by name if sortType='name')
    """
    if sortType == 'name':
        tag = ' -n'
    else:
        tag = ''
    cmd = ('samtools sort{tag} -m 4G -@ {thread} -T {sort} -o {sortBam} {bam} ').format(
        tag=tag,thread=str(thread),sort=bamFile[:-3]+'sort',bam=bamFile,sortBam=sortedBamFile)
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)
    if sortType !='name':
        cmd = ('samtools index {bam} ').format(bam=sortedBamFile)
        print(cmd);sys.stdout.flush()
        sarge.run(cmd)

def sam2bam(samFile,bamFile,thread):
    """
    This function converts a sam file to a bam file
    """
    cmd = ('samtools view -b -@ {thread} -h {sam} -o {bam} ').format(
        thread=thread,sam=samFile,bam=bamFile)
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)


def build_fa_index(ref_fa):
    '''build a fai index for a fasta file (required by GATK)
    '''
    cmd = ('samtools faidx {ref}').format(ref=ref_fa)
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)


def merge_bams(bamfiles,outputbam):
    """this function merges bam files into one"""
    if len(bamfiles) == 1:
        cmd = ('mv {input} {output}').format(input=bamfiles[0],output=outputbam)
    else:
        bam = ' '.join(bamfiles)
        cmd = ('samtools merge -f {output} {input}').format(output=outputbam,input=bam)
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)
--------------------------------------------------------------------------------
/Modules/CNVnator.py:
--------------------------------------------------------------------------------
import sarge,sys


def cnv_extract_bam(in_bam,out_root,others=['']):
    '''
    extract read mappings from a bam file into a root file
    '''
    cmd = ('cnvnator -root {out} -unique -tree {bam} {other}').format(out=out_root,bam=in_bam,
                                                                     other=' '.join(others))
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)


def cnv_generate_hist(in_root,chr_path,bin_win,others=['']):
    '''
    generate the read-depth histogram
    '''
    cmd = ('cnvnator -root {root} -his {bin} -d {dir} {other}').format(root=in_root,
            bin=str(bin_win),dir=chr_path,other=' '.join(others))
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)


def cnv_statistics(in_root,bin_win,others=['']):
    cmd = ('cnvnator -root {root} -stat {bin} {other}').format(root=in_root,bin=str(bin_win),other=' '.join(others))
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)


def cnv_partitioning(in_root,bin_win,others=['']):
    cmd = ('cnvnator -root {root} -partition {bin} {other}').format(root=in_root,bin=str(bin_win),other=' '.join(others))
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)


def cnv_call(in_root,out,bin_win,others=['']):
    cmd = ('cnvnator -root {root} -call {bin} {other} > {out}').format(root=in_root,bin=str(bin_win),out=out,other=' '.join(others))
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)

# output_file = '/data/shangzhong/Pacbio/CHOS_illu_DNA/cnv/cnv/merge.txt'
# chr_path = '/data/shangzhong/Pacbio/CHOS_illu_DNA/cnv/cnv/scaffold'
# bin_win = 100
# others = ['-chrom NW_006887432.1']
# root = output_file[:-3] + 'root'
# cnv_generate_hist(root,chr_path,bin_win,others)
# # 3
# cnv_statistics(root,bin_win,others)
# # 4
# cnv_partitioning(root,bin_win,others)
# # 5
# cnv_call(root,output_file,bin_win,others)
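The commented block above hints at the intended order of calls; a minimal runnable sketch of the full CNVnator chain, with hypothetical paths and the bin size and chromosome list mirroring Parameters/CNVnator.yaml:

```python
# Minimal sketch of the full CNVnator call chain, following the commented
# driver above. Paths, bin size and chromosome list are hypothetical.
from Modules.CNVnator import (cnv_extract_bam, cnv_generate_hist,
                              cnv_statistics, cnv_partitioning, cnv_call)

bam      = 'sample.sort.bam'             # hypothetical sorted, indexed bam
root     = 'sample.root'
chr_path = 'scaffold'                    # folder of per-scaffold fasta files
bin_win  = 100
others   = ['-chrom NW_006887432.1']

cnv_extract_bam(bam, root, others)                   # 1. read mappings -> root file
cnv_generate_hist(root, chr_path, bin_win, others)   # 2. read-depth histogram
cnv_statistics(root, bin_win, others)                # 3. statistics
cnv_partitioning(root, bin_win, others)              # 4. partitioning
cnv_call(root, 'sample.cnv.txt', bin_win, others)    # 5. CNV calls
```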
--------------------------------------------------------------------------------
/Modules/Picard.py:
--------------------------------------------------------------------------------
import sarge
import sys,os

def sam2fq(inSam,outPrefix,picard,endType):
    """convert a sam/bam file to fastq file(s)
    """
    if endType == 'single':
        cmd = ('java -jar {picard} SamToFastq I={input} F={fq} '
               'VALIDATION_STRINGENCY=LENIENT ').format(
            picard=picard,input=inSam,fq=outPrefix+'.fq.gz')
    else:
        cmd = ('java -jar {picard} SamToFastq I={input} F={fq1} F2={fq2} '
               'VALIDATION_STRINGENCY=LENIENT ').format(picard=picard,
            input=inSam,fq1=outPrefix+'_1.fq.gz',fq2=outPrefix+'_2.fq.gz')
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)

# if __name__ == '__main__':
#     path = '/data/shangzhong/DetectVirus/unmap_bam'
#     picard = '/home/shangzhong/Installation/picard-tools-1.141/picard.jar'
#     os.chdir(path)
#     bams = [f for f in os.listdir(path) if f.endswith('.bam')]
#     for bam in bams:
#         out = bam.split('.')[0]
#         sam2fq(bam,out,picard,'single')


def build_fa_dict(ref_fa,picard):
    '''build a sequence dictionary for a fasta file (required by GATK)'''
    out = '.'.join(ref_fa.split('.')[:-1]) + '.dict'
    cmd = ('java -jar {picard} CreateSequenceDictionary R={ref} O={out}').format(
        picard = picard,ref=ref_fa,out=out)
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)


def mark_duplicates(sortBam,dedupBam,picard):
    '''mark duplicate reads'''
    cmd = ('java -Djava.io.tmpdir=tmp -jar {picard} MarkDuplicates I={input} O={out} '
           'CREATE_INDEX=true METRICS_FILE=metrics.txt MAX_RECORDS_IN_RAM=8000000 '
           'MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=1000 '
           'VALIDATION_STRINGENCY=LENIENT').format(picard=picard,input=sortBam,out=dedupBam)
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)


def add_readgroup(sortBam,rgBam,readgroup,picard):
    '''add a read group, parsing ID and SM from a string like '@RG\\tID:x\\tSM:y' '''
    if not os.path.exists('tmp'):os.mkdir('tmp')
    rg = readgroup.split('\\t')
    ID = rg[1][3:]
    SM = rg[2][3:]
    PL = 'illumina'
    LB = 'lib20000'
    PU = 'unit1'
    cmd = ('java -jar {picard} AddOrReplaceReadGroups I={input} O={rgBam} SO=coordinate '
           'RGID={ID} RGSM={SM} RGPL={PL} RGLB={LB} RGPU={PU} TMP_DIR=tmp').format(
        picard=picard,input=sortBam,rgBam=rgBam,ID=ID,SM=SM,PL=PL,LB=LB,PU=PU)
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)
--------------------------------------------------------------------------------
/Modules/PBhoney.py:
--------------------------------------------------------------------------------
import sarge
import re
import os
import pandas as pd
import numpy as np

def Honey_pie(sortBam,sortTailBam,ref_fa,thread,tmp,otherParams=['']):
    """Honey pie extracts soft-clipped reads and remaps them
    """
    tailBam = re.sub('\.final\.bam$','.tail.bam',sortTailBam)
    cmd = ('Honey.py pie -o {tail} -n {thread} {input} {ref} --temp {tmp} ').format(tail=tailBam,
            thread=str(thread),input=sortBam,ref=ref_fa,tmp=tmp)
    cmd = cmd + ' '.join(otherParams)
    print(cmd)
    sarge.run(cmd)
    # sort
    cmd = ('samtools sort -m 4G -@ {thread} -T {pre} -o {sortBam} {bam} ').format(
        thread=str(thread),pre=tailBam[:-4],sortBam=sortTailBam,bam=tailBam)
    print(cmd)
    sarge.run(cmd)
    # index
    cmd = ('samtools index {out} ').format(out=sortTailBam)
    print(cmd)
    sarge.run(cmd)
    # os.remove(sortBam)


def Honey_tails(finalBam,bamTail,otherParams=['']):
    """This function runs Honey tails to cluster the soft-clipped reads
    """
    cmd = ('Honey.py tails -o {out} {input} ').format(input=finalBam,out=bamTail)
    cmd = cmd + ' '.join(otherParams)
    print(cmd)
    sarge.run(cmd)


def Honey_spots(finalBam,spotFile,ref_fa,thread,otherParams=['']):
    """This function runs Honey spots.
    """
    cmd = ('Honey.py spots --reference {ref} -n {thread} -o {out} {input} ').format(
        input=finalBam,ref=ref_fa,thread=str(thread),out=spotFile)
    cmd = cmd + ' '.join(otherParams)
    print(cmd)
    sarge.run(cmd)


class pb_tail_res(object):
    '''
    wrapper around the output of Honey tails; input should be a pandas dataframe
    '''
    def __init__(self,df):
        self.df = df
        self.df.columns = ['id','chrKey','uRef','uBreak','uMapq','dRef','dBreak','dMapq','remainSeq','annot','numReads','numZMWs','evidence']

    def get_sv_types(self):
        '''get all the sv types'''
        types = list(set(self.df['annot'].tolist()))
        return types

    def get_sv_num(self,sv_type):
        '''get the number of calls of one sv type'''
        df = self.df
        return df[df['annot'].values==sv_type].shape[0]

    def add_sv_len(self):
        '''add sv length for each sv, except translocations whose break points are on different chromosomes'''
        df = self.df
        df['len'] = df.apply(lambda row: 'NA' if row['annot']=='TLOC' else int(row['dBreak'])-int(row['uBreak']),axis=1)
        return df

    def get_sv_num4_each_chr(self,chr_len_df,sv_type,count_log=False,length_log=False):
        '''get the sv count for each scaffold for a specific sv type
        * chr_len_df: pandas dataframe with one column ['chr_len']; chr name is the index
        '''
        df = self.df[self.df['annot'].values==sv_type]
        sv_count = df.groupby(['uRef']).size()
        df = pd.concat([sv_count,chr_len_df],axis=1)
        df = df.fillna(0)
        df = df.rename(columns={0:'count'})
        if count_log == True:
            df['count'] = np.log10(df['count'])
        if length_log == True:
            df['chr_len'] = np.log10(df['chr_len'])
        return df
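A minimal sketch of using pb_tail_res on a Honey tails output file; the file name is hypothetical, and the column layout is assumed to match the constructor above:

```python
# Summarize a Honey tails result with pb_tail_res; 'sample.tailes' is a
# hypothetical output of the SV_Pacbio_PBHoney pipeline.
import pandas as pd
from Modules.PBhoney import pb_tail_res

df = pd.read_csv('sample.tailes', sep='\t', comment='#', header=None)
res = pb_tail_res(df)
print(res.get_sv_types())       # e.g. ['DEL', 'INS', 'TLOC']
print(res.get_sv_num('TLOC'))   # number of translocation calls
print(res.add_sv_len().head())  # per-call SV length ('NA' for TLOC)
```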
--------------------------------------------------------------------------------
/SV_Pacbio_Sniffle.py:
--------------------------------------------------------------------------------
import os,sys
from Modules.f01_file_process import *
from ruffus import *
from Modules.Aligner import bwa_mem,bwa_Db,ngmlr
from Modules.Sniffles import sniffle
import yaml
from Modules.Samtools import sortBam

#============ parameters ======================
parameter_file = sys.argv[1]
#parameter_file = '/home/shangzhong/Codes/NewPipeline/Parameters/SV_Pacbio_Sniffle.yaml'
with open(parameter_file,'r') as f:
    doc = yaml.load(f)
p = dic2obj(**doc)
#------------- get parameters -----------
file_path = p.RawDataPath
thread = p.thread
# all parameter
ref_fa = p.ref_fa
db_path = p.bwa_db
contact = p.contact
aligner = p.aligner
#===============================================================================
#                 Pipeline part
#===============================================================================
#--------------------- 1. read all files ------------------------------------------------
Message('Sniffle start',contact)
os.chdir(file_path)
#--------------------- 2. align all files -----------------------------------------------
# both branches define a run_align task so the downstream tasks work for either aligner
if aligner == 'bwa':
    inputFiles = [f for f in os.listdir(file_path) if f.endswith('fq.gz') or f.endswith('fastq.gz')]
    # build index
    @active_if(not os.path.exists(db_path))
    def bwa_index():
        bwa_Db(db_path,ref_fa)
        os.chdir(file_path)

    @follows(bwa_index)
    @mkdir(inputFiles,formatter(),'{path[0]}/bam')
    @check_if_uptodate(check_file_exists)
    @transform(inputFiles,formatter('.*\.f.*q\.gz'),'bam/{basename[0]}.bam')
    def run_align(input_file,output_file):
        print(input_file + '-->' + output_file)
        bwa_mem([input_file],output_file,db_path+'/bwa',thread,otherParameters=['-M','-x pacbio'])
elif aligner == 'ngmlr':
    inputFiles = [f for f in os.listdir(file_path) if f.endswith('fa.gz') or f.endswith('fasta.gz')]
    @mkdir(inputFiles,formatter(),'{path[0]}/bam')
    @check_if_uptodate(check_file_exists)
    @transform(inputFiles,formatter('.*\.f.*a\.gz'),'bam/{basename[0]}.bam')
    def run_align(input_file,output_file):
        ngmlr(input_file,output_file,ref_fa,thread)
else:
    raise ValueError("aligner must be 'bwa' or 'ngmlr'")
#--------------------- 3. sort bam file -----------------------------------------------
@follows(run_align)
@mkdir(inputFiles,formatter(),'{path[0]}/sortBam')
@check_if_uptodate(check_file_exists)
@transform(run_align,formatter('.*\.bam'),'sortBam/{basename[0]}.sort.bam')
def run_sortBam(input_file,output_file):
    sortBam(input_file,output_file,thread)
#--------------------- 4. Detect SV -----------------------------------------------
@follows(run_sortBam)
@mkdir(inputFiles,formatter(),'{path[0]}/vcf')
@check_if_uptodate(check_file_exists)
@transform(run_sortBam,formatter('.*\.sort\.bam'),'vcf/{basename[0]}.vcf')
def run_sniffle(input_file,output_file):
    sniffle(input_file,output_file,otherParameters=[''])
#--------------------- 5. return finish message -----------------------------------------------------
@follows(run_sniffle)
def last_function():
    Message('SV_Sniffle finished',contact)


if __name__ == '__main__':
    try:
        pipeline_run([last_function],multiprocess=thread,gnu_make_maximal_rebuild_mode = True,
                     touch_files_only=False,verbose=15)
    except:
        Message('SV_Sniffle failed',contact)
"text/x-python", 112 | "name": "python", 113 | "nbconvert_exporter": "python", 114 | "pygments_lexer": "ipython3", 115 | "version": "3.6.8" 116 | } 117 | }, 118 | "nbformat": 4, 119 | "nbformat_minor": 2 120 | } 121 | -------------------------------------------------------------------------------- /STAR_get_bam.py: -------------------------------------------------------------------------------- 1 | from ruffus import * 2 | from Modules.f01_file_process import * 3 | from Modules.Aligner import STAR_Db,STAR 4 | from Modules.Trimmomatic import Trimmomatic 5 | from Modules.Samtools import * 6 | import yaml,sys 7 | import shutil 8 | import glob 9 | 10 | 11 | #============ parameters ====================== 12 | parameter_file = sys.argv[1] 13 | #parameter_file = '/data/shangzhong/Proteogenomics/STAR_get_bam.yaml' 14 | with open(parameter_file,'r') as f: 15 | doc = yaml.load(f) 16 | p = dic2obj(**doc) 17 | #------------- get parameters ----------- 18 | file_path = p.RawDataPath 19 | thread = p.thread 20 | # all parameter 21 | ref_fa = p.ref_fa 22 | gff = p.gff 23 | # trimmomatic parameter 24 | trim = p.trim_reads 25 | trimmomatic = p.trimmomatic_path 26 | trim_batch = p.trim_jobs_per_batch 27 | adapter = p.adapter 28 | 29 | star_batch = p.star_jobs_per_batch 30 | star_db = p.star_index 31 | run_pass = p.star_pass 32 | other_params = p.star_params 33 | 34 | contact = p.contact 35 | #=============================================================================== 36 | # Pipeline part 37 | #=============================================================================== 38 | Message('get bam start',contact) 39 | os.chdir(file_path) 40 | #=============================================================================== 41 | # Part I. Preprocess 42 | #=============================================================================== 43 | #--------------------- 1. read all files ------------------------------------------------ 44 | fastqFiles = list_fq_files(file_path) 45 | if fastqFiles[0][0].startswith('trim_'): 46 | trim = False 47 | def trim_parameters(): 48 | infiles,outfiles = replace_filename(fastqFiles,'^','trim_') 49 | for infile, output in zip(infiles,outfiles): 50 | yield infile,output 51 | #--------------------- 2. trim reads----------------------------------------------------- 52 | @active_if(trim) 53 | @jobs_limit(trim_batch) 54 | @files(trim_parameters) 55 | def trim_reads(input_file,output_file): 56 | n = num_thread2use(trim_batch,len(fastqFiles),thread) 57 | Trimmomatic(input_file,output_file,trimmomatic,n,adapter) 58 | remove(input_file) 59 | #--------------------- 4. 
--------------------------------------------------------------------------------
/STAR_get_bam.py:
--------------------------------------------------------------------------------
from ruffus import *
from Modules.f01_file_process import *
from Modules.Aligner import STAR_Db,STAR
from Modules.Trimmomatic import Trimmomatic
from Modules.Samtools import *
import yaml,sys
import shutil
import glob


#============ parameters ======================
parameter_file = sys.argv[1]
#parameter_file = '/data/shangzhong/Proteogenomics/STAR_get_bam.yaml'
with open(parameter_file,'r') as f:
    doc = yaml.load(f)
p = dic2obj(**doc)
#------------- get parameters -----------
file_path = p.RawDataPath
thread = p.thread
# all parameter
ref_fa = p.ref_fa
gff = p.gff
# trimmomatic parameter
trim = p.trim_reads
trimmomatic = p.trimmomatic_path
trim_batch = p.trim_jobs_per_batch
adapter = p.adapter

star_batch = p.star_jobs_per_batch
star_db = p.star_index
run_pass = p.star_pass
other_params = p.star_params

contact = p.contact
#===============================================================================
#                 Pipeline part
#===============================================================================
Message('get bam start',contact)
os.chdir(file_path)
#===============================================================================
#                 Part I. Preprocess
#===============================================================================
#--------------------- 1. read all files ------------------------------------------------
fastqFiles = list_fq_files(file_path)
if fastqFiles[0][0].startswith('trim_'):
    trim = False
def trim_parameters():
    infiles,outfiles = replace_filename(fastqFiles,'^','trim_')
    for infile, output in zip(infiles,outfiles):
        yield infile,output
#--------------------- 2. trim reads-----------------------------------------------------
@active_if(trim)
@jobs_limit(trim_batch)
@files(trim_parameters)
def trim_reads(input_file,output_file):
    n = num_thread2use(trim_batch,len(fastqFiles),thread)
    Trimmomatic(input_file,output_file,trimmomatic,n,adapter)
    remove(input_file)
#--------------------- 3. Map with STAR -----------------------------------------------------
def get_fq():
    fqFiles = list_fq_files(file_path)
    for fq in fqFiles:
        out = 'sortBam/' + re.sub('\.f.*q\.gz','.bam',fq[0])
        yield fq,out
# build index
@active_if(not os.path.exists(star_db))
@follows(trim_reads)
def star_index():
    STAR_Db(star_db,ref_fa,thread)
# align
other_params.extend(['--outSAMtype BAM', 'SortedByCoordinate'])
if run_pass == 2:
    other_params.append('--twopassMode Basic')


@jobs_limit(star_batch)
@follows(star_index)
@mkdir(fastqFiles,formatter(),'{path[0]}/sortBam')
#@transform(fastqFiles,formatter('.*\.f.*?\.gz'),'sortBam/{basename[0]}.bam')
@files(get_fq)
def run_star(input_file,output_file):
    n = num_thread2use(star_batch,len(fastqFiles),thread)
    STAR(input_file,output_file,star_db,n,gff,other_params)

@follows(run_star)
def last_function():
    Message('get bam succeed',contact)

if __name__ == '__main__':
    try:
        # pipeline_printout(sys.stdout, [last_function], verbose=3)
        pipeline_run([last_function],multiprocess=thread,gnu_make_maximal_rebuild_mode = True,
                     touch_files_only=False,verbose=5)
    except:
        Message('get bam failed',contact)
--------------------------------------------------------------------------------
/SV_Pacbio_PBHoney.py:
--------------------------------------------------------------------------------
from ruffus import *
import yaml
from Modules.f01_file_process import *
from Modules.Aligner import BLASR
from Modules.Samtools import *
from Modules.PBhoney import *
import shutil,os
import sys
#============ parameters ======================
parameter_file = sys.argv[1]
#parameter_file = '/home/shangzhong/Codes/NewPipeline/Parameters/Pacbio_SV.yaml'
with open(parameter_file,'r') as f:
    doc = yaml.load(f)
p = dic2obj(**doc)
#------------- get parameters -----------
file_path = p.RawDataPath
thread = p.thread
# all parameter
ref_fa = p.ref_fa
sa = p.sa_index
# tool parameters
blasr_batch = p.blasr_jobs_per_batch
sam_sort_batch = p.sam_sort_jobs_per_batch

contact = p.contact
#===============================================================================
#                 Pipeline part
#===============================================================================
#--------------------- 1. read all files ------------------------------------------------
Message('PBHoney start',contact)
os.chdir(file_path)
faFiles = [os.path.join(file_path,f) for f in os.listdir(file_path) if f.endswith('.fa')]
print(faFiles);sys.stdout.flush()
#--------------------- 2. run BLASR -----------------------------------------------------
@jobs_limit(blasr_batch)
@mkdir(faFiles,formatter(),'{path[0]}/bam')
@transform(faFiles,formatter(),'bam/{basename[0]}.bam') #regex('.*\.fa'),'.bam')
@check_if_uptodate(check_file_exists)
def run_blasr(input_file,output_file):
    n = num_thread2use(blasr_batch,len(faFiles),thread)
    BLASR(input_file,output_file,ref_fa,n,['-clipping soft','-sa '+sa])
#--------------------- 3. Sam2SortBam -----------------------------------------------------
# sort bam
@follows(run_blasr)
@jobs_limit(sam_sort_batch)
@mkdir(faFiles,formatter(),'{path[0]}/sortBam')
@transform(run_blasr,formatter('.*\.bam'),'sortBam/{basename[0]}.sort.bam')
@check_if_uptodate(check_file_exists)
def sortbam(input_file,output_file):
    n = num_thread2use(sam_sort_batch,len(faFiles),thread)
    sortBam(input_file,output_file,n)
    if os.path.exists('bam'): shutil.rmtree('bam')
#--------------------- 4. detect SV using PBhoney -----------------------------------------------------
@mkdir(faFiles,formatter(),'{path[0]}/HoneyPie')
@transform(sortbam,formatter('.*\.sort\.bam'),'HoneyPie/{basename[0]}.final.bam')
@check_if_uptodate(check_file_exists)
def Honeypie(input_file,output_file):
    n = num_thread2use(thread,len(faFiles),thread)
    Honey_pie(input_file,output_file,ref_fa,n,'HoneyPie')

@follows(Honeypie)
@mkdir(faFiles,formatter(),'{path[0]}/HoneyTail')
@transform(Honeypie,formatter('.*\.final\.bam'),'HoneyTail/{basename[0]}.tailes')
@check_if_uptodate(check_file_exists)
def Honeytailes(input_file,output_file):
    Honey_tails(input_file,output_file)

@follows(Honeytailes)
@mkdir(faFiles,formatter(),'{path[0]}/HoneySpots')
@transform(Honeypie,formatter('.*\.final\.bam'),'HoneySpots/{basename[0]}.spots')
@check_if_uptodate(check_file_exists)
def Honeyspots(input_file,output_file):
    n = num_thread2use(thread,len(faFiles),thread)
    Honey_spots(input_file,output_file,ref_fa,n)

#---------------------- 5. report succeed -------------------------------------------------------------
@follows(Honeyspots)
def last_function():
    Message('job finished',contact)


if __name__ == '__main__':
    try:
        pipeline_run([last_function],multiprocess=thread,gnu_make_maximal_rebuild_mode = True,
                     touch_files_only=False,verbose=5)
    except:
        Message('Pacbio SV failed',contact)
        pass
--------------------------------------------------------------------------------
/Modules/Trimmomatic.py:
--------------------------------------------------------------------------------
import sarge
import os
import gzip
import sys

def get_phred_score(fq):
    """detect the phred quality encoding ('33' or '64') of a fastq.gz file
    """
    with gzip.open(fq,'rb') as f:
        n = 0
        for line in f:
            n = n + 1
            if n % 4 == 0: # only check quality lines
                vals = [ord(c) for c in line.rstrip().decode()]
                if min(vals) <= 50:
                    return '33'
                if max(vals) >= 83:
                    return '64'
    raise ValueError('could not determine the phred score, set it manually')

def Trimmomatic(fqFiles,trim_fqFiles,trimmomatic,thread,adapter_file='',min_len=36):
    """This function runs trimmomatic to trim reads"""
    # main parameters
    unpair = [f + 'unpair' for f in fqFiles]
    phred = get_phred_score(fqFiles[0])
    if len(fqFiles) == 1:
        trimCmd1st = ('java -jar {trim} SE -threads {thread} -phred{type} '
                      '{input} {output} ').format(trim=trimmomatic,thread = int(thread),
                    input = fqFiles[0],output=trim_fqFiles[0],type=phred)
        trimCmd2nd = 'SLIDINGWINDOW:5:10 LEADING:15 TRAILING:10 MINLEN:{len} TOPHRED33 '.format(len=min_len)
    elif len(fqFiles) == 2:
        trimCmd1st = ('java -jar {trim} PE -threads {thread} -phred{type} {fastq1} {fastq2} '
                      '{Trimmed1} {unpair1} {Trimmed2} {unpair2} ').format(trim=trimmomatic,
                    thread=int(thread),type=phred,fastq1 = fqFiles[0], fastq2=fqFiles[1],
                    Trimmed1 = trim_fqFiles[0], Trimmed2 = trim_fqFiles[1],unpair1=unpair[0],unpair2=unpair[1])
        trimCmd2nd = 'SLIDINGWINDOW:5:10 LEADING:15 TRAILING:10 MINLEN:{len} TOPHRED33 '.format(len=str(min_len))
    # adapter file
    if adapter_file != '':
        adaptCmd = 'ILLUMINACLIP:{adapter}:2:30:10 '.format(adapter=adapter_file)
    else:
        adaptCmd = ''
    cmd = trimCmd1st + adaptCmd + trimCmd2nd
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)
    for un in unpair:
        if os.path.exists(un):
            os.remove(un)

def conda_Trimmomatic(fqFiles,trim_fqFiles,thread,adapter_file='',min_len=36):
    """This function runs a conda-installed trimmomatic to trim reads"""
    # main parameters
    unpair = [f + 'unpair' for f in fqFiles]
    phred = get_phred_score(fqFiles[0])
    if len(fqFiles) == 1:
        trimCmd1st = ('trimmomatic SE -threads {thread} -phred{type} '
                      '{input} {output} ').format(thread = int(thread),
                    input = fqFiles[0],output=trim_fqFiles[0],type=phred)
        trimCmd2nd = 'SLIDINGWINDOW:5:10 LEADING:15 TRAILING:10 MINLEN:{len} TOPHRED33 '.format(len=min_len)
    elif len(fqFiles) == 2:
        trimCmd1st = ('trimmomatic PE -threads {thread} -phred{type} {fastq1} {fastq2} '
                      '{Trimmed1} {unpair1} {Trimmed2} {unpair2} ').format(
                    thread=int(thread),type=phred,fastq1 = fqFiles[0], fastq2=fqFiles[1],
                    Trimmed1 = trim_fqFiles[0], Trimmed2 = trim_fqFiles[1],unpair1=unpair[0],unpair2=unpair[1])
        trimCmd2nd = 'SLIDINGWINDOW:5:10 LEADING:15 TRAILING:10 MINLEN:{len} TOPHRED33 '.format(len=str(min_len))
    # adapter file
    if adapter_file != '':
        adaptCmd = 'ILLUMINACLIP:{adapter}:2:30:10 '.format(adapter=adapter_file)
    else:
        adaptCmd = ''
    cmd = trimCmd1st + adaptCmd + trimCmd2nd
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)
    for un in unpair:
        if os.path.exists(un):
            os.remove(un)
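The thresholds in get_phred_score come from the ASCII ranges of the two encodings: Phred+33 uses roughly '!' (33) through 'J' (74), Phred+64 roughly '@' (64) through 'h' (104), so a character at or below 50 can only occur in Phred+33 data and one at or above 83 only in Phred+64 data. A quick worked check, for illustration only:

```python
# Worked check of the threshold logic above (pure illustration).
qual33 = 'IIIIH#FFFF'  # typical Phred+33 quality string
qual64 = 'hhhhgggfff'  # typical Phred+64 quality string
print(min(ord(c) for c in qual33))  # 35 ('#')  <= 50 -> phred 33
print(max(ord(c) for c in qual64))  # 104 ('h') >= 83 -> phred 64
```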
--------------------------------------------------------------------------------
/Salmon_quant.py:
--------------------------------------------------------------------------------
from ruffus import *
from Modules.f01_file_process import *
from Modules.Aligner import STAR,STAR_Db
from Modules.Trimmomatic import Trimmomatic
from Modules.Samtools import sortBam
from Modules.HTseq import htseq_count
import yaml
import sys,shutil
from Modules.StringTie import stringtie
#============ parameters ======================
parameter_file = sys.argv[1]
#parameter_file = '/data/shangzhong/Proteogenomics/RNAseq_count.yaml'
with open(parameter_file,'r') as f:
    doc = yaml.load(f)
p = dic2obj(**doc)
#------------- get parameters -----------
file_path = p.RawDataPath
thread = p.thread
QC = p.QC
# all parameter
ref_fa = p.ref_fa
annotation = p.gff
# trimmomatic parameter
trim = p.trim_reads
trimmomatic = p.trimmomatic_path
trim_batch = p.trim_jobs_per_batch
adapter = p.adapter
# star parameter
star_batch = p.star_jobs_per_batch
db_path = p.STAR_index_path

contact = p.contact
#===============================================================================
#                 Pipeline part
#===============================================================================
#--------------------- 1. read all files ------------------------------------------------
Message('stringtie start',contact)
os.chdir(file_path)
fastqFiles = list_fq_files(file_path)
if fastqFiles[0][0].startswith('trim_'):
    trim = False
#--------------------- 2. trim reads-----------------------------------------------------
def trim_parameters():
    infiles,outfiles = replace_filename(fastqFiles,'^','trim_')
    for infile, output in zip(infiles,outfiles):
        yield infile,output
#------------- run fastqc before trimming -----------
@active_if(QC)
@jobs_limit(thread)
@mkdir(fastqFiles,formatter(),'{path[0]}/fastqc')
@files(trim_parameters)
def run_QC1(input_file,output_file):
    for fq in input_file:
        sarge.run('fastqc {input} -o fastqc'.format(input=fq))
#------------ trim file ------------------
@active_if(trim)
@follows(run_QC1)
@jobs_limit(trim_batch)
@files(trim_parameters)
def trim_reads(input_file,output_file):
    n = num_thread2use(trim_batch,len(fastqFiles),thread)
    Trimmomatic(input_file,output_file,trimmomatic,n,adapter)
    remove(input_file)
#------------ run fastqc after trimming ------------
@active_if(QC and trim)
@follows(trim_reads)
@jobs_limit(thread)
@mkdir(fastqFiles,formatter(),'{path[0]}/fastqc')
@files(trim_parameters)
def run_QC2(input_file,output_file):
    for fq in output_file:
        sarge.run('fastqc {input} -o fastqc'.format(input=fq))
#--------------------- 3. run STAR ------------------------------------------------------
# build index
@active_if(not os.path.exists(db_path))
@follows(trim_reads,run_QC2)
def star_index():
    STAR_Db(db_path,ref_fa,thread)
# align
if trim == False:
    trim_reads=fastqFiles
@jobs_limit(star_batch)
@follows(star_index)
@mkdir(fastqFiles,formatter(),'{path[0]}/bam')
@check_if_uptodate(check_file_exists)
@transform(trim_reads,formatter('.*\.f.*?\.gz'),'bam/{basename[0]}.bam')
def run_star(input_file,output_file):
    n = num_thread2use(star_batch,len(fastqFiles),thread)
    STAR(input_file,output_file,db_path,n,annotation,['--outSAMtype BAM', 'SortedByCoordinate','--outSAMunmapped Within'])
#--------------------- 4. run stringtie -----------------------------------------------------
@follows(run_star)
@mkdir(fastqFiles,formatter(),'{path[0]}/stringtie')
@check_if_uptodate(check_file_exists)
@transform(run_star,formatter('.*\.bam'),'stringtie/{basename[0]}.gtf')
def run_stringtie(input_file,output_file):
    stringtie(input_file,output_file,thread,annotation)
#--------------------- 5. return finish message -----------------------------------------------------
@follows(run_stringtie)
def last_function():
    Message('stringtie finished',contact)

if __name__ == '__main__':
    try:
        pipeline_run([last_function],multiprocess=thread,gnu_make_maximal_rebuild_mode = True,
                     touch_files_only=False,verbose=5)
    except:
        Message('stringtie failed',contact)
return finish message ----------------------------------------------------- 98 | @follows(run_stringtie) 99 | def last_function(): 100 | Message('stringtie finished',contact) 101 | 102 | if __name__ == '__main__': 103 | try: 104 | pipeline_run([last_function],multiprocess=thread,gnu_make_maximal_rebuild_mode = True, 105 | touch_files_only=False,verbose=5) 106 | except: 107 | Message('stringtie failed',contact) 108 | -------------------------------------------------------------------------------- /StringTie_quant.py: -------------------------------------------------------------------------------- 1 | from ruffus import * 2 | from Modules.f01_file_process import * 3 | from Modules.Aligner import STAR,STAR_Db 4 | from Modules.Trimmomatic import Trimmomatic 5 | from Modules.Samtools import sortBam 6 | from Modules.HTseq import htseq_count 7 | import yaml 8 | import sys,shutil 9 | from Modules.StringTie import stringtie 10 | #============ parameters ====================== 11 | parameter_file = sys.argv[1] 12 | #parameter_file = '/data/shangzhong/Proteogenomics/RNAseq_count.yaml' 13 | with open(parameter_file,'r') as f: 14 | doc = yaml.load(f) 15 | p = dic2obj(**doc) 16 | #------------- get parameters ----------- 17 | file_path = p.RawDataPath 18 | thread = p.thread 19 | QC = p.QC 20 | # all parameter 21 | ref_fa = p.ref_fa 22 | annotation = p.gff 23 | # trimmomatic parameter 24 | trim = p.trim_reads 25 | trimmomatic = p.trimmomatic_path 26 | trim_batch = p.trim_jobs_per_batch 27 | adapter = p.adapter 28 | # star parameter 29 | star_batch = p.star_jobs_per_batch 30 | db_path = p.STAR_index_path 31 | 32 | contact = p.contact 33 | #=============================================================================== 34 | # Pipeline part 35 | #=============================================================================== 36 | #--------------------- 1. read all files ------------------------------------------------ 37 | Message('stringtie start',contact) 38 | os.chdir(file_path) 39 | fastqFiles = list_fq_files(file_path) 40 | if fastqFiles[0][0].startswith('trim_'): 41 | trim = False 42 | #--------------------- 2. trim reads----------------------------------------------------- 43 | def trim_parameters(): 44 | infiles,outfiles = replace_filename(fastqFiles,'^','trim_') 45 | for infile, output in zip(infiles,outfiles): 46 | yield infile,output 47 | #------------- run fastqc before trimming ----------- 48 | @active_if(QC) 49 | @jobs_limit(thread) 50 | @mkdir(fastqFiles,formatter(),'{path[0]}/fastqc') 51 | @files(trim_parameters) 52 | def run_QC1(input_file,output_file): 53 | for fq in input_file: 54 | sarge.run('fastqc {input} -o fastqc'.format(input=fq)) 55 | #------------ trim file ------------------ 56 | @active_if(trim) 57 | @follows(run_QC1) 58 | @jobs_limit(trim_batch) 59 | @files(trim_parameters) 60 | def trim_reads(input_file,output_file): 61 | n = num_thread2use(trim_batch,len(fastqFiles),thread) 62 | Trimmomatic(input_file,output_file,trimmomatic,n,adapter) 63 | remove(input_file) 64 | #------------ run fastqc after trimming ------------ 65 | @active_if(QC and trim) 66 | @follows(trim_reads) 67 | @jobs_limit(thread) 68 | @mkdir(fastqFiles,formatter(),'{path[0]}/fastqc') 69 | @files(trim_parameters) 70 | def run_QC2(input_file,output_file): 71 | for fq in output_file: 72 | sarge.run('fastqc {input} -o fastqc'.format(input=fq)) 73 | #--------------------- 3. 
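# [note] num_thread2use (Modules.f01_file_process, source not shown in this
# dump) is called before every trimmer/aligner job below. Judging from its
# arguments (jobs per batch, number of files, total threads), it plausibly
# splits the thread budget across the jobs that run concurrently; a sketch:
def _num_thread2use_sketch(batch, n_files, total_threads):
    """Threads per job when min(batch, n_files) jobs run at once (sketch)."""
    concurrent = max(1, min(int(batch), int(n_files)))
    return max(1, int(total_threads) // concurrent)

# _num_thread2use_sketch(2, 4, 16) -> 8, i.e. 8 threads for each of 2 jobs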
run STAR ------------------------------------------------------ 74 | # build index 75 | @active_if(not os.path.exists(db_path)) 76 | @follows(trim_reads,run_QC2) 77 | def star_index(): 78 | STAR_Db(db_path,ref_fa,thread) 79 | # align 80 | if trim == False: 81 | trim_reads=fastqFiles 82 | @jobs_limit(star_batch) 83 | @follows(star_index) 84 | @mkdir(fastqFiles,formatter(),'{path[0]}/bam') 85 | @check_if_uptodate(check_file_exists) 86 | @transform(trim_reads,formatter('.*\.f.*?\.gz'),'bam/{basename[0]}.bam') 87 | def run_star(input_file,output_file): 88 | n = num_thread2use(star_batch,len(fastqFiles),thread) 89 | STAR(input_file,output_file,db_path,n,annotation,['--outSAMtype BAM', 'SortedByCoordinate','--outSAMunmapped Within']) 90 | #--------------------- 5. run stringtie ----------------------------------------------------- 91 | @follows(run_star) 92 | @mkdir(fastqFiles,formatter(),'{path[0]}/stringtie') 93 | @check_if_uptodate(check_file_exists) 94 | @transform(run_star,formatter('.*\.bam'),'stringtie/{basename[0]}.gtf') 95 | def run_stringtie(input_file,output_file): 96 | stringtie(input_file,output_file,thread,annotation) 97 | #--------------------- 7. return finish message ----------------------------------------------------- 98 | @follows(run_stringtie) 99 | def last_function(): 100 | Message('stringtie finished',contact) 101 | 102 | if __name__ == '__main__': 103 | try: 104 | pipeline_run([last_function],multiprocess=thread,gnu_make_maximal_rebuild_mode = True, 105 | touch_files_only=False,verbose=5) 106 | except: 107 | Message('stringtie failed',contact) 108 | -------------------------------------------------------------------------------- /CNV_CNVnator.py: -------------------------------------------------------------------------------- 1 | import os,sys 2 | from Modules.f01_file_process import * 3 | from ruffus import * 4 | from Modules.Aligner import bwa_Db,bwa_mem 5 | from Modules.Trimmomatic import Trimmomatic 6 | import yaml 7 | from Modules.Samtools import sortBam,merge_bams 8 | import shutil 9 | from Modules.CNVnator import * 10 | from Bio import SeqIO 11 | #============ parameters ====================== 12 | parameter_file = sys.argv[1] 13 | #parameter_file = '/data/shangzhong/Pacbio/CHOS_illu_DNA/cnv/CNVnator.yaml' 14 | with open(parameter_file,'r') as f: 15 | doc = yaml.load(f) 16 | p = dic2obj(**doc) 17 | #------------- get parameters ----------- 18 | file_path = p.RawDataPath 19 | thread = p.thread 20 | # all parameter 21 | ref_fa = p.ref_fa 22 | # trimmomatic parameter 23 | trim = p.trim_reads 24 | trimmomatic = p.trimmomatic_path 25 | trim_batch = p.trim_jobs_per_batch 26 | adapter = p.adapter 27 | 28 | bwa_batch = p.bwa_jobs_per_batch 29 | db_path = p.bwa_Db 30 | 31 | bin_win = p.bin_win 32 | others = p.chrom 33 | contact = p.contact 34 | #=============================================================================== 35 | # Pipeline part 36 | #=============================================================================== 37 | #--------------------- 1. read all files ------------------------------------------------ 38 | Message('cnvnator start',contact) 39 | os.chdir(file_path) 40 | fastqFiles = list_fq_files(file_path) 41 | def trim_parameters(): 42 | infiles,outfiles = replace_filename(fastqFiles,'^','trim_') 43 | for infile, output in zip(infiles,outfiles): 44 | yield infile,output 45 | #--------------------- 2. 
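# [note] fastqFiles above comes from list_fq_files (Modules.f01_file_process,
# source not shown). From how it is consumed -- fq[0], optional fq[1] -- it
# returns one list per sample: [fq] for single-end or [fq1, fq2] for paired
# gzipped fastq. A rough, hypothetical equivalent:
import os, re

def _list_fq_files_sketch(path):
    """Group *.fastq.gz / *.fq.gz in path into per-sample lists (sketch)."""
    fqs = sorted(f for f in os.listdir(path) if f.endswith(('.fastq.gz', '.fq.gz')))
    samples = {}
    for f in fqs:
        key = re.sub(r'_R?[12](?=[._])', '', f, count=1)  # drop the mate tag, if any
        samples.setdefault(key, []).append(f)
    return list(samples.values())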
trim reads----------------------------------------------------- 46 | @active_if(trim) 47 | @jobs_limit(trim_batch) 48 | @files(trim_parameters) 49 | def trim_reads(input_file,output_file): 50 | n = num_thread2use(trim_batch,len(fastqFiles),thread) 51 | Trimmomatic(input_file,output_file,trimmomatic,n,adapter) 52 | remove(input_file) 53 | #--------------------- 3. Map with bwa ----------------------------------------------------- 54 | def get_fq(): 55 | fqFiles = list_fq_files(file_path) 56 | for fq in fqFiles: 57 | out = 'bam/' + re.sub('\.f.*q\.gz','.bam',fq[0]) 58 | yield fq,out 59 | # build index 60 | @active_if(not os.path.exists(db_path)) 61 | @follows(trim_reads) 62 | def bwa_index(): 63 | bwa_Db(db_path,ref_fa) 64 | # align 65 | @jobs_limit(bwa_batch) 66 | @follows(trim_reads,bwa_index) 67 | @mkdir(fastqFiles,formatter(),'{path[0]}/bam') 68 | @files(get_fq) 69 | def run_bwa(input_file,output_file): 70 | n = num_thread2use(bwa_batch,len(fastqFiles),thread) 71 | db_index = db_path + '/' + os.listdir(db_path)[0].split('.')[0] 72 | bwa_mem(input_file,output_file,db_index,n) 73 | #--------------------- 5. Sort bam file -------------------------------------------------- 74 | @jobs_limit(trim_batch) 75 | @follows(run_bwa) 76 | @mkdir(fastqFiles,formatter(),'{path[0]}/sortBam') 77 | @transform(run_bwa,formatter('.*\.bam'),'sortBam/{basename[0]}.sort.bam') 78 | @check_if_uptodate(check_file_exists) 79 | def sort_by_pos(input_file,output_file): 80 | n = num_thread2use(trim_batch,len(fastqFiles),thread) 81 | sortBam(input_file,output_file,n,sortType='pos') 82 | @follows(sort_by_pos) 83 | def remove_bam(): 84 | if os.path.exists('bam'): shutil.rmtree('bam') # remove bam folder 85 | 86 | @follows(sort_by_pos,remove_bam) 87 | @mkdir(fastqFiles,formatter(),'{path[0]}/mergeBam') 88 | @merge(sort_by_pos,'mergeBam/merge.bam') 89 | @check_if_uptodate(check_file_exists) 90 | def run_merge_bam(input_file,output_file): 91 | if len(input_file) > 1: 92 | merge_bams(input_file,output_file) 93 | else: 94 | os.rename(input_file[0],output_file) 95 | #------------------- 6. 
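# [note] The five cnv_* helpers called below live in Modules/CNVnator.py
# (source not shown). Judging by their names and arguments they wrap the
# standard cnvnator command sequence; a sketch, with flag spelling taken from
# the cnvnator documentation rather than from this repo:
import sarge

def _cnvnator_steps_sketch(root, bam, chr_dir, bin_win, out_txt):
    """extract -> histogram -> statistics -> partition -> call (hypothetical)."""
    sarge.run('cnvnator -root {r} -tree {b}'.format(r=root, b=bam))
    sarge.run('cnvnator -root {r} -his {w} -d {d}'.format(r=root, w=bin_win, d=chr_dir))
    sarge.run('cnvnator -root {r} -stat {w}'.format(r=root, w=bin_win))
    sarge.run('cnvnator -root {r} -partition {w}'.format(r=root, w=bin_win))
    sarge.run('cnvnator -root {r} -call {w} > {o}'.format(r=root, w=bin_win, o=out_txt))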
run CNVnator ----------------------------------------------------- 96 | @jobs_limit(thread) 97 | @follows(run_merge_bam) 98 | @mkdir(fastqFiles,formatter(),'{path[0]}/cnv') 99 | @transform(run_merge_bam,formatter('.*\.bam'),'cnv/{basename[0]}.txt') 100 | @check_if_uptodate(check_file_exists) 101 | def run_cnvnator(input_file,output_file): 102 | root = output_file[:-3] + 'root' 103 | # 1 104 | cnv_extract_bam(input_file,root,others) 105 | # 2 106 | chr_path = file_path + '/cnv/scaffold' 107 | path = chr_path 108 | if not os.path.exists(path): 109 | os.mkdir(path) 110 | for record in SeqIO.parse(ref_fa,'fasta'): 111 | SeqIO.write(record,path+'/'+record.id+'.fa','fasta') 112 | 113 | cnv_generate_hist(root,chr_path,bin_win,others) 114 | # 3 115 | cnv_statistics(root,bin_win,others) 116 | # 4 117 | cnv_partitioning(root,bin_win,others) 118 | # 5 119 | cnv_call(root,output_file,bin_win,others) 120 | 121 | @follows(run_cnvnator) 122 | def last_function(): 123 | # Message('cnvnator succeed',contact) 124 | pass 125 | if __name__ == '__main__': 126 | try: 127 | pipeline_run([run_cnvnator,last_function],multiprocess=thread,gnu_make_maximal_rebuild_mode = True, 128 | touch_files_only=False) 129 | except: 130 | Message('cnvnator failed',contact) -------------------------------------------------------------------------------- /SV_Illumina_Lumpy.py: -------------------------------------------------------------------------------- 1 | import os,sys 2 | from Modules.f01_file_process import * 3 | from ruffus import * 4 | from Modules.Aligner import bwa_Db,bwa_samblaster 5 | from Modules.Trimmomatic import Trimmomatic 6 | import yaml 7 | from Modules.Samtools import sortBam,merge_bams 8 | import shutil 9 | from Modules.Lumpy import lumpyexpress 10 | from Modules.SVTyper import svtyper 11 | #============ parameters ====================== 12 | parameter_file = sys.argv[1] 13 | #parameter_file = '/data/shangzhong/SV_lumpy/SV_lumpy.yaml' 14 | with open(parameter_file,'r') as f: 15 | doc = yaml.load(f) 16 | p = dic2obj(**doc) 17 | #------------- get parameters ----------- 18 | file_path = p.RawDataPath 19 | thread = p.thread 20 | # all parameter 21 | ref_fa = p.ref_fa 22 | # trimmomatic parameter 23 | trim = p.trim_reads 24 | trimmomatic = p.trimmomatic_path 25 | trim_batch = p.trim_jobs_per_batch 26 | adapter = p.adapter 27 | 28 | bwa_batch = p.bwa_jobs_per_batch 29 | bwa_index = p.bwa_index 30 | read_groups = p.read_groups 31 | contact = p.contact 32 | #=============================================================================== 33 | # Pipeline part 34 | #=============================================================================== 35 | #--------------------- 1. read all files ------------------------------------------------ 36 | Message('Lumpy start',contact) 37 | os.chdir(file_path) 38 | fastqFiles = list_fq_files(file_path) 39 | def trim_parameters(): 40 | infiles,outfiles = replace_filename(fastqFiles,'^','trim_') 41 | for infile, output in zip(infiles,outfiles): 42 | yield infile,output 43 | #--------------------- 2. trim reads----------------------------------------------------- 44 | @active_if(trim) 45 | @jobs_limit(trim_batch) 46 | @files(trim_parameters) 47 | def trim_reads(input_file,output_file): 48 | n = num_thread2use(trim_batch,len(fastqFiles),thread) 49 | Trimmomatic(input_file,output_file,trimmomatic,n,adapter) 50 | remove(input_file) 51 | #--------------------- 3. 
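# [note] read_groups from the YAML supplies one '@RG' string per fastq pair;
# run_bwa() below derives the library tag from its third field and hands the
# quoted result to 'bwa mem -R'. A sketch of that string handling (the sample
# name is hypothetical):
def _format_read_group_sketch(rg):
    """rg like '@RG\\tID:s1\\tSM:sample1' -> quoted -R argument for bwa mem."""
    lib = rg.split('\\t')[2][3:]   # reuse the SM field value as the library
    return "'" + rg + '\\tLB:' + lib + "\\tPL:illumina\\tPU:unit1'"

# _format_read_group_sketch('@RG\\tID:s1\\tSM:sample1') appends
# \tLB:sample1\tPL:illumina\tPU:unit1 and single-quotes it for the shell.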
Map with bwa ----------------------------------------------------- 52 | def get_fq_and_readgroup(): 53 | fqFiles = list_fq_files(file_path) 54 | for fq, rg in zip(fqFiles,read_groups): 55 | out = 'bam/' + re.sub('\.f.*q\.gz','.bam',fq[0]) 56 | yield fq,out,rg 57 | # build index 58 | @active_if(not os.path.exists('/'.join(bwa_index.split('/')[:-1]))) 59 | @follows(trim_reads) 60 | def run_bwa_index(): 61 | bwa_Db(bwa_index,ref_fa) 62 | # align 63 | @jobs_limit(bwa_batch) 64 | @follows(trim_reads,run_bwa_index) 65 | @mkdir(fastqFiles,formatter(),'{path[0]}/bam') 66 | @files(get_fq_and_readgroup) 67 | def run_bwa(input_file,output_file,rg): 68 | n = num_thread2use(bwa_batch,len(fastqFiles),thread) 69 | lib = rg.split('\\t')[2][3:] 70 | readgroup = '\'' + rg+'\\tLB:'+lib+'\\tPL:illumina\\tPU:unit1\'' 71 | bwa_samblaster(input_file,output_file,bwa_index,n,otherParameters=['-R '+ readgroup]) 72 | #--------------------- 5. Sort bam file -------------------------------------------------- 73 | @jobs_limit(trim_batch) 74 | @follows(run_bwa) 75 | @mkdir(fastqFiles,formatter(),'{path[0]}/sortBam') 76 | @transform(run_bwa,formatter('.*\.bam'),'sortBam/{basename[0]}.sort.bam') 77 | @check_if_uptodate(check_file_exists) 78 | def sort_by_pos(input_file,output_file): 79 | n = num_thread2use(trim_batch,len(fastqFiles),thread) 80 | sortBam(input_file,output_file,n,sortType='pos') 81 | disc = input_file[:-3] + 'disc.sam' 82 | disc_sort = output_file[:-3] + 'disc.bam' 83 | sortBam(disc,disc_sort,n,sortType='pos') 84 | split = input_file[:-3] + 'split.sam' 85 | split_sort = output_file[:-3] + 'split.bam' 86 | sortBam(split,split_sort,n,sortType='pos') 87 | @follows(sort_by_pos) 88 | def remove_bam(): 89 | if os.path.exists('bam'): shutil.rmtree('bam') # remove bam folder 90 | #--------------------- 6. run lumpyexpress ------------------------------ 91 | @follows(sort_by_pos) 92 | @mkdir(fastqFiles,formatter(),'{path[0]}/vcf') 93 | @merge(sort_by_pos,'vcf/lumpy.vcf') 94 | @check_if_uptodate(check_file_exists) 95 | def run_lumpyexpress(input_files,output_file): 96 | lumpyexpress(input_files,output_file,['-T vcf']) 97 | #--------------------- 7. 
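# [note] svtyper() below is imported from Modules/SVTyper.py (source not
# shown). It presumably genotypes the lumpy calls against the merged BAM
# roughly as sketched here; -i/-B/-l are standard svtyper options, and -l
# writes the library JSON that run_svtyper() later moves into vcf/:
import sarge

def _svtyper_sketch(in_vcf, out_vcf, bam):
    """Genotype SV calls in in_vcf using read evidence from bam (sketch)."""
    lib_json = bam[:-3] + 'json'   # e.g. merge/merge.bam -> merge/merge.json
    sarge.run('svtyper -i {v} -B {b} -l {j} > {o}'.format(
        v=in_vcf, b=bam, j=lib_json, o=out_vcf))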
run SVTyper ------------------------------ 98 | @follows(run_lumpyexpress) 99 | @mkdir(fastqFiles,formatter(),'{path[0]}/merge') 100 | @merge(sort_by_pos,'merge/merge.bam') 101 | @check_if_uptodate(check_file_exists) 102 | def run_svtyper(input_files,output_file): 103 | if len(input_files) > 1: 104 | merge_bams(input_files,output_file) 105 | else: 106 | os.rename(input_files[0],output_file) 107 | sarge.run('samtools index {b}'.format(b=output_file)) 108 | svtyper('vcf/lumpy.vcf','vcf/lumpy_gt.vcf',output_file) 109 | shutil.move('merge/merge.json','vcf') 110 | shutil.move('merge/merge.json.pdf','vcf') 111 | 112 | @follows(run_svtyper) 113 | def last_function(): 114 | Message('lumpy succeeded',contact) 115 | 116 | if __name__ == '__main__': 117 | try: 118 | pipeline_run([last_function],multiprocess=thread,gnu_make_maximal_rebuild_mode = False, 119 | touch_files_only=False,verbose=20) 120 | except: 121 | Message('lumpyexpress failed',contact) 122 | 123 | 124 | 125 | 126 | -------------------------------------------------------------------------------- /Modules/f02_parse_gff.py: -------------------------------------------------------------------------------- 1 | import re 2 | import pandas as pd 3 | 4 | 5 | class ncbi_gff(object): 6 | def __init__(self,df): 7 | self.df = df 8 | self.df.columns=['chr','source','feature','start','end','score','strand','frame','anno'] 9 | self.df['start'] = self.df['start'] - 1 10 | self.df = self.df[self.df['feature'].values!='region'] 11 | self.df = self.df.reset_index(drop=True) 12 | self.df['geneid'] = self.df['anno'].apply(lambda x: ncbi_gff.get_id(x,'GeneID:')) 13 | self.df['trid'] = self.df['anno'].apply(lambda x: ncbi_gff.get_id(x,'transcript_id=')) 14 | self.df['prid'] = self.df['anno'].apply(lambda x: ncbi_gff.get_id(x,'protein_id=')) 15 | @staticmethod 16 | def get_id(anno,feature): 17 | '''get id based on the feature provided''' 18 | try: 19 | gene_id = re.search('(?<={id}).+?(?=[;,]|$)'.format(id=feature),anno).group(0) 20 | except: 21 | gene_id = None 22 | return gene_id 23 | 24 | @staticmethod 25 | def get_tr_longest_intron(tr_df): 26 | '''get the longest intron in the transcript''' 27 | start = tr_df['start'].tolist() 28 | end = tr_df['end'].tolist() 29 | strand = tr_df['strand'].tolist() 30 | if len(start) == 1: 31 | return 0 32 | if strand[0] == '+': 33 | intron = max([abs(int(s)-int(e)) for s,e in zip(start[1:],end[:-1])]) 34 | else: 35 | intron = max([abs(int(s)-int(e)) for s,e in zip(start[:-1],end[1:])]) 36 | return intron 37 | 38 | def get_longest_intron(self): 39 | '''this is the longest intron across the whole genome''' 40 | df = self.df 41 | df = df[(~df['prid'].isnull()) & (df['feature'].values=='CDS')] 42 | df = df.reset_index(drop=True) 43 | df = df.groupby(['chr','prid']).apply(ncbi_gff.get_tr_longest_intron) 44 | return df#.max() 45 | 46 | def get_all_id(self): 47 | '''this function gets all ids in the gff file 48 | ''' 49 | df = self.df 50 | id_df = df[df['feature'].isin(['exon','CDS'])] 51 | id_df = id_df.reset_index(drop=True) 52 | 53 | id_df['sym'] = id_df['anno'].map(lambda x: ncbi_gff.get_id(x,'gene=')) 54 | id_df['rna'] = id_df['anno'].map(lambda x: ncbi_gff.get_id(x,'Parent=')) 55 | 56 | exn_df = id_df[id_df['feature'].values=='exon'][['geneid','sym','chr','rna','trid']].drop_duplicates() 57 | exn_df = exn_df.reset_index(drop=True) 58 | 59 | cds_df = id_df[id_df['feature'].values=='CDS'][['geneid','sym','chr','rna','prid']].drop_duplicates() 60 | cds_df = cds_df.reset_index(drop=True) 61 | 62 | merge_df = 
pd.merge(exn_df,cds_df,how='outer',on=['geneid','sym','chr','rna']) 63 | merge_df.columns = ['GeneID','GeneSymbol','Chrom','TrID','TrAccess','PrAccess'] 64 | merge_df.fillna('-',inplace=True) 65 | 66 | # merge_df = merge_df[(merge_df['TrAccess'].values != '-') | (merge_df['PrAccess'].values != '-')] 67 | merge_df = merge_df.sort_values(['GeneID']) 68 | return merge_df[['GeneID','GeneSymbol','Chrom','TrAccess','PrAccess','TrID']] 69 | 70 | def get_gene_seq(self,ref_dic,gid,id_type='tr'): 71 | '''this function gets the sequence of a transcript or protein 72 | ''' 73 | df = self.df 74 | if id_type == 'tr': 75 | feature = 'exon' 76 | id_t = 'trid' 77 | elif id_type == 'pr': 78 | feature = 'CDS' 79 | id_t = 'prid' 80 | region_df = df[(df['feature'].values==feature) & (df[id_t].values==gid)] 81 | # get sequence 82 | scaff = region_df['chr'].tolist()[0] 83 | scaff_seq = ref_dic[scaff].seq 84 | strand = region_df['strand'].tolist()[0] 85 | 86 | g_seq = '' 87 | for s,e in zip(region_df['start'],region_df['end']): 88 | g_seq += scaff_seq[int(s):int(e)] 89 | # consider strand 90 | if strand == '-': 91 | g_seq = g_seq.reverse_complement() 92 | 93 | if id_type == 'pr': 94 | g_seq = g_seq.translate() 95 | return g_seq 96 | 97 | # gff_fn = '/data/genome/hamster/ncbi_refseq/hamster.gff' 98 | # df = pd.read_csv(gff_fn,sep='\t',header=None,comment='#') 99 | # obj = ncbi_gff(df) 100 | # all_id_df = obj.get_all_id() 101 | # all_id_df.to_csv('/data/genome/hamster/ncbi_refseq/all_id.txt',sep='\t',index=False) 102 | # 103 | # ref_dic = SeqIO.index('/data/genome/hamster/picr/picr.fa','fasta') 104 | # res = obj.get_gene_seq(ref_dic,'NM_001246795',id_type='tr') 105 | # print res 106 | 107 | 108 | from Bio import SeqIO 109 | 110 | def getr_tRNA(fa,gff,output): 111 | '''get rRNA and tRNA sequence 112 | * output: fa file stores rtRNA sequence''' 113 | index = SeqIO.index(fa,'fasta') 114 | with open(gff) as f, open(output,'w') as out: 115 | for line in f: 116 | if line.startswith('#'):continue 117 | item = line.strip().split('\t') 118 | if item[2] in ['rRNA','tRNA']: 119 | name = re.search('(?<=product=).+?(?=$|;)',line).group(0) 120 | chrom = item[0] 121 | s = int(item[3]) 122 | e = int(item[4]) 123 | seq = str(index[chrom].seq[s-1:e]) 124 | out.write('>'+name + '\n' + seq + '\n') -------------------------------------------------------------------------------- /RNAseq_count.py: -------------------------------------------------------------------------------- 1 | from ruffus import * 2 | from Modules.f01_file_process import * 3 | from Modules.Aligner import STAR,STAR_Db 4 | from Modules.Trimmomatic import Trimmomatic 5 | from Modules.Samtools import sortBam 6 | from Modules.HTseq import htseq_count 7 | import yaml 8 | import sys,shutil 9 | #============ parameters ====================== 10 | parameter_file = sys.argv[1] 11 | #parameter_file = '/data/shangzhong/Proteogenomics/RNAseq_count.yaml' 12 | with open(parameter_file,'r') as f: 13 | doc = yaml.load(f) 14 | p = dic2obj(**doc) 15 | #------------- get parameters ----------- 16 | file_path = p.RawDataPath 17 | thread = p.thread 18 | QC = p.QC 19 | # all parameter 20 | ref_fa = p.ref_fa 21 | annotation = p.gff 22 | # trimmomatic parameter 23 | trim = p.trim_reads 24 | trimmomatic = p.trimmomatic_path 25 | trim_batch = p.trim_jobs_per_batch 26 | adapter = p.adapter 27 | # star parameter 28 | star_batch = p.star_jobs_per_batch 29 | db_path = p.STAR_index_path 30 | # htseq parameter 31 | htseq_anno_source = p.htseq_anno_source 32 | strand = 
p.strand_specific 33 | id_name = p.id_name 34 | 35 | contact = p.contact 36 | #=============================================================================== 37 | # Pipeline part 38 | #=============================================================================== 39 | #--------------------- 1. read all files ------------------------------------------------ 40 | Message('RNA_count start',contact) 41 | os.chdir(file_path) 42 | fastqFiles = list_fq_files(file_path) 43 | if fastqFiles[0][0].startswith('trim_'): 44 | trim = False 45 | #--------------------- 2. trim reads----------------------------------------------------- 46 | def trim_parameters(): 47 | infiles,outfiles = replace_filename(fastqFiles,'^','trim_') 48 | for infile, output in zip(infiles,outfiles): 49 | yield infile,output 50 | #------------- run fastqc before trimming ----------- 51 | @active_if(QC) 52 | @jobs_limit(thread) 53 | @mkdir(fastqFiles,formatter(),'{path[0]}/fastqc') 54 | @files(trim_parameters) 55 | def run_QC1(input_file,output_file): 56 | for fq in input_file: 57 | sarge.run('fastqc {input} -o fastqc'.format(input=fq)) 58 | #------------ trim file ------------------ 59 | @active_if(trim) 60 | @follows(run_QC1) 61 | @jobs_limit(trim_batch) 62 | @files(trim_parameters) 63 | def trim_reads(input_file,output_file): 64 | n = num_thread2use(trim_batch,len(fastqFiles),thread) 65 | Trimmomatic(input_file,output_file,trimmomatic,n,adapter) 66 | remove(input_file) 67 | #------------ run fastqc after trimming ------------ 68 | @active_if(QC and trim) 69 | @follows(trim_reads) 70 | @jobs_limit(thread) 71 | @mkdir(fastqFiles,formatter(),'{path[0]}/fastqc') 72 | @files(trim_parameters) 73 | def run_QC2(input_file,output_file): 74 | for fq in output_file: 75 | sarge.run('fastqc {input} -o fastqc'.format(input=fq)) 76 | #--------------------- 3. run STAR ------------------------------------------------------ 77 | # build index 78 | @active_if(not os.path.exists(db_path)) 79 | @follows(trim_reads,run_QC2) 80 | def star_index(): 81 | STAR_Db(db_path,ref_fa,thread) 82 | # align 83 | if trim == False: 84 | trim_reads=fastqFiles 85 | @jobs_limit(star_batch) 86 | @follows(star_index) 87 | @mkdir(fastqFiles,formatter(),'{path[0]}/bam') 88 | @check_if_uptodate(check_file_exists) 89 | @transform(trim_reads,formatter('.*\.f.*?\.gz'),'bam/{basename[0]}.bam') 90 | def run_star(input_file,output_file): 91 | n = num_thread2use(star_batch,len(fastqFiles),thread) 92 | STAR(input_file,output_file,db_path,n,annotation,['--outSAMtype BAM','Unsorted','--outSAMunmapped Within']) 93 | #--------------------- 4. samtools sort by name ----------------------------------------- 94 | @jobs_limit(trim_batch) 95 | @follows(run_star) 96 | @mkdir(fastqFiles,formatter(),'{path[0]}/sortBam') 97 | @check_if_uptodate(check_file_exists) 98 | @transform(run_star,formatter('.*\.bam'),'sortBam/{basename[0]}.sort.bam') 99 | def sort_by_name(input_file,output_file): 100 | n = num_thread2use(trim_batch,len(fastqFiles),thread) 101 | sortBam(input_file,output_file,n,sortType='name') 102 | stat = sarge.get_stdout('samtools flagstat {bam}'.format(bam=output_file)) 103 | with open(output_file[:-3]+'flagstat.txt','w') as f: 104 | f.write(stat) 105 | @follows(sort_by_name) 106 | def remove_bam(): 107 | if os.path.exists('bam'): shutil.rmtree('bam') # remove bam folder 108 | #--------------------- 5. 
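# [note] htseq_count() used below comes from Modules/HTseq.py (source not
# shown here). Given the name-sorted BAM produced in step 4, it presumably
# shells out roughly like this; the defaults in this sketch are assumptions,
# with strand and annotation source passed through from the YAML:
import sarge

def _htseq_count_sketch(bam, out, gff, strand='no', feature='exon', attr='gene_id'):
    """Count reads per gene from a name-sorted BAM (hypothetical wrapper)."""
    sarge.run('htseq-count -f bam -r name -s {s} -t {t} -i {i} {b} {g} > {o}'.format(
        s=strand, t=feature, i=attr, b=bam, g=gff, o=out))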
run htseq ----------------------------------------------------- 109 | @follows(remove_bam) 110 | @mkdir(fastqFiles,formatter(),'{path[0]}/htseq') 111 | @check_if_uptodate(check_file_exists) 112 | @transform(sort_by_name,formatter('.*\.sort\.bam'),'htseq/{basename[0]}.txt') 113 | def run_htseq(input_file,output_file): 114 | htseq_count(input_file,output_file,annotation,strand,htseq_anno_source) 115 | #--------------------- 6. ID conversion ----------------------------------------------------- 116 | @active_if(htseq_anno_source!='') 117 | @follows(run_htseq) 118 | @transform(run_htseq,suffix('.txt'),'.count.txt') 119 | def id_convert(input_file,output_file): 120 | print(input_file+ '--->' + output_file) 121 | # 1. get dictionary 122 | if id_name == 'id': 123 | sym2ID = 'yes' 124 | else: 125 | sym2ID = 'no' 126 | dic = get_gene_name_id_dic(annotation,htseq_anno_source,sym2ID) 127 | gene_id_name_convert(input_file,output_file,dic) 128 | #--------------------- 7. return finish message ----------------------------------------------------- 129 | @follows(id_convert,run_htseq) 130 | def last_function(): 131 | Message('RNA_count finished',contact) 132 | 133 | if __name__ == '__main__': 134 | try: 135 | pipeline_run([last_function],multiprocess=thread,gnu_make_maximal_rebuild_mode = False, 136 | touch_files_only=False,verbose=5) 137 | except: 138 | Message('RNA_count failed',contact) 139 | -------------------------------------------------------------------------------- /Modules/Homer.py: -------------------------------------------------------------------------------- 1 | import sarge,sys,os 2 | import pandas as pd 3 | import matplotlib as mpl 4 | mpl.use('Agg') 5 | import matplotlib.pyplot as plt 6 | mpl.style.use('ggplot') 7 | 8 | 9 | def make_tag_directory(in_bam,tag_dir,ref_fa): 10 | '''make tag directory, which extracts mapping positions into a tsv file 11 | ''' 12 | cmd = ('makeTagDirectory {o_dir} -genome {g} -checkGC \ 13 | -single {bam}').format(o_dir=tag_dir,g=ref_fa,bam=in_bam) 14 | print(cmd);sys.stdout.flush() 15 | sarge.run(cmd) 16 | 17 | 18 | def rm_5GRO_ctrl(GRO5_tag,ctrl_tag): 19 | '''this function removes ctrl_tag coverage from the GRO-Cap tags 20 | ''' 21 | before_sub = GRO5_tag+'/genome.tages_before_sub.tsv' 22 | after_sub = GRO5_tag+'/genome.tags.tsv' 23 | os.rename(after_sub,before_sub) 24 | df_raw = pd.read_csv(before_sub,sep='\t',header=None) 25 | df_ctr = pd.read_csv(ctrl_tag+'/genome.tags.tsv',sep='\t',header=None) 26 | 27 | 28 | def hist(tag_dir,hist_out,ref_fa,anno,mode='tss',peak='',region=4000,res=10,pc=3): 29 | '''this function gets tag coverage around tss 30 | * tag_dir: tag directory 31 | * anno: gff file or gtf file 32 | * pc: number of tags to consider at each position 33 | * region: length to consider on the x axis; the default 4000 means -2000 to 2000 around the TSS 34 | * res: resolution of the histogram. 
35 | ''' 36 | if anno.endswith('gtf'): 37 | anno = '-gtf ' + anno 38 | else: 39 | anno = '-gff ' + anno 40 | if mode == 'tss': 41 | cmd = ('annotatePeaks.pl tss {ref_fa} {anno} -size {size} -hist {bin} -d {dir} -pc {pc} > {out}').format( 42 | ref_fa=ref_fa,anno=anno,size=str(region),bin=str(res),dir=tag_dir,pc=str(pc),out=hist_out) 43 | elif mode == 'peak': 44 | if peak == '': 45 | raise ValueError('input is empty') 46 | cmd = ('annotatePeaks.pl {peak} {ref_fa} {anno} -size {size} -hist {bin} -d {dir} -pc {pc} > {out}').format( 47 | peak=peak,ref_fa=ref_fa,anno=anno,size=str(region),bin=str(res),dir=tag_dir,pc=str(pc),out=hist_out) 48 | sarge.run(cmd) 49 | 50 | 51 | def hist_plot(hist_out): 52 | #Visualize histogram. 53 | plt.figure() 54 | df = pd.read_csv(hist_out,sep='\t',header=0,names=['Distance from TSS','Coverage','+ Tags','- Tags']) 55 | plt.plot(df['Distance from TSS'],df['+ Tags'],label='+ Tags') 56 | plt.plot(df['Distance from TSS'],df['- Tags'],label='- Tags') 57 | plt.xlim([-500,500]) 58 | plt.xlabel('Distance from TSS') 59 | plt.ylabel('Reads per bp per TSS') 60 | plt.axvline(x=0,c='k') 61 | plt.legend(loc='upper right') 62 | 63 | plt.savefig(os.path.splitext(hist_out)[0]+'.png') 64 | 65 | # if __name__ == '__main__': 66 | # import glob 67 | # hists = glob.glob('/data/shangzhong/TSS/fq/f03_tags/*/hist.txt') 68 | # for h in hists: 69 | # plt.figure() 70 | # hist_plot(h) 71 | 72 | 73 | def find_peaks(tag_dir,out_file,peak_style,control_dir,otherParams=['']): 74 | '''find peaks 75 | ''' 76 | cmd = 'findPeaks {tag} -style {style} -o {out} -i {control} {other}'.format( 77 | tag=tag_dir,style=peak_style,out=out_file,control=control_dir, 78 | other=' '.join(otherParams)) 79 | print(cmd);sys.stdout.flush() 80 | sarge.run(cmd) 81 | 82 | 83 | def merge_peaks(input_files,output_file,dist): 84 | ''' 85 | * input_files: a list of peak files, name format is 5gro_and_gro 86 | * output_file: final merged peak file 87 | ''' 88 | cmd = ('mergePeaks -d {dist} {in_files} > {out}').format(dist=str(dist), 89 | in_files=' '.join(input_files),out=output_file) 90 | print(cmd);sys.stdout.flush() 91 | sarge.run(cmd) 92 | 93 | 94 | def annotate_peaks(peak_file,output_file,ref_fa,annotation): 95 | ''' 96 | this function annotates peaks, basically getting the closest TSS to each peak. 97 | ''' 98 | if annotation.endswith('gtf'): 99 | anno = '-gtf ' + annotation 100 | else: 101 | anno = '-gff ' + annotation 102 | cmd = 'annotatePeaks.pl {peaks} {genome} {annotation} > {out}'.format(peaks=peak_file,genome=ref_fa, 103 | annotation=anno, out=output_file) 104 | print(cmd);sys.stdout.flush() 105 | sarge.run(cmd) 106 | 107 | 108 | def filter_anno_peak(in_peak_file,filter_peak_file): 109 | '''this function extracts the reliable TSS from the peak file 110 | The rule is: for each 5GRO, get overlap of peaks against different GROseq. 111 | Then get the union set from previous peaks. 
112 | ''' 113 | df = pd.read_csv(in_peak_file,sep='\t',header=0) 114 | gro5 = [] 115 | gro = [] 116 | for f in df['Focus Ratio/Region Size']: 117 | files = f.split('|') 118 | for sub_f in files: # sub_f is peak file result 119 | peaks = sub_f.split('_and_') # peaks has 5gro file and gro file name 120 | for p in peaks: 121 | if p in gro5 or p in gro: 122 | continue 123 | else: 124 | if '5GRO' in p: 125 | gro5.append(p) 126 | else: 127 | gro.append(p) 128 | print(gro5,gro) 129 | def extract_peak(gro5,gro,fns): 130 | '''fns is the split filename in the 6th column of the annotated peak file''' 131 | keep = [] 132 | res = True 133 | for g5 in gro5: 134 | keep.append([g5+'_and_'+g in fns for g in gro]) 135 | for k in keep: 136 | if False in k: 137 | res=False 138 | return res 139 | 140 | # filter out the annopeak 141 | cri = df['Focus Ratio/Region Size'].map(lambda x: extract_peak(gro5,gro,x)) 142 | df = df[cri] 143 | df.to_csv(filter_peak_file,sep='\t',index=False) 144 | 145 | # if __name__ == '__main__': 146 | # in_peak_file = '/data/shangzhong/TSS/fq/f05_annoPeaks/merge.anno' 147 | # filter_peak_file = '/data/shangzhong/TSS/fq/f05_annoPeaks/merge_filter.anno' 148 | # filter_anno_peak(in_peak_file,filter_peak_file) 149 | 150 | 151 | -------------------------------------------------------------------------------- /RibosomeProfiling.py: -------------------------------------------------------------------------------- 1 | from ruffus import * 2 | from Modules.f01_file_process import * 3 | from Modules.Aligner import hisat2_Db,hisat2 4 | from Modules.Trimmomatic import conda_Trimmomatic 5 | from Modules.Samtools import sortBam 6 | import yaml 7 | import sys,shutil 8 | #============ parameters ====================== 9 | parameter_file = sys.argv[1] 10 | #parameter_file = '/data/shangzhong/Proteogenomics/RNAseq_count.yaml' 11 | with open(parameter_file,'r') as f: 12 | doc = yaml.load(f) 13 | p = dic2obj(**doc) 14 | #------------- get parameters ----------- 15 | file_path = p.RawDataPath 16 | thread = p.thread 17 | QC = p.QC 18 | # all parameter 19 | rRNA_fa = p.rRNA_fa 20 | ref_fa = p.ref_fa 21 | annotation = p.gff 22 | # trimmomatic parameter 23 | trim = p.trim_reads 24 | trim_batch = p.trim_jobs_per_batch 25 | min_len = p.min_len 26 | adapter = p.adapter 27 | # hisat2 parameter 28 | hisat2_batch = p.hisat2_jobs_per_batch 29 | hisat2_rRNA_db = p.hisat2_rrna_index 30 | hisat2_target_db = p.hisat2_target_index 31 | 32 | other = p.other 33 | contact = p.contact 34 | #=============================================================================== 35 | # Pipeline part 36 | #=============================================================================== 37 | #--------------------- 1. read all files ------------------------------------------------ 38 | Message('Riboseq start',contact) 39 | os.chdir(file_path) 40 | fastqFiles = list_fq_files(file_path) 41 | if fastqFiles[0][0].startswith('trim_'): 42 | trim = False 43 | #--------------------- 2. 
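# [note] trim_parameters() below relies on replace_filename
# (Modules.f01_file_process, source not shown) to pair each input fastq with
# a 'trim_'-prefixed output name. A sketch consistent with the '^' regex
# pattern it is always called with:
import re

def _replace_filename_sketch(fq_groups, pattern, repl):
    """Return (infiles, outfiles) with repl substituted at pattern (sketch)."""
    outs = [[re.sub(pattern, repl, fq) for fq in grp] for grp in fq_groups]
    return fq_groups, outs

# _replace_filename_sketch([['a_1.fq.gz', 'a_2.fq.gz']], '^', 'trim_')
#   -> ([['a_1.fq.gz', 'a_2.fq.gz']], [['trim_a_1.fq.gz', 'trim_a_2.fq.gz']])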
trim reads----------------------------------------------------- 44 | def trim_parameters(): 45 | infiles,outfiles = replace_filename(fastqFiles,'^','trim_') 46 | for infile, output in zip(infiles,outfiles): 47 | yield infile,output 48 | #------------- run fastqc before trimming ----------- 49 | @active_if(QC) 50 | @jobs_limit(thread) 51 | @mkdir(fastqFiles,formatter(),'{path[0]}/fastqc') 52 | @files(trim_parameters) 53 | def run_QC1(input_file,output_file): 54 | for fq in input_file: 55 | sarge.run('fastqc {input} -o fastqc'.format(input=fq)) 56 | #------------ trim file ------------------ 57 | @active_if(trim) 58 | @follows(run_QC1) 59 | @jobs_limit(trim_batch) 60 | @files(trim_parameters) 61 | def trim_reads(input_file,output_file): 62 | n = num_thread2use(trim_batch,len(fastqFiles),thread) 63 | conda_Trimmomatic(input_file,output_file,n,adapter,min_len) 64 | remove(input_file) 65 | # run fastqc after trimming 66 | if QC: 67 | for fq in output_file: 68 | sarge.run('fastqc {input} -o fastqc'.format(input=fq)) 69 | #--------------------- 3. Align to rtRNA ----------------------------------------------- 70 | # build index 71 | @active_if(not os.path.exists(hisat2_rRNA_db)) 72 | @follows(trim_reads,run_QC1) 73 | def hisat2_rrna_index(): 74 | if not os.path.exists(hisat2_rRNA_db): os.mkdir(hisat2_rRNA_db) 75 | hisat2_Db(rRNA_fa,hisat2_rRNA_db+'/rRNA',thread) 76 | # align 77 | if trim == False: 78 | trim_reads=fastqFiles 79 | @jobs_limit(hisat2_batch) 80 | @follows(hisat2_rrna_index) 81 | @mkdir(fastqFiles,formatter(),'{path[0]}/f01rRNA_bam') 82 | @check_if_uptodate(check_file_exists) 83 | @transform(trim_reads,formatter('.*\.f.*?\.gz'),'f01rRNA_bam/{basename[0]}_norrna.fq.gz') 84 | def run_hisat2rRNA(input_file,output_file): 85 | n = num_thread2use(hisat2_batch,len(fastqFiles),thread) 86 | rrna_fq = output_file[:-13]+'.bam' 87 | hisat2(input_file,rrna_fq,hisat2_rRNA_db+'/rRNA',n,['--un-gz',output_file]) 88 | #--------------------- 3. Align to target genome ---------------------------------------- 89 | @active_if(not os.path.exists(hisat2_target_db)) 90 | @follows(run_hisat2rRNA) 91 | def hisat2_index(): 92 | if not os.path.exists(hisat2_target_db): os.mkdir(hisat2_target_db) 93 | hisat2_Db(ref_fa,hisat2_target_db+'/target',thread) 94 | # align 95 | @jobs_limit(hisat2_batch) 96 | @follows(hisat2_index) 97 | @mkdir(fastqFiles,formatter(),'{path[0]}/f02_bam') 98 | @check_if_uptodate(check_file_exists) 99 | @transform(run_hisat2rRNA,formatter('.*\.f.*?\.gz'),'f02_bam/{basename[0]}.bam') 100 | def run_hisat2(input_file,output_file): 101 | n = num_thread2use(hisat2_batch,len(fastqFiles),thread) 102 | hisat2([input_file],output_file,hisat2_target_db+'/target',n,['--known-splicesite-infile',annotation]) 103 | #--------------------- 4. get primary mapping ------------------------------------------- 104 | @jobs_limit(trim_batch) 105 | @follows(run_hisat2) 106 | @mkdir(fastqFiles,formatter(),'{path[0]}/f03_primaryBam') 107 | @check_if_uptodate(check_file_exists) 108 | @transform(run_hisat2,formatter('.*\.bam'),'f03_primaryBam/{basename[0]}.bam') 109 | def primary_bam(input_file,output_file): 110 | cmd = ('samtools view -h {fst_map} | grep -E {pattern} | ' 111 | 'samtools view -bh -F 256 - > {out}').format(fst_map=input_file, 112 | pattern='\'(NM:i:[012])|(^@)\'',out=output_file) 113 | print(cmd) 114 | sarge.run(cmd) 115 | #--------------------- 5. 
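# [note] primary_bam() above keeps SAM header lines ('^@') plus alignments
# with at most two mismatches (NM:i:0/1/2), and 'samtools view -F 256' then
# drops secondary alignments so each ribosome footprint is counted once. The
# same filter in pysam, as an illustration only (pysam is an assumption; the
# pipeline itself pipes samtools through grep):
import pysam

def _primary_low_mismatch_sketch(in_bam, out_bam, max_nm=2):
    """Keep primary alignments with <= max_nm mismatches (sketch)."""
    with pysam.AlignmentFile(in_bam, 'rb') as ib, \
         pysam.AlignmentFile(out_bam, 'wb', template=ib) as ob:
        for r in ib:
            if not r.is_secondary and r.has_tag('NM') and r.get_tag('NM') <= max_nm:
                ob.write(r)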
samtools sort by position -------------------------------------- 116 | @jobs_limit(trim_batch) 117 | @follows(primary_bam) 118 | @mkdir(fastqFiles,formatter(),'{path[0]}/f04_sortBam') 119 | @check_if_uptodate(check_file_exists) 120 | @transform(primary_bam,formatter('.*\.bam'),'f04_sortBam/{basename[0]}.sort.bam') 121 | def sort_by_pos(input_file,output_file): 122 | n = num_thread2use(trim_batch,len(fastqFiles),thread) 123 | sortBam(input_file,output_file,n) 124 | stat = sarge.get_stdout('samtools flagstat {bam}'.format(bam=output_file)) 125 | with open(output_file[:-3]+'flagstat.txt','w') as f: 126 | f.write(stat) 127 | # @follows(sort_by_pos) 128 | # def remove_bam(): 129 | # if os.path.exists('f01rRNA_bam'): shutil.rmtree('f01rRNA_bam') # remove bam folder 130 | 131 | 132 | @follows(sort_by_pos) 133 | def last_function(): 134 | Message('Riboseq finished',contact) 135 | 136 | if __name__ == '__main__': 137 | try: 138 | pipeline_run([last_function],multiprocess=thread,gnu_make_maximal_rebuild_mode = True, 139 | touch_files_only=False,verbose=5) 140 | except: 141 | Message('Riboseq failed',contact) 142 | -------------------------------------------------------------------------------- /RNAseq_STARpipeline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # RNAseq qc and quantification 4 | #QC: Picard and RNA-seQC 5 | #quantification: HT-seq and RSEM 6 | 7 | ##### Constants 8 | 9 | 10 | ##### Functions 11 | 12 | 13 | usage() 14 | { 15 | echo "usage: STARpipeline.sh -f1 fastq1 -f2 fastq2 -sid sample_id -starIndx star_index_directory -BAM STAR_bam_path -refFASTA referenceGenome -GFF referenceGTF -RSEM RSEMreference -ncores num_cores" 16 | } 17 | 18 | 19 | ##### Main 20 | #fastq1="/data/vahid/RNAseqB5STAR/1870-01_W4G6DG07_S55/fastq1.gz" 21 | #fastq2="/data/vahid/RNAseqB5STAR/1870-01_W4G6DG07_S55/fastq2.gz" 22 | #sample_id="1870-01_W4G6DG07_S55" 23 | #bam_path="/data/vahid/RNAseqB5STAR/1870-01_W4G6DG07_S55/" 24 | #refGenome="/data/vahid/RNAseqPipeLine/Homo_sapiens_assembly38_noALT_noHLA_noDecoy_ERCC.fasta" 25 | #refGTF="/data/vahid/RNAseqPipeLine/gencode.v26.GRCh38.genes.gtf" 26 | #RSEMpath="/data/vahid/RNAseqPipeLine/rsemRef/rsem_reference" 27 | 28 | ncores=10 29 | while [ "$1" != "" ]; do 30 | case $1 in 31 | -f1 | --fastq1 ) shift 32 | fastq1=$1 33 | ;; 34 | -f2 | --fastq2 ) shift 35 | fastq2=$1 36 | ;; 37 | -sid | --sample_id ) shift 38 | sample_id=$1 39 | ;; 40 | -starIndx | --Star_index_directory ) shift 41 | star_index=$1 42 | ;; 43 | -BAM | --BAMpath ) shift 44 | bam_path=$1 45 | ;; 46 | -refFASTA | --refGenome ) shift 47 | refGenome=$1 48 | ;; 49 | -GFF | --refGFF ) shift 50 | refGTF=$1 51 | ;; 52 | -ncores | --num_cores ) shift 53 | ncores=$1 54 | ;; 55 | -RSEM | --RSEMreference ) shift 56 | RSEMpath=$1 57 | ;; 58 | -h | --help ) usage 59 | exit 60 | ;; 61 | esac 62 | shift 63 | done 64 | 65 | #mkdir -p -- ${bam_path} 66 | 67 | #cp -f "$fastq1" ${bam_path}fastq1.gz 68 | #cp -f "$fastq2" ${bam_path}fastq2.gz 69 | 70 | bam_file_name="Aligned.sortedByCoord.out.bam" 71 | 72 | STAR --runMode alignReads --runThreadN ${ncores} --genomeDir ${star_index} \ 73 | --twopassMode Basic \ 74 | --outFilterMultimapNmax 20 \ 75 | --alignSJoverhangMin 8 \ 76 | --alignSJDBoverhangMin 1 \ 77 | --outFilterMismatchNmax 999 \ 78 | --outFilterMismatchNoverLmax 0.1 \ 79 | --alignIntronMin 20 \ 80 | --alignIntronMax 1000000 \ 81 | --alignMatesGapMax 1000000 \ 82 | --outFilterType BySJout \ 83 | --outFilterScoreMinOverLread 0.33 \ 84 | 
--outFilterMatchNminOverLread 0.33 \ 85 | --limitSjdbInsertNsj 1200000 \ 86 | --readFilesIn $fastq1 $fastq2 \ 87 | --readFilesCommand zcat \ 88 | --outFileNamePrefix ${bam_path} \ 89 | --outSAMstrandField intronMotif \ 90 | --outFilterIntronMotifs None \ 91 | --alignSoftClipAtReferenceEnds Yes \ 92 | --quantMode TranscriptomeSAM GeneCounts \ 93 | --outSAMtype BAM SortedByCoordinate \ 94 | --outSAMunmapped Within \ 95 | --genomeLoad NoSharedMemory \ 96 | --chimSegmentMin 15 \ 97 | --chimJunctionOverhangMin 15 \ 98 | --chimOutType WithinBAM SoftClip \ 99 | --chimMainSegmentMultNmax 1 \ 100 | --outSAMattributes NH HI AS nM NM ch --outSAMattrRGline ID:rg1 SM:sm1 101 | 102 | #Marking the duplicates using picard 103 | mkdir -p -- ${bam_path}markedDup 104 | java -jar /data/vahid/RNAseqPipeLine/picard.jar \ 105 | MarkDuplicates I=${bam_path}${bam_file_name} \ 106 | O=${bam_path}markedDup/${bam_file_name} \ 107 | M=${bam_path}/markedDup/sample_id.marked_dup_metrics.txt \ 108 | ASSUME_SORT_ORDER=coordinate 109 | 110 | #Running Picard QC 111 | mkdir -p -- ${bam_path}/PicardQC 112 | java -jar /data/vahid/RNAseqPipeLine/picard.jar CollectAlignmentSummaryMetrics \ 113 | R=${refGenome} \ 114 | INPUT=${bam_path}${bam_file_name} \ 115 | OUTPUT=${bam_path}PicardQC/picard_QC.txt 116 | 117 | #Running Picard fragment size 118 | java -jar /data/vahid/RNAseqPipeLine/picard.jar CollectInsertSizeMetrics \ 119 | I=${bam_path}${bam_file_name} \ 120 | O=${bam_path}PicardQC/insert_size_metrics.txt \ 121 | H=${bam_path}PicardQC/insert_size_histogram.pdf \ 122 | M=0.5 123 | 124 | #Running samtools statistics on BAM indeces 125 | samtools index ${bam_path}${bam_file_name} 126 | samtools idxstats ${bam_path}${bam_file_name} > ${bam_path}PicardQC/indexStats.txt 127 | 128 | #Running RNA-seQC 129 | 130 | md_bam_file=${bam_path}markedDup/Aligned.sortedByCoord.out.bam 131 | mkdir -p -- ${bam_path}RNA_SeQC 132 | 133 | 134 | 135 | samtools index ${md_bam_file} 136 | 137 | javaPath="/home/vahid/.conda/pkgs/java-1.7.0-openjdk-cos6-x86_64-1.7.0.131-h06d78d4_0/x86_64-conda_cos6-linux-gnu/sysroot/usr/lib/jvm/java-1.7.0-openjdk-1.7.0.131.x86_64/jre/bin/" 138 | ${javaPath}java -jar /data/vahid/RNAseqPipeLine/RNA-SeQC_1.1.9/RNA-SeQC.jar -n 1000 \ 139 | -s ${sample_id},${md_bam_file},${sample_id} \ 140 | -t ${refGTF} \ 141 | -r ${refGenome} \ 142 | -noDoC \ 143 | -strictMode \ 144 | -o ${bam_path}RNA_SeQC \ 145 | -gatkFlags --allow_potentially_misencoded_quality_scores \ 146 | -singleEnd no 147 | 148 | 149 | #Running RSEM 150 | 151 | mkdir -p -- ${bam_path}RSEM 152 | /data/vahid/RNAseqPipeLine/RSEMpkg/RSEM-1.2.25/rsem-calculate-expression --num-threads 4 \ 153 | --fragment-length-max 1000 \ 154 | --estimate-rspd \ 155 | --no-bam-output \ 156 | --paired-end \ 157 | --bam ${bam_path}Aligned.toTranscriptome.out.bam \ 158 | ${RSEMpath} ${bam_path}RSEM/RSEM 159 | 160 | #Running HT-Seq 161 | mkdir -p -- ${bam_path}HTseq 162 | htseq-count -f bam -r pos -s no -t gene ${bam_path}${bam_file_name} ${refGTF} > ${bam_path}HTseq/HTseqGeneUnion.txt & 163 | sleep 1m 164 | htseq-count -f bam -r pos -s no -t transcript ${bam_path}${bam_file_name} ${refGTF} > ${bam_path}HTseq/HTseqTranscriptUnion.txt & 165 | sleep 1m 166 | htseq-count -f bam -r pos -s no -t exon ${bam_path}${bam_file_name} ${refGTF} > ${bam_path}HTseq/HTseqExonUnion.txt & 167 | sleep 1m 168 | htseq-count -f bam -r pos -s no -t gene -m intersection-strict ${bam_path}${bam_file_name} ${refGTF} > ${bam_path}HTseq/HTseqGeneIntersect.txt & 169 | sleep 1m 170 | htseq-count -f bam -r pos -s no -t 
transcript -m intersection-strict ${bam_path}${bam_file_name} ${refGTF} > ${bam_path}HTseq/HTseqTranscriptIntersect.txt & 171 | sleep 1m 172 | htseq-count -f bam -r pos -s no -t exon -m intersection-strict ${bam_path}${bam_file_name} ${refGTF} > ${bam_path}HTseq/HTseqExonIntersect.txt 173 | sleep 30m 174 | 175 | 176 | 177 | #rm -- ${bam_path}fastq1.gz 178 | #rm -- ${bam_path}fastq2.gz 179 | rm -rf -- ${bam_path}_STARpass1 180 | rm -rf -- ${bam_path}_STARgenome 181 | 182 | rm -rf -- ${bam_path}markedDup 183 | 184 | rm -- ${bam_path}Aligned.toTranscriptome.out.bam 185 | rm -- ${bam_path}Aligned.sortedByCoord.out.bam 186 | rm -- ${bam_path}Aligned.sortedByCoord.out.bam.bai -------------------------------------------------------------------------------- /GRO_Seq_Cap.py: -------------------------------------------------------------------------------- 1 | from ruffus import * 2 | from Modules.f01_file_process import * 3 | from Modules.Trimmomatic import Trimmomatic 4 | from Modules.Aligner import STAR,STAR_Db 5 | from Modules.Samtools import sortBam 6 | from Modules.Homer import * 7 | import yaml 8 | import shutil 9 | import itertools 10 | 11 | #============ parameters ====================== 12 | parameter_file = sys.argv[1] 13 | # parameter_file = '/home/shangzhong/Codes/NGS-Pipeline/Parameters/GRO_Seq_Cap.yaml' 14 | with open(parameter_file,'r') as f: 15 | doc = yaml.load(f) 16 | p = dic2obj(**doc) 17 | #------------- get parameters ----------- 18 | file_path = p.RawDataPath 19 | thread = p.thread 20 | QC = p.QC 21 | # all parameter 22 | ref_fa = p.ref_fa 23 | annotation = p.gff 24 | # trimmomatic parameter 25 | trim = p.trim_reads 26 | trimmomatic = p.trimmomatic_path 27 | trim_batch = p.trim_jobs_per_batch 28 | adapter = p.adapter 29 | # star parameter 30 | star_batch = p.star_jobs_per_batch 31 | db_path = p.STAR_index_path 32 | 33 | contact = p.contact 34 | #=============================================================================== 35 | # Pipeline part 36 | #=============================================================================== 37 | #--------------------- 1. read all files ------------------------------------------------ 38 | # Message('5GRO',contact) 39 | os.chdir(file_path) 40 | fastqFiles = list_fq_files(file_path) 41 | if fastqFiles[0][0].startswith('trim_'): 42 | trim = False 43 | #--------------------- 2. 
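# [note] Message (Modules.f01_file_process, source not shown) is how every
# pipeline here reports start/finish/failure to the YAML 'contact'. Its
# mechanism is not visible in this dump; a minimal sketch, assuming contact
# is an email address, a local SMTP server, and Python 3's smtplib:
import smtplib
from email.mime.text import MIMEText

def _message_sketch(text, contact):
    """Mail text to contact, falling back to stdout (hypothetical helper)."""
    msg = MIMEText(text)
    msg['Subject'], msg['From'], msg['To'] = text, 'pipeline@localhost', contact
    try:
        server = smtplib.SMTP('localhost')
        server.send_message(msg)
        server.quit()
    except Exception:
        print(text)   # no mail server available; still surface the message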
trim reads----------------------------------------------------- 44 | def trim_parameters(): 45 | infiles,outfiles = replace_filename(fastqFiles,'^','trim_') 46 | for infile, output in zip(infiles,outfiles): 47 | yield infile,output 48 | #------------- run fastqc before trimming ----------- 49 | @active_if(QC) 50 | @jobs_limit(thread) 51 | @mkdir(fastqFiles,formatter(),'{path[0]}/f01_fastqc') 52 | @files(trim_parameters) 53 | def run_QC1(input_file,output_file): 54 | for fq in input_file: 55 | sarge.run('fastqc {input} -o f01_fastqc'.format(input=fq)) 56 | #------------ trim file ------------------ 57 | @active_if(trim) 58 | @follows(run_QC1) 59 | @jobs_limit(trim_batch) 60 | @files(trim_parameters) 61 | def trim_reads(input_file,output_file): 62 | n = num_thread2use(trim_batch,len(fastqFiles),thread) 63 | Trimmomatic(input_file,output_file,trimmomatic,n,adapter,22) 64 | remove(input_file) 65 | #------------ run fastqc after trimming ------------ 66 | @active_if(QC and trim) 67 | @follows(trim_reads) 68 | @jobs_limit(thread) 69 | @transform(trim_reads,formatter('.*\.f.*?\.gz'),'f01_fastqc/{basename[0]}') 70 | @check_if_uptodate(check_file_exists) 71 | # @files(trim_parameters) 72 | def run_QC2(input_file,output_file): 73 | for fq_in,fq_out in zip(input_file,output_file): 74 | if fq_in.startswith('trim_'): 75 | sarge.run('fastqc {input} -o f01_fastqc'.format(input=fq_in)) 76 | else: 77 | sarge.run('fastqc {input} -o f01_fastqc'.format(input=fq_out)) 78 | #--------------------- 3. run STAR ------------------------------------------------------ 79 | # build index 80 | @active_if(not os.path.exists(db_path)) 81 | @follows(trim_reads,run_QC2) 82 | def star_index(): 83 | STAR_Db(db_path,ref_fa,thread) 84 | # align 85 | if trim == False: 86 | trim_reads=fastqFiles 87 | @jobs_limit(star_batch) 88 | @follows(star_index) 89 | @mkdir(fastqFiles,formatter(),'{path[0]}/f02_bam') 90 | @mkdir(fastqFiles,formatter(),'{path[0]}/f02_flagstat') 91 | @check_if_uptodate(check_file_exists) 92 | @transform(trim_reads,formatter('.*\.f.*?\.gz'),'f02_bam/{basename[0]}.bam') 93 | def run_star(input_file,output_file): 94 | n = num_thread2use(star_batch,len(fastqFiles),thread) 95 | STAR(input_file,output_file,db_path,n,annotation,['--outSAMtype BAM','SortedByCoordinate','--outSAMunmapped Within']) 96 | stat = sarge.get_stdout('samtools flagstat {bam}'.format(bam=output_file)) 97 | flag_fn = output_file[:-3]+'flagstat.txt' 98 | with open(flag_fn,'w') as f: 99 | f.write(stat) 100 | shutil.move(flag_fn,'f02_flagstat') 101 | #--------------------- 4. make tag_directory ------------------------------------------------------ 102 | @follows(run_star) 103 | @mkdir(fastqFiles,formatter(),'{path[0]}/f03_tags') 104 | @check_if_uptodate(check_file_exists) 105 | @transform(run_star,formatter('\.bam'),'f03_tags/{basename[0]}') 106 | def make_tag(input_bam,out_dir): 107 | make_tag_directory(input_bam,out_dir,ref_fa) 108 | hist_out = out_dir+'/hist.txt' 109 | hist(out_dir,hist_out,ref_fa,annotation,mode='tss',peak='',region=4000,res=10,pc=3) 110 | hist_plot(hist_out) 111 | #--------------------- 5. 
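# [note] get_input_for_peak_call() below crosses every 5GRO-cap tag directory
# with every GRO-seq one via itertools.product, so findPeaks always sees a
# matched background, and the '_and_' in the output name is what
# filter_anno_peak() in Modules/Homer.py later parses apart. With
# hypothetical tag directories ['5GRO_A', '5GRO_B'] and ['GRO_1']:
#   list(itertools.product(['5GRO_A', '5GRO_B'], ['GRO_1']))
#     -> [('5GRO_A', 'GRO_1'), ('5GRO_B', 'GRO_1')]
#   i.e. peak files f04_peaks/5GRO_A_and_GRO_1.peak and
#        f04_peaks/5GRO_B_and_GRO_1.peak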
find peaks ------------------------------------------------------ 112 | def get_input_for_peak_call(): 113 | gro_cap = [f for f in os.listdir('f03_tags') if '5GRO' in f and 'contr' not in f] 114 | # gro_cap_ctr = [[f for f in os.listdir('f03_tags') if 'contr' in f]] 115 | gro_seq = [f for f in os.listdir('f03_tags') if '5GRO' not in f] 116 | comb = list(itertools.product(gro_cap,gro_seq)) 117 | for com in comb: 118 | out = com[0] + '_and_' + com[1] 119 | yield ['f03_tags/'+f for f in com],'f04_peaks/' + out + '.peak' 120 | @jobs_limit(thread) 121 | @follows(make_tag) 122 | @mkdir(fastqFiles,formatter(),'{path[0]}/f04_peaks') 123 | @files(get_input_for_peak_call) 124 | def find_peak(input_files,output_file): 125 | find_peaks(input_files[0],output_file,'tss',input_files[1],['-F 2']) 126 | #--------------------- 6. merge peaks ------------------------------------------------------ 127 | @follows(find_peak) 128 | @merge(find_peak,'f04_peaks/merge.peak') 129 | def merge_peak(input_files,output_file): 130 | merge_peaks(input_files,output_file,150) 131 | #--------------------- 6. annotate peaks ------------------------------------------------------ 132 | @jobs_limit(thread) 133 | @follows(merge_peak) 134 | @mkdir(fastqFiles,formatter(),'{path[0]}/f05_annoPeaks') 135 | @transform(merge_peak,formatter('\.peak'),'f05_annoPeaks/{basename[0]}.anno') 136 | @check_if_uptodate(check_file_exists) 137 | def anno_peak(input_file,output_file): 138 | annotate_peaks(input_file,output_file,ref_fa,annotation) 139 | #--------------------- 7. hist peaks ------------------------------------------------------ 140 | @jobs_limit(thread) 141 | @follows(anno_peak) 142 | @mkdir(fastqFiles,formatter(),'{path[0]}/f06_histPeaks') 143 | @transform(find_peak,formatter('\.peak'),'f06_histPeaks/{basename[0]}.hist') 144 | @check_if_uptodate(check_file_exists) 145 | def peak_cov_hist(input_file,output_file): # input is peak file 146 | gro_cap = [f for f in os.listdir('f03_tags') if '5GRO' in f] 147 | tag = ['f03_tags/' + t for t in gro_cap if t in input_file] 148 | hist(tag[0],output_file,ref_fa,annotation,mode='peak',peak=input_file,region=4000,res=25,pc=1) 149 | hist_plot(output_file) 150 | 151 | #--------------------- 8. 
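# [note] For debugging this graph, the commented pipeline_printout call in
# __main__ below is ruffus's textual dry run; pipeline_printout_graph (also a
# standard ruffus entry point) can draw the task graph instead:
#   pipeline_printout(sys.stdout, [last_function], verbose=3)         # dry run
#   pipeline_printout_graph('flowchart.svg', 'svg', [last_function])  # image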
merge peaks ------------------------------------------------------ 152 | 153 | @follows(peak_cov_hist) 154 | def last_function(): 155 | Message('GroCap finished',contact) 156 | 157 | 158 | if __name__ == '__main__': 159 | try: 160 | # pipeline_printout(sys.stdout, [last_function], verbose=3) 161 | pipeline_run([last_function],multiprocess=thread,gnu_make_maximal_rebuild_mode = False, 162 | touch_files_only=False,verbose=20) 163 | except: 164 | Message('GroCap failed',contact) 165 | 166 | 167 | -------------------------------------------------------------------------------- /Modules/Aligner.py: -------------------------------------------------------------------------------- 1 | import sarge 2 | import os 3 | import shutil 4 | import sys 5 | #=============================================================================== 6 | # STAR 7 | #=============================================================================== 8 | def STAR_Db(db_path,ref_fa,thread=1,annotation = '',genomeSize='large'): 9 | """ 10 | This function generates database for alignment using STAR 11 | """ 12 | if not os.path.exists(db_path): os.mkdir(db_path) 13 | if os.listdir(db_path) == []: 14 | cmd = ('STAR --runMode genomeGenerate --genomeDir {db_path} ' 15 | '--genomeFastaFiles {ref_fa} --runThreadN {thread} ' 16 | '--limitGenomeGenerateRAM 100000000000 ').format( 17 | db_path=db_path,ref_fa=ref_fa,thread=str(thread)) 18 | if annotation != '': 19 | cmd = cmd + ('--sjdbGTFfile {gff3} --sjdbGTFtagExonParentTranscript Parent ' 20 | '--sjdbOverhang 100').format(gff3=annotation) # for geneDb add --sjdbGTFfeatureExon CDS 21 | if genomeSize == 'small': 22 | cmd = cmd + '--genomeChrBinNbits 6 --genomeSAindexNbases 4' 23 | print(cmd);sys.stdout.flush() 24 | sarge.run(cmd) 25 | 26 | def STAR(fastqFiles,outSamFile,db_path,thread=1,annotation='',otherParameters=['']): 27 | """STAR for single end read""" 28 | if annotation != '': 29 | otherParameters.extend(['--sjdbGTFfile {gff}'.format(gff=annotation)]) 30 | if annotation.endswith('gff') or annotation.endswith('gff3'): 31 | otherParameters.append('--sjdbGTFtagExonParentTranscript Parent') 32 | # generate command 33 | if len(fastqFiles) == 1: 34 | starCmd = ('STAR --genomeDir {ref} --readFilesCommand zcat ' 35 | '--readFilesIn {fq1} --runThreadN {thread} ' 36 | '--outFileNamePrefix {output} --outSAMstrandField intronMotif ' 37 | '--outFilterIntronMotifs RemoveNoncanonical').format( 38 | ref=db_path,fq1=fastqFiles[0], 39 | thread=thread,output=outSamFile) 40 | elif len(fastqFiles) == 2: 41 | starCmd = ('STAR --genomeDir {ref} --readFilesCommand zcat ' 42 | '--readFilesIn {fq1} {fq2} --runThreadN {thread} ' 43 | '--outFileNamePrefix {output} --outSAMstrandField intronMotif ' 44 | '--outFilterIntronMotifs RemoveNoncanonical').format( 45 | ref=db_path,fq1=fastqFiles[0],fq2=fastqFiles[1], 46 | thread=thread,output=outSamFile) 47 | cmd = starCmd + ' ' + ' '.join(otherParameters) 48 | print(cmd);sys.stdout.flush() 49 | sarge.run(cmd) 50 | if 'SortedByCoordinate' in otherParameters: 51 | outFile = outSamFile+'Aligned.sortedByCoord.out.bam' 52 | else: 53 | outFile = outSamFile+'Aligned.out.bam' 54 | os.rename(outFile,outSamFile) 55 | if os.path.exists(outSamFile+'_STARgenome'): 56 | shutil.rmtree(outSamFile+'_STARgenome') 57 | 58 | 59 | def BLASR(faFile,outBam,ref_fa,thread,otherParameters=['']): 60 | """This function runs BLASR""" 61 | 62 | if otherParameters != ['']: 63 | other = ' '.join(otherParameters) 64 | else: 65 | other = '' 66 | cmd = ('blasr {input} {ref} -sam -nproc {thread} 
{other} | samtools view -hb - > {out}').format( 67 | input=faFile,ref=ref_fa,thread=str(thread),other=other,out=outBam,) 68 | 69 | print(cmd);sys.stdout.flush() 70 | sarge.run(cmd) 71 | 72 | #=============================================================================== 73 | # bwa 74 | #=============================================================================== 75 | def bwa_Db(db_path,ref_fa): 76 | """build bwa index""" 77 | if not os.path.exists(db_path): 78 | os.mkdir(db_path) 79 | cmd = ('bwa index -p {db_path}/bwa -a bwtsw {fa}').format(fa=ref_fa,db_path=db_path) 80 | print(cmd);sys.stdout.flush() 81 | sarge.run(cmd) 82 | 83 | 84 | def bwa_mem(fqFile,outSam,db_name,thread,otherParameters=['']): 85 | """run bwa""" 86 | if otherParameters != ['']: 87 | other = ' '.join(otherParameters) + ' ' 88 | else: 89 | other = '' 90 | if len(fqFile) == 1: 91 | bwaCmd = ('bwa mem -t {thread} {other}{db} {fq} | samtools view -bh - > {out} ').format( 92 | thread=str(thread),other=other,db=db_name,fq=fqFile[0], 93 | out=outSam) 94 | else: 95 | bwaCmd = ('bwa mem -t {thread} {other}{db} {fq1} {fq2} | samtools view -bh - > ' 96 | '{out} ').format(thread=str(thread),other=other,db=db_name,fq1=fqFile[0], 97 | fq2=fqFile[1],out=outSam) 98 | print(bwaCmd);sys.stdout.flush() 99 | sarge.run(bwaCmd) 100 | #bwa_mem('/data/shangzhong/Pacbio/sniffle/CHOS.fq.gz','/data/shangzhong/Pacbio/sniffle/result.bam','5',['-x pacbio']) 101 | 102 | 103 | def bwa_samblaster(fqFiles,outBam,db_name,thread,otherParameters=['']): 104 | '''map for lumpy ''' 105 | if len(fqFiles) != 2: 106 | assert False,'fastq files are not paired' 107 | if otherParameters != ['']: 108 | other = ' '.join(otherParameters) + ' ' 109 | else: 110 | other = '' 111 | split = outBam[:-3]+'split.sam' 112 | disc = outBam[:-3] + 'disc.sam' 113 | cmd = ('bwa mem -t {thread} {other}{db} {fq1} {fq2} | samblaster --addMateTags -e -d {disc} -s {split} | \ 114 | samtools view -Sb - > {out}').format(thread=str(thread),other=other,db=db_name,fq1=fqFiles[0], 115 | fq2=fqFiles[1],disc=disc,split=split,out=outBam) 116 | print(cmd);sys.stdout.flush() 117 | sarge.run(cmd) 118 | 119 | #=============================================================================== 120 | # HISAT2 121 | #=============================================================================== 122 | def hisat2_Db(ref_fa,db,thread=1): 123 | """ 124 | """ 125 | cmd = ('hisat2-build -p {t} {ref} {name} ').format(t=str(thread),ref=ref_fa,name=db) 126 | print(cmd);sys.stdout.flush() 127 | sarge.run(cmd) 128 | 129 | 130 | 131 | def hisat2(fqFile,outBam,db_name,thread,otherParameters=['']): 132 | """ 133 | """ 134 | if otherParameters != ['']: 135 | other = ' '.join(otherParameters) + ' ' 136 | else: 137 | other = '' 138 | if len(fqFile) == 1: 139 | hisat2Cmd = ('hisat2 -x {db} -U {fq} -t {other} -p {thread} ' 140 | '| samtools view -bh - > {out}').format(db=db_name,fq=fqFile[0], 141 | other=other,thread=str(thread),out=outBam) 142 | else: 143 | hisat2Cmd = ('hisat2 -x {db} -1 {fq1} -2 {fq2} -t {other} -p {thread} ' 144 | '| samtools view -bh - > {out}').format(db=db_name,fq1=fqFile[0],fq2=fqFile[1], 145 | other=other,thread=str(thread),out=outBam) 146 | 147 | print(hisat2Cmd);sys.stdout.flush() 148 | sarge.run(hisat2Cmd) 149 | 150 | #=============================================================================== 151 | # ngmlr 152 | #=============================================================================== 153 | def ngmlr(in_fa,outBam,ref_fa,thread): 154 | '''run nglmr for better SV detection using 
pacbio'''
155 |     cmd = ('ngmlr -t {thread} -r {ref} -q {fa} | samtools view -hb - > {out}').format(
156 |             thread=str(thread),ref=ref_fa,fa=in_fa,out=outBam)
157 |     print(cmd);sys.stdout.flush()
158 |     sarge.run(cmd)
--------------------------------------------------------------------------------
/VCF_snpEff_annotation.py:
--------------------------------------------------------------------------------
1 | """
2 | this pipeline annotates variant calling results in a vcf file and then
3 | uses PROVEAN to predict the effect of the variants
4 | """
5 | import sys,subprocess,os
6 | sys.path.append('/home/shangzhong/Codes/Pipeline')
7 | sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0) # disable output buffering
8 | from Modules.f11_snpEff_provean import *
9 | from Modules.p01_FileProcess import get_parameters
10 | from Modules.f00_Message import Message
11 | from Modules.p05_ParseGff import *
12 | from multiprocessing import Pool,Process
13 | #parFile = sys.argv[1]
14 | parFile = '/data/shangzhong/DNArepair/correction/Annotation_Parameters.txt'
15 | param = get_parameters(parFile)
16 | # parameters
17 | thread = param['thread']
18 | pathway = param['pathway']
19 | email = param['email']
20 | startMessage = param['startMessage']
21 | endMessage = param['endMessage']
22 | # database reference
23 | fastaFile = param['reference']
24 | record_dict = SeqIO.index(fastaFile,'fasta')
25 | gffFile = param['annotation']
26 | genome = param['genome']
27 | # software parameters
28 | snpSift = param['snpSift']
29 | snpEff = param['snpEff']
30 | provean = param['provean']
31 | support_set_path = param['support_set']
32 | provean_res_path = param['provean_results']
33 | # other parameters
34 | gene_file = param['gene_file']
35 | 
36 | #===============================================================================
37 | # Variant analysis pipeline
38 | #===============================================================================
39 | def chunk(l,n):
40 |     n = max(1,n)
41 |     res = [l[i:i+n] for i in range(0,len(l),n)]
42 |     return res
43 | 
44 | def get_genes_from_file(gene_file):
45 |     """read the gene list from the file and return a list of gene symbols"""
46 |     if gene_file == '':
47 |         genes = ['']
48 |     else:
49 |         genes = []
50 |         gene_df = pd.read_csv(gene_file,header=None,names=['GeneID'])
51 |         genes = gene_df['GeneID'].tolist()
52 |     return genes
53 | 
54 | def get_all_folders(pathway):
55 |     """put each pair of vcf/vcf.idx files into a separate folder and return the folder names"""
56 |     folders = []
57 |     files = [f for f in os.listdir(pathway) if f.endswith('.merged.filter.vcf')]
58 |     if files != []:
59 |         files = natsorted(files)
60 |         for f in files:
61 |             fp = f[:-18]
62 |             folders.append(fp)
63 |             if not os.path.exists(fp): os.mkdir(fp)
64 |             os.rename(f,fp+'/'+f)
65 |             os.rename(f+'.idx',fp+'/'+f+'.idx')
66 |     else:
67 |         all_folders = [fo for fo in os.listdir(pathway) if os.path.isdir(fo)]
68 |         for folder in all_folders:
69 |             fns = [f for f in os.listdir(folder) if f.endswith('merged.filter.vcf')]
70 |             if fns != []:
71 |                 folders.append(folder)
72 |     print 'listing directories succeeded'
73 |     print 'folders are:',folders
74 |     return folders
75 | 
76 | def prepare_fa_vari(workdir,snpEff,snpSift,email,genome,genes,record_dict,gffFile):
77 |     """
78 |     Prepare the input files for running PROVEAN; each folder should only have one vcf and one vcf.idx file
79 |     * workdir: the folder that has the vcf files
80 |     * snpEff: path to snpEff
81 |     * snpSift: path to snpSift
82 |     * email: email or phone number (number@txt.att.net)
83 |     * genome: genome name defined in snpEff
84 |     * genes: a list of gene symbols
85 |     * record_dict: reference sequences indexed with SeqIO.index (keyed by sequence id)
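    * gffFile: genome annotation of the reference in gff format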
* record_dict: 86 | """ 87 | gene_rna_lst = [f[:-11] for f in os.listdir(workdir) if f.endswith('protein.fa')] 88 | 89 | os.chdir(workdir) # set work directory 90 | vcfFiles = [f for f in os.listdir(workdir) if f.endswith('filter.vcf')] 91 | vcfFile = vcfFiles[0] 92 | proteinFiles = [];variantFiles = [] 93 | #============= 1. Annotate vcf results using snpEff ================ 94 | annotatedVCF = vcfFile[:-3] + 'eff.vcf' 95 | if not os.path.exists(workdir + '/' + annotatedVCF): 96 | annotatedVCF = snpEff_annotateVCF(vcfFile,snpEff,genome) # annotated: filename.eff.vcf 97 | #============= 2. Loop for every genes ================================ 98 | for gene in genes: 99 | print gene,'start to get input files for provean' 100 | if gene == '': 101 | try: 102 | filteredVCF = snpSift_filterVCF(annotatedVCF,snpSift, 103 | ['((ANN[*].IMPACT=\'HIGH\') | (ANN[*].IMPACT=\'MODERATE\'))']) 104 | except: 105 | print gene,'snpSift filter failed' 106 | Message('snpSift filter failed',email) 107 | else: 108 | gene_if = ('(ANN[*].GENE=\'{gene}\')').format(gene=gene) 109 | #============= (1). Filter the annotated file ======================== 110 | try: 111 | filteredVCF = snpSift_filterVCF(annotatedVCF,snpSift,[gene_if,'&' 112 | '((ANN[*].IMPACT=\'HIGH\') | (ANN[*].IMPACT=\'MODERATE\'))']) 113 | print 'filteredVCF is: ',filteredVCF 114 | except: 115 | print gene,'snpSift filter failed' 116 | Message('snpSift filter failed',email) 117 | #============= (2). Get input files for provean ====================== 118 | try: 119 | [protein_files,variant_files] = vcf2input4provean(filteredVCF,record_dict,gffFile,gene_rna_lst) 120 | except: 121 | print gene,'fail to get provean inputs' 122 | Message('fail to get provean inputs',email) 123 | raise 124 | if protein_files != '': 125 | proteinFiles.extend(protein_files) 126 | variantFiles.extend(variant_files) 127 | print gene,'prepare for provean input finish' 128 | else: 129 | print gene,'does not have interested variants' 130 | raise 131 | print workdir,'provean input succeed' 132 | 133 | Message(startMessage,email) 134 | genes = get_genes_from_file(gene_file) 135 | #================= 0. list directories ========================================= 136 | os.chdir(pathway) # set work directory 137 | folders = get_all_folders(pathway) 138 | folders = natsorted(folders) 139 | #============= 2. prepare input files for provean ====================================== 140 | batch_folders = chunk(folders,int(thread)) 141 | for batch in batch_folders: 142 | proc = [Process(target=prepare_fa_vari,args=(pathway+'/'+f,snpEff,snpSift,email,genome,genes,record_dict,gffFile,)) for f in batch] 143 | for p in proc: 144 | p.start() 145 | for p in proc: 146 | p.join() 147 | #============= 3. 
Run provean ======================================
148 | # support sets for provean: reusing them lets PROVEAN skip the time-consuming BLAST step
149 | for folder in folders:
150 |     support_set = [f for f in os.listdir(support_set_path) if f.endswith('.sss')]
151 |     workdir = pathway+'/'+folder
152 |     os.chdir(workdir)
153 |     proteinFiles = sorted([f for f in os.listdir(workdir) if f.endswith('protein.fa')])
154 |     variantFiles = sorted([f for f in os.listdir(workdir) if f.endswith('variant.txt')])
155 |     if not os.path.exists(provean_res_path): os.mkdir(provean_res_path)
156 |     provean_result = provean_res_path +'/'+folder+'_proveanScore.txt'
157 |     try:
158 |         capture_provean_scores(provean_result,provean,proteinFiles,variantFiles,support_set_path,support_set,thread)
159 |         print folder,'folder analysis succeeded'
160 |     except:
161 |         print 'capture provean scores failed'
162 |         Message('capture provean scores failed',email)
163 |         raise
164 |     #============= 4. move the sss support sets to the standard pathway ======================================
165 |     new_support_set = [f for f in os.listdir(pathway+'/'+folder) if f.endswith('.sss')]
166 |     for f in new_support_set:
167 |         if os.path.exists(f+'.fasta'):
168 |             os.rename(f,support_set_path+'/'+f)
169 |             os.rename(f+'.fasta',support_set_path+'/'+f+'.fasta')
170 | # cmd = ('rm */*.protein.fa'); subprocess.call(cmd,shell=True)
171 | # cmd = ('rm */*.variant.txt'); subprocess.call(cmd,shell=True)
172 | #for p in proteinFiles: os.remove(p)
173 | #for v in variantFiles: os.remove(v)
174 | #============= 5. Merge provean results ======================================
175 | outFile = pathway+'/provean_final_result.txt'
176 | try:
177 |     merge_provean_results(provean_res_path,outFile)
178 |     print 'merge succeeded'
179 | except:
180 |     print 'merge failed'
181 | Message(endMessage,email)
182 | 
--------------------------------------------------------------------------------
/Modules/f01_file_process.py:
--------------------------------------------------------------------------------
1 | import os
2 | from natsort import natsorted
3 | import sarge
4 | import re
5 | import pandas as pd
6 | 
7 | 
8 | 
9 | class dic2obj:
10 |     def __init__(self, **entries):
11 |         self.__dict__.update(entries)
12 | 
13 | def remove(files):
14 |     """
15 |     remove the files provided (and their .bai indexes when present)
16 | 
17 |     Arguments:
18 |     files: a list of files to be removed. [f1,f2,f3,...] or [[f1,f2],[f3],...] 
or with any depth of list layers 19 | """ 20 | if isinstance(files,str): 21 | os.remove(files) 22 | try: 23 | os.remove(files + '.bai') 24 | except: 25 | pass 26 | if isinstance(files,list): 27 | for f in files: 28 | remove(f) 29 | try: 30 | remove(f+'.bai') 31 | except: 32 | continue 33 | 34 | def check_file_exists(input_file, output_file): 35 | if not os.path.exists(output_file): 36 | return True, "Missing file %s" % output_file 37 | else: 38 | return False, "File %s exists" % output_file 39 | 40 | def list_fq_files(file_path): 41 | """ 42 | This function list all fastq files into a list 43 | """ 44 | fst_files = natsorted([f for f in os.listdir(file_path) if '_R1_' in f and (f.endswith(".fastq.gz") or f.endswith(".fq.gz"))]) 45 | snd_files = natsorted([f for f in os.listdir(file_path) if '_R2_' in f and (f.endswith(".fastq.gz") or f.endswith(".fq.gz"))]) 46 | if fst_files == []: 47 | fst_files = natsorted([f for f in os.listdir(file_path) if f.endswith("_1.fastq.gz") or f.endswith("_1.fq.gz")]) 48 | snd_files = natsorted([f for f in os.listdir(file_path) if f.endswith("_2.fastq.gz") or f.endswith("_2.fq.gz")]) 49 | fastqFiles = [] # this list is going to stroe the paired or single file for running aligner 50 | if snd_files == []: 51 | fst_files = natsorted([f for f in os.listdir(file_path) if f.endswith(".fastq.gz") or f.endswith(".fq.gz")]) 52 | fastqFiles = [[f] for f in fst_files] 53 | elif len(fst_files) == len(snd_files): 54 | fastqFiles = [[f1,f2] for f1,f2 in zip(fst_files,snd_files)] 55 | else: 56 | raise ValueError('input has single end and paired end mixed') 57 | return fastqFiles 58 | 59 | 60 | # allFiles = [f for f in os.listdir(file_path) if f.endswith(".fastq.gz") or f.endswith(".fq.gz")] 61 | # allFiles = natsorted(allFiles) 62 | # fastqFiles = [] # this list is going to stroe the paired or single file for running aligner 63 | # while len(allFiles) > 1: # this is to append the single end or pair end files into a list. 64 | # if allFiles[0].endswith(".fastq.gz"): 65 | # index = allFiles[0].index(".fastq.gz") 66 | # if allFiles[1][index-2:index] == '_2': 67 | # fastqFiles.append(allFiles[:2]) 68 | # del allFiles[:2] 69 | # else: 70 | # fastqFiles.append(allFiles[:1]) 71 | # del allFiles[:1] 72 | # 73 | # if len(allFiles) != 0: 74 | # if allFiles[0].endswith(".fq.gz"): 75 | # index = allFiles[0].index(".fq.gz") 76 | # if allFiles[1][index-2:index] == '_2': 77 | # fastqFiles.append(allFiles[:2]) 78 | # del allFiles[:2] 79 | # else: 80 | # fastqFiles.append(allFiles[:1]) 81 | # del allFiles[:1] 82 | # if len(allFiles) == 1: 83 | # fastqFiles.append(allFiles) 84 | # 85 | # return fastqFiles 86 | 87 | 88 | def replace_filename(inputfile,input_pattern,out_pattern): 89 | """ 90 | This function generates the outputfile name to address a problem that transform failed to do: 91 | make the outputfile the same length with inputfile when inputfile length varies 92 | * inputfile: list. [[f1.fq.gz]...] or [[f1.fq.gz,f2.fq.gz]] 93 | * input_pattern: a pattern in input file. 
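      (input_pattern is treated as a regular expression; e.g. '^' with out_pattern 'trim_' prepends 'trim_' to each file name)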
94 | * out_pattern: a pattern for output file 95 | """ 96 | outFile = [] 97 | regex = re.compile(input_pattern) 98 | for infile in inputfile: 99 | res = [] 100 | for fn in infile: 101 | output = regex.sub(out_pattern,fn) 102 | res.append(output) 103 | outFile.append(res) 104 | return inputfile,outFile 105 | # result = replace_filename([['f_1.fq.gz','f_2.fq.gz']],'^','') 106 | # print result 107 | 108 | def num_thread2use(jobs_per_batch,len_of_jobs,given_thread): 109 | """ 110 | This function calculates how many thread to use for each job given the jobs to run per batch and total number of jobs 111 | Some times total number of jobs is less than provided, in this case we can assign more thread to each job to run fast. 112 | """ 113 | jobs = min(jobs_per_batch,len_of_jobs) 114 | if jobs ==0: 115 | thread = 1 116 | else: 117 | thread = int(given_thread/jobs) 118 | if thread == 0: 119 | thread = 1 120 | return thread 121 | 122 | 123 | def Message(string,email): 124 | """ 125 | This function send message to email when it run. 126 | Used to calculate the time code runs. 127 | """ 128 | cmd = ('echo {quote}|mailx -s "{string}" {email}').format(quote="",string=string,email=email) 129 | sarge.run(cmd) 130 | 131 | def id_symbol_conversion(input_file,output_file,gene2refseq,tax_id,sym2ID='yes'): 132 | """This function convers count file based on gene symbol to gene id 133 | * inputfile: 2 columns. ['symbol','count'] 134 | * outputfile: 2 columns. ['geneid','count']""" 135 | # 1. build {symbol:id conversion} 136 | df = pd.read_csv(gene2refseq,sep='\t',header=None,usecols=[0,1,15],names=['tax','geneid','symbol'],comment='#',compression='gzip') 137 | df = df[df['tax'].values==int(tax_id)] 138 | sym_id_dict = df.set_index('symbol')['geneid'].to_dict() 139 | # 2. transfer symbol -> id 140 | symbol_df = pd.read_csv(input_file,sep='\t',header=None,names=['symbol','count']) 141 | symbol_df['geneid'] = symbol_df['symbol'].map(lambda x: sym_id_dict[x] if x in sym_id_dict else x) 142 | # 3. output 143 | symbol_df[['geneid','count']].to_csv(output_file,sep='\t',header=None,index=False) 144 | os.remove(input_file) 145 | 146 | 147 | 148 | def get_gene_name_id_dic(gff,source,sym2ID='yes'): 149 | ''' 150 | This function build {gene name: geneid} or {geneid:gene name} dictionary 151 | * source: ncbi or ensembl 152 | ''' 153 | df = pd.read_csv(gff,sep='\t',comment='#',header=None) 154 | df = df[df[2].values=='gene'] 155 | df = df.reset_index(drop=True) 156 | if source == 'ncbi': 157 | gene_pattern = 'gene=' 158 | id_pattern = 'GeneID\:' 159 | elif source == 'ensembl': 160 | gene_pattern = 'gene_name=' 161 | id_pattern = 'ID=' 162 | df['geneid'] = df[8].map(lambda x: re.search('(?<={p}).+?(?=[.,;$])'.format(p=id_pattern),x).group(0)) 163 | df['genename'] = df[8].map(lambda x: re.search('(?<={p}).+?(?=[,;$])'.format(p=gene_pattern),x).group(0)) 164 | # build dictionary 165 | if sym2ID=='yes': 166 | return df.set_index('genename')['geneid'].to_dict() 167 | else: 168 | return df.set_index('geneid')['genename'].to_dict() 169 | 170 | 171 | def gene_id_name_convert_merge(in_file,out_file,gene_id_name_dic): 172 | ''' 173 | * in_file: first row should be column name. 174 | ''' 175 | df = pd.read_csv(in_file,sep='\t',header=0) 176 | names = list(df.columns) 177 | df.columns = ['id_before'] + names[1:] 178 | df['id_after'] = df['id_before'].map(lambda x: gene_id_name_dic[x.split('.')[0]] if x.split('.')[0] in gene_id_name_dic else x.split('.')[0]) 179 | # 2. 
output
180 |     df[['id_after']+names[1:]].to_csv(out_file,sep='\t',header=None,index=False)
181 | 
182 | if __name__ == "__main__":
183 |     gff = '/data/genome/cho/chok1.gff'
184 |     dic = get_gene_name_id_dic(gff,'ncbi',sym2ID='yes')
185 |     in_file = '/path/to/file'
186 |     out_file = '/path/to/file'
187 |     gene_id_name_convert_merge(in_file,out_file,dic)
188 | 
189 | 
190 | def gene_id_name_convert(in_file,out_file,gene_id_name_dic):
191 |     # 1. transfer id
192 |     df = pd.read_csv(in_file,sep='\t',header=None,names=['id_before','count'])
193 |     df['id_after'] = df['id_before'].map(lambda x: gene_id_name_dic[x.split('.')[0]] if x.split('.')[0] in gene_id_name_dic else x.split('.')[0])
194 |     # 2. output
195 |     df[['id_after','count']].to_csv(out_file,sep='\t',header=None,index=False)
196 | 
197 | 
198 | 
199 | 
200 | 
201 | 
202 | 
203 | 
204 | 
--------------------------------------------------------------------------------
/Modules/GATK.py:
--------------------------------------------------------------------------------
1 | import sarge
2 | import sys,re
3 | 
4 | def RealignerTargetCreator(dedupbam,interval,gatk,ref_fa,thread,gold_indels=['']):
5 |     '''This function creates the interval file for realignment.
6 |     Input is a deduplicated, sorted bam file; the reference is a
7 |     fasta file.
8 |     '''
9 |     cmd = ('java -jar {gatk} -T RealignerTargetCreator '
10 |            '-R {ref_fa} -I {dedup} -o {output} -nt {thread} ').format(
11 |             gatk=gatk,ref_fa=ref_fa,dedup=dedupbam,output=interval,
12 |             thread=str(thread))
13 |     if gold_indels != ['']:
14 |         gold_indels = ['-known ' + f for f in gold_indels]
15 |         cmd = cmd + ' '.join(gold_indels)
16 |     print(cmd);sys.stdout.flush()
17 |     sarge.run(cmd)
18 | 
19 | 
20 | def IndelRealigner(dedupBam,realiBam,gatk,ref_fa,interval,gold_indels=['']):
21 |     '''This function realigns the deduped bam file around the target intervals;
22 |     the reference is a fasta file, target is the target interval file.
23 |     '''
24 |     cmd = ('java -jar {gatk} -T IndelRealigner -R {ref_fa} '
25 |            '-I {input} -targetIntervals {target} '
26 |            '-o {output} ').format(gatk=gatk,ref_fa=ref_fa,
27 |             input=dedupBam,target=interval,output=realiBam)
28 |     if gold_indels != ['']:
29 |         gold_indels = ['-known ' + f for f in gold_indels]
30 |         cmd = cmd + ' '.join(gold_indels)
31 |     print(cmd);sys.stdout.flush()
32 |     sarge.run(cmd)
33 | 
34 | 
35 | def HaplotypeCaller_DNA_gVCF(recalBam,vcf,gatk,ref_fa,thread,otherParameters=[]):
36 |     '''
37 |     this function calls variants and stores the result
38 |     in a gVCF file.
39 |     '''
40 |     cmd = ('java -jar {gatk} -T HaplotypeCaller -R {ref_fa} -I {input} '
41 |            '--emitRefConfidence GVCF -o {output} -nct {t}').format(
42 |             gatk=gatk,ref_fa=ref_fa,input=recalBam,output=vcf,t=str(thread))
43 |     print(cmd);sys.stdout.flush()
44 |     sarge.run(cmd)
45 | 
46 | 
47 | def JointGenotype(raw_vcfs,gvcf,gatk,ref_fa,thread):
48 |     ''' Joint-genotype all the per-sample gVCF files into one vcf file
49 |     '''
50 |     vcfs = ['--variant '+ f for f in raw_vcfs]
51 |     cmd = ('java -Xmx100g -jar {gatk} -T GenotypeGVCFs -R {ref_fa} {vcf} '
52 |            '-o {out} -nt {thread}').format(gatk=gatk,ref_fa=ref_fa,
53 |             vcf=' '.join(vcfs),out=gvcf,thread=str(thread))
54 |     print(cmd);sys.stdout.flush()
55 |     sarge.run(cmd)
56 | 
57 | 
58 | def SelectVariants(joint_variant,out_vcf,gatk,reference,extract_type,thread):
59 |     """this function extracts either SNPs or indels from the
60 |     vcf file. 
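    A minimal usage sketch (not from the original code; file names are hypothetical):
        SelectVariants('round1.g.vcf','round1.snp.vcf','GenomeAnalysisTK.jar','genome.fa','SNP',4)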
61 | """ 62 | cmd = ('java -jar {gatk} -T SelectVariants -R {ref_fa} -V {input} ' 63 | '-selectType {type} -o {output} -nt {thread}').format(gatk=gatk, 64 | ref_fa=reference,input=joint_variant,type=extract_type, 65 | output=out_vcf,thread=str(thread)) 66 | print(cmd);sys.stdout.flush() 67 | sarge.run(cmd) 68 | 69 | 70 | def snpHardFilter(snp_vcf,out_vcf,gatk,ref_fa): 71 | """ 72 | this function will filter the snps, output a gold standard snp database 73 | """ 74 | filtercmd = ('QD < 3.0 || FS > 50.0 || MQ < 50.0 || HaplotypeScore > 10.0 ' 75 | '|| MappingQualityRankSum < -12.5 || ReadPosRankSum < -8.0') 76 | filtercmd = """'{filter}'""".format(filter=filtercmd) 77 | filtername = """'snp_filter'""" 78 | cmd = ('java -jar {gatk} -T VariantFiltration -R {ref_fa} -V {input} ' 79 | '--filterExpression {filter} --filterName {filtername} ' 80 | '-o {output}').format(gatk=gatk,ref_fa=ref_fa,input=snp_vcf, 81 | filter = filtercmd,filtername=filtername,output=out_vcf) 82 | print(cmd);sys.stdout.flush() 83 | sarge.run(cmd) 84 | 85 | 86 | def indelHardFilter(indel_file,out_vcf,gatk,ref_fa): 87 | """ 88 | this function filter the indels,output a gold standard indel database 89 | """ 90 | filtercmd = ("QD < 2.0 || FS > 200.0 || ReadPosRankSum < -15.0") 91 | filtercmd = """'{filter}'""".format(filter=filtercmd) 92 | filtername = """'indel_filter'""" 93 | cmd = ('java -jar {gatk} -T VariantFiltration -R {ref_fa} -V {input} ' 94 | '--filterExpression {filter} --filterName {filtername} ' 95 | '-o {output}').format(gatk=gatk,ref_fa=ref_fa,input=indel_file, 96 | filter = filtercmd,filtername=filtername,output=out_vcf) 97 | print(cmd);sys.stdout.flush() 98 | sarge.run(cmd) 99 | 100 | 101 | def HardFilter(raw_gvcf,gold_snp_indel,gatk,ref_fa,thread): 102 | """ 103 | this function will apply artificial filter for snp and indel 104 | """ 105 | snp_filter = re.sub('g\.vcf$','snp.vcf',raw_gvcf) 106 | indel_filter = re.sub('g\.vcf$','indel.vcf',raw_gvcf) 107 | SelectVariants(raw_gvcf,snp_filter,gatk,ref_fa,'SNP',str(thread)) 108 | SelectVariants(raw_gvcf,indel_filter,gatk,ref_fa,'INDEL',str(thread)) 109 | snpHardFilter(snp_filter,gold_snp_indel[0],gatk,ref_fa) 110 | indelHardFilter(indel_filter,gold_snp_indel[1],gatk,ref_fa) 111 | 112 | 113 | def BaseRecalibrator_1(realiBam,table,gold_pair,gatk,ref_fa,thread): 114 | '''Step 1 of base recalibration. 
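    Builds a recalibration table from the known snp/indel sites in gold_pair
    ([gold_snp_vcf, gold_indel_vcf]). A minimal usage sketch (file names are hypothetical):
        BaseRecalibrator_1('s1.reali.bam','s1.table',['gold_snp.vcf','gold_indel.vcf'],'GenomeAnalysisTK.jar','genome.fa',4)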
115 | ''' 116 | cmd = ('java -jar {gatk} -T BaseRecalibrator -R {ref_fa} ' 117 | '-I {realignbam} -knownSites {snp} -knownSites {indel} ' 118 | '-o {output} -nct {thread}').format(gatk=gatk,ref_fa=ref_fa, 119 | realignbam=realiBam,snp=gold_pair[0],indel=gold_pair[1], 120 | output=table,thread=str(thread)) 121 | print(cmd);sys.stdout.flush() 122 | sarge.run(cmd) 123 | 124 | 125 | def BaseRecalibrator_2(realiBam,post_table,table,gold_pair,gatk,ref_fa,thread): 126 | '''Step 2 of base recalibration: get post table''' 127 | cmd = ('java -jar {gatk} -T BaseRecalibrator -R {ref_fa} ' 128 | '-I {realignbam} -knownSites {snp} -knownSites {indel} -BQSR {table} ' 129 | '-o {output} -nct {thread}').format(gatk=gatk,ref_fa=ref_fa, 130 | realignbam=realiBam,snp=gold_pair[0],indel=gold_pair[1],output=post_table,table=table,thread=str(thread)) 131 | print(cmd);sys.stdout.flush() 132 | sarge.run(cmd) 133 | 134 | 135 | def BaseRecalibrator_3(table,plot,post_table,gatk,ref_fa): 136 | '''Step 3 of base recalibration: compare table and post table 137 | ''' 138 | cmd = ('java -jar {gatk} -T AnalyzeCovariates -R {ref_fa} ' 139 | '-before {table} -after {post_table} -plots {output}.pdf').format( 140 | gatk=gatk,ref_fa=ref_fa,table=table,post_table=post_table,output=plot) 141 | print(cmd);sys.stdout.flush() 142 | sarge.run(cmd) 143 | 144 | 145 | def BaseRecalibrator_4(realiBam,recalBam,gatk,ref_fa,gold_pair,table,thread): 146 | ''' Step 4 of base recalibration: recalibrate the base quality. 147 | ''' 148 | cmd = ('java -jar {gatk} -T PrintReads -R {ref_fa} -I {input} -BQSR {table} ' 149 | '-o {output} -nct {thread}').format(gatk=gatk, 150 | ref_fa=ref_fa,input=realiBam,table=table,output=recalBam, 151 | thread=str(thread)) 152 | print(cmd);sys.stdout.flush() 153 | sarge.run(cmd) 154 | 155 | 156 | def CombineSNPandINDEL(vcfFiles,outvcf,gatk,ref_fa,otherParams=[]): 157 | """ 158 | This function combines the vcf files. 
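    (a thin wrapper around GATK's CombineVariants walker; merge behaviour can be
    tuned through otherParams, e.g. '--genotypemergeoption UNSORTED')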
159 | * gatk: gatk software pathway 160 | * ref_fa: reference genome fasta file 161 | * variantFiles: a list of vcf files that need to be combined 162 | * argus: additional argument 163 | """ 164 | variCmd = ' '.join(['-V '+vcf for vcf in vcfFiles]) 165 | other = ' '.join(otherParams) 166 | cmd = ('java -jar {gatk} -R {ref_fa} -T CombineVariants ' 167 | '{varis} -o {outputVcf} {other}').format(gatk=gatk,ref_fa=ref_fa, 168 | varis=variCmd,outputVcf=outvcf,other=other) 169 | print(cmd);sys.stdout.flush() 170 | sarge.run(cmd) 171 | 172 | 173 | #=============================================================================== 174 | # RNA part functions 175 | #=============================================================================== 176 | def splitN(dedupBam,splitBam,gatk,ref_fa): 177 | '''This function splits reads due to wrong splicng by STAR''' 178 | cmd = ('java -jar {gatk} -T SplitNCigarReads -R {ref_fa} ' 179 | '-I {input} -o {output} -rf ReassignOneMappingQuality ' 180 | '-RMQF 255 -RMQT 60 -U ALLOW_N_CIGAR_READS').format( 181 | gatk=gatk,ref_fa=ref_fa,input=dedupBam,output=splitBam) 182 | print(cmd);sys.stdout.flush() 183 | sarge.run(cmd) 184 | 185 | 186 | def HaplotypeCaller_RNA_VCF(recalBam,vcf,gatk,ref_fa,thread='1'): 187 | """ 188 | This function calls variants in RNAseq 189 | """ 190 | cmd = ('java -jar {gatk} -T HaplotypeCaller -R {ref_fa} ' 191 | '-I {input} -dontUseSoftClippedBases ' 192 | '-stand_call_conf 20.0 -stand_emit_conf 20.0 -o {output} -nct {thread}').format( 193 | gatk=gatk,ref_fa=ref_fa,input=recalBam,output=vcf,thread=str(thread)) 194 | print(cmd);sys.stdout.flush() 195 | sarge.run(cmd) 196 | 197 | 198 | def RNA_Vari_Filter(vcf,filterVCF,gatk,ref_fa): 199 | """ 200 | This function filter out the results of the vari call 201 | """ 202 | FS = """'FS > 30.0'""" 203 | QD = """'QD < 2.0'""" 204 | cmd = ('java -jar {gatk} -T VariantFiltration -R {ref_fa} ' 205 | '-V {input} -window 35 -cluster 3 -filterName FS ' 206 | '-filter {FS} -filterName QD -filter {QD} ' 207 | '-o {output}').format(gatk=gatk,ref_fa=ref_fa, 208 | input=vcf,FS=FS,QD=QD,output=filterVCF) 209 | print(cmd);sys.stdout.flush() 210 | sarge.run(cmd) 211 | 212 | 213 | def RNA_BaseRecalibrator_1(realiBam,table,gatk,ref_fa,gold_vcf,thread='1'): 214 | '''step 1 of base recalibration,generate a table''' 215 | cmd = ('java -jar {gatk} -T BaseRecalibrator -R {ref_fa} ' 216 | '-I {realignbam} -knownSites {gold} ' 217 | '-o {output} -nct {thread}').format(gatk=gatk,ref_fa=ref_fa, 218 | realignbam=realiBam,gold=gold_vcf,output=table,thread=str(thread)) 219 | print(cmd);sys.stdout.flush() 220 | sarge.run(cmd) 221 | 222 | 223 | def RNA_BaseRecalibrator_2(realiBam,post_table,table,gatk,ref_fa,gold_vcf,thread='1'): 224 | '''Step 2 of base recalibration,generate post table''' 225 | cmd = ('java -jar {gatk} -T BaseRecalibrator -R {ref_fa} ' 226 | '-I {realignbam} -knownSites {gold} -BQSR {table} ' 227 | '-o {output} -nct {thread}').format(gatk=gatk, 228 | ref_fa=ref_fa,realignbam=realiBam,gold=gold_vcf, 229 | output=post_table,table=table,thread=str(thread)) 230 | print(cmd);sys.stdout.flush() 231 | sarge.run(cmd) 232 | 233 | 234 | def RNA_BaseRecalibrator3(table,plot,post_table,gatk,ref_fa): 235 | '''Step 3 of base recalibration, compare the two tables''' 236 | cmd = ('java -jar {gatk} -T AnalyzeCovariates -R {ref_fa} ' 237 | '-before {table} -after {post_table} -plots {output}').format( 238 | gatk=gatk,ref_fa=ref_fa,table=table,post_table=post_table,output=plot) 239 | print(cmd);sys.stdout.flush() 240 | 
sarge.run(cmd) 241 | 242 | 243 | def RNA_BaseRecalibrator4(realiBam,recalBam,gatk,table,ref_fa,gold_vcf,thread='1'): 244 | '''Step 4 of base recalibration''' 245 | cmd = ('java -jar {gatk} -T PrintReads -R {ref_fa} ' 246 | '-I {input} -BQSR {table} -o {output} -nct {thread}').format(gatk=gatk, 247 | ref_fa=ref_fa,input=realiBam,table=table,output=recalBam,thread=str(thread)) 248 | print(cmd);sys.stdout.flush() 249 | sarge.run(cmd) 250 | 251 | 252 | -------------------------------------------------------------------------------- /GATK_RNA_CHO.py: -------------------------------------------------------------------------------- 1 | from ruffus import * 2 | from Modules.f01_file_process import * 3 | from Modules.Aligner import STAR_Db,STAR 4 | from Modules.Trimmomatic import Trimmomatic 5 | from Modules.Samtools import build_fa_index,merge_bams 6 | from Modules.Picard import build_fa_dict,mark_duplicates,add_readgroup 7 | from Modules.GATK import * 8 | import yaml 9 | import shutil 10 | import glob 11 | 12 | 13 | #============ parameters ====================== 14 | parameter_file = sys.argv[1] 15 | #parameter_file = '/data/shangzhong/DE/ercc/GATK_RNA_CHO.yaml' 16 | with open(parameter_file,'r') as f: 17 | doc = yaml.load(f) 18 | p = dic2obj(**doc) 19 | #------------- get parameters ----------- 20 | file_path = p.RawDataPath 21 | thread = p.thread 22 | # all parameter 23 | ref_fa = p.ref_fa 24 | # trimmomatic parameter 25 | trim = p.trim_reads 26 | trimmomatic = p.trimmomatic_path 27 | trim_batch = p.trim_jobs_per_batch 28 | adapter = p.adapter 29 | 30 | picard = p.picard 31 | gatk = p.gatk 32 | 33 | star_batch = p.star_jobs_per_batch 34 | star_db = p.star_index 35 | 36 | sp = p.sample_name 37 | read_groups = p.read_groups 38 | 39 | contact = p.contact 40 | #=============================================================================== 41 | # Pipeline part 42 | #=============================================================================== 43 | Message('GATK_RNA_CHO start',contact) 44 | os.chdir(file_path) 45 | #=============================================================================== 46 | # Part I. Preprocess 47 | #=============================================================================== 48 | #--------------------- 1. build index for fa file using samtools and GATK ------------------ 49 | dict_file = '.'.join(ref_fa.split('.')[:-1]) + '.dict' 50 | fai_file = ref_fa + '.fai' 51 | if not os.path.exists(dict_file): build_fa_dict(ref_fa,picard) 52 | if not os.path.exists(fai_file): build_fa_index(ref_fa) 53 | #--------------------- 2. read all files ------------------------------------------------ 54 | fastqFiles = list_fq_files(file_path) 55 | if fastqFiles[0][0].startswith('trim_'): 56 | trim = False 57 | def trim_parameters(): 58 | infiles,outfiles = replace_filename(fastqFiles,'^','trim_') 59 | for infile, output in zip(infiles,outfiles): 60 | yield infile,output 61 | #--------------------- 3. trim reads----------------------------------------------------- 62 | @active_if(trim) 63 | @jobs_limit(trim_batch) 64 | @files(trim_parameters) 65 | def trim_reads(input_file,output_file): 66 | n = num_thread2use(trim_batch,len(fastqFiles),thread) 67 | Trimmomatic(input_file,output_file,trimmomatic,n,adapter) 68 | remove(input_file) 69 | #--------------------- 4. 
Map with STAR ----------------------------------------------------- 70 | # build index 71 | @active_if(not os.path.exists(star_db)) 72 | @follows(trim_reads) 73 | def star_index(): 74 | STAR_Db(star_db,ref_fa,thread) 75 | # align 76 | if trim == True: 77 | @jobs_limit(star_batch) 78 | @follows(trim_reads,star_index) 79 | @mkdir(fastqFiles,formatter(),'{path[0]}/f01_bam') 80 | @check_if_uptodate(check_file_exists) 81 | @transform(trim_reads,formatter('.*\.f.*?\.gz'),'f01_bam/{basename[0]}.sort.bam') 82 | def run_star(input_file,output_file): 83 | n = num_thread2use(star_batch,len(fastqFiles),thread) 84 | STAR(input_file,output_file,star_db,n,'',['--outSAMtype BAM','SortedByCoordinate','--twopassMode Basic']) 85 | else: 86 | @jobs_limit(star_batch) 87 | @follows(star_index) 88 | @mkdir(fastqFiles,formatter(),'{path[0]}/f01_bam') 89 | @check_if_uptodate(check_file_exists) 90 | @transform(fastqFiles,formatter('.*\.f.*?\.gz'),'f01_bam/{basename[0]}.sort.bam') 91 | def run_star(input_file,output_file): 92 | n = num_thread2use(star_batch,len(fastqFiles),thread) 93 | STAR(input_file,output_file,star_db,n,'',['--outSAMtype BAM','SortedByCoordinate','--twopassMode Basic']) 94 | #--------------------- 5. add read group -------------------------------------------------- 95 | def get_bam_and_rg(): 96 | bams = [f for f in os.listdir('f01_bam') if f.endswith('.sort.bam')] 97 | bams = natsorted(bams) 98 | for bam, rg in zip(bams,read_groups): 99 | output = re.sub('\.sort\.bam','.adrg.bam',bam) 100 | yield ['f01_bam/'+bam,rg],'f02_addGroup/' + output 101 | @jobs_limit(trim_batch*2) 102 | @follows(run_star,mkdir('tmp'),mkdir('f02_addGroup')) 103 | @files(get_bam_and_rg) 104 | @check_if_uptodate(check_file_exists) 105 | def run_add_group(input_file,output_file): 106 | add_readgroup(input_file[0],output_file,input_file[1],picard) 107 | @follows(run_add_group) 108 | def remove_bam(): 109 | if os.path.exists('f01_bam'):shutil.rmtree('f01_bam') 110 | #--------------------- 6. Markduplicates using picard ------------------------------------- 111 | @jobs_limit(trim_batch) 112 | @follows(run_add_group,remove_bam) 113 | @mkdir(fastqFiles,formatter(),'{path[0]}/f03_dedupBam') 114 | @transform(run_add_group,formatter('.*\.adrg\.bam'),'f03_dedupBam/{basename[0]}.dedup.bam') 115 | @check_if_uptodate(check_file_exists) 116 | def run_mark_duplicates(input_file,output_file): 117 | mark_duplicates(input_file,output_file,picard) 118 | @follows(run_mark_duplicates) 119 | def remove_groupBam(): 120 | if os.path.exists('f02_addGroup'): shutil.rmtree('f02_addGroup') 121 | #--------------------- 7. Split N --------------------------------------------------------- 122 | @follows(run_mark_duplicates,remove_groupBam) 123 | @mkdir(fastqFiles,formatter(),'{path[0]}/f04_splitBam') 124 | @check_if_uptodate(check_file_exists) 125 | @transform(run_mark_duplicates,formatter('.*\.dedup\.bam'),'f04_splitBam/{basename[0]}.split.bam') 126 | def run_splitN(input_file,output_file): 127 | splitN(input_file,output_file,gatk,ref_fa) 128 | @follows(run_splitN) 129 | def remove_dedupBam(): 130 | if os.path.exists('f03_dedupBam'): shutil.rmtree('f03_dedupBam') 131 | #--------------------- 8. 
Indel realignment --------------------- 132 | @jobs_limit(thread) 133 | @follows(run_splitN,remove_dedupBam) 134 | @mkdir(fastqFiles,formatter(),'{path[0]}/f05_indelReali') 135 | @transform(run_splitN,formatter('.*\.split\.bam'),'f05_indelReali/{basename[0]}.reali.bam') 136 | @check_if_uptodate(check_file_exists) 137 | def run_realign(input_file,output_file): 138 | interval = re.sub('reali\.bam$','interval.list',output_file) 139 | RealignerTargetCreator(input_file,interval,gatk,ref_fa,1,gold_indels=['']) 140 | IndelRealigner(input_file,output_file,gatk,ref_fa,interval,gold_indels=['']) 141 | @follows(run_realign) 142 | def remove_splitBam(): 143 | if os.path.exists('f04_splitBam'): shutil.rmtree('f04_splitBam') 144 | #--------------------- 9. Round 1 call ---------------------------------------------- 145 | @jobs_limit(thread) 146 | @follows(run_realign,remove_splitBam) 147 | @mkdir(fastqFiles,formatter(),'{path[0]}/f06_Round1Call') 148 | @transform(run_realign,formatter('.*\.reali\.bam'),'f06_Round1Call/{basename[0]}.vcf') 149 | @check_if_uptodate(check_file_exists) 150 | def round1Vari_call(input_file,output_file): 151 | n = num_thread2use(thread,len(fastqFiles),thread) 152 | HaplotypeCaller_RNA_VCF(input_file,output_file,gatk,ref_fa,n) 153 | #--------------------- 10. filter gold snp and indel --------------------------------- 154 | @follows(round1Vari_call) 155 | @transform(round1Vari_call,suffix('.vcf'),'.gold.vcf') 156 | @check_if_uptodate(check_file_exists) 157 | def run_RNA_Vari_Filter(input_file,output_file): 158 | RNA_Vari_Filter(input_file,output_file,gatk,ref_fa) 159 | #--------------------- 11. Base recalibration ----------------------------------------- 160 | # step 1 161 | @follows(run_RNA_Vari_Filter) 162 | @mkdir(fastqFiles,formatter(),'{path[0]}/f07_BaseRecal') 163 | @transform(run_realign,formatter('.*\.reali\.bam'),'f07_BaseRecal/{basename[0]}.table') 164 | @check_if_uptodate(check_file_exists) 165 | def run_RNA_Baserecalibration_1(input_file,output_file): 166 | gold_vcf = 'f06_Round1Call/'+re.sub('\.bam$','.gold.vcf',input_file).split('/')[-1] 167 | RNA_BaseRecalibrator_1(input_file,output_file,gatk,ref_fa,gold_vcf,thread='1') 168 | # step 2 169 | @follows(run_RNA_Baserecalibration_1) 170 | @transform(run_realign,formatter('.*\.reali\.bam'),'f07_BaseRecal/{basename[0]}.post_table') 171 | @check_if_uptodate(check_file_exists) 172 | def run_RNA_Baserecalibration_2(input_file,output_file): 173 | table = re.sub('post_table$','table',output_file) 174 | gold_vcf = 'f06_Round1Call/'+re.sub('\.bam$','.gold.vcf',input_file).split('/')[-1] 175 | RNA_BaseRecalibrator_2(input_file,output_file,table,gatk,ref_fa,gold_vcf,thread='1') 176 | # step 3 177 | @follows(run_RNA_Baserecalibration_2) 178 | @transform(run_RNA_Baserecalibration_1,formatter('.+\.table'),'f07_BaseRecal/{basename[0]}.plot.pdf') 179 | @check_if_uptodate(check_file_exists) 180 | def run_RNA_Baserecalibration_3(input_file,output_file): 181 | post_table = re.sub('\.table$','.post_table',input_file) 182 | RNA_BaseRecalibrator3(input_file,output_file,post_table,gatk,ref_fa) 183 | # step 4 184 | @jobs_limit(trim_batch) 185 | @follows(run_RNA_Baserecalibration_3) 186 | @transform(run_realign,formatter('.*\.reali\.bam'),'f07_BaseRecal/{basename[0]}.recal.bam') 187 | @check_if_uptodate(check_file_exists) 188 | def run_RNA_Baserecalibration_4(input_file,output_file): 189 | table = re.sub('\.recal\.bam','.table',output_file) 190 | gold_vcf = 'f06_Round1Call/'+re.sub('\.bam$','.gold.vcf',input_file).split('/')[-1] 191 | n = 
num_thread2use(trim_batch,len(fastqFiles),thread) 192 | RNA_BaseRecalibrator4(input_file,output_file,gatk,table,ref_fa,gold_vcf,n) 193 | @follows(run_RNA_Baserecalibration_4) 194 | def remove_realiBam(): 195 | if os.path.exists('f05_indelReali'): shutil.rmtree('f05_indelReali') 196 | #--------------------- 12. merge lanes for the same sample ----------------------------------------- 197 | def get_group_bam(): 198 | readic = {} 199 | bamfiles = natsorted(glob.glob('f07_BaseRecal/*.recal.bam')) 200 | for rg,bam in zip(read_groups,bamfiles): 201 | start = rg.index('SM:') 202 | sample = rg[start+3:] 203 | if sample in readic: 204 | readic[sample].append(bam) 205 | else: 206 | readic[sample] = [bam] 207 | for sp in readic: 208 | output_file = 'f08_mergeBam/' + sp + '.merge.bam' 209 | input_file = readic[sp] 210 | yield input_file,output_file 211 | @follows(run_RNA_Baserecalibration_4,remove_realiBam) 212 | @mkdir(fastqFiles,formatter(),'{path[0]}/f08_mergeBam') 213 | @files(get_group_bam) 214 | @check_if_uptodate(check_file_exists) 215 | def mergeBam(input_files,output_file): 216 | merge_bams(input_files,output_file) 217 | #--------------------- 13. Mark duplicates for merged file --------------------------------------- 218 | @jobs_limit(trim_batch) 219 | @follows(mergeBam) 220 | @mkdir(fastqFiles,formatter(),'{path[0]}/f09_dedupBam2') 221 | @transform(mergeBam,formatter('.*\.merge\.bam'),'f09_dedupBam2/{basename[0]}.dedup.bam') 222 | @check_if_uptodate(check_file_exists) 223 | def markduplicates2(input_file,output_file): 224 | mark_duplicates(input_file,output_file,picard) 225 | @follows(markduplicates2) 226 | def remove_mergeBam(): 227 | if os.path.exists('f08_mergeBam'): shutil.rmtree('f08_mergeBam') 228 | #--------------------- 14. Indel realignment ----------------------------------------------------- 229 | @jobs_limit(thread) 230 | @follows(markduplicates2,remove_mergeBam) 231 | @mkdir(fastqFiles,formatter(),'{path[0]}/f10_indelReali2') 232 | @transform(markduplicates2,formatter('.*\.dedup\.bam'),'f10_indelReali2/{basename[0]}.reali.bam') 233 | @check_if_uptodate(check_file_exists) 234 | def run_realign2(input_file,output_file): 235 | interval = re.sub('reali\.bam$','interval.list',output_file) 236 | RealignerTargetCreator(input_file,interval,gatk,ref_fa,1,gold_indels=['']) 237 | IndelRealigner(input_file,output_file,gatk,ref_fa,interval,gold_indels=['']) 238 | @follows(run_realign2) 239 | def remove_dedupBam2(): 240 | if os.path.exists('f09_dedupBam2'): shutil.rmtree('f09_dedupBam2') 241 | #--------------------- 15. Round 2 call ---------------------------------------------- 242 | @follows(run_realign2,remove_dedupBam2) 243 | @mkdir(fastqFiles,formatter(),'{path[0]}/f11_Round2Call') 244 | @transform(run_realign2,formatter('.*\.reali\.bam'),'f11_Round2Call/{basename[0]}.vcf') 245 | @check_if_uptodate(check_file_exists) 246 | def run_round2Vari_call(input_file,output_file): 247 | n = num_thread2use(thread,len(fastqFiles),thread) 248 | HaplotypeCaller_RNA_VCF(input_file,output_file,gatk,ref_fa,n) 249 | #--------------------- 16. 
filter final vcf --------------------------------- 250 | @jobs_limit(thread) 251 | @follows(run_round2Vari_call) 252 | @mkdir(fastqFiles,formatter(),'{path[0]}/f12_FinalVcf') 253 | @transform(run_round2Vari_call,formatter('.*\.vcf'), 'f12_FinalVcf/{basename[0]}.merged.filter.vcf') 254 | @check_if_uptodate(check_file_exists) 255 | def run_filter_2(input_file,output_file): 256 | output_file = output_file.split('.')[0] + '.merged.filter.vcf' 257 | RNA_Vari_Filter(input_file,output_file,gatk,ref_fa) 258 | 259 | @follows(run_filter_2) 260 | def last_function(): 261 | Message('GATK_RNA_CHO succeed',contact) 262 | 263 | if __name__ == '__main__': 264 | try: 265 | # pipeline_printout(sys.stdout, [last_function], verbose=3) 266 | pipeline_run([last_function],multiprocess=thread,gnu_make_maximal_rebuild_mode = True, 267 | touch_files_only=False) 268 | except: 269 | Message('GATK_RNA_CHO failed',contact) 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | -------------------------------------------------------------------------------- /GATK_DNA_CHO.py: -------------------------------------------------------------------------------- 1 | from ruffus import * 2 | from Modules.f01_file_process import * 3 | from Modules.Aligner import bwa_Db,bwa_mem 4 | from Modules.Trimmomatic import Trimmomatic 5 | from Modules.Samtools import sortBam,build_fa_index,merge_bams 6 | from Modules.Picard import build_fa_dict,mark_duplicates 7 | from Modules.GATK import * 8 | import yaml 9 | import sys,shutil 10 | import glob 11 | from natsort import natsorted 12 | 13 | 14 | #============ parameters ====================== 15 | parameter_file = sys.argv[1] 16 | #parameter_file = '/data/shangzhong/DNArepair/fq/GATK_DNA_CHO.yaml' 17 | with open(parameter_file,'r') as f: 18 | doc = yaml.load(f) 19 | p = dic2obj(**doc) 20 | #------------- get parameters ----------- 21 | file_path = p.RawDataPath 22 | thread = p.thread 23 | # all parameter 24 | ref_fa = p.ref_fa 25 | # trimmomatic parameter 26 | trim = p.trim_reads 27 | trimmomatic = p.trimmomatic_path 28 | trim_batch = p.trim_jobs_per_batch 29 | adapter = p.adapter 30 | 31 | QC = p.QC 32 | picard = p.picard 33 | gatk = p.gatk 34 | 35 | bwa_batch = p.bwa_jobs_per_batch 36 | bwa_db = p.bwa_db 37 | 38 | sp = p.sample_name 39 | read_groups = p.read_groups 40 | 41 | contact = p.contact 42 | #=============================================================================== 43 | # Pipeline part 44 | #=============================================================================== 45 | Message('GATK_DNA_start',contact) 46 | os.chdir(file_path) 47 | #=============================================================================== 48 | # Part I. Preprocess 49 | #=============================================================================== 50 | #--------------------- 1. build index for fa file using samtools and GATK ------------------ 51 | dict_file = '.'.join(ref_fa.split('.')[:-1]) + '.dict' 52 | fai_file = ref_fa + '.fai' 53 | if not os.path.exists(dict_file): build_fa_dict(ref_fa,picard) 54 | if not os.path.exists(fai_file): build_fa_index(ref_fa) 55 | #--------------------- 2. 
read all files ------------------------------------------------ 56 | fastqFiles = list_fq_files(file_path) 57 | if fastqFiles[0][0].startswith('trim_'): 58 | trim = False 59 | def trim_parameters(): 60 | infiles,outfiles = replace_filename(fastqFiles,'^','trim_') 61 | for infile, output in zip(infiles,outfiles): 62 | yield infile,output 63 | #--------------------- run fastqc before trimming ----------- 64 | @active_if(QC) 65 | @jobs_limit(thread) 66 | @mkdir(fastqFiles,formatter(),'{path[0]}/fastqc') 67 | @files(trim_parameters) 68 | def run_QC1(input_file,output_file): 69 | for fq in input_file: 70 | sarge.run('fastqc {input} -o fastqc'.format(input=fq)) 71 | #---------------------3. trim file ------------------ 72 | @active_if(trim) 73 | @follows(run_QC1) 74 | @jobs_limit(trim_batch) 75 | @files(trim_parameters) 76 | def trim_reads(input_file,output_file): 77 | n = num_thread2use(trim_batch,len(fastqFiles),thread) 78 | Trimmomatic(input_file,output_file,trimmomatic,n,adapter) 79 | remove(input_file) 80 | #--------------------- 4. Map with bwa ----------------------------------------------------- 81 | def get_fq_and_readgroup(): 82 | fqFiles = list_fq_files(file_path) 83 | for fq, rg in zip(fqFiles,read_groups): 84 | out = 'f01_bam/' + re.sub('\.f.*q\.gz','.bam',fq[0]) 85 | yield fq,out,rg 86 | # build index 87 | @active_if(not os.path.exists(bwa_db)) 88 | @follows(trim_reads,run_QC1) 89 | def bwa_index(): 90 | bwa_Db(bwa_db,ref_fa) 91 | # align 92 | @jobs_limit(bwa_batch) 93 | @follows(bwa_index,trim_reads,run_QC1) 94 | @mkdir(fastqFiles,formatter(),'{path[0]}/f01_bam') 95 | @files(get_fq_and_readgroup) 96 | def run_bwa(input_file,output_file,rg): 97 | n = num_thread2use(bwa_batch,len(fastqFiles),thread) 98 | bwa_mem(input_file,output_file,bwa_db+'/bwa',n,otherParameters=['-R '+rg+'\\\\tPL:illumina\\\\tLB:lib20000\\\\tPU:unit1']) 99 | #--------------------- 5. Sort bam file -------------------------------------------------- 100 | @jobs_limit(trim_batch) 101 | @follows(run_bwa) 102 | @mkdir(fastqFiles,formatter(),'{path[0]}/f02_sortBam') 103 | @transform(run_bwa,formatter('.*\.bam'),'f02_sortBam/{basename[0]}.sort.bam') 104 | @check_if_uptodate(check_file_exists) 105 | def sort_by_pos(input_file,output_file): 106 | n = num_thread2use(trim_batch,len(fastqFiles),thread) 107 | sortBam(input_file,output_file,n,sortType='pos') 108 | @follows(sort_by_pos) 109 | def remove_bam(): 110 | if os.path.exists('f01_bam'): shutil.rmtree('f01_bam') # remove bam folder 111 | #--------------------- 6. Markduplicates using picard ------------------------------------- 112 | @jobs_limit(trim_batch) 113 | @follows(remove_bam) 114 | @mkdir(fastqFiles,formatter(),'{path[0]}/f03_dedupBam') 115 | @transform(sort_by_pos,formatter('.*\.sort\.bam'),'f03_dedupBam/{basename[0]}.dedup.bam') 116 | @check_if_uptodate(check_file_exists) 117 | def markduplicates(input_file,output_file): 118 | mark_duplicates(input_file,output_file,picard) 119 | @follows(markduplicates) 120 | def remove_sortBam(): 121 | if os.path.exists('f02_sortBam'): shutil.rmtree('f02_sortBam') 122 | #--------------------- 7. 
Indel realignment --------------------- 123 | @jobs_limit(thread) 124 | @follows(remove_sortBam) 125 | @mkdir(fastqFiles,formatter(),'{path[0]}/f04_indelReali') 126 | @transform(markduplicates,formatter('.*\.dedup\.bam'),'f04_indelReali/{basename[0]}.reali.bam') 127 | @check_if_uptodate(check_file_exists) 128 | def Realign(input_file,output_file): 129 | interval = re.sub('reali\.bam$','interval.list',output_file) 130 | RealignerTargetCreator(input_file,interval,gatk,ref_fa,1,gold_indels=['']) 131 | IndelRealigner(input_file,output_file,gatk,ref_fa,interval,gold_indels=['']) 132 | @follows(Realign) 133 | def remove_dedupBam(): 134 | if os.path.exists('f03_dedupBam'): shutil.rmtree('f03_dedupBam') 135 | #--------------------- 8. Round 1 call ---------------------------------------------- 136 | @jobs_limit(thread) 137 | @follows(remove_dedupBam) 138 | @mkdir(fastqFiles,formatter(),'{path[0]}/f05_Round1Call') 139 | @transform(Realign,formatter('.*\.reali\.bam'),'f05_Round1Call/{basename[0]}.raw.g.vcf') 140 | @check_if_uptodate(check_file_exists) 141 | def round1Vari_call(input_file,output_file): 142 | n = num_thread2use(thread,len(fastqFiles),thread) 143 | HaplotypeCaller_DNA_gVCF(input_file,output_file,gatk,ref_fa,n,otherParameters=[]) 144 | #--------------------- 9. Merge raw vcf ---------------------------------------------- 145 | @follows(round1Vari_call) 146 | @merge(round1Vari_call,'f05_Round1Call/round1.g.vcf') 147 | @check_if_uptodate(check_file_exists) 148 | def merge_vcf(input_files,output_file): 149 | JointGenotype(input_files,output_file,gatk,ref_fa,thread) 150 | #--------------------- 10. filter gold snp and indel --------------------------------- 151 | @follows(merge_vcf) 152 | @transform(merge_vcf,suffix('.g.vcf'),['.gold_snp.vcf','.gold_indel.vcf']) 153 | # @check_if_uptodate(check_file_exists) 154 | def hard_filter(input_file,output_pair): 155 | HardFilter(input_file,output_pair,gatk,ref_fa,thread) 156 | @follows(hard_filter) 157 | def remove_vcf(): 158 | if os.path.exists('f05_Round1Call'): 159 | for f in glob.glob('f05_Round1Call/*'): 160 | if 'gold' not in f: 161 | os.remove(f) 162 | #--------------------- 11. 
Base recalibration ----------------------------------------- 163 | # step 1 164 | @follows(remove_vcf) 165 | @mkdir(fastqFiles,formatter(),'{path[0]}/f06_BaseRecal') 166 | @transform(Realign,formatter('.*\.reali\.bam'),add_inputs(hard_filter),'f06_BaseRecal/{basename[0]}.table') 167 | @check_if_uptodate(check_file_exists) 168 | def Baserecalibration_1(input_file,output_file): 169 | n = num_thread2use(thread,len(fastqFiles),thread) 170 | BaseRecalibrator_1(input_file[0],output_file,input_file[1],gatk,ref_fa,thread=str(n)) 171 | # step 2 172 | @follows(Baserecalibration_1) 173 | @transform(Realign,formatter('.*\.reali\.bam'),add_inputs(hard_filter),'f06_BaseRecal/{basename[0]}.post_table') 174 | @check_if_uptodate(check_file_exists) 175 | def Baserecalibration_2(input_file,output_file): 176 | bam = input_file[0].split('/')[-1] 177 | table = 'f06_BaseRecal/' + re.sub('\.reali\.bam$','.reali.table',bam) 178 | n = num_thread2use(thread,len(fastqFiles),thread) 179 | BaseRecalibrator_2(input_file[0],output_file,table,input_file[1],gatk,ref_fa,thread=str(n)) 180 | # step 3 181 | @follows(Baserecalibration_2) 182 | @transform(Baserecalibration_1,formatter('.+\.table'),'f06_BaseRecal/{basename[0]}.plot') 183 | @check_if_uptodate(check_file_exists) 184 | def Baserecalibration_3(input_file,output_file): 185 | post_table = re.sub('\.table$','.post_table',input_file) 186 | BaseRecalibrator_3(input_file,output_file,post_table,gatk,ref_fa) 187 | # step 4 188 | @jobs_limit(bwa_batch) 189 | @follows(Baserecalibration_3) 190 | @transform(Realign,formatter('.*\.reali\.bam'),add_inputs(hard_filter),'f06_BaseRecal/{basename[0]}.recal.bam') 191 | @check_if_uptodate(check_file_exists) 192 | def Baserecalibration_4(input_file,output_file): 193 | table = 'f06_BaseRecal/'+re.sub('\.bam','.table',input_file[0].split('/')[1]) 194 | n = num_thread2use(bwa_batch,len(fastqFiles),thread) 195 | BaseRecalibrator_4(input_file[0],output_file,gatk,ref_fa,input_file[1],table,n) 196 | @follows(Baserecalibration_4) 197 | def remove_realiBam(): 198 | if os.path.exists('f04_indelReali'): shutil.rmtree('f04_indelReali') 199 | #--------------------- 12. merge lanes for the same sample ----------------------------------------- 200 | def get_rg_dic(): 201 | readic = {} 202 | bamfiles = natsorted(glob.glob('f06_BaseRecal/*.recal.bam')) 203 | for rg,bam in zip(read_groups,bamfiles): 204 | start = rg.index('SM:') 205 | sample = rg[start+3:] 206 | if sample in readic: 207 | readic[sample].append(bam) 208 | else: 209 | readic[sample] = [bam] 210 | return readic 211 | def get_group_bam(): 212 | readic = get_rg_dic() 213 | for sp in readic: 214 | output_file = 'f07_mergeBam/' + sp + '.merge.bam' 215 | input_file = readic[sp] 216 | yield input_file,output_file 217 | @follows(remove_realiBam) 218 | @mkdir(fastqFiles,formatter(),'{path[0]}/f07_mergeBam') 219 | @files(get_group_bam) 220 | def mergeBam(input_files,output_file): 221 | merge_bams(input_files,output_file) 222 | @follows(mergeBam) 223 | def remove_recalBam(): 224 | if os.path.exists('f06_BaseRecal'): 225 | for f in glob.glob('f06_BaseRecal/*'): 226 | os.remove(f) 227 | if f.endswith('recal.bam'): 228 | handle = open(f,'w') 229 | handle.close() 230 | #--------------------- 13. 
Mark duplicates for merged file --------------------------------------- 231 | @jobs_limit(trim_batch) 232 | @follows(remove_recalBam) 233 | @mkdir(fastqFiles,formatter(),'{path[0]}/f08_dedupBam2') 234 | @transform(mergeBam,formatter('.*\.merge\.bam'),'f08_dedupBam2/{basename[0]}.dedup.bam') 235 | @check_if_uptodate(check_file_exists) 236 | def markduplicates2(input_file,output_file): 237 | mark_duplicates(input_file,output_file,picard) 238 | @follows(markduplicates2) 239 | def remove_mergeBam(): 240 | if os.path.exists('f07_mergeBam'): shutil.rmtree('f07_mergeBam') 241 | #--------------------- 14. Indel realignment ----------------------------------------------------- 242 | @follows(remove_mergeBam) 243 | @mkdir(fastqFiles,formatter(),'{path[0]}/f09_indelReali2') 244 | @transform(markduplicates2,formatter('.*\.dedup\.bam'),'f09_indelReali2/{basename[0]}.reali.bam') 245 | @check_if_uptodate(check_file_exists) 246 | def Realign2(input_file,output_file): 247 | interval = re.sub('reali\.bam$','interval.list',output_file) 248 | n = num_thread2use(len(get_rg_dic().keys()),len(fastqFiles),thread) 249 | RealignerTargetCreator(input_file,interval,gatk,ref_fa,n,gold_indels=['']) 250 | IndelRealigner(input_file,output_file,gatk,ref_fa,interval,gold_indels=['']) 251 | @follows(Realign2) 252 | def remove_dedupBam2(): 253 | if os.path.exists('f08_dedupBam2'): shutil.rmtree('f08_dedupBam2') 254 | #--------------------- 15. Round 2 call ---------------------------------------------- 255 | @follows(remove_dedupBam2) 256 | @mkdir(fastqFiles,formatter(),'{path[0]}/f10_Round2Call') 257 | @transform(Realign2,formatter('.*\.reali\.bam'),'f10_Round2Call/{basename[0]}.raw.g.vcf') 258 | @check_if_uptodate(check_file_exists) 259 | def round2Vari_call(input_file,output_file): 260 | n = num_thread2use(len(get_rg_dic().keys()),len(fastqFiles),thread) 261 | HaplotypeCaller_DNA_gVCF(input_file,output_file,gatk,ref_fa,n,otherParameters=[]) 262 | #--------------------- 16. Merge raw2 vcf --------------------------------------------- 263 | @follows(round2Vari_call) 264 | @merge(round2Vari_call,'f10_Round2Call/round2.g.vcf') 265 | def merge_vcf2(input_files,output_file): 266 | JointGenotype(input_files,output_file,gatk,ref_fa,thread) 267 | #--------------------- 17. filter gold snp and indel --------------------------------- 268 | @follows(merge_vcf2) 269 | @transform(merge_vcf2,suffix('.g.vcf'),['.gold_snp.vcf','.gold_indel.vcf']) 270 | # @check_if_uptodate(check_file_exists) 271 | def hard_filter2(input_file,output_pair): 272 | HardFilter(input_file,output_pair,gatk,ref_fa,thread) 273 | #--------------------- 18. 
combine vcf files ---------------------------------
274 | @follows(hard_filter2)
275 | @mkdir(fastqFiles,formatter(),'{path[0]}/f11_FinalVcf')
276 | @merge(hard_filter2,'f11_FinalVcf/'+sp+'.merged.filter.vcf')
277 | @check_if_uptodate(check_file_exists)
278 | def combine_vcf(input_files,output_file):
279 |     CombineSNPandINDEL(input_files,output_file,gatk,ref_fa,otherParams=['--assumeIdenticalSamples','--genotypemergeoption UNSORTED'])
280 | 
281 | @follows(combine_vcf)
282 | def last_function():
283 |     Message('GATK_DNA_succeed',contact)
284 | 
285 | 
286 | if __name__ == '__main__':
287 |     try:
288 | #         pipeline_printout(sys.stdout, [last_function], verbose=3)
289 |         pipeline_run([last_function],multiprocess=thread,gnu_make_maximal_rebuild_mode = True,
290 |                      touch_files_only=False,verbose=5)
291 |     except:
292 |         Message('GATK_DNA_failed',contact)
293 | 
294 | 
295 | 
296 | 
297 | 
298 | 
299 | 
300 | 
301 | 
302 | 
303 | 
--------------------------------------------------------------------------------
/Eukaryote_genome_annotation.py:
--------------------------------------------------------------------------------
1 | import os,re
2 | from Bio import SeqIO
3 | import sarge
4 | import glob
5 | from natsort import natsorted
6 | import multiprocessing as mp
7 | import pandas as pd
8 | from Bio import Entrez
9 | import sys
10 | 
11 | Entrez.email = 'shl198@eng.ucsd.edu'
12 | 
13 | # database files
14 | ref_fa = '/data/genome/hamster/multi_pacbio_assemble/picr.fa'
15 | rna_fa = '/data/shangzhong/Picr_assembly/Annotation/hamster_rna.fa'
16 | refseq_pr = '/data/shangzhong/Picr_assembly/Annotation/hamster_pr.fa'
17 | hamster_id = '/data/shangzhong/Database/hamster/hamster_all_id.txt'
18 | # pathways
19 | path = '/data/shangzhong/Picr_assembly/Annotation'
20 | organism = 'hamster'
21 | # exonerate parameters
22 | exonerate_path = path + '/exonerate'
23 | pr_gff = exonerate_path + '/exonerate.gff'
24 | # PASA parameters
25 | PASA_path = path + '/PASA'
26 | pasa = '/home/shangzhong/Installation/PASApipeline-2.0.2'
27 | ppl_fn = pasa + '/scripts/Launch_PASA_pipeline.pl'
28 | config = pasa + '/pasa_conf/pasa.alignAssembly.Template.txt'
29 | cmp_config = pasa + '/pasa_conf/pasa.annotationCompare.Template.txt'
30 | load_fn = pasa + '/scripts/Load_Current_Gene_Annotations.dbi'
31 | gff3_validate_fn = pasa + '/misc_utilities/pasa_gff3_validator.pl'
32 | tr_gff = PASA_path + '/picr_db.pasa_assemblies.gff3'
33 | #===============================================================================
34 | # 1. PASA Alignment assembly
35 | #===============================================================================
36 | def align_assemble(ppl_fn,config,ref_fa,rna_fa,thread,otherParameters=['']):
37 |     '''This function does the PASA alignment assembly and
38 |     generates 4 types of files:
39 |     sample_mydb_pasa.assemblies.fasta :the PASA assemblies in FASTA format.
40 |     sample_mydb_pasa.pasa_assemblies.gff3,.gtf,.bed :the PASA assembly structures.
41 |     sample_mydb_pasa.pasa_alignment_assembly_building.ascii_illustrations.out :descriptions
42 |     of alignment assemblies and how they were constructed from the underlying transcript alignments.
43 |     sample_mydb_pasa.pasa_assemblies_described.txt :tab-delimited format describing the contents
44 |     of the PASA assemblies, including the identity of those transcripts that were assembled into the corresponding structure. 
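    A minimal usage sketch (the thread count is arbitrary; the other arguments
    are the globals defined at the top of this script):
        align_assemble(ppl_fn,config,ref_fa,rna_fa,8)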
12 | 
13 | # database files
14 | ref_fa = '/data/genome/hamster/multi_pacbio_assemble/picr.fa'
15 | rna_fa = '/data/shangzhong/Picr_assembly/Annotation/hamster_rna.fa'
16 | refseq_pr = '/data/shangzhong/Picr_assembly/Annotation/hamster_pr.fa'
17 | hamster_id = '/data/shangzhong/Database/hamster/hamster_all_id.txt'
18 | # pathways
19 | path = '/data/shangzhong/Picr_assembly/Annotation'
20 | organism = 'hamster'
21 | # exonerate parameters
22 | exonerate_path = path + '/exonerate'
23 | pr_gff = exonerate_path + '/exonerate.gff'
24 | # PASA parameters
25 | PASA_path = path + '/PASA'
26 | pasa = '/home/shangzhong/Installation/PASApipeline-2.0.2'
27 | ppl_fn = pasa + '/scripts/Launch_PASA_pipeline.pl'
28 | config = pasa + '/pasa_conf/pasa.alignAssembly.Template.txt'
29 | cmp_config = pasa + '/pasa_conf/pasa.annotationCompare.Template.txt'
30 | load_fn = pasa + '/scripts/Load_Current_Gene_Annotations.dbi'
31 | gff3_validate_fn = pasa + '/misc_utilities/pasa_gff3_validator.pl'
32 | tr_gff = PASA_path + '/picr_db.pasa_assemblies.gff3'
33 | #===============================================================================
34 | #                     1. PASA Alignment assembly
35 | #===============================================================================
36 | def align_assemble(ppl_fn,config,ref_fa,rna_fa,thread,otherParameters=['']):
37 |     '''This function does alignment assembly and
38 |     generates 4 types of files:
39 |     sample_mydb_pasa.assemblies.fasta: the PASA assemblies in FASTA format.
40 |     sample_mydb_pasa.pasa_assemblies.gff3,.gtf,.bed: the PASA assembly structures.
41 |     sample_mydb_pasa.pasa_alignment_assembly_building.ascii_illustrations.out: descriptions
42 |     of alignment assemblies and how they were constructed from the underlying transcript alignments.
43 |     sample_mydb_pasa.pasa_assemblies_described.txt: tab-delimited format describing the contents
44 |     of the PASA assemblies, including the identity of those transcripts that were assembled into the corresponding structure.
45 |     '''
46 |     cmd = ('{ppl} -c {config} -C -r -R -g {ref_fa} \
47 |     -t {rna_fa} --ALIGNERS gmap --CPU {thread} {other}').format(ppl=ppl_fn,config=config,
48 |         ref_fa = ref_fa,rna_fa=rna_fa,thread=str(thread),other=' '.join(otherParameters))
49 |     print(cmd);sys.stdout.flush()
50 |     sarge.run(cmd)
51 | 
52 | def check_gff_compat(gff,ppl_fn,config):
53 |     '''check the gff compatibility with pasa'''
54 |     cmd = ('{ppl_fn} {gff}').format(ppl_fn=ppl_fn,gff=gff)
55 |     sarge.run(cmd)
56 | 
57 | def load_gff(gff,ref_fa,ppl_fn,config):
58 |     cmd = ('{ppl} -c {config} -g {ref} -P {gff}').format(ppl=ppl_fn,config=config,ref=ref_fa,gff=gff)
59 |     print(cmd)
60 |     sarge.run(cmd)
61 | 
62 | def com_update(ref_fa,ppl_fn,config,rna_fa,thread):
63 |     '''compare the reads and update the annotation'''
64 |     cmd = ('{ppl_fn} -c {config} -A -g {ref_fa} -t {rna} --CPU {t}').format(ppl_fn=ppl_fn,
65 |         config=config,ref_fa=ref_fa,rna=rna_fa,t=str(thread))
66 |     print(cmd)
67 |     sarge.run(cmd)
68 | 
69 | def main_PASA(gff_fn,ppl_fn,config,ref_fa,rna_fa,thread):
70 |     # 1. alignment assembly using gmap
71 |     align_assemble(ppl_fn,config,ref_fa,rna_fa,thread) #
72 |     # 2. check gff compatibility
73 |     check_gff_compat(gff_fn,gff3_validate_fn,config)  # run the pasa gff3 validator on the gff
74 |     # 3. load the gff file
75 |     load_gff(gff_fn,ref_fa,load_fn,config)
76 |     # 4. compare and update
77 |     com_update(ref_fa,ppl_fn,cmp_config,rna_fa,thread)
78 | #===============================================================================
79 | #                     2. run exonerate
80 | #===============================================================================
81 | def exonerate(ref_fa,pr_fn,out_fn):
82 |     '''map protein sequences to the genome'''
83 |     cmd = ('exonerate -m p2g -q {pr} -t {ref} --showalignment no \
84 |     --showvulgar no --showtargetgff yes --minintron 20 --percent 50 \
85 |     --score 100 --geneseed 250 -n 10 > {gff}').format(pr=pr_fn,ref=ref_fa,gff=out_fn)
86 |     print(cmd)
87 |     sarge.run(cmd)
88 | 
89 | def split_fa(fa,item_per_file,path):
90 |     if not os.path.exists(path): os.mkdir(path)
91 |     handle = SeqIO.parse(open(fa,'r'),'fasta')
92 |     file_n = 0
93 |     pr_n = 0
94 |     out_fn = path+'/file'+str(file_n)+'.fa'
95 |     if os.path.exists(out_fn): os.remove(out_fn)
96 |     for record in handle:
97 |         SeqIO.write(record,open(out_fn,'a'),'fasta')
98 |         pr_n += 1
99 |         if pr_n % int(item_per_file) == 0:
100 |             file_n +=1
101 |             out_fn = path+'/file'+str(file_n)+'.fa'
102 |             if os.path.exists(out_fn): os.remove(out_fn)
103 | 
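# Hypothetical usage sketch for the two helpers above (not part of the
# original script; the arguments are the module-level variables defined at
# the top of this file): split the refseq protein fasta into 100-record
# chunks, then align one chunk back to the genome. main_exonerate below
# runs this for every chunk in parallel.
# split_fa(refseq_pr,100,exonerate_path)
# exonerate(ref_fa,exonerate_path+'/file0.fa',exonerate_path+'/file0.gff')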
104 | def exonerate2gff(gffs,out_gff,g_type='evm'):
105 |     '''This function converts exonerate gff files to a standard gff format.
106 |     gffs: a list of gff files
107 |     out_gff: output final gff to store information
108 |     '''
109 |     out_handle = open(out_gff,'w')
110 |     n = 1
111 |     m = 0
112 |     for gff in gffs:
113 |         cds = []
114 |         for line in open(gff):
115 |             if line.startswith('#') or line.startswith('Command') or line.startswith('Hostname') or line.startswith(' ') or line.startswith('--'):
116 |                 continue
117 |             else:
118 |                 item = line.strip().split('\t')
119 |                 if item[2] == 'cds':
120 |                     cds.append(line.strip().split('\t'))
121 |                 elif item[2] == 'gene' and g_type=='augustus':
122 |                     item[1] = 'exonerate'
123 |                     pr = item[8].split(';')[1].split(' ')[2]
124 |                     item[8] = ('ID=gene_{n};Target={pr}').format(n=n,pr=pr)
125 |                     out_handle.write('\t'.join(item) + '\n')
126 |                 elif item[2] == 'similarity':
127 |                     info = item[8].split(';')
128 |                     pr = info[1].split()[1]
129 |                     length = 0
130 |                     start = 1; end = 1
131 |                     for c in cds:  # decide the start AA of each exon
132 |                         length += int(c[4]) - int(c[3]) + 1
133 |                         if length % 3 == 0:
134 |                             end = length/3
135 |                             new_s = end + 1
136 |                         else:
137 |                             end = length/3 + 1
138 |                             new_s = end
139 |                         c[1] = 'exonerate'
140 |                         c[2] = 'cds_match'
141 |                         m += 1
142 |                         if g_type == 'evm':
143 |                             m = n
144 |                         c.append(('ID=pr_{m};Parent=gene_{n};Target={pr} {s} {e}').format(m=m,n=n,pr=pr,s=start,e=end))
145 |                         start = new_s
146 |                         out_handle.write('\t'.join(c) + '\n')
147 |                     cds = []
148 |                     n += 1
149 |     out_handle.close()
150 | 
151 | 
152 | def main_exonerate(ref_fa,refseq_pr,exonerate_path,thread,exon2align_gff,index_s=0,index_e=0):
153 |     '''
154 |     * refseq_pr: all protein sequences of the organism
155 |     * path: path to store the split protein sequences.
156 |     '''
157 |     if not os.path.exists(exonerate_path): os.mkdir(exonerate_path)
158 |     # 1) split the protein fa file into many sub files, to parallelize the process
159 |     os.chdir(exonerate_path)
160 |     if os.listdir(exonerate_path) == []:  # only split once; skip if the chunks already exist
161 |         split_fa(refseq_pr,100,exonerate_path)
162 |     # 2) run exonerate for each file
163 |     faFiles = natsorted(glob.glob('file*.fa'))
164 |     if index_e == 0:
165 |         faFiles = faFiles[index_s:]
166 |     else:
167 |         faFiles = faFiles[index_s:index_e]
168 |     pool = mp.Pool(processes=int(thread))
169 |     for f in faFiles:
170 |         out = f[:-2]+'gff'
171 |         pool.apply_async(exonerate,args=(ref_fa,f,out))
172 |     pool.close()
173 |     pool.join()
174 |     # 3) merge the gff files
175 |     exonerate_gff = 'exonerate.gff'
176 |     if not os.path.exists(exonerate_gff):
177 |         gff_fns = natsorted(glob.glob('file*.gff'))
178 |         exonerate2gff(gff_fns,exonerate_gff)
179 | 
180 | # main_exonerate(ref_fa,refseq_pr,exonerate_path,thread,exon2align_gff)
181 | 
182 | 
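# Worked example for the amino-acid bookkeeping in exonerate2gff above
# (illustrative numbers only): two CDS exons of 75 bp and 76 bp give
# cumulative lengths 75 and 151. 75 % 3 == 0, so exon 1 ends at AA 75/3 = 25
# and exon 2 starts at AA 26; 151 % 3 != 0, so exon 2 ends at AA
# 151/3 + 1 = 51 and the next exon would start at AA 51, because its first
# codon is split across the exon boundary.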
183 | #===============================================================================
184 | #        process the gmap results and exonerate results directly
185 | #===============================================================================
186 | #=============== 1. get all mapped geneid, rna_accession, pr_accession
187 | def gene_rna_pr_id(hamster_id,gmap_gff,out_fn):
188 |     '''this function gets all gene, rna and protein ids, including both refseq and gff information.
189 |     * hamster_id: a file that has all ids in the hamster.gff file
190 |     * gmap_gff: gff results mapped using gmap
191 |     * out_fn: output file with geneid, rna accession and protein accession columns
192 |     '''
193 |     # rna accession in gff file
194 |     ham_id_df = pd.read_csv(hamster_id,sep='\t',header=0)
195 |     ham_id_df = ham_id_df.astype('str')
196 |     ham_id_df['TrAccess'] = ham_id_df['TrAccess'].map(lambda x: x.split('.')[0])
197 |     ham_id_df['PrAccess'] = ham_id_df['PrAccess'].map(lambda x: x.split('.')[0])
198 |     rna_gene_dic = ham_id_df.set_index('TrAccess')['GeneID'].to_dict()
199 |     rna_pr_dic = ham_id_df.set_index('TrAccess')['PrAccess'].to_dict()
200 |     #-------- read rna gff file
201 |     rna_df = pd.read_csv(gmap_gff,sep='\t',header=None,comment='#')
202 |     # add rna accession column
203 |     rna_df['rna_ac'] = rna_df[8].map(lambda x: re.search('(?<=ID=).+?(?=\.)',x).group(0))
204 |     mrna = list(set(rna_df['rna_ac'].tolist()))
205 |     # new rna in refseq compared to gff
206 |     new_ref_rna = list(set(mrna) - set(rna_gene_dic.keys()))
207 |     # get geneid for new ref_rna gene id
208 |     for r in new_ref_rna:
209 |         handle = Entrez.efetch(db='nucleotide',id=r,rettype='gb',retmode='text').read()
210 |         geneid = re.search('(?<=GeneID:).+?(?=\")',handle).group(0)
211 |         try:
212 |             p = re.search('(?<=protein_id=\").+?(?=\.)',handle).group(0)
213 |         except:
214 |             p = '-'
215 |         rna_gene_dic[r] = geneid
216 |         rna_pr_dic[r] = p
217 |     # transfer dic to dataframe
218 |     r_g_df = pd.DataFrame.from_dict(rna_gene_dic,'index')
219 |     r_g_df.columns = ['geneid']
220 |     r_p_df = pd.DataFrame.from_dict(rna_pr_dic,'index')
221 |     r_p_df.columns = ['pr_ac']
222 |     g_r_p_df = pd.concat([r_g_df,r_p_df],axis=1)
223 |     g_r_p_df['rna_ac'] = g_r_p_df.index
224 |     g_r_p_df[['geneid','rna_ac','pr_ac']].to_csv(out_fn,sep='\t',index=False)
225 | 
226 | # gmap_exon_path = path + '/gmap_exonerate'
227 | # if not os.path.exists(gmap_exon_path): os.mkdir(gmap_exon_path)
228 | # os.chdir(gmap_exon_path)
229 | # gmap_gff = PASA_path + '/gmap.spliced_alignments.gff3'
230 | # g_r_p_id_fn = gmap_exon_path + '/01_gene_rna_pr.txt'
231 | # gene_rna_pr_id(hamster_id,gmap_gff,g_r_p_id_fn)
232 | 
233 | 
234 | def get_consensus_map(rna_df,pr_df,gene,rna_ac,pr_ac):
235 |     '''this function checks whether the rna map and protein map have the same splice sites
236 |     * rna_df: mRNA map to genome gff dataframe with additional rna_ac column
237 |     * pr_df: protein map to genome dataframe with additional 'pr_ac' and 'pr_id' column
238 |     '''
239 |     if not rna_df.empty:
240 |         # get rna scaffold name, if more than 1 scaffold then don't add its annotation
241 |         rna_chr = list(set(rna_df[0].tolist()))
242 |         if len(rna_chr) != 1:
243 |             assert False, rna_ac + ' maps to multiple scaffolds'
244 |         else:
245 |             rna_chr = rna_chr[0]
246 |         # get strand, if mapped to both strands don't output
247 |         rna_str = list(set(rna_df[6].tolist()))
248 |         if len(rna_str) != 1:
249 |             assert False, rna_ac + ' maps to both strands'
250 |         else:
251 |             rna_str = rna_str[0]
252 |         # get rna splice sites
253 |         rna_splice = natsorted(rna_df[3].tolist() + rna_df[4].tolist())
254 |         # change exon id
255 |         n = 1
256 |         for i,row in rna_df.iterrows():
257 |             item = row[8].split(';')
258 |             iid = '.'.join(item[0].split('.')[:-1])
259 |             anno = iid+' '+str(n)+';'+re.sub('Name.+?;','',';'.join(item[1:]))+';Parent='+rna_ac+';gene_id='+gene+';transcript_id='+rna_ac
260 |             # anno = iid+'_'+str(n)+';'+ re.sub('Name','transcript_id',';'.join(item[1:]))+';Parent='+rna_ac+';gene_id='+gene
261 |             rna_df.loc[i,8] = anno
262 |             rna_df.loc[i,2] = 'exon'
263 |             n += 1
264 |         #--------------- process protein gff information
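# The protein-side block below keeps a protein alignment only when it is
# consistent with the rna alignment: same scaffold, CDS rows renumbered,
# and protein splice sites compatible with the rna splice sites; otherwise
# sub_pr_df is replaced with an empty dataframe and the protein is dropped.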
265 |         if not pr_df.empty:
266 |             pr_id = pr_df['pr_id'].tolist()[0]
267 |             sub_pr_df = pr_df[(pr_df['pr_id'].values==pr_id) & (pr_df[0].values==rna_chr)].copy()
268 |             # change cds id
269 |             m = 1
270 |             for i,row in sub_pr_df.iterrows():
271 |                 item = row[8].split(';')
272 |                 anno = 'ID='+pr_ac+'_'+str(m)+';'+';'.join(item[2:])+';protein_id='+pr_ac+';Parent='+rna_ac+';gene_id='+gene
273 |                 sub_pr_df.loc[i,8] = anno
274 |                 sub_pr_df.loc[i,2] = 'CDS'
275 |                 m += 1
276 |             pr_splice = natsorted(sub_pr_df[3].tolist() + sub_pr_df[4].tolist())
277 |             if sub_pr_df.shape[0] == 1:
278 |                 # single-CDS protein: keep it only if the CDS falls inside the
279 |                 # rna span (reconstructed check, treat as an assumption)
280 |                 if not (rna_splice[0] <= pr_splice[0] and pr_splice[-1] <= rna_splice[-1]):
281 |                     sub_pr_df = pd.DataFrame()
282 |             elif len(pr_splice) <= len(rna_splice):
283 |                 # multi-CDS protein: internal protein splice sites must be
284 |                 # shared with the rna (reconstructed check, treat as an assumption)
285 |                 if not set(pr_splice[1:-1]).issubset(set(rna_splice)):
286 |                     sub_pr_df = pd.DataFrame()
287 |             elif len(pr_splice) > len(rna_splice):
288 |                 print('protein has more splice than rna, rna/pr: %s/%s' % (len(rna_splice),len(pr_splice)))
289 |                 sub_pr_df = pd.DataFrame()
290 |         else:
291 |             sub_pr_df = pr_df
292 |         return rna_df,sub_pr_df,rna_chr,rna_splice[0],rna_splice[-1],rna_str
293 | 
294 | # import time
295 | # process_start = time.time()
296 | 
297 | def gmap_exonerate_merge_gff(gmap_gff,exonerate_gff,gmap_exon_path,all_id_fn):
298 |     #-------- read gmap gff file
299 |     rna_df = pd.read_csv(gmap_gff,sep='\t',header=None,comment='#')
300 |     rna_df['rna_ac'] = rna_df[8].map(lambda x: re.search('(?<=ID=).+?(?=\.)',x).group(0))
301 |     # get multi mapping mRNAs (gmap names secondary alignment paths .path2, .path3, ...)
302 |     multi_map_rna = list(set(rna_df[rna_df[8].map(lambda x: 'path2' in x)]['rna_ac'].tolist()))
303 |     # build gene rna protein id dictionary
304 |     g_r_p_dic = {}
305 |     g_r_p_id_fn = gmap_exon_path + '/01_gene_rna_pr.txt'
306 |     handle = open(g_r_p_id_fn)
307 |     for line in handle:
308 |         item = line.strip().split('\t')
309 |         if item[1] in multi_map_rna:
310 |             continue
311 |         if item[0] in g_r_p_dic:
312 |             g_r_p_dic[item[0]][item[1]] = item[2]
313 |         else:
314 |             g_r_p_dic[item[0]] = {item[1]:item[2]}
315 |     #-------- read exonerate gff file
316 |     pr_df = pd.read_csv(exonerate_gff,sep='\t',header=None)  # read the exonerate gff passed in by the caller
317 |     pr_df['pr_ac'] = pr_df[8].map(lambda x: re.search('(?<=Target=).+?(?=\.)',x).group(0))
318 | 
319 |     def output_consensus_rna_pr(g,out_handle):
320 |         '''this function finds the consistent rna and protein and outputs them to file
321 |         g_r_p_dic: dictionary that has all the gene, rna and protein ids.
322 | g: gene id 323 | rna_df: rna gff dataframe 324 | pr_df: protein gff dataframe 325 | ''' 326 | g_n = 0 327 | rna_pr_dic = g_r_p_dic[g] 328 | for rna in rna_pr_dic: 329 | pr = rna_pr_dic[rna] 330 | single_rna_df = rna_df[rna_df['rna_ac'].values==rna].copy() 331 | single_rna_df = single_rna_df.reset_index(drop=True) 332 | if not single_rna_df.empty: 333 | single_pr_df = pr_df[pr_df['pr_ac'].values==pr].copy() 334 | single_pr_df = single_pr_df.reset_index(drop=True) 335 | single_pr_df.loc[:,'pr_id'] = single_pr_df[8].map(lambda x: re.search('(?<=ID=).+?(?=;)',x).group(0)) 336 | res_rna_df,res_pr_df,chrome,start,end,strand=get_consensus_map(single_rna_df,single_pr_df,str(g),rna,pr) 337 | if g_n == 0: 338 | out_handle.write('\t'.join([chrome,'gmap_exonerate','gene',str(start),str(end),'.',\ 339 | strand,'.','ID='+str(g)+';gene_id='+str(g)])+'\n') 340 | g_n += 1 341 | if not res_rna_df.empty: 342 | if rna.startswith('XR'): 343 | feature = 'lncRNA' 344 | else: 345 | feature = 'mRNA' 346 | out_handle.write('\t'.join([chrome,'gmap_exonerate',feature,str(start),str(end),'.',\ 347 | strand,'.','ID='+rna+';Parent='+str(g)+';gene_id='+str(g)+';transcript_id='+rna])+'\n') 348 | res_rna_df[range(9)].to_csv(out_handle,sep='\t',index=False,header=None) 349 | if not res_pr_df.empty: 350 | res_pr_df[range(9)].to_csv(out_handle,sep='\t',index=False,header=None) 351 | 352 | out_fn = '02_gmap_exonerate.gff' 353 | if os.path.exists(out_fn): os.remove(out_fn) 354 | with open(out_fn,'a') as f: 355 | for g in g_r_p_dic.keys(): 356 | output_consensus_rna_pr(g,f) 357 | # define a function to find the start and end position of each gene 358 | def get_gene_s_e(gene_df): 359 | pos = gene_df[3].tolist() + gene_df[4].tolist() 360 | gene_df.iloc[0,3] = min(pos) 361 | gene_df.iloc[0,4] = max(pos) 362 | return gene_df 363 | # correct gene coordinates 364 | gff_df = pd.read_csv(out_fn,sep='\t',header=None) 365 | gff_df['geneid'] = gff_df[8].map(lambda x: re.search('(?<=gene_id=).+?(?=;|$)',x).group(0)) 366 | res_df = gff_df.groupby('geneid').apply(get_gene_s_e) 367 | # add gene name 368 | all_id_df = pd.read_csv(all_id_fn,sep='\t',header=0) 369 | all_id_df = all_id_df.astype('str') 370 | g_s_dic = all_id_df.set_index('GeneID')['GeneSymbol'].to_dict() 371 | res_df[8] = res_df.apply(lambda row: row[8]+';gene_name='+g_s_dic[row['geneid']] if row['geneid'] in g_s_dic else row[8]+';gene_name='+row['geneid'],axis=1) 372 | res_df[range(9)].to_csv('02_gmap_exonerate.gff',sep='\t',index=False,header=None) 373 | 374 | # gmap_gff = PASA_path+'/gmap.spliced_alignments.gff3' 375 | # gmap_exonerate_merge_gff(gmap_gff,pr_gff,gmap_exon_path,hamster_id) 376 | # 377 | # print time.time() - process_start 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | -------------------------------------------------------------------------------- /Genome_Annotation.py: -------------------------------------------------------------------------------- 1 | from Modules.GeneMark import geneMark_ES 2 | import os,sarge 3 | from Bio import SeqIO 4 | import glob 5 | from natsort import natsorted 6 | import multiprocessing as mp 7 | import sys 8 | import pandas as pd 9 | import re 10 | from Bio import Entrez 11 | Entrez.email = 'shl198@eng.ucsd.edu' 12 | # database files 13 | ref_fa = '/data/genome/hamster/multi_pacbio_assemble/picr.fa' 14 | rna_fa = '/data/shangzhong/Picr_assembly/Annotation/hamster_rna.fa' 15 | refseq_pr = '/data/shangzhong/Picr_assembly/Annotation/hamster_pr.fa' 16 | hamster_id = '/data/shangzhong/Database/hamster/hamster_all_id.txt' 17 | # 
pathways
18 | path = '/data/shangzhong/Picr_assembly/Annotation'
19 | organism = 'hamster'
20 | # genemark parameters
21 | genemark_path = path + '/genemark'
22 | genemark_gff = genemark_path + '/genemark.gff3'
23 | # exonerate parameters
24 | exonerate_path = path + '/exonerate'
25 | pr_gff = exonerate_path + '/exonerate.gff'
26 | # PASA parameters
27 | PASA_path = path + '/PASA'
28 | pasa = '/home/shangzhong/Installation/PASApipeline-2.0.2'
29 | ppl_fn = pasa + '/scripts/Launch_PASA_pipeline.pl'
30 | config = pasa + '/pasa_conf/pasa.alignAssembly.Template.txt'
31 | cmp_config = pasa + '/pasa_conf/pasa.annotationCompare.Template.txt'
32 | load_fn = pasa + '/scripts/Load_Current_Gene_Annotations.dbi'
33 | gff3_validate_fn = pasa + '/misc_utilities/pasa_gff3_validator.pl'
34 | tr_gff = PASA_path + '/picr_db.pasa_assemblies.gff3'
35 | # evm parameters
36 | evm = '/home/shangzhong/Installation/EVidenceModeler-1.1.1'
37 | evm_path = path + '/EVM'
38 | exon2align_gff = '/home/shangzhong/Installation/EVidenceModeler-1.1.1/EvmUtils/misc/exonerate_gff_to_alignment_gff3.pl'
39 | weight_fn = evm + '/weights.txt' # /EvmUtils
40 | # blast
41 | blast_db = path + '/blastp_db'
42 | uniprot = path + '/uniprot_sprot.fasta.gz'
43 | thread = '6'
44 | #===============================================================================
45 | #                     1. run GeneMark
46 | #===============================================================================
47 | os.chdir(genemark_path)
48 | # genemark_gff = geneMark_ES(ref_fa)
49 | #===============================================================================
50 | #                     2. run exonerate
51 | #===============================================================================
52 | def exonerate(ref_fa,pr_fn,out_fn):
53 |     '''map protein sequences to the genome'''
54 |     cmd = ('exonerate -m p2g -q {pr} -t {ref} --showalignment no \
55 |     --showvulgar no --showtargetgff yes --minintron 20 --percent 50 \
56 |     --score 100 --geneseed 250 -n 10 > {gff}').format(pr=pr_fn,ref=ref_fa,gff=out_fn)
57 |     print(cmd)
58 |     sarge.run(cmd)
59 | 
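# Intent of the exonerate flags above (a hedged reading of the exonerate
# docs): -m p2g selects the protein2genome model, --percent 50 drops
# alignments scoring below 50% of the query's maximal score, --geneseed 250
# raises the seeding threshold to speed up the genome scan, and -n 10 caps
# the number of reported alignments per query protein.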
60 | def split_fa(fa,item_per_file,path):
61 |     if not os.path.exists(path): os.mkdir(path)
62 |     handle = SeqIO.parse(open(fa,'r'),'fasta')
63 |     file_n = 0
64 |     pr_n = 0
65 |     out_fn = path+'/file'+str(file_n)+'.fa'
66 |     if os.path.exists(out_fn): os.remove(out_fn)
67 |     for record in handle:
68 |         SeqIO.write(record,open(out_fn,'a'),'fasta')
69 |         pr_n += 1
70 |         if pr_n % int(item_per_file) == 0:
71 |             file_n +=1
72 |             out_fn = path+'/file'+str(file_n)+'.fa'
73 |             if os.path.exists(out_fn): os.remove(out_fn)
74 | 
75 | def exonerate2gff(gffs,out_gff,g_type='evm'):
76 |     '''This function converts exonerate gff files to a standard gff format.
77 |     gffs: a list of gff files
78 |     out_gff: output final gff to store information
79 |     '''
80 |     out_handle = open(out_gff,'w')
81 |     n = 1
82 |     m = 0
83 |     for gff in gffs:
84 |         cds = []
85 |         for line in open(gff):
86 |             if line.startswith('#') or line.startswith('Command') or line.startswith('Hostname') or line.startswith(' ') or line.startswith('--'):
87 |                 continue
88 |             else:
89 |                 item = line.strip().split('\t')
90 |                 if item[2] == 'cds':
91 |                     cds.append(line.strip().split('\t'))
92 |                 elif item[2] == 'gene' and g_type=='augustus':
93 |                     item[1] = 'exonerate'
94 |                     pr = item[8].split(';')[1].split(' ')[2]
95 |                     item[8] = ('ID=gene_{n};Target={pr}').format(n=n,pr=pr)
96 |                     out_handle.write('\t'.join(item) + '\n')
97 |                 elif item[2] == 'similarity':
98 |                     info = item[8].split(';')
99 |                     pr = info[1].split()[1]
100 |                     length = 0
101 |                     start = 1; end = 1
102 |                     for c in cds:  # decide the start AA of each exon
103 |                         length += int(c[4]) - int(c[3]) + 1
104 |                         if length % 3 == 0:
105 |                             end = length/3
106 |                             new_s = end + 1
107 |                         else:
108 |                             end = length/3 + 1
109 |                             new_s = end
110 |                         c[1] = 'exonerate'
111 |                         c[2] = 'cds_match'
112 |                         m += 1
113 |                         if g_type == 'evm':
114 |                             m = n
115 |                         c.append(('ID=pr_{m};Parent=gene_{n};Target={pr} {s} {e}').format(m=m,n=n,pr=pr,s=start,e=end))
116 |                         start = new_s
117 |                         out_handle.write('\t'.join(c) + '\n')
118 |                     cds = []
119 |                     n += 1
120 |     out_handle.close()
121 | 
122 | 
123 | def main_exonerate(ref_fa,refseq_pr,exonerate_path,thread,exon2align_gff,index_s=0,index_e=0):
124 |     '''
125 |     * refseq_pr: all protein sequences of the organism
126 |     * path: path to store the split protein sequences.
127 |     '''
128 |     if not os.path.exists(exonerate_path): os.mkdir(exonerate_path)
129 |     # 1) split the protein fa file
130 |     os.chdir(exonerate_path)
131 |     if os.listdir(exonerate_path) == []:  # only split once; skip if the chunks already exist
132 |         split_fa(refseq_pr,100,exonerate_path)
133 |     # 2) run exonerate for each file
134 |     faFiles = natsorted(glob.glob('file*.fa'))
135 |     if index_e == 0:
136 |         faFiles = faFiles[index_s:]
137 |     else:
138 |         faFiles = faFiles[index_s:index_e]
139 |     pool = mp.Pool(processes=int(thread))
140 |     for f in faFiles:
141 |         out = f[:-2]+'gff'
142 |         pool.apply_async(exonerate,args=(ref_fa,f,out))
143 |     pool.close()
144 |     pool.join()
145 |     # 3) merge the gff files
146 |     exonerate_gff = 'exonerate.gff'
147 |     if not os.path.exists(exonerate_gff):
148 |         gff_fns = natsorted(glob.glob('file*.gff'))
149 |         exonerate2gff(gff_fns,exonerate_gff)
150 | 
151 | # main_exonerate(ref_fa,refseq_pr,exonerate_path,thread,exon2align_gff)
152 | 
153 | #===============================================================================
154 | #                     3. PASA Alignment assembly
155 | #===============================================================================
156 | def align_assemble(ppl_fn,config,ref_fa,rna_fa,thread,otherParameters=['']):
157 |     '''This function does alignment assembly and
158 |     generates 4 types of files:
159 |     sample_mydb_pasa.assemblies.fasta: the PASA assemblies in FASTA format.
160 |     sample_mydb_pasa.pasa_assemblies.gff3,.gtf,.bed: the PASA assembly structures.
161 |     sample_mydb_pasa.pasa_alignment_assembly_building.ascii_illustrations.out: descriptions
162 |     of alignment assemblies and how they were constructed from the underlying transcript alignments.
163 |     sample_mydb_pasa.pasa_assemblies_described.txt: tab-delimited format describing the contents
164 |     of the PASA assemblies, including the identity of those transcripts that were assembled into the corresponding structure.
165 |     '''
166 |     cmd = ('{ppl} -c {config} -C -r -R -g {ref_fa} \
167 |     -t {rna_fa} --ALIGNERS gmap --CPU {thread} {other}').format(ppl=ppl_fn,config=config,
168 |         ref_fa = ref_fa,rna_fa=rna_fa,thread=str(thread),other=' '.join(otherParameters))
169 |     print(cmd);sys.stdout.flush()
170 |     sarge.run(cmd)
171 | 
172 | def check_gff_compat(gff,ppl_fn,config):
173 |     '''check the gff compatibility with pasa'''
174 |     cmd = ('{ppl_fn} {gff}').format(ppl_fn=ppl_fn,gff=gff)
175 |     sarge.run(cmd)
176 | 
177 | def load_gff(gff,ref_fa,ppl_fn,config):
178 |     cmd = ('{ppl} -c {config} -g {ref} -P {gff}').format(ppl=ppl_fn,config=config,ref=ref_fa,gff=gff)
179 |     print(cmd)
180 |     sarge.run(cmd)
181 | 
182 | def com_update(ref_fa,ppl_fn,config,rna_fa,thread):
183 |     '''compare the reads and update the annotation'''
184 |     cmd = ('{ppl_fn} -c {config} -A -g {ref_fa} -t {rna} --CPU {t}').format(ppl_fn=ppl_fn,
185 |         config=config,ref_fa=ref_fa,rna=rna_fa,t=str(thread))
186 |     print(cmd)
187 |     sarge.run(cmd)
188 | 
189 | def main_PASA(gff_fn,ppl_fn,config,ref_fa,rna_fa,thread):
190 |     # 1. alignment assembly using gmap
191 |     align_assemble(ppl_fn,config,ref_fa,rna_fa,thread) #
192 |     # 2. check gff compatibility
193 |     check_gff_compat(gff_fn,gff3_validate_fn,config)  # run the pasa gff3 validator on the gff
194 |     # 3. load the gff file
195 |     load_gff(gff_fn,ref_fa,load_fn,config)
196 |     # 4. compare and update
197 |     com_update(ref_fa,ppl_fn,cmp_config,rna_fa,thread)
198 | 
199 | #===============================================================================
200 | #                     4. run EVM
201 | #===============================================================================
202 | def evm_partition(ref_fa,evm,gffs=[''],otherParams=['']):
203 |     '''run evm to merge all the gff files'''
204 |     cmd = ('{evm} --genome {ref} {gffs} {other} --segmentSize 50000000 \
205 |     --overlapSize 10000 --partition_listing partitions_list.out').format(evm=evm,ref=ref_fa,
206 |         gffs=' '.join(gffs),other=' '.join(otherParams))
207 |     print(cmd)
208 |     sarge.run(cmd)
209 | 
210 | def evm_cmd_list(out_fn,cmd_fn,evm,ref_fa,weight_fn,partition,gffs=['']):
211 |     '''create the command list for evm'''
212 |     cmd = ('{evm} --genome {ref} --weights {w} {gffs} --output_file_name {out_fn} \
213 |     --partitions {par} > {cmd_l}').format(evm=evm,ref=ref_fa,
214 |         w=weight_fn,gffs=' '.join(gffs),out_fn=out_fn,par=partition,cmd_l=cmd_fn)
215 |     print(cmd)
216 |     sarge.run(cmd)
217 | 
218 | def combine_partition(evm,partition):
219 |     '''combine all the results from running the command list'''
220 |     cmd = ('{evm} --partitions {p} --output_file_name evm.out').format(evm=evm,p=partition)
221 |     print(cmd)
222 |     sarge.run(cmd)
223 | 
224 | def run_cmd(cmd):
225 |     try:
226 |         print(cmd);sys.stdout.flush()
227 |         sarge.run(cmd)
228 |     except:
229 |         print(cmd + ' error')
230 |         assert False
231 | 
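# EVM runs in stages through main_evm below: partition_EVM_inputs.pl splits
# the genome and the three evidence gffs into segments, write_EVM_commands.pl
# emits one command per partition, the commands run in parallel through
# mp.Pool/run_cmd above, and recombine_EVM_partial_outputs.pl plus
# convert_EVM_outputs_to_GFF3.pl stitch the partial outputs back together
# into a merged gff.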
232 | def filter_evm_gff(evm_path):
233 |     os.chdir(evm_path)
234 |     ds = [f for f in os.listdir(evm_path) if os.path.isdir(f)]
235 |     out_h = open('evm.evidence.txt','w')
236 |     for d in ds:
237 |         fPath = d + '/evm.out'
238 |         size = os.path.getsize(fPath)
239 |         if size > 0:
240 |             blocks = open(fPath).read().strip().split('#')[1:]
241 |             for block in blocks:
242 |                 coords = []
243 |                 evidence = []
244 |                 for line in block.strip().split('\n')[1:]:
245 |                     if line.strip() != '' and line[0] != '!':
246 |                         meta = line.strip().split('\t')
247 |                         coords.append(int(meta[0]))
248 |                         coords.append(int(meta[1]))
249 |                         coords.sort()
250 |                         evidence.extend([tuple(x[1:-1].split(';')) for x in meta[-1].split(',')])
251 | 
252 |                 evidence = set(evidence)
253 |                 sources = set([x[1] for x in evidence])
254 | 
255 |                 out_h.write(d + '\t' + str(coords[0]) + '\t' + str(coords[-1]) + '\t' + ','.join([x[0] for x in evidence]) + '\t' + ','.join(sources) + '\n')
256 |     out_h.close()
257 | 
258 | 
259 | def main_evm(thread):
260 |     os.chdir(evm_path)
261 |     evm_gffs = ['--gene_predictions '+genemark_gff,'--transcript_alignments '+tr_gff,'--protein_alignments '+pr_gff]
262 |     # 1. partition input
263 |     evm_partition(ref_fa,evm+'/EvmUtils/partition_EVM_inputs.pl',evm_gffs)
264 |     # 2. generate command lines
265 |     evm_cmd_out = 'evm.out'
266 |     cmd_fn = 'commands.list'
267 |     evm_cmd_list(evm_cmd_out,cmd_fn,evm+'/EvmUtils/write_EVM_commands.pl',ref_fa,weight_fn,'partitions_list.out',evm_gffs)
268 |     # 3. run commands
269 |     pool = mp.Pool(processes=int(thread))
270 |     cmds = open(cmd_fn).readlines()
271 |     for cmd in cmds:
272 |         pool.apply_async(run_cmd,args=(cmd,))
273 |     pool.close()
274 |     pool.join()
275 |     # 4. combine results
276 |     evm_combine = evm + '/EvmUtils/recombine_EVM_partial_outputs.pl'
277 |     combine_partition(evm_combine,'partitions_list.out')
278 |     # 5. transfer to gff
279 |     to_gff = evm + '/EvmUtils/convert_EVM_outputs_to_GFF3.pl'
280 |     cmd = ('{evm} --partitions partitions_list.out --output evm.out --genome {ref}').format(evm=to_gff,ref=ref_fa)
281 |     sarge.run(cmd)
282 |     # 6. merge gff
283 |     fns = glob.glob('*/*.out.gff3')
284 |     cmd = ('cat {input} > evm.merge.gff').format(input=' '.join(fns))
285 |     sarge.run(cmd)
286 |     # 7. extract genes supported by two algorithms
287 |     filter_evm_gff(evm_path)
288 | # main_evm(9)
289 | 
290 | #===============================================================================
291 | #                     5. Augustus
292 | #===============================================================================
293 | def gff2gb(gff,out_gb,ref):
294 |     '''transfer a gff file to a genbank file'''
295 |     cmd = ('gff2gbSmallDNA.pl {gff} {ref} 1000 {gb}').format(gff=gff,ref=ref,gb=out_gb)
296 |     print(cmd)
297 |     sarge.run(cmd)
298 | 
299 | #---- transfer exonerate.gff to exonerate.gb
300 | def augustus_train(exonerate_gff,out_gb,ref_fa):
301 | 
302 |     gff2gb(exonerate_gff,out_gb,ref_fa)
303 |     #---- clean problematic genes
304 |     sarge.run('etraining --species={s} --stopCodonExcludedFromCDS=true {gb} 2> train.err'.format(s=organism,gb=out_gb))
305 |     sarge.run('cat train.err | perl -pe \'s/.*in sequence (\S+): .*/$1/\' > badgenes.lst')
306 |     sarge.run('filterGenes.pl badgenes.lst {gb} > genes.gb'.format(gb=out_gb))
307 |     #---- split gb file
308 |     sarge.run('randomSplit.pl genes.gb 1000')
309 |     os.remove('genes.gb')
310 |     os.remove('genes.gb.train')
311 |     sarge.run('randomSplit.pl genes.gb.test 100')
312 |     #---- create meta parameters file for the new species
313 |     sarge.run('new_species.pl --species={s}'.format(s=organism))
314 |     #---- initial training
315 |     sarge.run('etraining --species={s} --stopCodonExcludedFromCDS=true genes.gb.test.train'.format(s=organism))
316 |     #---- first test prediction
317 |     sarge.run('augustus --species={s} genes.gb.test.test | tee firsttest.out'.format(s=organism))
318 |     #---- optimize
319 |     sarge.run('optimize_augustus.pl --species={s} genes.gb.test.train'.format(s=organism))
320 | 
321 | 
322 | def augustus_prepare_hint(pasa,exonerate):
323 |     '''build an augustus hints file from the PASA and exonerate gff files
324 |     '''
325 |     dfs = []
326 |     for g,t,feature in zip([pasa,exonerate],['E','P'],['exonpart','CDSpart']):
327 |         df = pd.read_csv(g,sep='\t',header=None)
328 |         df[2] = df[2].map(lambda x: feature)
329 |         df[8] = df[8].map(lambda x: x+';grp='+re.search('(?<=ID=).+?(?=;)',x).group(0)+';src='+t)
330 |         dfs.append(df)
331 |     res = pd.concat(dfs)
332 |     res.to_csv('hints.gff',sep='\t',index=False,header=None)
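# augustus_prepare_hint above rewrites PASA rows as exonpart hints with
# src=E (transcript evidence) and exonerate rows as CDSpart hints with
# src=P (protein evidence). A resulting row looks roughly like this
# (tab-separated, values illustrative):
# scaffold_1  PASA  exonpart  1200  1450  .  +  .  ID=...;grp=asmbl_1;src=E
# Both sources need matching entries in the extrinsic config file passed to
# augustus via --extrinsicCfgFile in the commented call below.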
333 | 
334 | 
335 | augs_path = path + '/augustus'
336 | if not os.path.exists(augs_path): os.mkdir(augs_path)
337 | os.chdir(augs_path)
338 | out_gb = 'exonerate.gb'
339 | # augustus_train(exonerate_path + '/exonerate_4_augustus.gff',out_gb,ref_fa)
340 | # augustus_prepare_hint(PASA_path+'/picr_db.pasa_assemblies.gff3',pr_gff)
341 | # sarge.run('augustus --species={s} {ref} \
342 | # --extrinsicCfgFile=extrinsic.hamster.cfg --hintsfile=hints.gff --gff3=on > augustus.hints.gff'.format(s=organism,ref=ref_fa))
343 | 
344 | #===============================================================================
345 | #             6. functional annotation of the new gff file
346 | #===============================================================================
347 | from Bio.Seq import Seq
348 | import shutil
349 | 
350 | def get_cds_sequence(rna,c_df,chrom_seq):
351 |     pr_df = c_df[c_df['rna_id'].values==rna]
352 |     strand = list(set(pr_df[6].tolist()))
353 |     if len(strand) == 2:
354 |         assert False, rna+' has both strands'
355 |     # sequence merge
356 |     chr_seq = Seq('')
357 |     for start,end in zip(pr_df[3],pr_df[4]):
358 |         if strand == ['-']:
359 |             chr_seq += chrom_seq[start-1:end].reverse_complement()
360 |         else:
361 |             chr_seq += chrom_seq[start-1:end]
362 |     # consider the frame information in column 7 (the gff phase field)
363 |     frame = int(pr_df[7].tolist()[0])
364 |     rna_seq = chr_seq[frame:]
365 |     return str(rna_seq.translate())
366 | 
367 | def output_cds(chrom,cds_df,dic):
368 |     '''this function gets the AA sequences and outputs them to a file named after the chromosome'''
369 |     chrom_seq = dic[chrom].seq
370 |     chr_df = cds_df[cds_df[0].values==chrom]
371 |     rnas = list(set(chr_df['rna_id'].tolist()))
372 |     out_h = open(chrom+'.fa','w')
373 |     for rna in rnas:  #['evm.model.picr_0.1707']: #rnas:
374 |         AA = get_cds_sequence(rna,chr_df,chrom_seq)
375 |         if AA.endswith('*'):
376 |             AA = AA[:-1]
377 |         out_h.write('>{rna}\n{pr}\n'.format(rna=rna,pr=AA))
378 |     out_h.close()
379 | 
380 | def get_evm_pr(evm_path,ref_fa,out_path):
381 |     '''this function gets all evm proteins, outputs them to files and merges the files together
382 |     * evm_path: evm path that has the gff file
383 |     * ref_fa: reference fa file
384 |     * out_path: path to save all temporary files and the final protein file
385 |     '''
386 |     if os.path.exists(out_path):
387 |         shutil.rmtree(out_path)
388 |     os.mkdir(out_path)
389 |     os.chdir(out_path)
390 |     evm_gff= evm_path + '/evm.merge.gff'
391 |     gff_df = pd.read_csv(evm_gff,sep='\t',header=None)
392 |     dic = SeqIO.index(ref_fa,'fasta')
393 |     cds_df = gff_df[gff_df[2].values=='CDS']
394 |     cds_df = cds_df.reset_index(drop=True)
395 |     cds_df['rna_id'] = cds_df[8].map(lambda x: x.split(';')[1][7:])
396 |     scaffolds = list(set(cds_df[0].tolist()))
397 |     for scaff in scaffolds:
398 |         output_cds(scaff,cds_df,dic)
399 |     # merge files
400 |     fns = natsorted(glob.glob('*.fa'))
401 |     sarge.run('cat {fns} > {out}'.format(fns=' '.join(fns),out='pr_merge.fa'))
402 |     for f in fns:
403 |         os.remove(f)
404 | 
405 | evm_pr_path = path + '/evm_pr'
406 | 
407 | # get_evm_pr(evm_path,ref_fa,evm_pr_path)
408 | 
409 | def makeblast(ref_fa,out,db_type):
410 |     '''
411 |     ref_fa: gzipped fa file
412 |     '''
413 |     cmd = ('gunzip -c {ref} | makeblastdb -in - -dbtype {type} -out {out} -title {title}').format(
414 |         ref=ref_fa,type=db_type,out=out,title=out)
415 |     print(cmd)
416 |     sarge.run(cmd)
417 | 
418 | def blastp(query_fa,out_fn,db,thread):
419 |     cmd = ('blastp -query {q} -task blastp -db {db} -out {out} -evalue 1e-7 -word_size 4 \
420 |     -outfmt 6 -num_alignments 1 -num_threads {t}').format(q=query_fa,db=db,out=out_fn,t=str(thread))
421 |     print(cmd)
422 |     sarge.run(cmd)
423 | 
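# blastp above writes tabular output (-outfmt 6), whose default columns
# begin with qseqid, sseqid and pident; add_gene_function below reads
# exactly those three (usecols=[0,1,2]), so its 'ref' column holds the evm
# model id (the blast query) and its 'query' column the uniprot hit (the
# subject) -- the names are swapped relative to blast's terminology but are
# used consistently.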
424 | def main_blast():
425 |     blast_db = path + '/blastp_db'
426 |     if not os.path.exists(blast_db): os.mkdir(blast_db)
427 |     os.chdir(blast_db)
428 |     # makeblast(uniprot,'pr','prot')
429 |     blastp(evm_pr_path +'/pr_merge.fa','blastp.txt','pr',24)
430 | 
431 | # import time
432 | # st = time.time()
433 | # main_blast()
434 | # print time.time() - st
435 | 
436 | def add_gene_name(x,rna_pr_dic):
437 |     ids = '.'.join(re.search('picr.+?(?=;)',x).group(0).split('.')[:2])
438 |     if ids in rna_pr_dic:
439 |         res = x + ';gene=' + rna_pr_dic[ids]
440 |     else:
441 |         res = x
442 |     return res
443 | 
444 | # add function of mapped genes to gff file
445 | def add_gene_function(blast_db,evm_path):
446 |     '''add gene symbols to the gff file. the information comes from the blast results
447 |     '''
448 |     blastp_fn = blast_db + '/blastp.txt'
449 |     blast_df = pd.read_csv(blastp_fn,sep='\t',usecols=[0,1,2],names=['ref','query','per'])
450 |     blast_df = blast_df[blast_df['per'].values>50]
451 |     blast_df['rna'] = blast_df['ref'].map(lambda x: '.'.join(x.split('.')[-2:]))
452 |     blast_df['pr'] = blast_df['query'].map(lambda x: x.split('|')[-1].split('_')[0])
453 |     rna_pr_dic = blast_df.set_index('rna')['pr'].to_dict()
454 | 
455 |     evm_gff= evm_path + '/evm.merge.gff'
456 |     gff_df = pd.read_csv(evm_gff,sep='\t',header=None)
457 |     gff_df[8] = gff_df[8].map(lambda x: add_gene_name(x,rna_pr_dic))
458 |     gff_df = gff_df[~gff_df[8].map(lambda x: 'gene=LORF2' in x)]
459 |     gff_df.to_csv(blast_db +'/final.gff',sep='\t',index=False,header=None)  # header=None keeps a pandas header row out of the gff
460 | 
461 | # add_gene_function(blast_db,evm_path)
462 | #===============================================================================
463 | #        process the gmap results and exonerate results directly
464 | #===============================================================================
465 | #=============== 1. get all mapped geneid, rna_accession, pr_accession
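# Hedged sketch of the Entrez lookup that gene_rna_pr_id below relies on
# (the accession is a placeholder, not a real record): fetch one GenBank
# record and scrape its GeneID cross-reference. NCBI rate-limits efetch,
# so a long list of new accessions takes a while.
# rec = Entrez.efetch(db='nucleotide',id='XM_0000000',rettype='gb',retmode='text').read()
# geneid = re.search('(?<=GeneID:).+?(?=\")',rec).group(0)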
466 | def gene_rna_pr_id(hamster_id,gmap_gff,out_fn):
467 |     '''this function gets all gene, rna and protein ids, including both refseq and gff information.
468 |     * hamster_id: a file that has all ids in the hamster.gff file
469 |     * gmap_gff: gff results mapped using gmap
470 |     * out_fn: output file with geneid, rna accession and protein accession columns
471 |     '''
472 |     # rna accession in gff file
473 |     ham_id_df = pd.read_csv(hamster_id,sep='\t',header=0)
474 |     ham_id_df = ham_id_df.astype('str')
475 |     ham_id_df['TrAccess'] = ham_id_df['TrAccess'].map(lambda x: x.split('.')[0])
476 |     ham_id_df['PrAccess'] = ham_id_df['PrAccess'].map(lambda x: x.split('.')[0])
477 |     rna_gene_dic = ham_id_df.set_index('TrAccess')['GeneID'].to_dict()
478 |     rna_pr_dic = ham_id_df.set_index('TrAccess')['PrAccess'].to_dict()
479 |     #-------- read rna gff file
480 |     rna_df = pd.read_csv(gmap_gff,sep='\t',header=None,comment='#')
481 |     # add rna accession column
482 |     rna_df['rna_ac'] = rna_df[8].map(lambda x: re.search('(?<=ID=).+?(?=\.)',x).group(0))
483 |     mrna = list(set(rna_df['rna_ac'].tolist()))
484 |     # new rna in refseq compared to gff
485 |     new_ref_rna = list(set(mrna) - set(rna_gene_dic.keys()))
486 |     # get geneid for new ref_rna gene id
487 |     for r in new_ref_rna:
488 |         handle = Entrez.efetch(db='nucleotide',id=r,rettype='gb',retmode='text').read()
489 |         geneid = re.search('(?<=GeneID:).+?(?=\")',handle).group(0)
490 |         try:
491 |             p = re.search('(?<=protein_id=\").+?(?=\.)',handle).group(0)
492 |         except:
493 |             p = '-'
494 |         rna_gene_dic[r] = geneid
495 |         rna_pr_dic[r] = p
496 |     # transfer dic to dataframe
497 |     r_g_df = pd.DataFrame.from_dict(rna_gene_dic,'index')
498 |     r_g_df.columns = ['geneid']
499 |     r_p_df = pd.DataFrame.from_dict(rna_pr_dic,'index')
500 |     r_p_df.columns = ['pr_ac']
501 |     g_r_p_df = pd.concat([r_g_df,r_p_df],axis=1)
502 |     g_r_p_df['rna_ac'] = g_r_p_df.index
503 |     g_r_p_df[['geneid','rna_ac','pr_ac']].to_csv(out_fn,sep='\t',index=False)
504 | 
505 | gmap_exon_path = path + '/gmap_exonerate'
506 | if not os.path.exists(gmap_exon_path): os.mkdir(gmap_exon_path)
507 | os.chdir(gmap_exon_path)
508 | # gmap_gff = PASA_path + '/gmap.spliced_alignments.gff3'
509 | # g_r_p_id_fn = gmap_exon_path + '/01_gene_rna_pr.txt'
510 | # gene_rna_pr_id(hamster_id,gmap_gff,g_r_p_id_fn)
511 | 
512 | 
513 | def get_consensus_map(rna_df,pr_df,gene,rna_ac,pr_ac):
514 |     '''this function checks whether the rna map and protein map have the same splice sites
515 |     * rna_df: mRNA map to genome gff dataframe with additional rna_ac column
516 |     * pr_df: protein map to genome dataframe with additional 'pr_ac' and 'pr_id' column
517 |     '''
518 |     if not rna_df.empty:
519 |         # get rna scaffold name, if more than 1 scaffold then don't add its annotation
520 |         rna_chr = list(set(rna_df[0].tolist()))
521 |         if len(rna_chr) != 1:
522 |             assert False, rna_ac + ' maps to multiple scaffolds'
523 |         else:
524 |             rna_chr = rna_chr[0]
525 |         # get strand, if mapped to both strands don't output
526 |         rna_str = list(set(rna_df[6].tolist()))
527 |         if len(rna_str) != 1:
528 |             assert False, rna_ac + ' maps to both strands'
529 |         else:
530 |             rna_str = rna_str[0]
531 |         # get rna splice sites
532 |         rna_splice = natsorted(rna_df[3].tolist() + rna_df[4].tolist())
533 |         # change exon id
534 |         n = 1
535 |         for i,row in rna_df.iterrows():
536 |             item = row[8].split(';')
537 |             anno = '.'.join(item[0].split('.')[:-1])+'_'+str(n)+';'+ ';'.join(item[1:])+';Parent='+gene+';GeneID='+gene
538 |             rna_df.loc[i,8] = anno
539 |             rna_df.loc[i,2] = 'exon'
540 |             n += 1
541 |         #--------------- process protein gff information
542 |         if not pr_df.empty:
543 |             pr_id = pr_df['pr_id'].tolist()[0]
544 |             sub_pr_df = pr_df[(pr_df['pr_id'].values==pr_id) & (pr_df[0].values==rna_chr)].copy()
545 |             # change cds id
546 |             m = 1
547 |             for i,row in sub_pr_df.iterrows():
548 |                 item = row[8].split(';')
549 |                 anno = 'ID='+pr_ac+'_'+str(m)+';'+';'.join(item[1:])+';Parent='+rna_ac+';GeneID='+gene
550 |                 sub_pr_df.loc[i,8] = anno
551 |                 sub_pr_df.loc[i,2] = 'CDS'
552 |                 m += 1
553 |             pr_splice = natsorted(sub_pr_df[3].tolist() + sub_pr_df[4].tolist())
554 |             if sub_pr_df.shape[0] == 1:
555 |                 # single-CDS protein: keep it only if the CDS falls inside the
556 |                 # rna span (reconstructed check, treat as an assumption)
557 |                 if not (rna_splice[0] <= pr_splice[0] and pr_splice[-1] <= rna_splice[-1]):
558 |                     sub_pr_df = pd.DataFrame()
559 |             elif len(pr_splice) <= len(rna_splice):
560 |                 # internal protein splice sites must be shared with the rna (reconstructed check, treat as an assumption)
561 |                 if not set(pr_splice[1:-1]).issubset(set(rna_splice)):
562 |                     sub_pr_df = pd.DataFrame()
563 |             elif len(pr_splice) > len(rna_splice):
564 |                 print('protein has more splice than rna, rna/pr: %s/%s' % (len(rna_splice),len(pr_splice)))
565 |                 sub_pr_df = pd.DataFrame()
566 |         else:
567 |             sub_pr_df = pr_df
568 |         return rna_df,sub_pr_df,rna_chr,rna_splice[0],rna_splice[-1],rna_str
569 | 
570 | 
571 | import time
572 | process_start = time.time()
573 | 
574 | def gmap_exonerate_merge_gff(gmap_gff,exonerate_gff,gmap_exon_path,all_id_fn):
575 |     #-------- read gmap gff file
576 |     rna_df = pd.read_csv(gmap_gff,sep='\t',header=None,comment='#')
577 |     rna_df['rna_ac'] = rna_df[8].map(lambda x: re.search('(?<=ID=).+?(?=\.)',x).group(0))
578 |     # get multi mapping mRNAs (gmap names secondary alignment paths .path2, .path3, ...)
579 |     multi_map_rna = list(set(rna_df[rna_df[8].map(lambda x: 'path2' in x)]['rna_ac'].tolist()))
580 |     # build gene rna protein id dictionary
581 |     g_r_p_dic = {}
582 |     g_r_p_id_fn = gmap_exon_path + '/01_gene_rna_pr.txt'
583 |     handle = open(g_r_p_id_fn)
584 |     for line in handle:
585 |         item = line.strip().split('\t')
586 |         if item[1] in multi_map_rna:
587 |             continue
588 |         if item[0] in g_r_p_dic:
589 |             g_r_p_dic[item[0]][item[1]] = item[2]
590 |         else:
591 |             g_r_p_dic[item[0]] = {item[1]:item[2]}
592 |     #-------- read exonerate gff file
593 |     pr_df = pd.read_csv(exonerate_gff,sep='\t',header=None)  # read the exonerate gff passed in by the caller
594 |     pr_df['pr_ac'] = pr_df[8].map(lambda x: re.search('(?<=Target=).+?(?=\.)',x).group(0))
595 | 
596 |     def output_consensus_rna_pr(g,out_handle):
597 |         '''this function finds the consistent rna and protein and outputs them to file
598 |         g_r_p_dic: dictionary that has all the gene, rna and protein ids.
599 |         g: gene id
600 |         rna_df: rna gff dataframe
601 |         pr_df: protein gff dataframe
602 |         '''
603 |         g_n = 0
604 |         rna_pr_dic = g_r_p_dic[g]
605 |         for rna in rna_pr_dic:
606 |             pr = rna_pr_dic[rna]
607 |             single_rna_df = rna_df[rna_df['rna_ac'].values==rna].copy()
608 |             single_rna_df = single_rna_df.reset_index(drop=True)
609 |             if not single_rna_df.empty:
610 |                 single_pr_df = pr_df[pr_df['pr_ac'].values==pr].copy()
611 |                 single_pr_df = single_pr_df.reset_index(drop=True)
612 |                 single_pr_df.loc[:,'pr_id'] = single_pr_df[8].map(lambda x: re.search('(?<=ID=).+?(?=;)',x).group(0))
613 |                 res_rna_df,res_pr_df,chrome,start,end,strand=get_consensus_map(single_rna_df,single_pr_df,str(g),rna,pr)
614 |                 if g_n == 0:
615 |                     out_handle.write('\t'.join([chrome,'gmap_exonerate','gene',str(start),str(end),'.',\
616 |                         strand,'.','ID='+str(g)+';GeneID='+str(g)])+'\n')
617 |                     g_n += 1
618 |                 if not res_rna_df.empty:
619 |                     if rna.startswith('XR'):
620 |                         feature = 'lncRNA'
621 |                     else:
622 |                         feature = 'mRNA'
623 |                     out_handle.write('\t'.join([chrome,'gmap_exonerate',feature,str(start),str(end),'.',\
624 |                         strand,'.','ID='+rna+';Parent='+str(g)+';GeneID='+str(g)])+'\n')
625 |                     res_rna_df[range(9)].to_csv(out_handle,sep='\t',index=False,header=None)
626 |                 if not res_pr_df.empty:
627 |                     res_pr_df[range(9)].to_csv(out_handle,sep='\t',index=False,header=None)
628 | 
629 |     out_fn = '02_gmap_exonerate.gff'
630 |     if os.path.exists(out_fn): os.remove(out_fn)
631 |     with open(out_fn,'a') as f:
632 |         for g in g_r_p_dic.keys():
633 |             output_consensus_rna_pr(g,f)
634 |     # define a function to find the start and end position of each gene
635 |     def get_gene_s_e(gene_df):
636 |         pos = gene_df[3].tolist() + gene_df[4].tolist()
637 |         gene_df.iloc[0,3] = min(pos)
638 |         gene_df.iloc[0,4] = max(pos)
639 |         return gene_df
640 |     # correct gene coordinates
641 |     gff_df = pd.read_csv(out_fn,sep='\t',header=None)
642 |     gff_df['geneid'] = gff_df[8].map(lambda x: re.search('(?<=GeneID=).+?(?=$)',x).group(0))
643 |     res_df = gff_df.groupby('geneid').apply(get_gene_s_e)
644 |     # add gene name
645 |     all_id_df = pd.read_csv(all_id_fn,sep='\t',header=0)
646 |     all_id_df = all_id_df.astype('str')
647 |     g_s_dic = all_id_df.set_index('GeneID')['GeneSymbol'].to_dict()
648 |     res_df[8] = res_df.apply(lambda row: row[8]+';GeneName='+g_s_dic[row['geneid']] if row['geneid'] in g_s_dic else row[8]+';GeneName=NA',axis=1)
649 |     res_df[range(9)].to_csv('02_gmap_exonerate.gff',sep='\t',index=False,header=None)
650 | 
651 | # gmap_gff = PASA_path+'/gmap.spliced_alignments.gff3'
652 | # gmap_exonerate_merge_gff(gmap_gff,pr_gff,gmap_exon_path,hamster_id)
653 | 
654 | print(time.time() - process_start)
655 | #===============================================================================
656 | #                     RATT
657 | #===============================================================================
658 | def fa2embl(fa,embl,gff,path):
659 |     if not os.path.exists(path): os.mkdir(path)
660 |     os.chdir(path)
661 |     df = pd.read_csv(gff,sep='\t',header=None,comment='#',usecols=[0,2])
662 |     df = df[df[2].values=='gene']
663 |     chroms = list(set(df[0].tolist()))
664 |     dic = SeqIO.index(fa,'fasta')
665 |     for s in chroms:
666 |         SeqIO.write(dic[s],open('fa','w'),'fasta')
667 |         sarge.run('grep \'{s}\' {gff} > gff'.format(s=s,gff=gff))
668 |         sarge.run('/home/shangzhong/Installation/EMBOSS-6.6.0/bin/seqret \
669 |         -sequence fa -feature -fformat gff -fopenfile1 gff -osformat2 embl \
670 |         -auto -outseq {s}.embl'.format(s=s))
671 |     fns = glob.glob('*.embl')
672 |     sarge.run('cat {files} > {embl}'.format(files=' '.join(fns),embl=embl))
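# fa2embl above prepares reference input for RATT, which expects EMBL-format
# annotation: for every scaffold that carries a gene, EMBOSS seqret fuses the
# scaffold's fasta sequence with its gff features into one .embl file, and
# the per-scaffold files are then concatenated into a single embl file.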
673 |     # for f in fns:
674 |     #     os.remove(f)
675 | # fa2embl('/data/genome/hamster/ncbi_refseq/hamster.fa','hamster.embl','/data/genome/hamster/ncbi_refseq/hamster.gff','/data/shangzhong/Picr_assembly/Annotation/RATT/embl')
676 | 
677 | 
678 | 
--------------------------------------------------------------------------------