├── Modules
│   ├── BLAST.py
│   ├── __init__.py
│   ├── GATK.pyc
│   ├── HTseq.pyc
│   ├── Homer.pyc
│   ├── Lumpy.pyc
│   ├── Picard.pyc
│   ├── Aligner.pyc
│   ├── CNVnator.pyc
│   ├── GeneMark.pyc
│   ├── PBhoney.pyc
│   ├── SVTyper.pyc
│   ├── Samtools.pyc
│   ├── Sniffles.pyc
│   ├── StringTie.pyc
│   ├── __init__.pyc
│   ├── Trimmomatic.pyc
│   ├── f01_file_process.pyc
│   ├── __pycache__
│   │   ├── HTseq.cpython-34.pyc
│   │   ├── Aligner.cpython-34.pyc
│   │   ├── Samtools.cpython-34.pyc
│   │   ├── __init__.cpython-34.pyc
│   │   ├── __init__.cpython-35.pyc
│   │   ├── Trimmomatic.cpython-34.pyc
│   │   ├── f01_file_process.cpython-34.pyc
│   │   └── f01_file_process.cpython-35.pyc
│   ├── Sniffles.py
│   ├── SVTyper.py
│   ├── GeneMark.py
│   ├── Lumpy.py
│   ├── StringTie.py
│   ├── HTseq.py
│   ├── Samtools.py
│   ├── CNVnator.py
│   ├── Picard.py
│   ├── PBhoney.py
│   ├── Trimmomatic.py
│   ├── f02_parse_gff.py
│   ├── Homer.py
│   ├── Aligner.py
│   ├── f01_file_process.py
│   └── GATK.py
├── Parameters
│   ├── SV_Pacbio_Sniffle.yaml
│   ├── SV_Pacbio_PBHoney.yaml
│   ├── CNVnator.yaml
│   ├── StringTie_quant.yaml
│   ├── RibosomeProfiling.yaml
│   ├── GRO_Seq_Cap.yaml
│   ├── STAR_get_bam.yaml
│   ├── SV_Illumina_lumpy.yaml
│   ├── GATK_RNA_CHO.yaml
│   └── GATK_DNA_CHO.yaml
├── README.md
├── SV_Pacbio_Sniffle.py
├── mapped_BAM_to_fastq.ipynb
├── STAR_get_bam.py
├── SV_Pacbio_PBHoney.py
├── Salmon_quant.py
├── StringTie_quant.py
├── CNV_CNVnator.py
├── SV_Illumina_Lumpy.py
├── RNAseq_count.py
├── RibosomeProfiling.py
├── RNAseq_STARpipeline.sh
├── GRO_Seq_Cap.py
├── VCF_snpEff_annotation.py
├── GATK_RNA_CHO.py
├── GATK_DNA_CHO.py
├── Eukaryote_genome_annotation.py
└── Genome_Annotation.py

--------------------------------------------------------------------------------
/Modules/BLAST.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/Modules/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/Parameters/SV_Pacbio_Sniffle.yaml:
--------------------------------------------------------------------------------
contact: 'user@gmail.com' # can be email address or phone_number@txt.att.net
RawDataPath: '/data/shangzhong/Pacbio/sniffle'
thread: 16
# database parameters
ref_fa: '/data/genome/hamster/picr/picr.fa'
bwa_db: '/data/genome/hamster/new_pacbio_assemble/bwaDb'
aligner: 'ngmlr'

--------------------------------------------------------------------------------
/Modules/Sniffles.py:
--------------------------------------------------------------------------------
import sarge,sys

def sniffle(bam,outVCF,otherParameters=['']):
    """run Sniffles to detect structural variants from PacBio alignments"""
    cmd = ('sniffles -m {bam} -v {outVCF} ').format(bam=bam,outVCF=outVCF)
    if otherParameters != ['']:
        cmd = cmd + ' '.join(otherParameters)
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)

--------------------------------------------------------------------------------
/Parameters/SV_Pacbio_PBHoney.yaml:
--------------------------------------------------------------------------------
contact: 'user@gmail.com' # can be email address or phone_number@txt.att.net
RawDataPath: '/data/shangzhong/Pacbio/fa'
thread: 24
# database parameters
ref_fa: '/data/genome/hamster/multi_pacbio_assemble/picr.fa'
sa_index: '/data/genome/hamster/multi_pacbio_assemble/picr.fa.sa'
# tool specific parameters
blasr_jobs_per_batch: 2
sam_sort_jobs_per_batch: 6
--------------------------------------------------------------------------------
/Parameters/CNVnator.yaml:
--------------------------------------------------------------------------------
contact: 'user@gmail.com' # can be email address or phone_number@txt.att.net
RawDataPath: '/data/shangzhong/Pacbio/CHOS_illu_DNA/cnv'
thread: 12
# database parameters
ref_fa: '/data/genome/hamster/ncbi_refseq/hamster.fa'
# tool specific parameters
trim_reads: False
trimmomatic_path: '/home/shangzhong/Installation/Trimmomatic-0.32/Trimmomatic-0.33/trimmomatic-0.33.jar'
trim_jobs_per_batch: 6
adapter: ''

bwa_jobs_per_batch: 2
bwa_Db: '/data/genome/hamster/ncbi_refseq/bwa_Db' # should be a folder

bin_win: 100
chrom: ['-chrom NW_006887432.1']
--------------------------------------------------------------------------------
/Parameters/StringTie_quant.yaml:
--------------------------------------------------------------------------------
contact: 'user@gmail.com' # can be email address or phone_number@txt.att.net
RawDataPath: '/data/shangzhong/Proteogenomics/fq'
thread: 12
QC: False
# database parameters
ref_fa: '/data/genome/hamster/picr/picr.fa'
gff: '/data/genome/hamster/picr/updated_final.gff3'
# tool specific parameters
trim_reads: False
trimmomatic_path: '/home/shangzhong/Installation/Trimmomatic-0.32/Trimmomatic-0.33/trimmomatic-0.33.jar'
trim_jobs_per_batch: 6
adapter: ''

star_jobs_per_batch: 1
STAR_index_path: '/data/genome/hamster/picr/picr_STAR_Db'
--------------------------------------------------------------------------------
/Modules/SVTyper.py:
--------------------------------------------------------------------------------
import sarge,sys
def svtyper(in_vcf,out_vcf,bam):
    '''
    run svtyper to add genotypes to an SV vcf
    '''
    # 1. generate the library json file and index the bam
    json = bam[:-3] + 'json'
    cmd = ('svtyper -B {bam} -l {j} && samtools index {bam}').format(bam=bam,j=json)
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)
    # 2. plot the library statistics
    sarge.run('lib_stats.R {j} {j}.pdf'.format(j=json))
    # 3. run svtyper on the input vcf
    cmd = ('svtyper -B {bam} -i {invcf} -l {j} -o {out}').format(
        bam=bam,invcf=in_vcf,j=json,out=out_vcf)
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)
--------------------------------------------------------------------------------
/Parameters/RibosomeProfiling.yaml:
--------------------------------------------------------------------------------
contact: 'user@gmail.com' # can be email address or phone_number@txt.att.net
RawDataPath: '/data/shangzhong/DE/tissue'
thread: 12
QC: False
# database parameters
rRNA_fa: '/data/shangzhong/DE/tissue/rtRNA.fa'
ref_fa: '/data/genome/hamster/picr/picr.fa'
gff: '/data/genome/hamster/picr/updated_final.gff3'
# tool specific parameters
trim_reads: False
trim_jobs_per_batch: 6
adapter: ''

hisat2_jobs_per_batch: 2
hisat2_rrna_index: '/data/shangzhong/DE/tissue/rRNA_Db' # should be a path
hisat2_target_index: '/data/genome/hamster/picr/hisat2_rRNA_Db'

other: ['']
--------------------------------------------------------------------------------
/Modules/GeneMark.py:
--------------------------------------------------------------------------------
import pandas as pd
import sarge,sys
def geneMark_ES(ref_fa,other_params=['']):
    '''run GeneMark-ES'''
    cmd = ('gmes_petap.pl --ES {other} --sequence {fa}').format(fa=ref_fa,
                    other=' '.join(other_params))
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)
    sarge.run('genemark_gtf2gff3 genemark.gtf > genemark.gff') # this script is distributed with MAKER
    df = pd.read_csv('genemark.gff',sep='\t',comment='#',header=None)
    df[0] = df[0].map(lambda x: x.split(' ')[0])
    df.to_csv('genemark.gff3',sep='\t',index=False,header=None)
    #return os.getcwd() +'/genemark.gff'
--------------------------------------------------------------------------------
/Modules/Lumpy.py:
--------------------------------------------------------------------------------
import sarge,sys,os
def lumpyexpress(in_bams,out_vcf,others=['']):
    '''This function runs lumpyexpress
    * in_bams: sorted bams'''
    bams = ','.join(in_bams)
    splits = ','.join([b[:-3]+'split.bam' for b in in_bams])
    discs = ','.join([b[:-3]+'disc.bam' for b in in_bams])
    cmd = ('lumpyexpress -B {bams} -S {splits} -D {discs} {other} '
           '-k -o {out}').format(
        bams=bams,splits=splits,discs=discs,out=out_vcf,other=' '.join(others))
    with open('cmd.sh','w') as f:
        f.write('#!/bin/bash\n' + cmd)
    print(cmd);sys.stdout.flush()
    sarge.run('chmod 777 cmd.sh && ./cmd.sh && rm cmd.sh')
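SV_Illumina_Lumpy.py (not shown in this excerpt) presumably chains the two modules above: lumpyexpress for raw calls, then svtyper for genotypes. A minimal sketch, assuming sorted BAMs with matching `.split.bam`/`.disc.bam` files already exist; the file names are hypothetical:

```python
# Minimal sketch of chaining Lumpy and SVTyper; inputs are hypothetical.
from Modules.Lumpy import lumpyexpress
from Modules.SVTyper import svtyper

sorted_bams = ['lane1.sort.bam', 'lane2.sort.bam']  # hypothetical sorted bams
lumpyexpress(sorted_bams, 'chos.vcf')               # raw SV calls
svtyper('chos.vcf', 'chos.gt.vcf', sorted_bams[0])  # add genotypes (first bam shown)
```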
--------------------------------------------------------------------------------
/Parameters/GRO_Seq_Cap.yaml:
--------------------------------------------------------------------------------
contact: 'user@gmail.com' # can be email address or phone_number@txt.att.net
RawDataPath: '/data/shangzhong/TSS/fq'
thread: 6
QC: False
# database parameters
ref_fa: '/data/genome/hamster/picr/picr.fa'
gff: '/data/genome/hamster/picr/picr.gff3'
# tool specific parameters
trim_reads: False
trimmomatic_path: '/home/shangzhong/Installation/Trimmomatic-0.32/Trimmomatic-0.33/trimmomatic-0.33.jar'
trim_jobs_per_batch: 6
adapter: '/data/shangzhong/TSS/fq/Gro_adapter.txt'

star_jobs_per_batch: 1
STAR_index_path: '/data/genome/hamster/picr/picr_STAR_Db' #'/opt/genome/cho/STAR_Db' # aligner index
--------------------------------------------------------------------------------
/Parameters/STAR_get_bam.yaml:
--------------------------------------------------------------------------------
contact: 'user@gmail.com' # can be email address or phone_number@txt.att.net
RawDataPath: '/data/shangzhong/DE/helene'
thread: 8
# database parameters
ref_fa: '/data/genome/hamster/picr/picr.fa'
gff: ''
# tool specific parameters
trim_reads: True
trimmomatic_path: '/home/shangzhong/Installation/Trimmomatic-0.32/Trimmomatic-0.33/trimmomatic-0.33.jar'
trim_jobs_per_batch: 6
adapter: ''

picard: '/home/shangzhong/Installation/picard-tools-1.141/picard.jar'

star_jobs_per_batch: 1 # at most 2
star_index: '/data/genome/hamster/multi_pacbio_assemble/picr_STAR_Db'
star_pass: 2

star_params: [''] # should be a list of strings
--------------------------------------------------------------------------------
/Parameters/SV_Illumina_lumpy.yaml:
--------------------------------------------------------------------------------
contact: 'user@gmail.com' # can be email address or phone_number@txt.att.net
RawDataPath: '/data/shangzhong/Pacbio/CHOS_illu_DNA'
thread: 16
# database parameters
ref_fa: '/data/genome/hamster/new_pacbio_assemble/ch_illumina_pbj.fasta'
# tool specific parameters
trim_reads: True
trimmomatic_path: '/home/shangzhong/Installation/Trimmomatic-0.32/Trimmomatic-0.33/trimmomatic-0.33.jar'
trim_jobs_per_batch: 6
adapter: ''

bwa_jobs_per_batch: 2
bwa_index: '/data/genome/hamster/new_pacbio_assemble/bwaDb'
read_groups: ['@RG\tID:lane1\tSM:CHOS','@RG\tID:lane2\tSM:CHOS',
              '@RG\tID:lane3\tSM:CHOS','@RG\tID:lane4\tSM:CHOS',
              '@RG\tID:lane5\tSM:CHOS','@RG\tID:lane6\tSM:CHOS']
--------------------------------------------------------------------------------
/Parameters/GATK_RNA_CHO.yaml:
--------------------------------------------------------------------------------
contact: 'user@gmail.com' # can be email address or phone_number@txt.att.net
RawDataPath: '/data/shangzhong/Proteogenomics/test'
thread: 9
# database parameters
ref_fa: '/data/genome/hamster/ncbi_refseq/hamster.fa'
# tool specific parameters
trim_reads: True
trimmomatic_path: '/home/shangzhong/Installation/Trimmomatic-0.32/Trimmomatic-0.33/trimmomatic-0.33.jar'
trim_jobs_per_batch: 6
adapter: ''

picard: '/home/shangzhong/Installation/picard-tools-1.141/picard.jar'
gatk: '/home/shangzhong/Installation/GenomeAnalysisTK-3.5/GenomeAnalysisTK.jar'

star_jobs_per_batch: 1
star_index: '/data/genome/hamster/ncbi_refseq/hamster_STAR_Db'

sample_name: 'hamster'
read_groups: ['@RG\tID:CellLine8_2\tSM:CellLine8_2','@RG\tID:CellLine8_3\tSM:CellLine8_3']
--------------------------------------------------------------------------------
/Parameters/RNAseq_count.yaml:
--------------------------------------------------------------------------------
contact: 'user@gmail.com' # can be email address or phone_number@txt.att.net
RawDataPath: '/data/shangzhong/DE/mouse/fq'
thread: 12
QC: False
# database parameters
ref_fa: '/data/genome/cho/chok1.fa'
gff: '/data/genome/cho/chok1.gff'
# tool specific parameters
trim_reads: False
trimmomatic_path: '/home/shangzhong/Installation/Trimmomatic-0.32/Trimmomatic-0.33/trimmomatic-0.33.jar'
trim_jobs_per_batch: 6
adapter: ''

star_jobs_per_batch: 1 # at most 2
STAR_index_path: '/data/genome/cho/cho_STAR_Db' #'/opt/genome/cho/STAR_Db' # aligner index

htseq_anno_source: 'ncbi' # alternative values: 'ncbi', 'ensembl' or leave it empty.
strand_specific: 'no' # 'yes', 'no' or 'reverse'
id_name: 'id' # set 'id' for gene ids in the count result, 'name' for gene names
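All of these parameter files are consumed the same way: each pipeline script reads the YAML and wraps the resulting dict with dic2obj from Modules/f01_file_process.py, so keys become attributes (p.ref_fa, p.thread, ...). That module is not part of this excerpt; a minimal sketch of what dic2obj presumably does, for orientation only:

```python
# Hypothetical reconstruction of dic2obj; the real implementation lives in
# Modules/f01_file_process.py, which is not shown in this excerpt.
class dic2obj(object):
    def __init__(self, **entries):
        # expose YAML keys as attributes: p.ref_fa, p.thread, ...
        self.__dict__.update(entries)

# usage, as in the pipeline scripts:
# with open(parameter_file) as f:
#     p = dic2obj(**yaml.load(f))
```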
--------------------------------------------------------------------------------
/Parameters/GATK_DNA_CHO.yaml:
--------------------------------------------------------------------------------
contact: 'user@gmail.com' # can be email address or phone_number@txt.att.net
RawDataPath: '/data/shangzhong/Proteogenomics/test'
thread: 9
# database parameters
ref_fa: '/data/genome/hamster/ncbi_refseq/hamster.fa'
gff: '/data/genome/hamster/ncbi_refseq/hamster.gff'
# tool specific parameters
trim_reads: True
trimmomatic_path: '/home/shangzhong/Installation/Trimmomatic-0.32/Trimmomatic-0.33/trimmomatic-0.33.jar'
trim_jobs_per_batch: 6
adapter: ''


QC: True
picard: '/home/shangzhong/Installation/picard-tools-1.141/picard.jar'
gatk: '/home/shangzhong/Installation/GenomeAnalysisTK-3.5/GenomeAnalysisTK.jar'

bwa_jobs_per_batch: 2
bwa_db: '/data/genome/hamster/ncbi_refseq/bwa_Db'

sample_name: 'hamster'
read_groups: ['@RG\\tID:CellLine8_2\\tSM:CellLine8_2','@RG\\tID:CellLine8_3\\tSM:CellLine8_3']
--------------------------------------------------------------------------------
/Modules/StringTie.py:
--------------------------------------------------------------------------------
import sarge,sys,glob
import pandas as pd
from natsort import natsorted

def stringtie(in_bam,out_gtf,thread,annotation):
    '''
    run StringTie on one bam; besides the assembled gtf it writes per-gene
    abundances (.abund.tab) and the fully covered reference transcripts (.cov_ref.gtf)
    '''
    quant = out_gtf[:-3] + 'abund.tab'
    cov_ref = out_gtf[:-3] + 'cov_ref.gtf'
    cmd = ('stringtie {bam} -o {gtf} -p {t} -G {gff} -A {q} \
    -C {cov}').format(bam=in_bam,gtf=out_gtf,t=str(thread),
                      gff=annotation,q=quant,cov=cov_ref)
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)


def merge_stringtie_tpm(path):
    """This function merges per-sample TPM results into one dataframe.
    Each input .abund.tab file contributes one column [gene name, tpm].
    """
    files = natsorted(glob.glob(path + '/*.tab'))
    dfs = []
    for f in files:
        sp = f.split('/')[-1].split('.')[0]
        df = pd.read_csv(f,sep='\t',header=0,usecols=[1,8],names=['name',sp],index_col=0)
        df = df[df.index.values !='-']
        df = df.groupby('name').sum()
        dfs.append(df)
    res_df = pd.concat(dfs,axis=1)
    return res_df
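Once StringTie_quant.py has produced one .abund.tab per sample, the merge helper above can be used like this (a minimal sketch; the output file name is hypothetical):

```python
# Build a gene-by-sample TPM matrix from a directory of StringTie abundance
# tables. 'stringtie' is the output folder the StringTie_quant.py pipeline
# creates; 'tpm_matrix.csv' is a hypothetical output name.
from Modules.StringTie import merge_stringtie_tpm

tpm = merge_stringtie_tpm('stringtie')  # columns are sample names
tpm.to_csv('tpm_matrix.csv')
```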
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# NewPipeline
Pipelines to process NGS or Pacbio data
---------------------------------------

## Method to run these pipelines
#### Paired-end files should end with _1.fq.gz, _2.fq.gz or _1.fastq.gz, _2.fastq.gz. Single-end files should end with _1.fq.gz
#### STAR takes a lot of memory (30-50 GB) per run, so don't run more than 2 STAR jobs in parallel in each batch.
1. Define all parameters in the corresponding parameter file in the Parameters folder.
2. In a bash terminal, run the following command:
    * nohup python pipeline.py parameter.yaml > log.txt &

   Or, if you are running in screen, try the following command:
    * python pipeline.py parameter.yaml 2>&1 | tee log.txt
3. Press enter

* Finished Pipelines
    * RNAseq_count: quantify the number of reads mapping to each gene
    * GATK_RNA_CHO: call variants from RNAseq data
    * SV_Pacbio_PBHoney: call structural variants from Pacbio data using PBHoney
    * SV_Pacbio_Sniffle: call structural variants from Pacbio data using Sniffles


## Pipeline specific notes

### RNAseq_STARpipeline.sh

* Be aware: this is a bash pipeline and does not manage flow and reruns like ruffus.
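The naming convention above is what the shared helper list_fq_files (in Modules/f01_file_process.py, not included in this excerpt) relies on to pair reads. A hedged sketch of that pairing logic, for illustration only:

```python
# Hypothetical sketch of how paired-end fastq files could be grouped under
# the naming convention above; the real implementation is list_fq_files in
# Modules/f01_file_process.py, which this excerpt does not include.
import os, re

def pair_fastqs(path):
    fqs = sorted(f for f in os.listdir(path) if re.search(r'\.f(ast)?q\.gz$', f))
    pairs = {}
    for f in fqs:
        key = re.sub(r'_[12]\.f(ast)?q\.gz$', '', f)  # strip the _1/_2 suffix
        pairs.setdefault(key, []).append(f)
    # each value is [sample_1.fq.gz, sample_2.fq.gz] or [sample_1.fq.gz]
    return list(pairs.values())
```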
--------------------------------------------------------------------------------
/Modules/HTseq.py:
--------------------------------------------------------------------------------
import os,sys
import sarge

def htseq_count(sortedBam,countFile,annotation,strand,annotationSource):
    """This function runs htseq-count to count reads per feature in a bam file
    * sortedBam: str. bam file name
    * countFile: str. output file name
    * annotation: str. annotation file
    * strand: str. 'yes', 'no' or 'reverse'
    * annotationSource: str. 'ncbi', 'ensembl', 'genedb' or 'plasmodium'
    """
    # 1. choose feature type and id attribute for the annotation source
    if annotationSource == 'ncbi':
        seqType = 'exon'
        id_attr = 'gene'
    elif annotationSource == 'ensembl':
        seqType = 'exon'
        id_attr = 'gene_id'
    elif annotationSource == 'genedb':
        seqType = 'CDS'
        id_attr = 'Parent'
    elif annotationSource == 'plasmodium':
        seqType = 'exon'
        id_attr = 'Parent'
    else:
        raise ValueError('unknown annotation source: ' + annotationSource)
    # 2. run htseq-count
    cmd = ('htseq-count -f bam -s {strand} -t {type} -i {gene} {bam} {annotation} > {output}').format(strand=strand,
            type=seqType,gene=id_attr,bam=sortedBam,annotation=annotation,output=countFile)
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)


def Message(string,email):
    """
    This function sends a message to email when it runs.
    Used to mark when a pipeline starts, finishes or fails.
    """
    cmd = ('echo {quote}|mailx -s "{string}" {email}').format(quote="",string=string,email=email)
    sarge.run(cmd)
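htseq_count just shells out to htseq-count; with the NCBI settings, the call below (file names hypothetical) would build the command shown in the comment:

```python
# File names are hypothetical; with annotationSource='ncbi' this builds:
# htseq-count -f bam -s no -t exon -i gene sample.sort.bam hamster.gff > sample.txt
from Modules.HTseq import htseq_count

htseq_count('sample.sort.bam', 'sample.txt', 'hamster.gff',
            strand='no', annotationSource='ncbi')
```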
--------------------------------------------------------------------------------
/Modules/Samtools.py:
--------------------------------------------------------------------------------
import sarge
import sys

def sortBam(bamFile,sortedBamFile,thread=1,sortType=''):
    """
    This function sorts a bam file (by coordinate, or by name if sortType='name')
    """
    if sortType == 'name':
        tag = ' -n'
    else:
        tag = ''
    cmd = ('samtools sort{tag} -m 4G -@ {thread} -T {sort} -o {sortBam} {bam} ').format(
        tag=tag,thread=str(thread),sort=bamFile[:-3]+'sort',bam=bamFile,sortBam=sortedBamFile)
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)
    if sortType !='name':
        cmd = ('samtools index {bam} ').format(bam=sortedBamFile)
        print(cmd);sys.stdout.flush()
        sarge.run(cmd)

def sam2bam(samFile,bamFile,thread):
    """
    This function converts a sam file to a bam file
    """
    cmd = ('samtools view -b -@ {thread} -h {sam} -o {bam} ').format(
        thread=thread,sam=samFile,bam=bamFile)
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)


def build_fa_index(ref_fa):
    '''build a fai index for a fasta file (required by GATK)
    '''
    cmd = ('samtools faidx {ref}').format(ref=ref_fa)
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)


def merge_bams(bamfiles,outputbam):
    """this function merges bam files into one"""
    if len(bamfiles) == 1:
        cmd = ('mv {input} {output}').format(input=bamfiles[0],output=outputbam)
    else:
        bam = ' '.join(bamfiles)
        cmd = ('samtools merge -f {output} {input}').format(output=outputbam,input=bam)
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)
--------------------------------------------------------------------------------
/Modules/CNVnator.py:
--------------------------------------------------------------------------------
import sarge,sys


def cnv_extract_bam(in_bam,out_root,others=['']):
    '''
    extract read mappings from a bam file into a root file
    '''
    cmd = ('cnvnator -root {out} -unique -tree {bam} {other}').format(out=out_root,bam=in_bam,
                                                                     other=' '.join(others))
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)


def cnv_generate_hist(in_root,chr_path,bin_win,others=['']):
    '''
    generate the read-depth histogram
    '''
    cmd = ('cnvnator -root {root} -his {bin} -d {dir} {other}').format(root=in_root,
            bin=str(bin_win),dir=chr_path,other=' '.join(others))
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)


def cnv_statistics(in_root,bin_win,others=['']):
    cmd = ('cnvnator -root {root} -stat {bin} {other}').format(root=in_root,bin=str(bin_win),other=' '.join(others))
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)


def cnv_partitioning(in_root,bin_win,others=['']):
    cmd = ('cnvnator -root {root} -partition {bin} {other}').format(root=in_root,bin=str(bin_win),other=' '.join(others))
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)


def cnv_call(in_root,out,bin_win,others=['']):
    cmd = ('cnvnator -root {root} -call {bin} {other} > {out}').format(root=in_root,bin=str(bin_win),out=out,other=' '.join(others))
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)

# output_file = '/data/shangzhong/Pacbio/CHOS_illu_DNA/cnv/cnv/merge.txt'
# chr_path = '/data/shangzhong/Pacbio/CHOS_illu_DNA/cnv/cnv/scaffold'
# bin_win = 100
# others = ['-chrom NW_006887432.1']
# root = output_file[:-3] + 'root'
# cnv_generate_hist(root,chr_path,bin_win,others)
# # 3
# cnv_statistics(root,bin_win,others)
# # 4
# cnv_partitioning(root,bin_win,others)
# # 5
# cnv_call(root,output_file,bin_win,others)
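The commented block above hints at the intended order of calls; a minimal runnable sketch of the full CNVnator chain, with hypothetical paths and the bin size and chromosome list mirroring Parameters/CNVnator.yaml:

```python
# Minimal sketch of the full CNVnator call chain, following the commented
# driver above. Paths, bin size and chromosome list are hypothetical.
from Modules.CNVnator import (cnv_extract_bam, cnv_generate_hist,
                              cnv_statistics, cnv_partitioning, cnv_call)

bam      = 'sample.sort.bam'             # hypothetical sorted, indexed bam
root     = 'sample.root'
chr_path = 'scaffold'                    # folder of per-scaffold fasta files
bin_win  = 100
others   = ['-chrom NW_006887432.1']

cnv_extract_bam(bam, root, others)                   # 1. read mappings -> root file
cnv_generate_hist(root, chr_path, bin_win, others)   # 2. read-depth histogram
cnv_statistics(root, bin_win, others)                # 3. statistics
cnv_partitioning(root, bin_win, others)              # 4. partitioning
cnv_call(root, 'sample.cnv.txt', bin_win, others)    # 5. CNV calls
```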
--------------------------------------------------------------------------------
/Modules/Picard.py:
--------------------------------------------------------------------------------
import sarge
import sys,os

def sam2fq(inSam,outPrefix,picard,endType):
    """convert a sam/bam file to fastq file(s)
    """
    if endType == 'single':
        cmd = ('java -jar {picard} SamToFastq I={input} F={fq} '
               'VALIDATION_STRINGENCY=LENIENT ').format(
            picard=picard,input=inSam,fq=outPrefix+'.fq.gz')
    else:
        cmd = ('java -jar {picard} SamToFastq I={input} F={fq1} F2={fq2} '
               'VALIDATION_STRINGENCY=LENIENT ').format(picard=picard,
            input=inSam,fq1=outPrefix+'_1.fq.gz',fq2=outPrefix+'_2.fq.gz')
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)

# if __name__ == '__main__':
#     path = '/data/shangzhong/DetectVirus/unmap_bam'
#     picard = '/home/shangzhong/Installation/picard-tools-1.141/picard.jar'
#     os.chdir(path)
#     bams = [f for f in os.listdir(path) if f.endswith('.bam')]
#     for bam in bams:
#         out = bam.split('.')[0]
#         sam2fq(bam,out,picard,'single')


def build_fa_dict(ref_fa,picard):
    '''build a sequence dictionary for a fasta file (required by GATK)'''
    out = '.'.join(ref_fa.split('.')[:-1]) + '.dict'
    cmd = ('java -jar {picard} CreateSequenceDictionary R={ref} O={out}').format(
        picard = picard,ref=ref_fa,out=out)
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)


def mark_duplicates(sortBam,dedupBam,picard):
    '''mark duplicate reads'''
    cmd = ('java -Djava.io.tmpdir=tmp -jar {picard} MarkDuplicates I={input} O={out} '
           'CREATE_INDEX=true METRICS_FILE=metrics.txt MAX_RECORDS_IN_RAM=8000000 '
           'MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=1000 '
           'VALIDATION_STRINGENCY=LENIENT').format(picard=picard,input=sortBam,out=dedupBam)
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)


def add_readgroup(sortBam,rgBam,readgroup,picard):
    '''add a read group, parsing ID and SM from a string like '@RG\\tID:x\\tSM:y' '''
    if not os.path.exists('tmp'):os.mkdir('tmp')
    rg = readgroup.split('\\t')
    ID = rg[1][3:]
    SM = rg[2][3:]
    PL = 'illumina'
    LB = 'lib20000'
    PU = 'unit1'
    cmd = ('java -jar {picard} AddOrReplaceReadGroups I={input} O={rgBam} SO=coordinate '
           'RGID={ID} RGSM={SM} RGPL={PL} RGLB={LB} RGPU={PU} TMP_DIR=tmp').format(
        picard=picard,input=sortBam,rgBam=rgBam,ID=ID,SM=SM,PL=PL,LB=LB,PU=PU)
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)
--------------------------------------------------------------------------------
/Modules/PBhoney.py:
--------------------------------------------------------------------------------
import sarge
import re
import os
import pandas as pd
import numpy as np

def Honey_pie(sortBam,sortTailBam,ref_fa,thread,tmp,otherParams=['']):
    """Honey pie extracts soft-clipped reads and remaps them
    """
    tailBam = re.sub('\.final\.bam$','.tail.bam',sortTailBam)
    cmd = ('Honey.py pie -o {tail} -n {thread} {input} {ref} --temp {tmp} ').format(tail=tailBam,
            thread=str(thread),input=sortBam,ref=ref_fa,tmp=tmp)
    cmd = cmd + ' '.join(otherParams)
    print(cmd)
    sarge.run(cmd)
    # sort
    cmd = ('samtools sort -m 4G -@ {thread} -T {pre} -o {sortBam} {bam} ').format(
        thread=str(thread),pre=tailBam[:-4],sortBam=sortTailBam,bam=tailBam)
    print(cmd)
    sarge.run(cmd)
    # index
    cmd = ('samtools index {out} ').format(out=sortTailBam)
    print(cmd)
    sarge.run(cmd)
    # os.remove(sortBam)


def Honey_tails(finalBam,bamTail,otherParams=['']):
    """This function runs Honey tails to cluster the soft-clipped reads
    """
    cmd = ('Honey.py tails -o {out} {input} ').format(input=finalBam,out=bamTail)
    cmd = cmd + ' '.join(otherParams)
    print(cmd)
    sarge.run(cmd)


def Honey_spots(finalBam,spotFile,ref_fa,thread,otherParams=['']):
    """This function runs Honey spots.
    """
    cmd = ('Honey.py spots --reference {ref} -n {thread} -o {out} {input} ').format(
        input=finalBam,ref=ref_fa,thread=str(thread),out=spotFile)
    cmd = cmd + ' '.join(otherParams)
    print(cmd)
    sarge.run(cmd)


class pb_tail_res(object):
    '''
    wrapper around the output of Honey tails; input should be a pandas dataframe
    '''
    def __init__(self,df):
        self.df = df
        self.df.columns = ['id','chrKey','uRef','uBreak','uMapq','dRef','dBreak','dMapq','remainSeq','annot','numReads','numZMWs','evidence']

    def get_sv_types(self):
        '''get all the sv types'''
        types = list(set(self.df['annot'].tolist()))
        return types

    def get_sv_num(self,sv_type):
        '''get the number of calls of one sv type'''
        df = self.df
        return df[df['annot'].values==sv_type].shape[0]

    def add_sv_len(self):
        '''add sv length for each sv, except translocations whose break points are on different chromosomes'''
        df = self.df
        df['len'] = df.apply(lambda row: 'NA' if row['annot']=='TLOC' else int(row['dBreak'])-int(row['uBreak']),axis=1)
        return df

    def get_sv_num4_each_chr(self,chr_len_df,sv_type,count_log=False,length_log=False):
        '''get the sv count for each scaffold for a specific sv type
        * chr_len_df: pandas dataframe with one column ['chr_len']; chr name is the index
        '''
        df = self.df[self.df['annot'].values==sv_type]
        sv_count = df.groupby(['uRef']).size()
        df = pd.concat([sv_count,chr_len_df],axis=1)
        df = df.fillna(0)
        df = df.rename(columns={0:'count'})
        if count_log == True:
            df['count'] = np.log10(df['count'])
        if length_log == True:
            df['chr_len'] = np.log10(df['chr_len'])
        return df
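A minimal sketch of using pb_tail_res on a Honey tails output file; the file name is hypothetical, and the column layout is assumed to match the constructor above:

```python
# Summarize a Honey tails result with pb_tail_res; 'sample.tailes' is a
# hypothetical output of the SV_Pacbio_PBHoney pipeline.
import pandas as pd
from Modules.PBhoney import pb_tail_res

df = pd.read_csv('sample.tailes', sep='\t', comment='#', header=None)
res = pb_tail_res(df)
print(res.get_sv_types())       # e.g. ['DEL', 'INS', 'TLOC']
print(res.get_sv_num('TLOC'))   # number of translocation calls
print(res.add_sv_len().head())  # per-call SV length ('NA' for TLOC)
```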
--------------------------------------------------------------------------------
/SV_Pacbio_Sniffle.py:
--------------------------------------------------------------------------------
import os,sys
from Modules.f01_file_process import *
from ruffus import *
from Modules.Aligner import bwa_mem,bwa_Db,ngmlr
from Modules.Sniffles import sniffle
import yaml
from Modules.Samtools import sortBam

#============ parameters ======================
parameter_file = sys.argv[1]
#parameter_file = '/home/shangzhong/Codes/NewPipeline/Parameters/SV_Pacbio_Sniffle.yaml'
with open(parameter_file,'r') as f:
    doc = yaml.load(f)
p = dic2obj(**doc)
#------------- get parameters -----------
file_path = p.RawDataPath
thread = p.thread
# all parameter
ref_fa = p.ref_fa
db_path = p.bwa_db
contact = p.contact
aligner = p.aligner
#===============================================================================
#                 Pipeline part
#===============================================================================
#--------------------- 1. read all files ------------------------------------------------
Message('Sniffle start',contact)
os.chdir(file_path)
#--------------------- 2. align all files -----------------------------------------------
# both branches define a run_align task so the downstream tasks work for either aligner
if aligner == 'bwa':
    inputFiles = [f for f in os.listdir(file_path) if f.endswith('fq.gz') or f.endswith('fastq.gz')]
    # build index
    @active_if(not os.path.exists(db_path))
    def bwa_index():
        bwa_Db(db_path,ref_fa)
        os.chdir(file_path)

    @follows(bwa_index)
    @mkdir(inputFiles,formatter(),'{path[0]}/bam')
    @check_if_uptodate(check_file_exists)
    @transform(inputFiles,formatter('.*\.f.*q\.gz'),'bam/{basename[0]}.bam')
    def run_align(input_file,output_file):
        print(input_file + '-->' + output_file)
        bwa_mem([input_file],output_file,db_path+'/bwa',thread,otherParameters=['-M','-x pacbio'])
elif aligner == 'ngmlr':
    inputFiles = [f for f in os.listdir(file_path) if f.endswith('fa.gz') or f.endswith('fasta.gz')]
    @mkdir(inputFiles,formatter(),'{path[0]}/bam')
    @check_if_uptodate(check_file_exists)
    @transform(inputFiles,formatter('.*\.f.*a\.gz'),'bam/{basename[0]}.bam')
    def run_align(input_file,output_file):
        ngmlr(input_file,output_file,ref_fa,thread)
else:
    raise ValueError("aligner must be 'bwa' or 'ngmlr'")
#--------------------- 3. sort bam file -----------------------------------------------
@follows(run_align)
@mkdir(inputFiles,formatter(),'{path[0]}/sortBam')
@check_if_uptodate(check_file_exists)
@transform(run_align,formatter('.*\.bam'),'sortBam/{basename[0]}.sort.bam')
def run_sortBam(input_file,output_file):
    sortBam(input_file,output_file,thread)
#--------------------- 4. Detect SV -----------------------------------------------
@follows(run_sortBam)
@mkdir(inputFiles,formatter(),'{path[0]}/vcf')
@check_if_uptodate(check_file_exists)
@transform(run_sortBam,formatter('.*\.sort\.bam'),'vcf/{basename[0]}.vcf')
def run_sniffle(input_file,output_file):
    sniffle(input_file,output_file,otherParameters=[''])
#--------------------- 5. return finish message -----------------------------------------------------
@follows(run_sniffle)
def last_function():
    Message('SV_Sniffle finished',contact)


if __name__ == '__main__':
    try:
        pipeline_run([last_function],multiprocess=thread,gnu_make_maximal_rebuild_mode = True,
                     touch_files_only=False,verbose=15)
    except:
        Message('SV_Sniffle failed',contact)
"text/x-python", 112 | "name": "python", 113 | "nbconvert_exporter": "python", 114 | "pygments_lexer": "ipython3", 115 | "version": "3.6.8" 116 | } 117 | }, 118 | "nbformat": 4, 119 | "nbformat_minor": 2 120 | } 121 | -------------------------------------------------------------------------------- /STAR_get_bam.py: -------------------------------------------------------------------------------- 1 | from ruffus import * 2 | from Modules.f01_file_process import * 3 | from Modules.Aligner import STAR_Db,STAR 4 | from Modules.Trimmomatic import Trimmomatic 5 | from Modules.Samtools import * 6 | import yaml,sys 7 | import shutil 8 | import glob 9 | 10 | 11 | #============ parameters ====================== 12 | parameter_file = sys.argv[1] 13 | #parameter_file = '/data/shangzhong/Proteogenomics/STAR_get_bam.yaml' 14 | with open(parameter_file,'r') as f: 15 | doc = yaml.load(f) 16 | p = dic2obj(**doc) 17 | #------------- get parameters ----------- 18 | file_path = p.RawDataPath 19 | thread = p.thread 20 | # all parameter 21 | ref_fa = p.ref_fa 22 | gff = p.gff 23 | # trimmomatic parameter 24 | trim = p.trim_reads 25 | trimmomatic = p.trimmomatic_path 26 | trim_batch = p.trim_jobs_per_batch 27 | adapter = p.adapter 28 | 29 | star_batch = p.star_jobs_per_batch 30 | star_db = p.star_index 31 | run_pass = p.star_pass 32 | other_params = p.star_params 33 | 34 | contact = p.contact 35 | #=============================================================================== 36 | # Pipeline part 37 | #=============================================================================== 38 | Message('get bam start',contact) 39 | os.chdir(file_path) 40 | #=============================================================================== 41 | # Part I. Preprocess 42 | #=============================================================================== 43 | #--------------------- 1. read all files ------------------------------------------------ 44 | fastqFiles = list_fq_files(file_path) 45 | if fastqFiles[0][0].startswith('trim_'): 46 | trim = False 47 | def trim_parameters(): 48 | infiles,outfiles = replace_filename(fastqFiles,'^','trim_') 49 | for infile, output in zip(infiles,outfiles): 50 | yield infile,output 51 | #--------------------- 2. trim reads----------------------------------------------------- 52 | @active_if(trim) 53 | @jobs_limit(trim_batch) 54 | @files(trim_parameters) 55 | def trim_reads(input_file,output_file): 56 | n = num_thread2use(trim_batch,len(fastqFiles),thread) 57 | Trimmomatic(input_file,output_file,trimmomatic,n,adapter) 58 | remove(input_file) 59 | #--------------------- 4. 
--------------------------------------------------------------------------------
/STAR_get_bam.py:
--------------------------------------------------------------------------------
from ruffus import *
from Modules.f01_file_process import *
from Modules.Aligner import STAR_Db,STAR
from Modules.Trimmomatic import Trimmomatic
from Modules.Samtools import *
import yaml,sys
import shutil
import glob


#============ parameters ======================
parameter_file = sys.argv[1]
#parameter_file = '/data/shangzhong/Proteogenomics/STAR_get_bam.yaml'
with open(parameter_file,'r') as f:
    doc = yaml.load(f)
p = dic2obj(**doc)
#------------- get parameters -----------
file_path = p.RawDataPath
thread = p.thread
# all parameter
ref_fa = p.ref_fa
gff = p.gff
# trimmomatic parameter
trim = p.trim_reads
trimmomatic = p.trimmomatic_path
trim_batch = p.trim_jobs_per_batch
adapter = p.adapter

star_batch = p.star_jobs_per_batch
star_db = p.star_index
run_pass = p.star_pass
other_params = p.star_params

contact = p.contact
#===============================================================================
#                 Pipeline part
#===============================================================================
Message('get bam start',contact)
os.chdir(file_path)
#===============================================================================
#                 Part I. Preprocess
#===============================================================================
#--------------------- 1. read all files ------------------------------------------------
fastqFiles = list_fq_files(file_path)
if fastqFiles[0][0].startswith('trim_'):
    trim = False
def trim_parameters():
    infiles,outfiles = replace_filename(fastqFiles,'^','trim_')
    for infile, output in zip(infiles,outfiles):
        yield infile,output
#--------------------- 2. trim reads-----------------------------------------------------
@active_if(trim)
@jobs_limit(trim_batch)
@files(trim_parameters)
def trim_reads(input_file,output_file):
    n = num_thread2use(trim_batch,len(fastqFiles),thread)
    Trimmomatic(input_file,output_file,trimmomatic,n,adapter)
    remove(input_file)
#--------------------- 3. Map with STAR -----------------------------------------------------
def get_fq():
    fqFiles = list_fq_files(file_path)
    for fq in fqFiles:
        out = 'sortBam/' + re.sub('\.f.*q\.gz','.bam',fq[0])
        yield fq,out
# build index
@active_if(not os.path.exists(star_db))
@follows(trim_reads)
def star_index():
    STAR_Db(star_db,ref_fa,thread)
# align
other_params.extend(['--outSAMtype BAM', 'SortedByCoordinate'])
if run_pass == 2:
    other_params.append('--twopassMode Basic')


@jobs_limit(star_batch)
@follows(star_index)
@mkdir(fastqFiles,formatter(),'{path[0]}/sortBam')
#@transform(fastqFiles,formatter('.*\.f.*?\.gz'),'sortBam/{basename[0]}.bam')
@files(get_fq)
def run_star(input_file,output_file):
    n = num_thread2use(star_batch,len(fastqFiles),thread)
    STAR(input_file,output_file,star_db,n,gff,other_params)

@follows(run_star)
def last_function():
    Message('get bam succeed',contact)

if __name__ == '__main__':
    try:
        # pipeline_printout(sys.stdout, [last_function], verbose=3)
        pipeline_run([last_function],multiprocess=thread,gnu_make_maximal_rebuild_mode = True,
                     touch_files_only=False,verbose=5)
    except:
        Message('get bam failed',contact)
--------------------------------------------------------------------------------
/SV_Pacbio_PBHoney.py:
--------------------------------------------------------------------------------
from ruffus import *
import yaml
from Modules.f01_file_process import *
from Modules.Aligner import BLASR
from Modules.Samtools import *
from Modules.PBhoney import *
import shutil,os
import sys
#============ parameters ======================
parameter_file = sys.argv[1]
#parameter_file = '/home/shangzhong/Codes/NewPipeline/Parameters/Pacbio_SV.yaml'
with open(parameter_file,'r') as f:
    doc = yaml.load(f)
p = dic2obj(**doc)
#------------- get parameters -----------
file_path = p.RawDataPath
thread = p.thread
# all parameter
ref_fa = p.ref_fa
sa = p.sa_index
# tool parameters
blasr_batch = p.blasr_jobs_per_batch
sam_sort_batch = p.sam_sort_jobs_per_batch

contact = p.contact
#===============================================================================
#                 Pipeline part
#===============================================================================
#--------------------- 1. read all files ------------------------------------------------
Message('PBHoney start',contact)
os.chdir(file_path)
faFiles = [os.path.join(file_path,f) for f in os.listdir(file_path) if f.endswith('.fa')]
print(faFiles);sys.stdout.flush()
#--------------------- 2. run BLASR -----------------------------------------------------
@jobs_limit(blasr_batch)
@mkdir(faFiles,formatter(),'{path[0]}/bam')
@transform(faFiles,formatter(),'bam/{basename[0]}.bam') #regex('.*\.fa'),'.bam')
@check_if_uptodate(check_file_exists)
def run_blasr(input_file,output_file):
    n = num_thread2use(blasr_batch,len(faFiles),thread)
    BLASR(input_file,output_file,ref_fa,n,['-clipping soft','-sa '+sa])
#--------------------- 3. Sam2SortBam -----------------------------------------------------
# sort bam
@follows(run_blasr)
@jobs_limit(sam_sort_batch)
@mkdir(faFiles,formatter(),'{path[0]}/sortBam')
@transform(run_blasr,formatter('.*\.bam'),'sortBam/{basename[0]}.sort.bam')
@check_if_uptodate(check_file_exists)
def sortbam(input_file,output_file):
    n = num_thread2use(sam_sort_batch,len(faFiles),thread)
    sortBam(input_file,output_file,n)
    if os.path.exists('bam'): shutil.rmtree('bam')
#--------------------- 4. detect SV using PBhoney -----------------------------------------------------
@mkdir(faFiles,formatter(),'{path[0]}/HoneyPie')
@transform(sortbam,formatter('.*\.sort\.bam'),'HoneyPie/{basename[0]}.final.bam')
@check_if_uptodate(check_file_exists)
def Honeypie(input_file,output_file):
    n = num_thread2use(thread,len(faFiles),thread)
    Honey_pie(input_file,output_file,ref_fa,n,'HoneyPie')

@follows(Honeypie)
@mkdir(faFiles,formatter(),'{path[0]}/HoneyTail')
@transform(Honeypie,formatter('.*\.final\.bam'),'HoneyTail/{basename[0]}.tailes')
@check_if_uptodate(check_file_exists)
def Honeytailes(input_file,output_file):
    Honey_tails(input_file,output_file)

@follows(Honeytailes)
@mkdir(faFiles,formatter(),'{path[0]}/HoneySpots')
@transform(Honeypie,formatter('.*\.final\.bam'),'HoneySpots/{basename[0]}.spots')
@check_if_uptodate(check_file_exists)
def Honeyspots(input_file,output_file):
    n = num_thread2use(thread,len(faFiles),thread)
    Honey_spots(input_file,output_file,ref_fa,n)

#---------------------- 5. report succeed -------------------------------------------------------------
@follows(Honeyspots)
def last_function():
    Message('job finished',contact)


if __name__ == '__main__':
    try:
        pipeline_run([last_function],multiprocess=thread,gnu_make_maximal_rebuild_mode = True,
                     touch_files_only=False,verbose=5)
    except:
        Message('Pacbio SV failed',contact)
        pass
--------------------------------------------------------------------------------
/Modules/Trimmomatic.py:
--------------------------------------------------------------------------------
import sarge
import os
import gzip
import sys

def get_phred_score(fq):
    """detect the phred quality encoding ('33' or '64') of a fastq.gz file
    """
    with gzip.open(fq,'rb') as f:
        n = 0
        for line in f:
            n = n + 1
            if n % 4 == 0: # only check quality lines
                vals = [ord(c) for c in line.rstrip().decode()]
                if min(vals) <= 50:
                    return '33'
                if max(vals) >= 83:
                    return '64'
    raise ValueError('could not determine the phred score, set it manually')

def Trimmomatic(fqFiles,trim_fqFiles,trimmomatic,thread,adapter_file='',min_len=36):
    """This function runs trimmomatic to trim reads"""
    # main parameters
    unpair = [f + 'unpair' for f in fqFiles]
    phred = get_phred_score(fqFiles[0])
    if len(fqFiles) == 1:
        trimCmd1st = ('java -jar {trim} SE -threads {thread} -phred{type} '
                      '{input} {output} ').format(trim=trimmomatic,thread = int(thread),
                    input = fqFiles[0],output=trim_fqFiles[0],type=phred)
        trimCmd2nd = 'SLIDINGWINDOW:5:10 LEADING:15 TRAILING:10 MINLEN:{len} TOPHRED33 '.format(len=min_len)
    elif len(fqFiles) == 2:
        trimCmd1st = ('java -jar {trim} PE -threads {thread} -phred{type} {fastq1} {fastq2} '
                      '{Trimmed1} {unpair1} {Trimmed2} {unpair2} ').format(trim=trimmomatic,
                    thread=int(thread),type=phred,fastq1 = fqFiles[0], fastq2=fqFiles[1],
                    Trimmed1 = trim_fqFiles[0], Trimmed2 = trim_fqFiles[1],unpair1=unpair[0],unpair2=unpair[1])
        trimCmd2nd = 'SLIDINGWINDOW:5:10 LEADING:15 TRAILING:10 MINLEN:{len} TOPHRED33 '.format(len=str(min_len))
    # adapter file
    if adapter_file != '':
        adaptCmd = 'ILLUMINACLIP:{adapter}:2:30:10 '.format(adapter=adapter_file)
    else:
        adaptCmd = ''
    cmd = trimCmd1st + adaptCmd + trimCmd2nd
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)
    for un in unpair:
        if os.path.exists(un):
            os.remove(un)

def conda_Trimmomatic(fqFiles,trim_fqFiles,thread,adapter_file='',min_len=36):
    """This function runs a conda-installed trimmomatic to trim reads"""
    # main parameters
    unpair = [f + 'unpair' for f in fqFiles]
    phred = get_phred_score(fqFiles[0])
    if len(fqFiles) == 1:
        trimCmd1st = ('trimmomatic SE -threads {thread} -phred{type} '
                      '{input} {output} ').format(thread = int(thread),
                    input = fqFiles[0],output=trim_fqFiles[0],type=phred)
        trimCmd2nd = 'SLIDINGWINDOW:5:10 LEADING:15 TRAILING:10 MINLEN:{len} TOPHRED33 '.format(len=min_len)
    elif len(fqFiles) == 2:
        trimCmd1st = ('trimmomatic PE -threads {thread} -phred{type} {fastq1} {fastq2} '
                      '{Trimmed1} {unpair1} {Trimmed2} {unpair2} ').format(
                    thread=int(thread),type=phred,fastq1 = fqFiles[0], fastq2=fqFiles[1],
                    Trimmed1 = trim_fqFiles[0], Trimmed2 = trim_fqFiles[1],unpair1=unpair[0],unpair2=unpair[1])
        trimCmd2nd = 'SLIDINGWINDOW:5:10 LEADING:15 TRAILING:10 MINLEN:{len} TOPHRED33 '.format(len=str(min_len))
    # adapter file
    if adapter_file != '':
        adaptCmd = 'ILLUMINACLIP:{adapter}:2:30:10 '.format(adapter=adapter_file)
    else:
        adaptCmd = ''
    cmd = trimCmd1st + adaptCmd + trimCmd2nd
    print(cmd);sys.stdout.flush()
    sarge.run(cmd)
    for un in unpair:
        if os.path.exists(un):
            os.remove(un)
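The thresholds in get_phred_score come from the ASCII ranges of the two encodings: Phred+33 uses roughly '!' (33) through 'J' (74), Phred+64 roughly '@' (64) through 'h' (104), so a character at or below 50 can only occur in Phred+33 data and one at or above 83 only in Phred+64 data. A quick worked check, for illustration only:

```python
# Worked check of the threshold logic above (pure illustration).
qual33 = 'IIIIH#FFFF'  # typical Phred+33 quality string
qual64 = 'hhhhgggfff'  # typical Phred+64 quality string
print(min(ord(c) for c in qual33))  # 35 ('#')  <= 50 -> phred 33
print(max(ord(c) for c in qual64))  # 104 ('h') >= 83 -> phred 64
```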
--------------------------------------------------------------------------------
/Salmon_quant.py:
--------------------------------------------------------------------------------
from ruffus import *
from Modules.f01_file_process import *
from Modules.Aligner import STAR,STAR_Db
from Modules.Trimmomatic import Trimmomatic
from Modules.Samtools import sortBam
from Modules.HTseq import htseq_count
import yaml
import sys,shutil
from Modules.StringTie import stringtie
#============ parameters ======================
parameter_file = sys.argv[1]
#parameter_file = '/data/shangzhong/Proteogenomics/RNAseq_count.yaml'
with open(parameter_file,'r') as f:
    doc = yaml.load(f)
p = dic2obj(**doc)
#------------- get parameters -----------
file_path = p.RawDataPath
thread = p.thread
QC = p.QC
# all parameter
ref_fa = p.ref_fa
annotation = p.gff
# trimmomatic parameter
trim = p.trim_reads
trimmomatic = p.trimmomatic_path
trim_batch = p.trim_jobs_per_batch
adapter = p.adapter
# star parameter
star_batch = p.star_jobs_per_batch
db_path = p.STAR_index_path

contact = p.contact
#===============================================================================
#                 Pipeline part
#===============================================================================
#--------------------- 1. read all files ------------------------------------------------
Message('stringtie start',contact)
os.chdir(file_path)
fastqFiles = list_fq_files(file_path)
if fastqFiles[0][0].startswith('trim_'):
    trim = False
#--------------------- 2. trim reads-----------------------------------------------------
def trim_parameters():
    infiles,outfiles = replace_filename(fastqFiles,'^','trim_')
    for infile, output in zip(infiles,outfiles):
        yield infile,output
#------------- run fastqc before trimming -----------
@active_if(QC)
@jobs_limit(thread)
@mkdir(fastqFiles,formatter(),'{path[0]}/fastqc')
@files(trim_parameters)
def run_QC1(input_file,output_file):
    for fq in input_file:
        sarge.run('fastqc {input} -o fastqc'.format(input=fq))
#------------ trim file ------------------
@active_if(trim)
@follows(run_QC1)
@jobs_limit(trim_batch)
@files(trim_parameters)
def trim_reads(input_file,output_file):
    n = num_thread2use(trim_batch,len(fastqFiles),thread)
    Trimmomatic(input_file,output_file,trimmomatic,n,adapter)
    remove(input_file)
#------------ run fastqc after trimming ------------
@active_if(QC and trim)
@follows(trim_reads)
@jobs_limit(thread)
@mkdir(fastqFiles,formatter(),'{path[0]}/fastqc')
@files(trim_parameters)
def run_QC2(input_file,output_file):
    for fq in output_file:
        sarge.run('fastqc {input} -o fastqc'.format(input=fq))
#--------------------- 3. run STAR ------------------------------------------------------
# build index
@active_if(not os.path.exists(db_path))
@follows(trim_reads,run_QC2)
def star_index():
    STAR_Db(db_path,ref_fa,thread)
# align
if trim == False:
    trim_reads=fastqFiles
@jobs_limit(star_batch)
@follows(star_index)
@mkdir(fastqFiles,formatter(),'{path[0]}/bam')
@check_if_uptodate(check_file_exists)
@transform(trim_reads,formatter('.*\.f.*?\.gz'),'bam/{basename[0]}.bam')
def run_star(input_file,output_file):
    n = num_thread2use(star_batch,len(fastqFiles),thread)
    STAR(input_file,output_file,db_path,n,annotation,['--outSAMtype BAM', 'SortedByCoordinate','--outSAMunmapped Within'])
#--------------------- 4. run stringtie -----------------------------------------------------
@follows(run_star)
@mkdir(fastqFiles,formatter(),'{path[0]}/stringtie')
@check_if_uptodate(check_file_exists)
@transform(run_star,formatter('.*\.bam'),'stringtie/{basename[0]}.gtf')
def run_stringtie(input_file,output_file):
    stringtie(input_file,output_file,thread,annotation)
#--------------------- 5. return finish message -----------------------------------------------------
@follows(run_stringtie)
def last_function():
    Message('stringtie finished',contact)

if __name__ == '__main__':
    try:
        pipeline_run([last_function],multiprocess=thread,gnu_make_maximal_rebuild_mode = True,
                     touch_files_only=False,verbose=5)
    except:
        Message('stringtie failed',contact)
return finish message ----------------------------------------------------- 98 | @follows(run_stringtie) 99 | def last_function(): 100 | Message('stringtie finished',contact) 101 | 102 | if __name__ == '__main__': 103 | try: 104 | pipeline_run([last_function],multiprocess=thread,gnu_make_maximal_rebuild_mode = True, 105 | touch_files_only=False,verbose=5) 106 | except: 107 | Message('stringtie failed',contact) 108 | -------------------------------------------------------------------------------- /StringTie_quant.py: -------------------------------------------------------------------------------- 1 | from ruffus import * 2 | from Modules.f01_file_process import * 3 | from Modules.Aligner import STAR,STAR_Db 4 | from Modules.Trimmomatic import Trimmomatic 5 | from Modules.Samtools import sortBam 6 | from Modules.HTseq import htseq_count 7 | import yaml 8 | import sys,shutil 9 | from Modules.StringTie import stringtie 10 | #============ parameters ====================== 11 | parameter_file = sys.argv[1] 12 | #parameter_file = '/data/shangzhong/Proteogenomics/RNAseq_count.yaml' 13 | with open(parameter_file,'r') as f: 14 | doc = yaml.load(f) 15 | p = dic2obj(**doc) 16 | #------------- get parameters ----------- 17 | file_path = p.RawDataPath 18 | thread = p.thread 19 | QC = p.QC 20 | # all parameter 21 | ref_fa = p.ref_fa 22 | annotation = p.gff 23 | # trimmomatic parameter 24 | trim = p.trim_reads 25 | trimmomatic = p.trimmomatic_path 26 | trim_batch = p.trim_jobs_per_batch 27 | adapter = p.adapter 28 | # star parameter 29 | star_batch = p.star_jobs_per_batch 30 | db_path = p.STAR_index_path 31 | 32 | contact = p.contact 33 | #=============================================================================== 34 | # Pipeline part 35 | #=============================================================================== 36 | #--------------------- 1. read all files ------------------------------------------------ 37 | Message('stringtie start',contact) 38 | os.chdir(file_path) 39 | fastqFiles = list_fq_files(file_path) 40 | if fastqFiles[0][0].startswith('trim_'): 41 | trim = False 42 | #--------------------- 2. trim reads----------------------------------------------------- 43 | def trim_parameters(): 44 | infiles,outfiles = replace_filename(fastqFiles,'^','trim_') 45 | for infile, output in zip(infiles,outfiles): 46 | yield infile,output 47 | #------------- run fastqc before trimming ----------- 48 | @active_if(QC) 49 | @jobs_limit(thread) 50 | @mkdir(fastqFiles,formatter(),'{path[0]}/fastqc') 51 | @files(trim_parameters) 52 | def run_QC1(input_file,output_file): 53 | for fq in input_file: 54 | sarge.run('fastqc {input} -o fastqc'.format(input=fq)) 55 | #------------ trim file ------------------ 56 | @active_if(trim) 57 | @follows(run_QC1) 58 | @jobs_limit(trim_batch) 59 | @files(trim_parameters) 60 | def trim_reads(input_file,output_file): 61 | n = num_thread2use(trim_batch,len(fastqFiles),thread) 62 | Trimmomatic(input_file,output_file,trimmomatic,n,adapter) 63 | remove(input_file) 64 | #------------ run fastqc after trimming ------------ 65 | @active_if(QC and trim) 66 | @follows(trim_reads) 67 | @jobs_limit(thread) 68 | @mkdir(fastqFiles,formatter(),'{path[0]}/fastqc') 69 | @files(trim_parameters) 70 | def run_QC2(input_file,output_file): 71 | for fq in output_file: 72 | sarge.run('fastqc {input} -o fastqc'.format(input=fq)) 73 | #--------------------- 3. 
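# [note] num_thread2use (Modules.f01_file_process, source not shown in this
# dump) is called before every trimmer/aligner job below. Judging from its
# arguments (jobs per batch, number of files, total threads), it plausibly
# splits the thread budget across the jobs that run concurrently; a sketch:
def _num_thread2use_sketch(batch, n_files, total_threads):
    """Threads per job when min(batch, n_files) jobs run at once (sketch)."""
    concurrent = max(1, min(int(batch), int(n_files)))
    return max(1, int(total_threads) // concurrent)

# _num_thread2use_sketch(2, 4, 16) -> 8, i.e. 8 threads for each of 2 jobs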
run STAR ------------------------------------------------------ 74 | # build index 75 | @active_if(not os.path.exists(db_path)) 76 | @follows(trim_reads,run_QC2) 77 | def star_index(): 78 | STAR_Db(db_path,ref_fa,thread) 79 | # align 80 | if trim == False: 81 | trim_reads=fastqFiles 82 | @jobs_limit(star_batch) 83 | @follows(star_index) 84 | @mkdir(fastqFiles,formatter(),'{path[0]}/bam') 85 | @check_if_uptodate(check_file_exists) 86 | @transform(trim_reads,formatter('.*\.f.*?\.gz'),'bam/{basename[0]}.bam') 87 | def run_star(input_file,output_file): 88 | n = num_thread2use(star_batch,len(fastqFiles),thread) 89 | STAR(input_file,output_file,db_path,n,annotation,['--outSAMtype BAM', 'SortedByCoordinate','--outSAMunmapped Within']) 90 | #--------------------- 5. run stringtie ----------------------------------------------------- 91 | @follows(run_star) 92 | @mkdir(fastqFiles,formatter(),'{path[0]}/stringtie') 93 | @check_if_uptodate(check_file_exists) 94 | @transform(run_star,formatter('.*\.bam'),'stringtie/{basename[0]}.gtf') 95 | def run_stringtie(input_file,output_file): 96 | stringtie(input_file,output_file,thread,annotation) 97 | #--------------------- 7. return finish message ----------------------------------------------------- 98 | @follows(run_stringtie) 99 | def last_function(): 100 | Message('stringtie finished',contact) 101 | 102 | if __name__ == '__main__': 103 | try: 104 | pipeline_run([last_function],multiprocess=thread,gnu_make_maximal_rebuild_mode = True, 105 | touch_files_only=False,verbose=5) 106 | except: 107 | Message('stringtie failed',contact) 108 | -------------------------------------------------------------------------------- /CNV_CNVnator.py: -------------------------------------------------------------------------------- 1 | import os,sys 2 | from Modules.f01_file_process import * 3 | from ruffus import * 4 | from Modules.Aligner import bwa_Db,bwa_mem 5 | from Modules.Trimmomatic import Trimmomatic 6 | import yaml 7 | from Modules.Samtools import sortBam,merge_bams 8 | import shutil 9 | from Modules.CNVnator import * 10 | from Bio import SeqIO 11 | #============ parameters ====================== 12 | parameter_file = sys.argv[1] 13 | #parameter_file = '/data/shangzhong/Pacbio/CHOS_illu_DNA/cnv/CNVnator.yaml' 14 | with open(parameter_file,'r') as f: 15 | doc = yaml.load(f) 16 | p = dic2obj(**doc) 17 | #------------- get parameters ----------- 18 | file_path = p.RawDataPath 19 | thread = p.thread 20 | # all parameter 21 | ref_fa = p.ref_fa 22 | # trimmomatic parameter 23 | trim = p.trim_reads 24 | trimmomatic = p.trimmomatic_path 25 | trim_batch = p.trim_jobs_per_batch 26 | adapter = p.adapter 27 | 28 | bwa_batch = p.bwa_jobs_per_batch 29 | db_path = p.bwa_Db 30 | 31 | bin_win = p.bin_win 32 | others = p.chrom 33 | contact = p.contact 34 | #=============================================================================== 35 | # Pipeline part 36 | #=============================================================================== 37 | #--------------------- 1. read all files ------------------------------------------------ 38 | Message('cnvnator start',contact) 39 | os.chdir(file_path) 40 | fastqFiles = list_fq_files(file_path) 41 | def trim_parameters(): 42 | infiles,outfiles = replace_filename(fastqFiles,'^','trim_') 43 | for infile, output in zip(infiles,outfiles): 44 | yield infile,output 45 | #--------------------- 2. 
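# [note] fastqFiles above comes from list_fq_files (Modules.f01_file_process,
# source not shown). From how it is consumed -- fq[0], optional fq[1] -- it
# returns one list per sample: [fq] for single-end or [fq1, fq2] for paired
# gzipped fastq. A rough, hypothetical equivalent:
import os, re

def _list_fq_files_sketch(path):
    """Group *.fastq.gz / *.fq.gz in path into per-sample lists (sketch)."""
    fqs = sorted(f for f in os.listdir(path) if f.endswith(('.fastq.gz', '.fq.gz')))
    samples = {}
    for f in fqs:
        key = re.sub(r'_R?[12](?=[._])', '', f, count=1)  # drop the mate tag, if any
        samples.setdefault(key, []).append(f)
    return list(samples.values())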
trim reads----------------------------------------------------- 46 | @active_if(trim) 47 | @jobs_limit(trim_batch) 48 | @files(trim_parameters) 49 | def trim_reads(input_file,output_file): 50 | n = num_thread2use(trim_batch,len(fastqFiles),thread) 51 | Trimmomatic(input_file,output_file,trimmomatic,n,adapter) 52 | remove(input_file) 53 | #--------------------- 3. Map with bwa ----------------------------------------------------- 54 | def get_fq(): 55 | fqFiles = list_fq_files(file_path) 56 | for fq in fqFiles: 57 | out = 'bam/' + re.sub('\.f.*q\.gz','.bam',fq[0]) 58 | yield fq,out 59 | # build index 60 | @active_if(not os.path.exists(db_path)) 61 | @follows(trim_reads) 62 | def bwa_index(): 63 | bwa_Db(db_path,ref_fa) 64 | # align 65 | @jobs_limit(bwa_batch) 66 | @follows(trim_reads,bwa_index) 67 | @mkdir(fastqFiles,formatter(),'{path[0]}/bam') 68 | @files(get_fq) 69 | def run_bwa(input_file,output_file): 70 | n = num_thread2use(bwa_batch,len(fastqFiles),thread) 71 | db_index = db_path + '/' + os.listdir(db_path)[0].split('.')[0] 72 | bwa_mem(input_file,output_file,db_index,n) 73 | #--------------------- 5. Sort bam file -------------------------------------------------- 74 | @jobs_limit(trim_batch) 75 | @follows(run_bwa) 76 | @mkdir(fastqFiles,formatter(),'{path[0]}/sortBam') 77 | @transform(run_bwa,formatter('.*\.bam'),'sortBam/{basename[0]}.sort.bam') 78 | @check_if_uptodate(check_file_exists) 79 | def sort_by_pos(input_file,output_file): 80 | n = num_thread2use(trim_batch,len(fastqFiles),thread) 81 | sortBam(input_file,output_file,n,sortType='pos') 82 | @follows(sort_by_pos) 83 | def remove_bam(): 84 | if os.path.exists('bam'): shutil.rmtree('bam') # remove bam folder 85 | 86 | @follows(sort_by_pos,remove_bam) 87 | @mkdir(fastqFiles,formatter(),'{path[0]}/mergeBam') 88 | @merge(sort_by_pos,'mergeBam/merge.bam') 89 | @check_if_uptodate(check_file_exists) 90 | def run_merge_bam(input_file,output_file): 91 | if len(input_file) > 1: 92 | merge_bams(input_file,output_file) 93 | else: 94 | os.rename(input_file[0],output_file) 95 | #------------------- 6. 
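# [note] The five cnv_* helpers called below live in Modules/CNVnator.py
# (source not shown). Judging by their names and arguments they wrap the
# standard cnvnator command sequence; a sketch, with flag spelling taken from
# the cnvnator documentation rather than from this repo:
import sarge

def _cnvnator_steps_sketch(root, bam, chr_dir, bin_win, out_txt):
    """extract -> histogram -> statistics -> partition -> call (hypothetical)."""
    sarge.run('cnvnator -root {r} -tree {b}'.format(r=root, b=bam))
    sarge.run('cnvnator -root {r} -his {w} -d {d}'.format(r=root, w=bin_win, d=chr_dir))
    sarge.run('cnvnator -root {r} -stat {w}'.format(r=root, w=bin_win))
    sarge.run('cnvnator -root {r} -partition {w}'.format(r=root, w=bin_win))
    sarge.run('cnvnator -root {r} -call {w} > {o}'.format(r=root, w=bin_win, o=out_txt))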
run CNVnator ----------------------------------------------------- 96 | @jobs_limit(thread) 97 | @follows(run_merge_bam) 98 | @mkdir(fastqFiles,formatter(),'{path[0]}/cnv') 99 | @transform(run_merge_bam,formatter('.*\.bam'),'cnv/{basename[0]}.txt') 100 | @check_if_uptodate(check_file_exists) 101 | def run_cnvnator(input_file,output_file): 102 | root = output_file[:-3] + 'root' 103 | # 1 104 | cnv_extract_bam(input_file,root,others) 105 | # 2 106 | chr_path = file_path + '/cnv/scaffold' 107 | path = chr_path 108 | if not os.path.exists(path): 109 | os.mkdir(path) 110 | for record in SeqIO.parse(ref_fa,'fasta'): 111 | SeqIO.write(record,path+'/'+record.id+'.fa','fasta') 112 | 113 | cnv_generate_hist(root,chr_path,bin_win,others) 114 | # 3 115 | cnv_statistics(root,bin_win,others) 116 | # 4 117 | cnv_partitioning(root,bin_win,others) 118 | # 5 119 | cnv_call(root,output_file,bin_win,others) 120 | 121 | @follows(run_cnvnator) 122 | def last_function(): 123 | # Message('cnvnator succeed',contact) 124 | pass 125 | if __name__ == '__main__': 126 | try: 127 | pipeline_run([run_cnvnator,last_function],multiprocess=thread,gnu_make_maximal_rebuild_mode = True, 128 | touch_files_only=False) 129 | except: 130 | Message('cnvnator failed',contact) -------------------------------------------------------------------------------- /SV_Illumina_Lumpy.py: -------------------------------------------------------------------------------- 1 | import os,sys 2 | from Modules.f01_file_process import * 3 | from ruffus import * 4 | from Modules.Aligner import bwa_Db,bwa_samblaster 5 | from Modules.Trimmomatic import Trimmomatic 6 | import yaml 7 | from Modules.Samtools import sortBam,merge_bams 8 | import shutil 9 | from Modules.Lumpy import lumpyexpress 10 | from Modules.SVTyper import svtyper 11 | #============ parameters ====================== 12 | parameter_file = sys.argv[1] 13 | #parameter_file = '/data/shangzhong/SV_lumpy/SV_lumpy.yaml' 14 | with open(parameter_file,'r') as f: 15 | doc = yaml.load(f) 16 | p = dic2obj(**doc) 17 | #------------- get parameters ----------- 18 | file_path = p.RawDataPath 19 | thread = p.thread 20 | # all parameter 21 | ref_fa = p.ref_fa 22 | # trimmomatic parameter 23 | trim = p.trim_reads 24 | trimmomatic = p.trimmomatic_path 25 | trim_batch = p.trim_jobs_per_batch 26 | adapter = p.adapter 27 | 28 | bwa_batch = p.bwa_jobs_per_batch 29 | bwa_index = p.bwa_index 30 | read_groups = p.read_groups 31 | contact = p.contact 32 | #=============================================================================== 33 | # Pipeline part 34 | #=============================================================================== 35 | #--------------------- 1. read all files ------------------------------------------------ 36 | Message('Lumpy start',contact) 37 | os.chdir(file_path) 38 | fastqFiles = list_fq_files(file_path) 39 | def trim_parameters(): 40 | infiles,outfiles = replace_filename(fastqFiles,'^','trim_') 41 | for infile, output in zip(infiles,outfiles): 42 | yield infile,output 43 | #--------------------- 2. trim reads----------------------------------------------------- 44 | @active_if(trim) 45 | @jobs_limit(trim_batch) 46 | @files(trim_parameters) 47 | def trim_reads(input_file,output_file): 48 | n = num_thread2use(trim_batch,len(fastqFiles),thread) 49 | Trimmomatic(input_file,output_file,trimmomatic,n,adapter) 50 | remove(input_file) 51 | #--------------------- 3. 
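# [note] read_groups from the YAML supplies one '@RG' string per fastq pair;
# run_bwa() below derives the library tag from its third field and hands the
# quoted result to 'bwa mem -R'. A sketch of that string handling (the sample
# name is hypothetical):
def _format_read_group_sketch(rg):
    """rg like '@RG\\tID:s1\\tSM:sample1' -> quoted -R argument for bwa mem."""
    lib = rg.split('\\t')[2][3:]   # reuse the SM field value as the library
    return "'" + rg + '\\tLB:' + lib + "\\tPL:illumina\\tPU:unit1'"

# _format_read_group_sketch('@RG\\tID:s1\\tSM:sample1') appends
# \tLB:sample1\tPL:illumina\tPU:unit1 and single-quotes it for the shell.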
Map with bwa ----------------------------------------------------- 52 | def get_fq_and_readgroup(): 53 | fqFiles = list_fq_files(file_path) 54 | for fq, rg in zip(fqFiles,read_groups): 55 | out = 'bam/' + re.sub('\.f.*q\.gz','.bam',fq[0]) 56 | yield fq,out,rg 57 | # build index 58 | @active_if(not os.path.exists('/'.join(bwa_index.split('/')[:-1]))) 59 | @follows(trim_reads) 60 | def run_bwa_index(): 61 | bwa_Db(bwa_index,ref_fa) 62 | # align 63 | @jobs_limit(bwa_batch) 64 | @follows(trim_reads,run_bwa_index) 65 | @mkdir(fastqFiles,formatter(),'{path[0]}/bam') 66 | @files(get_fq_and_readgroup) 67 | def run_bwa(input_file,output_file,rg): 68 | n = num_thread2use(bwa_batch,len(fastqFiles),thread) 69 | lib = rg.split('\\t')[2][3:] 70 | readgroup = '\'' + rg+'\\tLB:'+lib+'\\tPL:illumina\\tPU:unit1\'' 71 | bwa_samblaster(input_file,output_file,bwa_index,n,otherParameters=['-R '+ readgroup]) 72 | #--------------------- 5. Sort bam file -------------------------------------------------- 73 | @jobs_limit(trim_batch) 74 | @follows(run_bwa) 75 | @mkdir(fastqFiles,formatter(),'{path[0]}/sortBam') 76 | @transform(run_bwa,formatter('.*\.bam'),'sortBam/{basename[0]}.sort.bam') 77 | @check_if_uptodate(check_file_exists) 78 | def sort_by_pos(input_file,output_file): 79 | n = num_thread2use(trim_batch,len(fastqFiles),thread) 80 | sortBam(input_file,output_file,n,sortType='pos') 81 | disc = input_file[:-3] + 'disc.sam' 82 | disc_sort = output_file[:-3] + 'disc.bam' 83 | sortBam(disc,disc_sort,n,sortType='pos') 84 | split = input_file[:-3] + 'split.sam' 85 | split_sort = output_file[:-3] + 'split.bam' 86 | sortBam(split,split_sort,n,sortType='pos') 87 | @follows(sort_by_pos) 88 | def remove_bam(): 89 | if os.path.exists('bam'): shutil.rmtree('bam') # remove bam folder 90 | #--------------------- 6. run lumpyexpress ------------------------------ 91 | @follows(sort_by_pos) 92 | @mkdir(fastqFiles,formatter(),'{path[0]}/vcf') 93 | @merge(sort_by_pos,'vcf/lumpy.vcf') 94 | @check_if_uptodate(check_file_exists) 95 | def run_lumpyexpress(input_files,output_file): 96 | lumpyexpress(input_files,output_file,['-T vcf']) 97 | #--------------------- 7. 
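# [note] svtyper() below is imported from Modules/SVTyper.py (source not
# shown). It presumably genotypes the lumpy calls against the merged BAM
# roughly as sketched here; -i/-B/-l are standard svtyper options, and -l
# writes the library JSON that run_svtyper() later moves into vcf/:
import sarge

def _svtyper_sketch(in_vcf, out_vcf, bam):
    """Genotype SV calls in in_vcf using read evidence from bam (sketch)."""
    lib_json = bam[:-3] + 'json'   # e.g. merge/merge.bam -> merge/merge.json
    sarge.run('svtyper -i {v} -B {b} -l {j} > {o}'.format(
        v=in_vcf, b=bam, j=lib_json, o=out_vcf))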
run SVTyper ------------------------------ 98 | @follows(run_lumpyexpress) 99 | @mkdir(fastqFiles,formatter(),'{path[0]}/merge') 100 | @merge(sort_by_pos,'merge/merge.bam') 101 | @check_if_uptodate(check_file_exists) 102 | def run_svtyper(input_files,output_file): 103 | if len(input_files) > 1: 104 | merge_bams(input_files,output_file) 105 | else: 106 | os.rename(input_files[0],output_file) 107 | sarge.run('samtools index {b}'.format(b=output_file)) 108 | svtyper('vcf/lumpy.vcf','vcf/lumpy_gt.vcf',output_file) 109 | shutil.move('merge/merge.json','vcf') 110 | shutil.move('merge/merge.json.pdf','vcf') 111 | 112 | @follows(run_svtyper) 113 | def last_function(): 114 | Message('lumpy succeeded',contact) 115 | 116 | if __name__ == '__main__': 117 | try: 118 | pipeline_run([last_function],multiprocess=thread,gnu_make_maximal_rebuild_mode = False, 119 | touch_files_only=False,verbose=20) 120 | except: 121 | Message('lumpyexpress failed',contact) 122 | 123 | 124 | 125 | 126 | -------------------------------------------------------------------------------- /Modules/f02_parse_gff.py: -------------------------------------------------------------------------------- 1 | import re 2 | import pandas as pd 3 | 4 | 5 | class ncbi_gff(object): 6 | def __init__(self,df): 7 | self.df = df 8 | self.df.columns=['chr','source','feature','start','end','score','strand','frame','anno'] 9 | self.df['start'] = self.df['start'] - 1 10 | self.df = self.df[self.df['feature'].values!='region'] 11 | self.df = self.df.reset_index(drop=True) 12 | self.df['geneid'] = self.df['anno'].apply(lambda x: ncbi_gff.get_id(x,'GeneID:')) 13 | self.df['trid'] = self.df['anno'].apply(lambda x: ncbi_gff.get_id(x,'transcript_id=')) 14 | self.df['prid'] = self.df['anno'].apply(lambda x: ncbi_gff.get_id(x,'protein_id=')) 15 | @staticmethod 16 | def get_id(anno,feature): 17 | '''get id based on the feature provided''' 18 | try: 19 | gene_id = re.search('(?<={id}).+?(?=[;,]|$)'.format(id=feature),anno).group(0) 20 | except: 21 | gene_id = None 22 | return gene_id 23 | 24 | @staticmethod 25 | def get_tr_longest_intron(tr_df): 26 | '''get the longest intron in the transcript''' 27 | start = tr_df['start'].tolist() 28 | end = tr_df['end'].tolist() 29 | strand = tr_df['strand'].tolist() 30 | if len(start) == 1: 31 | return 0 32 | if strand[0] == '+': 33 | intron = max([abs(int(s)-int(e)) for s,e in zip(start[1:],end[:-1])]) 34 | else: 35 | intron = max([abs(int(s)-int(e)) for s,e in zip(start[:-1],end[1:])]) 36 | return intron 37 | 38 | def get_longest_intron(self): 39 | '''this is the longest intron across the whole genome''' 40 | df = self.df 41 | df = df[(~df['prid'].isnull()) & (df['feature'].values=='CDS')] 42 | df = df.reset_index(drop=True) 43 | df = df.groupby(['chr','prid']).apply(ncbi_gff.get_tr_longest_intron) 44 | return df#.max() 45 | 46 | def get_all_id(self): 47 | '''this function gets all ids in the gff file 48 | ''' 49 | df = self.df 50 | id_df = df[df['feature'].isin(['exon','CDS'])] 51 | id_df = id_df.reset_index(drop=True) 52 | 53 | id_df['sym'] = id_df['anno'].map(lambda x: ncbi_gff.get_id(x,'gene=')) 54 | id_df['rna'] = id_df['anno'].map(lambda x: ncbi_gff.get_id(x,'Parent=')) 55 | 56 | exn_df = id_df[id_df['feature'].values=='exon'][['geneid','sym','chr','rna','trid']].drop_duplicates() 57 | exn_df = exn_df.reset_index(drop=True) 58 | 59 | cds_df = id_df[id_df['feature'].values=='CDS'][['geneid','sym','chr','rna','prid']].drop_duplicates() 60 | cds_df = cds_df.reset_index(drop=True) 61 | 62 | merge_df = 
pd.merge(exn_df,cds_df,how='outer',on=['geneid','sym','chr','rna']) 63 | merge_df.columns = ['GeneID','GeneSymbol','Chrom','TrID','TrAccess','PrAccess'] 64 | merge_df.fillna('-',inplace=True) 65 | 66 | # merge_df = merge_df[(merge_df['TrAccess'].values != '-') | (merge_df['PrAccess'].values != '-')] 67 | merge_df = merge_df.sort_values(['GeneID']) 68 | return merge_df[['GeneID','GeneSymbol','Chrom','TrAccess','PrAccess','TrID']] 69 | 70 | def get_gene_seq(self,ref_dic,gid,id_type='tr'): 71 | '''this function gets the sequence of a transcript or protein 72 | ''' 73 | df = self.df 74 | if id_type == 'tr': 75 | feature = 'exon' 76 | id_t = 'trid' 77 | elif id_type == 'pr': 78 | feature = 'CDS' 79 | id_t = 'prid' 80 | region_df = df[(df['feature'].values==feature) & (df[id_t].values==gid)] 81 | # get sequence 82 | scaff = region_df['chr'].tolist()[0] 83 | scaff_seq = ref_dic[scaff].seq 84 | strand = region_df['strand'].tolist()[0] 85 | 86 | g_seq = '' 87 | for s,e in zip(region_df['start'],region_df['end']): 88 | g_seq += scaff_seq[int(s):int(e)] 89 | # consider strand 90 | if strand == '-': 91 | g_seq = g_seq.reverse_complement() 92 | 93 | if id_type == 'pr': 94 | g_seq = g_seq.translate() 95 | return g_seq 96 | 97 | # gff_fn = '/data/genome/hamster/ncbi_refseq/hamster.gff' 98 | # df = pd.read_csv(gff_fn,sep='\t',header=None,comment='#') 99 | # obj = ncbi_gff(df) 100 | # all_id_df = obj.get_all_id() 101 | # all_id_df.to_csv('/data/genome/hamster/ncbi_refseq/all_id.txt',sep='\t',index=False) 102 | # 103 | # ref_dic = SeqIO.index('/data/genome/hamster/picr/picr.fa','fasta') 104 | # res = obj.get_gene_seq(ref_dic,'NM_001246795',id_type='tr') 105 | # print res 106 | 107 | 108 | from Bio import SeqIO 109 | 110 | def getr_tRNA(fa,gff,output): 111 | '''get rRNA and tRNA sequence 112 | * output: fa file stores rtRNA sequence''' 113 | index = SeqIO.index(fa,'fasta') 114 | with open(gff) as f, open(output,'w') as out: 115 | for line in f: 116 | if line.startswith('#'):continue 117 | item = line.strip().split('\t') 118 | if item[2] in ['rRNA','tRNA']: 119 | name = re.search('(?<=product=).+?(?=$|;)',line).group(0) 120 | chrom = item[0] 121 | s = int(item[3]) 122 | e = int(item[4]) 123 | seq = str(index[chrom].seq[s-1:e]) 124 | out.write('>'+name + '\n' + seq + '\n') -------------------------------------------------------------------------------- /RNAseq_count.py: -------------------------------------------------------------------------------- 1 | from ruffus import * 2 | from Modules.f01_file_process import * 3 | from Modules.Aligner import STAR,STAR_Db 4 | from Modules.Trimmomatic import Trimmomatic 5 | from Modules.Samtools import sortBam 6 | from Modules.HTseq import htseq_count 7 | import yaml 8 | import sys,shutil 9 | #============ parameters ====================== 10 | parameter_file = sys.argv[1] 11 | #parameter_file = '/data/shangzhong/Proteogenomics/RNAseq_count.yaml' 12 | with open(parameter_file,'r') as f: 13 | doc = yaml.load(f) 14 | p = dic2obj(**doc) 15 | #------------- get parameters ----------- 16 | file_path = p.RawDataPath 17 | thread = p.thread 18 | QC = p.QC 19 | # all parameter 20 | ref_fa = p.ref_fa 21 | annotation = p.gff 22 | # trimmomatic parameter 23 | trim = p.trim_reads 24 | trimmomatic = p.trimmomatic_path 25 | trim_batch = p.trim_jobs_per_batch 26 | adapter = p.adapter 27 | # star parameter 28 | star_batch = p.star_jobs_per_batch 29 | db_path = p.STAR_index_path 30 | # htseq parameter 31 | htseq_anno_source = p.htseq_anno_source 32 | strand = 
p.strand_specific 33 | id_name = p.id_name 34 | 35 | contact = p.contact 36 | #=============================================================================== 37 | # Pipeline part 38 | #=============================================================================== 39 | #--------------------- 1. read all files ------------------------------------------------ 40 | Message('RNA_count start',contact) 41 | os.chdir(file_path) 42 | fastqFiles = list_fq_files(file_path) 43 | if fastqFiles[0][0].startswith('trim_'): 44 | trim = False 45 | #--------------------- 2. trim reads----------------------------------------------------- 46 | def trim_parameters(): 47 | infiles,outfiles = replace_filename(fastqFiles,'^','trim_') 48 | for infile, output in zip(infiles,outfiles): 49 | yield infile,output 50 | #------------- run fastqc before trimming ----------- 51 | @active_if(QC) 52 | @jobs_limit(thread) 53 | @mkdir(fastqFiles,formatter(),'{path[0]}/fastqc') 54 | @files(trim_parameters) 55 | def run_QC1(input_file,output_file): 56 | for fq in input_file: 57 | sarge.run('fastqc {input} -o fastqc'.format(input=fq)) 58 | #------------ trim file ------------------ 59 | @active_if(trim) 60 | @follows(run_QC1) 61 | @jobs_limit(trim_batch) 62 | @files(trim_parameters) 63 | def trim_reads(input_file,output_file): 64 | n = num_thread2use(trim_batch,len(fastqFiles),thread) 65 | Trimmomatic(input_file,output_file,trimmomatic,n,adapter) 66 | remove(input_file) 67 | #------------ run fastqc after trimming ------------ 68 | @active_if(QC and trim) 69 | @follows(trim_reads) 70 | @jobs_limit(thread) 71 | @mkdir(fastqFiles,formatter(),'{path[0]}/fastqc') 72 | @files(trim_parameters) 73 | def run_QC2(input_file,output_file): 74 | for fq in output_file: 75 | sarge.run('fastqc {input} -o fastqc'.format(input=fq)) 76 | #--------------------- 3. run STAR ------------------------------------------------------ 77 | # build index 78 | @active_if(not os.path.exists(db_path)) 79 | @follows(trim_reads,run_QC2) 80 | def star_index(): 81 | STAR_Db(db_path,ref_fa,thread) 82 | # align 83 | if trim == False: 84 | trim_reads=fastqFiles 85 | @jobs_limit(star_batch) 86 | @follows(star_index) 87 | @mkdir(fastqFiles,formatter(),'{path[0]}/bam') 88 | @check_if_uptodate(check_file_exists) 89 | @transform(trim_reads,formatter('.*\.f.*?\.gz'),'bam/{basename[0]}.bam') 90 | def run_star(input_file,output_file): 91 | n = num_thread2use(star_batch,len(fastqFiles),thread) 92 | STAR(input_file,output_file,db_path,n,annotation,['--outSAMtype BAM','Unsorted','--outSAMunmapped Within']) 93 | #--------------------- 4. samtools sort by name ----------------------------------------- 94 | @jobs_limit(trim_batch) 95 | @follows(run_star) 96 | @mkdir(fastqFiles,formatter(),'{path[0]}/sortBam') 97 | @check_if_uptodate(check_file_exists) 98 | @transform(run_star,formatter('.*\.bam'),'sortBam/{basename[0]}.sort.bam') 99 | def sort_by_name(input_file,output_file): 100 | n = num_thread2use(trim_batch,len(fastqFiles),thread) 101 | sortBam(input_file,output_file,n,sortType='name') 102 | stat = sarge.get_stdout('samtools flagstat {bam}'.format(bam=output_file)) 103 | with open(output_file[:-3]+'flagstat.txt','w') as f: 104 | f.write(stat) 105 | @follows(sort_by_name) 106 | def remove_bam(): 107 | if os.path.exists('bam'): shutil.rmtree('bam') # remove bam folder 108 | #--------------------- 5. 
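# [note] htseq_count() used below comes from Modules/HTseq.py (source not
# shown here). Given the name-sorted BAM produced in step 4, it presumably
# shells out roughly like this; the defaults in this sketch are assumptions,
# with strand and annotation source passed through from the YAML:
import sarge

def _htseq_count_sketch(bam, out, gff, strand='no', feature='exon', attr='gene_id'):
    """Count reads per gene from a name-sorted BAM (hypothetical wrapper)."""
    sarge.run('htseq-count -f bam -r name -s {s} -t {t} -i {i} {b} {g} > {o}'.format(
        s=strand, t=feature, i=attr, b=bam, g=gff, o=out))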
run htseq ----------------------------------------------------- 109 | @follows(remove_bam) 110 | @mkdir(fastqFiles,formatter(),'{path[0]}/htseq') 111 | @check_if_uptodate(check_file_exists) 112 | @transform(sort_by_name,formatter('.*\.sort\.bam'),'htseq/{basename[0]}.txt') 113 | def run_htseq(input_file,output_file): 114 | htseq_count(input_file,output_file,annotation,strand,htseq_anno_source) 115 | #--------------------- 6. ID conversion ----------------------------------------------------- 116 | @active_if(htseq_anno_source!='') 117 | @follows(run_htseq) 118 | @transform(run_htseq,suffix('.txt'),'.count.txt') 119 | def id_convert(input_file,output_file): 120 | print(input_file+ '--->' + output_file) 121 | # 1. get dictionary 122 | if id_name == 'id': 123 | sym2ID = 'yes' 124 | else: 125 | sym2ID = 'no' 126 | dic = get_gene_name_id_dic(annotation,htseq_anno_source,sym2ID) 127 | gene_id_name_convert(input_file,output_file,dic) 128 | #--------------------- 7. return finish message ----------------------------------------------------- 129 | @follows(id_convert,run_htseq) 130 | def last_function(): 131 | Message('RNA_count finished',contact) 132 | 133 | if __name__ == '__main__': 134 | try: 135 | pipeline_run([last_function],multiprocess=thread,gnu_make_maximal_rebuild_mode = False, 136 | touch_files_only=False,verbose=5) 137 | except: 138 | Message('RNA_count failed',contact) 139 | -------------------------------------------------------------------------------- /Modules/Homer.py: -------------------------------------------------------------------------------- 1 | import sarge,sys,os 2 | import pandas as pd 3 | import matplotlib as mpl 4 | mpl.use('Agg') 5 | import matplotlib.pyplot as plt 6 | mpl.style.use('ggplot') 7 | 8 | 9 | def make_tag_directory(in_bam,tag_dir,ref_fa): 10 | '''make tag directory, which extracts mapping positions into a tsv file 11 | ''' 12 | cmd = ('makeTagDirectory {o_dir} -genome {g} -checkGC \ 13 | -single {bam}').format(o_dir=tag_dir,g=ref_fa,bam=in_bam) 14 | print(cmd);sys.stdout.flush() 15 | sarge.run(cmd) 16 | 17 | 18 | def rm_5GRO_ctrl(GRO5_tag,ctrl_tag): 19 | '''this function removes ctrl_tag coverage from the GRO-Cap tags 20 | ''' 21 | before_sub = GRO5_tag+'/genome.tages_before_sub.tsv' 22 | after_sub = GRO5_tag+'/genome.tags.tsv' 23 | os.rename(after_sub,before_sub) 24 | df_raw = pd.read_csv(before_sub,sep='\t',header=None) 25 | df_ctr = pd.read_csv(ctrl_tag+'/genome.tags.tsv',sep='\t',header=None) 26 | 27 | 28 | def hist(tag_dir,hist_out,ref_fa,anno,mode='tss',peak='',region=4000,res=10,pc=3): 29 | '''this function gets tag coverage around tss 30 | * tag_dir: tag directory 31 | * anno: gff file or gtf file 32 | * pc: number of tags to consider at each position 33 | * region: length to consider on the x axis; the default 4000 means -2000 to 2000 around the TSS 34 | * res: resolution of the histogram. 
35 | ''' 36 | if anno.endswith('gtf'): 37 | anno = '-gtf ' + anno 38 | else: 39 | anno = '-gff ' + anno 40 | if mode == 'tss': 41 | cmd = ('annotatePeaks.pl tss {ref_fa} {anno} -size {size} -hist {bin} -d {dir} -pc {pc} > {out}').format( 42 | ref_fa=ref_fa,anno=anno,size=str(region),bin=str(res),dir=tag_dir,pc=str(pc),out=hist_out) 43 | elif mode == 'peak': 44 | if peak == '': 45 | raise ValueError('input is empty') 46 | cmd = ('annotatePeaks.pl {peak} {ref_fa} {anno} -size {size} -hist {bin} -d {dir} -pc {pc} > {out}').format( 47 | peak=peak,ref_fa=ref_fa,anno=anno,size=str(region),bin=str(res),dir=tag_dir,pc=str(pc),out=hist_out) 48 | sarge.run(cmd) 49 | 50 | 51 | def hist_plot(hist_out): 52 | #Visualize histogram. 53 | plt.figure() 54 | df = pd.read_csv(hist_out,sep='\t',header=0,names=['Distance from TSS','Coverage','+ Tags','- Tags']) 55 | plt.plot(df['Distance from TSS'],df['+ Tags'],label='+ Tags') 56 | plt.plot(df['Distance from TSS'],df['- Tags'],label='- Tags') 57 | plt.xlim([-500,500]) 58 | plt.xlabel('Distance from TSS') 59 | plt.ylabel('Reads per bp per TSS') 60 | plt.axvline(x=0,c='k') 61 | plt.legend(loc='upper right') 62 | 63 | plt.savefig(os.path.splitext(hist_out)[0]+'.png') 64 | 65 | # if __name__ == '__main__': 66 | # import glob 67 | # hists = glob.glob('/data/shangzhong/TSS/fq/f03_tags/*/hist.txt') 68 | # for h in hists: 69 | # plt.figure() 70 | # hist_plot(h) 71 | 72 | 73 | def find_peaks(tag_dir,out_file,peak_style,control_dir,otherParams=['']): 74 | '''find peaks 75 | ''' 76 | cmd = 'findPeaks {tag} -style {style} -o {out} -i {control} {other}'.format( 77 | tag=tag_dir,style=peak_style,out=out_file,control=control_dir, 78 | other=' '.join(otherParams)) 79 | print(cmd);sys.stdout.flush() 80 | sarge.run(cmd) 81 | 82 | 83 | def merge_peaks(input_files,output_file,dist): 84 | ''' 85 | * input_files: a list of peak files, name format is 5gro_and_gro 86 | * output_file: final merged peak file 87 | ''' 88 | cmd = ('mergePeaks -d {dist} {in_files} > {out}').format(dist=str(dist), 89 | in_files=' '.join(input_files),out=output_file) 90 | print(cmd);sys.stdout.flush() 91 | sarge.run(cmd) 92 | 93 | 94 | def annotate_peaks(peak_file,output_file,ref_fa,annotation): 95 | ''' 96 | this function annotates peaks, basically getting the closest TSS to each peak. 97 | ''' 98 | if annotation.endswith('gtf'): 99 | anno = '-gtf ' + annotation 100 | else: 101 | anno = '-gff ' + annotation 102 | cmd = 'annotatePeaks.pl {peaks} {genome} {annotation} > {out}'.format(peaks=peak_file,genome=ref_fa, 103 | annotation=anno, out=output_file) 104 | print(cmd);sys.stdout.flush() 105 | sarge.run(cmd) 106 | 107 | 108 | def filter_anno_peak(in_peak_file,filter_peak_file): 109 | '''this function extracts the reliable TSS from the peak file 110 | The rule is: for each 5GRO, get overlap of peaks against different GROseq. 111 | Then get the union set from previous peaks. 
112 | ''' 113 | df = pd.read_csv(in_peak_file,sep='\t',header=0) 114 | gro5 = [] 115 | gro = [] 116 | for f in df['Focus Ratio/Region Size']: 117 | files = f.split('|') 118 | for sub_f in files: # sub_f is peak file result 119 | peaks = sub_f.split('_and_') # peaks has 5gro file and gro file name 120 | for p in peaks: 121 | if p in gro5 or p in gro: 122 | continue 123 | else: 124 | if '5GRO' in p: 125 | gro5.append(p) 126 | else: 127 | gro.append(p) 128 | print(gro5,gro) 129 | def extract_peak(gro5,gro,fns): 130 | '''fns is the split filename in the 6th column of the annotated peak file''' 131 | keep = [] 132 | res = True 133 | for g5 in gro5: 134 | keep.append([g5+'_and_'+g in fns for g in gro]) 135 | for k in keep: 136 | if False in k: 137 | res=False 138 | return res 139 | 140 | # filter out the annopeak 141 | cri = df['Focus Ratio/Region Size'].map(lambda x: extract_peak(gro5,gro,x)) 142 | df = df[cri] 143 | df.to_csv(filter_peak_file,sep='\t',index=False) 144 | 145 | # if __name__ == '__main__': 146 | # in_peak_file = '/data/shangzhong/TSS/fq/f05_annoPeaks/merge.anno' 147 | # filter_peak_file = '/data/shangzhong/TSS/fq/f05_annoPeaks/merge_filter.anno' 148 | # filter_anno_peak(in_peak_file,filter_peak_file) 149 | 150 | 151 | -------------------------------------------------------------------------------- /RibosomeProfiling.py: -------------------------------------------------------------------------------- 1 | from ruffus import * 2 | from Modules.f01_file_process import * 3 | from Modules.Aligner import hisat2_Db,hisat2 4 | from Modules.Trimmomatic import conda_Trimmomatic 5 | from Modules.Samtools import sortBam 6 | import yaml 7 | import sys,shutil 8 | #============ parameters ====================== 9 | parameter_file = sys.argv[1] 10 | #parameter_file = '/data/shangzhong/Proteogenomics/RNAseq_count.yaml' 11 | with open(parameter_file,'r') as f: 12 | doc = yaml.load(f) 13 | p = dic2obj(**doc) 14 | #------------- get parameters ----------- 15 | file_path = p.RawDataPath 16 | thread = p.thread 17 | QC = p.QC 18 | # all parameter 19 | rRNA_fa = p.rRNA_fa 20 | ref_fa = p.ref_fa 21 | annotation = p.gff 22 | # trimmomatic parameter 23 | trim = p.trim_reads 24 | trim_batch = p.trim_jobs_per_batch 25 | min_len = p.min_len 26 | adapter = p.adapter 27 | # hisat2 parameter 28 | hisat2_batch = p.hisat2_jobs_per_batch 29 | hisat2_rRNA_db = p.hisat2_rrna_index 30 | hisat2_target_db = p.hisat2_target_index 31 | 32 | other = p.other 33 | contact = p.contact 34 | #=============================================================================== 35 | # Pipeline part 36 | #=============================================================================== 37 | #--------------------- 1. read all files ------------------------------------------------ 38 | Message('Riboseq start',contact) 39 | os.chdir(file_path) 40 | fastqFiles = list_fq_files(file_path) 41 | if fastqFiles[0][0].startswith('trim_'): 42 | trim = False 43 | #--------------------- 2. 
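# [note] trim_parameters() below relies on replace_filename
# (Modules.f01_file_process, source not shown) to pair each input fastq with
# a 'trim_'-prefixed output name. A sketch consistent with the '^' regex
# pattern it is always called with:
import re

def _replace_filename_sketch(fq_groups, pattern, repl):
    """Return (infiles, outfiles) with repl substituted at pattern (sketch)."""
    outs = [[re.sub(pattern, repl, fq) for fq in grp] for grp in fq_groups]
    return fq_groups, outs

# _replace_filename_sketch([['a_1.fq.gz', 'a_2.fq.gz']], '^', 'trim_')
#   -> ([['a_1.fq.gz', 'a_2.fq.gz']], [['trim_a_1.fq.gz', 'trim_a_2.fq.gz']])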
trim reads----------------------------------------------------- 44 | def trim_parameters(): 45 | infiles,outfiles = replace_filename(fastqFiles,'^','trim_') 46 | for infile, output in zip(infiles,outfiles): 47 | yield infile,output 48 | #------------- run fastqc before trimming ----------- 49 | @active_if(QC) 50 | @jobs_limit(thread) 51 | @mkdir(fastqFiles,formatter(),'{path[0]}/fastqc') 52 | @files(trim_parameters) 53 | def run_QC1(input_file,output_file): 54 | for fq in input_file: 55 | sarge.run('fastqc {input} -o fastqc'.format(input=fq)) 56 | #------------ trim file ------------------ 57 | @active_if(trim) 58 | @follows(run_QC1) 59 | @jobs_limit(trim_batch) 60 | @files(trim_parameters) 61 | def trim_reads(input_file,output_file): 62 | n = num_thread2use(trim_batch,len(fastqFiles),thread) 63 | conda_Trimmomatic(input_file,output_file,n,adapter,min_len) 64 | remove(input_file) 65 | # run fastqc after trimming 66 | if QC: 67 | for fq in output_file: 68 | sarge.run('fastqc {input} -o fastqc'.format(input=fq)) 69 | #--------------------- 3. Align to rtRNA ----------------------------------------------- 70 | # build index 71 | @active_if(not os.path.exists(hisat2_rRNA_db)) 72 | @follows(trim_reads,run_QC1) 73 | def hisat2_rrna_index(): 74 | if not os.path.exists(hisat2_rRNA_db): os.mkdir(hisat2_rRNA_db) 75 | hisat2_Db(rRNA_fa,hisat2_rRNA_db+'/rRNA',thread) 76 | # align 77 | if trim == False: 78 | trim_reads=fastqFiles 79 | @jobs_limit(hisat2_batch) 80 | @follows(hisat2_rrna_index) 81 | @mkdir(fastqFiles,formatter(),'{path[0]}/f01rRNA_bam') 82 | @check_if_uptodate(check_file_exists) 83 | @transform(trim_reads,formatter('.*\.f.*?\.gz'),'f01rRNA_bam/{basename[0]}_norrna.fq.gz') 84 | def run_hisat2rRNA(input_file,output_file): 85 | n = num_thread2use(hisat2_batch,len(fastqFiles),thread) 86 | rrna_fq = output_file[:-13]+'.bam' 87 | hisat2(input_file,rrna_fq,hisat2_rRNA_db+'/rRNA',n,['--un-gz',output_file]) 88 | #--------------------- 3. Align to target genome ---------------------------------------- 89 | @active_if(not os.path.exists(hisat2_target_db)) 90 | @follows(run_hisat2rRNA) 91 | def hisat2_index(): 92 | if not os.path.exists(hisat2_target_db): os.mkdir(hisat2_target_db) 93 | hisat2_Db(ref_fa,hisat2_target_db+'/target',thread) 94 | # align 95 | @jobs_limit(hisat2_batch) 96 | @follows(hisat2_index) 97 | @mkdir(fastqFiles,formatter(),'{path[0]}/f02_bam') 98 | @check_if_uptodate(check_file_exists) 99 | @transform(run_hisat2rRNA,formatter('.*\.f.*?\.gz'),'f02_bam/{basename[0]}.bam') 100 | def run_hisat2(input_file,output_file): 101 | n = num_thread2use(hisat2_batch,len(fastqFiles),thread) 102 | hisat2([input_file],output_file,hisat2_target_db+'/target',n,['--known-splicesite-infile',annotation]) 103 | #--------------------- 4. get primary mapping ------------------------------------------- 104 | @jobs_limit(trim_batch) 105 | @follows(run_hisat2) 106 | @mkdir(fastqFiles,formatter(),'{path[0]}/f03_primaryBam') 107 | @check_if_uptodate(check_file_exists) 108 | @transform(run_hisat2,formatter('.*\.bam'),'f03_primaryBam/{basename[0]}.bam') 109 | def primary_bam(input_file,output_file): 110 | cmd = ('samtools view -h {fst_map} | grep -E {pattern} | ' 111 | 'samtools view -bh -F 256 - > {out}').format(fst_map=input_file, 112 | pattern='\'(NM:i:[012])|(^@)\'',out=output_file) 113 | print(cmd) 114 | sarge.run(cmd) 115 | #--------------------- 5. 
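# [note] primary_bam() above keeps SAM header lines ('^@') plus alignments
# with at most two mismatches (NM:i:0/1/2), and 'samtools view -F 256' then
# drops secondary alignments so each ribosome footprint is counted once. The
# same filter in pysam, as an illustration only (pysam is an assumption; the
# pipeline itself pipes samtools through grep):
import pysam

def _primary_low_mismatch_sketch(in_bam, out_bam, max_nm=2):
    """Keep primary alignments with <= max_nm mismatches (sketch)."""
    with pysam.AlignmentFile(in_bam, 'rb') as ib, \
         pysam.AlignmentFile(out_bam, 'wb', template=ib) as ob:
        for r in ib:
            if not r.is_secondary and r.has_tag('NM') and r.get_tag('NM') <= max_nm:
                ob.write(r)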
samtools sort by position -------------------------------------- 116 | @jobs_limit(trim_batch) 117 | @follows(primary_bam) 118 | @mkdir(fastqFiles,formatter(),'{path[0]}/f04_sortBam') 119 | @check_if_uptodate(check_file_exists) 120 | @transform(primary_bam,formatter('.*\.bam'),'f04_sortBam/{basename[0]}.sort.bam') 121 | def sort_by_pos(input_file,output_file): 122 | n = num_thread2use(trim_batch,len(fastqFiles),thread) 123 | sortBam(input_file,output_file,n) 124 | stat = sarge.get_stdout('samtools flagstat {bam}'.format(bam=output_file)) 125 | with open(output_file[:-3]+'flagstat.txt','w') as f: 126 | f.write(stat) 127 | # @follows(sort_by_pos) 128 | # def remove_bam(): 129 | # if os.path.exists('f01rRNA_bam'): shutil.rmtree('f01rRNA_bam') # remove bam folder 130 | 131 | 132 | @follows(sort_by_pos) 133 | def last_function(): 134 | Message('Riboseq finished',contact) 135 | 136 | if __name__ == '__main__': 137 | try: 138 | pipeline_run([last_function],multiprocess=thread,gnu_make_maximal_rebuild_mode = True, 139 | touch_files_only=False,verbose=5) 140 | except: 141 | Message('Riboseq failed',contact) 142 | -------------------------------------------------------------------------------- /RNAseq_STARpipeline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # RNAseq qc and quantification 4 | #QC: Picard and RNA-seQC 5 | #quantification: HT-seq and RSEM 6 | 7 | ##### Constants 8 | 9 | 10 | ##### Functions 11 | 12 | 13 | usage() 14 | { 15 | echo "usage: STARpipeline.sh -f1 fastq1 -f2 fastq2 -sid sample_id -starIndx star_index_directory -BAM STAR_bam_path -refFASTA referenceGenome -GFF referenceGTF -RSEM RSEMreference -ncores num_cores" 16 | } 17 | 18 | 19 | ##### Main 20 | #fastq1="/data/vahid/RNAseqB5STAR/1870-01_W4G6DG07_S55/fastq1.gz" 21 | #fastq2="/data/vahid/RNAseqB5STAR/1870-01_W4G6DG07_S55/fastq2.gz" 22 | #sample_id="1870-01_W4G6DG07_S55" 23 | #bam_path="/data/vahid/RNAseqB5STAR/1870-01_W4G6DG07_S55/" 24 | #refGenome="/data/vahid/RNAseqPipeLine/Homo_sapiens_assembly38_noALT_noHLA_noDecoy_ERCC.fasta" 25 | #refGTF="/data/vahid/RNAseqPipeLine/gencode.v26.GRCh38.genes.gtf" 26 | #RSEMpath="/data/vahid/RNAseqPipeLine/rsemRef/rsem_reference" 27 | 28 | ncores=10 29 | while [ "$1" != "" ]; do 30 | case $1 in 31 | -f1 | --fastq1 ) shift 32 | fastq1=$1 33 | ;; 34 | -f2 | --fastq2 ) shift 35 | fastq2=$1 36 | ;; 37 | -sid | --sample_id ) shift 38 | sample_id=$1 39 | ;; 40 | -starIndx | --Star_index_directory ) shift 41 | star_index=$1 42 | ;; 43 | -BAM | --BAMpath ) shift 44 | bam_path=$1 45 | ;; 46 | -refFASTA | --refGenome ) shift 47 | refGenome=$1 48 | ;; 49 | -GFF | --refGFF ) shift 50 | refGTF=$1 51 | ;; 52 | -ncores | --num_cores ) shift 53 | ncores=$1 54 | ;; 55 | -RSEM | --RSEMreference ) shift 56 | RSEMpath=$1 57 | ;; 58 | -h | --help ) usage 59 | exit 60 | ;; 61 | esac 62 | shift 63 | done 64 | 65 | #mkdir -p -- ${bam_path} 66 | 67 | #cp -f "$fastq1" ${bam_path}fastq1.gz 68 | #cp -f "$fastq2" ${bam_path}fastq2.gz 69 | 70 | bam_file_name="Aligned.sortedByCoord.out.bam" 71 | 72 | STAR --runMode alignReads --runThreadN ${ncores} --genomeDir ${star_index} \ 73 | --twopassMode Basic \ 74 | --outFilterMultimapNmax 20 \ 75 | --alignSJoverhangMin 8 \ 76 | --alignSJDBoverhangMin 1 \ 77 | --outFilterMismatchNmax 999 \ 78 | --outFilterMismatchNoverLmax 0.1 \ 79 | --alignIntronMin 20 \ 80 | --alignIntronMax 1000000 \ 81 | --alignMatesGapMax 1000000 \ 82 | --outFilterType BySJout \ 83 | --outFilterScoreMinOverLread 0.33 \ 84 | 
--outFilterMatchNminOverLread 0.33 \ 85 | --limitSjdbInsertNsj 1200000 \ 86 | --readFilesIn $fastq1 $fastq2 \ 87 | --readFilesCommand zcat \ 88 | --outFileNamePrefix ${bam_path} \ 89 | --outSAMstrandField intronMotif \ 90 | --outFilterIntronMotifs None \ 91 | --alignSoftClipAtReferenceEnds Yes \ 92 | --quantMode TranscriptomeSAM GeneCounts \ 93 | --outSAMtype BAM SortedByCoordinate \ 94 | --outSAMunmapped Within \ 95 | --genomeLoad NoSharedMemory \ 96 | --chimSegmentMin 15 \ 97 | --chimJunctionOverhangMin 15 \ 98 | --chimOutType WithinBAM SoftClip \ 99 | --chimMainSegmentMultNmax 1 \ 100 | --outSAMattributes NH HI AS nM NM ch --outSAMattrRGline ID:rg1 SM:sm1 101 | 102 | #Marking the duplicates using picard 103 | mkdir -p -- ${bam_path}markedDup 104 | java -jar /data/vahid/RNAseqPipeLine/picard.jar \ 105 | MarkDuplicates I=${bam_path}${bam_file_name} \ 106 | O=${bam_path}markedDup/${bam_file_name} \ 107 | M=${bam_path}/markedDup/sample_id.marked_dup_metrics.txt \ 108 | ASSUME_SORT_ORDER=coordinate 109 | 110 | #Running Picard QC 111 | mkdir -p -- ${bam_path}/PicardQC 112 | java -jar /data/vahid/RNAseqPipeLine/picard.jar CollectAlignmentSummaryMetrics \ 113 | R=${refGenome} \ 114 | INPUT=${bam_path}${bam_file_name} \ 115 | OUTPUT=${bam_path}PicardQC/picard_QC.txt 116 | 117 | #Running Picard fragment size 118 | java -jar /data/vahid/RNAseqPipeLine/picard.jar CollectInsertSizeMetrics \ 119 | I=${bam_path}${bam_file_name} \ 120 | O=${bam_path}PicardQC/insert_size_metrics.txt \ 121 | H=${bam_path}PicardQC/insert_size_histogram.pdf \ 122 | M=0.5 123 | 124 | #Running samtools statistics on BAM indeces 125 | samtools index ${bam_path}${bam_file_name} 126 | samtools idxstats ${bam_path}${bam_file_name} > ${bam_path}PicardQC/indexStats.txt 127 | 128 | #Running RNA-seQC 129 | 130 | md_bam_file=${bam_path}markedDup/Aligned.sortedByCoord.out.bam 131 | mkdir -p -- ${bam_path}RNA_SeQC 132 | 133 | 134 | 135 | samtools index ${md_bam_file} 136 | 137 | javaPath="/home/vahid/.conda/pkgs/java-1.7.0-openjdk-cos6-x86_64-1.7.0.131-h06d78d4_0/x86_64-conda_cos6-linux-gnu/sysroot/usr/lib/jvm/java-1.7.0-openjdk-1.7.0.131.x86_64/jre/bin/" 138 | ${javaPath}java -jar /data/vahid/RNAseqPipeLine/RNA-SeQC_1.1.9/RNA-SeQC.jar -n 1000 \ 139 | -s ${sample_id},${md_bam_file},${sample_id} \ 140 | -t ${refGTF} \ 141 | -r ${refGenome} \ 142 | -noDoC \ 143 | -strictMode \ 144 | -o ${bam_path}RNA_SeQC \ 145 | -gatkFlags --allow_potentially_misencoded_quality_scores \ 146 | -singleEnd no 147 | 148 | 149 | #Running RSEM 150 | 151 | mkdir -p -- ${bam_path}RSEM 152 | /data/vahid/RNAseqPipeLine/RSEMpkg/RSEM-1.2.25/rsem-calculate-expression --num-threads 4 \ 153 | --fragment-length-max 1000 \ 154 | --estimate-rspd \ 155 | --no-bam-output \ 156 | --paired-end \ 157 | --bam ${bam_path}Aligned.toTranscriptome.out.bam \ 158 | ${RSEMpath} ${bam_path}RSEM/RSEM 159 | 160 | #Running HT-Seq 161 | mkdir -p -- ${bam_path}HTseq 162 | htseq-count -f bam -r pos -s no -t gene ${bam_path}${bam_file_name} ${refGTF} > ${bam_path}HTseq/HTseqGeneUnion.txt & 163 | sleep 1m 164 | htseq-count -f bam -r pos -s no -t transcript ${bam_path}${bam_file_name} ${refGTF} > ${bam_path}HTseq/HTseqTranscriptUnion.txt & 165 | sleep 1m 166 | htseq-count -f bam -r pos -s no -t exon ${bam_path}${bam_file_name} ${refGTF} > ${bam_path}HTseq/HTseqExonUnion.txt & 167 | sleep 1m 168 | htseq-count -f bam -r pos -s no -t gene -m intersection-strict ${bam_path}${bam_file_name} ${refGTF} > ${bam_path}HTseq/HTseqGeneIntersect.txt & 169 | sleep 1m 170 | htseq-count -f bam -r pos -s no -t 
transcript -m intersection-strict ${bam_path}${bam_file_name} ${refGTF} > ${bam_path}HTseq/HTseqTranscriptIntersect.txt & 171 | sleep 1m 172 | htseq-count -f bam -r pos -s no -t exon -m intersection-strict ${bam_path}${bam_file_name} ${refGTF} > ${bam_path}HTseq/HTseqExonIntersect.txt 173 | sleep 30m 174 | 175 | 176 | 177 | #rm -- ${bam_path}fastq1.gz 178 | #rm -- ${bam_path}fastq2.gz 179 | rm -rf -- ${bam_path}_STARpass1 180 | rm -rf -- ${bam_path}_STARgenome 181 | 182 | rm -rf -- ${bam_path}markedDup 183 | 184 | rm -- ${bam_path}Aligned.toTranscriptome.out.bam 185 | rm -- ${bam_path}Aligned.sortedByCoord.out.bam 186 | rm -- ${bam_path}Aligned.sortedByCoord.out.bam.bai -------------------------------------------------------------------------------- /GRO_Seq_Cap.py: -------------------------------------------------------------------------------- 1 | from ruffus import * 2 | from Modules.f01_file_process import * 3 | from Modules.Trimmomatic import Trimmomatic 4 | from Modules.Aligner import STAR,STAR_Db 5 | from Modules.Samtools import sortBam 6 | from Modules.Homer import * 7 | import yaml 8 | import shutil 9 | import itertools 10 | 11 | #============ parameters ====================== 12 | parameter_file = sys.argv[1] 13 | # parameter_file = '/home/shangzhong/Codes/NGS-Pipeline/Parameters/GRO_Seq_Cap.yaml' 14 | with open(parameter_file,'r') as f: 15 | doc = yaml.load(f) 16 | p = dic2obj(**doc) 17 | #------------- get parameters ----------- 18 | file_path = p.RawDataPath 19 | thread = p.thread 20 | QC = p.QC 21 | # all parameter 22 | ref_fa = p.ref_fa 23 | annotation = p.gff 24 | # trimmomatic parameter 25 | trim = p.trim_reads 26 | trimmomatic = p.trimmomatic_path 27 | trim_batch = p.trim_jobs_per_batch 28 | adapter = p.adapter 29 | # star parameter 30 | star_batch = p.star_jobs_per_batch 31 | db_path = p.STAR_index_path 32 | 33 | contact = p.contact 34 | #=============================================================================== 35 | # Pipeline part 36 | #=============================================================================== 37 | #--------------------- 1. read all files ------------------------------------------------ 38 | # Message('5GRO',contact) 39 | os.chdir(file_path) 40 | fastqFiles = list_fq_files(file_path) 41 | if fastqFiles[0][0].startswith('trim_'): 42 | trim = False 43 | #--------------------- 2. 
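# [note] Message (Modules.f01_file_process, source not shown) is how every
# pipeline here reports start/finish/failure to the YAML 'contact'. Its
# mechanism is not visible in this dump; a minimal sketch, assuming contact
# is an email address, a local SMTP server, and Python 3's smtplib:
import smtplib
from email.mime.text import MIMEText

def _message_sketch(text, contact):
    """Mail text to contact, falling back to stdout (hypothetical helper)."""
    msg = MIMEText(text)
    msg['Subject'], msg['From'], msg['To'] = text, 'pipeline@localhost', contact
    try:
        server = smtplib.SMTP('localhost')
        server.send_message(msg)
        server.quit()
    except Exception:
        print(text)   # no mail server available; still surface the message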
trim reads----------------------------------------------------- 44 | def trim_parameters(): 45 | infiles,outfiles = replace_filename(fastqFiles,'^','trim_') 46 | for infile, output in zip(infiles,outfiles): 47 | yield infile,output 48 | #------------- run fastqc before trimming ----------- 49 | @active_if(QC) 50 | @jobs_limit(thread) 51 | @mkdir(fastqFiles,formatter(),'{path[0]}/f01_fastqc') 52 | @files(trim_parameters) 53 | def run_QC1(input_file,output_file): 54 | for fq in input_file: 55 | sarge.run('fastqc {input} -o f01_fastqc'.format(input=fq)) 56 | #------------ trim file ------------------ 57 | @active_if(trim) 58 | @follows(run_QC1) 59 | @jobs_limit(trim_batch) 60 | @files(trim_parameters) 61 | def trim_reads(input_file,output_file): 62 | n = num_thread2use(trim_batch,len(fastqFiles),thread) 63 | Trimmomatic(input_file,output_file,trimmomatic,n,adapter,22) 64 | remove(input_file) 65 | #------------ run fastqc after trimming ------------ 66 | @active_if(QC and trim) 67 | @follows(trim_reads) 68 | @jobs_limit(thread) 69 | @transform(trim_reads,formatter('.*\.f.*?\.gz'),'f01_fastqc/{basename[0]}') 70 | @check_if_uptodate(check_file_exists) 71 | # @files(trim_parameters) 72 | def run_QC2(input_file,output_file): 73 | for fq_in,fq_out in zip(input_file,output_file): 74 | if fq_in.startswith('trim_'): 75 | sarge.run('fastqc {input} -o f01_fastqc'.format(input=fq_in)) 76 | else: 77 | sarge.run('fastqc {input} -o f01_fastqc'.format(input=fq_out)) 78 | #--------------------- 3. run STAR ------------------------------------------------------ 79 | # build index 80 | @active_if(not os.path.exists(db_path)) 81 | @follows(trim_reads,run_QC2) 82 | def star_index(): 83 | STAR_Db(db_path,ref_fa,thread) 84 | # align 85 | if trim == False: 86 | trim_reads=fastqFiles 87 | @jobs_limit(star_batch) 88 | @follows(star_index) 89 | @mkdir(fastqFiles,formatter(),'{path[0]}/f02_bam') 90 | @mkdir(fastqFiles,formatter(),'{path[0]}/f02_flagstat') 91 | @check_if_uptodate(check_file_exists) 92 | @transform(trim_reads,formatter('.*\.f.*?\.gz'),'f02_bam/{basename[0]}.bam') 93 | def run_star(input_file,output_file): 94 | n = num_thread2use(star_batch,len(fastqFiles),thread) 95 | STAR(input_file,output_file,db_path,n,annotation,['--outSAMtype BAM','SortedByCoordinate','--outSAMunmapped Within']) 96 | stat = sarge.get_stdout('samtools flagstat {bam}'.format(bam=output_file)) 97 | flag_fn = output_file[:-3]+'flagstat.txt' 98 | with open(flag_fn,'w') as f: 99 | f.write(stat) 100 | shutil.move(flag_fn,'f02_flagstat') 101 | #--------------------- 4. make tag_directory ------------------------------------------------------ 102 | @follows(run_star) 103 | @mkdir(fastqFiles,formatter(),'{path[0]}/f03_tags') 104 | @check_if_uptodate(check_file_exists) 105 | @transform(run_star,formatter('\.bam'),'f03_tags/{basename[0]}') 106 | def make_tag(input_bam,out_dir): 107 | make_tag_directory(input_bam,out_dir,ref_fa) 108 | hist_out = out_dir+'/hist.txt' 109 | hist(out_dir,hist_out,ref_fa,annotation,mode='tss',peak='',region=4000,res=10,pc=3) 110 | hist_plot(hist_out) 111 | #--------------------- 5. 
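# [note] get_input_for_peak_call() below crosses every 5GRO-cap tag directory
# with every GRO-seq one via itertools.product, so findPeaks always sees a
# matched background, and the '_and_' in the output name is what
# filter_anno_peak() in Modules/Homer.py later parses apart. With
# hypothetical tag directories ['5GRO_A', '5GRO_B'] and ['GRO_1']:
#   list(itertools.product(['5GRO_A', '5GRO_B'], ['GRO_1']))
#     -> [('5GRO_A', 'GRO_1'), ('5GRO_B', 'GRO_1')]
#   i.e. peak files f04_peaks/5GRO_A_and_GRO_1.peak and
#        f04_peaks/5GRO_B_and_GRO_1.peak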
find peaks ------------------------------------------------------ 112 | def get_input_for_peak_call(): 113 | gro_cap = [f for f in os.listdir('f03_tags') if '5GRO' in f and 'contr' not in f] 114 | # gro_cap_ctr = [[f for f in os.listdir('f03_tags') if 'contr' in f]] 115 | gro_seq = [f for f in os.listdir('f03_tags') if '5GRO' not in f] 116 | comb = list(itertools.product(gro_cap,gro_seq)) 117 | for com in comb: 118 | out = com[0] + '_and_' + com[1] 119 | yield ['f03_tags/'+f for f in com],'f04_peaks/' + out + '.peak' 120 | @jobs_limit(thread) 121 | @follows(make_tag) 122 | @mkdir(fastqFiles,formatter(),'{path[0]}/f04_peaks') 123 | @files(get_input_for_peak_call) 124 | def find_peak(input_files,output_file): 125 | find_peaks(input_files[0],output_file,'tss',input_files[1],['-F 2']) 126 | #--------------------- 6. merge peaks ------------------------------------------------------ 127 | @follows(find_peak) 128 | @merge(find_peak,'f04_peaks/merge.peak') 129 | def merge_peak(input_files,output_file): 130 | merge_peaks(input_files,output_file,150) 131 | #--------------------- 6. annotate peaks ------------------------------------------------------ 132 | @jobs_limit(thread) 133 | @follows(merge_peak) 134 | @mkdir(fastqFiles,formatter(),'{path[0]}/f05_annoPeaks') 135 | @transform(merge_peak,formatter('\.peak'),'f05_annoPeaks/{basename[0]}.anno') 136 | @check_if_uptodate(check_file_exists) 137 | def anno_peak(input_file,output_file): 138 | annotate_peaks(input_file,output_file,ref_fa,annotation) 139 | #--------------------- 7. hist peaks ------------------------------------------------------ 140 | @jobs_limit(thread) 141 | @follows(anno_peak) 142 | @mkdir(fastqFiles,formatter(),'{path[0]}/f06_histPeaks') 143 | @transform(find_peak,formatter('\.peak'),'f06_histPeaks/{basename[0]}.hist') 144 | @check_if_uptodate(check_file_exists) 145 | def peak_cov_hist(input_file,output_file): # input is peak file 146 | gro_cap = [f for f in os.listdir('f03_tags') if '5GRO' in f] 147 | tag = ['f03_tags/' + t for t in gro_cap if t in input_file] 148 | hist(tag[0],output_file,ref_fa,annotation,mode='peak',peak=input_file,region=4000,res=25,pc=1) 149 | hist_plot(output_file) 150 | 151 | #--------------------- 8. 
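# [note] For debugging this graph, the commented pipeline_printout call in
# __main__ below is ruffus's textual dry run; pipeline_printout_graph (also a
# standard ruffus entry point) can draw the task graph instead:
#   pipeline_printout(sys.stdout, [last_function], verbose=3)         # dry run
#   pipeline_printout_graph('flowchart.svg', 'svg', [last_function])  # image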
merge peaks ------------------------------------------------------ 152 | 153 | @follows(peak_cov_hist) 154 | def last_function(): 155 | Message('GroCap finished',contact) 156 | 157 | 158 | if __name__ == '__main__': 159 | try: 160 | # pipeline_printout(sys.stdout, [last_function], verbose=3) 161 | pipeline_run([last_function],multiprocess=thread,gnu_make_maximal_rebuild_mode = False, 162 | touch_files_only=False,verbose=20) 163 | except: 164 | Message('GroCap failed',contact) 165 | 166 | 167 | -------------------------------------------------------------------------------- /Modules/Aligner.py: -------------------------------------------------------------------------------- 1 | import sarge 2 | import os 3 | import shutil 4 | import sys 5 | #=============================================================================== 6 | # STAR 7 | #=============================================================================== 8 | def STAR_Db(db_path,ref_fa,thread=1,annotation = '',genomeSize='large'): 9 | """ 10 | This function generates database for alignment using STAR 11 | """ 12 | if not os.path.exists(db_path): os.mkdir(db_path) 13 | if os.listdir(db_path) == []: 14 | cmd = ('STAR --runMode genomeGenerate --genomeDir {db_path} ' 15 | '--genomeFastaFiles {ref_fa} --runThreadN {thread} ' 16 | '--limitGenomeGenerateRAM 100000000000 ').format( 17 | db_path=db_path,ref_fa=ref_fa,thread=str(thread)) 18 | if annotation != '': 19 | cmd = cmd + ('--sjdbGTFfile {gff3} --sjdbGTFtagExonParentTranscript Parent ' 20 | '--sjdbOverhang 100').format(gff3=annotation) # for geneDb add --sjdbGTFfeatureExon CDS 21 | if genomeSize == 'small': 22 | cmd = cmd + '--genomeChrBinNbits 6 --genomeSAindexNbases 4' 23 | print(cmd);sys.stdout.flush() 24 | sarge.run(cmd) 25 | 26 | def STAR(fastqFiles,outSamFile,db_path,thread=1,annotation='',otherParameters=['']): 27 | """STAR for single end read""" 28 | if annotation != '': 29 | otherParameters.extend(['--sjdbGTFfile {gff}'.format(gff=annotation)]) 30 | if annotation.endswith('gff') or annotation.endswith('gff3'): 31 | otherParameters.append('--sjdbGTFtagExonParentTranscript Parent') 32 | # generate command 33 | if len(fastqFiles) == 1: 34 | starCmd = ('STAR --genomeDir {ref} --readFilesCommand zcat ' 35 | '--readFilesIn {fq1} --runThreadN {thread} ' 36 | '--outFileNamePrefix {output} --outSAMstrandField intronMotif ' 37 | '--outFilterIntronMotifs RemoveNoncanonical').format( 38 | ref=db_path,fq1=fastqFiles[0], 39 | thread=thread,output=outSamFile) 40 | elif len(fastqFiles) == 2: 41 | starCmd = ('STAR --genomeDir {ref} --readFilesCommand zcat ' 42 | '--readFilesIn {fq1} {fq2} --runThreadN {thread} ' 43 | '--outFileNamePrefix {output} --outSAMstrandField intronMotif ' 44 | '--outFilterIntronMotifs RemoveNoncanonical').format( 45 | ref=db_path,fq1=fastqFiles[0],fq2=fastqFiles[1], 46 | thread=thread,output=outSamFile) 47 | cmd = starCmd + ' ' + ' '.join(otherParameters) 48 | print(cmd);sys.stdout.flush() 49 | sarge.run(cmd) 50 | if 'SortedByCoordinate' in otherParameters: 51 | outFile = outSamFile+'Aligned.sortedByCoord.out.bam' 52 | else: 53 | outFile = outSamFile+'Aligned.out.bam' 54 | os.rename(outFile,outSamFile) 55 | if os.path.exists(outSamFile+'_STARgenome'): 56 | shutil.rmtree(outSamFile+'_STARgenome') 57 | 58 | 59 | def BLASR(faFile,outBam,ref_fa,thread,otherParameters=['']): 60 | """This function runs BLASR""" 61 | 62 | if otherParameters != ['']: 63 | other = ' '.join(otherParameters) 64 | else: 65 | other = '' 66 | cmd = ('blasr {input} {ref} -sam -nproc {thread} 
{other} | samtools view -hb - > {out}').format( 67 | input=faFile,ref=ref_fa,thread=str(thread),other=other,out=outBam,) 68 | 69 | print(cmd);sys.stdout.flush() 70 | sarge.run(cmd) 71 | 72 | #=============================================================================== 73 | # bwa 74 | #=============================================================================== 75 | def bwa_Db(db_path,ref_fa): 76 | """build bwa index""" 77 | if not os.path.exists(db_path): 78 | os.mkdir(db_path) 79 | cmd = ('bwa index -p {db_path}/bwa -a bwtsw {fa}').format(fa=ref_fa,db_path=db_path) 80 | print(cmd);sys.stdout.flush() 81 | sarge.run(cmd) 82 | 83 | 84 | def bwa_mem(fqFile,outSam,db_name,thread,otherParameters=['']): 85 | """run bwa""" 86 | if otherParameters != ['']: 87 | other = ' '.join(otherParameters) + ' ' 88 | else: 89 | other = '' 90 | if len(fqFile) == 1: 91 | bwaCmd = ('bwa mem -t {thread} {other}{db} {fq} | samtools view -bh - > {out} ').format( 92 | thread=str(thread),other=other,db=db_name,fq=fqFile[0], 93 | out=outSam) 94 | else: 95 | bwaCmd = ('bwa mem -t {thread} {other}{db} {fq1} {fq2} | samtools view -bh - > ' 96 | '{out} ').format(thread=str(thread),other=other,db=db_name,fq1=fqFile[0], 97 | fq2=fqFile[1],out=outSam) 98 | print(bwaCmd);sys.stdout.flush() 99 | sarge.run(bwaCmd) 100 | #bwa_mem('/data/shangzhong/Pacbio/sniffle/CHOS.fq.gz','/data/shangzhong/Pacbio/sniffle/result.bam','5',['-x pacbio']) 101 | 102 | 103 | def bwa_samblaster(fqFiles,outBam,db_name,thread,otherParameters=['']): 104 | '''map for lumpy ''' 105 | if len(fqFiles) != 2: 106 | assert False,'fastq files are not paired' 107 | if otherParameters != ['']: 108 | other = ' '.join(otherParameters) + ' ' 109 | else: 110 | other = '' 111 | split = outBam[:-3]+'split.sam' 112 | disc = outBam[:-3] + 'disc.sam' 113 | cmd = ('bwa mem -t {thread} {other}{db} {fq1} {fq2} | samblaster --addMateTags -e -d {disc} -s {split} | \ 114 | samtools view -Sb - > {out}').format(thread=str(thread),other=other,db=db_name,fq1=fqFiles[0], 115 | fq2=fqFiles[1],disc=disc,split=split,out=outBam) 116 | print(cmd);sys.stdout.flush() 117 | sarge.run(cmd) 118 | 119 | #=============================================================================== 120 | # HISAT2 121 | #=============================================================================== 122 | def hisat2_Db(ref_fa,db,thread=1): 123 | """ 124 | """ 125 | cmd = ('hisat2-build -p {t} {ref} {name} ').format(t=str(thread),ref=ref_fa,name=db) 126 | print(cmd);sys.stdout.flush() 127 | sarge.run(cmd) 128 | 129 | 130 | 131 | def hisat2(fqFile,outBam,db_name,thread,otherParameters=['']): 132 | """ 133 | """ 134 | if otherParameters != ['']: 135 | other = ' '.join(otherParameters) + ' ' 136 | else: 137 | other = '' 138 | if len(fqFile) == 1: 139 | hisat2Cmd = ('hisat2 -x {db} -U {fq} -t {other} -p {thread} ' 140 | '| samtools view -bh - > {out}').format(db=db_name,fq=fqFile[0], 141 | other=other,thread=str(thread),out=outBam) 142 | else: 143 | hisat2Cmd = ('hisat2 -x {db} -1 {fq1} -2 {fq2} -t {other} -p {thread} ' 144 | '| samtools view -bh - > {out}').format(db=db_name,fq1=fqFile[0],fq2=fqFile[1], 145 | other=other,thread=str(thread),out=outBam) 146 | 147 | print(hisat2Cmd);sys.stdout.flush() 148 | sarge.run(hisat2Cmd) 149 | 150 | #=============================================================================== 151 | # ngmlr 152 | #=============================================================================== 153 | def ngmlr(in_fa,outBam,ref_fa,thread): 154 | '''run nglmr for better SV detection using 
pacbio'''
155 |     cmd = ('ngmlr -t {thread} -r {ref} -q {fa} | samtools view -hb - > {out}').format(
156 |             thread=str(thread),ref=ref_fa,fa=in_fa,out=outBam)
157 |     print(cmd);sys.stdout.flush()
158 |     sarge.run(cmd)
--------------------------------------------------------------------------------
/VCF_snpEff_annotation.py:
--------------------------------------------------------------------------------
1 | """
2 | this pipeline annotates variant calling results in a vcf file and then
3 | uses PROVEAN to predict the effect of the variants
4 | """
5 | import sys,subprocess,os
6 | sys.path.append('/home/shangzhong/Codes/Pipeline')
7 | sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0) # disable output buffering
8 | from Modules.f11_snpEff_provean import *
9 | from Modules.p01_FileProcess import get_parameters
10 | from Modules.f00_Message import Message
11 | from Modules.p05_ParseGff import *
12 | from multiprocessing import Pool,Process
13 | #parFile = sys.argv[1]
14 | parFile = '/data/shangzhong/DNArepair/correction/Annotation_Parameters.txt'
15 | param = get_parameters(parFile)
16 | # parameters
17 | thread = param['thread']
18 | pathway = param['pathway']
19 | email = param['email']
20 | startMessage = param['startMessage']
21 | endMessage = param['endMessage']
22 | # database reference
23 | fastaFile = param['reference']
24 | record_dict = SeqIO.index(fastaFile,'fasta')
25 | gffFile = param['annotation']
26 | genome = param['genome']
27 | # software parameters
28 | snpSift = param['snpSift']
29 | snpEff = param['snpEff']
30 | provean = param['provean']
31 | support_set_path = param['support_set']
32 | provean_res_path = param['provean_results']
33 | # other parameters
34 | gene_file = param['gene_file']
35 | 
36 | #===============================================================================
37 | # Variant analysis pipeline
38 | #===============================================================================
39 | def chunk(l,n):
40 |     n = max(1,n)
41 |     res = [l[i:i+n] for i in range(0,len(l),n)]
42 |     return res
43 | 
44 | def get_genes_from_file(gene_file):
45 |     """read the gene list from the file and return a list of gene symbols"""
46 |     if gene_file == '':
47 |         genes = ['']
48 |     else:
49 |         genes = []
50 |         gene_df = pd.read_csv(gene_file,header=None,names=['GeneID'])
51 |         genes = gene_df['GeneID'].tolist()
52 |     return genes
53 | 
54 | def get_all_folders(pathway):
55 |     """put each pair of vcf/vcf.idx files into a separate folder and return the folder names"""
56 |     folders = []
57 |     files = [f for f in os.listdir(pathway) if f.endswith('.merged.filter.vcf')]
58 |     if files != []:
59 |         files = natsorted(files)
60 |         for f in files:
61 |             fp = f[:-18]
62 |             folders.append(fp)
63 |             if not os.path.exists(fp): os.mkdir(fp)
64 |             os.rename(f,fp+'/'+f)
65 |             os.rename(f+'.idx',fp+'/'+f+'.idx')
66 |     else:
67 |         all_folders = [fo for fo in os.listdir(pathway) if os.path.isdir(fo)]
68 |         for folder in all_folders:
69 |             fns = [f for f in os.listdir(folder) if f.endswith('merged.filter.vcf')]
70 |             if fns != []:
71 |                 folders.append(folder)
72 |     print 'listing directories succeeded'
73 |     print 'folders are:',folders
74 |     return folders
75 | 
76 | def prepare_fa_vari(workdir,snpEff,snpSift,email,genome,genes,record_dict,gffFile):
77 |     """
78 |     Prepare the input files for running PROVEAN; each folder should only have one vcf and one vcf.idx file
79 |     * workdir: the folder that has the vcf files
80 |     * snpEff: path to snpEff
81 |     * snpSift: path to snpSift
82 |     * email: email or phone number (number@txt.att.net)
83 |     * genome: genome name defined in snpEff
84 |     * genes: a list of gene symbols
85 |     * record_dict: reference sequences indexed with SeqIO.index (keyed by sequence id)
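    * gffFile: genome annotation of the reference in gff format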
* record_dict: 86 | """ 87 | gene_rna_lst = [f[:-11] for f in os.listdir(workdir) if f.endswith('protein.fa')] 88 | 89 | os.chdir(workdir) # set work directory 90 | vcfFiles = [f for f in os.listdir(workdir) if f.endswith('filter.vcf')] 91 | vcfFile = vcfFiles[0] 92 | proteinFiles = [];variantFiles = [] 93 | #============= 1. Annotate vcf results using snpEff ================ 94 | annotatedVCF = vcfFile[:-3] + 'eff.vcf' 95 | if not os.path.exists(workdir + '/' + annotatedVCF): 96 | annotatedVCF = snpEff_annotateVCF(vcfFile,snpEff,genome) # annotated: filename.eff.vcf 97 | #============= 2. Loop for every genes ================================ 98 | for gene in genes: 99 | print gene,'start to get input files for provean' 100 | if gene == '': 101 | try: 102 | filteredVCF = snpSift_filterVCF(annotatedVCF,snpSift, 103 | ['((ANN[*].IMPACT=\'HIGH\') | (ANN[*].IMPACT=\'MODERATE\'))']) 104 | except: 105 | print gene,'snpSift filter failed' 106 | Message('snpSift filter failed',email) 107 | else: 108 | gene_if = ('(ANN[*].GENE=\'{gene}\')').format(gene=gene) 109 | #============= (1). Filter the annotated file ======================== 110 | try: 111 | filteredVCF = snpSift_filterVCF(annotatedVCF,snpSift,[gene_if,'&' 112 | '((ANN[*].IMPACT=\'HIGH\') | (ANN[*].IMPACT=\'MODERATE\'))']) 113 | print 'filteredVCF is: ',filteredVCF 114 | except: 115 | print gene,'snpSift filter failed' 116 | Message('snpSift filter failed',email) 117 | #============= (2). Get input files for provean ====================== 118 | try: 119 | [protein_files,variant_files] = vcf2input4provean(filteredVCF,record_dict,gffFile,gene_rna_lst) 120 | except: 121 | print gene,'fail to get provean inputs' 122 | Message('fail to get provean inputs',email) 123 | raise 124 | if protein_files != '': 125 | proteinFiles.extend(protein_files) 126 | variantFiles.extend(variant_files) 127 | print gene,'prepare for provean input finish' 128 | else: 129 | print gene,'does not have interested variants' 130 | raise 131 | print workdir,'provean input succeed' 132 | 133 | Message(startMessage,email) 134 | genes = get_genes_from_file(gene_file) 135 | #================= 0. list directories ========================================= 136 | os.chdir(pathway) # set work directory 137 | folders = get_all_folders(pathway) 138 | folders = natsorted(folders) 139 | #============= 2. prepare input files for provean ====================================== 140 | batch_folders = chunk(folders,int(thread)) 141 | for batch in batch_folders: 142 | proc = [Process(target=prepare_fa_vari,args=(pathway+'/'+f,snpEff,snpSift,email,genome,genes,record_dict,gffFile,)) for f in batch] 143 | for p in proc: 144 | p.start() 145 | for p in proc: 146 | p.join() 147 | #============= 3. 
Run provean ======================================
148 | # support sets for provean: reusing them lets PROVEAN skip the time-consuming BLAST step
149 | for folder in folders:
150 |     support_set = [f for f in os.listdir(support_set_path) if f.endswith('.sss')]
151 |     workdir = pathway+'/'+folder
152 |     os.chdir(workdir)
153 |     proteinFiles = sorted([f for f in os.listdir(workdir) if f.endswith('protein.fa')])
154 |     variantFiles = sorted([f for f in os.listdir(workdir) if f.endswith('variant.txt')])
155 |     if not os.path.exists(provean_res_path): os.mkdir(provean_res_path)
156 |     provean_result = provean_res_path +'/'+folder+'_proveanScore.txt'
157 |     try:
158 |         capture_provean_scores(provean_result,provean,proteinFiles,variantFiles,support_set_path,support_set,thread)
159 |         print folder,'folder analysis succeeded'
160 |     except:
161 |         print 'capture provean scores failed'
162 |         Message('capture provean scores failed',email)
163 |         raise
164 |     #============= 4. move the sss support sets to the standard pathway ======================================
165 |     new_support_set = [f for f in os.listdir(pathway+'/'+folder) if f.endswith('.sss')]
166 |     for f in new_support_set:
167 |         if os.path.exists(f+'.fasta'):
168 |             os.rename(f,support_set_path+'/'+f)
169 |             os.rename(f+'.fasta',support_set_path+'/'+f+'.fasta')
170 | # cmd = ('rm */*.protein.fa'); subprocess.call(cmd,shell=True)
171 | # cmd = ('rm */*.variant.txt'); subprocess.call(cmd,shell=True)
172 | #for p in proteinFiles: os.remove(p)
173 | #for v in variantFiles: os.remove(v)
174 | #============= 5. Merge provean results ======================================
175 | outFile = pathway+'/provean_final_result.txt'
176 | try:
177 |     merge_provean_results(provean_res_path,outFile)
178 |     print 'merge succeeded'
179 | except:
180 |     print 'merge failed'
181 | Message(endMessage,email)
182 | 
--------------------------------------------------------------------------------
/Modules/f01_file_process.py:
--------------------------------------------------------------------------------
1 | import os
2 | from natsort import natsorted
3 | import sarge
4 | import re
5 | import pandas as pd
6 | 
7 | 
8 | 
9 | class dic2obj:
10 |     def __init__(self, **entries):
11 |         self.__dict__.update(entries)
12 | 
13 | def remove(files):
14 |     """
15 |     remove the files provided (and their .bai indexes when present)
16 | 
17 |     Arguments:
18 |     files: a list of files to be removed. [f1,f2,f3,...] or [[f1,f2],[f3],...] 
or with any depth of list layers 19 | """ 20 | if isinstance(files,str): 21 | os.remove(files) 22 | try: 23 | os.remove(files + '.bai') 24 | except: 25 | pass 26 | if isinstance(files,list): 27 | for f in files: 28 | remove(f) 29 | try: 30 | remove(f+'.bai') 31 | except: 32 | continue 33 | 34 | def check_file_exists(input_file, output_file): 35 | if not os.path.exists(output_file): 36 | return True, "Missing file %s" % output_file 37 | else: 38 | return False, "File %s exists" % output_file 39 | 40 | def list_fq_files(file_path): 41 | """ 42 | This function list all fastq files into a list 43 | """ 44 | fst_files = natsorted([f for f in os.listdir(file_path) if '_R1_' in f and (f.endswith(".fastq.gz") or f.endswith(".fq.gz"))]) 45 | snd_files = natsorted([f for f in os.listdir(file_path) if '_R2_' in f and (f.endswith(".fastq.gz") or f.endswith(".fq.gz"))]) 46 | if fst_files == []: 47 | fst_files = natsorted([f for f in os.listdir(file_path) if f.endswith("_1.fastq.gz") or f.endswith("_1.fq.gz")]) 48 | snd_files = natsorted([f for f in os.listdir(file_path) if f.endswith("_2.fastq.gz") or f.endswith("_2.fq.gz")]) 49 | fastqFiles = [] # this list is going to stroe the paired or single file for running aligner 50 | if snd_files == []: 51 | fst_files = natsorted([f for f in os.listdir(file_path) if f.endswith(".fastq.gz") or f.endswith(".fq.gz")]) 52 | fastqFiles = [[f] for f in fst_files] 53 | elif len(fst_files) == len(snd_files): 54 | fastqFiles = [[f1,f2] for f1,f2 in zip(fst_files,snd_files)] 55 | else: 56 | raise ValueError('input has single end and paired end mixed') 57 | return fastqFiles 58 | 59 | 60 | # allFiles = [f for f in os.listdir(file_path) if f.endswith(".fastq.gz") or f.endswith(".fq.gz")] 61 | # allFiles = natsorted(allFiles) 62 | # fastqFiles = [] # this list is going to stroe the paired or single file for running aligner 63 | # while len(allFiles) > 1: # this is to append the single end or pair end files into a list. 64 | # if allFiles[0].endswith(".fastq.gz"): 65 | # index = allFiles[0].index(".fastq.gz") 66 | # if allFiles[1][index-2:index] == '_2': 67 | # fastqFiles.append(allFiles[:2]) 68 | # del allFiles[:2] 69 | # else: 70 | # fastqFiles.append(allFiles[:1]) 71 | # del allFiles[:1] 72 | # 73 | # if len(allFiles) != 0: 74 | # if allFiles[0].endswith(".fq.gz"): 75 | # index = allFiles[0].index(".fq.gz") 76 | # if allFiles[1][index-2:index] == '_2': 77 | # fastqFiles.append(allFiles[:2]) 78 | # del allFiles[:2] 79 | # else: 80 | # fastqFiles.append(allFiles[:1]) 81 | # del allFiles[:1] 82 | # if len(allFiles) == 1: 83 | # fastqFiles.append(allFiles) 84 | # 85 | # return fastqFiles 86 | 87 | 88 | def replace_filename(inputfile,input_pattern,out_pattern): 89 | """ 90 | This function generates the outputfile name to address a problem that transform failed to do: 91 | make the outputfile the same length with inputfile when inputfile length varies 92 | * inputfile: list. [[f1.fq.gz]...] or [[f1.fq.gz,f2.fq.gz]] 93 | * input_pattern: a pattern in input file. 
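      (input_pattern is treated as a regular expression; e.g. '^' with out_pattern 'trim_' prepends 'trim_' to each file name)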
94 | * out_pattern: a pattern for output file 95 | """ 96 | outFile = [] 97 | regex = re.compile(input_pattern) 98 | for infile in inputfile: 99 | res = [] 100 | for fn in infile: 101 | output = regex.sub(out_pattern,fn) 102 | res.append(output) 103 | outFile.append(res) 104 | return inputfile,outFile 105 | # result = replace_filename([['f_1.fq.gz','f_2.fq.gz']],'^','') 106 | # print result 107 | 108 | def num_thread2use(jobs_per_batch,len_of_jobs,given_thread): 109 | """ 110 | This function calculates how many thread to use for each job given the jobs to run per batch and total number of jobs 111 | Some times total number of jobs is less than provided, in this case we can assign more thread to each job to run fast. 112 | """ 113 | jobs = min(jobs_per_batch,len_of_jobs) 114 | if jobs ==0: 115 | thread = 1 116 | else: 117 | thread = int(given_thread/jobs) 118 | if thread == 0: 119 | thread = 1 120 | return thread 121 | 122 | 123 | def Message(string,email): 124 | """ 125 | This function send message to email when it run. 126 | Used to calculate the time code runs. 127 | """ 128 | cmd = ('echo {quote}|mailx -s "{string}" {email}').format(quote="",string=string,email=email) 129 | sarge.run(cmd) 130 | 131 | def id_symbol_conversion(input_file,output_file,gene2refseq,tax_id,sym2ID='yes'): 132 | """This function convers count file based on gene symbol to gene id 133 | * inputfile: 2 columns. ['symbol','count'] 134 | * outputfile: 2 columns. ['geneid','count']""" 135 | # 1. build {symbol:id conversion} 136 | df = pd.read_csv(gene2refseq,sep='\t',header=None,usecols=[0,1,15],names=['tax','geneid','symbol'],comment='#',compression='gzip') 137 | df = df[df['tax'].values==int(tax_id)] 138 | sym_id_dict = df.set_index('symbol')['geneid'].to_dict() 139 | # 2. transfer symbol -> id 140 | symbol_df = pd.read_csv(input_file,sep='\t',header=None,names=['symbol','count']) 141 | symbol_df['geneid'] = symbol_df['symbol'].map(lambda x: sym_id_dict[x] if x in sym_id_dict else x) 142 | # 3. output 143 | symbol_df[['geneid','count']].to_csv(output_file,sep='\t',header=None,index=False) 144 | os.remove(input_file) 145 | 146 | 147 | 148 | def get_gene_name_id_dic(gff,source,sym2ID='yes'): 149 | ''' 150 | This function build {gene name: geneid} or {geneid:gene name} dictionary 151 | * source: ncbi or ensembl 152 | ''' 153 | df = pd.read_csv(gff,sep='\t',comment='#',header=None) 154 | df = df[df[2].values=='gene'] 155 | df = df.reset_index(drop=True) 156 | if source == 'ncbi': 157 | gene_pattern = 'gene=' 158 | id_pattern = 'GeneID\:' 159 | elif source == 'ensembl': 160 | gene_pattern = 'gene_name=' 161 | id_pattern = 'ID=' 162 | df['geneid'] = df[8].map(lambda x: re.search('(?<={p}).+?(?=[.,;$])'.format(p=id_pattern),x).group(0)) 163 | df['genename'] = df[8].map(lambda x: re.search('(?<={p}).+?(?=[,;$])'.format(p=gene_pattern),x).group(0)) 164 | # build dictionary 165 | if sym2ID=='yes': 166 | return df.set_index('genename')['geneid'].to_dict() 167 | else: 168 | return df.set_index('geneid')['genename'].to_dict() 169 | 170 | 171 | def gene_id_name_convert_merge(in_file,out_file,gene_id_name_dic): 172 | ''' 173 | * in_file: first row should be column name. 174 | ''' 175 | df = pd.read_csv(in_file,sep='\t',header=0) 176 | names = list(df.columns) 177 | df.columns = ['id_before'] + names[1:] 178 | df['id_after'] = df['id_before'].map(lambda x: gene_id_name_dic[x.split('.')[0]] if x.split('.')[0] in gene_id_name_dic else x.split('.')[0]) 179 | # 2. 
output
180 |     df[['id_after']+names[1:]].to_csv(out_file,sep='\t',header=None,index=False)
181 | 
182 | if __name__ == "__main__":
183 |     gff = '/data/genome/cho/chok1.gff'
184 |     dic = get_gene_name_id_dic(gff,'ncbi',sym2ID='yes')
185 |     in_file = '/path/to/file'
186 |     out_file = '/path/to/file'
187 |     gene_id_name_convert_merge(in_file,out_file,dic)
188 | 
189 | 
190 | def gene_id_name_convert(in_file,out_file,gene_id_name_dic):
191 |     # 1. transfer id
192 |     df = pd.read_csv(in_file,sep='\t',header=None,names=['id_before','count'])
193 |     df['id_after'] = df['id_before'].map(lambda x: gene_id_name_dic[x.split('.')[0]] if x.split('.')[0] in gene_id_name_dic else x.split('.')[0])
194 |     # 2. output
195 |     df[['id_after','count']].to_csv(out_file,sep='\t',header=None,index=False)
196 | 
197 | 
198 | 
199 | 
200 | 
201 | 
202 | 
203 | 
204 | 
--------------------------------------------------------------------------------
/Modules/GATK.py:
--------------------------------------------------------------------------------
1 | import sarge
2 | import sys,re
3 | 
4 | def RealignerTargetCreator(dedupbam,interval,gatk,ref_fa,thread,gold_indels=['']):
5 |     '''This function creates the interval file for realignment.
6 |     Input is a deduplicated, sorted bam file; the reference is a
7 |     fasta file.
8 |     '''
9 |     cmd = ('java -jar {gatk} -T RealignerTargetCreator '
10 |            '-R {ref_fa} -I {dedup} -o {output} -nt {thread} ').format(
11 |             gatk=gatk,ref_fa=ref_fa,dedup=dedupbam,output=interval,
12 |             thread=str(thread))
13 |     if gold_indels != ['']:
14 |         gold_indels = ['-known ' + f for f in gold_indels]
15 |         cmd = cmd + ' '.join(gold_indels)
16 |     print(cmd);sys.stdout.flush()
17 |     sarge.run(cmd)
18 | 
19 | 
20 | def IndelRealigner(dedupBam,realiBam,gatk,ref_fa,interval,gold_indels=['']):
21 |     '''This function realigns the deduped bam file around the target intervals;
22 |     the reference is a fasta file, target is the target interval file.
23 |     '''
24 |     cmd = ('java -jar {gatk} -T IndelRealigner -R {ref_fa} '
25 |            '-I {input} -targetIntervals {target} '
26 |            '-o {output} ').format(gatk=gatk,ref_fa=ref_fa,
27 |             input=dedupBam,target=interval,output=realiBam)
28 |     if gold_indels != ['']:
29 |         gold_indels = ['-known ' + f for f in gold_indels]
30 |         cmd = cmd + ' '.join(gold_indels)
31 |     print(cmd);sys.stdout.flush()
32 |     sarge.run(cmd)
33 | 
34 | 
35 | def HaplotypeCaller_DNA_gVCF(recalBam,vcf,gatk,ref_fa,thread,otherParameters=[]):
36 |     '''
37 |     this function calls variants and stores the result
38 |     in a gVCF file.
39 |     '''
40 |     cmd = ('java -jar {gatk} -T HaplotypeCaller -R {ref_fa} -I {input} '
41 |            '--emitRefConfidence GVCF -o {output} -nct {t}').format(
42 |             gatk=gatk,ref_fa=ref_fa,input=recalBam,output=vcf,t=str(thread))
43 |     print(cmd);sys.stdout.flush()
44 |     sarge.run(cmd)
45 | 
46 | 
47 | def JointGenotype(raw_vcfs,gvcf,gatk,ref_fa,thread):
48 |     ''' Joint-genotype all the per-sample gVCF files into one vcf file
49 |     '''
50 |     vcfs = ['--variant '+ f for f in raw_vcfs]
51 |     cmd = ('java -Xmx100g -jar {gatk} -T GenotypeGVCFs -R {ref_fa} {vcf} '
52 |            '-o {out} -nt {thread}').format(gatk=gatk,ref_fa=ref_fa,
53 |             vcf=' '.join(vcfs),out=gvcf,thread=str(thread))
54 |     print(cmd);sys.stdout.flush()
55 |     sarge.run(cmd)
56 | 
57 | 
58 | def SelectVariants(joint_variant,out_vcf,gatk,reference,extract_type,thread):
59 |     """this function extracts either SNPs or indels from the
60 |     vcf file. 
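    A minimal usage sketch (not from the original code; file names are hypothetical):
        SelectVariants('round1.g.vcf','round1.snp.vcf','GenomeAnalysisTK.jar','genome.fa','SNP',4)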
61 | """ 62 | cmd = ('java -jar {gatk} -T SelectVariants -R {ref_fa} -V {input} ' 63 | '-selectType {type} -o {output} -nt {thread}').format(gatk=gatk, 64 | ref_fa=reference,input=joint_variant,type=extract_type, 65 | output=out_vcf,thread=str(thread)) 66 | print(cmd);sys.stdout.flush() 67 | sarge.run(cmd) 68 | 69 | 70 | def snpHardFilter(snp_vcf,out_vcf,gatk,ref_fa): 71 | """ 72 | this function will filter the snps, output a gold standard snp database 73 | """ 74 | filtercmd = ('QD < 3.0 || FS > 50.0 || MQ < 50.0 || HaplotypeScore > 10.0 ' 75 | '|| MappingQualityRankSum < -12.5 || ReadPosRankSum < -8.0') 76 | filtercmd = """'{filter}'""".format(filter=filtercmd) 77 | filtername = """'snp_filter'""" 78 | cmd = ('java -jar {gatk} -T VariantFiltration -R {ref_fa} -V {input} ' 79 | '--filterExpression {filter} --filterName {filtername} ' 80 | '-o {output}').format(gatk=gatk,ref_fa=ref_fa,input=snp_vcf, 81 | filter = filtercmd,filtername=filtername,output=out_vcf) 82 | print(cmd);sys.stdout.flush() 83 | sarge.run(cmd) 84 | 85 | 86 | def indelHardFilter(indel_file,out_vcf,gatk,ref_fa): 87 | """ 88 | this function filter the indels,output a gold standard indel database 89 | """ 90 | filtercmd = ("QD < 2.0 || FS > 200.0 || ReadPosRankSum < -15.0") 91 | filtercmd = """'{filter}'""".format(filter=filtercmd) 92 | filtername = """'indel_filter'""" 93 | cmd = ('java -jar {gatk} -T VariantFiltration -R {ref_fa} -V {input} ' 94 | '--filterExpression {filter} --filterName {filtername} ' 95 | '-o {output}').format(gatk=gatk,ref_fa=ref_fa,input=indel_file, 96 | filter = filtercmd,filtername=filtername,output=out_vcf) 97 | print(cmd);sys.stdout.flush() 98 | sarge.run(cmd) 99 | 100 | 101 | def HardFilter(raw_gvcf,gold_snp_indel,gatk,ref_fa,thread): 102 | """ 103 | this function will apply artificial filter for snp and indel 104 | """ 105 | snp_filter = re.sub('g\.vcf$','snp.vcf',raw_gvcf) 106 | indel_filter = re.sub('g\.vcf$','indel.vcf',raw_gvcf) 107 | SelectVariants(raw_gvcf,snp_filter,gatk,ref_fa,'SNP',str(thread)) 108 | SelectVariants(raw_gvcf,indel_filter,gatk,ref_fa,'INDEL',str(thread)) 109 | snpHardFilter(snp_filter,gold_snp_indel[0],gatk,ref_fa) 110 | indelHardFilter(indel_filter,gold_snp_indel[1],gatk,ref_fa) 111 | 112 | 113 | def BaseRecalibrator_1(realiBam,table,gold_pair,gatk,ref_fa,thread): 114 | '''Step 1 of base recalibration. 
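    Builds a recalibration table from the known snp/indel sites in gold_pair
    ([gold_snp_vcf, gold_indel_vcf]). A minimal usage sketch (file names are hypothetical):
        BaseRecalibrator_1('s1.reali.bam','s1.table',['gold_snp.vcf','gold_indel.vcf'],'GenomeAnalysisTK.jar','genome.fa',4)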
115 | ''' 116 | cmd = ('java -jar {gatk} -T BaseRecalibrator -R {ref_fa} ' 117 | '-I {realignbam} -knownSites {snp} -knownSites {indel} ' 118 | '-o {output} -nct {thread}').format(gatk=gatk,ref_fa=ref_fa, 119 | realignbam=realiBam,snp=gold_pair[0],indel=gold_pair[1], 120 | output=table,thread=str(thread)) 121 | print(cmd);sys.stdout.flush() 122 | sarge.run(cmd) 123 | 124 | 125 | def BaseRecalibrator_2(realiBam,post_table,table,gold_pair,gatk,ref_fa,thread): 126 | '''Step 2 of base recalibration: get post table''' 127 | cmd = ('java -jar {gatk} -T BaseRecalibrator -R {ref_fa} ' 128 | '-I {realignbam} -knownSites {snp} -knownSites {indel} -BQSR {table} ' 129 | '-o {output} -nct {thread}').format(gatk=gatk,ref_fa=ref_fa, 130 | realignbam=realiBam,snp=gold_pair[0],indel=gold_pair[1],output=post_table,table=table,thread=str(thread)) 131 | print(cmd);sys.stdout.flush() 132 | sarge.run(cmd) 133 | 134 | 135 | def BaseRecalibrator_3(table,plot,post_table,gatk,ref_fa): 136 | '''Step 3 of base recalibration: compare table and post table 137 | ''' 138 | cmd = ('java -jar {gatk} -T AnalyzeCovariates -R {ref_fa} ' 139 | '-before {table} -after {post_table} -plots {output}.pdf').format( 140 | gatk=gatk,ref_fa=ref_fa,table=table,post_table=post_table,output=plot) 141 | print(cmd);sys.stdout.flush() 142 | sarge.run(cmd) 143 | 144 | 145 | def BaseRecalibrator_4(realiBam,recalBam,gatk,ref_fa,gold_pair,table,thread): 146 | ''' Step 4 of base recalibration: recalibrate the base quality. 147 | ''' 148 | cmd = ('java -jar {gatk} -T PrintReads -R {ref_fa} -I {input} -BQSR {table} ' 149 | '-o {output} -nct {thread}').format(gatk=gatk, 150 | ref_fa=ref_fa,input=realiBam,table=table,output=recalBam, 151 | thread=str(thread)) 152 | print(cmd);sys.stdout.flush() 153 | sarge.run(cmd) 154 | 155 | 156 | def CombineSNPandINDEL(vcfFiles,outvcf,gatk,ref_fa,otherParams=[]): 157 | """ 158 | This function combines the vcf files. 
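    (a thin wrapper around GATK's CombineVariants walker; merge behaviour can be
    tuned through otherParams, e.g. '--genotypemergeoption UNSORTED')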
159 | * gatk: gatk software pathway 160 | * ref_fa: reference genome fasta file 161 | * variantFiles: a list of vcf files that need to be combined 162 | * argus: additional argument 163 | """ 164 | variCmd = ' '.join(['-V '+vcf for vcf in vcfFiles]) 165 | other = ' '.join(otherParams) 166 | cmd = ('java -jar {gatk} -R {ref_fa} -T CombineVariants ' 167 | '{varis} -o {outputVcf} {other}').format(gatk=gatk,ref_fa=ref_fa, 168 | varis=variCmd,outputVcf=outvcf,other=other) 169 | print(cmd);sys.stdout.flush() 170 | sarge.run(cmd) 171 | 172 | 173 | #=============================================================================== 174 | # RNA part functions 175 | #=============================================================================== 176 | def splitN(dedupBam,splitBam,gatk,ref_fa): 177 | '''This function splits reads due to wrong splicng by STAR''' 178 | cmd = ('java -jar {gatk} -T SplitNCigarReads -R {ref_fa} ' 179 | '-I {input} -o {output} -rf ReassignOneMappingQuality ' 180 | '-RMQF 255 -RMQT 60 -U ALLOW_N_CIGAR_READS').format( 181 | gatk=gatk,ref_fa=ref_fa,input=dedupBam,output=splitBam) 182 | print(cmd);sys.stdout.flush() 183 | sarge.run(cmd) 184 | 185 | 186 | def HaplotypeCaller_RNA_VCF(recalBam,vcf,gatk,ref_fa,thread='1'): 187 | """ 188 | This function calls variants in RNAseq 189 | """ 190 | cmd = ('java -jar {gatk} -T HaplotypeCaller -R {ref_fa} ' 191 | '-I {input} -dontUseSoftClippedBases ' 192 | '-stand_call_conf 20.0 -stand_emit_conf 20.0 -o {output} -nct {thread}').format( 193 | gatk=gatk,ref_fa=ref_fa,input=recalBam,output=vcf,thread=str(thread)) 194 | print(cmd);sys.stdout.flush() 195 | sarge.run(cmd) 196 | 197 | 198 | def RNA_Vari_Filter(vcf,filterVCF,gatk,ref_fa): 199 | """ 200 | This function filter out the results of the vari call 201 | """ 202 | FS = """'FS > 30.0'""" 203 | QD = """'QD < 2.0'""" 204 | cmd = ('java -jar {gatk} -T VariantFiltration -R {ref_fa} ' 205 | '-V {input} -window 35 -cluster 3 -filterName FS ' 206 | '-filter {FS} -filterName QD -filter {QD} ' 207 | '-o {output}').format(gatk=gatk,ref_fa=ref_fa, 208 | input=vcf,FS=FS,QD=QD,output=filterVCF) 209 | print(cmd);sys.stdout.flush() 210 | sarge.run(cmd) 211 | 212 | 213 | def RNA_BaseRecalibrator_1(realiBam,table,gatk,ref_fa,gold_vcf,thread='1'): 214 | '''step 1 of base recalibration,generate a table''' 215 | cmd = ('java -jar {gatk} -T BaseRecalibrator -R {ref_fa} ' 216 | '-I {realignbam} -knownSites {gold} ' 217 | '-o {output} -nct {thread}').format(gatk=gatk,ref_fa=ref_fa, 218 | realignbam=realiBam,gold=gold_vcf,output=table,thread=str(thread)) 219 | print(cmd);sys.stdout.flush() 220 | sarge.run(cmd) 221 | 222 | 223 | def RNA_BaseRecalibrator_2(realiBam,post_table,table,gatk,ref_fa,gold_vcf,thread='1'): 224 | '''Step 2 of base recalibration,generate post table''' 225 | cmd = ('java -jar {gatk} -T BaseRecalibrator -R {ref_fa} ' 226 | '-I {realignbam} -knownSites {gold} -BQSR {table} ' 227 | '-o {output} -nct {thread}').format(gatk=gatk, 228 | ref_fa=ref_fa,realignbam=realiBam,gold=gold_vcf, 229 | output=post_table,table=table,thread=str(thread)) 230 | print(cmd);sys.stdout.flush() 231 | sarge.run(cmd) 232 | 233 | 234 | def RNA_BaseRecalibrator3(table,plot,post_table,gatk,ref_fa): 235 | '''Step 3 of base recalibration, compare the two tables''' 236 | cmd = ('java -jar {gatk} -T AnalyzeCovariates -R {ref_fa} ' 237 | '-before {table} -after {post_table} -plots {output}').format( 238 | gatk=gatk,ref_fa=ref_fa,table=table,post_table=post_table,output=plot) 239 | print(cmd);sys.stdout.flush() 240 | 
sarge.run(cmd) 241 | 242 | 243 | def RNA_BaseRecalibrator4(realiBam,recalBam,gatk,table,ref_fa,gold_vcf,thread='1'): 244 | '''Step 4 of base recalibration''' 245 | cmd = ('java -jar {gatk} -T PrintReads -R {ref_fa} ' 246 | '-I {input} -BQSR {table} -o {output} -nct {thread}').format(gatk=gatk, 247 | ref_fa=ref_fa,input=realiBam,table=table,output=recalBam,thread=str(thread)) 248 | print(cmd);sys.stdout.flush() 249 | sarge.run(cmd) 250 | 251 | 252 | -------------------------------------------------------------------------------- /GATK_RNA_CHO.py: -------------------------------------------------------------------------------- 1 | from ruffus import * 2 | from Modules.f01_file_process import * 3 | from Modules.Aligner import STAR_Db,STAR 4 | from Modules.Trimmomatic import Trimmomatic 5 | from Modules.Samtools import build_fa_index,merge_bams 6 | from Modules.Picard import build_fa_dict,mark_duplicates,add_readgroup 7 | from Modules.GATK import * 8 | import yaml 9 | import shutil 10 | import glob 11 | 12 | 13 | #============ parameters ====================== 14 | parameter_file = sys.argv[1] 15 | #parameter_file = '/data/shangzhong/DE/ercc/GATK_RNA_CHO.yaml' 16 | with open(parameter_file,'r') as f: 17 | doc = yaml.load(f) 18 | p = dic2obj(**doc) 19 | #------------- get parameters ----------- 20 | file_path = p.RawDataPath 21 | thread = p.thread 22 | # all parameter 23 | ref_fa = p.ref_fa 24 | # trimmomatic parameter 25 | trim = p.trim_reads 26 | trimmomatic = p.trimmomatic_path 27 | trim_batch = p.trim_jobs_per_batch 28 | adapter = p.adapter 29 | 30 | picard = p.picard 31 | gatk = p.gatk 32 | 33 | star_batch = p.star_jobs_per_batch 34 | star_db = p.star_index 35 | 36 | sp = p.sample_name 37 | read_groups = p.read_groups 38 | 39 | contact = p.contact 40 | #=============================================================================== 41 | # Pipeline part 42 | #=============================================================================== 43 | Message('GATK_RNA_CHO start',contact) 44 | os.chdir(file_path) 45 | #=============================================================================== 46 | # Part I. Preprocess 47 | #=============================================================================== 48 | #--------------------- 1. build index for fa file using samtools and GATK ------------------ 49 | dict_file = '.'.join(ref_fa.split('.')[:-1]) + '.dict' 50 | fai_file = ref_fa + '.fai' 51 | if not os.path.exists(dict_file): build_fa_dict(ref_fa,picard) 52 | if not os.path.exists(fai_file): build_fa_index(ref_fa) 53 | #--------------------- 2. read all files ------------------------------------------------ 54 | fastqFiles = list_fq_files(file_path) 55 | if fastqFiles[0][0].startswith('trim_'): 56 | trim = False 57 | def trim_parameters(): 58 | infiles,outfiles = replace_filename(fastqFiles,'^','trim_') 59 | for infile, output in zip(infiles,outfiles): 60 | yield infile,output 61 | #--------------------- 3. trim reads----------------------------------------------------- 62 | @active_if(trim) 63 | @jobs_limit(trim_batch) 64 | @files(trim_parameters) 65 | def trim_reads(input_file,output_file): 66 | n = num_thread2use(trim_batch,len(fastqFiles),thread) 67 | Trimmomatic(input_file,output_file,trimmomatic,n,adapter) 68 | remove(input_file) 69 | #--------------------- 4. 
Map with STAR ----------------------------------------------------- 70 | # build index 71 | @active_if(not os.path.exists(star_db)) 72 | @follows(trim_reads) 73 | def star_index(): 74 | STAR_Db(star_db,ref_fa,thread) 75 | # align 76 | if trim == True: 77 | @jobs_limit(star_batch) 78 | @follows(trim_reads,star_index) 79 | @mkdir(fastqFiles,formatter(),'{path[0]}/f01_bam') 80 | @check_if_uptodate(check_file_exists) 81 | @transform(trim_reads,formatter('.*\.f.*?\.gz'),'f01_bam/{basename[0]}.sort.bam') 82 | def run_star(input_file,output_file): 83 | n = num_thread2use(star_batch,len(fastqFiles),thread) 84 | STAR(input_file,output_file,star_db,n,'',['--outSAMtype BAM','SortedByCoordinate','--twopassMode Basic']) 85 | else: 86 | @jobs_limit(star_batch) 87 | @follows(star_index) 88 | @mkdir(fastqFiles,formatter(),'{path[0]}/f01_bam') 89 | @check_if_uptodate(check_file_exists) 90 | @transform(fastqFiles,formatter('.*\.f.*?\.gz'),'f01_bam/{basename[0]}.sort.bam') 91 | def run_star(input_file,output_file): 92 | n = num_thread2use(star_batch,len(fastqFiles),thread) 93 | STAR(input_file,output_file,star_db,n,'',['--outSAMtype BAM','SortedByCoordinate','--twopassMode Basic']) 94 | #--------------------- 5. add read group -------------------------------------------------- 95 | def get_bam_and_rg(): 96 | bams = [f for f in os.listdir('f01_bam') if f.endswith('.sort.bam')] 97 | bams = natsorted(bams) 98 | for bam, rg in zip(bams,read_groups): 99 | output = re.sub('\.sort\.bam','.adrg.bam',bam) 100 | yield ['f01_bam/'+bam,rg],'f02_addGroup/' + output 101 | @jobs_limit(trim_batch*2) 102 | @follows(run_star,mkdir('tmp'),mkdir('f02_addGroup')) 103 | @files(get_bam_and_rg) 104 | @check_if_uptodate(check_file_exists) 105 | def run_add_group(input_file,output_file): 106 | add_readgroup(input_file[0],output_file,input_file[1],picard) 107 | @follows(run_add_group) 108 | def remove_bam(): 109 | if os.path.exists('f01_bam'):shutil.rmtree('f01_bam') 110 | #--------------------- 6. Markduplicates using picard ------------------------------------- 111 | @jobs_limit(trim_batch) 112 | @follows(run_add_group,remove_bam) 113 | @mkdir(fastqFiles,formatter(),'{path[0]}/f03_dedupBam') 114 | @transform(run_add_group,formatter('.*\.adrg\.bam'),'f03_dedupBam/{basename[0]}.dedup.bam') 115 | @check_if_uptodate(check_file_exists) 116 | def run_mark_duplicates(input_file,output_file): 117 | mark_duplicates(input_file,output_file,picard) 118 | @follows(run_mark_duplicates) 119 | def remove_groupBam(): 120 | if os.path.exists('f02_addGroup'): shutil.rmtree('f02_addGroup') 121 | #--------------------- 7. Split N --------------------------------------------------------- 122 | @follows(run_mark_duplicates,remove_groupBam) 123 | @mkdir(fastqFiles,formatter(),'{path[0]}/f04_splitBam') 124 | @check_if_uptodate(check_file_exists) 125 | @transform(run_mark_duplicates,formatter('.*\.dedup\.bam'),'f04_splitBam/{basename[0]}.split.bam') 126 | def run_splitN(input_file,output_file): 127 | splitN(input_file,output_file,gatk,ref_fa) 128 | @follows(run_splitN) 129 | def remove_dedupBam(): 130 | if os.path.exists('f03_dedupBam'): shutil.rmtree('f03_dedupBam') 131 | #--------------------- 8. 
Indel realignment --------------------- 132 | @jobs_limit(thread) 133 | @follows(run_splitN,remove_dedupBam) 134 | @mkdir(fastqFiles,formatter(),'{path[0]}/f05_indelReali') 135 | @transform(run_splitN,formatter('.*\.split\.bam'),'f05_indelReali/{basename[0]}.reali.bam') 136 | @check_if_uptodate(check_file_exists) 137 | def run_realign(input_file,output_file): 138 | interval = re.sub('reali\.bam$','interval.list',output_file) 139 | RealignerTargetCreator(input_file,interval,gatk,ref_fa,1,gold_indels=['']) 140 | IndelRealigner(input_file,output_file,gatk,ref_fa,interval,gold_indels=['']) 141 | @follows(run_realign) 142 | def remove_splitBam(): 143 | if os.path.exists('f04_splitBam'): shutil.rmtree('f04_splitBam') 144 | #--------------------- 9. Round 1 call ---------------------------------------------- 145 | @jobs_limit(thread) 146 | @follows(run_realign,remove_splitBam) 147 | @mkdir(fastqFiles,formatter(),'{path[0]}/f06_Round1Call') 148 | @transform(run_realign,formatter('.*\.reali\.bam'),'f06_Round1Call/{basename[0]}.vcf') 149 | @check_if_uptodate(check_file_exists) 150 | def round1Vari_call(input_file,output_file): 151 | n = num_thread2use(thread,len(fastqFiles),thread) 152 | HaplotypeCaller_RNA_VCF(input_file,output_file,gatk,ref_fa,n) 153 | #--------------------- 10. filter gold snp and indel --------------------------------- 154 | @follows(round1Vari_call) 155 | @transform(round1Vari_call,suffix('.vcf'),'.gold.vcf') 156 | @check_if_uptodate(check_file_exists) 157 | def run_RNA_Vari_Filter(input_file,output_file): 158 | RNA_Vari_Filter(input_file,output_file,gatk,ref_fa) 159 | #--------------------- 11. Base recalibration ----------------------------------------- 160 | # step 1 161 | @follows(run_RNA_Vari_Filter) 162 | @mkdir(fastqFiles,formatter(),'{path[0]}/f07_BaseRecal') 163 | @transform(run_realign,formatter('.*\.reali\.bam'),'f07_BaseRecal/{basename[0]}.table') 164 | @check_if_uptodate(check_file_exists) 165 | def run_RNA_Baserecalibration_1(input_file,output_file): 166 | gold_vcf = 'f06_Round1Call/'+re.sub('\.bam$','.gold.vcf',input_file).split('/')[-1] 167 | RNA_BaseRecalibrator_1(input_file,output_file,gatk,ref_fa,gold_vcf,thread='1') 168 | # step 2 169 | @follows(run_RNA_Baserecalibration_1) 170 | @transform(run_realign,formatter('.*\.reali\.bam'),'f07_BaseRecal/{basename[0]}.post_table') 171 | @check_if_uptodate(check_file_exists) 172 | def run_RNA_Baserecalibration_2(input_file,output_file): 173 | table = re.sub('post_table$','table',output_file) 174 | gold_vcf = 'f06_Round1Call/'+re.sub('\.bam$','.gold.vcf',input_file).split('/')[-1] 175 | RNA_BaseRecalibrator_2(input_file,output_file,table,gatk,ref_fa,gold_vcf,thread='1') 176 | # step 3 177 | @follows(run_RNA_Baserecalibration_2) 178 | @transform(run_RNA_Baserecalibration_1,formatter('.+\.table'),'f07_BaseRecal/{basename[0]}.plot.pdf') 179 | @check_if_uptodate(check_file_exists) 180 | def run_RNA_Baserecalibration_3(input_file,output_file): 181 | post_table = re.sub('\.table$','.post_table',input_file) 182 | RNA_BaseRecalibrator3(input_file,output_file,post_table,gatk,ref_fa) 183 | # step 4 184 | @jobs_limit(trim_batch) 185 | @follows(run_RNA_Baserecalibration_3) 186 | @transform(run_realign,formatter('.*\.reali\.bam'),'f07_BaseRecal/{basename[0]}.recal.bam') 187 | @check_if_uptodate(check_file_exists) 188 | def run_RNA_Baserecalibration_4(input_file,output_file): 189 | table = re.sub('\.recal\.bam','.table',output_file) 190 | gold_vcf = 'f06_Round1Call/'+re.sub('\.bam$','.gold.vcf',input_file).split('/')[-1] 191 | n = 
num_thread2use(trim_batch,len(fastqFiles),thread) 192 | RNA_BaseRecalibrator4(input_file,output_file,gatk,table,ref_fa,gold_vcf,n) 193 | @follows(run_RNA_Baserecalibration_4) 194 | def remove_realiBam(): 195 | if os.path.exists('f05_indelReali'): shutil.rmtree('f05_indelReali') 196 | #--------------------- 12. merge lanes for the same sample ----------------------------------------- 197 | def get_group_bam(): 198 | readic = {} 199 | bamfiles = natsorted(glob.glob('f07_BaseRecal/*.recal.bam')) 200 | for rg,bam in zip(read_groups,bamfiles): 201 | start = rg.index('SM:') 202 | sample = rg[start+3:] 203 | if sample in readic: 204 | readic[sample].append(bam) 205 | else: 206 | readic[sample] = [bam] 207 | for sp in readic: 208 | output_file = 'f08_mergeBam/' + sp + '.merge.bam' 209 | input_file = readic[sp] 210 | yield input_file,output_file 211 | @follows(run_RNA_Baserecalibration_4,remove_realiBam) 212 | @mkdir(fastqFiles,formatter(),'{path[0]}/f08_mergeBam') 213 | @files(get_group_bam) 214 | @check_if_uptodate(check_file_exists) 215 | def mergeBam(input_files,output_file): 216 | merge_bams(input_files,output_file) 217 | #--------------------- 13. Mark duplicates for merged file --------------------------------------- 218 | @jobs_limit(trim_batch) 219 | @follows(mergeBam) 220 | @mkdir(fastqFiles,formatter(),'{path[0]}/f09_dedupBam2') 221 | @transform(mergeBam,formatter('.*\.merge\.bam'),'f09_dedupBam2/{basename[0]}.dedup.bam') 222 | @check_if_uptodate(check_file_exists) 223 | def markduplicates2(input_file,output_file): 224 | mark_duplicates(input_file,output_file,picard) 225 | @follows(markduplicates2) 226 | def remove_mergeBam(): 227 | if os.path.exists('f08_mergeBam'): shutil.rmtree('f08_mergeBam') 228 | #--------------------- 14. Indel realignment ----------------------------------------------------- 229 | @jobs_limit(thread) 230 | @follows(markduplicates2,remove_mergeBam) 231 | @mkdir(fastqFiles,formatter(),'{path[0]}/f10_indelReali2') 232 | @transform(markduplicates2,formatter('.*\.dedup\.bam'),'f10_indelReali2/{basename[0]}.reali.bam') 233 | @check_if_uptodate(check_file_exists) 234 | def run_realign2(input_file,output_file): 235 | interval = re.sub('reali\.bam$','interval.list',output_file) 236 | RealignerTargetCreator(input_file,interval,gatk,ref_fa,1,gold_indels=['']) 237 | IndelRealigner(input_file,output_file,gatk,ref_fa,interval,gold_indels=['']) 238 | @follows(run_realign2) 239 | def remove_dedupBam2(): 240 | if os.path.exists('f09_dedupBam2'): shutil.rmtree('f09_dedupBam2') 241 | #--------------------- 15. Round 2 call ---------------------------------------------- 242 | @follows(run_realign2,remove_dedupBam2) 243 | @mkdir(fastqFiles,formatter(),'{path[0]}/f11_Round2Call') 244 | @transform(run_realign2,formatter('.*\.reali\.bam'),'f11_Round2Call/{basename[0]}.vcf') 245 | @check_if_uptodate(check_file_exists) 246 | def run_round2Vari_call(input_file,output_file): 247 | n = num_thread2use(thread,len(fastqFiles),thread) 248 | HaplotypeCaller_RNA_VCF(input_file,output_file,gatk,ref_fa,n) 249 | #--------------------- 16. 
filter final vcf --------------------------------- 250 | @jobs_limit(thread) 251 | @follows(run_round2Vari_call) 252 | @mkdir(fastqFiles,formatter(),'{path[0]}/f12_FinalVcf') 253 | @transform(run_round2Vari_call,formatter('.*\.vcf'), 'f12_FinalVcf/{basename[0]}.merged.filter.vcf') 254 | @check_if_uptodate(check_file_exists) 255 | def run_filter_2(input_file,output_file): 256 | output_file = output_file.split('.')[0] + '.merged.filter.vcf' 257 | RNA_Vari_Filter(input_file,output_file,gatk,ref_fa) 258 | 259 | @follows(run_filter_2) 260 | def last_function(): 261 | Message('GATK_RNA_CHO succeed',contact) 262 | 263 | if __name__ == '__main__': 264 | try: 265 | # pipeline_printout(sys.stdout, [last_function], verbose=3) 266 | pipeline_run([last_function],multiprocess=thread,gnu_make_maximal_rebuild_mode = True, 267 | touch_files_only=False) 268 | except: 269 | Message('GATK_RNA_CHO failed',contact) 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | -------------------------------------------------------------------------------- /GATK_DNA_CHO.py: -------------------------------------------------------------------------------- 1 | from ruffus import * 2 | from Modules.f01_file_process import * 3 | from Modules.Aligner import bwa_Db,bwa_mem 4 | from Modules.Trimmomatic import Trimmomatic 5 | from Modules.Samtools import sortBam,build_fa_index,merge_bams 6 | from Modules.Picard import build_fa_dict,mark_duplicates 7 | from Modules.GATK import * 8 | import yaml 9 | import sys,shutil 10 | import glob 11 | from natsort import natsorted 12 | 13 | 14 | #============ parameters ====================== 15 | parameter_file = sys.argv[1] 16 | #parameter_file = '/data/shangzhong/DNArepair/fq/GATK_DNA_CHO.yaml' 17 | with open(parameter_file,'r') as f: 18 | doc = yaml.load(f) 19 | p = dic2obj(**doc) 20 | #------------- get parameters ----------- 21 | file_path = p.RawDataPath 22 | thread = p.thread 23 | # all parameter 24 | ref_fa = p.ref_fa 25 | # trimmomatic parameter 26 | trim = p.trim_reads 27 | trimmomatic = p.trimmomatic_path 28 | trim_batch = p.trim_jobs_per_batch 29 | adapter = p.adapter 30 | 31 | QC = p.QC 32 | picard = p.picard 33 | gatk = p.gatk 34 | 35 | bwa_batch = p.bwa_jobs_per_batch 36 | bwa_db = p.bwa_db 37 | 38 | sp = p.sample_name 39 | read_groups = p.read_groups 40 | 41 | contact = p.contact 42 | #=============================================================================== 43 | # Pipeline part 44 | #=============================================================================== 45 | Message('GATK_DNA_start',contact) 46 | os.chdir(file_path) 47 | #=============================================================================== 48 | # Part I. Preprocess 49 | #=============================================================================== 50 | #--------------------- 1. build index for fa file using samtools and GATK ------------------ 51 | dict_file = '.'.join(ref_fa.split('.')[:-1]) + '.dict' 52 | fai_file = ref_fa + '.fai' 53 | if not os.path.exists(dict_file): build_fa_dict(ref_fa,picard) 54 | if not os.path.exists(fai_file): build_fa_index(ref_fa) 55 | #--------------------- 2. 
read all files ------------------------------------------------ 56 | fastqFiles = list_fq_files(file_path) 57 | if fastqFiles[0][0].startswith('trim_'): 58 | trim = False 59 | def trim_parameters(): 60 | infiles,outfiles = replace_filename(fastqFiles,'^','trim_') 61 | for infile, output in zip(infiles,outfiles): 62 | yield infile,output 63 | #--------------------- run fastqc before trimming ----------- 64 | @active_if(QC) 65 | @jobs_limit(thread) 66 | @mkdir(fastqFiles,formatter(),'{path[0]}/fastqc') 67 | @files(trim_parameters) 68 | def run_QC1(input_file,output_file): 69 | for fq in input_file: 70 | sarge.run('fastqc {input} -o fastqc'.format(input=fq)) 71 | #---------------------3. trim file ------------------ 72 | @active_if(trim) 73 | @follows(run_QC1) 74 | @jobs_limit(trim_batch) 75 | @files(trim_parameters) 76 | def trim_reads(input_file,output_file): 77 | n = num_thread2use(trim_batch,len(fastqFiles),thread) 78 | Trimmomatic(input_file,output_file,trimmomatic,n,adapter) 79 | remove(input_file) 80 | #--------------------- 4. Map with bwa ----------------------------------------------------- 81 | def get_fq_and_readgroup(): 82 | fqFiles = list_fq_files(file_path) 83 | for fq, rg in zip(fqFiles,read_groups): 84 | out = 'f01_bam/' + re.sub('\.f.*q\.gz','.bam',fq[0]) 85 | yield fq,out,rg 86 | # build index 87 | @active_if(not os.path.exists(bwa_db)) 88 | @follows(trim_reads,run_QC1) 89 | def bwa_index(): 90 | bwa_Db(bwa_db,ref_fa) 91 | # align 92 | @jobs_limit(bwa_batch) 93 | @follows(bwa_index,trim_reads,run_QC1) 94 | @mkdir(fastqFiles,formatter(),'{path[0]}/f01_bam') 95 | @files(get_fq_and_readgroup) 96 | def run_bwa(input_file,output_file,rg): 97 | n = num_thread2use(bwa_batch,len(fastqFiles),thread) 98 | bwa_mem(input_file,output_file,bwa_db+'/bwa',n,otherParameters=['-R '+rg+'\\\\tPL:illumina\\\\tLB:lib20000\\\\tPU:unit1']) 99 | #--------------------- 5. Sort bam file -------------------------------------------------- 100 | @jobs_limit(trim_batch) 101 | @follows(run_bwa) 102 | @mkdir(fastqFiles,formatter(),'{path[0]}/f02_sortBam') 103 | @transform(run_bwa,formatter('.*\.bam'),'f02_sortBam/{basename[0]}.sort.bam') 104 | @check_if_uptodate(check_file_exists) 105 | def sort_by_pos(input_file,output_file): 106 | n = num_thread2use(trim_batch,len(fastqFiles),thread) 107 | sortBam(input_file,output_file,n,sortType='pos') 108 | @follows(sort_by_pos) 109 | def remove_bam(): 110 | if os.path.exists('f01_bam'): shutil.rmtree('f01_bam') # remove bam folder 111 | #--------------------- 6. Markduplicates using picard ------------------------------------- 112 | @jobs_limit(trim_batch) 113 | @follows(remove_bam) 114 | @mkdir(fastqFiles,formatter(),'{path[0]}/f03_dedupBam') 115 | @transform(sort_by_pos,formatter('.*\.sort\.bam'),'f03_dedupBam/{basename[0]}.dedup.bam') 116 | @check_if_uptodate(check_file_exists) 117 | def markduplicates(input_file,output_file): 118 | mark_duplicates(input_file,output_file,picard) 119 | @follows(markduplicates) 120 | def remove_sortBam(): 121 | if os.path.exists('f02_sortBam'): shutil.rmtree('f02_sortBam') 122 | #--------------------- 7. 
Indel realignment --------------------- 123 | @jobs_limit(thread) 124 | @follows(remove_sortBam) 125 | @mkdir(fastqFiles,formatter(),'{path[0]}/f04_indelReali') 126 | @transform(markduplicates,formatter('.*\.dedup\.bam'),'f04_indelReali/{basename[0]}.reali.bam') 127 | @check_if_uptodate(check_file_exists) 128 | def Realign(input_file,output_file): 129 | interval = re.sub('reali\.bam$','interval.list',output_file) 130 | RealignerTargetCreator(input_file,interval,gatk,ref_fa,1,gold_indels=['']) 131 | IndelRealigner(input_file,output_file,gatk,ref_fa,interval,gold_indels=['']) 132 | @follows(Realign) 133 | def remove_dedupBam(): 134 | if os.path.exists('f03_dedupBam'): shutil.rmtree('f03_dedupBam') 135 | #--------------------- 8. Round 1 call ---------------------------------------------- 136 | @jobs_limit(thread) 137 | @follows(remove_dedupBam) 138 | @mkdir(fastqFiles,formatter(),'{path[0]}/f05_Round1Call') 139 | @transform(Realign,formatter('.*\.reali\.bam'),'f05_Round1Call/{basename[0]}.raw.g.vcf') 140 | @check_if_uptodate(check_file_exists) 141 | def round1Vari_call(input_file,output_file): 142 | n = num_thread2use(thread,len(fastqFiles),thread) 143 | HaplotypeCaller_DNA_gVCF(input_file,output_file,gatk,ref_fa,n,otherParameters=[]) 144 | #--------------------- 9. Merge raw vcf ---------------------------------------------- 145 | @follows(round1Vari_call) 146 | @merge(round1Vari_call,'f05_Round1Call/round1.g.vcf') 147 | @check_if_uptodate(check_file_exists) 148 | def merge_vcf(input_files,output_file): 149 | JointGenotype(input_files,output_file,gatk,ref_fa,thread) 150 | #--------------------- 10. filter gold snp and indel --------------------------------- 151 | @follows(merge_vcf) 152 | @transform(merge_vcf,suffix('.g.vcf'),['.gold_snp.vcf','.gold_indel.vcf']) 153 | # @check_if_uptodate(check_file_exists) 154 | def hard_filter(input_file,output_pair): 155 | HardFilter(input_file,output_pair,gatk,ref_fa,thread) 156 | @follows(hard_filter) 157 | def remove_vcf(): 158 | if os.path.exists('f05_Round1Call'): 159 | for f in glob.glob('f05_Round1Call/*'): 160 | if 'gold' not in f: 161 | os.remove(f) 162 | #--------------------- 11. 
Base recalibration ----------------------------------------- 163 | # step 1 164 | @follows(remove_vcf) 165 | @mkdir(fastqFiles,formatter(),'{path[0]}/f06_BaseRecal') 166 | @transform(Realign,formatter('.*\.reali\.bam'),add_inputs(hard_filter),'f06_BaseRecal/{basename[0]}.table') 167 | @check_if_uptodate(check_file_exists) 168 | def Baserecalibration_1(input_file,output_file): 169 | n = num_thread2use(thread,len(fastqFiles),thread) 170 | BaseRecalibrator_1(input_file[0],output_file,input_file[1],gatk,ref_fa,thread=str(n)) 171 | # step 2 172 | @follows(Baserecalibration_1) 173 | @transform(Realign,formatter('.*\.reali\.bam'),add_inputs(hard_filter),'f06_BaseRecal/{basename[0]}.post_table') 174 | @check_if_uptodate(check_file_exists) 175 | def Baserecalibration_2(input_file,output_file): 176 | bam = input_file[0].split('/')[-1] 177 | table = 'f06_BaseRecal/' + re.sub('\.reali\.bam$','.reali.table',bam) 178 | n = num_thread2use(thread,len(fastqFiles),thread) 179 | BaseRecalibrator_2(input_file[0],output_file,table,input_file[1],gatk,ref_fa,thread=str(n)) 180 | # step 3 181 | @follows(Baserecalibration_2) 182 | @transform(Baserecalibration_1,formatter('.+\.table'),'f06_BaseRecal/{basename[0]}.plot') 183 | @check_if_uptodate(check_file_exists) 184 | def Baserecalibration_3(input_file,output_file): 185 | post_table = re.sub('\.table$','.post_table',input_file) 186 | BaseRecalibrator_3(input_file,output_file,post_table,gatk,ref_fa) 187 | # step 4 188 | @jobs_limit(bwa_batch) 189 | @follows(Baserecalibration_3) 190 | @transform(Realign,formatter('.*\.reali\.bam'),add_inputs(hard_filter),'f06_BaseRecal/{basename[0]}.recal.bam') 191 | @check_if_uptodate(check_file_exists) 192 | def Baserecalibration_4(input_file,output_file): 193 | table = 'f06_BaseRecal/'+re.sub('\.bam','.table',input_file[0].split('/')[1]) 194 | n = num_thread2use(bwa_batch,len(fastqFiles),thread) 195 | BaseRecalibrator_4(input_file[0],output_file,gatk,ref_fa,input_file[1],table,n) 196 | @follows(Baserecalibration_4) 197 | def remove_realiBam(): 198 | if os.path.exists('f04_indelReali'): shutil.rmtree('f04_indelReali') 199 | #--------------------- 12. merge lanes for the same sample ----------------------------------------- 200 | def get_rg_dic(): 201 | readic = {} 202 | bamfiles = natsorted(glob.glob('f06_BaseRecal/*.recal.bam')) 203 | for rg,bam in zip(read_groups,bamfiles): 204 | start = rg.index('SM:') 205 | sample = rg[start+3:] 206 | if sample in readic: 207 | readic[sample].append(bam) 208 | else: 209 | readic[sample] = [bam] 210 | return readic 211 | def get_group_bam(): 212 | readic = get_rg_dic() 213 | for sp in readic: 214 | output_file = 'f07_mergeBam/' + sp + '.merge.bam' 215 | input_file = readic[sp] 216 | yield input_file,output_file 217 | @follows(remove_realiBam) 218 | @mkdir(fastqFiles,formatter(),'{path[0]}/f07_mergeBam') 219 | @files(get_group_bam) 220 | def mergeBam(input_files,output_file): 221 | merge_bams(input_files,output_file) 222 | @follows(mergeBam) 223 | def remove_recalBam(): 224 | if os.path.exists('f06_BaseRecal'): 225 | for f in glob.glob('f06_BaseRecal/*'): 226 | os.remove(f) 227 | if f.endswith('recal.bam'): 228 | handle = open(f,'w') 229 | handle.close() 230 | #--------------------- 13. 
Mark duplicates for merged file --------------------------------------- 231 | @jobs_limit(trim_batch) 232 | @follows(remove_recalBam) 233 | @mkdir(fastqFiles,formatter(),'{path[0]}/f08_dedupBam2') 234 | @transform(mergeBam,formatter('.*\.merge\.bam'),'f08_dedupBam2/{basename[0]}.dedup.bam') 235 | @check_if_uptodate(check_file_exists) 236 | def markduplicates2(input_file,output_file): 237 | mark_duplicates(input_file,output_file,picard) 238 | @follows(markduplicates2) 239 | def remove_mergeBam(): 240 | if os.path.exists('f07_mergeBam'): shutil.rmtree('f07_mergeBam') 241 | #--------------------- 14. Indel realignment ----------------------------------------------------- 242 | @follows(remove_mergeBam) 243 | @mkdir(fastqFiles,formatter(),'{path[0]}/f09_indelReali2') 244 | @transform(markduplicates2,formatter('.*\.dedup\.bam'),'f09_indelReali2/{basename[0]}.reali.bam') 245 | @check_if_uptodate(check_file_exists) 246 | def Realign2(input_file,output_file): 247 | interval = re.sub('reali\.bam$','interval.list',output_file) 248 | n = num_thread2use(len(get_rg_dic().keys()),len(fastqFiles),thread) 249 | RealignerTargetCreator(input_file,interval,gatk,ref_fa,n,gold_indels=['']) 250 | IndelRealigner(input_file,output_file,gatk,ref_fa,interval,gold_indels=['']) 251 | @follows(Realign2) 252 | def remove_dedupBam2(): 253 | if os.path.exists('f08_dedupBam2'): shutil.rmtree('f08_dedupBam2') 254 | #--------------------- 15. Round 2 call ---------------------------------------------- 255 | @follows(remove_dedupBam2) 256 | @mkdir(fastqFiles,formatter(),'{path[0]}/f10_Round2Call') 257 | @transform(Realign2,formatter('.*\.reali\.bam'),'f10_Round2Call/{basename[0]}.raw.g.vcf') 258 | @check_if_uptodate(check_file_exists) 259 | def round2Vari_call(input_file,output_file): 260 | n = num_thread2use(len(get_rg_dic().keys()),len(fastqFiles),thread) 261 | HaplotypeCaller_DNA_gVCF(input_file,output_file,gatk,ref_fa,n,otherParameters=[]) 262 | #--------------------- 16. Merge raw2 vcf --------------------------------------------- 263 | @follows(round2Vari_call) 264 | @merge(round2Vari_call,'f10_Round2Call/round2.g.vcf') 265 | def merge_vcf2(input_files,output_file): 266 | JointGenotype(input_files,output_file,gatk,ref_fa,thread) 267 | #--------------------- 17. filter gold snp and indel --------------------------------- 268 | @follows(merge_vcf2) 269 | @transform(merge_vcf2,suffix('.g.vcf'),['.gold_snp.vcf','.gold_indel.vcf']) 270 | # @check_if_uptodate(check_file_exists) 271 | def hard_filter2(input_file,output_pair): 272 | HardFilter(input_file,output_pair,gatk,ref_fa,thread) 273 | #--------------------- 18. 
combine vcf files ---------------------------------
274 | @follows(hard_filter2)
275 | @mkdir(fastqFiles,formatter(),'{path[0]}/f11_FinalVcf')
276 | @merge(hard_filter2,'f11_FinalVcf/'+sp+'.merged.filter.vcf')
277 | @check_if_uptodate(check_file_exists)
278 | def combine_vcf(input_files,output_file):
279 |     CombineSNPandINDEL(input_files,output_file,gatk,ref_fa,otherParams=['--assumeIdenticalSamples','--genotypemergeoption UNSORTED'])
280 | 
281 | @follows(combine_vcf)
282 | def last_function():
283 |     Message('GATK_DNA_succeed',contact)
284 | 
285 | 
286 | if __name__ == '__main__':
287 |     try:
288 | #         pipeline_printout(sys.stdout, [last_function], verbose=3)
289 |         pipeline_run([last_function],multiprocess=thread,gnu_make_maximal_rebuild_mode = True,
290 |                      touch_files_only=False,verbose=5)
291 |     except:
292 |         Message('GATK_DNA_failed',contact)
293 | 
294 | 
295 | 
296 | 
297 | 
298 | 
299 | 
300 | 
301 | 
302 | 
303 | 
--------------------------------------------------------------------------------
/Eukaryote_genome_annotation.py:
--------------------------------------------------------------------------------
1 | import os,re
2 | from Bio import SeqIO
3 | import sarge
4 | import glob
5 | from natsort import natsorted
6 | import multiprocessing as mp
7 | import pandas as pd
8 | from Bio import Entrez
9 | import sys
10 | 
11 | Entrez.email = 'shl198@eng.ucsd.edu'
12 | 
13 | # database files
14 | ref_fa = '/data/genome/hamster/multi_pacbio_assemble/picr.fa'
15 | rna_fa = '/data/shangzhong/Picr_assembly/Annotation/hamster_rna.fa'
16 | refseq_pr = '/data/shangzhong/Picr_assembly/Annotation/hamster_pr.fa'
17 | hamster_id = '/data/shangzhong/Database/hamster/hamster_all_id.txt'
18 | # pathways
19 | path = '/data/shangzhong/Picr_assembly/Annotation'
20 | organism = 'hamster'
21 | # exonerate parameters
22 | exonerate_path = path + '/exonerate'
23 | pr_gff = exonerate_path + '/exonerate.gff'
24 | # PASA parameters
25 | PASA_path = path + '/PASA'
26 | pasa = '/home/shangzhong/Installation/PASApipeline-2.0.2'
27 | ppl_fn = pasa + '/scripts/Launch_PASA_pipeline.pl'
28 | config = pasa + '/pasa_conf/pasa.alignAssembly.Template.txt'
29 | cmp_config = pasa + '/pasa_conf/pasa.annotationCompare.Template.txt'
30 | load_fn = pasa + '/scripts/Load_Current_Gene_Annotations.dbi'
31 | gff3_validate_fn = pasa + '/misc_utilities/pasa_gff3_validator.pl'
32 | tr_gff = PASA_path + '/picr_db.pasa_assemblies.gff3'
33 | #===============================================================================
34 | # 1. PASA Alignment assembly
35 | #===============================================================================
36 | def align_assemble(ppl_fn,config,ref_fa,rna_fa,thread,otherParameters=['']):
37 |     '''This function does the PASA alignment assembly and
38 |     generates 4 types of files:
39 |     sample_mydb_pasa.assemblies.fasta :the PASA assemblies in FASTA format.
40 |     sample_mydb_pasa.pasa_assemblies.gff3,.gtf,.bed :the PASA assembly structures.
41 |     sample_mydb_pasa.pasa_alignment_assembly_building.ascii_illustrations.out :descriptions
42 |     of alignment assemblies and how they were constructed from the underlying transcript alignments.
43 |     sample_mydb_pasa.pasa_assemblies_described.txt :tab-delimited format describing the contents
44 |     of the PASA assemblies, including the identity of those transcripts that were assembled into the corresponding structure. 
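    A minimal usage sketch (the thread count is arbitrary; the other arguments
    are the globals defined at the top of this script):
        align_assemble(ppl_fn,config,ref_fa,rna_fa,8)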
12 | 
13 | # database files
14 | ref_fa = '/data/genome/hamster/multi_pacbio_assemble/picr.fa'
15 | rna_fa = '/data/shangzhong/Picr_assembly/Annotation/hamster_rna.fa'
16 | refseq_pr = '/data/shangzhong/Picr_assembly/Annotation/hamster_pr.fa'
17 | hamster_id = '/data/shangzhong/Database/hamster/hamster_all_id.txt'
18 | # pathways
19 | path = '/data/shangzhong/Picr_assembly/Annotation'
20 | organism = 'hamster'
21 | # exonerate parameters
22 | exonerate_path = path + '/exonerate'
23 | pr_gff = exonerate_path + '/exonerate.gff'
24 | # PASA parameters
25 | PASA_path = path + '/PASA'
26 | pasa = '/home/shangzhong/Installation/PASApipeline-2.0.2'
27 | ppl_fn = pasa + '/scripts/Launch_PASA_pipeline.pl'
28 | config = pasa + '/pasa_conf/pasa.alignAssembly.Template.txt'
29 | cmp_config = pasa + '/pasa_conf/pasa.annotationCompare.Template.txt'
30 | load_fn = pasa + '/scripts/Load_Current_Gene_Annotations.dbi'
31 | gff3_validate_fn = pasa + '/misc_utilities/pasa_gff3_validator.pl'
32 | tr_gff = PASA_path + '/picr_db.pasa_assemblies.gff3'
33 | #===============================================================================
34 | #                     1. PASA Alignment assembly
35 | #===============================================================================
36 | def align_assemble(ppl_fn,config,ref_fa,rna_fa,thread,otherParameters=['']):
37 |     '''This function does alignment assembly and
38 |     generates 4 types of files:
39 |     sample_mydb_pasa.assemblies.fasta: the PASA assemblies in FASTA format.
40 |     sample_mydb_pasa.pasa_assemblies.gff3,.gtf,.bed: the PASA assembly structures.
41 |     sample_mydb_pasa.pasa_alignment_assembly_building.ascii_illustrations.out: descriptions
42 |     of alignment assemblies and how they were constructed from the underlying transcript alignments.
43 |     sample_mydb_pasa.pasa_assemblies_described.txt: tab-delimited format describing the contents
44 |     of the PASA assemblies, including the identity of those transcripts that were assembled into the corresponding structure.
45 |     '''
46 |     cmd = ('{ppl} -c {config} -C -r -R -g {ref_fa} \
47 |     -t {rna_fa} --ALIGNERS gmap --CPU {thread} {other}').format(ppl=ppl_fn,config=config,
48 |         ref_fa = ref_fa,rna_fa=rna_fa,thread=str(thread),other=' '.join(otherParameters))
49 |     print(cmd);sys.stdout.flush()
50 |     sarge.run(cmd)
51 | 
52 | def check_gff_compat(gff,ppl_fn,config):
53 |     '''check the gff compatibility with pasa'''
54 |     cmd = ('{ppl_fn} {gff}').format(ppl_fn=ppl_fn,gff=gff)
55 |     sarge.run(cmd)
56 | 
57 | def load_gff(gff,ref_fa,ppl_fn,config):
58 |     cmd = ('{ppl} -c {config} -g {ref} -P {gff}').format(ppl=ppl_fn,config=config,ref=ref_fa,gff=gff)
59 |     print(cmd)
60 |     sarge.run(cmd)
61 | 
62 | def com_update(ref_fa,ppl_fn,config,rna_fa,thread):
63 |     '''compare the reads and update the annotation'''
64 |     cmd = ('{ppl_fn} -c {config} -A -g {ref_fa} -t {rna} --CPU {t}').format(ppl_fn=ppl_fn,
65 |         config=config,ref_fa=ref_fa,rna=rna_fa,t=str(thread))
66 |     print(cmd)
67 |     sarge.run(cmd)
68 | 
69 | def main_PASA(gff_fn,ppl_fn,config,ref_fa,rna_fa,thread):
70 |     # 1. alignment assembly using gmap
71 |     align_assemble(ppl_fn,config,ref_fa,rna_fa,thread) #
72 |     # 2. check gff compatibility
73 |     check_gff_compat(gff_fn,gff3_validate_fn,config)  # run the pasa gff3 validator on the gff
74 |     # 3. load the gff file
75 |     load_gff(gff_fn,ref_fa,load_fn,config)
76 |     # 4. compare and update
77 |     com_update(ref_fa,ppl_fn,cmp_config,rna_fa,thread)
78 | #===============================================================================
79 | #                     2. run exonerate
80 | #===============================================================================
81 | def exonerate(ref_fa,pr_fn,out_fn):
82 |     '''map protein sequences to the genome'''
83 |     cmd = ('exonerate -m p2g -q {pr} -t {ref} --showalignment no \
84 |     --showvulgar no --showtargetgff yes --minintron 20 --percent 50 \
85 |     --score 100 --geneseed 250 -n 10 > {gff}').format(pr=pr_fn,ref=ref_fa,gff=out_fn)
86 |     print(cmd)
87 |     sarge.run(cmd)
88 | 
89 | def split_fa(fa,item_per_file,path):
90 |     if not os.path.exists(path): os.mkdir(path)
91 |     handle = SeqIO.parse(open(fa,'r'),'fasta')
92 |     file_n = 0
93 |     pr_n = 0
94 |     out_fn = path+'/file'+str(file_n)+'.fa'
95 |     if os.path.exists(out_fn): os.remove(out_fn)
96 |     for record in handle:
97 |         SeqIO.write(record,open(out_fn,'a'),'fasta')
98 |         pr_n += 1
99 |         if pr_n % int(item_per_file) == 0:
100 |             file_n +=1
101 |             out_fn = path+'/file'+str(file_n)+'.fa'
102 |             if os.path.exists(out_fn): os.remove(out_fn)
103 | 
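# Hypothetical usage sketch for the two helpers above (not part of the
# original script; the arguments are the module-level variables defined at
# the top of this file): split the refseq protein fasta into 100-record
# chunks, then align one chunk back to the genome. main_exonerate below
# runs this for every chunk in parallel.
# split_fa(refseq_pr,100,exonerate_path)
# exonerate(ref_fa,exonerate_path+'/file0.fa',exonerate_path+'/file0.gff')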
104 | def exonerate2gff(gffs,out_gff,g_type='evm'):
105 |     '''This function converts exonerate gff files to a standard gff format.
106 |     gffs: a list of gff files
107 |     out_gff: output final gff to store information
108 |     '''
109 |     out_handle = open(out_gff,'w')
110 |     n = 1
111 |     m = 0
112 |     for gff in gffs:
113 |         cds = []
114 |         for line in open(gff):
115 |             if line.startswith('#') or line.startswith('Command') or line.startswith('Hostname') or line.startswith(' ') or line.startswith('--'):
116 |                 continue
117 |             else:
118 |                 item = line.strip().split('\t')
119 |                 if item[2] == 'cds':
120 |                     cds.append(line.strip().split('\t'))
121 |                 elif item[2] == 'gene' and g_type=='augustus':
122 |                     item[1] = 'exonerate'
123 |                     pr = item[8].split(';')[1].split(' ')[2]
124 |                     item[8] = ('ID=gene_{n};Target={pr}').format(n=n,pr=pr)
125 |                     out_handle.write('\t'.join(item) + '\n')
126 |                 elif item[2] == 'similarity':
127 |                     info = item[8].split(';')
128 |                     pr = info[1].split()[1]
129 |                     length = 0
130 |                     start = 1; end = 1
131 |                     for c in cds:  # decide the start AA of each exon
132 |                         length += int(c[4]) - int(c[3]) + 1
133 |                         if length % 3 == 0:
134 |                             end = length/3
135 |                             new_s = end + 1
136 |                         else:
137 |                             end = length/3 + 1
138 |                             new_s = end
139 |                         c[1] = 'exonerate'
140 |                         c[2] = 'cds_match'
141 |                         m += 1
142 |                         if g_type == 'evm':
143 |                             m = n
144 |                         c.append(('ID=pr_{m};Parent=gene_{n};Target={pr} {s} {e}').format(m=m,n=n,pr=pr,s=start,e=end))
145 |                         start = new_s
146 |                         out_handle.write('\t'.join(c) + '\n')
147 |                     cds = []
148 |                     n += 1
149 |     out_handle.close()
150 | 
151 | 
152 | def main_exonerate(ref_fa,refseq_pr,exonerate_path,thread,exon2align_gff,index_s=0,index_e=0):
153 |     '''
154 |     * refseq_pr: all protein sequences of the organism
155 |     * path: path to store the split protein sequences.
156 |     '''
157 |     if not os.path.exists(exonerate_path): os.mkdir(exonerate_path)
158 |     # 1) split the protein fa file into many sub files, to parallelize the process
159 |     os.chdir(exonerate_path)
160 |     if os.listdir(exonerate_path) == []:  # only split once; skip if the chunks already exist
161 |         split_fa(refseq_pr,100,exonerate_path)
162 |     # 2) run exonerate for each file
163 |     faFiles = natsorted(glob.glob('file*.fa'))
164 |     if index_e == 0:
165 |         faFiles = faFiles[index_s:]
166 |     else:
167 |         faFiles = faFiles[index_s:index_e]
168 |     pool = mp.Pool(processes=int(thread))
169 |     for f in faFiles:
170 |         out = f[:-2]+'gff'
171 |         pool.apply_async(exonerate,args=(ref_fa,f,out))
172 |     pool.close()
173 |     pool.join()
174 |     # 3) merge the gff files
175 |     exonerate_gff = 'exonerate.gff'
176 |     if not os.path.exists(exonerate_gff):
177 |         gff_fns = natsorted(glob.glob('file*.gff'))
178 |         exonerate2gff(gff_fns,exonerate_gff)
179 | 
180 | # main_exonerate(ref_fa,refseq_pr,exonerate_path,thread,exon2align_gff)
181 | 
182 | 
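# Worked example for the amino-acid bookkeeping in exonerate2gff above
# (illustrative numbers only): two CDS exons of 75 bp and 76 bp give
# cumulative lengths 75 and 151. 75 % 3 == 0, so exon 1 ends at AA 75/3 = 25
# and exon 2 starts at AA 26; 151 % 3 != 0, so exon 2 ends at AA
# 151/3 + 1 = 51 and the next exon would start at AA 51, because its first
# codon is split across the exon boundary.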
183 | #===============================================================================
184 | #        process the gmap results and exonerate results directly
185 | #===============================================================================
186 | #=============== 1. get all mapped geneid, rna_accession, pr_accession
187 | def gene_rna_pr_id(hamster_id,gmap_gff,out_fn):
188 |     '''this function gets all gene, rna and protein ids, including both refseq and gff information.
189 |     * hamster_id: a file that has all ids in the hamster.gff file
190 |     * gmap_gff: gff results mapped using gmap
191 |     * out_fn: output file with geneid, rna accession and protein accession columns
192 |     '''
193 |     # rna accession in gff file
194 |     ham_id_df = pd.read_csv(hamster_id,sep='\t',header=0)
195 |     ham_id_df = ham_id_df.astype('str')
196 |     ham_id_df['TrAccess'] = ham_id_df['TrAccess'].map(lambda x: x.split('.')[0])
197 |     ham_id_df['PrAccess'] = ham_id_df['PrAccess'].map(lambda x: x.split('.')[0])
198 |     rna_gene_dic = ham_id_df.set_index('TrAccess')['GeneID'].to_dict()
199 |     rna_pr_dic = ham_id_df.set_index('TrAccess')['PrAccess'].to_dict()
200 |     #-------- read rna gff file
201 |     rna_df = pd.read_csv(gmap_gff,sep='\t',header=None,comment='#')
202 |     # add rna accession column
203 |     rna_df['rna_ac'] = rna_df[8].map(lambda x: re.search('(?<=ID=).+?(?=\.)',x).group(0))
204 |     mrna = list(set(rna_df['rna_ac'].tolist()))
205 |     # new rna in refseq compared to gff
206 |     new_ref_rna = list(set(mrna) - set(rna_gene_dic.keys()))
207 |     # get geneid for new ref_rna gene id
208 |     for r in new_ref_rna:
209 |         handle = Entrez.efetch(db='nucleotide',id=r,rettype='gb',retmode='text').read()
210 |         geneid = re.search('(?<=GeneID:).+?(?=\")',handle).group(0)
211 |         try:
212 |             p = re.search('(?<=protein_id=\").+?(?=\.)',handle).group(0)
213 |         except:
214 |             p = '-'
215 |         rna_gene_dic[r] = geneid
216 |         rna_pr_dic[r] = p
217 |     # transfer dic to dataframe
218 |     r_g_df = pd.DataFrame.from_dict(rna_gene_dic,'index')
219 |     r_g_df.columns = ['geneid']
220 |     r_p_df = pd.DataFrame.from_dict(rna_pr_dic,'index')
221 |     r_p_df.columns = ['pr_ac']
222 |     g_r_p_df = pd.concat([r_g_df,r_p_df],axis=1)
223 |     g_r_p_df['rna_ac'] = g_r_p_df.index
224 |     g_r_p_df[['geneid','rna_ac','pr_ac']].to_csv(out_fn,sep='\t',index=False)
225 | 
226 | # gmap_exon_path = path + '/gmap_exonerate'
227 | # if not os.path.exists(gmap_exon_path): os.mkdir(gmap_exon_path)
228 | # os.chdir(gmap_exon_path)
229 | # gmap_gff = PASA_path + '/gmap.spliced_alignments.gff3'
230 | # g_r_p_id_fn = gmap_exon_path + '/01_gene_rna_pr.txt'
231 | # gene_rna_pr_id(hamster_id,gmap_gff,g_r_p_id_fn)
232 | 
233 | 
234 | def get_consensus_map(rna_df,pr_df,gene,rna_ac,pr_ac):
235 |     '''this function checks whether the rna map and protein map have the same splice sites
236 |     * rna_df: mRNA map to genome gff dataframe with additional rna_ac column
237 |     * pr_df: protein map to genome dataframe with additional 'pr_ac' and 'pr_id' column
238 |     '''
239 |     if not rna_df.empty:
240 |         # get rna scaffold name, if more than 1 scaffold then don't add its annotation
241 |         rna_chr = list(set(rna_df[0].tolist()))
242 |         if len(rna_chr) != 1:
243 |             assert False, rna_ac + ' maps to multiple scaffolds'
244 |         else:
245 |             rna_chr = rna_chr[0]
246 |         # get strand, if mapped to both strands don't output
247 |         rna_str = list(set(rna_df[6].tolist()))
248 |         if len(rna_str) != 1:
249 |             assert False, rna_ac + ' maps to both strands'
250 |         else:
251 |             rna_str = rna_str[0]
252 |         # get rna splice sites
253 |         rna_splice = natsorted(rna_df[3].tolist() + rna_df[4].tolist())
254 |         # change exon id
255 |         n = 1
256 |         for i,row in rna_df.iterrows():
257 |             item = row[8].split(';')
258 |             iid = '.'.join(item[0].split('.')[:-1])
259 |             anno = iid+' '+str(n)+';'+re.sub('Name.+?;','',';'.join(item[1:]))+';Parent='+rna_ac+';gene_id='+gene+';transcript_id='+rna_ac
260 |             # anno = iid+'_'+str(n)+';'+ re.sub('Name','transcript_id',';'.join(item[1:]))+';Parent='+rna_ac+';gene_id='+gene
261 |             rna_df.loc[i,8] = anno
262 |             rna_df.loc[i,2] = 'exon'
263 |             n += 1
264 |         #--------------- process protein gff information
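# The protein-side block below keeps a protein alignment only when it is
# consistent with the rna alignment: same scaffold, CDS rows renumbered,
# and protein splice sites compatible with the rna splice sites; otherwise
# sub_pr_df is replaced with an empty dataframe and the protein is dropped.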
265 |         if not pr_df.empty:
266 |             pr_id = pr_df['pr_id'].tolist()[0]
267 |             sub_pr_df = pr_df[(pr_df['pr_id'].values==pr_id) & (pr_df[0].values==rna_chr)].copy()
268 |             # change cds id
269 |             m = 1
270 |             for i,row in sub_pr_df.iterrows():
271 |                 item = row[8].split(';')
272 |                 anno = 'ID='+pr_ac+'_'+str(m)+';'+';'.join(item[2:])+';protein_id='+pr_ac+';Parent='+rna_ac+';gene_id='+gene
273 |                 sub_pr_df.loc[i,8] = anno
274 |                 sub_pr_df.loc[i,2] = 'CDS'
275 |                 m += 1
276 |             pr_splice = natsorted(sub_pr_df[3].tolist() + sub_pr_df[4].tolist())
277 |             if sub_pr_df.shape[0] == 1:
278 |                 # single-CDS protein: keep it only if the CDS falls inside the
279 |                 # rna span (reconstructed check, treat as an assumption)
280 |                 if not (rna_splice[0] <= pr_splice[0] and pr_splice[-1] <= rna_splice[-1]):
281 |                     sub_pr_df = pd.DataFrame()
282 |             elif len(pr_splice) <= len(rna_splice):
283 |                 # multi-CDS protein: internal protein splice sites must be
284 |                 # shared with the rna (reconstructed check, treat as an assumption)
285 |                 if not set(pr_splice[1:-1]).issubset(set(rna_splice)):
286 |                     sub_pr_df = pd.DataFrame()
287 |             elif len(pr_splice) > len(rna_splice):
288 |                 print('protein has more splice than rna, rna/pr: %s/%s' % (len(rna_splice),len(pr_splice)))
289 |                 sub_pr_df = pd.DataFrame()
290 |         else:
291 |             sub_pr_df = pr_df
292 |         return rna_df,sub_pr_df,rna_chr,rna_splice[0],rna_splice[-1],rna_str
293 | 
294 | # import time
295 | # process_start = time.time()
296 | 
297 | def gmap_exonerate_merge_gff(gmap_gff,exonerate_gff,gmap_exon_path,all_id_fn):
298 |     #-------- read gmap gff file
299 |     rna_df = pd.read_csv(gmap_gff,sep='\t',header=None,comment='#')
300 |     rna_df['rna_ac'] = rna_df[8].map(lambda x: re.search('(?<=ID=).+?(?=\.)',x).group(0))
301 |     # get multi mapping mRNAs (gmap names secondary alignment paths .path2, .path3, ...)
302 |     multi_map_rna = list(set(rna_df[rna_df[8].map(lambda x: 'path2' in x)]['rna_ac'].tolist()))
303 |     # build gene rna protein id dictionary
304 |     g_r_p_dic = {}
305 |     g_r_p_id_fn = gmap_exon_path + '/01_gene_rna_pr.txt'
306 |     handle = open(g_r_p_id_fn)
307 |     for line in handle:
308 |         item = line.strip().split('\t')
309 |         if item[1] in multi_map_rna:
310 |             continue
311 |         if item[0] in g_r_p_dic:
312 |             g_r_p_dic[item[0]][item[1]] = item[2]
313 |         else:
314 |             g_r_p_dic[item[0]] = {item[1]:item[2]}
315 |     #-------- read exonerate gff file
316 |     pr_df = pd.read_csv(exonerate_gff,sep='\t',header=None)  # read the exonerate gff passed in by the caller
317 |     pr_df['pr_ac'] = pr_df[8].map(lambda x: re.search('(?<=Target=).+?(?=\.)',x).group(0))
318 | 
319 |     def output_consensus_rna_pr(g,out_handle):
320 |         '''this function finds the consistent rna and protein and outputs them to file
321 |         g_r_p_dic: dictionary that has all the gene, rna and protein ids.
322 | g: gene id 323 | rna_df: rna gff dataframe 324 | pr_df: protein gff dataframe 325 | ''' 326 | g_n = 0 327 | rna_pr_dic = g_r_p_dic[g] 328 | for rna in rna_pr_dic: 329 | pr = rna_pr_dic[rna] 330 | single_rna_df = rna_df[rna_df['rna_ac'].values==rna].copy() 331 | single_rna_df = single_rna_df.reset_index(drop=True) 332 | if not single_rna_df.empty: 333 | single_pr_df = pr_df[pr_df['pr_ac'].values==pr].copy() 334 | single_pr_df = single_pr_df.reset_index(drop=True) 335 | single_pr_df.loc[:,'pr_id'] = single_pr_df[8].map(lambda x: re.search('(?<=ID=).+?(?=;)',x).group(0)) 336 | res_rna_df,res_pr_df,chrome,start,end,strand=get_consensus_map(single_rna_df,single_pr_df,str(g),rna,pr) 337 | if g_n == 0: 338 | out_handle.write('\t'.join([chrome,'gmap_exonerate','gene',str(start),str(end),'.',\ 339 | strand,'.','ID='+str(g)+';gene_id='+str(g)])+'\n') 340 | g_n += 1 341 | if not res_rna_df.empty: 342 | if rna.startswith('XR'): 343 | feature = 'lncRNA' 344 | else: 345 | feature = 'mRNA' 346 | out_handle.write('\t'.join([chrome,'gmap_exonerate',feature,str(start),str(end),'.',\ 347 | strand,'.','ID='+rna+';Parent='+str(g)+';gene_id='+str(g)+';transcript_id='+rna])+'\n') 348 | res_rna_df[range(9)].to_csv(out_handle,sep='\t',index=False,header=None) 349 | if not res_pr_df.empty: 350 | res_pr_df[range(9)].to_csv(out_handle,sep='\t',index=False,header=None) 351 | 352 | out_fn = '02_gmap_exonerate.gff' 353 | if os.path.exists(out_fn): os.remove(out_fn) 354 | with open(out_fn,'a') as f: 355 | for g in g_r_p_dic.keys(): 356 | output_consensus_rna_pr(g,f) 357 | # define a function to find the start and end position of each gene 358 | def get_gene_s_e(gene_df): 359 | pos = gene_df[3].tolist() + gene_df[4].tolist() 360 | gene_df.iloc[0,3] = min(pos) 361 | gene_df.iloc[0,4] = max(pos) 362 | return gene_df 363 | # correct gene coordinates 364 | gff_df = pd.read_csv(out_fn,sep='\t',header=None) 365 | gff_df['geneid'] = gff_df[8].map(lambda x: re.search('(?<=gene_id=).+?(?=;|$)',x).group(0)) 366 | res_df = gff_df.groupby('geneid').apply(get_gene_s_e) 367 | # add gene name 368 | all_id_df = pd.read_csv(all_id_fn,sep='\t',header=0) 369 | all_id_df = all_id_df.astype('str') 370 | g_s_dic = all_id_df.set_index('GeneID')['GeneSymbol'].to_dict() 371 | res_df[8] = res_df.apply(lambda row: row[8]+';gene_name='+g_s_dic[row['geneid']] if row['geneid'] in g_s_dic else row[8]+';gene_name='+row['geneid'],axis=1) 372 | res_df[range(9)].to_csv('02_gmap_exonerate.gff',sep='\t',index=False,header=None) 373 | 374 | # gmap_gff = PASA_path+'/gmap.spliced_alignments.gff3' 375 | # gmap_exonerate_merge_gff(gmap_gff,pr_gff,gmap_exon_path,hamster_id) 376 | # 377 | # print time.time() - process_start 378 | 379 | 380 | 381 | 382 | 383 | 384 | 385 | -------------------------------------------------------------------------------- /Genome_Annotation.py: -------------------------------------------------------------------------------- 1 | from Modules.GeneMark import geneMark_ES 2 | import os,sarge 3 | from Bio import SeqIO 4 | import glob 5 | from natsort import natsorted 6 | import multiprocessing as mp 7 | import sys 8 | import pandas as pd 9 | import re 10 | from Bio import Entrez 11 | Entrez.email = 'shl198@eng.ucsd.edu' 12 | # database files 13 | ref_fa = '/data/genome/hamster/multi_pacbio_assemble/picr.fa' 14 | rna_fa = '/data/shangzhong/Picr_assembly/Annotation/hamster_rna.fa' 15 | refseq_pr = '/data/shangzhong/Picr_assembly/Annotation/hamster_pr.fa' 16 | hamster_id = '/data/shangzhong/Database/hamster/hamster_all_id.txt' 17 | # 
pathways
18 | path = '/data/shangzhong/Picr_assembly/Annotation'
19 | organism = 'hamster'
20 | # genemark parameters
21 | genemark_path = path + '/genemark'
22 | genemark_gff = genemark_path + '/genemark.gff3'
23 | # exonerate parameters
24 | exonerate_path = path + '/exonerate'
25 | pr_gff = exonerate_path + '/exonerate.gff'
26 | # PASA parameters
27 | PASA_path = path + '/PASA'
28 | pasa = '/home/shangzhong/Installation/PASApipeline-2.0.2'
29 | ppl_fn = pasa + '/scripts/Launch_PASA_pipeline.pl'
30 | config = pasa + '/pasa_conf/pasa.alignAssembly.Template.txt'
31 | cmp_config = pasa + '/pasa_conf/pasa.annotationCompare.Template.txt'
32 | load_fn = pasa + '/scripts/Load_Current_Gene_Annotations.dbi'
33 | gff3_validate_fn = pasa + '/misc_utilities/pasa_gff3_validator.pl'
34 | tr_gff = PASA_path + '/picr_db.pasa_assemblies.gff3'
35 | # evm parameters
36 | evm = '/home/shangzhong/Installation/EVidenceModeler-1.1.1'
37 | evm_path = path + '/EVM'
38 | exon2align_gff = '/home/shangzhong/Installation/EVidenceModeler-1.1.1/EvmUtils/misc/exonerate_gff_to_alignment_gff3.pl'
39 | weight_fn = evm + '/weights.txt' # /EvmUtils
40 | # blast
41 | blast_db = path + '/blastp_db'
42 | uniprot = path + '/uniprot_sprot.fasta.gz'
43 | thread = '6'
44 | #===============================================================================
45 | #                     1. run GeneMark
46 | #===============================================================================
47 | os.chdir(genemark_path)
48 | # genemark_gff = geneMark_ES(ref_fa)
49 | #===============================================================================
50 | #                     2. run exonerate
51 | #===============================================================================
52 | def exonerate(ref_fa,pr_fn,out_fn):
53 |     '''map protein sequences to the genome'''
54 |     cmd = ('exonerate -m p2g -q {pr} -t {ref} --showalignment no \
55 |     --showvulgar no --showtargetgff yes --minintron 20 --percent 50 \
56 |     --score 100 --geneseed 250 -n 10 > {gff}').format(pr=pr_fn,ref=ref_fa,gff=out_fn)
57 |     print(cmd)
58 |     sarge.run(cmd)
59 | 
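# Intent of the exonerate flags above (a hedged reading of the exonerate
# docs): -m p2g selects the protein2genome model, --percent 50 drops
# alignments scoring below 50% of the query's maximal score, --geneseed 250
# raises the seeding threshold to speed up the genome scan, and -n 10 caps
# the number of reported alignments per query protein.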
60 | def split_fa(fa,item_per_file,path):
61 |     if not os.path.exists(path): os.mkdir(path)
62 |     handle = SeqIO.parse(open(fa,'r'),'fasta')
63 |     file_n = 0
64 |     pr_n = 0
65 |     out_fn = path+'/file'+str(file_n)+'.fa'
66 |     if os.path.exists(out_fn): os.remove(out_fn)
67 |     for record in handle:
68 |         SeqIO.write(record,open(out_fn,'a'),'fasta')
69 |         pr_n += 1
70 |         if pr_n % int(item_per_file) == 0:
71 |             file_n +=1
72 |             out_fn = path+'/file'+str(file_n)+'.fa'
73 |             if os.path.exists(out_fn): os.remove(out_fn)
74 | 
75 | def exonerate2gff(gffs,out_gff,g_type='evm'):
76 |     '''This function converts exonerate gff files to a standard gff format.
77 |     gffs: a list of gff files
78 |     out_gff: output final gff to store information
79 |     '''
80 |     out_handle = open(out_gff,'w')
81 |     n = 1
82 |     m = 0
83 |     for gff in gffs:
84 |         cds = []
85 |         for line in open(gff):
86 |             if line.startswith('#') or line.startswith('Command') or line.startswith('Hostname') or line.startswith(' ') or line.startswith('--'):
87 |                 continue
88 |             else:
89 |                 item = line.strip().split('\t')
90 |                 if item[2] == 'cds':
91 |                     cds.append(line.strip().split('\t'))
92 |                 elif item[2] == 'gene' and g_type=='augustus':
93 |                     item[1] = 'exonerate'
94 |                     pr = item[8].split(';')[1].split(' ')[2]
95 |                     item[8] = ('ID=gene_{n};Target={pr}').format(n=n,pr=pr)
96 |                     out_handle.write('\t'.join(item) + '\n')
97 |                 elif item[2] == 'similarity':
98 |                     info = item[8].split(';')
99 |                     pr = info[1].split()[1]
100 |                     length = 0
101 |                     start = 1; end = 1
102 |                     for c in cds:  # decide the start AA of each exon
103 |                         length += int(c[4]) - int(c[3]) + 1
104 |                         if length % 3 == 0:
105 |                             end = length/3
106 |                             new_s = end + 1
107 |                         else:
108 |                             end = length/3 + 1
109 |                             new_s = end
110 |                         c[1] = 'exonerate'
111 |                         c[2] = 'cds_match'
112 |                         m += 1
113 |                         if g_type == 'evm':
114 |                             m = n
115 |                         c.append(('ID=pr_{m};Parent=gene_{n};Target={pr} {s} {e}').format(m=m,n=n,pr=pr,s=start,e=end))
116 |                         start = new_s
117 |                         out_handle.write('\t'.join(c) + '\n')
118 |                     cds = []
119 |                     n += 1
120 |     out_handle.close()
121 | 
122 | 
123 | def main_exonerate(ref_fa,refseq_pr,exonerate_path,thread,exon2align_gff,index_s=0,index_e=0):
124 |     '''
125 |     * refseq_pr: all protein sequences of the organism
126 |     * path: path to store the split protein sequences.
127 |     '''
128 |     if not os.path.exists(exonerate_path): os.mkdir(exonerate_path)
129 |     # 1) split the protein fa file
130 |     os.chdir(exonerate_path)
131 |     if os.listdir(exonerate_path) == []:  # only split once; skip if the chunks already exist
132 |         split_fa(refseq_pr,100,exonerate_path)
133 |     # 2) run exonerate for each file
134 |     faFiles = natsorted(glob.glob('file*.fa'))
135 |     if index_e == 0:
136 |         faFiles = faFiles[index_s:]
137 |     else:
138 |         faFiles = faFiles[index_s:index_e]
139 |     pool = mp.Pool(processes=int(thread))
140 |     for f in faFiles:
141 |         out = f[:-2]+'gff'
142 |         pool.apply_async(exonerate,args=(ref_fa,f,out))
143 |     pool.close()
144 |     pool.join()
145 |     # 3) merge the gff files
146 |     exonerate_gff = 'exonerate.gff'
147 |     if not os.path.exists(exonerate_gff):
148 |         gff_fns = natsorted(glob.glob('file*.gff'))
149 |         exonerate2gff(gff_fns,exonerate_gff)
150 | 
151 | # main_exonerate(ref_fa,refseq_pr,exonerate_path,thread,exon2align_gff)
152 | 
153 | #===============================================================================
154 | #                     3. PASA Alignment assembly
155 | #===============================================================================
156 | def align_assemble(ppl_fn,config,ref_fa,rna_fa,thread,otherParameters=['']):
157 |     '''This function does alignment assembly and
158 |     generates 4 types of files:
159 |     sample_mydb_pasa.assemblies.fasta: the PASA assemblies in FASTA format.
160 |     sample_mydb_pasa.pasa_assemblies.gff3,.gtf,.bed: the PASA assembly structures.
161 |     sample_mydb_pasa.pasa_alignment_assembly_building.ascii_illustrations.out: descriptions
162 |     of alignment assemblies and how they were constructed from the underlying transcript alignments.
163 |     sample_mydb_pasa.pasa_assemblies_described.txt: tab-delimited format describing the contents
164 |     of the PASA assemblies, including the identity of those transcripts that were assembled into the corresponding structure.
165 |     '''
166 |     cmd = ('{ppl} -c {config} -C -r -R -g {ref_fa} \
167 |     -t {rna_fa} --ALIGNERS gmap --CPU {thread} {other}').format(ppl=ppl_fn,config=config,
168 |         ref_fa = ref_fa,rna_fa=rna_fa,thread=str(thread),other=' '.join(otherParameters))
169 |     print(cmd);sys.stdout.flush()
170 |     sarge.run(cmd)
171 | 
172 | def check_gff_compat(gff,ppl_fn,config):
173 |     '''check the gff compatibility with pasa'''
174 |     cmd = ('{ppl_fn} {gff}').format(ppl_fn=ppl_fn,gff=gff)
175 |     sarge.run(cmd)
176 | 
177 | def load_gff(gff,ref_fa,ppl_fn,config):
178 |     cmd = ('{ppl} -c {config} -g {ref} -P {gff}').format(ppl=ppl_fn,config=config,ref=ref_fa,gff=gff)
179 |     print(cmd)
180 |     sarge.run(cmd)
181 | 
182 | def com_update(ref_fa,ppl_fn,config,rna_fa,thread):
183 |     '''compare the reads and update the annotation'''
184 |     cmd = ('{ppl_fn} -c {config} -A -g {ref_fa} -t {rna} --CPU {t}').format(ppl_fn=ppl_fn,
185 |         config=config,ref_fa=ref_fa,rna=rna_fa,t=str(thread))
186 |     print(cmd)
187 |     sarge.run(cmd)
188 | 
189 | def main_PASA(gff_fn,ppl_fn,config,ref_fa,rna_fa,thread):
190 |     # 1. alignment assembly using gmap
191 |     align_assemble(ppl_fn,config,ref_fa,rna_fa,thread) #
192 |     # 2. check gff compatibility
193 |     check_gff_compat(gff_fn,gff3_validate_fn,config)  # run the pasa gff3 validator on the gff
194 |     # 3. load the gff file
195 |     load_gff(gff_fn,ref_fa,load_fn,config)
196 |     # 4. compare and update
197 |     com_update(ref_fa,ppl_fn,cmp_config,rna_fa,thread)
198 | 
199 | #===============================================================================
200 | #                     4. run EVM
201 | #===============================================================================
202 | def evm_partition(ref_fa,evm,gffs=[''],otherParams=['']):
203 |     '''run evm to merge all the gff files'''
204 |     cmd = ('{evm} --genome {ref} {gffs} {other} --segmentSize 50000000 \
205 |     --overlapSize 10000 --partition_listing partitions_list.out').format(evm=evm,ref=ref_fa,
206 |         gffs=' '.join(gffs),other=' '.join(otherParams))
207 |     print(cmd)
208 |     sarge.run(cmd)
209 | 
210 | def evm_cmd_list(out_fn,cmd_fn,evm,ref_fa,weight_fn,partition,gffs=['']):
211 |     '''create the command list for evm'''
212 |     cmd = ('{evm} --genome {ref} --weights {w} {gffs} --output_file_name {out_fn} \
213 |     --partitions {par} > {cmd_l}').format(evm=evm,ref=ref_fa,
214 |         w=weight_fn,gffs=' '.join(gffs),out_fn=out_fn,par=partition,cmd_l=cmd_fn)
215 |     print(cmd)
216 |     sarge.run(cmd)
217 | 
218 | def combine_partition(evm,partition):
219 |     '''combine all the results from running the command list'''
220 |     cmd = ('{evm} --partitions {p} --output_file_name evm.out').format(evm=evm,p=partition)
221 |     print(cmd)
222 |     sarge.run(cmd)
223 | 
224 | def run_cmd(cmd):
225 |     try:
226 |         print(cmd);sys.stdout.flush()
227 |         sarge.run(cmd)
228 |     except:
229 |         print(cmd + ' error')
230 |         assert False
231 | 
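# EVM runs in stages through main_evm below: partition_EVM_inputs.pl splits
# the genome and the three evidence gffs into segments, write_EVM_commands.pl
# emits one command per partition, the commands run in parallel through
# mp.Pool/run_cmd above, and recombine_EVM_partial_outputs.pl plus
# convert_EVM_outputs_to_GFF3.pl stitch the partial outputs back together
# into a merged gff.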
232 | def filter_evm_gff(evm_path):
233 |     os.chdir(evm_path)
234 |     ds = [f for f in os.listdir(evm_path) if os.path.isdir(f)]
235 |     out_h = open('evm.evidence.txt','w')
236 |     for d in ds:
237 |         fPath = d + '/evm.out'
238 |         size = os.path.getsize(fPath)
239 |         if size > 0:
240 |             blocks = open(fPath).read().strip().split('#')[1:]
241 |             for block in blocks:
242 |                 coords = []
243 |                 evidence = []
244 |                 for line in block.strip().split('\n')[1:]:
245 |                     if line.strip() != '' and line[0] != '!':
246 |                         meta = line.strip().split('\t')
247 |                         coords.append(int(meta[0]))
248 |                         coords.append(int(meta[1]))
249 |                         coords.sort()
250 |                         evidence.extend([tuple(x[1:-1].split(';')) for x in meta[-1].split(',')])
251 | 
252 |                 evidence = set(evidence)
253 |                 sources = set([x[1] for x in evidence])
254 | 
255 |                 out_h.write(d + '\t' + str(coords[0]) + '\t' + str(coords[-1]) + '\t' + ','.join([x[0] for x in evidence]) + '\t' + ','.join(sources) + '\n')
256 |     out_h.close()
257 | 
258 | 
259 | def main_evm(thread):
260 |     os.chdir(evm_path)
261 |     evm_gffs = ['--gene_predictions '+genemark_gff,'--transcript_alignments '+tr_gff,'--protein_alignments '+pr_gff]
262 |     # 1. partition input
263 |     evm_partition(ref_fa,evm+'/EvmUtils/partition_EVM_inputs.pl',evm_gffs)
264 |     # 2. generate command lines
265 |     evm_cmd_out = 'evm.out'
266 |     cmd_fn = 'commands.list'
267 |     evm_cmd_list(evm_cmd_out,cmd_fn,evm+'/EvmUtils/write_EVM_commands.pl',ref_fa,weight_fn,'partitions_list.out',evm_gffs)
268 |     # 3. run commands
269 |     pool = mp.Pool(processes=int(thread))
270 |     cmds = open(cmd_fn).readlines()
271 |     for cmd in cmds:
272 |         pool.apply_async(run_cmd,args=(cmd,))
273 |     pool.close()
274 |     pool.join()
275 |     # 4. combine results
276 |     evm_combine = evm + '/EvmUtils/recombine_EVM_partial_outputs.pl'
277 |     combine_partition(evm_combine,'partitions_list.out')
278 |     # 5. transfer to gff
279 |     to_gff = evm + '/EvmUtils/convert_EVM_outputs_to_GFF3.pl'
280 |     cmd = ('{evm} --partitions partitions_list.out --output evm.out --genome {ref}').format(evm=to_gff,ref=ref_fa)
281 |     sarge.run(cmd)
282 |     # 6. merge gff
283 |     fns = glob.glob('*/*.out.gff3')
284 |     cmd = ('cat {input} > evm.merge.gff').format(input=' '.join(fns))
285 |     sarge.run(cmd)
286 |     # 7. extract genes supported by two algorithms
287 |     filter_evm_gff(evm_path)
288 | # main_evm(9)
289 | 
290 | #===============================================================================
291 | #                     5. Augustus
292 | #===============================================================================
293 | def gff2gb(gff,out_gb,ref):
294 |     '''transfer a gff file to a genbank file'''
295 |     cmd = ('gff2gbSmallDNA.pl {gff} {ref} 1000 {gb}').format(gff=gff,ref=ref,gb=out_gb)
296 |     print(cmd)
297 |     sarge.run(cmd)
298 | 
299 | #---- transfer exonerate.gff to exonerate.gb
300 | def augustus_train(exonerate_gff,out_gb,ref_fa):
301 | 
302 |     gff2gb(exonerate_gff,out_gb,ref_fa)
303 |     #---- clean problematic genes
304 |     sarge.run('etraining --species={s} --stopCodonExcludedFromCDS=true {gb} 2> train.err'.format(s=organism,gb=out_gb))
305 |     sarge.run('cat train.err | perl -pe \'s/.*in sequence (\S+): .*/$1/\' > badgenes.lst')
306 |     sarge.run('filterGenes.pl badgenes.lst {gb} > genes.gb'.format(gb=out_gb))
307 |     #---- split gb file
308 |     sarge.run('randomSplit.pl genes.gb 1000')
309 |     os.remove('genes.gb')
310 |     os.remove('genes.gb.train')
311 |     sarge.run('randomSplit.pl genes.gb.test 100')
312 |     #---- create meta parameters file for the new species
313 |     sarge.run('new_species.pl --species={s}'.format(s=organism))
314 |     #---- initial training
315 |     sarge.run('etraining --species={s} --stopCodonExcludedFromCDS=true genes.gb.test.train'.format(s=organism))
316 |     #---- first test prediction
317 |     sarge.run('augustus --species={s} genes.gb.test.test | tee firsttest.out'.format(s=organism))
318 |     #---- optimize
319 |     sarge.run('optimize_augustus.pl --species={s} genes.gb.test.train'.format(s=organism))
320 | 
321 | 
322 | def augustus_prepare_hint(pasa,exonerate):
323 |     '''build an augustus hints file from the PASA and exonerate gff files
324 |     '''
325 |     dfs = []
326 |     for g,t,feature in zip([pasa,exonerate],['E','P'],['exonpart','CDSpart']):
327 |         df = pd.read_csv(g,sep='\t',header=None)
328 |         df[2] = df[2].map(lambda x: feature)
329 |         df[8] = df[8].map(lambda x: x+';grp='+re.search('(?<=ID=).+?(?=;)',x).group(0)+';src='+t)
330 |         dfs.append(df)
331 |     res = pd.concat(dfs)
332 |     res.to_csv('hints.gff',sep='\t',index=False,header=None)
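# augustus_prepare_hint above rewrites PASA rows as exonpart hints with
# src=E (transcript evidence) and exonerate rows as CDSpart hints with
# src=P (protein evidence). A resulting row looks roughly like this
# (tab-separated, values illustrative):
# scaffold_1  PASA  exonpart  1200  1450  .  +  .  ID=...;grp=asmbl_1;src=E
# Both sources need matching entries in the extrinsic config file passed to
# augustus via --extrinsicCfgFile in the commented call below.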
333 | 
334 | 
335 | augs_path = path + '/augustus'
336 | if not os.path.exists(augs_path): os.mkdir(augs_path)
337 | os.chdir(augs_path)
338 | out_gb = 'exonerate.gb'
339 | # augustus_train(exonerate_path + '/exonerate_4_augustus.gff',out_gb,ref_fa)
340 | # augustus_prepare_hint(PASA_path+'/picr_db.pasa_assemblies.gff3',pr_gff)
341 | # sarge.run('augustus --species={s} {ref} \
342 | # --extrinsicCfgFile=extrinsic.hamster.cfg --hintsfile=hints.gff --gff3=on > augustus.hints.gff'.format(s=organism,ref=ref_fa))
343 | 
344 | #===============================================================================
345 | #             6. functional annotation of the new gff file
346 | #===============================================================================
347 | from Bio.Seq import Seq
348 | import shutil
349 | 
350 | def get_cds_sequence(rna,c_df,chrom_seq):
351 |     pr_df = c_df[c_df['rna_id'].values==rna]
352 |     strand = list(set(pr_df[6].tolist()))
353 |     if len(strand) == 2:
354 |         assert False, rna+' has both strands'
355 |     # sequence merge
356 |     chr_seq = Seq('')
357 |     for start,end in zip(pr_df[3],pr_df[4]):
358 |         if strand == ['-']:
359 |             chr_seq += chrom_seq[start-1:end].reverse_complement()
360 |         else:
361 |             chr_seq += chrom_seq[start-1:end]
362 |     # consider the frame information in column 7 (the gff phase field)
363 |     frame = int(pr_df[7].tolist()[0])
364 |     rna_seq = chr_seq[frame:]
365 |     return str(rna_seq.translate())
366 | 
367 | def output_cds(chrom,cds_df,dic):
368 |     '''this function gets the AA sequences and outputs them to a file named after the chromosome'''
369 |     chrom_seq = dic[chrom].seq
370 |     chr_df = cds_df[cds_df[0].values==chrom]
371 |     rnas = list(set(chr_df['rna_id'].tolist()))
372 |     out_h = open(chrom+'.fa','w')
373 |     for rna in rnas:  #['evm.model.picr_0.1707']: #rnas:
374 |         AA = get_cds_sequence(rna,chr_df,chrom_seq)
375 |         if AA.endswith('*'):
376 |             AA = AA[:-1]
377 |         out_h.write('>{rna}\n{pr}\n'.format(rna=rna,pr=AA))
378 |     out_h.close()
379 | 
380 | def get_evm_pr(evm_path,ref_fa,out_path):
381 |     '''this function gets all evm proteins, outputs them to files and merges the files together
382 |     * evm_path: evm path that has the gff file
383 |     * ref_fa: reference fa file
384 |     * out_path: path to save all temporary files and the final protein file
385 |     '''
386 |     if os.path.exists(out_path):
387 |         shutil.rmtree(out_path)
388 |     os.mkdir(out_path)
389 |     os.chdir(out_path)
390 |     evm_gff= evm_path + '/evm.merge.gff'
391 |     gff_df = pd.read_csv(evm_gff,sep='\t',header=None)
392 |     dic = SeqIO.index(ref_fa,'fasta')
393 |     cds_df = gff_df[gff_df[2].values=='CDS']
394 |     cds_df = cds_df.reset_index(drop=True)
395 |     cds_df['rna_id'] = cds_df[8].map(lambda x: x.split(';')[1][7:])
396 |     scaffolds = list(set(cds_df[0].tolist()))
397 |     for scaff in scaffolds:
398 |         output_cds(scaff,cds_df,dic)
399 |     # merge files
400 |     fns = natsorted(glob.glob('*.fa'))
401 |     sarge.run('cat {fns} > {out}'.format(fns=' '.join(fns),out='pr_merge.fa'))
402 |     for f in fns:
403 |         os.remove(f)
404 | 
405 | evm_pr_path = path + '/evm_pr'
406 | 
407 | # get_evm_pr(evm_path,ref_fa,evm_pr_path)
408 | 
409 | def makeblast(ref_fa,out,db_type):
410 |     '''
411 |     ref_fa: gzipped fa file
412 |     '''
413 |     cmd = ('gunzip -c {ref} | makeblastdb -in - -dbtype {type} -out {out} -title {title}').format(
414 |         ref=ref_fa,type=db_type,out=out,title=out)
415 |     print(cmd)
416 |     sarge.run(cmd)
417 | 
418 | def blastp(query_fa,out_fn,db,thread):
419 |     cmd = ('blastp -query {q} -task blastp -db {db} -out {out} -evalue 1e-7 -word_size 4 \
420 |     -outfmt 6 -num_alignments 1 -num_threads {t}').format(q=query_fa,db=db,out=out_fn,t=str(thread))
421 |     print(cmd)
422 |     sarge.run(cmd)
423 | 
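# blastp above writes tabular output (-outfmt 6), whose default columns
# begin with qseqid, sseqid and pident; add_gene_function below reads
# exactly those three (usecols=[0,1,2]), so its 'ref' column holds the evm
# model id (the blast query) and its 'query' column the uniprot hit (the
# subject) -- the names are swapped relative to blast's terminology but are
# used consistently.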
424 | def main_blast():
425 |     blast_db = path + '/blastp_db'
426 |     if not os.path.exists(blast_db): os.mkdir(blast_db)
427 |     os.chdir(blast_db)
428 |     # makeblast(uniprot,'pr','prot')
429 |     blastp(evm_pr_path +'/pr_merge.fa','blastp.txt','pr',24)
430 | 
431 | # import time
432 | # st = time.time()
433 | # main_blast()
434 | # print time.time() - st
435 | 
436 | def add_gene_name(x,rna_pr_dic):
437 |     ids = '.'.join(re.search('picr.+?(?=;)',x).group(0).split('.')[:2])
438 |     if ids in rna_pr_dic:
439 |         res = x + ';gene=' + rna_pr_dic[ids]
440 |     else:
441 |         res = x
442 |     return res
443 | 
444 | # add function of mapped genes to gff file
445 | def add_gene_function(blast_db,evm_path):
446 |     '''add gene symbols to the gff file. the information comes from the blast results
447 |     '''
448 |     blastp_fn = blast_db + '/blastp.txt'
449 |     blast_df = pd.read_csv(blastp_fn,sep='\t',usecols=[0,1,2],names=['ref','query','per'])
450 |     blast_df = blast_df[blast_df['per'].values>50]
451 |     blast_df['rna'] = blast_df['ref'].map(lambda x: '.'.join(x.split('.')[-2:]))
452 |     blast_df['pr'] = blast_df['query'].map(lambda x: x.split('|')[-1].split('_')[0])
453 |     rna_pr_dic = blast_df.set_index('rna')['pr'].to_dict()
454 | 
455 |     evm_gff= evm_path + '/evm.merge.gff'
456 |     gff_df = pd.read_csv(evm_gff,sep='\t',header=None)
457 |     gff_df[8] = gff_df[8].map(lambda x: add_gene_name(x,rna_pr_dic))
458 |     gff_df = gff_df[~gff_df[8].map(lambda x: 'gene=LORF2' in x)]
459 |     gff_df.to_csv(blast_db +'/final.gff',sep='\t',index=False,header=None)  # header=None keeps a pandas header row out of the gff
460 | 
461 | # add_gene_function(blast_db,evm_path)
462 | #===============================================================================
463 | #        process the gmap results and exonerate results directly
464 | #===============================================================================
465 | #=============== 1. get all mapped geneid, rna_accession, pr_accession
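# Hedged sketch of the Entrez lookup that gene_rna_pr_id below relies on
# (the accession is a placeholder, not a real record): fetch one GenBank
# record and scrape its GeneID cross-reference. NCBI rate-limits efetch,
# so a long list of new accessions takes a while.
# rec = Entrez.efetch(db='nucleotide',id='XM_0000000',rettype='gb',retmode='text').read()
# geneid = re.search('(?<=GeneID:).+?(?=\")',rec).group(0)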
466 | def gene_rna_pr_id(hamster_id,gmap_gff,out_fn):
467 |     '''this function gets all gene, rna and protein ids, including both refseq and gff information.
468 |     * hamster_id: a file that has all ids in the hamster.gff file
469 |     * gmap_gff: gff results mapped using gmap
470 |     * out_fn: output file with geneid, rna accession and protein accession columns
471 |     '''
472 |     # rna accession in gff file
473 |     ham_id_df = pd.read_csv(hamster_id,sep='\t',header=0)
474 |     ham_id_df = ham_id_df.astype('str')
475 |     ham_id_df['TrAccess'] = ham_id_df['TrAccess'].map(lambda x: x.split('.')[0])
476 |     ham_id_df['PrAccess'] = ham_id_df['PrAccess'].map(lambda x: x.split('.')[0])
477 |     rna_gene_dic = ham_id_df.set_index('TrAccess')['GeneID'].to_dict()
478 |     rna_pr_dic = ham_id_df.set_index('TrAccess')['PrAccess'].to_dict()
479 |     #-------- read rna gff file
480 |     rna_df = pd.read_csv(gmap_gff,sep='\t',header=None,comment='#')
481 |     # add rna accession column
482 |     rna_df['rna_ac'] = rna_df[8].map(lambda x: re.search('(?<=ID=).+?(?=\.)',x).group(0))
483 |     mrna = list(set(rna_df['rna_ac'].tolist()))
484 |     # new rna in refseq compared to gff
485 |     new_ref_rna = list(set(mrna) - set(rna_gene_dic.keys()))
486 |     # get geneid for new ref_rna gene id
487 |     for r in new_ref_rna:
488 |         handle = Entrez.efetch(db='nucleotide',id=r,rettype='gb',retmode='text').read()
489 |         geneid = re.search('(?<=GeneID:).+?(?=\")',handle).group(0)
490 |         try:
491 |             p = re.search('(?<=protein_id=\").+?(?=\.)',handle).group(0)
492 |         except:
493 |             p = '-'
494 |         rna_gene_dic[r] = geneid
495 |         rna_pr_dic[r] = p
496 |     # transfer dic to dataframe
497 |     r_g_df = pd.DataFrame.from_dict(rna_gene_dic,'index')
498 |     r_g_df.columns = ['geneid']
499 |     r_p_df = pd.DataFrame.from_dict(rna_pr_dic,'index')
500 |     r_p_df.columns = ['pr_ac']
501 |     g_r_p_df = pd.concat([r_g_df,r_p_df],axis=1)
502 |     g_r_p_df['rna_ac'] = g_r_p_df.index
503 |     g_r_p_df[['geneid','rna_ac','pr_ac']].to_csv(out_fn,sep='\t',index=False)
504 | 
505 | gmap_exon_path = path + '/gmap_exonerate'
506 | if not os.path.exists(gmap_exon_path): os.mkdir(gmap_exon_path)
507 | os.chdir(gmap_exon_path)
508 | # gmap_gff = PASA_path + '/gmap.spliced_alignments.gff3'
509 | # g_r_p_id_fn = gmap_exon_path + '/01_gene_rna_pr.txt'
510 | # gene_rna_pr_id(hamster_id,gmap_gff,g_r_p_id_fn)
511 | 
512 | 
513 | def get_consensus_map(rna_df,pr_df,gene,rna_ac,pr_ac):
514 |     '''this function checks whether the rna map and protein map have the same splice sites
515 |     * rna_df: mRNA map to genome gff dataframe with additional rna_ac column
516 |     * pr_df: protein map to genome dataframe with additional 'pr_ac' and 'pr_id' column
517 |     '''
518 |     if not rna_df.empty:
519 |         # get rna scaffold name, if more than 1 scaffold then don't add its annotation
520 |         rna_chr = list(set(rna_df[0].tolist()))
521 |         if len(rna_chr) != 1:
522 |             assert False, rna_ac + ' maps to multiple scaffolds'
523 |         else:
524 |             rna_chr = rna_chr[0]
525 |         # get strand, if mapped to both strands don't output
526 |         rna_str = list(set(rna_df[6].tolist()))
527 |         if len(rna_str) != 1:
528 |             assert False, rna_ac + ' maps to both strands'
529 |         else:
530 |             rna_str = rna_str[0]
531 |         # get rna splice sites
532 |         rna_splice = natsorted(rna_df[3].tolist() + rna_df[4].tolist())
533 |         # change exon id
534 |         n = 1
535 |         for i,row in rna_df.iterrows():
536 |             item = row[8].split(';')
537 |             anno = '.'.join(item[0].split('.')[:-1])+'_'+str(n)+';'+ ';'.join(item[1:])+';Parent='+gene+';GeneID='+gene
538 |             rna_df.loc[i,8] = anno
539 |             rna_df.loc[i,2] = 'exon'
540 |             n += 1
541 |         #--------------- process protein gff information
542 |         if not pr_df.empty:
543 |             pr_id = pr_df['pr_id'].tolist()[0]
544 |             sub_pr_df = pr_df[(pr_df['pr_id'].values==pr_id) & (pr_df[0].values==rna_chr)].copy()
545 |             # change cds id
546 |             m = 1
547 |             for i,row in sub_pr_df.iterrows():
548 |                 item = row[8].split(';')
549 |                 anno = 'ID='+pr_ac+'_'+str(m)+';'+';'.join(item[1:])+';Parent='+rna_ac+';GeneID='+gene
550 |                 sub_pr_df.loc[i,8] = anno
551 |                 sub_pr_df.loc[i,2] = 'CDS'
552 |                 m += 1
553 |             pr_splice = natsorted(sub_pr_df[3].tolist() + sub_pr_df[4].tolist())
554 |             if sub_pr_df.shape[0] == 1:
555 |                 # single-CDS protein: keep it only if the CDS falls inside the
556 |                 # rna span (reconstructed check, treat as an assumption)
557 |                 if not (rna_splice[0] <= pr_splice[0] and pr_splice[-1] <= rna_splice[-1]):
558 |                     sub_pr_df = pd.DataFrame()
559 |             elif len(pr_splice) <= len(rna_splice):
560 |                 # internal protein splice sites must be shared with the rna (reconstructed check, treat as an assumption)
561 |                 if not set(pr_splice[1:-1]).issubset(set(rna_splice)):
562 |                     sub_pr_df = pd.DataFrame()
563 |             elif len(pr_splice) > len(rna_splice):
564 |                 print('protein has more splice than rna, rna/pr: %s/%s' % (len(rna_splice),len(pr_splice)))
565 |                 sub_pr_df = pd.DataFrame()
566 |         else:
567 |             sub_pr_df = pr_df
568 |         return rna_df,sub_pr_df,rna_chr,rna_splice[0],rna_splice[-1],rna_str
569 | 
570 | 
571 | import time
572 | process_start = time.time()
573 | 
574 | def gmap_exonerate_merge_gff(gmap_gff,exonerate_gff,gmap_exon_path,all_id_fn):
575 |     #-------- read gmap gff file
576 |     rna_df = pd.read_csv(gmap_gff,sep='\t',header=None,comment='#')
577 |     rna_df['rna_ac'] = rna_df[8].map(lambda x: re.search('(?<=ID=).+?(?=\.)',x).group(0))
578 |     # get multi mapping mRNAs (gmap names secondary alignment paths .path2, .path3, ...)
579 |     multi_map_rna = list(set(rna_df[rna_df[8].map(lambda x: 'path2' in x)]['rna_ac'].tolist()))
580 |     # build gene rna protein id dictionary
581 |     g_r_p_dic = {}
582 |     g_r_p_id_fn = gmap_exon_path + '/01_gene_rna_pr.txt'
583 |     handle = open(g_r_p_id_fn)
584 |     for line in handle:
585 |         item = line.strip().split('\t')
586 |         if item[1] in multi_map_rna:
587 |             continue
588 |         if item[0] in g_r_p_dic:
589 |             g_r_p_dic[item[0]][item[1]] = item[2]
590 |         else:
591 |             g_r_p_dic[item[0]] = {item[1]:item[2]}
592 |     #-------- read exonerate gff file
593 |     pr_df = pd.read_csv(exonerate_gff,sep='\t',header=None)  # read the exonerate gff passed in by the caller
594 |     pr_df['pr_ac'] = pr_df[8].map(lambda x: re.search('(?<=Target=).+?(?=\.)',x).group(0))
595 | 
596 |     def output_consensus_rna_pr(g,out_handle):
597 |         '''this function finds the consistent rna and protein and outputs them to file
598 |         g_r_p_dic: dictionary that has all the gene, rna and protein ids.
599 |         g: gene id
600 |         rna_df: rna gff dataframe
601 |         pr_df: protein gff dataframe
602 |         '''
603 |         g_n = 0
604 |         rna_pr_dic = g_r_p_dic[g]
605 |         for rna in rna_pr_dic:
606 |             pr = rna_pr_dic[rna]
607 |             single_rna_df = rna_df[rna_df['rna_ac'].values==rna].copy()
608 |             single_rna_df = single_rna_df.reset_index(drop=True)
609 |             if not single_rna_df.empty:
610 |                 single_pr_df = pr_df[pr_df['pr_ac'].values==pr].copy()
611 |                 single_pr_df = single_pr_df.reset_index(drop=True)
612 |                 single_pr_df.loc[:,'pr_id'] = single_pr_df[8].map(lambda x: re.search('(?<=ID=).+?(?=;)',x).group(0))
613 |                 res_rna_df,res_pr_df,chrome,start,end,strand=get_consensus_map(single_rna_df,single_pr_df,str(g),rna,pr)
614 |                 if g_n == 0:
615 |                     out_handle.write('\t'.join([chrome,'gmap_exonerate','gene',str(start),str(end),'.',\
616 |                         strand,'.','ID='+str(g)+';GeneID='+str(g)])+'\n')
617 |                     g_n += 1
618 |                 if not res_rna_df.empty:
619 |                     if rna.startswith('XR'):
620 |                         feature = 'lncRNA'
621 |                     else:
622 |                         feature = 'mRNA'
623 |                     out_handle.write('\t'.join([chrome,'gmap_exonerate',feature,str(start),str(end),'.',\
624 |                         strand,'.','ID='+rna+';Parent='+str(g)+';GeneID='+str(g)])+'\n')
625 |                     res_rna_df[range(9)].to_csv(out_handle,sep='\t',index=False,header=None)
626 |                 if not res_pr_df.empty:
627 |                     res_pr_df[range(9)].to_csv(out_handle,sep='\t',index=False,header=None)
628 | 
629 |     out_fn = '02_gmap_exonerate.gff'
630 |     if os.path.exists(out_fn): os.remove(out_fn)
631 |     with open(out_fn,'a') as f:
632 |         for g in g_r_p_dic.keys():
633 |             output_consensus_rna_pr(g,f)
634 |     # define a function to find the start and end position of each gene
635 |     def get_gene_s_e(gene_df):
636 |         pos = gene_df[3].tolist() + gene_df[4].tolist()
637 |         gene_df.iloc[0,3] = min(pos)
638 |         gene_df.iloc[0,4] = max(pos)
639 |         return gene_df
640 |     # correct gene coordinates
641 |     gff_df = pd.read_csv(out_fn,sep='\t',header=None)
642 |     gff_df['geneid'] = gff_df[8].map(lambda x: re.search('(?<=GeneID=).+?(?=$)',x).group(0))
643 |     res_df = gff_df.groupby('geneid').apply(get_gene_s_e)
644 |     # add gene name
645 |     all_id_df = pd.read_csv(all_id_fn,sep='\t',header=0)
646 |     all_id_df = all_id_df.astype('str')
647 |     g_s_dic = all_id_df.set_index('GeneID')['GeneSymbol'].to_dict()
648 |     res_df[8] = res_df.apply(lambda row: row[8]+';GeneName='+g_s_dic[row['geneid']] if row['geneid'] in g_s_dic else row[8]+';GeneName=NA',axis=1)
649 |     res_df[range(9)].to_csv('02_gmap_exonerate.gff',sep='\t',index=False,header=None)
650 | 
651 | # gmap_gff = PASA_path+'/gmap.spliced_alignments.gff3'
652 | # gmap_exonerate_merge_gff(gmap_gff,pr_gff,gmap_exon_path,hamster_id)
653 | 
654 | print(time.time() - process_start)
655 | #===============================================================================
656 | #                     RATT
657 | #===============================================================================
658 | def fa2embl(fa,embl,gff,path):
659 |     if not os.path.exists(path): os.mkdir(path)
660 |     os.chdir(path)
661 |     df = pd.read_csv(gff,sep='\t',header=None,comment='#',usecols=[0,2])
662 |     df = df[df[2].values=='gene']
663 |     chroms = list(set(df[0].tolist()))
664 |     dic = SeqIO.index(fa,'fasta')
665 |     for s in chroms:
666 |         SeqIO.write(dic[s],open('fa','w'),'fasta')
667 |         sarge.run('grep \'{s}\' {gff} > gff'.format(s=s,gff=gff))
668 |         sarge.run('/home/shangzhong/Installation/EMBOSS-6.6.0/bin/seqret \
669 |         -sequence fa -feature -fformat gff -fopenfile1 gff -osformat2 embl \
670 |         -auto -outseq {s}.embl'.format(s=s))
671 |     fns = glob.glob('*.embl')
672 |     sarge.run('cat {files} > {embl}'.format(files=' '.join(fns),embl=embl))
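# fa2embl above prepares reference input for RATT, which expects EMBL-format
# annotation: for every scaffold that carries a gene, EMBOSS seqret fuses the
# scaffold's fasta sequence with its gff features into one .embl file, and
# the per-scaffold files are then concatenated into a single embl file.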
673 |     # for f in fns:
674 |     #     os.remove(f)
675 | # fa2embl('/data/genome/hamster/ncbi_refseq/hamster.fa','hamster.embl','/data/genome/hamster/ncbi_refseq/hamster.gff','/data/shangzhong/Picr_assembly/Annotation/RATT/embl')
676 | 
677 | 
678 | 
--------------------------------------------------------------------------------