├── fasta_paths.config ├── getCodingSequence.R ├── getCodingSequenceNonstop.R ├── README.md ├── getMutNeoantigenBindersMHCClassII.sh ├── getMutNeoantigenBinders.sh ├── runNetMHCpan.py ├── mutationPostProcess.py ├── mafToFastaV2.py └── LICENSE /fasta_paths.config: -------------------------------------------------------------------------------- 1 | [Reference Paths] 2 | GRCh37cds: /xchip/cga_home/margolis/mutationsToNeoantigen/goldStandard/Homo_sapiens.GRCh37.cds.all.fa 3 | GRCh37cdna: /xchip/cga_home/margolis/mutationsToNeoantigen/goldStandard/Homo_sapiens.GRCh37.cdna.all.fa 4 | GRCh38cds: /xchip/cga_home/margolis/mutationsToNeoantigen/goldStandard/Homo_sapiens.GRCh38.cds.all.fa 5 | GRCh38cdna: /xchip/cga_home/margolis/mutationsToNeoantigen/goldStandard/Homo_sapiens.GRCh38.cdna.all.fa 6 | -------------------------------------------------------------------------------- /getCodingSequence.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | library('biomaRt') 4 | 5 | # Get transcript ID passed from python script 6 | functinput <- commandArgs(trailingOnly = TRUE) 7 | 8 | # Function to grab coding sequence 9 | getCodingSeq <- function(transcriptID) { 10 | # Set up mart database 11 | ensembl <- useMart(biomart = "ENSEMBL_MART_ENSEMBL", host="grch37.ensembl.org") 12 | ensembl <- useDataset("hsapiens_gene_ensembl", mart=ensembl) 13 | # Do getSequence query 14 | transcriptseq <- getSequence(id=transcriptID, type="ensembl_transcript_id", seqType="coding", mart=ensembl) 15 | # Return to command line 16 | cat(transcriptseq$coding) 17 | } 18 | 19 | # Call function 20 | getCodingSeq(functinput) 21 | -------------------------------------------------------------------------------- /getCodingSequenceNonstop.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | 3 | library('biomaRt') 4 | 5 | # Get transcript ID passed from python script 6 | functinput <- 
commandArgs(trailingOnly = TRUE) 7 | 8 | # Function to grab coding sequence 9 | getCodingSeq <- function(transcriptID) { 10 | # Set up mart database 11 | ensembl <- useMart(biomart = "ENSEMBL_MART_ENSEMBL", host="grch37.ensembl.org") 12 | ensembl <- useDataset("hsapiens_gene_ensembl", mart=ensembl) 13 | # Do getSequence query 14 | transcriptseq <- paste(getSequence(id=transcriptID, type="ensembl_transcript_id", seqType="coding", mart=ensembl)$coding, 15 | getSequence(id=transcriptID, type="ensembl_transcript_id", seqType="3utr", mart=ensembl)$`3utr`, sep="") 16 | # Return to command line 17 | cat(transcriptseq) 18 | } 19 | 20 | # Call function 21 | getCodingSeq(functinput) 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # neoantigen_calling_pipeline 2 | 3 | This pipeline calls somatic cancer neoantigens generated from genetic mutations in patient tumor DNA. 4 | 5 | To run: 6 | - Download NetMHCPan-3.0 (http://www.cbs.dtu.dk/cgi-bin/nph-sw_request?netMHCpan) and configure paths in runNetMHCpan.py file (line 70). 7 | - Download GRCh37 Ensembl FASTA files, Homo_sapiens.GRCh37.cds.all.fa and Homo_sapiens.GRCh37.cdna.all.fa (http://grch37.ensembl.org/info/data/ftp/index.html) and update paths in fasta_paths.config file. 8 | - Change paths in shell script getMutNeoantigenBinders.sh (notes in file comments). 9 | - For each sample, pipeline is intended to run on a MuTect SNV maf file, a Strelka InDel maf file, and a list of patient Class I HLA alleles (e.g., HLA-A02:01). 10 | - Run getMutNeoantigenBinders.sh from command line as an SGE Array Job. This script is a wrapper and will call all other relevant scripts. 11 | 12 | Additional notes: 13 | - Currently, pipeline works for MHC Class I only. MHC Class II functionality in development. 
14 | - Detailed execution instructions and functionality descriptions can be found in each script header, as well as for each individual function. 15 | 16 | -------------------------------------------------------------------------------- /getMutNeoantigenBindersMHCClassII.sh: -------------------------------------------------------------------------------- 1 | # Claire Margolis 2 | # 3 October 2016 3 | # getMutNeoantigenBindersMHCClassII.sh 4 | # 5 | # Summary: Shell script that preprocesses patient .maf files and runs NetMHCIIPan on them 6 | # 7 | # *NOTE*: If you want to run this script, go through and verify that the paths to relevant files 8 | # are in the correct format for your cohort. You will need to change out_dir.txt, among other 9 | # things, to make the script specific to your cohort. You can also change preferences for running 10 | # netMHCIpan vs. netMHCIIpan. 11 | 12 | # ----------------------------------------------------------------------------------------------- # 13 | 14 | # ----------------------------------------------------------------------------------------------- # 15 | # Specify shell / UGER preferences 16 | 17 | #!/bin/bash 18 | 19 | #$ -cwd 20 | #$ -q long 21 | #$ -m e 22 | #$ -l h_vmem=10g 23 | #$ -t 1-2 24 | # ----------------------------------------------------------------------------------------------- # 25 | 26 | # ----------------------------------------------------------------------------------------------- # 27 | # Use statements 28 | 29 | source /broad/software/scripts/useuse 30 | reuse Python-2.7 31 | use MySQL-5.6 32 | 33 | # ----------------------------------------------------------------------------------------------- # 34 | 35 | # ----------------------------------------------------------------------------------------------- # 36 | # Set directory paths 37 | 38 | patient_dir=pat_dirs.txt 39 | PAT_DIR=$(cat $patient_dir | head -n $SGE_TASK_ID | tail -n 1) 40 | snv_mafs=snv_mafs.txt 41 | SNV_MAF=$(cat $snv_mafs | head -n 
$SGE_TASK_ID | tail -n 1) 42 | indel_mafs=indel_mafs.txt 43 | INDEL_MAF=$(cat $indel_mafs | head -n $SGE_TASK_ID | tail -n 1) 44 | hla_types=hla_types.txt 45 | HLA_TYPE=$(cat $hla_types | head -n $SGE_TASK_ID | tail -n 1) 46 | 47 | # ----------------------------------------------------------------------------------------------- # 48 | 49 | # ----------------------------------------------------------------------------------------------- # 50 | # Run mafToFasta.py for each patient for both SNVs and indels 51 | # ( Converts mutations in maf file to mutant peptides, generates wild-type peptides as well, and 52 | # writes both to outfile ) 53 | 54 | echo 'Running mafToFasta.py script for both SNVs and indels.' 55 | python mafToFastaV2.py $SNV_MAF 0 18,19,20 $PAT_DIR ../$PAT_DIR 56 | python mafToFastaV2.py $INDEL_MAF 1 18,19,20 $PAT_DIR ../$PAT_DIR 57 | 58 | # ----------------------------------------------------------------------------------------------- # 59 | 60 | # ----------------------------------------------------------------------------------------------- # 61 | # Run runNetMHCpan.py for each patient 62 | # ( Runs netMHCpan program to get predicted binding affinities for each peptide based on patient 63 | # HLA type ) 64 | # *NOTE*: 1 for NetMHCPan, 2 for NetMHCIIPan. Must run script twice if you want both. 65 | 66 | echo 'Running runNetMHCpan.py script.' 
67 | python runNetMHCpan.py ../$PAT_DIR/len18pep_FASTA_snv.txt,../$PAT_DIR/len18pep_FASTA_indel.txt,../$PAT_DIR/len19pep_FASTA_snv.txt,../$PAT_DIR/len19pep_FASTA_indel.txt,../$PAT_DIR/len20pep_FASTA_snv.txt,../$PAT_DIR/len20pep_FASTA_indel.txt $HLA_TYPE 18,18,19,19,20,20 2 ../$PAT_DIR 68 | 69 | # ----------------------------------------------------------------------------------------------- # 70 | 71 | 72 | # ----------------------------------------------------------------------------------------------- # 73 | # Run mutationPostProcess.py for each patient 74 | # ( Processes netMHCpan output to make a more user-friendly file incorporating both mutant and 75 | # wild-type data for each peptide ) 76 | # *NOTE*: 1 for NetMHCPan, 2 for NetMHCIIPan 77 | 78 | echo 'Running mutationPostProcess.py script.' 79 | python mutationPostProcess.py ../$PAT_DIR/NETMHCIIpan_out_18SNV.xls,../$PAT_DIR/NETMHCIIpan_out_18InDel.xls,../$PAT_DIR/NETMHCIIpan_out_19SNV.xls,../$PAT_DIR/NETMHCIIpan_out_19InDel.xls,../$PAT_DIR/NETMHCIIpan_out_20SNV.xls,../$PAT_DIR/NETMHCIIpan_out_20InDel.xls ../$PAT_DIR/len18pep_headermap_snv.txt,../$PAT_DIR/len18pep_headermap_indel.txt,../$PAT_DIR/len19pep_headermap_snv.txt,../$PAT_DIR/len19pep_headermap_indel.txt,../$PAT_DIR/len20pep_headermap_snv.txt,../$PAT_DIR/len20pep_headermap_indel.txt 18,18,19,19,20,20 $PAT_DIR 2 ../$PAT_DIR/ 80 | 81 | # ----------------------------------------------------------------------------------------------- # 82 | 83 | -------------------------------------------------------------------------------- /getMutNeoantigenBinders.sh: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------------------------- # 2 | # Claire Margolis 3 | # 21 March 2017 4 | # getMutNeoantigenBinders.sh 5 | # 6 | # Summary: Shell script that calls all python scripts to run neoantigen pipeline on batch of 7 | # samples. 
Intended to be submitted to UGER from the command line with task array 8 | # of samples. 9 | # Sample usage: qsub getMutNeoantigenBinders.sh 10 | # 11 | # *NOTE*: If you want to run this script, go through and verify that the paths to relevant files 12 | # are in the correct format for your cohort. You will need to change out_dir.txt, among other 13 | # things, to make the script specific to your cohort. 14 | 15 | # ----------------------------------------------------------------------------------------------- # 16 | 17 | # ----------------------------------------------------------------------------------------------- # 18 | # Specify shell / UGER preferences 19 | 20 | #!/bin/bash 21 | 22 | #$ -cwd 23 | #$ -q long 24 | #$ -m e 25 | #$ -l h_vmem=10g 26 | #$ -t 1-2 27 | # ----------------------------------------------------------------------------------------------- # 28 | 29 | # ----------------------------------------------------------------------------------------------- # 30 | # Use statements 31 | 32 | source /broad/software/scripts/useuse 33 | reuse Python-2.7 34 | reuse MySQL-5.6 35 | reuse R-3.3 36 | 37 | # ----------------------------------------------------------------------------------------------- # 38 | 39 | # ----------------------------------------------------------------------------------------------- # 40 | # Set directory paths 41 | 42 | patient_dirs=./patient_dirs.txt # File should contain sample names (which double as directory names), one per line 43 | hla_paths=./hla_paths.txt # File should contain paths to HLA allele files for each sample, one per line, in same order as patient_dirs.txt file 44 | snv_maf_paths=./snv_maf_paths.txt # File should contain paths to MuTect files for each sample, one per line, in same order as patient_dirs.txt file 45 | indel_maf_paths=./indel_maf_paths.txt # File should contain paths to Strelka files for each sample, one per line, in same order as patient_dirs.txt file 46 | PAT_DIR=$(cat $patient_dirs | head -n 
$SGE_TASK_ID | tail -n 1) 47 | HLA_PATH=$(cat $hla_paths | head -n $SGE_TASK_ID | tail -n 1) 48 | SNV_MAF_PATH=$(cat $snv_maf_paths | head -n $SGE_TASK_ID | tail -n 1) 49 | INDEL_MAF_PATH=$(cat $indel_maf_paths | head -n $SGE_TASK_ID | tail -n 1) 50 | 51 | # ----------------------------------------------------------------------------------------------- # 52 | 53 | # ----------------------------------------------------------------------------------------------- # 54 | # Run mafToFasta.py for each patient for both SNVs and indels 55 | # ( Converts mutations in maf file to mutant peptides, generates wild-type peptides as well, and 56 | # writes both to outfile ) 57 | 58 | echo 'Running mafToFasta.py script for both SNVs and indels.' 59 | python mafToFastaV2.py $SNV_MAF_PATH 0 9,10 $PAT_DIR ./$PAT_DIR # Only change last argument (./$PAT_DIR) to contain whatever output path desired 60 | python mafToFastaV2.py $INDEL_MAF_PATH 1 9,10 $PAT_DIR ./$PAT_DIR # Only change last argument to contain whatever output path you want 61 | 62 | # ----------------------------------------------------------------------------------------------- # 63 | 64 | # ----------------------------------------------------------------------------------------------- # 65 | # Run runNetMHCpan.py for each patient 66 | # ( Runs netMHCpan program to get predicted binding affinities for each peptide based on patient 67 | # HLA type ) 68 | 69 | echo 'Running runNetMHCpan.py script.' 
70 | python runNetMHCpan.py ./$PAT_DIR/len9pep_FASTA_snv.txt,./$PAT_DIR/len9pep_FASTA_indel.txt,./$PAT_DIR/len10pep_FASTA_snv.txt,./$PAT_DIR/len10pep_FASTA_indel.txt $HLA_PATH 9,9,10,10 1 ./$PAT_DIR # Change "./$PAT_DIR" parts if you wrote to a different output path above 71 | 72 | # ----------------------------------------------------------------------------------------------- # 73 | 74 | 75 | # ----------------------------------------------------------------------------------------------- # 76 | # Run mutationPostProcess.py for each patient 77 | # ( Processes netMHCpan output to make a more user-friendly file incorporating both mutant and 78 | # wild-type data for each peptide ) 79 | 80 | echo 'Running mutationPostProcess.py script.' 81 | python mutationPostProcess.py ./$PAT_DIR/NETMHCpan_out_9SNV.xls,./$PAT_DIR/NETMHCpan_out_9InDel.xls,./$PAT_DIR/NETMHCpan_out_10SNV.xls,./$PAT_DIR/NETMHCpan_out_10InDel.xls ./$PAT_DIR/len9pep_headermap_snv.txt,./$PAT_DIR/len9pep_headermap_indel.txt,./$PAT_DIR/len10pep_headermap_snv.txt,./$PAT_DIR/len10pep_headermap_indel.txt 9,9,10,10 $PAT_DIR 1 ./$PAT_DIR/ # Change "./$PAT_DIR" parts if you wrote to a different output path above 82 | 83 | # ----------------------------------------------------------------------------------------------- # 84 | 85 | -------------------------------------------------------------------------------- /runNetMHCpan.py: -------------------------------------------------------------------------------- 1 | # ----------------------------------------------------------------------------------------------- # 2 | # Claire Margolis 3 | # runNetMHCpan.py 4 | # 5 | # Summary: Takes in one or more FASTA files containing all of the peptides upon which netMHCpan 6 | # or netMHCIIpan is to be run. Runs whichever version of netMHCpan is requested and returns 7 | # the results in an appropriately-named output file. 
8 | # 9 | # Input format: python runNetMHCpan.py len9peptides.txt,len10peptides.txt HLAalleles.txt 9,10 1 outpath 10 | # Options for specifying which netMHCpan version: 11 | # 1 = netMHCIpan 12 | # 2 = netMHCIIpan 13 | # *RELEVANT*: HLA allele input file can be in one of two formats: 14 | # 1. Polysolver winners_hla.txt output file 15 | # example line from file: HLA-A hla_a_02_01_01_01 hla_a_32_01_01 16 | # 2. Already processed, one allele per line in netMHC compatible format 17 | # example line from file: HLA-A02:01 18 | # 19 | # Output: netMHCpan output .xls file(s) 20 | # 21 | # ----------------------------------------------------------------------------------------------- # 22 | 23 | 24 | # ----------------------------------------------------------------------------------------------- # 25 | # Import necessary packages 26 | 27 | #!/usr/bin/python 28 | import sys 29 | import numpy as np 30 | import subprocess 31 | import os 32 | 33 | # ----------------------------------------------------------------------------------------------- # 34 | 35 | 36 | # ----------------------------------------------------------------------------------------------- # 37 | # Function: runNetMHCIpan 38 | # Inputs: FASTA file of peptide sequences, patient HLA alleles (these are automatically given 39 | # by Polysolver and come in a .txt file that needs to be pre-processed into the correct format for 40 | # netMHCpan), peptide length, outpath 41 | # Returns: None (netMHCpan will automatically write output to a .xls file) 42 | # Summary: Pre-processes patient HLA alleles, runs netMHCIpan. 
def runNetMHCIpan(pepfile, hlafile, length, outpath):
    """Run netMHCpan (MHC class I) on one peptide FASTA file.

    Inputs:
        pepfile -- path to a peptide FASTA file whose base name contains
                   '_FASTA_snv' or '_FASTA_indel' (used to name the outputs)
        hlafile -- HLA allele file: either one allele per line, or tab-delimited
                   Polysolver output (locus, allele 1, allele 2 per line)
        length  -- peptide length, passed to netMHCpan via -l
        outpath -- directory that receives the .xls and .txt output files
    Returns: None (netMHCpan writes its own output files).
    Raises: ValueError if the FASTA name lacks '_FASTA_' or no allele is found.
    """
    varianttype = _variantLabel(pepfile)
    hlalines = _readHLALines(hlafile)
    # Determine which input format the hla allele file is in
    if len(hlalines[0].split('\t')) <= 1:  # In already pre-processed format
        hlaalleles = hlalines
    else:  # Polysolver output file
        hlaalleles = []
        for line in hlalines:
            if not line.strip():
                continue  # tolerate blank/trailing lines
            split = line.split('\t')
            # Reformat each allele (2 for each type of HLA A, B, and C),
            # e.g. hla_a_02_01_01_01 -> HLA-A02:01
            for i in (1, 2):
                components = split[i].split('_')
                hlaalleles.append('HLA-' + components[1].upper() + components[2] + ':' + components[3])
    hlastring = _alleleString(hlaalleles, hlafile)
    # Run netMHCI pan
    # NOTE(review): -version still points at a netMHCpan-3.0 path while every other
    # path targets /netMHCpan-4.1 -- confirm against the local installation.
    command = ('export NHOME=/netMHCpan-4.1; export NETMHCpan=/netMHCpan-4.1/Linux_x86_64; '
               '/netMHCpan-4.1/Linux_x86_64/bin/netMHCpan -a '+hlastring+' -f '+pepfile
               +' -inptype 0 -l '+str(length)+' -s -xls -xlsfile '+outpath+'/NETMHCpan_out_'
               +str(length)+varianttype+'.xls -allname /netMHCpan-4.1/Linux_x86_64/data/allelenames'
               ' -hlapseudo /netMHCpan-4.1/Linux_x86_64/data/MHC_pseudo.dat -t 500'
               ' -version /xchip/cga_home/margolis/Packages/netMHCPan/netMHCpan-3.0/data/version'
               ' -tdir /netMHCpan-4.1/scratch/XXXXXX -rdir /netMHCpan-4.1/Linux_x86_64/ > '
               +outpath+'/netMHCpanoutlen_'+str(length)+varianttype+'.txt')
    subprocess.call(command, shell=True)

    # Catch case where peptide file was empty (create dummy file); opening in
    # append mode leaves an existing netMHCpan result untouched
    dummyfile = outpath+'/NETMHCpan_out_'+str(length)+varianttype+'.xls'
    open(dummyfile, 'a').close()

    return

# ----------------------------------------------------------------------------------------------- #


# ----------------------------------------------------------------------------------------------- #
# Function: runNetMHCIIpan
# Inputs: FASTA file of peptide sequences, patient HLA alleles (these are automatically given
# by Polysolver and come in a .txt file that needs to be pre-processed into the correct format for
# netMHCIIpan), peptide length, outpath
# Returns: None (netMHCIIpan will automatically write output to a .xls file)
# Summary: Pre-processes patient HLA alleles, runs netMHCIIpan
def runNetMHCIIpan(pepfile, hlafile, length, outpath):
    """Run netMHCIIpan (MHC class II) on one peptide FASTA file.

    Takes the same arguments as runNetMHCIpan. A tab-delimited HLA file is read
    as PHLAT output, with DQA1, DQB1 and DRB1 taken from rows 4, 5 and 6
    (zero-based). netMHCIIpan is only invoked when the peptide file is larger
    than one byte; an (empty) .xls file is created either way.
    Raises: ValueError if the FASTA name lacks '_FASTA_' or no allele is found.
    """
    varianttype = _variantLabel(pepfile)
    hlalines = _readHLALines(hlafile)
    # Determine which input format the hla allele file is in
    if len(hlalines[0].split('\t')) <= 1:  # In already pre-processed format
        hlaalleles = hlalines
    else:  # PHLAT output file (rows are addressed by position, so lines are not filtered)
        dqa1 = [_phlatAlleleDigits(hlalines[4].split('\t')[i]) for i in (1, 2)]
        dqb1 = [_phlatAlleleDigits(hlalines[5].split('\t')[i]) for i in (1, 2)]
        drb1 = [_phlatAlleleDigits(hlalines[6].split('\t')[i]) for i in (1, 2)]
        # Concatenate the four DQ alpha/beta combinations, then the two DRB1 alleles
        hlaalleles = ['HLA-DQA1'+a+'-DQB1'+b for a in dqa1 for b in dqb1]
        hlaalleles += ['DRB1_'+d for d in drb1]
    hlastring = _alleleString(hlaalleles, hlafile)

    # Run netMHCIIpan if file is not empty
    if os.path.getsize(pepfile) > 1:
        # '-rdir' needs its leading dash to be parsed as an option (it was 'rdir')
        command = ('export NHOME=/netMHCIIpan-4.0; export NETMHCpan=/netMHCIIpan-4.0/Linux_x86_64; '
                   '/netMHCIIpan-4.0/netMHCIIpan -a '+hlastring+' -f '+pepfile+' -inptype 0 -length '
                   +str(length)+' -fast -filter 1 -affF 500 -rankF 2.0 -s -xls -xlsfile '+outpath
                   +'/NETMHCIIpan_out_'+str(length)+varianttype+'.xls -rdir /netMHCIIpan-4.0/Linux_x86_64/ > '
                   +outpath+'/netMHCIIpanoutlen_'+str(length)+varianttype+'.txt')
        subprocess.call(command, shell=True)

    # Catch case where peptide file was empty (create dummy file)
    dummyfile = outpath+'/NETMHCIIpan_out_'+str(length)+varianttype+'.xls'
    open(dummyfile, 'a').close()

    return

# ----------------------------------------------------------------------------------------------- #


# ----------------------------------------------------------------------------------------------- #
# Private helpers shared by runNetMHCIpan and runNetMHCIIpan

def _variantLabel(pepfile):
    """Map a '..._FASTA_snv.*' / '..._FASTA_indel.*' file name to 'SNV' / 'InDel' ('' otherwise)."""
    parts = os.path.basename(pepfile).split('_FASTA_')
    if len(parts) < 2:
        raise ValueError('Peptide file name must contain "_FASTA_": ' + pepfile)
    return {'snv': 'SNV', 'indel': 'InDel'}.get(parts[1].split('.')[0], '')


def _readHLALines(hlafile):
    """Return the lines of the HLA allele file, failing clearly when the file is empty."""
    with open(hlafile) as f:
        hlalines = f.read().splitlines()
    if not hlalines:
        raise ValueError('HLA allele file is empty: ' + hlafile)
    return hlalines


def _phlatAlleleDigits(field):
    """Turn a PHLAT field such as 'DQA1*01:02:01' into the four digits '0102'."""
    digits = field.split('*')[1][0:5]
    return digits.split(':')[0] + digits.split(':')[1]


def _alleleString(alleles, hlafile):
    """Join alleles with commas, dropping blanks and duplicates while keeping first-seen order."""
    seen = set()
    unique = []
    for allele in alleles:
        if allele.strip() and allele not in seen:
            seen.add(allele)
            unique.append(allele)
    if not unique:
        raise ValueError('No HLA alleles found in file: ' + hlafile)
    return ','.join(unique)

# ----------------------------------------------------------------------------------------------- #


# ----------------------------------------------------------------------------------------------- #
# Main function
def main():
    """Parse the five command-line arguments and run the requested predictor on each FASTA file.

    Exits with status 1 (after printing a message) on a wrong argument count or when the
    number of peptide lengths does not match the number of FASTA files.
    """
    # Check to make sure we have the right number of inputs
    if len(sys.argv) != 6:
        print('Error: incorrect number of inputs.')
        print('Please input FASTA file(s), a HLAalleles.txt file, the peptide length(s), a netMHCpan version, and an outpath.')
        sys.exit(1)
    # Parse inputs
    fastas = sys.argv[1]
    alleles = sys.argv[2]
    peplengths = sys.argv[3]
    versionchoice = sys.argv[4]
    outpath = sys.argv[5]
    # Split FASTA files and peptide lengths
    fastalist = fastas.split(',')
    lengthslist = peplengths.split(',')
    if len(fastalist) != len(lengthslist):
        print('Error: Please make sure your peptide lengths correspond to the fasta files and are in the same order.')
        sys.exit(1)
    # Run whichever netMHC version is desired ('1' = class I; anything else = class II)
    if versionchoice == '1':
        runner = runNetMHCIpan
    else:
        runner = runNetMHCIIpan
    for fasta, peplength in zip(fastalist, lengthslist):
        runner(fasta, alleles, peplength, outpath)

    return

if __name__ == '__main__':
    main()

# ----------------------------------------------------------------------------------------------- #


# --------------------------------------------------------------------------------
# /mutationPostProcess.py:
# --------------------------------------------------------------------------------
# ----------------------------------------------------------------------------------------------- #
# Claire Margolis
# mutationPostProcess.py
#
# Summary: Takes in NETMHC_out.xls file(s) and does postprocessing to create a more user-friendly
# output format.
7 | # Input format: python mutationPostProcess.py NETMHCpan_out9snv.xls,NETMHCpan_out9indel.xls,NETMHCpan_out10snv.xls,NETMHCpan_out10indel.xls 8 | # len9pep_headermap_snv,len9pep_headermap_indel,len10pep_headermap_snv,len10pep_headermap_indel 9,9,10,10 patientID version outpath 9 | # Output format: processedcombinedNETMHCpan_out.txt 10 | # 11 | # ----------------------------------------------------------------------------------------------- # 12 | 13 | 14 | # ----------------------------------------------------------------------------------------------- # 15 | # Import necessary packages 16 | 17 | #!/usr/bin/python 18 | import sys 19 | import numpy as np 20 | import subprocess 21 | import os 22 | # ----------------------------------------------------------------------------------------------- # 23 | 24 | 25 | # ----------------------------------------------------------------------------------------------- # 26 | # Function: processSingleFileOutput 27 | # Inputs: netMHCpan output .xls file (tab-delimited), header map file, peptide length, patient ID, version 28 | # Returns: An ndarray with rows corresponding to distinct binder peptides and columns corresponding 29 | # to binder features and metadata. 30 | # Summary: Postprocesses the netMHCpan output to eliminate useless rows, change data format from 31 | # wide to long, add allele name columns. Also incorporates information about the sequence from the 32 | # .maf file (which is stored in the header map file input parameter). 
def processSingleFileOutput(netMHCfile, mapfile, length, patID, version):
    """Convert one netMHCpan/netMHCIIpan .xls output into a 17-column array of annotated binders.

    Inputs:
        netMHCfile -- tab-delimited predictor output; line 1 holds the allele names, line 2 is
                      skipped as a header, the rest are data rows. A path containing "InDel"
                      is treated as an indel file, anything else as an SNV file.
        mapfile    -- tab-delimited header map; column 1 (minus its first character) is the
                      sequence key, column 2 a '|'-separated metadata string
        length     -- peptide length (converted with int()) stored in the pep_length column
        patID      -- not used: the name is rebound to an empty list below and the patient
                      value written to the output comes from the header map instead
        version    -- 1 selects the class I column layout, any other value the alternative one
    Returns: 2-D array with columns patID, sample, transcript, chrom_loc, gene, gene_num,
        cdna_change, prot_change, pep_pos, pep_length, hla, pep_mut, aff_mut, rank_mut,
        pep_wt, aff_wt, rank_wt; an empty (0, 17) array when netMHCfile has size 0.
    Raises: KeyError if a sequence key is missing from the header map, or if an SNV row that
        survives filtering was never paired with a wild-type row.
    """
    # Catch case where netMHCfile is empty
    if os.path.getsize(netMHCfile) == 0:
        return np.empty(shape=[0, 17])
    # Parse netMHC filename to get whether file is SNVs or InDels
    # (substring test on the whole path, so a directory named "...InDel..." also matches)
    snvorindel = 0
    if "InDel" in netMHCfile:
        snvorindel = 1
    length = int(length)
    # Read in first line of file to get number and names of alleles
    with open(netMHCfile, 'r') as f:
        alleles = f.readline().strip().split('\t')
    # NOTE(review): len(alleles) below needs a list; filter() returns one on Python 2 only
    alleles = filter(None, alleles) # Remove empty strings just in case
    # Read in rest of file (skip HLA alleles at the top and file header);
    # every cell is loaded as a fixed-width 40-byte string
    data = np.loadtxt(netMHCfile, dtype='S40', delimiter='\t', skiprows=2, ndmin=2)
    nrow = data.shape[0]
    ncol = data.shape[1]
    # Move columns so that data is in long form
    listofarrays = [] # Will store all allele-specific arrays
    initcols = data[:,0:3] # Initial three columns that are common to all HLA alleles
    if version == 1:
        # Allele i occupies the 4 columns starting at 3, 7, 11, ... (non-overlapping windows)
        for i in range(0, len(alleles)):
            currstartcol = (3*(i+1))+i
            currendcol = currstartcol+4
            currarray = data[:,currstartcol:currendcol]
            listofarrays.append(currarray)
        datav2 = np.vstack(tuple(listofarrays))
    else:
        # Windows start at 3, 6, 9, ... but are still 4 wide, so each one also takes the
        # first column of the next window.
        # NOTE(review): confirm this overlap matches the netMHCIIpan .xls column layout.
        for i in range(0, len(alleles)):
            currstartcol = (3*(i+1))
            currendcol = currstartcol+4
            currarray = data[:,currstartcol:currendcol]
            listofarrays.append(currarray)
        datav2 = np.vstack(tuple(listofarrays))
    # Add initial columns and allele column into data frame
    # Allele column: each allele name repeated nrow times, in the same order as the stacked blocks
    allelevec = []
    for i in range(0, len(alleles)):
        currnewcol = [alleles[i]]*nrow
        allelevec.extend(currnewcol)
    datav2 = np.insert(datav2, 1, allelevec, axis=1) # Add allele column to datalong
    # Initial columns: repeat the three shared columns once per allele block
    initcollist = []
    for i in range(0, len(listofarrays)):
        initcollist.append(initcols)
    initcolstoappend = np.vstack(tuple(initcollist))
    datav3 = np.concatenate((initcolstoappend, datav2), axis=1)
    # Drop the first per-allele value column. Resulting layout: 0-2 = the three shared columns
    # (used below as position, peptide and sequence ID), 3 = allele, 4-6 = the remaining
    # per-allele columns (5 is used as affinity and 6 as rank below).
    datav3 = np.delete(datav3, 3, axis=1)
    # Create mutant / WT dictionary for SNVs
    if snvorindel == 0:
        newnrow = datav3.shape[0]
        newncol = datav3.shape[1]
        mutWTdict = {} # 'ID|position|allele' of a mutant row -> 'ID|peptide|col4|col5|col6' of its WT row
        WTindices = [] # Row indices of paired wild-type rows, deleted afterwards
        for i in range(0, newnrow):
            if datav3[i,2].strip().split('_')[2] == 'mut': # For each mutant row, do:
                currkey = datav3[i,2]+'|'+datav3[i,0]+'|'+datav3[i,3]
                # Find the corresponding wild-type row: the first later row with the same
                # position whose ID starts with the same five characters.
                # NOTE(review): the allele is not compared and only a 5-character ID prefix is,
                # so correctness relies on the WT rows directly following their mutant rows --
                # confirm against the ID format written by mafToFasta.
                for j in range(i+1, newnrow):
                    if datav3[j,0] == datav3[i,0] and datav3[j,2][0:5] == datav3[i,2][0:5]:
                        currval = datav3[j,2]+'|'+datav3[j,1]+'|'+datav3[j,4]+'|'+datav3[j,5]+'|'+datav3[j,6]
                        mutWTdict[currkey] = currval
                        WTindices.append(j)
                        break
        # Delete WT rows
        datav4 = np.delete(datav3, WTindices, axis=0)
    else:
        datav4 = datav3
    # Eliminate any rows that have a rank above 2% (column 6 parsed as float)
    toremove = []
    newnrow2 = datav4.shape[0]
    for i in range(0, newnrow2):
        if float(datav4[i,6]) > 2:
            toremove.append(i)
    datav5 = np.delete(datav4, toremove, 0)
    # Read in map file and create map dictionary
    headerdict = {}
    with open(mapfile, 'r') as f:
        lines = f.read().splitlines()
    for line in lines:
        key = line.split('\t')[0][1:] # Drop the first character of the header field
        val = line.split('\t')[1]
        headerdict[key] = val
    # Initialize new ndarray columns (will eventually use np.column_stack to stack them all together into a numpy array)
    # NOTE: this rebinds the patID parameter to an empty list
    patID,sample,transcript,chrom_loc,gene,gene_num,cdna_change,prot_change,pep_pos,pep_length,hla,pep_mut,aff_mut,rank_mut,pep_wt,aff_wt,rank_wt = ([] for i in range(17))
    # For every row in current data array, use WT dict and header dict to find metainformation and save in a new ndarray
    newnrow3 = datav5.shape[0]
    for i in range(0, newnrow3):
        currrow = datav5[i,:]
        # Header-map key is the first two '_'-separated parts of the sequence ID
        seqnum = currrow[2].split('_')
        headerdictkey = seqnum[0]+'_'+seqnum[1]
        headervals = headerdict[headerdictkey].split('|')
        patID.append(headervals[0])
        sample.append(headervals[1])
        # Metadata fields 2 and 3 are swapped relative to the output column order
        transcript.append(headervals[3])
        chrom_loc.append(headervals[2])
        gene.append(headervals[4])
        gene_num.append(headervals[5])
        cdna_change.append(headervals[6])
        prot_change.append(headervals[7])
        pep_pos.append(currrow[0])
        pep_length.append(length)
        hla.append(currrow[3])
        pep_mut.append(currrow[1])
        aff_mut.append(currrow[5])
        rank_mut.append(currrow[6])
        if snvorindel == 0:
            # Same key format as currkey above; WTvals = [ID, peptide, col4, col5, col6]
            WTdictkey = currrow[2]+'|'+currrow[0]+'|'+currrow[3]
            WTvals = mutWTdict[WTdictkey].split('|')
            pep_wt.append(WTvals[1])
            aff_wt.append(WTvals[3])
            rank_wt.append(WTvals[4])
        else:
            # Indel files get no wild-type comparison
            pep_wt.append('n/a')
            aff_wt.append('n/a')
            rank_wt.append('n/a')
    # Join all lists into new numpy array
    datafull = np.column_stack((patID,sample,transcript,chrom_loc,gene,gene_num,cdna_change,prot_change,pep_pos,pep_length,hla,pep_mut,aff_mut,rank_mut,pep_wt,aff_wt,rank_wt))

    return datafull
# ----------------------------------------------------------------------------------------------- #


# ----------------------------------------------------------------------------------------------- #
# Function: writeToOutfile
# Inputs: final numpy array, patient ID, netMHCpan version, outpath
# Returns: None (writes to file)
# Summary: Takes the full numpy array and writes it, plus appropriate header, to a tab-delimited
# file in the specified outpath.
def writeToOutfile(array, patID, version, outpath):
    """Write the combined results array, preceded by a header row, to a tab-delimited file.

    Inputs:
        array   -- 2-D array of processed binder rows (one row per peptide)
        patID   -- patient ID, used as the output file name prefix
        version -- 1 names the file '..._processedcombinedNETMHCpan_out.txt'; any other value
                   names it '..._processedcombinedNETMHCIIpan_out.txt'
        outpath -- output directory (with or without a trailing slash)
    Returns: None (writes to file)
    """
    if version == 1:
        suffix = '_processedcombinedNETMHCpan_out.txt'
    else:
        suffix = '_processedcombinedNETMHCIIpan_out.txt'
    # os.path.join avoids a doubled separator when outpath already ends in '/'
    outfile = os.path.join(outpath, patID+suffix)
    headerstring = 'patient\tsample\ttranscript\tchrom_loc\tgene\tgene_num\tcdna_change\tprot_change\tmut_pos\tpep_length\tHLA\tpep_mut\taff_mut\trank_mut\tpep_wt\taff_wt\trank_wt'
    # comments='' stops numpy from prefixing the header line with '# '
    np.savetxt(outfile, array, fmt='%s', delimiter='\t', header=headerstring, comments='')

    return

# ----------------------------------------------------------------------------------------------- #


# ----------------------------------------------------------------------------------------------- #
# Main function
def main():
    """Parse the six command-line arguments, post-process every netMHC output file and write
    the combined table.

    Exits with status 1 (after printing a message) on a wrong argument count or when the
    header-map or peptide-length lists do not line up with the netMHC output files.
    """
    # Check to make sure we have the right number of inputs
    if len(sys.argv) != 7:
        print('Error: incorrect number of inputs.')
        print('Please input netMHC outfile(s), corresponding header map(s), corresponding length(s), the patient ID, version of netMHCpan that was run, and an outpath.')
        sys.exit(1)
    # Parse inputs
    netmhcoutfiles = sys.argv[1]
    headermapfiles = sys.argv[2]
    lengths = sys.argv[3]
    patientID = sys.argv[4]
    version = int(sys.argv[5])
    outfilepath = sys.argv[6]
    # Split FASTA files and peptide lengths, making sure there are no leading/trailing whitespaces
    netmhclist = [entry.strip() for entry in netmhcoutfiles.split(',')]
    headerlist = [entry.strip() for entry in headermapfiles.split(',')]
    lengthslist = [entry.strip() for entry in lengths.split(',')]
    if len(netmhclist) != len(headerlist):
        print('Error: Please make sure your header map files correspond to the netMHC outfiles and are in the same order.')
        sys.exit(1)
    # The lengths are indexed in parallel below, so validate them up front as well
    if len(netmhclist) != len(lengthslist):
        print('Error: Please make sure your peptide lengths correspond to the netMHC outfiles and are in the same order.')
        sys.exit(1)
    # Loop through each netMHC file and add in relevant information
    procarrays = []
    for i in range(len(netmhclist)):
        curroutputprocessed = processSingleFileOutput(netmhclist[i], headerlist[i], lengthslist[i], patientID, version)
        procarrays.append(curroutputprocessed)
    # Concatenate the processed arrays (a single array passes through unchanged) and write to outfile
    fullarray = np.concatenate(tuple(procarrays), axis=0)
    writeToOutfile(fullarray, patientID, version, outfilepath)

    return

if __name__ == '__main__':
    main()
# ----------------------------------------------------------------------------------------------- #

# --------------------------------------------------------------------------------
# /mafToFastaV2.py:
# --------------------------------------------------------------------------------
# ----------------------------------------------------------------------------------------------- #
# Claire Margolis
# mafToFasta.py
#
# Summary: Takes in a .maf file of either SNVs or InDels and translates mutations to peptide
# sequences of (desired length*2 - 1). These peptides will be fed into netMHCpan and will result
# in getting the binding affinities of peptide of desired length with at least one AA overlapping
# with the mutated nucleotide(s). Also outputs a map file of headers which will be used to process
# and annotate the netMHC output.
10 | # 11 | # Input format: python mafToFasta.py maffile maffiletype peptidelengths patientID outpath 12 | # maffiletype = 0 for SNVs, 1 for indels 13 | # peptidelengths = comma-separated list of lengths (e.g., 9,10 for netMHCpan) 14 | # 15 | # Outputs: 16 | # For each length of peptide desired: 17 | # len#pep_headermap_(snv/indel).txt 18 | # len#pep_FASTA_(snv/indel).txt 19 | # 20 | # ------------------------------------------------------------------------------------------- # 21 | 22 | 23 | # ----------------------------------------------------------------------------------------------- # 24 | # Import necessary packages 25 | 26 | #!/usr/bin/python 27 | import sys 28 | import numpy as np 29 | import subprocess 30 | from Bio.Seq import Seq 31 | from ConfigParser import ConfigParser 32 | # ----------------------------------------------------------------------------------------------- # 33 | 34 | 35 | # ----------------------------------------------------------------------------------------------- # 36 | # Function: DNASeqToProtein 37 | # Inputs: List of DNA sequences, corresponding list of headers, peptide length 38 | # Returns: List of protein sequences, corresponding list of headers 39 | # Summary: Translates sequences in-frame to get proteins 40 | def DNASeqToProtein(nucs, headers, length): 41 | # Initialize codon table and list of peptides 42 | codontable = {'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M', 'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T', 43 | 'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K', 'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R', 44 | 'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L', 'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P', 45 | 'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q', 'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R', 46 | 'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V', 'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A', 47 | 'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E', 'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G', 48 | 'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S', 'TTC':'F', 
'TTT':'F', 'TTA':'L', 'TTG':'L', 49 | 'TAC':'Y', 'TAT':'Y', 'TAA':'*', 'TAG':'*', 'TGC':'C', 'TGT':'C', 'TGA':'*', 'TGG':'W'} 50 | peptides = [] 51 | pepheaders = [] 52 | # Translate nucleotide sequences 53 | for n in range(0, len(nucs)): 54 | seq = nucs[n] 55 | fullprotein = '' 56 | for s in xrange(0, len(seq), 3): 57 | codon = seq[s:s+3] 58 | # Break if the sequence ends with trailing AAs (should never happen) or codon is not in table (i.e., has an "N" for unknwn base) 59 | if len(codon) != 3 or codon not in codontable: 60 | break 61 | # Find corresponding AA to codon 62 | AA = codontable[codon] 63 | fullprotein += AA 64 | # Stop at stop codon, if there is one 65 | if '*' in fullprotein: 66 | substrings = ' '.join(fullprotein.split('*')).split() 67 | if len(substrings[0]) >= int(length): 68 | peptides.append(substrings[0]) 69 | pepheaders.append(headers[n]) 70 | else: # Case when there is no stop codon in full-length protein 71 | peptides.append(fullprotein) 72 | pepheaders.append(headers[n]) 73 | return peptides, pepheaders 74 | # ----------------------------------------------------------------------------------------------- # 75 | 76 | 77 | # ----------------------------------------------------------------------------------------------- # 78 | # Function: MutationsToDNASeq 79 | # Inputs: maffile, peptide length, patient ID, outpath, snv/indel indicator 80 | # Returns: List of mutant/wild type DNA sequences of desired length, list of corresponding headers 81 | # Summary: Takes in maf file, peptide length, patient ID, and outpath. Finds ORF orientation at 82 | # mutation location from .maf file Codon_change field, based on ORF orientation calculates nucleotide 83 | # window to yield correct number of peptides flanking the mutation, calls twoBitToFa function to 84 | # get nucleotide sequence, writes header map to output file. 
def MutationsToDNASeq(maf, length, patID, outpath, indicator, cds_path, cdna_path):
    """Build mutant (and, for SNVs, wild-type) nucleotide windows for every usable maf row.

    Inputs:
        maf       -- path to the tab-delimited .maf file
        length    -- peptide length in amino acids (int or numeric string)
        patID     -- patient ID, written into the header map
        outpath   -- directory for the header map / FASTA files
        indicator -- 0 for an SNV maf, anything else for an InDel maf
        cds_path  -- reference FASTA searched for ordinary mutations
        cdna_path -- reference FASTA searched for nonstop mutations
    Returns: (seqlist, headerlist). For SNVs each mutation contributes a '_mut' and a '_wt' entry;
        for InDels only a '_mut' entry.
    Side effects: writes len<length>pep_headermap_(snv/indel).txt. Nonstop mutations are not
        returned; they are translated here and appended directly to the InDel FASTA and InDel
        header map, labelled with a character from nonstopalphabet instead of a number.
    """
    # Read in maf file (desired columns only). After loading, row[i] holds, in this order:
    #['Hugo_Symbol' 'Entrez_Gene_Id' 'Chromosome' 'Start_position' 'End_position' 'Variant_Classification' 'Tumor_Seq_Allele2'
    # 'Tumor_Sample_Barcode' 'Annotation_Transcript' 'Transcript_Strand' 'cDNA_Change' 'Codon_Change' 'Protein_Change' 'Variant_Type'
    # 'Transcript_Position' 'Reference_Allele']
    # NOTE(review): the column numbers are hard-coded, so this presumably expects one specific
    # annotated maf layout -- confirm against the files the pipeline is run on.
    mafarray = np.loadtxt(maf, dtype=str, delimiter='\t', skiprows=0, usecols=(0,1,4,5,6,8,12,15, 305, 222, 302, 89, 287, 9, 278, 10), ndmin=2)
    # Create dictionary containing lengths to go backward and forward based on ORF orientation
    distancedict = {0:[3,0], 1:[2,1], 2:[1,2]}
    # Open header map file for writing
    # Map file will be labeled differently for SNVs and InDels
    if indicator == 0:
        headermapfile = open(outpath+'/len'+str(length)+'pep_headermap_snv.txt', 'w')
    else:
        headermapfile = open(outpath+'/len'+str(length)+'pep_headermap_indel.txt', 'a')
    # Translate length from AAs to nucleotides
    length = int(length)*3
    # Nonstop mutations always go to the InDel outputs, whichever maf type is being processed
    nonstopheadermappath = outpath+'/len'+str(length//3)+'pep_headermap_indel.txt'
    nonstopfastapath = outpath+'/len'+str(length//3)+'pep_FASTA_indel.txt'
    # Initialize sequence list, header list
    seqlist = []
    headerlist = []
    counter = 1
    # Loop through maf array and generate DNA sequences for each sequence
    nonstopcounter = -1
    # NOTE(review): more nonstop mutations than characters here would raise an IndexError below
    nonstopalphabet = 'abcdefghijklmnopqrstuvwxyz!@#$%^&*()~`"/,'
    wantedclasses = ('Missense_Mutation', 'Frame_Shift_Ins', 'Frame_Shift_Del', 'Nonstop_Mutation', 'In_Frame_Ins', 'In_Frame_Del')
    for row in mafarray:

        # Check to make sure mutation is one that we care about and skip to next sequence if not
        classification = row[5]
        if classification not in wantedclasses:
            continue
        # Decide the nonstop state from THIS row only. It used to be a flag carried across
        # iterations that was not cleared when a nonstop mutation was skipped for a missing
        # reference sequence, so the following mutation was wrongly handled as nonstop.
        isnonstop = 0
        if classification == 'Nonstop_Mutation':
            isnonstop = 1
            nonstopcounter += 1
            # Make sure the InDel header map exists even if this mutation is skipped later on
            open(nonstopheadermappath, 'a').close()
        # Calculate coding strand start and end positions
        orig_start = 0
        orig_end = 0
        if indicator == 0 and isnonstop == 0: # SNVs (but NOT nonstop mutations)
            if row[13] == 'SNP':
                orig_start = int(((row[10].split('.')[1]).split('>')[0])[0:-1])-1 # Subtract because MAFs are 1-indexed but python is 0-indexed
                orig_end = orig_start # SNVs only affect one position
            else: # case of DNPs or TNPs or ONPs
                orig_start = int((row[10].split('.')[1]).split('_')[0])-1
                orig_end = orig_start + len(row[6].strip()) - 1 # Will be 2 for DNPs, 3 for TNPs, ... for ONPs
        elif indicator == 0 and isnonstop == 1: # Nonstop Mutations specifically
            if row[13] == 'SNP':
                if '+' in row[9]: # Deal with cases on positive strand separately
                    orig_start = int(row[14].strip())-1 # Subtract one because positive strand transcript positions seem to be 1-indexed
                    orig_end = orig_start
                else:
                    orig_start = int(row[14].strip()) # Negative strand transcript positions seem to be 0-indexed...?
                    orig_end = orig_start
            else: # case of DNP/TNP/ONPs that are also nonstop mutations
                # NOTE(review): there is no negative-strand branch here, so such rows keep
                # orig_start = orig_end = 0 -- confirm whether that is intended.
                if '+' in row[9]:
                    orig_start = int(row[14].split('_')[0])-1
                    orig_end = orig_start + len(row[6].strip()) - 1
        else: # InDels
            if row[13] == 'DEL': # deletion
                orig_start = int(((row[10].split('.')[1]).split('del')[0]).split('_')[0])-1
                if '_' in row[10]:
                    orig_end = int((row[10].split('del')[0]).split('_')[1])-1
                else:
                    orig_end = orig_start
            else: # insertion
                orig_start = int(((row[10].split('.')[1]).split('ins')[0]).split('_')[0])
                if '_' in row[10]:
                    orig_end = int((row[10].split('ins')[0]).split('_')[1])-1
                else:
                    orig_end = orig_start

        # Calculate mutation length
        mut_length = len(row[15].strip())

        # Calculate ORF orientation at mutation start site
        if indicator == 0: # If SNV, use codon_change .maf field
            codonchange = (row[11].split(')')[1]).split('>')[0]
            orfpos = 0 # Initialize variable
            # The first upper-case character of the reference codon marks the mutated position
            for i in range(0, len(codonchange)):
                if codonchange[i].isupper():
                    orfpos = i # Set ORF variable
                    break
        else: # If InDel, use codon_change .maf field but do further processing
            codonstartnum = (((row[11].split('('))[1]).split('-'))[0]
            cdnastartnum = 0
            if row[13] == 'DEL':
                cdnastartnum = ((((row[10].split('c.'))[1]).split('del'))[0]).split('_')[0]
            else:
                cdnastartnum = ((((row[10].split('c.'))[1]).split('ins'))[0]).split('_')[0]
                # NOTE(review): the +1 is applied to insertions only, mirroring the un-decremented
                # insertion orig_start above -- confirm against the original indentation.
                cdnastartnum = int(cdnastartnum)+1
            orfpos = (int(cdnastartnum) - int(codonstartnum)) % 3

        # Set new start and end positions for chromosome region with appropriate nucleotide region around mutation site based on ORF orientation
        start = 0
        end = 0
        if indicator == 0: # For SNVs, do this:
            snvlength = orig_end-orig_start+1
            start = orig_start - (length - distancedict[orfpos][0])
            end = orig_end + (length - distancedict[orfpos][1])
            if row[13] != 'SNP': #Account for DNPs
                start = start + 3
                end = end + 3
        else: # For InDels, do this:
            start = orig_start - (length - distancedict[orfpos][0])
            end = orig_start + mut_length + (length - distancedict[orfpos][1])

        # Pull the sequence for the transcript of interest out of the reference FASTA
        annot_transcript = row[8].split('.')[0]
        if isnonstop == 0:
            ref_path = cds_path
        else:
            ref_path = cdna_path
        # The first sed prints from the line matching the transcript ID through the next line
        # containing '>'; the second sed drops the first and last of those lines.
        # NOTE(review): transcript ID and path are interpolated into a shell command unescaped, and
        # for the final record of the FASTA the second sed would drop a sequence line instead of a
        # following header -- both worth confirming/hardening.
        command = "sed -n -e '/"+annot_transcript+"/,/>/ p' "+ref_path+" | sed -e '1d;$d'"
        codingseq = subprocess.check_output(command, shell=True)

        # Check to see whether transcript sequence has an entry in the reference genome (if not, continue)
        if len(codingseq) == 0:
            print('Error: Reference does not contain coding sequence for transcript '+annot_transcript+'. Skipping this mutation.')
            continue

        # Get length of coding sequence plus position of mutation, and get desired sequence start and end indices
        codingseq = codingseq.replace('\n','')
        seqlength = len(codingseq)
        seqstart = 0
        seqend = 0
        if start >= 0:
            seqstart = start
        else:
            seqstart = 0
        if classification == 'Frame_Shift_Ins' or classification == 'Frame_Shift_Del': # Special case of frameshift mutations
            seqend = seqlength-1
        else:
            if end <= seqlength-1:
                seqend = end
            else:
                seqend = seqlength-1

        # Retrieve sequence desired
        sequence = codingseq[seqstart:seqend+1]

        # Substitute in mutation at appropriate position
        disttomut = orig_start - seqstart
        if indicator == 0: # SNV
            mutregion = row[10].split('>')[1]
            mutatedseq = sequence[0:disttomut]+mutregion+sequence[disttomut+snvlength:]
        else: # InDel
            if row[13] == 'DEL':
                mutatedseq = sequence[0:disttomut]+sequence[disttomut+mut_length:]
            else:
                mutregion = row[10].split('ins')[1]
                mutatedseq = sequence[0:disttomut]+mutregion+sequence[disttomut:]

        # Deal with nonstop mutations in their own way (separately)
        if isnonstop == 1:
            nonstopseqlist = [mutatedseq]
            nonstopheaderlist = ['>seq_'+nonstopalphabet[nonstopcounter]+'_mut']
            nonstoppeptide, nonstoppepheader = DNASeqToProtein(nonstopseqlist, nonstopheaderlist, length//3)
            if len(nonstoppeptide) < 1 or len(nonstoppepheader) < 1: # If we hit a stop codon and can't make a large enough peptide, continue
                continue
            # Append to the InDel FASTA / header map; both handles are closed straight away
            with open(nonstopfastapath, 'a') as f:
                f.write(nonstoppepheader[0]+'\n'+nonstoppeptide[0]+'\n')
            with open(nonstopheadermappath, 'a') as nonstopheadermapfile:
                nonstopheadermapfile.write('>seq_'+nonstopalphabet[nonstopcounter]+'\t'+patID+'|'+row[7]+'|chr'+row[2]+':'+row[3]+'-'+row[4]+'|'+row[8]+'|'+row[0]+'|'+row[1]+'|'+row[10]+'|'+row[12]+'\n')
            continue

        # Add sequences to lists (mutant and WT for SNV, just mutant for indel)
        seqlist.append(mutatedseq)
        headerlist.append('>seq_'+str(counter)+'_mut')
        if indicator == 0:
            seqlist.append(sequence)
            headerlist.append('>seq_'+str(counter)+'_wt')
        # Write maf annotation information to map file (will be used in netMHC postprocessing)
        headermapfile.write('>seq_'+str(counter)+'\t'+patID+'|'+row[7]+'|chr'+row[2]+':'+row[3]+'-'+row[4]+'|'+row[8]+'|'+row[0]+'|'+row[1]+'|'+row[10]+'|'+row[12]+'\n')
        counter += 1
    headermapfile.close()

    return seqlist, headerlist
# ----------------------------------------------------------------------------------------------- #


# ----------------------------------------------------------------------------------------------- #
# Function: writeToOutfile
# Inputs: list of peptides, corresponding headers, current length, outpath, SNV vs. InDel indicator
# Returns: None (writes to file)
# Summary: Writes header+peptide combos to a file, one item per line, that will be an input to netMHC.
# Example output file name: path/len9pep_FASTA_snv.txt (this file would contain SNV peptides of length 9).
279 | def writeToOutfile(peps, headers, length, outpath, indicator): 280 | filehandle = '' 281 | # If SNVs, do this: 282 | if indicator == 0: 283 | filehandle = outpath+'/len'+str(length)+'pep_FASTA_snv.txt' 284 | # Loop through the peptide, header lists and write to filehandle 285 | f = open(filehandle, 'a') 286 | # In the case of SNVs, need to check to make sure every mutant has corresponding wt and vice versa 287 | for i in range(0, len(peps)): 288 | if (len(peps[i])) > 0: 289 | if 'mut' in headers[i]: 290 | if '>seq_'+headers[i].split('_')[1]+'_wt' in headers: 291 | f.write(headers[i]+'\n'+peps[i]+'\n') 292 | else: 293 | if '>seq_'+headers[i].split('_')[1]+'_mut' in headers: 294 | f.write(headers[i]+'\n'+peps[i]+'\n') 295 | f.close() 296 | # If InDels, do this: 297 | else: 298 | filehandle = outpath+'/len'+str(length)+'pep_FASTA_indel.txt' 299 | # Loop through the peptide, header lists and write to filehandle 300 | f = open(filehandle, 'a') 301 | for i in range(0, len(peps)): # The peptide and header lists will always be the same length 302 | if len(peps[i]) > 0: 303 | f.write(headers[i]+'\n'+peps[i]+'\n') 304 | f.close() 305 | 306 | return 307 | # ----------------------------------------------------------------------------------------------- # 308 | 309 | 310 | # ----------------------------------------------------------------------------------------------- # 311 | # Main function 312 | def main(): 313 | # Check to make sure we have the right number of inputs 314 | if len(sys.argv) != 8: 315 | print 'Error: incorrect number of inputs.' 316 | print 'Please input a .maf file, .maf file type, the peptide lengths you want, the patient ID, an outfile path, and the paths to the cds and cdna reference files.' 
317 | sys.exit() 318 | # Store inputs 319 | maffile = sys.argv[1] 320 | snvorindel = int(sys.argv[2]) 321 | lengthlist = sys.argv[3].split(',') 322 | patientID = sys.argv[4] 323 | outpath = sys.argv[5] 324 | cds_fasta_path = sys.argv[6] 325 | cdna_fasta_path = sys.argv[7] 326 | # For each peptide length in list, do this: 327 | for length in lengthlist: 328 | # Convert mutation into nucleotide sequences 329 | nucleotideseqs, nucleotideheaders = MutationsToDNASeq(maffile, length, patientID, outpath, snvorindel, cds_fasta_path, cdna_fasta_path) 330 | # Convert nucleotide sequences into peptide sequences 331 | peptideseqs, peptideheaders = DNASeqToProtein(nucleotideseqs, nucleotideheaders, length) 332 | # Print to outfile 333 | writeToOutfile(peptideseqs, peptideheaders, length, outpath, snvorindel) 334 | 335 | return 336 | 337 | if __name__ == '__main__': 338 | main() 339 | # ----------------------------------------------------------------------------------------------- # 340 | 341 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU GENERAL PUBLIC LICENSE 2 | Version 2, June 1991 3 | 4 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., 5 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 6 | Everyone is permitted to copy and distribute verbatim copies 7 | of this license document, but changing it is not allowed. 8 | 9 | Preamble 10 | 11 | The licenses for most software are designed to take away your 12 | freedom to share and change it. By contrast, the GNU General Public 13 | License is intended to guarantee your freedom to share and change free 14 | software--to make sure the software is free for all its users. This 15 | General Public License applies to most of the Free Software 16 | Foundation's software and to any other program whose authors commit to 17 | using it. 
(Some other Free Software Foundation software is covered by 18 | the GNU Lesser General Public License instead.) You can apply it to 19 | your programs, too. 20 | 21 | When we speak of free software, we are referring to freedom, not 22 | price. Our General Public Licenses are designed to make sure that you 23 | have the freedom to distribute copies of free software (and charge for 24 | this service if you wish), that you receive source code or can get it 25 | if you want it, that you can change the software or use pieces of it 26 | in new free programs; and that you know you can do these things. 27 | 28 | To protect your rights, we need to make restrictions that forbid 29 | anyone to deny you these rights or to ask you to surrender the rights. 30 | These restrictions translate to certain responsibilities for you if you 31 | distribute copies of the software, or if you modify it. 32 | 33 | For example, if you distribute copies of such a program, whether 34 | gratis or for a fee, you must give the recipients all the rights that 35 | you have. You must make sure that they, too, receive or can get the 36 | source code. And you must show them these terms so they know their 37 | rights. 38 | 39 | We protect your rights with two steps: (1) copyright the software, and 40 | (2) offer you this license which gives you legal permission to copy, 41 | distribute and/or modify the software. 42 | 43 | Also, for each author's protection and ours, we want to make certain 44 | that everyone understands that there is no warranty for this free 45 | software. If the software is modified by someone else and passed on, we 46 | want its recipients to know that what they have is not the original, so 47 | that any problems introduced by others will not reflect on the original 48 | authors' reputations. 49 | 50 | Finally, any free program is threatened constantly by software 51 | patents. 
We wish to avoid the danger that redistributors of a free 52 | program will individually obtain patent licenses, in effect making the 53 | program proprietary. To prevent this, we have made it clear that any 54 | patent must be licensed for everyone's free use or not licensed at all. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | GNU GENERAL PUBLIC LICENSE 60 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 61 | 62 | 0. This License applies to any program or other work which contains 63 | a notice placed by the copyright holder saying it may be distributed 64 | under the terms of this General Public License. The "Program", below, 65 | refers to any such program or work, and a "work based on the Program" 66 | means either the Program or any derivative work under copyright law: 67 | that is to say, a work containing the Program or a portion of it, 68 | either verbatim or with modifications and/or translated into another 69 | language. (Hereinafter, translation is included without limitation in 70 | the term "modification".) Each licensee is addressed as "you". 71 | 72 | Activities other than copying, distribution and modification are not 73 | covered by this License; they are outside its scope. The act of 74 | running the Program is not restricted, and the output from the Program 75 | is covered only if its contents constitute a work based on the 76 | Program (independent of having been made by running the Program). 77 | Whether that is true depends on what the Program does. 78 | 79 | 1. 
You may copy and distribute verbatim copies of the Program's 80 | source code as you receive it, in any medium, provided that you 81 | conspicuously and appropriately publish on each copy an appropriate 82 | copyright notice and disclaimer of warranty; keep intact all the 83 | notices that refer to this License and to the absence of any warranty; 84 | and give any other recipients of the Program a copy of this License 85 | along with the Program. 86 | 87 | You may charge a fee for the physical act of transferring a copy, and 88 | you may at your option offer warranty protection in exchange for a fee. 89 | 90 | 2. You may modify your copy or copies of the Program or any portion 91 | of it, thus forming a work based on the Program, and copy and 92 | distribute such modifications or work under the terms of Section 1 93 | above, provided that you also meet all of these conditions: 94 | 95 | a) You must cause the modified files to carry prominent notices 96 | stating that you changed the files and the date of any change. 97 | 98 | b) You must cause any work that you distribute or publish, that in 99 | whole or in part contains or is derived from the Program or any 100 | part thereof, to be licensed as a whole at no charge to all third 101 | parties under the terms of this License. 102 | 103 | c) If the modified program normally reads commands interactively 104 | when run, you must cause it, when started running for such 105 | interactive use in the most ordinary way, to print or display an 106 | announcement including an appropriate copyright notice and a 107 | notice that there is no warranty (or else, saying that you provide 108 | a warranty) and that users may redistribute the program under 109 | these conditions, and telling the user how to view a copy of this 110 | License. (Exception: if the Program itself is interactive but 111 | does not normally print such an announcement, your work based on 112 | the Program is not required to print an announcement.) 
113 | 114 | These requirements apply to the modified work as a whole. If 115 | identifiable sections of that work are not derived from the Program, 116 | and can be reasonably considered independent and separate works in 117 | themselves, then this License, and its terms, do not apply to those 118 | sections when you distribute them as separate works. But when you 119 | distribute the same sections as part of a whole which is a work based 120 | on the Program, the distribution of the whole must be on the terms of 121 | this License, whose permissions for other licensees extend to the 122 | entire whole, and thus to each and every part regardless of who wrote it. 123 | 124 | Thus, it is not the intent of this section to claim rights or contest 125 | your rights to work written entirely by you; rather, the intent is to 126 | exercise the right to control the distribution of derivative or 127 | collective works based on the Program. 128 | 129 | In addition, mere aggregation of another work not based on the Program 130 | with the Program (or with a work based on the Program) on a volume of 131 | a storage or distribution medium does not bring the other work under 132 | the scope of this License. 133 | 134 | 3. 
You may copy and distribute the Program (or a work based on it, 135 | under Section 2) in object code or executable form under the terms of 136 | Sections 1 and 2 above provided that you also do one of the following: 137 | 138 | a) Accompany it with the complete corresponding machine-readable 139 | source code, which must be distributed under the terms of Sections 140 | 1 and 2 above on a medium customarily used for software interchange; or, 141 | 142 | b) Accompany it with a written offer, valid for at least three 143 | years, to give any third party, for a charge no more than your 144 | cost of physically performing source distribution, a complete 145 | machine-readable copy of the corresponding source code, to be 146 | distributed under the terms of Sections 1 and 2 above on a medium 147 | customarily used for software interchange; or, 148 | 149 | c) Accompany it with the information you received as to the offer 150 | to distribute corresponding source code. (This alternative is 151 | allowed only for noncommercial distribution and only if you 152 | received the program in object code or executable form with such 153 | an offer, in accord with Subsection b above.) 154 | 155 | The source code for a work means the preferred form of the work for 156 | making modifications to it. For an executable work, complete source 157 | code means all the source code for all modules it contains, plus any 158 | associated interface definition files, plus the scripts used to 159 | control compilation and installation of the executable. However, as a 160 | special exception, the source code distributed need not include 161 | anything that is normally distributed (in either source or binary 162 | form) with the major components (compiler, kernel, and so on) of the 163 | operating system on which the executable runs, unless that component 164 | itself accompanies the executable. 
165 | 166 | If distribution of executable or object code is made by offering 167 | access to copy from a designated place, then offering equivalent 168 | access to copy the source code from the same place counts as 169 | distribution of the source code, even though third parties are not 170 | compelled to copy the source along with the object code. 171 | 172 | 4. You may not copy, modify, sublicense, or distribute the Program 173 | except as expressly provided under this License. Any attempt 174 | otherwise to copy, modify, sublicense or distribute the Program is 175 | void, and will automatically terminate your rights under this License. 176 | However, parties who have received copies, or rights, from you under 177 | this License will not have their licenses terminated so long as such 178 | parties remain in full compliance. 179 | 180 | 5. You are not required to accept this License, since you have not 181 | signed it. However, nothing else grants you permission to modify or 182 | distribute the Program or its derivative works. These actions are 183 | prohibited by law if you do not accept this License. Therefore, by 184 | modifying or distributing the Program (or any work based on the 185 | Program), you indicate your acceptance of this License to do so, and 186 | all its terms and conditions for copying, distributing or modifying 187 | the Program or works based on it. 188 | 189 | 6. Each time you redistribute the Program (or any work based on the 190 | Program), the recipient automatically receives a license from the 191 | original licensor to copy, distribute or modify the Program subject to 192 | these terms and conditions. You may not impose any further 193 | restrictions on the recipients' exercise of the rights granted herein. 194 | You are not responsible for enforcing compliance by third parties to 195 | this License. 196 | 197 | 7. 
If, as a consequence of a court judgment or allegation of patent 198 | infringement or for any other reason (not limited to patent issues), 199 | conditions are imposed on you (whether by court order, agreement or 200 | otherwise) that contradict the conditions of this License, they do not 201 | excuse you from the conditions of this License. If you cannot 202 | distribute so as to satisfy simultaneously your obligations under this 203 | License and any other pertinent obligations, then as a consequence you 204 | may not distribute the Program at all. For example, if a patent 205 | license would not permit royalty-free redistribution of the Program by 206 | all those who receive copies directly or indirectly through you, then 207 | the only way you could satisfy both it and this License would be to 208 | refrain entirely from distribution of the Program. 209 | 210 | If any portion of this section is held invalid or unenforceable under 211 | any particular circumstance, the balance of the section is intended to 212 | apply and the section as a whole is intended to apply in other 213 | circumstances. 214 | 215 | It is not the purpose of this section to induce you to infringe any 216 | patents or other property right claims or to contest validity of any 217 | such claims; this section has the sole purpose of protecting the 218 | integrity of the free software distribution system, which is 219 | implemented by public license practices. Many people have made 220 | generous contributions to the wide range of software distributed 221 | through that system in reliance on consistent application of that 222 | system; it is up to the author/donor to decide if he or she is willing 223 | to distribute software through any other system and a licensee cannot 224 | impose that choice. 225 | 226 | This section is intended to make thoroughly clear what is believed to 227 | be a consequence of the rest of this License. 228 | 229 | 8. 
If the distribution and/or use of the Program is restricted in 230 | certain countries either by patents or by copyrighted interfaces, the 231 | original copyright holder who places the Program under this License 232 | may add an explicit geographical distribution limitation excluding 233 | those countries, so that distribution is permitted only in or among 234 | countries not thus excluded. In such case, this License incorporates 235 | the limitation as if written in the body of this License. 236 | 237 | 9. The Free Software Foundation may publish revised and/or new versions 238 | of the General Public License from time to time. Such new versions will 239 | be similar in spirit to the present version, but may differ in detail to 240 | address new problems or concerns. 241 | 242 | Each version is given a distinguishing version number. If the Program 243 | specifies a version number of this License which applies to it and "any 244 | later version", you have the option of following the terms and conditions 245 | either of that version or of any later version published by the Free 246 | Software Foundation. If the Program does not specify a version number of 247 | this License, you may choose any version ever published by the Free Software 248 | Foundation. 249 | 250 | 10. If you wish to incorporate parts of the Program into other free 251 | programs whose distribution conditions are different, write to the author 252 | to ask for permission. For software which is copyrighted by the Free 253 | Software Foundation, write to the Free Software Foundation; we sometimes 254 | make exceptions for this. Our decision will be guided by the two goals 255 | of preserving the free status of all derivatives of our free software and 256 | of promoting the sharing and reuse of software generally. 257 | 258 | NO WARRANTY 259 | 260 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 261 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN 262 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 263 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 264 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 265 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 266 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 267 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 268 | REPAIR OR CORRECTION. 269 | 270 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 271 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 272 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 273 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 274 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 275 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 276 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 277 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 278 | POSSIBILITY OF SUCH DAMAGES. 279 | 280 | END OF TERMS AND CONDITIONS 281 | 282 | How to Apply These Terms to Your New Programs 283 | 284 | If you develop a new program, and you want it to be of the greatest 285 | possible use to the public, the best way to achieve this is to make it 286 | free software which everyone can redistribute and change under these terms. 287 | 288 | To do so, attach the following notices to the program. It is safest 289 | to attach them to the start of each source file to most effectively 290 | convey the exclusion of warranty; and each file should have at least 291 | the "copyright" line and a pointer to where the full notice is found. 
292 | 293 | <one line to give the program's name and a brief idea of what it does.> 294 | Copyright (C) <year> <name of author> 295 | 296 | This program is free software; you can redistribute it and/or modify 297 | it under the terms of the GNU General Public License as published by 298 | the Free Software Foundation; either version 2 of the License, or 299 | (at your option) any later version. 300 | 301 | This program is distributed in the hope that it will be useful, 302 | but WITHOUT ANY WARRANTY; without even the implied warranty of 303 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 304 | GNU General Public License for more details. 305 | 306 | You should have received a copy of the GNU General Public License along 307 | with this program; if not, write to the Free Software Foundation, Inc., 308 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 309 | 310 | Also add information on how to contact you by electronic and paper mail. 311 | 312 | If the program is interactive, make it output a short notice like this 313 | when it starts in an interactive mode: 314 | 315 | Gnomovision version 69, Copyright (C) year name of author 316 | Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. 317 | This is free software, and you are welcome to redistribute it 318 | under certain conditions; type `show c' for details. 319 | 320 | The hypothetical commands `show w' and `show c' should show the appropriate 321 | parts of the General Public License. Of course, the commands you use may 322 | be called something other than `show w' and `show c'; they could even be 323 | mouse-clicks or menu items--whatever suits your program. 324 | 325 | You should also get your employer (if you work as a programmer) or your 326 | school, if any, to sign a "copyright disclaimer" for the program, if 327 | necessary. Here is a sample; alter the names: 328 | 329 | Yoyodyne, Inc., hereby disclaims all copyright interest in the program 330 | `Gnomovision' (which makes passes at compilers) written by James Hacker.
331 | 332 | <signature of Ty Coon>, 1 April 1989 333 | Ty Coon, President of Vice 334 | 335 | This General Public License does not permit incorporating your program into 336 | proprietary programs. If your program is a subroutine library, you may 337 | consider it more useful to permit linking proprietary applications with the 338 | library. If this is what you want to do, use the GNU Lesser General 339 | Public License instead of this License. 340 | --------------------------------------------------------------------------------