├── Dockerfile
├── README.md
├── assets
    ├── Mods.txt
    └── workflow.png
├── bin
    ├── calculate_phastcons.py
    ├── calculate_phylocsf.py
    ├── extract_1mismatch_novpsm.py
    ├── group_novpepToLoci.py
    ├── label_nsSNP_pep.py
    ├── label_sub_pos.py
    ├── map_cosmic_snp_tohg19.py
    ├── map_novelpeptide2genome.py
    ├── parse_BLASTP_out.py
    ├── parse_BLAT_out.py
    ├── parse_annovar_out.py
    ├── parse_spectrumAI_out.py
    ├── peptide_pi_annotator.py
    ├── pi_database_splitter.py
    ├── reverse_decoy.py
    └── scan_bams.py
├── conf
    ├── base.config
    ├── sixft.config
    └── uppmax.config
├── envs
    └── environment.yml
├── main.nf
└── nextflow.config


/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM nfcore/base
 2 | LABEL description="Docker image containing all requirements for IPAW pipeline"
 3 | 
 4 | COPY envs /envs/
 5 | RUN conda env create -f /envs/environment.yml && conda clean -a
 6 | 
 7 | RUN git clone https://github.com/yafeng/SpectrumAI /SpectrumAI
 8 | RUN cd /SpectrumAI && git pull && git reset --hard b8e7001807d834db633c30d265ef6e8361cdcb3c
 9 | 
10 | ENV PATH /opt/conda/envs/ipaw-0.5/bin:$PATH
11 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | Integrated proteogenomics analysis workflow
  2 | ==============
  3 | 
  4 | [![Nextflow](https://img.shields.io/badge/nextflow-%E2%89%A50.32.0-brightgreen.svg)](https://www.nextflow.io/)
  5 | 
  6 | [![install with bioconda](https://img.shields.io/badge/install%20with-bioconda-brightgreen.svg)](http://bioconda.github.io/)
  7 | [![Docker](https://img.shields.io/docker/automated/glormph/ipaw.svg)](https://hub.docker.com/r/glormph/ipaw)
  8 | 
  9 | This is a workflow to identify, curate, and validate variant and novel peptides from MS proteomics spectra data, using databases containing novel and variant peptides, such as the VarDB database. VarDB combines entries from COSMIC, PGOHUM, CanProVar and lncipedia. The workflow takes mzML spectra files as input, is powered by [Nextflow](https://nextflow.io) and runs in [Docker](https://docker.com) or [Singularity](https://sylabs.io/singularity) containers.
 10 | 
 11 | Searches are run using [MSGF+](https://omics.pnl.gov/software/ms-gf) on a concatenated target and decoy databases which are then passed to [Percolator](http://percolator.ms) for statistical evaluation, in which FDR is determined in a class specific manner, filtering out known peptides and dividing novel/variant in different FDR arms. Thereafter a curation procedure is performed in which resulting peptides are evaluated on several different criteria, dependent on the peptide.
 12 | 
 13 | Please cite the following paper when you have used the workflow for publications :)
 14 | 
 15 | Zhu Y, Orre LM, Johansson HJ, Huss M, Boekel J, Vesterlund M, Fernandez-Woodbridge A, Branca RMM, Lehtio J: Discovery of coding regions in the human genome by integrated proteogenomics analysis workflow. Nat Commun 2018, 9(1):903.  [PMID: 29500430](https://www.ncbi.nlm.nih.gov/pubmed/29500430)
 16 | 
 17 | ![workflow image](https://github.com/lehtiolab/proteogenomics-analysis-workflow/blob/master/assets/workflow.png)
 18 | 
 19 | ### Before running 
 20 | 
 21 |   + Install [Docker](https://docker.io) or [Singularity](https://sylabs.io/singularity)
 22 |   + Install [Nextflow](https://nextflow.io)
 23 | 
 24 | 
 25 | ### Detailed pipeline inputs
 26 | 
 27 |   + Database search related inputs for MSGFplus
 28 |     + Spectra files input
 29 |     
 30 |     `--mzmldef  # a tab deliminated text file with mzmlfilepath(absolute path) and setname`
 31 |  
 32 |     + Modification file for MSGF+. Default file is for TMT labelled samples. [Here is an example.](https://bix-lab.ucsd.edu/download/attachments/13533355/Mods.txt?version=2&modificationDate=1358975546000)
 33 |     
 34 |     `--mods Mods.txt # use standard Unimod name for modification`
 35 |     
 36 |     + Fragment method
 37 |     
 38 |     `--activation hcd  # default, else use cid, etd`
 39 |     
 40 |     + Specify search DB
 41 |     
 42 |     `--tdb /path/to/vardb.fa`
 43 |     
 44 |   + Quantification related inputs
 45 |     + Labelling method. Do Not use this option if you have label-free data, include the reference channel(s)
 46 |       for calculating relative peptide intensities. Possible options, tmtpro, tmt10plex, tmt6plex, tmt2plex, itraq8plex, itraq4plex.
 47 |       Multiple reference channels are averaged, multiple sets are separated by a space and must match the 
 48 |       `--mzmldef` parameter.
 49 |     
 50 |     `--isobaric 'set01:tmt10plex:130C:131 set02:tmt10plex:126'
 51 | 
 52 |   + Post-search processing inputs
 53 |   
 54 |     + map the genomic positions of VarDB peptides require the annotation GTF file of VarDB
 55 |     
 56 |     `--gtf /path/tovardb.gtf`
 57 | 
 58 |     + Canonical protein FASTA for catching canonical proteins and BLAST
 59 |     ```
 60 |     --blastdb /path/to/Uniprot.Ensembl.GENCODE.proteins.fa # Here we use latest uniprot, ensembl, gencode annotated protein sequences 
 61 |     --knownproteins /path/to/Homo_sapiens.GRCh38.pep.all.fa # a prefiltering known proteins DB needed to remove known peptides during class FDR calculation 
 62 |     ```
 63 |    
 64 |     + Annovar peptide annotation program location
 65 |     
 66 |     `--annovar_dir path/to/annovar # Downloaded before running, due to licensing`
 67 | 
 68 |     + Bigwig files for phastCons/PhyloCSF:
 69 |     `--bigwigs  /path/to/bigwigs/`
 70 | 
 71 |     + Mark novel peptides which can be explained by nsSNPs
 72 |     
 73 |     `--snpfa /path/to/MSCanProVar_ensemblV79.fa  # CanProVar annotated peptide sequences derived from known nsSNPs`   
 74 |     
 75 |     + Genome FASTA to BLAT against to find potential peptides mapped to multiple genomic locations
 76 |     
 77 |     `--genome /path/to/hg19.fa` # use hg19.fa.masked version if you don't want to consider repeated regions.
 78 |   
 79 |   + When using the VarDB database
 80 | 
 81 |     + SNP and COSMIC databases, required to map genomic positions of single amino acid variant peptides.
 82 |     ```
 83 |     --dbsnp /path/to/SNP142CodingDbSnp.txt # a text file containing genomic coordinates of coding SNPs
 84 |     --cosmic /path/to/CosmicMutantExport.tsv # a text file containing genomic coordinates of mutations
 85 |     ```
 86 |     
 87 |     + __Optional__: RNA-Seq BAM and BAI files (in same directory) for reads support in detected novel coding regions. 
 88 |     
 89 |     `--bamfiles '/path/to/*.bam'
 90 |   
 91 |   + Nextflow command option:
 92 |     + Use `-profile` option to run it in locally or submit it in slurm or sge system.
 93 |     ```
 94 |     -profile ## options are standard and testing. Options and cpus allocated can be re-defined in nextflow.config file.
 95 |     -resume  ## use it to resume the jobs from the last stopped process.
 96 |     ```
 97 |   + Nextflow configuration
 98 |     + Define CPU resources for specific processes in `configuration/base.config`
 99 |    
100 | 
101 | ### Prepare once
102 | 
103 |   + Create account at [sanger](http://cancer.sanger.ac.uk/cosmic/help/download) for COSMIC database
104 |   + [Register](http://annovar.openbioinformatics.org/en/latest) for download of annovar
105 |   + Download SNP data from the [UCSC table browser](https://genome.ucsc.edu/cgi-bin/hgTables?hgsid=661199271_5BEJQ6aAEOgRhkgNqBRFQQhTW05G&clade=mammal&org=&db=hg19&hgta_group=varRep&hgta_track=snp142Common&hgta_table=snp142CodingDbSnp&hgta_regionType=genome&position=&hgta_outputType=primaryTable&hgta_outFileName=snp142CodingDbSnp.txt)
106 |   
107 | ```
108 | # Get this repo
109 | git clone https://github.com/lehtiolab/proteogenomics-analysis-workflow
110 | cd proteogenomics-analysis-workflow
111 | 
112 | # Get Annovar
113 | cd /path/to/your/annovar
114 | wget __link_you_get_from_annovar__
115 | tar xvfz annovar.latest.tar.gz
116 | # This creates a folder with annotate_variation.pl and more files, to be passed to the pipeline with --annovar_dir
117 | 
118 | 
119 | # Download bigwigs, this can take some time
120 | cd /path/to/your/bigwigs  # this dir will be passed to the pipeline with --bigwigs
121 | wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/phastCons100way/hg19.100way.phastCons.bw 
122 | wget https://data.broadinstitute.org/compbio1/PhyloCSFtracks/hg19/latest/PhyloCSF+0.bw
123 | wget https://data.broadinstitute.org/compbio1/PhyloCSFtracks/hg19/latest/PhyloCSF+1.bw
124 | wget https://data.broadinstitute.org/compbio1/PhyloCSFtracks/hg19/latest/PhyloCSF+2.bw
125 | wget https://data.broadinstitute.org/compbio1/PhyloCSFtracks/hg19/latest/PhyloCSF-0.bw
126 | wget https://data.broadinstitute.org/compbio1/PhyloCSFtracks/hg19/latest/PhyloCSF-1.bw
127 | wget https://data.broadinstitute.org/compbio1/PhyloCSFtracks/hg19/latest/PhyloCSF-2.bw
128 | 
129 | # In the meantime, download and extract varDB data (Fasta, GTF, BlastP, SNP Fasta) to a good spot
130 | wget -O varDB_data.tar.gz https://ndownloader.figshare.com/files/13358006 
131 | tar xvfz varDB_data.tar.gz
132 | 
133 | # Get the hg19 masked genome sequence
134 | wget hgdownload.soe.ucsc.edu/goldenPath/hg19/bigZips/chromFaMasked.tar.gz
135 | tar xvfz chromFaMasked.gz
136 | for chr in {1..22} X Y M; do cat chr$chr.fa.masked >> hg19.chr1-22.X.Y.M.fa.masked; done
137 | 
138 | # Download ENSEMBL database
139 | wget ftp://ftp.ensembl.org/pub/release-91/fasta/homo_sapiens/pep/Homo_sapiens.GRCh38.pep.all.fa.gz
140 | gunzip Homo_sapiens.GRCh38.pep.all.fa.gz
141 | 
142 | # Get the COSMIC database
143 | sftp 'your_email_address@example.com'@sftp-cancer.sanger.ac.uk
144 | # Download the data
145 | sftp> get cosmic/grch37/cosmic/v81/CosmicMutantExport.tsv.gz
146 | sftp> exit
147 | # Extract COSMIC data
148 | tar xvfz CosmicMutantExport.tsv.gz
149 | ```
150 | 
151 | ### Analyse your mzML files with VarDB
152 | Example command to search TMT 10-plex labelled data in docker
153 | Remove  `--isobaric` parameter if you have label-free data.
154 | ```
155 | nextflow run main.nf --tdb /path/to/VarDB.fasta \
156 |   --mzmldef spectra_file_list.txt \
157 |   --activation hcd \
158 |   --isobaric 'set01:tmt10plex:131 set02:tmt10plex:131' 'set03:tmt10plex:127N' \
159 |   --gtf /path/to/VarDB.gtf \
160 |   --mods /path/to/tmt_mods.txt \
161 |   --knownproteins /path/to/Homo_sapiens.GRCh38.pep.all.fa \
162 |   --blastdb /path/to/UniProteome+Ensembl94+GENCODE24.proteins.fasta \
163 |   --cosmic /path/to/CosmicMutantExport.tsv \
164 |   --snpfa /path/to/MSCanProVar_ensemblV79.filtered.fasta \
165 |   --genome /path/to/hg19.chr1-22.X.Y.M.fa \
166 |   --dbsnp /path/to/snp142CodingDbSnp.txt \
167 |   --annovar_dir /path/to/your/annovar \
168 |   --bigwigs /path/to/your/bigwigs \
169 |   --bamfiles /path/to/\*.bam \
170 |   --outdir /path/to/results \
171 |   -profile standard,docker # replace docker with singularity if needed
172 | ```
173 | 


--------------------------------------------------------------------------------
/assets/Mods.txt:
--------------------------------------------------------------------------------
1 | #Mods
2 | NumMods=2
3 | 229.162932,*,fix,N-term,TMT6plex
4 | 229.162932,K,fix,any,TMT6plex
5 | C2H3N1O1,C,fix,any,Carbamidomethyl
6 | O1,M,opt,any,Oxidation
7 | 
8 | 


--------------------------------------------------------------------------------
/assets/workflow.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/lehtiolab/proteogenomics-analysis-workflow/b88d2b9fc8a4d82299aa85edde0a912d476f1bd4/assets/workflow.png


--------------------------------------------------------------------------------
/bin/calculate_phastcons.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | 
 4 | '''
 5 |     the script is written by Mikael Hussius @ SciLifeLab, https://github.com/hussius/gff-phastcons-human.
 6 |     slightly modified to work after the bigwig file is downloaded locally.
 7 | '''
 8 | import pyBigWig as pw
 9 | import numpy as np
10 | import sys
11 | 
12 | 
13 | if len(sys.argv)<4:
14 |     sys.exit("USAGE: python " + sys.argv[0] + "<GFF file with regions> <BigWig file> <output file>")
15 | 
16 | 
17 | # Need to download hg19.100way.phastCons.bw first, use the command:
18 | # wget http://hgdownload.cse.ucsc.edu/goldenPath/hg19/phastCons100way/hg19.100way.phastCons.bw
19 | 
20 | 
21 | infile = sys.argv[1]
22 | bw_file = sys.argv[2]
23 | outfile = sys.argv[3]
24 | 
25 | bw = pw.open(bw_file)
26 | oF = open(outfile, "w")
27 | 
28 | header = ["Bare peptide","phastcon_max_score","phastcon_mean_score"]
29 | oF.write("\t".join(header)+"\n")
30 | 
31 | pep_dic = {}
32 | 
33 | if not infile.endswith("gff") and not infile.endswith("gff3"):
34 |     sys.exit("The region file needs to be a GFF(3) file!")
35 | 
36 | for line in open(infile):
37 |     if not line.startswith("chr"):
38 |         continue
39 |     fields = line.strip().split()
40 |     (chr, start, end, pept) = (fields[0], fields[3], fields[4], fields[8])
41 |     if not pept.startswith("Parent="): continue
42 |     pept = pept.replace("Parent=","") # Remove "Parent="
43 |     
44 |     try:
45 |         values = bw.values(chr, int(start), int(end))
46 |         max_val = np.max(values)
47 |         mean_val = np.mean(values)
48 |     except:
49 |         print("Encountered error for line: " + line.strip())
50 |         max_val = -1
51 |         mean_val = -1
52 |     
53 |     if pept not in pep_dic:
54 |         pep_dic[pept]=[max_val, mean_val]
55 |     else:
56 |         pep_dic[pept][0]=max(max_val,pep_dic[pept][0])
57 |         pep_dic[pept][1]=np.mean([mean_val,pep_dic[pept][1]])
58 | 
59 | bw.close()
60 | 
61 | for pept in pep_dic:
62 |     oF.write("%s\t%f\t%f\n" % (pept,pep_dic[pept][0],pep_dic[pept][1]))
63 | 
64 | oF.close()
65 | 


--------------------------------------------------------------------------------
/bin/calculate_phylocsf.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | '''
  4 |     the script is modified from Mikael Hussius @ SciLifeLab, https://github.com/hussius/gff-phylocsf-human
  5 |     
  6 |     download the following bigwig files first
  7 |     # wget https://data.broadinstitute.org/compbio1/PhyloCSFtracks/hg19/latest/PhyloCSF+0.bw
  8 |     # wget https://data.broadinstitute.org/compbio1/PhyloCSFtracks/hg19/latest/PhyloCSF+1.bw
  9 |     # wget https://data.broadinstitute.org/compbio1/PhyloCSFtracks/hg19/latest/PhyloCSF+2.bw
 10 |     # wget https://data.broadinstitute.org/compbio1/PhyloCSFtracks/hg19/latest/PhyloCSF-0.bw
 11 |     # wget https://data.broadinstitute.org/compbio1/PhyloCSFtracks/hg19/latest/PhyloCSF-1.bw
 12 |     # wget https://data.broadinstitute.org/compbio1/PhyloCSFtracks/hg19/latest/PhyloCSF-2.bw
 13 | '''
 14 | 
 15 | import sys
 16 | import os
 17 | import pyBigWig as pw
 18 | import numpy as np
 19 | 
 20 | def predict_coding(vec):
 21 |     coding = "OTHER"
 22 |     for v in vec:
 23 |         if not v: continue
 24 |         if v > 0: coding = "CODING"
 25 |     return(coding)
 26 | 
 27 | if len(sys.argv)<4:
 28 |     sys.exit("USAGE: python " + sys.argv[0] + "<GFF file> <BigWig file path> <output file>")
 29 | 
 30 | infile = sys.argv[1]
 31 | bw_file_path = sys.argv[2]
 32 | outfile = sys.argv[3]
 33 | 
 34 | regs = []
 35 | chrom={}
 36 | starts={}
 37 | ends={}
 38 | peptide={}
 39 | 
 40 | for line in open(infile):
 41 |     if not line.startswith("chr"):
 42 |         continue
 43 |     fields = line.strip().split()
 44 |     (chr, start, end, pept) = (fields[0], fields[3], fields[4], fields[8])
 45 |     if not pept.startswith("Parent="): continue
 46 |     name = chr+":"+start+"-"+end
 47 |     chrom[name]=chr
 48 |     starts[name]=int(start)
 49 |     ends[name]=int(end)
 50 |     peptide[name]=pept.split("=")[1]
 51 |     regs.append(name)
 52 | 
 53 | scores = {}
 54 | 
 55 | rpathbase = os.path.join(bw_file_path,"PhyloCSF")
 56 | 
 57 | for rf in ["+0","+1","+2","+3","-0","-1","-2","-3"]:
 58 |     rpath = rpathbase + rf + ".bw"
 59 |     if os.path.isfile(rpath):
 60 |         sys.stderr.write("Searching PhyloCSF reading frame " + rf + "\n")
 61 |         bw = pw.open(rpath)
 62 |         frame_score = {}
 63 |         count = 0
 64 |         for r in regs:
 65 |             count += 1
 66 |             if(count % 50 ==0): sys.stderr.write('\tProcessed ' + str(count) + " peptides out of " + str(len(regs)) + "\n")
 67 |             sys.stderr.flush()
 68 |             try:
 69 |                 score = bw.stats(chrom[r], starts[r], ends[r])[0]
 70 |             except RuntimeError:
 71 |                 score = None
 72 |             frame_score[r] = score
 73 |             scores[rf] = frame_score
 74 |         bw.close()
 75 |     else:
 76 |         sys.stderr.write("%s doesn't exist \n" % rpath)
 77 | 
 78 | 
 79 | 
 80 | output = open(outfile,"w")
 81 | output.write("\t".join(["Bare peptide","PhyloCSF+0.score","PhyloCSF+1.score","PhyloCSF+2.score","PhyloCSF-0.score","PhyloCSF-1.score","PhyloCSF-2.score","PhyloCSF_prediction"])+"\n")
 82 | 
 83 | pep_scores={}
 84 | 
 85 | for r in regs:
 86 |     scoreList = [scores["+0"][r], scores["+1"][r], scores["+2"][r], scores["-0"][r], scores["-1"][r], scores["-2"][r]]
 87 |     seq = peptide[r]
 88 |     if seq not in pep_scores:
 89 |         pep_scores[seq]=scoreList
 90 |     else: # this is to consider splice junction peptides which have two regions separated in gff file, we take mean phylocsf score of two regions
 91 |         for i in range(0,len(scoreList)):
 92 |             value = scoreList[i]
 93 |             if value is None and pep_scores[seq][i] is None:
 94 |                 continue
 95 |             elif None in [value, pep_scores[seq][i]]:
 96 |                 pep_scores[seq][i] = value if value else pep_scores[seq][i]
 97 |             else:
 98 |                 pep_scores[seq][i] = (pep_scores[seq][i] + value)/2
 99 | 
100 | for seq in pep_scores:
101 |     scoreList = pep_scores[seq]
102 |     row = [seq]+['NA' if x is None else str(x) for x in scoreList] + [predict_coding(scoreList)]
103 |     output.write('\t'.join(row) + '\n')
104 | 


--------------------------------------------------------------------------------
/bin/extract_1mismatch_novpsm.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import sys
 4 | import re
 5 | 
 6 | input1 = open(sys.argv[1],"r") # blastp parse out novpep table
 7 | input2 = open(sys.argv[2],"r") # novel psm table
 8 | 
 9 | output = open(sys.argv[3],"w")
10 | 
11 | novpep_1mismatch = {}
12 | cols = input1.readline().split("\t")
13 | idx1 = cols.index("blastp_category")
14 | idx2 = cols.index("sub_pos")
15 | 
16 | for line in input1:
17 |     row = line.strip().split("\t")
18 |     cat = row[idx1]
19 |     pep = row[0]
20 |     sub_pos = row[idx2]
21 |     if cat=="map to known protein with 1 aa mismatch":
22 |         novpep_1mismatch[pep] = sub_pos
23 | 
24 | print ("%d peptides map to known protein with 1 aa mismatch" % len(novpep_1mismatch))
25 | 
26 | header = input2.readline().strip().split("\t")
27 | header += ["sub_pos"]
28 | 
29 | output.write("\t".join(header)+"\n")
30 | 
31 | idx = header.index('Peptide')
32 | for line in input2:
33 |     row = line.strip().split("\t")
34 |     seq = re.sub('[^a-zA-Z]','',row[idx])
35 |     if seq in novpep_1mismatch:
36 |         row.append(novpep_1mismatch[seq])
37 |         output.write("\t".join(row)+"\n")
38 | 
39 | input1.close()
40 | input2.close()
41 | output.close()
42 | 


--------------------------------------------------------------------------------
/bin/group_novpepToLoci.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import sys
 4 | import getopt
 5 | import os
 6 | from collections import OrderedDict
 7 | 
 8 | class Peptide(object):
 9 |     def __init__(self,chr=None,strand=None,start=0,end=0,psm=0,content=None):
10 |         self.start=start
11 |         self.end=end
12 |         self.strand=strand
13 |         self.chr=chr
14 |         self.content=content
15 | 
16 | 
17 | distance  = 10000 # default distance to group peptides is 10kb
18 | 
19 | if len(sys.argv[1:])<=1:  ### Indicates that there are insufficient number of command-line arguments
20 |     print("Warning! wrong command, please read the mannual in Readme.txt.")
21 |     print("Example: python group_novpepToLoci.py --input novpep.table.txt --output novpep.table.Loci.txt --distance 10kb")
22 | else:
23 |     options, remainder = getopt.getopt(sys.argv[1:],'', ['input=','output=','distance='])
24 |     for opt, arg in options:
25 |         if opt == '--input': input_file=arg
26 |         elif opt == '--output': output_file=arg
27 |         elif opt == '--distance':
28 |             try:
29 |                 distance=int(arg.replace("kb",""))*1000
30 |             except ValueError:
31 |                 print ("incorrect input for distance");sys.exit()
32 |         else:
33 |             print("Warning! Command-line argument: %s not recognized. Exiting..." % opt); sys.exit()
34 | 
35 | input1=open(input_file,"r")
36 | output=open(output_file,"w")
37 | 
38 | peplist=[]
39 | 
40 | header = input1.readline().strip().split("\t")
41 | header += ["Loci","Loci_info"]
42 | output.write("\t".join(header)+"\n")
43 | 
44 | idx=header.index("chr")
45 | 
46 | for line in input1: # 4rd to 7th column are chr, start, end, strand
47 |     row=line.strip().split("\t")
48 |     chr=row[idx].replace("chr","").replace("X","23").replace("Y","24")
49 |     chr = chr.replace('NA', '25')
50 |     peplist.append(Peptide(chr=chr,start=int(row[idx+1]),end=int(row[idx+2]),strand=row[idx+3],content=row))
51 | 
52 | ## sort by chr and then start cor
53 | peplist.sort(key=lambda x:list(map(int,(x.chr,x.start))))
54 | 
55 | print("total number of peptides",len(peplist))
56 | 
57 | loci_group=OrderedDict()
58 | loci_num=1
59 | 
60 | for i in range(0,len(peplist)-1):
61 |     if i==0:
62 |         loci_group[loci_num]=[peplist[i]]
63 |     
64 |     if min(abs(peplist[i].start-peplist[i+1].end), abs(peplist[i].end-peplist[i+1].start))<distance:
65 |         loci_group[loci_num].append(peplist[i+1])
66 |     else:
67 |         loci_num+=1
68 |         loci_group[loci_num]=[peplist[i+1]]
69 | 
70 | print("total number of coding loci after groupping",len(loci_group))
71 | 
72 | for loci in loci_group:
73 |     peptide_list=loci_group[loci]
74 |     loci_info = "supported by %d peptide" % len(peptide_list)
75 |     
76 |     for pep in peptide_list:
77 |         row = pep.content + [str(loci),loci_info]
78 |         output.write("\t".join(row)+"\n")
79 | 
80 | input1.close()
81 | output.close()
82 | 


--------------------------------------------------------------------------------
/bin/label_nsSNP_pep.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import sys
 4 | import os
 5 | import getopt
 6 | import sqlite3
 7 | from Bio import SeqIO
 8 | 
 9 | 
10 | 
11 | if len(sys.argv[1:])<=1:  ### Indicates that there are insufficient number of command-line arguments
12 |     print ("Warning! wrong command, please read the mannual in Readme.txt.")
13 |     print ("Example: python lab_sub_pos.py --input pep.tab.txt --nsSNPdb nsSNP_pep.fa --output output_filename")
14 | else:
15 |     options, remainder = getopt.getopt(sys.argv[1:],'', ['input=','nsSNPdb=','output=', 'dbfile=', 'minlen='])
16 |     for opt, arg in options:
17 |         if opt == '--input': input_file=arg
18 |         elif opt == '--nsSNPdb': snp_db=arg
19 |         elif opt == '--dbfile': dbfn=arg
20 |         elif opt == '--output': output_file=arg
21 |         elif opt == '--minlen': minpeplen=arg
22 |         else:
23 |             print ("Warning! Command-line argument: %s not recognized. Exiting..." % opt); sys.exit()
24 | 
25 | handle = SeqIO.parse(snp_db,"fasta")
26 | whole_proteins = {}
27 | 
28 | for record in handle:
29 |     if "rs" in record.description: # check if it has rs id in entry's description
30 |         whole_proteins[record.description] = str(record.seq)
31 | whole_proteins = {prot: seq.replace('L', 'I') for prot, seq in whole_proteins.items()}
32 |  
33 | 
34 | input1=open(input_file,"r") # novpep tab table
35 | 
36 | header= input1.readline().strip().split("\t")
37 | header += ["from.SNPdb"]
38 | 
39 | output=open(output_file,"w") # output table
40 | output.write('\t'.join(header) + '\n')
41 | 
42 | print ("searching SNPdb to see if any novel peptides derived from nsSNPs")
43 | conn = sqlite3.Connection(dbfn)
44 | 
45 | for line in input1:
46 |     row=line.strip().split("\t")
47 |     pep = row[0].replace('L', 'I')
48 |     query_output = "No match in SNP-DB"
49 |     cur = conn.execute('SELECT protein_acc, pos FROM protein_peptides WHERE seq="{}"'.format(pep[:int(minpeplen)]))
50 |     snpprots = [(protid, pos) for protid, pos in cur]
51 |     for prot_id, pos in snpprots:
52 |         protseq = whole_proteins[prot_id]
53 |         if pep in protseq:
54 |             query_output = prot_id
55 |             break
56 |     row.append(query_output)
57 |     output.write("\t".join(row)+"\n")
58 | 
59 | 
60 | input1.close()
61 | output.close()
62 | handle.close()
63 | 
64 | 


--------------------------------------------------------------------------------
/bin/label_sub_pos.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import sys
 4 | import re
 5 | import os
 6 | import getopt
 7 | 
 8 | ################  Comand-line arguments ################
 9 | peptide_column="Peptide" #default name to look for peptide column in input file
10 | protein_id_column = "Protein" #default name to look for protein accessions in input file
11 | splitchar = "_" #default splitting by '_'
12 | 
13 | if len(sys.argv[1:])<=1:  ### Indicates that there are insufficient number of command-line arguments
14 |     print("Warning! wrong command, please read the mannual in Readme.txt.")
15 |     print("Example: python lab_sub_pos.py --input_psm PSM_filename --output output_filename")
16 | else:
17 |     options, remainder = getopt.getopt(sys.argv[1:],'', ['input_psm=','output=','splitchar='])
18 |     for opt, arg in options:
19 |         if opt == '--input_psm': input_file=arg
20 |         elif opt == '--output': output_file=arg
21 |         elif opt == '--splitchar': splitchar=arg
22 |         else:
23 |             print("Warning! Command-line argument: %s not recognized. Exiting..." % opt); sys.exit()
24 | 
25 | input1=open(input_file,"r") # vardb search psm table
26 | 
27 | header1=input1.readline().strip().split("\t")
28 | 
29 | try:
30 |     pep_index=header1.index(peptide_column)
31 |     pro_index=header1.index(protein_id_column)
32 | except ValueError:
33 |     print("Peptide, Protein columns are not in input table"); sys.exit()
34 | 
35 | output=open(output_file,"w") # 1 mismatch novel peptide PSM
36 | 
37 | newheader=header1+["sub_pos"]
38 | output.write("\t".join(newheader)+"\n")
39 | 
40 | for line in input1:
41 |     row=line.strip().split("\t")
42 |     pro=row[pro_index]
43 | 
44 |     sub_pos="NA"
45 |     acc=pro.split(";")[0]
46 |     acc=re.sub(r'\([^)]*\)', '', acc) # remove text within parentheses
47 |     
48 |     if acc[:6]=="CanPro":
49 |         sub_pos=acc.split("_")[-1]
50 |     elif acc[:6]=="COSMIC":
51 |         sub_pos=acc.split(":")[-1]
52 |     else:
53 |         splitheader = acc.split(splitchar)
54 |         if len(splitheader) == 1:
55 |             continue  # No split, no position
56 |         try:
57 |             int(splitheader[-1])  # something else found instead of position
58 |         except:
59 |             continue
60 |         else:
61 |             sub_pos = splitheader[-1]
62 | 
63 |     row.append(sub_pos)
64 |     output.write("\t".join(row)+"\n")
65 | 
66 | input1.close()
67 | output.close()
68 | 


--------------------------------------------------------------------------------
/bin/map_cosmic_snp_tohg19.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | import sys
  4 | import getopt
  5 | import os
  6 | import re
  7 | 
  8 | if len(sys.argv[1:])<=1:  ### Indicates that there are insufficient number of command-line arguments
  9 |     print("Warning! wrong command, please read the mannual in Readme.txt.")
 10 |     print("Example: python map_cosmic_snp_tohg19.py --input saav_peptable.txt --output saav_pep.coords.txt --cosmic_input CosmicMutantExport.tsv --dbsnp_input snp142CodingDbSnp.txt")
 11 | else:
 12 |     options, remainder = getopt.getopt(sys.argv[1:],'', ['input=','output=','cosmic_input=','dbsnp_input='])
 13 |     for opt, arg in options:
 14 |         if opt == '--input': input_file=arg
 15 |         elif opt == '--output': output_file=arg
 16 |         elif opt == '--cosmic_input': cosmic_file=arg
 17 |         elif opt == '--dbsnp_input': dbsnp_file=arg
 18 |         else:
 19 |             print("Warning! Command-line argument: %s not recognized. Exiting..." % opt); sys.exit()
 20 | 
 21 | input=open(input_file,"r")
 22 | output=open(output_file,"w")
 23 | 
 24 | cosmic_input=open(cosmic_file,"r") # Download at \http://cancer.sanger.ac.uk/cosmic/download
 25 | dbsnp_input=open(dbsnp_file,"r") # Download at UCSC table browser, select Variation as group, and snp142CodingDbSnp as table
 26 | 
 27 | header = ["#CHROM","POS","ID","INFO"]
 28 | output.write("\t".join(header)+"\n")
 29 | 
 30 | cosmic_dic={}
 31 | 
 32 | print ("reading cosmic database")
 33 | header = cosmic_input.readline().strip().split('\t')
 34 | geneix = header.index('Gene name')
 35 | mrnaix = header.index('Accession Number')
 36 | mutdesc = header.index('Mutation Description')
 37 | mutcds = header.index('Mutation CDS')
 38 | mutaa = header.index('Mutation AA')
 39 | mutgenpos = [ix for ix, x in enumerate(header) if re.match('Mutation .*genome position', x)][0]
 40 | mutstr = [ix for ix, x in enumerate(header) if re.match('Mutation .*strand', x)][0]
 41 | 
 42 | for line in cosmic_input:
 43 |     row=line.strip().split("\t")
 44 |     if "coding silent" in row[mutdesc]: # correspond to the column "Mutation Description" in the cosmic_input
 45 |         continue;
 46 |     elif row[mutdesc]=="Unknown":
 47 |         continue;
 48 |     elif row[mutdesc]=="Whole gene deletion":
 49 |         continue;
 50 |     else:
 51 |         gene=row[geneix]
 52 |         mRNA=row[mrnaix]
 53 |         dna_mut=row[mutcds] # correspond to the column "Mutation CDS" in the cosmic_input
 54 |         aa_mut=row[mutaa] # correspond to the column "Mutation AA" in the cosmic_input
 55 |         chr_position=row[mutgenpos] # correspond to the column "Mutation genome position" in the cosmic_input
 56 |         chr_strand=row[mutstr] # correspond to the column "Mutation strand" in the cosmic_input
 57 |         id="COSMIC:%s:%s:%s:%s" % (gene,mRNA,dna_mut,aa_mut)
 58 |         if id not in cosmic_dic:
 59 |             cosmic_dic[id]=[chr_position,chr_strand,dna_mut,aa_mut]
 60 | 
 61 | print ("reading cosmic database finished")
 62 | 
 63 | print ("reading dbsnp database")
 64 | snp_dic={}
 65 | cor_dic={}
 66 | for line in dbsnp_input:
 67 |     row=line.strip().split("\t")
 68 |     rsid=row[4]
 69 |     if rsid not in snp_dic:
 70 |         snp_dic[rsid]= [row[1],row[3]]
 71 |         cor_dic[row[1]+":"+row[3]]=rsid
 72 | 
 73 | print ("reading dbsnp database finished")
 74 | 
 75 | output_dic={}
 76 | output_chr_dic={}
 77 | 
 78 | cosmic_inDBsnp_count=0
 79 | input.readline()
 80 | for line in input:
 81 |     row=line.strip().split("\t")
 82 |     chr=""
 83 |     start=""
 84 |     end=""
 85 |     try:
 86 |         #score=row[12]
 87 |         #PSMcount=row[-1]
 88 |         pep=re.sub("[\W\d]","",row[0].strip())
 89 |         spectrumai_result=row[-1]
 90 |         proid = row[1]
 91 |     except IndexError:
 92 |         print ("INDEX ERROR",line)
 93 |         continue;
 94 |     if "CanProVar_rs" in proid:
 95 |         acclist=proid.split(";")
 96 |         acc=""
 97 |         for proid in acclist:
 98 |             if "CanProVar_rs" in proid:
 99 |                 acc=proid
100 |                 break;
101 |         snpid=acc.split("_")[1]
102 |         if pep not in output_dic:
103 |             output_dic[pep]=1
104 |             if snpid in snp_dic:
105 |                 chr=snp_dic[snpid][0]
106 |                 start=snp_dic[snpid][1]
107 |                 entry="%s\t%s\tID=%s\tSequence=%s;SpectrumAI_result=%s\n" % (chr,start,acc,pep,spectrumai_result)
108 |                 output.write(entry)
109 |             else:
110 |                 print (snpid,"not found in dbSNP")
111 |     else:
112 |         if proid[:6]=="COSMIC":
113 |             acc=proid.split(";")[0]
114 |             acc_list=acc.split(":")[:5]
115 |             cosmic_id=":".join(acc_list)
116 |             if pep not in output_dic:
117 |                 output_dic[pep]=1
118 |                 try:
119 |                     chr_position=cosmic_dic[cosmic_id][0]
120 |                 except KeyError:
121 |                     print('COSMIC DB and VarDB do not match, skipping {}'.format(cosmic_id))
122 |                     continue
123 |                 try:
124 |                     index1=chr_position.index(":")
125 |                     index2=chr_position.index("-")
126 |                     chr="chr"+chr_position[:index1].replace("23","X").replace("24","Y") # some of COSMIC entries used chr23 instead of chrX
127 | 
128 |                     chr_start=chr_position[index1+1:index2]
129 |                     chr_strand=cosmic_dic[cosmic_id][1]
130 |                     #allele=cosmic_dic[cosmic_id][2]
131 |                     #pep_allele=cosmic_dic[cosmic_id][3]
132 |                     cosmic_cor="%s:%s" % (chr,chr_start)
133 |                     if cosmic_cor in cor_dic:
134 |                         cosmic_inDBsnp_count+=1
135 |                         newid="CanProVar_"+cor_dic[cosmic_cor]+"_"+"_".join(acc.split(":")[2:])
136 |                         entry="%s\t%s\tID=%s\tSequence=%s;SpectrumAI_result=%s\n" % (chr,chr_start,newid,pep,spectrumai_result)
137 |                         output.write(entry)
138 |                     else:
139 |                         entry="%s\t%s\tID=%s\tSequence=%s;SpectrumAI_result=%s\n" % (chr,chr_start,acc,pep,spectrumai_result)
140 |                         output.write(entry)
141 | 
142 | 
143 |                 except ValueError:
144 |                     print ("ValueError",cosmic_id,chr_position)
145 |         else:
146 |             print ("protein id %s are not CanProVar or COSMIC entries, skipped" % proid)
147 | 
148 | cosmic_input.close()
149 | dbsnp_input.close()
150 | input.close()
151 | output.close()
152 | 
153 | 


--------------------------------------------------------------------------------
/bin/map_novelpeptide2genome.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | '''
  4 |     this script will map peptides to genome and output an peptide gff3 file with coordinates.
  5 |     which genome it maps to depends on the GTF input file provided.
  6 |     written by Yafeng Zhu @ Karolinska Institutet.  Email: yafeng.zhu@ki.se
  7 | '''
  8 | 
  9 | import sys
 10 | import os
 11 | import getopt
 12 | import numpy as np
 13 | import re
 14 | from Bio import SeqIO
 15 | 
 16 | class EXON(object):
 17 |     def __init__(self,number=0,gene=None,variant=None,chr=None,strand=None,start=0,end=0,length=0,trans_start=0,trans_end=0):
 18 |         self.gene=gene
 19 |         self.variant=variant
 20 |         self.number=number
 21 |         self.start=start  #chromosome start coordinate
 22 |         self.end=end  #chromosome end coordinate
 23 |         self.strand=strand
 24 |         self.chr=chr
 25 |         self.trans_start=trans_start
 26 |         self.trans_end=trans_end
 27 |     def length(self):
 28 |         self.length=self.trans_end-self.trans_start+1
 29 | 
 30 | 
 31 | def cal_trans_pos(exon_list): # calculate transcript position of exon start & end, exon_list is a list of exon objects
 32 |     strand=exon_list[0].strand
 33 |     if strand=="+":
 34 |         new_exon_list=sorted(exon_list,key=lambda x:x.start)
 35 |     else:
 36 |         new_exon_list=sorted(exon_list,key=lambda x:x.start,reverse=True)
 37 |     
 38 |     sumExonlength=0
 39 |     for exon in new_exon_list:
 40 |         exon_length=exon.end-exon.start+1
 41 |         exon.trans_start=1+sumExonlength
 42 |         exon.trans_end=exon.trans_start+exon_length-1
 43 |         sumExonlength+=exon_length
 44 |         
 45 |     return new_exon_list
 46 | 
 47 | def get_pep_cor(exon_object_list,n1,n2): # return peptide's chromosome start and end cor given peptide's trans_start (n1) and trans_end (n2)
 48 |     pep_chr=""
 49 |     pep_strand=""
 50 |     pep_chr_start=0
 51 |     pep_chr_end=0
 52 |     pep_start_exon=0
 53 |     pep_end_exon=0
 54 |     for i in range(len(exon_object_list)):
 55 |         exon=exon_object_list[i]
 56 |         if n1<=exon.trans_end and n1>=exon.trans_start:
 57 |             pep_chr=exon.chr
 58 |             pep_strand=exon.strand
 59 |             pep_start_exon=i+1
 60 |             if pep_strand=='+':
 61 |                 pep_chr_start=exon.start+(n1-exon.trans_start)
 62 |             else:
 63 |                 pep_chr_end=exon.end-(n1-exon.trans_start)
 64 | 
 65 |         if n2<=exon.trans_end and n2>=exon.trans_start:
 66 |             pep_chr=exon.chr
 67 |             pep_strand=exon.strand
 68 |             pep_end_exon=i+1
 69 |             if pep_strand=='+':
 70 |                 pep_chr_end=exon.start+(n2-exon.trans_start)
 71 |             else: # chr_cor of n2 is pep_chr_start
 72 |                 pep_chr_start=exon.end-(n2-exon.trans_start)
 73 | 
 74 |     return pep_chr,pep_strand,pep_chr_start,pep_chr_end,pep_start_exon,pep_end_exon
 75 | 
 76 | def parse_gtf(infile):
 77 |     dic={}
 78 |     with open(infile,"r") as infile_object:
 79 |         for line in infile_object:
 80 |             if line[0]!="#": # skip lines commmented out
 81 |                 row=line.strip().split("\t")
 82 |                 if row[2]=="exon":
 83 |                     attri_list=row[8].split(";")
 84 |                     transID=""
 85 |                     exon=EXON(start=int(row[3]),end=int(row[4]),chr=row[0],strand=row[6])
 86 |                     for attri in attri_list:
 87 |                         if "transcript_id " in attri:
 88 |                             transID=attri.strip().replace("transcript_id ","").replace('\"',"")
 89 |                         elif "transcript_id=" in attri:
 90 |                             transID=attri.strip().replace("transcript_id=","").replace('\"',"")
 91 |                 
 92 |                     if transID not in dic:
 93 |                         dic[transID]=[exon]
 94 |                     else:
 95 |                         dic[transID].append(exon)
 96 |     return dic
 97 | 
 98 | def get_id(s):
 99 |     acclist=s.split(";")
100 |     acc = acclist[0]
101 |     prot_ID=acc.split("(")[0]
102 |     trans_ID = ""
103 |     if "PGOHUM_" in prot_ID:
104 |         trans_ID=prot_ID.split("_")[1]
105 |     elif "PGOHUM00000" in prot_ID:
106 |         trans_ID=prot_ID.split("_")[0]
107 |     elif "lnc-" in prot_ID:
108 |         trans_ID=prot_ID.split("_")[0]
109 |     elif "lncRNA" in prot_ID:
110 |         trans_ID=prot_ID.split("_")[1]
111 | 
112 |     return prot_ID,trans_ID
113 | 
114 | ################  Comand-line arguments ################
115 | 
116 | 
117 | if len(sys.argv[1:])<=1:  ### Indicates that there are insufficient number of command-line arguments
118 |     print("Warning! wrong command, please read the mannual in Readme.txt.")
119 |     print("Example: python map_novelpeptide2genome.py --input input_filename --gtf VarDB.gtf --fastadb VarDB.fasta --tab_out file0.txt --fasta_out file1.fasta --gff3_out file2.gff3 --bed_out file3.bed")
120 | else:
121 |     options, remainder = getopt.getopt(sys.argv[1:],'', ['input=',
122 |                                                          'gtf=',
123 |                                                          'fastadb=',
124 |                                                          'tab_out=',
125 |                                                          'fasta_out=',
126 |                                                          'gff3_out=',
127 |                                                          'bed_out='])
128 |     for opt, arg in options:
129 |         if opt == '--input': input_file=arg
130 |         elif opt == '--gtf':gtf_file=arg
131 |         elif opt == '--fastadb': db_file=arg
132 |         elif opt == '--tab_out': tab_file=arg
133 |         elif opt == '--fasta_out': fasta_file=arg
134 |         elif opt == '--gff3_out': gff_file=arg
135 |         elif opt == '--bed_out': bed_file=arg
136 |         else:
137 |             print(("Warning! Command-line argument: %s not recognized. Exiting..." % opt)); sys.exit()
138 | 
139 | print("reading GTF input file")
140 | feature_dic=parse_gtf(gtf_file)
141 | print(("number of unique transcripts in GTF file",len(feature_dic)))
142 | 
143 | seqdb = SeqIO.parse(db_file,'fasta')
144 | seq_dic={}
145 | for record in seqdb:
146 |     if record.id[:6] in ["COSMIC","CanPro"]:
147 |         continue
148 |     else:
149 |         seq_dic[record.id]=str(record.seq)
150 | 
151 | print(("number of lncRNA & pseudogene sequences in fasta file",len(seq_dic)))
152 | 
153 | input=open(input_file,'r') # peptide table with two columns, peptide sequence in first column, protein ID in second column
154 | 
155 | tab_output=open(tab_file,'w')
156 | fasta_output=open(fasta_file,'w')
157 | gff_output=open(gff_file,'w')
158 | bed_output=open(bed_file,'w')
159 | 
160 | header=input.readline().split("\t")
161 | 
162 | pep_col = header.index("Peptide")
163 | pro_col = header.index("Protein")
164 | 
165 | newheader=["Bare peptide","Peptide","Protein","chr","start","end","strand"]
166 | tab_output.write("\t".join(newheader) + '\n')
167 | 
168 | non_mapped_pep=0
169 | novpep_dic={}
170 | 
171 | for line in input:
172 |     row=line.strip().split("\t")
173 |     modpep = row[pep_col].strip()
174 |     peptide=re.sub("[\W\d]","", modpep).upper()
175 |     proteins = row[pro_col]
176 |     
177 |     if peptide not in novpep_dic:
178 |         novpep_dic[peptide] = 1
179 |         fasta_output.write(">%s\n%s\n" % (peptide,peptide))
180 |     else:
181 |         continue;
182 |     
183 |     if "chr" in proteins:
184 |         # Only report first protein match coordinates, remove multiple-hits by BLAT later
185 |         protein = proteins.split(';')[0]
186 |         cor_list=protein.split("_")
187 |         print(protein, cor_list)
188 |         pep_chr=cor_list[0]
189 |         pep_chr_start=cor_list[1]
190 |         pep_chr_end=cor_list[2]
191 |         pep_strand=cor_list[3]
192 |         
193 |         gff_format_line1=[pep_chr,"MS","mRNA",pep_chr_start,pep_chr_end,".",pep_strand,".","ID="+peptide]
194 |         gff_format_line2=[pep_chr,"MS","CDS",pep_chr_start,pep_chr_end,".",pep_strand,"0","Parent="+peptide]
195 |         gff_output.write("\t".join(map(str,gff_format_line1))+"\n")
196 |         gff_output.write("\t".join(map(str,gff_format_line2))+"\n")
197 |         
198 |         newrow=[peptide,modpep,proteins,pep_chr,pep_chr_start,pep_chr_end,pep_strand]
199 |         tab_output.write("\t".join(newrow)+"\n")
200 |     
201 |         continue;
202 | 
203 |     protein_id,transcript_id=get_id(proteins)
204 |     frame=protein_id[-1]
205 |     
206 |     try:
207 |         exons=feature_dic[transcript_id]
208 |     except KeyError:
209 |         non_mapped_pep+=1
210 |         print(("KeyError",transcript_id,"doesn't exit in GTF input file"))
211 |         continue;
212 |     
213 |     aa_seq=seq_dic[protein_id]
214 |     pep_index=aa_seq.index(peptide)
215 |     
216 |     pep_trans_start=3*pep_index+1
217 |     pep_trans_end=pep_trans_start+3*len(peptide)-1
218 |     
219 |     if frame=="2":
220 |         pep_trans_start+=1
221 |         pep_trans_end+=1
222 |     elif frame=="3":
223 |         pep_trans_start+=2
224 |         pep_trans_end+=2
225 | 
226 |     exons=cal_trans_pos(exons)
227 |     
228 |     #print pep_trans_start,pep_trans_end
229 |     pep_chr,pep_strand,pep_chr_start,pep_chr_end,pep_start_exon,pep_end_exon=get_pep_cor(exons,pep_trans_start,pep_trans_end)
230 | 
231 |     newrow=[peptide,modpep,proteins]+list(map(str,[pep_chr,pep_chr_start,pep_chr_end,pep_strand]))
232 |     tab_output.write("\t".join(newrow)+"\n")
233 | 
234 |     bed_output.write("%s\t%s\t%s\tA\t-\tComments:Seq=%s\n" % (pep_chr,pep_chr_start,pep_chr_start,peptide))
235 |     bed_output.write("%s\t%s\t%s\tA\t-\tComments:Seq=%s\n" % (pep_chr,pep_chr_end,pep_chr_end,peptide))
236 | 
237 |     #handle exceptions
238 |     if pep_chr_start>pep_chr_end:
239 |         non_mapped_pep+=1
240 |         print(("mapping error",peptide,protein_id, "skip this peptide"))
241 |         continue;
242 |     if pep_chr_start<=0:
243 |         non_mapped_pep+=1
244 |         print(("mapping error",peptide,protein_id,"skip this peptide"))
245 |         continue;
246 | 
247 |     #print pep_chr_start,pep_chr_end
248 |     #print pep_start_exon,pep_end_exon
249 |     if "chr" not in pep_chr:
250 |         pep_chr="chr"+pep_chr.replace("MT","M") # replace "MT" with "M"
251 | 
252 |     if pep_start_exon==pep_end_exon: #if peptide map to one exon
253 |         gff_format_line1=[pep_chr,"MS","mRNA",pep_chr_start,pep_chr_end,".",pep_strand,".","ID="+peptide]
254 |         gff_format_line2=[pep_chr,"MS","CDS",pep_chr_start,pep_chr_end,".",pep_strand,"0","Parent="+peptide]
255 |         gff_output.write("\t".join(map(str,gff_format_line1))+"\n")
256 |         gff_output.write("\t".join(map(str,gff_format_line2))+"\n")
257 |     elif abs(pep_start_exon-pep_end_exon)==1: #if it is a splice junction peptide spanning over two exons
258 |         if pep_strand=="+":
259 |             gff_format_line1=[pep_chr,"MS","mRNA",pep_chr_start,pep_chr_end,".",pep_strand,".","ID="+peptide]
260 |             gff_format_line2=[pep_chr,"MS","CDS",pep_chr_start,exons[pep_start_exon-1].end,".",pep_strand,"0","Parent="+peptide]
261 |             gff_format_line3=[pep_chr,"MS","CDS",exons[pep_end_exon-1].start,pep_chr_end,".",pep_strand,".","Parent="+peptide]
262 |         else:
263 |             gff_format_line1=[pep_chr,"MS","mRNA",pep_chr_start,pep_chr_end,".",pep_strand,".","ID="+peptide]
264 |             gff_format_line2=[pep_chr,"MS","CDS",pep_chr_start,exons[pep_end_exon-1].end,".",pep_strand,"0","Parent="+peptide]
265 |             gff_format_line3=[pep_chr,"MS","CDS",exons[pep_start_exon-1].start,pep_chr_end,".",pep_strand,".","Parent="+peptide]
266 | 
267 |         gff_output.write("\t".join(map(str,gff_format_line1))+"\n")
268 |         gff_output.write("\t".join(map(str,gff_format_line2))+"\n")
269 |         gff_output.write("\t".join(map(str,gff_format_line3))+"\n")
270 |     elif abs(pep_start_exon-pep_end_exon)>1: #if peptide span multiple exons,rare case!
271 |         if pep_strand=="+":
272 |             gff_format_line1=[pep_chr,"MS","mRNA",pep_chr_start,pep_chr_end,".",pep_strand,".","ID="+peptide]
273 |             gff_format_line2=[pep_chr,"MS","CDS",pep_chr_start,exons[pep_start_exon-1].end,".",pep_strand,"0","Parent="+peptide]
274 |             gff_format_line3=[pep_chr,"MS","CDS",exons[pep_end_exon-1].start,pep_chr_end,".",pep_strand,".","Parent="+peptide]
275 |         else:
276 |             gff_format_line1=[pep_chr,"MS","mRNA",pep_chr_start,pep_chr_end,".",pep_strand,".","ID="+peptide]
277 |             gff_format_line2=[pep_chr,"MS","CDS",pep_chr_start,exons[pep_end_exon-1].end,".",pep_strand,"0","Parent="+peptide]
278 |             gff_format_line3=[pep_chr,"MS","CDS",exons[pep_start_exon-1].start,pep_chr_end,".",pep_strand,".","Parent="+peptide]
279 | 
280 |         gff_output.write("\t".join(map(str,gff_format_line1))+"\n")
281 |         gff_output.write("\t".join(map(str,gff_format_line2))+"\n")
282 |         for k in range(min(pep_start_exon,pep_end_exon)+1,max(pep_start_exon,pep_end_exon)):
283 |             gff_format_line=[pep_chr,"MS","CDS",exons[k-1].start,exons[k-1].end,".",pep_strand,".","Parent="+peptide]
284 |             gff_output.write("\t".join(map(str,gff_format_line))+"\n")
285 | 
286 |         gff_output.write("\t".join(map(str,gff_format_line3))+"\n")
287 | 
288 | gff_output.close()
289 | tab_output.close()
290 | bed_output.close()
291 | fasta_output.close()
292 | 
293 | print(("total number of unique peptides",len(novpep_dic)))
294 | print(("total number of unmapped peptides",non_mapped_pep))
295 | 


--------------------------------------------------------------------------------
/bin/parse_BLASTP_out.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | import sys
  4 | import os
  5 | import getopt
  6 | from Bio import SeqIO
  7 | import re
  8 | 
  9 | if len(sys.argv[1:])<=1:  ### Indicates that there are insufficient number of command-line arguments
 10 |     print("Warning! wrong command, please read the mannual in Readme.txt.")
 11 |     print("Example: python parse_blastp_output.py --input input_filename --blastp_result peptide.blastp.out.txt --fasta Homo_sapiens.GRCh37.75.pep.all.fa --output output_filename")
 12 | else:
 13 |     options, remainder = getopt.getopt(sys.argv[1:],'', ['input=',
 14 |                                                          'blastp_result=',
 15 |                                                          'fasta=',
 16 |                                                          'output='])
 17 |     for opt, arg in options:
 18 |         if opt == '--input': input_file=arg
 19 |         elif opt == '--fasta': fasta_file=arg
 20 |         elif opt == '--blastp_result':blastp_file=arg
 21 |         elif opt == '--output': output_file=arg
 22 |         else:
 23 |             print("Warning! Command-line argument: %s not recognized. Exiting..." % opt); sys.exit()
 24 | 
 25 | input1= SeqIO.parse(fasta_file,"fasta") # blastp reference database
 26 | seqdb={}
 27 | for record in input1:
 28 |     seq=str(record.seq)
 29 |     if record.id not in seqdb:
 30 |         seqdb[record.id]=seq
 31 | 
 32 | print(len(seqdb))
 33 | 
 34 | input2= open(blastp_file,"r") # blastp output
 35 | input3= open(input_file,"r") # novel peptide tab txt
 36 | output= open(output_file,"w")
 37 | 
 38 | blastout={}
 39 | hits_dic={}
 40 | for line in input2:
 41 |     row=line.strip().split("\t")
 42 |     qid=row[0]
 43 |     sid=row[1]
 44 |     sseq=seqdb[sid]
 45 |     ident=row[2]
 46 |     peplen=int(row[3])
 47 |     alignlen=int(row[6])-int(row[5])+1
 48 |     sstart=int(row[7])
 49 |     send=int(row[8])
 50 |     mismatch=row[9]
 51 |     gap=row[12]
 52 |     alignseq=row[-3]
 53 |     evalue=float(row[-2])
 54 |     category="NA"
 55 |     single_sub_pos="NA"
 56 |     if sstart>3:
 57 |         Nterm_seq=sseq[sstart-4:sstart+2] #check up 3 amino acid before N-term of this peptide
 58 |     else:
 59 |         Nterm_seq=sseq[:sstart]
 60 | 
 61 |     if len(sseq)-send<3:
 62 |         Cterm_seq=sseq[send-1:]
 63 |     else:
 64 |         Cterm_seq=sseq[send-3:send+3]
 65 | 
 66 |     if alignlen==peplen:
 67 |         if float(ident)==100:
 68 |             category="match to known protein"
 69 |         
 70 |         elif int(gap)==0 and int(mismatch)==1:
 71 |             category="map to known protein with 1 aa mismatch"
 72 |             for i in range(peplen):
 73 |                 if qid[i]!=alignseq[i]:
 74 |                     single_sub_pos=str(i+1)
 75 | 
 76 |         elif int(gap)==1 and int(mismatch)==0:
 77 |             category="map to known protein with 1 aa insertion"
 78 |         else:
 79 |             category="novelpep (map to known protein with more than 2 mismatched aa)"
 80 |     elif peplen-alignlen==1 and float(ident)==100:
 81 |         category="map to known protein with 1 aa deletion"
 82 | 
 83 |     else:
 84 |         category="novelpep (map to known protein with more than 2 mismatched aa)"
 85 |     
 86 |     if qid not in hits_dic:
 87 |         hits_dic[qid]=evalue
 88 |         blastout[qid]=[category,sid,ident,peplen,single_sub_pos,Nterm_seq,alignseq,Cterm_seq,alignlen,mismatch,gap]
 89 |     else:
 90 |         if evalue<hits_dic[qid]:
 91 |             hits_dic[qid]=evalue
 92 |             blastout[qid]=[category,sid,ident,peplen,single_sub_pos,Nterm_seq,alignseq,Cterm_seq,alignlen,mismatch,gap]
 93 | 
 94 | header=input3.readline().strip().split("\t")
 95 | 
 96 | header=header+["blastp_category","blastp_match","identity","peplen","sub_pos","Nterm-seq(3aa)","aligned_seq","Cterm-seq(3aa)","alignlen","mismatch","gap"]
 97 | output.write("\t".join(header)+"\n")
 98 | 
 99 | for line in input3:
100 |     row=line.strip().split("\t")
101 |     peptide=row[0]
102 |     if peptide in blastout:
103 |         results=blastout[row[0]]
104 |         newrow=row+results
105 |         output.write("\t".join(map(str,newrow))+"\n")
106 |     else:
107 |         newrow=row+["novelpep (no match to known protein found)","NA","NA","NA","NA","NA","NA","NA","NA","NA","NA"]
108 |         output.write("\t".join(map(str,newrow))+"\n")
109 | 
110 | 
111 | input1.close()
112 | input2.close()
113 | input3.close()
114 | output.close()
115 | 


--------------------------------------------------------------------------------
/bin/parse_BLAT_out.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import sys
 4 | 
 5 | input1=open(sys.argv[1],"r")  # blat output plx
 6 | input2=open(sys.argv[2],"r") # novel peptide gene cor file
 7 | 
 8 | output=open(sys.argv[3],"w")
 9 | 
10 | # parse blat output and collect peptides mapped to multiple loci
11 | pep_dic={}
12 | pep_map={}
13 | pep_multi_loci={}
14 | pep_unique_loci={}
15 | 
16 | n=0
17 | for line in input1:
18 |     row=line.strip().split("\t")
19 |     n+=1
20 |     if row[0].isdigit():
21 |         pep=row[-2].replace(",","")
22 |         pep_dic[pep]=1
23 |         match=int(row[0])
24 |         strand=row[8][1]
25 |         qsize=int(row[10])
26 |         qstart=int(row[11])
27 |         qend=int(row[12])
28 |         chr=row[13]
29 |         chr_start=row[15]
30 |         chr_end=row[16]
31 |         if match==qsize and qend-qstart==qsize:
32 |             if pep not in pep_map:
33 |                 pep_map[pep]=[chr+"_"+chr_start+"_"+chr_end+"_"+strand]
34 |             else:
35 |                 pep_map[pep].append(chr+"_"+chr_start+"_"+chr_end+"_"+strand)
36 | 
37 | print("the number of peptides input for blat:" ,n)
38 | print("blat qsize=match peptides", len(pep_map))
39 | print("blat return peptides", len(pep_dic))
40 | 
41 | for pep in pep_map:
42 |     if len(pep_map[pep])>1:
43 |         pep_multi_loci[pep]=pep_map[pep]
44 |     elif len(pep_map[pep])==1:
45 |         pep_unique_loci[pep]=pep_map[pep]
46 | 
47 | print("blat multi map peptides",len(pep_multi_loci))
48 | print("blat unique map peptides",len(pep_unique_loci))
49 | 
50 | input2.readline()
51 | header = ["Bare peptide","blat_category","blat_match"]
52 | output.write("\t".join(header)+"\n")
53 | 
54 | for line in input2:
55 |     row=line.strip().split("\t")
56 |     pep=row[0]
57 |     blat_category = "unique location"
58 |     blat_result = ["No match found by BLAT"]
59 | 
60 |     if pep in pep_multi_loci:
61 |         blat_category = "multiple locations"
62 |         blat_result = pep_multi_loci[pep]
63 |     elif pep in pep_unique_loci:
64 |         blat_result = pep_unique_loci[pep]
65 |     blat_result = ';'.join(blat_result)
66 | 
67 |     output.write("\t".join([pep,blat_category,blat_result])+"\n")
68 | 
69 | input1.close()
70 | input2.close()
71 | output.close()
72 | 


--------------------------------------------------------------------------------
/bin/parse_annovar_out.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import sys
 4 | import getopt
 5 | 
 6 | 
 7 | if len(sys.argv[1:])<=1:  ### Indicates that there are insufficient number of command-line arguments
 8 |     print("Warning! wrong command, please read the mannual in Readme.txt.")
 9 |     print("Example: python prepare_annovar_input.py --input novpep.tab.txt --annovar_out novpep.variant_function --output novpep_anovar.txt")
10 | else:
11 |     options, remainder = getopt.getopt(sys.argv[1:],'', ['input=','output=', 'annovar_out='])
12 |     for opt, arg in options:
13 |         if opt == '--annovar_out': annovar_file=arg
14 |         elif opt == '--input': input_file=arg
15 |         elif opt == '--output': output_file=arg
16 |         else:
17 |             print("Warning! Command-line argument: %s not recognized. Exiting..." % opt); sys.exit()
18 | 
19 | 
20 | input1=open(annovar_file,"r") # annovar output
21 | input2=open(input_file,"r") # novel pep table
22 | output=open(output_file,"w")
23 | 
24 | category={}
25 | while True:
26 |     line1=input1.readline()
27 |     line2=input1.readline()
28 |     if not line2:break
29 |     row1=line1.strip().split("\t")
30 |     row2=line2.strip().split("\t")
31 |     pep=row1[-1].replace("Comments:Seq=","")
32 |     if row1[0]==row2[0]: # if the Nterm and Cterm of the peptide match to same functional class
33 |         category[pep]=[row1[0],row1[1]]
34 |     else:
35 |         fun=row1[0]+"-"+row2[0]
36 |         category[pep]=[fun,row1[1]+";"+row2[1]]
37 | 
38 | print("%d peptides returned with anovar annotation" % len(category))
39 | 
40 | input2.readline()
41 | header=["Bare peptide","anovar_category","associated_gene"]
42 | output.write("\t".join(header)+"\n")
43 | for line in input2:
44 |     row=line.strip().split("\t")
45 |     pep=row[0]
46 |     
47 |     anovar_cat = "NA"
48 |     associated_gene="NA"
49 |     if pep in category:
50 |         anovar_cat=category[pep][0]
51 |         associated_gene=category[pep][1]
52 | 
53 |     output.write("\t".join([pep,anovar_cat,associated_gene])+"\n")
54 | 
55 | input1.close()
56 | input2.close()
57 | output.close()
58 | 
59 | 
60 | 
61 | 
62 | 
63 | 
64 | 


--------------------------------------------------------------------------------
/bin/parse_spectrumAI_out.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import sys
 4 | import re
 5 | import getopt
 6 | 
 7 | 
 8 | if len(sys.argv[1:])<=1:  ### Indicates that there are insufficient number of command-line arguments
 9 |     print ("Warning! wrong command, please read the mannual in Readme.txt.")
10 |     print ("Example: python parse_spectrumAI_out.py --spectrumAI_out specAI_file --input input_psms.txt --output output_filename")
11 | else:
12 |     options, remainder = getopt.getopt(sys.argv[1:],'', ['spectrumAI_out=',
13 |                                                          'input=',
14 |                                                          'output='])
15 |     for opt, arg in options:
16 |         if opt == '--spectrumAI_out': input1_file=arg
17 |         elif opt == '--input':input2_file=arg
18 |         elif opt == '--output': output_file=arg
19 |         else:
20 |             print(("Warning! Command-line argument: %s not recognized. Exiting...") % opt); sys.exit()
21 | 
22 | input1=open(input1_file,"r")  ## SpectrumAI output
23 | input2=open(input2_file,"r")  ## novelpep table, peptide sequence is in first column
24 | output=open(output_file,"w")
25 | header2=input2.readline().strip().split("\t")
26 | header2 += ["SpectrumAI_result"]
27 | output.write("\t".join(header2)+"\n")
28 | 
29 | header1=input1.readline().split("\t")
30 | index1=header1.index("Peptide")
31 | try:
32 |     index2=header1.index("flanking_ions_support")
33 |     index3=header1.index("status")
34 | except ValueError:
35 |     print ("the SpecturmAI output is empty")
36 |     sys.exit()
37 | 
38 | specAI_result={}  # found with b,y ion support
39 | 
40 | for line in input1:
41 |     row=line.strip().split("\t")
42 |     pep=re.sub("[\W\d]","",row[index1].strip())
43 |     try:
44 |         if row[index3]=="checked":
45 |             try:
46 |                 specAI_result[pep].append(row[index2])
47 |             except KeyError:
48 |                 specAI_result[pep] = [row[index2]]
49 |     except IndexError:
50 |         print(("the line doesn't have the right number of columns"), line)
51 | 
52 | 
53 | n1=0
54 | n2=0
55 | for line in input2: # peptide sequence is in first column
56 |     row=line.strip().split("\t")
57 |     pep=re.sub("[\W\d]","",row[0].strip())
58 |     n1+=1
59 |     try:
60 |         if 'YES' in specAI_result[pep]:
61 |             row.append('PASS')
62 |             n2+=1
63 |         else:
64 |             row.append('FAIL')
65 |     except KeyError:
66 |         row.append('NA')
67 | 
68 |     output.write("\t".join(row)+"\n")
69 | 
70 | input1.close()
71 | input2.close()
72 | output.close()
73 | 
74 | print(("%d out of %d single substitution novel peptides passed SpectrumAI curation" % (n2,n1)))
75 | 


--------------------------------------------------------------------------------
/bin/peptide_pi_annotator.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | import re
  4 | import sys
  5 | import argparse
  6 | 
  7 | 
  8 | def main():
  9 |     if sys.argv[1:] == []:
 10 |         sys.argv.append('-h')
 11 |     args = parse_commandline()
 12 |     strips = {}
 13 |     if args.frac_col > 0:
 14 |         frac_col = args.frac_col - 1
 15 |     elif args.frac_col:
 16 |         frac_col = args.frac_col
 17 |     elif args.frac_colpattern:
 18 |         frac_col = get_col_by_pattern(args.peptable, args.frac_colpattern)
 19 |     else:
 20 |         raise RuntimeError('Must define fraction column')
 21 |     if args.stripcol > 0:
 22 |         stripcol = args.stripcol - 1
 23 |     elif args.stripcol:
 24 |         stripcol = args.stripcol
 25 |     elif args.stripcolpattern:
 26 |         stripcol = get_col_by_pattern(args.peptable, args.stripcolpattern)
 27 |     else:
 28 |         raise RuntimeError('Must define strip column')
 29 |     if args.pepcol:
 30 |         pepcol = args.pepcol - 1
 31 |     elif args.pepcolpattern:
 32 |         pepcol = get_col_by_pattern(args.peptable, args.pepcolpattern)
 33 |     else:
 34 |         raise RuntimeError('Must define peptide sequence column')
 35 |     for i, strip in enumerate(args.pipatterns):
 36 |         strips[strip] = {'intercept': args.intercepts[i],
 37 |                          'fr_width': args.fr_width[i]}
 38 |     with open(args.outpeptable, 'w') as fp:
 39 |         for outline in annotate_peptable(args.pipeps, args.peptable, pepcol,
 40 |                                          frac_col, stripcol, strips,
 41 |                                          args.ignoremods):
 42 |             fp.write('\t'.join([str(x) for x in outline]))
 43 |             fp.write('\n')
 44 | 
 45 | 
 46 | def get_strip(strips, string):
 47 |     for pattern in strips.keys():
 48 |         if re.search(pattern, string):
 49 |             return strips[pattern]
 50 |     return False
 51 | 
 52 | 
 53 | def get_col_by_pattern(peptable, colpattern):
 54 |     with open(peptable) as fp:
 55 |         header = next(fp).strip('\n').split('\t')
 56 |     for ix, field in enumerate(header):
 57 |         if colpattern in field:
 58 |             return ix
 59 | 
 60 | 
 61 | def annotate_peptable(predicted_peps_fn, peptable, seqcol, frac_col, stripcol,
 62 |                       strips, ignoremods):
 63 |     predicted_peps = {}
 64 |     with open(predicted_peps_fn) as fp:
 65 |         for line in fp:
 66 |             line = line.strip('\n').split('\t')
 67 |             predicted_peps[line[0]] = line[1]
 68 |     not_predicted_count, predicted_count = 0, 0
 69 |     with open(peptable) as fp:
 70 |         header = next(fp).strip('\n').split('\t')
 71 |         yield header + ['Experimental pI', 'Predicted pI', 'Delta pI']
 72 |         for line in fp:
 73 |             line = line.strip('\n').split('\t')
 74 |             sequence = line[seqcol]
 75 |             for weight in ignoremods:
 76 |                 if weight == '*':
 77 |                     regex = '[+-]\d*\.\d*'
 78 |                 else:
 79 |                     regex = '[+-]{}'.format(weight)
 80 |                 sequence = re.sub(regex, '', sequence)
 81 |             try:
 82 |                 pred_pi = float(predicted_peps[sequence])
 83 |             except KeyError:
 84 |                 print('CANNOT PREDICT', sequence)
 85 |                 not_predicted_count += 1
 86 |                 pred_pi, delta_pi = 'NA', 'NA'
 87 |             else:
 88 |                 predicted_count += 1
 89 |             strip = get_strip(strips, line[stripcol])
 90 |             if not strip:
 91 |                 exp_pi, delta_pi = 'NA', 'NA'
 92 |             else:
 93 |                 try:
 94 |                     exp_pi = (strip['fr_width'] * int(line[frac_col]) +
 95 |                               strip['intercept'])
 96 |                 except ValueError:
 97 |                     print('Cannot detect fraction for PSM {}'.format(sequence))
 98 |                     exp_pi, delta_pi = 'NA', 'NA'
 99 |                 else:
100 |                     if pred_pi != 'NA':
101 |                         delta_pi = exp_pi - pred_pi
102 |                     else:
103 |                         delta_pi = 'NA'
104 |             yield line + [exp_pi, pred_pi, delta_pi]
105 |     print('Number of peptides without pI prediction: {}\n'
106 |           'Number of peptides predicted: {}\n'.format(not_predicted_count,
107 |                                                       predicted_count))
108 | 
109 | 
110 | def parse_commandline():
111 |     parser = argparse.ArgumentParser(
112 |         formatter_class=argparse.RawTextHelpFormatter)
113 |     parser.add_argument('--out', dest='outpeptable', help='Output peptide '
114 |                         'table')
115 |     parser.add_argument('-p', dest='peptable', help='Peptide/PSM table with '
116 |                         'peptides, FDR, fraction numbers. Used to calculate'
117 |                         'pI shift.')
118 |     parser.add_argument('-i', dest='pipeps', help='A tab-separated txt file '
119 |                         'with peptide seq, pI value')
120 |     parser.add_argument('--pepcolpattern', dest='pepcolpattern',
121 |                         help='Peptide sequence column pattern in peptide '
122 |                         'table.', default=False, type=str)
123 |     parser.add_argument('--pepcol', dest='pepcol', help='Peptide sequence '
124 |                         'column number in peptide table. First column is 1.',
125 |                         default=False, type=int)
126 |     parser.add_argument('--fraccolpattern', dest='frac_colpattern',
127 |                         help='Fraction number column pattern in peptide '
128 |                         'table.', default=False, type=str)
129 |     parser.add_argument('--fraccol', dest='frac_col', help='Fraction number '
130 |                         'column number in peptide table. First column is 1.',
131 |                         default=False, type=int)
132 |     parser.add_argument('--ignoremods', dest='ignoremods', help='Regex to '
133 |                         'identify modification weights to be ignored.',
134 |                         default=[], nargs='+', type=str)
135 |     parser.add_argument('--stripcolpattern', dest='stripcolpattern',
136 |                         help='Strip name column pattern in peptide '
137 |                         'table.', type=str, default=False)
138 |     parser.add_argument('--stripcol', dest='stripcol', help='Strip name '
139 |                         'column number in peptide table. Will be used to '
140 |                         'detect strips if multiple are present using pattern '
141 |                         'passed with --strippatterns. First column is nr. 1.',
142 |                         default=False, type=int)
143 |     parser.add_argument('--strippatterns', dest='pipatterns',
144 |                         help='Patterns to detect different pI ranges from e.g.'
145 |                         ' file name in peptide table', nargs='+')
146 |     parser.add_argument('--intercepts', dest='intercepts',
147 |                         help='pI Intercept of strips', nargs='+', type=float)
148 |     parser.add_argument('--widths', dest='fr_width', nargs='+',
149 |                         help='Strip fraction widths in pI', type=float)
150 |     return parser.parse_args(sys.argv[1:])
151 | 
152 | 
153 | if __name__ == '__main__':
154 |     main()
155 | 


--------------------------------------------------------------------------------
/bin/pi_database_splitter.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | 
  4 | import sys
  5 | import argparse
  6 | import json
  7 | from numpy import median
  8 | from contextlib import ExitStack
  9 | 
 10 | from peptide_pi_annotator import get_col_by_pattern
 11 | 
 12 | 
 13 | def main():
 14 |     if sys.argv[1:] == []:
 15 |         sys.argv.append('-h')
 16 |     args = parse_commandline()
 17 |     with open(args.stripdef) as fp:
 18 |         strip = json.load(fp)
 19 |     if 'intercept' in strip:
 20 |         strip = {'1-{}'.format(strip['fr_amount']): strip}
 21 |     # Column nrs should start from 0
 22 |     # If negative, -1 is last item in list, etc
 23 |     if args.fdrcol > 0:
 24 |         fdrcol = args.fdrcol - 1
 25 |     elif args.fdrcol:
 26 |         fdrcol = args.fdrcol
 27 |     elif args.fdrcolpattern:
 28 |         fdrcol = get_col_by_pattern(args.train_peptable, args.fdrcolpattern)
 29 |     else:
 30 |         fdrcol = False
 31 |     if args.deltapicol > 0:
 32 |         deltapicol = args.deltapicol - 1
 33 |     elif args.deltapicol:
 34 |         deltapicol = args.deltapicol
 35 |     elif args.deltapicolpattern:
 36 |         deltapicol = get_col_by_pattern(args.train_peptable,
 37 |                                         args.deltapicolpattern)
 38 |     else:
 39 |         deltapicol = False
 40 |     if args.frac_colpattern:
 41 |         frac_col = get_col_by_pattern(args.train_peptable, args.frac_colpattern)
 42 |     else:
 43 |         raise RuntimeError('Must define fraction column')
 44 |     for striprange, stripdata in strip.items():
 45 |         frrange = striprange.split('-') 
 46 |         stripdata['fr_range'] =  (int(frrange[0]), int(frrange[1])) 
 47 |         stripdata['pishift'] = get_pishift(args.train_peptable, fdrcol, deltapicol, frac_col,
 48 |                 stripdata['fr_range'], args.fdrcutoff, args.picutoff)
 49 |     locfun = locatefraction if not stripdata['reverse'] else reverse_locatefraction
 50 |     binarray = get_bin_array(strip)
 51 |     write_fractions(args.pipeps, strip, binarray, locfun, args.minlen,
 52 |                     args.maxlen)
 53 | 
 54 | 
 55 | def locatefraction(pep_pi, bins):
 56 |     index = []
 57 |     for pibin in bins:
 58 |         if pep_pi > pibin[2]:
 59 |             continue
 60 |         elif pep_pi >= pibin[1]:
 61 |             index.append(pibin[0])
 62 |         else:
 63 |             return index
 64 |     return index
 65 | 
 66 | 
 67 | def reverse_locatefraction(pep_pi, bins):
 68 |     index = []
 69 |     for pibin in bins:
 70 |         if pep_pi < pibin[1]:
 71 |             continue
 72 |         elif pep_pi < pibin[2]:
 73 |             index.append(pibin[0])
 74 |         else:
 75 |             return index
 76 |     return index
 77 | 
 78 | 
 79 | def parse_commandline():
 80 |     parser = argparse.ArgumentParser(
 81 |         formatter_class=argparse.RawTextHelpFormatter)
 82 |     parser.add_argument('-p', dest='train_peptable', help='Peptide table with '
 83 |                         'peptides, FDR, and fraction numbers. Used to '
 84 |                         'calculate pI shift. Leave emtpy for no shift. '
 85 |                         'Tab separated file.')
 86 |     parser.add_argument('--deltacol', dest='deltapicol', help='Delta pI column'
 87 |                         ' number in peptide table. First column is nr. 1. '
 88 |                         'Negative number for counting from last col '
 89 |                         '(-1 is last).', default=False, type=int)
 90 |     parser.add_argument('--deltacolpattern', dest='deltapicolpattern',
 91 |                         help='Delta pI column header pattern in peptide '
 92 |                         'table.', default=False, type=str)
 93 |     parser.add_argument('--fraccolpattern', dest='frac_colpattern',
 94 |                         help='Fraction number column pattern in peptide '
 95 |                         'table.', default=False, type=str)
 96 |     parser.add_argument('--picutoff', dest='picutoff',
 97 |                         help='delta pI value to filter experimental peptides'
 98 |                         ' when calculating pi shift.', default=0.2, type=float)
 99 |     parser.add_argument('--fdrcolpattern', dest='fdrcolpattern',
100 |                         help='FDR column header pattern in peptide table.',
101 |                         default=False, type=str)
102 |     parser.add_argument('--fdrcol', dest='fdrcol', help='FDR column number in '
103 |                         'peptide table. First column is nr. 1. Empty includes '
104 |                         'all peptides', default=False, type=int)
105 |     parser.add_argument('--fdrcutoff', dest='fdrcutoff',
106 |                         help='FDR cutoff value to filter experimental peptides'
107 |                         ' when calculating pi shift.', default=0, type=float)
108 |     parser.add_argument('-i', dest='pipeps', help='A tab-separated txt file '
109 |                         'with accession, peptide seq, pI value')
110 |     parser.add_argument('--stripdef', dest='stripdef', help='Strip file generated by pipeline')
111 |     parser.add_argument('--minlen', dest='minlen', help='Minimal peptide length',
112 |                         type=int)
113 |     parser.add_argument('--maxlen', dest='maxlen', help='Maximal peptide length',
114 |                         type=int, default=False)
115 |     return parser.parse_args(sys.argv[1:])
116 | 
117 | 
118 | def get_pishift(peptable, fdrcol, deltapicol, frac_col, striprange, fdrcutoff, delta_pi_cutoff):
119 |     delta_pis = []
120 |     with open(peptable) as fp:
121 |         next(fp)  # skip header
122 |         for line in fp:
123 |             line = line.strip('\n').split('\t')
124 |             if fdrcol:
125 |                 try:
126 |                     fdr = float(line[fdrcol])
127 |                 except ValueError:
128 |                     continue
129 |                 if fdr > fdrcutoff:
130 |                     continue
131 |             try:
132 |                 delta_pi = float(line[deltapicol])
133 |             except ValueError:
134 |                 continue
135 |             if striprange[0] <= int(line[frac_col]) <= striprange[1] and delta_pi < delta_pi_cutoff:
136 |                 delta_pis.append(delta_pi)
137 |     shift = median(delta_pis)
138 |     print('pI shift (median of delta pIs) for fraction range {} - {}: {}'.format(
139 |         striprange[0], striprange[1], shift))
140 |     return shift
141 | 
142 |     
143 | def get_bin_array(strip):
144 |     bin_array = []
145 |     intervals = sorted([iv for iv in strip.values()], key=lambda x: x['fr_range'][0])
146 |     frnr = intervals[0]['fr_range'][0]
147 |     while frnr <= intervals[-1]['fr_range'][1]:
148 |         for striprange in intervals:
149 |             if striprange['fr_range'][0] <= frnr <= striprange['fr_range'][1]:
150 |                 break
151 |         pi_center = striprange['fr_width'] * frnr + striprange['intercept']
152 |         bin_left = pi_center - striprange['fr_width'] / 2 - striprange['tolerance'] - striprange['pishift']
153 |         bin_right = pi_center + striprange['fr_width'] / 2 + striprange['tolerance'] - striprange['pishift']
154 |         print('Bins in fraction', frnr, bin_left, bin_right)
155 |         bin_array.append((frnr, bin_left, bin_right))
156 |         frnr += 1
157 |     return bin_array
158 | 
159 | 
160 | def write_fractions(pi_peptides_fn, strip, bin_array, locate_function, minlen, maxlen):
161 |     amount_fractions = sum([x['fr_amount'] for x in strip.values()])
162 |     amountpad = len(str(amount_fractions))
163 |     with ExitStack() as stack:
164 |         target_out_fp = {frnr: ([], stack.enter_context(
165 |             open('target_fr{i:0{pad}}.fasta'.format(i=frnr, pad=amountpad), 'w')))
166 |             for frnr in range(1, amount_fractions + 1)}
167 |         input_fp = stack.enter_context(open(pi_peptides_fn))
168 |         pepcount = 0
169 |         for line in input_fp:
170 |             accs, pep, pi = line.strip().split("\t")
171 |             pi = float(pi)
172 |             if maxlen and len(pep) > maxlen:
173 |                 continue
174 |             elif len(pep) >= minlen:
175 |                 pepcount += 1
176 |                 for i in locate_function(pi, bin_array):
177 |                     target_out_fp[i][0].append('>{}\n{}\n'.format(accs, pep))
178 |             if pepcount > 1000000:
179 |                 # write in chunks to make it go faster
180 |                 pepcount = 0
181 |                 [fp.write(''.join(peps)) for peps, fp in
182 |                  target_out_fp.values()]
183 |                 target_out_fp = {fr: ([], pep_fp[1])
184 |                                  for fr, pep_fp in target_out_fp.items()}
185 |         [fp.write(''.join(peps)) for peps, fp in target_out_fp.values()]
186 | 
187 | 
188 | if __name__ == '__main__':
189 |     main()
190 | 


--------------------------------------------------------------------------------
/bin/reverse_decoy.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | from Bio import SeqIO
 3 | import sys
 4 | 
 5 | 
 6 | with open(sys.argv[1]) as fp, open('decoy_{}'.format(sys.argv[1]), 'w') as wfp:
 7 |   for target in SeqIO.parse(fp, 'fasta'):
 8 |     decoy = target[::-1] 
 9 |     decoy.description = decoy.description.replace('ENS', 'decoy_ENS')
10 |     decoy.id = 'decoy_{}'.format(decoy.id)
11 |     SeqIO.write(decoy, wfp, 'fasta')
12 | 


--------------------------------------------------------------------------------
/bin/scan_bams.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | 
  3 | import sys
  4 | import os
  5 | import getopt
  6 | import numpy as np
  7 | import re
  8 | from pysam import *
  9 | 
 10 | class PEPTIDE(object):
 11 |     def __init__(self,ID=None,seq=None,type=None,chr=None,strand=None,start=0,end=0,splice_start=0,splice_end=0):
 12 |         self.ID=ID
 13 |         self.seq=seq
 14 |         self.type=type
 15 |         self.start=start  #chromosome start coordinate
 16 |         self.end=end  #chromosome end coordinate
 17 |         self.strand=strand
 18 |         self.splice_start=splice_start
 19 |         self.splice_end=splice_end
 20 |         self.chr=chr
 21 | 
 22 | ## Function definitions.
 23 | 
 24 | # Determine if an alignment has few enough mismatches to pass.
 25 | def mismatches_ok(aln, max_mismatches=1):
 26 |     try:
 27 |         nm = aln.get_tag('nM')
 28 |     except:
 29 |         try:
 30 |             nm = aln.get_tag('NM')
 31 |         except:
 32 |             return(-1) # Could not find a tag for number of mismatches
 33 |     return (nm <= max_mismatches)
 34 | 
 35 | # Is pairing OK? Single-end passes automatically; paired-end passes if properly paired.
 36 | def pairing_ok(aln):
 37 |     if not aln.is_paired:
 38 |         return True
 39 |     elif aln.is_proper_pair:
 40 |         return True
 41 |     else:
 42 |         return False
 43 | 
 44 | # Is the read multi-mapping? Fail if so
 45 | def multimapping_ok(aln, max_loci=1):
 46 |     try:
 47 |         if aln.get_tag('NH') > max_loci:
 48 |             return False
 49 |         else:
 50 |             return True
 51 |     except:
 52 |         try:
 53 |             if aln.get_tag('XT') == 'R': # XT:A:U means unique, XT:A:R means repeat, apparently
 54 |                 return False
 55 |             else:
 56 |                 return True
 57 |         except:
 58 |             return(-1) # Could not find a tag for number of loci
 59 | 
 60 | 
 61 | def find_eligible_alns(region, bamfile, max_mismatches=1):
 62 |     good_alns = []
 63 |     try:
 64 |         iter = bamfile.fetch(region[0], region[1], region[2])
 65 |     except:
 66 |         sys.exit("Region " + region[0] + ' ' + str(region[1]) + ' ' + str(region[2]) + '\nBAM file' + bamfile + '\nMake sure that you have an indexed BAM file!')
 67 |     for x in iter:
 68 |         if mismatches_ok(x) and pairing_ok(x) and multimapping_ok(x):
 69 |             good_alns.append(x)
 70 |     return(good_alns)
 71 | 
 72 | 
 73 | def get_overlap(s1, e1, s2, e2):
 74 |     """
 75 |         Get the coordinates of the overlap between two intervals
 76 |         """
 77 |     if s1 > e2 or e1 < s2: return None
 78 |     if s1 <= s2 and e1 <= e2: return (s2, e1) # Will also work for s1 == s2 and e1 == e2
 79 |     if s1 <= s2 and e1 >= e2: return (s2, e2) # Alignment enclosed in peptide
 80 |     if s1 >= s2 and e1 <= e2: return (s1, e1) # Peptide enclosed in alignment
 81 |     if s1 >= s2 and e1 >= e2: return (s1, e2)
 82 |     sys.exit('Check your numbers')
 83 | 
 84 | ################  Comand-line arguments ################
 85 | 
 86 | 
 87 | if len(sys.argv[1:])<=1:  ### Indicates that there are insufficient number of command-line arguments
 88 |     print("Warning! wrong command, please read the manual in Readme.txt.")
 89 |     print("Example: python scanBam.py --gff_input novelpep.gff3 --bam_files bam_files_list.txt --output novelpep_readcount.txt")
 90 |     sys.exit()
 91 | else:
 92 |     options, remainder = getopt.getopt(sys.argv[1:],'', ['gff_input=',
 93 |                                                          'bam_files=',
 94 |                                                          'output='])
 95 |     for opt, arg in options:
 96 |         if opt == '--gff_input': gff_file=arg
 97 |         elif opt == '--bam_files': bam_files=arg
 98 |         elif opt == '--output': out_file=arg
 99 |         else:
100 |             print("Warning! Command-line argument: %s not recognized. Exiting...") % opt; sys.exit()
101 | 
102 | 
103 | ### pair-end reads need to be properly paired
104 | ### reads map to single location
105 | ### reads with max 1 mismatch
106 | ### reads mapped to the same strand of the peptide
107 | ### splice juncton peptides are only counted supported when found with corresponding
108 | ### reads overlap with peptide with minimum 1 nucleotide
109 | 
110 | input1=open(gff_file,"r")
111 | input2=open(bam_files,"r")
112 | output=open(out_file,"w")
113 | 
114 | pep_dic={} # store peptide object with sequence as key
115 | for line in input1:
116 |     if line[0]!="#":
117 |         row=line.strip().split("\t")
118 |         if row[8].startswith("ID"): continue
119 |         #seq=row[8].replace("ID=","")
120 |         seq=row[8].replace("Parent=","")
121 |         if seq not in pep_dic:
122 |             pep_dic[seq]=PEPTIDE(ID=seq,seq=seq,chr=row[0],start=row[3],end=row[4],strand=row[6],type="continuous")
123 |         else:
124 |             pep_dic[seq].type="spliced"
125 |             if pep_dic[seq].start==row[3]:
126 |                 pep_dic[seq].splice_start=row[4]
127 |             elif pep_dic[seq].end==row[4]:
128 |                 pep_dic[seq].splice_start=row[3]
129 | 
130 | 
131 | print(len(pep_dic))
132 | 
133 | aln_table = {}
134 | 
135 | for line in input2:
136 |     # add code to process bam files
137 |     bam = line.strip()
138 |     sys.stderr.write(bam + '\n')
139 |     aln_count = {} # A dictionary that will collect eligible alignment counts by peptide.
140 |     bamfile = AlignmentFile(bam,"rb")
141 |     for seq in pep_dic:
142 |         peptide=pep_dic[seq]
143 |         aln_count[peptide.seq]=0
144 |         region = [peptide.chr, int(peptide.start), int(peptide.end)]
145 |         # Find the alignments that could be eligible for counting based on multimapping, duplication, overall mismatches
146 |         eligible = find_eligible_alns(region, bamfile)
147 |         for aln in eligible:
148 |             if not 'N' in aln.cigarstring:
149 |                 # Unspliced alignment. This means that we only need to check that there is no mismatch in the peptide region.
150 |                 aln_count[peptide.seq]+=1
151 |                 # Spliced alignment. We have to check where the aligned segments actually are.
152 |                 ct = aln.cigartuples
153 |                 curr_loc = aln.reference_start
154 |                 aln_starts = []
155 |                 aln_ends = []
156 |                 for tup in ct:
157 |                     if tup[0] == 0:
158 |                         aln_starts.append(curr_loc)
159 |                         aln_ends.append(curr_loc + tup[1])
160 |                     curr_loc += tup[1]
161 |                 # Accept any segment that overlaps the peptide without mi
162 |                 matching_overlap = False
163 |                 overlap = False
164 |                 for e in zip(aln_starts, aln_ends):
165 |                     ol = get_overlap(region[1], region[2], e[0], e[1])
166 |                     if ol:
167 |                         overlap = True
168 |                 if overlap:
169 |                     aln_count[peptide.seq] += 1
170 |     #print(bam + " " + peptide.seq + " " + str(aln_count[peptide.seq]))
171 |     aln_table[bam] = aln_count
172 | 
173 | input1.close()
174 | input2.close()
175 | 
176 | bam_files = sorted(list(aln_table.keys()))
177 | 
178 | # Write output file header.
179 | output.write('Bare peptide\t')
180 | for i in range(0,len(bam_files)):
181 |     output.write(bam_files[i].split('.')[0].split('/')[-1])
182 |     if (i < (len(bam_files)-1)): output.write('\t')
183 |     else: output.write('\n')
184 | 
185 | # And the counts.
186 | for pep in sorted(aln_table[bam_files[0]].keys()):
187 |     output.write(pep + '\t')
188 |     for i in range(0,len(bam_files)): 
189 |         output.write(str(aln_table[bam_files[i]][pep]))
190 |         if (i < (len(bam_files)-1)): output.write('\t')
191 |     output.write('\n')
192 | 


--------------------------------------------------------------------------------
/conf/base.config:
--------------------------------------------------------------------------------
 1 | params {
 2 |   threadspercore = 2
 3 |   clusterOptions = false
 4 |   max_memory = 128769.MB
 5 |   max_cpus = 16
 6 |   max_time = 240.h
 7 | }
 8 | 
 9 | process {
10 |   container = params.container
11 | 
12 |   cpus = { check_max( 1 * task.attempt, 'cpus') }
13 |   memory = { check_max( 16.GB * task.attempt, 'memory') }
14 |   time = { check_max( 2.h * task.attempt, 'time') }
15 |   errorStrategy = { task.exitStatus in [143, 137] ? 'retry' : 'terminate' }
16 | 
17 |   withName: msgfPlus {
18 |     time = { check_max( 6.h * task.attempt, 'time' ) }
19 |     cpus = { Runtime.runtime.availableProcessors() < 4 ? Runtime.runtime.availableProcessors() : 4 }
20 |     memory = { (db.size() >> 30) < 1 ? 16.GB : "${db.size() * 16}B"  }
21 |   }
22 | 
23 |   withName: percolator {
24 |     cpus = 2
25 |     time = { check_max( 5.h * task.attempt, 'time' ) }
26 |   }
27 | }
28 | 


--------------------------------------------------------------------------------
/conf/sixft.config:
--------------------------------------------------------------------------------
 1 | params {
 2 |   novheaders = '^chr;^decoy_chr'
 3 |   varheaders = false
 4 |   strips = ['3-10': [intercept: 3.5478, fr_width: 0.0676, tolerance: 0.11, fr_amount: 72, reverse: false],
 5 |             '3.7-4.9': [intercept: 3.5959, fr_width: 0.0174, tolerance: 0.08, fr_amount: 72, reverse: false],
 6 |             '11-6': [intercept: 10.3936, fr_width: -0.0762, tolerance: 0.11, fr_amount: 60, reverse: true],
 7 |             '6-9': [intercept: 6.1159, fr_width: 0.0336, pi_tolerance: 0.11, fr_amount: 72, reverse: false],
 8 |             '3.4-4.8': ['1-21': [intercept: 3.4395, fr_width: 0.0221, tolerance: 0.08, fr_amount: 21, reverse: false],
 9 |                         '22-64': [intercept: 3.6374, fr_width: 0.0128, tolerance: 0.08, fr_amount: 43, reverse: false],
10 |                         '65-72': [intercept: 1.7364, fr_width: 0.0424, tolerance: 0.08, fr_amount: 8, reverse: false],
11 |             ],
12 |   ]
13 | }
14 | 


--------------------------------------------------------------------------------
/conf/uppmax.config:
--------------------------------------------------------------------------------
 1 | singularity {
 2 |   enabled = true
 3 | }
 4 | 
 5 | process {
 6 |   executor = 'slurm'
 7 |   clusterOptions = { "-A $params.project -M $params.cluster ${params.clusterOptions ?: ''}" }
 8 |   //clusterOptions = { "-A $params.project ${params.clusterOptions ?: ''}" }
 9 |   //errorStrategy = { task.exitStatus == 1 && task.stdout.contains('Socket timed out on send/recv') ? 'retry' : 'terminate' }
10 |   maxRetries: 3
11 | 
12 |   withName: 'make.*Seq' {
13 |     scratch = '$SNIC_TMP'
14 |   }
15 |   withName: 'concatFasta' {
16 |     scratch = '$SNIC_TMP'
17 |   }
18 |   withName: createSpectraLookup {
19 |     time = { check_max( 0.05.h * mzmlfiles.size(), 'time' ) }
20 |   }
21 |   withName: msgfPlus {
22 |     memory = { check_max( { (mem >> 20 < 8192) ? 8192.MB : mem as nextflow.util.MemoryUnit}.call(), 'memory')}
23 |     cpus = 4
24 |   }
25 |   
26 |   withName: createPSMTables {
27 |     scratch = '$SNIC_TMP'
28 |     time = { check_max( 0.002.h * mzmlcount, 'time' ) } // 500 files, 1h
29 |   }
30 |   withName: percolator {
31 |     cpus = 2
32 |   }
33 |   withName: mergeSetPSMtable {
34 |     scratch = '$SNIC_TMP'
35 |   }
36 | }
37 | 
38 | params {
39 |   clusterOptions = false
40 |   cluster = 'rackham'
41 |   saveReference = true
42 |   // Max resources requested by a normal node on milou. If you need more memory, run on a fat node using:
43 |   //   --clusterOptions "-C mem512GB" --max_memory "512GB"
44 |   max_memory = '32 GB'
45 |   max_cpus = 16
46 |   max_time = 240.h
47 |   // illumina iGenomes reference file paths on UPPMAX
48 |   igenomes_base = '/sw/data/uppnex/igenomes/'
49 | }
50 | 


--------------------------------------------------------------------------------
/envs/environment.yml:
--------------------------------------------------------------------------------
 1 | name: ipaw-0.5
 2 | channels:
 3 |   - bioconda
 4 |   - conda-forge
 5 |   - defaults
 6 | dependencies:
 7 |   - bioconda::msgf_plus=2020.03.14
 8 |   - bioconda::openms=2.5.0
 9 |   - bioconda::msstitch=3.13
10 |   - conda-forge::biopython=1.72
11 |   - bioconda::percolator=3.4
12 |   - bioconda::blast=2.9.0
13 |   - bioconda::pybigwig=0.3.17
14 |   - bioconda::pysam=0.15.3
15 |   - bioconda::ucsc-blat=377
16 |     # annovar
17 |   - conda-forge:perl=5.26.2
18 |     # SpecAI
19 |   - conda-forge::r-protviz=0.4.0
20 |   - bioconda::bioconductor-msnbase=2.10.1
21 | 


--------------------------------------------------------------------------------
/main.nf:
--------------------------------------------------------------------------------
   1 | #!/usr/bin/env nextflow
   2 | 
   3 | /*
   4 | vim: syntax=groovy
   5 | -*- mode: groovy;-*-
   6 | 
   7 | ==============================
   8 | IPAW: HiRIEF II varDB pipeline
   9 | ==============================
  10 | @Authors
  11 | Jorrit Boekel @glormph
  12 | Yafeng Zhu @yafeng
  13 | 
  14 | https://github.com/lehtiolab/proteogenomics-analysis-workflow
  15 | */
  16 | 
  17 | nf_required_version = '19.04.0'
  18 | if( ! nextflow.version.matches(">= ${nf_required_version}") ){
  19 |   println("Nextflow version too old, ${nf_required_version} required")
  20 |   exit(1)
  21 | }
  22 | 
  23 | 
  24 | mods = file(params.mods)
  25 | knownproteins = file(params.knownproteins)
  26 | blastdb = file(params.blastdb)
  27 | gtffile = file(params.gtf)
  28 | snpfa = file(params.snpfa)
  29 | dbsnp = params.dbsnp ? file(params.dbsnp) : false
  30 | cosmic = params.cosmic ? file(params.cosmic) : false
  31 | genomefa = file(params.genome)
  32 | tdb = file(params.tdb)
  33 | normalpsms = params.normalpsms ? file(params.normalpsms) : false
  34 | 
  35 | 
  36 | 
  37 | /* PIPELINE START */
  38 | 
  39 | // Either feed an mzmldef file (tab separated lines with filepath\tsetname), or /path/to/\*.mzML
  40 | if (!params.mzmldef && !params.input) {
  41 |   Channel
  42 |     .fromPath(params.mzmls)
  43 |     .map { it -> [it, 'NA'] }
  44 |     .set { mzml_in }
  45 | } else {
  46 |   header = ['mzmlfile', 'setname', 'plate', 'fraction']
  47 |   mzmldef = params.mzmldef ?: params.input
  48 |   mzmllines = file(mzmldef).readLines().collect { it.tokenize('\t') }
  49 |   if (mzmllines[0] == header) {
  50 |     /* As above, future use with pushing files with a header becomes enabled, as long as
  51 |     they use this header format. We cannot do module importing etc yet, have to use DSL2
  52 |     for that. That is something to strive for in the future.
  53 |     */
  54 |     mzmllines.remove(0)
  55 |   }
  56 |   Channel
  57 |     .from(mzmllines)
  58 |     .set { mzml_in }
  59 | }
  60 | 
  61 | 
  62 | // Isobaric input parsing to setisobaric and setdenoms maps
  63 | // example: --isobaric 'set1:tmt10plex:127N:128N set2:tmtpro:sweep set3:itraq8plex:intensity'
  64 | isop = params.isobaric ? params.isobaric.tokenize(' ') : false
  65 | setisobaric = isop ? isop.collect() {
  66 |   y -> y.tokenize(':')
  67 | }.collectEntries() {
  68 |   x-> [x[0], x[1]]
  69 | } : false
  70 | // FIXME add non-isobaric sets here if we have any mixed-in?
  71 | setdenoms = isop ? isop.collect() {
  72 |   y -> y.tokenize(':')
  73 | }.collectEntries() {
  74 |   x-> [x[0], x[2..-1]]
  75 | } : false
  76 | 
  77 | 
  78 | mzml_in
  79 |   .tap { sets; mzmlcounter }
  80 |   .map { it -> [file(it[0]), it[1], it[2] ? it[2] : 'NA', it[3] ? it[3].toInteger() : 'NA' ]} // create file, set plate and fraction to NA if there is none
  81 |   .tap { strips }
  82 |   .map { it -> [it[1], it[0].baseName.replaceFirst(/.*\/(\S+)\.mzML/, "\$1"), it[0], it[2], it[3]] }
  83 |   .into { mzmlfiles; groupset_mzmls; mzml_isobaric; mzml_premsgf }
  84 | 
  85 | mzmlcounter
  86 |   .count()
  87 |   .subscribe { println "$it mzML files in analysis" }
  88 |   .set { mzmlcount_psm }
  89 | 
  90 | sets
  91 |   .map{ it -> it[1] }
  92 |   .unique()
  93 |   .tap { sets_for_emtpybam; sets_for_denoms; sets_for_six }
  94 |   .collect()
  95 |   .subscribe { println "Detected setnames: ${it.join(', ')}" }
  96 | 
  97 | 
  98 | strips
  99 |   .map { it -> [it[1], it[2]] }
 100 |   .unique()
 101 |   .groupTuple()
 102 |   .set { strips_for_six }
 103 | 
 104 | if (params.pisepdb) {
 105 |   sets_for_six
 106 |     .toList()
 107 |     .map { it -> [it, normalpsms]}
 108 |     .set { normpsms }
 109 | } else {
 110 |   sets_for_six.set{ normpsms }
 111 | }
 112 | 
 113 | process splitSetNormalSearchPsms {
 114 |   //  normal search psm table, split on set col, collect files
 115 | 
 116 |   when: params.pisepdb
 117 |   input:
 118 |   set val(setnames), file('normalpsms') from normpsms
 119 |   output:
 120 |   set val(setnames), file({setnames.collect() { it + '.tsv' }}) into setnormpsms
 121 |   """
 122 |   msstitch split -i normalpsms --splitcol bioset
 123 |   """
 124 | }
 125 | 
 126 | setnormpsms
 127 |   .transpose()
 128 |   .join(strips_for_six)
 129 |   .set { setnormpsmtable } 
 130 | 
 131 | process splitPlateNormalSearchPsms {
 132 |   // create pep tables, split on plate, collect
 133 | 
 134 |   when: params.pisepdb
 135 |   input:
 136 |   set val(setname), file(normpsm), val(stripnames) from setnormpsmtable
 137 |   output:
 138 |   set val(setname), val(stripnames), file({stripnames.collect() { it + '.tsv' }}) into setplatepsms
 139 |   """
 140 |   msstitch split -i $normpsm --splitcol `python -c 'with open("$normpsm") as fp: h=next(fp).strip().split("\\t");print(h.index("Strip")+1)'`
 141 |   """
 142 | }
 143 | 
 144 | setplatepsms
 145 |   .transpose()
 146 |   .set { setplatepsmtable }
 147 | 
 148 | process normalSearchPsmsToPeptides {
 149 |   // create pep tables, split on plate, collect
 150 | 
 151 |   when: params.pisepdb
 152 |   input:
 153 |   set val(setname), val(strip), file(normpsm) from setplatepsmtable
 154 |   output:
 155 |   set val(setname), val(strip), file('peptides') into setplatepeptides
 156 |   """
 157 |   msstitch peptides -i $normpsm -o peptides --scorecolpattern area --spectracol 1 
 158 |   """
 159 | }
 160 | 
 161 | pipep = params.pisepdb ? Channel.fromPath(params.pisepdb) : Channel.empty()
 162 | varnov_peptides = params.pisepdb ? Channel.fromPath(params.pisepdb) : Channel.from(tdb)
 163 | setplatepeptides
 164 |   .combine(pipep)
 165 |   .set { sixftcreation_in }
 166 | 
 167 | process create6FTDB {
 168 |   // create 6FT DB per peptable-plate, collect fr 
 169 | 
 170 |   when: params.pisepdb
 171 | 
 172 |   input:
 173 |   set val(setname), val(stripname), file(peptides), file(pipeptides) from sixftcreation_in
 174 | 
 175 |   output:
 176 |   set val(setname), val(stripname), file('target_fr*.fasta') into t_splitdb
 177 | 
 178 |   script:
 179 |   strip = params.strips[stripname]
 180 |   """
 181 |   echo \'${groovy.json.JsonOutput.toJson(strip)}\' >> strip.json
 182 |   pi_database_splitter.py -i $pipeptides -p $peptides --stripdef strip.json --deltacolpattern Delta --fraccolpattern Fraction --fdrcolpattern '^q-value' --picutoff 0.2 --fdrcutoff 0.0 --maxlen $params.maxlen --minlen $params.minlen
 183 |   """
 184 | }
 185 | 
 186 | 
 187 | // channel match plate/fr/mzML
 188 | if (params.pisepdb) {
 189 |   t_splitdb
 190 |     .transpose()
 191 |     .map { it -> ["${it[0]}_${it[1]}_${it[2].baseName.replaceFirst(/.*_fr[0]*/, "")}", it[2]]}
 192 |     .set { db_w_id }
 193 |   mzml_premsgf
 194 |     .map { it -> ["${it[0]}_${it[3]}_${it[4]}", it[0], it[1], it[2]] }  // add set_strip_fr identifier
 195 |     .into { mzml_dbid; mzml_dbfilter }
 196 | } else {
 197 |   Channel.from([['NA', tdb]]).set { db_w_id }
 198 |   mzml_premsgf
 199 |     .map { it -> ["NA", it[0], it[1], it[2]] }
 200 |     .into { mzml_dbid; mzml_dbfilter }
 201 | }
 202 | 
 203 | process makeTargetSeqLookup {
 204 | 
 205 |   input:
 206 |   file(tdb) from varnov_peptides
 207 |   file(knownproteins)
 208 | 
 209 |   output:
 210 |   set file('mslookup_db.sqlite'), file('decoy_known.fa') into target_seq_lookup
 211 | 
 212 |   script:
 213 |   """
 214 |   # create text file of pi sep (much faster to import to SQLite than fasta)
 215 |   ${params.pisepdb ? "cut -f2 $tdb | sort -u > targetseq.txt" : "grep -v '^>' $tdb > targetseq.txt"}
 216 |   # Add trypsinized known proteins to txt file
 217 |   msstitch trypsinize -i $knownproteins -o knowntryp
 218 |   grep -v '^>' knowntryp >> targetseq.txt
 219 | 
 220 |   # TODO parametrize notrypsin?
 221 |   msstitch storeseq -i targetseq.txt --minlen $params.minlen ${params.pisepdb ? '--notrypsin': ''}
 222 |   msstitch makedecoy -i $knownproteins --dbfile mslookup_db.sqlite -o decoy_known.fa --scramble tryp_rev --minlen $params.minlen
 223 |   """
 224 | }
 225 | 
 226 | // Join DB with mzmls, use join filters out DBs without a match (in case of missing mzML fractions) 
 227 | // or duplicates (when having reruns, or when not running pI separared DBs in which case you only need 
 228 | // this process once). So we dont generate more decoys than necessary
 229 | db_w_id
 230 |   .join(mzml_dbfilter)
 231 |   .map { it -> [it[0], it[1]] }
 232 |   .combine(target_seq_lookup)
 233 |   .set { db_filtered }
 234 | 
 235 | process concatFasta {
 236 |  
 237 |   input:
 238 |   set val(dbid), file(db), file(targetlookup), file('decoy_known.fa') from db_filtered
 239 |   file knownproteins
 240 | 
 241 |   output:
 242 |   set val(dbid), file('td_concat.fa') into db_concatdecoy
 243 | 
 244 |   script:
 245 |   """
 246 |   # copy DB for faster access on network FS
 247 |   cp ${targetlookup} localdb.sql
 248 |   cat $db $knownproteins > td_concat.fa
 249 |   msstitch makedecoy -i $db --dbfile localdb.sql -o decoy_db.fa --scramble tryp_rev --minlen $params.minlen ${params.pisepdb ? '--notrypsin': ''}
 250 |   cat decoy_db.fa decoy_known.fa >> td_concat.fa
 251 |   rm decoy_db.fa localdb.sql
 252 |   """
 253 | }
 254 | 
 255 | // Now re-match the DB (now with decoy) with mzML, this is needed to fan out if more mzMLs than
 256 | // DBs have been used so we use the cross operator
 257 | db_concatdecoy
 258 |   .cross(mzml_dbid) // gives two Arrays so unfold them in next map step
 259 |   .map { it -> [it[0][0], it[0][1], it[1][1], it[1][2], it[1][3]] } // dbid, db, set, sample, file
 260 |   .set { mzml_msgf }
 261 | 
 262 | process prepareFilterDB {
 263 | 
 264 |   input:
 265 |   file(knownproteins)
 266 |   file(snpfa)
 267 | 
 268 |   output:
 269 |   file('knownprot.sqlite') into protseqdb
 270 |   file('snpprot.sqlite') into snpseqdb
 271 |   file('mslookup_db.sqlite') into trypseqdb
 272 | 
 273 |   """
 274 |   msstitch storeseq -i $knownproteins --minlen $params.minlen --fullprotein --minlen 7 -o knownprot.sqlite
 275 |   msstitch storeseq -i $snpfa --minlen $params.minlen -o snpprot.sqlite --fullprotein --minlen 7
 276 |   msstitch storeseq -i $knownproteins --insourcefrag --minlen $params.minlen
 277 |   """
 278 | }
 279 | 
 280 | 
 281 | process IsobaricQuant {
 282 | 
 283 |   when: !params.quantlookup && params.isobaric
 284 | 
 285 |   input:
 286 |   set val(setname), val(sample), file(infile), val(strip), val(fraction) from mzml_isobaric
 287 | 
 288 |   output:
 289 |   set val(sample), file("${infile}.consensusXML") into isobaricxml
 290 | 
 291 |   script:
 292 |   activationtype = [hcd:'High-energy collision-induced dissociation', cid:'Collision-induced dissociation', etd:'Electron transfer dissociation'][params.activation]
 293 |   isobtype = setisobaric && setisobaric[setname] ? setisobaric[setname] : false
 294 |   isobtype = isobtype == 'tmtpro' ? 'tmt16plex' : isobtype
 295 |   plextype = isobtype ? isobtype.replaceFirst(/[0-9]+plex/, "") : 'false'
 296 |   massshift = [tmt:0.0013, itraq:0.00125, false:0][plextype]
 297 |   """
 298 |   IsobaricAnalyzer  -type $isobtype -in $infile -out "${infile}.consensusXML" -extraction:select_activation "$activationtype" -extraction:reporter_mass_shift $massshift -extraction:min_precursor_intensity 1.0 -extraction:keep_unannotated_precursor true -quantification:isotope_correction true 
 299 |   """
 300 | }
 301 | 
 302 | isobaricxml
 303 |   .ifEmpty(['NA', 'NA'])
 304 |   .toList()
 305 |   .flatMap { it.sort({a, b -> a[0] <=> b[0]}) }
 306 |   .map { it -> it[1] }
 307 |   .collect()
 308 |   .set { sorted_isoxml }
 309 | 
 310 | 
 311 | mzmlfiles
 312 |   .toList()
 313 |   .map { it.sort( {a, b -> a[1] <=> b[1]}) }
 314 |   .map { it -> [it.collect() { it[0] }, it.collect() { it[2] }] }
 315 |   .set{ mzmlfiles_all }
 316 | 
 317 | 
 318 | process createNewSpectraLookup {
 319 | 
 320 |   publishDir "${params.outdir}", mode: 'copy', overwrite: true, saveAs: {it == 'mslookup_db.sqlite' ? 'quant_lookup.sql' : null }
 321 | 
 322 |   when: !params.quantlookup
 323 | 
 324 |   input:
 325 |   file(isobxmls) from sorted_isoxml 
 326 |   set val(setnames), file(mzmlfiles) from mzmlfiles_all
 327 |   
 328 |   output:
 329 |   file('mslookup_db.sqlite') into newspeclookup
 330 | 
 331 |   script:
 332 |   if(params.isobaric)
 333 |   """
 334 |   msstitch storespectra --spectra ${mzmlfiles.join(' ')} --setnames ${setnames.join(' ')}
 335 |   msstitch storequant --dbfile mslookup_db.sqlite --isobaric ${isobxmls.join(' ')} --spectra ${mzmlfiles.join(' ')}
 336 |   """
 337 |   else
 338 |   """
 339 |   msstitch storespectra --spectra ${mzmlfiles.join(' ')} --setnames ${setnames.join(' ')}
 340 |   """
 341 | }
 342 | 
 343 | 
 344 | if (!params.quantlookup) {
 345 |   newspeclookup
 346 |     .set { spec_lookup }
 347 | } else {
 348 |   Channel
 349 |     .fromPath(params.quantlookup)
 350 |     .set { spec_lookup }
 351 | } 
 352 | 
 353 | 
 354 | process msgfPlus {
 355 | 
 356 |   // Some versions have problems when converting to TSV, possible too long identifiers 
 357 |   // If problems arise, try to use an older version: msgf_plus:2016.10.26--py27_1
 358 | 
 359 |   input:
 360 |   set val(setfr_id), file(db), val(setname), val(sample), file(mzml) from mzml_msgf
 361 |   file mods
 362 | 
 363 |   output:
 364 |   set val(setname), val(sample), file("${sample}.mzid") into mzids
 365 |   set val(setname), file("${sample}.mzid"), file('out.mzid.tsv') into mzidtsvs
 366 |   
 367 |   script:
 368 |   mem = db.size() * 16 // used in conf profile
 369 |   msgfprotocol = 0
 370 |   """
 371 |   msgf_plus -Xmx${task.memory.toMega()}M -d $db -s $mzml -o "${sample}.mzid" -thread ${task.cpus * params.threadspercore} -mod $mods -maxMissedCleavages ${params.maxmiscleav} -tda 0 -t 10.0ppm -ti -1,2 -m 0 -inst 3 -e 1 -protocol ${msgfprotocol} -ntt 2 -minLength $params.minlen -maxLength $params.maxlen -minCharge 2 -maxCharge 6 -n 1 -addFeatures 1
 372 |   msgf_plus -Xmx3500M edu.ucsd.msjava.ui.MzIDToTsv -i "${sample}.mzid" -o out.mzid.tsv
 373 |   rm td_concat.c*
 374 |   """
 375 | }
 376 | 
 377 | mzids
 378 |   .groupTuple()
 379 |   .set { mzids_2pin }
 380 | 
 381 | 
 382 | process percolator {
 383 | 
 384 |   input:
 385 |   set val(setname), val(samples), file('mzid?') from mzids_2pin
 386 | 
 387 |   output:
 388 |   set val(setname), file('perco.xml') into percolated
 389 | 
 390 |   script:
 391 |   mzmlcount = samples.size() 
 392 |   """
 393 |   mkdir mzids
 394 |   count=1;for sam in ${samples.join(' ')}; do ln -s `pwd`/mzid\$count mzids/\${sam}.mzid; echo mzids/\${sam}.mzid >> metafile; ((count++));done
 395 |   msgf2pin -o percoin.xml -e trypsin -P "decoy_" metafile
 396 |   percolator -j percoin.xml -X perco.xml -N 500000 --decoy-xml-output -y
 397 |   """
 398 | }
 399 | 
 400 | 
 401 | percolated
 402 |   .tap { var_percolated }
 403 |   .set { nov_percolated }
 404 | 
 405 | 
 406 | process getVariantPercolator {
 407 | 
 408 |   when: params.varheaders
 409 | 
 410 |   input:
 411 |   set val(setname), file(x) from var_percolated
 412 | 
 413 |   output:
 414 |   set val(setname), val('variant'), file("${x}_h0.xml") into var_perco
 415 | 
 416 |   script:
 417 |   """
 418 |   msstitch splitperco -i $x --protheaders "known:${params.knownheaders}|novel:${params.varheaders}"
 419 |   """
 420 | }
 421 | 
 422 | 
 423 | process getNovelPercolator {
 424 | 
 425 |   when: params.novheaders
 426 | 
 427 |   input:
 428 |   set val(setname), file(x) from nov_percolated
 429 | 
 430 |   output:
 431 |   set val(setname), val('novel'), file("${x}_h0.xml") into nov_perco
 432 | 
 433 |   script:
 434 |   """
 435 |   msstitch splitperco -i $x --protheaders "known:${params.knownheaders}|novel:${params.novheaders}"
 436 |   """
 437 | }
 438 | 
 439 | 
 440 | nov_perco
 441 |   .concat(var_perco)
 442 |   .set { splitperco }
 443 | 
 444 | 
 445 | process filterPercolator {
 446 | 
 447 |   input:
 448 |   set val(setname), val(peptype), file(perco) from splitperco
 449 |   file 'trypseqdb' from trypseqdb
 450 |   file 'protseqdb' from protseqdb
 451 |   file knownproteins
 452 | 
 453 |   output:
 454 |   set val(setname), val(peptype), file('filtprot') into filtered_perco
 455 |   
 456 |   script:
 457 |   if (params.noclassfdr)
 458 |   """
 459 |   mv $perco filtprot
 460 |   """
 461 |   else
 462 |   """
 463 |   msstitch filterperco -i $perco -o filtseq --dbfile trypseqdb --insourcefrag 2 --deamidate 
 464 |   msstitch filterperco -i filtseq -o filtprot --fullprotein --dbfile protseqdb --minlen $params.minlen --deamidate
 465 |   """
 466 | }
 467 | 
 468 | nov_filtered_perco = Channel.create()
 469 | var_filtered_perco = Channel.create()
 470 | filtered_perco
 471 |   .choice( var_filtered_perco, nov_filtered_perco) { it -> it[1] == 'variant' ? 0 : 1 }
 472 | 
 473 | mzidtsvs
 474 |   .groupTuple()
 475 |   .tap { variantmzidtsv }
 476 |   .join(nov_filtered_perco)
 477 |   .set { nov_mzperco }
 478 | 
 479 | variantmzidtsv
 480 |   .join(var_filtered_perco)
 481 |   .concat(nov_mzperco)
 482 |   .set { allmzperco }
 483 | 
 484 | process svmToTSV {
 485 | 
 486 |   input:
 487 |   set val(setname), file('mzid?'), file('tsv?'), val(peptype), file(perco) from allmzperco 
 488 | 
 489 |   output:
 490 |   set val(setname), val(peptype), file('target.tsv') into mzidtsv_perco
 491 | 
 492 |   script:
 493 |   """
 494 |   tsvs=""
 495 |   mzids=""
 496 |   count=1; for tsvfn in \$(ls tsv*)
 497 |     do 
 498 |     tsvs="\${tsvs} tsv\${count}"
 499 |     mzids="\${mzids} mzid\${count}"
 500 |     ((count++))
 501 |     done
 502 |   mkdir outtables
 503 |   msstitch perco2psm --perco $perco -d outtables -i \$tsvs --mzids \$mzids
 504 |   msstitch concat -i outtables/* -o psms
 505 |   msstitch split -i psms --splitcol TD
 506 |   """
 507 | }
 508 | 
 509 | mzidtsv_perco
 510 |   .combine(spec_lookup)
 511 |   .set { prepsm }
 512 | 
 513 | process createPSMTable {
 514 | 
 515 |   input:
 516 |   set val(setname), val(peptype), file('psms'), file('lookup') from prepsm
 517 |   val(mzmlcount) from mzmlcount_psm
 518 | 
 519 |   output:
 520 |   set val(setname), val(peptype), file("${setname}_${peptype}_psmtable.txt") into psmtable
 521 | 
 522 |   script:
 523 |   """
 524 |   msstitch conffilt -i psms -o filtpsm --confidence-better lower --confidence-lvl 0.01 --confcolpattern 'PSM q-value'
 525 |   msstitch conffilt -i filtpsm -o filtpep --confidence-better lower --confidence-lvl 0.01 --confcolpattern 'peptide q-value'
 526 |   cp lookup psmlookup
 527 |   msstitch psmtable -i filtpep --dbfile psmlookup --addbioset -o ${setname}_${peptype}_psmtable.txt ${params.isobaric ? '--isobaric': ''}
 528 |   sed 's/\\#SpecFile/SpectraFile/' -i ${setname}_${peptype}_psmtable.txt
 529 |   """
 530 | }
 531 | 
 532 | 
 533 | variantpsms = Channel.create()
 534 | novelpsms = Channel.create()
 535 | psmtable
 536 |   .tap { setmergepsmtables; peppsms }
 537 |   .choice( variantpsms, novelpsms ) { it -> it[1] == 'variant' ? 0 : 1 }
 538 | 
 539 |  
 540 | setmergepsmtables
 541 |   .groupTuple(by: 1)
 542 |   .set { psmmerge_in }
 543 | 
 544 | 
 545 | process mergeSetPSMtable {
 546 |   publishDir "${params.outdir}", mode: 'copy', overwrite: true
 547 | 
 548 |   input:
 549 |   set val(setnames), val(peptype), file(psmtables) from psmmerge_in
 550 | 
 551 |   output:
 552 |   file "${peptype}_psmtable.txt" into produced_psmtables
 553 | 
 554 |   """
 555 |   head -n1 ${psmtables[0]} > ${peptype}_psmtable.txt
 556 |   for fn in ${psmtables.join(' ')}; do tail -n+2 \$fn >> ${peptype}_psmtable.txt; done
 557 |   """
 558 | }
 559 | 
 560 | 
 561 | process prePeptideTable {
 562 | 
 563 |   input:
 564 |   set val(setname), val(peptype), file('psms.txt') from peppsms 
 565 | 
 566 |   output:
 567 |   set val(setname), val(peptype), file('peptidetable.txt') into peptable
 568 | 
 569 |   script:
 570 |   """
 571 |   msstitch peptides -i psms.txt -o peptidetable.txt --scorecolpattern svm --spectracol 1 \
 572 |     ${setisobaric && setisobaric[setname] ? "--isobquantcolpattern plex --minint 0.1 --logisoquant --denompatterns ${setdenoms[setname].join(' ')}" : ''}
 573 |   """
 574 | }
 575 | 
 576 | novelpsms
 577 |   .into{novelpsmsFastaBedGFF; novelpsms_specai}
 578 | 
 579 | novelprepep = Channel.create()
 580 | presai_peptable = Channel.create()
 581 | peptable
 582 |   .choice( presai_peptable, novelprepep ) { it -> it[1] == 'variant' ? 0 : 1 }
 583 | novelprepep
 584 |   .join(novelpsmsFastaBedGFF)
 585 |   .set { novelFaBdGfPep }
 586 | 
 587 | process createFastaBedGFF {
 588 |  
 589 |   publishDir "${params.outdir}", mode: 'copy', overwrite: true, saveAs: { it == "${setname}_novel_peptides.gff3" ? "${setname}_novel_peptides.gff3" : null}
 590 |  
 591 |   input:
 592 |   set val(setname), val(peptype), file(peptides) , val(psmtype), file(psms) from novelFaBdGfPep
 593 |   file gtffile
 594 |   file tdb from tdb
 595 |  
 596 |   output:
 597 |   set val(setname), file('novel_peptides.fa') into novelfasta
 598 |   set val(setname), file('novel_peptides.bed') into novelbed
 599 |   set val(setname), file("${setname}_novel_peptides.gff3") into novelGFF3
 600 |   set val(setname), file('novel_peptides.tab.txt') into novelpep
 601 |   set val(setname), file('novpep_perco_quant.txt') into novelpep_percoquant
 602 |  
 603 |   """
 604 |   map_novelpeptide2genome.py --input $psms --gtf $gtffile --fastadb $tdb --tab_out novel_peptides.tab.txt --fasta_out novel_peptides.fa --gff3_out ${setname}_novel_peptides.gff3 --bed_out novel_peptides.bed
 605 |   sort -k 1b,1 <(tail -n+2 $peptides) |cut -f 1,14-500 > peptable_sorted
 606 |   sort -k 2b,2 <(tail -n+2 novel_peptides.tab.txt) > novpep_sorted
 607 |   paste <(cut -f 2 novpep_sorted) <(cut -f1,3-500 novpep_sorted) > novpep_pepcols
 608 |   join novpep_pepcols peptable_sorted -j 1 -a1 -o auto -e 'NA' -t \$'\\t' > novpep_pqjoin
 609 |   # Cut only bare peptide col and q-values/isoquant
 610 |   paste <(cut -f 2 novpep_pqjoin) <(cut -f8-500 novpep_pqjoin) > novpep_joined_pepcols
 611 |   paste <(head -n1 novel_peptides.tab.txt | cut -f1)  <(cut -f 14-500 $peptides |head -n1) > header
 612 |   cat header novpep_joined_pepcols > novpep_perco_quant.txt
 613 |   """
 614 | }
 615 | 
 616 | novelpep
 617 |   .into {blastnovelpep; blatnovelpep; annonovelpep; snpnovelpep}
 618 | novelfasta
 619 |   .into {blastnovelfasta; blatnovelfasta}
 620 | 
 621 | process BlastPNovel {
 622 | 
 623 |   input:
 624 |   set val(setname), file(novelfasta) from blastnovelfasta
 625 |   file blastdb
 626 | 
 627 |   output:
 628 |   set val(setname), file('blastp_out.txt') into novelblast
 629 |   
 630 |   """
 631 |   makeblastdb -in $blastdb -dbtype prot
 632 |   blastp -db $blastdb -query $novelfasta -outfmt '6 qseqid sseqid pident qlen slen qstart qend sstart send mismatch positive gapopen gaps qseq sseq evalue bitscore' -num_threads 4 -max_target_seqs 1 -evalue 1000 -out blastp_out.txt
 633 |   """
 634 | }
 635 | 
 636 | novelpsms_specai
 637 |   .map { it -> [it[0], it[2]] }
 638 |   .join(blastnovelpep)
 639 |   .join(novelblast)
 640 |   .set { novelblastout }
 641 | 
 642 | process ParseBlastpOut {
 643 |  
 644 |  input:
 645 |  set val(setname), file(psms), file(novelpep), file(novelblast) from novelblastout
 646 |  file blastdb
 647 | 
 648 |  output:
 649 |  set val(setname), file('peptable_blastp.txt') into peptable_blastp
 650 |  set val(setname), file('single_mismatch_novpeps.txt') into novpeps_singlemis
 651 | 
 652 |  """
 653 |  parse_BLASTP_out.py --input $novelpep --blastp_result $novelblast --fasta $blastdb --output peptable_blastp.txt
 654 |  extract_1mismatch_novpsm.py peptable_blastp.txt $psms single_mismatch_novpeps.txt
 655 |  """
 656 | 
 657 | }
 658 | 
 659 | groupset_mzmls
 660 |   .map { it -> [it[0], it[1], it[2]] } // strip fraction, strip
 661 |   .groupTuple()
 662 |   .tap { var_specaimzmls }
 663 |   .join(novpeps_singlemis)
 664 |   .set { grouped_saavnov_mzml_psms }
 665 |   
 666 | 
 667 | process ValidateSingleMismatchNovpeps {
 668 |   
 669 |   publishDir "${params.outdir}", mode: 'copy', overwrite: true, saveAs: { it == "precursorError.histogram.plot.pdf" ? "${setname}_novel_precursorError_plot.pdf" : it }
 670 |   
 671 |   input:
 672 |   set val(setname), val(samples), file(mzmls), file(psms) from grouped_saavnov_mzml_psms
 673 | 
 674 |   output:
 675 |   set val(setname), file("${setname}_novel_saav_specai.txt") into singlemis_specai
 676 |   file 'precursorError.histogram.plot.pdf' optional true into novel_specai_plot
 677 | 
 678 |   """
 679 |   mkdir mzmls
 680 |   for fn in $mzmls; do ln -s `pwd`/\$fn mzmls/; done
 681 |   Rscript /SpectrumAI/SpectrumAI.R mzmls $psms ${setname}_novel_saav_specai.txt || cp $psms singlemis_specai.txt
 682 |   """
 683 | }
 684 | 
 685 | singlemis_specai
 686 |   .join(peptable_blastp)
 687 |   .set { nov_specaiparse }
 688 | 
 689 | process novpepSpecAIOutParse {
 690 | 
 691 |   input:
 692 |   set val(setname), file(x), file('peptide_table.txt') from nov_specaiparse
 693 |   
 694 |   output:
 695 |   set val(setname), file('novpep_specai.txt') into novpep_singlemisspecai
 696 | 
 697 |   """
 698 |   parse_spectrumAI_out.py --spectrumAI_out $x --input peptide_table.txt --output novpep_sa
 699 |   cut -f 1,8-19 novpep_sa > novpep_specai.txt
 700 |   """
 701 | }
 702 | 
 703 | 
 704 | process BLATNovel {
 705 | 
 706 |   input:
 707 |   set val(setname), file(novelfasta) from blatnovelfasta
 708 |   file genomefa
 709 | 
 710 |   output:
 711 |   set val(setname), file('blat_out.pslx') into novelblat
 712 | 
 713 |   """
 714 |   blat $genomefa $novelfasta -t=dnax -q=prot -tileSize=5 -minIdentity=99 -out=pslx blat_out.pslx 
 715 |   """
 716 | }
 717 | 
 718 | novelblat
 719 |   .join(blatnovelpep)
 720 |   .set { novblatparse }
 721 | 
 722 | process parseBLATout {
 723 | 
 724 |  input:
 725 |  set val(setname), file(novelblat), file(novelpep) from novblatparse
 726 | 
 727 |  output:
 728 |  set val(setname), file('peptable_blat.txt') into peptable_blat
 729 | 
 730 |  """
 731 |  parse_BLAT_out.py $novelblat $novelpep peptable_blat.txt
 732 | 
 733 |  """
 734 | }
 735 | 
 736 | process labelnsSNP {
 737 |   
 738 |   input:
 739 |   set val(setname), file(peptable) from snpnovelpep
 740 |   file snpfa
 741 |   file(snpdb) from snpseqdb
 742 | 
 743 |   output:
 744 |   set val(setname), file('nssnp.txt') into ns_snp_out
 745 | 
 746 |   """
 747 |   label_nsSNP_pep.py --input $peptable --nsSNPdb $snpfa --dbfile "$snpdb" --output nssnp.txt --minlen $params.minlen
 748 |   """
 749 | }
 750 | 
 751 | bwfile = Channel.fromPath(params.bigwigs)
 752 | novelGFF3
 753 |   .combine(bwfile)
 754 |   .into { novelGFF3_phast; novelGFF3_phylo; novelGFF3_bams }
 755 | 
 756 | process phastcons {
 757 |   
 758 |   input:
 759 |   set val(setname), file(novelgff), file('bigwigs') from novelGFF3_phast
 760 | 
 761 |   output:
 762 |   set val(setname), file ('phastcons.txt') into phastcons_out
 763 | 
 764 |   """
 765 |   calculate_phastcons.py $novelgff bigwigs/hg19.100way.phastCons.bw phastcons.txt
 766 |   """
 767 | }
 768 | 
 769 | process phyloCSF {
 770 |   
 771 |   input:
 772 |   set val(setname), file(novelgff), file('bigwigs') from novelGFF3_phylo
 773 | 
 774 |   output:
 775 |   set val(setname), file('phylocsf.txt') into phylocsf_out
 776 | 
 777 |   """
 778 |   calculate_phylocsf.py $novelgff bigwigs phylocsf.txt
 779 |   """
 780 | 
 781 | }
 782 | 
 783 | 
 784 | bamFiles = params.bamfiles ? Channel.fromPath(params.bamfiles).map { fn -> [ fn, fn + '.bai' ] } : Channel.empty()
 785 | 
 786 | process scanBams {
 787 | 
 788 |   when: params.bamfiles
 789 | 
 790 |   input:
 791 |   set val(setname), file(gff) from novelGFF3_bams
 792 |   file bams from bamFiles.collect()
 793 |   
 794 |   output:
 795 |   set val(setname), file('scannedbams.txt') into scannedbams
 796 | 
 797 |   """
 798 |   ls *.bam > bamfiles.txt
 799 |   scan_bams.py  --gff_input $gff --bam_files bamfiles.txt --output scannedbams.txt
 800 |   """
 801 | }
 802 | 
 803 | annoperl = Channel.fromPath("$params.annovar_dir/annotate_variation.pl")
 804 | annohumdb = Channel.fromPath("$params.annovar_dir/humandb/")
 805 | 
 806 | novelbed
 807 |   .combine(annoperl)
 808 |   .combine(annohumdb)
 809 |   .set { anno_in }
 810 | 
 811 | process annovar {
 812 |   
 813 |   input:
 814 |   set val(setname), file(novelbed), file(perlscript), file(humdb) from anno_in
 815 | 
 816 |   output:
 817 |   set val(setname), file('novpep_annovar.variant_function') into annovar_out
 818 | 
 819 |   """
 820 |   ./annotate_variation.pl -out novpep_annovar -build hg19 $novelbed humandb/
 821 |   """
 822 | 
 823 | }
 824 | 
 825 | annovar_out
 826 |   .join(annonovelpep)
 827 |   .set { parseanno }
 828 | 
 829 | process parseAnnovarOut {
 830 |   
 831 |   input:
 832 |   set val(setname), file(anno), file(novelpep) from parseanno
 833 | 
 834 |   output:
 835 |   set val(setname), file('parsed_annovar.txt') into annovar_parsed
 836 | 
 837 |   """
 838 |   parse_annovar_out.py --input $novelpep --output parsed_annovar.txt --annovar_out $anno 
 839 |   """
 840 | }
 841 | 
 842 | 
 843 | ns_snp_out
 844 |   .join(novpep_singlemisspecai)
 845 |   .join(peptable_blat)
 846 |   .join(annovar_parsed)
 847 |   .join(phastcons_out)
 848 |   .join(phylocsf_out)
 849 |   .join(novelpep_percoquant)
 850 |   .set { combined_novelprebam }
 851 | 
 852 | combined_novel = (params.bamfiles ? combined_novelprebam.join(scannedbams) : combined_novelprebam)
 853 | 
 854 | 
 855 | process combineResults{
 856 |   
 857 |   publishDir "${params.outdir}", mode: 'copy', overwrite: true
 858 |   
 859 |   input:
 860 |   set val(setname), file(a), file(b), file(c), file(d), file(e), file(f), file(g), file(h) from combined_novel
 861 |   
 862 |   output:
 863 |   set val('nov'), val(setname), file("${setname}_novel_peptides.txt") into novpeps_finished 
 864 |   
 865 |   script:
 866 |   if (!params.bamfiles)
 867 |   """
 868 |   for fn in $a $b $c $d $e $f $g; do sort -k 1b,1 \$fn > tmpfn; mv tmpfn \$fn; done
 869 |   join $a $b -a1 -a2 -o auto -e 'NA' -t \$'\\t' > joined1
 870 |   join joined1 $c -a1 -a2 -o auto -e 'NA' -t \$'\\t' > joined2
 871 |   join joined2 $d -a1 -a2 -o auto -e 'NA' -t \$'\\t' > joined3
 872 |   join joined3 $e -a1 -a2 -o auto -e 'NA' -t \$'\\t' > joined4
 873 |   join joined4 $f -a1 -a2 -o auto -e 'NA' -t \$'\\t' > joined5
 874 |   join joined5 $g -a1 -a2 -o auto -e 'NA' -t \$'\\t' > joined6
 875 |   grep '^Bare peptide' joined6 > ${setname}_novel_peptides.txt
 876 |   grep -v '^Bare peptide' joined6 >> ${setname}_novel_peptides.txt
 877 |   """
 878 | 
 879 |   else
 880 |   """
 881 |   for fn in $a $b $c $d $e $f $g $h; do sort -k 1b,1 \$fn > tmpfn; mv tmpfn \$fn; done
 882 |   join $a $b -a1 -a2 -o auto -e 'NA' -t \$'\\t' > joined1
 883 |   join joined1 $c -a1 -a2 -o auto -e 'NA' -t \$'\\t' > joined2
 884 |   join joined2 $d -a1 -a2 -o auto -e 'NA' -t \$'\\t' > joined3
 885 |   join joined3 $e -a1 -a2 -o auto -e 'NA' -t \$'\\t' > joined4
 886 |   join joined4 $f -a1 -a2 -o auto -e 'NA' -t \$'\\t' > joined5
 887 |   join joined5 $g -a1 -a2 -o auto -e 'NA' -t \$'\\t' > joined6
 888 |   join joined6 $h -a1 -a2 -o auto -e 'NA' -t \$'\\t' > joined7
 889 |   grep '^Bare peptide' joined7 > ${setname}_novel_peptides.txt
 890 |   grep -v '^Bare peptide' joined7 >> ${setname}_novel_peptides.txt
 891 |   """
 892 | }
 893 | 
 894 | process prepSpectrumAI {
 895 | 
 896 |   input:
 897 |   set val(setname), val(peptype), file(psms) from variantpsms
 898 |   
 899 |   output:
 900 |   set val(setname), file('specai_in.txt') into var_specai_input
 901 |   
 902 |   script:
 903 |   if (params.saavheader)
 904 |   """
 905 |   cat <(head -n1 $psms) <(grep $params.saavheader $psms) > saavpsms
 906 |   label_sub_pos.py --input_psm saavpsms --output specai_in.txt ${params.splitchar ? "--splitchar ${params.splitchar}" : ''}
 907 |   """
 908 |   else
 909 |   """
 910 |   label_sub_pos.py --input_psm $psms --output specai_in.txt
 911 |   """
 912 | }
 913 | 
 914 | 
 915 | var_specaimzmls
 916 |   .join(var_specai_input)
 917 |   .set { var_specai_inmzml }
 918 | 
 919 | process SpectrumAI {
 920 | 
 921 |   publishDir "${params.outdir}", mode: 'copy', overwrite: true, saveAs: { it == "precursorError.histogram.plot.pdf" ? "${setname}_variant_precursorError_plot.pdf" : it }
 922 | 
 923 |   input:
 924 |   set val(setname), val(samples), file(mzmls), file(specai_in) from var_specai_inmzml
 925 | 
 926 |   output:
 927 |   set val(setname), file("${setname}_variant_specairesult.txt") into specai
 928 |   file "precursorError.histogram.plot.pdf" into specai_plot
 929 | 
 930 |   """
 931 |   mkdir mzmls
 932 |   for fn in $mzmls; do ln -s `pwd`/\$fn mzmls/; done
 933 |   ls mzmls
 934 |   Rscript /SpectrumAI/SpectrumAI.R mzmls $specai_in ${setname}_variant_specairesult.txt
 935 |   """
 936 | }
 937 | 
 938 | specai
 939 |   .join(presai_peptable)
 940 |   .set { specai_peptable }
 941 | 
 942 | process mapVariantPeptidesToGenome {
 943 | 
 944 |   publishDir "${params.outdir}", mode: 'copy', overwrite: true
 945 | 
 946 |   input:
 947 |   set val(setname), file(x), val(peptype), file(peptides) from specai_peptable
 948 |   file cosmic
 949 |   file dbsnp
 950 |   
 951 |   output:
 952 |   set val('var'), val(setname), file("${setname}_variant_peptides.txt") into varpeps_finished
 953 |   file "${setname}_variant_peptides.saav.pep.hg19cor.vcf" into saavvcfs_finished
 954 | 
 955 |   """
 956 |   ${params.saavheader ? "cat <(head -n1 ${peptides}) <(grep ${params.saavheader} ${peptides}) > saavpeps" : "mv ${peptides} saavpeps" }
 957 |   parse_spectrumAI_out.py --spectrumAI_out $x --input saavpeps --output setsaavs
 958 |   ${params.saavheader ? "cat setsaavs <(grep -v ${params.saavheader} ${peptides} | sed \$'s/\$/\tNA/') > ${setname}_variant_peptides.txt" : "mv setsaavs ${setname}_variant_peptides.txt"}
 959 |   map_cosmic_snp_tohg19.py --input ${setname}_variant_peptides.txt --output ${setname}_variant_peptides.saav.pep.hg19cor.vcf --cosmic_input $cosmic --dbsnp_input $dbsnp
 960 |   # Remove PSM-table specific stuff (RT, precursor, etc etc) from variant PEPTIDE table
 961 |   cut -f 1,2,14-5000 ${setname}_variant_peptides.txt > pepsfix
 962 |   mv pepsfix ${setname}_variant_peptides.txt
 963 |   """
 964 | }
 965 | 
 966 | novpeps_finished
 967 |   .concat(varpeps_finished) 
 968 |   .groupTuple()
 969 |   .set { setmerge_peps }
 970 | 
 971 | 
 972 | accession_keymap = ['var': 'Peptide sequence', 'nov': 'Peptide']
 973 | acc_removemap = ['nov': 'Bare peptide', 'var': 'Mod.peptide']
 974 | 
 975 | 
 976 | process mergeSetPeptidetable {
 977 | 
 978 |   publishDir "${params.outdir}", mode: 'copy', overwrite: true
 979 | 
 980 |   input:
 981 |   set val(peptype), val(setnames), file('peps?') from setmerge_peps
 982 | 
 983 |   output:
 984 |   file "${peptype}_peptidetable.txt" into produced_peptables
 985 | 
 986 |   """
 987 |   # build non-changing fields (seq based fields) table:
 988 |   fixfields=`head -n1 peps1 |tr -s '\\t' '\\n' | egrep -vn '(Setname|Spectrum|Files|Charge|q-val|plex|${acc_removemap[peptype]})' | cut -f 1 -d ':'`
 989 |   fixfields=`echo \$fixfields | sed 's/ /,/g'`
 990 |   head -n1 peps1 | cut -f `echo \$fixfields` > fixheader
 991 |   count=1; for setn in ${setnames.join(' ')} ; do
 992 |   cut -f  `echo \$fixfields` peps\$count | tail -n+2 >> fixpeps
 993 |   ((count++))
 994 |   done
 995 |   if [ ${peptype} == 'nov' ]
 996 |   then
 997 |      cat fixheader <(sort -u -k1b,1 fixpeps) > temp
 998 |      group_novpepToLoci.py  --input temp --output temp.loci --distance 10kb
 999 |      head -n1 temp.loci > fixheader
1000 |      tail -n+2 temp.loci > fixpeps
1001 |   fi
1002 |   sort -u -k1b,1 fixpeps > temp
1003 |   mv temp fixpeps
1004 | 
1005 |   ## Build changing fields table
1006 |   touch peptable
1007 |   count=1; for setn in ${setnames.join(' ')}; do
1008 |     varfields=`head -n1 peps\$count |tr -s '\\t' '\\n' | egrep -n '(${accession_keymap[peptype]}|Spectrum|q-val|plex)' | cut -f 1 -d ':'`
1009 |     varfields=`echo \$varfields| sed 's/ /,/g'`
1010 |     # first add to header, cut from f2 to remove join-key pep seq field
1011 |     head -n1 peps\$count | cut -f `echo \$varfields` | cut -f 2-5000| sed "s/^\\(\\w\\)/\${setn}_\\1/;s/\\(\\s\\)/\\1\${setn}_/g" > varhead
1012 |     paste fixheader varhead > newheader && mv newheader fixheader
1013 |     # then join the values
1014 |     tail -n+2 peps\$count | cut -f `echo \$varfields` | sort -k1b,1 > sortpep; join peptable sortpep -a1 -a2 -o auto -e 'NA' -t \$'\\t' > joined
1015 |     mv joined peptable
1016 |     ((count++))
1017 |   done
1018 |   join fixpeps peptable -a1 -a2 -o auto -e 'NA' -t \$'\\t' > fixvarpeps
1019 |   cat fixheader fixvarpeps > ${peptype}_peptidetable.txt
1020 |   """
1021 | }
1022 | 
1023 | 
1024 | produced_psmtables
1025 |   .concat(produced_peptables)
1026 |   .subscribe { println "Pipeline output ready: ${it}" }
1027 | 


--------------------------------------------------------------------------------
/nextflow.config:
--------------------------------------------------------------------------------
 1 | params {
 2 |   container = 'lehtiolab/ipaw:0.5' // Container slug. Stable releases should specify release tag!
 3 | 
 4 |   outdir = './results'
 5 |   tracedir = "${params.outdir}/pipeline_info"
 6 |   awsqueue = false
 7 |   awsregion = 'eu-west-1'
 8 |   external_config_version = 'master'
 9 | 
10 |   pisepdb = false
11 |   isobaric = false
12 |   activation = 'hcd'
13 |   bamfiles = false
14 |   mods = false
15 |   knownheaders = false
16 |   novheaders = false
17 |   varheaders = false
18 |   saavheader = false
19 |   noclassfdr = false
20 |   dbsnp = false
21 |   cosmic = false
22 |   mzmldef = false
23 |   input = false
24 |   normalpsms = false
25 |   annovar_dir = false
26 |   bigwigs = false
27 |   splitchar = false
28 |   quantlookup = false
29 |   minlen = 8
30 |   maxlen = 50
31 |   maxmiscleav = 0
32 | 
33 | }
34 | 
35 | includeConfig 'conf/base.config'
36 | profiles {
37 | 
38 |   conda { process.conda = "$baseDir/environment.yml" }
39 |   docker {
40 |     docker.enabled = true
41 |     docker.fixOwnership = true
42 |     docker.runOptions = '-u $(id -u):$(id -g)'
43 |     singularity.enabled = false
44 |     conda.enabled = false
45 |   }
46 |   singularity {
47 |     singularity.enabled = true
48 |   }
49 |   lehtio { 
50 |     includeConfig "https://raw.githubusercontent.com/lehtiolab/static-resources/${params.external_config_version}/nf-configs/lehtio.config"
51 |     includeConfig "https://raw.githubusercontent.com/lehtiolab/static-resources/${params.external_config_version}/nf-configs/${params.pisepdb ? 'ipaw_6ft': 'vardb'}.config"
52 |   }
53 |   uppmax { 
54 |     includeConfig 'conf/uppmax.config'
55 |   }
56 |   testing {
57 |     includeConfig 'conf/base.config'
58 |   }
59 | }
60 | 
61 | 
62 | // Function to ensure that resource requirements don't go beyond
63 | // a maximum limit
64 | def check_max(obj, type) {
65 |   if(type == 'memory'){
66 |     try {
67 |       if(obj.compareTo(params.max_memory as nextflow.util.MemoryUnit) == 1)
68 |         return params.max_memory as nextflow.util.MemoryUnit
69 |       else
70 |         return obj
71 |     } catch (all) {
72 |       println "   ### ERROR ###   Max memory '${params.max_memory}' is not valid! Using default value: $obj"
73 |       return obj
74 |     }
75 |   } else if(type == 'time'){
76 |     try {
77 |       if(obj.compareTo(params.max_time as nextflow.util.Duration) == 1)
78 |         return params.max_time as nextflow.util.Duration
79 |       else
80 |         return obj
81 |     } catch (all) {
82 |       println "   ### ERROR ###   Max time '${params.max_time}' is not valid! Using default value: $obj"
83 |       return obj
84 |     }
85 |   } else if(type == 'cpus'){
86 |     try {
87 |       return Math.min( obj, params.max_cpus as int )
88 |     } catch (all) {
89 |       println "   ### ERROR ###   Max cpus '${params.max_cpus}' is not valid! Using default value: $obj"
90 |       return obj
91 |     }
92 |   }
93 | }
94 | 


--------------------------------------------------------------------------------