├── Awk
    ├── gff2gff.awk
    ├── cutgff.awk
    └── extract_most_5p.awk
├── README.md
├── make_TSS_file_from_ensembl_annot.sh
└── make_TSS_file_from_annotation_with_confidence_better.sh


/Awk/gff2gff.awk:
--------------------------------------------------------------------------------

# ~/Awk/gff2gff.awk
# Re-emits a gff/gtf-like file with canonical separators:
# - fields no 1 to 8 are separated by tabs
# - field no 9 (if present) follows field no 8 after a tab
# - fields no 10 to NF are appended after field no 9, separated by single spaces
# Lines whose first field contains a # are printed unchanged.

# usage
# awk -f ~/Awk/gff2gff.awk in.gff > out.gff

$1!~/#/{
    # The fields are always passed through an explicit "%s" format: used directly
    # as the printf format, a field containing a % would be interpreted as a
    # conversion specification and corrupt the output (or abort awk).
    for (i=1;i<=7;i++)
    {
        printf "%s\t", $i;
    }
    printf "%s", $8;
    if(NF>8)
    {
        printf "\t%s", $9;
        for (i=10;i<=NF;i++)
        {
            printf " %s", $i;
        }
    }
    print "";
}

$1~/#/{print}
--------------------------------------------------------------------------------
/Awk/cutgff.awk:
--------------------------------------------------------------------------------
# ~/Awk/cutgff.awk
# this script cuts a gff file to the to-th field (output: from field no 1 to field no to,
# where to is specified as an argument)
# fields no 1 to 9 are separated by tabs, fields from no 10 on are appended after
# field no 9 separated by single spaces

# awk -v to=10 -f ~/Awk/cutgff.awk in.gff > out.gff


{
    s="";
    if(to<=9)
    {
        # only tab separated fields are kept
        for(i=1; i<=to-1; i++)
        {
            s=(s)($i)("\t");
        }
        # here i is the index of the last field to keep, added without trailing separator
        s=(s)($i);
        print s;
    }
    else
    {
        # the 8 first fields are followed by a tab
        for(i=1; i<=8; i++)
        {
            s=(s)($i)("\t");
        }
        # fields no 9 to to-1 are followed by a space
        for(i=9; i<=to-1; i++)
        {
            s=(s)($i)" ";
        }
        s=(s)($i);
        print s;
    }
}
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
MakeGencodeTSS
==============

Makes distinct TSS of each gene from a Gencode or an Ensembl gene annotation file in GTF format

There are two versions of the same script here, one for Gencode and one for Ensembl.

This script takes as input:
- a Gencode or Ensembl annotation file in GTF format (mandatory)
- a list of transcript biotypes to consider in this file (optional, default: no filtering)

This script provides as output the following files in the directory where it is executed:
- a file of most 5' exons from each transcript in GFF2 format
- a file of TSS from each transcript in GFF2 format
- a file of distinct TSS from each gene in GFF2 format, associated to a low confidence level in case the cds_start_NF
  tag was present in any transcript with this TSS

--------------------------------------------------------------------------------
/Awk/extract_most_5p.awk:
--------------------------------------------------------------------------------
#~/Awk/extract_most_5p.awk
#For all the features of a gff input file that have the field no fldno in common,
#extract the most 5' features, taking the strand into account
#be careful: the features must be stranded
#when two features are equally upstream then we take the longer one
# on 12/18/2013 edited so that only + and - strand features are considered

#usage
#Chr21=/projects/encode/scaling_up/chr21_22/annotations/hg18/Havana_chr21_exons_nopseudo.gff
#awk -v fldno=10 -f ~/Awk/extract_most_5p.awk $Chr21

#$Chr21
#chr21 VEGA_Novel_CDS exon 9884493 9884538 . + 0 transcript_id "AF254982.1-001"; gene_id "AF254982.1"; mRNA_start_not_found "0"; mRNA_end_not_found "0"; start_codon_not_found "0"; stop_codon_not_found "0";


(($7=="+")||($7=="-")){
    seen[$fldno]++;
    if(seen[$fldno]==1)  #initialization: the first feature of a group is the current most 5' one
    {
        chr[$fldno]=$1;
        strand[$fldno]=$7;
        most5p_beg[$fldno]=$4;
        most5p_end[$fldno]=$5;
        most5p_all[$fldno]=$0;
    }

    # on the + strand the current feature replaces the stored one when it is before it,
    # on the - strand when it is not before it
    # NOTE(review): on the - strand the comparison is made on the beginnings and not on the ends,
    # and a feature with the same beginning replaces the stored one unless it is longer;
    # this only matters when features of the same group overlap - confirm this is intended
    if(((strand[$fldno]=="+")&&(before($4,$5,most5p_beg[$fldno],most5p_end[$fldno])==1))||
       ((strand[$fldno]=="-")&&(before($4,$5,most5p_beg[$fldno],most5p_end[$fldno])==0)))
    {
        most5p_beg[$fldno]=$4;
        most5p_end[$fldno]=$5;
        most5p_all[$fldno]=$0;
    }
}

END{
    # one line per distinct value of field no fldno (the order is the one of the awk for-in loop)
    for(k in seen)
    {
        print most5p_all[k];
    }
}

#the before function takes as input two objects and returns 1 if the first one is
#before the second one. Note that when two features have the same start then it returns 1
#if the first one is longer than the second
function before(beg1,end1,beg2,end2)
{
    return ((beg1<beg2)||((beg1==beg2)&&((end1-beg1)>(end2-beg2)))) ? 1 : 0;
}

--------------------------------------------------------------------------------
/make_TSS_file_from_ensembl_annot.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# make_TSS_file_from_ensembl_annot.sh
# this script takes as input:
# - an Ensembl annotation file in GTF format (mandatory)
# - a list of transcript biotypes to consider in this file (optional, default: no filtering)
# and provides as output the following files in the directory where it is executed:
# - a file of most 5' exons from each transcript in GFF2 format
# - a file of TSS from each transcript in GFF2 format
# - a file of distinct TSS from each gene in GFF2 format, associated to a low confidence level in case the cds_start_NF
#   tag was present in any transcript with this TSS

# Be careful:
#############
# makes the assumption that gene id and transcript id come as first and second (tag,value) pairs in the annotation GTF file

# Usage:
########
# make_TSS_file_from_ensembl_annot.sh annot.gtf [tr_biotypes.txt] 2> make_TSS_file_from_ensembl_annot.err &


# Check that the annotation file exists, otherwise exit
#######################################################
if [ -z "$1" ] || [ ! -e "$1" ]
then
    echo "" >&2
    if [ -n "$1" ]
    then
        echo "Annotation file $1 does not exist" >&2
    fi
    # the usage line is quoted: unquoted, [tr_biotypes.txt] is a glob pattern that the shell
    # could replace by the name of a matching one-character file of the current directory
    echo "Usage: make_TSS_file_from_ensembl_annot.sh annot.gtf [tr_biotypes.txt]" >&2
    echo "Be careful: this script makes the assumption that gene id and transcript id come as first" >&2
    echo "and second (tag,value) pairs in the GTF file" >&2
    echo "In case no list of transcript biotypes is specified there will be no filtering done" >&2
    echo "" >&2
    exit 1
fi

path="$(dirname "$0")" # relative path
rootDir="$( cd "$path" && pwd )" # absolute path

# Initialize variables
######################
annotation=$1
annotbase=$(basename "${annotation%.gtf}")
if [ -n "$2" ]
then
    trbiotypes=$2
fi

# Programs
##########
EXTRACT5p=$rootDir/Awk/extract_most_5p.awk
CUTGFF=$rootDir/Awk/cutgff.awk
GFF2GFF=$rootDir/Awk/gff2gff.awk


##########################################################
# Make the TSS file for the asked transcript biotypes    #
##########################################################

# a. Extract most 5' exons of transcripts from specified transcript biotypes
#############################################################################
echo "I am extracting the most 5' exons of transcripts from the biotypes specified by the user (default is no selection)" >&2
if [ -z "$trbiotypes" ] || [ ! -e "$trbiotypes" ]
then
    # no usable list of biotypes: all the exons are considered
    awk '$3=="exon"' "$annotation" | awk -v fldno=12 -f "$EXTRACT5p" > "${annotbase}_exons_most5p.gff"
else
    # the biotypes are stored the way they appear in the gtf file, that is "biotype";
    # the attributes are scanned two by two from field no 9 without ever reading past the last field,
    # so that an exon without transcript_biotype tag is skipped instead of looping forever
    awk -v fileRef="$trbiotypes" '
    BEGIN{
        while((getline < fileRef) > 0)
        {
            biotype["\"" $1 "\";"]=1;
        }
    }
    $3=="exon"{
        i=9;
        while((i<=NF)&&($i!="transcript_biotype"))
        {
            i+=2;
        }
        if((i<=NF)&&($(i+1) in biotype))
        {
            print;
        }
    }' "$annotation" | awk -v fldno=12 -f "$EXTRACT5p" > "${annotbase}_exons_most5p.gff"
fi

# b.
# Then the most 5' bp of each transcript for each gene (note that all tss are said to come from ensembl)
##########################################################################################################
# with associated confidence level= low confidence level whenever the tss belongs to a tr where the CDS start
#################################################################################################################
# not found tag was set, not_low otherwise (syntax was CDS start not found for v3c)
######################################################################################
echo "I am extracting the most 5' bp of each transcript for each gene," >&2
echo "associating a low confidence level when the tss comes from a tr where the cds_start_NF tag was set," >&2
echo "and adding the list of tr and of tr biotypes the tss comes from." >&2
awk '{
    # look for the transcript biotype among the attributes without ever reading past the last field;
    # an exon without the tag gets the placeholder . since an empty value would shift
    # the fields of the output file
    trbiot=".";
    i=9;
    while((i<=NF)&&($i!="transcript_biotype"))
    {
        i++;
    }
    if(i<=NF)
    {
        split($(i+1),c,"\"");
        trbiot=c[2];
    }
    confidence=(($0~/cds_start_NF/) ? "low" : "not_low");
    # the tss is the beginning of the exon on the + strand and its end otherwise
    tsspos=(($7=="+") ? $4 : $5);
    split($10,a,"\"");
    split($12,b,"\"");
    print $1, "Ensembl", "CapSite", tsspos, tsspos, ".", $7, ".", "gene_id", a[2], "tr", b[2], "trbiot", trbiot, "confidence", confidence;
}' "${annotbase}_exons_most5p.gff" | awk -f "$GFF2GFF" > "${annotbase}_capped_sites.gff"

# c. Finally collapse TSS per gene and put a low confidence level when the collapsed tss is composed of at least one low tss
#############################################################################################################################
# also add the gene biotype at the end
##########################################
echo "I am collapsing all TSSs per gene, put a low confidence level whenever one of the indiv tss has a low confidence level," >&2
echo "and add the gene biotype." >&2
# 1. cut the tss file to its 10 first fields and keep the distinct (position, gene) rows
# 2. add to each row the list of tr and of tr biotypes and the confidence level read from the tss file
# 3. add the biotype of the gene read from the gene rows of the annotation file
awk -v to=10 -f "$CUTGFF" "${annotbase}_capped_sites.gff" | sort -n | uniq | awk -f "$GFF2GFF" | awk -v fileRef="${annotbase}_capped_sites.gff" '
BEGIN{
    while((getline < fileRef) > 0)
    {
        k=($1"_"$4"_"$5"_"$7) SUBSEP ($10);
        trlist[k]=(trlist[k])($12)(",");
        trbiotlist[k]=(trbiotlist[k])($14)(",");
        if($16=="low")
        {
            low[k]=1;
        }
    }
}
{
    k=($1"_"$4"_"$5"_"$7) SUBSEP ($10);
    $11="trlist";
    $12=trlist[k];
    $13="trbiotlist";
    $14=trbiotlist[k];
    $15="confidence";
    $16=((low[k]==1) ? "low" : "not_low");
    print $0;
}' | awk -v fileRef="$annotation" '
BEGIN{
    while((getline < fileRef) > 0)
    {
        if($3=="gene")
        {
            split($10,a,"\"");
            # a gene row without gene_biotype tag gets the placeholder . instead of
            # looping forever or inheriting the biotype of the previous gene
            genebiot=".";
            i=9;
            while((i<=NF)&&($i!="gene_biotype"))
            {
                i+=2;
            }
            if(i<=NF)
            {
                split($(i+1),b,"\"");
                genebiot=b[2];
            }
            biotype[a[2]]=genebiot;
        }
    }
}
{
    print $0, "gene_biotype", biotype[$10];
}' | awk -f "$GFF2GFF" > "${annotbase}_capped_sites_nr_with_confidence.gff"



--------------------------------------------------------------------------------
/make_TSS_file_from_annotation_with_confidence_better.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# make_TSS_file_from_annotation_with_confidence_better.sh
# this script takes as input:
# - a Gencode annotation file in GTF format (mandatory)
# - a list of transcript biotypes to consider in this file (optional, default: no filtering)
# and provides as output the following files in the directory where it is executed:
# - a file of most 5' exons from each transcript in GFF2 format
# - a file of TSS from each transcript in GFF2 format
# - a file of distinct TSS from each gene in GFF2 format, associated to a low confidence level in case the cds_start_NF
#   tag was present in any transcript with this TSS

# Be careful:
#############
# makes the assumption that gene id and transcript id come as first and second (tag,value) pairs in the annotation GTF file

# Usage:
########
# make_TSS_file_from_annotation_with_confidence_better.sh annot.gtf [tr_biotypes.txt] 2> make_TSS_file_with_confidence_better_from_annotation.err &


# Check that the annotation file exists, otherwise exit
#######################################################
if [ -z "$1" ] || [ ! -e "$1" ]
then
    echo "" >&2
    if [ -n "$1" ]
    then
        echo "Annotation file $1 does not exist" >&2
    fi
    # the usage line is quoted: unquoted, [tr_biotypes.txt] is a glob pattern that the shell
    # could replace by the name of a matching one-character file of the current directory
    echo "Usage: make_TSS_file_from_annotation_with_confidence_better.sh annot.gtf [tr_biotypes.txt]" >&2
    echo "Be careful: this script makes the assumption that gene id and transcript id come as first" >&2
    echo "and second (tag,value) pairs in the GTF file" >&2
    echo "In case no list of transcript biotypes is specified there will be no filtering done" >&2
    echo "" >&2
    exit 1
fi

path="$(dirname "$0")" # relative path
rootDir="$( cd "$path" && pwd )" # absolute path

# Initialize variables
######################
annotation=$1
annotbase=$(basename "${annotation%.gtf}")
if [ -n "$2" ]
then
    trbiotypes=$2
fi

# Programs
##########
EXTRACT5p=$rootDir/Awk/extract_most_5p.awk
CUTGFF=$rootDir/Awk/cutgff.awk
GFF2GFF=$rootDir/Awk/gff2gff.awk


##########################################################
# Make the TSS file for the asked transcript biotypes    #
##########################################################

# a. Extract most 5' exons of transcripts from specified transcript biotypes
#############################################################################
echo "I am extracting the most 5' exons of transcripts from the biotypes specified by the user (default is no selection)" >&2
if [ -z "$trbiotypes" ] || [ ! -e "$trbiotypes" ]
then
    # no usable list of biotypes: all the exons are considered
    awk '$3=="exon"' "$annotation" | awk -v fldno=12 -f "$EXTRACT5p" > "${annotbase}_exons_most5p.gff"
else
    # the biotypes are stored the way they appear in the gtf file, that is "biotype";
    # the attributes are scanned two by two from field no 9 without ever reading past the last field,
    # so that an exon without transcript_type tag is skipped instead of looping forever
    awk -v fileRef="$trbiotypes" '
    BEGIN{
        while((getline < fileRef) > 0)
        {
            biotype["\"" $1 "\";"]=1;
        }
    }
    $3=="exon"{
        i=9;
        while((i<=NF)&&($i!="transcript_type"))
        {
            i+=2;
        }
        if((i<=NF)&&($(i+1) in biotype))
        {
            print;
        }
    }' "$annotation" | awk -v fldno=12 -f "$EXTRACT5p" > "${annotbase}_exons_most5p.gff"
fi

# b. Then the most 5' bp of each transcript for each gene (note that all tss are said to come from gencode)
##########################################################################################################
# with associated confidence level= low confidence level whenever the tss belongs to a tr where the CDS start
#################################################################################################################
# not found tag was set, not_low otherwise (syntax was CDS start not found for v3c)
######################################################################################
echo "I am extracting the most 5' bp of each transcript for each gene," >&2
echo "associating a low confidence level when the tss comes from a tr where the cds_start_NF tag was set," >&2
echo "and adding the list of tr and of tr biotypes the tss comes from." >&2
awk '{
    # look for the transcript biotype among the attributes without ever reading past the last field;
    # an exon without the tag gets the placeholder . since an empty value would shift
    # the fields of the output file
    trbiot=".";
    i=9;
    while((i<=NF)&&($i!="transcript_type"))
    {
        i++;
    }
    if(i<=NF)
    {
        split($(i+1),c,"\"");
        trbiot=c[2];
    }
    confidence=(($0~/cds_start_NF/) ? "low" : "not_low");
    # the tss is the beginning of the exon on the + strand and its end otherwise
    tsspos=(($7=="+") ? $4 : $5);
    split($10,a,"\"");
    split($12,b,"\"");
    print $1, "Gencode", "CapSite", tsspos, tsspos, ".", $7, ".", "gene_id", a[2], "tr", b[2], "trbiot", trbiot, "confidence", confidence;
}' "${annotbase}_exons_most5p.gff" | awk -f "$GFF2GFF" > "${annotbase}_capped_sites.gff"

# c.
# Finally collapse TSS per gene and put a low confidence level when the collapsed tss is composed of at least one low tss
#############################################################################################################################
# also add the gene biotype at the end
##########################################
echo "I am collapsing all TSSs per gene, put a low confidence level whenever one of the indiv tss has a low confidence level," >&2
echo "and add the gene biotype." >&2
# 1. cut the tss file to its 10 first fields and keep the distinct (position, gene) rows
# 2. add to each row the list of tr and of tr biotypes and the confidence level read from the tss file
# 3. add the biotype of the gene read from the gene rows of the annotation file
awk -v to=10 -f "$CUTGFF" "${annotbase}_capped_sites.gff" | sort -n | uniq | awk -f "$GFF2GFF" | awk -v fileRef="${annotbase}_capped_sites.gff" '
BEGIN{
    while((getline < fileRef) > 0)
    {
        k=($1"_"$4"_"$5"_"$7) SUBSEP ($10);
        trlist[k]=(trlist[k])($12)(",");
        trbiotlist[k]=(trbiotlist[k])($14)(",");
        if($16=="low")
        {
            low[k]=1;
        }
    }
}
{
    k=($1"_"$4"_"$5"_"$7) SUBSEP ($10);
    $11="trlist";
    $12=trlist[k];
    $13="trbiotlist";
    $14=trbiotlist[k];
    $15="confidence";
    $16=((low[k]==1) ? "low" : "not_low");
    print $0;
}' | awk -v fileRef="$annotation" '
BEGIN{
    while((getline < fileRef) > 0)
    {
        if($3=="gene")
        {
            split($10,a,"\"");
            # a gene row without gene_type tag gets the placeholder . instead of
            # looping forever or inheriting the biotype of the previous gene
            genebiot=".";
            i=9;
            while((i<=NF)&&($i!="gene_type"))
            {
                i+=2;
            }
            if(i<=NF)
            {
                split($(i+1),b,"\"");
                genebiot=b[2];
            }
            biotype[a[2]]=genebiot;
        }
    }
}
{
    print $0, "gene_biotype", biotype[$10];
}' | awk -f "$GFF2GFF" > "${annotbase}_capped_sites_nr_with_confidence.gff"



--------------------------------------------------------------------------------