├── Awk
    ├── gff2gff.awk
    ├── cutgff.awk
    └── extract_most_5p.awk
├── README.md
├── make_TSS_file_from_ensembl_annot.sh
└── make_TSS_file_from_annotation_with_confidence_better.sh


/Awk/gff2gff.awk:
--------------------------------------------------------------------------------
 1 | 
 2 | # ~/Awk/gff2gff.awk
 3 | 
 4 | # Data lines (first field without "#"): print fields 1-8 separated by tabs,
 5 | # then, if present, fields 9..NF joined by single blanks as one 9th column.
 6 | # Each field is passed to printf as an argument, never as the format string,
 7 | # so a "%" inside a field is printed literally instead of being interpreted.
 8 | $1!~/#/{
 9 |     for (i=1;i<=7;i++)
10 |     {
11 |         printf "%s\t", $i;
12 |     }
13 |     printf "%s", $8;
14 |     if(NF>8)
15 |     {
16 |         printf "\t%s", $9;
17 |         for (i=10;i<=NF;i++)
18 |         {
19 |             printf " %s", $i;
20 |         }
21 |     }
22 |     print "";
23 | }
24 | 
25 | # Lines whose first field contains "#" (comments, headers) are copied unchanged.
26 | $1~/#/{print}


--------------------------------------------------------------------------------
/Awk/cutgff.awk:
--------------------------------------------------------------------------------
 1 | # ~/Awk/cutgff.awk
 2 | # this script cuts a gff file to the to-th field (output: fields no 1 to no "to",
 3 | # where "to" is specified as an argument)
 4 | 
 5 | # awk -v to=10 -f ~/Awk/cutgff.awk in.gff > out.gff
 6 | 
 7 | 
 8 | # Rebuild every record from its first "to" fields: fields 1 to 8 are each
 9 | # followed by a tab, fields from the 9th on are separated by single blanks,
10 | # and the last kept field is appended without a trailing separator.
11 | {
12 |     ntabs = (to<=9) ? to-1 : 8;     # number of leading fields followed by a tab
13 |     out = "";
14 |     for (k=1; k<=ntabs; k++)
15 |         out = out $k "\t";
16 |     for (; k<=to-1; k++)            # only runs when to > 9
17 |         out = out $k " ";
18 |     out = out $k;                   # k is now the index of the last kept field
19 |     print out;
20 | }


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | MakeGencodeTSS
 2 | ==============
 3 | 
 4 | Makes distinct TSSs (transcription start sites) of each gene from a Gencode or an Ensembl gene annotation file in GTF format.
 5 | 
 6 | There are two versions of the same script here, one for Gencode and one for Ensembl.
 7 | 
 8 | This script takes as input:
 9 | - a Gencode or Ensembl annotation file in GTF format (mandatory)
10 | - a list of transcript biotypes to consider in this file (optional, default: no filtering)
11 | 
12 | This script provides as output the following files in the directory where it is executed:
13 | - a file of most 5' exons from each transcript in GFF2 format
14 | - a file of TSS from each transcript in GFF2 format
15 | - a file of distinct TSS from each gene in GFF2 format, associated to a low confidence level in case the cds_start_NF 
16 |   tag was present in any transcript with this TSS
17 | 


--------------------------------------------------------------------------------
/Awk/extract_most_5p.awk:
--------------------------------------------------------------------------------
 1 | #~/Awk/extract_most_5p.awk
 2 | #For all the features of a gff input file that have the field no fldno in common,
 3 | #extract the most 5' features, taking the strand into account
 4 | #be careful: the features must be stranded
 5 | #when two features are equally upstream then we take the longer one
 6 | # on 12/18/2013 edited so that only + and - strand features are considered
 7 | 
 8 | #usage
 9 | #Chr21=/projects/encode/scaling_up/chr21_22/annotations/hg18/Havana_chr21_exons_nopseudo.gff
10 | #awk -v fldno=10 -f ~/Awk/extract_most_5p.awk $Chr21 
11 | 
12 | #$Chr21
13 | #chr21   VEGA_Novel_CDS  exon    9884493 9884538 .       +       0       transcript_id "AF254982.1-001"; gene_id "AF254982.1"; mRNA_start_not_found "0"; mRNA_end_not_found "0"; start_codon_not_found "0"; stop_codon_not_found "0";
14 | 
15 | # Main rule: among the "+"/"-" features sharing the same value of field number fldno, keep the most 5' one.
16 | # On "+" the 5' end is the start ($4), on "-" it is the end ($5); on equal 5' ends the longer feature wins.
17 | (($7=="+")||($7=="-")){
18 |   key=$fldno;
19 |   seen[key]++;
20 |   if(seen[key]==1)   # first feature of this group: remember its strand and take it as is
21 |     {
22 |       strand[key]=$7;
23 |       better=1;
24 |     }
25 |   else if(strand[key]=="+")   # plus strand: a smaller start is more 5' (ties resolved by before())
26 |     better=before($4,$5,most5p_beg[key],most5p_end[key]);
27 |   else   # minus strand: a larger end is more 5'; with equal ends the smaller start is the longer feature
28 |     better=(($5+0>most5p_end[key]+0)||(($5+0==most5p_end[key]+0)&&($4+0<most5p_beg[key]+0)));
29 |   if(better)
30 |     {
31 |       most5p_beg[key]=$4; most5p_end[key]=$5; most5p_all[key]=$0;
32 |     }
33 | }
34 | 
35 | # After the last record: print the feature retained for every group.
36 | END{
37 |   for(grp in seen)
38 |     print most5p_all[grp];
39 | }
40 | 
41 | 
42 | # before(beg1,end1,beg2,end2) returns 1 when the first feature starts before
43 | # the second one, or when both start at the same position and the first one
44 | # is strictly longer than the second; it returns 0 in every other case.
45 | function before(beg1,end1,beg2,end2)
46 | {
47 |   if(beg1<beg2)
48 |     return 1;
49 |   if(beg1==beg2)
50 |     return ((end1-beg1)>(end2-beg2)) ? 1 : 0;
51 |   return 0;
52 | }
49 | 
50 | 


--------------------------------------------------------------------------------
/make_TSS_file_from_ensembl_annot.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # make_TSS_file_from_ensembl_annot.sh
 4 | # this script takes as input:
 5 | # - an Ensembl annotation file in GTF format (mandatory)
 6 | # - a list of transcript biotypes to consider in this file (optional, default: no filtering)
 7 | # and provides as output the following files in the directory where it is executed:
 8 | # - a file of most 5' exons from each transcript in GFF2 format
 9 | # - a file of TSS from each transcript in GFF2 format
10 | # - a file of distinct TSS from each gene in GFF2 format, associated to a low confidence level in case the cds_start_NF 
11 | #   tag was present in any transcript with this TSS
12 | 
13 | # Be careful:
14 | #############
15 | # makes the assumption that gene id and transcript id come as first and second (tag,value) pairs in the annotation GTF file
16 | 
17 | # Usage:
18 | ########
19 | # make_TSS_file_from_ensembl_annot.sh annot.gtf [tr_biotypes.txt] 2> make_TSS_file_from_ensembl_annot.err &
20 | 
21 | 
22 | # Check that the annotation file exists, otherwise exit
23 | #######################################################
24 | if [ ! -n "$1" ]
25 | then
26 |     echo "" >&2
27 |     echo Usage: make_TSS_file_from_annotation_with_confidence_better.sh annot.gtf [tr_biotypes.txt] >&2
28 |     echo Be careful: this script makes the assumption that gene id and transcript id come as first >&2
29 |     echo and second \(tag\,value\) pairs \in the GTF file >&2
30 |     echo In case no list of transcript biotypes is specified there will be no filtering done >&2
31 |     echo "" >&2
32 |     exit 1
33 | fi
34 | 
35 | # Absolute directory of this script; the Awk helpers are looked up relative to it.
36 | path=$(dirname "$0") # relative path
37 | rootDir=$(cd "$path" && pwd) # absolute path
38 | 
39 | # Initialize variables
40 | ######################
41 | annotation=$1
42 | # Output prefix: annotation file name without directory and .gtf suffix (quoted, so blanks do not split it)
43 | annotbase=$(basename "${annotation%.gtf}")
44 | # Optional list of transcript biotypes; always (re)set so that a value inherited from the environment is ignored
45 | trbiotypes=${2:-}
46 | 
47 | # Programs
48 | ##########
49 | EXTRACT5p=$rootDir/Awk/extract_most_5p.awk
50 | CUTGFF=$rootDir/Awk/cutgff.awk
51 | GFF2GFF=$rootDir/Awk/gff2gff.awk
52 | 
53 | 
54 | ##########################################################
55 | # Make the TSS file for the asked transcript biotypes    #
56 | ##########################################################
57 | 
58 | # a. Extract most 5' exons of transcripts from specified transcript biotypes
59 | #############################################################################
60 | echo I am extracting the most 5\' exons of transcripts from the biotypes specified by the user \(default is no selection\) >&2
61 | if [ ! -n "$trbiotypes" ] || [ ! -e "$trbiotypes" ]
62 | then
63 | awk '$3=="exon"' $annotation | awk -v fldno=12 -f $EXTRACT5p > $annotbase\_exons_most5p.gff
64 | else
65 | awk -v fileRef=$trbiotypes 'BEGIN{while(getline < fileRef >0){biotype["\""$1"\"\;"]=1;}} $3=="exon"{i=9; while($i!="transcript_biotype"){i+=2}if(($i=="transcript_biotype")&&(biotype[$(i+1)]==1)){print}}' $annotation | awk -v fldno=12 -f $EXTRACT5p > $annotbase\_exons_most5p.gff
66 | fi
67 | 
68 | # b. Then the most 5' bp of each transcript for each gene (note that all tss are said to come from ensembl),
69 | #    with associated confidence level = low whenever the tss belongs to a tr where the cds_start_NF tag
70 | #    was set, not_low otherwise (syntax was CDS start not found for v3c).
71 | #    The transcript_biotype tag is searched from field 9 up to the end of the record only, so a record
72 | #    without this tag gets "." as biotype instead of making awk loop forever.
73 | ########################################################################################################
74 | echo I am extracting the most 5\' bp of each transcript for each gene, >&2
75 | echo associating a low confidence level when the tss comes from a tr where the cds_start_NF tag was set, >&2
76 | echo and adding the list of tr and of tr biotypes the tss comes from. >&2
77 | awk '{trbiot="."; for(i=9; i<NF; i++){if($i=="transcript_biotype"){split($(i+1),c,"\""); trbiot=c[2]; break}} confidence=($0~/cds_start_NF/) ? "low" : "not_low"; tsspos=($7=="+") ? $4 : $5; split($10,a,"\""); split($12,b,"\""); print $1, "Ensembl", "CapSite", tsspos, tsspos, ".", $7, ".", "gene_id", a[2], "tr", b[2], "trbiot", trbiot, "confidence", confidence;}' "${annotbase}_exons_most5p.gff" | awk -f "$GFF2GFF" > "${annotbase}_capped_sites.gff"
78 | 
79 | # c. Finally collapse TSS per gene and put a low confidence level when the collapsed tss is composed of at least one low tss
80 | #############################################################################################################################
81 | #    also add the gene biotype at the end
82 | ##########################################
83 | echo I am collapsing all TSSs per gene, put a low confidence level whenever one of the indiv tss has a low confidence level, >&2
84 | echo and add the gene biotype. >&2
85 | # The pipeline below has three stages:
86 | #  1. cut the per-transcript TSS file to its first 10 fields and keep each distinct (TSS, gene) line once;
87 | #  2. re-read the per-transcript TSS file to attach to each distinct TSS the comma-separated lists of its
88 | #     transcripts and transcript biotypes, and the confidence "low" if any of its transcripts is "low";
89 | #  3. read the gene lines of the annotation to append the gene biotype; the gene_biotype tag is searched
90 | #     only up to the end of the record, so a gene line without it yields an empty biotype instead of an endless loop.
91 | awk -v to=10 -f "$CUTGFF" "${annotbase}_capped_sites.gff" | sort -n | uniq -c | awk '{$1=""; print $0}' | awk -f "$GFF2GFF" |
92 | awk -v fileRef="${annotbase}_capped_sites.gff" '
93 | BEGIN{
94 |     while((getline < fileRef) > 0)
95 |     {
96 |         k=$1"_"$4"_"$5"_"$7 SUBSEP $10;
97 |         trlist[k]=trlist[k] $12 ",";
98 |         trbiotlist[k]=trbiotlist[k] $14 ",";
99 |         if($16=="low"){low[k]=1;}
100 |     }
101 | }
102 | {
103 |     k=$1"_"$4"_"$5"_"$7 SUBSEP $10;
104 |     $11="trlist"; $12=trlist[k]; $13="trbiotlist"; $14=trbiotlist[k]; $15="confidence"; $16=(k in low) ? "low" : "not_low";
105 |     print $0
106 | }' |
107 | awk -v fileRef="$annotation" '
108 | BEGIN{
109 |     while((getline < fileRef) > 0)
110 |     {
111 |         if($3=="gene")
112 |         {
113 |             split($10,a,"\"");
114 |             gbt="";
115 |             for(i=9; i<NF; i+=2){if($i=="gene_biotype"){split($(i+1),b,"\""); gbt=b[2]; break}}
116 |             biotype[a[2]]=gbt
117 |         }
118 |     }
119 | }
120 | {print $0, "gene_biotype", biotype[$10]}' | awk -f "$GFF2GFF" > "${annotbase}_capped_sites_nr_with_confidence.gff"
121 | 


--------------------------------------------------------------------------------
/make_TSS_file_from_annotation_with_confidence_better.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # make_TSS_file_from_annotation_with_confidence_better.sh
 4 | # this script takes as input:
 5 | # - a Gencode annotation file in GTF format (mandatory)
 6 | # - a list of transcript biotypes to consider in this file (optional, default: no filtering)
 7 | # and provides as output the following files in the directory where it is executed:
 8 | # - a file of most 5' exons from each transcript in GFF2 format
 9 | # - a file of TSS from each transcript in GFF2 format
10 | # - a file of distinct TSS from each gene in GFF2 format, associated to a low confidence level in case the cds_start_NF 
11 | #   tag was present in any transcript with this TSS
12 | 
13 | # Be careful:
14 | #############
15 | # makes the assumption that gene id and transcript id come as first and second (tag,value) pairs in the annotation GTF file
16 | 
17 | # Usage:
18 | ########
19 | # make_TSS_file_from_annotation_with_confidence_better.sh annot.gtf [tr_biotypes.txt] 2> make_TSS_file_with_confidence_better_from_annotation.err &
20 | 
21 | 
22 | # Check that the annotation file exists, otherwise exit
23 | #######################################################
24 | if [ ! -n "$1" ]
25 | then
26 |     echo "" >&2
27 |     echo Usage: make_TSS_file_from_annotation_with_confidence_better.sh annot.gtf [tr_biotypes.txt] >&2
28 |     echo Be careful: this script makes the assumption that gene id and transcript id come as first >&2
29 |     echo and second \(tag\,value\) pairs \in the GTF file >&2
30 |     echo In case no list of transcript biotypes is specified there will be no filtering done >&2
31 |     echo "" >&2
32 |     exit 1
33 | fi
34 | 
35 | # Absolute directory of this script; the Awk helpers are looked up relative to it.
36 | path=$(dirname "$0") # relative path
37 | rootDir=$(cd "$path" && pwd) # absolute path
38 | 
39 | # Initialize variables
40 | ######################
41 | annotation=$1
42 | # Output prefix: annotation file name without directory and .gtf suffix (quoted, so blanks do not split it)
43 | annotbase=$(basename "${annotation%.gtf}")
44 | # Optional list of transcript biotypes; always (re)set so that a value inherited from the environment is ignored
45 | trbiotypes=${2:-}
46 | 
47 | # Programs
48 | ##########
49 | EXTRACT5p=$rootDir/Awk/extract_most_5p.awk
50 | CUTGFF=$rootDir/Awk/cutgff.awk
51 | GFF2GFF=$rootDir/Awk/gff2gff.awk
52 | 
53 | 
54 | ##########################################################
55 | # Make the TSS file for the asked transcript biotypes    #
56 | ##########################################################
57 | 
58 | # a. Extract most 5' exons of transcripts from specified transcript biotypes
59 | #############################################################################
60 | echo I am extracting the most 5\' exons of transcripts from the biotypes specified by the user \(default is no selection\) >&2
61 | if [ ! -n "$trbiotypes" ] || [ ! -e "$trbiotypes" ]
62 | then
63 | awk '$3=="exon"' $annotation | awk -v fldno=12 -f $EXTRACT5p > $annotbase\_exons_most5p.gff
64 | else
65 | awk -v fileRef=$trbiotypes 'BEGIN{while(getline < fileRef >0){biotype["\""$1"\"\;"]=1;}} $3=="exon"{i=9; while($i!="transcript_type"){i+=2}if(($i=="transcript_type")&&(biotype[$(i+1)]==1)){print}}' $annotation | awk -v fldno=12 -f $EXTRACT5p > $annotbase\_exons_most5p.gff
66 | fi
67 | 
68 | # b. Then the most 5' bp of each transcript for each gene (note that all tss are said to come from gencode),
69 | #    with associated confidence level = low whenever the tss belongs to a tr where the cds_start_NF tag
70 | #    was set, not_low otherwise (syntax was CDS start not found for v3c).
71 | #    The transcript_type tag is searched from field 9 up to the end of the record only, so a record
72 | #    without this tag gets "." as biotype instead of making awk loop forever.
73 | ########################################################################################################
74 | echo I am extracting the most 5\' bp of each transcript for each gene, >&2
75 | echo associating a low confidence level when the tss comes from a tr where the cds_start_NF tag was set, >&2
76 | echo and adding the list of tr and of tr biotypes the tss comes from. >&2
77 | awk '{trbiot="."; for(i=9; i<NF; i++){if($i=="transcript_type"){split($(i+1),c,"\""); trbiot=c[2]; break}} confidence=($0~/cds_start_NF/) ? "low" : "not_low"; tsspos=($7=="+") ? $4 : $5; split($10,a,"\""); split($12,b,"\""); print $1, "Gencode", "CapSite", tsspos, tsspos, ".", $7, ".", "gene_id", a[2], "tr", b[2], "trbiot", trbiot, "confidence", confidence;}' "${annotbase}_exons_most5p.gff" | awk -f "$GFF2GFF" > "${annotbase}_capped_sites.gff"
78 | 
79 | # c. Finally collapse TSS per gene and put a low confidence level when the collapsed tss is composed of at least one low tss
80 | #############################################################################################################################
81 | #    also add the gene biotype at the end
82 | ##########################################
83 | echo I am collapsing all TSSs per gene, put a low confidence level whenever one of the indiv tss has a low confidence level, >&2
84 | echo and add the gene biotype. >&2
85 | # The pipeline below has three stages:
86 | #  1. cut the per-transcript TSS file to its first 10 fields and keep each distinct (TSS, gene) line once;
87 | #  2. re-read the per-transcript TSS file to attach to each distinct TSS the comma-separated lists of its
88 | #     transcripts and transcript biotypes, and the confidence "low" if any of its transcripts is "low";
89 | #  3. read the gene lines of the annotation to append the gene biotype; the gene_type tag is searched
90 | #     only up to the end of the record, so a gene line without it yields an empty biotype instead of an endless loop.
91 | awk -v to=10 -f "$CUTGFF" "${annotbase}_capped_sites.gff" | sort -n | uniq -c | awk '{$1=""; print $0}' | awk -f "$GFF2GFF" |
92 | awk -v fileRef="${annotbase}_capped_sites.gff" '
93 | BEGIN{
94 |     while((getline < fileRef) > 0)
95 |     {
96 |         k=$1"_"$4"_"$5"_"$7 SUBSEP $10;
97 |         trlist[k]=trlist[k] $12 ",";
98 |         trbiotlist[k]=trbiotlist[k] $14 ",";
99 |         if($16=="low"){low[k]=1;}
100 |     }
101 | }
102 | {
103 |     k=$1"_"$4"_"$5"_"$7 SUBSEP $10;
104 |     $11="trlist"; $12=trlist[k]; $13="trbiotlist"; $14=trbiotlist[k]; $15="confidence"; $16=(k in low) ? "low" : "not_low";
105 |     print $0
106 | }' |
107 | awk -v fileRef="$annotation" '
108 | BEGIN{
109 |     while((getline < fileRef) > 0)
110 |     {
111 |         if($3=="gene")
112 |         {
113 |             split($10,a,"\"");
114 |             gbt="";
115 |             for(i=9; i<NF; i+=2){if($i=="gene_type"){split($(i+1),b,"\""); gbt=b[2]; break}}
116 |             biotype[a[2]]=gbt
117 |         }
118 |     }
119 | }
120 | {print $0, "gene_biotype", biotype[$10]}' | awk -f "$GFF2GFF" > "${annotbase}_capped_sites_nr_with_confidence.gff"
121 | 


--------------------------------------------------------------------------------