├── Makefile
├── example.png
├── guess
    ├── hg19.genes.gz
    ├── hg38.genes.gz
    ├── hg19.genome.gz
    ├── hg38.genome.gz
    ├── split_exon.pl
    ├── check_size.pl
    └── README.md
├── gencode
    ├── gene_type.png
    ├── genomic_region_length.png
    ├── genomic_region_proportion.png
    ├── gencode.v20.annotation.gtf.stats
    ├── gencode.v21.annotation.gtf.stats
    ├── gencode.v22.annotation.gtf.stats
    ├── gencode.v23.annotation.gtf.stats
    ├── gencode.v24.annotation.gtf.stats
    ├── gencode.v25.annotation.gtf.stats
    ├── gencode.v26.annotation.gtf.stats
    ├── gencode.v27.annotation.gtf.stats
    ├── gencode.v28.annotation.gtf.stats
    ├── gencode.v29.annotation.gtf.stats
    ├── gencode.v30.annotation.gtf.stats
    ├── gencode.v31.annotation.gtf.stats
    ├── gencode.v32.annotation.gtf.stats
    ├── gencode.v33.annotation.gtf.stats
    ├── gencode.v34.annotation.gtf.stats
    ├── gencode.v35.annotation.gtf.stats
    ├── README.md
    └── plot_stats.Rmd
├── .gitignore
├── chrom_info
    ├── araTha1.genome
    ├── README.md
    ├── hg19.genome
    └── hg38.genome
├── script
    ├── README.md
    ├── gtf_to_bed.pl
    ├── gtf_to_bed_single.pl
    ├── merge_by_id.pl
    └── gtf_add_exon.pl
├── coverage.pl
├── check_utr.pl
├── print_utr.pl
├── run.sh
├── promoter.pl
├── liftover
    └── README.md
├── run.pl
└── README.md


/Makefile:
--------------------------------------------------------------------------------
1 | # "all" runs the pipeline script; "clean" removes the bedtools checkout and
2 | # every .gz file in this directory. Neither target names a real file.
3 | .PHONY: all clean
4 | 
5 | all:
6 | 	./run.sh
7 | 
8 | clean:
9 | 	rm -rf bedtools2 *.gz


--------------------------------------------------------------------------------
/example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davetang/defining_genomic_regions/HEAD/example.png


--------------------------------------------------------------------------------
/guess/hg19.genes.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davetang/defining_genomic_regions/HEAD/guess/hg19.genes.gz


--------------------------------------------------------------------------------
/guess/hg38.genes.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davetang/defining_genomic_regions/HEAD/guess/hg38.genes.gz


--------------------------------------------------------------------------------
/gencode/gene_type.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davetang/defining_genomic_regions/HEAD/gencode/gene_type.png


--------------------------------------------------------------------------------
/guess/hg19.genome.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davetang/defining_genomic_regions/HEAD/guess/hg19.genome.gz


--------------------------------------------------------------------------------
/guess/hg38.genome.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davetang/defining_genomic_regions/HEAD/guess/hg38.genome.gz


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | bedtools2
2 | transcript*
3 | *.bed.gz
4 | *.gtf.gz
5 | gencode/*.gtf.gz
6 | my_*
7 | *.swp
8 | .DS_Store
9 | 


--------------------------------------------------------------------------------
/chrom_info/araTha1.genome:
--------------------------------------------------------------------------------
1 | 1	30427671
2 | 5	26975502
3 | 3	23459830
4 | 2	19698289
5 | 4	18585056
6 | Mt	366924
7 | Pt	154478
8 | 


--------------------------------------------------------------------------------
/gencode/genomic_region_length.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davetang/defining_genomic_regions/HEAD/gencode/genomic_region_length.png


--------------------------------------------------------------------------------
/gencode/genomic_region_proportion.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/davetang/defining_genomic_regions/HEAD/gencode/genomic_region_proportion.png


--------------------------------------------------------------------------------
/gencode/gencode.v20.annotation.gtf.stats:
--------------------------------------------------------------------------------
1 | exon_coverage: 3.70
2 | intron_coverage: 46.64
3 | intergenic_coverage: 49.66
4 | exon_length: 410.43
5 | intron_length: 5035.45
6 | intergenic_length: 48843.25
7 | 


--------------------------------------------------------------------------------
/gencode/gencode.v21.annotation.gtf.stats:
--------------------------------------------------------------------------------
1 | exon_coverage: 3.79
2 | intron_coverage: 46.75
3 | intergenic_coverage: 49.46
4 | exon_length: 418.99
5 | intron_length: 5021.59
6 | intergenic_length: 48055.09
7 | 


--------------------------------------------------------------------------------
/gencode/gencode.v22.annotation.gtf.stats:
--------------------------------------------------------------------------------
1 | exon_coverage: 3.82
2 | intron_coverage: 46.88
3 | intergenic_coverage: 49.30
4 | exon_length: 421.22
5 | intron_length: 5014.63
6 | intergenic_length: 47992.55
7 | 


--------------------------------------------------------------------------------
/gencode/gencode.v23.annotation.gtf.stats:
--------------------------------------------------------------------------------
1 | exon_coverage: 3.81
2 | intron_coverage: 46.90
3 | intergenic_coverage: 49.29
4 | exon_length: 419.83
5 | intron_length: 5011.03
6 | intergenic_length: 47987.25
7 | 


--------------------------------------------------------------------------------
/gencode/gencode.v24.annotation.gtf.stats:
--------------------------------------------------------------------------------
1 | exon_coverage: 3.82
2 | intron_coverage: 47.01
3 | intergenic_coverage: 49.17
4 | exon_length: 419.99
5 | intron_length: 5023.59
6 | intergenic_length: 47595.16
7 | 


--------------------------------------------------------------------------------
/gencode/gencode.v25.annotation.gtf.stats:
--------------------------------------------------------------------------------
1 | exon_coverage: 3.84
2 | intron_coverage: 47.41
3 | intergenic_coverage: 48.75
4 | exon_length: 422.98
5 | intron_length: 5056.68
6 | intergenic_length: 48517.51
7 | 


--------------------------------------------------------------------------------
/gencode/gencode.v26.annotation.gtf.stats:
--------------------------------------------------------------------------------
1 | exon_coverage: 3.86
2 | intron_coverage: 47.46
3 | intergenic_coverage: 48.69
4 | exon_length: 424.52
5 | intron_length: 5063.18
6 | intergenic_length: 48444.65
7 | 


--------------------------------------------------------------------------------
/gencode/gencode.v27.annotation.gtf.stats:
--------------------------------------------------------------------------------
1 | exon_coverage: 3.88
2 | intron_coverage: 47.63
3 | intergenic_coverage: 48.49
4 | exon_length: 426.11
5 | intron_length: 5060.58
6 | intergenic_length: 48331.74
7 | 


--------------------------------------------------------------------------------
/gencode/gencode.v28.annotation.gtf.stats:
--------------------------------------------------------------------------------
1 | exon_coverage: 3.92
2 | intron_coverage: 48.14
3 | intergenic_coverage: 47.94
4 | exon_length: 428.63
5 | intron_length: 5069.04
6 | intergenic_length: 48263.94
7 | 


--------------------------------------------------------------------------------
/gencode/gencode.v29.annotation.gtf.stats:
--------------------------------------------------------------------------------
1 | exon_coverage: 3.95
2 | intron_coverage: 48.94
3 | intergenic_coverage: 47.10
4 | exon_length: 430.83
5 | intron_length: 5092.15
6 | intergenic_length: 47885.29
7 | 


--------------------------------------------------------------------------------
/gencode/gencode.v30.annotation.gtf.stats:
--------------------------------------------------------------------------------
1 | exon_coverage: 3.99
2 | intron_coverage: 49.50
3 | intergenic_coverage: 46.50
4 | exon_length: 433.69
5 | intron_length: 5107.94
6 | intergenic_length: 47462.92
7 | 


--------------------------------------------------------------------------------
/gencode/gencode.v31.annotation.gtf.stats:
--------------------------------------------------------------------------------
1 | exon_coverage: 4.24
2 | intron_coverage: 51.95
3 | intergenic_coverage: 43.81
4 | exon_length: 449.80
5 | intron_length: 5176.52
6 | intergenic_length: 45084.36
7 | 


--------------------------------------------------------------------------------
/gencode/gencode.v32.annotation.gtf.stats:
--------------------------------------------------------------------------------
1 | exon_coverage: 4.25
2 | intron_coverage: 51.99
3 | intergenic_coverage: 43.76
4 | exon_length: 450.53
5 | intron_length: 5174.38
6 | intergenic_length: 45058.76
7 | 


--------------------------------------------------------------------------------
/gencode/gencode.v33.annotation.gtf.stats:
--------------------------------------------------------------------------------
1 | exon_coverage: 4.25
2 | intron_coverage: 51.98
3 | intergenic_coverage: 43.77
4 | exon_length: 451.04
5 | intron_length: 5168.91
6 | intergenic_length: 45024.93
7 | 


--------------------------------------------------------------------------------
/gencode/gencode.v34.annotation.gtf.stats:
--------------------------------------------------------------------------------
1 | exon_coverage: 4.27
2 | intron_coverage: 51.97
3 | intergenic_coverage: 43.76
4 | exon_length: 452.82
5 | intron_length: 5167.13
6 | intergenic_length: 45019.35
7 | 


--------------------------------------------------------------------------------
/gencode/gencode.v35.annotation.gtf.stats:
--------------------------------------------------------------------------------
1 | exon_coverage: 4.29
2 | intron_coverage: 52.03
3 | intergenic_coverage: 43.68
4 | exon_length: 454.52
5 | intron_length: 5161.43
6 | intergenic_length: 45067.04
7 | 


--------------------------------------------------------------------------------
/guess/split_exon.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | #
 3 | # Reads tab-delimited rows of "chrom, comma-separated starts, comma-separated
 4 | # ends" from STDIN or the files named on the command line and prints one
 5 | # "chrom<TAB>start<TAB>end" line per start/end pair.
 6 | #
 7 | 
 8 | use strict;
 9 | use warnings;
10 | 
11 | while (my $row = <>){
12 |    chomp $row;
13 |    # skip lines beginning with "chrom" (e.g. a column header)
14 |    next if $row =~ /^chrom/;
15 |    my ($chr, $start_list, $end_list) = split(/\t/, $row);
16 |    my @exon_starts = split(/,/, $start_list);
17 |    my @exon_ends   = split(/,/, $end_list);
18 |    # the number of starts decides how many lines are printed
19 |    for (my $i = 0; $i < @exon_starts; $i++){
20 |       print join("\t", $chr, $exon_starts[$i], $exon_ends[$i]), "\n";
21 |    }
22 | }
23 | 
24 | exit();
25 | 
26 | 


--------------------------------------------------------------------------------
/chrom_info/README.md:
--------------------------------------------------------------------------------
 1 | ## README
 2 | 
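 3 | Download chromosome sizes: hg19 and hg38 from the UCSC MySQL server, and araTha1 from a UCSC-hosted `chrom.sizes` file (with the `chr` prefix removed and `Cp` renamed to `Pt`).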
 4 | 
 5 | ```bash
 6 | for genome in hg19 hg38; do
 7 |    mysql --user=genome \
 8 |          --host=genome-mysql.cse.ucsc.edu \
 9 |          -A \
10 |          -e "select chrom, size from ${genome}.chromInfo" | grep -v "^chrom" > ${genome}.genome
11 | done
12 | 
13 | wget -q -O - http://genome-test.cse.ucsc.edu/~hiram/hubs/Plants/araTha1/araTha1.chrom.sizes |
14 |   sed 's/^chr//' |
15 |   sed 's/Cp/Pt/' > araTha1.genome
16 | ```
17 | 
18 | 


--------------------------------------------------------------------------------
/script/README.md:
--------------------------------------------------------------------------------
 1 | ## README
 2 | 
 3 | Download GTF file
 4 | 
 5 |     wget -c -N ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_33/gencode.v33.annotation.gtf.gz
 6 | 
 7 | Convert to BED using "gene" entries in the GTF file
 8 | 
 9 |     ./gtf_to_bed.pl -i gencode.v33.annotation.gtf.gz -f gene > gencode.v33.gene.bed
10 | 
11 | ## Scripts
12 | 
13 | The `merge_by_id.pl` script will merge by the annotation column (column 4). This is useful when you only want to merge features with the same ID; I could not find an elegant solution, so I wrote this script. It requires `bedtools` and will create a lot of temporary files (one per feature in the BED file). It works by running `bedtools merge` on each feature, so it can be very slow with files with a lot of different features. You can specify more threads if you have multiple cores on your system.
14 | 
15 | 


--------------------------------------------------------------------------------
/coverage.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | #
 3 | # Prints the percentage of bases found in the merged exon, intron and
 4 | # intergenic BED files named gencode_v<version>_*.bed.gz.
 5 | #
 6 | # Usage: coverage.pl [gencode_version]
 7 | #
 8 | 
 9 | use strict;
10 | use warnings;
11 | 
12 | # GENCODE release embedded in the input file names; 19 unless a version is
13 | # given as the first command-line argument
14 | my $v = @ARGV ? shift @ARGV : 19;
15 | 
16 | my $exon_file       = "gencode_v${v}_exon_merged.bed.gz";
17 | my $intergenic_file = "gencode_v${v}_intergenic.bed.gz";
18 | my $intron_file     = "gencode_v${v}_intron.bed.gz";
19 | 
20 | my $exon_coverage       = coverage($exon_file);
21 | my $intergenic_coverage = coverage($intergenic_file);
22 | my $intron_coverage     = coverage($intron_file);
23 | 
24 | my $total = $exon_coverage + $intergenic_coverage + $intron_coverage;
25 | 
26 | # empty inputs would otherwise end in "Illegal division by zero"
27 | die "No bases found in $exon_file, $intron_file and $intergenic_file\n" if $total == 0;
28 | 
29 | printf "Exon: %.2f\n", $exon_coverage*100/$total;
30 | printf "Intron: %.2f\n", $intron_coverage*100/$total;
31 | printf "Intergenic: %.2f\n", $intergenic_coverage*100/$total;
32 | 
33 | # Returns the summed length (end - start) of every line in a gzipped,
34 | # tab-delimited file whose second and third columns are start and end.
35 | # Dies if the file is missing or cannot be decompressed.
36 | sub coverage {
37 |    my ($infile) = @_;
38 |    # a pipe open succeeds even when the file is absent, so test it first
39 |    -e $infile || die "Could not open $infile: $!\n";
40 |    my $coverage = 0;
41 |    # list form: the file name is passed to gunzip without going through a shell
42 |    open(my $in,'-|','gunzip','-c',$infile) || die "Could not open $infile: $!\n";
43 |    while(my $line = <$in>){
44 |       chomp $line;
45 |       my ($chr, $start, $end) = split(/\t/, $line);
46 |       $coverage += $end - $start;
47 |    }
48 |    # closing a pipe reports a non-zero exit status of the child process
49 |    close($in) || die "Could not read $infile: gunzip exited with status $?\n";
50 |    return($coverage);
51 | }
52 | 
53 | exit(0);
54 | 


--------------------------------------------------------------------------------
/gencode/README.md:
--------------------------------------------------------------------------------
 1 | ## README
 2 | 
 3 | Set up BEDTools (version 2.29.2 is used below).
 4 | 
 5 | ```bash
 6 | wget https://github.com/arq5x/bedtools2/releases/download/v2.29.2/bedtools-2.29.2.tar.gz
 7 | tar -xzf bedtools-2.29.2.tar.gz
 8 | cd bedtools2
 9 | make all
10 | 
11 | cd ~/bin/
12 | ln -s ~/src/bedtools2/bin/bedtools
13 | 
14 | bedtools --version
15 | # bedtools v2.29.2
16 | ```
17 | 
18 | Download GENCODE GTF files.
19 | 
20 | ```bash
21 | parallel --verbose wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_{}/gencode.v{}.annotation.gtf.gz ::: {20..35}
22 | ```
23 | 
24 | Calculate stats.
25 | 
26 | ```bash
27 | parallel --verbose "../run.pl {} ../chrom_info/hg38.genome > {.}.stats" ::: *.gtf.gz
28 | ```
29 | 
30 | Plot using `plot_stats.Rmd`.
31 | 
32 | <img src="https://github.com/davetang/defining_genomic_regions/blob/main/gencode/genomic_region_proportion.png" width="600" />
33 | <img src="https://github.com/davetang/defining_genomic_regions/blob/main/gencode/genomic_region_length.png" width="600" />
34 | <img src="https://github.com/davetang/defining_genomic_regions/blob/main/gencode/gene_type.png" width="600" />
35 | 
36 | 


--------------------------------------------------------------------------------
/guess/check_size.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | #
 3 | # Warns about BED features that extend past the end of their chromosome and
 4 | # dies when a feature is on a chromosome missing from the sizes file.
 5 | #
 6 | 
 7 | use strict;
 8 | use warnings;
 9 | use Getopt::Std;
10 | 
11 | my %opts = ();
12 | # -h is a plain flag; -s and -b each take a value
13 | getopts('hs:b:', \%opts);
14 | 
15 | if ($opts{'h'} ||
16 |     !exists $opts{'s'} ||
17 |     !exists $opts{'b'}
18 | ){
19 |    usage();
20 | }
21 | 
22 | my $genome = $opts{'s'};
23 | my %sizes = store_size($genome);
24 | my $bed = $opts{'b'};
25 | 
26 | my $fh = open_file($bed);
27 | while(<$fh>){
28 |    chomp;
29 |    next if /^browser/ || /^track/;
30 |    my ($chr, $start, $end, @rest) = split(/\t/);
31 |    if (exists $sizes{$chr}){
32 |       my $size = $sizes{$chr};
33 |       # BED ends are exclusive, so an end equal to the chromosome size is
34 |       # still inside the chromosome
35 |       if ($end > $size){
36 |          warn("$chr:$start-$end greater than $size\n");
37 |       }
38 |    } else {
39 |       die("$chr does not exist in $genome\n");
40 |    }
41 | }
42 | close($fh);
43 | 
44 | # Prints the usage message to STDERR and exits.
45 | sub usage {
46 | print STDERR <<EOF;
47 | Usage: $0 -s hg19.genome.gz -b my.bed
48 | 
49 | Where:   -s         chromosome sizes
50 |          -b         BED file
51 |          -h         this helpful usage message
52 | 
53 | EOF
54 | exit();
55 | }
56 | 
57 | # Opens a plain file, or a gzipped one when the name ends in .gz, and returns
58 | # the read filehandle. Dies if the file cannot be opened.
59 | sub open_file {
60 |    my ($infile) = @_;
61 |    my $in;
62 |    if ($infile =~ /\.gz$/){
63 |       # list form: the file name is passed to gunzip without going through a shell
64 |       open($in, '-|', 'gunzip', '-c', $infile) or die "Could not open $infile $!\n";
65 |    } else {
66 |       open($in, '<', $infile) or die "Could not open $infile $!\n";
67 |    }
68 |    return($in);
69 | }
70 | 
71 | # Reads a tab-delimited "chromosome, size" file and returns it as a hash of
72 | # chromosome name => size. Lines beginning with "chrom" are skipped.
73 | sub store_size {
74 |    my ($infile) = @_;
75 |    my $in = open_file($infile);
76 |    my %sizes = ();
77 |    while(<$in>){
78 |       chomp;
79 |       next if /^chrom/;
80 |       my ($chrom, $size) = split(/\t/);
81 |       $sizes{$chrom} = $size;
82 |    }
83 |    close($in);
84 |    return(%sizes);
85 | }
86 | 
87 | 


--------------------------------------------------------------------------------
/check_utr.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | #
 3 | # Counts the UTR entries that follow each transcript entry in a GTF file and
 4 | # prints "transcript_id<TAB>count" for every transcript.
 5 | #
 6 | 
 7 | use strict;
 8 | use warnings;
 9 | 
10 | my $usage = "Usage: $0 <infile.annotation.gtf.gz>\n";
11 | my $infile = shift or die $usage;
12 | 
13 | my $in;
14 | if ($infile =~ /\.gz$/){
15 |    # list form: the file name is passed to gunzip without going through a shell
16 |    open($in,'-|','gunzip','-c',$infile) || die "Could not open $infile: $!\n";
17 | } else {
18 |    open($in,'<',$infile) || die "Could not open $infile: $!\n";
19 | }
20 | 
21 | # transcript ID => number of UTR entries seen for it
22 | my %transcript = ();
23 | # ID of the most recent transcript entry; UTR entries are credited to it
24 | my $current_transcript = '';
25 | 
26 | while(<$in>){
27 |    chomp;
28 |    next if (/^#/);
29 |    #chr11   HAVANA  transcript      65265233        65273940        .       +       .       gene_id "ENSG00000251562.3"; transcript_id "ENST00000534336.1"; gene_type "processed_transcript"; gene_status "KNOWN"; gene_name "MALAT1"; transcript_type "non_coding"; transcript_status "KNOWN"; transcript_name "MALAT1-001"; level 2; havana_gene "OTTHUMG00000166322.1"; havana_transcript "OTTHUMT00000389143.1";
30 |    my ($chr,$source,$type,$start,$end,$score,$strand,$phase,$annotation) = split(/\t/);
31 | 
32 |    if ($type eq 'transcript'){
33 |       # start from 'none' on every transcript line so that a missing
34 |       # transcript_id is detected instead of reusing the previous ID
35 |       my $transcript_id = 'none';
36 |       foreach my $blah (split(/;\s/,$annotation)){
37 |          my ($key,$name) = split(/\s+/,$blah);
38 |          if ($key eq 'transcript_id'){
39 |             $transcript_id = $name;
40 |             $transcript_id =~ s/"//g;
41 |          }
42 |       }
43 |       if ($transcript_id eq 'none'){
44 |          die "No name for entry $.\n";
45 |       }
46 |       $current_transcript = $transcript_id;
47 |       $transcript{$current_transcript} = 0;
48 |    }
49 | 
50 |    if ($type eq 'UTR'){
51 |       $transcript{$current_transcript}++;
52 |    }
53 | }
54 | close($in);
55 | 
56 | foreach my $transcript (keys %transcript){
57 |    print "$transcript\t$transcript{$transcript}\n";
58 | }
59 | 
60 | exit(0);
61 | 


--------------------------------------------------------------------------------
/script/gtf_to_bed.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | #
 3 | # Converts a GTF file into a BED file; the script will use "gene_id" as the BED name, if it exists in the attributes
 4 | #
 5 | 
 6 | use warnings;
 7 | use strict;
 8 | use Getopt::Std;
 9 | 
10 | my %opts = ();
11 | # -h is a plain flag; every other option takes a value
12 | getopts('i:f:hu:d:', \%opts);
13 | 
14 | if ($opts{'h'} ||
15 |     !exists $opts{'f'} ||
16 |     !exists $opts{'i'}
17 | ){
18 |    usage();
19 | }
20 | 
21 | my $gtf = $opts{'i'};
22 | my $my_feature = $opts{'f'};
23 | # padding in bases: -u sets the upstream value, -d the downstream value
24 | my $up   = exists $opts{'u'} ? $opts{'u'} : 0;
25 | my $down = exists $opts{'d'} ? $opts{'d'} : 0;
26 | 
27 | my $in;
28 | if ($gtf =~ /\.gz$/){
29 |    # list form: the file name is passed to gunzip without going through a shell
30 |    open($in, '-|', 'gunzip', '-c', $gtf) || die "Could not open $gtf: $!\n";
31 | } else {
32 |    open($in, '<', $gtf) || die "Could not open $gtf: $!\n";
33 | }
34 | 
35 | while(<$in>){
36 |    chomp;
37 |    next if /^#/;
38 |    my ($sequence, $source, $feature, $start, $end, $score, $strand, $phase, $attributes) = split(/\t/);
39 |    next unless $feature eq $my_feature;
40 | 
41 |    # "." is used as the name when no gene_id attribute is found
42 |    my $name = '.';
43 |    if ($attributes =~ /gene_id\s"([a-zA-Z0-9._]+)";/){
44 |       $name = $1;
45 |    }
46 | 
47 |    # BED is 0-based
48 |    $start -= 1;
49 | 
50 |    # upstream padding lowers the start on the + strand and raises the end on
51 |    # the - strand; features with any other strand value are not padded
52 |    if ($strand eq '+'){
53 |       $start -= $up;
54 |       $end   += $down;
55 |    } elsif ($strand eq '-'){
56 |       $start -= $down;
57 |       $end   += $up;
58 |    }
59 |    # padding must not push the start below the first base
60 |    $start = 0 if $start < 0;
61 | 
62 |    print join("\t", $sequence, $start, $end, $name, $score, $strand), "\n";
63 | 
64 | }
65 | close($in);
66 | 
67 | # Prints the usage message to STDERR and exits.
68 | sub usage {
69 | print STDERR <<EOF;
70 | Usage: $0 -i FILE -f STRING -u INT -d INT
71 | 
72 | Where:   -i         GTF file
73 |          -f         Feature to keep, e.g. gene, transcript, CDS, exon
74 |          -u         Add upstream padding, default = 0
75 |          -d         Add downstream padding, default = 0
76 |          -h         this helpful usage message
77 | 
78 | EOF
79 | exit();
80 | }
81 | 
82 | __END__
83 | 
84 | 


--------------------------------------------------------------------------------
/print_utr.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | #
 3 | # Prints every UTR entry of a GTF file as a tab-delimited line labelled
 4 | # 5_UTR or 3_UTR, depending on which end of the preceding transcript entry
 5 | # the UTR lies closer to.
 6 | #
 7 | 
 8 | use strict;
 9 | use warnings;
10 | 
11 | my $usage = "Usage: $0 <infile.annotation.gtf.gz>\n";
12 | my $infile = shift or die $usage;
13 | 
14 | my $in;
15 | if ($infile =~ /\.gz$/){
16 |    # list form: the file name is passed to gunzip without going through a shell
17 |    open($in,'-|','gunzip','-c',$infile) || die "Could not open $infile: $!\n";
18 | } else {
19 |    open($in,'<',$infile) || die "Could not open $infile: $!\n";
20 | }
21 | 
22 | # ID and coordinates of the most recent transcript entry; the UTR entries
23 | # that follow are compared against them
24 | my $current_transcript = '';
25 | my $transcript_start = 0;
26 | my $transcript_end = 0;
27 | 
28 | while(<$in>){
29 |    chomp;
30 |    next if (/^#/);
31 |    #chr11   HAVANA  transcript      65265233        65273940        .       +       .       gene_id "ENSG00000251562.3"; transcript_id "ENST00000534336.1"; gene_type "processed_transcript"; gene_status "KNOWN"; gene_name "MALAT1"; transcript_type "non_coding"; transcript_status "KNOWN"; transcript_name "MALAT1-001"; level 2; havana_gene "OTTHUMG00000166322.1"; havana_transcript "OTTHUMT00000389143.1";
32 |    my ($chr,$source,$type,$start,$end,$score,$strand,$phase,$annotation) = split(/\t/);
33 | 
34 |    if ($type eq 'transcript'){
35 |       # start from 'none' on every transcript line so that a missing
36 |       # transcript_id is detected instead of reusing the previous ID
37 |       my $transcript_id = 'none';
38 |       foreach my $blah (split(/;\s/,$annotation)){
39 |          my ($key,$name) = split(/\s+/,$blah);
40 |          if ($key eq 'transcript_id'){
41 |             $transcript_id = $name;
42 |             $transcript_id =~ s/"//g;
43 |          }
44 |       }
45 |       if ($transcript_id eq 'none'){
46 |          die "No name for entry $.\n";
47 |       }
48 |       $current_transcript = $transcript_id;
49 |       $transcript_start = $start;
50 |       $transcript_end = $end;
51 |    }
52 | 
53 |    if ($type eq 'UTR'){
54 |       my $region = '';
55 |       if ($strand eq '+'){
56 |          # + strand: the transcript start is the 5' end
57 |          my $dis_to_start = abs($start - $transcript_start);
58 |          my $dis_to_end = abs($start - $transcript_end);
59 |          $region = $dis_to_start < $dis_to_end ? '5_UTR' : '3_UTR';
60 |       } else {
61 |          # any other strand value: the transcript end is treated as the 5' end
62 |          my $dis_to_start = abs($end - $transcript_end);
63 |          my $dis_to_end = abs($end - $transcript_start);
64 |          $region = $dis_to_start < $dis_to_end ? '5_UTR' : '3_UTR';
65 |       }
66 |       # NOTE(review): $start is printed exactly as read from the GTF (no -1
67 |       # shift) -- confirm whether 0-based BED starts are expected downstream
68 |       print join ("\t", $chr, $start, $end, $region, $current_transcript, $strand),"\n";
69 |    }
70 | }
71 | close($in);
72 | 
73 | exit(0);
74 | 


--------------------------------------------------------------------------------
/run.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | #
 3 | # Builds exon, intron, intergenic, UTR and promoter files from one GENCODE
 4 | # annotation, cloning and compiling bedtools2 first when it is missing.
 5 | # Each step is skipped when its output file already exists.
 6 | #
 7 | 
 8 | echo Checking for bedtools
 9 | 
10 | if [ ! -d bedtools2 ]
11 | then
12 |    git clone https://github.com/arq5x/bedtools2.git
13 |    # stop here if the clone failed, rather than running make in this directory
14 |    cd bedtools2 || exit 1
15 |    make clean; make all
16 |    cd ..
17 | fi
18 | 
19 | echo Downloading GENCODE annotations
20 | 
21 | # GENCODE release; every file name below is derived from it
22 | v=19
23 | gtf=gencode.v$v.annotation.gtf.gz
24 | 
25 | if [ ! -f $gtf ]
26 | then
27 |    wget ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_$v/$gtf
28 | fi
29 | 
30 | echo Creating exonic regions
31 | 
32 | if [ ! -f gencode_v${v}_exon_merged.bed.gz ]
33 | then
34 |    gunzip -c $gtf |
35 |    awk 'BEGIN{OFS="\t";} $3=="exon" {print $1,$4-1,$5}' |
36 |    bedtools2/bin/sortBed |
37 |    bedtools2/bin/mergeBed -i - | gzip > gencode_v${v}_exon_merged.bed.gz
38 | fi
39 | 
40 | echo Creating intronic regions
41 | 
42 | # introns = gene intervals minus the merged exons
43 | if [ ! -f gencode_v${v}_intron.bed.gz ]
44 | then
45 |    gunzip -c $gtf |
46 |    awk 'BEGIN{OFS="\t";} $3=="gene" {print $1,$4-1,$5}' |
47 |    bedtools2/bin/sortBed |
48 |    bedtools2/bin/subtractBed -a stdin -b gencode_v${v}_exon_merged.bed.gz |
49 |    gzip > gencode_v${v}_intron.bed.gz
50 | fi
51 | 
52 | # echo Downloading hg19 coordinates
53 | # 
54 | # if [ ! -f hg19.genome ]
55 | # then
56 | #    mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e \
57 | #    "select chrom, size from hg19.chromInfo"  > hg19.genome
58 | # fi
59 | 
60 | echo Creating intergenic regions
61 | 
62 | # intergenic = complement of the gene intervals; needs hg19.genome in this
63 | # directory
64 | if [ ! -f gencode_v${v}_intergenic.bed.gz ]
65 | then 
66 |    gunzip -c $gtf |
67 |    awk 'BEGIN{OFS="\t";} $3=="gene" {print $1,$4-1,$5}' |
68 |    sort -k1V -k2,2n |
69 |    bedtools2/bin/complementBed -i stdin -g hg19.genome |
70 |    gzip > gencode_v${v}_intergenic.bed.gz
71 | fi
72 | 
73 | echo Counting UTRs
74 | 
75 | if [ ! -f transcript_utr_number.out.gz ]
76 | then
77 |    perl check_utr.pl $gtf | gzip > transcript_utr_number.out.gz
78 | fi
79 | 
80 | echo Creating UTRs
81 | 
82 | if [ ! -f transcript_utr.bed.gz ]
83 | then
84 |    perl print_utr.pl $gtf | gzip > transcript_utr.bed.gz
85 | fi
86 | 
87 | echo Creating promoter region
88 | 
89 | if [ ! -f promoter.bed.gz ]
90 | then
91 |    perl promoter.pl $gtf 200 | gzip > promoter.bed.gz
92 | fi
93 | 
94 | 
95 | echo Done
96 | 


--------------------------------------------------------------------------------
/script/gtf_to_bed_single.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | #
 3 | # Output BED file with features of interest for genes with only one transcript model
 4 | #
 5 | 
 6 | use warnings;
 7 | use strict;
 8 | use Getopt::Std;
 9 | 
10 | my %opts = ();
11 | # -h is a plain flag; -i and -f each take a value
12 | getopts('i:f:h', \%opts);
13 | 
14 | if ($opts{'h'} ||
15 |     !exists $opts{'i'} ||
16 |     !exists $opts{'f'}
17 | ){
18 |    usage();
19 | }
20 | 
21 | my $my_feature = $opts{'f'};
22 | # gene_id => { COUNT => number of transcript entries }
23 | my %gene_anno = ();
24 | 
25 | # first read through to tally number of transcripts per gene
26 | my $gtf_file = $opts{'i'};
27 | my $gtf = open_file($gtf_file);
28 | while(<$gtf>){
29 |    chomp;
30 |    next if /^#/;
31 |    my ($sequence, $source, $feature, $start, $end, $score, $strand, $phase, $attributes) = split(/\t/);
32 |    next unless $feature eq "transcript";
33 | 
34 |    my $gene_id = '.';
35 |    if ($attributes =~ /gene_id\s"([a-zA-Z0-9._]+)";/){
36 |       $gene_id = $1;
37 |    }
38 | 
39 |    # ++ on a missing entry starts the count at 1
40 |    $gene_anno{$gene_id}->{'COUNT'}++;
41 | 
42 | }
43 | close($gtf);
44 | 
45 | # second read through to output features of interest for genes with only one transcript model
46 | my $gtf2 = open_file($gtf_file);
47 | while(<$gtf2>){
48 |    chomp;
49 |    next if /^#/;
50 |    my ($sequence, $source, $feature, $start, $end, $score, $strand, $phase, $attributes) = split(/\t/);
51 |    next unless $feature eq $my_feature;
52 | 
53 |    my $gene_id = '.';
54 |    if ($attributes =~ /gene_id\s"([a-zA-Z0-9._]+)";/){
55 |       $gene_id = $1;
56 |    }
57 | 
58 |    # a gene without any transcript entry has no count; treat it as zero
59 |    # instead of comparing an undefined value
60 |    my $count = exists $gene_anno{$gene_id} ? $gene_anno{$gene_id}->{'COUNT'} : 0;
61 |    if ($count == 1){
62 |       # BED is 0-based
63 |       $start -= 1;
64 |       print join("\t", $sequence, $start, $end, $gene_id, $score, $strand), "\n";
65 |    }
66 | 
67 | }
68 | close($gtf2);
69 | 
70 | # Opens a plain file, or a gzipped one when the name ends in .gz, and returns
71 | # the read filehandle. Dies if the file cannot be opened.
72 | sub open_file {
73 |    my ($infile) = @_;
74 |    my $fh;
75 |    if ($infile =~ /\.gz$/){
76 |       # list form: the file name is passed to gunzip without going through a shell
77 |       open($fh, '-|', 'gunzip', '-c', $infile) || die "Could not open $infile $!\n";
78 |    } else {
79 |       open($fh, '<', $infile) || die "Could not open $infile $!\n";
80 |    }
81 |    return($fh);
82 | }
83 | 
84 | 
85 | # Prints the usage message to STDERR and exits.
86 | sub usage {
87 | print STDERR <<EOF;
88 | Usage: $0 -i FILE -f STRING
89 | 
90 | Where:   -i         GTF file
91 |          -f         Feature to output, e.g. gene, transcript, CDS, exon
92 |          -h         this helpful usage message
93 | 
94 | EOF
95 | exit();
96 | }
97 | 
98 | __END__
99 | 


--------------------------------------------------------------------------------
/promoter.pl:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | #
 3 | # For every transcript entry of a GTF file, prints a window of <padding>
 4 | # bases either side of the transcript start (+ strand) or transcript end
 5 | # (any other strand) as a six-column tab-delimited line. Chromosome sizes
 6 | # are read from hg19.genome in the current directory.
 7 | #
 8 | 
 9 | use strict;
10 | use warnings;
11 | 
12 | my $usage = "Usage: $0 <infile.annotation.gtf> <padding>\n";
13 | my $infile = shift or die $usage;
14 | my $span = shift;
15 | # test definedness so that a padding of 0 is accepted
16 | die $usage unless defined $span;
17 | 
18 | if ($span !~ /^\d+$/){
19 |    die "Please enter a numeric value for the padding\n";
20 | }
21 | 
22 | my $hg19 = 'hg19.genome';
23 | # chromosome name => size
24 | my %hg19 = ();
25 | 
26 | open(my $genome_fh,'<',$hg19) || die "Could not open $hg19: $!\n";
27 | while(<$genome_fh>){
28 |    chomp;
29 |    #chr9_gl000201_random    36148
30 |    my ($chr, $end) = split(/\t/);
31 |    $hg19{$chr} = $end;
32 | }
33 | close($genome_fh);
34 | 
35 | my $in;
36 | if ($infile =~ /\.gz$/){
37 |    # list form: the file name is passed to gunzip without going through a shell
38 |    open($in,'-|','gunzip','-c',$infile) || die "Could not open $infile: $!\n";
39 | } else {
40 |    open($in,'<',$infile) || die "Could not open $infile: $!\n";
41 | }
42 | 
43 | while(<$in>){
44 |    chomp;
45 |    next if (/^#/);
46 |    #chr11   HAVANA  transcript      65265233        65273940        .       +       .       gene_id "ENSG00000251562.3"; transcript_id "ENST00000534336.1"; gene_type "processed_transcript"; gene_status "KNOWN"; gene_name "MALAT1"; transcript_type "non_coding"; transcript_status "KNOWN"; transcript_name "MALAT1-001"; level 2; havana_gene "OTTHUMG00000166322.1"; havana_transcript "OTTHUMT00000389143.1";
47 |    my ($chr,$source,$type,$start,$end,$score,$strand,$phase,$annotation) = split(/\t/);
48 |    next unless $type eq 'transcript';
49 |    my $transcript_id = 'none';
50 |    foreach my $blah (split(/;\s/,$annotation)){
51 |       my ($key,$name) = split(/\s+/,$blah);
52 |       if ($key eq 'transcript_id'){
53 |          $transcript_id = $name;
54 |          $transcript_id =~ s/"//g;
55 |       }
56 |    }
57 |    if ($transcript_id eq 'none'){
58 |       die "No name for entry $.\n";
59 |    }
60 |    # the window is centred on the start for + strand entries and on the end
61 |    # for everything else
62 |    # NOTE(review): GTF coordinates are printed without a -1 shift -- confirm
63 |    # whether 0-based BED starts are expected downstream
64 |    my $anchor = $strand eq '+' ? $start : $end;
65 |    my $promoter_start = $anchor - $span;
66 |    my $promoter_end = $anchor + $span;
67 |    if ($promoter_start < 0){
68 |       warn "Adjusted promoter start to 0\n";
69 |       $promoter_start = 0;
70 |    }
71 |    # checked independently of the start: both ends may need adjusting
72 |    if (!exists $hg19{$chr}){
73 |       # no size to compare against; leave the end as computed
74 |       warn "$chr is not in $hg19; promoter end not checked\n";
75 |    } elsif ($promoter_end > $hg19{$chr}){
76 |       warn "Adjusted promoter end to $hg19{$chr}\n";
77 |       $promoter_end = $hg19{$chr};
78 |    }
79 |    print join("\t",$chr,$promoter_start,$promoter_end,$transcript_id,0,$strand),"\n";
80 | }
81 | close($in);
82 | 
83 | exit(0);
84 | 


--------------------------------------------------------------------------------
/liftover/README.md:
--------------------------------------------------------------------------------
 1 | ## README
 2 | 
 3 | Visit the [UCSC Genome Browser Store](https://genome-store.ucsc.edu/products/) and download liftOver after creating an account. It is free for personal and non-profit academic research use.
 4 | 
 5 | Download a chain file.
 6 | 
 7 | ```bash
 8 | wget https://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz
 9 | ```
10 | 
11 | Check out the chain file.
12 | 
13 | ```bash
14 | zcat hg19ToHg38.over.chain.gz | head -6
15 | chain 20851231461 chr1 249250621 + 10000 249240621 chr1 248956422 + 10000 248946422 2
16 | 167376  50041   80290
17 | 40302   253649  288020
18 | 1044699 1       2
19 | 3716    0       3
20 | 1134    4       18
21 | ```
22 | 
23 | In the [chain format](https://genome.ucsc.edu/goldenPath/help/chain.html), each chain starts with a header line that begins with the keyword `chain` followed by 12 required attribute values, and each chain ends with a blank line. The header attributes are:
24 | 
25 | * `score` -- chain score
26 | * `tName` -- chromosome (reference sequence)
27 | * `tSize` -- chromosome size (reference sequence)
28 | * `tStrand` -- strand (reference sequence)
29 | * `tStart` -- alignment start position (reference sequence)
30 | * `tEnd` -- alignment end position (reference sequence)
31 | * `qName` -- chromosome (query sequence)
32 | * `qSize` -- chromosome size (query sequence)
33 | * `qStrand` -- strand (query sequence)
34 | * `qStart` -- alignment start position (query sequence)
35 | * `qEnd` -- alignment end position (query sequence)
36 | * `id` -- chain ID
37 | 
38 | The alignment data lines contain three required attribute values:
39 | 
40 | * `size` -- the size of the ungapped alignment
41 | * `dt` -- the difference between the end of this block and the beginning of the next block (reference sequence)
42 | * `dq` -- the difference between the end of this block and the beginning of the next block (query sequence)
43 | 
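44 | The first alignment block starts at position 10000 on chr1 in both assemblies and is 167376 bp long, so chr1:10000-177376 should lift over to exactly the same coordinates on hg38.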
45 | 
46 | The `liftOver` tool requires four positional arguments: oldFile map.chain newFile unMapped
47 | 
48 | ```bash
49 | perl -le 'print join("\t", "chr1", 10000, 177376)' > chr1_10000_177376.txt
50 | ./liftOver chr1_10000_177376.txt hg19ToHg38.over.chain.gz chr1_10000_177376_hg38.txt chr1_10000_177376_unmapped.txt
51 | cat chr1_10000_177376_hg38.txt
52 | # chr1    10000   177376
53 | ```
54 | 
55 | If we create a BED file with a region that doesn't fully lift over, the region in the output BED file will be trimmed. (The alignment block ends at chr1:10000-177376, so the 1 bp overhang will be trimmed.)
56 | 
57 | ```bash
58 | perl -le 'print join("\t", "chr1", 10000, 177377)' > chr1_10000_177377.txt
59 | ./liftOver chr1_10000_177377.txt hg19ToHg38.over.chain.gz chr1_10000_177377_hg38.txt chr1_10000_177377_unmapped.txt
60 | cat chr1_10000_177377_hg38.txt
61 | # chr1    10000   177376
62 | ```
63 | 
64 | 


--------------------------------------------------------------------------------
/script/merge_by_id.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env perl
  2 | 
  3 | use strict;
  4 | use warnings;
  5 | use Getopt::Std;
  6 | use File::Path;
  7 | 
  8 | my %opts = ();
  9 | # -h is a plain flag; declaring it as 'h:' made getopts expect an argument
 10 | getopts('hf:t:', \%opts);
 11 | 
 12 | if ($opts{'h'} || !exists $opts{'f'}){
 13 |    usage();
 14 | }
 15 | 
 16 | # bedtools does the actual merging, so bail out early if it is missing
 17 | chomp(my $bedtools = `command -v bedtools`);
 18 | if ($bedtools eq ''){
 19 |    die "Could not find bedtools\n";
 20 | }
 21 | 
 22 | my $infile = $opts{'f'};
 23 | my $fork_process = exists $opts{'t'} ? $opts{'t'} : 1;
 24 | # a thread count of zero would never drain the command queue below
 25 | if ($fork_process !~ /^[1-9]\d*$/){
 26 |    die "Number of threads (-t) must be a positive integer\n";
 27 | }
 28 | warn("Using $fork_process threads\n");
 29 | 
 30 | my $fh;
 31 | if ($infile =~ /\.gz$/){
 32 |    open($fh, '-|', "gunzip -c $infile") || die "Could not open $infile: $!\n";
 33 | } else {
 34 |    open($fh, '<', $infile) || die "Could not open $infile: $!\n";
 35 | }
 36 | 
 37 | # store all IDs
 38 | my %all_id = ();
 39 | while(<$fh>){
 40 |    chomp;
 41 |    my ($chr, $start, $end, $id, @rest) = split(/\t/);
 42 |    # blank or truncated lines have no fourth column and cannot be grouped
 43 |    next unless defined $id && $id ne '';
 44 |    $all_id{$id} = 1;
 45 | }
 46 | close($fh);
 47 | 
 48 | my $tmp_dir = time() . "_tmp";
 49 | mkdir($tmp_dir) || die "Could not create $tmp_dir: $!\n";
 50 | 
 51 | # build one merge pipeline per ID
 52 | my $reader = $infile =~ /\.gz$/ ? "gunzip -c $infile" : "cat $infile";
 53 | my @command = ();
 54 | foreach my $id (keys %all_id){
 55 |    # compare the fourth column exactly (as a string); grep would also
 56 |    # select lines whose ID merely contains $id as a substring
 57 |    my $command = "$reader | awk -F'\\t' -v id='$id' '(\$4 \"\") == (id \"\")' | sort -k1,1V -k2,2n | $bedtools merge -i - > $tmp_dir/$id.bed";
 58 |    push(@command, $command);
 59 | }
 60 | 
 61 | # run the pipelines in batches of at most $fork_process children
 62 | while(scalar(@command) > 0){
 63 |    my @child = ();
 64 |    for (1 .. $fork_process){
 65 |       last if scalar(@command) == 0;
 66 |       my $command = pop(@command);
 67 |       my $pid = fork();
 68 |       # fork returns undef on failure, which must be tested before == 0
 69 |       if (!defined $pid){
 70 |          die "Couldn't fork: $!\n";
 71 |       } elsif ($pid == 0) {
 72 |          # child: run one pipeline and report failure via the exit status
 73 |          exit(system($command) == 0 ? 0 : 1);
 74 |       }
 75 |       # parent
 76 |       push(@child, $pid);
 77 |    }
 78 |    foreach my $pid (@child) {
 79 |       waitpid($pid, 0);
 80 |       warn("A merge command failed (child $pid)\n") if $? != 0;
 81 |    }
 82 | }
 83 | 
 84 | # open merged files
 85 | opendir(DIR, $tmp_dir) || die "Could not open $tmp_dir: $!\n";
 86 | while(my $bed = readdir(DIR)){
 87 |    next unless $bed =~ /\.bed$/;
 88 |    my $id = $bed;
 89 |    $id =~ s/\.bed$//;
 90 |    open(my $fh, '<', "$tmp_dir/$bed") || die "Could not open $tmp_dir/$bed: $!\n";
 91 |    while(<$fh>){
 92 |       chomp;
 93 |       print "$_\t$id\n";
 94 |    }
 95 |    close($fh);
 96 | }
 97 | closedir(DIR);
 98 | 
 99 | rmtree($tmp_dir) || die "Could not remove $tmp_dir: $!\n";
100 | warn("Done\n");
101 | exit(0);
102 | 
103 | sub usage { # show the help text on STDERR, then terminate with exit status 1
104 | print {*STDERR} <<"EOF";
105 | Usage: $0 -f file -t 16
106 | 
107 | Where:   -f         BED file with IDs in the fourth column
108 |          -t         threads to use (default 1)
109 |          -h         this helpful usage message
110 | 
111 | EOF
112 | exit 1;
113 | }
114 | 
115 | 


--------------------------------------------------------------------------------
/run.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env perl
  2 | #
  3 | # Use script to create exonic, intronic, and intergenic regions from a GTF file
  4 | # You must provide a "chrom.size" file, which is simply a flat file that specifies the size of chromosomes
  5 | # For an example, take a look at hg19.genome, which was created using the UCSC Genome Browser's MySQL database by running:
  6 | #
  7 | #    mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e \
  8 | #    "select chrom, size from hg19.chromInfo" > hg19.genome
  9 | # 
 10 | 
 11 | use strict;
 12 | use warnings;
 13 | use File::Which;
 14 | use File::Basename;
 15 | 
 16 | my $bedtools = which('bedtools');
 17 | 
 18 | if (!$bedtools){
 19 |    print STDERR "bedtools was not found in your path:\n\n$ENV{PATH}\n\nPlease install bedtools and add it to your path:\n\n";
 20 |    print STDERR "git clone https://github.com/arq5x/bedtools2.git\ncd bedtools2\nmake clean\nmake all\n\n";
 21 |    exit(1);
 22 | }
 23 | 
 24 | my $usage = "Usage: $0 <infile.gtf.gz> <chrom.size>\n";
 25 | my $infile = shift or die $usage;
 26 | my $genome = shift or die $usage;
 27 | 
 28 | if ($infile !~ /\.gtf\.gz$/){
 29 |    print STDERR "Please provide a gzipped GTF file\n";
 30 |    exit(1);
 31 | }
 32 | 
 33 | my $basename = basename($infile,  ".gtf.gz");
 34 | my $exon_file       = "$basename.exon.merged.bed.gz";
 35 | my $intron_file     = "$basename.intron.bed.gz";
 36 | my $intergenic_file = "$basename.intergenic.bed.gz";
 37 | # a step is skipped when its output exists, so a failed command must not leave its output behind (only the status of the last pipeline stage is visible to system)
 38 | if (!-e $exon_file){
 39 |    warn "Creating exonic regions\n";
 40 |    my $command = "gunzip -c $infile | awk 'BEGIN{OFS=\"\\t\";} \$3==\"exon\" {print \$1,\$4-1,\$5}' | bedtools sort | bedtools merge -i - | gzip > $exon_file";
 41 |    system($command) == 0 or do { unlink($exon_file); die "Could not create $exon_file\n" };
 42 | } else {
 43 |    warn "$exon_file already exists; skipping exon step\n";
 44 | }
 45 | 
 46 | if (!-e $intron_file){
 47 |    warn "Creating intronic regions\n";
 48 |    my $command = "gunzip -c $infile | awk 'BEGIN{OFS=\"\\t\";} \$3==\"gene\" {print \$1,\$4-1,\$5}' | bedtools sort | bedtools subtract -a stdin -b $exon_file | gzip > $intron_file";
 49 |    system($command) == 0 or do { unlink($intron_file); die "Could not create $intron_file\n" };
 50 | } else {
 51 |    warn "$intron_file already exists; skipping intron step\n";
 52 | }
 53 | 
 54 | if (!-e $intergenic_file){
 55 |    warn "Creating intergenic regions\n";
 56 |    my $command = "gunzip -c $infile | awk 'BEGIN{OFS=\"\\t\";} \$3==\"gene\" {print \$1,\$4-1,\$5}' | bedtools sort -g $genome | bedtools complement -i stdin -g $genome | gzip > $intergenic_file";
 57 |    system($command) == 0 or do { unlink($intergenic_file); die "Could not create $intergenic_file\n" };
 58 | } else {
 59 |    warn "$intergenic_file already exists; skipping intergenic step\n";
 60 | }
 61 | 
 62 | if (-e $exon_file && -e $intron_file && -e $intergenic_file){
 63 |    my ($exon_average, $exon_coverage) = stats($exon_file);
 64 |    my ($intergenic_average, $intergenic_coverage) = stats($intergenic_file);
 65 |    my ($intron_average, $intron_coverage) = stats($intron_file);
 66 | 
 67 |    my $total = $exon_coverage + $intergenic_coverage + $intron_coverage;
 68 |    die "No regions found in $exon_file, $intron_file or $intergenic_file\n" if $total == 0;
 69 |    printf "exon_coverage: %.2f\n", $exon_coverage*100/$total;
 70 |    printf "intron_coverage: %.2f\n", $intron_coverage*100/$total;
 71 |    printf "intergenic_coverage: %.2f\n", $intergenic_coverage*100/$total;
 72 |    print "exon_length: $exon_average\n";
 73 |    print "intron_length: $intron_average\n";
 74 |    print "intergenic_length: $intergenic_average\n";
 75 | }
 76 | 
 77 | sub stats {
 78 |    # Summarise a gzipped BED file; returns (average region length, total bp covered)
 79 |    my ($infile) = @_;
 80 |    my $coverage = 0;
 81 |    my $total = 0;
 82 | 
 83 |    # lexical handle so the pipe cannot clash with other users of the global IN
 84 |    open(my $in, '-|', "gunzip -c $infile") || die "Could not open $infile: $!\n";
 85 |    while(<$in>){
 86 |       chomp;
 87 |       next if /^\s*$/;
 88 |       ++$total;
 89 |       my ($chr, $start, $end) = split(/\t/);
 90 |       $coverage += $end - $start;
 91 |    }
 92 |    close($in);
 93 | 
 94 |    # an empty file has no regions; report 0.00 instead of dividing by zero
 95 |    my $average = sprintf("%.2f", $total > 0 ? $coverage / $total : 0);
 96 |    return($average, $coverage);
 97 | }
 98 | 
 99 | exit(0);
100 | 
101 | 


--------------------------------------------------------------------------------
/script/gtf_add_exon.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env perl
  2 | #
  3 | # Some GTF files do not contain exon features and thus do not work well with some tools. This script adds exon features based on the CDS features.
  4 | #
  5 | # Furthermore, some GTF files set all transcript_id's to "unknown_transcript_1", which can also create problems.
  6 | #
  7 | # This script can replace transcript_id's with the gene_id. However, this creates a problem when there are more than one transcript per gene.
  8 | # Thus only use the -t option when there is only one transcript model per gene.
  9 | #
 10 | # Lastly, the CDS, start_codon, and stop_codon lines will not be outputted by default. Use the -c option to output them.
 11 | #
 12 | # For your information, below are the definitions of CDS and exon:
 13 | #
 14 | # A CDS is a contiguous sequence which begins with, and includes, the start codon but does not include the stop codon.
 15 | # An exon is a region of the transcript sequence within a gene which is not removed from the primary RNA transcript by RNA splicing.
 16 | #
 17 | 
 18 | use warnings;
 19 | use strict;
 20 | use Getopt::Std;
 21 | 
 22 | my %opts = ();
 23 | # -h, -c and -t are plain flags; declared as 'h:', 'c:' and 't:' they made
 24 | # getopts consume the next argument (e.g. "-c -t" silently dropped -t)
 25 | getopts('i:hct', \%opts);
 26 | 
 27 | if ($opts{'h'} || !exists $opts{'i'}){
 28 |    usage();
 29 | }
 30 | 
 31 | my $gtf = $opts{'i'};
 32 | my $keep_cds = exists $opts{'c'} ? 1 : 0;
 33 | my $replace_tid = exists $opts{'t'} ? 1 : 0;
 34 | 
 35 | # store coordinates to check for overlaps; keys are "sequence:start" so that
 36 | # features on different sequences with the same start do not overwrite each other
 37 | my %cds = ();
 38 | my %exon = ();
 39 | 
 40 | # read the GTF file, decompressing it on the fly when its name ends in .gz
 41 | if ($gtf =~ /\.gz$/){
 42 |    open(IN, '-|', "gunzip -c $gtf") || die "Could not open $gtf: $!\n";
 43 | } else {
 44 |    open(IN, '<', $gtf) || die "Could not open $gtf: $!\n";
 45 | }
 46 | 
 47 | LINE: while(<IN>){
 48 |    chomp;
 49 |    next if /^#/;
 50 |    my ($sequence, $source, $feature, $start, $end, $score, $strand, $phase, $attributes) = split(/\t/);
 51 | 
 52 |    # every feature line must carry a gene_id attribute
 53 |    my $gene_id = '';
 54 |    if ($attributes =~ /gene_id\s"([\/a-zA-Z0-9._-]*)";/){
 55 |       $gene_id = $1;
 56 |    } else {
 57 |       die "[ERROR] Could not extract gene_id on line $.: $_\n";
 58 |    }
 59 |    if ($gene_id eq ''){
 60 |       warn("[WARNING] $feature on line $. is not associated with any gene_id: skipping\n");
 61 |       warn("[WARNING] $_\n");
 62 |       next LINE;
 63 |    }
 64 | 
 65 |    # -t: use the gene_id as the transcript_id
 66 |    if ($replace_tid){
 67 |       $attributes =~ s/transcript_id "[\/a-zA-Z0-9._-]+"/transcript_id "$gene_id"/;
 68 |    }
 69 | 
 70 |    # key used to compare generated exons with exons already in the file
 71 |    my $key = "$sequence:$start";
 72 |    if ($feature eq 'CDS'){
 73 |       if ($keep_cds){
 74 |          # print the rebuilt line (not $_) so that -t also applies to kept CDS lines
 75 |          print join("\t", $sequence, $source, $feature, $start, $end, $score, $strand, $phase, $attributes), "\n";
 76 |       }
 77 |       # NOTE(review): 3 bp (presumably the stop codon) are appended to the end
 78 |       # coordinate of every CDS line regardless of strand -- confirm intended
 79 |       print join("\t", $sequence, $source, 'exon', $start, $end + 3, $score, $strand, $phase, $attributes), "\n";
 80 |       $cds{$key} = $end + 3;
 81 |    } elsif ($feature eq 'start_codon' || $feature eq 'stop_codon') {
 82 |       if ($keep_cds){
 83 |          print join("\t", $sequence, $source, $feature, $start, $end, $score, $strand, $phase, $attributes), "\n";
 84 |       }
 85 |    } elsif ($feature eq 'exon') {
 86 |       # NOTE(review): existing exon lines are recorded here but never printed -- confirm intended
 87 |       $exon{$key} = $end;
 88 |    } else {
 89 |       print join("\t", $sequence, $source, $feature, $start, $end, $score, $strand, $phase, $attributes), "\n";
 90 |    }
 91 | 
 92 | }
 93 | close(IN);
 94 | 
 95 | # Issue warning if the newly created exon has coordinates identical to an existing exon
 96 | foreach my $key (keys %cds){
 97 |    my $end = $cds{$key};
 98 |    if (exists $exon{$key} && $exon{$key} == $end){
 99 |       warn("[WARNING] Exon $key-$end is repeated; please confirm that they below to different transcript_id's.\n");
100 |    }
101 | }
102 | 
103 | # progress message on STDERR; despite the prefix this is not an error condition
104 | warn("[WARNING] Finished processing $gtf.\n");
105 | 
106 | sub usage { # show the help text on STDERR, then terminate with exit status 0
107 | print {*STDERR} <<"EOF";
108 | Usage: $0 -i FILE
109 | 
110 | Where:   -i         GTF file
111 |          -c         output CDS, start_codon, and stop_codon (default: FALSE)
112 |          -t         replace transcript ID with gene ID (default: FALSE)
113 |          -h         this helpful usage message
114 | 
115 | EOF
116 | exit(0);
117 | }
118 | 
119 | __END__
120 | 
121 | 


--------------------------------------------------------------------------------
/guess/README.md:
--------------------------------------------------------------------------------
 1 | ## README
 2 | 
 3 | Guess the genome assembly of an unknown BED file by checking whether its coordinates extend beyond the chromosome boundaries and by checking the overlap with the expected genomic features. For example, I have two BED files containing coordinates for exons but I am not sure which genome assembly the coordinates correspond to.
 4 | 
 5 | ## Boundary check
 6 | 
 7 | Human genome chromosome sizes can be downloaded from the UCSC Genome Browser's database. I have included the two genome size files in this repository, so you do not need to run the command below. If you want to run it yourself, you will need Docker.
 8 | 
 9 | ```bash
10 | docker run --rm -u $(stat -c "%u:%g" $HOME) -v $(pwd):$(pwd) -w $(pwd) mariadb:10.3 mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -P 3306 -e "select chrom, size from hg19.chromInfo" | gzip > hg19.genome.gz
11 | docker run --rm -u $(stat -c "%u:%g" $HOME) -v $(pwd):$(pwd) -w $(pwd) mariadb:10.3 mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -P 3306 -e "select chrom, size from hg38.chromInfo" | gzip > hg38.genome.gz
12 | ```
13 | 
14 | Use `check_size.pl` to check.
15 | 
16 | ```bash
17 | ./check_size.pl -s hg19.genome.gz -b unknown1.bed.gz # lots of warnings
18 | ./check_size.pl -s hg38.genome.gz -b unknown1.bed.gz # no warnings
19 | 
20 | ./check_size.pl -s hg19.genome.gz -b unknown2.bed.gz # no warnings
21 | ./check_size.pl -s hg38.genome.gz -b unknown2.bed.gz # lots of warnings
22 | ```
23 | 
24 | ## Overlap check
25 | 
26 | We will use the UCSC Genome Browser's database and the RefSeq database to create a BED file containing exonic regions. UCSC Genome Browser's [internal database representations](https://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1) of coordinates always have a zero-based start and a one-based end, so we do not need to change the coordinates.
27 | 
28 | ```bash
29 | docker run --rm -u $(stat -c "%u:%g" $HOME) -v $(pwd):$(pwd) -w $(pwd) mariadb:10.3 mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -P 3306 -D hg19 -e 'select chrom,exonStarts,exonEnds from refGene' | split_exon.pl | grep -v "_" | sort -k1,1V -k2,2n | uniq | gzip > hg19.genes.gz
30 | docker run --rm -u $(stat -c "%u:%g" $HOME) -v $(pwd):$(pwd) -w $(pwd) mariadb:10.3 mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -P 3306 -D hg38 -e 'select chrom,exonStarts,exonEnds from refGene' | split_exon.pl | grep -v "_" | sort -k1,1V -k2,2n | uniq | gzip > hg38.genes.gz
31 | ```
32 | 
33 | Use `bedtools jaccard` to calculate the [Jaccard index](https://en.wikipedia.org/wiki/Jaccard_index).
34 | 
35 | ```bash
36 | # low Jaccard index
37 | bedtools jaccard -a unknown1.bed.gz -b hg19.genes.gz | column -t
38 | intersection  union      jaccard    n_intersections
39 | 3942923       135026099  0.0292012  25031
40 | 
41 | # much higher Jaccard index
42 | bedtools jaccard -a unknown1.bed.gz -b hg38.genes.gz | column -t
43 | intersection  union      jaccard  n_intersections
44 | 36268855      103033712  0.35201  190064
45 | 
46 | bedtools jaccard -a unknown2.bed.gz -b hg38.genes.gz | column -t
47 | intersection  union      jaccard    n_intersections
48 | 3566997       130829021  0.0272646  24223
49 | 
50 | bedtools jaccard -a unknown2.bed.gz -b hg19.genes.gz | column -t
51 | intersection  union      jaccard  n_intersections
52 | 32064945      101997528  0.31437  186043
53 | ```
54 | 
55 | Coordinates for `unknown1.bed.gz` are probably for hg38 and `unknown2.bed.gz` are for hg19.
56 | 
57 | ## Padding check
58 | 
59 | In addition I want to check whether the coordinates are "padded", which means that additional bps are added. If coordinates are padded, then if I shorten regions, the Jaccard index should increase.
60 | 
61 | ```bash
62 | # first check size of smallest region
63 | zcat unknown1.bed.gz | perl -lane 'print $F[2] - $F[1]' | sort -n | head -1
64 | 110
65 | 
66 | # remove 50 bps from start and end
67 | zcat unknown1.bed.gz | perl -lane 'print join("\t", $F[0], $F[1]+50, $F[2]-50)' | gzip > unknown1_shortened.bed.gz
68 | 
69 | bedtools jaccard -a unknown1_shortened.bed.gz -b hg38.genes.gz | column -t
70 | intersection  union     jaccard   n_intersections
71 | 27250690      92010876  0.296168  189508
72 | ```
73 | 
74 | The Jaccard index is decreased, so the BED file is probably not padded.
75 | 
76 | If the original BED file was padded say by 50 bp, we would have a Jaccard index of 0.31 and "removing" the padding would increase the Jaccard index to 0.35.
77 | 
78 | ```bash
79 | zcat unknown1.bed.gz | perl -lane 'print join("\t", $F[0], $F[1]-50, $F[2]+50)' | gzip > unknown1_lengthened.bed.gz
80 | bedtools jaccard -a unknown1_lengthened.bed.gz -b hg38.genes.gz | column -t
81 | intersection  union      jaccard   n_intersections
82 | 37895511      120602616  0.314218  190090
83 | ```
84 | 
85 | 


--------------------------------------------------------------------------------
/gencode/plot_stats.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Plot GENCODE stats"
  3 | date: "`r Sys.Date()`"
  4 | output: html_document
  5 | ---
  6 | 
  7 | ```{r setup, include=FALSE}
  8 | library(tidyverse)
  9 | library(reshape2)
 10 | library(rtracklayer)
 11 | library(scales)
 12 | library(cowplot)
 13 | theme_set(theme_bw())
 14 | knitr::opts_chunk$set(echo = TRUE)
 15 | ```
 16 | 
 17 | ## Genomic region stats
 18 | 
 19 | Plot.
 20 | 
 21 | ```{r load_and_plot, warning=FALSE, message=FALSE}
 22 | stat_files <- list.files(path = ".", pattern = "\\.stats$")
 23 | file_list <- lapply(stat_files, function(x){
 24 |   read_delim(file = x, delim = " ", col_names = c("var", "value"))
 25 | })
 26 | # strip the suffix with a proper regular expression ("*.annotation..." is a glob, not a regex)
 27 | names(file_list) <- sub(pattern = "\\.annotation\\.gtf\\.stats$", replacement = "", x = stat_files)
 28 | 
 29 | my_df <- as.data.frame(do.call(rbind, sapply(file_list, function(x){
 30 |   x[, 2]
 31 | })))
 32 | # column names come from the "var" column of the first file, minus the colon
 33 | colnames(my_df) <- sub(pattern = ":", replacement = "", x = file_list[[1]]$var)
 34 | my_df$version <- sub(pattern = "gencode.v(\\d+).value", replacement = "\\1", x = row.names(my_df))
 35 | row.names(my_df) <- NULL
 36 | my_df <- melt(my_df, id.vars = "version")
 37 | # p1: coverage variables per GENCODE version
 38 | my_df %>%
 39 |   filter(str_detect(variable, "coverage")) %>%
 40 |   ggplot(., aes(version, value, fill = variable)) +
 41 |   geom_col() +
 42 |   labs(x = "GENCODE version", y = "Percentage", title = "Genomic region proportions") -> p1
 43 | # p2: length variables per GENCODE version
 44 | my_df %>%
 45 |   filter(str_detect(variable, "length")) %>%
 46 |   ggplot(., aes(version, value, fill = variable)) +
 47 |   geom_col() +
 48 |   labs(x = "GENCODE version", y = "Average length (bp)", title = "Genomic region lengths") -> p2
 49 | 
 50 | p1
 51 | p2
 52 | ```
 53 | 
 54 | Save plots.
 55 | 
 56 | ```{r save_plot, warning=FALSE, message=FALSE}
 57 | ggsave(filename = "genomic_region_proportion.png", plot = p1) # coverage plot built above
 58 | ggsave(filename = "genomic_region_length.png", plot = p2) # length plot built above
 59 | ```
 60 | 
 61 | ## GTF stats
 62 | 
 63 | Load all GTF files into R.
 64 | 
 65 | ```{r load_gtf, warning=FALSE, message=FALSE}
 66 | gtf_files <- list.files(path = ".", pattern = "\\.gtf\\.gz$")
 67 | # import every GTF file; `import` is applied to each file name in turn
 68 | gtf_obj <- lapply(gtf_files, import)
 69 | # name each object after its file minus the suffix (a regex, not a glob), e.g. gencode.v35
 70 | names(gtf_obj) <- sub(pattern = "\\.annotation\\.gtf\\.gz$", replacement = "", x = gtf_files)
 71 | ```
 72 | 
 73 | Plot `gene_type`.
 74 | 
 75 | ```{r plot_gene_type, message=FALSE, warning=FALSE, fig.width=12, fig.height=8}
 76 | plot_gene_type <- function(gr, plot_title){
 77 |   # Bar chart (log10 y axis) of the number of "gene" features per gene_type, most frequent first
 78 |   as.data.frame(gr) %>%
 79 |     filter(type == "gene") %>%
 80 |     group_by(gene_type) %>%
 81 |     summarise(count = n()) %>%
 82 |     arrange(desc(count)) %>%
 83 |     mutate(gene_type = factor(x = gene_type, levels = gene_type)) %>%
 84 |     ggplot(., aes(gene_type, y = count)) +
 85 |       geom_col() +
 86 |       # one y scale only: a later scale_y_continuous() would replace the log10 scale
 87 |       scale_y_log10(labels = comma) +
 88 |       theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
 89 |             axis.title.x = element_blank()) +
 90 |       labs(title = plot_title)
 91 | }
 92 | # gene_type bar charts for GENCODE versions 20, 30, 31 and 35
 93 | p1 <- plot_gene_type(gtf_obj$gencode.v20, "GENCODE version 20")
 94 | p2 <- plot_gene_type(gtf_obj$gencode.v30, "GENCODE version 30")
 95 | p3 <- plot_gene_type(gtf_obj$gencode.v31, "GENCODE version 31")
 96 | p4 <- plot_gene_type(gtf_obj$gencode.v35, "GENCODE version 35")
 97 | # combine the four plots into one grid, display it and save it as gene_type.png
 98 | all_plot <- plot_grid(plotlist = list(p1, p2, p3, p4))
 99 | all_plot
100 | ggsave(filename = "gene_type.png", plot = all_plot)
101 | ```
102 | 
103 | Plot `transcript_support_level`, which are [transcript scores](https://www.gencodegenes.org/pages/data_format.html) according to how well mRNA and EST alignments match over its full length:
104 | 
105 | * 1 (all splice junctions of the transcript are supported by at least one non-suspect mRNA),
106 | * 2 (the best supporting mRNA is flagged as suspect or the support is from multiple ESTs),
107 | * 3 (the only support is from a single EST),
108 | * 4 (the best supporting EST is flagged as suspect),
109 | * 5 (no single transcript supports the model structure),
110 | * NA (the transcript was not analyzed)
111 | 
112 | ```{r plot_transcript_support, message=FALSE, warning=FALSE, fig.width=8, fig.height=6}
113 | plot_transcript_support <- function(gr, plot_title){
114 |   # count "transcript" features per transcript_support_level, dropping missing levels
115 |   tsl_count <- as.data.frame(gr) %>%
116 |     filter(type == "transcript") %>%
117 |     group_by(transcript_support_level) %>%
118 |     summarise(count = n()) %>%
119 |     arrange(desc(count)) %>%
120 |     filter(!is.na(transcript_support_level)) %>%
121 |     mutate(transcript_support_level = factor(x = transcript_support_level, levels = c(1:5, "NA")))
122 |   # bar chart of the counts on a linear y axis with comma-formatted labels
123 |   ggplot(tsl_count, aes(transcript_support_level, y = count)) +
124 |     geom_col() +
125 |     theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
126 |           axis.title.x = element_blank()) +
127 |     scale_y_continuous(labels = comma) +
128 |     labs(title = plot_title)
129 | }
130 | # transcript support level bar charts for GENCODE versions 21, 30, 33 and 35
131 | p1 <- plot_transcript_support(gtf_obj$gencode.v21, "GENCODE version 21")
132 | p2 <- plot_transcript_support(gtf_obj$gencode.v30, "GENCODE version 30")
133 | p3 <- plot_transcript_support(gtf_obj$gencode.v33, "GENCODE version 33")
134 | p4 <- plot_transcript_support(gtf_obj$gencode.v35, "GENCODE version 35")
135 | # combine the four plots into one grid, display it and save it as transcript_support_level.png
136 | all_plot <- plot_grid(plotlist = list(p1, p2, p3, p4))
137 | all_plot
138 | ggsave(filename = "transcript_support_level.png", plot = all_plot)
139 | ```
140 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | Defining genomic regions
  2 | ========================
  3 | 
  4 | A reference genome is simply a string of A's, C's, G's, and T's. However, there are many functional elements within the genome and massive efforts have been undertaken to annotate genomes. The [GENCODE Project](https://www.gencodegenes.org/pages/gencode.html) was tasked with cataloguing genes and gene variants in the human and mouse genomes. [TAIR](https://www.arabidopsis.org/portals/genAnnotation/gene_structural_annotation/annotation_data.jsp) coordinates and provides genome annotations for *Arabidopsis thaliana*. Typically, genome annotations are provided in a [GTF file](https://en.wikipedia.org/wiki/Gene_transfer_format).
  5 | 
  6 | We will define regions in a reference genome by using [BEDTools](http://bedtools.readthedocs.io/en/latest/) and a GTF file.
  7 | 
  8 | ![Example](example.png)
  9 | The IGV screenshot above shows various gene models in dark blue, exonic regions in light red, intronic regions in light green, and intergenic regions in light blue.
 10 | 
 11 | ## Install BEDTools
 12 | 
 13 | To get started, download and compile BEDTools, if you haven't already.
 14 | 
 15 | ```bash
 16 | git clone https://github.com/arq5x/bedtools2.git
 17 | cd bedtools2
 18 | make clean && make all
 19 | ```
 20 | 
 21 | Alternatively, you can install BEDTools using [Conda](https://davetang.github.io/reproducible_bioinformatics/conda.html).
 22 | 
 23 | ```bash
 24 | conda install -c bioconda bedtools
 25 | ```
 26 | 
 27 | ## Download GTF file
 28 | 
 29 | We will use the GTF file for *Arabidopsis thaliana*; more information on the format is provided at the [UCSC Genome Browser help page](https://genome.ucsc.edu/FAQ/FAQformat#format4).
 30 | 
 31 | ```bash
 32 | wget -c ftp://ftp.ensemblgenomes.org/pub/release-36/plants/gtf/arabidopsis_thaliana/Arabidopsis_thaliana.TAIR10.36.gtf.gz
 33 | 
 34 | gunzip -c Arabidopsis_thaliana.TAIR10.36.gtf.gz | head
 35 | #!genome-build TAIR10
 36 | #!genome-version TAIR10
 37 | #!genome-date 2010-09
 38 | #!genome-build-accession GCA_000001735.1
 39 | #!genebuild-last-updated 2010-09
 40 | 1       araport11       gene    3631    5899    .       +       .       gene_id "AT1G01010"; gene_name "NAC001"; gene_source "araport11"; gene_biotype "protein_coding";
 41 | 1       araport11       transcript      3631    5899    .       +       .       gene_id "AT1G01010"; transcript_id "AT1G01010.1"; gene_name "NAC001"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_source "araport11"; transcript_biotype "protein_coding";
 42 | 1       araport11       exon    3631    3913    .       +       .       gene_id "AT1G01010"; transcript_id "AT1G01010.1"; exon_number "1"; gene_name "NAC001"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_source "araport11"; transcript_biotype "protein_coding"; exon_id "AT1G01010.1.exon1";
 43 | 1       araport11       CDS     3760    3913    .       +       0       gene_id "AT1G01010"; transcript_id "AT1G01010.1"; exon_number "1"; gene_name "NAC001"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_source "araport11"; transcript_biotype "protein_coding"; protein_id "AT1G01010.1"; protein_version "1";
 44 | 1       araport11       start_codon     3760    3762    .       +       0       gene_id "AT1G01010"; transcript_id "AT1G01010.1"; exon_number "1"; gene_name "NAC001"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_source "araport11"; transcript_biotype "protein_coding";
 45 | ```
 46 | 
 47 | ## Define exons
 48 | 
 49 | Exons are already defined in the GTF file, so we simply need to print lines that are marked exonic.
 50 | 
 51 | ```bash
 52 | gunzip -c Arabidopsis_thaliana.TAIR10.36.gtf.gz |
 53 | awk 'BEGIN{OFS="\t";} $3=="exon" {print $1,$4-1,$5}' |
 54 | bedtools sort |
 55 | bedtools merge -i - | gzip > my_exon.bed.gz
 56 | ```
 57 | 
 58 | ## Define introns
 59 | 
 60 | To obtain introns, we simply need the gene and exonic coordinates; by subtracting the exonic regions from the genic region, we have the intronic region.
 61 | 
 62 | ```bash
 63 | gunzip -c Arabidopsis_thaliana.TAIR10.36.gtf.gz |
 64 | awk 'BEGIN{OFS="\t";} $3=="gene" {print $1,$4-1,$5}' |
 65 | bedtools sort |
 66 | bedtools subtract -a stdin -b my_exon.bed.gz |
 67 | gzip > my_intron.bed.gz
 68 | ```
 69 | 
 70 | ## Define intergenic
 71 | 
 72 | For the intergenic region, we will require the size of the chromosomes.
 73 | 
 74 | ```bash
 75 | gunzip -c Arabidopsis_thaliana.TAIR10.36.gtf.gz |
 76 |   awk 'BEGIN{OFS="\t";} $3=="gene" {print $1,$4-1,$5}' |
 77 |   bedtools sort -g chrom_info/araTha1.genome |
 78 |   bedtools complement -i stdin -g chrom_info/araTha1.genome |
 79 |   gzip > my_intergenic.bed.gz
 80 | ```
 81 | 
 82 | ## *Arabidopsis thaliana* regions
 83 | 
 84 | How much of the genome is made up of exonic, intronic, and intergenic regions?
 85 | 
 86 | ```bash
 87 | alias add='perl -nle '\''$i+=$_; END {print $i}'\'''
 88 | 
 89 | cat chrom_info/araTha1.genome
 90 | 1       30427671
 91 | 5       26975502
 92 | 3       23459830
 93 | 2       19698289
 94 | 4       18585056
 95 | Mt      366924
 96 | Pt      154478
 97 | 
 98 | cat chrom_info/araTha1.genome | cut -f2 | add
 99 | 119667750
100 | 
101 | # exonic
102 | gunzip -c my_exon.bed.gz | awk '{print $3-$2}' | add
103 | 47821763
104 | bc -l<<<47821763*100/119667750
105 | 39.96211427055326100808
106 | 
107 | # intronic
108 | gunzip -c my_intron.bed.gz | awk '{print $3-$2}' | add
109 | 18164145
110 | bc -l<<<18164145*100/119667750
111 | 15.17881384082177529033
112 | 
113 | # intergenic
114 | gunzip -c my_intergenic.bed.gz | awk '{print $3-$2}' | add
115 | 53769447
116 | bc -l<<<53769447*100/119667750
117 | 44.93227874678014753348
118 | 
119 | # slightly off total
120 | bc -l<<<15.17881384082177529033+44.93227874678014753348+39.96211427055326100808
121 | 100.07320685815518383189
122 | ```
123 | 
124 | ## hg19 reference genome
125 | 
126 | If you are working with hg19, simply run `make` to create all the different genomic regions.
127 | 
128 | ```bash
129 | make
130 | 
131 | # if everything ran successfully
132 | for file in `ls *.gz`; do md5sum $file; done
133 | bd83e28270e595d3bde6bfcb21c9748f  gencode.v19.annotation.gtf.gz
134 | 8c97ec4b54eaa176ba1e48bfeb60c08a  gencode_v19_exon_merged.bed.gz
135 | ea03038b873ba2612383a4c0949c835d  gencode_v19_intergenic.bed.gz
136 | 4d5ff850e3115077bf50d87bc406a84f  gencode_v19_intron.bed.gz
137 | 48dbe15f4498baad1a2327c774a692c8  promoter.bed.gz
138 | 9d513cad3aafd5690bf8bbebb24b4df4  transcript_utr.bed.gz
139 | 35aed6aac655182c653cdc72060b914d  transcript_utr_number.out.gz
140 | ```
141 | 
142 | ## Annotate BAM files
143 | 
144 | ```bash
145 | samtools sort my_file.bam my_file
146 | bedtools2/bin/bedtools bamtobed -i my_file.bam > my_file.bed
147 | cat my_file.bed | wc -l
148 | bedtools2/bin/bedtools intersect -a my_file.bed -b gencode_v19_exon_merged.bed.gz -u | wc -l
149 | bedtools2/bin/bedtools intersect -a my_file.bed -b gencode_v19_intergenic.bed.gz -u | wc -l
150 | bedtools2/bin/bedtools intersect -a my_file.bed -b gencode_v19_intron.bed.gz -u | wc -l
151 | ```
152 | 
153 | ## run.pl
154 | 
155 | I wrote `run.pl` to create exonic, intronic, and intergenic BED files and to report some simple statistics for each region.
156 | 
157 | ```bash
158 | run.pl Arabidopsis_thaliana.TAIR10.37.gtf.gz chrom_info/araTha1.genome
159 | Creating exonic regions
160 | Creating intronic regions
161 | Creating intergenic regions
162 | 
163 | Coverage summary per region (percentage)
164 | 
165 | Exon: 39.99
166 | Intron: 15.16
167 | Intergenic: 44.84
168 | 
169 | Average length per region (bp)
170 | 
171 | Exon: 336.63
172 | Intron: 157.37
173 | Intergenic: 1951.60
174 | ```
175 | 
176 | ## Further reading
177 | 
178 | See the associated blog post: <https://davetang.org/muse/2013/01/18/defining-genomic-regions/>
179 | 
180 | 


--------------------------------------------------------------------------------
/chrom_info/hg19.genome:
--------------------------------------------------------------------------------
  1 | chr1	249250621
  2 | chr2	243199373
  3 | chr3	198022430
  4 | chr4	191154276
  5 | chr5	180915260
  6 | chr6	171115067
  7 | chr7	159138663
  8 | chrX	155270560
  9 | chr8	146364022
 10 | chr9	141213431
 11 | chr10	135534747
 12 | chr11	135006516
 13 | chr12	133851895
 14 | chr13	115169878
 15 | chr14	107349540
 16 | chr15	102531392
 17 | chr16	90354753
 18 | chr17	81195210
 19 | chr18	78077248
 20 | chr20	63025520
 21 | chrY	59373566
 22 | chr19	59128983
 23 | chr22	51304566
 24 | chr21	48129895
 25 | chr1_jh636052_fix	7283150
 26 | chrX_jh806600_fix	6530008
 27 | chr6_ssto_hap7	4928567
 28 | chr6_mcf_hap5	4833398
 29 | chr6_cox_hap2	4795371
 30 | chr6_mann_hap4	4683263
 31 | chr6_apd_hap1	4622290
 32 | chr6_qbl_hap6	4611984
 33 | chr6_dbb_hap3	4610396
 34 | chrX_jh806587_fix	4110759
 35 | chr7_jh159134_fix	3821770
 36 | chrX_jh159150_fix	3110903
 37 | chrX_jh806590_fix	2418393
 38 | chr10_jh591181_fix	2281126
 39 | chr17_ctg5_hap1	1680828
 40 | chr1_jh636053_fix	1676126
 41 | chr5_gl339449_alt	1612928
 42 | chr14_kb021645_fix	1523386
 43 | chrX_jh720453_fix	1461188
 44 | chrX_jh806601_fix	1389764
 45 | chr7_gl582971_fix	1284284
 46 | chrX_jh806599_fix	1214327
 47 | chr19_gl949749_alt	1091840
 48 | chr19_gl949750_alt	1066389
 49 | chr19_gl949748_alt	1064303
 50 | chr19_kb021647_fix	1058686
 51 | chrX_jh806597_fix	1045622
 52 | chr10_ke332501_fix	1020827
 53 | chr19_gl949751_alt	1002682
 54 | chr19_gl949746_alt	987716
 55 | chr19_gl949752_alt	987100
 56 | chrX_jh806598_fix	899320
 57 | chrX_jh720451_fix	898979
 58 | chrX_jh806591_fix	882083
 59 | chr11_jh806581_fix	872115
 60 | chrX_jh806588_fix	862483
 61 | chrX_jh806592_fix	835911
 62 | chr19_gl949753_alt	796478
 63 | chr1_jh636054_fix	758378
 64 | chrX_jh720454_fix	752267
 65 | chr19_gl949747_alt	729519
 66 | chr7_jh636058_fix	716227
 67 | chrX_jh806602_fix	713266
 68 | chr17_gl383561_fix	644425
 69 | chr8_gl949743_fix	608579
 70 | chr2_kb663603_fix	599580
 71 | chr4_ctg9_hap1	590426
 72 | chr19_gl582977_fix	580393
 73 | chr19_ke332505_fix	579598
 74 | chr1_gl000192_random	547496
 75 | chr11_jh159140_fix	546435
 76 | chr5_ke332497_fix	543325
 77 | chr17_gl383560_fix	534288
 78 | chrX_jh720452_fix	522319
 79 | chr4_ke332496_fix	503215
 80 | chr6_kb663604_fix	478993
 81 | chrX_kb021648_fix	469972
 82 | chr11_jh591184_fix	462282
 83 | chr17_gl383558_fix	457041
 84 | chr17_jh720447_fix	454385
 85 | chrX_jh806595_fix	444074
 86 | chr10_jh636060_fix	437946
 87 | chr8_gl383535_fix	429806
 88 | chrX_jh806596_fix	413927
 89 | chr17_gl582976_fix	412535
 90 | chr11_jh720443_fix	408430
 91 | chr12_gl877876_alt	408271
 92 | chr3_jh159131_fix	393769
 93 | chr10_gl383543_fix	392792
 94 | chrX_jh806594_fix	390496
 95 | chr2_gl877871_fix	389939
 96 | chrX_jh806593_fix	389631
 97 | chr15_gl383555_alt	388773
 98 | chr17_jh159144_fix	388340
 99 | chr19_gl383573_alt	385657
100 | chr17_jh591186_fix	376223
101 | chr4_gl383528_alt	376187
102 | chr12_gl949745_alt	372609
103 | chr1_gl383520_alt	366579
104 | chr7_gl582968_fix	356330
105 | chr7_gl582970_fix	354970
106 | chr17_jh806582_fix	342635
107 | chr17_ke332502_fix	341712
108 | chr17_gl383559_fix	338640
109 | chr12_kb663607_fix	334922
110 | chr9_gl339450_fix	330164
111 | chr7_gl582972_fix	327774
112 | chr11_jh159142_fix	326647
113 | chr11_gl582973_fix	321004
114 | chr10_gl383546_alt	309802
115 | chr21_ke332506_fix	307252
116 | chr10_kb663606_fix	305900
117 | chr4_gl877872_fix	297485
118 | chr15_gl383554_alt	296527
119 | chr9_jh636059_fix	295379
120 | chr18_gl383567_alt	289831
121 | chrX_gl877877_fix	284527
122 | chr20_kb663608_fix	283551
123 | chr17_jh159146_alt	278131
124 | chr11_gl949744_fix	276448
125 | chr7_ke332499_fix	274521
126 | chr6_jh806576_fix	273386
127 | chr12_jh720444_fix	273128
128 | chrX_jh806589_fix	270630
129 | chr17_gl383563_alt	270261
130 | chr5_jh159133_fix	266316
131 | chr3_ke332495_fix	263861
132 | chr6_jh636056_fix	262912
133 | chr7_gl582969_fix	251823
134 | chr4_gl582967_fix	248177
135 | chr19_jh159149_fix	245473
136 | chr11_jh159141_fix	240775
137 | chr8_ke332500_fix	228602
138 | chr5_gl949742_alt	226852
139 | chr17_gl383565_alt	223995
140 | chr22_jh720449_fix	212298
141 | chr17_kb021646_fix	211416
142 | chr9_jh806579_fix	211307
143 | chrUn_gl000225	211173
144 | chr8_gl383536_fix	203777
145 | chr21_gl383579_alt	201198
146 | chr11_jh159136_alt	200998
147 | chr6_jh636057_fix	200195
148 | chr18_gl383571_alt	198278
149 | chr10_jh591182_fix	196262
150 | chr17_jh159145_fix	194862
151 | chr16_gl383556_alt	192462
152 | chr4_gl000194_random	191469
153 | chr11_jh159137_alt	191409
154 | chr11_jh159143_fix	191402
155 | chr4_gl000193_random	189789
156 | chr19_gl383576_alt	188024
157 | chr6_kb021644_alt	187824
158 | chr9_gl000200_random	187035
159 | chrUn_gl000222	186861
160 | chrUn_gl000212	186858
161 | chr17_jh636061_fix	186059
162 | chr12_gl383551_alt	184319
163 | chrX_jh806603_fix	182949
164 | chr7_gl000195_random	182896
165 | chr1_gl383518_alt	182439
166 | chr3_gl383526_alt	180671
167 | chrUn_gl000223	180455
168 | chr20_gl582979_fix	179899
169 | chrUn_gl000224	179693
170 | chr10_gl383545_alt	179254
171 | chrUn_gl000219	179198
172 | chr10_jh591183_fix	177920
173 | chr17_gl000205_random	174588
174 | chr5_gl383531_alt	173459
175 | chr3_jh636055_alt	173151
176 | chrUn_gl000215	172545
177 | chrUn_gl000216	172294
178 | chrUn_gl000217	172149
179 | chr3_gl383523_fix	171362
180 | chr9_gl383541_alt	171286
181 | chr19_gl383575_alt	170222
182 | chr15_jh720445_fix	170033
183 | chr9_gl000199_random	169874
184 | chr9_jh806578_fix	169437
185 | chr12_gl383550_alt	169178
186 | chr10_gl877873_fix	168465
187 | chr18_gl383569_alt	167950
188 | chr11_jh591185_fix	167437
189 | chr12_gl877875_alt	167313
190 | chr22_jh806583_fix	167183
191 | chrUn_gl000211	166566
192 | chr12_gl383548_fix	165247
193 | chr18_gl383570_alt	164789
194 | chr4_gl383527_alt	164536
195 | chrUn_gl000213	164239
196 | chr12_gl582974_fix	163298
197 | chr9_gl383539_alt	162988
198 | chr22_gl383582_alt	162811
199 | chrUn_gl000220	161802
200 | chrUn_gl000218	161147
201 | chr18_gl383572_alt	159547
202 | chr19_gl000209_random	159169
203 | chr9_kb663605_fix	155926
204 | chr19_gl383574_alt	155864
205 | chrUn_gl000221	155397
206 | chr11_gl383547_alt	154407
207 | chr12_gl383553_alt	152874
208 | chr1_gl949741_fix	151551
209 | chr6_ke332498_fix	149443
210 | chr2_gl383521_alt	143390
211 | chr12_gl383552_alt	138655
212 | chrUn_gl000214	137718
213 | chr17_gl383564_alt	133151
214 | chrUn_gl000228	129120
215 | chr20_gl383577_alt	128385
216 | chr10_gl383544_fix	128378
217 | chrUn_gl000227	128374
218 | chr6_gl383533_alt	124736
219 | chr2_gl383522_alt	123821
220 | chr4_gl383529_alt	121345
221 | chr12_gl383549_alt	120804
222 | chr11_jh159139_fix	120441
223 | chr7_gl383534_alt	119183
224 | chr21_gl383581_alt	116690
225 | chr1_gl383519_alt	110268
226 | chr11_jh159138_fix	108875
227 | chr1_gl000191_random	106433
228 | chr18_gl383568_alt	104552
229 | chr8_jh159135_fix	102251
230 | chr5_gl383530_alt	101241
231 | chr3_jh159132_fix	100694
232 | chr16_jh720446_fix	97345
233 | chr22_gl383583_alt	96924
234 | chr2_gl582966_alt	96131
235 | chr10_jh806580_fix	93149
236 | chr19_gl000208_random	92689
237 | chr17_gl383566_alt	90219
238 | chr9_gl000198_random	90085
239 | chr16_gl383557_alt	89672
240 | chr17_jh159148_alt	88070
241 | chr5_gl383532_alt	82728
242 | chr17_gl000204_random	81310
243 | chr3_gl383524_fix	78793
244 | chr21_gl383580_alt	74652
245 | chr22_kb663609_alt	74013
246 | chr22_jh806585_fix	73505
247 | chr9_gl383540_alt	71551
248 | chr22_jh806584_fix	70876
249 | chr20_jh720448_fix	70483
250 | chr17_jh159147_alt	70345
251 | chr2_gl877870_fix	66021
252 | chr3_gl383525_fix	65063
253 | chrX_jh720455_fix	65034
254 | chr21_gl383578_alt	63917
255 | chr9_gl383537_fix	62435
256 | chr9_gl383542_alt	60032
257 | chr1_gl383517_fix	49352
258 | chr1_gl383516_fix	49316
259 | chr9_gl383538_fix	49281
260 | chr1_jh806575_fix	47409
261 | chrUn_gl000233	45941
262 | chrUn_gl000237	45867
263 | chr17_gl383562_fix	45551
264 | chrUn_gl000230	43691
265 | chr22_jh806586_fix	43543
266 | chrUn_gl000242	43523
267 | chrUn_gl000243	43341
268 | chrUn_gl000241	42152
269 | chrUn_gl000236	41934
270 | chrUn_gl000240	41933
271 | chr17_gl000206_random	41001
272 | chrUn_gl000232	40652
273 | chrUn_gl000234	40531
274 | chr11_gl000202_random	40103
275 | chrUn_gl000238	39939
276 | chrUn_gl000244	39929
277 | chrUn_gl000248	39786
278 | chr8_gl000196_random	38914
279 | chrUn_gl000249	38502
280 | chrUn_gl000246	38154
281 | chr17_gl000203_random	37498
282 | chr8_gl000197_random	37175
283 | chrUn_gl000245	36651
284 | chrUn_gl000247	36422
285 | chr9_gl000201_random	36148
286 | chr13_gl582975_fix	34662
287 | chrUn_gl000235	34474
288 | chrUn_gl000239	33824
289 | chr21_gl000210_random	27682
290 | chrUn_gl000231	27386
291 | chr1_jh806573_fix	24680
292 | chr1_jh806574_fix	22982
293 | chr9_jh806577_fix	22394
294 | chrUn_gl000229	19913
295 | chrM	16571
296 | chrMT	16569
297 | chrUn_gl000226	15008
298 | chr18_gl000207_random	4262
299 | 


--------------------------------------------------------------------------------
/chrom_info/hg38.genome:
--------------------------------------------------------------------------------
  1 | chr1	248956422
  2 | chr2	242193529
  3 | chr3	198295559
  4 | chr4	190214555
  5 | chr5	181538259
  6 | chr6	170805979
  7 | chr7	159345973
  8 | chrX	156040895
  9 | chr8	145138636
 10 | chr9	138394717
 11 | chr11	135086622
 12 | chr10	133797422
 13 | chr12	133275309
 14 | chr13	114364328
 15 | chr14	107043718
 16 | chr15	101991189
 17 | chr16	90338345
 18 | chr17	83257441
 19 | chr18	80373285
 20 | chr20	64444167
 21 | chr19	58617616
 22 | chrY	57227415
 23 | chr22	50818468
 24 | chr21	46709983
 25 | chr8_KZ208915v1_fix	6367528
 26 | chr15_KI270905v1_alt	5161414
 27 | chr15_KN538374v1_fix	4998962
 28 | chr6_GL000256v2_alt	4929269
 29 | chr6_GL000254v2_alt	4827813
 30 | chr6_GL000251v2_alt	4795265
 31 | chr6_GL000253v2_alt	4677643
 32 | chr6_GL000250v2_alt	4672374
 33 | chr6_GL000255v2_alt	4606388
 34 | chr6_GL000252v2_alt	4604811
 35 | chr17_KI270857v1_alt	2877074
 36 | chr16_KI270853v1_alt	2659700
 37 | chr15_KQ031389v1_alt	2365364
 38 | chr16_KV880768v1_fix	1927115
 39 | chr16_KI270728v1_random	1872759
 40 | chr17_GL000258v2_alt	1821992
 41 | chr5_GL339449v2_alt	1612928
 42 | chr14_KI270847v1_alt	1511111
 43 | chr17_KI270908v1_alt	1423190
 44 | chr14_KI270846v1_alt	1351393
 45 | chr5_KI270897v1_alt	1144418
 46 | chr7_KI270803v1_alt	1111570
 47 | chr19_GL949749v2_alt	1091841
 48 | chr19_KI270938v1_alt	1066800
 49 | chr19_GL949750v2_alt	1066390
 50 | chr19_GL949748v2_alt	1064304
 51 | chr12_KZ208916v1_fix	1046838
 52 | chr19_GL949751v2_alt	1002683
 53 | chr19_GL949746v1_alt	987716
 54 | chr19_GL949752v1_alt	987100
 55 | chr8_KI270821v1_alt	985506
 56 | chr1_KI270763v1_alt	911658
 57 | chr6_KI270801v1_alt	870480
 58 | chr19_GL949753v2_alt	796479
 59 | chr19_GL949747v2_alt	729520
 60 | chr14_KZ208920v1_fix	690932
 61 | chr7_KZ208913v1_alt	680662
 62 | chr5_KV575244v1_fix	673059
 63 | chr8_KI270822v1_alt	624492
 64 | chr7_KZ208912v1_fix	589656
 65 | chr4_GL000257v2_alt	586476
 66 | chr12_KI270904v1_alt	572349
 67 | chr4_KI270925v1_alt	555799
 68 | chr1_KV880763v1_alt	551020
 69 | chr12_KN538369v1_fix	541038
 70 | chr2_KQ983256v1_alt	535088
 71 | chr2_KQ031384v1_fix	481245
 72 | chr16_KZ559113v1_fix	480415
 73 | chr15_KI270852v1_alt	478999
 74 | chr7_KV880765v1_fix	468267
 75 | chr1_KQ031383v1_fix	467143
 76 | chr1_KN538360v1_fix	460100
 77 | chr3_KN196475v1_fix	451168
 78 | chr15_KI270727v1_random	448248
 79 | chr9_KI270823v1_alt	439082
 80 | chr15_KI270850v1_alt	430880
 81 | chr1_KI270759v1_alt	425601
 82 | chr4_KV766193v1_alt	420675
 83 | chr10_KN538367v1_fix	420164
 84 | chr3_KN538364v1_fix	415308
 85 | chr3_KV766192v1_fix	411654
 86 | chr12_GL877876v1_alt	408271
 87 | chr18_KQ090028v1_fix	407387
 88 | chr19_KQ458386v1_fix	405389
 89 | chrUn_KI270442v1	392061
 90 | chr17_KI270862v1_alt	391357
 91 | chr15_GL383555v2_alt	388773
 92 | chr19_GL383573v1_alt	385657
 93 | chr4_KI270896v1_alt	378547
 94 | chr4_GL383528v1_alt	376187
 95 | chr17_GL383563v3_alt	375691
 96 | chr8_KI270810v1_alt	374415
 97 | chr3_KQ031385v1_fix	373699
 98 | chr19_KN196484v1_fix	370917
 99 | chr1_GL383520v2_alt	366580
100 | chr2_KN538363v1_fix	365499
101 | chr5_KV575243v1_alt	362221
102 | chr13_KN538372v1_fix	356766
103 | chr1_KI270762v1_alt	354444
104 | chr1_KQ458383v1_alt	349938
105 | chr9_KN196479v1_fix	330164
106 | chr1_KZ208906v1_fix	330031
107 | chr15_KI270848v1_alt	327382
108 | chr17_KI270909v1_alt	325800
109 | chr14_KI270844v1_alt	322166
110 | chr6_KQ031387v1_fix	320750
111 | chr8_KI270900v1_alt	318687
112 | chr12_KQ759760v1_fix	315610
113 | chr10_GL383546v1_alt	309802
114 | chr13_KI270838v1_alt	306913
115 | chr3_KN196476v1_fix	305979
116 | chr8_KI270816v1_alt	305841
117 | chr1_KN538361v1_fix	305542
118 | chr11_KZ559108v1_fix	305244
119 | chr22_KI270879v1_alt	304135
120 | chr3_KZ559103v1_alt	302885
121 | chr11_KZ559110v1_alt	301637
122 | chr8_KI270813v1_alt	300230
123 | chr11_KI270831v1_alt	296895
124 | chr15_GL383554v1_alt	296527
125 | chr19_KV575249v1_alt	293522
126 | chr8_KI270811v1_alt	292436
127 | chr18_GL383567v1_alt	289831
128 | chrX_KI270880v1_alt	284869
129 | chr8_KI270812v1_alt	282736
130 | chr19_KI270921v1_alt	282224
131 | chr17_KV766196v1_fix	281919
132 | chr17_KI270729v1_random	280839
133 | chr11_KZ559109v1_fix	279644
134 | chr1_KQ983255v1_alt	278659
135 | chr17_JH159146v1_alt	278131
136 | chr10_KN196480v1_fix	277797
137 | chr17_KV766198v1_alt	276292
138 | chrX_KI270913v1_alt	274009
139 | chr6_KI270798v1_alt	271782
140 | chr7_KI270808v1_alt	271455
141 | chr6_KN196478v1_fix	268330
142 | chr16_KQ090027v1_alt	267463
143 | chr8_KV880767v1_fix	265876
144 | chr10_KQ090021v1_fix	264545
145 | chr22_KI270876v1_alt	263666
146 | chr15_KI270851v1_alt	263054
147 | chr22_KI270875v1_alt	259914
148 | chr1_KI270766v1_alt	256271
149 | chr19_KI270882v1_alt	248807
150 | chr3_KI270778v1_alt	248252
151 | chr17_KV766197v1_alt	246895
152 | chr6_KQ090016v1_fix	245716
153 | chr15_KI270849v1_alt	244917
154 | chr4_KI270786v1_alt	244096
155 | chr6_KZ208911v1_fix	242796
156 | chr19_KV575250v1_alt	241058
157 | chr12_KI270835v1_alt	238139
158 | chr4_KQ090015v1_alt	236512
159 | chr17_KI270858v1_alt	235827
160 | chr19_KI270867v1_alt	233762
161 | chr16_KI270855v1_alt	232857
162 | chr18_KZ559115v1_fix	230843
163 | chr4_KQ983257v1_fix	230434
164 | chr8_KI270926v1_alt	229282
165 | chr5_GL949742v1_alt	226852
166 | chr3_KI270780v1_alt	224108
167 | chr17_GL383565v1_alt	223995
168 | chr2_KI270774v1_alt	223625
169 | chr19_KV575256v1_alt	223118
170 | chr4_KI270790v1_alt	220246
171 | chr11_KI270927v1_alt	218612
172 | chr19_KI270932v1_alt	215732
173 | chr11_KI270903v1_alt	214625
174 | chr2_KI270894v1_alt	214158
175 | chr1_KQ458384v1_alt	212205
176 | chr12_KN196482v1_fix	211377
177 | chr14_GL000225v1_random	211173
178 | chrUn_KI270743v1	210658
179 | chr11_KI270832v1_alt	210133
180 | chr7_KI270805v1_alt	209988
181 | chrY_KZ208924v1_fix	209722
182 | chr4_GL000008v2_random	209709
183 | chr7_KI270809v1_alt	209586
184 | chr19_KI270887v1_alt	209512
185 | chr2_KN538362v1_fix	208149
186 | chr13_KN538371v1_fix	206320
187 | chr4_KI270789v1_alt	205944
188 | chr4_KQ983258v1_alt	205407
189 | chr3_KI270779v1_alt	205312
190 | chr19_KI270914v1_alt	205194
191 | chr18_KQ458385v1_alt	205101
192 | chr19_KI270886v1_alt	204239
193 | chr11_KI270829v1_alt	204059
194 | chr11_KN538368v1_alt	203552
195 | chr14_GL000009v2_random	201709
196 | chr21_GL383579v2_alt	201197
197 | chr11_JH159136v1_alt	200998
198 | chr19_KI270930v1_alt	200773
199 | chrUn_KI270747v1	198735
200 | chr18_GL383571v1_alt	198278
201 | chr19_KI270920v1_alt	198005
202 | chr3_KZ559102v1_alt	197752
203 | chr6_KI270797v1_alt	197536
204 | chr3_KI270935v1_alt	197351
205 | chr11_KQ759759v1_fix	196940
206 | chr17_KI270861v1_alt	196688
207 | chr15_KI270906v1_alt	196384
208 | chr5_KI270791v1_alt	195710
209 | chr3_KZ559105v1_alt	195063
210 | chr14_KI270722v1_random	194050
211 | chr16_GL383556v1_alt	192462
212 | chr13_KI270840v1_alt	191684
213 | chr14_GL000194v1_random	191469
214 | chr11_JH159137v1_alt	191409
215 | chr19_KI270917v1_alt	190932
216 | chr7_KI270899v1_alt	190869
217 | chr19_KI270923v1_alt	189352
218 | chr10_KI270825v1_alt	188315
219 | chr19_GL383576v1_alt	188024
220 | chrX_KV766199v1_alt	188004
221 | chr19_KI270922v1_alt	187935
222 | chrUn_KI270742v1	186739
223 | chr1_KN196472v1_fix	186494
224 | chr22_KI270878v1_alt	186262
225 | chr19_KI270929v1_alt	186203
226 | chr11_KI270826v1_alt	186169
227 | chr6_KB021644v2_alt	185823
228 | chr17_GL000205v2_random	185591
229 | chr10_KQ090020v1_alt	185507
230 | chr1_KI270765v1_alt	185285
231 | chr19_KI270916v1_alt	184516
232 | chr19_KI270890v1_alt	184499
233 | chr3_KI270784v1_alt	184404
234 | chr12_GL383551v1_alt	184319
235 | chr20_KI270870v1_alt	183433
236 | chrUn_GL000195v1	182896
237 | chr1_GL383518v1_alt	182439
238 | chr11_KQ090022v1_fix	181958
239 | chr22_KI270736v1_random	181920
240 | chr2_KZ208907v1_alt	181658
241 | chr10_KI270824v1_alt	181496
242 | chr11_KZ559111v1_alt	181167
243 | chr14_KI270845v1_alt	180703
244 | chr3_GL383526v1_alt	180671
245 | chr13_KI270839v1_alt	180306
246 | chr7_KQ031388v1_fix	179932
247 | chr22_KI270733v1_random	179772
248 | chrUn_GL000224v1	179693
249 | chr10_GL383545v1_alt	179254
250 | chrUn_GL000219v1	179198
251 | chr5_KI270792v1_alt	179043
252 | chr17_KI270860v1_alt	178921
253 | chr19_KV575252v1_alt	178197
254 | chr19_GL000209v2_alt	177381
255 | chr11_KI270830v1_alt	177092
256 | chr9_KI270719v1_random	176845
257 | chrUn_GL000216v2	176608
258 | chr22_KI270928v1_alt	176103
259 | chr1_KI270712v1_random	176043
260 | chr3_KZ208909v1_alt	175849
261 | chr6_KI270800v1_alt	175808
262 | chr1_KI270706v1_random	175055
263 | chr12_KZ208918v1_alt	174808
264 | chr22_KQ458388v1_alt	174749
265 | chr2_KI270776v1_alt	174166
266 | chr18_KI270912v1_alt	174061
267 | chr3_KI270777v1_alt	173649
268 | chr5_GL383531v1_alt	173459
269 | chr3_JH636055v2_alt	173151
270 | chr14_KI270725v1_random	172810
271 | chr5_KI270796v1_alt	172708
272 | chr7_KZ559106v1_alt	172555
273 | chr14_KZ208919v1_alt	171798
274 | chr9_GL383541v1_alt	171286
275 | chr19_KV575259v1_alt	171263
276 | chr19_KI270885v1_alt	171027
277 | chr19_KI270919v1_alt	170701
278 | chr19_KI270889v1_alt	170698
279 | chr19_KI270891v1_alt	170680
280 | chr19_KI270915v1_alt	170665
281 | chr19_KI270933v1_alt	170537
282 | chr19_KI270883v1_alt	170399
283 | chr19_GL383575v2_alt	170222
284 | chr19_KV575247v1_alt	170206
285 | chr19_KI270931v1_alt	170148
286 | chr12_GL383550v2_alt	169178
287 | chr16_KQ031390v1_alt	169136
288 | chr13_KI270841v1_alt	169134
289 | chrUn_KI270744v1	168472
290 | chr13_KQ090024v1_alt	168146
291 | chr19_KV575248v1_alt	168131
292 | chr18_KI270863v1_alt	167999
293 | chr18_GL383569v1_alt	167950
294 | chr12_GL877875v1_alt	167313
295 | chr21_KI270874v1_alt	166743
296 | chr19_KV575253v1_alt	166713
297 | chr3_KI270924v1_alt	166540
298 | chr1_KN196473v1_fix	166200
299 | chr1_KZ208904v1_alt	166136
300 | chr1_KI270761v1_alt	165834
301 | chr3_KQ031386v1_fix	165718
302 | chr3_KI270937v1_alt	165607
303 | chr8_KZ208914v1_fix	165120
304 | chr22_KI270734v1_random	165050
305 | chr18_GL383570v1_alt	164789
306 | chr5_KI270794v1_alt	164558
307 | chr4_GL383527v1_alt	164536
308 | chrUn_GL000213v1	164239
309 | chr3_KI270936v1_alt	164170
310 | chr3_KZ559101v1_alt	164041
311 | chr19_KV575246v1_alt	163926
312 | chr9_KQ090018v1_alt	163882
313 | chr4_KQ090014v1_alt	163749
314 | chr3_KI270934v1_alt	163458
315 | chr18_KZ559116v1_alt	163186
316 | chr9_GL383539v1_alt	162988
317 | chr3_KI270895v1_alt	162896
318 | chr22_GL383582v2_alt	162811
319 | chr3_KI270782v1_alt	162429
320 | chr1_KI270892v1_alt	162212
321 | chrUn_GL000220v1	161802
322 | chr2_KI270767v1_alt	161578
323 | chr2_KI270715v1_random	161471
324 | chr2_KI270893v1_alt	161218
325 | chrUn_GL000218v1	161147
326 | chr19_KV575255v1_alt	161095
327 | chr18_GL383572v1_alt	159547
328 | chr19_KV575251v1_alt	159285
329 | chr8_KI270817v1_alt	158983
330 | chr4_KI270788v1_alt	158965
331 | chrUn_KI270749v1	158759
332 | chr7_KI270806v1_alt	158166
333 | chr7_KI270804v1_alt	157952
334 | chr18_KI270911v1_alt	157710
335 | chrUn_KI270741v1	157432
336 | chr17_KI270910v1_alt	157099
337 | chr19_KI270884v1_alt	157053
338 | chr8_KV880766v1_fix	156998
339 | chr19_KV575258v1_alt	156965
340 | chr22_KN196485v1_alt	156562
341 | chr22_KQ458387v1_alt	155930
342 | chr19_GL383574v1_alt	155864
343 | chr19_KI270888v1_alt	155532
344 | chr3_GL000221v1_random	155397
345 | chr17_KV575245v1_fix	154723
346 | chr11_GL383547v1_alt	154407
347 | chr12_KZ559112v1_alt	154139
348 | chr2_KI270716v1_random	153799
349 | chr22_KN196486v1_alt	153027
350 | chr12_GL383553v2_alt	152874
351 | chr6_KI270799v1_alt	152148
352 | chr22_KI270731v1_random	150754
353 | chrUn_KI270751v1	150742
354 | chrUn_KI270750v1	148850
355 | chr13_KN538373v1_fix	148762
356 | chr19_KV575260v1_alt	145691
357 | chr8_KI270818v1_alt	145606
358 | chr22_KQ759761v1_alt	145162
359 | chrX_KI270881v1_alt	144206
360 | chr21_KI270873v1_alt	143900
361 | chr2_GL383521v1_alt	143390
362 | chr7_KV880764v1_fix	142129
363 | chr8_KI270814v1_alt	141812
364 | chr1_KQ458382v1_alt	141019
365 | chr11_KV766195v1_fix	140877
366 | chr2_KZ208908v1_alt	140361
367 | chr1_KZ208905v1_alt	140355
368 | chr6_KV766194v1_fix	139427
369 | chr5_KN196477v1_alt	139087
370 | chr12_GL383552v1_alt	138655
371 | chrUn_KI270519v1	138126
372 | chr2_KI270775v1_alt	138019
373 | chr17_KI270907v1_alt	137721
374 | chrUn_GL000214v1	137718
375 | chr8_KI270901v1_alt	136959
376 | chr2_KI270770v1_alt	136240
377 | chr5_KZ208910v1_alt	135987
378 | chr16_KI270854v1_alt	134193
379 | chr9_KQ090019v1_alt	134099
380 | chr8_KI270819v1_alt	133535
381 | chr17_GL383564v2_alt	133151
382 | chr2_KI270772v1_alt	133041
383 | chr8_KI270815v1_alt	132244
384 | chr5_KI270795v1_alt	131892
385 | chr5_KI270898v1_alt	130957
386 | chr20_GL383577v2_alt	128386
387 | chr1_KI270708v1_random	127682
388 | chr7_KI270807v1_alt	126434
389 | chr5_KI270793v1_alt	126136
390 | chr6_GL383533v1_alt	124736
391 | chr2_GL383522v1_alt	123821
392 | chr13_KQ090025v1_alt	123480
393 | chr19_KI270918v1_alt	123111
394 | chr1_KN196474v1_fix	122022
395 | chr12_GL383549v1_alt	120804
396 | chr2_KI270769v1_alt	120616
397 | chr4_KI270785v1_alt	119912
398 | chr12_KI270834v1_alt	119498
399 | chr7_GL383534v2_alt	119183
400 | chr20_KI270869v1_alt	118774
401 | chr17_KZ559114v1_alt	116753
402 | chr21_GL383581v2_alt	116689
403 | chr3_KI270781v1_alt	113034
404 | chr17_KI270730v1_random	112551
405 | chrUn_KI270438v1	112505
406 | chr4_KI270787v1_alt	111943
407 | chr18_KI270864v1_alt	111737
408 | chr2_KI270771v1_alt	110395
409 | chr1_GL383519v1_alt	110268
410 | chr2_KI270768v1_alt	110099
411 | chr1_KI270760v1_alt	109528
412 | chr12_KQ090023v1_alt	109323
413 | chr3_KI270783v1_alt	109187
414 | chr11_KN196481v1_fix	108875
415 | chr17_KI270859v1_alt	108763
416 | chr11_KI270902v1_alt	106711
417 | chr3_KZ559104v1_fix	105527
418 | chr18_GL383568v1_alt	104552
419 | chr22_KI270737v1_random	103838
420 | chr13_KI270843v1_alt	103832
421 | chr8_KZ559107v1_alt	103072
422 | chr22_KI270877v1_alt	101331
423 | chr5_GL383530v1_alt	101241
424 | chrY_KN196487v1_fix	101150
425 | chr22_KQ759762v1_fix	101037
426 | chr19_KV575257v1_alt	100553
427 | chr11_KI270721v1_random	100316
428 | chr19_KV575254v1_alt	99845
429 | chr22_KI270738v1_random	99375
430 | chr22_GL383583v2_alt	96924
431 | chr2_GL582966v2_alt	96131
432 | chrUn_KI270748v1	93321
433 | chr18_KZ208922v1_fix	93070
434 | chrUn_KI270435v1	92983
435 | chr5_GL000208v1_random	92689
436 | chrUn_KI270538v1	91309
437 | chr4_KQ090013v1_alt	90922
438 | chr17_GL383566v1_alt	90219
439 | chr16_GL383557v1_alt	89672
440 | chr17_JH159148v1_alt	88070
441 | chr12_KN538370v1_fix	86533
442 | chr10_KN538366v1_fix	85284
443 | chr5_GL383532v1_alt	82728
444 | chr21_KI270872v1_alt	82692
445 | chr6_KQ090017v1_alt	82315
446 | chrUn_KI270756v1	79590
447 | chr16_KZ208921v1_alt	78609
448 | chr6_KI270758v1_alt	76752
449 | chr12_KI270833v1_alt	76061
450 | chr6_KI270802v1_alt	75005
451 | chr21_GL383580v2_alt	74653
452 | chr22_KB663609v1_alt	74013
453 | chr22_KI270739v1_random	73985
454 | chr9_GL383540v1_alt	71551
455 | chrUn_KI270757v1	71251
456 | chr2_KI270773v1_alt	70887
457 | chr17_JH159147v1_alt	70345
458 | chr11_KI270827v1_alt	67707
459 | chr1_KI270709v1_random	66860
460 | chrUn_KI270746v1	66486
461 | chr12_KZ208917v1_fix	64689
462 | chr16_KI270856v1_alt	63982
463 | chr21_GL383578v2_alt	63917
464 | chrUn_KI270753v1	62944
465 | chr19_KI270868v1_alt	61734
466 | chr9_GL383542v1_alt	60032
467 | chr16_KQ090026v1_alt	59016
468 | chr20_KI270871v1_alt	58661
469 | chr12_KI270836v1_alt	56134
470 | chr19_KI270865v1_alt	52969
471 | chr1_KI270764v1_alt	50258
472 | chrY_KZ208923v1_fix	48370
473 | chr1_KZ559100v1_fix	44955
474 | chrUn_KI270589v1	44474
475 | chr14_KI270726v1_random	43739
476 | chr19_KI270866v1_alt	43156
477 | chr22_KI270735v1_random	42811
478 | chr1_KI270711v1_random	42210
479 | chrUn_KI270745v1	41891
480 | chr1_KI270714v1_random	41717
481 | chr22_KI270732v1_random	41543
482 | chr1_KI270713v1_random	40745
483 | chrUn_KI270754v1	40191
484 | chr1_KI270710v1_random	40176
485 | chr12_KI270837v1_alt	40090
486 | chr9_KI270717v1_random	40062
487 | chr14_KI270724v1_random	39555
488 | chr9_KI270720v1_random	39050
489 | chr14_KI270723v1_random	38115
490 | chr9_KI270718v1_random	38054
491 | chrUn_KI270317v1	37690
492 | chr13_KI270842v1_alt	37287
493 | chrY_KI270740v1_random	37240
494 | chrUn_KI270755v1	36723
495 | chr8_KI270820v1_alt	36640
496 | chr13_KN196483v1_fix	35455
497 | chr1_KI270707v1_random	32032
498 | chrUn_KI270579v1	31033
499 | chrUn_KI270752v1	27745
500 | chrUn_KI270512v1	22689
501 | chrUn_KI270322v1	21476
502 | chrM	16569
503 | chrUn_GL000226v1	15008
504 | chr10_KN538365v1_fix	14347
505 | chrUn_KI270311v1	12399
506 | chrUn_KI270366v1	8320
507 | chrUn_KI270511v1	8127
508 | chrUn_KI270448v1	7992
509 | chrUn_KI270521v1	7642
510 | chrUn_KI270581v1	7046
511 | chrUn_KI270582v1	6504
512 | chrUn_KI270515v1	6361
513 | chrUn_KI270588v1	6158
514 | chrUn_KI270591v1	5796
515 | chrUn_KI270522v1	5674
516 | chrUn_KI270507v1	5353
517 | chrUn_KI270590v1	4685
518 | chrUn_KI270584v1	4513
519 | chrUn_KI270320v1	4416
520 | chrUn_KI270382v1	4215
521 | chrUn_KI270468v1	4055
522 | chrUn_KI270467v1	3920
523 | chrUn_KI270362v1	3530
524 | chrUn_KI270517v1	3253
525 | chrUn_KI270593v1	3041
526 | chrUn_KI270528v1	2983
527 | chrUn_KI270587v1	2969
528 | chrUn_KI270364v1	2855
529 | chrUn_KI270371v1	2805
530 | chrUn_KI270333v1	2699
531 | chrUn_KI270374v1	2656
532 | chrUn_KI270411v1	2646
533 | chrUn_KI270414v1	2489
534 | chrUn_KI270510v1	2415
535 | chrUn_KI270390v1	2387
536 | chrUn_KI270375v1	2378
537 | chrUn_KI270420v1	2321
538 | chrUn_KI270509v1	2318
539 | chrUn_KI270315v1	2276
540 | chrUn_KI270302v1	2274
541 | chrUn_KI270518v1	2186
542 | chrUn_KI270530v1	2168
543 | chrUn_KI270304v1	2165
544 | chrUn_KI270418v1	2145
545 | chrUn_KI270424v1	2140
546 | chrUn_KI270417v1	2043
547 | chrUn_KI270508v1	1951
548 | chrUn_KI270303v1	1942
549 | chrUn_KI270381v1	1930
550 | chrUn_KI270529v1	1899
551 | chrUn_KI270425v1	1884
552 | chrUn_KI270396v1	1880
553 | chrUn_KI270363v1	1803
554 | chrUn_KI270386v1	1788
555 | chrUn_KI270465v1	1774
556 | chrUn_KI270383v1	1750
557 | chrUn_KI270384v1	1658
558 | chrUn_KI270330v1	1652
559 | chrUn_KI270372v1	1650
560 | chrUn_KI270548v1	1599
561 | chrUn_KI270580v1	1553
562 | chrUn_KI270387v1	1537
563 | chrUn_KI270391v1	1484
564 | chrUn_KI270305v1	1472
565 | chrUn_KI270373v1	1451
566 | chrUn_KI270422v1	1445
567 | chrUn_KI270316v1	1444
568 | chrUn_KI270338v1	1428
569 | chrUn_KI270340v1	1428
570 | chrUn_KI270583v1	1400
571 | chrUn_KI270334v1	1368
572 | chrUn_KI270429v1	1361
573 | chrUn_KI270393v1	1308
574 | chrUn_KI270516v1	1300
575 | chrUn_KI270389v1	1298
576 | chrUn_KI270466v1	1233
577 | chrUn_KI270388v1	1216
578 | chrUn_KI270544v1	1202
579 | chrUn_KI270310v1	1201
580 | chrUn_KI270412v1	1179
581 | chrUn_KI270395v1	1143
582 | chrUn_KI270376v1	1136
583 | chrUn_KI270337v1	1121
584 | chrUn_KI270335v1	1048
585 | chrUn_KI270378v1	1048
586 | chrUn_KI270379v1	1045
587 | chrUn_KI270329v1	1040
588 | chrUn_KI270419v1	1029
589 | chrUn_KI270336v1	1026
590 | chrUn_KI270312v1	998
591 | chrUn_KI270539v1	993
592 | chrUn_KI270385v1	990
593 | chrUn_KI270423v1	981
594 | chrUn_KI270392v1	971
595 | chrUn_KI270394v1	970
596 | 


--------------------------------------------------------------------------------