├── Makefile ├── example.png ├── guess ├── hg19.genes.gz ├── hg38.genes.gz ├── hg19.genome.gz ├── hg38.genome.gz ├── split_exon.pl ├── check_size.pl └── README.md ├── gencode ├── gene_type.png ├── genomic_region_length.png ├── genomic_region_proportion.png ├── gencode.v20.annotation.gtf.stats ├── gencode.v21.annotation.gtf.stats ├── gencode.v22.annotation.gtf.stats ├── gencode.v23.annotation.gtf.stats ├── gencode.v24.annotation.gtf.stats ├── gencode.v25.annotation.gtf.stats ├── gencode.v26.annotation.gtf.stats ├── gencode.v27.annotation.gtf.stats ├── gencode.v28.annotation.gtf.stats ├── gencode.v29.annotation.gtf.stats ├── gencode.v30.annotation.gtf.stats ├── gencode.v31.annotation.gtf.stats ├── gencode.v32.annotation.gtf.stats ├── gencode.v33.annotation.gtf.stats ├── gencode.v34.annotation.gtf.stats ├── gencode.v35.annotation.gtf.stats ├── README.md └── plot_stats.Rmd ├── .gitignore ├── chrom_info ├── araTha1.genome ├── README.md ├── hg19.genome └── hg38.genome ├── script ├── README.md ├── gtf_to_bed.pl ├── gtf_to_bed_single.pl ├── merge_by_id.pl └── gtf_add_exon.pl ├── coverage.pl ├── check_utr.pl ├── print_utr.pl ├── run.sh ├── promoter.pl ├── liftover └── README.md ├── run.pl └── README.md /Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | ./run.sh 3 | 4 | clean: 5 | rm -rf bedtools2 *.gz 6 | -------------------------------------------------------------------------------- /example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davetang/defining_genomic_regions/HEAD/example.png -------------------------------------------------------------------------------- /guess/hg19.genes.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davetang/defining_genomic_regions/HEAD/guess/hg19.genes.gz -------------------------------------------------------------------------------- 
/guess/hg38.genes.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davetang/defining_genomic_regions/HEAD/guess/hg38.genes.gz -------------------------------------------------------------------------------- /gencode/gene_type.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davetang/defining_genomic_regions/HEAD/gencode/gene_type.png -------------------------------------------------------------------------------- /guess/hg19.genome.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davetang/defining_genomic_regions/HEAD/guess/hg19.genome.gz -------------------------------------------------------------------------------- /guess/hg38.genome.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davetang/defining_genomic_regions/HEAD/guess/hg38.genome.gz -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | bedtools2 2 | transcript* 3 | *.bed.gz 4 | *.gtf.gz 5 | gencode/*.gtf.gz 6 | my_* 7 | *.swp 8 | .DS_Store 9 | -------------------------------------------------------------------------------- /chrom_info/araTha1.genome: -------------------------------------------------------------------------------- 1 | 1 30427671 2 | 5 26975502 3 | 3 23459830 4 | 2 19698289 5 | 4 18585056 6 | Mt 366924 7 | Pt 154478 8 | -------------------------------------------------------------------------------- /gencode/genomic_region_length.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davetang/defining_genomic_regions/HEAD/gencode/genomic_region_length.png 
-------------------------------------------------------------------------------- /gencode/genomic_region_proportion.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/davetang/defining_genomic_regions/HEAD/gencode/genomic_region_proportion.png -------------------------------------------------------------------------------- /gencode/gencode.v20.annotation.gtf.stats: -------------------------------------------------------------------------------- 1 | exon_coverage: 3.70 2 | intron_coverage: 46.64 3 | intergenic_coverage: 49.66 4 | exon_length: 410.43 5 | intron_length: 5035.45 6 | intergenic_length: 48843.25 7 | -------------------------------------------------------------------------------- /gencode/gencode.v21.annotation.gtf.stats: -------------------------------------------------------------------------------- 1 | exon_coverage: 3.79 2 | intron_coverage: 46.75 3 | intergenic_coverage: 49.46 4 | exon_length: 418.99 5 | intron_length: 5021.59 6 | intergenic_length: 48055.09 7 | -------------------------------------------------------------------------------- /gencode/gencode.v22.annotation.gtf.stats: -------------------------------------------------------------------------------- 1 | exon_coverage: 3.82 2 | intron_coverage: 46.88 3 | intergenic_coverage: 49.30 4 | exon_length: 421.22 5 | intron_length: 5014.63 6 | intergenic_length: 47992.55 7 | -------------------------------------------------------------------------------- /gencode/gencode.v23.annotation.gtf.stats: -------------------------------------------------------------------------------- 1 | exon_coverage: 3.81 2 | intron_coverage: 46.90 3 | intergenic_coverage: 49.29 4 | exon_length: 419.83 5 | intron_length: 5011.03 6 | intergenic_length: 47987.25 7 | -------------------------------------------------------------------------------- /gencode/gencode.v24.annotation.gtf.stats: 
-------------------------------------------------------------------------------- 1 | exon_coverage: 3.82 2 | intron_coverage: 47.01 3 | intergenic_coverage: 49.17 4 | exon_length: 419.99 5 | intron_length: 5023.59 6 | intergenic_length: 47595.16 7 | -------------------------------------------------------------------------------- /gencode/gencode.v25.annotation.gtf.stats: -------------------------------------------------------------------------------- 1 | exon_coverage: 3.84 2 | intron_coverage: 47.41 3 | intergenic_coverage: 48.75 4 | exon_length: 422.98 5 | intron_length: 5056.68 6 | intergenic_length: 48517.51 7 | -------------------------------------------------------------------------------- /gencode/gencode.v26.annotation.gtf.stats: -------------------------------------------------------------------------------- 1 | exon_coverage: 3.86 2 | intron_coverage: 47.46 3 | intergenic_coverage: 48.69 4 | exon_length: 424.52 5 | intron_length: 5063.18 6 | intergenic_length: 48444.65 7 | -------------------------------------------------------------------------------- /gencode/gencode.v27.annotation.gtf.stats: -------------------------------------------------------------------------------- 1 | exon_coverage: 3.88 2 | intron_coverage: 47.63 3 | intergenic_coverage: 48.49 4 | exon_length: 426.11 5 | intron_length: 5060.58 6 | intergenic_length: 48331.74 7 | -------------------------------------------------------------------------------- /gencode/gencode.v28.annotation.gtf.stats: -------------------------------------------------------------------------------- 1 | exon_coverage: 3.92 2 | intron_coverage: 48.14 3 | intergenic_coverage: 47.94 4 | exon_length: 428.63 5 | intron_length: 5069.04 6 | intergenic_length: 48263.94 7 | -------------------------------------------------------------------------------- /gencode/gencode.v29.annotation.gtf.stats: -------------------------------------------------------------------------------- 1 | exon_coverage: 3.95 2 | intron_coverage: 
48.94 3 | intergenic_coverage: 47.10 4 | exon_length: 430.83 5 | intron_length: 5092.15 6 | intergenic_length: 47885.29 7 | -------------------------------------------------------------------------------- /gencode/gencode.v30.annotation.gtf.stats: -------------------------------------------------------------------------------- 1 | exon_coverage: 3.99 2 | intron_coverage: 49.50 3 | intergenic_coverage: 46.50 4 | exon_length: 433.69 5 | intron_length: 5107.94 6 | intergenic_length: 47462.92 7 | -------------------------------------------------------------------------------- /gencode/gencode.v31.annotation.gtf.stats: -------------------------------------------------------------------------------- 1 | exon_coverage: 4.24 2 | intron_coverage: 51.95 3 | intergenic_coverage: 43.81 4 | exon_length: 449.80 5 | intron_length: 5176.52 6 | intergenic_length: 45084.36 7 | -------------------------------------------------------------------------------- /gencode/gencode.v32.annotation.gtf.stats: -------------------------------------------------------------------------------- 1 | exon_coverage: 4.25 2 | intron_coverage: 51.99 3 | intergenic_coverage: 43.76 4 | exon_length: 450.53 5 | intron_length: 5174.38 6 | intergenic_length: 45058.76 7 | -------------------------------------------------------------------------------- /gencode/gencode.v33.annotation.gtf.stats: -------------------------------------------------------------------------------- 1 | exon_coverage: 4.25 2 | intron_coverage: 51.98 3 | intergenic_coverage: 43.77 4 | exon_length: 451.04 5 | intron_length: 5168.91 6 | intergenic_length: 45024.93 7 | -------------------------------------------------------------------------------- /gencode/gencode.v34.annotation.gtf.stats: -------------------------------------------------------------------------------- 1 | exon_coverage: 4.27 2 | intron_coverage: 51.97 3 | intergenic_coverage: 43.76 4 | exon_length: 452.82 5 | intron_length: 5167.13 6 | intergenic_length: 45019.35 7 | 
-------------------------------------------------------------------------------- /gencode/gencode.v35.annotation.gtf.stats: -------------------------------------------------------------------------------- 1 | exon_coverage: 4.29 2 | intron_coverage: 52.03 3 | intergenic_coverage: 43.68 4 | exon_length: 454.52 5 | intron_length: 5161.43 6 | intergenic_length: 45067.04 7 | -------------------------------------------------------------------------------- /guess/split_exon.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | while(<>){ 7 | chomp; 8 | next if /^chrom/; 9 | my ($chr, $starts, $ends) = split(/\t/); 10 | my @starts = split(/,/, $starts); 11 | my @ends = split(/,/, $ends); 12 | foreach my $i (0..$#starts){ 13 | print join("\t", $chr, $starts[$i], $ends[$i]), "\n"; 14 | } 15 | } 16 | 17 | exit(); 18 | 19 | -------------------------------------------------------------------------------- /chrom_info/README.md: -------------------------------------------------------------------------------- 1 | ## README 2 | 3 | Download genome annotations. 
4 | 5 | ```bash 6 | for genome in hg19 hg38; do 7 | mysql --user=genome \ 8 | --host=genome-mysql.cse.ucsc.edu \ 9 | -A \ 10 | -e "select chrom, size from ${genome}.chromInfo" | grep -v "^chrom" > ${genome}.genome 11 | done 12 | 13 | wget -q -O - http://genome-test.cse.ucsc.edu/~hiram/hubs/Plants/araTha1/araTha1.chrom.sizes | 14 | sed 's/^chr//' | 15 | sed 's/Cp/Pt/' > araTha1.genome 16 | ``` 17 | 18 | -------------------------------------------------------------------------------- /script/README.md: -------------------------------------------------------------------------------- 1 | ## README 2 | 3 | Download GTF file 4 | 5 | wget -c -N ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_33/gencode.v33.annotation.gtf.gz 6 | 7 | Convert to BED using "gene" entries in the GTF file 8 | 9 | ./gtf_to_bed.pl -i gencode.v33.annotation.gtf.gz -f gene > gencode.v33.gene.bed 10 | 11 | ## Scripts 12 | 13 | The `merge_by_id.pl` script will merge by the annotation column (column 4). This is useful when you only want to merge features with the same ID; I could not find an elegant solution, so I wrote this script. It requires `bedtools` and will create a lot of temporary files (one per feature in the BED file). It works by running `bedtools merge` on each feature, so it can be very slow with files with a lot of different features. You can specify more threads if you have multiple cores on your system. 
14 | 15 | -------------------------------------------------------------------------------- /coverage.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | my $v = 19; 7 | 8 | my $exon_file = "gencode_v${v}_exon_merged.bed.gz"; 9 | my $intergenic_file = "gencode_v${v}_intergenic.bed.gz"; 10 | my $intron_file = "gencode_v${v}_intron.bed.gz"; 11 | 12 | my $exon_coverage = coverage($exon_file); 13 | my $intergenic_coverage = coverage($intergenic_file); 14 | my $intron_coverage = coverage($intron_file); 15 | 16 | my $total = $exon_coverage + $intergenic_coverage + $intron_coverage; 17 | 18 | printf "Exon: %.2f\n", $exon_coverage*100/$total; 19 | printf "Intron: %.2f\n", $intron_coverage*100/$total; 20 | printf "Intergenic: %.2f\n", $intergenic_coverage*100/$total; 21 | 22 | sub coverage { # returns the summed interval length (end - start) of a gzipped BED file 23 | my ($infile) = @_; 24 | my $coverage = 0; 25 | open(IN,'-|',"zcat $infile") || die "Could not open $infile: $!\n"; 26 | while(<IN>){ # read one BED record per iteration from the zcat pipe 27 | chomp; 28 | my ($chr, $start, $end) = split(/\t/); 29 | my $c = $end - $start; 30 | $coverage += $c; 31 | } 32 | close(IN); 33 | return($coverage); 34 | } 35 | 36 | exit(0); 37 | -------------------------------------------------------------------------------- /gencode/README.md: -------------------------------------------------------------------------------- 1 | ## README 2 | 3 | Setup latest version of BEDTools. 4 | 5 | ```bash 6 | wget https://github.com/arq5x/bedtools2/releases/download/v2.29.2/bedtools-2.29.2.tar.gz 7 | tar -xzf bedtools-2.29.2.tar.gz 8 | cd bedtools2 9 | make all 10 | 11 | cd ~/bin/ 12 | ln -s ~/src/bedtools2/bin/bedtools 13 | 14 | bedtools --version 15 | # bedtools v2.29.2 16 | ``` 17 | 18 | Download GENCODE GTF files. 19 | 20 | ```bash 21 | parallel --verbose wget ftp://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_{}/gencode.v{}.annotation.gtf.gz ::: {20..35} 22 | ``` 23 | 24 | Calculate stats. 
25 | 26 | ```bash 27 | parallel --verbose "../run.pl {} ../chrom_info/hg38.genome > {.}.stats" ::: *.gtf.gz 28 | ``` 29 | 30 | Plot using `plot_stats.Rmd`. 31 | 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /guess/check_size.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | use Getopt::Std; 6 | 7 | my %opts = (); 8 | getopts('h:s:b:', \%opts); 9 | 10 | if ($opts{'h'} || 11 | !exists $opts{'s'} || 12 | !exists $opts{'b'} 13 | ){ 14 | usage(); 15 | } 16 | 17 | my $genome = $opts{'s'}; 18 | my %sizes = store_size($genome); 19 | my $bed = $opts{'b'}; 20 | 21 | my $fh; 22 | if ($bed =~ /\.gz$/){ 23 | open($fh, '-|', "gunzip -c $bed") or die "Could not open $bed $!\n"; 24 | } else { 25 | open($fh, '<', $bed) or die "Could not open $bed $!\n"; 26 | } 27 | 28 | while(<$fh>){ 29 | chomp; 30 | next if /^browser/ || /^track/; 31 | my ($chr, $start, $end, @rest) = split(/\t/); 32 | if (exists $sizes{$chr}){ 33 | my $size = $sizes{$chr}; 34 | if (++$end > $size){ 35 | warn("$chr:$start-$end greater than $size\n"); 36 | } 37 | } else { 38 | die("$chr does not exist in $genome\n"); 39 | } 40 | } 41 | close($fh); 42 | 43 | sub usage { 44 | print STDERR <){ 65 | chomp; 66 | next if /^chrom/; 67 | my ($chrom, $size) = split(/\t/); 68 | $sizes{$chrom} = $size; 69 | } 70 | close($fh); 71 | return(%sizes); 72 | } 73 | 74 | -------------------------------------------------------------------------------- /check_utr.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | 6 | my $usage = "Usage: $0 \n"; 7 | my $infile = shift or die $usage; 8 | 9 | if ($infile =~ /\.gz/){ 10 | open(IN,'-|',"gunzip -c $infile") || die "Could not open $infile: $!\n"; 11 | } else { 12 | open(IN,'<',$infile) || die "Could not open $infile: $!\n"; 13 | } 14 | 
15 | my %transcript = (); 16 | my $current_transcript = ''; 17 | 18 | while(<IN>){ # read one GTF record per iteration 19 | chomp; 20 | next if (/^#/); 21 | #chr11 HAVANA transcript 65265233 65273940 . + . gene_id "ENSG00000251562.3"; transcript_id "ENST00000534336.1"; gene_type "processed_transcript"; gene_status "KNOWN"; gene_name "MALAT1"; transcript_type "non_coding"; transcript_status "KNOWN"; transcript_name "MALAT1-001"; level 2; havana_gene "OTTHUMG00000166322.1"; havana_transcript "OTTHUMT00000389143.1"; 22 | my ($chr,$source,$type,$start,$end,$score,$strand,$phase,$annotation) = split(/\t/); 23 | 24 | my @annotation = split(/;\s/,$annotation); 25 | my $transcript_id = 'none'; 26 | 27 | if ($type eq 'transcript'){ $current_transcript = 'none'; # reset the sentinel so a record without transcript_id is caught below 28 | foreach my $blah (@annotation){ 29 | my ($type,$name) = split(/\s+/,$blah); 30 | if ($type eq 'transcript_id'){ 31 | $current_transcript = $name; 32 | $current_transcript =~ s/"//g; 33 | $transcript{$current_transcript} = 0; 34 | } 35 | } 36 | if ($current_transcript eq 'none'){ 37 | die "No name for entry $.\n"; 38 | } 39 | } 40 | 41 | if ($type eq 'UTR'){ 42 | $transcript{$current_transcript}++; 43 | } 44 | } 45 | close(IN); 46 | 47 | foreach my $transcript (keys %transcript){ 48 | print "$transcript\t$transcript{$transcript}\n"; 49 | } 50 | 51 | exit(0); 52 | -------------------------------------------------------------------------------- /script/gtf_to_bed.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # Converts a GTF file into a BED file; the script will use "gene_id" as the BED name, if it exists in the attributes 4 | # 5 | 6 | use warnings; 7 | use strict; 8 | use Getopt::Std; 9 | 10 | my %opts = (); 11 | getopts('i:f:h:u:d:', \%opts); 12 | 13 | if ($opts{'h'} || 14 | !exists $opts{'f'} || 15 | !exists $opts{'i'} 16 | ){ 17 | usage(); 18 | } 19 | 20 | my $gtf = $opts{'i'}; 21 | my $my_feature = $opts{'f'}; 22 | my $up = 0; 23 | my $down = 0; 24 | if (exists $opts{'u'}){ 25 | $up = $opts{'u'}; 26 | } 27 
| if (exists $opts{'d'}){ 28 | $down = $opts{'d'}; # -d sets the downstream padding (previously overwrote $up) 29 | } 30 | 31 | if ($gtf =~ /\.gz$/){ 32 | open(IN, '-|', "gunzip -c $gtf") || die "Could not open $gtf: $!\n"; 33 | } else { 34 | open(IN, '<', $gtf) || die "Could not open $gtf: $!\n"; 35 | } 36 | 37 | while(<IN>){ # read one GTF record per iteration 38 | chomp; 39 | next if /^#/; 40 | my ($sequence, $source, $feature, $start, $end, $score, $strand, $phase, $attributes) = split(/\t/); 41 | next unless $feature eq $my_feature; 42 | 43 | my $name = '.'; 44 | if ($attributes =~ /gene_id\s"([a-zA-Z0-9._]+)";/){ 45 | $name = $1; 46 | } 47 | 48 | # BED is 0-based 49 | $start -= 1; 50 | 51 | if ($strand eq '+'){ 52 | $start = $start - $up; 53 | $end = $end + $down; 54 | if ($start < 0){ 55 | $start = 0; 56 | } 57 | } elsif ($strand eq '-'){ 58 | $start = $start - $down; 59 | $end = $end + $up; 60 | if ($start < 0){ 61 | $start = 0; 62 | } 63 | } 64 | 65 | print join("\t", $sequence, $start, $end, $name, $score, $strand), "\n"; 66 | 67 | } 68 | close(IN); 69 | 70 | sub usage { 71 | print STDERR <){ 21 | chomp; 22 | next if (/^#/); 23 | #chr11 HAVANA transcript 65265233 65273940 . + . 
gene_id "ENSG00000251562.3"; transcript_id "ENST00000534336.1"; gene_type "processed_transcript"; gene_status "KNOWN"; gene_name "MALAT1"; transcript_type "non_coding"; transcript_status "KNOWN"; transcript_name "MALAT1-001"; level 2; havana_gene "OTTHUMG00000166322.1"; havana_transcript "OTTHUMT00000389143.1"; 24 | my ($chr,$source,$type,$start,$end,$score,$strand,$phase,$annotation) = split(/\t/); 25 | 26 | my @annotation = split(/;\s/,$annotation); 27 | my $transcript_id = 'none'; 28 | 29 | if ($type eq 'transcript'){ 30 | foreach my $blah (@annotation){ 31 | my ($type,$name) = split(/\s+/,$blah); 32 | if ($type eq 'transcript_id'){ 33 | $current_transcript = $name; 34 | $current_transcript =~ s/"//g; 35 | $transcript_start = $start; 36 | $transcript_end = $end; 37 | } 38 | } 39 | if ($current_transcript eq 'none'){ 40 | die "No name for entry $.\n"; 41 | } 42 | } 43 | 44 | if ($type eq 'UTR'){ 45 | my $region = ''; 46 | if ($strand eq '+'){ 47 | my $dis_to_start = abs($start - $transcript_start); 48 | my $dis_to_end = abs($start - $transcript_end); 49 | $region = $dis_to_start < $dis_to_end ? '5_UTR' : '3_UTR'; 50 | } else { 51 | my $dis_to_start = abs($end - $transcript_end); 52 | my $dis_to_end = abs($end - $transcript_start); 53 | $region = $dis_to_start < $dis_to_end ? '5_UTR' : '3_UTR'; 54 | } 55 | print join ("\t", $chr, $start, $end, $region, $current_transcript, $strand),"\n"; 56 | } 57 | } 58 | close(IN); 59 | 60 | exit(0); 61 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo Checking for bedtools 4 | 5 | if [ ! -d bedtools2 ] 6 | then 7 | git clone https://github.com/arq5x/bedtools2.git 8 | cd bedtools2 9 | make clean; make all 10 | cd .. 11 | fi 12 | 13 | echo Downloading GENCODE annotations 14 | 15 | v=19 16 | 17 | if [ ! 
-f gencode.v$v.annotation.gtf.gz ] 18 | then 19 | wget ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_human/release_$v/gencode.v$v.annotation.gtf.gz 20 | fi 21 | 22 | echo Creating exonic regions 23 | 24 | if [ ! -f gencode_v${v}_exon_merged.bed.gz ] 25 | then 26 | gunzip -c gencode.v$v.annotation.gtf.gz | 27 | awk 'BEGIN{OFS="\t";} $3=="exon" {print $1,$4-1,$5}' | 28 | bedtools2/bin/sortBed | 29 | bedtools2/bin/mergeBed -i - | gzip > gencode_v${v}_exon_merged.bed.gz 30 | fi 31 | 32 | echo Creating intronic regions 33 | 34 | if [ ! -f gencode_v${v}_intron.bed.gz ] 35 | then 36 | gunzip -c gencode.v$v.annotation.gtf.gz | 37 | awk 'BEGIN{OFS="\t";} $3=="gene" {print $1,$4-1,$5}' | 38 | bedtools2/bin/sortBed | 39 | bedtools2/bin/subtractBed -a stdin -b gencode_v${v}_exon_merged.bed.gz | 40 | gzip > gencode_v${v}_intron.bed.gz 41 | fi 42 | 43 | # echo Downloading hg19 coordinates 44 | # 45 | # if [ ! -f hg19.genome ] 46 | # then 47 | # mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -e \ 48 | # "select chrom, size from hg19.chromInfo" > hg19.genome 49 | # fi 50 | 51 | echo Creating intergenic regions 52 | 53 | if [ ! -f gencode_v${v}_intergenic.bed.gz ] 54 | then 55 | gunzip -c gencode.v$v.annotation.gtf.gz | 56 | awk 'BEGIN{OFS="\t";} $3=="gene" {print $1,$4-1,$5}' | 57 | sort -k1,1V -k2,2n | 58 | bedtools2/bin/complementBed -i stdin -g hg19.genome | 59 | gzip > gencode_v${v}_intergenic.bed.gz 60 | fi 61 | 62 | echo Counting UTRs 63 | 64 | if [ ! -f transcript_utr_number.out.gz ] 65 | then 66 | perl check_utr.pl gencode.v$v.annotation.gtf.gz | gzip > transcript_utr_number.out.gz 67 | fi 68 | 69 | echo Creating UTRs 70 | 71 | if [ ! -f transcript_utr.bed.gz ] 72 | then 73 | perl print_utr.pl gencode.v$v.annotation.gtf.gz | gzip > transcript_utr.bed.gz 74 | fi 75 | 76 | echo Creating promoter region 77 | 78 | if [ ! 
-f promoter.bed.gz ] 79 | then 80 | perl promoter.pl gencode.v$v.annotation.gtf.gz 200 | gzip > promoter.bed.gz 81 | fi 82 | 83 | 84 | echo Done 85 | -------------------------------------------------------------------------------- /script/gtf_to_bed_single.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # Output BED file with features of interest for genes with only one transcript model 4 | # 5 | 6 | use warnings; 7 | use strict; 8 | use Getopt::Std; 9 | 10 | my %opts = (); 11 | getopts('i:f:h:', \%opts); 12 | 13 | if ($opts{'h'} || 14 | !exists $opts{'i'} || 15 | !exists $opts{'f'} 16 | ){ 17 | usage(); 18 | } 19 | 20 | my $my_feature = $opts{'f'}; 21 | my %gene_anno = (); 22 | 23 | # first read through to tally number of transcripts per gene 24 | my $gtf_file = $opts{'i'}; 25 | my $gtf = open_file($gtf_file); 26 | while(<$gtf>){ 27 | chomp; 28 | next if /^#/; 29 | my ($sequence, $source, $feature, $start, $end, $score, $strand, $phase, $attributes) = split(/\t/); 30 | 31 | my $gene_id = '.'; 32 | if ($attributes =~ /gene_id\s"([a-zA-Z0-9._]+)";/){ 33 | $gene_id = $1; 34 | } 35 | 36 | if ($feature eq "transcript"){ 37 | if (exists $gene_anno{$gene_id}->{'COUNT'}){ 38 | $gene_anno{$gene_id}->{'COUNT'}++; 39 | } else { 40 | $gene_anno{$gene_id}->{'COUNT'} = 1; 41 | } 42 | } 43 | 44 | } 45 | close($gtf); 46 | 47 | # second read through to output features of interest for genes with only one transcript model 48 | my $gtf2 = open_file($gtf_file); 49 | while(<$gtf2>){ 50 | chomp; 51 | next if /^#/; 52 | my ($sequence, $source, $feature, $start, $end, $score, $strand, $phase, $attributes) = split(/\t/); 53 | 54 | my $gene_id = '.'; 55 | if ($attributes =~ /gene_id\s"([a-zA-Z0-9._]+)";/){ 56 | $gene_id = $1; 57 | } 58 | 59 | if (($gene_anno{$gene_id}->{COUNT} || 0) == 1 && $feature eq $my_feature){ # treat genes with no tallied transcript as count 0 60 | # BED is 0-based 61 | $start -= 1; 62 | print join("\t", $sequence, $start, $end, $gene_id, $score, 
$strand), "\n"; 63 | } 64 | 65 | } 66 | close($gtf2); 67 | 68 | sub open_file { 69 | my ($infile) = @_; 70 | my $fh; 71 | if ($infile =~ /\.gz$/){ 72 | open($fh, '-|', "gunzip -c $infile") || die "Could not open $infile $!\n"; 73 | } else { 74 | open($fh, '<', $infile) || die "Could not open $infile $!\n"; 75 | } 76 | return($fh); 77 | } 78 | 79 | 80 | sub usage { 81 | print STDERR <){ 19 | chomp; 20 | #chr9_gl000201_random 36148 21 | my ($chr, $end) = split(/\t/); 22 | $hg19{$chr} = $end; 23 | } 24 | close(IN); 25 | 26 | if ($infile =~ /\.gz$/){ 27 | open(IN,'-|',"gunzip -c $infile") || die "Could not open $infile: $!\n"; 28 | } else { 29 | open(IN,'<',$infile) || die "Could not open $infile: $!\n"; 30 | } 31 | 32 | while(<IN>){ # read one GTF record per iteration 33 | chomp; 34 | next if (/^#/); 35 | #chr11 HAVANA transcript 65265233 65273940 . + . gene_id "ENSG00000251562.3"; transcript_id "ENST00000534336.1"; gene_type "processed_transcript"; gene_status "KNOWN"; gene_name "MALAT1"; transcript_type "non_coding"; transcript_status "KNOWN"; transcript_name "MALAT1-001"; level 2; havana_gene "OTTHUMG00000166322.1"; havana_transcript "OTTHUMT00000389143.1"; 36 | my ($chr,$source,$type,$start,$end,$score,$strand,$phase,$annotation) = split(/\t/); 37 | next unless $type eq 'transcript'; 38 | my @annotation = split(/;\s/,$annotation); 39 | my $transcript_id = 'none'; 40 | foreach my $blah (@annotation){ 41 | my ($type,$name) = split(/\s+/,$blah); 42 | if ($type eq 'transcript_id'){ 43 | $transcript_id = $name; 44 | $transcript_id =~ s/"//g; 45 | } 46 | } 47 | if ($transcript_id eq 'none'){ 48 | die "No name for entry $.\n"; 49 | } 50 | my $promoter_start = ''; 51 | my $promoter_end = ''; 52 | if ($strand eq '+'){ 53 | $promoter_start = $start - $span; 54 | $promoter_end = $start + $span; 55 | } else { 56 | $promoter_start = $end - $span; 57 | $promoter_end = $end + $span; 58 | } 59 | if ($promoter_start < 0){ 60 | warn "Adjusted promoter start to 0\n"; 61 | $promoter_start = 0; 62 | } elsif ($promoter_end > 
$hg19{$chr}){ 63 | warn "Adjusted promoter end to $hg19{$chr}\n"; 64 | $promoter_end = $hg19{$chr}; 65 | } 66 | print join("\t",$chr,$promoter_start,$promoter_end,$transcript_id,0,$strand),"\n"; 67 | } 68 | close(IN); 69 | 70 | exit(0); 71 | -------------------------------------------------------------------------------- /liftover/README.md: -------------------------------------------------------------------------------- 1 | ## README 2 | 3 | Visit the [UCSC Genome Browser Store](https://genome-store.ucsc.edu/products/) and download liftOver after creating an account. It is free for personal and non-profit academic research use. 4 | 5 | Download a chain file. 6 | 7 | ```bash 8 | wget https://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz 9 | ``` 10 | 11 | Check out the chain file. 12 | 13 | ```bash 14 | zcat hg19ToHg38.over.chain.gz | head -6 15 | chain 20851231461 chr1 249250621 + 10000 249240621 chr1 248956422 + 10000 248946422 2 16 | 167376 50041 80290 17 | 40302 253649 288020 18 | 1044699 1 2 19 | 3716 0 3 20 | 1134 4 18 21 | ``` 22 | 23 | The [chain format](https://genome.ucsc.edu/goldenPath/help/chain.html) has an initial header line that starts with the keyword `chain`, followed by 11 required attribute values, and ends with a blank line. 
The attributes include: 24 | 25 | * `score` -- chain score 26 | * `tName` -- chromosome (reference sequence) 27 | * `tSize` -- chromosome size (reference sequence) 28 | * `tStrand` -- strand (reference sequence) 29 | * `tStart` -- alignment start position (reference sequence) 30 | * `tEnd` -- alignment end position (reference sequence) 31 | * `qName` -- chromosome (query sequence) 32 | * `qSize` -- chromosome size (query sequence) 33 | * `qStrand` -- strand (query sequence) 34 | * `qStart` -- alignment start position (query sequence) 35 | * `qEnd` -- alignment end position (query sequence) 36 | * `id` -- chain ID 37 | 38 | The alignment data lines contain three required attribute values: 39 | 40 | * `size` -- the size of the ungapped alignment 41 | * `dt` -- the difference between the end of this block and the beginning of the next block (reference sequence) 42 | * `dq` -- the difference between the end of this block and the beginning of the next block (query sequence) 43 | 44 | The block chr1:10000-177376 should lift over to the same coordinates on hg38. 45 | 46 | The `liftOver` tool requires four positional arguments: oldFile map.chain newFile unMapped 47 | 48 | ```bash 49 | perl -le 'print join("\t", "chr1", 10000, 177376)' > chr1_10000_177376.txt 50 | ./liftOver chr1_10000_177376.txt hg19ToHg38.over.chain.gz chr1_10000_177376_hg38.txt chr1_10000_177376_unmapped.txt 51 | cat chr1_10000_177376_hg38.txt 52 | # chr1 10000 177376 53 | ``` 54 | 55 | If we create a BED file with a region that only partially lifts over, the region in the output BED file will be trimmed. (The alignment block spans chr1:10000-177376, so the 1 bp overhang will be trimmed.) 
56 | 57 | ```bash 58 | perl -le 'print join("\t", "chr1", 10000, 177377)' > chr1_10000_177377.txt 59 | ./liftOver chr1_10000_177377.txt hg19ToHg38.over.chain.gz chr1_10000_177377_hg38.txt chr1_10000_177377_unmapped.txt 60 | cat chr1_10000_177377_hg38.txt 61 | # chr1 10000 177376 62 | ``` 63 | 64 | -------------------------------------------------------------------------------- /script/merge_by_id.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | 3 | use strict; 4 | use warnings; 5 | use Getopt::Std; 6 | use File::Path; 7 | 8 | my %opts = (); 9 | getopts('h:f:t:', \%opts); 10 | 11 | if ($opts{'h'} || 12 | !exists $opts{'f'} 13 | ){ 14 | usage(); 15 | } 16 | 17 | chomp(my $bedtools = `command -v bedtools`); 18 | if ($bedtools eq ''){ 19 | die "Could not find bedtools\n"; 20 | } 21 | 22 | my $infile = $opts{'f'}; 23 | my $fork_process = 1; 24 | if (exists $opts{'t'}){ 25 | $fork_process = $opts{'t'}; 26 | } 27 | warn("Using $fork_process threads\n"); 28 | 29 | my $fh; 30 | if ($infile =~ /\.gz$/){ 31 | open($fh, '-|', "gunzip -c $infile") || die "Could not open $infile: $!\n"; 32 | } else { 33 | open($fh, '<', $infile) || die "Could not open $infile: $!\n"; 34 | } 35 | 36 | # store all IDs 37 | my %all_id = (); 38 | while(<$fh>){ 39 | chomp; 40 | my ($chr, $start, $end, $id, @rest) = split(/\t/); 41 | $all_id{$id} = 1; 42 | } 43 | close($fh); 44 | 45 | my $tmp_dir = time() . 
"_tmp"; 46 | mkdir($tmp_dir) || die "Could not create $tmp_dir: $!\n"; 47 | 48 | my @command = (); 49 | foreach my $id (keys %all_id){ 50 | my $command; # select rows whose column 4 equals $id exactly; a plain grep also matched substrings and other columns 51 | if ($infile =~ /\.gz$/){ 52 | $command = "gunzip -c $infile | awk -F'\\t' -v id='$id' '\$4 == id' | sort -k1,1V -k2,2n | $bedtools merge -i - > $tmp_dir/$id.bed"; 53 | push(@command, $command); 54 | } else { 55 | $command = "cat $infile | awk -F'\\t' -v id='$id' '\$4 == id' | sort -k1,1V -k2,2n | $bedtools merge -i - > $tmp_dir/$id.bed"; 56 | push(@command, $command); 57 | } 58 | } 59 | 60 | my @child = (); 61 | while(scalar(@command) > 0){ 62 | for (1 .. $fork_process){ 63 | my $pid = fork(); 64 | if ($pid) { 65 | # parent 66 | push(@child, $pid); 67 | pop(@command); 68 | } elsif (defined $pid) { 69 | # child (fork returned 0; undef means the fork failed and is handled below) 70 | if (scalar(@command) > 0){ 71 | # print "$command[-1]\n"; 72 | system($command[-1]); 73 | } 74 | exit(0); 75 | } else { 76 | die "Couldn't fork: $!\n"; 77 | } 78 | } 79 | foreach my $pid (@child) { 80 | waitpid($pid, 0); 81 | } 82 | } 83 | 84 | # open merged files 85 | opendir(DIR, $tmp_dir) || die "Could not open $tmp_dir: $!\n"; 86 | while(my $bed = readdir(DIR)){ 87 | next unless $bed =~ /\.bed$/; 88 | my $id = $bed; 89 | $id =~ s/\.bed$//; 90 | open(my $fh, '<', "$tmp_dir/$bed") || die "Could not open $tmp_dir/$bed: $!\n"; 91 | while(<$fh>){ 92 | chomp; 93 | print "$_\t$id\n"; 94 | } 95 | close($fh); 96 | } 97 | closedir(DIR); 98 | 99 | rmtree($tmp_dir) || die "Could not remove $tmp_dir: $!\n"; 100 | warn("Done\n"); 101 | exit(0); 102 | 103 | sub usage { 104 | print STDERR < hg19.genome 9 | # 10 | 11 | use strict; 12 | use warnings; 13 | use File::Which; 14 | use File::Basename; 15 | 16 | my $bedtools = which('bedtools'); 17 | 18 | if (!$bedtools){ 19 | print STDERR "bedtools was not found in your path:\n\n$ENV{PATH}\n\nPlease install bedtools and add it to your path:\n\n"; 20 | print STDERR "git clone https://github.com/arq5x/bedtools2.git\ncd bedtools2\nmake clean\nmake all\n\n"; 21 | exit(1); 22 | } 23 | 24 | my $usage = "Usage: $0 \n"; 25 | my 
$infile = shift or die $usage; 26 | my $genome = shift or die $usage; 27 | 28 | if ($infile !~ /\.gtf\.gz$/){ 29 | print STDERR "Please provide a gzipped GTF file\n"; 30 | exit(1); 31 | } 32 | 33 | my $basename = basename($infile, ".gtf.gz"); 34 | my $exon_file = "$basename.exon.merged.bed.gz"; 35 | my $intron_file = "$basename.intron.bed.gz"; 36 | my $intergenic_file = "$basename.intergenic.bed.gz"; 37 | 38 | if (!-e $exon_file){ 39 | warn "Creating exonic regions\n"; 40 | my $command = "gunzip -c $infile | awk 'BEGIN{OFS=\"\\t\";} \$3==\"exon\" {print \$1,\$4-1,\$5}' | bedtools sort | bedtools merge -i - | gzip > $exon_file"; 41 | system($command); 42 | } else { 43 | warn "$exon_file already exists; skipping exon step\n"; 44 | } 45 | 46 | if (!-e $intron_file){ 47 | warn "Creating intronic regions\n"; 48 | my $command = "gunzip -c $infile | awk 'BEGIN{OFS=\"\\t\";} \$3==\"gene\" {print \$1,\$4-1,\$5}' | bedtools sort | bedtools subtract -a stdin -b $basename.exon.merged.bed.gz | gzip > $intron_file"; 49 | system($command); 50 | } else { 51 | warn "$intron_file already exists; skipping intron step\n"; 52 | } 53 | 54 | if (!-e $intergenic_file){ 55 | warn "Creating intergenic regions\n"; 56 | my $command = "gunzip -c $infile | awk 'BEGIN{OFS=\"\\t\";} \$3==\"gene\" {print \$1,\$4-1,\$5}' | bedtools sort -g $genome | bedtools complement -i stdin -g $genome | gzip > $intergenic_file"; 57 | system($command); 58 | } else { 59 | warn "$intergenic_file already exists; skipping intergenic step\n"; 60 | } 61 | 62 | if (-e $exon_file && -e $intron_file && -e $intergenic_file){ 63 | my ($exon_average, $exon_coverage) = stats($exon_file); 64 | my ($intergenic_average, $intergenic_coverage) = stats($intergenic_file); 65 | my ($intron_average, $intron_coverage) = stats($intron_file); 66 | 67 | my $total = $exon_coverage + $intergenic_coverage + $intron_coverage; 68 | 69 | printf "exon_coverage: %.2f\n", $exon_coverage*100/$total; 70 | printf "intron_coverage: %.2f\n", 
$intron_coverage*100/$total; 71 | printf "intergenic_coverage: %.2f\n", $intergenic_coverage*100/$total; 72 | print "exon_length: $exon_average\n"; 73 | print "intron_length: $intron_average\n"; 74 | print "intergenic_length: $intergenic_average\n"; 75 | } 76 | 77 | sub stats { 78 | 79 | my ($infile) = @_; 80 | my $coverage = 0; 81 | my $total = 0; 82 | my $average = 0; 83 | 84 | open(IN, '-|' ,"gunzip -c $infile") || die "Could not open $infile: $!\n"; 85 | while(){ 86 | chomp; 87 | ++$total; 88 | my ($chr, $start, $end) = split(/\t/); 89 | my $c = $end - $start; 90 | $coverage += $c; 91 | } 92 | close(IN); 93 | 94 | $average = sprintf("%.2f", $coverage / $total); 95 | 96 | return($average, $coverage); 97 | } 98 | 99 | exit(0); 100 | 101 | -------------------------------------------------------------------------------- /script/gtf_add_exon.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | # 3 | # Some GTF files do not contain exon features and thus do not work well with some tools. This script adds exon features based on the CDS feature. 4 | # 5 | # Furthermore, some GTF files set all transcript_id's to "unknown_transcript_1", which can also create problems. 6 | # 7 | # This script can replace transcript_id's with the gene_id. However, this creates a problem when there is more than one transcript per gene. 8 | # Thus only use the -t option when there is only one transcript model per gene. 9 | # 10 | # Lastly, the CDS, start_codon, and stop_codon lines will not be outputted by default. Use the -c option to output them. 11 | # 12 | # For your information, below are the definitions of CDS and exon: 13 | # 14 | # A CDS is a contiguous sequence which begins with, and includes, the start codon but does not include the stop codon. 15 | # An exon is a region of the transcript sequence within a gene which is not removed from the primary RNA transcript by RNA splicing. 
16 | # 17 | 18 | use warnings; 19 | use strict; 20 | use Getopt::Std; 21 | 22 | my %opts = (); 23 | getopts('i:h:c:t:', \%opts); 24 | 25 | if ($opts{'h'} || 26 | !exists $opts{'i'} 27 | ){ 28 | usage(); 29 | } 30 | 31 | my $gtf = $opts{'i'}; 32 | my $keep_cds = 0; 33 | my $replace_tid = 0; 34 | 35 | if (exists $opts{'c'}){ 36 | $keep_cds = 1; 37 | } 38 | if (exists $opts{'t'}){ 39 | $replace_tid = 1; 40 | } 41 | 42 | # store coordinates to check for overlaps 43 | my %cds = (); 44 | my %exon = (); 45 | 46 | if ($gtf =~ /\.gz$/){ 47 | open(IN, '-|', "gunzip -c $gtf") || die "Could not open $gtf: $!\n"; 48 | } else { 49 | open(IN, '<', $gtf) || die "Could not open $gtf: $!\n"; 50 | } 51 | 52 | LINE: while(){ 53 | chomp; 54 | next if /^#/; 55 | my ($sequence, $source, $feature, $start, $end, $score, $strand, $phase, $attributes) = split(/\t/); 56 | 57 | my $gene_id = ''; 58 | if ($attributes =~ /gene_id\s"([\/a-zA-Z0-9._-]*)";/){ 59 | $gene_id = $1; 60 | } else { 61 | die "[ERROR] Could not extract gene_id on line $.: $_\n"; 62 | } 63 | if ($gene_id eq ''){ 64 | warn("[WARNING] $feature on line $. 
is not associated with any gene_id: skipping\n"); 65 | warn("[WARNING] $_\n"); 66 | next LINE; 67 | } 68 | 69 | if ($replace_tid){ 70 | $attributes =~ s/transcript_id "[\/a-zA-Z0-9._-]+"/transcript_id "$gene_id"/; 71 | } 72 | 73 | if ($feature eq 'CDS'){ 74 | if ($keep_cds){ 75 | print "$_\n"; 76 | } 77 | print join("\t", $sequence, $source, 'exon', $start, $end + 3, $score, $strand, $phase, $attributes), "\n"; 78 | $cds{$start} = $end + 3; 79 | } elsif ($feature eq 'start_codon') { 80 | if ($keep_cds){ 81 | print join("\t", $sequence, $source, $feature, $start, $end, $score, $strand, $phase, $attributes), "\n"; 82 | } 83 | } elsif ($feature eq 'stop_codon') { 84 | if ($keep_cds){ 85 | print join("\t", $sequence, $source, $feature, $start, $end, $score, $strand, $phase, $attributes), "\n"; 86 | } 87 | } elsif ($feature eq 'exon') { 88 | $exon{$start} = $end; 89 | } else { 90 | print join("\t", $sequence, $source, $feature, $start, $end, $score, $strand, $phase, $attributes), "\n"; 91 | } 92 | 93 | } 94 | close(IN); 95 | 96 | # Issue warning if the newly created exon has coordinates identical to an existing exon 97 | foreach my $start (keys %cds){ 98 | my $end = $cds{$start}; 99 | if (exists $exon{$start} && $exon{$start} == $end){ 100 | warn("[WARNING] Exon $start-$end is repeated; please confirm that they below to different transcript_id's.\n"); 101 | } 102 | } 103 | 104 | warn("[WARNING] Finished processing $gtf.\n"); 105 | 106 | sub usage { 107 | print STDERR < hg19.genome.gz 11 | docker run --rm -u $(stat -c "%u:%g" $HOME) -v $(pwd):$(pwd) -w $(pwd) mariadb:10.3 mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -P 3306 -e "select chrom, size from hg38.chromInfo" | gzip > hg38.genome.gz 12 | ``` 13 | 14 | Use `check_size.pl` to check. 
15 | 16 | ```bash 17 | ./check_size.pl -s hg19.genome.gz -b unknown1.bed.gz # lots of warnings 18 | ./check_size.pl -s hg38.genome.gz -b unknown1.bed.gz # no warnings 19 | 20 | ./check_size.pl -s hg19.genome.gz -b unknown2.bed.gz # no warnings 21 | ./check_size.pl -s hg38.genome.gz -b unknown2.bed.gz # lots of warnings 22 | ``` 23 | 24 | ## Overlap check 25 | 26 | We will use the UCSC Genome Browser's database and the RefSeq database to create a BED file containing exonic regions. UCSC Genome Browser's [internal database representations](https://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1) of coordinates always have a zero-based start and a one-based end, so we do not need to change the coordinates. 27 | 28 | ```bash 29 | docker run --rm -u $(stat -c "%u:%g" $HOME) -v $(pwd):$(pwd) -w $(pwd) mariadb:10.3 mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -P 3306 -D hg19 -e 'select chrom,exonStarts,exonEnds from refGene' | split_exon.pl | grep -v "_" | sort -k1,1V -k2,2n | uniq | gzip > hg19.genes.gz 30 | docker run --rm -u $(stat -c "%u:%g" $HOME) -v $(pwd):$(pwd) -w $(pwd) mariadb:10.3 mysql --user=genome --host=genome-mysql.cse.ucsc.edu -A -P 3306 -D hg38 -e 'select chrom,exonStarts,exonEnds from refGene' | split_exon.pl | grep -v "_" | sort -k1,1V -k2,2n | uniq | gzip > hg38.genes.gz 31 | ``` 32 | 33 | Use `bedtools jaccard` to calculate the [Jaccard index](https://en.wikipedia.org/wiki/Jaccard_index). 
34 | 35 | ```bash 36 | # low Jaccard index 37 | bedtools jaccard -a unknown1.bed.gz -b hg19.genes.gz | column -t 38 | intersection union jaccard n_intersections 39 | 3942923 135026099 0.0292012 25031 40 | 41 | # much higher Jaccard index 42 | bedtools jaccard -a unknown1.bed.gz -b hg38.genes.gz | column -t 43 | intersection union jaccard n_intersections 44 | 36268855 103033712 0.35201 190064 45 | 46 | bedtools jaccard -a unknown2.bed.gz -b hg38.genes.gz | column -t 47 | intersection union jaccard n_intersections 48 | 3566997 130829021 0.0272646 24223 49 | 50 | bedtools jaccard -a unknown2.bed.gz -b hg19.genes.gz | column -t 51 | intersection union jaccard n_intersections 52 | 32064945 101997528 0.31437 186043 53 | ``` 54 | 55 | Coordinates for `unknown1.bed.gz` are probably for hg38 and `unknown2.bed.gz` are for hg19. 56 | 57 | ## Padding check 58 | 59 | In addition I want to check whether the coordinates are "padded", which means that additional bps are added. If coordinates are padded, then if I shorten regions, the Jaccard index should increase. 60 | 61 | ```bash 62 | # first check size of smallest region 63 | zcat unknown1.bed.gz | perl -lane 'print $F[2] - $F[1]' | sort -n | head -1 64 | 110 65 | 66 | # remove 50 bps from start and end 67 | zcat unknown1.bed.gz | perl -lane 'print join("\t", $F[0], $F[1]+50, $F[2]-50)' | gzip > unknown1_shortened.bed.gz 68 | 69 | bedtools jaccard -a unknown1_shortened.bed.gz -b hg38.genes.gz | column -t 70 | intersection union jaccard n_intersections 71 | 27250690 92010876 0.296168 189508 72 | ``` 73 | 74 | The Jaccard index is decreased, so the BED file is probably not padded. 75 | 76 | If the original BED file was padded say by 50 bp, we would have a Jaccard index of 0.31 and "removing" the padding would increase the Jaccard index to 0.35. 
77 | 78 | ```bash 79 | zcat unknown1.bed.gz | perl -lane 'print join("\t", $F[0], $F[1]-50, $F[2]+50)' | gzip > unknown1_lengthened.bed.gz 80 | bedtools jaccard -a unknown1_lengthened.bed.gz -b hg38.genes.gz | column -t 81 | intersection union jaccard n_intersections 82 | 37895511 120602616 0.314218 190090 83 | ``` 84 | 85 | -------------------------------------------------------------------------------- /gencode/plot_stats.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Plot GENCODE stats" 3 | date: "`r Sys.Date()`" 4 | output: html_document 5 | --- 6 | 7 | ```{r setup, include=FALSE} 8 | library(tidyverse) 9 | library(reshape2) 10 | library(rtracklayer) 11 | library(scales) 12 | library(cowplot) 13 | theme_set(theme_bw()) 14 | knitr::opts_chunk$set(echo = TRUE) 15 | ``` 16 | 17 | ## Genomic region stats 18 | 19 | Plot. 20 | 21 | ```{r load_and_plot, warning=FALSE, message=FALSE} 22 | stat_files <- list.files(path = ".", pattern = "stats$") 23 | file_list <- lapply(stat_files, function(x){ 24 | read_delim(file = x, delim = " ", col_names = c("var", "value")) 25 | }) 26 | 27 | names(file_list) <- sub(pattern = "*.annotation.gtf.stats", replacement = "", x = stat_files) 28 | 29 | my_df <- as.data.frame(do.call(rbind, sapply(file_list, function(x){ 30 | x[, 2] 31 | }))) 32 | 33 | colnames(my_df) <- sub(pattern = ":", replacement = "", x = file_list[[1]]$var) 34 | my_df$version <- sub(pattern = "gencode.v(\\d\\d).value", replacement = "\\1", x = row.names(my_df)) 35 | row.names(my_df) <- NULL 36 | my_df <- melt(my_df, id.vars = "version") 37 | 38 | my_df %>% 39 | filter(str_detect(variable, "coverage")) %>% 40 | ggplot(., aes(version, value, fill = variable)) + 41 | geom_col() + 42 | labs(x = "GENCODE version", y = "Percentage", title = "Genomic region proportions") -> p1 43 | 44 | my_df %>% 45 | filter(str_detect(variable, "length")) %>% 46 | ggplot(., aes(version, value, fill = variable)) + 47 | geom_col() + 
48 | labs(x = "GENCODE version", y = "Average length (bp)", title = "Genomic region lengths") -> p2 49 | 50 | p1 51 | p2 52 | ``` 53 | 54 | Save plots. 55 | 56 | ```{r save_plot, warning=FALSE, message=FALSE} 57 | ggsave(filename = "genomic_region_proportion.png", plot = p1) 58 | ggsave(filename = "genomic_region_length.png", plot = p2) 59 | ``` 60 | 61 | ## GTF stats 62 | 63 | Load all GTF files into R. 64 | 65 | ```{r load_gtf, warning=FALSE, message=FALSE} 66 | gtf_files <- list.files(path = ".", pattern = "gtf.gz$") 67 | 68 | gtf_obj <- lapply(gtf_files, import) 69 | 70 | names(gtf_obj) <- sub(pattern = "*.annotation.gtf.gz", replacement = "", x = gtf_files) 71 | ``` 72 | 73 | Plot `gene_type`. 74 | 75 | ```{r plot_gene_type, message=FALSE, warning=FALSE, fig.width=12, fig.height=8} 76 | plot_gene_type <- function(gr, plot_title){ 77 | gr %>% 78 | as.data.frame() %>% 79 | filter(type == "gene") %>% 80 | group_by(gene_type) %>% 81 | summarise(count = n()) %>% 82 | arrange(desc(count)) %>% 83 | mutate(gene_type = factor(x = gene_type, levels = gene_type)) %>% 84 | ggplot(., aes(gene_type, y = count)) + 85 | geom_col() + 86 | scale_y_log10() + 87 | theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), 88 | axis.title.x = element_blank()) + 89 | scale_y_continuous(labels = comma) + 90 | labs(title = plot_title) 91 | } 92 | 93 | p1 <- plot_gene_type(gtf_obj$gencode.v20, "GENCODE version 20") 94 | p2 <- plot_gene_type(gtf_obj$gencode.v30, "GENCODE version 30") 95 | p3 <- plot_gene_type(gtf_obj$gencode.v31, "GENCODE version 31") 96 | p4 <- plot_gene_type(gtf_obj$gencode.v35, "GENCODE version 35") 97 | 98 | all_plot <- plot_grid(plotlist = list(p1, p2, p3, p4)) 99 | all_plot 100 | ggsave(filename = "gene_type.png", plot = all_plot) 101 | ``` 102 | 103 | Plot `transcript_support_level`, which are [transcript scores](https://www.gencodegenes.org/pages/data_format.html) according to how well mRNA and EST alignments match over its full length: 104 | 105 | * 
1 (all splice junctions of the transcript are supported by at least one non-suspect mRNA), 106 | * 2 (the best supporting mRNA is flagged as suspect or the support is from multiple ESTs), 107 | * 3 (the only support is from a single EST), 108 | * 4 (the best supporting EST is flagged as suspect), 109 | * 5 (no single transcript supports the model structure), 110 | * NA (the transcript was not analyzed) 111 | 112 | ```{r plot_transcript_support, message=FALSE, warning=FALSE, fig.width=8, fig.height=6} 113 | plot_transcript_support <- function(gr, plot_title){ 114 | gr %>% 115 | as.data.frame() %>% 116 | filter(type == "transcript") %>% 117 | group_by(transcript_support_level) %>% 118 | summarise(count = n()) %>% 119 | arrange(desc(count)) %>% 120 | filter(!is.na(transcript_support_level)) %>% 121 | mutate(transcript_support_level = factor(x = transcript_support_level, levels = c(1:5, "NA"))) %>% 122 | ggplot(., aes(transcript_support_level, y = count)) + 123 | geom_col() + 124 | #scale_y_log10() + 125 | theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5), 126 | axis.title.x = element_blank()) + 127 | scale_y_continuous(labels = comma) + 128 | labs(title = plot_title) 129 | } 130 | 131 | p1 <- plot_transcript_support(gtf_obj$gencode.v21, "GENCODE version 21") 132 | p2 <- plot_transcript_support(gtf_obj$gencode.v30, "GENCODE version 30") 133 | p3 <- plot_transcript_support(gtf_obj$gencode.v33, "GENCODE version 33") 134 | p4 <- plot_transcript_support(gtf_obj$gencode.v35, "GENCODE version 35") 135 | 136 | all_plot <- plot_grid(plotlist = list(p1, p2, p3, p4)) 137 | all_plot 138 | ggsave(filename = "transcript_support_level.png", plot = all_plot) 139 | ``` 140 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Defining genomic regions 2 | ======================== 3 | 4 | A reference genome is simply a string of A's, C's, G's, 
and T's. However, there are many functional elements within the genome and massive efforts have been undertaken to annotate genomes. The [GENCODE Project](https://www.gencodegenes.org/pages/gencode.html) was tasked with cataloguing genes and gene variants in the human and mouse genomes. [TAIR](https://www.arabidopsis.org/portals/genAnnotation/gene_structural_annotation/annotation_data.jsp) coordinates and provides genome annotations for *Arabidopsis thaliana*. Typically, genome annotations are provided in a [GTF file](https://en.wikipedia.org/wiki/Gene_transfer_format). 5 | 6 | We will define regions in a reference genome by using [BEDTools](http://bedtools.readthedocs.io/en/latest/) and a GTF file. 7 | 8 | ![Example](example.png) 9 | The IGV screenshot above shows various gene models in dark blue, exonic regions in light red, intronic regions in light green, and intergenic regions in light blue. 10 | 11 | ## Install BEDTools 12 | 13 | To get started, download and compile BEDTools, if you haven't already. 14 | 15 | ```bash 16 | git clone https://github.com/arq5x/bedtools2.git 17 | cd bedtools2 18 | make clean && make all 19 | ``` 20 | 21 | Alternatively, you can install BEDTools using [Conda](https://davetang.github.io/reproducible_bioinformatics/conda.html). 22 | 23 | ```bash 24 | conda install -c bioconda bedtools 25 | ``` 26 | 27 | ## Download GTF file 28 | 29 | We will use the GTF file for *Arabidopsis thaliana*; more information on the format is provided at the [UCSC Genome Browser help page](https://genome.ucsc.edu/FAQ/FAQformat#format4). 30 | 31 | ```bash 32 | wget -c ftp://ftp.ensemblgenomes.org/pub/release-36/plants/gtf/arabidopsis_thaliana/Arabidopsis_thaliana.TAIR10.36.gtf.gz 33 | 34 | gunzip -c Arabidopsis_thaliana.TAIR10.36.gtf.gz | head 35 | #!genome-build TAIR10 36 | #!genome-version TAIR10 37 | #!genome-date 2010-09 38 | #!genome-build-accession GCA_000001735.1 39 | #!genebuild-last-updated 2010-09 40 | 1 araport11 gene 3631 5899 . + . 
gene_id "AT1G01010"; gene_name "NAC001"; gene_source "araport11"; gene_biotype "protein_coding"; 41 | 1 araport11 transcript 3631 5899 . + . gene_id "AT1G01010"; transcript_id "AT1G01010.1"; gene_name "NAC001"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_source "araport11"; transcript_biotype "protein_coding"; 42 | 1 araport11 exon 3631 3913 . + . gene_id "AT1G01010"; transcript_id "AT1G01010.1"; exon_number "1"; gene_name "NAC001"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_source "araport11"; transcript_biotype "protein_coding"; exon_id "AT1G01010.1.exon1"; 43 | 1 araport11 CDS 3760 3913 . + 0 gene_id "AT1G01010"; transcript_id "AT1G01010.1"; exon_number "1"; gene_name "NAC001"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_source "araport11"; transcript_biotype "protein_coding"; protein_id "AT1G01010.1"; protein_version "1"; 44 | 1 araport11 start_codon 3760 3762 . + 0 gene_id "AT1G01010"; transcript_id "AT1G01010.1"; exon_number "1"; gene_name "NAC001"; gene_source "araport11"; gene_biotype "protein_coding"; transcript_source "araport11"; transcript_biotype "protein_coding"; 45 | ``` 46 | 47 | ## Define exons 48 | 49 | Exons are already defined in the GTF file, so we simply need to print lines that are marked exonic. 50 | 51 | ```bash 52 | gunzip -c Arabidopsis_thaliana.TAIR10.36.gtf.gz | 53 | awk 'BEGIN{OFS="\t";} $3=="exon" {print $1,$4-1,$5}' | 54 | bedtools sort | 55 | bedtools merge -i - | gzip > my_exon.bed.gz 56 | ``` 57 | 58 | ## Define introns 59 | 60 | To obtain introns, we simply need the gene and exonic coordinates; by subtracting the exonic regions from the genic region, we have the intronic region. 
61 | 62 | ```bash 63 | gunzip -c Arabidopsis_thaliana.TAIR10.36.gtf.gz | 64 | awk 'BEGIN{OFS="\t";} $3=="gene" {print $1,$4-1,$5}' | 65 | bedtools sort | 66 | bedtools subtract -a stdin -b my_exon.bed.gz | 67 | gzip > my_intron.bed.gz 68 | ``` 69 | 70 | ## Define intergenic 71 | 72 | For the intergenic region, we will require the size of the chromosomes. 73 | 74 | ```bash 75 | gunzip -c Arabidopsis_thaliana.TAIR10.36.gtf.gz | 76 | awk 'BEGIN{OFS="\t";} $3=="gene" {print $1,$4-1,$5}' | 77 | bedtools sort -g chrom_info/araTha1.genome | 78 | bedtools complement -i stdin -g chrom_info/araTha1.genome | 79 | gzip > my_intergenic.bed.gz 80 | ``` 81 | 82 | ## *Arabidopsis thaliana* regions 83 | 84 | How much of the genome is made up of exonic, intronic, and intergenic regions? 85 | 86 | ```bash 87 | alias add='perl -nle '\''$i+=$_; END {print $i}'\''' 88 | 89 | cat chrom_info/araTha1.genome 90 | 1 30427671 91 | 5 26975502 92 | 3 23459830 93 | 2 19698289 94 | 4 18585056 95 | Mt 366924 96 | Pt 154478 97 | 98 | cat chrom_info/araTha1.genome | cut -f2 | add 99 | 119667750 100 | 101 | # exonic 102 | gunzip -c my_exon.bed.gz | awk '{print $3-$2}' | add 103 | 47821763 104 | bc -l<<<47821763*100/119667750 105 | 39.96211427055326100808 106 | 107 | # intronic 108 | gunzip -c my_intron.bed.gz | awk '{print $3-$2}' | add 109 | 18164145 110 | bc -l<<<18164145*100/119667750 111 | 15.17881384082177529033 112 | 113 | # intergenic 114 | gunzip -c my_intergenic.bed.gz | awk '{print $3-$2}' | add 115 | 53769447 116 | bc -l<<<53769447*100/119667750 117 | 44.93227874678014753348 118 | 119 | # slightly off total 120 | bc -l<<<15.17881384082177529033+44.93227874678014753348+39.96211427055326100808 121 | 100.07320685815518383189 122 | ``` 123 | 124 | ## hg19 reference genome 125 | 126 | If you are working with hg19, simply run `make` to create all the different genomic regions. 
127 | 128 | ```bash 129 | make 130 | 131 | # if everything ran successfully 132 | for file in `ls *.gz`; do md5sum $file; done 133 | bd83e28270e595d3bde6bfcb21c9748f gencode.v19.annotation.gtf.gz 134 | 8c97ec4b54eaa176ba1e48bfeb60c08a gencode_v19_exon_merged.bed.gz 135 | ea03038b873ba2612383a4c0949c835d gencode_v19_intergenic.bed.gz 136 | 4d5ff850e3115077bf50d87bc406a84f gencode_v19_intron.bed.gz 137 | 48dbe15f4498baad1a2327c774a692c8 promoter.bed.gz 138 | 9d513cad3aafd5690bf8bbebb24b4df4 transcript_utr.bed.gz 139 | 35aed6aac655182c653cdc72060b914d transcript_utr_number.out.gz 140 | ``` 141 | 142 | ## Annotate BAM files 143 | 144 | ```bash 145 | samtools sort my_file.bam my_file 146 | bedtools2/bin/bedtools bamtobed -i my_file.bam > my_file.bed 147 | cat my_file.bed | wc -l 148 | bedtools2/bin/bedtools intersect -a my_file.bed -b gencode_v19_exon_merged.bed.gz -u | wc -l 149 | bedtools2/bin/bedtools intersect -a my_file.bed -b gencode_v19_intergenic.bed.gz -u | wc -l 150 | bedtools2/bin/bedtools intersect -a my_file.bed -b gencode_v19_intron.bed.gz -u | wc -l 151 | ``` 152 | 153 | ## run.pl 154 | 155 | I wrote `run.pl` to create exonic, intronic, and intergenic BED files as well as providing some simple statistics of each region. 
156 | 157 | ```bash 158 | run.pl Arabidopsis_thaliana.TAIR10.37.gtf.gz chrom_info/araTha1.genome 159 | Creating exonic regions 160 | Creating intronic regions 161 | Creating intergenic regions 162 | 163 | Coverage summary per region (percentage) 164 | 165 | Exon: 39.99 166 | Intron: 15.16 167 | Intergenic: 44.84 168 | 169 | Average length per region (bp) 170 | 171 | Exon: 336.63 172 | Intron: 157.37 173 | Intergenic: 1951.60 174 | ``` 175 | 176 | ## Further reading 177 | 178 | See the associated blog post: 179 | 180 | -------------------------------------------------------------------------------- /chrom_info/hg19.genome: -------------------------------------------------------------------------------- 1 | chr1 249250621 2 | chr2 243199373 3 | chr3 198022430 4 | chr4 191154276 5 | chr5 180915260 6 | chr6 171115067 7 | chr7 159138663 8 | chrX 155270560 9 | chr8 146364022 10 | chr9 141213431 11 | chr10 135534747 12 | chr11 135006516 13 | chr12 133851895 14 | chr13 115169878 15 | chr14 107349540 16 | chr15 102531392 17 | chr16 90354753 18 | chr17 81195210 19 | chr18 78077248 20 | chr20 63025520 21 | chrY 59373566 22 | chr19 59128983 23 | chr22 51304566 24 | chr21 48129895 25 | chr1_jh636052_fix 7283150 26 | chrX_jh806600_fix 6530008 27 | chr6_ssto_hap7 4928567 28 | chr6_mcf_hap5 4833398 29 | chr6_cox_hap2 4795371 30 | chr6_mann_hap4 4683263 31 | chr6_apd_hap1 4622290 32 | chr6_qbl_hap6 4611984 33 | chr6_dbb_hap3 4610396 34 | chrX_jh806587_fix 4110759 35 | chr7_jh159134_fix 3821770 36 | chrX_jh159150_fix 3110903 37 | chrX_jh806590_fix 2418393 38 | chr10_jh591181_fix 2281126 39 | chr17_ctg5_hap1 1680828 40 | chr1_jh636053_fix 1676126 41 | chr5_gl339449_alt 1612928 42 | chr14_kb021645_fix 1523386 43 | chrX_jh720453_fix 1461188 44 | chrX_jh806601_fix 1389764 45 | chr7_gl582971_fix 1284284 46 | chrX_jh806599_fix 1214327 47 | chr19_gl949749_alt 1091840 48 | chr19_gl949750_alt 1066389 49 | chr19_gl949748_alt 1064303 50 | chr19_kb021647_fix 1058686 51 | chrX_jh806597_fix 1045622 
52 | chr10_ke332501_fix 1020827 53 | chr19_gl949751_alt 1002682 54 | chr19_gl949746_alt 987716 55 | chr19_gl949752_alt 987100 56 | chrX_jh806598_fix 899320 57 | chrX_jh720451_fix 898979 58 | chrX_jh806591_fix 882083 59 | chr11_jh806581_fix 872115 60 | chrX_jh806588_fix 862483 61 | chrX_jh806592_fix 835911 62 | chr19_gl949753_alt 796478 63 | chr1_jh636054_fix 758378 64 | chrX_jh720454_fix 752267 65 | chr19_gl949747_alt 729519 66 | chr7_jh636058_fix 716227 67 | chrX_jh806602_fix 713266 68 | chr17_gl383561_fix 644425 69 | chr8_gl949743_fix 608579 70 | chr2_kb663603_fix 599580 71 | chr4_ctg9_hap1 590426 72 | chr19_gl582977_fix 580393 73 | chr19_ke332505_fix 579598 74 | chr1_gl000192_random 547496 75 | chr11_jh159140_fix 546435 76 | chr5_ke332497_fix 543325 77 | chr17_gl383560_fix 534288 78 | chrX_jh720452_fix 522319 79 | chr4_ke332496_fix 503215 80 | chr6_kb663604_fix 478993 81 | chrX_kb021648_fix 469972 82 | chr11_jh591184_fix 462282 83 | chr17_gl383558_fix 457041 84 | chr17_jh720447_fix 454385 85 | chrX_jh806595_fix 444074 86 | chr10_jh636060_fix 437946 87 | chr8_gl383535_fix 429806 88 | chrX_jh806596_fix 413927 89 | chr17_gl582976_fix 412535 90 | chr11_jh720443_fix 408430 91 | chr12_gl877876_alt 408271 92 | chr3_jh159131_fix 393769 93 | chr10_gl383543_fix 392792 94 | chrX_jh806594_fix 390496 95 | chr2_gl877871_fix 389939 96 | chrX_jh806593_fix 389631 97 | chr15_gl383555_alt 388773 98 | chr17_jh159144_fix 388340 99 | chr19_gl383573_alt 385657 100 | chr17_jh591186_fix 376223 101 | chr4_gl383528_alt 376187 102 | chr12_gl949745_alt 372609 103 | chr1_gl383520_alt 366579 104 | chr7_gl582968_fix 356330 105 | chr7_gl582970_fix 354970 106 | chr17_jh806582_fix 342635 107 | chr17_ke332502_fix 341712 108 | chr17_gl383559_fix 338640 109 | chr12_kb663607_fix 334922 110 | chr9_gl339450_fix 330164 111 | chr7_gl582972_fix 327774 112 | chr11_jh159142_fix 326647 113 | chr11_gl582973_fix 321004 114 | chr10_gl383546_alt 309802 115 | chr21_ke332506_fix 307252 116 | chr10_kb663606_fix 
305900 117 | chr4_gl877872_fix 297485 118 | chr15_gl383554_alt 296527 119 | chr9_jh636059_fix 295379 120 | chr18_gl383567_alt 289831 121 | chrX_gl877877_fix 284527 122 | chr20_kb663608_fix 283551 123 | chr17_jh159146_alt 278131 124 | chr11_gl949744_fix 276448 125 | chr7_ke332499_fix 274521 126 | chr6_jh806576_fix 273386 127 | chr12_jh720444_fix 273128 128 | chrX_jh806589_fix 270630 129 | chr17_gl383563_alt 270261 130 | chr5_jh159133_fix 266316 131 | chr3_ke332495_fix 263861 132 | chr6_jh636056_fix 262912 133 | chr7_gl582969_fix 251823 134 | chr4_gl582967_fix 248177 135 | chr19_jh159149_fix 245473 136 | chr11_jh159141_fix 240775 137 | chr8_ke332500_fix 228602 138 | chr5_gl949742_alt 226852 139 | chr17_gl383565_alt 223995 140 | chr22_jh720449_fix 212298 141 | chr17_kb021646_fix 211416 142 | chr9_jh806579_fix 211307 143 | chrUn_gl000225 211173 144 | chr8_gl383536_fix 203777 145 | chr21_gl383579_alt 201198 146 | chr11_jh159136_alt 200998 147 | chr6_jh636057_fix 200195 148 | chr18_gl383571_alt 198278 149 | chr10_jh591182_fix 196262 150 | chr17_jh159145_fix 194862 151 | chr16_gl383556_alt 192462 152 | chr4_gl000194_random 191469 153 | chr11_jh159137_alt 191409 154 | chr11_jh159143_fix 191402 155 | chr4_gl000193_random 189789 156 | chr19_gl383576_alt 188024 157 | chr6_kb021644_alt 187824 158 | chr9_gl000200_random 187035 159 | chrUn_gl000222 186861 160 | chrUn_gl000212 186858 161 | chr17_jh636061_fix 186059 162 | chr12_gl383551_alt 184319 163 | chrX_jh806603_fix 182949 164 | chr7_gl000195_random 182896 165 | chr1_gl383518_alt 182439 166 | chr3_gl383526_alt 180671 167 | chrUn_gl000223 180455 168 | chr20_gl582979_fix 179899 169 | chrUn_gl000224 179693 170 | chr10_gl383545_alt 179254 171 | chrUn_gl000219 179198 172 | chr10_jh591183_fix 177920 173 | chr17_gl000205_random 174588 174 | chr5_gl383531_alt 173459 175 | chr3_jh636055_alt 173151 176 | chrUn_gl000215 172545 177 | chrUn_gl000216 172294 178 | chrUn_gl000217 172149 179 | chr3_gl383523_fix 171362 180 | chr9_gl383541_alt 
171286 181 | chr19_gl383575_alt 170222 182 | chr15_jh720445_fix 170033 183 | chr9_gl000199_random 169874 184 | chr9_jh806578_fix 169437 185 | chr12_gl383550_alt 169178 186 | chr10_gl877873_fix 168465 187 | chr18_gl383569_alt 167950 188 | chr11_jh591185_fix 167437 189 | chr12_gl877875_alt 167313 190 | chr22_jh806583_fix 167183 191 | chrUn_gl000211 166566 192 | chr12_gl383548_fix 165247 193 | chr18_gl383570_alt 164789 194 | chr4_gl383527_alt 164536 195 | chrUn_gl000213 164239 196 | chr12_gl582974_fix 163298 197 | chr9_gl383539_alt 162988 198 | chr22_gl383582_alt 162811 199 | chrUn_gl000220 161802 200 | chrUn_gl000218 161147 201 | chr18_gl383572_alt 159547 202 | chr19_gl000209_random 159169 203 | chr9_kb663605_fix 155926 204 | chr19_gl383574_alt 155864 205 | chrUn_gl000221 155397 206 | chr11_gl383547_alt 154407 207 | chr12_gl383553_alt 152874 208 | chr1_gl949741_fix 151551 209 | chr6_ke332498_fix 149443 210 | chr2_gl383521_alt 143390 211 | chr12_gl383552_alt 138655 212 | chrUn_gl000214 137718 213 | chr17_gl383564_alt 133151 214 | chrUn_gl000228 129120 215 | chr20_gl383577_alt 128385 216 | chr10_gl383544_fix 128378 217 | chrUn_gl000227 128374 218 | chr6_gl383533_alt 124736 219 | chr2_gl383522_alt 123821 220 | chr4_gl383529_alt 121345 221 | chr12_gl383549_alt 120804 222 | chr11_jh159139_fix 120441 223 | chr7_gl383534_alt 119183 224 | chr21_gl383581_alt 116690 225 | chr1_gl383519_alt 110268 226 | chr11_jh159138_fix 108875 227 | chr1_gl000191_random 106433 228 | chr18_gl383568_alt 104552 229 | chr8_jh159135_fix 102251 230 | chr5_gl383530_alt 101241 231 | chr3_jh159132_fix 100694 232 | chr16_jh720446_fix 97345 233 | chr22_gl383583_alt 96924 234 | chr2_gl582966_alt 96131 235 | chr10_jh806580_fix 93149 236 | chr19_gl000208_random 92689 237 | chr17_gl383566_alt 90219 238 | chr9_gl000198_random 90085 239 | chr16_gl383557_alt 89672 240 | chr17_jh159148_alt 88070 241 | chr5_gl383532_alt 82728 242 | chr17_gl000204_random 81310 243 | chr3_gl383524_fix 78793 244 | 
chr21_gl383580_alt 74652 245 | chr22_kb663609_alt 74013 246 | chr22_jh806585_fix 73505 247 | chr9_gl383540_alt 71551 248 | chr22_jh806584_fix 70876 249 | chr20_jh720448_fix 70483 250 | chr17_jh159147_alt 70345 251 | chr2_gl877870_fix 66021 252 | chr3_gl383525_fix 65063 253 | chrX_jh720455_fix 65034 254 | chr21_gl383578_alt 63917 255 | chr9_gl383537_fix 62435 256 | chr9_gl383542_alt 60032 257 | chr1_gl383517_fix 49352 258 | chr1_gl383516_fix 49316 259 | chr9_gl383538_fix 49281 260 | chr1_jh806575_fix 47409 261 | chrUn_gl000233 45941 262 | chrUn_gl000237 45867 263 | chr17_gl383562_fix 45551 264 | chrUn_gl000230 43691 265 | chr22_jh806586_fix 43543 266 | chrUn_gl000242 43523 267 | chrUn_gl000243 43341 268 | chrUn_gl000241 42152 269 | chrUn_gl000236 41934 270 | chrUn_gl000240 41933 271 | chr17_gl000206_random 41001 272 | chrUn_gl000232 40652 273 | chrUn_gl000234 40531 274 | chr11_gl000202_random 40103 275 | chrUn_gl000238 39939 276 | chrUn_gl000244 39929 277 | chrUn_gl000248 39786 278 | chr8_gl000196_random 38914 279 | chrUn_gl000249 38502 280 | chrUn_gl000246 38154 281 | chr17_gl000203_random 37498 282 | chr8_gl000197_random 37175 283 | chrUn_gl000245 36651 284 | chrUn_gl000247 36422 285 | chr9_gl000201_random 36148 286 | chr13_gl582975_fix 34662 287 | chrUn_gl000235 34474 288 | chrUn_gl000239 33824 289 | chr21_gl000210_random 27682 290 | chrUn_gl000231 27386 291 | chr1_jh806573_fix 24680 292 | chr1_jh806574_fix 22982 293 | chr9_jh806577_fix 22394 294 | chrUn_gl000229 19913 295 | chrM 16571 296 | chrMT 16569 297 | chrUn_gl000226 15008 298 | chr18_gl000207_random 4262 299 | -------------------------------------------------------------------------------- /chrom_info/hg38.genome: -------------------------------------------------------------------------------- 1 | chr1 248956422 2 | chr2 242193529 3 | chr3 198295559 4 | chr4 190214555 5 | chr5 181538259 6 | chr6 170805979 7 | chr7 159345973 8 | chrX 156040895 9 | chr8 145138636 10 | chr9 138394717 11 | chr11 135086622 12 
| chr10 133797422 13 | chr12 133275309 14 | chr13 114364328 15 | chr14 107043718 16 | chr15 101991189 17 | chr16 90338345 18 | chr17 83257441 19 | chr18 80373285 20 | chr20 64444167 21 | chr19 58617616 22 | chrY 57227415 23 | chr22 50818468 24 | chr21 46709983 25 | chr8_KZ208915v1_fix 6367528 26 | chr15_KI270905v1_alt 5161414 27 | chr15_KN538374v1_fix 4998962 28 | chr6_GL000256v2_alt 4929269 29 | chr6_GL000254v2_alt 4827813 30 | chr6_GL000251v2_alt 4795265 31 | chr6_GL000253v2_alt 4677643 32 | chr6_GL000250v2_alt 4672374 33 | chr6_GL000255v2_alt 4606388 34 | chr6_GL000252v2_alt 4604811 35 | chr17_KI270857v1_alt 2877074 36 | chr16_KI270853v1_alt 2659700 37 | chr15_KQ031389v1_alt 2365364 38 | chr16_KV880768v1_fix 1927115 39 | chr16_KI270728v1_random 1872759 40 | chr17_GL000258v2_alt 1821992 41 | chr5_GL339449v2_alt 1612928 42 | chr14_KI270847v1_alt 1511111 43 | chr17_KI270908v1_alt 1423190 44 | chr14_KI270846v1_alt 1351393 45 | chr5_KI270897v1_alt 1144418 46 | chr7_KI270803v1_alt 1111570 47 | chr19_GL949749v2_alt 1091841 48 | chr19_KI270938v1_alt 1066800 49 | chr19_GL949750v2_alt 1066390 50 | chr19_GL949748v2_alt 1064304 51 | chr12_KZ208916v1_fix 1046838 52 | chr19_GL949751v2_alt 1002683 53 | chr19_GL949746v1_alt 987716 54 | chr19_GL949752v1_alt 987100 55 | chr8_KI270821v1_alt 985506 56 | chr1_KI270763v1_alt 911658 57 | chr6_KI270801v1_alt 870480 58 | chr19_GL949753v2_alt 796479 59 | chr19_GL949747v2_alt 729520 60 | chr14_KZ208920v1_fix 690932 61 | chr7_KZ208913v1_alt 680662 62 | chr5_KV575244v1_fix 673059 63 | chr8_KI270822v1_alt 624492 64 | chr7_KZ208912v1_fix 589656 65 | chr4_GL000257v2_alt 586476 66 | chr12_KI270904v1_alt 572349 67 | chr4_KI270925v1_alt 555799 68 | chr1_KV880763v1_alt 551020 69 | chr12_KN538369v1_fix 541038 70 | chr2_KQ983256v1_alt 535088 71 | chr2_KQ031384v1_fix 481245 72 | chr16_KZ559113v1_fix 480415 73 | chr15_KI270852v1_alt 478999 74 | chr7_KV880765v1_fix 468267 75 | chr1_KQ031383v1_fix 467143 76 | chr1_KN538360v1_fix 460100 77 | 
chr3_KN196475v1_fix 451168 78 | chr15_KI270727v1_random 448248 79 | chr9_KI270823v1_alt 439082 80 | chr15_KI270850v1_alt 430880 81 | chr1_KI270759v1_alt 425601 82 | chr4_KV766193v1_alt 420675 83 | chr10_KN538367v1_fix 420164 84 | chr3_KN538364v1_fix 415308 85 | chr3_KV766192v1_fix 411654 86 | chr12_GL877876v1_alt 408271 87 | chr18_KQ090028v1_fix 407387 88 | chr19_KQ458386v1_fix 405389 89 | chrUn_KI270442v1 392061 90 | chr17_KI270862v1_alt 391357 91 | chr15_GL383555v2_alt 388773 92 | chr19_GL383573v1_alt 385657 93 | chr4_KI270896v1_alt 378547 94 | chr4_GL383528v1_alt 376187 95 | chr17_GL383563v3_alt 375691 96 | chr8_KI270810v1_alt 374415 97 | chr3_KQ031385v1_fix 373699 98 | chr19_KN196484v1_fix 370917 99 | chr1_GL383520v2_alt 366580 100 | chr2_KN538363v1_fix 365499 101 | chr5_KV575243v1_alt 362221 102 | chr13_KN538372v1_fix 356766 103 | chr1_KI270762v1_alt 354444 104 | chr1_KQ458383v1_alt 349938 105 | chr9_KN196479v1_fix 330164 106 | chr1_KZ208906v1_fix 330031 107 | chr15_KI270848v1_alt 327382 108 | chr17_KI270909v1_alt 325800 109 | chr14_KI270844v1_alt 322166 110 | chr6_KQ031387v1_fix 320750 111 | chr8_KI270900v1_alt 318687 112 | chr12_KQ759760v1_fix 315610 113 | chr10_GL383546v1_alt 309802 114 | chr13_KI270838v1_alt 306913 115 | chr3_KN196476v1_fix 305979 116 | chr8_KI270816v1_alt 305841 117 | chr1_KN538361v1_fix 305542 118 | chr11_KZ559108v1_fix 305244 119 | chr22_KI270879v1_alt 304135 120 | chr3_KZ559103v1_alt 302885 121 | chr11_KZ559110v1_alt 301637 122 | chr8_KI270813v1_alt 300230 123 | chr11_KI270831v1_alt 296895 124 | chr15_GL383554v1_alt 296527 125 | chr19_KV575249v1_alt 293522 126 | chr8_KI270811v1_alt 292436 127 | chr18_GL383567v1_alt 289831 128 | chrX_KI270880v1_alt 284869 129 | chr8_KI270812v1_alt 282736 130 | chr19_KI270921v1_alt 282224 131 | chr17_KV766196v1_fix 281919 132 | chr17_KI270729v1_random 280839 133 | chr11_KZ559109v1_fix 279644 134 | chr1_KQ983255v1_alt 278659 135 | chr17_JH159146v1_alt 278131 136 | chr10_KN196480v1_fix 277797 137 | 
chr17_KV766198v1_alt 276292 138 | chrX_KI270913v1_alt 274009 139 | chr6_KI270798v1_alt 271782 140 | chr7_KI270808v1_alt 271455 141 | chr6_KN196478v1_fix 268330 142 | chr16_KQ090027v1_alt 267463 143 | chr8_KV880767v1_fix 265876 144 | chr10_KQ090021v1_fix 264545 145 | chr22_KI270876v1_alt 263666 146 | chr15_KI270851v1_alt 263054 147 | chr22_KI270875v1_alt 259914 148 | chr1_KI270766v1_alt 256271 149 | chr19_KI270882v1_alt 248807 150 | chr3_KI270778v1_alt 248252 151 | chr17_KV766197v1_alt 246895 152 | chr6_KQ090016v1_fix 245716 153 | chr15_KI270849v1_alt 244917 154 | chr4_KI270786v1_alt 244096 155 | chr6_KZ208911v1_fix 242796 156 | chr19_KV575250v1_alt 241058 157 | chr12_KI270835v1_alt 238139 158 | chr4_KQ090015v1_alt 236512 159 | chr17_KI270858v1_alt 235827 160 | chr19_KI270867v1_alt 233762 161 | chr16_KI270855v1_alt 232857 162 | chr18_KZ559115v1_fix 230843 163 | chr4_KQ983257v1_fix 230434 164 | chr8_KI270926v1_alt 229282 165 | chr5_GL949742v1_alt 226852 166 | chr3_KI270780v1_alt 224108 167 | chr17_GL383565v1_alt 223995 168 | chr2_KI270774v1_alt 223625 169 | chr19_KV575256v1_alt 223118 170 | chr4_KI270790v1_alt 220246 171 | chr11_KI270927v1_alt 218612 172 | chr19_KI270932v1_alt 215732 173 | chr11_KI270903v1_alt 214625 174 | chr2_KI270894v1_alt 214158 175 | chr1_KQ458384v1_alt 212205 176 | chr12_KN196482v1_fix 211377 177 | chr14_GL000225v1_random 211173 178 | chrUn_KI270743v1 210658 179 | chr11_KI270832v1_alt 210133 180 | chr7_KI270805v1_alt 209988 181 | chrY_KZ208924v1_fix 209722 182 | chr4_GL000008v2_random 209709 183 | chr7_KI270809v1_alt 209586 184 | chr19_KI270887v1_alt 209512 185 | chr2_KN538362v1_fix 208149 186 | chr13_KN538371v1_fix 206320 187 | chr4_KI270789v1_alt 205944 188 | chr4_KQ983258v1_alt 205407 189 | chr3_KI270779v1_alt 205312 190 | chr19_KI270914v1_alt 205194 191 | chr18_KQ458385v1_alt 205101 192 | chr19_KI270886v1_alt 204239 193 | chr11_KI270829v1_alt 204059 194 | chr11_KN538368v1_alt 203552 195 | chr14_GL000009v2_random 201709 196 | 
chr21_GL383579v2_alt 201197 197 | chr11_JH159136v1_alt 200998 198 | chr19_KI270930v1_alt 200773 199 | chrUn_KI270747v1 198735 200 | chr18_GL383571v1_alt 198278 201 | chr19_KI270920v1_alt 198005 202 | chr3_KZ559102v1_alt 197752 203 | chr6_KI270797v1_alt 197536 204 | chr3_KI270935v1_alt 197351 205 | chr11_KQ759759v1_fix 196940 206 | chr17_KI270861v1_alt 196688 207 | chr15_KI270906v1_alt 196384 208 | chr5_KI270791v1_alt 195710 209 | chr3_KZ559105v1_alt 195063 210 | chr14_KI270722v1_random 194050 211 | chr16_GL383556v1_alt 192462 212 | chr13_KI270840v1_alt 191684 213 | chr14_GL000194v1_random 191469 214 | chr11_JH159137v1_alt 191409 215 | chr19_KI270917v1_alt 190932 216 | chr7_KI270899v1_alt 190869 217 | chr19_KI270923v1_alt 189352 218 | chr10_KI270825v1_alt 188315 219 | chr19_GL383576v1_alt 188024 220 | chrX_KV766199v1_alt 188004 221 | chr19_KI270922v1_alt 187935 222 | chrUn_KI270742v1 186739 223 | chr1_KN196472v1_fix 186494 224 | chr22_KI270878v1_alt 186262 225 | chr19_KI270929v1_alt 186203 226 | chr11_KI270826v1_alt 186169 227 | chr6_KB021644v2_alt 185823 228 | chr17_GL000205v2_random 185591 229 | chr10_KQ090020v1_alt 185507 230 | chr1_KI270765v1_alt 185285 231 | chr19_KI270916v1_alt 184516 232 | chr19_KI270890v1_alt 184499 233 | chr3_KI270784v1_alt 184404 234 | chr12_GL383551v1_alt 184319 235 | chr20_KI270870v1_alt 183433 236 | chrUn_GL000195v1 182896 237 | chr1_GL383518v1_alt 182439 238 | chr11_KQ090022v1_fix 181958 239 | chr22_KI270736v1_random 181920 240 | chr2_KZ208907v1_alt 181658 241 | chr10_KI270824v1_alt 181496 242 | chr11_KZ559111v1_alt 181167 243 | chr14_KI270845v1_alt 180703 244 | chr3_GL383526v1_alt 180671 245 | chr13_KI270839v1_alt 180306 246 | chr7_KQ031388v1_fix 179932 247 | chr22_KI270733v1_random 179772 248 | chrUn_GL000224v1 179693 249 | chr10_GL383545v1_alt 179254 250 | chrUn_GL000219v1 179198 251 | chr5_KI270792v1_alt 179043 252 | chr17_KI270860v1_alt 178921 253 | chr19_KV575252v1_alt 178197 254 | chr19_GL000209v2_alt 177381 255 | 
chr11_KI270830v1_alt 177092 256 | chr9_KI270719v1_random 176845 257 | chrUn_GL000216v2 176608 258 | chr22_KI270928v1_alt 176103 259 | chr1_KI270712v1_random 176043 260 | chr3_KZ208909v1_alt 175849 261 | chr6_KI270800v1_alt 175808 262 | chr1_KI270706v1_random 175055 263 | chr12_KZ208918v1_alt 174808 264 | chr22_KQ458388v1_alt 174749 265 | chr2_KI270776v1_alt 174166 266 | chr18_KI270912v1_alt 174061 267 | chr3_KI270777v1_alt 173649 268 | chr5_GL383531v1_alt 173459 269 | chr3_JH636055v2_alt 173151 270 | chr14_KI270725v1_random 172810 271 | chr5_KI270796v1_alt 172708 272 | chr7_KZ559106v1_alt 172555 273 | chr14_KZ208919v1_alt 171798 274 | chr9_GL383541v1_alt 171286 275 | chr19_KV575259v1_alt 171263 276 | chr19_KI270885v1_alt 171027 277 | chr19_KI270919v1_alt 170701 278 | chr19_KI270889v1_alt 170698 279 | chr19_KI270891v1_alt 170680 280 | chr19_KI270915v1_alt 170665 281 | chr19_KI270933v1_alt 170537 282 | chr19_KI270883v1_alt 170399 283 | chr19_GL383575v2_alt 170222 284 | chr19_KV575247v1_alt 170206 285 | chr19_KI270931v1_alt 170148 286 | chr12_GL383550v2_alt 169178 287 | chr16_KQ031390v1_alt 169136 288 | chr13_KI270841v1_alt 169134 289 | chrUn_KI270744v1 168472 290 | chr13_KQ090024v1_alt 168146 291 | chr19_KV575248v1_alt 168131 292 | chr18_KI270863v1_alt 167999 293 | chr18_GL383569v1_alt 167950 294 | chr12_GL877875v1_alt 167313 295 | chr21_KI270874v1_alt 166743 296 | chr19_KV575253v1_alt 166713 297 | chr3_KI270924v1_alt 166540 298 | chr1_KN196473v1_fix 166200 299 | chr1_KZ208904v1_alt 166136 300 | chr1_KI270761v1_alt 165834 301 | chr3_KQ031386v1_fix 165718 302 | chr3_KI270937v1_alt 165607 303 | chr8_KZ208914v1_fix 165120 304 | chr22_KI270734v1_random 165050 305 | chr18_GL383570v1_alt 164789 306 | chr5_KI270794v1_alt 164558 307 | chr4_GL383527v1_alt 164536 308 | chrUn_GL000213v1 164239 309 | chr3_KI270936v1_alt 164170 310 | chr3_KZ559101v1_alt 164041 311 | chr19_KV575246v1_alt 163926 312 | chr9_KQ090018v1_alt 163882 313 | chr4_KQ090014v1_alt 163749 314 | 
chr3_KI270934v1_alt 163458 315 | chr18_KZ559116v1_alt 163186 316 | chr9_GL383539v1_alt 162988 317 | chr3_KI270895v1_alt 162896 318 | chr22_GL383582v2_alt 162811 319 | chr3_KI270782v1_alt 162429 320 | chr1_KI270892v1_alt 162212 321 | chrUn_GL000220v1 161802 322 | chr2_KI270767v1_alt 161578 323 | chr2_KI270715v1_random 161471 324 | chr2_KI270893v1_alt 161218 325 | chrUn_GL000218v1 161147 326 | chr19_KV575255v1_alt 161095 327 | chr18_GL383572v1_alt 159547 328 | chr19_KV575251v1_alt 159285 329 | chr8_KI270817v1_alt 158983 330 | chr4_KI270788v1_alt 158965 331 | chrUn_KI270749v1 158759 332 | chr7_KI270806v1_alt 158166 333 | chr7_KI270804v1_alt 157952 334 | chr18_KI270911v1_alt 157710 335 | chrUn_KI270741v1 157432 336 | chr17_KI270910v1_alt 157099 337 | chr19_KI270884v1_alt 157053 338 | chr8_KV880766v1_fix 156998 339 | chr19_KV575258v1_alt 156965 340 | chr22_KN196485v1_alt 156562 341 | chr22_KQ458387v1_alt 155930 342 | chr19_GL383574v1_alt 155864 343 | chr19_KI270888v1_alt 155532 344 | chr3_GL000221v1_random 155397 345 | chr17_KV575245v1_fix 154723 346 | chr11_GL383547v1_alt 154407 347 | chr12_KZ559112v1_alt 154139 348 | chr2_KI270716v1_random 153799 349 | chr22_KN196486v1_alt 153027 350 | chr12_GL383553v2_alt 152874 351 | chr6_KI270799v1_alt 152148 352 | chr22_KI270731v1_random 150754 353 | chrUn_KI270751v1 150742 354 | chrUn_KI270750v1 148850 355 | chr13_KN538373v1_fix 148762 356 | chr19_KV575260v1_alt 145691 357 | chr8_KI270818v1_alt 145606 358 | chr22_KQ759761v1_alt 145162 359 | chrX_KI270881v1_alt 144206 360 | chr21_KI270873v1_alt 143900 361 | chr2_GL383521v1_alt 143390 362 | chr7_KV880764v1_fix 142129 363 | chr8_KI270814v1_alt 141812 364 | chr1_KQ458382v1_alt 141019 365 | chr11_KV766195v1_fix 140877 366 | chr2_KZ208908v1_alt 140361 367 | chr1_KZ208905v1_alt 140355 368 | chr6_KV766194v1_fix 139427 369 | chr5_KN196477v1_alt 139087 370 | chr12_GL383552v1_alt 138655 371 | chrUn_KI270519v1 138126 372 | chr2_KI270775v1_alt 138019 373 | chr17_KI270907v1_alt 137721 374 | 
chrUn_GL000214v1 137718 375 | chr8_KI270901v1_alt 136959 376 | chr2_KI270770v1_alt 136240 377 | chr5_KZ208910v1_alt 135987 378 | chr16_KI270854v1_alt 134193 379 | chr9_KQ090019v1_alt 134099 380 | chr8_KI270819v1_alt 133535 381 | chr17_GL383564v2_alt 133151 382 | chr2_KI270772v1_alt 133041 383 | chr8_KI270815v1_alt 132244 384 | chr5_KI270795v1_alt 131892 385 | chr5_KI270898v1_alt 130957 386 | chr20_GL383577v2_alt 128386 387 | chr1_KI270708v1_random 127682 388 | chr7_KI270807v1_alt 126434 389 | chr5_KI270793v1_alt 126136 390 | chr6_GL383533v1_alt 124736 391 | chr2_GL383522v1_alt 123821 392 | chr13_KQ090025v1_alt 123480 393 | chr19_KI270918v1_alt 123111 394 | chr1_KN196474v1_fix 122022 395 | chr12_GL383549v1_alt 120804 396 | chr2_KI270769v1_alt 120616 397 | chr4_KI270785v1_alt 119912 398 | chr12_KI270834v1_alt 119498 399 | chr7_GL383534v2_alt 119183 400 | chr20_KI270869v1_alt 118774 401 | chr17_KZ559114v1_alt 116753 402 | chr21_GL383581v2_alt 116689 403 | chr3_KI270781v1_alt 113034 404 | chr17_KI270730v1_random 112551 405 | chrUn_KI270438v1 112505 406 | chr4_KI270787v1_alt 111943 407 | chr18_KI270864v1_alt 111737 408 | chr2_KI270771v1_alt 110395 409 | chr1_GL383519v1_alt 110268 410 | chr2_KI270768v1_alt 110099 411 | chr1_KI270760v1_alt 109528 412 | chr12_KQ090023v1_alt 109323 413 | chr3_KI270783v1_alt 109187 414 | chr11_KN196481v1_fix 108875 415 | chr17_KI270859v1_alt 108763 416 | chr11_KI270902v1_alt 106711 417 | chr3_KZ559104v1_fix 105527 418 | chr18_GL383568v1_alt 104552 419 | chr22_KI270737v1_random 103838 420 | chr13_KI270843v1_alt 103832 421 | chr8_KZ559107v1_alt 103072 422 | chr22_KI270877v1_alt 101331 423 | chr5_GL383530v1_alt 101241 424 | chrY_KN196487v1_fix 101150 425 | chr22_KQ759762v1_fix 101037 426 | chr19_KV575257v1_alt 100553 427 | chr11_KI270721v1_random 100316 428 | chr19_KV575254v1_alt 99845 429 | chr22_KI270738v1_random 99375 430 | chr22_GL383583v2_alt 96924 431 | chr2_GL582966v2_alt 96131 432 | chrUn_KI270748v1 93321 433 | chr18_KZ208922v1_fix 
93070 434 | chrUn_KI270435v1 92983 435 | chr5_GL000208v1_random 92689 436 | chrUn_KI270538v1 91309 437 | chr4_KQ090013v1_alt 90922 438 | chr17_GL383566v1_alt 90219 439 | chr16_GL383557v1_alt 89672 440 | chr17_JH159148v1_alt 88070 441 | chr12_KN538370v1_fix 86533 442 | chr10_KN538366v1_fix 85284 443 | chr5_GL383532v1_alt 82728 444 | chr21_KI270872v1_alt 82692 445 | chr6_KQ090017v1_alt 82315 446 | chrUn_KI270756v1 79590 447 | chr16_KZ208921v1_alt 78609 448 | chr6_KI270758v1_alt 76752 449 | chr12_KI270833v1_alt 76061 450 | chr6_KI270802v1_alt 75005 451 | chr21_GL383580v2_alt 74653 452 | chr22_KB663609v1_alt 74013 453 | chr22_KI270739v1_random 73985 454 | chr9_GL383540v1_alt 71551 455 | chrUn_KI270757v1 71251 456 | chr2_KI270773v1_alt 70887 457 | chr17_JH159147v1_alt 70345 458 | chr11_KI270827v1_alt 67707 459 | chr1_KI270709v1_random 66860 460 | chrUn_KI270746v1 66486 461 | chr12_KZ208917v1_fix 64689 462 | chr16_KI270856v1_alt 63982 463 | chr21_GL383578v2_alt 63917 464 | chrUn_KI270753v1 62944 465 | chr19_KI270868v1_alt 61734 466 | chr9_GL383542v1_alt 60032 467 | chr16_KQ090026v1_alt 59016 468 | chr20_KI270871v1_alt 58661 469 | chr12_KI270836v1_alt 56134 470 | chr19_KI270865v1_alt 52969 471 | chr1_KI270764v1_alt 50258 472 | chrY_KZ208923v1_fix 48370 473 | chr1_KZ559100v1_fix 44955 474 | chrUn_KI270589v1 44474 475 | chr14_KI270726v1_random 43739 476 | chr19_KI270866v1_alt 43156 477 | chr22_KI270735v1_random 42811 478 | chr1_KI270711v1_random 42210 479 | chrUn_KI270745v1 41891 480 | chr1_KI270714v1_random 41717 481 | chr22_KI270732v1_random 41543 482 | chr1_KI270713v1_random 40745 483 | chrUn_KI270754v1 40191 484 | chr1_KI270710v1_random 40176 485 | chr12_KI270837v1_alt 40090 486 | chr9_KI270717v1_random 40062 487 | chr14_KI270724v1_random 39555 488 | chr9_KI270720v1_random 39050 489 | chr14_KI270723v1_random 38115 490 | chr9_KI270718v1_random 38054 491 | chrUn_KI270317v1 37690 492 | chr13_KI270842v1_alt 37287 493 | chrY_KI270740v1_random 37240 494 | chrUn_KI270755v1 
36723 495 | chr8_KI270820v1_alt 36640 496 | chr13_KN196483v1_fix 35455 497 | chr1_KI270707v1_random 32032 498 | chrUn_KI270579v1 31033 499 | chrUn_KI270752v1 27745 500 | chrUn_KI270512v1 22689 501 | chrUn_KI270322v1 21476 502 | chrM 16569 503 | chrUn_GL000226v1 15008 504 | chr10_KN538365v1_fix 14347 505 | chrUn_KI270311v1 12399 506 | chrUn_KI270366v1 8320 507 | chrUn_KI270511v1 8127 508 | chrUn_KI270448v1 7992 509 | chrUn_KI270521v1 7642 510 | chrUn_KI270581v1 7046 511 | chrUn_KI270582v1 6504 512 | chrUn_KI270515v1 6361 513 | chrUn_KI270588v1 6158 514 | chrUn_KI270591v1 5796 515 | chrUn_KI270522v1 5674 516 | chrUn_KI270507v1 5353 517 | chrUn_KI270590v1 4685 518 | chrUn_KI270584v1 4513 519 | chrUn_KI270320v1 4416 520 | chrUn_KI270382v1 4215 521 | chrUn_KI270468v1 4055 522 | chrUn_KI270467v1 3920 523 | chrUn_KI270362v1 3530 524 | chrUn_KI270517v1 3253 525 | chrUn_KI270593v1 3041 526 | chrUn_KI270528v1 2983 527 | chrUn_KI270587v1 2969 528 | chrUn_KI270364v1 2855 529 | chrUn_KI270371v1 2805 530 | chrUn_KI270333v1 2699 531 | chrUn_KI270374v1 2656 532 | chrUn_KI270411v1 2646 533 | chrUn_KI270414v1 2489 534 | chrUn_KI270510v1 2415 535 | chrUn_KI270390v1 2387 536 | chrUn_KI270375v1 2378 537 | chrUn_KI270420v1 2321 538 | chrUn_KI270509v1 2318 539 | chrUn_KI270315v1 2276 540 | chrUn_KI270302v1 2274 541 | chrUn_KI270518v1 2186 542 | chrUn_KI270530v1 2168 543 | chrUn_KI270304v1 2165 544 | chrUn_KI270418v1 2145 545 | chrUn_KI270424v1 2140 546 | chrUn_KI270417v1 2043 547 | chrUn_KI270508v1 1951 548 | chrUn_KI270303v1 1942 549 | chrUn_KI270381v1 1930 550 | chrUn_KI270529v1 1899 551 | chrUn_KI270425v1 1884 552 | chrUn_KI270396v1 1880 553 | chrUn_KI270363v1 1803 554 | chrUn_KI270386v1 1788 555 | chrUn_KI270465v1 1774 556 | chrUn_KI270383v1 1750 557 | chrUn_KI270384v1 1658 558 | chrUn_KI270330v1 1652 559 | chrUn_KI270372v1 1650 560 | chrUn_KI270548v1 1599 561 | chrUn_KI270580v1 1553 562 | chrUn_KI270387v1 1537 563 | chrUn_KI270391v1 1484 564 | chrUn_KI270305v1 1472 565 | 
chrUn_KI270373v1 1451 566 | chrUn_KI270422v1 1445 567 | chrUn_KI270316v1 1444 568 | chrUn_KI270338v1 1428 569 | chrUn_KI270340v1 1428 570 | chrUn_KI270583v1 1400 571 | chrUn_KI270334v1 1368 572 | chrUn_KI270429v1 1361 573 | chrUn_KI270393v1 1308 574 | chrUn_KI270516v1 1300 575 | chrUn_KI270389v1 1298 576 | chrUn_KI270466v1 1233 577 | chrUn_KI270388v1 1216 578 | chrUn_KI270544v1 1202 579 | chrUn_KI270310v1 1201 580 | chrUn_KI270412v1 1179 581 | chrUn_KI270395v1 1143 582 | chrUn_KI270376v1 1136 583 | chrUn_KI270337v1 1121 584 | chrUn_KI270335v1 1048 585 | chrUn_KI270378v1 1048 586 | chrUn_KI270379v1 1045 587 | chrUn_KI270329v1 1040 588 | chrUn_KI270419v1 1029 589 | chrUn_KI270336v1 1026 590 | chrUn_KI270312v1 998 591 | chrUn_KI270539v1 993 592 | chrUn_KI270385v1 990 593 | chrUn_KI270423v1 981 594 | chrUn_KI270392v1 971 595 | chrUn_KI270394v1 970 596 | --------------------------------------------------------------------------------