├── README.md ├── haplotype-block-assignment ├── README.md ├── nucmer_all_chromosomes_chromosome_level_assemblies_wheat.sh ├── nucmer_all_chromosomes_scaffold_level_assemblies_wheat.sh ├── get_ref_coords.r ├── combine_mummer_and_BLAST.r ├── assign_BLAST_blocks_whole_genome.r └── assign_mummer_blocks_whole_genome.r ├── pairwise-blast └── convert_gene_blocks.sh ├── IBD-regions-stats ├── slice_block_stats.sh ├── regions-without-IBD.Rmd └── regions-without-IBD-validate.Rmd ├── .gitignore ├── LICENSE ├── sequence_complexity ├── mummer_show_snps_all_aln_per_chr.sh ├── calculate_snp_distribution_6A_CS.r ├── calculate_sequence_complexity.rmd └── calculate_seq_complexity_stats.r ├── figures ├── plot_extended_data_figure_3.rmd ├── plot_extended_data_figure_8.rmd ├── plot_extended_data_figure_5.rmd ├── plot_extended_data_figure_6.rmd ├── plot_extended_data_figure_7.rmd ├── plot_extended_data_figure_1.Rmd ├── figure_1_plots.rmd └── figure_4_plots.rmd └── precision-recall-analysis ├── calculate_combined_precision_recall.r └── calculate_BLAST_blocks_all_window_sizes_flanking_sequences.r /README.md: -------------------------------------------------------------------------------- 1 | # pangenome-haplotypes 2 | Scripts to do haplotype analysis on pan genomes. 3 | -------------------------------------------------------------------------------- /haplotype-block-assignment/README.md: -------------------------------------------------------------------------------- 1 | # haplotype-block-assignment 2 | Scripts to assign haplotype blocks based on whole chromosome mummer alignments and pairwise BLAST alignments based on gene projections. 3 | -------------------------------------------------------------------------------- /pairwise-blast/convert_gene_blocks.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cd pangenome-web 4 | 5 | input_folder=blocks 6 | 7 | for chr in `ls $input_folder | grep chr` ; do 8 | 9 | input="${input_folder}/${chr}/haplotype_blocks_BLAST_25_gene_window_2000bp.txt" 10 | output="${input_folder}/${chr}/haplotype_blocks_BLAST_25_gene_window_2000bp_converted_${chr}_10genesBlocks.csv" 11 | echo $chr 12 | rake "haplotypes:convert_gene_coordinates[${input},${output},Wheat]" 13 | 14 | done 15 | 16 | -------------------------------------------------------------------------------- /IBD-regions-stats/slice_block_stats.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | 3 | cd pangenome-web 4 | 5 | output_folder=block_stats_by_slice_500kbp/ 6 | mkdir -p $output_folder 7 | folder=regions 8 | regions_path=$folder/cs_regions.bed 9 | block_sizes='Tae_5000kbp Tae_2500kbp Tae_1000kbp ' 10 | slice_size=500000 11 | 12 | for f in $block_sizes ;do 13 | echo $f 14 | output="${output_folder}/Block_slice500k_stats_${f}.tsv" 15 | echo $output 16 | rake "haplotypes:export_haplotype_block_stats_in_points[${output},${f},Wheat,$slice_size]" & 17 | done 18 | 19 | wait 20 | 21 | for f in $block_sizes ;do 22 | echo $f 23 | output="${output_folder}/Block_slice500k_stats_${f}.tsv" 24 | echo $output 25 | gzip -f $output & 26 | gzip -f $output.missing & 27 | done -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # History files 2 | .Rhistory 3 | .Rapp.history 4 | 5 | # Session Data files 6 | .RData 7 | 8 | # User-specific files 9 | .Ruserdata 10 | 11 | # Example code in package build process 12 | *-Ex.R 13 | 14 | # Output files from R CMD build 15 | /*.tar.gz 16 | 17 | # Output files from R CMD check 18 | /*.Rcheck/ 19 | 20 | # RStudio files 21 | .Rproj.user/ 22 | 23 | # produced vignettes 24 | vignettes/*.html 25 | vignettes/*.pdf 26 | 27 | # OAuth2 token, see https://github.com/hadley/httr/releases/tag/v0.3 28 | .httr-oauth 29 | 30 | # knitr and R markdown default cache directories 31 | *_cache/ 32 | /cache/ 33 | 34 | # Temporary files created by R markdown 35 | *.utf8.md 36 | *.knit.md 37 | 38 | # R Environment Variables 39 | .Renviron 40 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Uauy-Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /sequence_complexity/mummer_show_snps_all_aln_per_chr.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -e 2 | #SBATCH --mem=10000 3 | #SBATCH --mail-type=END,FAIL 4 | #SBATCH -N 1 5 | #SBATCH -n 1 6 | #SBATCH -p jic-short,RG-Cristobal-Uauy 7 | #SBATCH --array=0-255 8 | #SBATCH -o ./run_logs/show_snps.%a.%N.%j.out 9 | #SBATCH -e ./run_logs/show_snps.%a.%N.%j.err 10 | 11 | #DEFINE CHROMOSOME 12 | CHR_NAME="2B" 13 | 14 | t_id=$SLURM_ARRAY_TASK_ID 15 | 16 | x=$(($t_id / 15)) 17 | q_x=$(($t_id % 15)) 18 | 19 | 20 | declare -a names=("arinalrfor" "chinese" "jagger" "julius" "lancer" "landmark" "mace" "norin61" "stanley" "sy_mattis" "cadenza" "claire" "paragon" "robigus" "weebil") 21 | 22 | source mummer-3.23 23 | 24 | 25 | echo $t_id 26 | echo $x 27 | echo $q_x 28 | echo $c_x 29 | 30 | 31 | REF_NAME=${names[$x]} 32 | QUERY_NAME=${names[$q_x]} 33 | 34 | 35 | echo $REF_NAME 36 | echo $QUERY_NAME 37 | echo $CHR_NAME 38 | 39 | ALN_DIR='/jic/scratch/groups/Cristobal-Uauy/brintonj/haplotype/whole_genome_mummer/aln/'$REF_NAME/$CHR_NAME 40 | ALN_ID=$REF_NAME'_v_'$QUERY_NAME'.all_'$CHR_NAME'_filtered_L20Kb_rq' 41 | 42 | OUTPUT_DIR='/jic/scratch/groups/Cristobal-Uauy/brintonj/haplotype/whole_genome_mummer/surrounding_seq/'$CHR_NAME 43 | 44 | mkdir -p $OUTPUT_DIR 45 | 46 | if [ "$REF_NAME" == "$QUERY_NAME" ] 47 | then 48 | echo "Reference and Query are the same" 49 | else 50 | echo "Reference and Query are different, continuing with mummer alignments" 51 | #extract the 10bp up and downstream of each snp 52 | show-snps -C -T -x 10 $ALN_DIR/$ALN_ID.delta > $OUTPUT_DIR/$ALN_ID'_10bpFlank.snps' 53 | fi 54 | 55 | -------------------------------------------------------------------------------- /figures/plot_extended_data_figure_3.rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Plot precision recall graphs" 3 | author: "Jemima Brinton" 4 | date: "17/05/2020" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | 11 | library(ggplot2) 12 | library(RColorBrewer) 13 | ``` 14 | 15 | ```{r} 16 | base_dir <- "C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/precision_recall/" 17 | plot_dir <- paste0(base_dir, "plots/") 18 | 19 | combined_data <- read.table(file = paste0(plot_dir, "combined_precision_recall_f1_table.tsv"), 20 | sep = "\t", 21 | header = TRUE, 22 | stringsAsFactors =FALSE) 23 | combined_data$gene_block <- factor(combined_data$gene_block) 24 | combined_data$window_size <- factor(combined_data$window_size, levels = c("cdsbp", "0bp", "1000bp", "2000bp", "5000bp")) 25 | 26 | precision <- ggplot(combined_data, aes(x = window_size, y = precision, fill = gene_block)) + 27 | geom_boxplot(outlier.size = 0.7) + 28 | scale_fill_brewer(palette = "Set1") + 29 | theme_bw() + 30 | xlab("Flanking Sequence") + 31 | ylab("Precision") 32 | 33 | precision 34 | ``` 35 | ```{r} 36 | recall <- ggplot(combined_data, aes(x = window_size, y = recall, fill = gene_block)) + 37 | geom_boxplot(outlier.size = 0.7) + 38 | scale_fill_brewer(palette = "Set1") + 39 | theme_bw() + 40 | xlab("Flanking Sequence") + 41 | ylab("Recall") 42 | 43 | recall 44 | ``` 45 | 46 | ```{r} 47 | f1 <- ggplot(combined_data, aes(x = window_size, y = f1_score, fill = gene_block)) + 48 | geom_boxplot(outlier.size = 0.7) + 49 | scale_fill_brewer(palette = "Set1") + 50 | theme_bw() + 51 | xlab("Flanking Sequence") + 52 | ylab("F1 score") 53 | 54 | f1 55 | ``` 56 | 57 | ```{r} 58 | height = 4 59 | width = 5.5 60 | pdf("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/extended_data/precision_combined.pdf", height = height, width = width) 61 | precision 62 | dev.off() 63 | 64 | pdf("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/extended_data/recall_combined.pdf", height = height, width = width) 65 | recall 66 | dev.off() 67 | 68 | pdf("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/extended_data/f1_combined.pdf", height = height, width = width) 69 | f1 70 | dev.off() 71 | 72 | 73 | ``` 74 | -------------------------------------------------------------------------------- /haplotype-block-assignment/nucmer_all_chromosomes_chromosome_level_assemblies_wheat.sh: -------------------------------------------------------------------------------- 1 | ## this will be just to write and submit the scripts for each chromosome 2 | 3 | cd /nbi/group-data/ifs/NBI/Cristobal-Uauy/Jemima/haplotype/mummer_whole_genome 4 | 5 | chr_names=("1A" "1B" "1D" "2A" "2B" "2D" "3A" "3B" "3D" "4A" "4B" "4D" "5A" "5B" "5D" "6A" "6B" "6D" "7A" "7B" "7D") 6 | 7 | 8 | for CHR_NAME in ${chr_names[*]} 9 | do 10 | echo $CHR_NAME 11 | echo \ 12 | '#!/bin/bash -e 13 | #SBATCH --mem=30G 14 | #SBATCH --mail-type=END,FAIL 15 | #SBATCH -N 1 16 | #SBATCH -n 1 17 | #SBATCH -p RG-Cristobal-Uauy,jic-long 18 | #SBATCH --array=0-99 19 | #SBATCH -o ./run_logs/run_nucmer.%a.%N.%j.out 20 | #SBATCH -e ./run_logs/run_nucmer.%a.%N.%j.err 21 | 22 | 23 | 24 | t_id=$SLURM_ARRAY_TASK_ID 25 | 26 | x=$(($t_id / 10)) 27 | q_x=$(($t_id % 10)) 28 | 29 | 30 | declare -a names=("arinalrfor" "chinese" "jagger" "julius" "lancer" "landmark" "mace" "norin61" "stanley" "sy_mattis") 31 | 32 | source mummer-3.23 33 | 34 | 35 | echo $t_id 36 | echo $x 37 | echo $q_x 38 | echo $c_x 39 | 40 | 41 | REF_NAME=${names[$x]} 42 | QUERY_NAME=${names[$q_x]} 43 | 44 | 45 | echo $REF_NAME 46 | echo $QUERY_NAME 47 | echo '$CHR_NAME' 48 | 49 | 50 | ASSEMBLIES_PATH="/nbi/group-data/ifs/NBI/Cristobal-Uauy/ramirezr/Pangenomes/Bn" 51 | OUTPUT_DIR="/jic/scratch/groups/Cristobal-Uauy/brintonj/haplotype/whole_genome_mummer/aln/$REF_NAME/'$CHR_NAME'" 52 | CHROM_DIR="/jic/scratch/groups/Cristobal-Uauy/brintonj/haplotype/whole_genome_mummer/fasta/'$CHR_NAME'" 53 | 54 | mkdir -p $OUTPUT_DIR 55 | 56 | if [ "$REF_NAME" == "$QUERY_NAME" ] 57 | then 58 | echo "Reference and Query are the same" 59 | else 60 | echo "Reference and Query are different, continuing with mummer alignments" 61 | #run nucmer 62 | nucmer --mum --delta $CHROM_DIR/$REF_NAME.'$CHR_NAME'.fa $CHROM_DIR/$QUERY_NAME.'$CHR_NAME'.fa --prefix $OUTPUT_DIR/$REF_NAME.v.$QUERY_NAME.'$CHR_NAME' 63 | #filter for alignments of at least 20kb and rq 64 | delta-filter -l 20000 -r -q $OUTPUT_DIR/$REF_NAME.v.$QUERY_NAME.'$CHR_NAME'.delta > $OUTPUT_DIR/$REF_NAME.v.$QUERY_NAME.'$CHR_NAME'.filtered_L20Kb_rq.delta 65 | fi 66 | ' > /nbi/group-data/ifs/NBI/Cristobal-Uauy/Jemima/haplotype/mummer_whole_genome/scripts/chrom_aln_scripts/nucmer_aln_$CHR_NAME.sh 67 | 68 | sbatch /nbi/group-data/ifs/NBI/Cristobal-Uauy/Jemima/haplotype/mummer_whole_genome/scripts/chrom_aln_scripts/nucmer_aln_$CHR_NAME.sh 69 | done -------------------------------------------------------------------------------- /haplotype-block-assignment/nucmer_all_chromosomes_scaffold_level_assemblies_wheat.sh: -------------------------------------------------------------------------------- 1 | ## this will be just to write and submit the scripts for each chromosome 2 | 3 | cd /nbi/group-data/ifs/NBI/Cristobal-Uauy/Jemima/haplotype/mummer_whole_genome 4 | 5 | chr_names=("1A" "1B" "1D" "2A" "2B" "2D" "3A" "3B" "3D" "4A" "4B" "4D" "5A" "5B" "5D" "6A" "6B" "6D" "7A" "7B" "7D") 6 | 7 | 8 | for CHR_NAME in ${chr_names[*]} 9 | do 10 | echo $CHR_NAME 11 | echo \ 12 | '#!/bin/bash -e 13 | #SBATCH --mem=30G 14 | #SBATCH --mail-type=END,FAIL 15 | #SBATCH -N 1 16 | #SBATCH -n 1 17 | #SBATCH -p RG-Cristobal-Uauy,jic-long 18 | #SBATCH --array=0-49 19 | #SBATCH -o ./run_logs/run_nucmer_EI.%a.%N.%j.out 20 | #SBATCH -e ./run_logs/run_nucmer_EI.%a.%N.%j.err 21 | 22 | t_id=$SLURM_ARRAY_TASK_ID 23 | 24 | x=$(($t_id % 10)) 25 | q_x=$(($t_id / 10)) 26 | 27 | 28 | declare -a names=("arinalrfor" "chinese" "jagger" "julius" "lancer" "landmark" "mace" "norin61" "stanley" "sy_mattis") 29 | declare -a query_names=("cadenza" "claire" "paragon" "robigus" "weebil") 30 | 31 | source mummer-3.23 32 | 33 | 34 | echo $t_id 35 | echo $x 36 | echo $q_x 37 | 38 | 39 | REF_NAME=${names[$x]} 40 | QUERY_NAME=${query_names[$q_x]} 41 | 42 | 43 | echo $REF_NAME 44 | echo $QUERY_NAME 45 | echo $CHR_NAME 46 | 47 | OUTPUT_DIR="/jic/scratch/groups/Cristobal-Uauy/brintonj/haplotype/whole_genome_mummer/aln/'$REF_NAME/$CHR_NAME'" 48 | CHROM_DIR="/jic/scratch/groups/Cristobal-Uauy/brintonj/haplotype/whole_genome_mummer/fasta/'$CHR_NAME'" 49 | 50 | mkdir -p $OUTPUT_DIR 51 | 52 | if [ "$REF_NAME" == "$QUERY_NAME" ] 53 | then 54 | echo "Reference and Query are the same" 55 | else 56 | echo "Reference and Query are different, continuing with mummer alignments" 57 | #run nucmer 58 | nucmer --mum --delta $CHROM_DIR/$REF_NAME'.chr'$CHR_NAME'.fa' $CHROM_DIR/$QUERY_NAME'.'$CHR_NAME'_scaffs_projections.fa' --prefix $OUTPUT_DIR/$REF_NAME'_v_'$QUERY_NAME'.'$CHR_NAME 59 | #filter for alignments of at least 20kb and rq 60 | delta-filter -l 20000 -r -q $OUTPUT_DIR/$REF_NAME'_v_'$QUERY_NAME'.'$CHR_NAME'.delta' > $OUTPUT_DIR/$REF_NAME'_v_'$QUERY_NAME'.'$CHR_NAME'_filtered_L20Kb_rq.delta' 61 | fi 62 | ' > /nbi/group-data/ifs/NBI/Cristobal-Uauy/Jemima/haplotype/mummer_whole_genome/scripts/chrom_aln_scripts/nucmer_aln_EI_$CHR_NAME.sh 63 | 64 | sbatch /nbi/group-data/ifs/NBI/Cristobal-Uauy/Jemima/haplotype/mummer_whole_genome/scripts/chrom_aln_scripts/nucmer_aln_EI_$CHR_NAME.sh 65 | done -------------------------------------------------------------------------------- /sequence_complexity/calculate_snp_distribution_6A_CS.r: -------------------------------------------------------------------------------- 1 | 2 | 3 | library(stringr) 4 | library(ggplot2) 5 | 6 | 7 | #Compare all the snps against chinese spring on 6A 8 | #Takes output from mummer show-snps 9 | 10 | 11 | sequence_dir <- "X:/brintonj/haplotype/whole_genome_mummer/surrounding_seq/6A/" 12 | output_dir <- "X:/brintonj/haplotype/whole_genome_mummer/surrounding_seq/" 13 | 14 | ref <- "chinese" 15 | 16 | sequence_files <- list.files(sequence_dir, pattern = paste0(ref, "_v_")) 17 | sequence_files 18 | 19 | snps_positions_combined <- integer() 20 | 21 | for (file in sequence_files){ 22 | 23 | ref_query_temp <- str_split_fixed(file, "\\.", 2)[1] 24 | 25 | query <- str_split_fixed(ref_query_temp, "_", 3)[3] 26 | 27 | 28 | if(query == "sy"){ 29 | query <- "sy_mattis" 30 | } 31 | 32 | snp_flanks <- read.table(file = paste0(sequence_dir, file), 33 | sep = "\t", 34 | skip = 4, 35 | header = FALSE, 36 | stringsAsFactors = FALSE) 37 | 38 | colnames(snp_flanks) <- c("P1", "SUB1", "SUB2", "P2", "BUFF", "DIST", "REF_SEQ", "QUERY_SEQ", "FRM", "TAGS", "REF", "QUERY") 39 | 40 | #remove any snps that are Ns and also get rid of all irrelevant info - i.e. only keep chinese spring position and alleles 41 | snp_flanks_noN <- snp_flanks[!((snp_flanks$SUB1 == "N") | (snp_flanks$SUB2 == "N")), c("P1", "SUB1", "SUB2")] 42 | nrow(snp_flanks_noN) 43 | 44 | #also remove any where the alleles are "." i.e. an indel 45 | snps_only <- snp_flanks_noN[!((snp_flanks_noN$SUB1 == ".") | (snp_flanks_noN$SUB2 == ".")),] 46 | nrow(snps_only) 47 | 48 | snp_positions <- unique(snps_only$P1) 49 | length(snp_positions) 50 | 51 | snp_positions_to_keep <- snp_positions[!(snp_positions %in% snps_positions_combined)] 52 | 53 | snps_positions_combined <- c(snps_positions_combined, snp_positions_to_keep) 54 | print(file) 55 | } 56 | 57 | head(snps_positions_combined) 58 | length(snps_positions_combined) 59 | 60 | snps_positions_combined <- snps_positions_combined[order(snps_positions_combined)] 61 | hist(snps_positions_combined) 62 | 63 | snps_positions_combined_df <- data.frame(chrom = "chr6A", pos = snps_positions_combined) 64 | 65 | write.table(snps_positions_combined_df, 66 | file = "Y:/Publications/Haplotypes/Figures/figure_3/pangenome_snp_dist_6A_CS_ref.tsv", 67 | sep = "\t", 68 | row.names = FALSE, 69 | quote = FALSE) 70 | -------------------------------------------------------------------------------- /figures/plot_extended_data_figure_8.rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Plot haplotype specific markers" 3 | author: "Jemima Brinton" 4 | date: "15/05/2020" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | 11 | library(dplyr) 12 | library(ggplot2) 13 | library(reshape2) 14 | 15 | convert_long_01_AB <- function(data){ 16 | converted <- data.frame(marker=character(), variable = character(), value = numeric(),pos = numeric()) 17 | markers <- unique(data$marker) 18 | 19 | for (i in seq(1, length(markers))){ 20 | marker <- markers[i] 21 | marker_data <- data[data$marker == marker,] 22 | snp_calls <- unique(marker_data$value) 23 | alleles <- snp_calls[(snp_calls %in% c("A", "B"))] 24 | #make sure chinese spring allele is always 0 25 | CS_allele <- marker_data[marker_data$variable == "CHINESE", "value"] 26 | alt_allele <- alleles[!(alleles %in% CS_allele)] 27 | marker_data$value <- gsub(CS_allele, "0", marker_data$value) 28 | marker_data$value <- gsub(alt_allele, "1", marker_data$value) 29 | converted <- rbind(converted, marker_data) 30 | } 31 | converted$value <- gsub("N", "NA", converted$value) 32 | converted$value <- as.numeric(converted$value) 33 | return(converted) 34 | } 35 | 36 | plot_capture_specified_order_small <- function(data, start, end, outfile_prefix, order_list){ 37 | data_zoom <- data[(data$pos > start) & (data$pos < end),] 38 | data_zoom$variable <- factor(x = data_zoom$variable, levels = order_list, ordered = TRUE) 39 | plot <- ggplot(data_zoom , aes(x=factor(pos),y=variable)) + 40 | geom_tile(aes(fill = value), color = "gray") + 41 | #scale_fill_distiller(limits=c(0,1), type='div', palette="YlGnBu", na.value = "gray94", direction = 1, name = "allele") + 42 | #scale_fill_continuous(type = "viridis") + 43 | scale_fill_gradient(low = "grey70", high = "grey20", na.value = "white", aesthetics = "fill") + 44 | theme_bw() + 45 | theme(axis.text.x=element_text(angle=90, hjust=1)) #+ 46 | #geom_point( aes( x = "TraesCS6A02G189400.1", y=aln_type, colour = "red" ) ) 47 | #ggsave(plot = plot, file = paste0(outfile_prefix, "_heatmap.png"), dpi = 600, height = 6, width = 6) 48 | return(plot) 49 | } 50 | ``` 51 | 52 | ```{r} 53 | hap_specific <- read.table("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/extended_data/haplotype_specific_markers_pangenome.csv", 54 | sep = ",", 55 | header = TRUE, 56 | stringsAsFactors = FALSE) 57 | 58 | hap_specific_long <- melt(hap_specific, id.vars = c("marker", "pos")) 59 | 60 | #hap_specific_01 <- convert_long_01(hap_specific_long) 61 | data <- hap_specific_long 62 | converted <- data.frame(marker=character(), variable = character(), value = numeric(),pos = numeric()) 63 | markers <- unique(data$marker) 64 | 65 | for (i in seq(1, length(markers))){ 66 | marker <- markers[i] 67 | marker_data <- data[data$marker == marker,] 68 | snp_calls <- unique(marker_data$value) 69 | alleles <- snp_calls[(snp_calls %in% c("A", "B"))] 70 | #make sure chinese spring allele is always 0 71 | CS_allele <- marker_data[marker_data$variable == "CHINESE", "value"] 72 | alt_allele <- alleles[!(alleles %in% CS_allele)] 73 | marker_data$value <- gsub(CS_allele, "0", marker_data$value) 74 | marker_data$value <- gsub(alt_allele, "1", marker_data$value) 75 | converted <- rbind(converted, marker_data) 76 | } 77 | converted$value <- gsub("N", "NA", converted$value) 78 | converted$value <- as.numeric(converted$value) 79 | 80 | hap_specific_order <- c("CHINESE" , "CADENZA", "PARAGON", "NORIN61", "LANCER", "CLAIRE", "JAGGER", "SY", "ROBIGUS", "ARINA", "JULIUS", "WEEBIL", "LANDMARK", "MACE", "STANLEY") 81 | 82 | hap_specific_01 <- converted 83 | 84 | to_plot_subset <- subset(hap_specific_01, variable %in% hap_specific_order) 85 | 86 | 87 | plot_hap_specific <- plot_capture_specified_order_small(to_plot_subset, 187000000, 445000000, outfile_prefix = "test", order_list = rev(hap_specific_order)) 88 | plot_hap_specific 89 | 90 | width = 5 91 | height = 4 92 | pdf("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/extended_data/haplotype_specific_markers.pdf", width = width, height = height) 93 | print(plot_hap_specific) 94 | dev.off() 95 | ``` -------------------------------------------------------------------------------- /sequence_complexity/calculate_sequence_complexity.rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Calculate sequence complexity" 3 | author: "Jemima Brinton" 4 | date: "23/03/2020" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | 11 | library(dplyr) 12 | library(magrittr) 13 | library(GenomicRanges) 14 | library(ggplot2) 15 | library(tidyr) 16 | library(viridis) 17 | library(stringr) 18 | library(dada2) 19 | ``` 20 | 21 | ```{r} 22 | #read in haplotype blocks 23 | hap_coords <- read.table("X:/brintonj/haplotype/whole_genome_mummer_BLAST_5mbp_blocks_combined_ref_coords_block_numbers_SPELTA_ADDED.tsv", 24 | sep = "\t", 25 | header = TRUE, 26 | stringsAsFactors = FALSE) 27 | 28 | head(hap_coords) 29 | 30 | hap_coords_mummer <- hap_coords[hap_coords$source == "mummer",] 31 | 32 | hap_coords_mummer_6A <- hap_coords_mummer[hap_coords_mummer$chrom == "chr6A",] 33 | sequence_dir <- "X:/brintonj/haplotype/whole_genome_mummer/surrounding_seq/6A/" 34 | output_dir <- "X:/brintonj/haplotype/whole_genome_mummer/surrounding_seq/" 35 | 36 | sequence_files<- list.files(sequence_dir) 37 | sequence_files 38 | 39 | 40 | all_snps <- data.frame(P1=numeric(), 41 | REF_SEQ = character(), 42 | REF = character(), 43 | QUERY = character(), 44 | hap_block = character()) 45 | 46 | for (file in sequence_files){ 47 | 48 | ref_query_temp <- str_split_fixed(file, "\\.", 2)[1] 49 | ref <- str_split_fixed(ref_query_temp, "_", 3)[1] 50 | query <- str_split_fixed(ref_query_temp, "_", 3)[3] 51 | 52 | if(ref == "sy"){ 53 | ref <- "sy_mattis" 54 | query <- str_split_fixed(ref_query_temp, "_", 4)[4] 55 | } 56 | 57 | if(query == "sy"){ 58 | query <- "sy_mattis" 59 | } 60 | 61 | snp_flanks <- read.table(file = paste0(sequence_dir, file), 62 | sep = "\t", 63 | skip = 4, 64 | header = FALSE, 65 | stringsAsFactors = FALSE) 66 | 67 | colnames(snp_flanks) <- c("P1", "SUB1", "SUB2", "P2", "BUFF", "DIST", "REF_SEQ", "QUERY_SEQ", "FRM", "TAGS", "REF", "QUERY") 68 | snp_flanks_noN <- snp_flanks[!((snp_flanks$SUB1 == "N") | (snp_flanks$SUB2 == "N")),] 69 | 70 | to_keep <- snp_flanks_noN[,c("P1", "REF_SEQ", "REF", "QUERY")] 71 | 72 | to_keep$hap_block <- NA 73 | 74 | comp_haps <- hap_coords_mummer_6A[(hap_coords_mummer_6A$ref == ref) & (hap_coords_mummer_6A$query == query),] 75 | if (nrow(comp_haps) > 0){ 76 | for (i in seq(1, nrow(comp_haps))){ 77 | start <- comp_haps[i, "ref_start"] 78 | end <- comp_haps[i, "ref_end"] 79 | to_keep[(to_keep$P1 >= start) & (to_keep$P1 <= end), "hap_block"] <- "Y" 80 | } 81 | } 82 | 83 | to_keep[is.na(to_keep$hap_block), "hap_block"] <- "N" 84 | 85 | to_keep$REF_CPLX <- seqComplexity(to_keep$REF_SEQ) 86 | 87 | write.table(to_keep, 88 | file = paste0(output_dir, ref, "_v_", query, "6A_snps_ref_complexity.tsv"), 89 | sep = "\t", 90 | col.names = TRUE, 91 | row.names = FALSE, 92 | quote = FALSE) 93 | } 94 | 95 | 96 | ``` 97 | 98 | Do the same thing for 2B so we can compare a different chromosome 99 | 100 | ```{r} 101 | chrom <- "2B" 102 | hap_coords_mummer_chrom <- hap_coords_mummer[hap_coords_mummer$chrom == paste0("chr", chrom),] 103 | 104 | sequence_dir <- paste0("X:/brintonj/haplotype/whole_genome_mummer/surrounding_seq/", chrom, "/") 105 | output_dir <- paste0(sequence_dir, "complexity/") 106 | 107 | dir.create(output_dir, recursive = TRUE) 108 | 109 | sequence_files <- list.files(sequence_dir, pattern = "snps") 110 | 111 | all_snps <- data.frame(P1=numeric(), 112 | REF_SEQ = character(), 113 | REF = character(), 114 | QUERY = character(), 115 | hap_block = character()) 116 | 117 | for (file in sequence_files){ 118 | 119 | ref_query_temp <- str_split_fixed(file, "\\.", 2)[1] 120 | ref <- str_split_fixed(ref_query_temp, "_", 3)[1] 121 | query <- str_split_fixed(ref_query_temp, "_", 3)[3] 122 | 123 | if(ref == "sy"){ 124 | ref <- "sy_mattis" 125 | query <- str_split_fixed(ref_query_temp, "_", 4)[4] 126 | } 127 | 128 | if(query == "sy"){ 129 | query <- "sy_mattis" 130 | } 131 | 132 | snp_flanks <- read.table(file = paste0(sequence_dir, file), 133 | sep = "\t", 134 | skip = 4, 135 | header = FALSE, 136 | stringsAsFactors = FALSE) 137 | 138 | colnames(snp_flanks) <- c("P1", "SUB1", "SUB2", "P2", "BUFF", "DIST", "REF_SEQ", "QUERY_SEQ", "FRM", "TAGS", "REF", "QUERY") 139 | snp_flanks_noN <- snp_flanks[!((snp_flanks$SUB1 == "N") | (snp_flanks$SUB2 == "N")),] 140 | 141 | to_keep <- snp_flanks_noN[,c("P1", "REF_SEQ", "REF", "QUERY")] 142 | 143 | to_keep$hap_block <- NA 144 | 145 | comp_haps <- hap_coords_mummer_chrom[(hap_coords_mummer_chrom$ref == ref) & (hap_coords_mummer_chrom$query == query),] 146 | if (nrow(comp_haps) > 0){ 147 | for (i in seq(1, nrow(comp_haps))){ 148 | start <- comp_haps[i, "ref_start"] 149 | end <- comp_haps[i, "ref_end"] 150 | to_keep[(to_keep$P1 >= start) & (to_keep$P1 <= end), "hap_block"] <- "Y" 151 | } 152 | } 153 | 154 | to_keep[is.na(to_keep$hap_block), "hap_block"] <- "N" 155 | 156 | to_keep$REF_CPLX <- seqComplexity(to_keep$REF_SEQ) 157 | 158 | write.table(to_keep, 159 | file = paste0(output_dir, ref, "_v_", query, chrom, "_snps_ref_complexity.tsv"), 160 | sep = "\t", 161 | col.names = TRUE, 162 | row.names = FALSE, 163 | quote = FALSE) 164 | } 165 | 166 | ``` 167 | -------------------------------------------------------------------------------- /IBD-regions-stats/regions-without-IBD.Rmd: -------------------------------------------------------------------------------- 1 | # Find regions that are not in IBD regions 2 | 3 | From all the IBD regions. For this, I first exported the regions that are identical by descent from the MySQL database. From ther I have to 4 | 5 | 1. Split the table per assembly 6 | 2. Fix the chromosome order and length based on each assembly. 7 | 3. use ```genomicRanges``` ```diff``` to find the regions that are not overlapping. 8 | 9 | ```{r, results='hide',message=FALSE} 10 | suppressWarnings({ 11 | library(ggplot2) 12 | library(sqldf) 13 | library(reshape2) 14 | library(ggbio) 15 | library(GenomicRanges) 16 | library(GenomicFeatures) 17 | library(rtracklayer) 18 | library(gridExtra) 19 | library(grid) 20 | #require("bio.tilling") 21 | }) 22 | ``` 23 | 24 | Loading the auxiliary functions 25 | 26 | ```{r} 27 | fixRangesOrder<-function(gr,lens, ordr = c("chr1A","chr1B", "chr1D","chr2A","chr2B", "chr2D","chr3A","chr3B", "chr3D","chr4A","chr4B", "chr4D","chr5A","chr5B", "chr5D","chr6A","chr6B", "chr6D","chr7A","chr7B", "chr7D","chrUn")){ 28 | as_df <-data.frame(gr) 29 | as_df$seqnames <- factor(as.character(as_df$seqnames),levels = ordr ) 30 | vect = lens$length 31 | names(vect) <-lens$seqname 32 | gr2 <- makeGRangesFromDataFrame(as_df,keep.extra.columns = T, seqinfo=vect) 33 | trim(gr2) 34 | } 35 | 36 | ``` 37 | 38 | 39 | Now we need to read the files 40 | 41 | ```{r,message=FALSE, results='hide', echo=TRUE} 42 | input_folder <- "/Users/ramirezr/Dropbox/JIC/SM1/jemima/block_in_pseudomolecules" 43 | window_sizes <- c("5000kbp") 44 | 45 | readAllBlocks<-function(input_folder, window_size="5000kbp", prefix="Blocks_in_pseudomolecules_Tae_", suffix=".tsv.gz", varieties=c("arinalrfor", "jagger" )){ 46 | path<-paste0(input_folder, "/", prefix, window_size, suffix) 47 | df <- read.csv(file =gzfile(path), header=T, sep = "\t" ) 48 | all_asm <- unique(df$assembly) 49 | missing_cultivars <- setdiff(all_asm,varieties ) 50 | 51 | blocks_to_remove <- df[df$assembly %in% missing_cultivars , "block_no"] 52 | 53 | df <- df[!(df$block_no %in% blocks_to_remove ),] 54 | 55 | all_grs <- list() 56 | for(asm in unique(df$assembly)){ 57 | tmp_df <- df[df$assembly == asm, ] 58 | chromosomes <- tmp_df[, c("chromosome", "chr_length")] 59 | colnames(chromosomes) <- c("seqnames", "length") 60 | chromosomes <- unique(chromosomes) 61 | tmp_df$chr_length <- NULL 62 | gr <- makeGRangesFromDataFrame(tmp_df,keep.extra.columns = T) 63 | gr <- fixRangesOrder(gr, chromosomes, ordr <- sort(chromosomes$seqnames)) 64 | all_grs[asm] <- gr 65 | } 66 | all_grs 67 | } 68 | suppressWarnings(blocks <- readAllBlocks(input_folder) 69 | ) 70 | ``` 71 | 72 | 73 | 74 | ```{r} 75 | autoplot(blocks$arinalrfor,layout = "karyogram") 76 | ``` 77 | 78 | Lets plot the block density to start: 79 | 80 | ```{r} 81 | 82 | calculate_block_coverage<-function(gr2){ 83 | v2 <- gr2 84 | cov_v2 <- coverage(x = v2) 85 | gr_bins <-GRanges(cov_v2) 86 | gr_bins$coverage <- gr_bins$score 87 | gr_bins$score <- NULL 88 | gr_bins$assembly <- gr2$assembly[1] 89 | gr_bins$reference <- gr2$reference[1] 90 | return(gr_bins) 91 | } 92 | 93 | 94 | plotBlockDensity<-function(gr,tilewidth=500000, base=2){ 95 | 96 | gr_bins <- calculate_block_coverage(gr) 97 | 98 | gr_bins$log_cov <- ceiling(log(gr_bins$coverage+1,base=base)) 99 | gr_bins$log_cov <- ifelse(gr_bins$log_cov < 1 ,0, gr_bins$log_cov ) 100 | cov_ranges<-list() 101 | for(r in sort(unique(gr_bins$log_cov))){ 102 | tmp_gr = gr_bins[gr_bins$log_cov == r] 103 | cov_ranges[as.character(r)] = paste0(round(min(tmp_gr$coverage),2),"-", round(max(tmp_gr$coverage),2)) 104 | } 105 | gr_bins$cov_legend <- factor(as.character(cov_ranges[as.character(gr_bins$log_cov)]), levels=cov_ranges) 106 | asm <- unique(gr$assembly) 107 | title <- paste0("Haplotype blocks for: " , asm ) 108 | autoplot(gr_bins, layout = "karyogram", aes(fill=cov_legend)) + ggtitle(title) + scale_fill_brewer(type="seq", palette = "Oranges") 109 | } 110 | 111 | 112 | ``` 113 | 114 | Now we are actually going to run the code for each dataset: 115 | 116 | 117 | Define the different sets to read 118 | ```{r} 119 | datasets <- list() 120 | datasets[["only_pseudomolecules_cultivar"]] <- c("arinalrfor","jagger","julius","lancer","landmark","mace","stanley","sy_mattis") 121 | datasets[["only_pseudomolecules"]] <- c("arinalrfor","chinese","jagger","julius","lancer","landmark","mace","norin61","stanley","sy_mattis","spelta") 122 | datasets[["only_cultivar"] ]<- c("arinalrfor","cadenza","claire","jagger","julius","lancer","landmark","mace","paragon","robigus","stanley","sy_mattis","weebil") 123 | datasets[["all_assemblies"]] <- c("arinalrfor","cadenza","chinese","claire","jagger","julius","lancer","landmark","mace","norin61","paragon","robigus","spelta","stanley","sy_mattis","weebil") 124 | datasets 125 | ``` 126 | 127 | ```{r} 128 | 129 | 130 | 131 | 132 | 133 | out_df <- NULL 134 | window_sizes <- c("5000kbp","2500kbp","1000kbp") 135 | for(dataset in names(datasets)){ 136 | output_folder <- paste0(input_folder, "/",dataset) 137 | vars <- datasets[[dataset]] 138 | print(output_folder) 139 | print(vars) 140 | dir.create(output_folder) 141 | 142 | for(window_size in window_sizes){ 143 | print(window_size) 144 | suppressWarnings(blocks <- readAllBlocks(input_folder,varieties=vars,window_size=window_size)) 145 | out_df <- NULL 146 | path_pdf <- paste0(output_folder,"/blocks_coverage_",window_size,".pdf") 147 | pdf(path_pdf,onefile = TRUE) 148 | for(asm in blocks){ 149 | print(plotBlockDensity(asm)) 150 | tmp_df <- data.frame(calculate_block_coverage(asm)) 151 | if(is.null(output_folder)){ 152 | out_df <- tmp_df 153 | }else{ 154 | out_df <- rbind(out_df, tmp_df) 155 | } 156 | } 157 | dev.off() 158 | write.csv(out_df, file=paste0(output_folder, "/blocks_coverage_",window_size,".csv")) 159 | } 160 | } 161 | ``` -------------------------------------------------------------------------------- /IBD-regions-stats/regions-without-IBD-validate.Rmd: -------------------------------------------------------------------------------- 1 | # Find regions that are not in IBD regions 2 | 3 | From all the IBD regions. For this, I first exported the regions that are identical by descent from the MySQL database. From ther I have to 4 | 5 | 1. Split the table per assembly 6 | 2. Fix the chromosome order and length based on each assembly. 7 | 3. use ```genomicRanges``` ```diff``` to find the regions that are not overlapping. 8 | 9 | ```{r, results='hide',message=FALSE} 10 | suppressWarnings({ 11 | library(ggplot2) 12 | library(sqldf) 13 | library(reshape2) 14 | library(ggbio) 15 | library(GenomicRanges) 16 | library(GenomicFeatures) 17 | library(rtracklayer) 18 | library(gridExtra) 19 | library(grid) 20 | #require("bio.tilling") 21 | }) 22 | ``` 23 | 24 | Loading the auxiliary functions 25 | 26 | ```{r} 27 | fixRangesOrder<-function(gr,lens, ordr = c("chr1A","chr1B", "chr1D","chr2A","chr2B", "chr2D","chr3A","chr3B", "chr3D","chr4A","chr4B", "chr4D","chr5A","chr5B", "chr5D","chr6A","chr6B", "chr6D","chr7A","chr7B", "chr7D","chrUn")){ 28 | as_df <-data.frame(gr) 29 | as_df$seqnames <- factor(as.character(as_df$seqnames),levels = ordr ) 30 | vect = lens$length 31 | names(vect) <-lens$seqname 32 | gr2 <- makeGRangesFromDataFrame(as_df,keep.extra.columns = T, seqinfo=vect) 33 | trim(gr2) 34 | } 35 | 36 | ``` 37 | 38 | 39 | Now we need to read the files 40 | 41 | ```{r,message=FALSE, results='hide', echo=TRUE} 42 | input_folder <- "/Users/ramirezr/Dropbox/JIC/SM1/jemima/block_in_pseudomolecules" 43 | window_sizes <- c("5000kbp") 44 | 45 | readAllBlocks<-function(input_folder, window_size="5000kbp", prefix="Blocks_in_pseudomolecules_Tae_", suffix=".tsv.gz", varieties=c("arinalrfor", "jagger" )){ 46 | path<-paste0(input_folder, "/", prefix, window_size, suffix) 47 | df <- read.csv(file =gzfile(path), header=T, sep = "\t" ) 48 | all_asm <- unique(df$assembly) 49 | missing_cultivars <- setdiff(all_asm,varieties ) 50 | 51 | blocks_to_remove <- df[df$assembly %in% missing_cultivars , "block_no"] 52 | 53 | df <- df[!(df$block_no %in% blocks_to_remove ),] 54 | 55 | all_grs <- list() 56 | for(asm in unique(df$assembly)){ 57 | tmp_df <- df[df$assembly == asm, ] 58 | chromosomes <- tmp_df[, c("chromosome", "chr_length")] 59 | colnames(chromosomes) <- c("seqnames", "length") 60 | chromosomes <- unique(chromosomes) 61 | tmp_df$chr_length <- NULL 62 | gr <- makeGRangesFromDataFrame(tmp_df,keep.extra.columns = T) 63 | gr <- fixRangesOrder(gr, chromosomes, ordr <- sort(chromosomes$seqnames)) 64 | all_grs[asm] <- gr 65 | } 66 | all_grs 67 | } 68 | suppressWarnings(blocks <- readAllBlocks(input_folder) 69 | ) 70 | ``` 71 | 72 | 73 | 74 | ```{r} 75 | autoplot(blocks$arinalrfor,layout = "karyogram") 76 | ``` 77 | 78 | Lets plot the block density to start: 79 | 80 | ```{r} 81 | 82 | calculate_block_coverage<-function(gr2){ 83 | v2 <- gr2 84 | cov_v2 <- coverage(x = v2) 85 | gr_bins <-GRanges(cov_v2) 86 | gr_bins$coverage <- gr_bins$score 87 | gr_bins$score <- NULL 88 | gr_bins$assembly <- gr2$assembly[1] 89 | gr_bins$reference <- gr2$reference[1] 90 | return(gr_bins) 91 | } 92 | 93 | 94 | plotBlockDensity<-function(gr,tilewidth=500000, base=2){ 95 | 96 | gr_bins <- calculate_block_coverage(gr) 97 | 98 | gr_bins$log_cov <- ceiling(log(gr_bins$coverage+1,base=base)) 99 | gr_bins$log_cov <- ifelse(gr_bins$log_cov < 1 ,0, gr_bins$log_cov ) 100 | cov_ranges<-list() 101 | for(r in sort(unique(gr_bins$log_cov))){ 102 | tmp_gr = gr_bins[gr_bins$log_cov == r] 103 | cov_ranges[as.character(r)] = paste0(round(min(tmp_gr$coverage),2),"-", round(max(tmp_gr$coverage),2)) 104 | } 105 | gr_bins$cov_legend <- factor(as.character(cov_ranges[as.character(gr_bins$log_cov)]), levels=cov_ranges) 106 | asm <- unique(gr$assembly) 107 | title <- paste0("Haplotype blocks for: " , asm ) 108 | autoplot(gr_bins, layout = "karyogram", aes(fill=cov_legend)) + ggtitle(title) + scale_fill_brewer(type="seq", palette = "Oranges") 109 | } 110 | 111 | 112 | ``` 113 | 114 | Now we are actually going to run the code for each dataset: 115 | 116 | 117 | Define the different sets to read 118 | ```{r} 119 | datasets <- list() 120 | datasets[["only_pseudomolecules_cultivar"]] <- c("arinalrfor","jagger","julius","lancer","landmark","mace","stanley","sy_mattis") 121 | datasets[["only_pseudomolecules"]] <- c("arinalrfor","chinese","jagger","julius","lancer","landmark","mace","norin61","stanley","sy_mattis","spelta") 122 | datasets[["only_cultivar"] ]<- c("arinalrfor","cadenza","claire","jagger","julius","lancer","landmark","mace","paragon","robigus","stanley","sy_mattis","weebil") 123 | datasets[["all_assemblies"]] <- c("arinalrfor","cadenza","chinese","claire","jagger","julius","lancer","landmark","mace","norin61","paragon","robigus","spelta","stanley","sy_mattis","weebil") 124 | datasets 125 | ``` 126 | 127 | ```{r} 128 | 129 | 130 | 131 | 132 | 133 | out_df <- NULL 134 | window_sizes <- c("5000kbp","2500kbp","1000kbp") 135 | for(dataset in names(datasets)){ 136 | output_folder <- paste0(input_folder, "/validate/",dataset) 137 | vars <- datasets[[dataset]] 138 | print(output_folder) 139 | print(vars) 140 | dir.create(output_folder) 141 | 142 | for(window_size in window_sizes){ 143 | print(window_size) 144 | suppressWarnings(blocks <- readAllBlocks(input_folder,varieties=vars,window_size=window_size)) 145 | out_df <- NULL 146 | path_pdf <- paste0(output_folder,"/blocks_coverage_",window_size,".pdf") 147 | pdf(path_pdf,onefile = TRUE) 148 | for(asm in blocks){ 149 | print(plotBlockDensity(asm)) 150 | tmp_df <- data.frame(calculate_block_coverage(asm)) 151 | if(is.null(output_folder)){ 152 | out_df <- tmp_df 153 | }else{ 154 | out_df <- rbind(out_df, tmp_df) 155 | } 156 | } 157 | dev.off() 158 | write.csv(out_df, file=paste0(output_folder, "/blocks_coverage_",window_size,".csv")) 159 | } 160 | } 161 | ``` -------------------------------------------------------------------------------- /figures/plot_extended_data_figure_5.rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Plots for extended data 5" 3 | author: "Jemima Brinton" 4 | date: "12/05/2020" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | library(ggplot2) 11 | library(viridis) 12 | ``` 13 | 14 | Want to plot the % of genome/chromosomes shared with each variety and every other variety 15 | 16 | ```{r} 17 | haplotypes <- read.table("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/no_spelta_2gap/no_spelta_2gap/whole_genome_mummer_BLAST_5000000_blocks_combined_updated_ref_coords_10g_corrected_2gap_no_spelta.tsv", 18 | sep = "\t", 19 | header = TRUE, 20 | stringsAsFactors = FALSE) 21 | 22 | head(haplotypes) 23 | 24 | chrom_lengths_pangenome <- read.table("W:/assemblies/releasePGSBv2.0/genome/combined_fai.txt", 25 | header = FALSE, 26 | stringsAsFactors = FALSE) 27 | 28 | head(chrom_lengths_pangenome) 29 | 30 | chrom_lengths_refseq <- read.table("W:/WGAv1.0/161010_Chinese_Spring_v1.0_pseudomolecules.fai", 31 | sep = "\t", 32 | header = FALSE, 33 | stringsAsFactors = FALSE) 34 | 35 | head(chrom_lengths_refseq) 36 | 37 | chrom_lengths <- rbind(chrom_lengths_pangenome[,c(1,2)], chrom_lengths_refseq[,c(1,2)]) 38 | colnames(chrom_lengths) <- c("ref_chrom", "chrom_length") 39 | 40 | head(chrom_lengths) 41 | 42 | #add chrom lengths to the haplotype table 43 | 44 | haplotypes_chrom_len <- merge(haplotypes, chrom_lengths, all.x = TRUE, all.y = FALSE) 45 | head(haplotypes_chrom_len) 46 | 47 | #calculate each block as a percentage of chromosome length 48 | haplotypes_chrom_len$block_size_perc <- (haplotypes_chrom_len$block_size / haplotypes_chrom_len$chrom_length)*100 49 | head(haplotypes_chrom_len) 50 | 51 | # now we want to make a summary table with each ref, query, chromosome and percentage 52 | percentage_ref_query_chrom <- aggregate(block_size_perc ~ ref + query + chrom, data = haplotypes_chrom_len, FUN = sum) 53 | head(percentage_ref_query_chrom) 54 | 55 | #also want to calculate the whole genome value for each comparison 56 | genome_sizes <- aggregate(chrom_length ~ ref, data = unique(haplotypes_chrom_len[,c("ref", "chrom_length")]), FUN = sum) 57 | colnames(genome_sizes)[2] <- "genome_size" 58 | head(genome_sizes) 59 | 60 | #for the EI varieties we have an issue - since the block sizes are counted as coords in the reciprocal alignment where available, the genome size calculation is > than 21 chromosomes. Since we use the RefSeqv1.1 positions when considering EI EI varieties, we will use the chinese spring total genome size 61 | EI_var <- c("cadenza", "claire", "paragon", "robigus", "weebil") 62 | genome_sizes[genome_sizes$ref %in% EI_var, "genome_size"] <- genome_sizes[genome_sizes$ref == "chinese", "genome_size"] 63 | head(genome_sizes) 64 | 65 | haplotypes_chrom_len_genome <- merge(haplotypes_chrom_len, genome_sizes, all.x = TRUE, all.y = FALSE) 66 | head(haplotypes_chrom_len_genome) 67 | 68 | haplotypes_chrom_len_genome$block_size_perc_genome <- (haplotypes_chrom_len_genome$block_size / haplotypes_chrom_len_genome$genome_size)*100 69 | head(haplotypes_chrom_len_genome) 70 | 71 | # now we want to make a summary table with each ref, query and genome percentage 72 | percentage_ref_query_genome <- aggregate(block_size_perc_genome ~ ref + query, data = haplotypes_chrom_len_genome, FUN = sum) 73 | head(percentage_ref_query_genome) 74 | percentage_ref_query_genome$chrom <- "genome" 75 | colnames(percentage_ref_query_genome)[3] <- "block_size_perc" 76 | percentage_ref_query_genome <- percentage_ref_query_genome[,c("ref", "query", "chrom", "block_size_perc")] 77 | 78 | #combine the chromosome and genome into a single data frame 79 | percentage_ref_query_chrom_genome <- rbind(percentage_ref_query_genome, percentage_ref_query_chrom) 80 | head(percentage_ref_query_chrom_genome) 81 | #set the y axis order for plotting 82 | chromosomes <- unique(percentage_ref_query_chrom$chrom) 83 | percentage_ref_query_chrom_genome$chrom <- factor(percentage_ref_query_chrom_genome$chrom, levels = c(chromosomes[order(chromosomes, decreasing = TRUE)], "genome")) 84 | 85 | ``` 86 | 87 | Ok now we want to plot a heatmap per reference variety 88 | ```{r} 89 | text_size <- 4 90 | width = 5.8 91 | height = 6.5 92 | res = 8000 93 | 94 | refs <- unique(percentage_ref_query_chrom_genome$ref) 95 | 96 | for (i in seq(1, length(refs))){ 97 | ref <- refs[i] 98 | print(ref) 99 | ref_data <- percentage_ref_query_chrom_genome[percentage_ref_query_chrom_genome$ref == ref,] 100 | 101 | #want to order the ref data based on the shared genome percentage 102 | genome_shared <- ref_data[ref_data$chrom == "genome",] 103 | genome_shared <- genome_shared[order(genome_shared$block_size_perc),] 104 | ref_data$query <- factor(ref_data$query, levels = genome_shared$query) 105 | 106 | ref_shared_plot <- ggplot(ref_data, aes(y = chrom, x = query)) + 107 | geom_tile(aes(fill = block_size_perc)) + 108 | theme_bw() + 109 | theme(axis.text.x = element_text(angle = 90), panel.background = element_rect(fill = "black"), panel.grid.major = element_blank(), 110 | panel.grid.minor = element_blank(), legend.position = "none") + 111 | scale_fill_viridis(option = "inferno", limits = c(0,100)) + 112 | geom_text(aes(label = round(block_size_perc, digits = 1)), colour = "gray45", size = text_size) + 113 | ggtitle(ref) 114 | 115 | print(ref_shared_plot) 116 | 117 | 118 | #png(paste0("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/extended_data/shared_genome_percentage_", ref, ".png"), width = width, height #= height, units = "in", res = res) 119 | #print(ref_shared_plot) 120 | #dev.off() 121 | 122 | pdf(paste0("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/extended_data/shared_genome_percentage_", ref, ".pdf"), width = width, height = height) 123 | print(ref_shared_plot) 124 | dev.off() 125 | } 126 | 127 | ``` -------------------------------------------------------------------------------- /figures/plot_extended_data_figure_6.rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Plots for Extended data 5 panel B" 3 | author: "Jemima Brinton" 4 | date: "13/05/2020" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | library(ggplot2) 11 | library(ggbio) 12 | library(stringr) 13 | library(GenomicRanges) 14 | library(viridis) 15 | library(scales) 16 | 17 | ``` 18 | 19 | Want to produced the figures for extended data 5 which show the highly conserved regions for different bin sizes and include locations of agronimically important genes 20 | 21 | ```{r} 22 | shared_data_dir <- "C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/unique_shared_regions/" 23 | out_dir <- "C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/extended_data/" 24 | set <- "all_assemblies" 25 | 26 | set_dir <- paste0(shared_data_dir, set, "/") 27 | 28 | 29 | chrom_lengths_pangenome <- read.table("W:/assemblies/releasePGSBv2.0/genome/combined_fai.txt", 30 | header = FALSE, 31 | stringsAsFactors = FALSE) 32 | 33 | head(chrom_lengths_pangenome) 34 | 35 | chrom_lengths_refseq <- read.table("W:/WGAv1.0/161010_Chinese_Spring_v1.0_pseudomolecules.fai", 36 | sep = "\t", 37 | header = FALSE, 38 | stringsAsFactors = FALSE) 39 | 40 | head(chrom_lengths_refseq) 41 | 42 | chrom_lengths <- rbind(chrom_lengths_pangenome[,c(1,2)], chrom_lengths_refseq[,c(1,2)]) 43 | colnames(chrom_lengths) <- c("seqnames", "chrom_length") 44 | 45 | head(chrom_lengths) 46 | 47 | important_genes <- read.table("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/extended_data/important_locus_locations.csv", 48 | sep = ",", 49 | header = TRUE, 50 | stringsAsFactors = FALSE) 51 | 52 | head(important_genes) 53 | important_genes <- merge(important_genes, chrom_lengths, all.x = TRUE, all.y = FALSE) 54 | head(important_genes) 55 | #add scaled chromosome position 56 | important_genes$perc_location <- ceiling((important_genes$location/important_genes$chrom_length)*1000000000) 57 | bin_sizes <- c(2500, 1000) 58 | ``` 59 | 60 | ```{r} 61 | for (bin_size in bin_sizes){ 62 | 63 | block_set_file <- paste0(set_dir, "blocks_coverage_", bin_size, "kbp.csv") 64 | block_set_file 65 | coverage <- read.table(block_set_file, 66 | sep = ",", 67 | header = TRUE, 68 | stringsAsFactors = FALSE) 69 | 70 | 71 | head(coverage) 72 | unique(coverage$coverage) 73 | 74 | 75 | 76 | coverage_len <- merge(coverage, chrom_lengths, all.x = TRUE, all.y = FALSE) 77 | 78 | #also add a column that has generic chromosome name 79 | coverage_len$chromosome <- str_split_fixed(coverage_len$seqnames, "_", 2)[,1] 80 | head(coverage_len) 81 | 82 | #calculate each chunk as a percentage of the length of the chromosome 83 | coverage_len$width_perc <- (coverage_len$width/coverage_len$chrom_length)*100 84 | 85 | 86 | ## add the scaled chromosome positions to coverage_len_gen 87 | coverage_len$start_perc <- ceiling((coverage_len$start/coverage_len$chrom_length)*1000000000) 88 | coverage_len$end_perc <- ceiling((coverage_len$end/coverage_len$chrom_length)*1000000000) 89 | coverage_len$perc_chrom_length <- 1000000001 90 | 91 | 92 | 93 | # generate the chromosome length vector for the scaled chromosome plots 94 | perc_chrom_length <- rep(1000000001, length(unique(coverage_len$chromosome))) 95 | names(perc_chrom_length) <- unique(coverage_len$chromosome) 96 | perc_chrom_length <- perc_chrom_length[order(names(perc_chrom_length))] 97 | 98 | 99 | #order the main table by chromosome and then coverage (to ensure correct order of plotting) 100 | coverage_len_gen_ordered <- coverage_len[order(coverage_len$chromosome), ] 101 | coverage_len_gen_ordered <- coverage_len_gen_ordered[order(coverage_len_gen_ordered$coverage),] 102 | 103 | # want the coverage to be discrete not continuous 104 | coverage_len_gen_ordered$cov_discrete <- coverage_len_gen_ordered$coverage 105 | coverage_len_gen_ordered$cov_discrete <- as.factor(as.character(coverage_len_gen_ordered$cov_discrete)) 106 | 107 | #cap at greater than 4 for the summary plotting 108 | coverage_len_gen_ordered_high <- coverage_len_gen_ordered[coverage_len_gen_ordered$coverage > 4,] 109 | coverage_len_gen_ordered_high <- coverage_len_gen_ordered_high[order(coverage_len_gen_ordered_high$coverage),] 110 | 111 | 112 | #Plot the highly conserved regions across all assemblies 113 | 114 | 115 | #get the viridis colours so we can specify the scale to start at 5 116 | vir_colours <- viridis(11) 117 | names(vir_colours) <- as.character(seq(1, 11)) 118 | 119 | coverage_len_gen_ordered_high$cov_discrete <- factor(coverage_len_gen_ordered_high$cov_discrete, levels = as.character(seq(1, 11))) 120 | 121 | coverage_GR_high <- GRanges(seqnames = coverage_len_gen_ordered_high$chromosome, 122 | ranges = IRanges(start = coverage_len_gen_ordered_high$start_perc, 123 | end = coverage_len_gen_ordered_high$end_perc), 124 | seqlengths = perc_chrom_length, 125 | coverage = coverage_len_gen_ordered_high$coverage, 126 | cov_discrete = coverage_len_gen_ordered_high$cov_discrete) 127 | 128 | 129 | important_genes_GR <- GRanges(seqnames = important_genes$seqnames, 130 | ranges = IRanges(start = important_genes$perc_location, 131 | end = important_genes$perc_location+1), 132 | y_value = 1, 133 | y_value_label = -6, 134 | seqlengths = perc_chrom_length, 135 | IDs = important_genes$Locus) 136 | 137 | summarised_conserved <- autoplot(coverage_GR_high, layout = "karyogram", aes(fill = cov_discrete, colour = cov_discrete), lwd = 0.2) + 138 | #scale_fill_viridis(discrete = TRUE) 139 | scale_fill_manual(values = vir_colours) + 140 | scale_colour_manual(values = vir_colours) + 141 | layout_karyogram(data = important_genes_GR, 142 | geom = "point", 143 | aes(x = start, y = y_value, label = IDs), 144 | colour = "red", 145 | fill = "red", 146 | size = 1, 147 | shape = 2) + 148 | layout_karyogram(data = important_genes_GR, 149 | geom = "text", 150 | aes(x = start, y = y_value_label, label = IDs), 151 | colour = "black", 152 | size = 3) 153 | 154 | summarised_conserved 155 | 156 | 157 | 158 | 159 | 160 | width = 6 161 | height = 6 162 | pdf(paste0(out_dir,"extended_data_5_summarised_conserved_regions_", bin_size, "bin.pdf"), width = width, height = height) 163 | print(summarised_conserved) 164 | dev.off() 165 | 166 | } 167 | ``` -------------------------------------------------------------------------------- /figures/plot_extended_data_figure_7.rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Plot 6A recombinants" 3 | author: "Jemima Brinton" 4 | date: "15/05/2020" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | 11 | library(stringr) 12 | library(ggplot2) 13 | library("ggdendro") 14 | library(plyr) 15 | library(reshape2) 16 | library("grid") 17 | library(agricolae) 18 | library("gdata") 19 | library(RColorBrewer) 20 | 21 | convert_long_01 <- function(data){ 22 | converted <- data.frame(marker=character(), variable = character(), value = numeric(),pos = numeric()) 23 | markers <- unique(data$marker) 24 | 25 | for (i in seq(1, length(markers))){ 26 | marker <- markers[i] 27 | marker_data <- data[data$marker == marker,] 28 | snp_calls <- unique(marker_data$value) 29 | alleles <- snp_calls[(snp_calls %in% c("A", "C", "T", "G"))] 30 | if (length(alleles) > 1){ 31 | marker_data$value <- gsub(alleles[1], "0", marker_data$value) 32 | marker_data$value <- gsub(alleles[2], "1", marker_data$value) 33 | } else if (length(alleles) == 1) { 34 | marker_data$value <- gsub(alleles[1], "0", marker_data$value) 35 | print(marker) 36 | print(alleles) 37 | } else if (length(alleles) == 0 | length(alleles) >2) { 38 | print(marker) 39 | print(alleles) 40 | } 41 | converted <- rbind(converted, marker_data) 42 | } 43 | #converted$value <- gsub("N", "NA", converted$value) 44 | converted$value <- as.numeric(converted$value) 45 | return(converted) 46 | } 47 | 48 | plot_capture_clust <- function(data, start, end, outfile_prefix){ 49 | data_zoom <- data[(data$pos > start) & (data$pos < end),] 50 | #print(table(data_zoom$aln_type)) 51 | zoom_mat <- dcast(data_zoom[,c("variable", "pos", "value")], variable ~ pos) 52 | zoom.matrix <- as.matrix(zoom_mat[,c(2:(ncol(zoom_mat)-1))]) 53 | dim(zoom.matrix) 54 | rownames(zoom.matrix) <- zoom_mat$variable 55 | zoom.dendro <- as.dendrogram(hclust(d=dist(x = zoom.matrix))) 56 | #dendro.plot <- ggdendrogram(data = zoom.dendro, rotate = TRUE) 57 | #ggsave(plot = dendro.plot, file = paste0(outfile_prefix, "_dendrogram.png"), dpi = 600, height = 18, width = 8) 58 | zoom.order <- order.dendrogram(zoom.dendro) 59 | data_zoom$variable <- factor(x = data_zoom$variable, levels = zoom_mat$variable[zoom.order], ordered = TRUE) 60 | plot <- ggplot(data_zoom , aes(x=factor(pos),y=variable)) + 61 | geom_tile(aes(fill = data_zoom$value)) + 62 | #scale_fill_distiller(limits=c(0,1), type='div', palette="YlGnBu", na.value = "gray94", direction = 1, name = "allele") + 63 | #scale_fill_continuous(type = "viridis") + 64 | scale_fill_gradient(low = "#440154FF", high = "#1F968BFF", na.value = "grey50", aesthetics = "fill") + 65 | theme(axis.text.x=element_text(angle=90, hjust=1)) #+ 66 | #geom_point( aes( x = "TraesCS6A02G189400.1", y=aln_type, colour = "red" ) ) 67 | #ggsave(plot = plot, file = paste0(outfile_prefix, "_heatmap.png"), dpi = 600, height = 8, width = 18) 68 | return(plot) 69 | } 70 | 71 | plot_capture_clust_AGCT <- function(data, start, end, outfile_prefix){ 72 | data_zoom <- data[(data$pos > start) & (data$pos < end),] 73 | #print(table(data_zoom$aln_type)) 74 | zoom_mat <- dcast(data_zoom[,c("variable", "pos", "value")], variable ~ pos) 75 | zoom.matrix <- as.matrix(zoom_mat[,c(2:(ncol(zoom_mat)-1))]) 76 | dim(zoom.matrix) 77 | rownames(zoom.matrix) <- zoom_mat$variable 78 | zoom.dendro <- as.dendrogram(hclust(d=dist(x = zoom.matrix))) 79 | #dendro.plot <- ggdendrogram(data = zoom.dendro, rotate = TRUE) 80 | #ggsave(plot = dendro.plot, file = paste0(outfile_prefix, "_dendrogram.png"), dpi = 600, height = 18, width = 8) 81 | zoom.order <- order.dendrogram(zoom.dendro) 82 | data_zoom$variable <- factor(x = data_zoom$variable, levels = zoom_mat$variable[zoom.order], ordered = TRUE) 83 | plot <- ggplot(data_zoom , aes(x=factor(pos),y=variable)) + 84 | geom_tile(aes(fill = data_zoom$value)) + 85 | #scale_fill_distiller(limits=c(0,1), type='div', palette="YlGnBu", na.value = "gray94", direction = 1, name = "allele") + 86 | #scale_fill_continuous(type = "viridis") + 87 | #scale_fill_gradient(low = "#440154FF", high = "#1F968BFF", na.value = "grey50", aesthetics = "fill") + 88 | theme(axis.text.x=element_text(angle=90, hjust=1)) #+ 89 | #geom_point( aes( x = "TraesCS6A02G189400.1", y=aln_type, colour = "red" ) ) 90 | #ggsave(plot = plot, file = paste0(outfile_prefix, "_heatmap.png"), dpi = 600, height = 8, width = 18) 91 | return(plot) 92 | } 93 | ``` 94 | 95 | ```{r} 96 | summarised_data <- read.table("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/extended_data/recombinants/anova_adjusted_means_5_year_summary_6A_recs.csv", 97 | sep = ",", 98 | header = TRUE, 99 | stringsAsFactors = FALSE) 100 | 101 | head(summarised_data) 102 | 103 | means <- aggregate(grain_width ~ group + group_call, data = summarised_data, FUN = mean) 104 | 105 | head(means) 106 | 107 | means$sd <- aggregate(grain_width ~ group + group_call, data = summarised_data, FUN = sd)[,3] 108 | means$count <- aggregate(grain_width ~ group + group_call, data = summarised_data, FUN = length)[,3] 109 | 110 | means$sterr <- means$sd/(sqrt(means$count)) 111 | 112 | groups <- rev(unique(summarised_data$group)) 113 | means$group <- factor(means$group, levels = groups) 114 | ``` 115 | 116 | plot bar chart of values 117 | 118 | ```{r} 119 | 120 | group_colours <- c("S" = "white", "SR" = "gray63", "R" = "gray15") 121 | means_plot <- ggplot(means, aes(x = grain_width, y = group, fill = group_call)) + 122 | geom_bar(stat="identity", colour = "black", width = 0.6) + 123 | theme_bw() + 124 | coord_cartesian(xlim = c(3.4, 3.8)) + 125 | geom_errorbar(aes(xmin=grain_width-sterr, xmax=grain_width+sterr), width=.2, 126 | position=position_dodge(.9)) + 127 | scale_fill_manual(values = group_colours) 128 | 129 | means_plot 130 | ``` 131 | ```{r} 132 | pdf("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/extended_data/recombinants/anova_adjusted_means_5_year_summary_6A.pdf", height = 6, width = 7) 133 | means_plot 134 | dev.off() 135 | 136 | ``` 137 | 138 | Also want to plots the 35K markers across this regions for the bottom panel of the figure 139 | 140 | ```{r} 141 | markers_6A <- read.table("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/extended_data/H3_35K_markers_whole6A.csv", 142 | sep = ",", 143 | header = TRUE, 144 | stringsAsFactors = FALSE) 145 | 146 | haplotype_alloc <- read.table("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/extended_data/UKRL_H3_plotting_order.csv", 147 | sep = ",", 148 | header = TRUE, 149 | stringsAsFactors = FALSE) 150 | 151 | H3_vars <- haplotype_alloc$line 152 | 153 | colnames(markers_6A)[1] <- "marker" 154 | 155 | markers_6A_long <- melt(markers_6A, id.vars = c("marker", "pos")) 156 | 157 | ``` 158 | 159 | ```{r} 160 | markers_H3_AGCT <- markers_6A_long[markers_6A_long$variable %in% H3_vars,] 161 | base_colours <- c("A" = "#7570B3", "C" = "#E7298A", "G" = "#1B9E77", "T" = "#E6AB02") 162 | 163 | 164 | data = markers_H3_AGCT 165 | start = 23602422 166 | end = 581841760 167 | data_zoom <- data[(data$pos > start) & (data$pos < end),] 168 | data_zoom$variable <- factor(data_zoom$variable, levels = rev(H3_vars)) 169 | 170 | 171 | plot <- ggplot(data_zoom , aes(x=factor(pos),y=variable)) + 172 | geom_tile(aes(fill = value)) + 173 | theme(axis.text.x=element_text(angle=90, hjust=1)) + 174 | scale_fill_manual(values = base_colours) + 175 | theme_bw() 176 | 177 | plot 178 | 179 | height= 4 180 | width = 10 181 | pdf("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/extended_data/H3_35K_6Aregion.pdf", height = height, width = width) 182 | plot 183 | dev.off() 184 | ``` 185 | 186 | -------------------------------------------------------------------------------- /haplotype-block-assignment/get_ref_coords.r: -------------------------------------------------------------------------------- 1 | ## Get the position of the BLAST blocks with respect to the specific reference assemblies, rather than chinese spring RefSeq 2 | library(stringr) 3 | 4 | ## define functions here 5 | get_ref_start <- function(data, ref){ 6 | ref_start_data <- ref_gff_gene_only[ref_gff_gene_only$cs_id == data["start_transcript"], ] 7 | if (nrow(ref_start_data) > 1){ 8 | ref_start <- ref_start_data[ref_start_data$score == max(ref_start_data$score), "start"] 9 | if (length(ref_start) > 1){ 10 | temp <- ref_start_data[ref_start_data$score == max(ref_start_data$score), ] 11 | ref_start <- temp[temp$start == min(temp$start), "start"] 12 | } 13 | } else { 14 | ref_start <- ref_start_data[, "start"] 15 | } 16 | ref_start 17 | } 18 | 19 | get_ref_start_transcript <- function(data, ref){ 20 | ref_start_transcript_data <- ref_gff_gene_only[ref_gff_gene_only$cs_id == data["start_transcript"],] 21 | if (nrow(ref_start_transcript_data) > 1){ 22 | ref_start_transcript <- ref_start_transcript_data[ref_start_transcript_data$score == max(ref_start_transcript_data$score), "var_id"] 23 | if (length(ref_start_transcript) > 1){ 24 | temp <- ref_start_transcript_data[ref_start_transcript_data$score == max(ref_start_transcript_data$score), ] 25 | ref_start_transcript <- temp[temp$start == min(temp$start), "var_id"] 26 | } 27 | } else { 28 | ref_start_transcript <- ref_start_transcript_data[, "var_id"] 29 | } 30 | } 31 | 32 | get_ref_end <- function(data, ref){ 33 | ref_end_data <- ref_gff_gene_only[ref_gff_gene_only$cs_id == data["end_transcript"], ] 34 | if (nrow(ref_end_data) > 1){ 35 | ref_end <- ref_end_data[ref_end_data$score == max(ref_end_data$score), "end"] 36 | if (length(ref_end) > 1){ 37 | temp <- ref_end_data[ref_end_data$score == max(ref_end_data$score), ] 38 | ref_end <- temp[temp$start == min(temp$start), "end"] 39 | } 40 | } else { 41 | ref_end <- ref_end_data[, "end"] 42 | } 43 | ref_end 44 | } 45 | 46 | get_ref_end_transcript <- function(data, ref){ 47 | ref_end_transcript_data <- ref_gff_gene_only[ref_gff_gene_only$cs_id == data["end_transcript"], ] 48 | if (nrow(ref_end_transcript_data) > 1){ 49 | ref_end_transcript <- ref_end_transcript_data[ref_end_transcript_data$score == max(ref_end_transcript_data$score), "var_id"] 50 | if (length(ref_end_transcript) > 1){ 51 | temp <- ref_end_transcript_data[ref_end_transcript_data$score == max(ref_end_transcript_data$score), ] 52 | ref_end_transcript <- temp[temp$start == min(temp$start), "var_id"] 53 | } 54 | } else { 55 | ref_end_transcript <- ref_end_transcript_data[, "var_id"] 56 | } 57 | ref_end_transcript 58 | } 59 | 60 | ## end of functions 61 | data_dir <- "X:/brintonj/haplotype/whole_genome_blast/blocks/" 62 | 63 | chromosomes <- list.files(data_dir) 64 | 65 | for (chr in chromosomes){ 66 | BLAST_path <- paste0(data_dir, chr, "/haplotype_blocks_BLAST_25_gene_window_2000bp.txt") 67 | 68 | BLAST <- read.table(BLAST_path, 69 | sep = "\t", 70 | header = TRUE, 71 | stringsAsFactors = FALSE) 72 | 73 | head(BLAST) 74 | 75 | BLAST$ref <- str_split_fixed(BLAST$aln_type, "->", 2)[,1] 76 | BLAST$query <- str_split_fixed(BLAST$aln_type, "->", 2)[,2] 77 | 78 | #make the opposite ref query scenario for the blast 79 | BLAST_copy <- BLAST 80 | BLAST_copy$ref <- BLAST$query 81 | BLAST_copy$query <- BLAST$ref 82 | 83 | BLAST_recip <- rbind(BLAST, BLAST_copy) 84 | 85 | EI_vars <- c("cadenza", "claire", "paragon", "robigus", "weebil") 86 | EI_blocks <- subset(BLAST_recip, ref %in% EI_vars) 87 | 88 | references <- unique(BLAST_recip$ref) 89 | references <- references[!(references %in% EI_vars)] 90 | 91 | ref_gff_dir <- "W:/assemblies/releasePGSBv2.0/gff/" 92 | 93 | BLAST_coords <- data.frame(block_no=numeric(), 94 | block_start = numeric(), 95 | block_end = numeric(), 96 | aln_type = character(), 97 | start_transcript = character(), 98 | end_transcript = character(), 99 | window = numeric(), 100 | ref = character(), 101 | query = character(), 102 | ref_start = numeric(), 103 | ref_start_transcript = character(), 104 | ref_end = numeric(), 105 | ref_end_transcript = character()) 106 | 107 | for (ref in references) { 108 | 109 | ref_gff_path <- paste0(ref_gff_dir, ref, ".gff") 110 | 111 | ref_gff <- read.table(ref_gff_path, 112 | sep = "\t", 113 | header = FALSE, 114 | stringsAsFactors = FALSE) 115 | 116 | 117 | ref_gff_gene_only <- ref_gff[ref_gff$V3 == "gene",] 118 | 119 | ref_gff_gene_only <- cbind(ref_gff_gene_only, str_split_fixed(ref_gff_gene_only$V9, ";", 4)) 120 | colnames(ref_gff_gene_only) <- c("chr", "annotation", "biotype", "start", "end", "score", "strand", "missing", "info", "ref_id", "cs_id", "frame", "note") 121 | 122 | ref_gff_gene_only$var_id <- gsub("ID=", "", ref_gff_gene_only$ref_id) 123 | ref_gff_gene_only$cs_id <- gsub("srcmodel=", "", ref_gff_gene_only$cs_id) 124 | ref_gff_gene_only$cs_id <- str_split_fixed(ref_gff_gene_only$cs_id, "\\.", 2)[,1] 125 | 126 | 127 | ## this bit gets the reference blocks 128 | ## we also need to make sure that we incorportate the EI varieties because we can't get their position (scaffolds) so we will use the query position for this too 129 | BLAST_ref <- BLAST_recip[BLAST_recip$ref == ref,] 130 | BLAST_ref <- rbind(BLAST_ref, EI_blocks[EI_blocks$query == ref,]) 131 | BLAST_ref <- BLAST_ref[complete.cases(BLAST_ref),] 132 | 133 | BLAST_ref$ref_start <- apply(BLAST_ref, 1, FUN=get_ref_start, ref = ref) 134 | BLAST_ref$ref_start_transcript <- apply(BLAST_ref, 1, FUN=get_ref_start_transcript, ref = ref) 135 | BLAST_ref$ref_end <- apply(BLAST_ref, 1, FUN=get_ref_end, ref = ref) 136 | BLAST_ref$ref_end_transcript <- apply(BLAST_ref, 1, FUN=get_ref_end_transcript, ref = ref) 137 | 138 | BLAST_coords <- rbind(BLAST_coords, BLAST_ref) 139 | } 140 | 141 | BLAST_coords$block_no <- as.character(BLAST_coords$block_no) 142 | BLAST_coords$block_start <- as.numeric(as.character(BLAST_coords$block_start)) 143 | BLAST_coords$block_end <- as.numeric(as.character(BLAST_coords$block_end)) 144 | BLAST_coords$aln_type <- as.character(BLAST_coords$aln_type) 145 | BLAST_coords$start_transcript <- as.character(BLAST_coords$start_transcript) 146 | BLAST_coords$end_transcript <- as.character(BLAST_coords$end_transcript) 147 | BLAST_coords$window <- as.numeric(as.character(BLAST_coords$window)) 148 | BLAST_coords$ref <- as.character(BLAST_coords$ref) 149 | BLAST_coords$query <- as.character(BLAST_coords$query) 150 | BLAST_coords$ref_start <- as.numeric(as.character(BLAST_coords$ref_start)) 151 | BLAST_coords$ref_start_transcript <- as.character(BLAST_coords$ref_start_transcript) 152 | BLAST_coords$ref_end <- as.numeric(as.character(BLAST_coords$ref_end)) 153 | BLAST_coords$ref_end_transcript <- as.character(BLAST_coords$ref_end_transcript) 154 | 155 | write.table(BLAST_coords, 156 | file = paste0(data_dir, chr, "/haplotype_blocks_BLAST_25_gene_window_2000bp_ref_coords.txt"), 157 | sep = "\t", 158 | col.names = TRUE, 159 | row.names = FALSE, 160 | quote = FALSE) 161 | } 162 | 163 | ## Unfortunately this misses out EI EI comparisons, so we need to add them back 164 | for (chr in chromosomes){ 165 | BLAST_path <- paste0(data_dir, chr, "/haplotype_blocks_BLAST_25_gene_window_2000bp.txt") 166 | 167 | BLAST <- read.table(BLAST_path, 168 | sep = "\t", 169 | header = TRUE, 170 | stringsAsFactors = FALSE) 171 | 172 | BLAST_coords_path <- paste0(data_dir, chr, "/haplotype_blocks_BLAST_25_gene_window_2000bp_ref_coords.txt") 173 | 174 | BLAST_coords <- read.table(BLAST_coords_path, 175 | sep = "\t", 176 | header = TRUE, 177 | stringsAsFactors = FALSE) 178 | 179 | BLAST$ref <- str_split_fixed(BLAST$aln_type, "->", 2)[,1] 180 | BLAST$query <- str_split_fixed(BLAST$aln_type, "->", 2)[,2] 181 | 182 | EI_BLAST <- BLAST[BLAST$ref %in% EI_vars & BLAST$query %in% EI_vars,] 183 | 184 | EI_BLAST$ref_start <- EI_BLAST$block_start 185 | EI_BLAST$ref_start_transcript <- EI_BLAST$start_transcript 186 | EI_BLAST$ref_end <- EI_BLAST$block_end 187 | EI_BLAST$ref_end_transcript <- EI_BLAST$end_transcript 188 | 189 | EI_recip <- EI_BLAST 190 | EI_recip$ref <- EI_BLAST$query 191 | EI_recip$query <- EI_BLAST$ref 192 | EI_recip <- rbind(EI_BLAST, EI_recip) 193 | 194 | BLAST_coords <- rbind(BLAST_coords, EI_recip) 195 | 196 | write.table(BLAST_coords, 197 | file = paste0(data_dir, chr, "/haplotype_blocks_BLAST_25_gene_window_2000bp_ref_coords.txt"), 198 | sep = "\t", 199 | col.names = TRUE, 200 | row.names = FALSE, 201 | quote = FALSE) 202 | } 203 | -------------------------------------------------------------------------------- /haplotype-block-assignment/combine_mummer_and_BLAST.r: -------------------------------------------------------------------------------- 1 | ## Script written by Jemima Brinton 2019 2 | ## Aim - combine haplotype blocks from nucmer and BLAST alignments. First use mummer/nucmer blocks, then check BLAST blocks - any not represented by the mummer blocks and >= bin size should be added to the final set 3 | 4 | library(GenomicRanges) 5 | ##functions 6 | 7 | check_block <- function(data, EI_vars, mummer_blocks){ 8 | #print(data) 9 | if ((data["ref"] %in% EI_vars) & (data["query"] %in% EI_vars)){ 10 | count_overlaps <- 0 11 | } else { 12 | if ((data["ref"] %in% EI_vars)){ 13 | mummer_gdf <- mummer_blocks[mummer_blocks$query == data["ref"] & mummer_blocks$ref == data["query"], c("chrom", "block_start", "block_end")] 14 | } else { 15 | mummer_gdf <- mummer_blocks[mummer_blocks$query == data["query"] & mummer_blocks$ref == data["ref"], c("chrom", "block_start", "block_end")] 16 | } 17 | if(nrow(mummer_gdf)>0){ 18 | colnames(mummer_gdf) <- c("chr", "start", "end") 19 | #print(mummer_gdf) 20 | mummer_gr <- makeGRangesFromDataFrame(mummer_gdf) 21 | 22 | to_check_gdf <- data.frame(chr = character(), start = numeric(), end = numeric(), stringsAsFactors = FALSE) 23 | to_check_gdf[1,"chr"] <- data["chrom"] 24 | to_check_gdf[1,"start"] <- data[("ref_start")] 25 | to_check_gdf[1,"end"] <- data[("ref_end")] 26 | #print(to_check_gdf) 27 | 28 | to_check_gr <- makeGRangesFromDataFrame(to_check_gdf) 29 | 30 | overlaps <- findOverlaps(to_check_gr, mummer_gr) 31 | count_overlaps <- length(unique(subjectHits(overlaps))) 32 | } else { 33 | count_overlaps <- 0 34 | } 35 | } 36 | return(count_overlaps) 37 | #print("finish") 38 | } 39 | 40 | ### 41 | 42 | mummer_dir <- "X:/brintonj/haplotype/whole_genome_mummer/blocks/" 43 | BLAST_dir <- "X:/brintonj/haplotype/whole_genome_blast/blocks/" 44 | 45 | chromosomes <- list.files(mummer_dir) 46 | 47 | bin_size <- 5000000 48 | 49 | combined_blocks <- data.frame(ref = character(), 50 | query = character(), 51 | chrom = character(), 52 | ref_start = numeric(), 53 | ref_end = numeric(), 54 | ref_start_transcript = character(), 55 | ref_end_transcript = character(), 56 | cs_start = numeric(), 57 | cs_end = numeric(), 58 | cs_start_transcript = character(), 59 | cs_end_transcript = character(), 60 | source = character()) 61 | 62 | for (chr in chromosomes){ 63 | mummer_blocks <- read.table(paste0(mummer_dir, chr, "/mummer_blocks_chr", chr, ".min20000.5Mb_bins.txt"), 64 | header = TRUE, 65 | sep = "\t", 66 | stringsAsFactors = FALSE) 67 | 68 | BLAST_blocks <- read.table(paste0(BLAST_dir, "chr", chr, "/haplotype_blocks_BLAST_25_gene_window_2000bp_ref_coords.txt"), 69 | header = TRUE, 70 | sep = "\t", 71 | stringsAsFactors = FALSE) 72 | 73 | #filter the BLAST for blocks only above the mummer bin size 74 | #remove spelta 75 | BLAST_blocks$chrom <- paste0("chr", chr) 76 | BLAST_blocks <- BLAST_blocks[!(BLAST_blocks$ref == "spelta" | BLAST_blocks$query == "spelta"),] 77 | BLAST_blocks$block_size <- BLAST_blocks$ref_end - BLAST_blocks$ref_start 78 | BLAST_bin <- BLAST_blocks[BLAST_blocks$block_size >= bin_size,] 79 | BLAST_bin <- BLAST_bin[complete.cases(BLAST_bin$block_start),] 80 | 81 | EI_vars <- c("cadenza", "claire", "paragon", "robigus", "weebil") 82 | BLAST_bin$overlap <- apply(BLAST_bin, 1, check_block, EI_vars = EI_vars, mummer_blocks = mummer_blocks) 83 | BLAST_bin$source <- "BLAST" 84 | BLAST_bin$in_reciprocal <- NA 85 | 86 | mummer_blocks$source <- "mummer" 87 | 88 | BLAST_to_add <- BLAST_bin[BLAST_bin$overlap == 0, c("ref", 89 | "query", 90 | "chrom", 91 | "ref_start", 92 | "ref_end", 93 | "ref_start_transcript", 94 | "ref_end_transcript", 95 | "block_start", 96 | "block_end", 97 | "start_transcript", 98 | "end_transcript", 99 | "source")] 100 | 101 | colnames(BLAST_to_add)[c(8:11)] <- c("cs_start", "cs_end", "cs_start_transcript", "cs_end_transcript") 102 | 103 | mummer_blocks$cs_start <- NA 104 | mummer_blocks$cs_end <- NA 105 | mummer_blocks$cs_start_transcript <- NA 106 | mummer_blocks$cs_end_transcript <- NA 107 | mummer_blocks$ref_start_transcript <- NA 108 | mummer_blocks$ref_end_transcript <- NA 109 | 110 | mummer_to_add <- mummer_blocks[, c("ref", 111 | "query", 112 | "chrom", 113 | "block_start", 114 | "block_end", 115 | "ref_start_transcript", 116 | "ref_end_transcript", 117 | "cs_start", 118 | "cs_end", 119 | "cs_start_transcript", 120 | "cs_end_transcript", 121 | "source")] 122 | colnames(mummer_to_add)[c(4,5)] <- c("ref_start", "ref_end") 123 | 124 | mummer_BLAST <- rbind(mummer_to_add, BLAST_to_add) 125 | 126 | combined_blocks <- rbind(combined_blocks, mummer_BLAST) 127 | } 128 | 129 | write.table(combined_blocks, file = "X:/brintonj/haplotype/whole_genome_mummer_BLAST_5mbp_blocks_combined.tsv", sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE) 130 | 131 | ref_coords_only <- combined_blocks[,c(1:5, 12)] 132 | 133 | write.table(ref_coords_only, file = "X:/brintonj/haplotype/whole_genome_mummer_BLAST_5mbp_blocks_combined_ref_coords.tsv", sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE) 134 | 135 | ## we also want to format the dataframe with the blocks being given a unique identifier 136 | ## columns we want: assembly chromosome start end block_no assembly_coords 137 | 138 | EI_vars <- c("cadenza", "claire", "paragon", "robigus", "weebil") 139 | 140 | ref_coords_only_copy <- ref_coords_only 141 | 142 | ref_coords_only_copy$block_no <- NA 143 | ref_coords_only_copy$assembly_coords <- NA 144 | 145 | ref_coords_only_copy$block_size <- ref_coords_only_copy$ref_end - ref_coords_only_copy$ref_start 146 | ref_coords_only_copy <- ref_coords_only_copy[order(ref_coords_only_copy$block_size, decreasing = TRUE),] 147 | 148 | block_no <- 1 149 | 150 | 151 | for(i in seq(1, nrow(ref_coords_only_copy))){ 152 | data <- ref_coords_only_copy[i,] 153 | 154 | if(is.na(ref_coords_only_copy[i, "block_no"])){ 155 | ref <- ref_coords_only_copy[i,"ref"] 156 | query <- ref_coords_only_copy[i,"query"] 157 | chrom <- ref_coords_only_copy[i,"chrom"] 158 | 159 | ref_coords_only_copy[i,"block_no"] <- block_no 160 | 161 | matching_rows <- which(((ref_coords_only_copy$ref == query) & 162 | (ref_coords_only_copy$query == ref) & 163 | (ref_coords_only_copy$chrom == chrom)) & (((ref_coords_only_copy$ref_start >= data$ref_start & ref_coords_only_copy$ref_start <= data$ref_end) | 164 | (ref_coords_only_copy$ref_end >= data$ref_start & ref_coords_only_copy$ref_end <= data$ref_end)) | 165 | ((data$ref_start >= ref_coords_only_copy$ref_start & data$ref_start <= ref_coords_only_copy$ref_end) | 166 | (data$ref_end >= ref_coords_only_copy$ref_start & data$ref_end <= ref_coords_only_copy$ref_end)))) 167 | 168 | 169 | if(length(matching_rows) > 0){ 170 | ref_coords_only_copy[matching_rows, "block_no"] <- block_no 171 | } else if (query %in% EI_vars) { 172 | last_line <- nrow(ref_coords_only_copy) 173 | ref_coords_only_copy <- rbind(ref_coords_only_copy, ref_coords_only_copy[i,]) 174 | ref_coords_only_copy[last_line+1,"block_no"] <- block_no 175 | ref_coords_only_copy[last_line+1,"ref"] <- query 176 | ref_coords_only_copy[last_line+1,"query"] <- ref 177 | ref_coords_only_copy[last_line+1,"assembly_coords"] <- ref 178 | } 179 | if(query %in% EI_vars & ref %in% EI_vars){ 180 | ref_coords_only_copy[i,"assembly_coords"] <- "IWGSCv1.1" 181 | ref_coords_only_copy[matching_rows,"assembly_coords"] <- "IWGSCv1.1" 182 | } else if (query %in% EI_vars) { 183 | ref_coords_only_copy[i,"assembly_coords"] <- ref 184 | ref_coords_only_copy[matching_rows,"assembly_coords"] <- ref 185 | } else if (ref %in% EI_vars) { 186 | ref_coords_only_copy[i,"assembly_coords"] <- query 187 | ref_coords_only_copy[matching_rows,"assembly_coords"] <- query 188 | } else { 189 | ref_coords_only_copy[i,"assembly_coords"] <- ref 190 | ref_coords_only_copy[matching_rows,"assembly_coords"] <- query 191 | } 192 | block_no <- block_no + 1 193 | } else { 194 | print("sorted") 195 | } 196 | } 197 | 198 | ref_coords_only_copy <- ref_coords_only_copy[order(ref_coords_only_copy$chrom, ref_coords_only_copy$ref, ref_coords_only_copy$ref_start),] 199 | 200 | #combine with the name of the ref chroms 201 | 202 | ref_suffix <- read.table("X:/brintonj/haplotype/assemblies_suffix.txt", sep = "\t", header = TRUE, stringsAsFactors = FALSE) 203 | ref_coords_only_copy$assembly_chrom <- ref_coords_only_copy$chrom 204 | 205 | for (i in seq(1, nrow(ref_coords_only_copy))){ 206 | assembly <- ref_coords_only_copy[i, "assembly_coords"] 207 | chrom <- ref_coords_only_copy[i, "chrom"] 208 | if (assembly == "IWGSCv1.1"){ 209 | } else { 210 | ref_coords_only_copy[i, "assembly_chrom"] <- paste0(chrom, "__", ref_suffix[ref_suffix$name == assembly, "suffix"]) 211 | } 212 | } 213 | 214 | colnames(ref_coords_only_copy)[8] <- "ref_assembly" 215 | colnames(ref_coords_only_copy)[10] <- "ref_chrom" 216 | 217 | write.table(ref_coords_only_copy, file = "X:/brintonj/haplotype/whole_genome_mummer_BLAST_5mbp_blocks_combined_ref_coords_block_numbers.tsv", sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE) 218 | 219 | -------------------------------------------------------------------------------- /precision-recall-analysis/calculate_combined_precision_recall.r: -------------------------------------------------------------------------------- 1 | #Jemima Brinton 2020 2 | #Script to carry out the precision recall calculations for the mummer and blast blocks 3 | 4 | library(GenomicRanges) 5 | library(ggplot2) 6 | ## Functions 7 | 8 | read_BLAST_remove_EI <- function(blast_blocks_path, chrom, EI_vars, gene_window_sizes){ 9 | blast_blocks <- data.frame(assembly = character(), 10 | reference = character(), 11 | chromosome = character(), 12 | start = numeric(), 13 | end = numeric(), 14 | block_no = numeric(), 15 | chr_length = numeric(), 16 | window_size = numeric(), 17 | gene_window = numeric()) 18 | 19 | for (gene_window in gene_window_sizes){ 20 | temp <- read.table(paste0(blast_blocks_path, "coord_converted.chr", chrom, "_haplotype_blocks_aln_", gene_window, "_gene_window_drop", ceiling(gene_window*0.1), ".tsv"), 21 | sep = "\t", 22 | header = TRUE, 23 | stringsAsFactors = FALSE) 24 | 25 | temp$gene_window <- gene_window 26 | temp$window_block <- paste0(temp$block_no, "_", gene_window, "gene") 27 | blast_blocks <- rbind(blast_blocks, temp) 28 | } 29 | 30 | #remove EI v EI comparisons as we can't compare with mummer 31 | 32 | block_numbers <- unique(blast_blocks$window_block) 33 | 34 | blocks_to_remove <- character() 35 | 36 | for (block_number in block_numbers){ 37 | assemblies <- unique(blast_blocks[blast_blocks$window_block == block_number, "assembly"]) 38 | if (length(assemblies) > 2){ 39 | print("ERROR") 40 | print(block_number) 41 | print(assemblies) 42 | } else if((assemblies[1] %in% EI_vars) & (assemblies[2] %in% EI_vars)){ 43 | blocks_to_remove <- c(blocks_to_remove, block_number) 44 | #print(block_number) 45 | #print(assemblies) 46 | 47 | } 48 | } 49 | 50 | blast_blocks_to_use <- blast_blocks[!(blast_blocks$window_block %in% blocks_to_remove),] 51 | blast_blocks_to_use[blast_blocks_to_use$window_size == "cds", "window_size"] <- "cdsbp" 52 | return(blast_blocks_to_use) 53 | } 54 | 55 | precision_recall_flanking_window_sizes <- function(overall_blocks_to_use, mummer){ 56 | precision_recall <- data.frame(aln_type = character(), window_size = character(), BLAST_blocks = numeric(), BLAST_found = numeric(), mummer_blocks = numeric(), mummer_found = numeric(), precision = numeric(), recall = numeric()) 57 | 58 | window_sizes <- unique(overall_blocks_to_use$window_size) 59 | alns <- unique(mummer$aln_type) 60 | 61 | for (window_size in window_sizes){ 62 | window_overall_blocks <- overall_blocks_to_use[overall_blocks_to_use$window_size == window_size,] 63 | block_numbers_window <- unique(window_overall_blocks$block_no) 64 | for (aln in alns){ 65 | mummer_aln <- mummer[mummer$aln_type == aln,] 66 | 67 | ref <- unique(mummer_aln$ref) 68 | query <- unique(mummer_aln$query) 69 | 70 | aln_blocks <- numeric() 71 | #get the corresponding blast block IDs 72 | for (block_number in block_numbers_window){ 73 | assemblies <- unique(window_overall_blocks[window_overall_blocks$block_no == block_number, "assembly"]) 74 | if((ref %in% assemblies) & (query %in% assemblies)){ 75 | aln_blocks <- c(aln_blocks, block_number) 76 | } 77 | } 78 | 79 | window_aln <- window_overall_blocks[(window_overall_blocks$block_no %in% aln_blocks) & (window_overall_blocks$assembly == ref),] 80 | 81 | 82 | mummer_blocks <- nrow(mummer_aln) 83 | BLAST_blocks <- nrow(window_aln) 84 | 85 | if(mummer_blocks == 0 | BLAST_blocks == 0){ 86 | BLAST_found <- 0 87 | mummer_found <- 0 88 | } else { 89 | mummer_gdf <- mummer_aln[,c("ref_chrom", "block_start", "block_end")] 90 | colnames(mummer_gdf) <- c("chr", "start", "end") 91 | 92 | window_gdf <- window_aln[,c("chromosome", "start", "end")] 93 | colnames(window_gdf) <- c("chr", "start", "end") 94 | 95 | # make genomic ranges objects for each 96 | mummer_gr <- makeGRangesFromDataFrame(mummer_gdf) 97 | window_gr <- makeGRangesFromDataFrame(window_gdf) 98 | 99 | overlaps <- findOverlaps(mummer_gr, window_gr) 100 | #precision first - how many BLAST blocks found in the mummer haplotype? 101 | BLAST_found <- length(unique(subjectHits(overlaps))) 102 | #recall - how many mummer blocks found in the BLAST haplotype? 103 | mummer_found <- length(unique(queryHits(overlaps))) 104 | } 105 | 106 | precision <- BLAST_found/BLAST_blocks 107 | recall <- mummer_found/mummer_blocks 108 | precision_recall_aln <- data.frame(aln_type = aln, window_size, BLAST_blocks, BLAST_found, mummer_blocks, mummer_found, precision, recall) 109 | precision_recall <- rbind(precision_recall, precision_recall_aln) 110 | } 111 | } 112 | return(precision_recall) 113 | } 114 | 115 | f1 <- function(precision, recall){ 116 | f1_score <- 2*((precision*recall)/(precision+recall)) 117 | return(f1_score) 118 | } 119 | 120 | precision_recall_all_windows <- function(blast_blocks_to_use, gene_window_sizes){ 121 | precision_recall_all_blocks <- data.frame(aln_type = character(), 122 | window_size = character(), 123 | BLAST_blocks = numeric(), 124 | BLAST_found = numeric(), 125 | mummer_blocks = numeric(), 126 | mummer_found = numeric(), 127 | precision = numeric(), 128 | recall = numeric(), 129 | gene_block = character()) 130 | 131 | 132 | 133 | for (gene_window in gene_window_sizes){ 134 | gene_window_data <- blast_blocks_to_use[blast_blocks_to_use$gene_window == gene_window,] 135 | 136 | 137 | precision_recall_block <- precision_recall_flanking_window_sizes(overall_blocks_to_use = gene_window_data, mummer = mummer_blocks) 138 | 139 | precision_recall_block$gene_block <- gene_window 140 | precision_recall_all_blocks <- rbind(precision_recall_all_blocks, precision_recall_block) 141 | } 142 | 143 | 144 | 145 | precision_recall_all_blocks$window_size <- factor(precision_recall_all_blocks$window_size, levels = c("cdsbp", "0bp", "1000bp", "2000bp", "5000bp")) 146 | precision_recall_all_blocks$gene_block <- as.character(precision_recall_all_blocks$gene_block) 147 | precision_recall_all_blocks$gene_block <- factor(precision_recall_all_blocks$gene_block, levels = c("10", "15", "20", "25", "30")) 148 | 149 | precision_recall_all_blocks$f1_score <- f1(precision = precision_recall_all_blocks$precision, recall = precision_recall_all_blocks$recall) 150 | return(precision_recall_all_blocks) 151 | } 152 | 153 | plots_precision_recall <- function(precision_recall_all_blocks, chrom, plot_dir){ 154 | precision <- ggplot(precision_recall_all_blocks, aes(x = window_size, y = precision, fill = gene_block)) + 155 | geom_boxplot() + 156 | scale_fill_brewer(palette = "Set1") 157 | 158 | ggsave(precision, file = paste0(plot_dir, "chr", chrom, "precision_window_sizes_varying_gene_blocks.png"), 159 | height = 3, width = 6) 160 | 161 | recall <- ggplot(precision_recall_all_blocks, aes(x = window_size, y = recall, fill = gene_block)) + 162 | geom_boxplot() + 163 | scale_fill_brewer(palette = "Set1") 164 | 165 | ggsave(recall, file = paste0(plot_dir, "chr", chrom, "recall_window_sizes_varying_gene_blocks.png"), 166 | height = 3, width = 6) 167 | 168 | f1 <- ggplot(precision_recall_all_blocks, aes(x = window_size, y = f1_score, fill = gene_block)) + 169 | geom_boxplot() + 170 | scale_fill_brewer(palette = "Set1") 171 | 172 | ggsave(f1, file = paste0(plot_dir, "chr", chrom, "f1_score_window_sizes_varying_gene_blocks.png"), 173 | height = 3, width = 6) 174 | 175 | } 176 | ## end of functions 177 | 178 | base_dir <- "C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/precision_recall/" 179 | data_dir <- paste0(base_dir, "all_chrom_data_to_use/") 180 | plot_dir <- paste0(base_dir, "plots/") 181 | 182 | dir.create(plot_dir) 183 | 184 | mummer_blocks_path <- "X:/brintonj/haplotype/whole_genome_mummer/blocks/" 185 | 186 | EI_vars <- c("cadenza", "claire", "paragon", "robigus", "weebil") 187 | 188 | mummer_bin_size <- 5000000 189 | 190 | gene_window_sizes <- c(10, 15, 20, 25,30) 191 | 192 | chromosomes <- c("1D", "2B", "3B", "4D", "6A", "7A") 193 | 194 | precision_recall_all_blocks_all_chroms <- data.frame(aln_type = character(), 195 | window_size = character(), 196 | BLAST_blocks = numeric(), 197 | BLAST_found = numeric(), 198 | mummer_blocks = numeric(), 199 | mummer_found = numeric(), 200 | precision = numeric(), 201 | recall = numeric(), 202 | gene_block = character(), 203 | f1 = numeric(), 204 | chrom = character()) 205 | 206 | for (chrom in chromosomes){ 207 | print(chrom) 208 | #read in mummer blocks 209 | mummer_blocks <- read.table(paste0(mummer_blocks_path, chrom, "/mummer_blocks_chr", chrom, ".min20000.5Mb_bins.txt"), 210 | header = TRUE, 211 | stringsAsFactors = FALSE) 212 | 213 | mummer_blocks$aln_type <- paste0(mummer_blocks$ref, "->", mummer_blocks$query) 214 | 215 | #read in blast blocks 216 | blast_blocks_to_use <- read_BLAST_remove_EI(blast_blocks_path = data_dir, 217 | chrom = chrom, 218 | EI_vars = EI_vars, 219 | gene_window_sizes = gene_window_sizes) 220 | 221 | precision_recall_all_blocks <- precision_recall_all_windows(blast_blocks_to_use = blast_blocks_to_use, 222 | gene_window_sizes = gene_window_sizes) 223 | 224 | write.table(precision_recall_all_blocks, file = paste0(plot_dir, "chr", chrom, "precision_recall_f1_table.tsv"), 225 | sep = "\t", 226 | col.names = TRUE, 227 | row.names = FALSE, 228 | quote = FALSE) 229 | 230 | plots_precision_recall(precision_recall_all_blocks = precision_recall_all_blocks, 231 | chrom = chrom, 232 | plot_dir = plot_dir) 233 | 234 | precision_recall_all_blocks$chrom <- chrom 235 | 236 | precision_recall_all_blocks_all_chroms <- rbind(precision_recall_all_blocks_all_chroms, precision_recall_all_blocks) 237 | 238 | } 239 | 240 | write.table(precision_recall_all_blocks_all_chroms, file = paste0(plot_dir, "combined_precision_recall_f1_table.tsv"), 241 | sep = "\t", 242 | col.names = TRUE, 243 | row.names = FALSE, 244 | quote = FALSE) 245 | 246 | plots_precision_recall(precision_recall_all_blocks = precision_recall_all_blocks_all_chroms, 247 | chrom = "combined", 248 | plot_dir = plot_dir) 249 | 250 | -------------------------------------------------------------------------------- /precision-recall-analysis/calculate_BLAST_blocks_all_window_sizes_flanking_sequences.r: -------------------------------------------------------------------------------- 1 | library(stringr) 2 | library(plyr) 3 | library(reshape2) 4 | 5 | ## functions 6 | 7 | read_pairwise_position <- function(blast_path_gz, gtf, outfile){ 8 | all_comp <- read.table(gzfile(blast_path_gz), sep = "\t", header = TRUE, stringsAsFactors = FALSE) 9 | 10 | head(all_comp) 11 | 12 | #get the refseq position of the genes 13 | gtf$transcript_id <- str_split_fixed(gtf$V9, ";", 2)[,1] 14 | gtf$transcript_id <- gsub("transcript_id ", "", gtf$transcript_id) 15 | 16 | gene_only <- gtf[gtf$V3 == "gene",] 17 | 18 | gene_positions <- gene_only[,c("transcript_id", "V1", "V4", "V5", "V7")] 19 | colnames(gene_positions) <- c("transcript", "chr", "start", "end", "strand") 20 | gene_positions$transcript <- gsub("ID=", "", gene_positions$transcript) 21 | 22 | #now add positions to the pairwise comparison file 23 | 24 | all_comp_positions <- merge(all_comp, gene_positions, all.x = TRUE, all.y = FALSE) 25 | head(all_comp_positions) 26 | 27 | write.table(all_comp_positions, file = outfile, sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE) 28 | return(all_comp_positions) 29 | } 30 | 31 | separate_output_identities <- function(comp_data, outdir, suffix){ 32 | varieties <- unique(comp_data$var_query) 33 | for (i in seq(1, length(varieties))){ 34 | variety <- varieties[i] 35 | print(variety) 36 | variety_data <- comp_data[grep(variety, comp_data$aln_type),] 37 | write.table(variety_data, file = paste0(outdir, variety, suffix, ".tab"), sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE) 38 | } 39 | } 40 | 41 | read_pairwise_position_no_header <- function(blast_path_gz, gtf, outfile, col_names){ 42 | all_comp <- read.table(gzfile(blast_path_gz), sep = "\t", header = FALSE, stringsAsFactors = FALSE) 43 | colnames(all_comp) <- col_names 44 | head(all_comp) 45 | 46 | #get the refseq position of the genes 47 | gtf$transcript_id <- str_split_fixed(gtf$V9, ";", 2)[,1] 48 | gtf$transcript_id <- gsub("transcript_id ", "", gtf$transcript_id) 49 | 50 | gene_only <- gtf[gtf$V3 == "gene",] 51 | 52 | gene_positions <- gene_only[,c("transcript_id", "V1", "V4", "V5", "V7")] 53 | colnames(gene_positions) <- c("transcript", "chr", "start", "end", "strand") 54 | gene_positions$transcript <- gsub("ID=", "", gene_positions$transcript) 55 | 56 | #now add positions to the pairwise comparison file 57 | 58 | all_comp_positions <- merge(all_comp, gene_positions, all.x = TRUE, all.y = FALSE) 59 | head(all_comp_positions) 60 | 61 | write.table(all_comp_positions, file = outfile, sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE) 62 | return(all_comp_positions) 63 | } 64 | 65 | separate_output_identities <- function(comp_data, outdir, suffix){ 66 | varieties <- unique(comp_data$var_query) 67 | for (i in seq(1, length(varieties))){ 68 | variety <- varieties[i] 69 | print(variety) 70 | variety_data <- comp_data[grep(variety, comp_data$aln_type),] 71 | write.table(variety_data, file = paste0(outdir, variety, suffix, ".tab"), sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE) 72 | } 73 | } 74 | calculate_pid_windows <- function(aln_data, window_size = 20, drop_no = 2){ 75 | aln_data <- aln_data[order(aln_data$start),] 76 | aln <- unique(aln_data$aln_type) 77 | mean_pidents <- data.frame(start = numeric(), end = numeric(), start_position = numeric(), end_position = numeric(), pident_mean = numeric(), aln_type = character()) 78 | 79 | for(j in seq(1, nrow(aln_data)-window_size)){ 80 | window_start = j 81 | start_position = aln_data[window_start, "start"] 82 | window_end = (window_start + window_size) - 1 83 | end_position = aln_data[window_end, "end"] 84 | pidents <- aln_data[c(window_start:window_end), "pident"] 85 | pidents <- pidents[order(pidents, decreasing = TRUE)] 86 | pidents_sub <- pidents[c(1:(length(pidents)-drop_no))] 87 | pident_mean <- mean(pidents_sub) 88 | aln_to_add = data.frame(start = window_start, 89 | end = window_end, 90 | start_position = start_position, 91 | end_position = end_position, 92 | pident_mean = pident_mean, 93 | aln_type = aln) 94 | mean_pidents <- rbind(mean_pidents, aln_to_add) 95 | } 96 | return(mean_pidents) 97 | } 98 | 99 | 100 | assign_blocks <-function(mean_pidents){ 101 | block_positions <- data.frame(block_no = numeric(), block_start = numeric(), block_end = numeric(), aln_type = character()) 102 | 103 | mean_pidents_copy <- mean_pidents 104 | mean_pidents_copy$block_no <- NA 105 | block_no = 1 106 | new_block <- "no" 107 | 108 | for (i in seq(1, nrow(mean_pidents_copy))){ 109 | print(i) 110 | if(mean_pidents_copy[i, "pident_mean"] < 100){ 111 | print("less than 100%") 112 | mean_pidents_copy[i, "block_no"] <- NA 113 | } else if (mean_pidents_copy[i, "pident_mean"] == 100){ 114 | if(new_block == "no"){ 115 | mean_pidents_copy[i, "block_no"] <- block_no 116 | } else if (new_block == "yes"){ 117 | if(mean_pidents_copy[i, "start_position"] < end_prev_block){ 118 | block_no <- block_no-1 119 | mean_pidents_copy[i, "block_no"] <- block_no 120 | } else { 121 | mean_pidents_copy[i, "block_no"] <- block_no 122 | } 123 | } 124 | if(i == nrow(mean_pidents_copy)){ 125 | print("end") 126 | } else if (mean_pidents_copy[i+1, "pident_mean"] == 100){ 127 | block_no <- block_no 128 | new_block <- "no" 129 | print("same block") 130 | } else if (mean_pidents_copy[i+1, "pident_mean"] < 100){ 131 | block_no <- block_no+1 132 | new_block <- "yes" 133 | print("new_block") 134 | end_prev_block <- mean_pidents_copy[i, "end_position"] 135 | } 136 | } 137 | } 138 | return(mean_pidents_copy) 139 | } 140 | 141 | summarise_blocks <- function(mean_pidents_copy){ 142 | #now get the start and end of each block 143 | blocks <- unique(mean_pidents_copy$block_no) 144 | blocks <- blocks[complete.cases(blocks)] 145 | 146 | block_positions <- data.frame(block_no = numeric(), block_start = numeric(), block_end = numeric(), aln_type = character()) 147 | 148 | for(block in blocks){ 149 | block_data <- subset(mean_pidents_copy, block_no == block) 150 | block_start <- min(block_data$start_position) 151 | block_end <- max(block_data$end_position) 152 | aln_type <- unique(block_data$aln_type) 153 | to_add <- data.frame(block_no = block, block_start = block_start, block_end = block_end, aln_type = aln_type) 154 | block_positions <- rbind(block_positions, to_add) 155 | } 156 | return(block_positions) 157 | } 158 | 159 | calculate_blocks <- function (data_dir, window_sizes, gene_no_window, drop_no, chrom = "6A"){ 160 | 161 | overall_blocks <- data.frame(block_no = numeric(), block_start = numeric(), block_end = numeric(), aln_type = character(), window = character()) 162 | 163 | for(window in window_sizes){ 164 | print(window) 165 | window_dir <- paste0(data_dir, window) 166 | id_data_path <- paste0(window_dir, "/varieties_", chrom, "_identities_", window, "_refseq_positions.tab") 167 | id_data <- read.table(id_data_path, sep = "\t", header = TRUE, stringsAsFactors = FALSE) 168 | id_data_noN <- id_data[id_data$Ns_total == 0,] 169 | 170 | window_blocks <- data.frame(block_no = numeric(), block_start = numeric(), block_end = numeric(), aln_type = character(), window_size = character()) 171 | 172 | alns <- unique(id_data_noN$aln_type) 173 | for(aln in alns){ 174 | print(aln) 175 | aln_data <- id_data_noN[id_data_noN$aln_type == aln,] 176 | #block_info <- calculate_hap_blocks(aln_data = aln_data) 177 | 178 | aln_mean_pidents <- calculate_pid_windows(aln_data = aln_data, window_size = gene_no_window, drop_no = drop_no) 179 | aln_mean_pidents <- aln_mean_pidents[complete.cases(aln_mean_pidents),] 180 | if(max(aln_mean_pidents$pident_mean) < 100){ 181 | block_info <- data.frame(block_no = NA, block_start = NA, block_end = NA, aln_type = aln, window_size = window) 182 | } else { 183 | aln_mean_pidents_copy <- assign_blocks(mean_pidents = aln_mean_pidents) 184 | block_info <- summarise_blocks(mean_pidents_copy = aln_mean_pidents_copy) 185 | block_info$window_size <- window 186 | } 187 | window_blocks <- rbind(window_blocks, block_info) 188 | } 189 | overall_blocks <- rbind(overall_blocks, window_blocks) 190 | } 191 | outfile <- paste0(data_dir, "haplotype_blocks_aln_", gene_no_window, "_gene_window_drop", drop_no, ".tsv") 192 | write.table(overall_blocks, file = outfile, sep = "\t", row.names = FALSE, quote = FALSE) 193 | return(overall_blocks) 194 | } 195 | 196 | ## End of functions 197 | 198 | ## calculate the blocks for 1D, 2B, 3B, 4D, 6A, 7A 199 | 200 | raw_data_dir <- "W:/ramirezr/SM1/final_pairwise_blast/" 201 | out_dir <- "X:/brintonj/haplotype/whole_genome_blast/precision_recall/varying_window_blocks/" 202 | 203 | pairwise_header <- c("transcript", "query", "subject", "var_query", "var_subject", "aln_type", "length", "pident", "Ns_query", "Ns_subject", "Ns_total", "Flanking") 204 | #same gtf for all (using RefSeq order for now) 205 | HC_gtf <- read.table("W://WGAv1.0//annotation//IWGSC_v1.1_HC_20170706.gff3", sep = "\t", header = FALSE, stringsAsFactors = FALSE) 206 | LC_gtf <- read.table("W://WGAv1.0//annotation//IWGSC_v1.1_LC_20170706.gff3", sep = "\t", header = FALSE, stringsAsFactors = FALSE) 207 | ALL_gtf <- rbind(HC_gtf, LC_gtf) 208 | 209 | chromosomes_to_calculate <- c("1D", "2B", "3B", "4D", "6A", "7A") 210 | 211 | for (chrom in chromosomes_to_calculate){ 212 | files_to_run <- list.files(raw_data_dir)[grep(paste0("varieties_", chrom, "_identities"), list.files(raw_data_dir))] 213 | 214 | for (i in seq(1, length(files_to_run))){ 215 | file <- files_to_run[i] 216 | window <- gsub(paste0("varieties_", chrom, "_identities_"), "", file) 217 | window <- gsub("\\.tab.gz", "", window) 218 | window_dir <- paste0(out_dir, window, "/") 219 | 220 | dir.create(window_dir) 221 | 222 | all_comp_positions <- read_pairwise_position_no_header(blast_path_gz = paste0(raw_data_dir, file), gtf = ALL_gtf, outfile = paste0(window_dir, "/varieties_", chrom, "_identities_", window, "_refseq_positions.tab"), col_names = pairwise_header) 223 | 224 | #separate_output_identities(comp_data = all_comp_positions, outdir = window_dir, suffix = paste0( "_", chrom, "_identities_refseq_positions_", window)) 225 | } 226 | 227 | ## ok now we need to call the blocks for all the different window sizes and criteria 228 | 229 | window_sizes <- c("cds", "0bp", "1000bp", "2000bp", "5000bp") 230 | 231 | #OK now we want to calculate the blocks for all the different comparisons for the different window sizes 232 | 233 | gene_no_window <- 10 234 | 235 | window_10_blocks <- calculate_blocks(data_dir = out_dir, 236 | window_sizes = window_sizes, 237 | gene_no_window = gene_no_window, 238 | drop_no = ceiling(gene_no_window*0.1), 239 | chrom = chrom) 240 | 241 | window_10_blocks$block_size <- window_10_blocks$block_end-window_10_blocks$block_start 242 | window_10_blocks$chr <- paste0("chr", chrom) 243 | window_10_blocks$gene_block <- gene_no_window 244 | 245 | gene_no_window <- 15 246 | 247 | window_15_blocks <- calculate_blocks(data_dir = out_dir, 248 | window_sizes = window_sizes, 249 | gene_no_window = gene_no_window, 250 | drop_no = ceiling(gene_no_window*0.1), 251 | chrom = chrom) 252 | 253 | window_15_blocks$block_size <- window_15_blocks$block_end-window_15_blocks$block_start 254 | window_15_blocks$chr <- paste0("chr", chrom) 255 | window_15_blocks$gene_block <- gene_no_window 256 | 257 | gene_no_window <- 25 258 | 259 | window_25_blocks <- calculate_blocks(data_dir = out_dir, 260 | window_sizes = window_sizes, 261 | gene_no_window = gene_no_window, 262 | drop_no = ceiling(gene_no_window*0.1), 263 | chrom = chrom) 264 | 265 | window_25_blocks$block_size <- window_25_blocks$block_end-window_25_blocks$block_start 266 | window_25_blocks$chr <- paste0("chr", chrom) 267 | window_25_blocks$gene_block <- gene_no_window 268 | 269 | gene_no_window <- 30 270 | 271 | window_30_blocks <- calculate_blocks(data_dir = out_dir, 272 | window_sizes = window_sizes, 273 | gene_no_window = gene_no_window, 274 | drop_no = ceiling(gene_no_window*0.1), 275 | chrom = chrom) 276 | 277 | window_30_blocks$block_size <- window_30_blocks$block_end-window_30_blocks$block_start 278 | window_30_blocks$chr <- paste0("chr", chrom) 279 | window_30_blocks$gene_block <- gene_no_window 280 | 281 | gene_no_window <- 20 282 | 283 | window_20_blocks <- calculate_blocks(data_dir = out_dir, 284 | window_sizes = window_sizes, 285 | gene_no_window = gene_no_window, 286 | drop_no = ceiling(gene_no_window*0.1), 287 | chrom = chrom) 288 | 289 | window_20_blocks$block_size <- window_20_blocks$block_end-window_20_blocks$block_start 290 | window_20_blocks$chr <- paste0("chr", chrom) 291 | window_20_blocks$gene_block <- gene_no_window 292 | 293 | all_window_blocks <- rbind(window_10_blocks, window_15_blocks, window_20_blocks, window_25_blocks, window_30_blocks) 294 | 295 | write.table(all_window_blocks, file = paste0(out_dir, "combined_", chrom, "_all_flanking_all_windown_sizes_BLAST_blocks.tsv"), 296 | sep = "\t", 297 | col.names = TRUE, 298 | row.names = FALSE, 299 | quote = FALSE) 300 | } 301 | 302 | ##### 303 | -------------------------------------------------------------------------------- /haplotype-block-assignment/assign_BLAST_blocks_whole_genome.r: -------------------------------------------------------------------------------- 1 | ## Script written by Jemima Brinton 2019 2 | ## Aim - define haplotype blocks across whole wheat genome based on pairwise BLAST alignments 3 | 4 | 5 | ##DEFINE FUNCTIONS 6 | library(stringr) 7 | library(ggplot2) 8 | library("ggdendro") 9 | library(plyr) 10 | library(reshape2) 11 | library("grid") 12 | 13 | #read in BLAST alignments and get positions 14 | read_pairwise_position <- function(blast_path_gz, gtf, outfile, write_table = TRUE){ 15 | all_comp <- read.table(gzfile(blast_path_gz), sep = "\t", header = TRUE, stringsAsFactors = FALSE) 16 | 17 | head(all_comp) 18 | 19 | #get the refseq position of the genes 20 | gtf$transcript_id <- str_split_fixed(gtf$V9, ";", 2)[,1] 21 | gtf$transcript_id <- gsub("transcript_id ", "", gtf$transcript_id) 22 | 23 | gene_only <- gtf[gtf$V3 == "gene",] 24 | 25 | gene_positions <- gene_only[,c("transcript_id", "V1", "V4", "V5", "V7")] 26 | colnames(gene_positions) <- c("transcript", "chr", "start", "end", "strand") 27 | gene_positions$transcript <- gsub("ID=", "", gene_positions$transcript) 28 | 29 | #now add positions to the pairwise comparison file 30 | 31 | all_comp_positions <- merge(all_comp, gene_positions, all.x = TRUE, all.y = FALSE) 32 | head(all_comp_positions) 33 | 34 | if(write_table == TRUE){ 35 | write.table(all_comp_positions, file = outfile, sep = "\t", col.names = TRUE, row.names = FALSE, quote = FALSE) 36 | } 37 | return(all_comp_positions) 38 | } 39 | 40 | #cap percentage identity at a certain value 41 | cap_data <- function(all_comp_positions, cap_value){ 42 | all_comp_positions$capped_ident <- all_comp_positions$pident 43 | all_comp_positions[all_comp_positions$capped_ident < cap_value, "capped_ident"] <- 0 44 | return(all_comp_positions) 45 | } 46 | 47 | #filter for the number of missing alignments 48 | filter_aln_no <- function(comp_data, missing_limit){ 49 | #total_number of alignments 50 | total_aln <- length(unique(comp_data$aln_type)) 51 | aln_limit <- total_aln - missing_limit 52 | 53 | transcript_counts<- data.frame(table(comp_data$transcript)) 54 | transcripts_to_keep <- transcript_counts[transcript_counts$Freq >= aln_limit, "Var1"] 55 | 56 | comp_data_filt <- subset(comp_data, transcript %in% transcripts_to_keep) 57 | 58 | nrow(comp_data) 59 | nrow(comp_data_filt) 60 | length(unique(comp_data$transcript)) 61 | length(unique(comp_data_filt$transcript)) 62 | 63 | return(comp_data_filt) 64 | } 65 | 66 | 67 | #this zooms in on an region and plots the 100% identity and orders alignments via heirachical clustering 68 | plot_zoom_100_clust <- function(data, start, end, outfile){ 69 | data <- data[order(data$start),] 70 | data_zoom <- data[(data$end > start) & (data$start < end),] 71 | #print(table(data_zoom$aln_type)) 72 | zoom_mat <- dcast(data_zoom[,c("aln_type", "transcript", "capped_ident")], aln_type ~ transcript) 73 | zoom.matrix <- as.matrix(zoom_mat[,c(2:(ncol(zoom_mat)-1))]) 74 | dim(zoom.matrix) 75 | rownames(zoom.matrix) <- zoom_mat$aln_type 76 | zoom.dendro <- as.dendrogram(hclust(d=dist(x = zoom.matrix))) 77 | dendro.plot <- ggdendrogram(data = zoom.dendro, rotate = TRUE) 78 | zoom.order <- order.dendrogram(zoom.dendro) 79 | data_zoom$aln_type <- factor(x = data_zoom$aln_type, levels = zoom_mat$aln_type[zoom.order], ordered = TRUE) 80 | plot <- ggplot(data_zoom , aes(x=factor(transcript),y=aln_type)) + 81 | geom_tile(aes(fill = data_zoom$capped_ident)) + 82 | scale_fill_distiller(limits=c(0,100), type='div', palette="YlGnBu", na.value = "gray94", direction = 1, name = "% ID") + 83 | theme(axis.text.x=element_text(angle=90, hjust=1)) 84 | ggsave(plot = plot, file =outfile, dpi = 600, height = 8, width = 18) 85 | return(plot) 86 | } 87 | 88 | #calculate sliding windows of percentage identity based on a certain number of consective transcripts (window_size) and dropping a specific numer with the lowest identity before calculating the mean % id 89 | calculate_pid_windows <- function(aln_data, window_size = 20, drop_no = 2){ 90 | aln_data <- aln_data[order(aln_data$start),] 91 | aln <- unique(aln_data$aln_type) 92 | mean_pidents <- data.frame(start = numeric(), 93 | end = numeric(), 94 | start_position = numeric(), 95 | end_position = numeric(), 96 | pident_mean = numeric(), 97 | aln_type = character(), 98 | start_transcript = character(), 99 | end_transcript = character()) 100 | 101 | for(j in seq(1, nrow(aln_data)-window_size)){ 102 | window_start = j 103 | start_position = aln_data[window_start, "start"] 104 | start_transcript = aln_data[window_start, "transcript"] 105 | window_end = (window_start + window_size) - 1 106 | end_position = aln_data[window_end, "end"] 107 | end_transcript = aln_data[window_end, "transcript"] 108 | pidents <- aln_data[c(window_start:window_end), "pident"] 109 | pidents <- pidents[order(pidents, decreasing = TRUE)] 110 | pidents_sub <- pidents[c(1:(length(pidents)-drop_no))] 111 | pident_mean <- mean(pidents_sub) 112 | aln_to_add = data.frame(start = window_start, 113 | end = window_end, 114 | start_position = start_position, 115 | end_position = end_position, 116 | pident_mean = pident_mean, 117 | aln_type = aln, 118 | start_transcript = start_transcript, 119 | end_transcript = end_transcript) 120 | mean_pidents <- rbind(mean_pidents, aln_to_add) 121 | } 122 | return(mean_pidents) 123 | } 124 | 125 | #assign haplotype blocks based on the sliding window mean of percentage identity 126 | assign_blocks <-function(mean_pidents){ 127 | mean_pidents_copy <- mean_pidents 128 | mean_pidents_copy$block_no <- NA 129 | block_no = 1 130 | new_block <- "no" 131 | 132 | for (i in seq(1, nrow(mean_pidents_copy))){ 133 | #print(i) 134 | if(mean_pidents_copy[i, "pident_mean"] < 100){ 135 | print("less than 100%") 136 | mean_pidents_copy[i, "block_no"] <- NA 137 | } else if (mean_pidents_copy[i, "pident_mean"] == 100){ 138 | if(new_block == "no"){ 139 | mean_pidents_copy[i, "block_no"] <- block_no 140 | } else if (new_block == "yes"){ 141 | if(mean_pidents_copy[i, "start_position"] < end_prev_block){ 142 | block_no <- block_no-1 143 | mean_pidents_copy[i, "block_no"] <- block_no 144 | } else { 145 | mean_pidents_copy[i, "block_no"] <- block_no 146 | } 147 | } 148 | if(i == nrow(mean_pidents_copy)){ 149 | print("end") 150 | } else if (mean_pidents_copy[i+1, "pident_mean"] == 100){ 151 | block_no <- block_no 152 | new_block <- "no" 153 | print("same block") 154 | } else if (mean_pidents_copy[i+1, "pident_mean"] < 100){ 155 | block_no <- block_no+1 156 | new_block <- "yes" 157 | print("new_block") 158 | end_prev_block <- mean_pidents_copy[i, "end_position"] 159 | } 160 | } 161 | } 162 | return(mean_pidents_copy) 163 | } 164 | 165 | #summarise adjacted haplotype blocks calculated from sliding window mean 166 | summarise_blocks <- function(mean_pidents_copy){ 167 | #now get the start and end of each block 168 | blocks <- unique(mean_pidents_copy$block_no) 169 | blocks <- blocks[complete.cases(blocks)] 170 | 171 | block_positions <- data.frame(block_no = numeric(), 172 | block_start = numeric(), 173 | block_end = numeric(), 174 | aln_type = character(), 175 | start_transcript = character(), 176 | end_transcript = character()) 177 | 178 | for(block in blocks){ 179 | block_data <- subset(mean_pidents_copy, block_no == block) 180 | block_start <- min(block_data$start_position) 181 | start_transcript <- unique(block_data[block_data$start_position == block_start, "start_transcript"]) 182 | block_end <- max(block_data$end_position) 183 | end_transcript <- unique(block_data[block_data$end_position == block_end, "end_transcript"]) 184 | aln_type <- unique(block_data$aln_type) 185 | to_add <- data.frame(block_no = block, 186 | block_start = block_start, 187 | block_end = block_end, 188 | aln_type = aln_type, 189 | start_transcript = start_transcript, 190 | end_transcript = end_transcript) 191 | 192 | block_positions <- rbind(block_positions, to_add) 193 | } 194 | return(block_positions) 195 | } 196 | 197 | calculate_blocks_single_window <- function (data, gene_no_window, drop_no, window, out_dir, chrom){ 198 | 199 | overall_blocks <- data.frame(block_no = numeric(), 200 | block_start = numeric(), 201 | block_end = numeric(), 202 | aln_type = character(), 203 | start_transcript = character(), 204 | end_transcript = character(), 205 | window = character()) 206 | 207 | alns <- unique(data$aln_type) 208 | for(aln in alns){ 209 | print(aln) 210 | aln_data <- data[data$aln_type == aln,] 211 | #block_info <- calculate_hap_blocks(aln_data = aln_data) 212 | 213 | aln_mean_pidents <- calculate_pid_windows(aln_data = aln_data, window_size = gene_no_window, drop_no = drop_no) 214 | aln_mean_pidents <- aln_mean_pidents[complete.cases(aln_mean_pidents),] 215 | if(max(aln_mean_pidents$pident_mean) < 100){ 216 | block_info <- data.frame(block_no = NA, 217 | block_start = NA, 218 | block_end = NA, 219 | aln_type = aln, 220 | start_transcript = NA, 221 | end_transcript = NA, 222 | window = window) 223 | } else { 224 | aln_mean_pidents_copy <- assign_blocks(mean_pidents = aln_mean_pidents) 225 | block_info <- summarise_blocks(mean_pidents_copy = aln_mean_pidents_copy) 226 | block_info$window <- window 227 | } 228 | overall_blocks <- rbind(overall_blocks, block_info) 229 | } 230 | 231 | #outfile <- paste0(out_dir, "haplotype_blocks_aln_", gene_no_window, "_gene_window_drop", drop_no, ".tsv") 232 | #write.table(overall_blocks, file = outfile, sep = "\t", row.names = FALSE, quote = FALSE) 233 | return(overall_blocks) 234 | } 235 | 236 | ## END OF FUNCTIONS 237 | 238 | raw_data_path <- "W:/ramirezr/SM1/pairwise_blast_nov_2019/varieties_all_identities_2000bp.tab.gz" 239 | out_dir <- "X:/brintonj/haplotype/whole_genome_blast/" 240 | plot_dir <- paste0(out_dir, "plots/") 241 | blocks_dir <- paste0(out_dir, "blocks/") 242 | 243 | dir.create(plot_dir) 244 | dir.create(blocks_dir) 245 | 246 | #same gtf for all (using RefSeq order for now) 247 | HC_gtf <- read.table("W://WGAv1.0//annotation//IWGSC_v1.1_HC_20170706.gff3", sep = "\t", header = FALSE, stringsAsFactors = FALSE) 248 | LC_gtf <- read.table("W://WGAv1.0//annotation//IWGSC_v1.1_LC_20170706.gff3", sep = "\t", header = FALSE, stringsAsFactors = FALSE) 249 | ALL_gtf <- rbind(HC_gtf, LC_gtf) 250 | 251 | raw_data <- read_pairwise_position(blast_path_gz = raw_data_path, 252 | gtf = ALL_gtf, 253 | write_table = FALSE) 254 | 255 | #make a capped column in the all positions data_frame 256 | all_comp_positions <- cap_data(all_comp_positions = raw_data, cap_value = 100) 257 | 258 | #filter for Ns 259 | all_comp_positions_noN <- all_comp_positions[all_comp_positions$Ns_total == 0,] 260 | 261 | #plot how many Ns we lose for each chromosome 262 | gene_counts_all <- data.frame(table(all_comp_positions[,c("aln_type", "chr")])) 263 | gene_counts_all$filter <- "all" 264 | gene_counts_noN <- data.frame(table(all_comp_positions_noN[,c("aln_type", "chr")])) 265 | gene_counts_noN$filter <- "noN" 266 | 267 | gene_counts_filter <- rbind(gene_counts_all, gene_counts_noN) 268 | 269 | ggplot(gene_counts_filter, aes(x = chr, y = Freq, fill = filter)) + 270 | geom_boxplot() + 271 | theme(axis.text.x = element_text(angle = 90)) + 272 | ylab("number of genes") 273 | 274 | ggsave(file = paste0(out_dir, "gene_counts_pairwise_blast_whole_genome_all_v_noN.png"), height = 5, width = 6, dpi = 300) 275 | 276 | chromosomes <- unique(all_comp_positions_noN$chr) 277 | #get rid of chrUn - we need physical order 278 | chromosomes <- chromosomes[!(chromosomes == "chrUn")] 279 | 280 | window <- unique(all_comp_positions_noN$Flanking) 281 | 282 | #use a sliding window size of 25 consecutive transcripts 283 | gene_no_window <- 25 284 | 285 | for (chr in chromosomes){ 286 | chr_data <- all_comp_positions_noN[all_comp_positions_noN$chr == chr,] 287 | chrom_plot_dir <- paste0(plot_dir, chr, "/") 288 | dir.create(chrom_plot_dir, recursive = TRUE) 289 | 290 | chrom_blocks_dir <- paste0(blocks_dir, chr, "/") 291 | dir.create(chrom_blocks_dir, recursive = TRUE) 292 | 293 | varieties <- unique(chr_data$var_query) 294 | 295 | for (i in seq(1, length(varieties))){ 296 | variety <- varieties[i] 297 | 298 | variety_data <- chr_data[grep(variety, chr_data$aln_type),] 299 | 300 | #now filter for those in all comparisons 301 | variety_data_all <- filter_aln_no(comp_data = variety_data, missing_limit = 0) 302 | 303 | #exclude those transcripts missing more than two comparisons 304 | variety_data_2 <- filter_aln_no(comp_data = variety_data, missing_limit = 2) 305 | 306 | #plot heatmap with all data 307 | all_plot <- plot_zoom_100_clust(variety_data, 308 | start = 0, 309 | end = ceiling(max(variety_data$end)), 310 | outfile = paste(chrom_plot_dir, variety, "_", window, "_noN_100cap_clustered.png", sep = "")) 311 | 312 | #plot heatmap with no missing data 313 | all_plot <- plot_zoom_100_clust(variety_data_all, 314 | start = 0, 315 | end = ceiling(max(variety_data$end)), 316 | outfile = paste(chrom_plot_dir, variety, "_", window, "_0missing_noN_100cap_clustered.png", sep = "")) 317 | 318 | #plot heatmap with up to 2 missing data points 319 | all_plot <- plot_zoom_100_clust(variety_data_2, 320 | start = 0, 321 | end = ceiling(max(variety_data$end)), 322 | outfile = paste(chrom_plot_dir, variety, "_", window, "_max2missing_noN_100cap_clustered.png", sep = "")) 323 | } 324 | 325 | #now do the block calculation 326 | chrom_blocks <- calculate_blocks_single_window(data = chr_data, 327 | gene_no_window = gene_no_window, 328 | drop_no = ceiling(gene_no_window*0.1), 329 | window = window, 330 | out_dir = chrom_blocks_dir, 331 | chrom = chr) 332 | 333 | write.table(chrom_blocks, file = paste0(chrom_blocks_dir, "haplotype_blocks_BLAST_", gene_no_window, "_gene_window_", window, "bp.txt"), 334 | sep = "\t", 335 | col.names = TRUE, row.names = FALSE, quote = FALSE) 336 | 337 | 338 | } 339 | -------------------------------------------------------------------------------- /haplotype-block-assignment/assign_mummer_blocks_whole_genome.r: -------------------------------------------------------------------------------- 1 | ## Script written by Jemima Brinton 2019 2 | ## Aim - define haplotype blocks across whole wheat genome based on pairwise nucmer alignments between genome assemblies 3 | 4 | ###FUNCTIONS 5 | library(dplyr) 6 | library(magrittr) 7 | library(GenomicRanges) 8 | library(ggplot2) 9 | library(tidyr) 10 | library(viridis) 11 | library(stringr) 12 | 13 | 14 | #Read Delta functions and plotting functions (adapted) used are from https://jmonlong.github.io/Hippocamplus/2017/09/19/mummerplots-with-ggplot2/ 15 | #Reads in delta file output from nucmer into R and summarises into dataframe 16 | readDelta <- function(deltafile){ 17 | lines = scan(deltafile, 'a', sep='\n', quiet=TRUE) 18 | lines = lines[-1] 19 | lines.l = strsplit(lines, ' ') 20 | lines.len = lapply(lines.l, length) %>% as.numeric 21 | lines.l = lines.l[lines.len != 1] 22 | lines.len = lines.len[lines.len != 1] 23 | head.pos = which(lines.len == 4) 24 | head.id = rep(head.pos, c(head.pos[-1], length(lines.l)+1)-head.pos) 25 | mat = matrix(as.numeric(unlist(lines.l[lines.len==7])), 7) 26 | res = as.data.frame(t(mat[1:5,])) 27 | colnames(res) = c('rs','re','qs','qe','error') 28 | res$qid = unlist(lapply(lines.l[head.id[lines.len==7]], '[', 2)) 29 | res$rid = unlist(lapply(lines.l[head.id[lines.len==7]], '[', 1)) %>% gsub('^>', '', .) 30 | res$strand = ifelse(res$qe-res$qs > 0, '+', '-') 31 | res 32 | } 33 | 34 | #calculates percentage identity and mid point for each mummer alignment 35 | calculate_perc_id_mid_points <- function(data){ 36 | data$r_length <- (data$re - data$rs) 37 | data$perc_id <- ((data$r_length - data$error)/data$r_length)*100 38 | data$perc_id_factor <- data$perc_id 39 | data[data$perc_id < 100, "perc_id_factor"] <- "<100" 40 | data$perc_id_factor <- as.factor(data$perc_id_factor) 41 | data$r_mid <- (data$rs + data$re)/2 42 | data$q_mid <- (data$qs + data$qe)/2 43 | return(data) 44 | } 45 | 46 | #reads in delta file, calculates percentage ID and filters for a minimum alignment size 47 | pre_plot_analysis <- function(delta_path, min_size = 20000){ 48 | data = readDelta(delta_path) 49 | data <- calculate_perc_id_mid_points(data) 50 | data_filt <- data[data$r_length >= min_size,] 51 | return(data_filt) 52 | } 53 | 54 | # Plots diagonal scatter plot of a nucmer pairwise alignment with points coloured by percentage identity of alignment - x axis is reference, y axis is query 55 | plot_by_perc_id_cap <- function(data, xmin = 0, xmax = max(data$re), cap_lower, cap_upper){ 56 | ggplot(data[data$rs > xmin & data$re < xmax,], aes(x=rs, xend=re, y=qs, yend=qe, colour=perc_id, shape = strand)) + geom_segment() + 57 | geom_point(alpha=.5) + 58 | theme_bw() + 59 | theme(strip.text.y=element_text(angle=180, size=5), 60 | strip.background=element_blank()) + 61 | xlab(unique(data$rid)) + ylab(unique(data$qid)) + 62 | scale_colour_viridis(limits = c(cap_lower, cap_upper)) 63 | } 64 | 65 | #scatter plot against reference position, y axis is percentage identity colour is size of alignment 66 | plot_perc_id_v_ref <- function(data, xmin = 0, xmax = max(data$re), ymin = 97, ymax = 100){ 67 | ggplot(data[data$r_mid > xmin & data$r_mid < xmax,], aes(x=r_mid, y=perc_id, colour=r_length)) + 68 | theme_bw() + xlab(data$rid) + ylab(paste0('percentage ID v ', data$qid)) + 69 | geom_point(alpha=.5) + 70 | ylim(ymin, ymax) + 71 | scale_colour_viridis() 72 | } 73 | 74 | #bins alignment dataframe by reference position 75 | bin_data <- function(data, bin_size, max_chrom_size){ 76 | bins <- seq(0, max_chrom_size, by = bin_size) 77 | data$bin <- NA 78 | for (i in bins){ 79 | data$bin <- ifelse(((data$r_mid > (i-bin_size)) & (data$r_mid < (i-1))), i, data$bin) 80 | } 81 | return(data) 82 | } 83 | 84 | #plots median percentage identity of each bin against reference position, colour of point is based on if the median percentage identity passes the identity threshold 85 | plot_medians_line_colour_cut_off <- function(data, bin_size = 10000000, cut_off = 99.99, ymin = 97, ymax = 100, max_chrom_size){ 86 | comparison_filt_bin <- bin_data(data, bin_size = bin_size, max_chrom_size = max_chrom_size) 87 | comparison_medians <- aggregate(perc_id ~ bin, data = comparison_filt_bin, FUN=median) 88 | colnames(comparison_medians)[2] <- "perc_id_median" 89 | comparison_medians$cut_off <- NA 90 | comparison_medians$cut_off <- ifelse(comparison_medians$perc_id >= cut_off, paste0(">=",cut_off), paste0("<",cut_off)) 91 | comparison_medians$cut_off <- as.factor(comparison_medians$cut_off) 92 | 93 | ggplot(comparison_medians, aes(x=bin, y = perc_id_median, colour = cut_off)) + 94 | geom_line(colour = "grey") + 95 | geom_point(size = 1) + 96 | ylim(ymin,ymax) + 97 | scale_colour_manual(values = c("#73D055FF", "#440154FF")) + 98 | labs(colour = "% id") + 99 | #scale_colour_viridis() + 100 | xlab(data$rid) + 101 | 102 | ggtitle(paste0(data$rid, " v ", data$qid, " BinSize: ", bin_size, " CutOff: ", cut_off)) 103 | } 104 | 105 | #plots boxplots of percentage identity of each bin against reference position, colour of boxplot is based on if the median percentage identity passes the identity threshold 106 | plot_boxplots_median_colour_cut_off <- function(data, bin_size = 10000000, cut_off = 99.99, ymin = 97, ymax = 100, max_chrom_size){ 107 | comparison_filt_bin <- bin_data(data, bin_size = bin_size, max_chrom_size = max_chrom_size) 108 | comparison_medians <- aggregate(perc_id ~ bin, data = comparison_filt_bin, FUN=median) 109 | colnames(comparison_medians)[2] <- "perc_id_median" 110 | comparison_medians$cut_off <- NA 111 | comparison_medians$cut_off <- ifelse(comparison_medians$perc_id >= cut_off, paste0(">=",cut_off), paste0("<",cut_off)) 112 | comparison_medians$cut_off <- as.factor(comparison_medians$cut_off) 113 | 114 | comparison_to_plot <- merge(comparison_filt_bin, comparison_medians) 115 | 116 | ggplot(comparison_to_plot, aes(x=bin, y = perc_id, group = bin, fill = cut_off)) + 117 | geom_boxplot(outlier.shape = NA) + 118 | ylim(ymin,ymax) + 119 | scale_fill_manual(values = c("#73D055FF", "#440154FF")) + 120 | labs(fill = "% id") + 121 | #scale_colour_viridis() + 122 | xlab(data$rid) + 123 | ggtitle(paste0(data$rid, " v ", data$qid, " BinSize: ", bin_size, " CutOff: ", cut_off)) 124 | } 125 | 126 | #assigns haplotype blocks based on bins of alignments exceeding the 99.99 % id threshold 127 | assign_blocks_mummer <-function(median_cutoffs){ 128 | median_cutoffs_copy <- median_cutoffs 129 | median_cutoffs_copy$block_no <- NA 130 | block_no = 1 131 | 132 | for (i in seq(1, nrow(median_cutoffs_copy))){ 133 | print(i) 134 | if(median_cutoffs_copy[i, "perc_id_median"] < 99.99){ 135 | median_cutoffs_copy[i, "block_no"] <- NA 136 | } else if (median_cutoffs_copy[i, "perc_id_median"] >= 99.99){ 137 | median_cutoffs_copy[i, "block_no"] <- block_no 138 | if (i > (nrow(median_cutoffs_copy)-3)){ 139 | print("coming to end") 140 | } else if ((median_cutoffs_copy[i+1, "perc_id_median"] < 99.99) & (median_cutoffs_copy[i+2, "perc_id_median"] < 99.99) & (median_cutoffs_copy[i+3, "perc_id_median"] < 99.99)){ 141 | block_no <- block_no + 1 142 | } 143 | } 144 | } 145 | return(median_cutoffs_copy) 146 | } 147 | 148 | #summarises adjacent bins into haplotype blocks 149 | summarise_blocks <- function(median_cutoffs_copy, bin_size){ 150 | #now get the start and end of each block 151 | blocks <- unique(median_cutoffs_copy$block_no) 152 | blocks <- blocks[complete.cases(blocks)] 153 | 154 | block_positions <- data.frame(block_no = numeric(), block_start = numeric(), block_end = numeric(), ref = character(), query = character()) 155 | 156 | for(block in blocks){ 157 | block_data <- subset(median_cutoffs_copy, block_no == block) 158 | block_start <- min(block_data$bin) - bin_size 159 | block_end <- max(block_data$bin) 160 | ref <- unique(block_data$ref) 161 | query <- unique(block_data$query) 162 | to_add <- data.frame(block_no = block, block_start = block_start, block_end = block_end, ref = ref, query = query) 163 | block_positions <- rbind(block_positions, to_add) 164 | } 165 | return(block_positions) 166 | } 167 | 168 | #checks if haplotype blocks are also called in the reciprocal mummer alignment 169 | check_reciprocal <- function(data, all_ref_query_coords, chrom){ 170 | ref <- data["ref"] 171 | query <- data["query"] 172 | 173 | ref_vars <- as.character(unique(all_ref_query_coords$ref)) 174 | 175 | if(query %in% ref_vars){ 176 | recip_table <- all_ref_query_coords[(all_ref_query_coords$ref == query) & (all_ref_query_coords$query == ref), ] 177 | if (nrow(recip_table) == 0){ 178 | in_reciprocal <- "N" 179 | } else { 180 | 181 | ref_gdf <- data.frame(chr=character(), start=numeric(), end=numeric()) 182 | to_add <- c(data["chrom"], data["block_start"], data["block_end"]) 183 | ref_gdf <- rbind(ref_gdf, to_add) 184 | colnames(ref_gdf) <- c("chr", "start", "end") 185 | 186 | 187 | query_gdf <- recip_table[, c("chrom", "block_start", "block_end")] 188 | colnames(query_gdf) <- c("chr", "start", "end") 189 | 190 | # make genomic ranges objects for each 191 | ref_gr <- makeGRangesFromDataFrame(ref_gdf) 192 | query_gr <- makeGRangesFromDataFrame(query_gdf) 193 | 194 | overlaps <- findOverlaps(ref_gr, query_gr) 195 | #precision first - how many BLAST blocks found in the mummer haplotype? 196 | count_overlaps <- length(unique(subjectHits(overlaps))) 197 | if (count_overlaps > 0){ 198 | in_reciprocal <- "Y" 199 | 200 | } else { 201 | in_reciprocal <- "N" 202 | } 203 | } 204 | } else { 205 | in_reciprocal <- NA 206 | } 207 | return(in_reciprocal) 208 | } 209 | 210 | ##END OF FUNCTIONS 211 | 212 | ## plots the median cut offs and call the blocks for each chromosome 213 | base_dir <- "X:/brintonj/haplotype/whole_genome_mummer/" 214 | 215 | plot_base <- paste0(base_dir, "plots") 216 | blocks_base <- paste0(base_dir, "blocks") 217 | 218 | dir.create(plot_base) 219 | dir.create(blocks_base) 220 | 221 | #read in the fasta indexes so we can set the max chromosome length 222 | chrom_lengths <- read.table("W:/assemblies/releasePGSBv2.0/genome/combined_fai.txt", sep = "\t", header = FALSE, stringsAsFactors = FALSE) 223 | #remove spelta and synthetic from this table as we are not including them 224 | chrom_lengths <- chrom_lengths[!grepl("ash", chrom_lengths[,1]),] 225 | chrom_lengths <- chrom_lengths[!grepl("tsp", chrom_lengths[,1]),] 226 | 227 | #get a list of all the varieties to plot as references (chromosome level as references only) 228 | varieties_to_plot <- list.files(paste0(base_dir, "aln")) 229 | chromosomes_to_plot <- list.files(paste0(base_dir, "aln/", varieties_to_plot[1])) 230 | 231 | #we will only use the 20kb filter here (this is already filtered using mummer) 232 | min_size <- 20000 233 | cut_off <- 99.99 234 | bin_size <- 5000000 235 | 236 | for (i in seq(1, length(chromosomes_to_plot))){ 237 | chrom <- paste0("chr", chromosomes_to_plot[i]) 238 | print(chrom) 239 | max_chrom_size <- max(chrom_lengths[grepl(chrom, chrom_lengths[,1]),2]) 240 | 241 | blocks_chrom_dir <- paste0(blocks_base, "/", chromosomes_to_plot[i]) 242 | plots_chrom_dir <- paste0(plot_base, "/", chromosomes_to_plot[i]) 243 | 244 | dir.create(blocks_chrom_dir) 245 | dir.create(plots_chrom_dir) 246 | 247 | chrom_blocks_coords <- data.frame(block_no = numeric(), 248 | block_start = numeric(), 249 | block_end = numeric(), 250 | ref = character(), 251 | query = character(), 252 | chrom = character(), 253 | ref_chrom = character()) 254 | 255 | 256 | for (variety in varieties_to_plot){ 257 | print(variety) 258 | data_dir <- paste0(base_dir, "aln/", variety, "/", chromosomes_to_plot[i], "/") 259 | plot_dir <- paste0(plots_chrom_dir, "/", variety, "/") 260 | dir.create(plot_dir) 261 | 262 | all_files <- list.files(data_dir) 263 | #This gets just the filtered deltas 264 | filtered_delta <- all_files[grep("_L20Kb_rq.delta", all_files)] 265 | 266 | for (comparison in filtered_delta){ 267 | comparison_delta_path <- paste0(data_dir, "/", comparison) 268 | comparison_id <- str_split_fixed(comparison, "\\.", 3)[1] 269 | 270 | ref <- variety 271 | if (ref == "sy_mattis"){ 272 | query <- str_split_fixed(comparison_id, "_", 4)[4] 273 | } else { 274 | query <- str_split_fixed(comparison_id, "_", 3)[3] 275 | } 276 | 277 | if (query == "sy"){ 278 | query <- "sy_mattis" 279 | } 280 | 281 | if(file.info(comparison_delta_path)$size == 0){ 282 | print("file empty") 283 | } else { 284 | #read and filter for the minimum size 285 | comparison_filt <- pre_plot_analysis(delta_path = comparison_delta_path, min_size = min_size) 286 | print("data read in and filtered") 287 | 288 | #plot the diagonal dot plots 289 | diagonal_dot_plot <- plot_by_perc_id_cap(comparison_filt, cap_lower = 97, cap_upper = 100) 290 | ggsave(diagonal_dot_plot, file = paste0(plot_dir, comparison_id, ".", chrom, ".diagonal_dot.min", min_size, ".png"), dpi = 300, height = 5, width = 9) 291 | 292 | #plot the percentage dotplot capped at 97% 293 | perc_dot_plot_97 <- plot_perc_id_v_ref(comparison_filt, ymin = 97) 294 | ggsave(perc_dot_plot_97, file = paste0(plot_dir, comparison_id, ".", chrom, ".percentage_dot.min", min_size, "_cap97.png"), dpi = 300, height = 5, width = 9) 295 | 296 | #now plot the median cut offs 297 | perc_line_5M <- plot_medians_line_colour_cut_off(comparison_filt, bin_size = bin_size, max_chrom_size = max_chrom_size) 298 | ggsave(perc_line_5M, file = paste0(plot_dir, comparison_id, ".", chrom, ".percentage_line.min", min_size, "_5Mb_bin_99.99median.png"), dpi = 300, height = 5, width = 9) 299 | 300 | perc_boxplot_5M <- plot_boxplots_median_colour_cut_off(comparison_filt, bin_size = bin_size, max_chrom_size = max_chrom_size) 301 | ggsave(perc_boxplot_5M, file = paste0(plot_dir, comparison_id, ".", chrom, ".percentage_boxplot.min", min_size, "_5Mb_bin_99.99median.png"), dpi = 300, height = 5, width = 9) 302 | 303 | #also output table with the medians of the bins 304 | 305 | comparison_filt_bin <- bin_data(comparison_filt, bin_size = bin_size, max_chrom_size = max_chrom_size) 306 | comparison_medians <- aggregate(perc_id ~ bin, data = comparison_filt_bin, FUN=median) 307 | colnames(comparison_medians)[2] <- "perc_id_median" 308 | comparison_medians$cut_off <- NA 309 | comparison_medians$cut_off <- ifelse(comparison_medians$perc_id >= cut_off, paste0(">=",cut_off), paste0("<",cut_off)) 310 | comparison_medians$cut_off <- as.factor(comparison_medians$cut_off) 311 | 312 | comparison_medians$ref <- ref 313 | comparison_medians$query <- query 314 | 315 | write.table(comparison_medians, file = paste0(blocks_chrom_dir, "/", comparison_id, ".", chrom, ".percentage_medians.min", min_size, "_5Mb_bin.txt"), col.names = TRUE, row.names = FALSE, quote = FALSE, sep = "\t") 316 | 317 | #now assign the blocks 318 | ref_query_blocks <- assign_blocks_mummer(comparison_medians) 319 | ref_query_coords <- summarise_blocks(ref_query_blocks, bin_size = bin_size) 320 | if (nrow(ref_query_coords) > 0){ 321 | ref_query_coords$chrom <- chrom 322 | ref_query_coords$ref_chrom <- unique(comparison_filt$rid) 323 | 324 | chrom_blocks_coords <- rbind(chrom_blocks_coords, ref_query_coords) 325 | } 326 | } 327 | } 328 | } 329 | #now check if blocks are in reciprocals 330 | chrom_blocks_coords$in_reciprocal <- apply(chrom_blocks_coords, 1, check_reciprocal, all_ref_query_coords = chrom_blocks_coords) 331 | 332 | write.table(chrom_blocks_coords, 333 | file = paste0(blocks_chrom_dir, "/mummer_blocks_", chrom, ".min", min_size, ".", bin_size, "_bins.txt"), 334 | sep = "\t", 335 | col.names = TRUE, 336 | row.names = FALSE, 337 | quote = FALSE) 338 | } 339 | 340 | 341 | 342 | -------------------------------------------------------------------------------- /sequence_complexity/calculate_seq_complexity_stats.r: -------------------------------------------------------------------------------- 1 | library(dplyr) 2 | library(ggplot2) 3 | ##calculate for 6A 4 | base_dir <- "X:/brintonj/haplotype/whole_genome_mummer/surrounding_seq/" 5 | 6 | seq_files <- list.files(base_dir, pattern = "tsv") 7 | 8 | length(seq_files) 9 | sample_no <- 10000 10 | hap_blocks <- data.frame(matrix( nrow = length(seq_files)*sample_no, ncol = 6)) 11 | colnames(hap_blocks) <- c("P1", "REF_SEQ", "REF", "QUERY", "hap_block", "REF_CPLX") 12 | not_block <- data.frame(matrix(nrow = length(seq_files)*sample_no, ncol = 6)) 13 | colnames(not_block) <- c("P1", "REF_SEQ", "REF", "QUERY", "hap_block", "REF_CPLX") 14 | 15 | stats <- data.frame(matrix(nrow = length(seq_files), ncol = 13)) 16 | colnames(stats) <- c("FILE", 17 | "N_HAP", 18 | "N_NOT", 19 | "MEDIAN_HAP", 20 | "MEDIAN_NOT", 21 | "MEAN_HAP", 22 | "MEAN_NOT", 23 | "P_VALUE", 24 | "MEDIAN_HAP_SUB", 25 | "MEDIAN_NOT_SUB", 26 | "MEAN_HAP_SUB", 27 | "MEAN_NOT_SUB", 28 | "P_VALUE_SUB") 29 | 30 | row_start <- 1 31 | 32 | for (i in seq(1, length(seq_files))){ 33 | file <- seq_files[i] 34 | name <- gsub("_snps_ref_complexity.tsv", "", file) 35 | seqs <- read.table(file = paste0(base_dir, file), 36 | sep = "\t", 37 | header = TRUE, 38 | stringsAsFactors = FALSE) 39 | 40 | seqs$hap_block <- factor(seqs$hap_block, levels = c("Y", "N")) 41 | 42 | distr <- ggplot(seqs, aes(x = REF_CPLX, group = hap_block, fill = hap_block)) + 43 | geom_density(alpha = 0.5) + 44 | ggtitle(paste0(file, "\nN_hap: ", nrow(seqs[seqs$hap_block == "Y",]), "; N_not: ", nrow(seqs[seqs$hap_block == "N",]) )) 45 | 46 | 47 | pdf(paste0("X:/brintonj/haplotype/whole_genome_mummer/surrounding_seq/combined/", name, "_complexity_distribution.pdf"), height = 3, width = 3.5) 48 | print(distr) 49 | dev.off() 50 | 51 | head(seqs) 52 | nrow(seqs) 53 | counts <- data.frame(table(seqs$hap_block)) 54 | 55 | medians <- data.frame(aggregate(REF_CPLX ~ hap_block, data = seqs, FUN = median)) 56 | means <- data.frame(aggregate(REF_CPLX ~ hap_block, data = seqs, FUN = mean)) 57 | 58 | stats[i,"FILE"] <- file 59 | stats[i,"N_HAP"] <- counts[counts$Var1 == "Y", "Freq"] 60 | stats[i,"N_NOT"] <- counts[counts$Var1 == "N", "Freq"] 61 | 62 | if (nrow(medians[medians$hap_block == "Y",]) > 0){ 63 | stats[i,"MEDIAN_HAP"] <- medians[medians$hap_block == "Y","REF_CPLX"] 64 | stats[i,"MEAN_HAP"] <- means[means$hap_block == "Y","REF_CPLX"] 65 | } 66 | if (nrow(medians[medians$hap_block == "N",]) > 0){ 67 | stats[i,"MEDIAN_NOT"] <- medians[medians$hap_block == "N","REF_CPLX"] 68 | stats[i,"MEAN_NOT"] <- means[means$hap_block == "N","REF_CPLX"] 69 | } 70 | 71 | if ((nrow(medians[medians$hap_block == "N",]) > 0) & (nrow(medians[medians$hap_block == "Y",]) > 0)){ 72 | stats[i,"P_VALUE"] <- (pairwise.wilcox.test(seqs$REF_CPLX,seqs$hap_block, p.adjust.method = "BH"))$p.value 73 | } 74 | 75 | row_end <- sample_no*i 76 | if((nrow(seqs[seqs$hap_block == "Y",]) < sample_no) | (nrow(seqs[seqs$hap_block == "N",]) < sample_no)){ 77 | print("not enough samples") 78 | hap_blocks[c(row_start:row_end),"REF"] <- file 79 | not_block[c(row_start:row_end),"REF"] <- file 80 | } else { 81 | seqs$hap_block <- as.character(seqs$hap_block) 82 | hap_samples <- sample_n(seqs[seqs$hap_block == "Y",], sample_no) 83 | hap_blocks[c(row_start:row_end),] <- hap_samples 84 | not_samples <- sample_n(seqs[seqs$hap_block == "N",], sample_no) 85 | not_block[c(row_start:row_end),] <- not_samples 86 | 87 | #also want to calculate the subsampled stats per alignment 88 | subsamples <- rbind(hap_samples, not_samples) 89 | 90 | medians_sub <- data.frame(aggregate(REF_CPLX ~ hap_block, data = subsamples, FUN = median)) 91 | means_sub <- data.frame(aggregate(REF_CPLX ~ hap_block, data = subsamples, FUN = mean)) 92 | 93 | stats[i,"MEDIAN_HAP_SUB"] <- medians_sub[medians_sub$hap_block == "Y","REF_CPLX"] 94 | stats[i,"MEAN_HAP_SUB"] <- means_sub[means_sub$hap_block == "Y","REF_CPLX"] 95 | 96 | stats[i,"MEDIAN_NOT_SUB"] <- medians_sub[medians_sub$hap_block == "N","REF_CPLX"] 97 | stats[i,"MEAN_NOT_SUB"] <- means_sub[means_sub$hap_block == "N","REF_CPLX"] 98 | 99 | stats[i,"P_VALUE_SUB"] <- (pairwise.wilcox.test(subsamples$REF_CPLX, subsamples$hap_block, p.adjust.method = "BH"))$p.value 100 | } 101 | row_start <- row_end + 1 102 | } 103 | 104 | write.table(stats, file = paste0("X:/brintonj/haplotype/whole_genome_mummer/surrounding_seq/combined/stats_seq_complexity_per_chrom", sample_no, "random.txt"), 105 | sep = "\t", 106 | col.names = TRUE, 107 | row.names = FALSE, 108 | quote = FALSE) 109 | 110 | head(hap_blocks) 111 | head(not_block) 112 | 113 | haps_noNA <- hap_blocks[complete.cases(hap_blocks$P1),] 114 | haps_noNA$hap_block <- "Y" 115 | not_noNA <- not_block[complete.cases(not_block$P1),] 116 | not_noNA$hap_block <- "N" 117 | 118 | combined <- rbind(haps_noNA, not_noNA) 119 | sink(paste0("X:/brintonj/haplotype/whole_genome_mummer/surrounding_seq/combined/stats_seq_overall_", sample_no, "random.txt")) 120 | median(haps_noNA$REF_CPLX) 121 | median(not_noNA$REF_CPLX) 122 | 123 | pairwise.wilcox.test(combined$REF_CPLX, combined$hap_block, p.adjust.method = "BH") 124 | 125 | sink() 126 | 127 | ggplot(combined, aes(x = hap_block, y = REF_CPLX, group = hap_block)) + 128 | geom_boxplot() 129 | 130 | ggsave(paste0("X:/brintonj/haplotype/whole_genome_mummer/surrounding_seq/combined/stats_seq_overall_", sample_no, "random_boxplot.png")) 131 | 132 | #plot density distribution 133 | combined$hap_block <- factor(combined$hap_block, levels = c("Y", "N")) 134 | 135 | distr_dens <- ggplot(combined, aes(x = REF_CPLX, group = hap_block, fill = hap_block)) + 136 | geom_density(alpha = 0.5) + 137 | ggtitle(paste0("6A random ", sample_no, "\nN_hap: ", nrow(combined[combined$hap_block == "Y",]), "; N_not: ", nrow(combined[combined$hap_block == "N",]) )) 138 | 139 | pdf(paste0("X:/brintonj/haplotype/whole_genome_mummer/surrounding_seq/combined/stats_seq_overall_", sample_no, "random_density.pdf"), height = 3, width = 3.5) 140 | print(distr_dens) 141 | dev.off() 142 | 143 | ##now calculate for 2B 144 | base_dir <- "X:/brintonj/haplotype/whole_genome_mummer/surrounding_seq/2B/complexity/" 145 | out_dir <- paste0(base_dir, "stats/") 146 | dir.create(out_dir) 147 | chrom <- "2B" 148 | seq_files <- list.files(base_dir, pattern = "tsv") 149 | 150 | length(seq_files) 151 | sample_no <- 10000 152 | hap_blocks <- data.frame(matrix( nrow = length(seq_files)*sample_no, ncol = 6)) 153 | colnames(hap_blocks) <- c("P1", "REF_SEQ", "REF", "QUERY", "hap_block", "REF_CPLX") 154 | not_block <- data.frame(matrix(nrow = length(seq_files)*sample_no, ncol = 6)) 155 | colnames(not_block) <- c("P1", "REF_SEQ", "REF", "QUERY", "hap_block", "REF_CPLX") 156 | 157 | stats <- data.frame(matrix(nrow = length(seq_files), ncol = 13)) 158 | colnames(stats) <- c("FILE", 159 | "N_HAP", 160 | "N_NOT", 161 | "MEDIAN_HAP", 162 | "MEDIAN_NOT", 163 | "MEAN_HAP", 164 | "MEAN_NOT", 165 | "P_VALUE", 166 | "MEDIAN_HAP_SUB", 167 | "MEDIAN_NOT_SUB", 168 | "MEAN_HAP_SUB", 169 | "MEAN_NOT_SUB", 170 | "P_VALUE_SUB") 171 | 172 | row_start <- 1 173 | 174 | for (i in seq(1, length(seq_files))){ 175 | file <- seq_files[i] 176 | name <- gsub("_snps_ref_complexity.tsv", "", file) 177 | seqs <- read.table(file = paste0(base_dir, file), 178 | sep = "\t", 179 | header = TRUE, 180 | stringsAsFactors = FALSE) 181 | 182 | seqs$hap_block <- factor(seqs$hap_block, levels = c("Y", "N")) 183 | 184 | distr <- ggplot(seqs, aes(x = REF_CPLX, group = hap_block, fill = hap_block)) + 185 | geom_density(alpha = 0.5) + 186 | ggtitle(paste0(file, "\nN_hap: ", nrow(seqs[seqs$hap_block == "Y",]), "; N_not: ", nrow(seqs[seqs$hap_block == "N",]) )) 187 | 188 | pdf(paste0(out_dir, name, "_", chrom, "_complexity_distribution.pdf"), height = 3, width = 3.5) 189 | print(distr) 190 | dev.off() 191 | 192 | #ggsave(distr, file = paste0(out_dir, name, "_", chrom, "_complexity_distribution.png"), height = 5, width = 6) 193 | 194 | head(seqs) 195 | nrow(seqs) 196 | counts <- data.frame(table(seqs$hap_block)) 197 | 198 | medians <- data.frame(aggregate(REF_CPLX ~ hap_block, data = seqs, FUN = median)) 199 | means <- data.frame(aggregate(REF_CPLX ~ hap_block, data = seqs, FUN = mean)) 200 | 201 | stats[i,"FILE"] <- file 202 | stats[i,"N_HAP"] <- counts[counts$Var1 == "Y", "Freq"] 203 | stats[i,"N_NOT"] <- counts[counts$Var1 == "N", "Freq"] 204 | 205 | if (nrow(medians[medians$hap_block == "Y",]) > 0){ 206 | stats[i,"MEDIAN_HAP"] <- medians[medians$hap_block == "Y","REF_CPLX"] 207 | stats[i,"MEAN_HAP"] <- means[means$hap_block == "Y","REF_CPLX"] 208 | } 209 | if (nrow(medians[medians$hap_block == "N",]) > 0){ 210 | stats[i,"MEDIAN_NOT"] <- medians[medians$hap_block == "N","REF_CPLX"] 211 | stats[i,"MEAN_NOT"] <- means[means$hap_block == "N","REF_CPLX"] 212 | } 213 | 214 | if ((nrow(medians[medians$hap_block == "N",]) > 0) & (nrow(medians[medians$hap_block == "Y",]) > 0)){ 215 | stats[i,"P_VALUE"] <- (pairwise.wilcox.test(seqs$REF_CPLX,seqs$hap_block, p.adjust.method = "BH"))$p.value 216 | } 217 | 218 | row_end <- sample_no*i 219 | if((nrow(seqs[seqs$hap_block == "Y",]) < sample_no) | (nrow(seqs[seqs$hap_block == "N",]) < sample_no)){ 220 | print("not enough samples") 221 | hap_blocks[c(row_start:row_end),"REF"] <- file 222 | not_block[c(row_start:row_end),"REF"] <- file 223 | } else { 224 | seqs$hap_block <- as.character(seqs$hap_block) 225 | hap_samples <- sample_n(seqs[seqs$hap_block == "Y",], sample_no) 226 | hap_blocks[c(row_start:row_end),] <- hap_samples 227 | not_samples <- sample_n(seqs[seqs$hap_block == "N",], sample_no) 228 | not_block[c(row_start:row_end),] <- not_samples 229 | 230 | #also want to calculate the subsampled stats per alignment 231 | subsamples <- rbind(hap_samples, not_samples) 232 | 233 | medians_sub <- data.frame(aggregate(REF_CPLX ~ hap_block, data = subsamples, FUN = median)) 234 | means_sub <- data.frame(aggregate(REF_CPLX ~ hap_block, data = subsamples, FUN = mean)) 235 | 236 | stats[i,"MEDIAN_HAP_SUB"] <- medians_sub[medians_sub$hap_block == "Y","REF_CPLX"] 237 | stats[i,"MEAN_HAP_SUB"] <- means_sub[means_sub$hap_block == "Y","REF_CPLX"] 238 | 239 | stats[i,"MEDIAN_NOT_SUB"] <- medians_sub[medians_sub$hap_block == "N","REF_CPLX"] 240 | stats[i,"MEAN_NOT_SUB"] <- means_sub[means_sub$hap_block == "N","REF_CPLX"] 241 | 242 | stats[i,"P_VALUE_SUB"] <- (pairwise.wilcox.test(subsamples$REF_CPLX, subsamples$hap_block, p.adjust.method = "BH"))$p.value 243 | } 244 | row_start <- row_end + 1 245 | } 246 | 247 | write.table(stats, file = paste0(out_dir, "stats_seq_complexity_per_chrom_", chrom, sample_no, "random.txt"), 248 | sep = "\t", 249 | col.names = TRUE, 250 | row.names = FALSE, 251 | quote = FALSE) 252 | 253 | head(hap_blocks) 254 | head(not_block) 255 | 256 | haps_noNA <- hap_blocks[complete.cases(hap_blocks$P1),] 257 | haps_noNA$hap_block <- "Y" 258 | not_noNA <- not_block[complete.cases(not_block$P1),] 259 | not_noNA$hap_block <- "N" 260 | 261 | combined <- rbind(haps_noNA, not_noNA) 262 | sink(paste0(out_dir, "stats_seq_overall_", chrom, sample_no, "random.txt")) 263 | median(haps_noNA$REF_CPLX) 264 | median(not_noNA$REF_CPLX) 265 | 266 | pairwise.wilcox.test(combined$REF_CPLX, combined$hap_block, p.adjust.method = "BH") 267 | 268 | sink() 269 | 270 | ggplot(combined, aes(x = hap_block, y = REF_CPLX, group = hap_block)) + 271 | geom_boxplot() 272 | 273 | ggsave(paste0(out_dir, "stats_seq_overall_", chrom, sample_no, "random_boxplot.png")) 274 | combined$hap_block <- factor(combined$hap_block, levels = c("Y", "N")) 275 | 276 | distr_dens <- ggplot(combined, aes(x = REF_CPLX, group = hap_block, fill = hap_block)) + 277 | geom_density(alpha = 0.5) + 278 | ggtitle(paste0(file, "\nN_hap: ", nrow(combined[combined$hap_block == "Y",]), "; N_not: ", nrow(combined[combined$hap_block == "N",]) )) 279 | 280 | pdf(paste0(out_dir, "stats_seq_overall_", chrom, sample_no, "random_density.pdf"), height = 3, width = 3.5) 281 | print(distr_dens) 282 | dev.off() 283 | 284 | 285 | 286 | #Also want to remove any sequences that have any Ns in them from the calculations 287 | base_dir <- "X:/brintonj/haplotype/whole_genome_mummer/surrounding_seq/2B/complexity/" 288 | out_dir <- paste0(base_dir, "stats/noN_unique/") 289 | dir.create(out_dir) 290 | chrom <- "2B" 291 | seq_files <- list.files(base_dir, pattern = "tsv") 292 | 293 | length(seq_files) 294 | sample_no <- 10000 295 | hap_blocks <- data.frame(matrix( nrow = length(seq_files)*sample_no, ncol = 6)) 296 | colnames(hap_blocks) <- c("P1", "REF_SEQ", "REF", "QUERY", "hap_block", "REF_CPLX") 297 | not_block <- data.frame(matrix(nrow = length(seq_files)*sample_no, ncol = 6)) 298 | colnames(not_block) <- c("P1", "REF_SEQ", "REF", "QUERY", "hap_block", "REF_CPLX") 299 | 300 | stats <- data.frame(matrix(nrow = length(seq_files), ncol = 13)) 301 | colnames(stats) <- c("FILE", 302 | "N_HAP", 303 | "N_NOT", 304 | "MEDIAN_HAP", 305 | "MEDIAN_NOT", 306 | "MEAN_HAP", 307 | "MEAN_NOT", 308 | "P_VALUE", 309 | "MEDIAN_HAP_SUB", 310 | "MEDIAN_NOT_SUB", 311 | "MEAN_HAP_SUB", 312 | "MEAN_NOT_SUB", 313 | "P_VALUE_SUB") 314 | 315 | row_start <- 1 316 | 317 | for (i in seq(1, length(seq_files))){ 318 | file <- seq_files[i] 319 | name <- gsub("_snps_ref_complexity.tsv", "", file) 320 | seqs_raw <- read.table(file = paste0(base_dir, file), 321 | sep = "\t", 322 | header = TRUE, 323 | stringsAsFactors = FALSE) 324 | 325 | seqs_noN <- seqs_raw[grep("N", seqs_raw$REF_SEQ),] 326 | 327 | seqs <- unique(seqs_noN) 328 | 329 | seqs$hap_block <- factor(seqs$hap_block, levels = c("Y", "N")) 330 | 331 | distr <- ggplot(seqs, aes(x = REF_CPLX, group = hap_block, fill = hap_block)) + 332 | geom_density(alpha = 0.5) + 333 | ggtitle(paste0(file, "\nN_hap: ", nrow(seqs[seqs$hap_block == "Y",]), "; N_not: ", nrow(seqs[seqs$hap_block == "N",]) )) 334 | 335 | 336 | ggsave(distr, file = paste0(out_dir, name, "_", chrom, "_complexity_distribution_noN_unique.png"), height = 5, width = 6) 337 | 338 | head(seqs) 339 | nrow(seqs) 340 | counts <- data.frame(table(seqs$hap_block)) 341 | 342 | medians <- data.frame(aggregate(REF_CPLX ~ hap_block, data = seqs, FUN = median)) 343 | means <- data.frame(aggregate(REF_CPLX ~ hap_block, data = seqs, FUN = mean)) 344 | 345 | stats[i,"FILE"] <- file 346 | stats[i,"N_HAP"] <- counts[counts$Var1 == "Y", "Freq"] 347 | stats[i,"N_NOT"] <- counts[counts$Var1 == "N", "Freq"] 348 | 349 | if (nrow(medians[medians$hap_block == "Y",]) > 0){ 350 | stats[i,"MEDIAN_HAP"] <- medians[medians$hap_block == "Y","REF_CPLX"] 351 | stats[i,"MEAN_HAP"] <- means[means$hap_block == "Y","REF_CPLX"] 352 | } 353 | if (nrow(medians[medians$hap_block == "N",]) > 0){ 354 | stats[i,"MEDIAN_NOT"] <- medians[medians$hap_block == "N","REF_CPLX"] 355 | stats[i,"MEAN_NOT"] <- means[means$hap_block == "N","REF_CPLX"] 356 | } 357 | 358 | if ((nrow(medians[medians$hap_block == "N",]) > 0) & (nrow(medians[medians$hap_block == "Y",]) > 0)){ 359 | stats[i,"P_VALUE"] <- (pairwise.wilcox.test(seqs$REF_CPLX,seqs$hap_block, p.adjust.method = "BH"))$p.value 360 | } 361 | 362 | row_end <- sample_no*i 363 | if((nrow(seqs[seqs$hap_block == "Y",]) < sample_no) | (nrow(seqs[seqs$hap_block == "N",]) < sample_no)){ 364 | print("not enough samples") 365 | hap_blocks[c(row_start:row_end),"REF"] <- file 366 | not_block[c(row_start:row_end),"REF"] <- file 367 | } else { 368 | seqs$hap_block <- as.character(seqs$hap_block) 369 | hap_samples <- sample_n(seqs[seqs$hap_block == "Y",], sample_no) 370 | hap_blocks[c(row_start:row_end),] <- hap_samples 371 | not_samples <- sample_n(seqs[seqs$hap_block == "N",], sample_no) 372 | not_block[c(row_start:row_end),] <- not_samples 373 | 374 | #also want to calculate the subsampled stats per alignment 375 | subsamples <- rbind(hap_samples, not_samples) 376 | 377 | medians_sub <- data.frame(aggregate(REF_CPLX ~ hap_block, data = subsamples, FUN = median)) 378 | means_sub <- data.frame(aggregate(REF_CPLX ~ hap_block, data = subsamples, FUN = mean)) 379 | 380 | stats[i,"MEDIAN_HAP_SUB"] <- medians_sub[medians_sub$hap_block == "Y","REF_CPLX"] 381 | stats[i,"MEAN_HAP_SUB"] <- means_sub[means_sub$hap_block == "Y","REF_CPLX"] 382 | 383 | stats[i,"MEDIAN_NOT_SUB"] <- medians_sub[medians_sub$hap_block == "N","REF_CPLX"] 384 | stats[i,"MEAN_NOT_SUB"] <- means_sub[means_sub$hap_block == "N","REF_CPLX"] 385 | 386 | stats[i,"P_VALUE_SUB"] <- (pairwise.wilcox.test(subsamples$REF_CPLX, subsamples$hap_block, p.adjust.method = "BH"))$p.value 387 | } 388 | row_start <- row_end + 1 389 | } 390 | 391 | write.table(stats, file = paste0(out_dir, "stats_seq_complexity_per_chrom_", chrom, sample_no, "random_noN_unique.txt"), 392 | sep = "\t", 393 | col.names = TRUE, 394 | row.names = FALSE, 395 | quote = FALSE) 396 | 397 | head(hap_blocks) 398 | head(not_block) 399 | 400 | haps_noNA <- hap_blocks[complete.cases(hap_blocks$P1),] 401 | haps_noNA$hap_block <- "Y" 402 | not_noNA <- not_block[complete.cases(not_block$P1),] 403 | not_noNA$hap_block <- "N" 404 | 405 | combined <- rbind(haps_noNA, not_noNA) 406 | sink(paste0(out_dir, "stats_seq_overall_", chrom, sample_no, "random_noN_unique.txt")) 407 | median(haps_noNA$REF_CPLX) 408 | median(not_noNA$REF_CPLX) 409 | 410 | pairwise.wilcox.test(combined$REF_CPLX, combined$hap_block, p.adjust.method = "BH") 411 | 412 | sink() 413 | 414 | 415 | ggplot(combined, aes(x = hap_block, y = REF_CPLX, group = hap_block)) + 416 | geom_boxplot() 417 | 418 | ggsave(paste0(out_dir, "stats_seq_overall_", chrom, sample_no, "random_boxplot_noN_unique.png")) 419 | -------------------------------------------------------------------------------- /figures/plot_extended_data_figure_1.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Supplemental_plots" 3 | author: "Jemima Brinton" 4 | date: "20/04/2020" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | 11 | library(dplyr) 12 | library(magrittr) 13 | library(GenomicRanges) 14 | library(ggplot2) 15 | library(tidyr) 16 | library(viridis) 17 | library(stringr) 18 | library(gridExtra) 19 | 20 | 21 | #Read Delta functions and plotting functions (adapted) used are from https://jmonlong.github.io/Hippocamplus/2017/09/19/mummerplots-with-ggplot2/ 22 | #Reads in delta file output from nucmer into R and summarises into dataframe 23 | readDelta <- function(deltafile){ 24 | lines = scan(deltafile, 'a', sep='\n', quiet=TRUE) 25 | lines = lines[-1] 26 | lines.l = strsplit(lines, ' ') 27 | lines.len = lapply(lines.l, length) %>% as.numeric 28 | lines.l = lines.l[lines.len != 1] 29 | lines.len = lines.len[lines.len != 1] 30 | head.pos = which(lines.len == 4) 31 | head.id = rep(head.pos, c(head.pos[-1], length(lines.l)+1)-head.pos) 32 | mat = matrix(as.numeric(unlist(lines.l[lines.len==7])), 7) 33 | res = as.data.frame(t(mat[1:5,])) 34 | colnames(res) = c('rs','re','qs','qe','error') 35 | res$qid = unlist(lapply(lines.l[head.id[lines.len==7]], '[', 2)) 36 | res$rid = unlist(lapply(lines.l[head.id[lines.len==7]], '[', 1)) %>% gsub('^>', '', .) 37 | res$strand = ifelse(res$qe-res$qs > 0, '+', '-') 38 | res 39 | } 40 | 41 | #calculates percentage identity and mid point for each mummer alignment 42 | calculate_perc_id_mid_points <- function(data){ 43 | data$r_length <- (data$re - data$rs) 44 | data$perc_id <- ((data$r_length - data$error)/data$r_length)*100 45 | data$perc_id_factor <- data$perc_id 46 | data[data$perc_id < 100, "perc_id_factor"] <- "<100" 47 | data$perc_id_factor <- as.factor(data$perc_id_factor) 48 | data$r_mid <- (data$rs + data$re)/2 49 | data$q_mid <- (data$qs + data$qe)/2 50 | return(data) 51 | } 52 | 53 | #reads in delta file, calculates percentage ID and filters for a minimum alignment size 54 | pre_plot_analysis <- function(delta_path, min_size = 20000){ 55 | data = readDelta(delta_path) 56 | data <- calculate_perc_id_mid_points(data) 57 | data_filt <- data[data$r_length >= min_size,] 58 | return(data_filt) 59 | } 60 | 61 | # Plots diagonal scatter plot of a nucmer pairwise alignment with points coloured by percentage identity of alignment - x axis is reference, y axis is query 62 | plot_by_perc_id_cap <- function(data, xmin = 0, xmax = max(data$re), cap_lower, cap_upper){ 63 | #order data by % ID to help with overplotting 64 | data <- data[order(data$perc_id),] 65 | ggplot(data[data$rs > xmin & data$re < xmax,], aes(x=rs, xend=re, y=qs, yend=qe, colour=perc_id)) + geom_segment() + 66 | geom_point(alpha=.5) + 67 | theme_bw() + 68 | theme(strip.text.y=element_text(angle=180, size=5), 69 | strip.background=element_blank()) + 70 | xlab(unique(data$rid)) + ylab(unique(data$qid)) + 71 | scale_colour_viridis(limits = c(cap_lower, cap_upper)) + 72 | scale_x_continuous(breaks=seq(xmin,xmax,100000000)) + 73 | scale_y_continuous(breaks=seq(0,max(data$qe),100000000)) 74 | } 75 | 76 | plot_by_perc_id_cap_midpoint <- function(data, xmin = 0, xmax = max(data$re), cap_lower, cap_upper){ 77 | #order data by % ID to help with overplotting 78 | data <- data[order(data$perc_id),] 79 | ggplot(data[data$rs > xmin & data$re < xmax,], aes(x=r_mid, y=q_mid, colour=perc_id)) + 80 | geom_point(size = 0.3) + 81 | theme_bw() + 82 | theme(strip.text.y=element_text(angle=180, size=5), 83 | strip.background=element_blank(), 84 | legend.position = "none", 85 | axis.title.x=element_blank(), 86 | axis.text.x=element_blank(), 87 | axis.title.y=element_blank(), 88 | axis.text.y=element_blank(),) + 89 | #xlab(unique(data$rid)) + ylab(unique(data$qid)) + 90 | scale_colour_viridis(limits = c(cap_lower, cap_upper)) + 91 | scale_x_continuous(breaks=seq(xmin,xmax,100000000)) + 92 | scale_y_continuous(breaks=seq(0,max(data$qe),100000000))+ 93 | coord_cartesian(xlim=c(xmin,xmax)) 94 | } 95 | 96 | 97 | plot_perc_id_v_ref <- function(data, xmin = 0, xmax = max(data$re), ymin = 97, ymax = 100){ 98 | ggplot(data[data$r_mid > xmin & data$r_mid < xmax,], aes(x=r_mid, y=perc_id, colour=r_length)) + 99 | theme_bw() + xlab(data$rid) + ylab(paste0('percentage ID v ', data$qid)) + 100 | geom_point(alpha=.5) + 101 | ylim(ymin, ymax) + 102 | scale_colour_viridis() 103 | } 104 | 105 | plot_perc_id_v_ref_no_length <- function(data, xmin = 0, xmax = max(data$re), ymin = 97, ymax = 100, breaks = 0.5){ 106 | ggplot(data[data$r_mid > xmin & data$r_mid < xmax,], aes(x=r_mid, y=perc_id)) + 107 | theme_bw() + 108 | #xlab(data$rid) + 109 | #ylab(paste0('percentage ID v ', data$qid)) + 110 | theme(strip.text.y=element_text(angle=180, size=5), 111 | strip.background=element_blank(), 112 | legend.position = "none", 113 | axis.title.x=element_blank(), 114 | axis.text.x=element_blank(), 115 | axis.title.y=element_blank(), 116 | axis.text.y=element_blank(),) + 117 | geom_point(alpha=.5, size = 0.5, colour = "gray25") + 118 | scale_x_continuous(breaks=seq(xmin,xmax,100000000)) + 119 | scale_y_continuous(limits = c(ymin, ymax), breaks=seq(ymin,ymax,breaks)) + 120 | coord_cartesian(xlim=c(xmin,xmax)) 121 | } 122 | 123 | #bins alignment dataframe by reference position 124 | bin_data <- function(data, bin_size, max_chrom_size){ 125 | bins <- seq(0, max_chrom_size, by = bin_size) 126 | data$bin <- NA 127 | for (i in bins){ 128 | data$bin <- ifelse(((data$r_mid > (i-bin_size)) & (data$r_mid < (i-1))), i, data$bin) 129 | } 130 | return(data) 131 | } 132 | 133 | #plots median percentage identity of each bin against reference position, colour of point is based on if the median percentage identity passes the identity threshold 134 | plot_medians_line_colour_cut_off <- function(data, bin_size = 10000000, cut_off = 99.99, ymin = 97, ymax = 100, max_chrom_size){ 135 | comparison_filt_bin <- bin_data(data, bin_size = bin_size, max_chrom_size = max_chrom_size) 136 | comparison_medians <- aggregate(perc_id ~ bin, data = comparison_filt_bin, FUN=median) 137 | colnames(comparison_medians)[2] <- "perc_id_median" 138 | comparison_medians$cut_off <- NA 139 | comparison_medians$cut_off <- ifelse(comparison_medians$perc_id >= cut_off, paste0(">=",cut_off), paste0("<",cut_off)) 140 | comparison_medians$cut_off <- as.factor(comparison_medians$cut_off) 141 | 142 | ggplot(comparison_medians, aes(x=bin, y = perc_id_median, colour = cut_off)) + 143 | geom_line(colour = "grey") + 144 | geom_point(size = 1) + 145 | ylim(ymin,ymax) + 146 | scale_colour_manual(values = c("#73D055FF", "#440154FF")) + 147 | labs(colour = "% id") + 148 | theme_bw() + 149 | xlab(data$rid) + 150 | 151 | ggtitle(paste0(data$rid, " v ", data$qid, " BinSize: ", bin_size, " CutOff: ", cut_off)) 152 | } 153 | 154 | #plots boxplots of percentage identity of each bin against reference position, colour of boxplot is based on if the median percentage identity passes the identity threshold 155 | plot_boxplots_median_colour_cut_off <- function(data, bin_size = 10000000, cut_off = 99.99, ymin = 97, ymax = 100, max_chrom_size){ 156 | comparison_filt_bin <- bin_data(data, bin_size = bin_size, max_chrom_size = max_chrom_size) 157 | comparison_medians <- aggregate(perc_id ~ bin, data = comparison_filt_bin, FUN=median) 158 | colnames(comparison_medians)[2] <- "perc_id_median" 159 | comparison_medians$cut_off <- NA 160 | comparison_medians$cut_off <- ifelse(comparison_medians$perc_id >= cut_off, paste0(">=",cut_off), paste0("<",cut_off)) 161 | comparison_medians$cut_off <- as.factor(comparison_medians$cut_off) 162 | 163 | comparison_to_plot <- merge(comparison_filt_bin, comparison_medians) 164 | 165 | ggplot(comparison_to_plot, aes(x=bin, y = perc_id, group = bin, fill = cut_off)) + 166 | geom_boxplot(outlier.shape = NA, lwd = 0.3) + 167 | scale_fill_manual(values = c("darkorchid4", "gold")) + 168 | #labs(fill = "% id") + 169 | #scale_colour_viridis() + 170 | #xlab(data$rid) + 171 | theme_bw() + 172 | theme(strip.text.y=element_text(angle=180, size=5), 173 | strip.background=element_blank(), 174 | legend.position = "none", 175 | axis.title.x=element_blank(), 176 | axis.text.x=element_blank(), 177 | axis.title.y=element_blank(), 178 | axis.text.y=element_blank(),) + 179 | #ggtitle(paste0(data$rid, " v ", data$qid, " BinSize: ", bin_size, " CutOff: ", cut_off)) + 180 | scale_x_continuous(breaks=seq(0,max(data$re),100000000)) + 181 | scale_y_continuous(limits = c(ymin,ymax),breaks=seq(ymin,ymax,0.5)) 182 | } 183 | 184 | 185 | ## END OF FUNCTIONS 186 | 187 | ``` 188 | 189 | We want to zoom in on the junction in mace and stanley with the smaller window sizes 190 | 191 | Read in the raw data 192 | 193 | ```{r} 194 | 195 | base_dir <- "X:/brintonj/haplotype/whole_genome_mummer/" 196 | 197 | min_size <- 20000 198 | cut_off <- 99.99 199 | 200 | chr <- "6A" 201 | chrom <- paste0("chr", chr) 202 | 203 | variety <- "mace" 204 | 205 | data_dir_6A <- "X:/brintonj/6A_region/mummer/whole_6A_aln/" 206 | data_dir <- paste0(data_dir_6A, variety, "/") 207 | 208 | all_files <- list.files(data_dir) 209 | 210 | filtered_delta <- all_files[grep("_L20Kb_rq.delta", all_files)] 211 | 212 | comparison <- "mace_v_stanley.all_6A_filtered_L20Kb_rq.delta" 213 | 214 | comparison_delta_path <- paste0(data_dir, "/", comparison) 215 | comparison_id <- str_split_fixed(comparison, "\\.", 3)[1] 216 | 217 | ref <- variety 218 | 219 | if (ref == "sy_mattis"){ 220 | query <- str_split_fixed(comparison_id, "_", 4)[4] 221 | } else { 222 | query <- str_split_fixed(comparison_id, "_", 3)[3] 223 | } 224 | 225 | #read and filter for the minimum size 226 | comparison_filt <- pre_plot_analysis(delta_path = comparison_delta_path, min_size = min_size) 227 | print("data read in and filtered") 228 | head(comparison_filt) 229 | min(comparison_filt$perc_id) 230 | 231 | x_max <- 610000000 232 | ``` 233 | 234 | Also read in the mummer blocks 235 | 236 | ```{r} 237 | #read in the mummer_block coordinates 238 | mummer <- read.table("X:/brintonj/haplotype/blocks_updated_coords/converted_10g/corrected/whole_genome_mummer_BLAST_5000000_blocks_combined_updated_ref_coords_10g_corrected.tsv", 239 | header = TRUE, 240 | stringsAsFactors = FALSE) 241 | 242 | blocks_to_use <- mummer[(mummer$ref == "mace") & (mummer$query == "stanley") & (mummer$chrom == "chr6A") & (mummer$source == "mummer"),] 243 | 244 | blocks_to_use 245 | ``` 246 | 247 | We are interested in the 1.85e+08 2.80e+08 block, the left hand junction at 1.85e+08 248 | 249 | One level at a time...step down to the 2.5mbp blocks 250 | 251 | ```{r} 252 | #read in the mummer_block coordinates 253 | 254 | mummer_25mbp <- read.table("X:/brintonj/haplotype/blocks_updated_coords/converted_10g/corrected/no_spelta_2gap/whole_genome_mummer_BLAST_2500000_blocks_combined_updated_ref_coords_10g_corrected_2gap_no_spelta.tsv", 255 | header = TRUE, 256 | stringsAsFactors = FALSE) 257 | 258 | blocks_to_use_25m <- mummer_25mbp[(mummer_25mbp$ref == "mace") & (mummer_25mbp$query == "stanley") & (mummer_25mbp$chrom == "chr6A") & (mummer_25mbp$source == "mummer"),] 259 | 260 | blocks_to_use_25m 261 | 262 | head(mummer_25mbp) 263 | 264 | ``` 265 | 266 | The 1.85e+08 2.80e+08 block is still there - lets try zooming into .150-2.50 267 | 268 | 5Mbp bins first 269 | ```{r} 270 | 271 | width = 4 272 | height = 3 273 | 274 | data = comparison_filt 275 | bin_size = 5000000 276 | max_chrom_size = max(comparison_filt$re) 277 | alpha = 0.5 278 | fill = "#de87cd" 279 | cut_off = 99.99 280 | ymin = 99.9 281 | ymax = 100 282 | xmin = 150000000 283 | xmax = 250000000 284 | 285 | comparison_filt_bin <- bin_data(data, bin_size = bin_size, max_chrom_size = max_chrom_size) 286 | comparison_medians <- aggregate(perc_id ~ bin, data = comparison_filt_bin, FUN=median) 287 | colnames(comparison_medians)[2] <- "perc_id_median" 288 | comparison_medians$cut_off <- NA 289 | comparison_medians$cut_off <- ifelse(comparison_medians$perc_id >= cut_off, paste0(">=",cut_off), paste0("<",cut_off)) 290 | comparison_medians$cut_off <- as.factor(comparison_medians$cut_off) 291 | 292 | comparison_to_plot <- merge(comparison_filt_bin, comparison_medians) 293 | 294 | perc_boxplot_5M_shade_zoom <- ggplot(comparison_to_plot, aes(x=bin, y = perc_id, group = bin, fill = cut_off)) + 295 | annotate("rect", xmin = blocks_to_use$ref_start-(bin_size*0.5), xmax = blocks_to_use$ref_end+(bin_size*0.5), ymin=-Inf,ymax=Inf, alpha = alpha, fill = fill) + 296 | geom_boxplot(outlier.shape = NA, lwd = 0.3) + 297 | scale_fill_manual(values = c("white", "gray50")) + 298 | #labs(fill = "% id") + 299 | #scale_colour_viridis() + 300 | #xlab(data$rid) + 301 | theme_bw() + 302 | theme(strip.text.y=element_text(angle=180, size=5), 303 | axis.text.x = element_text(angle = 90), 304 | legend.position = "none")+#, 305 | #strip.background=element_blank(), 306 | #legend.position = "none", 307 | #axis.title.x=element_blank(), 308 | #axis.text.x=element_blank(), 309 | #axis.title.y=element_blank(), 310 | #axis.text.y=element_blank(),) + 311 | #ggtitle(paste0(data$rid, " v ", data$qid, " BinSize: ", bin_size, " CutOff: ", cut_off)) + 312 | scale_x_continuous(breaks=seq(xmin, xmax, 10000000)) + 313 | scale_y_continuous(limits = c(0,ymax),breaks=seq(0,ymax,0.01)) + 314 | coord_cartesian(ylim=c(ymin,ymax), xlim = c(xmin, xmax)) 315 | 316 | perc_boxplot_5M_shade_zoom 317 | 318 | pdf(paste0("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/extended_data/transition_zone_Ex1_", bin_size, ".pdf"), width = width, height = height) 319 | print(perc_boxplot_5M_shade_zoom) 320 | dev.off() 321 | ``` 322 | 323 | ```{r} 324 | 325 | 326 | data = comparison_filt 327 | bin_size = 2500000 328 | max_chrom_size = max(comparison_filt$re) 329 | alpha = 0.5 330 | fill = "#de87cd" 331 | cut_off = 99.99 332 | ymin = 99.9 333 | ymax = 100 334 | xmin = 150000000 335 | xmax = 250000000 336 | 337 | comparison_filt_bin <- bin_data(data, bin_size = bin_size, max_chrom_size = max_chrom_size) 338 | comparison_medians <- aggregate(perc_id ~ bin, data = comparison_filt_bin, FUN=median) 339 | colnames(comparison_medians)[2] <- "perc_id_median" 340 | comparison_medians$cut_off <- NA 341 | comparison_medians$cut_off <- ifelse(comparison_medians$perc_id >= cut_off, paste0(">=",cut_off), paste0("<",cut_off)) 342 | comparison_medians$cut_off <- as.factor(comparison_medians$cut_off) 343 | 344 | comparison_to_plot <- merge(comparison_filt_bin, comparison_medians) 345 | 346 | perc_boxplot_5M_shade_zoom_25mbp <- ggplot(comparison_to_plot, aes(x=bin, y = perc_id, group = bin, fill = cut_off)) + 347 | annotate("rect", xmin = blocks_to_use_25m$ref_start-(bin_size*0.5), xmax = blocks_to_use_25m$ref_end+(bin_size*0.5), ymin=-Inf,ymax=Inf, alpha = alpha, fill = fill) + 348 | geom_boxplot(outlier.shape = NA, lwd = 0.3) + 349 | scale_fill_manual(values = c("white", "gray50")) + 350 | #labs(fill = "% id") + 351 | #scale_colour_viridis() + 352 | #xlab(data$rid) + 353 | theme_bw() + 354 | theme(strip.text.y=element_text(angle=180, size=5), 355 | axis.text.x = element_text(angle = 90), 356 | legend.position = "none")+#, 357 | #strip.background=element_blank(), 358 | #legend.position = "none", 359 | #axis.title.x=element_blank(), 360 | #axis.text.x=element_blank(), 361 | #axis.title.y=element_blank(), 362 | #axis.text.y=element_blank(),) + 363 | #ggtitle(paste0(data$rid, " v ", data$qid, " BinSize: ", bin_size, " CutOff: ", cut_off)) + 364 | scale_x_continuous(breaks=seq(xmin, xmax, 10000000)) + 365 | scale_y_continuous(limits = c(0,ymax),breaks=seq(0,ymax,0.01)) + 366 | coord_cartesian(ylim=c(ymin,ymax), xlim = c(xmin, xmax)) 367 | 368 | perc_boxplot_5M_shade_zoom_25mbp 369 | 370 | pdf(paste0("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/extended_data/transition_zone_Ex1_", bin_size, ".pdf"), width = width, height = height) 371 | print(perc_boxplot_5M_shade_zoom_25mbp) 372 | dev.off() 373 | ``` 374 | 375 | 376 | 377 | 1mbp now 378 | 379 | 380 | 381 | ```{r} 382 | #read in the mummer_block coordinates 383 | mummer_1mbp <- read.table("X:/brintonj/haplotype/blocks_updated_coords/converted_10g/corrected/no_spelta_2gap/whole_genome_mummer_BLAST_1e+06_blocks_combined_updated_ref_coords_10g_corrected_2gap_no_spelta.tsv", 384 | header = TRUE, 385 | stringsAsFactors = FALSE) 386 | 387 | blocks_to_use_1m <- mummer_1mbp[(mummer_1mbp$ref == "mace") & (mummer_1mbp$query == "stanley") & (mummer_1mbp$chrom == "chr6A") & (mummer_1mbp$source == "mummer"),] 388 | 389 | blocks_to_use_1m 390 | 391 | 392 | 393 | ``` 394 | 395 | zooming into .150-2.50 396 | 397 | ```{r} 398 | 399 | data = comparison_filt 400 | bin_size = 1000000 401 | max_chrom_size = max(comparison_filt$re) 402 | alpha = 0.5 403 | fill = "#de87cd" 404 | cut_off = 99.99 405 | ymin = 99.9 406 | ymax = 100 407 | xmin = 165000000 408 | xmax = 215000000 409 | 410 | comparison_filt_bin <- bin_data(data, bin_size = bin_size, max_chrom_size = max_chrom_size) 411 | comparison_medians <- aggregate(perc_id ~ bin, data = comparison_filt_bin, FUN=median) 412 | colnames(comparison_medians)[2] <- "perc_id_median" 413 | comparison_medians$cut_off <- NA 414 | comparison_medians$cut_off <- ifelse(comparison_medians$perc_id >= cut_off, paste0(">=",cut_off), paste0("<",cut_off)) 415 | comparison_medians$cut_off <- as.factor(comparison_medians$cut_off) 416 | 417 | comparison_to_plot <- merge(comparison_filt_bin, comparison_medians) 418 | 419 | perc_boxplot_5M_shade_zoom_1mbp <- ggplot(comparison_to_plot, aes(x=bin, y = perc_id, group = bin, fill = cut_off)) + 420 | annotate("rect", xmin = blocks_to_use_1m$ref_start-(bin_size*0.5), xmax = blocks_to_use_1m$ref_end+(bin_size*0.5), ymin=-Inf,ymax=Inf, alpha = alpha, fill = fill) + 421 | geom_boxplot(outlier.shape = NA, lwd = 0.3) + 422 | scale_fill_manual(values = c("white", "gray50")) + 423 | #labs(fill = "% id") + 424 | #scale_colour_viridis() + 425 | #xlab(data$rid) + 426 | theme_bw() + 427 | theme(strip.text.y=element_text(angle=180, size=5), 428 | axis.text.x = element_text(angle = 90), 429 | legend.position = "none")+#, 430 | #strip.background=element_blank(), 431 | #legend.position = "none", 432 | #axis.title.x=element_blank(), 433 | #axis.text.x=element_blank(), 434 | #axis.title.y=element_blank(), 435 | #axis.text.y=element_blank(),) + 436 | #ggtitle(paste0(data$rid, " v ", data$qid, " BinSize: ", bin_size, " CutOff: ", cut_off)) + 437 | scale_x_continuous(breaks=seq(xmin, xmax, 5000000)) + 438 | scale_y_continuous(limits = c(0,ymax),breaks=seq(0,ymax,0.01)) + 439 | coord_cartesian(ylim=c(ymin,ymax), xlim = c(xmin, xmax)) 440 | 441 | perc_boxplot_5M_shade_zoom_1mbp 442 | 443 | pdf(paste0("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/extended_data/transition_zone_Ex1_", bin_size, ".pdf"), width = width, height = height) 444 | print(perc_boxplot_5M_shade_zoom_1mbp) 445 | dev.off() 446 | ``` 447 | 448 | 449 | 450 | Try plotting the alignments in this border region with some indication of percentage ID 451 | ```{r} 452 | data = comparison_filt 453 | bin_size = 250000 454 | max_chrom_size = max(comparison_filt$re) 455 | alpha = 0.5 456 | fill = "#de87cd" 457 | cut_off = 99.99 458 | ymin = 99.9 459 | ymax = 100 460 | xmin = 186000000 461 | xmax = 188000000 462 | 463 | aln_plot <- ggplot(data[data$rs > xmin & data$re < xmax,], aes(x=rs, xend=re, y=perc_id, yend=perc_id)) + 464 | annotate("rect", xmin = blocks_to_use_1m$ref_start-(bin_size*0.5), xmax = blocks_to_use_1m$ref_end+(bin_size*0.5), ymin=-Inf,ymax=Inf, alpha = alpha, fill = fill) + 465 | geom_segment() + 466 | #geom_point(alpha=.5) + 467 | theme_bw() + 468 | scale_x_continuous(breaks=seq(xmin, xmax, 100000)) + 469 | scale_y_continuous(limits = c(0,ymax),breaks=seq(0,ymax,0.01)) + 470 | coord_cartesian(ylim=c(ymin,ymax), xlim = c(xmin, xmax)) + 471 | theme(axis.text.x = element_text(angle = 90)) 472 | 473 | aln_plot 474 | 475 | pdf(paste0("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/extended_data/transition_zone_Ex1_alignments.pdf"), width = width, height = height) 476 | print(aln_plot) 477 | dev.off() 478 | 479 | ``` 480 | 481 | -------------------------------------------------------------------------------- /figures/figure_1_plots.rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Plot figure 1 plots" 3 | author: "Jemima Brinton" 4 | date: "08/04/2020" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | 11 | ## generate plots for figure 1 12 | ## using the mace/stanley 6A alignment example 13 | 14 | ## FUNCTIONS 15 | library(dplyr) 16 | library(magrittr) 17 | library(GenomicRanges) 18 | library(ggplot2) 19 | library(tidyr) 20 | library(viridis) 21 | library(stringr) 22 | library(gridExtra) 23 | 24 | 25 | #Read Delta functions and plotting functions (adapted) used are from https://jmonlong.github.io/Hippocamplus/2017/09/19/mummerplots-with-ggplot2/ 26 | #Reads in delta file output from nucmer into R and summarises into dataframe 27 | readDelta <- function(deltafile){ 28 | lines = scan(deltafile, 'a', sep='\n', quiet=TRUE) 29 | lines = lines[-1] 30 | lines.l = strsplit(lines, ' ') 31 | lines.len = lapply(lines.l, length) %>% as.numeric 32 | lines.l = lines.l[lines.len != 1] 33 | lines.len = lines.len[lines.len != 1] 34 | head.pos = which(lines.len == 4) 35 | head.id = rep(head.pos, c(head.pos[-1], length(lines.l)+1)-head.pos) 36 | mat = matrix(as.numeric(unlist(lines.l[lines.len==7])), 7) 37 | res = as.data.frame(t(mat[1:5,])) 38 | colnames(res) = c('rs','re','qs','qe','error') 39 | res$qid = unlist(lapply(lines.l[head.id[lines.len==7]], '[', 2)) 40 | res$rid = unlist(lapply(lines.l[head.id[lines.len==7]], '[', 1)) %>% gsub('^>', '', .) 41 | res$strand = ifelse(res$qe-res$qs > 0, '+', '-') 42 | res 43 | } 44 | 45 | #calculates percentage identity and mid point for each mummer alignment 46 | calculate_perc_id_mid_points <- function(data){ 47 | data$r_length <- (data$re - data$rs) 48 | data$perc_id <- ((data$r_length - data$error)/data$r_length)*100 49 | data$perc_id_factor <- data$perc_id 50 | data[data$perc_id < 100, "perc_id_factor"] <- "<100" 51 | data$perc_id_factor <- as.factor(data$perc_id_factor) 52 | data$r_mid <- (data$rs + data$re)/2 53 | data$q_mid <- (data$qs + data$qe)/2 54 | return(data) 55 | } 56 | 57 | #reads in delta file, calculates percentage ID and filters for a minimum alignment size 58 | pre_plot_analysis <- function(delta_path, min_size = 20000){ 59 | data = readDelta(delta_path) 60 | data <- calculate_perc_id_mid_points(data) 61 | data_filt <- data[data$r_length >= min_size,] 62 | return(data_filt) 63 | } 64 | 65 | # Plots diagonal scatter plot of a nucmer pairwise alignment with points coloured by percentage identity of alignment - x axis is reference, y axis is query 66 | plot_by_perc_id_cap <- function(data, xmin = 0, xmax = max(data$re), cap_lower, cap_upper){ 67 | #order data by % ID to help with overplotting 68 | data <- data[order(data$perc_id),] 69 | ggplot(data[data$rs > xmin & data$re < xmax,], aes(x=rs, xend=re, y=qs, yend=qe, colour=perc_id)) + geom_segment() + 70 | geom_point(alpha=.5) + 71 | theme_bw() + 72 | theme(strip.text.y=element_text(angle=180, size=5), 73 | strip.background=element_blank()) + 74 | xlab(unique(data$rid)) + ylab(unique(data$qid)) + 75 | scale_colour_viridis(limits = c(cap_lower, cap_upper)) + 76 | scale_x_continuous(breaks=seq(xmin,xmax,100000000)) + 77 | scale_y_continuous(breaks=seq(0,max(data$qe),100000000)) 78 | } 79 | 80 | plot_by_perc_id_cap_midpoint <- function(data, xmin = 0, xmax = max(data$re), cap_lower, cap_upper){ 81 | #order data by % ID to help with overplotting 82 | data <- data[order(data$perc_id),] 83 | ggplot(data[data$rs > xmin & data$re < xmax,], aes(x=r_mid, y=q_mid, colour=perc_id)) + 84 | geom_point(size = 0.3) + 85 | theme_bw() + 86 | theme(strip.text.y=element_text(angle=180, size=5), 87 | strip.background=element_blank(), 88 | legend.position = "none", 89 | axis.title.x=element_blank(), 90 | axis.text.x=element_blank(), 91 | axis.title.y=element_blank(), 92 | axis.text.y=element_blank(),) + 93 | #xlab(unique(data$rid)) + ylab(unique(data$qid)) + 94 | scale_colour_viridis(limits = c(cap_lower, cap_upper)) + 95 | scale_x_continuous(breaks=seq(xmin,xmax,100000000)) + 96 | scale_y_continuous(breaks=seq(0,max(data$qe),100000000))+ 97 | coord_cartesian(xlim=c(xmin,xmax)) 98 | } 99 | 100 | 101 | plot_perc_id_v_ref <- function(data, xmin = 0, xmax = max(data$re), ymin = 97, ymax = 100){ 102 | ggplot(data[data$r_mid > xmin & data$r_mid < xmax,], aes(x=r_mid, y=perc_id, colour=r_length)) + 103 | theme_bw() + xlab(data$rid) + ylab(paste0('percentage ID v ', data$qid)) + 104 | geom_point(alpha=.5) + 105 | ylim(ymin, ymax) + 106 | scale_colour_viridis() 107 | } 108 | 109 | plot_perc_id_v_ref_no_length <- function(data, xmin = 0, xmax = max(data$re), ymin = 97, ymax = 100, breaks = 0.5){ 110 | ggplot(data[data$r_mid > xmin & data$r_mid < xmax,], aes(x=r_mid, y=perc_id)) + 111 | theme_bw() + 112 | #xlab(data$rid) + 113 | #ylab(paste0('percentage ID v ', data$qid)) + 114 | theme(strip.text.y=element_text(angle=180, size=5), 115 | strip.background=element_blank(), 116 | legend.position = "none", 117 | axis.title.x=element_blank(), 118 | axis.text.x=element_blank(), 119 | axis.title.y=element_blank(), 120 | axis.text.y=element_blank(),) + 121 | geom_point(alpha=.5, size = 0.5, colour = "gray25") + 122 | scale_x_continuous(breaks=seq(xmin,xmax,100000000)) + 123 | scale_y_continuous(limits = c(ymin, ymax), breaks=seq(ymin,ymax,breaks)) + 124 | coord_cartesian(xlim=c(xmin,xmax)) 125 | } 126 | 127 | #bins alignment dataframe by reference position 128 | bin_data <- function(data, bin_size, max_chrom_size){ 129 | bins <- seq(0, max_chrom_size, by = bin_size) 130 | data$bin <- NA 131 | for (i in bins){ 132 | data$bin <- ifelse(((data$r_mid > (i-bin_size)) & (data$r_mid < (i-1))), i, data$bin) 133 | } 134 | return(data) 135 | } 136 | 137 | #plots median percentage identity of each bin against reference position, colour of point is based on if the median percentage identity passes the identity threshold 138 | plot_medians_line_colour_cut_off <- function(data, bin_size = 10000000, cut_off = 99.99, ymin = 97, ymax = 100, max_chrom_size){ 139 | comparison_filt_bin <- bin_data(data, bin_size = bin_size, max_chrom_size = max_chrom_size) 140 | comparison_medians <- aggregate(perc_id ~ bin, data = comparison_filt_bin, FUN=median) 141 | colnames(comparison_medians)[2] <- "perc_id_median" 142 | comparison_medians$cut_off <- NA 143 | comparison_medians$cut_off <- ifelse(comparison_medians$perc_id >= cut_off, paste0(">=",cut_off), paste0("<",cut_off)) 144 | comparison_medians$cut_off <- as.factor(comparison_medians$cut_off) 145 | 146 | ggplot(comparison_medians, aes(x=bin, y = perc_id_median, colour = cut_off)) + 147 | geom_line(colour = "grey") + 148 | geom_point(size = 1) + 149 | ylim(ymin,ymax) + 150 | scale_colour_manual(values = c("#73D055FF", "#440154FF")) + 151 | labs(colour = "% id") + 152 | theme_bw() + 153 | xlab(data$rid) + 154 | 155 | ggtitle(paste0(data$rid, " v ", data$qid, " BinSize: ", bin_size, " CutOff: ", cut_off)) 156 | } 157 | 158 | #plots boxplots of percentage identity of each bin against reference position, colour of boxplot is based on if the median percentage identity passes the identity threshold 159 | plot_boxplots_median_colour_cut_off <- function(data, bin_size = 10000000, cut_off = 99.99, ymin = 97, ymax = 100, max_chrom_size){ 160 | comparison_filt_bin <- bin_data(data, bin_size = bin_size, max_chrom_size = max_chrom_size) 161 | comparison_medians <- aggregate(perc_id ~ bin, data = comparison_filt_bin, FUN=median) 162 | colnames(comparison_medians)[2] <- "perc_id_median" 163 | comparison_medians$cut_off <- NA 164 | comparison_medians$cut_off <- ifelse(comparison_medians$perc_id >= cut_off, paste0(">=",cut_off), paste0("<",cut_off)) 165 | comparison_medians$cut_off <- as.factor(comparison_medians$cut_off) 166 | 167 | comparison_to_plot <- merge(comparison_filt_bin, comparison_medians) 168 | 169 | ggplot(comparison_to_plot, aes(x=bin, y = perc_id, group = bin, fill = cut_off)) + 170 | geom_boxplot(outlier.shape = NA, lwd = 0.3) + 171 | scale_fill_manual(values = c("darkorchid4", "gold")) + 172 | #labs(fill = "% id") + 173 | #scale_colour_viridis() + 174 | #xlab(data$rid) + 175 | theme_bw() + 176 | theme(strip.text.y=element_text(angle=180, size=5), 177 | strip.background=element_blank(), 178 | legend.position = "none", 179 | axis.title.x=element_blank(), 180 | axis.text.x=element_blank(), 181 | axis.title.y=element_blank(), 182 | axis.text.y=element_blank(),) + 183 | #ggtitle(paste0(data$rid, " v ", data$qid, " BinSize: ", bin_size, " CutOff: ", cut_off)) + 184 | scale_x_continuous(breaks=seq(0,max(data$re),100000000)) + 185 | scale_y_continuous(limits = c(ymin,ymax),breaks=seq(ymin,ymax,0.5)) 186 | } 187 | 188 | 189 | ## END OF FUNCTIONS 190 | ``` 191 | 192 | ```{r} 193 | 194 | base_dir <- "X:/brintonj/haplotype/whole_genome_mummer/" 195 | 196 | min_size <- 20000 197 | cut_off <- 99.99 198 | 199 | chr <- "6A" 200 | chrom <- paste0("chr", chr) 201 | 202 | variety <- "mace" 203 | 204 | data_dir_6A <- "X:/brintonj/6A_region/mummer/whole_6A_aln/" 205 | data_dir <- paste0(data_dir_6A, variety, "/") 206 | 207 | all_files <- list.files(data_dir) 208 | 209 | filtered_delta <- all_files[grep("_L20Kb_rq.delta", all_files)] 210 | 211 | comparison <- "mace_v_stanley.all_6A_filtered_L20Kb_rq.delta" 212 | 213 | comparison_delta_path <- paste0(data_dir, "/", comparison) 214 | comparison_id <- str_split_fixed(comparison, "\\.", 3)[1] 215 | 216 | ref <- variety 217 | 218 | if (ref == "sy_mattis"){ 219 | query <- str_split_fixed(comparison_id, "_", 4)[4] 220 | } else { 221 | query <- str_split_fixed(comparison_id, "_", 3)[3] 222 | } 223 | 224 | #read and filter for the minimum size 225 | comparison_filt <- pre_plot_analysis(delta_path = comparison_delta_path, min_size = min_size) 226 | print("data read in and filtered") 227 | head(comparison_filt) 228 | min(comparison_filt$perc_id) 229 | 230 | x_max <- 610000000 231 | ``` 232 | 233 | 234 | 235 | ```{r} 236 | diagonal_dot_plot_mid <- plot_by_perc_id_cap_midpoint(comparison_filt, cap_lower = 97, cap_upper = 100, xmax = x_max) 237 | diagonal_dot_plot_mid 238 | ``` 239 | 240 | ```{r} 241 | diagonal_dot_plot_mid_legend <- plot_by_perc_id_cap(comparison_filt, cap_lower = 97, cap_upper = 100, xmax = x_max) 242 | diagonal_dot_plot_mid_legend 243 | 244 | ``` 245 | 246 | 247 | 248 | ```{r} 249 | perc_dot_plot_no_length <- plot_perc_id_v_ref_no_length(comparison_filt, ymin = 97, xmax = x_max) 250 | 251 | perc_dot_plot_no_length 252 | ``` 253 | ```{r} 254 | perc_boxplot_5M <- plot_boxplots_median_colour_cut_off(comparison_filt, bin_size = 5000000, max_chrom_size = max(comparison_filt$re)) 255 | perc_boxplot_5M 256 | ``` 257 | 258 | 259 | Add some shading in the background of the boxplots where haplotype blocks are called 260 | 261 | ```{r} 262 | #read in the mummer_block coordinates 263 | mummer <- read.table("X:/brintonj/haplotype/blocks_updated_coords/converted_10g/corrected/whole_genome_mummer_BLAST_5000000_blocks_combined_updated_ref_coords_10g_corrected_2gap_no_spelta.tsv", 264 | header = TRUE, 265 | stringsAsFactors = FALSE) 266 | 267 | blocks_to_use <- mummer[(mummer$ref == "mace") & (mummer$query == "stanley") & (mummer$chrom == "chr6A") & (mummer$source == "mummer"),] 268 | 269 | blocks_to_use 270 | ``` 271 | 272 | 273 | Focus just on the 99.9 - 100 % points 274 | 275 | Dot plot 276 | 277 | ```{r} 278 | perc_dot_plot_no_length_zoom <- plot_perc_id_v_ref_no_length(comparison_filt, ymin = 99.9, breaks = 0.01, xmax = x_max) 279 | 280 | perc_dot_plot_no_length_zoom 281 | ``` 282 | 283 | Boxplots 284 | ```{r} 285 | 286 | data = comparison_filt 287 | bin_size = 5000000 288 | max_chrom_size = max(comparison_filt$re) 289 | alpha = 0.5 290 | fill = "#de87cd" 291 | cut_off = 99.99 292 | ymin = 99.9 293 | ymax = 100 294 | 295 | comparison_filt_bin <- bin_data(data, bin_size = bin_size, max_chrom_size = max_chrom_size) 296 | comparison_medians <- aggregate(perc_id ~ bin, data = comparison_filt_bin, FUN=median) 297 | colnames(comparison_medians)[2] <- "perc_id_median" 298 | comparison_medians$cut_off <- NA 299 | comparison_medians$cut_off <- ifelse(comparison_medians$perc_id >= cut_off, paste0(">=",cut_off), paste0("<",cut_off)) 300 | comparison_medians$cut_off <- as.factor(comparison_medians$cut_off) 301 | 302 | comparison_to_plot <- merge(comparison_filt_bin, comparison_medians) 303 | 304 | perc_boxplot_5M_shade_zoom <- ggplot(comparison_to_plot, aes(x=bin, y = perc_id, group = bin, fill = cut_off)) + 305 | #geom_rect(aes(xmin=blocks_to_use[1,"ref_start"]+bin_size,xmax=blocks_to_use[1,"ref_end"]+bin_size,ymin=-Inf,ymax=Inf),alpha=alpha,fill=fill) + 306 | #geom_rect(aes(xmin=blocks_to_use[2,"ref_start"]+bin_size,xmax = blocks_to_use[2,"ref_end"]+bin_size,ymin=-Inf,ymax=Inf),alpha=alpha,fill=fill) + 307 | #geom_rect(aes(xmin=blocks_to_use[3,"ref_start"]+bin_size,xmax = blocks_to_use[3,"ref_end"]+bin_size,ymin=-Inf,ymax=Inf),alpha=alpha,fill=fill) + 308 | #geom_rect(aes(xmin=blocks_to_use[4,"ref_start"]+bin_size,xmax = blocks_to_use[4,"ref_end"]+bin_size,ymin=-Inf,ymax=Inf),alpha=alpha,fill=fill) + 309 | #geom_rect(aes(xmin=blocks_to_use[5,"ref_start"]+bin_size,xmax = blocks_to_use[5,"ref_end"]+bin_size,ymin=-Inf,ymax=Inf),alpha=alpha,fill=fill) + 310 | annotate("rect", xmin = blocks_to_use$ref_start+(bin_size*0.5), xmax = blocks_to_use$ref_end+(bin_size*0.5), ymin=-Inf,ymax=Inf, alpha = alpha, fill = fill) + 311 | geom_boxplot(outlier.shape = NA, lwd = 0.3) + 312 | scale_fill_manual(values = c("white", "gray50")) + 313 | #labs(fill = "% id") + 314 | #scale_colour_viridis() + 315 | #xlab(data$rid) + 316 | theme_bw() + 317 | theme(strip.text.y=element_text(angle=180, size=5), 318 | strip.background=element_blank(), 319 | legend.position = "none", 320 | axis.title.x=element_blank(), 321 | axis.text.x=element_blank(), 322 | axis.title.y=element_blank(), 323 | axis.text.y=element_blank(),) + 324 | #ggtitle(paste0(data$rid, " v ", data$qid, " BinSize: ", bin_size, " CutOff: ", cut_off)) + 325 | scale_x_continuous(breaks=seq(0,x_max,100000000)) + 326 | scale_y_continuous(limits = c(0,ymax),breaks=seq(0,ymax,0.01)) + 327 | coord_cartesian(ylim=c(ymin,ymax), xlim = c(0,x_max)) 328 | 329 | perc_boxplot_5M_shade_zoom 330 | ``` 331 | ```{r} 332 | #ggsave(combined, "Y:/Publications/Haplotypes/Figures/figure_1_mummer_plots.png", dpi = 600, height = 10, width = 10) 333 | png("Y:/Publications/Haplotypes/Figures/figure_1_mummer_plots_shade.png", res = 1500, width = 4.3, height = 3.8, units = "in") 334 | grid.arrange(diagonal_dot_plot_mid, perc_dot_plot_no_length, perc_boxplot_5M_shade, nrow=3) 335 | dev.off() 336 | 337 | png("Y:/Publications/Haplotypes/Figures/figure_1_mummer_plots_shade_wide.png", res = 1500, width = 6.5, height = 4, units = "in") 338 | grid.arrange(diagonal_dot_plot_mid, perc_dot_plot_no_length, perc_boxplot_5M_shade, nrow=3) 339 | dev.off() 340 | 341 | png("Y:/Publications/Haplotypes/Figures/figure_1_mummer_plots_shade_zoomed_wide.png", res = 1500, width = 6.5, height = (4/3)*4, units = "in") 342 | grid.arrange(diagonal_dot_plot_mid, perc_dot_plot_no_length, perc_dot_plot_no_length_zoom, perc_boxplot_5M_shade_zoom, nrow=4) 343 | dev.off() 344 | 345 | ## also want to save them separately 346 | png("Y:/Publications/Haplotypes/Figures/figure_1_mummer_plots_wide_diagonal.png", res = 1500, width = 6.5, height = (4/3), units = "in") 347 | print(diagonal_dot_plot_mid) 348 | dev.off() 349 | 350 | png("Y:/Publications/Haplotypes/Figures/figure_1_mummer_plots_wide_perc_dot.png", res = 1500, width = 6.5, height = (4/3), units = "in") 351 | print(perc_dot_plot_no_length) 352 | dev.off() 353 | 354 | png("Y:/Publications/Haplotypes/Figures/figure_1_mummer_plots_wide_perc_dot_zoom.png", res = 1500, width = 6.5, height = (4/3), units = "in") 355 | print(perc_dot_plot_no_length_zoom) 356 | dev.off() 357 | 358 | png("Y:/Publications/Haplotypes/Figures/figure_1_mummer_plots_wide_boxplot_zoom.png", res = 1500, width = 6.5, height = (4/3), units = "in") 359 | print(perc_boxplot_5M_shade_zoom) 360 | dev.off() 361 | 362 | png("Y:/Publications/Haplotypes/Figures/figure_1_mummer_plots_wide_diagonal_with_legend.png", res = 1500, width = 6.5, height = 4, units = "in") 363 | print(diagonal_dot_plot_mid_legend) 364 | dev.off() 365 | 366 | png("Y:/Publications/Haplotypes/Figures/figure_1_mummer_plots_wide_diagonal_with_legend_alt.png", res = 1500, width = 6.5, height = 4, units = "in") 367 | print(diagonal_dot_plot_mid_legend) 368 | dev.off() 369 | 370 | png("Y:/Publications/Haplotypes/Figures/figure_1_mummer_plots_wide_diagonal_short.png", res = 1500, width = 6.5, height = 1, units = "in") 371 | print(diagonal_dot_plot_mid) 372 | dev.off() 373 | ``` 374 | 375 | Next we want to generate the heatmaps from the BLAST with the different amounts of flanking sequence 376 | 377 | ```{r} 378 | blast_data_dir <- "W:/ramirezr/SM1/pairwise_blast_oct_2019/" 379 | files_to_run <- list.files(blast_data_dir, pattern = "varieties_6A_identites") 380 | 381 | bp0 <- read.table(gzfile(paste0(blast_data_dir, "varieties_6A_identites_0bp.tab.gz")), sep = "\t", header = TRUE, stringsAsFactors = FALSE) 382 | bp1000 <- read.table(gzfile(paste0(blast_data_dir, "varieties_6A_identites_1000bp.tab.gz")), sep = "\t", header = TRUE, stringsAsFactors = FALSE) 383 | bp2000 <- read.table(gzfile(paste0(blast_data_dir, "varieties_6A_identites_2000bp.tab.gz")), sep = "\t", header = TRUE, stringsAsFactors = FALSE) 384 | bp5000 <- read.table(gzfile(paste0(blast_data_dir, "varieties_6A_identites_5000bp.tab.gz")), sep = "\t", header = TRUE, stringsAsFactors = FALSE) 385 | cds <- read.table(gzfile(paste0(blast_data_dir, "varieties_6A_identites_cdsbp.tab.gz")), sep = "\t", header = TRUE, stringsAsFactors = FALSE) 386 | 387 | all_flank <- rbind(cds, bp5000, bp2000, bp1000, bp0) 388 | 389 | mace_stanley <- all_flank[all_flank$aln_type == "mace->stanley",] 390 | mace_stanley <- mace_stanley[order(mace_stanley$transcript),] 391 | #add the capped identity 392 | mace_stanley$capped_ident <- mace_stanley$pident 393 | mace_stanley[mace_stanley$capped_ident < 100, "capped_ident"] <- 0 394 | mace_stanley$Flanking <- factor(mace_stanley$Flanking, levels = c("5000", "2000", "1000", "0", "cds")) 395 | mace_stanley$capped_ident <- factor(as.character(mace_stanley$capped_ident), levels = c("0", "100")) 396 | mace_stanley$capped_ident_noN <- mace_stanley$capped_ident 397 | mace_stanley[mace_stanley$Ns_total > 0, "capped_ident_noN"] <- NA 398 | mace_stanley_expand <- data.frame(complete(mace_stanley, transcript, Flanking)) 399 | 400 | 401 | mace_stanley_noN <- mace_stanley[mace_stanley$Ns_total == 0,] 402 | 403 | table(mace_stanley_noN$Flanking) 404 | mace_stanley_noN_expand <- data.frame(complete(mace_stanley_noN, transcript, Flanking)) 405 | ``` 406 | 407 | Try plotting all together this will show which transcripts get removed with the Ns 408 | 409 | ```{r} 410 | colours = c("0" = "gray", "100" = "black") 411 | na_col <- "white" 412 | combined_heatmap <- ggplot(mace_stanley_expand , aes(x=factor(transcript),y=Flanking)) + 413 | geom_tile(aes(fill = capped_ident_noN)) + 414 | scale_fill_manual(values = colours, na.value = na_col) + 415 | theme_bw() + 416 | theme(strip.text.y=element_text(angle=180, size=5), 417 | strip.background=element_blank(), 418 | legend.position = "none", 419 | axis.title.x=element_blank(), 420 | axis.text.x=element_blank(), 421 | axis.title.y=element_blank(), 422 | axis.text.y=element_blank()) 423 | 424 | 425 | combined_heatmap 426 | 427 | png("Y:/Publications/Haplotypes/Figures/figure_1_combined_heatmaps.png", res = 1500, width = 6.5, height = 2, units = "in") 428 | print(combined_heatmap) 429 | dev.off() 430 | 431 | combined_heatmap_axes <- ggplot(mace_stanley_expand , aes(x=factor(transcript),y=Flanking)) + 432 | geom_tile(aes(fill = capped_ident_noN)) + 433 | scale_fill_manual(values = colours, na.value = na_col) + 434 | theme_bw() + 435 | theme(strip.text.y=element_text(angle=180, size=5), 436 | strip.background=element_blank(), 437 | #legend.position = "none", 438 | axis.title.x=element_blank(), 439 | axis.text.x=element_blank(), 440 | axis.title.y=element_blank()) 441 | #axis.text.y=element_blank()) 442 | 443 | 444 | combined_heatmap_axes 445 | 446 | png("Y:/Publications/Haplotypes/Figures/figure_1_combined_heatmaps_axes.png", res = 1500, width = 6.5, height = 2, units = "in") 447 | print(combined_heatmap_axes) 448 | dev.off() 449 | ``` 450 | 451 | Try plotting separately 452 | 453 | ```{r} 454 | flanks <- unique(mace_stanley_expand$Flanking) 455 | 456 | for (flank in flanks){ 457 | flank_data <- mace_stanley_expand[mace_stanley_expand$Flanking == flank, ] 458 | 459 | flank_data_noN <- flank_data[flank_data$Ns_total == 0,] 460 | flank_data_noN <- flank_data_noN[complete.cases(flank_data_noN$Ns_total),] 461 | 462 | heatmap_axes <- ggplot(flank_data_noN , aes(x=factor(transcript),y=Flanking)) + 463 | geom_tile(aes(fill = capped_ident_noN)) + 464 | scale_fill_manual(values = colours, na.value = na_col) + 465 | theme_bw() + 466 | theme(strip.text.y=element_text(angle=180, size=5), 467 | strip.background=element_blank(), 468 | #legend.position = "none", 469 | axis.title.x=element_blank(), 470 | axis.text.x=element_blank(), 471 | axis.title.y=element_blank()) 472 | #axis.text.y=element_blank()) 473 | 474 | 475 | png(paste0("Y:/Publications/Haplotypes/Figures/figure_1_", flank, "_heatmap_axes.png"), res = 1500, width = 6.5, height = 2, units = "in") 476 | print(heatmap_axes) 477 | dev.off() 478 | 479 | heatmap <- ggplot(flank_data_noN , aes(x=factor(transcript),y=Flanking)) + 480 | geom_tile(aes(fill = capped_ident_noN)) + 481 | scale_fill_manual(values = colours, na.value = na_col) + 482 | theme_bw() + 483 | theme(strip.text.y=element_text(angle=180, size=5), 484 | strip.background=element_blank(), 485 | legend.position = "none", 486 | axis.title.x=element_blank(), 487 | axis.text.x=element_blank(), 488 | axis.title.y=element_blank(), 489 | axis.text.y=element_blank()) 490 | 491 | 492 | png(paste0("Y:/Publications/Haplotypes/Figures/figure_1_", flank, "_heatmap.png"), res = 1500, width = 6.5, height = 0.75, units = "in") 493 | print(heatmap) 494 | dev.off() 495 | } 496 | ``` 497 | 498 | 499 | -------------------------------------------------------------------------------- /figures/figure_4_plots.rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "figure_3_plots" 3 | author: "Jemima Brinton" 4 | date: "04/05/2020" 5 | output: html_document 6 | --- 7 | 8 | ```{r setup, include=FALSE} 9 | knitr::opts_chunk$set(echo = TRUE) 10 | 11 | library(stringr) 12 | library(ggplot2) 13 | library(GenomicRanges) 14 | library(ggbio) 15 | library(viridis) 16 | library(reshape2) 17 | library("ggalluvial") 18 | library(RColorBrewer) 19 | 20 | 21 | convert_long_01 <- function(data){ 22 | converted <- data.frame(marker=character(), variable = character(), value = numeric(),pos = numeric()) 23 | markers <- unique(data$marker) 24 | 25 | for (i in seq(1, length(markers))){ 26 | marker <- markers[i] 27 | marker_data <- data[data$marker == marker,] 28 | snp_calls <- unique(marker_data$value) 29 | alleles <- snp_calls[(snp_calls %in% c("A", "C", "T", "G"))] 30 | #make sure chinese spring allele is always 0 31 | CS_allele <- marker_data[marker_data$variable == "CHINESE", "value"] 32 | alt_allele <- alleles[!(alleles %in% CS_allele)] 33 | marker_data$value <- gsub(CS_allele, "0", marker_data$value) 34 | marker_data$value <- gsub(alt_allele, "1", marker_data$value) 35 | converted <- rbind(converted, marker_data) 36 | } 37 | converted$value <- gsub("N", "NA", converted$value) 38 | converted$value <- as.numeric(converted$value) 39 | return(converted) 40 | } 41 | 42 | plot_capture_specified_order_small <- function(data, start, end, outfile_prefix, order_list){ 43 | data_zoom <- data[(data$pos > start) & (data$pos < end),] 44 | data_zoom$variable <- factor(x = data_zoom$variable, levels = order_list, ordered = TRUE) 45 | plot <- ggplot(data_zoom , aes(x=factor(pos),y=variable)) + 46 | geom_tile(aes(fill = value), color = "gray") + 47 | #scale_fill_distiller(limits=c(0,1), type='div', palette="YlGnBu", na.value = "gray94", direction = 1, name = "allele") + 48 | #scale_fill_continuous(type = "viridis") + 49 | scale_fill_gradient(low = "grey70", high = "grey20", na.value = "white", aesthetics = "fill") + 50 | theme_bw() + 51 | theme(axis.text.x=element_text(angle=90, hjust=1)) #+ 52 | #geom_point( aes( x = "TraesCS6A02G189400.1", y=aln_type, colour = "red" ) ) 53 | #ggsave(plot = plot, file = paste0(outfile_prefix, "_heatmap.png"), dpi = 600, height = 6, width = 6) 54 | return(plot) 55 | } 56 | ``` 57 | 58 | 59 | Ok now lets make the plots for the different genotyping platforms 60 | 61 | ```{r} 62 | cerealsdb_manual <- read.table("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/figures/35K_cerealsdb_hap_examples.csv", 63 | sep = ",", 64 | header = TRUE, 65 | stringsAsFactors = FALSE) 66 | 67 | cerealsdb_manual_long <- melt(cerealsdb_manual, id.vars = c("marker", "pos")) 68 | 69 | cerealsdb_manual_01 <- convert_long_01(cerealsdb_manual_long) 70 | 71 | cerealsdb_order <- c("CHINESE" , "CADENZA", "PARAGON", "NORIN61", "LANCER", "CLAIRE", "JAGGER", "SY", "ROBIGUS", "ARINA", "JULIUS", "WEEBIL", "LANDMARK", "MACE", "STANLEY") 72 | 73 | to_plot_subset <- subset(cerealsdb_manual_01, variable %in% cerealsdb_order) 74 | 75 | 76 | plot_manual_35k <- plot_capture_specified_order_small(to_plot_subset, 187000000, 445000000, outfile_prefix = "test", order_list = rev(cerealsdb_order)) 77 | plot_manual_35k 78 | 79 | plot_manual_35k_no_axes <- plot_manual_35k + 80 | theme(strip.text.y=element_text(angle=180, size=5), 81 | strip.background=element_blank(), 82 | legend.position = "none", 83 | axis.title.x=element_blank(), 84 | axis.text.x=element_blank(), 85 | axis.title.y=element_blank(), 86 | axis.text.y=element_blank()) 87 | 88 | plot_manual_35k_no_axes 89 | ``` 90 | 91 | Do this for the vos fells as well 92 | 93 | ```{r} 94 | VF_manual <- read.table("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/figures/voss_fells_hap_examples.csv", 95 | sep = ",", 96 | header = TRUE, 97 | stringsAsFactors = FALSE) 98 | 99 | VF_manual_long <- melt(VF_manual, id.vars = c("marker", "pos")) 100 | 101 | VF_manual_01 <- convert_long_01(VF_manual_long) 102 | 103 | VF_order <- c("CHINESE" , "CADENZA", "PARAGON", "NORIN61", "LANCER", "CLAIRE", "JAGGER", "SY", "ROBIGUS", "ARINA", "JULIUS", "WEEBIL", "LANDMARK", "MACE", "STANLEY", "X9K1") 104 | 105 | to_plot_subset <- subset(VF_manual_01, variable %in% VF_order) 106 | 107 | plot_manual_VF <- plot_capture_specified_order_small(to_plot_subset, 187000000, 445000000, outfile_prefix = "test", order_list = rev(VF_order)) 108 | plot_manual_VF 109 | 110 | plot_manual_VF_no_axes <- plot_manual_VF + 111 | theme(strip.text.y=element_text(angle=180, size=5), 112 | strip.background=element_blank(), 113 | legend.position = "none", 114 | axis.title.x=element_blank(), 115 | axis.text.x=element_blank(), 116 | axis.title.y=element_blank(), 117 | axis.text.y=element_blank()) 118 | 119 | plot_manual_VF_no_axes 120 | ``` 121 | 122 | Now the akhunov data 123 | 124 | ```{r} 125 | akhunov_manual <- read.table("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/figures/akhunov_hap_examples.csv", 126 | sep = ",", 127 | header = TRUE, 128 | stringsAsFactors = FALSE) 129 | 130 | akhunov_manual_long <- melt(akhunov_manual, id.vars = c("marker", "pos")) 131 | 132 | akhunov_order <- c("CHINESE" , "CADENZA", "PARAGON", "NORIN61", "LANCER", "CLAIRE", "JAGGER", "SY", "ROBIGUS", "ARINA", "JULIUS", "WEEBIL", "LANDMARK", "MACE", "STANLEY", "Ex1", "Ex2", "Ex3") 133 | 134 | to_plot_subset <- subset(akhunov_manual_long, variable %in% akhunov_order) 135 | 136 | plot_manual_akhunov <- plot_capture_specified_order_small(to_plot_subset, 187000000, 445000000, outfile_prefix = "test", order_list = rev(akhunov_order)) 137 | plot_manual_akhunov 138 | 139 | plot_manual_akhunov_no_axes <- plot_manual_akhunov + 140 | theme(strip.text.y=element_text(angle=180, size=5), 141 | strip.background=element_blank(), 142 | legend.position = "none", 143 | axis.title.x=element_blank(), 144 | axis.text.x=element_blank(), 145 | axis.title.y=element_blank(), 146 | axis.text.y=element_blank()) 147 | 148 | plot_manual_akhunov_no_axes 149 | ``` 150 | 151 | Now the SNP distributions 152 | ```{r} 153 | akhunov_pos <- read.table("X:/brintonj/6A_region/akhunov_capture/all.GP08_mm75_het3_publication01142019.vcf.gz_all6A_HCLC_GT_only.GT.FORMAT_position_only.tsv", 154 | sep = "\t", 155 | header = TRUE, 156 | stringsAsFactors = FALSE) 157 | head(akhunov_pos) 158 | nrow(akhunov_pos) 159 | nrow(unique(akhunov_pos)) 160 | 161 | akhunov_unfiltered_pos <- read.table("X:/brintonj/6A_region/akhunov_capture/all.GP08_mm75_het3_publication01142019.vcf.gz_all6A_GT_only.GT.FORMAT_position_only.tsv", 162 | sep = "\t", 163 | header = TRUE, 164 | stringsAsFactors = FALSE) 165 | head(akhunov_unfiltered_pos) 166 | nrow(akhunov_unfiltered_pos) 167 | nrow(unique(akhunov_unfiltered_pos)) 168 | 169 | breeders <- read.table("Y:/Jemima/6A/haplotype/35K/35k_probe_set_IWGSCv1.csv", 170 | sep = ",", 171 | header = TRUE, 172 | stringsAsFactors = FALSE) 173 | head(breeders) 174 | 175 | breeders_6A_pos <- breeders[breeders$IWGSC_v1_Chromosome == "chr6A", c(2,3)] 176 | colnames(breeders_6A_pos) <- colnames(akhunov_pos) 177 | 178 | nrow(breeders_6A_pos) 179 | nrow(unique(breeders_6A_pos)) 180 | breeders_6A_pos$POS <- as.numeric(breeders_6A_pos$POS) 181 | head(breeders_6A_pos) 182 | 183 | inifinium <- read.table("Y:/Jemima/6A/haplotype/Vos-Fels et al 2019/voss_fells_marker_positions_6A.csv", 184 | sep = ",", 185 | header = TRUE, 186 | stringsAsFactors = FALSE) 187 | 188 | head(inifinium) 189 | 190 | inifinium_pos <- inifinium[,c("chrom", "pos")] 191 | colnames(inifinium_pos) <- colnames(akhunov_pos) 192 | 193 | nrow(inifinium_pos) 194 | nrow(unique(inifinium_pos)) 195 | inifinium_pos$POS <- as.numeric(inifinium_pos$POS) 196 | head(inifinium_pos) 197 | 198 | pangenome_snps <- read.table("Y:/Publications/Haplotypes/Figures/figure_3/pangenome_snp_dist_6A_CS_ref.tsv", 199 | sep = "\t", 200 | header = TRUE, 201 | stringsAsFactors = FALSE) 202 | 203 | nrow(pangenome_snps) 204 | nrow(unique(pangenome_snps)) 205 | colnames(pangenome_snps) <- colnames(akhunov_pos) 206 | pangenome_snps$POS <- as.numeric(pangenome_snps$POS) 207 | head(pangenome_snps) 208 | ``` 209 | 210 | Plots 211 | ```{r} 212 | 213 | 214 | pangenome_snps <- ggplot(pangenome_snps, aes(x = POS)) + 215 | geom_histogram(alpha = 0.5, color = "black", binwidth = 5000000) + 216 | theme_bw() + 217 | scale_x_continuous(breaks=seq(0,CS_6A_len,100000000)) 218 | 219 | 220 | pangenome_snps 221 | 222 | pangenome_snps_no_axes <- pangenome_snps + 223 | theme(strip.text.y=element_text(angle=180, size=5), 224 | strip.background=element_blank(), 225 | legend.position = "none", 226 | axis.title.x=element_blank(), 227 | axis.text.x=element_blank(), 228 | axis.title.y=element_blank(), 229 | axis.text.y=element_blank()) 230 | 231 | pangenome_snps_no_axes 232 | 233 | ``` 234 | 235 | ```{r} 236 | 237 | 238 | akhunov_snps <- ggplot(akhunov_pos, aes(x = POS)) + 239 | geom_histogram(alpha = 0.5, color = "black", binwidth = 5000000) + 240 | theme_bw() + 241 | scale_x_continuous(breaks=seq(0,CS_6A_len,100000000)) 242 | 243 | akhunov_snps 244 | 245 | akhunov_snps_no_axes <- akhunov_snps + 246 | theme(strip.text.y=element_text(angle=180, size=5), 247 | strip.background=element_blank(), 248 | legend.position = "none", 249 | axis.title.x=element_blank(), 250 | axis.text.x=element_blank(), 251 | axis.title.y=element_blank(), 252 | axis.text.y=element_blank()) 253 | 254 | akhunov_snps_no_axes 255 | 256 | ``` 257 | 258 | ```{r} 259 | 260 | 261 | akhunov_unfilt_snps <- ggplot(akhunov_unfiltered_pos, aes(x = POS)) + 262 | geom_histogram(alpha = 0.5, color = "black", binwidth = 5000000) + 263 | theme_bw() + 264 | scale_x_continuous(breaks=seq(0,CS_6A_len,100000000)) 265 | 266 | akhunov_unfilt_snps 267 | 268 | akhunov_unfilt_snps_no_axes <- akhunov_unfilt_snps + 269 | theme(strip.text.y=element_text(angle=180, size=5), 270 | strip.background=element_blank(), 271 | legend.position = "none", 272 | axis.title.x=element_blank(), 273 | axis.text.x=element_blank(), 274 | axis.title.y=element_blank(), 275 | axis.text.y=element_blank()) 276 | 277 | akhunov_unfilt_snps_no_axes 278 | 279 | ``` 280 | 281 | ```{r} 282 | breeders_snps <- ggplot(breeders_6A_pos, aes(x = POS)) + 283 | geom_histogram(alpha = 0.5, color = "black", binwidth = 5000000) + 284 | theme_bw() + 285 | scale_x_continuous(breaks=seq(0,CS_6A_len,100000000)) 286 | breeders_snps 287 | 288 | breeders_snps_no_axes <- breeders_snps + 289 | theme(strip.text.y=element_text(angle=180, size=5), 290 | strip.background=element_blank(), 291 | legend.position = "none", 292 | axis.title.x=element_blank(), 293 | axis.text.x=element_blank(), 294 | axis.title.y=element_blank(), 295 | axis.text.y=element_blank()) 296 | 297 | breeders_snps_no_axes 298 | ``` 299 | 300 | ```{r} 301 | 302 | inifinium_snps <- ggplot(inifinium_pos, aes(x = POS)) + 303 | geom_histogram(alpha = 0.5, color = "black", binwidth = 5000000) + 304 | theme_bw() + 305 | scale_x_continuous(breaks=seq(0,CS_6A_len,100000000)) 306 | 307 | print(inifinium_snps) 308 | 309 | inifinium_snps_no_axes <- inifinium_snps + 310 | theme(strip.text.y=element_text(angle=180, size=5), 311 | strip.background=element_blank(), 312 | legend.position = "none", 313 | axis.title.x=element_blank(), 314 | axis.text.x=element_blank(), 315 | axis.title.y=element_blank(), 316 | axis.text.y=element_blank()) 317 | 318 | inifinium_snps_no_axes 319 | 320 | ``` 321 | 322 | Now we want to plots the genotypes of the panels 323 | 324 | ```{r} 325 | #calls coming from the public datasets 326 | haplotype_calls <- read.table(file = "C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/pangenome_haplotype_allocations_35K_all_AK_USA_capture_vossfells_updated_May2020_FINAL_SET_TO_INCLUDE.csv", 327 | sep = ",", 328 | header = TRUE) 329 | 330 | 331 | ##calls with the haplotype-specific markers 332 | marker_calls <- read.table(file = "C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/watkins_analysis/final_haplotypes_watkins_UKRL.csv", 333 | sep = ",", 334 | header = TRUE) 335 | 336 | head(marker_calls) 337 | 338 | ``` 339 | 340 | ```{r} 341 | 342 | haplotype_colours <- c("#000080", 343 | "#5FC6BC", 344 | "#008080", 345 | "#FF7F2a", 346 | "#5F5Fd3", 347 | "#55DDFF", 348 | "#A793AC", 349 | "#DE87CD", 350 | "#C287DE", 351 | "#D9AEED", 352 | "#FFE680", 353 | "#FF5555", 354 | "#B3DE69", 355 | "#B3B3B3") 356 | 357 | 358 | haplotype_names <- c("H1", 359 | "H1/2", 360 | "H2", 361 | "H3", 362 | "H4", 363 | "H5", 364 | "H6", 365 | "H7", 366 | "H5/6/7", 367 | "H4/5/6/7", 368 | "35K1", 369 | "35K2", 370 | "wat_specific", 371 | "other") 372 | 373 | names(haplotype_colours) <- haplotype_names 374 | 375 | haplotype_calls_complete <- haplotype_calls[!(haplotype_calls$nonPG_simple_hap == "poor_data"),] 376 | 377 | haplotype_calls_complete$nonPG_simple_hap <- factor(haplotype_calls_complete$nonPG_simple_hap, levels = haplotype_names) 378 | haplotype_calls_complete$group <- factor(haplotype_calls_complete$group, levels = c("UK_RL", "voss_fells", "Australia", "USA","CIMCOG", "Watkins")) 379 | 380 | head(haplotype_calls_complete) 381 | 382 | ``` 383 | 384 | Try plotting using proportions rather than absolute values 385 | ```{r} 386 | 387 | haplotype_calls_complete <- haplotype_calls_complete[complete.cases(haplotype_calls_complete$nonPG_simple_hap),] 388 | hap_counts <- data.frame(table(haplotype_calls_complete[,c("group", "nonPG_simple_hap")])) 389 | 390 | group_counts <- data.frame(table(haplotype_calls_complete$group)) 391 | colnames(group_counts) <- c("group", "total_var") 392 | hap_counts <- merge(hap_counts, group_counts, all.x = TRUE, all.y = FALSE) 393 | hap_counts$perc <- (hap_counts$Freq/hap_counts$total_var)*100 394 | 395 | group_counts 396 | 397 | hap_counts$group <- factor(hap_counts$group, levels = c("UK_RL", "voss_fells", "Australia", "USA","CIMCOG", "Watkins")) 398 | 399 | plot <- ggplot(hap_counts, aes(x=group, y = perc, fill = nonPG_simple_hap)) + 400 | geom_bar(stat = "identity", width = 0.7) + 401 | ylab("% of varieties") + 402 | scale_x_discrete(labels = c(paste0("UK RL \n (n=", group_counts[group_counts$group == "UK_RL","total_var"], ")"), 403 | paste0("European \n (n=", group_counts[group_counts$group == "voss_fells","total_var"], ")"), 404 | paste0("Australia \n (n=", group_counts[group_counts$group == "Australia","total_var"], ")"), 405 | paste0("USA \n (n=", group_counts[group_counts$group == "USA","total_var"], ")"), 406 | paste0("CIMCOG \n (n=", group_counts[group_counts$group == "CIMCOG","total_var"], ")"), 407 | paste0("Watkins \n (n=", group_counts[group_counts$group == "Watkins","total_var"], ")"))) + 408 | scale_fill_manual(name = "Haplotype", values = haplotype_colours) + theme_bw() 409 | plot 410 | 411 | plot_no_watkins <- ggplot(hap_counts[!(hap_counts$group == "Watkins"),], aes(x=group, y = perc, fill = nonPG_simple_hap)) + 412 | geom_bar(stat = "identity", width = 0.7) + 413 | ylab("% of varieties") + 414 | scale_x_discrete(labels = c(paste0("UK RL \n (n=", group_counts[group_counts$group == "UK_RL","total_var"], ")"), 415 | paste0("European \n (n=", group_counts[group_counts$group == "voss_fells","total_var"], ")"), 416 | paste0("Australia \n (n=", group_counts[group_counts$group == "Australia","total_var"], ")"), 417 | paste0("USA \n (n=", group_counts[group_counts$group == "USA","total_var"], ")"), 418 | paste0("CIMCOG \n (n=", group_counts[group_counts$group == "CIMCOG","total_var"], ")"))) + 419 | scale_fill_manual(name = "Haplotype", values = haplotype_colours) + theme_bw() 420 | 421 | plot_no_watkins 422 | 423 | plot_no_watkins_no_axes <- plot_no_watkins + 424 | theme(strip.text.y=element_text(angle=180, size=5), 425 | strip.background=element_blank(), 426 | legend.position = "none", 427 | axis.title.x=element_blank(), 428 | axis.text.x=element_blank(), 429 | axis.title.y=element_blank(), 430 | axis.text.y=element_blank()) 431 | 432 | plot_no_watkins_no_axes 433 | 434 | ``` 435 | 436 | Now the watkins alluvial plots 437 | 438 | ```{r} 439 | #melt the dataframe 440 | #remove_poor_data in both data sets 441 | marker_calls_no_missing <- marker_calls[!((marker_calls$previous_summary == "poor_data") | (marker_calls$marker_summary == "poor_data") | (marker_calls$previous_summary == "no_data")), ] 442 | marker_melt <- melt(marker_calls_no_missing, id.vars = c("line", "group")) 443 | 444 | 445 | head(marker_melt) 446 | tail(marker_melt) 447 | 448 | 449 | names(haplotype_colours) <- haplotype_names 450 | 451 | 452 | marker_wat_summary <- marker_melt[(marker_melt$group == "watkins") & (marker_melt$variable == "previous_summary" | marker_melt$variable == "marker_summary"),] 453 | marker_wat_summary$value <- factor(marker_wat_summary$value, levels = names(haplotype_colours)) 454 | 455 | unique(marker_wat_summary$value) 456 | watkins_old_to_new <- ggplot(marker_wat_summary, aes(x = variable, stratum = value, alluvium = line, fill = value)) + 457 | geom_flow() + 458 | geom_stratum() + 459 | scale_fill_manual(name = "Haplotype", values = haplotype_colours) + 460 | theme_bw() 461 | 462 | watkins_old_to_new 463 | 464 | watkins_old_to_new_no_axes <- watkins_old_to_new + 465 | theme(strip.text.y=element_text(angle=180, size=5), 466 | strip.background=element_blank(), 467 | legend.position = "none", 468 | axis.title.x=element_blank(), 469 | axis.text.x=element_blank(), 470 | axis.title.y=element_blank(), 471 | axis.text.y=element_blank()) 472 | 473 | watkins_old_to_new_no_axes 474 | ``` 475 | 476 | Now just plot the watkins specific 477 | 478 | Plot european data over time 479 | ```{r} 480 | head(haplotype_calls_complete) 481 | 482 | voss_fells <- subset(haplotype_calls_complete, group %in% "voss_fells") 483 | 484 | plot_over_time <- ggplot(voss_fells, aes(x = year, fill = nonPG_simple_hap)) + 485 | geom_histogram(position = "stack", binwidth=5, col = "black") + 486 | xlab("Year of release (5 year bins)") + 487 | ylab("Number of varieties") + 488 | scale_fill_manual(name = "Haplotype", values = haplotype_colours, labels = c("H1/2", "H3", "H4", "H5", "H6", "H7", "other")) + theme_bw() 489 | plot_over_time 490 | 491 | ``` 492 | 493 | ```{r} 494 | width = 10 495 | height = 1 496 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/summarised_conserved_regions_cap5_6A.pdf", width = width, height = height) 497 | print(summarised_conserved) 498 | dev.off() 499 | 500 | width = 4 501 | height = 4 502 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/haplotype_examples_35K_axes.pdf", width = width, height = height) 503 | print(plot_manual_35k) 504 | dev.off() 505 | 506 | width = 4 507 | height = 4 508 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/haplotype_examples_35K.pdf", width = width, height = height) 509 | print(plot_manual_35k_no_axes) 510 | dev.off() 511 | 512 | width = 4 513 | height = 4 514 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/haplotype_examples_VF_axes.pdf", width = width, height = height) 515 | print(plot_manual_VF) 516 | dev.off() 517 | 518 | width = 4 519 | height = 4 520 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/haplotype_examples_VF.pdf", width = width, height = height) 521 | print(plot_manual_VF_no_axes) 522 | dev.off() 523 | 524 | width = 4 525 | height = 4 526 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/haplotype_examples_akhunov_axes.pdf", width = width, height = height) 527 | print(plot_manual_akhunov) 528 | dev.off() 529 | 530 | width = 4 531 | height = 4 532 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/haplotype_examples_akhunov.pdf", width = width, height = height) 533 | print(plot_manual_akhunov_no_axes) 534 | dev.off() 535 | 536 | width = 8 537 | height = 2 538 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/akhunov_snp_positions_5mbpbin.pdf", width = width, height = height) 539 | print(akhunov_snps) 540 | dev.off() 541 | 542 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/akhunov_snp_positions_5mbpbin_no_axes.pdf", width = width, height = height) 543 | print(akhunov_snps_no_axes) 544 | dev.off() 545 | 546 | width = 8 547 | height = 2 548 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/akhunov_unfiltered_snp_positions_5mbpbin.pdf", width = width, height = height) 549 | print(akhunov_unfilt_snps) 550 | dev.off() 551 | 552 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/akhunov_unfiltered_snp_positions_5mbpbin_no_axes.pdf", width = width, height = height) 553 | print(akhunov_unfilt_snps_no_axes) 554 | dev.off() 555 | 556 | 557 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/9K_snp_positions_5mbpbin.pdf", width = width, height = height) 558 | print(inifinium_snps) 559 | dev.off() 560 | 561 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/9K_snp_positions_5mbpbin_no_axes.pdf", width = width, height = height) 562 | print(inifinium_snps_no_axes) 563 | dev.off() 564 | 565 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/35K_snp_positions_5mbpbin.pdf", width = width, height = height) 566 | print(breeders_snps) 567 | dev.off() 568 | 569 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/35K_snp_positions_5mbpbin_no_axes.pdf", width = width, height = height) 570 | print(breeders_snps_no_axes) 571 | dev.off() 572 | 573 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/pangenome_snp_positions_5mbpbin.pdf", width = width, height = height) 574 | print(pangenome_snps) 575 | dev.off() 576 | 577 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/pangenome_snp_positions_5mbpbin_no_axes.pdf", width = width, height = height) 578 | print(pangenome_snps_no_axes) 579 | dev.off() 580 | 581 | width = 6 582 | height = 4 583 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/panels_public_datasets_calls_no_watkins_no_axes.pdf", width = width, height = height) 584 | print(plot_no_watkins_no_axes) 585 | dev.off() 586 | 587 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/panels_public_datasets_calls_no_watkins.pdf", width = width, height = height) 588 | print(plot_no_watkins) 589 | dev.off() 590 | 591 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/hap_specific_markers_watkins_summary_no_axes.pdf", width = width, height = height) 592 | print(watkins_old_to_new_no_axes) 593 | dev.off() 594 | 595 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/hap_specific_markers_watkins_summary.pdf", width = width, height = height) 596 | print(watkins_old_to_new) 597 | dev.off() 598 | 599 | 600 | width = 1.5 601 | height = 4 602 | pdf("Y:/Publications/Haplotypes/Figures/figure_3/hap_specific_markers_watkins_specific_no_axes.pdf", width = width, height = height) 603 | print(watkins_specific_no_axes) 604 | dev.off() 605 | 606 | width = 5 607 | height = 4 608 | pdf("C:/Users/brintonj/Documents/2020_03_WFH/haplotype_manuscript/extended_data/european_data_over_time.pdf", width = width, height = height) 609 | print(plot_over_time) 610 | dev.off() 611 | 612 | ``` 613 | --------------------------------------------------------------------------------